Fix extract_domain to work with unicode domains
author Evgeni Kunev <evgeni.kunev@gmail.com>
Fri, 15 Aug 2014 17:11:52 +0300
changeset 35 0de2468a27e8
parent 34 f9da616f15cf
child 36 af839b9c0ed1
Fix extract_domain to work with unicode domains
test/test_main.py
whois/__init__.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_main.py	Fri Aug 15 17:11:52 2014 +0300
@@ -0,0 +1,26 @@
+# coding=utf-8
+
+import unittest
+from whois import extract_domain
+
+
+class TestExtractDomain(unittest.TestCase):
+    def test_simple_ascii_domain(self):
+        url = 'google.com'
+        domain = url
+        self.assertEqual(domain, extract_domain(url))
+
+    def test_ascii_with_schema_path_and_query(self):
+        url = 'https://www.google.com/search?q=why+is+domain+whois+such+a+mess'
+        domain = 'google.com'
+        self.assertEqual(domain, extract_domain(url))
+
+    def test_simple_unicode_domain(self):
+        url = 'http://нарояци.com/'
+        domain = 'нарояци.com'
+        self.assertEqual(domain, extract_domain(url))
+
+    def test_unicode_domain_and_tld(self):
+        url = 'http://россия.рф/'
+        domain = 'россия.рф'
+        self.assertEqual(domain, extract_domain(url))
--- a/whois/__init__.py	Fri Aug 15 13:31:24 2014 +0300
+++ b/whois/__init__.py	Fri Aug 15 17:11:52 2014 +0300
@@ -58,12 +58,17 @@
 
     url = re.sub('^.*://', '', url).split('/')[0].lower()
     domain = []
-    for section in url.split('.'):
+    url_sections = (
+        section.decode('utf-8').encode('idna')
+        for section in url.split('.')
+    )
+
+    for section in url_sections:
         if section in suffixes:
             domain.append(section)
         else:
             domain = [section]
-    return '.'.join(domain)
+    return '.'.join(domain).decode('idna').encode('utf-8')
 
 
 if __name__ == '__main__':