# HG changeset patch
# User Evgeni Kunev
# Date 1408111912 -10800
# Node ID 0de2468a27e8e156e159cbc1205dad260b223a2b
# Parent f9da616f15cf97e620ecd1b32349ab63ce352cec
Fix extract_domain to work with unicode domains

diff -r f9da616f15cf -r 0de2468a27e8 test/test_main.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_main.py	Fri Aug 15 17:11:52 2014 +0300
@@ -0,0 +1,26 @@
+# coding=utf-8
+
+import unittest
+from whois import extract_domain
+
+
+class TestExtractDomain(unittest.TestCase):
+    def test_simple_ascii_domain(self):
+        url = 'google.com'
+        domain = url
+        self.assertEqual(domain, extract_domain(url))
+
+    def test_ascii_with_schema_path_and_query(self):
+        url = 'https://www.google.com/search?q=why+is+domain+whois+such+a+mess'
+        domain = 'google.com'
+        self.assertEqual(domain, extract_domain(url))
+
+    def test_simple_unicode_domain(self):
+        url = 'http://нарояци.com/'
+        domain = 'нарояци.com'
+        self.assertEqual(domain, extract_domain(url))
+
+    def test_unicode_domain_and_tld(self):
+        url = 'http://россия.рф/'
+        domain = 'россия.рф'
+        self.assertEqual(domain, extract_domain(url))
diff -r f9da616f15cf -r 0de2468a27e8 whois/__init__.py
--- a/whois/__init__.py	Fri Aug 15 13:31:24 2014 +0300
+++ b/whois/__init__.py	Fri Aug 15 17:11:52 2014 +0300
@@ -58,12 +58,17 @@
     url = re.sub('^.*://', '', url).split('/')[0].lower()
 
     domain = []
-    for section in url.split('.'):
+    url_sections = (
+        section.decode('utf-8').encode('idna')
+        for section in url.split('.')
+    )
+
+    for section in url_sections:
         if section in suffixes:
             domain.append(section)
         else:
             domain = [section]
-    return '.'.join(domain)
+    return '.'.join(domain).decode('idna').encode('utf-8')
 
 
 if __name__ == '__main__':
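
For context (not part of the changeset): a minimal Python 2 sketch of the per-label IDNA round-trip the patched extract_domain relies on. 'xn--p1ai' is the standard IDNA/Punycode form of the Cyrillic TLD 'рф'; the byte-string literal assumes a UTF-8 source encoding, matching the utf-8 decode in the patch.

    >>> # encode a UTF-8 label to its ASCII (Punycode) form, as the new loop does
    >>> 'рф'.decode('utf-8').encode('idna')
    'xn--p1ai'
    >>> # decode back to UTF-8 bytes, as the new return statement does
    >>> 'xn--p1ai'.decode('idna').encode('utf-8') == 'рф'
    True

Converting each label to its ASCII form before the `section in suffixes` check lets the suffix lookup (presumably stored in Punycode form) match internationalized TLDs, and the final decode('idna').encode('utf-8') restores the original UTF-8 spelling for the caller.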