    nic_client = NICClient()
    text = nic_client.whois_lookup(None, domain.encode('idna'), 0)
    return WhoisEntry.load(domain, text)

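# known public suffixes (as bytes), loaded lazily by extract_domain() on its
# first call and cached here for subsequent calls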
|
suffixes = None


def extract_domain(url):
    """Extract the domain from the given URL.

    >>> print(extract_domain('http://www.google.com.au/tos.html'))
    google.com.au
    >>> print(extract_domain('abc.def.com'))
    def.com
    >>> print(extract_domain(u'www.公司.hk'))
    公司.hk
    >>> print(extract_domain('chambagri.fr'))
    chambagri.fr
    >>> print(extract_domain('www.webscraping.com'))
    webscraping.com
    >>> print(extract_domain('198.252.206.140'))
    stackoverflow.com
    >>> print(extract_domain('102.112.2O7.net'))
    2o7.net
    >>> print(extract_domain('1-0-1-1-1-0-1-1-1-1-1-1-1-.0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info'))
    0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info
    """
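    # note: purely numeric hosts are resolved via reverse DNS below;
    # socket.gethostbyaddr() raises socket.herror when no PTR record exists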
    if re.match(r'\d+\.\d+\.\d+\.\d+', url):
        # this is an IP address
        return socket.gethostbyaddr(url)[0]

    # load known TLD suffixes
    global suffixes
    if not suffixes:
        # downloaded from https://publicsuffix.org/list/public_suffix_list.dat
        tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'public_suffix_list.dat')
        with open(tlds_path) as tlds_fp:
            suffixes = set(line.encode('utf-8')
                           for line in tlds_fp.read().splitlines()
                           if line and not line.startswith('//'))
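    # the rules loaded above are one per line (e.g. b'com', b'com.au'); '//'
    # comment lines are skipped, but wildcard ('*.ck') and exception ('!www.ck')
    # rules are kept verbatim, so the matching below treats them literally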

    if not isinstance(url, str):
        url = url.decode('utf-8')
    url = re.sub('^.*://', '', url)
    url = url.split('/')[0].lower().encode('idna')

    # find the longest suffix match
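    # e.g. for b'www.google.com.au': b'au' and b'com.au' are known suffixes,
    # b'google.com.au' is not, so the loop stops there and the function
    # returns 'google.com.au'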
    domain = b''
    for section in reversed(url.split(b'.')):
        if domain:
            domain = b'.' + domain
        domain = section + domain
        if domain not in suffixes:
            break
    return domain.decode('idna')


if __name__ == '__main__':
    try:
        url = sys.argv[1]