    nic_client = NICClient()
    text = nic_client.whois_lookup(None, domain.encode('idna'), 0)
    return WhoisEntry.load(domain, text)

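# known public suffixes (as bytes), loaded lazily by extract_domain() on its
# first call and cached here for subsequent calls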
|
suffixes = None


def extract_domain(url):
    """Extract the domain from the given URL.

    >>> print(extract_domain('http://www.google.com.au/tos.html'))
    google.com.au
    >>> print(extract_domain('abc.def.com'))
    def.com
    >>> print(extract_domain(u'www.公司.hk'))
    公司.hk
    >>> print(extract_domain('chambagri.fr'))
    chambagri.fr
    >>> print(extract_domain('www.webscraping.com'))
    webscraping.com
    >>> print(extract_domain('198.252.206.140'))
    stackoverflow.com
    >>> print(extract_domain('102.112.2O7.net'))
    2o7.net
    >>> print(extract_domain('1-0-1-1-1-0-1-1-1-1-1-1-1-.0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info'))
    0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info
    """
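    # note: purely numeric hosts are resolved via reverse DNS below;
    # socket.gethostbyaddr() raises socket.herror when no PTR record exists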
    if re.match(r'\d+\.\d+\.\d+\.\d+', url):
        # this is an IP address
        return socket.gethostbyaddr(url)[0]

    # load known TLD suffixes
    global suffixes
    if not suffixes:
        # downloaded from https://publicsuffix.org/list/public_suffix_list.dat
        tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'public_suffix_list.dat')
        with open(tlds_path) as tlds_fp:
            suffixes = set(line.encode('utf-8')
                           for line in tlds_fp.read().splitlines()
                           if line and not line.startswith('//'))
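    # the rules loaded above are one per line (e.g. b'com', b'com.au'); '//'
    # comment lines are skipped, but wildcard ('*.ck') and exception ('!www.ck')
    # rules are kept verbatim, so the matching below treats them literally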

    if not isinstance(url, str):
        url = url.decode('utf-8')
    url = re.sub('^.*://', '', url)
    url = url.split('/')[0].lower().encode('idna')

    # find the longest suffix match
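    # e.g. for b'www.google.com.au': b'au' and b'com.au' are known suffixes,
    # b'google.com.au' is not, so the loop stops there and the function
    # returns 'google.com.au'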
    domain = b''
    for section in reversed(url.split(b'.')):
        if domain:
            domain = b'.' + domain
        domain = section + domain
        if domain not in suffixes:
            break
    return domain.decode('idna')


if __name__ == '__main__':
    try:
        url = sys.argv[1]