55 if re.match(r'\d+\.\d+\.\d+\.\d+', url): |
55 if re.match(r'\d+\.\d+\.\d+\.\d+', url): |
56 # this is an IP address |
56 # this is an IP address |
57 return socket.gethostbyaddr(url)[0] |
57 return socket.gethostbyaddr(url)[0] |
58 |
58 |
59 tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'tlds.txt') |
59 tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'tlds.txt') |
60 suffixes = [ |
60 with open(tlds_path) as tlds_fil: |
61 line.lower().strip().encode('utf-8') |
61 suffixes = [line.lower().encode('utf-8') |
62 for line in open(tlds_path).readlines() |
62 for line in (x.strip() for x in tlds_fil) |
63 if not line.startswith('#') |
63 if not line.startswith('#')] |
64 ] |
64 suff = 'xn--p1ai' |
65 |
65 |
66 if not isinstance(url, str): |
66 if not isinstance(url, str): |
67 url = url.decode('utf-8') |
67 url = url.decode('utf-8') |
68 url = re.sub(b'^.*://', b'', url.encode('idna')).split(b'/')[0].lower() |
68 url = re.sub('^.*://', '', url) |
|
69 url = url.split('/')[0].lower().encode('idna') |
|
70 |
69 domain = [] |
71 domain = [] |
70 |
|
71 for section in url.split(b'.'): |
72 for section in url.split(b'.'): |
72 if section in suffixes: |
73 if section in suffixes: |
73 domain.append(section) |
74 domain.append(section) |
74 else: |
75 else: |
75 domain = [section] |
76 domain = [section] |