49 if re.match(r'\d+\.\d+\.\d+\.\d+', url): |
49 if re.match(r'\d+\.\d+\.\d+\.\d+', url): |
50 # this is an IP address |
50 # this is an IP address |
51 return socket.gethostbyaddr(url)[0] |
51 return socket.gethostbyaddr(url)[0] |
52 |
52 |
53 tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'tlds.txt') |
53 tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'tlds.txt') |
54 suffixes = [ |
54 with open(tlds_path) as tlds_fil: |
55 line.lower().strip().encode('utf-8') |
55 suffixes = [line.lower().encode('utf-8') |
56 for line in open(tlds_path).readlines() |
56 for line in (x.strip() for x in tlds_fil) |
57 if not line.startswith('#') |
57 if not line.startswith('#')] |
58 ] |
58 suff = 'xn--p1ai' |
59 |
59 |
60 if not isinstance(url, str): |
60 if not isinstance(url, str): |
61 url = url.decode('utf-8') |
61 url = url.decode('utf-8') |
62 url = re.sub(b'^.*://', b'', url.encode('idna')).split(b'/')[0].lower() |
62 url = re.sub('^.*://', '', url) |
|
63 url = url.split('/')[0].lower().encode('idna') |
|
64 |
63 domain = [] |
65 domain = [] |
64 |
|
65 for section in url.split(b'.'): |
66 for section in url.split(b'.'): |
66 if section in suffixes: |
67 if section in suffixes: |
67 domain.append(section) |
68 domain.append(section) |
68 else: |
69 else: |
69 domain = [section] |
70 domain = [section] |