50 # this is an IP address |
50 # this is an IP address |
51 return socket.gethostbyaddr(url)[0] |
51 return socket.gethostbyaddr(url)[0] |
52 |
52 |
53 tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'tlds.txt') |
53 tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'tlds.txt') |
54 suffixes = [ |
54 suffixes = [ |
55 line.lower().strip() |
55 line.lower().strip().encode('utf-8') |
56 for line in open(tlds_path).readlines() |
56 for line in open(tlds_path).readlines() |
57 if not line.startswith('#') |
57 if not line.startswith('#') |
58 ] |
58 ] |
59 |
59 |
60 if type(url) is not str: |
60 if not isinstance(url, str): |
61 url = url.decode('utf-8') |
61 url = url.decode('utf-8') |
62 url = re.sub('^.*://', '', url.encode('idna')).split('/')[0].lower() |
62 url = re.sub(b'^.*://', b'', url.encode('idna')).split(b'/')[0].lower() |
63 domain = [] |
63 domain = [] |
|
64 print('url:', url) |
64 |
65 |
65 for section in url.split('.'): |
66 for section in url.split(b'.'): |
66 if section in suffixes: |
67 if section in suffixes: |
67 domain.append(section) |
68 domain.append(section) |
68 else: |
69 else: |
69 domain = [section] |
70 domain = [section] |
70 return '.'.join(domain).decode('idna').encode('utf-8') |
71 return b'.'.join(domain).decode('idna') |
71 |
72 |
72 |
73 |
73 if __name__ == '__main__': |
74 if __name__ == '__main__': |
74 try: |
75 try: |
75 url = sys.argv[1] |
76 url = sys.argv[1] |