| author | Richard Penman |
| Fri, 03 Feb 2017 16:17:46 +0800 | |
| changeset 124 | dfa8657bdefc |
| parent 123 | 03c72d0d1182 |
| child 136 | 30259bf0523f |
| permissions | -rw-r--r-- |
| 123 | 1 |
# -*- coding: utf-8 -*- |
2 |
||
| 70 | 3 |
from __future__ import print_function |
4 |
from __future__ import absolute_import |
|
5 |
from __future__ import unicode_literals |
|
6 |
from __future__ import division |
|
7 |
from future import standard_library |
|
8 |
standard_library.install_aliases() |
|
9 |
from builtins import * |
|
| 0 | 10 |
import re |
11 |
import sys |
|
|
29
1ebe960587b1
Read in all TLDs from a file
Evgeni Kunev <evgeni.kunev@gmail.com>
parents:
25
diff
changeset
|
12 |
import os |
| 0 | 13 |
import subprocess |
| 8 | 14 |
import socket |
| 70 | 15 |
from .parser import WhoisEntry |
16 |
from .whois import NICClient |
|
| 0 | 17 |
|
18 |
||
| 122 | 19 |
|
| 60 | 20 |
def whois(url, command=False):
    """Look up WHOIS information for a URL, hostname or IPv4 address.

    :param url: a URL or hostname (reduced to its registrable domain via
        ``extract_domain``) or a dotted-quad IPv4 address (replaced by its
        reverse-DNS name when one can be resolved, otherwise used as-is).
    :param command: when true, run the system ``whois`` executable;
        otherwise query through the bundled ``NICClient``.
    :returns: whatever ``WhoisEntry.load(domain, text)`` returns.
    """
    # clean domain to expose netloc
    ip_match = re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", url)
    if ip_match:
        domain = url
        try:
            result = socket.gethostbyaddr(url)
        except (socket.herror, socket.gaierror):
            # Best effort only: with no usable reverse-DNS answer we keep
            # querying the raw address instead of failing the lookup.
            pass
        else:
            domain = result[0]
    else:
        domain = extract_domain(url)

    if command:
        # try native whois command
        r = subprocess.Popen(['whois', domain], stdout=subprocess.PIPE)
        # communicate() drains stdout, closes the pipe and waits for the
        # child, so no zombie process or open file descriptor is left behind.
        # NOTE(review): on Python 3 this is bytes, while the other branch's
        # return type is not visible here -- confirm what WhoisEntry.load
        # expects.
        text = r.communicate()[0]
    else:
        # try builtin client
        nic_client = NICClient()
        text = nic_client.whois_lookup(None, domain.encode('idna'), 0)
    return WhoisEntry.load(domain, text)
42 |
||
| 5 | 43 |
|
| 123 | 44 |
# Lazily populated cache of public suffixes, stored as UTF-8 encoded bytes.
suffixes = None


def extract_domain(url):
    """Extract the domain from the given URL

    The scheme and path are stripped, the host is lower-cased and
    IDNA-encoded, and labels are then accumulated from the right until the
    accumulated name is no longer a known public suffix.  Dotted-quad IPv4
    addresses are resolved through reverse DNS instead.

    :param url: URL, hostname or IPv4 address, as text or UTF-8 bytes.
    :returns: the extracted domain as text.

    >>> print(extract_domain('http://www.google.com.au/tos.html'))
    google.com.au
    >>> print(extract_domain('abc.def.com'))
    def.com
    >>> print(extract_domain(u'www.公司.hk'))
    公司.hk
    >>> print(extract_domain('chambagri.fr'))
    chambagri.fr
    >>> print(extract_domain('www.webscraping.com'))
    webscraping.com
    >>> print(extract_domain('198.252.206.140'))
    stackoverflow.com
    >>> print(extract_domain('102.112.2O7.net'))
    2o7.net
    >>> print(extract_domain('globoesporte.globo.com'))
    globo.com
    >>> print(extract_domain('1-0-1-1-1-0-1-1-1-1-1-1-1-.0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info'))
    0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info
    """
    # Decode first so bytes input works with the text regexes below.
    if not isinstance(url, str):
        url = url.decode('utf-8')

    # Anchored at both ends (same pattern as whois()) so that hostnames which
    # merely start with four numeric labels are not mistaken for an address.
    if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', url):
        # this is an IP address
        return socket.gethostbyaddr(url)[0]

    # load known TLD suffixes
    global suffixes
    if not suffixes:
        # downloaded from https://publicsuffix.org/list/public_suffix_list.dat
        tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'public_suffix_list.dat')
        # The list contains non-ASCII entries, so do not rely on the locale's
        # default encoding when reading it.
        with open(tlds_path, encoding='utf-8') as tlds_fp:
            suffixes = set(
                line.encode('utf-8')
                for line in tlds_fp.read().splitlines()
                if line and not line.startswith('//')
            )

    url = re.sub('^.*://', '', url)
    url = url.split('/')[0].lower().encode('idna')

    # find the longest suffix match
    domain = b''
    for section in reversed(url.split(b'.')):
        if domain:
            # bytes separator: the host is IDNA-encoded bytes at this point
            domain = b'.' + domain
        domain = section + domain
        if domain not in suffixes:
            break
    return domain.decode('idna')
|
|
| 0 | 93 |
|
94 |
||
95 |
if __name__ == '__main__':
    # Command-line entry point: look up the URL given as the first argument,
    # or print a usage line when no argument was supplied.
    if len(sys.argv) > 1:
        url = sys.argv[1]
        print(whois(url))
    else:
        print('Usage: %s url' % sys.argv[0])