| author | Evgeni Kunev <evgeni.kunev@gmail.com> |
| Tue, 12 Aug 2014 15:57:28 +0300 | |
| changeset 31 | 92176112c2d6 |
| parent 29 | 1ebe960587b1 |
| child 33 | 8c4c05eb65f4 |
| permissions | -rw-r--r-- |
| 0 | 1 |
import re |
2 |
import sys |
|
|
29
1ebe960587b1
Read in all TLDs from a file
Evgeni Kunev <evgeni.kunev@gmail.com>
parents:
25
diff
changeset
|
3 |
import os |
| 0 | 4 |
import subprocess |
| 8 | 5 |
import socket |
| 0 | 6 |
from parser import WhoisEntry |
7 |
from whois import NICClient |
|
8 |
||
9 |
||
10 |
def whois(url):
    """Look up WHOIS data for *url* and return the parsed entry.

    A bare dotted-quad IPv4 address is queried as given; any other input is
    first reduced with ``extract_domain``. The system ``whois`` executable is
    tried first; if that raises ``OSError`` (for example because the command
    is not installed), the lookup falls back to ``NICClient``.

    Returns whatever ``WhoisEntry.load(domain, text)`` returns.
    """
    # clean domain to expose netloc
    ip_match = re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", url)
    if ip_match:
        domain = url
    else:
        domain = extract_domain(url)
    try:
        # try native whois command first
        r = subprocess.Popen(['whois', domain], stdout=subprocess.PIPE)
        # communicate() drains stdout, closes the pipe and waits for the
        # child, so no file descriptor or zombie process is left behind.
        text = r.communicate()[0]
    except OSError:
        # try experimental client
        nic_client = NICClient()
        text = nic_client.whois_lookup(None, domain, 0)
    return WhoisEntry.load(domain, text)
|
26 |
||
| 5 | 27 |
|
| 0 | 28 |
def extract_domain(url):
    """Extract the domain from the given URL

    The scheme and path are stripped and the host is lower-cased. If the
    host is a dotted-quad IPv4 address it is reverse-resolved with
    ``socket.gethostbyaddr`` and that host name is returned as is. Otherwise
    the result is the trailing run of labels found in ``data/tlds.txt``
    plus the one label immediately before that run.

    >>> extract_domain('http://www.google.com.au/tos.html')
    'google.com.au'
    >>> extract_domain('http://blog.webscraping.com')
    'webscraping.com'
    >>> extract_domain('www.bbc.co.uk')
    'bbc.co.uk'
    >>> extract_domain('198.252.206.140')
    'stackoverflow.com'
    >>> extract_domain('102.112.2O7.net')
    '2o7.net'
    >>> extract_domain('1-0-1-1-1-0-1-1-1-1-1-1-1-.0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info')
    '0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info'
    """
    # Reduce the input to its host part: drop the scheme and any path.
    host = re.sub('^.*://', '', url).split('/')[0].lower()

    # Anchored on both ends (same pattern as whois()) so that a host name
    # which merely starts with four numeric labels is not taken for an IP.
    if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', host):
        # this is an IP address
        return socket.gethostbyaddr(host)[0]

    tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'tlds.txt')
    # Read under ``with`` so the file handle is closed deterministically;
    # a set gives constant-time membership tests in the loop below.
    with open(tlds_path) as tlds_file:
        suffixes = set(
            line.lower().strip()
            for line in tlds_file
            if not line.startswith('#')
        )

    domain = []
    for section in host.split('.'):
        if section in suffixes:
            domain.append(section)
        else:
            # A non-suffix label restarts the candidate domain, so only the
            # last such label and the suffixes following it survive.
            domain = [section]
    return '.'.join(domain)
|
63 |
||
64 |
||
65 |
if __name__ == '__main__':
    # Command-line entry point: the first argument is the URL to look up.
    if len(sys.argv) < 2:
        # No URL supplied; show how the script is meant to be invoked.
        print('Usage: %s url' % sys.argv[0])
    else:
        url = sys.argv[1]
        print(whois(url))