| author | Richard Penman |
| Thu, 02 Jun 2016 13:56:11 -0700 | |
| changeset 86 | d6fcfa5acc7b |
| parent 73 | 644d81a7995b |
| child 98 | 3202436d89d0 |
| permissions | -rw-r--r-- |
| 70 | 1 |
from __future__ import print_function |
2 |
from __future__ import absolute_import |
|
3 |
from __future__ import unicode_literals |
|
4 |
from __future__ import division |
|
5 |
from future import standard_library |
|
6 |
standard_library.install_aliases() |
|
7 |
from builtins import * |
|
| 0 | 8 |
import re |
9 |
import sys |
|
|
29
1ebe960587b1
Read in all TLDs from a file
Evgeni Kunev <evgeni.kunev@gmail.com>
parents:
25
diff
changeset
|
10 |
import os |
| 0 | 11 |
import subprocess |
| 8 | 12 |
import socket |
| 70 | 13 |
from .parser import WhoisEntry |
14 |
from .whois import NICClient |
|
| 0 | 15 |
|
16 |
||
| 60 | 17 |
def whois(url, command=False):
    """Look up the WHOIS record for a URL, host name or IPv4 address.

    :param url: URL, bare host name, or dotted-quad IPv4 address to look up.
    :param command: when true, query through the system ``whois`` executable;
        otherwise use the bundled ``NICClient``.
    :returns: the result of ``WhoisEntry.load(domain, text)``.
    """
    # clean domain to expose netloc
    ip_match = re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", url)
    if ip_match:
        try:
            # Prefer the PTR host name when the address has one.
            domain = socket.gethostbyaddr(url)[0]
        except socket.herror:
            # No reverse record: deliberately fall back to querying the
            # raw IP address instead of failing the lookup.
            domain = url
    else:
        domain = extract_domain(url)

    if command:
        # try native whois command
        process = subprocess.Popen(['whois', domain], stdout=subprocess.PIPE)
        # communicate() drains the pipe, closes it and reaps the child;
        # reading stdout directly left the process un-waited and the pipe open.
        # NOTE(review): this is the raw pipe output (bytes on Python 3) -
        # confirm WhoisEntry.load accepts that as well as the NICClient result.
        text = process.communicate()[0]
    else:
        # try builtin client
        nic_client = NICClient()
        text = nic_client.whois_lookup(None, domain, 0)
    return WhoisEntry.load(domain, text)
|
39 |
||
| 5 | 40 |
|
| 0 | 41 |
def extract_domain(url):
    """Extract the domain from the given URL

    The scheme and path are dropped, then the host is walked label by label:
    labels listed in ``data/tlds.txt`` are accumulated as the suffix and any
    other label restarts the result, so the answer is the last non-suffix
    label followed by the suffix labels after it. A dotted-quad IPv4 address
    is resolved through reverse DNS instead.

    :param url: URL or host name, as text or UTF-8 encoded bytes.
    :returns: the extracted domain as text.
    :raises socket.herror: if ``url`` is an IPv4 address that has no
        reverse DNS record.

    >>> extract_domain('http://www.google.com.au/tos.html')
    'google.com.au'
    >>> extract_domain('www.webscraping.com')
    'webscraping.com'
    >>> extract_domain('198.252.206.140')  # doctest: +SKIP
    'stackoverflow.com'
    >>> extract_domain('102.112.2O7.net')
    '2o7.net'
    >>> extract_domain('1-0-1-1-1-0-1-1-1-1-1-1-1-.0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info')
    '0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info'
    """
    # Decode before any text processing: matching a str pattern against
    # bytes raises TypeError on Python 3.
    if not isinstance(url, str):
        url = url.decode('utf-8')

    # Anchored at both ends so that host names which merely start with
    # four numeric labels (e.g. '1.2.3.4.example.com') are not sent to DNS.
    if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', url):
        # this is an IP address
        return socket.gethostbyaddr(url)[0]

    tlds_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'data', 'tlds.txt')
    # A set gives constant-time membership tests; the context manager
    # makes sure the file handle is closed.
    with open(tlds_path) as tlds_file:
        suffixes = {
            line.lower().strip().encode('utf-8')
            for line in tlds_file
            if not line.startswith('#')
        }

    # Strip only a leading scheme (a greedy '.*://' would also swallow the
    # host when the query string embeds another URL), then drop the path
    # BEFORE IDNA encoding: the codec rejects labels of 64+ characters, which
    # a long path segment would otherwise trigger.
    host = re.sub(r'^[a-zA-Z][a-zA-Z0-9+.\-]*://', '', url).split('/')[0]
    host = host.encode('idna').lower()

    domain = []
    for section in host.split(b'.'):
        if section in suffixes:
            domain.append(section)
        else:
            # A non-suffix label restarts the candidate domain.
            domain = [section]
    return b'.'.join(domain).decode('idna')
|
| 0 | 77 |
|
78 |
||
79 |
if __name__ == '__main__':
    # Command-line entry point: look up the single URL given as argv[1],
    # or print a usage line when no argument was supplied.
    if len(sys.argv) > 1:
        url = sys.argv[1]
        print(whois(url))
    else:
        print('Usage: %s url' % sys.argv[0])