whois/__init__.py
author Evgeni Kunev <evgeni.kunev@gmail.com>
Tue, 12 Aug 2014 15:57:28 +0300
changeset 31 92176112c2d6
parent 29 1ebe960587b1
child 33 8c4c05eb65f4
permissions -rw-r--r--
Move tlds.txt to a data/ folder and add it to the package
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
     1
import re
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
     2
import sys
29
1ebe960587b1 Read in all TLDs from a file
Evgeni Kunev <evgeni.kunev@gmail.com>
parents: 25
diff changeset
     3
import os
0
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
     4
import subprocess
8
9cf495a1e2e9 added support for IP addresses
Richard Baron Penman
parents: 7
diff changeset
     5
import socket
0
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
     6
from parser import WhoisEntry
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
     7
from whois import NICClient
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
     8
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
     9
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    10
def whois(url):
    """Look up the WHOIS record for a URL, domain name or IPv4 address.

    A dotted-quad IPv4 address is queried as given; anything else is first
    reduced to its registrable domain with ``extract_domain``.  The system
    ``whois`` command is tried first, and the bundled ``NICClient`` is used
    when that command cannot be started.

    Returns whatever ``WhoisEntry.load(domain, text)`` produces for the raw
    response text.
    """
    # clean domain to expose netloc; an IP is used verbatim so that no
    # reverse DNS lookup is performed for it
    ip_match = re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", url)
    if ip_match:
        domain = url
    else:
        domain = extract_domain(url)
    try:
        # try native whois command first
        r = subprocess.Popen(['whois', domain], stdout=subprocess.PIPE)
        # communicate() reads all output, closes the pipe and waits for the
        # child, so no zombie process or open file descriptor is left behind
        text = r.communicate()[0]
    except OSError:
        # the whois executable could not be run: try experimental client
        nic_client = NICClient()
        text = nic_client.whois_lookup(None, domain, 0)
    return WhoisEntry.load(domain, text)
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    26
5
7ace7955a131 added support for japanese domains
Richard Baron Penman
parents: 0
diff changeset
    27
0
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    28
def extract_domain(url):
    """Extract the domain from the given URL

    The scheme, path, credentials and port are discarded, then the host is
    reduced to the label preceding its trailing run of known suffixes plus
    those suffixes.  Known suffixes are read from ``data/tlds.txt`` next to
    this module (lines starting with ``#`` are ignored).  A bare IPv4
    address is resolved to a host name with a reverse DNS lookup instead.

    >>> extract_domain('http://www.google.com.au/tos.html')
    'google.com.au'
    >>> extract_domain('http://blog.webscraping.com')
    'webscraping.com'
    >>> extract_domain('www.bbc.co.uk')
    'bbc.co.uk'
    >>> extract_domain('198.252.206.140')
    'stackoverflow.com'
    >>> extract_domain('102.112.2O7.net')
    '2o7.net'
    >>> extract_domain('1-0-1-1-1-0-1-1-1-1-1-1-1-.0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info')
    '0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info'
    >>> extract_domain('http://blog.webscraping.com:8080/index.html')
    'webscraping.com'
    """
    # Anchor the pattern so that only a complete dotted quad is treated as
    # an IP; host names that merely start with digits fall through below.
    if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', url):
        # this is an IP address
        return socket.gethostbyaddr(url)[0]

    tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'tlds.txt')
    # "with" guarantees the file is closed; a set gives O(1) suffix checks
    with open(tlds_path) as tlds_file:
        suffixes = set(
            line.lower().strip()
            for line in tlds_file
            if not line.startswith('#')
        )

    # Strip only a leading scheme, so a "://" inside the path or query
    # string cannot be mistaken for the start of the host.
    host = re.sub(r'^[A-Za-z][A-Za-z0-9+.\-]*://', '', url).split('/')[0].lower()
    # Drop "user:password@" credentials and a trailing ":port".
    host = host.rsplit('@', 1)[-1].split(':', 1)[0]

    domain = []
    for section in host.split('.'):
        if section in suffixes:
            domain.append(section)
        else:
            # a non-suffix label restarts the candidate domain
            domain = [section]
    return '.'.join(domain)
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    63
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    64
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    65
if __name__ == '__main__':
    # Script entry point: look up the URL given as the first command-line
    # argument, or print a usage line when it is missing.
    if len(sys.argv) < 2:
        print('Usage: %s url' % sys.argv[0])
    else:
        url = sys.argv[1]
        print(whois(url))