whois/__init__.py
author Evgeni Kunev <evgeni.kunev@gmail.com>
Fri, 22 Aug 2014 15:00:14 +0300
changeset 42 d187963bb7e9
parent 38 da8f2956db7e
child 60 7801a420f679
permissions -rw-r--r--
Use timezone-naive datetime objects. Otherwise comparisons don't always work.

import re
import sys
import os
import subprocess
import socket
from parser import WhoisEntry
from whois import NICClient


def whois(url, experimental=False):
    """Look up the WHOIS record for a URL, host name or IPv4 address.

    A bare dotted-quad IPv4 address is queried as given; any other input is
    first reduced to its domain with extract_domain().

    By default the system ``whois`` command is tried first and NICClient is
    used only when that command cannot be run (OSError).  With
    ``experimental=True`` the external command is skipped and NICClient is
    used directly.

    Returns the result of ``WhoisEntry.load(domain, text)``.
    """
    # clean domain to expose netloc
    if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", url):
        domain = url
    else:
        domain = extract_domain(url)

    text = None
    if not experimental:
        try:
            # try native whois command first
            process = subprocess.Popen(['whois', domain], stdout=subprocess.PIPE)
            # communicate() drains stdout *and* waits for the child, so the
            # process is reaped and its pipe closed instead of being left
            # behind as a zombie.
            text = process.communicate()[0]
        except OSError:
            # The command is missing or could not be run; fall through to
            # the experimental client below.
            text = None

    if text is None:
        # NOTE(review): the meaning of the None / 0 arguments is defined by
        # NICClient.whois_lookup, which is not visible from this module.
        nic_client = NICClient()
        text = nic_client.whois_lookup(None, domain, 0)
    return WhoisEntry.load(domain, text)


def extract_domain(url):
    """Extract the domain from the given URL

    The input may be a full URL, a bare host name or a dotted IPv4 address,
    given either as a ``unicode`` object or as a UTF-8 encoded byte string.

    For an IPv4 address the primary host name reported by
    ``socket.gethostbyaddr`` is returned (errors from that call propagate).
    Otherwise the result is the last label that is not listed in
    ``data/tlds.txt`` followed by the listed suffix labels after it,
    returned as a UTF-8 encoded byte string.

    >>> extract_domain('http://www.google.com.au/tos.html')
    'google.com.au'
    >>> extract_domain('http://blog.webscraping.com')
    'webscraping.com'
    >>> extract_domain('www.bbc.co.uk')
    'bbc.co.uk'
    >>> extract_domain('198.252.206.140')
    'stackoverflow.com'
    >>> extract_domain('102.112.2O7.net')
    '2o7.net'
    >>> extract_domain('1-0-1-1-1-0-1-1-1-1-1-1-1-.0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info')
    '0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info'
    """
    # Anchored at both ends (like the check in whois()) so that a host name
    # which merely starts with four dotted numbers, e.g.
    # '1.2.3.4.example.com', is not mistaken for an IP address.
    if re.match(r'^\d+\.\d+\.\d+\.\d+$', url):
        # this is an IP address
        return socket.gethostbyaddr(url)[0]

    # os.path.dirname(__file__) may be relative; joining it onto the current
    # working directory yields a usable path in that case.
    tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'tlds.txt')
    # Close the file deterministically and keep the suffixes in a set so the
    # per-label membership test below does not rescan the whole list.
    with open(tlds_path) as tlds_file:
        suffixes = set(
            line.lower().strip()
            for line in tlds_file
            if not line.startswith('#')
        )

    if not isinstance(url, unicode):
        url = url.decode('utf-8')

    # Isolate the host *before* IDNA-encoding it.  Encoding the whole URL
    # folds the scheme into the first label (corrupting an internationalised
    # label that directly follows '://') and makes long or empty labels in
    # the path raise UnicodeError.  The scheme match is non-greedy so that a
    # URL embedded in the path or query string does not replace the host.
    host = re.sub('^.*?://', '', url).split('/')[0]
    host = host.encode('idna').lower()

    domain = []
    for section in host.split('.'):
        if section in suffixes:
            domain.append(section)
        else:
            # Not a known suffix: this label starts a new candidate domain.
            domain = [section]
    return '.'.join(domain).decode('idna').encode('utf-8')


if __name__ == '__main__':
    # Command-line entry point: look up the URL given as the first argument,
    # or print a usage line when no argument was supplied.
    if len(sys.argv) > 1:
        url = sys.argv[1]
        print(whois(url))
    else:
        print('Usage: %s url' % sys.argv[0])