whois/__init__.py
author Johnny Wezel<j@wezel.name>
Sun, 07 Feb 2016 23:40:59 +0100
branchpython3
changeset 73 644d81a7995b
parent 71 b181f795cc0d
child 86 d6fcfa5acc7b
child 97 44522cd37b07
permissions -rw-r--r--
Removed debug statements

from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import division
from future import standard_library
standard_library.install_aliases()
from builtins import *
import re
import sys
import os
import subprocess
import socket
from .parser import WhoisEntry
from .whois import NICClient


def whois(url, command=False):
    # clean domain to expose netloc
    ip_match = re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", url)
    if ip_match:
        domain = url
    else:
        domain = extract_domain(url)
    if command:
        # try native whois command
        r = subprocess.Popen(['whois', domain], stdout=subprocess.PIPE)
        text = r.stdout.read()
    else:
        # try builtin client
        nic_client = NICClient()
        text = nic_client.whois_lookup(None, domain, 0)
    return WhoisEntry.load(domain, text)


def extract_domain(url):
    """Extract the domain from the given URL

    >>> extract_domain('http://www.google.com.au/tos.html')
    'google.com.au'
    >>> extract_domain('www.webscraping.com')
    'webscraping.com'
    >>> extract_domain('198.252.206.140')
    'stackoverflow.com'
    >>> extract_domain('102.112.2O7.net')
    '2o7.net'
    >>> extract_domain('1-0-1-1-1-0-1-1-1-1-1-1-1-.0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info')
    '0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info'
    """
    if re.match(r'\d+\.\d+\.\d+\.\d+', url):
        # this is an IP address
        return socket.gethostbyaddr(url)[0]

    tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'tlds.txt')
    suffixes = [
        line.lower().strip().encode('utf-8')
        for line in open(tlds_path).readlines()
        if not line.startswith('#')
    ]

    if not isinstance(url, str):
        url = url.decode('utf-8')
    url = re.sub(b'^.*://', b'', url.encode('idna')).split(b'/')[0].lower()
    domain = []

    for section in url.split(b'.'):
        if section in suffixes:
            domain.append(section)
        else:
            domain = [section]
    return b'.'.join(domain).decode('idna')


if __name__ == '__main__':
    try:
        url = sys.argv[1]
    except IndexError:
        print('Usage: %s url' % sys.argv[0])
    else:
        print(whois(url))