whois/__init__.py
author richardpenman
Tue, 22 May 2018 11:09:53 -0400
changeset 178 06e9b88e0c20
parent 158 bcae8cb61002
permissions -rw-r--r--
raise the import error exception when package is missing
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
123
03c72d0d1182 added support for full TLD domains
Richard Penman
parents: 122
diff changeset
     1
# -*- coding: utf-8 -*-
03c72d0d1182 added support for full TLD domains
Richard Penman
parents: 122
diff changeset
     2
70
1fe2c20adeba Python3 support
Johnny Wezel<j@wezel.name>
parents: 60
diff changeset
     3
from __future__ import print_function
1fe2c20adeba Python3 support
Johnny Wezel<j@wezel.name>
parents: 60
diff changeset
     4
from __future__ import absolute_import
1fe2c20adeba Python3 support
Johnny Wezel<j@wezel.name>
parents: 60
diff changeset
     5
from __future__ import unicode_literals
1fe2c20adeba Python3 support
Johnny Wezel<j@wezel.name>
parents: 60
diff changeset
     6
from __future__ import division
1fe2c20adeba Python3 support
Johnny Wezel<j@wezel.name>
parents: 60
diff changeset
     7
from future import standard_library
1fe2c20adeba Python3 support
Johnny Wezel<j@wezel.name>
parents: 60
diff changeset
     8
standard_library.install_aliases()
1fe2c20adeba Python3 support
Johnny Wezel<j@wezel.name>
parents: 60
diff changeset
     9
from builtins import *
0
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    10
import re
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    11
import sys
29
1ebe960587b1 Read in all TLDs from a file
Evgeni Kunev <evgeni.kunev@gmail.com>
parents: 25
diff changeset
    12
import os
0
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    13
import subprocess
8
9cf495a1e2e9 added support for IP addresses
Richard Baron Penman
parents: 7
diff changeset
    14
import socket
70
1fe2c20adeba Python3 support
Johnny Wezel<j@wezel.name>
parents: 60
diff changeset
    15
from .parser import WhoisEntry
1fe2c20adeba Python3 support
Johnny Wezel<j@wezel.name>
parents: 60
diff changeset
    16
from .whois import NICClient
0
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    17
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    18
122
95feee1af1da encode domain at start issue #107
Richard Penman
parents: 98
diff changeset
    19
60
7801a420f679 added support for native client
Richard Penman
parents: 38
diff changeset
    20
def whois(url, command=False):
    """Look up WHOIS data for a URL, domain name or dotted-quad IPv4 address.

    :param url: URL, bare domain name, or IPv4 address (``a.b.c.d``) to query.
    :param command: when true, run the system ``whois`` executable instead of
        the built-in ``NICClient``.
    :returns: whatever ``WhoisEntry.load(domain, text)`` produces for the
        resolved domain and the raw WHOIS response text.
    """
    # clean domain to expose netloc
    ip_match = re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", url)
    if ip_match:
        # Query the address itself unless the reverse lookup yields a hostname.
        domain = url
        try:
            result = socket.gethostbyaddr(url)
        except socket.herror:
            # Reverse lookup failed: keep the raw IP address as the query.
            pass
        else:
            domain = result[0]
    else:
        domain = extract_domain(url)
    if command:
        # try native whois command
        process = subprocess.Popen(['whois', domain], stdout=subprocess.PIPE)
        # communicate() drains stdout, closes the pipe and waits for the
        # child to exit, so no zombie process or open descriptor is left.
        output, _ = process.communicate()
        text = output.decode()
    else:
        # try builtin client
        nic_client = NICClient()
        text = nic_client.whois_lookup(None, domain.encode('idna'), 0)
    return WhoisEntry.load(domain, text)
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    42
5
7ace7955a131 added support for japanese domains
Richard Baron Penman
parents: 0
diff changeset
    43
123
03c72d0d1182 added support for full TLD domains
Richard Penman
parents: 122
diff changeset
    44
# Cache of known public suffixes (UTF-8 encoded bytes); filled on first use.
suffixes = None


def extract_domain(url):
    """Extract the domain from the given URL

    The scheme and path are stripped, the host is lower-cased, and labels are
    accumulated from the right for as long as the accumulated name is a known
    public suffix; the first name that is not a suffix is returned.  A bare
    IPv4 address is resolved to a hostname with ``socket.gethostbyaddr``.

    :param url: URL, hostname or IPv4 address, as text or UTF-8 bytes.
    :returns: the extracted domain as text.

    >>> print(extract_domain('http://www.google.com.au/tos.html'))
    google.com.au
    >>> print(extract_domain('abc.def.com'))
    def.com
    >>> print(extract_domain(u'www.公司.hk'))
    公司.hk
    >>> print(extract_domain('chambagri.fr'))
    chambagri.fr
    >>> print(extract_domain('www.webscraping.com'))
    webscraping.com
    >>> print(extract_domain('198.252.206.140'))
    stackoverflow.com
    >>> print(extract_domain('102.112.2O7.net'))
    2o7.net
    >>> print(extract_domain('globoesporte.globo.com'))
    globo.com
    >>> print(extract_domain('1-0-1-1-1-0-1-1-1-1-1-1-1-.0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info'))
    0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info
    """
    # Decode bytes input first: the text regexes below reject bytes on
    # Python 3, so decoding has to happen before any pattern matching.
    if not isinstance(url, str):
        url = url.decode('utf-8')

    # Anchor the pattern at both ends so that only a bare dotted quad is
    # treated as an address; hostnames that merely start with four numeric
    # labels (e.g. '1.2.3.4.example.com') go through the suffix logic.
    if re.match(r'^\d+\.\d+\.\d+\.\d+$', url):
        # this is an IP address
        return socket.gethostbyaddr(url)[0]

    # load known TLD suffixes
    global suffixes
    if not suffixes:
        # downloaded from https://publicsuffix.org/list/public_suffix_list.dat
        tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'public_suffix_list.dat')
        with open(tlds_path, encoding='utf-8') as tlds_fp:
            # Skip blank lines and '//' comment lines.
            suffixes = set(line.encode('utf-8') for line in tlds_fp.read().splitlines() if line and not line.startswith('//'))

    # Drop the scheme, then everything after the host, and normalise case.
    url = re.sub('^.*://', '', url)
    url = url.split('/')[0].lower()

    # find the longest suffix match
    domain = b''
    for section in reversed(url.split('.')):
        if domain:
            domain = b'.' + domain
        domain = section.encode('utf-8') + domain
        if domain not in suffixes:
            # First name that is not itself a public suffix: this is the
            # registrable domain.
            break
    return domain.decode('utf-8')
0
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    93
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    94
ea0e45971cea initial commit to mercurial
Richard Baron Penman
parents:
diff changeset
    95
if __name__ == '__main__':
    # Command-line entry point: look up the URL given as the first argument,
    # or print a usage line when no argument was supplied.
    if len(sys.argv) > 1:
        url = sys.argv[1]
        print(whois(url))
    else:
        print('Usage: %s url' % sys.argv[0])