whois/__init__.py
changeset 123 03c72d0d1182
parent 122 95feee1af1da
child 124 dfa8657bdefc
equal deleted inserted replaced
122:95feee1af1da 123:03c72d0d1182
       
     1 # -*- coding: utf-8 -*-
       
     2 
     1 from __future__ import print_function
     3 from __future__ import print_function
     2 from __future__ import absolute_import
     4 from __future__ import absolute_import
     3 from __future__ import unicode_literals
     5 from __future__ import unicode_literals
     4 from __future__ import division
     6 from __future__ import division
     5 from future import standard_library
     7 from future import standard_library
    37         nic_client = NICClient()
    39         nic_client = NICClient()
    38         text = nic_client.whois_lookup(None, domain.encode('idna'), 0)
    40         text = nic_client.whois_lookup(None, domain.encode('idna'), 0)
    39     return WhoisEntry.load(domain, text)
    41     return WhoisEntry.load(domain, text)
    40 
    42 
    41 
    43 
       
    44 suffixes = None
    42 def extract_domain(url):
    45 def extract_domain(url):
    43     """Extract the domain from the given URL
    46     """Extract the domain from the given URL
    44 
    47 
    45     >>> extract_domain('http://www.google.com.au/tos.html')
    48     >>> print(extract_domain('http://www.google.com.au/tos.html'))
    46     'google.com.au'
    49     google.com.au
    47     >>> extract_domain('www.webscraping.com')
    50     >>> print(extract_domain('abc.def.com'))
    48     'webscraping.com'
    51     def.com
    49     >>> extract_domain('198.252.206.140')
    52     >>> print(extract_domain(u'www.公司.hk'))
    50     'stackoverflow.com'
    53     公司.hk
    51     >>> extract_domain('102.112.2O7.net')
    54     >>> print(extract_domain('chambagri.fr'))
    52     '2o7.net'
    55     chambagri.fr
    53     >>> extract_domain('1-0-1-1-1-0-1-1-1-1-1-1-1-.0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info')
    56     >>> print(extract_domain('www.webscraping.com'))
    54     '0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info'
    57     webscraping.com
       
    58     >>> print(extract_domain('198.252.206.140'))
       
    59     stackoverflow.com
       
    60     >>> print(extract_domain('102.112.2O7.net'))
       
    61     2o7.net
       
    62     >>> print(extract_domain('1-0-1-1-1-0-1-1-1-1-1-1-1-.0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info'))
       
    63     0-0-0-0-0-0-0-0-0-0-0-0-0-10-0-0-0-0-0-0-0-0-0-0-0-0-0.info
    55     """
    64     """
    56     if re.match(r'\d+\.\d+\.\d+\.\d+', url):
    65     if re.match(r'\d+\.\d+\.\d+\.\d+', url):
    57         # this is an IP address
    66         # this is an IP address
    58         return socket.gethostbyaddr(url)[0]
    67         return socket.gethostbyaddr(url)[0]
    59 
    68 
    60     tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'tlds.txt')
    69     # load known TLD suffixes
    61     with open(tlds_path) as tlds_fil:
    70     global suffixes
    62         suffixes = [line.lower().encode('utf-8')
    71     if not suffixes:
    63                     for line in (x.strip() for x in tlds_fil)
    72         # downloaded from https://publicsuffix.org/list/public_suffix_list.dat
    64                     if not line.startswith('#')]
    73         tlds_path = os.path.join(os.getcwd(), os.path.dirname(__file__), 'data', 'public_suffix_list.dat')
    65     suff = 'xn--p1ai'
    74         with open(tlds_path) as tlds_fp:
       
    75             suffixes = set(line.encode('utf-8') for line in tlds_fp.read().splitlines() if line and not line.startswith('//'))
    66 
    76 
    67     if not isinstance(url, str):
    77     if not isinstance(url, str):
    68         url = url.decode('utf-8')
    78         url = url.decode('utf-8')
    69     url = re.sub('^.*://', '', url)
    79     url = re.sub('^.*://', '', url)
    70     url = url.split('/')[0].lower().encode('idna')
    80     url = url.split('/')[0].lower().encode('idna')
    71 
    81 
    72     domain = []
    82     # find the longest suffix match
    73     for section in url.split(b'.'):
    83     domain = b''
    74         if section in suffixes:
    84     for section in reversed(url.split(b'.')):
    75             domain.append(section)
    85         if domain:
    76         else:
    86             domain = '.' + domain
    77             domain = [section]
    87         domain = section + domain
    78     return b'.'.join(domain).decode('idna')
    88         if domain not in suffixes:
       
    89             break
       
    90     return domain.decode('idna')
    79 
    91 
    80 
    92 
    81 if __name__ == '__main__':
    93 if __name__ == '__main__':
    82     try:
    94     try:
    83         url = sys.argv[1]
    95         url = sys.argv[1]