#!/usr/bin/env python3
# HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
# httpz/parsers.py

try:
    import bs4
except ImportError:
    raise ImportError('missing bs4 module (pip install beautifulsoup4)')

try:
    from cryptography import x509
    from cryptography.hazmat.primitives import hashes
    from cryptography.x509.oid import NameOID
except ImportError:
    raise ImportError('missing cryptography module (pip install cryptography)')

try:
    import mmh3
except ImportError:
    raise ImportError('missing mmh3 module (pip install mmh3)')

from .utils import debug, error


def parse_domain_url(domain: str) -> tuple:
    '''
    Parse domain string into base domain, port, and protocol list

    Anything after the host[:port] part (a path) is discarded, so every
    returned URL has the form scheme://host[:port].

    With an explicit http:// or https:// prefix a single URL for that scheme
    is returned (default port 80 or 443). Without a prefix an https:// URL and
    an http:// URL are returned, both using the same port (default 443).
    A missing or non-numeric port falls back to the default port.

    :param domain: Raw domain string to parse
    :return: Tuple of (base_domain, port, protocols)
    '''
    target = domain.rstrip('/')

    if target.startswith(('http://', 'https://')):
        scheme       = 'https://' if target.startswith('https://') else 'http://'
        prefixes     = [scheme]
        default_port = 443 if scheme == 'https://' else 80
        target       = target.split('://', 1)[1]
    else:
        # NOTE(review): with no prefix and no port, the http:// URL also gets
        # port 443 - kept as-is, confirm this is intended
        prefixes     = ['https://', 'http://']
        default_port = 443

    # Keep only host[:port]; a path left in base_domain would end up in front
    # of the port, e.g. https://example.com/path:443
    authority = target.split('/', 1)[0]
    base_domain, _, port_str = authority.partition(':')

    # Accept plain ASCII digits only: str.isdigit() on its own is also true
    # for characters such as a superscript two, which int() rejects
    if port_str.isascii() and port_str.isdigit():
        port = int(port_str)
    else:
        port = default_port

    # A port of 0 is returned as parsed but left out of the URLs
    suffix    = f':{port}' if port else ''
    protocols = [f'{prefix}{base_domain}{suffix}' for prefix in prefixes]

    return base_domain, port, protocols

async def get_cert_info(ssl_object, url: str) -> dict:
    '''
    Extract summary fields from the peer certificate of an SSL object

    :param ssl_object: Object providing getpeercert(binary_form=True); may be falsy
    :param url: URL the certificate belongs to (only used in the error message)
    :return: Dict of certificate fields, or None when there is no certificate or it cannot be processed
    '''
    try:
        if not ssl_object:
            return None

        der_bytes = ssl_object.getpeercert(binary_form=True)
        if not der_bytes:
            return None

        certificate = x509.load_der_x509_certificate(der_bytes)

        def first_common_name(name):
            # Value of the first common name attribute, None when there is none
            try:
                return name.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value
            except IndexError:
                return None

        # A certificate without the SAN extension has no alternative names
        try:
            san       = certificate.extensions.get_extension_for_oid(x509.oid.ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
            alt_names = [entry.value for entry in san.value] if san else []
        except x509.extensions.ExtensionNotFound:
            alt_names = []

        subject_cn = first_common_name(certificate.subject)
        issuer_cn  = first_common_name(certificate.issuer)

        info = {}
        info['fingerprint']   = certificate.fingerprint(hashes.SHA256()).hex()
        info['common_name']   = subject_cn
        info['issuer']        = issuer_cn
        info['alt_names']     = alt_names
        info['not_before']    = certificate.not_valid_before_utc.isoformat()
        info['not_after']     = certificate.not_valid_after_utc.isoformat()
        info['version']       = certificate.version.value
        info['serial_number'] = format(certificate.serial_number, 'x')
        return info
    except Exception as e:
        error(f'Error getting cert info for {url}: {str(e)}')
        return None

async def get_favicon_hash(session, base_url: str, html: str) -> str:
    '''
    Get favicon hash from a webpage

    The icon location is taken from the first <link> tag whose rel contains
    "icon" and which has a non-empty href, falling back to /favicon.ico. The
    reference is resolved against base_url, fetched, and the first MiB of the
    response body is hashed with mmh3.hash64.

    :param session: aiohttp client session
    :param base_url: base URL of the website
    :param html: HTML content of the page
    :return: First value of the mmh3.hash64 result as a string, or None when the icon is not http(s), is not answered with status 200, hashes to 0, or an error occurs
    '''
    from urllib.parse import urljoin

    try:
        soup = bs4.BeautifulSoup(html, 'html.parser')

        favicon_ref = None
        for link in soup.find_all('link'):
            rel = link.get('rel')
            if not rel or not any(x.lower() == 'icon' for x in rel):
                continue
            # An icon link without a usable href is skipped instead of ending the search
            href = (link.get('href') or '').strip()
            if href:
                favicon_ref = href
                break

        # base_url is treated as a directory so relative references land below
        # it without a doubled slash; urljoin also resolves protocol-relative
        # references with the scheme of base_url and leaves absolute URLs alone
        favicon_url = urljoin(base_url.rstrip('/') + '/', favicon_ref or '/favicon.ico')

        # data:, javascript: and similar references cannot be fetched over HTTP
        if not favicon_url.lower().startswith(('http://', 'https://')):
            return None

        async with session.get(favicon_url, timeout=10) as response:
            if response.status == 200:
                # Only the first MiB of the body is hashed
                content    = (await response.read())[:1024*1024]
                hash_value = mmh3.hash64(content)[0]
                # A hash of 0 is treated as no result
                if hash_value != 0:
                    return str(hash_value)

    except Exception as e:
        debug(f'Error getting favicon for {base_url}: {str(e)}')

    return None