httpz/httpz_scanner/parsers.py

#!/usr/bin/env python3
# HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
# httpz_scanner/parsers.py

try:
    import bs4
except ImportError:
    raise ImportError('missing bs4 module (pip install beautifulsoup4)')

try:
    from cryptography                   import x509
    from cryptography.hazmat.primitives import hashes
    from cryptography.x509.oid          import NameOID
except ImportError:
    raise ImportError('missing cryptography module (pip install cryptography)')

try:
    import mmh3
except ImportError:
    raise ImportError('missing mmh3 module (pip install mmh3)')

from .utils import debug, error
import argparse


def parse_domain_url(domain: str) -> tuple:
    '''
    Parse domain string into base domain, port, and protocol list

    Any path following the host is discarded, so the returned base domain is
    always the bare host and the port always directly follows it in the URLs.

    :param domain: Raw domain string to parse (e.g. "example.com", "example.com:8080", "https://example.com/path")
    :return: Tuple of (base_domain, port, protocols) where protocols is the list of URLs to try
    '''

    target = domain.rstrip('/')

    if target.startswith(('http://', 'https://')):
        # Explicit scheme: only that scheme is tried, with its own default port
        scheme, target = target.split('://', 1)
        schemes        = [scheme]
        default_port   = 443 if scheme == 'https' else 80
    else:
        # No scheme given: try HTTPS first, then HTTP, both on the same port
        schemes      = ['https', 'http']
        default_port = 443

    # Keep only host[:port]; cutting the path here stops it from leaking into the host
    # or ending up in front of the port in the generated URLs
    authority = target.split('/', 1)[0]
    base_domain, _, port_str = authority.partition(':')

    # A missing or non-decimal port falls back to the default instead of raising
    port = int(port_str) if port_str.isdecimal() else default_port

    protocols = [f'{proto}://{base_domain}{":" + str(port) if port else ""}' for proto in schemes]

    return base_domain, port, protocols


async def get_cert_info(ssl_object, url: str) -> dict:
    '''
    Get SSL certificate information for a domain

    :param ssl_object: SSL object to get certificate info from
    :param url: URL the certificate was served for (only used in the error message)
    :return: Dictionary of certificate fields, or None when there is no certificate or it cannot be processed
    '''

    try:
        # No TLS object or no peer certificate means there is nothing to report
        if not ssl_object or not (cert_der := ssl_object.getpeercert(binary_form=True)):
            return None

        cert = x509.load_der_x509_certificate(cert_der)

        try:
            san_extension = cert.extensions.get_extension_for_oid(x509.oid.ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
        except x509.extensions.ExtensionNotFound:
            alt_names = []
        else:
            # SAN entries are not all strings (per the cryptography docs, IP entries are
            # ipaddress objects), so normalize every value to text
            alt_names = [str(name.value) for name in san_extension.value]

        # Subject and issuer may lack a Common Name; report None instead of failing
        try:
            common_name = cert.subject.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value
        except IndexError:
            common_name = None

        try:
            issuer = cert.issuer.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value
        except IndexError:
            issuer = None

        return {
            'fingerprint'   : cert.fingerprint(hashes.SHA256()).hex(),
            'common_name'   : common_name,
            'issuer'        : issuer,
            'alt_names'     : alt_names,
            'not_before'    : cert.not_valid_before_utc.isoformat(),
            'not_after'     : cert.not_valid_after_utc.isoformat(),
            'version'       : cert.version.value,
            'serial_number' : format(cert.serial_number, 'x'),
        }
    except Exception as e:
        # Best-effort: any failure is logged and reported as "no certificate info"
        error(f'Error getting cert info for {url}: {str(e)}')
        return None


async def get_favicon_hash(session, base_url: str, html: str) -> str:
    '''
    Get favicon hash from a webpage

    :param session: aiohttp client session
    :param base_url: base URL of the website
    :param html: HTML content of the page
    :return: MurmurHash3 value of the favicon as a string, or None if it could not be fetched or hashed
    '''

    from urllib.parse import urljoin

    try:
        soup = bs4.BeautifulSoup(html, 'html.parser')

        # Use the first <link> whose rel values include "icon"
        favicon_url = None
        for link in soup.find_all('link'):
            if link.get('rel') and any(x.lower() == 'icon' for x in link.get('rel')):
                favicon_url = link.get('href')
                break

        # Resolve the href against the page URL: urljoin handles absolute, protocol-relative (//host/...),
        # root-relative (/...) and relative references, and avoids doubled slashes.
        # Pages without a usable icon link fall back to the conventional /favicon.ico
        favicon_url = urljoin(base_url, favicon_url or '/favicon.ico')

        async with session.get(favicon_url, timeout=10) as response:
            if response.status == 200:
                # Only the first 1 MiB of the body is hashed
                content    = (await response.read())[:1024*1024]
                hash_value = mmh3.hash64(content)[0]
                # A hash of 0 is treated as "no favicon"
                if hash_value != 0:
                    return str(hash_value)

    except Exception as e:
        # Favicon retrieval is best-effort; failures are only logged at debug level
        debug(f'Error getting favicon for {base_url}: {str(e)}')

    return None


def parse_status_codes(codes_str: str) -> set:
    '''
    Turn a comma-separated list of status codes and inclusive ranges into a set of integers

    :param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503")
    :return: Set containing every individual code and every code covered by a range
    :raises argparse.ArgumentTypeError: If any entry is not a number or a START-END range
    '''

    def expand(token):
        # A plain entry is a single code; an entry with a dash is an inclusive START-END range
        if '-' not in token:
            return (int(token),)
        low, high = (int(bound) for bound in token.split('-'))
        return range(low, high + 1)

    try:
        return {code for token in codes_str.split(',') for code in expand(token)}
    except ValueError:
        raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)')


def parse_shard(shard_str: str) -> tuple:
    '''
    Parse a shard argument given as INDEX/TOTAL

    :param shard_str: Shard string in format "INDEX/TOTAL" (INDEX is 1-based)
    :return: Tuple of (zero-based shard index, total number of shards)
    :raises argparse.ArgumentTypeError: If the string is malformed or INDEX is outside 1..TOTAL
    '''

    try:
        index, total = (int(piece) for piece in shard_str.split('/'))
        # INDEX must lie within 1..TOTAL, which also forces TOTAL to be at least 1
        if not 1 <= index <= total:
            raise ValueError
    except (ValueError, TypeError):
        raise argparse.ArgumentTypeError('Shard must be in format INDEX/TOTAL where INDEX <= TOTAL')

    # Callers get a 0-based index
    return index - 1, total


def parse_title(html: str, content_type: str = None) -> str:
    '''
    Parse title from HTML content

    :param html: HTML content of the page
    :param content_type: Content-Type header value
    :return: Title text with surrounding whitespace removed, or None if the content is not HTML or has no usable title
    '''

    # Only parse title for HTML content
    if content_type and not any(x in content_type.lower() for x in ['text/html', 'application/xhtml']):
        return None

    try:
        # Name the parser exactly once: supplying it positionally and again via features= raises TypeError.
        # An encoding hint only applies to raw bytes, so it is omitted for text input
        encoding = 'utf-8' if isinstance(html, (bytes, bytearray)) else None
        soup     = bs4.BeautifulSoup(html, 'html.parser', from_encoding=encoding)

        # .string is None when <title> is empty or contains nested markup
        title = soup.title
        if title is not None and title.string is not None:
            return title.string.strip()
    except Exception as e:
        # Title extraction is best-effort; failures are logged rather than silently dropped
        debug(f'Error parsing title: {str(e)}')

    return None