#!/usr/bin/env python3
# HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
# httpz/parsers.py

from typing import Optional

try:
    import bs4
except ImportError:
    raise ImportError('missing bs4 module (pip install beautifulsoup4)')

try:
    from cryptography import x509
    from cryptography.hazmat.primitives import hashes
    from cryptography.x509.oid import NameOID
except ImportError:
    raise ImportError('missing cryptography module (pip install cryptography)')

try:
    import mmh3
except ImportError:
    raise ImportError('missing mmh3 module (pip install mmh3)')

from .utils import debug, error


def _parse_port(port_str: str, default: int) -> int:
    '''
    Convert the text after the host colon into a port number

    :param port_str: Raw text found after the colon (may be empty or malformed)
    :param default: Port returned when port_str is not a plain decimal number
    :return: Parsed port, or the default
    '''

    # Require ASCII digits: str.isdigit() alone accepts characters such as
    # superscripts that int() refuses, which would raise ValueError.
    if port_str.isascii() and port_str.isdigit():
        return int(port_str)

    return default


def parse_domain_url(domain: str) -> tuple:
    '''
    Parse domain string into base domain, port, and protocol list

    An input with an explicit http:// or https:// scheme yields a single URL
    using that scheme (default port 80 or 443). An input without a scheme
    yields an https:// and an http:// URL, both on the same port (default 443).
    Any path after the host is discarded, so the port is always appended
    directly to the host.

    :param domain: Raw domain string to parse
    :return: Tuple of (base_domain, port, protocols)
    '''

    target = domain.rstrip('/')

    if target.startswith(('http://', 'https://')):
        scheme, target = target.split('://', 1)
        schemes      = [scheme]
        default_port = 443 if scheme == 'https' else 80
    else:
        schemes      = ['https', 'http']
        default_port = 443

    # Isolate the host[:port] part first; previously a path without an explicit
    # port stayed in base_domain and produced URLs like "https://host/path:443".
    authority = target.split('/', 1)[0]
    base_domain, colon, port_str = authority.partition(':')

    port = _parse_port(port_str, default_port) if colon else default_port

    # A port of 0 is falsy and is therefore left out of the generated URLs
    suffix    = f':{port}' if port else ''
    protocols = [f'{scheme}://{base_domain}{suffix}' for scheme in schemes]

    return base_domain, port, protocols


async def get_cert_info(ssl_object, url: str) -> Optional[dict]:
    '''
    Get SSL certificate information for a domain

    :param ssl_object: SSL object to get certificate info from
    :param url: URL to get certificate info from (only used in the error message)
    :return: Dictionary of certificate fields, or None when there is no SSL
             object, no peer certificate, or the certificate cannot be processed
    '''

    try:
        if not ssl_object or not (cert_der := ssl_object.getpeercert(binary_form=True)):
            return None

        cert = x509.load_der_x509_certificate(cert_der)

        # get_extension_for_oid raises when the extension is absent, so a
        # certificate without SANs ends up with an empty list.
        try:
            san_extension = cert.extensions.get_extension_for_oid(x509.oid.ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
            alt_names     = [name.value for name in san_extension.value]
        except x509.extensions.ExtensionNotFound:
            alt_names = []

        # Subject / issuer may carry no common name attribute at all
        try:
            common_name = cert.subject.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value
        except IndexError:
            common_name = None

        try:
            issuer = cert.issuer.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value
        except IndexError:
            issuer = None

        return {
            'fingerprint'   : cert.fingerprint(hashes.SHA256()).hex(),
            'common_name'   : common_name,
            'issuer'        : issuer,
            'alt_names'     : alt_names,
            'not_before'    : cert.not_valid_before_utc.isoformat(),
            'not_after'     : cert.not_valid_after_utc.isoformat(),
            'version'       : cert.version.value,
            'serial_number' : format(cert.serial_number, 'x'),
        }
    except Exception as e:
        # Best effort: any failure is reported and turned into a None result
        error(f'Error getting cert info for {url}: {str(e)}')
        return None


async def get_favicon_hash(session, base_url: str, html: str) -> Optional[str]:
    '''
    Get favicon hash from a webpage

    :param session: aiohttp client session
    :param base_url: base URL of the website
    :param html: HTML content of the page
    :return: First value of mmh3.hash64 over the favicon bytes as a string, or
             None when the favicon could not be fetched or the hash is 0
    '''

    try:
        soup = bs4.BeautifulSoup(html, 'html.parser')

        # Use the href of the first <link> whose rel contains "icon"
        favicon_url = None
        for link in soup.find_all('link'):
            rel = link.get('rel')
            if rel and any(x.lower() == 'icon' for x in rel):
                favicon_url = link.get('href')
                break

        if not favicon_url:
            favicon_url = '/favicon.ico'

        # Turn the href into an absolute URL
        if favicon_url.startswith('//'):
            favicon_url = 'https:' + favicon_url
        elif favicon_url.startswith('/'):
            favicon_url = base_url + favicon_url
        elif not favicon_url.startswith(('http://', 'https://')):
            favicon_url = base_url + '/' + favicon_url

        async with session.get(favicon_url, timeout=10) as response:
            if response.status == 200:
                # The whole body is read; only the first 1 MiB is hashed
                content    = (await response.read())[:1024*1024]
                hash_value = mmh3.hash64(content)[0]

                if hash_value != 0:
                    return str(hash_value)
    except Exception as e:
        debug(f'Error getting favicon for {base_url}: {str(e)}')

    return None