#!/usr/bin/env python3
# HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
# httpz/parsers.py
try:
import bs4
except ImportError:
raise ImportError('missing bs4 module (pip install beautifulsoup4)')
try:
from cryptography import x509
from cryptography.hazmat.primitives import hashes
from cryptography.x509.oid import NameOID
except ImportError:
raise ImportError('missing cryptography module (pip install cryptography)')
try:
import mmh3
except ImportError:
raise ImportError('missing mmh3 module (pip install mmh3)')
from .utils import debug, error
def parse_domain_url(domain: str) -> tuple:
    '''
    Parse a raw domain string into its base hostname, port, and candidate protocol URLs

    :param domain: Raw domain string, e.g. "example.com", "example.com:8080", "http://example.com:8080/path"
    :return: Tuple of (base_domain, port, protocols) where protocols is a list of
             fully-qualified "scheme://host:port" URLs to probe

    Note: previously a path component ("example.com/login") ended up between the
    host and the port in the built URL ("https://example.com/login:443", which is
    invalid); the host is now isolated before the port is appended.
    '''
    domain = domain.rstrip('/')

    # Strip an explicit scheme if present; remember it so we only probe that protocol
    protocol = None
    if domain.startswith(('http://', 'https://')):
        protocol = 'https://' if domain.startswith('https://') else 'http://'
        domain = domain.split('://', 1)[1]

    # Isolate the host[:port] portion — anything after the first '/' is a path
    # and must never appear between the host and the port
    host = domain.split('/', 1)[0]

    port = None
    if ':' in host:
        host, _, port_str = host.partition(':')
        if port_str.isdigit():
            port = int(port_str)

    if port is None:
        # Default port: 80 only when the caller explicitly asked for plain http,
        # otherwise 443 (matches the original fallback in every branch)
        port = 80 if protocol == 'http://' else 443

    if protocol:
        # Scheme was given — probe only that protocol
        protocols = [f'{protocol}{host}:{port}']
    else:
        # No scheme — try https first, then http, on the same port
        protocols = [f'https://{host}:{port}', f'http://{host}:{port}']

    return host, port, protocols
async def get_cert_info(ssl_object, url: str) -> dict:
    '''
    Extract certificate details from a TLS connection's SSL object

    :param ssl_object: SSL object to pull the peer certificate from
    :param url: URL the certificate belongs to (used only for error reporting)
    '''
    try:
        # No TLS object or no DER-encoded peer certificate -> nothing to report
        if not ssl_object:
            return None
        cert_der = ssl_object.getpeercert(binary_form=True)
        if not cert_der:
            return None

        cert = x509.load_der_x509_certificate(cert_der)

        # Subject Alternative Names are optional on a certificate
        try:
            san = cert.extensions.get_extension_for_oid(x509.oid.ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
            alt_names = [entry.value for entry in san.value] if san else []
        except x509.extensions.ExtensionNotFound:
            alt_names = []

        # Subject / issuer common names may also be absent
        subject_cns = cert.subject.get_attributes_for_oid(NameOID.COMMON_NAME)
        common_name = subject_cns[0].value if subject_cns else None

        issuer_cns = cert.issuer.get_attributes_for_oid(NameOID.COMMON_NAME)
        issuer = issuer_cns[0].value if issuer_cns else None

        return {
            'fingerprint'   : cert.fingerprint(hashes.SHA256()).hex(),
            'common_name'   : common_name,
            'issuer'        : issuer,
            'alt_names'     : alt_names,
            'not_before'    : cert.not_valid_before_utc.isoformat(),
            'not_after'     : cert.not_valid_after_utc.isoformat(),
            'version'       : cert.version.value,
            'serial_number' : format(cert.serial_number, 'x'),
        }
    except Exception as e:
        error(f'Error getting cert info for {url}: {str(e)}')
        return None
async def get_favicon_hash(session, base_url: str, html: str) -> str:
    '''
    Compute the mmh3 hash of a page's favicon

    :param session: aiohttp client session used to fetch the favicon
    :param base_url: base URL of the website
    :param html: HTML content of the page, scanned for <link rel="icon"> tags
    '''
    try:
        # Look for an explicit icon link in the page; fall back to /favicon.ico
        favicon_url = None
        for link_tag in bs4.BeautifulSoup(html, 'html.parser').find_all('link'):
            rel = link_tag.get('rel')
            if rel and any(value.lower() == 'icon' for value in rel):
                favicon_url = link_tag.get('href')
                break
        if not favicon_url:
            favicon_url = '/favicon.ico'

        # Resolve protocol-relative, root-relative, and bare relative references
        if favicon_url.startswith('//'):
            favicon_url = 'https:' + favicon_url
        elif favicon_url.startswith('/'):
            favicon_url = base_url + favicon_url
        elif not favicon_url.startswith(('http://', 'https://')):
            favicon_url = base_url + '/' + favicon_url

        async with session.get(favicon_url, timeout=10) as response:
            if response.status == 200:
                # Cap at 1 MiB so a huge body is never hashed in full
                body = (await response.read())[:1024*1024]
                digest = mmh3.hash64(body)[0]
                # A zero hash is treated as "no usable favicon"
                if digest != 0:
                    return str(digest)
    except Exception as e:
        debug(f'Error getting favicon for {base_url}: {str(e)}')
    return None