203 lines
6.7 KiB
Python
203 lines
6.7 KiB
Python
#!/usr/bin/env python3
|
|
# HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
|
|
# httpz_scanner/parsers.py
|
|
|
|
try:
|
|
import bs4
|
|
except ImportError:
|
|
raise ImportError('missing bs4 module (pip install beautifulsoup4)')
|
|
|
|
try:
|
|
from cryptography import x509
|
|
from cryptography.hazmat.primitives import hashes
|
|
from cryptography.x509.oid import NameOID
|
|
except ImportError:
|
|
raise ImportError('missing cryptography module (pip install cryptography)')
|
|
|
|
try:
|
|
import mmh3
|
|
except ImportError:
|
|
raise ImportError('missing mmh3 module (pip install mmh3)')
|
|
|
|
from .utils import debug, error
|
|
import argparse
|
|
|
|
|
|
def parse_domain_url(domain: str) -> tuple:
    '''
    Parse domain string into base domain, port, and protocol list

    Anything after the host[:port] part (a path) is discarded, so the returned
    base domain is always the bare host and the port is always appended directly
    after the host in the generated URLs.

    :param domain: Raw domain string to parse (bare host, host:port, or an http:// / https:// URL)
    :return: Tuple of (base_domain, port, protocols) where protocols is the list of URLs built from the input
    '''

    target = domain.rstrip('/')

    if target.startswith(('http://', 'https://')):
        # Explicit scheme: only that scheme is returned, with its conventional default port
        scheme       = 'https://' if target.startswith('https://') else 'http://'
        schemes      = [scheme]
        default_port = 443 if scheme == 'https://' else 80
        target       = target.split('://', 1)[1]
    else:
        # No scheme: both schemes are returned, HTTPS first
        # NOTE(review): the http:// entry also carries the 443 default when no port is given — confirm that is intended
        schemes      = ['https://', 'http://']
        default_port = 443

    # Keep only the authority so a path can never end up between the host and the port
    authority = target.split('/', 1)[0]

    # Split on the first colon (bracketed IPv6 literals are therefore not handled)
    base_domain, _, port_str = authority.partition(':')

    # Only plain ASCII digits are accepted; a missing or malformed port falls back to the default
    port = int(port_str) if port_str.isascii() and port_str.isdigit() else default_port

    suffix    = f':{port}' if port else ''
    protocols = [f'{prefix}{base_domain}{suffix}' for prefix in schemes]

    return base_domain, port, protocols
|
|
|
|
|
|
async def get_cert_info(ssl_object, url: str) -> dict:
    '''
    Extract details of the peer certificate exposed by an SSL object

    Returns None when no SSL object is given, when it yields no peer certificate,
    or when reading the certificate fails (the failure is reported through error())

    :param ssl_object: SSL object providing getpeercert(binary_form=True)
    :param url: URL the certificate belongs to (only used in the error message)
    '''

    try:
        if not ssl_object:
            return None

        der_bytes = ssl_object.getpeercert(binary_form=True)
        if not der_bytes:
            return None

        certificate = x509.load_der_x509_certificate(der_bytes)

        # The Subject Alternative Name extension is optional
        try:
            san = certificate.extensions.get_extension_for_oid(x509.oid.ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
            san_values = [entry.value for entry in san.value] if san else []
        except x509.extensions.ExtensionNotFound:
            san_values = []

        # Either name may lack a Common Name attribute, in which case None is reported
        subject_cns = certificate.subject.get_attributes_for_oid(NameOID.COMMON_NAME)
        subject_cn  = subject_cns[0].value if subject_cns else None

        issuer_cns = certificate.issuer.get_attributes_for_oid(NameOID.COMMON_NAME)
        issuer_cn  = issuer_cns[0].value if issuer_cns else None

        details = {}
        details['fingerprint']   = certificate.fingerprint(hashes.SHA256()).hex()
        details['common_name']   = subject_cn
        details['issuer']        = issuer_cn
        details['alt_names']     = san_values
        details['not_before']    = certificate.not_valid_before_utc.isoformat()
        details['not_after']     = certificate.not_valid_after_utc.isoformat()
        details['version']       = certificate.version.value
        details['serial_number'] = format(certificate.serial_number, 'x')
        return details
    except Exception as e:
        error(f'Error getting cert info for {url}: {str(e)}')
        return None
|
|
|
|
|
|
async def get_favicon_hash(session, base_url: str, html: str) -> str:
    '''
    Get favicon hash from a webpage

    The favicon location is taken from the first <link> whose rel contains "icon";
    when that link has no href, or no such link exists, /favicon.ico is used.
    The location is resolved against base_url, so absolute, protocol-relative,
    root-relative and relative hrefs are all handled by the same rule.

    :param session: aiohttp client session
    :param base_url: base URL of the website
    :param html: HTML content of the page
    :return: First value of mmh3.hash64 over (at most) the first 1 MiB of the favicon, as a string, or None if unavailable
    '''

    # Local import keeps this block self-contained without touching the module import section
    from urllib.parse import urljoin

    try:
        soup = bs4.BeautifulSoup(html, 'html.parser')

        favicon_url = None
        for link in soup.find_all('link'):
            rel = link.get('rel')
            if rel and any(x.lower() == 'icon' for x in rel):
                favicon_url = link.get('href')
                break

        # urljoin leaves absolute URLs untouched and resolves every other form against base_url
        favicon_url = urljoin(base_url, favicon_url or '/favicon.ico')

        async with session.get(favicon_url, timeout=10) as response:
            if response.status == 200:
                # Only the first 1 MiB of the body takes part in the hash
                content    = (await response.read())[:1024*1024]
                hash_value = mmh3.hash64(content)[0]

                # A hash of 0 is treated as "no favicon"
                if hash_value != 0:
                    return str(hash_value)

    except Exception as e:
        debug(f'Error getting favicon for {base_url}: {str(e)}')

    return None
|
|
|
|
|
|
def parse_status_codes(codes_str: str) -> set:
    '''
    Parse comma-separated status codes and ranges into a set of integers

    :param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503")
    :return: Set containing every listed code and every code inside each inclusive range
    :raises argparse.ArgumentTypeError: If a part is not an integer, a range is malformed, or a range is reversed
    '''

    codes = set()

    try:
        for part in codes_str.split(','):
            if '-' in part:
                start, end = map(int, part.split('-'))

                # A reversed range would silently contribute nothing, so reject it like any other bad input
                if start > end:
                    raise ValueError

                codes.update(range(start, end + 1))
            else:
                codes.add(int(part))
    except ValueError:
        raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)') from None

    return codes
|
|
|
|
|
|
def parse_shard(shard_str: str) -> tuple:
    '''
    Turn an "INDEX/TOTAL" shard argument into a (zero-based index, total) pair

    :param shard_str: Shard string in format "INDEX/TOTAL", with 1 <= INDEX <= TOTAL
    '''

    try:
        pieces = [int(piece) for piece in shard_str.split('/')]
        index, total = pieces

        # Valid only when the 1-based index lies within 1..total
        if not (1 <= index <= total):
            raise ValueError

        zero_based = index - 1
        return zero_based, total
    except (ValueError, TypeError):
        raise argparse.ArgumentTypeError('Shard must be in format INDEX/TOTAL where INDEX <= TOTAL')
|
|
|
|
|
|
def parse_title(html: str, content_type: str = None) -> str:
    '''
    Parse title from HTML content

    :param html: HTML content of the page
    :param content_type: Content-Type header value; when given and not HTML/XHTML, no parsing is attempted
    :return: Title text with surrounding whitespace removed, or None when there is no usable title
    '''

    # Only parse title for HTML content
    if content_type and not any(x in content_type.lower() for x in ('text/html', 'application/xhtml')):
        return None

    # Nothing to parse
    if not html:
        return None

    try:
        # The parser is named exactly once: passing it positionally and again as
        # features= raises TypeError, which previously made every call return None
        soup  = bs4.BeautifulSoup(html, 'html.parser')
        title = soup.title

        # .string is None when <title> does not hold a single text child (e.g. it is empty)
        if title is not None and title.string is not None:
            return title.string.strip()
    except Exception as e:
        debug(f'Error parsing title: {str(e)}')

    return None