#!/usr/bin/env python3
# HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
# httpz_scanner/scanner.py

import asyncio
import random
import urllib.parse

try:
    import aiohttp
except ImportError:
    raise ImportError('missing aiohttp module (pip install aiohttp)')

try:
    import bs4
except ImportError:
    raise ImportError('missing bs4 module (pip install beautifulsoup4)')

from .dns import resolve_all_dns, load_resolvers
from .parsers import parse_domain_url, get_cert_info, get_favicon_hash
from .utils import debug, USER_AGENTS, input_generator

class HTTPZScanner:
    '''Core scanner class for HTTP domain checking'''

    def __init__(self, concurrent_limit = 100, timeout = 5, follow_redirects = False, check_axfr = False, resolver_file = None, output_file = None, show_progress = False, debug_mode = False, jsonl_output = False, show_fields = None, match_codes = None, exclude_codes = None, shard = None, paths = None):
        '''
        Initialize the HTTPZScanner class

        :param concurrent_limit: Maximum number of concurrent requests
        :param timeout: Request timeout in seconds
        :param follow_redirects: Follow redirects
        :param check_axfr: Check for AXFR
        :param resolver_file: Path to resolver file
        :param output_file: Path to output file
        :param show_progress: Show progress bar
        :param debug_mode: Enable debug mode
        :param jsonl_output: Output in JSONL format
        :param show_fields: Fields to show
        :param match_codes: Status codes to match
        :param exclude_codes: Status codes to exclude
        :param shard: Tuple of (shard_index, total_shards) for distributed scanning
        :param paths: List of additional paths to check on each domain
        '''

        self.concurrent_limit = concurrent_limit
        self.timeout = timeout
        self.follow_redirects = follow_redirects
        self.check_axfr = check_axfr
        self.resolver_file = resolver_file
        self.output_file = output_file
        self.show_progress = show_progress
        self.debug_mode = debug_mode
        self.jsonl_output = jsonl_output
        self.shard = shard
        self.paths = paths or []

        self.show_fields = show_fields or {
            'status_code'      : True,
            'content_type'     : True,
            'content_length'   : True,
            'title'            : True,
            'body'             : True,
            'ip'               : True,
            'favicon'          : True,
            'headers'          : True,
            'follow_redirects' : True,
            'cname'            : True,
            'tls'              : True
        }

        self.match_codes = match_codes
        self.exclude_codes = exclude_codes
        self.resolvers = None
        self.processed_domains = 0
        self.progress_count = 0
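
    # An illustrative configuration sketch (the values below are examples, not
    # defaults): it shows how match_codes, extra paths, and the
    # (shard_index, total_shards) tuple fit together when splitting a scan
    # across four workers:
    #
    #   scanner = HTTPZScanner(match_codes={200, 301}, paths=['robots.txt'], shard=(0, 4))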

    async def check_domain(self, session: aiohttp.ClientSession, domain: str):
        '''
        Check a single domain and return results

        :param session: aiohttp.ClientSession
        :param domain: str
        '''

        # Parse domain
        base_domain, port, protocols = parse_domain_url(domain)

        results = []

        # For each protocol (http/https)
        for base_url in protocols:
            try:
                # Check base URL first
                if result := await self._check_url(session, base_url):
                    results.append(result)

                # Check additional paths
                for path in self.paths:
                    path = path.strip('/')
                    url = f'{base_url}/{path}'
                    if result := await self._check_url(session, url):
                        results.append(result)

                # If we got any successful results, stop trying other protocols
                if results:
                    break

            except Exception as e:
                debug(f'Error checking {base_url}: {str(e)}')
                continue

        # Return first successful result or None
        return results[0] if results else None

    async def _check_url(self, session: aiohttp.ClientSession, url: str):
        '''
        Check a single URL and return results

        :param session: aiohttp.ClientSession
        :param url: URL to check
        '''

        try:
            request_headers = {'User-Agent': random.choice(USER_AGENTS)}

            async with session.get(url, timeout=self.timeout,
                                   allow_redirects=self.follow_redirects,
                                   max_redirects=10 if self.follow_redirects else 0,
                                   headers=request_headers) as response:
                # Properly parse the URL
                parsed_url = urllib.parse.urlparse(url)
                parsed_domain = parsed_url.hostname

                result = {
                    'domain': parsed_domain,
                    'status': response.status,
                    'url': str(response.url),
                    'port': parsed_url.port or (443 if parsed_url.scheme == 'https' else 80)
                }

                # Early exit conditions: bail out before the expensive work
                # unless the status code matches the filter criteria
                if result['status'] == -1:
                    return None
                if self.match_codes and result['status'] not in self.match_codes:
                    return result
                if self.exclude_codes and result['status'] in self.exclude_codes:
                    return result

                # Add headers if requested
                headers = dict(response.headers)
                if headers and (self.show_fields.get('headers') or self.show_fields.get('all_flags')):
                    result['headers'] = headers
                else:
                    # Only add content type/length if headers aren't included
                    if content_type := response.headers.get('content-type', '').split(';')[0]:
                        result['content_type'] = content_type
                    if content_length := response.headers.get('content-length'):
                        result['content_length'] = content_length

                # Only add redirect chain if it exists
                if self.follow_redirects and response.history:
                    result['redirect_chain'] = [str(h.url) for h in response.history] + [str(response.url)]

                # Do DNS lookups only if we're going to use the result
                ips, cname, nameservers, _ = await resolve_all_dns(
                    parsed_domain, self.timeout, None, self.check_axfr
                )

                # Only add DNS fields if they have values
                if ips:
                    result['ips'] = ips
                if cname:
                    result['cname'] = cname
                if nameservers:
                    result['nameservers'] = nameservers

                # Only add TLS info if available
                # (response._protocol is a private aiohttp attribute, hence the AttributeError guard)
                if response.url.scheme == 'https':
                    try:
                        if ssl_object := response._protocol.transport.get_extra_info('ssl_object'):
                            if tls_info := await get_cert_info(ssl_object, str(response.url)):
                                # Only add TLS fields that have values
                                result['tls'] = {k: v for k, v in tls_info.items() if v}
                    except AttributeError:
                        debug(f'Failed to get SSL info for {url}')

                # Fetch the body only for HTML content types
                content_type = response.headers.get('Content-Type', '')
                html = await response.text() if any(x in content_type.lower() for x in ['text/html', 'application/xhtml']) else None

                # Parse HTML only if we actually received any (BeautifulSoup raises on None input)
                if html:
                    soup = bs4.BeautifulSoup(html, 'html.parser')

                    # Only add title if it exists
                    if soup.title and soup.title.string:
                        result['title'] = ' '.join(soup.title.string.strip().split()).rstrip('.')[:300]

                    # Only add body if it exists
                    if body_text := soup.get_text():
                        result['body'] = ' '.join(body_text.split()).rstrip('.')[:500]

                    # Only add favicon hash if it exists
                    if favicon_hash := await get_favicon_hash(session, url, html):
                        result['favicon_hash'] = favicon_hash

                return result

        except Exception as e:
            debug(f'Error checking {url}: {str(e)}')
            return None
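
    # Illustrative result shape (hypothetical values; only fields that end up
    # with values are present, depending on the response and show_fields):
    #
    #   {'domain': 'example.com', 'status': 200, 'url': 'https://example.com/',
    #    'port': 443, 'content_type': 'text/html', 'title': '...',
    #    'ips': ['...'], 'tls': {...}, 'favicon_hash': ...}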

    async def scan(self, input_source):
        '''
        Scan domains from a file, stdin, a list/tuple, or an async generator

        :param input_source: Can be:
            - Path to file (str)
            - stdin ('-')
            - List/tuple of domains
            - Async generator yielding domains
        :yields: Result dictionary for each domain scanned
        '''

        if not self.resolvers:
            self.resolvers = await load_resolvers(self.resolver_file)

        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
            tasks = {} # Dict rather than a set so we can track the domain for each task
            domain_queue = asyncio.Queue()
            queue_empty = False

            async def process_domain(domain):
                try:
                    result = await self.check_domain(session, domain)
                    if result:
                        if self.show_progress:
                            self.progress_count += 1
                        return result
                except Exception as e:
                    debug(f'Error processing {domain}: {str(e)}')
                return None

            # Producer: add domains to the queue based on input type
            async def queue_domains():
                try:
                    if isinstance(input_source, str):
                        # File or stdin input
                        gen = input_generator(input_source, self.shard)
                        async for domain in gen:
                            await domain_queue.put(domain)

                    elif isinstance(input_source, (list, tuple)):
                        # List/tuple input; with shard = (index, total), this worker
                        # only takes lines where line_num % total == index
                        for line_num, domain in enumerate(input_source):
                            if domain := str(domain).strip():
                                if self.shard is None or line_num % self.shard[1] == self.shard[0]:
                                    await domain_queue.put(domain)

                    else:
                        # Async generator input
                        line_num = 0
                        async for domain in input_source:
                            if isinstance(domain, bytes):
                                domain = domain.decode()
                            if domain := domain.strip():
                                if self.shard is None or line_num % self.shard[1] == self.shard[0]:
                                    await domain_queue.put(domain)
                            line_num += 1

                except Exception as e:
                    debug(f'Error queuing domains: {str(e)}')
                finally:
                    # Signal queue completion with a None sentinel
                    await domain_queue.put(None)

            # Start domain queuing task
            queue_task = asyncio.create_task(queue_domains())
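
            # Consumer loop: keep up to concurrent_limit tasks in flight, pull
            # new domains until the None sentinel arrives, and yield each result
            # as soon as its task finishes (FIRST_COMPLETED) rather than waiting
            # for the whole batch.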
            try:
                while not queue_empty or tasks:
                    # Start new tasks if needed
                    while len(tasks) < self.concurrent_limit and not queue_empty:
                        try:
                            domain = await domain_queue.get()
                            if domain is None:
                                queue_empty = True
                                break
                            task = asyncio.create_task(process_domain(domain))
                            tasks[task] = domain
                        except Exception as e:
                            debug(f'Error creating task: {str(e)}')

                    if not tasks:
                        break

                    # Wait for the FIRST task to complete
                    try:
                        done, _ = await asyncio.wait(
                            tasks.keys(),
                            timeout=self.timeout,
                            return_when=asyncio.FIRST_COMPLETED
                        )

                        # Process completed tasks immediately
                        for task in done:
                            domain = tasks.pop(task)
                            try:
                                if result := await task:
                                    yield result
                            except Exception as e:
                                debug(f'Error processing result for {domain}: {str(e)}')

                    except Exception as e:
                        debug(f'Error in task processing loop: {str(e)}')

                        # Remove any failed tasks
                        failed_tasks = [t for t in tasks if t.done() and t.exception()]
                        for task in failed_tasks:
                            tasks.pop(task)

            finally:
                # Clean up: cancel any remaining tasks and the producer
                for task in tasks:
                    task.cancel()
                queue_task.cancel()
                try:
                    await queue_task
                except asyncio.CancelledError:
                    pass
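
# Example usage (a minimal sketch, not part of the module's API): 'domains.txt'
# is a hypothetical input file with one domain per line; scan() also accepts
# '-' for stdin, a list/tuple, or an async generator.
if __name__ == '__main__':
    async def _example():
        scanner = HTTPZScanner(concurrent_limit=50, timeout=5, follow_redirects=True)
        async for result in scanner.scan('domains.txt'):
            print(result)

    asyncio.run(_example())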