#!/usr/bin/env python3
# HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
# httpz_scanner/scanner.py

import asyncio
import random
import urllib.parse
import json

try:
    import aiohttp
except ImportError:
    raise ImportError('missing aiohttp module (pip install aiohttp)')

try:
    import bs4
except ImportError:
    raise ImportError('missing bs4 module (pip install beautifulsoup4)')

from .dns import resolve_all_dns, load_resolvers
from .parsers import parse_domain_url, get_cert_info, get_favicon_hash
from .utils import debug, USER_AGENTS, input_generator


class HTTPZScanner:
    '''Core scanner class for HTTP domain checking'''

    def __init__(self, concurrent_limit=100, timeout=5, follow_redirects=False, check_axfr=False, resolver_file=None, output_file=None, show_progress=False, debug_mode=False, jsonl_output=False, show_fields=None, match_codes=None, exclude_codes=None, shard=None, paths=None, custom_headers=None, post_data=None):
        '''
        Initialize the HTTPZScanner class

        :param concurrent_limit: Maximum number of concurrent requests
        :param timeout: Request timeout in seconds
        :param follow_redirects: Follow redirects
        :param check_axfr: Check for AXFR
        :param resolver_file: Path to resolver file
        :param output_file: Path to output file
        :param show_progress: Show progress bar
        :param debug_mode: Enable debug mode
        :param jsonl_output: Output in JSONL format
        :param show_fields: Fields to show
        :param match_codes: Status codes to match
        :param exclude_codes: Status codes to exclude
        :param shard: Tuple of (shard_index, total_shards) for distributed scanning
        :param paths: List of additional paths to check on each domain
        :param custom_headers: Dictionary of custom headers to send with each request
        :param post_data: Data to send with POST requests
        '''

        self.concurrent_limit = concurrent_limit
        self.timeout = timeout
        self.follow_redirects = follow_redirects
        self.check_axfr = check_axfr
        self.resolver_file = resolver_file
        self.output_file = output_file
        self.show_progress = show_progress
        self.debug_mode = debug_mode
        self.jsonl_output = jsonl_output
        self.shard = shard
        self.paths = paths or []
        self.custom_headers = custom_headers or {}
        self.post_data = post_data

        self.show_fields = show_fields or {
            'status_code'      : True,
            'content_type'     : True,
            'content_length'   : True,
            'title'            : True,
            'body'             : True,
            'ip'               : True,
            'favicon'          : True,
            'headers'          : True,
            'follow_redirects' : True,
            'cname'            : True,
            'tls'              : True
        }

        self.match_codes = match_codes
        self.exclude_codes = exclude_codes
        self.resolvers = None
        self.processed_domains = 0
        self.progress_count = 0
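
    # A minimal usage sketch (illustrative only; the argument values below are
    # assumptions chosen for demonstration, not defaults):
    #
    #   scanner = HTTPZScanner(concurrent_limit=50, timeout=10, shard=(0, 4))
    #
    # With shard=(0, 4), this instance would handle the first of four shards,
    # so four cooperating instances could split one input list between them.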

    async def check_domain(self, session: aiohttp.ClientSession, domain: str):
        '''Check a single domain and return results'''

        base_domain, port, protocols = parse_domain_url(domain)

        for protocol in protocols:
            url = f'{protocol}{base_domain}'
            if port:
                url += f':{port}'

            try:
                debug(f'Trying {url}...')
                result = await self._check_url(session, url)
                debug(f'Got result for {url}: {result}')
                if result and (result['status'] != 400 or result.get('redirect_chain')): # Accept redirects
                    return result
            except Exception as e:
                debug(f'Error checking {url}: {str(e)}')
                continue

        return None
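
    # For illustration: if parse_domain_url('example.com:8443') returned
    # ('example.com', 8443, ['https://', 'http://']) -- hypothetical values, since
    # parse_domain_url is defined in .parsers -- the loop above would try
    # 'https://example.com:8443' first, then 'http://example.com:8443'.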

    async def _check_url(self, session: aiohttp.ClientSession, url: str):
        '''Check a single URL and return results'''

        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            headers.update(self.custom_headers)

            debug(f'Making request to {url} with headers: {headers}')
            async with session.request('GET', url,
                                       timeout=self.timeout,
                                       allow_redirects=True, # Always follow redirects
                                       max_redirects=10,
                                       ssl=False, # Don't verify SSL
                                       headers=headers) as response:

                debug(f'Got response from {url}: status={response.status}, headers={dict(response.headers)}')

                result = {
                    'domain': urllib.parse.urlparse(url).hostname,
                    'status': response.status,
                    'url': str(response.url),
                    'response_headers': dict(response.headers)
                }

                if response.history:
                    result['redirect_chain'] = [str(h.url) for h in response.history] + [str(response.url)]
                    debug(f'Redirect chain for {url}: {result["redirect_chain"]}')

                return result

        # ClientConnectorCertificateError subclasses ClientSSLError, so it must be
        # caught first or the certificate branch would be unreachable.
        except aiohttp.ClientConnectorCertificateError as e:
            debug(f'Certificate Error for {url}: {str(e)}')
            return {
                'domain': urllib.parse.urlparse(url).hostname,
                'status': -1,
                'error': f'Certificate Error: {str(e)}',
                'protocol': 'https' if url.startswith('https://') else 'http',
                'error_type': 'CERT'
            }
        except aiohttp.ClientSSLError as e:
            debug(f'SSL Error for {url}: {str(e)}')
            return {
                'domain': urllib.parse.urlparse(url).hostname,
                'status': -1,
                'error': f'SSL Error: {str(e)}',
                'protocol': 'https' if url.startswith('https://') else 'http',
                'error_type': 'SSL'
            }
        except aiohttp.ClientConnectorError as e:
            debug(f'Connection Error for {url}: {str(e)}')
            return {
                'domain': urllib.parse.urlparse(url).hostname,
                'status': -1,
                'error': f'Connection Failed: {str(e)}',
                'protocol': 'https' if url.startswith('https://') else 'http',
                'error_type': 'CONN'
            }
        except aiohttp.ClientError as e:
            debug(f'HTTP Error for {url}: {e.__class__.__name__}: {str(e)}')
            return {
                'domain': urllib.parse.urlparse(url).hostname,
                'status': -1,
                'error': f'HTTP Error: {e.__class__.__name__}: {str(e)}',
                'protocol': 'https' if url.startswith('https://') else 'http',
                'error_type': 'HTTP'
            }
        except asyncio.TimeoutError:
            debug(f'Timeout for {url}')
            return {
                'domain': urllib.parse.urlparse(url).hostname,
                'status': -1,
                'error': f'Connection Timed Out after {self.timeout}s',
                'protocol': 'https' if url.startswith('https://') else 'http',
                'error_type': 'TIMEOUT'
            }
        except Exception as e:
            debug(f'Unexpected error for {url}: {e.__class__.__name__}: {str(e)}')
            return {
                'domain': urllib.parse.urlparse(url).hostname,
                'status': -1,
                'error': f'Error: {e.__class__.__name__}: {str(e)}',
                'protocol': 'https' if url.startswith('https://') else 'http',
                'error_type': 'UNKNOWN'
            }

    async def scan(self, input_source):
        '''
        Scan domains from a file, stdin, or async generator

        :param input_source: Can be:
            - Path to file (str)
            - stdin ('-')
            - List/tuple of domains
            - Async generator yielding domains
        :yields: Result dictionary for each domain scanned
        '''
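
        # Usage sketch (illustrative only; 'domains.txt' is a placeholder path):
        #
        #   scanner = HTTPZScanner()
        #   async for result in scanner.scan('domains.txt'):
        #       print(result['domain'], result['status'])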

        if not self.resolvers:
            self.resolvers = await load_resolvers(self.resolver_file)

        # Disable SSL certificate verification for the shared connector
        connector = aiohttp.TCPConnector(ssl=False, enable_cleanup_closed=True)
        async with aiohttp.ClientSession(connector=connector) as session:
            tasks = {} # Maps each task to its domain so failures can be attributed
            domain_queue = asyncio.Queue()
            queue_empty = False

            async def process_domain(domain):
                try:
                    result = await self.check_domain(session, domain)
                    if self.show_progress:
                        self.progress_count += 1
                    if result:
                        return domain, result
                    else:
                        # Create a proper error result if check_domain returns None
                        return domain, {
                            'domain': domain,
                            'status': -1,
                            'error': 'No successful response from either HTTP or HTTPS',
                            'protocol': 'unknown',
                            'error_type': 'NO_RESPONSE'
                        }
                except Exception as e:
                    debug(f'Error processing {domain}: {e.__class__.__name__}: {str(e)}')
                    # Return structured error information
                    return domain, {
                        'domain': domain,
                        'status': -1,
                        'error': f'{e.__class__.__name__}: {str(e)}',
                        'protocol': 'unknown',
                        'error_type': 'PROCESS'
                    }

            # Queue processor
            async def queue_processor():
                nonlocal queue_empty
                async for domain in input_generator(input_source, self.shard):
                    await domain_queue.put(domain)
                    self.processed_domains += 1
                queue_empty = True

            # Start queue processor
            queue_task = asyncio.create_task(queue_processor())
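
            # queue_processor() above is the producer; the loop below is the
            # consumer, keeping at most concurrent_limit checks in flight and
            # yielding each result as soon as its task completes.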
            try:
                while not (queue_empty and domain_queue.empty() and not tasks):
                    # Fill up tasks until we hit the concurrent limit
                    while len(tasks) < self.concurrent_limit and not domain_queue.empty():
                        domain = await domain_queue.get()
                        task = asyncio.create_task(process_domain(domain))
                        tasks[task] = domain

                    if tasks:
                        # Wait for at least one task to complete
                        done, _ = await asyncio.wait(
                            tasks.keys(),
                            return_when=asyncio.FIRST_COMPLETED
                        )

                        # Process completed tasks
                        for task in done:
                            domain = tasks.pop(task)
                            try:
                                _, result = await task
                                if result:
                                    yield result
                            except Exception as e:
                                debug(f'Task error for {domain}: {e.__class__.__name__}: {str(e)}')
                                yield {
                                    'domain': domain,
                                    'status': -1,
                                    'error': f'Task Error: {e.__class__.__name__}: {str(e)}',
                                    'protocol': 'unknown',
                                    'error_type': 'TASK'
                                }
                    else:
                        await asyncio.sleep(0.1) # Prevent CPU spin when no tasks are ready

            finally:
                # Clean up any remaining tasks
                for task in tasks:
                    task.cancel()
                queue_task.cancel()
                try:
                    await queue_task
                except asyncio.CancelledError:
                    pass
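

# Illustrative demo (a sketch, not part of the original module): consumes the
# scan() async generator over a small inline domain list. The domains and the
# printed fields are assumptions for demonstration; because of the relative
# imports above, this only runs in package context (e.g. python -m httpz_scanner.scanner).
if __name__ == '__main__':
    async def _demo():
        scanner = HTTPZScanner(concurrent_limit=10, timeout=5)
        async for result in scanner.scan(['example.com', 'example.org']):
            print(result.get('domain'), result.get('status'), result.get('error', ''))

    asyncio.run(_demo())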