Allow any form of input for scanning

This commit is contained in:
Dionysus 2025-02-11 19:18:52 -05:00
parent a6fc596547
commit e27e5e4095
Signed by: acidvegas
GPG Key ID: EF4B922DB85DC9DE
5 changed files with 184 additions and 91 deletions

120
README.md
View File

@ -16,7 +16,7 @@ A high-performance concurrent web scanner written in Python. HTTPZ efficiently s
## Installation
### Via pip (recommended)
### Via pip *(recommended)*
```bash
# Install from PyPI
pip install httpz_scanner
@ -68,9 +68,32 @@ Full scan with all options:
python -m httpz_scanner domains.txt -c 100 -o output.jsonl -j -all -to 10 -mc 200,301 -ec 404,500 -p -ax -r resolvers.txt
```
### Distributed Scanning
Split scanning across multiple machines using the `--shard` argument:
```bash
# Machine 1
httpz domains.txt --shard 1/3
# Machine 2
httpz domains.txt --shard 2/3
# Machine 3
httpz domains.txt --shard 3/3
```
Each machine will process a different subset of domains without overlap. For example, with 3 shards:
- Machine 1 processes lines 0,3,6,9,...
- Machine 2 processes lines 1,4,7,10,...
- Machine 3 processes lines 2,5,8,11,...
This allows efficient distribution of large scans across multiple machines.
### Python Library
```python
import asyncio
import aiohttp
import aioboto3
from httpz_scanner import HTTPZScanner
async def scan_domains():
@ -86,6 +109,7 @@ async def scan_domains():
show_progress=False, # Show progress counter
debug_mode=False, # Show error states and debug info
jsonl_output=False, # Output in JSONL format
shard=None, # Tuple of (shard_index, total_shards) for distributed scanning
# Control which fields to show (all False by default unless show_fields is None)
show_fields={
@ -103,69 +127,77 @@ async def scan_domains():
},
# Filter results
match_codes={200, 301, 302}, # Only show these status codes
exclude_codes={404, 500, 503} # Exclude these status codes
match_codes={200,301,302}, # Only show these status codes
exclude_codes={404,500,503} # Exclude these status codes
)
# Initialize resolvers (required before scanning)
await scanner.init()
# Scan domains from file
await scanner.scan('domains.txt')
# Example 1: Stream from S3/MinIO using aioboto3
async with aioboto3.Session().client('s3',
endpoint_url='http://minio.example.com:9000',
aws_access_key_id='access_key',
aws_secret_access_key='secret_key') as s3:
# Or scan from stdin
await scanner.scan('-')
response = await s3.get_object(Bucket='my-bucket', Key='huge-domains.txt')
async with response['Body'] as stream:
async def s3_generator():
while True:
line = await stream.readline()
if not line:
break
yield line.decode().strip()
await scanner.scan(s3_generator())
# Example 2: Stream from URL using aiohttp
async with aiohttp.ClientSession() as session:
# For large files - stream line by line
async with session.get('https://example.com/huge-domains.txt') as resp:
async def url_generator():
async for line in resp.content:
yield line.decode().strip()
await scanner.scan(url_generator())
# For small files - read all at once
async with session.get('https://example.com/small-domains.txt') as resp:
content = await resp.text()
await scanner.scan(content) # Library handles splitting into lines
# Example 3: Simple list of domains
domains = [
'example1.com',
'example2.com',
'example3.com'
]
await scanner.scan(domains)
if __name__ == '__main__':
asyncio.run(scan_domains())
```
The scanner will return results in this format:
```python
{
'domain': 'example.com', # Base domain
'url': 'https://example.com', # Full URL
'status': 200, # HTTP status code
'port': 443, # Port number
'title': 'Example Domain', # Page title
'body': 'Example body text...', # Body preview
'content_type': 'text/html', # Content type
'content_length': '12345', # Content length
'ips': ['93.184.216.34'], # IP addresses
'cname': 'cdn.example.com', # CNAME record
'nameservers': ['ns1.example.com'],# Nameservers
'favicon_hash': '123456789', # Favicon hash
'headers': { # Response headers
'Server': 'nginx',
'Content-Type': 'text/html'
},
'redirect_chain': [ # Redirect history
'http://example.com',
'https://example.com'
],
'tls': { # TLS certificate info
'fingerprint': 'sha256...',
'common_name': 'example.com',
'issuer': 'Let\'s Encrypt',
'alt_names': ['www.example.com'],
'not_before': '2023-01-01T00:00:00',
'not_after': '2024-01-01T00:00:00',
'version': 3,
'serial_number': 'abcdef1234'
}
}
```
The scanner accepts various input types:
- Async/sync generators that yield domains
- String content with newlines
- Lists/tuples of domains
- File paths
- stdin (using '-')
All inputs support sharding for distributed scanning.
## Arguments
| Argument | Long Form | Description |
|-----------|------------------|-------------------------------------------------------------|
| `file` | - | File containing domains *(one per line)*, use `-` for stdin |
|---------------|------------------|-------------------------------------------------------------|
| `file` | | File containing domains *(one per line)*, use `-` for stdin |
| `-d` | `--debug` | Show error states and debug information |
| `-c N` | `--concurrent N` | Number of concurrent checks *(default: 100)* |
| `-o FILE` | `--output FILE` | Output file path *(JSONL format)* |
| `-j` | `--jsonl` | Output JSON Lines format to console |
| `-all` | `--all-flags` | Enable all output flags |
| `-sh` | `--shard N/T` | Process shard N of T total shards *(e.g., 1/3)* |
### Output Field Flags

View File

@ -11,6 +11,7 @@ import sys
from .colors import Colors
from .scanner import HTTPZScanner
from .utils import SILENT_MODE, info
from .parsers import parse_status_codes, parse_shard
def setup_logging(level='INFO', log_to_disk=False):
'''
@ -49,25 +50,6 @@ def setup_logging(level='INFO', log_to_disk=False):
handlers=handlers
)
def parse_status_codes(codes_str: str) -> set:
    '''
    Parse comma-separated status codes and ranges into a set of integers

    :param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503")
    '''

    parsed = set()
    try:
        for token in codes_str.split(','):
            if '-' not in token:
                # Single status code
                parsed.add(int(token))
            else:
                # Inclusive range like "301-399"
                low, high = map(int, token.split('-'))
                parsed.update(range(low, high + 1))
        return parsed
    except ValueError:
        raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)')
async def main():
parser = argparse.ArgumentParser(
description=f'{Colors.GREEN}Hyper-fast HTTP Scraping Tool{Colors.RESET}',
@ -103,6 +85,9 @@ async def main():
parser.add_argument('-r', '--resolvers', help='File containing DNS resolvers (one per line)')
parser.add_argument('-to', '--timeout', type=int, default=5, help='Request timeout in seconds')
# Add shard argument
parser.add_argument('-sh','--shard', type=parse_shard, help='Shard index and total shards (e.g., 1/3)')
# If no arguments provided, print help and exit
if len(sys.argv) == 1:
parser.print_help()
@ -158,7 +143,8 @@ async def main():
jsonl_output=args.jsonl,
show_fields=show_fields,
match_codes=args.match_codes,
exclude_codes=args.exclude_codes
exclude_codes=args.exclude_codes,
shard=args.shard
)
# Run the scanner with file/stdin input

View File

@ -20,6 +20,7 @@ except ImportError:
raise ImportError('missing mmh3 module (pip install mmh3)')
from .utils import debug, error
import argparse
def parse_domain_url(domain: str) -> tuple:
@ -108,6 +109,7 @@ async def get_favicon_hash(session, base_url: str, html: str) -> str:
:param base_url: base URL of the website
:param html: HTML content of the page
'''
try:
soup = bs4.BeautifulSoup(html, 'html.parser')
@ -138,3 +140,38 @@ async def get_favicon_hash(session, base_url: str, html: str) -> str:
debug(f'Error getting favicon for {base_url}: {str(e)}')
return None
def parse_status_codes(codes_str: str) -> set:
    '''
    Parse comma-separated status codes and ranges into a set of integers

    :param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503")
    '''

    result = set()
    try:
        for part in codes_str.split(','):
            if '-' in part:
                # Range entry: expand the inclusive span
                start_str, end_str = part.split('-')
                result.update(range(int(start_str), int(end_str) + 1))
            else:
                # Plain single code
                result.add(int(part))
        return result
    except ValueError:
        raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)')
def parse_shard(shard_str: str) -> tuple:
    '''
    Parse shard argument in format INDEX/TOTAL

    :param shard_str: Shard string in format "INDEX/TOTAL"
    '''

    try:
        index_part, total_part = shard_str.split('/')
        index, total = int(index_part), int(total_part)
        # Index must fall inside 1..total (1-based on the command line)
        if not (1 <= index <= total):
            raise ValueError
        # Callers expect a 0-based shard index
        return index - 1, total
    except (ValueError, TypeError):
        raise argparse.ArgumentTypeError('Shard must be in format INDEX/TOTAL where INDEX <= TOTAL')

View File

@ -27,7 +27,7 @@ from .utils import debug, info, USER_AGENTS, input_generator
class HTTPZScanner:
'''Core scanner class for HTTP domain checking'''
def __init__(self, concurrent_limit = 100, timeout = 5, follow_redirects = False, check_axfr = False, resolver_file = None, output_file = None, show_progress = False, debug_mode = False, jsonl_output = False, show_fields = None, match_codes = None, exclude_codes = None):
def __init__(self, concurrent_limit = 100, timeout = 5, follow_redirects = False, check_axfr = False, resolver_file = None, output_file = None, show_progress = False, debug_mode = False, jsonl_output = False, show_fields = None, match_codes = None, exclude_codes = None, shard = None):
'''
Initialize the HTTPZScanner class
@ -43,6 +43,7 @@ class HTTPZScanner:
:param show_fields: Fields to show
:param match_codes: Status codes to match
:param exclude_codes: Status codes to exclude
:param shard: Tuple of (shard_index, total_shards) for distributed scanning
'''
self.concurrent_limit = concurrent_limit
@ -54,6 +55,7 @@ class HTTPZScanner:
self.show_progress = show_progress
self.debug_mode = debug_mode
self.jsonl_output = jsonl_output
self.shard = shard
self.show_fields = show_fields or {
'status_code' : True,
@ -218,8 +220,8 @@ class HTTPZScanner:
async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
tasks = set()
# Process domains with concurrent limit
for domain in input_generator(input_source):
# Pass shard info to input_generator
for domain in input_generator(input_source, self.shard):
if len(tasks) >= self.concurrent_limit:
done, tasks = await asyncio.wait(
tasks, return_when=asyncio.FIRST_COMPLETED

View File

@ -3,6 +3,7 @@
# httpz_scanner/utils.py
import logging
import os
import sys
@ -97,19 +98,54 @@ def human_size(size_bytes: int) -> str:
return f'{size:.1f}{units[unit_index]}'
def input_generator(input_source: str):
def input_generator(input_source, shard: tuple = None):
    '''
    Generator function to yield domains from various input sources with optional sharding

    The strip / empty-line skip / shard-filter logic is shared by every source type,
    so the source branches only produce raw lines and a single loop applies the
    common filtering (the original repeated that logic in all four branches).

    :param input_source: Can be:
                        - string path to local file
                        - "-" for stdin
                        - list/tuple of domains
                        - generator/iterator yielding domains (str or bytes)
                        - string/bytes content with newlines
    :param shard: Tuple of (shard_index, total_shards) for distributed scanning
    '''

    def _raw_lines():
        '''Yield raw, unfiltered lines from whichever input type was supplied'''
        # Handle stdin ('-' marker, or no source given at all)
        if input_source == '-' or input_source is None:
            yield from sys.stdin
        # Handle a path to an existing local file
        elif isinstance(input_source, str) and os.path.exists(input_source):
            with open(input_source, 'r') as f:
                yield from f
        # Handle iterables (generators, lists, tuples, ...) but not raw strings
        elif hasattr(input_source, '__iter__') and not isinstance(input_source, (str, bytes)):
            yield from input_source
        # Handle string/bytes content with embedded newlines
        elif isinstance(input_source, (str, bytes)):
            content = input_source.decode() if isinstance(input_source, bytes) else input_source
            yield from content.splitlines()

    line_num = 0  # counts only non-empty lines so shards stay evenly balanced
    for raw in _raw_lines():
        # Iterables may yield bytes (e.g. streamed HTTP/S3 chunks) - normalize to str
        if isinstance(raw, bytes):
            raw = raw.decode()
        if line := raw.strip():
            # When sharding, yield only lines assigned to this shard (round-robin)
            if shard is None or line_num % shard[1] == shard[0]:
                yield line
            line_num += 1