diff --git a/README.md b/README.md index 65939b6..0496c04 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ A high-performance concurrent web scanner written in Python. HTTPZ efficiently s ## Installation -### Via pip (recommended) +### Via pip *(recommended)* ```bash # Install from PyPI pip install httpz_scanner @@ -68,104 +68,136 @@ Full scan with all options: python -m httpz_scanner domains.txt -c 100 -o output.jsonl -j -all -to 10 -mc 200,301 -ec 404,500 -p -ax -r resolvers.txt ``` +### Distributed Scanning +Split scanning across multiple machines using the `--shard` argument: + +```bash +# Machine 1 +httpz domains.txt --shard 1/3 + +# Machine 2 +httpz domains.txt --shard 2/3 + +# Machine 3 +httpz domains.txt --shard 3/3 +``` + +Each machine will process a different subset of domains without overlap. For example, with 3 shards: +- Machine 1 processes lines 0,3,6,9,... +- Machine 2 processes lines 1,4,7,10,... +- Machine 3 processes lines 2,5,8,11,... + +This allows efficient distribution of large scans across multiple machines. + ### Python Library ```python import asyncio +import aiohttp +import aioboto3 from httpz_scanner import HTTPZScanner async def scan_domains(): # Initialize scanner with all possible options (showing defaults) scanner = HTTPZScanner( # Core settings - concurrent_limit=100, # Number of concurrent requests + concurrent_limit=100, # Number of concurrent requests timeout=5, # Request timeout in seconds - follow_redirects=False, # Follow redirects (max 10) + follow_redirects=False, # Follow redirects (max 10) check_axfr=False, # Try AXFR transfer against nameservers resolver_file=None, # Path to custom DNS resolvers file output_file=None, # Path to JSONL output file show_progress=False, # Show progress counter debug_mode=False, # Show error states and debug info jsonl_output=False, # Output in JSONL format + shard=None, # Tuple of (shard_index, total_shards) for distributed scanning # Control which fields to show (all False by default unless show_fields is None) show_fields={ - 'status_code': True, # Show status code - 'content_type': True, # Show content type - 'content_length': True, # Show content length + 'status_code': True, # Show status code + 'content_type': True, # Show content type + 'content_length': True, # Show content length 'title': True, # Show page title 'body': True, # Show body preview 'ip': True, # Show IP addresses 'favicon': True, # Show favicon hash 'headers': True, # Show response headers - 'follow_redirects': True, # Show redirect chain + 'follow_redirects': True, # Show redirect chain 'cname': True, # Show CNAME records 'tls': True # Show TLS certificate info }, # Filter results - match_codes={200, 301, 302}, # Only show these status codes - exclude_codes={404, 500, 503} # Exclude these status codes + match_codes={200,301,302}, # Only show these status codes + exclude_codes={404,500,503} # Exclude these status codes ) # Initialize resolvers (required before scanning) await scanner.init() - # Scan domains from file - await scanner.scan('domains.txt') - - # Or scan from stdin - await scanner.scan('-') + # Example 1: Stream from S3/MinIO using aioboto3 + async with aioboto3.Session().client('s3', + endpoint_url='http://minio.example.com:9000', + aws_access_key_id='access_key', + aws_secret_access_key='secret_key') as s3: + + response = await s3.get_object(Bucket='my-bucket', Key='huge-domains.txt') + async with response['Body'] as stream: + async def s3_generator(): + while True: + line = await stream.readline() + if not line: + break + yield line.decode().strip() + + await scanner.scan(s3_generator()) + + # Example 2: Stream from URL using aiohttp + async with aiohttp.ClientSession() as session: + # For large files - stream line by line + async with session.get('https://example.com/huge-domains.txt') as resp: + async def url_generator(): + async for line in resp.content: + yield line.decode().strip() + + await scanner.scan(url_generator()) + + # For small files - read all at once + async with session.get('https://example.com/small-domains.txt') as resp: + content = await resp.text() + await scanner.scan(content) # Library handles splitting into lines + + # Example 3: Simple list of domains + domains = [ + 'example1.com', + 'example2.com', + 'example3.com' + ] + await scanner.scan(domains) if __name__ == '__main__': asyncio.run(scan_domains()) ``` -The scanner will return results in this format: -```python -{ - 'domain': 'example.com', # Base domain - 'url': 'https://example.com', # Full URL - 'status': 200, # HTTP status code - 'port': 443, # Port number - 'title': 'Example Domain', # Page title - 'body': 'Example body text...', # Body preview - 'content_type': 'text/html', # Content type - 'content_length': '12345', # Content length - 'ips': ['93.184.216.34'], # IP addresses - 'cname': 'cdn.example.com', # CNAME record - 'nameservers': ['ns1.example.com'],# Nameservers - 'favicon_hash': '123456789', # Favicon hash - 'headers': { # Response headers - 'Server': 'nginx', - 'Content-Type': 'text/html' - }, - 'redirect_chain': [ # Redirect history - 'http://example.com', - 'https://example.com' - ], - 'tls': { # TLS certificate info - 'fingerprint': 'sha256...', - 'common_name': 'example.com', - 'issuer': 'Let\'s Encrypt', - 'alt_names': ['www.example.com'], - 'not_before': '2023-01-01T00:00:00', - 'not_after': '2024-01-01T00:00:00', - 'version': 3, - 'serial_number': 'abcdef1234' - } -} -``` +The scanner accepts various input types: +- Async/sync generators that yield domains +- String content with newlines +- Lists/tuples of domains +- File paths +- stdin (using '-') + +All inputs support sharding for distributed scanning. ## Arguments -| Argument | Long Form | Description | -|-----------|------------------|-------------------------------------------------------------| -| `file` | - | File containing domains *(one per line)*, use `-` for stdin | -| `-d` | `--debug` | Show error states and debug information | -| `-c N` | `--concurrent N` | Number of concurrent checks *(default: 100)* | -| `-o FILE` | `--output FILE` | Output file path *(JSONL format)* | -| `-j` | `--jsonl` | Output JSON Lines format to console | -| `-all` | `--all-flags` | Enable all output flags | +| Argument | Long Form | Description | +|---------------|------------------|-------------------------------------------------------------| +| `file` | | File containing domains *(one per line)*, use `-` for stdin | +| `-d` | `--debug` | Show error states and debug information | +| `-c N` | `--concurrent N` | Number of concurrent checks *(default: 100)* | +| `-o FILE` | `--output FILE` | Output file path *(JSONL format)* | +| `-j` | `--jsonl` | Output JSON Lines format to console | +| `-all` | `--all-flags` | Enable all output flags | +| `-sh` | `--shard N/T` | Process shard N of T total shards *(e.g., 1/3)* | ### Output Field Flags @@ -191,5 +223,5 @@ The scanner will return results in this format: | `-mc CODES` | `--match-codes CODES` | Only show specific status codes *(comma-separated)* | | `-ec CODES` | `--exclude-codes CODES` | Exclude specific status codes *(comma-separated)* | | `-p` | `--progress` | Show progress counter | -| `-ax` | `--axfr` | Try AXFR transfer against nameservers | -| `-r FILE` | `--resolvers FILE` | File containing DNS resolvers *(one per line)* | \ No newline at end of file +| `-ax` | `--axfr` | Try AXFR transfer against nameservers | +| `-r FILE` | `--resolvers FILE` | File containing DNS resolvers *(one per line)* | \ No newline at end of file diff --git a/httpz_scanner/cli.py b/httpz_scanner/cli.py index a4a3b9f..bb0f398 100644 --- a/httpz_scanner/cli.py +++ b/httpz_scanner/cli.py @@ -11,6 +11,7 @@ import sys from .colors import Colors from .scanner import HTTPZScanner from .utils import SILENT_MODE, info +from .parsers import parse_status_codes, parse_shard def setup_logging(level='INFO', log_to_disk=False): ''' @@ -49,25 +50,6 @@ def setup_logging(level='INFO', log_to_disk=False): handlers=handlers ) -def parse_status_codes(codes_str: str) -> set: - ''' - Parse comma-separated status codes and ranges into a set of integers - - :param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503") - ''' - - codes = set() - try: - for part in codes_str.split(','): - if '-' in part: - start, end = map(int, part.split('-')) - codes.update(range(start, end + 1)) - else: - codes.add(int(part)) - return codes - except ValueError: - raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)') - async def main(): parser = argparse.ArgumentParser( description=f'{Colors.GREEN}Hyper-fast HTTP Scraping Tool{Colors.RESET}', @@ -103,6 +85,9 @@ async def main(): parser.add_argument('-r', '--resolvers', help='File containing DNS resolvers (one per line)') parser.add_argument('-to', '--timeout', type=int, default=5, help='Request timeout in seconds') + # Add shard argument + parser.add_argument('-sh','--shard', type=parse_shard, help='Shard index and total shards (e.g., 1/3)') + # If no arguments provided, print help and exit if len(sys.argv) == 1: parser.print_help() @@ -158,7 +143,8 @@ async def main(): jsonl_output=args.jsonl, show_fields=show_fields, match_codes=args.match_codes, - exclude_codes=args.exclude_codes + exclude_codes=args.exclude_codes, + shard=args.shard ) # Run the scanner with file/stdin input diff --git a/httpz_scanner/parsers.py b/httpz_scanner/parsers.py index 387efe0..e757647 100644 --- a/httpz_scanner/parsers.py +++ b/httpz_scanner/parsers.py @@ -20,6 +20,7 @@ except ImportError: raise ImportError('missing mmh3 module (pip install mmh3)') from .utils import debug, error +import argparse def parse_domain_url(domain: str) -> tuple: @@ -108,6 +109,7 @@ async def get_favicon_hash(session, base_url: str, html: str) -> str: :param base_url: base URL of the website :param html: HTML content of the page ''' + try: soup = bs4.BeautifulSoup(html, 'html.parser') @@ -137,4 +139,39 @@ async def get_favicon_hash(session, base_url: str, html: str) -> str: except Exception as e: debug(f'Error getting favicon for {base_url}: {str(e)}') - return None \ No newline at end of file + return None + +def parse_status_codes(codes_str: str) -> set: + ''' + Parse comma-separated status codes and ranges into a set of integers + + :param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503") + ''' + + codes = set() + try: + for part in codes_str.split(','): + if '-' in part: + start, end = map(int, part.split('-')) + codes.update(range(start, end + 1)) + else: + codes.add(int(part)) + return codes + except ValueError: + raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)') + + +def parse_shard(shard_str: str) -> tuple: + ''' + Parse shard argument in format INDEX/TOTAL + + :param shard_str: Shard string in format "INDEX/TOTAL" + ''' + + try: + shard_index, total_shards = map(int, shard_str.split('/')) + if shard_index < 1 or total_shards < 1 or shard_index > total_shards: + raise ValueError + return shard_index - 1, total_shards # Convert to 0-based index + except (ValueError, TypeError): + raise argparse.ArgumentTypeError('Shard must be in format INDEX/TOTAL where INDEX <= TOTAL') \ No newline at end of file diff --git a/httpz_scanner/scanner.py b/httpz_scanner/scanner.py index c64a890..9aac1ba 100644 --- a/httpz_scanner/scanner.py +++ b/httpz_scanner/scanner.py @@ -27,7 +27,7 @@ from .utils import debug, info, USER_AGENTS, input_generator class HTTPZScanner: '''Core scanner class for HTTP domain checking''' - def __init__(self, concurrent_limit = 100, timeout = 5, follow_redirects = False, check_axfr = False, resolver_file = None, output_file = None, show_progress = False, debug_mode = False, jsonl_output = False, show_fields = None, match_codes = None, exclude_codes = None): + def __init__(self, concurrent_limit = 100, timeout = 5, follow_redirects = False, check_axfr = False, resolver_file = None, output_file = None, show_progress = False, debug_mode = False, jsonl_output = False, show_fields = None, match_codes = None, exclude_codes = None, shard = None): ''' Initialize the HTTPZScanner class @@ -43,6 +43,7 @@ class HTTPZScanner: :param show_fields: Fields to show :param match_codes: Status codes to match :param exclude_codes: Status codes to exclude + :param shard: Tuple of (shard_index, total_shards) for distributed scanning ''' self.concurrent_limit = concurrent_limit @@ -54,6 +55,7 @@ class HTTPZScanner: self.show_progress = show_progress self.debug_mode = debug_mode self.jsonl_output = jsonl_output + self.shard = shard self.show_fields = show_fields or { 'status_code' : True, @@ -218,8 +220,8 @@ class HTTPZScanner: async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: tasks = set() - # Process domains with concurrent limit - for domain in input_generator(input_source): + # Pass shard info to input_generator + for domain in input_generator(input_source, self.shard): if len(tasks) >= self.concurrent_limit: done, tasks = await asyncio.wait( tasks, return_when=asyncio.FIRST_COMPLETED diff --git a/httpz_scanner/utils.py b/httpz_scanner/utils.py index f7266cb..918e3b1 100644 --- a/httpz_scanner/utils.py +++ b/httpz_scanner/utils.py @@ -3,6 +3,7 @@ # httpz_scanner/utils.py import logging +import os import sys @@ -97,19 +98,54 @@ def human_size(size_bytes: int) -> str: return f'{size:.1f}{units[unit_index]}' -def input_generator(input_source: str): +def input_generator(input_source, shard: tuple = None): ''' - Generator function to yield domains from file or stdin + Generator function to yield domains from various input sources with optional sharding - :param input_source: file or stdin + :param input_source: Can be: + - string path to local file + - "-" for stdin + - list/tuple of domains + - generator/iterator yielding domains + - string content with newlines + :param shard: Tuple of (shard_index, total_shards) for distributed scanning ''' + line_num = 0 + + # Handle stdin if input_source == '-' or input_source is None: for line in sys.stdin: - if line.strip(): - yield line.strip() - else: + if line := line.strip(): + if shard is None or line_num % shard[1] == shard[0]: + yield line + line_num += 1 + + # Handle local files + elif isinstance(input_source, str) and os.path.exists(input_source): with open(input_source, 'r') as f: for line in f: - if line.strip(): - yield line.strip() \ No newline at end of file + if line := line.strip(): + if shard is None or line_num % shard[1] == shard[0]: + yield line + line_num += 1 + + # Handle iterables (generators, lists, etc) + elif hasattr(input_source, '__iter__') and not isinstance(input_source, (str, bytes)): + for line in input_source: + if isinstance(line, bytes): + line = line.decode() + if line := line.strip(): + if shard is None or line_num % shard[1] == shard[0]: + yield line + line_num += 1 + + # Handle string content with newlines + elif isinstance(input_source, (str, bytes)): + if isinstance(input_source, bytes): + input_source = input_source.decode() + for line in input_source.splitlines(): + if line := line.strip(): + if shard is None or line_num % shard[1] == shard[0]: + yield line + line_num += 1 \ No newline at end of file