Allow any form of input for scanning
This commit is contained in:
parent
a6fc596547
commit
e27e5e4095
116
README.md
116
README.md
@ -16,7 +16,7 @@ A high-performance concurrent web scanner written in Python. HTTPZ efficiently s
|
||||
|
||||
## Installation
|
||||
|
||||
### Via pip (recommended)
|
||||
### Via pip *(recommended)*
|
||||
```bash
|
||||
# Install from PyPI
|
||||
pip install httpz_scanner
|
||||
@ -68,9 +68,32 @@ Full scan with all options:
|
||||
python -m httpz_scanner domains.txt -c 100 -o output.jsonl -j -all -to 10 -mc 200,301 -ec 404,500 -p -ax -r resolvers.txt
|
||||
```
|
||||
|
||||
### Distributed Scanning
|
||||
Split scanning across multiple machines using the `--shard` argument:
|
||||
|
||||
```bash
|
||||
# Machine 1
|
||||
httpz domains.txt --shard 1/3
|
||||
|
||||
# Machine 2
|
||||
httpz domains.txt --shard 2/3
|
||||
|
||||
# Machine 3
|
||||
httpz domains.txt --shard 3/3
|
||||
```
|
||||
|
||||
Each machine will process a different subset of domains without overlap. For example, with 3 shards:
|
||||
- Machine 1 processes lines 0,3,6,9,...
|
||||
- Machine 2 processes lines 1,4,7,10,...
|
||||
- Machine 3 processes lines 2,5,8,11,...
|
||||
|
||||
This allows efficient distribution of large scans across multiple machines.
|
||||
|
||||
### Python Library
|
||||
```python
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import aioboto3
|
||||
from httpz_scanner import HTTPZScanner
|
||||
|
||||
async def scan_domains():
|
||||
@ -86,6 +109,7 @@ async def scan_domains():
|
||||
show_progress=False, # Show progress counter
|
||||
debug_mode=False, # Show error states and debug info
|
||||
jsonl_output=False, # Output in JSONL format
|
||||
shard=None, # Tuple of (shard_index, total_shards) for distributed scanning
|
||||
|
||||
# Control which fields to show (all False by default unless show_fields is None)
|
||||
show_fields={
|
||||
@ -110,62 +134,70 @@ async def scan_domains():
|
||||
# Initialize resolvers (required before scanning)
|
||||
await scanner.init()
|
||||
|
||||
# Scan domains from file
|
||||
await scanner.scan('domains.txt')
|
||||
# Example 1: Stream from S3/MinIO using aioboto3
|
||||
async with aioboto3.Session().client('s3',
|
||||
endpoint_url='http://minio.example.com:9000',
|
||||
aws_access_key_id='access_key',
|
||||
aws_secret_access_key='secret_key') as s3:
|
||||
|
||||
# Or scan from stdin
|
||||
await scanner.scan('-')
|
||||
response = await s3.get_object(Bucket='my-bucket', Key='huge-domains.txt')
|
||||
async with response['Body'] as stream:
|
||||
async def s3_generator():
|
||||
while True:
|
||||
line = await stream.readline()
|
||||
if not line:
|
||||
break
|
||||
yield line.decode().strip()
|
||||
|
||||
await scanner.scan(s3_generator())
|
||||
|
||||
# Example 2: Stream from URL using aiohttp
|
||||
async with aiohttp.ClientSession() as session:
|
||||
# For large files - stream line by line
|
||||
async with session.get('https://example.com/huge-domains.txt') as resp:
|
||||
async def url_generator():
|
||||
async for line in resp.content:
|
||||
yield line.decode().strip()
|
||||
|
||||
await scanner.scan(url_generator())
|
||||
|
||||
# For small files - read all at once
|
||||
async with session.get('https://example.com/small-domains.txt') as resp:
|
||||
content = await resp.text()
|
||||
await scanner.scan(content) # Library handles splitting into lines
|
||||
|
||||
# Example 3: Simple list of domains
|
||||
domains = [
|
||||
'example1.com',
|
||||
'example2.com',
|
||||
'example3.com'
|
||||
]
|
||||
await scanner.scan(domains)
|
||||
|
||||
if __name__ == '__main__':
|
||||
asyncio.run(scan_domains())
|
||||
```
|
||||
|
||||
The scanner will return results in this format:
|
||||
```python
|
||||
{
|
||||
'domain': 'example.com', # Base domain
|
||||
'url': 'https://example.com', # Full URL
|
||||
'status': 200, # HTTP status code
|
||||
'port': 443, # Port number
|
||||
'title': 'Example Domain', # Page title
|
||||
'body': 'Example body text...', # Body preview
|
||||
'content_type': 'text/html', # Content type
|
||||
'content_length': '12345', # Content length
|
||||
'ips': ['93.184.216.34'], # IP addresses
|
||||
'cname': 'cdn.example.com', # CNAME record
|
||||
'nameservers': ['ns1.example.com'],# Nameservers
|
||||
'favicon_hash': '123456789', # Favicon hash
|
||||
'headers': { # Response headers
|
||||
'Server': 'nginx',
|
||||
'Content-Type': 'text/html'
|
||||
},
|
||||
'redirect_chain': [ # Redirect history
|
||||
'http://example.com',
|
||||
'https://example.com'
|
||||
],
|
||||
'tls': { # TLS certificate info
|
||||
'fingerprint': 'sha256...',
|
||||
'common_name': 'example.com',
|
||||
'issuer': 'Let\'s Encrypt',
|
||||
'alt_names': ['www.example.com'],
|
||||
'not_before': '2023-01-01T00:00:00',
|
||||
'not_after': '2024-01-01T00:00:00',
|
||||
'version': 3,
|
||||
'serial_number': 'abcdef1234'
|
||||
}
|
||||
}
|
||||
```
|
||||
The scanner accepts various input types:
|
||||
- Async/sync generators that yield domains
|
||||
- String content with newlines
|
||||
- Lists/tuples of domains
|
||||
- File paths
|
||||
- stdin (using '-')
|
||||
|
||||
All inputs support sharding for distributed scanning.
|
||||
|
||||
## Arguments
|
||||
|
||||
| Argument | Long Form | Description |
|
||||
|-----------|------------------|-------------------------------------------------------------|
|
||||
| `file` | - | File containing domains *(one per line)*, use `-` for stdin |
|
||||
|---------------|------------------|-------------------------------------------------------------|
|
||||
| `file` | | File containing domains *(one per line)*, use `-` for stdin |
|
||||
| `-d` | `--debug` | Show error states and debug information |
|
||||
| `-c N` | `--concurrent N` | Number of concurrent checks *(default: 100)* |
|
||||
| `-o FILE` | `--output FILE` | Output file path *(JSONL format)* |
|
||||
| `-j` | `--jsonl` | Output JSON Lines format to console |
|
||||
| `-all` | `--all-flags` | Enable all output flags |
|
||||
| `-sh` | `--shard N/T` | Process shard N of T total shards *(e.g., 1/3)* |
|
||||
|
||||
### Output Field Flags
|
||||
|
||||
|
@ -11,6 +11,7 @@ import sys
|
||||
from .colors import Colors
|
||||
from .scanner import HTTPZScanner
|
||||
from .utils import SILENT_MODE, info
|
||||
from .parsers import parse_status_codes, parse_shard
|
||||
|
||||
def setup_logging(level='INFO', log_to_disk=False):
|
||||
'''
|
||||
@ -49,25 +50,6 @@ def setup_logging(level='INFO', log_to_disk=False):
|
||||
handlers=handlers
|
||||
)
|
||||
|
||||
def parse_status_codes(codes_str: str) -> set:
    '''
    Expand a comma-separated list of status codes and inclusive ranges into a set of integers

    :param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503")
    '''

    parsed = set()

    try:
        for token in codes_str.split(','):
            # Plain single code
            if '-' not in token:
                parsed.add(int(token))
                continue

            # Inclusive range written as START-END
            low, high = (int(bound) for bound in token.split('-'))
            parsed |= set(range(low, high + 1))
    except ValueError:
        raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)')

    return parsed
|
||||
|
||||
async def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description=f'{Colors.GREEN}Hyper-fast HTTP Scraping Tool{Colors.RESET}',
|
||||
@ -103,6 +85,9 @@ async def main():
|
||||
parser.add_argument('-r', '--resolvers', help='File containing DNS resolvers (one per line)')
|
||||
parser.add_argument('-to', '--timeout', type=int, default=5, help='Request timeout in seconds')
|
||||
|
||||
# Add shard argument
|
||||
parser.add_argument('-sh','--shard', type=parse_shard, help='Shard index and total shards (e.g., 1/3)')
|
||||
|
||||
# If no arguments provided, print help and exit
|
||||
if len(sys.argv) == 1:
|
||||
parser.print_help()
|
||||
@ -158,7 +143,8 @@ async def main():
|
||||
jsonl_output=args.jsonl,
|
||||
show_fields=show_fields,
|
||||
match_codes=args.match_codes,
|
||||
exclude_codes=args.exclude_codes
|
||||
exclude_codes=args.exclude_codes,
|
||||
shard=args.shard
|
||||
)
|
||||
|
||||
# Run the scanner with file/stdin input
|
||||
|
@ -20,6 +20,7 @@ except ImportError:
|
||||
raise ImportError('missing mmh3 module (pip install mmh3)')
|
||||
|
||||
from .utils import debug, error
|
||||
import argparse
|
||||
|
||||
|
||||
def parse_domain_url(domain: str) -> tuple:
|
||||
@ -108,6 +109,7 @@ async def get_favicon_hash(session, base_url: str, html: str) -> str:
|
||||
:param base_url: base URL of the website
|
||||
:param html: HTML content of the page
|
||||
'''
|
||||
|
||||
try:
|
||||
soup = bs4.BeautifulSoup(html, 'html.parser')
|
||||
|
||||
@ -138,3 +140,38 @@ async def get_favicon_hash(session, base_url: str, html: str) -> str:
|
||||
debug(f'Error getting favicon for {base_url}: {str(e)}')
|
||||
|
||||
return None
|
||||
|
||||
def parse_status_codes(codes_str: str) -> set:
    '''
    Parse comma-separated status codes and ranges into a set of integers

    :param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503")
    :return: Set containing every code listed directly or covered by an inclusive range
    :raises argparse.ArgumentTypeError: If a part is not an integer, a range is malformed, or a range is reversed
    '''

    codes = set()

    try:
        for part in codes_str.split(','):
            if '-' in part:
                # Unpacking raises ValueError when the part is not exactly START-END
                start, end = map(int, part.split('-'))

                # A reversed range would expand to an empty range and be silently dropped
                if start > end:
                    raise ValueError(f'reversed range: {part}')

                codes.update(range(start, end + 1))
            else:
                codes.add(int(part))
    except ValueError as e:
        raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)') from e

    return codes
|
||||
|
||||
|
||||
def parse_shard(shard_str: str) -> tuple:
    '''
    Convert a 1-based "INDEX/TOTAL" shard argument into a (0-based index, total) tuple

    :param shard_str: Shard string in format "INDEX/TOTAL"
    '''

    try:
        pieces = [int(piece) for piece in shard_str.split('/')]

        # Unpacking fails with ValueError unless there are exactly two numbers
        index, total = pieces

        if not (1 <= index <= total):
            raise ValueError
    except (ValueError, TypeError):
        raise argparse.ArgumentTypeError('Shard must be in format INDEX/TOTAL where INDEX <= TOTAL')

    # Convert to 0-based index
    return index - 1, total
|
@ -27,7 +27,7 @@ from .utils import debug, info, USER_AGENTS, input_generator
|
||||
class HTTPZScanner:
|
||||
'''Core scanner class for HTTP domain checking'''
|
||||
|
||||
def __init__(self, concurrent_limit = 100, timeout = 5, follow_redirects = False, check_axfr = False, resolver_file = None, output_file = None, show_progress = False, debug_mode = False, jsonl_output = False, show_fields = None, match_codes = None, exclude_codes = None):
|
||||
def __init__(self, concurrent_limit = 100, timeout = 5, follow_redirects = False, check_axfr = False, resolver_file = None, output_file = None, show_progress = False, debug_mode = False, jsonl_output = False, show_fields = None, match_codes = None, exclude_codes = None, shard = None):
|
||||
'''
|
||||
Initialize the HTTPZScanner class
|
||||
|
||||
@ -43,6 +43,7 @@ class HTTPZScanner:
|
||||
:param show_fields: Fields to show
|
||||
:param match_codes: Status codes to match
|
||||
:param exclude_codes: Status codes to exclude
|
||||
:param shard: Tuple of (shard_index, total_shards) for distributed scanning
|
||||
'''
|
||||
|
||||
self.concurrent_limit = concurrent_limit
|
||||
@ -54,6 +55,7 @@ class HTTPZScanner:
|
||||
self.show_progress = show_progress
|
||||
self.debug_mode = debug_mode
|
||||
self.jsonl_output = jsonl_output
|
||||
self.shard = shard
|
||||
|
||||
self.show_fields = show_fields or {
|
||||
'status_code' : True,
|
||||
@ -218,8 +220,8 @@ class HTTPZScanner:
|
||||
async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
|
||||
tasks = set()
|
||||
|
||||
# Process domains with concurrent limit
|
||||
for domain in input_generator(input_source):
|
||||
# Pass shard info to input_generator
|
||||
for domain in input_generator(input_source, self.shard):
|
||||
if len(tasks) >= self.concurrent_limit:
|
||||
done, tasks = await asyncio.wait(
|
||||
tasks, return_when=asyncio.FIRST_COMPLETED
|
||||
|
@ -3,6 +3,7 @@
|
||||
# httpz_scanner/utils.py
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
@ -97,19 +98,54 @@ def human_size(size_bytes: int) -> str:
|
||||
return f'{size:.1f}{units[unit_index]}'
|
||||
|
||||
|
||||
def input_generator(input_source: str):
|
||||
def input_generator(input_source, shard: tuple = None):
    '''
    Generator function to yield domains from various input sources with optional sharding

    :param input_source: Can be:
        - string path to local file
        - "-" (or None) for stdin
        - list/tuple of domains
        - generator/iterator yielding domains
        - string (or bytes) content with newlines
    :param shard: Tuple of (shard_index, total_shards) for distributed scanning
    '''

    def select(lines):
        '''Strip each line, drop blanks and keep only the lines that belong to this shard'''

        # Only non-blank lines are numbered, so blank lines never shift the shard assignment
        line_num = 0

        for line in lines:
            if isinstance(line, bytes):
                line = line.decode()
            if line := line.strip():
                if shard is None or line_num % shard[1] == shard[0]:
                    yield line
                line_num += 1

    # Handle stdin
    if input_source == '-' or input_source is None:
        yield from select(sys.stdin)

    # Handle local files
    elif isinstance(input_source, str) and os.path.exists(input_source):
        with open(input_source, 'r') as f:
            yield from select(f)

    # Handle iterables (generators, lists, etc)
    elif hasattr(input_source, '__iter__') and not isinstance(input_source, (str, bytes)):
        yield from select(input_source)

    # Handle string content with newlines
    elif isinstance(input_source, (str, bytes)):
        if isinstance(input_source, bytes):
            input_source = input_source.decode()
        yield from select(input_source.splitlines())
|
Loading…
Reference in New Issue
Block a user