diff --git a/README.md b/README.md
index df3e071..d905514 100644
--- a/README.md
+++ b/README.md
@@ -1,24 +1,74 @@
-# HTTP-Z
-###### This is still a work in progress...stay tuned for updates!
+# HTTPZ Web Scanner
 
-## Information
-This script is developed as a robust alternative to HTTPX, addressing the limitations in customizing JSON outputs and other functionalities that HTTPX lacks. It is specifically designed for asynchronous lookups on a list of domains, efficiently gathering DNS information and web content details such as page titles and body previews.
+A high-performance concurrent web scanner written in Python. HTTPZ efficiently scans domains for HTTP/HTTPS services, extracting status codes, page titles, SSL certificate details, IP addresses, and more.
+
+## Requirements
+
+- [Python](https://www.python.org/downloads/)
+  - [aiohttp](https://pypi.org/project/aiohttp/)
+  - [apv](https://pypi.org/project/apv/)
+  - [beautifulsoup4](https://pypi.org/project/beautifulsoup4/)
+  - [cryptography](https://pypi.org/project/cryptography/)
+  - [dnspython](https://pypi.org/project/dnspython/)
+  - [mmh3](https://pypi.org/project/mmh3/)
+
+## Installation
+```bash
+git clone https://github.com/acidvegas/httpz
+cd httpz
+chmod +x setup.sh
+./setup.sh
+```
 
 ## Usage
-| Argument               | Description                                                  |
-| ---------------------- | ----------------------------------------------------------- |
-| ``                     | File containing list of domains                              |
-| `-c`, `--concurrency`  | Number of concurrent requests                                |
-| `-m`, `--memory_limit` | Number of results to store in memory before syncing to file  |
-| `-o`, `--output`       | Output file                                                  |
-| `-t`, `--timeout`      | Timeout for HTTP requests                                    |
-| `-u`, `--user_agent`   | User agent to use for HTTP requests                          |
-| `-x`, `--proxy`        | Proxy to use for HTTP requests                               |
-| `-r`, `--retry`        | Number of times to retry failed requests                     |
-| `-v`, `--verbose`      | Increase output verbosity                                    |
-| `-p`, `--preview`      | Preview size in bytes for body & title *(default: 500)*      |
+```bash
+python httpz.py domains.txt [options]
+```
 
-___
+### Arguments
 
-###### Mirrors
-[acid.vegas](https://git.acid.vegas/httpz) • [GitHub](https://github.com/acidvegas/httpz) • [GitLab](https://gitlab.com/acidvegas/httpz) • [SuperNETs](https://git.supernets.org/acidvegas/httpz)
+| Argument  | Long Form        | Description                                                  |
+|-----------|------------------|--------------------------------------------------------------|
+| `file`    | -                | File containing domains *(one per line)*, use `-` for stdin  |
+| `-d`      | `--debug`        | Show error states and debug information                      |
+| `-c N`    | `--concurrent N` | Number of concurrent checks *(default: 100)*                 |
+| `-o FILE` | `--output FILE`  | Output file path *(JSONL format)*                            |
+| `-j`      | `--jsonl`        | Output JSON Lines format to console                          |
+| `-all`    | `--all-flags`    | Enable all output flags                                      |
+
+### Output Field Flags
+
+| Flag   | Long Form            | Description                      |
+|--------|----------------------|----------------------------------|
+| `-sc`  | `--status-code`      | Show status code                 |
+| `-ct`  | `--content-type`     | Show content type                |
+| `-ti`  | `--title`            | Show page title                  |
+| `-b`   | `--body`             | Show body preview                |
+| `-i`   | `--ip`               | Show IP addresses                |
+| `-f`   | `--favicon`          | Show favicon hash                |
+| `-hr`  | `--headers`          | Show response headers            |
+| `-cl`  | `--content-length`   | Show content length              |
+| `-fr`  | `--follow-redirects` | Follow redirects *(max 10)*      |
+| `-cn`  | `--cname`            | Show CNAME records               |
+| `-tls` | `--tls-info`         | Show TLS certificate information |
+
+### Other Options
+
+| Option      | Long Form               | Description                                          |
+|-------------|-------------------------|------------------------------------------------------|
+| `-to N`     | `--timeout N`           | Request timeout in seconds *(default: 5)*            |
+| `-mc CODES` | `--match-codes CODES`   | Only show specific status codes *(comma-separated)*  |
+| `-ec CODES` | `--exclude-codes CODES` | Exclude specific status codes *(comma-separated)*    |
+| `-p`        | `--progress`            | Show progress counter                                |
+
+## Examples
+
+Scan domains with all flags enabled and output to JSONL:
+```bash
+python httpz.py domains.txt -c 100 -o output.jsonl -j -all -to 10 -mc 200,301 -ec 404,500 -p
+```
+
+Scan domains from stdin:
+```bash
+cat domains.txt | python httpz.py - -c 100 -o output.jsonl -j -all -to 10 -mc 200,301 -ec 404,500 -p
+```
\ No newline at end of file
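Each line of the JSONL output is a single, self-contained JSON object, so results pipe cleanly into other tooling. A minimal consumer sketch (assumes `output.jsonl` was produced with `-o output.jsonl`; the 200-only filter is just an example):

```python
import json

# Sketch: print every URL that answered 200, plus its title when present.
with open('output.jsonl', 'r') as f:
    for line in f:
        record = json.loads(line)
        if record.get('status') == 200:
            print(record['url'], record.get('title', ''))
```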
diff --git a/httpz.py b/httpz.py
index 1542a55..c847e9e 100644
--- a/httpz.py
+++ b/httpz.py
@@ -1,282 +1,612 @@
-#!/usr/bin/env python
-# HTTPZ Crawler - Developed by acidvegas in Python (https://git.acid.vegas/httpz)
+#!/usr/bin/env python3
+# HTTPZ Web Scanner - Developed by acidvegas in Python (https://github.com/acidvegas/httpz)
 
 '''
-BCUZ FUCK HTTPX PYTHON STILL GO HARD
+BCUZ FUCK PROJECT DISCOVERY PYTHON STILL GO HARD
+REAL BAY SHIT FOR REAL BAY MOTHER FUCKERS
 '''
 
 import argparse
 import asyncio
+import itertools
 import json
-import random
-import re
 import logging
-import ssl
-import urllib.request
+from pathlib import Path
+import sys
 
 try:
-    import aiodns
+    import aiohttp
 except ImportError:
-    print('Missing required module \'aiodns\'. (pip install aiodns)')
-    exit(1)
+    raise ImportError('missing \'aiohttp\' library (pip install aiohttp)')
 
 try:
-    import aiohttp
+    import apv
 except ImportError:
-    print('Missing required module \'aiohttp\'. (pip install aiohttp)')
-    exit(1)
+    raise ImportError('missing \'apv\' library (pip install apv)')
 
-# ANSI escape codes for colors
-BLUE = '\033[34m'
-CYAN = '\033[36m'
-RED = '\033[91m'
-GREEN = '\033[92m'
-DARK_GREY = '\033[90m'
-YELLOW = '\033[93m'
-RESET = '\033[0m'
+try:
+    import bs4
+except ImportError:
+    raise ImportError('missing \'bs4\' library (pip install beautifulsoup4)')
 
-# Globals
-DNS_SERVERS = None
-args = None # Global args variable
+try:
+    from cryptography import x509
+    from cryptography.hazmat.primitives import hashes
+    from cryptography.x509.oid import NameOID
+except ImportError:
+    raise ImportError('missing \'cryptography\' library (pip install cryptography)')
 
-def vlog(msg: str):
-    '''
-    Verbose logging only if enabled
+try:
+    import dns.asyncresolver
+except ImportError:
+    raise ImportError('missing \'dns\' library (pip install dnspython)')
 
-    :param msg: Message to print to console
-    '''
-    if args.verbose:
-        logging.info(msg)
+try:
+    import mmh3
+except ImportError:
+    raise ImportError('missing \'mmh3\' library (pip install mmh3)')
 
-def create_session(user_agent: str, timeout: int, proxy: str = None) -> dict:
-    '''
-    Create a custom aiohttp session
+class Colors:
+    '''ANSI color codes for terminal output'''
 
-    :param user_agent: User agent to use for HTTP requests
-    :param timeout: Timeout for HTTP requests
-    '''
-    ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
-    ssl_context.check_hostname = False
-    ssl_context.verify_mode = ssl.CERT_NONE
-
-    headers = {'User-Agent': user_agent}
-    connector = aiohttp.TCPConnector(ssl=ssl_context)
-
-    session_params = {
-        'connector': connector,
-        'headers': headers,
-        'timeout': aiohttp.ClientTimeout(total=timeout)
-    }
-
-    return session_params
+    HEADER = '\033[95m' # Light purple
+    BLUE = '\033[94m'
+    GREEN = '\033[92m'
+    YELLOW = '\033[93m'
+    RED = '\033[91m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+    RESET = '\033[0m'
+    PURPLE = '\033[35m' # Dark purple
+    LIGHT_RED = '\033[38;5;203m' # Light red
+    DARK_GREEN = '\033[38;5;22m' # Dark green
+    PINK = '\033[38;5;198m' # Bright pink
 
-def get_dns_servers() -> dict:
-    '''Get a list of DNS servers to use for lookups.'''
-    with urllib.request.urlopen('https://public-dns.info/nameservers.txt') as source:
-        results = source.read().decode().split('\n')
+async def resolve_dns(domain: str) -> tuple:
+    '''
+    Resolve A, AAAA, and CNAME records for a domain
+
+    :param domain: domain to resolve
+    :return: tuple of (ips, cname)
+    '''
 
-    v4_servers = [server for server in results if ':' not in server]
-    v6_servers = [server for server in results if ':' in server]
+    resolver = dns.asyncresolver.Resolver()
+    ips = []
+    cname = None
 
-    return {'4': v4_servers, '6': v6_servers}
+    try:
+        # Check for CNAME first
+        cname_result = await resolver.resolve(domain, 'CNAME')
+        cname = str(cname_result[0].target).rstrip('.')
+    except Exception:
+        pass
+
+    try:
+        # Query A records
+        a_result = await resolver.resolve(domain, 'A')
+        ips.extend(str(ip) for ip in a_result)
+    except Exception as e:
+        logging.debug(f'Error resolving A records for {domain}: {str(e)}')
+
+    try:
+        # Query AAAA records
+        aaaa_result = await resolver.resolve(domain, 'AAAA')
+        ips.extend(str(ip) for ip in aaaa_result)
+    except Exception as e:
+        logging.debug(f'Error resolving AAAA records for {domain}: {str(e)}')
+
+    return sorted(set(ips)), cname
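Since `resolve_dns` returns a plain `(ips, cname)` tuple, it is easy to sanity-check in isolation before a full scan. A quick sketch (`example.com` is a placeholder; assumes `httpz.py` is on the import path):

```python
import asyncio

from httpz import resolve_dns  # httpz.py in the working directory

# Sketch: resolve one name outside the scanner pipeline.
ips, cname = asyncio.run(resolve_dns('example.com'))
print('A/AAAA:', ips, 'CNAME:', cname)
```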
 
-async def dns_lookup(domain: str, record_type: str, timeout: int, retry: int) -> list:
-    '''
-    Resolve DNS information from a domain
-
-    :param domain: Domain name to resolve
-    :param record_type: DNS record type to resolve
-    :param timeout: Timeout for DNS request
-    :param retry: Number of times to retry failed requests
-    '''
-    for i in range(retry):
-        try:
-            version = '4' if record_type == 'A' else '6' if record_type == 'AAAA' else random.choice(['4','6'])
-            nameserver = random.choice(DNS_SERVERS[version])
-            resolver = aiodns.DNSResolver(nameservers=[nameserver], timeout=timeout)
-            records = await resolver.query(domain, record_type)
-            return records.cname if record_type == 'CNAME' else [record.host for record in records]
-        except Exception as e:
-            vlog(f'{RED}[ERROR]{RESET} {domain} - Failed to resolve {record_type} record using {nameserver} {DARK_GREY}({str(e)}){RESET}')
-    return []
+async def get_favicon_hash(session: aiohttp.ClientSession, base_url: str, html: str) -> str:
+    '''
+    Get favicon hash from a webpage
+
+    :param session: aiohttp client session
+    :param base_url: base URL of the website
+    :param html: HTML content of the page
+    '''
+
+    try:
+        soup = bs4.BeautifulSoup(html, 'html.parser')
+
+        # Try to find favicon in link tags
+        favicon_url = None
+        for link in soup.find_all('link'):
+            if link.get('rel') and any(x.lower() == 'icon' for x in link.get('rel')):
+                favicon_url = link.get('href')
+                break
+
+        if not favicon_url:
+            # Try default location
+            favicon_url = '/favicon.ico'
+
+        # Handle relative URLs
+        if favicon_url.startswith('//'):
+            favicon_url = 'https:' + favicon_url
+        elif favicon_url.startswith('/'):
+            favicon_url = base_url + favicon_url
+        elif not favicon_url.startswith(('http://', 'https://')):
+            favicon_url = base_url + '/' + favicon_url
+
+        async with session.get(favicon_url, timeout=10) as response:
+            if response.status == 200:
+                content = await response.read()
+                if len(content) <= 1024*1024: # Check if favicon is <= 1MB
+                    hash_value = mmh3.hash64(content)[0]
+                    # Only return hash if it's not 0 (likely invalid favicon)
+                    if hash_value != 0:
+                        return str(hash_value)
+    except Exception as e:
+        logging.debug(f'Error getting favicon for {base_url}: {str(e)}')
+
+    return None
 
-async def get_body(source: str, preview: int) -> str:
-    '''
-    Get the body of a webpage
+async def get_cert_info(session: aiohttp.ClientSession, url: str) -> dict:
+    '''
+    Get SSL certificate information for a domain
+
+    :param session: aiohttp client session
+    :param url: URL to check
+    '''
 
-    :param source: HTML source of the webpage
-    :param preview: Number of bytes to preview
-    '''
-    body_content = re.search(r'<body[^>]*>(.*?)</body>', source[:5000], re.DOTALL | re.IGNORECASE)
-    processed_content = body_content.group(1) if body_content else source
-    clean_content = re.sub(r'<[^>]+>', '', processed_content)
-    return clean_content[:preview]
+    try:
+        async with session.get(url, timeout=10) as response:
+            # Get the SSL context from the connection
+            ssl_object = response.connection.transport.get_extra_info('ssl_object')
+            if not ssl_object:
+                return None
+
+            cert_bin = ssl_object.getpeercert(binary_form=True)
+            cert = x509.load_der_x509_certificate(cert_bin)
+
+            # Get certificate details
+            cert_info = {
+                'fingerprint': cert.fingerprint(hashes.SHA256()).hex(),
+                'subject': cert.subject.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value,
+                'issuer': cert.issuer.get_attributes_for_oid(NameOID.COMMON_NAME)[0].value,
+                'alt_names': [],
+                'not_before': cert.not_valid_before_utc.isoformat(),
+                'not_after': cert.not_valid_after_utc.isoformat()
+            }
+
+            # Get Subject Alternative Names
+            try:
+                ext = cert.extensions.get_extension_for_oid(x509.oid.ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
+                cert_info['alt_names'] = [name.value for name in ext.value]
+            except x509.ExtensionNotFound:
+                pass
+
+            return cert_info
+    except Exception as e:
+        logging.debug(f'Error getting certificate info for {url}: {str(e)}')
+        return None
 
-async def get_title(session: aiohttp.ClientSession, domain: str):
-    '''
-    Get the title of a webpage and its status code
+async def check_domain(session: aiohttp.ClientSession, domain: str, follow_redirects: bool = False, timeout: int = 5) -> dict:
+    '''
+    Check a single domain for its status code, title, and body preview
+
+    :param session: aiohttp client session
+    :param domain: domain to check
+    :param follow_redirects: whether to follow redirects
+    :param timeout: timeout in seconds
+    '''
 
-    :param session: aiohttp session
-    :param domain: URL to get the title of
-    '''
-    title = None
-    body = None
-    status_code = None
+    if not domain.startswith(('http://', 'https://')):
+        protocols = ['https://', 'http://']
+        base_domain = domain
+    else:
+        protocols = [domain]
+        base_domain = domain.split('://')[-1].split('/')[0]
 
-    try:
-        async with session.get(domain, timeout=args.timeout, allow_redirects=False) as response:
-            status_code = response.status
-            if status_code in (200, 201):
-                html_content = await response.text()
-                match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
-                title = match.group(1).strip() if match else None
-                title = bytes(title, 'utf-8').decode('unicode_escape') if title else None
-                title = re.sub(r'[\r\n]+', ' ', title)[:300] if title else None # Fix this ugly shit
-                body = await get_body(html_content, args.preview)
-                body = re.sub(r'\s+', ' ', body).strip() if body else None
-            elif status_code in (301, 302, 303, 307, 308) and args.retry > 0: # Need to implement a max redirect limit
-                redirect_url = response.headers.get('Location')
-                if redirect_url:
-                    vlog(f'{YELLOW}[WARN]{RESET} {domain} -> {redirect_url} {DARK_GREY}({status_code}){RESET}')
-                    return await get_title(session, redirect_url)
-                else:
-                    vlog(f'{RED}[ERROR]{RESET} No redirect URL found for {domain} {DARK_GREY}({status_code}){RESET}')
-            else:
-                vlog(f'{RED}[ERROR]{RESET} {domain} - Invalid status code {DARK_GREY}{status_code}{RESET}')
-    except asyncio.TimeoutError:
-        vlog(f'{RED}[ERROR]{RESET} {domain} - HTTP request timed out')
-    except Exception as e:
-        vlog(f'{RED}[ERROR]{RESET} Failed to get title for {domain} {DARK_GREY}({e}){RESET}')
-    return title, body, status_code # Fix this ugly shit
+    result = {
+        'domain'         : base_domain,
+        'status'         : 0,
+        'title'          : None,
+        'body'           : None,
+        'content_type'   : None,
+        'url'            : f"https://{base_domain}" if base_domain else domain,
+        'ips'            : [],
+        'cname'          : None,
+        'favicon_hash'   : None,
+        'headers'        : {},
+        'content_length' : None,
+        'redirect_chain' : [],
+        'tls'            : None
+    }
+
+    # Resolve DNS records
+    result['ips'], result['cname'] = await resolve_dns(base_domain)
+
+    for protocol in protocols:
+        url = f'{protocol}{base_domain}'
+        try:
+            max_redirects = 10 if follow_redirects else 0
+            async with session.get(url, timeout=timeout, allow_redirects=follow_redirects, max_redirects=max_redirects) as response:
+                result['status'] = response.status
+                result['url'] = str(response.url)
+                result['headers'] = dict(response.headers)
+                result['content_type'] = response.headers.get('content-type', '').split(';')[0]
+                result['content_length'] = response.headers.get('content-length')
+
+                # Track redirect chain
+                if follow_redirects:
+                    result['redirect_chain'] = [str(h.url) for h in response.history]
+                    if result['redirect_chain']:
+                        result['redirect_chain'].append(str(response.url))
+
+                # Get TLS info if HTTPS
+                if url.startswith('https://'):
+                    result['tls'] = await get_cert_info(session, url)
+
+                if response.status == 200:
+                    html = (await response.text())[:1024*1024]
+                    soup = bs4.BeautifulSoup(html, 'html.parser')
+                    if soup.title:
+                        title = soup.title.string.strip() if soup.title.string else ''
+                        result['title'] = title[:300]
+                    if soup.get_text():
+                        body = ' '.join(soup.get_text().split()[:50])
+                        result['body'] = body[:500]
+                    result['favicon_hash'] = await get_favicon_hash(session, url, html)
+                break
+        except Exception as e:
+            logging.debug(f'Error checking {url}: {str(e)}')
+            result['status'] = -1
+            continue
+
+    return result
 
-async def check_url(session: aiohttp.ClientSession, domain: str):
-    '''
-    Process a domain name
-
-    :param session: aiohttp session
-    :param domain: URL to get the title of
-    '''
-    dns_records = {}
-
-    for record_type in ('A', 'AAAA'):
-        records = await dns_lookup(domain, record_type, args.timeout, args.retry)
-        if records:
-            dns_records[record_type] = records
-    if not dns_records:
-        cname_record = await dns_lookup(domain, 'CNAME', args.timeout, args.retry)
-        if cname_record:
-            dns_records['CNAME'] = cname_record
-            domain = cname_record
-        else:
-            vlog(f'{RED}[ERROR]{RESET} No DNS records found for {domain}')
-            return domain, None, None, None, None, None
-
-    title, body, status_code = await get_title(session, f'https://{domain}')
-    if not title and not body:
-        title, body, status_code = await get_title(session, f'http://{domain}')
-
-    if title or body:
-        if status_code in (200, 201):
-            status_code = f'[{GREEN}200{RESET}]'
-        elif status_code in (301, 302, 303, 307, 308):
-            status_code = f'[{YELLOW}{status_code}{RESET}]'
-        logging.info(f'{domain} {status_code} [{CYAN}{title}{RESET}] - [{BLUE}{body}{RESET}]')
-        return domain, 'https', title, body, dns_records, status_code
-    else:
-        vlog(f'{RED}[ERROR]{RESET} {domain} - Failed to retrieve title')
-
-    return domain, None, None, None, None, status_code
+def domain_generator(input_source: str = None):
+    '''
+    Generator function to yield domains from file or stdin
+
+    :param input_source: path to file containing domains, or None for stdin
+    '''
+    if input_source == '-' or input_source is None:
+        for line in sys.stdin:
+            if line.strip():
+                yield line.strip()
+    else:
+        with open(input_source, 'r') as f:
+            for line in f:
+                if line.strip():
+                    yield line.strip()
 
-async def process_file():
-    '''
-    Process a list of domains from file
-    '''
-
-    session_params = create_session(args.user_agent, args.timeout, args.proxy)
+def human_size(size_bytes: int) -> str:
+    '''
+    Convert bytes to human readable string
+
+    :param size_bytes: Size in bytes
+    '''
 
-    async with aiohttp.ClientSession(**session_params) as session:
-        tasks = set()
-        with open(args.file, 'r') as file:
-            for line in file:
-                domain = line.strip()
-                if domain:
-                    tasks.add(asyncio.create_task(check_url(session, domain)))
-
-                    if len(tasks) >= args.concurrency: # Should be a better way to do this
-                        done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
-
-                        for task in done:
-                            domain, protocol, title, body, dns_records, status_code = task.result()
-                            if title or body or dns_records:
-                                write_result_to_file(domain, protocol, title, body, dns_records, status_code)
+    if not size_bytes:
+        return '0B'
+
+    units = ('B', 'KB', 'MB', 'GB')
+    size = float(size_bytes)
+    unit_index = 0
+
+    while size >= 1024 and unit_index < len(units) - 1:
+        size /= 1024
+        unit_index += 1
+
+    return f"{size:.1f}{units[unit_index]}"
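`human_size` walks 1024-based units up to GB. A few spot checks (sketch, assumes `httpz.py` is importable):

```python
from httpz import human_size  # httpz.py in the working directory

# Expected output: 0B, 512.0B, 2.0KB, 5.0MB
for n in (0, 512, 2048, 5 * 1024 * 1024):
    print(n, '->', human_size(n))
```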
 
-        if tasks:
-            done, _ = await asyncio.wait(tasks)
-            for task in done:
-                domain, protocol, title, body, dns_records, status_code = task.result()
-                if title:
-                    write_result_to_file(domain, protocol, title, body, dns_records, status_code)
+def parse_status_codes(codes_str: str) -> set:
+    '''
+    Parse comma-separated status codes into a set of integers
+
+    :param codes_str: Comma-separated status codes
+    '''
+
+    try:
+        return {int(code.strip()) for code in codes_str.split(',')}
+    except ValueError:
+        raise argparse.ArgumentTypeError('Status codes must be comma-separated numbers (e.g., 200,301,404)')
+
+def format_status_output(result: dict, debug: bool = False, show_fields: dict = None, match_codes: set = None, exclude_codes: set = None) -> str:
+    '''
+    Format the output with colored sections
+
+    :param result: Dictionary containing domain check results
+    :param debug: Whether to show error states
+    :param show_fields: Dictionary of fields to show
+    :param match_codes: Set of status codes to match
+    :param exclude_codes: Set of status codes to exclude
+    '''
+
+    # Skip errors unless in debug mode
+    if result['status'] < 0 and not debug:
+        return ''
+
+    # Skip if status code doesn't match filters
+    if match_codes and result['status'] not in match_codes:
+        return ''
+    if exclude_codes and result['status'] in exclude_codes:
+        return ''
+
+    parts = []
+
+    # Status code
+    if show_fields['status_code']:
+        if result['status'] < 0:
+            status = f"{Colors.RED}[{result['status']}]{Colors.RESET}"
+        elif 200 <= result['status'] < 300:
+            status = f"{Colors.GREEN}[{result['status']}]{Colors.RESET}"
+        elif 300 <= result['status'] < 400:
+            status = f"{Colors.YELLOW}[{result['status']}]{Colors.RESET}"
+        else: # 400+ and 500+ codes
+            status = f"{Colors.RED}[{result['status']}]{Colors.RESET}"
+        parts.append(status)
+
+    # Domain (always shown)
+    parts.append(f"[{result['url']}]")
+
+    # Title
+    if show_fields['title'] and result['title']:
+        parts.append(f"{Colors.DARK_GREEN}[{result['title']}]{Colors.RESET}")
+
+    # Body
+    if show_fields['body'] and result['body']:
+        body = result['body'][:100] + ('...' if len(result['body']) > 100 else '')
+        parts.append(f"{Colors.BLUE}[{body}]{Colors.RESET}")
+
+    # IPs
+    if show_fields['ip'] and result['ips']:
+        ips_text = ', '.join(result['ips'])
+        parts.append(f"{Colors.YELLOW}[{ips_text}]{Colors.RESET}")
+
+    # Favicon hash
+    if show_fields['favicon'] and result['favicon_hash']:
+        parts.append(f"{Colors.PURPLE}[{result['favicon_hash']}]{Colors.RESET}")
+
+    # Headers (includes content-type and content-length)
+    if show_fields['headers'] and result['headers']:
+        headers_text = []
+        for k, v in result['headers'].items():
+            headers_text.append(f"{k}: {v}")
+        parts.append(f"{Colors.LIGHT_RED}[{', '.join(headers_text)}]{Colors.RESET}")
+    else:
+        # Only show content-type and content-length if headers aren't shown
+        if show_fields['content_type'] and result['content_type']:
+            parts.append(f"{Colors.HEADER}[{result['content_type']}]{Colors.RESET}")
+
+        if show_fields['content_length'] and result['content_length']:
+            try:
+                size = human_size(int(result['content_length']))
+                parts.append(f"{Colors.PINK}[{size}]{Colors.RESET}")
+            except (ValueError, TypeError):
+                parts.append(f"{Colors.PINK}[{result['content_length']}]{Colors.RESET}")
+
+    # CNAME
+    if show_fields['cname'] and result['cname']:
+        parts.append(f"{Colors.PURPLE}[CNAME: {result['cname']}]{Colors.RESET}")
+
+    # Redirect Chain
+    if show_fields['follow_redirects'] and result['redirect_chain']:
+        chain = ' -> '.join(result['redirect_chain'])
+        parts.append(f"{Colors.YELLOW}[Redirects: {chain}]{Colors.RESET}")
+
+    # TLS Certificate Info
+    if show_fields['tls'] and result['tls']:
+        cert = result['tls']
+        tls_parts = []
+        tls_parts.append(f"Fingerprint: {cert['fingerprint']}")
+        tls_parts.append(f"Subject: {cert['subject']}")
+        tls_parts.append(f"Issuer: {cert['issuer']}")
+        if cert['alt_names']:
+            tls_parts.append(f"SANs: {', '.join(cert['alt_names'])}")
+        tls_parts.append(f"Valid: {cert['not_before']} to {cert['not_after']}")
+        parts.append(f"{Colors.GREEN}[{' | '.join(tls_parts)}]{Colors.RESET}")
+
+    return ' '.join(parts)
 
-def write_result_to_file(domain, protocol, title, body, dns_records, status_code):
-    '''
-    Write a single domain result to file
+def count_domains(input_source: str = None) -> int:
+    '''
+    Count total number of domains from file or stdin
+
+    :param input_source: path to file containing domains, or None for stdin
+    '''
+    if input_source == '-' or input_source is None:
+        # Can't count lines from stdin without consuming them
+        return 0
+    else:
+        with open(input_source, 'r') as f:
+            return sum(1 for line in f if line.strip())
 
-    :param domain: Domain name
-    :param protocol: Protocol used (http or https)
-    :param title: Title of the domain
-    :param dns_records: DNS records of the domain
-    :param status_code: HTTP status code
-    '''
-    result = {
-        'domain': domain,
-        'protocol': protocol,
-        'status_code': status_code,
-        'title': title,
-        'body': body,
-        'dns_records': dns_records
-    }
-    with open(args.output, 'a') as f:
-        json.dump(result, f)
-        f.write('\n')
 
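`process_domains` below keeps a fixed-size window of in-flight requests: it seeds `concurrent_limit` tasks, then uses `asyncio.wait(..., return_when=FIRST_COMPLETED)` to harvest a finished task before scheduling the next domain. The same sliding-window pattern in a standalone sketch:

```python
import asyncio

async def bounded_gather(coros, limit: int = 100):
    # Keep at most `limit` tasks in flight, draining finished
    # ones before scheduling more work.
    tasks, results = set(), []
    for coro in coros:
        if len(tasks) >= limit:
            done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
            results.extend(task.result() for task in done)
        tasks.add(asyncio.create_task(coro))
    if tasks:
        done, _ = await asyncio.wait(tasks)
        results.extend(task.result() for task in done)
    return results
```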
+async def process_domains(input_source: str = None, debug: bool = False, concurrent_limit: int = 100, show_fields: dict = None, output_file: str = None, jsonl: bool = None, timeout: int = 5, match_codes: set = None, exclude_codes: set = None, show_progress: bool = False):
+    '''
+    Process domains from a file or stdin with concurrent requests
+
+    :param input_source: path to file containing domains, or None for stdin
+    :param debug: Whether to show error states
+    :param concurrent_limit: maximum number of concurrent requests
+    :param show_fields: Dictionary of fields to show
+    :param output_file: Path to output file (JSONL format)
+    :param jsonl: Whether to print JSON Lines to the console
+    :param timeout: Request timeout in seconds
+    :param match_codes: Set of status codes to match
+    :param exclude_codes: Set of status codes to exclude
+    :param show_progress: Whether to show progress counter
+    '''
+    if input_source and input_source != '-' and not Path(input_source).exists():
+        raise FileNotFoundError(f'Domain file not found: {input_source}')
+
+    # Get total domain count if showing progress (only works for files)
+    total_domains = count_domains(input_source) if show_progress else 0
+    processed_domains = 0
+
+    # Clear the output file if specified
+    if output_file:
+        open(output_file, 'w').close()
+
+    tasks = set()
+
+    async def write_result(result: dict):
+        '''Write a single result to the output file'''
+        nonlocal processed_domains
+
+        # Create JSON output dict
+        output_dict = {
+            'url': result['url'],
+            'domain': result['domain'],
+            'status': result['status']
+        }
+        # Add optional fields if they exist
+        if result['title']:
+            output_dict['title'] = result['title']
+        if result['body']:
+            output_dict['body'] = result['body']
+        if result['ips']:
+            output_dict['ips'] = result['ips']
+        if result['favicon_hash']:
+            output_dict['favicon_hash'] = result['favicon_hash']
+        if result['headers']:
+            output_dict['headers'] = result['headers']
+        if result['cname']:
+            output_dict['cname'] = result['cname']
+        if result['redirect_chain']:
+            output_dict['redirect_chain'] = result['redirect_chain']
+        if result['tls']:
+            output_dict['tls'] = result['tls']
+
+        # Get formatted output based on filters
+        formatted = format_status_output(result, debug, show_fields, match_codes, exclude_codes)
+        if formatted:
+            # Write to file if specified
+            if output_file:
+                if (not match_codes or result['status'] in match_codes) and \
+                   (not exclude_codes or result['status'] not in exclude_codes):
+                    with open(output_file, 'a') as f:
+                        json.dump(output_dict, f, ensure_ascii=False)
+                        f.write('\n')
+
+            # Console output
+            if jsonl:
+                # Pure JSON Lines output without any logging prefixes
+                print(json.dumps(output_dict))
+            else:
+                if show_progress:
+                    processed_domains += 1
+                    # Totals are only known for file input; fall back to a bare counter for stdin
+                    counter = f'[{processed_domains}/{total_domains}]' if total_domains else f'[{processed_domains}]'
+                    logging.info(f'{Colors.BOLD}{counter}{Colors.RESET} {formatted}')
+                else:
+                    logging.info(formatted)
+
+    async with aiohttp.ClientSession() as session:
+        # Use a single generator so stdin is read only once and files are not opened twice
+        domains_iter = domain_generator(input_source)
+
+        # Start initial batch of tasks
+        for domain in itertools.islice(domains_iter, concurrent_limit):
+            task = asyncio.create_task(check_domain(session, domain, follow_redirects=show_fields['follow_redirects'], timeout=timeout))
+            tasks.add(task)
+
+        # Process remaining domains, maintaining concurrent_limit active tasks
+        for domain in domains_iter:
+            done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
+            tasks = pending
+
+            for task in done:
+                result = await task
+                await write_result(result)
+
+            task = asyncio.create_task(check_domain(session, domain, follow_redirects=show_fields['follow_redirects'], timeout=timeout))
+            tasks.add(task)
+
+        # Wait for remaining tasks
+        if tasks:
+            done, _ = await asyncio.wait(tasks)
+            for task in done:
+                result = await task
+                await write_result(result)
 
 def main():
-    global DNS_SERVERS, args
+    '''Main function to handle command line arguments and run the domain checker'''
+
+    parser = argparse.ArgumentParser(description=f'{Colors.HEADER}Concurrent domain checker{Colors.RESET}', formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument('file', nargs='?', default='-', help='File containing domains to check (one per line), use - for stdin')
+    parser.add_argument('-d', '--debug', action='store_true', help='Show error states and debug information')
+    parser.add_argument('-c', '--concurrent', type=int, default=100, help='Number of concurrent checks')
+    parser.add_argument('-o', '--output', help='Output file path (JSONL format)')
+    parser.add_argument('-j', '--jsonl', action='store_true', help='Output JSON Lines format to console')
+
+    # Add all-flags argument
+    parser.add_argument('-all', '--all-flags', action='store_true', help='Enable all output flags')
+
+    # Output field flags
+    parser.add_argument('-sc', '--status-code', action='store_true', help='Show status code')
+    parser.add_argument('-ct', '--content-type', action='store_true', help='Show content type')
+    parser.add_argument('-ti', '--title', action='store_true', help='Show page title')
+    parser.add_argument('-b', '--body', action='store_true', help='Show body preview')
+    parser.add_argument('-i', '--ip', action='store_true', help='Show IP addresses')
+    parser.add_argument('-f', '--favicon', action='store_true', help='Show favicon hash')
+    parser.add_argument('-hr', '--headers', action='store_true', help='Show response headers')
+    parser.add_argument('-cl', '--content-length', action='store_true', help='Show content length')
+    parser.add_argument('-fr', '--follow-redirects', action='store_true', help='Follow redirects (max 10)')
+    parser.add_argument('-cn', '--cname', action='store_true', help='Show CNAME records')
+    parser.add_argument('-tls', '--tls-info', action='store_true', help='Show TLS certificate information')
+
+    # Other arguments
+    parser.add_argument('-to', '--timeout', type=int, default=5, help='Request timeout in seconds')
+    parser.add_argument('-mc', '--match-codes', type=parse_status_codes, help='Only show these status codes (comma-separated, e.g., 200,301,404)')
+    parser.add_argument('-ec', '--exclude-codes', type=parse_status_codes, help='Exclude these status codes (comma-separated, e.g., 404,500)')
+    parser.add_argument('-p', '--progress', action='store_true', help='Show progress counter')
+
+    args = parser.parse_args()
 
-    parser = argparse.ArgumentParser(description='Check URLs from a file asynchronously, perform DNS lookups and store results in JSON.')
-    parser.add_argument('file', help='File containing list of domains')
-    parser.add_argument('-c', '--concurrency', type=int, default=10, help='Number of concurrent requests')
-    parser.add_argument('-m', '--memory_limit', type=int, default=1000, help='Number of results to store in memory before syncing to file')
-    parser.add_argument('-o', '--output', default='results.json', help='Output file')
-    parser.add_argument('-t', '--timeout', type=int, default=10, help='Timeout for HTTP requests')
-    parser.add_argument('-u', '--user_agent', default='Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', help='User agent to use for HTTP requests')
-    parser.add_argument('-x', '--proxy', type=str, help='Proxy to use for HTTP requests')
-    parser.add_argument('-r', '--retry', type=int, default=2, help='Number of times to retry failed requests')
-    parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity')
-    parser.add_argument('-p', '--preview', type=int, default=500, help='Preview size in bytes for body & title (default: 500)')
-    args = parser.parse_args()
 
+    # Only setup logging if we're not in JSONL mode
+    if not args.jsonl:
+        apv.setup_logging(level='DEBUG' if args.debug else 'INFO')
+        logging.info(f'{Colors.BOLD}Starting domain checker...{Colors.RESET}')
+        if args.file == '-':
+            logging.info('Reading domains from stdin')
+        else:
+            logging.info(f'Processing file: {Colors.UNDERLINE}{args.file}{Colors.RESET}')
+        logging.info(f'Concurrent checks: {args.concurrent}')
 
-    log_level = logging.INFO
-    logging.basicConfig(level=log_level, format=f'{DARK_GREY}%(asctime)s{RESET} %(message)s', datefmt='%H:%M:%S')
+    show_fields = {
+        'status_code'      : args.all_flags or args.status_code,
+        'content_type'     : args.all_flags or args.content_type,
+        'title'            : args.all_flags or args.title,
+        'body'             : args.all_flags or args.body,
+        'ip'               : args.all_flags or args.ip,
+        'favicon'          : args.all_flags or args.favicon,
+        'headers'          : args.all_flags or args.headers,
+        'content_length'   : args.all_flags or args.content_length,
+        'follow_redirects' : args.all_flags or args.follow_redirects,
+        'cname'            : args.all_flags or args.cname,
+        'tls'              : args.all_flags or args.tls_info
+    }
 
-    logging.info('Loading DNS servers...')
-    DNS_SERVERS = get_dns_servers()
-    if not DNS_SERVERS:
-        logging.fatal('Failed to get DNS servers.')
-    logging.info(f'Found {len(DNS_SERVERS["4"])} IPv4 and {len(DNS_SERVERS["6"])} IPv6 DNS servers.')
+    # If no fields specified and no -all flag, show all (maintain existing behavior)
+    if not any(show_fields.values()):
+        show_fields = {k: True for k in show_fields}
+
+    try:
+        asyncio.run(process_domains(args.file, args.debug, args.concurrent, show_fields, args.output, args.jsonl, args.timeout, args.match_codes, args.exclude_codes, args.progress))
+    except KeyboardInterrupt:
+        if not args.jsonl:
+            logging.warning(f'{Colors.YELLOW}Process interrupted by user{Colors.RESET}')
+        sys.exit(1)
+    except Exception as e:
+        if not args.jsonl:
+            logging.error(f'{Colors.RED}An error occurred: {str(e)}{Colors.RESET}')
+        sys.exit(1)
 
-    asyncio.run(process_file())
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..4ea7afa
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+aiohttp>=3.8.0
+apv>=1.0.0
+beautifulsoup4>=4.9.3
+cryptography>=42.0.0
+dnspython>=2.1.0
+mmh3>=3.0.0
\ No newline at end of file
diff --git a/setup.sh b/setup.sh
new file mode 100644
index 0000000..d579f82
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Create virtual environment
+python3 -m venv venv
+
+# Activate virtual environment
+source venv/bin/activate
+
+# Upgrade pip
+pip install --upgrade pip
+
+# Install requirements
+pip install -r requirements.txt
+
+# Make the main script executable
+chmod +x httpz.py
+
+echo "Setup complete! Activate the virtual environment with: source venv/bin/activate"
\ No newline at end of file
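Because the scanner is built from plain asyncio coroutines, httpz.py can also be driven as a library rather than through the CLI. A sketch (assumes httpz.py is importable from the working directory and the dependencies are installed):

```python
import asyncio

import aiohttp

from httpz import check_domain  # httpz.py in the working directory

async def main():
    # Sketch: check a single domain programmatically; example.com is a placeholder.
    async with aiohttp.ClientSession() as session:
        result = await check_domain(session, 'example.com', follow_redirects=True)
        print(result['status'], result['url'], result['title'])

asyncio.run(main())
```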