From dfb11b0a1cb7c0557e19554d7f675aca17e3216b Mon Sep 17 00:00:00 2001 From: acidvegas Date: Tue, 11 Feb 2025 20:57:01 -0500 Subject: [PATCH] Better input processing --- README.md | 85 ++++++++++++---------------- httpz_scanner/__init__.py | 2 +- httpz_scanner/parsers.py | 34 ++++++++++-- httpz_scanner/scanner.py | 89 +++++++++++++++++++++++------- httpz_scanner/utils.py | 113 ++++++++++++++++++++------------------ setup.py | 3 +- 6 files changed, 197 insertions(+), 129 deletions(-) diff --git a/README.md b/README.md index 0496c04..ef1b734 100644 --- a/README.md +++ b/README.md @@ -92,14 +92,29 @@ This allows efficient distribution of large scans across multiple machines. ### Python Library ```python import asyncio -import aiohttp -import aioboto3 +import urllib.request from httpz_scanner import HTTPZScanner -async def scan_domains(): +async def scan_from_list() -> list: + with urllib.request.urlopen('https://example.com/domains.txt') as response: + content = response.read().decode() + return [line.strip() for line in content.splitlines() if line.strip()][:20] + +async def scan_from_url(): + with urllib.request.urlopen('https://example.com/domains.txt') as response: + for line in response: + if line := line.strip(): + yield line.decode().strip() + +async def scan_from_file(): + with open('domains.txt', 'r') as file: + for line in file: + if line := line.strip(): + yield line + +async def main(): # Initialize scanner with all possible options (showing defaults) scanner = HTTPZScanner( - # Core settings concurrent_limit=100, # Number of concurrent requests timeout=5, # Request timeout in seconds follow_redirects=False, # Follow redirects (max 10) @@ -131,61 +146,33 @@ async def scan_domains(): exclude_codes={404,500,503} # Exclude these status codes ) - # Initialize resolvers (required before scanning) - await scanner.init() + # Example 1: Process file + print('\nProcessing file:') + async for result in scanner.scan(scan_from_file()): + print(f"{result['domain']}: {result['status']}") - # Example 1: Stream from S3/MinIO using aioboto3 - async with aioboto3.Session().client('s3', - endpoint_url='http://minio.example.com:9000', - aws_access_key_id='access_key', - aws_secret_access_key='secret_key') as s3: - - response = await s3.get_object(Bucket='my-bucket', Key='huge-domains.txt') - async with response['Body'] as stream: - async def s3_generator(): - while True: - line = await stream.readline() - if not line: - break - yield line.decode().strip() - - await scanner.scan(s3_generator()) + # Example 2: Stream URLs + print('\nStreaming URLs:') + async for result in scanner.scan(scan_from_url()): + print(f"{result['domain']}: {result['status']}") - # Example 2: Stream from URL using aiohttp - async with aiohttp.ClientSession() as session: - # For large files - stream line by line - async with session.get('https://example.com/huge-domains.txt') as resp: - async def url_generator(): - async for line in resp.content: - yield line.decode().strip() - - await scanner.scan(url_generator()) - - # For small files - read all at once - async with session.get('https://example.com/small-domains.txt') as resp: - content = await resp.text() - await scanner.scan(content) # Library handles splitting into lines - - # Example 3: Simple list of domains - domains = [ - 'example1.com', - 'example2.com', - 'example3.com' - ] - await scanner.scan(domains) + # Example 3: Process list + print('\nProcessing list:') + domains = await scan_from_list() + async for result in scanner.scan(domains): + print(f"{result['domain']}: {result['status']}") if __name__ == '__main__': - asyncio.run(scan_domains()) + asyncio.run(main()) ``` The scanner accepts various input types: -- Async/sync generators that yield domains -- String content with newlines +- File paths (string) - Lists/tuples of domains -- File paths - stdin (using '-') +- Async generators that yield domains -All inputs support sharding for distributed scanning. +All inputs support sharding for distributed scanning using the `shard` parameter. ## Arguments diff --git a/httpz_scanner/__init__.py b/httpz_scanner/__init__.py index 852b982..a547cbd 100644 --- a/httpz_scanner/__init__.py +++ b/httpz_scanner/__init__.py @@ -6,4 +6,4 @@ from .scanner import HTTPZScanner from .colors import Colors -__version__ = '2.0.0' \ No newline at end of file +__version__ = '2.0.1' \ No newline at end of file diff --git a/httpz_scanner/parsers.py b/httpz_scanner/parsers.py index e757647..b0b3b07 100644 --- a/httpz_scanner/parsers.py +++ b/httpz_scanner/parsers.py @@ -8,9 +8,9 @@ except ImportError: raise ImportError('missing bs4 module (pip install beautifulsoup4)') try: - from cryptography import x509 + from cryptography import x509 from cryptography.hazmat.primitives import hashes - from cryptography.x509.oid import NameOID + from cryptography.x509.oid import NameOID except ImportError: raise ImportError('missing cryptography module (pip install cryptography)') @@ -28,8 +28,8 @@ def parse_domain_url(domain: str) -> tuple: Parse domain string into base domain, port, and protocol list :param domain: Raw domain string to parse - :return: Tuple of (base_domain, port, protocols) ''' + port = None base_domain = domain.rstrip('/') @@ -58,6 +58,7 @@ def parse_domain_url(domain: str) -> tuple: return base_domain, port, protocols + async def get_cert_info(ssl_object, url: str) -> dict: ''' Get SSL certificate information for a domain @@ -65,6 +66,7 @@ async def get_cert_info(ssl_object, url: str) -> dict: :param ssl_object: SSL object to get certificate info from :param url: URL to get certificate info from ''' + try: if not ssl_object or not (cert_der := ssl_object.getpeercert(binary_form=True)): return None @@ -101,6 +103,7 @@ async def get_cert_info(ssl_object, url: str) -> dict: error(f'Error getting cert info for {url}: {str(e)}') return None + async def get_favicon_hash(session, base_url: str, html: str) -> str: ''' Get favicon hash from a webpage @@ -141,6 +144,7 @@ async def get_favicon_hash(session, base_url: str, html: str) -> str: return None + def parse_status_codes(codes_str: str) -> set: ''' Parse comma-separated status codes and ranges into a set of integers @@ -174,4 +178,26 @@ def parse_shard(shard_str: str) -> tuple: raise ValueError return shard_index - 1, total_shards # Convert to 0-based index except (ValueError, TypeError): - raise argparse.ArgumentTypeError('Shard must be in format INDEX/TOTAL where INDEX <= TOTAL') \ No newline at end of file + raise argparse.ArgumentTypeError('Shard must be in format INDEX/TOTAL where INDEX <= TOTAL') + + +def parse_title(html: str, content_type: str = None) -> str: + ''' + Parse title from HTML content + + :param html: HTML content of the page + :param content_type: Content-Type header value + ''' + + # Only parse title for HTML content + if content_type and not any(x in content_type.lower() for x in ['text/html', 'application/xhtml']): + return None + + try: + soup = bs4.BeautifulSoup(html, 'html.parser', from_encoding='utf-8', features='lxml') + if title := soup.title: + return title.string.strip() + except: + pass + + return None \ No newline at end of file diff --git a/httpz_scanner/scanner.py b/httpz_scanner/scanner.py index 9aac1ba..49dd698 100644 --- a/httpz_scanner/scanner.py +++ b/httpz_scanner/scanner.py @@ -5,7 +5,6 @@ import asyncio import json import random -import sys try: import aiohttp @@ -20,7 +19,7 @@ except ImportError: from .dns import resolve_all_dns, load_resolvers from .formatters import format_console_output from .colors import Colors -from .parsers import parse_domain_url, get_cert_info, get_favicon_hash +from .parsers import parse_domain_url, get_cert_info, get_favicon_hash, parse_title from .utils import debug, info, USER_AGENTS, input_generator @@ -154,12 +153,13 @@ class HTTPZScanner: except AttributeError: debug(f'Failed to get SSL info for {url}') - html = (await response.text())[:1024*1024] - soup = bs4.BeautifulSoup(html, 'html.parser') + content_type = response.headers.get('Content-Type', '') + html = await response.text() if any(x in content_type.lower() for x in ['text/html', 'application/xhtml']) else None # Only add title if it exists - if soup.title and soup.title.string: - result['title'] = ' '.join(soup.title.string.strip().split()).rstrip('.')[:300] + if soup := bs4.BeautifulSoup(html, 'html.parser'): + if soup.title and soup.title.string: + result['title'] = ' '.join(soup.title.string.strip().split()).rstrip('.')[:300] # Only add body if it exists if body_text := soup.get_text(): @@ -210,32 +210,81 @@ class HTTPZScanner: async def scan(self, input_source): ''' - Scan domains from a file or stdin + Scan domains from a file, stdin, or async generator - :param input_source: Path to file or '-' for stdin + :param input_source: Can be: + - Path to file (str) + - stdin ('-') + - List/tuple of domains + - Async generator yielding domains + :yields: Result dictionary for each domain scanned ''' + if not self.resolvers: await self.init() async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: tasks = set() - # Pass shard info to input_generator - for domain in input_generator(input_source, self.shard): - if len(tasks) >= self.concurrent_limit: - done, tasks = await asyncio.wait( - tasks, return_when=asyncio.FIRST_COMPLETED - ) - for task in done: - result = await task - await self.process_result(result) + # Handle different input types + if isinstance(input_source, str): + # File or stdin input + domain_iter = input_generator(input_source, self.shard) + for domain in domain_iter: + if len(tasks) >= self.concurrent_limit: + done, tasks = await asyncio.wait( + tasks, return_when=asyncio.FIRST_COMPLETED + ) + for task in done: + result = await task + await self.process_result(result) + yield result - task = asyncio.create_task(self.check_domain(session, domain)) - tasks.add(task) + task = asyncio.create_task(self.check_domain(session, domain)) + tasks.add(task) + elif isinstance(input_source, (list, tuple)): + # List/tuple input + for line_num, domain in enumerate(input_source): + if domain := str(domain).strip(): + if self.shard is None or line_num % self.shard[1] == self.shard[0]: + if len(tasks) >= self.concurrent_limit: + done, tasks = await asyncio.wait( + tasks, return_when=asyncio.FIRST_COMPLETED + ) + for task in done: + result = await task + await self.process_result(result) + yield result + + task = asyncio.create_task(self.check_domain(session, domain)) + tasks.add(task) + else: + # Async generator input + line_num = 0 + async for domain in input_source: + if isinstance(domain, bytes): + domain = domain.decode() + domain = domain.strip() + + if domain: + if self.shard is None or line_num % self.shard[1] == self.shard[0]: + if len(tasks) >= self.concurrent_limit: + done, tasks = await asyncio.wait( + tasks, return_when=asyncio.FIRST_COMPLETED + ) + for task in done: + result = await task + await self.process_result(result) + yield result + + task = asyncio.create_task(self.check_domain(session, domain)) + tasks.add(task) + line_num += 1 # Process remaining tasks if tasks: done, _ = await asyncio.wait(tasks) for task in done: result = await task - await self.process_result(result) \ No newline at end of file + await self.process_result(result) + yield result \ No newline at end of file diff --git a/httpz_scanner/utils.py b/httpz_scanner/utils.py index 918e3b1..84cb74f 100644 --- a/httpz_scanner/utils.py +++ b/httpz_scanner/utils.py @@ -5,6 +5,7 @@ import logging import os import sys +import asyncio # Global for silent mode @@ -12,58 +13,58 @@ SILENT_MODE = False # List of user agents to randomize requests USER_AGENTS = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.6.5 Chrome/124.0.6367.243 Electron/30.1.2 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 OPR/116.0.0.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:134.0) Gecko/20100101 Firefox/134.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.8.3 Chrome/130.0.6723.191 Electron/33.3.2 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 Safari/605.1.15", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.6613.137 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.6 Safari/605.1.15", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:135.0) Gecko/20100101 Firefox/135.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.5.12 Chrome/120.0.6099.283 Electron/28.2.3 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0", - "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 OPR/114.0.0.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.7.7 Chrome/128.0.6613.186 Electron/32.2.5 Safari/537.36" + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.6.5 Chrome/124.0.6367.243 Electron/30.1.2 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 OPR/116.0.0.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:134.0) Gecko/20100101 Firefox/134.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.8.3 Chrome/130.0.6723.191 Electron/33.3.2 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.6613.137 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.6 Safari/605.1.15', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1.1 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:135.0) Gecko/20100101 Firefox/135.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.5.12 Chrome/120.0.6099.283 Electron/28.2.3 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0', + 'Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 OPR/114.0.0.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.1 Safari/605.1.15', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.7.7 Chrome/128.0.6613.186 Electron/32.2.5 Safari/537.36' ] @@ -98,9 +99,9 @@ def human_size(size_bytes: int) -> str: return f'{size:.1f}{units[unit_index]}' -def input_generator(input_source, shard: tuple = None): +async def input_generator(input_source, shard: tuple = None): ''' - Generator function to yield domains from various input sources with optional sharding + Async generator function to yield domains from various input sources with optional sharding :param input_source: Can be: - string path to local file @@ -116,6 +117,7 @@ def input_generator(input_source, shard: tuple = None): # Handle stdin if input_source == '-' or input_source is None: for line in sys.stdin: + await asyncio.sleep(0) # Yield control if line := line.strip(): if shard is None or line_num % shard[1] == shard[0]: yield line @@ -125,6 +127,7 @@ def input_generator(input_source, shard: tuple = None): elif isinstance(input_source, str) and os.path.exists(input_source): with open(input_source, 'r') as f: for line in f: + await asyncio.sleep(0) # Yield control if line := line.strip(): if shard is None or line_num % shard[1] == shard[0]: yield line @@ -133,6 +136,7 @@ def input_generator(input_source, shard: tuple = None): # Handle iterables (generators, lists, etc) elif hasattr(input_source, '__iter__') and not isinstance(input_source, (str, bytes)): for line in input_source: + await asyncio.sleep(0) # Yield control if isinstance(line, bytes): line = line.decode() if line := line.strip(): @@ -145,6 +149,7 @@ def input_generator(input_source, shard: tuple = None): if isinstance(input_source, bytes): input_source = input_source.decode() for line in input_source.splitlines(): + await asyncio.sleep(0) # Yield control if line := line.strip(): if shard is None or line_num % shard[1] == shard[0]: yield line diff --git a/setup.py b/setup.py index 6e93add..b520440 100644 --- a/setup.py +++ b/setup.py @@ -4,12 +4,13 @@ from setuptools import setup, find_packages + with open('README.md', 'r', encoding='utf-8') as f: long_description = f.read() setup( name='httpz_scanner', - version='2.0.0', + version='2.0.1', author='acidvegas', author_email='acid.vegas@acid.vegas', description='Hyper-fast HTTP Scraping Tool',