Allow any form of input for scanning
This commit is contained in:
parent
a6fc596547
commit
e27e5e4095
150
README.md
150
README.md
@ -16,7 +16,7 @@ A high-performance concurrent web scanner written in Python. HTTPZ efficiently s
|
|||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
### Via pip (recommended)
|
### Via pip *(recommended)*
|
||||||
```bash
|
```bash
|
||||||
# Install from PyPI
|
# Install from PyPI
|
||||||
pip install httpz_scanner
|
pip install httpz_scanner
|
||||||
@ -68,104 +68,136 @@ Full scan with all options:
|
|||||||
python -m httpz_scanner domains.txt -c 100 -o output.jsonl -j -all -to 10 -mc 200,301 -ec 404,500 -p -ax -r resolvers.txt
|
python -m httpz_scanner domains.txt -c 100 -o output.jsonl -j -all -to 10 -mc 200,301 -ec 404,500 -p -ax -r resolvers.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Distributed Scanning
|
||||||
|
Split scanning across multiple machines using the `--shard` argument:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Machine 1
|
||||||
|
httpz domains.txt --shard 1/3
|
||||||
|
|
||||||
|
# Machine 2
|
||||||
|
httpz domains.txt --shard 2/3
|
||||||
|
|
||||||
|
# Machine 3
|
||||||
|
httpz domains.txt --shard 3/3
|
||||||
|
```
|
||||||
|
|
||||||
|
Each machine will process a different subset of domains without overlap. For example, with 3 shards:
|
||||||
|
- Machine 1 processes lines 0,3,6,9,...
|
||||||
|
- Machine 2 processes lines 1,4,7,10,...
|
||||||
|
- Machine 3 processes lines 2,5,8,11,...
|
||||||
|
|
||||||
|
This allows efficient distribution of large scans across multiple machines.
|
||||||
|
|
||||||
### Python Library
|
### Python Library
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import aioboto3
|
||||||
from httpz_scanner import HTTPZScanner
|
from httpz_scanner import HTTPZScanner
|
||||||
|
|
||||||
async def scan_domains():
|
async def scan_domains():
|
||||||
# Initialize scanner with all possible options (showing defaults)
|
# Initialize scanner with all possible options (showing defaults)
|
||||||
scanner = HTTPZScanner(
|
scanner = HTTPZScanner(
|
||||||
# Core settings
|
# Core settings
|
||||||
concurrent_limit=100, # Number of concurrent requests
|
concurrent_limit=100, # Number of concurrent requests
|
||||||
timeout=5, # Request timeout in seconds
|
timeout=5, # Request timeout in seconds
|
||||||
follow_redirects=False, # Follow redirects (max 10)
|
follow_redirects=False, # Follow redirects (max 10)
|
||||||
check_axfr=False, # Try AXFR transfer against nameservers
|
check_axfr=False, # Try AXFR transfer against nameservers
|
||||||
resolver_file=None, # Path to custom DNS resolvers file
|
resolver_file=None, # Path to custom DNS resolvers file
|
||||||
output_file=None, # Path to JSONL output file
|
output_file=None, # Path to JSONL output file
|
||||||
show_progress=False, # Show progress counter
|
show_progress=False, # Show progress counter
|
||||||
debug_mode=False, # Show error states and debug info
|
debug_mode=False, # Show error states and debug info
|
||||||
jsonl_output=False, # Output in JSONL format
|
jsonl_output=False, # Output in JSONL format
|
||||||
|
shard=None, # Tuple of (zero-based shard_index, total_shards) for distributed scanning, e.g. (0, 3) for the first of 3 shards
|
||||||
|
|
||||||
# Control which fields to show (all False by default unless show_fields is None)
|
# Control which fields to show (all False by default unless show_fields is None)
|
||||||
show_fields={
|
show_fields={
|
||||||
'status_code': True, # Show status code
|
'status_code': True, # Show status code
|
||||||
'content_type': True, # Show content type
|
'content_type': True, # Show content type
|
||||||
'content_length': True, # Show content length
|
'content_length': True, # Show content length
|
||||||
'title': True, # Show page title
|
'title': True, # Show page title
|
||||||
'body': True, # Show body preview
|
'body': True, # Show body preview
|
||||||
'ip': True, # Show IP addresses
|
'ip': True, # Show IP addresses
|
||||||
'favicon': True, # Show favicon hash
|
'favicon': True, # Show favicon hash
|
||||||
'headers': True, # Show response headers
|
'headers': True, # Show response headers
|
||||||
'follow_redirects': True, # Show redirect chain
|
'follow_redirects': True, # Show redirect chain
|
||||||
'cname': True, # Show CNAME records
|
'cname': True, # Show CNAME records
|
||||||
'tls': True # Show TLS certificate info
|
'tls': True # Show TLS certificate info
|
||||||
},
|
},
|
||||||
|
|
||||||
# Filter results
|
# Filter results
|
||||||
match_codes={200, 301, 302}, # Only show these status codes
|
match_codes={200,301,302}, # Only show these status codes
|
||||||
exclude_codes={404, 500, 503} # Exclude these status codes
|
exclude_codes={404,500,503} # Exclude these status codes
|
||||||
)
|
)
|
||||||
|
|
||||||
# Initialize resolvers (required before scanning)
|
# Initialize resolvers (required before scanning)
|
||||||
await scanner.init()
|
await scanner.init()
|
||||||
|
|
||||||
# Scan domains from file
|
# Example 1: Stream from S3/MinIO using aioboto3
|
||||||
await scanner.scan('domains.txt')
|
async with aioboto3.Session().client('s3',
|
||||||
|
endpoint_url='http://minio.example.com:9000',
|
||||||
# Or scan from stdin
|
aws_access_key_id='access_key',
|
||||||
await scanner.scan('-')
|
aws_secret_access_key='secret_key') as s3:
|
||||||
|
|
||||||
|
response = await s3.get_object(Bucket='my-bucket', Key='huge-domains.txt')
|
||||||
|
async with response['Body'] as stream:
|
||||||
|
async def s3_generator():
|
||||||
|
while True:
|
||||||
|
line = await stream.readline()
|
||||||
|
if not line:
|
||||||
|
break
|
||||||
|
yield line.decode().strip()
|
||||||
|
|
||||||
|
await scanner.scan(s3_generator())
|
||||||
|
|
||||||
|
# Example 2: Stream from URL using aiohttp
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
# For large files - stream line by line
|
||||||
|
async with session.get('https://example.com/huge-domains.txt') as resp:
|
||||||
|
async def url_generator():
|
||||||
|
async for line in resp.content:
|
||||||
|
yield line.decode().strip()
|
||||||
|
|
||||||
|
await scanner.scan(url_generator())
|
||||||
|
|
||||||
|
# For small files - read all at once
|
||||||
|
async with session.get('https://example.com/small-domains.txt') as resp:
|
||||||
|
content = await resp.text()
|
||||||
|
await scanner.scan(content) # Library handles splitting into lines
|
||||||
|
|
||||||
|
# Example 3: Simple list of domains
|
||||||
|
domains = [
|
||||||
|
'example1.com',
|
||||||
|
'example2.com',
|
||||||
|
'example3.com'
|
||||||
|
]
|
||||||
|
await scanner.scan(domains)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
asyncio.run(scan_domains())
|
asyncio.run(scan_domains())
|
||||||
```
|
```
|
||||||
|
|
||||||
The scanner will return results in this format:
|
The scanner accepts various input types:
|
||||||
```python
|
- Async/sync generators that yield domains
|
||||||
{
|
- String content with newlines
|
||||||
'domain': 'example.com', # Base domain
|
- Lists/tuples of domains
|
||||||
'url': 'https://example.com', # Full URL
|
- File paths
|
||||||
'status': 200, # HTTP status code
|
- stdin (using '-')
|
||||||
'port': 443, # Port number
|
|
||||||
'title': 'Example Domain', # Page title
|
All inputs support sharding for distributed scanning.
|
||||||
'body': 'Example body text...', # Body preview
|
|
||||||
'content_type': 'text/html', # Content type
|
|
||||||
'content_length': '12345', # Content length
|
|
||||||
'ips': ['93.184.216.34'], # IP addresses
|
|
||||||
'cname': 'cdn.example.com', # CNAME record
|
|
||||||
'nameservers': ['ns1.example.com'],# Nameservers
|
|
||||||
'favicon_hash': '123456789', # Favicon hash
|
|
||||||
'headers': { # Response headers
|
|
||||||
'Server': 'nginx',
|
|
||||||
'Content-Type': 'text/html'
|
|
||||||
},
|
|
||||||
'redirect_chain': [ # Redirect history
|
|
||||||
'http://example.com',
|
|
||||||
'https://example.com'
|
|
||||||
],
|
|
||||||
'tls': { # TLS certificate info
|
|
||||||
'fingerprint': 'sha256...',
|
|
||||||
'common_name': 'example.com',
|
|
||||||
'issuer': 'Let\'s Encrypt',
|
|
||||||
'alt_names': ['www.example.com'],
|
|
||||||
'not_before': '2023-01-01T00:00:00',
|
|
||||||
'not_after': '2024-01-01T00:00:00',
|
|
||||||
'version': 3,
|
|
||||||
'serial_number': 'abcdef1234'
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Arguments
|
## Arguments
|
||||||
|
|
||||||
| Argument | Long Form | Description |
|
| Argument | Long Form | Description |
|
||||||
|-----------|------------------|-------------------------------------------------------------|
|
|---------------|------------------|-------------------------------------------------------------|
|
||||||
| `file` | - | File containing domains *(one per line)*, use `-` for stdin |
|
| `file` | | File containing domains *(one per line)*, use `-` for stdin |
|
||||||
| `-d` | `--debug` | Show error states and debug information |
|
| `-d` | `--debug` | Show error states and debug information |
|
||||||
| `-c N` | `--concurrent N` | Number of concurrent checks *(default: 100)* |
|
| `-c N` | `--concurrent N` | Number of concurrent checks *(default: 100)* |
|
||||||
| `-o FILE` | `--output FILE` | Output file path *(JSONL format)* |
|
| `-o FILE` | `--output FILE` | Output file path *(JSONL format)* |
|
||||||
| `-j` | `--jsonl` | Output JSON Lines format to console |
|
| `-j` | `--jsonl` | Output JSON Lines format to console |
|
||||||
| `-all` | `--all-flags` | Enable all output flags |
|
| `-all` | `--all-flags` | Enable all output flags |
|
||||||
|
| `-sh` | `--shard N/T` | Process shard N of T total shards *(e.g., 1/3)* |
|
||||||
|
|
||||||
### Output Field Flags
|
### Output Field Flags
|
||||||
|
|
||||||
@ -191,5 +223,5 @@ The scanner will return results in this format:
|
|||||||
| `-mc CODES` | `--match-codes CODES` | Only show specific status codes *(comma-separated)* |
|
| `-mc CODES` | `--match-codes CODES` | Only show specific status codes *(comma-separated)* |
|
||||||
| `-ec CODES` | `--exclude-codes CODES` | Exclude specific status codes *(comma-separated)* |
|
| `-ec CODES` | `--exclude-codes CODES` | Exclude specific status codes *(comma-separated)* |
|
||||||
| `-p` | `--progress` | Show progress counter |
|
| `-p` | `--progress` | Show progress counter |
|
||||||
| `-ax` | `--axfr` | Try AXFR transfer against nameservers |
|
| `-ax` | `--axfr` | Try AXFR transfer against nameservers |
|
||||||
| `-r FILE` | `--resolvers FILE` | File containing DNS resolvers *(one per line)* |
|
| `-r FILE` | `--resolvers FILE` | File containing DNS resolvers *(one per line)* |
|
@ -11,6 +11,7 @@ import sys
|
|||||||
from .colors import Colors
|
from .colors import Colors
|
||||||
from .scanner import HTTPZScanner
|
from .scanner import HTTPZScanner
|
||||||
from .utils import SILENT_MODE, info
|
from .utils import SILENT_MODE, info
|
||||||
|
from .parsers import parse_status_codes, parse_shard
|
||||||
|
|
||||||
def setup_logging(level='INFO', log_to_disk=False):
|
def setup_logging(level='INFO', log_to_disk=False):
|
||||||
'''
|
'''
|
||||||
@ -49,25 +50,6 @@ def setup_logging(level='INFO', log_to_disk=False):
|
|||||||
handlers=handlers
|
handlers=handlers
|
||||||
)
|
)
|
||||||
|
|
||||||
def parse_status_codes(codes_str: str) -> set:
|
|
||||||
'''
|
|
||||||
Parse comma-separated status codes and ranges into a set of integers
|
|
||||||
|
|
||||||
:param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503")
|
|
||||||
'''
|
|
||||||
|
|
||||||
codes = set()
|
|
||||||
try:
|
|
||||||
for part in codes_str.split(','):
|
|
||||||
if '-' in part:
|
|
||||||
start, end = map(int, part.split('-'))
|
|
||||||
codes.update(range(start, end + 1))
|
|
||||||
else:
|
|
||||||
codes.add(int(part))
|
|
||||||
return codes
|
|
||||||
except ValueError:
|
|
||||||
raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)')
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description=f'{Colors.GREEN}Hyper-fast HTTP Scraping Tool{Colors.RESET}',
|
description=f'{Colors.GREEN}Hyper-fast HTTP Scraping Tool{Colors.RESET}',
|
||||||
@ -103,6 +85,9 @@ async def main():
|
|||||||
parser.add_argument('-r', '--resolvers', help='File containing DNS resolvers (one per line)')
|
parser.add_argument('-r', '--resolvers', help='File containing DNS resolvers (one per line)')
|
||||||
parser.add_argument('-to', '--timeout', type=int, default=5, help='Request timeout in seconds')
|
parser.add_argument('-to', '--timeout', type=int, default=5, help='Request timeout in seconds')
|
||||||
|
|
||||||
|
# Add shard argument
|
||||||
|
parser.add_argument('-sh','--shard', type=parse_shard, help='Shard index and total shards (e.g., 1/3)')
|
||||||
|
|
||||||
# If no arguments provided, print help and exit
|
# If no arguments provided, print help and exit
|
||||||
if len(sys.argv) == 1:
|
if len(sys.argv) == 1:
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
@ -158,7 +143,8 @@ async def main():
|
|||||||
jsonl_output=args.jsonl,
|
jsonl_output=args.jsonl,
|
||||||
show_fields=show_fields,
|
show_fields=show_fields,
|
||||||
match_codes=args.match_codes,
|
match_codes=args.match_codes,
|
||||||
exclude_codes=args.exclude_codes
|
exclude_codes=args.exclude_codes,
|
||||||
|
shard=args.shard
|
||||||
)
|
)
|
||||||
|
|
||||||
# Run the scanner with file/stdin input
|
# Run the scanner with file/stdin input
|
||||||
|
@ -20,6 +20,7 @@ except ImportError:
|
|||||||
raise ImportError('missing mmh3 module (pip install mmh3)')
|
raise ImportError('missing mmh3 module (pip install mmh3)')
|
||||||
|
|
||||||
from .utils import debug, error
|
from .utils import debug, error
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
def parse_domain_url(domain: str) -> tuple:
|
def parse_domain_url(domain: str) -> tuple:
|
||||||
@ -108,6 +109,7 @@ async def get_favicon_hash(session, base_url: str, html: str) -> str:
|
|||||||
:param base_url: base URL of the website
|
:param base_url: base URL of the website
|
||||||
:param html: HTML content of the page
|
:param html: HTML content of the page
|
||||||
'''
|
'''
|
||||||
|
|
||||||
try:
|
try:
|
||||||
soup = bs4.BeautifulSoup(html, 'html.parser')
|
soup = bs4.BeautifulSoup(html, 'html.parser')
|
||||||
|
|
||||||
@ -137,4 +139,39 @@ async def get_favicon_hash(session, base_url: str, html: str) -> str:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
debug(f'Error getting favicon for {base_url}: {str(e)}')
|
debug(f'Error getting favicon for {base_url}: {str(e)}')
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def parse_status_codes(codes_str: str) -> set:
    '''
    Parse comma-separated status codes and ranges into a set of integers

    :param codes_str: Comma-separated status codes (e.g., "200,301-399,404,500-503")
    :return: Set containing every individual code and every code covered by a range (ranges are inclusive)
    :raises argparse.ArgumentTypeError: If a part is not an integer, a range does not have exactly two bounds, or a range is reversed (start greater than end)
    '''

    codes = set()

    try:
        for part in codes_str.split(','):
            if '-' in part:
                # Unpacking fails with ValueError when there are not exactly two bounds
                start, end = map(int, part.split('-'))

                # A reversed range would silently contribute nothing, so treat it as a format error
                if start > end:
                    raise ValueError(f'reversed range: {part}')

                codes.update(range(start, end + 1))
            else:
                codes.add(int(part))
    except ValueError:
        # "from None" keeps argparse's output limited to the user-facing message
        raise argparse.ArgumentTypeError('Invalid status code format. Use comma-separated numbers or ranges (e.g., 200,301-399,404,500-503)') from None

    return codes
|
||||||
|
|
||||||
|
|
||||||
|
def parse_shard(shard_str: str) -> tuple:
    '''
    Parse shard argument in format INDEX/TOTAL

    :param shard_str: Shard string in format "INDEX/TOTAL" where INDEX is 1-based (e.g., "1/3")
    :return: Tuple of (zero-based shard index, total shards)
    :raises argparse.ArgumentTypeError: If the string is not two integers separated by "/" or INDEX is outside 1..TOTAL
    '''

    message = 'Shard must be in format INDEX/TOTAL where 1 <= INDEX <= TOTAL'

    try:
        # Unpacking fails with ValueError unless there are exactly two integer fields
        shard_index, total_shards = map(int, shard_str.split('/'))
    except (ValueError, TypeError):
        raise argparse.ArgumentTypeError(message) from None

    # 1 <= INDEX <= TOTAL also guarantees TOTAL >= 1
    if not 1 <= shard_index <= total_shards:
        raise argparse.ArgumentTypeError(message)

    return shard_index - 1, total_shards # Convert to 0-based index
|
@ -27,7 +27,7 @@ from .utils import debug, info, USER_AGENTS, input_generator
|
|||||||
class HTTPZScanner:
|
class HTTPZScanner:
|
||||||
'''Core scanner class for HTTP domain checking'''
|
'''Core scanner class for HTTP domain checking'''
|
||||||
|
|
||||||
def __init__(self, concurrent_limit = 100, timeout = 5, follow_redirects = False, check_axfr = False, resolver_file = None, output_file = None, show_progress = False, debug_mode = False, jsonl_output = False, show_fields = None, match_codes = None, exclude_codes = None):
|
def __init__(self, concurrent_limit = 100, timeout = 5, follow_redirects = False, check_axfr = False, resolver_file = None, output_file = None, show_progress = False, debug_mode = False, jsonl_output = False, show_fields = None, match_codes = None, exclude_codes = None, shard = None):
|
||||||
'''
|
'''
|
||||||
Initialize the HTTPZScanner class
|
Initialize the HTTPZScanner class
|
||||||
|
|
||||||
@ -43,6 +43,7 @@ class HTTPZScanner:
|
|||||||
:param show_fields: Fields to show
|
:param show_fields: Fields to show
|
||||||
:param match_codes: Status codes to match
|
:param match_codes: Status codes to match
|
||||||
:param exclude_codes: Status codes to exclude
|
:param exclude_codes: Status codes to exclude
|
||||||
|
:param shard: Tuple of (shard_index, total_shards) for distributed scanning
|
||||||
'''
|
'''
|
||||||
|
|
||||||
self.concurrent_limit = concurrent_limit
|
self.concurrent_limit = concurrent_limit
|
||||||
@ -54,6 +55,7 @@ class HTTPZScanner:
|
|||||||
self.show_progress = show_progress
|
self.show_progress = show_progress
|
||||||
self.debug_mode = debug_mode
|
self.debug_mode = debug_mode
|
||||||
self.jsonl_output = jsonl_output
|
self.jsonl_output = jsonl_output
|
||||||
|
self.shard = shard
|
||||||
|
|
||||||
self.show_fields = show_fields or {
|
self.show_fields = show_fields or {
|
||||||
'status_code' : True,
|
'status_code' : True,
|
||||||
@ -218,8 +220,8 @@ class HTTPZScanner:
|
|||||||
async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
|
async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
|
||||||
tasks = set()
|
tasks = set()
|
||||||
|
|
||||||
# Process domains with concurrent limit
|
# Pass shard info to input_generator
|
||||||
for domain in input_generator(input_source):
|
for domain in input_generator(input_source, self.shard):
|
||||||
if len(tasks) >= self.concurrent_limit:
|
if len(tasks) >= self.concurrent_limit:
|
||||||
done, tasks = await asyncio.wait(
|
done, tasks = await asyncio.wait(
|
||||||
tasks, return_when=asyncio.FIRST_COMPLETED
|
tasks, return_when=asyncio.FIRST_COMPLETED
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
# httpz_scanner/utils.py
|
# httpz_scanner/utils.py
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
@ -97,19 +98,54 @@ def human_size(size_bytes: int) -> str:
|
|||||||
return f'{size:.1f}{units[unit_index]}'
|
return f'{size:.1f}{units[unit_index]}'
|
||||||
|
|
||||||
|
|
||||||
def input_generator(input_source, shard: tuple = None):
    '''
    Generator function to yield domains from various input sources with optional sharding

    Every entry is stripped of surrounding whitespace and blank entries are dropped.
    When sharding, only non-blank entries are counted, and an entry is yielded when
    its zero-based position modulo total_shards equals shard_index.

    :param input_source: Can be:
        - string path to an existing local file
        - "-" (or None) for stdin
        - list/tuple of domains
        - generator/iterator yielding domains (str or bytes)
        - string or bytes content with newlines
    :param shard: Tuple of (zero-based shard_index, total_shards) for distributed scanning, or None to yield everything
    :raises TypeError: If input_source is none of the supported types (e.g. an async generator, which cannot be consumed by this synchronous generator)
    '''

    def _select(lines):
        '''Strip entries, skip blanks and keep only the entries that belong to the requested shard'''

        line_num = 0

        for line in lines:
            if isinstance(line, bytes):
                line = line.decode()

            if line := line.strip():
                if shard is None or line_num % shard[1] == shard[0]:
                    yield line

                # Counted whether or not it was yielded so every shard sees the same numbering
                line_num += 1

    # Handle stdin
    if input_source == '-' or input_source is None:
        yield from _select(sys.stdin)

    # Handle local files
    elif isinstance(input_source, str) and os.path.exists(input_source):
        with open(input_source, 'r') as f:
            yield from _select(f)

    # Handle iterables (generators, lists, etc)
    elif hasattr(input_source, '__iter__') and not isinstance(input_source, (str, bytes)):
        yield from _select(input_source)

    # Handle string content with newlines
    elif isinstance(input_source, (str, bytes)):
        if isinstance(input_source, bytes):
            input_source = input_source.decode()

        yield from _select(input_source.splitlines())

    # Anything else used to fall through and silently yield nothing
    else:
        raise TypeError(f'Unsupported input source type: {type(input_source).__name__}')
|
Loading…
Reference in New Issue
Block a user