Updated README, copied over consistencies across the ingestors, docstring updates to reflect on new arguments

This commit is contained in:
Dionysus 2024-01-27 04:28:30 -05:00
parent 88e0dbfea8
commit c105db705d
Signed by: acidvegas
GPG Key ID: EF4B922DB85DC9DE
6 changed files with 456 additions and 188 deletions

View File

@ -13,6 +13,10 @@ python ingest_XXXX.py [options] <input>
``` ```
**Note:** The `<input>` can be a file or a directory of files, depending on the ingestion script. **Note:** The `<input>` can be a file or a directory of files, depending on the ingestion script.
## Operations
This ingestion suite will use the built-in node sniffer, so by connecting to a single node, you can load balance across the entire cluster.
It is good to know how many nodes you have in the cluster to determine how to fine-tune the arguments for the best performance, based on your environment.
### ###
###### General arguments ###### General arguments
| Argument | Description | | Argument | Description |
@ -22,28 +26,32 @@ python ingest_XXXX.py [options] <input>
| `--watch` | Watch the input file for new lines and index them in real time | | `--watch` | Watch the input file for new lines and index them in real time |
###### Elasticsearch arguments ###### Elasticsearch arguments
| Argument | Description | Default | | Argument | Description | Default |
|-------------------|--------------------------------------------------------------------------------------|---------------| |-------------------|--------------------------------------------------------|----------------|
| `--host` | Elasticsearch host *(Will sniff for other nodes in the cluster)* | `localhost` | | `--host` | Elasticsearch host | `localhost` |
| `--port` | Elasticsearch port | `9200` | | `--port` | Elasticsearch port | `9200` |
| `--user` | Elasticsearch username | `elastic` | | `--user` | Elasticsearch username | `elastic` |
| `--password` | Elasticsearch password *(if not provided, check environment variable `ES_PASSWORD`)* | | | `--password` | Elasticsearch password | `$ES_PASSWORD` |
| `--api-key` | Elasticsearch API Key for authentication | | | `--api-key` | Elasticsearch API Key for authentication | |
| `--self-signed`   | Elasticsearch instance is using a self-signed certificate                             | `true`        | | `--self-signed`   | Elasticsearch connection with a self-signed certificate | |
| `--index` | Elasticsearch index name | `masscan-logs`|
| `--shards` | Number of shards for the index | `1` | ###### Elasticsearch indexing arguments
| `--replicas` | Number of replicas for the index | `1` | | Argument | Description | Default |
|-------------------|----------------------------------|---------------------|
| `--index` | Elasticsearch index name | Depends on ingestor |
| `--shards` | Number of shards for the index | `1` |
| `--replicas` | Number of replicas for the index | `1` |
###### Performance arguments ###### Performance arguments
| Argument | Description | Default | | Argument | Description | Default |
|-------------------|--------------------------------------------------------------------------------------|---------------| |-------------------|----------------------------------------------------------|---------|
| `--batch-max` | Maximum size in MB of a batch | `10` | | `--batch-max` | Maximum size in MB of a batch | `10` |
| `--batch-size` | Number of records to index in a batch | `5000` | | `--batch-size` | Number of records to index in a batch | `5000` |
| `--batch-threads` | Number of threads to use when indexing in batches | `2` | | `--batch-threads` | Number of threads to use when indexing in batches | `2` |
| `--retries` | Number of times to retry indexing a batch before failing | `10` | | `--retries` | Number of times to retry indexing a batch before failing | `10` |
| `--timeout` | Number of seconds to wait before retrying a batch | `30` | | `--timeout` | Number of seconds to wait before retrying a batch | `30` |
**NOTE:** Using `--batch-threads` as 4 and `--batch-size` as 10000 with 3 nodes would process 120,000 records before indexing 40,000 per node. Using `--batch-threads` as 4 and `--batch-size` as 10000 with 3 nodes would process 120,000 records before indexing 40,000 per node. Take these kinds of metrics into account when considering how many records you want to process at once and the memory limitations of your environment, as well as the networking constraints it may have on your node(s), depending on the size of your cluster.
___ ___

View File

@ -15,7 +15,7 @@ import os
import time import time
try: try:
from elasticsearch import Elasticsearch, helpers, ConnectionError, TransportError from elasticsearch import Elasticsearch, helpers
except ImportError: except ImportError:
raise ImportError('Missing required \'elasticsearch\' library. (pip install elasticsearch)') raise ImportError('Missing required \'elasticsearch\' library. (pip install elasticsearch)')
@ -25,11 +25,11 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(level
class ElasticIndexer: class ElasticIndexer:
def __init__(self, es_host: str, es_port: str, es_user: str, es_password: str, es_api_key: str, es_index: str, dry_run: bool = False, self_signed: bool = False): def __init__(self, es_host: str, es_port: int, es_user: str, es_password: str, es_api_key: str, es_index: str, dry_run: bool = False, self_signed: bool = False, retries: int = 10, timeout: int = 30):
''' '''
Initialize the Elastic Search indexer. Initialize the Elastic Search indexer.
:param es_host: Elasticsearch host :param es_host: Elasticsearch host(s)
:param es_port: Elasticsearch port :param es_port: Elasticsearch port
:param es_user: Elasticsearch username :param es_user: Elasticsearch username
:param es_password: Elasticsearch password :param es_password: Elasticsearch password
@ -37,6 +37,8 @@ class ElasticIndexer:
:param es_index: Elasticsearch index name :param es_index: Elasticsearch index name
:param dry_run: If True, do not initialize Elasticsearch client :param dry_run: If True, do not initialize Elasticsearch client
:param self_signed: If True, do not verify SSL certificates :param self_signed: If True, do not verify SSL certificates
:param retries: Number of times to retry indexing a batch before failing
:param timeout: Number of seconds to wait before retrying a batch
''' '''
self.dry_run = dry_run self.dry_run = dry_run
@ -44,19 +46,53 @@ class ElasticIndexer:
self.es_index = es_index self.es_index = es_index
if not dry_run: if not dry_run:
es_config = {
'hosts': [f'{es_host}:{es_port}'],
'verify_certs': self_signed,
'ssl_show_warn': self_signed,
'request_timeout': timeout,
'max_retries': retries,
'retry_on_timeout': True,
'sniff_on_start': False,
'sniff_on_node_failure': True,
'min_delay_between_sniffing': 60
}
if es_api_key: if es_api_key:
self.es = Elasticsearch([f'{es_host}:{es_port}'], headers={'Authorization': f'ApiKey {es_api_key}'}, verify_certs=self_signed, ssl_show_warn=self_signed) es_config['headers'] = {'Authorization': f'ApiKey {es_api_key}'}
else: else:
self.es = Elasticsearch([f'{es_host}:{es_port}'], basic_auth=(es_user, es_password), verify_certs=self_signed, ssl_show_warn=self_signed) es_config['basic_auth'] = (es_user, es_password)
# Patching the Elasticsearch client to fix a bug with sniffing (https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960)
import sniff_patch
self.es = sniff_patch.init_elasticsearch(**es_config)
# Remove the above and uncomment the below if the bug is fixed in the Elasticsearch client:
#self.es = Elasticsearch(**es_config)
def process_file(self, file_path: str, batch_size: int): def get_cluster_health(self) -> dict:
'''Get the health of the Elasticsearch cluster.'''
return self.es.cluster.health()
def get_cluster_size(self) -> int:
    '''
    Return the total number of nodes in the Elasticsearch cluster.

    :return: Node count as reported by the cluster stats API
    '''
    # The stats API reports the node total under nodes.count.total
    stats = self.es.cluster.stats()
    return stats['nodes']['count']['total']
def process_file(self, file_path: str, watch: bool = False, chunk: dict = {}):
''' '''
Read and index HTTPX records in batches to Elasticsearch, handling large volumes efficiently. Read and index HTTPX records in batches to Elasticsearch, handling large volumes efficiently.
:param file_path: Path to the HTTPX log file :param file_path: Path to the HTTPX log file
:param batch_size: Number of records to process before indexing :param watch: If True, watch the file for new lines and index them in real time
:param chunk: Chunking configuration
Example record: Example record:
{ {
@ -110,7 +146,7 @@ class ElasticIndexer:
records = [] records = []
with open(file_path, 'r') as file: with open(file_path, 'r') as file:
for line in file: for line in (file := follow(file) if watch else file):
line = line.strip() line = line.strip()
if not line: if not line:
@ -129,41 +165,83 @@ class ElasticIndexer:
record = {'_index': self.es_index, '_source': record} record = {'_index': self.es_index, '_source': record}
records.append(record) records.append(record)
count += 1 count += 1
if len(records) >= batch_size: if len(records) >= chunk['batch']:
while True: self.bulk_index(records, file_path, chunk, count)
try: records = []
success, _ = helpers.bulk(self.es, records)
except (ConnectionError, TransportError) as e:
logging.error(f'Failed to index records to Elasticsearch. ({e})')
time.sleep(60)
else:
logging.info(f'Successfully indexed {success:,} ({count:,}) records to {self.es_index} from {file_path}')
records = []
break
if records: if records:
while True: self.bulk_index(records, file_path, chunk, count)
try:
success, _ = helpers.bulk(self.es, records) def bulk_index(self, documents: list, file_path: str, chunk: dict, count: int):
except (ConnectionError, TransportError) as e: '''
logging.error(f'Failed to index records to Elasticsearch. ({e})') Index a batch of documents to Elasticsearch.
time.sleep(60)
else: :param documents: List of documents to index
logging.info(f'Successfully indexed {success:,} ({count:,}) records to {self.es_index} from {file_path}') :param file_path: Path to the file being indexed
:param count: Total number of records processed
'''
remaining_documents = documents
parallel_bulk_config = {
'client': self.es,
'chunk_size': chunk['size'],
'max_chunk_bytes': chunk['max_size'] * 1024 * 1024, # MB
'thread_count': chunk['threads'],
'queue_size': 2
}
while remaining_documents:
failed_documents = []
try:
for success, response in helpers.parallel_bulk(actions=remaining_documents, **parallel_bulk_config):
if not success:
failed_documents.append(response)
if not failed_documents:
ingested = parallel_bulk_config['chunk_size'] * parallel_bulk_config['thread_count']
logging.info(f'Successfully indexed {ingested:,} ({count:,} processed) records to {self.es_index} from {file_path}')
break break
else:
logging.warning(f'Failed to index {len(failed_documents):,} failed documents! Retrying...')
remaining_documents = failed_documents
except Exception as e:
logging.error(f'Failed to index documents! ({e})')
time.sleep(30)
def follow(file):
    '''
    Generator that yields new lines appended to a file in real time (like `tail -f`).

    Note: the original annotated this as `-> str`, but a generator function
    returns a generator object, not a str — each yielded item is a str line.

    :param file: Open file object to read from
    :yields: Each new line (str) as it is appended to the file
    '''
    file.seek(0, 2)  # Seek to the end of the file so only new lines are yielded
    while True:
        line = file.readline()
        if not line:
            # No new data yet; poll again shortly instead of busy-waiting
            time.sleep(0.1)
            continue
        yield line
def main(): def main():
'''Main function when running this script directly.''' '''Main function when running this script directly.'''
parser = argparse.ArgumentParser(description='Index data into Elasticsearch.') parser = argparse.ArgumentParser(description='Index data into Elasticsearch.')
parser.add_argument('input_path', help='Path to the input file or directory')
# General arguments # General arguments
parser.add_argument('input_path', help='Path to the input file or directory') # Required
parser.add_argument('--dry-run', action='store_true', help='Dry run (do not index records to Elasticsearch)') parser.add_argument('--dry-run', action='store_true', help='Dry run (do not index records to Elasticsearch)')
parser.add_argument('--batch_size', type=int, default=50000, help='Number of records to index in a batch') parser.add_argument('--watch', action='store_true', help='Watch the input file for new lines and index them in real time')
# Elasticsearch connection arguments # Elasticsearch arguments
parser.add_argument('--host', default='localhost', help='Elasticsearch host') parser.add_argument('--host', default='localhost', help='Elasticsearch host')
parser.add_argument('--port', type=int, default=9200, help='Elasticsearch port') parser.add_argument('--port', type=int, default=9200, help='Elasticsearch port')
parser.add_argument('--user', default='elastic', help='Elasticsearch username') parser.add_argument('--user', default='elastic', help='Elasticsearch username')
@ -172,54 +250,87 @@ def main():
parser.add_argument('--self-signed', action='store_false', help='Elasticsearch is using self-signed certificates') parser.add_argument('--self-signed', action='store_false', help='Elasticsearch is using self-signed certificates')
# Elasticsearch indexing arguments # Elasticsearch indexing arguments
parser.add_argument('--index', default='zone-files', help='Elasticsearch index name') parser.add_argument('--index', default='httpx-logs', help='Elasticsearch index name')
parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index') parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index')
parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index') parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index')
# Performance arguments
parser.add_argument('--batch-max', type=int, default=10, help='Maximum size in MB of a batch')
parser.add_argument('--batch-size', type=int, default=5000, help='Number of records to index in a batch')
parser.add_argument('--batch-threads', type=int, default=2, help='Number of threads to use when indexing in batches')
parser.add_argument('--retries', type=int, default=10, help='Number of times to retry indexing a batch before failing')
parser.add_argument('--timeout', type=int, default=30, help='Number of seconds to wait before retrying a batch')
args = parser.parse_args() args = parser.parse_args()
if not os.path.exists(args.input_path): # Argument validation
raise FileNotFoundError(f'Input file {args.input_path} does not exist') if not os.path.isdir(args.input_path) and not os.path.isfile(args.input_path):
raise FileNotFoundError(f'Input path {args.input_path} does not exist or is not a file or directory')
if not args.dry_run: if not args.dry_run:
if args.batch_size < 1: if args.batch_size < 1:
raise ValueError('Batch size must be greater than 0') raise ValueError('Batch size must be greater than 0')
if not args.host: elif args.retries < 1:
raise ValueError('Number of retries must be greater than 0')
elif args.timeout < 5:
raise ValueError('Timeout must be greater than 4')
elif args.batch_max < 1:
raise ValueError('Batch max size must be greater than 0')
elif args.batch_threads < 1:
raise ValueError('Batch threads must be greater than 0')
elif not args.host:
raise ValueError('Missing required Elasticsearch argument: host') raise ValueError('Missing required Elasticsearch argument: host')
if not args.api_key and (not args.user or not args.password): elif not args.api_key and (not args.user or not args.password):
raise ValueError('Missing required Elasticsearch argument: either user and password or apikey') raise ValueError('Missing required Elasticsearch argument: either user and password or apikey')
if args.shards < 1: elif args.shards < 1:
raise ValueError('Number of shards must be greater than 0') raise ValueError('Number of shards must be greater than 0')
if args.replicas < 1: elif args.replicas < 0:
raise ValueError('Number of replicas must be greater than 0') raise ValueError('Number of replicas must be greater than 0')
logging.info(f'Connecting to Elasticsearch at {args.host}:{args.port}...') edx = ElasticIndexer(args.host, args.port, args.user, args.password, args.api_key, args.index, args.dry_run, args.self_signed, args.retries, args.timeout)
edx = ElasticIndexer(args.host, args.port, args.user, args.password, args.api_key, args.index, args.dry_run, args.self_signed)
if not args.dry_run: if not args.dry_run:
print(edx.get_cluster_health())
time.sleep(3) # Delay to allow time for sniffing to complete
nodes = edx.get_cluster_size()
logging.info(f'Connected to {nodes:,} Elasticsearch node(s)')
edx.create_index(args.shards, args.replicas) # Create the index if it does not exist edx.create_index(args.shards, args.replicas) # Create the index if it does not exist
chunk = {
'size': args.batch_size,
'max_size': args.batch_max * 1024 * 1024, # MB
'threads': args.batch_threads
}
chunk['batch'] = nodes * (chunk['size'] * chunk['threads'])
else:
chunk = {} # Ugly hack to get this working...
if os.path.isfile(args.input_path): if os.path.isfile(args.input_path):
logging.info(f'Processing file: {args.input_path}') logging.info(f'Processing file: {args.input_path}')
edx.process_file(args.input_path, args.batch_size) edx.process_file(args.input_path, args.watch, chunk)
elif os.path.isdir(args.input_path): elif os.path.isdir(args.input_path):
logging.info(f'Processing files in directory: {args.input_path}') count = 1
total = len(os.listdir(args.input_path))
logging.info(f'Processing {total:,} files in directory: {args.input_path}')
for file in sorted(os.listdir(args.input_path)): for file in sorted(os.listdir(args.input_path)):
file_path = os.path.join(args.input_path, file) file_path = os.path.join(args.input_path, file)
if os.path.isfile(file_path): if os.path.isfile(file_path):
logging.info(f'Processing file: {file_path}') logging.info(f'[{count:,}/{total:,}] Processing file: {file_path}')
edx.process_file(file_path, args.batch_size) edx.process_file(file_path, args.watch, chunk)
count += 1
else: else:
raise ValueError(f'Input path {args.input_path} is not a file or directory') logging.warning(f'[{count:,}/{total:,}] Skipping non-file: {file_path}')
if __name__ == '__main__':
main()

View File

@ -60,7 +60,7 @@ class ElasticIndexer:
'request_timeout': timeout, 'request_timeout': timeout,
'max_retries': retries, 'max_retries': retries,
'retry_on_timeout': True, 'retry_on_timeout': True,
'sniff_on_start': True, 'sniff_on_start': False,
'sniff_on_node_failure': True, 'sniff_on_node_failure': True,
'min_delay_between_sniffing': 60 'min_delay_between_sniffing': 60
} }
@ -70,11 +70,21 @@ class ElasticIndexer:
else: else:
es_config['basic_auth'] = (es_user, es_password) es_config['basic_auth'] = (es_user, es_password)
self.es = Elasticsearch(**es_config) # Patching the Elasticsearch client to fix a bug with sniffing (https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960)
import sniff_patch
self.es = sniff_patch.init_elasticsearch(**es_config)
# Remove the above and uncomment the below if the bug is fixed in the Elasticsearch client:
#self.es = Elasticsearch(**es_config)
def create_index(self, shards: int = 1, replicas: int = 1): def create_index(self, shards: int = 1, replicas: int = 1):
'''Create the Elasticsearch index with the defined mapping.''' '''
Create the Elasticsearch index with the defined mapping.
:param shards: Number of shards for the index
:param replicas: Number of replicas for the index
'''
mapping = { mapping = {
'settings': { 'settings': {
@ -106,6 +116,12 @@ class ElasticIndexer:
logging.warning(f'Index \'{self.es_index}\' already exists.') logging.warning(f'Index \'{self.es_index}\' already exists.')
def get_cluster_health(self) -> dict:
'''Get the health of the Elasticsearch cluster.'''
return self.es.cluster.health()
def get_cluster_size(self) -> int: def get_cluster_size(self) -> int:
'''Get the number of nodes in the Elasticsearch cluster.''' '''Get the number of nodes in the Elasticsearch cluster.'''
@ -120,8 +136,8 @@ class ElasticIndexer:
Read and index Masscan records in batches to Elasticsearch, handling large volumes efficiently. Read and index Masscan records in batches to Elasticsearch, handling large volumes efficiently.
:param file_path: Path to the Masscan log file :param file_path: Path to the Masscan log file
:param batch_size: Number of records to process before indexing :param watch: If True, input file will be watched for new lines and indexed in real time\
:param watch: If True, input file will be watched for new lines and indexed in real time :param chunk: Chunk configuration for indexing in batches
Example record: Example record:
{ {
@ -207,6 +223,7 @@ class ElasticIndexer:
:param file_path: Path to the file being indexed :param file_path: Path to the file being indexed
:param count: Total number of records processed :param count: Total number of records processed
''' '''
remaining_documents = documents remaining_documents = documents
parallel_bulk_config = { parallel_bulk_config = {
@ -261,9 +278,9 @@ def main():
'''Main function when running this script directly.''' '''Main function when running this script directly.'''
parser = argparse.ArgumentParser(description='Index data into Elasticsearch.') parser = argparse.ArgumentParser(description='Index data into Elasticsearch.')
parser.add_argument('input_path', help='Path to the input file or directory')
# General arguments # General arguments
parser.add_argument('input_path', help='Path to the input file or directory') # Required
parser.add_argument('--dry-run', action='store_true', help='Dry run (do not index records to Elasticsearch)') parser.add_argument('--dry-run', action='store_true', help='Dry run (do not index records to Elasticsearch)')
parser.add_argument('--watch', action='store_true', help='Watch the input file for new lines and index them in real time') parser.add_argument('--watch', action='store_true', help='Watch the input file for new lines and index them in real time')
@ -280,22 +297,18 @@ def main():
parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index') parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index')
parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index') parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index')
# Batch arguments (for fine tuning performance) # Performance arguments
parser.add_argument('--batch-max', type=int, default=10, help='Maximum size in MB of a batch') parser.add_argument('--batch-max', type=int, default=10, help='Maximum size in MB of a batch')
parser.add_argument('--batch-size', type=int, default=5000, help='Number of records to index in a batch') parser.add_argument('--batch-size', type=int, default=5000, help='Number of records to index in a batch')
parser.add_argument('--batch-threads', type=int, default=2, help='Number of threads to use when indexing in batches') parser.add_argument('--batch-threads', type=int, default=2, help='Number of threads to use when indexing in batches')
# NOTE: Using --batch-threads as 4 and --batch-size as 10000 means we will process 40,000 records per-node before indexing, so 3 nodes would process 120,000 records before indexing
# Elasticsearch retry arguments
parser.add_argument('--retries', type=int, default=10, help='Number of times to retry indexing a batch before failing') parser.add_argument('--retries', type=int, default=10, help='Number of times to retry indexing a batch before failing')
parser.add_argument('--timeout', type=int, default=30, help='Number of seconds to wait before retrying a batch') parser.add_argument('--timeout', type=int, default=30, help='Number of seconds to wait before retrying a batch')
args = parser.parse_args() args = parser.parse_args()
if not os.path.exists(args.input_path): # Argument validation
raise FileNotFoundError(f'Input file {args.input_path} does not exist') if not os.path.isdir(args.input_path) and not os.path.isfile(args.input_path):
elif not os.path.isdir(args.input_path) and not os.path.isfile(args.input_path): raise FileNotFoundError(f'Input path {args.input_path} does not exist or is not a file or directory')
raise ValueError(f'Input path {args.input_path} is not a file or directory')
if not args.dry_run: if not args.dry_run:
if args.batch_size < 1: if args.batch_size < 1:
@ -328,16 +341,19 @@ def main():
edx = ElasticIndexer(args.host, args.port, args.user, args.password, args.api_key, args.index, args.dry_run, args.self_signed, args.retries, args.timeout) edx = ElasticIndexer(args.host, args.port, args.user, args.password, args.api_key, args.index, args.dry_run, args.self_signed, args.retries, args.timeout)
if not args.dry_run: if not args.dry_run:
print(edx.get_cluster_health())
time.sleep(3) # Delay to allow time for sniffing to complete time.sleep(3) # Delay to allow time for sniffing to complete
nodes = edx.get_cluster_size() nodes = edx.get_cluster_size()
logging.info(f'Connected to {nodes:,} Elasticsearch node(s)') logging.info(f'Connected to {nodes:,} Elasticsearch node(s)')
edx.create_index(args.shards, args.replicas) # Create the index if it does not exist edx.create_index(args.shards, args.replicas) # Create the index if it does not exist
chunk = { chunk = {
'size': args.batch_size, 'size': args.batch_size,
'max_size': args.batch_max * 1024 * 1024, 'max_size': args.batch_max * 1024 * 1024, # MB
'threads': args.batch_threads 'threads': args.batch_threads
} }

View File

@ -15,7 +15,6 @@ import time
try: try:
from elasticsearch import Elasticsearch, helpers from elasticsearch import Elasticsearch, helpers
import sniff_patch
except ImportError: except ImportError:
raise ImportError('Missing required \'elasticsearch\' library. (pip install elasticsearch)') raise ImportError('Missing required \'elasticsearch\' library. (pip install elasticsearch)')
@ -63,12 +62,21 @@ class ElasticIndexer:
else: else:
es_config['basic_auth'] = (es_user, es_password) es_config['basic_auth'] = (es_user, es_password)
#self.es = Elasticsearch(**es_config) # Patching the Elasticsearch client to fix a bug with sniffing (https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960)
import sniff_patch
self.es = sniff_patch.init_elasticsearch(**es_config) self.es = sniff_patch.init_elasticsearch(**es_config)
# Remove the above and uncomment the below if the bug is fixed in the Elasticsearch client:
#self.es = Elasticsearch(**es_config)
def create_index(self, shards: int = 1, replicas: int = 1): def create_index(self, shards: int = 1, replicas: int = 1):
'''Create the Elasticsearch index with the defined mapping.''' '''
Create the Elasticsearch index with the defined mapping.
:param shards: Number of shards for the index
:param replicas: Number of replicas for the index
'''
mapping = { mapping = {
'settings': { 'settings': {
@ -97,6 +105,12 @@ class ElasticIndexer:
logging.warning(f'Index \'{self.es_index}\' already exists.') logging.warning(f'Index \'{self.es_index}\' already exists.')
def get_cluster_health(self) -> dict:
'''Get the health of the Elasticsearch cluster.'''
return self.es.cluster.health()
def get_cluster_size(self) -> int: def get_cluster_size(self) -> int:
'''Get the number of nodes in the Elasticsearch cluster.''' '''Get the number of nodes in the Elasticsearch cluster.'''
@ -106,13 +120,13 @@ class ElasticIndexer:
return number_of_nodes return number_of_nodes
def process_file(self, file_path: str, watch: bool = False, chunk: dict = {}): def process_file(self, file_path: str, watch: bool = False, chunk: dict = {}):
''' '''
Read and index PTR records in batches to Elasticsearch, handling large volumes efficiently. Read and index PTR records in batches to Elasticsearch, handling large volumes efficiently.
:param file_path: Path to the PTR log file :param file_path: Path to the Masscan log file
:param batch_size: Number of records to process before indexing :param watch: If True, input file will be watched for new lines and indexed in real time\
:param chunk: Chunk configuration for indexing in batches
Example PTR record: Example PTR record:
0.6.229.47.in-addr.arpa. PTR 047-229-006-000.res.spectrum.com. 0.6.229.47.in-addr.arpa. PTR 047-229-006-000.res.spectrum.com.
@ -181,14 +195,16 @@ class ElasticIndexer:
:param documents: List of documents to index :param documents: List of documents to index
:param file_path: Path to the file being indexed :param file_path: Path to the file being indexed
:param chunk: Chunk configuration
:param count: Total number of records processed :param count: Total number of records processed
''' '''
remaining_documents = documents remaining_documents = documents
parallel_bulk_config = { parallel_bulk_config = {
'client': self.es, 'client': self.es,
'chunk_size': chunk['size'], 'chunk_size': chunk['size'],
'max_chunk_bytes': chunk['max_size'] * 1024 * 1024, # MB 'max_chunk_bytes': chunk['max_size'],
'thread_count': chunk['threads'], 'thread_count': chunk['threads'],
'queue_size': 2 'queue_size': 2
} }
@ -237,9 +253,9 @@ def main():
'''Main function when running this script directly.''' '''Main function when running this script directly.'''
parser = argparse.ArgumentParser(description='Index data into Elasticsearch.') parser = argparse.ArgumentParser(description='Index data into Elasticsearch.')
parser.add_argument('input_path', help='Path to the input file or directory')
# General arguments # General arguments
parser.add_argument('input_path', help='Path to the input file or directory') # Required
parser.add_argument('--dry-run', action='store_true', help='Dry run (do not index records to Elasticsearch)') parser.add_argument('--dry-run', action='store_true', help='Dry run (do not index records to Elasticsearch)')
parser.add_argument('--watch', action='store_true', help='Watch the input file for new lines and index them in real time') parser.add_argument('--watch', action='store_true', help='Watch the input file for new lines and index them in real time')
@ -256,22 +272,18 @@ def main():
parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index') parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index')
parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index') parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index')
# Batch arguments (for fine tuning performance) # Performance arguments
parser.add_argument('--batch-max', type=int, default=10, help='Maximum size in MB of a batch') parser.add_argument('--batch-max', type=int, default=10, help='Maximum size in MB of a batch')
parser.add_argument('--batch-size', type=int, default=5000, help='Number of records to index in a batch') parser.add_argument('--batch-size', type=int, default=5000, help='Number of records to index in a batch')
parser.add_argument('--batch-threads', type=int, default=2, help='Number of threads to use when indexing in batches') parser.add_argument('--batch-threads', type=int, default=2, help='Number of threads to use when indexing in batches')
# NOTE: Using --batch-threads as 4 and --batch-size as 10000 means we will process 40,000 records per-node before indexing, so 3 nodes would process 120,000 records before indexing
# Elasticsearch retry arguments
parser.add_argument('--retries', type=int, default=10, help='Number of times to retry indexing a batch before failing') parser.add_argument('--retries', type=int, default=10, help='Number of times to retry indexing a batch before failing')
parser.add_argument('--timeout', type=int, default=30, help='Number of seconds to wait before retrying a batch') parser.add_argument('--timeout', type=int, default=30, help='Number of seconds to wait before retrying a batch')
args = parser.parse_args() args = parser.parse_args()
if not os.path.exists(args.input_path): # Argument validation
raise FileNotFoundError(f'Input file {args.input_path} does not exist') if not os.path.isdir(args.input_path) and not os.path.isfile(args.input_path):
elif not os.path.isdir(args.input_path) and not os.path.isfile(args.input_path): raise FileNotFoundError(f'Input path {args.input_path} does not exist or is not a file or directory')
raise ValueError(f'Input path {args.input_path} is not a file or directory')
if not args.dry_run: if not args.dry_run:
if args.batch_size < 1: if args.batch_size < 1:
@ -304,16 +316,19 @@ def main():
edx = ElasticIndexer(args.host, args.port, args.user, args.password, args.api_key, args.index, args.dry_run, args.self_signed, args.retries, args.timeout) edx = ElasticIndexer(args.host, args.port, args.user, args.password, args.api_key, args.index, args.dry_run, args.self_signed, args.retries, args.timeout)
if not args.dry_run: if not args.dry_run:
time.sleep(5) # Delay to allow time for sniffing to complete print(edx.get_cluster_health())
time.sleep(3) # Delay to allow time for sniffing to complete
nodes = edx.get_cluster_size() nodes = edx.get_cluster_size()
logging.info(f'Connected to {nodes:,} Elasticsearch node(s)') logging.info(f'Connected to {nodes:,} Elasticsearch node(s)')
edx.create_index(args.shards, args.replicas) # Create the index if it does not exist edx.create_index(args.shards, args.replicas) # Create the index if it does not exist
chunk = { chunk = {
'size': args.batch_size, 'size': args.batch_size,
'max_size': args.batch_max * 1024 * 1024, 'max_size': args.batch_max * 1024 * 1024, # MB
'threads': args.batch_threads 'threads': args.batch_threads
} }

View File

@ -39,11 +39,11 @@ record_types = ('a','aaaa','caa','cdnskey','cds','cname','dnskey','ds','mx','nap
class ElasticIndexer: class ElasticIndexer:
def __init__(self, es_host: str, es_port: str, es_user: str, es_password: str, es_api_key: str, es_index: str, dry_run: bool = False, self_signed: bool = False): def __init__(self, es_host: str, es_port: int, es_user: str, es_password: str, es_api_key: str, es_index: str, dry_run: bool = False, self_signed: bool = False, retries: int = 10, timeout: int = 30):
''' '''
Initialize the Elastic Search indexer. Initialize the Elastic Search indexer.
:param es_host: Elasticsearch host :param es_host: Elasticsearch host(s)
:param es_port: Elasticsearch port :param es_port: Elasticsearch port
:param es_user: Elasticsearch username :param es_user: Elasticsearch username
:param es_password: Elasticsearch password :param es_password: Elasticsearch password
@ -51,6 +51,8 @@ class ElasticIndexer:
:param es_index: Elasticsearch index name :param es_index: Elasticsearch index name
:param dry_run: If True, do not initialize Elasticsearch client :param dry_run: If True, do not initialize Elasticsearch client
:param self_signed: If True, do not verify SSL certificates :param self_signed: If True, do not verify SSL certificates
:param retries: Number of times to retry indexing a batch before failing
:param timeout: Number of seconds to wait before retrying a batch
''' '''
self.dry_run = dry_run self.dry_run = dry_run
@ -58,15 +60,38 @@ class ElasticIndexer:
self.es_index = es_index self.es_index = es_index
if not dry_run: if not dry_run:
es_config = {
'hosts': [f'{es_host}:{es_port}'],
'verify_certs': self_signed,
'ssl_show_warn': self_signed,
'request_timeout': timeout,
'max_retries': retries,
'retry_on_timeout': True,
'sniff_on_start': False,
'sniff_on_node_failure': True,
'min_delay_between_sniffing': 60
}
if es_api_key: if es_api_key:
self.es = Elasticsearch([f'{es_host}:{es_port}'], headers={'Authorization': f'ApiKey {es_api_key}'}, verify_certs=self_signed, ssl_show_warn=self_signed) es_config['headers'] = {'Authorization': f'ApiKey {es_api_key}'}
else: else:
self.es = Elasticsearch([f'{es_host}:{es_port}'], basic_auth=(es_user, es_password), verify_certs=self_signed, ssl_show_warn=self_signed) es_config['basic_auth'] = (es_user, es_password)
# Patching the Elasticsearch client to fix a bug with sniffing (https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960)
import sniff_patch
self.es = sniff_patch.init_elasticsearch(**es_config)
# Remove the above and uncomment the below if the bug is fixed in the Elasticsearch client:
#self.es = Elasticsearch(**es_config)
def create_index(self, shards: int = 1, replicas: int = 1): def create_index(self, shards: int = 1, replicas: int = 1):
'''Create the Elasticsearch index with the defined mapping.''' '''
Create the Elasticsearch index with the defined mapping.
:param shards: Number of shards for the index
:param replicas: Number of replicas for the index
'''
mapping = { mapping = {
'settings': { 'settings': {
@ -111,12 +136,28 @@ class ElasticIndexer:
logging.warning(f'Index \'{self.es_index}\' already exists.') logging.warning(f'Index \'{self.es_index}\' already exists.')
def process_file(self, file_path: str, batch_size: int): def get_cluster_health(self) -> dict:
'''Get the health of the Elasticsearch cluster.'''
return self.es.cluster.health()
def get_cluster_size(self) -> int:
'''Get the number of nodes in the Elasticsearch cluster.'''
cluster_stats = self.es.cluster.stats()
number_of_nodes = cluster_stats['nodes']['count']['total']
return number_of_nodes
def process_file(self, file_path: str, watch: bool = False, chunk: dict = {}):
''' '''
Read and index DNS records in batches to Elasticsearch, handling large volumes efficiently. Read and index DNS records in batches to Elasticsearch, handling large volumes efficiently.
    :param file_path: Path to the DNS zone file :param file_path: Path to the DNS zone file
    :param batch_size: Number of records to process before indexing :param watch: If True, input file will be watched for new lines and indexed in real time
:param chunk: Chunk configuration for indexing in batches
Example record: Example record:
0so9l9nrl425q3tf7dkv1nmv2r3is6vm.vegas. 3600 in nsec3 1 1 100 332539EE7F95C32A 10MHUKG4FHIAVEFDOTF6NKU5KFCB2J3A NS DS RRSIG 0so9l9nrl425q3tf7dkv1nmv2r3is6vm.vegas. 3600 in nsec3 1 1 100 332539EE7F95C32A 10MHUKG4FHIAVEFDOTF6NKU5KFCB2J3A NS DS RRSIG
@ -192,17 +233,9 @@ class ElasticIndexer:
struct = {'_index': self.es_index, '_source': source} struct = {'_index': self.es_index, '_source': source}
records.append(struct) records.append(struct)
count += 1 count += 1
if len(records) >= batch_size: if len(records) >= chunk['batch']:
while True: self.bulk_index(records, file_path, chunk, count)
try: records = []
success, _ = helpers.bulk(self.es, records)
except (ConnectionError, TransportError) as e:
logging.error(f'Failed to index records to Elasticsearch. ({e})')
time.sleep(60)
else:
logging.info(f'Successfully indexed {success:,} ({count:,}) records to {self.es_index} from {file_path}')
records = []
break
last_domain = domain last_domain = domain
@ -214,28 +247,60 @@ class ElasticIndexer:
domain_records[domain][record_type].append({'ttl': ttl, 'data': data}) domain_records[domain][record_type].append({'ttl': ttl, 'data': data})
if records: if records:
while True: self.bulk_index(records, file_path, chunk, count)
try:
success, _ = helpers.bulk(self.es, records)
except (ConnectionError, TransportError) as e: def bulk_index(self, documents: list, file_path: str, chunk: dict, count: int):
logging.error(f'Failed to index records to Elasticsearch. ({e})') '''
time.sleep(60) Index a batch of documents to Elasticsearch.
else:
logging.info(f'Successfully indexed {success:,} ({count:,}) records to {self.es_index} from {file_path}') :param documents: List of documents to index
:param file_path: Path to the file being indexed
:param count: Total number of records processed
'''
remaining_documents = documents
parallel_bulk_config = {
'client': self.es,
'chunk_size': chunk['size'],
'max_chunk_bytes': chunk['max_size'] * 1024 * 1024, # MB
'thread_count': chunk['threads'],
'queue_size': 2
}
while remaining_documents:
failed_documents = []
try:
for success, response in helpers.parallel_bulk(actions=remaining_documents, **parallel_bulk_config):
if not success:
failed_documents.append(response)
if not failed_documents:
ingested = parallel_bulk_config['chunk_size'] * parallel_bulk_config['thread_count']
logging.info(f'Successfully indexed {ingested:,} ({count:,} processed) records to {self.es_index} from {file_path}')
break break
else:
logging.warning(f'Failed to index {len(failed_documents):,} failed documents! Retrying...')
remaining_documents = failed_documents
except Exception as e:
logging.error(f'Failed to index documents! ({e})')
time.sleep(30)
def main(): def main():
'''Main function when running this script directly.''' '''Main function when running this script directly.'''
parser = argparse.ArgumentParser(description='Index data into Elasticsearch.') parser = argparse.ArgumentParser(description='Index data into Elasticsearch.')
parser.add_argument('input_path', help='Path to the input file or directory')
# General arguments # General arguments
parser.add_argument('input_path', help='Path to the input file or directory') # Required
parser.add_argument('--dry-run', action='store_true', help='Dry run (do not index records to Elasticsearch)') parser.add_argument('--dry-run', action='store_true', help='Dry run (do not index records to Elasticsearch)')
parser.add_argument('--batch_size', type=int, default=50000, help='Number of records to index in a batch') parser.add_argument('--watch', action='store_true', help='Watch the input file for new lines and index them in real time')
# Elasticsearch connection arguments # Elasticsearch arguments
parser.add_argument('--host', default='localhost', help='Elasticsearch host') parser.add_argument('--host', default='localhost', help='Elasticsearch host')
parser.add_argument('--port', type=int, default=9200, help='Elasticsearch port') parser.add_argument('--port', type=int, default=9200, help='Elasticsearch port')
parser.add_argument('--user', default='elastic', help='Elasticsearch username') parser.add_argument('--user', default='elastic', help='Elasticsearch username')
@ -245,51 +310,90 @@ def main():
# Elasticsearch indexing arguments # Elasticsearch indexing arguments
parser.add_argument('--index', default='dns-zones', help='Elasticsearch index name') parser.add_argument('--index', default='dns-zones', help='Elasticsearch index name')
parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index') # This depends on your cluster configuration parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index')
parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index') # This depends on your cluster configuration parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index')
# Performance arguments
parser.add_argument('--batch-max', type=int, default=10, help='Maximum size in MB of a batch')
parser.add_argument('--batch-size', type=int, default=5000, help='Number of records to index in a batch')
parser.add_argument('--batch-threads', type=int, default=2, help='Number of threads to use when indexing in batches')
parser.add_argument('--retries', type=int, default=10, help='Number of times to retry indexing a batch before failing')
parser.add_argument('--timeout', type=int, default=30, help='Number of seconds to wait before retrying a batch')
args = parser.parse_args() args = parser.parse_args()
if not os.path.exists(args.input_path): # Argument validation
raise FileNotFoundError(f'Input file {args.input_path} does not exist') if not os.path.isdir(args.input_path) and not os.path.isfile(args.input_path):
raise FileNotFoundError(f'Input path {args.input_path} does not exist or is not a file or directory')
if not args.dry_run: if not args.dry_run:
if args.batch_size < 1: if args.batch_size < 1:
raise ValueError('Batch size must be greater than 0') raise ValueError('Batch size must be greater than 0')
if not args.host: elif args.retries < 1:
raise ValueError('Number of retries must be greater than 0')
elif args.timeout < 5:
raise ValueError('Timeout must be greater than 4')
elif args.batch_max < 1:
raise ValueError('Batch max size must be greater than 0')
elif args.batch_threads < 1:
raise ValueError('Batch threads must be greater than 0')
elif not args.host:
raise ValueError('Missing required Elasticsearch argument: host') raise ValueError('Missing required Elasticsearch argument: host')
if not args.api_key and (not args.user or not args.password): elif not args.api_key and (not args.user or not args.password):
raise ValueError('Missing required Elasticsearch argument: either user and password or apikey') raise ValueError('Missing required Elasticsearch argument: either user and password or apikey')
if args.shards < 1: elif args.shards < 1:
raise ValueError('Number of shards must be greater than 0') raise ValueError('Number of shards must be greater than 0')
if args.replicas < 1: elif args.replicas < 0:
    raise ValueError('Number of replicas must be greater than 0') raise ValueError('Number of replicas must be non-negative')
logging.info(f'Connecting to Elasticsearch at {args.host}:{args.port}...') edx = ElasticIndexer(args.host, args.port, args.user, args.password, args.api_key, args.index, args.dry_run, args.self_signed, args.retries, args.timeout)
edx = ElasticIndexer(args.host, args.port, args.user, args.password, args.api_key, args.index, args.dry_run, args.self_signed)
if not args.dry_run: if not args.dry_run:
print(edx.get_cluster_health())
time.sleep(3) # Delay to allow time for sniffing to complete
nodes = edx.get_cluster_size()
logging.info(f'Connected to {nodes:,} Elasticsearch node(s)')
edx.create_index(args.shards, args.replicas) # Create the index if it does not exist edx.create_index(args.shards, args.replicas) # Create the index if it does not exist
chunk = {
'size': args.batch_size,
'max_size': args.batch_max * 1024 * 1024, # MB
'threads': args.batch_threads
}
chunk['batch'] = nodes * (chunk['size'] * chunk['threads'])
else:
chunk = {} # Ugly hack to get this working...
if os.path.isfile(args.input_path): if os.path.isfile(args.input_path):
logging.info(f'Processing file: {args.input_path}') logging.info(f'Processing file: {args.input_path}')
edx.process_file(args.input_path, args.batch_size) edx.process_file(args.input_path, args.watch, chunk)
elif os.path.isdir(args.input_path): elif os.path.isdir(args.input_path):
logging.info(f'Processing files in directory: {args.input_path}') count = 1
total = len(os.listdir(args.input_path))
logging.info(f'Processing {total:,} files in directory: {args.input_path}')
for file in sorted(os.listdir(args.input_path)): for file in sorted(os.listdir(args.input_path)):
file_path = os.path.join(args.input_path, file) file_path = os.path.join(args.input_path, file)
if os.path.isfile(file_path): if os.path.isfile(file_path):
logging.info(f'Processing file: {file_path}') logging.info(f'[{count:,}/{total:,}] Processing file: {file_path}')
edx.process_file(file_path, args.batch_size) edx.process_file(file_path, args.watch, chunk)
count += 1
else:
logging.warning(f'[{count:,}/{total:,}] Skipping non-file: {file_path}')
else:
raise ValueError(f'Input path {args.input_path} is not a file or directory')
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -1,26 +1,41 @@
# sniff_* patch for elastic 8 clients #!/usr/bin/env python
# Call init_elasticsearch() with normal Elasticsearch params # Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
# Only needed if you use sniff_* options and only works with basic auth, feel free to edit to your needs.
# Note:
# This is a patch for the elasticsearch 8.x client to fix the sniff_* options.
# This patch is only needed if you use the sniff_* options and only works with basic auth.
# Call init_elasticsearch() with normal Elasticsearch params.
#
# Source:
# - https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960
import base64 import base64
import elasticsearch._sync.client as client import elasticsearch._sync.client as client
from elasticsearch.exceptions import SerializationError, ConnectionError from elasticsearch.exceptions import SerializationError, ConnectionError
def init_elasticsearch(*args, **kwargs):
    '''
    Initialize the Elasticsearch client with the sniff patch applied.

    The patch only supports basic authentication. When no 'basic_auth' is
    supplied (e.g. API-key auth via headers), fall back to the stock client
    instead of raising a KeyError, since the sniff callback cannot add an
    auth header it does not have.

    :param args: Elasticsearch positional arguments.
    :param kwargs: Elasticsearch keyword arguments.
    '''
    basic_auth = kwargs.get('basic_auth')

    if basic_auth:
        client.default_sniff_callback = _override_sniff_callback(basic_auth)

    return client.Elasticsearch(*args, **kwargs)
def _override_sniff_callback(basic_auth): def _override_sniff_callback(basic_auth):
""" '''
Taken from https://github.com/elastic/elasticsearch-py/blob/8.8/elasticsearch/_sync/client/_base.py#L166 Taken from https://github.com/elastic/elasticsearch-py/blob/8.8/elasticsearch/_sync/client/_base.py#L166
Completely unmodified except for adding the auth header to the elastic request. Completely unmodified except for adding the auth header to the elastic request.
Allows us to continue using the sniff_* options while this is broken in the library. Allows us to continue using the sniff_* options while this is broken in the library.
TODO: Remove this when this issue is patched: TODO: Remove this when this issue is patched:
https://github.com/elastic/elasticsearch-py/issues/2005 - https://github.com/elastic/elasticsearch-py/issues/2005
""" '''
auth_str = base64.b64encode(':'.join(basic_auth).encode()).decode() auth_str = base64.b64encode(':'.join(basic_auth).encode()).decode()
sniffed_node_callback = client._base._default_sniffed_node_callback sniffed_node_callback = client._base._default_sniffed_node_callback
@ -28,14 +43,13 @@ def _override_sniff_callback(basic_auth):
for _ in transport.node_pool.all(): for _ in transport.node_pool.all():
try: try:
meta, node_infos = transport.perform_request( meta, node_infos = transport.perform_request(
"GET", 'GET',
"/_nodes/_all/http", '/_nodes/_all/http',
headers={ headers = {
"accept": "application/vnd.elasticsearch+json; compatible-with=8", 'accept': 'application/vnd.elasticsearch+json; compatible-with=8',
# This auth header is missing in 8.x releases of the client, and causes 401s 'authorization': f'Basic {auth_str}' # This auth header is missing in 8.x releases of the client, and causes 401s
"authorization": f"Basic {auth_str}"
}, },
request_timeout=( request_timeout = (
sniff_options.sniff_timeout sniff_options.sniff_timeout
if not sniff_options.is_initial_sniff if not sniff_options.is_initial_sniff
else None else None
@ -48,19 +62,19 @@ def _override_sniff_callback(basic_auth):
continue continue
node_configs = [] node_configs = []
for node_info in node_infos.get("nodes", {}).values(): for node_info in node_infos.get('nodes', {}).values():
address = node_info.get("http", {}).get("publish_address") address = node_info.get('http', {}).get('publish_address')
if not address or ":" not in address: if not address or ':' not in address:
continue continue
if "/" in address: if '/' in address:
# Support 7.x host/ip:port behavior where http.publish_host has been set. # Support 7.x host/ip:port behavior where http.publish_host has been set.
fqdn, ipaddress = address.split("/", 1) fqdn, ipaddress = address.split('/', 1)
host = fqdn host = fqdn
_, port_str = ipaddress.rsplit(":", 1) _, port_str = ipaddress.rsplit(':', 1)
port = int(port_str) port = int(port_str)
else: else:
host, port_str = address.rsplit(":", 1) host, port_str = address.rsplit(':', 1)
port = int(port_str) port = int(port_str)
assert sniffed_node_callback is not None assert sniffed_node_callback is not None