#!/usr/bin/env python
# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
# eris.py
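#
# Example invocation (illustrative only — the host, credentials and input path
# below are placeholders, not values shipped with ERIS):
#
#   python eris.py --httpx --host https://localhost --user elastic --password changeme httpx_output.json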

import asyncio
import argparse
import logging
import logging.handlers
import os
import stat
import sys
import json

sys.dont_write_bytecode = True # FUCKOFF __pycache__

try:
    from elasticsearch import AsyncElasticsearch
    from elasticsearch.exceptions import NotFoundError
    from elasticsearch.helpers import async_streaming_bulk
except ImportError:
    raise ImportError('Missing required \'elasticsearch\' library. (pip install elasticsearch)')

class ElasticIndexer:
    def __init__(self, args: argparse.Namespace):
        '''
        Initialize the Elasticsearch indexer.

        :param args: Parsed arguments from argparse
        '''

        self.chunk_max  = args.chunk_max # Maximum chunk size in bytes (matches the --chunk-max argument)
        self.chunk_size = args.chunk_size
        self.es_index   = args.index

        # Sniffing disabled due to an issue with the elasticsearch 8.x client (https://github.com/elastic/elasticsearch-py/issues/2005)
        es_config = {
            'hosts'                : [f'{args.host}:{args.port}'],
            #'hosts'               : [f'{args.host}:{port}' for port in ('9200',)], # Temporary alternative to sniffing
            'verify_certs'         : args.self_signed,
            'ssl_show_warn'        : args.self_signed,
            'request_timeout'      : args.timeout,
            'max_retries'          : args.retries,
            'retry_on_timeout'     : True,
            'http_compress'        : True,
            'connections_per_node' : 3 # Experiment with this value
            #'sniff_on_start': True,
            #'sniff_on_node_failure': True,
            #'min_delay_between_sniffing': 60
        }

        if args.api_key:
            es_config['api_key'] = (args.api_key, '') # Verify this is correct (the 8.x client also accepts a single base64-encoded API key string)
        else:
            es_config['basic_auth'] = (args.user, args.password)

        self.es = AsyncElasticsearch(**es_config)


    async def close_connect(self):
        '''Close the Elasticsearch connection.'''

        await self.es.close()

    async def create_index(self, map_body: dict, pipeline: str = None, replicas: int = 1, shards: int = 1):
        '''
        Create the Elasticsearch index with the defined mapping.

        :param map_body: Mapping for the index
        :param pipeline: Name of the ingest pipeline to use for the index
        :param replicas: Number of replicas for the index
        :param shards:   Number of shards for the index
        '''

        if await self.es.indices.exists(index=self.es_index):
            logging.info(f'Index \'{self.es_index}\' already exists.')
            return

        mapping = map_body

        mapping['settings'] = {
            'number_of_shards'   : shards,
            'number_of_replicas' : replicas
        }

        if pipeline:
            try:
                await self.es.ingest.get_pipeline(id=pipeline)
                logging.info(f'Using ingest pipeline \'{pipeline}\' for index \'{self.es_index}\'')
                mapping['settings']['index.default_pipeline'] = pipeline
            except NotFoundError:
                raise ValueError(f'Ingest pipeline \'{pipeline}\' does not exist.')

        response = await self.es.indices.create(index=self.es_index, body=mapping)

        if response.get('acknowledged') and response.get('shards_acknowledged'):
            logging.info(f'Index \'{self.es_index}\' successfully created.')
        else:
            raise Exception(f'Failed to create index. ({response})')
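
    # A minimal sketch of the map_body shape create_index() expects — in
    # practice each ingestor builds this via its construct_map() helper, and
    # the 'settings' key is added automatically above. The 'domain' field and
    # 'indexer' variable here are purely hypothetical:
    #
    #   map_body = {'mappings': {'properties': {'domain': {'type': 'keyword'}}}}
    #   await indexer.create_index(map_body)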

    async def process_data(self, file_path: str, data_generator: callable):
        '''
        Index records in chunks to Elasticsearch.

        :param file_path:      Path to the file
        :param data_generator: Generator for the records to index
        '''

        count  = 0
        total  = 0
        errors = []

        try:
            async for ok, result in async_streaming_bulk(self.es, actions=data_generator(file_path), chunk_size=self.chunk_size, max_chunk_bytes=self.chunk_max, raise_on_error=False):
                action, result = result.popitem()

                if not ok:
                    error_type   = result.get('error', {}).get('type', 'unknown')
                    error_reason = result.get('error', {}).get('reason', 'unknown')
                    logging.error('FAILED DOCUMENT:')
                    logging.error(f'Error Type   : {error_type}')
                    logging.error(f'Error Reason : {error_reason}')
                    logging.error('Document     :')
                    logging.error(json.dumps(result, indent=2))
                    input('Press Enter to continue...') # Pause so the failed document can be inspected manually
                    errors.append(result)
                    continue

                count += 1
                total += 1

                if count == self.chunk_size:
                    logging.info(f'Successfully indexed {self.chunk_size:,} ({total:,} processed) records to {self.es_index} from {file_path}')
                    count = 0

            if errors:
                raise Exception(f'{len(errors):,} document(s) failed to index. Check the logs above for details.')

        except Exception as e:
            raise Exception(f'Failed to index records to {self.es_index} from {file_path} ({e})')
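
    # A minimal sketch of the async generator interface process_data() expects.
    # The real generators live in the ingestors package; this hypothetical
    # example only illustrates the bulk-action shape async_streaming_bulk consumes:
    #
    #   async def example_generator(file_path: str):
    #       with open(file_path) as handle:
    #           for line in handle:
    #               yield {'_index': 'my-index', '_source': json.loads(line)}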


def setup_logger(console_level: int = logging.INFO, file_level: int = None, log_file: str = 'debug.json', max_file_size: int = 10 * 1024 * 1024, backups: int = 5, ecs_format: bool = False):
    '''
    Setup the global logger for the application.

    :param console_level: Minimum level to capture logs to the console.
    :param file_level:    Minimum level to capture logs to the file.
    :param log_file:      File to write logs to.
    :param max_file_size: Maximum size of the log file before it is rotated.
    :param backups:       Number of backup log files to keep.
    :param ecs_format:    Use the Elastic Common Schema (ECS) format for logs.
    '''

    # Configure the root logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG) # Minimum level to capture all logs

    # Clear existing handlers
    logger.handlers = []

    # Setup console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(console_level)
    console_formatter = logging.Formatter('%(asctime)s | %(levelname)9s | %(message)s', '%I:%M:%S')
    console_handler.setFormatter(console_formatter)
    logger.addHandler(console_handler)

    # Setup rotating file handler if file logging is enabled
    if file_level is not None:
        file_handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=max_file_size, backupCount=backups)
        file_handler.setLevel(file_level)

        # Use the ECS formatter if enabled, otherwise the default format
        if ecs_format:
            try:
                from ecs_logging import StdlibFormatter
            except ImportError:
                raise ImportError('Missing required \'ecs-logging\' library. (pip install ecs-logging)')
            file_formatter = StdlibFormatter() # ECS formatter
        else:
            file_formatter = logging.Formatter('%(asctime)s | %(levelname)9s | %(message)s', '%Y-%m-%d %H:%M:%S')

        file_handler.setFormatter(file_formatter)
        logger.addHandler(file_handler)
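
# Example: keep the default INFO console output, but also write DEBUG and above
# to a rotating ECS-formatted file (this mirrors what the --log/--ecs flags do):
#
#   setup_logger(file_level=logging.DEBUG, log_file='eris.log', ecs_format=True)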


async def main():
    '''Main function when running this script directly.'''

    parser = argparse.ArgumentParser(description='Elasticsearch Recon Ingestion Scripts (ERIS)')

    # General arguments
    parser.add_argument('input_path', help='Path to the input file or directory') # Required
    parser.add_argument('--watch', action='store_true', help='Create or watch a FIFO for real-time indexing')
    parser.add_argument('--log', choices=['debug', 'info', 'warning', 'error', 'critical'], help='Logging file level (default: disabled)')
    parser.add_argument('--ecs', action='store_true', default=False, help='Use the Elastic Common Schema (ECS) for logging')

    # Elasticsearch arguments
    parser.add_argument('--host', default='http://localhost', help='Elasticsearch host')
    parser.add_argument('--port', type=int, default=9200, help='Elasticsearch port')
    parser.add_argument('--user', default='elastic', help='Elasticsearch username')
    parser.add_argument('--password', default=os.getenv('ES_PASSWORD'), help='Elasticsearch password (if not provided, checks the ES_PASSWORD environment variable)')
    parser.add_argument('--api-key', default=os.getenv('ES_APIKEY'), help='Elasticsearch API key for authentication (if not provided, checks the ES_APIKEY environment variable)')
    parser.add_argument('--self-signed', action='store_false', help='Elasticsearch is using self-signed certificates')

    # Elasticsearch indexing arguments
    parser.add_argument('--index', help='Elasticsearch index name')
    parser.add_argument('--pipeline', help='Use an ingest pipeline for the index')
    parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index')
    parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index')

    # Performance arguments
    parser.add_argument('--chunk-size', type=int, default=5000, help='Number of records to index in a chunk')
    parser.add_argument('--chunk-max', type=int, default=10485760, help='Maximum size of a chunk in bytes (default: 10MB)')
    parser.add_argument('--retries', type=int, default=30, help='Number of times to retry indexing a chunk before failing')
    parser.add_argument('--timeout', type=int, default=60, help='Number of seconds to wait before retrying a chunk')

    # Ingestion arguments
    parser.add_argument('--certstream', action='store_true', help='Index Certstream records')
    parser.add_argument('--httpx', action='store_true', help='Index Httpx records')
    parser.add_argument('--masscan', action='store_true', help='Index Masscan records')
    parser.add_argument('--massdns', action='store_true', help='Index Massdns records')
    parser.add_argument('--zone', action='store_true', help='Index Zone records')
    parser.add_argument('--rir-delegations', action='store_true', help='Index RIR Delegations records')
    parser.add_argument('--rir-transfers', action='store_true', help='Index RIR Transfers records')

    args = parser.parse_args()

    if args.log:
        levels = {'debug': logging.DEBUG, 'info': logging.INFO, 'warning': logging.WARNING, 'error': logging.ERROR, 'critical': logging.CRITICAL}
        setup_logger(file_level=levels[args.log], log_file='eris.log', ecs_format=args.ecs)
    else:
        setup_logger()

    if args.host.endswith('/'):
        args.host = args.host[:-1]

    if args.watch:
        if not os.path.exists(args.input_path):
            os.mkfifo(args.input_path)
        elif not stat.S_ISFIFO(os.stat(args.input_path).st_mode):
            raise ValueError(f'Path {args.input_path} is not a FIFO')
    elif not os.path.isdir(args.input_path) and not os.path.isfile(args.input_path):
        raise FileNotFoundError(f'Input path {args.input_path} does not exist or is not a file or directory')

    logging.info(f'Connecting to Elasticsearch at {args.host}:{args.port}')

    edx = ElasticIndexer(args)

    if args.certstream:
        from ingestors import ingest_certstream as ingestor
    elif args.httpx:
        from ingestors import ingest_httpx as ingestor
    elif args.masscan:
        from ingestors import ingest_masscan as ingestor
    elif args.massdns:
        from ingestors import ingest_massdns as ingestor
    elif args.rir_delegations:
        from ingestors import ingest_rir_delegations as ingestor
    elif args.rir_transfers:
        from ingestors import ingest_rir_transfers as ingestor
    elif args.zone:
        from ingestors import ingest_zone as ingestor
    else:
        raise ValueError('No ingestor specified')

    health = await edx.es.cluster.health()
    logging.info(health)

    #await asyncio.sleep(5) # Delay to allow time for sniffing to complete (Sniffer temporarily disabled)

    if not edx.es_index:
        edx.es_index = ingestor.default_index

    map_body = ingestor.construct_map()
    await edx.create_index(map_body, args.pipeline, args.replicas, args.shards)

    if os.path.isfile(args.input_path):
        logging.info(f'Processing file: {args.input_path}')
        await edx.process_data(args.input_path, ingestor.process_data)

    elif stat.S_ISFIFO(os.stat(args.input_path).st_mode):
        logging.info(f'Watching FIFO: {args.input_path}')
        await edx.process_data(args.input_path, ingestor.process_data)

    elif os.path.isdir(args.input_path):
        count = 1
        total = len(os.listdir(args.input_path))
        logging.info(f'Processing {total:,} files in directory: {args.input_path}')

        for file in sorted(os.listdir(args.input_path)):
            file_path = os.path.join(args.input_path, file)
            if os.path.isfile(file_path):
                logging.info(f'[{count:,}/{total:,}] Processing file: {file_path}')
                await edx.process_data(file_path, ingestor.process_data)
                count += 1
            else:
                logging.warning(f'[{count:,}/{total:,}] Skipping non-file: {file_path}')

    await edx.close_connect() # Close the Elasticsearch connection to stop "Unclosed client session" warnings
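
# A minimal sketch of the module interface main() expects from each ingestor
# (default_index, construct_map and process_data). The index and field names
# below are hypothetical, not taken from the real ingestors package:
#
#   default_index = 'eris-example'
#
#   def construct_map() -> dict:
#       return {'mappings': {'properties': {'record': {'type': 'keyword'}}}}
#
#   async def process_data(file_path: str):
#       with open(file_path) as handle:
#           for line in handle:
#               yield {'_index': default_index, '_source': {'record': line.strip()}}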


if __name__ == '__main__':
    print('')
    print('┏┓┳┓┳┏┓   Elasticsearch Recon Ingestion Scripts')
    print('┣ ┣┫┃┗┓   Developed by Acidvegas in Python')
    print('┗┛┛┗┻┗┛   https://git.acid.vegas/eris')
    print('')

    asyncio.run(main())