Removed _id from certstream yield and renamed ingest_certs.py to ingest_certstream.py

2024-03-11 22:46:48 -04:00 · 2024-03-11 22:46:48 -04:00 · de3878ef6b
commit de3878ef6b
parent 87f2cf27ea
4 changed files with 229 additions and 202 deletions
--- a/README.md
+++ b/README.md
@ -25,8 +25,8 @@ python eris.py [options] <input>
 ###### Elasticsearch arguments
 | Argument        | Description                                             | Default            |
-|-----------------|---------------------------------------------------------|---------------------|
+|-----------------|---------------------------------------------------------|--------------------|
-| `--host`        | Elasticsearch host                                      | `http://localhost/` |
+| `--host`        | Elasticsearch host                                      | `http://localhost` |
 | `--port`        | Elasticsearch port                                      | `9200`             |
 | `--user`        | Elasticsearch username                                  | `elastic`          |
 | `--password`    | Elasticsearch password                                  | `$ES_PASSWORD`     |
@ -51,19 +51,22 @@ python eris.py [options] <input>
 ###### Ingestion arguments
 | Argument      | Description              |
-|-------------|--------------------------|
+|---------------|--------------------------|
-| `--certs`   | Index Certstream records |
+| `--certstream` | Index Certstream records |
 | `--httpx`     | Index HTTPX records      |
 | `--masscan`   | Index Masscan records    |
 | `--massdns`   | Index massdns records    |
 | `--zone`      | Index zone DNS records   |
-This ingestion suite will use the built in node sniffer, so by connecting to a single node, you can load balance across the entire cluster.
+~~This ingestion suite will use the built in node sniffer, so by connecting to a single node, you can load balance across the entire cluster.~~
-It is good to know how much nodes you have in the cluster to determine how to fine tune the arguments for the best performance, based on your environment.
+
 **Note:** The sniffer has been disabled for now due to an [issue](https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960) with the 8.x elasticsearch client. The auth headers are not properly sent when enabling the sniffer. A working [patch](https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960) was shared and has been *mostly* converted in [helpers/sniff_patch.py](./helpers/sniff_patch.py) for the async client.
 ## Roadmap
 - Create a module for RIR database ingestion *(WHOIS, delegations, transfer, ASN mapping, peering, etc)*
 - Dynamically update the batch metrics when the sniffer adds or removes nodes.
 - Fix issue with leftover FIFO files *(catch SIGTERM / SIGINT signals)*
 - Create a working patch for the async client to properly send auth headers.
 ___
--- a/eris.py
+++ b/eris.py
@ -12,16 +12,12 @@ import sys
 sys.dont_write_bytecode = True
 try:
-    # This is commented out because there is a bug with the elasticsearch library that requires a patch (see initialize() method below)
+	from elasticsearch            import AsyncElasticsearch
    #from elasticsearch import AsyncElasticsearch
 	from elasticsearch.exceptions import NotFoundError
 	from elasticsearch.helpers    import async_streaming_bulk
 except ImportError:
 	raise ImportError('Missing required \'elasticsearch\' library. (pip install elasticsearch)')
 # Setting up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d %I:%M:%S')
 class ElasticIndexer:
 	def __init__(self, args: argparse.Namespace):
@ -33,39 +29,31 @@ class ElasticIndexer:
 		self.chunk_max  = args.chunk_max * 1024 * 1024 # MB
 		self.chunk_size = args.chunk_size
        self.es = None
 		self.es_index   = args.index
-        self.es_config = {
+		# Sniffing disabled due to an issue with the elasticsearch 8.x client (https://github.com/elastic/elasticsearch-py/issues/2005)
-            'hosts': [f'{args.host}:{args.port}'],
+		es_config = {
-            'verify_certs': args.self_signed,
+			#'hosts'           : [f'{args.host}:{args.port}'],
-            'ssl_show_warn': args.self_signed,
+			'hosts'            : [f'{args.host}:{port}' for port in ('9200', '9201', '9202')], # Temporary alternative to sniffing
-            'request_timeout': args.timeout,
+			'verify_certs'     : args.self_signed,
-            'max_retries': args.retries,
+			'ssl_show_warn'    : args.self_signed,
-            'retry_on_timeout': True,
+			'request_timeout'  : args.timeout,
-            'sniff_on_start': False, # Problems when True....
+			'max_retries'      : args.retries,
-            'sniff_on_node_failure': True,
+			'retry_on_timeout' : True
-            'min_delay_between_sniffing': 60 # Add config option for this?
+			#'sniff_on_start': True,
 			#'sniff_on_node_failure': True,
 			#'min_delay_between_sniffing': 60
 		}
-        #if args.api_key:
+		if args.api_key:
-        #    self.es_config['api_key'] = (args.api_key, '') # Verify this is correct
+			es_config['api_key'] = (args.api_key, '') # Verify this is correct
-        #else:
+		else:
-        self.es_config['basic_auth'] = (args.user, args.password)
+			es_config['basic_auth'] = (args.user, args.password)
 		self.es = AsyncElasticsearch(**es_config)
-    async def initialize(self):
+	async def create_index(self, map_body: dict, pipeline: str = None, replicas: int = 1, shards: int = 1):
        '''Initialize the Elasticsearch client.'''
        # Patching the Elasticsearch client to fix a bug with sniffing (https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960)
        import sniff_patch
        self.es = await sniff_patch.init_elasticsearch(**self.es_config)
        # Remove the above and uncomment the below if the bug is fixed in the Elasticsearch client:
        #self.es = AsyncElasticsearch(**es_config)
    async def create_index(self, map_body: dict, pipeline: str = '', replicas: int = 1, shards: int = 1):
 		'''
 		Create the Elasticsearch index with the defined mapping.
@ -82,8 +70,8 @@ class ElasticIndexer:
 		mapping = map_body
 		mapping['settings'] = {
-            'number_of_shards': shards,
+			'number_of_shards'   : shards,
-            'number_of_replicas': replicas
+			'number_of_replicas' : replicas
 		}
 		if pipeline:
@ -129,12 +117,12 @@ class ElasticIndexer:
 		count = 0
 		total = 0
 		try:
 			async for ok, result in async_streaming_bulk(self.es, actions=data_generator(file_path), chunk_size=self.chunk_size, max_chunk_bytes=self.chunk_max):
 				action, result = result.popitem()
 				if not ok:
 					logging.error(f'Failed to index document ({result["_id"]}) to {self.es_index} from {file_path} ({result})')
                input('Press Enter to continue...') # Debugging (will possibly remove this since we have retries enabled)
 					continue
 				count += 1
@ -146,22 +134,55 @@ class ElasticIndexer:
 			logging.info(f'Finished indexing {total:,} records to {self.es_index} from {file_path}')
 		except Exception as e:
 			raise Exception(f'Failed to index records to {self.es_index} from {file_path} ({e})')
 def setup_logger(name: str, level: int = logging.INFO, to_file: bool = False, max_bytes: int = 250000, backups: int = 7) -> logging.Logger:
 	'''
 	Setup a custom logger with options for console and file logging.
 	:param name: Name of the logger.
 	:param level: Logging level (e.g. logging.INFO).
 	:param to_file: Whether to also log to the rotating file logs/debug.log.
 	:param max_bytes: Maximum size in bytes before rotating log file.
 	:param backups: Number of backup files to keep.
 	:return: The configured logger instance.
 	'''
 	# NOTE: `level` is annotated as `int` because `logging._Level` only exists in type stubs;
 	# referencing it at runtime raises AttributeError as soon as this function is defined.
 	logger = logging.getLogger(name)
 	logger.setLevel(level)
 	formatter_console = logging.Formatter('%(asctime)s | %(levelname)9s | %(message)s', '%I:%M:%S')
 	formatter_file    = logging.Formatter('%(asctime)s | %(levelname)9s | %(filename)s.%(funcName)s | %(message)s', '%Y-%m-%d %I:%M:%S')
 	# The console handler is always attached
 	sh = logging.StreamHandler()
 	sh.setFormatter(formatter_console)
 	logger.addHandler(sh)
 	if to_file:
 		# `logging.handlers` is a submodule that a plain `import logging` does not load.
 		# A from-import is used so that `logging` is not rebound as a local name in this function.
 		from logging.handlers import RotatingFileHandler
 		os.makedirs('logs', exist_ok=True)
 		fh = RotatingFileHandler('logs/debug.log', maxBytes=max_bytes, backupCount=backups, encoding='utf-8')
 		fh.setFormatter(formatter_file)
 		logger.addHandler(fh)
 	return logger
 async def main():
 	'''Main function when running this script directly.'''
-    parser = argparse.ArgumentParser(description='Index data into Elasticsearch.')
+	parser = argparse.ArgumentParser(description='Elasticsearch Recon Ingestion Scripts (ERIS)')
 	# General arguments
 	parser.add_argument('input_path', help='Path to the input file or directory') # Required
 	parser.add_argument('--watch', action='store_true', help='Create or watch a FIFO for real-time indexing')
 	# Elasticsearch arguments
-    parser.add_argument('--host', default='http://localhost/', help='Elasticsearch host')
+	parser.add_argument('--host', default='http://localhost', help='Elasticsearch host')
 	parser.add_argument('--port', type=int, default=9200, help='Elasticsearch port')
 	parser.add_argument('--user', default='elastic', help='Elasticsearch username')
 	parser.add_argument('--password', default=os.getenv('ES_PASSWORD'), help='Elasticsearch password (if not provided, check environment variable ES_PASSWORD)')
-    #parser.add_argument('--api-key', default=os.getenv('ES_APIKEY'), help='Elasticsearch API Key for authentication (if not provided, check environment variable ES_APIKEY)')
+	parser.add_argument('--api-key', default=os.getenv('ES_APIKEY'), help='Elasticsearch API Key for authentication (if not provided, check environment variable ES_APIKEY)')
 	parser.add_argument('--self-signed', action='store_false', help='Elasticsearch is using self-signed certificates')
 	# Elasticsearch indexing arguments
@ -173,11 +194,11 @@ async def main():
 	# Performance arguments
 	parser.add_argument('--chunk-size', type=int, default=50000, help='Number of records to index in a chunk')
 	parser.add_argument('--chunk-max', type=int, default=100, help='Maximum size of a chunk in bytes')
-    parser.add_argument('--retries', type=int, default=100, help='Number of times to retry indexing a chunk before failing')
+	parser.add_argument('--retries', type=int, default=30, help='Number of times to retry indexing a chunk before failing')
 	parser.add_argument('--timeout', type=int, default=60, help='Number of seconds to wait before retrying a chunk')
 	# Ingestion arguments
-    parser.add_argument('--cert', action='store_true', help='Index Certstream records')
+	parser.add_argument('--certstream', action='store_true', help='Index Certstream records')
 	parser.add_argument('--httpx', action='store_true', help='Index Httpx records')
 	parser.add_argument('--masscan', action='store_true', help='Index Masscan records')
 	parser.add_argument('--massdns', action='store_true', help='Index Massdns records')
@ -185,20 +206,22 @@ async def main():
 	args = parser.parse_args()
 	if args.host.endswith('/'):
 		args.host = args.host[:-1]
 	if args.watch:
 		if not os.path.exists(args.input_path):
 			os.mkfifo(args.input_path)
-        elif os.path.exists(args.input_path) and stat.S_ISFIFO(os.stat(args.input_path).st_mode):
+		elif not stat.S_ISFIFO(os.stat(args.input_path).st_mode):
 			raise ValueError(f'Path {args.input_path} is not a FIFO')
 	elif not os.path.isdir(args.input_path) and not os.path.isfile(args.input_path):
 		raise FileNotFoundError(f'Input path {args.input_path} does not exist or is not a file or directory')
 	edx = ElasticIndexer(args)
    await edx.initialize() # Initialize the Elasticsearch client asyncronously
-    if args.cert:
+	if args.certstream:
-        from ingestors import ingest_certs   as ingestor
+		from ingestors import ingest_certstream as ingestor
-    if args.httpx:
+	elif args.httpx:
 		from ingestors import ingest_httpx      as ingestor
 	elif args.masscan:
 		from ingestors import ingest_masscan    as ingestor
@ -206,12 +229,11 @@ async def main():
 		from ingestors import ingest_massdns    as ingestor
 	elif args.zone:
 		from ingestors import ingest_zone       as ingestor
-
+	else:
-    if not isinstance(ingestor, object):
+		raise ValueError('No ingestor specified')
        raise ValueError('No ingestor selected')
 	health = await edx.get_cluster_health()
-    print(health)
+	logging.info(health)
 	await asyncio.sleep(5) # Delay to allow time for sniffing to complete
@ -228,10 +250,6 @@ async def main():
 		logging.info(f'Processing file: {args.input_path}')
 		await edx.process_data(args.input_path, ingestor.process_data)
    elif stat.S_ISFIFO(os.stat(args.input_path).st_mode):
        logging.info(f'Watching FIFO: {args.input_path}')
        await edx.process_data(args.input_path, ingestor.process_data)
 	elif os.path.isdir(args.input_path):
 		count = 1
 		total = len(os.listdir(args.input_path))
@ -248,4 +266,10 @@ async def main():
 if __name__ == '__main__':
 	# Configure the 'eris' logger (console plus log file) before any work starts.
 	setup_logger('eris', level=logging.INFO, to_file=True)
 	# Startup banner, framed by one empty line above and one below.
 	print('\n'.join((
 		'',
 		'┏┓┳┓┳┏┓   Elasticsearch Recon Ingestion Scripts',
 		'┣ ┣┫┃┗┓        Developed by Acidvegas in Python',
 		'┗┛┛┗┻┗┛             https://git.acid.vegas/eris',
 		'',
 	)))
 	# Hand control to the asynchronous entry point.
 	asyncio.run(main())
--- a/helpers/sniff_patch.py
+++ b/helpers/sniff_patch.py
@ -23,12 +23,12 @@ async def init_elasticsearch(*args, **kwargs):
    :param args: Async Elasticsearch positional arguments.
    :param kwargs: Async Elasticsearch keyword arguments.
    '''
-    async_client.default_sniff_callback = _override_sniff_callback(kwargs['basic_auth'])
+    async_client.default_sniff_callback = await _override_sniff_callback(kwargs['basic_auth'])
    return async_client.AsyncElasticsearch(*args, **kwargs)
-def _override_sniff_callback(basic_auth):
+async def _override_sniff_callback(basic_auth):
    '''
    Taken from https://github.com/elastic/elasticsearch-py/blob/8.8/elasticsearch/_sync/client/_base.py#L166
    Completely unmodified except for adding the auth header to the elastic request.
--- a/ingestors/ingest_certstream.py
+++ b/ingestors/ingest_certstream.py
@ -68,7 +68,7 @@ async def process_data(place_holder: str = None):
 							'seen'   : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
 						}
-						yield {'_id': id, '_index': default_index, '_source': struct}
+						yield {'_index': default_index, '_source': struct}
 		except websockets.ConnectionClosed:
 			logging.error('Connection to Certstream was closed. Attempting to reconnect...')