From b6fb68ba3a060d715b966f163cfefc5a75531b61 Mon Sep 17 00:00:00 2001
From: acidvegas <acid.vegas@acid.vegas>
Date: Tue, 5 Mar 2024 16:47:11 -0500
Subject: [PATCH] Asyncronous developed mon ERIS is complete, need to refactor
 ingestion helpers before pushing this as the main version

---
 README.md                                     |   1 +
 async_dev/eris.py                             | 126 +++++++-----------
 async_dev/ingestors/ingest_certs.py           |   9 +-
 ...est_masscan_async.py => ingest_masscan.py} |  23 ++--
 4 files changed, 73 insertions(+), 86 deletions(-)
 rename async_dev/ingestors/{ingest_masscan_async.py => ingest_masscan.py} (78%)

diff --git a/README.md b/README.md
index accbb03..f91f638 100644
--- a/README.md
+++ b/README.md
@@ -86,6 +86,7 @@ Create & add a geoip pipeline and use the following in your index mappings:
 ## Roadmap
 - Implement [async elasticsearch](https://elasticsearch-py.readthedocs.io/en/v8.12.1/async.html) into the code.
 - WHOIS database ingestion scripts
+- Dynamically update the batch metrics when the sniffer adds or removes nodes
 
 ___
 
diff --git a/async_dev/eris.py b/async_dev/eris.py
index 183b21f..093901a 100644
--- a/async_dev/eris.py
+++ b/async_dev/eris.py
@@ -2,11 +2,11 @@
 # Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
 # eris.py [asyncronous developement]
 
+import asyncio
 import argparse
 import logging
 import os
 import stat
-import time
 import sys
 
 sys.dont_write_bytecode = True
@@ -31,41 +31,44 @@ class ElasticIndexer:
         '''
 
         self.chunk_size = args.chunk_size
-        self.chunk_threads = args.chunk_threads
-        self.dry_run = args.dry_run
+        self.es = None
         self.es_index = args.index
+        
+        self.es_config = {
+            'hosts': [f'{args.host}:{args.port}'],
+            'verify_certs': args.self_signed,
+            'ssl_show_warn': args.self_signed,
+            'request_timeout': args.timeout,
+            'max_retries': args.retries,
+            'retry_on_timeout': True,
+            'sniff_on_start': True, # Is this problematic? 
+            'sniff_on_node_failure': True,
+            'min_delay_between_sniffing': 60 # Add config option for this?
+        }
 
-        if not args.dry_run:
-            es_config = {
-                'hosts': [f'{args.host}:{args.port}'],
-                'verify_certs': args.self_signed,
-                'ssl_show_warn': args.self_signed,
-                'request_timeout': args.timeout,
-                'max_retries': args.retries,
-                'retry_on_timeout': True,
-                'sniff_on_start': True, # Is this problematic? 
-                'sniff_on_node_failure': True,
-                'min_delay_between_sniffing': 60 # Add config option for this?
-            }
+        if args.api_key:
+            self.es_config['api_key'] = (args.api_key, '') # Verify this is correct
+        else:
+            self.es_config['basic_auth'] = (args.user, args.password)
+            
+        
+    async def initialize(self):
+        '''Initialize the Elasticsearch client.'''
 
-            if args.api_key:
-                es_config['api_key'] = (args.key, '') # Verify this is correct
-            else:
-                es_config['basic_auth'] = (args.user, args.password)
-                
-            # Patching the Elasticsearch client to fix a bug with sniffing (https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960)
-            import sniff_patch
-            self.es = sniff_patch.init_elasticsearch(**es_config)
+        # Patching the Elasticsearch client to fix a bug with sniffing (https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960)
+        import sniff_patch
+        self.es = sniff_patch.init_elasticsearch(**self.es_config)
 
-            # Remove the above and uncomment the below if the bug is fixed in the Elasticsearch client:
-            #self.es = AsyncElasticsearch(**es_config)
+        # Remove the above and uncomment the below if the bug is fixed in the Elasticsearch client:
+        #self.es = AsyncElasticsearch(**es_config)
 
 
-    async def create_index(self, map_body: dict, pipeline: str = '', replicas: int = 1, shards: int = 1, ):
+    async def create_index(self, map_body: dict, pipeline: str = '', replicas: int = 1, shards: int = 1):
         '''
         Create the Elasticsearch index with the defined mapping.
         
-        :param pipline: Name of the ingest pipeline to use for the index
+        :param map_body: Mapping for the index
+        :param pipeline: Name of the ingest pipeline to use for the index
         :param replicas: Number of replicas for the index
         :param shards: Number of shards for the index
         '''
@@ -112,7 +115,7 @@ class ElasticIndexer:
         return number_of_nodes
 
 
-    async def async_bulk_index_data(self, file_path: str, index_name: str, data_generator: callable):
+    async def process_data(self, file_path: str, data_generator: callable):
         '''
         Index records in chunks to Elasticsearch.
 
@@ -124,11 +127,11 @@ class ElasticIndexer:
         count = 0
         total = 0
         
-        async for ok, result in async_streaming_bulk(self.es, index_name=self.es_index, actions=data_generator(file_path), chunk_size=self.chunk_size):
+        async for ok, result in async_streaming_bulk(self.es, actions=data_generator(file_path), chunk_size=self.chunk_size):
             action, result = result.popitem()
 
             if not ok:
-                logging.error(f'Failed to index document ({result["_id"]}) to {index_name} from {file_path} ({result})')
+                logging.error(f'Failed to index document ({result["_id"]}) to {self.es_index} from {file_path} ({result})')
                 input('Press Enter to continue...') # Debugging (will possibly remove this since we have retries enabled)
                 continue
 
@@ -139,41 +142,16 @@ class ElasticIndexer:
                 logging.info(f'Successfully indexed {self.chunk_size:,} ({total:,} processed) records to {self.es_index} from {file_path}')
                 count = 0
 
-        logging.info(f'Finished indexing {self.total:,} records to {self.es_index} from {file_path}')
+        logging.info(f'Finished indexing {total:,} records to {self.es_index} from {file_path}')
 
 
-    async def process_file(self, file_path: str, ingest_function: callable):
-        '''
-        Read and index records in batches to Elasticsearch.
-
-        :param file_path: Path to the file
-        :param batch_size: Number of records to index per batch
-        :param ingest_function: Function to process the file
-        '''
-
-        count = 0
-
-        async for processed in ingest_function(file_path):
-            if not processed:
-                break
-
-            if self.dry_run:
-                print(processed)
-                continue
-
-            count += 1
-
-            yield {'_index': self.es_index, '_source': processed}
-
-
-def main():
+async def main():
     '''Main function when running this script directly.'''
 
     parser = argparse.ArgumentParser(description='Index data into Elasticsearch.')
     
     # General arguments
     parser.add_argument('input_path', help='Path to the input file or directory') # Required
-    parser.add_argument('--dry-run', action='store_true', help='Dry run (do not index records to Elasticsearch)')
     parser.add_argument('--watch', action='store_true', help='Create or watch a FIFO for real-time indexing')
     
     # Elasticsearch arguments
@@ -192,7 +170,6 @@ def main():
     
     # Performance arguments
     parser.add_argument('--chunk-size', type=int, default=50000, help='Number of records to index in a chunk')
-    parser.add_argument('--chunk-threads', type=int, default=3, help='Number of threads to use when indexing in chunks')
     parser.add_argument('--retries', type=int, default=60, help='Number of times to retry indexing a chunk before failing')
     parser.add_argument('--timeout', type=int, default=30, help='Number of seconds to wait before retrying a chunk')
 
@@ -214,6 +191,7 @@ def main():
         raise FileNotFoundError(f'Input path {args.input_path} does not exist or is not a file or directory')
 
     edx = ElasticIndexer(args)
+    await edx.initialize() # Initialize the Elasticsearch client asyncronously
 
     if args.cert:
         from ingestors import ingest_certs   as ingestor
@@ -225,32 +203,28 @@ def main():
         from ingestors import ingest_massdns as ingestor
     elif args.zone:
         from ingestors import ingest_zone    as ingestor
-
-    batch_size = 0
     
-    if not args.dry_run:
-        print(edx.get_cluster_health())
+    health = await edx.get_cluster_health()
+    print(health)
 
-        time.sleep(3) # Delay to allow time for sniffing to complete
-        
-        nodes = edx.get_cluster_size()
-        logging.info(f'Connected to {nodes:,} Elasticsearch node(s)')
+    await asyncio.sleep(5) # Delay to allow time for sniffing to complete
+    
+    nodes = await edx.get_cluster_size()
+    logging.info(f'Connected to {nodes:,} Elasticsearch node(s)')
 
-        if not edx.es_index:
-            edx.es_index = ingestor.default_index
+    if not edx.es_index:
+        edx.es_index = ingestor.default_index
 
-        map_body = ingestor.construct_map()
-        edx.create_index(map_body, args.pipeline, args.replicas, args.shards)
-
-        batch_size = int(nodes * (args.chunk_size * args.chunk_threads))
+    map_body = ingestor.construct_map()
+    await edx.create_index(map_body, args.pipeline, args.replicas, args.shards)
     
     if os.path.isfile(args.input_path):
         logging.info(f'Processing file: {args.input_path}')
-        edx.process_file(args.input_path, batch_size, ingestor.process_file)
+        await edx.process_data(args.input_path, ingestor.process_data)
 
     elif stat.S_ISFIFO(os.stat(args.input_path).st_mode):
         logging.info(f'Watching FIFO: {args.input_path}')
-        edx.process_file(args.input_path, batch_size, ingestor.process_file)
+        await edx.process_data(args.input_path, ingestor.process_data)
 
     elif os.path.isdir(args.input_path):
         count = 1
@@ -260,7 +234,7 @@ def main():
             file_path = os.path.join(args.input_path, file)
             if os.path.isfile(file_path):
                 logging.info(f'[{count:,}/{total:,}] Processing file: {file_path}')
-                edx.process_file(file_path, batch_size, ingestor.process_file)
+                await edx.process_data(file_path, ingestor.process_data)
                 count += 1
             else:
                 logging.warning(f'[{count:,}/{total:,}] Skipping non-file: {file_path}')
@@ -268,4 +242,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    asyncio.run(main())
\ No newline at end of file
diff --git a/async_dev/ingestors/ingest_certs.py b/async_dev/ingestors/ingest_certs.py
index 5a5e300..40404d4 100644
--- a/async_dev/ingestors/ingest_certs.py
+++ b/async_dev/ingestors/ingest_certs.py
@@ -91,8 +91,12 @@ def construct_map() -> dict:
     return mapping
 
 
-async def process():
-    '''Read and process Certsream records live from the Websocket stream.'''
+async def process_data(file_path: str = None):
+    '''
+    Read and process Certsream records live from the Websocket stream.
+    
+    :param file_path: Path to the Certstream log file (unused, placeholder for consistency with other ingestors)
+    '''
 
     while True:
         try:
@@ -105,6 +109,7 @@ async def process():
                         record = json.loads(line)
                     except json.decoder.JSONDecodeError:
                         logging.error(f'Failed to parse JSON record from Certstream! ({line})')
+                        input('Press Enter to continue...') # Pause the script to allow the user to read the error message
                         continue
 
                     yield record
diff --git a/async_dev/ingestors/ingest_masscan_async.py b/async_dev/ingestors/ingest_masscan.py
similarity index 78%
rename from async_dev/ingestors/ingest_masscan_async.py
rename to async_dev/ingestors/ingest_masscan.py
index df0c2a3..8fcc0f2 100644
--- a/async_dev/ingestors/ingest_masscan_async.py
+++ b/async_dev/ingestors/ingest_masscan.py
@@ -59,7 +59,7 @@ def construct_map() -> dict:
     return mapping
 
 
-async def process_file(file_path: str):
+async def process_data(file_path: str):
     '''
     Read and process Masscan records from the log file.
 
@@ -79,22 +79,29 @@ async def process_file(file_path: str):
             try:
                 record = json.loads(line)
             except json.decoder.JSONDecodeError:
+                # In rare cases, the JSON record may be incomplete or malformed:
+                #   {   "ip": "51.161.12.223",   "timestamp": "1707628302", "ports": [ {"port": 22, "proto": "tcp", "service": {"name": "ssh", "banner":
+                #   {   "ip": "83.66.211.246",   "timestamp": "1706557002"
                 logging.error(f'Failed to parse JSON record! ({line})')
-                input('Press Enter to continue...') # Debugging
+                input('Press Enter to continue...') # Pause for review & debugging (Will remove pausing in production, still investigating the cause of this issue.)
                 continue
 
+            if len(record['ports']) > 1:
+                logging.warning(f'Multiple ports found for record! ({record})')
+                input('Press Enter to continue...') # Pause for review (Will remove pausing in production, still investigating if you ever seen more than one port in a record.)
+
             for port_info in record['ports']:
                 struct = {
-                    'ip': record['ip'],
-                    'port': port_info['port'],
-                    'proto': port_info['proto'],
-                    'seen': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(int(record['timestamp']))),
+                    'ip'    : record['ip'],
+                    'port'  : port_info['port'],
+                    'proto' : port_info['proto'],
+                    'seen'  : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(int(record['timestamp']))),
                 }
 
                 if 'service' in port_info:
                     if 'name' in port_info['service']:
-                        if port_info['service']['name'] != 'unknown':
-                            struct['service'] = port_info['service']['name']
+                        if (service_name := port_info['service']['name']) not in ('unknown',''):
+                            struct['service'] = service_name
 
                     if 'banner' in port_info['service']:
                         banner = ' '.join(port_info['service']['banner'].split()) # Remove extra whitespace