From 6c4ae3e988920565b33a2604599e59e22789ff72 Mon Sep 17 00:00:00 2001
From: acidvegas <acid.vegas@acid.vegas>
Date: Fri, 8 Mar 2024 12:13:57 -0500
Subject: [PATCH] Many bugs fixed in sniffer and async model.

---
 eris.py                     | 43 ++++++++++++++++++++-----------------
 ingestors/ingest_massdns.py | 31 +++++++++++++-------------
 sniff_patch.py              | 24 ++++++++++++---------
 3 files changed, 52 insertions(+), 46 deletions(-)

diff --git a/eris.py b/eris.py
index a3d37cf..fc47fc4 100644
--- a/eris.py
+++ b/eris.py
@@ -13,7 +13,7 @@ sys.dont_write_bytecode = True
 
 try:
     # This is commented out because there is a bug with the elasticsearch library that requires a patch (see initialize() method below)
-    #from elasticsearch import AsyncElasticsearch 
+    #from elasticsearch import AsyncElasticsearch
     from elasticsearch.exceptions import NotFoundError
     from elasticsearch.helpers import async_streaming_bulk
 except ImportError:
@@ -43,23 +43,23 @@ class ElasticIndexer:
             'request_timeout': args.timeout,
             'max_retries': args.retries,
             'retry_on_timeout': True,
-            'sniff_on_start': True, # Is this problematic? 
+            'sniff_on_start': False, # Problems when True....
             'sniff_on_node_failure': True,
             'min_delay_between_sniffing': 60 # Add config option for this?
         }
 
-        if args.api_key:
-            self.es_config['api_key'] = (args.api_key, '') # Verify this is correct
-        else:
-            self.es_config['basic_auth'] = (args.user, args.password)
-            
-        
+        #if args.api_key:
+        #    self.es_config['api_key'] = (args.api_key, '') # Verify this is correct
+        #else:
+        self.es_config['basic_auth'] = (args.user, args.password)
+
+
     async def initialize(self):
         '''Initialize the Elasticsearch client.'''
 
         # Patching the Elasticsearch client to fix a bug with sniffing (https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960)
         import sniff_patch
-        self.es = sniff_patch.init_elasticsearch(**self.es_config)
+        self.es = await sniff_patch.init_elasticsearch(**self.es_config)
 
         # Remove the above and uncomment the below if the bug is fixed in the Elasticsearch client:
         #self.es = AsyncElasticsearch(**es_config)
@@ -68,7 +68,7 @@ class ElasticIndexer:
     async def create_index(self, map_body: dict, pipeline: str = '', replicas: int = 1, shards: int = 1):
         '''
         Create the Elasticsearch index with the defined mapping.
-        
+
         :param map_body: Mapping for the index
         :param pipeline: Name of the ingest pipeline to use for the index
         :param replicas: Number of replicas for the index
@@ -106,7 +106,7 @@ class ElasticIndexer:
         '''Get the health of the Elasticsearch cluster.'''
 
         return await self.es.cluster.health()
-    
+
 
     async def get_cluster_size(self) -> int:
         '''Get the number of nodes in the Elasticsearch cluster.'''
@@ -128,7 +128,7 @@ class ElasticIndexer:
 
         count = 0
         total = 0
-        
+
         async for ok, result in async_streaming_bulk(self.es, actions=data_generator(file_path), chunk_size=self.chunk_size, max_chunk_bytes=self.chunk_max):
             action, result = result.popitem()
 
@@ -151,17 +151,17 @@ async def main():
     '''Main function when running this script directly.'''
 
     parser = argparse.ArgumentParser(description='Index data into Elasticsearch.')
-    
+
     # General arguments
     parser.add_argument('input_path', help='Path to the input file or directory') # Required
     parser.add_argument('--watch', action='store_true', help='Create or watch a FIFO for real-time indexing')
-    
+
     # Elasticsearch arguments
     parser.add_argument('--host', default='http://localhost/', help='Elasticsearch host')
     parser.add_argument('--port', type=int, default=9200, help='Elasticsearch port')
     parser.add_argument('--user', default='elastic', help='Elasticsearch username')
     parser.add_argument('--password', default=os.getenv('ES_PASSWORD'), help='Elasticsearch password (if not provided, check environment variable ES_PASSWORD)')
-    parser.add_argument('--api-key', default=os.getenv('ES_APIKEY'), help='Elasticsearch API Key for authentication (if not provided, check environment variable ES_APIKEY)')
+    #parser.add_argument('--api-key', default=os.getenv('ES_APIKEY'), help='Elasticsearch API Key for authentication (if not provided, check environment variable ES_APIKEY)')
     parser.add_argument('--self-signed', action='store_false', help='Elasticsearch is using self-signed certificates')
 
     # Elasticsearch indexing arguments
@@ -169,7 +169,7 @@ async def main():
     parser.add_argument('--pipeline', help='Use an ingest pipeline for the index')
     parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index')
     parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index')
-    
+
     # Performance arguments
     parser.add_argument('--chunk-size', type=int, default=50000, help='Number of records to index in a chunk')
     parser.add_argument('--chunk-max', type=int, default=100, help='Maximum size of a chunk in bytes')
@@ -206,12 +206,15 @@ async def main():
         from ingestors import ingest_massdns as ingestor
     elif args.zone:
         from ingestors import ingest_zone    as ingestor
-    
+
+    if not isinstance(ingestor, object):
+        raise ValueError('No ingestor selected')
+
     health = await edx.get_cluster_health()
     print(health)
 
     await asyncio.sleep(5) # Delay to allow time for sniffing to complete
-    
+
     nodes = await edx.get_cluster_size()
     logging.info(f'Connected to {nodes:,} Elasticsearch node(s)')
 
@@ -220,7 +223,7 @@ async def main():
 
     map_body = ingestor.construct_map()
     await edx.create_index(map_body, args.pipeline, args.replicas, args.shards)
-    
+
     if os.path.isfile(args.input_path):
         logging.info(f'Processing file: {args.input_path}')
         await edx.process_data(args.input_path, ingestor.process_data)
@@ -245,4 +248,4 @@ async def main():
 
 
 if __name__ == '__main__':
-    asyncio.run(main())
\ No newline at end of file
+    asyncio.run(main())
diff --git a/ingestors/ingest_massdns.py b/ingestors/ingest_massdns.py
index 7066f3e..e11d1ca 100644
--- a/ingestors/ingest_massdns.py
+++ b/ingestors/ingest_massdns.py
@@ -48,7 +48,8 @@ async def process_data(file_path: str):
 
             # Sentinel value to indicate the end of a process (for closing out a FIFO stream)
             if line == '~eof':
-                return last
+                yield last
+                break
 
             # Skip empty lines
             if not line:
@@ -61,50 +62,47 @@ async def process_data(file_path: str):
             if len(parts) < 3:
                 logging.warning(f'Invalid PTR record: {line}')
                 continue
-            
+
             # Split the PTR record into its parts
             name, record_type, record = parts[0].rstrip('.'), parts[1], ' '.join(parts[2:]).rstrip('.')
 
             # Do not index other records
             if record_type != 'PTR':
-                logging.warning(f'Invalid record type: {record_type}: {line}')
                 continue
 
             # Do not index PTR records that do not have a record
             if not record:
-                logging.warning(f'Empty PTR record: {line}')
                 continue
 
             # Let's not index the PTR record if it's the same as the in-addr.arpa domain
             if record == name:
-                logging.warning(f'PTR record is the same as the in-addr.arpa domain: {line}')
                 continue
-            
+
             # Get the IP address from the in-addr.arpa domain
             ip = '.'.join(name.replace('.in-addr.arpa', '').split('.')[::-1])
 
             # Check if we are still processing the same IP address
             if last:
                 if ip == last['_id']:
-                    last_record = last['_doc']['record']
+                    last_record = last['doc']['record']
                     if isinstance(last_record, list):
                         if record not in last_record:
-                            last['_doc']['record'].append(record)
+                            last['doc']['record'].append(record)
                         else:
                             logging.warning(f'Duplicate PTR record: {line}')
                     else:
                         if record != last_record:
-                            last['_doc']['record'] = [last_record, record] # IP addresses with more than one PTR record will turn into a list
+                            last['doc']['record'] = [last_record, record] # IP addresses with more than one PTR record will turn into a list
                     continue
                 else:
-                    yield last
-            
+                    yield last # Return the last document and start a new one
+
             # Cache the this document in-case we have more for the same IP address
             last = {
                 '_op_type' : 'update',
                 '_id'      : ip,
                 '_index'   : default_index,
-                '_doc'     : {
+                'doc'      : {
                     'ip'     : ip,
                     'record' : record,
                     'seen'   : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
@@ -116,7 +114,7 @@ async def process_data(file_path: str):
 async def test(input_path: str):
     '''
     Test the MassDNS ingestion process
-    
+
     :param input_path: Path to the MassDNS log file
     '''
     async for document in process_data(input_path):
@@ -131,16 +129,17 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='MassDNS Ingestor for ERIS')
     parser.add_argument('input_path', help='Path to the input file or directory')
     args = parser.parse_args()
-    
+
     asyncio.run(test(args.input_path))
 
 
 
 '''
 Deployment:
+    sudo apt-get install build-essential gcc make
     git clone --depth 1 https://github.com/blechschmidt/massdns.git $HOME/massdns && cd $HOME/massdns && make
     curl -s https://public-dns.info/nameservers.txt | grep -v ':' > $HOME/massdns/nameservers.txt
-    pythons ./scripts/ptr.py | ./bin/massdns -r $HOME/massdns/nameservers.txt -t PTR --filter NOERROR-s 1000 -o S -w $HOME/massdns/fifo.json
+    python3 ./scripts/ptr.py | ./bin/massdns -r $HOME/massdns/nameservers.txt -t PTR --filter NOERROR -s 500 -o S -w $HOME/massdns/fifo.json
     or...
     while true; do python ./scripts/ptr.py | ./bin/massdns -r $HOME/massdns/nameservers.txt -t PTR --filter NOERROR -s 1000 -o S -w $HOME/massdns/fifo.json; done
 
@@ -163,4 +162,4 @@ Input:
 Notes:
     Why do some IP addresses return a CNAME from a PTR request
     What is dns-servfail.net (Frequent CNAME response from PTR requests)
-'''
\ No newline at end of file
+'''
diff --git a/sniff_patch.py b/sniff_patch.py
index 0500cf0..2fd75fa 100644
--- a/sniff_patch.py
+++ b/sniff_patch.py
@@ -16,19 +16,19 @@ import elasticsearch._async.client as async_client
 from elasticsearch.exceptions import SerializationError, ConnectionError
 
 
-async def init_elasticsearch_async(*args, **kwargs):
+async def init_elasticsearch(*args, **kwargs):
     '''
     Initialize the Async Elasticsearch client with the sniff patch.
-    
+
     :param args: Async Elasticsearch positional arguments.
     :param kwargs: Async Elasticsearch keyword arguments.
     '''
-    async_client.default_sniff_callback = _override_async_sniff_callback(kwargs['basic_auth'])
+    async_client.default_sniff_callback = _override_sniff_callback(kwargs['basic_auth'])
 
     return async_client.AsyncElasticsearch(*args, **kwargs)
 
 
-def _override_async_sniff_callback(basic_auth):
+def _override_sniff_callback(basic_auth):
     '''
     Taken from https://github.com/elastic/elasticsearch-py/blob/8.8/elasticsearch/_sync/client/_base.py#L166
     Completely unmodified except for adding the auth header to the elastic request.
@@ -40,7 +40,7 @@ def _override_async_sniff_callback(basic_auth):
     auth_str = base64.b64encode(':'.join(basic_auth).encode()).decode()
     sniffed_node_callback = async_client._base._default_sniffed_node_callback
 
-    async def modified_async_sniff_callback(transport, sniff_options):
+    async def modified_sniff_callback(transport, sniff_options):
         for _ in transport.node_pool.all():
             try:
                 meta, node_infos = await transport.perform_request(
@@ -48,7 +48,7 @@ def _override_async_sniff_callback(basic_auth):
                     '/_nodes/_all/http',
                     headers={
                         'accept': 'application/vnd.elasticsearch+json; compatible-with=8',
-                        'authorization': f'Basic {auth_str}'  # This auth header is missing in 8.x releases of the client, and causes 401s
+                        'authorization': f'Basic {auth_str}' # This auth header is missing in 8.x releases of the client, and causes 401s
                     },
                     request_timeout=(
                         sniff_options.sniff_timeout
@@ -79,9 +79,13 @@ def _override_async_sniff_callback(basic_auth):
                     port = int(port_str)
 
                 assert sniffed_node_callback is not None
-                sniffed_node = await sniffed_node_callback(
-                    node_info, meta.node.replace(host=host, port=port)
-                )
+
+                # Pay no mind to this, it's just a workaround for my own setup.
+                #host = elastic.domain.com
+                #port = int(str(port).replace('', ''))
+
+                sniffed_node = sniffed_node_callback(node_info, meta.node.replace(host=host, port=port))
+
                 if sniffed_node is None:
                     continue
 
@@ -93,4 +97,4 @@ def _override_async_sniff_callback(basic_auth):
 
         return []
 
-    return modified_async_sniff_callback
\ No newline at end of file
+    return modified_sniff_callback