From b15b3d8241c12ec304e1fabe293281868c604cc2 Mon Sep 17 00:00:00 2001
From: acidvegas
Date: Wed, 6 Mar 2024 15:07:52 -0500
Subject: [PATCH] OCD about formatting again

---
 ingestors/ingest_certs.py   |  14 ++---
 ingestors/ingest_httpx.py   | 111 ++++++++++++++++++++++--------------
 ingestors/ingest_massdns.py |   8 +--
 ingestors/ingest_zone.py    |  10 ++--
 4 files changed, 83 insertions(+), 60 deletions(-)

diff --git a/ingestors/ingest_certs.py b/ingestors/ingest_certs.py
index 185e60b..af5a6be 100644
--- a/ingestors/ingest_certs.py
+++ b/ingestors/ingest_certs.py
@@ -16,7 +16,7 @@ default_index = 'cert-stream'
 def construct_map() -> dict:
     '''Construct the Elasticsearch index mapping for Certstream records.'''

-    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
+    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }

     mapping = {
         'mappings': {
@@ -24,8 +24,8 @@ def construct_map() -> dict:
                 'data': {
                     'properties': {
                         'cert_index': { 'type': 'integer' },
-                        'cert_link': { 'type': 'keyword' },
-                        'leaf_cert': {
+                        'cert_link' : { 'type': 'keyword' },
+                        'leaf_cert' : {
                             'properties': {
                                 'all_domains': { 'type': 'keyword' },
                                 'extensions': {
@@ -55,10 +55,10 @@ def construct_map() -> dict:
                                         'emailAddress' : { 'type': 'text' }
                                     }
                                 },
-                                'not_after': { 'type': 'integer' },
-                                'not_before': { 'type': 'integer' },
-                                'serial_number': { 'type': 'keyword' },
-                                'signature_algorithm': { 'type': 'text' },
+                                'not_after' : { 'type': 'integer' },
+                                'not_before' : { 'type': 'integer' },
+                                'serial_number' : { 'type': 'keyword' },
+                                'signature_algorithm' : { 'type': 'text' },
                                 'subject': {
                                     'properties': {
                                         'C' : { 'type': 'keyword' },
diff --git a/ingestors/ingest_httpx.py b/ingestors/ingest_httpx.py
index 93d8b58..0bb9bc2 100644
--- a/ingestors/ingest_httpx.py
+++ b/ingestors/ingest_httpx.py
@@ -19,7 +19,37 @@ def construct_map() -> dict:
     mapping = {
         'mappings': {
             'properties': {
-                'change': 'me'
+                "timestamp" : { 'type' : 'date' },
+                "hash" : {
+                    "body_md5" : { 'type': 'keyword' },
+                    "body_mmh3" : { 'type': 'keyword' },
+                    "body_sha256" : { 'type': 'keyword' },
+                    "body_simhash" : { 'type': 'keyword' },
+                    "header_md5" : { 'type': 'keyword' },
+                    "header_mmh3" : { 'type': 'keyword' },
+                    "header_sha256" : { 'type': 'keyword' },
+                    "header_simhash" : { 'type': 'keyword' }
+                },
+                "port" : { 'type': 'integer' },
+                "url" : keyword_mapping,
+                "input" : keyword_mapping,
+                "title" : keyword_mapping,
+                "scheme" : { 'type': 'keyword' },
+                "webserver" : { 'type': 'keyword' },
+                "body_preview" : keyword_mapping,
+                "content_type" : { 'type': 'keyword' },
+                "method" : { 'type': 'keyword'},
+                "host" : { 'type': 'ip'},
+                "path" : keyword_mapping,
+                "favicon" : { 'type': 'keyword' },
+                "favicon_path" : keyword_mapping,
+                "a" : { 'type': 'ip'},
+                "aaaa" : { 'type': 'ip'},
+                "tech" : keyword_mapping,
+                "words" : { 'type': 'integer'},
+                "lines" : { 'type': 'integer'},
+                "status_code" : { 'type': 'integer'},
+                "content_length" : { 'type': 'integer'}
             }
         }
     }
@@ -43,14 +73,13 @@ async def process_data(file_path: str):

             record = json.loads(line)

-            record['seen'] = record.pop('timestamp').split('.')[0] + 'Z' # Hacky solution to maintain ISO 8601 format without milliseconds or offsets
+            record['seen'] = record.pop('timestamp').split('.')[0] + 'Z' # Hacky solution to maintain ISO 8601 format without milliseconds or offsets
             record['domain'] = record.pop('input')

-            del record['failed'], record['knowledgebase'], record['time']
+            for item in ('failed', 'knowledgebase', 'time'):
+                del record[item]

-            yield {'_index': default_index, '_source': record}
-
-            return None # EOF
+            yield {'_id': record['domain'], '_index': default_index, '_source': record}
@@ -59,45 +88,39 @@ Example record:
 {
     "timestamp":"2024-01-14T13:08:15.117348474-05:00", # Rename to seen and remove milliseconds and offset
     "hash": { # Do we need all of these ?
-        "body_md5":"4ae9394eb98233b482508cbda3b33a66",
-        "body_mmh3":"-4111954",
-        "body_sha256":"89e06e8374353469c65adb227b158b265641b424fba7ddb2c67eef0c4c1280d3",
-        "body_simhash":"9814303593401624250",
-        "header_md5":"980366deb2b2fb5df2ad861fc63e79ce",
-        "header_mmh3":"-813072798",
-        "header_sha256":"39aea75ad548e38b635421861641ad1919ed3b103b17a33c41e7ad46516f736d",
-        "header_simhash":"10962523587435277678"
+        "body_md5" : "4ae9394eb98233b482508cbda3b33a66",
+        "body_mmh3" : "-4111954",
+        "body_sha256" : "89e06e8374353469c65adb227b158b265641b424fba7ddb2c67eef0c4c1280d3",
+        "body_simhash" : "9814303593401624250",
+        "header_md5" : "980366deb2b2fb5df2ad861fc63e79ce",
+        "header_mmh3" : "-813072798",
+        "header_sha256" : "39aea75ad548e38b635421861641ad1919ed3b103b17a33c41e7ad46516f736d",
+        "header_simhash" : "10962523587435277678"
     },
-    "port":"443",
-    "url":"https://supernets.org", # Remove this and only use the input field as "domain" maybe
-    "input":"supernets.org", # rename to domain
-    "title":"SuperNETs",
-    "scheme":"https",
-    "webserver":"nginx",
-    "body_preview":"SUPERNETS Home About Contact Donate Docs Network IRC Git Invidious Jitsi LibreX Mastodon Matrix Sup",
-    "content_type":"text/html",
-    "method":"GET", # Do we need this ?
-    "host":"51.89.151.158",
-    "path":"/",
-    "favicon":"-674048714",
-    "favicon_path":"/i/favicon.png",
-    "time":"592.907689ms", # Do we need this ?
-    "a":[
-        "6.150.220.23"
-    ],
-    "tech":[
-        "Bootstrap:4.0.0",
-        "HSTS",
-        "Nginx"
-    ],
-    "words":436, # Do we need this ?
-    "lines":79, # Do we need this ?
-    "status_code":200,
-    "content_length":4597,
-    "failed":false, # Do we need this ?
-    "knowledgebase":{ # Do we need this ?
-        "PageType":"nonerror",
-        "pHash":0
+    "port" : "443",
+    "url" : "https://supernets.org", # Remove this and only use the input field as "domain" maybe
+    "input" : "supernets.org", # rename to domain
+    "title" : "SuperNETs",
+    "scheme" : "https",
+    "webserver" : "nginx",
+    "body_preview" : "SUPERNETS Home About Contact Donate Docs Network IRC Git Invidious Jitsi LibreX Mastodon Matrix Sup",
+    "content_type" : "text/html",
+    "method" : "GET", # Remove this
+    "host" : "51.89.151.158",
+    "path" : "/",
+    "favicon" : "-674048714",
+    "favicon_path" : "/i/favicon.png",
+    "time" : "592.907689ms", # Do we need this ?
+    "a" : ["6.150.220.23"],
+    "tech" : ["Bootstrap:4.0.0", "HSTS", "Nginx"],
+    "words" : 436, # Do we need this ?
+    "lines" : 79, # Do we need this ?
+    "status_code" : 200,
+    "content_length" : 4597,
+    "failed" : false, # Do we need this ?
+    "knowledgebase" : { # Do we need this ?
+        "PageType" : "nonerror",
+        "pHash" : 0
     }
 }
 '''
\ No newline at end of file
diff --git a/ingestors/ingest_massdns.py b/ingestors/ingest_massdns.py
index 158fa7b..7f18aad 100644
--- a/ingestors/ingest_massdns.py
+++ b/ingestors/ingest_massdns.py
@@ -15,15 +15,15 @@ default_index = 'ptr-records'
 def construct_map() -> dict:
     '''Construct the Elasticsearch index mapping for MassDNS records'''

-    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
+    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }

     mapping = {
         'mappings': {
             'properties': {
-                'ip' : { 'type' : 'ip' },
-                'name' : { 'type' : 'keyword' },
+                'ip' : { 'type': 'ip' },
+                'name' : { 'type': 'keyword' },
                 'record' : keyword_mapping,
-                'seen' : { 'type' : 'date' }
+                'seen' : { 'type': 'date' }
             }
         }
     }
diff --git a/ingestors/ingest_zone.py b/ingestors/ingest_zone.py
index 5089387..077355a 100644
--- a/ingestors/ingest_zone.py
+++ b/ingestors/ingest_zone.py
@@ -17,14 +17,14 @@ record_types = ('a','aaaa','caa','cdnskey','cds','cname','dnskey','ds','mx','na
 def construct_map() -> dict:
     '''Construct the Elasticsearch index mapping for zone file records.'''

-    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
+    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }

     mapping = {
         'mappings': {
             'properties': {
-                'domain': keyword_mapping,
-                'records': { 'properties': {} },
-                'seen': {'type': 'date'}
+                'domain' : keyword_mapping,
+                'records' : { 'properties': {} },
+                'seen' : { 'type': 'date' }
             }
         }
    }
@@ -93,7 +93,7 @@ async def process_data(file_path: str):
             if record_type == 'nsec':
                 data = ' '.join([data.split()[0].rstrip('.'), *data.split()[1:]])
             elif record_type == 'soa':
-                data = ' '.join([part.rstrip('.') if '.' in part else part for part in data.split()])
+                data = ' '.join([part.rstrip('.') if '.' in part else part for part in data.split()])
             elif data.endswith('.'):
                 data = data.rstrip('.')
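Note: the ingest_httpx.py hunks are the substantive part of this patch (the other three files only get whitespace alignment). Below is a minimal standalone sketch of the normalization the patched process_data() applies to each line of httpx JSON output: timestamp is renamed to seen and truncated to whole seconds, input is renamed to domain, the unused failed/knowledgebase/time fields are dropped, and the bulk action is keyed on the domain. The default_index value and the sample record here are illustrative assumptions; only the field handling mirrors the patch.

#!/usr/bin/env python
# Standalone sketch of the normalization done in the patched process_data()
# (field handling mirrors the hunk above; default_index and the sample record
# below are illustrative assumptions, not values taken from the repository)

import json

default_index = 'httpx-logs' # hypothetical index name; the real value is defined earlier in ingest_httpx.py and is not shown in this patch


def normalize(line: str) -> dict:
    '''Convert one line of httpx JSON output into an Elasticsearch bulk action.'''

    record = json.loads(line)

    # Drop fractional seconds and the numeric offset, then tag as Zulu
    # (the patch's admittedly hacky ISO 8601 normalization)
    record['seen'] = record.pop('timestamp').split('.')[0] + 'Z'

    # httpx calls the probed target "input"; store it as "domain"
    record['domain'] = record.pop('input')

    # Fields the ingestor does not keep
    for item in ('failed', 'knowledgebase', 'time'):
        del record[item]

    # Key the bulk action on the domain so a re-scan replaces the old document
    return {'_id': record['domain'], '_index': default_index, '_source': record}


if __name__ == '__main__':
    sample = '{"timestamp": "2024-01-14T13:08:15.117348474-05:00", "input": "supernets.org", "failed": false, "knowledgebase": {"PageType": "nonerror"}, "time": "592.907689ms", "status_code": 200}'
    print(json.dumps(normalize(sample), indent=4))

Reusing the domain as _id is what makes the new yield idempotent: ingesting a later scan of the same host overwrites the earlier document instead of accumulating duplicates under auto-generated ids.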