diff --git a/ingestors/ingest_certs.py b/ingestors/ingest_certs.py
index 185e60b..af5a6be 100644
--- a/ingestors/ingest_certs.py
+++ b/ingestors/ingest_certs.py
@@ -16,7 +16,7 @@ default_index = 'cert-stream'
 def construct_map() -> dict:
     '''Construct the Elasticsearch index mapping for Certstream records.'''
 
-    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
+    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
 
     mapping = {
         'mappings': {
@@ -24,8 +24,8 @@ def construct_map() -> dict:
             'data': {
                 'properties': {
                     'cert_index': { 'type': 'integer' },
-                    'cert_link': { 'type': 'keyword' },
-                    'leaf_cert': {
+                    'cert_link' : { 'type': 'keyword' },
+                    'leaf_cert' : {
                         'properties': {
                             'all_domains': { 'type': 'keyword' },
                             'extensions': {
@@ -55,10 +55,10 @@ def construct_map() -> dict:
                                     'emailAddress' : { 'type': 'text' }
                                 }
                             },
-                            'not_after': { 'type': 'integer' },
-                            'not_before': { 'type': 'integer' },
-                            'serial_number': { 'type': 'keyword' },
-                            'signature_algorithm': { 'type': 'text' },
+                            'not_after' : { 'type': 'integer' },
+                            'not_before' : { 'type': 'integer' },
+                            'serial_number' : { 'type': 'keyword' },
+                            'signature_algorithm' : { 'type': 'text' },
                             'subject': {
                                 'properties': {
                                     'C' : { 'type': 'keyword' },
diff --git a/ingestors/ingest_httpx.py b/ingestors/ingest_httpx.py
index 93d8b58..0bb9bc2 100644
--- a/ingestors/ingest_httpx.py
+++ b/ingestors/ingest_httpx.py
@@ -19,7 +19,41 @@ def construct_map() -> dict:
+    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
+
     mapping = {
         'mappings': {
             'properties': {
-                'change': 'me'
+                "seen" : { 'type' : 'date' },
+                "hash" : {
+                    "properties" : {
+                        "body_md5" : { 'type': 'keyword' },
+                        "body_mmh3" : { 'type': 'keyword' },
+                        "body_sha256" : { 'type': 'keyword' },
+                        "body_simhash" : { 'type': 'keyword' },
+                        "header_md5" : { 'type': 'keyword' },
+                        "header_mmh3" : { 'type': 'keyword' },
+                        "header_sha256" : { 'type': 'keyword' },
+                        "header_simhash" : { 'type': 'keyword' }
+                    }
+                },
+                "port" : { 'type': 'integer' },
+                "url" : keyword_mapping,
+                "domain" : keyword_mapping,
+                "title" : keyword_mapping,
+                "scheme" : { 'type': 'keyword' },
+                "webserver" : { 'type': 'keyword' },
+                "body_preview" : keyword_mapping,
+                "content_type" : { 'type': 'keyword' },
+                "method" : { 'type': 'keyword'},
+                "host" : { 'type': 'ip'},
+                "path" : keyword_mapping,
+                "favicon" : { 'type': 'keyword' },
+                "favicon_path" : keyword_mapping,
+                "a" : { 'type': 'ip'},
+                "aaaa" : { 'type': 'ip'},
+                "tech" : keyword_mapping,
+                "words" : { 'type': 'integer'},
+                "lines" : { 'type': 'integer'},
+                "status_code" : { 'type': 'integer'},
+                "content_length" : { 'type': 'integer'}
             }
         }
     }
@@ -43,14 +77,13 @@ async def process_data(file_path: str):
 
             record = json.loads(line)
 
-            record['seen'] = record.pop('timestamp').split('.')[0] + 'Z' # Hacky solution to maintain ISO 8601 format without milliseconds or offsets
+            record['seen'] = record.pop('timestamp').split('.')[0] + 'Z' # Hacky solution to maintain ISO 8601 format without milliseconds or offsets
             record['domain'] = record.pop('input')
 
-            del record['failed'], record['knowledgebase'], record['time']
+            for item in ('failed', 'knowledgebase', 'time'):
+                del record[item]
 
-            yield {'_index': default_index, '_source': record}
-
-            return None # EOF
+            yield {'_id': record['domain'], '_index': default_index, '_source': record}
 
 
 
@@ -59,45 +92,39 @@ Example record:
 {
     "timestamp":"2024-01-14T13:08:15.117348474-05:00", # Rename to seen and remove milliseconds and offset
     "hash": { # Do we need all of these ?
-        "body_md5":"4ae9394eb98233b482508cbda3b33a66",
-        "body_mmh3":"-4111954",
-        "body_sha256":"89e06e8374353469c65adb227b158b265641b424fba7ddb2c67eef0c4c1280d3",
-        "body_simhash":"9814303593401624250",
-        "header_md5":"980366deb2b2fb5df2ad861fc63e79ce",
-        "header_mmh3":"-813072798",
-        "header_sha256":"39aea75ad548e38b635421861641ad1919ed3b103b17a33c41e7ad46516f736d",
-        "header_simhash":"10962523587435277678"
+        "body_md5" : "4ae9394eb98233b482508cbda3b33a66",
+        "body_mmh3" : "-4111954",
+        "body_sha256" : "89e06e8374353469c65adb227b158b265641b424fba7ddb2c67eef0c4c1280d3",
+        "body_simhash" : "9814303593401624250",
+        "header_md5" : "980366deb2b2fb5df2ad861fc63e79ce",
+        "header_mmh3" : "-813072798",
+        "header_sha256" : "39aea75ad548e38b635421861641ad1919ed3b103b17a33c41e7ad46516f736d",
+        "header_simhash" : "10962523587435277678"
     },
-    "port":"443",
-    "url":"https://supernets.org", # Remove this and only use the input field as "domain" maybe
-    "input":"supernets.org", # rename to domain
-    "title":"SuperNETs",
-    "scheme":"https",
-    "webserver":"nginx",
-    "body_preview":"SUPERNETS Home About Contact Donate Docs Network IRC Git Invidious Jitsi LibreX Mastodon Matrix Sup",
-    "content_type":"text/html",
-    "method":"GET", # Do we need this ?
-    "host":"51.89.151.158",
-    "path":"/",
-    "favicon":"-674048714",
-    "favicon_path":"/i/favicon.png",
-    "time":"592.907689ms", # Do we need this ?
-    "a":[
-        "6.150.220.23"
-    ],
-    "tech":[
-        "Bootstrap:4.0.0",
-        "HSTS",
-        "Nginx"
-    ],
-    "words":436, # Do we need this ?
-    "lines":79, # Do we need this ?
-    "status_code":200,
-    "content_length":4597,
-    "failed":false, # Do we need this ?
-    "knowledgebase":{ # Do we need this ?
-        "PageType":"nonerror",
-        "pHash":0
+    "port" : "443",
+    "url" : "https://supernets.org", # Remove this and only use the input field as "domain" maybe
+    "input" : "supernets.org", # rename to domain
+    "title" : "SuperNETs",
+    "scheme" : "https",
+    "webserver" : "nginx",
+    "body_preview" : "SUPERNETS Home About Contact Donate Docs Network IRC Git Invidious Jitsi LibreX Mastodon Matrix Sup",
+    "content_type" : "text/html",
+    "method" : "GET", # Remove this
+    "host" : "51.89.151.158",
+    "path" : "/",
+    "favicon" : "-674048714",
+    "favicon_path" : "/i/favicon.png",
+    "time" : "592.907689ms", # Do we need this ?
+    "a" : ["6.150.220.23"],
+    "tech" : ["Bootstrap:4.0.0", "HSTS", "Nginx"],
+    "words" : 436, # Do we need this ?
+    "lines" : 79, # Do we need this ?
+    "status_code" : 200,
+    "content_length" : 4597,
+    "failed" : false, # Do we need this ?
+    "knowledgebase" : { # Do we need this ?
+        "PageType" : "nonerror",
+        "pHash" : 0
     }
 }
 '''
\ No newline at end of file
diff --git a/ingestors/ingest_massdns.py b/ingestors/ingest_massdns.py
index 158fa7b..7f18aad 100644
--- a/ingestors/ingest_massdns.py
+++ b/ingestors/ingest_massdns.py
@@ -15,15 +15,15 @@ default_index = 'ptr-records'
 def construct_map() -> dict:
     '''Construct the Elasticsearch index mapping for MassDNS records'''
 
-    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
+    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
 
     mapping = {
         'mappings': {
             'properties': {
-                'ip' : { 'type' : 'ip' },
-                'name' : { 'type' : 'keyword' },
+                'ip' : { 'type': 'ip' },
+                'name' : { 'type': 'keyword' },
                 'record' : keyword_mapping,
-                'seen' : { 'type' : 'date' }
+                'seen' : { 'type': 'date' }
             }
         }
     }
diff --git a/ingestors/ingest_zone.py b/ingestors/ingest_zone.py
index 5089387..077355a 100644
--- a/ingestors/ingest_zone.py
+++ b/ingestors/ingest_zone.py
@@ -17,14 +17,14 @@ record_types = ('a','aaaa','caa','cdnskey','cds','cname','dnskey','ds','mx','na
 def construct_map() -> dict:
     '''Construct the Elasticsearch index mapping for zone file records.'''
 
-    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
+    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
 
     mapping = {
         'mappings': {
             'properties': {
-            'domain': keyword_mapping,
-            'records': { 'properties': {} },
-            'seen': {'type': 'date'}
+                'domain' : keyword_mapping,
+                'records' : { 'properties': {} },
+                'seen' : { 'type': 'date' }
             }
         }
     }
@@ -93,7 +93,7 @@ async def process_data(file_path: str):
             if record_type == 'nsec':
                 data = ' '.join([data.split()[0].rstrip('.'), *data.split()[1:]])
             elif record_type == 'soa':
-                data = ' '.join([part.rstrip('.') if '.' in part else part for part in data.split()])
+                data = ' '.join([part.rstrip('.') if '.' in part else part for part in data.split()])
             elif data.endswith('.'):
                 data = data.rstrip('.')
 