eris/ingestors/ingest_httpx.py

#!/usr/bin/env python
# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
# ingest_httpx.py

import json
import logging

try:
	import aiofiles
except ImportError:
	raise ImportError('Missing required \'aiofiles\' library. (pip install aiofiles)')


# Default Elasticsearch index used when one is not provided; process_data() writes it into each document's '_index'
default_index = 'eris-httpx'


def construct_map() -> dict:
	'''
	Construct the Elasticsearch index mapping for httpx records.

	:return: Index definition whose 'mappings.properties' describes every httpx field we index
	'''

	def text_field() -> dict:
		'''Build a fresh text field (full text search) with a keyword sub-field (exact match).'''

		# A new dict is returned on every call so the fields never share one object;
		# editing the mapping of a single field must not silently change the others.
		return { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }

	# Construct the index mapping
	mapping = {
		'mappings': {
			'properties': {
				'timestamp' : { 'type' : 'date' },
				'hash'      : {
					'properties': {
						'body_md5'       : { 'type': 'keyword' },
						'body_mmh3'      : { 'type': 'keyword' },
						'body_sha256'    : { 'type': 'keyword' },
						'body_simhash'   : { 'type': 'keyword' },
						'header_md5'     : { 'type': 'keyword' },
						'header_mmh3'    : { 'type': 'keyword' },
						'header_sha256'  : { 'type': 'keyword' },
						'header_simhash' : { 'type': 'keyword' }
					}
				},
				'port'               : { 'type': 'integer' },
				'url'                : text_field(),
				'final_url'          : text_field(),
				'input'              : text_field(),
				'title'              : text_field(),
				'scheme'             : { 'type': 'keyword' },
				'webserver'          : { 'type': 'keyword' },
				'body_preview'       : text_field(),
				'content_type'       : { 'type': 'keyword' },
				'method'             : { 'type': 'keyword' },
				'host'               : { 'type': 'ip' },
				'path'               : text_field(),
				'favicon'            : { 'type': 'keyword' },
				'favicon_path'       : text_field(),
				'a'                  : { 'type': 'ip' },
				'cname'              : text_field(),
				'aaaa'               : { 'type': 'ip' },
				'tech'               : text_field(),
				'words'              : { 'type': 'integer' },
				'lines'              : { 'type': 'integer' },
				'status_code'        : { 'type': 'integer' },
				'chain_status_codes' : { 'type': 'integer' },
				'content_length'     : { 'type': 'integer' }
			}
		}
	}

	return mapping


async def process_data(input_path: str):
	'''
	Read httpx JSON lines from the input file and yield one Elasticsearch document per record

	:param input_path: Path to the input file
	:return: Async generator of dicts shaped {'_index': default_index, '_source': record}
	'''

	async with aiofiles.open(input_path) as input_file:
		# Read the input file line by line
		async for line in input_file:
			line = line.strip()

			# Sentinel value to indicate the end of a process (for closing out a FIFO stream)
			if line == '~eof':
				break

			# Skip empty lines
			if not line:
				continue

			# Parse the JSON record (a bad line is logged and skipped, it does not abort the run)
			try:
				record = json.loads(line)
			except json.JSONDecodeError:
				logging.error(f'Failed to parse JSON record: {line}')
				continue

			# Everything below has to live inside the loop: when it sat after the loop only the
			# last parsed record was ever yielded, and an input without any valid record raised
			# UnboundLocalError on 'record'.

			# Hacky solution to maintain ISO 8601 format without fractional seconds or offsets
			# NOTE: the original UTC offset is discarded and 'Z' is appended as-is (no conversion to UTC)
			record['timestamp'] = record['timestamp'].split('.')[0] + 'Z'

			# Remove unnecessary fields we don't care about
			for item in ('failed', 'knowledgebase', 'time', 'csp'):
				record.pop(item, None)

			yield {'_index': default_index, '_source': record}


async def test(input_path: str):
	'''
	Debug helper: run the ingestor over a file and print every document it produces

	:param input_path: Path to the input file
	'''

	# Nothing is sent to Elasticsearch here, the documents are only echoed to stdout
	documents = process_data(input_path)

	async for doc in documents:
		print(doc)


if __name__ == '__main__':
	import argparse
	import asyncio

	# Standalone entry point: take the input path from the command line and run the test routine on it
	cli = argparse.ArgumentParser(description='Ingestor for ERIS')
	cli.add_argument('input_path', help='Path to the input file or directory')
	input_path = cli.parse_args().input_path

	asyncio.run(test(input_path))


'''
Deploy:
	go install -v github.com/projectdiscovery/httpx/cmd/httpx@latest 
	curl -s https://public-dns.info/nameservers.txt -o nameservers.txt
	httpx -l fulldomains.txt -t 200 -sc -location -favicon -title -bp -td -ip -cname -mc 200,201,301,302,303,307,308 -fr -r nameservers.txt -retries 2 -stream -sd -j -o fifo.json -v

Output:
	{
		"timestamp":"2024-01-14T13:08:15.117348474-05:00", # Rename to seen and remove fractional seconds and offset
		"hash": { # Do we need all of these ?
			"body_md5"       : "4ae9394eb98233b482508cbda3b33a66",
			"body_mmh3"      : "-4111954",
			"body_sha256"    : "89e06e8374353469c65adb227b158b265641b424fba7ddb2c67eef0c4c1280d3",
			"body_simhash"   : "9814303593401624250",
			"header_md5"     : "980366deb2b2fb5df2ad861fc63e79ce",
			"header_mmh3"    : "-813072798",
			"header_sha256"  : "39aea75ad548e38b635421861641ad1919ed3b103b17a33c41e7ad46516f736d",
			"header_simhash" : "10962523587435277678"
		},
		"port"           : "443",
		"url"            : "https://supernets.org", # Remove this and only use the input field as "domain" maybe
		"input"          : "supernets.org", # rename to domain
		"title"          : "SuperNETs",
		"scheme"         : "https",
		"webserver"      : "nginx",
		"body_preview"   : "SUPERNETS Home About Contact Donate Docs Network IRC Git Invidious Jitsi LibreX Mastodon Matrix Sup",
		"content_type"   : "text/html",
		"method"         : "GET", # Remove this
		"host"           : "51.89.151.158",
		"path"           : "/",
		"favicon"        : "-674048714",
		"favicon_path"   : "/i/favicon.png",
		"time"           : "592.907689ms", # Do we need this ?
		"a"              : ["6.150.220.23"],
		"tech"           : ["Bootstrap:4.0.0", "HSTS", "Nginx"],
		"words"          : 436, # Do we need this ?
		"lines"          : 79, # Do we need this ?
		"status_code"    : 200,
		"content_length" : 4597,
		"failed"         : false, # Do we need this ?
		"knowledgebase"  : { # Do we need this ?
			"PageType" : "nonerror",
			"pHash"    : 0
		}
	}
'''
Initial commit 2024-01-20 07:04:50 +00:00			`#!/usr/bin/env python`
			`# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)`
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`# ingest_httpx.py`
Initial commit 2024-01-20 07:04:50 +00:00
			`import json`
Added IXP ingestor, updated all other ingestors 2024-03-19 23:00:12 +00:00			`import logging`
Updated README, copied over consistencies across the ingestors, docstring updates to reflect on new arguments 2024-01-27 09:28:30 +00:00
Asyncronous refactorization pushed as main version 💯 2024-03-06 03:19:11 +00:00			`try:`
Added IXP ingestor, updated all other ingestors 2024-03-19 23:00:12 +00:00			`import aiofiles`
Asyncronous refactorization pushed as main version 💯 2024-03-06 03:19:11 +00:00			`except ImportError:`
Added IXP ingestor, updated all other ingestors 2024-03-19 23:00:12 +00:00			`raise ImportError('Missing required \'aiofiles\' library. (pip install aiofiles)')`


			`# Set a default elasticsearch index if one is not provided`
			`default_index = 'eris-httpx'`
Asyncronous refactorization pushed as main version 💯 2024-03-06 03:19:11 +00:00
Updated README, copied over consistencies across the ingestors, docstring updates to reflect on new arguments 2024-01-27 09:28:30 +00:00
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`def construct_map() -> dict:`
Added IXP ingestor, updated all other ingestors 2024-03-19 23:00:12 +00:00			`'''Construct the Elasticsearch index mapping for Masscan records.'''`

			`# Match on exact value or full text search`
			`keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }`

			`# Construct the index mapping`
			`mapping = {`
			`'mappings': {`
			`'properties': {`
			`'timestamp' : { 'type' : 'date' },`
			`'hash' : {`
			`'properties': {`
			`'body_md5' : { 'type': 'keyword' },`
			`'body_mmh3' : { 'type': 'keyword' },`
			`'body_sha256' : { 'type': 'keyword' },`
			`'body_simhash' : { 'type': 'keyword' },`
			`'header_md5' : { 'type': 'keyword' },`
			`'header_mmh3' : { 'type': 'keyword' },`
			`'header_sha256' : { 'type': 'keyword' },`
			`'header_simhash' : { 'type': 'keyword' }`
			`}`
			`},`
			`'port' : { 'type': 'integer' },`
			`'url' : keyword_mapping,`
			`'final_url' : keyword_mapping,`
			`'input' : keyword_mapping,`
			`'title' : keyword_mapping,`
			`'scheme' : { 'type': 'keyword' },`
			`'webserver' : { 'type': 'keyword' },`
			`'body_preview' : keyword_mapping,`
			`'content_type' : { 'type': 'keyword' },`
			`'method' : { 'type': 'keyword' },`
			`'host' : { 'type': 'ip' },`
			`'path' : keyword_mapping,`
			`'favicon' : { 'type': 'keyword' },`
			`'favicon_path' : keyword_mapping,`
			`'a' : { 'type': 'ip' },`
			`'cname' : keyword_mapping,`
			`'aaaa' : { 'type': 'ip' },`
			`'tech' : keyword_mapping,`
			`'words' : { 'type': 'integer' },`
			`'lines' : { 'type': 'integer' },`
			`'status_code' : { 'type': 'integer' },`
			`'chain_status_codes' : { 'type': 'integer' },`
			`'content_length' : { 'type': 'integer' }`
			`}`
			`}`
			`}`

			`return mapping`


			`async def process_data(input_path: str):`
			`'''`
			`Read and process the input file`

			`:param input_path: Path to the input file`
			`'''`

			`async with aiofiles.open(input_path) as input_file:`
			`# Read the input file line by line`
			`async for line in input_file:`
			`line = line.strip()`

			`# Sentinel value to indicate the end of a process (for closing out a FIFO stream)`
			`if line == '~eof':`
			`break`

			`# Skip empty lines`
			`if not line:`
			`continue`

			`# Parse the JSON record`
			`try:`
			`record = json.loads(line)`
			`except json.JSONDecodeError:`
			`logging.error(f'Failed to parse JSON record: {line}')`
			`continue`

			`# Hacky solution to maintain ISO 8601 format without milliseconds or offsets`
			`record['timestamp'] = record['timestamp'].split('.')[0] + 'Z'`

			`# Remove unnecessary fields we don't care about`
			`for item in ('failed', 'knowledgebase', 'time', 'csp'):`
			`if item in record:`
			`del record[item]`

			`yield {'_index': default_index, '_source': record}`
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00

Testing function added to every ingestor to debug directly. No more --dry-run needed. 2024-03-08 04:31:30 +00:00			`async def test(input_path: str):`
Added IXP ingestor, updated all other ingestors 2024-03-19 23:00:12 +00:00			`'''`
			`Test the ingestion process`

			`:param input_path: Path to the input file`
			`'''`

			`async for document in process_data(input_path):`
			`print(document)`
Testing function added to every ingestor to debug directly. No more --dry-run needed. 2024-03-08 04:31:30 +00:00


			`if __name__ == '__main__':`
Added IXP ingestor, updated all other ingestors 2024-03-19 23:00:12 +00:00			`import argparse`
			`import asyncio`
Testing function added to every ingestor to debug directly. No more --dry-run needed. 2024-03-08 04:31:30 +00:00
Added IXP ingestor, updated all other ingestors 2024-03-19 23:00:12 +00:00			`parser = argparse.ArgumentParser(description='Ingestor for ERIS')`
			`parser.add_argument('input_path', help='Path to the input file or directory')`
			`args = parser.parse_args()`
Testing function added to every ingestor to debug directly. No more --dry-run needed. 2024-03-08 04:31:30 +00:00
Added IXP ingestor, updated all other ingestors 2024-03-19 23:00:12 +00:00			`asyncio.run(test(args.input_path))`
Testing function added to every ingestor to debug directly. No more --dry-run needed. 2024-03-08 04:31:30 +00:00
Added IXP ingestor, updated all other ingestors 2024-03-19 23:00:12 +00:00

			`'''`
Testing function added to every ingestor to debug directly. No more --dry-run needed. 2024-03-08 04:31:30 +00:00			`Deploy:`
Added IXP ingestor, updated all other ingestors 2024-03-19 23:00:12 +00:00			`go install -v github.com/projectdiscovery/httpx/cmd/httpx@latest`
			`curl -s https://public-dns.info/nameservers.txt -o nameservers.txt`
			`httpx -l fulldomains.txt -t 200 -sc -location -favicon -title -bp -td -ip -cname -mc 200,201,301,302,303,307,308 -fr -r nameservers.txt -retries 2 -stream -sd -j -o fifo.json -v`

Testing function added to every ingestor to debug directly. No more --dry-run needed. 2024-03-08 04:31:30 +00:00			`Output:`
Added IXP ingestor, updated all other ingestors 2024-03-19 23:00:12 +00:00			`{`
			`"timestamp":"2024-01-14T13:08:15.117348474-05:00", # Rename to seen and remove milliseconds and offset`
			`"hash": { # Do we need all of these ?`
			`"body_md5" : "4ae9394eb98233b482508cbda3b33a66",`
			`"body_mmh3" : "-4111954",`
			`"body_sha256" : "89e06e8374353469c65adb227b158b265641b424fba7ddb2c67eef0c4c1280d3",`
			`"body_simhash" : "9814303593401624250",`
			`"header_md5" : "980366deb2b2fb5df2ad861fc63e79ce",`
			`"header_mmh3" : "-813072798",`
			`"header_sha256" : "39aea75ad548e38b635421861641ad1919ed3b103b17a33c41e7ad46516f736d",`
			`"header_simhash" : "10962523587435277678"`
			`},`
			`"port" : "443",`
			`"url" : "https://supernets.org", # Remove this and only use the input field as "domain" maybe`
			`"input" : "supernets.org", # rename to domain`
			`"title" : "SuperNETs",`
			`"scheme" : "https",`
			`"webserver" : "nginx",`
			`"body_preview" : "SUPERNETS Home About Contact Donate Docs Network IRC Git Invidious Jitsi LibreX Mastodon Matrix Sup",`
			`"content_type" : "text/html",`
			`"method" : "GET", # Remove this`
			`"host" : "51.89.151.158",`
			`"path" : "/",`
			`"favicon" : "-674048714",`
			`"favicon_path" : "/i/favicon.png",`
			`"time" : "592.907689ms", # Do we need this ?`
			`"a" : ["6.150.220.23"],`
			`"tech" : ["Bootstrap:4.0.0", "HSTS", "Nginx"],`
			`"words" : 436, # Do we need this ?`
			`"lines" : 79, # Do we need this ?`
			`"status_code" : 200,`
			`"content_length" : 4597,`
			`"failed" : false, # Do we need this ?`
			`"knowledgebase" : { # Do we need this ?`
			`"PageType" : "nonerror",`
			`"pHash" : 0`
			`}`
			`}`
			`'''`