eris/ingestors/ingest_zone.py

#!/usr/bin/env python
# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
# ingest_zone.py

import logging
import time

try:
	import aiofiles
except ImportError:
	raise ImportError('Missing required \'aiofiles\' library. (pip install aiofiles)')


# Fallback Elasticsearch index name, used when no other index is provided
default_index = 'eris-zones'

# DNS record types (lowercase) that are recognised when found in zone files
record_types = (
	'a', 'aaaa',
	'caa', 'cdnskey', 'cds', 'cname',
	'dnskey', 'ds',
	'mx',
	'naptr', 'ns', 'nsec', 'nsec3', 'nsec3param',
	'ptr',
	'rrsig', 'rp',
	'sshfp', 'soa', 'srv',
	'txt', 'type65534',
)


def construct_map() -> dict:
	'''
	Construct the Elasticsearch index mapping for zone file records.

	Only the address record types (a & aaaa) receive an explicit mapping inside
	the nested `records` field; no explicit mapping is declared for the other
	record types here.

	:return: Dictionary holding the `mappings` definition for the index
	'''

	# Match on exact value or full text search
	keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }

	# Address records are the only ones whose data is typed as an IP address.
	# NOTE(review): every other record type is presumably left to dynamic mapping - confirm this is intended.
	record_properties = {
		record_type: {
			'type'       : 'nested',
			'properties' : {
				'data' : { 'type': 'ip' },
				'ttl'  : { 'type': 'integer' }
			}
		}
		for record_type in record_types if record_type in ('a','aaaa')
	}

	# Construct the index mapping
	return {
		'mappings': {
			'properties': {
				'domain'  : keyword_mapping,
				'zone'    : { 'type': 'keyword' },
				'records' : { 'type': 'nested', 'properties': record_properties },
				'source'  : { 'type': 'keyword' },
				'seen'    : { 'type': 'date' }
			}
		}
	}


async def process_data(file_path: str):
	'''
	Read a zone file and yield Elasticsearch update actions.

	Each line is expected in the form `<domain> <ttl> <class> <type> <data...>`.
	Consecutive lines that share a domain are merged into a single document, which
	is yielded once a different domain is seen, the `~eof` sentinel is read, or the
	end of the input is reached.

	Blank lines, comments, and records with a non-numeric TTL, a class other than
	IN, or a record type missing from `record_types` are skipped (the latter three
	are logged as warnings).

	:param file_path: Path to the input file (the zone name is derived from the file name without its last extension)
	'''

	async with aiofiles.open(file_path) as input_file:

		# Document being assembled for the current domain (None until the first valid record)
		last = None

		# Default source for the records
		source = 'czds'

		# Determine the zone name from the file path (e.g., /path/to/zones/com.eu.txt -> com.eu zone)
		zone = '.'.join(file_path.split('/')[-1].split('.')[:-1])
		# Note: For now, this is the best way because we are not just ingesting TLD zone files, but entire zones for domains as well...

		# Read the input file line by line
		async for line in input_file:
			line = line.strip()

			# Sentinel value to indicate the end of a process (for closing out a FIFO stream)
			# The pending document is flushed after the loop, so we only need to stop reading here
			if line == '~eof':
				break

			# Skip empty lines
			if not line:
				continue

			# Skip comments (but detect AXFR transfers to change the source)
			if line.startswith(';'):
				if 'DiG' in line and 'AXFR' in line: # Do we need to worry about case sensitivity? How can we store the nameserver as well?
					source = 'axfr'
				continue

			# Split the line into its parts
			parts = line.split()

			# Ensure the line has at least 5 parts (domain, ttl, class, type, data)
			if len(parts) < 5:
				logging.warning(f'Invalid line: {line}')
				continue

			# Split the record into its parts (the data may contain spaces, so everything after the type is rejoined)
			domain       = parts[0].rstrip('.').lower()
			ttl          = parts[1]
			record_class = parts[2].lower()
			record_type  = parts[3].lower()
			data         = ' '.join(parts[4:])

			# Ensure the TTL is a number (isdecimal only accepts characters that int() can convert)
			if not ttl.isdecimal():
				logging.warning(f'Invalid TTL: {ttl} with line: {line}')
				continue

			ttl = int(ttl)

			# Do not index other record classes (doubtful any CHAOS/HESIOD records will be found in zone files)
			if record_class != 'in':
				logging.warning(f'Unsupported record class: {record_class} with line: {line}')
				continue

			# Do not index other record types
			if record_type not in record_types:
				logging.warning(f'Unsupported record type: {record_type} with line: {line}')
				continue

			# Little tidying up for specific record types (removing trailing dots, etc)
			if record_type == 'nsec':
				# Only the first field gets its trailing dot removed, the remaining fields are kept as they are
				next_domain, *type_list = data.split()
				data = ' '.join([next_domain.rstrip('.'), *type_list])
			elif record_type == 'soa':
				# Strip trailing dots from the dotted fields only
				data = ' '.join([part.rstrip('.') if '.' in part else part for part in data.split()])
			elif data.endswith('.'):
				data = data.rstrip('.')

			record = {'ttl': ttl, 'data': data}

			# Check if we are still processing the same domain
			if last is not None:
				if domain == last['doc']['domain']:
					last['doc']['records'].setdefault(record_type, []).append(record) # Do we need to check for duplicate records?
					continue

				# A new domain started, so the previous document is complete
				yield last

			# Cache the document
			last = {
				'_op_type' : 'update',
				'_id'      : domain,
				'_index'   : default_index,
				'doc'      : {
					'domain'  : domain,
					'zone'    : zone,
					'records' : {record_type: [record]},
					'source'  : source,
					'seen'    : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) # Zone files do not contain a timestamp, so we use the current time
				},
				'doc_as_upsert' : True # This will create the document if it does not exist
			}

		# Flush the final document (input ended or the sentinel was read), never yielding None for an empty input
		if last is not None:
			yield last


async def test(input_path: str):
	'''
	Debug helper that prints every document generated from the input file

	:param input_path: Path to the input file
	'''

	# Nothing is indexed here, the generated documents are only written to stdout
	documents = process_data(input_path)

	async for doc in documents:
		print(doc)


if __name__ == '__main__':
	import argparse
	import asyncio

	# Running the file directly prints the documents generated from the given input path
	cli = argparse.ArgumentParser(description='Ingestor for ERIS')
	cli.add_argument('input_path', help='Path to the input file or directory')
	input_path = cli.parse_args().input_path

	asyncio.run(test(input_path))


'''
Output:
	1001.vegas. 3600 in ns ns11.waterrockdigital.com.
	1001.vegas. 3600 in ns ns12.waterrockdigital.com.

Input:
	{
		'_id'     : '1001.vegas',
		'_index'  : 'eris-zones',
		'_source' : {
			'domain'  : '1001.vegas',
			'zone'    : 'vegas',
			'records' : {
				'ns': [
					{'ttl': 3600, 'data': 'ns11.waterrockdigital.com'},
					{'ttl': 3600, 'data': 'ns12.waterrockdigital.com'}
				]
			},
			'source'  : 'czds',
			'seen'    : '2021-09-01T00:00:00Z'
		}
	}

Notes:
	How do we want to handle hashed NSEC3 records? Do we ingest them as they are, or crack the NSEC3 hashes first and ingest?
	Can an AXFR transfer return data out of order? If so, how do we handle that?
'''
Initial commit 2024-01-20 07:04:50 +00:00			`#!/usr/bin/env python`
			`# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)`
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`# ingest_zone.py`
Initial commit 2024-01-20 07:04:50 +00:00
Overall code cleanup 2024-03-08 05:07:26 +00:00			`import logging`
Initial commit 2024-01-20 07:04:50 +00:00			`import time`

Asyncronous refactorization pushed as main version 💯 2024-03-06 03:19:11 +00:00			`try:`
Code cleanup 2024-03-12 02:33:18 +00:00			`import aiofiles`
Asyncronous refactorization pushed as main version 💯 2024-03-06 03:19:11 +00:00			`except ImportError:`
Code cleanup 2024-03-12 02:33:18 +00:00			`raise ImportError('Missing required \'aiofiles\' library. (pip install aiofiles)')`
Asyncronous refactorization pushed as main version 💯 2024-03-06 03:19:11 +00:00
Zone file ingestion script now uses the same sentinal value as masscans ingestion, set document id as the domain name to allow updating records if they exist 2024-03-06 19:12:27 +00:00
Code cleanup 2024-03-12 02:33:18 +00:00			`# Set a default elasticsearch index if one is not provided`
Records stored as a nested type for better querying, added source field so we can identify where the zone data derived (icann, axfr, breach, etc) 2024-03-23 17:46:38 +00:00			`default_index = 'eris-zones'`
Code cleanup 2024-03-12 02:33:18 +00:00
			`# Known DNS record types found in zone files`
Zone file ingestion script now uses the same sentinal value as masscans ingestion, set document id as the domain name to allow updating records if they exist 2024-03-06 19:12:27 +00:00			`record_types = ('a','aaaa','caa','cdnskey','cds','cname','dnskey','ds','mx','naptr','ns','nsec','nsec3','nsec3param','ptr','rrsig','rp','sshfp','soa','srv','txt','type65534')`

Initial commit 2024-01-20 07:04:50 +00:00
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`def construct_map() -> dict:`
Code cleanup 2024-03-12 02:33:18 +00:00			`'''Construct the Elasticsearch index mapping for zone file records.'''`

			`# Match on exact value or full text search`
			`keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }`

			`# Construct the index mapping`
			`mapping = {`
			`'mappings': {`
			`'properties': {`
			`'domain' : keyword_mapping,`
Added zone field too better querying on specific tlds/zones 2024-03-23 18:31:39 +00:00			`'zone' : { 'type': 'keyword' },`
Records stored as a nested type for better querying, added source field so we can identify where the zone data derived (icann, axfr, breach, etc) 2024-03-23 17:46:38 +00:00			`'records' : { 'type': 'nested', 'properties': {} },`
			`'source' : { 'type': 'keyword' },`
Code cleanup 2024-03-12 02:33:18 +00:00			`'seen' : { 'type': 'date' }`
			`}`
			`}`
			`}`

			`# Add record types to mapping dynamically to not clutter the code`
			`for record_type in record_types:`
			`if record_type in ('a','aaaa'):`
			`mapping['mappings']['properties']['records']['properties'][record_type] = {`
Records stored as a nested type for better querying, added source field so we can identify where the zone data derived (icann, axfr, breach, etc) 2024-03-23 17:46:38 +00:00			`'type' : 'nested',`
			`'properties' : {`
			`'data' : { 'type': 'ip' if record_type in ('a','aaaa') else keyword_mapping },`
			`'ttl' : { 'type': 'integer' }`
Code cleanup 2024-03-12 02:33:18 +00:00			`}`
			`}`

			`return mapping`
Updated README, copied over consistencies across the ingestors, docstring updates to reflect on new arguments 2024-01-27 09:28:30 +00:00
Initial commit 2024-01-20 07:04:50 +00:00
Asyncronous refactorization pushed as main version 💯 2024-03-06 03:19:11 +00:00			`async def process_data(file_path: str):`
Code cleanup 2024-03-12 02:33:18 +00:00			`'''`
			`Read and process the input file`

			`:param input_path: Path to the input file`
			`'''`

			`async with aiofiles.open(file_path) as input_file:`

			`# Initialize the cache`
			`last = None`

AXFR detection in zone file input, to determine if the data is form ICANN or an AXFR against a dns (for source tracking) 2024-03-23 18:42:51 +00:00			`# Default source for the records`
			`source = 'czds'`

Added zone field too better querying on specific tlds/zones 2024-03-23 18:31:39 +00:00			`# Determine the zone name from the file path (e.g., /path/to/zones/com.eu.txt -> com.eu zone)`
			`zone = '.'.join(file_path.split('/')[-1].split('.')[:-1])`
Added a comment about the methodology of determining the zone the data is from. 2024-03-23 18:48:45 +00:00			`# Note: For now, this is the best way because we are not just ingesting TLD zone files, but entire zones for domains aswell...`
Added zone field too better querying on specific tlds/zones 2024-03-23 18:31:39 +00:00
Code cleanup 2024-03-12 02:33:18 +00:00			`# Read the input file line by line`
			`async for line in input_file:`
			`line = line.strip()`

			`# Sentinel value to indicate the end of a process (for closing out a FIFO stream)`
			`if line == '~eof':`
			`yield last`
			`break`

			`# Skip empty lines and comments`
AXFR detection in zone file input, to determine if the data is form ICANN or an AXFR against a dns (for source tracking) 2024-03-23 18:42:51 +00:00			`if not line:`
Code cleanup 2024-03-12 02:33:18 +00:00			`continue`

Updated example record comment under source to reflect on the new record structure 2024-03-23 18:46:44 +00:00			`# Skip comments but detect AXFR transfers to change the source)`
AXFR detection in zone file input, to determine if the data is form ICANN or an AXFR against a dns (for source tracking) 2024-03-23 18:42:51 +00:00			`if line.startswith(';'):`
Added more comments and thoughts 2024-03-23 19:31:15 +00:00			`if 'DiG' in line and 'AXFR' in line: # Do we need to worry about case sensitivity? How can we store the nameserver aswell?`
AXFR detection in zone file input, to determine if the data is form ICANN or an AXFR against a dns (for source tracking) 2024-03-23 18:42:51 +00:00			`source = 'axfr'`
			`continue`
Fixed _doc to doc in record 2024-03-23 21:26:55 +00:00
Code cleanup 2024-03-12 02:33:18 +00:00			`# Split the line into its parts`
			`parts = line.split()`

			`# Ensure the line has at least 3 parts`
			`if len(parts) < 5:`
			`logging.warning(f'Invalid line: {line}')`
			`continue`

			`# Split the record into its parts`
			`domain, ttl, record_class, record_type, data = parts[0].rstrip('.').lower(), parts[1], parts[2].lower(), parts[3].lower(), ' '.join(parts[4:])`

			`# Ensure the TTL is a number`
			`if not ttl.isdigit():`
			`logging.warning(f'Invalid TTL: {ttl} with line: {line}')`
			`continue`
			`else:`
			`ttl = int(ttl)`

			`# Do not index other record classes (doubtful any CHAOS/HESIOD records will be found in zone files)`
			`if record_class != 'in':`
			`logging.warning(f'Unsupported record class: {record_class} with line: {line}')`
			`continue`

			`# Do not index other record types`
			`if record_type not in record_types:`
			`logging.warning(f'Unsupported record type: {record_type} with line: {line}')`
			`continue`

			`# Little tidying up for specific record types (removing trailing dots, etc)`
			`if record_type == 'nsec':`
			`data = ' '.join([data.split()[0].rstrip('.'), *data.split()[1:]])`
			`elif record_type == 'soa':`
			`data = ' '.join([part.rstrip('.') if '.' in part else part for part in data.split()])`
			`elif data.endswith('.'):`
			`data = data.rstrip('.')`

			`# Check if we are still processing the same domain`
			`if last:`
Fixed _doc to doc in record 2024-03-23 21:26:55 +00:00			`if domain == last['doc']['domain']:`
			`if record_type in last['doc']['records']:`
			`last['doc']['records'][record_type].append({'ttl': ttl, 'data': data}) # Do we need to check for duplicate records?`
Code cleanup 2024-03-12 02:33:18 +00:00			`else:`
Fixed _doc to doc in record 2024-03-23 21:26:55 +00:00			`last['doc']['records'][record_type] = [{'ttl': ttl, 'data': data}]`
Code cleanup 2024-03-12 02:33:18 +00:00			`continue`
			`else:`
Fixed _doc to doc in record 2024-03-23 21:26:55 +00:00			`yield last`
Code cleanup 2024-03-12 02:33:18 +00:00
			`# Cache the document`
			`last = {`
			`'_op_type' : 'update',`
			`'_id' : domain,`
			`'_index' : default_index,`
Fixed _doc to doc in record 2024-03-23 21:26:55 +00:00			`'doc' : {`
Code cleanup 2024-03-12 02:33:18 +00:00			`'domain' : domain,`
Added zone field too better querying on specific tlds/zones 2024-03-23 18:31:39 +00:00			`'zone' : zone,`
Records stored as a nested type for better querying, added source field so we can identify where the zone data derived (icann, axfr, breach, etc) 2024-03-23 17:46:38 +00:00			`'records' : {record_type: [{'data': data, 'ttl': ttl}]},`
AXFR detection in zone file input, to determine if the data is form ICANN or an AXFR against a dns (for source tracking) 2024-03-23 18:42:51 +00:00			`'source' : source,`
Code cleanup 2024-03-12 02:33:18 +00:00			`'seen' : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) # Zone files do not contain a timestamp, so we use the current time`
			`},`
			`'doc_as_upsert' : True # This will create the document if it does not exist`
			`}`
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00

Testing function added to every ingestor to debug directly. No more --dry-run needed. 2024-03-08 04:31:30 +00:00			`async def test(input_path: str):`
Code cleanup 2024-03-12 02:33:18 +00:00			`'''`
			`Test the ingestion process`

			`:param input_path: Path to the input file`
			`'''`

			`async for document in process_data(input_path):`
			`print(document)`
Testing function added to every ingestor to debug directly. No more --dry-run needed. 2024-03-08 04:31:30 +00:00


			`if __name__ == '__main__':`
Code cleanup 2024-03-12 02:33:18 +00:00			`import argparse`
			`import asyncio`
Testing function added to every ingestor to debug directly. No more --dry-run needed. 2024-03-08 04:31:30 +00:00
Code cleanup 2024-03-12 02:33:18 +00:00			`parser = argparse.ArgumentParser(description='Ingestor for ERIS')`
			`parser.add_argument('input_path', help='Path to the input file or directory')`
			`args = parser.parse_args()`

			`asyncio.run(test(args.input_path))`
Testing function added to every ingestor to debug directly. No more --dry-run needed. 2024-03-08 04:31:30 +00:00

Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00
			`'''`
Testing function added to every ingestor to debug directly. No more --dry-run needed. 2024-03-08 04:31:30 +00:00			`Output:`
Code cleanup 2024-03-12 02:33:18 +00:00			`1001.vegas. 3600 in ns ns11.waterrockdigital.com.`
			`1001.vegas. 3600 in ns ns12.waterrockdigital.com.`
Testing function added to every ingestor to debug directly. No more --dry-run needed. 2024-03-08 04:31:30 +00:00
			`Input:`
Code cleanup 2024-03-12 02:33:18 +00:00			`{`
			`'_id' : '1001.vegas'`
			`'_index' : 'dns-zones',`
			`'_source' : {`
			`'domain' : '1001.vegas',`
Updated example record comment under source to reflect on the new record structure 2024-03-23 18:46:44 +00:00			`'zone' : 'vegas',`
Code cleanup 2024-03-12 02:33:18 +00:00			`'records' : {`
			`'ns': [`
			`{'ttl': 3600, 'data': 'ns11.waterrockdigital.com'},`
			`{'ttl': 3600, 'data': 'ns12.waterrockdigital.com'}`
			`]`
			`},`
Updated example record comment under source to reflect on the new record structure 2024-03-23 18:46:44 +00:00			`'source' : 'czds',`
Code cleanup 2024-03-12 02:33:18 +00:00			`'seen' : '2021-09-01T00:00:00Z'`
			`}`
			`}`
Added a note about NSEC3 hash cracking for zone file data 2024-03-06 19:16:05 +00:00
			`Notes:`
Code cleanup 2024-03-12 02:33:18 +00:00			`How do we want to handle hashed NSEC3 records? Do we ignest them as they are, or crack the NSEC3 hashes first and ingest?`
Added more comments and thoughts 2024-03-23 19:31:15 +00:00			`Can an AXFR transfer return data out of order? If so, how do we handle that?`
Code cleanup 2024-03-12 02:33:18 +00:00			`'''`