#!/usr/bin/env python
# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
# ingest_zone.py

import logging
import time

try:
    import aiofiles
except ImportError:
    raise ImportError('Missing required \'aiofiles\' library. (pip install aiofiles)')


# Set a default elasticsearch index if one is not provided
default_index = 'eris-zones'

# Known DNS record types found in zone files
record_types = ('a','aaaa','caa','cdnskey','cds','cname','dnskey','ds','mx','naptr','ns','nsec','nsec3','nsec3param','ptr','rrsig','rp','sshfp','soa','srv','txt','type65534')


def construct_map() -> dict:
    '''Construct the Elasticsearch index mapping for zone file records.'''

    # Match on exact value or full text search
    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }

    # Construct the index mapping
    mapping = {
        'mappings': {
            'properties': {
                'domain'  : keyword_mapping,
                'zone'    : { 'type': 'keyword' },
                'records' : { 'type': 'nested', 'properties': {} },
                'source'  : { 'type': 'keyword' },
                'seen'    : { 'type': 'date' }
            }
        }
    }

    # Add every known record type to the mapping dynamically to not clutter the code
    # (A/AAAA data is stored as an ip field, everything else as text with a keyword sub-field)
    for record_type in record_types:
        mapping['mappings']['properties']['records']['properties'][record_type] = {
            'type'       : 'nested',
            'properties' : {
                'data' : { 'type': 'ip' } if record_type in ('a','aaaa') else keyword_mapping,
                'ttl'  : { 'type': 'integer' }
            }
        }

    return mapping
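

# The helper below is a rough sketch of how the mapping could be applied before ingesting
# anything. It is not called anywhere in this script; the AsyncElasticsearch client instance
# and the 8.x-style `mappings=` keyword argument of the official `elasticsearch` library are
# assumptions rather than requirements of ERIS itself.
async def create_index(es, index: str = default_index):
    '''
    Create the zone index using the mapping from construct_map()
    (sketch, assumes an elasticsearch-py 8.x AsyncElasticsearch client)

    :param es: Connected AsyncElasticsearch instance (assumed to exist)
    :param index: Name of the index to create
    '''

    # Only create the index if it does not already exist
    if not await es.indices.exists(index=index):
        await es.indices.create(index=index, mappings=construct_map()['mappings'])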


async def process_data(file_path: str):
    '''
    Read and process the input file

    :param file_path: Path to the input file
    '''

    async with aiofiles.open(file_path) as input_file:
        # Initialize the cache
        last = None

        # Default source for the records
        source = 'czds'

        # Determine the zone name from the file path (e.g., /path/to/zones/com.eu.txt -> com.eu zone)
        zone = '.'.join(file_path.split('/')[-1].split('.')[:-1]) # Note: For now, this is the best way because we are not just ingesting TLD zone files, but entire zones for domains as well...

        # Read the input file line by line
        async for line in input_file:
            line = line.strip()

            # Sentinel value to indicate the end of a process (for closing out a FIFO stream)
            if line == '~eof':
                break

            # Skip empty lines
            if not line:
                continue

            # Skip comments, but detect AXFR transfers to change the source
            if line.startswith(';'):
                if 'DiG' in line and 'AXFR' in line: # Do we need to worry about case sensitivity? How can we store the nameserver as well?
                    source = 'axfr'
                continue

            # Split the line into its parts
            parts = line.split()

            # Ensure the line has at least 5 parts (domain, ttl, class, type, data)
            if len(parts) < 5:
                logging.warning(f'Invalid line: {line}')
                continue

            # Split the record into its parts
            domain, ttl, record_class, record_type, data = parts[0].rstrip('.').lower(), parts[1], parts[2].lower(), parts[3].lower(), ' '.join(parts[4:])

            # Ensure the TTL is a number
            if not ttl.isdigit():
                logging.warning(f'Invalid TTL: {ttl} with line: {line}')
                continue
            else:
                ttl = int(ttl)

            # Do not index other record classes (doubtful any CHAOS/HESIOD records will be found in zone files)
            if record_class != 'in':
                logging.warning(f'Unsupported record class: {record_class} with line: {line}')
                continue

            # Do not index other record types
            if record_type not in record_types:
                logging.warning(f'Unsupported record type: {record_type} with line: {line}')
                continue

            # Little tidying up for specific record types (removing trailing dots, etc)
            if record_type == 'nsec':
                data = ' '.join([data.split()[0].rstrip('.'), *data.split()[1:]])
            elif record_type == 'soa':
                data = ' '.join([part.rstrip('.') if '.' in part else part for part in data.split()])
            elif data.endswith('.'):
                data = data.rstrip('.')

            # Check if we are still processing the same domain
            if last:
                if domain == last['_doc']['domain']: # This record is for the same domain as the cached document
                    if record_type in last['_doc']['records']:
                        last['_doc']['records'][record_type].append({'ttl': ttl, 'data': data}) # Do we need to check for duplicate records?
                    else:
                        last['_doc']['records'][record_type] = [{'ttl': ttl, 'data': data}]
                    continue
                else:
                    yield last # Return the last document and start a new one

            # Cache the document
            last = {
                '_op_type'      : 'update',
                '_id'           : domain,
                '_index'        : default_index,
                '_doc'          : {
                    'domain'  : domain,
                    'zone'    : zone,
                    'records' : {record_type: [{'data': data, 'ttl': ttl}]},
                    'source'  : source,
                    'seen'    : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) # Zone files do not contain a timestamp, so we use the current time
                },
                'doc_as_upsert' : True # This will create the document if it does not exist
            }

        # Yield the final cached document once the end of the file (or the ~eof sentinel) is reached
        if last:
            yield last


async def test(input_path: str):
    '''
    Test the ingestion process

    :param input_path: Path to the input file
    '''

    async for document in process_data(input_path):
        print(document)



if __name__ == '__main__':
    import argparse
    import asyncio

    parser = argparse.ArgumentParser(description='Ingestor for ERIS')
    parser.add_argument('input_path', help='Path to the input file or directory')
    args = parser.parse_args()

    asyncio.run(test(args.input_path))


'''
Input:
    1001.vegas. 3600 in ns ns11.waterrockdigital.com.
    1001.vegas. 3600 in ns ns12.waterrockdigital.com.

Output:
    {
        '_id'     : '1001.vegas',
        '_index'  : 'eris-zones',
        '_source' : {
            'domain'  : '1001.vegas',
            'zone'    : 'vegas',
            'records' : {
                'ns': [
                    {'ttl': 3600, 'data': 'ns11.waterrockdigital.com'},
                    {'ttl': 3600, 'data': 'ns12.waterrockdigital.com'}
                ]
            },
            'source'  : 'czds',
            'seen'    : '2021-09-01T00:00:00Z'
        }
    }

Notes:
    How do we want to handle hashed NSEC3 records? Do we ingest them as they are, or crack the NSEC3 hashes first and ingest?
    Can an AXFR transfer return data out of order? If so, how do we handle that?
'''
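

# The action dictionaries yielded by process_data() are already shaped as bulk update actions.
# A rough sketch of how they could be streamed into Elasticsearch with the official client's
# async bulk helper is shown below; the connection details are assumptions, and the action
# format is taken as-is from process_data():
#
#   from elasticsearch import AsyncElasticsearch
#   from elasticsearch.helpers import async_bulk
#
#   async def ingest(file_path: str):
#       es = AsyncElasticsearch('https://localhost:9200') # Connection details are an assumption
#       try:
#           success, errors = await async_bulk(es, process_data(file_path), raise_on_error=False)
#           logging.info(f'Indexed {success} documents ({len(errors)} errors)')
#       finally:
#           await es.close()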