diff --git a/ingestors/ingest_httpx.py b/ingestors/ingest_httpx.py
index 795b6ff..1579bf5 100644
--- a/ingestors/ingest_httpx.py
+++ b/ingestors/ingest_httpx.py
@@ -64,7 +64,7 @@ async def process_data(file_path: str):
     :param file_path: Path to the HTTPX log file
     '''
 
-    async with aiofiles.open(file_path, mode='r') as input_file:
+    async with aiofiles.open(file_path) as input_file:
         async for line in input_file:
             line = line.strip()
 
diff --git a/ingestors/ingest_masscan.py b/ingestors/ingest_masscan.py
index 546ae01..d5d3acb 100644
--- a/ingestors/ingest_masscan.py
+++ b/ingestors/ingest_masscan.py
@@ -59,7 +59,7 @@ async def process_data(file_path: str):
     :param file_path: Path to the Masscan log file
     '''
 
-    async with aiofiles.open(file_path, mode='r') as input_file:
+    async with aiofiles.open(file_path) as input_file:
         async for line in input_file:
             line = line.strip()
 
diff --git a/ingestors/ingest_massdns.py b/ingestors/ingest_massdns.py
index 91ae3c7..7066f3e 100644
--- a/ingestors/ingest_massdns.py
+++ b/ingestors/ingest_massdns.py
@@ -39,7 +39,7 @@ async def process_data(file_path: str):
     :param file_path: Path to the Massdns log file
     '''
 
-    async with aiofiles.open(file_path, mode='r') as input_file:
+    async with aiofiles.open(file_path) as input_file:
 
         last = None
 
@@ -152,7 +152,7 @@ Output:
 Input:
 {
     "_id"    : "47.229.6.0"
-    "_index" : "ptr-records",
+    "_index" : "eris-massdns",
     "_source" : {
         "ip"     : "47.229.6.0",
         "record" : "047-229-006-000.res.spectrum.com", # This will be a list if there are more than one PTR record
@@ -161,6 +161,6 @@ Input:
 }
 
 Notes:
-- Why do some IP addresses return a CNAME from a PTR request
-- What is dns-servfail.net (Frequent CNAME response from PTR requests)
+Why do some IP addresses return a CNAME from a PTR request
+What is dns-servfail.net (Frequent CNAME response from PTR requests)
 '''
\ No newline at end of file
diff --git a/ingestors/ingest_zone.py b/ingestors/ingest_zone.py
index 35c995b..ce97a0f 100644
--- a/ingestors/ingest_zone.py
+++ b/ingestors/ingest_zone.py
@@ -2,6 +2,7 @@
 # Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
 # ingest_zone.py
 
+import logging
 import time
 
 try:
@@ -56,15 +57,17 @@ async def process_data(file_path: str):
     :param file_path: Path to the zone file
     '''
 
-    domain_records = {}
-    last_domain    = None
+    async with aiofiles.open(file_path) as input_file:
+
+        last = None
 
-    async with aiofiles.open(file_path, mode='r') as input_file:
         async for line in input_file:
             line = line.strip()
 
             if line == '~eof': # Sentinel value to indicate the end of a process (Used with --watch with FIFO)
-                break
+                if last: # Flush the pending document before stopping (async generators cannot return a value)
+                    yield last
+                return
 
             if not line or line.startswith(';'):
                 continue
@@ -72,22 +75,26 @@ async def process_data(file_path: str):
             parts = line.split()
 
             if len(parts) < 5:
-                raise ValueError(f'Invalid line: {line}')
+                logging.warning(f'Invalid line: {line}')
+                continue
 
             domain, ttl, record_class, record_type, data = parts[0].rstrip('.').lower(), parts[1], parts[2].lower(), parts[3].lower(), ' '.join(parts[4:])
 
             if not ttl.isdigit():
-                raise ValueError(f'Invalid TTL: {ttl} with line: {line}')
+                logging.warning(f'Invalid TTL: {ttl} with line: {line}')
+                continue
 
             ttl = int(ttl)
 
             # Anomaly...Doubtful any CHAOS/HESIOD records will be found in zone files
             if record_class != 'in':
-                raise ValueError(f'Unsupported record class: {record_class} with line: {line}')
+                logging.warning(f'Unsupported record class: {record_class} with line: {line}')
+                continue
 
             # We do not want to collide with our current mapping (Again, this is an anomaly)
             if record_type not in record_types:
-                raise ValueError(f'Unsupported record type: {record_type} with line: {line}')
+                logging.warning(f'Unsupported record type: {record_type} with line: {line}')
+                continue
 
             # Little tidying up for specific record types (removing trailing dots, etc)
             if record_type == 'nsec':
@@ -97,26 +104,27 @@ async def process_data(file_path: str):
             elif data.endswith('.'):
                 data = data.rstrip('.')
 
-            if domain != last_domain:
-                if last_domain:
-                    struct = {
-                        'domain'  : last_domain,
-                        'records' : domain_records[last_domain],
-                        'seen'    : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) # Zone files do not contain a timestamp, so we use the current time
-                    }
-
-                    del domain_records[last_domain]
+            if last:
+                if domain == last['domain']:
+                    if record_type in last['_doc']['records']:
+                        last['_doc']['records'][record_type].append({'ttl': ttl, 'data': data}) # Do we need to check for duplicate records?
+                    else:
+                        last['_doc']['records'][record_type] = [{'ttl': ttl, 'data': data}]
+                    continue
+                else:
+                    yield last
 
-                    yield {'_id': domain, '_index': default_index, '_source': struct} # Set the ID to the domain name to allow the record to be reindexed if it exists.
-
-                last_domain = domain
-
-                domain_records[domain] = {}
-
-            if record_type not in domain_records[domain]:
-                domain_records[domain][record_type] = []
-
-            domain_records[domain][record_type].append({'ttl': ttl, 'data': data})
+            last = {
+                '_op_type'      : 'update',
+                '_id'           : domain,
+                '_index'        : default_index,
+                '_doc'          : {
+                    'domain'  : domain,
+                    'records' : {record_type: [{'ttl': ttl, 'data': data}]},
+                    'seen'    : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) # Zone files do not contain a timestamp, so we use the current time
+                },
+                'doc_as_upsert' : True # This will create the document if it does not exist
+            }
 
 
 async def test(input_path: str):
@@ -144,22 +152,22 @@ if __name__ == '__main__':
 
 '''
 Output:
-    1001.vegas. 3600 in ns ns11.waterrockdigital.com.
-    1001.vegas. 3600 in ns ns12.waterrockdigital.com.
+    1001.vegas.  3600  in  ns  ns11.waterrockdigital.com.
+    1001.vegas.  3600  in  ns  ns12.waterrockdigital.com.
 
 Input:
 {
-    "_id"     : "1001.vegas"
-    "_index"  : "dns-zones",
-    "_source" : {
-        "domain"  : "1001.vegas",
-        "records" : {
-            "ns": [
-                {"ttl": 3600, "data": "ns11.waterrockdigital.com"},
-                {"ttl": 3600, "data": "ns12.waterrockdigital.com"}
+    '_id'     : '1001.vegas'
+    '_index'  : 'dns-zones',
+    '_source' : {
+        'domain'  : '1001.vegas',
+        'records' : {
+            'ns': [
+                {'ttl': 3600, 'data': 'ns11.waterrockdigital.com'},
+                {'ttl': 3600, 'data': 'ns12.waterrockdigital.com'}
             ]
         },
-        "seen" : "2021-09-01T00:00:00Z"
+        'seen' : '2021-09-01T00:00:00Z'
 }
 }
 '''
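Usage note: with this patch, process_data() in ingest_zone.py becomes an async generator that yields one update/doc_as_upsert bulk action per domain instead of buffering the whole zone in a domain_records dict. A minimal sketch of a throwaway driver for eyeballing the streamed actions follows; the 'sample.zone' path is a placeholder, the import assumes it is run from the ingestors/ directory, and the actions are presumably consumed by the Elasticsearch bulk indexer elsewhere in ERIS, which this diff does not touch.

import asyncio

from ingest_zone import process_data  # assumes ingestors/ is the working directory

async def main():
    # Drive the async generator and print each upsert action as it streams out
    async for action in process_data('sample.zone'):  # placeholder input path
        print(action['_id'], sorted(action['_doc']['records']))

if __name__ == '__main__':
    asyncio.run(main())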