Code cleanup

Dionysus 2024-03-11 22:33:18 -04:00
parent b018da4e4d
commit 87f2cf27ea
Signed by: acidvegas
GPG Key ID: EF4B922DB85DC9DE
3 changed files with 352 additions and 421 deletions

View File

@@ -5,231 +5,151 @@
import asyncio
import json
import logging
import time

try:
    import websockets
except ImportError:
    raise ImportError('Missing required \'websockets\' library. (pip install websockets)')


# Set a default elasticsearch index if one is not provided
default_index = 'cert-stream'
def construct_map() -> dict:
    '''Construct the Elasticsearch index mapping for Certstream records.'''

    # Match on exact value or full text search
    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }

    # Construct the index mapping
    mapping = {
        'mappings': {
            'properties' : {
                'domain' : keyword_mapping,
                'seen'   : { 'type': 'date' }
            }
        }
    }

    return mapping
async def process_data(place_holder: str = None):
    '''
    Read and process Certstream records live from the Websocket stream.

    :param place_holder: Placeholder parameter to match the process_data function signature of other ingestors.
    '''

    while True:
        try:
            async with websockets.connect('wss://certstream.calidog.io') as websocket:
                while True:
                    # Read a line from the websocket
                    line = await websocket.recv()

                    # Parse the JSON record
                    try:
                        record = json.loads(line)
                    except json.decoder.JSONDecodeError:
                        logging.error(f'Invalid line from the websocket: {line}')
                        continue

                    # Grab the unique domains from the record (excluding wildcards)
                    domains = record['data']['leaf_cert']['all_domains']
                    domains = set([domain[2:] if domain.startswith('*.') else domain for domain in domains])

                    # Construct the document
                    for domain in domains:
                        struct = {
                            'domain' : domain,
                            'seen'   : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
                        }

                        yield {'_id': domain, '_index': default_index, '_source': struct}

        except websockets.ConnectionClosed:
            logging.error('Connection to Certstream was closed. Attempting to reconnect...')
            await asyncio.sleep(15)

        except Exception as e:
            logging.error(f'An error occurred while processing Certstream records! ({e})')
            break


async def test():
    '''Test the ingestion process.'''

    async for document in process_data():
        print(document)


if __name__ == '__main__':
    import asyncio

    asyncio.run(test())
'''
Output:
    {
        "data": {
            "cert_index": 43061646,
            "cert_link": "https://yeti2025.ct.digicert.com/log/ct/v1/get-entries?start=43061646&end=43061646",
            "leaf_cert": {
                "all_domains": [
                    "*.d7zdnegbre53n.amplifyapp.com",
                    "d7zdnegbre53n.amplifyapp.com"
                ],
                "extensions": {
                    "authorityInfoAccess"    : "CA Issuers - URI:http://crt.r2m02.amazontrust.com/r2m02.cer\nOCSP - URI:http://ocsp.r2m02.amazontrust.com\n",
                    "authorityKeyIdentifier" : "keyid:C0:31:52:CD:5A:50:C3:82:7C:74:71:CE:CB:E9:9C:F9:7A:EB:82:E2\n",
                    "basicConstraints"       : "CA:FALSE",
                    "certificatePolicies"    : "Policy: 2.23.140.1.2.1",
                    "crlDistributionPoints"  : "Full Name:\n URI:http://crl.r2m02.amazontrust.com/r2m02.crl",
                    "ctlPoisonByte"          : true,
                    "extendedKeyUsage"       : "TLS Web server authentication, TLS Web client authentication",
                    "keyUsage"               : "Digital Signature, Key Encipherment",
                    "subjectAltName"         : "DNS:d7zdnegbre53n.amplifyapp.com, DNS:*.d7zdnegbre53n.amplifyapp.com",
                    "subjectKeyIdentifier"   : "59:32:78:2A:11:03:62:55:BB:3B:B9:80:24:76:28:90:2E:D1:A4:56"
                },
                "fingerprint": "D9:05:A3:D5:AA:F9:68:BC:0C:0A:15:69:C9:5E:11:92:32:67:4F:FA",
                "issuer": {
                    "C"            : "US",
                    "CN"           : "Amazon RSA 2048 M02",
                    "L"            : null,
                    "O"            : "Amazon",
                    "OU"           : null,
                    "ST"           : null,
                    "aggregated"   : "/C=US/CN=Amazon RSA 2048 M02/O=Amazon",
                    "emailAddress" : null
                },
                "not_after"           : 1743811199,
                "not_before"          : 1709596800,
                "serial_number"       : "FDB450C1942E3D30A18737063449E62",
                "signature_algorithm" : "sha256, rsa",
                "subject": {
                    "C"            : null,
                    "CN"           : "*.d7zdnegbre53n.amplifyapp.com",
                    "L"            : null,
                    "O"            : null,
                    "OU"           : null,
                    "ST"           : null,
                    "aggregated"   : "/CN=*.d7zdnegbre53n.amplifyapp.com",
                    "emailAddress" : null
                }
            },
            "seen": 1709651773.594684,
            "source": {
                "name" : "DigiCert Yeti2025 Log",
                "url"  : "https://yeti2025.ct.digicert.com/log/"
            },
            "update_type": "PrecertLogEntry"
        },
        "message_type": "certificate_update"
    }
'''
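
The Certstream ingestor above only yields bulk actions; the Elasticsearch client wiring lives elsewhere in ERIS and is not part of this diff. As a rough sketch of how the yielded documents could be indexed with the official async helpers, assuming elasticsearch-py 8.x with the async extra, a placeholder host and credentials, and a hypothetical ingest_certstream module name:

import asyncio
import logging

from elasticsearch import AsyncElasticsearch          # pip install elasticsearch[async]
from elasticsearch.helpers import async_streaming_bulk

# Hypothetical module name -- the actual file name is not shown in this diff
from ingest_certstream import construct_map, default_index, process_data


async def ingest():
    # Placeholder host and credentials
    es = AsyncElasticsearch('https://localhost:9200', basic_auth=('elastic', 'changeme'))

    # Create the index with the mapping from construct_map() if it does not exist yet
    if not await es.indices.exists(index=default_index):
        await es.indices.create(index=default_index, mappings=construct_map()['mappings'])

    # Stream the documents yielded by process_data() straight into the bulk helper
    async for ok, result in async_streaming_bulk(es, process_data(), raise_on_error=False):
        if not ok:
            logging.error(f'Failed to index document: {result}')

    await es.close()


if __name__ == '__main__':
    asyncio.run(ingest())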

View File

@@ -6,9 +6,9 @@ import logging
import time

try:
    import aiofiles
except ImportError:
    raise ImportError('Missing required \'aiofiles\' library. (pip install aiofiles)')


# Set a default elasticsearch index if one is not provided
@@ -16,154 +16,154 @@ default_index = 'eris-massdns'
def construct_map() -> dict:
    '''Construct the Elasticsearch index mapping for records'''

    # Match on exact value or full text search
    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }

    # Construct the index mapping
    mapping = {
        'mappings': {
            'properties': {
                'ip'     : { 'type': 'ip' },
                'record' : keyword_mapping,
                'seen'   : { 'type': 'date' }
            }
        }
    }

    return mapping


async def process_data(input_path: str):
    '''
    Read and process the input file

    :param input_path: Path to the input file
    '''

    async with aiofiles.open(input_path) as input_file:
        # Cache the last document to avoid creating a new one for the same IP address
        last = None

        try:
            # Read the input file line by line
            async for line in input_file:
                line = line.strip()

                # Sentinel value to indicate the end of a process (for closing out a FIFO stream)
                if line == '~eof':
                    yield last
                    break

                # Skip empty lines (doubtful we will have any, but just in case)
                if not line:
                    continue

                # Split the line into its parts
                parts = line.split()

                # Ensure the line has at least 3 parts
                if len(parts) < 3:
                    logging.warning(f'Invalid PTR record: {line}')
                    continue

                # Split the PTR record into its parts
                name, record_type, record = parts[0].rstrip('.'), parts[1], ' '.join(parts[2:]).rstrip('.')

                # Do not index other records
                if record_type != 'PTR':
                    continue

                # Do not index PTR records that do not have a record
                if not record:
                    continue

                # Do not index PTR records that have the same record as the in-addr.arpa domain
                if record == name:
                    continue

                # Get the IP address from the in-addr.arpa domain
                ip = '.'.join(name.replace('.in-addr.arpa', '').split('.')[::-1])

                # Check if we are still processing the same IP address
                if last:
                    if ip == last['_id']: # This record is for the same IP address as the cached document
                        last_records = last['doc']['record']
                        if record not in last_records: # Do not index duplicate records
                            last['doc']['record'].append(record)
                        continue
                    else:
                        yield last # Return the last document and start a new one

                # Cache the document
                last = {
                    '_op_type' : 'update',
                    '_id'      : ip,
                    '_index'   : default_index,
                    'doc'      : {
                        'ip'     : ip,
                        'record' : [record], # Consider using painless script to add to list if it exists (Use 'seen' per-record and 'last_seen' for the IP address)
                        'seen'   : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
                    },
                    'doc_as_upsert' : True # Create the document if it does not exist
                }

        except Exception as e:
            logging.error(f'Error processing data: {e}')


async def test(input_path: str):
    '''
    Test the ingestion process

    :param input_path: Path to the input file
    '''

    async for document in process_data(input_path):
        print(document)


if __name__ == '__main__':
    import argparse
    import asyncio

    parser = argparse.ArgumentParser(description='Ingestor for ERIS')
    parser.add_argument('input_path', help='Path to the input file or directory')
    args = parser.parse_args()

    asyncio.run(test(args.input_path))
'''
Deployment:
    sudo apt-get install build-essential gcc make
    git clone --depth 1 https://github.com/blechschmidt/massdns.git $HOME/massdns && cd $HOME/massdns && make
    curl -s https://public-dns.info/nameservers.txt | grep -v ':' > $HOME/massdns/nameservers.txt
    python3 ./scripts/ptr.py | ./bin/massdns -r $HOME/massdns/nameservers.txt -t PTR --filter NOERROR -o S -w $HOME/massdns/fifo.json

    or...

    while true; do python ./scripts/ptr.py | ./bin/massdns -r $HOME/massdns/nameservers.txt -t PTR --filter NOERROR -o S -w $HOME/massdns/fifo.json; done

Output:
    0.6.229.47.in-addr.arpa. PTR 047-229-006-000.res.spectrum.com.
    0.6.228.75.in-addr.arpa. PTR 0.sub-75-228-6.myvzw.com.
    0.6.207.73.in-addr.arpa. PTR c-73-207-6-0.hsd1.ga.comcast.net.

Input:
    {
        '_id'     : '47.229.6.0',
        '_index'  : 'eris-massdns',
        '_source' : {
            'ip'     : '47.229.6.0',
            'record' : ['047-229-006-000.res.spectrum.com'], # We will store as a list for IP addresses with multiple PTR records
            'seen'   : '2021-06-30T18:31:00Z'
        }
    }

Notes:
    Why do some IP addresses return an A/CNAME record from a PTR request?
    What is dns-servfail.net? (Frequent CNAME response from PTR requests)
'''
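
The 'record' comment in process_data above suggests moving to a Painless script so new PTR values are appended inside Elasticsearch itself, with a per-record 'seen' and a per-IP 'last_seen'. A possible shape for that scripted upsert, sketched here as a standalone helper (the 'last_seen' field and the helper name are assumptions, not part of this commit):

import time


def build_ptr_action(ip: str, record: str, index: str = 'eris-massdns') -> dict:
    '''Build a bulk update action that appends new PTR records with a Painless script instead of doc_as_upsert.'''

    timestamp = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())

    return {
        '_op_type' : 'update',
        '_id'      : ip,
        '_index'   : index,
        'script'   : {
            'lang'   : 'painless',
            'source' : '''
                if (!ctx._source.record.contains(params.record)) {
                    ctx._source.record.add(params.record);
                }
                ctx._source.last_seen = params.seen;
            ''',
            'params' : { 'record': record, 'seen': timestamp }
        },
        'upsert'   : { 'ip': ip, 'record': [record], 'seen': timestamp, 'last_seen': timestamp }
    }

# Example: build_ptr_action('47.229.6.0', '047-229-006-000.res.spectrum.com')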

View File

@@ -6,168 +6,179 @@ import logging
import time

try:
    import aiofiles
except ImportError:
    raise ImportError('Missing required \'aiofiles\' library. (pip install aiofiles)')


# Set a default elasticsearch index if one is not provided
default_index = 'dns-zones'

# Known DNS record types found in zone files
record_types = ('a','aaaa','caa','cdnskey','cds','cname','dnskey','ds','mx','naptr','ns','nsec','nsec3','nsec3param','ptr','rrsig','rp','sshfp','soa','srv','txt','type65534')
def construct_map() -> dict:
    '''Construct the Elasticsearch index mapping for zone file records.'''

    # Match on exact value or full text search
    keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }

    # Construct the index mapping
    mapping = {
        'mappings': {
            'properties': {
                'domain'  : keyword_mapping,
                'records' : { 'properties': {} },
                'seen'    : { 'type': 'date' }
            }
        }
    }

    # Add record types to mapping dynamically to not clutter the code
    for record_type in record_types:
        mapping['mappings']['properties']['records']['properties'][record_type] = {
            'properties': {
                'data': { 'type': 'ip' } if record_type in ('a','aaaa') else keyword_mapping,
                'ttl' : { 'type': 'integer' }
            }
        }

    return mapping
async def process_data(file_path: str):
    '''
    Read and process the input file

    :param file_path: Path to the input file
    '''

    async with aiofiles.open(file_path) as input_file:
        # Initialize the cache
        last = None

        # Read the input file line by line
        async for line in input_file:
            line = line.strip()

            # Sentinel value to indicate the end of a process (for closing out a FIFO stream)
            if line == '~eof':
                yield last
                break

            # Skip empty lines and comments
            if not line or line.startswith(';'):
                continue

            # Split the line into its parts
            parts = line.split()

            # Ensure the line has at least 5 parts
            if len(parts) < 5:
                logging.warning(f'Invalid line: {line}')
                continue

            # Split the record into its parts
            domain, ttl, record_class, record_type, data = parts[0].rstrip('.').lower(), parts[1], parts[2].lower(), parts[3].lower(), ' '.join(parts[4:])

            # Ensure the TTL is a number
            if not ttl.isdigit():
                logging.warning(f'Invalid TTL: {ttl} with line: {line}')
                continue

            ttl = int(ttl)

            # Do not index other record classes (doubtful any CHAOS/HESIOD records will be found in zone files)
            if record_class != 'in':
                logging.warning(f'Unsupported record class: {record_class} with line: {line}')
                continue

            # Do not index other record types
            if record_type not in record_types:
                logging.warning(f'Unsupported record type: {record_type} with line: {line}')
                continue

            # Little tidying up for specific record types (removing trailing dots, etc)
            if record_type == 'nsec':
                data = ' '.join([data.split()[0].rstrip('.'), *data.split()[1:]])
            elif record_type == 'soa':
                data = ' '.join([part.rstrip('.') if '.' in part else part for part in data.split()])
            elif data.endswith('.'):
                data = data.rstrip('.')

            # Check if we are still processing the same domain
            if last:
                if domain == last['_id']: # This record is for the same domain as the cached document
                    if record_type in last['doc']['records']:
                        last['doc']['records'][record_type].append({'ttl': ttl, 'data': data}) # Do we need to check for duplicate records?
                    else:
                        last['doc']['records'][record_type] = [{'ttl': ttl, 'data': data}]
                    continue
                else:
                    yield last # Return the last document and start a new one

            # Cache the document
            last = {
                '_op_type' : 'update',
                '_id'      : domain,
                '_index'   : default_index,
                'doc'      : {
                    'domain'  : domain,
                    'records' : {record_type: [{'ttl': ttl, 'data': data}]},
                    'seen'    : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) # Zone files do not contain a timestamp, so we use the current time
                },
                'doc_as_upsert' : True # This will create the document if it does not exist
            }


async def test(input_path: str):
    '''
    Test the ingestion process

    :param input_path: Path to the input file
    '''

    async for document in process_data(input_path):
        print(document)


if __name__ == '__main__':
    import argparse
    import asyncio

    parser = argparse.ArgumentParser(description='Ingestor for ERIS')
    parser.add_argument('input_path', help='Path to the input file or directory')
    args = parser.parse_args()

    asyncio.run(test(args.input_path))
'''
Output:
    1001.vegas. 3600 in ns ns11.waterrockdigital.com.
    1001.vegas. 3600 in ns ns12.waterrockdigital.com.

Input:
    {
        '_id'     : '1001.vegas',
        '_index'  : 'dns-zones',
        '_source' : {
            'domain'  : '1001.vegas',
            'records' : {
                'ns': [
                    {'ttl': 3600, 'data': 'ns11.waterrockdigital.com'},
                    {'ttl': 3600, 'data': 'ns12.waterrockdigital.com'}
                ]
            },
            'seen'    : '2021-09-01T00:00:00Z'
        }
    }

Notes:
    How do we want to handle hashed NSEC3 records? Do we ingest them as they are, or crack the NSEC3 hashes first and ingest?
'''
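
On the NSEC3 question in the notes above: hashed NSEC3 owner names can at least be flagged before deciding whether to ingest them as-is or crack them first, since their leftmost label is a 32-character Base32hex encoding of the SHA-1 hash of the original name. A small illustrative check, not part of this commit (the example domain below is made up):

import re

# NSEC3 owner names start with a 32-character Base32hex label (the hashed original name)
NSEC3_LABEL = re.compile(r'^[0-9a-v]{32}$', re.IGNORECASE)


def is_nsec3_owner(domain: str) -> bool:
    '''Return True if the leftmost label of a domain looks like an NSEC3 hashed owner name.'''

    return bool(NSEC3_LABEL.match(domain.split('.')[0]))

# Example (made-up hash): is_nsec3_owner('39p91242oslggdpkkf5ejac6vrif142g.vegas') -> True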