From 1ab7199f7db2c66b08f9dba4743dbb48f15663ce Mon Sep 17 00:00:00 2001 From: acidvegas Date: Wed, 13 Mar 2024 22:34:20 -0400 Subject: [PATCH] Certstream ingestor now only logs sub-domains since we already ingested zone files. Ignores www. and wildcard domains. --- ingestors/ingest_certstream.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/ingestors/ingest_certstream.py b/ingestors/ingest_certstream.py index 6e6394c..2cceb29 100644 --- a/ingestors/ingest_certstream.py +++ b/ingestors/ingest_certstream.py @@ -54,10 +54,20 @@ async def process_data(place_holder: str = None): logging.error(f'Invalid line from the websocket: {line}') continue - # Grab the unique domains from the record (excluding wildcards) - domains = record['data']['leaf_cert']['all_domains'] - domains = set([domain[2:] if domain.startswith('*.') else domain for domain in domains]) + # Grab the unique domains from the records + all_domains = record['data']['leaf_cert']['all_domains'] + domains = list() + # We only care about subdomains (excluding www. and wildcards) + for domain in all_domains: + if domain.startswith('*.'): + domain = domain[2:] + elif domain.startswith('www.') and domain.count('.') == 2: + continue + if domain.count('.') > 1: + if domain not in domains: + domains.append(domain) + # Construct the document for domain in domains: struct = {