Added IXP ingestor, updated all other ingestors

This commit is contained in:
Dionysus 2024-03-19 19:00:12 -04:00
parent 24850ea976
commit 603d005a47
Signed by: acidvegas
GPG Key ID: EF4B922DB85DC9DE
7 changed files with 446 additions and 294 deletions

View File

@ -45,6 +45,7 @@ async def process_data(place_holder: str = None):
async for websocket in websockets.connect('wss://certstream.calidog.io', ping_interval=15, ping_timeout=60):
try:
# Read the websocket stream
async for line in websocket:
# Parse the JSON record
@ -65,6 +66,7 @@ async def process_data(place_holder: str = None):
elif domain.startswith('www.') and domain.count('.') == 2:
continue
if domain.count('.') > 1:
# TODO: Add a check for PSL TLDs...domain.co.uk, domain.com.au, etc. (we want to ignore these if they are not subdomains)
if domain not in domains:
domains.append(domain)
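# A sketch of the PSL check mentioned in the TODO above, assuming the third-party
# 'tldextract' package (pip install tldextract), which resolves suffixes against the
# Public Suffix List:
#
#   import tldextract
#   if not tldextract.extract(domain).subdomain:
#       continue # e.g. example.co.uk / example.com.au are registered domains, not subdomains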
@ -81,6 +83,10 @@ async def process_data(place_holder: str = None):
logging.error(f'Connection to Certstream was closed. Attempting to reconnect... ({e})')
await asyncio.sleep(3)
except Exception as e:
logging.error(f'Error processing Certstream data: {e}')
await asyncio.sleep(3)
async def test():
'''Test the ingestion process.'''
@ -91,8 +97,6 @@ async def test():
if __name__ == '__main__':
import asyncio
asyncio.run(test())
@ -155,4 +159,7 @@ Output:
},
"message_type": "certificate_update"
}
Notes:
- Fix the "no close frame received or sent" error
'''

View File

@ -3,53 +3,64 @@
# ingest_httpx.py
import json
import logging
try:
import aiofiles
except ImportError:
raise ImportError('Missing required \'aiofiles\' library. (pip install aiofiles)')
default_index = 'httpx-logs'
# Set a default elasticsearch index if one is not provided
default_index = 'eris-httpx'
def construct_map() -> dict:
'''Construct the Elasticsearch index mapping for HTTPX records.'''
# Match on exact value or full text search
keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
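# Query-side sketch of why both sub-fields exist (hypothetical queries, not part of this script):
#   full-text search : { 'query': { 'match': { 'title'         : 'admin panel' } } }
#   exact match      : { 'query': { 'term' : { 'title.keyword' : 'Admin Panel' } } }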
# Construct the index mapping
mapping = {
'mappings': {
'properties': {
"timestamp" : { 'type' : 'date' },
"hash" : {
"body_md5" : { 'type': 'keyword' },
"body_mmh3" : { 'type': 'keyword' },
"body_sha256" : { 'type': 'keyword' },
"body_simhash" : { 'type': 'keyword' },
"header_md5" : { 'type': 'keyword' },
"header_mmh3" : { 'type': 'keyword' },
"header_sha256" : { 'type': 'keyword' },
"header_simhash" : { 'type': 'keyword' }
'timestamp' : { 'type' : 'date' },
'hash' : {
'properties': {
'body_md5' : { 'type': 'keyword' },
'body_mmh3' : { 'type': 'keyword' },
'body_sha256' : { 'type': 'keyword' },
'body_simhash' : { 'type': 'keyword' },
'header_md5' : { 'type': 'keyword' },
'header_mmh3' : { 'type': 'keyword' },
'header_sha256' : { 'type': 'keyword' },
'header_simhash' : { 'type': 'keyword' }
}
},
"port" : { 'type': 'integer' },
"url" : keyword_mapping,
"input" : keyword_mapping,
"title" : keyword_mapping,
"scheme" : { 'type': 'keyword' },
"webserver" : { 'type': 'keyword' },
"body_preview" : keyword_mapping,
"content_type" : { 'type': 'keyword' },
"method" : { 'type': 'keyword'},
"host" : { 'type': 'ip'},
"path" : keyword_mapping,
"favicon" : { 'type': 'keyword' },
"favicon_path" : keyword_mapping,
"a" : { 'type': 'ip'},
"aaaa" : { 'type': 'ip'},
"tech" : keyword_mapping,
"words" : { 'type': 'integer'},
"lines" : { 'type': 'integer'},
"status_code" : { 'type': 'integer'},
"content_length" : { 'type': 'integer'}
'port' : { 'type': 'integer' },
'url' : keyword_mapping,
'final_url' : keyword_mapping,
'input' : keyword_mapping,
'title' : keyword_mapping,
'scheme' : { 'type': 'keyword' },
'webserver' : { 'type': 'keyword' },
'body_preview' : keyword_mapping,
'content_type' : { 'type': 'keyword' },
'method' : { 'type': 'keyword' },
'host' : { 'type': 'ip' },
'path' : keyword_mapping,
'favicon' : { 'type': 'keyword' },
'favicon_path' : keyword_mapping,
'a' : { 'type': 'ip' },
'cname' : keyword_mapping,
'aaaa' : { 'type': 'ip' },
'tech' : keyword_mapping,
'words' : { 'type': 'integer' },
'lines' : { 'type': 'integer' },
'status_code' : { 'type': 'integer' },
'chain_status_codes' : { 'type': 'integer' },
'content_length' : { 'type': 'integer' }
}
}
}
@ -57,37 +68,51 @@ def construct_map() -> dict:
return mapping
async def process_data(file_path: str):
async def process_data(input_path: str):
'''
Read and process HTTPX records from the log file.
Read and process the input file
:param file_path: Path to the HTTPX log file
:param input_path: Path to the input file
'''
async with aiofiles.open(file_path) as input_file:
async with aiofiles.open(input_path) as input_file:
# Read the input file line by line
async for line in input_file:
line = line.strip()
# Sentinel value to indicate the end of a process (for closing out a FIFO stream)
if line == '~eof':
break
# Skip empty lines
if not line:
continue
# Parse the JSON record
try:
record = json.loads(line)
except json.JSONDecodeError:
logging.error(f'Failed to parse JSON record: {line}')
continue
record['seen'] = record.pop('timestamp').split('.')[0] + 'Z' # Hacky solution to maintain ISO 8601 format without milliseconds or offsets
record['domain'] = record.pop('input')
# Hacky solution to maintain ISO 8601 format without milliseconds or offsets
record['timestamp'] = record['timestamp'].split('.')[0] + 'Z'
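# A slightly more defensive variant (just a sketch): only strip when fractional seconds
# are actually present, leaving already-clean timestamps untouched:
#   ts = record['timestamp']
#   record['timestamp'] = ts.split('.')[0] + 'Z' if '.' in ts else ts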
for item in ('failed', 'knowledgebase', 'time'):
# Remove unnecessary fields we don't care about
for item in ('failed', 'knowledgebase', 'time', 'csp'):
if item in record:
del record[item]
yield {'_id': record['domain'], '_index': default_index, '_source': record}
yield {'_index': default_index, '_source': record}
async def test(input_path: str):
'''
Test the HTTPX ingestion process
Test the ingestion process
:param input_path: Path to the HTTPX log file
:param input_path: Path to the input file
'''
async for document in process_data(input_path):
print(document)
@ -97,7 +122,7 @@ if __name__ == '__main__':
import argparse
import asyncio
parser = argparse.ArgumentParser(description='HTTPX Ingestor for ERIS')
parser = argparse.ArgumentParser(description='Ingestor for ERIS')
parser.add_argument('input_path', help='Path to the input file or directory')
args = parser.parse_args()
@ -105,11 +130,11 @@ if __name__ == '__main__':
''''
'''
Deploy:
go install -v github.com/projectdiscovery/httpx/cmd/httpx@latest
curl -s https://public-dns.info/nameservers.txt -o nameservers.txt
httpx -l zone.txt -t 200 -sc -location -favicon -title -bp -td -ip -cname -mc 200,201,301,302,303,307,308 -fr -r nameservers.txt -retries 2 -stream -sd -j -o httpx.json -v
httpx -l fulldomains.txt -t 200 -sc -location -favicon -title -bp -td -ip -cname -mc 200,201,301,302,303,307,308 -fr -r nameservers.txt -retries 2 -stream -sd -j -o fifo.json -v
Output:
{

ingestors/ingest_ixps.py (new file, 111 additions)
View File

@ -0,0 +1,111 @@
#!/usr/bin/env python
# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
# ingest_ixps.py
import json
import ipaddress
import time
try:
import aiohttp
except ImportError:
raise ImportError('Missing required \'aiohttp\' library. (pip install aiohttp)')
# Set a default elasticsearch index if one is not provided
default_index = 'ixp-' + time.strftime('%Y-%m-%d')
def construct_map() -> dict:
'''Construct the Elasticsearch index mapping for records'''
# Match on exact value or full text search
keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
# Construct the index mapping
mapping = {
'mappings': {
'properties': {
'name' : {'type': 'keyword'},
'alternatenames' : {'type': 'keyword'},
'sources' : {'type': 'keyword'},
'prefixes' : { 'properties': { 'ipv4' : { 'type': 'ip_range' }, 'ipv6' : { 'type': 'ip_range' } } }, # CIDR prefixes require the ip_range type
'url' : { 'type': 'keyword' },
'region' : { 'type': 'keyword' },
'country' : { 'type': 'keyword' },
'city' : { 'type': 'keyword' },
'state' : { 'type': 'keyword' },
'zip' : { 'type': 'keyword' },
'address' : keyword_mapping,
'iata' : { 'type': 'keyword' },
'latitude' : { 'type': 'float' },
'longitude' : { 'type': 'float' },
'geo_id' : { 'type': 'integer' },
'ix_id' : { 'type': 'integer' },
'org_id' : { 'type': 'integer' },
'pdb_id' : { 'type': 'integer' },
'pdb_org_id' : { 'type': 'integer' },
'pch_id' : { 'type': 'integer' }
}
}
}
return mapping
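# Sketch of how this mapping might be applied when the index is created (assuming an
# AsyncElasticsearch client named 'es' from the 'elasticsearch' package, 8.x API):
#
#   await es.indices.create(index=default_index, mappings=construct_map()['mappings'])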
async def process_data():
    '''Read and process the IXP data.'''
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get('https://publicdata.caida.org/datasets/ixps/ixs_202401.jsonl') as response:
                if response.status != 200:
                    raise Exception(f'Failed to fetch IXP data: {response.status}')

                data = await response.text()

                # The dataset is JSONL: one JSON object per line (skip blanks and any '#' comment lines)
                for line in data.splitlines():
                    line = line.strip()
                    if not line or line.startswith('#'):
                        continue

                    # Parse the JSON record
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as e:
                        raise Exception(f'Failed to parse IXP record: {e}')

                    yield {'_index': default_index, '_source': record}
    except Exception as e:
        raise Exception(f'Error processing IXP data: {e}')
async def test():
'''Test the ingestion process'''
async for document in process_data():
print(document)
if __name__ == '__main__':
import asyncio
asyncio.run(test())
'''
Output:
{
"pch_id" : 1848,
"name" : "ANGONIX",
"country" : "AO",
"region" : "Africa",
"city" : "Luanda",
"iata" : "LAD",
"alternatenames" : [],
"sources" : ["pch"],
"prefixes" : {
"ipv4" : ["196.11.234.0/24"],
"ipv6" : ["2001:43f8:9d0::/48"]
},
"geo_id" : 2240449,
"ix_id" : 10
}
'''

View File

@ -12,14 +12,17 @@ except ImportError:
raise ImportError('Missing required \'aiofiles\' library. (pip install aiofiles)')
# Set a default elasticsearch index if one is not provided
default_index = 'masscan-logs'
def construct_map() -> dict:
'''Construct the Elasticsearch index mapping for Masscan records.'''
# Match on exact value or full text search
keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
# Construct the geoip mapping (Used with the geoip pipeline to enrich the data)
geoip_mapping = {
'city_name' : keyword_mapping,
'continent_name' : keyword_mapping,
@ -30,21 +33,17 @@ def construct_map() -> dict:
'region_name' : keyword_mapping,
}
# Construct the index mapping
mapping = {
'mappings': {
'properties': {
'ip' : { 'type': 'ip' },
'port' : { 'type': 'integer' },
'data' : {
'properties': {
'proto' : { 'type': 'keyword' },
'service' : { 'type': 'keyword' },
'banner' : keyword_mapping,
'seen' : { 'type': 'date' }
}
},
#'geoip' : { 'properties': geoip_mapping } # Used with the geoip pipeline to enrich the data
'last_seen' : { 'type': 'date' }
#'geoip' : { 'properties': geoip_mapping }
}
}
}
@ -52,26 +51,31 @@ def construct_map() -> dict:
return mapping
async def process_data(file_path: str):
async def process_data(input_path: str):
'''
Read and process Masscan records from the log file.
Read and process the input file
:param file_path: Path to the Masscan log file
:param input_path: Path to the input file
'''
async with aiofiles.open(file_path) as input_file:
async with aiofiles.open(input_path) as input_file:
# Read the input file line by line
async for line in input_file:
line = line.strip()
if line == '~eof': # Sentinel value to indicate the end of a process (Used with --watch with FIFO)
# Sentinel value to indicate the end of a process (for closing out a FIFO stream)
if line == '~eof':
break
# Skip empty lines and lines that do not start with a JSON object
if not line or not line.startswith('{'):
continue
if line.endswith(','): # Do we need this? Masscan's JSON output seems to separate records with a trailing comma for some reason...
# Do we need this? Masscan's JSON output seems to separate records with a trailing comma for some reason...
if line.endswith(','):
line = line[:-1]
# Parse the JSON record
try:
record = json.loads(line)
except json.decoder.JSONDecodeError:
@ -82,43 +86,45 @@ async def process_data(file_path: str):
input('Press Enter to continue...') # Pause for review & debugging (remove this in production)
continue
if len(record['ports']) > 1:
# In rare cases, a single record may contain multiple ports, though I have yet to witness this...
if len(record['ports']) > 1:
logging.warning(f'Multiple ports found for record! ({record})')
input('Press Enter to continue...') # Pause for review (remove this in production)
# Process each port in the record
for port_info in record['ports']:
struct = {
'ip' : record['ip'],
'data' : {
'port' : port_info['port'],
'proto' : port_info['proto'],
'seen' : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(int(record['timestamp']))),
},
'last_seen' : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(int(record['timestamp']))),
'seen' : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(int(record['timestamp'])))
}
# Add the service information if available (this field is optional)
if 'service' in port_info:
# Add the service name if available
if 'name' in port_info['service']:
if (service_name := port_info['service']['name']) not in ('unknown',''):
struct['service'] = service_name
# Add the service banner if available
if 'banner' in port_info['service']:
banner = ' '.join(port_info['service']['banner'].split()) # Remove extra whitespace
if banner:
struct['banner'] = banner
id = f'{record["ip"]}:{port_info["port"]}' # Store with ip:port as the unique id to allow the record to be reindexed if it exists.
yield {'_id': id, '_index': default_index, '_source': struct}
# Yield the record
yield {'_index': default_index, '_source': struct}
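# Sketch of how these yielded actions are typically consumed downstream (assuming an
# AsyncElasticsearch client named 'es' and the async helpers from the 'elasticsearch' package):
#
#   from elasticsearch.helpers import async_streaming_bulk
#   async for ok, result in async_streaming_bulk(es, process_data(input_path)):
#       if not ok:
#           logging.error(f'Failed to index document: {result}')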
async def test(input_path: str):
'''
Test the Masscan ingestion process
Test the ingestion process
:param input_path: Path to the MassDNS log file
:param input_path: Path to the input file
'''
async for document in process_data(input_path):
print(document)
@ -128,7 +134,7 @@ if __name__ == '__main__':
import argparse
import asyncio
parser = argparse.ArgumentParser(description='Masscan Ingestor for ERIS')
parser = argparse.ArgumentParser(description='Ingestor for ERIS')
parser.add_argument('input_path', help='Path to the input file or directory')
args = parser.parse_args()
@ -143,7 +149,7 @@ Deploy:
/sbin/iptables -A INPUT -p tcp --dport 61010 -j DROP # Not persistent
printf "0.0.0.0/8\n10.0.0.0/8\n100.64.0.0/10\n127.0.0.0/8\n169.254.0.0/16\n172.16.0.0/12\n192.0.0.0/24\n192.0.2.0/24\n192.31.196.0/24\n192.52.193.0/24\n192.88.99.0/24\n192.168.0.0/16\n192.175.48.0/24\n198.18.0.0/15\n198.51.100.0/24\n203.0.113.0/24\n224.0.0.0/3\n255.255.255.255/32" > exclude.conf
screen -S scan
masscan 0.0.0.0/0 -p21,22,23 --banners --http-user-agent "USER_AGENT" --source-port 61010 --open-only --rate 30000 --excludefile exclude.conf -oJ output.json
masscan 0.0.0.0/0 -p18000 --banners --http-user-agent "USER_AGENT" --source-port 61010 --open-only --rate 30000 --excludefile exclude.conf -oJ 18000.json
masscan 0.0.0.0/0 -p21,22,23 --banners --http-user-agent "USER_AGENT" --source-port 61000-65503 --open-only --rate 30000 --excludefile exclude.conf -oJ output_new.json --shard $i/$TOTAL
Output:

View File

@ -140,18 +140,21 @@ if __name__ == '__main__':
'''
Deployment:
sudo apt-get install build-essential gcc make
sudo apt-get install build-essential gcc make python3 python3-pip
pip install aiofiles aiohttp elasticsearch
git clone --depth 1 https://github.com/acidvegas/eris.git $HOME/eris
git clone --depth 1 https://github.com/blechschmidt/massdns.git $HOME/massdns && cd $HOME/massdns && make
curl -s https://public-dns.info/nameservers.txt | grep -v ':' > $HOME/massdns/nameservers.txt
python3 ./scripts/ptr.py | ./bin/massdns -r $HOME/massdns/nameservers.txt -t PTR --filter NOERROR -o S -w $HOME/massdns/fifo.json
or...
while true; do python ./scripts/ptr.py | ./bin/massdns -r $HOME/massdns/nameservers.txt -t PTR --filter NOERROR -o S -w $HOME/massdns/fifo.json; done
while true; do python3 ./scripts/ptr.py | ./bin/massdns -r $HOME/massdns/nameservers.txt -t PTR --filter NOERROR -o S -w $HOME/eris/FIFO; done
Output:
0.6.229.47.in-addr.arpa. PTR 047-229-006-000.res.spectrum.com.
0.6.228.75.in-addr.arpa. PTR 0.sub-75-228-6.myvzw.com.
0.6.207.73.in-addr.arpa. PTR c-73-207-6-0.hsd1.ga.comcast.net.
Input:
{
'_id' : '47.229.6.0'
@ -163,6 +166,7 @@ Input:
}
}
Notes:
Why do some IP addresses return an A/CNAME from a PTR request?
What is dns-servfail.net? (Frequent CNAME response from PTR requests)

View File

@ -191,13 +191,12 @@ Input:
'registry' : 'arin',
'cc' : 'us',
'type' : 'ipv4',
'start' : { 'ip': '76.15.132.0' },
'value' : 1024,
'ip' : { 'start': '76.15.132.0', 'end': '76.16.146.0' },
'date' : '2007-05-02T00:00:00Z',
'status' : 'allocated',
'extensions' : '6c065d5b54b877781f05e7d30ebfff28'
}
Notes:
Should this handle the database locally or load it into RAM?
'''

View File

@ -82,7 +82,7 @@ async def process_data():
try:
json_data = json.loads(data)
except aiohttp.ContentTypeError as e:
except json.JSONDecodeError as e:
raise Exception(f'Failed to parse {registry} delegation data: {e}')
if 'transfers' not in json_data:
@ -183,5 +183,5 @@ Output:
}
Input:
- Nothing changed from the output for now...
Nothing changed from the output for now...
'''