Updated a few ingestors, code cleanup, added ttls, etc

Dionysus 2025-01-30 17:34:08 -05:00
parent 3673a60918
commit 2fbd560afd
Signed by: acidvegas
GPG Key ID: EF4B922DB85DC9DE
3 changed files with 174 additions and 242 deletions

View File

@@ -4,194 +4,158 @@
 import ipaddress
 import logging
+import os
 import time
+import re
 
 try:
-	import aiohttp
+	import git
 except ImportError:
-	raise ImportError('Missing required libraries. (pip install aiohttp)')
+	raise ImportError('Missing required libraries. (pip install gitpython)')
 
 # Set a default elasticsearch index if one is not provided
 default_index = 'eris-firehol'
 
-# Base URLs for Firehol IP lists
-FIREHOL_BASE_URL = 'https://raw.githubusercontent.com/firehol/blocklist-ipsets/master/'
-FIREHOL_API_URL  = 'https://api.github.com/repos/firehol/blocklist-ipsets/git/trees/master'
+# Git repository settings
+REPO_URL  = 'https://github.com/firehol/blocklist-ipsets.git'
+REPO_PATH = os.path.join('data', 'firehol-blocklist') # Local path to store the repo
+
+# File suffixes to ignore
+IGNORES = ('_1d', '_7d', '_30d', '_90d', '_180d', '_365d', '_730d')
 
 
 def construct_map() -> dict:
 	'''Construct the Elasticsearch index mapping for Firehol records.'''
 
-	# Match on exact value or full text search
-	keyword_mapping = {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}
-
-	# Construct the index mapping
 	mapping = {
 		'mappings': {
 			'properties': {
-				'ip'           : { 'type': 'ip' },
-				'network'      : { 'type': 'ip_range' },
-				'ipset'        : { 'type': 'keyword' },
-				'category'     : { 'type': 'keyword' },
-				'description'  : keyword_mapping,
-				'maintainer'   : { 'type': 'keyword' },
-				'source'       : { 'type': 'keyword' },
-				'seen'         : { 'type': 'date' },
-				'last_updated' : { 'type': 'date' }
+				'ip'         : { 'type': 'ip_range' },
+				'ipsets'     : { 'type': 'keyword' },
+				'categories' : { 'type': 'keyword' },
+				'seen'       : { 'type': 'date' },
 			}
 		}
 	}
 
 	return mapping
 
 
-async def fetch_ipset(session: aiohttp.ClientSession, ipset_name: str) -> dict:
-	'''
-	Fetch and parse a Firehol ipset.
-
-	:param session: aiohttp client session
-	:param ipset_name: Name of the ipset to fetch
-	:return: Dictionary containing IPs and metadata
-	'''
-
-	# Try both .netset and .ipset extensions
-	for ext in ['.netset', '.ipset']:
-		url = f'{FIREHOL_BASE_URL}{ipset_name}{ext}'
-		try:
-			async with session.get(url) as response:
-				if response.status != 200:
-					continue
-
-				content = await response.text()
-				ips = set()
-				metadata = {
-					'category': '',
-					'description': '',
-					'maintainer': ''
-				}
-
-				for line in content.splitlines():
-					line = line.strip()
-
-					# Skip empty lines
-					if not line:
-						continue
-
-					# Parse metadata from comments
-					if line.startswith('#'):
-						lower_line = line.lower()
-						if 'category:' in lower_line:
-							metadata['category'] = line.split('Category:', 1)[1].strip()
-						elif 'description:' in lower_line:
-							metadata['description'] = line.split('Description:', 1)[1].strip()
-						elif 'maintainer:' in lower_line:
-							metadata['maintainer'] = line.split('Maintainer:', 1)[1].strip()
-						continue
-
-					# Skip invalid lines
-					if not any(c in '0123456789./:' for c in line):
-						continue
-
-					try:
-						# Validate IP/network
-						if '/' in line:
-							ipaddress.ip_network(line, strict=False)
-						else:
-							ipaddress.ip_address(line)
-						ips.add(line)
-					except ValueError as e:
-						logging.warning(f'Invalid IP/network in {ipset_name}: {line} ({e})')
-						continue
-
-				return {
-					'ips': ips,
-					'metadata': metadata
-				}
-		except Exception as e:
-			logging.error(f'Error fetching {ipset_name}: {e}')
-			continue
-
-	return None
-
-
-async def get_all_ipsets(session: aiohttp.ClientSession) -> list:
-	'''
-	Fetch list of all available ipsets from the Firehol repository.
-
-	:param session: aiohttp client session
-	:return: List of ipset names
-	'''
-
-	try:
-		headers = {'Accept': 'application/vnd.github.v3+json'}
-		async with session.get(FIREHOL_API_URL, headers=headers) as response:
-			if response.status != 200:
-				logging.error(f'Failed to fetch ipset list: HTTP {response.status}')
-				return []
-
-			data = await response.json()
-			ipsets = []
-
-			for item in data['tree']:
-				filename = item['path']
-				# Include only .netset and .ipset files, exclude directories and other files
-				if filename.endswith(('.netset', '.ipset')) and not any(x in filename for x in [
-					'_log', '_report', '_latest', '_1d', '_7d', '_30d', '_90d', '_180d', '_360d', '_720d',
-					'README', 'COPYING', 'LICENSE', 'excluded'
-				]):
-					ipsets.append(filename.rsplit('.', 1)[0])
-
-			logging.info(f'Found {len(ipsets)} ipsets')
-			return ipsets
-	except Exception as e:
-		logging.error(f'Error fetching ipset list: {e}')
-		return []
+def update_repo():
+	'''Update the repository locally.'''
+
+	# If the repository doesn't exist, clone it
+	if not os.path.exists(REPO_PATH):
+		logging.info(f'Cloning repository to {REPO_PATH}...')
+
+		# Create the directory if it doesn't exist
+		os.makedirs(os.path.dirname(REPO_PATH), exist_ok=True)
+
+		# Clone the repository
+		git.Repo.clone_from(REPO_URL, REPO_PATH)
+	else:
+		# If the repository already exists, update it
+		repo = git.Repo(REPO_PATH)
+		logging.info('Updating repository...')
+		repo.remotes.origin.pull()
+
+
+def stream_ips(file_path: str):
+	'''
+	Stream IPs from file, skipping comments and validating each IP.
+
+	:param file_path: Path to the ipset file.
+	'''
+
+	try:
+		# Open the file
+		with open(file_path) as f:
+			# Iterate over each line
+			for line in f:
+				# Skip comments and empty lines
+				line = line.strip()
+				if line.startswith('#') or not line:
+					continue
+
+				# Validate IP/network
+				try:
+					if not '/' in line:
+						line = f'{line}/32'
+					ipaddress.ip_network(line, strict=True)
+				except ValueError as e:
+					logging.warning(f'Invalid IP/network in {os.path.basename(file_path)}: {line} ({e})')
+					continue
+
+				# Yield the valid IP/network
+				yield line
+
+	except Exception as e:
+		logging.error(f'Error streaming IPs from {file_path}: {e}')
 
 
-async def process_data(input_path: str = None):
+async def process_data(input_path = None):
 	'''
 	Process Firehol ipsets and yield records for indexing.
 
-	:param input_path: Optional path to local file (not used for Firehol ingestion)
+	:param input_path: Placeholder for uniformity
 	'''
 
-	# Create a client session
-	async with aiohttp.ClientSession() as session:
-		# Get list of all available ipsets
-		ipset_names = await get_all_ipsets(session)
-
-		if not ipset_names:
-			logging.error('No ipsets found')
-			return
-
-		for ipset_name in ipset_names:
-			logging.info(f'Fetching {ipset_name}...')
-			result = await fetch_ipset(session, ipset_name)
-			if not result:
-				logging.warning(f'Failed to fetch {ipset_name}')
-				continue
-
-			ips      = result['ips']
-			metadata = result['metadata']
-
-			timestamp = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
-
-			for ip in ips:
-				document = {
-					'ip'           : ip.split('/')[0] if '/' in ip else ip,
-					'ipset'        : ipset_name,
-					'category'     : metadata['category'],
-					'description'  : metadata['description'],
-					'maintainer'   : metadata['maintainer'],
-					'source'       : 'firehol',
-					'seen'         : timestamp,
-					'last_updated' : timestamp
-				}
-
-				if '/' in ip:
-					document['network'] = ip
-
-				yield {'_index': default_index, '_source': document}
+	# Update the repository
+	update_repo()
+
+	# Get all files
+	files = []
+	for filename in os.listdir(REPO_PATH):
+		if filename.endswith(('.ipset', '.netset')):
+			if any(filename.rsplit('.', 1)[0].endswith(x) for x in IGNORES):
+				logging.debug(f'Ignoring {filename} because it ends with {IGNORES}')
+				continue
+			files.append(os.path.join(REPO_PATH, filename))
+
+	logging.info(f'Processing {len(files)} files...')
+
+	# Dictionary to store unique IPs and their metadata
+	ip_records = {}
+
+	# Process each file
+	for file_path in files:
+		logging.info(f'Processing {os.path.basename(file_path)}...')
+
+		# Get the ipset name
+		ipset_name = os.path.splitext(os.path.basename(file_path))[0]
+
+		# Extract category if present
+		category = None
+		with open(file_path) as f:
+			for line in f:
+				if match := re.search(r'^#\s*Category\s*:\s*(.+)$', line, re.IGNORECASE):
+					category = match.group(1).strip()
+					break
+
+		# Stream IPs from the file
+		for ip in stream_ips(file_path):
+			# Initialize record if IP not seen before
+			if ip not in ip_records:
+				ip_records[ip] = {'ip': ip, 'ipsets': set(), 'categories': set()}
+
+			# Update arrays
+			ip_records[ip]['ipsets'].add(ipset_name)
+			if category:
+				ip_records[ip]['categories'].add(category)
+
+	# Yield unique records with converted sets to lists
+	for ip, record in ip_records.items():
+		# Convert sets to lists for JSON serialization
+		record['ipsets']     = list(record['ipsets'])
+		record['categories'] = list(record['categories'])
+		record['seen']       = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
+
+		# Yield the document with _id set to the IP
+		yield {'_index': default_index, '_id': ip, '_source': record}
 
 
 async def test():
 	'''Test the ingestion process'''
@@ -204,25 +168,4 @@ async def test():
 
 if __name__ == '__main__':
 	import asyncio
 
 	logging.basicConfig(level=logging.INFO)
 	asyncio.run(test())
-
-'''
-Output Example:
-{
-	"_index": "eris-firehol",
-	"_source": {
-		"ip"           : "1.2.3.4",
-		"network"      : "1.2.3.0/24",
-		"ipset"        : "firehol_level1",
-		"category"     : "attacks",
-		"description"  : "Basic protection against attacks",
-		"maintainer"   : "FireHOL team",
-		"source"       : "firehol",
-		"seen"         : "2024-03-05T12:00:00Z",
-		"last_updated" : "2024-03-05T12:00:00Z"
-	}
-}
-'''
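
Below is a minimal driver sketch for the reworked Firehol ingestor, assuming the elasticsearch-py async client (AsyncElasticsearch / async_bulk) and that the module is importable as ingest_firehol; the endpoint, client setup, and module name are assumptions, not part of this commit. It also shows the shape of the actions process_data() now yields: the CIDR string doubles as _id, so re-running the ingest updates existing documents instead of duplicating them.

# Hypothetical driver for the git-based Firehol ingestor (not part of this commit).
# Assumes elasticsearch-py with async support and an importable module named
# `ingest_firehol`; adjust names to match the actual repository layout.
import asyncio

from elasticsearch import AsyncElasticsearch
from elasticsearch.helpers import async_bulk

import ingest_firehol  # assumed module name

async def main():
	es = AsyncElasticsearch('http://localhost:9200')  # assumed endpoint

	# Create the index with the ingestor's mapping if it does not exist yet
	if not await es.indices.exists(index=ingest_firehol.default_index):
		await es.indices.create(index=ingest_firehol.default_index, body=ingest_firehol.construct_map())

	# process_data() yields actions shaped like:
	# {'_index': 'eris-firehol', '_id': '1.2.3.0/24',
	#  '_source': {'ip': '1.2.3.0/24', 'ipsets': ['firehol_level1'], 'categories': ['attacks'], 'seen': '...'}}
	ok, errors = await async_bulk(es, ingest_firehol.process_data(), raise_on_error=False)
	print(f'Indexed {ok} documents ({len(errors)} errors)')

	await es.close()

if __name__ == '__main__':
	asyncio.run(main())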

View File

@@ -22,17 +22,6 @@ def construct_map() -> dict:
 	# Match on exact value or full text search
 	keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
 
-	# Construct the geoip mapping (Used with the geoip pipeline to enrich the data)
-	geoip_mapping = {
-		'city_name'        : keyword_mapping,
-		'continent_name'   : keyword_mapping,
-		'country_iso_code' : keyword_mapping,
-		'country_name'     : keyword_mapping,
-		'location'         : { 'type': 'geo_point' },
-		'region_iso_code'  : keyword_mapping,
-		'region_name'      : keyword_mapping,
-	}
-
 	# Construct the index mapping
 	mapping = {
 		'mappings': {
@@ -42,8 +31,8 @@ def construct_map() -> dict:
 				'proto'   : { 'type': 'keyword' },
 				'service' : { 'type': 'keyword' },
 				'banner'  : keyword_mapping,
+				'ttl'     : { 'type': 'integer' },
 				'seen'    : { 'type': 'date' }
-				#'geoip'  : { 'properties': geoip_mapping }
 			}
 		}
 	}
@@ -83,6 +72,7 @@ async def process_data(input_path: str):
 			'ip'    : record['ip'],
 			'port'  : record['port'],
 			'proto' : record['proto'],
+			'ttl'   : record['ttl'],
 			'seen'  : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(int(record['timestamp'])))
 		}
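
For illustration, a worked example of the document this ingestor now emits with the new ttl field. The hunk above only shows the document construction, so the sample input line and the json.loads parsing step are assumptions; the field names follow the keys read from record in the diff.

# Hypothetical input record and the resulting document (sample data made up for illustration).
import json
import time

sample_line = '{"ip": "203.0.113.7", "port": 22, "proto": "tcp", "ttl": 52, "timestamp": "1738276448"}'
record = json.loads(sample_line)  # parsing format is an assumption, not shown in this hunk

document = {
	'ip'    : record['ip'],
	'port'  : record['port'],
	'proto' : record['proto'],
	'ttl'   : record['ttl'],   # indexed as an integer under the updated mapping
	'seen'  : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(int(record['timestamp'])))
}

print(document)
# {'ip': '203.0.113.7', 'port': 22, 'proto': 'tcp', 'ttl': 52, 'seen': '2025-01-30T22:34:08Z'}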

View File

@@ -42,76 +42,75 @@ async def process_data(input_path: str):
 	:param input_path: Path to the input file
 	'''
 
+	# Open the input file
 	async with aiofiles.open(input_path) as input_file:
 		# Cache the last document to avoid creating a new one for the same IP address
 		last = None
 
-		try:
-			# Read the input file line by line
-			async for line in input_file:
-				line = line.strip()
-
-				# Sentinel value to indicate the end of a process (for closing out a FIFO stream)
-				if line == '~eof':
-					yield last
-					break
-
-				# Skip empty lines (doubtful we will have any, but just in case)
-				if not line:
-					continue
-
-				# Split the line into its parts
-				parts = line.split()
-
-				# Ensure the line has at least 3 parts
-				if len(parts) < 3:
-					logging.warning(f'Invalid PTR record: {line}')
-					continue
-
-				# Split the PTR record into its parts
-				name, record_type, record = parts[0].rstrip('.'), parts[1], ' '.join(parts[2:]).rstrip('.')
-
-				# Do not index other records
-				if record_type != 'PTR':
-					continue
-
-				# Do not index PTR records that do not have a record
-				if not record:
-					continue
-
-				# Do not index PTR records that have the same record as the in-addr.arpa domain
-				if record == name:
-					continue
-
-				# Get the IP address from the in-addr.arpa domain
-				ip = '.'.join(name.replace('.in-addr.arpa', '').split('.')[::-1])
-
-				# Check if we are still processing the same IP address
-				if last:
-					if ip == last['_id']: # This record is for the same IP address as the cached document
-						last_records = last['doc']['record']
-						if record not in last_records: # Do not index duplicate records
-							last['doc']['record'].append(record)
-						continue
-					else:
-						yield last # Return the last document and start a new one
-
-				# Cache the document
-				last = {
-					'_op_type'      : 'update',
-					'_id'           : ip,
-					'_index'        : default_index,
-					'doc'           : {
-						'ip'     : ip,
-						'record' : [record],
-						'seen'   : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
-					},
-					'doc_as_upsert' : True # Create the document if it does not exist
-				}
-
-		except Exception as e:
-			logging.error(f'Error processing data: {e}')
+		# Read the input file line by line
+		async for line in input_file:
+
+			# Strip whitespace
+			line = line.strip()
+
+			# Skip empty lines
+			if not line:
+				continue
+
+			# Sentinel value to indicate the end of a process (for closing out a FIFO stream)
+			if line == '~eof':
+				yield last
+				break
+
+			# Split the line into its parts
+			parts = line.split()
+
+			# Ensure the line has at least 3 parts
+			if len(parts) < 3:
+				logging.warning(f'Invalid PTR record: {line}')
+				continue
+
+			# Split the PTR record into its parts
+			name, record_type, record = parts[0].rstrip('.'), parts[1], ' '.join(parts[2:]).rstrip('.')
+
+			# Do not index other records
+			if record_type != 'PTR':
+				continue
+
+			# Do not index PTR records that do not have a record
+			if not record:
+				continue
+
+			# Do not index PTR records that have the same record as the in-addr.arpa domain
+			if record == name:
+				continue
+
+			# Get the IP address from the in-addr.arpa domain
+			ip = '.'.join(name.replace('.in-addr.arpa', '').split('.')[::-1])
+
+			# Check if we are still processing the same IP address
+			if last:
+				if ip == last['_id']: # This record is for the same IP address as the cached document
+					last_records = last['doc']['record']
+					if record not in last_records: # Do not index duplicate records
+						last['doc']['record'].append(record)
+					continue
+				else:
+					yield last # Return the last document and start a new one
+
+			# Cache the document
+			last = {
+				'_op_type'      : 'update',
+				'_id'           : ip,
+				'_index'        : default_index,
+				'doc'           : {
+					'ip'     : ip,
+					'record' : [record],
+					'seen'   : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
+				},
+				'doc_as_upsert' : True # Create the document if it does not exist
+			}
 
 
 async def test(input_path: str):
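
For illustration, a worked example (not part of the commit) of how a single PTR line flows through the loop above: the in-addr.arpa name is reversed back into a forward IP address, and the cached document becomes a doc_as_upsert bulk update keyed by that IP, so repeated answers for the same address merge into one document. The sample line is hypothetical.

# Hypothetical PTR line, processed the same way as in the loop above.
line = '4.3.2.1.in-addr.arpa. PTR host.example.com.'

parts = line.split()
name, record_type, record = parts[0].rstrip('.'), parts[1], ' '.join(parts[2:]).rstrip('.')

# Reverse the in-addr.arpa labels back into a forward IP address
ip = '.'.join(name.replace('.in-addr.arpa', '').split('.')[::-1])
print(ip)      # 1.2.3.4
print(record)  # host.example.com

# The cached document then becomes a bulk update action with doc_as_upsert:
# {'_op_type': 'update', '_id': '1.2.3.4', '_index': default_index,
#  'doc': {'ip': '1.2.3.4', 'record': ['host.example.com'], 'seen': '...'},
#  'doc_as_upsert': True}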