Updated a few ingestors, cleaned up code, added TTLs, etc.
This commit is contained in:
parent
3673a60918
commit
2fbd560afd
@@ -4,194 +4,158 @@
 
 import ipaddress
 import logging
+import os
 import time
+import re
 
 try:
-    import aiohttp
+    import git
 except ImportError:
-    raise ImportError('Missing required libraries. (pip install aiohttp)')
+    raise ImportError('Missing required libraries. (pip install gitpython)')
 
 
 # Set a default elasticsearch index if one is not provided
 default_index = 'eris-firehol'
 
-# Base URLs for Firehol IP lists
-FIREHOL_BASE_URL = 'https://raw.githubusercontent.com/firehol/blocklist-ipsets/master/'
-FIREHOL_API_URL = 'https://api.github.com/repos/firehol/blocklist-ipsets/git/trees/master'
+# Git repository settings
+REPO_URL = 'https://github.com/firehol/blocklist-ipsets.git'
+REPO_PATH = os.path.join('data', 'firehol-blocklist') # Local path to store the repo
 
+# File suffixes to ignore
+IGNORES = ('_1d', '_7d', '_30d', '_90d', '_180d', '_365d', '_730d')
 
 
 def construct_map() -> dict:
     '''Construct the Elasticsearch index mapping for Firehol records.'''
 
-    # Match on exact value or full text search
-    keyword_mapping = {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}
 
-    # Construct the index mapping
     mapping = {
         'mappings': {
             'properties': {
-                'ip' : { 'type': 'ip' },
-                'network' : { 'type': 'ip_range' },
-                'ipset' : { 'type': 'keyword' },
-                'category' : { 'type': 'keyword' },
-                'description' : keyword_mapping,
-                'maintainer' : { 'type': 'keyword' },
-                'source' : { 'type': 'keyword' },
-                'seen' : { 'type': 'date' },
-                'last_updated' : { 'type': 'date' }
+                'ip' : { 'type': 'ip_range' },
+                'ipsets' : { 'type': 'keyword' },
+                'categories' : { 'type': 'keyword' },
+                'seen' : { 'type': 'date' },
             }
         }
     }
 
     return mapping
 
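The reworked mapping keys everything on the CIDR itself: 'ip' becomes an ip_range field and the per-list metadata is collapsed into the 'ipsets' and 'categories' keyword arrays. A minimal sketch of creating the index with this mapping, assuming this module plus the official elasticsearch 8.x Python client and a local cluster URL (not part of this commit):

# Sketch only: create the index using the mapping returned by construct_map().
# Assumes this module's construct_map() and default_index are importable.
from elasticsearch import Elasticsearch

es = Elasticsearch('http://localhost:9200')

# construct_map() returns {'mappings': {...}}, which unpacks into the 8.x keyword argument
if not es.indices.exists(index=default_index):
    es.indices.create(index=default_index, **construct_map())
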
-async def fetch_ipset(session: aiohttp.ClientSession, ipset_name: str) -> dict:
-    '''
-    Fetch and parse a Firehol ipset.
 
-    :param session: aiohttp client session
-    :param ipset_name: Name of the ipset to fetch
-    :return: Dictionary containing IPs and metadata
+def update_repo():
+    '''Update the repository locally.'''
+
+    # If the repository doesn't exist, clone it
+    if not os.path.exists(REPO_PATH):
+        logging.info(f'Cloning repository to {REPO_PATH}...')
+
+        # Create the directory if it doesn't exist
+        os.makedirs(os.path.dirname(REPO_PATH), exist_ok=True)
+
+        # Clone the repository
+        git.Repo.clone_from(REPO_URL, REPO_PATH)
+    else:
+        # If the repository already exists, update it
+        repo = git.Repo(REPO_PATH)
+        logging.info('Updating repository...')
+        repo.remotes.origin.pull()
 
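On first run, update_repo() clones the full blocklist history. GitPython's clone_from passes extra keyword arguments straight through to git clone, so a shallow, single-branch clone is one optional way to keep that first download small (a sketch, not part of this commit):

# Sketch only: shallow-clone variant of the clone call above.
# depth=1 and single_branch=True are forwarded as --depth 1 --single-branch.
git.Repo.clone_from(REPO_URL, REPO_PATH, depth=1, single_branch=True)
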
+def stream_ips(file_path: str):
     '''
-    # Try both .netset and .ipset extensions
-    for ext in ['.netset', '.ipset']:
-        url = f'{FIREHOL_BASE_URL}{ipset_name}{ext}'
-        try:
-            async with session.get(url) as response:
-                if response.status != 200:
+    Stream IPs from file, skipping comments and validating each IP.
+
+    :param file_path: Path to the ipset file.
+    '''
+
+    try:
+        # Open the file
+        with open(file_path) as f:
+
+            # Iterate over each line
+            for line in f:
+
+                # Skip comments and empty lines
+                line = line.strip()
+                if line.startswith('#') or not line:
+                    continue
+
+                # Validate IP/network
+                try:
+                    if not '/' in line:
+                        line = f'{line}/32'
+                    ipaddress.ip_network(line, strict=True)
+                except ValueError as e:
+                    logging.warning(f'Invalid IP/network in {os.path.basename(file_path)}: {line} ({e})')
                     continue
 
-                content = await response.text()
-                ips = set()
-                metadata = {
-                    'category': '',
-                    'description': '',
-                    'maintainer': ''
-                }
-
-                for line in content.splitlines():
-                    line = line.strip()
-
-                    # Skip empty lines
-                    if not line:
-                        continue
-
-                    # Parse metadata from comments
-                    if line.startswith('#'):
-                        lower_line = line.lower()
-                        if 'category:' in lower_line:
-                            metadata['category'] = line.split('Category:', 1)[1].strip()
-                        elif 'description:' in lower_line:
-                            metadata['description'] = line.split('Description:', 1)[1].strip()
-                        elif 'maintainer:' in lower_line:
-                            metadata['maintainer'] = line.split('Maintainer:', 1)[1].strip()
-                        continue
-
-                    # Skip invalid lines
-                    if not any(c in '0123456789./:' for c in line):
-                        continue
-
-                    try:
-                        # Validate IP/network
-                        if '/' in line:
-                            ipaddress.ip_network(line, strict=False)
-                        else:
-                            ipaddress.ip_address(line)
-                        ips.add(line)
-                    except ValueError as e:
-                        logging.warning(f'Invalid IP/network in {ipset_name}: {line} ({e})')
-                        continue
-
-                return {
-                    'ips': ips,
-                    'metadata': metadata
-                }
-
-        except Exception as e:
-            logging.error(f'Error fetching {ipset_name}: {e}')
-            continue
-
-    return None
-
-
-async def get_all_ipsets(session: aiohttp.ClientSession) -> list:
-    '''
-    Fetch list of all available ipsets from the Firehol repository.
-
-    :param session: aiohttp client session
-    :return: List of ipset names
-    '''
-    try:
-        headers = {'Accept': 'application/vnd.github.v3+json'}
-        async with session.get(FIREHOL_API_URL, headers=headers) as response:
-            if response.status != 200:
-                logging.error(f'Failed to fetch ipset list: HTTP {response.status}')
-                return []
-
-            data = await response.json()
-            ipsets = []
-
-            for item in data['tree']:
-                filename = item['path']
-                # Include only .netset and .ipset files, exclude directories and other files
-                if filename.endswith(('.netset', '.ipset')) and not any(x in filename for x in [
-                    '_log', '_report', '_latest', '_1d', '_7d', '_30d', '_90d', '_180d', '_360d', '_720d',
-                    'README', 'COPYING', 'LICENSE', 'excluded'
-                ]):
-                    ipsets.append(filename.rsplit('.', 1)[0])
-
-            logging.info(f'Found {len(ipsets)} ipsets')
-            return ipsets
-
+                # Yield the valid IP/network
+                yield line
+
     except Exception as e:
-        logging.error(f'Error fetching ipset list: {e}')
-        return []
+        logging.error(f'Error streaming IPs from {file_path}: {e}')
 
 
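Because stream_ips() yields validated CIDR strings one at a time, it can be exercised on its own once the repository has been cloned. A quick, illustrative check (the file name is hypothetical; any .ipset or .netset file under REPO_PATH works):

# Illustrative only: count the networks a single list yields after validation.
sample = os.path.join(REPO_PATH, 'firehol_level1.netset')  # hypothetical file name

total = 0
for entry in stream_ips(sample):
    total += 1  # each entry is a normalized CIDR string, e.g. '203.0.113.0/24'
print(f'{sample}: {total} entries')
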
-async def process_data(input_path: str = None):
+async def process_data(input_path = None):
     '''
     Process Firehol ipsets and yield records for indexing.
 
-    :param input_path: Optional path to local file (not used for Firehol ingestion)
+    :param input_path: Placeholder for uniformity
     '''
 
-    # Create a client session
-    async with aiohttp.ClientSession() as session:
-        # Get list of all available ipsets
-        ipset_names = await get_all_ipsets(session)
+    # Update the repository
+    update_repo()
 
-        if not ipset_names:
-            logging.error('No ipsets found')
-            return
-        for ipset_name in ipset_names:
-            logging.info(f'Fetching {ipset_name}...')
+    # Get all files
+    files = []
+    for filename in os.listdir(REPO_PATH):
+        if filename.endswith(('.ipset', '.netset')):
+            if any(filename.rsplit('.', 1)[0].endswith(x) for x in IGNORES):
+                logging.debug(f'Ignoring {filename} because it ends with {IGNORES}')
 
-            result = await fetch_ipset(session, ipset_name)
-            if not result:
-                logging.warning(f'Failed to fetch {ipset_name}')
                 continue
+            files.append(os.path.join(REPO_PATH, filename))
 
-            ips = result['ips']
-            metadata = result['metadata']
+    logging.info(f'Processing {len(files)} files...')
+
+    # Dictionary to store unique IPs and their metadata
+    ip_records = {}
 
+    # Process each file
+    for file_path in files:
+        logging.info(f'Processing {os.path.basename(file_path)}...')
+
+        # Get the ipset name
+        ipset_name = os.path.splitext(os.path.basename(file_path))[0]
+
+        # Extract category if present
+        category = None
+        with open(file_path) as f:
+            for line in f:
+                if match := re.search(r'^#\s*Category\s*:\s*(.+)$', line, re.IGNORECASE):
+                    category = match.group(1).strip()
+                    break
+
+        # Stream IPs from the file
+        for ip in stream_ips(file_path):
+            # Initialize record if IP not seen before
+            if ip not in ip_records:
+                ip_records[ip] = {'ip': ip, 'ipsets': set(), 'categories': set()}
 
-            timestamp = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
+            # Update arrays
+            ip_records[ip]['ipsets'].add(ipset_name)
+            if category:
+                ip_records[ip]['categories'].add(category)
 
-            for ip in ips:
-                document = {
-                    'ip' : ip.split('/')[0] if '/' in ip else ip,
-                    'ipset' : ipset_name,
-                    'category' : metadata['category'],
-                    'description' : metadata['description'],
-                    'maintainer' : metadata['maintainer'],
-                    'source' : 'firehol',
-                    'seen' : timestamp,
-                    'last_updated' : timestamp
-                }
+    # Yield unique records with converted sets to lists
+    for ip, record in ip_records.items():
+        # Convert sets to lists for JSON serialization
+        record['ipsets'] = list(record['ipsets'])
+        record['categories'] = list(record['categories'])
+        record['seen'] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
+
+        # Yield the document with _id set to the IP
+        yield {'_index': default_index, '_id': ip, '_source': record}
 
-                if '/' in ip:
-                    document['network'] = ip
 
-                yield {'_index': default_index, '_source': document}
 
 async def test():
     '''Test the ingestion process'''
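process_data() is now an async generator of bulk actions keyed on the CIDR (_id), each carrying the ip range, the ipsets and categories arrays, and a seen timestamp. A minimal sketch of draining it into Elasticsearch with the async bulk helper, assuming this module and the elasticsearch[async] client (not part of this commit):

# Sketch only: feed the yielded actions to the async bulk helper.
# Assumes this module's process_data() and pip install "elasticsearch[async]".
import asyncio
from elasticsearch import AsyncElasticsearch
from elasticsearch.helpers import async_bulk

async def ingest():
    es = AsyncElasticsearch('http://localhost:9200')
    try:
        # async_bulk consumes the async generator directly; each action looks like
        # {'_index': 'eris-firehol', '_id': '203.0.113.0/24', '_source': {...}} (values illustrative)
        ok, errors = await async_bulk(es, process_data(), raise_on_error=False)
        logging.info(f'Indexed {ok} documents ({len(errors)} errors)')
    finally:
        await es.close()

asyncio.run(ingest())
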
@@ -204,25 +168,4 @@ async def test():
 if __name__ == '__main__':
     import asyncio
     logging.basicConfig(level=logging.INFO)
     asyncio.run(test())
-
-
-
-'''
-Output Example:
-
-{
-    "_index": "eris-firehol",
-    "_source": {
-        "ip" : "1.2.3.4",
-        "network" : "1.2.3.0/24",
-        "ipset" : "firehol_level1",
-        "category" : "attacks",
-        "description" : "Basic protection against attacks",
-        "maintainer" : "FireHOL team",
-        "source" : "firehol",
-        "seen" : "2024-03-05T12:00:00Z",
-        "last_updated" : "2024-03-05T12:00:00Z"
-    }
-}
-'''
@@ -22,17 +22,6 @@ def construct_map() -> dict:
     # Match on exact value or full text search
     keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }
 
-    # Construct the geoip mapping (Used with the geoip pipeline to enrich the data)
-    geoip_mapping = {
-        'city_name' : keyword_mapping,
-        'continent_name' : keyword_mapping,
-        'country_iso_code' : keyword_mapping,
-        'country_name' : keyword_mapping,
-        'location' : { 'type': 'geo_point' },
-        'region_iso_code' : keyword_mapping,
-        'region_name' : keyword_mapping,
-    }
-
     # Construct the index mapping
     mapping = {
         'mappings': {
@@ -42,8 +31,8 @@ def construct_map() -> dict:
                 'proto' : { 'type': 'keyword' },
                 'service' : { 'type': 'keyword' },
                 'banner' : keyword_mapping,
+                'ttl' : { 'type': 'integer' },
                 'seen' : { 'type': 'date' }
-                #'geoip' : { 'properties': geoip_mapping }
             }
         }
     }
@@ -83,6 +72,7 @@ async def process_data(input_path: str):
                 'ip' : record['ip'],
                 'port' : record['port'],
                 'proto' : record['proto'],
+                'ttl' : record['ttl'],
                 'seen' : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(int(record['timestamp'])))
             }
 
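With ttl stored as an integer on each record, indexed documents can be filtered or aggregated on it. An illustrative query sketch, assuming the 8.x elasticsearch Python client (index name and threshold are hypothetical; not part of this commit):

# Illustrative only: find documents whose response carried a TTL of 64 or less.
from elasticsearch import Elasticsearch

es = Elasticsearch('http://localhost:9200')
index_name = 'eris-scans'  # hypothetical; use this ingestor's default_index

results = es.search(index=index_name, query={'range': {'ttl': {'lte': 64}}}, size=10)
print(results['hits']['total'])
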
@@ -42,76 +42,75 @@ async def process_data(input_path: str):
     :param input_path: Path to the input file
     '''
 
+    # Open the input file
     async with aiofiles.open(input_path) as input_file:
 
         # Cache the last document to avoid creating a new one for the same IP address
         last = None
 
-        try:
-            # Read the input file line by line
-            async for line in input_file:
-                line = line.strip()
-
-                # Sentinel value to indicate the end of a process (for closing out a FIFO stream)
-                if line == '~eof':
-                    yield last
-                    break
-
-                # Skip empty lines (doubtful we will have any, but just in case)
-                if not line:
-                    continue
-
-                # Split the line into its parts
-                parts = line.split()
-
-                # Ensure the line has at least 3 parts
-                if len(parts) < 3:
-                    logging.warning(f'Invalid PTR record: {line}')
-                    continue
-
-                # Split the PTR record into its parts
-                name, record_type, record = parts[0].rstrip('.'), parts[1], ' '.join(parts[2:]).rstrip('.')
-
-                # Do not index other records
-                if record_type != 'PTR':
-                    continue
-
-                # Do not index PTR records that do not have a record
-                if not record:
-                    continue
-
-                # Do not index PTR records that have the same record as the in-addr.arpa domain
-                if record == name:
-                    continue
-
-                # Get the IP address from the in-addr.arpa domain
-                ip = '.'.join(name.replace('.in-addr.arpa', '').split('.')[::-1])
-
-                # Check if we are still processing the same IP address
-                if last:
-                    if ip == last['_id']: # This record is for the same IP address as the cached document
-                        last_records = last['doc']['record']
-                        if record not in last_records: # Do not index duplicate records
-                            last['doc']['record'].append(record)
-                        continue
-                    else:
-                        yield last # Return the last document and start a new one
-
-                # Cache the document
-                last = {
-                    '_op_type' : 'update',
-                    '_id' : ip,
-                    '_index' : default_index,
-                    'doc' : {
-                        'ip' : ip,
-                        'record' : [record],
-                        'seen' : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
-                    },
-                    'doc_as_upsert' : True # Create the document if it does not exist
-                }
-
-        except Exception as e:
-            logging.error(f'Error processing data: {e}')
+        # Read the input file line by line
+        async for line in input_file:
+
+            # Strip whitespace
+            line = line.strip()
+
+            # Skip empty lines
+            if not line:
+                continue
+
+            # Sentinel value to indicate the end of a process (for closing out a FIFO stream)
+            if line == '~eof':
+                yield last
+                break
+
+            # Split the line into its parts
+            parts = line.split()
+
+            # Ensure the line has at least 3 parts
+            if len(parts) < 3:
+                logging.warning(f'Invalid PTR record: {line}')
+                continue
+
+            # Split the PTR record into its parts
+            name, record_type, record = parts[0].rstrip('.'), parts[1], ' '.join(parts[2:]).rstrip('.')
+
+            # Do not index other records
+            if record_type != 'PTR':
+                continue
+
+            # Do not index PTR records that do not have a record
+            if not record:
+                continue
+
+            # Do not index PTR records that have the same record as the in-addr.arpa domain
+            if record == name:
+                continue
+
+            # Get the IP address from the in-addr.arpa domain
+            ip = '.'.join(name.replace('.in-addr.arpa', '').split('.')[::-1])
+
+            # Check if we are still processing the same IP address
+            if last:
+                if ip == last['_id']: # This record is for the same IP address as the cached document
+                    last_records = last['doc']['record']
+                    if record not in last_records: # Do not index duplicate records
+                        last['doc']['record'].append(record)
+                    continue
+                else:
+                    yield last # Return the last document and start a new one
+
+            # Cache the document
+            last = {
+                '_op_type' : 'update',
+                '_id' : ip,
+                '_index' : default_index,
+                'doc' : {
+                    'ip' : ip,
+                    'record' : [record],
+                    'seen' : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
+                },
+                'doc_as_upsert' : True # Create the document if it does not exist
+            }
 
 
 async def test(input_path: str):
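Each cached document is emitted as an update action with doc_as_upsert, so the bulk helper creates the document the first time an IP is seen and later PTR records are merged into the same document. For clarity, one such action corresponds to the single update call sketched below, assuming the 8.x elasticsearch Python client (the IP and hostname are hypothetical; the ingestor itself streams these actions through the bulk helper instead):

# Illustrative only: the per-document equivalent of one yielded action.
from elasticsearch import Elasticsearch

es = Elasticsearch('http://localhost:9200')
es.update(
    index=default_index,   # this module's default index
    id='203.0.113.7',      # hypothetical IP
    doc={
        'ip'     : '203.0.113.7',
        'record' : ['host7.example.com'],  # hypothetical PTR value
        'seen'   : '2024-03-05T12:00:00Z'
    },
    doc_as_upsert=True  # create the document if it does not exist yet
)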