Removed _id from certstream yield and renamed ingest_certs.py to ingest_certstream.py

This commit is contained in:
Dionysus 2024-03-11 22:46:48 -04:00
parent 87f2cf27ea
commit de3878ef6b
Signed by: acidvegas
GPG Key ID: EF4B922DB85DC9DE
4 changed files with 229 additions and 202 deletions

View File

@ -24,14 +24,14 @@ python eris.py [options] <input>
| `--watch` | Create or watch a FIFO for real-time indexing | | `--watch` | Create or watch a FIFO for real-time indexing |
###### Elasticsearch arguments ###### Elasticsearch arguments
| Argument | Description | Default | | Argument | Description | Default |
|-----------------|---------------------------------------------------------|---------------------| |-----------------|---------------------------------------------------------|--------------------|
| `--host` | Elasticsearch host | `http://localhost/` | | `--host` | Elasticsearch host | `http://localhost` |
| `--port` | Elasticsearch port | `9200` | | `--port` | Elasticsearch port | `9200` |
| `--user` | Elasticsearch username | `elastic` | | `--user` | Elasticsearch username | `elastic` |
| `--password` | Elasticsearch password | `$ES_PASSWORD` | | `--password` | Elasticsearch password | `$ES_PASSWORD` |
| `--api-key` | Elasticsearch API Key for authentication | `$ES_APIKEY` | | `--api-key` | Elasticsearch API Key for authentication | `$ES_APIKEY` |
| `--self-signed` | Elasticsearch connection with a self-signed certificate | | | `--self-signed` | Elasticsearch connection with a self-signed certificate | |
###### Elasticsearch indexing arguments ###### Elasticsearch indexing arguments
| Argument | Description | Default | | Argument | Description | Default |
@ -50,20 +50,23 @@ python eris.py [options] <input>
| `--timeout` | Number of seconds to wait before retrying a chunk | `60` | | `--timeout` | Number of seconds to wait before retrying a chunk | `60` |
###### Ingestion arguments ###### Ingestion arguments
| Argument | Description | | Argument | Description |
|-------------|--------------------------| |---------------|--------------------------|
| `--certs` | Index Certstream records | | `--certstream` | Index Certstream records |
| `--httpx` | Index HTTPX records | | `--httpx` | Index HTTPX records |
| `--masscan` | Index Masscan records | | `--masscan` | Index Masscan records |
| `--massdns` | Index massdns records | | `--massdns` | Index massdns records |
| `--zone` | Index zone DNS records | | `--zone` | Index zone DNS records |
This ingestion suite will use the built in node sniffer, so by connecting to a single node, you can load balance across the entire cluster. ~~This ingestion suite will use the built in node sniffer, so by connecting to a single node, you can load balance across the entire cluster.~~
It is good to know how many nodes you have in the cluster to determine how to fine-tune the arguments for the best performance, based on your environment.
**Note:** The sniffer has been disabled for now due to an [issue](https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960) with the 8.x elasticsearch client. The auth headers are not properly sent when enabling the sniffer. A working [patch](https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960) was shared and has been *mostly* converted in [helpers/sniff_patch.py](./helpers/sniff_patch.py) for the async client.
## Roadmap ## Roadmap
- Create a module for RIR database ingestion *(WHOIS, delegations, transfer, ASN mapping, peering, etc)* - Create a module for RIR database ingestion *(WHOIS, delegations, transfer, ASN mapping, peering, etc)*
- Dynamically update the batch metrics when the sniffer adds or removes nodes. - Dynamically update the batch metrics when the sniffer adds or removes nodes.
- Fix issue with leftover FIFO files *(catch SIGTERM / SIGINT signals)*
- Create a working patch for the async client to properly send auth headers.
___ ___

388
eris.py
View File

@ -12,240 +12,264 @@ import sys
sys.dont_write_bytecode = True sys.dont_write_bytecode = True
try: try:
# This is commented out because there is a bug with the elasticsearch library that requires a patch (see initialize() method below) from elasticsearch import AsyncElasticsearch
#from elasticsearch import AsyncElasticsearch from elasticsearch.exceptions import NotFoundError
from elasticsearch.exceptions import NotFoundError from elasticsearch.helpers import async_streaming_bulk
from elasticsearch.helpers import async_streaming_bulk
except ImportError: except ImportError:
raise ImportError('Missing required \'elasticsearch\' library. (pip install elasticsearch)') raise ImportError('Missing required \'elasticsearch\' library. (pip install elasticsearch)')
# Setting up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d %I:%M:%S')
class ElasticIndexer: class ElasticIndexer:
def __init__(self, args: argparse.Namespace): def __init__(self, args: argparse.Namespace):
''' '''
Initialize the Elastic Search indexer. Initialize the Elastic Search indexer.
:param args: Parsed arguments from argparse :param args: Parsed arguments from argparse
''' '''
self.chunk_max = args.chunk_max * 1024 * 1024 # MB self.chunk_max = args.chunk_max * 1024 * 1024 # MB
self.chunk_size = args.chunk_size self.chunk_size = args.chunk_size
self.es = None self.es_index = args.index
self.es_index = args.index
self.es_config = { # Sniffing disabled due to an issue with the elasticsearch 8.x client (https://github.com/elastic/elasticsearch-py/issues/2005)
'hosts': [f'{args.host}:{args.port}'], es_config = {
'verify_certs': args.self_signed, #'hosts' : [f'{args.host}:{args.port}'],
'ssl_show_warn': args.self_signed, 'hosts' : [f'{args.host}:{port}' for port in ('9200', '9201', '9202')], # Temporary alternative to sniffing
'request_timeout': args.timeout, 'verify_certs' : args.self_signed,
'max_retries': args.retries, 'ssl_show_warn' : args.self_signed,
'retry_on_timeout': True, 'request_timeout' : args.timeout,
'sniff_on_start': False, # Problems when True.... 'max_retries' : args.retries,
'sniff_on_node_failure': True, 'retry_on_timeout' : True
'min_delay_between_sniffing': 60 # Add config option for this? #'sniff_on_start': True,
} #'sniff_on_node_failure': True,
#'min_delay_between_sniffing': 60
}
#if args.api_key: if args.api_key:
# self.es_config['api_key'] = (args.api_key, '') # Verify this is correct es_config['api_key'] = (args.api_key, '') # Verify this is correct
#else: else:
self.es_config['basic_auth'] = (args.user, args.password) es_config['basic_auth'] = (args.user, args.password)
self.es = AsyncElasticsearch(**es_config)
async def initialize(self): async def create_index(self, map_body: dict, pipeline: str = None, replicas: int = 1, shards: int = 1):
'''Initialize the Elasticsearch client.''' '''
Create the Elasticsearch index with the defined mapping.
# Patching the Elasticsearch client to fix a bug with sniffing (https://github.com/elastic/elasticsearch-py/issues/2005#issuecomment-1645641960) :param map_body: Mapping for the index
import sniff_patch :param pipeline: Name of the ingest pipeline to use for the index
self.es = await sniff_patch.init_elasticsearch(**self.es_config) :param replicas: Number of replicas for the index
:param shards: Number of shards for the index
'''
# Remove the above and uncomment the below if the bug is fixed in the Elasticsearch client: if await self.es.indices.exists(index=self.es_index):
#self.es = AsyncElasticsearch(**es_config) logging.info(f'Index \'{self.es_index}\' already exists.')
return
mapping = map_body
mapping['settings'] = {
'number_of_shards' : shards,
'number_of_replicas' : replicas
}
if pipeline:
try:
await self.es.ingest.get_pipeline(id=pipeline)
logging.info(f'Using ingest pipeline \'{pipeline}\' for index \'{self.es_index}\'')
mapping['settings']['index.default_pipeline'] = pipeline
except NotFoundError:
raise ValueError(f'Ingest pipeline \'{pipeline}\' does not exist.')
response = await self.es.indices.create(index=self.es_index, body=mapping)
if response.get('acknowledged') and response.get('shards_acknowledged'):
logging.info(f'Index \'{self.es_index}\' successfully created.')
else:
raise Exception(f'Failed to create index. ({response})')
async def create_index(self, map_body: dict, pipeline: str = '', replicas: int = 1, shards: int = 1): async def get_cluster_health(self) -> dict:
''' '''Get the health of the Elasticsearch cluster.'''
Create the Elasticsearch index with the defined mapping.
:param map_body: Mapping for the index return await self.es.cluster.health()
:param pipeline: Name of the ingest pipeline to use for the index
:param replicas: Number of replicas for the index
:param shards: Number of shards for the index
'''
if await self.es.indices.exists(index=self.es_index):
logging.info(f'Index \'{self.es_index}\' already exists.')
return
mapping = map_body
mapping['settings'] = {
'number_of_shards': shards,
'number_of_replicas': replicas
}
if pipeline:
try:
await self.es.ingest.get_pipeline(id=pipeline)
logging.info(f'Using ingest pipeline \'{pipeline}\' for index \'{self.es_index}\'')
mapping['settings']['index.default_pipeline'] = pipeline
except NotFoundError:
raise ValueError(f'Ingest pipeline \'{pipeline}\' does not exist.')
response = await self.es.indices.create(index=self.es_index, body=mapping)
if response.get('acknowledged') and response.get('shards_acknowledged'):
logging.info(f'Index \'{self.es_index}\' successfully created.')
else:
raise Exception(f'Failed to create index. ({response})')
async def get_cluster_health(self) -> dict: async def get_cluster_size(self) -> int:
'''Get the health of the Elasticsearch cluster.''' '''Get the number of nodes in the Elasticsearch cluster.'''
return await self.es.cluster.health() cluster_stats = await self.es.cluster.stats()
number_of_nodes = cluster_stats['nodes']['count']['total']
return number_of_nodes
async def get_cluster_size(self) -> int: async def process_data(self, file_path: str, data_generator: callable):
'''Get the number of nodes in the Elasticsearch cluster.''' '''
Index records in chunks to Elasticsearch.
cluster_stats = await self.es.cluster.stats() :param file_path: Path to the file
number_of_nodes = cluster_stats['nodes']['count']['total'] :param index_name: Name of the index
:param data_generator: Generator for the records to index
'''
return number_of_nodes count = 0
total = 0
try:
async for ok, result in async_streaming_bulk(self.es, actions=data_generator(file_path), chunk_size=self.chunk_size, max_chunk_bytes=self.chunk_max):
action, result = result.popitem()
if not ok:
logging.error(f'Failed to index document ({result["_id"]}) to {self.es_index} from {file_path} ({result})')
continue
count += 1
total += 1
if count == self.chunk_size:
logging.info(f'Successfully indexed {self.chunk_size:,} ({total:,} processed) records to {self.es_index} from {file_path}')
count = 0
logging.info(f'Finished indexing {total:,} records to {self.es_index} from {file_path}')
except Exception as e:
raise Exception(f'Failed to index records to {self.es_index} from {file_path} ({e})')
async def process_data(self, file_path: str, data_generator: callable): def setup_logger(name: str, level: logging._Level = logging.INFO, to_file: bool = False, max_bytes: int = 250000, backups: int = 7) -> logging.Logger:
''' '''
Index records in chunks to Elasticsearch. Setup a custom logger with options for console and file logging.
:param file_path: Path to the file :param name: Name of the logger.
:param index_name: Name of the index :param level: Logging level.
:param data_generator: Generator for the records to index :param to_file: Whether to log to a file.
''' :param max_bytes: Maximum size in bytes before rotating log file.
:param backups: Number of backup files to keep.
'''
count = 0 logger = logging.getLogger(name)
total = 0 logger.setLevel(level)
async for ok, result in async_streaming_bulk(self.es, actions=data_generator(file_path), chunk_size=self.chunk_size, max_chunk_bytes=self.chunk_max): formatter_console = logging.Formatter('%(asctime)s | %(levelname)9s | %(message)s', '%I:%M:%S')
action, result = result.popitem() formatter_file = logging.Formatter('%(asctime)s | %(levelname)9s | %(filename)s.%(funcName)s | %(message)s', '%Y-%m-%d %I:%M:%S')
if not ok: sh = logging.StreamHandler()
logging.error(f'Failed to index document ({result["_id"]}) to {self.es_index} from {file_path} ({result})') sh.setFormatter(formatter_console)
input('Press Enter to continue...') # Debugging (will possibly remove this since we have retries enabled) logger.addHandler(sh)
continue
count += 1 if to_file:
total += 1 os.makedirs('logs', exist_ok=True)
fh = logging.handlers.RotatingFileHandler('logs/debug.log', maxBytes=max_bytes, backupCount=backups, encoding='utf-8')
fh.setFormatter(formatter_file)
logger.addHandler(fh)
if count == self.chunk_size: return logger
logging.info(f'Successfully indexed {self.chunk_size:,} ({total:,} processed) records to {self.es_index} from {file_path}')
count = 0
logging.info(f'Finished indexing {total:,} records to {self.es_index} from {file_path}')
async def main(): async def main():
'''Main function when running this script directly.''' '''Main function when running this script directly.'''
parser = argparse.ArgumentParser(description='Index data into Elasticsearch.') parser = argparse.ArgumentParser(description='Elasticsearch Recon Ingestion Scripts (ERIS)')
# General arguments # General arguments
parser.add_argument('input_path', help='Path to the input file or directory') # Required parser.add_argument('input_path', help='Path to the input file or directory') # Required
parser.add_argument('--watch', action='store_true', help='Create or watch a FIFO for real-time indexing') parser.add_argument('--watch', action='store_true', help='Create or watch a FIFO for real-time indexing')
# Elasticsearch arguments # Elasticsearch arguments
parser.add_argument('--host', default='http://localhost/', help='Elasticsearch host') parser.add_argument('--host', default='http://localhost', help='Elasticsearch host')
parser.add_argument('--port', type=int, default=9200, help='Elasticsearch port') parser.add_argument('--port', type=int, default=9200, help='Elasticsearch port')
parser.add_argument('--user', default='elastic', help='Elasticsearch username') parser.add_argument('--user', default='elastic', help='Elasticsearch username')
parser.add_argument('--password', default=os.getenv('ES_PASSWORD'), help='Elasticsearch password (if not provided, check environment variable ES_PASSWORD)') parser.add_argument('--password', default=os.getenv('ES_PASSWORD'), help='Elasticsearch password (if not provided, check environment variable ES_PASSWORD)')
#parser.add_argument('--api-key', default=os.getenv('ES_APIKEY'), help='Elasticsearch API Key for authentication (if not provided, check environment variable ES_APIKEY)') parser.add_argument('--api-key', default=os.getenv('ES_APIKEY'), help='Elasticsearch API Key for authentication (if not provided, check environment variable ES_APIKEY)')
parser.add_argument('--self-signed', action='store_false', help='Elasticsearch is using self-signed certificates') parser.add_argument('--self-signed', action='store_false', help='Elasticsearch is using self-signed certificates')
# Elasticsearch indexing arguments # Elasticsearch indexing arguments
parser.add_argument('--index', help='Elasticsearch index name') parser.add_argument('--index', help='Elasticsearch index name')
parser.add_argument('--pipeline', help='Use an ingest pipeline for the index') parser.add_argument('--pipeline', help='Use an ingest pipeline for the index')
parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index') parser.add_argument('--replicas', type=int, default=1, help='Number of replicas for the index')
parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index') parser.add_argument('--shards', type=int, default=1, help='Number of shards for the index')
# Performance arguments # Performance arguments
parser.add_argument('--chunk-size', type=int, default=50000, help='Number of records to index in a chunk') parser.add_argument('--chunk-size', type=int, default=50000, help='Number of records to index in a chunk')
parser.add_argument('--chunk-max', type=int, default=100, help='Maximum size of a chunk in bytes') parser.add_argument('--chunk-max', type=int, default=100, help='Maximum size of a chunk in bytes')
parser.add_argument('--retries', type=int, default=100, help='Number of times to retry indexing a chunk before failing') parser.add_argument('--retries', type=int, default=30, help='Number of times to retry indexing a chunk before failing')
parser.add_argument('--timeout', type=int, default=60, help='Number of seconds to wait before retrying a chunk') parser.add_argument('--timeout', type=int, default=60, help='Number of seconds to wait before retrying a chunk')
# Ingestion arguments # Ingestion arguments
parser.add_argument('--cert', action='store_true', help='Index Certstream records') parser.add_argument('--certstream', action='store_true', help='Index Certstream records')
parser.add_argument('--httpx', action='store_true', help='Index Httpx records') parser.add_argument('--httpx', action='store_true', help='Index Httpx records')
parser.add_argument('--masscan', action='store_true', help='Index Masscan records') parser.add_argument('--masscan', action='store_true', help='Index Masscan records')
parser.add_argument('--massdns', action='store_true', help='Index Massdns records') parser.add_argument('--massdns', action='store_true', help='Index Massdns records')
parser.add_argument('--zone', action='store_true', help='Index Zone records') parser.add_argument('--zone', action='store_true', help='Index Zone records')
args = parser.parse_args() args = parser.parse_args()
if args.watch: if args.host.endswith('/'):
if not os.path.exists(args.input_path): args.host = args.host[:-1]
os.mkfifo(args.input_path)
elif os.path.exists(args.input_path) and stat.S_ISFIFO(os.stat(args.input_path).st_mode):
raise ValueError(f'Path {args.input_path} is not a FIFO')
elif not os.path.isdir(args.input_path) and not os.path.isfile(args.input_path):
raise FileNotFoundError(f'Input path {args.input_path} does not exist or is not a file or directory')
edx = ElasticIndexer(args) if args.watch:
await edx.initialize() # Initialize the Elasticsearch client asyncronously if not os.path.exists(args.input_path):
os.mkfifo(args.input_path)
elif not stat.S_ISFIFO(os.stat(args.input_path).st_mode):
raise ValueError(f'Path {args.input_path} is not a FIFO')
elif not os.path.isdir(args.input_path) and not os.path.isfile(args.input_path):
raise FileNotFoundError(f'Input path {args.input_path} does not exist or is not a file or directory')
if args.cert: edx = ElasticIndexer(args)
from ingestors import ingest_certs as ingestor
if args.httpx:
from ingestors import ingest_httpx as ingestor
elif args.masscan:
from ingestors import ingest_masscan as ingestor
elif args.massdns:
from ingestors import ingest_massdns as ingestor
elif args.zone:
from ingestors import ingest_zone as ingestor
if not isinstance(ingestor, object): if args.certstream:
raise ValueError('No ingestor selected') from ingestors import ingest_certstream as ingestor
elif args.httpx:
from ingestors import ingest_httpx as ingestor
elif args.masscan:
from ingestors import ingest_masscan as ingestor
elif args.massdns:
from ingestors import ingest_massdns as ingestor
elif args.zone:
from ingestors import ingest_zone as ingestor
else:
raise ValueError('No ingestor specified')
health = await edx.get_cluster_health() health = await edx.get_cluster_health()
print(health) logging.info(health)
await asyncio.sleep(5) # Delay to allow time for sniffing to complete await asyncio.sleep(5) # Delay to allow time for sniffing to complete
nodes = await edx.get_cluster_size() nodes = await edx.get_cluster_size()
logging.info(f'Connected to {nodes:,} Elasticsearch node(s)') logging.info(f'Connected to {nodes:,} Elasticsearch node(s)')
if not edx.es_index: if not edx.es_index:
edx.es_index = ingestor.default_index edx.es_index = ingestor.default_index
map_body = ingestor.construct_map() map_body = ingestor.construct_map()
await edx.create_index(map_body, args.pipeline, args.replicas, args.shards) await edx.create_index(map_body, args.pipeline, args.replicas, args.shards)
if os.path.isfile(args.input_path): if os.path.isfile(args.input_path):
logging.info(f'Processing file: {args.input_path}') logging.info(f'Processing file: {args.input_path}')
await edx.process_data(args.input_path, ingestor.process_data) await edx.process_data(args.input_path, ingestor.process_data)
elif stat.S_ISFIFO(os.stat(args.input_path).st_mode): elif os.path.isdir(args.input_path):
logging.info(f'Watching FIFO: {args.input_path}') count = 1
await edx.process_data(args.input_path, ingestor.process_data) total = len(os.listdir(args.input_path))
logging.info(f'Processing {total:,} files in directory: {args.input_path}')
elif os.path.isdir(args.input_path): for file in sorted(os.listdir(args.input_path)):
count = 1 file_path = os.path.join(args.input_path, file)
total = len(os.listdir(args.input_path)) if os.path.isfile(file_path):
logging.info(f'Processing {total:,} files in directory: {args.input_path}') logging.info(f'[{count:,}/{total:,}] Processing file: {file_path}')
for file in sorted(os.listdir(args.input_path)): await edx.process_data(file_path, ingestor.process_data)
file_path = os.path.join(args.input_path, file) count += 1
if os.path.isfile(file_path): else:
logging.info(f'[{count:,}/{total:,}] Processing file: {file_path}') logging.warning(f'[{count:,}/{total:,}] Skipping non-file: {file_path}')
await edx.process_data(file_path, ingestor.process_data)
count += 1
else:
logging.warning(f'[{count:,}/{total:,}] Skipping non-file: {file_path}')
if __name__ == '__main__': if __name__ == '__main__':
asyncio.run(main()) setup_logger('eris', level=logging.INFO, to_file=True)
print('')
print('┏┓┳┓┳┏┓ Elasticsearch Recon Ingestion Scripts')
print('┣ ┣┫┃┗┓ Developed by Acidvegas in Python')
print('┗┛┛┗┻┗┛ https://git.acid.vegas/eris')
print('')
asyncio.run(main())

View File

@ -23,12 +23,12 @@ async def init_elasticsearch(*args, **kwargs):
:param args: Async Elasticsearch positional arguments. :param args: Async Elasticsearch positional arguments.
:param kwargs: Async Elasticsearch keyword arguments. :param kwargs: Async Elasticsearch keyword arguments.
''' '''
async_client.default_sniff_callback = _override_sniff_callback(kwargs['basic_auth']) async_client.default_sniff_callback = await _override_sniff_callback(kwargs['basic_auth'])
return async_client.AsyncElasticsearch(*args, **kwargs) return async_client.AsyncElasticsearch(*args, **kwargs)
def _override_sniff_callback(basic_auth): async def _override_sniff_callback(basic_auth):
''' '''
Taken from https://github.com/elastic/elasticsearch-py/blob/8.8/elasticsearch/_sync/client/_base.py#L166 Taken from https://github.com/elastic/elasticsearch-py/blob/8.8/elasticsearch/_sync/client/_base.py#L166
Completely unmodified except for adding the auth header to the elastic request. Completely unmodified except for adding the auth header to the elastic request.

View File

@ -68,7 +68,7 @@ async def process_data(place_holder: str = None):
'seen' : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()) 'seen' : time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
} }
yield {'_id': id, '_index': default_index, '_source': struct} yield {'_index': default_index, '_source': struct}
except websockets.ConnectionClosed: except websockets.ConnectionClosed:
logging.error('Connection to Certstream was closed. Attempting to reconnect...') logging.error('Connection to Certstream was closed. Attempting to reconnect...')