eris/ingestors/ingest_masscan.py

#!/usr/bin/env python
# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)
# ingest_masscan.py

import json
import logging
import re
import time

default_index = 'masscan-logs'

def construct_map() -> dict:
    '''
    Build the Elasticsearch index mapping used for Masscan records.

    :return: A dictionary holding the index "mappings" definition
    '''

    # Full-text field with a keyword sub-field (values longer than 256 characters are not keyword-indexed)
    text_with_keyword = {
        'type': 'text',
        'fields': {
            'keyword': { 'type': 'keyword', 'ignore_above': 256 }
        }
    }

    # Fields populated by the GeoIP ingestion pipeline
    geoip_properties = {}
    for field in ('city_name', 'continent_name', 'country_iso_code', 'country_name'):
        geoip_properties[field] = text_with_keyword
    geoip_properties['location'] = { 'type': 'geo_point' }
    for field in ('region_iso_code', 'region_name'):
        geoip_properties[field] = text_with_keyword

    properties = {
        'ip':      { 'type': 'ip' },
        'port':    { 'type': 'integer' },
        'proto':   { 'type': 'keyword' },
        'service': { 'type': 'keyword' },
        'banner':  text_with_keyword,
        'ref_id':  { 'type': 'keyword' },
        'seen':    { 'type': 'date' },
        'geoip':   { 'properties': geoip_properties },
    }

    return { 'mappings': { 'properties': properties } }


def process_file(file_path: str):
    '''
    Read and process Masscan records from the log file.

    This is a generator: one document is yielded for every entry in a record's
    "ports" list. Lines that do not look like a JSON object are skipped, and
    lines that fail to parse are logged and skipped so unattended (FIFO/live)
    ingestion never blocks waiting on user input.

    :param file_path: Path to the Masscan log file
    :return: Yields dictionaries with the keys "ip", "port", "proto" & "seen", plus the optional "service" and either "ref_id" or "banner"
    '''

    # Compiled once per call instead of being looked up for every banner
    ref_id_pattern = re.compile(r'\(Ref\.Id: (.*?)\)')

    with open(file_path, 'r') as file:
        for line in file:
            # Records dumped as members of a JSON array end with a trailing comma, which json.loads() rejects
            line = line.strip().rstrip(',').rstrip()

            # Skips blank lines and array brackets such as "[" and "]"
            if not line.startswith('{'):
                continue

            try:
                record = json.loads(line)
            except json.decoder.JSONDecodeError:
                logging.error(f'Failed to parse JSON record! ({line})')
                continue

            # The timestamp is the same for every port in the record, so convert it to Zulu time once
            seen = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(int(record['timestamp'])))

            for port_info in record['ports']:
                struct = {
                    'ip': record['ip'],
                    'port': port_info['port'],
                    'proto': port_info['proto'],
                    'seen': seen,
                }

                # The service block is optional
                service = port_info.get('service')

                if service:
                    if 'name' in service and service['name'] != 'unknown':
                        struct['service'] = service['name']

                    if 'banner' in service:
                        banner = ' '.join(service['banner'].split()) # Remove extra whitespace
                        if banner:
                            # A banner carrying a Ref.Id is stored as "ref_id" instead of "banner"
                            match = ref_id_pattern.search(banner)
                            if match:
                                struct['ref_id'] = match.group(1)
                            else:
                                struct['banner'] = banner

                yield struct


'''
Example record:
{
    "ip": "43.134.51.142",
    "timestamp": "1705255468", # Convert to ZULU BABY
    "ports": [ # We will create a record for each port opened
        {
            "port": 22,
            "proto": "tcp",
            "service": { # This field is optional
                "name": "ssh",
                "banner": "SSH-2.0-OpenSSH_8.9p1 Ubuntu-3ubuntu0.4"
            }
        }
    ]
}

Will be indexed as:
{
    "ip": "43.134.51.142",
    "port": 22,
    "proto": "tcp",
    "service": "ssh",
    "banner": "SSH-2.0-OpenSSH_8.9p1 Ubuntu-3ubuntu0.4", # Omitted when a Ref.Id is extracted from the banner
    "seen": "2024-01-14T18:04:28Z",
    "ref_id": "?sKfOvsC4M4a2W8PaC4zF?", # TCP RST Payload, Might be useful.. (stored instead of the banner)

    # GeoIP ingestion pipeline fields
    "geoip": {
        "city_name": "City",
        "continent_name": "Continent",
        "country_iso_code": "CC",
        "country_name": "Country",
        "location": {
            "lat": 0.0000,
            "lon": 0.0000
        },
        "region_iso_code": "RR",
        "region_name": "Region"
    }
}
'''
Initial commit 2024-01-20 07:04:50 +00:00			`#!/usr/bin/env python`
			`# Elasticsearch Recon Ingestion Scripts (ERIS) - Developed by Acidvegas (https://git.acid.vegas/eris)`
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`# ingest_masscan.py`
Initial commit 2024-01-20 07:04:50 +00:00
			`import json`
			`import logging`
			`import re`
			`import time`

Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`default_index = 'masscan-logs'`

			`def construct_map() -> dict:`
			`'''Construct the Elasticsearch index mapping for Masscan records.'''`

			`keyword_mapping = { 'type': 'text', 'fields': { 'keyword': { 'type': 'keyword', 'ignore_above': 256 } } }`

			`mapping = {`
			`'mappings': {`
			`'properties': {`
			`'ip': { 'type': 'ip' },`
			`'port': { 'type': 'integer' },`
			`'proto': { 'type': 'keyword' },`
			`'service': { 'type': 'keyword' },`
			`'banner': keyword_mapping,`
			`'ref_id': { 'type': 'keyword' },`
			`'seen': { 'type': 'date' },`
			`'geoip': {`
			`'properties': {`
			`'city_name': keyword_mapping,`
			`'continent_name': keyword_mapping,`
			`'country_iso_code': keyword_mapping,`
			`'country_name': keyword_mapping,`
			`'location': { 'type': 'geo_point' },`
			`'region_iso_code': keyword_mapping,`
			`'region_name': keyword_mapping,`
			`}`
Initial commit 2024-01-20 07:04:50 +00:00			`}`
			`}`
			`}`
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`}`
Initial commit 2024-01-20 07:04:50 +00:00
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`return mapping`
Updated README, fixed issue using the wrong domain in records for zone file ingestion (woops) 2024-01-20 15:53:55 +00:00
Initial commit 2024-01-20 07:04:50 +00:00
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`def process_file(file_path: str):`
			`'''`
			`Read and process Masscan records from the log file.`
Updated README, fixed issue using the wrong domain in records for zone file ingestion (woops) 2024-01-20 15:53:55 +00:00
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`:param file_path: Path to the Masscan log file`
			`'''`
Initial commit 2024-01-20 07:04:50 +00:00
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`with open(file_path, 'r') as file:`
			`for line in file:`
			`line = line.strip()`
Updated README, copied over consistencies across the ingestors, docstring updates to reflect on new arguments 2024-01-27 09:28:30 +00:00
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`if not line or not line.startswith('{'):`
			`continue`
Updated README, copied over consistencies across the ingestors, docstring updates to reflect on new arguments 2024-01-27 09:28:30 +00:00
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`try:`
			`record = json.loads(line)`
			`except json.decoder.JSONDecodeError:`
			`logging.error(f'Failed to parse JSON record! ({line})')`
			`input('Press Enter to continue...') # Debugging`
			`continue`

			`for port_info in record['ports']:`
			`struct = {`
			`'ip': record['ip'],`
			`'port': port_info['port'],`
			`'proto': port_info['proto'],`
			`'seen': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(int(record['timestamp']))),`
			`}`
Added parallel bulk uploading, error handling, sniffing nodes for discovery, dynamic batch sizes, and more 2024-01-27 06:13:11 +00:00
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`if 'service' in port_info:`
			`if 'name' in port_info['service']:`
			`if port_info['service']['name'] != 'unknown':`
			`struct['service'] = port_info['service']['name']`
Added parallel bulk uploading, error handling, sniffing nodes for discovery, dynamic batch sizes, and more 2024-01-27 06:13:11 +00:00
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`if 'banner' in port_info['service']:`
			`banner = ' '.join(port_info['service']['banner'].split()) # Remove extra whitespace`
			`if banner:`
			`match = re.search(r'\(Ref\.Id: (.*?)\)', banner)`
			`if match:`
			`struct['ref_id'] = match.group(1)`
			`else:`
			`struct['banner'] = banner`
Added parallel bulk uploading, error handling, sniffing nodes for discovery, dynamic batch sizes, and more 2024-01-27 06:13:11 +00:00
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`yield struct`

			`return None # EOF`
Added parallel bulk uploading, error handling, sniffing nodes for discovery, dynamic batch sizes, and more 2024-01-27 06:13:11 +00:00
Initial commit 2024-01-20 07:04:50 +00:00

Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`'''`
			`Example record:`
			`{`
			`"ip": "43.134.51.142",`
			`"timestamp": "1705255468", # Convert to ZULU BABY`
			`"ports": [ # We will create a record for each port opened`
Initial commit 2024-01-20 07:04:50 +00:00			`{`
			`"port": 22,`
			`"proto": "tcp",`
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`"service": { # This field is optional`
			`"name": "ssh",`
			`"banner": "SSH-2.0-OpenSSH_8.9p1 Ubuntu-3ubuntu0.4"`
			`}`
Added parallel bulk uploading, error handling, sniffing nodes for discovery, dynamic batch sizes, and more 2024-01-27 06:13:11 +00:00			`}`
Ingestion agents are now modular, FIFO live ingestion added 2024-02-02 05:11:18 +00:00			`]`
			`}`

			`Will be indexed as:`
			`{`
			`"ip": "43.134.51.142",`
			`"port": 22,`
			`"proto": "tcp",`
			`"service": "ssh",`
			`"banner": "SSH-2.0-OpenSSH_8.9p1 Ubuntu-3ubuntu0.4",`
			`"seen": "2021-10-08T02:04:28Z",`
			`"ref_id": "?sKfOvsC4M4a2W8PaC4zF?", # TCP RST Payload, Might be useful..`

			`# GeoIP ingestion pipeline fields`
			`"geoip": {`
			`"city_name": "City",`
			`"continent_name": "Continent",`
			`"country_iso_code": "CC",`
			`"country_name": "Country",`
			`"location": {`
			`"lat": 0.0000,`
			`"lon": 0.0000`
			`},`
			`"region_iso_code": "RR",`
			`"region_name": "Region"`
			`}`
			`}`
			`'''`