#!/usr/bin/env python
# HTTPZ Crawler - Developed by acidvegas in Python (https://git.acid.vegas/httpz)
'''
BCUZ FUCK HTTPX PYTHON STILL GO HARD
'''
import argparse
import asyncio
import json
import random
import re
import logging
import ssl
import urllib.parse
import urllib.request

try:
    import aiodns
except ImportError:
    print('Missing required module \'aiodns\'. (pip install aiodns)')
    exit(1)

try:
    import aiohttp
except ImportError:
    print('Missing required module \'aiohttp\'. (pip install aiohttp)')
    exit(1)

# ANSI escape codes for colors
BLUE = '\033[34m'
CYAN = '\033[36m'
RED = '\033[91m'
GREEN = '\033[92m'
DARK_GREY = '\033[90m'
YELLOW = '\033[93m'
RESET = '\033[0m'

# Globals
DNS_SERVERS = None
args = None # Global args variable

def vlog(msg: str):
    '''
    Verbose logging only if enabled

    :param msg: Message to print to console
    '''
    if args.verbose:
        logging.info(msg)

def create_session(user_agent: str, timeout: int, proxy: str = None) -> dict:
    '''
    Create a custom aiohttp session

    :param user_agent: User agent to use for HTTP requests
    :param timeout: Timeout for HTTP requests
    :param proxy: Proxy to route requests through (aiohttp applies proxies per-request, so this is read from args at request time)
    '''
    # Disable certificate verification so we can still crawl hosts with broken or self-signed TLS
    ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

    headers = {'User-Agent': user_agent}
    connector = aiohttp.TCPConnector(ssl=ssl_context)

    session_params = {
        'connector': connector,
        'headers': headers,
        'timeout': aiohttp.ClientTimeout(total=timeout)
    }

    return session_params
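
# Illustrative usage only (the crawler's actual call site is process_file below):
#   async with aiohttp.ClientSession(**create_session('Mozilla/5.0', timeout=10)) as session:
#       ...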

def get_dns_servers() -> dict:
    '''Get a list of DNS servers to use for lookups.'''
    with urllib.request.urlopen('https://public-dns.info/nameservers.txt') as source:
        results = source.read().decode().split('\n')

    servers = [server.strip() for server in results if server.strip()] # Drop blank lines so they are not misfiled as IPv4 entries
    v4_servers = [server for server in servers if ':' not in server]
    v6_servers = [server for server in servers if ':' in server]

    return {'4': v4_servers, '6': v6_servers}
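
# The returned mapping looks like the following (actual entries depend on what
# public-dns.info serves at runtime):
#   {'4': ['8.8.8.8', '1.1.1.1', ...], '6': ['2001:4860:4860::8888', ...]}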

async def dns_lookup(domain: str, record_type: str, timeout: int, retry: int) -> list:
    '''
    Resolve DNS information from a domain

    :param domain: Domain name to resolve
    :param record_type: DNS record type to resolve
    :param timeout: Timeout for DNS request
    :param retry: Number of times to retry failed requests
    '''
    nameserver = None # Bound up-front so the error log below can never reference an unbound name
    for _ in range(retry):
        try:
            # A records go over IPv4 resolvers, AAAA over IPv6, anything else over a random pick
            version = '4' if record_type == 'A' else '6' if record_type == 'AAAA' else random.choice(['4', '6'])
            nameserver = random.choice(DNS_SERVERS[version])
            resolver = aiodns.DNSResolver(nameservers=[nameserver], timeout=timeout)
            records = await resolver.query(domain, record_type)
            return records.cname if record_type == 'CNAME' else [record.host for record in records]
        except Exception as e:
            vlog(f'{RED}[ERROR]{RESET} {domain} - Failed to resolve {record_type} record using {nameserver} {DARK_GREY}({str(e)}){RESET}')
    return []
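
# Illustrative call (example.com is just a placeholder domain):
#   records = await dns_lookup('example.com', 'A', timeout=5, retry=2)
# Returns a list of addresses on success (a CNAME string for CNAME queries), or [] once all retries fail.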

async def get_body(source: str, preview: int) -> str:
    '''
    Get the body of a webpage

    :param source: HTML source of the webpage
    :param preview: Number of bytes to preview
    '''
    body_content = re.search(r'<body.*?>(.*?)</body>', source[:5000], re.DOTALL | re.IGNORECASE) # Only scan the first 5,000 bytes to keep the regex cheap
    processed_content = body_content.group(1) if body_content else source
    clean_content = re.sub(r'<[^>]+>', '', processed_content) # Strip any remaining HTML tags
    return clean_content[:preview]
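
# Illustrative transform (hypothetical input):
#   await get_body('<html><body><h1>Hello</h1> world</body></html>', 500) -> 'Hello world'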

async def get_title(session: aiohttp.ClientSession, domain: str, max_redirects: int = 10) -> tuple:
    '''
    Get the title of a webpage and its status code

    :param session: aiohttp session
    :param domain: URL to get the title of
    :param max_redirects: Maximum number of redirects to follow before giving up
    '''
    title = None
    body = None
    status_code = None

    try:
        async with session.get(domain, timeout=args.timeout, allow_redirects=False, proxy=args.proxy) as response:
            status_code = response.status
            if status_code in (200, 201):
                html_content = await response.text()
                match = re.search(r'<title>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
                title = match.group(1).strip() if match else None
                title = bytes(title, 'utf-8').decode('unicode_escape') if title else None # Unescape any literal escape sequences
                title = re.sub(r'[\r\n]+', ' ', title)[:300] if title else None # Collapse newlines & cap the title length
                body = await get_body(html_content, args.preview)
                body = re.sub(r'\s+', ' ', body).strip() if body else None
            elif status_code in (301, 302, 303, 307, 308):
                redirect_url = response.headers.get('Location')
                if not redirect_url:
                    vlog(f'{RED}[ERROR]{RESET} No redirect URL found for {domain} {DARK_GREY}({status_code}){RESET}')
                elif max_redirects > 0:
                    redirect_url = urllib.parse.urljoin(domain, redirect_url) # Resolve relative Location headers against the current URL
                    vlog(f'{YELLOW}[WARN]{RESET} {domain} -> {redirect_url} {DARK_GREY}({status_code}){RESET}')
                    return await get_title(session, redirect_url, max_redirects - 1)
                else:
                    vlog(f'{RED}[ERROR]{RESET} {domain} - Too many redirects {DARK_GREY}({status_code}){RESET}')
            else:
                vlog(f'{RED}[ERROR]{RESET} {domain} - Invalid status code {DARK_GREY}{status_code}{RESET}')
    except asyncio.TimeoutError:
        vlog(f'{RED}[ERROR]{RESET} {domain} - HTTP request timed out')
    except Exception as e:
        vlog(f'{RED}[ERROR]{RESET} Failed to get title for {domain} {DARK_GREY}({e}){RESET}')

    return title, body, status_code
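
# get_title always resolves to a (title, body, status_code) tuple, with None for
# anything that could not be retrieved, e.g. ('Example Domain', 'This domain is ...', 200).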

async def check_url(session: aiohttp.ClientSession, domain: str) -> tuple:
    '''
    Process a domain name

    :param session: aiohttp session
    :param domain: URL to get the title of
    '''
    dns_records = {}

    for record_type in ('A', 'AAAA'):
        records = await dns_lookup(domain, record_type, args.timeout, args.retry)
        if records:
            dns_records[record_type] = records

    if not dns_records:
        cname_record = await dns_lookup(domain, 'CNAME', args.timeout, args.retry)
        if cname_record:
            dns_records['CNAME'] = cname_record
            domain = cname_record # Follow the CNAME target instead
        else:
            vlog(f'{RED}[ERROR]{RESET} No DNS records found for {domain}')
            return domain, None, None, None, None, None

    title, body, status_code = await get_title(session, f'https://{domain}')
    protocol = 'https'
    if not title and not body: # Fall back to plain HTTP if HTTPS yielded nothing
        title, body, status_code = await get_title(session, f'http://{domain}')
        protocol = 'http'

    if title or body:
        # Colorize a copy for console output only, so the raw status code is what lands in the JSON results
        if status_code in (200, 201):
            colored_status = f'[{GREEN}{status_code}{RESET}]'
        elif status_code in (301, 302, 303, 307, 308):
            colored_status = f'[{YELLOW}{status_code}{RESET}]'
        else:
            colored_status = f'[{status_code}]'
        logging.info(f'{domain} {colored_status} [{CYAN}{title}{RESET}] - [{BLUE}{body}{RESET}]')
        return domain, protocol, title, body, dns_records, status_code
    else:
        vlog(f'{RED}[ERROR]{RESET} {domain} - Failed to retrieve title')

    return domain, None, None, None, None, status_code
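
# check_url resolves to a 6-tuple of (domain, protocol, title, body, dns_records, status_code),
# e.g. ('example.com', 'https', 'Example Domain', '...', {'A': ['93.184.216.34']}, 200).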

async def process_file():
    '''
    Process a list of domains from file
    '''
    session_params = create_session(args.user_agent, args.timeout, args.proxy)

    async with aiohttp.ClientSession(**session_params) as session:
        tasks = set()
        with open(args.file, 'r') as file:
            for line in file:
                domain = line.strip()
                if domain:
                    tasks.add(asyncio.create_task(check_url(session, domain)))

                    if len(tasks) >= args.concurrency: # Sliding window: wait for a slot to free up before queuing more
                        done, tasks = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
                        for task in done:
                            domain, protocol, title, body, dns_records, status_code = task.result()
                            if title or body or dns_records:
                                write_result_to_file(domain, protocol, title, body, dns_records, status_code)

        if tasks: # Drain whatever is still in flight once the file is exhausted
            done, _ = await asyncio.wait(tasks)
            for task in done:
                domain, protocol, title, body, dns_records, status_code = task.result()
                if title or body or dns_records:
                    write_result_to_file(domain, protocol, title, body, dns_records, status_code)

def write_result_to_file(domain, protocol, title, body, dns_records, status_code):
    '''
    Write a single domain result to file

    :param domain: Domain name
    :param protocol: Protocol used (http or https)
    :param title: Title of the domain
    :param body: Body preview of the domain
    :param dns_records: DNS records of the domain
    :param status_code: HTTP status code
    '''
    result = {
        'domain': domain,
        'protocol': protocol,
        'status_code': status_code,
        'title': title,
        'body': body,
        'dns_records': dns_records
    }
    with open(args.output, 'a') as f:
        json.dump(result, f)
        f.write('\n')
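
# Results are appended as one JSON object per line (JSONL), e.g.:
#   {"domain": "example.com", "protocol": "https", "status_code": 200, "title": "Example Domain", "body": "...", "dns_records": {"A": ["93.184.216.34"]}}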

def main():
    global DNS_SERVERS, args

    parser = argparse.ArgumentParser(description='Check URLs from a file asynchronously, perform DNS lookups and store results in JSON.')
    parser.add_argument('file', help='File containing list of domains')
    parser.add_argument('-c', '--concurrency', type=int, default=10, help='Number of concurrent requests')
    parser.add_argument('-m', '--memory_limit', type=int, default=1000, help='Number of results to store in memory before syncing to file')
    parser.add_argument('-o', '--output', default='results.json', help='Output file')
    parser.add_argument('-t', '--timeout', type=int, default=10, help='Timeout for HTTP requests')
    parser.add_argument('-u', '--user_agent', default='Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', help='User agent to use for HTTP requests')
    parser.add_argument('-x', '--proxy', type=str, help='Proxy to use for HTTP requests')
    parser.add_argument('-r', '--retry', type=int, default=2, help='Number of times to retry failed requests')
    parser.add_argument('-v', '--verbose', action='store_true', help='Increase output verbosity')
    parser.add_argument('-p', '--preview', type=int, default=500, help='Preview size in bytes for body & title (default: 500)')
    args = parser.parse_args()

    log_level = logging.INFO
    logging.basicConfig(level=log_level, format=f'{DARK_GREY}%(asctime)s{RESET} %(message)s', datefmt='%H:%M:%S')

    logging.info('Loading DNS servers...')
    DNS_SERVERS = get_dns_servers()
    if not DNS_SERVERS:
        logging.fatal('Failed to get DNS servers.')
        exit(1) # logging.fatal logs but does not terminate on its own

    logging.info(f'Found {len(DNS_SERVERS["4"])} IPv4 and {len(DNS_SERVERS["6"])} IPv6 DNS servers.')

    asyncio.run(process_file())

if __name__ == '__main__':
    main()
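
# Example invocation (assuming this script is saved as httpz.py and domains.txt
# holds one domain per line):
#   python httpz.py domains.txt -c 50 -t 15 -o results.json -v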