#!/usr/bin/env python
# SockHub Proxy Scraper - Developed by acidvegas in Python (https://git.acid.vegas/proxytools)

'''
There is a file in this repository called proxy_sources.txt which contains a list of URLs to scrape for proxies.
This list is not maintained and may contain dead links or links to sites that no longer contain proxies.
'''

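# Example invocations (a sketch: the filename sockhub.py is an assumption, as
# this file's name is not shown; the flags match the argparse options defined
# under __main__ below, and example.com stands in for a real source URL):
#
#   python sockhub.py -i proxy_sources.txt
#   python sockhub.py -i https://example.com/raw-proxy-list.txt -o found.txt -c 25
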
import concurrent.futures
import logging
import os
import re
import urllib.request

# Global
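# Shared list of scraped proxies; every worker thread appends to it. CPython's
# GIL keeps the list extension itself safe, but the duplicate check in
# find_proxies() is not synchronized, so occasional duplicates are possible.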
proxies = list()

def find_proxies(url: str) -> None:
    '''
    Check a URL for IP:PORT proxies.

    :param url: The URL to check for proxies.
    '''
    global proxies # mutated below, so the global declaration is required
    try:
        source = urllib.request.urlopen(urllib.request.Request(url, headers={'User-Agent': 'SockHub/1.0'})).read().decode()
        if source:
            # Loose IPv4:PORT pattern; octet and port ranges are not validated
            found = set(re.findall(r'[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+', source, re.MULTILINE))
            if (new_proxies := [proxy for proxy in found if proxy not in proxies]):
                proxies += new_proxies
                print(f'found \033[32m{len(new_proxies):,}\033[0m new proxies on \033[34m{url}\033[0m')
        else:
            logging.warning(f'found \033[31m0\033[0m new proxies on \033[34m{url}\033[0m \033[30m(source is empty)\033[0m')
    except Exception as ex:
        logging.error(f'found \033[31m0\033[0m new proxies on \033[34m{url}\033[0m \033[30m({ex})\033[0m')


if __name__ == '__main__':
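    # Parse arguments, resolve the list of source URLs, scrape them
    # concurrently, then write the sorted results to the output file.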
    import argparse
    parser = argparse.ArgumentParser(description='SockHub Proxy Scraper - Developed by acidvegas in Python (https://git.acid.vegas/proxytools)')
    parser.add_argument('-i', '--input', required=True, help='input file containing a list of URLs to scrape (one per line) or a single URL')
    parser.add_argument('-o', '--output', help='output file to save proxies to', default='proxies.txt')
    parser.add_argument('-c', '--concurrency', help='number of concurrent threads to use (default: 10)', default=10, type=int)
    args = parser.parse_args()

    logging.basicConfig(format='%(levelname)s %(message)s', level=logging.INFO)

    if not os.path.isfile(args.input):
        if args.input.startswith('https://') or args.input.startswith('http://'):
            logging.info('using input as a single url...')
            proxy_sources = [args.input]
        else:
            raise SystemExit('input file does not exist!')
    else:
        # Read the URL list, skipping blank lines
        proxy_sources = [line.strip() for line in open(args.input, 'r').read().split('\n') if line.strip()]

    if not proxy_sources:
        raise SystemExit('proxy sources input file is empty!')

    logging.info(f'scanning \033[35m{len(proxy_sources):,}\033[0m urls from list...')

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.concurrency) as executor:
        futures = [executor.submit(find_proxies, url) for url in proxy_sources]
        concurrent.futures.wait(futures)

    if proxies:
        logging.info(f'found \033[32m{len(proxies):,}\033[0m total proxies!')
        proxies.sort()
        with open(args.output, 'w') as output_file:
            for proxy in proxies:
                output_file.write(proxy + '\n')
    else:
        logging.warning('no proxies found!')