Files
fcc-uls-database-dumper/fcc-uls-database-dumper.py
2025-11-26 02:27:09 -05:00

211 lines
7.8 KiB
Python

#!/usr/bin/env python
# FCC Universal Licensing System Database Dumper - Developed by acidvegas (https://github.com/acidvegas/fcc-uls-database-dumper)
import argparse
import asyncio
import logging
import os
import re
import zipfile
from datetime import datetime
from urllib.parse import urljoin
try:
import aiohttp
except ImportError:
raise ImportError('missing \'aiohttp\' library (pip install aiohttp)')
try:
from bs4 import BeautifulSoup
except ImportError:
raise ImportError('missing \'beautifulsoup4\' library (pip install beautifulsoup4)')
try:
from tqdm.asyncio import tqdm
except ImportError:
raise ImportError('missing \'tqdm\' library (pip install tqdm)')
# HTTP headers for all requests (browser-like so the FCC site serves the normal page)
# NOTE: 'br' is deliberately NOT advertised in Accept-Encoding. aiohttp can only decode
# brotli responses when the optional Brotli/brotlicffi package is installed, and this
# script neither requires nor checks for it; advertising 'br' anyway lets the server
# answer with a body aiohttp cannot decode. gzip/deflate are always supported.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
}
async def fetch_weekly_databases() -> dict:
    '''
    Scrape the FCC daily/weekly transactions page for the weekly database download links

    :return: dictionary mapping each category key to a dictionary of file type key -> absolute download URL
    '''

    site_root   = 'https://www.fcc.gov'
    listing_url = 'https://www.fcc.gov/uls/transactions/daily-weekly'

    def slugify(label: str) -> str:
        # Lower-case the label and squash every run of non-alphanumerics into one underscore
        return re.sub(r'[^a-z0-9]+', '_', label.lower()).strip('_')

    try:
        client_timeout = aiohttp.ClientTimeout(total=60)
        async with aiohttp.ClientSession(timeout=client_timeout) as session:
            async with session.get(listing_url, headers=HEADERS) as response:
                response.raise_for_status()
                markup = await response.text()

        document = BeautifulSoup(markup, 'html.parser')
        table    = document.find('table', class_='uls-transaction-weekly')
        if not table:
            raise Exception('Could not find weekly databases table on FCC website')

        urls   = {}
        active = None  # key of the most recent group-header row; link rows are filed under it

        for row in table.find_all('tr'):
            # A group-header row opens a new category
            if 'group-header' in row.get('class', []):
                active = slugify(row.find('th').text.strip())
                urls[active] = {}
                continue

            # Rows seen before any (non-empty) category header have nowhere to go
            if not active:
                continue

            # Only the first anchor of the row is used, and only when it carries an href
            anchor = row.find('a')
            if not (anchor and anchor.get('href')):
                continue

            file_key = slugify(anchor.text.strip())
            target   = urljoin(site_root, anchor['href'])
            urls[active][file_key] = target

        if not urls:
            raise Exception('No URLs found on FCC website')

        logging.info(f'Fetched {len(urls)} categories from FCC website')
        return urls

    except Exception as e:
        # Log for the operator, then let the caller see the original failure
        logging.error(f'Failed to fetch URLs from website: {e}')
        raise
def _extract_zip(zip_path: str, dest_dir: str) -> None:
    '''Extract every member of zip_path into dest_dir (blocking; intended to run in a worker thread)'''
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(dest_dir)


def _remove_if_exists(path: str) -> None:
    '''Best-effort delete of path: a missing file is fine, any other OS error is only logged'''
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as e:
        logging.debug(f'Could not remove {path}: {e}')


async def download_file(session: aiohttp.ClientSession, category: str, file_type: str, url: str, base_dir: str, pbar: tqdm, semaphore: asyncio.Semaphore):
    '''
    Download a single file from the given URL and extract it if it is a zip archive

    Zip archives are saved in the category directory, extracted into the file type
    directory and then deleted; any other file is saved directly in the file type
    directory. Failures are logged rather than raised, and the progress bar is
    advanced whether the download succeeded or failed.

    :param session: aiohttp client session
    :param category: category name for the file
    :param file_type: type of file (applications, licenses, etc.)
    :param url: URL of the file to download
    :param base_dir: base directory for all downloads
    :param pbar: progress bar instance
    :param semaphore: semaphore to limit concurrent downloads
    '''

    async with semaphore:
        filename     = os.path.basename(url)
        category_dir = os.path.join(base_dir, category)
        type_dir     = os.path.join(category_dir, file_type)
        is_zip       = filename.endswith('.zip')
        file_path    = os.path.join(category_dir if is_zip else type_dir, filename)

        # Creating type_dir also creates category_dir, which is where zip archives are staged
        os.makedirs(type_dir, exist_ok=True)

        try:
            async with session.get(url, headers=HEADERS, timeout=aiohttp.ClientTimeout(total=600)) as response:
                response.raise_for_status()
                with open(file_path, 'wb') as f:
                    # Stream the body to disk in 64 KiB pieces so it is never held in memory whole
                    async for chunk in response.content.iter_chunked(65536):
                        f.write(chunk)

            if is_zip:
                try:
                    # Extraction is blocking disk/CPU work; running it on the event loop thread
                    # would freeze every other in-flight download until it finished
                    loop = asyncio.get_running_loop()
                    await loop.run_in_executor(None, _extract_zip, file_path, type_dir)
                except zipfile.BadZipFile:
                    logging.error(f'Corrupted zip file for {category}/{file_type} ({filename})')
                    raise
                finally:
                    # The archive is only a staging artifact; drop it whether or not extraction worked
                    _remove_if_exists(file_path)

            pbar.set_description(f'Downloaded {category}/{file_type}')
            pbar.update(1)

        except Exception as e:
            logging.error(f'Failed to download/extract {category}/{file_type} ({filename}): {e}')
            # Do not leave a partial download behind; cleanup must not raise out of this handler
            _remove_if_exists(file_path)
            pbar.update(1)
async def main(concurrency: int = 5):
    '''
    Main function to download all FCC database files into assets/fcc_data/<YYYY-MM-DD>

    :param concurrency: maximum number of concurrent downloads (must be at least 1)
    :raises ValueError: if concurrency is less than 1
    '''

    # A semaphore created with 0 permits would leave every download waiting forever,
    # so reject bad values before creating directories or touching the network
    if concurrency < 1:
        raise ValueError(f'concurrency must be at least 1 (got {concurrency})')

    date_str = datetime.now().strftime('%Y-%m-%d')
    base_dir = f'assets/fcc_data/{date_str}'
    os.makedirs(base_dir, exist_ok=True)

    logging.info('Fetching weekly database URLs from FCC website...')
    urls = await fetch_weekly_databases()

    total_files = sum(len(files) for files in urls.values())
    semaphore   = asyncio.Semaphore(concurrency)

    logging.info(f'Starting download of {total_files} files with {concurrency} concurrent downloads')

    with tqdm(total=total_files, desc='Downloading FCC databases', unit='file') as pbar:
        # One shared session for every download; the semaphore caps how many run at once
        async with aiohttp.ClientSession() as session:
            tasks = []
            for category, files in urls.items():
                for file_type, url in files.items():
                    tasks.append(download_file(session, category, file_type, url, base_dir, pbar, semaphore))
            await asyncio.gather(*tasks)

    print(f'\nAll {total_files} files downloaded and extracted to {base_dir}')
if __name__ == '__main__':
    # Command-line entry point: -l only lists what is available, -c caps parallel downloads, -d enables debug logging
    parser = argparse.ArgumentParser(description='Download FCC ULS database files')
    parser.add_argument('-l', '--list', action='store_true', help='List available databases without downloading')
    parser.add_argument('-c', '--concurrency', type=int, default=5, help='Maximum concurrent downloads (default: 5)')
    parser.add_argument('-d', '--debug', action='store_true', help='Enable debug logging')
    args = parser.parse_args()

    if args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO

    logging.basicConfig(
        level   = log_level,
        format  = '%(asctime)s - %(levelname)s - %(message)s',
        datefmt = '%Y-%m-%d %H:%M:%S'
    )

    if not args.list:
        # Default mode: download and extract everything
        asyncio.run(main(args.concurrency))
    else:
        # Listing mode: show each category with its file URLs, then a summary line
        print('Fetching weekly database URLs from FCC website...\n')
        urls = asyncio.run(fetch_weekly_databases())

        for category, files in urls.items():
            print(f'\n{category}:')
            for file_type, url in files.items():
                print(f' {file_type}: {url}')

        total_files = sum(len(files) for files in urls.values())
        print(f'\n\nTotal: {total_files} files found across {len(urls)} categories')