211 lines
7.8 KiB
Python
211 lines
7.8 KiB
Python
#!/usr/bin/env python
|
|
# FCC Universal Licensing System Database Dumper - Developed by acidvegas (https://github.com/acidvegas/fcc-uls-database-dumper)
|
|
|
|
import argparse
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import re
|
|
import zipfile
|
|
|
|
from datetime import datetime
|
|
from urllib.parse import urljoin
|
|
|
|
try:
|
|
import aiohttp
|
|
except ImportError:
|
|
raise ImportError('missing \'aiohttp\' library (pip install aiohttp)')
|
|
|
|
try:
|
|
from bs4 import BeautifulSoup
|
|
except ImportError:
|
|
raise ImportError('missing \'beautifulsoup4\' library (pip install beautifulsoup4)')
|
|
|
|
try:
|
|
from tqdm.asyncio import tqdm
|
|
except ImportError:
|
|
raise ImportError('missing \'tqdm\' library (pip install tqdm)')
|
|
|
|
|
|
# HTTP headers sent with every request (browser-like so the FCC site serves the normal page).
# NOTE: 'br' is deliberately NOT advertised in Accept-Encoding. aiohttp can only decode
# brotli responses when the optional Brotli package is installed, and this script does not
# require it; advertising 'br' lets the server answer with a body aiohttp cannot decode.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
}
|
|
|
|
|
|
async def fetch_weekly_databases() -> dict:
    '''
    Fetch weekly database URLs from the FCC website

    Downloads the ULS daily/weekly transactions page, finds the table carrying
    the ``uls-transaction-weekly`` class and walks its rows in order. A row with
    the ``group-header`` class starts a new category; for every following row the
    first link found is recorded under that category, keyed by the link text.

    :return: dictionary mapping category key -> {file type key -> absolute download URL}
    :raises RuntimeError: if the weekly table is missing or no download link was found
    :raises Exception: any other error (network, HTTP status, parsing) is logged and re-raised
    '''

    base_url = 'https://www.fcc.gov'
    page_url = 'https://www.fcc.gov/uls/transactions/daily-weekly'

    try:
        timeout = aiohttp.ClientTimeout(total=60)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(page_url, headers=HEADERS) as response:
                response.raise_for_status()
                html = await response.text()

        soup = BeautifulSoup(html, 'html.parser')

        weekly_table = soup.find('table', class_='uls-transaction-weekly')
        if weekly_table is None:
            raise RuntimeError('Could not find weekly databases table on FCC website')

        urls = {}
        current_category = None

        for row in weekly_table.find_all('tr'):
            if 'group-header' in row.get('class', []):
                # Prefer the <th> cell, but fall back to the whole row's text so a
                # header row without <th> does not blow up with an AttributeError.
                header_cell = row.find('th')
                header_text = (header_cell if header_cell is not None else row).text
                category_key = _slugify_label(header_text)

                if not category_key:
                    # A header with no usable name cannot key a category; ignore the
                    # rows beneath it instead of filing them under an empty key.
                    logging.warning('Skipping weekly table group header without a usable name')
                    current_category = None
                    continue

                current_category = category_key
                urls[current_category] = {}

            elif current_category is not None:
                link = row.find('a')
                if link is not None and link.get('href'):
                    file_key = _slugify_label(link.text)
                    # hrefs may be site-relative; resolve them against the FCC host
                    urls[current_category][file_key] = urljoin(base_url, link['href'])

        # Categories without a single link are as useless as no categories at all.
        if not any(urls.values()):
            raise RuntimeError('No URLs found on FCC website')

        logging.info(f'Fetched {len(urls)} categories from FCC website')
        return urls

    except Exception as e:
        # Log once here so both the CLI list mode and the downloader report the cause,
        # then let the caller decide how to react.
        logging.error(f'Failed to fetch URLs from website: {e}')
        raise


def _slugify_label(text: str) -> str:
    '''
    Turn a human readable label into a lowercase key usable as a directory name

    Runs of characters other than a-z / 0-9 collapse into a single underscore and
    leading/trailing underscores are stripped.

    :param text: label text taken from the web page
    :return: normalized key (may be empty if the label has no alphanumerics)
    '''

    return re.sub(r'[^a-z0-9]+', '_', text.strip().lower()).strip('_')
|
|
|
|
|
|
async def download_file(session: aiohttp.ClientSession, category: str, file_type: str, url: str, base_dir: str, pbar: tqdm, semaphore: asyncio.Semaphore) -> bool:
    '''
    Download a single file from the given URL and extract if needed

    Zip archives are saved into the category directory, extracted into the
    file type directory and then deleted; any other file is saved directly
    into the file type directory. Failures are logged, never raised, and the
    progress bar advances by one either way.

    :param session: aiohttp client session
    :param category: category name for the file
    :param file_type: type of file (applications, licenses, etc.)
    :param url: URL of the file to download
    :param base_dir: base directory for all downloads
    :param pbar: progress bar instance
    :param semaphore: semaphore to limit concurrent downloads
    :return: True if the file was downloaded (and extracted, for zips), False on failure
    '''

    async with semaphore:
        filename     = os.path.basename(url)
        category_dir = os.path.join(base_dir, category)
        type_dir     = os.path.join(category_dir, file_type)
        is_zip       = filename.endswith('.zip')
        file_path    = os.path.join(category_dir if is_zip else type_dir, filename)

        try:
            os.makedirs(type_dir, exist_ok=True)

            async with session.get(url, headers=HEADERS, timeout=aiohttp.ClientTimeout(total=600)) as response:
                response.raise_for_status()
                with open(file_path, 'wb') as f:
                    async for chunk in response.content.iter_chunked(65536):
                        f.write(chunk)

            if is_zip:
                try:
                    # Extraction is blocking CPU/disk work; run it in the default
                    # executor so the other downloads keep streaming meanwhile.
                    loop = asyncio.get_running_loop()
                    await loop.run_in_executor(None, _extract_zip, file_path, type_dir)
                except zipfile.BadZipFile:
                    logging.error(f'Corrupted zip file for {category}/{file_type} ({filename})')
                    raise
                finally:
                    # The archive is only an intermediate artifact
                    _remove_if_file(file_path)

            pbar.set_description(f'Downloaded {category}/{file_type}')
            pbar.update(1)
            return True

        except Exception as e:
            logging.error(f'Failed to download/extract {category}/{file_type} ({filename}): {e}')
            _remove_if_file(file_path)  # do not leave a partial download behind
            pbar.update(1)
            return False


def _extract_zip(archive_path: str, target_dir: str) -> None:
    '''
    Extract every member of a zip archive into a directory (blocking)

    :param archive_path: path of the zip archive on disk
    :param target_dir: directory receiving the extracted members
    '''

    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(target_dir)


def _remove_if_file(path: str) -> None:
    '''
    Delete a path only if it is a regular file

    When the URL has no file name the computed path is a directory; calling
    os.remove on it would raise from inside the error handler.

    :param path: path to delete
    '''

    if os.path.isfile(path):
        os.remove(path)
|
|
|
|
|
|
async def main(concurrency: int = 5):
    '''
    Main function to download all FCC database files

    Creates ``assets/fcc_data/<YYYY-MM-DD>``, fetches the list of weekly
    databases, downloads every file with at most ``concurrency`` transfers in
    flight and prints a summary line when done.

    :param concurrency: maximum number of concurrent downloads (must be >= 1)
    :raises ValueError: if concurrency is smaller than 1
    '''

    # Semaphore(0) would make every download wait forever and a negative value
    # fails deep inside asyncio; reject both up front with a clear message.
    if concurrency < 1:
        raise ValueError(f'concurrency must be at least 1 (got {concurrency})')

    date_str = datetime.now().strftime('%Y-%m-%d')
    base_dir = f'assets/fcc_data/{date_str}'
    os.makedirs(base_dir, exist_ok=True)

    logging.info('Fetching weekly database URLs from FCC website...')
    urls = await fetch_weekly_databases()

    total_files = sum(len(files) for files in urls.values())
    semaphore   = asyncio.Semaphore(concurrency)

    logging.info(f'Starting download of {total_files} files with {concurrency} concurrent downloads')

    with tqdm(total=total_files, desc='Downloading FCC databases', unit='file') as pbar:
        async with aiohttp.ClientSession() as session:
            tasks = []
            for category, files in urls.items():
                for file_type, url in files.items():
                    tasks.append(download_file(session, category, file_type, url, base_dir, pbar, semaphore))
            results = await asyncio.gather(*tasks)

    # Only a result that is exactly False counts as a failed download; a task
    # that returns nothing is counted as completed.
    failed = sum(1 for result in results if result is False)

    if failed:
        print(f'\n{total_files - failed} of {total_files} files downloaded and extracted to {base_dir} ({failed} failed, see log output)')
    else:
        print(f'\nAll {total_files} files downloaded and extracted to {base_dir}')
|
|
|
|
|
|
|
|
# Command line entry point: either list the available weekly databases (-l)
# or download and extract all of them.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Download FCC ULS database files')
    parser.add_argument('-l', '--list', action='store_true', help='List available databases without downloading')
    parser.add_argument('-c', '--concurrency', type=int, default=5, help='Maximum concurrent downloads (default: 5)')
    parser.add_argument('-d', '--debug', action='store_true', help='Enable debug logging')

    args = parser.parse_args()

    # A concurrency of 0 would leave every download waiting on the semaphore
    # forever; fail fast with a normal argparse usage error instead.
    if args.concurrency < 1:
        parser.error('concurrency must be at least 1')

    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )

    if args.list:
        print('Fetching weekly database URLs from FCC website...\n')

        urls = asyncio.run(fetch_weekly_databases())

        for category, files in urls.items():
            print(f'\n{category}:')
            for file_type, url in files.items():
                print(f' {file_type}: {url}')

        total_files = sum(len(files) for files in urls.values())
        print(f'\n\nTotal: {total_files} files found across {len(urls)} categories')
    else:
        asyncio.run(main(args.concurrency))
|
|
|