g1mp/plugins/url_title_sniffer.py

130 lines
5.4 KiB
Python

# -*- coding: utf-8 -*-
"""
IRC3 Bot Plugin: URL Title Fetcher
A plugin for IRC3 bots that monitors chat messages for URLs, fetches their webpage titles, and displays them
with formatted styling in the chat. Provides visual enhancement to URL sharing in IRC channels.
Features:
- Asynchronous URL processing using aiohttp for efficient network operations
- Robust HTML parsing with lxml for accurate title extraction
- Configurable message styling with color and formatting options
- Built-in exclusion of YouTube URLs to avoid conflicts with dedicated YouTube plugins
- Error handling for network and parsing operations
- Proper resource cleanup through session management
- Queue-based processing system with strict rate limiting
Dependencies:
- aiohttp: For asynchronous HTTP requests
- irc3: Core IRC bot functionality
- ircstyle: IRC text formatting utilities
- lxml: HTML parsing capabilities
Author: Zodiac
Date: 2025-02-14
"""
import asyncio
import re
import time

import aiohttp
import ircstyle
import irc3
from irc3 import event
from irc3.compat import Queue
from lxml import html

from plugins.services.permissions import check_ignore


@irc3.plugin
class URLTitlePlugin:
    """Fetch and announce webpage titles for URLs posted in IRC messages.

    Incoming PRIVMSG text is scanned for URLs. Every non-YouTube URL is put
    on an internal queue, and a single background task drains that queue,
    fetches each page, extracts its ``<title>`` and posts a styled line back
    to the target the message was sent to.

    Attributes:
        bot: The irc3 bot instance this plugin is attached to.
        session: ``aiohttp.ClientSession`` reused for every HTTP request.
        url_pattern: Compiled regex matching ``http(s)://...`` and bare
            ``www....`` links.
        queue: Queue of ``(target, url)`` tuples awaiting processing.
        last_processed: ``time.time()`` stamp of the last title that was
            successfully posted (``0`` until the first one).
    """

    # Minimum number of seconds between two title lookups.
    COOLDOWN_SECONDS = 5
    # Total time budget, in seconds, for a single HTTP request.
    REQUEST_TIMEOUT_SECONDS = 10

    def __init__(self, bot):
        """Store the bot, create the HTTP session and start the queue worker.

        Args:
            bot: The irc3 bot instance that loads this plugin.
        """
        self.bot = bot
        self.session = aiohttp.ClientSession(loop=self.bot.loop)
        self.url_pattern = re.compile(r"https?://[^\s<>\"']+|www\.[^\s<>\"']+")
        self.queue = Queue()
        self.last_processed = 0  # No title posted yet.
        self.bot.create_task(self.process_queue())

    @event(irc3.rfc.PRIVMSG)
    @check_ignore
    async def on_privmsg(self, mask, event, target, data):
        """Enqueue every non-YouTube URL found in an incoming message.

        Args:
            mask: Sender mask supplied by irc3 (unused here).
            event: Event name supplied by irc3 (unused here).
            target: Where the message was sent; the reply goes to the same
                target.
            data: Message text to scan for URLs.
        """
        for url in self.url_pattern.findall(data):
            lowered = url.lower()
            # YouTube links are skipped on purpose (see the module docstring).
            if "youtube.com" in lowered or "youtu.be" in lowered:
                continue
            self.queue.put_nowait((target, url))

    async def process_queue(self):
        """Drain the URL queue forever, honouring the cooldown between lookups.

        When the previous title was posted less than ``COOLDOWN_SECONDS`` ago
        the worker sleeps for the remaining time and then processes the URL,
        instead of discarding it. Any exception raised while handling one URL
        is logged and the loop carries on with the next queue item.
        """
        while True:
            target, url = await self.queue.get()
            try:
                elapsed = time.time() - self.last_processed
                if elapsed < self.COOLDOWN_SECONDS:
                    # Clamp so a wall-clock jump backwards cannot stall the
                    # worker for longer than one cooldown period.
                    delay = min(self.COOLDOWN_SECONDS, self.COOLDOWN_SECONDS - elapsed)
                    self.bot.log.info(f"Rate limited: Waiting {delay:.1f}s to process {url}")
                    await asyncio.sleep(delay)
                title = await self.fetch_title(url)
                if title:
                    formatted_message = self.format_message(title, url)
                    await self.bot.privmsg(target, formatted_message)
                    # Only a posted title restarts the cooldown window.
                    self.last_processed = time.time()
            except Exception as e:
                self.bot.log.error(f"Error processing URL {url}: {e}")
            finally:
                self.queue.task_done()

    def format_message(self, title, url):
        """Build the styled IRC line announcing ``title`` for ``url``.

        Args:
            title: Page title to display.
            url: The URL as it appeared in the chat message.

        Returns:
            A single string made of the styled title and source segments.
        """
        # prefix, separator and suffix style an empty string, so they add
        # formatting only and no visible text.
        prefix = ircstyle.style("", fg="cyan", bold=True, reset=True)
        title_label = ircstyle.style("Title", fg="blue", bold=True, reset=True)
        title_text = ircstyle.style(title, fg="green", italics=True, underline=True, reset=True)
        separator = ircstyle.style("", fg="grey", bold=True, reset=True)
        url_label = ircstyle.style("Source", fg="blue", bold=True, underline=True, reset=True)
        url_text = ircstyle.style(url, fg="cyan", italics=True, reset=True)
        suffix = ircstyle.style("", fg="cyan", bold=True, reset=True)
        return f"{prefix} {title_label}: {title_text} {separator} {url_label}: {url_text} {suffix}"

    async def fetch_title(self, url):
        """Download ``url`` and return the text of its ``<title>`` element.

        Args:
            url: Address to fetch. A link without an ``http://`` or
                ``https://`` scheme (the URL regex also matches bare
                ``www.`` links) is requested over ``http://``.

        Returns:
            The title with runs of whitespace collapsed to single spaces, or
            ``"No title found"`` when the page has no usable title.

        Raises:
            Exception: HTTP, timeout and parsing errors are not handled here
                and propagate to the caller.
        """
        # The HTTP client needs an absolute URL; bare "www." matches have none.
        if not url.lower().startswith(("http://", "https://")):
            url = f"http://{url}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        timeout = aiohttp.ClientTimeout(total=self.REQUEST_TIMEOUT_SECONDS)
        async with self.session.get(url, headers=headers, timeout=timeout) as response:
            response.raise_for_status()
            content = await response.text()
        tree = html.fromstring(content)
        title = tree.findtext(".//title")
        # Titles may span several lines in the markup; IRC messages are
        # single-line, so collapse all whitespace runs.
        cleaned = " ".join(title.split()) if title else ""
        return cleaned or "No title found"

    async def close(self):
        """Close the shared HTTP session."""
        await self.session.close()

    def __del__(self):
        """Schedule session cleanup when the plugin object is destroyed.

        Does nothing if the session was never created, is already closed, or
        the bot's event loop is missing or closed, since no task could run.
        """
        session = getattr(self, "session", None)
        if session is None or session.closed:
            return
        loop = getattr(self.bot, "loop", None)
        if loop is None or loop.is_closed():
            return
        self.bot.create_task(self.close())