From af260d785a8e4fb1b82ddaa9e349b5d3fe83accf Mon Sep 17 00:00:00 2001 From: Zodiac Date: Fri, 14 Feb 2025 17:16:18 -0800 Subject: [PATCH] update for url sniffer --- plugins/url_title_sniffer.py | 187 +++++++++++++++++------------------ 1 file changed, 89 insertions(+), 98 deletions(-) diff --git a/plugins/url_title_sniffer.py b/plugins/url_title_sniffer.py index ca015b6..3939b3a 100644 --- a/plugins/url_title_sniffer.py +++ b/plugins/url_title_sniffer.py @@ -2,148 +2,139 @@ """ IRC3 Bot Plugin: URL Title Fetcher -This plugin for an IRC bot fetches and displays the titles of URLs shared in IRC messages. -It uses aiohttp for asynchronous HTTP requests and lxml for HTML parsing. +A plugin for IRC3 bots that monitors chat messages for URLs, fetches their webpage titles, and displays them +with formatted styling in the chat. Provides visual enhancement to URL sharing in IRC channels. Features: -- Listens for PRIVMSG events in the IRC channel. -- Extracts URLs from messages and fetches their titles. -- Posts the title and URL back to the IRC channel. + - Asynchronous URL processing using aiohttp for efficient network operations + - Robust HTML parsing with lxml for accurate title extraction + - Configurable message styling with color and formatting options + - Built-in exclusion of YouTube URLs to avoid conflicts with dedicated YouTube plugins + - Error handling for network and parsing operations + - Proper resource cleanup through session management -Usage: -====== -To use this module, load it as a plugin in your IRC bot configuration. +Dependencies: + - aiohttp: For asynchronous HTTP requests + - irc3: Core IRC bot functionality + - ircstyle: IRC text formatting utilities + - lxml: HTML parsing capabilities -Example: - @event - def on_privmsg(self, mask, event, target, data): - # Extract URLs from messages and fetch their titles. - -Author: Zodiac -Date: 2025-02-13 +Author: Zodiac (simplified by Claude) +Date: 2025-02-14 """ import re -import asyncio import aiohttp +import ircstyle from lxml import html import irc3 from irc3 import event -from irc3.compat import Queue +from plugins.services.permissions import check_ignore @irc3.plugin class URLTitlePlugin: - """ - A plugin to fetch and display the titles of URLs shared in IRC messages. + """Plugin for fetching and displaying webpage titles from URLs shared in IRC messages. + + Monitors IRC messages for URLs, retrieves their webpage titles, and posts formatted responses + back to the channel. Supports styled text output with configurable formatting options. Attributes: - bot (irc3.IrcBot): The IRC bot instance. - url_queue (Queue): A queue to manage URL processing asynchronously. - session (aiohttp.ClientSession): An HTTP session for making requests. + bot (irc3.IrcBot): Reference to the main IRC bot instance + session (aiohttp.ClientSession): Persistent HTTP session for making web requests + url_pattern (re.Pattern): Compiled regex for URL detection in messages """ def __init__(self, bot): - """ - Initialize the URLTitlePlugin. - + """Initialize plugin with bot instance and set up HTTP session. + Args: - bot (irc3.IrcBot): The IRC bot instance. + bot (irc3.IrcBot): The IRC bot instance this plugin will be attached to """ self.bot = bot - self.url_queue = Queue() # Queue for managing URL processing + # Create persistent HTTP session for better performance self.session = aiohttp.ClientSession(loop=self.bot.loop) - self.bot.create_task(self.process_urls()) # Start URL processor + # Regex pattern matches both http(s):// URLs and www. domains + self.url_pattern = re.compile(r"https?://[^\s<>\"']+|www\.[^\s<>\"']+") @event(irc3.rfc.PRIVMSG) + @check_ignore async def on_privmsg(self, mask, event, target, data): - """ - Listen for PRIVMSG events and check for URLs. - + """Handle incoming private messages by processing any URLs they contain. + Args: - mask (str): The user's mask (e.g., nick!user@host). - event (str): The IRC event type (e.g., PRIVMSG). - target (str): The target of the message (e.g., channel or user). - data (str): The content of the message. - - This method extracts URLs from the message and adds them to the queue - for asynchronous processing. + mask (str): IRC user mask of message sender + event (str): IRC event type + target (str): Channel or user the message was sent to + data (str): Content of the message """ - url_pattern = re.compile(r"https?://[^\s<>\"']+|www\.[^\s<>\"']+") - urls = url_pattern.findall(data) + # Extract all URLs from the message + urls = self.url_pattern.findall(data) + for url in urls: - # Use put_nowait to avoid blocking if the queue is full - await self.url_queue.put((url, target)) - - async def process_urls(self): - """ - Process URLs from the queue and fetch their titles. - - This method runs indefinitely, processing one URL at a time from the queue. - It fetches the title of each URL and sends it back to the IRC channel. - """ - while True: - url, target = await self.url_queue.get() + # Skip YouTube URLs as they're typically handled by dedicated plugins + if "youtube.com" in url.lower() or "youtu.be" in url.lower(): + continue + try: title = await self.fetch_title(url) if title: - # Format the IRC message with colors and styles - formatted_message = ( - f"\x02\x0312Title:\x03 \x034{title}\x03 \x02|\x02 " - f"\x032URL:\x03 \x0311{url}\x03" - ) + formatted_message = self.format_message(title, url) await self.bot.privmsg(target, formatted_message) - else: - # Handle cases where no title is found - pass except Exception as e: + # Log errors but continue processing other URLs self.bot.log.error(f"Error processing URL {url}: {e}") - finally: - self.url_queue.task_done() + + def format_message(self, title, url): + """Create a styled IRC message containing the webpage title and source URL. + + Args: + title (str): The webpage title to display + url (str): The source URL + + Returns: + str: Formatted IRC message with styling applied + """ + # Define styled components for the message + prefix = ircstyle.style("►", fg="cyan", bold=True, reset=True) + title_label = ircstyle.style("Title", fg="blue", bold=True, reset=True) + title_text = ircstyle.style(title, fg="green", italics=True, underline=True, reset=True) + separator = ircstyle.style("❘", fg="grey", bold=True, reset=True) + url_label = ircstyle.style("Source", fg="blue", bold=True, underline=True, reset=True) + url_text = ircstyle.style(url, fg="cyan", italics=True, reset=True) + suffix = ircstyle.style("◄", fg="cyan", bold=True, reset=True) + + return f"{prefix} {title_label}: {title_text} {separator} {url_label}: {url_text} {suffix}" async def fetch_title(self, url): - """ - Fetch the title of a web page using aiohttp and lxml. - + """Retrieve the title of a webpage using asynchronous HTTP requests. + Args: - url (str): The URL of the web page. - + url (str): The URL to fetch the title from + Returns: - str: The title of the web page, or None if it could not be fetched. - - This method makes an HTTP GET request to the URL, parses the HTML content, - and extracts the title element. + str: The webpage title or "No title found" if title cannot be extracted + + Raises: + Any exceptions from aiohttp or lxml processing """ - headers = {"User-Agent": "Mozilla/5.0"} - try: - async with self.session.get(url, headers=headers, timeout=10) as response: - # Check if the response was successful - response.raise_for_status() - content = await response.text() - tree = html.fromstring(content) - title = tree.findtext(".//title") - return title.strip() if title else "No title found" - except aiohttp.ClientError as e: - self.bot.log.error(f"HTTP error for {url}: {e}") - except asyncio.TimeoutError: - self.bot.log.error(f"Request timed out for {url}") - except Exception as e: - self.bot.log.error(f"Unexpected error for {url}: {e}") - return None + # Use modern browser User-Agent to avoid being blocked + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + } + + async with self.session.get(url, headers=headers, timeout=10) as response: + response.raise_for_status() + content = await response.text() + tree = html.fromstring(content) + title = tree.findtext(".//title") + return title.strip() if title else "No title found" async def close(self): - """ - Clean up resources when the plugin is unloaded. - - This method ensures that the aiohttp session is properly closed to avoid - resource leaks. - """ + """Clean up resources by closing the HTTP session.""" await self.session.close() def __del__(self): - """ - Ensure session closing when the object is destroyed. - - This method schedules the session cleanup task on the bot's event loop. - """ + """Ensure proper cleanup when the plugin is destroyed.""" self.bot.create_task(self.close()) \ No newline at end of file