update for url sniffer

This commit is contained in:
Zodiac 2025-02-14 17:16:18 -08:00
parent 430eb73d82
commit af260d785a

View File

@@ -2,148 +2,139 @@
""" """
IRC3 Bot Plugin: URL Title Fetcher IRC3 Bot Plugin: URL Title Fetcher
This plugin for an IRC bot fetches and displays the titles of URLs shared in IRC messages. A plugin for IRC3 bots that monitors chat messages for URLs, fetches their webpage titles, and displays them
It uses aiohttp for asynchronous HTTP requests and lxml for HTML parsing. with formatted styling in the chat. Provides visual enhancement to URL sharing in IRC channels.
Features: Features:
- Listens for PRIVMSG events in the IRC channel. - Asynchronous URL processing using aiohttp for efficient network operations
- Extracts URLs from messages and fetches their titles. - Robust HTML parsing with lxml for accurate title extraction
- Posts the title and URL back to the IRC channel. - Configurable message styling with color and formatting options
- Built-in exclusion of YouTube URLs to avoid conflicts with dedicated YouTube plugins
- Error handling for network and parsing operations
- Proper resource cleanup through session management
Usage: Dependencies:
====== - aiohttp: For asynchronous HTTP requests
To use this module, load it as a plugin in your IRC bot configuration. - irc3: Core IRC bot functionality
- ircstyle: IRC text formatting utilities
- lxml: HTML parsing capabilities
Example: Author: Zodiac (simplified by Claude)
@event Date: 2025-02-14
def on_privmsg(self, mask, event, target, data):
# Extract URLs from messages and fetch their titles.
Author: Zodiac
Date: 2025-02-13
""" """
import re import re
import asyncio
import aiohttp import aiohttp
import ircstyle
from lxml import html from lxml import html
import irc3 import irc3
from irc3 import event from irc3 import event
from irc3.compat import Queue from plugins.services.permissions import check_ignore
@irc3.plugin @irc3.plugin
class URLTitlePlugin: class URLTitlePlugin:
""" """Plugin for fetching and displaying webpage titles from URLs shared in IRC messages.
A plugin to fetch and display the titles of URLs shared in IRC messages.
Monitors IRC messages for URLs, retrieves their webpage titles, and posts formatted responses
back to the channel. Supports styled text output with configurable formatting options.
Attributes: Attributes:
bot (irc3.IrcBot): The IRC bot instance. bot (irc3.IrcBot): Reference to the main IRC bot instance
url_queue (Queue): A queue to manage URL processing asynchronously. session (aiohttp.ClientSession): Persistent HTTP session for making web requests
session (aiohttp.ClientSession): An HTTP session for making requests. url_pattern (re.Pattern): Compiled regex for URL detection in messages
""" """
def __init__(self, bot): def __init__(self, bot):
""" """Initialize plugin with bot instance and set up HTTP session.
Initialize the URLTitlePlugin.
Args: Args:
bot (irc3.IrcBot): The IRC bot instance. bot (irc3.IrcBot): The IRC bot instance this plugin will be attached to
""" """
self.bot = bot self.bot = bot
self.url_queue = Queue() # Queue for managing URL processing # Create persistent HTTP session for better performance
self.session = aiohttp.ClientSession(loop=self.bot.loop) self.session = aiohttp.ClientSession(loop=self.bot.loop)
self.bot.create_task(self.process_urls()) # Start URL processor # Regex pattern matches both http(s):// URLs and www. domains
self.url_pattern = re.compile(r"https?://[^\s<>\"']+|www\.[^\s<>\"']+")
@event(irc3.rfc.PRIVMSG) @event(irc3.rfc.PRIVMSG)
@check_ignore
async def on_privmsg(self, mask, event, target, data): async def on_privmsg(self, mask, event, target, data):
""" """Handle incoming private messages by processing any URLs they contain.
Listen for PRIVMSG events and check for URLs.
Args: Args:
mask (str): The user's mask (e.g., nick!user@host). mask (str): IRC user mask of message sender
event (str): The IRC event type (e.g., PRIVMSG). event (str): IRC event type
target (str): The target of the message (e.g., channel or user). target (str): Channel or user the message was sent to
data (str): The content of the message. data (str): Content of the message
This method extracts URLs from the message and adds them to the queue
for asynchronous processing.
""" """
url_pattern = re.compile(r"https?://[^\s<>\"']+|www\.[^\s<>\"']+") # Extract all URLs from the message
urls = url_pattern.findall(data) urls = self.url_pattern.findall(data)
for url in urls: for url in urls:
# Use put_nowait to avoid blocking if the queue is full # Skip YouTube URLs as they're typically handled by dedicated plugins
await self.url_queue.put((url, target)) if "youtube.com" in url.lower() or "youtu.be" in url.lower():
continue
async def process_urls(self):
"""
Process URLs from the queue and fetch their titles.
This method runs indefinitely, processing one URL at a time from the queue.
It fetches the title of each URL and sends it back to the IRC channel.
"""
while True:
url, target = await self.url_queue.get()
try: try:
title = await self.fetch_title(url) title = await self.fetch_title(url)
if title: if title:
# Format the IRC message with colors and styles formatted_message = self.format_message(title, url)
formatted_message = (
f"\x02\x0312Title:\x03 \x034{title}\x03 \x02|\x02 "
f"\x032URL:\x03 \x0311{url}\x03"
)
await self.bot.privmsg(target, formatted_message) await self.bot.privmsg(target, formatted_message)
else:
# Handle cases where no title is found
pass
except Exception as e: except Exception as e:
# Log errors but continue processing other URLs
self.bot.log.error(f"Error processing URL {url}: {e}") self.bot.log.error(f"Error processing URL {url}: {e}")
finally:
self.url_queue.task_done()
async def fetch_title(self, url): def format_message(self, title, url):
""" """Create a styled IRC message containing the webpage title and source URL.
Fetch the title of a web page using aiohttp and lxml.
Args: Args:
url (str): The URL of the web page. title (str): The webpage title to display
url (str): The source URL
Returns: Returns:
str: The title of the web page, or None if it could not be fetched. str: Formatted IRC message with styling applied
This method makes an HTTP GET request to the URL, parses the HTML content,
and extracts the title element.
""" """
headers = {"User-Agent": "Mozilla/5.0"} # Define styled components for the message
try: prefix = ircstyle.style("", fg="cyan", bold=True, reset=True)
title_label = ircstyle.style("Title", fg="blue", bold=True, reset=True)
title_text = ircstyle.style(title, fg="green", italics=True, underline=True, reset=True)
separator = ircstyle.style("", fg="grey", bold=True, reset=True)
url_label = ircstyle.style("Source", fg="blue", bold=True, underline=True, reset=True)
url_text = ircstyle.style(url, fg="cyan", italics=True, reset=True)
suffix = ircstyle.style("", fg="cyan", bold=True, reset=True)
return f"{prefix} {title_label}: {title_text} {separator} {url_label}: {url_text} {suffix}"
async def fetch_title(self, url):
"""Retrieve the title of a webpage using asynchronous HTTP requests.
Args:
url (str): The URL to fetch the title from
Returns:
str: The webpage title or "No title found" if title cannot be extracted
Raises:
Any exceptions from aiohttp or lxml processing
"""
# Use modern browser User-Agent to avoid being blocked
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
async with self.session.get(url, headers=headers, timeout=10) as response: async with self.session.get(url, headers=headers, timeout=10) as response:
# Check if the response was successful
response.raise_for_status() response.raise_for_status()
content = await response.text() content = await response.text()
tree = html.fromstring(content) tree = html.fromstring(content)
title = tree.findtext(".//title") title = tree.findtext(".//title")
return title.strip() if title else "No title found" return title.strip() if title else "No title found"
except aiohttp.ClientError as e:
self.bot.log.error(f"HTTP error for {url}: {e}")
except asyncio.TimeoutError:
self.bot.log.error(f"Request timed out for {url}")
except Exception as e:
self.bot.log.error(f"Unexpected error for {url}: {e}")
return None
async def close(self): async def close(self):
""" """Clean up resources by closing the HTTP session."""
Clean up resources when the plugin is unloaded.
This method ensures that the aiohttp session is properly closed to avoid
resource leaks.
"""
await self.session.close() await self.session.close()
def __del__(self): def __del__(self):
""" """Ensure proper cleanup when the plugin is destroyed."""
Ensure session closing when the object is destroyed.
This method schedules the session cleanup task on the bot's event loop.
"""
self.bot.create_task(self.close()) self.bot.create_task(self.close())