update for url sniffer
This commit is contained in:
parent
430eb73d82
commit
af260d785a
@ -2,148 +2,139 @@
|
||||
"""
|
||||
IRC3 Bot Plugin: URL Title Fetcher
|
||||
|
||||
This plugin for an IRC bot fetches and displays the titles of URLs shared in IRC messages.
|
||||
It uses aiohttp for asynchronous HTTP requests and lxml for HTML parsing.
|
||||
A plugin for IRC3 bots that monitors chat messages for URLs, fetches their webpage titles, and displays them
|
||||
with formatted styling in the chat. Provides visual enhancement to URL sharing in IRC channels.
|
||||
|
||||
Features:
|
||||
- Listens for PRIVMSG events in the IRC channel.
|
||||
- Extracts URLs from messages and fetches their titles.
|
||||
- Posts the title and URL back to the IRC channel.
|
||||
- Asynchronous URL processing using aiohttp for efficient network operations
|
||||
- Robust HTML parsing with lxml for accurate title extraction
|
||||
- Configurable message styling with color and formatting options
|
||||
- Built-in exclusion of YouTube URLs to avoid conflicts with dedicated YouTube plugins
|
||||
- Error handling for network and parsing operations
|
||||
- Proper resource cleanup through session management
|
||||
|
||||
Usage:
|
||||
======
|
||||
To use this module, load it as a plugin in your IRC bot configuration.
|
||||
Dependencies:
|
||||
- aiohttp: For asynchronous HTTP requests
|
||||
- irc3: Core IRC bot functionality
|
||||
- ircstyle: IRC text formatting utilities
|
||||
- lxml: HTML parsing capabilities
|
||||
|
||||
Example:
|
||||
@event
|
||||
def on_privmsg(self, mask, event, target, data):
|
||||
# Extract URLs from messages and fetch their titles.
|
||||
|
||||
Author: Zodiac
|
||||
Date: 2025-02-13
|
||||
Author: Zodiac (simplified by Claude)
|
||||
Date: 2025-02-14
|
||||
"""
|
||||
|
||||
import re
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import ircstyle
|
||||
from lxml import html
|
||||
import irc3
|
||||
from irc3 import event
|
||||
from irc3.compat import Queue
|
||||
from plugins.services.permissions import check_ignore
|
||||
|
||||
|
||||
@irc3.plugin
|
||||
class URLTitlePlugin:
|
||||
"""
|
||||
A plugin to fetch and display the titles of URLs shared in IRC messages.
|
||||
"""Plugin for fetching and displaying webpage titles from URLs shared in IRC messages.
|
||||
|
||||
Monitors IRC messages for URLs, retrieves their webpage titles, and posts formatted responses
|
||||
back to the channel. Supports styled text output with configurable formatting options.
|
||||
|
||||
Attributes:
|
||||
bot (irc3.IrcBot): The IRC bot instance.
|
||||
url_queue (Queue): A queue to manage URL processing asynchronously.
|
||||
session (aiohttp.ClientSession): An HTTP session for making requests.
|
||||
bot (irc3.IrcBot): Reference to the main IRC bot instance
|
||||
session (aiohttp.ClientSession): Persistent HTTP session for making web requests
|
||||
url_pattern (re.Pattern): Compiled regex for URL detection in messages
|
||||
"""
|
||||
|
||||
def __init__(self, bot):
|
||||
"""
|
||||
Initialize the URLTitlePlugin.
|
||||
"""Initialize plugin with bot instance and set up HTTP session.
|
||||
|
||||
Args:
|
||||
bot (irc3.IrcBot): The IRC bot instance.
|
||||
bot (irc3.IrcBot): The IRC bot instance this plugin will be attached to
|
||||
"""
|
||||
self.bot = bot
|
||||
self.url_queue = Queue() # Queue for managing URL processing
|
||||
# Create persistent HTTP session for better performance
|
||||
self.session = aiohttp.ClientSession(loop=self.bot.loop)
|
||||
self.bot.create_task(self.process_urls()) # Start URL processor
|
||||
# Regex pattern matches both http(s):// URLs and www. domains
|
||||
self.url_pattern = re.compile(r"https?://[^\s<>\"']+|www\.[^\s<>\"']+")
|
||||
|
||||
@event(irc3.rfc.PRIVMSG)
|
||||
@check_ignore
|
||||
async def on_privmsg(self, mask, event, target, data):
|
||||
"""
|
||||
Listen for PRIVMSG events and check for URLs.
|
||||
"""Handle incoming PRIVMSG events (channel or direct messages) by processing any URLs they contain.
|
||||
|
||||
Args:
|
||||
mask (str): The user's mask (e.g., nick!user@host).
|
||||
event (str): The IRC event type (e.g., PRIVMSG).
|
||||
target (str): The target of the message (e.g., channel or user).
|
||||
data (str): The content of the message.
|
||||
|
||||
This method extracts URLs from the message and adds them to the queue
|
||||
for asynchronous processing.
|
||||
mask (str): IRC user mask of message sender
|
||||
event (str): IRC event type
|
||||
target (str): Channel or user the message was sent to
|
||||
data (str): Content of the message
|
||||
"""
|
||||
url_pattern = re.compile(r"https?://[^\s<>\"']+|www\.[^\s<>\"']+")
|
||||
urls = url_pattern.findall(data)
|
||||
# Extract all URLs from the message
|
||||
urls = self.url_pattern.findall(data)
|
||||
|
||||
for url in urls:
|
||||
# Enqueue the URL with its reply target; put() is awaited, so it waits if the queue is ever full
|
||||
await self.url_queue.put((url, target))
|
||||
# Skip YouTube URLs as they're typically handled by dedicated plugins
|
||||
if "youtube.com" in url.lower() or "youtu.be" in url.lower():
|
||||
continue
|
||||
|
||||
async def process_urls(self):
|
||||
"""
|
||||
Process URLs from the queue and fetch their titles.
|
||||
|
||||
This method runs indefinitely, processing one URL at a time from the queue.
|
||||
It fetches the title of each URL and sends it back to the IRC channel.
|
||||
"""
|
||||
while True:
|
||||
url, target = await self.url_queue.get()
|
||||
try:
|
||||
title = await self.fetch_title(url)
|
||||
if title:
|
||||
# Format the IRC message with colors and styles
|
||||
formatted_message = (
|
||||
f"\x02\x0312Title:\x03 \x034{title}\x03 \x02|\x02 "
|
||||
f"\x032URL:\x03 \x0311{url}\x03"
|
||||
)
|
||||
formatted_message = self.format_message(title, url)
|
||||
await self.bot.privmsg(target, formatted_message)
|
||||
else:
|
||||
# Handle cases where no title is found
|
||||
pass
|
||||
except Exception as e:
|
||||
# Log errors but continue processing other URLs
|
||||
self.bot.log.error(f"Error processing URL {url}: {e}")
|
||||
finally:
|
||||
self.url_queue.task_done()
|
||||
|
||||
async def fetch_title(self, url):
|
||||
"""
|
||||
Fetch the title of a web page using aiohttp and lxml.
|
||||
def format_message(self, title, url):
|
||||
"""Create a styled IRC message containing the webpage title and source URL.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the web page.
|
||||
title (str): The webpage title to display
|
||||
url (str): The source URL
|
||||
|
||||
Returns:
|
||||
str: The title of the web page, or None if it could not be fetched.
|
||||
|
||||
This method makes an HTTP GET request to the URL, parses the HTML content,
|
||||
and extracts the title element.
|
||||
str: Formatted IRC message with styling applied
|
||||
"""
|
||||
headers = {"User-Agent": "Mozilla/5.0"}
|
||||
try:
|
||||
# Define styled components for the message
|
||||
prefix = ircstyle.style("►", fg="cyan", bold=True, reset=True)
|
||||
title_label = ircstyle.style("Title", fg="blue", bold=True, reset=True)
|
||||
title_text = ircstyle.style(title, fg="green", italics=True, underline=True, reset=True)
|
||||
separator = ircstyle.style("❘", fg="grey", bold=True, reset=True)
|
||||
url_label = ircstyle.style("Source", fg="blue", bold=True, underline=True, reset=True)
|
||||
url_text = ircstyle.style(url, fg="cyan", italics=True, reset=True)
|
||||
suffix = ircstyle.style("◄", fg="cyan", bold=True, reset=True)
|
||||
|
||||
return f"{prefix} {title_label}: {title_text} {separator} {url_label}: {url_text} {suffix}"
|
||||
|
||||
async def fetch_title(self, url):
|
||||
"""Retrieve the title of a webpage using asynchronous HTTP requests.
|
||||
|
||||
Args:
|
||||
url (str): The URL to fetch the title from
|
||||
|
||||
Returns:
|
||||
str: The webpage title or "No title found" if title cannot be extracted
|
||||
|
||||
Raises:
|
||||
Any exceptions from aiohttp or lxml processing
|
||||
"""
|
||||
# Use modern browser User-Agent to avoid being blocked
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||
}
|
||||
|
||||
async with self.session.get(url, headers=headers, timeout=10) as response:
|
||||
# Check if the response was successful
|
||||
response.raise_for_status()
|
||||
content = await response.text()
|
||||
tree = html.fromstring(content)
|
||||
title = tree.findtext(".//title")
|
||||
return title.strip() if title else "No title found"
|
||||
except aiohttp.ClientError as e:
|
||||
self.bot.log.error(f"HTTP error for {url}: {e}")
|
||||
except asyncio.TimeoutError:
|
||||
self.bot.log.error(f"Request timed out for {url}")
|
||||
except Exception as e:
|
||||
self.bot.log.error(f"Unexpected error for {url}: {e}")
|
||||
return None
|
||||
|
||||
async def close(self):
|
||||
"""
|
||||
Clean up resources when the plugin is unloaded.
|
||||
|
||||
This method ensures that the aiohttp session is properly closed to avoid
|
||||
resource leaks.
|
||||
"""
|
||||
"""Clean up resources by closing the HTTP session."""
|
||||
await self.session.close()
|
||||
|
||||
def __del__(self):
|
||||
"""
|
||||
Ensure session closing when the object is destroyed.
|
||||
|
||||
This method schedules the session cleanup task on the bot's event loop.
|
||||
"""
|
||||
"""Ensure proper cleanup when the plugin is destroyed."""
|
||||
self.bot.create_task(self.close())
|
Loading…
Reference in New Issue
Block a user