update for url sniffer
This commit is contained in:
parent
430eb73d82
commit
af260d785a
@ -2,148 +2,139 @@
|
|||||||
"""
|
"""
|
||||||
IRC3 Bot Plugin: URL Title Fetcher
|
IRC3 Bot Plugin: URL Title Fetcher
|
||||||
|
|
||||||
This plugin for an IRC bot fetches and displays the titles of URLs shared in IRC messages.
|
A plugin for IRC3 bots that monitors chat messages for URLs, fetches their webpage titles, and displays them
|
||||||
It uses aiohttp for asynchronous HTTP requests and lxml for HTML parsing.
|
with formatted styling in the chat. Provides visual enhancement to URL sharing in IRC channels.
|
||||||
|
|
||||||
Features:
|
Features:
|
||||||
- Listens for PRIVMSG events in the IRC channel.
|
- Asynchronous URL processing using aiohttp for efficient network operations
|
||||||
- Extracts URLs from messages and fetches their titles.
|
- Robust HTML parsing with lxml for accurate title extraction
|
||||||
- Posts the title and URL back to the IRC channel.
|
- Styled message output with colors and text formatting via ircstyle
|
||||||
|
- Built-in exclusion of YouTube URLs to avoid conflicts with dedicated YouTube plugins
|
||||||
|
- Error handling for network and parsing operations
|
||||||
|
- Proper resource cleanup through session management
|
||||||
|
|
||||||
Usage:
|
Dependencies:
|
||||||
======
|
- aiohttp: For asynchronous HTTP requests
|
||||||
To use this module, load it as a plugin in your IRC bot configuration.
|
- irc3: Core IRC bot functionality
|
||||||
|
- ircstyle: IRC text formatting utilities
|
||||||
|
- lxml: HTML parsing capabilities
|
||||||
|
|
||||||
Example:
|
Author: Zodiac (simplified by Claude)
|
||||||
@event
|
Date: 2025-02-14
|
||||||
def on_privmsg(self, mask, event, target, data):
|
|
||||||
# Extract URLs from messages and fetch their titles.
|
|
||||||
|
|
||||||
Author: Zodiac
|
|
||||||
Date: 2025-02-13
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import asyncio
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
import ircstyle
|
||||||
from lxml import html
|
from lxml import html
|
||||||
import irc3
|
import irc3
|
||||||
from irc3 import event
|
from irc3 import event
|
||||||
from irc3.compat import Queue
|
from plugins.services.permissions import check_ignore
|
||||||
|
|
||||||
|
|
||||||
@irc3.plugin
|
@irc3.plugin
|
||||||
class URLTitlePlugin:
|
class URLTitlePlugin:
|
||||||
"""
|
"""Plugin for fetching and displaying webpage titles from URLs shared in IRC messages.
|
||||||
A plugin to fetch and display the titles of URLs shared in IRC messages.
|
|
||||||
|
Monitors IRC messages for URLs, retrieves their webpage titles, and posts formatted responses
|
||||||
|
back to the channel. Output is styled with IRC colors and text formatting via ircstyle.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
bot (irc3.IrcBot): The IRC bot instance.
|
bot (irc3.IrcBot): Reference to the main IRC bot instance
|
||||||
url_queue (Queue): A queue to manage URL processing asynchronously.
|
session (aiohttp.ClientSession): Persistent HTTP session for making web requests
|
||||||
session (aiohttp.ClientSession): An HTTP session for making requests.
|
url_pattern (re.Pattern): Compiled regex for URL detection in messages
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, bot):
|
def __init__(self, bot):
|
||||||
"""
|
"""Initialize plugin with bot instance and set up HTTP session.
|
||||||
Initialize the URLTitlePlugin.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
bot (irc3.IrcBot): The IRC bot instance.
|
bot (irc3.IrcBot): The IRC bot instance this plugin will be attached to
|
||||||
"""
|
"""
|
||||||
self.bot = bot
|
self.bot = bot
|
||||||
self.url_queue = Queue() # Queue for managing URL processing
|
# Create persistent HTTP session for better performance
|
||||||
self.session = aiohttp.ClientSession(loop=self.bot.loop)
|
self.session = aiohttp.ClientSession(loop=self.bot.loop)
|
||||||
self.bot.create_task(self.process_urls()) # Start URL processor
|
# Regex pattern matches both http(s):// URLs and www. domains
|
||||||
|
self.url_pattern = re.compile(r"https?://[^\s<>\"']+|www\.[^\s<>\"']+")
|
||||||
|
|
||||||
@event(irc3.rfc.PRIVMSG)
|
@event(irc3.rfc.PRIVMSG)
|
||||||
|
@check_ignore
|
||||||
async def on_privmsg(self, mask, event, target, data):
|
async def on_privmsg(self, mask, event, target, data):
|
||||||
"""
|
"""Handle incoming PRIVMSG events (channel or direct messages) by processing any URLs they contain.
|
||||||
Listen for PRIVMSG events and check for URLs.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
mask (str): The user's mask (e.g., nick!user@host).
|
mask (str): IRC user mask of message sender
|
||||||
event (str): The IRC event type (e.g., PRIVMSG).
|
event (str): IRC event type
|
||||||
target (str): The target of the message (e.g., channel or user).
|
target (str): Channel or user the message was sent to
|
||||||
data (str): The content of the message.
|
data (str): Content of the message
|
||||||
|
|
||||||
This method extracts URLs from the message and adds them to the queue
|
|
||||||
for asynchronous processing.
|
|
||||||
"""
|
"""
|
||||||
url_pattern = re.compile(r"https?://[^\s<>\"']+|www\.[^\s<>\"']+")
|
# Extract all URLs from the message
|
||||||
urls = url_pattern.findall(data)
|
urls = self.url_pattern.findall(data)
|
||||||
|
|
||||||
for url in urls:
|
for url in urls:
|
||||||
# Queue the URL with its target; the background processor fetches the title
|
# Skip YouTube URLs as they're typically handled by dedicated plugins
|
||||||
await self.url_queue.put((url, target))
|
if "youtube.com" in url.lower() or "youtu.be" in url.lower():
|
||||||
|
continue
|
||||||
|
|
||||||
async def process_urls(self):
|
|
||||||
"""
|
|
||||||
Process URLs from the queue and fetch their titles.
|
|
||||||
|
|
||||||
This method runs indefinitely, processing one URL at a time from the queue.
|
|
||||||
It fetches the title of each URL and sends it back to the IRC channel.
|
|
||||||
"""
|
|
||||||
while True:
|
|
||||||
url, target = await self.url_queue.get()
|
|
||||||
try:
|
try:
|
||||||
title = await self.fetch_title(url)
|
title = await self.fetch_title(url)
|
||||||
if title:
|
if title:
|
||||||
# Format the IRC message with colors and styles
|
formatted_message = self.format_message(title, url)
|
||||||
formatted_message = (
|
|
||||||
f"\x02\x0312Title:\x03 \x034{title}\x03 \x02|\x02 "
|
|
||||||
f"\x032URL:\x03 \x0311{url}\x03"
|
|
||||||
)
|
|
||||||
await self.bot.privmsg(target, formatted_message)
|
await self.bot.privmsg(target, formatted_message)
|
||||||
else:
|
|
||||||
# Handle cases where no title is found
|
|
||||||
pass
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
# Log errors but continue processing other URLs
|
||||||
self.bot.log.error(f"Error processing URL {url}: {e}")
|
self.bot.log.error(f"Error processing URL {url}: {e}")
|
||||||
finally:
|
|
||||||
self.url_queue.task_done()
|
|
||||||
|
|
||||||
async def fetch_title(self, url):
|
def format_message(self, title, url):
|
||||||
"""
|
"""Create a styled IRC message containing the webpage title and source URL.
|
||||||
Fetch the title of a web page using aiohttp and lxml.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
url (str): The URL of the web page.
|
title (str): The webpage title to display
|
||||||
|
url (str): The source URL
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: The title of the web page, or None if it could not be fetched.
|
str: Formatted IRC message with styling applied
|
||||||
|
|
||||||
This method makes an HTTP GET request to the URL, parses the HTML content,
|
|
||||||
and extracts the title element.
|
|
||||||
"""
|
"""
|
||||||
headers = {"User-Agent": "Mozilla/5.0"}
|
# Define styled components for the message
|
||||||
try:
|
prefix = ircstyle.style("►", fg="cyan", bold=True, reset=True)
|
||||||
|
title_label = ircstyle.style("Title", fg="blue", bold=True, reset=True)
|
||||||
|
title_text = ircstyle.style(title, fg="green", italics=True, underline=True, reset=True)
|
||||||
|
separator = ircstyle.style("❘", fg="grey", bold=True, reset=True)
|
||||||
|
url_label = ircstyle.style("Source", fg="blue", bold=True, underline=True, reset=True)
|
||||||
|
url_text = ircstyle.style(url, fg="cyan", italics=True, reset=True)
|
||||||
|
suffix = ircstyle.style("◄", fg="cyan", bold=True, reset=True)
|
||||||
|
|
||||||
|
return f"{prefix} {title_label}: {title_text} {separator} {url_label}: {url_text} {suffix}"
|
||||||
|
|
||||||
|
async def fetch_title(self, url):
|
||||||
|
"""Retrieve the title of a webpage using asynchronous HTTP requests.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url (str): The URL to fetch the title from
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The webpage title or "No title found" if title cannot be extracted
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Any exceptions from aiohttp or lxml processing
|
||||||
|
"""
|
||||||
|
# Use modern browser User-Agent to avoid being blocked
|
||||||
|
headers = {
|
||||||
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
|
||||||
|
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||||
|
}
|
||||||
|
|
||||||
async with self.session.get(url, headers=headers, timeout=10) as response:
|
async with self.session.get(url, headers=headers, timeout=10) as response:
|
||||||
# Check if the response was successful
|
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
content = await response.text()
|
content = await response.text()
|
||||||
tree = html.fromstring(content)
|
tree = html.fromstring(content)
|
||||||
title = tree.findtext(".//title")
|
title = tree.findtext(".//title")
|
||||||
return title.strip() if title else "No title found"
|
return title.strip() if title else "No title found"
|
||||||
except aiohttp.ClientError as e:
|
|
||||||
self.bot.log.error(f"HTTP error for {url}: {e}")
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
self.bot.log.error(f"Request timed out for {url}")
|
|
||||||
except Exception as e:
|
|
||||||
self.bot.log.error(f"Unexpected error for {url}: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
"""
|
"""Clean up resources by closing the HTTP session."""
|
||||||
Clean up resources when the plugin is unloaded.
|
|
||||||
|
|
||||||
This method ensures that the aiohttp session is properly closed to avoid
|
|
||||||
resource leaks.
|
|
||||||
"""
|
|
||||||
await self.session.close()
|
await self.session.close()
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
"""
|
"""Ensure proper cleanup when the plugin is destroyed."""
|
||||||
Ensure session closing when the object is destroyed.
|
|
||||||
|
|
||||||
This method schedules the session cleanup task on the bot's event loop.
|
|
||||||
"""
|
|
||||||
self.bot.create_task(self.close())
|
self.bot.create_task(self.close())
|
Loading…
Reference in New Issue
Block a user