// hardlounge/server/plugins/irc-events/link.ts

import * as cheerio from "cheerio";
import got from "got";
import { URL } from "url";
import mime from "mime-types";
import log from "../../log";
import Config from "../../config";
import { findLinksWithSchema } from "../../../shared/linkify";
import storage from "../storage";
import Client from "../../client";
import Chan from "../../models/chan";
import Msg from "../../models/msg";
/** Result of a single `fetch` call in this module. */
type FetchRequest = {
  /** Bytes received before the download stream was closed (may be a truncated body). */
  data: Buffer;
  /** Response Content-Type with any parameters (e.g. charset) stripped; "" when absent. */
  type: string;
  /** The larger of the advertised Content-Length and the number of bytes received. */
  size: number;
};

/**
 * Requests that are currently in flight, keyed by the JSON of `[uri, headers]`,
 * so that identical concurrent fetches share a single download.
 */
const currentFetchPromises = new Map<string, Promise<FetchRequest>>();

/** Matches any `image/*` content type. */
const imageTypeRegex = /^image\/.+/;

/** Matches any `audio/*` or `video/*` content type. */
const mediaTypeRegex = /^(audio|video)\/.+/;
/**
 * A single link preview attached to a message and sent to the client
 * through the "msg:preview" event.
 */
export type LinkPreview = {
// "loading" when created; later one of "link", "image", "audio", "video" or "error"
type: string;
// Page title, cut to 100 characters ("Untitled page" when a link preview has none)
head: string;
// Page description or the beginning of a text/plain document, cut to 300 characters
body: string;
// Thumbnail location as set by handlePreview: the remote URL or the URI returned by storage
thumb: string;
// Size reported by the fetch; -1 while loading and for HTML pages
size: number;
link: string; // Send original matched link to the client
// Initialised to null by this module and never read here
// NOTE(review): presumably managed by the client — confirm
shown?: boolean | null;
// Error identifier when type is "error": "message" or "image-too-big"
error?: string;
// Text of the underlying error when error is "message"
message?: string;
// URL of the audio/video resource for media previews
media?: string;
// Content type of the resource referenced by `media`
mediaType?: string;
// Allowed image size in bytes, set together with the "image-too-big" error
maxSize?: number;
// Remote thumbnail URL; temporary field that handlePreview deletes before the preview is emitted
thumbActualUrl?: string;
};
/**
 * Scans a message for links and starts fetching a preview for each of them.
 *
 * `msg.previews` is populated synchronously with placeholder previews of type
 * "loading". Each placeholder is handed to `parse` once its link has been
 * fetched; if fetching or parsing throws, the placeholder is turned into an
 * error preview and emitted to the client.
 *
 * Does nothing when `Config.values.prefetch` is disabled.
 *
 * @param client - Client whose browser language is used for the request and
 *   which receives the preview events.
 * @param chan - Channel the message belongs to.
 * @param msg - Message whose `previews` array is replaced.
 * @param cleanText - Text that is searched for links.
 */
export default function (
  client: Client,
  chan: Chan,
  msg: Msg,
  cleanText: string
): void {
  if (!Config.values.prefetch) {
    return;
  }

  // If there are too many urls in this message, only fetch the first valid ones
  const maxPreviewsPerMessage = 5;
  const previews: LinkPreview[] = [];

  for (const link of findLinksWithSchema(cleanText)) {
    if (previews.length >= maxPreviewsPerMessage) {
      break;
    }

    const url = normalizeURL(link.link);

    // If the URL is invalid and cannot be normalized, don't fetch it
    if (!url) {
      continue;
    }

    // Do not fetch duplicate links twice
    if (previews.some((existing) => existing.link === link.link)) {
      continue;
    }

    const preview: LinkPreview = {
      type: "loading",
      head: "",
      body: "",
      thumb: "",
      size: -1,
      link: link.link, // Send original matched link to the client
      shown: null,
    };

    previews.push(preview);

    void fetch(url, {
      accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
      language: client.config.browser?.language || "",
    })
      .then((res) => parse(msg, chan, preview, res, client))
      .catch((err: unknown) => {
        preview.type = "error";
        preview.error = "message";
        // Rejections are not guaranteed to be Error instances
        preview.message = err instanceof Error ? err.message : String(err);
        emitPreview(client, chan, msg, preview);
      });
  }

  msg.previews = previews;
}
/**
 * Builds a preview from a fetched HTML document.
 *
 * An Open Graph audio/video resource is preferred (see `parseHtmlMedia`).
 * When none is usable, the preview becomes a "link" preview made of the page
 * title, description and, if allowed by the configuration, a thumbnail.
 *
 * @param preview - Preview to fill in; mutated in place.
 * @param res - Fetch result holding the HTML document.
 * @param client - Client whose browser language is sent with follow-up requests.
 * @returns The fetch result the caller should continue with: the media or
 *   thumbnail response when one was fetched, `res` when there is nothing else
 *   to fetch, or `null` when fetching the thumbnail failed.
 */
async function parseHtml(
  preview: LinkPreview,
  res: FetchRequest,
  client: Client
): Promise<FetchRequest | null> {
  const $ = cheerio.load(res.data);

  try {
    return await parseHtmlMedia($, preview, client);
  } catch {
    // No usable media on this page, fall back to a regular link preview
  }

  preview.type = "link";
  preview.head = (
    $('meta[property="og:title"]').attr("content") ||
    $("head > title, title").first().text() ||
    ""
  ).substring(0, 100);
  preview.body = (
    $('meta[property="og:description"]').attr("content") ||
    $('meta[name="description"]').attr("content") ||
    ""
  ).substring(0, 300);

  // Thumbnails are neither stored nor displayed, no need to look for one
  if (!Config.values.prefetchStorage && Config.values.disableMediaPreview) {
    return res;
  }

  const declaredThumb =
    $('meta[property="og:image"]').attr("content") ||
    $('meta[name="twitter:image:src"]').attr("content") ||
    $('link[rel="image_src"]').attr("href") ||
    "";

  // Make sure thumbnail is a valid and absolute url
  const thumb = declaredThumb.length
    ? normalizeURL(declaredThumb, preview.link) || ""
    : "";

  if (!thumb.length) {
    return res;
  }

  // Verify that thumbnail pic exists and is under allowed size
  try {
    const resThumb = await fetch(thumb, {
      language: client.config.browser?.language || "",
    });

    if (
      imageTypeRegex.test(resThumb.type) &&
      resThumb.size <= Config.values.prefetchMaxImageSize * 1024
    ) {
      preview.thumbActualUrl = thumb;
    }

    return resThumb;
  } catch {
    return null;
  }
}
/**
 * Finds the first Open Graph media URL of the given kind in a document.
 *
 * The i-th `og:<type>:type` tag is paired with the i-th `og:<type>` tag. A
 * pair is accepted when the declared mime type is audio/video and the URL
 * normalizes to an absolute https URL.
 */
function findOpenGraphMediaUrl(
  // TODO: type $
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  $: any,
  type: string,
  baseLink: string
): string | undefined {
  const typeTags = $(`meta[property="og:${type}:type"]`).toArray();
  const urlTags = $(`meta[property="og:${type}"]`);

  for (let i = 0; i < typeTags.length; i++) {
    const mimeType = $(typeTags[i]).attr("content");

    if (!mimeType || !mediaTypeRegex.test(mimeType)) {
      continue;
    }

    const declaredUrl = $(urlTags.get(i)).attr("content");

    if (!declaredUrl) {
      continue;
    }

    // Make sure media is a valid, absolute https url
    const mediaUrl = normalizeURL(declaredUrl, baseLink, true);

    if (mediaUrl) {
      return mediaUrl;
    }
  }

  return undefined;
}

/**
 * Tries to turn an HTML page into an audio/video preview using its
 * Open Graph `og:video` / `og:audio` tags.
 *
 * @param $ - Cheerio instance loaded with the page.
 * @param preview - Preview to fill in; only mutated on success.
 * @param client - Client whose browser language is sent with the media request.
 * @returns The fetch result of the media resource.
 * @throws Rejects when media previews are disabled, the page declares a
 *   non-media `og:type`, no usable media tag exists, the media request fails,
 *   or the fetched resource is not audio/video.
 */
async function parseHtmlMedia(
  // TODO: type $
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  $: any,
  preview: LinkPreview,
  client: Client
): Promise<FetchRequest> {
  if (Config.values.disableMediaPreview) {
    throw new Error("Media previews are disabled");
  }

  const openGraphType = $('meta[property="og:type"]').attr("content");

  // Certain news websites may include video and audio tags,
  // despite actually being an article (as indicated by og:type).
  // If there is og:type tag, we will only select video or audio if it matches
  if (
    openGraphType &&
    !openGraphType.startsWith("video") &&
    !openGraphType.startsWith("music")
  ) {
    throw new Error("Page is not declared as video or music");
  }

  for (const type of ["video", "audio"]) {
    const mediaUrl = findOpenGraphMediaUrl($, type, preview.link);

    if (!mediaUrl) {
      continue;
    }

    // Only the first matching media is attempted; a failure here is final
    const resMedia = await fetch(mediaUrl, {
      accept:
        type === "video"
          ? "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5"
          : "audio/webm, audio/ogg, audio/wav, audio/*;q=0.9, application/ogg;q=0.7, video/*;q=0.6, */*;q=0.5",
      language: client.config.browser?.language || "",
    });

    if (!mediaTypeRegex.test(resMedia.type)) {
      throw new Error("Fetched resource is not audio or video");
    }

    // If we match a clean video or audio tag, use that as the preview instead
    preview.type = type;
    preview.media = mediaUrl;
    preview.mediaType = resMedia.type;

    return resMedia;
  }

  throw new Error("No usable Open Graph media found");
}
/**
 * Turns a fetched resource into a preview, based on its content type, and
 * passes the result on to `handlePreview`.
 *
 * Previews for unsupported content types, and media previews that are
 * disabled by configuration, are removed from the message instead.
 *
 * @param msg - Message the preview belongs to.
 * @param chan - Channel the message belongs to.
 * @param preview - Preview to fill in; mutated in place.
 * @param res - Fetch result for `preview.link`.
 * @param client - Client that receives the preview.
 */
function parse(
  msg: Msg,
  chan: Chan,
  preview: LinkPreview,
  res: FetchRequest,
  client: Client
) {
  const maxImageSize = Config.values.prefetchMaxImageSize * 1024;
  let promise: Promise<FetchRequest | null> | null = null;

  preview.size = res.size;

  switch (res.type) {
    case "text/html":
      // HTML previews do not carry a size
      preview.size = -1;
      promise = parseHtml(preview, res, client);
      break;

    case "text/plain":
      preview.type = "link";
      preview.body = res.data.toString().substring(0, 300);
      break;

    case "image/png":
    case "image/gif":
    case "image/jpg":
    case "image/jpeg":
    case "image/jxl":
    case "image/webp":
    case "image/avif":
      if (!Config.values.prefetchStorage && Config.values.disableMediaPreview) {
        return removePreview(msg, preview);
      }

      if (res.size > maxImageSize) {
        preview.type = "error";
        preview.error = "image-too-big";
        preview.maxSize = maxImageSize;
      } else {
        preview.type = "image";
        preview.thumbActualUrl = preview.link;
      }

      break;

    case "audio/midi":
    case "audio/mpeg":
    case "audio/mpeg3":
    case "audio/ogg":
    case "audio/wav":
    case "audio/x-wav":
    case "audio/x-mid":
    case "audio/x-midi":
    case "audio/x-mpeg":
    case "audio/x-mpeg-3":
    case "audio/flac":
    case "audio/x-flac":
    case "audio/mp4":
    case "audio/x-m4a":
      // NOTE(review): for non-https media the preview is emitted below while
      // still of type "loading" — confirm whether it should be removed instead
      if (!preview.link.startsWith("https://")) {
        break;
      }

      if (Config.values.disableMediaPreview) {
        return removePreview(msg, preview);
      }

      preview.type = "audio";
      preview.media = preview.link;
      preview.mediaType = res.type;

      break;

    case "video/webm":
    case "video/ogg":
    case "video/mp4":
      // NOTE(review): same "loading" caveat as for audio above
      if (!preview.link.startsWith("https://")) {
        break;
      }

      if (Config.values.disableMediaPreview) {
        return removePreview(msg, preview);
      }

      preview.type = "video";
      preview.media = preview.link;
      preview.mediaType = res.type;

      break;

    default:
      return removePreview(msg, preview);
  }

  if (!promise) {
    return handlePreview(client, chan, msg, preview, res);
  }

  void promise
    .then((newRes) => handlePreview(client, chan, msg, preview, newRes))
    .catch((err: unknown) => {
      // Report a failed HTML parse the same way a failed fetch is reported,
      // rather than leaving the rejection unhandled
      preview.type = "error";
      preview.error = "message";
      preview.message = err instanceof Error ? err.message : String(err);
      emitPreview(client, chan, msg, preview);
    });
}
/**
 * Finalizes the thumbnail of a preview and emits the preview to the client.
 *
 * Without a thumbnail, or when `Config.values.prefetchStorage` is off, the
 * remote thumbnail URL (possibly empty) is used as is. Otherwise the fetched
 * data is written through `storage.store` and the preview is emitted with the
 * URI passed to its callback.
 *
 * @param client - Client that receives the preview.
 * @param chan - Channel the message belongs to.
 * @param msg - Message the preview belongs to.
 * @param preview - Preview to finalize; `thumbActualUrl` is always removed.
 * @param res - Fetch result holding the thumbnail data, or `null` when the
 *   thumbnail request failed.
 */
function handlePreview(
  client: Client,
  chan: Chan,
  msg: Msg,
  preview: LinkPreview,
  res: FetchRequest | null
) {
  const thumb = preview.thumbActualUrl || "";
  delete preview.thumbActualUrl;

  if (!thumb.length || !Config.values.prefetchStorage) {
    preview.thumb = thumb;
    return emitPreview(client, chan, msg, preview);
  }

  // Get the correct file extension for the provided content-type
  // This is done to prevent user-input being stored in the file name (extension)
  // A missing response is handled like an unknown content-type
  const extension = res ? mime.extension(res.type) : false;

  if (!res || !extension) {
    // For link previews, drop the thumbnail
    // For other types, do not display preview at all
    if (preview.type !== "link") {
      return removePreview(msg, preview);
    }

    return emitPreview(client, chan, msg, preview);
  }

  storage.store(res.data, extension, (uri) => {
    preview.thumb = uri;
    emitPreview(client, chan, msg, preview);
  });
}
/**
 * Sends a finished preview to the client as a "msg:preview" event.
 *
 * A "link" preview without a title gets the placeholder title
 * "Untitled page" when it has a thumbnail or a description; when it has
 * neither, it is removed from the message and nothing is emitted.
 *
 * @param client - Client that receives the event.
 * @param chan - Channel the message belongs to.
 * @param msg - Message the preview belongs to.
 * @param preview - Preview to emit.
 */
function emitPreview(
  client: Client,
  chan: Chan,
  msg: Msg,
  preview: LinkPreview
) {
  // If there is no title but there is a thumbnail or description, set a title,
  // otherwise bail out and show no preview
  if (!preview.head.length && preview.type === "link") {
    if (!preview.thumb.length && !preview.body.length) {
      return removePreview(msg, preview);
    }

    preview.head = "Untitled page";
  }

  client.emit("msg:preview", {
    id: msg.id,
    chan: chan.id,
    preview,
  });
}
/**
 * Drops a preview from the message it is attached to.
 *
 * Used when a preview fails to load, so that the client does not attempt to
 * display a preview for it on page reload. Does nothing when the preview is
 * not part of `msg.previews`.
 */
function removePreview(msg: Msg, preview: LinkPreview) {
  const position = msg.previews.indexOf(preview);

  if (position === -1) {
    return;
  }

  msg.previews.splice(position, 1);
}
/**
 * Builds the HTTP request headers used for preview requests.
 *
 * @param headers - Optional overrides: `accept` becomes the `Accept` header
 *   (defaults to any type when missing or empty) and a non-empty `language`
 *   becomes the `Accept-Language` header.
 * @returns Headers to send with the request.
 */
function getRequestHeaders(
  headers: Record<string, string>
): Record<string, string> {
  const formattedHeaders: Record<string, string> = {
    // Certain websites like Amazon only add <meta> tags to known bots,
    // lets pretend to be them to get the metadata
    "User-Agent":
      "Mozilla/5.0 (compatible; Hard Lounge IRC Client; COLD HARD CHATS ONLY ON IRC.SUPERNETS.ORG; +https://git.supernets.org/supernets/hardlounge)" +
      " facebookexternalhit/1.1 Twitterbot/1.0",
    Accept: headers.accept || "*/*",
    "X-Purpose": "preview",
  };

  if (headers.language) {
    formattedHeaders["Accept-Language"] = headers.language;
  }

  return formattedHeaders;
}
/**
 * Downloads (the beginning of) a resource for preview purposes.
 *
 * Identical requests that are in flight at the same time share one promise.
 * The download is cut short as soon as enough is known:
 * - images: aborted right away when Content-Length exceeds
 *   `prefetchMaxImageSize` or when `prefetchStorage` is off, otherwise
 *   downloaded up to `prefetchMaxImageSize` KiB;
 * - audio/video: aborted as soon as the response headers arrive;
 * - everything else: downloaded up to `prefetchMaxSearchSize` KiB
 *   (50 KiB when that option is not set).
 *
 * @param uri - Absolute URL to request.
 * @param headers - Header overrides, see `getRequestHeaders`.
 * @returns The received data, the content type without parameters and the
 *   resource size. Rejects when the stream emits an error.
 */
function fetch(
  uri: string,
  headers: Record<string, string>
): Promise<FetchRequest> {
  // Stringify the object otherwise the objects won't compute to the same value
  const cacheKey = JSON.stringify([uri, headers]);
  const pending = currentFetchPromises.get(cacheKey);

  if (pending) {
    return pending;
  }

  const prefetchTimeout = Config.values.prefetchTimeout;

  if (!prefetchTimeout) {
    log.warn(
      "prefetchTimeout is missing from your Hard Lounge configuration, defaulting to 5000 ms"
    );
  }

  // An exception thrown inside the executor rejects the promise on its own
  const promise = new Promise<FetchRequest>((resolve, reject) => {
    // Chunks are collected and concatenated once, instead of on every chunk
    const chunks: Buffer[] = [];
    let received = 0;
    let contentLength = 0;
    let contentType: string | undefined;
    let limit = Config.values.prefetchMaxImageSize * 1024;

    const gotStream = got.stream(uri, {
      retry: 0,
      timeout: prefetchTimeout || 5000, // milliseconds
      headers: getRequestHeaders(headers),
      localAddress: Config.values.bind,
    });

    gotStream
      .on("response", (res) => {
        // A missing or malformed Content-Length counts as 0
        contentLength = parseInt(res.headers["content-length"] ?? "", 10) || 0;
        contentType = res.headers["content-type"];

        if (contentType && imageTypeRegex.test(contentType)) {
          // response is an image
          // if Content-Length header reports a size exceeding the prefetch limit, abort fetch
          // and if file is not to be stored we don't need to download further either
          if (contentLength > limit || !Config.values.prefetchStorage) {
            gotStream.destroy();
          }
        } else if (contentType && mediaTypeRegex.test(contentType)) {
          // We don't need to download the file any further after we received content-type header
          gotStream.destroy();
        } else {
          // if not image, limit download to the max search size, since we need only meta tags
          // twitter.com sends opengraph meta tags within ~20kb of data for individual tweets, the default is set to 50.
          // for sites like Youtube the og tags are in the first 300K and hence this is configurable by the admin
          limit =
            "prefetchMaxSearchSize" in Config.values
              ? Config.values.prefetchMaxSearchSize * 1024
              : // set to the previous size if config option is unset
                50 * 1024;
        }
      })
      .on("error", (e) => reject(e))
      .on("data", (chunk: Buffer) => {
        chunks.push(chunk);
        received += chunk.length;

        if (received >= limit) {
          gotStream.destroy();
        }
      })
      .on("end", () => gotStream.destroy())
      .on("close", () => {
        const data = Buffer.concat(chunks, received);

        // If we downloaded more data than specified in Content-Length, use real data size
        const size = contentLength > data.length ? contentLength : data.length;

        // Strip parameters such as charset from the content type
        const type = contentType ? contentType.split(/ *; */).shift() || "" : "";

        resolve({ data, type, size });
      });
  });

  // Not `finally`: the promise it returns would reject along with `promise`
  // and nothing would handle that rejection
  const removeCache = () => currentFetchPromises.delete(cacheKey);
  promise.then(removeCache).catch(removeCache);

  currentFetchPromises.set(cacheKey, promise);

  return promise;
}
/**
 * Validates a link and returns it in normalized, absolute form.
 *
 * @param link - Absolute URL, or a URL relative to `baseLink`.
 * @param baseLink - Base used to resolve a relative `link`.
 * @param disallowHttp - When true, only https links are accepted.
 * @returns The normalized URL without its hash, or `undefined` when the link
 *   cannot be parsed, uses a protocol other than http(s), has no hostname or
 *   contains credentials.
 */
function normalizeURL(link: string, baseLink?: string, disallowHttp = false) {
  let parsed: URL;

  try {
    parsed = new URL(link, baseLink);
  } catch {
    // if an exception was thrown, the url is not valid
    return undefined;
  }

  // Only fetch http and https links
  const allowedProtocols = disallowHttp ? ["https:"] : ["http:", "https:"];

  if (!allowedProtocols.includes(parsed.protocol)) {
    return undefined;
  }

  // Do not fetch links without hostname or ones that contain authorization
  const hasCredentials = parsed.username !== "" || parsed.password !== "";

  if (parsed.hostname === "" || hasCredentials) {
    return undefined;
  }

  // Drop hash from the url, if any
  parsed.hash = "";

  return parsed.toString();
}