621fa92036
When a URL is prefixed with a TLS scheme, we should make sure that the remote provides a valid cert, even just for prefetches. Else MITM of such a site is trivial. This probably breaks some people with self signed cert, but the age where that was acceptable is past. We have free CAs now like Let's Encrypt.
534 lines
14 KiB
TypeScript
534 lines
14 KiB
TypeScript
import * as cheerio from "cheerio";
|
|
import got from "got";
|
|
import {URL} from "url";
|
|
import mime from "mime-types";
|
|
|
|
import log from "../../log";
|
|
import Config from "../../config";
|
|
import {findLinksWithSchema} from "../../../client/js/helpers/ircmessageparser/findLinks";
|
|
import storage from "../storage";
|
|
import Client from "../../client";
|
|
import Chan from "../../models/chan";
|
|
import Msg from "../../models/msg";
|
|
|
|
type FetchRequest = {
|
|
data: Buffer;
|
|
type: string;
|
|
size: number;
|
|
};
|
|
const currentFetchPromises = new Map<string, Promise<FetchRequest>>();
|
|
const imageTypeRegex = /^image\/.+/;
|
|
const mediaTypeRegex = /^(audio|video)\/.+/;
|
|
|
|
export type LinkPreview = {
|
|
type: string;
|
|
head: string;
|
|
body: string;
|
|
thumb: string;
|
|
size: number;
|
|
link: string; // Send original matched link to the client
|
|
shown?: boolean | null;
|
|
error?: string;
|
|
message?: string;
|
|
|
|
media?: string;
|
|
mediaType?: string;
|
|
maxSize?: number;
|
|
thumbActualUrl?: string;
|
|
};
|
|
|
|
export default function (client: Client, chan: Chan, msg: Msg, cleanText: string) {
|
|
if (!Config.values.prefetch) {
|
|
return;
|
|
}
|
|
|
|
msg.previews = findLinksWithSchema(cleanText).reduce((cleanLinks: LinkPreview[], link) => {
|
|
const url = normalizeURL(link.link);
|
|
|
|
// If the URL is invalid and cannot be normalized, don't fetch it
|
|
if (!url) {
|
|
return cleanLinks;
|
|
}
|
|
|
|
// If there are too many urls in this message, only fetch first X valid links
|
|
if (cleanLinks.length > 4) {
|
|
return cleanLinks;
|
|
}
|
|
|
|
// Do not fetch duplicate links twice
|
|
if (cleanLinks.some((l) => l.link === link.link)) {
|
|
return cleanLinks;
|
|
}
|
|
|
|
const preview: LinkPreview = {
|
|
type: "loading",
|
|
head: "",
|
|
body: "",
|
|
thumb: "",
|
|
size: -1,
|
|
link: link.link, // Send original matched link to the client
|
|
shown: null,
|
|
};
|
|
|
|
cleanLinks.push(preview);
|
|
|
|
fetch(url, {
|
|
accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
language: client.config.browser?.language || "",
|
|
})
|
|
.then((res) => {
|
|
parse(msg, chan, preview, res, client);
|
|
})
|
|
.catch((err) => {
|
|
preview.type = "error";
|
|
preview.error = "message";
|
|
preview.message = err.message;
|
|
emitPreview(client, chan, msg, preview);
|
|
});
|
|
|
|
return cleanLinks;
|
|
}, []);
|
|
}
|
|
|
|
function parseHtml(preview, res, client: Client) {
|
|
// TODO:
|
|
// eslint-disable-next-line @typescript-eslint/no-misused-promises
|
|
return new Promise((resolve: (preview: FetchRequest | null) => void) => {
|
|
const $ = cheerio.load(res.data);
|
|
|
|
return parseHtmlMedia($, preview, client)
|
|
.then((newRes) => resolve(newRes))
|
|
.catch(() => {
|
|
preview.type = "link";
|
|
preview.head =
|
|
$('meta[property="og:title"]').attr("content") ||
|
|
$("head > title, title").first().text() ||
|
|
"";
|
|
preview.body =
|
|
$('meta[property="og:description"]').attr("content") ||
|
|
$('meta[name="description"]').attr("content") ||
|
|
"";
|
|
|
|
if (preview.head.length) {
|
|
preview.head = preview.head.substr(0, 100);
|
|
}
|
|
|
|
if (preview.body.length) {
|
|
preview.body = preview.body.substr(0, 300);
|
|
}
|
|
|
|
if (!Config.values.prefetchStorage && Config.values.disableMediaPreview) {
|
|
resolve(res);
|
|
return;
|
|
}
|
|
|
|
let thumb =
|
|
$('meta[property="og:image"]').attr("content") ||
|
|
$('meta[name="twitter:image:src"]').attr("content") ||
|
|
$('link[rel="image_src"]').attr("href") ||
|
|
"";
|
|
|
|
// Make sure thumbnail is a valid and absolute url
|
|
if (thumb.length) {
|
|
thumb = normalizeURL(thumb, preview.link) || "";
|
|
}
|
|
|
|
// Verify that thumbnail pic exists and is under allowed size
|
|
if (thumb.length) {
|
|
fetch(thumb, {language: client.config.browser?.language || ""})
|
|
.then((resThumb) => {
|
|
if (
|
|
resThumb !== null &&
|
|
imageTypeRegex.test(resThumb.type) &&
|
|
resThumb.size <= Config.values.prefetchMaxImageSize * 1024
|
|
) {
|
|
preview.thumbActualUrl = thumb;
|
|
}
|
|
|
|
resolve(resThumb);
|
|
})
|
|
.catch(() => resolve(null));
|
|
} else {
|
|
resolve(res);
|
|
}
|
|
});
|
|
});
|
|
}
|
|
|
|
// TODO: type $
|
|
function parseHtmlMedia($: any, preview, client: Client): Promise<FetchRequest> {
|
|
return new Promise((resolve, reject) => {
|
|
if (Config.values.disableMediaPreview) {
|
|
reject();
|
|
return;
|
|
}
|
|
|
|
let foundMedia = false;
|
|
const openGraphType = $('meta[property="og:type"]').attr("content");
|
|
|
|
// Certain news websites may include video and audio tags,
|
|
// despite actually being an article (as indicated by og:type).
|
|
// If there is og:type tag, we will only select video or audio if it matches
|
|
if (
|
|
openGraphType &&
|
|
!openGraphType.startsWith("video") &&
|
|
!openGraphType.startsWith("music")
|
|
) {
|
|
reject();
|
|
return;
|
|
}
|
|
|
|
["video", "audio"].forEach((type) => {
|
|
if (foundMedia) {
|
|
return;
|
|
}
|
|
|
|
$(`meta[property="og:${type}:type"]`).each(function (this: cheerio.Element, i: number) {
|
|
const mimeType = $(this).attr("content");
|
|
|
|
if (!mimeType) {
|
|
return;
|
|
}
|
|
|
|
if (mediaTypeRegex.test(mimeType)) {
|
|
// If we match a clean video or audio tag, parse that as a preview instead
|
|
let mediaUrl = $($(`meta[property="og:${type}"]`).get(i)).attr("content");
|
|
|
|
if (!mediaUrl) {
|
|
return;
|
|
}
|
|
|
|
// Make sure media is a valid url
|
|
mediaUrl = normalizeURL(mediaUrl, preview.link, true);
|
|
|
|
// Make sure media is a valid url
|
|
if (!mediaUrl) {
|
|
return;
|
|
}
|
|
|
|
foundMedia = true;
|
|
|
|
fetch(mediaUrl, {
|
|
accept:
|
|
type === "video"
|
|
? "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5"
|
|
: "audio/webm, audio/ogg, audio/wav, audio/*;q=0.9, application/ogg;q=0.7, video/*;q=0.6; */*;q=0.5",
|
|
language: client.config.browser?.language || "",
|
|
})
|
|
.then((resMedia) => {
|
|
if (resMedia === null || !mediaTypeRegex.test(resMedia.type)) {
|
|
return reject();
|
|
}
|
|
|
|
preview.type = type;
|
|
preview.media = mediaUrl;
|
|
preview.mediaType = resMedia.type;
|
|
|
|
resolve(resMedia);
|
|
})
|
|
.catch(reject);
|
|
|
|
return false;
|
|
}
|
|
});
|
|
});
|
|
|
|
if (!foundMedia) {
|
|
reject();
|
|
}
|
|
});
|
|
}
|
|
|
|
function parse(msg: Msg, chan: Chan, preview: LinkPreview, res: FetchRequest, client: Client) {
|
|
let promise: Promise<FetchRequest | null> | null = null;
|
|
|
|
preview.size = res.size;
|
|
|
|
switch (res.type) {
|
|
case "text/html":
|
|
preview.size = -1;
|
|
promise = parseHtml(preview, res, client);
|
|
break;
|
|
|
|
case "text/plain":
|
|
preview.type = "link";
|
|
preview.body = res.data.toString().substr(0, 300);
|
|
break;
|
|
|
|
case "image/png":
|
|
case "image/gif":
|
|
case "image/jpg":
|
|
case "image/jpeg":
|
|
case "image/jxl":
|
|
case "image/webp":
|
|
case "image/avif":
|
|
if (!Config.values.prefetchStorage && Config.values.disableMediaPreview) {
|
|
return removePreview(msg, preview);
|
|
}
|
|
|
|
if (res.size > Config.values.prefetchMaxImageSize * 1024) {
|
|
preview.type = "error";
|
|
preview.error = "image-too-big";
|
|
preview.maxSize = Config.values.prefetchMaxImageSize * 1024;
|
|
} else {
|
|
preview.type = "image";
|
|
preview.thumbActualUrl = preview.link;
|
|
}
|
|
|
|
break;
|
|
|
|
case "audio/midi":
|
|
case "audio/mpeg":
|
|
case "audio/mpeg3":
|
|
case "audio/ogg":
|
|
case "audio/wav":
|
|
case "audio/x-wav":
|
|
case "audio/x-mid":
|
|
case "audio/x-midi":
|
|
case "audio/x-mpeg":
|
|
case "audio/x-mpeg-3":
|
|
case "audio/flac":
|
|
case "audio/x-flac":
|
|
case "audio/mp4":
|
|
case "audio/x-m4a":
|
|
if (!preview.link.startsWith("https://")) {
|
|
break;
|
|
}
|
|
|
|
if (Config.values.disableMediaPreview) {
|
|
return removePreview(msg, preview);
|
|
}
|
|
|
|
preview.type = "audio";
|
|
preview.media = preview.link;
|
|
preview.mediaType = res.type;
|
|
|
|
break;
|
|
|
|
case "video/webm":
|
|
case "video/ogg":
|
|
case "video/mp4":
|
|
if (!preview.link.startsWith("https://")) {
|
|
break;
|
|
}
|
|
|
|
if (Config.values.disableMediaPreview) {
|
|
return removePreview(msg, preview);
|
|
}
|
|
|
|
preview.type = "video";
|
|
preview.media = preview.link;
|
|
preview.mediaType = res.type;
|
|
|
|
break;
|
|
|
|
default:
|
|
return removePreview(msg, preview);
|
|
}
|
|
|
|
if (!promise) {
|
|
return handlePreview(client, chan, msg, preview, res);
|
|
}
|
|
|
|
void promise.then((newRes) => handlePreview(client, chan, msg, preview, newRes));
|
|
}
|
|
|
|
function handlePreview(client: Client, chan: Chan, msg: Msg, preview: LinkPreview, res) {
|
|
const thumb = preview.thumbActualUrl || "";
|
|
delete preview.thumbActualUrl;
|
|
|
|
if (!thumb.length || !Config.values.prefetchStorage) {
|
|
preview.thumb = thumb;
|
|
return emitPreview(client, chan, msg, preview);
|
|
}
|
|
|
|
// Get the correct file extension for the provided content-type
|
|
// This is done to prevent user-input being stored in the file name (extension)
|
|
const extension = mime.extension(res.type);
|
|
|
|
if (!extension) {
|
|
// For link previews, drop the thumbnail
|
|
// For other types, do not display preview at all
|
|
if (preview.type !== "link") {
|
|
return removePreview(msg, preview);
|
|
}
|
|
|
|
return emitPreview(client, chan, msg, preview);
|
|
}
|
|
|
|
storage.store(res.data, extension, (uri) => {
|
|
preview.thumb = uri;
|
|
|
|
emitPreview(client, chan, msg, preview);
|
|
});
|
|
}
|
|
|
|
function emitPreview(client: Client, chan: Chan, msg: Msg, preview: LinkPreview) {
|
|
// If there is no title but there is preview or description, set title
|
|
// otherwise bail out and show no preview
|
|
if (!preview.head.length && preview.type === "link") {
|
|
if (preview.thumb.length || preview.body.length) {
|
|
preview.head = "Untitled page";
|
|
} else {
|
|
return removePreview(msg, preview);
|
|
}
|
|
}
|
|
|
|
client.emit("msg:preview", {
|
|
id: msg.id,
|
|
chan: chan.id,
|
|
preview: preview,
|
|
});
|
|
}
|
|
|
|
function removePreview(msg: Msg, preview: LinkPreview) {
|
|
// If a preview fails to load, remove the link from msg object
|
|
// So that client doesn't attempt to display an preview on page reload
|
|
const index = msg.previews.indexOf(preview);
|
|
|
|
if (index > -1) {
|
|
msg.previews.splice(index, 1);
|
|
}
|
|
}
|
|
|
|
function getRequestHeaders(headers: Record<string, string>) {
|
|
const formattedHeaders = {
|
|
// Certain websites like Amazon only add <meta> tags to known bots,
|
|
// lets pretend to be them to get the metadata
|
|
"User-Agent":
|
|
"Mozilla/5.0 (compatible; The Lounge IRC Client; +https://github.com/thelounge/thelounge)" +
|
|
" facebookexternalhit/1.1 Twitterbot/1.0",
|
|
Accept: headers.accept || "*/*",
|
|
"X-Purpose": "preview",
|
|
};
|
|
|
|
if (headers.language) {
|
|
formattedHeaders["Accept-Language"] = headers.language;
|
|
}
|
|
|
|
return formattedHeaders;
|
|
}
|
|
|
|
function fetch(uri: string, headers: Record<string, string>) {
|
|
// Stringify the object otherwise the objects won't compute to the same value
|
|
const cacheKey = JSON.stringify([uri, headers]);
|
|
let promise = currentFetchPromises.get(cacheKey);
|
|
|
|
if (promise) {
|
|
return promise;
|
|
}
|
|
|
|
const prefetchTimeout = Config.values.prefetchTimeout;
|
|
|
|
if (!prefetchTimeout) {
|
|
log.warn(
|
|
"prefetchTimeout is missing from your The Lounge configuration, defaulting to 5000 ms"
|
|
);
|
|
}
|
|
|
|
promise = new Promise<FetchRequest>((resolve, reject) => {
|
|
let buffer = Buffer.from("");
|
|
let contentLength = 0;
|
|
let contentType: string | undefined;
|
|
let limit = Config.values.prefetchMaxImageSize * 1024;
|
|
|
|
try {
|
|
const gotStream = got.stream(uri, {
|
|
retry: 0,
|
|
timeout: prefetchTimeout || 5000, // milliseconds
|
|
headers: getRequestHeaders(headers),
|
|
});
|
|
|
|
gotStream
|
|
.on("response", function (res) {
|
|
contentLength = parseInt(res.headers["content-length"], 10) || 0;
|
|
contentType = res.headers["content-type"];
|
|
|
|
if (contentType && imageTypeRegex.test(contentType)) {
|
|
// response is an image
|
|
// if Content-Length header reports a size exceeding the prefetch limit, abort fetch
|
|
// and if file is not to be stored we don't need to download further either
|
|
if (contentLength > limit || !Config.values.prefetchStorage) {
|
|
gotStream.destroy();
|
|
}
|
|
} else if (contentType && mediaTypeRegex.test(contentType)) {
|
|
// We don't need to download the file any further after we received content-type header
|
|
gotStream.destroy();
|
|
} else {
|
|
// if not image, limit download to the max search size, since we need only meta tags
|
|
// twitter.com sends opengraph meta tags within ~20kb of data for individual tweets, the default is set to 50.
|
|
// for sites like Youtube the og tags are in the first 300K and hence this is configurable by the admin
|
|
limit =
|
|
"prefetchMaxSearchSize" in Config.values
|
|
? Config.values.prefetchMaxSearchSize * 1024
|
|
: // set to the previous size if config option is unset
|
|
50 * 1024;
|
|
}
|
|
})
|
|
.on("error", (e) => reject(e))
|
|
.on("data", (data) => {
|
|
buffer = Buffer.concat(
|
|
[buffer, data],
|
|
buffer.length + (data as Array<any>).length
|
|
);
|
|
|
|
if (buffer.length >= limit) {
|
|
gotStream.destroy();
|
|
}
|
|
})
|
|
.on("end", () => gotStream.destroy())
|
|
.on("close", () => {
|
|
let type = "";
|
|
|
|
// If we downloaded more data then specified in Content-Length, use real data size
|
|
const size = contentLength > buffer.length ? contentLength : buffer.length;
|
|
|
|
if (contentType) {
|
|
type = contentType.split(/ *; */).shift() || "";
|
|
}
|
|
|
|
resolve({data: buffer, type, size});
|
|
});
|
|
} catch (e: any) {
|
|
return reject(e);
|
|
}
|
|
});
|
|
|
|
const removeCache = () => currentFetchPromises.delete(cacheKey);
|
|
|
|
promise.then(removeCache).catch(removeCache);
|
|
|
|
currentFetchPromises.set(cacheKey, promise);
|
|
|
|
return promise;
|
|
}
|
|
|
|
function normalizeURL(link: string, baseLink?: string, disallowHttp = false) {
|
|
try {
|
|
const url = new URL(link, baseLink);
|
|
|
|
// Only fetch http and https links
|
|
if (url.protocol !== "http:" && url.protocol !== "https:") {
|
|
return undefined;
|
|
}
|
|
|
|
if (disallowHttp && url.protocol === "http:") {
|
|
return undefined;
|
|
}
|
|
|
|
// Do not fetch links without hostname or ones that contain authorization
|
|
if (!url.hostname || url.username || url.password) {
|
|
return undefined;
|
|
}
|
|
|
|
// Drop hash from the url, if any
|
|
url.hash = "";
|
|
|
|
return url.toString();
|
|
} catch (e: any) {
|
|
// if an exception was thrown, the url is not valid
|
|
}
|
|
|
|
return undefined;
|
|
}
|