Use WHATWG URL parser in link prefetcher

This commit is contained in:
Pavel Djundik 2018-04-27 16:27:26 +03:00
parent 629ae8bfa4
commit d4fa6bbcb0
2 changed files with 79 additions and 43 deletions

View File

@ -2,8 +2,7 @@
const cheerio = require("cheerio");
const request = require("request");
const url = require("url");
const URI = require("urijs");
const URL = require("url").URL;
const mime = require("mime-types");
const Helper = require("../../helper");
const cleanIrcMessage = require("../../../client/js/libs/handlebars/ircmessageparser/cleanIrcMessage");
@ -32,26 +31,36 @@ module.exports = function(client, chan, msg) {
// Remove all IRC formatting characters before searching for links
const cleanText = cleanIrcMessage(msg.text);
// We will only try to prefetch http(s) links
const links = findLinks(cleanText).filter((w) => isValidLink(w.link));
msg.previews = findLinks(cleanText).reduce((cleanLinks, link) => {
const url = normalizeURL(link.link);
if (links.length === 0) {
return;
}
// If the URL is invalid and cannot be normalized, don't fetch it
if (url === null) {
return cleanLinks;
}
msg.previews = Array.from(new Set( // Remove duplicate links
links.map((link) => link.link)
)).map((link) => ({
type: "loading",
head: "",
body: "",
thumb: "",
link: link,
shown: true,
})).slice(0, 5); // Only preview the first 5 URLs in message to avoid abuse
// If there are too many urls in this message, only fetch first X valid links
if (cleanLinks.length > 4) {
return cleanLinks;
}
msg.previews.forEach((preview) => {
fetch(normalizeURL(preview.link), {
// Do not fetch duplicate links twice
if (cleanLinks.some((l) => l.link === link.link)) {
return cleanLinks;
}
const preview = {
type: "loading",
head: "",
body: "",
thumb: "",
link: link.link, // Send original matched link to the client
shown: true,
};
cleanLinks.push(preview);
fetch(url, {
accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
language: client.language,
}, function(res, err) {
@ -68,7 +77,9 @@ module.exports = function(client, chan, msg) {
parse(msg, preview, res, client);
});
});
return cleanLinks;
}, []);
};
function parseHtml(preview, res, client) {
@ -93,18 +104,14 @@ function parseHtml(preview, res, client) {
|| $('link[rel="image_src"]').attr("href")
|| "";
// Make sure thumbnail is a valid and absolute url
if (preview.thumb.length) {
preview.thumb = url.resolve(preview.link, preview.thumb);
}
// Make sure thumbnail is a valid url
if (!isValidLink(preview.thumb)) {
preview.thumb = "";
preview.thumb = normalizeURL(preview.thumb, preview.link) || "";
}
// Verify that thumbnail pic exists and is under allowed size
if (preview.thumb.length) {
fetch(normalizeURL(preview.thumb), {language: client.language}, (resThumb) => {
fetch(preview.thumb, {language: client.language}, (resThumb) => {
if (resThumb === null
|| !(/^image\/.+/.test(resThumb.type))
|| resThumb.size > (Helper.config.prefetchMaxImageSize * 1024)) {
@ -134,16 +141,19 @@ function parseHtmlMedia($, preview, res, client) {
if (mediaTypeRegex.test(mimeType)) {
// If we match a clean video or audio tag, parse that as a preview instead
const mediaUrl = $($(`meta[property="og:${type}"]`).get(i)).attr("content");
let mediaUrl = $($(`meta[property="og:${type}"]`).get(i)).attr("content");
// Make sure media is a valid url
if (!mediaUrl.startsWith("https://")) {
mediaUrl = normalizeURL(mediaUrl, preview.link, true);
// Make sure media is a valid url
if (!mediaUrl) {
return;
}
foundMedia = true;
fetch(normalizeURL(mediaUrl), {
fetch(mediaUrl, {
accept: type === "video" ?
"video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5" :
"audio/webm, audio/ogg, audio/wav, audio/*;q=0.9, application/ogg;q=0.7, video/*;q=0.6; */*;q=0.5",
@ -360,27 +370,31 @@ function fetch(uri, headers, cb) {
});
}
function normalizeURL(header) {
return URI(header).normalize().toString();
}
function isValidLink(link) {
function normalizeURL(link, baseLink, disallowHttp = false) {
try {
const uri = URI(link);
const protocol = uri.protocol();
const url = new URL(link, baseLink);
// Only fetch http and https links
if (protocol !== "http" && protocol !== "https") {
return false;
if (url.protocol !== "http:" && url.protocol !== "https:") {
return null;
}
if (disallowHttp && url.protocol === "http:") {
return null;
}
// Do not fetch links without hostname or ones that contain authorization
if (!uri.hostname() || uri.username() || uri.password()) {
return false;
if (!url.hostname || url.username || url.password) {
return null;
}
// Drop hash from the url, if any
url.hash = "";
return url.toString();
} catch (e) {
return false;
// if an exception was thrown, the url is not valid
}
return true;
return null;
}

View File

@ -385,6 +385,28 @@ describe("Link plugin", function() {
});
});
it("should de-duplicate links", function(done) {
const message = this.irc.createMessage({
text: "//localhost:9002 http://localhost:9002 http://localhost:9002",
});
link(this.irc, this.network.channels[0], message);
expect(message.previews).to.deep.equal([{
type: "loading",
head: "",
body: "",
thumb: "",
link: "http://localhost:9002",
shown: true,
}]);
this.irc.once("msg:preview", function(data) {
expect(data.preview.link).to.equal("http://localhost:9002");
done();
});
});
it("should not try to fetch links with wrong protocol", function() {
const message = this.irc.createMessage({
text: "ssh://example.com ftp://example.com irc://example.com http:////////example.com",