Use WHATWG URL parser in link prefetcher

This commit is contained in:
Pavel Djundik 2018-04-27 16:27:26 +03:00
parent 629ae8bfa4
commit d4fa6bbcb0
2 changed files with 79 additions and 43 deletions

View File

@ -2,8 +2,7 @@
const cheerio = require("cheerio"); const cheerio = require("cheerio");
const request = require("request"); const request = require("request");
const url = require("url"); const URL = require("url").URL;
const URI = require("urijs");
const mime = require("mime-types"); const mime = require("mime-types");
const Helper = require("../../helper"); const Helper = require("../../helper");
const cleanIrcMessage = require("../../../client/js/libs/handlebars/ircmessageparser/cleanIrcMessage"); const cleanIrcMessage = require("../../../client/js/libs/handlebars/ircmessageparser/cleanIrcMessage");
@ -32,26 +31,36 @@ module.exports = function(client, chan, msg) {
// Remove all IRC formatting characters before searching for links // Remove all IRC formatting characters before searching for links
const cleanText = cleanIrcMessage(msg.text); const cleanText = cleanIrcMessage(msg.text);
// We will only try to prefetch http(s) links msg.previews = findLinks(cleanText).reduce((cleanLinks, link) => {
const links = findLinks(cleanText).filter((w) => isValidLink(w.link)); const url = normalizeURL(link.link);
if (links.length === 0) { // If the URL is invalid and cannot be normalized, don't fetch it
return; if (url === null) {
return cleanLinks;
} }
msg.previews = Array.from(new Set( // Remove duplicate links // If there are too many urls in this message, only fetch first X valid links
links.map((link) => link.link) if (cleanLinks.length > 4) {
)).map((link) => ({ return cleanLinks;
}
// Do not fetch duplicate links twice
if (cleanLinks.some((l) => l.link === link.link)) {
return cleanLinks;
}
const preview = {
type: "loading", type: "loading",
head: "", head: "",
body: "", body: "",
thumb: "", thumb: "",
link: link, link: link.link, // Send original matched link to the client
shown: true, shown: true,
})).slice(0, 5); // Only preview the first 5 URLs in message to avoid abuse };
msg.previews.forEach((preview) => { cleanLinks.push(preview);
fetch(normalizeURL(preview.link), {
fetch(url, {
accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
language: client.language, language: client.language,
}, function(res, err) { }, function(res, err) {
@ -68,7 +77,9 @@ module.exports = function(client, chan, msg) {
parse(msg, preview, res, client); parse(msg, preview, res, client);
}); });
});
return cleanLinks;
}, []);
}; };
function parseHtml(preview, res, client) { function parseHtml(preview, res, client) {
@ -93,18 +104,14 @@ function parseHtml(preview, res, client) {
|| $('link[rel="image_src"]').attr("href") || $('link[rel="image_src"]').attr("href")
|| ""; || "";
// Make sure thumbnail is a valid and absolute url
if (preview.thumb.length) { if (preview.thumb.length) {
preview.thumb = url.resolve(preview.link, preview.thumb); preview.thumb = normalizeURL(preview.thumb, preview.link) || "";
}
// Make sure thumbnail is a valid url
if (!isValidLink(preview.thumb)) {
preview.thumb = "";
} }
// Verify that thumbnail pic exists and is under allowed size // Verify that thumbnail pic exists and is under allowed size
if (preview.thumb.length) { if (preview.thumb.length) {
fetch(normalizeURL(preview.thumb), {language: client.language}, (resThumb) => { fetch(preview.thumb, {language: client.language}, (resThumb) => {
if (resThumb === null if (resThumb === null
|| !(/^image\/.+/.test(resThumb.type)) || !(/^image\/.+/.test(resThumb.type))
|| resThumb.size > (Helper.config.prefetchMaxImageSize * 1024)) { || resThumb.size > (Helper.config.prefetchMaxImageSize * 1024)) {
@ -134,16 +141,19 @@ function parseHtmlMedia($, preview, res, client) {
if (mediaTypeRegex.test(mimeType)) { if (mediaTypeRegex.test(mimeType)) {
// If we match a clean video or audio tag, parse that as a preview instead // If we match a clean video or audio tag, parse that as a preview instead
const mediaUrl = $($(`meta[property="og:${type}"]`).get(i)).attr("content"); let mediaUrl = $($(`meta[property="og:${type}"]`).get(i)).attr("content");
// Make sure media is a valid url // Make sure media is a valid url
if (!mediaUrl.startsWith("https://")) { mediaUrl = normalizeURL(mediaUrl, preview.link, true);
// Make sure media is a valid url
if (!mediaUrl) {
return; return;
} }
foundMedia = true; foundMedia = true;
fetch(normalizeURL(mediaUrl), { fetch(mediaUrl, {
accept: type === "video" ? accept: type === "video" ?
"video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5" : "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5" :
"audio/webm, audio/ogg, audio/wav, audio/*;q=0.9, application/ogg;q=0.7, video/*;q=0.6; */*;q=0.5", "audio/webm, audio/ogg, audio/wav, audio/*;q=0.9, application/ogg;q=0.7, video/*;q=0.6; */*;q=0.5",
@ -360,27 +370,31 @@ function fetch(uri, headers, cb) {
}); });
} }
function normalizeURL(header) { function normalizeURL(link, baseLink, disallowHttp = false) {
return URI(header).normalize().toString();
}
function isValidLink(link) {
try { try {
const uri = URI(link); const url = new URL(link, baseLink);
const protocol = uri.protocol();
// Only fetch http and https links // Only fetch http and https links
if (protocol !== "http" && protocol !== "https") { if (url.protocol !== "http:" && url.protocol !== "https:") {
return false; return null;
}
if (disallowHttp && url.protocol === "http:") {
return null;
} }
// Do not fetch links without hostname or ones that contain authorization // Do not fetch links without hostname or ones that contain authorization
if (!uri.hostname() || uri.username() || uri.password()) { if (!url.hostname || url.username || url.password) {
return false; return null;
}
} catch (e) {
return false;
} }
return true; // Drop hash from the url, if any
url.hash = "";
return url.toString();
} catch (e) {
// if an exception was thrown, the url is not valid
}
return null;
} }

View File

@ -385,6 +385,28 @@ describe("Link plugin", function() {
}); });
}); });
// Regression test: a message that mentions the same target several times
// must yield a single preview entry rather than one per occurrence.
it("should de-duplicate links", function(done) {
// Three link-like tokens: one protocol-relative ("//localhost:9002") and the
// same explicit http URL twice.
// NOTE(review): presumably the link finder reports the protocol-relative token
// as "http://localhost:9002" as well — that code is not visible here; confirm.
const message = this.irc.createMessage({
text: "//localhost:9002 http://localhost:9002 http://localhost:9002",
});
link(this.irc, this.network.channels[0], message);
// Checked synchronously, right after link() returns: exactly one placeholder
// preview in the "loading" state, carrying the link "http://localhost:9002".
expect(message.previews).to.deep.equal([{
type: "loading",
head: "",
body: "",
thumb: "",
link: "http://localhost:9002",
shown: true,
}]);
// The test completes (done) on the first "msg:preview" event, after checking
// that the emitted preview refers to the same link.
this.irc.once("msg:preview", function(data) {
expect(data.preview.link).to.equal("http://localhost:9002");
done();
});
});
it("should not try to fetch links with wrong protocol", function() { it("should not try to fetch links with wrong protocol", function() {
const message = this.irc.createMessage({ const message = this.irc.createMessage({
text: "ssh://example.com ftp://example.com irc://example.com http:////////example.com", text: "ssh://example.com ftp://example.com irc://example.com http:////////example.com",