diff --git a/client/js/libs/handlebars/ircmessageparser/findLinks.js b/client/js/libs/handlebars/ircmessageparser/findLinks.js index 48726dc3..91f69958 100644 --- a/client/js/libs/handlebars/ircmessageparser/findLinks.js +++ b/client/js/libs/handlebars/ircmessageparser/findLinks.js @@ -1,13 +1,30 @@ "use strict"; -const URI = require("urijs"); +const LinkifyIt = require("linkify-it"); -// Known schemes to detect in a text. If a text contains `foo...bar://foo.com`, -// the parsed scheme should be `foo...bar` but if it contains -// `foo...http://foo.com`, we assume the scheme to extract will be `http`. +LinkifyIt.prototype.normalize = function normalize(match) { + if (!match.schema) { + match.schema = "https:"; + match.url = "https://" + match.url; + } + + if (match.schema === "//") { + match.schema = "https:"; + match.url = "https:" + match.url; + } + + if (match.schema === "mailto:" && !/^mailto:/i.test(match.url)) { + match.url = "mailto:" + match.url; + } +}; + +const linkify = LinkifyIt() + .tlds(require("tlds")) + .tlds("onion", true); + +// Known schemes to detect in text const commonSchemes = [ - "http", "https", - "ftp", "sftp", + "sftp", "smb", "file", "irc", "ircs", "svn", "git", @@ -15,54 +32,22 @@ const commonSchemes = [ "svn+ssh", "ssh", ]; +for (const schema of commonSchemes) { + linkify.add(schema + ":", "http:"); +} + function findLinks(text) { - const result = []; + const matches = linkify.match(text); - // URI.withinString() identifies URIs within text, e.g. to translate them to - // -Tags. - // See https://medialize.github.io/URI.js/docs.html#static-withinString - // In our case, we store each URI encountered in a result array. - try { - URI.withinString(text, function(url, start, end) { - let parsedScheme; - - try { - // Extract the scheme of the URL detected, if there is one - parsedScheme = URI(url).scheme().toLowerCase(); - } catch (e) { - // URI may throw an exception for malformed urls, - // as to why withinString finds these in the first place is a mystery - return; - } - - // Check if the scheme of the detected URL matches a common one above. - // In a URL like `foo..http://example.com`, the scheme would be `foo..http`, - // so we need to clean up the end of the scheme and filter out the rest. - const matchedScheme = commonSchemes.find((scheme) => parsedScheme.endsWith(scheme)); - - // A known scheme was found, extract the unknown part from the URL - if (matchedScheme) { - const prefix = parsedScheme.length - matchedScheme.length; - start += prefix; - url = url.slice(prefix); - } - - // The URL matched but does not start with a scheme (`www.foo.com`), add it - if (!parsedScheme.length) { - url = "http://" + url; - } - - result.push({ - start: start, - end: end, - link: url, - }); - }); - } catch (e) { - // withinString is wrapped in a try/catch due to https://github.com/medialize/URI.js/issues/359 + if (!matches) { + return []; } - return result; + return matches.map((url) => ({ + start: url.index, + end: url.lastIndex, + link: url.url, + })); } module.exports = findLinks; diff --git a/package.json b/package.json index 478e59c7..eef77985 100644 --- a/package.json +++ b/package.json @@ -36,7 +36,7 @@ ], "license": "MIT", "engines": { - "node": ">=6" + "node": ">=6.13.0" }, "dependencies": { "bcryptjs": "2.4.3", @@ -46,6 +46,7 @@ "express": "4.16.3", "fs-extra": "6.0.1", "irc-framework": "2.11.0", + "linkify-it": "2.0.3", "lodash": "4.17.10", "mime-types": "2.1.18", "moment": "2.22.1", @@ -58,6 +59,7 @@ "spdy": "3.4.7", "sqlite3": "4.0.0", "thelounge-ldapjs-non-maintained-fork": "1.0.2", + "tlds": "1.203.1", "ua-parser-js": "0.7.18", "urijs": "1.19.1", "uuid": "3.2.1", diff --git a/src/plugins/irc-events/link.js b/src/plugins/irc-events/link.js index aec71826..6d0bede6 100644 --- a/src/plugins/irc-events/link.js +++ b/src/plugins/irc-events/link.js @@ -2,8 +2,7 @@ const cheerio = require("cheerio"); const request = require("request"); -const url = require("url"); -const URI = require("urijs"); +const URL = require("url").URL; const mime = require("mime-types"); const Helper = require("../../helper"); const cleanIrcMessage = require("../../../client/js/libs/handlebars/ircmessageparser/cleanIrcMessage"); @@ -11,7 +10,6 @@ const findLinks = require("../../../client/js/libs/handlebars/ircmessageparser/f const storage = require("../storage"); const mediaTypeRegex = /^(audio|video)\/.+/; -const linkRegex = /^https?:\/\//; // Fix ECDH curve client compatibility in Node v8/v9 // This is fixed in Node 10, but The Lounge supports LTS versions @@ -33,26 +31,36 @@ module.exports = function(client, chan, msg) { // Remove all IRC formatting characters before searching for links const cleanText = cleanIrcMessage(msg.text); - // We will only try to prefetch http(s) links - const links = findLinks(cleanText).filter((w) => linkRegex.test(w.link)); + msg.previews = findLinks(cleanText).reduce((cleanLinks, link) => { + const url = normalizeURL(link.link); - if (links.length === 0) { - return; - } + // If the URL is invalid and cannot be normalized, don't fetch it + if (url === null) { + return cleanLinks; + } - msg.previews = Array.from(new Set( // Remove duplicate links - links.map((link) => link.link) - )).map((link) => ({ - type: "loading", - head: "", - body: "", - thumb: "", - link: link, - shown: true, - })).slice(0, 5); // Only preview the first 5 URLs in message to avoid abuse + // If there are too many urls in this message, only fetch first X valid links + if (cleanLinks.length > 4) { + return cleanLinks; + } - msg.previews.forEach((preview) => { - fetch(normalizeURL(preview.link), { + // Do not fetch duplicate links twice + if (cleanLinks.some((l) => l.link === link.link)) { + return cleanLinks; + } + + const preview = { + type: "loading", + head: "", + body: "", + thumb: "", + link: link.link, // Send original matched link to the client + shown: true, + }; + + cleanLinks.push(preview); + + fetch(url, { accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", language: client.language, }, function(res, err) { @@ -69,7 +77,9 @@ module.exports = function(client, chan, msg) { parse(msg, preview, res, client); }); - }); + + return cleanLinks; + }, []); }; function parseHtml(preview, res, client) { @@ -94,18 +104,14 @@ function parseHtml(preview, res, client) { || $('link[rel="image_src"]').attr("href") || ""; + // Make sure thumbnail is a valid and absolute url if (preview.thumb.length) { - preview.thumb = url.resolve(preview.link, preview.thumb); - } - - // Make sure thumbnail is a valid url - if (!linkRegex.test(preview.thumb)) { - preview.thumb = ""; + preview.thumb = normalizeURL(preview.thumb, preview.link) || ""; } // Verify that thumbnail pic exists and is under allowed size if (preview.thumb.length) { - fetch(normalizeURL(preview.thumb), {language: client.language}, (resThumb) => { + fetch(preview.thumb, {language: client.language}, (resThumb) => { if (resThumb === null || !(/^image\/.+/.test(resThumb.type)) || resThumb.size > (Helper.config.prefetchMaxImageSize * 1024)) { @@ -135,16 +141,19 @@ function parseHtmlMedia($, preview, res, client) { if (mediaTypeRegex.test(mimeType)) { // If we match a clean video or audio tag, parse that as a preview instead - const mediaUrl = $($(`meta[property="og:${type}"]`).get(i)).attr("content"); + let mediaUrl = $($(`meta[property="og:${type}"]`).get(i)).attr("content"); // Make sure media is a valid url - if (!mediaUrl.startsWith("https://")) { + mediaUrl = normalizeURL(mediaUrl, preview.link, true); + + // Make sure media is a valid url + if (!mediaUrl) { return; } foundMedia = true; - fetch(normalizeURL(mediaUrl), { + fetch(mediaUrl, { accept: type === "video" ? "video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5" : "audio/webm, audio/ogg, audio/wav, audio/*;q=0.9, application/ogg;q=0.7, video/*;q=0.6; */*;q=0.5", @@ -361,6 +370,31 @@ function fetch(uri, headers, cb) { }); } -function normalizeURL(header) { - return URI(header).normalize().toString(); +function normalizeURL(link, baseLink, disallowHttp = false) { + try { + const url = new URL(link, baseLink); + + // Only fetch http and https links + if (url.protocol !== "http:" && url.protocol !== "https:") { + return null; + } + + if (disallowHttp && url.protocol === "http:") { + return null; + } + + // Do not fetch links without hostname or ones that contain authorization + if (!url.hostname || url.username || url.password) { + return null; + } + + // Drop hash from the url, if any + url.hash = ""; + + return url.toString(); + } catch (e) { + // if an exception was thrown, the url is not valid + } + + return null; } diff --git a/test/client/js/libs/handlebars/ircmessageparser/findLinks.js b/test/client/js/libs/handlebars/ircmessageparser/findLinks.js index 7f8ab4e2..36cfa8e3 100644 --- a/test/client/js/libs/handlebars/ircmessageparser/findLinks.js +++ b/test/client/js/libs/handlebars/ircmessageparser/findLinks.js @@ -22,7 +22,7 @@ describe("findLinks", () => { const expected = [{ start: 0, end: 24, - link: "http://www.nooooooooooooooo.com", + link: "https://www.nooooooooooooooo.com", }]; const actual = findLinks(input); @@ -46,7 +46,7 @@ describe("findLinks", () => { it("should find urls in strings starting with www", () => { const input = "use www.duckduckgo.com for privacy reasons"; const expected = [{ - link: "http://www.duckduckgo.com", + link: "https://www.duckduckgo.com", start: 4, end: 22, }]; @@ -69,12 +69,12 @@ describe("findLinks", () => { expect(actual).to.deep.equal(expected); }); - it("should find urls with starting with www. and odd surroundings", () => { - const input = ".:www.github.com:."; + it("should find urls with starting with http:// and odd surroundings", () => { + const input = ".:http://www.github.com:. .:www.github.com:."; const expected = [{ link: "http://www.github.com", start: 2, - end: 16, + end: 23, }]; const actual = findLinks(input); @@ -94,7 +94,7 @@ describe("findLinks", () => { it("should handle multiple www. correctly", () => { const input = "www.www.test.com"; const expected = [{ - link: "http://www.www.test.com", + link: "https://www.www.test.com", start: 0, end: 16, }]; @@ -104,16 +104,150 @@ describe("findLinks", () => { expect(actual).to.deep.equal(expected); }); + it("should find domains without www. but valid tld", () => { + const input = "google.com google.lv google.museum"; + const expected = [{ + link: "https://google.com", + start: 0, + end: 10, + }, { + link: "https://google.lv", + start: 11, + end: 20, + }, { + link: "https://google.museum", + start: 21, + end: 34, + }]; + + const actual = findLinks(input); + + expect(actual).to.deep.equal(expected); + }); + + it("should find .onion domains", () => { + const input = "facebookcorewwwi.onion/test?url"; + const expected = [{ + link: "https://facebookcorewwwi.onion/test?url", + start: 0, + end: 31, + }]; + + const actual = findLinks(input); + + expect(actual).to.deep.equal(expected); + }); + + it("should not consider invalid TLDs as domains", () => { + const input = "google.wtfgugl google.xx www.google.wtfgugl www.google.xx"; + const expected = []; + + const actual = findLinks(input); + + expect(actual).to.deep.equal(expected); + }); + + it("should consider invalid TLDs as domains if protocol is specified", () => { + const input = "http://google.wtfgugl http://google.xx http://www.google.wtfgugl http://www.google.xx"; + const expected = [{ + link: "http://google.wtfgugl", + start: 0, + end: 21, + }, { + link: "http://google.xx", + start: 22, + end: 38, + }, { + link: "http://www.google.wtfgugl", + start: 39, + end: 64, + }, { + link: "http://www.google.xx", + start: 65, + end: 85, + }]; + + const actual = findLinks(input); + + expect(actual).to.deep.equal(expected); + }); + + it("should correctly stop at punctuation", () => { // Issue #2351 + const input = + "https://en.wikipedia.org/wiki/Dig! " + + "https://en.wikipedia.org/wiki/Dig? " + + "https://en.wikipedia.org/wiki/Dig. " + + "https://www.google.com* " + + "https://www.google.com/test* " + + "https://www.google.com@ " + + "https://www.google.com/test@ " + + "https://www.google.com! "; + const expected = [{ + link: "https://en.wikipedia.org/wiki/Dig", + start: 0, + end: 33, + }, { + link: "https://en.wikipedia.org/wiki/Dig", + start: 35, + end: 68, + }, { + link: "https://en.wikipedia.org/wiki/Dig", + start: 70, + end: 103, + }, { + link: "https://www.google.com", + start: 105, + end: 127, + }, { + link: "https://www.google.com/test*", + start: 129, + end: 157, + }, { + link: "https://www.google.com", + start: 158, + end: 180, + }, { + link: "https://www.google.com/test@", + start: 182, + end: 210, + }, { + link: "https://www.google.com", + start: 211, + end: 233, + }]; + + const actual = findLinks(input); + + expect(actual).to.deep.equal(expected); + }); + + it("should correctly stop at apostrophe", () => { + const input = "https://www.google.com's www.google.com's google.com's"; // Issue #1302 + const expected = [{ + link: "https://www.google.com", + start: 0, + end: 22, + }, { + link: "https://www.google.com", + start: 25, + end: 39, + }, { + link: "https://google.com", + start: 42, + end: 52, + }]; + + const actual = findLinks(input); + + expect(actual).to.deep.equal(expected); + }); + it("does not find invalid urls", () => { const input = "www.example.com ssh://-oProxyCommand=whois"; // Issue #1412 const expected = [{ start: 0, end: 15, - link: "http://www.example.com", - }, { - end: 42, - start: 16, - link: "ssh://-oProxyCommand=whois", + link: "https://www.example.com", }]; const actual = findLinks(input); @@ -124,7 +258,11 @@ describe("findLinks", () => { const expected2 = [{ start: 0, end: 15, - link: "http://www.example.com", + link: "https://www.example.com", + }, { + start: 16, + end: 57, + link: "http://root:'some%pass'@hostname/database", }]; const actual2 = findLinks(input2); @@ -137,7 +275,11 @@ describe("findLinks", () => { const expected = [{ start: 0, end: 15, - link: "http://www.example.com", + link: "https://www.example.com", + }, { + start: 16, + end: 29, + link: "http://a:%p@c", }, { start: 30, end: 51, @@ -148,4 +290,17 @@ describe("findLinks", () => { expect(actual).to.deep.equal(expected); }); + + it("should add protocol to protocol-aware urls", () => { + const input = "//example.com"; + const expected = [{ + link: "https://example.com", + start: 0, + end: 13, + }]; + + const actual = findLinks(input); + + expect(actual).to.deep.equal(expected); + }); }); diff --git a/test/client/js/libs/handlebars/parse.js b/test/client/js/libs/handlebars/parse.js index 98d79cd0..4b247429 100644 --- a/test/client/js/libs/handlebars/parse.js +++ b/test/client/js/libs/handlebars/parse.js @@ -7,7 +7,7 @@ describe("parse Handlebars helper", () => { it("should not introduce xss", () => { const testCases = [{ input: "", - expected: "<img onerror='location.href="//youtube.com"'>", + expected: "<img onerror='location.href="//youtube.com"'>", }, { input: '#&">bug', expected: '#&">bug', @@ -41,7 +41,7 @@ describe("parse Handlebars helper", () => { }, { input: "www.nooooooooooooooo.com", expected: - '' + + '' + "www.nooooooooooooooo.com" + "", }, { @@ -56,7 +56,7 @@ describe("parse Handlebars helper", () => { input: "use www.duckduckgo.com for privacy reasons", expected: "use " + - '' + + '' + "www.duckduckgo.com" + "" + " for privacy reasons", @@ -101,7 +101,7 @@ describe("parse Handlebars helper", () => { input: "abc (www.example.com)", expected: "abc (" + - '' + + '' + "www.example.com" + "" + ")", @@ -114,7 +114,7 @@ describe("parse Handlebars helper", () => { }, { input: "www.example.com/Test_(Page)", expected: - '' + + '' + "www.example.com/Test_(Page)" + "", }]; diff --git a/test/plugins/link.js b/test/plugins/link.js index 22a9c633..78a27bcb 100644 --- a/test/plugins/link.js +++ b/test/plugins/link.js @@ -371,4 +371,55 @@ describe("Link plugin", function() { } }); }); + + it("should fetch protocol-aware links", function(done) { + const message = this.irc.createMessage({ + text: "//localhost:9002", + }); + + link(this.irc, this.network.channels[0], message); + + this.irc.once("msg:preview", function(data) { + expect(data.preview.link).to.equal("https://localhost:9002"); + done(); + }); + }); + + it("should de-duplicate links", function(done) { + const message = this.irc.createMessage({ + text: "//localhost:9002 https://localhost:9002 https://localhost:9002", + }); + + link(this.irc, this.network.channels[0], message); + + expect(message.previews).to.deep.equal([{ + type: "loading", + head: "", + body: "", + thumb: "", + link: "https://localhost:9002", + shown: true, + }]); + + this.irc.once("msg:preview", function(data) { + expect(data.preview.link).to.equal("https://localhost:9002"); + done(); + }); + }); + + it("should not try to fetch links with wrong protocol", function() { + const message = this.irc.createMessage({ + text: "ssh://example.com ftp://example.com irc://example.com http:////////example.com", + }); + + expect(message.previews).to.be.empty; + }); + + it("should not try to fetch links with username or password", function() { + const message = this.irc.createMessage({ + text: "http://root:'some%pass'@hostname/database http://a:%p@c http://a:%p@example.com http://test@example.com", + }); + + expect(message.previews).to.be.empty; + }); }); diff --git a/yarn.lock b/yarn.lock index 47a1d4e8..ec1554fe 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4459,6 +4459,12 @@ levn@^0.3.0, levn@~0.3.0: prelude-ls "~1.1.2" type-check "~0.3.2" +linkify-it@2.0.3: + version "2.0.3" + resolved "https://registry.yarnpkg.com/linkify-it/-/linkify-it-2.0.3.tgz#d94a4648f9b1c179d64fa97291268bdb6ce9434f" + dependencies: + uc.micro "^1.0.1" + listr-silent-renderer@^1.1.1: version "1.1.1" resolved "https://registry.yarnpkg.com/listr-silent-renderer/-/listr-silent-renderer-1.1.1.tgz#924b5a3757153770bf1a8e3fbf74b8bbf3f9242e" @@ -7602,6 +7608,10 @@ timers-browserify@^2.0.4: dependencies: setimmediate "^1.0.4" +tlds@1.203.1: + version "1.203.1" + resolved "https://registry.yarnpkg.com/tlds/-/tlds-1.203.1.tgz#4dc9b02f53de3315bc98b80665e13de3edfc1dfc" + tmp@^0.0.33: version "0.0.33" resolved "https://registry.yarnpkg.com/tmp/-/tmp-0.0.33.tgz#6d34335889768d21b2bcda0aa277ced3b1bfadf9" @@ -7719,6 +7729,10 @@ ua-parser-js@0.7.18: version "0.7.18" resolved "https://registry.yarnpkg.com/ua-parser-js/-/ua-parser-js-0.7.18.tgz#a7bfd92f56edfb117083b69e31d2aa8882d4b1ed" +uc.micro@^1.0.1: + version "1.0.5" + resolved "https://registry.yarnpkg.com/uc.micro/-/uc.micro-1.0.5.tgz#0c65f15f815aa08b560a61ce8b4db7ffc3f45376" + uglify-es@^3.3.4: version "3.3.9" resolved "https://registry.yarnpkg.com/uglify-es/-/uglify-es-3.3.9.tgz#0c1c4f0700bed8dbc124cdb304d2592ca203e677"