From 58d9490c2a6ed15c96b6f4c2ba4b17c66c12dfef Mon Sep 17 00:00:00 2001
From: Pavel Djundik <github@xpaw.me>
Date: Thu, 14 Dec 2017 13:14:45 +0200
Subject: [PATCH] Try to find og:video and og:audio on html pages

---
 client/views/msg_preview.tpl   |   4 +-
 src/plugins/irc-events/link.js | 145 ++++++++++++++++++++++++---------
 2 files changed, 108 insertions(+), 41 deletions(-)
diff --git a/client/views/msg_preview.tpl b/client/views/msg_preview.tpl
index 05cf7680..1e1cb916 100644
--- a/client/views/msg_preview.tpl
+++ b/client/views/msg_preview.tpl
@@ -7,13 +7,13 @@
 	{{/equal}}
 	{{#equal type "audio"}}
 		<audio controls preload="metadata">
-			<source src="{{link}}" type="{{res}}">
+			<source src="{{media}}" type="{{mediaType}}">
 			Your browser does not support the audio element.
 		</audio>
 	{{/equal}}
 	{{#equal type "video"}}
 		<video preload="metadata" controls>
-			<source src="{{link}}" type="{{res}}">
+			<source src="{{media}}" type="{{mediaType}}">
 			Your browser does not support the video element.
 		</video>
 	{{/equal}}
diff --git a/src/plugins/irc-events/link.js b/src/plugins/irc-events/link.js
index f20a1b21..46c66adb 100644
--- a/src/plugins/irc-events/link.js
+++ b/src/plugins/irc-events/link.js
@@ -9,6 +9,9 @@ const cleanIrcMessage = require("../../../client/js/libs/handlebars/ircmessagepa
 const findLinks = require("../../../client/js/libs/handlebars/ircmessageparser/findLinks");
 const storage = require("../storage");
 
+const mediaTypeRegex = /^(audio|video)\/.+/;
+const linkRegex = /^https?:\/\//;
+
 // Fix ECDH curve client compatibility in Node v8/v9
 // This is fixed in Node 10, but The Lounge supports LTS versions
 // https://github.com/nodejs/node/issues/16196
@@ -30,7 +33,7 @@ module.exports = function(client, chan, msg) {
 	const cleanText = cleanIrcMessage(msg.text);
 
 	// We will only try to prefetch http(s) links
-	const links = findLinks(cleanText).filter((w) => /^https?:\/\//.test(w.link));
+	const links = findLinks(cleanText).filter((w) => linkRegex.test(w.link));
 
 	if (links.length === 0) {
 		return;
@@ -65,51 +68,108 @@ module.exports = function(client, chan, msg) {
 	});
 };
 
-function parse(msg, preview, res, client) {
-	switch (res.type) {
-	case "text/html": {
+function parseHtml(preview, res, client) {
+	return new Promise((resolve) => {
 		const $ = cheerio.load(res.data);
-		preview.type = "link";
-		preview.head =
-			$('meta[property="og:title"]').attr("content")
-			|| $("title").text()
-			|| "";
-		preview.body =
-			$('meta[property="og:description"]').attr("content")
-			|| $('meta[name="description"]').attr("content")
-			|| "";
-		preview.thumb =
-			$('meta[property="og:image"]').attr("content")
-			|| $('meta[name="twitter:image:src"]').attr("content")
-			|| $('link[rel="image_src"]').attr("href")
-			|| "";
 
-		if (preview.thumb.length) {
-			preview.thumb = url.resolve(preview.link, preview.thumb);
-		}
+		return parseHtmlMedia($, preview, res, client)
+			.then((newRes) => resolve(newRes))
+			.catch(() => {
+				preview.type = "link";
+				preview.head =
+					$('meta[property="og:title"]').attr("content")
+					|| $("title").text()
+					|| "";
+				preview.body =
+					$('meta[property="og:description"]').attr("content")
+					|| $('meta[name="description"]').attr("content")
+					|| "";
+				preview.thumb =
+					$('meta[property="og:image"]').attr("content")
+					|| $('meta[name="twitter:image:src"]').attr("content")
+					|| $('link[rel="image_src"]').attr("href")
+					|| "";
 
-		// Make sure thumbnail is a valid url
-		if (!/^https?:\/\//.test(preview.thumb)) {
-			preview.thumb = "";
-		}
+				if (preview.thumb.length) {
+					preview.thumb = url.resolve(preview.link, preview.thumb);
+				}
 
-		// Verify that thumbnail pic exists and is under allowed size
-		if (preview.thumb.length) {
-			fetch(escapeHeader(preview.thumb), {language: client.language}, (resThumb) => {
-				if (resThumb === null
-				|| !(/^image\/.+/.test(resThumb.type))
-				|| resThumb.size > (Helper.config.prefetchMaxImageSize * 1024)) {
+				// Make sure thumbnail is a valid url
+				if (!linkRegex.test(preview.thumb)) {
 					preview.thumb = "";
 				}
 
-				handlePreview(client, msg, preview, resThumb);
+				// Verify that thumbnail pic exists and is under allowed size
+				if (preview.thumb.length) {
+					fetch(escapeHeader(preview.thumb), {language: client.language}, (resThumb) => {
+						if (resThumb === null
+						|| !(/^image\/.+/.test(resThumb.type))
+						|| resThumb.size > (Helper.config.prefetchMaxImageSize * 1024)) {
+							preview.thumb = "";
+						}
+
+						resolve(resThumb);
+					});
+				} else {
+					resolve(res);
+				}
 			});
+	});
+}
 
-			return;
+function parseHtmlMedia($, preview, res, client) {
+	return new Promise((resolve, reject) => {
+		let foundMedia = false;
+
+		["video", "audio"].forEach((type) => {
+			if (foundMedia) {
+				return;
+			}
+
+			$(`meta[property="og:${type}:type"]`).each(function(i) {
+				const mimeType = $(this).attr("content");
+
+				if (mediaTypeRegex.test(mimeType)) {
+					// If og:video:type / og:audio:type declares an audio/video MIME type, preview the matching og:video / og:audio URL instead
+					const mediaUrl = $($(`meta[property="og:${type}"]`).get(i)).attr("content");
+
+					// Only accept media URLs that start with https://
+					if (!mediaUrl.startsWith("https://")) {
+						return;
+					}
+
+					foundMedia = true;
+
+					fetch(escapeHeader(mediaUrl), {language: client.language}, (resMedia) => {
+						if (resMedia === null || !mediaTypeRegex.test(resMedia.type)) {
+							return reject();
+						}
+
+						preview.type = type;
+						preview.media = mediaUrl;
+						preview.mediaType = resMedia.type;
+
+						resolve(resMedia);
+					});
+
+					return false;
+				}
+			});
+		});
+
+		if (!foundMedia) {
+			reject();
 		}
+	});
+}
 
+function parse(msg, preview, res, client) {
+	let promise;
+
+	switch (res.type) {
+	case "text/html":
+		promise = parseHtml(preview, res, client);
 		break;
-	}
 
 	case "image/png":
 	case "image/gif":
@@ -141,7 +201,8 @@ function parse(msg, preview, res, client) {
 		}
 
 		preview.type = "audio";
-		preview.res = res.type;
+		preview.media = preview.link;
+		preview.mediaType = res.type;
 
 		break;
 
@@ -152,8 +213,9 @@ function parse(msg, preview, res, client) {
 			break;
 		}
 
-		preview.res = res.type;
 		preview.type = "video";
+		preview.media = preview.link;
+		preview.mediaType = res.type;
 
 		break;
 
@@ -161,7 +223,11 @@ function parse(msg, preview, res, client) {
 		return;
 	}
 
-	handlePreview(client, msg, preview, res);
+	if (!promise) {
+		return handlePreview(client, msg, preview, res);
+	}
+
+	promise.then((newRes) => handlePreview(client, msg, preview, newRes));
 }
 
 function handlePreview(client, msg, preview, res) {
@@ -248,8 +314,9 @@ function fetch(uri, {language}, cb) {
 				if (contentLength > limit) {
 					req.abort();
 				}
-			} else if (/^(audio|video)\/.+/.test(res.headers["content-type"])) {
-				req.abort(); // ensure server doesn't download the audio file
+			} else if (mediaTypeRegex.test(res.headers["content-type"])) {
+				// Once the content-type header has been received, there is no need to download the rest of the file
+				req.abort();
 			} else {
 				// if not image, limit download to 50kb, since we need only meta tags
 				// twitter.com sends opengraph meta tags within ~20kb of data for individual tweets