Explain the modules of the message parser and add tests

- Add comments and descriptions to:
  - `findChannels.js`
  - `parseStyle`
  - `findLinks`
  - `fill`
  - `anyIntersection`
  - `merge`
  - `parse`
- Minor optimizations to `parseStyle`
- Add tests for `fill`
2017-04-04 00:36:03 -04:00 · 2017-04-04 00:36:03 -04:00 · 03e3444a35
commit 03e3444a35
parent 90f4a94bb2
9 changed files with 195 additions and 63 deletions
--- a/client/js/libs/handlebars/ircmessageparser/anyIntersection.js
+++ b/client/js/libs/handlebars/ircmessageparser/anyIntersection.js
@ -1,5 +1,7 @@
 "use strict";

+// Return true if any section of "a" or "b" parts (defined by their start/end
+// markers) intersect each other, false otherwise.
 function anyIntersection(a, b) {
 	return a.start <= b.start && b.start < a.end ||
 		a.start < b.end && b.end <= a.end ||
--- a/client/js/libs/handlebars/ircmessageparser/fill.js
+++ b/client/js/libs/handlebars/ircmessageparser/fill.js
@ -1,21 +1,26 @@
 "use strict";

+// Create plain text entries corresponding to areas of the text that match no
+// existing entries. Returns an empty array if all parts of the text have been
+// parsed into recognizable entries already.
 function fill(existingEntries, text) {
 	let position = 0;
-	const result = [];
-
-	for (let i = 0; i < existingEntries.length; i++) {
-		const textSegment = existingEntries[i];

+	// Fill inner parts of the text. For example, if text is `foobarbaz` and both
+	// `foo` and `baz` have matched into an entry, this will return a dummy entry
+	// corresponding to `bar`.
+	const result = existingEntries.reduce((acc, textSegment) => {
 		if (textSegment.start > position) {
-			result.push({
+			acc.push({
 				start: position,
 				end: textSegment.start
 			});
 		}
 		position = textSegment.end;
-	}
+		return acc;
+	}, []);

+	// Complete the unmatched end of the text with a dummy entry
 	if (position < text.length) {
 		result.push({
 			start: position,
--- a/client/js/libs/handlebars/ircmessageparser/findChannels.js
+++ b/client/js/libs/handlebars/ircmessageparser/findChannels.js
@ -1,13 +1,22 @@
 "use strict";

+// Escapes the RegExp special characters "^", "$", "\", ".", "*", "+", "?", "(",
+// ")", "[", "]", "{", "}", and "|" in string.
+// See https://lodash.com/docs/#escapeRegExp
 const escapeRegExp = require("lodash/escapeRegExp");

-// NOTE: channel prefixes should be RPL_ISUPPORT.CHANTYPES
-// NOTE: userModes should be RPL_ISUPPORT.PREFIX
+// Given an array of channel prefixes (such as "#" and "&") and an array of user
+// modes (such as "@" and "+"), this function extracts channels and nicks from a
+// text.
+// It returns an array of objects for each channel found with their start index,
+// end index and channel name.
 function findChannels(text, channelPrefixes, userModes) {
+	// `userModePattern` is necessary to ignore user modes in /whois responses.
+	// For example, a voiced user in #thelounge will have a /whois response of:
+	// > foo is on the following channels: +#thelounge
+	// We need to explicitly ignore user modes to parse such channels correctly.
 	const userModePattern = userModes.map(escapeRegExp).join("");
 	const channelPrefixPattern = channelPrefixes.map(escapeRegExp).join("");
-
 	const channelPattern = `(?:^|\\s)[${userModePattern}]*([${channelPrefixPattern}][^ \u0007]+)`;
 	const channelRegExp = new RegExp(channelPattern, "g");

@ -15,6 +24,8 @@ function findChannels(text, channelPrefixes, userModes) {
 	let match;

 	do {
+		// With global ("g") regexes, calling `exec` multiple times will find
+		// successive matches in the same string.
 		match = channelRegExp.exec(text);

 		if (match) {
--- a/client/js/libs/handlebars/ircmessageparser/findLinks.js
+++ b/client/js/libs/handlebars/ircmessageparser/findLinks.js
@ -2,6 +2,9 @@

 const URI = require("urijs");

+// Known schemes to detect in a text. If a text contains `foo...bar://foo.com`,
+// the parsed scheme should be `foo...bar` but if it contains
+// `foo...http://foo.com`, we assume the scheme to extract will be `http`.
 const commonSchemes = [
 	"http", "https",
 	"ftp", "sftp",
@ -16,6 +19,10 @@ function findLinks(text) {
 	let result = [];
 	let lastPosition = 0;

+	// URI.withinString() identifies URIs within text, e.g. to translate them to
+	// <a>-Tags.
+	// See https://medialize.github.io/URI.js/docs.html#static-withinString
+	// In our case, we store each URI encountered in a result array.
 	URI.withinString(text, function(url, start, end) {
 		// v-- fix: url was modified and does not match input string -> cant be mapped
 		if (text.indexOf(url, lastPosition) < 0) {
@ -23,19 +30,22 @@ function findLinks(text) {
 		}
 		// ^-- /fix: url was modified and does not match input string -> cant be mapped

-		// v-- fix: use prefered scheme
-		const parsed = URI(url);
-		const parsedScheme = parsed.scheme().toLowerCase();
+		// Extract the scheme of the URL detected, if there is one
+		const parsedScheme = URI(url).scheme().toLowerCase();
+
+		// Check if the scheme of the detected URL matches a common one above.
+		// In a URL like `foo..http://example.com`, the scheme would be `foo..http`,
+		// so we need to clean up the end of the scheme and filter out the rest.
 		const matchedScheme = commonSchemes.find(scheme => parsedScheme.endsWith(scheme));

+		// A known scheme was found, strip the unknown prefix from the URL
 		if (matchedScheme) {
 			const prefix = parsedScheme.length - matchedScheme.length;
 			start += prefix;
 			url = url.slice(prefix);
 		}
-		// ^-- /fix: use prefered scheme

-		// URL matched, but does not start with a protocol, add it
+		// The URL matched but does not start with a scheme (`www.foo.com`), add it
 		if (!parsedScheme.length) {
 			url = "http://" + url;
 		}
--- a/client/js/libs/handlebars/ircmessageparser/merge.js
+++ b/client/js/libs/handlebars/ircmessageparser/merge.js
@ -16,6 +16,7 @@ if (typeof Object_assign !== "function") {
 	};
 }

+// Merge text part information within a styling fragment
 function assign(textPart, fragment) {
 	const fragStart = fragment.start;
 	const start = Math.max(fragment.start, textPart.start);
@ -28,13 +29,25 @@ function assign(textPart, fragment) {
 	});
 }

+// Merge the style fragments within the text parts, taking into account
+// boundaries and text sections that have not matched to links or channels.
+// For example, given a string "foobar" where "foo" and "bar" have been
+// identified as parts (channels, links, etc.) and "fo", "ob" and "ar" have 3
+// different styles, the first resulting part will contain fragments "fo" and
+// "o", and the second resulting part will contain "b" and "ar". "o" and "b"
+// fragments will contain duplicate styling attributes.
 function merge(textParts, styleFragments) {
-	const cleanText = styleFragments.map(fragment => fragment.text).join("");
+	// Re-build the overall text (without control codes) from the style fragments
+	const cleanText = styleFragments.reduce((acc, frag) => acc + frag.text, "");

+	// Every section of the original text that has not been captured in a "part"
+	// is filled with "text" parts, dummy objects with start/end but no extra
+	// metadata.
 	const allParts = textParts
 		.concat(fill(textParts, cleanText))
 		.sort((a, b) => a.start - b.start);

+	// Distribute the style fragments within the text parts
 	return allParts.map(textPart => {
 		textPart.fragments = styleFragments
 			.filter(fragment => anyIntersection(textPart, fragment))
--- a/client/js/libs/handlebars/ircmessageparser/parseStyle.js
+++ b/client/js/libs/handlebars/ircmessageparser/parseStyle.js
@ -1,5 +1,6 @@
 "use strict";

+// Styling control codes
 const BOLD = "\x02";
 const COLOR = "\x03";
 const RESET = "\x0f";
@ -7,14 +8,24 @@ const REVERSE = "\x16";
 const ITALIC = "\x1d";
 const UNDERLINE = "\x1f";

+// Color code matcher, with format `XX,YY` where both `XX` and `YY` are
+// integers, `XX` is the text color and `YY` is an optional background color.
 const colorRx = /^(\d{1,2})(?:,(\d{1,2}))?/;
+
+// Represents all other control codes that are to be ignored/filtered from the text
 const controlCodesRx = /[\u0000-\u001F]/g;

+// Converts a given text into an array of objects, each of them representing a
+// similarly styled section of the text. Each object carries the `text`, style
+// information (`bold`, `textColor`, `bgColor`, `reverse`, `italic`,
+// `underline`), and `start`/`end` cursors.
 function parseStyle(text) {
 	const result = [];
 	let start = 0;
 	let position = 0;

+	// At any given time, these carry style information since last time a styling
+	// control code was met.
 	let colorCodes, bold, textColor, bgColor, reverse, italic, underline;

 	const resetStyle = () => {
@ -27,15 +38,20 @@ function parseStyle(text) {
 	};
 	resetStyle();

+	// When called, this "closes" the current fragment by adding an entry to the
+	// `result` array using the styling information set last time a control code
+	// was met.
 	const emitFragment = () => {
+		// Uses the text fragment starting from the last control code position up to
+		// the current position
 		const textPart = text.slice(start, position);
-		start = position + 1;

+		// Filters out all non-style related control codes present in this text
 		const processedText = textPart.replace(controlCodesRx, "");

-		if (!processedText.length) {
-			return;
-		}
+		if (processedText.length) {
+			// Current fragment starts where the previous one ends, or at 0 if none
+			const fragmentStart = result.length ? result[result.length - 1].end : 0;

 			result.push({
 				bold,
@ -44,10 +60,20 @@ function parseStyle(text) {
 				reverse,
 				italic,
 				underline,
-			text: processedText
+				text: processedText,
+				start: fragmentStart,
+				end: fragmentStart + processedText.length
 			});
+		}
+
+		// Now that a fragment has been "closed", the next one will start after that
+		start = position + 1;
 	};

+	// This loop goes through each character of the given text one by one by
+	// bumping the `position` cursor. Every time a new special "styling" character
+	// is met, an object gets created (with `emitFragment()`) carrying information
+	// on the text encountered since the previous styling character.
 	while (position < text.length) {
 		switch (text[position]) {

@ -56,6 +82,10 @@ function parseStyle(text) {
 			resetStyle();
 			break;

+		// Meeting a BOLD character means that the ongoing text is either going to
+		// be in bold or that the previous one was in bold and the following one
+		// must be reset.
+		// This same behavior applies to COLOR, REVERSE, ITALIC, and UNDERLINE.
 		case BOLD:
 			emitFragment();
 			bold = !bold;
@ -64,20 +94,23 @@ function parseStyle(text) {
 		case COLOR:
 			emitFragment();

+			// Go one step further to find the corresponding color
 			colorCodes = text.slice(position + 1).match(colorRx);

 			if (colorCodes) {
 				textColor = Number(colorCodes[1]);
+				if (colorCodes[2]) {
 					bgColor = Number(colorCodes[2]);
-				if (Number.isNaN(bgColor)) {
-					bgColor = undefined;
 				}
+				// Color code length is > 1, so bump the current position cursor by as
+				// much (and reset the start cursor for the current text block as well)
 				position += colorCodes[0].length;
+				start = position + 1;
 			} else {
+				// If no color codes were found, toggles back to no colors (like BOLD).
 				textColor = undefined;
 				bgColor = undefined;
 			}
-			start = position + 1;
 			break;

 		case REVERSE:
@ -95,9 +128,12 @@ function parseStyle(text) {
 			underline = !underline;
 			break;
 		}
+
+		// Evaluate the next character at the next iteration
 		position += 1;
 	}

+	// The entire text has been parsed, so we finalize the current text fragment.
 	emitFragment();

 	return result;
@ -107,25 +143,19 @@ const properties = ["bold", "textColor", "bgColor", "italic", "underline", "reve

 function prepare(text) {
 	return parseStyle(text)
-		.filter(fragment => fragment.text.length)
-		.reduce((prev, curr, i) => {
-			if (i === 0) {
-				return prev.concat([curr]);
-			}
-
+		// This optimizes fragments by combining them together when all their values
+		// for the properties defined above are equal.
+		.reduce((prev, curr) => {
+			if (prev.length) {
 				const lastEntry = prev[prev.length - 1];
-			if (properties.some(key => curr[key] !== lastEntry[key])) {
-				return prev.concat([curr]);
-			}
-
+				if (properties.every(key => curr[key] === lastEntry[key])) {
 					lastEntry.text += curr.text;
+					lastEntry.end += curr.text.length;
 					return prev;
-		}, [])
-		.map((fragment, i, array) => {
-			fragment.start = i === 0 ? 0 : array[i - 1].end;
-			fragment.end = fragment.start + fragment.text.length;
-			return fragment;
-		});
+				}
+			}
+			return prev.concat([curr]);
+		}, []);
 }

 module.exports = prepare;
--- a/client/js/libs/handlebars/parse.js
+++ b/client/js/libs/handlebars/parse.js
@ -6,6 +6,7 @@ const findChannels = require("./ircmessageparser/findChannels");
 const findLinks = require("./ircmessageparser/findLinks");
 const merge = require("./ircmessageparser/merge");

+// Create an HTML `span` with styling information for a given fragment
 function createFragment(fragment) {
 	let classes = [];
 	if (fragment.bold) {
@ -30,23 +31,33 @@ function createFragment(fragment) {
 	return escapedText;
 }

+// Transform an IRC message potentially filled with styling control codes, URLs
+// and channels into a string of HTML elements to display on the client.
 module.exports = function parse(text) {
+	// Extract the styling information and get the plain text version from it
 	const styleFragments = parseStyle(text);
 	const cleanText = styleFragments.map(fragment => fragment.text).join("");

-	const channelPrefixes = ["#", "&"]; // RPL_ISUPPORT.CHANTYPES
-	const userModes = ["!", "@", "%", "+"]; // RPL_ISUPPORT.PREFIX
+	// On the plain text, find channels and URLs, returned as "parts". Parts are
+	// arrays of objects containing start and end markers, as well as metadata
+	// depending on what was found (channel or link).
+	const channelPrefixes = ["#", "&"]; // TODO Channel prefixes should be RPL_ISUPPORT.CHANTYPES
+	const userModes = ["!", "@", "%", "+"]; // TODO User modes should be RPL_ISUPPORT.PREFIX
 	const channelParts = findChannels(cleanText, channelPrefixes, userModes);
-
 	const linkParts = findLinks(cleanText);

+	// Sort all parts identified based on their position in the original text
 	const parts = channelParts
 		.concat(linkParts)
 		.sort((a, b) => a.start - b.start);

+	// Merge the styling information with the channels / URLs / text objects and
+	// generate HTML strings with the resulting fragments
 	return merge(parts, styleFragments).map(textPart => {
+		// Create HTML strings with styling information
 		const fragments = textPart.fragments.map(createFragment).join("");

+		// Wrap these potentially styled fragments with links and channel buttons
 		if (textPart.link) {
 			const escapedLink = Handlebars.Utils.escapeExpression(textPart.link);
 			return `<a href="${escapedLink}" target="_blank" rel="noopener">${fragments}</a>`;
--- a/test/client/js/libs/handlebars/ircmessageparser/fill.js
+++ b/test/client/js/libs/handlebars/ircmessageparser/fill.js
@ -0,0 +1,50 @@
+"use strict";
+
+const expect = require("chai").expect;
+const fill = require("../../../../../../client/js/libs/handlebars/ircmessageparser/fill");
+
+describe("fill", () => {
+	const text = "01234567890123456789";
+
+	it("should return an entry for the unmatched end of string", () => {
+		const existingEntries = [
+			{start: 0, end: 10},
+			{start: 5, end: 15},
+		];
+
+		const expected = [
+			{start: 15, end: 20},
+		];
+
+		const actual = fill(existingEntries, text);
+
+		expect(actual).to.deep.equal(expected);
+	});
+
+	it("should return an entry per unmatched areas of the text", () => {
+		const existingEntries = [
+			{start: 0, end: 5},
+			{start: 10, end: 15},
+		];
+
+		const expected = [
+			{start: 5, end: 10},
+			{start: 15, end: 20},
+		];
+
+		const actual = fill(existingEntries, text);
+
+		expect(actual).to.deep.equal(expected);
+	});
+
+	it("should not return anything when entries match all text", () => {
+		const existingEntries = [
+			{start: 0, end: 10},
+			{start: 10, end: 20},
+		];
+
+		const actual = fill(existingEntries, text);
+
+		expect(actual).to.be.empty;
+	});
+});
--- a/test/client/js/libs/handlebars/ircmessageparser/findChannels.js
+++ b/test/client/js/libs/handlebars/ircmessageparser/findChannels.js
@ -1,7 +1,7 @@
 "use strict";

 const expect = require("chai").expect;
-const analyseText = require("../../../../../../client/js/libs/handlebars/ircmessageparser/findChannels");
+const findChannels = require("../../../../../../client/js/libs/handlebars/ircmessageparser/findChannels");

 describe("findChannels", () => {
 	it("should find single letter channel", () => {
@ -12,7 +12,7 @@ describe("findChannels", () => {
 			end: 2
 		}];

-		const actual = analyseText(input, ["#"], ["@", "+"]);
+		const actual = findChannels(input, ["#"], ["@", "+"]);

 		expect(actual).to.deep.equal(expected);
 	});
@ -25,7 +25,7 @@ describe("findChannels", () => {
 			end: 4
 		}];

-		const actual = analyseText(input, ["#"], ["@", "+"]);
+		const actual = findChannels(input, ["#"], ["@", "+"]);

 		expect(actual).to.deep.equal(expected);
 	});
@ -38,7 +38,7 @@ describe("findChannels", () => {
 			end: 15
 		}];

-		const actual = analyseText(input, ["#"], ["@", "+"]);
+		const actual = findChannels(input, ["#"], ["@", "+"]);

 		expect(actual).to.deep.equal(expected);
 	});
@ -51,7 +51,7 @@ describe("findChannels", () => {
 			end: 5
 		}];

-		const actual = analyseText(input, ["#"], ["@", "+"]);
+		const actual = findChannels(input, ["#"], ["@", "+"]);

 		expect(actual).to.deep.equal(expected);
 	});
@ -64,7 +64,7 @@ describe("findChannels", () => {
 			end: 6
 		}];

-		const actual = analyseText(input, ["#"], ["@", "+"]);
+		const actual = findChannels(input, ["#"], ["@", "+"]);

 		expect(actual).to.deep.equal(expected);
 	});
@ -77,7 +77,7 @@ describe("findChannels", () => {
 			end: 3
 		}];

-		const actual = analyseText(input, ["#"], ["@", "+"]);
+		const actual = findChannels(input, ["#"], ["@", "+"]);

 		expect(actual).to.deep.equal(expected);
 	});
@ -90,7 +90,7 @@ describe("findChannels", () => {
 			end: 6
 		}];

-		const actual = analyseText(input, ["#"], ["!", "@", "%", "+"]);
+		const actual = findChannels(input, ["#"], ["!", "@", "%", "+"]);

 		expect(actual).to.deep.equal(expected);
 	});
@ -103,7 +103,7 @@ describe("findChannels", () => {
 			end: 2
 		}];

-		const actual = analyseText(input, ["@"], ["#", "+"]);
+		const actual = findChannels(input, ["@"], ["#", "+"]);

 		expect(actual).to.deep.equal(expected);
 	});
@ -116,7 +116,7 @@ describe("findChannels", () => {
 			end: 6
 		}];

-		const actual = analyseText(input, ["#"], ["@", "+"]);
+		const actual = findChannels(input, ["#"], ["@", "+"]);

 		expect(actual).to.deep.equal(expected);
 	});