unrealircd/src/utf8.c

#include "unrealircd.h"

/**************** UTF8 HELPER FUNCTIONS START HERE *****************/

/* Operations on UTF-8 strings.
 * This part is taken from "glib" with the following copyright:
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 * Taken from the master snapshot on Oct 23, 2018, glib/gutf8.c.
 * The library uses LGPL 2.1. From what I understand this allows me to
 * use this code in a GPLv2-compatible way which fits the rest of
 * the UnrealIRCd project.
 *
 * Code stripped and converted heavily to fit in UnrealIRCd by
 * Bram Matthys ("Syzop") in 2019. Thanks to i <info@servx.org>
 * for all the directions and help with regards to UTF8 handling.
 *
 * Note that with UnrealIRCd, a char is always unsigned char,
 * which allows us to cut some corners and make more readable
 * code without 100 casts.
 */

/* Check the byte at *p against a bit pattern: after masking with 'mask'
 * it must equal 'expect'. On mismatch, jump to the 'error' label of the
 * enclosing function. Requires a local 'p' that points at unsigned bytes.
 */
#define VALIDATE_BYTE(mask, expect) \
  do {                              \
    if ((*p & (mask)) != (expect))  \
      goto error;                   \
  } while(0)

/* see IETF RFC 3629 Section 4 */

/** Scan a NUL-terminated string for the first invalid UTF8 sequence.
 * @param str   The string to scan.
 * @returns Pointer to the first byte of the first invalid sequence,
 *          or to the terminating NUL byte if the entire string is valid.
 * @note  The bytes are inspected through an unsigned char pointer, so the
 *        result does not depend on whether plain 'char' is signed or
 *        unsigned on the platform / with the compiler flags in use.
 * @note  Every continuation byte is validated before the pointer moves
 *        past it, so a sequence truncated by the NUL terminator is
 *        rejected without reading beyond the terminator.
 */
static const char *fast_validate(const char *str)
{
	const unsigned char *p;

	for (p = (const unsigned char *)str; *p; p++)
	{
		if (*p >= 128)
		{
			/* Start of the multi-byte sequence, returned on error */
			const unsigned char *last;

			last = p;
			if (*p < 0xe0) /* 110xxxxx */
			{
				/* 0x80..0xbf is a stray continuation byte and
				 * 0xc0/0xc1 can only encode overlong forms.
				 * The single continuation byte of a valid 2-byte
				 * sequence is checked by the shared code below.
				 */
				if (*p < 0xc2)
				{
					goto error;
				}
			}
			else
			{
				if (*p < 0xf0) /* 1110xxxx */
				{
					switch (*p++ & 0x0f)
					{
						case 0:
							/* 0xe0: reject overlong encodings */
							VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
							break;
						case 0x0d:
							/* 0xed: reject UTF-16 surrogates */
							VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
							break;
						default:
							VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
					}
				}
				else if (*p < 0xf5) /* 11110xxx excluding out-of-range */
				{
					switch (*p++ & 0x07)
					{
						case 0:
							/* 0xf0: second byte must be 0x90 ... 0xbf,
							 * anything lower is an overlong encoding.
							 */
							VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
							if ((*p & 0x30) == 0)
								goto error;
							break;
						case 4:
							/* 0xf4: stay at or below U+10FFFF */
							VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
							break;
						default:
							VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
					}
					p++;
					VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
				}
				else
				{
					/* 0xf5 ... 0xff never appear in UTF8 */
					goto error;
				}
			}

			/* Final continuation byte, shared by all sequence lengths */
			p++;
			VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */

			continue;

error:
			return (const char *)last;
		}
	}

	return (const char *)p;
}

/** Validate a NUL-terminated string as UTF8.
 * @param str   The string to check.
 * @param end   Optional output (may be NULL). When given, *end receives the
 *              position where validation stopped: the start of the first
 *              invalid UTF8 sequence, or the terminating NUL byte if no
 *              invalid sequence was found.
 * @returns 1 if the entire string is valid UTF8, 0 otherwise.
 */
int unrl_utf8_validate(const char *str, const char **end)
{
	const char *stop = fast_validate(str);

	if (end != NULL)
		*end = stop;

	/* Validation only reaches the NUL terminator when nothing was rejected */
	return (*stop == '\0') ? 1 : 0;
}

/** Walk backwards to the start of the previous UTF8 character.
 * Starting at the byte just before 'p', continuation bytes (10xxxxxx)
 * are skipped until a byte is found that is not a continuation byte.
 * @param begin   Start of the string; the search never goes before this.
 * @param p       Position to backtrack from (this byte itself is not examined).
 * @returns Pointer to the nearest byte before 'p' that is not a UTF8
 *          continuation byte, or NULL if there is no such byte between
 *          'begin' and 'p' (this includes the case p <= begin).
 */
char *unrl_utf8_find_prev_char (const char *begin, const char *p)
{
	/* Test before decrementing so that we never form a pointer to
	 * before 'begin', which would be undefined behaviour in C.
	 */
	while (p > begin)
	{
		--p;
		if ((*p & 0xc0) != 0x80)
			return (char *)p;
	}
	return NULL;
}

/** Return a valid UTF8 string based on the input.
 * Every byte that starts an invalid UTF8 sequence is replaced by
 * U+FFFD (REPLACEMENT CHARACTER, the bytes EF BF BD), after which
 * validation resumes at the next byte.
 * @param str		The input string
 * @param outputbuf	The output buffer
 * @param outputbuflen	Length of the output buffer
 * @param strictlen	If set to 1 we never return more than
 *                      outputbuflen-1 characters.
 *                      If set to 0, we may do that, if the
 *                      input string was already 100% valid UTF8.
 * @retval Returns a valid UTF8 string, either the input buffer
 *         (if it was already valid UTF8) or the output buffer.
 *         NULL is returned if 'str' or 'outputbuf' is NULL, or if
 *         outputbuflen is zero.
 * @notes The 'outputbuf' does not receive a copy of the string if the
 *        input is already valid UTF8 and 'strictlen' is 0 (or the input
 *        is empty). So don't rely on it being always set, use the
 *        returned string.
 */
char *unrl_utf8_make_valid(const char *str, char *outputbuf, size_t outputbuflen, int strictlen)
{
	const char *remainder, *invalid;
	size_t valid_bytes;
	int replaced = 0; /**< Result lives in outputbuf (input was invalid, or strictlen forced a copy) */

	/* outputbuf is written to unconditionally below, so it must be valid */
	if (!str || !outputbuf || !outputbuflen)
		return NULL;

	*outputbuf = '\0';
	remainder = str;

	while (*remainder != '\0')
	{
		if (unrl_utf8_validate(remainder, &invalid))
		{
			if (replaced)
			{
				/* We already replaced earlier, now just put the rest at the end. */
				strlcat(outputbuf, remainder, outputbuflen);
			}
			else if (strictlen)
			{
				/* Caller wants us to go through the 'replaced' branch */
				strlcpy(outputbuf, str, outputbuflen);
				replaced = 1;
			}
			break;
		}

		/* 'invalid' points at the offending byte: copy the valid part
		 * in front of it, then substitute U+FFFD for that one byte.
		 */
		replaced = 1;
		valid_bytes = (size_t)(invalid - remainder);

		strlncat(outputbuf, remainder, outputbuflen, valid_bytes); /*g_string_append_len(string, remainder, valid_bytes);*/
		strlcat(outputbuf, "\357\277\275", outputbuflen);

		/* The offending byte is never the NUL terminator, so stepping
		 * over it stays within the string.
		 */
		remainder = invalid + 1;
	}

	if (!replaced)
		return (char *)str; /* return original string (no changes needed) */

	/* If we took up all the space, then backtrack one character and cut
	 * things off from there. This to ensure that we don't end up with
	 * invalid UTF8 due to cutting half-way a UTF8 byte sequence.
	 * NOTE: This may cause us to remove 1 character needlessly at the
	 *       end even though there was still (some) space. So be it.
	 */
	if (strlen(outputbuf) == outputbuflen-1)
	{
		char *cut_at = unrl_utf8_find_prev_char(outputbuf, outputbuf+outputbuflen-1);
		if (cut_at)
			*cut_at = '\0';
	}

#ifdef DEBUGMODE
	if (!unrl_utf8_validate(outputbuf, NULL))
		abort(); /* this should never happen, it means our conversion resulted in an invalid UTF8 string */
#endif

	return outputbuf;
}

/**************** END OF UTF8 HELPER FUNCTIONS *****************/

/** This is just for internal testing.
 * Reads lines from stdin, runs each one through unrl_utf8_make_valid()
 * (with strictlen set to 1 and a 500 byte output buffer) and prints the
 * result. Lines that come back unchanged are prefixed with four spaces,
 * lines that were altered (bytes replaced and/or truncated) with "[!] ".
 */
void utf8_test(void)
{
	char buf[1024];
	char *res;
	char *heapbuf; /* for strict OOB testing with ASan */
	char *workbuf = safe_alloc(500);
	size_t workbuflen = 500;

	while ((fgets(buf, sizeof(buf), stdin)))
	{
		stripcrlf(buf);
		heapbuf = strdup(buf);
		if (!heapbuf)
			break; /* out of memory, nothing sensible left to do */
		res = unrl_utf8_make_valid(heapbuf, workbuf, workbuflen, 1);
		if (!res)
		{
			/* Never pass NULL to printf("%s") */
			free(heapbuf);
			continue;
		}
		/* With strictlen set, valid input is copied to workbuf as well,
		 * so a pointer comparison alone cannot tell whether the line
		 * was modified: compare the contents instead.
		 */
		if ((res == heapbuf) || !strcmp(res, heapbuf))
		{
			printf("    %s\n", res);
		} else {
			printf("[!] %s\n", res);
		}
		free(heapbuf);
	}
	safe_free(workbuf);
}
Initial commit 2020-03-29 09:16:53 +00:00			`#include "unrealircd.h"`

			`/************** UTF8 HELPER FUNCTIONS START HERE ***************/`

			`/* Operations on UTF-8 strings.`
			`* This part is taken from "glib" with the following copyright:`
			`* Copyright (C) 1999 Tom Tromey`
			`* Copyright (C) 2000 Red Hat, Inc.`
			`* Taken from the master snapshot on Oct 23, 2018, glib/gutf8.c.`
			`* The library uses LGPL 2.1. From what I understand this allows me to`
			`* use this code in a GPLv2-compatible way which fits the rest of`
			`* the UnrealIRCd project.`
			`*`
			`* Code stripped and converted heavily to fit in UnrealIRCd by`
			`* Bram Matthys ("Syzop") in 2019. Thanks to i <info@servx.org>`
			`* for all the directions and help with regards to UTF8 handling.`
			`*`
			`* Note that with UnrealIRCd, a char is always unsigned char,`
			`* which allows us to cut some corners and make more readable`
			`* code without 100 casts.`
			`*/`

			`#define VALIDATE_BYTE(mask, expect) \`
			`do { \`
			`if ((*p & (mask)) != (expect)) \`
			`goto error; \`
			`} while(0)`

			`/* see IETF RFC 3629 Section 4 */`

			`static const char fast_validate(const char str)`
			`{`
			`const char *p;`

			`for (p = str; *p; p++)`
			`{`
			`if (*p >= 128)`
			`{`
			`const char *last;`

			`last = p;`
			`if (p < 0xe0) / 110xxxxx */`
			`{`
			`// ehm.. did you forget a ++p ? ;) or whatever`
			`if (*p < 0xc2)`
			`{`
			`goto error;`
			`}`
			`}`
			`else`
			`{`
			`if (p < 0xf0) / 1110xxxx */`
			`{`
			`switch (*p++ & 0x0f)`
			`{`
			`case 0:`
			`VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */`
			`break;`
			`case 0x0d:`
			`VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */`
			`break;`
			`default:`
			`VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */`
			`}`
			`}`
			`else if (p < 0xf5) / 11110xxx excluding out-of-range */`
			`{`
			`switch (*p++ & 0x07)`
			`{`
			`case 0:`
			`VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */`
			`if ((*p & 0x30) == 0)`
			`goto error;`
			`break;`
			`case 4:`
			`VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */`
			`break;`
			`default:`
			`VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */`
			`}`
			`p++;`
			`VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */`
			`}`
			`else`
			`{`
			`goto error;`
			`}`
			`}`

			`p++;`
			`VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */`

			`continue;`

			`error:`
			`return last;`
			`}`
			`}`

			`return p;`
			`}`

			`/** Check if a string is valid UTF8.`
			`* @param str The string to validate`
			`* @param end Pointer to char *, as explained in notes below.`
			`* @returns 1 if the string is valid UTF8, 0 if not.`
			`* @note The variable *end will be set to the first invalid UTF8 sequence.`
			`* If no invalid UTF8 sequence is encountered then it points to the NUL byte.`
			`*/`
			`int unrl_utf8_validate(const char str, const char *end)`
			`{`
			`const char *p;`

			`p = fast_validate(str);`

			`if (end)`
			`*end = p;`

			`if (*p != '\0')`
			`return 0;`
			`else`
			`return 1;`
			`}`

			`/** Go backwards in a string until we are at the end of an UTF8 sequence.`
			`* Or more accurately: skip sequences that are part of an UTF8 sequence.`
			`* @param begin The string to check`
			`* @param p Where to start backtracking`
			`* @returns Byte that is not in the middle of an UTF8 sequence,`
			`* or NULL if we reached the beginning and that isn't valid either.`
			`*/`
			`char unrl_utf8_find_prev_char (const char begin, const char *p)`
			`{`
			`for (--p; p >= begin; --p)`
			`{`
			`if ((*p & 0xc0) != 0x80)`
			`return (char *)p;`
			`}`
			`return NULL;`
			`}`

			`/** Return a valid UTF8 string based on the input.`
Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`* @param str The input string`
			`* @param outputbuf The output buffer`
			`* @param outputbuflen Length of the output buffer`
			`* @param strictlen If set to 1 we never return more than`
			`* outputbuflen-1 characters.`
			`* If set to 0, we may do that, if the`
			`* input string was already 100% valid UTF8.`
			`* @retval Returns a valid UTF8 string, either the input buffer`
			`* (if it was already valid UTF8) or the output buffer.`
			`* NULL is returned if either 'str' was NULL or outputlen is zero.`
			`* @notes The 'outputbuf' is unused if the string is already valid UTF8.`
			`* So don't rely on it being always set, use the returned string.`
Initial commit 2020-03-29 09:16:53 +00:00			`*/`
Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`char unrl_utf8_make_valid(const char str, char *outputbuf, size_t outputbuflen, int strictlen)`
Initial commit 2020-03-29 09:16:53 +00:00			`{`
			`const char remainder, invalid;`
			`int remaining_bytes, valid_bytes, len;`
			`int replaced = 0; /*< UTF8 string needed replacement (was invalid) /`

Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`if (!str \|\| !outputbuflen)`
Initial commit 2020-03-29 09:16:53 +00:00			`return NULL;`

			`len = strlen(str);`

Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`*outputbuf = '\0';`
Initial commit 2020-03-29 09:16:53 +00:00			`remainder = str;`
			`remaining_bytes = len;`

			`while (remaining_bytes != 0)`
			`{`
			`if (unrl_utf8_validate(remainder, &invalid))`
Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`{`
Updated to 6.1.0 2023-05-05 22:12:01 +00:00			`if (!replaced)`
Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`{`
Updated to 6.1.0 2023-05-05 22:12:01 +00:00			`if (strictlen)`
			`{`
			`/* Caller wants us to go through the 'replaced' branch */`
			`strlcpy(outputbuf, str, outputbuflen);`
			`replaced = 1;`
			`}`
			`break;`
			`} else {`
			`/* We already replaced earlier, now just put the rest at the end. */`
			`strlcat(outputbuf, remainder, outputbuflen);`
			`break;`
Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`}`
			`}`
Initial commit 2020-03-29 09:16:53 +00:00			`replaced = 1;`
			`valid_bytes = invalid - remainder;`

Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`strlncat(outputbuf, remainder, outputbuflen, valid_bytes); /g_string_append_len(string, remainder, valid_bytes);/`
			`strlcat(outputbuf, "\357\277\275", outputbuflen);`
Initial commit 2020-03-29 09:16:53 +00:00
			`remaining_bytes -= valid_bytes + 1;`
			`remainder = invalid + 1;`
			`}`

			`if (!replaced)`
			`return (char )str; / return original string (no changes needed) */`

Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`/* If we took up all the space, then backtrack one character and cut`
			`* things off from there. This to ensure that we don't end up with`
			`* invalid UTF8 due to cutting half-way a UTF8 byte sequence.`
			`* NOTE: This may cause us to remove 1 character needlessly at the`
			`* end even though there was still (some) space. So be it.`
Initial commit 2020-03-29 09:16:53 +00:00			`*/`
Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`if (strlen(outputbuf) == outputbuflen-1)`
Initial commit 2020-03-29 09:16:53 +00:00			`{`
Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`char *cut_at = unrl_utf8_find_prev_char(outputbuf, outputbuf+outputbuflen-1);`
Initial commit 2020-03-29 09:16:53 +00:00			`if (cut_at)`
			`*cut_at = '\0';`
			`}`

Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`#ifdef DEBUGMODE`
			`if (!unrl_utf8_validate(outputbuf, NULL))`
Initial commit 2020-03-29 09:16:53 +00:00			`abort(); /* this should never happen, it means our conversion resulted in an invalid UTF8 string */`
Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`#endif`
Initial commit 2020-03-29 09:16:53 +00:00
Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`return outputbuf;`
Initial commit 2020-03-29 09:16:53 +00:00			`}`

			`/************** END OF UTF8 HELPER FUNCTIONS ***************/`

			`/** This is just for internal testing */`
			`void utf8_test(void)`
			`{`
			`char buf[1024];`
			`char *res;`
			`int cnt = 0;`
			`char heapbuf; / for strict OOB testing with ASan */`
Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`char *workbuf = safe_alloc(500);`
			`size_t workbuflen = 500;`
Initial commit 2020-03-29 09:16:53 +00:00
			`while ((fgets(buf, sizeof(buf), stdin)))`
			`{`
			`stripcrlf(buf);`
			`heapbuf = strdup(buf);`
Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`res = unrl_utf8_make_valid(heapbuf, workbuf, workbuflen, 1);`
Initial commit 2020-03-29 09:16:53 +00:00			`if (heapbuf == res)`
			`{`
			`printf(" %s\n", res);`
			`} else {`
			`printf("[!] %s\n", res);`
			`}`
			`free(heapbuf);`
			`}`
Updated to 6.0.1.1 2022-01-15 05:16:34 +00:00			`safe_free(workbuf);`
Initial commit 2020-03-29 09:16:53 +00:00			`}`