From f15900dd75ff9065727ec73ec5896631951cc9ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lucio=20Andr=C3=A9s=20Illanes=20Albornoz?= Date: Thu, 25 Oct 2018 02:50:37 +0200 Subject: [PATCH] js/ext/unicode.js: cleanup. --- js/ext/unicode.js | 488 +++++++--------------------------------------- 1 file changed, 74 insertions(+), 414 deletions(-) diff --git a/js/ext/unicode.js b/js/ext/unicode.js index 2199750..0d4af8c 100644 --- a/js/ext/unicode.js +++ b/js/ext/unicode.js @@ -2,256 +2,8 @@ var unicode = (function(){ var UNICODE_BLOCK_LIST = [ 0x0020, 0x007F, "Basic Latin", 0x0080, 0x00FF, "Latin-1 Supplement", - 0x0100, 0x017F, "Latin Extended-A", - 0x0180, 0x024F, "Latin Extended-B", - 0x0250, 0x02AF, "IPA Extensions", - 0x02B0, 0x02FF, "Spacing Modifier Letters", - 0x0300, 0x036F, "Combining Diacritical Marks", - 0x0370, 0x03FF, "Greek and Coptic", - 0x0400, 0x04FF, "Cyrillic", - 0x0500, 0x052F, "Cyrillic Supplement", - 0x0530, 0x058F, "Armenian", - 0x0590, 0x05FF, "Hebrew", - 0x0600, 0x06FF, "Arabic", - 0x0700, 0x074F, "Syriac", - 0x0750, 0x077F, "Arabic Supplement", - 0x0780, 0x07BF, "Thaana", - 0x07C0, 0x07FF, "NKo", - 0x0800, 0x083F, "Samaritan", - 0x0840, 0x085F, "Mandaic", - 0x08A0, 0x08FF, "Arabic Extended-A", - 0x0900, 0x097F, "Devanagari", - 0x0980, 0x09FF, "Bengali", - 0x0A00, 0x0A7F, "Gurmukhi", - 0x0A80, 0x0AFF, "Gujarati", - 0x0B00, 0x0B7F, "Oriya", - 0x0B80, 0x0BFF, "Tamil", - 0x0C00, 0x0C7F, "Telugu", - 0x0C80, 0x0CFF, "Kannada", - 0x0D00, 0x0D7F, "Malayalam", - 0x0D80, 0x0DFF, "Sinhala", - 0x0E00, 0x0E7F, "Thai", - 0x0E80, 0x0EFF, "Lao", - 0x0F00, 0x0FFF, "Tibetan", - 0x1000, 0x109F, "Myanmar", - 0x10A0, 0x10FF, "Georgian", - 0x1100, 0x11FF, "Hangul Jamo", - 0x1200, 0x137F, "Ethiopic", - 0x1380, 0x139F, "Ethiopic Supplement", - 0x13A0, 0x13FF, "Cherokee", - 0x1400, 0x167F, "Unified Canadian Aboriginal Syllabics", - 0x1680, 0x169F, "Ogham", - 0x16A0, 0x16FF, "Runic", - 0x1700, 0x171F, "Tagalog", - 0x1720, 0x173F, "Hanunoo", - 0x1740, 0x175F, "Buhid", - 0x1760, 0x177F, "Tagbanwa", - 0x1780, 0x17FF, "Khmer", - 0x1800, 0x18AF, "Mongolian", - 0x18B0, 0x18FF, "Unified Canadian Aboriginal Syllabics Extended", - 0x1900, 0x194F, "Limbu", - 0x1950, 0x197F, "Tai Le", - 0x1980, 0x19DF, "New Tai Lue", - 0x19E0, 0x19FF, "Khmer Symbols", - 0x1A00, 0x1A1F, "Buginese", - 0x1A20, 0x1AAF, "Tai Tham", - 0x1AB0, 0x1AFF, "Combining Diacritical Marks Extended", - 0x1B00, 0x1B7F, "Balinese", - 0x1B80, 0x1BBF, "Sundanese", - 0x1BC0, 0x1BFF, "Batak", - 0x1C00, 0x1C4F, "Lepcha", - 0x1C50, 0x1C7F, "Ol Chiki", - 0x1CC0, 0x1CCF, "Sundanese Supplement", - 0x1CD0, 0x1CFF, "Vedic Extensions", - 0x1D00, 0x1D7F, "Phonetic Extensions", - 0x1D80, 0x1DBF, "Phonetic Extensions Supplement", - 0x1DC0, 0x1DFF, "Combining Diacritical Marks Supplement", - 0x1E00, 0x1EFF, "Latin Extended Additional", - 0x1F00, 0x1FFF, "Greek Extended", - 0x2000, 0x206F, "General Punctuation", - 0x2070, 0x209F, "Superscripts and Subscripts", - 0x20A0, 0x20CF, "Currency Symbols", - 0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols", - 0x2100, 0x214F, "Letterlike Symbols", - 0x2150, 0x218F, "Number Forms", - 0x2190, 0x21FF, "Arrows", - 0x2200, 0x22FF, "Mathematical Operators", - 0x2300, 0x23FF, "Miscellaneous Technical", - 0x2400, 0x243F, "Control Pictures", - 0x2440, 0x245F, "Optical Character Recognition", - 0x2460, 0x24FF, "Enclosed Alphanumerics", 0x2500, 0x257F, "Box Drawing", 0x2580, 0x259F, "Block Elements", - 0x25A0, 0x25FF, "Geometric Shapes", - 0x2600, 0x26FF, "Miscellaneous Symbols", - 0x2700, 0x27BF, "Dingbats", - 0x27C0, 0x27EF, "Miscellaneous Mathematical Symbols-A", - 0x27F0, 0x27FF, "Supplemental Arrows-A", - 0x2800, 0x28FF, "Braille Patterns", - 0x2900, 0x297F, "Supplemental Arrows-B", - 0x2980, 0x29FF, "Miscellaneous Mathematical Symbols-B", - 0x2A00, 0x2AFF, "Supplemental Mathematical Operators", - 0x2B00, 0x2BFF, "Miscellaneous Symbols and Arrows", - 0x2C00, 0x2C5F, "Glagolitic", - 0x2C60, 0x2C7F, "Latin Extended-C", - 0x2C80, 0x2CFF, "Coptic", - 0x2D00, 0x2D2F, "Georgian Supplement", - 0x2D30, 0x2D7F, "Tifinagh", - 0x2D80, 0x2DDF, "Ethiopic Extended", - 0x2DE0, 0x2DFF, "Cyrillic Extended-A", - 0x2E00, 0x2E7F, "Supplemental Punctuation", - 0x2E80, 0x2EFF, "CJK Radicals Supplement", - 0x2F00, 0x2FDF, "Kangxi Radicals", - 0x2FF0, 0x2FFF, "Ideographic Description Characters", - 0x3000, 0x303F, "CJK Symbols and Punctuation", - 0x3040, 0x309F, "Hiragana", - 0x30A0, 0x30FF, "Katakana", - 0x3100, 0x312F, "Bopomofo", - 0x3130, 0x318F, "Hangul Compatibility Jamo", - 0x3190, 0x319F, "Kanbun", - 0x31A0, 0x31BF, "Bopomofo Extended", - 0x31C0, 0x31EF, "CJK Strokes", - 0x31F0, 0x31FF, "Katakana Phonetic Extensions", - 0x3200, 0x32FF, "Enclosed CJK Letters and Months", - 0x3300, 0x33FF, "CJK Compatibility", - 0x3400, 0x4DBF, "CJK Unified Ideographs Extension A", - 0x4DC0, 0x4DFF, "Yijing Hexagram Symbols", - 0x4E00, 0x9FFF, "CJK Unified Ideographs", - 0xA000, 0xA48F, "Yi Syllables", - 0xA490, 0xA4CF, "Yi Radicals", - 0xA4D0, 0xA4FF, "Lisu", - 0xA500, 0xA63F, "Vai", - 0xA640, 0xA69F, "Cyrillic Extended-B", - 0xA6A0, 0xA6FF, "Bamum", - 0xA700, 0xA71F, "Modifier Tone Letters", - 0xA720, 0xA7FF, "Latin Extended-D", - 0xA800, 0xA82F, "Syloti Nagri", - 0xA830, 0xA83F, "Common Indic Number Forms", - 0xA840, 0xA87F, "Phags-pa", - 0xA880, 0xA8DF, "Saurashtra", - 0xA8E0, 0xA8FF, "Devanagari Extended", - 0xA900, 0xA92F, "Kayah Li", - 0xA930, 0xA95F, "Rejang", - 0xA960, 0xA97F, "Hangul Jamo Extended-A", - 0xA980, 0xA9DF, "Javanese", - 0xA9E0, 0xA9FF, "Myanmar Extended-B", - 0xAA00, 0xAA5F, "Cham", - 0xAA60, 0xAA7F, "Myanmar Extended-A", - 0xAA80, 0xAADF, "Tai Viet", - 0xAAE0, 0xAAFF, "Meetei Mayek Extensions", - 0xAB00, 0xAB2F, "Ethiopic Extended-A", - 0xAB30, 0xAB6F, "Latin Extended-E", - 0xABC0, 0xABFF, "Meetei Mayek", - 0xAC00, 0xD7AF, "Hangul Syllables", - 0xD7B0, 0xD7FF, "Hangul Jamo Extended-B", - 0xD800, 0xDB7F, "High Surrogates", - 0xDB80, 0xDBFF, "High Private Use Surrogates", - 0xDC00, 0xDFFF, "Low Surrogates", - 0xE000, 0xF8FF, "Private Use Area", - 0xF900, 0xFAFF, "CJK Compatibility Ideographs", - 0xFB00, 0xFB4F, "Alphabetic Presentation Forms", - 0xFB50, 0xFDFF, "Arabic Presentation Forms-A", - 0xFE00, 0xFE0F, "Variation Selectors", - 0xFE10, 0xFE1F, "Vertical Forms", - 0xFE20, 0xFE2F, "Combining Half Marks", - 0xFE30, 0xFE4F, "CJK Compatibility Forms", - 0xFE50, 0xFE6F, "Small Form Variants", - 0xFE70, 0xFEFF, "Arabic Presentation Forms-B", - 0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms", - 0xFFF0, 0xFFFF, "Specials", - 0x10000, 0x1007F, "Linear B Syllabary", - 0x10080, 0x100FF, "Linear B Ideograms", - 0x10100, 0x1013F, "Aegean Numbers", - 0x10140, 0x1018F, "Ancient Greek Numbers", - 0x10190, 0x101CF, "Ancient Symbols", - 0x101D0, 0x101FF, "Phaistos Disc", - 0x10280, 0x1029F, "Lycian", - 0x102A0, 0x102DF, "Carian", - 0x102E0, 0x102FF, "Coptic Epact Numbers", - 0x10300, 0x1032F, "Old Italic", - 0x10330, 0x1034F, "Gothic", - 0x10350, 0x1037F, "Old Permic", - 0x10380, 0x1039F, "Ugaritic", - 0x103A0, 0x103DF, "Old Persian", - 0x10400, 0x1044F, "Deseret", - 0x10450, 0x1047F, "Shavian", - 0x10480, 0x104AF, "Osmanya", - 0x10500, 0x1052F, "Elbasan", - 0x10530, 0x1056F, "Caucasian Albanian", - 0x10600, 0x1077F, "Linear A", - 0x10800, 0x1083F, "Cypriot Syllabary", - 0x10840, 0x1085F, "Imperial Aramaic", - 0x10860, 0x1087F, "Palmyrene", - 0x10880, 0x108AF, "Nabataean", - 0x10900, 0x1091F, "Phoenician", - 0x10920, 0x1093F, "Lydian", - 0x10980, 0x1099F, "Meroitic Hieroglyphs", - 0x109A0, 0x109FF, "Meroitic Cursive", - 0x10A00, 0x10A5F, "Kharoshthi", - 0x10A60, 0x10A7F, "Old South Arabian", - 0x10A80, 0x10A9F, "Old North Arabian", - 0x10AC0, 0x10AFF, "Manichaean", - 0x10B00, 0x10B3F, "Avestan", - 0x10B40, 0x10B5F, "Inscriptional Parthian", - 0x10B60, 0x10B7F, "Inscriptional Pahlavi", - 0x10B80, 0x10BAF, "Psalter Pahlavi", - 0x10C00, 0x10C4F, "Old Turkic", - 0x10E60, 0x10E7F, "Rumi Numeral Symbols", - 0x11000, 0x1107F, "Brahmi", - 0x11080, 0x110CF, "Kaithi", - 0x110D0, 0x110FF, "Sora Sompeng", - 0x11100, 0x1114F, "Chakma", - 0x11150, 0x1117F, "Mahajani", - 0x11180, 0x111DF, "Sharada", - 0x111E0, 0x111FF, "Sinhala Archaic Numbers", - 0x11200, 0x1124F, "Khojki", - 0x112B0, 0x112FF, "Khudawadi", - 0x11300, 0x1137F, "Grantha", - 0x11480, 0x114DF, "Tirhuta", - 0x11580, 0x115FF, "Siddham", - 0x11600, 0x1165F, "Modi", - 0x11680, 0x116CF, "Takri", - 0x118A0, 0x118FF, "Warang Citi", - 0x11AC0, 0x11AFF, "Pau Cin Hau", - 0x12000, 0x123FF, "Cuneiform", - 0x12400, 0x1247F, "Cuneiform Numbers and Punctuation", - 0x13000, 0x1342F, "Egyptian Hieroglyphs", - 0x16800, 0x16A3F, "Bamum Supplement", - 0x16A40, 0x16A6F, "Mro", - 0x16AD0, 0x16AFF, "Bassa Vah", - 0x16B00, 0x16B8F, "Pahawh Hmong", - 0x16F00, 0x16F9F, "Miao", - 0x1B000, 0x1B0FF, "Kana Supplement", - 0x1BC00, 0x1BC9F, "Duployan", - 0x1BCA0, 0x1BCAF, "Shorthand Format Controls", - 0x1D000, 0x1D0FF, "Byzantine Musical Symbols", - 0x1D100, 0x1D1FF, "Musical Symbols", - 0x1D200, 0x1D24F, "Ancient Greek Musical Notation", - 0x1D300, 0x1D35F, "Tai Xuan Jing Symbols", - 0x1D360, 0x1D37F, "Counting Rod Numerals", - 0x1D400, 0x1D7FF, "Mathematical Alphanumeric Symbols", - 0x1E800, 0x1E8DF, "Mende Kikakui", - 0x1EE00, 0x1EEFF, "Arabic Mathematical Alphabetic Symbols", - 0x1F000, 0x1F02F, "Mahjong Tiles", - 0x1F030, 0x1F09F, "Domino Tiles", - 0x1F0A0, 0x1F0FF, "Playing Cards", - 0x1F100, 0x1F1FF, "Enclosed Alphanumeric Supplement", - 0x1F200, 0x1F2FF, "Enclosed Ideographic Supplement", - 0x1F300, 0x1F5FF, "Miscellaneous Symbols and Pictographs", - 0x1F600, 0x1F64F, "Emoticons", - 0x1F650, 0x1F67F, "Ornamental Dingbats", - 0x1F680, 0x1F6FF, "Transport and Map Symbols", - 0x1F700, 0x1F77F, "Alchemical Symbols", - 0x1F780, 0x1F7FF, "Geometric Shapes Extended", - 0x1F800, 0x1F8FF, "Supplemental Arrows-C", - 0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B", - 0x2A700, 0x2B73F, "CJK Unified Ideographs Extension C", - 0x2B740, 0x2B81F, "CJK Unified Ideographs Extension D", - 0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement", - 0xE0000, 0xE007F, "Tags", - 0xE0100, 0xE01EF, "Variation Selectors Supplement", - 0xF0000, 0xFFFFF, "Supplementary Private Use Area-A", - 0x100000, 0x10FFFF, "Supplementary Private Use Area-B", ] var UNICODE_BLOCK_COUNT = UNICODE_BLOCK_LIST.length / 3 var UNICODE_LOOKUP = {} @@ -259,6 +11,14 @@ var unicode = (function(){ UNICODE_LOOKUP[ UNICODE_BLOCK_LIST[i+2] ] = [ UNICODE_BLOCK_LIST[i], UNICODE_BLOCK_LIST[i+1] ] } + function block (name, n){ + var b = UNICODE_LOOKUP[name] + if (! b) return "" + return range.apply(null, b).map(function(n){ return String.fromCharCode(n) }) + } + function entities (a) { + return a.map(function(k){ return "&#" + k.join(";&#") + ";" }).join("
") + } function index (j) { return [ UNICODE_BLOCK_LIST[j*3], UNICODE_BLOCK_LIST[j*3+1], UNICODE_BLOCK_LIST[j*3+2], [] ] } @@ -270,89 +30,7 @@ var unicode = (function(){ } return a } - function paginate (a, n){ - var aa = [], ai, i = 0 - while (i < 100) { - ai = a.slice(i * n, (i+1) * n) - if (! ai.length) break - aa.push(ai) - i++ - } - return aa - } - function block (name, n){ - var b = UNICODE_LOOKUP[name] - if (! b) return "" - return range.apply(null, b).map(function(n){ return String.fromCharCode(n) }) - } - function entities (a) { - return a.map(function(k){ return "&#" + k.join(";&#") + ";" }).join("
") - } - function findGroups (chars){ - var groups = [], row, list - for (var i = 0, j = -1, next = -1, len = chars.length; i < len; i++) { - if (chars[i] < next) { - list.push(chars[i]) - continue - } - do { - j += 1 - next = UNICODE_BLOCK_LIST[(j+1)*3] - } while (chars[i] > next) - row = index(j) - list = row[3] - groups.push( row ) - } - return groups - } - - // encodes unicode characters as escaped utf16 - \xFFFF - // encodes ONLY non-ascii characters - function escapeToUtf16 (txt) { - var escaped_txt = "", kode - for (var i = 0; i < txt.length; i++) { - kode = txt.charCodeAt(i) - if (kode > 0x7f) { - kode = kode.toString(16) - switch (kode.length) { - case 2: - kode = "0" + kode - case 3: - kode = "0" + kode - } - escaped_txt += "\\u" + kode - } - else { - escaped_txt += txt[i] - } - } - return escaped_txt - } - // encodes unicode characters as escaped bytes - \xFF - // encodes ONLY non-ascii characters - function escapeToEscapedBytes (txt) { - var escaped_txt = "", kode, utf8_bytes - for (var i = 0; i < txt.length; i++) { - kode = txt.charCodeAt(i) - if (kode > 0x7f) { - utf8_bytes = convertUnicodeCodePointToUtf8Bytes(kode) - escaped_txt += convertBytesToEscapedString(utf8_bytes, 16) - } - else { - escaped_txt += txt[i] - } - } - return escaped_txt - } - - // encodes unicode characters as escaped bytes - \xFF - // encodes an ENTIRE string - function escapeAllToEscapedBytes(str, base) { - var unicode_codes = convertStringToUnicodeCodePoints(str); - var data_bytes = convertUnicodeCodePointsToBytes(unicode_codes); - return convertBytesToEscapedString(data_bytes, 16); - } // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => '\xE3\x81\x82\xE3\x81\x84' // [ 0343, 0201, 0202, 0343, 0201, 0204 ] => '\343\201\202\343\201\204' function convertBytesToEscapedString(data_bytes, base) { @@ -365,14 +43,27 @@ var unicode = (function(){ } return escaped; } - // [ 0x3042, 0x3044 ] => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] - function convertUnicodeCodePointsToBytes(unicode_codes) { - var utf8_bytes = []; - for (var i = 0; i < unicode_codes.length; ++i) { - var bytes = convertUnicodeCodePointToUtf8Bytes(unicode_codes[i]); - utf8_bytes = utf8_bytes.concat(bytes); + // r'\xE3\x81\x82\xE3\x81\x84' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] + // r'\343\201\202\343\201\204' => [ 0343, 0201, 0202, 0343, 0201, 0204 ] + function convertEscapedBytesToBytes(str) { + var parts = str.split("\\x"); + parts.shift(); // Trim the first element. + var codes = []; + var max = Math.pow(2, 8); + for (var i = 0; i < parts.length; ++i) { + var code = parseInt(parts[i], 16); + if (code >= 0 && code < max) { + codes.push(code); + } else { + // Malformed code ignored. + } } - return utf8_bytes; + return codes; + } + // [ 0x3042, 0x3044 ] => "あい" + function convertUnicodeCodePointsToString(unicode_codes) { + var utf16_codes = convertUnicodeCodePointsToUtf16Codes(unicode_codes); + return convertUtf16CodesToString(utf16_codes); } // 0x3042 => [ 0xE3, 0x81, 0x82 ] function convertUnicodeCodePointToUtf8Bytes(unicode_code) { @@ -394,62 +85,22 @@ var unicode = (function(){ } return utf8_bytes; } - // "あい" => [ 0x3042, 0x3044 ] - function convertStringToUnicodeCodePoints(str) { - var surrogate_1st = 0; - var unicode_codes = []; - for (var i = 0; i < str.length; ++i) { - var utf16_code = str.charCodeAt(i); - if (surrogate_1st != 0) { - if (utf16_code >= 0xDC00 && utf16_code <= 0xDFFF) { - var surrogate_2nd = utf16_code; - var unicode_code = (surrogate_1st - 0xD800) * (1 << 10) + (1 << 16) + - (surrogate_2nd - 0xDC00); - unicode_codes.push(unicode_code); - } else { - // Malformed surrogate pair ignored. - } - surrogate_1st = 0; - } else if (utf16_code >= 0xD800 && utf16_code <= 0xDBFF) { - surrogate_1st = utf16_code; + // [ 0x3042, 0x3044 ] => [ 0x3042, 0x3044 ] + // [ 0xD840, 0xDC0B ] => [ 0x2000B ] // A surrogate pair. + function convertUnicodeCodePointsToUtf16Codes(unicode_codes) { + var utf16_codes = []; + for (var i = 0; i < unicode_codes.length; ++i) { + var unicode_code = unicode_codes[i]; + if (unicode_code < (1 << 16)) { + utf16_codes.push(unicode_code); } else { - unicode_codes.push(utf16_code); + var first = ((unicode_code - (1 << 16)) / (1 << 10)) + 0xD800; + var second = (unicode_code % (1 << 10)) + 0xDC00; + utf16_codes.push(first) + utf16_codes.push(second) } } - return unicode_codes; - } - // 0xff => "ff" - // 0xff => "377" - function formatNumber(number, base, num_digits) { - var str = number.toString(base).toUpperCase(); - for (var i = str.length; i < num_digits; ++i) { - str = "0" + str; - } - return str; - } - - // convert \xFF\xFF\xFF to unicode - function unescapeFromEscapedBytes (str) { - var data_bytes = convertEscapedBytesToBytes(str); - var unicode_codes = convertUtf8BytesToUnicodeCodePoints(data_bytes); - return convertUnicodeCodePointsToString(unicode_codes); - } - // r'\xE3\x81\x82\xE3\x81\x84' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] - // r'\343\201\202\343\201\204' => [ 0343, 0201, 0202, 0343, 0201, 0204 ] - function convertEscapedBytesToBytes(str) { - var parts = str.split("\\x"); - parts.shift(); // Trim the first element. - var codes = []; - var max = Math.pow(2, 8); - for (var i = 0; i < parts.length; ++i) { - var code = parseInt(parts[i], 16); - if (code >= 0 && code < max) { - codes.push(code); - } else { - // Malformed code ignored. - } - } - return codes; + return utf16_codes; } // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => [ 0x3042, 0x3044 ] function convertUtf8BytesToUnicodeCodePoints(utf8_bytes) { @@ -498,28 +149,6 @@ var unicode = (function(){ unicode_codes.shift(); // Trim the first element. return unicode_codes; } - // [ 0x3042, 0x3044 ] => [ 0x3042, 0x3044 ] - // [ 0xD840, 0xDC0B ] => [ 0x2000B ] // A surrogate pair. - function convertUnicodeCodePointsToUtf16Codes(unicode_codes) { - var utf16_codes = []; - for (var i = 0; i < unicode_codes.length; ++i) { - var unicode_code = unicode_codes[i]; - if (unicode_code < (1 << 16)) { - utf16_codes.push(unicode_code); - } else { - var first = ((unicode_code - (1 << 16)) / (1 << 10)) + 0xD800; - var second = (unicode_code % (1 << 10)) + 0xDC00; - utf16_codes.push(first) - utf16_codes.push(second) - } - } - return utf16_codes; - } - // [ 0x3042, 0x3044 ] => "あい" - function convertUnicodeCodePointsToString(unicode_codes) { - var utf16_codes = convertUnicodeCodePointsToUtf16Codes(unicode_codes); - return convertUtf16CodesToString(utf16_codes); - } // [ 0x3042, 0x3044 ] => "あい" function convertUtf16CodesToString(utf16_codes) { var unescaped = ''; @@ -528,6 +157,39 @@ var unicode = (function(){ } return unescaped; } + // 0xff => "ff" + // 0xff => "377" + function formatNumber(number, base, num_digits) { + var str = number.toString(base).toUpperCase(); + for (var i = str.length; i < num_digits; ++i) { + str = "0" + str; + } + return str; + } + + // encodes unicode characters as escaped bytes - \xFF + // encodes ONLY non-ascii characters + function escapeToEscapedBytes (txt) { + var escaped_txt = "", kode, utf8_bytes + for (var i = 0; i < txt.length; i++) { + kode = txt.charCodeAt(i) + if (kode > 0x7f) { + utf8_bytes = convertUnicodeCodePointToUtf8Bytes(kode) + escaped_txt += convertBytesToEscapedString(utf8_bytes, 16) + } + else { + escaped_txt += txt[i] + } + } + return escaped_txt + } + + // convert \xFF\xFF\xFF to unicode + function unescapeFromEscapedBytes (str) { + var data_bytes = convertEscapedBytesToBytes(str); + var unicode_codes = convertUtf8BytesToUnicodeCodePoints(data_bytes); + return convertUnicodeCodePointsToString(unicode_codes); + } return { raw: UNICODE_BLOCK_LIST, @@ -535,8 +197,6 @@ var unicode = (function(){ index: index, range: range, block: block, - findGroups: findGroups, - paginate: paginate, escapeToEscapedBytes: escapeToEscapedBytes, unescapeFromEscapedBytes: unescapeFromEscapedBytes, }