js/ext/unicode.js: cleanup.

2025-06-11 19:39:53 +00:00 · 2018-10-25 02:50:37 +02:00 · 2018-10-25 02:50:37 +02:00 · f15900dd75
commit f15900dd75
parent 5d8921b826
1 changed files with 74 additions and 414 deletions
--- a/js/ext/unicode.js
+++ b/js/ext/unicode.js
@ -2,256 +2,8 @@ var unicode = (function(){
  var UNICODE_BLOCK_LIST = [
    0x0020, 0x007F, "Basic Latin",
    0x0080, 0x00FF, "Latin-1 Supplement",
-    0x0100, 0x017F, "Latin Extended-A",
-    0x0180, 0x024F, "Latin Extended-B",
-    0x0250, 0x02AF, "IPA Extensions",
-    0x02B0, 0x02FF, "Spacing Modifier Letters",
-    0x0300, 0x036F, "Combining Diacritical Marks",
-    0x0370, 0x03FF, "Greek and Coptic",
-    0x0400, 0x04FF, "Cyrillic",
-    0x0500, 0x052F, "Cyrillic Supplement",
-    0x0530, 0x058F, "Armenian",
-    0x0590, 0x05FF, "Hebrew",
-    0x0600, 0x06FF, "Arabic",
-    0x0700, 0x074F, "Syriac",
-    0x0750, 0x077F, "Arabic Supplement",
-    0x0780, 0x07BF, "Thaana",
-    0x07C0, 0x07FF, "NKo",
-    0x0800, 0x083F, "Samaritan",
-    0x0840, 0x085F, "Mandaic",
-    0x08A0, 0x08FF, "Arabic Extended-A",
-    0x0900, 0x097F, "Devanagari",
-    0x0980, 0x09FF, "Bengali",
-    0x0A00, 0x0A7F, "Gurmukhi",
-    0x0A80, 0x0AFF, "Gujarati",
-    0x0B00, 0x0B7F, "Oriya",
-    0x0B80, 0x0BFF, "Tamil",
-    0x0C00, 0x0C7F, "Telugu",
-    0x0C80, 0x0CFF, "Kannada",
-    0x0D00, 0x0D7F, "Malayalam",
-    0x0D80, 0x0DFF, "Sinhala",
-    0x0E00, 0x0E7F, "Thai",
-    0x0E80, 0x0EFF, "Lao",
-    0x0F00, 0x0FFF, "Tibetan",
-    0x1000, 0x109F, "Myanmar",
-    0x10A0, 0x10FF, "Georgian",
-    0x1100, 0x11FF, "Hangul Jamo",
-    0x1200, 0x137F, "Ethiopic",
-    0x1380, 0x139F, "Ethiopic Supplement",
-    0x13A0, 0x13FF, "Cherokee",
-    0x1400, 0x167F, "Unified Canadian Aboriginal Syllabics",
-    0x1680, 0x169F, "Ogham",
-    0x16A0, 0x16FF, "Runic",
-    0x1700, 0x171F, "Tagalog",
-    0x1720, 0x173F, "Hanunoo",
-    0x1740, 0x175F, "Buhid",
-    0x1760, 0x177F, "Tagbanwa",
-    0x1780, 0x17FF, "Khmer",
-    0x1800, 0x18AF, "Mongolian",
-    0x18B0, 0x18FF, "Unified Canadian Aboriginal Syllabics Extended",
-    0x1900, 0x194F, "Limbu",
-    0x1950, 0x197F, "Tai Le",
-    0x1980, 0x19DF, "New Tai Lue",
-    0x19E0, 0x19FF, "Khmer Symbols",
-    0x1A00, 0x1A1F, "Buginese",
-    0x1A20, 0x1AAF, "Tai Tham",
-    0x1AB0, 0x1AFF, "Combining Diacritical Marks Extended",
-    0x1B00, 0x1B7F, "Balinese",
-    0x1B80, 0x1BBF, "Sundanese",
-    0x1BC0, 0x1BFF, "Batak",
-    0x1C00, 0x1C4F, "Lepcha",
-    0x1C50, 0x1C7F, "Ol Chiki",
-    0x1CC0, 0x1CCF, "Sundanese Supplement",
-    0x1CD0, 0x1CFF, "Vedic Extensions",
-    0x1D00, 0x1D7F, "Phonetic Extensions",
-    0x1D80, 0x1DBF, "Phonetic Extensions Supplement",
-    0x1DC0, 0x1DFF, "Combining Diacritical Marks Supplement",
-    0x1E00, 0x1EFF, "Latin Extended Additional",
-    0x1F00, 0x1FFF, "Greek Extended",
-    0x2000, 0x206F, "General Punctuation",
-    0x2070, 0x209F, "Superscripts and Subscripts",
-    0x20A0, 0x20CF, "Currency Symbols",
-    0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols",
-    0x2100, 0x214F, "Letterlike Symbols",
-    0x2150, 0x218F, "Number Forms",
-    0x2190, 0x21FF, "Arrows",
-    0x2200, 0x22FF, "Mathematical Operators",
-    0x2300, 0x23FF, "Miscellaneous Technical",
-    0x2400, 0x243F, "Control Pictures",
-    0x2440, 0x245F, "Optical Character Recognition",
-    0x2460, 0x24FF, "Enclosed Alphanumerics",
    0x2500, 0x257F, "Box Drawing",
    0x2580, 0x259F, "Block Elements",
-    0x25A0, 0x25FF, "Geometric Shapes",
-    0x2600, 0x26FF, "Miscellaneous Symbols",
-    0x2700, 0x27BF, "Dingbats",
-    0x27C0, 0x27EF, "Miscellaneous Mathematical Symbols-A",
-    0x27F0, 0x27FF, "Supplemental Arrows-A",
-    0x2800, 0x28FF, "Braille Patterns",
-    0x2900, 0x297F, "Supplemental Arrows-B",
-    0x2980, 0x29FF, "Miscellaneous Mathematical Symbols-B",
-    0x2A00, 0x2AFF, "Supplemental Mathematical Operators",
-    0x2B00, 0x2BFF, "Miscellaneous Symbols and Arrows",
-    0x2C00, 0x2C5F, "Glagolitic",
-    0x2C60, 0x2C7F, "Latin Extended-C",
-    0x2C80, 0x2CFF, "Coptic",
-    0x2D00, 0x2D2F, "Georgian Supplement",
-    0x2D30, 0x2D7F, "Tifinagh",
-    0x2D80, 0x2DDF, "Ethiopic Extended",
-    0x2DE0, 0x2DFF, "Cyrillic Extended-A",
-    0x2E00, 0x2E7F, "Supplemental Punctuation",
-    0x2E80, 0x2EFF, "CJK Radicals Supplement",
-    0x2F00, 0x2FDF, "Kangxi Radicals",
-    0x2FF0, 0x2FFF, "Ideographic Description Characters",
-    0x3000, 0x303F, "CJK Symbols and Punctuation",
-    0x3040, 0x309F, "Hiragana",
-    0x30A0, 0x30FF, "Katakana",
-    0x3100, 0x312F, "Bopomofo",
-    0x3130, 0x318F, "Hangul Compatibility Jamo",
-    0x3190, 0x319F, "Kanbun",
-    0x31A0, 0x31BF, "Bopomofo Extended",
-    0x31C0, 0x31EF, "CJK Strokes",
-    0x31F0, 0x31FF, "Katakana Phonetic Extensions",
-    0x3200, 0x32FF, "Enclosed CJK Letters and Months",
-    0x3300, 0x33FF, "CJK Compatibility",
-    0x3400, 0x4DBF, "CJK Unified Ideographs Extension A",
-    0x4DC0, 0x4DFF, "Yijing Hexagram Symbols",
-    0x4E00, 0x9FFF, "CJK Unified Ideographs",
-    0xA000, 0xA48F, "Yi Syllables",
-    0xA490, 0xA4CF, "Yi Radicals",
-    0xA4D0, 0xA4FF, "Lisu",
-    0xA500, 0xA63F, "Vai",
-    0xA640, 0xA69F, "Cyrillic Extended-B",
-    0xA6A0, 0xA6FF, "Bamum",
-    0xA700, 0xA71F, "Modifier Tone Letters",
-    0xA720, 0xA7FF, "Latin Extended-D",
-    0xA800, 0xA82F, "Syloti Nagri",
-    0xA830, 0xA83F, "Common Indic Number Forms",
-    0xA840, 0xA87F, "Phags-pa",
-    0xA880, 0xA8DF, "Saurashtra",
-    0xA8E0, 0xA8FF, "Devanagari Extended",
-    0xA900, 0xA92F, "Kayah Li",
-    0xA930, 0xA95F, "Rejang",
-    0xA960, 0xA97F, "Hangul Jamo Extended-A",
-    0xA980, 0xA9DF, "Javanese",
-    0xA9E0, 0xA9FF, "Myanmar Extended-B",
-    0xAA00, 0xAA5F, "Cham",
-    0xAA60, 0xAA7F, "Myanmar Extended-A",
-    0xAA80, 0xAADF, "Tai Viet",
-    0xAAE0, 0xAAFF, "Meetei Mayek Extensions",
-    0xAB00, 0xAB2F, "Ethiopic Extended-A",
-    0xAB30, 0xAB6F, "Latin Extended-E",
-    0xABC0, 0xABFF, "Meetei Mayek",
-    0xAC00, 0xD7AF, "Hangul Syllables",
-    0xD7B0, 0xD7FF, "Hangul Jamo Extended-B",
-    0xD800, 0xDB7F, "High Surrogates",
-    0xDB80, 0xDBFF, "High Private Use Surrogates",
-    0xDC00, 0xDFFF, "Low Surrogates",
-    0xE000, 0xF8FF, "Private Use Area",
-    0xF900, 0xFAFF, "CJK Compatibility Ideographs",
-    0xFB00, 0xFB4F, "Alphabetic Presentation Forms",
-    0xFB50, 0xFDFF, "Arabic Presentation Forms-A",
-    0xFE00, 0xFE0F, "Variation Selectors",
-    0xFE10, 0xFE1F, "Vertical Forms",
-    0xFE20, 0xFE2F, "Combining Half Marks",
-    0xFE30, 0xFE4F, "CJK Compatibility Forms",
-    0xFE50, 0xFE6F, "Small Form Variants",
-    0xFE70, 0xFEFF, "Arabic Presentation Forms-B",
-    0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms",
-    0xFFF0, 0xFFFF, "Specials",
-    0x10000, 0x1007F, "Linear B Syllabary",
-    0x10080, 0x100FF, "Linear B Ideograms",
-    0x10100, 0x1013F, "Aegean Numbers",
-    0x10140, 0x1018F, "Ancient Greek Numbers",
-    0x10190, 0x101CF, "Ancient Symbols",
-    0x101D0, 0x101FF, "Phaistos Disc",
-    0x10280, 0x1029F, "Lycian",
-    0x102A0, 0x102DF, "Carian",
-    0x102E0, 0x102FF, "Coptic Epact Numbers",
-    0x10300, 0x1032F, "Old Italic",
-    0x10330, 0x1034F, "Gothic",
-    0x10350, 0x1037F, "Old Permic",
-    0x10380, 0x1039F, "Ugaritic",
-    0x103A0, 0x103DF, "Old Persian",
-    0x10400, 0x1044F, "Deseret",
-    0x10450, 0x1047F, "Shavian",
-    0x10480, 0x104AF, "Osmanya",
-    0x10500, 0x1052F, "Elbasan",
-    0x10530, 0x1056F, "Caucasian Albanian",
-    0x10600, 0x1077F, "Linear A",
-    0x10800, 0x1083F, "Cypriot Syllabary",
-    0x10840, 0x1085F, "Imperial Aramaic",
-    0x10860, 0x1087F, "Palmyrene",
-    0x10880, 0x108AF, "Nabataean",
-    0x10900, 0x1091F, "Phoenician",
-    0x10920, 0x1093F, "Lydian",
-    0x10980, 0x1099F, "Meroitic Hieroglyphs",
-    0x109A0, 0x109FF, "Meroitic Cursive",
-    0x10A00, 0x10A5F, "Kharoshthi",
-    0x10A60, 0x10A7F, "Old South Arabian",
-    0x10A80, 0x10A9F, "Old North Arabian",
-    0x10AC0, 0x10AFF, "Manichaean",
-    0x10B00, 0x10B3F, "Avestan",
-    0x10B40, 0x10B5F, "Inscriptional Parthian",
-    0x10B60, 0x10B7F, "Inscriptional Pahlavi",
-    0x10B80, 0x10BAF, "Psalter Pahlavi",
-    0x10C00, 0x10C4F, "Old Turkic",
-    0x10E60, 0x10E7F, "Rumi Numeral Symbols",
-    0x11000, 0x1107F, "Brahmi",
-    0x11080, 0x110CF, "Kaithi",
-    0x110D0, 0x110FF, "Sora Sompeng",
-    0x11100, 0x1114F, "Chakma",
-    0x11150, 0x1117F, "Mahajani",
-    0x11180, 0x111DF, "Sharada",
-    0x111E0, 0x111FF, "Sinhala Archaic Numbers",
-    0x11200, 0x1124F, "Khojki",
-    0x112B0, 0x112FF, "Khudawadi",
-    0x11300, 0x1137F, "Grantha",
-    0x11480, 0x114DF, "Tirhuta",
-    0x11580, 0x115FF, "Siddham",
-    0x11600, 0x1165F, "Modi",
-    0x11680, 0x116CF, "Takri",
-    0x118A0, 0x118FF, "Warang Citi",
-    0x11AC0, 0x11AFF, "Pau Cin Hau",
-    0x12000, 0x123FF, "Cuneiform",
-    0x12400, 0x1247F, "Cuneiform Numbers and Punctuation",
-    0x13000, 0x1342F, "Egyptian Hieroglyphs",
-    0x16800, 0x16A3F, "Bamum Supplement",
-    0x16A40, 0x16A6F, "Mro",
-    0x16AD0, 0x16AFF, "Bassa Vah",
-    0x16B00, 0x16B8F, "Pahawh Hmong",
-    0x16F00, 0x16F9F, "Miao",
-    0x1B000, 0x1B0FF, "Kana Supplement",
-    0x1BC00, 0x1BC9F, "Duployan",
-    0x1BCA0, 0x1BCAF, "Shorthand Format Controls",
-    0x1D000, 0x1D0FF, "Byzantine Musical Symbols",
-    0x1D100, 0x1D1FF, "Musical Symbols",
-    0x1D200, 0x1D24F, "Ancient Greek Musical Notation",
-    0x1D300, 0x1D35F, "Tai Xuan Jing Symbols",
-    0x1D360, 0x1D37F, "Counting Rod Numerals",
-    0x1D400, 0x1D7FF, "Mathematical Alphanumeric Symbols",
-    0x1E800, 0x1E8DF, "Mende Kikakui",
-    0x1EE00, 0x1EEFF, "Arabic Mathematical Alphabetic Symbols",
-    0x1F000, 0x1F02F, "Mahjong Tiles",
-    0x1F030, 0x1F09F, "Domino Tiles",
-    0x1F0A0, 0x1F0FF, "Playing Cards",
-    0x1F100, 0x1F1FF, "Enclosed Alphanumeric Supplement",
-    0x1F200, 0x1F2FF, "Enclosed Ideographic Supplement",
-    0x1F300, 0x1F5FF, "Miscellaneous Symbols and Pictographs",
-    0x1F600, 0x1F64F, "Emoticons",
-    0x1F650, 0x1F67F, "Ornamental Dingbats",
-    0x1F680, 0x1F6FF, "Transport and Map Symbols",
-    0x1F700, 0x1F77F, "Alchemical Symbols",
-    0x1F780, 0x1F7FF, "Geometric Shapes Extended",
-    0x1F800, 0x1F8FF, "Supplemental Arrows-C",
-    0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B",
-    0x2A700, 0x2B73F, "CJK Unified Ideographs Extension C",
-    0x2B740, 0x2B81F, "CJK Unified Ideographs Extension D",
-    0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement",
-    0xE0000, 0xE007F, "Tags",
-    0xE0100, 0xE01EF, "Variation Selectors Supplement",
-    0xF0000, 0xFFFFF, "Supplementary Private Use Area-A",
-    0x100000, 0x10FFFF, "Supplementary Private Use Area-B",
  ]
  var UNICODE_BLOCK_COUNT = UNICODE_BLOCK_LIST.length / 3
  var UNICODE_LOOKUP = {}
@ -259,6 +11,14 @@ var unicode = (function(){
    UNICODE_LOOKUP[ UNICODE_BLOCK_LIST[i+2] ] = [ UNICODE_BLOCK_LIST[i], UNICODE_BLOCK_LIST[i+1] ]
  }

+  function block (name, n){
+    var b = UNICODE_LOOKUP[name]
+    if (! b) return ""
+    return range.apply(null, b).map(function(n){ return String.fromCharCode(n) })
+  }
+  function entities (a) {
+    return a.map(function(k){ return "&#" + k.join(";&#") + ";" }).join("<br>")
+  }
  function index (j) {
    return [ UNICODE_BLOCK_LIST[j*3], UNICODE_BLOCK_LIST[j*3+1], UNICODE_BLOCK_LIST[j*3+2], [] ]
  }
@ -270,89 +30,7 @@ var unicode = (function(){
    }
    return a
  }
-  function paginate (a, n){
-    var aa = [], ai, i = 0
-    while (i < 100) {
-      ai = a.slice(i * n, (i+1) * n)
-      if (! ai.length) break
-      aa.push(ai)
-      i++
-    }
-    return aa
-  }
-  function block (name, n){
-    var b = UNICODE_LOOKUP[name]
-    if (! b) return ""
-    return range.apply(null, b).map(function(n){ return String.fromCharCode(n) })
-  }
-  function entities (a) {
-    return a.map(function(k){ return "&#" + k.join(";&#") + ";" }).join("<br>")
-  }
-  function findGroups (chars){
-    var groups = [], row, list
-    for (var i = 0, j = -1, next = -1, len = chars.length; i < len; i++) {
-      if (chars[i] < next) {
-        list.push(chars[i])
-        continue
-      }
-      do {
-        j += 1
-        next = UNICODE_BLOCK_LIST[(j+1)*3]
-      } while (chars[i] > next)
-      row = index(j)
-      list = row[3]
-      groups.push( row )
-    }
-    return groups
-  }
-  
-  // encodes unicode characters as escaped utf16 - \xFFFF
-  // encodes ONLY non-ascii characters
-  function escapeToUtf16 (txt) {
-    var escaped_txt = "", kode
-    for (var i = 0; i < txt.length; i++) {
-      kode = txt.charCodeAt(i)
-      if (kode > 0x7f) {
-        kode = kode.toString(16)
-        switch (kode.length) {
-          case 2:
-            kode = "0" + kode
-          case 3:
-            kode = "0" + kode
-        }
-        escaped_txt += "\\u" + kode
-      }
-      else {
-        escaped_txt += txt[i]
-      }
-    }
-    return escaped_txt
-  }

-  // encodes unicode characters as escaped bytes - \xFF
-  // encodes ONLY non-ascii characters
-  function escapeToEscapedBytes (txt) {
-    var escaped_txt = "", kode, utf8_bytes
-    for (var i = 0; i < txt.length; i++) {
-      kode = txt.charCodeAt(i)
-      if (kode > 0x7f) {
-        utf8_bytes = convertUnicodeCodePointToUtf8Bytes(kode)
-        escaped_txt += convertBytesToEscapedString(utf8_bytes, 16)
-      }
-      else {
-        escaped_txt += txt[i]
-      }
-    }
-    return escaped_txt
-  }
-
-  // encodes unicode characters as escaped bytes - \xFF
-  // encodes an ENTIRE string
-  function escapeAllToEscapedBytes(str, base) {
-    var unicode_codes = convertStringToUnicodeCodePoints(str);
-    var data_bytes = convertUnicodeCodePointsToBytes(unicode_codes);
-    return convertBytesToEscapedString(data_bytes, 16);
-  }
  // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => '\xE3\x81\x82\xE3\x81\x84'
  // [ 0343, 0201, 0202, 0343, 0201, 0204 ] => '\343\201\202\343\201\204'
  function convertBytesToEscapedString(data_bytes, base) {
@ -365,14 +43,27 @@ var unicode = (function(){
    }
    return escaped;
  }
-  // [ 0x3042, 0x3044 ] => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
-  function convertUnicodeCodePointsToBytes(unicode_codes) {
-    var utf8_bytes = [];
-    for (var i = 0; i < unicode_codes.length; ++i) {
-      var bytes = convertUnicodeCodePointToUtf8Bytes(unicode_codes[i]);
-      utf8_bytes = utf8_bytes.concat(bytes);
+  // r'\xE3\x81\x82\xE3\x81\x84' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
+  // r'\343\201\202\343\201\204' => [ 0343, 0201, 0202, 0343, 0201, 0204 ]
+  function convertEscapedBytesToBytes(str) {
+    var parts = str.split("\\x");
+    parts.shift();  // Trim the first element.
+    var codes = [];
+    var max = Math.pow(2, 8);
+    for (var i = 0; i < parts.length; ++i) {
+      var code = parseInt(parts[i], 16);
+      if (code >= 0 && code < max) {
+        codes.push(code);
+      } else {
+        // Malformed code ignored.
+      }
    }
-    return utf8_bytes;
+    return codes;
+  }
+  // [ 0x3042, 0x3044 ] => "ã‚ã„"
+  function convertUnicodeCodePointsToString(unicode_codes) {
+    var utf16_codes = convertUnicodeCodePointsToUtf16Codes(unicode_codes);
+    return convertUtf16CodesToString(utf16_codes);
  }
  // 0x3042 => [ 0xE3, 0x81, 0x82 ]
  function convertUnicodeCodePointToUtf8Bytes(unicode_code) {
@ -394,62 +85,22 @@ var unicode = (function(){
    }
    return utf8_bytes;
  }
-  // "ã‚ã„" => [ 0x3042,  0x3044 ]
-  function convertStringToUnicodeCodePoints(str) {
-    var surrogate_1st = 0;
-    var unicode_codes = [];
-    for (var i = 0; i < str.length; ++i) {
-      var utf16_code = str.charCodeAt(i);
-      if (surrogate_1st != 0) {
-        if (utf16_code >= 0xDC00 && utf16_code <= 0xDFFF) {
-          var surrogate_2nd = utf16_code;
-          var unicode_code = (surrogate_1st - 0xD800) * (1 << 10) + (1 << 16) +
-                             (surrogate_2nd - 0xDC00);
-          unicode_codes.push(unicode_code);
-        } else {
-          // Malformed surrogate pair ignored.
-        }
-        surrogate_1st = 0;
-      } else if (utf16_code >= 0xD800 && utf16_code <= 0xDBFF) {
-        surrogate_1st = utf16_code;
+  // [ 0x3042, 0x3044 ] => [ 0x3042, 0x3044 ]
+  // [ 0xD840, 0xDC0B ] => [ 0x2000B ]  // A surrogate pair.
+  function convertUnicodeCodePointsToUtf16Codes(unicode_codes) {
+    var utf16_codes = [];
+    for (var i = 0; i < unicode_codes.length; ++i) {
+      var unicode_code = unicode_codes[i];
+      if (unicode_code < (1 << 16)) {
+        utf16_codes.push(unicode_code);
      } else {
-        unicode_codes.push(utf16_code);
+        var first = ((unicode_code - (1 << 16)) / (1 << 10)) + 0xD800;
+        var second = (unicode_code % (1 << 10)) + 0xDC00;
+        utf16_codes.push(first)
+        utf16_codes.push(second)
      }
    }
-    return unicode_codes;
-  }
-  // 0xff => "ff"
-  // 0xff => "377"
-  function formatNumber(number, base, num_digits) {
-    var str = number.toString(base).toUpperCase();
-    for (var i = str.length; i < num_digits; ++i) {
-      str = "0" + str;
-    }
-    return str;
-  }
-
-  // convert \xFF\xFF\xFF to unicode
-  function unescapeFromEscapedBytes (str) {
-    var data_bytes = convertEscapedBytesToBytes(str);
-    var unicode_codes = convertUtf8BytesToUnicodeCodePoints(data_bytes);
-    return convertUnicodeCodePointsToString(unicode_codes);
-  }
-  // r'\xE3\x81\x82\xE3\x81\x84' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
-  // r'\343\201\202\343\201\204' => [ 0343, 0201, 0202, 0343, 0201, 0204 ]
-  function convertEscapedBytesToBytes(str) {
-    var parts = str.split("\\x");
-    parts.shift();  // Trim the first element.
-    var codes = [];
-    var max = Math.pow(2, 8);
-    for (var i = 0; i < parts.length; ++i) {
-      var code = parseInt(parts[i], 16);
-      if (code >= 0 && code < max) {
-        codes.push(code);
-      } else {
-        // Malformed code ignored.
-      }
-    }
-    return codes;
+    return utf16_codes;
  }
  // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => [ 0x3042, 0x3044 ]
  function convertUtf8BytesToUnicodeCodePoints(utf8_bytes) {
@ -498,28 +149,6 @@ var unicode = (function(){
    unicode_codes.shift();  // Trim the first element.
    return unicode_codes;
  }
-  // [ 0x3042, 0x3044 ] => [ 0x3042, 0x3044 ]
-  // [ 0xD840, 0xDC0B ] => [ 0x2000B ]  // A surrogate pair.
-  function convertUnicodeCodePointsToUtf16Codes(unicode_codes) {
-    var utf16_codes = [];
-    for (var i = 0; i < unicode_codes.length; ++i) {
-      var unicode_code = unicode_codes[i];
-      if (unicode_code < (1 << 16)) {
-        utf16_codes.push(unicode_code);
-      } else {
-        var first = ((unicode_code - (1 << 16)) / (1 << 10)) + 0xD800;
-        var second = (unicode_code % (1 << 10)) + 0xDC00;
-        utf16_codes.push(first)
-        utf16_codes.push(second)
-      }
-    }
-    return utf16_codes;
-  }
-  // [ 0x3042, 0x3044 ] => "ã‚ã„"
-  function convertUnicodeCodePointsToString(unicode_codes) {
-    var utf16_codes = convertUnicodeCodePointsToUtf16Codes(unicode_codes);
-    return convertUtf16CodesToString(utf16_codes);
-  }
  // [ 0x3042, 0x3044 ] => "ã‚ã„"
  function convertUtf16CodesToString(utf16_codes) {
    var unescaped = '';
@ -528,6 +157,39 @@ var unicode = (function(){
    }
    return unescaped;
  }
+  // 0xff => "ff"
+  // 0xff => "377"
+  function formatNumber(number, base, num_digits) {
+    var str = number.toString(base).toUpperCase();
+    for (var i = str.length; i < num_digits; ++i) {
+      str = "0" + str;
+    }
+    return str;
+  }
+
+  // encodes unicode characters as escaped bytes - \xFF
+  // encodes ONLY non-ascii characters
+  function escapeToEscapedBytes (txt) {
+    var escaped_txt = "", kode, utf8_bytes
+    for (var i = 0; i < txt.length; i++) {
+      kode = txt.charCodeAt(i)
+      if (kode > 0x7f) {
+        utf8_bytes = convertUnicodeCodePointToUtf8Bytes(kode)
+        escaped_txt += convertBytesToEscapedString(utf8_bytes, 16)
+      }
+      else {
+        escaped_txt += txt[i]
+      }
+    }
+    return escaped_txt
+  }
+
+  // convert \xFF\xFF\xFF to unicode
+  function unescapeFromEscapedBytes (str) {
+    var data_bytes = convertEscapedBytesToBytes(str);
+    var unicode_codes = convertUtf8BytesToUnicodeCodePoints(data_bytes);
+    return convertUnicodeCodePointsToString(unicode_codes);
+  }

  return {
    raw: UNICODE_BLOCK_LIST,
@ -535,8 +197,6 @@ var unicode = (function(){
    index: index,
    range: range,
    block: block,
-    findGroups: findGroups,
-    paginate: paginate,
    escapeToEscapedBytes: escapeToEscapedBytes,
    unescapeFromEscapedBytes: unescapeFromEscapedBytes,
  }