diff --git a/js/ext/unicode.js b/js/ext/unicode.js
index 2199750..0d4af8c 100644
--- a/js/ext/unicode.js
+++ b/js/ext/unicode.js
@@ -2,256 +2,8 @@ var unicode = (function(){
var UNICODE_BLOCK_LIST = [
0x0020, 0x007F, "Basic Latin",
0x0080, 0x00FF, "Latin-1 Supplement",
- 0x0100, 0x017F, "Latin Extended-A",
- 0x0180, 0x024F, "Latin Extended-B",
- 0x0250, 0x02AF, "IPA Extensions",
- 0x02B0, 0x02FF, "Spacing Modifier Letters",
- 0x0300, 0x036F, "Combining Diacritical Marks",
- 0x0370, 0x03FF, "Greek and Coptic",
- 0x0400, 0x04FF, "Cyrillic",
- 0x0500, 0x052F, "Cyrillic Supplement",
- 0x0530, 0x058F, "Armenian",
- 0x0590, 0x05FF, "Hebrew",
- 0x0600, 0x06FF, "Arabic",
- 0x0700, 0x074F, "Syriac",
- 0x0750, 0x077F, "Arabic Supplement",
- 0x0780, 0x07BF, "Thaana",
- 0x07C0, 0x07FF, "NKo",
- 0x0800, 0x083F, "Samaritan",
- 0x0840, 0x085F, "Mandaic",
- 0x08A0, 0x08FF, "Arabic Extended-A",
- 0x0900, 0x097F, "Devanagari",
- 0x0980, 0x09FF, "Bengali",
- 0x0A00, 0x0A7F, "Gurmukhi",
- 0x0A80, 0x0AFF, "Gujarati",
- 0x0B00, 0x0B7F, "Oriya",
- 0x0B80, 0x0BFF, "Tamil",
- 0x0C00, 0x0C7F, "Telugu",
- 0x0C80, 0x0CFF, "Kannada",
- 0x0D00, 0x0D7F, "Malayalam",
- 0x0D80, 0x0DFF, "Sinhala",
- 0x0E00, 0x0E7F, "Thai",
- 0x0E80, 0x0EFF, "Lao",
- 0x0F00, 0x0FFF, "Tibetan",
- 0x1000, 0x109F, "Myanmar",
- 0x10A0, 0x10FF, "Georgian",
- 0x1100, 0x11FF, "Hangul Jamo",
- 0x1200, 0x137F, "Ethiopic",
- 0x1380, 0x139F, "Ethiopic Supplement",
- 0x13A0, 0x13FF, "Cherokee",
- 0x1400, 0x167F, "Unified Canadian Aboriginal Syllabics",
- 0x1680, 0x169F, "Ogham",
- 0x16A0, 0x16FF, "Runic",
- 0x1700, 0x171F, "Tagalog",
- 0x1720, 0x173F, "Hanunoo",
- 0x1740, 0x175F, "Buhid",
- 0x1760, 0x177F, "Tagbanwa",
- 0x1780, 0x17FF, "Khmer",
- 0x1800, 0x18AF, "Mongolian",
- 0x18B0, 0x18FF, "Unified Canadian Aboriginal Syllabics Extended",
- 0x1900, 0x194F, "Limbu",
- 0x1950, 0x197F, "Tai Le",
- 0x1980, 0x19DF, "New Tai Lue",
- 0x19E0, 0x19FF, "Khmer Symbols",
- 0x1A00, 0x1A1F, "Buginese",
- 0x1A20, 0x1AAF, "Tai Tham",
- 0x1AB0, 0x1AFF, "Combining Diacritical Marks Extended",
- 0x1B00, 0x1B7F, "Balinese",
- 0x1B80, 0x1BBF, "Sundanese",
- 0x1BC0, 0x1BFF, "Batak",
- 0x1C00, 0x1C4F, "Lepcha",
- 0x1C50, 0x1C7F, "Ol Chiki",
- 0x1CC0, 0x1CCF, "Sundanese Supplement",
- 0x1CD0, 0x1CFF, "Vedic Extensions",
- 0x1D00, 0x1D7F, "Phonetic Extensions",
- 0x1D80, 0x1DBF, "Phonetic Extensions Supplement",
- 0x1DC0, 0x1DFF, "Combining Diacritical Marks Supplement",
- 0x1E00, 0x1EFF, "Latin Extended Additional",
- 0x1F00, 0x1FFF, "Greek Extended",
- 0x2000, 0x206F, "General Punctuation",
- 0x2070, 0x209F, "Superscripts and Subscripts",
- 0x20A0, 0x20CF, "Currency Symbols",
- 0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols",
- 0x2100, 0x214F, "Letterlike Symbols",
- 0x2150, 0x218F, "Number Forms",
- 0x2190, 0x21FF, "Arrows",
- 0x2200, 0x22FF, "Mathematical Operators",
- 0x2300, 0x23FF, "Miscellaneous Technical",
- 0x2400, 0x243F, "Control Pictures",
- 0x2440, 0x245F, "Optical Character Recognition",
- 0x2460, 0x24FF, "Enclosed Alphanumerics",
0x2500, 0x257F, "Box Drawing",
0x2580, 0x259F, "Block Elements",
- 0x25A0, 0x25FF, "Geometric Shapes",
- 0x2600, 0x26FF, "Miscellaneous Symbols",
- 0x2700, 0x27BF, "Dingbats",
- 0x27C0, 0x27EF, "Miscellaneous Mathematical Symbols-A",
- 0x27F0, 0x27FF, "Supplemental Arrows-A",
- 0x2800, 0x28FF, "Braille Patterns",
- 0x2900, 0x297F, "Supplemental Arrows-B",
- 0x2980, 0x29FF, "Miscellaneous Mathematical Symbols-B",
- 0x2A00, 0x2AFF, "Supplemental Mathematical Operators",
- 0x2B00, 0x2BFF, "Miscellaneous Symbols and Arrows",
- 0x2C00, 0x2C5F, "Glagolitic",
- 0x2C60, 0x2C7F, "Latin Extended-C",
- 0x2C80, 0x2CFF, "Coptic",
- 0x2D00, 0x2D2F, "Georgian Supplement",
- 0x2D30, 0x2D7F, "Tifinagh",
- 0x2D80, 0x2DDF, "Ethiopic Extended",
- 0x2DE0, 0x2DFF, "Cyrillic Extended-A",
- 0x2E00, 0x2E7F, "Supplemental Punctuation",
- 0x2E80, 0x2EFF, "CJK Radicals Supplement",
- 0x2F00, 0x2FDF, "Kangxi Radicals",
- 0x2FF0, 0x2FFF, "Ideographic Description Characters",
- 0x3000, 0x303F, "CJK Symbols and Punctuation",
- 0x3040, 0x309F, "Hiragana",
- 0x30A0, 0x30FF, "Katakana",
- 0x3100, 0x312F, "Bopomofo",
- 0x3130, 0x318F, "Hangul Compatibility Jamo",
- 0x3190, 0x319F, "Kanbun",
- 0x31A0, 0x31BF, "Bopomofo Extended",
- 0x31C0, 0x31EF, "CJK Strokes",
- 0x31F0, 0x31FF, "Katakana Phonetic Extensions",
- 0x3200, 0x32FF, "Enclosed CJK Letters and Months",
- 0x3300, 0x33FF, "CJK Compatibility",
- 0x3400, 0x4DBF, "CJK Unified Ideographs Extension A",
- 0x4DC0, 0x4DFF, "Yijing Hexagram Symbols",
- 0x4E00, 0x9FFF, "CJK Unified Ideographs",
- 0xA000, 0xA48F, "Yi Syllables",
- 0xA490, 0xA4CF, "Yi Radicals",
- 0xA4D0, 0xA4FF, "Lisu",
- 0xA500, 0xA63F, "Vai",
- 0xA640, 0xA69F, "Cyrillic Extended-B",
- 0xA6A0, 0xA6FF, "Bamum",
- 0xA700, 0xA71F, "Modifier Tone Letters",
- 0xA720, 0xA7FF, "Latin Extended-D",
- 0xA800, 0xA82F, "Syloti Nagri",
- 0xA830, 0xA83F, "Common Indic Number Forms",
- 0xA840, 0xA87F, "Phags-pa",
- 0xA880, 0xA8DF, "Saurashtra",
- 0xA8E0, 0xA8FF, "Devanagari Extended",
- 0xA900, 0xA92F, "Kayah Li",
- 0xA930, 0xA95F, "Rejang",
- 0xA960, 0xA97F, "Hangul Jamo Extended-A",
- 0xA980, 0xA9DF, "Javanese",
- 0xA9E0, 0xA9FF, "Myanmar Extended-B",
- 0xAA00, 0xAA5F, "Cham",
- 0xAA60, 0xAA7F, "Myanmar Extended-A",
- 0xAA80, 0xAADF, "Tai Viet",
- 0xAAE0, 0xAAFF, "Meetei Mayek Extensions",
- 0xAB00, 0xAB2F, "Ethiopic Extended-A",
- 0xAB30, 0xAB6F, "Latin Extended-E",
- 0xABC0, 0xABFF, "Meetei Mayek",
- 0xAC00, 0xD7AF, "Hangul Syllables",
- 0xD7B0, 0xD7FF, "Hangul Jamo Extended-B",
- 0xD800, 0xDB7F, "High Surrogates",
- 0xDB80, 0xDBFF, "High Private Use Surrogates",
- 0xDC00, 0xDFFF, "Low Surrogates",
- 0xE000, 0xF8FF, "Private Use Area",
- 0xF900, 0xFAFF, "CJK Compatibility Ideographs",
- 0xFB00, 0xFB4F, "Alphabetic Presentation Forms",
- 0xFB50, 0xFDFF, "Arabic Presentation Forms-A",
- 0xFE00, 0xFE0F, "Variation Selectors",
- 0xFE10, 0xFE1F, "Vertical Forms",
- 0xFE20, 0xFE2F, "Combining Half Marks",
- 0xFE30, 0xFE4F, "CJK Compatibility Forms",
- 0xFE50, 0xFE6F, "Small Form Variants",
- 0xFE70, 0xFEFF, "Arabic Presentation Forms-B",
- 0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms",
- 0xFFF0, 0xFFFF, "Specials",
- 0x10000, 0x1007F, "Linear B Syllabary",
- 0x10080, 0x100FF, "Linear B Ideograms",
- 0x10100, 0x1013F, "Aegean Numbers",
- 0x10140, 0x1018F, "Ancient Greek Numbers",
- 0x10190, 0x101CF, "Ancient Symbols",
- 0x101D0, 0x101FF, "Phaistos Disc",
- 0x10280, 0x1029F, "Lycian",
- 0x102A0, 0x102DF, "Carian",
- 0x102E0, 0x102FF, "Coptic Epact Numbers",
- 0x10300, 0x1032F, "Old Italic",
- 0x10330, 0x1034F, "Gothic",
- 0x10350, 0x1037F, "Old Permic",
- 0x10380, 0x1039F, "Ugaritic",
- 0x103A0, 0x103DF, "Old Persian",
- 0x10400, 0x1044F, "Deseret",
- 0x10450, 0x1047F, "Shavian",
- 0x10480, 0x104AF, "Osmanya",
- 0x10500, 0x1052F, "Elbasan",
- 0x10530, 0x1056F, "Caucasian Albanian",
- 0x10600, 0x1077F, "Linear A",
- 0x10800, 0x1083F, "Cypriot Syllabary",
- 0x10840, 0x1085F, "Imperial Aramaic",
- 0x10860, 0x1087F, "Palmyrene",
- 0x10880, 0x108AF, "Nabataean",
- 0x10900, 0x1091F, "Phoenician",
- 0x10920, 0x1093F, "Lydian",
- 0x10980, 0x1099F, "Meroitic Hieroglyphs",
- 0x109A0, 0x109FF, "Meroitic Cursive",
- 0x10A00, 0x10A5F, "Kharoshthi",
- 0x10A60, 0x10A7F, "Old South Arabian",
- 0x10A80, 0x10A9F, "Old North Arabian",
- 0x10AC0, 0x10AFF, "Manichaean",
- 0x10B00, 0x10B3F, "Avestan",
- 0x10B40, 0x10B5F, "Inscriptional Parthian",
- 0x10B60, 0x10B7F, "Inscriptional Pahlavi",
- 0x10B80, 0x10BAF, "Psalter Pahlavi",
- 0x10C00, 0x10C4F, "Old Turkic",
- 0x10E60, 0x10E7F, "Rumi Numeral Symbols",
- 0x11000, 0x1107F, "Brahmi",
- 0x11080, 0x110CF, "Kaithi",
- 0x110D0, 0x110FF, "Sora Sompeng",
- 0x11100, 0x1114F, "Chakma",
- 0x11150, 0x1117F, "Mahajani",
- 0x11180, 0x111DF, "Sharada",
- 0x111E0, 0x111FF, "Sinhala Archaic Numbers",
- 0x11200, 0x1124F, "Khojki",
- 0x112B0, 0x112FF, "Khudawadi",
- 0x11300, 0x1137F, "Grantha",
- 0x11480, 0x114DF, "Tirhuta",
- 0x11580, 0x115FF, "Siddham",
- 0x11600, 0x1165F, "Modi",
- 0x11680, 0x116CF, "Takri",
- 0x118A0, 0x118FF, "Warang Citi",
- 0x11AC0, 0x11AFF, "Pau Cin Hau",
- 0x12000, 0x123FF, "Cuneiform",
- 0x12400, 0x1247F, "Cuneiform Numbers and Punctuation",
- 0x13000, 0x1342F, "Egyptian Hieroglyphs",
- 0x16800, 0x16A3F, "Bamum Supplement",
- 0x16A40, 0x16A6F, "Mro",
- 0x16AD0, 0x16AFF, "Bassa Vah",
- 0x16B00, 0x16B8F, "Pahawh Hmong",
- 0x16F00, 0x16F9F, "Miao",
- 0x1B000, 0x1B0FF, "Kana Supplement",
- 0x1BC00, 0x1BC9F, "Duployan",
- 0x1BCA0, 0x1BCAF, "Shorthand Format Controls",
- 0x1D000, 0x1D0FF, "Byzantine Musical Symbols",
- 0x1D100, 0x1D1FF, "Musical Symbols",
- 0x1D200, 0x1D24F, "Ancient Greek Musical Notation",
- 0x1D300, 0x1D35F, "Tai Xuan Jing Symbols",
- 0x1D360, 0x1D37F, "Counting Rod Numerals",
- 0x1D400, 0x1D7FF, "Mathematical Alphanumeric Symbols",
- 0x1E800, 0x1E8DF, "Mende Kikakui",
- 0x1EE00, 0x1EEFF, "Arabic Mathematical Alphabetic Symbols",
- 0x1F000, 0x1F02F, "Mahjong Tiles",
- 0x1F030, 0x1F09F, "Domino Tiles",
- 0x1F0A0, 0x1F0FF, "Playing Cards",
- 0x1F100, 0x1F1FF, "Enclosed Alphanumeric Supplement",
- 0x1F200, 0x1F2FF, "Enclosed Ideographic Supplement",
- 0x1F300, 0x1F5FF, "Miscellaneous Symbols and Pictographs",
- 0x1F600, 0x1F64F, "Emoticons",
- 0x1F650, 0x1F67F, "Ornamental Dingbats",
- 0x1F680, 0x1F6FF, "Transport and Map Symbols",
- 0x1F700, 0x1F77F, "Alchemical Symbols",
- 0x1F780, 0x1F7FF, "Geometric Shapes Extended",
- 0x1F800, 0x1F8FF, "Supplemental Arrows-C",
- 0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B",
- 0x2A700, 0x2B73F, "CJK Unified Ideographs Extension C",
- 0x2B740, 0x2B81F, "CJK Unified Ideographs Extension D",
- 0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement",
- 0xE0000, 0xE007F, "Tags",
- 0xE0100, 0xE01EF, "Variation Selectors Supplement",
- 0xF0000, 0xFFFFF, "Supplementary Private Use Area-A",
- 0x100000, 0x10FFFF, "Supplementary Private Use Area-B",
]
var UNICODE_BLOCK_COUNT = UNICODE_BLOCK_LIST.length / 3
var UNICODE_LOOKUP = {}
@@ -259,6 +11,14 @@ var unicode = (function(){
UNICODE_LOOKUP[ UNICODE_BLOCK_LIST[i+2] ] = [ UNICODE_BLOCK_LIST[i], UNICODE_BLOCK_LIST[i+1] ]
}
+ function block (name, n){ // characters of the named block; the outer `n` parameter is never read (the callback below shadows it)
+ var b = UNICODE_LOOKUP[name] // [ first, last ] pair stored for this block name
+ if (! b) return "" // unknown name: returns an empty string, not an empty array
+ return range.apply(null, b).map(function(n){ return String.fromCharCode(n) }) // one single-unit string per value produced by range(first, last)
+ }
+ function entities (a) { // joins each inner array with ";" plus a trailing ";". NOTE(review): the "" prefix and the raw line break inside join(...) look damaged in this patch text - presumably an HTML numeric-entity prefix and a line-break separator; confirm against the real file
+ return a.map(function(k){ return "" + k.join(";") + ";" }).join("
")
+ }
function index (j) {
return [ UNICODE_BLOCK_LIST[j*3], UNICODE_BLOCK_LIST[j*3+1], UNICODE_BLOCK_LIST[j*3+2], [] ]
}
@@ -270,89 +30,7 @@ var unicode = (function(){
}
return a
}
- function paginate (a, n){
- var aa = [], ai, i = 0
- while (i < 100) {
- ai = a.slice(i * n, (i+1) * n)
- if (! ai.length) break
- aa.push(ai)
- i++
- }
- return aa
- }
- function block (name, n){
- var b = UNICODE_LOOKUP[name]
- if (! b) return ""
- return range.apply(null, b).map(function(n){ return String.fromCharCode(n) })
- }
- function entities (a) {
- return a.map(function(k){ return "" + k.join(";") + ";" }).join("
")
- }
- function findGroups (chars){
- var groups = [], row, list
- for (var i = 0, j = -1, next = -1, len = chars.length; i < len; i++) {
- if (chars[i] < next) {
- list.push(chars[i])
- continue
- }
- do {
- j += 1
- next = UNICODE_BLOCK_LIST[(j+1)*3]
- } while (chars[i] > next)
- row = index(j)
- list = row[3]
- groups.push( row )
- }
- return groups
- }
-
- // encodes unicode characters as escaped utf16 - \xFFFF
- // encodes ONLY non-ascii characters
- function escapeToUtf16 (txt) {
- var escaped_txt = "", kode
- for (var i = 0; i < txt.length; i++) {
- kode = txt.charCodeAt(i)
- if (kode > 0x7f) {
- kode = kode.toString(16)
- switch (kode.length) {
- case 2:
- kode = "0" + kode
- case 3:
- kode = "0" + kode
- }
- escaped_txt += "\\u" + kode
- }
- else {
- escaped_txt += txt[i]
- }
- }
- return escaped_txt
- }
- // encodes unicode characters as escaped bytes - \xFF
- // encodes ONLY non-ascii characters
- function escapeToEscapedBytes (txt) {
- var escaped_txt = "", kode, utf8_bytes
- for (var i = 0; i < txt.length; i++) {
- kode = txt.charCodeAt(i)
- if (kode > 0x7f) {
- utf8_bytes = convertUnicodeCodePointToUtf8Bytes(kode)
- escaped_txt += convertBytesToEscapedString(utf8_bytes, 16)
- }
- else {
- escaped_txt += txt[i]
- }
- }
- return escaped_txt
- }
-
- // encodes unicode characters as escaped bytes - \xFF
- // encodes an ENTIRE string
- function escapeAllToEscapedBytes(str, base) {
- var unicode_codes = convertStringToUnicodeCodePoints(str);
- var data_bytes = convertUnicodeCodePointsToBytes(unicode_codes);
- return convertBytesToEscapedString(data_bytes, 16);
- }
// [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => '\xE3\x81\x82\xE3\x81\x84'
// [ 0343, 0201, 0202, 0343, 0201, 0204 ] => '\343\201\202\343\201\204'
function convertBytesToEscapedString(data_bytes, base) {
@@ -365,14 +43,27 @@ var unicode = (function(){
}
return escaped;
}
- // [ 0x3042, 0x3044 ] => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
- function convertUnicodeCodePointsToBytes(unicode_codes) {
- var utf8_bytes = [];
- for (var i = 0; i < unicode_codes.length; ++i) {
- var bytes = convertUnicodeCodePointToUtf8Bytes(unicode_codes[i]);
- utf8_bytes = utf8_bytes.concat(bytes);
+ // r'\xE3\x81\x82\xE3\x81\x84' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
+ // Only "\x" escapes are parsed: each byte is read from at most two hex digits after "\x"; returns an array of byte values.
+ function convertEscapedBytesToBytes(str) {
+ var parts = str.split("\\x");
+ parts.shift(); // Drop the text before the first "\x"; it is not an escape.
+ var codes = [];
+ var max = Math.pow(2, 8); // Byte values must be below 256.
+ for (var i = 0; i < parts.length; ++i) {
+ var code = parseInt(parts[i].slice(0, 2), 16); // Two digits only, so "\x82def" still yields 0x82 instead of 0x82DEF.
+ if (code >= 0 && code < max) {
+ codes.push(code);
+ } else {
+ // Malformed code ignored (NaN fails both comparisons).
+ }
}
- return utf8_bytes;
+ return codes;
+ }
+ // [ 0x3042, 0x3044 ] => "あい"
+ function convertUnicodeCodePointsToString(unicode_codes) {
+ var utf16_codes = convertUnicodeCodePointsToUtf16Codes(unicode_codes); // values of 0x10000 and above become two surrogate units
+ return convertUtf16CodesToString(utf16_codes);
}
// 0x3042 => [ 0xE3, 0x81, 0x82 ]
function convertUnicodeCodePointToUtf8Bytes(unicode_code) {
@@ -394,62 +85,22 @@ var unicode = (function(){
}
return utf8_bytes;
}
- // "ã‚ã„" => [ 0x3042, 0x3044 ]
- function convertStringToUnicodeCodePoints(str) {
- var surrogate_1st = 0;
- var unicode_codes = [];
- for (var i = 0; i < str.length; ++i) {
- var utf16_code = str.charCodeAt(i);
- if (surrogate_1st != 0) {
- if (utf16_code >= 0xDC00 && utf16_code <= 0xDFFF) {
- var surrogate_2nd = utf16_code;
- var unicode_code = (surrogate_1st - 0xD800) * (1 << 10) + (1 << 16) +
- (surrogate_2nd - 0xDC00);
- unicode_codes.push(unicode_code);
- } else {
- // Malformed surrogate pair ignored.
- }
- surrogate_1st = 0;
- } else if (utf16_code >= 0xD800 && utf16_code <= 0xDBFF) {
- surrogate_1st = utf16_code;
+ // [ 0x3042, 0x3044 ] => [ 0x3042, 0x3044 ] (values below 0x10000 are copied unchanged)
+ // [ 0x2000B ] => [ 0xD840, 0xDC0B ] // A surrogate pair.
+ function convertUnicodeCodePointsToUtf16Codes(unicode_codes) {
+ var utf16_codes = [];
+ for (var i = 0; i < unicode_codes.length; ++i) {
+ var unicode_code = unicode_codes[i];
+ if (unicode_code < (1 << 16)) {
+ utf16_codes.push(unicode_code);
} else {
- unicode_codes.push(utf16_code);
+ var first = ((unicode_code - (1 << 16)) >> 10) + 0xD800; // integer shift; "/" left a fractional high surrogate in the array
+ var second = (unicode_code % (1 << 10)) + 0xDC00; // low 10 bits select the low surrogate
+ utf16_codes.push(first);
+ utf16_codes.push(second);
}
}
- return unicode_codes;
- }
- // 0xff => "ff"
- // 0xff => "377"
- function formatNumber(number, base, num_digits) {
- var str = number.toString(base).toUpperCase();
- for (var i = str.length; i < num_digits; ++i) {
- str = "0" + str;
- }
- return str;
- }
-
- // convert \xFF\xFF\xFF to unicode
- function unescapeFromEscapedBytes (str) {
- var data_bytes = convertEscapedBytesToBytes(str);
- var unicode_codes = convertUtf8BytesToUnicodeCodePoints(data_bytes);
- return convertUnicodeCodePointsToString(unicode_codes);
- }
- // r'\xE3\x81\x82\xE3\x81\x84' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
- // r'\343\201\202\343\201\204' => [ 0343, 0201, 0202, 0343, 0201, 0204 ]
- function convertEscapedBytesToBytes(str) {
- var parts = str.split("\\x");
- parts.shift(); // Trim the first element.
- var codes = [];
- var max = Math.pow(2, 8);
- for (var i = 0; i < parts.length; ++i) {
- var code = parseInt(parts[i], 16);
- if (code >= 0 && code < max) {
- codes.push(code);
- } else {
- // Malformed code ignored.
- }
- }
- return codes;
+ return utf16_codes;
}
// [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => [ 0x3042, 0x3044 ]
function convertUtf8BytesToUnicodeCodePoints(utf8_bytes) {
@@ -498,28 +149,6 @@ var unicode = (function(){
unicode_codes.shift(); // Trim the first element.
return unicode_codes;
}
- // [ 0x3042, 0x3044 ] => [ 0x3042, 0x3044 ]
- // [ 0xD840, 0xDC0B ] => [ 0x2000B ] // A surrogate pair.
- function convertUnicodeCodePointsToUtf16Codes(unicode_codes) {
- var utf16_codes = [];
- for (var i = 0; i < unicode_codes.length; ++i) {
- var unicode_code = unicode_codes[i];
- if (unicode_code < (1 << 16)) {
- utf16_codes.push(unicode_code);
- } else {
- var first = ((unicode_code - (1 << 16)) / (1 << 10)) + 0xD800;
- var second = (unicode_code % (1 << 10)) + 0xDC00;
- utf16_codes.push(first)
- utf16_codes.push(second)
- }
- }
- return utf16_codes;
- }
- // [ 0x3042, 0x3044 ] => "ã‚ã„"
- function convertUnicodeCodePointsToString(unicode_codes) {
- var utf16_codes = convertUnicodeCodePointsToUtf16Codes(unicode_codes);
- return convertUtf16CodesToString(utf16_codes);
- }
// [ 0x3042, 0x3044 ] => "ã‚ã„"
function convertUtf16CodesToString(utf16_codes) {
var unescaped = '';
@@ -528,6 +157,39 @@ var unicode = (function(){
}
return unescaped;
}
+ // formatNumber(0xff, 16, 2) => "FF" (digits are upper-cased)
+ // formatNumber(0xff, 8, 3) => "377"
+ function formatNumber(number, base, num_digits) {
+ var str = number.toString(base).toUpperCase();
+ for (var i = str.length; i < num_digits; ++i) { // left-pad with "0" up to num_digits characters
+ str = "0" + str;
+ }
+ return str; // values already longer than num_digits come back unpadded and untruncated
+ }
+
+ // Returns txt with every UTF-16 unit above 0x7F replaced by escaped bytes (\xFF form).
+ // Units at or below 0x7F are copied through unchanged.
+ function escapeToEscapedBytes (txt) {
+ var escaped_txt = "", kode, utf8_bytes
+ for (var i = 0; i < txt.length; i++) {
+ kode = txt.charCodeAt(i) // one UTF-16 unit per step, so a surrogate pair is handled as two separate values
+ if (kode > 0x7f) {
+ utf8_bytes = convertUnicodeCodePointToUtf8Bytes(kode) // e.g. 0x3042 => [ 0xE3, 0x81, 0x82 ] per that helper's header comment
+ escaped_txt += convertBytesToEscapedString(utf8_bytes, 16) // presumably base 16 selects the \xFF style shown in that helper's examples
+ }
+ else {
+ escaped_txt += txt[i]
+ }
+ }
+ return escaped_txt
+ }
+
+ // Decode each run of \xFF escapes as UTF-8 text; everything outside a run is kept unchanged.
+ function unescapeFromEscapedBytes (str) {
+ return str.replace(/(?:\\x[0-9A-Fa-f]{1,2})+/g, function (run) { // run = one or more adjacent \xHH escapes
+ return convertUnicodeCodePointsToString(convertUtf8BytesToUnicodeCodePoints(convertEscapedBytesToBytes(run)));
+ }); // unescaped text is no longer discarded, and text after an escape no longer corrupts its byte
+ }
return {
raw: UNICODE_BLOCK_LIST,
@@ -535,8 +197,6 @@ var unicode = (function(){
index: index,
range: range,
block: block,
- findGroups: findGroups,
- paginate: paginate,
escapeToEscapedBytes: escapeToEscapedBytes,
unescapeFromEscapedBytes: unescapeFromEscapedBytes,
}