// Unicode helpers: a small table of named Unicode blocks, plus conversions
// between strings, code points, UTF-8 bytes and "\xHH" escaped-byte text.
//
// Exposed members:
//   raw     - the flat block table (start, end, name triples)
//   lookup  - map of block name => [ start, end ]
//   index   - fetch the j-th block as [ start, end, name, [] ]
//   range   - inclusive integer range as an array
//   block   - array of one-character strings for a named block
//   escapeToEscapedBytes / unescapeFromEscapedBytes - "\xHH" UTF-8 escaping
var unicode = (function(){

  // Flat list of triples: first code point, last code point (inclusive), name.
  var UNICODE_BLOCK_LIST = [
    0x0020, 0x007F, "Basic Latin",
    0x0080, 0x00FF, "Latin-1 Supplement",
    0x2500, 0x257F, "Box Drawing",
    0x2580, 0x259F, "Block Elements",
  ];
  var UNICODE_BLOCK_COUNT = UNICODE_BLOCK_LIST.length / 3;

  // name => [ start, end ]
  var UNICODE_LOOKUP = {};
  for (var k = 0; k < UNICODE_BLOCK_COUNT; k++) {
    UNICODE_LOOKUP[ UNICODE_BLOCK_LIST[k*3+2] ] = [ UNICODE_BLOCK_LIST[k*3], UNICODE_BLOCK_LIST[k*3+1] ];
  }

  // Returns an array with one single-character string per code point of the
  // named block, or "" when the name is unknown (the empty string is kept for
  // backward compatibility). The second parameter is unused.
  function block (name, n){
    // Own-property check so names like "constructor" do not hit Object.prototype.
    if (! Object.prototype.hasOwnProperty.call(UNICODE_LOOKUP, name)) return "";
    var bounds = UNICODE_LOOKUP[name];
    if (! bounds) return "";
    return range(bounds[0], bounds[1]).map(function(code){
      // Goes through the UTF-16 conversion so code points above 0xFFFF
      // become a surrogate pair instead of being truncated to 16 bits.
      return convertUnicodeCodePointsToString([ code ]);
    });
  }

  // Turns an array of code arrays into HTML numeric entities, one input row per
  // output line: [ [ 65, 66 ], [ 67 ] ] => "&#65;&#66;\n&#67;"
  // Not referenced by the visible code and not exported.
  function entities (a) {
    return a.map(function(codes){
      return "&#" + codes.join(";&#") + ";";
    }).join("\n");
  }

  // Returns the j-th block as [ start, end, name, [] ]. The trailing array is
  // a fresh, empty array on every call.
  function index (j) {
    return [
      UNICODE_BLOCK_LIST[j*3],
      UNICODE_BLOCK_LIST[j*3+1],
      UNICODE_BLOCK_LIST[j*3+2],
      []
    ];
  }

  // Inclusive range: range(3, 6) => [ 3, 4, 5, 6 ]. Returns [] when m > n.
  function range(m,n){
    var values = [];
    if (m > n) return values;
    for (var j = m; j <= n; j++) {
      values.push(j);
    }
    return values;
  }

  // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => '\xE3\x81\x82\xE3\x81\x84'  (base 16)
  // [ 0343, 0201, 0202, 0343, 0201, 0204 ] => '\343\201\202\343\201\204'  (any other base)
  function convertBytesToEscapedString(data_bytes, base) {
    // Prefix and width depend only on the base, so compute them once.
    var is_hex = base === 16;
    var prefix = is_hex ? "\\x" : "\\";
    var num_digits = is_hex ? 2 : 3;
    var escaped = '';
    for (var i = 0; i < data_bytes.length; ++i) {
      escaped += prefix + formatNumber(data_bytes[i], base, num_digits);
    }
    return escaped;
  }

  // '\xE3\x81\x82\xE3\x81\x84' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
  // Only the hexadecimal "\xHH" form is recognised. At most two hex digits are
  // consumed per escape, so text following an escape (even text that looks
  // like hex, e.g. the "b" in "\x82b") never corrupts the byte value.
  // Anything that is not an escape is skipped.
  function convertEscapedBytesToBytes(str) {
    var codes = [];
    var pattern = /\\x([0-9A-Fa-f]{1,2})/g;
    var match;
    while ((match = pattern.exec(str)) !== null) {
      codes.push(parseInt(match[1], 16));
    }
    return codes;
  }

  // [ 0x3042, 0x3044 ] => "あい"
  function convertUnicodeCodePointsToString(unicode_codes) {
    var utf16_codes = convertUnicodeCodePointsToUtf16Codes(unicode_codes);
    return convertUtf16CodesToString(utf16_codes);
  }

  // 0x3042 => [ 0xE3, 0x81, 0x82 ]
  // Code points of 1 << 21 and above cannot be encoded and yield [].
  function convertUnicodeCodePointToUtf8Bytes(unicode_code) {
    var utf8_bytes = [];
    if (unicode_code < 0x80) {  // 1-byte
      utf8_bytes.push(unicode_code);
    } else if (unicode_code < (1 << 11)) {  // 2-byte
      utf8_bytes.push((unicode_code >>> 6) | 0xC0);
      utf8_bytes.push((unicode_code & 0x3F) | 0x80);
    } else if (unicode_code < (1 << 16)) {  // 3-byte
      utf8_bytes.push((unicode_code >>> 12) | 0xE0);
      utf8_bytes.push(((unicode_code >> 6) & 0x3F) | 0x80);
      utf8_bytes.push((unicode_code & 0x3F) | 0x80);
    } else if (unicode_code < (1 << 21)) {  // 4-byte
      utf8_bytes.push((unicode_code >>> 18) | 0xF0);
      utf8_bytes.push(((unicode_code >> 12) & 0x3F) | 0x80);
      utf8_bytes.push(((unicode_code >> 6) & 0x3F) | 0x80);
      utf8_bytes.push((unicode_code & 0x3F) | 0x80);
    }
    return utf8_bytes;
  }

  // [ 0x3042, 0x3044 ] => [ 0x3042, 0x3044 ]
  // [ 0x2000B ]        => [ 0xD840, 0xDC0B ]  // A surrogate pair.
  function convertUnicodeCodePointsToUtf16Codes(unicode_codes) {
    var utf16_codes = [];
    for (var i = 0; i < unicode_codes.length; ++i) {
      var unicode_code = unicode_codes[i];
      if (unicode_code < 0x10000) {
        utf16_codes.push(unicode_code);
      } else {
        // Integer shift/mask: plain division would leave a fractional
        // high surrogate in the output array.
        var offset = unicode_code - 0x10000;
        utf16_codes.push(0xD800 + (offset >> 10));
        utf16_codes.push(0xDC00 + (offset & 0x3FF));
      }
    }
    return utf16_codes;
  }

  // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => [ 0x3042, 0x3044 ]
  // Malformed input is skipped rather than reported:
  //  - values outside 0..255 and invalid lead bytes (0xF8-0xFF) are ignored
  //    without touching the sequence currently being assembled;
  //  - a continuation byte with no sequence in progress is ignored;
  //  - a sequence cut short by a new lead byte, or by the end of input,
  //    is dropped.
  // Overlong encodings and encoded surrogates are not rejected.
  function convertUtf8BytesToUnicodeCodePoints(utf8_bytes) {
    var unicode_codes = [];
    var unicode_code = 0;   // code point being assembled
    var num_followed = 0;   // continuation bytes still expected
    for (var i = 0; i < utf8_bytes.length; ++i) {
      var utf8_byte = utf8_bytes[i];
      if (utf8_byte < 0 || utf8_byte >= 0x100) {
        continue;  // Not a byte.
      }
      if ((utf8_byte & 0xC0) === 0x80) {  // continuation byte
        if (num_followed > 0) {
          unicode_code = (unicode_code << 6) | (utf8_byte & 0x3F);
          num_followed -= 1;
          // Emit as soon as the sequence is complete, so no later byte
          // can cause the same code point to be emitted a second time.
          if (num_followed === 0) {
            unicode_codes.push(unicode_code);
          }
        }
        continue;
      }
      // Lead byte: a valid one abandons any unfinished sequence.
      if (utf8_byte < 0x80) {  // 1-byte
        unicode_codes.push(utf8_byte);
        num_followed = 0;
      } else if ((utf8_byte & 0xE0) === 0xC0) {  // 2-byte
        unicode_code = utf8_byte & 0x1F;
        num_followed = 1;
      } else if ((utf8_byte & 0xF0) === 0xE0) {  // 3-byte
        unicode_code = utf8_byte & 0x0F;
        num_followed = 2;
      } else if ((utf8_byte & 0xF8) === 0xF0) {  // 4-byte
        unicode_code = utf8_byte & 0x07;
        num_followed = 3;
      }
      // else: invalid lead byte, ignored.
    }
    return unicode_codes;
  }

  // [ 0x3042, 0x3044 ] => "あい"
  function convertUtf16CodesToString(utf16_codes) {
    var unescaped = '';
    for (var i = 0; i < utf16_codes.length; ++i) {
      unescaped += String.fromCharCode(utf16_codes[i]);
    }
    return unescaped;
  }

  // Upper-case, zero-padded representation of a number in the given base.
  // (0xff, 16, 2) => "FF"
  // (0xff, 8, 3)  => "377"
  function formatNumber(number, base, num_digits) {
    var str = number.toString(base).toUpperCase();
    while (str.length < num_digits) {
      str = "0" + str;
    }
    return str;
  }

  // Encodes every non-ASCII character as its UTF-8 bytes in "\xHH" form and
  // leaves ASCII characters untouched: "aあ" => "a\xE3\x81\x82"
  function escapeToEscapedBytes (txt) {
    var escaped_txt = "";
    for (var i = 0; i < txt.length; i++) {
      var kode = txt.charCodeAt(i);
      // Merge a high/low surrogate pair into one code point so characters
      // above U+FFFF become a single 4-byte UTF-8 sequence rather than two
      // separately encoded surrogates.
      if (kode >= 0xD800 && kode <= 0xDBFF && i + 1 < txt.length) {
        var low = txt.charCodeAt(i + 1);
        if (low >= 0xDC00 && low <= 0xDFFF) {
          kode = 0x10000 + ((kode - 0xD800) << 10) + (low - 0xDC00);
          i++;
        }
      }
      if (kode > 0x7F) {
        escaped_txt += convertBytesToEscapedString(convertUnicodeCodePointToUtf8Bytes(kode), 16);
      } else {
        escaped_txt += String.fromCharCode(kode);
      }
    }
    return escaped_txt;
  }

  // Inverse of escapeToEscapedBytes: every run of "\xHH" escapes is decoded as
  // UTF-8, and all other text is passed through unchanged.
  // "a\xE3\x81\x82" => "aあ"
  function unescapeFromEscapedBytes (str) {
    return str.replace(/(?:\\x[0-9A-Fa-f]{1,2})+/g, function (run) {
      var data_bytes = convertEscapedBytesToBytes(run);
      var unicode_codes = convertUtf8BytesToUnicodeCodePoints(data_bytes);
      return convertUnicodeCodePointsToString(unicode_codes);
    });
  }

  return {
    raw: UNICODE_BLOCK_LIST,
    lookup: UNICODE_LOOKUP,
    index: index,
    range: range,
    block: block,
    escapeToEscapedBytes: escapeToEscapedBytes,
    unescapeFromEscapedBytes: unescapeFromEscapedBytes,
  };
})();