2018-10-24 15:14:46 +00:00
|
|
|
var unicode = (function(){
|
|
|
|
var UNICODE_BLOCK_LIST = [
|
|
|
|
0x0020, 0x007F, "Basic Latin",
|
|
|
|
0x0080, 0x00FF, "Latin-1 Supplement",
|
|
|
|
0x2500, 0x257F, "Box Drawing",
|
|
|
|
0x2580, 0x259F, "Block Elements",
|
|
|
|
]
|
|
|
|
var UNICODE_BLOCK_COUNT = UNICODE_BLOCK_LIST.length / 3
|
|
|
|
var UNICODE_LOOKUP = {}
|
|
|
|
for (var i = 0, len = UNICODE_BLOCK_LIST.length; i < len; i += 3) {
|
|
|
|
UNICODE_LOOKUP[ UNICODE_BLOCK_LIST[i+2] ] = [ UNICODE_BLOCK_LIST[i], UNICODE_BLOCK_LIST[i+1] ]
|
|
|
|
}
|
|
|
|
|
|
|
|
function block (name, n){
|
|
|
|
var b = UNICODE_LOOKUP[name]
|
|
|
|
if (! b) return ""
|
|
|
|
return range.apply(null, b).map(function(n){ return String.fromCharCode(n) })
|
|
|
|
}
|
|
|
|
function entities (a) {
|
|
|
|
return a.map(function(k){ return "&#" + k.join(";&#") + ";" }).join("<br>")
|
|
|
|
}
|
2018-10-25 00:50:37 +00:00
|
|
|
function index (j) {
|
|
|
|
return [ UNICODE_BLOCK_LIST[j*3], UNICODE_BLOCK_LIST[j*3+1], UNICODE_BLOCK_LIST[j*3+2], [] ]
|
2018-10-24 15:14:46 +00:00
|
|
|
}
|
2018-10-25 00:50:37 +00:00
|
|
|
function range(m,n){
|
|
|
|
if (m > n) return []
|
|
|
|
var a = new Array (n-m)
|
|
|
|
for (var i = 0, j = m; j <= n; i++, j++) {
|
|
|
|
a[i] = j
|
2018-10-24 15:14:46 +00:00
|
|
|
}
|
2018-10-25 00:50:37 +00:00
|
|
|
return a
|
2018-10-24 15:14:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => '\xE3\x81\x82\xE3\x81\x84'
|
|
|
|
// [ 0343, 0201, 0202, 0343, 0201, 0204 ] => '\343\201\202\343\201\204'
|
|
|
|
function convertBytesToEscapedString(data_bytes, base) {
|
|
|
|
var escaped = '';
|
|
|
|
for (var i = 0; i < data_bytes.length; ++i) {
|
|
|
|
var prefix = (base == 16 ? "\\x" : "\\");
|
|
|
|
var num_digits = base == 16 ? 2 : 3;
|
|
|
|
var escaped_byte = prefix + formatNumber(data_bytes[i], base, num_digits)
|
|
|
|
escaped += escaped_byte;
|
|
|
|
}
|
|
|
|
return escaped;
|
|
|
|
}
|
2018-10-25 00:50:37 +00:00
|
|
|
// r'\xE3\x81\x82\xE3\x81\x84' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
|
|
|
|
// r'\343\201\202\343\201\204' => [ 0343, 0201, 0202, 0343, 0201, 0204 ]
|
|
|
|
function convertEscapedBytesToBytes(str) {
|
|
|
|
var parts = str.split("\\x");
|
|
|
|
parts.shift(); // Trim the first element.
|
|
|
|
var codes = [];
|
|
|
|
var max = Math.pow(2, 8);
|
|
|
|
for (var i = 0; i < parts.length; ++i) {
|
|
|
|
var code = parseInt(parts[i], 16);
|
|
|
|
if (code >= 0 && code < max) {
|
|
|
|
codes.push(code);
|
|
|
|
} else {
|
|
|
|
// Malformed code ignored.
|
|
|
|
}
|
2018-10-24 15:14:46 +00:00
|
|
|
}
|
2018-10-25 00:50:37 +00:00
|
|
|
return codes;
|
|
|
|
}
|
|
|
|
// [ 0x3042, 0x3044 ] => "ã‚ã„"
|
|
|
|
function convertUnicodeCodePointsToString(unicode_codes) {
|
|
|
|
var utf16_codes = convertUnicodeCodePointsToUtf16Codes(unicode_codes);
|
|
|
|
return convertUtf16CodesToString(utf16_codes);
|
2018-10-24 15:14:46 +00:00
|
|
|
}
|
|
|
|
// 0x3042 => [ 0xE3, 0x81, 0x82 ]
|
|
|
|
function convertUnicodeCodePointToUtf8Bytes(unicode_code) {
|
|
|
|
var utf8_bytes = [];
|
|
|
|
if (unicode_code < 0x80) { // 1-byte
|
|
|
|
utf8_bytes.push(unicode_code);
|
|
|
|
} else if (unicode_code < (1 << 11)) { // 2-byte
|
|
|
|
utf8_bytes.push((unicode_code >>> 6) | 0xC0);
|
|
|
|
utf8_bytes.push((unicode_code & 0x3F) | 0x80);
|
|
|
|
} else if (unicode_code < (1 << 16)) { // 3-byte
|
|
|
|
utf8_bytes.push((unicode_code >>> 12) | 0xE0);
|
|
|
|
utf8_bytes.push(((unicode_code >> 6) & 0x3f) | 0x80);
|
|
|
|
utf8_bytes.push((unicode_code & 0x3F) | 0x80);
|
|
|
|
} else if (unicode_code < (1 << 21)) { // 4-byte
|
|
|
|
utf8_bytes.push((unicode_code >>> 18) | 0xF0);
|
|
|
|
utf8_bytes.push(((unicode_code >> 12) & 0x3F) | 0x80);
|
|
|
|
utf8_bytes.push(((unicode_code >> 6) & 0x3F) | 0x80);
|
|
|
|
utf8_bytes.push((unicode_code & 0x3F) | 0x80);
|
|
|
|
}
|
|
|
|
return utf8_bytes;
|
|
|
|
}
|
2018-10-25 00:50:37 +00:00
|
|
|
// [ 0x3042, 0x3044 ] => [ 0x3042, 0x3044 ]
|
|
|
|
// [ 0xD840, 0xDC0B ] => [ 0x2000B ] // A surrogate pair.
|
|
|
|
function convertUnicodeCodePointsToUtf16Codes(unicode_codes) {
|
|
|
|
var utf16_codes = [];
|
|
|
|
for (var i = 0; i < unicode_codes.length; ++i) {
|
|
|
|
var unicode_code = unicode_codes[i];
|
|
|
|
if (unicode_code < (1 << 16)) {
|
|
|
|
utf16_codes.push(unicode_code);
|
2018-10-24 15:14:46 +00:00
|
|
|
} else {
|
2018-10-25 00:50:37 +00:00
|
|
|
var first = ((unicode_code - (1 << 16)) / (1 << 10)) + 0xD800;
|
|
|
|
var second = (unicode_code % (1 << 10)) + 0xDC00;
|
|
|
|
utf16_codes.push(first)
|
|
|
|
utf16_codes.push(second)
|
2018-10-24 15:14:46 +00:00
|
|
|
}
|
|
|
|
}
|
2018-10-25 00:50:37 +00:00
|
|
|
return utf16_codes;
|
2018-10-24 15:14:46 +00:00
|
|
|
}
|
|
|
|
// [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => [ 0x3042, 0x3044 ]
|
|
|
|
function convertUtf8BytesToUnicodeCodePoints(utf8_bytes) {
|
|
|
|
var unicode_codes = [];
|
|
|
|
var unicode_code = 0;
|
|
|
|
var num_followed = 0;
|
|
|
|
for (var i = 0; i < utf8_bytes.length; ++i) {
|
|
|
|
var utf8_byte = utf8_bytes[i];
|
|
|
|
if (utf8_byte >= 0x100) {
|
|
|
|
// Malformed utf8 byte ignored.
|
|
|
|
} else if ((utf8_byte & 0xC0) == 0x80) {
|
|
|
|
if (num_followed > 0) {
|
|
|
|
unicode_code = (unicode_code << 6) | (utf8_byte & 0x3f);
|
|
|
|
num_followed -= 1;
|
|
|
|
} else {
|
|
|
|
// Malformed UTF-8 sequence ignored.
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (num_followed == 0) {
|
|
|
|
unicode_codes.push(unicode_code);
|
|
|
|
} else {
|
|
|
|
// Malformed UTF-8 sequence ignored.
|
|
|
|
}
|
|
|
|
if (utf8_byte < 0x80){ // 1-byte
|
|
|
|
unicode_code = utf8_byte;
|
|
|
|
num_followed = 0;
|
|
|
|
} else if ((utf8_byte & 0xE0) == 0xC0) { // 2-byte
|
|
|
|
unicode_code = utf8_byte & 0x1f;
|
|
|
|
num_followed = 1;
|
|
|
|
} else if ((utf8_byte & 0xF0) == 0xE0) { // 3-byte
|
|
|
|
unicode_code = utf8_byte & 0x0f;
|
|
|
|
num_followed = 2;
|
|
|
|
} else if ((utf8_byte & 0xF8) == 0xF0) { // 4-byte
|
|
|
|
unicode_code = utf8_byte & 0x07;
|
|
|
|
num_followed = 3;
|
|
|
|
} else {
|
|
|
|
// Malformed UTF-8 sequence ignored.
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (num_followed == 0) {
|
|
|
|
unicode_codes.push(unicode_code);
|
|
|
|
} else {
|
|
|
|
// Malformed UTF-8 sequence ignored.
|
|
|
|
}
|
|
|
|
unicode_codes.shift(); // Trim the first element.
|
|
|
|
return unicode_codes;
|
|
|
|
}
|
|
|
|
// [ 0x3042, 0x3044 ] => "ã‚ã„"
|
|
|
|
function convertUtf16CodesToString(utf16_codes) {
|
|
|
|
var unescaped = '';
|
|
|
|
for (var i = 0; i < utf16_codes.length; ++i) {
|
|
|
|
unescaped += String.fromCharCode(utf16_codes[i]);
|
|
|
|
}
|
|
|
|
return unescaped;
|
|
|
|
}
|
2018-10-25 00:50:37 +00:00
|
|
|
// 0xff => "ff"
|
|
|
|
// 0xff => "377"
|
|
|
|
function formatNumber(number, base, num_digits) {
|
|
|
|
var str = number.toString(base).toUpperCase();
|
|
|
|
for (var i = str.length; i < num_digits; ++i) {
|
|
|
|
str = "0" + str;
|
|
|
|
}
|
|
|
|
return str;
|
|
|
|
}
|
|
|
|
|
|
|
|
// encodes unicode characters as escaped bytes - \xFF
|
|
|
|
// encodes ONLY non-ascii characters
|
|
|
|
function escapeToEscapedBytes (txt) {
|
|
|
|
var escaped_txt = "", kode, utf8_bytes
|
|
|
|
for (var i = 0; i < txt.length; i++) {
|
|
|
|
kode = txt.charCodeAt(i)
|
|
|
|
if (kode > 0x7f) {
|
|
|
|
utf8_bytes = convertUnicodeCodePointToUtf8Bytes(kode)
|
|
|
|
escaped_txt += convertBytesToEscapedString(utf8_bytes, 16)
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
escaped_txt += txt[i]
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return escaped_txt
|
|
|
|
}
|
|
|
|
|
|
|
|
// convert \xFF\xFF\xFF to unicode
|
|
|
|
function unescapeFromEscapedBytes (str) {
|
|
|
|
var data_bytes = convertEscapedBytesToBytes(str);
|
|
|
|
var unicode_codes = convertUtf8BytesToUnicodeCodePoints(data_bytes);
|
|
|
|
return convertUnicodeCodePointsToString(unicode_codes);
|
|
|
|
}
|
2018-10-24 15:14:46 +00:00
|
|
|
|
|
|
|
return {
|
|
|
|
raw: UNICODE_BLOCK_LIST,
|
|
|
|
lookup: UNICODE_LOOKUP,
|
|
|
|
index: index,
|
|
|
|
range: range,
|
|
|
|
block: block,
|
|
|
|
escapeToEscapedBytes: escapeToEscapedBytes,
|
|
|
|
unescapeFromEscapedBytes: unescapeFromEscapedBytes,
|
|
|
|
}
|
|
|
|
})()
|