Files
rappaurio-sae501_502/app/node_modules/cejs/data/character.js
2023-09-25 13:27:24 +02:00

633 lines
18 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* @name CeL function for character encoding
* @fileoverview 本檔案包含了文字/字元編碼用的 functions。文字コード変換ライブラリ
*
* @example <code>
* CeL.run('data.character',function(){
* // ..
* });
* </code>
*
* @since 2017/1/22 22:38:52
*
* @see [[en:binary-to-text encoding]], [[en:character encoding]]
*/
'use strict';
// 'use asm';
// --------------------------------------------------------------------------------------------
// Register this module with the CeL loader, but only when the library is present.
if (typeof CeL === 'function') {
	CeL.run({
		// module name
		name : 'data.character',
		// required for String.prototype.chars()
		require : 'data.native.',
		// sub-functions that should not be exported
		no_extend : 'add_map',
		// The module body lives in a separate function so the code is
		// easier to format.
		code : module_code
	});
}
function module_code(library_namespace) {
var module_name = this.id;
/**
* Null module constructor. The function object `_` is used as the holder
* onto which this module attaches its members (e.g. `_.is_loaded`) and is
* what module_code() finally returns.
*
* @class functions for handling character encodings
*/
var _// JSDT:_module_
= function() {
// null module constructor: intentionally empty
};
/**
* for JSDT: it is only treated as a Class when it has a prototype
*/
_// JSDT:_module_
.prototype = {};
// TODO: not implemented yet.
/**
 * Placeholder for a Base64 encoder. It walks over `text` without producing
 * any output, so it currently always returns an empty string.
 *
 * @param {String}text
 *            text to encode; must have a `length` property
 * @returns {String} always '' for now
 */
function Base64(text) {
	var pieces = [];
	var position = 0;
	var total = text.length;
	while (position < total) {
		// Idea for the implementation: Buffer.from('編碼');
		position++;
	}
	return pieces.join('');
}
// =============================================================================================
// character encoding
var
	/** {Natural}radix used for hexadecimal byte codes (base 16) */
	HEX_BASE = 0x10,
	// Official encoding names, preset so that load_code_map() can use them.
	// code_of_alias[key already preprocessed by normalize_encoding_name(encoding)]
	// = module/file name below data/encoding/
	code_of_alias = {
		big5 : 'Big5',
		gbk : 'GBK',
		// Practicality first: every alias is directed to the newest, most
		// extended code set.
		gb2312 : 'GBK',
		eucjp : 'EUC-JP',
		shiftjis : 'Shift_JIS',
		sjis : 'Shift_JIS'
	},
	// coding map / config hash for decoding specified coding to Unicode.
	// map_set[encoding_name]
	// = [ config, [1 byte map], [2 byte map], [3 byte map], [4 byte map] ]
	map_set = Object.create(null),
	// encoding Unicode to specified coding
	// encode_map_set[encoding_name]
	// = {Unicode_char:char_code}
	encode_map_set = Object.create(null),
	/**
	 * {String}REPLACEMENT CHARACTER U+FFFD, '?' in old IE.
	 *
	 * Written as an escape sequence on purpose: a raw U+FFFD in the source
	 * is easily destroyed by tools that re-encode the file, which silently
	 * changes both this constant and UNKNOWN_CHARACTER_CODE.
	 */
	UNKNOWN_CHARACTER = '\uFFFD',
	/** {Natural}code point of UNKNOWN_CHARACTER (0xFFFD) */
	UNKNOWN_CHARACTER_CODE = UNKNOWN_CHARACTER.codePointAt(0);
// _.map_set = map_set;
/**
 * Resolve an encoding name or alias to the name used as key of the encoding
 * tables.
 *
 * The name is trimmed; for the alias lookup it is additionally lower-cased
 * and stripped of "-", "_" and spaces, so "Big-5", "big_5" and "BIG 5" all
 * hit the same alias entry. Names without a registered alias are returned
 * trimmed but otherwise unchanged.
 *
 * @param {String}encoding
 *            encoding name or alias (coerced with String())
 * @returns {String} the registered official name, or the trimmed input
 */
function normalize_encoding_name(encoding) {
	encoding = String(encoding).trim();
	var alias_key = encoding.toLowerCase().replace(/[-_ ]+/g, '');
	// Own properties only: code_of_alias is a plain object, so a bare lookup
	// would hand back inherited members (e.g. the Object constructor for
	// the name "constructor") instead of a string.
	return Object.prototype.hasOwnProperty.call(code_of_alias, alias_key)
			&& code_of_alias[alias_key] || encoding;
}
/**
 * Tell whether a decoding map has already been registered for an encoding.
 *
 * @param {String}encoding
 *            encoding name or alias
 * @returns {Boolean} true if the normalized name is a key of map_set
 */
function encoding_is_loaded(encoding) {
	var canonical_name = normalize_encoding_name(encoding);
	return canonical_name in map_set;
}
_.is_loaded = encoding_is_loaded;
/**
* <code>
encoding.map.json規格書:包含map:
{
// to single byte / 2 or multi bytes set, continuous, split by /./u:
start_char_code_in_hex:'map',
// ** deprecated: to single byte / 2 bytes set, continuous, .split('split string'):
start_char_code_in_hex:['map', 'split string'],
// ** deprecated: 2 bytes set, .split('split string'):
start_char_code_in_hex:[start of second byte, 'map', 'split string'],
// ** deprecated: .split(''):
start_char_code_in_hex:[start of second byte, 'map', ''],
// ** deprecated: split by /./u:
start_char_code_in_hex:[start of second byte, 'map'],
// ** deprecated: convert single code to single string
start_char_code_in_hex:['map', 0],
// 這邊的count表示中間有count個字元分別是自char開始unicode編碼之後的序列。
start_char_code_in_hex:['char', {Natural}count, 'char', {Natural}count],
}
e.g.,
// split by .chars(true)
{'A180':[0x80,'~~~~~~'],'A4B3':'##'}
// .split('')
{'A180':[0x80,'~~~~~~', ''],'A4B3':['#,#',',']}
to_multi的不能跨越to_single的範圍。
e.g.,
{'A1FF':[0xFF,'abcde'],'A2FF':'12','A4B3':'~'}
'A2FF','A4B3': 不在'A1FF'範圍內: A1FF:a, A2FF:b, A3FF:c, ...
實作將直接以+1的方式配入 convert_map 中因此A2FF之第二組"2"將被配入A300!
</code>
*
* @see [[en:character encoding]]
* https://github.com/ashtuchkin/iconv-lite/tree/master/encodings/tables
*/
/**
* Register character map data for an encoding. Fills both the decoding table
* map_set[encoding] (byte code → character, one sparse array per byte length)
* and the encoding table encode_map_set[encoding] (character → byte code).
* May be called several times for the same encoding; existing tables are
* extended, not reset.
*
* Keys of map_data that look like a hexadecimal number (optionally prefixed
* with "_") give the starting byte code of a run; every other key is stored
* on the decoding table as a configuration value. A string value assigns
* consecutive byte codes to its characters. An array value may mix strings
* (literal runs) and positive numbers (that many code points following the
* last character of the preceding string).
*
* NOTE(review): array values of map_data are modified in place (shift()) the
* first time the ['\0', count] prefix is consumed.
*
* @param {String}code_name
*            encoding name
* @param {Object}map_data
*            map data as described above
* @throws {Error} if a value is neither a String nor an Array, or if a
*             number in an array has no usable preceding character / is not
*             positive
*/
function add_code_map(code_name, map_data) {
library_namespace.debug(code_name, 1, 'add_code_map');
var encoding = normalize_encoding_name(code_name);
if (!(encoding in code_of_alias)) {
code_of_alias[encoding] = code_name;
}
if (!map_set[encoding]) {
// Do not reset an existing entry, so that maps can be added in several
// calls.
map_set[encoding] = [];
}
// The decoding table doubles as the config holder (named properties on the
// array).
var code_map = map_set[encoding], config = code_map,
// main_encode_map[Unicode character]
// = {ℕ⁰:Natural+0}code of specified coding
main_encode_map = encode_map_set[encoding]
|| (encode_map_set[encoding] = Object.create(null));
// console.log(Object.keys(map_data));
for ( var key in map_data) {
var char_list = map_data[key], matched = key
.match(/^_?([\dA-F]+)$/i);
// console.log([ key, matched, char_list ]);
if (!matched) {
// console.log(key);
// Not a hexadecimal start code: treated as a config entry.
code_map[key] = char_list;
continue;
}
var base_byte_code = matched[1],
// numeric value of the starting byte code
char_code = parseInt(base_byte_code, HEX_BASE),
// byte length of the code: two hexadecimal digits per byte
main_map = Math.ceil(base_byte_code.length / 2);
main_map = code_map[main_map]
// initialize the sparse array.
|| (code_map[main_map] = []);
if (typeof char_list === 'string') {
char_list.chars(true).forEach(function(character) {
if (main_encode_map[character]) {
// The character already has a code: report the remapping.
library_namespace.debug(code_name
// http://founder.acgvlyric.org/iu/doku.php/%E9%80%A0%E5%AD%97:%E5%BA%8F_%E5%B8%B8%E7%94%A8%E9%A6%99%E6%B8%AF%E5%A4%96%E5%AD%97%E8%A1%A8
+ ': character mapping ['
// Apart from a few special characters, most of them map to the code
// that is assigned later.
+ character + ']: 0x'
// @see data/character/Big5.js
+ main_encode_map[character].toString(16).toUpperCase()
// e.g., "包" should be A55D in Big5, not FABD
+ ' → 0x'
// "者" should be AACC in Big5, not 8ECD
+ char_code.toString(16).toUpperCase(),
// debug level and caller name
2, 'add_code_map');
}
// register (the last assignment wins for encoding)
main_encode_map[character] = char_code;
// To keep the byte code recognizable, the mapping to Unicode is always
// set, whichever code ends up being used for encoding.
main_map[char_code++] = character;
});
continue;
}
if (!Array.isArray(char_list)) {
library_namespace.error('Not Array: '
+ JSON.stringify(char_list));
throw new Error('Invalid character code map: ' + code_name
+ '.' + base_byte_code);
}
// start_first_byte: decided once per encoding, by the first array value
// that is processed.
if (!('start_byte_code' in config)) {
if (char_code === 0 && char_list[0] === '\u0000'
&& char_list[1] > 0) {
char_list.shift();
// e.g., ['\0',2] → byte codes < 2+1 can all be converted to a string
// directly.
// NOTE(review): char_code is not advanced past the skipped range here
// - confirm against the map data files.
config.start_byte_code = char_list.shift() + 1;
} else {
config.start_byte_code = 0;
}
}
// console.log(char_list);
// NOTE(review): `var` is function-scoped and there is no initializer, so
// the value left by a previous key is still visible here - confirm that a
// leading number in a later array is meant to continue from it.
var last_char_code;
char_list.forEach(function(slice, index) {
if (typeof slice === 'string') {
// Reuses the outer variable; the array being iterated is unaffected.
char_list = slice.chars(true);
char_list.forEach(function(character) {
// register
main_encode_map[character] = char_code;
main_map[char_code++] = character;
});
// Remember the last character so a following count can continue from it.
last_char_code = char_list.at(-1).codePointAt(0);
return;
}
if (!(last_char_code >= 0) || !(slice > 0)) {
throw new Error('Invalid character of code map: '
+ code_name + '.' + base_byte_code + '.' + slice);
}
// console.log([last_char_code, slice]);
// `slice` is a count: register the next `slice` code points after the
// last character.
var end = last_char_code + slice;
while (last_char_code < end) {
// register
var character = String.fromCodePoint(++last_char_code);
main_encode_map[character] = char_code;
main_map[char_code++] = character;
}
});
}
}
_.add_map = add_code_map;
/**
 * Make sure the map modules of the given encodings are loaded.
 *
 * @param {String|Array}encoding_list
 *            one encoding name or a list of names / aliases
 * @param {Function}[callback]
 *            called without arguments when nothing has to be loaded;
 *            otherwise handed over to library_namespace.run()
 * @returns {Boolean|undefined} true if every encoding was already
 *          registered in map_set, otherwise undefined
 */
function load_code_map(encoding_list, callback) {
	var names = Array.isArray(encoding_list) ? encoding_list
			: [ encoding_list ];
	names = names.map(normalize_encoding_name);

	// Paths of the resources that still have to be loaded.
	var pending_paths = names.filter(function(name) {
		return !(name in map_set);
	}).map(function(name) {
		return library_namespace.get_module_path(module_name, name + '.js');
	});

	if (pending_paths.length === 0) {
		if (callback) {
			callback();
		}
		return true;
	}

	// A single resource is passed as a plain path instead of a list.
	var target = pending_paths.length === 1 ? pending_paths[0]
			: pending_paths;
	library_namespace.debug(target, 1, 'load_code_map');
	library_namespace.run(target, callback);
}
_.load = load_code_map;
// ===============================================================
// String.prototype.encode(), string.encode()
/**
 * Encode the string (`this`) into the bytes of the given encoding.
 *
 * Characters whose first UTF-16 unit is below the encoding's
 * start_byte_code are copied as a single byte. Every other character is
 * looked up in the encode map; a missing (or falsy) entry is replaced by
 * UNKNOWN_CHARACTER_CODE. The code is written most significant byte first,
 * using as many bytes as its value needs.
 *
 * @param {String}encoding
 *            encoding name or alias
 * @param [options]
 *            not used by this function
 * @returns {Buffer} the encoded bytes
 * @throws {Error} if no encode map is registered for the encoding
 */
function String_to_code(encoding, options) {
	encoding = normalize_encoding_name(encoding);
	// 4 bytes per unit is a safety margin: almost always enough, but not
	// guaranteed.
	var output = Buffer.allocUnsafe(4 * this.length), written = 0,
	// to_code[Unicode character] = code in the target encoding
	to_code = encode_map_set[encoding],
	// decoding table of the encoding, which also carries its config
	config = map_set[encoding],
	// codes below this limit are emitted unchanged
	direct_limit = config && config.start_byte_code;

	if (!to_code) {
		throw new Error('Unknown encoding: ' + encoding
				+ '. You may need to ' + module_name + '.load("' + encoding
				+ '") first?');
	}

	// TODO: input that is not split per character, and double / multi byte
	// codes equal to 0x0000, need special handling: here they are treated as
	// 0x00 instead of 0x0000!
	this.chars(true).forEach(function(character) {
		var unit = character.charCodeAt(0);
		if (unit < direct_limit) {
			output[written++] = unit;
			return;
		}

		var value = (to_code[character] || UNKNOWN_CHARACTER_CODE) | 0;
		// Count the bytes needed: one, plus one per further non-empty
		// group of 8 bits (0x100 = 2^8).
		var byte_count = 1;
		for (var rest = value >> 8; rest > 0; rest >>= 8) {
			byte_count++;
		}
		// Fill from the last byte backwards, lowest 8 bits first.
		for (var position = written + byte_count - 1; position >= written; position--) {
			output[position] = value % 0x100;
			value >>= 8;
		}
		written += byte_count;
	});

	// assert: output.length >= written
	return output.slice(0, written);
}
// ===============================================================
if (library_namespace.platform.nodejs) {
	// Buffer.prototype.to_UTF8;
	// Buffer.prototype.to_Big5;
	// Buffer.prototype.to_EUC_JP;

	// Cache the original Buffer.prototype.toString, but only once: if this
	// code runs again, toString is already the patched method and caching it
	// would make native_toString call itself forever.
	if (typeof Buffer.prototype.native_toString !== 'function') {
		Buffer.prototype.native_toString = Buffer.prototype.toString;
	}

	// (An earlier, deprecated override that rethrew the native error used to
	// be assigned here; it was overwritten immediately and has been removed.)

	/**
	 * Treat the content of the Buffer as text in `encoding` and decode it to
	 * a {String}.
	 *
	 * The native implementation is tried first with ALL the arguments it was
	 * given, so `buffer.toString(encoding, start, end)` keeps working for
	 * encodings Node.js knows. Only when the native call throws is the
	 * buffer decoded with the maps of this module.
	 *
	 * @param {String}[encoding]
	 *            native encoding, or an encoding loaded by this module
	 * @param [options]
	 *            passed to code_array_to_String() when falling back; for
	 *            native encodings this position is the native `start`
	 * @returns {String} the decoded text
	 * @throws {Error} from code_array_to_String() if the encoding is unknown
	 *             to both Node.js and this module
	 */
	Buffer.prototype.toString = function Buffer_toString(encoding, options) {
		try {
			// buffer.toString(null) will throw!
			return this.native_toString.apply(this, arguments);
		} catch (e) {
			// Not decodable natively: fall through to the maps of this
			// module.
		}
		// Any error from here on is thrown to the caller directly.
		return code_array_to_String.call(this, encoding, options);
	};
	// TODO: use StringDecoder
}
// Usage example; never executed.
if (false) {
	CeL.run('data.character');
	CeL.character.load('Big-5', function() {
		console.assert('作輩' === Buffer.from('A740BDFA', 'hex').toString(
				'Big-5'));
		var text = '做基本檢測。';
		console.assert(text === text.encode('Big_5').toString('Big 5'));
	});
}
// assert: this = [ byte_code, byte_code, ... ]
/**
* Decode a sequence of byte codes (`this`, an Array or Buffer) from the given
* encoding to a string, using the decoding table registered in map_set.
*
* Bytes below the table's start_byte_code are converted directly with
* String.fromCharCode(). Otherwise bytes are accumulated into one number and
* looked up in the 1-byte map, then the 2-byte map, and so on. When no map
* matches, or the input ends inside a sequence, UNKNOWN_CHARACTER is
* appended.
*
* @param {String}encoding
*            encoding name or alias
* @param [options]
*            not used by this function
* @returns {String} the decoded text
* @throws {Error} if no decoding table is registered for the encoding
*/
function code_array_to_String(encoding, options) {
// check if we can convert the encoding.
encoding = normalize_encoding_name(encoding);
var code_map = map_set[encoding];
if (!code_map) {
// Unknown encoding: e.
// You may need to run CeL.data.character.load("e") first?
throw new Error('Unknown encoding: ' + encoding
+ '. You may need to run ' + module_name + '.load("'
+ encoding + '") first?');
}
// console.log(code_map);
// number of bytes accumulated for the current character; 0 = at a
// character boundary
var code_index = 0,
// converted result
converted = '';
// reminder: value of the bytes accumulated so far.
// max_byte: length of the table array, i.e. one more than the largest byte
// length that has a map.
for (var start_byte_code = code_map.start_byte_code, reminder = 0, max_byte = code_map.length,
// main loop to decode to default inner encoding (Unicode).
byte_index = 0, length = this.length; byte_index < length; byte_index++) {
if (code_index === 0) {
reminder = this[byte_index];
if (reminder < start_byte_code) {
// Directly convertible single byte.
converted += String.fromCharCode(reminder);
continue;
}
} else {
// Append this byte to the accumulated value (0x100 = one byte).
reminder = reminder * 0x100 + this[byte_index];
}
if (++code_index === max_byte) {
// No character could be mapped since this search started.
converted += UNKNOWN_CHARACTER;
// Roll back to the byte following the start of this search.
// NOTE(review): together with the loop's own byte_index++ this appears
// to resume two bytes after the start of the failed sequence - confirm
// that this is intended.
byte_index -= code_index - 2;
// reset
reminder = code_index = 0;
continue;
}
// map for codes of the current byte length (may be undefined)
var map_single = code_map[code_index];
// Disabled debugging aid.
if (false) {
library_namespace.debug('Test ' + code_index + ' bytes: '
+ reminder.toString(HEX_BASE), 6,
'code_array_to_String');
if (map_single) {
library_namespace.debug(map_single.slice(Math.max(0,
reminder - 9), reminder + 9), 9,
'code_array_to_String');
}
}
if (map_single && (reminder in map_single)) {
// found
converted += map_single[reminder];
// reset
reminder = code_index = 0;
}
}
// Input ended in the middle of a sequence.
if (code_index > 0) {
converted += UNKNOWN_CHARACTER;
}
return converted;
}
// ---------------------------------------------------------------
/**
 * Decode an array of bytes (`this`) from the given encoding to a string.
 * Each element must be an integer in 0–0xFF, or a one-character string
 * whose char code is in that range.
 *
 * @param {String}encoding
 *            encoding name or alias
 * @param [options]
 *            passed through to code_array_to_String()
 * @returns {String} result of code_array_to_String()
 * @throws {Error} "Invalid byte: [index] value" for any other element
 */
function Array_to_String(encoding, options) {
	var bytes = this.map(function(item, position) {
		var value = item;
		// Basic validation: one-character strings stand for their char code.
		if (typeof value === 'string' && value.length === 1) {
			value = value.charCodeAt(0);
		}
		var is_byte = typeof value === 'number' && value >= 0
				&& value < 0x100 && (value | 0) === value;
		if (!is_byte) {
			throw new Error('Invalid byte: [' + position + '] ' + value);
		}
		return value;
	});
	return code_array_to_String.call(bytes, encoding, options);
}
// Register [ byte, ... ].decode(encoding, options) through the library's
// set_method() helper.
library_namespace.set_method(Array.prototype, {
	decode : Array_to_String
});
// Register string.encode(encoding, options) and
// string.decode(encoding, options).
library_namespace.set_method(String.prototype, {
	encode : String_to_code,
	/**
	 * Decode a "byte string", in which every character stands for one byte.
	 * assert: /^[\x00-\xFF]*$/i.test(this); the check itself is left to
	 * Array_to_String(), which validates each element.
	 */
	decode : function decode_as_byte_String(encoding, options) {
		// use Array_to_String()
		var byte_characters = this.split('');
		return byte_characters.decode(encoding, options);
	}
});
// ---------------------------------------------------------------
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURIComponent
// /^[*\-._0-9A-Za-z]$/
/** {RegExp}matches any character that encodeURIComponent() would escape */
var PATTERN_has_URI_component_invalid_character = /[^a-zA-Z0-9\-_.!~*'()]/;
// _.PATTERN_has_URI_component_invalid_character =
// PATTERN_has_URI_component_invalid_character;
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
/** {RegExp}matches any character that encodeURI() would escape */
var PATTERN_has_URI_invalid_character = /[^a-zA-Z0-9;,/?:@&=+$\-_.!~*'()#]/;
// _.PATTERN_has_URI_invalid_character = PATTERN_has_URI_invalid_character;
/**
 * encode_URI_component_base_map[byte] = text emitted for that byte without
 * percent-escaping. Bytes without an entry have to be written as %XX.
 *
 * @see https://www.w3.org/TR/html5/forms.html#url-encoded-form-data
 */
var encode_URI_component_base_map = [];
// " "→"+"
encode_URI_component_base_map[0x20] = '+';
(function() {
	// Scan 0x2A ("*") through 0x7A ("z"), both inclusive. Within this range
	// the pattern leaves exactly * - . 0-9 A-Z _ a-z, the bytes that
	// form encoding keeps as is. The upper bound must be inclusive, or "z"
	// would be percent-escaped.
	for (var code = 0x2A; code <= 0x7A; code++) {
		var character = String.fromCharCode(code);
		if (!PATTERN_has_URI_component_invalid_character.test(character)) {
			encode_URI_component_base_map[code] = character;
		}
	}
})();
/**
 * encodeURIComponent() for arbitrary character encodings.
 *
 * Without an encoding, or for UTF-8, the native encodeURIComponent() is
 * used. Otherwise the string is converted to bytes with
 * string.encode(encoding); bytes listed in encode_URI_component_base_map
 * are emitted as mapped (space becomes "+"), all others as a percent sign
 * followed by exactly two upper-case hexadecimal digits.
 *
 * @param {String}string
 *            text to encode
 * @param {String}[encoding]
 *            encoding name or alias
 * @returns {String} the encoded text
 */
function encode_URI_component(string, encoding) {
	if (!encoding || /^UTF-?8$/i.test(encoding)) {
		// fallback: native methods are faster
		return encodeURIComponent(string);
	}
	// charset
	encoding = normalize_encoding_name(encoding);
	if (false) {
		// for pure Big5, without the Hong Kong Supplementary Character Set
		string = string.replace(/[―喰蔃瀞靝鼗弌鍮蠏覩瑨牐]/g, function(char) {
			return '&#' + char.charCodeAt(0) + ';';
		});
	}
	var encoded = '';
	string.encode(encoding).forEach(function(byte) {
		if (byte in encode_URI_component_base_map) {
			encoded += encode_URI_component_base_map[byte];
			return;
		}
		var hex = byte.toString(0x10).toUpperCase();
		// A percent escape always has two digits: bytes below 0x10 (e.g.
		// "\n") must become "%0A", not "%A".
		encoded += (hex.length < 2 ? '%0' : '%') + hex;
	});
	return encoded;
}
_.encode_URI_component = encode_URI_component;
/**
 * encodeURI() for arbitrary character encodings.
 *
 * Without an encoding, or for UTF-8, the native encodeURI() is used.
 * Otherwise the string is converted to bytes with string.encode(encoding)
 * and each byte is emitted as:
 * <ul>
 * <li>its entry in encode_URI_component_base_map, if any (space becomes
 * "+");</li>
 * <li>the ASCII character itself if encodeURI() would keep it (e.g.
 * "/", "?", ":", "#"), except "+", which stays escaped because this
 * module's decoder turns a literal "+" into a space;</li>
 * <li>otherwise a percent sign followed by exactly two upper-case
 * hexadecimal digits.</li>
 * </ul>
 *
 * The check has to run on the character: testing the pattern against the
 * numeric byte only ever sees decimal digits and therefore never rejects
 * anything.
 *
 * @param {String}string
 *            text to encode
 * @param {String}[encoding]
 *            encoding name or alias
 * @returns {String} the encoded text
 */
function encode_URI(string, encoding) {
	if (!encoding || /^UTF-?8$/i.test(encoding)) {
		// fallback: native methods are faster
		return encodeURI(string);
	}
	// charset
	encoding = normalize_encoding_name(encoding);
	var encoded = '';
	string.encode(encoding).forEach(function(byte) {
		if (byte in encode_URI_component_base_map) {
			encoded += encode_URI_component_base_map[byte];
			return;
		}
		var character = String.fromCharCode(byte);
		// Only ASCII bytes can stand for themselves.
		if (byte < 0x80 && character !== '+'
				&& !PATTERN_has_URI_invalid_character.test(character)) {
			encoded += character;
			return;
		}
		var hex = byte.toString(0x10).toUpperCase();
		// Always two digits: "%0A", not "%A".
		encoded += (hex.length < 2 ? '%0' : '%') + hex;
	});
	return encoded;
}
_.encode_URI = encode_URI;
/**
 * decodeURIComponent() for arbitrary character encodings.
 *
 * Without an encoding, or for UTF-8, the native decodeURIComponent() is
 * used. Otherwise "%XX" escapes, "+" (taken as a space) and characters
 * below U+0100 are collected as bytes and decoded with
 * code_array_to_String(); characters from U+0100 upwards are copied to the
 * result unchanged.
 *
 * @param {String}encoded
 *            percent-encoded text
 * @param {String}[encoding]
 *            encoding name or alias
 * @returns {String} the decoded text
 *
 * @see http://qiita.com/weal/items/3b3ddfb8157047119554
 *      http://polygon-planet-log.blogspot.tw/2012/04/javascript.html
 */
function decode_URI_component(encoded, encoding) {
	if (!encoding || /^UTF-?8$/i.test(encoding)) {
		// fallback
		return decodeURIComponent(encoded);
	}
	// charset
	encoding = normalize_encoding_name(encoding);

	var decoded = '', pending_bytes = [],
	// one escape sequence, or any single UTF-16 unit
	TOKEN = /%([\dA-F]{2})|[\s\S]/ig, token;

	// Decode the bytes collected so far and append them to the result.
	function flush_pending_bytes() {
		if (pending_bytes.length > 0) {
			decoded += code_array_to_String.call(pending_bytes, encoding);
			pending_bytes.length = 0;
		}
	}

	while (token = TOKEN.exec(encoded)) {
		if (token[1]) {
			pending_bytes.push(parseInt(token[1], 0x10));
			continue;
		}
		var character = token[0];
		if (character === '+') {
			// "+"→" "
			pending_bytes.push(0x20);
			continue;
		}
		var unit = character.charCodeAt(0);
		if (unit < 0x100) {
			pending_bytes.push(unit);
			continue;
		}
		// Not a byte: finish the pending byte run, then keep the character.
		flush_pending_bytes();
		decoded += character;
	}
	flush_pending_bytes();
	return decoded;
}
_.decode_URI_component = decode_URI_component;
// https://www.geeksforgeeks.org/difference-between-decodeuricomponent-and-decodeuri-functions-in-javascript/
// decodeURI(): It takes encodeURI(url) string so it cannot decoded
// characters (, / ? : @ & = + $ #)
// TODO: decodeURI("%26") === "%26" && decodeURIComponent("%26") === "&"
_.decode_URI = decode_URI_component;
// ---------------------------------------------------------------
return (_// JSDT:_module_
);
}