/**
* @name CeL function for encoding and language identification
* @fileoverview 本檔案包含了編碼,例如自動偵測地區語系/文化設定編碼的 functions。言語判定ライブラリ。
* @since
*/
'use strict';
// 'use asm';
// --------------------------------------------------------------------------------------------
// Use `&&` instead of an `if` statement, so that Eclipse JSDoc and the code
// formatter do not indent the module one extra level.
typeof CeL === 'function' && CeL.run({
// module name
name : 'application.locale.encoding',
// initialize_kana_romaji() needs CeL.data.Convert_Pairs()
require : 'data.|data.Convert_Pairs.|data.native.'
//
+ '|application.OS.Windows.new_COM'
//
+ '|application.locale.language_tag'
//
+ '|application.OS.Windows.file.open_file'
// library_namespace.file_exists()
+ '|application.storage.'
//
+ '|application.OS.Windows.file.is_file'
//
+ '|application.OS.Windows.file.AdoEnums'
//
+ '|application.OS.Windows.file.translate_ADO_Stream_binary_data',
// Sub-functions that should not be exported.
// no_extend : '*',
// The module body lives in the separate function module_code(), which makes
// the code easier to format.
code : module_code
});
function module_code(library_namespace) {
// id of this module; initialize_kana_romaji() derives the resource path from
// it.
var module_name = this.id,
// functions of the required modules, fetched with this.r()
new_COM = this.r('new_COM'), language_tag = this.r('language_tag'), open_file = this
.r('open_file'), is_file = this.r('is_file'), AdoEnums = this
.r('AdoEnums'), translate_ADO_Stream_binary_data = this
.r('translate_ADO_Stream_binary_data');
// var to_standard_language_tag = gettext.to_standard;
/**
* null module constructor. The functions of this module are attached to it
* as properties and it is returned at the end of module_code().
*
* @class functions for encoding and language identification
*/
var _// JSDT:_module_
= function() {
// null module constructor
};
/**
* for JSDT: it is only treated as a class when it has a prototype.
*/
_// JSDT:_module_
.prototype = {};
// ----------------------------------------------------------------------------------------------------------------------------------------------------------//
// ascii
var binary_charset = 'ISO-8859-1';
/**
*
TODO:
考慮字頻。
只檢測常用的幾個字,無法判別才廣泛測試。
http://atedev.wordpress.com/2007/09/19/bom-bom-bom/
00 00 fe ff UTF-32, Big Endian
ff fe 00 00 UTF-32, Little Endian
fe ff ## ## UTF-16, Big Endian
ff fe ## ## UTF-16, Little Endian
ef bb bf UTF-8
var FN='I:\\Documents and Settings\\kanashimi\\My Documents\\kanashimi\\www\\cgi-bin\\game\\Shift_JIS.txt',enc=guess_encoding(FN);alert('['+enc+'] '+FN+'\n'+_.read_file(FN,enc).slice(0,900));
*/
/**
* 自動判別檔案(或字串)之編碼 文字エンコーディング判定を行う
autodetect encoding
http://www.hawk.34sp.com/stdpls/dwsh/charset_adodb.html
http://www.ericphelps.com/q193998/
http://hp.vector.co.jp/authors/VA003334/ado/adostream.htm
*/
// guess_encoding[generateCode.dLK]='is_file,open_file,guess_text_language,get_HTML_encoding';
/**
* guess character encoding / character set of file. 偵測檔案編碼。現可偵測中日韓東亞語言。
*
* @param {String}file_path
* 檔案path
* @param {Boolean}[is_HTML]
* 為 HTML 檔案
* @param {Object|Number}[options]
* options, e.g. { reading_length : length of the text to sample }. A number is taken as reading_length.
* @returns
* @see 字符集探测, A composite approach to
* language/encoding detection, 一种语言/编码检测的复合方法, 一种语言/编码检测的复合方法, Automatic Detection of Character Encoding and
* Language,
*/
function guess_encoding(file_path, is_HTML, options) {
// 前置處理。
if (library_namespace.is_Object(is_HTML))
options = is_HTML, is_HTML = undefined;
else if (!library_namespace.is_Object(options)) {
options = isNaN(options) ? Object.create(null) : {
reading_length : options
};
}
var t, code;
if (false && typeof ActiveXObject == 'undefined') {
alert("guess_encoding: Can't find ActiveXObject!");
return;
}
if (false && typeof _.get_HTML_encoding != 'function')
is_HTML = false;
// TODO: using library_namespace.file_exists(file_path) @ build.js
if (!is_file(file_path)) {
library_namespace.debug('Treat [' + file_path + '] as string.');
return file_path.length < 1024 ? guess_encoding.unknown_encoding
: (t = guess_text_language(file_path, undefined, options)) ? t
: (is_HTML || typeof is_HTML === 'undefined')
&& (t = _.get_HTML_encoding(file_path)) ? t
: guess_encoding.unknown_encoding;
}
// 讀 binary data 用 'ISO-8859-1' 會 error encoding.
var ADO_Stream = open_file(file_path, binary_charset);
if (library_namespace.is_type(ADO_Stream, 'Error')) {
return guess_encoding.unknown_encoding;
}
// 0: read all.
var reading_length = options.reading_length;
if (isNaN(reading_length) || (reading_length |= 0) < 0
|| reading_length > guess_encoding.max_length_to_test)
// default
reading_length = is_HTML ? guess_encoding.min_length_of_HTML
: guess_encoding.min_length_to_test;
t = ADO_Stream.ReadText(3);
// t = ADO_Stream.Read(3);
library_namespace.debug(file_path + ': ['
+ t.slice(0, 3).split('').join(',') + ']..', 2);
if (typeof is_HTML === 'undefined')
is_HTML = /\.[xs]?html?$/i.test(file_path);
var question_mark_count;
if (typeof t != 'string') {
// 此時type通常是unknown,不能用+=
// t=''+t;
return guess_encoding.unknown_encoding;
}
/**
* Unicode的Byte Order Mark(BOM)在UTF-16LE(little
* endian)裏,它是以FF-FE這兩個bytes表達,在BE(big
* endian)裏,是FEFF。而在UTF-8裏,它是以EF-BB-BF這三個bytes表達。
*/
if (t.slice(0, 2) === '\xFF\xFE') {
// Unicode big-endian
code = 'unicodeFFFE';
// code = 'UTF-16BE';
// code = 'unicodeFFFE';
} else if (t.slice(0, 2) === '\xFE\xFF') {
// UTF-16LE: Unicode little-endian.
// In practice, due to Windows using little-endian order by default,
// many applications also assume little-endian encoding by default.
code = 'unicode';
} else if (t === '\xEF\xBB\xBF')
code = 'UTF-8';
else {
// 即使是用OpenTextFile(_.open_format.TristateFalse),UTF-8還是會被轉換而判別不出來。
// from http://www.hawk.34sp.com/stdpls/dwsh/charset_adodb.html
var l, codes = {}, reg = new RegExp(), stream = open_file(
file_path, 'binary');
codes[binary_charset] = '[\\x09\\x0a\\x0d\\x20-\\x7e]';
// http://www.cns11643.gov.tw/web/word/big5/index.html
if (false)
codes['Big5'] = codes[binary_charset]
+ '|[\\xa4-\\xc6\\xc9-\\xf9][\\x40-\\xfe]';
// http://hp.vector.co.jp/authors/VA013241/misc/shiftjis.html
if (false)
codes['Shift_JIS'] = codes[binary_charset]
+ '|[\\x81-\\x9f\\xe0-\\xef\\xfa-\\xfc][\\x40-\\x7e\\x80-\\xfc]|[\\xa1-\\xdf]';
if (false)
codes['EUC-JP'] = codes[binary_charset]
+ '|\\x8f[\\xa1-\\xfe][\\xa1-\\xfe]|[\\xa1-\\xfe][\\xa1-\\xfe]|\\x8e[\\xa1-\\xdf]';
codes['UTF-8'] = codes[binary_charset]
+ '|[\\xc0-\\xdf][\\x80-\\xbf]|[\\xe0-\\xef][\\x80-\\xbf]{2}|[\\xf0-\\xf7][\\x80-\\xbf]{3}'
+ '|[\\xf8-\\xfb][\\x80-\\xbf]{4}|[\\xfc-\\xfd][\\x80-\\xbf]{5}';
// GBK
// http://zh.wikipedia.org/wiki/GB_18030
// http://zh.wikipedia.org/wiki/GB_2312
if (false)
codes['GB 2312'] = codes[binary_charset]
+ '|[\\xa1-\\xf7][\\xa1-\\xfe]';
t = stream.read();
stream.close();
stream = null;
code = translate_ADO_Stream_binary_data(t, reading_length);
if (!is_HTML
&& code.indexOf('<') !== -1
// 檢測是否符合 XML 文件標準。
// ** WARNING: 使用以下方法,對某些奇怪的檔案會產生問題!
// && /^(<\/?[^<>]+>|[^<>]+)*(<[^<>]*)?$/.test(code)
// && /^(?:[^<>]+|<\/?[\s\S]+?>)*$/.test(code)
// 以下 OK.
&& !/[<>]/.test(code.replace(/<\/?[a-z][^<>]*>/gi, '')
.replace(/<(?:[a-z][^<>]*)?$/, ''))) {
library_namespace.debug('Treat [' + file_path + '] as HTML.',
1, 'guess_encoding');
is_HTML = true;
// reading_length === 0: 已 read all.
if (reading_length
&& reading_length < guess_encoding.min_length_of_HTML) {
library_namespace.debug('原先取樣文本長度: ' + reading_length
+ ' 過短,重新讀取長度: '
+ guess_encoding.min_length_of_HTML + '。', 2);
reading_length = guess_encoding.min_length_of_HTML;
t = translate_ADO_Stream_binary_data(t, reading_length);
} else
t = code;
} else
t = code;
code = undefined;
library_namespace.debug('取樣文本 (' + t.length
+ ') [' + t.replace(/]', 2, 'guess_encoding');
// 另可使用 .split(',').length - 1
question_mark_count = t.count_of('?');
library_namespace.debug("question mark '?' count = "
+ question_mark_count, 2, 'guess_encoding');
for ( var _e in codes) {
reg = new RegExp('^(?:' + codes[_e] + ')');
var l = 0, s = t;
while (l !== s.length)
l = s.length, s = s.replace(reg, '');
if (s === '') {
code = _e;
break;
}
}
}
library_namespace.debug('coding: [' + code + '] in phase 1.', 2,
'guess_encoding');
// 假如是HTML檔,判斷是否有 charset 設定。這個判別放在unicode之後,其他自動判別之前。
if (is_HTML) {
if (ADO_Stream.Type === AdoEnums.adTypeBinary) {
ADO_Stream.Close();
ADO_Stream = open_file(file_path, binary_charset);
}
ADO_Stream.Position = 0;
// 讀 binary data 用 'ISO-8859-1' 會 error encoding.
ADO_Stream.Charset = binary_charset;
if (t = _.get_HTML_encoding(reading_length ? ADO_Stream
.ReadText(reading_length)
// ADO_Stream.ReadText(adReadAll)
: ADO_Stream.ReadText()))
code = t;
library_namespace.debug('coding: [' + code + '] in phase 2.', 2,
'guess_encoding');
}
var i,
// best confidence
most_probable_code, highest_ratio = 0, unknown_character_count;
if (!code) {
var enc, enc_tmp, lang_code = guess_encoding.test_group, c, test_group = {}, EMPTY_TEST_GROUP = 0, most_probable_code_changed;
// 初始化 test_group.
if (false) {
test_group['EUC-KR'] = {
'EUC-KR' : 1,
'EUC-JP' : 1
};
test_group['EUC-JP'] = {
'EUC-KR' : 1,
'EUC-JP' : 1
};
}
if (Array.isArray(lang_code)) {
for (i = 0; i < lang_code.length; i++) {
c = {};
for (enc_tmp = 0; enc_tmp < lang_code[i].length; enc_tmp++) {
c[lang_code[i][enc_tmp]] = lang_code[i];
test_group[lang_code[i][enc_tmp]] = c;
}
}
}
for (enc in guess_encoding.mapping) {
if (ADO_Stream.Type === AdoEnums.adTypeBinary) {
ADO_Stream.Close();
ADO_Stream = open_file(file_path, binary_charset);
}
ADO_Stream.Position = 0;
try {
ADO_Stream.Charset = enc in guess_encoding.OS_alias ? guess_encoding.OS_alias[enc]
: enc;
} catch (e) {
library_namespace
.error('ADO Stream DO NOT support encoding [' + enc
+ ']!', 1, 'guess_encoding');
}
t = reading_length ? ADO_Stream.ReadText(reading_length)
// ADO_Stream.ReadText(adReadAll)
: ADO_Stream.ReadText();
library_namespace.debug(enc + '(' + ADO_Stream.Size
+ '):
' + t.slice(0, 200), 4);
if (enc === 'EUC-JP') {
// 半角・全角形[ヲ-ン] 可能在以 'EUC-JP' 讀取 'EUC-KR' 或 'GB 2312'
// 時大量出現而造成誤判。加以日本語文件本身也少僅用這些字母,因此割愛。
t = t.replace(/[\uFF66-\uFF9D]+/g, '');
i = t.replace(/[\t\x20-\x7f]+/g, '')
// Windows 下, EUC-JP 對無法編碼者會改成 '・' 而非 '�'。
if (library_namespace.is_debug(2))
library_namespace.debug('[・] count: ' + i.count_of('・')
+ ' / ' + i.length + '='
+ (i.count_of('・') / i.length) + ' @ ' + enc,
1, 'guess_encoding');
// 計算 '・' 佔非 ASCII 之比例。
// .08: 依據經驗而得之 magic number 閥值。
if (i.count_of('・') / i.length > .08)
continue;
}
// http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29#Replacement_character
// 可惜 ADO_Stream.ReadText 僅會把無法編碼者改成 '?',而不會改成 replacement
// character '�'。
// TODO: '�' 的比例過大時(e.g., ratio>.0001 && count>1)則 pass.
if ((i = t.indexOf('�')) === -1) {
unknown_character_count = t.count_of('?')
- question_mark_count;
if (library_namespace.is_debug(2)) {
library_namespace.debug("question mark '?' count of ["
+ enc + "] = " + unknown_character_count, 2,
'guess_encoding');
}
} else if (t.indexOf('�', i) !== -1) {
unknown_character_count = t.count_of('�');
}
// .001: 依據經驗而得之 magic number 閥值。
if (unknown_character_count / t.length > .001) {
library_namespace.debug('看來似乎不是 encoding [' + enc
+ ']! Unknown characters: '
+ unknown_character_count + ' / ' + t.length
+ ' = ' + (unknown_character_count / t.length), 2,
'guess_encoding');
continue;
}
lang_code = guess_encoding.mapping[enc];
library_namespace.debug('Test charset [' + enc + ' ('
+ lang_code + ')' + '] decoded as (' + t.length
+ ') ['
+ t.replace(/]', 2,
'guess_encoding');
c = guess_text_language(t, lang_code, Object.assign({
return_ratio : true
}, options));
library_namespace.debug(function() {
return 'Wish ' + lang_code + ' and get ' + c;
}, 3, 'guess_encoding');
most_probable_code_changed = false;
for (i in c)
if (c[i] > highest_ratio) {
library_namespace.debug('Most probable code now: ['
+ enc + '].[' + i + '] = ' + c[i], 2,
'guess_encoding');
most_probable_code_changed = true;
most_probable_code = enc;
highest_ratio = c[i];
}
// 處理 test_group.
if (library_namespace.is_Object(test_group[enc])) {
delete test_group[enc][enc];
enc_tmp = 0;
for (enc_tmp in test_group[enc])
break;
if (!enc_tmp)
test_group[enc] = EMPTY_TEST_GROUP;
}
// most_probable_code_changed 或 test_group 剛結束時皆需要作測試。
if (most_probable_code_changed
|| test_group[enc] === EMPTY_TEST_GROUP) {
if (most_probable_code_changed)
library_namespace.debug('Test ' + enc + ': ['
+ lang_code + '] and get ['
+ most_probable_code + '].', 2,
'guess_encoding');
if (test_group[enc] === EMPTY_TEST_GROUP)
delete test_group[enc];
if (highest_ratio >= guess_encoding.default_select_boundary) {
if (!(enc in test_group)) {
// code = ADO_Stream.Charset;
code = most_probable_code;
break;
} else if (library_namespace.is_debug(2)) {
for (i in test_group[enc_tmp]) {
library_namespace.debug('由於 ' + enc + ' @ ['
+ test_group[enc_tmp][i]
+ '] 編碼類似,有時會產生誤判,因此持續作測試以找出最可能之編碼。',
2, 'guess_encoding');
break;
}
}
}
}
}
}
ADO_Stream.Close();
ADO_Stream = null;
// ascii=ISO-8859-1, _autodetect, _autodetect_all
return code || most_probable_code || guess_encoding.unknown_encoding;
}
// Default result: returned by guess_encoding() when the encoding cannot be
// determined.
guess_encoding.unknown_encoding = undefined;
// Largest options.reading_length accepted; a larger (or negative / NaN)
// value makes guess_encoding() fall back to the defaults below.
guess_encoding.max_length_to_test = 1e8;
// Default length of the sample needed to tell the encodings apart (special
// characters and so on). HTML needs a somewhat longer sample.
guess_encoding.min_length_to_test = 8e3;
// Default length of the sample for HTML, which needs to be longer.
guess_encoding.min_length_of_HTML = 1e5;
// guess_encoding() stops testing further encodings once the best ratio
// reaches this value and the encoding just tested has no pending test group.
guess_encoding.default_select_boundary = .9;
// Encodings that look alike: all codes of a group have to be tested before
// one of them can be confirmed.
// This is based on experience, not on theory or an algorithm.
guess_encoding.test_group = [ [ 'EUC-KR', 'EUC-JP', 'GB 2312' ] ];
// HKEY_CLASSES_ROOT\MIME\Database\Charset
// Language tags. They should list at least the script (writing system). See
// ISO 15924 - Alphabetical Code List.
// { character encoding : IANA language tag }
// guess_encoding() tests the encodings in the order they are listed here.
// @see IETF language tag (script code)
// TODO: EUC-TW, ISO2022-XX and HZ.
// TODO: provide a generic way to handle single-byte encodings - Russian
// encodings (KOI8-R, ISO8859-5, window1251,
// Mac-cyrillic, ibm866, ibm855)
// TODO: parse IANA language
// tag: cmn-Hant-TW, cmn-Hans-CN
// @see
// http://www.cnblogs.com/sink_cup/archive/2010/07/01/language_subtag_registry.html
guess_encoding.mapping = {
// EUC
// EUC-KR comes before EUC-JP because text encoded in EUC-KR, read as EUC-JP,
// often just looks like rarely used characters instead of being undecodable.
'EUC-KR' : {
// The boundary for EUC-KR needs to be a little lower.
'ko-KR' : .8
},
'EUC-JP' : 'ja-JP',
// GB 2312 comes before Big5 because common GB 2312 characters, read as Big5,
// are often common characters after \u8000, whereas common Big5 characters,
// read as GB 2312, are often odd code points and rarely used characters.
'GB 2312' : {
// 0: use guess_text_language.default_select_boundary
'cmn-Hans-CN' : 0,
// Occasionally 'cmn-Hant-TW' text is encoded in 'GB 2312'.
// NOTE(review): the original comment mentions an empirical threshold of .8,
// but the value set here is 0 - confirm which one is intended.
'cmn-Hant-TW' : 0
},
'Big5' : {
// 0: use guess_text_language.default_select_boundary
'cmn-Hant-TW' : 0,
// Occasionally 'cmn-Hans-CN' text is encoded in 'Big5'.
// NOTE(review): the original comment mentions an empirical threshold of .8,
// but the value set here is 0 - confirm which one is intended.
'cmn-Hans-CN' : 0
},
'Shift_JIS' : 'ja-JP',
// Arabic alphabet,
// Arabic on the Internet: History of Arabic
// on Computers | The Baheyeldin Dynasty.
// ar-SA — Arabic
'Windows-1256' : 'arb-Arab',
// bn-IN — Bengali (India)
// '':'bn-Beng-IN',
// Russian alphabet,
// Appendix D. Language codes
// ru-RU — Russian
'Windows-1251' : 'ru-RU',
'ISO-8859-1' : 'en-US'
};
// { character encoding name used in guess_encoding.mapping : character
// encoding name used by the OS }
// guess_encoding() sets the Charset of the ADO stream to the OS name when
// the encoding is listed here.
// TODO: unify with open_file.OS_alias.
guess_encoding.OS_alias = {
'GB 2312' : 'GB2312'
};
// { official encoding name : alias or list of aliases }
// guess_encoding.alias_to_official() compares the aliases ignoring
// upper/lower case.
// TODO
guess_encoding.alias = {
'Shift_JIS' : [ 'Shift-JIS', 'ShiftJIS', 'Shift JIS', 'x-sjis' ],
'Big5' : [ 'Big 5', 'BIG-5' ],
'EUC-JP' : 'EUCJP',
'EUC-KR' : 'EUCKR',
'GB 2312' : 'GB2312',
// 'GB 2312' : 'EUC-CN',
'UTF-8' : [ 'UTF8', 'UTF 8' ]
};
/**
 * Convert an alias of a character encoding name to the official name, i.e.
 * the key it is listed under in guess_encoding.alias. Aliases are compared
 * ignoring upper/lower case.
 *
 * The lookup table { lower-cased alias : official name } is built on the
 * first call and cached in guess_encoding.alias.map.
 *
 * @param {String}code
 *            encoding name or alias
 * @returns {String} the official name when code is a known alias; otherwise
 *          code itself, unchanged.
 */
guess_encoding.alias_to_official = function alias_code_to_official(code) {
	var map = guess_encoding.alias.map;
	if (!map) {
		// Use an object without prototype: with a plain {} the names
		// "constructor" or "__proto__" would be answered with inherited
		// members of Object.prototype instead of falling back to code.
		map = Object.create(null);
		var official, index, alias = guess_encoding.alias, list;
		for (official in alias) {
			list = alias[official];
			// A single alias may be given as a bare string.
			if (typeof list === 'string')
				list = [ list ];
			if (!Array.isArray(list))
				continue;
			// Index loop: for...in would also visit enumerable properties
			// added to Array.prototype.
			for (index = 0; index < list.length; index++)
				if (typeof list[index] === 'string')
					map[list[index].toLowerCase()] = official;
		}
		guess_encoding.alias.map = map;
	}
	return map[('' + code).toLowerCase()] || code;
};
_// JSDT:_module_
.guess_encoding = guess_encoding;
// ----------------------------------------------------------------------------------------------------------------------------------------------------------//
_// JSDT:_module_
.
/**
 * Test whether a text is an HTML document: it has to begin with an <html>
 * start tag, optionally preceded by white space and other tags such as
 * <?xml ...?> or <!DOCTYPE ...>.
 *
 * @param {String}text
 *            text to test
 * @returns {Boolean|undefined} true if the text looks like an HTML
 *          document, false if not; undefined when text is not a string.
 */
is_HTML_file = function(text) {
	if (typeof text !== 'string')
		return;
	// [1]: the tags in front of <html>, [3]: the attributes of <html>.
	var HTML_pattern = /^(([\s\n]*<[\w?!][^>]*>)*?)[\s\n]*<html([\s\n][^>]*)?>/i,
	// Test only once and reuse the result for the debug message.
	is_HTML = HTML_pattern.test(text);
	library_namespace.debug('is_HTML_file: text is '
			+ (is_HTML ? '' : 'NOT ') + 'HTML document.', 3);
	return is_HTML;
};
/**
 * Guess the type of a text. Only HTML is recognized for now.
 *
 * @param {String}text
 *            text to test
 * @returns {String|undefined} 'html' when _.is_HTML_file(text) is true;
 *          undefined when text is not a string or its type is not
 *          recognized.
 */
_// JSDT:_module_
.guess_text_type = function(text) {
	if (typeof text !== 'string')
		return;
	// Stays undefined for unrecognized text. The variable was not declared
	// before, so that returning it threw a ReferenceError in strict mode.
	var type;
	if (_.is_HTML_file(text)) {
		type = 'html';
	}
	return type;
};
_// JSDT:_module_
.
/**
 * Look for a charset declaration in the contents of an HTML / XML file.
 * The <meta> tags are examined first, then the XML declaration.
 *
 * NOTE(review): the <meta> pattern was damaged in the copy under review
 * (only its tail survived). It is rebuilt here in the style of the <html>
 * pattern of is_HTML_file() - confirm against the upstream source.
 *
 * @param {String}file_contents
 *            file contents
 * @returns {String|undefined} the declared encoding name; undefined when
 *          file_contents is not a string or declares no encoding.
 */
get_HTML_encoding = function(file_contents) {
	if (typeof file_contents !== 'string')
		return;
	var tag, attributes, found, encoding,
	// Matches every <meta ...> tag; [1]: the attributes of the tag.
	PATTERN_meta = /<meta([\s\n][^>]*)?>/ig;
	while (!encoding && (tag = PATTERN_meta.exec(file_contents))) {
		// [1] is undefined for a bare <meta>.
		if (!(attributes = tag[1]))
			continue;
		if (
		// <meta http-equiv="Content-Type" content="text/html; charset=...">
		// The value of content may be double-quoted, single-quoted or bare.
		(found = attributes
				.match(/[\s\n]content\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i))
				&& (found = (found[1] || found[2] || found[3] || '')
						.match(/charset=([\w-]{2,})/i))
				// <meta charset="...">
				|| (found = attributes
						.match(/[\s\n]charset\s*=\s*["']?([\w-]{2,})/i)))
			encoding = found[1];
	}
	if (!encoding
			// <?xml version="1.0" encoding="..."?>
			&& (found = file_contents
					.match(/<\?xml([\s\n][^>]*)?[\s\n]encoding[\s\n]*=[\s\n]*["']([\w.\-]+)["']/)))
		// [1] holds the attributes in front of "encoding"; the encoding
		// name itself is captured in [2].
		encoding = found[2];
	if (encoding) {
		library_namespace.debug('get_HTML_encoding: coding: [' + encoding
				+ '].', 3);
		return encoding;
	}
};
// ----------------------------------------------------------------------------------------------------------------------------------------------------------//
/**
* 靠常用字自動判別文本或字串之編碼。
* detect language.
* TODO: 天城文, 孟加拉文
*
* @param {String}text
* 文本或字串
* @param {Object|String}[language_to_test]
* 指定欲判別之編碼。
* @param {Object}[option]
* default: {
* return_ratio (return ratio object): false,
* test_all (test all language codes): false };
*
* @returns {String}code (default) 判別出之編碼。
* @returns {Object}ratio object {code: 比例/ratio/可能性/possibility}
*
* @since 2011/12/11 00:18:07 重構
* 2011/12/28 22:28:35 refactoring: 重構以判別 EUC-KR。 2012/3/17 17:16:32
* 兩段式判別:降低不常用字之比重。 2012/3/17 23:20:20 Adding Arabic, Russian.
* 2012/3/28 23:51:59 move to application.locale.encoding
*/
function guess_text_language(text, language_to_test, options) {
text = guess_text_language.remove_support_contents(text);
if (!text)
return;
library_namespace.debug(text.length
+ ' characters after remove_support_contents():'
+ ' [' + text.replace(/]', 2);
// 前置處理。
if (!library_namespace.is_Object(options)) {
options = Object.create(null);
}
var i, old_length = text.length,
// 特殊 chacacters.
featured,
// 未辨識 length
unrecognized, character_count = {
all : old_length
}, seldom_character_count = {}, return_ratio = options.return_ratio, test_all = options.test_all, signature_RegExp = guess_text_language
.get_signature_RegExp(), remove_lang = function(lang, add_tag) {
var filter = lang;
if (add_tag === seldom_character_count)
filter += guess_text_language.seldom_postfix;
if (old_length && signature_RegExp[filter]) {
if (library_namespace.is_debug(2)) {
if (unrecognized = text.match(signature_RegExp[filter])) {
unrecognized = unrecognized.join('');
library_namespace.debug('character of ' + filter
+ ': (' + unrecognized.length + ') ['
+ unrecognized.slice(0, 80) + ']'
+ (unrecognized.length > 80 ? '..' : ''), 2,
'guess_text_language');
}
}
text = text.replace(signature_RegExp[filter], '');
unrecognized = text.length;
// add_tag 本 lang 為計算所必須,不能 miss。但一一判斷速度過慢。
if (false)
if (typeof add_tag === 'undefined')
add_tag = !(lang in guess_text_language.boundary);
if (add_tag || old_length !== unrecognized) {
if (add_tag !== seldom_character_count)
character_count[lang] = old_length - unrecognized;
else if (old_length !== unrecognized)
seldom_character_count[lang] = old_length
- unrecognized;
old_length = unrecognized;
}
}
};
// 按照特徵碼一個個將之去除,計算符合的長度,猜測最有可能者。
remove_lang('en-US', true);
remove_lang('en-US', seldom_character_count);
// 因為 'en-US' ⊂ 'x-general',必須將 'en-US' 排在 'x-general' 之前。
remove_lang('x-general', true);
remove_lang('x-CJK', true);
for (i in guess_text_language.boundary) {
remove_lang(i);
remove_lang(i, seldom_character_count);
}
if (library_namespace.is_debug(2)) {
for (i in character_count) {
if (character_count[i])
library_namespace.debug('count ' + i + ': '
+ character_count[i], 2, 'guess_text_language');
}
if (unrecognized)
library_namespace.debug(unrecognized + ' unknown characters: ['
+ text.slice(0, 300) + ']');
}
// 依各種常用字母之經驗法則偵測/判別。.95, .5: 依據經驗而得之 magic number 閥值。
if ((character_count['en-US'] + (seldom_character_count['en-US'] || 0))
/ (character_count.all - unrecognized) > .95
&& character_count['en-US']
/ (character_count.all - unrecognized) > .5)
return 'en-US';
featured = character_count.all - character_count['en-US']
- (seldom_character_count['en-US'] || 0)
- character_count['x-general'];
library_namespace
.debug('unrecognized ' + unrecognized + ' / featured '
+ featured + ' = '
+ +((unrecognized / featured).toFixed(3))
+ ', boundary '
+ guess_text_language.unrecognized_boundary, 2);
if (
// 判斷已知的同時,未知字碼需要極少…不過這在遇上符號時可能會出現錯誤。因此最好將符號排除。
unrecognized / featured > guess_text_language.unrecognized_boundary) {
if (library_namespace.is_debug(2)) {
library_namespace.debug('unrecognized > boundary: total '
+ text.length + ' unknown characters.', 2);
for (var i = 0, l = text.length; i < l; i++) {
library_namespace.debug('['
+ text.charAt(i) + ']: U+'
+ text.charCodeAt(i).toString(16).toUpperCase()
+ ' (' + text.charCodeAt(i)
+ '10)', 3);
}
}
// 若 ratio === 1 但指定 language_to_test{},可能造成回傳與原先結構不同之
// language_to_test{}!
return return_ratio ? {} : undefined;
}
var count, seldom_count, denominator, ratio,
// best confidence
most_probable_code, highest_ratio = 0, language, recognized_featured_characters = featured
- unrecognized - character_count['x-CJK'];
library_namespace.debug('recognized featured characters: '
+ recognized_featured_characters, 2);
if (options.contains_JP && (i = character_count['ja-JP'])) {
library_namespace.debug('Add JP count to TW, CN.', 2);
character_count['cmn-Hant-TW'] = (character_count['cmn-Hant-TW'] || 0)
+ i;
character_count['cmn-Hans-CN'] = (character_count['cmn-Hans-CN'] || 0)
+ i;
}
if (i = character_count['x-CJK']) {
// 由於 CJK 於各語言各有不同比例,因此加點比重至此。各比例為依據經驗而得之 magic number。
character_count['cmn-Hant-TW'] = (character_count['cmn-Hant-TW'] || 0)
+ i * .2;
character_count['cmn-Hans-CN'] = (character_count['cmn-Hans-CN'] || 0)
+ i * .15;
character_count['ja-JP'] = (character_count['ja-JP'] || 0) + i * .1;
}
// 設定要測試的 codes。
if (!library_namespace.is_Object(language_to_test))
if (language_to_test in guess_text_language.boundary) {
(i = {})[language_to_test] = guess_text_language.boundary[language_to_test];
language_to_test = i;
} else {
language_to_test = guess_text_language.boundary;
}
if (return_ratio) {
// clone object
// http://jsperf.com/cloning-an-object/50
var j = {};
for (i in language_to_test)
j[i] = 0;
language_to_test = j;
}
for (i in language_to_test) {
count = character_count[i] || 0;
seldom_count = seldom_character_count[i] || 0;
denominator = recognized_featured_characters
//
+ character_count['x-CJK'] * .2
// 加上 unrecognized 的影響。3: 實為依據經驗而得之 magic number。
+ (i in {
'ru-RU' : 9
} ? 9 : 3) * unrecognized;
// 'ko-KR' 幾乎只用한글(朝鲜字母),為作平衡加回來。.5: 實為依據經驗而得之 magic number。
if (i === 'ko-KR') {
denominator += character_count['x-CJK'] * .5;
} else if (options.contains_JP
&& (i === 'cmn-Hant-TW' || i === 'cmn-Hans-CN')) {
count += character_count['x-CJK'];
denominator += character_count['x-CJK'];
}
ratio = (count + seldom_count * guess_text_language.seldom_weight)
/ denominator;
library_namespace.debug('test language [' + i + ']: ' + count
+ ' + ' + seldom_count + ' / (all featured characters '
+ recognized_featured_characters
+ (recognized_featured_characters === denominator ? ''
//
: ' → ' + denominator) + ') ≈ ' + +(ratio.toFixed(3))
+ ' (boundary: ' + (language_to_test[i]
//
|| guess_text_language.default_boundary) + ')', 2);
if (return_ratio) {
// 設定好 ratio
language_to_test[i] = ratio;
}
if (// count > (most_probable_code ?
// character_count[most_probable_code] : 0) &&
ratio > Math.max(guess_text_language.default_boundary,
highest_ratio)) {
if (ratio >= (test_all ? 1 : language_to_test[i]
|| guess_text_language.default_select_boundary)) {
library_namespace
.debug('return the most probable code [' + i
+ ']: ' + ratio + '.', 2,
'guess_text_language');
return return_ratio ? language_to_test : i;
}
library_namespace.debug('Most probable code now: [' + i
+ '] = ' + ratio, 1, 'guess_text_language');
most_probable_code = i;
highest_ratio = ratio;
}
}
if (return_ratio) {
return language_to_test;
}
// 經過廝殺戰的才當作有其價值。
if (!test_all)
library_namespace.debug('沒有所佔比例超過門檻,且可以準確判斷的 encoding。 ', 2,
'guess_text_language');
library_namespace.debug('the most probable code [' + most_probable_code
+ ']: ' + highest_ratio, 2, 'guess_text_language');
return most_probable_code;
}
;
// When the ratio of the characters of a language exceeds this boundary, the
// text is regarded as being in that language. Magic number based on
// experience. Should be > Math.max(.5,
// guess_text_language.default_boundary). Below it, the candidates have to
// compete with each other.
guess_text_language.default_select_boundary = .9;
// Minimum ratio a language needs to become a candidate. Magic number based
// on experience.
guess_text_language.default_boundary = .4;
// When the ratio of unrecognized characters exceeds this boundary, too much
// is regarded as missed and the language is not determined. Magic number
// based on experience.
guess_text_language.unrecognized_boundary = .3;
// Weight applied to the count of seldom used characters when computing the
// ratio of a language. Magic number based on experience.
guess_text_language.seldom_weight = .3;
// Appended to a language code to select the signature pattern of its seldom
// used characters.
guess_text_language.seldom_postfix = '.seldom';
guess_text_language.remove_support_contents = function(text) {
if (typeof text !== 'string')
return;
if (_.is_HTML_file(text)) {
if (library_namespace.is_debug(3)) {
library_namespace.debug('1. ' + text.length
+ ' characters: [' + text.replace(/2. '
+ text.length
+ ' characters: ['
+ text.replace(//g, '').replace(/3. '
+ text.length
+ ' characters: ['
+ text.replace(//g, '').replace(
/
和製漢字(国字)は、和語(ヤマトコトバ)に相当する漢字が無い場合に新規につくられたもので、奈良時代から作られた。ほとんどは訓読みしかない。魚篇や木篇が多い。
http://homepage2.nifty.com/TAB01645/ohara/index.htm
http://zh.wiktionary.org/wiki/%E8%BE%BB
http://www.unicode.org/cgi-bin/GetUnihanData.pl?codepoint=8fbb
http://jprs.jp/doc/rule/saisoku-1-wideusejp-furoku-4.html
http://m2000.idv.tw/informer/zhi/char-root.htm
http://www.ajisai.sakura.ne.jp/~dindi/chrc/ref/wincode2.txt
http://cs-people.bu.edu/butta1/personal/hkscs/hkscs-oct.html
http://www.nobi.or.jp/i/kotoba/kanji/wasei-kanji.html
http://www.melma.com/mag/52/m00011552/a00000066.html
韓語字母/諺文
http://www.sinica.edu.tw/~cytseng/Korean%20reader/hangul.htm
http://www.unicode.org/charts/normalization/
old:
// 自動判別檔案(或字串)之編碼
function guess_encoding(FN) {
if (!is_file(FN))
return FN.length > 64 ? guess_String_language(FN)
: guess_encoding.unknown_encoding;
open_file(FN, binary_charset);
if (!AdoEnums)
return guess_encoding.unknown_encoding;
// ADO_Stream.Type=AdoEnums.adTypeBinary;
ADO_Stream.LoadFromFile(FN);
var t = ADO_Stream.ReadText(3), code;
// Unicode的Byte Order Mark(BOM)在UTF-16LE(little endian)裏,它是以FF-FE這兩個bytes表達,在BE(big endian)裏,是FEFF。而在UTF-8裏,它是以EF-BB-BF這三個bytes表達。
if (t.slice(0, 2) === '\xFF\xFE')
code = 'unicodeFFFE';
if (t.slice(0, 2) === '\xFE\xFF')
code = 'unicode';
if (t === '\xEF\xBB\xBF')
code = 'UTF-8';
if (code) {
ADO_Stream.Close();
return code;
}
if (!code) {
// 將 Shift_JIS 排在 GB 2312 與 Big5 前面是因為 Shift_JIS 常符合 GB 2312,且 Shift_JIS
// 之判定相當嚴。
if (!code)
ADO_Stream.Position = 0, ADO_Stream.Charset = 'Shift_JIS',
code = guess_String_language(ADO_Stream.ReadText(900),
ADO_Stream.Charset);
// 將 GB 2312 排在 Big5 前面是因為 GB 2312 常用字在 Big5 中常常是0x8000之後的常用字,Big5
// 常用字卻常常是 GB 2312 中奇怪字碼與罕用字。
if (!code)
ADO_Stream.Position = 0, ADO_Stream.Charset = 'GB 2312',
code = guess_String_language(ADO_Stream.ReadText(2000),
ADO_Stream.Charset);
if (!code)
ADO_Stream.Position = 0, ADO_Stream.Charset = 'Big5',
code = guess_String_language(ADO_Stream.ReadText(2000),
ADO_Stream.Charset);
}
ADO_Stream.Close();
return code || guess_encoding.unknown_encoding; // ascii=ISO-8859-1,_autodetect,_autodetect_all
}
// 靠常用字自動判別字串之編碼 string,預設編碼
function guess_String_language(str, dcode) {
var code;
if (str.length > 9000)
str = str.slice(0, 9000);
// 將Shift_JIS排在 GB 2312 與Big5前面是因為Shift_JIS常符合gb,且Shift_JIS之判定相當嚴。
if (dcode == 'Shift_JIS' || !dcode && !code) {
// http://www.asahi-net.or.jp/~hc3j-tkg/unicode/
// http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
var i = 0, c, k = 0, u = 0, h = 0;// h_=u_=k_='';
for (; i < str.length; i++)
if (c = str.charCodeAt(i), c > 0xFF)
if (c == 0x30FB || c > 0xFF65 && c < 0xFF9E) {
// HALFWIDTH
h++;
// h_+=str.charAt(i);//||c===0xE134
}
// KATAKANA LETTER等可能不是日文文件中會出現的char
else if (c > 0x3040 && c < 0x30FF) {
// kana
k++;
// k_+=str.charAt(i);
} else {
// unknown kanji
u++;
// u_+=str.charAt(i);
}
if (false) {
alert(k + ',' + u + ',' + h + '\n*' + k_ + '\n*' + u_ + '\n*' + h_);
alert(u_.charCodeAt(2));
}
if (k + u > 2 * h)
// HALFWIDTH KATAKANA LETTER數目比漢字少時判別為Shift_JIS
code = 'Shift_JIS';
}
// 將 GB 2312 排在Big5前面是因為 GB 2312 常用字在Big5中常常是0x8000之後的常用字,Big5常用字卻常常是 GB
// 2312 中奇怪字碼與罕用字
if (dcode == 'Big5' || dcode == 'GB 2312' || !dcode && !code) {
var i = 0, c, k = 0, u = 0;
// k_=u_='';
for (; i < str.length; i++)
if (c = str.charCodeAt(i), c > 0xFF)
if (c > 0x4DFF && c < 0x9FA6 || c > 0xFF00 && c < 0xFF5F
|| c > 0x33ff && c < 0x4DB6 || c == 0x2605
|| c == 0x2606) {
// 2605,6:★☆
k++;
// k_+=str.charAt(i);
} else {
u++;
// ,u_+=str.charAt(i);
}
if (false)
alert(k + ',' + u + '\n' + k_ + '\n*' + u_);
if (k > 5 * u)
// 漢字比不認識的字多時判定
code = dcode || 'Big5';
}
if (dcode == binary_charset || dcode == 'ascii' || !dcode && !code) {
}
return code;
}
*/
// ----------------------------------------------------------------------------------------------------------------------------------------------------------//
var to_kana_pair, to_romaji_pair;
/**
* convert romaji to kana. ロマ字→仮名.
*
* @example
// More examples: see /_test suite/test.js
*
*
* @param {String}text
* text to be converted.
* @returns {String} text converted.
*/
function to_kana(text) {
	// Delegate to the conversion pairs prepared by initialize_kana_romaji().
	var kana_text = to_kana_pair.convert(text);
	return kana_text;
}
/**
* convert kana to romaji. 仮名→ロマ字.
*
* @param {String}text
* text to be converted.
* @returns {String} text converted.
*/
function to_romaji(text) {
	// Delegate to the reversed conversion pairs prepared by
	// initialize_kana_romaji().
	var romaji_text = to_romaji_pair.convert(text);
	return romaji_text;
}
/**
* initialize 仮名/ロマ字(羅馬字) pair.
*/
function initialize_kana_romaji(function_name) {
	var resource_path;
	if (!to_kana_pair) {
		// Path built from the module id without its last component and
		// '<resources directory>/kana romaji.txt',
		// e.g. 'resources/kana romaji.txt'
		resource_path = library_namespace.get_module_path(module_name
				.replace(/[^.]+$/, ''),
				library_namespace.env.resources_directory_name
						+ '/kana romaji.txt');
		to_kana_pair = new library_namespace.data.Convert_Pairs(null, {
			path : resource_path,
			encoding : 'UTF-8',
			remove_comments : true
		});
		// The opposite direction uses a reversed copy of the same pairs.
		to_romaji_pair = to_kana_pair.clone().reverse();
	}
	// Hand back the real function for the requested name.
	if (function_name === 'to_kana')
		return to_kana;
	return to_romaji;
}
// Register initialize_kana_romaji() as the initializer of _.to_kana and
// _.to_romaji; it returns the real function for the given name.
// NOTE(review): presumably the conversion table is then only loaded on the
// first call - confirm against library_namespace.set_initializor().
library_namespace.set_initializor('to_kana', initialize_kana_romaji, _);
library_namespace.set_initializor('to_romaji', initialize_kana_romaji, _);
// --------------------------------------------------------
_// JSDT:_module_
.
/**
 * Convert the Big5 codes of the Japanese kana to Unicode kana: the code
 * units 63223-63305 (U+F6F7-U+F749) are moved to the hiragana U+3041-U+3093,
 * 63306-63391 (U+F74A-U+F79F) to the katakana U+30A1-U+30F6 and 63219
 * (U+F6F3) becomes 'ー'. All other characters are kept as they are.
 *
 * @param {String}
 *            text Unicode text
 * @return {String}text with Unicode kana.
 * @see from Unicode 補完計畫 jrename.js
 */
Big5_kana_fix = function(text) {
	var fixed = [], code, index = 0;
	for (; index < text.length; index++) {
		// Read the code unit at the current position. (This used to read
		// from an undeclared variable and threw on any non-empty text.)
		code = text.charCodeAt(index);
		fixed.push(code === 63219 ? 'ー' : String.fromCharCode(
		// hiragana
		code >= 63223 && code <= 63305 ? code - 50870 :
		// katakana
		code >= 63306 && code <= 63391 ? code - 50857 :
		// others: unchanged
		code));
	}
	return fixed.join('');
};
// ----------------------------------------------------------------------------------------------------------------------------------------------------------//
return (_// JSDT:_module_
);
}