/**
 * @name CeL function for encoding and language identification
 * @fileoverview 本檔案包含了編碼,例如自動偵測地區語系/文化設定編碼的 functions。言語判定ライブラリ。
 * @since
 */
'use strict';
// 'use asm';
// --------------------------------------------------------------------------------------------
// 不採用 if 陳述式,可以避免 Eclipse JSDoc 與 format 多縮排一層。
typeof CeL === 'function' && CeL.run({
	// Name of this module inside the CeL library.
	name : 'application.locale.encoding',
	// Modules that must be loaded before module_code() runs. The pieces are
	// concatenated into a single '|'-separated requirement string.
	// initialize_kana_romaji() needs CeL.data.Convert_Pairs().
	require : [ 'data.|data.Convert_Pairs.|data.native.',
	//
	'|application.OS.Windows.new_COM',
	//
	'|application.locale.language_tag',
	//
	'|application.OS.Windows.file.open_file',
	// library_namespace.file_exists()
	'|application.storage.',
	//
	'|application.OS.Windows.file.is_file',
	//
	'|application.OS.Windows.file.AdoEnums',
	//
	'|application.OS.Windows.file.translate_ADO_Stream_binary_data' ]
			.join(''),
	// Sub-functions that should not be exported.
	// no_extend : '*',
	// The module body lives in a separate function so that code formatters
	// do not indent it one extra level.
	code : module_code
});
function module_code(library_namespace) {
	// `this` is the module descriptor handed over by CeL.run(): `this.id` is
	// the module name and `this.r(name)` returns a required dependency.
	var module_name = this.id,
	// requiring
	new_COM = this.r('new_COM'), language_tag = this.r('language_tag'), open_file = this
			.r('open_file'), is_file = this.r('is_file'), AdoEnums = this
			.r('AdoEnums'), translate_ADO_Stream_binary_data = this
			.r('translate_ADO_Stream_binary_data');
	// var to_standard_language_tag = gettext.to_standard;
	/**
	 * null module constructor
	 * 
	 * @class functions for encoding and language identification
	 */
	var _// JSDT:_module_
	= function() {
		// null module constructor
	};
	/**
	 * for JSDT: the tool only treats the function as a Class when it has a
	 * prototype.
	 */
	_// JSDT:_module_
	.prototype = {};
	// ----------------------------------------------------------------------------------------------------------------------------------------------------------//
	// Charset name used when a file has to be read byte-for-byte as text.
	var binary_charset = 'ISO-8859-1';
	/**
	 * 
	TODO:
	考慮字頻。
	只檢測常用的幾個字,無法判別才廣泛測試。
	http://atedev.wordpress.com/2007/09/19/bom-bom-bom/
	00 00 fe ff UTF-32, Big Endian
	fe ff 00 00 UTF-32, Little Endian
	fe ff ## ## UTF-16, Big Endian
	ff fe ## ## UTF-16, Little Endian
	ef bb bf UTF-8
	var FN='I:\\Documents and Settings\\kanashimi\\My Documents\\kanashimi\\www\\cgi-bin\\game\\Shift_JIS.txt',enc=guess_encoding(FN);alert('['+enc+'] '+FN+'\n'+_.read_file(FN,enc).slice(0,900));
	
	 */
	/**
	 * 	自動判別檔案(或字串)之編碼	文字エンコーディング判定を行う
	autodetect encoding
	http://www.hawk.34sp.com/stdpls/dwsh/charset_adodb.html
	http://www.ericphelps.com/q193998/
	http://hp.vector.co.jp/authors/VA003334/ado/adostream.htm
	
	 */
	// guess_encoding[generateCode.dLK]='is_file,open_file,guess_text_language,get_HTML_encoding';
	/**
	 * Guess the character encoding / character set of a file. Currently able to
	 * tell the common East Asian (Chinese / Japanese / Korean) encodings apart.
	 * 
	 * The detection runs in three phases:
	 * 1. byte order mark, then a strict byte-pattern check (pure ASCII / UTF-8);
	 * 2. for HTML, the charset declared in the document itself;
	 * 3. decoding the file with every charset of guess_encoding.mapping and
	 * scoring the decoded text with guess_text_language().
	 * 
	 * When file_path is not an existing file it is treated as the text itself;
	 * in that case the value returned comes from guess_text_language() or
	 * get_HTML_encoding().
	 * 
	 * @param {String}file_path
	 *            path of the file to test (or the text itself).
	 * @param {Boolean|Object}[is_HTML]
	 *            true if the file is known to be HTML. When omitted it is
	 *            guessed from the file extension and content. An options
	 *            object may be passed in this position instead.
	 * @param {Object|Number}[options]
	 *            options, also forwarded to guess_text_language(). A number is
	 *            taken as options.reading_length (how many characters to
	 *            sample).
	 * @returns {String|undefined} the guessed charset name, or
	 *          guess_encoding.unknown_encoding.
	 * @see "A composite approach to language/encoding detection", "Automatic
	 *      Detection of Character Encoding and Language"
	 */
	function guess_encoding(file_path, is_HTML, options) {
		// Normalize the arguments.
		if (library_namespace.is_Object(is_HTML))
			options = is_HTML, is_HTML = undefined;
		else if (!library_namespace.is_Object(options)) {
			options = isNaN(options) ? Object.create(null) : {
				reading_length : options
			};
		}
		var t, code;
		if (false && typeof ActiveXObject == 'undefined') {
			alert("guess_encoding: Can't find ActiveXObject!");
			return;
		}
		if (false && typeof _.get_HTML_encoding != 'function')
			is_HTML = false;
		// TODO: using library_namespace.file_exists(file_path) @ build.js
		if (!is_file(file_path)) {
			// Not a file: treat the argument as the text to analyze. Texts
			// shorter than 1024 characters are considered too short to judge.
			library_namespace.debug('Treat [' + file_path + '] as string.');
			return file_path.length < 1024 ? guess_encoding.unknown_encoding
					: (t = guess_text_language(file_path, undefined, options)) ? t
							: (is_HTML || typeof is_HTML === 'undefined')
									&& (t = _.get_HTML_encoding(file_path)) ? t
									: guess_encoding.unknown_encoding;
		}
		// (original note) Reading binary data with 'ISO-8859-1' yields encoding
		// errors.
		var ADO_Stream = open_file(file_path, binary_charset);
		if (library_namespace.is_type(ADO_Stream, 'Error')) {
			return guess_encoding.unknown_encoding;
		}
		// 0: read all.
		var reading_length = options.reading_length;
		if (isNaN(reading_length) || (reading_length |= 0) < 0
				|| reading_length > guess_encoding.max_length_to_test)
			// default
			reading_length = is_HTML ? guess_encoding.min_length_of_HTML
					: guess_encoding.min_length_to_test;
		t = ADO_Stream.ReadText(3);
		// t = ADO_Stream.Read(3);
		if (typeof t != 'string') {
			// The type is usually "unknown" here, so `+=` cannot be used.
			// t=''+t;
			// Check the type before touching `t`, and release the stream
			// instead of leaking it on this early exit.
			ADO_Stream.Close();
			ADO_Stream = null;
			return guess_encoding.unknown_encoding;
		}
		library_namespace.debug(file_path + ': ['
				+ t.slice(0, 3).split('').join(',') + ']..', 2);
		if (typeof is_HTML === 'undefined')
			is_HTML = /\.[xs]?html?$/i.test(file_path);
		var question_mark_count;
		/**
		 * Phase 1a: byte order mark (BOM). Per the Unicode standard the BOM is
		 * FF FE in UTF-16LE, FE FF in UTF-16BE and EF BB BF in UTF-8.
		 * 
		 * NOTE(review): 'unicodeFFFE' is chosen for FF FE and 'unicode' for FE
		 * FF, which looks inverted relative to the statement above. The
		 * mapping is kept exactly as it was; confirm against real ADO Stream
		 * behavior before changing it.
		 */
		if (t.slice(0, 2) === '\xFF\xFE') {
			code = 'unicodeFFFE';
			// code = 'UTF-16BE';
		} else if (t.slice(0, 2) === '\xFE\xFF') {
			// In practice, due to Windows using little-endian order by default,
			// many applications also assume little-endian encoding by default.
			code = 'unicode';
		} else if (t === '\xEF\xBB\xBF')
			code = 'UTF-8';
		else {
			// Phase 1b: strict byte patterns.
			// Even with OpenTextFile(_.open_format.TristateFalse), UTF-8 gets
			// converted and cannot be recognized, so read the raw bytes.
			// from http://www.hawk.34sp.com/stdpls/dwsh/charset_adodb.html
			var codes = {}, reg, stream = open_file(file_path, 'binary');
			codes[binary_charset] = '[\\x09\\x0a\\x0d\\x20-\\x7e]';
			// http://www.cns11643.gov.tw/web/word/big5/index.html
			if (false)
				codes['Big5'] = codes[binary_charset]
						+ '|[\\xa4-\\xc6\\xc9-\\xf9][\\x40-\\xfe]';
			// http://hp.vector.co.jp/authors/VA013241/misc/shiftjis.html
			if (false)
				codes['Shift_JIS'] = codes[binary_charset]
						+ '|[\\x81-\\x9f\\xe0-\\xef\\xfa-\\xfc][\\x40-\\x7e\\x80-\\xfc]|[\\xa1-\\xdf]';
			if (false)
				codes['EUC-JP'] = codes[binary_charset]
						+ '|\\x8f[\\xa1-\\xfe][\\xa1-\\xfe]|[\\xa1-\\xfe][\\xa1-\\xfe]|\\x8e[\\xa1-\\xdf]';
			codes['UTF-8'] = codes[binary_charset]
					+ '|[\\xc0-\\xdf][\\x80-\\xbf]|[\\xe0-\\xef][\\x80-\\xbf]{2}|[\\xf0-\\xf7][\\x80-\\xbf]{3}'
					+ '|[\\xf8-\\xfb][\\x80-\\xbf]{4}|[\\xfc-\\xfd][\\x80-\\xbf]{5}';
			// GBK
			// http://zh.wikipedia.org/wiki/GB_18030
			// http://zh.wikipedia.org/wiki/GB_2312
			if (false)
				codes['GB 2312'] = codes[binary_charset]
						+ '|[\\xa1-\\xf7][\\xa1-\\xfe]';
			t = stream.read();
			stream.close();
			stream = null;
			code = translate_ADO_Stream_binary_data(t, reading_length);
			if (!is_HTML
					&& code.indexOf('<') !== -1
					// Check whether the sample looks like well-formed markup.
					// ** WARNING: the two patterns below cause problems with
					// some odd files!
					// && /^(<\/?[^<>]+>|[^<>]+)*(<[^<>]*)?$/.test(code)
					// && /^(?:[^<>]+|<\/?[\s\S]+?>)*$/.test(code)
					// The following works: after removing every tag (and a
					// trailing unfinished tag) no '<' or '>' may remain.
					&& !/[<>]/.test(code.replace(/<\/?[a-z][^<>]*>/gi, '')
							.replace(/<(?:[a-z][^<>]*)?$/, ''))) {
				library_namespace.debug('Treat [' + file_path + '] as HTML.',
						1, 'guess_encoding');
				is_HTML = true;
				// reading_length === 0: everything has already been read.
				if (reading_length
						&& reading_length < guess_encoding.min_length_of_HTML) {
					library_namespace.debug('原先取樣文本長度: ' + reading_length
							+ ' 過短,重新讀取長度: '
							+ guess_encoding.min_length_of_HTML + '。', 2);
					reading_length = guess_encoding.min_length_of_HTML;
					t = translate_ADO_Stream_binary_data(t, reading_length);
				} else
					t = code;
			} else
				t = code;
			code = undefined;
			// NOTE(review): this statement was truncated in the damaged
			// source; the escaped preview is a reconstruction.
			library_namespace.debug('取樣文本 (' + t.length + ') ['
					+ t.replace(/</g, '&lt;') + ']', 2, 'guess_encoding');
			// .split('?').length - 1 would work as well.
			question_mark_count = t.count_of('?');
			library_namespace.debug("question mark '?' count = "
					+ question_mark_count, 2, 'guess_encoding');
			for ( var _e in codes) {
				// Strip one valid character from the front again and again;
				// the sample conforms to the pattern if nothing is left.
				reg = new RegExp('^(?:' + codes[_e] + ')');
				var l = 0, s = t;
				while (l !== s.length)
					l = s.length, s = s.replace(reg, '');
				if (s === '') {
					code = _e;
					break;
				}
			}
		}
		library_namespace.debug('coding: [' + code + '] in phase 1.', 2,
				'guess_encoding');
		// Phase 2: for HTML files, look for a declared charset. This comes
		// after the Unicode checks and before the statistical detection.
		if (is_HTML) {
			if (ADO_Stream.Type === AdoEnums.adTypeBinary) {
				ADO_Stream.Close();
				ADO_Stream = open_file(file_path, binary_charset);
			}
			ADO_Stream.Position = 0;
			// (original note) Reading binary data with 'ISO-8859-1' yields
			// encoding errors.
			ADO_Stream.Charset = binary_charset;
			if (t = _.get_HTML_encoding(reading_length ? ADO_Stream
					.ReadText(reading_length)
			// ADO_Stream.ReadText(adReadAll)
			: ADO_Stream.ReadText()))
				code = t;
			library_namespace.debug('coding: [' + code + '] in phase 2.', 2,
					'guess_encoding');
		}
		var i,
		// best confidence
		most_probable_code, highest_ratio = 0, unknown_character_count;
		// Phase 3: decode with every candidate charset and score the result.
		if (!code) {
			var enc, enc_tmp, lang_code = guess_encoding.test_group, c, test_group = {}, EMPTY_TEST_GROUP = 0, most_probable_code_changed;
			// Initialize test_group: every charset of a group maps to one
			// shared object listing the members that are still untested.
			if (false) {
				test_group['EUC-KR'] = {
					'EUC-KR' : 1,
					'EUC-JP' : 1
				};
				test_group['EUC-JP'] = {
					'EUC-KR' : 1,
					'EUC-JP' : 1
				};
			}
			if (Array.isArray(lang_code)) {
				for (i = 0; i < lang_code.length; i++) {
					c = {};
					for (enc_tmp = 0; enc_tmp < lang_code[i].length; enc_tmp++) {
						c[lang_code[i][enc_tmp]] = lang_code[i];
						test_group[lang_code[i][enc_tmp]] = c;
					}
				}
			}
			for (enc in guess_encoding.mapping) {
				if (ADO_Stream.Type === AdoEnums.adTypeBinary) {
					ADO_Stream.Close();
					ADO_Stream = open_file(file_path, binary_charset);
				}
				ADO_Stream.Position = 0;
				try {
					ADO_Stream.Charset = enc in guess_encoding.OS_alias ? guess_encoding.OS_alias[enc]
							: enc;
				} catch (e) {
					library_namespace
							.error('ADO Stream DO NOT support encoding [' + enc
									+ ']!', 1, 'guess_encoding');
					// The stream still carries the previous charset; reading
					// it now would credit another charset's text to `enc`.
					continue;
				}
				t = reading_length ? ADO_Stream.ReadText(reading_length)
				// ADO_Stream.ReadText(adReadAll)
				: ADO_Stream.ReadText();
				// NOTE(review): this statement was split by a raw line break
				// in the damaged source; '<br />' is a reconstruction.
				library_namespace.debug(enc + '(' + ADO_Stream.Size
						+ '):<br />' + t.slice(0, 200), 4);
				if (enc === 'EUC-JP') {
					// Halfwidth katakana [ヲ-ン] may show up in large numbers
					// when 'EUC-KR' or 'GB 2312' data is read as 'EUC-JP' and
					// cause a wrong guess. Japanese documents rarely consist
					// of these letters only, so they are dropped.
					t = t.replace(/[\uFF66-\uFF9D]+/g, '');
					i = t.replace(/[\t\x20-\x7f]+/g, '');
					// On Windows, EUC-JP decodes unmappable bytes to '・'
					// instead of '�'.
					if (library_namespace.is_debug(2))
						library_namespace.debug('[・] count: ' + i.count_of('・')
								+ ' / ' + i.length + '='
								+ (i.count_of('・') / i.length) + ' @ ' + enc,
								1, 'guess_encoding');
					// Share of '・' among the non-ASCII characters.
					// .08: empirical magic-number threshold.
					if (i.count_of('・') / i.length > .08)
						continue;
				}
				// http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29#Replacement_character
				// Unfortunately ADO_Stream.ReadText turns unmappable bytes
				// into '?' rather than the replacement character '�'.
				// TODO: skip when the share of '�' is too large (e.g.,
				// ratio>.0001 && count>1).
				if ((i = t.indexOf('�')) === -1) {
					// Count only the '?' that were not in the raw sample.
					unknown_character_count = t.count_of('?')
							- question_mark_count;
					if (library_namespace.is_debug(2)) {
						library_namespace.debug("question mark '?' count of ["
								+ enc + "] = " + unknown_character_count, 2,
								'guess_encoding');
					}
				} else if (t.indexOf('�', i) !== -1) {
					// NOTE(review): searching from `i` always finds the same
					// character again, so this branch is always taken.
					unknown_character_count = t.count_of('�');
				}
				// .001: empirical magic-number threshold.
				if (unknown_character_count / t.length > .001) {
					library_namespace.debug('看來似乎不是 encoding [' + enc
							+ ']! Unknown characters: '
							+ unknown_character_count + ' / ' + t.length
							+ ' = ' + (unknown_character_count / t.length), 2,
							'guess_encoding');
					continue;
				}
				lang_code = guess_encoding.mapping[enc];
				// NOTE(review): this statement was truncated in the damaged
				// source; the escaped preview is a reconstruction.
				library_namespace.debug('Test charset [' + enc + ' ('
						+ lang_code + ')' + '] decoded as (' + t.length
						+ ') [' + t.replace(/</g, '&lt;') + ']', 2,
						'guess_encoding');
				c = guess_text_language(t, lang_code, Object.assign({
					return_ratio : true
				}, options));
				library_namespace.debug(function() {
					return 'Wish ' + lang_code + ' and get ' + c;
				}, 3, 'guess_encoding');
				most_probable_code_changed = false;
				for (i in c)
					if (c[i] > highest_ratio) {
						library_namespace.debug('Most probable code now: ['
								+ enc + '].[' + i + '] = ' + c[i], 2,
								'guess_encoding');
						most_probable_code_changed = true;
						most_probable_code = enc;
						highest_ratio = c[i];
					}
				// Update test_group: this charset has now been tested.
				if (library_namespace.is_Object(test_group[enc])) {
					delete test_group[enc][enc];
					// enc_tmp stays 0 when no member of the group is left.
					enc_tmp = 0;
					for (enc_tmp in test_group[enc])
						break;
					if (!enc_tmp)
						test_group[enc] = EMPTY_TEST_GROUP;
				}
				// A decision is needed both when the best candidate changed
				// and when a test group has just been completed.
				if (most_probable_code_changed
						|| test_group[enc] === EMPTY_TEST_GROUP) {
					if (most_probable_code_changed)
						library_namespace.debug('Test ' + enc + ': ['
								+ lang_code + '] and get ['
								+ most_probable_code + '].', 2,
								'guess_encoding');
					if (test_group[enc] === EMPTY_TEST_GROUP)
						delete test_group[enc];
					if (highest_ratio >= guess_encoding.default_select_boundary) {
						if (!(enc in test_group)) {
							// Confident enough and no look-alike charset is
							// pending: stop here.
							// code = ADO_Stream.Charset;
							code = most_probable_code;
							break;
						} else if (library_namespace.is_debug(2)) {
							for (i in test_group[enc_tmp]) {
								library_namespace.debug('由於 ' + enc + ' @ ['
										+ test_group[enc_tmp][i]
										+ '] 編碼類似,有時會產生誤判,因此持續作測試以找出最可能之編碼。',
										2, 'guess_encoding');
								break;
							}
						}
					}
				}
			}
		}
		ADO_Stream.Close();
		ADO_Stream = null;
		// ascii=ISO-8859-1, _autodetect, _autodetect_all
		return code || most_probable_code || guess_encoding.unknown_encoding;
	}
	// Value returned when the encoding cannot be determined.
	guess_encoding.unknown_encoding = undefined;
	// Largest options.reading_length accepted by guess_encoding(); larger
	// values fall back to the defaults below.
	guess_encoding.max_length_to_test = 1e8;
	// Default sample length: the shortest length needed to tell the special
	// characters and the various encodings apart.
	guess_encoding.min_length_to_test = 8e3;
	// HTML needs a somewhat longer sample.
	guess_encoding.min_length_of_HTML = 1e5;
	// guess_encoding() stops testing once a candidate reaches this ratio (and
	// no charset of its test group is pending).
	guess_encoding.default_select_boundary = .9;
	// For look-alike charsets, every charset of the group has to be tested
	// before the result is accepted.
	// This is based on experience, not on theory or an algorithm.
	guess_encoding.test_group = [ [ 'EUC-KR', 'EUC-JP', 'GB 2312' ] ];
	// HKEY_CLASSES_ROOT\MIME\Database\Charset
	// Language tags. They should at least specify the script. See ISO 15924 -
	// Alphabetical Code List.
	// { character encoding : IANA language tag }
	// @see IETF language tag (script code)
	// TODO: EUC-TW, ISO2022-XX, and HZ.
	// TODO: provide a generic way to handle single-byte encodings - Russian
	// encodings (KOI8-R, ISO8859-5, window1251,
	// Mac-cyrillic, ibm866, ibm855)
	// TODO: parse IANA language
	// tag: cmn-Hant-TW, cmn-Hans-CN
	// @see
	// http://www.cnblogs.com/sink_cup/archive/2010/07/01/language_subtag_registry.html
	guess_encoding.mapping = {
		// EUC
		// EUC-KR is listed before EUC-JP because EUC-KR text read as EUC-JP
		// often just looks like rare characters instead of being unreadable.
		'EUC-KR' : {
			// The bar for EUC-KR has to be lowered a little.
			'ko-KR' : .8
		},
		'EUC-JP' : 'ja-JP',
		// GB 2312 is listed before Big5 because common GB 2312 characters are
		// often common characters above \u8000 in Big5, whereas common Big5
		// characters are often odd code points and rare characters in GB 2312.
		'GB 2312' : {
			// 0: use guess_text_language.default_select_boundary
			'cmn-Hans-CN' : 0,
			// Occasionally 'cmn-Hant-TW' text is encoded as 'GB 2312'.
			// NOTE(review): the original comment mentions an empirical .8
			// threshold, but the value here is 0.
			'cmn-Hant-TW' : 0
		},
		'Big5' : {
			// 0: use guess_text_language.default_select_boundary
			'cmn-Hant-TW' : 0,
			// Occasionally 'cmn-Hans-CN' text is encoded as 'Big5'.
			// NOTE(review): the original comment mentions an empirical .8
			// threshold, but the value here is 0.
			'cmn-Hans-CN' : 0
		},
		'Shift_JIS' : 'ja-JP',
		// Arabic alphabet,
		// Arabic on the Internet: History of Arabic
		// on Computers | The Baheyeldin Dynasty.
		// ar-SA — Arabic
		'Windows-1256' : 'arb-Arab',
		// bn-IN — Bengali (India)
		// '':'bn-Beng-IN',
		// Russian alphabet,
		// Appendix D. Language codes
		// ru-RU — Russian
		'Windows-1251' : 'ru-RU',
		'ISO-8859-1' : 'en-US'
	};
	// character encoding used in guess_encoding.mapping : character encoding
	// name used in OS.
	// TODO: unify with open_file.OS_alias.
	guess_encoding.OS_alias = {
		'GB 2312' : 'GB2312'
	};
	// { official name : alias or list of aliases }
	// will ignore upper/lower case
	// TODO
	guess_encoding.alias = {
		'Shift_JIS' : [ 'Shift-JIS', 'ShiftJIS', 'Shift JIS', 'x-sjis' ],
		'Big5' : [ 'Big 5', 'BIG-5' ],
		'EUC-JP' : 'EUCJP',
		'EUC-KR' : 'EUCKR',
		'GB 2312' : 'GB2312',
		// 'GB 2312' : 'EUC-CN',
		'UTF-8' : [ 'UTF8', 'UTF 8' ]
	};
	/**
	 * Translate a charset alias into the official name used as key of
	 * guess_encoding.alias. The comparison ignores upper/lower case.
	 * 
	 * The lower-cased lookup table is built on first use and cached as
	 * guess_encoding.alias.map.
	 * 
	 * @param {String}code
	 *            charset name or alias.
	 * @returns the official charset name when the alias is known, otherwise
	 *          `code` itself, unchanged.
	 */
	guess_encoding.alias_to_official = function alias_code_to_official(code) {
		var alias = guess_encoding.alias, map = alias.map;
		if (!map) {
			// Use a prototype-less dictionary: with a plain {} a name such as
			// 'constructor' or 'toString' would hit Object.prototype and a
			// function would be returned instead of the charset name.
			map = Object.create(null);
			var official, list, index;
			for (official in alias) {
				list = alias[official];
				if (!Array.isArray(list))
					list = [ list ];
				for (index = 0; index < list.length; index++)
					if (typeof list[index] === 'string')
						map[list[index].toLowerCase()] = official;
			}
			alias.map = map;
		}
		return map[('' + code).toLowerCase()] || code;
	};
	_// JSDT:_module_
	.guess_encoding = guess_encoding;
	// ----------------------------------------------------------------------------------------------------------------------------------------------------------//
	_// JSDT:_module_
	.
	/**
	 * Detect whether a text is an HTML document: after any number of leading
	 * tags / declarations (e.g. an XML declaration or a DOCTYPE) the text has
	 * to open an <html> element.
	 * 
	 * @param {String}text
	 *            text to test.
	 * @returns {Boolean|undefined} true if the text looks like an HTML
	 *          document; undefined when `text` is not a string.
	 */
	is_HTML_file = function(text) {
		if (typeof text !== 'string')
			return;
		// NOTE(review): the `<html(...` part of this pattern had been lost
		// from the source, leaving an invalid regular expression; it is
		// restored following the `<\?xml([\s\n][^>]*)?` form used elsewhere in
		// this file.
		var HTML_pattern = /^(([\s\n]*<[\w?!][^>]*>)*?)[\s\n]*<html([\s\n][^>]*)?>/i,
		// Test once and reuse the result for both the log and the return value.
		is_HTML = HTML_pattern.test(text);
		library_namespace.debug('is_HTML_file: text is '
				+ (is_HTML ? '' : 'NOT ') + 'HTML document.', 3);
		return is_HTML;
	};
	_// JSDT:_module_
	.
	/**
	 * Guess the type of a text. Only HTML is recognized at the moment.
	 * 
	 * @param {String}text
	 *            text to test.
	 * @returns {String|undefined} 'html' for HTML documents; undefined for any
	 *          other text and for non-string input.
	 */
	guess_text_type = function(text) {
		if (typeof text !== 'string')
			return;
		if (_.is_HTML_file(text)) {
			return 'html';
		}
		// No other type is detected. (This used to `return type;`, an
		// undeclared identifier that threw a ReferenceError for every
		// non-HTML text.)
		return undefined;
	};
	_// JSDT:_module_
	.
	/**
	 * Find the charset declared inside an HTML / XML document.
	 * 
	 * Looks at every <meta> tag for a `charset=` setting (both the
	 * `http-equiv` / `content="...; charset=..."` form and the
	 * `<meta charset="...">` form), then falls back to the `encoding`
	 * attribute of an XML declaration.
	 * 
	 * @param {String}file_contents
	 *            file contents
	 * @returns {String|undefined} the declared charset name exactly as
	 *          written in the document, or undefined if none is declared.
	 */
	get_HTML_encoding = function(file_contents) {
		if (typeof file_contents !== 'string')
			return;
		var matched, charset,
		// NOTE(review): the `<meta(...` part of this pattern had been lost
		// from the source, leaving an invalid regular expression; it is
		// restored following the `<\?xml([\s\n][^>]*)?` form used below.
		meta_pattern = /<meta([\s\n][^>]*)?>/ig;
		// Older engines share one RegExp object per literal, so make sure the
		// scan starts at the beginning.
		meta_pattern.lastIndex = 0;
		while (matched = meta_pattern.exec(file_contents)) {
			// matched[1] is undefined for an attribute-less <meta>.
			if (matched[1]
					&& (charset = matched[1]
							.match(/charset\s*=\s*["']?([\w-]{2,})/i))) {
				library_namespace.debug('get_HTML_encoding: coding: ['
						+ charset[1] + '].', 3);
				return charset[1];
			}
		}
		matched = file_contents
				.match(/<\?xml([\s\n][^>]*)?[\s\n]encoding[\s\n]*=[\s\n]*["']([\w\-]+)["']/);
		if (matched) {
			// Group 1 holds the attributes in front of `encoding`; the
			// encoding name itself is group 2.
			library_namespace.debug('get_HTML_encoding: coding: ['
					+ matched[2] + '].', 3);
			return matched[2];
		}
	};
	// ----------------------------------------------------------------------------------------------------------------------------------------------------------//
	/**
	 * Detect the language of a text from its characteristic characters.
	 * 
	 * The text is stripped, signature by signature, of the characters that
	 * belong to each known language; the amount removed per language is then
	 * compared with the amount of "featured" (non-English, non-general)
	 * characters.
	 * 
	 * TODO: Devanagari, Bengali
	 * 
	 * @param {String}text
	 *            the text to analyze.
	 * @param {Object|String}[language_to_test]
	 *            language(s) to test: a language tag, or an object
	 *            { language tag : select boundary }. Default: every language
	 *            of guess_text_language.boundary.
	 * @param {Object}[options]
	 *            default: { return_ratio (return the ratio object): false,
	 *            test_all (test all language codes): false }. contains_JP is
	 *            read as well.
	 * 
	 * @returns {String} (default) the language tag detected, or undefined.
	 * @returns {Object} with options.return_ratio: { language tag : ratio }
	 * 
	 * @since 2011/12/11 00:18:07 refactoring
	 *        2011/12/28 22:28:35 refactoring to detect EUC-KR. 2012/3/17
	 *        17:16:32 two-stage detection: lower the weight of seldom-used
	 *        characters. 2012/3/17 23:20:20 Adding Arabic, Russian. 2012/3/28
	 *        23:51:59 move to application.locale.encoding
	 */
	function guess_text_language(text, language_to_test, options) {
		text = guess_text_language.remove_support_contents(text);
		if (!text)
			return;
		// NOTE(review): this statement was truncated in the damaged source;
		// the escaped preview is a reconstruction.
		library_namespace.debug(text.length
				+ ' characters after remove_support_contents():' + ' ['
				+ text.replace(/</g, '&lt;') + ']', 2);
		// Normalize the arguments.
		if (!library_namespace.is_Object(options)) {
			options = Object.create(null);
		}
		var i, old_length = text.length,
		// count of special (non-English, non-general) characters
		featured,
		// length of the text not recognized so far
		unrecognized, character_count = {
			all : old_length
		}, seldom_character_count = {}, return_ratio = options.return_ratio, test_all = options.test_all, signature_RegExp = guess_text_language
				.get_signature_RegExp(),
		/**
		 * Remove the characters of `lang` from the text and record how many
		 * were removed.
		 * 
		 * add_tag === seldom_character_count: use the seldom-used signature
		 * of the language and record into seldom_character_count. Any other
		 * truthy add_tag: always record the count, even when it is 0.
		 */
		remove_lang = function(lang, add_tag) {
			var filter = lang;
			if (add_tag === seldom_character_count)
				filter += guess_text_language.seldom_postfix;
			if (old_length && signature_RegExp[filter]) {
				if (library_namespace.is_debug(2)) {
					if (unrecognized = text.match(signature_RegExp[filter])) {
						unrecognized = unrecognized.join('');
						library_namespace.debug('character of ' + filter
								+ ': (' + unrecognized.length + ') ['
								+ unrecognized.slice(0, 80) + ']'
								+ (unrecognized.length > 80 ? '..' : ''), 2,
								'guess_text_language');
					}
				}
				text = text.replace(signature_RegExp[filter], '');
				unrecognized = text.length;
				// add_tag: this language is required for the calculation and
				// must not be missed. Deciding it one by one is too slow.
				if (false)
					if (typeof add_tag === 'undefined')
						add_tag = !(lang in guess_text_language.boundary);
				if (add_tag || old_length !== unrecognized) {
					if (add_tag !== seldom_character_count)
						character_count[lang] = old_length - unrecognized;
					else if (old_length !== unrecognized)
						seldom_character_count[lang] = old_length
								- unrecognized;
					old_length = unrecognized;
				}
			}
		};
		// Remove the signatures one after another, count the matched lengths
		// and guess the most probable language.
		remove_lang('en-US', true);
		remove_lang('en-US', seldom_character_count);
		// 'en-US' ⊂ 'x-general', so 'en-US' has to come before 'x-general'.
		remove_lang('x-general', true);
		remove_lang('x-CJK', true);
		for (i in guess_text_language.boundary) {
			remove_lang(i);
			remove_lang(i, seldom_character_count);
		}
		if (library_namespace.is_debug(2)) {
			for (i in character_count) {
				if (character_count[i])
					library_namespace.debug('count ' + i + ': '
							+ character_count[i], 2, 'guess_text_language');
			}
			if (unrecognized)
				library_namespace.debug(unrecognized + ' unknown characters: ['
						+ text.slice(0, 300) + ']');
		}
		// Rule of thumb for plain English. .95, .5: empirical magic-number
		// thresholds.
		if ((character_count['en-US'] + (seldom_character_count['en-US'] || 0))
				/ (character_count.all - unrecognized) > .95
				&& character_count['en-US']
						/ (character_count.all - unrecognized) > .5)
			return 'en-US';
		featured = character_count.all - character_count['en-US']
				- (seldom_character_count['en-US'] || 0)
				- character_count['x-general'];
		library_namespace
				.debug('unrecognized ' + unrecognized + ' / featured '
						+ featured + ' = '
						+ +((unrecognized / featured).toFixed(3))
						+ ',  boundary '
						+ guess_text_language.unrecognized_boundary, 2);
		if (
		// A language can only be accepted when very few characters are
		// unknown. Symbols may upset this, so they are best excluded
		// beforehand.
		unrecognized / featured > guess_text_language.unrecognized_boundary) {
			if (library_namespace.is_debug(2)) {
				library_namespace.debug('unrecognized > boundary: total '
						+ text.length + ' unknown characters.', 2);
				for (var index = 0, length = text.length; index < length; index++) {
					library_namespace.debug('[' + text.charAt(index) + ']: U+'
							+ text.charCodeAt(index).toString(16).toUpperCase()
							+ ' (decimal ' + text.charCodeAt(index) + ')', 3);
				}
			}
			// If ratio === 1 but language_to_test{} was given, the value
			// returned might not have the structure of the original
			// language_to_test{}!
			return return_ratio ? {} : undefined;
		}
		var count, seldom_count, denominator, ratio,
		// best confidence
		most_probable_code, highest_ratio = 0, language, recognized_featured_characters = featured
				- unrecognized - character_count['x-CJK'];
		library_namespace.debug('recognized featured characters: '
				+ recognized_featured_characters, 2);
		if (options.contains_JP && (i = character_count['ja-JP'])) {
			library_namespace.debug('Add JP count to TW, CN.', 2);
			character_count['cmn-Hant-TW'] = (character_count['cmn-Hant-TW'] || 0)
					+ i;
			character_count['cmn-Hans-CN'] = (character_count['cmn-Hans-CN'] || 0)
					+ i;
		}
		if (i = character_count['x-CJK']) {
			// Shared CJK characters occur at different rates in each
			// language, so part of them is credited here. The shares are
			// empirical magic numbers.
			character_count['cmn-Hant-TW'] = (character_count['cmn-Hant-TW'] || 0)
					+ i * .2;
			character_count['cmn-Hans-CN'] = (character_count['cmn-Hans-CN'] || 0)
					+ i * .15;
			character_count['ja-JP'] = (character_count['ja-JP'] || 0) + i * .1;
		}
		// Decide which languages to test.
		if (!library_namespace.is_Object(language_to_test))
			if (language_to_test in guess_text_language.boundary) {
				(i = {})[language_to_test] = guess_text_language.boundary[language_to_test];
				language_to_test = i;
			} else {
				language_to_test = guess_text_language.boundary;
			}
		if (return_ratio) {
			// Work on a fresh object with the same keys so the caller's
			// object is not overwritten with ratios.
			// http://jsperf.com/cloning-an-object/50
			var j = {};
			for (i in language_to_test)
				j[i] = 0;
			language_to_test = j;
		}
		for (i in language_to_test) {
			count = character_count[i] || 0;
			seldom_count = seldom_character_count[i] || 0;
			denominator = recognized_featured_characters
			//
			+ character_count['x-CJK'] * .2
			// Add the influence of the unrecognized characters. 3 (9 for
			// Russian): empirical magic numbers.
			+ (i in {
				'ru-RU' : 9
			} ? 9 : 3) * unrecognized;
			// 'ko-KR' uses Hangul almost exclusively; add CJK back for
			// balance. .5: empirical magic number.
			if (i === 'ko-KR') {
				denominator += character_count['x-CJK'] * .5;
			} else if (options.contains_JP
					&& (i === 'cmn-Hant-TW' || i === 'cmn-Hans-CN')) {
				count += character_count['x-CJK'];
				denominator += character_count['x-CJK'];
			}
			ratio = (count + seldom_count * guess_text_language.seldom_weight)
					/ denominator;
			library_namespace.debug('test language [' + i + ']: ' + count
					+ ' + ' + seldom_count + ' / (all featured characters '
					+ recognized_featured_characters
					+ (recognized_featured_characters === denominator ? ''
					//
					: ' → ' + denominator) + ') ≈ ' + +(ratio.toFixed(3))
					+ ' (boundary: ' + (language_to_test[i]
					//
					|| guess_text_language.default_boundary) + ')', 2);
			if (return_ratio) {
				// Store the ratio.
				// NOTE(review): this also replaces the boundary that the
				// comparison below reads from language_to_test[i].
				language_to_test[i] = ratio;
			}
			if (// count > (most_probable_code ?
			// character_count[most_probable_code] : 0) &&
			ratio > Math.max(guess_text_language.default_boundary,
					highest_ratio)) {
				if (ratio >= (test_all ? 1 : language_to_test[i]
						|| guess_text_language.default_select_boundary)) {
					library_namespace
							.debug('return the most probable code [' + i
									+ ']: ' + ratio + '.', 2,
									'guess_text_language');
					return return_ratio ? language_to_test : i;
				}
				library_namespace.debug('Most probable code now: [' + i
						+ '] = ' + ratio, 1, 'guess_text_language');
				most_probable_code = i;
				highest_ratio = ratio;
			}
		}
		if (return_ratio) {
			return language_to_test;
		}
		// Only a candidate that survived the comparison is worth returning.
		if (!test_all)
			library_namespace.debug('沒有所佔比例超過門檻,且可以準確判斷的 encoding。 ', 2,
					'guess_text_language');
		library_namespace.debug('the most probable code [' + most_probable_code
				+ ']: ' + highest_ratio, 2, 'guess_text_language');
		return most_probable_code;
	}
	;
	// When the ratio of a language reaches this boundary the text is taken to
	// be in that language. Empirical magic number. Should be > Math.max(.5,
	// guess_text_language.default_boundary). Below it, the candidates compete
	// against each other.
	guess_text_language.default_select_boundary = .9;
	// Minimum ratio for a language to become a candidate. Empirical magic
	// number.
	guess_text_language.default_boundary = .4;
	// When the share of unrecognized characters exceeds this boundary there
	// are too many misses to decide. Empirical magic number.
	guess_text_language.unrecognized_boundary = .3;
	// Weight of seldom-used characters in the ratio. Empirical magic number.
	guess_text_language.seldom_weight = .3;
	// Suffix appended to a language tag to look up its seldom-used signature.
	guess_text_language.seldom_postfix = '.seldom';
	guess_text_language.remove_support_contents = function(text) {
		if (typeof text !== 'string')
			return;
		if (_.is_HTML_file(text)) {
			if (library_namespace.is_debug(3)) {
				library_namespace.debug('1. ' + text.length
						+ ' characters: [' + text.replace(/2. '
						+ text.length
						+ ' characters: ['
						+ text.replace(//g, '').replace(/3. '
						+ text.length
						+ ' characters: ['
						+ text.replace(//g, '').replace(
								/
	和製漢字(国字)は、和語(ヤマトコトバ)に相当する漢字が無い場合に新規につくられたもので、奈良時代から作られた。ほとんどは訓読みしかない。魚篇や木篇が多い。
	http://homepage2.nifty.com/TAB01645/ohara/index.htm
	http://zh.wiktionary.org/wiki/%E8%BE%BB
	http://www.unicode.org/cgi-bin/GetUnihanData.pl?codepoint=8fbb
	http://jprs.jp/doc/rule/saisoku-1-wideusejp-furoku-4.html
	http://m2000.idv.tw/informer/zhi/char-root.htm
	http://www.ajisai.sakura.ne.jp/~dindi/chrc/ref/wincode2.txt
	http://cs-people.bu.edu/butta1/personal/hkscs/hkscs-oct.html
	http://www.nobi.or.jp/i/kotoba/kanji/wasei-kanji.html
	http://www.melma.com/mag/52/m00011552/a00000066.html
	韓語字母/諺文
	http://www.sinica.edu.tw/~cytseng/Korean%20reader/hangul.htm
	http://www.unicode.org/charts/normalization/
	old:
	//	自動判別檔案(或字串)之編碼
	function guess_encoding(FN) {
		if (!is_file(FN))
			return FN.length > 64 ? guess_String_language(FN)
					: guess_encoding.unknown_encoding;
		open_file(FN, binary_charset);
		if (!AdoEnums)
			return guess_encoding.unknown_encoding;
		// ADO_Stream.Type=AdoEnums.adTypeBinary;
		ADO_Stream.LoadFromFile(FN);
		var t = ADO_Stream.ReadText(3), code;
		// Unicode的Byte Order Mark(BOM)在UTF-16LE(little endian)裏,它是以FF-FE這兩個bytes表達,在BE(big endian)裏,是FEFF。而在UTF-8裏,它是以EF-BB-BF這三個bytes表達。
		if (t.slice(0, 2) === '\xFF\xFE')
			code = 'unicodeFFFE';
		if (t.slice(0, 2) === '\xFE\xFF')
			code = 'unicode';
		if (t === '\xEF\xBB\xBF')
			code = 'UTF-8';
		if (code) {
			ADO_Stream.Close();
			return code;
		}
	
		if (!code) {
			// 將 Shift_JIS 排在 GB 2312 與 Big5 前面是因為 Shift_JIS 常符合 GB 2312,且 Shift_JIS
			// 之判定相當嚴。
			if (!code)
				ADO_Stream.Position = 0, ADO_Stream.Charset = 'Shift_JIS',
						code = guess_String_language(ADO_Stream.ReadText(900),
								ADO_Stream.Charset);
			// 將 GB 2312 排在 Big5 前面是因為 GB 2312 常用字在 Big5 中常常是0x8000之後的常用字,Big5
			// 常用字卻常常是 GB 2312 中奇怪字碼與罕用字。
			if (!code)
				ADO_Stream.Position = 0, ADO_Stream.Charset = 'GB 2312',
						code = guess_String_language(ADO_Stream.ReadText(2000),
								ADO_Stream.Charset);
			if (!code)
				ADO_Stream.Position = 0, ADO_Stream.Charset = 'Big5',
						code = guess_String_language(ADO_Stream.ReadText(2000),
								ADO_Stream.Charset);
		}
	
		ADO_Stream.Close();
		return code || guess_encoding.unknown_encoding; // ascii=ISO-8859-1,_autodetect,_autodetect_all
	}
	// 靠常用字自動判別字串之編碼 string,預設編碼
	function guess_String_language(str, dcode) {
		var code;
		if (str.length > 9000)
			str = str.slice(0, 9000);
	
		// 將Shift_JIS排在 GB 2312 與Big5前面是因為Shift_JIS常符合gb,且Shift_JIS之判定相當嚴。
		if (dcode == 'Shift_JIS' || !dcode && !code) {
			// http://www.asahi-net.or.jp/~hc3j-tkg/unicode/
			// http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
			var i = 0, c, k = 0, u = 0, h = 0;// h_=u_=k_='';
			for (; i < str.length; i++)
				if (c = str.charCodeAt(i), c > 0xFF)
					if (c == 0x30FB || c > 0xFF65 && c < 0xFF9E) {
						// HALFWIDTH
						h++;
						// h_+=str.charAt(i);//||c===0xE134
					}
					// KATAKANA LETTER等可能不是日文文件中會出現的char
					else if (c > 0x3040 && c < 0x30FF) {
						// kana
						k++;
						// k_+=str.charAt(i);
					} else {
						// unknown kanji
						u++;
						// u_+=str.charAt(i);
					}
			if (false) {
				alert(k + ',' + u + ',' + h + '\n*' + k_ + '\n*' + u_ + '\n*' + h_);
				alert(u_.charCodeAt(2));
			}
			if (k + u > 2 * h)
				// HALFWIDTH KATAKANA LETTER數目比漢字少時判別為Shift_JIS
				code = 'Shift_JIS';
		}
	
		// 將 GB 2312 排在Big5前面是因為 GB 2312 常用字在Big5中常常是0x8000之後的常用字,Big5常用字卻常常是 GB
		// 2312 中奇怪字碼與罕用字
		if (dcode == 'Big5' || dcode == 'GB 2312' || !dcode && !code) {
			var i = 0, c, k = 0, u = 0;
			// k_=u_='';
			for (; i < str.length; i++)
				if (c = str.charCodeAt(i), c > 0xFF)
					if (c > 0x4DFF && c < 0x9FA6 || c > 0xFF00 && c < 0xFF5F
							|| c > 0x33ff && c < 0x4DB6 || c == 0x2605
							|| c == 0x2606) {
						// 2605,6:★☆
						k++;
						// k_+=str.charAt(i);
					} else {
						u++;
						// ,u_+=str.charAt(i);
					}
			if (false)
				alert(k + ',' + u + '\n' + k_ + '\n*' + u_);
			if (k > 5 * u)
				// 漢字比不認識的字多時判定
				code = dcode || 'Big5';
		}
	
		if (dcode == binary_charset || dcode == 'ascii' || !dcode && !code) {
		}
	
		return code;
	}
	
	 */
	// ----------------------------------------------------------------------------------------------------------------------------------------------------------//
	var to_kana_pair, to_romaji_pair;
	/**
	 * Convert romaji to kana (ローマ字 → 仮名) using the conversion pairs held
	 * in `to_kana_pair`.
	 * 
	 * More examples: see /_test suite/test.js
	 * 
	 * @param {String}text
	 *            romaji text to convert.
	 * @returns {String} the converted text.
	 */
	function to_kana(text) {
		var kana_text = to_kana_pair.convert(text);
		return kana_text;
	}
	/**
	 * Convert kana to romaji (仮名 → ローマ字) using the conversion pairs held
	 * in `to_romaji_pair`.
	 * 
	 * @param {String}text
	 *            kana text to convert.
	 * @returns {String} the converted text.
	 */
	function to_romaji(text) {
		var romaji_text = to_romaji_pair.convert(text);
		return romaji_text;
	}
	/**
	 * Lazy initializer for the kana / romaji converters: builds the
	 * conversion pairs on the first call and hands back the real converter.
	 * 
	 * @param {String}function_name
	 *            'to_kana' to get to_kana(); anything else yields to_romaji().
	 * @returns {Function} to_kana or to_romaji.
	 */
	function initialize_kana_romaji(function_name) {
		var converter = function_name === 'to_kana' ? to_kana : to_romaji;
		if (to_kana_pair) {
			// Already initialized.
			return converter;
		}

		var Convert_Pairs = library_namespace.data.Convert_Pairs,
		// The table lives in the resources directory next to the parent
		// module: 'resources/kana romaji.txt'
		parent_module = module_name.replace(/[^.]+$/, ''),
		//
		resource_file = library_namespace.env.resources_directory_name
				+ '/kana romaji.txt';

		to_kana_pair = new Convert_Pairs(null, {
			path : library_namespace.get_module_path(parent_module,
					resource_file),
			encoding : 'UTF-8',
			remove_comments : true
		});
		// The reverse table is derived from a copy of the same pairs.
		to_romaji_pair = to_kana_pair.clone().reverse();
		return converter;
	}
	// Register initialize_kana_romaji() as the initializer of `to_kana` and
	// `to_romaji` on the module object `_`.
	// presumably set_initializor() defers the call until first use — confirm
	// against the CeL core.
	library_namespace.set_initializor('to_kana', initialize_kana_romaji, _);
	library_namespace.set_initializor('to_romaji', initialize_kana_romaji, _);
	// --------------------------------------------------------
	_// JSDT:_module_
	.
	/**
	 * Convert Big5 Japanese kana codes into Unicode Japanese kana.
	 * 
	 * Code units 63223–63305 are mapped to hiragana, 63306–63391 to katakana
	 * and 63219 to the prolonged sound mark 'ー'. Every other character is
	 * kept as it is.
	 * 
	 * @param {String}text
	 *            Unicode text
	 * @return {String} the text with Unicode Japanese kana.
	 * @see from Unicode 補完計畫 jrename.js
	 */
	Big5_kana_fix = function(text) {
		var fixed = [], code, index = 0;
		for (; index < text.length; index++) {
			// Read the code unit at the current position. (This used to read
			// `c.charCodeAt(0)`, with `c` never declared.)
			code = text.charCodeAt(index);
			// Data used when cracking Windows Installer once:
			// H += String.fromCharCode(t > 61300 ? t - 48977 : t);
			fixed.push(code === 63219 ? 'ー' : String.fromCharCode(
			// hiragana
			code >= 63223 && code <= 63305 ? code - 50870 :
			// katakana
			code >= 63306 && code <= 63391 ? code - 50857 :
			// anything else: unchanged
			code));
		}
		return fixed.join('');
	};
	// ----------------------------------------------------------------------------------------------------------------------------------------------------------//
	return (_// JSDT:_module_
	);
}