/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.util; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.DataInputBuffer; import org.commoncrawl.io.NIOHttpHeaders; import org.commoncrawl.util.HttpHeaderUtils.ContentTypeAndCharset; import org.commoncrawl.util.Tuples.Pair; import org.mozilla.intl.chardet.nsDetector; import org.mozilla.intl.chardet.nsICharsetDetectionObserver; import org.mozilla.intl.chardet.nsPSMDetector; import com.google.common.collect.ImmutableMap; import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; /** * * Charset detection and decoding helpers * * @author rana * */ public class CharsetUtils { public static final Log LOG = LogFactory.getLog(CharsetUtils.class); public static final int CHARSET_SRC_HEADERS = 0; public static final int CHARSET_SRC_META = 1; public static final int CHARSET_SRC_MOZILLA_DETECTOR = 3; public static final int CHARSET_SRC_ICU_DETECTOR = 2; public static final int CHARSET_SRC_NO_MATCH = 10; /** check for a normalized (java friendly) alias for the original charset name **/ public static String aliasCharset(String inputCharset) { String alias = aliasTable.get(inputCharset.toLowerCase()); return (alias != null) ? alias : inputCharset; } public static Pair<Pair<Integer,Charset>,String> bestEffortDecodeBytes(String headers, byte[] crawlData,int offset,int length) throws IOException { Pair<Integer,Charset> charsetTuple = bestEffortDetectCharset(headers, crawlData,offset,length); if (charsetTuple != null) { try { CharBuffer ucs2Chars = charsetTuple.e1.decode(ByteBuffer.wrap(crawlData,offset,length)); return new Pair<Pair<Integer,Charset>,String>(charsetTuple,ucs2Chars.toString()); } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); } } return new Pair<Pair<Integer,Charset>,String>(new Pair<Integer,Charset>(CHARSET_SRC_NO_MATCH,Charset.forName("ASCII")),new String(crawlData,Charset.forName("ASCII"))); } public static Pair<Integer,Charset> bestEffortDetectCharset(String headers, byte[] crawlData) throws IOException { return bestEffortDetectCharset(headers, crawlData, 0, crawlData.length); } private static Pair<Integer,Charset> resolveCharset(Map<Integer,String> charsets) { for (int i=CHARSET_SRC_HEADERS;i<=CHARSET_SRC_ICU_DETECTOR;++i) { String charsetName = charsets.get(i); Charset charsetObject = null; if (charsetName != null) { try { charsetObject = Charset.forName(charsetName); } catch (Exception e) { } if (charsetObject == null && i <= CHARSET_SRC_META) { // alias it .. charsetName = aliasCharset(charsetName); if (charsetName != null) { try { charsetObject = Charset.forName(charsetName); } catch (Exception e) { } } } if (charsetObject == null) { charsets.put(i, null); } } if (charsetObject != null) { return new Pair<Integer,Charset>(i,charsetObject); } } return null; } public static Pair<Integer,Charset> bestEffortDetectCharset(String headers,byte[] crawlDataBytes,int offset,int length) throws IOException { HashMap<Integer,String> charsetMap = new HashMap<Integer, String>(); ContentTypeAndCharset urlMetadata = new ContentTypeAndCharset(); // parse final header result and populate appropriate metadata fields NIOHttpHeaders finalHeaders = NIOHttpHeaders.parseHttpHeaders(headers); HttpHeaderUtils.parseContentType(finalHeaders, urlMetadata); if (urlMetadata._charset != null) { charsetMap.put(CHARSET_SRC_HEADERS, urlMetadata._charset); } if (urlMetadata._contentType != null && crawlDataBytes != null) { if (MimeTypeFilter.isValidHTMLType(urlMetadata._contentType)) { // sniff encoding in metadata ... String alternateCharset = CharsetUtils.sniffMetaCharacterEncoding(crawlDataBytes,offset,length); if (alternateCharset != null) { charsetMap.put(CHARSET_SRC_META, alternateCharset); } } } Pair<Integer,Charset> charsetMatch = resolveCharset(charsetMap); if (charsetMatch == null) { // now if charset is still NOT available ... if (crawlDataBytes != null) { if (urlMetadata._contentType != null && MimeTypeFilter.isTextType(urlMetadata._contentType)) { // try to detect the charset from the stream ... charsetMap.put(CHARSET_SRC_MOZILLA_DETECTOR,CharsetUtils .detectCharacterEncoding(crawlDataBytes,offset,length,EncodingDetector.MOZILLA)); charsetMap.put(CHARSET_SRC_ICU_DETECTOR,CharsetUtils .detectCharacterEncoding(crawlDataBytes,offset,length,EncodingDetector.ICU)); charsetMatch = resolveCharset(charsetMap); } } } if (charsetMatch == null) { // punt to latin one charsetMatch = new Pair<Integer,Charset>(CHARSET_SRC_NO_MATCH,Charset.forName("ISO-8859-1")); } return charsetMatch; } private static final int CHUNK_SIZE = 2000; private static Pattern metaPattern = Pattern .compile( "<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>", Pattern.CASE_INSENSITIVE); private static Pattern charsetPattern = Pattern .compile( "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE); /** * Given a <code>byte[]</code> representing an html file of an * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag * from the first <code>CHUNK_SIZE</code> bytes. If there's no meta tag for * Content-Type or no charset is specified, <code>null</code> is returned. <br /> * FIXME: non-byte oriented character encodings (UTF-16, UTF-32) can't be * handled with this. We need to do something similar to what's done by * mozilla * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser * .cpp#1993). See also http://www.w3.org/TR/REC-xml/#sec-guessing <br /> * * @param content * <code>byte[]</code> representation of an html file */ public static String sniffMetaCharacterEncoding(byte[] contentBytes,int offset,int length) { // LOG.info("ENTERING SNIFFCHARENCODING..."); int lengthToUse = length < CHUNK_SIZE ? length : CHUNK_SIZE; // We don't care about non-ASCII parts so that it's sufficient // to just inflate each byte to a 16-bit value by padding. // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into // {U+0041, U+0082, U+00B7}. String str = ""; try { str = new String(contentBytes, offset, lengthToUse, Charset.forName("ASCII").toString()); } catch (UnsupportedEncodingException e) { // code should never come here, but just in case... return null; } // LOG.info("RUNNING METAPATTERN MATCHER..."); Matcher metaMatcher = metaPattern.matcher(str); String encoding = null; if (metaMatcher.find()) { // LOG.info("RUNNING CHARSET PATTERN MATCHER..."); Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1)); if (charsetMatcher.find()) encoding = new String(charsetMatcher.group(1)); } // LOG.info("ENCODING IS:" + encoding); return encoding; } public static class DetectorState implements nsICharsetDetectionObserver { public boolean _done = false; public String _detectedCharset = null; @Override public void Notify(String charset) { _detectedCharset = charset; _done = true; } } private static int MAX_CHARS_TO_DETECT = 1 << 15; // 32K public enum EncodingDetector { MOZILLA, ICU } /** last resort - detect encoding using charset detector **/ public static String detectCharacterEncoding(byte[] contentBytes,int offset,int length,EncodingDetector detectorType ) { if (contentBytes != null && length != 0) { if (detectorType == EncodingDetector.MOZILLA) { DetectorState state = new DetectorState(); nsDetector detector = new nsDetector(nsPSMDetector.ALL); if (offset != 0) { int tempBufferLen = Math.min(length, MAX_CHARS_TO_DETECT); byte[] tempBuffer = new byte[tempBufferLen]; System.arraycopy(contentBytes,offset,tempBuffer,0,tempBufferLen); contentBytes = tempBuffer; offset = 0; length = tempBufferLen; } detector.Init(state); boolean isAscii = detector.isAscii(contentBytes, length); if (!isAscii) { isAscii = detector.DoIt(contentBytes, Math.min(length, MAX_CHARS_TO_DETECT), false); } detector.DataEnd(); if (isAscii) { return null; } else if (state._detectedCharset != null) { return state._detectedCharset; } else { String prob[] = detector.getProbableCharsets(); if (prob != null && prob.length != 0) { return prob[0]; } } } else { // instantiate icu charset detector ... CharsetDetector detector = new CharsetDetector(); DataInputBuffer buffer = new DataInputBuffer(); buffer.reset(contentBytes,offset,length); try { detector.setText(buffer); CharsetMatch matches[] = detector.detectAll(); if (matches != null && matches.length != 0) { int kThresold = 10; CharsetMatch bestMatch = null; for (int i = 0; i < matches.length; ++i) { if (bestMatch == null || matches[i].getConfidence() > bestMatch.getConfidence()) { bestMatch = matches[i]; } } if (bestMatch != null) { return bestMatch.getName(); } else { return matches[0].getName(); } } } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); } finally { } } } return null; } static final ImmutableMap<String, String> aliasTable; static { ImmutableMap.Builder<String, String> builder = new ImmutableMap.Builder<String, String>(); builder.put("ibm-1208", "UTF-8"); builder.put("ibm-1209", "UTF-8"); builder.put("ibm-5304", "UTF-8"); builder.put("ibm-5305", "UTF-8"); builder.put("ibm-13496", "UTF-8"); builder.put("ibm-13497", "UTF-8"); builder.put("ibm-17592", "UTF-8"); builder.put("ibm-17593", "UTF-8"); builder.put("windows-65001", "UTF-8"); builder.put("cp1208", "UTF-8"); builder.put("iso-10646-ucs-2", "UTF-16"); builder.put("ibm-1204", "UTF-16"); builder.put("ibm-1205", "UTF-16"); builder.put("unicode", "UTF-16"); builder.put("csunicode", "UTF-16"); builder.put("ucs-2", "UTF-16"); builder.put("x-utf-16be", "UTF-16BE"); builder.put("unicodebigunmarked", "UTF-16BE"); builder.put("ibm-1200", "UTF-16BE"); builder.put("ibm-1201", "UTF-16BE"); builder.put("ibm-13488", "UTF-16BE"); builder.put("ibm-13489", "UTF-16BE"); builder.put("ibm-17584", "UTF-16BE"); builder.put("ibm-17585", "UTF-16BE"); builder.put("ibm-21680", "UTF-16BE"); builder.put("ibm-21681", "UTF-16BE"); builder.put("ibm-25776", "UTF-16BE"); builder.put("ibm-25777", "UTF-16BE"); builder.put("ibm-29872", "UTF-16BE"); builder.put("ibm-29873", "UTF-16BE"); builder.put("ibm-61955", "UTF-16BE"); builder.put("ibm-61956", "UTF-16BE"); builder.put("windows-1201", "UTF-16BE"); builder.put("cp1200", "UTF-16BE"); builder.put("cp1201", "UTF-16BE"); builder.put("utf16_bigendian", "UTF-16BE"); builder.put("x-utf-16le", "UTF-16LE"); builder.put("unicodelittleunmarked", "UTF-16LE"); builder.put("ibm-1202", "UTF-16LE"); builder.put("ibm-1203", "UTF-16LE"); builder.put("ibm-13490", "UTF-16LE"); builder.put("ibm-13491", "UTF-16LE"); builder.put("ibm-17586", "UTF-16LE"); builder.put("ibm-17587", "UTF-16LE"); builder.put("ibm-21682", "UTF-16LE"); builder.put("ibm-21683", "UTF-16LE"); builder.put("ibm-25778", "UTF-16LE"); builder.put("ibm-25779", "UTF-16LE"); builder.put("ibm-29874", "UTF-16LE"); builder.put("ibm-29875", "UTF-16LE"); builder.put("utf16_littleendian", "UTF-16LE"); builder.put("windows-1200", "UTF-16LE"); builder.put("ibm-819", "ISO-8859-1"); builder.put("ibm819", "ISO-8859-1"); builder.put("cp819", "ISO-8859-1"); builder.put("latin1", "ISO-8859-1"); builder.put("8859_1", "ISO-8859-1"); builder.put("csisolatin1", "ISO-8859-1"); builder.put("iso-ir-100", "ISO-8859-1"); builder.put("iso_8859-1:1987", "ISO-8859-1"); builder.put("l1", "ISO-8859-1"); builder.put("819", "ISO-8859-1"); builder.put("ascii", "US-ASCII"); builder.put("ansi_x3.4-1968", "US-ASCII"); builder.put("ansi_x3.4-1986", "US-ASCII"); builder.put("iso_646.irv:1991", "US-ASCII"); builder.put("iso_646.irv:1983", "US-ASCII"); builder.put("iso646-us", "US-ASCII"); builder.put("us", "US-ASCII"); builder.put("csascii", "US-ASCII"); builder.put("iso-ir-6", "US-ASCII"); builder.put("cp367", "US-ASCII"); builder.put("ascii7", "US-ASCII"); builder.put("646", "US-ASCII"); builder.put("windows-20127", "US-ASCII"); builder.put("ibm-367", "US-ASCII"); builder.put("ibm367", "US-ASCII"); builder.put("ibm-912_p100-1995", "ISO-8859-2"); builder.put("ibm-912", "ISO-8859-2"); builder.put("iso_8859-2:1987", "ISO-8859-2"); builder.put("latin2", "ISO-8859-2"); builder.put("csisolatin2", "ISO-8859-2"); builder.put("iso-ir-101", "ISO-8859-2"); builder.put("l2", "ISO-8859-2"); builder.put("8859_2", "ISO-8859-2"); builder.put("cp912", "ISO-8859-2"); builder.put("912", "ISO-8859-2"); builder.put("windows-28592", "ISO-8859-2"); builder.put("ibm-913_p100-2000", "ISO-8859-3"); builder.put("ibm-913", "ISO-8859-3"); builder.put("iso_8859-3:1988", "ISO-8859-3"); builder.put("latin3", "ISO-8859-3"); builder.put("csisolatin3", "ISO-8859-3"); builder.put("iso-ir-109", "ISO-8859-3"); builder.put("l3", "ISO-8859-3"); builder.put("8859_3", "ISO-8859-3"); builder.put("cp913", "ISO-8859-3"); builder.put("913", "ISO-8859-3"); builder.put("windows-28593", "ISO-8859-3"); builder.put("ibm-914_p100-1995", "ISO-8859-4"); builder.put("ibm-914", "ISO-8859-4"); builder.put("latin4", "ISO-8859-4"); builder.put("csisolatin4", "ISO-8859-4"); builder.put("iso-ir-110", "ISO-8859-4"); builder.put("iso_8859-4:1988", "ISO-8859-4"); builder.put("l4", "ISO-8859-4"); builder.put("8859_4", "ISO-8859-4"); builder.put("cp914", "ISO-8859-4"); builder.put("914", "ISO-8859-4"); builder.put("windows-28594", "ISO-8859-4"); builder.put("ibm-915_p100-1995", "ISO-8859-5"); builder.put("ibm-915", "ISO-8859-5"); builder.put("cyrillic", "ISO-8859-5"); builder.put("csisolatincyrillic", "ISO-8859-5"); builder.put("iso-ir-144", "ISO-8859-5"); builder.put("iso_8859-5:1988", "ISO-8859-5"); builder.put("8859_5", "ISO-8859-5"); builder.put("cp915", "ISO-8859-5"); builder.put("915", "ISO-8859-5"); builder.put("windows-28595", "ISO-8859-5"); builder.put("ibm-1089_p100-1995", "ISO-8859-6"); builder.put("ibm-1089", "ISO-8859-6"); builder.put("arabic", "ISO-8859-6"); builder.put("csisolatinarabic", "ISO-8859-6"); builder.put("iso-ir-127", "ISO-8859-6"); builder.put("iso_8859-6:1987", "ISO-8859-6"); builder.put("ecma-114", "ISO-8859-6"); builder.put("asmo-708", "ISO-8859-6"); builder.put("8859_6", "ISO-8859-6"); builder.put("cp1089", "ISO-8859-6"); builder.put("1089", "ISO-8859-6"); builder.put("windows-28596", "ISO-8859-6"); builder.put("iso-8859-6-i", "ISO-8859-6"); builder.put("iso-8859-6-e", "ISO-8859-6"); builder.put("ibm-9005_x110-2007", "ISO-8859-7"); builder.put("ibm-9005", "ISO-8859-7"); builder.put("greek", "ISO-8859-7"); builder.put("greek8", "ISO-8859-7"); builder.put("elot_928", "ISO-8859-7"); builder.put("ecma-118", "ISO-8859-7"); builder.put("csisolatingreek", "ISO-8859-7"); builder.put("iso-ir-126", "ISO-8859-7"); builder.put("iso_8859-7:1987", "ISO-8859-7"); builder.put("windows-28597", "ISO-8859-7"); builder.put("sun_eu_greek", "ISO-8859-7"); builder.put("ibm-813_p100-1995", "ISO-8859-7"); builder.put("ibm-813", "ISO-8859-7"); builder.put("8859_7", "ISO-8859-7"); builder.put("cp813", "ISO-8859-7"); builder.put("813", "ISO-8859-7"); builder.put("ibm-5012_p100-1999", "ISO-8859-8"); builder.put("ibm-5012", "ISO-8859-8"); builder.put("hebrew", "ISO-8859-8"); builder.put("csisolatinhebrew", "ISO-8859-8"); builder.put("iso-ir-138", "ISO-8859-8"); builder.put("iso_8859-8:1988", "ISO-8859-8"); builder.put("iso-8859-8-i", "ISO-8859-8"); builder.put("iso-8859-8-e", "ISO-8859-8"); builder.put("8859_8", "ISO-8859-8"); builder.put("windows-28598", "ISO-8859-8"); builder.put("hebrew8", "ISO-8859-8"); builder.put("ibm-916_p100-1995", "ibm-916"); builder.put("cp916", "ibm-916"); builder.put("916", "ibm-916"); builder.put("ibm-920_p100-1995", "ISO-8859-9"); builder.put("ibm-920", "ISO-8859-9"); builder.put("latin5", "ISO-8859-9"); builder.put("csisolatin5", "ISO-8859-9"); builder.put("iso-ir-148", "ISO-8859-9"); builder.put("iso_8859-9:1989", "ISO-8859-9"); builder.put("l5", "ISO-8859-9"); builder.put("8859_9", "ISO-8859-9"); builder.put("cp920", "ISO-8859-9"); builder.put("920", "ISO-8859-9"); builder.put("windows-28599", "ISO-8859-9"); builder.put("ecma-128", "ISO-8859-9"); builder.put("turkish8", "ISO-8859-9"); builder.put("turkish", "ISO-8859-9"); builder.put("ibm-921_p100-1995", "ISO-8859-13"); builder.put("ibm-921", "ISO-8859-13"); builder.put("8859_13", "ISO-8859-13"); builder.put("windows-28603", "ISO-8859-13"); builder.put("cp921", "ISO-8859-13"); builder.put("921", "ISO-8859-13"); builder.put("ibm-923_p100-1998", "ISO-8859-15"); builder.put("ibm-923", "ISO-8859-15"); builder.put("latin-9", "ISO-8859-15"); builder.put("l9", "ISO-8859-15"); builder.put("8859_15", "ISO-8859-15"); builder.put("latin0", "ISO-8859-15"); builder.put("csisolatin0", "ISO-8859-15"); builder.put("csisolatin9", "ISO-8859-15"); builder.put("iso8859_15_fdis", "ISO-8859-15"); builder.put("cp923", "ISO-8859-15"); builder.put("923", "ISO-8859-15"); builder.put("windows-28605", "ISO-8859-15"); builder.put("ibm-943_p15a-2003", "Shift_JIS"); builder.put("ms_kanji", "Shift_JIS"); builder.put("csshiftjis", "Shift_JIS"); builder.put("windows-31j", "Shift_JIS"); builder.put("cswindows31j", "Shift_JIS"); builder.put("x-sjis", "Shift_JIS"); builder.put("x-ms-cp932", "Shift_JIS"); builder.put("cp932", "Shift_JIS"); builder.put("windows-932", "Shift_JIS"); builder.put("cp943c", "Shift_JIS"); builder.put("ibm-943c", "Shift_JIS"); builder.put("ms932", "Shift_JIS"); builder.put("pck", "Shift_JIS"); builder.put("sjis", "Shift_JIS"); builder.put("s-jis", "Shift_JIS"); builder.put("ibm-943_vsub_vpua", "Shift_JIS"); builder.put("ibm-943_p130-1999", "x-IBM943"); builder.put("ibm-943", "x-IBM943"); builder.put("shift_jis", "x-IBM943"); builder.put("943", "x-IBM943"); builder.put("ibm-943_vascii_vsub_vpu", "x-IBM943"); builder.put("cp943", "x-IBM943"); builder.put("ibm-33722_p120-1999", "x-IBM33722"); builder.put("ibm-5050", "x-IBM33722"); builder.put("cp33722", "x-IBM33722"); builder.put("33722", "x-IBM33722"); builder.put("ibm-33722_vascii_vpua", "x-IBM33722"); builder.put("ibm-954_p101-2007", "x-JISAutoDetect"); builder.put("ibm-954", "x-JISAutoDetect"); builder.put("euc-jp", "x-JISAutoDetect"); builder.put("cseucpkdfmtjapanese", "x-JISAutoDetect"); builder.put("x-euc-jp", "x-JISAutoDetect"); builder.put("eucjis", "x-JISAutoDetect"); builder.put("ujis", "x-JISAutoDetect"); builder.put("windows-950-2000", "Big5"); builder.put("csbig5", "Big5"); builder.put("windows-950", "Big5"); builder.put("x-big5", "Big5"); builder.put("ibm-950_p110-1999", "x-IBM950"); builder.put("ibm-950", "x-IBM950"); builder.put("cp950", "x-IBM950"); builder.put("950", "x-IBM950"); builder.put("ibm-1375_p100-2007", "Big5-HKSCS"); builder.put("ibm-1375", "Big5-HKSCS"); builder.put("big5-hkscs", "Big5-HKSCS"); builder.put("big5hk", "Big5-HKSCS"); builder.put("hkscs-big5", "Big5-HKSCS"); builder.put("ibm-5471_p100-2006", "x-MS950-HKSCS"); builder.put("ibm-5471", "x-MS950-HKSCS"); builder.put("ms950_hkscs", "x-MS950-HKSCS"); builder.put("hkbig5", "x-MS950-HKSCS"); builder.put("big5-hkscs:unicode3.0", "x-MS950-HKSCS"); builder.put("windows-936-2000", "GBK"); builder.put("cp936", "GBK"); builder.put("ms936", "GBK"); builder.put("windows-936", "GBK"); builder.put("ibm-1383_p110-1999", "GB2312"); builder.put("ibm-1383", "GB2312"); builder.put("csgb2312", "GB2312"); builder.put("cp1383", "GB2312"); builder.put("1383", "GB2312"); builder.put("euc-cn", "GB2312"); builder.put("ibm-euccn", "GB2312"); builder.put("hp15cn", "GB2312"); builder.put("ibm-1383_vpua", "GB2312"); builder.put("ibm-964_p110-1999", "x-IBM964"); builder.put("ibm-964", "x-IBM964"); builder.put("euc-tw", "x-IBM964"); builder.put("ibm-euctw", "x-IBM964"); builder.put("cns11643", "x-IBM964"); builder.put("cp964", "x-IBM964"); builder.put("964", "x-IBM964"); builder.put("ibm-964_vpua", "x-IBM964"); builder.put("ibm-949_p110-1999", "x-IBM949"); builder.put("ibm-949", "x-IBM949"); builder.put("cp949", "x-IBM949"); builder.put("949", "x-IBM949"); builder.put("ibm-949_vascii_vsub_vpua", "x-IBM949"); builder.put("ibm-970_p110_p110-2006_u2", "EUC-KR"); builder.put("ibm-970", "EUC-KR"); builder.put("euc-kr", "EUC-KR"); builder.put("ks_c_5601-1987", "EUC-KR"); builder.put("windows-51949", "EUC-KR"); builder.put("cseuckr", "EUC-KR"); builder.put("ibm-euckr", "EUC-KR"); builder.put("ksc_5601", "EUC-KR"); builder.put("5601", "EUC-KR"); builder.put("cp970", "EUC-KR"); builder.put("970", "EUC-KR"); builder.put("ibm-970_vpua", "EUC-KR"); builder.put("windows-949-2000", "x-windows-949"); builder.put("windows-949", "x-windows-949"); builder.put("ks_c_5601-1989", "x-windows-949"); builder.put("csksc56011987", "x-windows-949"); builder.put("korean", "x-windows-949"); builder.put("iso-ir-149", "x-windows-949"); builder.put("ms949", "x-windows-949"); builder.put("windows-874-2000", "x-windows-874"); builder.put("windows-874", "x-windows-874"); builder.put("ms874", "x-windows-874"); builder.put("ibm-874_p100-1995", "x-IBM874"); builder.put("ibm-874", "x-IBM874"); builder.put("ibm-9066", "x-IBM874"); builder.put("cp874", "x-IBM874"); builder.put("tis-620", "x-IBM874"); builder.put("tis620.2533", "x-IBM874"); builder.put("eucth", "x-IBM874"); builder.put("ibm-437_p100-1995", "IBM437"); builder.put("ibm437", "IBM437"); builder.put("cp437", "IBM437"); builder.put("437", "IBM437"); builder.put("cspc8codepage437", "IBM437"); builder.put("windows-437", "IBM437"); builder.put("ibm-737_p100-1997", "x-IBM737"); builder.put("ibm-737", "x-IBM737"); builder.put("ibm737", "x-IBM737"); builder.put("cp737", "x-IBM737"); builder.put("windows-737", "x-IBM737"); builder.put("737", "x-IBM737"); builder.put("ibm-775_p100-1996", "IBM775"); builder.put("ibm-775", "IBM775"); builder.put("ibm775", "IBM775"); builder.put("cp775", "IBM775"); builder.put("cspc775baltic", "IBM775"); builder.put("windows-775", "IBM775"); builder.put("775", "IBM775"); builder.put("ibm-850_p100-1995", "IBM850"); builder.put("ibm-850", "IBM850"); builder.put("ibm850", "IBM850"); builder.put("cp850", "IBM850"); builder.put("850", "IBM850"); builder.put("cspc850multilingual", "IBM850"); builder.put("windows-850", "IBM850"); builder.put("ibm-852_p100-1995", "IBM852"); builder.put("ibm-852", "IBM852"); builder.put("ibm852", "IBM852"); builder.put("cp852", "IBM852"); builder.put("852", "IBM852"); builder.put("cspcp852", "IBM852"); builder.put("windows-852", "IBM852"); builder.put("ibm-855_p100-1995", "IBM855"); builder.put("ibm-855", "IBM855"); builder.put("ibm855", "IBM855"); builder.put("cp855", "IBM855"); builder.put("855", "IBM855"); builder.put("csibm855", "IBM855"); builder.put("cspcp855", "IBM855"); builder.put("windows-855", "IBM855"); builder.put("ibm-856_p100-1995", "x-IBM856"); builder.put("ibm-856", "x-IBM856"); builder.put("ibm856", "x-IBM856"); builder.put("cp856", "x-IBM856"); builder.put("856", "x-IBM856"); builder.put("ibm-857_p100-1995", "IBM857"); builder.put("ibm-857", "IBM857"); builder.put("cp857", "IBM857"); builder.put("857", "IBM857"); builder.put("csibm857", "IBM857"); builder.put("windows-857", "IBM857"); builder.put("ibm-858_p100-1997", "IBM00858"); builder.put("ibm-858", "IBM00858"); builder.put("ccsid00858", "IBM00858"); builder.put("cp00858", "IBM00858"); builder.put("pc-multilingual-850+euro", "IBM00858"); builder.put("cp858", "IBM00858"); builder.put("windows-858", "IBM00858"); builder.put("ibm-860_p100-1995", "IBM860"); builder.put("ibm-860", "IBM860"); builder.put("cp860", "IBM860"); builder.put("860", "IBM860"); builder.put("csibm860", "IBM860"); builder.put("ibm-861_p100-1995", "IBM861"); builder.put("ibm-861", "IBM861"); builder.put("cp861", "IBM861"); builder.put("861", "IBM861"); builder.put("cp-is", "IBM861"); builder.put("csibm861", "IBM861"); builder.put("windows-861", "IBM861"); builder.put("ibm-862_p100-1995", "IBM862"); builder.put("ibm-862", "IBM862"); builder.put("cp862", "IBM862"); builder.put("862", "IBM862"); builder.put("cspc862latinhebrew", "IBM862"); builder.put("dos-862", "IBM862"); builder.put("windows-862", "IBM862"); builder.put("ibm-863_p100-1995", ""); builder.put("ibm-863", "IBM863"); builder.put("cp863", "IBM863"); builder.put("863", "IBM863"); builder.put("csibm863", "IBM863"); builder.put("ibm-864_x110-1999", "IBM864"); builder.put("ibm-864", "IBM864"); builder.put("cp864", "IBM864"); builder.put("csibm864", "IBM864"); builder.put("ibm-865_p100-1995", "IBM865"); builder.put("ibm-865", "IBM865"); builder.put("cp865", "IBM865"); builder.put("865", "IBM865"); builder.put("csibm865", "IBM865"); builder.put("ibm-866_p100-1995", "IBM866"); builder.put("ibm-866", "IBM866"); builder.put("cp866", "IBM866"); builder.put("866", "IBM866"); builder.put("csibm866", "IBM866"); builder.put("windows-866", "IBM866"); builder.put("ibm-868_p100-1995", "IBM868"); builder.put("ibm-868", "IBM868"); builder.put("cp868", "IBM868"); builder.put("868", "IBM868"); builder.put("csibm868", "IBM868"); builder.put("cp-ar", "IBM868"); builder.put("ibm-869_p100-1995", "IBM869"); builder.put("ibm-869", "IBM869"); builder.put("cp869", "IBM869"); builder.put("869", "IBM869"); builder.put("cp-gr", "IBM869"); builder.put("csibm869", "IBM869"); builder.put("windows-869", "IBM869"); builder.put("ibm-878_p100-1996", "KOI8-R"); builder.put("ibm-878", "KOI8-R"); builder.put("koi8-r", "KOI8-R"); builder.put("koi8", "KOI8-R"); builder.put("cskoi8r", "KOI8-R"); builder.put("windows-20866", "KOI8-R"); builder.put("cp878", "KOI8-R"); builder.put("ibm-922_p100-1999", "x-IBM922"); builder.put("ibm-922", "x-IBM922"); builder.put("ibm922", "x-IBM922"); builder.put("cp922", "x-IBM922"); builder.put("922", "x-IBM922"); builder.put("ibm-5346_p100-1998", "windows-1250"); builder.put("ibm-5346", "windows-1250"); builder.put("cp1250", "windows-1250"); builder.put("ibm-5347_p100-1998", "windows-1251"); builder.put("ibm-5347", "windows-1251"); builder.put("cp1251", "windows-1251"); builder.put("ansi1251", "windows-1251"); builder.put("ibm-5348_p100-1997", "windows-1252"); builder.put("ibm-5348", "windows-1252"); builder.put("cp1252", "windows-1252"); builder.put("ibm-5349_p100-1998", "windows-1253"); builder.put("ibm-5349", "windows-1253"); builder.put("cp1253", "windows-1253"); builder.put("ibm-5350_p100-1998", "windows-1254"); builder.put("ibm-5350", "windows-1254"); builder.put("cp1254", "windows-1254"); builder.put("ibm-9447_p100-2002", "windows-1255"); builder.put("ibm-9447", "windows-1255"); builder.put("cp1255", "windows-1255"); builder.put("ibm-9448_x100-2005", "windows-1256"); builder.put("ibm-9448", "windows-1256"); builder.put("cp1256", "windows-1256"); builder.put("ibm-9449_p100-2002", "windows-1257"); builder.put("ibm-9449", "windows-1257"); builder.put("cp1257", "windows-1257"); builder.put("ibm-5354_p100-1998", "windows-1258"); builder.put("ibm-5354", "windows-1258"); builder.put("cp1258", "windows-1258"); builder.put("ibm-1006_p100-1995", "x-IBM1006"); builder.put("ibm-1006", "x-IBM1006"); builder.put("ibm1006", "x-IBM1006"); builder.put("cp1006", "x-IBM1006"); builder.put("1006", "x-IBM1006"); builder.put("ibm-1098_p100-1995", "x-IBM1006"); builder.put("ibm-1098", "x-IBM1006"); builder.put("ibm1098", "x-IBM1006"); builder.put("cp1098", "x-IBM1006"); builder.put("1098", "x-IBM1006"); builder.put("ibm-1124_p100-1996", "x-IBM1124"); builder.put("ibm-1124", "x-IBM1124"); builder.put("cp1124", "x-IBM1124"); builder.put("1124", "x-IBM1124"); builder.put("ISO_2022,locale=ja,version=0", "ISO-2022-JP"); builder.put("iso-2022-jp", "ISO-2022-JP"); builder.put("csiso2022jp", "ISO-2022-JP"); builder.put("ISO_2022,locale=ko,version=0", "ISO-2022-KR"); builder.put("iso-2022-kr", "ISO-2022-KR"); builder.put("csiso2022kr", "ISO-2022-KR"); builder.put("ISO_2022,locale=zh,version=0", "ISO-2022-CN"); builder.put("iso-2022-cn", "ISO-2022-CN"); builder.put("csiso2022cn", "ISO-2022-CN"); builder.put("ibm-37_p100-1995", "IBM037"); builder.put("ibm-37", "IBM037"); builder.put("ibm-037", "IBM037"); builder.put("ebcdic-cp-us", "IBM037"); builder.put("ebcdic-cp-ca", "IBM037"); builder.put("ebcdic-cp-wt", "IBM037"); builder.put("ebcdic-cp-nl", "IBM037"); builder.put("csibm037", "IBM037"); builder.put("cp037", "IBM037"); builder.put("37", "IBM037"); builder.put("cpibm37", "IBM037"); builder.put("cp37", "IBM037"); builder.put("ibm-273_p100-1995", "IBM273"); builder.put("ibm-273", "IBM273"); builder.put("cp273", "IBM273"); builder.put("csibm273", "IBM273"); builder.put("ebcdic-de", "IBM273"); builder.put("273", "IBM273"); builder.put("ibm-277_p100-1995", "IBM277"); builder.put("ibm-277", "IBM277"); builder.put("cp277", "IBM277"); builder.put("ebcdic-cp-dk", "IBM277"); builder.put("ebcdic-cp-no", "IBM277"); builder.put("csibm277", "IBM277"); builder.put("ebcdic-dk", "IBM277"); builder.put("277", "IBM277"); builder.put("ibm-278_p100-1995", "IBM278"); builder.put("ibm-278", "IBM278"); builder.put("cp278", "IBM278"); builder.put("ebcdic-cp-fi", "IBM278"); builder.put("ebcdic-cp-se", "IBM278"); builder.put("csibm278", "IBM278"); builder.put("ebcdic-sv", "IBM278"); builder.put("278", "IBM278"); builder.put("ibm-280_p100-1995", "IBM280"); builder.put("ibm-280", "IBM280"); builder.put("cp280", "IBM280"); builder.put("ebcdic-cp-it", "IBM280"); builder.put("csibm280", "IBM280"); builder.put("280", "IBM280"); builder.put("ibm-284_p100-1995", "IBM284"); builder.put("ibm-284", "IBM284"); builder.put("cp284", "IBM284"); builder.put("ebcdic-cp-es", "IBM284"); builder.put("csibm284", "IBM284"); builder.put("cpibm284", "IBM284"); builder.put("284", "IBM284"); builder.put("ibm-285_p100-1995", "IBM285"); builder.put("ibm-285", "IBM285"); builder.put("cp285", "IBM285"); builder.put("ebcdic-cp-gb", "IBM285"); builder.put("csibm285", "IBM285"); builder.put("cpibm285", "IBM285"); builder.put("ebcdic-gb", "IBM285"); builder.put("285", "IBM285"); builder.put("ibm-297_p100-1995", "IBM297"); builder.put("ibm-297", "IBM297"); builder.put("cp297", "IBM297"); builder.put("ebcdic-cp-fr", "IBM297"); builder.put("csibm297", "IBM297"); builder.put("cpibm297", "IBM297"); builder.put("297", "IBM297"); builder.put("ibm-420_x120-1999", "IBM420"); builder.put("ibm-420", "IBM420"); builder.put("ibm420", "IBM420"); builder.put("cp420", "IBM420"); builder.put("ebcdic-cp-ar1", "IBM420"); builder.put("csibm420", "IBM420"); builder.put("420", "IBM420"); builder.put("ibm-424_p100-1995", "IBM424"); builder.put("ibm-424", "IBM424"); builder.put("cp424", "IBM424"); builder.put("ebcdic-cp-he", "IBM424"); builder.put("csibm424", "IBM424"); builder.put("424", "IBM424"); builder.put("ibm-500_p100-1995", "IBM500"); builder.put("ibm-500", "IBM500"); builder.put("cp500", "IBM500"); builder.put("ebcdic-cp-be", "IBM500"); builder.put("csibm500", "IBM500"); builder.put("ebcdic-cp-ch", "IBM500"); builder.put("500", "IBM500"); builder.put("ibm-838_p100-1995", "IBM-Thai"); builder.put("ibm-838", "IBM-Thai"); builder.put("ibm838", "IBM-Thai"); builder.put("csibmthai", "IBM-Thai"); builder.put("cp838", "IBM-Thai"); builder.put("838", "IBM-Thai"); builder.put("ibm-9030", "IBM-Thai"); builder.put("ibm-870_p100-1995", "IBM870"); builder.put("ibm-870", "IBM870"); builder.put("cp870", "IBM870"); builder.put("ebcdic-cp-roece", "IBM870"); builder.put("ebcdic-cp-yu", "IBM870"); builder.put("csibm870", "IBM870"); builder.put("ibm-871_p100-1995", "IBM871"); builder.put("ibm-871", "IBM871"); builder.put("ebcdic-cp-is", "IBM871"); builder.put("csibm871", "IBM871"); builder.put("cp871", "IBM871"); builder.put("ebcdic-is", "IBM871"); builder.put("871", "IBM871"); builder.put("ibm-875_p100-1995", "x-IBM875"); builder.put("ibm-875", "x-IBM875"); builder.put("ibm875", "x-IBM875"); builder.put("cp875", "x-IBM875"); builder.put("875", "x-IBM875"); builder.put("ibm-918_p100-1995", "IBM918"); builder.put("ibm-918", "IBM918"); builder.put("cp918", "IBM918"); builder.put("ebcdic-cp-ar2", "IBM918"); builder.put("csibm918", "IBM918"); builder.put("ibm-930_p120-1999", "x-IBM930"); builder.put("ibm-930", "x-IBM930"); builder.put("ibm-5026", "x-IBM930"); builder.put("ibm930", "x-IBM930"); builder.put("cp930", "x-IBM930"); builder.put("930", "x-IBM930"); builder.put("ibm-933_p110-1995", "x-IBM933"); builder.put("ibm-933", "x-IBM933"); builder.put("cp933", "x-IBM933"); builder.put("933", "x-IBM933"); builder.put("ibm-935_p110-1999", "x-IBM935"); builder.put("ibm-935", "x-IBM935"); builder.put("cp935", "x-IBM935"); builder.put("935", "x-IBM935"); builder.put("ibm-937_p110-1999", "x-IBM937"); builder.put("ibm-937", "x-IBM937"); builder.put("cp937", "x-IBM937"); builder.put("937", "x-IBM937"); builder.put("ibm-939_p120-1999", "x-IBM939"); builder.put("ibm-939", "x-IBM939"); builder.put("ibm-931", "x-IBM939"); builder.put("ibm-5035", "x-IBM939"); builder.put("ibm939", "x-IBM939"); builder.put("cp939", "x-IBM939"); builder.put("939", "x-IBM939"); builder.put("ibm-1025_p100-1995", "x-IBM1025"); builder.put("ibm-1025", "x-IBM1025"); builder.put("cp1025", "x-IBM1025"); builder.put("1025", "x-IBM1025"); builder.put("ibm-1026_p100-1995", "IBM1026"); builder.put("ibm-1026", "IBM1026"); builder.put("ibm1026", "IBM1026"); builder.put("cp1026", "IBM1026"); builder.put("csibm1026", "IBM1026"); builder.put("1026", "IBM1026"); builder.put("ibm-1047_p100-1995", "IBM1047"); builder.put("ibm-1047", "IBM1047"); builder.put("ibm1047", "IBM1047"); builder.put("cp1047", "IBM1047"); builder.put("1047", "IBM1047"); builder.put("ibm-1097_p100-1995", "x-IBM1097"); builder.put("ibm-1097", "x-IBM1097"); builder.put("cp1097", "x-IBM1097"); builder.put("1097", "x-IBM1097"); builder.put("ibm-1112_p100-1995", "x-IBM1112"); builder.put("ibm-1112", "x-IBM1112"); builder.put("cp1112", "x-IBM1112"); builder.put("1112", "x-IBM1112"); builder.put("ibm-1122_p100-1999", "x-IBM1122"); builder.put("ibm-1122", "x-IBM1122"); builder.put("cp1122", "x-IBM1122"); builder.put("1122", "x-IBM1122"); builder.put("ibm-1123_p100-1995", "x-IBM1123"); builder.put("ibm-1123", "x-IBM1123"); builder.put("cp1123", "x-IBM1123"); builder.put("1123", "x-IBM1123"); builder.put("ibm-1140_p100-1997", "IBM01140"); builder.put("ibm-1140", "IBM01140"); builder.put("ccsid01140", "IBM01140"); builder.put("cp01140", "IBM01140"); builder.put("cp1140", "IBM01140"); builder.put("ebcdic-us-37+euro", "IBM01140"); builder.put("ibm-1141_p100-1997", "IBM01141"); builder.put("ibm-1141", "IBM01141"); builder.put("ccsid01141", "IBM01141"); builder.put("cp01141", "IBM01141"); builder.put("cp1141", "IBM01141"); builder.put("ebcdic-de-273+euro", "IBM01141"); builder.put("ibm-1142_p100-1997", "IBM01142"); builder.put("ibm-1142", "IBM01142"); builder.put("ccsid01142", "IBM01142"); builder.put("cp01142", "IBM01142"); builder.put("cp1142", "IBM01142"); builder.put("ebcdic-dk-277+euro", "IBM01142"); builder.put("ebcdic-no-277+euro", "IBM01142"); builder.put("ibm-1143_p100-1997", "IBM01143"); builder.put("ibm-1143", "IBM01143"); builder.put("ccsid01143", "IBM01143"); builder.put("cp01143", "IBM01143"); builder.put("cp1143", "IBM01143"); builder.put("ebcdic-fi-278+euro", "IBM01143"); builder.put("ebcdic-se-278+euro", "IBM01143"); builder.put("ibm-1144_p100-1997", "IBM01144"); builder.put("ibm-1144", "IBM01144"); builder.put("ccsid01144", "IBM01144"); builder.put("cp01144", "IBM01144"); builder.put("cp1144", "IBM01144"); builder.put("ebcdic-it-280+euro", "IBM01144"); builder.put("ibm-1145_p100-1997", "IBM01145"); builder.put("ibm-1145", "IBM01145"); builder.put("ccsid01145", "IBM01145"); builder.put("cp01145", "IBM01145"); builder.put("cp1145", "IBM01145"); builder.put("ebcdic-es-284+euro", "IBM01145"); builder.put("ibm-1146_p100-1997", "IBM01146"); builder.put("ibm-1146", "IBM01146"); builder.put("ccsid01146", "IBM01146"); builder.put("cp01146", "IBM01146"); builder.put("cp1146", "IBM01146"); builder.put("ebcdic-gb-285+euro", "IBM01146"); builder.put("ibm-1147_p100-1997", "IBM01147"); builder.put("ibm-1147", "IBM01147"); builder.put("ccsid01147", "IBM01147"); builder.put("cp01147", "IBM01147"); builder.put("cp1147", "IBM01147"); builder.put("ebcdic-fr-297+euro", "IBM01147"); builder.put("ibm-1148_p100-1997", "IBM01148"); builder.put("ibm-1148", "IBM01148"); builder.put("ccsid01148", "IBM01148"); builder.put("cp01148", "IBM01148"); builder.put("cp1148", "IBM01148"); builder.put("ebcdic-international-500+euro", "IBM01148"); builder.put("ibm-1149_p100-1997", "IBM01149"); builder.put("ibm-1149", "IBM01149"); builder.put("ccsid01149", "IBM01149"); builder.put("cp01149", "IBM01149"); builder.put("cp1149", "IBM01149"); builder.put("ebcdic-is-871+euro", "IBM01149"); aliasTable = builder.build(); } }