package org.caudexorigo.text; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.Charset; import java.nio.charset.UnsupportedCharsetException; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.caudexorigo.io.UnsynchronizedBufferedReader; import org.caudexorigo.io.UnsynchronizedByteArrayInputStream; /** * Class to detect the encoding of the input * * @see http://diveintomark.org/archives/2004/02/13/xml-media-types * @author david * */ public class DetectEncoding { class Magic { byte[] magic; boolean hasBOM; String name; public Magic(String name, boolean hasBOM, byte a, byte b, byte c, byte d) { this.magic = new byte[4]; this.magic[0] = a; this.magic[1] = b; this.magic[2] = c; this.magic[3] = d; this.hasBOM = hasBOM; this.name = name; } public Magic(String name, boolean hasBOM, byte a, byte b, byte c) { this.magic = new byte[3]; this.magic[0] = a; this.magic[1] = b; this.magic[2] = c; this.hasBOM = hasBOM; this.name = name; } public Magic(String name, boolean hasBOM, byte a, byte b) { this.magic = new byte[2]; this.magic[0] = a; this.magic[1] = b; this.hasBOM = hasBOM; this.name = name; } } final Magic[] magics = { // UCS-4, big-endian machine (1234 order) new Magic("utf-32be", true, (byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF), // UCS-4, little-endian machine (4321 order) new Magic("utf-32le", true, (byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00), // UCS-4, unusual octet order (2143) new Magic("USC-4odd", true, (byte) 0x00, (byte) 0x00, (byte) 0xFF, (byte) 0xFE), // UCS-4, unusual octet order (3412) new Magic("USC-4odder", true, (byte) 0xFE, (byte) 0xFF, (byte) 0x00, (byte) 0x00), // UTF-16, big-endian new Magic("utf-16be", true, (byte) 0xFE, (byte) 0xFF), // UTF-16, little-endian new Magic("utf-16le", true, (byte) 0xFF, (byte) 0xFE), // UTF-8 new Magic("utf-8", true, (byte) 0xEF, (byte) 0xBB, (byte) 0xBF), // UCS-4 or other encoding with a 32-bit code unit and ASCII // characters encoded as ASCII values, in respectively big-endian // (1234), little-endian (4321) and two unusual byte orders (2143 // and 3412). The encoding declaration must be read to determine // which of UCS-4 or other supported 32-bit encodings applies. new Magic("utf-32be", false, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x3C), new Magic("utf-32le", false, (byte) 0x3C, (byte) 0x00, (byte) 0x00, (byte) 0x00), new Magic("USC-4 ASCII 2143", false, (byte) 0x00, (byte) 0x00, (byte) 0x3C, (byte) 0x00), new Magic("USC-4 ASCII 3412", false, (byte) 0x00, (byte) 0x3C, (byte) 0x00, (byte) 0x00), // UTF-16BE or big-endian ISO-10646-UCS-2 or other encoding with a // 16-bit code unit in big-endian order and ASCII characters encoded // as ASCII values (the encoding declaration must be read to // determine which) new Magic("utf-16be", false, (byte) 0x00, (byte) 0x3C, (byte) 0x00, (byte) 0x3F), // UTF-16LE or little-endian ISO-10646-UCS-2 or other // encoding with a 16-bit code unit in little-endian order // and ASCII characters encoded as ASCII values (the // encoding declaration must be read to determine which) new Magic("utf-16le", false, (byte) 0x3C, (byte) 0x00, (byte) 0x3F, (byte) 0x00), // UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, // EUC, or any other 7-bit, 8-bit, or mixed-width encoding // which ensures that the characters of ASCII have their // normal positions, width, and values; the actual encoding // declaration must be read to detect which of these // applies, but since all of these encodings use the same // bit patterns for the relevant ASCII characters, the // encoding declaration itself may be read reliably new Magic("UTF-8", false, (byte) 0x3C, (byte) 0x3F, (byte) 0x78, (byte) 0x6D), // EBCDIC (in some flavor; the full encoding declaration // must be read to tell which code page is in use) new Magic("EBCDIC", false, (byte) 0x4C, (byte) 0x6F, (byte) 0xA7, (byte) 0x94), // Other UTF-8 without an encoding declaration, or else the // data stream is mislabeled (lacking a required encoding // declaration), corrupt, fragmentary, or enclosed in a // wrapper of some kind }; static Map<String, String> encoding_map = getEncodingMap(); String defaultEncoding; private boolean error = false; private int bomLength = 0; public DetectEncoding(String defaultEncoding) { this.defaultEncoding = defaultEncoding; } public String detect(String filename) throws FileNotFoundException { return detect(new FileInputStream(filename)); } private static Map<String, String> getEncodingMap() { Map<String, String> map = new HashMap<String, String>(); map.put("windows_1250", "windows-1250"); map.put("windows_1251", "windows-1251"); map.put("windows_1252", "windows-1252"); map.put("windows_1253", "windows-1253"); map.put("windows_1254", "windows-1254"); map.put("windows_1255", "windows-1255"); map.put("windows_1256", "windows-1256"); map.put("windows_1257", "windows-1257"); map.put("windows_1258", "windows-1258"); map.put("ms-ee", "windows-1250"); map.put("ms-cyrl", "windows-1251"); map.put("ms-ansi", "windows-1252"); map.put("ms-greek", "windows-1253"); map.put("ms-turk", "windows-1254"); map.put("ms-hebr", "windows-1255"); map.put("ms-arab", "windows-1256"); map.put("winbaltrim", "windows-1257"); map.put("maclatin2", "x-MacRoman"); map.put("macintosh", "x-MacRoman"); map.put("csmacintosh", "x-MacRoman"); map.put("mac", "x-MacRoman"); map.put("mac-cyrillic", "x-MacCyrillic"); map.put("us-ascii", "US-ASCII"); map.put("cspc775baltic", "IBM775"); map.put("iso-10646-ucs-2", "utf-16be"); map.put("ebcdic_cp_be", "CP500"); map.put("ebcdic_cp_us", "IBM037"); map.put("ebcdic_cp_ca", "IBM037"); map.put("ebcdic_cp_nl", "IBM037"); map.put("ebcdic_cp_wt", "IBM037"); map.put("ebcdic_cp_dk", "CP277"); map.put("ebcdic_cp_no", "CP277"); map.put("ebcdic_cp_fi", "CP278"); map.put("ebcdic_cp_se", "CP278"); map.put("ebcdic_cp_it", "CP280"); map.put("ebcdic_cp_es", "CP284"); map.put("ebcdic_cp_gb", "CP285"); map.put("ebcdic_cp_fr", "CP297"); map.put("ebcdic_cp_ch", "CP500"); map.put("ebcdic_cp_be", "CP500"); map.put("ebcdic-cp-be", "CP500"); map.put("ebcdic-cp-us", "IBM037"); map.put("ebcdic-cp-ca", "IBM037"); map.put("ebcdic-cp-nl", "IBM037"); map.put("ebcdic-cp-wt", "IBM037"); map.put("ebcdic-cp-dk", "CP277"); map.put("ebcdic-cp-no", "CP277"); map.put("ebcdic-cp-fi", "CP278"); map.put("ebcdic-cp-se", "CP278"); map.put("ebcdic-cp-it", "CP280"); map.put("ebcdic-cp-es", "CP284"); map.put("ebcdic-cp-gb", "CP285"); map.put("ebcdic-cp-fr", "CP297"); map.put("ebcdic-cp-ch", "CP500"); map.put("ebcdic-cp-be", "CP500"); map.put("ibm039", "IBM037"); map.put("ibm1140", "IBM037"); map.put("dbcs", "CP1252"); // This isn't really true, but it's as close as we're going to get // http://www.haible.de/bruno/charsets/conversion-tables/CP1125.html map.put("cp1125", "CP866"); map.put("cp_is", "cp861"); map.put("IBM277", "CP277"); map.put("CP277", "CP277"); map.put("CSIBM277", "CP277"); map.put("IBM278", "CP278"); map.put("CP278", "CP278"); map.put("CSIBM278", "CP278"); map.put("IBM280", "CP280"); map.put("CP280", "CP280"); map.put("CSIBM280", "CP280"); map.put("IBM284", "CP284"); map.put("CP284", "CP284"); map.put("CSIBM284", "CP284"); map.put("IBM285", "CP285"); map.put("CP285", "CP285"); map.put("CSIBM285", "CP285"); map.put("IBM297", "CP297"); map.put("CP297", "CP297"); map.put("CSIBM297", "CP297"); map.put("IBM420", "CP420"); map.put("CP420", "CP420"); map.put("CSIBM420", "CP420"); map.put("ebcdic-cp-ar1", "CP420"); map.put("ebcdic-cp-he", "CP424"); map.put("IBM424", "CP424"); map.put("CP424", "CP424"); map.put("CSIBM424", "CP424"); map.put("IBM500", "CP500"); map.put("CP500", "CP500"); map.put("CSIBM500", "CP500"); map.put("csibm855", "CP855"); map.put("IBM868", "CP868"); map.put("CP868", "CP868"); map.put("CSIBM868", "CP868"); map.put("CP-AR", "CP868"); map.put("IBM869", "CP869"); map.put("CP869", "CP869"); map.put("CSIBM869", "CP869"); map.put("CP-GR", "CP869"); map.put("IBM870", "CP870"); map.put("CP870", "CP870"); map.put("CSIBM870", "CP870"); map.put("IBM871", "CP871"); map.put("CP871", "CP871"); map.put("CSIBM871", "CP871"); map.put("ebcdic-cp-is", "CP871"); map.put("IBM918", "CP918"); map.put("CP918", "CP918"); map.put("CSIBM918", "CP918"); map.put("ebcdic-cp-ar2", "CP918"); map.put("EUC-JP", "EUCJIS"); map.put("CSEUCPkdFmtJapanese", "EUCJIS"); map.put("EUC-KR", "KSC5601"); map.put("GB2312", "GB2312"); map.put("CSGB2312", "GB2312"); map.put("ISO-2022-JP", "JIS"); map.put("CSISO2022JP", "JIS"); map.put("ISO-2022-KR", "ISO2022KR"); map.put("CSISO2022KR", "ISO2022KR"); map.put("ISO-2022-CN", "ISO2022CN"); return map; } public String detect(InputStream in) { byte[] data = new byte[100]; try { in.read(data, 0, 100); } catch (IOException e) { e.printStackTrace(); } return detect(data); } public String detect(byte[] data) { String encoding = null; // check for a <? as the first two byte if (data[0] == 0x3c && data[1] == 0x3f) { encoding = detectUsingXML(data); } else { encoding = detectUsingMagic(data); if ("EBCDIC".equals(encoding)) { encoding = detectEBCDIC(data); } } encoding = alias(encoding); if (encoding == null) { encoding = defaultEncoding; } try { Charset charset = Charset.forName(encoding); } catch (UnsupportedCharsetException ex) { encoding = defaultEncoding; error = true; } return encoding; } private String detectEBCDIC(byte[] data) { data = asciiToEbcdic(data); return detectUsingXML(data); } public String alias(final String encoding) { String new_encoding = encoding; if (encoding_map.containsKey(encoding)) { new_encoding = encoding_map.get(encoding); } return new_encoding; } private String detectUsingXML(byte[] data) { String encoding = null; try { UnsynchronizedBufferedReader reader = new UnsynchronizedBufferedReader(new InputStreamReader(new UnsynchronizedByteArrayInputStream(data))); String str = reader.readLine(); Pattern pattern = Pattern.compile(".*encoding=\"([^\"]*)\".*"); Matcher matcher = pattern.matcher(str); if (matcher.matches()) { encoding = matcher.group(1); } } catch (IOException e) { e.printStackTrace(); } return encoding; } private String detectUsingMagic(byte[] data) { String encoding = null; int index = 0; for (Magic magic : magics) { if (checkArray(data, magic.magic)) { encoding = magic.name; if (magic.hasBOM) { bomLength = magic.magic.length; } break; } index++; } return encoding; } private boolean checkArray(byte[] a, byte[] b) { boolean equals = true; for (int i = 0; i < b.length; ++i) { if (a[i] != b[i]) { equals = false; break; } } return equals; } private byte[] asciiToEbcdic(byte[] data) { int[] map = { 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15, 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31, 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7, 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26, 32, 160, 161, 162, 163, 164, 165, 166, 167, 168, 91, 46, 60, 40, 43, 33, 38, 169, 170, 171, 172, 173, 174, 175, 176, 177, 93, 36, 42, 41, 59, 94, 45, 47, 178, 179, 180, 181, 182, 183, 184, 185, 124, 44, 37, 95, 62, 63, 186, 187, 188, 189, 190, 191, 192, 193, 194, 96, 58, 35, 64, 39, 61, 34, 195, 97, 98, 99, 100, 101, 102, 103, 104, 105, 196, 197, 198, 199, 200, 201, 202, 106, 107, 108, 109, 110, 111, 112, 113, 114, 203, 204, 205, 206, 207, 208, 209, 126, 115, 116, 117, 118, 119, 120, 121, 122, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 123, 65, 66, 67, 68, 69, 70, 71, 72, 73, 232, 233, 234, 235, 236, 237, 125, 74, 75, 76, 77, 78, 79, 80, 81, 82, 238, 239, 240, 241, 242, 243, 92, 159, 83, 84, 85, 86, 87, 88, 89, 90, 244, 245, 246, 247, 248, 249, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 250, 251, 252, 253, 254, 255 }; for (int i = 0; i < data.length; i++) { data[i] = (byte) map[data[i] > 0 ? data[i] : 256 + data[i]]; } return data; } public void stripBOM(InputStream is) throws IOException { for (int i = 0; i < bomLength; i++) { is.read(); } } public boolean isError() { return error; } }