/*
 * Copyright 2007-2009 Medsea Business Solutions S.L.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.medsea.util;

import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeSet;
import java.util.logging.Level;

import com.delcyon.capo.CapoApplication;

/**
 * This class contains a list of known encodings used by TextMimeType. It is
 * used by the TextMimeDetector but can be used as a stand alone utility class
 * in other parts of your program if you want.
 * <p>
 * The getPossibleEncodings() method takes a byte [] as its source and the
 * bigger the array the better the detection ratio will be.
 * </p>
 * <p>
 * The class is initialised with an empty list of encodings so it is effectively
 * disabled by default. You can set the supported encodings to ALL of the
 * encodings supported by your JVM at any point during your program execution
 * using
 * <code>EncodingGuesser.setSupportedEncodings(EncodingGuesser.getCanonicalEncodingNamesSupportedByJVM());</code>
 * You can also clear the encodings and disable the detector at any point by
 * calling <code>EncodingGuesser.setSupportedEncodings(new ArrayList())</code>.
 * If later on you dynamically add more encodings to the JVM they will NOT be
 * detected automatically by this class, but you can call the above method
 * again.
 * </p>
 * <p>
 * As the JVM can have a large number of encodings and each one is checked
 * against the byte array it may be wise to remove all encodings you are sure
 * you will not use to trim down on the number of tests. Detection will not
 * stop at the first match but will try to match as many encodings as possible
 * and return them as a Collection.
 * </p>
 * <p>
 * A common scenario is where an application can handle only a small set of text
 * encodings such as UTF-8 and windows-1252. If this is your case you can use
 * the setSupportedEncodings() method so that these are the only encodings in
 * the supported encodings Collection. This will dramatically improve the
 * performance of this class.
 * </p>
 * <p>
 * It's possible that small byte arrays that should contain binary data are
 * considered possible text matches but generally binary data, such as images,
 * should return no matches.
 * </p>
 * <p>
 * There are some optimisations that are applicable to text files containing
 * BOMs (Byte Order Marks) such as UTF-8, UTF-16LE, UTF-16BE, UTF-32LE and
 * UTF-32BE. These are not required but if present will greatly improve the
 * resultant possible matches returned from the getPossibleEncodings() method.
 * </p>
 * <p>
 * All state is held in static collections and no synchronisation is performed
 * by this class. The collections are deliberately declared with raw types so
 * that the public method signatures stay unchanged for existing callers; the
 * elements of every encoding Collection are expected to be {@link String}s.
 * </p>
 */
public class EncodingGuesser {

	// We want the CANONICAL name of the default Charset for the JVM.
	private static final String defaultJVMEncoding = Charset.forName(
			new java.io.OutputStreamWriter(new java.io.ByteArrayOutputStream()).getEncoding()).name();

	// Canonical names (Strings) of the encodings that will be tried, kept sorted.
	private static final Collection supportedEncodings = new TreeSet();

	// Known BOMs (byte[]) keyed by canonical encoding name (String).
	private static final Map boms = new HashMap();

	/*
	 * Register the known BOMs. The supported encodings are intentionally left
	 * empty here, so detection is switched off by default. To enable it for
	 * every encoding of the JVM call
	 * setSupportedEncodings(getCanonicalEncodingNamesSupportedByJVM()).
	 */
	static {
		// Initialise some known BOM(s) keyed by their canonical encoding name.
		boms.put("UTF-32BE", new byte[] { (byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF });
		boms.put("UTF-32LE", new byte[] { (byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00 });
		boms.put("UTF-16BE", new byte[] { (byte) 0xFE, (byte) 0xFF });
		boms.put("UTF-16LE", new byte[] { (byte) 0xFF, (byte) 0xFE });
		boms.put("UTF-8", new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
		// We may need to cater for the next char as well, which can be one
		// of [38 | 39 | 2B | 2F]
		boms.put("UTF-7", new byte[] { (byte) 0x2B, (byte) 0x2F, (byte) 0x76 });
		boms.put("UTF-1", new byte[] { (byte) 0xF7, (byte) 0x64, (byte) 0x4C });
		boms.put("UTF-EBCDIC", new byte[] { (byte) 0xDD, (byte) 0x73, (byte) 0x66, (byte) 0x73 });
		boms.put("SCSU", new byte[] { (byte) 0x0E, (byte) 0xFE, (byte) 0xFF });
		// Optionally followed by 0xFF
		boms.put("BOCU-1", new byte[] { (byte) 0xFB, (byte) 0xEE, (byte) 0x28 });
	}

	/**
	 * Check if the encoding String is one of the encodings supported.
	 *
	 * @param encoding
	 *            the encoding name to look up; may be null
	 * @return true if encoding is in the supported encodings, false otherwise
	 *         (including when encoding is null)
	 */
	public static boolean isKnownEncoding(String encoding) {
		// The backing TreeSet rejects null look-ups, so answer those here.
		return encoding != null && supportedEncodings.contains(encoding);
	}

	/**
	 * Get a Collection of all the possible encodings this byte array could be
	 * used to represent.
	 * <p>
	 * Each supported encoding is tried in turn: the data (minus any BOM known
	 * for that encoding) is decoded to a String, the last character is dropped
	 * because the data may end part way through a character, and the result is
	 * encoded again. The encoding is a candidate only if the re-encoded bytes
	 * equal the beginning of the original data. If the data starts with the
	 * BOM registered for a candidate, only that encoding is returned.
	 * </p>
	 *
	 * @param data
	 *            the bytes to examine; null or empty yields an empty result
	 * @return a new sorted Collection of the possible encoding names taken
	 *         from the supported encodings; never null
	 */
	public static Collection getPossibleEncodings(byte[] data) {
		Collection possibleEncodings = new TreeSet();
		if (data == null || data.length == 0) {
			return possibleEncodings;
		}

		// NOTE(review): encodings are visited in sorted order and the
		// UTF-16LE BOM (FF FE) is a prefix of the UTF-32LE BOM (FF FE 00 00),
		// so BOM-prefixed UTF-32LE data is tested as UTF-16LE first - confirm
		// whether that ambiguity matters for callers.
		String encoding = null;
		for (Iterator it = supportedEncodings.iterator(); it.hasNext();) {
			try {
				encoding = (String) it.next();

				// Either 0 or the length of the BOM found at the start of data.
				int lengthBOM = getLengthBOM(encoding, data);

				// Decode everything after the BOM without copying the array.
				String test = new String(data, lengthBOM, data.length - lengthBOM, encoding);

				// The data may be terminated by an incomplete character for
				// this encoding, so drop exactly one trailing character. This
				// is done per code point so a surrogate pair is never split,
				// and only when more than one char is present.
				if (test.length() > 1) {
					test = test.substring(0, test.offsetByCodePoints(test.length(), -1));
				}

				byte[] compare;
				try {
					compare = test.getBytes(encoding);
				} catch (UnsupportedOperationException ignored) {
					// Thrown when encoding is not supported for this charset,
					// so a round trip is impossible; skip the encoding.
					continue;
				}

				// The re-encoded bytes must match the start of the source.
				if (!compareByteArrays(data, lengthBOM, compare, 0, compare.length)) {
					continue;
				}

				if (lengthBOM != 0) {
					// A matching BOM is treated as a perfect match: ditch
					// the rest and return just this encoding.
					possibleEncodings.clear();
					possibleEncodings.add(encoding);
					return possibleEncodings;
				}

				// This is a possible match.
				possibleEncodings.add(encoding);
			} catch (UnsupportedEncodingException uee) {
				CapoApplication.logger.log(Level.SEVERE, "The encoding [" + encoding + "] is not supported by your JVM.");
			} catch (Exception e) {
				// Log the error but carry on with the next encoding.
				CapoApplication.logger.log(Level.SEVERE, e.getLocalizedMessage(), e);
			}
		}
		return possibleEncodings;
	}

	/**
	 * Allows you to remove an encoding from the supported encodings you are not
	 * interested in.
	 *
	 * @param encoding
	 *            the encoding name to remove; may be null
	 * @return true if removed else false (including when encoding is null)
	 */
	public static boolean removeEncoding(String encoding) {
		// The backing TreeSet rejects null, so a null name is never present.
		return encoding != null && supportedEncodings.remove(encoding);
	}

	/**
	 * Remove all valid encodings in the string array.
	 *
	 * @param encodings
	 *            String [] containing the encodings to remove; may be null
	 * @return true if at least one of the encodings was removed else false
	 */
	public static boolean removeEncodings(String[] encodings) {
		if (encodings == null) {
			return false;
		}
		boolean removedAtLeastOne = false;
		for (int i = 0; i < encodings.length; i++) {
			if (removeEncoding(encodings[i])) {
				removedAtLeastOne = true;
			}
		}
		return removedAtLeastOne;
	}

	/**
	 * Get a Collection containing entries in both the supported encodings and
	 * the passed in String [] of encodings. This is used by TextMimeDetector to
	 * get a valid list of the preferred encodings.
	 *
	 * @param encodings
	 *            the candidate encoding names; may be null and may contain
	 *            null entries, which are skipped
	 * @return a new Collection containing, in array order, all entries of the
	 *         passed in array that are supported encodings; never null
	 */
	public static Collection getValidEncodings(String[] encodings) {
		Collection valid = new ArrayList();
		if (encodings == null) {
			return valid;
		}
		for (int i = 0; i < encodings.length; i++) {
			if (isKnownEncoding(encodings[i])) {
				valid.add(encodings[i]);
			}
		}
		return valid;
	}

	/**
	 * Get the JVM default canonical encoding. For instance the canonical
	 * encoding for cp1252 is windows-1252.
	 *
	 * @return the default canonical encoding name for the JVM
	 */
	public static String getDefaultEncoding() {
		return EncodingGuesser.defaultJVMEncoding;
	}

	/**
	 * Get the Collection of currently supported encodings. This is the live
	 * internal Collection, not a copy, so changes made to it affect this class.
	 *
	 * @return the supported encodings.
	 */
	public static Collection getSupportedEncodings() {
		return supportedEncodings;
	}

	/**
	 * Set the supported encodings.
	 *
	 * @param encodings
	 *            the new encoding names. If this is null the supported
	 *            encodings are left unchanged.
	 * @return a copy of the supported encodings as they were before this call
	 */
	public static Collection setSupportedEncodings(Collection encodings) {
		Collection previous = new TreeSet();
		previous.addAll(supportedEncodings);

		if (encodings != null) {
			// Snapshot the argument before clearing: it may be the live
			// Collection handed out by getSupportedEncodings(), in which case
			// clearing first would silently discard every encoding.
			Collection replacement = new ArrayList(encodings);
			supportedEncodings.clear();
			supportedEncodings.addAll(replacement);
		}
		return previous;
	}

	/**
	 * Get the length of a BOM for this encoding and byte array.
	 *
	 * @param encoding
	 *            canonical encoding name used to look up the known BOM
	 * @param data
	 *            the bytes whose beginning is checked; may be null
	 * @return length of BOM if the data starts with the BOM known for the
	 *         encoding, else 0 (also when no BOM is known or data is null)
	 */
	public static int getLengthBOM(String encoding, byte[] data) {
		byte[] bom = (byte[]) boms.get(encoding);
		if (bom == null || data == null) {
			return 0;
		}
		return compareByteArrays(bom, 0, data, 0, bom.length) ? bom.length : 0;
	}

	/**
	 * Get a sub array of this byte array starting at offset and containing
	 * length bytes.
	 *
	 * @param a
	 *            the source array
	 * @param offset
	 *            index of the first byte to copy
	 * @param length
	 *            number of bytes to copy
	 * @return a new byte array holding the requested region, unless the region
	 *         would extend beyond the end of the original array in which case
	 *         the original array itself is returned
	 */
	public static byte[] getByteArraySubArray(byte[] a, int offset, int length) {
		if (offset + length > a.length) {
			return a;
		}
		byte[] data = new byte[length];
		System.arraycopy(a, offset, data, 0, length);
		return data;
	}

	/**
	 * Utility method to compare a region of two byte arrays for equality.
	 *
	 * @param a
	 *            first array
	 * @param aOffset
	 *            start of the region in a
	 * @param b
	 *            second array
	 * @param bOffset
	 *            start of the region in b
	 * @param length
	 *            number of bytes to compare
	 * @return true if the two regions contain the same byte values; false if
	 *         they differ, if either array is null, if an offset or the length
	 *         is negative, or if a region would extend beyond its array
	 */
	public static boolean compareByteArrays(byte[] a, int aOffset, byte[] b, int bOffset, int length) {
		if (a == null || b == null || aOffset < 0 || bOffset < 0 || length < 0) {
			return false;
		}
		// Written as a subtraction so that offset + length cannot overflow.
		if (aOffset > a.length - length || bOffset > b.length - length) {
			// would match beyond one of the arrays
			return false;
		}
		for (int i = 0; i < length; i++) {
			if (a[aOffset + i] != b[bOffset + i]) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Utility method to get all of the current encoding names, in canonical
	 * format, supported by your JVM at the time this is called.
	 *
	 * @return a new sorted Collection of canonical encoding names
	 */
	public static Collection getCanonicalEncodingNamesSupportedByJVM() {
		SortedMap charSets = Charset.availableCharsets();
		Collection encodings = new TreeSet();
		encodings.addAll(charSets.keySet());

		CapoApplication.logger.fine("The following [" + encodings.size() + "] encodings will be used: " + encodings);
		return encodings;
	}
}