/*
* Copyright 2007-2009 Medsea Business Solutions S.L.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.medsea.util;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeSet;
import java.util.logging.Level;

import com.delcyon.capo.CapoApplication;
/**
* This class contains a list of known encodings used by TextMimeType. It is
* used by the TextMimeDetector but can be used as a stand alone utility class
* in other parts of your program if you want.
* <p>
* The getPossibleEncodings() method takes a byte [] as its source and the
* bigger the array the better the detection ratio will be.
* </p>
* <p>
* The class is initialised with an empty list of encodings so it is effectively
* disabled by default. You can set the supported encodings to ALL of the
* encodings supported by your JVM at any point during your program execution
* using the following method
* EncodingGuesser.setSupportedEncodings(EncodingGuesser
* .getCanonicalEncodingNamesSupportedByJVM()); You can also clear the encodings
* and disable the detector at any point by calling
* EncodingGuesser.setSupportedEncodings(new ArrayList()). If later on you
* dynamically add more encodings they will NOT be detected automatically by
* this class but you can recall the above method.
* </p>
* <p>
* As the JVM can have a large number of encodings and each one is checked
* against the byte array it may be wise to remove all encodings you are sure
* you will not use to trim down on the number of tests. It will not stop at the
* first match but will try to match as many encodings as possible and return
* this as a Collection.
* </p>
* <p>
* A common scenario is where an application can handle only a small set of text
* encodings such as UTF-8 and windows-1252. If this is your case you can use
* the setSupportedEncodings() method so that these are the only encodings in
* the supported encodings Collection. This will dramatically improve the
* performance of this class.
* </p>
* <p>
* It's possible that small byte arrays that should contain binary data are
* considered possible text matches but generally binary data, such as images,
* should return no matches.
* </p>
* <p>
* There are some optimisations that are applicable to text files containing
* BOM's (Byte Order Marks) such as UTF-8, UTF-16LE, UTF-16BE, UTF-32LE and
* UTF-32BE. These are not required but if present will greatly improve the
* resultant possible matches returned from the getPossibleEncodings() method.
* </p>
*/
public class EncodingGuesser
{
    // NOTE(review): a serialVersionUID used to live here, but this class is
    // not Serializable, so the field was dead and has been removed.

    // Canonical name of the JVM's default Charset. Charset.defaultCharset()
    // (Java 5+) replaces the old OutputStreamWriter.getEncoding() trick;
    // name() already returns the canonical form (e.g. windows-1252 for cp1252).
    private static String defaultJVMEncoding = Charset.defaultCharset().name();

    // Encodings currently considered by getPossibleEncodings(). Starts empty,
    // which effectively disables detection until a caller populates it via
    // setSupportedEncodings().
    private static Collection<String> supportedEncodings = new TreeSet<String>();

    // Known BOM (Byte Order Mark) byte sequences keyed by their canonical
    // encoding name. Populated once in the static initialiser below.
    private static Map<String, byte[]> boms = new HashMap<String, byte[]>();

    static
    {
        // Detection is switched off by default. If you want to initialise
        // with all encodings supported by your JVM, un-comment the next line:
        // EncodingGuesser.supportedEncodings = getCanonicalEncodingNamesSupportedByJVM();

        // Initialise some known BOM(s) keyed by their canonical encoding name.
        boms.put("UTF-32BE", new byte[] { (byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF });
        boms.put("UTF-32LE", new byte[] { (byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00 });
        boms.put("UTF-16BE", new byte[] { (byte) 0xFE, (byte) 0xFF });
        boms.put("UTF-16LE", new byte[] { (byte) 0xFF, (byte) 0xFE });
        boms.put("UTF-8", new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
        // UTF-7: we may need to cater for the next char as well, which can be
        // one of [0x38 | 0x39 | 0x2B | 0x2F].
        boms.put("UTF-7", new byte[] { (byte) 0x2B, (byte) 0x2F, (byte) 0x76 });
        boms.put("UTF-1", new byte[] { (byte) 0xF7, (byte) 0x64, (byte) 0x4C });
        boms.put("UTF-EBCDIC", new byte[] { (byte) 0xDD, (byte) 0x73, (byte) 0x66, (byte) 0x73 });
        boms.put("SCSU", new byte[] { (byte) 0x0E, (byte) 0xFE, (byte) 0xFF });
        // BOCU-1: optionally followed by 0xFF.
        boms.put("BOCU-1", new byte[] { (byte) 0xFB, (byte) 0xEE, (byte) 0x28 });
    }

    /**
     * Check if the encoding String is one of the encodings supported.
     *
     * @param encoding canonical encoding name
     * @return true if encoding is currently in the supported encodings
     */
    public static boolean isKnownEncoding(String encoding)
    {
        return supportedEncodings.contains(encoding);
    }

    /**
     * Get a Collection of all the possible encodings this byte array could be
     * used to represent, drawn from the currently supported encodings. The
     * test is a round trip: decode the bytes with each candidate encoding,
     * re-encode the resulting String and compare with the input.
     *
     * @param data source bytes; null or empty yields an empty Collection
     * @return the Collection of possible canonical encoding names
     */
    public static Collection<String> getPossibleEncodings(byte[] data)
    {
        Collection<String> possibleEncodings = new TreeSet<String>();
        if (data == null || data.length == 0)
        {
            return possibleEncodings;
        }
        // A BOM (Byte Order Mark) may be present at the beginning of the
        // source byte array. These sequences may match valid bytes at the
        // start of binary data, but such data shouldn't round-trip as text
        // and so shouldn't match any encodings anyway.
        String encoding = null;
        for (Iterator<String> it = supportedEncodings.iterator(); it.hasNext();)
        {
            try
            {
                encoding = it.next();
                // If this encoding has a known BOM that matches the start of
                // the data this is its length, otherwise 0.
                int lengthBOM = getLengthBOM(encoding, data);
                // Don't use the BOM when constructing the String.
                String test = new String(getByteArraySubArray(data, lengthBOM, data.length - lengthBOM), encoding);
                // The data may be terminated by an incomplete character for
                // this encoding, so remove the last character before
                // re-encoding (only if the String is more than 1 char long).
                if (test.length() > 1)
                {
                    // BUG FIX: was substring(0, test.length() - 2), which
                    // removed the last TWO characters, not one as intended
                    // (substring's end index is exclusive).
                    test = test.substring(0, test.length() - 1);
                }
                // The byte array we will compare with the passed-in source.
                byte[] compare = null;
                try
                {
                    compare = test.getBytes(encoding);
                } catch (UnsupportedEncodingException ignore)
                {
                    // Can't re-encode with this encoding; skip it. (BUG FIX:
                    // previously caught UnsupportedOperationException, which
                    // String.getBytes(String) never throws.)
                    continue;
                }
                // The round-tripped bytes must match the input after the BOM.
                if (!compareByteArrays(data, lengthBOM, compare, 0, compare.length))
                {
                    // Doesn't match, so this encoding is unlikely to be
                    // correct even if the data contains valid text.
                    continue;
                }
                // If we get this far and the BOM length is not 0 then we have
                // a perfect match for this encoding, so ditch the rest and
                // return just this one.
                if (lengthBOM != 0)
                {
                    possibleEncodings.clear();
                    possibleEncodings.add(encoding);
                    return possibleEncodings;
                }
                // This is a possible match.
                possibleEncodings.add(encoding);
            } catch (UnsupportedEncodingException uee)
            {
                CapoApplication.logger.log(Level.SEVERE, "The encoding [" + encoding + "] is not supported by your JVM.");
            } catch (Exception e)
            {
                // Log the error but carry on with the next encoding.
                CapoApplication.logger.log(Level.SEVERE, e.getLocalizedMessage(), e);
            }
        }
        return possibleEncodings;
    }

    /**
     * Allows you to remove an encoding from the supported encodings you are
     * not interested in.
     *
     * @param encoding canonical encoding name
     * @return true if removed else false
     */
    public static boolean removeEncoding(String encoding)
    {
        return supportedEncodings.remove(encoding);
    }

    /**
     * Remove all valid encodings in the string array.
     *
     * @param encodings String [] containing the encodings to remove
     * @return true if at least one of the encodings was removed else false
     */
    public static boolean removeEncodings(String[] encodings)
    {
        boolean removedAtLeastOne = false;
        for (int i = 0; i < encodings.length; i++)
        {
            if (removeEncoding(encodings[i]))
            {
                removedAtLeastOne = true;
            }
        }
        return removedAtLeastOne;
    }

    /**
     * Get a Collection containing entries present in both the supported
     * encodings and the passed-in String [] of encodings. This is used by
     * TextMimeDetector to get a valid list of the preferred encodings.
     *
     * @param encodings candidate encoding names
     * @return a Collection containing all valid encodings contained in the
     *         passed in encodings array
     */
    public static Collection<String> getValidEncodings(String[] encodings)
    {
        Collection<String> valid = new ArrayList<String>();
        for (int i = 0; i < encodings.length; i++)
        {
            if (supportedEncodings.contains(encodings[i]))
            {
                valid.add(encodings[i]);
            }
        }
        return valid;
    }

    /**
     * Get the JVM default canonical encoding. For instance the canonical
     * encoding for cp1252 is windows-1252.
     *
     * @return the default canonical encoding name for the JVM
     */
    public static String getDefaultEncoding()
    {
        return EncodingGuesser.defaultJVMEncoding;
    }

    /**
     * Get the Collection of currently supported encodings.
     * <p>
     * NOTE(review): this returns the live internal collection, so callers can
     * mutate it directly; kept as-is for backward compatibility.
     * </p>
     *
     * @return the supported encodings
     */
    public static Collection<String> getSupportedEncodings()
    {
        return supportedEncodings;
    }

    /**
     * Set the supported encodings.
     *
     * @param encodings the new encodings; if this is null the supported
     *        encodings are left unchanged
     * @return a copy of the previously supported encodings
     */
    public static Collection<String> setSupportedEncodings(Collection<String> encodings)
    {
        // Snapshot the current contents before (possibly) replacing them.
        Collection<String> current = new TreeSet<String>(supportedEncodings);
        if (encodings != null)
        {
            supportedEncodings.clear();
            supportedEncodings.addAll(encodings);
        }
        return current;
    }

    /**
     * Get the length of a BOM for this encoding and byte array.
     *
     * @param encoding canonical encoding name
     * @param data source bytes
     * @return length of the BOM if the data starts with this encoding's BOM,
     *         else 0 (also 0 when the encoding has no known BOM)
     */
    public static int getLengthBOM(String encoding, byte[] data)
    {
        byte[] bom = boms.get(encoding);
        if (bom == null)
        {
            return 0;
        }
        return compareByteArrays(bom, 0, data, 0, bom.length) ? bom.length : 0;
    }

    /**
     * Get a sub array of this byte array starting at offset for length bytes.
     *
     * @param a source array
     * @param offset start index
     * @param length number of bytes to copy
     * @return a new byte array, unless the requested region would read past
     *         the end of the original array, in which case the original array
     *         is returned unchanged (historical behaviour, preserved)
     */
    public static byte[] getByteArraySubArray(byte[] a, int offset, int length)
    {
        if (offset + length > a.length)
        {
            return a;
        }
        return Arrays.copyOfRange(a, offset, offset + length);
    }

    /**
     * Utility method to compare a region of two byte arrays for equality.
     *
     * @param a first array
     * @param aOffset start index into a
     * @param b second array
     * @param bOffset start index into b
     * @param length number of bytes to compare
     * @return true if the two regions contain the same byte values else false
     *         (including when either region would extend past its array)
     */
    public static boolean compareByteArrays(byte[] a, int aOffset, byte[] b, int bOffset, int length)
    {
        if ((a.length < aOffset + length) || (b.length < bOffset + length))
        {
            // Would match beyond the end of one of the arrays.
            return false;
        }
        for (int i = 0; i < length; i++)
        {
            if (a[aOffset + i] != b[bOffset + i])
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Utility method to get all of the current encoding names, in canonical
     * format, supported by your JVM at the time this is called.
     *
     * @return current Collection of canonical encoding names
     */
    public static Collection<String> getCanonicalEncodingNamesSupportedByJVM()
    {
        // availableCharsets() is keyed by canonical name, so the key set is
        // exactly what we want; TreeSet keeps the names sorted.
        Collection<String> encodings = new TreeSet<String>(Charset.availableCharsets().keySet());
        CapoApplication.logger.fine("The following [" + encodings.size() + "] encodings will be used: " + encodings);
        return encodings;
    }
}