/*
* Copyright 2007-2009 Medsea Business Solutions S.L.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.medsea.util;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeSet;
import java.util.logging.Level;

import com.delcyon.capo.CapoApplication;
/**
* This class contains a list of known encodings used by TextMimeType. It is
* used by the TextMimeDetector but can be used as a stand alone utility class
* in other parts of your program if you want.
* <p>
* The getPossibleEncodings() method takes a byte [] as its source and the
* bigger the array the better the detection ratio will be.
* </p>
* <p>
* The class is initialised with an empty list of encodings so it is effectively
* disabled by default. You can set the supported encodings to ALL of the
* encodings supported by your JVM at any point during your program execution
* using the following method
* EncodingGuesser.setSupportedEncodings(EncodingGuesser
* .getCanonicalEncodingNamesSupportedByJVM()); You can also clear the encodings
* and disable the detector at any point by calling
* EncodingGuesser.setSupportedEncodings(new ArrayList()). If later on you
* dynamically add more encodings they will NOT be detected automatically by
* this class but you can recall the above method.
* </p>
* <p>
* As the JVM can have a large number of encodings and each one is checked
* against the byte array it may be wise to remove all encodings you are sure
* you will not use to trim down on the number of tests. It will not stop at the
* first match but will try to match as many encodings as possible and return
* this as a Collection.
* </p>
* <p>
* A common scenario is where an application can handle only a small set of text
* encodings such as UTF-8 and windows-1252. If this is your case you can use
* the setSupportedEncodings() method so that these are the only encodings in
* the supported encodings Collection. This will dramatically improve the
* performance of this class.
* </p>
* <p>
* It's possible that small byte arrays that should contain binary data are
* considered possible text matches but generally binary data, such as images,
* should return no matches.
* </p>
* <p>
* There are some optimisations that are applicable to text files containing
* BOM's (Byte Order Marks) such as UTF-8, UTF-16LE, UTF-16BE, UTF-32LE and
* UTF-32BE. These are not required but if present will greatly improve the
* resultant possible matches returned from the getPossibleEncodings() method.
* </p>
*/
public class EncodingGuesser
{
    // NOTE(review): a serialVersionUID used to live here, but this class is
    // not Serializable, so the field was dead and has been removed.

    // Canonical name of the JVM's default Charset. Charset.defaultCharset()
    // (Java 5+) replaces the old OutputStreamWriter.getEncoding() trick;
    // name() already returns the canonical form (e.g. windows-1252 for cp1252).
    private static String defaultJVMEncoding = Charset.defaultCharset().name();

    // Encodings currently considered by getPossibleEncodings(). Starts empty,
    // which effectively disables detection until a caller populates it via
    // setSupportedEncodings().
    private static Collection<String> supportedEncodings = new TreeSet<String>();

    // Known BOM (Byte Order Mark) byte sequences keyed by their canonical
    // encoding name. Populated once in the static initialiser below.
    private static Map<String, byte[]> boms = new HashMap<String, byte[]>();

    static
    {
        // Detection is switched off by default. If you want to initialise
        // with all encodings supported by your JVM, un-comment the next line:
        // EncodingGuesser.supportedEncodings = getCanonicalEncodingNamesSupportedByJVM();

        // Initialise some known BOM(s) keyed by their canonical encoding name.
        boms.put("UTF-32BE", new byte[] { (byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF });
        boms.put("UTF-32LE", new byte[] { (byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00 });
        boms.put("UTF-16BE", new byte[] { (byte) 0xFE, (byte) 0xFF });
        boms.put("UTF-16LE", new byte[] { (byte) 0xFF, (byte) 0xFE });
        boms.put("UTF-8", new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
        // UTF-7: we may need to cater for the next char as well, which can be
        // one of [0x38 | 0x39 | 0x2B | 0x2F].
        boms.put("UTF-7", new byte[] { (byte) 0x2B, (byte) 0x2F, (byte) 0x76 });
        boms.put("UTF-1", new byte[] { (byte) 0xF7, (byte) 0x64, (byte) 0x4C });
        boms.put("UTF-EBCDIC", new byte[] { (byte) 0xDD, (byte) 0x73, (byte) 0x66, (byte) 0x73 });
        boms.put("SCSU", new byte[] { (byte) 0x0E, (byte) 0xFE, (byte) 0xFF });
        // BOCU-1: optionally followed by 0xFF.
        boms.put("BOCU-1", new byte[] { (byte) 0xFB, (byte) 0xEE, (byte) 0x28 });
    }

    /**
     * Check if the encoding String is one of the encodings supported.
     *
     * @param encoding canonical encoding name
     * @return true if encoding is currently in the supported encodings
     */
    public static boolean isKnownEncoding(String encoding)
    {
        return supportedEncodings.contains(encoding);
    }

    /**
     * Get a Collection of all the possible encodings this byte array could be
     * used to represent, drawn from the currently supported encodings. The
     * test is a round trip: decode the bytes with each candidate encoding,
     * re-encode the resulting String and compare with the input.
     *
     * @param data source bytes; null or empty yields an empty Collection
     * @return the Collection of possible canonical encoding names
     */
    public static Collection<String> getPossibleEncodings(byte[] data)
    {
        Collection<String> possibleEncodings = new TreeSet<String>();
        if (data == null || data.length == 0)
        {
            return possibleEncodings;
        }
        // A BOM (Byte Order Mark) may be present at the beginning of the
        // source byte array. These sequences may match valid bytes at the
        // start of binary data, but such data shouldn't round-trip as text
        // and so shouldn't match any encodings anyway.
        String encoding = null;
        for (Iterator<String> it = supportedEncodings.iterator(); it.hasNext();)
        {
            try
            {
                encoding = it.next();
                // If this encoding has a known BOM that matches the start of
                // the data this is its length, otherwise 0.
                int lengthBOM = getLengthBOM(encoding, data);
                // Don't use the BOM when constructing the String.
                String test = new String(getByteArraySubArray(data, lengthBOM, data.length - lengthBOM), encoding);
                // The data may be terminated by an incomplete character for
                // this encoding, so remove the last character before
                // re-encoding (only if the String is more than 1 char long).
                if (test.length() > 1)
                {
                    // BUG FIX: was substring(0, test.length() - 2), which
                    // removed the last TWO characters, not one as intended
                    // (substring's end index is exclusive).
                    test = test.substring(0, test.length() - 1);
                }
                // The byte array we will compare with the passed-in source.
                byte[] compare = null;
                try
                {
                    compare = test.getBytes(encoding);
                } catch (UnsupportedEncodingException ignore)
                {
                    // Can't re-encode with this encoding; skip it. (BUG FIX:
                    // previously caught UnsupportedOperationException, which
                    // String.getBytes(String) never throws.)
                    continue;
                }
                // The round-tripped bytes must match the input after the BOM.
                if (!compareByteArrays(data, lengthBOM, compare, 0, compare.length))
                {
                    // Doesn't match, so this encoding is unlikely to be
                    // correct even if the data contains valid text.
                    continue;
                }
                // If we get this far and the BOM length is not 0 then we have
                // a perfect match for this encoding, so ditch the rest and
                // return just this one.
                if (lengthBOM != 0)
                {
                    possibleEncodings.clear();
                    possibleEncodings.add(encoding);
                    return possibleEncodings;
                }
                // This is a possible match.
                possibleEncodings.add(encoding);
            } catch (UnsupportedEncodingException uee)
            {
                CapoApplication.logger.log(Level.SEVERE, "The encoding [" + encoding + "] is not supported by your JVM.");
            } catch (Exception e)
            {
                // Log the error but carry on with the next encoding.
                CapoApplication.logger.log(Level.SEVERE, e.getLocalizedMessage(), e);
            }
        }
        return possibleEncodings;
    }

    /**
     * Allows you to remove an encoding from the supported encodings you are
     * not interested in.
     *
     * @param encoding canonical encoding name
     * @return true if removed else false
     */
    public static boolean removeEncoding(String encoding)
    {
        return supportedEncodings.remove(encoding);
    }

    /**
     * Remove all valid encodings in the string array.
     *
     * @param encodings String [] containing the encodings to remove
     * @return true if at least one of the encodings was removed else false
     */
    public static boolean removeEncodings(String[] encodings)
    {
        boolean removedAtLeastOne = false;
        for (int i = 0; i < encodings.length; i++)
        {
            if (removeEncoding(encodings[i]))
            {
                removedAtLeastOne = true;
            }
        }
        return removedAtLeastOne;
    }

    /**
     * Get a Collection containing entries present in both the supported
     * encodings and the passed-in String [] of encodings. This is used by
     * TextMimeDetector to get a valid list of the preferred encodings.
     *
     * @param encodings candidate encoding names
     * @return a Collection containing all valid encodings contained in the
     *         passed in encodings array
     */
    public static Collection<String> getValidEncodings(String[] encodings)
    {
        Collection<String> valid = new ArrayList<String>();
        for (int i = 0; i < encodings.length; i++)
        {
            if (supportedEncodings.contains(encodings[i]))
            {
                valid.add(encodings[i]);
            }
        }
        return valid;
    }

    /**
     * Get the JVM default canonical encoding. For instance the canonical
     * encoding for cp1252 is windows-1252.
     *
     * @return the default canonical encoding name for the JVM
     */
    public static String getDefaultEncoding()
    {
        return EncodingGuesser.defaultJVMEncoding;
    }

    /**
     * Get the Collection of currently supported encodings.
     * <p>
     * NOTE(review): this returns the live internal collection, so callers can
     * mutate it directly; kept as-is for backward compatibility.
     * </p>
     *
     * @return the supported encodings
     */
    public static Collection<String> getSupportedEncodings()
    {
        return supportedEncodings;
    }

    /**
     * Set the supported encodings.
     *
     * @param encodings the new encodings; if this is null the supported
     *        encodings are left unchanged
     * @return a copy of the previously supported encodings
     */
    public static Collection<String> setSupportedEncodings(Collection<String> encodings)
    {
        // Snapshot the current contents before (possibly) replacing them.
        Collection<String> current = new TreeSet<String>(supportedEncodings);
        if (encodings != null)
        {
            supportedEncodings.clear();
            supportedEncodings.addAll(encodings);
        }
        return current;
    }

    /**
     * Get the length of a BOM for this encoding and byte array.
     *
     * @param encoding canonical encoding name
     * @param data source bytes
     * @return length of the BOM if the data starts with this encoding's BOM,
     *         else 0 (also 0 when the encoding has no known BOM)
     */
    public static int getLengthBOM(String encoding, byte[] data)
    {
        byte[] bom = boms.get(encoding);
        if (bom == null)
        {
            return 0;
        }
        return compareByteArrays(bom, 0, data, 0, bom.length) ? bom.length : 0;
    }

    /**
     * Get a sub array of this byte array starting at offset for length bytes.
     *
     * @param a source array
     * @param offset start index
     * @param length number of bytes to copy
     * @return a new byte array, unless the requested region would read past
     *         the end of the original array, in which case the original array
     *         is returned unchanged (historical behaviour, preserved)
     */
    public static byte[] getByteArraySubArray(byte[] a, int offset, int length)
    {
        if (offset + length > a.length)
        {
            return a;
        }
        return Arrays.copyOfRange(a, offset, offset + length);
    }

    /**
     * Utility method to compare a region of two byte arrays for equality.
     *
     * @param a first array
     * @param aOffset start index into a
     * @param b second array
     * @param bOffset start index into b
     * @param length number of bytes to compare
     * @return true if the two regions contain the same byte values else false
     *         (including when either region would extend past its array)
     */
    public static boolean compareByteArrays(byte[] a, int aOffset, byte[] b, int bOffset, int length)
    {
        if ((a.length < aOffset + length) || (b.length < bOffset + length))
        {
            // Would match beyond the end of one of the arrays.
            return false;
        }
        for (int i = 0; i < length; i++)
        {
            if (a[aOffset + i] != b[bOffset + i])
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Utility method to get all of the current encoding names, in canonical
     * format, supported by your JVM at the time this is called.
     *
     * @return current Collection of canonical encoding names
     */
    public static Collection<String> getCanonicalEncodingNamesSupportedByJVM()
    {
        // availableCharsets() is keyed by canonical name, so the key set is
        // exactly what we want; TreeSet keeps the names sorted.
        Collection<String> encodings = new TreeSet<String>(Charset.availableCharsets().keySet());
        CapoApplication.logger.fine("The following [" + encodings.size() + "] encodings will be used: " + encodings);
        return encodings;
    }
}