/*
* Copyright 2007-2009 Medsea Business Solutions S.L.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* <p>
* </p>
* @author Steven McArdle
*/
package eu.medsea.mimeutil;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.logging.Level;
import com.delcyon.capo.CapoApplication;
import eu.medsea.mimeutil.detector.MimeDetector;
import eu.medsea.mimeutil.handler.TextMimeHandler;
import eu.medsea.util.EncodingGuesser;
/**
* This MimeDetector cannot be registered, unregistered or subclassed. It is a
* default MimeDetector that is pre-installed into the mime-util utility and is
* used as the FIRST MimeDetector.
* <p>
* You can influence this MimeDetector in several ways.
* <ul>
* <li>Specify a different list of preferred encodings using the static
* TextMimeDetector.setPreferredEncodings(...) method.</li>
* <li>Change the list of supported encodings using the static
* EncodingGuesser.setSupportedEncodings(...) method.</li>
* <li>Register TextMimeHandler(s) using the static
* TextMimeDetector.registerTextMimeHandler(...) method (very, VERY powerful).</li>
* </ul>
* <p>
* The TextMimeDetector.setPreferredEncodings(...) method is used to provide a
* preferred list of encodings. The final encoding for the MimeType will be the
* first one in this list that is also contained in the possible encodings
* returned from the EncodingGuesser class. If none of these match then the
* first entry in the possible encodings collection is used.
* </p>
* <p>
* The EncodingGuesser.setSupportedEncodings(...) method is used to set the list
* of encodings that will be considered when trying to guess the encoding. If
* you provide encodings that are not supported by your JVM an error is logged
* and the next encoding is tried. If you set this to an empty Collection then
* you will effectively turn this MimeDetector OFF (the default). This is the
* recommended way to disable this MimeDetector. The most common usage scenario
* for this method is when your application is designed to support only a
* limited set of encodings such as UTF-8 and UTF-16 encoded text files. You can
* set the supported encodings list to this sub set of encodings and improve the
* performance of this MimeDetector greatly.
* </p>
* <p>
* The TextMimeDetector.registerTextMimeHandler(...) method can be used to
* register special TextMimeHandler(s). These MimeHandler(s) are delegated to
* when once valid encodings have been found for the content contained in File,
* InputStream or byte []. The handlers can influence both the returned MimeType
* and encoding of any matched content. For instance, the default behavior is to
* return a MimeType of text/plain and encoding set according to the rules
* above. The Handler(s) allow you to further process the content and decide
* that it is in fact a text/xml or application/svg-xml or even
* mytype/mysubtype. You can also change the assigned encoding as it may be
* wrong for your new MimeType. For instance, if you decide the MimeType is
* really an XML file and not just a standard text/plain file and the detector
* calculated that the best encoding is UTF-8 but you detect and encoding
* attribute in the XML content for ISO-8859-1, you can set this as well thus
* returning a TextMimeType of application/xml with an encoding or ISO-8859-1
* instead of a TextMimeType of text/plain and an encoding of UTF-8.<br/>
* <br/>
* IMPORTANT: Your handler(s) will only get to see and act on content that this
* MimeDetector thinks is text in the first place. So if your restrictions on
* supported encodings will no longer detect a file as text then your handler(s)
* will never be called.
* </p>
* </p>
* <p>
* The methods will do their best to eliminate any binary files before trying to
* detect an encoding. However, if a binary file contains only a few bytes of
* data or you are very unlucky it could be mistakenly recognised as a text file
* and processed by this MimeDetector.
* </p>
* <p>
* The Collection(s) returned from the methods in this class will contain either
* 0 or 1 MimeType entry of type TextMimeType with a mime type of "text/plain"
* or whatever matching registered TextMimeHandler(s) decide to return. You can
* test for matches from this MimeDetector by using the instanceof operator on
* the Collection of returned MimeType(s) to your code (remember, the returned
* Collection to you is the accumulated collection from ALL registered
* MimeDetectors. You can retrieve the encoding using the getEncoding() method
* of TextMimeType after casting the MimeType to a TextMimeType.
* </p>
* <p>
* You should also remember that if this MimeDetector puts a TextMimeType into
* the eventual Collection of MimeType(s) returned to your code of say
* "text/plain" and one or more of the other registered MimeDetector(s) also add
* an instance of "text/plain" in accordance with their detection rules, the
* type will not be changed from TextMimeType to MimeType. Only the specificity
* value of the MimeType will be increased thus improving the likelihood that
* this MimeType will be returned from the
* MimeUtil.getMostSpecificMimeType(Collection mimeTypes) method.
* </p>
*
* @author Steven McArdle
*/
public final class TextMimeDetector extends MimeDetector
{
// The maximum amount of data to retrieve from a stream
private static final int BUFFER_SIZE = 1024;
// No text file should have 2 or more consecutive NULL values
private static final int MAX_NULL_VALUES = 1;
private static Collection preferredEncodings = new ArrayList();
static
{
TextMimeDetector.setPreferredEncodings(new String[] { "UTF-16", "UTF-8", "ISO-8859-1", "windows-1252", "US-ASCII" });
}
// Registered list of TextMimeHandler(s)
private static Collection handlers = new ArrayList();
// Private so nobody can register one using the
// MimeUtil.registerMimeDetector(...) method
private TextMimeDetector()
{
}
// Package scoped so that the class can still be create for use by mime-util
// without resorting to a singleton approach
// Could change this in the future !!!
TextMimeDetector(int dummy)
{
this();
}
/**
* @see MimeDetector.getDescription()
*/
public String getDescription()
{
return "Determine if a file or stream contains a text mime type. If so then return TextMimeType with text/plain and the best guess encoding.";
}
/**
* This MimeDetector requires content so defer to the file method
*/
public Collection getMimeTypesFileName(String fileName) throws UnsupportedOperationException
{
return getMimeTypesFile(new File(fileName));
}
/**
* We only want to deal with the stream from the URL
*
* @see MimeDetector.getMimeTypesURL(URL url)
*/
public Collection getMimeTypesURL(URL url) throws UnsupportedOperationException
{
InputStream in = null;
try
{
return getMimeTypesInputStream(in = new BufferedInputStream(MimeUtil.getInputStreamForURL(url)));
} catch (UnsupportedOperationException e)
{
throw e;
} catch (Exception e)
{
throw new MimeException(e);
} finally
{
try
{
in.close();
} catch (Exception ignore)
{
CapoApplication.logger.log(Level.SEVERE,ignore.getLocalizedMessage());
}
}
}
/**
* We only want to deal with the stream for the file
*
* @see MimeDetector.getMimeTypesURL(URL url)
*/
public Collection getMimeTypesFile(File file) throws UnsupportedOperationException
{
if (!file.exists())
{
throw new UnsupportedOperationException("This MimeDetector requires actual content.");
}
InputStream in = null;
try
{
in = new BufferedInputStream(new FileInputStream(file));
return getMimeTypesInputStream(in);
} catch (UnsupportedOperationException e)
{
throw e;
} catch (Exception e)
{
throw new MimeException(e);
} finally
{
try
{
in.close();
} catch (Exception ignore)
{
CapoApplication.logger.log(Level.SEVERE,ignore.getLocalizedMessage());
}
}
}
/**
* @see MimeDetector.getMimeTypesInputStream(InputStream in)
*/
public Collection getMimeTypesInputStream(InputStream in) throws UnsupportedOperationException
{
int offset = 0;
int len = TextMimeDetector.BUFFER_SIZE;
byte[] data = new byte[len];
byte[] copy = null;
// Mark the input stream
in.mark(len);
try
{
// Since an InputStream might return only some data (not all
// requested), we have to read in a loop until
// either EOF is reached or the desired number of bytes have been
// read.
int restBytesToRead = len;
while (restBytesToRead > 0)
{
int bytesRead = in.read(data, offset, restBytesToRead);
if (bytesRead < 0)
break; // EOF
offset += bytesRead;
restBytesToRead -= bytesRead;
}
if (offset < len)
{
copy = new byte[offset];
System.arraycopy(data, 0, copy, 0, offset);
}
else
{
copy = data;
}
} catch (IOException ioe)
{
throw new MimeException(ioe);
} finally
{
try
{
// Reset the input stream to where it was marked.
in.reset();
} catch (Exception e)
{
throw new MimeException(e);
}
}
return getMimeTypesByteArray(copy);
}
/**
* @see MimeDetector.getMimeTypesByteArray(byte [] data)
*/
public Collection getMimeTypesByteArray(byte[] data) throws UnsupportedOperationException
{
// Check if the array contains binary data
if (EncodingGuesser.getSupportedEncodings().isEmpty() || isBinary(data))
{
throw new UnsupportedOperationException();
}
Collection mimeTypes = new ArrayList();
Collection possibleEncodings = EncodingGuesser.getPossibleEncodings(data);
CapoApplication.logger.fine("Possible encodings [" + possibleEncodings.size() + "] " + possibleEncodings);
if (possibleEncodings.isEmpty())
{
// Is not a text file understood by this JVM
throw new UnsupportedOperationException();
}
String encoding = null;
// Iterate over the preferedEncodings array in the order defined and
// return the first one found
for (Iterator it = TextMimeDetector.preferredEncodings.iterator(); it.hasNext();)
{
encoding = (String) it.next();
if (possibleEncodings.contains(encoding))
{
mimeTypes.add(new TextMimeType("text/plain", encoding));
break;
}
}
// If none of the preferred encodings were acceptable lets see if the
// default encoding can be used.
if (mimeTypes.isEmpty() && possibleEncodings.contains(EncodingGuesser.getDefaultEncoding()))
{
encoding = EncodingGuesser.getDefaultEncoding();
mimeTypes.add(new TextMimeType("text/plain", encoding));
}
// If none of our preferredEncodings or the default encoding are in the
// possible encodings list we return the first possibleEncoding;
if (mimeTypes.isEmpty())
{
Iterator it = possibleEncodings.iterator();
encoding = (String) it.next();
mimeTypes.add(new TextMimeType("text/plain", encoding));
}
if (mimeTypes.isEmpty() || handlers.isEmpty())
{
// Nothing to handle
return mimeTypes;
}
// String will be passed in as is currently in the encoding defined by
// encoding
try
{
int lengthBOM = EncodingGuesser.getLengthBOM(encoding, data);
String content = new String(EncodingGuesser.getByteArraySubArray(data, lengthBOM, data.length - lengthBOM), encoding);
return fireMimeHandlers(mimeTypes, content);
} catch (UnsupportedEncodingException ignore)
{
// This should never, never, never happen
}
return mimeTypes;
}
/**
* Change the list of preferred encodings. This list is used where multiple
* possible encodings are identified to refer to the contents in a byte
* array passed in or read in from a Stream or File object. This list is
* iterated over in order and the first match is set as the encoding for the
* text/plain TextMimeType ONLY if the JVM default encoding is not in the
* list. If the neither the defaultEncoding or any of these preferred
* encodings are in the list of possible encodings then the first possible
* encoding will be used.
*
* @param encodings
* String array of canonical encoding names.
*/
public static void setPreferredEncodings(String[] encodings)
{
TextMimeDetector.preferredEncodings = EncodingGuesser.getValidEncodings(encodings);
CapoApplication.logger.fine("Preferred Encodings set to " + TextMimeDetector.preferredEncodings);
}
/**
* Register a TexMimeHandler(s)
*
* @param handler
* to register
*/
public static void registerTextMimeHandler(TextMimeHandler handler)
{
handlers.add(handler);
}
/**
* Unregister a TextMimeHandler
*
* @param handler
* to unregister
*/
public static void unregisterTextMimeHandler(TextMimeHandler handler)
{
handlers.remove(handler);
}
/**
* Get the current Collection of registered TexMimeHandler(s)
*
* @return currently registered collection of TextMimeHandler(s)
*/
public static Collection getRegisteredTextMimeHandlers()
{
return handlers;
}
/**
* Give registered TextMimeHandler(s) the opportunity to influence the
* actual mime type before returning from the getMimeTypesXXX(...) methods
*
* @param mimeTypes
* @param content
* @return
*/
private Collection fireMimeHandlers(Collection mimeTypes, String content)
{
// We only have one entry in the mimeTypes Collection due to the way
// this MimeDetector works.
TextMimeType mimeType = (TextMimeType) mimeTypes.iterator().next();
for (Iterator it = handlers.iterator(); it.hasNext();)
{
TextMimeHandler tmh = (TextMimeHandler) it.next();
if (tmh.handle(mimeType, content))
{
// The first handler to return true will short circuit the rest
// of the handlers
break;
}
}
return mimeTypes;
}
/*
* This is a quick check for the byte array to see if it contains binary
* data. As no known text encoding can have more than MAX_NULL_VALUES
* consecutive null values the method does a quick and dirty elimination of
* what are probably binary files but should never eliminate possible text
* files. It is possible that some binary files will not have
* MAX_NULL_VALUES consecutive byte values especially if it's a small file
* and will slip through here. Later tests should eliminate these. We will
* modify this method to include other known sequences as and when we
* discover them
*/
private boolean isBinary(byte[] data)
{
int negCount = 0;
for (int i = 0; i < data.length; i++)
{
if (data[i] == 0)
{
negCount++;
}
else
{
negCount = 0;
}
if (negCount == MAX_NULL_VALUES)
{
return true;
}
}
return false;
}
}