MimeTypeDetector.java example

Explorer
manager.v3-master
- projects
// Copyright 2009 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.enterprise.connector.util;

import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;
import com.google.enterprise.connector.spi.TraversalContext;

import eu.medsea.mimeutil.MimeType;
import eu.medsea.mimeutil.MimeUtil2;
import eu.medsea.mimeutil.detector.ExtensionMimeDetector;
import eu.medsea.mimeutil.detector.MagicMimeMimeDetector;
import eu.medsea.util.EncodingGuesser;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Detector for MIME type based on file name and content.
 * <p>
 * All instances share two lazily created, static {@code MimeUtil2}
 * detectors: one that only looks at the file name extension and one that
 * only looks at the leading bytes of the content. Access to each shared
 * detector is serialized by synchronizing on the detector instance, so a
 * {@code MimeTypeDetector} may be used from multiple threads.
 * <p>
 * A {@link TraversalContext} must be supplied via
 * {@link #setTraversalContext} before a known MIME type can be ranked and
 * returned.
 *
 * @since 3.0
 */
public class MimeTypeDetector {
  private static final Logger LOGGER =
      Logger.getLogger(MimeTypeDetector.class.getName());

  /**
   * MIME type for documents whose MIME type cannot be determined.
   */
  public static final String UNKNOWN_MIME_TYPE =
      mimeTypeStringValue(MimeUtil2.UNKNOWN_MIME_TYPE);

  /** Shared detector that is only ever given file names. */
  private static MimeUtil2 extensionDetector;

  /** Shared detector that is only ever given document content. */
  private static MimeUtil2 magicDetector;

  /**
   * Whether {@link #setSupportedEncodings} has been called, either by a
   * client or by {@link #init()}. Guarded by the class monitor, which both
   * of those synchronized static methods hold.
   */
  private static boolean encodingsConfigured = false;

  /**
   * The mime-util library leaks memory like a sieve on each new instance,
   * and is not thread-safe. So we want to share instances of MimeUtil2.
   * To avoid problems with mime-util trying to open a file with the given
   * name, we use two separate instances, one using only the extension
   * detector which we give the file name to, and the other using only the
   * magic detector which we give the byte[] to.
   * <p>
   * The default encodings are installed only if no encodings have been
   * configured yet, so that a call to {@link #setSupportedEncodings} made
   * before the first instance is constructed is not overwritten.
   */
  private static synchronized void init() {
    if (magicDetector == null) {
      LOGGER.info("Initializing MimeTypeDetector");
      if (!encodingsConfigured) {
        setSupportedEncodings(
            Sets.newHashSet("UTF-8", "ISO-8859-1", "windows-1252"));
      }

      extensionDetector = new MimeUtil2();
      extensionDetector.registerMimeDetector(
          ExtensionMimeDetector.class.getName());
      // TODO: Should we add the WindowsRegistryMimeDetector?  This might
      // yield different results when run on Windows vs. Unix.

      // TODO: If "/usr/share/mime/mime.cache exists use
      // OpendesktopMimeDetector instead of MagicMimeMimeDetector. It seems
      // more accurate but was logging NullPointerExceptions so I temporarily
      // removed it pending further testing/fixing.
      // magicDetector is assigned last because it doubles as the
      // "already initialized" marker tested above.
      magicDetector = new MimeUtil2();
      magicDetector.registerMimeDetector(MagicMimeMimeDetector.class.getName());
    }
  }

  /**
   * TraversalContext used to rank different MIME types. Declared
   * {@code volatile} because it is written by
   * {@link #setTraversalContext} and read by detecting threads without
   * any other synchronization.
   */
  private static volatile TraversalContext traversalContext;

  /**
   * TraversalContext injected by Spring from the manager configuration.
   *
   * @param traversalContext the context used to pick the preferred MIME
   *        type among several candidates (must not be {@code null})
   * @throws NullPointerException if {@code traversalContext} is
   *        {@code null}
   */
  public static void setTraversalContext(TraversalContext traversalContext) {
    Preconditions.checkNotNull(traversalContext,
                               "traversalContext must not be null.");
    MimeTypeDetector.traversalContext = traversalContext;
  }

  /**
   * Constructs a detector, initializing the shared mime-util detectors
   * if this is the first instance.
   */
  public MimeTypeDetector() {
    init();
  }

  /**
   * Sets the supported
   * <a href="http://docs.oracle.com/javase/7/docs/technotes/guides/intl/encoding.doc.html">
   * character encodings</a> for the {@code MimeTypeDetector}. When determining
   * Mime type based upon content, MimeTypeDetector will interpret the content
   * using the various encodings until it has found a match.  For performance
   * reasons, the Set of expected encodings should remain as small as possible.
   * The JVM default encoding is automatically supported.
   * <p>
   * The default set of supported encodings is "UTF-8", "ISO-8859-1",
   * "windows-1252", and the current JVM default encoding.
   * <p>
   * This method may be called before or after the first
   * {@code MimeTypeDetector} is constructed; encodings configured here are
   * not replaced by the defaults during lazy initialization. The supplied
   * Set is copied and is not modified.
   *
   * @param encodings a Set of canonical encoding names.
   * @throws NullPointerException if {@code encodings} is {@code null}
   * @see <a href="http://docs.oracle.com/javase/7/docs/technotes/guides/intl/encoding.doc.html">Java Supported Encodings</a>
   */
  public static synchronized void setSupportedEncodings(Set<String> encodings) {
    Preconditions.checkNotNull(encodings, "encodings must not be null.");
    Set<String> enc = Sets.newHashSet(encodings);
    enc.add(EncodingGuesser.getDefaultEncoding());
    EncodingGuesser.setSupportedEncodings(enc);
    encodingsConfigured = true;
  }

  /**
   * Returns the MIME type for the document with the provided filename and/or
   * content.
   * <p>
   * If {@code filename} is provided, the file will not be accessed; however,
   * the filename extension will be used for MIME type determination.  For
   * this reason, filenames that are extracted from ECMs, remote filesystems,
   * even URLs (using
   * <a href="http://docs.oracle.com/javase/7/docs/api/java/net/URL.html#getPath()">
   * URL.getPath()</a>) should work.  If {@code filename} is {@code null},
   * only the supplied {@code content} will be used to determine the MIME type.
   * <p>
   * If {@code content} is provided, {@link MimeTypeDetector} will examine
   * the first few thousand bytes of the content, looking for a match against
   * a set of known character sequences found in common file formats.  The
   * caller need not supply the entire document content - only the
   * beginning of the content is examined to determine MIME type, so the
   * first 4 kilobytes of content is sufficient at this time.
   * If {@code content} is {@code null}, only the filename extension will be
   * used to determine the MIME type.
   *
   * @param filename used for filename extension MIME type detection
   *        (may be {@code null})
   * @param content a byte array of document content used for MIME type
   *        detection (may be {@code null})
   * @return the most preferred MIME type for the document, or
   *         {@link #UNKNOWN_MIME_TYPE} if none could be determined
   * @throws IllegalArgumentException if both {@code filename} and
   *        {@code content} are {@code null}.
   * @throws IllegalStateException if a MIME type was detected but no
   *        {@code TraversalContext} has been set
   */
  public String getMimeType(String filename, byte[] content) {
    Preconditions.checkArgument((filename != null || content != null),
                                "filename and content may not both be null");
    // The file name is only ever handed to the extension-only detector and
    // the bytes only to the magic detector, so mime-util never tries to
    // open a local file that happens to have the given name.
    String bestMimeType =
        pickBestMimeType(getMimeTypes(filename), getMimeTypes(content));
    logResult(bestMimeType, filename);
    return bestMimeType;
  }

  /**
   * Returns the MIME type for the document with the provided filename or
   * content read from an {@code InputStream}.
   * <p>
   * If {@code filename} is provided, the file will not be accessed; however,
   * the filename extension will be used for MIME type determination.  For
   * this reason, filenames that are extracted from ECMs, remote filesystems,
   * even URLs (using
   * <a href="http://docs.oracle.com/javase/7/docs/api/java/net/URL.html#getPath()">
   * URL.getPath()</a>) should work.  If the MIME type can be determined
   * solely by the filename extension, it will be returned.
   * <p>
   * If the MIME type cannot be determined solely from the filename extension
   * and {@code inputStreamFactory} is provided, {@link MimeTypeDetector} will
   * get an {@code InputStream} from the factory and read the
   * first few thousand bytes of the content, looking for a match against
   * a set of known character sequences found in common file formats.
   * The stream obtained from the factory is always closed before this
   * method returns.
   * If {@code inputStreamFactory} is {@code null}, only the filename extension
   * will be used to determine the MIME type.
   *
   * @param filename used for filename extension MIME type detection
   *        (may be {@code null})
   * @param inputStreamFactory an {@link InputStreamFactory} used to fetch
   *        an {@code InputStream} from which the document content may be read
   *        (may be {@code null})
   * @return the most preferred MIME type for the document, or
   *         {@link #UNKNOWN_MIME_TYPE} if none could be determined
   * @throws IllegalArgumentException if both {@code filename} and
   *        {@code InputStreamFactory} are {@code null}
   * @throws IllegalStateException if a MIME type was detected but no
   *        {@code TraversalContext} has been set
   * @throws IOException if there is an error reading from the InputStream
   */
  public String getMimeType(String filename,
        InputStreamFactory inputStreamFactory) throws IOException {
    Preconditions.checkArgument((filename != null || inputStreamFactory != null),
        "filename and inputStreamFactory may not both be null");
    String bestMimeType = pickBestMimeType(getMimeTypes(filename));
    // Only pay for fetching content when the extension was inconclusive.
    if (UNKNOWN_MIME_TYPE.equals(bestMimeType) && inputStreamFactory != null) {
      Collection<MimeType> contentMimeTypes;
      InputStream is = inputStreamFactory.getInputStream();
      try {
        contentMimeTypes = getMimeTypes(getBytes(is));
      } finally {
        is.close();
      }
      bestMimeType = pickBestMimeType(contentMimeTypes);
    }
    logResult(bestMimeType, filename);
    return bestMimeType;
  }

  /** Logs, at FINEST, the MIME type chosen for a document. */
  private static void logResult(String bestMimeType, String filename) {
    if (LOGGER.isLoggable(Level.FINEST)) {
      LOGGER.finest("MimeType " + bestMimeType + " determined for "
                    + ((filename == null) ? "content." : filename));
    }
  }

  /**
   * Returns the MIME types the extension detector reports for the file
   * name, or {@code null} if {@code filename} is {@code null}.
   */
  @SuppressWarnings("unchecked")
  private Collection<MimeType> getMimeTypes(String filename) {
    if (filename == null) {
      return null;
    }
    // mime-util is not thread-safe; serialize use of the shared detector.
    synchronized (extensionDetector) {
      return extensionDetector.getMimeTypes(filename);
    }
  }

  /**
   * Returns the MIME types the magic detector reports for the content,
   * or {@code null} if {@code content} is {@code null}.
   */
  @SuppressWarnings("unchecked")
  private Collection<MimeType> getMimeTypes(byte[] content) {
    if (content == null) {
      return null;
    }
    // mime-util is not thread-safe; serialize use of the shared detector.
    synchronized (magicDetector) {
      return magicDetector.getMimeTypes(content);
    }
  }

  /**
   * This method returns the most suitable MIME type of the document
   * from the MIME types collected by the filename extension MIME type
   * detector and/or the document content MIME type detector.
   *
   * @param extensionMimeTypes a Collection of MimeTypes as determined by
   *        the filename extension (may be {@code null})
   * @param contentMimeTypes a Collection of MimeTypes as determined by
   *        the document content (may be {@code null})
   * @return most suitable MIME type for the document, or
   *         {@link #UNKNOWN_MIME_TYPE} if neither collection contains a
   *         known MIME type
   * @throws IllegalStateException if a known MIME type was found but no
   *        {@code TraversalContext} has been set
   */
  private String pickBestMimeType(Collection<MimeType> extensionMimeTypes,
                                  Collection<MimeType> contentMimeTypes) {
    // Use a LinkedHashSet so we preserve the order of the mimetypes
    // as they are returned by MimeUtil.
    Set<String> mimeTypeNames = new LinkedHashSet<String>();
    addKnownMimeTypes(extensionMimeTypes, mimeTypeNames);
    addKnownMimeTypes(contentMimeTypes, mimeTypeNames);
    if (mimeTypeNames.isEmpty()) {
      return UNKNOWN_MIME_TYPE;
    }
    // Read the volatile field once so the null check and the use below
    // operate on the same reference.
    TraversalContext context = traversalContext;
    Preconditions.checkState(context != null,
                             "traversalContext must be set.");
    // get the most suitable MIME type for this document
    return context.preferredMimeType(mimeTypeNames);
  }

  /**
   * Appends the "type/subtype" name of every MIME type in {@code mimeTypes},
   * other than the unknown MIME type, to {@code mimeTypeNames}.
   *
   * @param mimeTypes the detected MIME types (may be {@code null})
   * @param mimeTypeNames the set that receives the names
   */
  private static void addKnownMimeTypes(Collection<MimeType> mimeTypes,
                                        Set<String> mimeTypeNames) {
    if (mimeTypes == null) {
      return;
    }
    for (MimeType mimeType : mimeTypes) {
      if (!MimeUtil2.UNKNOWN_MIME_TYPE.equals(mimeType)) {
        mimeTypeNames.add(mimeTypeStringValue(mimeType));
      }
    }
  }

  /** Picks the best MIME type from a single collection of candidates. */
  private String pickBestMimeType(Collection<MimeType> mimeTypes) {
    return pickBestMimeType(mimeTypes, null);
  }

  /** Formats a MimeType as "mediatype/subtype". */
  private static String mimeTypeStringValue(MimeType mimeType) {
    return mimeType.getMediaType() + "/" + mimeType.getSubType();
  }

  /**
   * Read up to 4KB of content from the InputStream. Reading stops early
   * only at end of stream, in which case the returned array is shorter.
   * The stream is not closed.
   */
  private static byte[] getBytes(InputStream is) throws IOException {
    // As of mime-utils v2.1.3, buffer needs to be at least 2120 bytes.
    byte[] result = new byte[4096];
    int bytesRead = 0;
    // A single read() may return fewer bytes than requested, so keep
    // reading until the buffer is full or the stream is exhausted.
    while (bytesRead < result.length) {
      int bytesThisTime = is.read(result, bytesRead, result.length - bytesRead);
      if (bytesThisTime == -1) {
        break;
      }
      bytesRead += bytesThisTime;
    }
    return trim(result, bytesRead);
  }

  /**
   * Trims the passed in array to the desired length and
   * returns the result. If the passed in array is already
   * the desired length this simply returns the passed in
   * array.
   */
  /* TODO: When we move to Java 6, replace this with Arrays.copyOf() */
  private static byte[] trim(byte[] input, int desiredLength) {
    if (input.length == desiredLength) {
      return input;
    } else {
      byte[] result = new byte[desiredLength];
      System.arraycopy(input, 0, result, 0, desiredLength);
      return result;
    }
  }
}