MimeTypes.java example

Explorer
tika-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.mime;

// JDK imports
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import javax.xml.namespace.QName;

import org.apache.tika.Tika;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.TextDetector;
import org.apache.tika.detect.XmlRootExtractor;
import org.apache.tika.metadata.Metadata;

/**
 * This class is a MimeType repository. It gathers a set of MimeTypes and
 * enables to retrieves a content-type from its name, from a file name, or from
 * a magic character sequence.
 * <p>
 * The MIME type detection methods that take an {@link InputStream} as
 * an argument will never reads more than {@link #getMinLength()} bytes
 * from the stream. Also the given stream is never
 * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked},
 * or {@link InputStream#reset() reset} by the methods. Thus a client can
 * use the {@link InputStream#markSupported() mark feature} of the stream
 * (if available) to restore the stream back to the state it was before type
 * detection if it wants to process the stream based on the detected type.
 */
public final class MimeTypes implements Detector, Serializable {

    /**
     * Serial version UID.
     */
    private static final long serialVersionUID = -1350863170146349036L;

    /**
     * Name of the {@link #rootMimeType root} type, application/octet-stream.
     */
    public static final String OCTET_STREAM = "application/octet-stream";

    /**
     * Name of the {@link #textMimeType text} type, text/plain.
     */
    public static final String PLAIN_TEXT = "text/plain";
    
    /**
     * Name of the {@link #xmlMimeType xml} type, application/xml.
     */
    public static final String XML = "application/xml";

    /**
     * Root type, application/octet-stream.
     */
    private final MimeType rootMimeType;
    private final List<MimeType> rootMimeTypeL;

    /**
     * Text type, text/plain.
     */
    private final MimeType textMimeType;

    /*
     * xml type, application/xml
     */
    private final MimeType xmlMimeType;

    /**
     * Registered media types and their aliases.
     */
    private final MediaTypeRegistry registry = new MediaTypeRegistry();

    /** All the registered MimeTypes indexed on their canonical names */
    private final Map<MediaType, MimeType> types =
        new HashMap<MediaType, MimeType>();

    /** The patterns matcher */
    private Patterns patterns = new Patterns(registry);

    /** Sorted list of all registered magics */
    private final List<Magic> magics = new ArrayList<Magic>();

    /** Sorted list of all registered rootXML */
    private final List<MimeType> xmls = new ArrayList<MimeType>();

    public MimeTypes() {
        rootMimeType = new MimeType(MediaType.OCTET_STREAM);
        textMimeType = new MimeType(MediaType.TEXT_PLAIN);
        xmlMimeType = new MimeType(MediaType.APPLICATION_XML);
        
        rootMimeTypeL = Collections.singletonList(rootMimeType);

        add(rootMimeType);
        add(textMimeType);
        add(xmlMimeType);
    }

    /**
     * Find the Mime Content Type of a document from its name.
     * Returns application/octet-stream if no better match is found.
     *
     * @deprecated Use {@link Tika#detect(String)} instead
     * @param name of the document to analyze.
     * @return the Mime Content Type of the specified document name
     */
    public MimeType getMimeType(String name) {
        MimeType type = patterns.matches(name);
        if (type != null) {
            return type;
        }
        type = patterns.matches(name.toLowerCase(Locale.ENGLISH));
        if (type != null) {
            return type;
        } else {
            return rootMimeType;
        }
    }

    /**
     * Find the Mime Content Type of a document stored in the given file.
     * Returns application/octet-stream if no better match is found.
     *
     * @deprecated Use {@link Tika#detect(File)} instead
     * @param file file to analyze
     * @return the Mime Content Type of the specified document
     * @throws MimeTypeException if the type can't be detected
     * @throws IOException if the file can't be read
     */
    public MimeType getMimeType(File file)
            throws MimeTypeException, IOException {
        return forName(new Tika(this).detect(file));
    }

    /**
     * Returns the MIME type that best matches the given first few bytes
     * of a document stream. Returns application/octet-stream if no better
     * match is found. 
     * <p>
     * If multiple matches are found, the best (highest priority) matching
     * type is returned. If multiple matches are found with the same priority,
     * then all of these are returned.
     * <p>
     * The given byte array is expected to be at least {@link #getMinLength()}
     * long, or shorter only if the document stream itself is shorter.
     *
     * @param data first few bytes of a document stream
     * @return matching MIME type
     */
    List<MimeType> getMimeType(byte[] data) {
        if (data == null) {
            throw new IllegalArgumentException("Data is missing");
        } else if (data.length == 0) {
            // See https://issues.apache.org/jira/browse/TIKA-483
            return rootMimeTypeL;
        }

        // Then, check for magic bytes
        List<MimeType> result = new ArrayList<MimeType>(1);
        int currentPriority = -1;
        for (Magic magic : magics) {
            if (currentPriority > 0 && currentPriority > magic.getPriority()) {
                break;
            }
            if (magic.eval(data)) {
                result.add(magic.getType());
                currentPriority = magic.getPriority();
            }
        }
 
        if (!result.isEmpty()) {
            for (int i=0; i<result.size(); i++) {
                final MimeType matched = result.get(i);
                
                // When detecting generic XML (or possibly XHTML),
                // extract the root element and match it against known types
                if ("application/xml".equals(matched.getName())
                        || "text/html".equals(matched.getName())) {
                    XmlRootExtractor extractor = new XmlRootExtractor();

                    QName rootElement = extractor.extractRootElement(data);
                    if (rootElement != null) {
                        for (MimeType type : xmls) {
                            if (type.matchesXML(
                                    rootElement.getNamespaceURI(),
                                    rootElement.getLocalPart())) {
                                result.set(i, type);
                                break;
                            }
                        }
                    } else if ("application/xml".equals(matched.getName())) {
                        // Downgrade from application/xml to text/plain since
                        // the document seems not to be well-formed.
                        result.set(i, textMimeType);
                    }
                }
            }
            return result;
        }

        // Finally, assume plain text if no control bytes are found
        try {
            TextDetector detector = new TextDetector(getMinLength());
            ByteArrayInputStream stream = new ByteArrayInputStream(data);
            MimeType type = forName(detector.detect(stream, new Metadata()).toString());
            return Collections.singletonList(type);
        } catch (Exception e) {
            return rootMimeTypeL;
        }
    }

    /**
     * Reads the first {@link #getMinLength()} bytes from the given stream.
     * If the stream is shorter, then the entire content of the stream is
     * returned.
     * <p>
     * The given stream is never {@link InputStream#close() closed},
     * {@link InputStream#mark(int) marked}, or
     * {@link InputStream#reset() reset} by this method.
     *
     * @param stream stream to be read
     * @return first {@link #getMinLength()} (or fewer) bytes of the stream
     * @throws IOException if the stream can not be read
     */
    byte[] readMagicHeader(InputStream stream) throws IOException {
        if (stream == null) {
            throw new IllegalArgumentException("InputStream is missing");
        }

        byte[] bytes = new byte[getMinLength()];
        int totalRead = 0;

        int lastRead = stream.read(bytes);
        while (lastRead != -1) {
            totalRead += lastRead;
            if (totalRead == bytes.length) {
                return bytes;
            }
            lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
        }

        byte[] shorter = new byte[totalRead];
        System.arraycopy(bytes, 0, shorter, 0, totalRead);
        return shorter;
    }

    /**
     * Returns the registered media type with the given name (or alias).
     * The named media type is automatically registered (and returned) if
     * it doesn't already exist.
     *
     * @param name media type name (case-insensitive)
     * @return the registered media type with the given name or alias
     * @throws MimeTypeException if the given media type name is invalid
     */
    public MimeType forName(String name) throws MimeTypeException {
        MediaType type = MediaType.parse(name);
        if (type != null) {
            MediaType normalisedType = registry.normalize(type);
            MimeType mime = types.get(normalisedType);
            
            if (mime == null) {
                synchronized (this) {
                   // Double check it didn't already get added while 
                   //  we were waiting for the lock
                   mime = types.get(normalisedType);
                   if (mime == null) {
                      mime = new MimeType(type);
                      add(mime);
                      types.put(type, mime);
                   }
                }
            }
            return mime;
        } else {
            throw new MimeTypeException("Invalid media type name: " + name);
        }
    }

    /**
     * Returns the registered, normalised media type with the given name (or alias).
     * 
     * <p>Unlike {@link #forName(String)}, this function will <em>not<em> create a 
     * new MimeType and register it. Instead, <code>null</code> will be returned if 
     * there is no definition available for the given name.
     *
     * <p>Also, unlike {@link #forName(String)}, this function may return a
     * mime type that has fewer parameters than were included in the supplied name.
     * If the registered mime type has parameters (e.g. 
     * <code>application/dita+xml;format=map</code>), then those will be maintained.  
     * However, if the supplied name has paramenters that the <em>registered</em> mime
     * type does not (e.g. <code>application/xml; charset=UTF-8</code> as a name, 
     * compared to just <code>application/xml</code> for the type in the registry), 
     * then those parameters will not be included in the returned type.
     *
     * @param name media type name (case-insensitive)
     * @return the registered media type with the given name or alias, or null if not found
     * @throws MimeTypeException if the given media type name is invalid
     */
    public MimeType getRegisteredMimeType(String name) throws MimeTypeException {
        MediaType type = MediaType.parse(name);
        if (type != null) {
            MediaType normalisedType = registry.normalize(type);
            MimeType candidate = types.get(normalisedType);
            if (candidate != null) {
                return candidate;
            }
            if (normalisedType.hasParameters()) {
                return types.get(normalisedType.getBaseType());
            }
            return null;
        } else {
            throw new MimeTypeException("Invalid media type name: " + name);
        }
    }
    
    public synchronized void setSuperType(MimeType type, MediaType parent) {
        registry.addSuperType(type.getType(), parent);
    }

    /**
     * Adds an alias for the given media type. This method should only
     * be called from {@link MimeType#addAlias(String)}.
     *
     * @param type media type
     * @param alias media type alias (normalized to lower case)
     */
    synchronized void addAlias(MimeType type, MediaType alias) {
        registry.addAlias(type.getType(), alias);
    }

    /**
     * Adds a file name pattern for the given media type. Assumes that the
     * pattern being added is <b>not</b> a JDK standard regular expression.
     *
     * @param type
     *            media type
     * @param pattern
     *            file name pattern
     * @throws MimeTypeException
     *             if the pattern conflicts with existing ones
     */
    public void addPattern(MimeType type, String pattern)
            throws MimeTypeException {
        this.addPattern(type, pattern, false);
    }

    /**
     * Adds a file name pattern for the given media type. The caller can specify
     * whether the pattern being added <b>is</b> or <b>is not</b> a JDK standard
     * regular expression via the <code>isRegex</code> parameter. If the value
     * is set to true, then a JDK standard regex is assumed, otherwise the
     * freedesktop glob type is assumed.
     *
     * @param type
     *            media type
     * @param pattern
     *            file name pattern
     * @param isRegex
     *            set to true if JDK std regexs are desired, otherwise set to
     *            false.
     * @throws MimeTypeException
     *             if the pattern conflicts with existing ones.
     *
     */
    public void addPattern(MimeType type, String pattern, boolean isRegex)
            throws MimeTypeException {
        patterns.add(pattern, isRegex, type);
    }

    public MediaTypeRegistry getMediaTypeRegistry() {
        return registry;
    }

    /**
     * Return the minimum length of data to provide to analyzing methods based
     * on the document's content in order to check all the known MimeTypes.
     *
     * @return the minimum length of data to provide.
     * @see #getMimeType(byte[])
     * @see #getMimeType(String, byte[])
     */
    public int getMinLength() {
        // This needs to be reasonably large to be able to correctly detect
        // things like XML root elements after initial comment and DTDs
        return 64 * 1024;
    }

    /**
     * Add the specified mime-type in the repository.
     *
     * @param type
     *            is the mime-type to add.
     */
    void add(MimeType type) {
        registry.addType(type.getType());
        types.put(type.getType(), type);

        // Update the magics index...
        if (type.hasMagic()) {
            magics.addAll(type.getMagics());
        }

        // Update the xml (xmlRoot) index...
        if (type.hasRootXML()) {
            xmls.add(type);
        }
    }

    /**
     * Called after all configured types have been loaded.
     * Initializes the magics and xmls sets.
     */
    void init() {
        for (MimeType type : types.values()) {
            magics.addAll(type.getMagics());
            if (type.hasRootXML()) {
                xmls.add(type);
            }
        }
        Collections.sort(magics);
        Collections.sort(xmls);
    }

    /**
     * Automatically detects the MIME type of a document based on magic
     * markers in the stream prefix and any given metadata hints.
     * <p>
     * The given stream is expected to support marks, so that this method
     * can reset the stream to the position it was in before this method
     * was called.
     *
     * @param input document stream, or <code>null</code>
     * @param metadata metadata hints
     * @return MIME type of the document
     * @throws IOException if the document stream could not be read
     */
    public MediaType detect(InputStream input, Metadata metadata)
            throws IOException {
        List<MimeType> possibleTypes = null;

        // Get type based on magic prefix
        if (input != null) {
            input.mark(getMinLength());
            try {
                byte[] prefix = readMagicHeader(input);
                possibleTypes = getMimeType(prefix);
            } finally {
                input.reset();
            }
        }

        // Get type based on resourceName hint (if available)
        String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (resourceName != null) {
            String name = null;

            // Deal with a URI or a path name in as the resource  name
            try {
                URI uri = new URI(resourceName);
                String path = uri.getPath();
                if (path != null) {
                    int slash = path.lastIndexOf('/');
                    if (slash + 1 < path.length()) {
                        name = path.substring(slash + 1);
                    }
                }
            } catch (URISyntaxException e) {
                name = resourceName;
            }

            if (name != null) {
                MimeType hint = getMimeType(name);
                
                // If we have some types based on mime magic, try to specialise
                //  and/or select the type based on that
                // Otherwise, use the type identified from the name
                possibleTypes = applyHint(possibleTypes, hint);
            }
        }

        // Get type based on metadata hint (if available)
        String typeName = metadata.get(Metadata.CONTENT_TYPE);
        if (typeName != null) {
            try {
                MimeType hint = forName(typeName);
                possibleTypes = applyHint(possibleTypes, hint);
            } catch (MimeTypeException e) {
                // Malformed type name, ignore
            }
        }

        if (possibleTypes == null || possibleTypes.isEmpty()) {
            // Report that we don't know what it is
            return MediaType.OCTET_STREAM;
        } else {
            return possibleTypes.get(0).getType();
        }
    }
    /**
     * Use the MimeType hint to try to clarify or specialise the current
     *  possible types list.
     * If the hint is a specialised form, use that instead
     * If there are multiple possible types, use the hint to select one
     */
    private List<MimeType> applyHint(List<MimeType> possibleTypes, MimeType hint) {
        if (possibleTypes == null || possibleTypes.isEmpty()) {
            return Collections.singletonList(hint);
        } else {
            for (int i=0; i<possibleTypes.size(); i++) {
                final MimeType type = possibleTypes.get(i);
                if (hint.equals(type) ||
                    registry.isSpecializationOf(hint.getType(), type.getType())) {
                    // Use just this type
                    return Collections.singletonList(hint);
                }
            }
        }
        
        // Hint didn't help, sorry
        return possibleTypes;
    }

    private static MimeTypes DEFAULT_TYPES = null;
    private static Map<ClassLoader,MimeTypes> CLASSLOADER_SPECIFIC_DEFAULT_TYPES =
            new HashMap<ClassLoader, MimeTypes>();

    /**
     * Get the default MimeTypes. This includes all the build in
     * media types, and any custom override ones present.
     * 
     * @return MimeTypes default type registry
     */
    public static synchronized MimeTypes getDefaultMimeTypes() {
        return getDefaultMimeTypes(null);
    }
    /**
     * Get the default MimeTypes. This includes all the built-in
     * media types, and any custom override ones present.
     * 
     * @param classLoader to use, if not the default
     * @return MimeTypes default type registry
     */
    public static synchronized MimeTypes getDefaultMimeTypes(ClassLoader classLoader) {
        MimeTypes types = DEFAULT_TYPES;
        if (classLoader != null) {
            types = CLASSLOADER_SPECIFIC_DEFAULT_TYPES.get(classLoader);
        }
            
        if (types == null) {
            try {
                types = MimeTypesFactory.create(
                      "tika-mimetypes.xml", "custom-mimetypes.xml", classLoader);
            } catch (MimeTypeException e) {
                throw new RuntimeException(
                        "Unable to parse the default media type registry", e);
            } catch (IOException e) {
                throw new RuntimeException(
                        "Unable to read the default media type registry", e);
            }
            
            if (classLoader == null) {
                DEFAULT_TYPES = types;
            } else {
                CLASSLOADER_SPECIFIC_DEFAULT_TYPES.put(classLoader, types);
            }
        }
        return types;
    }
}