MimeTypes.java example

Explorer
junrar-android-master
- gen
  - com
    - example
      - rarxt
        BuildConfig.java
        R.java
- src
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.mime;

// JDK imports
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;

import javax.xml.namespace.QName;

import org.apache.tika.detect.Detector;
import org.apache.tika.detect.XmlRootExtractor;
import org.apache.tika.metadata.Metadata;

/**
 * This class is a MimeType repository. It gathers a set of MimeTypes and
 * enables to retrieves a content-type from its name, from a file name, or from
 * a magic character sequence.
 * <p>
 * The MIME type detection methods that take an {@link InputStream} as
 * an argument will never reads more than {@link #getMinLength()} bytes
 * from the stream. Also the given stream is never
 * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked},
 * or {@link InputStream#reset() reset} by the methods. Thus a client can
 * use the {@link InputStream#markSupported() mark feature} of the stream
 * (if available) to restore the stream back to the state it was before type
 * detection if it wants to process the stream based on the detected type.
 */
public final class MimeTypes implements Detector, Serializable {

    /**
     * Serial version UID.
     */
    private static final long serialVersionUID = -1350863170146349036L;

    /**
     * Name of the {@link #rootMimeType root} type, application/octet-stream.
     */
    public static final String OCTET_STREAM = "application/octet-stream";

    /**
     * Name of the {@link #textMimeType text} type, text/plain.
     */
    public static final String PLAIN_TEXT = "text/plain";
    
    /**
     * Name of the {@link #xml xml} type, application/xml.
     */
    public static final String XML = "application/xml";


    
    /**
     * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
     * in the range below 0x20 (the space character). If an entry in this
     * table is <code>true</code> then that byte is very unlikely to occur
     * in a plain text document.
     * <p>
     * The contents of this lookup table are based on the following definition
     * from section 4 of the "Content-Type Processing Model" Internet-draft
     * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
     * >draft-abarth-mime-sniff-01</a>).
     * <pre>
     * +-------------------------+
     * | Binary data byte ranges |
     * +-------------------------+
     * | 0x00 -- 0x08            |
     * | 0x0B                    |
     * | 0x0E -- 0x1A            |
     * | 0x1C -- 0x1F            |
     * +-------------------------+
     * </pre>
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a>
     */
    private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20];
    static {
        Arrays.fill(IS_CONTROL_BYTE, true);
        IS_CONTROL_BYTE[0x09] = false; // tabulator
        IS_CONTROL_BYTE[0x0A] = false; // new line
        IS_CONTROL_BYTE[0x0C] = false; // new page
        IS_CONTROL_BYTE[0x0D] = false; // carriage return
        IS_CONTROL_BYTE[0x1B] = false; // escape
    }

    /**
     * Root type, application/octet-stream.
     */
    private final MimeType rootMimeType;

    /**
     * Text type, text/plain.
     */
    private final MimeType textMimeType;

    /*
     * xml type, application/xml
     */
    private final MimeType xmlMimeType;

    /**
     * Registered media types and their aliases.
     */
    private final MediaTypeRegistry registry = new MediaTypeRegistry();

    /** All the registered MimeTypes indexed on their canonical names */
    private final Map<MediaType, MimeType> types =
        new HashMap<MediaType, MimeType>();

    /** The patterns matcher */
    private Patterns patterns = new Patterns(registry);

    /** List of all registered magics */
    private SortedSet<Magic> magics = new TreeSet<Magic>();

    /** List of all registered rootXML */
    private SortedSet<MimeType> xmls = new TreeSet<MimeType>();

    public MimeTypes() {
        rootMimeType = new MimeType(MediaType.OCTET_STREAM);
        textMimeType = new MimeType(MediaType.TEXT_PLAIN);
        xmlMimeType = new MimeType(MediaType.APPLICATION_XML);

        add(rootMimeType);
        add(textMimeType);
        add(xmlMimeType);
    }

    /**
     * Find the Mime Content Type of a file.
     *
     * @param file
     *            to analyze.
     * @return the Mime Content Type of the specified file, or <code>null</code>
     *         if none is found.
     */
    public MimeType getMimeType(File file) {
        return getMimeType(file.getName());
    }

    /**
     * Find the Mime Content Type of a document from its URL.
     *
     * @param url
     *            of the document to analyze.
     * @return the Mime Content Type of the specified document URL, or
     *         <code>null</code> if none is found.
     */
    public MimeType getMimeType(URL url) {
        return getMimeType(url.getPath());
    }

    /**
     * Find the Mime Content Type of a document from its name.
     * Returns application/octet-stream if no better match is found.
     *
     * @param name of the document to analyze.
     * @return the Mime Content Type of the specified document name
     */
    public MimeType getMimeType(String name) {
        MimeType type = patterns.matches(name);
        if (type != null) {
            return type;
        }
        type = patterns.matches(name.toLowerCase(Locale.ENGLISH));
        if (type != null) {
            return type;
        } else {
            return rootMimeType;
        }
    }

    /**
     * Returns the MIME type that best matches the given first few bytes
     * of a document stream. Returns application/octet-stream if no better
     * match is found.
     * <p>
     * The given byte array is expected to be at least {@link #getMinLength()}
     * long, or shorter only if the document stream itself is shorter.
     *
     * @param data first few bytes of a document stream
     * @return matching MIME type
     */
    public MimeType getMimeType(byte[] data) {
        if (data == null) {
            throw new IllegalArgumentException("Data is missing");
        } else if (data.length == 0) {
            // See https://issues.apache.org/jira/browse/TIKA-483
            return rootMimeType;
        }

        // Then, check for magic bytes
        MimeType result = null;
        for (Magic magic : magics) {
            if (magic.eval(data)) {
                result = magic.getType();
                break;
            }
        }
 
        if (result != null) {
            // When detecting generic XML (or possibly XHTML),
            // extract the root element and match it against known types
            if ("application/xml".equals(result.getName())
                    || "text/html".equals(result.getName())) {
                XmlRootExtractor extractor = new XmlRootExtractor();

                QName rootElement = extractor.extractRootElement(data);
                if (rootElement != null) {
                    for (MimeType type : xmls) {
                        if (type.matchesXML(
                                rootElement.getNamespaceURI(),
                                rootElement.getLocalPart())) {
                            result = type;
                            break;
                        }
                    }
                } else if ("application/xml".equals(result.getName())) {
                    // Downgrade from application/xml to text/plain since
                    // the document seems not to be well-formed.
                    result = textMimeType;
                }
            }
            return result;
        }


        // Finally, assume plain text if no control bytes are found
        for (int i = 0; i < data.length; i++) {
            int b = data[i] & 0xFF; // prevent sign extension
            if (b < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[b]) {
                return rootMimeType;
            }
        }
        return textMimeType;
    }

    /**
     * Returns the MIME type that best matches the first few bytes of the
     * given document stream.
     *
     * @see #getMimeType(byte[])
     * @param stream document stream
     * @return matching MIME type, or <code>null</code> if no match is found
     * @throws IOException if the stream can be read
     */
    public MimeType getMimeType(InputStream stream) throws IOException {
        return getMimeType(readMagicHeader(stream));
    }

    /**
     * Reads the first {@link #getMinLength()} bytes from the given stream.
     * If the stream is shorter, then the entire content of the stream is
     * returned.
     * <p>
     * The given stream is never {@link InputStream#close() closed},
     * {@link InputStream#mark(int) marked}, or
     * {@link InputStream#reset() reset} by this method.
     *
     * @param stream stream to be read
     * @return first {@link #getMinLength()} (or fewer) bytes of the stream
     * @throws IOException if the stream can not be read
     */
    private byte[] readMagicHeader(InputStream stream) throws IOException {
        if (stream == null) {
            throw new IllegalArgumentException("InputStream is missing");
        }

        byte[] bytes = new byte[getMinLength()];
        int totalRead = 0;

        int lastRead = stream.read(bytes);
        while (lastRead != -1) {
            totalRead += lastRead;
            if (totalRead == bytes.length) {
                return bytes;
            }
            lastRead = stream.read(bytes, totalRead, bytes.length - totalRead);
        }

        byte[] shorter = new byte[totalRead];
        System.arraycopy(bytes, 0, shorter, 0, totalRead);
        return shorter;
    }

    public String getType(String typeName, String url, byte[] data) {
        try {
            Metadata metadata = new Metadata();
            if (url != null) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, url);
            }
            if (typeName != null) {
                metadata.set(Metadata.CONTENT_TYPE, typeName);
            }
            return detect(new ByteArrayInputStream(data), metadata).toString();
        } catch (IOException e) {
            throw new IllegalStateException(
                    "ByteArrayInputStream throws an IOException!", e);
        }
    }

    /**
     * Determines the MIME type of the resource pointed to by the specified URL.
     * Examines the file's header, and if it cannot determine the MIME type
     * from the header, guesses the MIME type from the URL extension
     * (e.g. "pdf).
     *
     * @param url URL of the document
     * @return type of the document
     * @throws IOException if the document can not be accessed
     */
    public String getType(URL url) throws IOException {
        InputStream stream = url.openStream();
        try {
            Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, url.toString());
            return detect(stream, metadata).toString();
        } finally {
            stream.close();
        }
    }

    /**
     * Find the Mime Content Type of a document from its name and its content.
     * The policy used to guess the Mime Content Type is:
     * <ol>
     * <li>Try to find the type based on the provided data.</li>
     * <li>If a type is found, then return it, otherwise try to find the type
     * based on the file name</li>
     * </ol>
     *
     * @param name
     *            of the document to analyze.
     * @param data
     *            are the first bytes of the document's content.
     * @return the Mime Content Type of the specified document, or
     *         <code>null</code> if none is found.
     * @see #getMinLength()
     */
    public MimeType getMimeType(String name, byte[] data) {
        // First, try to get the mime-type from the content
        MimeType mimeType = getMimeType(data);

        // If no mime-type found, then try to get the mime-type from
        // the document name
        if (mimeType == null) {
            mimeType = getMimeType(name);
        }

        return mimeType;
    }

    /**
     * Returns the MIME type that best matches the given document name and
     * the first few bytes of the given document stream.
     *
     * @see #getMimeType(String, byte[])
     * @param name document name
     * @param stream document stream
     * @return matching MIME type, or <code>null</code> if no match is found
     * @throws IOException if the stream can not be read
     */
    public MimeType getMimeType(String name, InputStream stream)
            throws IOException {
        return getMimeType(name, readMagicHeader(stream));
    }

    /**
     * Returns the registered media type with the given name (or alias).
     * The named media type is automatically registered (and returned) if
     * it doesn't already exist.
     *
     * @param name media type name (case-insensitive)
     * @return the registered media type with the given name or alias
     * @throws MimeTypeException if the given media type name is invalid
     */
    public synchronized MimeType forName(String name)
            throws MimeTypeException {
        MediaType type = MediaType.parse(name);
        if (type != null) {
            MimeType mime = types.get(registry.normalize(type));
            if (mime == null) {
                mime = new MimeType(type);
                add(mime);
                types.put(type, mime);
            }
            return mime;
        } else {
            throw new MimeTypeException("Invalid media type name: " + name);
        }
    }

    public synchronized void setSuperType(MimeType type, MediaType parent) {
        registry.addSuperType(type.getType(), parent);
    }

    /**
     * Adds an alias for the given media type. This method should only
     * be called from {@link MimeType#addAlias(String)}.
     *
     * @param type media type
     * @param alias media type alias (normalized to lower case)
     */
    synchronized void addAlias(MimeType type, MediaType alias) {
        registry.addAlias(type.getType(), alias);
    }

    /**
     * Adds a file name pattern for the given media type. Assumes that the
     * pattern being added is <b>not</b> a JDK standard regular expression.
     *
     * @param type
     *            media type
     * @param pattern
     *            file name pattern
     * @throws MimeTypeException
     *             if the pattern conflicts with existing ones
     */
    public void addPattern(MimeType type, String pattern)
            throws MimeTypeException {
        this.addPattern(type, pattern, false);
    }

    /**
     * Adds a file name pattern for the given media type. The caller can specify
     * whether the pattern being added <b>is</b> or <b>is not</b> a JDK standard
     * regular expression via the <code>isRegex</code> parameter. If the value
     * is set to true, then a JDK standard regex is assumed, otherwise the
     * freedesktop glob type is assumed.
     *
     * @param type
     *            media type
     * @param pattern
     *            file name pattern
     * @param isRegex
     *            set to true if JDK std regexs are desired, otherwise set to
     *            false.
     * @throws MimeTypeException
     *             if the pattern conflicts with existing ones.
     *
     */
    public void addPattern(MimeType type, String pattern, boolean isRegex)
            throws MimeTypeException {
        patterns.add(pattern, isRegex, type);
    }

    public MediaTypeRegistry getMediaTypeRegistry() {
        return registry;
    }

    /**
     * Return the minimum length of data to provide to analyzing methods based
     * on the document's content in order to check all the known MimeTypes.
     *
     * @return the minimum length of data to provide.
     * @see #getMimeType(byte[])
     * @see #getMimeType(String, byte[])
     */
    public int getMinLength() {
        // This needs to be reasonably large to be able to correctly detect
        // things like XML root elements after initial comment and DTDs
        return 8 * 1024;
    }

    /**
     * Add the specified mime-type in the repository.
     *
     * @param type
     *            is the mime-type to add.
     */
    void add(MimeType type) {
        registry.addType(type.getType());
        types.put(type.getType(), type);

        // Update the magics index...
        if (type.hasMagic()) {
            magics.addAll(Arrays.asList(type.getMagics()));
        }

        // Update the xml (xmlRoot) index...
        if (type.hasRootXML()) {
            xmls.add(type);
        }
    }

    /**
     * Automatically detects the MIME type of a document based on magic
     * markers in the stream prefix and any given metadata hints.
     * <p>
     * The given stream is expected to support marks, so that this method
     * can reset the stream to the position it was in before this method
     * was called.
     *
     * @param input document stream, or <code>null</code>
     * @param metadata metadata hints
     * @return MIME type of the document
     * @throws IOException if the document stream could not be read
     */
    public MediaType detect(InputStream input, Metadata metadata)
            throws IOException {
        MediaType type = MediaType.OCTET_STREAM;

        // Get type based on magic prefix
        if (input != null) {
            input.mark(getMinLength());
            try {
                byte[] prefix = readMagicHeader(input);
                type = getMimeType(prefix).getType();
            } finally {
                input.reset();
            }
        }

        // Get type based on resourceName hint (if available)
        String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (resourceName != null) {
            String name = null;

            // Deal with a URI or a path name in as the resource  name
            try {
                URI uri = new URI(resourceName);
                String path = uri.getPath();
                if (path != null) {
                    int slash = path.lastIndexOf('/');
                    if (slash + 1 < path.length()) {
                        name = path.substring(slash + 1);
                    }
                }
            } catch (URISyntaxException e) {
                name = resourceName;
            }

            if (name != null) {
                MediaType hint = getMimeType(name).getType();
                if (registry.isSpecializationOf(hint, type)) {
                    type = hint;
                }
            }
        }

        // Get type based on metadata hint (if available)
        String typeName = metadata.get(Metadata.CONTENT_TYPE);
        if (typeName != null) {
            try {
                MediaType hint = forName(typeName).getType();
                if (registry.isSpecializationOf(hint, type)) {
                    type = hint;
                }
            } catch (MimeTypeException e) {
                // Malformed type name, ignore
            }
        }

        return type;
    }
    
    /**
     * Get the default MimeTypes
     * 
     * @return MimeTypes
     * @throws MimeTypeException
     * @throws IOException
     */
    public static MimeTypes getDefaultMimeTypes() {
        try {
            return MimeTypesFactory.create("tika-mimetypes.xml");
        } catch (MimeTypeException e) {
            throw new RuntimeException("Unable to read default mimetypes", e);
        } catch (IOException e) {
            throw new RuntimeException("Unable to read default mimetypes", e);
        }
    }

}