/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.mime; // JDK imports import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.Arrays; import java.util.HashMap; import java.util.Locale; import java.util.Map; import java.util.SortedSet; import java.util.TreeSet; import javax.xml.namespace.QName; import org.apache.tika.detect.Detector; import org.apache.tika.detect.XmlRootExtractor; import org.apache.tika.metadata.Metadata; /** * This class is a MimeType repository. It gathers a set of MimeTypes and * enables to retrieves a content-type from its name, from a file name, or from * a magic character sequence. * <p> * The MIME type detection methods that take an {@link InputStream} as * an argument will never reads more than {@link #getMinLength()} bytes * from the stream. Also the given stream is never * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked}, * or {@link InputStream#reset() reset} by the methods. Thus a client can * use the {@link InputStream#markSupported() mark feature} of the stream * (if available) to restore the stream back to the state it was before type * detection if it wants to process the stream based on the detected type. */ public final class MimeTypes implements Detector, Serializable { /** * Serial version UID. */ private static final long serialVersionUID = -1350863170146349036L; /** * Name of the {@link #rootMimeType root} type, application/octet-stream. */ public static final String OCTET_STREAM = "application/octet-stream"; /** * Name of the {@link #textMimeType text} type, text/plain. */ public static final String PLAIN_TEXT = "text/plain"; /** * Name of the {@link #xml xml} type, application/xml. */ public static final String XML = "application/xml"; /** * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes * in the range below 0x20 (the space character). If an entry in this * table is <code>true</code> then that byte is very unlikely to occur * in a plain text document. * <p> * The contents of this lookup table are based on the following definition * from section 4 of the "Content-Type Processing Model" Internet-draft * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt" * >draft-abarth-mime-sniff-01</a>). * <pre> * +-------------------------+ * | Binary data byte ranges | * +-------------------------+ * | 0x00 -- 0x08 | * | 0x0B | * | 0x0E -- 0x1A | * | 0x1C -- 0x1F | * +-------------------------+ * </pre> * * @see <a href="https://issues.apache.org/jira/browse/TIKA-154">TIKA-154</a> */ private static final boolean[] IS_CONTROL_BYTE = new boolean[0x20]; static { Arrays.fill(IS_CONTROL_BYTE, true); IS_CONTROL_BYTE[0x09] = false; // tabulator IS_CONTROL_BYTE[0x0A] = false; // new line IS_CONTROL_BYTE[0x0C] = false; // new page IS_CONTROL_BYTE[0x0D] = false; // carriage return IS_CONTROL_BYTE[0x1B] = false; // escape } /** * Root type, application/octet-stream. */ private final MimeType rootMimeType; /** * Text type, text/plain. */ private final MimeType textMimeType; /* * xml type, application/xml */ private final MimeType xmlMimeType; /** * Registered media types and their aliases. */ private final MediaTypeRegistry registry = new MediaTypeRegistry(); /** All the registered MimeTypes indexed on their canonical names */ private final Map<MediaType, MimeType> types = new HashMap<MediaType, MimeType>(); /** The patterns matcher */ private Patterns patterns = new Patterns(registry); /** List of all registered magics */ private SortedSet<Magic> magics = new TreeSet<Magic>(); /** List of all registered rootXML */ private SortedSet<MimeType> xmls = new TreeSet<MimeType>(); public MimeTypes() { rootMimeType = new MimeType(MediaType.OCTET_STREAM); textMimeType = new MimeType(MediaType.TEXT_PLAIN); xmlMimeType = new MimeType(MediaType.APPLICATION_XML); add(rootMimeType); add(textMimeType); add(xmlMimeType); } /** * Find the Mime Content Type of a file. * * @param file * to analyze. * @return the Mime Content Type of the specified file, or <code>null</code> * if none is found. */ public MimeType getMimeType(File file) { return getMimeType(file.getName()); } /** * Find the Mime Content Type of a document from its URL. * * @param url * of the document to analyze. * @return the Mime Content Type of the specified document URL, or * <code>null</code> if none is found. */ public MimeType getMimeType(URL url) { return getMimeType(url.getPath()); } /** * Find the Mime Content Type of a document from its name. * Returns application/octet-stream if no better match is found. * * @param name of the document to analyze. * @return the Mime Content Type of the specified document name */ public MimeType getMimeType(String name) { MimeType type = patterns.matches(name); if (type != null) { return type; } type = patterns.matches(name.toLowerCase(Locale.ENGLISH)); if (type != null) { return type; } else { return rootMimeType; } } /** * Returns the MIME type that best matches the given first few bytes * of a document stream. Returns application/octet-stream if no better * match is found. * <p> * The given byte array is expected to be at least {@link #getMinLength()} * long, or shorter only if the document stream itself is shorter. * * @param data first few bytes of a document stream * @return matching MIME type */ public MimeType getMimeType(byte[] data) { if (data == null) { throw new IllegalArgumentException("Data is missing"); } else if (data.length == 0) { // See https://issues.apache.org/jira/browse/TIKA-483 return rootMimeType; } // Then, check for magic bytes MimeType result = null; for (Magic magic : magics) { if (magic.eval(data)) { result = magic.getType(); break; } } if (result != null) { // When detecting generic XML (or possibly XHTML), // extract the root element and match it against known types if ("application/xml".equals(result.getName()) || "text/html".equals(result.getName())) { XmlRootExtractor extractor = new XmlRootExtractor(); QName rootElement = extractor.extractRootElement(data); if (rootElement != null) { for (MimeType type : xmls) { if (type.matchesXML( rootElement.getNamespaceURI(), rootElement.getLocalPart())) { result = type; break; } } } else if ("application/xml".equals(result.getName())) { // Downgrade from application/xml to text/plain since // the document seems not to be well-formed. result = textMimeType; } } return result; } // Finally, assume plain text if no control bytes are found for (int i = 0; i < data.length; i++) { int b = data[i] & 0xFF; // prevent sign extension if (b < IS_CONTROL_BYTE.length && IS_CONTROL_BYTE[b]) { return rootMimeType; } } return textMimeType; } /** * Returns the MIME type that best matches the first few bytes of the * given document stream. * * @see #getMimeType(byte[]) * @param stream document stream * @return matching MIME type, or <code>null</code> if no match is found * @throws IOException if the stream can be read */ public MimeType getMimeType(InputStream stream) throws IOException { return getMimeType(readMagicHeader(stream)); } /** * Reads the first {@link #getMinLength()} bytes from the given stream. * If the stream is shorter, then the entire content of the stream is * returned. * <p> * The given stream is never {@link InputStream#close() closed}, * {@link InputStream#mark(int) marked}, or * {@link InputStream#reset() reset} by this method. * * @param stream stream to be read * @return first {@link #getMinLength()} (or fewer) bytes of the stream * @throws IOException if the stream can not be read */ private byte[] readMagicHeader(InputStream stream) throws IOException { if (stream == null) { throw new IllegalArgumentException("InputStream is missing"); } byte[] bytes = new byte[getMinLength()]; int totalRead = 0; int lastRead = stream.read(bytes); while (lastRead != -1) { totalRead += lastRead; if (totalRead == bytes.length) { return bytes; } lastRead = stream.read(bytes, totalRead, bytes.length - totalRead); } byte[] shorter = new byte[totalRead]; System.arraycopy(bytes, 0, shorter, 0, totalRead); return shorter; } public String getType(String typeName, String url, byte[] data) { try { Metadata metadata = new Metadata(); if (url != null) { metadata.set(Metadata.RESOURCE_NAME_KEY, url); } if (typeName != null) { metadata.set(Metadata.CONTENT_TYPE, typeName); } return detect(new ByteArrayInputStream(data), metadata).toString(); } catch (IOException e) { throw new IllegalStateException( "ByteArrayInputStream throws an IOException!", e); } } /** * Determines the MIME type of the resource pointed to by the specified URL. * Examines the file's header, and if it cannot determine the MIME type * from the header, guesses the MIME type from the URL extension * (e.g. "pdf). * * @param url URL of the document * @return type of the document * @throws IOException if the document can not be accessed */ public String getType(URL url) throws IOException { InputStream stream = url.openStream(); try { Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, url.toString()); return detect(stream, metadata).toString(); } finally { stream.close(); } } /** * Find the Mime Content Type of a document from its name and its content. * The policy used to guess the Mime Content Type is: * <ol> * <li>Try to find the type based on the provided data.</li> * <li>If a type is found, then return it, otherwise try to find the type * based on the file name</li> * </ol> * * @param name * of the document to analyze. * @param data * are the first bytes of the document's content. * @return the Mime Content Type of the specified document, or * <code>null</code> if none is found. * @see #getMinLength() */ public MimeType getMimeType(String name, byte[] data) { // First, try to get the mime-type from the content MimeType mimeType = getMimeType(data); // If no mime-type found, then try to get the mime-type from // the document name if (mimeType == null) { mimeType = getMimeType(name); } return mimeType; } /** * Returns the MIME type that best matches the given document name and * the first few bytes of the given document stream. * * @see #getMimeType(String, byte[]) * @param name document name * @param stream document stream * @return matching MIME type, or <code>null</code> if no match is found * @throws IOException if the stream can not be read */ public MimeType getMimeType(String name, InputStream stream) throws IOException { return getMimeType(name, readMagicHeader(stream)); } /** * Returns the registered media type with the given name (or alias). * The named media type is automatically registered (and returned) if * it doesn't already exist. * * @param name media type name (case-insensitive) * @return the registered media type with the given name or alias * @throws MimeTypeException if the given media type name is invalid */ public synchronized MimeType forName(String name) throws MimeTypeException { MediaType type = MediaType.parse(name); if (type != null) { MimeType mime = types.get(registry.normalize(type)); if (mime == null) { mime = new MimeType(type); add(mime); types.put(type, mime); } return mime; } else { throw new MimeTypeException("Invalid media type name: " + name); } } public synchronized void setSuperType(MimeType type, MediaType parent) { registry.addSuperType(type.getType(), parent); } /** * Adds an alias for the given media type. This method should only * be called from {@link MimeType#addAlias(String)}. * * @param type media type * @param alias media type alias (normalized to lower case) */ synchronized void addAlias(MimeType type, MediaType alias) { registry.addAlias(type.getType(), alias); } /** * Adds a file name pattern for the given media type. Assumes that the * pattern being added is <b>not</b> a JDK standard regular expression. * * @param type * media type * @param pattern * file name pattern * @throws MimeTypeException * if the pattern conflicts with existing ones */ public void addPattern(MimeType type, String pattern) throws MimeTypeException { this.addPattern(type, pattern, false); } /** * Adds a file name pattern for the given media type. The caller can specify * whether the pattern being added <b>is</b> or <b>is not</b> a JDK standard * regular expression via the <code>isRegex</code> parameter. If the value * is set to true, then a JDK standard regex is assumed, otherwise the * freedesktop glob type is assumed. * * @param type * media type * @param pattern * file name pattern * @param isRegex * set to true if JDK std regexs are desired, otherwise set to * false. * @throws MimeTypeException * if the pattern conflicts with existing ones. * */ public void addPattern(MimeType type, String pattern, boolean isRegex) throws MimeTypeException { patterns.add(pattern, isRegex, type); } public MediaTypeRegistry getMediaTypeRegistry() { return registry; } /** * Return the minimum length of data to provide to analyzing methods based * on the document's content in order to check all the known MimeTypes. * * @return the minimum length of data to provide. * @see #getMimeType(byte[]) * @see #getMimeType(String, byte[]) */ public int getMinLength() { // This needs to be reasonably large to be able to correctly detect // things like XML root elements after initial comment and DTDs return 8 * 1024; } /** * Add the specified mime-type in the repository. * * @param type * is the mime-type to add. */ void add(MimeType type) { registry.addType(type.getType()); types.put(type.getType(), type); // Update the magics index... if (type.hasMagic()) { magics.addAll(Arrays.asList(type.getMagics())); } // Update the xml (xmlRoot) index... if (type.hasRootXML()) { xmls.add(type); } } /** * Automatically detects the MIME type of a document based on magic * markers in the stream prefix and any given metadata hints. * <p> * The given stream is expected to support marks, so that this method * can reset the stream to the position it was in before this method * was called. * * @param input document stream, or <code>null</code> * @param metadata metadata hints * @return MIME type of the document * @throws IOException if the document stream could not be read */ public MediaType detect(InputStream input, Metadata metadata) throws IOException { MediaType type = MediaType.OCTET_STREAM; // Get type based on magic prefix if (input != null) { input.mark(getMinLength()); try { byte[] prefix = readMagicHeader(input); type = getMimeType(prefix).getType(); } finally { input.reset(); } } // Get type based on resourceName hint (if available) String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY); if (resourceName != null) { String name = null; // Deal with a URI or a path name in as the resource name try { URI uri = new URI(resourceName); String path = uri.getPath(); if (path != null) { int slash = path.lastIndexOf('/'); if (slash + 1 < path.length()) { name = path.substring(slash + 1); } } } catch (URISyntaxException e) { name = resourceName; } if (name != null) { MediaType hint = getMimeType(name).getType(); if (registry.isSpecializationOf(hint, type)) { type = hint; } } } // Get type based on metadata hint (if available) String typeName = metadata.get(Metadata.CONTENT_TYPE); if (typeName != null) { try { MediaType hint = forName(typeName).getType(); if (registry.isSpecializationOf(hint, type)) { type = hint; } } catch (MimeTypeException e) { // Malformed type name, ignore } } return type; } /** * Get the default MimeTypes * * @return MimeTypes * @throws MimeTypeException * @throws IOException */ public static MimeTypes getDefaultMimeTypes() { try { return MimeTypesFactory.create("tika-mimetypes.xml"); } catch (MimeTypeException e) { throw new RuntimeException("Unable to read default mimetypes", e); } catch (IOException e) { throw new RuntimeException("Unable to read default mimetypes", e); } } }