/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.util;
// JDK imports
import java.io.File;
import java.util.logging.Logger;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
// Tika imports
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;
/**
* @author mattmann
* @since NUTCH-608
*
* <p>
* This is a facade class to insulate Nutch from its underlying Mime Type
* substrate library, <a href="http://incubator.apache.org/tika/">Apache Tika</a>.
* Any mime handling code should be placed in this utility class, and hidden
* from the Nutch classes that rely on it.
* </p>
*/
public final class MimeUtil {
private static final String SEPARATOR = ";";
/* our Tika mime type registry */
private MimeTypes mimeTypes;
/* whether or not magic should be employed or not */
private boolean mimeMagic;
/* our log stream */
private static final Logger LOG = Logger.getLogger(MimeUtil.class.getName());
public MimeUtil(Configuration conf) {
ObjectCache objectCache = ObjectCache.get(conf);
MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class
.getName());
if (mimeTypez == null) {
mimeTypez = MimeTypesFactory.create(conf
.getConfResourceAsInputStream(conf.get("mime.types.file")));
objectCache.setObject(MimeTypes.class.getName(), mimeTypez);
}
this.mimeTypes = mimeTypez;
this.mimeMagic = conf.getBoolean("mime.type.magic", true);
}
/**
* Cleans a {@link MimeType} name by removing out the actual {@link MimeType},
* from a string of the form:
*
* <pre>
* <primary type>/<sub type> ; < optional params
* </pre>
*
* @param origType
* The original mime type string to be cleaned.
* @return The primary type, and subtype, concatenated, e.g., the actual mime
* type.
*/
public static String cleanMimeType(String origType) {
if (origType == null)
return null;
// take the origType and split it on ';'
String[] tokenizedMimeType = origType.split(SEPARATOR);
if (tokenizedMimeType.length > 1) {
// there was a ';' in there, take the first value
return tokenizedMimeType[0];
} else {
// there wasn't a ';', so just return the orig type
return origType;
}
}
/**
* A facade interface to trying all the possible mime type resolution
* strategies available within Tika. First, the mime type provided in
* <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}.
* Then the cleaned mime type is looked up in the underlying Tika
* {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} is
* found, then that mime type is used, otherwise {@link URL} resolution is
* used to try and determine the mime type. If that means is unsuccessful, and
* if <code>mime.type.magic</code> is enabled in {@link NutchConfiguration},
* then mime type magic resolution is used to try and obtain a
* better-than-the-default approximation of the {@link MimeType}.
*
* @param typeName
* The original mime type, returned from a {@link ProtocolOutput}.
* @param url
* The given {@link URL}, that Nutch was trying to crawl.
* @param data
* The byte data, returned from the crawl, if any.
* @return The correctly, automatically guessed {@link MimeType} name.
*/
public String autoResolveContentType(String typeName, String url, byte[] data) {
MimeType type = null;
String cleanedMimeType = null;
try {
cleanedMimeType = MimeUtil.cleanMimeType(typeName) != null ? this.mimeTypes
.forName(MimeUtil.cleanMimeType(typeName)).getName()
: null;
} catch (MimeTypeException mte) {
// Seems to be a malformed mime type name...
}
// first try to get the type from the cleaned type name
try {
type = cleanedMimeType != null ? this.mimeTypes.forName(cleanedMimeType)
: null;
} catch (MimeTypeException e) {
type = null;
}
// if returned null, or if it's the default type then try url resolution
if (type == null
|| (type != null && type.getName().equals(MimeTypes.DEFAULT))) {
// If no mime-type header, or cannot find a corresponding registered
// mime-type, then guess a mime-type from the url pattern
type = this.mimeTypes.getMimeType(url) != null ? this.mimeTypes
.getMimeType(url) : type;
}
// if magic is enabled use mime magic to guess if the mime type returned
// from the magic guess is different than the one that's already set so far
// if it is, and it's not the default mime type, then go with the mime type
// returned by the magic
if (this.mimeMagic) {
MimeType magicType = this.mimeTypes.getMimeType(data);
if (magicType != null && !magicType.getName().equals(MimeTypes.DEFAULT)
&& type != null && !type.getName().equals(magicType.getName())) {
// If magic enabled and the current mime type differs from that of the
// one returned from the magic, take the magic mimeType
type = magicType;
}
// if type is STILL null after all the resolution strategies, go for the
// default type
if (type == null) {
try {
type = this.mimeTypes.forName(MimeTypes.DEFAULT);
} catch (Exception ignore) {
}
}
}
return type.getName();
}
/**
* Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)}
* method.
*
* @param url
* A string representation of the document {@link URL} to sense the
* {@link MimeType} for.
* @return An appropriate {@link MimeType}, identified from the given
* Document url in string form.
*/
public MimeType getMimeType(String url) {
return this.mimeTypes.getMimeType(url);
}
/**
* A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
* method.
*
* @param name
* The name of a valid {@link MimeType} in the Tika mime registry.
* @return The object representation of the {@link MimeType}, if it exists,
* or null otherwise.
*/
public MimeType forName(String name) {
try {
return this.mimeTypes.forName(name);
} catch (MimeTypeException e) {
LOG.warning("Exception getting mime type by name: [" + name
+ "]: Message: " + e.getMessage());
return null;
}
}
/**
* Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)}
* method.
*
* @param f
* The {@link File} to sense the {@link MimeType} for.
* @return The {@link MimeType} of the given {@link File}, or null if it
* cannot be determined.
*/
public MimeType getMimeType(File f) {
return this.mimeTypes.getMimeType(f);
}
}