/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kitesdk.morphline.tika;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map.Entry;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;
import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.CommandBuilder;
import org.kitesdk.morphline.api.MorphlineCompilationException;
import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.api.Record;
import org.kitesdk.morphline.base.AbstractCommand;
import org.kitesdk.morphline.base.Fields;
import com.google.common.base.Preconditions;
import org.kitesdk.morphline.shaded.com.google.common.io.Closeables;
import com.typesafe.config.Config;
/**
* Command that auto-detects the MIME type of the first attachment, if no MIME type is defined yet.
*/
public final class DetectMimeTypeBuilder implements CommandBuilder {
@Override
public Collection<String> getNames() {
return Collections.singletonList("detectMimeType");
}
@Override
public Command build(Config config, Command parent, Command child, MorphlineContext context) {
try {
return new DetectMimeType(this, config, parent, child, context);
} catch (IOException e) {
throw new MorphlineCompilationException("Cannot instantiate command", config, e, this);
} catch (MimeTypeException e) {
throw new MorphlineCompilationException("Cannot instantiate command", config, e, this);
}
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
private static final class DetectMimeType extends AbstractCommand {
private final Detector detector;
private final boolean preserveExisting;
private final boolean includeMetaData;
private final boolean excludeParameters;
public DetectMimeType(CommandBuilder builder, Config config, Command parent, Command child, MorphlineContext context) throws IOException, MimeTypeException {
super(builder, config, parent, child, context);
this.preserveExisting = getConfigs().getBoolean(config, "preserveExisting", true);
this.includeMetaData = getConfigs().getBoolean(config, "includeMetaData", false);
this.excludeParameters = getConfigs().getBoolean(config, "excludeParameters", true);
List<InputStream> inputStreams = new ArrayList();
try {
if (getConfigs().getBoolean(config, "includeDefaultMimeTypes", true)) {
// adapted from Tika MimeTypesFactory.create(String coreFilePath, String extensionFilePath)
String coreFilePath = "tika-mimetypes.xml";
String classPrefix = MimeTypesFactory.class.getPackage().getName().replace('.', '/') + "/";
ClassLoader cl = MimeTypesFactory.class.getClassLoader();
URL coreURL = cl.getResource(classPrefix + coreFilePath);
InputStream in = new BufferedInputStream(coreURL.openStream());
inputStreams.add(in);
}
for (String mimeTypesFile : getConfigs().getStringList(config, "mimeTypesFiles", Collections.<String>emptyList())) {
InputStream in = new BufferedInputStream(new FileInputStream(new File(mimeTypesFile)));
inputStreams.add(in);
}
String mimeTypesString = getConfigs().getString(config, "mimeTypesString", null);
if (mimeTypesString != null) {
InputStream in = new ByteArrayInputStream(mimeTypesString.getBytes("UTF-8"));
inputStreams.add(in);
}
if (inputStreams.size() > 0) {
MimeTypes mimeTypes = MimeTypesFactory.create(inputStreams.toArray(new InputStream[inputStreams.size()]));
ServiceLoader loader = new ServiceLoader();
this.detector = new DefaultDetector(mimeTypes, loader);
} else {
throw new MorphlineCompilationException("Missing specification for MIME type mappings", config);
}
} finally {
for (InputStream in : inputStreams) {
Closeables.closeQuietly(in);
}
}
validateArguments();
}
@Override
protected boolean doProcess(Record record) {
if (preserveExisting && record.getFields().containsKey(Fields.ATTACHMENT_MIME_TYPE)) {
; // we must preserve the existing MIME type
} else {
List attachments = record.get(Fields.ATTACHMENT_BODY);
if (attachments.size() > 0) {
Object attachment = attachments.get(0);
Preconditions.checkNotNull(attachment);
InputStream stream;
if (attachment instanceof byte[]) {
stream = new ByteArrayInputStream((byte[]) attachment);
} else {
stream = (InputStream) attachment;
}
Metadata metadata = new Metadata();
// If you specify the resource name (the filename, roughly) with this
// parameter, then Tika can use it in guessing the right MIME type
String resourceName = (String) record.getFirstValue(Fields.ATTACHMENT_NAME);
if (resourceName != null) {
metadata.add(Metadata.RESOURCE_NAME_KEY, resourceName);
}
// Provide stream's charset as hint to Tika for better auto detection
String charset = (String) record.getFirstValue(Fields.ATTACHMENT_CHARSET);
if (charset != null) {
metadata.add(Metadata.CONTENT_ENCODING, charset);
}
if (includeMetaData) {
for (Entry<String, Object> entry : record.getFields().entries()) {
metadata.add(entry.getKey(), entry.getValue().toString());
}
}
String mimeType = getMediaType(stream, metadata, excludeParameters);
record.replaceValues(Fields.ATTACHMENT_MIME_TYPE, mimeType);
}
}
return super.doProcess(record);
}
/**
* Detects the content type of the given input event. Returns
* <code>application/octet-stream</code> if the type of the event can not be
* detected.
* <p>
* It is legal for the event headers or body to be empty. The detector may
* read bytes from the start of the body stream to help in type detection.
*
* @return detected media type, or <code>application/octet-stream</code>
*/
private String getMediaType(InputStream in, Metadata metadata, boolean excludeParameters) {
MediaType mediaType;
try {
mediaType = getDetector().detect(in, metadata);
} catch (IOException e) {
throw new MorphlineRuntimeException(e);
}
String mediaTypeStr = mediaType.toString();
if (excludeParameters) {
int i = mediaTypeStr.indexOf(';');
if (i >= 0) {
mediaTypeStr = mediaTypeStr.substring(0, i);
}
}
return mediaTypeStr;
}
protected Detector getDetector() {
return detector;
}
}
}