/**
* Copyright (c) Codice Foundation
* <p/>
* This is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser
* General Public License as published by the Free Software Foundation, either version 3 of the
* License, or any later version.
* <p/>
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details. A copy of the GNU Lesser General Public License
* is distributed along with this program and can be found at
* <http://www.gnu.org/licenses/lgpl.html>.
*/
package ddf.catalog.transformer.input.tika;
import java.awt.Graphics2D;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.net.URI;
import java.util.ArrayList;
import java.util.Date;
import java.util.Hashtable;
import java.util.List;
import java.util.SortedSet;
import javax.imageio.ImageIO;
import javax.imageio.spi.IIORegistry;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.ToXMLContentHandler;
import org.imgscalr.Scalr;
import org.osgi.framework.BundleContext;
import org.osgi.framework.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import com.google.common.io.FileBackedOutputStream;
import com.sun.media.imageioimpl.plugins.jpeg2000.J2KImageReaderSpi;
import com.sun.media.imageioimpl.plugins.tiff.TIFFImageReaderSpi;
import ddf.catalog.data.Metacard;
import ddf.catalog.data.impl.AttributeImpl;
import ddf.catalog.data.impl.BasicTypes;
import ddf.catalog.data.impl.MetacardImpl;
import ddf.catalog.transform.CatalogTransformerException;
import ddf.catalog.transform.InputTransformer;
/**
 * {@link InputTransformer} that uses Apache Tika to parse arbitrary content and build a basic
 * {@link Metacard} from the metadata Tika extracts.
 * <p/>
 * When constructed with a non-null {@link BundleContext} the transformer registers itself as an
 * OSGi service, advertising every mime type known to Tika's default {@link MediaTypeRegistry}
 * (plus {@code image/jp2} and {@code image/bmp}).
 */
public class TikaInputTransformer implements InputTransformer {

    private static final Logger LOGGER = LoggerFactory.getLogger(TikaInputTransformer.class);

    private static final TransformerFactory TRANSFORMER_FACTORY = TransformerFactory.newInstance();

    /** Classpath location of the stylesheet that converts Tika's XHTML output to XML. */
    private static final String XSLT = "/metadata.xslt";

    /** Number of bytes buffered in memory before the content copy spills over to a temp file. */
    private static final int FILE_THRESHOLD_BYTES = 1000000;

    /** Target size, in pixels, handed to {@link Scalr#resize} when building thumbnails. */
    private static final int THUMBNAIL_SIZE = 200;

    /**
     * Creates the transformer and, if a bundle context is supplied, registers it as an OSGi
     * service.
     *
     * @param bundleContext context used to register the service; when {@code null} an error is
     *                      logged and nothing is registered
     */
    public TikaInputTransformer(BundleContext bundleContext) {
        if (bundleContext == null) {
            LOGGER.error("Bundle context is null. Unable to register {} as an osgi service.",
                    TikaInputTransformer.class.getSimpleName());
            return;
        }

        // Register the extra image readers before publishing the service so that the transformer
        // is fully set up by the time other bundles are able to look it up and call it.
        IIORegistry.getDefaultInstance().registerServiceProvider(new J2KImageReaderSpi());
        IIORegistry.getDefaultInstance().registerServiceProvider(new TIFFImageReaderSpi());

        registerService(bundleContext);
    }

    /**
     * Transforms the input into a metacard without a resource URI.
     *
     * @see #transform(InputStream, String)
     */
    @Override
    public Metacard transform(InputStream input) throws IOException, CatalogTransformerException {
        return transform(input, null);
    }

    /**
     * Parses the input with Tika and builds a metacard from the extracted metadata.
     * <p/>
     * The input is first copied into a buffer (in memory, spilling to a temp file for large
     * content) because it has to be read twice: once by Tika and once more to build a thumbnail
     * for images.
     *
     * @param input content to transform; must not be {@code null}
     * @param uri   resource URI to record on the metacard; may be {@code null} or blank
     * @return the populated metacard
     * @throws IOException                 if a buffered copy of the content cannot be opened or
     *                                     read
     * @throws CatalogTransformerException if the input is {@code null}, cannot be copied, or
     *                                     cannot be parsed by Tika
     */
    @Override
    public Metacard transform(InputStream input, String uri)
            throws IOException, CatalogTransformerException {
        LOGGER.debug("Transforming input stream using Tika.");

        if (input == null) {
            throw new CatalogTransformerException("Cannot transform null input.");
        }

        Metacard metacard;
        try (FileBackedOutputStream fileBackedOutputStream = new FileBackedOutputStream(
                FILE_THRESHOLD_BYTES)) {
            try {
                try {
                    IOUtils.copy(input, fileBackedOutputStream);
                } catch (IOException e) {
                    throw new CatalogTransformerException(
                            "Could not copy bytes of content message.", e);
                }

                Parser parser = new AutoDetectParser();
                Metadata metadata = new Metadata();
                ToXMLContentHandler xmlHandler = new ToXMLContentHandler();

                try (InputStream inputStreamMessageCopy = fileBackedOutputStream.asByteSource()
                        .openStream()) {
                    parser.parse(inputStreamMessageCopy, xmlHandler, metadata,
                            new ParseContext());
                } catch (SAXException e) {
                    throw new CatalogTransformerException("SAX exception processing input.", e);
                } catch (TikaException e) {
                    throw new CatalogTransformerException("Tika exception processing input.", e);
                }

                try (InputStream inputStreamMessageCopy = fileBackedOutputStream.asByteSource()
                        .openStream()) {
                    metacard = createMetacard(inputStreamMessageCopy, metadata, uri,
                            transformToXml(xmlHandler.toString()));
                }
            } finally {
                // close() alone does not remove the temp file backing large content; reset()
                // does, so it must run whether or not the transform succeeded.
                releaseBuffer(fileBackedOutputStream);
            }
        }

        LOGGER.debug("Finished transforming input stream using Tika.");
        return metacard;
    }

    /**
     * Resets the buffer so that any temp file backing it is deleted. Failures are logged rather
     * than thrown so they cannot mask an exception raised by the transform itself.
     */
    private void releaseBuffer(FileBackedOutputStream buffer) {
        try {
            buffer.reset();
        } catch (IOException e) {
            LOGGER.warn("Unable to delete temporary file used to buffer content.", e);
        }
    }

    /**
     * Builds a basic metacard from the Tika metadata.
     *
     * @param input            fresh copy of the content, read only when the content type starts
     *                         with {@code image} in order to create a thumbnail
     * @param metadata         metadata populated by the Tika parser
     * @param uri              resource URI; when blank the attribute is set to {@code null}
     * @param metacardMetadata XML metadata to store on the metacard; skipped when blank
     * @return the populated metacard
     * @throws IllegalArgumentException if {@code uri} is non-blank but not a valid URI
     */
    private Metacard createMetacard(InputStream input, Metadata metadata, String uri,
            String metacardMetadata) {
        Metacard metacard = new MetacardImpl(BasicTypes.BASIC_METACARD);

        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        if (StringUtils.isNotBlank(contentType)) {
            metacard.setAttribute(new AttributeImpl(Metacard.CONTENT_TYPE, contentType));
        }

        String title = metadata.get(TikaCoreProperties.TITLE);
        if (StringUtils.isNotBlank(title)) {
            metacard.setAttribute(new AttributeImpl(Metacard.TITLE, title));
        }

        String createdDateStr = metadata.get(TikaCoreProperties.CREATED);
        Date createdDate = convertDate(createdDateStr);
        if (createdDate != null) {
            metacard.setAttribute(new AttributeImpl(Metacard.CREATED, createdDate));
        }

        String modifiedDateStr = metadata.get(TikaCoreProperties.MODIFIED);
        Date modifiedDate = convertDate(modifiedDateStr);
        if (modifiedDate != null) {
            metacard.setAttribute(new AttributeImpl(Metacard.MODIFIED, modifiedDate));
        }

        if (StringUtils.isNotBlank(uri)) {
            metacard.setAttribute(new AttributeImpl(Metacard.RESOURCE_URI, URI.create(uri)));
        } else {
            metacard.setAttribute(new AttributeImpl(Metacard.RESOURCE_URI, null));
        }

        if (StringUtils.isNotBlank(metacardMetadata)) {
            metacard.setAttribute(new AttributeImpl(Metacard.METADATA, metacardMetadata));
        }

        String lat = metadata.get(Metadata.LATITUDE);
        String lon = metadata.get(Metadata.LONGITUDE);
        String wkt = toWkt(lon, lat);
        if (StringUtils.isNotBlank(wkt)) {
            metacard.setAttribute(new AttributeImpl(Metacard.GEOGRAPHY, wkt));
        }

        if (StringUtils.isNotBlank(contentType) && contentType.startsWith("image")) {
            // The content is an image, so add a thumbnail.
            createThumbnail(input, metacard);
        }

        return metacard;
    }

    /**
     * Builds a WKT point of the form {@code POINT(lon lat)}.
     *
     * @return the WKT string, or {@code null} when either coordinate is blank
     */
    private String toWkt(String lon, String lat) {
        if (StringUtils.isBlank(lon) || StringUtils.isBlank(lat)) {
            return null;
        }

        StringBuilder wkt = new StringBuilder();
        wkt.append("POINT(");
        wkt.append(lon);
        wkt.append(" ");
        wkt.append(lat);
        wkt.append(")");

        LOGGER.debug("wkt: {} ", wkt.toString());
        return wkt.toString();
    }

    /**
     * Parses an {@code xsd:dateTime} string.
     *
     * @return the parsed date, or {@code null} when the string is blank or cannot be parsed
     */
    private Date convertDate(String dateStr) {
        if (StringUtils.isBlank(dateStr)) {
            return null;
        }

        try {
            return javax.xml.bind.DatatypeConverter.parseDateTime(dateStr).getTime();
        } catch (IllegalArgumentException e) {
            // A malformed date in the source document should only cost us that one attribute,
            // not fail the whole transform.
            LOGGER.debug("Unable to parse date [{}]; the date attribute will not be set.",
                    dateStr, e);
            return null;
        }
    }

    /**
     * We programmatically register the Tika Input Transformer so we can programmatically build the
     * list of supported mime types.
     */
    private void registerService(BundleContext bundleContext) {
        LOGGER.debug("Registering {} as an osgi service.",
                TikaInputTransformer.class.getSimpleName());
        bundleContext.registerService(ddf.catalog.transform.InputTransformer.class, this,
                getServiceProperties());
    }

    /**
     * Builds the properties the service is registered with: id, title, description, supported
     * mime types and service ranking.
     */
    private Hashtable<String, Object> getServiceProperties() {
        Hashtable<String, Object> properties = new Hashtable<>();
        properties.put(ddf.catalog.Constants.SERVICE_ID, "tika");
        properties.put(ddf.catalog.Constants.SERVICE_TITLE, "Tika Input Transformer");
        properties.put(ddf.catalog.Constants.SERVICE_DESCRIPTION,
                "The Tika Input Transformer detects and extracts metadata and text content from various documents.");
        properties.put("mime-type", getSupportedMimeTypes());
        // The Tika Input Transformer should be tried last, so we set the service ranking to -1
        properties.put(Constants.SERVICE_RANKING, -1);

        return properties;
    }

    /**
     * Lists every media type in Tika's default registry as {@code type/subtype}, followed by
     * {@code image/jp2} and {@code image/bmp}.
     */
    private List<String> getSupportedMimeTypes() {
        SortedSet<MediaType> mediaTypes = MediaTypeRegistry.getDefaultRegistry().getTypes();
        // Two extra slots for the mime types appended below.
        List<String> mimeTypes = new ArrayList<>(mediaTypes.size() + 2);

        for (MediaType mediaType : mediaTypes) {
            String mimeType = mediaType.getType() + "/" + mediaType.getSubtype();
            mimeTypes.add(mimeType);
        }
        mimeTypes.add("image/jp2");
        mimeTypes.add("image/bmp");

        LOGGER.debug("supported mime types: {}", mimeTypes);
        return mimeTypes;
    }

    /**
     * Reads an image from the input and stores a scaled-down JPEG copy of it as the metacard's
     * thumbnail. This is best effort: any failure is logged and the metacard is left without a
     * thumbnail.
     *
     * @param input    image content; the stream is not closed by this method
     * @param metacard metacard to receive the thumbnail attribute
     */
    private void createThumbnail(InputStream input, Metacard metacard) {
        try {
            // Shield the stream so that ImageIO cannot close it; the caller owns it.
            Image image = ImageIO.read(new CloseShieldInputStream(input));

            if (null != image) {
                // Redraw onto an RGB canvas before scaling and encoding as JPEG.
                BufferedImage bufferedImage = new BufferedImage(image.getWidth(null),
                        image.getHeight(null), BufferedImage.TYPE_INT_RGB);
                Graphics2D graphics = bufferedImage.createGraphics();
                graphics.drawImage(image, null, null);
                graphics.dispose();

                BufferedImage thumb = Scalr.resize(bufferedImage, THUMBNAIL_SIZE);

                try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
                    // ImageIO.write reports a missing writer by returning false, in which case
                    // nothing was written and there is no thumbnail to store.
                    if (ImageIO.write(thumb, "jpeg", out)) {
                        byte[] thumbBytes = out.toByteArray();
                        metacard.setAttribute(new AttributeImpl(Metacard.THUMBNAIL, thumbBytes));
                    } else {
                        LOGGER.warn("Unable to encode thumbnail as jpeg; no thumbnail created.");
                    }
                }
            } else {
                LOGGER.warn("Unable to read image from input stream to create thumbnail.");
            }
        } catch (Exception e) {
            LOGGER.warn("Unable to read image from input stream to create thumbnail.", e);
        }
    }

    /**
     * Applies the bundled stylesheet to the XHTML produced by Tika.
     *
     * @param xhtml XHTML emitted by the Tika content handler
     * @return the transformed XML, or the unmodified {@code xhtml} when the transformation fails
     */
    private String transformToXml(String xhtml) {
        LOGGER.debug("Transforming xhtml to xml.");

        Writer xml = new StringWriter();
        try (InputStream stylesheet = this.getClass().getResourceAsStream(XSLT)) {
            Transformer transformer = TRANSFORMER_FACTORY
                    .newTransformer(new StreamSource(stylesheet));
            transformer.transform(new StreamSource(new StringReader(xhtml)),
                    new StreamResult(xml));
        } catch (TransformerException e) {
            LOGGER.warn("Unable to transform metdata from XHTML to XML.", e);
            return xhtml;
        } catch (IOException e) {
            // Only closing the stylesheet stream can raise this, and by then the transformation
            // has already completed, so the result is still usable.
            LOGGER.debug("Unable to close the XSLT stylesheet stream.", e);
        }

        return xml.toString();
    }
}