//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.contentextractors;
import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.uima.jcas.JCas;
import org.xml.sax.SAXException;
import com.google.common.base.Strings;
import uk.gov.dstl.baleen.contentextractors.helpers.AbstractContentExtractor;
/**
* Extracts metadata and text content from the supplied input, using Apache Tika.
*/
public class TikaContentExtractor extends AbstractContentExtractor {
public static final String CORRUPT_FILE_TEXT = "FILE CONTENTS CORRUPT - UNABLE TO PROCESS";
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
super.doProcessStream(stream, source, jCas);
try {
BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
AutoDetectParser autoParser = new AutoDetectParser();
autoParser.parse(stream, textHandler, metadata, context);
jCas.setDocumentText(textHandler.toString());
for (String name : metadata.names()) {
addMetadata(jCas, name, metadata.get(name));
}
} catch (SAXException | TikaException e) {
getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
jCas.setDocumentText(CORRUPT_FILE_TEXT);
}
}
}
}