package org.icij.extract.extractor;
import java.io.FileNotFoundException;
import java.io.Reader;
import java.io.Writer;
import java.nio.file.Path;
import java.time.Duration;
import java.util.*;
import java.io.IOException;
import java.util.function.Function;
import org.apache.commons.io.TaggedIOException;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.*;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.DefaultHtmlMapper;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.parser.utils.CommonsDigester.DigestAlgorithm;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.icij.extract.document.Document;
import org.icij.extract.parser.ParsingReader;
import org.icij.extract.report.Reporter;
import org.icij.extract.sax.HTML5Serializer;
import org.icij.extract.spewer.MetadataTransformer;
import org.icij.extract.spewer.Spewer;
import org.icij.task.Options;
import org.icij.task.annotation.Option;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
/**
* A reusable class that sets up Tika parsers based on runtime options.
*
* @since 1.0.0-beta
*/
@Option(name = "digestMethod", description = "The hash digest method used for documents, for example \"SHA256\". May" +
" be specified multiple times", parameter = "name")
@Option(name = "outputFormat", description = "Set the output format. Either \"text\" or \"HTML\". " +
"Defaults to text output.", parameter = "type")
@Option(name = "embedHandling", description = "Set the embed handling mode. Either \"ignore\", " +
"\"concatenate\" or \"spawn\". When set to concatenate, embeds are parsed and the output is " +
"in-lined into the main output." +
"Defaults to spawning, which spawns new documents for each embedded document encountered.", parameter = "type")
@Option(name = "embedOutput", description = "Path to a directory for outputting attachments en masse.",
parameter = "path")
@Option(name = "ocrLanguage", description = "Set the languages used by Tesseract. Multiple languages may be " +
"specified, separated by plus characters. Tesseract uses 3-character ISO 639-2 language codes.", parameter =
"language")
@Option(name = "ocrTimeout", description = "Set the timeout for the Tesseract process to finish e.g. \"5s\" or \"1m\"" +
". Defaults to 12 hours.", parameter = "duration")
@Option(name = "ocr", description = "Enable or disable automatic OCR. On by default.")
public class Extractor {
public enum OutputFormat {
    HTML, TEXT;

    /**
     * Look up an output format by name, ignoring case.
     *
     * @param outputFormat the name of the format, for example "text" or "HTML"
     * @return the constant whose name equals the upper-cased argument
     * @throws IllegalArgumentException if no constant carries that name
     * @throws NullPointerException if the argument is {@code null}
     */
    public static OutputFormat parse(final String outputFormat) {
        // Locale.ROOT keeps the upper-casing independent of the JVM's default locale.
        final String constantName = outputFormat.toUpperCase(Locale.ROOT);
        return OutputFormat.valueOf(constantName);
    }
}
public enum EmbedHandling {
    CONCATENATE, SPAWN, IGNORE;

    /**
     * Look up an embed handling mode by name, ignoring case.
     *
     * @param embedHandling the name of the mode: "ignore", "concatenate" or "spawn"
     * @return the constant whose name equals the upper-cased argument
     * @throws IllegalArgumentException if no constant carries that name
     * @throws NullPointerException if the argument is {@code null}
     */
    public static EmbedHandling parse(final String embedHandling) {
        // Locale.ROOT keeps the upper-casing independent of the JVM's default locale.
        return valueOf(embedHandling.toUpperCase(Locale.ROOT));
    }

    /**
     * Get the mode used when none has been chosen explicitly.
     *
     * @return {@link #SPAWN}
     */
    public static EmbedHandling getDefault() {
        return SPAWN;
    }
}
// Shared SLF4J logger for this class.
private static final Logger logger = LoggerFactory.getLogger(Extractor.class);
// Flipped to true by disableOcr(); while false, the Tesseract config is placed in each parse context.
private boolean ocrDisabled = false;
// Assigned by setDigestAlgorithms(); when non-null the auto-detect parser is wrapped in a DigestingParser.
private DigestingParser.Digester digester = null;
// Initially the parser of Tika's default configuration; excludeParser() may replace it with a filtered composite.
private Parser defaultParser = TikaConfig.getDefaultConfig().getParser();
// OCR settings (language, timeout, image processing) adjusted by the constructor and the OCR setters.
private final TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
// PDF parsing settings; placed in every parse context created by extract().
private final PDFParserConfig pdfConfig = new PDFParserConfig();
// Parser classes accumulated by excludeParser() and handed to the rebuilt CompositeParser.
private final Collection<Class<? extends Parser>> excludedParsers = new HashSet<>();
// Text output unless changed through setOutputFormat().
private OutputFormat outputFormat = OutputFormat.TEXT;
// Starts as EmbedHandling.getDefault(), which is SPAWN.
private EmbedHandling embedHandling = EmbedHandling.getDefault();
// Directory passed to EmbedSpawner for embed output; null until setEmbedOutputPath() is called.
private Path embedOutput = null;
/**
 * Create a new extractor with the default configuration: SHA256 digests, OCR left enabled with English and
 * Spanish recognition and a one day timeout, and inline images extracted from PDF files.
 *
 * Note that the timeout set here is one day, whereas the description of the {@code ocrTimeout} option on this
 * class advertises a default of 12 hours.
 */
public Extractor() {
    // OCR defaults: English and Spanish text recognition.
    ocrConfig.setLanguage("eng+spa");

    // See TIKA-2167. Image processing causes OCR to fail, so it is switched off.
    ocrConfig.setEnableImageProcessing(0);

    // Tika's own OCR timeout is too short, so use a long one by default.
    setOcrTimeout(Duration.ofHours(24));

    // PDF defaults: OCR is run on the images contained within PDFs and not on whole pages.
    pdfConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
    pdfConfig.setExtractInlineImages(true);

    // By default, only the object IDs are used for determining uniqueness. In scanned documents under test from
    // the Panama registry, different embedded images had the same ID, leading to incomplete OCRing when
    // uniqueness detection was turned on.
    pdfConfig.setExtractUniqueInlineImagesOnly(false);

    // Digest default: calculate the SHA256 hash of every document.
    setDigestAlgorithms(DigestAlgorithm.SHA256);
}
/**
 * Apply runtime options to this extractor. Each option that is present overrides the corresponding setting;
 * options that are absent leave the current value untouched.
 *
 * The options read are {@code outputFormat}, {@code embedHandling}, {@code embedOutput}, {@code ocrLanguage},
 * {@code ocrTimeout}, {@code digestMethod} and {@code ocr}.
 *
 * @param options the options to read from
 * @return this extractor, to allow chaining
 */
public Extractor configure(final Options<String> options) {
    options.get("outputFormat").parse().asEnum(OutputFormat::parse).ifPresent(this::setOutputFormat);
    options.get("embedHandling").parse().asEnum(EmbedHandling::parse).ifPresent(this::setEmbedHandling);
    options.get("embedOutput").parse().asPath().ifPresent(this::setEmbedOutputPath);
    options.get("ocrLanguage").value().ifPresent(this::setOcrLanguage);
    options.get("ocrTimeout").parse().asDuration().ifPresent(this::setOcrTimeout);

    // Only replace the digester when at least one digest method was requested.
    final Collection<DigestAlgorithm> requested = options.get("digestMethod").values(DigestAlgorithm::valueOf);
    if (!requested.isEmpty()) {
        final DigestAlgorithm[] algorithms = requested.toArray(new DigestAlgorithm[0]);
        setDigestAlgorithms(algorithms);
    }

    final boolean ocrSwitchedOff = options.get("ocr").parse().isOff();
    if (ocrSwitchedOff) {
        disableOcr();
    }

    return this;
}
/**
 * Choose the format in which extracted content is produced.
 *
 * @param outputFormat the format to use for subsequent extractions
 */
public void setOutputFormat(final OutputFormat outputFormat) {
    this.outputFormat = outputFormat;
}
/**
 * Report the format in which extracted content is currently produced.
 *
 * @return the configured output format
 */
public OutputFormat getOutputFormat() {
    return this.outputFormat;
}
/**
 * Choose how embedded documents are treated during extraction.
 *
 * @param embedHandling the mode to use for subsequent extractions
 */
public void setEmbedHandling(final EmbedHandling embedHandling) {
    this.embedHandling = embedHandling;
}
/**
 * Report how embedded documents are currently treated during extraction.
 *
 * @return the configured embed handling mode
 */
public EmbedHandling getEmbedHandling() {
    return this.embedHandling;
}
/**
 * Choose the directory into which embedded files are written.
 *
 * @param embedOutput the directory path for embed output
 */
public void setEmbedOutputPath(final Path embedOutput) {
    this.embedOutput = embedOutput;
}
/**
 * Report the directory into which embedded files are written.
 *
 * @return the directory path for embed output, or {@code null} if none has been set
 */
public Path getEmbedOutputPath() {
    return this.embedOutput;
}
/**
 * Choose the languages that Tesseract should recognise.
 *
 * @param ocrLanguage one or more language codes joined by plus characters, for example "eng" or "ita+spa"
 */
public void setOcrLanguage(final String ocrLanguage) {
    this.ocrConfig.setLanguage(ocrLanguage);
}
/**
 * Limit the time Tesseract may spend on OCR, expressed as a whole number of seconds.
 *
 * @param ocrTimeout the timeout in seconds
 */
private void setOcrTimeout(final int ocrTimeout) {
    this.ocrConfig.setTimeout(ocrTimeout);
}
/**
 * Limit the time Tesseract may spend on OCR. Only the whole seconds of the duration are used.
 *
 * @param duration the duration before timeout
 * @throws ArithmeticException if the number of seconds does not fit in an {@code int}
 */
public void setOcrTimeout(final Duration duration) {
    final long seconds = duration.getSeconds();
    setOcrTimeout(Math.toIntExact(seconds));
}
/**
 * Replace the digester so that the given hash algorithms are calculated for every extracted document.
 *
 * @param digestAlgorithms the algorithms to calculate, for example {@link DigestAlgorithm#SHA256}
 */
public void setDigestAlgorithms(final DigestAlgorithm... digestAlgorithms) {
    // 20 MiB, passed as the first constructor argument of CommonsDigester (its mark limit in the Tika API).
    final int markLimit = 20 * 1024 * 1024;
    this.digester = new CommonsDigester(markLimit, digestAlgorithms);
}
/**
 * Disable OCR. The Tesseract parser is excluded from the default parser and inline images are no longer
 * extracted from PDF files. Calling this method again after OCR has been disabled does nothing.
 */
public void disableOcr() {
    if (ocrDisabled) {
        return;
    }

    excludeParser(TesseractOCRParser.class);
    ocrDisabled = true;
    pdfConfig.setExtractInlineImages(false);
}
/**
 * This method will wrap the given {@link Document} in a {@link TikaInputStream} and return a {@link Reader}
 * which can be used to initiate extraction on demand.
 *
 * Internally, this method uses {@link TikaInputStream#get} which ensures that the resource name and content
 * length metadata properties are set automatically.
 *
 * If the reader cannot be created, the input stream opened here is closed before the exception is propagated.
 *
 * @param document the file to extract from
 * @param tmp the temporary resources made available to the extraction
 * @return A {@link Reader} that can be used to read extracted text on demand.
 * @throws IOException if the document cannot be opened or the reader cannot be created
 */
public Reader extract(final Document document, final TemporaryResources tmp) throws IOException {
    // Use the TikaInputStream.get method that accepts a path, because this sets metadata properties like the
    // resource name and size.
    final TikaInputStream input = TikaInputStream.get(document.getPath(), document.getMetadata());

    try {
        return extract(document, input, tmp);
    } catch (final IOException | RuntimeException e) {
        // No reader was handed to the caller, so nothing else would ever close the stream.
        try {
            input.close();
        } catch (final IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }

        throw e;
    }
}
/**
 * Extract the content of a document and hand it to a spewer. As with
 * {@link #extract(Document, TemporaryResources)}, a {@link TikaInputStream} is created from the path of the
 * given document. The reader and the temporary resources are closed once the spewer has finished.
 *
 * @param document document to extract from
 * @param spewer endpoint to write to
 * @throws IOException if there was an error reading or writing the document
 */
public void extract(final Document document, final Spewer spewer) throws IOException {
    try (final TemporaryResources tmp = new TemporaryResources()) {
        // The reader is closed first, then the temporary resources.
        try (final Reader reader = extract(document, tmp)) {
            spewer.write(document, reader);
        }
    }
}
/**
 * Extract and spew content from a document. This method is the same as {@link #extract(Document, Spewer)} with
 * the exception that the document will be skipped if the reporter returns {@literal true} for a call to
 * {@link Reporter#skip(Document)}.
 *
 * If the document is not skipped, then the result of the extraction is passed to the reporter in a call to
 * {@link Reporter#save(Document, ExtractionStatus, Exception)}. Exceptions raised during extraction are logged
 * and reported rather than propagated; the exception passed to the reporter is {@code null} on success.
 *
 * @param document document to extract from
 * @param spewer endpoint to write to
 * @param reporter used to check whether the document should be skipped and save extraction status
 * @throws NullPointerException if the reporter is {@code null}
 */
public void extract(final Document document, final Spewer spewer, final Reporter reporter) {
    Objects.requireNonNull(reporter, "reporter");

    if (reporter.skip(document)) {
        // Parameterized logging avoids building the message when the info level is disabled.
        logger.info("File already extracted; skipping: \"{}\".", document);
        return;
    }

    ExtractionStatus status = ExtractionStatus.SUCCESS;
    Exception exception = null;

    try {
        extract(document, spewer);
    } catch (final Exception e) {
        // Broad catch on purpose: every failure must end up in the report instead of aborting the caller.
        status = status(e, spewer);
        log(e, status, document);
        exception = e;
    }

    reporter.save(document, status, exception);
}
/**
 * Log a failed extraction with a message that matches the given status. Encrypted documents are logged as a
 * warning; every other status is logged as an error.
 *
 * @param e the exception raised during extraction or output
 * @param status the status that the exception was converted to
 * @param document the document that was being extracted
 */
private void log(final Exception e, final ExtractionStatus status, final Document document) {
    final String template;
    Throwable thrown = e;

    switch (status) {
        case FAILURE_NOT_SAVED:
            template = "The extraction result could not be outputted: \"%s\".";
            // Log the underlying cause in place of the wrapping exception.
            thrown = e.getCause();
            break;
        case FAILURE_NOT_FOUND:
            template = "File not found: \"%s\".";
            break;
        case FAILURE_NOT_DECRYPTED:
            // The only status that is reported as a warning.
            logger.warn(String.format("Skipping encrypted file: \"%s\".", document), e);
            return;
        case FAILURE_NOT_PARSED:
            template = "The document could not be parsed: \"%s\".";
            break;
        case FAILURE_UNREADABLE:
            template = "The document stream could not be read: \"%s\".";
            break;
        default:
            template = "Unknown exception during extraction or output: \"%s\".";
            break;
    }

    logger.error(String.format(template, document), thrown);
}
/**
 * Convert the given {@link Exception} into an {@link ExtractionStatus} for addition to a report.
 *
 * This method only classifies the exception; it does not log anything. The checks are made in this order:
 * exceptions tagged with the spewer, exceptions tagged with {@link MetadataTransformer}, missing files,
 * anything that is not an {@link IOException}, and finally the cause of the {@link IOException}.
 *
 * @param e the exception to convert
 * @param spewer the spewer that was written to, used to recognise exceptions raised while outputting
 * @return the resulting status
 */
private ExtractionStatus status(final Exception e, final Spewer spewer) {
    if (TaggedIOException.isTaggedWith(e, spewer)) {
        return ExtractionStatus.FAILURE_NOT_SAVED;
    }

    if (TaggedIOException.isTaggedWith(e, MetadataTransformer.class)) {
        return ExtractionStatus.FAILURE_NOT_PARSED;
    }

    // The document is opened from a java.nio.file.Path, and the NIO file API reports a missing file with
    // NoSuchFileException, which is not a subclass of FileNotFoundException. Both mean "not found".
    if (e instanceof FileNotFoundException || e instanceof java.nio.file.NoSuchFileException) {
        return ExtractionStatus.FAILURE_NOT_FOUND;
    }

    if (!(e instanceof IOException)) {
        return ExtractionStatus.FAILURE_UNKNOWN;
    }

    final Throwable cause = e.getCause();

    if (cause instanceof EncryptedDocumentException) {
        return ExtractionStatus.FAILURE_NOT_DECRYPTED;
    }

    // TIKA-198: IOExceptions thrown by parsers will be wrapped in a TikaException.
    // This helps us differentiate input stream exceptions from output stream exceptions.
    // https://issues.apache.org/jira/browse/TIKA-198
    if (cause instanceof TikaException) {
        return ExtractionStatus.FAILURE_NOT_PARSED;
    }

    return ExtractionStatus.FAILURE_UNREADABLE;
}
/**
 * Build a pull-parsing reader over the given {@link TikaInputStream}, configured from the current settings of
 * this extractor: digesting, OCR, PDF handling, output format and embed handling.
 *
 * @param document file that is being extracted from
 * @param input the stream to extract from
 * @param tmp temporary resources handed to the embed spawner when embeds are spawned
 * @return A pull-parsing reader.
 * @throws IOException if the reader cannot be created
 */
protected Reader extract(final Document document, final TikaInputStream input, final TemporaryResources tmp)
        throws IOException {
    final boolean htmlOutput = OutputFormat.HTML == outputFormat;
    final Metadata metadata = document.getMetadata();
    final ParseContext context = new ParseContext();

    // Auto-detection over the (possibly filtered) default parser, wrapped for digesting when a digester is set.
    final AutoDetectParser autoDetectParser = new AutoDetectParser(defaultParser);
    autoDetectParser.setFallback(ErrorParser.INSTANCE);
    final Parser parser = null == digester ? autoDetectParser : new DigestingParser(autoDetectParser, digester);

    if (!ocrDisabled) {
        context.set(TesseractOCRConfig.class, ocrConfig);
    }

    context.set(PDFParserConfig.class, pdfConfig);

    // Only include "safe" tags in the HTML output from Tika's HTML parser.
    // This excludes script tags and objects.
    context.set(HtmlMapper.class, DefaultHtmlMapper.INSTANCE);

    final Function<Writer, ContentHandler> handler;
    if (htmlOutput) {
        handler = (writer) -> new ExpandedTitleContentHandler(new HTML5Serializer(writer));
    } else {
        // The default BodyContentHandler is used when constructing the ParsingReader for text output, but
        // because only the body of embeds is pushed to the content handler further down the line, we can't
        // expect a body tag.
        handler = WriteOutContentHandler::new;
    }

    final boolean spawnEmbeds = EmbedHandling.SPAWN == embedHandling;
    final boolean concatenateEmbeds = EmbedHandling.CONCATENATE == embedHandling;

    // The parser for embeds is registered before the embed extractor is constructed, because the extractor
    // receives the context.
    context.set(Parser.class, (spawnEmbeds || concatenateEmbeds) ? parser : EmptyParser.INSTANCE);

    final EmbeddedDocumentExtractor embedExtractor;
    if (spawnEmbeds) {
        embedExtractor = new EmbedSpawner(document, tmp, context, embedOutput, handler);
    } else if (concatenateEmbeds) {
        embedExtractor = new EmbedParser(document, context);
    } else {
        embedExtractor = new EmbedBlocker();
    }

    context.set(EmbeddedDocumentExtractor.class, embedExtractor);

    if (htmlOutput) {
        return new ParsingReader(parser, input, metadata, context, handler);
    }

    return new ParsingReader(parser, input, metadata, context);
}
/**
 * Rebuild the default parser so that it no longer includes the given parser class. The class is added to the
 * set of exclusions accumulated so far. Nothing happens if the default parser is not a {@link CompositeParser}.
 *
 * @param exclude the parser class to exclude
 */
private void excludeParser(final Class<? extends Parser> exclude) {
    if (!(defaultParser instanceof CompositeParser)) {
        return;
    }

    final CompositeParser current = (CompositeParser) defaultParser;
    final List<Parser> components = current.getAllComponentParsers();

    excludedParsers.add(exclude);
    defaultParser = new CompositeParser(current.getMediaTypeRegistry(), components, excludedParsers);
}
}