Extractor.java example

Explorer

extract-master
- src
  - main
    - java
      - org
        icij
        extract
        IndexType.java
        OutputType.java
        cli
        Main.java
        tasks
        HelpTask.java
        VersionTask.java
        document
        AbstractIdentifier.java
        DigestIdentifier.java
        Document.java
        DocumentFactory.java
        EmbeddedDocument.java
        Identifier.java
        PathIdentifier.java
        encoder
        DataURIEncodingInputStream.java
        extractor
        DocumentConsumer.java
        EmbedBlocker.java
        EmbedLinker.java
        EmbedParser.java
        EmbedSpawner.java
        ExtractionStatus.java
        Extractor.java
        json
        DocumentQueueDeserializer.java
        DocumentQueueSerializer.java
        ReportDeserializer.java
        ReportSerializer.java
        mysql
        DataSourceFactory.java
        parser
        EmbeddingHTMLParsingReader.java
        ParsingReader.java
        emf
        EMFParser.java
        ocr
        ExtendedTesseractOCRParser.java
        wmf
        WMFParser.java
        queue
        ArrayDocumentQueue.java
        DocumentQueue.java
        DocumentQueueDrainer.java
        DocumentQueueFactory.java
        DocumentQueueType.java
        MySQLDocumentQueue.java
        RedisDocumentQueue.java
        SQLDocumentQueueCodec.java
        Scanner.java
        redis
        ConnectionManagerFactory.java
        DocumentDecoder.java
        DocumentEncoder.java
        ResultDecoder.java
        ResultEncoder.java
        report
        HashMapReportMap.java
        MySQLReportMap.java
        RedisReportMap.java
        Report.java
        ReportMap.java
        ReportMapFactory.java
        ReportMapType.java
        Reporter.java
        SQLReportCodec.java
        sax
        HTML5Serializer.java
        solr
        SolrComplementConsumer.java
        SolrCopyConsumer.java
        SolrIntersectionConsumer.java
        SolrMachine.java
        SolrMachineConsumer.java
        SolrMachineProducer.java
        SolrRehashConsumer.java
        SolrTaggingConsumer.java
        spewer
        FieldNames.java
        FileSpewer.java
        MergingSolrSpewer.java
        MetadataTransformer.java
        PrintStreamSpewer.java
        RESTSpewer.java
        SolrSpewer.java
        Spewer.java
        SpewerFactory.java
        tasks
        CleanReportTask.java
        CommitTask.java
        CopyTask.java
        DeleteTask.java
        DumpQueueTask.java
        DumpReportTask.java
        LoadQueueTask.java
        LoadReportTask.java
        QueueTask.java
        RehashTask.java
        RollbackTask.java
        SpewTask.java
        TagTask.java
        WipeQueueTask.java
        WipeReportTask.java
        imageio
        jpx
        JPXImageReaderSpi.java
        JPXImageWriterSpi.java
        net
        http
        CountdownHttpRequestRetryHandler.java
        PinnedHttpClientBuilder.java
        task
        DefaultTask.java
        DefaultTaskFactory.java
        MonitorableTask.java
        Option.java
        OptionParser.java
        Options.java
        OptionsIterator.java
        StringOptionParser.java
        Task.java
        annotation
        Option.java
        Options.java
        OptionsClass.java
        OptionsClasses.java
        Task.java
        transformers
        CommonsTransformer.java
        time
        HumanDuration.java
  - test
    - java
      - org
        icij
        extract
        extractor
        DocumentConsumerTest.java
        ExtractorTest.java
        queue
        ArrayDocumentQueueTest.java
        DocumentQueueDrainerTest.java
        ScannerTest.java
        report
        ReporterTest.java
        solr
        SolrCopyMachineTest.java
        SolrSpewerTest.java
        SolrTagMachineTest.java
        spewer
        PrintStreamSpewerTest.java
        SpewerTest.java
        test
        CauseMatcher.java
        RegexMatcher.java
        SolrJettyTestBase.java
        time
        HumanDurationTest.java

package org.icij.extract.extractor;

import java.io.FileNotFoundException;
import java.io.Reader;
import java.io.Writer;
import java.nio.file.Path;
import java.time.Duration;
import java.util.*;

import java.io.IOException;
import java.util.function.Function;

import org.apache.commons.io.TaggedIOException;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.*;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.DefaultHtmlMapper;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.parser.utils.CommonsDigester.DigestAlgorithm;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.icij.extract.document.Document;
import org.icij.extract.parser.ParsingReader;
import org.icij.extract.report.Reporter;
import org.icij.extract.sax.HTML5Serializer;
import org.icij.extract.spewer.MetadataTransformer;
import org.icij.extract.spewer.Spewer;
import org.icij.task.Options;
import org.icij.task.annotation.Option;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;

/**
 * A reusable class that sets up Tika parsers based on runtime options.
 *
 * Each instance holds mutable configuration (output format, embed handling mode, OCR settings and digest
 * algorithms) that is applied to every extraction started through one of the {@code extract} methods.
 *
 * @since 1.0.0-beta
 */
@Option(name = "digestMethod", description = "The hash digest method used for documents, for example \"SHA256\". May" +
		" be specified multiple times", parameter = "name")
@Option(name = "outputFormat", description = "Set the output format. Either \"text\" or \"HTML\". " +
		"Defaults to text output.", parameter = "type")
@Option(name = "embedHandling", description = "Set the embed handling mode. Either \"ignore\", " +
		"\"concatenate\" or \"spawn\". When set to concatenate, embeds are parsed and the output is " +
		"in-lined into the main output. " +
		"Defaults to spawning, which spawns new documents for each embedded document encountered.", parameter = "type")
@Option(name = "embedOutput", description = "Path to a directory for outputting attachments en masse.",
		parameter = "path")
@Option(name = "ocrLanguage", description = "Set the languages used by Tesseract. Multiple languages may be " +
		"specified, separated by plus characters. Tesseract uses 3-character ISO 639-2 language codes.", parameter =
		"language")
@Option(name = "ocrTimeout", description = "Set the timeout for the Tesseract process to finish e.g. \"5s\" or \"1m\"" +
		". Defaults to 1 day.", parameter = "duration")
@Option(name = "ocr", description = "Enable or disable automatic OCR. On by default.")
public class Extractor {

	/**
	 * The format in which extracted content is written.
	 */
	public enum OutputFormat {
		HTML, TEXT;

		/**
		 * Parse an output format from its case-insensitive name.
		 *
		 * @param outputFormat the name of the format, for example "text" or "HTML"
		 * @return the matching format
		 * @throws IllegalArgumentException if no format has the given name
		 */
		public static OutputFormat parse(final String outputFormat) {
			return valueOf(outputFormat.toUpperCase(Locale.ROOT));
		}
	}

	/**
	 * The strategy applied to documents embedded in the document being extracted.
	 */
	public enum EmbedHandling {
		CONCATENATE, SPAWN, IGNORE;

		/**
		 * Parse an embed handling mode from its case-insensitive name.
		 *
		 * @param embedHandling the name of the mode, for example "spawn" or "ignore"
		 * @return the matching mode
		 * @throws IllegalArgumentException if no mode has the given name
		 */
		public static EmbedHandling parse(final String embedHandling) {
			return valueOf(embedHandling.toUpperCase(Locale.ROOT));
		}

		/**
		 * Get the mode that is used when none is configured explicitly.
		 *
		 * @return {@link #SPAWN}
		 */
		public static EmbedHandling getDefault() {
			return SPAWN;
		}
	}

	private static final Logger logger = LoggerFactory.getLogger(Extractor.class);

	// First argument handed to the CommonsDigester constructor (20 MiB).
	// NOTE(review): presumably Tika's mark limit, i.e. how much of the stream is buffered in memory while
	// digesting - confirm against the CommonsDigester documentation.
	private static final int DIGEST_MARK_LIMIT = 20 * 1024 * 1024;

	// OCR timeout applied by the constructor. Keep the "ocrTimeout" option description in sync with this value.
	private static final Duration DEFAULT_OCR_TIMEOUT = Duration.ofDays(1);

	private boolean ocrDisabled = false;
	private DigestingParser.Digester digester = null;

	private Parser defaultParser = TikaConfig.getDefaultConfig().getParser();
	private final TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
	private final PDFParserConfig pdfConfig = new PDFParserConfig();

	private final Collection<Class<? extends Parser>> excludedParsers = new HashSet<>();

	private OutputFormat outputFormat = OutputFormat.TEXT;
	private EmbedHandling embedHandling = EmbedHandling.getDefault();
	private Path embedOutput = null;

	/**
	 * Create a new extractor, which will OCR images by default if Tesseract is available locally, and extract
	 * inline images from PDF files so that they can be OCRed.
	 *
	 * By default, SHA256 digests are calculated, the OCR timeout is one day and the OCR languages are English and
	 * Spanish.
	 */
	public Extractor() {

		// Calculate the SHA256 digest by default.
		setDigestAlgorithms(DigestAlgorithm.SHA256);

		// Run OCR on images contained within PDFs and not on pages.
		pdfConfig.setExtractInlineImages(true);
		pdfConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);

		// By default, only the object IDs are used for determining uniqueness.
		// In scanned documents under test from the Panama registry, different embedded images had the same ID,
		// leading to incomplete OCRing when uniqueness detection was turned on.
		pdfConfig.setExtractUniqueInlineImagesOnly(false);

		// Set a long OCR timeout by default, because Tika's is too short.
		setOcrTimeout(DEFAULT_OCR_TIMEOUT);
		ocrConfig.setEnableImageProcessing(0); // See TIKA-2167. Image processing causes OCR to fail.

		// English and Spanish text recognition.
		ocrConfig.setLanguage("eng+spa");
	}

	/**
	 * Apply the given runtime options to this extractor. Options that are absent leave the corresponding setting
	 * untouched.
	 *
	 * @param options the options to read the configuration from
	 * @return this extractor, to allow chaining
	 */
	public Extractor configure(final Options<String> options) {
		options.get("outputFormat").parse().asEnum(OutputFormat::parse).ifPresent(this::setOutputFormat);
		options.get("embedHandling").parse().asEnum(EmbedHandling::parse).ifPresent(this::setEmbedHandling);
		options.get("embedOutput").parse().asPath().ifPresent(this::setEmbedOutputPath);
		options.get("ocrLanguage").value().ifPresent(this::setOcrLanguage);
		options.get("ocrTimeout").parse().asDuration().ifPresent(this::setOcrTimeout);

		final Collection<DigestAlgorithm> digestAlgorithms = options.get("digestMethod").values
				(DigestAlgorithm::valueOf);

		// Only replace the default digester when at least one algorithm was requested.
		if (!digestAlgorithms.isEmpty()) {
			setDigestAlgorithms(digestAlgorithms.toArray(new DigestAlgorithm[0]));
		}

		if (options.get("ocr").parse().isOff()) {
			disableOcr();
		}

		return this;
	}

	/**
	 * Set the output format.
	 *
	 * @param outputFormat the output format
	 */
	public void setOutputFormat(final OutputFormat outputFormat) {
		this.outputFormat = outputFormat;
	}

	/**
	 * Get the extraction output format.
	 *
	 * @return the output format
	 */
	public OutputFormat getOutputFormat() {
		return outputFormat;
	}

	/**
	 * Set the embed handling mode.
	 *
	 * @param embedHandling the embed handling mode
	 */
	public void setEmbedHandling(final EmbedHandling embedHandling) {
		this.embedHandling = embedHandling;
	}

	/**
	 * Get the embed handling mode.
	 *
	 * @return the embed handling mode.
	 */
	public EmbedHandling getEmbedHandling() {
		return embedHandling;
	}

	/**
	 * Set the output directory path for embed files.
	 *
	 * @param embedOutput the embed output path
	 */
	public void setEmbedOutputPath(final Path embedOutput) {
		this.embedOutput = embedOutput;
	}

	/**
	 * Get the output directory path for embed files.
	 *
	 * @return the embed output path.
	 */
	public Path getEmbedOutputPath() {
		return embedOutput;
	}

	/**
	 * Set the languages used by Tesseract.
	 *
	 * @param ocrLanguage the languages to use, for example "eng" or "ita+spa"
	 */
	public void setOcrLanguage(final String ocrLanguage) {
		ocrConfig.setLanguage(ocrLanguage);
	}

	/**
	 * Instructs Tesseract to attempt OCR for no longer than the given duration in seconds.
	 *
	 * @param ocrTimeout the duration in seconds
	 */
	private void setOcrTimeout(final int ocrTimeout) {
		ocrConfig.setTimeout(ocrTimeout);
	}

	/**
	 * Instructs Tesseract to attempt OCR for no longer than the given duration.
	 *
	 * The duration is truncated to whole seconds.
	 *
	 * @param duration the duration before timeout
	 * @throws ArithmeticException if the number of seconds in the duration does not fit in an {@code int}
	 */
	public void setOcrTimeout(final Duration duration) {
		setOcrTimeout(Math.toIntExact(duration.getSeconds()));
	}

	/**
	 * Set the algorithms used to calculate digests of extracted documents. Any previously configured digester is
	 * replaced.
	 *
	 * @param digestAlgorithms the algorithms to digest each document with
	 */
	public void setDigestAlgorithms(final DigestAlgorithm... digestAlgorithms) {
		digester = new CommonsDigester(DIGEST_MARK_LIMIT, digestAlgorithms);
	}

	/**
	 * Disable OCR. This method only has an effect if Tesseract is installed.
	 *
	 * Calling this method also turns off the extraction of inline images from PDF files. Repeated calls are
	 * no-ops.
	 */
	public void disableOcr() {
		if (!ocrDisabled) {
			excludeParser(TesseractOCRParser.class);
			ocrDisabled = true;
			pdfConfig.setExtractInlineImages(false);
		}
	}

	/**
	 * This method will wrap the given {@link Document} in a {@link TikaInputStream} and return a {@link Reader}
	 * which can be used to initiate extraction on demand.
	 *
	 * Internally, this method uses {@link TikaInputStream#get} which ensures that the resource name and content
	 * length metadata properties are set automatically.
	 *
	 * @param document the file to extract from
	 * @param tmp the temporary resources made available to the handling of embedded documents
	 * @return A {@link Reader} that can be used to read extracted text on demand.
	 * @throws IOException if the document could not be opened or the reader could not be created
	 */
	public Reader extract(final Document document, final TemporaryResources tmp) throws IOException {

		// Use the TikaInputStream.get method that accepts a path, because this sets metadata properties like the
		// resource name and size.
		final TikaInputStream input = TikaInputStream.get(document.getPath(), document.getMetadata());

		try {
			return extract(document, input, tmp);
		} catch (final IOException | RuntimeException e) {

			// No reader was handed back to the caller, so nothing else holds a reference to the stream that was
			// just opened. Close it here to avoid leaking the underlying file handle.
			try {
				input.close();
			} catch (final IOException suppressed) {
				e.addSuppressed(suppressed);
			}

			throw e;
		}
	}

	/**
	 * Extract and spew content from a document. Internally, as with {@link #extract(Document, TemporaryResources)},
	 * this method creates a {@link TikaInputStream} from the path of the given document.
	 *
	 * @param document document to extract from
	 * @param spewer endpoint to write to
	 * @throws IOException if there was an error reading or writing the document
	 */
	public void extract(final Document document, final Spewer spewer) throws IOException {
		try (final TemporaryResources tmp = new TemporaryResources(); final Reader reader = extract(document, tmp)) {
			spewer.write(document, reader);
		}
	}

	/**
	 * Extract and spew content from a document. This method is the same as {@link #extract(Document, Spewer)} with
	 * the exception that the document will be skipped if the reporter returns {@literal true} for a call to
	 * {@link Reporter#skip(Document)}.
	 *
	 * If the document is not skipped, then the result of the extraction is passed to the reporter in a call to
	 * {@link Reporter#save(Document, ExtractionStatus, Exception)}. Exceptions thrown during extraction are logged
	 * and reported rather than rethrown.
	 *
	 * @param document document to extract from
	 * @param spewer endpoint to write to
	 * @param reporter used to check whether the document should be skipped and save extraction status
	 * @throws NullPointerException if the reporter is {@code null}
	 */
	public void extract(final Document document, final Spewer spewer, final Reporter reporter) {
		Objects.requireNonNull(reporter);

		if (reporter.skip(document)) {
			logger.info("File already extracted; skipping: \"{}\".", document);
			return;
		}

		ExtractionStatus status = ExtractionStatus.SUCCESS;
		Exception exception = null;

		try {
			extract(document, spewer);
		} catch (final Exception e) {
			status = status(e, spewer);
			log(e, status, document);
			exception = e;
		}

		reporter.save(document, status, exception);
	}

	/**
	 * Log a message appropriate to the given failure status.
	 *
	 * @param e the exception that caused the failure
	 * @param status the status that the exception was converted to
	 * @param document the document that was being extracted
	 */
	private void log(final Exception e, final ExtractionStatus status, final Document document) {
		switch (status) {
			case FAILURE_NOT_SAVED:

				// The exception is a tagged wrapper in this case, so log the underlying cause instead.
				logger.error("The extraction result could not be outputted: \"{}\".", document, e.getCause());
				break;
			case FAILURE_NOT_FOUND:
				logger.error("File not found: \"{}\".", document, e);
				break;
			case FAILURE_NOT_DECRYPTED:
				logger.warn("Skipping encrypted file: \"{}\".", document, e);
				break;
			case FAILURE_NOT_PARSED:
				logger.error("The document could not be parsed: \"{}\".", document, e);
				break;
			case FAILURE_UNREADABLE:
				logger.error("The document stream could not be read: \"{}\".", document, e);
				break;
			default:
				logger.error("Unknown exception during extraction or output: \"{}\".", document, e);
				break;
		}
	}

	/**
	 * Convert the given {@link Exception} into an {@link ExtractionStatus} for addition to a report.
	 *
	 * This method does not log anything; see {@link #log(Exception, ExtractionStatus, Document)}.
	 *
	 * @param e the exception to convert
	 * @param spewer the spewer that was written to, used to recognise exceptions tagged as originating from it
	 * @return the resulting status
	 */
	private ExtractionStatus status(final Exception e, final Spewer spewer) {
		if (TaggedIOException.isTaggedWith(e, spewer)) {
			return ExtractionStatus.FAILURE_NOT_SAVED;
		}

		if (TaggedIOException.isTaggedWith(e, MetadataTransformer.class)) {
			return ExtractionStatus.FAILURE_NOT_PARSED;
		}

		if (e instanceof FileNotFoundException) {
			return ExtractionStatus.FAILURE_NOT_FOUND;
		}

		if (!(e instanceof IOException)) {
			return ExtractionStatus.FAILURE_UNKNOWN;
		}

		final Throwable cause = e.getCause();

		if (cause instanceof EncryptedDocumentException) {
			return ExtractionStatus.FAILURE_NOT_DECRYPTED;
		}

		// TIKA-198: IOExceptions thrown by parsers will be wrapped in a TikaException.
		// This helps us differentiate input stream exceptions from output stream exceptions.
		// https://issues.apache.org/jira/browse/TIKA-198
		if (cause instanceof TikaException) {
			return ExtractionStatus.FAILURE_NOT_PARSED;
		}

		return ExtractionStatus.FAILURE_UNREADABLE;
	}

	/**
	 * Create a pull-parser from the given {@link TikaInputStream}.
	 *
	 * @param document file that is being extracted from
	 * @param input the stream to extract from
	 * @param tmp the temporary resources handed to the embed spawner when embeds are spawned
	 * @return A pull-parsing reader.
	 * @throws IOException if the reader could not be created
	 */
	protected Reader extract(final Document document, final TikaInputStream input, final TemporaryResources tmp)
			throws IOException {
		final Metadata metadata = document.getMetadata();
		final ParseContext context = new ParseContext();
		final AutoDetectParser autoDetectParser = new AutoDetectParser(defaultParser);
		final Parser parser;

		// Wrap the parser so that digests are calculated, if a digester is configured.
		if (null != digester) {
			parser = new DigestingParser(autoDetectParser, digester);
		} else {
			parser = autoDetectParser;
		}

		if (!ocrDisabled) {
			context.set(TesseractOCRConfig.class, ocrConfig);
		}

		context.set(PDFParserConfig.class, pdfConfig);
		autoDetectParser.setFallback(ErrorParser.INSTANCE);

		// Only include "safe" tags in the HTML output from Tika's HTML parser.
		// This excludes script tags and objects.
		context.set(HtmlMapper.class, DefaultHtmlMapper.INSTANCE);

		final Function<Writer, ContentHandler> handler;

		if (OutputFormat.HTML == outputFormat) {
			handler = (writer) -> new ExpandedTitleContentHandler(new HTML5Serializer(writer));
		} else {

			// The default BodyContentHandler is used when constructing the ParsingReader for text output, but
			// because only the body of embeds is pushed to the content handler further down the line, we can't
			// expect a body tag.
			handler = WriteOutContentHandler::new;
		}

		if (EmbedHandling.SPAWN == embedHandling) {
			context.set(Parser.class, parser);
			context.set(EmbeddedDocumentExtractor.class, new EmbedSpawner(document, tmp, context, embedOutput,
					handler));
		} else if (EmbedHandling.CONCATENATE == embedHandling) {
			context.set(Parser.class, parser);
			context.set(EmbeddedDocumentExtractor.class, new EmbedParser(document, context));
		} else {
			context.set(Parser.class, EmptyParser.INSTANCE);
			context.set(EmbeddedDocumentExtractor.class, new EmbedBlocker());
		}

		// For text output the handler is only used for spawned embeds; the reader falls back to its own default.
		if (OutputFormat.HTML == outputFormat) {
			return new ParsingReader(parser, input, metadata, context, handler);
		}

		return new ParsingReader(parser, input, metadata, context);
	}

	/**
	 * Rebuild the default parser so that it excludes the given parser class, in addition to any classes excluded
	 * by earlier calls. Nothing happens if the default parser is not a {@link CompositeParser}.
	 *
	 * @param exclude the parser class to exclude
	 */
	private void excludeParser(final Class<? extends Parser> exclude) {
		if (defaultParser instanceof CompositeParser) {
			final CompositeParser composite = (CompositeParser) defaultParser;
			final List<Parser> parsers = composite.getAllComponentParsers();

			excludedParsers.add(exclude);
			defaultParser = new CompositeParser(composite.getMediaTypeRegistry(), parsers, excludedParsers);
		}
	}
}