package org.icij.extract.extractor; import java.lang.Runtime; import java.util.concurrent.ExecutorService; import java.util.concurrent.RejectedExecutionException; import java.util.function.Consumer; import org.icij.kaxxa.concurrent.BlockingThreadPoolExecutor; import org.icij.kaxxa.concurrent.ExecutorProxy; import org.icij.extract.document.Document; import org.icij.extract.report.Reporter; import org.icij.extract.spewer.Spewer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Base consumer for documents. Superclasses should call {@link #accept(Document)}. All tasks are sent to a * work-stealing thread pool. * * The parallelism of the thread pool is defined in the call to the constructor. * * A task is defined as both the extraction from a file and the output of extracted data. * Completion is only considered successful if both parts of the task complete with no exceptions. * * The final status of each task is saved to the reporter, if any is set. * * @since 1.0.0-beta */ public class DocumentConsumer extends ExecutorProxy implements Consumer<Document> { private static final Logger logger = LoggerFactory.getLogger(DocumentConsumer.class); protected final Spewer spewer; protected final Extractor extractor; /** * The {@code Reporter} that will receive extraction results. */ private Reporter reporter = null; /** * Returns the default thread pool size, which is equivalent to the number of available processors minus 1, or 1 * - whichever is greater. * * @return the default pool size */ public static int defaultPoolSize() { return Math.max(1, Runtime.getRuntime().availableProcessors() - 1); } /** * Create a new consumer that submits tasks to the given {@code Executor}. * * @param spewer the {@code Spewer} used to write extracted text and metadata * @param extractor the {@code Extractor} used to extract from files * @param executor the executor used to run consuming tasks */ public DocumentConsumer(final Spewer spewer, final Extractor extractor, final ExecutorService executor) { super(executor); this.spewer = spewer; this.extractor = extractor; } /** * Create a new consumer with the given pool size. Uses a {@link BlockingThreadPoolExecutor}, which means that calls * to {@link #accept} will block when the thread pool is full of running tasks. * * @param spewer the {@code Spewer} used to write extracted text and metadata * @param extractor the {@code Extractor} used to extract from files * @param poolSize the fixed size of the thread pool used to consume documents */ public DocumentConsumer(final Spewer spewer, final Extractor extractor, final int poolSize) { this(spewer, extractor, new BlockingThreadPoolExecutor(poolSize)); } /** * Create a new consumer with the default pool size, which is the number of available processors. * * @param spewer the {@code Spewer} used to write extracted text and metadata * @param extractor the {@code Extractor} used to extract from files */ public DocumentConsumer(final Spewer spewer, final Extractor extractor) { this(spewer, extractor, defaultPoolSize()); } /** * Set the reporter. * * @param reporter reporter */ public void setReporter(final Reporter reporter) { this.reporter = reporter; } /** * Get the reporter. * * @return The reporter. */ public Reporter getReporter() { return reporter; } /** * Consume a file. * * If a blocking executor such as {@link BlockingThreadPoolExecutor} is being used (the default when no * {@link ExecutorService} is passed to the constructor) then this method will block until a thread becomes * available. Otherwise the behaviour is similar to {@link ExecutorService#execute(Runnable)}, causing the task * to be put in a queue. * * @param document the document to consume * @throws RejectedExecutionException if unable to queue the consumer task for execution, including when the * current thread is interrupted. */ @Override public void accept(final Document document) { logger.info(String.format("Sending to thread pool; will queue if full: \"%s\".", document)); executor.execute(()-> { logger.info(String.format("Beginning extraction: \"%s\".", document)); try { if (null != reporter) { extractor.extract(document, spewer, reporter); } else { extractor.extract(document, spewer); } } catch (Exception e) { logger.error(String.format("Exception while consuming file: \"%s\".", document), e); } }); } }