/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.job.runner;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.metamodel.schema.Table;
import org.datacleaner.api.Analyzer;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.InjectionManager;
import org.datacleaner.configuration.InjectionManagerFactory;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.FilterOutcome;
import org.datacleaner.job.FilterOutcomes;
import org.datacleaner.job.concurrent.SingleThreadedTaskRunner;
import org.datacleaner.job.concurrent.TaskListener;
import org.datacleaner.job.tasks.Task;
import org.datacleaner.lifecycle.LifeCycleHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Object that can handle the task of consuming a number of rows. The
* {@link ConsumeRowHandler} is used internally to execute all necessary
* components for every record, but it can also be used as a utility when
* DataCleaner jobs are embedded in, or applied from, external code.
*/
public class ConsumeRowHandler {

    /**
     * Options that control how a {@link ConsumeRowHandler} extracts the
     * consumers of a job.
     */
    public static class Configuration {
        /**
         * Passed on to the {@link LifeCycleHelper} that is used while
         * initializing the consumers.
         */
        public boolean includeNonDistributedTasks = true;

        /**
         * Listener that is combined with the handler's internal
         * {@link ErrorAwareAnalysisListener}.
         */
        public AnalysisListener analysisListener = new InfoLoggingAnalysisListener();

        /**
         * When {@code false}, consumers whose component is an
         * {@link Analyzer} are left out of the handler.
         */
        public boolean includeAnalyzers = true;

        /**
         * Outcomes handed to the {@link FilterOutcomes} of every consumed
         * row. Defaults to {@code null}.
         */
        public Collection<? extends FilterOutcome> alwaysSatisfiedOutcomes;

        /**
         * Table whose row processing publisher supplies the consumers. When
         * {@code null}, the first publisher of the job is used.
         */
        public Table table;
    }

    private static final Logger logger = LoggerFactory.getLogger(ConsumeRowHandler.class);

    private final List<RowProcessingConsumer> _consumers;
    private final Collection<? extends FilterOutcome> _alwaysSatisfiedOutcomes;

    /**
     * Builds a {@link ConsumeRowHandler} based on a job, and the configuration
     * to read the job's consumers.
     *
     * @param job
     *            the job whose consumers this handler will run
     * @param configuration
     *            the DataCleaner configuration, used to obtain the injection
     *            manager for the job
     * @param rowConsumerConfiguration
     *            options controlling which consumers are extracted
     * @throws IllegalArgumentException
     *             if the configured table is not consumed by the job, or if
     *             no table is configured and the job consumes multiple source
     *             tables
     * @throws RuntimeException
     *             if initializing the consumers failed; a runtime exception
     *             reported by the initialization is rethrown as-is, any other
     *             non-{@link Error} failure is wrapped in an
     *             {@link IllegalStateException}
     */
    public ConsumeRowHandler(final AnalysisJob job, final DataCleanerConfiguration configuration,
            final Configuration rowConsumerConfiguration) {
        _consumers = extractConsumers(job, configuration, rowConsumerConfiguration);
        _alwaysSatisfiedOutcomes = rowConsumerConfiguration.alwaysSatisfiedOutcomes;
    }

    /**
     * Builds a {@link ConsumeRowHandler} based on a list of consumers, without
     * any always-satisfied outcomes.
     *
     * @param consumers
     *            the consumers to apply to every row
     */
    public ConsumeRowHandler(final List<RowProcessingConsumer> consumers) {
        this(consumers, null);
    }

    /**
     * Builds a {@link ConsumeRowHandler} based on a list of consumers as well
     * as a collection of always-satisfied outcomes.
     *
     * @param consumers
     *            the consumers to apply to every row; the list is stored
     *            as-is, not copied
     * @param alwaysSatisfiedOutcomes
     *            outcomes handed to the {@link FilterOutcomes} of every
     *            consumed row
     */
    public ConsumeRowHandler(final List<RowProcessingConsumer> consumers,
            final Collection<? extends FilterOutcome> alwaysSatisfiedOutcomes) {
        _consumers = consumers;
        _alwaysSatisfiedOutcomes = alwaysSatisfiedOutcomes;
    }

    /**
     * Gets the {@link RowProcessingConsumer}s that this handler is working on.
     *
     * @return the handler's own consumer list (not a copy)
     */
    public List<RowProcessingConsumer> getConsumers() {
        return _consumers;
    }

    /**
     * Gets the output columns produced by all the consumers of this
     * {@link ConsumeRowHandler}.
     *
     * @return a new list holding the output columns of every consumer, in
     *         consumer order
     */
    public List<InputColumn<?>> getOutputColumns() {
        final List<InputColumn<?>> result = new ArrayList<>();
        for (final RowProcessingConsumer consumer : _consumers) {
            for (final InputColumn<?> outputColumn : consumer.getOutputColumns()) {
                result.add(outputColumn);
            }
        }
        return result;
    }

    /**
     * Consumes a row and returns only the resulting rows.
     *
     * @param row
     *            the row to consume
     * @return the rows of the {@link ConsumeRowResult} produced by
     *         {@link #consumeRow(InputRow)}
     * @deprecated use {@link #consumeRow(InputRow)} instead
     */
    @Deprecated
    public List<InputRow> consume(final InputRow row) {
        return consumeRow(row).getRows();
    }

    /**
     * Consumes a {@link InputRow} by applying all transformations etc. to it,
     * returning a result of transformed rows and their {@link FilterOutcomes}s.
     *
     * @param row
     *            the row to consume
     * @return the result produced by running the row through the consumers
     */
    public ConsumeRowResult consumeRow(final InputRow row) {
        // Each row gets its own outcomes object, seeded with the always-satisfied outcomes.
        final FilterOutcomes outcomes = new FilterOutcomesImpl(_alwaysSatisfiedOutcomes);
        final ConsumeRowHandlerDelegate delegate = new ConsumeRowHandlerDelegate(_consumers, row, 0, outcomes);
        return delegate.consume();
    }

    /**
     * Resolves, initializes and sorts the consumers of a job according to the
     * given configuration.
     *
     * @param analysisJob
     *            the job to extract consumers from
     * @param configuration
     *            the DataCleaner configuration
     * @param rowConsumeConfiguration
     *            options controlling publisher selection and analyzer
     *            inclusion
     * @return the consumers in process order
     * @throws IllegalArgumentException
     *             if no suitable single publisher can be determined
     * @throws IllegalStateException
     *             if consumer initialization failed with a checked exception
     */
    private List<RowProcessingConsumer> extractConsumers(final AnalysisJob analysisJob,
            final DataCleanerConfiguration configuration, final Configuration rowConsumeConfiguration) {
        final InjectionManagerFactory injectionManagerFactory =
                configuration.getEnvironment().getInjectionManagerFactory();
        final InjectionManager injectionManager =
                injectionManagerFactory.getInjectionManager(configuration, analysisJob);

        final LifeCycleHelper lifeCycleHelper =
                new LifeCycleHelper(injectionManager, rowConsumeConfiguration.includeNonDistributedTasks);

        /*
         * Use a single threaded task runner since this handler is invoked in a
         * blocking way - the calling code may itself be multithreaded without
         * issues.
         */
        final SingleThreadedTaskRunner taskRunner = new SingleThreadedTaskRunner();

        final ErrorAwareAnalysisListener errorAwareAnalysisListener = new ErrorAwareAnalysisListener();
        final AnalysisListener analysisListener =
                new CompositeAnalysisListener(rowConsumeConfiguration.analysisListener, errorAwareAnalysisListener);

        final RowProcessingPublishers rowProcessingPublishers =
                new RowProcessingPublishers(analysisJob, analysisListener, errorAwareAnalysisListener, taskRunner,
                        lifeCycleHelper);

        final RowProcessingPublisher publisher;
        if (rowConsumeConfiguration.table != null) {
            @SuppressWarnings("deprecation") final RowProcessingPublisher tablePublisher =
                    rowProcessingPublishers.getRowProcessingPublisher(rowConsumeConfiguration.table);
            if (tablePublisher == null) {
                throw new IllegalArgumentException(
                        "Job does not consume records from table: " + rowConsumeConfiguration.table);
            }
            publisher = tablePublisher;
        } else {
            // No table given: take the first publisher, but refuse to guess if
            // any of the remaining publishers also reads from a source table.
            final Collection<RowProcessingPublisher> publishers = rowProcessingPublishers.getRowProcessingPublishers();
            publisher = publishers.iterator().next();
            for (final RowProcessingPublisher thisPublisher : publishers) {
                if (thisPublisher != publisher && thisPublisher.getStream().isSourceTable()) {
                    throw new IllegalArgumentException(
                            "Job consumes multiple source tables, but ConsumeRowHandler can only handle a single "
                                    + "table's components. Please specify a Table constructor argument.");
                }
            }
        }

        // Captures the first error reported while the consumers are initialized.
        final AtomicReference<Throwable> errorReference = new AtomicReference<>();

        publisher.initializeConsumers(new TaskListener() {
            @Override
            public void onError(final Task task, final Throwable throwable) {
                logger.error("Exception thrown while initializing consumers.", throwable);
                errorReference.compareAndSet(null, throwable);
            }

            @Override
            public void onComplete(final Task task) {
                logger.info("Consumers initialized successfully.");
            }

            @Override
            public void onBegin(final Task task) {
                logger.info("Beginning the process of initializing consumers.");
            }
        });

        // A failed initialization must not be ignored: propagate it to the
        // caller instead of handing out consumers that did not initialize.
        final Throwable throwable = errorReference.get();
        if (throwable != null) {
            if (throwable instanceof RuntimeException) {
                throw (RuntimeException) throwable;
            }
            if (throwable instanceof Error) {
                throw (Error) throwable;
            }
            throw new IllegalStateException("Failed to initialize consumers", throwable);
        }

        List<RowProcessingConsumer> consumers = publisher.getConsumers();
        if (!rowConsumeConfiguration.includeAnalyzers) {
            consumers = removeAnalyzers(consumers);
        }

        final RowProcessingConsumerSorter sorter = new RowProcessingConsumerSorter(consumers);
        consumers = sorter.createProcessOrderedConsumerList();
        return consumers;
    }

    /**
     * Creates a copy of the given consumer list without the consumers whose
     * component is an {@link Analyzer}.
     *
     * @param consumers
     *            the consumers to filter; not modified
     * @return a new list with all non-analyzer consumers in their original
     *         order
     */
    private List<RowProcessingConsumer> removeAnalyzers(final List<RowProcessingConsumer> consumers) {
        final List<RowProcessingConsumer> result = new ArrayList<>();
        for (final RowProcessingConsumer rowProcessingConsumer : consumers) {
            final Object component = rowProcessingConsumer.getComponent();
            if (!(component instanceof Analyzer<?>)) {
                result.add(rowProcessingConsumer);
            }
        }
        return result;
    }
}