/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.job.runner;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Queue;
import org.apache.metamodel.schema.Table;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.InjectionManager;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.ComponentJob;
import org.datacleaner.job.OutputDataStreamJob;
import org.datacleaner.job.concurrent.JobCompletionTaskListener;
import org.datacleaner.job.concurrent.JoinTaskListener;
import org.datacleaner.job.concurrent.TaskListener;
import org.datacleaner.job.concurrent.TaskRunner;
import org.datacleaner.lifecycle.LifeCycleHelper;
import org.datacleaner.util.SourceColumnFinder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A delegate for the AnalysisRunner to put the state of a single job into.
 *
 * As opposed to the AnalysisRunner, this class is NOT thread-safe (which is why
 * the AnalysisRunner instantiates a new delegate for each execution).
 */
final class AnalysisRunnerJobDelegate {

    private static final Logger logger = LoggerFactory.getLogger(AnalysisRunnerJobDelegate.class);

    /**
     * Number of milliseconds to pause between dispatch rounds when no
     * publisher could be started in the previous round.
     */
    private static final long DISPATCH_RETRY_SLEEP_MILLIS = 100;

    private final AnalysisJob _job;
    private final DataCleanerConfiguration _configuration;
    private final TaskRunner _taskRunner;
    private final AnalysisListener _analysisListener;
    private final Queue<JobAndResult> _resultQueue;
    private final ErrorAware _errorAware;
    private final boolean _includeNonDistributedTasks;

    /**
     * Creates a delegate that holds the state for a single job execution.
     *
     * @param job
     *            the job to run
     * @param configuration
     *            the configuration from which the job-scoped
     *            {@link InjectionManager} is obtained
     * @param taskRunner
     *            the task runner handed to the row processing publishers
     * @param analysisListener
     *            the listener notified when the job begins and when an
     *            unexpected error occurs
     * @param resultQueue
     *            the queue passed to the row processing publishers and to
     *            the returned result future
     * @param errorAware
     *            the error registry shared with the publishers and the
     *            returned result future
     * @param includeNonDistributedTasks
     *            determines if non-distributed tasks on components, such as
     *            {@link Initialize} methods that are not distributed, should be
     *            executed or not. On single-node executions, this will
     *            typically be true, on slave nodes in a cluster, this will
     *            typically be false.
     */
    public AnalysisRunnerJobDelegate(final AnalysisJob job, final DataCleanerConfiguration configuration,
            final TaskRunner taskRunner, final AnalysisListener analysisListener,
            final Queue<JobAndResult> resultQueue, final ErrorAware errorAware,
            final boolean includeNonDistributedTasks) {
        _job = job;
        _configuration = configuration;
        _taskRunner = taskRunner;
        _analysisListener = analysisListener;
        _resultQueue = resultQueue;
        _includeNonDistributedTasks = includeNonDistributedTasks;
        _errorAware = errorAware;
    }

    /**
     * Runs the job: notifies the listener that the job begins, validates
     * the job's input and dispatches all row processing publishers.
     *
     * @return a future built on the result queue, the job completion
     *         listener and the error registry of this execution
     * @throws RuntimeException
     *             any runtime exception raised while setting up, validating
     *             or dispatching the job; it is reported to the analysis
     *             listener via {@code errorUnknown} before being rethrown
     */
    public AnalysisResultFuture run() {
        try {
            // the injection manager is job scoped
            final InjectionManager injectionManager = _configuration.getEnvironment().getInjectionManagerFactory()
                    .getInjectionManager(_configuration, _job);

            final LifeCycleHelper rowProcessingLifeCycleHelper =
                    new LifeCycleHelper(injectionManager, _includeNonDistributedTasks);

            final RowProcessingPublishers publishers =
                    new RowProcessingPublishers(_job, _analysisListener, _errorAware, _taskRunner,
                            rowProcessingLifeCycleHelper);

            final AnalysisJobMetrics analysisJobMetrics = publishers.getAnalysisJobMetrics();

            // A task listener that will register either successful executions
            // or unexpected errors (which will be delegated to the
            // errorListener)
            final JobCompletionTaskListener jobCompletionTaskListener =
                    new JobCompletionTaskListener(analysisJobMetrics, _analysisListener, 1);

            // jobBegin is signalled before validation, so a validation failure
            // is reported (through the catch block below) after the job began.
            _analysisListener.jobBegin(_job, analysisJobMetrics);

            validateSingleTableInput(_job);

            // at this point we are done validating the job, it will run.
            scheduleRowProcessing(publishers, jobCompletionTaskListener);

            return new AnalysisResultFutureImpl(_resultQueue, jobCompletionTaskListener, _errorAware);
        } catch (final RuntimeException e) {
            _analysisListener.errorUnknown(_job, e);
            throw e;
        }
    }

    /**
     * Starts row processing job flows.
     *
     * @param publishers
     *            the publishers to dispatch
     * @param jobCompletionTaskListener
     *            the listener that the join listener created here forwards
     *            to; the join listener is sized to the number of publishers
     */
    private void scheduleRowProcessing(final RowProcessingPublishers publishers,
            final JobCompletionTaskListener jobCompletionTaskListener) {
        logger.info("Created {} row processor publisher(s)", publishers.size());

        final TaskListener rowProcessorPublishersDoneCompletionListener =
                new JoinTaskListener(publishers.size(), jobCompletionTaskListener);

        final Collection<RowProcessingPublisher> rowProcessingPublishers = publishers.getRowProcessingPublishers();
        logger.debug("RowProcessingPublishers: {}", rowProcessingPublishers);

        dispatchWhenReady(rowProcessingPublishers, rowProcessorPublishersDoneCompletionListener);
    }

    /**
     * Repeatedly offers every not-yet-started publisher the chance to start
     * row processing until all of them report that they were started. When a
     * full round makes no progress the calling thread sleeps briefly before
     * retrying.
     *
     * If the calling thread is interrupted while sleeping, dispatching
     * continues (so that every publisher is still started), and the thread's
     * interrupt status is restored before this method returns instead of being
     * silently discarded.
     *
     * @param rowProcessingPublishers
     *            the publishers to start; this collection itself is not
     *            modified
     * @param rowProcessorPublishersDoneCompletionListener
     *            the listener handed to each publisher when it is started
     */
    private void dispatchWhenReady(final Collection<RowProcessingPublisher> rowProcessingPublishers,
            final TaskListener rowProcessorPublishersDoneCompletionListener) {
        // work on a private copy so started publishers can be removed cheaply
        final LinkedList<RowProcessingPublisher> remainingPublishers = new LinkedList<>(rowProcessingPublishers);

        boolean interrupted = false;
        try {
            while (!remainingPublishers.isEmpty()) {
                boolean progressThisIteration = false;

                for (final Iterator<RowProcessingPublisher> it = remainingPublishers.iterator(); it.hasNext(); ) {
                    final RowProcessingPublisher rowProcessingPublisher = it.next();
                    final boolean started = rowProcessingPublisher
                            .runRowProcessing(_resultQueue, rowProcessorPublishersDoneCompletionListener);
                    if (started) {
                        logger.debug("Scheduled row processing publisher: {}", rowProcessingPublisher);
                        it.remove();
                        progressThisIteration = true;
                    }
                }

                if (!progressThisIteration) {
                    try {
                        // Give way for the data processing to happen in other
                        // threads. Better to sleep() than to yield().
                        Thread.sleep(DISPATCH_RETRY_SLEEP_MILLIS);
                    } catch (final InterruptedException e) {
                        // Remember the interrupt but keep dispatching: the
                        // flag is restored in the finally block below.
                        // Re-interrupting right here would make every
                        // following sleep() fail immediately and turn this
                        // loop into a busy spin.
                        interrupted = true;
                    }
                }
            }
        } finally {
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Prevents that any row processing components have input from different
     * tables. Checks the transformer, filter and analyzer jobs of the given
     * job.
     *
     * @param job
     *            the job whose components are validated
     * @throws IllegalArgumentException
     *             if a checked component has input columns that originate
     *             from different tables
     */
    private void validateSingleTableInput(final AnalysisJob job) {
        final SourceColumnFinder sourceColumnFinder = new SourceColumnFinder();
        sourceColumnFinder.addSources(job);

        validateSingleTableInput(sourceColumnFinder, job.getTransformerJobs());
        validateSingleTableInput(sourceColumnFinder, job.getFilterJobs());
        validateSingleTableInput(sourceColumnFinder, job.getAnalyzerJobs());
    }

    /**
     * Prevents that any row processing components have input from different
     * tables. Components whose descriptor reports them as multi-stream
     * components are exempt from the check. The jobs of every component's
     * output data streams are validated recursively, whether or not the
     * component itself is exempt.
     *
     * @param sourceColumnFinder
     *            used to look up the originating table of each input column
     * @param componentJobs
     *            the component jobs to validate
     * @throws IllegalArgumentException
     *             if a checked component has input columns that originate
     *             from different tables
     */
    private void validateSingleTableInput(final SourceColumnFinder sourceColumnFinder,
            final Collection<? extends ComponentJob> componentJobs) {
        for (final ComponentJob componentJob : componentJobs) {
            if (!componentJob.getDescriptor().isMultiStreamComponent()) {
                Table originatingTable = null;
                final InputColumn<?>[] input = componentJob.getInput();

                for (final InputColumn<?> inputColumn : input) {
                    final Table table = sourceColumnFinder.findOriginatingTable(inputColumn);
                    if (table == null) {
                        // columns without a resolvable originating table are
                        // not taken into account
                        continue;
                    }
                    if (originatingTable == null) {
                        // the first resolved table becomes the reference
                        originatingTable = table;
                    } else if (!originatingTable.equals(table)) {
                        throw new IllegalArgumentException(
                                "Input columns in " + componentJob + " originate from different tables");
                    }
                }
            }

            final OutputDataStreamJob[] outputDataStreamJobs = componentJob.getOutputDataStreamJobs();
            for (final OutputDataStreamJob outputDataStreamJob : outputDataStreamJobs) {
                validateSingleTableInput(outputDataStreamJob.getJob());
            }
        }
    }
}