AnalysisRunnerJobDelegate.java example

Explorer
DataCleaner-master
/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.job.runner;

import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Queue;

import org.apache.metamodel.schema.Table;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.InjectionManager;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.ComponentJob;
import org.datacleaner.job.OutputDataStreamJob;
import org.datacleaner.job.concurrent.JobCompletionTaskListener;
import org.datacleaner.job.concurrent.JoinTaskListener;
import org.datacleaner.job.concurrent.TaskListener;
import org.datacleaner.job.concurrent.TaskRunner;
import org.datacleaner.lifecycle.LifeCycleHelper;
import org.datacleaner.util.SourceColumnFinder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A delegate for the AnalysisRunner to put the state of a single job into.
 *
 * As opposed to the AnalysisRunner, this class is NOT thread-safe (which is why
 * the AnalysisRunner instantiates a new delegate for each execution).
 */
final class AnalysisRunnerJobDelegate {

    private static final Logger logger = LoggerFactory.getLogger(AnalysisRunnerJobDelegate.class);

    /**
     * Time (in milliseconds) to wait before retrying when no publisher could
     * be started during a full pass over the remaining publishers.
     */
    private static final long DISPATCH_RETRY_DELAY_MILLIS = 100;

    private final AnalysisJob _job;
    private final DataCleanerConfiguration _configuration;
    private final TaskRunner _taskRunner;
    private final AnalysisListener _analysisListener;
    private final Queue<JobAndResult> _resultQueue;
    private final ErrorAware _errorAware;
    private final boolean _includeNonDistributedTasks;

    /**
     * Creates a delegate holding the state of a single job execution.
     *
     * @param job
     *            the job to run
     * @param configuration
     *            the configuration from which the job scoped
     *            {@link InjectionManager} is obtained
     * @param taskRunner
     *            the task runner handed to the row processing publishers
     * @param analysisListener
     *            the listener notified of job begin and of unexpected errors
     * @param resultQueue
     *            the queue passed to the row processing publishers and to the
     *            returned result future
     * @param errorAware
     *            the error state shared with the row processing publishers
     *            and the returned result future
     * @param includeNonDistributedTasks
     *            determines if non-distributed tasks on components, such as
     *            {@link Initialize} methods that are not distributed, should be
     *            executed or not. On single-node executions, this will
     *            typically be true, on slave nodes in a cluster, this will
     *            typically be false.
     */
    public AnalysisRunnerJobDelegate(final AnalysisJob job, final DataCleanerConfiguration configuration,
            final TaskRunner taskRunner, final AnalysisListener analysisListener, final Queue<JobAndResult> resultQueue,
            final ErrorAware errorAware, final boolean includeNonDistributedTasks) {
        _job = job;
        _configuration = configuration;
        _taskRunner = taskRunner;
        _analysisListener = analysisListener;
        _resultQueue = resultQueue;
        _includeNonDistributedTasks = includeNonDistributedTasks;
        _errorAware = errorAware;
    }

    /**
     * Runs the job: notifies the listener that the job begins, validates the
     * job's component inputs and then schedules all row processing
     * publishers.
     *
     * @return a result future backed by the result queue, the job completion
     *         listener and the error state of this execution
     * @throws RuntimeException
     *             any runtime exception raised during setup, validation or
     *             scheduling is reported to the analysis listener via
     *             {@code errorUnknown} and then rethrown
     */
    public AnalysisResultFuture run() {
        try {
            // the injection manager is job scoped
            final InjectionManager injectionManager = _configuration.getEnvironment().getInjectionManagerFactory()
                    .getInjectionManager(_configuration, _job);

            final LifeCycleHelper rowProcessingLifeCycleHelper =
                    new LifeCycleHelper(injectionManager, _includeNonDistributedTasks);

            final RowProcessingPublishers publishers =
                    new RowProcessingPublishers(_job, _analysisListener, _errorAware, _taskRunner,
                            rowProcessingLifeCycleHelper);

            final AnalysisJobMetrics analysisJobMetrics = publishers.getAnalysisJobMetrics();

            // A task listener that will register either successful executions
            // or unexpected errors (which will be delegated to the
            // errorListener)
            final JobCompletionTaskListener jobCompletionTaskListener =
                    new JobCompletionTaskListener(analysisJobMetrics, _analysisListener, 1);

            _analysisListener.jobBegin(_job, analysisJobMetrics);

            validateSingleTableInput(_job);

            // at this point we are done validating the job, it will run.
            scheduleRowProcessing(publishers, jobCompletionTaskListener);

            return new AnalysisResultFutureImpl(_resultQueue, jobCompletionTaskListener, _errorAware);
        } catch (final RuntimeException e) {
            _analysisListener.errorUnknown(_job, e);
            throw e;
        }
    }

    /**
     * Starts row processing job flows.
     *
     * @param publishers
     *            the publishers to schedule
     * @param jobCompletionTaskListener
     *            the listener wrapped by a {@link JoinTaskListener} that is
     *            sized to the number of publishers
     */
    private void scheduleRowProcessing(final RowProcessingPublishers publishers,
            final JobCompletionTaskListener jobCompletionTaskListener) {

        logger.info("Created {} row processor publisher(s)", publishers.size());
        final TaskListener rowProcessorPublishersDoneCompletionListener =
                new JoinTaskListener(publishers.size(), jobCompletionTaskListener);

        final Collection<RowProcessingPublisher> rowProcessingPublishers = publishers.getRowProcessingPublishers();
        logger.debug("RowProcessingPublishers: {}", rowProcessingPublishers);

        dispatchWhenReady(rowProcessingPublishers, rowProcessorPublishersDoneCompletionListener);
    }

    /**
     * Repeatedly offers every not-yet-started publisher the chance to start
     * its row processing until all of them have been started. If a full pass
     * starts nothing, the calling thread sleeps briefly before retrying.
     *
     * If the calling thread is interrupted while sleeping, dispatching still
     * continues until every publisher is started (so that the completion
     * listener is not left waiting for publishers that were never scheduled),
     * but the interrupt status is restored before this method returns so that
     * callers can observe it.
     *
     * @param rowProcessingPublishers
     *            the publishers to start; this collection is copied and not
     *            modified
     * @param rowProcessorPublishersDoneCompletionListener
     *            the listener handed to each publisher when it is asked to
     *            run
     */
    private void dispatchWhenReady(final Collection<RowProcessingPublisher> rowProcessingPublishers,
            final TaskListener rowProcessorPublishersDoneCompletionListener) {
        final LinkedList<RowProcessingPublisher> remainingPublishers = new LinkedList<>(rowProcessingPublishers);

        boolean interrupted = false;
        try {
            while (!remainingPublishers.isEmpty()) {
                boolean progressThisIteration = false;

                for (final Iterator<RowProcessingPublisher> it = remainingPublishers.iterator(); it.hasNext(); ) {
                    final RowProcessingPublisher rowProcessingPublisher = it.next();
                    final boolean started = rowProcessingPublisher
                            .runRowProcessing(_resultQueue, rowProcessorPublishersDoneCompletionListener);
                    if (started) {
                        logger.debug("Scheduled row processing publisher: {}", rowProcessingPublisher);
                        it.remove();
                        progressThisIteration = true;
                    }
                }

                if (!progressThisIteration) {
                    try {
                        // Give way for the data processing to happen in other
                        // threads. Better to sleep() than to yield().
                        Thread.sleep(DISPATCH_RETRY_DELAY_MILLIS);
                    } catch (final InterruptedException e) {
                        // Catching the exception clears the interrupt flag, so
                        // later sleeps behave normally. Remember the interrupt
                        // and re-assert it once dispatching is complete.
                        interrupted = true;
                    }
                }
            }
        } finally {
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Prevents that any row processing components have input from different
     * tables.
     *
     * @param job
     *            the job whose transformers, filters and analyzers are
     *            validated
     * @throws IllegalArgumentException
     *             if a component that is not a multi-stream component has
     *             input columns originating from different tables
     */
    private void validateSingleTableInput(final AnalysisJob job) {
        final SourceColumnFinder sourceColumnFinder = new SourceColumnFinder();
        sourceColumnFinder.addSources(job);
        validateSingleTableInput(sourceColumnFinder, job.getTransformerJobs());
        validateSingleTableInput(sourceColumnFinder, job.getFilterJobs());
        validateSingleTableInput(sourceColumnFinder, job.getAnalyzerJobs());
    }

    /**
     * Prevents that any row processing components have input from different
     * tables. The jobs of each component's output data streams are validated
     * as well (each with its own {@link SourceColumnFinder}).
     *
     * @param sourceColumnFinder
     *            used to look up the originating table of each input column
     * @param componentJobs
     *            the components to validate
     * @throws IllegalArgumentException
     *             if a component that is not a multi-stream component has
     *             input columns originating from different tables
     */
    private void validateSingleTableInput(final SourceColumnFinder sourceColumnFinder,
            final Collection<? extends ComponentJob> componentJobs) {
        for (final ComponentJob componentJob : componentJobs) {
            // multi-stream components are exempt from the single-table check
            if (!componentJob.getDescriptor().isMultiStreamComponent()) {
                Table originatingTable = null;
                final InputColumn<?>[] input = componentJob.getInput();

                for (final InputColumn<?> inputColumn : input) {
                    final Table table = sourceColumnFinder.findOriginatingTable(inputColumn);
                    if (table == null) {
                        // columns without a resolvable table do not take part
                        // in the comparison
                        continue;
                    }
                    if (originatingTable == null) {
                        originatingTable = table;
                    } else if (!originatingTable.equals(table)) {
                        throw new IllegalArgumentException(
                                "Input columns in " + componentJob + " originate from different tables");
                    }
                }
            }

            final OutputDataStreamJob[] outputDataStreamJobs = componentJob.getOutputDataStreamJobs();
            for (final OutputDataStreamJob outputDataStreamJob : outputDataStreamJobs) {
                validateSingleTableInput(outputDataStreamJob.getJob());
            }
        }
    }

}