/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.job.runner; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.IdentityHashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.metamodel.MetaModelHelper; import org.apache.metamodel.schema.Column; import org.apache.metamodel.schema.Table; import org.apache.metamodel.util.CollectionUtils; import org.datacleaner.api.Analyzer; import org.datacleaner.api.Component; import org.datacleaner.api.Filter; import org.datacleaner.api.InputColumn; import org.datacleaner.api.Transformer; import org.datacleaner.configuration.ContextAwareInjectionManager; import org.datacleaner.configuration.InjectionManager; import org.datacleaner.job.AnalysisJob; import org.datacleaner.job.AnalyzerJob; import org.datacleaner.job.ComponentJob; import org.datacleaner.job.ComponentRequirement; import org.datacleaner.job.FilterJob; import org.datacleaner.job.FilterOutcome; import org.datacleaner.job.OutputDataStreamJob; import org.datacleaner.job.TransformerJob; import org.datacleaner.job.concurrent.TaskRunner; import org.datacleaner.lifecycle.LifeCycleHelper; 
import org.datacleaner.util.SourceColumnFinder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class which partitions a single {@link AnalysisJob}'s components into
 * {@link RowProcessingPublisher}s.
 *
 * One publisher is created per {@link RowProcessingStream} (either a source
 * table of the job, or an {@link OutputDataStreamJob} produced by a
 * component). Consumers (analyzers, transformers, filters) are deduplicated
 * per {@link ComponentJob} and may be registered with several publishers when
 * a multi-stream component consumes columns from multiple tables.
 */
public final class RowProcessingPublishers {

    /**
     * Result holder for {@link #getOrCreateConsumer(RowProcessingPublisher, ComponentJob, InputColumn[])}:
     * carries the consumer plus a flag telling whether the underlying
     * component instance was created by that call (true) or reused from the
     * {@link #_consumers} cache (false). The flag is used to ensure output
     * data streams are only registered once per component.
     */
    private static class ConsumerCreation {
        final RowProcessingConsumer _consumer;
        final boolean _componentCreated;

        public ConsumerCreation(final RowProcessingConsumer consumer, final boolean componentCreated) {
            _consumer = consumer;
            _componentCreated = componentCreated;
        }
    }

    private static final Logger logger = LoggerFactory.getLogger(RowProcessingPublishers.class);

    private final AnalysisJob _analysisJob;
    private final AnalysisListener _analysisListener;
    private final TaskRunner _taskRunner;
    private final LifeCycleHelper _lifeCycleHelper;
    // Maps each stream to its publisher. Insertion order is significant — see
    // the constructor comment below.
    private final Map<RowProcessingStream, RowProcessingPublisher> _rowProcessingPublishers;
    // Consumer cache keyed by reference identity of the ComponentJob, so the
    // same job instance seen from several tables shares one consumer.
    private final Map<ComponentJob, RowProcessingConsumer> _consumers;
    private final ErrorAware _errorAware;

    /**
     * Constructs a {@link RowProcessingPublishers}s instance.
     *
     * @param analysisJob
     * @param analysisListener
     * @param taskRunner
     * @param lifeCycleHelper
     * @param sourceColumnFinder
     *            ignored — a fresh {@link SourceColumnFinder} is built
     *            internally per registered job
     *
     * @deprecated the {@link SourceColumnFinder} is no longer used here. Use
     *             {@link #RowProcessingPublishers(AnalysisJob, AnalysisListener, TaskRunner, LifeCycleHelper)}
     *             instead.
     */
    @Deprecated
    public RowProcessingPublishers(final AnalysisJob analysisJob, final AnalysisListener analysisListener,
            final TaskRunner taskRunner, final LifeCycleHelper lifeCycleHelper,
            final SourceColumnFinder sourceColumnFinder) {
        // NOTE(review): the deprecated overload assumes the AnalysisListener
        // also implements ErrorAware — callers passing a listener that does
        // not will get a ClassCastException here.
        this(analysisJob, analysisListener, (ErrorAware) analysisListener, taskRunner, lifeCycleHelper);
    }

    /**
     * Constructs a {@link RowProcessingPublishers}s instance and eagerly
     * registers all publishers and consumers of the job (including nested
     * output-data-stream jobs) via {@link #registerAll()}.
     *
     * @param analysisJob
     * @param analysisListener
     * @param errorAware
     * @param taskRunner
     * @param lifeCycleHelper
     */
    public RowProcessingPublishers(final AnalysisJob analysisJob, final AnalysisListener analysisListener,
            final ErrorAware errorAware, final TaskRunner taskRunner, final LifeCycleHelper lifeCycleHelper) {
        _analysisJob = analysisJob;
        _analysisListener = analysisListener;
        _errorAware = errorAware;
        _taskRunner = taskRunner;
        _lifeCycleHelper = lifeCycleHelper;

        // note that insertion and extraction order consistency is important
        // since OutputDataStreamJobs should be initialized after their parent
        // jobs. For this reason we use a LinkedHashMap and not a regular
        // HashMap.
        _rowProcessingPublishers = new LinkedHashMap<>();
        _consumers = new IdentityHashMap<>();

        registerAll();
    }

    /**
     * Returns all component jobs (filters, transformers, analyzers) of a job
     * as a single collection, in that order.
     *
     * @param job the job to enumerate
     * @return the concatenated component jobs
     */
    public static Collection<ComponentJob> getAllComponents(final AnalysisJob job) {
        return CollectionUtils.concat(false, job.getFilterJobs(), job.getTransformerJobs(), job.getAnalyzerJobs());
    }

    /**
     * Registers the top-level job and then notifies every publisher that all
     * of its consumers have been registered. Called once from the
     * constructor.
     */
    private void registerAll() {
        registerJob(_analysisJob);

        final Collection<RowProcessingPublisher> publishers = _rowProcessingPublishers.values();
        for (final RowProcessingPublisher publisher : publishers) {
            publisher.onAllConsumersRegistered();
        }

        if (logger.isInfoEnabled()) {
            logger.info("Registered {} publishers: {}", _rowProcessingPublishers.size(), publishers);
        }
    }

    /**
     * Registers every component of a top-level job, resolving each
     * component's source table(s) to decide which publisher(s) it belongs to.
     */
    private void registerJob(final AnalysisJob job) {
        final SourceColumnFinder sourceColumnFinder = new SourceColumnFinder();
        sourceColumnFinder.addSources(job);
        for (final ComponentJob componentJob : getAllComponents(job)) {
            registerRowProcessingPublishers(sourceColumnFinder, job, componentJob);
        }
    }

    /**
     * Registers every component of a nested (output data stream) job against
     * an already-determined stream.
     *
     * @param job the nested job to register
     * @param dataStream the stream all of the job's components consume
     * @param parentConsumer the consumer that publishes the stream
     */
    private void registerJob(final AnalysisJob job, final RowProcessingStream dataStream,
            final RowProcessingConsumer parentConsumer) {
        final SourceColumnFinder sourceColumnFinder = new SourceColumnFinder();
        sourceColumnFinder.addSources(job);
        for (final ComponentJob componentJob : getAllComponents(job)) {
            registerRowProcessingPublishers(sourceColumnFinder, job, dataStream, componentJob, parentConsumer);
        }
    }

    /**
     * Registers an output data stream published by a consumer: first the
     * nested job's components, then the wiring from the publishing consumer
     * to the stream's publisher.
     */
    private void registerOutputDataStream(final RowProcessingPublisher parentPublisher,
            final RowProcessingConsumer publishingConsumer, final OutputDataStreamJob outputDataStreamJob) {
        final RowProcessingStream dataStream = RowProcessingStream.ofOutputDataStream(outputDataStreamJob);

        // first initialize the nested job like any other set of components
        registerJob(outputDataStreamJob.getJob(), dataStream, publishingConsumer);

        // then we wire the publisher for this output data stream to a
        // OutputRowCollector which will get injected via the
        // HasOutputDataStreams interface.
        final RowProcessingPublisher publisherForOutputDataStream = getRowProcessingPublisher(dataStream);
        publishingConsumer.registerOutputDataStream(outputDataStreamJob, publisherForOutputDataStream);
    }

    /**
     * Finds the set of physical columns a component ultimately depends on:
     * the originating columns of its input columns plus those of any filter
     * outcomes it requires.
     *
     * @param sourceColumnFinder finder primed with the component's job
     * @param componentJob the component to inspect
     * @return the distinct originating physical columns (possibly empty)
     */
    public Column[] getPhysicalColumns(final SourceColumnFinder sourceColumnFinder, final ComponentJob componentJob) {
        final Set<Column> physicalColumns = new HashSet<>();

        final InputColumn<?>[] inputColumns = componentJob.getInput();
        for (final InputColumn<?> inputColumn : inputColumns) {
            physicalColumns.addAll(sourceColumnFinder.findOriginatingColumns(inputColumn));
        }
        final ComponentRequirement requirement = componentJob.getComponentRequirement();
        if (requirement != null) {
            // a requirement (filter outcome) also ties the component to the
            // table(s) that the filter reads from
            for (final FilterOutcome filterOutcome : requirement.getProcessingDependencies()) {
                physicalColumns.addAll(sourceColumnFinder.findOriginatingColumns(filterOutcome));
            }
        }

        return physicalColumns.toArray(new Column[physicalColumns.size()]);
    }

    /**
     * Convenience overload of
     * {@link #getTables(SourceColumnFinder, ComponentJob, Column[])} that
     * resolves the physical columns itself.
     */
    public Table[] getTables(final SourceColumnFinder sourceColumnFinder, final ComponentJob componentJob) {
        return getTables(sourceColumnFinder, componentJob, null);
    }

    /**
     * Determines which table(s) a component should be attached to.
     *
     * @param sourceColumnFinder finder primed with the component's job
     * @param componentJob the component to inspect
     * @param physicalColumns the component's originating physical columns, or
     *            null to have them computed
     * @return the dependent tables (never empty)
     * @throws IllegalStateException if the component spans multiple tables
     *             without being a multi-stream component, or if no table can
     *             be determined
     */
    public Table[] getTables(final SourceColumnFinder sourceColumnFinder, final ComponentJob componentJob,
            Column[] physicalColumns) {
        if (physicalColumns == null) {
            physicalColumns = getPhysicalColumns(sourceColumnFinder, componentJob);
        }
        final Table[] tables;
        if (physicalColumns.length == 0) {
            // if not dependent on any specific tables, make component
            // available for all tables
            final Set<Table> allTables = new HashSet<>();
            final Collection<InputColumn<?>> allSourceColumns = _analysisJob.getSourceColumns();
            for (final InputColumn<?> inputColumn : allSourceColumns) {
                allTables.add(inputColumn.getPhysicalColumn().getTable());
            }
            tables = allTables.toArray(new Table[allTables.size()]);
        } else {
            tables = MetaModelHelper.getTables(physicalColumns);
        }

        if (tables.length > 1) {
            if (!componentJob.getDescriptor().isMultiStreamComponent()) {
                throw new IllegalStateException("Component has input columns from multiple tables: " + componentJob);
            }
        }

        if (tables.length == 0) {
            throw new IllegalStateException("Component has no dependent tables: " + componentJob);
        }

        return tables;
    }

    /**
     * Registers a top-level component once per source table it depends on,
     * creating one {@link RowProcessingStream} per table.
     */
    private void registerRowProcessingPublishers(final SourceColumnFinder sourceColumnFinder, final AnalysisJob job,
            final ComponentJob componentJob) {
        final Column[] physicalColumns = getPhysicalColumns(sourceColumnFinder, componentJob);
        final Table[] tables = getTables(sourceColumnFinder, componentJob, physicalColumns);

        for (final Table table : tables) {
            final RowProcessingStream dataStream = RowProcessingStream.ofSourceTable(job, table);
            registerRowProcessingPublishers(sourceColumnFinder, job, dataStream, componentJob, null);
        }
    }

    /**
     * Registers one component against one stream: lazily creates the
     * stream's publisher (source-table flavor when there is no parent
     * consumer, output-data-stream flavor otherwise), registers the physical
     * columns it needs, registers/reuses the consumer, and — only when the
     * component instance was newly created — recursively registers the
     * component's own output data streams.
     *
     * @param parentConsumer the consumer publishing this stream, or null for
     *            a source-table stream
     */
    private void registerRowProcessingPublishers(final SourceColumnFinder sourceColumnFinder, final AnalysisJob job,
            final RowProcessingStream dataStream, final ComponentJob componentJob,
            final RowProcessingConsumer parentConsumer) {
        RowProcessingPublisher rowPublisher = _rowProcessingPublishers.get(dataStream);
        if (rowPublisher == null) {
            if (parentConsumer == null) {
                final SourceTableRowProcessingPublisher sourceTableRowPublisher =
                        new SourceTableRowProcessingPublisher(this, dataStream);
                sourceTableRowPublisher.addPrimaryKeysIfSourced();
                rowPublisher = sourceTableRowPublisher;
            } else {
                rowPublisher = new OutputDataStreamRowProcessingPublisher(this, parentConsumer, dataStream);
            }
            _rowProcessingPublishers.put(dataStream, rowPublisher);
        }

        if (rowPublisher instanceof SourceTableRowProcessingPublisher) {
            final SourceTableRowProcessingPublisher sourceTableRowPublisher =
                    (SourceTableRowProcessingPublisher) rowPublisher;
            // register the physical columns needed by this job
            final Column[] physicalColumns = getPhysicalColumns(sourceColumnFinder, componentJob);
            final Column[] relevantColumns = MetaModelHelper.getTableColumns(dataStream.getTable(), physicalColumns);
            sourceTableRowPublisher.addPhysicalColumns(relevantColumns);
        }

        // find which input columns (both physical or virtual) are needed by
        // this per-table instance
        final InputColumn<?>[] localInputColumns =
                getLocalInputColumns(sourceColumnFinder, dataStream.getTable(), componentJob.getInput());

        final ConsumerCreation consumerCreation = getOrCreateConsumer(rowPublisher, componentJob, localInputColumns);
        final RowProcessingConsumer consumer = consumerCreation._consumer;
        rowPublisher.registerConsumer(consumer);

        // only register output data streams the first time the component is
        // seen — otherwise a multi-table component would register them twice
        if (consumerCreation._componentCreated) {
            final OutputDataStreamJob[] outputDataStreamJobs = componentJob.getOutputDataStreamJobs();
            for (final OutputDataStreamJob outputDataStreamJob : outputDataStreamJobs) {
                registerOutputDataStream(rowPublisher, consumer, outputDataStreamJob);
            }
        }
    }

    /**
     * Returns the (cached or newly created) consumer for a component job.
     * On first sight of a job, instantiates the component via its descriptor
     * and wraps it in the matching consumer type (analyzer/transformer/
     * filter). In all cases the publisher is registered with the consumer.
     *
     * @param publisher the publisher the consumer will receive rows from
     * @param componentJob the job to create/look up a consumer for
     * @param inputColumns the stream-local input columns for the consumer
     * @return the consumer together with a created-now flag
     * @throws UnsupportedOperationException for unknown component job types
     */
    public ConsumerCreation getOrCreateConsumer(final RowProcessingPublisher publisher,
            final ComponentJob componentJob, final InputColumn<?>[] inputColumns) {
        RowProcessingConsumer consumer = _consumers.get(componentJob);
        final boolean create = consumer == null;
        if (create) {
            final Component component = (Component) componentJob.getDescriptor().newInstance();
            if (componentJob instanceof AnalyzerJob) {
                final AnalyzerJob analyzerJob = (AnalyzerJob) componentJob;
                final Analyzer<?> analyzer = (Analyzer<?>) component;
                consumer = new AnalyzerConsumer(analyzer, analyzerJob, inputColumns, publisher);
            } else if (componentJob instanceof TransformerJob) {
                final TransformerJob transformerJob = (TransformerJob) componentJob;
                final Transformer transformer = (Transformer) component;
                consumer = new TransformerConsumer(transformer, transformerJob, inputColumns, publisher);
            } else if (componentJob instanceof FilterJob) {
                final FilterJob filterJob = (FilterJob) componentJob;
                final Filter<?> filter = (Filter<?>) component;
                consumer = new FilterConsumer(filter, filterJob, inputColumns, publisher);
            } else {
                throw new UnsupportedOperationException("Unsupported component job type: " + componentJob);
            }

            _consumers.put(componentJob, consumer);
        }
        consumer.registerPublisher(publisher);
        return new ConsumerCreation(consumer, create);
    }

    /**
     * Filters a component's input columns down to those that originate (at
     * least partly) from the given table.
     *
     * @return the subset of inputColumns rooted in the table; empty when the
     *         table or inputs are null/empty
     */
    private InputColumn<?>[] getLocalInputColumns(final SourceColumnFinder sourceColumnFinder, final Table table,
            final InputColumn<?>[] inputColumns) {
        if (table == null || inputColumns == null || inputColumns.length == 0) {
            return new InputColumn<?>[0];
        }
        final List<InputColumn<?>> result = new ArrayList<>();
        for (final InputColumn<?> inputColumn : inputColumns) {
            final Set<Column> sourcePhysicalColumns = sourceColumnFinder.findOriginatingColumns(inputColumn);
            for (final Column physicalColumn : sourcePhysicalColumns) {
                if (table.equals(physicalColumn.getTable())) {
                    result.add(inputColumn);
                    break;
                }
            }
        }
        return result.toArray(new InputColumn<?>[result.size()]);
    }

    /**
     * @return the number of registered publishers (one per stream)
     */
    public int size() {
        return _rowProcessingPublishers.size();
    }

    /**
     * @return the publisher registered for the given stream, or null if none
     */
    public RowProcessingPublisher getRowProcessingPublisher(final RowProcessingStream stream) {
        return _rowProcessingPublishers.get(stream);
    }

    /**
     * Finds the registered stream for a table, preferring reference identity
     * over {@link Object#equals(Object)}.
     *
     * @return the matching stream, or null if none matches
     */
    public RowProcessingStream getStream(final Table table) {
        final Set<RowProcessingStream> dataStreams = _rowProcessingPublishers.keySet();
        for (final RowProcessingStream stream : dataStreams) {
            // first try with object equality because tables may be equal in
            // some corner cases
            if (stream.getTable() == table) {
                return stream;
            }
        }
        for (final RowProcessingStream stream : dataStreams) {
            if (table.equals(stream.getTable())) {
                return stream;
            }
        }
        return null;
    }

    /**
     *
     * @param table
     * @return
     *
     * @deprecated use {@link #getRowProcessingPublisher(RowProcessingStream)}
     *             instead
     */
    @Deprecated
    public RowProcessingPublisher getRowProcessingPublisher(final Table table) {
        final RowProcessingStream stream = getStream(table);
        return getRowProcessingPublisher(stream);
    }

    /**
     * @return all registered publishers, in registration order (parents
     *         before their output-data-stream children)
     */
    public Collection<RowProcessingPublisher> getRowProcessingPublishers() {
        return _rowProcessingPublishers.values();
    }

    /**
     * @return all registered streams, in registration order
     */
    public RowProcessingStream[] getStreams() {
        final Set<RowProcessingStream> streams = _rowProcessingPublishers.keySet();
        return streams.toArray(new RowProcessingStream[streams.size()]);
    }

    /**
     *
     * @return
     *
     * @deprecated use {@link #getStreams()} instead
     */
    @Deprecated
    public Table[] getTables() {
        final RowProcessingStream[] streams = getStreams();
        final Table[] tables = new Table[streams.length];
        for (int i = 0; i < tables.length; i++) {
            tables[i] = streams[i].getTable();
        }
        return tables;
    }

    /**
     * @return a metrics view over the analysis job and these publishers
     */
    public AnalysisJobMetrics getAnalysisJobMetrics() {
        return new AnalysisJobMetricsImpl(_analysisJob, this);
    }

    protected AnalysisListener getAnalysisListener() {
        return _analysisListener;
    }

    protected LifeCycleHelper getLifeCycleHelper() {
        return _lifeCycleHelper;
    }

    public TaskRunner getTaskRunner() {
        return _taskRunner;
    }

    /**
     * Builds a {@link LifeCycleHelper} scoped to a single consumer: it wraps
     * the outer {@link InjectionManager} in a {@link ContextAwareInjectionManager}
     * carrying the consumer's job and component context, preserving the outer
     * helper's include-non-distributed-tasks setting.
     *
     * @param consumer the consumer to scope the helper to
     * @return a consumer-specific life cycle helper
     */
    public LifeCycleHelper getConsumerSpecificLifeCycleHelper(final RowProcessingConsumer consumer) {
        final LifeCycleHelper outerLifeCycleHelper = getLifeCycleHelper();
        final boolean includeNonDistributedTasks = outerLifeCycleHelper.isIncludeNonDistributedTasks();
        final InjectionManager outerInjectionManager = outerLifeCycleHelper.getInjectionManager();
        final ContextAwareInjectionManager injectionManager = new ContextAwareInjectionManager(outerInjectionManager,
                consumer.getAnalysisJob(), consumer.getComponentJob(), getAnalysisListener());

        return new LifeCycleHelper(injectionManager, includeNonDistributedTasks);
    }

    public ErrorAware getErrorAware() {
        return _errorAware;
    }
}