/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.util;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.metamodel.schema.Column;
import org.apache.metamodel.schema.Table;
import org.datacleaner.api.ExpressionBasedInputColumn;
import org.datacleaner.api.InputColumn;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.ComponentRequirement;
import org.datacleaner.job.FilterOutcome;
import org.datacleaner.job.HasComponentRequirement;
import org.datacleaner.job.HasFilterOutcomes;
import org.datacleaner.job.InputColumnSinkJob;
import org.datacleaner.job.InputColumnSourceJob;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.job.builder.SourceColumns;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Helper class for traversing dependencies between virtual and physical columns.
 *
 * For performance reasons this class stores found originating columns in an internal cache. The cache is
 * cleared whenever new sources are registered through one of the {@code addSources} methods, but there is
 * no mechanism to detect changes made to components that have already been registered. Instances of this
 * class should therefore not be assigned to fields of other classes.
 */
public class SourceColumnFinder {

    private static final String LOG_MESSAGE_RECURSIVE_TRAVERSAL =
            "Ending traversal of object graph because the same originating objects are appearing recursively";

    private static final Logger logger = LoggerFactory.getLogger(SourceColumnFinder.class);

    // Caches of already resolved originating columns. Values stored here are never handed out directly by
    // the public API; see the findOriginatingColumns(...) methods.
    private final Map<InputColumn<?>, Set<Column>> originatingColumnsOfInputColumnCache = new HashMap<>();
    private final Map<Object, Set<Column>> originatingColumnsOfSourceCache = new HashMap<>();

    // Registered components, grouped by the role(s) they implement. A single component may appear in
    // several of these sets.
    private final Set<InputColumnSinkJob> _inputColumnSinks = new HashSet<>();
    // Insertion-ordered so that lookups over the sources iterate in registration order.
    private final Set<InputColumnSourceJob> _inputColumnSources = new LinkedHashSet<>();
    private final Set<HasFilterOutcomes> _outcomeSources = new HashSet<>();
    private final Set<HasComponentRequirement> _outcomeSinks = new HashSet<>();

    /**
     * Registers each of the given objects in every role-specific set whose interface it implements.
     * Objects implementing none of the known interfaces are silently ignored.
     *
     * @param sources
     *            the objects to register
     */
    private void addSources(final Object... sources) {
        // Previously cached results may be incomplete once additional sources are known.
        originatingColumnsOfInputColumnCache.clear();
        originatingColumnsOfSourceCache.clear();

        for (final Object source : sources) {
            if (source instanceof InputColumnSinkJob) {
                _inputColumnSinks.add((InputColumnSinkJob) source);
            }
            if (source instanceof InputColumnSourceJob) {
                _inputColumnSources.add((InputColumnSourceJob) source);
            }
            if (source instanceof HasFilterOutcomes) {
                _outcomeSources.add((HasFilterOutcomes) source);
            }
            if (source instanceof HasComponentRequirement) {
                _outcomeSinks.add((HasComponentRequirement) source);
            }
        }
    }

    /**
     * Registers every element of the given collection, see {@link #addSources(Object...)}.
     *
     * @param sources
     *            the objects to register
     */
    private void addSources(final Collection<?> sources) {
        addSources(sources.toArray());
    }

    /**
     * Registers the source columns, filters, transformers and analyzers of a job builder.
     *
     * @param job
     *            the job builder whose components should be registered
     */
    public void addSources(final AnalysisJobBuilder job) {
        addSources(new SourceColumns(job.getSourceColumns()));
        addSources(job.getFilterComponentBuilders());
        addSources(job.getTransformerComponentBuilders());
        addSources(job.getAnalyzerComponentBuilders());
    }

    /**
     * Registers the source columns, filters, transformers and analyzers of a job.
     *
     * @param job
     *            the job whose components should be registered
     */
    public void addSources(final AnalysisJob job) {
        addSources(new SourceColumns(job.getSourceColumns()));
        addSources(job.getFilterJobs());
        addSources(job.getTransformerJobs());
        addSources(job.getAnalyzerJobs());
    }

    /**
     * Collects the output columns of all registered column sources that match a data type.
     *
     * @param dataType
     *            the required data type, or null to accept every column
     * @return a new list with the matching columns. Columns that report a null data type are always
     *         included.
     */
    public List<InputColumn<?>> findInputColumns(final Class<?> dataType) {
        final List<InputColumn<?>> result = new ArrayList<>();
        for (final InputColumnSourceJob source : _inputColumnSources) {
            for (final InputColumn<?> col : source.getOutput()) {
                final Class<?> columnDataType = col.getDataType();
                if (dataType == null || columnDataType == null || ReflectionUtils.is(columnDataType, dataType)) {
                    result.add(col);
                }
            }
        }
        return result;
    }

    /**
     * Finds all source jobs/components for a particular job/component. This
     * method uses {@link Object} as types because input and output can be quite
     * polymorphic. Typically {@link InputColumnSinkJob},
     * {@link InputColumnSourceJob}, {@link HasComponentRequirement} and
     * {@link HasFilterOutcomes} implementations are used.
     *
     * @param job
     *            typically some {@link InputColumnSinkJob}
     * @return a new set of jobs/components that are a direct or transitive source of this job.
     */
    public Set<Object> findAllSourceJobs(final Object job) {
        final Set<Object> result = new HashSet<>();
        findAllSourceJobs(job, result);
        return result;
    }

    /**
     * Recursive worker of {@link #findAllSourceJobs(Object)}. The result set doubles as the "visited" set,
     * so every source is expanded at most once.
     */
    private void findAllSourceJobs(final Object job, final Set<Object> result) {
        if (job == null) {
            return;
        }

        if (job instanceof InputColumnSinkJob) {
            final InputColumn<?>[] inputColumns = ((InputColumnSinkJob) job).getInput();
            if (inputColumns != null) {
                for (final InputColumn<?> inputColumn : inputColumns) {
                    addSourceAndRecurse(findInputColumnSource(inputColumn), result);
                }
            }
        }

        if (job instanceof HasComponentRequirement) {
            final ComponentRequirement requirement = ((HasComponentRequirement) job).getComponentRequirement();
            // a null requirement is handled by the null-check at the top of this method
            findAllSourceJobs(requirement, result);
        }

        if (job instanceof ComponentRequirement) {
            for (final FilterOutcome outcome : getProcessingDependencies((ComponentRequirement) job)) {
                addSourceAndRecurse(findOutcomeSource(outcome), result);
            }
        }
    }

    /**
     * Adds a found source to the result and, only if it was not present before, continues the traversal
     * from that source.
     */
    private void addSourceAndRecurse(final Object source, final Set<Object> result) {
        if (source != null && result.add(source)) {
            findAllSourceJobs(source, result);
        }
    }

    /**
     * Finds the registered component that produces a given column.
     *
     * @param inputColumn
     *            the column to look up
     * @return the first registered source whose output contains a column equal to the given one, or null
     *         if the column is null, is an {@link ExpressionBasedInputColumn} or has no registered source.
     */
    public InputColumnSourceJob findInputColumnSource(final InputColumn<?> inputColumn) {
        if (inputColumn == null || inputColumn instanceof ExpressionBasedInputColumn) {
            return null;
        }
        for (final InputColumnSourceJob source : _inputColumnSources) {
            for (final InputColumn<?> column : source.getOutput()) {
                if (inputColumn.equals(column)) {
                    return source;
                }
            }
        }
        return null;
    }

    /**
     * Finds the registered component that produces a given filter outcome.
     *
     * @param requirement
     *            the outcome to look up
     * @return a registered source whose outcomes contain one equal to the given outcome, or null if the
     *         outcome is null or no such source is registered.
     */
    public HasFilterOutcomes findOutcomeSource(final FilterOutcome requirement) {
        if (requirement == null) {
            return null;
        }
        for (final HasFilterOutcomes source : _outcomeSources) {
            for (final FilterOutcome outcome : source.getFilterOutcomes()) {
                if (requirement.equals(outcome)) {
                    return source;
                }
            }
        }
        return null;
    }

    /**
     * Finds the physical columns that the component producing a filter outcome depends on.
     *
     * @param requirement
     *            the outcome to resolve
     * @return a new, mutable set of originating columns; empty if none could be found.
     */
    public Set<Column> findOriginatingColumns(final FilterOutcome requirement) {
        // defensive copy: the cached set must not be exposed to (and modified by) callers
        return new HashSet<>(findOriginatingColumnsOfOutcome(requirement));
    }

    /**
     * Finds the single table that the component producing a filter outcome originates from.
     *
     * @param requirement
     *            the outcome to resolve
     * @return the originating table, or null if none could be found
     * @throws IllegalStateException
     *             if more than one originating table is found
     */
    public Table findOriginatingTable(final FilterOutcome requirement) {
        return findOriginatingTable(requirement, new HashSet<>());
    }

    private Table findOriginatingTable(final FilterOutcome requirement, final Set<Object> resolvedSet) {
        final HasFilterOutcomes source = findOutcomeSource(requirement);
        if (source == null) {
            // nothing to traverse; an unknown source is not a sign of recursion
            return null;
        }
        if (!resolvedSet.add(source)) {
            logger.debug(LOG_MESSAGE_RECURSIVE_TRAVERSAL);
            return null;
        }
        return findOriginatingTableOfSource(source, resolvedSet);
    }

    /**
     * Finds the single table that a column originates from.
     *
     * @param inputColumn
     *            the column to resolve
     * @return the table of the physical column, or for virtual columns the table found by traversing the
     *         producing component's own inputs and requirements; null if none could be found
     * @throws IllegalStateException
     *             if more than one originating table is found
     */
    public Table findOriginatingTable(final InputColumn<?> inputColumn) {
        return findOriginatingTable(inputColumn, new HashSet<>());
    }

    private Table findOriginatingTable(final InputColumn<?> inputColumn, final Set<Object> resolvedSet) {
        if (inputColumn == null) {
            logger.warn("InputColumn was null, no originating table found");
            return null;
        }
        if (!resolvedSet.add(inputColumn)) {
            logger.debug(LOG_MESSAGE_RECURSIVE_TRAVERSAL);
            return null;
        }
        if (inputColumn.isPhysicalColumn()) {
            return inputColumn.getPhysicalColumn().getTable();
        }

        final InputColumnSourceJob inputColumnSource = findInputColumnSource(inputColumn);
        if (inputColumnSource == null) {
            // nothing to traverse; an unknown source is not a sign of recursion
            return null;
        }
        if (!resolvedSet.add(inputColumnSource)) {
            logger.debug(LOG_MESSAGE_RECURSIVE_TRAVERSAL);
            return null;
        }
        return findOriginatingTableOfSource(inputColumnSource, resolvedSet);
    }

    /**
     * Resolves the originating table of a component by looking at both its input columns and the filter
     * outcomes it requires.
     *
     * @param source
     *            the component to resolve, may be null
     * @param resolvedSet
     *            objects already visited during this traversal
     * @return the single originating table, or null if none was found
     * @throws IllegalStateException
     *             if more than one originating table is found
     */
    private Table findOriginatingTableOfSource(final Object source, final Set<Object> resolvedSet) {
        // sorted set: gives a stable order of table names in the exception message below
        final Set<Table> result = new TreeSet<>();

        if (source instanceof InputColumnSinkJob) {
            final InputColumn<?>[] input = ((InputColumnSinkJob) source).getInput();
            if (input != null) {
                for (final InputColumn<?> col : input) {
                    if (col == null) {
                        logger.warn("InputColumn sink had a null-column element!");
                        continue;
                    }
                    final Table table = findOriginatingTable(col, resolvedSet);
                    if (table != null) {
                        result.add(table);
                    }
                }
            }
        }

        if (source instanceof HasComponentRequirement) {
            final ComponentRequirement componentRequirement =
                    ((HasComponentRequirement) source).getComponentRequirement();
            for (final FilterOutcome outcome : getProcessingDependencies(componentRequirement)) {
                final Table table = findOriginatingTable(outcome, resolvedSet);
                if (table != null) {
                    result.add(table);
                }
            }
        }

        if (result.isEmpty()) {
            return null;
        }
        if (result.size() == 1) {
            return result.iterator().next();
        }

        final StringBuilder sb = new StringBuilder();
        for (final Table table : result) {
            if (sb.length() != 0) {
                sb.append(", ");
            }
            sb.append(table.getName());
        }
        throw new IllegalStateException("Multiple originating tables (" + sb + ") found for source: " + source);
    }

    /**
     * Resolves (and caches) the physical columns behind a column. The returned set is the cached instance
     * and must not be handed out to callers of the public API.
     */
    private Set<Column> findOriginatingColumnsOfInputColumn(final InputColumn<?> inputColumn) {
        final Set<Column> cachedOriginatingColumns = originatingColumnsOfInputColumnCache.get(inputColumn);
        if (cachedOriginatingColumns != null) {
            return cachedOriginatingColumns;
        }

        final Set<Column> originatingColumns = new HashSet<>();
        if (inputColumn != null) {
            if (inputColumn.isPhysicalColumn()) {
                originatingColumns.add(inputColumn.getPhysicalColumn());
            } else {
                final InputColumnSourceJob source = findInputColumnSource(inputColumn);
                originatingColumns.addAll(findOriginatingColumnsOfSource(source));
            }
        }

        originatingColumnsOfInputColumnCache.put(inputColumn, originatingColumns);
        return originatingColumns;
    }

    /**
     * Resolves the physical columns behind the component that produces the given outcome.
     */
    private Set<Column> findOriginatingColumnsOfOutcome(final FilterOutcome requirement) {
        final HasFilterOutcomes source = findOutcomeSource(requirement);
        return findOriginatingColumnsOfSource(source);
    }

    /**
     * Resolves (and caches) the physical columns behind a component, following both its input columns and
     * the filter outcomes it requires. The returned set is the cached instance and must not be handed out
     * to callers of the public API.
     *
     * Note that this traversal has no guard against cyclic dependencies between components.
     */
    private Set<Column> findOriginatingColumnsOfSource(final Object source) {
        final Set<Column> cachedOriginatingColumns = originatingColumnsOfSourceCache.get(source);
        if (cachedOriginatingColumns != null) {
            return cachedOriginatingColumns;
        }

        final Set<Column> originatingColumns = new HashSet<>();
        if (source instanceof InputColumnSinkJob) {
            final InputColumn<?>[] input = ((InputColumnSinkJob) source).getInput();
            if (input != null) {
                for (final InputColumn<?> inputColumn : input) {
                    originatingColumns.addAll(findOriginatingColumnsOfInputColumn(inputColumn));
                }
            }
        }
        if (source instanceof HasComponentRequirement) {
            final ComponentRequirement componentRequirement =
                    ((HasComponentRequirement) source).getComponentRequirement();
            for (final FilterOutcome outcome : getProcessingDependencies(componentRequirement)) {
                originatingColumns.addAll(findOriginatingColumnsOfOutcome(outcome));
            }
        }

        // a null source is cached as well (under the null key) with an empty result
        originatingColumnsOfSourceCache.put(source, originatingColumns);
        return originatingColumns;
    }

    /**
     * Null-safe accessor for the processing dependencies of a requirement.
     *
     * @return the dependencies, or an empty collection if the requirement or its dependencies are null
     */
    private Collection<FilterOutcome> getProcessingDependencies(final ComponentRequirement componentRequirement) {
        if (componentRequirement == null) {
            return Collections.emptyList();
        }
        final Collection<FilterOutcome> processingDependencies = componentRequirement.getProcessingDependencies();
        if (processingDependencies == null) {
            return Collections.emptyList();
        }
        return processingDependencies;
    }

    /**
     * Finds the physical columns that a (possibly virtual) column is derived from.
     *
     * @param inputColumn
     *            the column to resolve, may be null
     * @return a new, mutable set of originating columns; empty if none could be found.
     */
    public Set<Column> findOriginatingColumns(final InputColumn<?> inputColumn) {
        // TODO: Detect cyclic dependencies between transformers (A depends on
        // B, B depends on A)

        // defensive copy: the cached set must not be exposed to (and modified by) callers
        return new HashSet<>(findOriginatingColumnsOfInputColumn(inputColumn));
    }
}