/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.job.builder; import java.lang.reflect.Array; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import org.apache.metamodel.schema.Table; import org.datacleaner.api.Analyzer; import org.datacleaner.api.ColumnProperty; import org.datacleaner.api.InputColumn; import org.datacleaner.api.OutputDataStream; import org.datacleaner.descriptors.AnalyzerDescriptor; import org.datacleaner.descriptors.ConfiguredPropertyDescriptor; import org.datacleaner.job.AnalysisJobImmutabilizer; import org.datacleaner.job.AnalyzerJob; import org.datacleaner.job.ComponentConfigurationException; import org.datacleaner.job.ComponentRequirement; import org.datacleaner.job.ImmutableAnalyzerJob; import org.datacleaner.job.ImmutableComponentConfiguration; import org.datacleaner.job.OutputDataStreamJob; import org.datacleaner.util.LabelUtils; import org.datacleaner.util.ReflectionUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A {@link ComponentBuilder} for {@link Analyzer}s. * * @param <A> * the type of {@link Analyzer} being built. */ public final class AnalyzerComponentBuilder<A extends Analyzer<?>> extends AbstractComponentBuilder<AnalyzerDescriptor<A>, A, AnalyzerComponentBuilder<A>> { public static final String METADATA_PROPERTY_BUILDER_ID = "org.datacleaner.componentbuilder.id"; public static final String METADATA_PROPERTY_BUILDER_PARTITION_INDEX = "org.datacleaner.componentbuilder.partition.index"; private static final Logger logger = LoggerFactory.getLogger(AnalysisJobBuilder.class); /** * Field that determines if this analyzer is applicable for building * multiple jobs where the input columns have been partitioned based on * input size (single or multiple) and originating table */ private final boolean _multipleJobsSupported; private final List<InputColumn<?>> _escalatingInputColumns; private final ConfiguredPropertyDescriptor _escalatingInputProperty; private final List<AnalyzerChangeListener> _localChangeListeners; public AnalyzerComponentBuilder(final AnalysisJobBuilder analysisJobBuilder, final AnalyzerDescriptor<A> descriptor) { super(analysisJobBuilder, descriptor, AnalyzerComponentBuilder.class); final Set<ConfiguredPropertyDescriptor> requiredInputProperties = descriptor.getConfiguredPropertiesForInput(false); if (requiredInputProperties.size() == 1) { _escalatingInputProperty = requiredInputProperties.iterator().next(); final ColumnProperty columnProperty = _escalatingInputProperty.getAnnotation(ColumnProperty.class); _multipleJobsSupported = columnProperty != null && !_escalatingInputProperty.isArray() && columnProperty .escalateToMultipleJobs(); _escalatingInputColumns = new ArrayList<>(); } else { _multipleJobsSupported = false; _escalatingInputProperty = null; _escalatingInputColumns = Collections.emptyList(); } _localChangeListeners = new ArrayList<>(0); } /** * Builds a temporary list of all listeners, both global and local * * @return */ private List<AnalyzerChangeListener> getAllListeners() { @SuppressWarnings("deprecation") final List<AnalyzerChangeListener> globalChangeListeners = getAnalysisJobBuilder().getAnalyzerChangeListeners(); final List<AnalyzerChangeListener> list = new ArrayList<>(globalChangeListeners.size() + _localChangeListeners.size()); list.addAll(globalChangeListeners); list.addAll(_localChangeListeners); return list; } public boolean isMultipleJobsDeterminedBy(final ConfiguredPropertyDescriptor propertyDescriptor) { return _multipleJobsSupported && !propertyDescriptor.isArray() && propertyDescriptor.isInputColumn() && propertyDescriptor.isRequired(); } public AnalyzerJob toAnalyzerJob() throws IllegalStateException { return toAnalyzerJob(true); } public AnalyzerJob toAnalyzerJob(final boolean validate) throws IllegalStateException { final AnalyzerJob[] analyzerJobs = toAnalyzerJobs(validate); if (analyzerJobs == null || analyzerJobs.length == 0) { return null; } if (validate && analyzerJobs.length > 1) { throw new IllegalStateException( "This builder generates " + analyzerJobs.length + " jobs, but a single job was requested"); } return analyzerJobs[0]; } public AnalyzerJob[] toAnalyzerJobs() throws IllegalStateException { return toAnalyzerJobs(true); } public AnalyzerJob[] toAnalyzerJobs(final AnalysisJobImmutabilizer immutabilizer) throws IllegalStateException { return toAnalyzerJobs(true, immutabilizer); } public AnalyzerJob[] toAnalyzerJobs(final boolean validate) throws IllegalStateException { return toAnalyzerJobs(validate, new AnalysisJobImmutabilizer()); } public AnalyzerJob[] toAnalyzerJobs(final boolean validate, final AnalysisJobImmutabilizer immutabilizer) throws IllegalStateException { final Map<ConfiguredPropertyDescriptor, Object> configuredProperties = getConfiguredProperties(); final ComponentRequirement componentRequirement = immutabilizer.load(getComponentRequirement()); final List<InputColumn<?>> inputColumns; if (_escalatingInputProperty != null && !_escalatingInputColumns.isEmpty()) { inputColumns = _escalatingInputColumns; } else { inputColumns = getInputColumns(); } if (validate && inputColumns.isEmpty()) { throw new IllegalStateException("No input column(s) configured"); } final List<InputColumn<?>> tableLessColumns = new ArrayList<>(); final Map<Table, List<InputColumn<?>>> originatingTables = new LinkedHashMap<>(); for (final InputColumn<?> inputColumn : inputColumns) { final Table table = getAnalysisJobBuilder().getOriginatingTable(inputColumn); if (table == null) { // some columns (such as those based on an expression) don't // originate from a table. They should be applied to all jobs. tableLessColumns.add(inputColumn); } else { List<InputColumn<?>> list = originatingTables.get(table); if (list == null) { list = new ArrayList<>(); } list.add(inputColumn); originatingTables.put(table, list); } } if (validate && originatingTables.isEmpty()) { final List<Table> sourceTables = getAnalysisJobBuilder().getSourceTables(); if (sourceTables.size() == 1) { logger.info("Only a single source table is available, so the source of analyzer '{}' is inferred", this); final Table table = sourceTables.get(0); originatingTables.put(table, new ArrayList<>()); } else { throw new IllegalStateException("Could not determine source for analyzer '" + this + "'"); } } if (!isMultipleJobsSupported() && originatingTables.size() == 1) { // there's only a single table involved - leave the input columns // untouched and keep the output data stream final OutputDataStreamJob[] outputDataStreamJobs = immutabilizer.load(getOutputDataStreamJobs(), validate); final ImmutableAnalyzerJob job = new ImmutableAnalyzerJob(getName(), getDescriptor(), new ImmutableComponentConfiguration(configuredProperties), componentRequirement, getMetadataProperties(), outputDataStreamJobs); return new AnalyzerJob[] { job }; } for (final Entry<Table, List<InputColumn<?>>> entry : originatingTables.entrySet()) { entry.getValue().addAll(tableLessColumns); } final List<AnalyzerJob> jobs = new ArrayList<>(); final Set<Entry<Table, List<InputColumn<?>>>> entrySet = originatingTables.entrySet(); int partitionIndex = 0; for (final Entry<Table, List<InputColumn<?>>> entry : entrySet) { final List<InputColumn<?>> columnsOfTable = entry.getValue(); if (_escalatingInputProperty == null || _escalatingInputProperty.isArray()) { // escalation will happen only for multi-table input jobs.add(createPartitionedJob(null, columnsOfTable, configuredProperties, partitionIndex++)); } else { for (final InputColumn<?> escalatingColumn : columnsOfTable) { // escalation happens for each column jobs.add(createPartitionedJob(escalatingColumn, columnsOfTable, configuredProperties, partitionIndex++)); } } } if (validate && !isConfigured()) { throw new IllegalStateException("Row processing Analyzer job is not correctly configured"); } return jobs.toArray(new AnalyzerJob[jobs.size()]); } @Override public AnalyzerComponentBuilder<A> addInputColumn(final InputColumn<?> inputColumn, final ConfiguredPropertyDescriptor propertyDescriptor) { assert propertyDescriptor.isInputColumn(); if (inputColumn == null) { throw new IllegalArgumentException("InputColumn cannot be null"); } if (isMultipleJobsDeterminedBy(propertyDescriptor)) { _escalatingInputColumns.add(inputColumn); registerListenerIfLinkedToTransformer(propertyDescriptor, _escalatingInputColumns.toArray(new InputColumn<?>[_escalatingInputColumns.size()])); return this; } else { return super.addInputColumn(inputColumn, propertyDescriptor); } } @Override public AnalyzerComponentBuilder<A> removeInputColumn(final InputColumn<?> inputColumn, final ConfiguredPropertyDescriptor propertyDescriptor) { assert propertyDescriptor.isInputColumn(); if (inputColumn == null) { throw new IllegalArgumentException("InputColumn cannot be null"); } if (isMultipleJobsDeterminedBy(propertyDescriptor)) { _escalatingInputColumns.remove(inputColumn); return this; } else { return super.removeInputColumn(inputColumn, propertyDescriptor); } } @Override public boolean isConfigured(final ConfiguredPropertyDescriptor configuredProperty, final boolean throwException) { if (isMultipleJobsSupported() && configuredProperty == _escalatingInputProperty) { if (_escalatingInputColumns.isEmpty()) { final Object propertyValue = super.getConfiguredProperty(configuredProperty); if (propertyValue != null) { if (propertyValue.getClass().isArray() && Array.getLength(propertyValue) > 0) { setConfiguredProperty(configuredProperty, propertyValue); return isConfigured(configuredProperty, throwException); } } if (throwException) { throw new ComponentConfigurationException( "No input columns configured for " + LabelUtils.getLabel(this)); } else { return false; } } return true; } return super.isConfigured(configuredProperty, throwException); } private AnalyzerJob createPartitionedJob(final InputColumn<?> escalatingColumnValue, final Collection<InputColumn<?>> availableColumns, final Map<ConfiguredPropertyDescriptor, Object> configuredProperties, final int partitionIndex) { final Map<ConfiguredPropertyDescriptor, Object> jobProperties = new HashMap<>(configuredProperties); for (final Entry<ConfiguredPropertyDescriptor, Object> jobProperty : jobProperties.entrySet()) { final ConfiguredPropertyDescriptor propertyDescriptor = jobProperty.getKey(); if (propertyDescriptor.isInputColumn()) { final Object unpartitionedValue; if (escalatingColumnValue != null && _escalatingInputProperty == propertyDescriptor) { unpartitionedValue = escalatingColumnValue; } else { unpartitionedValue = jobProperty.getValue(); } final Object partitionedValue = partitionValue(propertyDescriptor, unpartitionedValue, availableColumns); jobProperty.setValue(partitionedValue); } } // set the component builder ID property to allow correlating partion // jobs back to their builder final Map<String, String> metadataProperties = new LinkedHashMap<>(getMetadataProperties()); metadataProperties.put(METADATA_PROPERTY_BUILDER_ID, "" + System.identityHashCode(this)); metadataProperties.put(METADATA_PROPERTY_BUILDER_PARTITION_INDEX, "" + partitionIndex); // we do not currently support this combination of multiple analyzer // jobs and having output data streams final OutputDataStreamJob[] outputDataStreamJobs = new OutputDataStreamJob[0]; final ComponentRequirement componentRequirement = new AnalysisJobImmutabilizer().load(getComponentRequirement()); return new ImmutableAnalyzerJob(getName(), getDescriptor(), new ImmutableComponentConfiguration(jobProperties), componentRequirement, metadataProperties, outputDataStreamJobs); } private Object partitionValue(final ConfiguredPropertyDescriptor key, final Object unpartitionedValue, final Collection<InputColumn<?>> availableColumns) { if (unpartitionedValue instanceof InputColumn[]) { final InputColumn<?>[] array = (InputColumn<?>[]) unpartitionedValue; final List<InputColumn<?>> result = new ArrayList<>(); for (final InputColumn<?> inputColumn : array) { if (availableColumns.contains(inputColumn)) { result.add(inputColumn); } } if (!key.isArray()) { if (result.isEmpty()) { return null; } return result.get(0); } return result.toArray(new InputColumn<?>[result.size()]); } return unpartitionedValue; } @Override public String toString() { return "AnalyzerComponentBuilder[analyzer=" + getDescriptor().getDisplayName() + ",inputColumns=" + getInputColumns() + "]"; } @Override public AnalyzerComponentBuilder<A> setConfiguredProperty(final ConfiguredPropertyDescriptor configuredProperty, final Object value) { if (isMultipleJobsDeterminedBy(configuredProperty)) { // the dummy value is used just to pass something to the underlying // prototype bean. final InputColumn<?> dummyValue; _escalatingInputColumns.clear(); if (value == null) { dummyValue = null; } else if (ReflectionUtils.isArray(value)) { final int length = Array.getLength(value); for (int i = 0; i < length; i++) { final InputColumn<?> inputColumn = (InputColumn<?>) Array.get(value, i); _escalatingInputColumns.add(inputColumn); } if (_escalatingInputColumns.isEmpty()) { dummyValue = null; } else { dummyValue = _escalatingInputColumns.iterator().next(); } } else { final InputColumn<?> col = (InputColumn<?>) value; _escalatingInputColumns.add(col); dummyValue = col; } final AnalyzerComponentBuilder<A> componentBuilder; if (configuredProperty.isArray()) { final InputColumn<?>[] inputColumsArray; if (dummyValue == null) { inputColumsArray = new InputColumn[0]; } else { inputColumsArray = new InputColumn[] { dummyValue }; } componentBuilder = super.setConfiguredProperty(configuredProperty, inputColumsArray); } else { componentBuilder = super.setConfiguredProperty(configuredProperty, dummyValue); } registerListenerIfLinkedToTransformer(configuredProperty, value); return componentBuilder; } else { return super.setConfiguredProperty(configuredProperty, value); } } @Override public Object getConfiguredProperty(final ConfiguredPropertyDescriptor propertyDescriptor) { if (isMultipleJobsDeterminedBy(propertyDescriptor)) { return _escalatingInputColumns.toArray(new InputColumn[_escalatingInputColumns.size()]); } else { return super.getConfiguredProperty(propertyDescriptor); } } @Override public void onConfigurationChanged() { super.onConfigurationChanged(); final List<AnalyzerChangeListener> listeners = getAllListeners(); for (final AnalyzerChangeListener listener : listeners) { listener.onConfigurationChanged(this); } } @Override public void onRequirementChanged() { super.onRequirementChanged(); final List<AnalyzerChangeListener> listeners = getAllListeners(); for (final AnalyzerChangeListener listener : listeners) { listener.onRequirementChanged(this); } } public boolean isMultipleJobsSupported() { return _multipleJobsSupported; } @Override public List<OutputDataStream> getOutputDataStreams() { if (isMultipleJobsSupported()) { return Collections.emptyList(); } return super.getOutputDataStreams(); } @Override protected Map<ConfiguredPropertyDescriptor, Object> getConfiguredPropertiesForQuestioning() { final Map<ConfiguredPropertyDescriptor, Object> properties = super.getConfiguredPropertiesForQuestioning(); if (!isMultipleJobsSupported()) { return properties; } // create a mutable copy and replace the property values that are final Map<ConfiguredPropertyDescriptor, Object> map = new HashMap<>(properties); for (final Entry<ConfiguredPropertyDescriptor, Object> entry : map.entrySet()) { if (isMultipleJobsDeterminedBy(entry.getKey())) { final Object value = entry.getValue(); if (Array.getLength(value) > 1) { // pick the first element final Object element = Array.get(value, 0); entry.setValue(element); } } } return Collections.unmodifiableMap(map); } /** * Notification method invoked when transformer is removed. */ @Override protected void onRemovedInternal() { final List<AnalyzerChangeListener> listeners = getAllListeners(); for (final AnalyzerChangeListener listener : listeners) { listener.onRemove(this); } } /** * Adds a change listener to this component * * @param listener */ public void addChangeListener(final AnalyzerChangeListener listener) { _localChangeListeners.add(listener); } /** * Removes a change listener from this component * * @param listener * @return whether or not the listener was found and removed. */ public boolean removeChangeListener(final AnalyzerChangeListener listener) { return _localChangeListeners.remove(listener); } }