/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.job.runner; import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.Queue; import org.apache.metamodel.schema.ColumnType; import org.apache.metamodel.schema.MutableColumn; import org.apache.metamodel.schema.MutableTable; import org.apache.metamodel.schema.Table; import org.datacleaner.beans.StringAnalyzer; import org.datacleaner.components.convert.ConvertToStringTransformer; import org.datacleaner.components.fuse.CoalesceMultipleFieldsTransformer; import org.datacleaner.components.fuse.CoalesceUnit; import org.datacleaner.configuration.DataCleanerConfigurationImpl; import org.datacleaner.data.MetaModelInputColumn; import org.datacleaner.data.MutableInputColumn; import org.datacleaner.job.AnalysisJob; import org.datacleaner.job.AnalyzerJob; import org.datacleaner.job.ComponentJob; import org.datacleaner.job.FilterJob; import org.datacleaner.job.TransformerJob; import org.datacleaner.job.builder.AnalysisJobBuilder; import org.datacleaner.job.builder.FilterComponentBuilder; import org.datacleaner.job.builder.TransformerComponentBuilder; import org.datacleaner.test.MockTransformer; import junit.framework.TestCase; public class RowProcessingConsumerSorterTest extends TestCase { private MutableColumn physicalColumn; @Override protected void setUp() throws Exception { super.setUp(); physicalColumn = new MutableColumn("foo", ColumnType.VARCHAR); physicalColumn.setTable(new MutableTable("bar").addColumn(physicalColumn)); } public void testCreateProcessOrderedConsumerListNoConsumers() throws Exception { final List<RowProcessingConsumer> consumerList = new RowProcessingConsumerSorter(new ArrayList<>()).createProcessOrderedConsumerList(); assertTrue(consumerList.isEmpty()); } public void testCreateProcessOrderedConsumerListWithMergedOutcomes() throws Exception { final AnalysisJobBuilder ajb = new AnalysisJobBuilder(new DataCleanerConfigurationImpl()); ajb.setDatastoreConnection(new MockDatastoreConnection()); ajb.addSourceColumn(physicalColumn); final MetaModelInputColumn inputColumn = ajb.getSourceColumns().get(0); // 1: add a filter final FilterComponentBuilder<MockFilter, MockFilter.Category> fjb1 = ajb.addFilter(MockFilter.class); fjb1.addInputColumn(inputColumn); fjb1.setName("fjb1"); // 2: trim (depends on filter) final TransformerComponentBuilder<MockTransformer> tjb1 = ajb.addTransformer(MockTransformer.class); tjb1.addInputColumn(inputColumn); tjb1.setRequirement(fjb1, MockFilter.Category.VALID); tjb1.setName("tjb1"); // 3: merge either the null or the trimmed value final TransformerComponentBuilder<CoalesceMultipleFieldsTransformer> coalesce = ajb.addTransformer(CoalesceMultipleFieldsTransformer.class); final CoalesceUnit unit1 = new CoalesceUnit(tjb1.getOutputColumns().get(0)); final CoalesceUnit unit2 = new CoalesceUnit(inputColumn); coalesce.getComponentInstance().configureUsingCoalesceUnits(unit1, unit2); final MutableInputColumn<?> mergedColumn1 = coalesce.getOutputColumns().get(0); // 4: add another filter (depends on merged output) final FilterComponentBuilder<MockFilter, MockFilter.Category> fjb2 = ajb.addFilter(MockFilter.class); fjb2.addInputColumn(mergedColumn1); fjb2.setName("fjb2"); // 5: add an analyzer ajb.addAnalyzer(StringAnalyzer.class).addInputColumn(mergedColumn1) .setRequirement(fjb2, MockFilter.Category.VALID); assertTrue(ajb.isConfigured()); List<RowProcessingConsumer> consumers = getConsumers(ajb.toAnalysisJob()); consumers = new RowProcessingConsumerSorter(consumers).createProcessOrderedConsumerList(); assertEquals(5, consumers.size()); assertEquals("ImmutableFilterJob[name=fjb1,filter=Mock filter]", consumers.get(0).getComponentJob().toString()); assertEquals("ImmutableTransformerJob[name=tjb1,transformer=Mock transformer]", consumers.get(1).getComponentJob().toString()); assertEquals("ImmutableTransformerJob[name=null,transformer=Fuse / Coalesce fields]", consumers.get(2).getComponentJob().toString()); assertEquals("ImmutableFilterJob[name=fjb2,filter=Mock filter]", consumers.get(3).getComponentJob().toString()); assertEquals("ImmutableAnalyzerJob[name=null,analyzer=String analyzer]", consumers.get(4).getComponentJob().toString()); ajb.close(); } public void testCreateProcessOrderedConsumerListWithFilterDependencies() throws Exception { final AnalysisJobBuilder ajb = new AnalysisJobBuilder(new DataCleanerConfigurationImpl()); ajb.setDatastoreConnection(new MockDatastoreConnection()); ajb.addSourceColumn(physicalColumn); final MetaModelInputColumn inputColumn = ajb.getSourceColumns().get(0); // 1: add a filter final FilterComponentBuilder<MockFilter, MockFilter.Category> fjb1 = ajb.addFilter(MockFilter.class); fjb1.addInputColumn(inputColumn); fjb1.setName("fjb1"); // 2: trim (depends on filter) final TransformerComponentBuilder<TransformerMock> tjb1 = ajb.addTransformer(TransformerMock.class); tjb1.addInputColumn(inputColumn); tjb1.setRequirement(fjb1, MockFilter.Category.VALID); tjb1.setName("tjb1"); // 3: trim again, just to examplify (depends on first trim output) final TransformerComponentBuilder<TransformerMock> tjb2 = ajb.addTransformer(TransformerMock.class); tjb2.addInputColumn(tjb1.getOutputColumns().get(0)); tjb2.setName("tjb2"); // 4: add a single word filter (depends on second trim) final FilterComponentBuilder<MockFilter, MockFilter.Category> fjb2 = ajb.addFilter(MockFilter.class); fjb2.addInputColumn(tjb2.getOutputColumns().get(0)); fjb2.setName("fjb2"); // 5 and 6: Analyze VALID and INVALID output of single-word filter // separately (the order of these two are not deterministic because of // the shuffle) ajb.addAnalyzer(StringAnalyzer.class).addInputColumn(inputColumn) .setRequirement(fjb2, MockFilter.Category.VALID); ajb.addAnalyzer(StringAnalyzer.class).addInputColumn(inputColumn) .setRequirement(fjb2, MockFilter.Category.INVALID); assertTrue(ajb.isConfigured()); List<RowProcessingConsumer> consumers = getConsumers(ajb.toAnalysisJob()); assertEquals(6, consumers.size()); consumers = new RowProcessingConsumerSorter(consumers).createProcessOrderedConsumerList(); assertEquals("ImmutableFilterJob[name=fjb1,filter=Mock filter]", consumers.get(0).getComponentJob().toString()); assertEquals("ImmutableTransformerJob[name=tjb1,transformer=Transformer mock]", consumers.get(1).getComponentJob().toString()); assertEquals("ImmutableTransformerJob[name=tjb2,transformer=Transformer mock]", consumers.get(2).getComponentJob().toString()); assertEquals("ImmutableFilterJob[name=fjb2,filter=Mock filter]", consumers.get(3).getComponentJob().toString()); ajb.close(); } public void testCreateProcessOrderedConsumerListChainedTransformers() throws Exception { final AnalysisJobBuilder ajb = new AnalysisJobBuilder(new DataCleanerConfigurationImpl()); ajb.addSourceColumn(physicalColumn); final TransformerComponentBuilder<TransformerMock> tjb1 = ajb.addTransformer(TransformerMock.class).addInputColumn(ajb.getSourceColumns().get(0)); final TransformerComponentBuilder<TransformerMock> tjb2 = ajb.addTransformer(TransformerMock.class).addInputColumn(tjb1.getOutputColumns().get(0)); final TransformerComponentBuilder<ConvertToStringTransformer> tjb3 = ajb.addTransformer(ConvertToStringTransformer.class).addInputColumn(tjb2.getOutputColumns().get(0)); ajb.addAnalyzer(StringAnalyzer.class).addInputColumn(ajb.getSourceColumns().get(0)); ajb.addAnalyzer(StringAnalyzer.class).addInputColumn(tjb3.getOutputColumns().get(0)); ajb.setDatastoreConnection(new MockDatastoreConnection()); assertTrue(ajb.isConfigured()); final AnalysisJob analysisJob = ajb.toAnalysisJob(); List<RowProcessingConsumer> consumers = getConsumers(analysisJob); consumers = new RowProcessingConsumerSorter(consumers).createProcessOrderedConsumerList(); assertEquals(5, consumers.size()); final List<TransformerJob> transformerJobs = new ArrayList<>(analysisJob.getTransformerJobs()); final List<AnalyzerJob> analyzerJobs = new ArrayList<>(analysisJob.getAnalyzerJobs()); // create a list that represents the expected dependent sequence final Queue<ComponentJob> jobDependencies = new LinkedList<>(); jobDependencies.add(transformerJobs.get(0)); jobDependencies.add(transformerJobs.get(1)); jobDependencies.add(transformerJobs.get(2)); jobDependencies.add(analyzerJobs.get(1)); int jobDependenciesFound = 0; boolean analyzerJob1found = false; ComponentJob nextJobDependency = jobDependencies.poll(); for (final RowProcessingConsumer rowProcessingConsumer : consumers) { final ComponentJob job = rowProcessingConsumer.getComponentJob(); if (job == nextJobDependency) { nextJobDependency = jobDependencies.poll(); jobDependenciesFound++; } else if (job == analyzerJobs.get(0)) { assertFalse(analyzerJob1found); analyzerJob1found = true; } else { fail("The consumers sort order is wrong! Found: " + job + " but expected: " + nextJobDependency); } } assertTrue(analyzerJob1found); assertEquals(4, jobDependenciesFound); ajb.close(); } private List<RowProcessingConsumer> getConsumers(final AnalysisJob analysisJob) { final List<RowProcessingConsumer> consumers = new ArrayList<>(); final ErrorAwareAnalysisListener errorListener = new ErrorAwareAnalysisListener(); final RowProcessingPublishers publishers = new RowProcessingPublishers(analysisJob, null, errorListener, null, null); final Table table = analysisJob.getSourceColumns().get(0).getPhysicalColumn().getTable(); final RowProcessingPublisher publisher = publishers.getRowProcessingPublisher(publishers.getStream(table)); for (final AnalyzerJob analyzerJob : analysisJob.getAnalyzerJobs()) { final RowProcessingConsumer consumer = new AnalyzerConsumer(analyzerJob.getDescriptor().newInstance(), analyzerJob, analyzerJob.getInput(), publisher); consumers.add(consumer); } for (final TransformerJob transformerJob : analysisJob.getTransformerJobs()) { final RowProcessingConsumer consumer = new TransformerConsumer(transformerJob.getDescriptor().newInstance(), transformerJob, transformerJob.getInput(), publisher); consumers.add(consumer); } for (final FilterJob filterJob : analysisJob.getFilterJobs()) { final FilterConsumer consumer = new FilterConsumer(filterJob.getDescriptor().newInstance(), filterJob, filterJob.getInput(), publisher); consumers.add(consumer); } // shuffle the list (it should work regardless of the initial sort // order) Collections.shuffle(consumers); return consumers; } }