/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.job.runner;

import java.util.ArrayList;
import java.util.List;

import org.apache.metamodel.query.Query;
import org.apache.metamodel.schema.Column;
import org.apache.metamodel.schema.Table;
import org.datacleaner.api.Analyzer;
import org.datacleaner.api.Filter;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.Transformer;
import org.datacleaner.beans.StringAnalyzer;
import org.datacleaner.beans.filter.NullCheckFilter;
import org.datacleaner.beans.filter.NullCheckFilter.NullCheckCategory;
import org.datacleaner.beans.standardize.EmailStandardizerTransformer;
import org.datacleaner.beans.stringpattern.PatternFinderAnalyzer;
import org.datacleaner.components.maxrows.MaxRowsFilter;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.DataCleanerConfigurationImpl;
import org.datacleaner.connection.CsvDatastore;
import org.datacleaner.connection.Datastore;
import org.datacleaner.connection.DatastoreConnection;
import org.datacleaner.data.MutableInputColumn;
import org.datacleaner.descriptors.AnalyzerDescriptor;
import org.datacleaner.descriptors.FilterDescriptor;
import org.datacleaner.descriptors.TransformerDescriptor;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.AnalyzerJob;
import org.datacleaner.job.FilterJob;
import org.datacleaner.job.TransformerJob;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.job.builder.AnalyzerComponentBuilder;
import org.datacleaner.job.builder.FilterComponentBuilder;
import org.datacleaner.job.builder.TransformerComponentBuilder;
import org.datacleaner.job.concurrent.SingleThreadedTaskRunner;
import org.datacleaner.job.concurrent.TaskRunner;
import org.datacleaner.lifecycle.LifeCycleHelper;
import org.datacleaner.test.TestHelper;

import junit.framework.TestCase;

/**
 * Tests for {@link RowProcessingQueryOptimizerImpl}.
 *
 * Every test starts from the fixture built in {@link #setUp()}: a job on the
 * "mydb" sample datastore with a {@link MaxRowsFilter} and a
 * {@link StringAnalyzer} that reads the EMPLOYEES.LASTNAME column and requires
 * the filter's VALID outcome. Individual tests then reshape the job, build the
 * list of row processing consumers by hand and assert on what the optimizer
 * reports.
 */
@SuppressWarnings("deprecation")
public class RowProcessingQueryOptimizerTest extends TestCase {

    private final LifeCycleHelper lifeCycleHelper = new LifeCycleHelper(null, true);

    private Datastore datastore;
    private DataCleanerConfiguration conf;
    private AnalysisJobBuilder ajb;
    private FilterComponentBuilder<MaxRowsFilter, MaxRowsFilter.Category> maxRowsBuilder;
    private AnalyzerComponentBuilder<StringAnalyzer> stringAnalyzerBuilder;
    private DatastoreConnection con;
    private Column lastnameColumn;
    private InputColumn<?> lastNameInputColumn;
    private ArrayList<RowProcessingConsumer> consumers;
    private Query baseQuery;
    private RowProcessingPublisher publisher;

    /**
     * Builds the shared fixture: datastore, configuration, job builder with a
     * max rows filter and a string analyzer on LASTNAME, an open datastore
     * connection, an empty consumer list and the base query
     * (LASTNAME from EMPLOYEES).
     */
    @Override
    protected void setUp() throws Exception {
        super.setUp();

        // Job skeleton: sample database, Max rows filter, String analyzer
        // that only runs for rows the filter categorizes as VALID.
        datastore = TestHelper.createSampleDatabaseDatastore("mydb");
        conf = new DataCleanerConfigurationImpl().withDatastores(datastore);
        ajb = new AnalysisJobBuilder(conf);
        ajb.setDatastore(datastore);

        maxRowsBuilder = ajb.addFilter(MaxRowsFilter.class);
        stringAnalyzerBuilder = ajb.addAnalyzer(StringAnalyzer.class);
        stringAnalyzerBuilder.setRequirement(maxRowsBuilder, MaxRowsFilter.Category.VALID);

        // Wire the LASTNAME column into the analyzer.
        con = conf.getDatastoreCatalog().getDatastore("mydb").openConnection();
        lastnameColumn = con.getSchemaNavigator().convertToColumn("EMPLOYEES.LASTNAME");
        ajb.addSourceColumn(lastnameColumn);
        lastNameInputColumn = ajb.getSourceColumnByName("lastname");
        stringAnalyzerBuilder.addInputColumn(lastNameInputColumn);

        consumers = new ArrayList<>();
        baseQuery = con.getDataContext().query().from("EMPLOYEES").select("LASTNAME").toQuery();
    }

    /**
     * Closes the datastore connection opened in {@link #setUp()}.
     */
    @Override
    protected void tearDown() throws Exception {
        super.tearDown();
        con.close();
    }

    /**
     * Converts the current state of the job builder into an analysis job and
     * returns the row processing publisher for the table of the first source
     * column.
     *
     * @return the publisher for the stream of that table
     */
    private RowProcessingPublisher createPublisher() {
        final AnalysisJob job = ajb.toAnalysisJob(false);
        final AnalysisListener infoListener = new InfoLoggingAnalysisListener();
        final ErrorAwareAnalysisListener errorAwareListener = new ErrorAwareAnalysisListener();
        final TaskRunner runner = new SingleThreadedTaskRunner();

        final RowProcessingPublishers allPublishers =
                new RowProcessingPublishers(job, infoListener, errorAwareListener, runner, lifeCycleHelper);

        final Table sourceTable = ajb.getSourceColumns().get(0).getPhysicalColumn().getTable();
        return allPublishers.getRowProcessingPublisher(allPublishers.getStream(sourceTable));
    }

    /**
     * Creates an optimizer over the current consumer list and base query.
     *
     * @param targetDatastore
     *            the datastore handed to the optimizer
     * @return a new optimizer instance
     */
    private RowProcessingQueryOptimizer optimizerFor(final Datastore targetDatastore) {
        return new RowProcessingQueryOptimizerImpl(targetDatastore, consumers, baseQuery);
    }

    /**
     * With only the max rows filter and the dependent string analyzer, the
     * job is expected to be optimizable and the optimized query to carry a
     * max rows value of 1000.
     */
    public void testSimpleOptimization() throws Exception {
        publisher = createPublisher();
        consumers.add(createConsumer(maxRowsBuilder, publisher));
        consumers.add(createConsumer(stringAnalyzerBuilder, publisher));

        final RowProcessingQueryOptimizer queryOptimizer = optimizerFor(datastore);
        assertTrue(queryOptimizer.isOptimizable());

        final Query optimized = queryOptimizer.getOptimizedQuery();
        final Integer rowLimit = optimized.getMaxRows();
        assertNotNull("No max rows specified!", rowLimit);
        assertEquals(1000, rowLimit.intValue());
    }

    /**
     * Hands a CSV datastore to the optimizer. The max rows filter alone is
     * expected to be optimizable; once a null check filter is put in front of
     * it, the job is expected not to be optimizable.
     */
    public void testAlwaysOptimizableFilter() throws Exception {
        final Datastore csvDatastore = new CsvDatastore("foo", "src/test/resources/projects.csv");

        publisher = createPublisher();
        consumers.add(createConsumer(maxRowsBuilder, publisher));
        consumers.add(createConsumer(stringAnalyzerBuilder, publisher));

        RowProcessingQueryOptimizer queryOptimizer = optimizerFor(csvDatastore);
        assertTrue(queryOptimizer.isOptimizable());

        // Make the max rows filter depend on a null check of LASTNAME and put
        // the null check first in the consumer list.
        final FilterComponentBuilder<?, ?> nullCheckBuilder =
                ajb.addFilter(NullCheckFilter.class).addInputColumn(lastNameInputColumn);
        maxRowsBuilder.setRequirement(nullCheckBuilder, NullCheckCategory.NOT_NULL);

        publisher = createPublisher();
        consumers.add(0, createConsumer(nullCheckBuilder, publisher));

        queryOptimizer = optimizerFor(csvDatastore);
        assertFalse(queryOptimizer.isOptimizable());
    }

    /**
     * Places an email standardizer between the max rows filter and the string
     * analyzer and checks optimizability in three configurations: transformer
     * without requirement (not optimizable), transformer with requirement
     * (optimizable), and analyzer without its own requirement but fed by the
     * transformer (optimizable).
     */
    public void testOptimizedChainedTransformer() throws Exception {
        final TransformerComponentBuilder<EmailStandardizerTransformer> emailBuilder =
                ajb.addTransformer(EmailStandardizerTransformer.class);
        final Column emailPhysicalColumn = con.getSchemaNavigator().convertToColumn("EMPLOYEES.EMAIL");
        ajb.addSourceColumn(emailPhysicalColumn);
        final InputColumn<?> emailSourceColumn = ajb.getSourceColumnByName("email");
        emailBuilder.addInputColumn(emailSourceColumn);

        // Point the string analyzer at the transformer's output instead of
        // the LASTNAME source column.
        stringAnalyzerBuilder.clearInputColumns();
        final List<MutableInputColumn<?>> transformedColumns = emailBuilder.getOutputColumns();
        stringAnalyzerBuilder.addInputColumns(transformedColumns);

        // Consumer order: filter, transformer, analyzer.
        publisher = createPublisher();
        consumers.add(createConsumer(maxRowsBuilder, publisher));
        consumers.add(createConsumer(emailBuilder, publisher));
        consumers.add(createConsumer(stringAnalyzerBuilder, publisher));

        RowProcessingQueryOptimizer queryOptimizer = optimizerFor(datastore);

        // The transformer has no requirement on the filter yet.
        assertFalse(queryOptimizer.isOptimizable());

        // Drop analyzer and transformer consumers, give the transformer the
        // requirement, then rebuild both.
        consumers.remove(2);
        consumers.remove(1);

        emailBuilder.setRequirement(maxRowsBuilder, MaxRowsFilter.Category.VALID);

        publisher = createPublisher();
        consumers.add(createConsumer(emailBuilder, publisher));
        consumers.add(createConsumer(stringAnalyzerBuilder, publisher));

        queryOptimizer = optimizerFor(datastore);
        assertTrue(queryOptimizer.isOptimizable());

        // Even with no requirement of its own, the string analyzer should
        // remain optimizable because of its dependency on the email
        // standardizer.
        stringAnalyzerBuilder.setRequirement(null);
        consumers.remove(2);

        publisher = createPublisher();
        consumers.add(createConsumer(stringAnalyzerBuilder, publisher));

        queryOptimizer = optimizerFor(datastore);
        assertTrue(queryOptimizer.isOptimizable());
    }

    /**
     * Adds a pattern finder on LASTNAME that has no requirement at all; the
     * job is then expected not to be optimizable.
     */
    public void testDontOptimizeWhenComponentsHaveNoRequirements() throws Exception {
        final AnalyzerComponentBuilder<PatternFinderAnalyzer> unconstrainedAnalyzer =
                ajb.addAnalyzer(PatternFinderAnalyzer.class);
        unconstrainedAnalyzer.addInputColumn(lastNameInputColumn);

        publisher = createPublisher();
        consumers.add(createConsumer(maxRowsBuilder, publisher));
        consumers.add(createConsumer(stringAnalyzerBuilder, publisher));
        consumers.add(createConsumer(unconstrainedAnalyzer, publisher));

        final RowProcessingQueryOptimizer queryOptimizer = optimizerFor(datastore);
        assertFalse(queryOptimizer.isOptimizable());
    }

    /**
     * Chains max rows filter, null check on EMAIL and string analyzer. The
     * optimizer is expected to leave a single consumer and to produce a query
     * with an "IS NOT NULL" condition on EMAIL and a max rows value of 1000.
     */
    public void testMultipleOptimizations() throws Exception {
        final FilterComponentBuilder<NullCheckFilter, NullCheckFilter.NullCheckCategory> emailNotNullBuilder =
                ajb.addFilter(NullCheckFilter.class);
        final Column emailPhysicalColumn = con.getSchemaNavigator().convertToColumn("EMPLOYEES.EMAIL");
        ajb.addSourceColumn(emailPhysicalColumn);
        final InputColumn<?> emailSourceColumn = ajb.getSourceColumnByName("email");
        emailNotNullBuilder.addInputColumn(emailSourceColumn);
        emailNotNullBuilder.setRequirement(maxRowsBuilder, MaxRowsFilter.Category.VALID);

        stringAnalyzerBuilder.setRequirement(emailNotNullBuilder, NullCheckCategory.NOT_NULL);

        publisher = createPublisher();
        consumers.add(createConsumer(maxRowsBuilder, publisher));
        consumers.add(createConsumer(emailNotNullBuilder, publisher));
        consumers.add(createConsumer(stringAnalyzerBuilder, publisher));

        final RowProcessingQueryOptimizer queryOptimizer = optimizerFor(datastore);
        assertTrue(queryOptimizer.isOptimizable());

        final List<RowProcessingConsumer> remainingConsumers = queryOptimizer.getOptimizedConsumers();
        assertEquals(1, remainingConsumers.size());

        final Query optimized = queryOptimizer.getOptimizedQuery();
        assertEquals(
                "SELECT \"EMPLOYEES\".\"LASTNAME\" FROM PUBLIC.\"EMPLOYEES\" WHERE \"EMPLOYEES\".\"EMAIL\" IS NOT NULL",
                optimized.toSql());
        assertEquals(1000, optimized.getMaxRows().intValue());
    }

    /**
     * Adds a pattern finder that requires the INVALID outcome of the max rows
     * filter while the string analyzer requires VALID; with both outcomes in
     * use the job is expected not to be optimizable.
     */
    public void testMultipleOutcomesUsed() throws Exception {
        final AnalyzerComponentBuilder<PatternFinderAnalyzer> invalidRowsAnalyzer =
                ajb.addAnalyzer(PatternFinderAnalyzer.class);
        invalidRowsAnalyzer.addInputColumn(lastNameInputColumn);
        invalidRowsAnalyzer.setRequirement(maxRowsBuilder, MaxRowsFilter.Category.INVALID);

        publisher = createPublisher();
        consumers.add(createConsumer(maxRowsBuilder, publisher));
        consumers.add(createConsumer(stringAnalyzerBuilder, publisher));
        consumers.add(createConsumer(invalidRowsAnalyzer, publisher));

        final RowProcessingQueryOptimizer queryOptimizer = optimizerFor(datastore);
        assertFalse(queryOptimizer.isOptimizable());
    }

    /**
     * Instantiates the filter described by the builder, assigns its
     * configured properties and wraps it in a {@link FilterConsumer}.
     *
     * @param filterJobBuilder
     *            builder of the filter component
     * @param publisher
     *            publisher the consumer is attached to
     * @return the consumer for the freshly created filter instance
     */
    private FilterConsumer createConsumer(final FilterComponentBuilder<?, ?> filterJobBuilder,
            final RowProcessingPublisher publisher) {
        final FilterJob job = filterJobBuilder.toFilterJob();
        final FilterDescriptor<?, ?> filterDescriptor = job.getDescriptor();
        final Filter<?> instance = filterDescriptor.newInstance();
        lifeCycleHelper.assignConfiguredProperties(filterDescriptor, instance, job.getConfiguration());
        return new FilterConsumer(instance, job, filterJobBuilder.getInput(), publisher);
    }

    /**
     * Instantiates the transformer described by the builder, assigns its
     * configured properties and wraps it in a {@link TransformerConsumer}.
     *
     * @param transformerJobBuilder
     *            builder of the transformer component
     * @param publisher
     *            publisher the consumer is attached to
     * @return the consumer for the freshly created transformer instance
     */
    private TransformerConsumer createConsumer(final TransformerComponentBuilder<?> transformerJobBuilder,
            final RowProcessingPublisher publisher) {
        final TransformerJob job = transformerJobBuilder.toTransformerJob();
        final TransformerDescriptor<?> transformerDescriptor = job.getDescriptor();
        final Transformer instance = transformerDescriptor.newInstance();
        lifeCycleHelper.assignConfiguredProperties(transformerDescriptor, instance, job.getConfiguration());
        return new TransformerConsumer(instance, job, transformerJobBuilder.getInput(), publisher);
    }

    /**
     * Instantiates the analyzer described by the builder, assigns its
     * configured properties and wraps it in an {@link AnalyzerConsumer}.
     *
     * @param analyzerBuilder
     *            builder of the analyzer component
     * @param publisher
     *            publisher the consumer is attached to
     * @return the consumer for the freshly created analyzer instance
     */
    private AnalyzerConsumer createConsumer(final AnalyzerComponentBuilder<?> analyzerBuilder,
            final RowProcessingPublisher publisher) {
        final AnalyzerJob job = analyzerBuilder.toAnalyzerJob();
        final AnalyzerDescriptor<?> analyzerDescriptor = job.getDescriptor();
        final Analyzer<?> instance = analyzerDescriptor.newInstance();
        lifeCycleHelper.assignConfiguredProperties(analyzerDescriptor, instance, job.getConfiguration());
        return new AnalyzerConsumer(instance, job, analyzerBuilder.getInput(), publisher);
    }
}