/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.beans; import java.util.Collections; import java.util.List; import org.apache.metamodel.pojo.TableDataProvider; import org.apache.metamodel.schema.ColumnType; import org.apache.metamodel.schema.MutableColumn; import org.apache.metamodel.util.Resource; import org.apache.metamodel.util.UrlResource; import org.datacleaner.api.InputColumn; import org.datacleaner.api.OutputDataStream; import org.datacleaner.beans.CompletenessAnalyzer.Condition; import org.datacleaner.configuration.DataCleanerConfiguration; import org.datacleaner.configuration.DataCleanerConfigurationImpl; import org.datacleaner.configuration.DataCleanerEnvironment; import org.datacleaner.configuration.DataCleanerEnvironmentImpl; import org.datacleaner.connection.CsvDatastore; import org.datacleaner.connection.Datastore; import org.datacleaner.connection.PojoDatastore; import org.datacleaner.data.MetaModelInputColumn; import org.datacleaner.data.MockInputColumn; import org.datacleaner.data.MockInputRow; import org.datacleaner.descriptors.AnalyzerDescriptor; import org.datacleaner.descriptors.Descriptors; import org.datacleaner.job.AnalysisJob; import org.datacleaner.job.AnalyzerJob; import org.datacleaner.job.OutputDataStreamJob; import org.datacleaner.job.builder.AnalysisJobBuilder; import org.datacleaner.job.builder.AnalyzerComponentBuilder; import org.datacleaner.job.concurrent.MultiThreadedTaskRunner; import org.datacleaner.job.runner.AnalysisResultFuture; import org.datacleaner.job.runner.AnalysisRunnerImpl; import org.datacleaner.result.ListResult; import org.datacleaner.test.MockAnalyzer; import junit.framework.TestCase; public class CompletenessAnalyzerTest extends TestCase { public void testIsDistributable() throws Exception { final AnalyzerDescriptor<CompletenessAnalyzer> descriptor = Descriptors.ofAnalyzer(CompletenessAnalyzer.class); assertTrue(descriptor.isDistributable()); } public void testAllFieldsEvaluationMode() throws Exception { final InputColumn<?> col1 = new MockInputColumn<String>("foo"); final InputColumn<?> col2 = new MockInputColumn<String>("bar"); final CompletenessAnalyzer analyzer = new CompletenessAnalyzer(); analyzer._evaluationMode = CompletenessAnalyzer.EvaluationMode.ALL_FIELDS; analyzer._valueColumns = new InputColumn[] { col1, col2 }; analyzer._conditions = new CompletenessAnalyzer.Condition[] { CompletenessAnalyzer.Condition.NOT_NULL, CompletenessAnalyzer.Condition.NOT_NULL }; analyzer.init(); analyzer.run(new MockInputRow(1001).put(col1, null).put(col2, null), 1); analyzer.run(new MockInputRow(1002).put(col1, "hello").put(col2, null), 1); analyzer.run(new MockInputRow(1002).put(col1, null).put(col2, "world"), 1); analyzer.run(new MockInputRow(1002).put(col1, "hello").put(col2, "world"), 1); assertEquals(4, analyzer.getResult().getTotalRowCount()); assertEquals(1, analyzer.getResult().getInvalidRowCount()); assertEquals(3, analyzer.getResult().getValidRowCount()); } public void testSomeFieldsRequiredToBeNullFieldsEvaluationMode() throws Exception { final InputColumn<?> col1 = new MockInputColumn<String>("foo"); final InputColumn<?> col2 = new MockInputColumn<String>("bar"); final InputColumn<?> col3 = new MockInputColumn<String>("baz"); final CompletenessAnalyzer analyzer = new CompletenessAnalyzer(); analyzer._valueColumns = new InputColumn[] { col1, col2, col3 }; analyzer._conditions = new CompletenessAnalyzer.Condition[] { CompletenessAnalyzer.Condition.NOT_BLANK_OR_NULL, CompletenessAnalyzer.Condition.NULL, CompletenessAnalyzer.Condition.BLANK_OR_NULL }; analyzer.init(); analyzer.run(new MockInputRow().put(col1, null).put(col2, null), 1); assertEquals(1, analyzer.getResult().getInvalidRowCount()); assertEquals(0, analyzer.getResult().getValidRowCount()); analyzer.run(new MockInputRow().put(col1, "hello").put(col2, null), 1); assertEquals(1, analyzer.getResult().getInvalidRowCount()); assertEquals(1, analyzer.getResult().getValidRowCount()); analyzer.run(new MockInputRow().put(col1, null).put(col2, "world"), 1); assertEquals(2, analyzer.getResult().getInvalidRowCount()); assertEquals(1, analyzer.getResult().getValidRowCount()); analyzer.run(new MockInputRow().put(col1, "hello").put(col2, "world"), 1); assertEquals(3, analyzer.getResult().getInvalidRowCount()); assertEquals(1, analyzer.getResult().getValidRowCount()); analyzer.run(new MockInputRow().put(col1, "hello").put(col2, "").put(col3, ""), 1); assertEquals(4, analyzer.getResult().getInvalidRowCount()); assertEquals(1, analyzer.getResult().getValidRowCount()); analyzer.run(new MockInputRow().put(col1, "hello").put(col2, null).put(col3, ""), 1); assertEquals(4, analyzer.getResult().getInvalidRowCount()); assertEquals(2, analyzer.getResult().getValidRowCount()); analyzer.run(new MockInputRow().put(col1, "hello").put(col2, null).put(col3, null), 1); assertEquals(4, analyzer.getResult().getInvalidRowCount()); assertEquals(3, analyzer.getResult().getValidRowCount()); } public void testConfigurableBeanConfiguration() throws Exception { try (AnalysisJobBuilder ajb = new AnalysisJobBuilder(new DataCleanerConfigurationImpl())) { final List<TableDataProvider<?>> tableDataProviders = Collections.emptyList(); ajb.setDatastore(new PojoDatastore("ds", tableDataProviders)); ajb.addSourceColumn(new MutableColumn("foo", ColumnType.VARCHAR)); final AnalyzerComponentBuilder<CompletenessAnalyzer> analyzer = ajb.addAnalyzer(CompletenessAnalyzer.class); analyzer.getComponentInstance().setValueColumns(ajb.getSourceColumns().toArray(new InputColumn[0])); analyzer.getComponentInstance().fillAllConditions(Condition.NOT_BLANK_OR_NULL); assertTrue(analyzer.isConfigured(true)); } } public void testSimpleScenario() throws Exception { final InputColumn<?> col1 = new MockInputColumn<String>("foo"); final InputColumn<?> col2 = new MockInputColumn<String>("bar"); final InputColumn<?> col3 = new MockInputColumn<String>("baz"); final CompletenessAnalyzer analyzer = new CompletenessAnalyzer(); analyzer._valueColumns = new InputColumn[] { col1, col2, col3 }; analyzer._conditions = new CompletenessAnalyzer.Condition[] { CompletenessAnalyzer.Condition.NOT_NULL, CompletenessAnalyzer.Condition.NOT_BLANK_OR_NULL, CompletenessAnalyzer.Condition.NOT_NULL }; analyzer.init(); analyzer.run(new MockInputRow(1001).put(col1, null).put(col2, null).put(col3, null), 1); analyzer.run(new MockInputRow(1002).put(col1, "").put(col2, "").put(col3, ""), 1); assertEquals(2, analyzer.getResult().getTotalRowCount()); assertEquals(0, analyzer.getResult().getValidRowCount()); assertEquals(2, analyzer.getResult().getInvalidRowCount()); analyzer.run(new MockInputRow(1002).put(col1, "").put(col2, "not blank").put(col3, ""), 1); analyzer.run(new MockInputRow(1002).put(col1, "not blank").put(col2, "not blank").put(col3, "not blank"), 1); assertEquals(4, analyzer.getResult().getTotalRowCount()); assertEquals(2, analyzer.getResult().getValidRowCount()); assertEquals(2, analyzer.getResult().getInvalidRowCount()); analyzer.run(new MockInputRow(1002).put(col1, null).put(col2, "not blank").put(col3, ""), 1); assertEquals(5, analyzer.getResult().getTotalRowCount()); assertEquals(2, analyzer.getResult().getValidRowCount()); assertEquals(3, analyzer.getResult().getInvalidRowCount()); } public void testOutputDataStream() throws Throwable { final MultiThreadedTaskRunner taskRunner = new MultiThreadedTaskRunner(16); final DataCleanerEnvironment environment = new DataCleanerEnvironmentImpl().withTaskRunner(taskRunner); final Resource file = new UrlResource(this.getClass().getResource("/completeness_output_stream_test.csv")); final Datastore datastore = new CsvDatastore("testoutputdatastream", file); final DataCleanerConfiguration configuration = new DataCleanerConfigurationImpl().withDatastores(datastore).withEnvironment(environment); final AnalysisJob job; try (AnalysisJobBuilder ajb = new AnalysisJobBuilder(configuration)) { ajb.setDatastore(datastore); ajb.addSourceColumns("A"); ajb.addSourceColumns("B"); ajb.addSourceColumns("C"); final AnalyzerComponentBuilder<CompletenessAnalyzer> analyzer1 = ajb.addAnalyzer(CompletenessAnalyzer.class); final List<MetaModelInputColumn> sourceColumns = ajb.getSourceColumns(); analyzer1.setName("analyzer1"); analyzer1.addInputColumns(sourceColumns); analyzer1.setConfiguredProperty(CompletenessAnalyzer.PROPERTY_EVALUATION_MODE, CompletenessAnalyzer.EvaluationMode.ANY_FIELD); analyzer1.setConfiguredProperty(CompletenessAnalyzer.PROPERTY_CONDITIONS, new CompletenessAnalyzer.Condition[] { Condition.NOT_BLANK_OR_NULL, Condition.NOT_BLANK_OR_NULL, Condition.NOT_BLANK_OR_NULL }); assertTrue(analyzer1.isConfigured()); final OutputDataStream completeStream = analyzer1.getOutputDataStream(CompletenessAnalyzer.OUTPUT_STREAM_COMPLETE); assertNotNull(completeStream); final OutputDataStream incompleteStream = analyzer1.getOutputDataStream(CompletenessAnalyzer.OUTPUT_STREAM_INCOMPLETE); assertNotNull(incompleteStream); final AnalysisJobBuilder completeDataStreamJobBuilder = analyzer1.getOutputDataStreamJobBuilder(completeStream); final List<MetaModelInputColumn> completeDataStreamColumns = completeDataStreamJobBuilder.getSourceColumns(); assertEquals(3, completeDataStreamColumns.size()); assertEquals("MetaModelInputColumn[" + CompletenessAnalyzer.OUTPUT_STREAM_COMPLETE + ".A]", completeDataStreamColumns.get(0).toString()); assertEquals("MetaModelInputColumn[" + CompletenessAnalyzer.OUTPUT_STREAM_COMPLETE + ".B]", completeDataStreamColumns.get(1).toString()); assertEquals("MetaModelInputColumn[" + CompletenessAnalyzer.OUTPUT_STREAM_COMPLETE + ".C]", completeDataStreamColumns.get(2).toString()); final AnalysisJobBuilder incompleteDataStreamJobBuilder = analyzer1.getOutputDataStreamJobBuilder(incompleteStream); final List<MetaModelInputColumn> incompleteDataStreamColumns = incompleteDataStreamJobBuilder.getSourceColumns(); assertEquals(3, incompleteDataStreamColumns.size()); assertEquals("MetaModelInputColumn[" + CompletenessAnalyzer.OUTPUT_STREAM_INCOMPLETE + ".A]", incompleteDataStreamColumns.get(0).toString()); assertEquals("MetaModelInputColumn[" + CompletenessAnalyzer.OUTPUT_STREAM_INCOMPLETE + ".B]", incompleteDataStreamColumns.get(1).toString()); assertEquals("MetaModelInputColumn[" + CompletenessAnalyzer.OUTPUT_STREAM_INCOMPLETE + ".C]", incompleteDataStreamColumns.get(2).toString()); final AnalyzerComponentBuilder<MockAnalyzer> analyzer2 = completeDataStreamJobBuilder.addAnalyzer(MockAnalyzer.class); analyzer2.addInputColumns(completeDataStreamColumns); analyzer2.setName("analyzer2"); assertTrue(analyzer2.isConfigured()); assertTrue(analyzer1.isOutputDataStreamConsumed(completeStream)); final AnalyzerComponentBuilder<MockAnalyzer> analyzer3 = incompleteDataStreamJobBuilder.addAnalyzer(MockAnalyzer.class); analyzer3.addInputColumns(incompleteDataStreamColumns); analyzer3.setName("analyzer3"); assertTrue(analyzer3.isConfigured()); assertTrue(analyzer1.isOutputDataStreamConsumed(incompleteStream)); job = ajb.toAnalysisJob(); } final AnalyzerJob analyzerJob1 = job.getAnalyzerJobs().get(0); final OutputDataStreamJob[] outputDataStreamJobs = analyzerJob1.getOutputDataStreamJobs(); final AnalyzerJob analyzerJob2 = outputDataStreamJobs[0].getJob().getAnalyzerJobs().get(0); final AnalyzerJob analyzerJob3 = outputDataStreamJobs[1].getJob().getAnalyzerJobs().get(0); // now run the job(s) final AnalysisRunnerImpl runner = new AnalysisRunnerImpl(configuration); final AnalysisResultFuture resultFuture = runner.run(job); resultFuture.await(); if (resultFuture.isErrornous()) { throw resultFuture.getErrors().get(0); } assertEquals(3, resultFuture.getResults().size()); final CompletenessAnalyzerResult result1 = (CompletenessAnalyzerResult) resultFuture.getResult(analyzerJob1); assertNotNull(result1); assertEquals(1, result1.getValidRowCount()); assertEquals(1, result1.getInvalidRowCount()); final ListResult<?> result2 = (ListResult<?>) resultFuture.getResult(analyzerJob2); assertNotNull(result2); assertEquals(1, result2.getValues().size()); assertEquals("MetaModelInputRow[Row[values=[a, b, c]]]", result2.getValues().get(0).toString()); final ListResult<?> result3 = (ListResult<?>) resultFuture.getResult(analyzerJob3); assertNotNull(result3); assertEquals(1, result3.getValues().size()); assertEquals("MetaModelInputRow[Row[values=[, , ]]]", result3.getValues().get(0).toString()); } public void testOutputDataStreamWithAdditionalFields() throws Throwable { final MultiThreadedTaskRunner taskRunner = new MultiThreadedTaskRunner(16); final DataCleanerEnvironment environment = new DataCleanerEnvironmentImpl().withTaskRunner(taskRunner); final Resource file = new UrlResource(this.getClass().getResource("/completeness_output_stream_test.csv")); final Datastore datastore = new CsvDatastore("testoutputdatastream", file); final DataCleanerConfiguration configuration = new DataCleanerConfigurationImpl().withDatastores(datastore).withEnvironment(environment); final AnalysisJob job; try (AnalysisJobBuilder ajb = new AnalysisJobBuilder(configuration)) { ajb.setDatastore(datastore); ajb.addSourceColumns("A"); ajb.addSourceColumns("B"); ajb.addSourceColumns("C"); final AnalyzerComponentBuilder<CompletenessAnalyzer> analyzer1 = ajb.addAnalyzer(CompletenessAnalyzer.class); analyzer1.setName("analyzer1"); analyzer1.addInputColumns(ajb.getSourceColumnByName("A")); analyzer1.setConfiguredProperty(CompletenessAnalyzer.PROPERTY_ADDITIONAL_OUTPUT_VALUES, new InputColumn[] { ajb.getSourceColumnByName("A"), ajb.getSourceColumnByName("C") }); analyzer1.setConfiguredProperty(CompletenessAnalyzer.PROPERTY_EVALUATION_MODE, CompletenessAnalyzer.EvaluationMode.ANY_FIELD); analyzer1.setConfiguredProperty(CompletenessAnalyzer.PROPERTY_CONDITIONS, new CompletenessAnalyzer.Condition[] { Condition.NOT_BLANK_OR_NULL, Condition.NOT_BLANK_OR_NULL, Condition.NOT_BLANK_OR_NULL }); assertTrue(analyzer1.isConfigured()); final OutputDataStream completeStream = analyzer1.getOutputDataStream(CompletenessAnalyzer.OUTPUT_STREAM_COMPLETE); assertNotNull(completeStream); final OutputDataStream incompleteStream = analyzer1.getOutputDataStream(CompletenessAnalyzer.OUTPUT_STREAM_INCOMPLETE); assertNotNull(incompleteStream); final AnalysisJobBuilder completeDataStreamJobBuilder = analyzer1.getOutputDataStreamJobBuilder(completeStream); final List<MetaModelInputColumn> completeDataStreamColumns = completeDataStreamJobBuilder.getSourceColumns(); assertEquals(2, completeDataStreamColumns.size()); assertEquals("MetaModelInputColumn[" + CompletenessAnalyzer.OUTPUT_STREAM_COMPLETE + ".A]", completeDataStreamColumns.get(0).toString()); assertEquals("MetaModelInputColumn[" + CompletenessAnalyzer.OUTPUT_STREAM_COMPLETE + ".C]", completeDataStreamColumns.get(1).toString()); final AnalysisJobBuilder incompleteDataStreamJobBuilder = analyzer1.getOutputDataStreamJobBuilder(incompleteStream); final List<MetaModelInputColumn> incompleteDataStreamColumns = incompleteDataStreamJobBuilder.getSourceColumns(); assertEquals(2, incompleteDataStreamColumns.size()); assertEquals("MetaModelInputColumn[" + CompletenessAnalyzer.OUTPUT_STREAM_INCOMPLETE + ".A]", incompleteDataStreamColumns.get(0).toString()); assertEquals("MetaModelInputColumn[" + CompletenessAnalyzer.OUTPUT_STREAM_INCOMPLETE + ".C]", incompleteDataStreamColumns.get(1).toString()); final AnalyzerComponentBuilder<MockAnalyzer> analyzer2 = completeDataStreamJobBuilder.addAnalyzer(MockAnalyzer.class); analyzer2.addInputColumns(completeDataStreamColumns); analyzer2.setName("analyzer2"); assertTrue(analyzer2.isConfigured()); assertTrue(analyzer1.isOutputDataStreamConsumed(completeStream)); final AnalyzerComponentBuilder<MockAnalyzer> analyzer3 = incompleteDataStreamJobBuilder.addAnalyzer(MockAnalyzer.class); analyzer3.addInputColumns(incompleteDataStreamColumns); analyzer3.setName("analyzer3"); assertTrue(analyzer3.isConfigured()); assertTrue(analyzer1.isOutputDataStreamConsumed(incompleteStream)); job = ajb.toAnalysisJob(); } final AnalyzerJob analyzerJob1 = job.getAnalyzerJobs().get(0); final OutputDataStreamJob[] outputDataStreamJobs = analyzerJob1.getOutputDataStreamJobs(); final AnalyzerJob analyzerJob2 = outputDataStreamJobs[0].getJob().getAnalyzerJobs().get(0); final AnalyzerJob analyzerJob3 = outputDataStreamJobs[1].getJob().getAnalyzerJobs().get(0); // now run the job(s) final AnalysisRunnerImpl runner = new AnalysisRunnerImpl(configuration); final AnalysisResultFuture resultFuture = runner.run(job); resultFuture.await(); if (resultFuture.isErrornous()) { throw resultFuture.getErrors().get(0); } assertEquals(3, resultFuture.getResults().size()); final CompletenessAnalyzerResult result1 = (CompletenessAnalyzerResult) resultFuture.getResult(analyzerJob1); assertNotNull(result1); assertEquals(1, result1.getValidRowCount()); assertEquals(1, result1.getInvalidRowCount()); final ListResult<?> result2 = (ListResult<?>) resultFuture.getResult(analyzerJob2); assertNotNull(result2); assertEquals(1, result2.getValues().size()); assertEquals("MetaModelInputRow[Row[values=[a, c]]]", result2.getValues().get(0).toString()); final ListResult<?> result3 = (ListResult<?>) resultFuture.getResult(analyzerJob3); assertNotNull(result3); assertEquals(1, result3.getValues().size()); assertEquals("MetaModelInputRow[Row[values=[, ]]]", result3.getValues().get(0).toString()); } }