/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.beans; import static org.junit.Assert.*; import java.io.File; import java.util.ArrayList; import java.util.Collection; import org.datacleaner.api.InputColumn; import org.datacleaner.components.maxrows.MaxRowsFilter; import org.datacleaner.components.maxrows.MaxRowsFilter.Category; import org.datacleaner.configuration.DataCleanerConfigurationImpl; import org.datacleaner.connection.Datastore; import org.datacleaner.connection.DatastoreCatalogImpl; import org.datacleaner.job.AnalysisJob; import org.datacleaner.job.builder.AnalysisJobBuilder; import org.datacleaner.job.builder.AnalyzerComponentBuilder; import org.datacleaner.job.builder.FilterComponentBuilder; import org.datacleaner.job.runner.AnalysisResultFuture; import org.datacleaner.job.runner.AnalysisRunnerImpl; import org.datacleaner.reference.SynonymCatalog; import org.datacleaner.reference.TextFileSynonymCatalog; import org.datacleaner.result.Crosstab; import org.datacleaner.result.renderer.CrosstabTextRenderer; import org.datacleaner.test.TestHelper; import org.junit.Test; public class ReferenceDataMatcherAnalyzerReducerTest { @Test public void test() throws Throwable { final AnalysisJobBuilder jobBuilder = getAnalysisJobBuilder(); final AnalysisJobBuilder jobBuilder1 = getAnalysisJobBuilder(); final AnalysisJobBuilder jobBuilder2 = getAnalysisJobBuilder(); final AnalysisJobBuilder jobBuilder3 = getAnalysisJobBuilder(); final BooleanAnalyzerResult fullResult = getPartialResult(jobBuilder, 1, 23); final BooleanAnalyzerResult partialResult1 = getPartialResult(jobBuilder1, 1, 12); final BooleanAnalyzerResult partialResult2 = getPartialResult(jobBuilder2, 13, 23); final BooleanAnalyzerResult partialResult3 = getPartialResult(jobBuilder3, 24, null); // assert first job { final Crosstab<Number> partialResult1columnStatisticsCrosstab = partialResult1.getColumnStatisticsCrosstab(); final String[] resultLines1 = new CrosstabTextRenderer().render(partialResult1columnStatisticsCrosstab).split("\n"); assertEquals(5, resultLines1.length); assertEquals(" JOBTITLE in Job Titles ", resultLines1[0]); assertEquals("Row count 12 ", resultLines1[1]); assertEquals("Null count 0 ", resultLines1[2]); assertEquals("True count 9 ", resultLines1[3]); assertEquals("False count 3 ", resultLines1[4]); // assert the value combination crosstab assertNull(partialResult1.getValueCombinationCrosstab()); } // assert second job { final Crosstab<Number> partialResult2columnStatisticsCrosstab = partialResult2.getColumnStatisticsCrosstab(); final String[] resultLines2 = new CrosstabTextRenderer().render(partialResult2columnStatisticsCrosstab).split("\n"); assertEquals(5, resultLines2.length); assertEquals(" JOBTITLE in Job Titles ", resultLines2[0]); assertEquals("Row count 11 ", resultLines2[1]); assertEquals("Null count 0 ", resultLines2[2]); assertEquals("True count 11 ", resultLines2[3]); assertEquals("False count 0 ", resultLines2[4]); // assert the value combination crosstab assertNull(partialResult2.getValueCombinationCrosstab()); } // assert third job { final Crosstab<Number> partialResult3columnStatisticsCrosstab = partialResult3.getColumnStatisticsCrosstab(); final String[] resultLines3 = new CrosstabTextRenderer().render(partialResult3columnStatisticsCrosstab).split("\n"); assertEquals(5, resultLines3.length); assertEquals(" JOBTITLE in Job Titles ", resultLines3[0]); assertEquals("Row count 0 ", resultLines3[1]); assertEquals("Null count 0 ", resultLines3[2]); assertEquals("True count 0 ", resultLines3[3]); assertEquals("False count 0 ", resultLines3[4]); // assert the value combination crosstab assertNull(partialResult3.getValueCombinationCrosstab()); } // assert full job result final Crosstab<Number> fullResultcolumnStatisticsCrosstab = fullResult.getColumnStatisticsCrosstab(); final String[] resultLinesFull = new CrosstabTextRenderer().render(fullResultcolumnStatisticsCrosstab).split("\n"); assertEquals(5, resultLinesFull.length); assertEquals(" JOBTITLE in Job Titles ", resultLinesFull[0]); assertEquals("Row count 23 ", resultLinesFull[1]); assertEquals("Null count 0 ", resultLinesFull[2]); assertEquals("True count 20 ", resultLinesFull[3]); assertEquals("False count 3 ", resultLinesFull[4]); // assert the value combination crosstab assertNull(fullResult.getValueCombinationCrosstab()); assertEquals(0, fullResult.getNullCount().getValue(BooleanAnalyzer.MEASURE_NULL_COUNT).intValue()); assertEquals(0, fullResult.getFalseCount().getValue(BooleanAnalyzer.MEASURE_FALSE_COUNT).intValue()); assertEquals(23, fullResult.getRowCount().intValue()); assertEquals(0, fullResult.getTrueCount().getValue(BooleanAnalyzer.MEASURE_TRUE_COUNT).intValue()); final Collection<BooleanAnalyzerResult> partialsResults = new ArrayList<>(); partialsResults.add(partialResult1); partialsResults.add(partialResult2); partialsResults.add(partialResult3); final BooleanAnalyzerReducer reducer = new BooleanAnalyzerReducer(); final BooleanAnalyzerResult reducedResult = reducer.reduce(partialsResults); // Assert the reduced values final Crosstab<Number> columnStatisticsCrosstab = reducedResult.getColumnStatisticsCrosstab(); final String[] reducerLinesResults = new CrosstabTextRenderer().render(columnStatisticsCrosstab).split("\n"); assertEquals(5, reducerLinesResults.length); assertEquals(" JOBTITLE in Job Titles ", reducerLinesResults[0]); assertEquals("Row count 23 ", reducerLinesResults[1]); assertEquals("Null count 0 ", reducerLinesResults[2]); assertEquals("True count 20 ", reducerLinesResults[3]); assertEquals("False count 3 ", reducerLinesResults[4]); assertEquals(0, reducedResult.getNullCount().getValue(BooleanAnalyzer.MEASURE_NULL_COUNT).intValue()); assertEquals(0, reducedResult.getFalseCount().getValue(BooleanAnalyzer.MEASURE_FALSE_COUNT).intValue()); assertEquals(23, reducedResult.getRowCount().intValue()); assertEquals(0, reducedResult.getTrueCount().getValue(BooleanAnalyzer.MEASURE_TRUE_COUNT).intValue()); // compare reduced result with full results { assertEquals(resultLinesFull.length, reducerLinesResults.length); assertEquals(resultLinesFull[0], reducerLinesResults[0]); assertEquals(resultLinesFull[1], reducerLinesResults[1]); assertEquals(resultLinesFull[2], reducerLinesResults[2]); assertEquals(resultLinesFull[3], reducerLinesResults[3]); assertEquals(resultLinesFull[4], reducerLinesResults[4]); } } private AnalysisJobBuilder getAnalysisJobBuilder() { final Datastore datastore = TestHelper.createSampleDatabaseDatastore("orderdb"); final DataCleanerConfigurationImpl configuration = new DataCleanerConfigurationImpl().withDatastoreCatalog(new DatastoreCatalogImpl(datastore)); final AnalysisJobBuilder jobBuilder = new AnalysisJobBuilder(configuration); jobBuilder.setDatastore(datastore); jobBuilder.addSourceColumns("employees.JOBTITLE"); return jobBuilder; } private BooleanAnalyzerResult getPartialResult(final AnalysisJobBuilder jobBuilder, final Integer firstRow, final Integer maxRows) throws Throwable { final InputColumn<?> jobTitleColumn = jobBuilder.getSourceColumnByName("JOBTITLE"); final FilterComponentBuilder<MaxRowsFilter, Category> maxRowsFilter = jobBuilder.addFilter(MaxRowsFilter.class); maxRowsFilter.addInputColumn(jobTitleColumn); if (firstRow != null) { maxRowsFilter.setConfiguredProperty("First row", firstRow); } if (maxRows != null) { maxRowsFilter.setConfiguredProperty("Max rows", maxRows); } final AnalyzerComponentBuilder<ReferenceDataMatcherAnalyzer> referenceDataMatcherAnalyzer = jobBuilder.addAnalyzer(ReferenceDataMatcherAnalyzer.class); referenceDataMatcherAnalyzer.setRequirement(maxRowsFilter.getFilterOutcome(MaxRowsFilter.Category.VALID)); final ReferenceDataMatcherAnalyzer referenceDataMatcher = referenceDataMatcherAnalyzer.getComponentInstance(); referenceDataMatcher.columns = new InputColumn<?>[] { jobTitleColumn }; final File file = new File("src/test/resources/synonym_titles_test.txt"); assertTrue(file.exists()); final TextFileSynonymCatalog jobTitlesCatalog = new TextFileSynonymCatalog("Job Titles", file, true, "UTF-8"); referenceDataMatcher.synonymCatalogs = new SynonymCatalog[] { jobTitlesCatalog }; final AnalysisJob analysisJob = jobBuilder.toAnalysisJob(); jobBuilder.close(); final AnalysisResultFuture resultFuture = new AnalysisRunnerImpl(jobBuilder.getConfiguration()).run(analysisJob); resultFuture.await(); if (resultFuture.isErrornous()) { throw resultFuture.getErrors().get(0); } return resultFuture.getResults(BooleanAnalyzerResult.class).get(0); } }