/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.test.full.scenarios; import java.util.Arrays; import java.util.Iterator; import java.util.List; import org.apache.metamodel.DataContext; import org.apache.metamodel.schema.Column; import org.apache.metamodel.schema.Table; import org.datacleaner.api.AnalyzerResult; import org.datacleaner.api.InputColumn; import org.datacleaner.api.InputRow; import org.datacleaner.beans.StringAnalyzer; import org.datacleaner.beans.valuedist.ValueDistributionAnalyzer; import org.datacleaner.beans.valuedist.ValueDistributionAnalyzerResult; import org.datacleaner.configuration.DataCleanerConfiguration; import org.datacleaner.configuration.DataCleanerConfigurationImpl; import org.datacleaner.configuration.DataCleanerEnvironment; import org.datacleaner.configuration.DataCleanerEnvironmentImpl; import org.datacleaner.connection.Datastore; import org.datacleaner.connection.DatastoreConnection; import org.datacleaner.job.AnalysisJob; import org.datacleaner.job.builder.AnalysisJobBuilder; import org.datacleaner.job.builder.AnalyzerComponentBuilder; import org.datacleaner.job.concurrent.MultiThreadedTaskRunner; import org.datacleaner.job.concurrent.TaskRunner; import org.datacleaner.job.runner.AnalysisResultFuture; import org.datacleaner.job.runner.AnalysisRunner; import org.datacleaner.job.runner.AnalysisRunnerImpl; import org.datacleaner.result.AnnotatedRowsResult; import org.datacleaner.result.Crosstab; import org.datacleaner.result.CrosstabNavigator; import org.datacleaner.result.CrosstabResult; import org.datacleaner.result.ResultProducer; import org.datacleaner.result.ValueFrequency; import org.datacleaner.result.renderer.CrosstabTextRenderer; import org.datacleaner.test.TestHelper; import junit.framework.TestCase; public class ValueDistributionAndStringAnalysisTest extends TestCase { public void testScenario() throws Exception { final TaskRunner taskRunner = new MultiThreadedTaskRunner(5); final DataCleanerEnvironment environment = new DataCleanerEnvironmentImpl().withTaskRunner(taskRunner); final DataCleanerConfiguration configuration = new DataCleanerConfigurationImpl().withEnvironment(environment); final AnalysisRunner runner = new AnalysisRunnerImpl(configuration); final Datastore datastore = TestHelper.createSampleDatabaseDatastore("ds"); final DatastoreConnection con = datastore.openConnection(); final DataContext dc = con.getDataContext(); final AnalysisJobBuilder analysisJobBuilder = new AnalysisJobBuilder(configuration); analysisJobBuilder.setDatastoreConnection(con); final Table table = dc.getDefaultSchema().getTableByName("EMPLOYEES"); assertNotNull(table); final Column[] columns = table.getColumns(); analysisJobBuilder.addSourceColumns(columns); for (final InputColumn<?> inputColumn : analysisJobBuilder.getSourceColumns()) { final AnalyzerComponentBuilder<ValueDistributionAnalyzer> valueDistribuitionJobBuilder = analysisJobBuilder.addAnalyzer(ValueDistributionAnalyzer.class); valueDistribuitionJobBuilder.addInputColumn(inputColumn); valueDistribuitionJobBuilder.setConfiguredProperty("Record unique values", false); valueDistribuitionJobBuilder.setConfiguredProperty("Top n most frequent values", null); valueDistribuitionJobBuilder.setConfiguredProperty("Bottom n most frequent values", null); } final AnalyzerComponentBuilder<StringAnalyzer> stringAnalyzerJob = analysisJobBuilder.addAnalyzer(StringAnalyzer.class); stringAnalyzerJob.addInputColumns(analysisJobBuilder.getAvailableInputColumns(String.class)); final AnalysisJob analysisJob = analysisJobBuilder.toAnalysisJob(); analysisJobBuilder.close(); final AnalysisResultFuture resultFuture = runner.run(analysisJob); assertFalse(resultFuture.isDone()); final List<AnalyzerResult> results = resultFuture.getResults(); assertTrue(resultFuture.isDone()); // expect 1 result for each column (the value distributions) and 1 // result for the string analyzer assertEquals(table.getColumnCount() + 1, results.size()); int stringAnalyzerResults = 0; int valueDistributionResults = 0; final CrosstabResult stringAnalyzerResult = (CrosstabResult) resultFuture.getResult(stringAnalyzerJob.toAnalyzerJob()); for (final AnalyzerResult result : results) { if (result instanceof CrosstabResult) { stringAnalyzerResults++; assertTrue(result instanceof CrosstabResult); final CrosstabResult cr = (CrosstabResult) result; final Crosstab<?> crosstab = cr.getCrosstab(); assertEquals("[Column, Measures]", Arrays.toString(crosstab.getDimensionNames())); assertEquals("[LASTNAME, FIRSTNAME, EXTENSION, EMAIL, OFFICECODE, JOBTITLE]", crosstab.getDimension(0).getCategories().toString()); assertEquals("[Row count, Null count, Blank count, Entirely uppercase count, Entirely lowercase count, " + "Total char count, Max chars, Min chars, Avg chars, Max white spaces, " + "Min white spaces, Avg white spaces, Uppercase chars, " + "Uppercase chars (excl. first letters), Lowercase chars, Digit chars, " + "Diacritic chars, Non-letter chars, Word count, Max words, Min words]", crosstab.getDimension(1).getCategories().toString()); final CrosstabNavigator<?> nav = crosstab.navigate(); nav.where("Column", "EMAIL"); nav.where("Measures", "Total char count"); assertEquals("655", nav.get().toString()); } else { assertTrue(result instanceof ValueDistributionAnalyzerResult); valueDistributionResults++; } } assertEquals(1, stringAnalyzerResults); assertEquals(8, valueDistributionResults); ValueDistributionAnalyzerResult jobTitleResult = null; ValueDistributionAnalyzerResult lastnameResult = null; for (final AnalyzerResult result : results) { if (result instanceof ValueDistributionAnalyzerResult) { final ValueDistributionAnalyzerResult vdResult = (ValueDistributionAnalyzerResult) result; if ("JOBTITLE".equals(vdResult.getName())) { jobTitleResult = vdResult; } else if ("LASTNAME".equals(vdResult.getName())) { lastnameResult = vdResult; } } } assertNotNull(jobTitleResult); assertNotNull(lastnameResult); final Iterator<ValueFrequency> it = lastnameResult.getValueCounts().iterator(); assertEquals("<unique>", it.next().getName()); assertEquals("Patterson", it.next().getName()); assertEquals(2, it.next().getCount()); assertEquals(16, lastnameResult.getUniqueCount().intValue()); assertEquals(0, lastnameResult.getNullCount()); assertEquals("Sales Rep", jobTitleResult.getValueCounts().iterator().next().getValue()); final String[] resultLines = new CrosstabTextRenderer().render(stringAnalyzerResult).split("\n"); assertEquals( " LASTNAME FIRSTNAME EXTENSION EMAIL OFFICECODE JOBTITLE ", resultLines[0]); assertEquals( "Uppercase chars (excl. first letters) 0 1 0 0 0 39 ", resultLines[14]); assertEquals( "Diacritic chars 0 0 0 0 0 0 ", resultLines[17]); // do some drill-to-detail on the StringAnalyzerResult final Crosstab<?> crosstab = stringAnalyzerResult.getCrosstab(); ResultProducer resultProducer = crosstab.where("Column", "FIRSTNAME").where("Measures", "Uppercase chars (excl. first letters)") .explore(); assertNotNull(resultProducer); final AnnotatedRowsResult arr = (AnnotatedRowsResult) resultProducer.getResult(); final List<InputRow> rows = arr.getSampleRows(); assertEquals(1, rows.size()); assertEquals("Foon Yue", rows.get(0).getValue(analysisJobBuilder.getSourceColumnByName("FIRSTNAME")).toString()); resultProducer = crosstab.where("Column", "FIRSTNAME").where("Measures", "Diacritic chars").explore(); assertNull(resultProducer); con.close(); taskRunner.shutdown(); } }