/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.test.full.scenarios; import java.util.Collection; import java.util.List; import java.util.TreeSet; import org.apache.metamodel.DataContext; import org.apache.metamodel.schema.Column; import org.apache.metamodel.schema.Table; import org.datacleaner.api.AnalyzerResult; import org.datacleaner.api.InputColumn; import org.datacleaner.beans.transform.TokenizerTransformer; import org.datacleaner.beans.valuedist.ValueDistributionAnalyzer; import org.datacleaner.beans.valuedist.ValueDistributionAnalyzerResult; import org.datacleaner.configuration.DataCleanerConfiguration; import org.datacleaner.configuration.DataCleanerConfigurationImpl; import org.datacleaner.configuration.DataCleanerEnvironment; import org.datacleaner.configuration.DataCleanerEnvironmentImpl; import org.datacleaner.connection.Datastore; import org.datacleaner.connection.DatastoreConnection; import org.datacleaner.data.MutableInputColumn; import org.datacleaner.job.AnalysisJob; import org.datacleaner.job.builder.AnalysisJobBuilder; import org.datacleaner.job.builder.AnalyzerComponentBuilder; import org.datacleaner.job.builder.TransformerComponentBuilder; import org.datacleaner.job.concurrent.MultiThreadedTaskRunner; import org.datacleaner.job.concurrent.TaskRunner; import org.datacleaner.job.runner.AnalysisResultFuture; import org.datacleaner.job.runner.AnalysisRunner; import org.datacleaner.job.runner.AnalysisRunnerImpl; import org.datacleaner.test.TestHelper; import junit.framework.TestCase; public class TokenizerAndValueDistributionTest extends TestCase { public void testScenario() throws Throwable { final TaskRunner taskRunner = new MultiThreadedTaskRunner(5); final DataCleanerEnvironment environment = new DataCleanerEnvironmentImpl().withTaskRunner(taskRunner); final DataCleanerConfiguration configuration = new DataCleanerConfigurationImpl().withEnvironment(environment); final AnalysisRunner runner = new AnalysisRunnerImpl(configuration); final Datastore datastore = TestHelper.createSampleDatabaseDatastore("ds"); final DatastoreConnection con = datastore.openConnection(); final DataContext dc = con.getDataContext(); final AnalysisJobBuilder analysisJobBuilder = new AnalysisJobBuilder(configuration); analysisJobBuilder.setDatastoreConnection(con); final Table table = dc.getDefaultSchema().getTableByName("EMPLOYEES"); assertNotNull(table); final Column jobTitleColumn = table.getColumnByName("JOBTITLE"); assertNotNull(jobTitleColumn); analysisJobBuilder.addSourceColumns(jobTitleColumn); final TransformerComponentBuilder<TokenizerTransformer> transformerJobBuilder = analysisJobBuilder.addTransformer(TokenizerTransformer.class); transformerJobBuilder.addInputColumn(analysisJobBuilder.getSourceColumns().get(0)); transformerJobBuilder.setConfiguredProperty("Number of tokens", 4); final List<MutableInputColumn<?>> transformerOutput = transformerJobBuilder.getOutputColumns(); assertEquals(4, transformerOutput.size()); transformerOutput.get(0).setName("first word"); transformerOutput.get(1).setName("second word"); transformerOutput.get(2).setName("third words"); transformerOutput.get(3).setName("fourth words"); for (final InputColumn<?> inputColumn : transformerOutput) { final AnalyzerComponentBuilder<ValueDistributionAnalyzer> valueDistribuitionJobBuilder = analysisJobBuilder.addAnalyzer(ValueDistributionAnalyzer.class); valueDistribuitionJobBuilder.addInputColumn(inputColumn); valueDistribuitionJobBuilder.setConfiguredProperty("Record unique values", true); valueDistribuitionJobBuilder.setConfiguredProperty("Top n most frequent values", null); valueDistribuitionJobBuilder.setConfiguredProperty("Bottom n most frequent values", null); } final AnalysisJob analysisJob = analysisJobBuilder.toAnalysisJob(); analysisJobBuilder.close(); final AnalysisResultFuture resultFuture = runner.run(analysisJob); assertFalse(resultFuture.isDone()); final List<AnalyzerResult> results = resultFuture.getResults(); assertTrue(resultFuture.isDone()); if (!resultFuture.isSuccessful()) { final List<Throwable> errors = resultFuture.getErrors(); throw errors.get(0); } // expect 1 result for each token assertEquals(4, results.size()); for (final AnalyzerResult analyzerResult : results) { final ValueDistributionAnalyzerResult result = (ValueDistributionAnalyzerResult) analyzerResult; final Collection<String> uniqueValues = new TreeSet<>(result.getUniqueValues()); if ("first word".equals(result.getName())) { assertEquals("[[Sales->19], [VP->2], [<unique>->2]]", result.getValueCounts().toString()); assertEquals(0, result.getNullCount()); assertEquals(2, result.getUniqueCount().intValue()); } else if ("second word".equals(result.getName())) { assertEquals("[[Rep->17], [Manager->3], [<unique>->2], [<null>->1]]", result.getValueCounts().toString()); assertEquals(1, result.getNullCount()); assertEquals(2, result.getUniqueCount().intValue()); } else if ("third words".equals(result.getName())) { assertEquals("[[<null>->20], [<unique>->3]]", result.getValueCounts().toString()); assertEquals(20, result.getNullCount()); assertEquals(3, result.getUniqueCount().intValue()); assertEquals("[(EMEA), (JAPAN,, (NA)]", uniqueValues.toString()); } else if ("fourth words".equals(result.getName())) { assertEquals("[[<null>->22], [<unique>->1]]", result.getValueCounts().toString()); assertEquals(22, result.getNullCount()); assertEquals(1, result.getUniqueCount().intValue()); assertEquals("[APAC)]", uniqueValues.toString()); } else { fail("Unexpected columnName: " + result.getName()); } } con.close(); taskRunner.shutdown(); } }