/** * AnalyzerBeans * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.eobjects.analyzer.test.full.scenarios; import java.util.Collection; import java.util.List; import java.util.TreeSet; import junit.framework.TestCase; import org.eobjects.analyzer.beans.transform.TokenizerTransformer; import org.eobjects.analyzer.beans.valuedist.ValueDistributionAnalyzer; import org.eobjects.analyzer.beans.valuedist.ValueDistributionAnalyzerResult; import org.eobjects.analyzer.configuration.AnalyzerBeansConfiguration; import org.eobjects.analyzer.configuration.AnalyzerBeansConfigurationImpl; import org.eobjects.analyzer.connection.Datastore; import org.eobjects.analyzer.connection.DatastoreConnection; import org.eobjects.analyzer.data.InputColumn; import org.eobjects.analyzer.data.MutableInputColumn; import org.eobjects.analyzer.job.AnalysisJob; import org.eobjects.analyzer.job.builder.AnalysisJobBuilder; import org.eobjects.analyzer.job.builder.AnalyzerJobBuilder; import org.eobjects.analyzer.job.builder.TransformerJobBuilder; import org.eobjects.analyzer.job.concurrent.MultiThreadedTaskRunner; import org.eobjects.analyzer.job.concurrent.TaskRunner; import org.eobjects.analyzer.job.runner.AnalysisResultFuture; import org.eobjects.analyzer.job.runner.AnalysisRunner; import org.eobjects.analyzer.job.runner.AnalysisRunnerImpl; import org.eobjects.analyzer.result.AnalyzerResult; import org.eobjects.analyzer.test.TestHelper; import org.apache.metamodel.DataContext; import org.apache.metamodel.schema.Column; import org.apache.metamodel.schema.Table; public class TokenizerAndValueDistributionTest extends TestCase { public void testScenario() throws Throwable { TaskRunner taskRunner = new MultiThreadedTaskRunner(5); AnalyzerBeansConfiguration configuration = new AnalyzerBeansConfigurationImpl().replace(taskRunner); AnalysisRunner runner = new AnalysisRunnerImpl(configuration); Datastore datastore = TestHelper.createSampleDatabaseDatastore("ds"); DatastoreConnection con = datastore.openConnection(); DataContext dc = con.getDataContext(); AnalysisJobBuilder analysisJobBuilder = new AnalysisJobBuilder(configuration); analysisJobBuilder.setDatastoreConnection(con); Table table = dc.getDefaultSchema().getTableByName("EMPLOYEES"); assertNotNull(table); Column jobTitleColumn = table.getColumnByName("JOBTITLE"); assertNotNull(jobTitleColumn); analysisJobBuilder.addSourceColumns(jobTitleColumn); TransformerJobBuilder<TokenizerTransformer> transformerJobBuilder = analysisJobBuilder .addTransformer(TokenizerTransformer.class); transformerJobBuilder.addInputColumn(analysisJobBuilder.getSourceColumns().get(0)); transformerJobBuilder.setConfiguredProperty("Number of tokens", 4); List<MutableInputColumn<?>> transformerOutput = transformerJobBuilder.getOutputColumns(); assertEquals(4, transformerOutput.size()); transformerOutput.get(0).setName("first word"); transformerOutput.get(1).setName("second word"); transformerOutput.get(2).setName("third words"); transformerOutput.get(3).setName("fourth words"); for (InputColumn<?> inputColumn : transformerOutput) { AnalyzerJobBuilder<ValueDistributionAnalyzer> valueDistribuitionJobBuilder = analysisJobBuilder .addAnalyzer(ValueDistributionAnalyzer.class); valueDistribuitionJobBuilder.addInputColumn(inputColumn); valueDistribuitionJobBuilder.setConfiguredProperty("Record unique values", true); valueDistribuitionJobBuilder.setConfiguredProperty("Top n most frequent values", null); valueDistribuitionJobBuilder.setConfiguredProperty("Bottom n most frequent values", null); } AnalysisJob analysisJob = analysisJobBuilder.toAnalysisJob(); analysisJobBuilder.close(); AnalysisResultFuture resultFuture = runner.run(analysisJob); assertFalse(resultFuture.isDone()); List<AnalyzerResult> results = resultFuture.getResults(); assertTrue(resultFuture.isDone()); if (!resultFuture.isSuccessful()) { List<Throwable> errors = resultFuture.getErrors(); throw errors.get(0); } // expect 1 result for each token assertEquals(4, results.size()); for (AnalyzerResult analyzerResult : results) { ValueDistributionAnalyzerResult result = (ValueDistributionAnalyzerResult) analyzerResult; Collection<String> uniqueValues = new TreeSet<String>(result.getUniqueValues()); if ("first word".equals(result.getName())) { assertEquals("[[Sales->19], [VP->2], [<unique>->2]]", result.getValueCounts().toString()); assertEquals(0, result.getNullCount()); assertEquals(2, result.getUniqueCount().intValue()); } else if ("second word".equals(result.getName())) { assertEquals("[[Rep->17], [Manager->3], [<unique>->2], [<null>->1]]", result.getValueCounts().toString()); assertEquals(1, result.getNullCount()); assertEquals(2, result.getUniqueCount().intValue()); } else if ("third words".equals(result.getName())) { assertEquals("[[<null>->20], [<unique>->3]]", result.getValueCounts().toString()); assertEquals(20, result.getNullCount()); assertEquals(3, result.getUniqueCount().intValue()); assertEquals("[(EMEA), (JAPAN,, (NA)]", uniqueValues.toString()); } else if ("fourth words".equals(result.getName())) { assertEquals("[[<null>->22], [<unique>->1]]", result.getValueCounts().toString()); assertEquals(22, result.getNullCount()); assertEquals(1, result.getUniqueCount().intValue()); assertEquals("[APAC)]", uniqueValues.toString()); } else { fail("Unexpected columnName: " + result.getName()); } } con.close(); taskRunner.shutdown(); } }