/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.test.full.scenarios;

import java.util.Arrays;
import java.util.List;

import org.apache.metamodel.DataContext;
import org.apache.metamodel.schema.Column;
import org.apache.metamodel.schema.Table;
import org.datacleaner.api.AnalyzerResult;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.beans.StringAnalyzer;
import org.datacleaner.beans.standardize.EmailStandardizerTransformer;
import org.datacleaner.beans.stringpattern.PatternFinderAnalyzer;
import org.datacleaner.beans.stringpattern.PatternFinderResult;
import org.datacleaner.beans.stringpattern.PatternFinderResultTextRenderer;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.DataCleanerConfigurationImpl;
import org.datacleaner.configuration.DataCleanerEnvironment;
import org.datacleaner.configuration.DataCleanerEnvironmentImpl;
import org.datacleaner.connection.Datastore;
import org.datacleaner.connection.DatastoreConnection;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.job.builder.AnalyzerComponentBuilder;
import org.datacleaner.job.builder.TransformerComponentBuilder;
import org.datacleaner.job.concurrent.MultiThreadedTaskRunner;
import org.datacleaner.job.concurrent.TaskRunner;
import org.datacleaner.job.runner.AnalysisResultFuture;
import org.datacleaner.job.runner.AnalysisRunnerImpl;
import org.datacleaner.result.AnnotatedRowsResult;
import org.datacleaner.result.CrosstabResult;
import org.datacleaner.result.DefaultResultProducer;
import org.datacleaner.result.ResultProducer;
import org.datacleaner.result.renderer.CrosstabTextRenderer;
import org.datacleaner.test.TestHelper;

import junit.framework.TestCase;

/**
 * Scenario test that runs a Pattern finder and a String analyzer in a single
 * job and then drills to detail on individual cells of their results.
 *
 * The job reads the JOBTITLE and EMAIL columns of the EMPLOYEES table of the
 * sample datastore. EMAIL is additionally fed through an
 * {@link EmailStandardizerTransformer}, whose "Username" and "Domain" output
 * columns are analyzed together with the EMAIL column itself.
 */
@SuppressWarnings("deprecation")
public class PatternFinderAndStringAnalyzerDrillToDetailTest extends TestCase {

    /**
     * Builds and runs the job, then verifies the rendered results of both
     * analyzers and the annotated rows reachable by exploring result cells.
     *
     * @throws Throwable
     *             the first error reported by the analysis job when the job
     *             is not successful, or any failure raised by an assertion
     */
    public void testScenario() throws Throwable {
        final TaskRunner taskRunner = new MultiThreadedTaskRunner(5);
        try {
            final DataCleanerEnvironment environment = new DataCleanerEnvironmentImpl().withTaskRunner(taskRunner);
            final DataCleanerConfiguration configuration =
                    new DataCleanerConfigurationImpl().withEnvironment(environment);

            final Datastore datastore = TestHelper.createSampleDatabaseDatastore("ds");
            final DatastoreConnection con = datastore.openConnection();
            try {
                final DataContext dc = con.getDataContext();

                try (AnalysisJobBuilder ajb = new AnalysisJobBuilder(configuration)) {
                    ajb.setDatastoreConnection(con);

                    final Table table = dc.getDefaultSchema().getTableByName("EMPLOYEES");
                    assertNotNull(table);

                    final Column jobTitleColumn = table.getColumnByName("JOBTITLE");
                    assertNotNull(jobTitleColumn);

                    final Column emailColumn = table.getColumnByName("EMAIL");
                    assertNotNull(emailColumn);

                    ajb.addSourceColumns(jobTitleColumn, emailColumn);

                    final InputColumn<?> emailInputColumn = ajb.getSourceColumnByName("EMAIL");
                    final TransformerComponentBuilder<EmailStandardizerTransformer> emailStd1 =
                            ajb.addTransformer(EmailStandardizerTransformer.class).addInputColumn(emailInputColumn);

                    final AnalyzerComponentBuilder<PatternFinderAnalyzer> pf =
                            ajb.addAnalyzer(PatternFinderAnalyzer.class);
                    final InputColumn<?> jobtitleInputColumn = ajb.getSourceColumnByName("JOBTITLE");
                    pf.addInputColumn(jobtitleInputColumn);
                    pf.getComponentInstance().setDiscriminateTextCase(false);

                    final AnalyzerComponentBuilder<StringAnalyzer> sa = ajb.addAnalyzer(StringAnalyzer.class);
                    sa.addInputColumns(emailInputColumn, emailStd1.getOutputColumnByName("Username"),
                            emailStd1.getOutputColumnByName("Domain"));

                    final AnalysisResultFuture resultFuture =
                            new AnalysisRunnerImpl(configuration).run(ajb.toAnalysisJob());
                    if (!resultFuture.isSuccessful()) {
                        // surface the first job error as the test failure
                        throw resultFuture.getErrors().iterator().next();
                    }

                    // pattern finder result tests
                    {
                        final PatternFinderResult result =
                                (PatternFinderResult) resultFuture.getResult(pf.toAnalyzerJob());
                        final String resultString = new PatternFinderResultTextRenderer().render(result);
                        final String[] resultLines = resultString.split("\n");
                        assertEquals(resultString, 5, resultLines.length);
                        assertEquals(resultString, " Match count Sample ", resultLines[0]);
                        assertTrue(resultString, resultLines[1].startsWith("aaaaa aaaaaaaaa 19"));

                        final ResultProducer resultProducer = result.getSingleCrosstab()
                                .where("Pattern", "aaaaa aaaaaaaaa").where("Measures", "Match count").explore();
                        assertEquals(DefaultResultProducer.class, resultProducer.getClass());
                        final AnalyzerResult result2 = resultProducer.getResult();
                        assertEquals(AnnotatedRowsResult.class, result2.getClass());

                        final AnnotatedRowsResult annotatedRowsResult = (AnnotatedRowsResult) result2;
                        assertEquals(19, annotatedRowsResult.getAnnotatedRowCount());
                        final List<InputRow> rows = annotatedRowsResult.getSampleRows();
                        assertEquals(19, rows.size());

                        final String[] values = new String[19];
                        for (int i = 0; i < values.length; i++) {
                            values[i] = (String) rows.get(i).getValue(jobtitleInputColumn);
                        }

                        // sort before comparing so the assertion does not depend on row order
                        Arrays.sort(values);
                        assertEquals(
                                "[Sales Rep, Sales Rep, Sales Rep, Sales Rep, Sales Rep, Sales Rep, Sales Rep, "
                                        + "Sales Rep, Sales Rep, Sales Rep, Sales Rep, Sales Rep, Sales Rep, Sales Rep, "
                                        + "Sales Rep, Sales Rep, Sales Rep, VP Marketing, VP Sales]",
                                Arrays.toString(values));
                    }

                    // string analyzer tests
                    {
                        final CrosstabResult result = (CrosstabResult) resultFuture.getResult(sa.toAnalyzerJob());
                        final String[] resultLines = new CrosstabTextRenderer().render(result).split("\n");
                        assertEquals(" EMAIL Username Domain ", resultLines[0]);
                        assertEquals("Total char count 655 172 460 ", resultLines[6]);
                        assertEquals("Max chars 31 10 20 ", resultLines[7]);
                        assertEquals("Min chars 26 5 20 ", resultLines[8]);

                        // username is a virtual column, but because of the
                        // row-annotation system it is still possible to drill
                        // to detail on it.
                        ResultProducer resultProducer = result.getCrosstab().where("Column", "Username")
                                .where("Measures", "Max chars").explore();
                        assertNotNull(resultProducer);
                        assertEquals(AnnotatedRowsResult.class, resultProducer.getResult().getClass());

                        // email is a physical column so it IS queryable
                        resultProducer = result.getCrosstab().where("Column", "EMAIL").where("Measures", "Max chars")
                                .explore();
                        assertNotNull(resultProducer);

                        final AnalyzerResult result2 = resultProducer.getResult();
                        assertEquals(AnnotatedRowsResult.class, result2.getClass());

                        final AnnotatedRowsResult arr = (AnnotatedRowsResult) result2;
                        final List<InputRow> rows = arr.getSampleRows();
                        assertEquals(1, rows.size());
                        assertEquals("wpatterson@classicmodelcars.com",
                                rows.get(0).getValue(emailInputColumn).toString());
                    }
                }
            } finally {
                // release the connection even when an assertion or the job itself fails
                con.close();
            }
        } finally {
            // always stop the task runner, whether the scenario passed or failed
            taskRunner.shutdown();
        }
    }
}