/** * AnalyzerBeans * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.eobjects.analyzer.test.full.scenarios; import java.util.List; import junit.framework.TestCase; import org.eobjects.analyzer.beans.StringAnalyzer; import org.eobjects.analyzer.beans.StringAnalyzerResult; import org.eobjects.analyzer.beans.script.JavaScriptFilter; import org.eobjects.analyzer.beans.standardize.EmailStandardizerTransformer; import org.eobjects.analyzer.beans.standardize.NameStandardizerTransformer; import org.eobjects.analyzer.beans.valuedist.ValueDistributionAnalyzer; import org.eobjects.analyzer.beans.valuedist.ValueDistributionAnalyzerResult; import org.eobjects.analyzer.configuration.AnalyzerBeansConfiguration; import org.eobjects.analyzer.configuration.AnalyzerBeansConfigurationImpl; import org.eobjects.analyzer.connection.CsvDatastore; import org.eobjects.analyzer.connection.DatastoreConnection; import org.eobjects.analyzer.data.InputColumn; import org.eobjects.analyzer.data.MutableInputColumn; import org.eobjects.analyzer.job.builder.AnalysisJobBuilder; import org.eobjects.analyzer.job.builder.AnalyzerJobBuilder; import org.eobjects.analyzer.job.builder.FilterJobBuilder; import org.eobjects.analyzer.job.builder.TransformerJobBuilder; import org.eobjects.analyzer.job.concurrent.SingleThreadedTaskRunner; import org.eobjects.analyzer.job.concurrent.TaskRunner; import org.eobjects.analyzer.job.runner.AnalysisResultFuture; import org.eobjects.analyzer.job.runner.AnalysisRunner; import org.eobjects.analyzer.job.runner.AnalysisRunnerImpl; import org.eobjects.analyzer.result.AnalyzerResult; import org.apache.metamodel.schema.Column; import org.apache.metamodel.schema.Schema; import org.apache.metamodel.schema.Table; public class NameAndEmailPartEqualityTest extends TestCase { public void testScenario() throws Throwable { TaskRunner taskRunner = new SingleThreadedTaskRunner(); AnalyzerBeansConfiguration configuration = new AnalyzerBeansConfigurationImpl().replace(taskRunner); AnalysisRunner runner = new AnalysisRunnerImpl(configuration); CsvDatastore ds = new CsvDatastore("data.csv", "src/test/resources/NameAndEmailPartEqualityTest-data.csv"); AnalysisJobBuilder analysisJobBuilder = new AnalysisJobBuilder(configuration); analysisJobBuilder.setDatastore(ds); DatastoreConnection con = ds.openConnection(); Schema schema = con.getDataContext().getDefaultSchema(); Table table = schema.getTables()[0]; assertNotNull(table); Column nameColumn = table.getColumnByName("name"); Column emailColumn = table.getColumnByName("email"); analysisJobBuilder.addSourceColumns(nameColumn, emailColumn); TransformerJobBuilder<NameStandardizerTransformer> nameTransformerJobBuilder = analysisJobBuilder .addTransformer(NameStandardizerTransformer.class); nameTransformerJobBuilder.addInputColumn(analysisJobBuilder.getSourceColumnByName("name")); nameTransformerJobBuilder.setConfiguredProperty("Name patterns", NameStandardizerTransformer.DEFAULT_PATTERNS); assertTrue(nameTransformerJobBuilder.isConfigured()); final List<MutableInputColumn<?>> nameColumns = nameTransformerJobBuilder.getOutputColumns(); assertEquals(4, nameColumns.size()); assertEquals("Firstname", nameColumns.get(0).getName()); assertEquals("Lastname", nameColumns.get(1).getName()); assertEquals("Middlename", nameColumns.get(2).getName()); assertEquals("Titulation", nameColumns.get(3).getName()); TransformerJobBuilder<EmailStandardizerTransformer> emailTransformerJobBuilder = analysisJobBuilder .addTransformer(EmailStandardizerTransformer.class); emailTransformerJobBuilder.addInputColumn(analysisJobBuilder.getSourceColumnByName("email")); assertTrue(emailTransformerJobBuilder.isConfigured()); @SuppressWarnings("unchecked") final MutableInputColumn<String> usernameColumn = (MutableInputColumn<String>) emailTransformerJobBuilder .getOutputColumnByName("Username"); assertNotNull(usernameColumn); assertTrue(analysisJobBuilder.addAnalyzer(StringAnalyzer.class).addInputColumns(nameColumns) .addInputColumns(emailTransformerJobBuilder.getOutputColumns()).isConfigured()); for (InputColumn<?> inputColumn : nameColumns) { AnalyzerJobBuilder<ValueDistributionAnalyzer> analyzerJobBuilder = analysisJobBuilder .addAnalyzer(ValueDistributionAnalyzer.class); analyzerJobBuilder.addInputColumn(inputColumn); analyzerJobBuilder.setConfiguredProperty("Record unique values", false); analyzerJobBuilder.setConfiguredProperty("Top n most frequent values", 1000); analyzerJobBuilder.setConfiguredProperty("Bottom n most frequent values", 1000); assertTrue(analyzerJobBuilder.isConfigured()); } FilterJobBuilder<JavaScriptFilter, JavaScriptFilter.Category> fjb = analysisJobBuilder .addFilter(JavaScriptFilter.class); fjb.addInputColumn(nameTransformerJobBuilder.getOutputColumnByName("Firstname")); fjb.addInputColumn(usernameColumn); fjb.setConfiguredProperty("Source code", "values[0] == values[1];"); assertTrue(fjb.isConfigured()); analysisJobBuilder.addAnalyzer(StringAnalyzer.class) .addInputColumn(analysisJobBuilder.getSourceColumnByName("email")) .setRequirement(fjb, JavaScriptFilter.Category.VALID); analysisJobBuilder.addAnalyzer(StringAnalyzer.class) .addInputColumn(analysisJobBuilder.getSourceColumnByName("email")) .setRequirement(fjb, JavaScriptFilter.Category.INVALID); AnalysisResultFuture resultFuture = runner.run(analysisJobBuilder.toAnalysisJob()); analysisJobBuilder.close(); con.close(); if (!resultFuture.isSuccessful()) { List<Throwable> errors = resultFuture.getErrors(); throw errors.get(0); } List<AnalyzerResult> results = resultFuture.getResults(); assertEquals(7, results.size()); ValueDistributionAnalyzerResult vdResult = (ValueDistributionAnalyzerResult) results.get(1); assertEquals("Firstname", vdResult.getName()); assertEquals(0, vdResult.getNullCount()); assertEquals(2, vdResult.getUniqueCount().intValue()); assertEquals("[[barack->4], [<unique>->2]]", vdResult.getValueCounts().toString()); vdResult = (ValueDistributionAnalyzerResult) results.get(2); assertEquals("Lastname", vdResult.getName()); assertEquals(0, vdResult.getNullCount()); assertEquals(0, vdResult.getUniqueCount().intValue()); assertEquals("[[obama->4], [doe->2]]", vdResult.getValueCounts().toString()); vdResult = (ValueDistributionAnalyzerResult) results.get(3); assertEquals("Middlename", vdResult.getName()); assertEquals(4, vdResult.getNullCount()); assertEquals(0, vdResult.getUniqueCount().intValue()); assertEquals("[[<null>->4], [hussein->2]]", vdResult.getValueCounts() .toString()); vdResult = (ValueDistributionAnalyzerResult) results.get(4); assertEquals("Titulation", vdResult.getName()); assertEquals(6, vdResult.getNullCount()); assertEquals(0, vdResult.getUniqueCount().intValue()); assertEquals("[[<null>->6]]", vdResult.getValueCounts().toString()); StringAnalyzerResult stringAnalyzerResult = (StringAnalyzerResult) results.get(5); assertEquals(1, stringAnalyzerResult.getColumns().length); assertEquals("4", stringAnalyzerResult.getCrosstab().where("Column", "email").where("Measures", "Row count") .get().toString()); stringAnalyzerResult = (StringAnalyzerResult) results.get(6); assertEquals(1, stringAnalyzerResult.getColumns().length); assertEquals("2", stringAnalyzerResult.getCrosstab().where("Column", "email").where("Measures", "Row count") .get().toString()); } }