/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.test.full.scenarios;
import java.util.List;
import org.apache.metamodel.schema.Column;
import org.apache.metamodel.schema.Schema;
import org.apache.metamodel.schema.Table;
import org.datacleaner.api.AnalyzerResult;
import org.datacleaner.api.InputColumn;
import org.datacleaner.beans.StringAnalyzer;
import org.datacleaner.beans.StringAnalyzerResult;
import org.datacleaner.beans.script.JavaScriptFilter;
import org.datacleaner.beans.standardize.EmailStandardizerTransformer;
import org.datacleaner.beans.standardize.NameStandardizerTransformer;
import org.datacleaner.beans.valuedist.ValueDistributionAnalyzer;
import org.datacleaner.beans.valuedist.ValueDistributionAnalyzerResult;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.DataCleanerConfigurationImpl;
import org.datacleaner.connection.CsvDatastore;
import org.datacleaner.connection.DatastoreConnection;
import org.datacleaner.data.MutableInputColumn;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.job.builder.AnalyzerComponentBuilder;
import org.datacleaner.job.builder.FilterComponentBuilder;
import org.datacleaner.job.builder.TransformerComponentBuilder;
import org.datacleaner.job.runner.AnalysisResultFuture;
import org.datacleaner.job.runner.AnalysisRunner;
import org.datacleaner.job.runner.AnalysisRunnerImpl;
import junit.framework.TestCase;
/**
 * Full-scenario test that builds and runs an analysis job over the CSV file
 * {@code src/test/resources/NameAndEmailPartEqualityTest-data.csv}.
 *
 * <p>
 * The job standardizes the {@code name} and {@code email} source columns,
 * analyzes the standardized output columns, and uses a JavaScript filter to
 * split the records by whether the standardized first name equals the username
 * part of the email address. The assertions pin the value distributions of the
 * four name parts and the row counts of the two filter outcomes.
 */
@SuppressWarnings("deprecation")
public class NameAndEmailPartEqualityTest extends TestCase {

    /**
     * Builds the job, runs it and verifies all seven analyzer results.
     *
     * @throws Throwable
     *             the first error reported by the analysis run if the job was
     *             not successful, or any failure raised while building the job
     */
    public void testScenario() throws Throwable {
        final DataCleanerConfiguration configuration = new DataCleanerConfigurationImpl();
        final AnalysisRunner runner = new AnalysisRunnerImpl(configuration);
        final CsvDatastore ds =
                new CsvDatastore("data.csv", "src/test/resources/NameAndEmailPartEqualityTest-data.csv");

        final AnalysisResultFuture resultFuture;

        // try-with-resources guarantees that the connection and the job builder
        // are released even when one of the assertions below fails. Resources
        // are closed in reverse order of declaration, so the builder is closed
        // first and the connection second.
        try (DatastoreConnection con = ds.openConnection();
                AnalysisJobBuilder analysisJobBuilder = new AnalysisJobBuilder(configuration)) {
            analysisJobBuilder.setDatastore(ds);

            final Schema schema = con.getDataContext().getDefaultSchema();
            final Table[] tables = schema.getTables();
            // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
            assertTrue("Default schema of the CSV datastore contains no tables", tables.length > 0);
            final Table table = tables[0];
            assertNotNull(table);

            final Column nameColumn = table.getColumnByName("name");
            assertNotNull("Column 'name' not found in CSV file", nameColumn);
            final Column emailColumn = table.getColumnByName("email");
            assertNotNull("Column 'email' not found in CSV file", emailColumn);
            analysisJobBuilder.addSourceColumns(nameColumn, emailColumn);

            // Transformer 1: split the name into its parts.
            final TransformerComponentBuilder<NameStandardizerTransformer> nameTransformerComponentBuilder =
                    analysisJobBuilder.addTransformer(NameStandardizerTransformer.class);
            nameTransformerComponentBuilder.addInputColumn(analysisJobBuilder.getSourceColumnByName("name"));
            nameTransformerComponentBuilder
                    .setConfiguredProperty("Name patterns", NameStandardizerTransformer.DEFAULT_PATTERNS);
            assertTrue(nameTransformerComponentBuilder.isConfigured());

            final List<MutableInputColumn<?>> nameColumns = nameTransformerComponentBuilder.getOutputColumns();
            assertEquals(4, nameColumns.size());
            assertEquals("Firstname", nameColumns.get(0).getName());
            assertEquals("Lastname", nameColumns.get(1).getName());
            assertEquals("Middlename", nameColumns.get(2).getName());
            assertEquals("Titulation", nameColumns.get(3).getName());

            // Transformer 2: split the email address into its parts.
            final TransformerComponentBuilder<EmailStandardizerTransformer> emailTransformerComponentBuilder =
                    analysisJobBuilder.addTransformer(EmailStandardizerTransformer.class);
            emailTransformerComponentBuilder.addInputColumn(analysisJobBuilder.getSourceColumnByName("email"));
            assertTrue(emailTransformerComponentBuilder.isConfigured());

            @SuppressWarnings("unchecked") final MutableInputColumn<String> usernameColumn =
                    (MutableInputColumn<String>) emailTransformerComponentBuilder.getOutputColumnByName("Username");
            assertNotNull(usernameColumn);

            // Result #0: one string analyzer over all standardized columns.
            assertTrue(analysisJobBuilder.addAnalyzer(StringAnalyzer.class).addInputColumns(nameColumns)
                    .addInputColumns(emailTransformerComponentBuilder.getOutputColumns()).isConfigured());

            // Results #1-#4: one value distribution per name part.
            for (final InputColumn<?> inputColumn : nameColumns) {
                final AnalyzerComponentBuilder<ValueDistributionAnalyzer> analyzerJobBuilder =
                        analysisJobBuilder.addAnalyzer(ValueDistributionAnalyzer.class);
                analyzerJobBuilder.addInputColumn(inputColumn);
                analyzerJobBuilder.setConfiguredProperty("Record unique values", false);
                analyzerJobBuilder.setConfiguredProperty("Top n most frequent values", 1000);
                analyzerJobBuilder.setConfiguredProperty("Bottom n most frequent values", 1000);
                assertTrue(analyzerJobBuilder.isConfigured());
            }

            // Filter: does the first name equal the email username?
            final FilterComponentBuilder<JavaScriptFilter, JavaScriptFilter.Category> fjb =
                    analysisJobBuilder.addFilter(JavaScriptFilter.class);
            fjb.addInputColumn(nameTransformerComponentBuilder.getOutputColumnByName("Firstname"));
            fjb.addInputColumn(usernameColumn);
            fjb.setConfiguredProperty("Source code", "values[0] == values[1];");
            assertTrue(fjb.isConfigured());

            // Results #5 and #6: string analyzers restricted to each filter outcome.
            analysisJobBuilder.addAnalyzer(StringAnalyzer.class)
                    .addInputColumn(analysisJobBuilder.getSourceColumnByName("email"))
                    .setRequirement(fjb, JavaScriptFilter.Category.VALID);
            analysisJobBuilder.addAnalyzer(StringAnalyzer.class)
                    .addInputColumn(analysisJobBuilder.getSourceColumnByName("email"))
                    .setRequirement(fjb, JavaScriptFilter.Category.INVALID);

            resultFuture = runner.run(analysisJobBuilder.toAnalysisJob());
        }

        if (!resultFuture.isSuccessful()) {
            // Surface the first job error as the test failure.
            final List<Throwable> errors = resultFuture.getErrors();
            throw errors.get(0);
        }

        final List<AnalyzerResult> results = resultFuture.getResults();
        assertEquals(7, results.size());

        assertValueDistribution(results.get(1), "Firstname", 0, 2, "[[barack->4], [<unique>->2]]");
        assertValueDistribution(results.get(2), "Lastname", 0, 0, "[[obama->4], [doe->2]]");
        assertValueDistribution(results.get(3), "Middlename", 4, 0, "[[<null>->4], [hussein->2]]");
        assertValueDistribution(results.get(4), "Titulation", 6, 0, "[[<null>->6]]");

        assertEmailRowCount(results.get(5), "4");
        assertEmailRowCount(results.get(6), "2");
    }

    /**
     * Asserts the name, null count, unique count and value counts of a single
     * value distribution result.
     *
     * @param result
     *            the analyzer result, expected to be a
     *            {@link ValueDistributionAnalyzerResult}
     * @param expectedName
     *            the expected result name
     * @param expectedNullCount
     *            the expected number of null values
     * @param expectedUniqueCount
     *            the expected number of unique values
     * @param expectedValueCounts
     *            the expected string form of the value counts
     */
    private static void assertValueDistribution(final AnalyzerResult result, final String expectedName,
            final int expectedNullCount, final int expectedUniqueCount, final String expectedValueCounts) {
        final ValueDistributionAnalyzerResult vdResult = (ValueDistributionAnalyzerResult) result;
        assertEquals(expectedName, vdResult.getName());
        assertEquals(expectedNullCount, vdResult.getNullCount());
        assertEquals(expectedUniqueCount, vdResult.getUniqueCount().intValue());
        assertEquals(expectedValueCounts, vdResult.getValueCounts().toString());
    }

    /**
     * Asserts that a string analyzer result covers exactly one column and that
     * its "Row count" measure for the {@code email} column has the expected
     * string value.
     *
     * @param result
     *            the analyzer result, expected to be a
     *            {@link StringAnalyzerResult}
     * @param expectedRowCount
     *            the expected string form of the row count
     */
    private static void assertEmailRowCount(final AnalyzerResult result, final String expectedRowCount) {
        final StringAnalyzerResult stringAnalyzerResult = (StringAnalyzerResult) result;
        assertEquals(1, stringAnalyzerResult.getColumns().length);
        assertEquals(expectedRowCount,
                stringAnalyzerResult.getCrosstab().where("Column", "email").where("Measures", "Row count").get()
                        .toString());
    }
}