/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.beans.transform;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import org.datacleaner.api.AnalyzerResult;
import org.datacleaner.beans.valuedist.ValueDistributionAnalyzer;
import org.datacleaner.beans.valuedist.ValueDistributionAnalyzerResult;
import org.datacleaner.components.convert.ConvertToNumberTransformer;
import org.datacleaner.configuration.DataCleanerConfigurationImpl;
import org.datacleaner.connection.CsvDatastore;
import org.datacleaner.connection.Datastore;
import org.datacleaner.data.MutableInputColumn;
import org.datacleaner.job.AnalysisJob;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.job.builder.AnalyzerComponentBuilder;
import org.datacleaner.job.builder.TransformerComponentBuilder;
import org.datacleaner.job.runner.AnalysisResultFuture;
import org.datacleaner.job.runner.AnalysisRunnerImpl;
import org.datacleaner.reference.Dictionary;
import org.datacleaner.reference.ReferenceDataCatalogImpl;
import org.datacleaner.reference.SimpleDictionary;
import org.datacleaner.reference.SimpleSynonym;
import org.datacleaner.reference.SimpleSynonymCatalog;
import org.datacleaner.reference.StringPattern;
import org.datacleaner.reference.SynonymCatalog;

import junit.framework.TestCase;

/**
 * Tests for {@link DictionaryMatcherTransformer}: once as part of a complete
 * analysis job and once by invoking the transformer directly.
 */
public class DictionaryMatcherTransformerTest extends TestCase {

    /**
     * Builds and runs an analysis job in which the "product" source column is
     * matched against three dictionaries, the "logging match" output is
     * additionally converted to a number, and a value distribution is
     * calculated for each of the four resulting columns. The per-column counts
     * are then verified.
     *
     * @throws Throwable
     *             the first error reported by the job if it did not run
     *             successfully, or any failure raised while building the job
     */
    public void testParseAndAssignDictionaries() throws Throwable {
        final Collection<Dictionary> dictionaries = new ArrayList<>();
        dictionaries.add(new SimpleDictionary("eobjects.org products", "MetaModel", "DataCleaner", "AnalyzerBeans"));
        dictionaries.add(new SimpleDictionary("apache products", "commons-lang", "commons-math", "commons-codec",
                "commons-logging"));
        dictionaries.add(new SimpleDictionary("logging products", "commons-logging", "log4j", "slf4j",
                "java.util.Logging"));

        final Collection<SynonymCatalog> synonymCatalogs = new ArrayList<>();
        synonymCatalogs.add(new SimpleSynonymCatalog("translated terms",
                new SimpleSynonym("hello", "howdy", "hi", "yo", "hey"),
                new SimpleSynonym("goodbye", "bye", "see you", "hey")));

        final Collection<StringPattern> stringPatterns = new ArrayList<>();

        final ReferenceDataCatalogImpl ref =
                new ReferenceDataCatalogImpl(dictionaries, synonymCatalogs, stringPatterns);

        final Datastore datastore = new CsvDatastore("my database", "src/test/resources/projects.csv");
        final DataCleanerConfigurationImpl conf = new DataCleanerConfigurationImpl();

        final AnalysisJobBuilder job = new AnalysisJobBuilder(conf);
        // try/finally guarantees the job builder is closed even when an
        // assertion fails or the job itself reports an error.
        try {
            job.setDatastore(datastore);
            job.addSourceColumns("product", "version");

            final TransformerComponentBuilder<DictionaryMatcherTransformer> tjb1 =
                    job.addTransformer(DictionaryMatcherTransformer.class);
            tjb1.setConfiguredProperty("Dictionaries",
                    new Dictionary[] { ref.getDictionary("eobjects.org products"),
                            ref.getDictionary("apache products"), ref.getDictionary("logging products") });
            tjb1.addInputColumn(job.getSourceColumnByName("product"));

            // One output column is expected per configured dictionary.
            final List<MutableInputColumn<?>> outputColumns = tjb1.getOutputColumns();
            assertEquals(3, outputColumns.size());
            outputColumns.get(0).setName("eobjects match");
            outputColumns.get(1).setName("apache match");
            outputColumns.get(2).setName("logging match");

            final TransformerComponentBuilder<ConvertToNumberTransformer> tjb2 =
                    job.addTransformer(ConvertToNumberTransformer.class);
            tjb2.addInputColumn(outputColumns.get(2));
            tjb2.getOutputColumns().get(0).setName("logging match -> number");

            final AnalyzerComponentBuilder<ValueDistributionAnalyzer> ajb =
                    job.addAnalyzer(ValueDistributionAnalyzer.class);
            ajb.addInputColumns(tjb1.getOutputColumns());
            ajb.addInputColumns(tjb2.getOutputColumns());

            assertTrue(job.isConfigured());

            final AnalysisJob analysisJob = job.toAnalysisJob();
            final AnalysisResultFuture resultFuture = new AnalysisRunnerImpl(conf).run(analysisJob);

            if (!resultFuture.isSuccessful()) {
                // Surface the first job error as the test failure.
                throw resultFuture.getErrors().get(0);
            }

            final List<AnalyzerResult> results = resultFuture.getResults();
            assertEquals(4, results.size());

            assertDistribution(results.get(0), "eobjects match", "true", 8, "false", 4);
            assertDistribution(results.get(1), "apache match", "true", 2, "false", 10);
            assertDistribution(results.get(2), "logging match", "true", 3, "false", 9);
            assertDistribution(results.get(3), "logging match -> number", "1", 3, "0", 9);
        } finally {
            job.close();
        }
    }

    /**
     * Invokes the transformer directly with two dictionaries and verifies the
     * produced output arrays, first with the transformer's initial output type
     * and then after switching it to {@link MatchOutputType#INPUT_OR_NULL}.
     *
     * @throws Exception
     *             if the transformer fails
     */
    public void testTransform() throws Exception {
        final Dictionary[] dictionaries =
                new Dictionary[] { new SimpleDictionary("danish male names", "kasper", "kim", "asbjørn"),
                        new SimpleDictionary("danish female names", "trine", "kim", "lene") };

        final DictionaryMatcherTransformer transformer =
                new DictionaryMatcherTransformer(null, dictionaries, new DataCleanerConfigurationImpl());
        transformer.init();
        // try/finally guarantees the transformer is closed even when an
        // assertion fails.
        try {
            assertEquals("[true, false]", Arrays.toString(transformer.transform("kasper")));
            assertEquals("[false, false]", Arrays.toString(transformer.transform("foobar")));
            assertEquals("[false, true]", Arrays.toString(transformer.transform("trine")));
            assertEquals("[true, true]", Arrays.toString(transformer.transform("kim")));

            transformer._outputType = MatchOutputType.INPUT_OR_NULL;

            assertEquals("[kim, kim]", Arrays.toString(transformer.transform("kim")));
            assertEquals("[null, trine]", Arrays.toString(transformer.transform("trine")));
        } finally {
            transformer.close();
        }
    }

    /**
     * Asserts the name of a value distribution result and the counts recorded
     * for two of its values.
     */
    private static void assertDistribution(final AnalyzerResult result, final String expectedName,
            final String firstValue, final int expectedFirstCount, final String secondValue,
            final int expectedSecondCount) {
        final ValueDistributionAnalyzerResult distribution = (ValueDistributionAnalyzerResult) result;
        assertEquals(expectedName, distribution.getName());
        assertEquals(expectedFirstCount, distribution.getCount(firstValue).intValue());
        assertEquals(expectedSecondCount, distribution.getCount(secondValue).intValue());
    }
}