/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.beans.transform;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;

import java.util.List;

import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.configuration.DataCleanerConfigurationImpl;
import org.datacleaner.data.MockInputColumn;
import org.datacleaner.data.MockInputRow;
import org.datacleaner.reference.SynonymCatalog;
import org.datacleaner.reference.TextFileSynonymCatalog;
import org.junit.Test;

/**
 * Tests for {@link SynonymLookupTransformer}, driven by the text file synonym
 * catalog at {@code src/test/resources/synonym-countries.txt}.
 *
 * <p>
 * Every transformer is closed in a {@code finally} block so that a failing
 * assertion does not skip the call to {@code close()}.
 * </p>
 */
public class SynonymLookupTransformerTest {

    private final DataCleanerConfiguration configuration = new DataCleanerConfigurationImpl();

    // NOTE(review): the third constructor argument is presumably the
    // case-sensitivity flag (the "CaseInsensitive" tests pass false) - confirm
    // against TextFileSynonymCatalog.
    private final SynonymCatalog sc = new TextFileSynonymCatalog("my synonyms",
            "src/test/resources/synonym-countries.txt", true, "UTF8");

    /**
     * Runs the transformer on a single-column row holding the given value.
     *
     * @param transformer
     *            the transformer under test
     * @param column
     *            the input column the value is stored under
     * @param value
     *            the text to transform
     * @return the array returned by {@code transformer.transform(...)}; the
     *         tests read indexes 0, 1 and 2, matching the "synonyms replaced",
     *         "synonyms found" and "master terms found" output columns
     */
    private static Object[] lookup(final SynonymLookupTransformer transformer,
            final MockInputColumn<String> column, final String value) {
        return transformer.transform(new MockInputRow().put(column, value));
    }

    /**
     * With inlined replacement switched on, synonyms embedded in a longer text
     * are replaced by their master term regardless of the casing used in the
     * input.
     */
    @Test
    public void testCaseInsensitiveMathingOnEveryToken() throws Exception {
        final SynonymCatalog catalog = new TextFileSynonymCatalog("my synonyms",
                "src/test/resources/synonym-countries.txt", false, "UTF8");
        final MockInputColumn<String> col = new MockInputColumn<>("my col", String.class);

        final SynonymLookupTransformer transformer = new SynonymLookupTransformer(col, catalog, true, configuration);
        transformer.replaceInlinedSynonyms = true;
        transformer.init();
        try {
            assertEquals("Hello DNK DNK DNK!", lookup(transformer, col, "Hello denmark dnk dk!")[0]);
            assertEquals("DNK", lookup(transformer, col, "dk")[0]);
            assertEquals("Hello GBR DNK GBR.", lookup(transformer, col, "Hello United KINGDOM danmark uk.")[0]);
        } finally {
            // release the transformer even when an assertion above fails
            transformer.close();
        }
    }

    /**
     * With inlined replacement switched off, only an input that is a synonym
     * as a whole is replaced; longer texts that merely contain synonyms come
     * back unchanged with no synonym or master term reported.
     */
    @Test
    public void testCaseInsensitiveMathingOnCompleteExpression() throws Exception {
        final SynonymCatalog catalog = new TextFileSynonymCatalog("my synonyms",
                "src/test/resources/synonym-countries.txt", false, "UTF8");
        final MockInputColumn<String> col = new MockInputColumn<>("my col", String.class);

        final SynonymLookupTransformer transformer = new SynonymLookupTransformer(col, catalog, true, configuration);
        transformer.replaceInlinedSynonyms = false;
        transformer.init();
        try {
            assertEquals("Hello denmark dnk dk!", lookup(transformer, col, "Hello denmark dnk dk!")[0]);
            assertNull(lookup(transformer, col, "Hello denmark dnk dk!")[1]);
            assertNull(lookup(transformer, col, "Hello denmark dnk dk!")[2]);

            assertEquals("DNK", lookup(transformer, col, "dk")[0]);
            assertEquals("dk", lookup(transformer, col, "dk")[1]);
            assertEquals("DNK", lookup(transformer, col, "dk")[2]);

            assertEquals("Hello United KINGDOM danmark uk.",
                    lookup(transformer, col, "Hello United KINGDOM danmark uk.")[0]);
            assertNull(lookup(transformer, col, "Hello United KINGDOM danmark uk.")[1]);
            assertNull(lookup(transformer, col, "Hello United KINGDOM danmark uk.")[2]);
        } finally {
            // release the transformer even when an assertion above fails
            transformer.close();
        }
    }

    /**
     * Whole-value matching against the shared catalog, first retaining the
     * original value for unmatched input and then without retaining it (in
     * which case unmatched input yields {@code null}).
     */
    @Test
    public void testTransformWithCompleteInput() throws Exception {
        final MockInputColumn<String> col = new MockInputColumn<>("my col", String.class);

        // with retain original value
        final SynonymLookupTransformer retaining = new SynonymLookupTransformer(col, sc, true, configuration);
        retaining.replaceInlinedSynonyms = false;

        // the output columns are inspected before init(), as in the original test
        assertEquals(3, retaining.getOutputColumns().getColumnCount());
        assertEquals("my col (synonyms replaced)", retaining.getOutputColumns().getColumnName(0));

        retaining.init();
        try {
            assertEquals("hello", lookup(retaining, col, "hello")[0]);
            assertEquals("ALB", lookup(retaining, col, "Albania")[0]);
            assertEquals("I come from Albania!", lookup(retaining, col, "I come from Albania!")[0]);
        } finally {
            retaining.close();
        }

        // without retain original value
        final SynonymLookupTransformer nonRetaining = new SynonymLookupTransformer(col, sc, false, configuration);
        nonRetaining.replaceInlinedSynonyms = false;

        assertEquals(3, nonRetaining.getOutputColumns().getColumnCount());
        assertEquals("my col (synonyms replaced)", nonRetaining.getOutputColumns().getColumnName(0));
        assertEquals("my col (synonyms found)", nonRetaining.getOutputColumns().getColumnName(1));
        assertEquals("my col (master terms found)", nonRetaining.getOutputColumns().getColumnName(2));

        nonRetaining.init();
        try {
            assertNull(lookup(nonRetaining, col, "hello")[0]);

            assertEquals("ALB", lookup(nonRetaining, col, "Albania")[0]);
            assertEquals("Albania", lookup(nonRetaining, col, "Albania")[1]);
            assertEquals("ALB", lookup(nonRetaining, col, "Albania")[2]);

            assertNull(lookup(nonRetaining, col, "foo")[0]);
            assertNull(lookup(nonRetaining, col, "foo")[1]);
            assertNull(lookup(nonRetaining, col, "foo")[2]);
        } finally {
            nonRetaining.close();
        }
    }

    /**
     * Inlined replacement with the {@code LIST} replaced-synonyms type: the
     * found synonyms and master terms are returned as lists, in the order in
     * which they occur in the input. A second transformer, configured without
     * retaining the original value and without inlined replacement, is then
     * checked for whole-value behaviour.
     */
    @Test
    public void testTransformWithEveryToken() throws Exception {
        final MockInputColumn<String> col = new MockInputColumn<>("my col", String.class);

        // with retain original value
        final SynonymLookupTransformer retaining = new SynonymLookupTransformer(col, sc, true, configuration);
        retaining.replaceInlinedSynonyms = true;
        retaining.replacedSynonymsType = SynonymLookupTransformer.ReplacedSynonymsType.LIST;
        retaining.init();
        try {
            assertEquals(3, retaining.getOutputColumns().getColumnCount());
            assertEquals("my col (synonyms replaced)", retaining.getOutputColumns().getColumnName(0));
            assertEquals("my col (synonyms found)", retaining.getOutputColumns().getColumnName(1));
            assertEquals("my col (master terms found)", retaining.getOutputColumns().getColumnName(2));

            assertEquals("hello", lookup(retaining, col, "hello")[0]);
            assertEquals("ALB", lookup(retaining, col, "Albania")[0]);
            assertEquals("I come from ALB!", lookup(retaining, col, "I come from ALB!")[0]);
            assertEquals("I come from GBR!", lookup(retaining, col, "I come from Britain!")[0]);
            assertEquals("I come from GBR!", lookup(retaining, col, "I come from Great Britain!")[0]);

            final Object[] result = lookup(retaining, col, "I come from Great Great Britain Albania!");
            assertEquals("I come from Great GBR ALB!", result[0]);

            @SuppressWarnings("unchecked")
            final List<String> synonyms = (List<String>) result[1];
            @SuppressWarnings("unchecked")
            final List<String> masterTerms = (List<String>) result[2];

            assertEquals("Great Britain", synonyms.get(0));
            assertEquals("Albania", synonyms.get(1));
            assertEquals("GBR", masterTerms.get(0));
            assertEquals("ALB", masterTerms.get(1));
        } finally {
            retaining.close();
        }

        // without retain original value
        final SynonymLookupTransformer nonRetaining = new SynonymLookupTransformer(col, sc, false, configuration);
        nonRetaining.replaceInlinedSynonyms = false;
        nonRetaining.init();
        try {
            assertEquals(3, nonRetaining.getOutputColumns().getColumnCount());
            assertEquals("my col (synonyms replaced)", nonRetaining.getOutputColumns().getColumnName(0));

            assertNull(lookup(nonRetaining, col, "hello")[0]);
            assertNull(lookup(nonRetaining, col, "hello")[1]);
            assertNull(lookup(nonRetaining, col, "hello")[2]);

            assertEquals("ALB", lookup(nonRetaining, col, "Albania")[0]);
            assertEquals("Albania", lookup(nonRetaining, col, "Albania")[1]);
            assertEquals("ALB", lookup(nonRetaining, col, "Albania")[2]);
        } finally {
            nonRetaining.close();
        }
    }
}