// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.standardization.query;

import static org.junit.Assert.assertEquals;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

/**
 * Tests for {@link FirstNameStandardize}, run against the pre-built given-names index stored under
 * {@code src/test/resources/data/TalendGivenNames_index}.
 * <p>
 * The index, searcher and standardizer are opened once for the whole class and released in {@link #tearDown()}.
 * Each test also prints its results in the same layout as the expectation tables below, which makes it easy to
 * compare the output with the tables when the index content changes.
 */
public class FirstNameStandardizeTest {

    /** Location of the index, relative to the working directory of the test run. */
    private static final String INDEX_FOLDER = "src/test/resources/data/TalendGivenNames_index"; //$NON-NLS-1$

    /** Kept as a field so that it can be closed in {@link #tearDown()}; closing the reader does not close it. */
    private static Directory directory = null;

    private static IndexSearcher searcher = null;

    private static Analyzer searchAnalyzer = null;

    private static FirstNameStandardize fnameStandardize = null;

    private static final String INPUT_NAME = "Michel"; //$NON-NLS-1$

    /**
     * Expectations for the country/gender queries. Columns: input name, country code, result without gender,
     * result for gender "F", result for gender "M".
     */
    private static final String[][] EXPECTED = { //
            { "Michel", "AUS", "MITCHEL", "MICHELE", "MICHEL" }, //
            { "Michel", "BEL", "MICHEL", "MICHEL", "MICHEL" }, //
            { "Michel", "DEU", "MICHEL", "MICHEL", "MICHEL" }, //
            { "Michel", "ESP", "MICHEL", "MICHEL", "MICHEL" }, //
            { "Michel", "FRA", "MICHEL", "MICHEL", "MICHEL" }, //
            { "Michel", "ITA", "MICHELA", "MICHELA", "MICHELE" }, //
            { "Michel", "RUS", "MICHEL", "MICHEL", "MICHEL" }, //
            { "Michel", "USA", "MICHEL", "MICHEL", "MICHEL" }, //
            { "Adrian", "AUS", "ADRIAN", "ADRIAN", "ADRIAN" }, //
            { "Adrian", "BEL", "ADRIAN", "ADRIAN", "ADRIAN" }, //
            { "Adrian", "DEU", "ADRIAN", "ADRIAN", "ADRIAN" }, //
            { "Adrian", "ESP", "ADRIAN", "ADRIAN", "ADRIAN" }, //
            { "Adrian", "FRA", "ADRIAN", "ADRIAN", "ADRIAN" }, //
            { "Adrian", "ITA", "ADRIANO", "ADRIANA", "ADRIANO" }, //
            { "Adrian", "RUS", "ADRIAN", "ADRIAN", "ADRIAN" }, //
            { "Adrian", "USA", "ADRIAN", "ADRIAN", "ADRIAN" }, //
    };

    /**
     * Expectations for the fuzzy-option queries. Columns: input name, result with fuzzy matching disabled,
     * result with fuzzy matching enabled. An empty string is the expected result when nothing is matched.
     */
    private static final String[][] EXPECTED_FUZZY = { //
            { "Alessandra", "ALESSANDRA", "ALESSANDRA" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "Antonino", "ANTONINO", "ANTONINO" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "amar", "AMAR", "AMAR" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "jan", "JAN", "JAN" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "James", "JAMES", "JAMES" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "Keith", "KEITH", "KEITH" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "guy", "GUY", "GUY" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "roland", "ROLAND", "ROLAND" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "Angela", "ANGELA", "ANGELA" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "Joe", "JOE", "JOE" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "eric", "ERIC", "ERIC" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "francesco", "FRANCESCO", "FRANCESCO" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "Manfred", "MANFRED", "MANFRED" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "malathi", "", "MALACHI" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "Aly", "ALY", "ALY" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "sreedhar", "", "" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "Louann", "LOUANN", "LOUANN" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "Elif", "ELIF", "ELIF" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "Sreenivas", "", "" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "subhash", "SUBHASH", "SUBHASH" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "Dara", "DARA", "DARA" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "Gabor", "GABOR", "GABOR" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "Jill", "JILL", "JILL" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "Michael", "MICHAEL", "MICHAEL" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "bhargav", "", "BHARGAW" }, //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
            { "nonya", "", "NONNA" } }; //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$

    /**
     * Opens the index and builds the searcher, analyzer and standardizer shared by all tests.
     *
     * @throws java.lang.Exception if the index cannot be opened
     */
    @BeforeClass
    public static void setUpBeforeClass() throws Exception {
        // Stored in the static field before the reader is opened, so that tearDown() can still release the
        // directory when opening the reader fails.
        directory = FSDirectory.open(new File(INDEX_FOLDER));
        IndexReader reader = DirectoryReader.open(directory);
        searcher = new IndexSearcher(reader);
        searchAnalyzer = new SimpleAnalyzer();
        // NOTE(review): the third argument (10) is presumably the maximum number of hits to consider - confirm
        // against the FirstNameStandardize constructor.
        fnameStandardize = new FirstNameStandardize(searcher, searchAnalyzer, 10);
    }

    /**
     * Releases the index reader, the analyzer and the directory opened by {@link #setUpBeforeClass()}. Each
     * resource is closed even when closing a previous one fails.
     *
     * @throws java.lang.Exception if one of the resources cannot be closed
     */
    @AfterClass
    public static void tearDown() throws Exception {
        try {
            if (searcher != null) {
                searcher.getIndexReader().close();
            }
        } finally {
            try {
                if (searchAnalyzer != null) {
                    searchAnalyzer.close();
                }
            } finally {
                if (directory != null) {
                    directory.close();
                }
            }
        }
    }

    /**
     * Test method for
     * {@link org.talend.dataquality.standardization.query.FirstNameStandardize#replaceName(java.lang.String, boolean)}.
     *
     * @throws IOException if the index cannot be queried; reported by JUnit as a test error
     */
    @Test
    public void testReplaceName() throws IOException {
        String res = fnameStandardize.replaceName(INPUT_NAME, true);
        System.out.println("testReplaceName:\n" + res); //$NON-NLS-1$
        assertEquals("MICHEL", res); //$NON-NLS-1$

        // Same query as the Michel/ITA/female row of EXPECTED, so the same result is expected here.
        String resItaFemale = fnameStandardize.replaceNameWithCountryGenderInfo(INPUT_NAME, "ITA", "F", true); //$NON-NLS-1$ //$NON-NLS-2$
        assertEquals("MICHELA", resItaFemale); //$NON-NLS-1$
    }

    /**
     * Test method for
     * {@link org.talend.dataquality.standardization.query.FirstNameStandardize#replaceNameWithCountryGenderInfo(java.lang.String, java.lang.String, java.lang.String, boolean)}
     * .
     * <p>
     * Runs every row of {@link #EXPECTED} without gender, with gender "F" and with gender "M".
     *
     * @throws Exception if a query fails; reported by JUnit as a test error
     */
    @Test
    public void testReplaceNameWithCountryGenderInfo() throws Exception {
        System.out.println("\ntestReplaceNameWithCountryGenderInfo:"); //$NON-NLS-1$
        System.out.println("Name\tCountry\tNon-gender\tFemale\tMale"); //$NON-NLS-1$
        for (String[] testCase : EXPECTED) {
            String name = testCase[0];
            String country = testCase[1];
            String caseLabel = name + "/" + country; //$NON-NLS-1$
            System.out.print("{\"" + name + "\", \"" + country + "\", \""); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$

            // results for query without gender info
            String res = fnameStandardize.replaceNameWithCountryInfo(name, country, true);
            System.out.print(res + "\", \""); //$NON-NLS-1$
            assertEquals(caseLabel + " without gender", testCase[2], res); //$NON-NLS-1$

            // results for female first name query
            String resF = fnameStandardize.replaceNameWithCountryGenderInfo(name, country, "F", true); //$NON-NLS-1$
            System.out.print(resF + "\", \""); //$NON-NLS-1$
            assertEquals(caseLabel + " female", testCase[3], resF); //$NON-NLS-1$

            // results for male first name query
            String resM = fnameStandardize.replaceNameWithCountryGenderInfo(name, country, "M", true); //$NON-NLS-1$
            System.out.println(resM + "\"}, //"); //$NON-NLS-1$
            assertEquals(caseLabel + " male", testCase[4], resM); //$NON-NLS-1$
        }
    }

    /**
     * Test method for
     * {@link org.talend.dataquality.standardization.query.FirstNameStandardize#replaceName(java.lang.String, boolean)}
     * with the fuzzy option switched off and on, for every row of {@link #EXPECTED_FUZZY}.
     *
     * @throws Exception if a query fails; reported by JUnit as a test error
     */
    @Test
    public void testReplaceNameWithFuzzyOption() throws Exception {
        System.out.println("\ntestReplaceNameWithFuzzyOption:"); //$NON-NLS-1$
        System.out.println("Name\tNon-fuzzy\tFuzzy"); //$NON-NLS-1$
        for (String[] testCase : EXPECTED_FUZZY) {
            String name = testCase[0];
            System.out.print("{\"" + name + "\", \""); //$NON-NLS-1$ //$NON-NLS-2$

            // results for non-country, non-fuzzy match
            String exact = fnameStandardize.replaceName(name, false);
            System.out.print(exact + "\", \""); //$NON-NLS-1$
            assertEquals(name + " non-fuzzy", testCase[1], exact); //$NON-NLS-1$

            // results for non-country, fuzzy match
            String fuzzy = fnameStandardize.replaceName(name, true);
            System.out.print(fuzzy + "\"},\n"); //$NON-NLS-1$
            assertEquals(name + " fuzzy", testCase[2], fuzzy); //$NON-NLS-1$
        }
    }
}