/** * AnalyzerBeans * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.eobjects.analyzer.beans.stringpattern; import junit.framework.TestCase; import org.eobjects.analyzer.data.MockInputColumn; import org.eobjects.analyzer.data.MockInputRow; import org.eobjects.analyzer.descriptors.AnalyzerBeanDescriptor; import org.eobjects.analyzer.descriptors.Descriptors; import org.eobjects.analyzer.storage.InMemoryRowAnnotationFactory; public class PatternFinderAnalyzerTest extends TestCase { public void testDescriptor() throws Exception { // simply test that the analyzer is valid AnalyzerBeanDescriptor<PatternFinderAnalyzer> descriptor = Descriptors.ofAnalyzer(PatternFinderAnalyzer.class); assertEquals("Pattern finder", descriptor.getDisplayName()); } public void testSingleToken() throws Exception { PatternFinderAnalyzer pf = new PatternFinderAnalyzer(); MockInputColumn<String> column = new MockInputColumn<String>("title", String.class); pf.setRowAnnotationFactory(new InMemoryRowAnnotationFactory()); pf.setColumn(column); pf.init(); pf.run(new MockInputRow().put(column, "blabla"), 1); assertEquals("Crosstab:\nMatch count,aaaaaa: 1\nSample,aaaaaa: blabla", pf.getResult().getSingleCrosstab() .toString()); } public void testEmployeeTitles() throws Exception { PatternFinderAnalyzer pf = new PatternFinderAnalyzer(); MockInputColumn<String> column = new MockInputColumn<String>("title", String.class); pf.setRowAnnotationFactory(new InMemoryRowAnnotationFactory()); pf.setColumn(column); pf.setDiscriminateTextCase(true); pf.init(); pf.run(new MockInputRow().put(column, "Sales director"), 1); String[] resultLines; resultLines = new PatternFinderResultTextRenderer().render(pf.getResult()).split("\n"); assertEquals(2, resultLines.length); assertEquals(" Match count Sample ", resultLines[0]); assertEquals("Aaaaa aaaaaaaa 1 Sales director ", resultLines[1]); pf.run(new MockInputRow().put(column, "Key account manager"), 1); pf.run(new MockInputRow().put(column, "Account manager"), 1); pf.run(new MockInputRow().put(column, "Sales manager (EMEA)"), 1); resultLines = new PatternFinderResultTextRenderer().render(pf.getResult()).split("\n"); assertEquals(4, resultLines.length); assertEquals(" Match count Sample ", resultLines[0]); assertEquals("Aaaaaaa aaaaaaaa 2 Sales director ", resultLines[1]); assertEquals("Aaa aaaaaaa aaaaaaa 1 Key account manager ", resultLines[2]); assertEquals("Aaaaa aaaaaaa (AAAA) 1 Sales manager (EMEA) ", resultLines[3]); pf.run(new MockInputRow().put(column, "Sales Manager, USA"), 1); pf.run(new MockInputRow().put(column, "Account Manager (USA)"), 1); pf.run(new MockInputRow().put(column, "1st on the phone"), 1); resultLines = new PatternFinderResultTextRenderer().render(pf.getResult()).split("\n"); assertEquals(7, resultLines.length); assertEquals(" Match count Sample ", resultLines[0]); assertEquals("Aaaaaaa aaaaaaaa 2 Sales director ", resultLines[1]); assertEquals("??? aa aaa aaaaa 1 1st on the phone ", resultLines[2]); assertEquals("Aaa aaaaaaa aaaaaaa 1 Key account manager ", resultLines[3]); assertEquals("Aaaaa Aaaaaaa, AAA 1 Sales Manager, USA ", resultLines[4]); assertEquals("Aaaaa aaaaaaa (AAAA) 1 Sales manager (EMEA) ", resultLines[5]); assertEquals("Aaaaaaa Aaaaaaa (AAA) 1 Account Manager (USA) ", resultLines[6]); } public void testEmailAddresses() throws Exception { PatternFinderAnalyzer pf = new PatternFinderAnalyzer(); MockInputColumn<String> column = new MockInputColumn<String>("title", String.class); pf.setRowAnnotationFactory(new InMemoryRowAnnotationFactory()); pf.setColumn(column); pf.setDiscriminateTextCase(true); pf.init(); pf.run(new MockInputRow().put(column, "kasper@eobjects.dk"), 1); pf.run(new MockInputRow().put(column, "kasper.sorensen@eobjects.dk"), 1); pf.run(new MockInputRow().put(column, "john@doe.com"), 1); pf.run(new MockInputRow().put(column, ""), 3); pf.run(new MockInputRow().put(column, "john.doe@company.com"), 1); pf.run(new MockInputRow().put(column, null), 1); pf.run(new MockInputRow().put(column, null), 1); String[] resultLines = new PatternFinderResultTextRenderer().render(pf.getResult()).split("\n"); assertEquals(5, resultLines.length); assertEquals(" Match count Sample ", resultLines[0]); assertEquals("<blank> 3 <blank> ", resultLines[1]); assertEquals("<null> 2 <null> ", resultLines[2]); assertEquals("aaaaaa.aaaaaaaa@aaaaaaaa.aaa 2 kasper.sorensen@eobjects.dk ", resultLines[3]); assertEquals("aaaaaa@aaaaaaaa.aaa 2 kasper@eobjects.dk ", resultLines[4]); } public void testGroupEmailByDomain() throws Exception { PatternFinderAnalyzer pf = new PatternFinderAnalyzer(); MockInputColumn<String> col1 = new MockInputColumn<String>("username", String.class); MockInputColumn<String> col2 = new MockInputColumn<String>("domain", String.class); pf.setRowAnnotationFactory(new InMemoryRowAnnotationFactory()); pf.setColumn(col1); pf.setGroupColumn(col2); pf.setDiscriminateTextCase(true); pf.init(); String[] resultLines; resultLines = new PatternFinderResultTextRenderer().render(pf.getResult()).split("\n"); assertEquals(1, resultLines.length); assertEquals("No patterns found", resultLines[0]); pf.run(new MockInputRow().put(col1, "kasper").put(col2, null), 1); resultLines = new PatternFinderResultTextRenderer().render(pf.getResult()).split("\n"); assertEquals("Patterns for group: null", resultLines[0]); assertEquals(" Match count Sample ", resultLines[1]); assertEquals("aaaaaa 1 kasper ", resultLines[2]); assertEquals(3, resultLines.length); pf.run(new MockInputRow().put(col1, "kasper").put(col2, "eobjects.dk"), 1); pf.run(new MockInputRow().put(col1, "kasper.sorensen").put(col2, "eobjects.dk"), 1); pf.run(new MockInputRow().put(col1, "kaspersorensen").put(col2, "eobjects.dk"), 1); pf.run(new MockInputRow().put(col1, "john").put(col2, "company.com"), 1); pf.run(new MockInputRow().put(col1, "doe").put(col2, "company.com"), 1); resultLines = new PatternFinderResultTextRenderer().render(pf.getResult()).split("\n"); assertEquals("Patterns for group: null", resultLines[0]); assertEquals(" Match count Sample ", resultLines[1]); assertEquals("aaaaaa 1 kasper ", resultLines[2]); assertEquals("", resultLines[3]); assertEquals("Patterns for group: company.com", resultLines[4]); assertEquals(" Match count Sample ", resultLines[5]); assertEquals("aaaa 2 john ", resultLines[6]); assertEquals("", resultLines[7]); assertEquals("Patterns for group: eobjects.dk", resultLines[8]); assertEquals(" Match count Sample ", resultLines[9]); assertEquals("aaaaaaaaaaaaaa 2 kasper ", resultLines[10]); assertEquals("aaaaaa.aaaaaaaa 1 kasper.sorensen ", resultLines[11]); assertEquals(12, resultLines.length); } }