package org.apache.solr.analysis.author;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;

import org.apache.solr.analysis.author.AuthorUtils;

import junit.framework.TestCase;

/**
 * JUnit 3 tests for the static helper methods of {@link AuthorUtils}.
 *
 * <p>Every test passes fixed author-name strings to one helper and compares
 * the result with hard-coded expected values. Names use the
 * {@code "Last, First Middle"} layout throughout.
 */
// TODO: fix the errors, these methods should not be part of public API
// they are confusing when public
public class TestAuthorUtils extends TestCase {

  /**
   * Builds a {@link HashSet} containing the given strings, added in argument
   * order. Not picked up by the JUnit 3 runner (not public, not named test*).
   *
   * @param values the strings to put into the set
   * @return a new mutable set holding {@code values}
   */
  private static HashSet<String> setOf(String... values) {
    HashSet<String> set = new HashSet<String>();
    for (String value : values) {
      set.add(value);
    }
    return set;
  }

  /**
   * Checks the expected output of {@code AuthorUtils.normalizeAuthor} for
   * names containing trailing periods, surrounding whitespace, accented
   * letters, apostrophes, hyphens, underscores and other punctuation.
   */
  public void testNormalizeAuthor() {
    assertEquals("Kurtz, Michael", AuthorUtils.normalizeAuthor("Kurtz, Michael"));
    assertEquals("Huchra, J", AuthorUtils.normalizeAuthor("Huchra, J."));
    assertEquals("Gomez, Hector Q", AuthorUtils.normalizeAuthor(" Gomez, Hector Q "));
    assertEquals("Gómez, Hector Q", AuthorUtils.normalizeAuthor("Gómez, Hector Q"));
    assertEquals("Foo Eye, Bar", AuthorUtils.normalizeAuthor("Foo'Eye, Bar"));
    assertEquals("Radio, F M", AuthorUtils.normalizeAuthor("Radio, F.M."));
    // a name without a comma is expected to come back with a trailing comma
    assertEquals("NA49 COLLABORATION,", AuthorUtils.normalizeAuthor("NA49 COLLABORATION"));
    assertEquals("29, 000 STARDUST HOME DUSTERS",
        AuthorUtils.normalizeAuthor("29, 000 STARDUST HOME DUSTERS"));

    // unicode character vs unicode+accent
    // U+0061 (a) + U+0300
    // U+00E0 (à)
    // both forms are expected to come back unchanged apart from the dropped period
    assertEquals("G\u0061\u0300mez, Hector Q",
        AuthorUtils.normalizeAuthor("G\u0061\u0300mez, Hector Q."));
    assertEquals("G\u00E0mez, Hector Q",
        AuthorUtils.normalizeAuthor("G\u00E0mez, Hector Q."));

    assertEquals("hey, joe", AuthorUtils.normalizeAuthor("hey,_joe"));
    assertEquals("hey, joe", AuthorUtils.normalizeAuthor("hey, joe$#@^&*!!!"));
    assertEquals("o sullivan, mike", AuthorUtils.normalizeAuthor("o'sullivan, mike"));
    assertEquals("o sullivan, mike", AuthorUtils.normalizeAuthor("o' sullivan, mike"));
    assertEquals("mc donald, co", AuthorUtils.normalizeAuthor("mc'donald, co(.)"));
    assertEquals("GómezFoo, He ctor 29Q",
        AuthorUtils.normalizeAuthor("%$Gómez_Foo, He-ctor; 29Q."));
    assertEquals("Gómez, Hector Q", AuthorUtils.normalizeAuthor(" Gómez,\n Hector Q "));

    // hyphenated initials and given names, with and without stray spaces
    assertEquals("Moon, D S", AuthorUtils.normalizeAuthor("Moon, D. -S."));
    assertEquals("Moon, D S", AuthorUtils.normalizeAuthor("Moon, D.-S."));
    assertEquals("Moon, D S", AuthorUtils.normalizeAuthor("Moon, D.--S."));
    assertEquals("Moon, D S", AuthorUtils.normalizeAuthor("Moon, D. -S.-"));
    assertEquals("Moon, Dae Sik", AuthorUtils.normalizeAuthor("Moon, Dae-Sik"));
    assertEquals("Moon, Dae Sik", AuthorUtils.normalizeAuthor("Moon, Dae -Sik"));
    assertEquals("Moon, Dae Sik", AuthorUtils.normalizeAuthor("Moon, Dae - Sik "));
  }

  /**
   * Checks that {@code AuthorUtils.parseAuthor} yields a map with the keys
   * {@code last}, {@code first} and {@code middle} for "Hoover, Herbert C.".
   *
   * @throws Exception declared by the original test; propagated to the runner
   */
  public void testParseAuthor() throws Exception {
    HashMap<String,String> expected = new HashMap<String,String>();
    expected.put("last", "Hoover");
    expected.put("first", "Herbert");
    expected.put("middle", "C");
    assertEquals(expected, AuthorUtils.parseAuthor("Hoover, Herbert C."));
  }

  /**
   * Checks the variants {@code AuthorUtils.getAsciiTransliteratedVariants}
   * is expected to produce for names with one accented letter. The results
   * are compared as sets, so order and duplicates are ignored.
   */
  public void testASCIIFolding() {
    HashSet<String> expected = setOf("MULLER, BILL", "MUELLER, BILL");
    ArrayList<String> actual = AuthorUtils.getAsciiTransliteratedVariants("MÜLLER, BILL");
    assertEquals(expected, new HashSet<String>(actual));

    expected = setOf("GOMEZ, HECTOR Q", "GOEMEZ, HECTOR Q");
    actual = AuthorUtils.getAsciiTransliteratedVariants("GÓMEZ, HECTOR Q");
    assertEquals(expected, new HashSet<String>(actual));
  }

  /**
   * Checks the expected output of {@code AuthorUtils.transliterateAccents}
   * for single accented characters.
   */
  public void testTransliterate() {
    HashMap<String,String> testMap = new HashMap<String,String>();
    testMap.put("Ü", "UE");
    testMap.put("ä", "ae");
    testMap.put("č", "ch");
    for (String k : testMap.keySet()) {
      String expected = testMap.get(k);
      String actual = AuthorUtils.transliterateAccents(k);
      // name the input in the message: map iteration order is unspecified,
      // so a bare failure would not say which character broke
      assertEquals("transliteration of '" + k + "'", expected, actual);
    }
  }

  // for reference implementation
  // see: (no reference was given in the original comment)

  /**
   * Checks that {@code AuthorUtils.translitRussianApostrophes} is expected to
   * return FOOIEYE, FOOYEYE and FOOEYE for the input FOO'EYE.
   */
  public void testTransRussianApostrophes() {
    HashSet<String> expected = setOf("FOOIEYE, BAR", "FOOYEYE, BAR", "FOOEYE, BAR");
    HashSet<String> input = setOf("FOO'EYE, BAR");
    HashSet<String> actual = AuthorUtils.translitRussianApostrophes(input.iterator());
    assertEquals(expected, actual);
  }

  /**
   * Checks that {@code AuthorUtils.translitRussianLastNames1} is expected to
   * return the YEV, JEV and IEV endings for a last name ending in EV.
   */
  public void testTransRussianLastNames1() {
    HashSet<String> expected = setOf("FOOYEV, BAR", "FOOJEV, BAR", "FOOIEV, BAR");
    HashSet<String> input = setOf("FOOEV, BAR");
    HashSet<String> actual = AuthorUtils.translitRussianLastNames1(input.iterator());
    assertEquals(expected, actual);
  }

  /**
   * Checks that {@code AuthorUtils.translitRussianLastNames2} is expected to
   * return the NIIA and NIYA endings for a last name ending in NIA.
   */
  public void testTransRussianLastNames2() {
    HashSet<String> expected = setOf("FOONIIA, BAR", "FOONIYA, BAR");
    HashSet<String> input = setOf("FOONIA, BAR");
    HashSet<String> actual = AuthorUtils.translitRussianLastNames2(input.iterator());
    assertEquals(expected, actual);
  }

  /**
   * Checks that {@code AuthorUtils.translitRussianLastNames3} is expected to
   * return the DYAN, DIAN and DJAN endings for a last name ending in DJAN.
   */
  public void testTransRussianLastNames3() {
    HashSet<String> expected = setOf("FOODYAN, BAR", "FOODIAN, BAR", "FOODJAN, BAR");
    HashSet<String> input = setOf("FOODJAN, BAR");
    HashSet<String> actual = AuthorUtils.translitRussianLastNames3(input.iterator());
    assertEquals(expected, actual);
  }

  /**
   * Checks that {@code AuthorUtils.translitRussianLastNames4} is expected to
   * return the KAYA, KAJA and KAIA endings for a last name ending in KAYA.
   */
  public void testTransRussianLastNames4() {
    HashSet<String> expected = setOf("FOOKAYA, BAR", "FOOKAJA, BAR", "FOOKAIA, BAR");
    HashSet<String> input = setOf("FOOKAYA, BAR");
    HashSet<String> actual = AuthorUtils.translitRussianLastNames4(input.iterator());
    assertEquals(expected, actual);
  }

  /**
   * Checks that {@code AuthorUtils.translitRussianLastNames5} is expected to
   * return six ending variants for each of two last names ending in KI and VI.
   */
  public void testTransRussianLastNames5() {
    HashSet<String> expected = setOf(
        "FOOKYI, BAR",
        "FOOKII, BAR",
        "FOOKY, BAR",
        "FOOKI, BAR",
        "FOOKIY, BAR",
        "FOOKIJ, BAR",
        "FOOVYI, BAR",
        "FOOVII, BAR",
        "FOOVY, BAR",
        "FOOVI, BAR",
        "FOOVIY, BAR",
        "FOOVIJ, BAR");
    HashSet<String> input = setOf("FOOKI, BAR", "FOOVI, BAR");
    HashSet<String> actual = AuthorUtils.translitRussianLastNames5(input.iterator());
    assertEquals(expected, actual);
  }

  /**
   * Checks that {@code AuthorUtils.translitRussianFirstNames} is expected to
   * return both the Y- and I-initial spelling of each first name, whichever
   * of the two the input used.
   */
  public void testTransRussianFirstNames() {
    HashSet<String> expected = setOf(
        "FOOBAR, YURI",
        "FOOBAR, IURI",
        "FOOBAR, YAGNI",
        "FOOBAR, IAGNI");
    HashSet<String> input = setOf("FOOBAR, YURI", "FOOBAR, IAGNI");
    HashSet<String> actual = AuthorUtils.translitRussianFirstNames(input.iterator());
    assertEquals(expected, actual);
  }

  /**
   * Checks that {@code AuthorUtils.transliterateRussianNames} is expected to
   * return every combination of six last-name variants and two first-name
   * variants for the single input "FOOVI, YURI".
   */
  public void testTransRussianNames() {
    HashSet<String> expected = setOf(
        "FOOVIY, IURI",
        "FOOVIY, YURI",
        "FOOVI, YURI",
        "FOOVYI, YURI",
        "FOOVYI, IURI",
        "FOOVIJ, IURI",
        "FOOVY, IURI",
        "FOOVIJ, YURI",
        "FOOVY, YURI",
        "FOOVI, IURI",
        "FOOVII, IURI",
        "FOOVII, YURI");
    HashSet<String> actual =
        AuthorUtils.transliterateRussianNames(new String[] {"FOOVI, YURI"});
    assertEquals(expected, actual);
  }

  /**
   * Checks the full variant set expected from
   * {@code AuthorUtils.getAsciiTransliteratedVariants} for a name that has
   * both an apostrophe in the last name and an accented letter in the first
   * name.
   */
  public void testGenSynonyms() {
    HashSet<String> expected = setOf(
        "FOO'EYE, BAR",
        "FOO'EYE, BAER",
        "FOOIEYE, BÄR",
        "FOOYEYE, BÄR",
        "FOOEYE, BÄR",
        "FOOIEYE, BAR",
        "FOOYEYE, BAR",
        "FOOEYE, BAR",
        "FOOIEYE, BAER",
        "FOOYEYE, BAER",
        "FOOEYE, BAER");
    HashSet<String> actual =
        new HashSet<String>(AuthorUtils.getAsciiTransliteratedVariants("FOO'EYE, BÄR"));
    assertEquals(expected, actual);
  }
}