package org.wikibrain.phrases;

import org.apache.commons.collections.IteratorUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.lang.IdentityStringNormalizer;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.StringNormalizer;
import org.wikibrain.lucene.LuceneStringNormalizer;
import org.wikibrain.lucene.TokenizerOptions;

import java.io.File;
import java.io.IOException;
import java.util.*;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;

/**
 * Tests for the count pruners and for the object-db backed {@link PhraseAnalyzerDao}.
 *
 * All {@code assertEquals} calls follow JUnit's {@code (expected, actual)} order so
 * that failure messages report the two values the right way round.
 *
 * @author Shilad Sen
 */
public class TestPhraseAnalyzerDao {

    /**
     * Checks {@link SimplePruner} on integer keys with three different restrictions.
     * Judging by the expectations below, the constructor arguments appear to be
     * (minimum count, maximum number of entries, minimum fraction of the total);
     * verify against SimplePruner itself. In every case the reported total stays
     * at 15 (the sum of all raw counts) and entries come back ordered by
     * descending count.
     */
    @Test
    public void testPrunedPageCounts() {
        Map<Integer, Integer> rr = new HashMap<Integer, Integer>();
        rr.put(3424, 1);
        rr.put(31, 4);
        rr.put(999, 10);

        // No effective restriction: everything is kept.
        PrunedCounts<Integer> p1 = new SimplePruner<Integer>(0, 10000, 0.0).prune(rr);
        assertEquals(3, p1.size());
        assertEquals(15, p1.getTotal());
        assertEquals(Arrays.asList(999, 31, 3424), new ArrayList<Integer>(p1.keySet()));
        assertEquals(Arrays.asList(10, 4, 1), new ArrayList<Integer>(p1.values()));

        // Second argument 2: only the two largest entries survive.
        PrunedCounts<Integer> p2 = new SimplePruner<Integer>(0, 2, 0.0).prune(rr);
        assertEquals(2, p2.size());
        assertEquals(15, p2.getTotal());
        assertEquals(Arrays.asList(999, 31), new ArrayList<Integer>(p2.keySet()));
        assertEquals(Arrays.asList(10, 4), new ArrayList<Integer>(p2.values()));

        // First argument 3: the entry with count 1 is dropped.
        PrunedCounts<Integer> p3 = new SimplePruner<Integer>(3, 10000, 0.0).prune(rr);
        assertEquals(2, p3.size());
        assertEquals(15, p3.getTotal());
        assertEquals(Arrays.asList(999, 31), new ArrayList<Integer>(p3.keySet()));
        assertEquals(Arrays.asList(10, 4), new ArrayList<Integer>(p3.values()));

        // Third argument 0.25: the entry with count 1 (1/15 of the total) is dropped.
        PrunedCounts<Integer> p4 = new SimplePruner<Integer>(0, 10000, 0.25).prune(rr);
        assertEquals(2, p4.size());
        assertEquals(15, p4.getTotal());
        assertEquals(Arrays.asList(999, 31), new ArrayList<Integer>(p4.keySet()));
        assertEquals(Arrays.asList(10, 4), new ArrayList<Integer>(p4.values()));
    }

    /**
     * Checks that {@link NormalizedStringPruner} merges the counts of "z" (3) and
     * "Z." (8) into a single entry keyed "Z." with count 11, while "Y" and "wAz"
     * stay separate and the total remains the sum of all raw counts (24).
     */
    @Test
    public void testPrunedPhraseCounts() {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        counts.put("z", 3);
        counts.put("wAz", 4);
        counts.put("Z.", 8);
        counts.put("Y", 9);

        PrunedCounts<String> p1 = new NormalizedStringPruner(0, 10000, 0.0).prune(counts);
        assertEquals(3, p1.size());
        assertEquals(24, p1.getTotal());
        assertEquals(Arrays.asList("Z.", "Y", "wAz"), new ArrayList<String>(p1.keySet()));
        assertEquals(Arrays.asList(11, 9, 4), new ArrayList<Integer>(p1.values()));
    }

    /**
     * Round-trips phrase counts and page counts through a
     * {@link PhraseAnalyzerObjectDbDao}, then closes and reopens the DAO on the
     * same path and checks that the stored phrase is still listed.
     *
     * Each DAO instance is closed in a {@code finally} block so that a failing
     * assertion does not leave the database open.
     *
     * @throws IOException  if the temporary path cannot be created or registered for deletion
     * @throws DaoException if the DAO reports a failure
     */
    @Test
    public void testDao() throws IOException, DaoException {
        // Only the unique path is wanted; the empty file itself is removed again
        // before the DAO is pointed at that path.
        File tmp = File.createTempFile("testdb", ".db", null);
        tmp.delete();

        StringNormalizer normalizer = new LuceneStringNormalizer(
                new TokenizerOptions(true, false, false), Version.LUCENE_43);
        Language en = Language.getByLangCode("en");

        PhraseAnalyzerDao dao = new PhraseAnalyzerObjectDbDao(normalizer, tmp, true);
        try {
            FileUtils.forceDeleteOnExit(tmp);

            PrunedCounts<Integer> c1 = new PrunedCounts<Integer>(12);
            c1.put(349, 7);
            c1.put(3121, 3);
            dao.savePhraseCounts(en, "FOo!", c1);

            PrunedCounts<String> c2 = new PrunedCounts<String>(13);
            c2.put("Bar", 9);
            c2.put("baz", 3);
            c2.put("boof", 1);
            dao.savePageCounts(en, 3214, c2);

            // Unknown page ids and phrases yield null.
            assertNull(dao.getPageCounts(en, 34321, 19));
            assertNull(dao.getPhraseCounts(en, "sadfas", 19));

            // Saved as "FOo!" but looked up as "fOO-": both must normalize to the same key.
            PrunedCounts<Integer> c3 = dao.getPhraseCounts(en, "fOO-", 5);
            assertNotNull(c3);
            assertEquals(2, c3.size());
            assertEquals(12, c3.getTotal());
            assertEquals(Arrays.asList(349, 3121), new ArrayList<Integer>(c3.keySet()));
            assertEquals(Arrays.asList(7, 3), new ArrayList<Integer>(c3.values()));

            // A limit of 1 truncates the entries but leaves the total untouched.
            PrunedCounts<Integer> c4 = dao.getPhraseCounts(en, "fOO-", 1);
            assertNotNull(c4);
            assertEquals(1, c4.size());
            assertEquals(12, c4.getTotal());
            assertEquals(Arrays.asList(349), new ArrayList<Integer>(c4.keySet()));
            assertEquals(Arrays.asList(7), new ArrayList<Integer>(c4.values()));

            PrunedCounts<String> c5 = dao.getPageCounts(en, 3214, 5);
            assertNotNull(c5);
            assertEquals(3, c5.size());
            assertEquals(13, c5.getTotal());
            assertEquals(Arrays.asList("Bar", "baz", "boof"), new ArrayList<String>(c5.keySet()));
            assertEquals(Arrays.asList(9, 3, 1), new ArrayList<Integer>(c5.values()));

            PrunedCounts<String> c6 = dao.getPageCounts(en, 3214, 2);
            assertNotNull(c6);
            assertEquals(2, c6.size());
            assertEquals(13, c6.getTotal());
            assertEquals(Arrays.asList("Bar", "baz"), new ArrayList<String>(c6.keySet()));
            assertEquals(Arrays.asList(9, 3), new ArrayList<Integer>(c6.values()));

            assertStoredPhrases(dao, en);
        } finally {
            dao.close();
        }

        // Reopen the same path (third argument false instead of true) and make sure
        // the stored phrase data is still there.
        PhraseAnalyzerDao reopened = new PhraseAnalyzerObjectDbDao(normalizer, tmp, false);
        try {
            assertStoredPhrases(reopened, en);
        } finally {
            reopened.close();
        }
    }

    /**
     * Asserts that the DAO lists exactly one phrase, "foo", for the given language
     * and that its counts have two entries with page 349 mapped to 7.
     */
    @SuppressWarnings("unchecked") // IteratorUtils.toList returns a raw List
    private static void assertStoredPhrases(PhraseAnalyzerDao dao, Language language)
            throws IOException, DaoException {
        List<String> phrases = IteratorUtils.toList(dao.getAllPhrases(language));
        System.out.println("phrases are " + phrases);
        assertEquals(Arrays.asList("foo"), phrases);

        List<Pair<String, PrunedCounts<Integer>>> phraseCounts =
                IteratorUtils.toList(dao.getAllPhraseCounts(language));
        assertEquals(1, phraseCounts.size());

        Pair<String, PrunedCounts<Integer>> only = phraseCounts.get(0);
        assertEquals("foo", only.getKey());
        assertEquals(2, only.getValue().size());
        assertEquals(Integer.valueOf(7), (Integer) only.getValue().get(349));
    }
}