package org.wikibrain.core.nlp; import org.junit.Test; import org.wikibrain.core.dao.DaoException; import org.wikibrain.core.dao.LocalPageDao; import org.wikibrain.core.lang.Language; import org.wikibrain.core.model.LocalPage; import org.wikibrain.core.model.Title; import org.wikibrain.core.nlp.Dictionary; import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.List; import static org.junit.Assert.*; import static org.mockito.Mockito.*; /** * @author Shilad Sen */ public class DictionaryTest { @Test public void testSimple() throws DaoException { Dictionary dict = new Dictionary(Language.EN, Dictionary.WordStorage.IN_MEMORY); dict.setContainsMentions(true); dict.setCountBigrams(true); dict.countRawText(TEST_CORPUS); assertEquals(429, dict.getTotalCount()); assertEquals(23, dict.getUnigramCount("the")); assertEquals(6, dict.getUnigramCount("I")); assertEquals(1, dict.getUnigramCount("veil")); assertEquals(3, dict.getBigramCount("in a")); assertEquals(3, dict.getUnigramCount("but")); assertEquals(252, dict.getNumUnigrams()); // Test mentions assertEquals(2, dict.getNumMentionedArticles()); assertEquals(1, dict.getMentionCount(3)); assertEquals(2, dict.getMentionCount(4)); // Test top unigrams List<String> top = Arrays.asList("the", "of", "and", "his", "a", "to", "in", "I", "was", "for", "own", "which"); assertEquals(top, dict.getFrequentUnigrams(12)); // Test top unigrams and mentions together // Mock a local page dao LocalPageDao lpd = mock(LocalPageDao.class); LocalPage page3 = new LocalPage(Language.EN, 3, "This_is_page_3"); LocalPage page4 = new LocalPage(Language.EN, 4, "This_is_page_4"); when(lpd.getById(Language.EN, 3)).thenReturn(page3); when(lpd.getById(Language.EN, 4)).thenReturn(page4); top = Arrays.asList("the", "of", "and", "his", "a", "/w/en/4/This_is_page_4", "/w/en/3/This_is_page_3"); assertEquals(top, dict.getFrequentUnigramsAndMentions(lpd, 5, 3, 1)); } @Test public void testReadWrite() throws IOException { Dictionary dict = new Dictionary(Language.EN, Dictionary.WordStorage.IN_MEMORY); dict.setContainsMentions(true); dict.setCountBigrams(true); dict.countRawText(TEST_CORPUS); File tmp = File.createTempFile("dict", "txt"); tmp.deleteOnExit(); tmp.delete(); dict.write(tmp); // Try reading it from disk Dictionary dict2 = new Dictionary(Language.EN, Dictionary.WordStorage.NONE); dict2.read(tmp); assertEquals(dict.getTotalCount(), dict2.getTotalCount()); assertEquals(dict.getNumUnigrams(), dict2.getNumUnigrams()); for (String word : dict.getFrequentUnigrams(Integer.MAX_VALUE)) { assertEquals(dict.getUnigramCount(word), dict2.getUnigramCount(word)); } // Try streaming the words to disk Dictionary dict3 = new Dictionary(Language.EN, Dictionary.WordStorage.ON_DISK); dict3.setContainsMentions(true); dict3.setCountBigrams(true); dict3.countRawText(TEST_CORPUS); dict.write(tmp); assertEquals(dict.getTotalCount(), dict3.getTotalCount()); assertEquals(dict.getNumUnigrams(), dict3.getNumUnigrams()); for (String word : dict.getFrequentUnigrams(Integer.MAX_VALUE)) { assertEquals(dict.getUnigramCount(word), dict3.getUnigramCount(word)); } // Try pruning the read-in result Dictionary dict4 = new Dictionary(Language.EN); dict4.read(tmp, Integer.MAX_VALUE, 3); assertEquals(dict4.getNumUnigrams(), 26); dict4 = new Dictionary(Language.EN); dict4.read(tmp, 5, 2); assertEquals(dict4.getNumUnigrams(), 5); assertEquals(dict.getTotalCount(), dict4.getTotalCount()); for (String word : dict.getFrequentUnigrams(5)) { assertEquals(dict.getUnigramCount(word), dict4.getUnigramCount(word)); } } @Test public void testPrune() throws IOException { Dictionary dict = new Dictionary(Language.EN, Dictionary.WordStorage.IN_MEMORY); dict.setMaxDictionarySize(10); dict.setContainsMentions(true); dict.setCountBigrams(true); dict.countRawText(TEST_CORPUS); assertEquals(429, dict.getTotalCount()); assertEquals(252, dict.getNumUnigrams()); assertEquals(23, dict.getUnigramCount("the")); assertEquals(6, dict.getUnigramCount("I")); assertEquals(1, dict.getUnigramCount("veil")); assertEquals(3, dict.getBigramCount("in a")); assertEquals(3, dict.getUnigramCount("but")); dict.pruneIfNecessary(); assertEquals(429, dict.getTotalCount()); assertEquals(7, dict.getNumUnigrams()); assertEquals(23, dict.getUnigramCount("the")); assertEquals(8, dict.getUnigramCount("in")); assertEquals(0, dict.getUnigramCount("I")); assertEquals(0, dict.getUnigramCount("veil")); assertEquals(0, dict.getBigramCount("in a")); assertEquals(0, dict.getUnigramCount("but")); } /** * From http://www.gutenberg.org/cache/epub/1661/pg1661.txt */ static String TEST_CORPUS = "To Sherlock Holmes she is always THE woman. I have seldom heard\n" + "him mention her under any other name. In his eyes she eclipses\n" + "and predominates the whole of her sex. It was not that he felt\n" + "any emotion akin to love for Irene Adler. All emotions, and that\n" + "one particularly, were abhorrent to his cold, precise but\n" + "admirably balanced mind. He was, I take it, the most perfect\n" + "reasoning and observing machine that the world has seen, but as a\n" + "lover he would have placed himself in a false position. He never\n" + "spoke of the softer passions, save with:/w/en/4/foo a gibe and a sneer. They\n" + "were admirable things for the observer--excellent for drawing the\n" + "veil from men's motives and actions. But for the trained reasoner\n" + "to admit such intrusions into his own delicate and finely\n" + "adjusted temperament was to introduce a distracting factor which\n" + "might throw a doubt upon all his mental results. Grit in a\n" + "sensitive instrument, or a crack in one of his own high-power\n" + "lenses, would not be more disturbing than a strong emotion in a\n" + "nature such as his. And yet there was but:/w/en/3/foo one woman to him, and\n" + "that woman was the late Irene Adler, of dubious and questionable\n" + "memory.\n" + "\n" + "I had seen little of Holmes lately. My marriage had drifted us\n" + "away from each other. My own complete happiness, and the\n" + "home-centred interests which rise up around the man who first\n" + "finds himself master of his own establishment, were sufficient to\n" + "absorb all my attention, while Holmes, who loathed every form of\n" + "society with his whole Bohemian soul, remained:/w/en/4/foo in our lodgings in\n" + "Baker Street, buried among his old books, and alternating from\n" + "week to week between cocaine and ambition, the drowsiness of the\n" + "drug, and the fierce energy of his own keen nature. He was still,\n" + "as ever, deeply attracted by the study of crime, and occupied his\n" + "immense faculties and extraordinary powers of observation in\n" + "following out those clues, and clearing up those mysteries which\n" + "had been abandoned as hopeless by the official police. From time\n" + "to time I heard some vague account of his doings: of his summons\n" + "to Odessa in the case of the Trepoff murder, of his clearing up\n" + "of the singular tragedy of the Atkinson brothers at Trincomalee,\n" + "and finally of the mission which he had accomplished so\n" + "delicately and successfully for the reigning family of Holland.\n" + "Beyond these signs of his activity, however, which I merely\n" + "shared with all the readers of the daily press, I knew little of\n" + "my former friend and companion."; }