/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing;

import java.util.Arrays;
import java.util.List;

import org.carrot2.util.attribute.AttributeUtils;
import org.carrot2.util.tests.CarrotTestCase;
import org.fest.assertions.Assertions;
import org.junit.Before;
import org.junit.Test;

/**
 * Test cases for {@link PhraseExtractor}.
 */
public class PhraseExtractorTest extends CarrotTestCase
{
    PreprocessingContextBuilder contextBuilder;

    @Before
    public void prepareContextBuilder()
    {
        contextBuilder = new PreprocessingContextBuilder();
        contextBuilder.withStemmerFactory(new TestStemmerFactory());
    }

    // @formatter:off

    @Test
    public void testEmpty()
    {
        assertThat(contextBuilder.buildContextAssert().phraseImages()).isEmpty();
    }

    @Test
    public void testNullTitleSnippet()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc(null, null)
            .buildContextAssert();

        assertThat(a.phraseImages()).isEmpty();
    }

    @Test
    public void testSinglePhrase()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("a a", "a a")
            .buildContextAssert();

        a.containsPhrase("a", "a").withTf(2).withDocumentTf(0, 2);
        assertThat(a.wordImages()).containsOnly("a");
    }

    @Test
    public void testTwoPhrasesOneDocument()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("a b", "a b")
            .buildContextAssert();

        a.containsPhrase("a", "b").withTf(2).withDocumentTf(0, 2);
    }

    @Test
    public void testSubphrasesAcrossFields()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("a b . a b", "a b c d . a b c d")
            .buildContextAssert();

        a.containsPhrase("a", "b").withTf(4);
        a.containsPhrase("b", "c").withTf(2);
        a.containsPhrase("c", "d").withTf(2);
        a.containsPhrase("a", "b", "c").withTf(2);
        a.containsPhrase("b", "c", "d").withTf(2);
        a.containsPhrase("a", "b", "c", "d").withTf(2);
        assertThat(a.phraseImages().size()).isEqualTo(6);
    }

    @Test
    public void testSubphrasesOneField()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("a b c d . a b c d")
            .buildContextAssert();

        a.containsPhrase("a", "b").withTf(2);
        a.containsPhrase("b", "c").withTf(2);
        a.containsPhrase("c", "d").withTf(2);
        a.containsPhrase("a", "b", "c").withTf(2);
        a.containsPhrase("b", "c", "d").withTf(2);
        a.containsPhrase("a", "b", "c", "d").withTf(2);
        assertThat(a.phraseImages().size()).isEqualTo(6);
    }

    @Test
    public void testNestedPhrases()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("a b c d . a b c d", "a b d . a b d")
            .buildContextAssert();

        a.containsPhrase("a", "b").withTf(4);
        a.containsPhrase("b", "c").withTf(2);
        a.containsPhrase("c", "d").withTf(2);
        a.containsPhrase("b", "d").withTf(2);
        a.containsPhrase("a", "b", "c").withTf(2);
        a.containsPhrase("a", "b", "d").withTf(2);
        a.containsPhrase("b", "c", "d").withTf(2);
        a.containsPhrase("a", "b", "c", "d").withTf(2);
        assertThat(a.phraseImages().size()).isEqualTo(8);
    }

    @Test
    public void testMaxPhraseLength()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("a b c d e f g h i", "a b c d e f g h i")
            .buildContextAssert();

        // All subsequences sized 2..MAX_PHRASE_LENGTH.
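        // Each such subsequence occurs twice (once in the title field, once in the
        // snippet), and for a given length len there are (9 - len + 1) starting
        // positions, so 'all' accumulates the number of phrases expected below.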
        List<String> sequence = Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i");
        int all = 0;
        for (int len = 2; len <= PhraseExtractor.MAX_PHRASE_LENGTH; len++)
        {
            for (int pos = 0; pos + len <= sequence.size(); pos++, all++)
            {
                a.containsPhrase(sequence.subList(pos, pos + len)).withTf(2);
            }
        }
        assertThat(a.phraseImages().size()).isEqualTo(all);
    }

    @Test
    public void testTwoExtendedPhrases()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("a b c", "a b d")
            .buildContextAssert();

        a.containsPhrase("a", "b").withTf(2);
        assertThat(a.phraseImages().size()).isEqualTo(1);
    }

    @Test
    public void testNoFrequentPhrases()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("a b c", "d e f")
            .buildContextAssert();

        assertThat(a.phraseImages()).isEmpty();
    }

    /**
     * For efficiency reasons we don't care about phrases that ARE frequent in general,
     * but do not have at least two occurrences of one specific variant.
     */
    @Test
    public void testGeneralizedPhraseWithSingleOriginals()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("abc bcd", "abd bce")
            .newDoc("abe bcf", "abf bcg")
            .buildContextAssert();

        assertThat(a.phraseImages()).isEmpty();
    }

    /**
     * Same as {@link #testGeneralizedPhraseWithSingleOriginals()}, but with more
     * documents and phrase variants.
     */
    @Test
    public void testGeneralizedPhrasesWithSingleOriginals()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("abc bcd", "abd bce")
            .newDoc("abe bcf", "abf bcg")
            .newDoc("efg fgh", "efh fgi")
            .newDoc("efi fgj", "efj fgk")
            .buildContextAssert();

        assertThat(a.phraseImages()).isEmpty();
    }

    @Test
    public void testComposition()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("abc bcd cde", "abc bcd cdf")
            .newDoc("abc bcd cdg", "abc bcd cdh")
            .buildContextAssert();

        a.containsPhraseStemmedAs("a__", "b__")
            .withTf(4)
            .withExactDocumentTfs(new int [][] {{0, 2}, {1, 2}});
        assertThat(a.phraseImages().size()).isEqualTo(1);
    }

    @Test
    public void testGeneralizedPhraseWithMultipleOriginals()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("abd bce", "abe bcf")
            .newDoc("abd bce", "abe bcf . abe bcf")
            .newDoc("abc bcd . abc bcd . abc bcd . abc bcd")
            .buildContextAssert();

        a.containsPhraseStemmedAs("a__", "b__")
            .withTf(9)
            .withExactDocumentTfs(new int [][] {{0, 2}, {1, 3}, {2, 4}});
        assertThat(a.phraseImages().size()).isEqualTo(1);
    }

    @Test
    public void testGeneralizedPhraseFrequencyAggregation()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("abc bcd", "abc bcd")
            .newDoc("abd cde", "abd cde . abe bcd . abe bcd . abe bcd")
            .buildContextAssert();

        a.containsPhraseStemmedAs("a__", "b__")
            .withTf(5)
            .withExactDocumentTfs(new int [][] {{0, 2}, {1, 3}});
        a.containsPhraseStemmedAs("a__", "c__")
            .withTf(2)
            .withExactDocumentTfs(new int [][] {{1, 2}});
        assertThat(a.phraseImages().size()).isEqualTo(2);
    }

    @Test
    public void testTermFrequencyAcrossDocuments()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("abc bcd")
            .newDoc("abc bcd cde")
            .newDoc("abc bcd cde")
            .newDoc("abc bcd cde")
            .buildContextAssert();

        a.containsPhraseStemmedAs("a__", "b__")
            .withTf(4)
            .withExactDocumentTfs(new int [][] {{0, 1}, {1, 1}, {2, 1}, {3, 1}});
        a.containsPhraseStemmedAs("b__", "c__")
            .withTf(3)
            .withExactDocumentTfs(new int [][] {{1, 1}, {2, 1}, {3, 1}});
        a.containsPhraseStemmedAs("a__", "b__", "c__")
            .withTf(3)
            .withExactDocumentTfs(new int [][] {{1, 1}, {2, 1}, {3, 1}});
    }

    @Test
    public void testOverlappingGeneralizedPhrase()
    {
        PreprocessingContextAssert a = contextBuilder
            .newDoc("abc bcd cde def", "abd bce")
            .newDoc("abd bce cde deg", "cdf deg efg . abc fgh cde def")
            .buildContextAssert();

        a.containsPhraseStemmedAs("a__", "b__")
            .withTf(2)
            .withExactDocumentTfs(new int [][] {{0, 1}, {1, 1}});
        a.containsPhraseStemmedAs("c__", "d__")
            .withTf(2)
            .withExactDocumentTfs(new int [][] {{0, 1}, {1, 1}});
    }

    @Test
    public void testDfThreshold()
    {
        PreprocessingContextAssert a = contextBuilder
            .setAttribute(AttributeUtils.getKey(PhraseExtractor.class, "dfThreshold"), 2)
            .newDoc("a a", "a a")
            .newDoc("a a . b b . c c", "a a . b b")
            .newDoc("a a", "a a . c c")
            .buildContextAssert();

        // a a
        // b b -> removed due to dfThreshold
        // c c
        a.containsPhrase("a", "a")
            .withTf(6)
            .withExactDocumentTfs(new int [][] {{0, 2}, {1, 2}, {2, 2}});
        a.containsPhrase("c", "c")
            .withTf(2)
            .withExactDocumentTfs(new int [][] {{1, 1}, {2, 1}});
        assertThat(a.phraseImages().size()).isEqualTo(2);
    }

    @Test
    public void minMaxPhraseLength()
    {
        checkPhrase("a b c d e f g h i");
    }

    @Test
    public void minMaxPhraseLengthReverseOrder()
    {
        checkPhrase("i h g f e d c b a");
    }

    private void checkPhrase(String phrase)
    {
        PreprocessingContextAssert a = contextBuilder
            .setAttribute(AttributeUtils.getKey(PhraseExtractor.class, "dfThreshold"), 2)
            .newDoc(phrase, phrase)
            .newDoc(phrase, phrase)
            .buildContextAssert();

        // All subsequences sized 2..MAX_PHRASE_LENGTH.
        List<String> sequence = Arrays.asList(phrase.split("\\s"));
        int all = 0;
        for (int len = 2; len <= PhraseExtractor.MAX_PHRASE_LENGTH; len++)
        {
            for (int pos = 0; pos + len <= sequence.size(); pos++, all++)
            {
                a.containsPhrase(sequence.subList(pos, pos + len)).withTf(4)
                    .withDocumentTf(0, 2).withDocumentTf(1, 2);
            }
        }
        Assertions.assertThat(a.phraseImages().size()).isEqualTo(all);
    }

    /**
     * Randomized sanity check: for arbitrary documents, aggregated phrase tf and
     * per-document tf counters should remain consistent.
     */
    @Test
    public void tfByDocumentAndTfSanity()
    {
        String symbols = "abcd";
        for (int reps = 0; reps < 100; reps++)
        {
            final PreprocessingContextBuilder builder = contextBuilder
                .setAttribute(AttributeUtils.getKey(PhraseExtractor.class, "dfThreshold"), 1);

            for (int docs = 1 + iterations(1, 10); docs >= 0; docs--)
            {
                int phraseSize = randomIntBetween(1, PhraseExtractor.MAX_PHRASE_LENGTH + 2);
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < phraseSize; i++)
                    sb.append(symbols.charAt(randomInt(symbols.length() - 1))).append(" ");
                builder.newDoc(sb.toString(), null);
            }

            PreprocessingContextAssert a = builder.buildContextAssert();
            a.phraseTfsCorrect();
        }
    }

    // @formatter:on
}