/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import static org.carrot2.text.analysis.ITokenizer.*;
import org.carrot2.util.tests.CarrotTestCase;
import org.junit.Before;
import org.junit.Test;
/**
 * Language-independent test cases for {@link LanguageModelStemmer}.
 *
 * <p>Every test builds a preprocessing context from a handful of synthetic
 * documents using {@link TestStemmerFactory} and {@link TestLexicalDataFactory},
 * and then verifies either the collected stem statistics or the per-token
 * properties exposed by {@link PreprocessingContextAssert}.</p>
 */
public class StemmerSyntheticTest extends CarrotTestCase
{
    /** Builder re-created before each test by {@link #prepareContextBuilder()}. */
    PreprocessingContextBuilder contextBuilder;

    // @formatter:off

    /**
     * Creates a fresh context builder wired with the test stemmer factory and
     * the test lexical data factory.
     */
    @Before
    public void prepareContextBuilder()
    {
        this.contextBuilder = new PreprocessingContextBuilder()
            .withStemmerFactory(new TestStemmerFactory())
            .withLexicalDataFactory(new TestLexicalDataFactory());
    }

    /**
     * A context built without any documents is expected to contain no stems.
     */
    @Test
    public void testEmpty()
    {
        assertThat(contextBuilder.buildContext().allStems.image.length)
            .isEqualTo(0);
    }

    /**
     * One document, one word per field: each word is expected to produce its
     * own stem, seen once, in the field the word came from.
     */
    @Test
    public void testSingleStems()
    {
        final PreprocessingContextAssert ctx = contextBuilder
            .newDoc("abc", "bcd")
            .buildContextAssert();

        ctx.constainsStem("a")
            .withTf(1)
            .withDocumentTf(0, 1)
            .withFieldIndices(0);
        ctx.constainsStem("b")
            .withTf(1)
            .withDocumentTf(0, 1)
            .withFieldIndices(1);

        expectStemCount(ctx, 2);
        expectStemImages(ctx, "a", null, "b", null);
    }

    /**
     * Repeated occurrences of the same word are expected to add up in the
     * stem's term frequency.
     */
    @Test
    public void testFrequentSingleStems()
    {
        final PreprocessingContextAssert ctx = contextBuilder
            .newDoc("abc abc", "bcd bcd bcd")
            .buildContextAssert();

        ctx.constainsStem("a")
            .withTf(2)
            .withDocumentTf(0, 2)
            .withFieldIndices(0);
        ctx.constainsStem("b")
            .withTf(3)
            .withDocumentTf(0, 3)
            .withFieldIndices(1);

        expectStemCount(ctx, 2);
        expectStemImages(ctx, "a", "a", null, "b", "b", "b", null);
    }

    /**
     * Different words mapped to the same stem are expected to be aggregated
     * into one stem entry, with frequencies summed and field indices merged.
     */
    @Test
    public void testOriginalFrequencyAggregation()
    {
        final PreprocessingContextAssert ctx = contextBuilder
            .newDoc("abc acd bcd", "ade bof")
            .buildContextAssert();

        ctx.constainsStem("a")
            .withTf(3)
            .withDocumentTf(0, 3)
            .withFieldIndices(0, 1);
        ctx.constainsStem("b")
            .withTf(2)
            .withDocumentTf(0, 2)
            .withFieldIndices(0, 1);

        expectStemCount(ctx, 2);
        expectStemImages(ctx, "a", "a", "b", null, "a", "b", null);
    }

    /**
     * The expected stem images here are the words themselves.
     * NOTE(review): judging by the test name, the test stemmer presumably
     * returns no stem for these two-letter words -- confirm against
     * {@link TestStemmerFactory}.
     */
    @Test
    public void testNullStems()
    {
        final PreprocessingContextAssert ctx = contextBuilder
            .newDoc("aa ab", "aa bc")
            .buildContextAssert();

        ctx.constainsStem("aa")
            .withTf(2)
            .withDocumentTf(0, 2)
            .withFieldIndices(0, 1);
        ctx.constainsStem("ab")
            .withTf(1)
            .withDocumentTf(0, 1)
            .withFieldIndices(0);
        ctx.constainsStem("bc")
            .withTf(1)
            .withDocumentTf(0, 1)
            .withFieldIndices(1);

        expectStemCount(ctx, 3);
        expectStemImages(ctx, "aa", "ab", null, "aa", "bc", null);
    }

    /**
     * Several documents, one of them with a <code>null</code> first field:
     * verifies the exact per-document term frequencies recorded for each stem.
     */
    @Test
    public void testWordTfByDocumentAggregation()
    {
        final PreprocessingContextAssert ctx = contextBuilder
            .newDoc("abc acd ade")
            .newDoc("ade", "bcd bof")
            .newDoc(null, "bcd")
            .newDoc("ade", "bof")
            .buildContextAssert();

        ctx.constainsStem("a")
            .withTf(5)
            .withExactDocumentTfs(new int [][] {{0, 3}, {1, 1}, {3, 1}})
            .withFieldIndices(0);
        ctx.constainsStem("b")
            .withTf(4)
            .withExactDocumentTfs(new int [][] {{1, 2}, {2, 1}, {3, 1}})
            .withFieldIndices(1);

        expectStemCount(ctx, 2);

        // One row per input document, in the order the documents were added.
        expectStemImages(ctx,
            "a", "a", "a", null,
            "a", null, "b", "b", null,
            "b", null,
            "a", null, "b", null);
    }

    /**
     * Every document word also occurs in the query, so every term token is
     * expected to carry the query-word flag.
     */
    @Test
    public void testAllQueryWords()
    {
        final PreprocessingContextAssert ctx = contextBuilder
            .newDoc("q1 q2", "q3")
            .withQuery("q1 q2 q3")
            .buildContextAssert();

        expectWordTypes(ctx,
            TF_QUERY_WORD | TT_TERM, TF_QUERY_WORD | TT_TERM, null,
            TF_QUERY_WORD | TT_TERM, null);
    }

    /**
     * Only the document words that also occur in the query are expected to
     * carry the query-word flag.
     */
    @Test
    public void testSomeQueryWords()
    {
        final PreprocessingContextAssert ctx = contextBuilder
            .newDoc("test q2", "aa q1")
            .withQuery("q1 q2 q3")
            .buildContextAssert();

        expectWordTypes(ctx,
            TT_TERM, TF_QUERY_WORD | TT_TERM, null,
            TT_TERM, TF_QUERY_WORD | TT_TERM, null);
    }

    /**
     * The query shares no word with the document, so no token is expected to
     * carry the query-word flag.
     */
    @Test
    public void testNoQueryWords()
    {
        final PreprocessingContextAssert ctx = contextBuilder
            .newDoc("q2", "aa q1")
            .withQuery("q3")
            .buildContextAssert();

        expectWordTypes(ctx,
            TT_TERM, null,
            TT_TERM, TT_TERM, null);
    }

    /**
     * With an empty query string no token is expected to carry the
     * query-word flag.
     */
    @Test
    public void testBlankQuery()
    {
        final PreprocessingContextAssert ctx = contextBuilder
            .newDoc("q2", "aa q1")
            .withQuery("")
            .buildContextAssert();

        expectWordTypes(ctx,
            TT_TERM, null,
            TT_TERM, TT_TERM, null);
    }

    /**
     * With a <code>null</code> query no token is expected to carry the
     * query-word flag.
     */
    @Test
    public void testNullQuery()
    {
        final PreprocessingContextAssert ctx = contextBuilder
            .newDoc("q2", "aa q1")
            .withQuery(null)
            .buildContextAssert();

        expectWordTypes(ctx,
            TT_TERM, null,
            TT_TERM, TT_TERM, null);
    }

    /**
     * Neither "que01" nor "que02" literally equals the query word "que04",
     * yet both are expected to be flagged as query words.
     * NOTE(review): presumably because all three share a stem under the test
     * stemmer -- confirm against {@link TestStemmerFactory}.
     */
    @Test
    public void testDifferentStemsInQuery()
    {
        final PreprocessingContextAssert ctx = contextBuilder
            .newDoc("que01 que02", "test word")
            .withQuery("que04")
            .buildContextAssert();

        expectWordTypes(ctx,
            TT_TERM | TF_QUERY_WORD, TT_TERM | TF_QUERY_WORD, null,
            TT_TERM, TT_TERM, null);
    }

    /**
     * Asserts the number of distinct stem images collected in the context.
     */
    private void expectStemCount(PreprocessingContextAssert ctx, int expectedCount)
    {
        assertThat(ctx.context.allStems.image.length).isEqualTo(expectedCount);
    }

    /**
     * Asserts the exact sequence of <code>stemImage</code> values over all
     * tokens. The <code>null</code> entries in the expected sequence account
     * for field-separator markers.
     */
    private void expectStemImages(PreprocessingContextAssert ctx, Object... expectedImages)
    {
        assertThat(ctx.tokens()).onProperty("stemImage")
            .containsExactly(expectedImages);
    }

    /**
     * Asserts the exact sequence of <code>wordType</code> values over all
     * tokens. Separator tokens are listed as <code>null</code> in the
     * expected sequence.
     */
    private void expectWordTypes(PreprocessingContextAssert ctx, Object... expectedTypes)
    {
        assertThat(ctx.tokens()).onProperty("wordType")
            .containsExactly(expectedTypes);
    }

    // @formatter:on
}