/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.junit.Before;
import org.junit.Test;
/**
 * Test cases for {@link StopListMarker}.
 * <p>
 * Each test builds a {@link PreprocessingContext} using a
 * {@link BasicPreprocessingPipeline} and then checks the exact token type
 * flags recorded for individual words.
 */
public class WordMarkerTest extends PreprocessingContextTestBase
{
    /** Context builder, recreated before every test method. */
    PreprocessingContextBuilder contextBuilder;

    /**
     * Creates a fresh context builder configured with a
     * {@link BasicPreprocessingPipeline}.
     */
    @Before
    public void prepareContextBuilder()
    {
        final BasicPreprocessingPipeline pipeline = new BasicPreprocessingPipeline();
        contextBuilder = new PreprocessingContextBuilder()
            .withPreprocessingPipeline(pipeline);
    }

    // @formatter:off

    /**
     * The words "data" and "mining" are expected to carry the plain
     * {@link ITokenizer#TT_TERM} type with no additional flags.
     */
    @Test
    public void testNonStopWords()
    {
        final PreprocessingContext ctx =
            contextBuilder.newDoc("data mining", "data mining").buildContext();

        assertWordsHaveExactType(ctx, ITokenizer.TT_TERM, "data", "mining");
    }

    /**
     * The words "this", "you", "have" and "are" are expected to carry
     * {@link ITokenizer#TT_TERM} combined with
     * {@link ITokenizer#TF_COMMON_WORD}.
     */
    @Test
    public void testStopWords()
    {
        final PreprocessingContext ctx =
            contextBuilder.newDoc("this you", "have are").buildContext();

        final int commonTerm = ITokenizer.TT_TERM | ITokenizer.TF_COMMON_WORD;
        assertWordsHaveExactType(ctx, commonTerm, "this", "you", "have", "are");
    }

    // @formatter:on

    /**
     * Asserts, in the given order, that every listed word is present in the
     * context and has exactly the expected token type.
     *
     * @param ctx the preprocessing context under inspection
     * @param expectedType the exact token type value each word must have
     * @param words the word images to look up
     */
    private void assertWordsHaveExactType(PreprocessingContext ctx, int expectedType,
        String... words)
    {
        for (String word : words)
        {
            assertThat(ctx).containsWord(word)
                .withExactTokenType(expectedType);
        }
    }
}