/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.text.preprocessing; import static org.carrot2.text.preprocessing.PreprocessingContextAssert.tokens; import static org.carrot2.text.preprocessing.PreprocessingContextAssert.MW; import static org.carrot2.text.preprocessing.PreprocessingContextAssert.DS; import static org.carrot2.text.preprocessing.PreprocessingContextAssert.FS; import static org.carrot2.text.preprocessing.PreprocessingContextAssert.EOS; import org.carrot2.text.analysis.ITokenizer; import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline; import org.carrot2.util.attribute.AttributeUtils; import org.junit.Before; import org.junit.Test; /** * Test cases for {@link CaseNormalizer}. */ public class CaseNormalizerTest extends PreprocessingContextTestBase { PreprocessingContextBuilder contextBuilder; @Before public void prepareContextBuilder() { contextBuilder = new PreprocessingContextBuilder() .withPreprocessingPipeline(new BasicPreprocessingPipeline()); } // @formatter:off @Test public void testEmpty() { PreprocessingContext ctx = contextBuilder.buildContext(); assertThat(ctx).tokens().isEmpty(); assertThat(ctx.allTokens.wordIndex).isEqualTo(new int [] {-1}); } @Test public void testOneToken() { PreprocessingContext ctx = contextBuilder .newDoc("test") .buildContext(); assertThat(ctx).containsWord("test").withTf(1).withDocumentTf(0, 1).withFieldIndices(0); assertThat(tokens(ctx)).onProperty("wordImage") .containsExactly("test", EOS); } @Test public void testMoreSingleDifferentTokens() { PreprocessingContext ctx = contextBuilder .newDoc("a simple testsymbol") .buildContext(); assertThat(ctx).containsWord("a").withTf(1).withDocumentTf(0, 1).withFieldIndices(0); assertThat(ctx).containsWord("simple").withTf(1).withDocumentTf(0, 1).withFieldIndices(0); assertThat(ctx).containsWord("testsymbol").withTf(1).withDocumentTf(0, 1).withFieldIndices(0); assertThat(tokens(ctx)).onProperty("wordImage") .containsExactly("a", "simple", "testsymbol", EOS); } @Test public void testTokenTypes() { String input = "12.2 email@email.com IEEE www.test.com file_name"; PreprocessingContext ctx = contextBuilder .newDoc(input) .buildContext(); for (String term : input.split("\\s")) assertThat(ctx).containsWord(term).withTf(1).withDocumentTf(0, 1).withFieldIndices(0); assertThat(ctx).containsWord("12.2").withTokenType(ITokenizer.TT_NUMERIC); assertThat(ctx).containsWord("email@email.com").withTokenType(ITokenizer.TT_EMAIL); assertThat(ctx).containsWord("IEEE").withTokenType(ITokenizer.TT_TERM); assertThat(ctx).containsWord("www.test.com").withTokenType(ITokenizer.TT_BARE_URL); assertThat(ctx).containsWord("file_name").withTokenType(ITokenizer.TT_FILE); } @Test public void testMoreRepeatedDifferentTokens() { PreprocessingContext ctx = contextBuilder .newDoc("a simple test", "a test a") .buildContext(); assertThat(ctx).containsWord("a").withTf(3).withFieldIndices(0, 1).withDocumentTf(0, 3); assertThat(ctx).containsWord("simple").withTf(1).withFieldIndices(0).withDocumentTf(0, 1); assertThat(ctx).containsWord("test").withTf(2).withFieldIndices(0, 1).withDocumentTf(0, 2); assertThat(ctx.allWords.image.length).isEqualTo(3); } @Test public void testOneTokenVariantEqualFrequencies() { PreprocessingContext ctx = contextBuilder .newDoc("abc abc ABC aBc") .buildContext(); assertThat(ctx).containsWord("abc").withTf(4).withFieldIndices(0).withDocumentTf(0, 4); assertThat(ctx.allWords.image.length).isEqualTo(1); assertThat(tokens(ctx)).onProperty("wordImage").containsExactly( "abc", "abc", "abc", "abc", EOS); } @Test public void testDemos() { PreprocessingContext ctx = contextBuilder .newDoc("demo demo demos demos DEMO DEMOs Demo Demos") .buildContext(); assertThat(ctx).containsWord("demo") .withTf(4).withFieldIndices(0).withDocumentTf(0, 4); assertThat(ctx).containsWord("demos") .withTf(4).withFieldIndices(0).withDocumentTf(0, 4); assertThat(ctx.allWords.image.length).isEqualTo(2); assertThat(tokens(ctx)).onProperty("wordImage").containsExactly( "demo", "demo", "demos", "demos", "demo", "demos", "demo", "demos", EOS); } @Test public void testOneTokenVariantNonequalFrequencies() { PreprocessingContext ctx = contextBuilder .newDoc("abc ABC ABC aBc aBc ABC") .buildContext(); assertThat(ctx).containsWord("ABC") .withTf(6).withFieldIndices(0).withDocumentTf(0, 6); } @Test public void testMoreTokenVariants() { PreprocessingContext ctx = contextBuilder .newDoc("abc bcd ABC bcD ABC efg", "aBc aBc ABC BCD bcd bcd") .buildContext(); assertThat(ctx).containsWord("ABC") .withTf(6).withFieldIndices(0, 1).withDocumentTf(0, 6); assertThat(ctx).containsWord("bcd") .withTf(5).withFieldIndices(0, 1).withDocumentTf(0, 5); assertThat(ctx).containsWord("efg") .withTf(1).withFieldIndices(0).withDocumentTf(0, 1); assertThat(tokens(ctx)).onProperty("wordImage").containsExactly( "ABC", "bcd", "ABC", "bcd", "ABC", "efg", FS, "ABC", "ABC", "ABC", "bcd", "bcd", "bcd", EOS); } @Test public void testDfThresholding() { PreprocessingContext ctx = contextBuilder .setAttribute(AttributeUtils.getKey(CaseNormalizer.class, "dfThreshold"), 2) .newDoc("a b c", "d e f") .newDoc("a c", "a") .buildContext(); assertThat(ctx).containsWord("a") .withTf(3).withFieldIndices(0, 1) .withExactDocumentTfs(new int [][] {{0, 1}, {1, 2}}); assertThat(ctx).containsWord("c") .withTf(2).withFieldIndices(0) .withExactDocumentTfs(new int [][] {{0, 1}, {1, 1}}); assertThat(ctx.allWords.image.length).isEqualTo(2); assertThat(tokens(ctx)).onProperty("wordImage").containsExactly( "a", MW, "c", FS, MW, MW, MW, DS, "a", "c", FS, "a", EOS); } @Test public void testTokenFiltering() { PreprocessingContext ctx = contextBuilder .newDoc("a . b ,", "a . b ,") .buildContext(); assertThat(ctx).containsWord("a") .withTf(2).withFieldIndices(0, 1).withDocumentTf(0, 2); assertThat(ctx).containsWord("b") .withTf(2).withFieldIndices(0, 1).withDocumentTf(0, 2); assertThat(ctx.allWords.image.length).isEqualTo(2); assertThat(tokens(ctx)).onProperty("wordImage").containsExactly( "a", MW, "b", MW, FS, "a", MW, "b", MW, EOS); } @Test public void testPunctuation() { PreprocessingContext ctx = contextBuilder .newDoc("aba . , aba", ", .") .buildContext(); assertThat(ctx).containsWord("aba") .withTf(2).withFieldIndices(0).withDocumentTf(0, 2); assertThat(ctx.allWords.image.length).isEqualTo(1); assertThat(tokens(ctx)).onProperty("wordImage").containsExactly( "aba", MW, MW, "aba", FS, MW, MW, EOS); } @Test public void testMoreDocuments() { PreprocessingContext ctx = contextBuilder .newDoc(null, "ABC abc") .newDoc("bcd", "BCD") .newDoc("ABC", "BCD") .newDoc("def DEF DEF", "DEF") .buildContext(); assertThat(ctx).containsWord("ABC") .withTf(3).withFieldIndices(0, 1) .withExactDocumentTfs(new int [][] {{0, 2}, {2, 1}}); assertThat(ctx).containsWord("BCD") .withTf(3).withFieldIndices(0, 1) .withExactDocumentTfs(new int [][] {{1, 2}, {2, 1}}); assertThat(ctx).containsWord("DEF") .withTf(4).withFieldIndices(0, 1) .withDocumentTf(3, 4); assertThat(ctx.allWords.image.length).isEqualTo(3); assertThat(tokens(ctx)).onProperty("wordImage").containsExactly( "ABC", "ABC", DS, "BCD", FS, "BCD", DS, "ABC", FS, "BCD", DS, "DEF", "DEF", "DEF", FS, "DEF", EOS); } @Test public void testPunctuationTokenFirst() { PreprocessingContext ctx = contextBuilder .newDoc("aa", "bb") .newDoc("", "bb . cc") .newDoc("", "aa . cc . cc") .buildContext(); assertThat(ctx).containsWord("aa") .withTf(2).withFieldIndices(0, 1) .withExactDocumentTfs(new int [][] {{0, 1}, {2, 1}}); assertThat(ctx).containsWord("bb") .withTf(2).withFieldIndices(1) .withExactDocumentTfs(new int [][] {{0, 1}, {1, 1}}); assertThat(ctx).containsWord("cc") .withTf(3).withFieldIndices(1) .withExactDocumentTfs(new int [][] {{1, 1}, {2, 2}}); assertThat(ctx.allWords.image.length).isEqualTo(3); } // @formatter:on }