/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing;

import static org.carrot2.text.preprocessing.PreprocessingContextAssert.tokens;
import static org.carrot2.text.preprocessing.PreprocessingContextBuilder.FieldValue.*;

import java.util.Arrays;

import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Before;
import org.junit.Test;

/**
 * Test cases for {@link Tokenizer}.
 */
public class TokenizerTest extends PreprocessingContextTestBase
{
    PreprocessingContextBuilder contextBuilder;

    @Before
    public void prepareContextBuilder()
    {
        contextBuilder = new PreprocessingContextBuilder()
            .withPreprocessingPipeline(new BasicPreprocessingPipeline());
    }

    // @formatter:off

    @Test
    public void testNoDocuments()
    {
        PreprocessingContext ctx = contextBuilder
            .buildContext();

        // Even an empty input produces the stream terminator token.
        assertThat(ctx).tokenAt(0)
            .hasImage(null).hasDocIndex(-1).hasFieldIndex(-1)
            .hasExactTokenType(ITokenizer.TF_TERMINATOR);
    }

    @Test
    public void testEmptyDocuments()
    {
        PreprocessingContext ctx = contextBuilder
            .newDoc(null, null)
            .newDoc("", "")
            .buildContext();

        // Empty documents contribute no term tokens: only the document
        // separator between them and the final terminator remain.
        assertThat(ctx).tokenAt(0)
            .hasImage(null).hasDocIndex(-1).hasFieldIndex(-1)
            .hasExactTokenType(ITokenizer.TF_SEPARATOR_DOCUMENT);
        assertThat(ctx).tokenAt(1)
            .hasImage(null).hasDocIndex(-1).hasFieldIndex(-1)
            .hasExactTokenType(ITokenizer.TF_TERMINATOR);
    }

    @Test
    public void testEmptyFirstField()
    {
        PreprocessingContext ctx = contextBuilder
            .newDoc(null, "a")
            .buildContext();

        // The empty first field contributes nothing, so the single token
        // is attributed to the second field (field index 1).
        assertThat(ctx).tokenAt(0)
            .hasImage("a").hasDocIndex(0).hasFieldIndex(1)
            .hasExactTokenType(ITokenizer.TT_TERM);
        assertThat(ctx).tokenAt(1)
            .hasImage(null).hasDocIndex(-1).hasFieldIndex(-1)
            .hasExactTokenType(ITokenizer.TF_TERMINATOR);
    }

    @Test
    public void testEmptyField()
    {
        PreprocessingContext ctx = contextBuilder
            .setAttribute(AttributeUtils.getKey(Tokenizer.class, "documentFields"),
                Arrays.asList("field1", "field2", "field3"))
            .newDoc(
                fv("field1", "data mining"),
                fv("field2", ""),
                fv("field3", "web site"))
            .buildContext();

        assertThat(ctx.allFields.name).isEqualTo(new String [] {"field1", "field2", "field3"});

        assertThat(ctx).tokenAt(0)
            .hasImage("data").hasDocIndex(0).hasFieldIndex(0)
            .hasExactTokenType(ITokenizer.TT_TERM);
        assertThat(ctx).tokenAt(1)
            .hasImage("mining").hasDocIndex(0).hasFieldIndex(0)
            .hasExactTokenType(ITokenizer.TT_TERM);
        // The empty field2 produces no tokens of its own; a single field
        // separator precedes the tokens of field3 (field index 2).
        assertThat(ctx).tokenAt(2)
            .hasImage(null).hasDocIndex(0).hasFieldIndex(-1)
            .hasExactTokenType(ITokenizer.TF_SEPARATOR_FIELD);
        assertThat(ctx).tokenAt(3)
            .hasImage("web").hasDocIndex(0).hasFieldIndex(2)
            .hasExactTokenType(ITokenizer.TT_TERM);
        assertThat(ctx).tokenAt(4)
            .hasImage("site").hasDocIndex(0).hasFieldIndex(2)
            .hasExactTokenType(ITokenizer.TT_TERM);
        assertThat(ctx).tokenAt(5)
            .hasImage(null).hasDocIndex(-1).hasFieldIndex(-1)
            .hasExactTokenType(ITokenizer.TF_TERMINATOR);
    }

    @Test
    public void testOneDocument()
    {
        PreprocessingContext ctx = contextBuilder
            .newDoc("data mining", "web site")
            .buildContext();

        assertThat(ctx).tokenAt(0)
            .hasImage("data").hasDocIndex(0).hasFieldIndex(0)
            .hasExactTokenType(ITokenizer.TT_TERM);
        assertThat(ctx).tokenAt(1)
            .hasImage("mining").hasDocIndex(0).hasFieldIndex(0)
            .hasExactTokenType(ITokenizer.TT_TERM);
        assertThat(ctx).tokenAt(2)
            .hasImage(null).hasDocIndex(0).hasFieldIndex(-1)
            .hasExactTokenType(ITokenizer.TF_SEPARATOR_FIELD);
        assertThat(ctx).tokenAt(3)
            .hasImage("web").hasDocIndex(0).hasFieldIndex(1)
            .hasExactTokenType(ITokenizer.TT_TERM);
        assertThat(ctx).tokenAt(4)
            .hasImage("site").hasDocIndex(0).hasFieldIndex(1)
            .hasExactTokenType(ITokenizer.TT_TERM);
        assertThat(ctx).tokenAt(5)
            .hasImage(null).hasDocIndex(-1).hasFieldIndex(-1)
            .hasExactTokenType(ITokenizer.TF_TERMINATOR);
    }

    @Test
    public void testSentenceSeparator()
    {
        PreprocessingContext ctx = contextBuilder
            .newDoc("data . mining", "")
            .buildContext();

        assertThat(ctx).tokenAt(0)
            .hasImage("data").hasDocIndex(0).hasFieldIndex(0)
            .hasExactTokenType(ITokenizer.TT_TERM);
        // The full stop is flagged both as punctuation and as a sentence separator.
        assertThat(ctx).tokenAt(1)
            .hasImage(".").hasDocIndex(0).hasFieldIndex(0)
            .hasExactTokenType(ITokenizer.TF_SEPARATOR_SENTENCE | ITokenizer.TT_PUNCTUATION);
        assertThat(ctx).tokenAt(2)
            .hasImage("mining").hasDocIndex(0).hasFieldIndex(0)
            .hasExactTokenType(ITokenizer.TT_TERM);
        assertThat(ctx).tokenAt(3)
            .hasImage(null).hasDocIndex(-1).hasFieldIndex(-1)
            .hasExactTokenType(ITokenizer.TF_TERMINATOR);
    }

    @Test
    public void testMoreDocuments()
    {
        PreprocessingContext ctx = contextBuilder
            .newDoc("data mining", "web site")
            .newDoc("artificial intelligence", "ai")
            .newDoc("test", "test")
            .buildContext();

        // null entries mark field/document separators and the terminator.
        assertThat(tokens(ctx)).onProperty("tokenImage").isEqualTo(Arrays.asList(
            "data", "mining", null, "web", "site", null,
            "artificial", "intelligence", null, "ai", null,
            "test", null, "test", null));

        assertThat(ctx.allTokens.documentIndex).isEqualTo(new int [] {
            0, 0, 0, 0, 0, -1,
            1, 1, 1, 1, -1,
            2, 2, 2, -1
        });

        assertThat(ctx.allTokens.type).isEqualTo(new short [] {
            ITokenizer.TT_TERM, ITokenizer.TT_TERM, ITokenizer.TF_SEPARATOR_FIELD,
            ITokenizer.TT_TERM, ITokenizer.TT_TERM, ITokenizer.TF_SEPARATOR_DOCUMENT,
            ITokenizer.TT_TERM, ITokenizer.TT_TERM, ITokenizer.TF_SEPARATOR_FIELD,
            ITokenizer.TT_TERM, ITokenizer.TF_SEPARATOR_DOCUMENT,
            ITokenizer.TT_TERM, ITokenizer.TF_SEPARATOR_FIELD,
            ITokenizer.TT_TERM, ITokenizer.TF_TERMINATOR
        });

        assertThat(ctx.allTokens.fieldIndex).isEqualTo(new byte [] {
            0, 0, -1, 1, 1, -1,
            0, 0, -1, 1, -1,
            0, -1, 1, -1
        });
    }

    @Test
    public void testUnicodeNextLine()
    {
        PreprocessingContext ctx = contextBuilder
            .newDoc("Foo\u0085 Bar")
            .buildContext();

        // U+0085 (NEL, "next line") acts as whitespace: "Foo" and "Bar"
        // come out as two separate terms with no extra token between them.
        assertThat(tokens(ctx)).onProperty("tokenImage").isEqualTo(Arrays.asList(
            "Foo", "Bar", null));

        assertThat(ctx.allTokens.type).isEqualTo(new short [] {
            ITokenizer.TT_TERM, ITokenizer.TT_TERM, ITokenizer.TF_TERMINATOR
        });
    }

    // @formatter:on
}