/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import java.util.List;
import org.junit.Test;
import org.carrot2.shaded.guava.common.collect.Lists;
/**
 * Test cases for {@link PreprocessedDocumentScanner}.
 * <p>
 * Each test builds its input with <code>createDocuments</code> and then verifies, via
 * {@link #check(List, List, List)}, the <code>(start, length)</code> pairs the scanner
 * reports through its <code>document</code>, <code>field</code> and
 * <code>sentence</code> callbacks.
 */
public class PreprocessedDocumentScannerTest extends PreprocessingComponentTestBase
{
    /**
     * No input: the scanner is expected to report no document, field or sentence ranges.
     */
    @Test
    public void testEmpty()
    {
        createDocuments();

        final List<List<Integer>> expectedDocumentRanges = Lists.newArrayList();
        final List<List<Integer>> expectedFieldRanges = Lists.newArrayList();
        final List<List<Integer>> expectedSentenceRanges = Lists.newArrayList();

        check(expectedDocumentRanges, expectedFieldRanges, expectedSentenceRanges);
    }

    /**
     * A single one-word input: document, field and sentence are all expected to be
     * reported as the same range <code>(0, 1)</code>.
     */
    @Test
    public void testOneDocumentOneFieldOneSentence()
    {
        createDocuments("test");

        final List<List<Integer>> expectedDocumentRanges = ranges(0, 1);
        final List<List<Integer>> expectedFieldRanges = ranges(0, 1);
        final List<List<Integer>> expectedSentenceRanges = ranges(0, 1);

        check(expectedDocumentRanges, expectedFieldRanges, expectedSentenceRanges);
    }

    /**
     * A single input with three period-separated words: one document range, one field
     * range and three one-element sentence ranges are expected.
     */
    @Test
    public void testOneDocumentOneFieldMoreSentences()
    {
        createDocuments("test1 . test2 . test3");

        final List<List<Integer>> expectedDocumentRanges = ranges(0, 5);
        final List<List<Integer>> expectedFieldRanges = ranges(0, 5);
        final List<List<Integer>> expectedSentenceRanges = ranges(0, 1, 2, 1, 4, 1);

        check(expectedDocumentRanges, expectedFieldRanges, expectedSentenceRanges);
    }

    /**
     * Two input strings, the first one ending with a period: one document range and two
     * field ranges are expected. The expected sentence list includes the zero-length
     * pair <code>(4, 0)</code>.
     */
    @Test
    public void testOneDocumentMoreFieldsMoreSentences()
    {
        createDocuments("test1 . test2 . ", "test3 . test4");

        final List<List<Integer>> expectedDocumentRanges = ranges(0, 8);
        final List<List<Integer>> expectedFieldRanges = ranges(0, 4, 5, 3);
        final List<List<Integer>> expectedSentenceRanges = ranges(0, 1, 2, 1, 4, 0, 5, 1,
            7, 1);

        check(expectedDocumentRanges, expectedFieldRanges, expectedSentenceRanges);
    }

    /**
     * Four input strings: two document ranges, four field ranges and six sentence
     * ranges are expected.
     */
    @Test
    public void testMoreDocumentsMoreFieldsMoreSentences()
    {
        createDocuments("test1", "test2 . test3", "test4", "test5 . test6");

        final List<List<Integer>> expectedDocumentRanges = ranges(0, 5, 6, 5);
        final List<List<Integer>> expectedFieldRanges = ranges(0, 1, 2, 3, 6, 1, 8, 3);
        final List<List<Integer>> expectedSentenceRanges = ranges(0, 1, 2, 1, 4, 1, 6, 1,
            8, 1, 10, 1);

        check(expectedDocumentRanges, expectedFieldRanges, expectedSentenceRanges);
    }

    /**
     * Groups a flat sequence of integers into consecutive two-element lists.
     *
     * @param ranges flat sequence of values, read as <code>start, length</code> pairs;
     *            must contain an even number of values
     * @return a list with one two-element list per pair, in input order
     * @throws IllegalArgumentException if an odd number of values is provided
     */
    private List<List<Integer>> ranges(int... ranges)
    {
        // An unpaired trailing value would otherwise be dropped without notice, quietly
        // shortening the expectation the test author intended to check.
        if (ranges.length % 2 != 0)
        {
            throw new IllegalArgumentException(
                "Expected an even number of values (start, length pairs), got: "
                    + ranges.length);
        }

        final List<List<Integer>> result = Lists.newArrayList();
        for (int i = 0; i < ranges.length; i += 2)
        {
            result.add(Lists.newArrayList(ranges[i], ranges[i + 1]));
        }
        return result;
    }

    /**
     * Runs tokenization, case normalization and stemming on the test's
     * <code>context</code>, iterates over it with a recording
     * {@link PreprocessedDocumentScanner} and compares the recorded
     * <code>(start, length)</code> pairs against the expected ones.
     *
     * @param expectedDocumentRanges pairs expected from the <code>document</code> callback
     * @param expectedFieldRanges pairs expected from the <code>field</code> callback
     * @param expectedSentenceRanges pairs expected from the <code>sentence</code> callback
     */
    private void check(List<List<Integer>> expectedDocumentRanges,
        List<List<Integer>> expectedFieldRanges,
        List<List<Integer>> expectedSentenceRanges)
    {
        final Tokenizer tokenizer = new Tokenizer();
        final CaseNormalizer caseNormalizer = new CaseNormalizer();
        final LanguageModelStemmer languageModelStemmer = new LanguageModelStemmer();

        // Run these preprocessing steps, in this order, before scanning.
        tokenizer.tokenize(context);
        caseNormalizer.normalize(context);
        languageModelStemmer.stem(context);

        final List<List<Integer>> actualDocumentRanges = Lists.newArrayList();
        final List<List<Integer>> actualFieldRanges = Lists.newArrayList();
        final List<List<Integer>> actualSentenceRanges = Lists.newArrayList();

        // Each override delegates to the default implementation first and then records
        // the (start, length) pair it was called with.
        final PreprocessedDocumentScanner scanner = new PreprocessedDocumentScanner()
        {
            @Override
            protected void document(PreprocessingContext context, int start, int length)
            {
                super.document(context, start, length);
                actualDocumentRanges.add(Lists.newArrayList(start, length));
            }

            @Override
            protected void field(PreprocessingContext context, int start, int length)
            {
                super.field(context, start, length);
                actualFieldRanges.add(Lists.newArrayList(start, length));
            }

            @Override
            protected void sentence(PreprocessingContext context, int start, int length)
            {
                super.sentence(context, start, length);
                actualSentenceRanges.add(Lists.newArrayList(start, length));
            }
        };

        scanner.iterate(context);

        // One document callback is expected per document in the context.
        assertThat(actualDocumentRanges).as("documentRanges.size()").hasSize(
            context.documents.size());
        assertThat(actualDocumentRanges).as("documentRanges").isEqualTo(
            expectedDocumentRanges);
        assertThat(actualFieldRanges).as("fieldRanges").isEqualTo(expectedFieldRanges);
        assertThat(actualSentenceRanges).as("sentenceRanges").isEqualTo(
            expectedSentenceRanges);
    }
}