/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.*;
import org.junit.Before;
import org.junit.Test;
/**
 * Test cases for {@link LabelFormatter}.
 * <p>
 * Each test checks the formatter in two ways: through the chain of preprocessing
 * components run on the documents created for the test (see
 * {@link #checkFullPreprocessing(LanguageCode, String...)}), and directly through the
 * static {@code LabelFormatter.format(char[][], boolean[], boolean)} method with
 * hand-built word images and stop word flags (see
 * {@link #checkWithoutPreprocessing(char[][], boolean[], String, boolean)}).
 */
public class LabelFormatterTest extends PreprocessingComponentTestBase
{
    /** Label formatter under test. */
    private LabelFormatter labelFormatter;

    /** Other preprocessing components required for the test. */
    private Tokenizer tokenizer;
    private CaseNormalizer caseNormalizer;
    private LanguageModelStemmer languageModelStemmer;
    private PhraseExtractor phraseExtractor;
    private StopListMarker stopListMarker;
    private LabelFilterProcessor labelFilterProcessor;

    /**
     * Creates fresh instances of all preprocessing components before each test, so that
     * per-test configuration changes (such as disabling a label filter) do not leak
     * between tests.
     */
    @Before
    public void setUpPreprocessingComponents()
    {
        tokenizer = new Tokenizer();
        caseNormalizer = new CaseNormalizer();
        languageModelStemmer = new LanguageModelStemmer();
        phraseExtractor = new PhraseExtractor();
        stopListMarker = new StopListMarker();
        labelFilterProcessor = new LabelFilterProcessor();
        labelFormatter = new LabelFormatter();
    }

    /**
     * Returns the lexical data factory created by the superclass with its
     * {@code mergeResources} flag switched off.
     * <p>
     * NOTE(review): the cast assumes the superclass always returns a
     * {@link DefaultLexicalDataFactory}; confirm against the base class.
     */
    @Override
    protected ILexicalDataFactory createLexicalDataFactory()
    {
        final ILexicalDataFactory factory = super.createLexicalDataFactory();
        ((DefaultLexicalDataFactory) factory).mergeResources = false;
        return factory;
    }

    /** A single lower-case word is expected to come out capitalized. */
    @Test
    public void testSingleWordNotCapitalized()
    {
        createDocuments("test", "test");

        final String expectedLabel = "Test";
        checkFullPreprocessing(LanguageCode.ENGLISH, expectedLabel);
        checkWithoutPreprocessing(new char [] []
        {
            "test".toCharArray()
        }, new boolean []
        {
            false
        }, expectedLabel, true);
    }

    /** A single word that already contains upper-case letters is expected unchanged. */
    @Test
    public void testSingleWordCapitalized()
    {
        createDocuments("kMN", "kMN");

        final String expectedLabel = "kMN";
        checkFullPreprocessing(LanguageCode.ENGLISH, expectedLabel);
        checkWithoutPreprocessing(new char [] []
        {
            "kMN".toCharArray()
        }, new boolean []
        {
            false
        }, expectedLabel, true);
    }

    /** A label consisting of one stop word is still expected to be capitalized. */
    @Test
    public void testSingleStopWord()
    {
        createDocuments("for", "for");

        // Presumably needed so that the stop-word-only label survives label filtering
        // and reaches the formatter -- verify against LabelFilterProcessor.
        labelFilterProcessor.stopWordLabelFilter.enabled = false;

        final String expectedLabel = "For";
        checkFullPreprocessing(LanguageCode.ENGLISH, expectedLabel);
        checkWithoutPreprocessing(new char [] []
        {
            "for".toCharArray()
        }, new boolean []
        {
            true
        }, expectedLabel, true);
    }

    /** Every lower-case, non-stop word of a phrase is expected to be capitalized. */
    @Test
    public void testPhraseWithLowerCaseWords()
    {
        createDocuments("test phrase", "test phrase");

        final String expectedLabel = "Test Phrase";
        checkFullPreprocessing(LanguageCode.ENGLISH, expectedLabel);
        checkWithoutPreprocessing(new char [] []
        {
            "test".toCharArray(), "phrase".toCharArray()
        }, new boolean []
        {
            false, false
        }, expectedLabel, true);
    }

    /** A stop word inside a phrase is expected to stay lower-case. */
    @Test
    public void testPhraseWithStopWords()
    {
        createDocuments("food for fish", "food for fish");

        final String expectedLabel = "Food for Fish";
        checkFullPreprocessing(LanguageCode.ENGLISH, expectedLabel);
        checkWithoutPreprocessing(new char [] []
        {
            "food".toCharArray(), "for".toCharArray(), "fish".toCharArray()
        }, new boolean []
        {
            false, true, false
        }, expectedLabel, true);
    }

    /**
     * A phrase without stop words: the already capitalized word is expected unchanged
     * and the lower-case word capitalized.
     */
    @Test
    public void testPhraseWithoutStopWords()
    {
        createDocuments("Jaguar car", "Jaguar car");

        final String expectedLabel = "Jaguar Car";
        checkFullPreprocessing(LanguageCode.ENGLISH, expectedLabel);

        // Word images mirror the documents above; one stop word flag per word and
        // none of them set, as the test name promises.
        checkWithoutPreprocessing(new char [] []
        {
            "Jaguar".toCharArray(), "car".toCharArray()
        }, new boolean []
        {
            false, false
        }, expectedLabel, true);
    }

    /** Mixed-case words are expected unchanged while lower-case ones get capitalized. */
    @Test
    public void testPhraseWithCapitalizedWords()
    {
        createDocuments("iMac stuff", "iMac stuff");

        final String expectedLabel = "iMac Stuff";
        checkFullPreprocessing(LanguageCode.ENGLISH, expectedLabel);
        checkWithoutPreprocessing(new char [] []
        {
            "iMac".toCharArray(), "stuff".toCharArray()
        }, new boolean []
        {
            false, false
        }, expectedLabel, true);
    }

    /** A Chinese phrase is expected to pass through the formatter unchanged. */
    @Test
    public void testChinesePhrases()
    {
        createDocuments("东亚货币贬值", "东亚货币贬值");

        final String expectedLabel = "东亚货币贬值";

        // NOTE(review): the language argument is not read by checkFullPreprocessing,
        // so passing ENGLISH here has no effect on the pipeline run below.
        checkFullPreprocessing(LanguageCode.ENGLISH, expectedLabel);

        // A single word image, hence a single stop word flag; no space joining.
        checkWithoutPreprocessing(new char [] []
        {
            "东亚货币贬值".toCharArray()
        }, new boolean []
        {
            false
        }, expectedLabel, false);
    }

    /**
     * Formats the provided word images directly with the static
     * {@code LabelFormatter.format} method and asserts on the result.
     *
     * @param words images of the label's words, in order
     * @param stopWords stop word flags, one per entry of {@code words}
     * @param expectedFormattedLabel the label text the formatter must produce
     * @param joinWithSpace passed through to the formatter unchanged
     */
    private void checkWithoutPreprocessing(char [][] words, boolean [] stopWords,
        String expectedFormattedLabel, boolean joinWithSpace)
    {
        assertThat(LabelFormatter.format(words, stopWords, joinWithSpace)).isEqualTo(
            expectedFormattedLabel);
    }

    /**
     * Runs tokenization, case normalization, stemming, phrase extraction, stop list
     * marking and label filtering on the current context and then asserts that the
     * resulting labels, formatted one by one, equal the expected labels in order.
     *
     * @param language currently not read by this method. NOTE(review): presumably the
     *            processing language comes from the context prepared by the base
     *            class -- confirm, and either wire this parameter in or drop it.
     * @param expectedFormattedLabels expected formatted labels, in the order of
     *            {@code context.allLabels.featureIndex}
     */
    private void checkFullPreprocessing(LanguageCode language,
        String... expectedFormattedLabels)
    {
        tokenizer.tokenize(context);
        caseNormalizer.normalize(context);
        languageModelStemmer.stem(context);
        phraseExtractor.extractPhrases(context);
        stopListMarker.mark(context);
        labelFilterProcessor.process(context);

        final int [] labelsFeatureIndex = context.allLabels.featureIndex;

        // Check the count first so that the loop below cannot index past the end of
        // expectedFormattedLabels.
        assertThat(labelsFeatureIndex.length).as("featureIndex.length").isEqualTo(
            expectedFormattedLabels.length);
        for (int i = 0; i < labelsFeatureIndex.length; i++)
        {
            assertThat(labelFormatter.format(context, labelsFeatureIndex[i])).as(
                "featureIndex[" + i + "]").isEqualTo(expectedFormattedLabels[i]);
        }
    }
}