/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.carrot2.text.analysis.TokenTypeUtils;
import org.carrot2.text.preprocessing.PreprocessingContext.AllPhrases;
import org.carrot2.text.preprocessing.PreprocessingContext.AllTokens;
import org.carrot2.text.util.CharArrayComparators;
import org.carrot2.util.IntMapUtils;
import org.fest.assertions.Assertions;
import org.fest.util.Strings;
import com.carrotsearch.hppc.IntIntHashMap;
import com.carrotsearch.hppc.procedures.IntIntProcedure;
import org.carrot2.shaded.guava.common.base.MoreObjects;
import org.carrot2.shaded.guava.common.collect.Lists;
/**
 * Fest-style assertions on the content of {@link PreprocessingContext}.
 */
class PreprocessingContextAssert
{
    /** missing word constant. */
    public final static String MW = "<MW>";

    /** document separator constant. */
    public final static String DS = "<DS>";

    /** field separator constant. */
    public final static String FS = "<FS>";

    /** end of stream constant. */
    public final static String EOS = "<EOS>";

    /** The preprocessing context all assertions in this instance refer to. */
    final PreprocessingContext context;

    /**
     * Assertions on a single phrase of {@link AllPhrases}, identified by its index.
     */
    final class PreprocessingContextPhraseAssert
    {
        private int phraseIndex;

        PreprocessingContextPhraseAssert(int index)
        {
            assert index >= 0;
            this.phraseIndex = index;
        }

        /**
         * Asserts this phrase has the given term frequency in the given document.
         * Fails if the phrase has no entry for that document at all.
         */
        public PreprocessingContextPhraseAssert withDocumentTf(int documentIndex, int expectedTf)
        {
            // tfByDocument is a flattened sequence of (documentIndex, tf) pairs.
            int [] byDocTf = context.allPhrases.tfByDocument[phraseIndex];
            for (int i = 0; i < byDocTf.length; i += 2)
            {
                if (byDocTf[i] == documentIndex) {
                    Assertions.assertThat(byDocTf[i + 1]).isEqualTo(expectedTf);
                    return this;
                }
            }
            org.junit.Assert.fail("No document " + documentIndex + " for this phrase: "
                + context.allPhrases.getPhrase(phraseIndex) + "\n" + context.allPhrases);
            return this;
        }

        /**
         * Asserts exact mapping of document-tf (the number of mappings and their value, regardless
         * of their order).
         */
        public PreprocessingContextPhraseAssert withExactDocumentTfs(int [][] docTfPairs)
        {
            for (int [] docTf : docTfPairs)
            {
                Assertions.assertThat(docTf.length).isEqualTo(2);
                withDocumentTf(docTf[0], docTf[1]);
            }
            // Each (document, tf) pair occupies two slots of the flattened array.
            Assertions.assertThat(context.allPhrases.tfByDocument[phraseIndex].length / 2)
                .describedAs("tfByDocument array size for phrase: '" + context.allPhrases.getPhrase(phraseIndex) + "'")
                .isEqualTo(docTfPairs.length);
            return this;
        }

        /** Asserts this phrase's total term frequency. */
        public PreprocessingContextPhraseAssert withTf(int expectedTf)
        {
            Assertions.assertThat(context.allPhrases.tf[phraseIndex])
                .describedAs("tf different for phrase '" + context.allPhrases.getPhrase(phraseIndex) + "'")
                .isEqualTo(expectedTf);
            return this;
        }
    }

    PreprocessingContextAssert(PreprocessingContext context)
    {
        this.context = context;
    }

    /**
     * Return a list of word images, in random order (so tests cannot
     * accidentally depend on the internal ordering of allWords).
     */
    public List<String> wordImages()
    {
        Assertions.assertThat(context.allWords.image)
            .describedAs("the context's allWords is not properly initialized.").isNotNull();

        List<String> result = Lists.newArrayList();
        for (int i = context.allWords.image.length; --i >= 0;)
        {
            result.add(new String(context.allWords.image[i]));
        }
        Collections.shuffle(result);
        return result;
    }

    /**
     * Return a list of random-ordered, space-separated phrase images.
     */
    public List<String> phraseImages()
    {
        Assertions.assertThat(context.allPhrases.wordIndices)
            .describedAs("the context's allPhrases is not properly initialized.").isNotNull();

        List<String> result = Lists.newArrayList();
        for (int i = context.allPhrases.wordIndices.length; --i >= 0;)
        {
            result.add(context.allPhrases.getPhrase(i).toString());
        }
        Collections.shuffle(result);
        return result;
    }

    /** Assert the context contains a phrase consisting of these exact images. */
    public PreprocessingContextPhraseAssert containsPhrase(List<String> processedTermImages)
    {
        return containsPhrase(processedTermImages.toArray(
            new String [processedTermImages.size()]));
    }

    /** Assert the context contains a phrase consisting of these exact images. */
    public PreprocessingContextPhraseAssert containsPhrase(String... processedTermImages)
    {
        Assertions.assertThat(processedTermImages).isNotEmpty();
        Assertions.assertThat(context.allPhrases.wordIndices)
            .describedAs("the context's allPhrases is not properly initialized.").isNotNull();

        // Naive scan over the set of extracted phrases; also detects duplicates.
        final String phraseImage = Strings.join(processedTermImages).with(" ");
        int foundAt = -1;
        for (int i = context.allPhrases.wordIndices.length; --i >= 0;)
        {
            if (phraseImage.equals(context.allPhrases.getPhrase(i).toString()))
            {
                if (foundAt >= 0) org.junit.Assert.fail("More than one phrase with an identical image '"
                    + phraseImage + "'?\n\n" + context.allPhrases);
                foundAt = i;
            }
        }

        if (foundAt < 0)
            org.junit.Assert.fail("No phrase '" + phraseImage + "' in allPhrases:\n" + context.allPhrases);

        return new PreprocessingContextPhraseAssert(foundAt);
    }

    /**
     * Looks up a phrase that matches the list of stemmed images. Stem images
     * are preprocessed in this method and underscore "_"
     * character is removed (clearer test input in conjunction with {@link TestStemmerFactory}).
     */
    public PreprocessingContextPhraseAssert containsPhraseStemmedAs(String... stemImages)
    {
        Assertions.assertThat(stemImages).isNotEmpty();
        Assertions.assertThat(context.allPhrases.wordIndices)
            .describedAs("the context's allPhrases is not properly initialized.").isNotNull();

        // Strip underscores and precompute char images once, instead of
        // re-running toCharArray() on every comparison below.
        final char [][] stemChars = new char [stemImages.length][];
        for (int i = 0; i < stemImages.length; i++)
        {
            stemImages[i] = stemImages[i].replaceAll("_", "");
            stemChars[i] = stemImages[i].toCharArray();
        }

        // Naive scan over the set of extracted phrases; also detects duplicates.
        Comparator<char[]> comp = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR;
        int foundAt = -1;
        nextPhrase:
        for (int i = context.allPhrases.wordIndices.length; --i >= 0;)
        {
            int [] wordIdxs = context.allPhrases.wordIndices[i];
            if (wordIdxs.length == stemImages.length)
            {
                for (int j = 0; j < wordIdxs.length; j++)
                {
                    if (comp.compare(
                        context.allStems.image[context.allWords.stemIndex[wordIdxs[j]]],
                        stemChars[j]) != 0)
                    {
                        continue nextPhrase;
                    }
                }

                if (foundAt >= 0)
                {
                    org.junit.Assert.fail("More than one phrase corresponds to stem sequence '" +
                        Arrays.toString(stemImages) + "':\n" + context.allPhrases);
                }
                foundAt = i;
            }
        }

        if (foundAt < 0)
            org.junit.Assert.fail("No phrase corresponding to stem sequence '" +
                Arrays.toString(stemImages) + "' in allPhrases:\n" + context.allPhrases);

        return new PreprocessingContextPhraseAssert(foundAt);
    }

    /** Entry point for fluent assertions on the given context. */
    public static PreprocessingContextAssert assertThat(PreprocessingContext context)
    {
        return new PreprocessingContextAssert(context);
    }

    /** Shortcut for {@code assertThat(context).tokens()}. */
    public static List<TokenEntry> tokens(PreprocessingContext context)
    {
        return new PreprocessingContextAssert(context).tokens();
    }

    /**
     * Assertions on a single stem of allStems, identified by its index.
     */
    final class StemAssert
    {
        private final int stemIndex;
        private final String stemImage;

        public StemAssert(int stemIndex)
        {
            this.stemIndex = stemIndex;
            this.stemImage = new String(context.allStems.image[stemIndex]);
        }

        /** Asserts this stem's total term frequency. */
        public StemAssert withTf(int expectedTf)
        {
            Assertions.assertThat(context.allStems.tf[stemIndex])
                .describedAs("tf different for stem " + stemImage)
                .isEqualTo(expectedTf);
            return this;
        }

        /**
         * Asserts this stem has the given term frequency in the given document.
         * Fails if the stem has no entry for that document at all.
         */
        public StemAssert withDocumentTf(int documentIndex, int expectedTf)
        {
            // tfByDocument is a flattened sequence of (documentIndex, tf) pairs.
            int [] byDocTf = context.allStems.tfByDocument[stemIndex];
            for (int i = 0; i < byDocTf.length; i += 2)
            {
                if (byDocTf[i] == documentIndex) {
                    Assertions.assertThat(byDocTf[i + 1]).isEqualTo(expectedTf);
                    return this;
                }
            }
            org.junit.Assert.fail("No document " + documentIndex + " for this stem: "
                + stemImage + "\n" + context.allStems);
            return this;
        }

        /**
         * Asserts exact mapping of document-tf (the number of mappings and their value,
         * regardless of their order).
         */
        public StemAssert withExactDocumentTfs(int [][] docTfPairs)
        {
            for (int [] docTf : docTfPairs)
            {
                Assertions.assertThat(docTf.length).isEqualTo(2);
                withDocumentTf(docTf[0], docTf[1]);
            }
            // Each (document, tf) pair occupies two slots of the flattened array.
            Assertions.assertThat(context.allStems.tfByDocument[stemIndex].length / 2)
                .describedAs("tfByDocument array size for stem: '" + stemImage + "'")
                .isEqualTo(docTfPairs.length);
            return this;
        }

        /** Asserts the exact set of field indices this stem occurs in. */
        public StemAssert withFieldIndices(int... expectedIndices)
        {
            int [] indices = PreprocessingContext.toFieldIndexes(context.allStems.fieldIndices[stemIndex]);
            Assertions.assertThat(indices).as("field indices of stem '" + stemImage + "'")
                .isEqualTo(expectedIndices);
            return this;
        }
    }

    /**
     * Assert the context contains exactly one stem with the given image.
     *
     * <p>Note: the method name is misspelled but kept for compatibility with
     * existing callers; prefer {@link #containsStem(String)}.</p>
     */
    StemAssert constainsStem(String stemImage)
    {
        Assertions.assertThat(stemImage).isNotEmpty();
        Assertions.assertThat(context.allStems.image)
            .describedAs("the context's allStems is not properly initialized.").isNotNull();

        // Naive scan; also detects duplicate stem images.
        final char [] stemChars = stemImage.toCharArray();
        Comparator<char[]> comp = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR;
        int found = -1;
        for (int i = 0; i < context.allStems.image.length; i++)
        {
            if (comp.compare(context.allStems.image[i], stemChars) == 0)
            {
                if (found >= 0)
                    org.junit.Assert.fail("Duplicate stem with image '" + stemImage + "' in stems:\n"
                        + context.allStems);
                found = i;
            }
        }
        if (found == -1)
            org.junit.Assert.fail("No stem with image '" + stemImage + "' in stems:\n"
                + context.allStems);

        return new StemAssert(found);
    }

    /** Correctly spelled alias of {@link #constainsStem(String)}. */
    StemAssert containsStem(String stemImage)
    {
        return constainsStem(stemImage);
    }

    /**
     * Assertions on a single word of allWords, identified by its index.
     */
    final class WordAssert
    {
        private final int wordIndex;
        private final String wordImage;

        public WordAssert(int wordIndex)
        {
            this.wordIndex = wordIndex;
            this.wordImage = new String(context.allWords.image[wordIndex]);
        }

        /** Asserts this word's total term frequency. */
        public WordAssert withTf(int expectedTf)
        {
            Assertions.assertThat(context.allWords.tf[wordIndex])
                .describedAs("tf different for word " + wordImage)
                .isEqualTo(expectedTf);
            return this;
        }

        /**
         * Asserts this word has the given term frequency in the given document.
         * Fails if the word has no entry for that document at all.
         */
        public WordAssert withDocumentTf(int documentIndex, int expectedTf)
        {
            // tfByDocument is a flattened sequence of (documentIndex, tf) pairs.
            int [] byDocTf = context.allWords.tfByDocument[wordIndex];
            for (int i = 0; i < byDocTf.length; i += 2)
            {
                if (byDocTf[i] == documentIndex) {
                    Assertions.assertThat(byDocTf[i + 1]).isEqualTo(expectedTf);
                    return this;
                }
            }
            org.junit.Assert.fail("No document " + documentIndex + " for this word: "
                + wordImage + "\n" + context.allWords);
            return this;
        }

        /**
         * Asserts exact mapping of document-tf (the number of mappings and their value,
         * regardless of their order).
         */
        public WordAssert withExactDocumentTfs(int [][] docTfPairs)
        {
            for (int [] docTf : docTfPairs)
            {
                Assertions.assertThat(docTf.length).isEqualTo(2);
                withDocumentTf(docTf[0], docTf[1]);
            }
            // Each (document, tf) pair occupies two slots of the flattened array.
            Assertions.assertThat(context.allWords.tfByDocument[wordIndex].length / 2)
                .describedAs("tfByDocument array size for word: '" + wordImage + "'")
                .isEqualTo(docTfPairs.length);
            return this;
        }

        /** Asserts the exact set of field indices this word occurs in. */
        public WordAssert withFieldIndices(int... expectedIndices)
        {
            int [] indices = PreprocessingContext.toFieldIndexes(context.allWords.fieldIndices[wordIndex]);
            Assertions.assertThat(indices).as("field indices of word '" + wordImage + "'")
                .isEqualTo(expectedIndices);
            return this;
        }

        /** type masked to token type only. */
        public void withTokenType(int tokenType)
        {
            Assertions.assertThat(TokenTypeUtils.maskType(context.allWords.type[wordIndex]))
                .as("token type (masked) of word '" + wordImage + "'")
                .isEqualTo(tokenType);
        }

        /** raw value of token type field. */
        public void withExactTokenType(int tokenType)
        {
            Assertions.assertThat(tokenType)
                .as("token type of word '" + wordImage + "'")
                .isEqualTo(context.allWords.type[wordIndex]);
        }
    }

    /** Assert the context contains exactly one word with the given image. */
    public WordAssert containsWord(String wordImage)
    {
        Assertions.assertThat(wordImage).isNotEmpty();
        Assertions.assertThat(context.allWords.image)
            .describedAs("the context's allWords is not properly initialized.").isNotNull();

        // Naive scan; also detects duplicate word images.
        final char [] wordChars = wordImage.toCharArray();
        Comparator<char[]> comp = CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR;
        int found = -1;
        for (int i = 0; i < context.allWords.image.length; i++)
        {
            if (comp.compare(context.allWords.image[i], wordChars) == 0)
            {
                if (found >= 0)
                    org.junit.Assert.fail("Duplicate word with image '" + wordImage + "' in words:\n"
                        + context.allWords);
                found = i;
            }
        }
        if (found == -1)
            org.junit.Assert.fail("No word with image '" + wordImage + "' in words:\n"
                + context.allWords);

        return new WordAssert(found);
    }

    /**
     * A single entry of allTokens, with convenience accessors resolving
     * the token to its word and stem images.
     */
    public final class TokenEntry
    {
        final int tokenIndex;

        TokenEntry(int tokenIndex)
        {
            this.tokenIndex = tokenIndex;
        }

        /** Raw token image, or null for separator/terminator tokens. */
        public String getTokenImage()
        {
            if (context.allTokens.image[tokenIndex] == null)
                return null;
            return new String(context.allTokens.image[tokenIndex]);
        }

        /**
         * Image of the word this token maps to, or one of the {@link #DS},
         * {@link #FS}, {@link #EOS}, {@link #MW} marker constants.
         */
        public String getWordImage()
        {
            if (context.allTokens.image[tokenIndex] == null)
            {
                // Null-image tokens must be one of the synthetic separators.
                if (TokenTypeUtils.isDocumentSeparator(context.allTokens.type[tokenIndex]))
                    return DS;
                if (TokenTypeUtils.isFieldSeparator(context.allTokens.type[tokenIndex]))
                    return FS;
                if (TokenTypeUtils.isTerminator(context.allTokens.type[tokenIndex]))
                    return EOS;
                throw new RuntimeException();
            }

            int wordIndex = context.allTokens.wordIndex[tokenIndex];
            // Negative word index marks a token with no corresponding word entry.
            if (wordIndex < 0)
                return MW;
            return new String(context.allWords.image[wordIndex]);
        }

        /** Image of the stem this token's word maps to, or null for separators. */
        public String getStemImage()
        {
            if (getTokenImage() == null)
                return null;

            int wordIndex = context.allTokens.wordIndex[tokenIndex];
            int stemIndex = context.allWords.stemIndex[wordIndex];
            return new String(context.allStems.image[stemIndex]);
        }

        /** Raw type of this token's word, or null for separators. */
        public Integer getWordType()
        {
            if (getTokenImage() == null)
                return null;
            return (int) context.allWords.type[context.allTokens.wordIndex[tokenIndex]];
        }
    }

    /** Return all tokens of the context, in token order. */
    public List<TokenEntry> tokens()
    {
        List<TokenEntry> result = Lists.newArrayList();
        for (int i = 0; i < context.allTokens.image.length; i++)
            result.add(new TokenEntry(i));
        return result;
    }

    /**
     * Assertions on a single token of allTokens, identified by its index.
     */
    final class TokenAssert
    {
        private final int tokenIndex;
        private final String tokenImage;

        public TokenAssert(int tokenIndex)
        {
            this.tokenIndex = tokenIndex;
            this.tokenImage = tokenIndex + ":"
                + (context.allTokens.image[tokenIndex] != null ? new String(context.allTokens.image[tokenIndex]) : "<null>");
        }

        /** type masked to token type only. */
        public TokenAssert hasTokenType(int tokenType)
        {
            Assertions.assertThat(tokenType)
                .as("token type (masked) of token '" + tokenImage + "'")
                .isEqualTo(TokenTypeUtils.maskType(context.allTokens.type[tokenIndex]));
            return this;
        }

        /** raw value of token type field. */
        public TokenAssert hasExactTokenType(int tokenType)
        {
            Assertions.assertThat(tokenType)
                .as("token type of token '" + tokenImage + "'")
                .isEqualTo(context.allTokens.type[tokenIndex]);
            return this;
        }

        /** Asserts the token's raw image (null allowed for separators). */
        public TokenAssert hasImage(String image)
        {
            // The comparator treats two nulls as equal, so a null expectation
            // matches separator/terminator tokens.
            Assertions.assertThat(
                CharArrayComparators.FAST_CHAR_ARRAY_COMPARATOR.compare(
                    image != null ? image.toCharArray() : null,
                    context.allTokens.image[tokenIndex]) == 0)
                .as("token image equality: " + image + " vs. " +
                    new String(
                        MoreObjects.firstNonNull(context.allTokens.image[tokenIndex], "<null>".toCharArray())))
                .isTrue();
            return this;
        }

        /** Asserts the document index this token belongs to. */
        public TokenAssert hasDocIndex(int expectedDocIndex)
        {
            Assertions.assertThat(context.allTokens.documentIndex[tokenIndex])
                .as("documentIndex")
                .isEqualTo(expectedDocIndex);
            return this;
        }

        /** Asserts the field index this token belongs to. */
        public TokenAssert hasFieldIndex(int expectedFieldIndex)
        {
            Assertions.assertThat(context.allTokens.fieldIndex[tokenIndex])
                .as("fieldIndex")
                .isEqualTo((byte) expectedFieldIndex);
            return this;
        }
    }

    /** Return assertions for the token at the given index. */
    public TokenAssert tokenAt(int tokenIndex)
    {
        return new TokenAssert(tokenIndex);
    }

    /**
     * Make sure term frequencies and per-document term frequencies recorded
     * for every phrase agree with a naive manual recount over allTokens.
     */
    public void phraseTfsCorrect()
    {
        // for each discovered phrase, do manual count and verify if tf and tfByDocument are correct.
        AllPhrases allPhrases = context.allPhrases;
        for (int index = 0; index < allPhrases.size(); index++)
        {
            IntIntHashMap realTfByDocuments = countManually(context, allPhrases.wordIndices[index]);

            // The total tf is the sum of per-document tfs.
            final int realTf = realTfByDocuments.forEach(new IntIntProcedure()
            {
                int tf;
                public void apply(int key, int value)
                {
                    tf += value;
                }
            }).tf;

            Assertions.assertThat(allPhrases.tf[index]).as("Phrase: " + allPhrases.getPhrase(index))
                .isEqualTo(realTf);

            // Phrase extractor does not sort the byDocumentTf, so we need to addAllFromFlattened
            // to a map and then flatten with sorting.
            Assertions
                .assertThat(
                    IntMapUtils.flattenSortedByKey(IntMapUtils.addAllFromFlattened(
                        new IntIntHashMap(), allPhrases.tfByDocument[index])))
                .as("Phrase: " + allPhrases.getPhrase(index))
                .isEqualTo(IntMapUtils.flattenSortedByKey(realTfByDocuments));
        }
    }

    /**
     * Manually and naively count doc-&gt;tf for the given word sequence.
     */
    private IntIntHashMap countManually(PreprocessingContext context, int [] phraseWordIndices)
    {
        IntIntHashMap tfByDoc = new IntIntHashMap();
        AllTokens allTokens = context.allTokens;
        // Slide a window of the phrase's length over the token stream. The
        // loop starts at (length - phraseLength - 1), skipping the window that
        // ends on the final token; that token is presumably always the stream
        // terminator (wordIndex < 0), which can never match a phrase word --
        // NOTE(review): confirm this invariant if the token layout changes.
        outer:
        for (int i = allTokens.wordIndex.length - phraseWordIndices.length; --i >=0 ;)
        {
            for (int j = 0; j < phraseWordIndices.length; j++)
            {
                int wordInPhrase = phraseWordIndices[j];
                int wordInTokens = allTokens.wordIndex[i + j];
                if (wordInPhrase != wordInTokens)
                    continue outer;
            }
            tfByDoc.putOrAdd(allTokens.documentIndex[i], 1, 1);
        }
        return tfByDoc;
    }
}