package org.xbib.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
import org.junit.Assert;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
*
*/
public abstract class BaseTokenStreamTest extends Assert {
/**
* Attribute that records if it was cleared or not. This is used
* for testing that clearAttributes() was called correctly.
*/
public interface CheckClearAttributesAttribute extends Attribute {
// Returns true if clear() was called since the last invocation, then resets the flag.
boolean getAndResetClearCalled();
}
/**
* Attribute that records if it was cleared or not. This is used
* for testing that clearAttributes() was called correctly.
*/
/**
 * Attribute implementation that records whether {@link #clear()} was called.
 * Added to a stream under test so we can verify that {@code clearAttributes()}
 * is invoked before every {@code incrementToken()} and before {@code end()}.
 */
public static final class CheckClearAttributesAttributeImpl extends AttributeImpl implements CheckClearAttributesAttribute {
    private boolean clearCalled = false;

    @Override
    public boolean getAndResetClearCalled() {
        try {
            return clearCalled;
        } finally {
            // reset so each token's clear() call is tracked independently
            clearCalled = false;
        }
    }

    @Override
    public void clear() {
        // invoked via TokenStream.clearAttributes(); record it for later verification
        clearCalled = true;
    }

    @Override
    public void reflectWith(AttributeReflector reflector) {
        // FIX: was an empty stub; expose the flag so reflectAsString() shows it,
        // matching Lucene's reference implementation of this test attribute
        reflector.reflect(CheckClearAttributesAttribute.class, "clearCalled", clearCalled);
    }

    @Override
    public boolean equals(Object other) {
        return other instanceof CheckClearAttributesAttributeImpl &&
                ((CheckClearAttributesAttributeImpl) other).clearCalled == this.clearCalled;
    }

    @Override
    public int hashCode() {
        // arbitrary prime-ish constant mixed with the flag, consistent with equals()
        return 76137213 ^ Boolean.valueOf(clearCalled).hashCode();
    }

    @Override
    public void copyTo(AttributeImpl target) {
        // mark the target as cleared (clear() sets the flag on this attribute type)
        target.clear();
    }
}
/**
 * Consumes {@code ts} and asserts, token by token, that its attributes match the
 * expected values. Any expectation array/value may be {@code null} to skip that
 * check. Also enforces general TokenStream contracts: clearAttributes() must be
 * called for every token and for end(), position increments/lengths must be legal,
 * start offsets must not go backwards (when {@code offsetsAreCorrect}), and tokens
 * leaving from / arriving at the same graph position must agree on start/end offsets.
 * The stream is reset, fully consumed, ended and closed by this method.
 *
 * @param ts                stream under test
 * @param output            expected term texts (required, may be empty)
 * @param startOffsets      expected start offsets, or null to skip
 * @param endOffsets        expected end offsets, or null to skip
 * @param types             expected token types, or null to skip
 * @param posIncrements     expected position increments, or null to skip
 * @param posLengths        expected position lengths, or null to skip
 * @param finalOffset       expected end offset after end(), or null to skip
 * @param finalPosInc       expected position increment after end(), or null to skip
 * @param keywordAtts       expected keyword flags, or null to skip
 * @param offsetsAreCorrect whether to enforce monotonic / graph-consistent offsets
 * @throws IOException if the stream throws while being consumed
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements,
                                             int[] posLengths, Integer finalOffset, Integer finalPosInc, boolean[] keywordAtts,
                                             boolean offsetsAreCorrect) throws IOException {
    assertNotNull(output);
    CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
    CharTermAttribute termAtt = null;
    if (output.length > 0) {
        assertTrue(ts.hasAttribute(CharTermAttribute.class));
        termAtt = ts.getAttribute(CharTermAttribute.class);
    }
    OffsetAttribute offsetAtt = null;
    if (startOffsets != null || endOffsets != null || finalOffset != null) {
        assertTrue(ts.hasAttribute(OffsetAttribute.class));
        offsetAtt = ts.getAttribute(OffsetAttribute.class);
    }
    TypeAttribute typeAtt = null;
    if (types != null) {
        assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
        typeAtt = ts.getAttribute(TypeAttribute.class);
    }
    PositionIncrementAttribute posIncrAtt = null;
    if (posIncrements != null || finalPosInc != null) {
        assertTrue(ts.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class);
    }
    PositionLengthAttribute posLengthAtt = null;
    if (posLengths != null) {
        assertTrue(ts.hasAttribute(PositionLengthAttribute.class));
        posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
    }
    KeywordAttribute keywordAtt = null;
    if (keywordAtts != null) {
        assertTrue(ts.hasAttribute(KeywordAttribute.class));
        keywordAtt = ts.getAttribute(KeywordAttribute.class);
    }
    // Maps graph position to the start/end offset first seen there, to verify
    // that all tokens leaving/arriving at a position agree on offsets:
    final Map<Integer, Integer> posToStartOffset = new HashMap<>();
    final Map<Integer, Integer> posToEndOffset = new HashMap<>();
    ts.reset();
    int pos = -1;
    int lastStartOffset = 0;
    for (int i = 0; i < output.length; i++) {
        // extra safety: assign bogus values so we notice if incrementToken() preserves
        // state instead of overwriting the attributes
        ts.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        if (offsetAtt != null) {
            offsetAtt.setOffset(14584724, 24683243);
        }
        if (typeAtt != null) {
            typeAtt.setType("bogusType");
        }
        if (posIncrAtt != null) {
            posIncrAtt.setPositionIncrement(45987657);
        }
        if (posLengthAtt != null) {
            posLengthAtt.setPositionLength(45987653);
        }
        if (keywordAtt != null) {
            keywordAtt.setKeyword((i & 1) == 0);
        }
        checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
        assertTrue(ts.incrementToken());
        assertTrue(checkClearAtt.getAndResetClearCalled());
        // FIX: this core assertion had been disabled and replaced by a debug
        // System.err.println, so the helper never verified the token text; restored.
        assertEquals("term " + i, output[i], termAtt.toString());
        if (startOffsets != null) {
            assertEquals(startOffsets[i], offsetAtt.startOffset());
        }
        if (endOffsets != null) {
            assertEquals(endOffsets[i], offsetAtt.endOffset());
        }
        if (types != null) {
            assertEquals(types[i], typeAtt.type());
        }
        if (posIncrements != null) {
            assertEquals(posIncrements[i], posIncrAtt.getPositionIncrement());
        }
        if (posLengths != null) {
            assertEquals(posLengths[i], posLengthAtt.getPositionLength());
        }
        if (keywordAtts != null) {
            assertEquals(keywordAtts[i], keywordAtt.isKeyword());
        }
        // we can enforce some basic things about a few attributes even if the caller doesn't check:
        if (offsetAtt != null) {
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            if (finalOffset != null) {
                assertTrue(startOffset <= finalOffset);
                assertTrue(endOffset <= finalOffset);
            }
            if (offsetsAreCorrect) {
                // start offsets must be non-decreasing across tokens
                assertTrue(offsetAtt.startOffset() >= lastStartOffset);
                lastStartOffset = offsetAtt.startOffset();
            }
            if (offsetsAreCorrect && posLengthAtt != null && posIncrAtt != null) {
                // Validate offset consistency in the graph, ie
                // all tokens leaving from a certain pos have the
                // same startOffset, and all tokens arriving to a
                // certain pos have the same endOffset:
                final int posInc = posIncrAtt.getPositionIncrement();
                pos += posInc;
                final int posLength = posLengthAtt.getPositionLength();
                if (!posToStartOffset.containsKey(pos)) {
                    // First time we've seen a token leaving from this position:
                    posToStartOffset.put(pos, startOffset);
                } else {
                    // We've seen a token leaving from this position
                    // before; verify the startOffset is the same:
                    assertEquals(posToStartOffset.get(pos).intValue(), startOffset);
                }
                final int endPos = pos + posLength;
                if (!posToEndOffset.containsKey(endPos)) {
                    // First time we've seen a token arriving to this position:
                    posToEndOffset.put(endPos, endOffset);
                } else {
                    // We've seen a token arriving to this position
                    // before; verify the endOffset is the same:
                    assertEquals(posToEndOffset.get(endPos).intValue(), endOffset);
                }
            }
        }
        if (posIncrAtt != null) {
            // first token must advance at least one position; later tokens may stack (inc 0)
            if (i == 0) {
                assertTrue(posIncrAtt.getPositionIncrement() >= 1);
            } else {
                assertTrue(posIncrAtt.getPositionIncrement() >= 0);
            }
        }
        if (posLengthAtt != null) {
            assertTrue(posLengthAtt.getPositionLength() >= 1);
        }
    }
    if (ts.incrementToken()) {
        // FIX: guard termAtt — it is null when output is empty, which would NPE here
        fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token="
                + (termAtt == null ? "<no term attribute>" : termAtt.toString()));
    }
    // repeat our extra safety checks for end()
    ts.clearAttributes();
    if (termAtt != null) {
        termAtt.setEmpty().append("bogusTerm");
    }
    if (offsetAtt != null) {
        offsetAtt.setOffset(14584724, 24683243);
    }
    if (typeAtt != null) {
        typeAtt.setType("bogusType");
    }
    if (posIncrAtt != null) {
        posIncrAtt.setPositionIncrement(45987657);
    }
    if (posLengthAtt != null) {
        posLengthAtt.setPositionLength(45987653);
    }
    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
    ts.end();
    assertTrue(checkClearAtt.getAndResetClearCalled());
    if (finalOffset != null) {
        assertEquals(finalOffset.intValue(), offsetAtt.endOffset());
    }
    if (offsetAtt != null) {
        assertTrue(offsetAtt.endOffset() >= 0);
    }
    if (finalPosInc != null) {
        assertEquals(finalPosInc.intValue(), posIncrAtt.getPositionIncrement());
    }
    ts.close();
}
/**
 * Variant without a final-position-increment expectation.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements,
                                             int[] posLengths, Integer finalOffset, boolean[] keywordAtts,
                                             boolean offsetsAreCorrect) throws IOException {
    // FIX: keywordAtts was accepted but a null was passed in its place, silently
    // discarding the caller's keyword expectations; forward it properly.
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, keywordAtts, offsetsAreCorrect);
}
/**
 * Variant without keyword-attribute expectations.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements, int[] posLengths, Integer finalOffset, boolean offsetsAreCorrect) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, offsetsAreCorrect);
}
/**
 * Variant that assumes offsets are correct (strict offset checking enabled).
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements, int[] posLengths, Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true);
}
/**
 * Variant without position-length expectations.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements, Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, finalOffset);
}
/**
 * Variant without position-length or final-offset expectations.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, null, null);
}
/**
 * Checks only the term texts of the stream.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output) throws IOException {
    assertTokenStreamContents(ts, output, null, null, null, null, null, null);
}
/**
 * Checks term texts and token types only.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, String[] types) throws IOException {
    assertTokenStreamContents(ts, output, null, null, types, null, null, null);
}
/**
 * Checks term texts and position increments only.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] posIncrements) throws IOException {
    assertTokenStreamContents(ts, output, null, null, null, posIncrements, null, null);
}
/**
 * Checks term texts and start/end offsets only.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, null);
}
/**
 * Checks term texts, start/end offsets and the final offset after end().
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null, null, finalOffset);
}
/**
 * Checks term texts, offsets and position increments only.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, int[] posIncrements) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, null);
}
/**
 * Checks term texts, offsets, position increments and the final offset.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, int[] posIncrements, Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, null, finalOffset);
}
/**
 * Checks term texts, offsets, position increments, position lengths and the final offset.
 */
public static void assertTokenStreamContents(TokenStream ts, String[] output, int[] startOffsets, int[] endOffsets, int[] posIncrements, int[] posLengths, Integer finalOffset) throws IOException {
    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, posIncrements, posLengths, finalOffset);
}
/**
 * Analyzes {@code input} with {@code a} and checks tokens, offsets, types and
 * position increments; the final offset is expected to equal the input length.
 */
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements) throws IOException {
    checkResetException(a, input);
    TokenStream stream = a.tokenStream("dummy", input);
    assertTokenStreamContents(stream, output, startOffsets, endOffsets, types, posIncrements, null, input.length());
}
/**
 * Same as the shorter overload, additionally checking position lengths.
 */
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements, int[] posLengths) throws IOException {
    checkResetException(a, input);
    TokenStream stream = a.tokenStream("dummy", input);
    assertTokenStreamContents(stream, output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length());
}
/**
 * Fully-parameterized variant, allowing strict offset checking to be disabled.
 */
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] startOffsets, int[] endOffsets, String[] types, int[] posIncrements, int[] posLengths, boolean offsetsAreCorrect) throws IOException {
    checkResetException(a, input);
    TokenStream stream = a.tokenStream("dummy", input);
    assertTokenStreamContents(stream, output, startOffsets, endOffsets, types, posIncrements, posLengths, input.length(), offsetsAreCorrect);
}
/**
 * Checks only the term texts produced for {@code input}.
 */
public static void assertAnalyzesTo(Analyzer a, String input, String[] output) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, null, null);
}
/**
 * Checks term texts and token types.
 */
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, String[] types) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, types, null, null);
}
/**
 * Checks term texts and position increments.
 */
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] posIncrements) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, null);
}
/**
 * Checks term texts, position increments and position lengths.
 */
public static void assertAnalyzesToPositions(Analyzer a, String input, String[] output, int[] posIncrements, int[] posLengths) throws IOException {
    assertAnalyzesTo(a, input, output, null, null, null, posIncrements, posLengths);
}
/**
 * Checks term texts and start/end offsets.
 */
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] startOffsets, int[] endOffsets) throws IOException {
    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null, null);
}
/**
 * Checks term texts, offsets and position increments.
 */
public static void assertAnalyzesTo(Analyzer a, String input, String[] output, int[] startOffsets, int[] endOffsets, int[] posIncrements) throws IOException {
    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, posIncrements, null);
}
/**
 * Verifies the analyzer's TokenStream workflow contract:
 * (1) calling incrementToken() before reset() must fail, and
 * (2) requesting a new stream while the previous one was not closed must fail.
 * The stream states here depend on exact reset/end/close ordering — note the
 * deliberate missing close() before the second tokenStream() call.
 */
private static void checkResetException(Analyzer a, String input) throws IOException {
if (a == null) {
fail("no analyzer");
return;
}
if (input == null) {
fail("no input");
return;
}
// (1) consume before reset(): must throw (or, for MockTokenizer, trip an assert)
TokenStream ts = a.tokenStream("bogus", input);
try {
if (ts.incrementToken()) {
//System.out.println(ts.reflectAsString(false));
fail("didn't get expected exception when reset() not called");
}
} catch (IllegalStateException expected) {
// ok
} catch (AssertionError expected) {
// ok: MockTokenizer
assertTrue(expected.getMessage(), expected.getMessage() != null && expected.getMessage().contains("wrong state"));
} catch (Exception unexpected) {
unexpected.printStackTrace(System.err);
fail("got wrong exception when reset() not called: " + unexpected);
} finally {
// consume correctly
ts.reset();
while (ts.incrementToken()) {
}
ts.end();
ts.close();
}
// check for a missing close()
// deliberately end() without close(), then ask for a new stream:
ts = a.tokenStream("bogus", input);
ts.reset();
while (ts.incrementToken()) {
}
ts.end();
try {
ts = a.tokenStream("bogus", input);
fail("didn't get expected exception when close() not called");
} catch (IllegalStateException expected) {
// ok
} finally {
// closes the still-open stream (ts is only reassigned if tokenStream() succeeded)
ts.close();
}
}
/**
 * Asserts that analyzing {@code input} yields exactly the single token {@code expected}.
 */
public static void checkOneTerm(Analyzer a, final String input, final String expected) throws IOException {
    String[] expectedTokens = {expected};
    assertAnalyzesTo(a, input, expectedTokens);
}
}