package org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
public class TestGraphTokenizers extends BaseTokenStreamTestCase {
  // Makes a graph TokenStream from the string; separate
  // positions with a single space, separate multiple tokens at
  // the same position with /, and add an optional position
  // length with :.  E.g., "a b c" is a simple chain, "a/x b c"
  // adds 'x' over 'a' at position 0 with posLen=1, and
  // "a/x:3 b c" adds 'x' over 'a' with posLen=3.  Tokens are in
  // normal-form, so offsets are computed from the first token at
  // a given position.  NOTE: every input token must be a single
  // character; the offset computation relies on this, which also
  // means you cannot enable MockCharFilter when random testing.
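  //
  // For illustration, "a/x:3 b c" produces these tokens (derived
  // from the offset rules above):
  //
  //   term=a  posInc=1  posLen=1  startOffset=0  endOffset=1
  //   term=x  posInc=0  posLen=3  startOffset=0  endOffset=5
  //   term=b  posInc=1  posLen=1  startOffset=2  endOffset=3
  //   term=c  posInc=1  posLen=1  startOffset=4  endOffset=5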
private static class GraphTokenizer extends Tokenizer {
private List<Token> tokens;
private int upto;
private int inputLength;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
public GraphTokenizer(Reader input) {
super(input);
}
@Override
    public void reset() throws IOException {
      super.reset();
      tokens = null;
      upto = 0;
    }
@Override
public boolean incrementToken() throws IOException {
if (tokens == null) {
fillTokens();
}
//System.out.println("graphTokenizer: incr upto=" + upto + " vs " + tokens.size());
if (upto == tokens.size()) {
//System.out.println(" END @ " + tokens.size());
return false;
}
final Token t = tokens.get(upto++);
//System.out.println(" return token=" + t);
clearAttributes();
termAtt.append(t.toString());
offsetAtt.setOffset(t.startOffset(), t.endOffset());
posIncrAtt.setPositionIncrement(t.getPositionIncrement());
posLengthAtt.setPositionLength(t.getPositionLength());
return true;
}
@Override
public void end() throws IOException {
super.end();
      // NOTE: somewhat hackish, but BaseTokenStreamTestCase
      // requires end() to set the final offset, so we report the
      // last token's end offset and the full input length:
final int lastOffset;
if (tokens != null && !tokens.isEmpty()) {
lastOffset = tokens.get(tokens.size()-1).endOffset();
} else {
lastOffset = 0;
}
offsetAtt.setOffset(correctOffset(lastOffset),
correctOffset(inputLength));
}
private void fillTokens() throws IOException {
final StringBuilder sb = new StringBuilder();
final char[] buffer = new char[256];
while (true) {
final int count = input.read(buffer);
if (count == -1) {
break;
}
sb.append(buffer, 0, count);
//System.out.println("got count=" + count);
}
//System.out.println("fillTokens: " + sb);
inputLength = sb.length();
final String[] parts = sb.toString().split(" ");
tokens = new ArrayList<Token>();
int pos = 0;
int maxPos = -1;
int offset = 0;
//System.out.println("again");
for(String part : parts) {
final String[] overlapped = part.split("/");
boolean firstAtPos = true;
int minPosLength = Integer.MAX_VALUE;
for(String part2 : overlapped) {
final int colonIndex = part2.indexOf(':');
final String token;
final int posLength;
if (colonIndex != -1) {
token = part2.substring(0, colonIndex);
posLength = Integer.parseInt(part2.substring(1+colonIndex));
} else {
token = part2;
posLength = 1;
}
maxPos = Math.max(maxPos, pos + posLength);
minPosLength = Math.min(minPosLength, posLength);
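          // In the normalized form each position occupies one
          // char plus a trailing space, so position p starts at
          // offset 2*p and a token covering posLength positions
          // ends at offset 2*(p + posLength) - 1: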
final Token t = new Token(token, offset, offset + 2*posLength - 1);
t.setPositionLength(posLength);
          t.setPositionIncrement(firstAtPos ? 1 : 0);
firstAtPos = false;
//System.out.println(" add token=" + t + " startOff=" + t.startOffset() + " endOff=" + t.endOffset());
tokens.add(t);
}
pos += minPosLength;
offset = 2 * pos;
}
      assert maxPos <= pos : "input string malformed: posLength>1 tokens hang over the end";
}
}
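  // MockGraphTokenFilter randomly injects tokens, sometimes with
  // posLength > 1, so even a simple input chain becomes a graph: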
public void testMockGraphTokenFilterBasic() throws Exception {
for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make new analyzer each time, because MGTF has fixed
// seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
final TokenStream t2 = new MockGraphTokenFilter(random(), t);
return new TokenStreamComponents(t, t2);
}
};
checkAnalysisConsistency(random(), a, false, "a b c d e f g h i j k");
}
}
public void testMockGraphTokenFilterOnGraphInput() throws Exception {
for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make new analyzer each time, because MGTF has fixed
// seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer t = new GraphTokenizer(reader);
final TokenStream t2 = new MockGraphTokenFilter(random(), t);
return new TokenStreamComponents(t, t2);
}
};
checkAnalysisConsistency(random(), a, false, "a/x:3 c/y:2 d e f/z:4 g h i j k");
}
}
  // Deletes every 'a' token, leaving a hole (position gap) in its place:
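  // For example, "x a b" comes out as x (posInc=1), b (posInc=2):
  // the skipped 'a' contributes its increment to the next emitted
  // token.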
private final static class RemoveATokens extends TokenFilter {
private int pendingPosInc;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
public RemoveATokens(TokenStream in) {
super(in);
}
@Override
public void reset() throws IOException {
super.reset();
pendingPosInc = 0;
}
@Override
public void end() throws IOException {
super.end();
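      // Fold increments from any trailing deleted 'a' tokens into
      // the end-of-stream position increment, so trailing holes
      // are still reported: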
posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
}
@Override
public boolean incrementToken() throws IOException {
while (true) {
final boolean gotOne = input.incrementToken();
if (!gotOne) {
return false;
} else if (termAtt.toString().equals("a")) {
pendingPosInc += posIncAtt.getPositionIncrement();
} else {
posIncAtt.setPositionIncrement(pendingPosInc + posIncAtt.getPositionIncrement());
pendingPosInc = 0;
return true;
}
}
}
}
public void testMockGraphTokenFilterBeforeHoles() throws Exception {
for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make new analyzer each time, because MGTF has fixed
// seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
final TokenStream t2 = new MockGraphTokenFilter(random(), t);
final TokenStream t3 = new RemoveATokens(t2);
return new TokenStreamComponents(t, t3);
}
};
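      // Exercise deleted 'a' tokens at the start, middle, and end of the stream: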
Random random = random();
checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
}
}
public void testMockGraphTokenFilterAfterHoles() throws Exception {
for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make new analyzer each time, because MGTF has fixed
// seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
final TokenStream t2 = new RemoveATokens(t);
final TokenStream t3 = new MockGraphTokenFilter(random(), t2);
return new TokenStreamComponents(t, t3);
}
};
Random random = random();
checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k");
checkAnalysisConsistency(random, a, false, "x y a b c d e f g h i j k");
checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a");
checkAnalysisConsistency(random, a, false, "a b c d e f g h i j k a x y");
}
}
public void testMockGraphTokenFilterRandom() throws Exception {
for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make new analyzer each time, because MGTF has fixed
// seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
final TokenStream t2 = new MockGraphTokenFilter(random(), t);
return new TokenStreamComponents(t, t2);
}
};
Random random = random();
checkRandomData(random, a, 5, atLeast(1000));
}
}
// Two MockGraphTokenFilters
public void testDoubleMockGraphTokenFilterRandom() throws Exception {
for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make new analyzer each time, because MGTF has fixed
// seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
final TokenStream t1 = new MockGraphTokenFilter(random(), t);
final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
return new TokenStreamComponents(t, t2);
}
};
Random random = random();
checkRandomData(random, a, 5, atLeast(1000));
}
}
public void testMockGraphTokenFilterBeforeHolesRandom() throws Exception {
for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make new analyzer each time, because MGTF has fixed
// seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
final TokenStream t1 = new MockGraphTokenFilter(random(), t);
final TokenStream t2 = new MockHoleInjectingTokenFilter(random(), t1);
return new TokenStreamComponents(t, t2);
}
};
Random random = random();
checkRandomData(random, a, 5, atLeast(1000));
}
}
public void testMockGraphTokenFilterAfterHolesRandom() throws Exception {
for(int iter=0;iter<10*RANDOM_MULTIPLIER;iter++) {
if (VERBOSE) {
System.out.println("\nTEST: iter=" + iter);
}
// Make new analyzer each time, because MGTF has fixed
// seed:
final Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
final TokenStream t1 = new MockHoleInjectingTokenFilter(random(), t);
final TokenStream t2 = new MockGraphTokenFilter(random(), t1);
return new TokenStreamComponents(t, t2);
}
};
Random random = random();
checkRandomData(random, a, 5, atLeast(1000));
}
}
}