package folioxml.lucene.analysis.folio;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.IOException;

public abstract class LookAroundCharTokenizer extends Tokenizer {

    public LookAroundCharTokenizer() {
        super();
    }

    /**
     * Returns true iff a codepoint should be included in a token. This tokenizer
     * emits tokens from adjacent runs of codepoints that satisfy this predicate.
     * Codepoints for which this returns false define token boundaries and are not
     * included in tokens.
     * <p>
     * {@code previous} and {@code next} are -1 at the beginning and end of the
     * stream, respectively.
     */
    protected abstract boolean isTokenChar(int previous, int c, int next);

    /**
     * Called on each token character to normalize it before it is added to the
     * token. Subclasses may use this to, e.g., lowercase characters; return
     * {@code c} unchanged to perform no normalization.
     */
    protected abstract int normalize(int c);

    private static final int MAX_WORD_LEN = 1024;
    private static final int IO_BUFFER_SIZE = 4096;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    private int offset = 0, finalOffset = 0;
    private int cPrev = -1;
    private int c = -1;
    private int cNext = -1;

    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();
        int length = 0;
        int start = -1; // Set to the index of the first valid token char we find.
        char[] buffer = termAtt.buffer(); // Destination buffer.
        while (true) {
            // Advance the three-character window. No support for surrogates here!
            cPrev = c;
            c = cNext;
            cNext = input.read();
            offset++;
            if (length > 0 && c == -1) break; // End of input: emit whatever is buffered for this token.
            if (offset > 1 && c == -1) return false; // End of input with nothing buffered: no more tokens.
            if (offset > 1 && isTokenChar(cPrev, c, cNext)) { // It's a token char.
                if (length == 0) { // Start of token.
                    assert start == -1;
                    start = offset - 2;
                } else if (length >= buffer.length - 1) { // A supplementary character could run out of bounds,
                    buffer = termAtt.resizeBuffer(2 + length); // so make sure one fits in the buffer.
                }
                length += Character.toChars(normalize(c), buffer, length); // Buffer it, normalized.
                if (length >= MAX_WORD_LEN) break; // Buffer is full; check >= because a surrogate pair could skip past ==.
            } else if (length > 0) { // At a non-token char with buffered chars:
                break; // return them.
            }
        }
        termAtt.setLength(length);
        assert start != -1;
        offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start + length));
        return true;
    }

    @Override
    public void end() throws IOException {
        super.end();
        // Set the final offset.
        offsetAtt.setOffset(finalOffset, finalOffset);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        offset = 0;
        finalOffset = 0;
        cPrev = -1;
        c = -1;
        cNext = -1;
    }
}
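/*
 * A minimal sketch of a concrete subclass, for illustration only. This
 * hypothetical WhitespaceLookAroundTokenizer is not part of folioxml: it
 * treats every non-whitespace character as a token character (ignoring the
 * look-around context) and lowercases each character, roughly mimicking a
 * lowercasing whitespace tokenizer. A real subclass would typically use
 * previous/next to make context-sensitive decisions, e.g. keeping a period
 * only when it sits between two token characters.
 */
class WhitespaceLookAroundTokenizer extends LookAroundCharTokenizer {

    @Override
    protected boolean isTokenChar(int previous, int c, int next) {
        // The previous/next context is available but unused in this sketch.
        return !Character.isWhitespace(c);
    }

    @Override
    protected int normalize(int c) {
        // Lowercase every code point before it is buffered.
        return Character.toLowerCase(c);
    }
}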