/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.RollingBuffer; // TODO: cut SynFilter over to this // TODO: somehow add "nuke this input token" capability... /** An abstract TokenFilter to make it easier to build graph * token filters requiring some lookahead. This class handles * the details of buffering up tokens, recording them by * position, restoring them, providing access to them, etc. */ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Position> extends TokenFilter { private final static boolean DEBUG = false; protected final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); protected final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); // Position of last read input token: protected int inputPos; // Position of next possible output token to return: protected int outputPos; // True if we hit end from our input: protected boolean end; private boolean tokenPending; private boolean insertPending; /** Holds all state for a single position; subclass this * to record other state at each position. */ protected static class Position implements RollingBuffer.Resettable { // Buffered input tokens at this position: public final List<AttributeSource.State> inputTokens = new ArrayList<>(); // Next buffered token to be returned to consumer: public int nextRead; // Any token leaving from this position should have this startOffset: public int startOffset = -1; // Any token arriving to this position should have this endOffset: public int endOffset = -1; @Override public void reset() { inputTokens.clear(); nextRead = 0; startOffset = -1; endOffset = -1; } public void add(AttributeSource.State state) { inputTokens.add(state); } public AttributeSource.State nextState() { assert nextRead < inputTokens.size(); return inputTokens.get(nextRead++); } } protected LookaheadTokenFilter(TokenStream input) { super(input); } /** Call this only from within afterPosition, to insert a new * token. After calling this you should set any * necessary token you need. */ protected void insertToken() throws IOException { if (tokenPending) { positions.get(inputPos).add(captureState()); tokenPending = false; } assert !insertPending; insertPending = true; } /** This is called when all input tokens leaving a given * position have been returned. Override this and * call insertToken and then set whichever token's * attributes you want, if you want to inject * a token starting from this position. */ protected void afterPosition() throws IOException { } protected abstract T newPosition(); protected final RollingBuffer<T> positions = new RollingBuffer<T>() { @Override protected T newInstance() { return newPosition(); } }; /** Returns true if there is a new token. */ protected boolean peekToken() throws IOException { if (DEBUG) { System.out.println("LTF.peekToken inputPos=" + inputPos + " outputPos=" + outputPos + " tokenPending=" + tokenPending); } assert !end; assert inputPos == -1 || outputPos <= inputPos; if (tokenPending) { positions.get(inputPos).add(captureState()); tokenPending = false; } final boolean gotToken = input.incrementToken(); if (DEBUG) { System.out.println(" input.incrToken() returned " + gotToken); } if (gotToken) { inputPos += posIncAtt.getPositionIncrement(); assert inputPos >= 0; if (DEBUG) { System.out.println(" now inputPos=" + inputPos); } final Position startPosData = positions.get(inputPos); final Position endPosData = positions.get(inputPos + posLenAtt.getPositionLength()); final int startOffset = offsetAtt.startOffset(); if (startPosData.startOffset == -1) { startPosData.startOffset = startOffset; } else { // Make sure our input isn't messing up offsets: assert startPosData.startOffset == startOffset: "prev startOffset=" + startPosData.startOffset + " vs new startOffset=" + startOffset + " inputPos=" + inputPos; } final int endOffset = offsetAtt.endOffset(); if (endPosData.endOffset == -1) { endPosData.endOffset = endOffset; } else { // Make sure our input isn't messing up offsets: assert endPosData.endOffset == endOffset: "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset + " inputPos=" + inputPos; } tokenPending = true; } else { end = true; } return gotToken; } /** Call this when you are done looking ahead; it will set * the next token to return. Return the boolean back to * the caller. */ protected boolean nextToken() throws IOException { //System.out.println(" nextToken: tokenPending=" + tokenPending); if (DEBUG) { System.out.println("LTF.nextToken inputPos=" + inputPos + " outputPos=" + outputPos + " tokenPending=" + tokenPending); } Position posData = positions.get(outputPos); // While loop here in case we have to // skip over a hole from the input: while (true) { //System.out.println(" check buffer @ outputPos=" + //outputPos + " inputPos=" + inputPos + " nextRead=" + //posData.nextRead + " vs size=" + //posData.inputTokens.size()); // See if we have a previously buffered token to // return at the current position: if (posData.nextRead < posData.inputTokens.size()) { if (DEBUG) { System.out.println(" return previously buffered token"); } // This position has buffered tokens to serve up: if (tokenPending) { positions.get(inputPos).add(captureState()); tokenPending = false; } restoreState(positions.get(outputPos).nextState()); //System.out.println(" return!"); return true; } if (inputPos == -1 || outputPos == inputPos) { // No more buffered tokens: // We may still get input tokens at this position //System.out.println(" break buffer"); if (tokenPending) { // Fast path: just return token we had just incr'd, // without having captured/restored its state: if (DEBUG) { System.out.println(" pass-through: return pending token"); } tokenPending = false; return true; } else if (end || !peekToken()) { if (DEBUG) { System.out.println(" END"); } afterPosition(); if (insertPending) { // Subclass inserted a token at this same // position: if (DEBUG) { System.out.println(" return inserted token"); } assert insertedTokenConsistent(); insertPending = false; return true; } return false; } } else { if (posData.startOffset != -1) { // This position had at least one token leaving if (DEBUG) { System.out.println(" call afterPosition"); } afterPosition(); if (insertPending) { // Subclass inserted a token at this same // position: if (DEBUG) { System.out.println(" return inserted token"); } assert insertedTokenConsistent(); insertPending = false; return true; } } // Done with this position; move on: outputPos++; if (DEBUG) { System.out.println(" next position: outputPos=" + outputPos); } positions.freeBefore(outputPos); posData = positions.get(outputPos); } } } // If subclass inserted a token, make sure it had in fact // looked ahead enough: private boolean insertedTokenConsistent() { final int posLen = posLenAtt.getPositionLength(); final Position endPosData = positions.get(outputPos + posLen); assert endPosData.endOffset != -1; assert offsetAtt.endOffset() == endPosData.endOffset: "offsetAtt.endOffset=" + offsetAtt.endOffset() + " vs expected=" + endPosData.endOffset; return true; } // TODO: end()? // TODO: close()? @Override public void reset() throws IOException { super.reset(); positions.reset(); inputPos = -1; outputPos = 0; tokenPending = false; end = false; } }