package org.xbib.elasticsearch.index.analysis.combo;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Attribute;

import java.io.IOException;
import java.util.AbstractQueue;
import java.util.Iterator;
import java.util.PriorityQueue;

/**
 * A TokenStream combining the output of multiple sub-TokenStreams.
 * <p>
 * Each emitted token is a copy of the current token of the sub-TokenStream at
 * the head of the reading queue, with its position increment recomputed
 * relative to the previously emitted token. If attributes are not uniform
 * between sub-TokenStreams, attributes that the sub-TokenStream being read
 * does not carry are not overwritten by it.
 */
public class ComboTokenStream extends TokenStream {

    /**
     * Whether or not to continue with the current TokenStream
     * if it has multiple terms at same position, minimizing
     * queue moves, or to enforce strict order (position, offsets).
     * <p>
     * NOTE(review): this flag is not read anywhere in this class; presumably
     * it is consulted by the ordering of {@code PositionedTokenStream} - confirm.
     */
    static final boolean KEEP_STREAM_IF_SAME_POSITION = false;

    // Position of the last token emitted by this stream
    private int lastPosition;

    // Position tracked sub-TokenStreams (entries are null where a null stream was given)
    private final PositionedTokenStream[] positionedTokenStreams;

    // Reading queue, using the reading order from PositionedTokenStream
    private final AbstractQueue<PositionedTokenStream> readQueue;

    // Flag for lazy initialization and reset
    private boolean readQueueResetted;

    /**
     * Creates a stream multiplexing the given sub-TokenStreams.
     *
     * @param tokenStreams the streams to combine; {@code null} entries are
     *                     skipped, and an empty list yields a stream without tokens
     */
    public ComboTokenStream(TokenStream... tokenStreams) {
        // Load the TokenStreams, track their position, and register their attributes
        this.positionedTokenStreams = new PositionedTokenStream[tokenStreams.length];
        for (int i = tokenStreams.length - 1; i >= 0; --i) {
            if (tokenStreams[i] == null) {
                continue;
            }
            this.positionedTokenStreams[i] = new PositionedTokenStream(tokenStreams[i]);
            // Register each and every attribute seen in the current sub AttributeSource
            Iterator<Class<? extends Attribute>> iterator =
                    this.positionedTokenStreams[i].getAttributeClassesIterator();
            while (iterator.hasNext()) {
                addAttribute(iterator.next());
            }
        }
        this.lastPosition = 0;
        // Create an initially empty queue.
        // It will be filled at first incrementToken() call, because
        // it needs to call the same function on each sub-TokenStreams.
        // PriorityQueue rejects an initial capacity below 1, so clamp it to
        // keep construction without any sub-TokenStream from throwing.
        this.readQueue = new PriorityQueue<PositionedTokenStream>(Math.max(1, tokenStreams.length));
        this.readQueueResetted = false;
    }

    /*
     * TokenStream multiplexed methods
     */

    /**
     * Emits the next token, taken from the sub-TokenStream at the head of the
     * reading queue.
     *
     * @return {@code true} if a token was produced, {@code false} once every
     *         sub-TokenStream is exhausted
     * @throws IOException if a sub-TokenStream fails to advance
     */
    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();

        // Fill the queue on first call
        if (!readQueueResetted) {
            fillReadQueue();
        }

        // Read from the first token
        PositionedTokenStream toRead = readQueue.peek();
        if (toRead == null) {
            return false; // end of streams
        }

        // Look position to see if it will be increased, see usage a bit below
        int pos = toRead.getPosition();

        // Copy the current token attributes from the sub-TokenStream to our AttributeSource
        restoreState(toRead.captureState());
        // Override the PositionIncrementAttribute
        this.getAttribute(PositionIncrementAttribute.class)
                .setPositionIncrement(Math.max(0, pos - lastPosition));

        // Prepare next read
        // We did not remove the TokenStream from the queue yet,
        // because if it stays alone in the queue we can save a queue movement.
        toRead.clearAttributes();
        if (!toRead.incrementToken()) {
            // No more token to read, remove from the queue
            readQueue.poll();
        } else if (readQueue.size() > 1) {
            // Other streams are competing: re-enter the head in the priority
            // queue so that it gets ordered according to its new token
            readQueue.add(readQueue.poll());
        }
        // Otherwise, next call will continue with the same TokenStream (less queue movements)

        lastPosition = pos;
        return true;
    }

    /**
     * Advances every sub-TokenStream to its first token and enqueues those
     * that produced one.
     */
    private void fillReadQueue() throws IOException {
        readQueueResetted = true;
        readQueue.clear();
        for (PositionedTokenStream pts : positionedTokenStreams) {
            if (pts == null) {
                continue;
            }
            // Read first token
            pts.clearAttributes();
            if (pts.incrementToken()) {
                // PositionedTokenStream.incrementToken() initialized internal
                // variables to perform proper ordering.
                // Therefore we can only add it to the queue now!
                readQueue.add(pts);
            } // else: no token left (no token at all)
        }
    }

    /**
     * Ends this stream and every sub-TokenStream, and clears the reading
     * state so that the queue is rebuilt on the next read.
     *
     * @throws IOException if a sub-TokenStream fails to end
     */
    @Override
    public void end() throws IOException {
        super.end();
        lastPosition = 0;
        // Apply on each sub-TokenStream
        for (PositionedTokenStream pts : positionedTokenStreams) {
            if (pts == null) {
                continue;
            }
            pts.end();
        }
        readQueueResetted = false;
        readQueue.clear();
    }

    /**
     * Resets this stream and every sub-TokenStream, and clears the reading
     * state so that the queue is rebuilt on the next read.
     *
     * @throws IOException if a sub-TokenStream fails to reset
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        clearAttributes();
        lastPosition = 0;
        // Apply on each sub-TokenStream
        for (PositionedTokenStream pts : positionedTokenStreams) {
            if (pts == null) {
                continue;
            }
            pts.reset();
        }
        readQueueResetted = false;
        readQueue.clear();
    }

    /**
     * Closes this stream and every sub-TokenStream.
     * <p>
     * All sub-TokenStreams are closed and the reading state is cleared even
     * if one of them fails; the first failure is rethrown afterwards.
     *
     * @throws IOException the first failure raised while closing
     */
    @Override
    public void close() throws IOException {
        IOException firstFailure = null;
        try {
            super.close();
        } catch (IOException e) {
            firstFailure = e;
        }
        // Apply on each sub-TokenStream, without letting one failure
        // prevent the remaining ones from being closed
        for (PositionedTokenStream pts : positionedTokenStreams) {
            if (pts == null) {
                continue;
            }
            try {
                pts.close();
            } catch (IOException e) {
                if (firstFailure == null) {
                    firstFailure = e;
                }
            }
        }
        lastPosition = 0;
        readQueueResetted = false;
        readQueue.clear();
        if (firstFailure != null) {
            throw firstFailure;
        }
    }
}