/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.core;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.RollingBuffer;

/**
 * Converts an incoming graph token stream, such as one from
 * {@link SynonymGraphFilter}, into a flat form so that
 * all nodes form a single linear chain with no side paths.  Every
 * path through the graph touches every node.  This is necessary
 * when indexing a graph token stream, because the index does not
 * save {@link PositionLengthAttribute} and so it cannot
 * preserve the graph structure.  However, at search time,
 * query parsers can correctly handle the graph and this token
 * filter should <b>not</b> be used.
 *
 * <p>If the graph was not already flat to start, this
 * is likely a lossy process, i.e. it will often cause the
 * graph to accept token sequences it should not, and to
 * reject token sequences it should not.
 *
 * <p>However, when applying synonyms during indexing, this
 * is necessary because Lucene already does not index a graph
 * and so the indexing process is already lossy
 * (it ignores the {@link PositionLengthAttribute}).
 *
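 * <p>For example, a typical index-time chain applies this filter directly
 * after the graph-producing filter.  A minimal sketch, assuming a pre-built
 * {@code SynonymMap} named {@code synonyms} (hypothetical; not defined in
 * this class):
 *
 * <pre class="prettyprint">
 * Analyzer indexAnalyzer = new Analyzer() {
 *   &#64;Override
 *   protected TokenStreamComponents createComponents(String fieldName) {
 *     Tokenizer tokenizer = new WhitespaceTokenizer();
 *     TokenStream stream = new SynonymGraphFilter(tokenizer, synonyms, true);
 *     stream = new FlattenGraphFilter(stream);
 *     return new TokenStreamComponents(tokenizer, stream);
 *   }
 * };
 * </pre>
 *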
 * @lucene.experimental
 */
public final class FlattenGraphFilter extends TokenFilter {

  /** Holds all tokens leaving a given input position. */
  private final static class InputNode implements RollingBuffer.Resettable {
    private final List<AttributeSource.State> tokens = new ArrayList<>();

    /** Our input node, or -1 if we haven't been assigned yet. */
    int node = -1;

    /** Maximum "to" input node for all tokens leaving here; we use this
     *  to know when we can freeze. */
    int maxToNode = -1;

    /** Where we currently map to; this changes (it can only
     *  increase as we see more input tokens) until we are finished
     *  with this position. */
    int outputNode = -1;

    /** Which token (index into {@link #tokens}) we will next output. */
    int nextOut;

    @Override
    public void reset() {
      tokens.clear();
      node = -1;
      outputNode = -1;
      maxToNode = -1;
      nextOut = 0;
    }
  }

  /** Gathers up merged input positions into a single output position,
   *  only for the current "frontier" of nodes we've seen but can't yet
   *  output because they are not frozen. */
  private final static class OutputNode implements RollingBuffer.Resettable {
    private final List<Integer> inputNodes = new ArrayList<>();

    /** Node ID for this output, or -1 if we haven't been assigned yet. */
    int node = -1;

    /** Which input node (index into {@link #inputNodes}) we will next output. */
    int nextOut;

    /** Start offset of tokens leaving this node. */
    int startOffset = -1;

    /** End offset of tokens arriving at this node. */
    int endOffset = -1;

    @Override
    public void reset() {
      inputNodes.clear();
      node = -1;
      nextOut = 0;
      startOffset = -1;
      endOffset = -1;
    }
  }
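
  // An illustrative example, not part of the original source: with a synonym
  // rule "usa -> united states" (keeping the original token),
  // SynonymGraphFilter emits a graph in which "usa" spans two positions:
  //
  //   input graph:  0 ------ usa (posLen=2) ------> 2
  //                 0 -- united --> 1 -- states --> 2
  //
  // Flattening maps both paths onto the single chain 0 -> 1 -> 2: "usa" and
  // "united" both leave output node 0 ("usa" keeping posLen=2), and "states"
  // leaves output node 1.  The index can then store plain positions even
  // though it discards PositionLengthAttribute.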

  private final RollingBuffer<InputNode> inputNodes = new RollingBuffer<InputNode>() {
    @Override
    protected InputNode newInstance() {
      return new InputNode();
    }
  };

  private final RollingBuffer<OutputNode> outputNodes = new RollingBuffer<OutputNode>() {
    @Override
    protected OutputNode newInstance() {
      return new OutputNode();
    }
  };

  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  /** Which input node the last seen token leaves from */
  private int inputFrom;

  /** We are currently releasing tokens leaving from this output node */
  private int outputFrom;

  // for debugging:
  //private int retOutputFrom;

  private boolean done;
  private int lastOutputFrom;
  private int finalOffset;
  private int finalPosInc;
  private int maxLookaheadUsed;
  private int lastStartOffset;

  public FlattenGraphFilter(TokenStream in) {
    super(in);
  }

  private boolean releaseBufferedToken() {

    // We only need the while loop (retry) if we have a hole (an output node that has no tokens leaving):
    while (outputFrom < outputNodes.getMaxPos()) {
      OutputNode output = outputNodes.get(outputFrom);
      if (output.inputNodes.isEmpty()) {
        // No tokens arrived at this node, which happens for the first node
        // after a hole:
        //System.out.println("    skip empty outputFrom=" + outputFrom);
        outputFrom++;
        continue;
      }

      int maxToNode = -1;
      for (int inputNodeID : output.inputNodes) {
        InputNode inputNode = inputNodes.get(inputNodeID);
        assert inputNode.outputNode == outputFrom;
        maxToNode = Math.max(maxToNode, inputNode.maxToNode);
      }
      //System.out.println("  release maxToNode=" + maxToNode + " vs inputFrom=" + inputFrom);

      // TODO: we could shrink the frontier here somewhat if we
      // always output posLen=1 as part of our "sausagizing":
      if (maxToNode <= inputFrom || done) {
        //System.out.println("  output node merged these inputs: " + output.inputNodes);
        // These tokens are now frozen:
        assert output.nextOut < output.inputNodes.size() : "output.nextOut=" + output.nextOut + " vs output.inputNodes.size()=" + output.inputNodes.size();
        InputNode inputNode = inputNodes.get(output.inputNodes.get(output.nextOut));
        if (done && inputNode.tokens.size() == 0 && outputFrom >= outputNodes.getMaxPos()) {
          return false;
        }
        if (inputNode.tokens.size() == 0) {
          assert inputNode.nextOut == 0;
          assert output.nextOut == 0;
          // Hole dest nodes should never be merged since 1) we always
          // assign them to a new output position, and 2) since they never
          // have arriving tokens they cannot be pushed:
          assert output.inputNodes.size() == 1 : output.inputNodes.size();
          outputFrom++;
          inputNodes.freeBefore(output.inputNodes.get(0));
          outputNodes.freeBefore(outputFrom);
          continue;
        }

        assert inputNode.nextOut < inputNode.tokens.size();

        restoreState(inputNode.tokens.get(inputNode.nextOut));

        // Correct posInc:
        assert outputFrom >= lastOutputFrom;
        posIncAtt.setPositionIncrement(outputFrom - lastOutputFrom);
        int toInputNodeID = inputNode.node + posLenAtt.getPositionLength();
        InputNode toInputNode = inputNodes.get(toInputNodeID);

        // Correct posLen:
        assert toInputNode.outputNode > outputFrom;
        posLenAtt.setPositionLength(toInputNode.outputNode - outputFrom);
        lastOutputFrom = outputFrom;
        inputNode.nextOut++;
        //System.out.println("  ret " + this);

        OutputNode outputEndNode = outputNodes.get(toInputNode.outputNode);

        // Correct offsets.  This is a bit messy; we must do this so offsets
        // don't go backwards, which would otherwise happen if the replacement
        // has more tokens than the input:
        int startOffset = Math.max(lastStartOffset, output.startOffset);

        // We must do this in case the incoming tokens have broken offsets:
        int endOffset = Math.max(startOffset, outputEndNode.endOffset);

        offsetAtt.setOffset(startOffset, endOffset);
        lastStartOffset = startOffset;

        if (inputNode.nextOut == inputNode.tokens.size()) {
          output.nextOut++;
          if (output.nextOut == output.inputNodes.size()) {
            outputFrom++;
            inputNodes.freeBefore(output.inputNodes.get(0));
            outputNodes.freeBefore(outputFrom);
          }
        }

        return true;
      } else {
        return false;
      }
    }

    //System.out.println("    break false");
    return false;
  }
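
  // Illustrative note, not part of the original source: a hole arises when an
  // upstream filter deletes a token outright, e.g. a StopFilter.  For the
  // input "the quick fox" with "the" removed, "quick" arrives with posInc=2,
  // so its "from" node was never seen as any earlier token's "to" node;
  // incrementToken() below detects this and forces a fresh output node
  // rather than merging across the hole.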

  @Override
  public boolean incrementToken() throws IOException {
    //System.out.println("\nF.increment inputFrom=" + inputFrom + " outputFrom=" + outputFrom);

    while (true) {
      if (releaseBufferedToken()) {
        //retOutputFrom += posIncAtt.getPositionIncrement();
        //System.out.println("    return buffered: " + termAtt + " " + retOutputFrom + "-" + (retOutputFrom + posLenAtt.getPositionLength()));
        //printStates();
        return true;
      } else if (done) {
        //System.out.println("    done, return false");
        return false;
      }

      if (input.incrementToken()) {
        // Input node this token leaves from:
        inputFrom += posIncAtt.getPositionIncrement();

        int startOffset = offsetAtt.startOffset();
        int endOffset = offsetAtt.endOffset();

        // Input node this token goes to:
        int inputTo = inputFrom + posLenAtt.getPositionLength();
        //System.out.println("  input.inc " + termAtt + ": " + inputFrom + "-" + inputTo);

        InputNode src = inputNodes.get(inputFrom);
        if (src.node == -1) {
          // This means the "from" node of this token was never seen as a "to" node,
          // which should only happen if we just crossed a hole.  This is a challenging
          // case for us because we normally rely on the full dependencies expressed
          // by the arcs to assign outgoing node IDs.  It would be better if tokens
          // were never dropped but instead just marked deleted with a new
          // TermDeletedAttribute (boolean valued) ... but until that future, we have
          // a hack here to forcefully jump the output node ID:
          assert src.outputNode == -1;
          src.node = inputFrom;

          src.outputNode = outputNodes.getMaxPos() + 1;
          //System.out.println("    hole: force to outputNode=" + src.outputNode);
          OutputNode outSrc = outputNodes.get(src.outputNode);

          // Not assigned yet:
          assert outSrc.node == -1;
          outSrc.node = src.outputNode;
          outSrc.inputNodes.add(inputFrom);
          outSrc.startOffset = startOffset;
        } else {
          OutputNode outSrc = outputNodes.get(src.outputNode);
          if (outSrc.startOffset == -1 || startOffset > outSrc.startOffset) {
            // "Shrink wrap" the offsets so the original tokens (with the most
            // restrictive offsets) win:
            outSrc.startOffset = Math.max(startOffset, outSrc.startOffset);
          }
        }

        // Buffer this token:
        src.tokens.add(captureState());
        src.maxToNode = Math.max(src.maxToNode, inputTo);
        maxLookaheadUsed = Math.max(maxLookaheadUsed, inputNodes.getBufferSize());

        InputNode dest = inputNodes.get(inputTo);
        if (dest.node == -1) {
          // Common case: first time a token is arriving at this input position:
          dest.node = inputTo;
        }

        // Always number output nodes sequentially:
        int outputEndNode = src.outputNode + 1;

        if (outputEndNode > dest.outputNode) {
          if (dest.outputNode != -1) {
            boolean removed = outputNodes.get(dest.outputNode).inputNodes.remove(Integer.valueOf(inputTo));
            assert removed;
          }
          //System.out.println("    increase output node: " + dest.outputNode + " vs " + outputEndNode);
          outputNodes.get(outputEndNode).inputNodes.add(inputTo);
          dest.outputNode = outputEndNode;

          // Since all we ever do is merge incoming nodes together, and then renumber
          // the merged nodes sequentially, we should only ever assign smaller node
          // numbers:
          assert outputEndNode <= inputTo : "outputEndNode=" + outputEndNode + " vs inputTo=" + inputTo;
        }

        OutputNode outDest = outputNodes.get(dest.outputNode);
        // "Shrink wrap" the offsets so the original tokens (with the most
        // restrictive offsets) win:
        if (outDest.endOffset == -1 || endOffset < outDest.endOffset) {
          outDest.endOffset = endOffset;
        }
      } else {
        //System.out.println("  got false from input");
        input.end();
        finalPosInc = posIncAtt.getPositionIncrement();
        finalOffset = offsetAtt.endOffset();
        done = true;
        // Don't return false here: we need to force release any buffered tokens now
      }
    }
  }

  // Only for debugging:
  /*
  private void printStates() {
    System.out.println("states:");
    for(int i=outputFrom;i<outputNodes.getMaxPos();i++) {
      OutputNode outputNode = outputNodes.get(i);
      System.out.println("  output " + i + ": inputs " + outputNode.inputNodes);
      for(int inputNodeID : outputNode.inputNodes) {
        InputNode inputNode = inputNodes.get(inputNodeID);
        assert inputNode.outputNode == i;
      }
    }
  }
  */

  @Override
  public void end() throws IOException {
    if (done) {
      // NOTE, shady: don't call super.end() here, because we already did from
      // incrementToken() when the input was exhausted; just restore the final
      // attributes we saved there:
      clearAttributes();
      posIncAtt.setPositionIncrement(finalPosInc);
      offsetAtt.setOffset(finalOffset, finalOffset);
    } else {
      // On exception, done is false and finalPosInc/finalOffset were never
      // set, so defer to the normal end handling:
      super.end();
    }
  }

  @Override
  public void reset() throws IOException {
    //System.out.println("F: reset");
    super.reset();
    inputFrom = -1;
    inputNodes.reset();
    InputNode in = inputNodes.get(0);
    in.node = 0;
    in.outputNode = 0;

    outputNodes.reset();
    OutputNode out = outputNodes.get(0);
    out.node = 0;
    out.inputNodes.add(0);
    out.startOffset = 0;
    outputFrom = 0;
    //retOutputFrom = -1;
    lastOutputFrom = -1;
    done = false;
    finalPosInc = -1;
    finalOffset = -1;
    lastStartOffset = 0;
    maxLookaheadUsed = 0;
  }
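
  // A hedged test-style sketch, not part of the original source: the filter
  // can be exercised directly with the test framework's CannedTokenStream.
  // The token(...) helper below is hypothetical shorthand for building a
  // Token with the given term, posInc, posLen, startOffset and endOffset:
  //
  //   TokenStream graph = new CannedTokenStream(
  //       token("usa", 1, 2, 0, 3),      // spans two positions
  //       token("united", 0, 1, 0, 3),
  //       token("states", 1, 1, 0, 3));
  //   TokenStream flat = new FlattenGraphFilter(graph);
  //   // ... consume "flat" and check positions/offsets; getMaxLookaheadUsed()
  //   // then reports how much buffering the graph required.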

  /** For testing */
  public int getMaxLookaheadUsed() {
    return maxLookaheadUsed;
  }
}