/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.synonym;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.FlattenGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.RollingBuffer;
import org.apache.lucene.util.fst.FST;

// TODO: maybe we should resolve token -> wordID then run
// FST on wordIDs, for better perf?

// TODO: a more efficient approach would be Aho/Corasick's
// algorithm
// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
// It improves over the current approach here
// because it does not fully re-start matching at every
// token. For example if one pattern is "a b c x"
// and another is "b c d" and the input is "a b c d", on
// trying to parse "a b c x" but failing when you got to x,
// rather than starting over again you really should
// immediately recognize that "b c d" matches at the next
// input. I suspect this won't matter that much in
// practice, but it's possible on some set of synonyms it
// will. We'd have to modify Aho/Corasick to enforce our
// conflict resolving (eg greedy matching) because that algo
// finds all matches. This really amounts to adding a .*
// closure to the FST and then determinizing it.
//
// Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps

/** Applies single- or multi-token synonyms from a {@link SynonymMap}
 * to an incoming {@link TokenStream}, producing a fully correct graph
 * output. This is a replacement for {@link SynonymFilter}, which produces
 * incorrect graphs for multi-token synonyms.
 *
 * <p>However, if you use this during indexing, you must follow it with
 * {@link FlattenGraphFilter} to squash tokens on top of one another
 * like {@link SynonymFilter}, because the indexer can't directly
 * consume a graph. To get fully correct positional queries when your
 * synonym replacements are multiple tokens, you should instead apply
 * synonyms using this {@code TokenFilter} at query time and translate
 * the resulting graph to a {@code TermAutomatonQuery} e.g. using
 * {@code TokenStreamToTermAutomatonQuery}.
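 *
 * <p>A typical index-time chain might look like this (a sketch; it assumes
 * a previously built {@link SynonymMap} named {@code map}, and uses
 * {@code WhitespaceTokenizer} purely for illustration):
 *
 * <pre class="prettyprint">
 *   Analyzer analyzer = new Analyzer() {
 *     protected TokenStreamComponents createComponents(String fieldName) {
 *       Tokenizer tokenizer = new WhitespaceTokenizer();
 *       TokenStream stream = new SynonymGraphFilter(tokenizer, map, true);
 *       // The indexer can't consume a graph, so flatten it at index time:
 *       stream = new FlattenGraphFilter(stream);
 *       return new TokenStreamComponents(tokenizer, stream);
 *     }
 *   };
 * </pre>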
 *
 * <p><b>NOTE</b>: this cannot consume an incoming graph; results will
 * be undefined.
 *
 * @lucene.experimental */
public final class SynonymGraphFilter extends TokenFilter {

  public static final String TYPE_SYNONYM = "SYNONYM";

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private final SynonymMap synonyms;
  private final boolean ignoreCase;

  private final FST<BytesRef> fst;

  private final FST.BytesReader fstReader;
  private final FST.Arc<BytesRef> scratchArc;
  private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
  private final BytesRef scratchBytes = new BytesRef();
  private final CharsRefBuilder scratchChars = new CharsRefBuilder();
  private final LinkedList<BufferedOutputToken> outputBuffer = new LinkedList<>();

  private int nextNodeOut;
  private int lastNodeOut;
  private int maxLookaheadUsed;

  // For testing:
  private int captureCount;

  private boolean liveToken;

  // Start/end offset of the current match:
  private int matchStartOffset;
  private int matchEndOffset;

  // True once the input TokenStream is exhausted:
  private boolean finished;

  private int lookaheadNextRead;
  private int lookaheadNextWrite;

  private RollingBuffer<BufferedInputToken> lookahead = new RollingBuffer<BufferedInputToken>() {
    @Override
    protected BufferedInputToken newInstance() {
      return new BufferedInputToken();
    }
  };

  static class BufferedInputToken implements RollingBuffer.Resettable {
    final CharsRefBuilder term = new CharsRefBuilder();
    AttributeSource.State state;
    int startOffset = -1;
    int endOffset = -1;

    @Override
    public void reset() {
      state = null;
      term.clear();

      // Intentionally invalid to ferret out bugs:
      startOffset = -1;
      endOffset = -1;
    }
  }

  static class BufferedOutputToken {
    final String term;

    // Non-null if this was an incoming token:
    final State state;

    final int startNode;
    final int endNode;

    public BufferedOutputToken(State state, String term, int startNode, int endNode) {
      this.state = state;
      this.term = term;
      this.startNode = startNode;
      this.endNode = endNode;
    }
  }

  /**
   * Apply previously built synonyms to incoming tokens.
   * @param input input token stream
   * @param synonyms synonym map
   * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
   *                   Note, if you set this to true, it's your responsibility to lowercase
   *                   the input entries when you create the {@link SynonymMap}.
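   *
   * <p>For example, one way to build such a pre-lowercased map (a sketch;
   * the {@code "usa"} rule is a placeholder, and multi-word outputs must be
   * joined with {@link SynonymMap.Builder#join}, never with plain spaces):
   * <pre class="prettyprint">
   *   SynonymMap.Builder builder = new SynonymMap.Builder(true);
   *   CharsRefBuilder scratch = new CharsRefBuilder();
   *   // Entries are added already lowercased, since ignoreCase=true only
   *   // folds the incoming tokens, never the map entries:
   *   builder.add(new CharsRef("usa"),
   *               SynonymMap.Builder.join(new String[] {"united", "states"}, scratch),
   *               true);
   *   SynonymMap map = builder.build();
   * </pre>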
   */
  public SynonymGraphFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
    super(input);
    this.synonyms = synonyms;
    this.fst = synonyms.fst;
    if (fst == null) {
      throw new IllegalArgumentException("fst must be non-null");
    }
    this.fstReader = fst.getBytesReader();
    scratchArc = new FST.Arc<>();
    this.ignoreCase = ignoreCase;
  }

  @Override
  public boolean incrementToken() throws IOException {
    //System.out.println("\nS: incrToken lastNodeOut=" + lastNodeOut + " nextNodeOut=" + nextNodeOut);

    assert lastNodeOut <= nextNodeOut;

    if (outputBuffer.isEmpty() == false) {
      // We still have pending outputs from a prior synonym match:
      releaseBufferedToken();
      //System.out.println("  syn: ret buffered=" + this);
      assert liveToken == false;
      return true;
    }

    // Try to parse a new synonym match at the current token:
    if (parse()) {
      // A new match was found:
      releaseBufferedToken();
      //System.out.println("  syn: after parse, ret buffered=" + this);
      assert liveToken == false;
      return true;
    }

    if (lookaheadNextRead == lookaheadNextWrite) {

      // Fast path: parse pulled one token, but it didn't match
      // the start for any synonym, so we now return it "live" w/o having
      // cloned all of its atts:
      if (finished) {
        //System.out.println("  syn: ret END");
        return false;
      }

      assert liveToken;
      liveToken = false;

      // NOTE: no need to change posInc since it's relative, i.e. whatever
      // node our output is up to will just increase by the incoming posInc.
      // We also don't need to change posLen, but only because we cannot
      // consume a graph, so the incoming token can never span a future
      // synonym match.

    } else {
      // We still have buffered lookahead tokens from a previous
      // parse attempt that required lookahead; just replay them now:
      //System.out.println("  restore buffer");
      assert lookaheadNextRead < lookaheadNextWrite: "read=" + lookaheadNextRead + " write=" + lookaheadNextWrite;
      BufferedInputToken token = lookahead.get(lookaheadNextRead);
      lookaheadNextRead++;

      restoreState(token.state);

      lookahead.freeBefore(lookaheadNextRead);

      //System.out.println("  after restore offset=" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
      assert liveToken == false;
    }

    lastNodeOut += posIncrAtt.getPositionIncrement();
    nextNodeOut = lastNodeOut + posLenAtt.getPositionLength();

    //System.out.println("  syn: ret lookahead=" + this);
    return true;
  }

  private void releaseBufferedToken() throws IOException {
    //System.out.println("  releaseBufferedToken");

    BufferedOutputToken token = outputBuffer.pollFirst();

    if (token.state != null) {
      // This is an original input token (keepOrig=true case):
      //System.out.println("    hasState");
      restoreState(token.state);
      //System.out.println("      startOffset=" + offsetAtt.startOffset() + " endOffset=" + offsetAtt.endOffset());
    } else {
      clearAttributes();
      //System.out.println("    no state");
      termAtt.append(token.term);

      // We better have a match already:
      assert matchStartOffset != -1;

      offsetAtt.setOffset(matchStartOffset, matchEndOffset);
      //System.out.println("      startOffset=" + matchStartOffset + " endOffset=" + matchEndOffset);
      typeAtt.setType(TYPE_SYNONYM);
    }

    //System.out.println("    lastNodeOut=" + lastNodeOut);
    //System.out.println("    term=" + termAtt);

    posIncrAtt.setPositionIncrement(token.startNode - lastNodeOut);
    lastNodeOut = token.startNode;
    posLenAtt.setPositionLength(token.endNode - token.startNode);
  }

  /** Scans the next input token(s) to see if a synonym matches. Returns true
   *  if a match was found.
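   *
   *  <p>Matching is greedy: the scan keeps pulling lookahead tokens for as
   *  long as the FST has a {@link SynonymMap#WORD_SEPARATOR} arc to follow,
   *  and the longest rule that reached a final state wins; any shorter match
   *  seen along the way is overwritten.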
   */
  private boolean parse() throws IOException {
    // System.out.println(Thread.currentThread().getName() + ": S: parse: " + System.identityHashCode(this));

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);
    assert scratchArc.output == fst.outputs.getNoOutput();

    // How many tokens in the current match
    int matchLength = 0;
    boolean doFinalCapture = false;

    int lookaheadUpto = lookaheadNextRead;
    matchStartOffset = -1;

    byToken:
    while (true) {
      //System.out.println("  cycle lookaheadUpto=" + lookaheadUpto + " maxPos=" + lookahead.getMaxPos());

      // Pull next token's chars:
      final char[] buffer;
      final int bufferLen;
      final int inputEndOffset;

      if (lookaheadUpto <= lookahead.getMaxPos()) {
        // Still in our lookahead buffer
        BufferedInputToken token = lookahead.get(lookaheadUpto);
        lookaheadUpto++;
        buffer = token.term.chars();
        bufferLen = token.term.length();
        inputEndOffset = token.endOffset;
        //System.out.println("    use buffer now max=" + lookahead.getMaxPos());
        if (matchStartOffset == -1) {
          matchStartOffset = token.startOffset;
        }
      } else {

        // We used up our lookahead buffer of input tokens
        // -- pull next real input token:

        assert finished || liveToken == false;

        if (finished) {
          //System.out.println("    break: finished");
          break;
        } else if (input.incrementToken()) {
          //System.out.println("    input.incrToken");
          liveToken = true;
          buffer = termAtt.buffer();
          bufferLen = termAtt.length();
          if (matchStartOffset == -1) {
            matchStartOffset = offsetAtt.startOffset();
          }
          inputEndOffset = offsetAtt.endOffset();

          lookaheadUpto++;
        } else {
          // No more input tokens
          finished = true;
          //System.out.println("    break: now set finished");
          break;
        }
      }

      matchLength++;
      //System.out.println("    cycle term=" + new String(buffer, 0, bufferLen));

      // Run each char in this token through the FST:
      int bufUpto = 0;
      while (bufUpto < bufferLen) {
        final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
        if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
          break byToken;
        }

        // Accum the output
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        bufUpto += Character.charCount(codePoint);
      }

      assert bufUpto == bufferLen;

      // OK, entire token matched; now see if this is a final
      // state in the FST (a match):
      if (scratchArc.isFinal()) {
        matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
        matchInputLength = matchLength;
        matchEndOffset = inputEndOffset;
        //System.out.println("    ** match");
      }

      // See if the FST can continue matching (ie, needs to
      // see the next input token):
      if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
        // No further rules can match here; we're done
        // searching for matching rules starting at the
        // current input position.
        break;
      } else {
        // More matching is possible -- accum the output (if
        // any) of the WORD_SEP arc:
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        doFinalCapture = true;
        if (liveToken) {
          capture();
        }
      }
    }

    if (doFinalCapture && liveToken && finished == false) {
      // Must capture the final token if we captured any prior tokens:
      capture();
    }

    if (matchOutput != null) {
      if (liveToken) {
        // Single input token synonym; we must buffer it now:
        capture();
      }

      // There is a match!
      bufferOutputTokens(matchOutput, matchInputLength);
      lookaheadNextRead += matchInputLength;
      //System.out.println("  precmatch; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos());
      lookahead.freeBefore(lookaheadNextRead);
      //System.out.println("  match; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos());
      return true;
    } else {
      //System.out.println("  no match; lookaheadNextRead=" + lookaheadNextRead);
      return false;
    }

    //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
  }

  /** Expands the output graph into the necessary tokens, adding
   *  synonyms as side paths parallel to the input tokens, and
   *  buffers them in the output token buffer.
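   *
   *  <p>A worked sketch of the node numbering: if the two-token input "a b"
   *  matches a rule whose output is the single token "x" with keepOrig=true,
   *  and the start node is s, then the end node is s+2; "x" is buffered
   *  spanning s to s+2 (posLength=2), while the originals are buffered as
   *  "a" (s to s+1) and "b" (s+1 to s+2).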
   */
  private void bufferOutputTokens(BytesRef bytes, int matchInputLength) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    final int code = bytesReader.readVInt();
    final boolean keepOrig = (code & 0x1) == 0;
    //System.out.println("  buffer: keepOrig=" + keepOrig + " matchInputLength=" + matchInputLength);

    // How many nodes along all paths; we need this to assign the
    // node ID for the final end node where all paths merge back:
    int totalPathNodes;
    if (keepOrig) {
      assert matchInputLength > 0;
      totalPathNodes = matchInputLength - 1;
    } else {
      totalPathNodes = 0;
    }

    // How many synonyms we will insert over this match:
    final int count = code >>> 1;

    // TODO: we could encode this instead into the FST:

    // 1st pass: count how many new nodes we need
    List<List<String>> paths = new ArrayList<>();
    for (int outputIDX = 0; outputIDX < count; outputIDX++) {
      int wordID = bytesReader.readVInt();
      synonyms.words.get(wordID, scratchBytes);
      scratchChars.copyUTF8Bytes(scratchBytes);
      int lastStart = 0;

      List<String> path = new ArrayList<>();
      paths.add(path);
      int chEnd = scratchChars.length();
      for (int chUpto = 0; chUpto <= chEnd; chUpto++) {
        if (chUpto == chEnd || scratchChars.charAt(chUpto) == SynonymMap.WORD_SEPARATOR) {
          path.add(new String(scratchChars.chars(), lastStart, chUpto - lastStart));
          lastStart = 1 + chUpto;
        }
      }

      assert path.size() > 0;
      totalPathNodes += path.size() - 1;
    }
    //System.out.println("  totalPathNodes=" + totalPathNodes);

    // 2nd pass: buffer tokens for the graph fragment

    // NOTE: totalPathNodes will be 0 in the case where the matched
    // input is a single token and all outputs are also a single token

    // We "spawn" a side-path for each of the outputs for this matched
    // synonym, all ending back at this end node:
    int startNode = nextNodeOut;

    int endNode = startNode + totalPathNodes + 1;
    //System.out.println("  " + paths.size() + " new side-paths");

    // First, fanout all tokens departing start node for these new side paths:
    int newNodeCount = 0;
    for (List<String> path : paths) {
      int pathEndNode;
      //System.out.println("    path size=" + path.size());
      if (path.size() == 1) {
        // Single token output, so there are no intermediate nodes:
        pathEndNode = endNode;
      } else {
        pathEndNode = nextNodeOut + newNodeCount + 1;
        newNodeCount += path.size() - 1;
      }
      outputBuffer.add(new BufferedOutputToken(null, path.get(0), startNode, pathEndNode));
    }

    // We must do the original tokens last, else the offsets "go backwards":
    if (keepOrig) {
      BufferedInputToken token = lookahead.get(lookaheadNextRead);
      int inputEndNode;
      if (matchInputLength == 1) {
        // Single token matched input, so there are no intermediate nodes:
        inputEndNode = endNode;
      } else {
        inputEndNode = nextNodeOut + newNodeCount + 1;
      }

      //System.out.println("    keepOrig first token: " + token.term);
      outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), startNode, inputEndNode));
    }
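    // All side paths (synonym outputs and, if kept, the original tokens)
    // merge back at endNode, so subsequent output nodes are numbered from there: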
    nextNodeOut = endNode;

    // Do full side-path for each syn output:
    for (int pathID = 0; pathID < paths.size(); pathID++) {
      List<String> path = paths.get(pathID);
      if (path.size() > 1) {
        int lastNode = outputBuffer.get(pathID).endNode;
        for (int i = 1; i < path.size() - 1; i++) {
          outputBuffer.add(new BufferedOutputToken(null, path.get(i), lastNode, lastNode + 1));
          lastNode++;
        }
        outputBuffer.add(new BufferedOutputToken(null, path.get(path.size() - 1), lastNode, endNode));
      }
    }

    if (keepOrig && matchInputLength > 1) {
      // Do full "side path" with the original tokens:
      int lastNode = outputBuffer.get(paths.size()).endNode;
      for (int i = 1; i < matchInputLength - 1; i++) {
        BufferedInputToken token = lookahead.get(lookaheadNextRead + i);
        outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), lastNode, lastNode + 1));
        lastNode++;
      }
      BufferedInputToken token = lookahead.get(lookaheadNextRead + matchInputLength - 1);
      outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), lastNode, endNode));
    }

    /*
    System.out.println("  after buffer: " + outputBuffer.size() + " tokens:");
    for(BufferedOutputToken token : outputBuffer) {
      System.out.println("    tok: " + token.term + " startNode=" + token.startNode + " endNode=" + token.endNode);
    }
    */
  }

  /** Buffers the current input token into lookahead buffer. */
  private void capture() {
    assert liveToken;
    liveToken = false;
    BufferedInputToken token = lookahead.get(lookaheadNextWrite);
    lookaheadNextWrite++;

    token.state = captureState();
    token.startOffset = offsetAtt.startOffset();
    token.endOffset = offsetAtt.endOffset();
    assert token.term.length() == 0;
    token.term.append(termAtt);

    captureCount++;
    maxLookaheadUsed = Math.max(maxLookaheadUsed, lookahead.getBufferSize());
    //System.out.println("  maxLookaheadUsed=" + maxLookaheadUsed);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    lookahead.reset();
    lookaheadNextWrite = 0;
    lookaheadNextRead = 0;
    captureCount = 0;
    lastNodeOut = -1;
    nextNodeOut = 0;
    matchStartOffset = -1;
    matchEndOffset = -1;
    finished = false;
    liveToken = false;
    outputBuffer.clear();
    maxLookaheadUsed = 0;
    //System.out.println("S: reset");
  }

  // for testing
  int getCaptureCount() {
    return captureCount;
  }

  // for testing
  int getMaxLookaheadUsed() {
    return maxLookaheadUsed;
  }
}