/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.synonym;

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.FlattenGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;

/**
 * Matches single- or multi-word synonyms in a token stream.
 * This token stream cannot properly handle position
 * increments != 1, ie, you should place this filter before
 * filtering out stop words.
 *
 * <p>Note that with the current implementation, parsing is
 * greedy, so whenever multiple parses would apply, the rule
 * starting the earliest and parsing the most tokens wins.
 * For example if you have these rules:
 *
 * <pre>
 *   a -> x
 *   a b -> y
 *   b c d -> z
 * </pre>
 *
 * Then input <code>a b c d e</code> parses to <code>y b c
 * d</code>, ie the 2nd rule "wins" because it started
 * earliest and matched more input tokens than the other
 * rules starting at that point.
 *
 * <p>A future improvement to this filter could allow
 * non-greedy parsing, such that the 3rd rule would win, and
 * also separately allow multiple parses, such that all 3
 * rules would match, perhaps even on a rule by rule
 * basis.</p>
 *
 * <p><b>NOTE</b>: when a match occurs, the output tokens
 * associated with the matching rule are "stacked" on top of
 * the input stream (if the rule had
 * <code>keepOrig=true</code>) and also on top of another
 * matched rule's output tokens.  This is not a correct
 * solution, as really the output should be an arbitrary
 * graph/lattice.  For example, with the above match, you
 * would expect an exact <code>PhraseQuery</code> <code>"y b
 * c"</code> to match the parsed tokens, but it will fail to
 * do so.  This limitation is necessary because Lucene's
 * TokenStream (and index) cannot yet represent an arbitrary
 * graph.</p>
 *
 * <p><b>NOTE</b>: If multiple incoming tokens arrive on the
 * same position, only the first token at that position is
 * used for parsing.  Subsequent tokens simply pass through
 * and are not parsed.  A future improvement would be to
 * allow these tokens to also be matched.</p>
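 *
 * <p>For orientation, a minimal usage sketch (the <code>tokenizer</code>
 * variable stands for any upstream {@link TokenStream}; the builder calls
 * are the standard {@link SynonymMap.Builder} API):
 *
 * <pre class="prettyprint">
 *   SynonymMap.Builder builder = new SynonymMap.Builder(true);  // true: dedup identical rules
 *   CharsRefBuilder scratch = new CharsRefBuilder();
 *   builder.add(SynonymMap.Builder.join(new String[] {"a", "b"}, scratch),
 *               new CharsRef("y"), true);                       // rule: a b -> y, includeOrig=true
 *   SynonymMap map = builder.build();                           // may throw IOException
 *   TokenStream filtered = new SynonymFilter(tokenizer, map, true);  // true: ignoreCase
 * </pre>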
 *
 * @deprecated Use {@link SynonymGraphFilter} instead, but be sure to also
 * use {@link FlattenGraphFilter} at index time (not at search time).
 */

// TODO: maybe we should resolve token -> wordID then run
// FST on wordIDs, for better perf?

// TODO: a more efficient approach would be Aho/Corasick's
// algorithm
// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
// It improves over the current approach here
// because it does not fully re-start matching at every
// token.  For example if one pattern is "a b c x"
// and another is "b c d" and the input is "a b c d", on
// trying to parse "a b c x" but failing when you get to
// the x, rather than starting over again you really should
// immediately recognize that "b c d" matches at the next
// input.  I suspect this won't matter that much in
// practice, but it's possible on some set of synonyms it
// will.  We'd have to modify Aho/Corasick to enforce our
// conflict resolving (eg greedy matching) because that algo
// finds all matches.  This really amounts to adding a .*
// closure to the FST and then determinizing it.
//
// Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps

@Deprecated
public final class SynonymFilter extends TokenFilter {

  public static final String TYPE_SYNONYM = "SYNONYM";

  private final SynonymMap synonyms;
  private final boolean ignoreCase;
  private final int rollBufferSize;

  private int captureCount;

  // TODO: we should set PositionLengthAttr too...

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  // How many future input tokens have already been matched
  // to a synonym; because the matching is "greedy" we don't
  // try to do any more matching for such tokens:
  private int inputSkipCount;
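
  // Note on the buffers below: futureInputs/futureOutputs are used as ring
  // buffers of rollBufferSize slots; indices advance via rollIncr() (i.e.
  // (i+1) % rollBufferSize), and nextRead == nextWrite means no lookahead
  // is currently buffered.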

  // Hold all buffered (read ahead) stacked input tokens for
  // a future position.  When multiple tokens are at the
  // same position, we only store (and match against) the
  // term for the first token at the position, but capture
  // state for (and enumerate) all other tokens at this
  // position:
  private static class PendingInput {
    final CharsRefBuilder term = new CharsRefBuilder();
    AttributeSource.State state;
    boolean keepOrig;
    boolean matched;
    boolean consumed = true;
    int startOffset;
    int endOffset;

    public void reset() {
      state = null;
      consumed = true;
      keepOrig = false;
      matched = false;
    }
  }

  // Rolling buffer, holding pending input tokens we had to
  // clone because we needed to look ahead, indexed by
  // position:
  private final PendingInput[] futureInputs;

  // Holds pending output synonyms for one future position:
  private static class PendingOutputs {
    CharsRefBuilder[] outputs;
    int[] endOffsets;
    int[] posLengths;
    int upto;
    int count;
    int posIncr = 1;
    int lastEndOffset;
    int lastPosLength;

    public PendingOutputs() {
      outputs = new CharsRefBuilder[1];
      endOffsets = new int[1];
      posLengths = new int[1];
    }

    public void reset() {
      upto = count = 0;
      posIncr = 1;
    }

    public CharsRef pullNext() {
      assert upto < count;
      lastEndOffset = endOffsets[upto];
      lastPosLength = posLengths[upto];
      final CharsRefBuilder result = outputs[upto++];
      posIncr = 0;
      if (upto == count) {
        reset();
      }
      return result.get();
    }

    public int getLastEndOffset() {
      return lastEndOffset;
    }

    public int getLastPosLength() {
      return lastPosLength;
    }

    public void add(char[] output, int offset, int len, int endOffset, int posLength) {
      if (count == outputs.length) {
        outputs = Arrays.copyOf(outputs, ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF));
      }
      if (count == endOffsets.length) {
        final int[] next = new int[ArrayUtil.oversize(1+count, Integer.BYTES)];
        System.arraycopy(endOffsets, 0, next, 0, count);
        endOffsets = next;
      }
      if (count == posLengths.length) {
        final int[] next = new int[ArrayUtil.oversize(1+count, Integer.BYTES)];
        System.arraycopy(posLengths, 0, next, 0, count);
        posLengths = next;
      }
      if (outputs[count] == null) {
        outputs[count] = new CharsRefBuilder();
      }
      outputs[count].copyChars(output, offset, len);
      // endOffset can be -1, in which case we should simply
      // use the endOffset of the input token, or X >= 0, in
      // which case we use X as the endOffset for this output
      endOffsets[count] = endOffset;
      posLengths[count] = posLength;
      count++;
    }
  }
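
  // Illustration of the above: if the rule "a b -> x y" matches, addOutput()
  // pushes "x" onto the PendingOutputs at the match's first position and "y"
  // onto the following position; incrementToken() then drains each
  // position's stacked outputs with pullNext() before advancing nextRead.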

  private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();

  // Rolling buffer, holding stack of pending synonym
  // outputs, indexed by position:
  private final PendingOutputs[] futureOutputs;

  // Where (in rolling buffers) to write next input saved state:
  private int nextWrite;

  // Where (in rolling buffers) to read next input saved state:
  private int nextRead;

  // True once we've read last token
  private boolean finished;

  private final FST.Arc<BytesRef> scratchArc;

  private final FST<BytesRef> fst;

  private final FST.BytesReader fstReader;

  private final BytesRef scratchBytes = new BytesRef();
  private final CharsRefBuilder scratchChars = new CharsRefBuilder();

  /**
   * @param input input tokenstream
   * @param synonyms synonym map
   * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
   *                   Note, if you set this to true, it's your responsibility to lowercase
   *                   the input entries when you create the {@link SynonymMap}
   */
  public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
    super(input);
    this.synonyms = synonyms;
    this.ignoreCase = ignoreCase;
    this.fst = synonyms.fst;
    if (fst == null) {
      throw new IllegalArgumentException("fst must be non-null");
    }
    this.fstReader = fst.getBytesReader();

    // Must be 1+ so that when roll buffer is at full
    // lookahead we can distinguish this full buffer from
    // the empty buffer:
    rollBufferSize = 1+synonyms.maxHorizontalContext;

    futureInputs = new PendingInput[rollBufferSize];
    futureOutputs = new PendingOutputs[rollBufferSize];
    for(int pos=0;pos<rollBufferSize;pos++) {
      futureInputs[pos] = new PendingInput();
      futureOutputs[pos] = new PendingOutputs();
    }

    //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

    scratchArc = new FST.Arc<>();
  }

  private void capture() {
    captureCount++;
    //System.out.println("  capture slot=" + nextWrite);
    final PendingInput input = futureInputs[nextWrite];

    input.state = captureState();
    input.consumed = false;
    input.term.copyChars(termAtt.buffer(), 0, termAtt.length());

    nextWrite = rollIncr(nextWrite);

    // Buffer head should never catch up to tail:
    assert nextWrite != nextRead;
  }

  /*
   This is the core of this TokenFilter: it locates the
   synonym matches and buffers up the results into
   futureInputs/Outputs.

   NOTE: this calls input.incrementToken and does not
   capture the state if no further tokens were checked.  So
   caller must then forward state to our caller, or capture:
  */
  private int lastStartOffset;
  private int lastEndOffset;

  private void parse() throws IOException {
    //System.out.println("\nS: parse");

    assert inputSkipCount == 0;

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);

    assert scratchArc.output == fst.outputs.getNoOutput();

    int tokenCount = 0;

    byToken:
    while(true) {

      // Pull next token's chars:
      final char[] buffer;
      final int bufferLen;
      //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

      int inputEndOffset = 0;

      if (curNextRead == nextWrite) {

        // We used up our lookahead buffer of input tokens
        // -- pull next real input token:

        if (finished) {
          break;
        } else  {
          //System.out.println("  input.incrToken");
          assert futureInputs[nextWrite].consumed;
          // Not correct: a syn match whose output is longer
          // than its input can set future inputs keepOrig
          // to true:
          //assert !futureInputs[nextWrite].keepOrig;
          if (input.incrementToken()) {
            buffer = termAtt.buffer();
            bufferLen = termAtt.length();
            final PendingInput input = futureInputs[nextWrite];
            lastStartOffset = input.startOffset = offsetAtt.startOffset();
            lastEndOffset = input.endOffset = offsetAtt.endOffset();
            inputEndOffset = input.endOffset;
            //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
            if (nextRead != nextWrite) {
              capture();
            } else {
              input.consumed = false;
            }

          } else {
            // No more input tokens
            //System.out.println("  set end");
            finished = true;
            break;
          }
        }
      } else {
        // Still in our lookahead
        buffer = futureInputs[curNextRead].term.chars();
        bufferLen = futureInputs[curNextRead].term.length();
        inputEndOffset = futureInputs[curNextRead].endOffset;
        //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
      }

      tokenCount++;

      // Run each char in this token through the FST:
      int bufUpto = 0;
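      // The loop below steps by Character.charCount(codePoint) so that
      // surrogate pairs are consumed as whole code points; the FST's arc
      // labels are full code points, not UTF-16 code units.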
      while(bufUpto < bufferLen) {
        final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
        if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
          //System.out.println("    stop");
          break byToken;
        }

        // Accum the output
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
        bufUpto += Character.charCount(codePoint);
      }

      // OK, entire token matched; now see if this is a final
      // state:
      if (scratchArc.isFinal()) {
        matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
        matchInputLength = tokenCount;
        matchEndOffset = inputEndOffset;
        //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
      }

      // See if the FST wants to continue matching (ie, needs to
      // see the next input token):
      if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
        // No further rules can match here; we're done
        // searching for matching rules starting at the
        // current input position.
        break;
      } else {
        // More matching is possible -- accum the output (if
        // any) of the WORD_SEP arc:
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        if (nextRead == nextWrite) {
          capture();
        }
      }

      curNextRead = rollIncr(curNextRead);
    }

    if (nextRead == nextWrite && !finished) {
      //System.out.println("  skip write slot=" + nextWrite);
      nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
      //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
      inputSkipCount = matchInputLength;
      addOutput(matchOutput, matchInputLength, matchEndOffset);
    } else if (nextRead != nextWrite) {
      // Even though we had no match here, we set to 1
      // because we need to skip current input token before
      // trying to match again:
      inputSkipCount = 1;
    } else {
      assert finished;
    }

    //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
  }

  // Interleaves all output tokens onto the futureOutputs:
  private void addOutput(BytesRef bytes, int matchInputLength, int matchEndOffset) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    final int code = bytesReader.readVInt();
    final boolean keepOrig = (code & 0x1) == 0;
    final int count = code >>> 1;
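    // Decoding note: the match's FST output starts with a vInt whose low bit
    // flags keepOrig (0 = keep the original tokens) and whose remaining bits
    // give the number of synonym word IDs that follow.  For example,
    // code == 6 decodes to keepOrig == true with three outputs.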
    //System.out.println("  addOutput count=" + count + " keepOrig=" + keepOrig);
    for(int outputIDX=0;outputIDX<count;outputIDX++) {
      synonyms.words.get(bytesReader.readVInt(), scratchBytes);
      //System.out.println("    outIDX=" + outputIDX + " bytes=" + scratchBytes.length);
      scratchChars.copyUTF8Bytes(scratchBytes);
      int lastStart = 0;
      final int chEnd = lastStart + scratchChars.length();
      int outputUpto = nextRead;
      for(int chIDX=lastStart;chIDX<=chEnd;chIDX++) {
        if (chIDX == chEnd || scratchChars.charAt(chIDX) == SynonymMap.WORD_SEPARATOR) {
          final int outputLen = chIDX - lastStart;
          // Caller is not allowed to have empty string in
          // the output:
          assert outputLen > 0: "output contains empty string: " + scratchChars;
          final int endOffset;
          final int posLen;
          if (chIDX == chEnd && lastStart == 0) {
            // This rule had a single output token, so, we set
            // this output's endOffset to the current
            // endOffset (ie, endOffset of the last input
            // token it matched):
            endOffset = matchEndOffset;
            posLen = keepOrig ? matchInputLength : 1;
          } else {
            // This rule has more than one output token; we
            // can't pick any particular endOffset for this
            // case, so, we inherit the endOffset for the
            // input token which this output overlaps:
            endOffset = -1;
            posLen = 1;
          }
          futureOutputs[outputUpto].add(scratchChars.chars(), lastStart, outputLen, endOffset, posLen);
          //System.out.println("      " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
          lastStart = 1+chIDX;
          //System.out.println("  slot=" + outputUpto + " keepOrig=" + keepOrig);
          outputUpto = rollIncr(outputUpto);
          assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite;
        }
      }
    }

    int upto = nextRead;
    for(int idx=0;idx<matchInputLength;idx++) {
      futureInputs[upto].keepOrig |= keepOrig;
      futureInputs[upto].matched = true;
      upto = rollIncr(upto);
    }
  }

  // ++ mod rollBufferSize
  private int rollIncr(int count) {
    count++;
    if (count == rollBufferSize) {
      return 0;
    } else {
      return count;
    }
  }

  // for testing
  int getCaptureCount() {
    return captureCount;
  }
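
  // incrementToken() works in two phases: it first replays anything a
  // previous parse() buffered -- at each position the original token (if
  // kept), then any stacked synonym outputs -- and only once those buffers
  // are drained does it call parse() again to consume more input.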
  @Override
  public boolean incrementToken() throws IOException {

    //System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);

    while(true) {

      // First play back any buffered future inputs/outputs
      // w/o running parsing again:
      while (inputSkipCount != 0) {

        // At each position, we first output the original
        // token

        // TODO: maybe just a PendingState class, holding
        // both input & outputs?
        final PendingInput input = futureInputs[nextRead];
        final PendingOutputs outputs = futureOutputs[nextRead];

        //System.out.println("  cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);

        if (!input.consumed && (input.keepOrig || !input.matched)) {
          if (input.state != null) {
            // Return a previously saved token (because we
            // had to lookahead):
            restoreState(input.state);
          } else {
            // Pass-through case: return token we just pulled
            // but didn't capture:
            assert inputSkipCount == 1: "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead;
          }
          input.reset();
          if (outputs.count > 0) {
            outputs.posIncr = 0;
          } else {
            nextRead = rollIncr(nextRead);
            inputSkipCount--;
          }
          //System.out.println("  return token=" + termAtt.toString());
          return true;
        } else if (outputs.upto < outputs.count) {
          // Still have pending outputs to replay at this
          // position
          input.reset();
          final int posIncr = outputs.posIncr;
          final CharsRef output = outputs.pullNext();
          clearAttributes();
          termAtt.copyBuffer(output.chars, output.offset, output.length);
          typeAtt.setType(TYPE_SYNONYM);
          int endOffset = outputs.getLastEndOffset();
          if (endOffset == -1) {
            endOffset = input.endOffset;
          }
          offsetAtt.setOffset(input.startOffset, endOffset);
          posIncrAtt.setPositionIncrement(posIncr);
          posLenAtt.setPositionLength(outputs.getLastPosLength());
          if (outputs.count == 0) {
            // Done with the buffered input and all outputs at
            // this position
            nextRead = rollIncr(nextRead);
            inputSkipCount--;
          }
          //System.out.println("  return token=" + termAtt.toString());
          return true;
        } else {
          // Done with the buffered input and all outputs at
          // this position
          input.reset();
          nextRead = rollIncr(nextRead);
          inputSkipCount--;
        }
      }

      if (finished && nextRead == nextWrite) {
        // End case: if any output syns went beyond end of
        // input stream, enumerate them now:
        final PendingOutputs outputs = futureOutputs[nextRead];
        if (outputs.upto < outputs.count) {
          final int posIncr = outputs.posIncr;
          final CharsRef output = outputs.pullNext();
          futureInputs[nextRead].reset();
          if (outputs.count == 0) {
            nextWrite = nextRead = rollIncr(nextRead);
          }
          clearAttributes();
          // Keep offset from last input token:
          offsetAtt.setOffset(lastStartOffset, lastEndOffset);
          termAtt.copyBuffer(output.chars, output.offset, output.length);
          typeAtt.setType(TYPE_SYNONYM);
          //System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
          posIncrAtt.setPositionIncrement(posIncr);
          //System.out.println("  return token=" + termAtt.toString());
          return true;
        } else {
          return false;
        }
      }

      // Find new synonym matches:
      parse();
    }
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    captureCount = 0;
    finished = false;
    inputSkipCount = 0;
    nextRead = nextWrite = 0;

    // In normal usage these resets would not be needed,
    // since they reset-as-they-are-consumed, but the app
    // may not consume all input tokens (or we might hit an
    // exception), in which case we have leftover state
    // here:
    for (PendingInput input : futureInputs) {
      input.reset();
    }
    for (PendingOutputs output : futureOutputs) {
      output.reset();
    }
  }
}