/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.search.suggest.analyzing;

import com.carrotsearch.hppc.ObjectIntHashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.fst.Util.TopResults;
import org.elasticsearch.common.collect.HppcMaps;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

/**
 * Suggester that first analyzes the surface form, adds the
 * analyzed form to a weighted FST, and then does the same
 * thing at lookup time. This means lookup is based on the
 * analyzed form while suggestions are still the surface
 * form(s).
 *
 * <p>
 * This can result in powerful suggester functionality. For
 * example, if you use an analyzer removing stop words,
 * then the partial text "ghost chr..." could see the
 * suggestion "The Ghost of Christmas Past".
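 *
 * <p>
 * A minimal usage sketch for that scenario ({@code analyzer} and
 * {@code entries} are placeholders for your own stop-word-removing
 * {@link Analyzer} and an {@link InputIterator} over your entries;
 * they are not part of this class):
 * <pre>{@code
 *   XAnalyzingSuggester suggester = new XAnalyzingSuggester(
 *       analyzer, null, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1,
 *       false, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
 *   suggester.build(entries); // e.g. contains "The Ghost of Christmas Past"
 *   List<LookupResult> top = suggester.lookup("ghost chr", null, false, 5);
 * }</pre>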
 * Note that position increments MUST NOT be preserved for this
 * example to work, so you should call the constructor with the
 * <code>preservePositionIncrements</code> parameter set to
 * <code>false</code>.
 *
 * <p>
 * If SynonymFilter is used to map wifi and wireless network to
 * hotspot, then the partial text "wirele..." could suggest
 * "wifi router". Token normalization like stemmers, accent
 * removal, etc., would allow suggestions to ignore such
 * variations.
 *
 * <p>
 * When two matching suggestions have the same weight, they
 * are tie-broken by the analyzed form. If their analyzed
 * form is the same then the order is undefined.
 *
 * <p>
 * There are some limitations:
 * <ul>
 *
 *   <li> A lookup from a query like "net" in English won't
 *        be any different from "net " (i.e., the user added a
 *        trailing space) because analyzers don't reflect
 *        when they've seen a token separator and when they
 *        haven't.
 *
 *   <li> If you're using {@code StopFilter}, and the user will
 *        type "fast apple", but so far all they've typed is
 *        "fast a", again because the analyzer doesn't convey whether
 *        it's seen a token separator after the "a",
 *        {@code StopFilter} will remove that "a", causing
 *        far more matches than you'd expect.
 *
 *   <li> Lookups with the empty string return no results
 *        instead of all results.
 * </ul>
 */
public class XAnalyzingSuggester extends Lookup {

  /**
   * FST&lt;Weight,Surface&gt;:
   *  input is the analyzed form, with a null byte between terms
   *  weights are encoded as costs: (Integer.MAX_VALUE-weight)
   *  surface is the original, unanalyzed form.
   */
  private FST<Pair<Long,BytesRef>> fst = null;

  /**
   * Analyzer that will be used for analyzing suggestions at
   * index time.
   */
  private final Analyzer indexAnalyzer;

  /**
   * Analyzer that will be used for analyzing suggestions at
   * query time.
   */
  private final Analyzer queryAnalyzer;

  /**
   * True if exact match suggestions should always be returned first.
   */
  private final boolean exactFirst;

  /**
   * True if separator between tokens should be preserved.
   */
  private final boolean preserveSep;

  /** Include this flag in the options parameter to {@code
   *  #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to always
   *  return the exact match first, regardless of score. This
   *  has no performance impact but could result in
   *  low-quality suggestions. */
  public static final int EXACT_FIRST = 1;

  /** Include this flag in the options parameter to {@code
   *  #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to preserve
   *  token separators when matching. */
  public static final int PRESERVE_SEP = 2;

  /** Represents the separation between tokens, if
   *  PRESERVE_SEP was specified */
  public static final int SEP_LABEL = '\u001F';

  /** Marks end of the analyzed input and start of dedup
   *  byte. */
  public static final int END_BYTE = 0x0;

  /** Maximum number of dup surface forms (different surface
   *  forms for the same analyzed form). */
  private final int maxSurfaceFormsPerAnalyzedForm;

  /** Maximum graph paths to index for a single analyzed
   *  surface form. This only matters if your analyzer
   *  makes lots of alternate paths (e.g. contains
   *  SynonymFilter). */
  private final int maxGraphExpansions;

  /** Highest number of analyzed paths we saw for any single
   *  input surface form. For analyzers that never create
   *  graphs this will always be 1. */
  private int maxAnalyzedPathsForOneInput;

  private boolean hasPayloads;

  private final int sepLabel;
  private final int payloadSep;
  private final int endByte;
  private final int holeCharacter;

  public static final int PAYLOAD_SEP = '\u001F';
  public static final int HOLE_CHARACTER = '\u001E';

  private final Automaton queryPrefix;

  /** Whether position holes should appear in the automaton. */
  private boolean preservePositionIncrements;

  /** Number of entries the lookup was built with */
  private long count = 0;

  /**
   * Calls {@code #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)
   * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1)}
   *
   * @param analyzer Analyzer that will be used for analyzing suggestions while building the index.
   */
  public XAnalyzingSuggester(Analyzer analyzer) {
    this(analyzer, null, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0,
        SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
  }

  /**
   * Calls {@code #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)
   * AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1)}
   *
   * @param indexAnalyzer Analyzer that will be used for analyzing suggestions while building the index.
   * @param queryAnalyzer Analyzer that will be used for analyzing query text during lookup
   */
  public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
    this(indexAnalyzer, null, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0,
        SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
  }

  /**
   * Creates a new suggester.
   *
   * @param indexAnalyzer Analyzer that will be used for
   *   analyzing suggestions while building the index.
   * @param queryAnalyzer Analyzer that will be used for
   *   analyzing query text during lookup
   * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
   * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
   *   surface forms to keep for a single analyzed form.
   *   When there are too many surface forms we discard the
   *   lowest weighted ones.
   * @param maxGraphExpansions Maximum number of graph paths
   *   to expand from the analyzed form. Set this to -1 for
   *   no limit.
   */
  public XAnalyzingSuggester(Analyzer indexAnalyzer, Automaton queryPrefix, Analyzer queryAnalyzer,
                             int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                             boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst,
                             boolean hasPayloads, int maxAnalyzedPathsForOneInput,
                             int sepLabel, int payloadSep, int endByte, int holeCharacter) {
    // SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput
    this.indexAnalyzer = indexAnalyzer;
    this.queryAnalyzer = queryAnalyzer;
    this.fst = fst;
    this.hasPayloads = hasPayloads;
    if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
      throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
    }
    this.exactFirst = (options & EXACT_FIRST) != 0;
    this.preserveSep = (options & PRESERVE_SEP) != 0;

    // FLORIAN EDIT: I added <code>queryPrefix</code> for context dependent suggestions
    this.queryPrefix = queryPrefix;

    // NOTE: this is just an implementation limitation; if
    // somehow this is a problem we could fix it by using
    // more than one byte to disambiguate ... but 256 seems
    // like it should be way more than enough.
    if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {
      throw new IllegalArgumentException(
          "maxSurfaceFormsPerAnalyzedForm must be > 0 and <= 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")");
    }
    this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;

    if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {
      throw new IllegalArgumentException(
          "maxGraphExpansions must be -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
    }
    this.maxGraphExpansions = maxGraphExpansions;
    this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
    this.preservePositionIncrements = preservePositionIncrements;
    this.sepLabel = sepLabel;
    this.payloadSep = payloadSep;
    this.endByte = endByte;
    this.holeCharacter = holeCharacter;
  }

  /** Returns byte size of the underlying FST. */
  @Override
  public long ramBytesUsed() {
    return fst == null ? 0 : fst.ramBytesUsed();
  }

  public int getMaxAnalyzedPathsForOneInput() {
    return maxAnalyzedPathsForOneInput;
  }

  // Replaces SEP with epsilon or remaps them if
  // we were asked to preserve them:
  private Automaton replaceSep(Automaton a) {

    Automaton result = new Automaton();

    // Copy all states over
    int numStates = a.getNumStates();
    for(int s=0;s<numStates;s++) {
      result.createState();
      result.setAccept(s, a.isAccept(s));
    }

    // Go in reverse topo sort so we know we only have to
    // make one pass:
    Transition t = new Transition();
    int[] topoSortStates = topoSortStates(a);
    for(int i=0;i<topoSortStates.length;i++) {
      int state = topoSortStates[topoSortStates.length-1-i];
      int count = a.initTransition(state, t);
      for(int j=0;j<count;j++) {
        a.getNextTransition(t);
        if (t.min == TokenStreamToAutomaton.POS_SEP) {
          assert t.max == TokenStreamToAutomaton.POS_SEP;
          if (preserveSep) {
            // Remap to SEP_LABEL:
            result.addTransition(state, t.dest, SEP_LABEL);
          } else {
            result.addEpsilon(state, t.dest);
          }
        } else if (t.min == TokenStreamToAutomaton.HOLE) {
          assert t.max == TokenStreamToAutomaton.HOLE;

          // Just remove the hole: there will then be two
          // SEP tokens next to each other, which will only
          // match another hole at search time. Note that
          // it will also match an empty-string token ... if
          // that's somehow a problem we can always map HOLE
          // to a dedicated byte (and escape it in the
          // input).
          result.addEpsilon(state, t.dest);
        } else {
          result.addTransition(state, t.dest, t.min, t.max);
        }
      }
    }

    result.finishState();

    return result;
  }

  protected Automaton convertAutomaton(Automaton a) {
    if (queryPrefix != null) {
      a = Operations.concatenate(Arrays.asList(queryPrefix, a));
      // This automaton should not blow up during determinize:
      a = Operations.determinize(a, Integer.MAX_VALUE);
    }
    return a;
  }

  private int[] topoSortStates(Automaton a) {
    int[] states = new int[a.getNumStates()];
    final Set<Integer> visited = new HashSet<>();
    final LinkedList<Integer> worklist = new LinkedList<>();
    worklist.add(0);
    visited.add(0);
    int upto = 0;
    states[upto] = 0;
    upto++;
    Transition t = new Transition();
    while (worklist.size() > 0) {
      int s = worklist.removeFirst();
      int count = a.initTransition(s, t);
      for (int i=0;i<count;i++) {
        a.getNextTransition(t);
        if (!visited.contains(t.dest)) {
          visited.add(t.dest);
          worklist.add(t.dest);
          states[upto++] = t.dest;
        }
      }
    }
    return states;
  }
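
  // Illustrative note (added for clarity; not from the original source):
  // the escaping below works by doubling the separator byte. With
  // sepLabel = 0x1F, a token whose bytes are {0x61, 0x1F, 0x62} is
  // rewritten by changeToken() to {0x61, 0x1F, 0x1F, 0x62}, so a literal
  // separator byte inside a token can be distinguished from the single
  // SEP_LABEL that replaceSep() inserts between tokens.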
  /** Just escapes the 0xff byte (which we still steal for SEP). */
  private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {

    final BytesRefBuilder spare = new BytesRefBuilder();
    private char sepLabel;

    public EscapingTokenStreamToAutomaton(char sepLabel) {
      this.sepLabel = sepLabel;
    }

    @Override
    protected BytesRef changeToken(BytesRef in) {
      int upto = 0;
      for(int i=0;i<in.length;i++) {
        byte b = in.bytes[in.offset+i];
        if (b == (byte) sepLabel) {
          spare.grow(upto+2);
          spare.setByteAt(upto++, (byte) sepLabel);
          spare.setByteAt(upto++, b);
        } else {
          spare.grow(upto+1);
          spare.setByteAt(upto++, b);
        }
      }
      spare.setLength(upto);
      return spare.get();
    }
  }

  public TokenStreamToAutomaton getTokenStreamToAutomaton() {
    final TokenStreamToAutomaton tsta;
    if (preserveSep) {
      tsta = new EscapingTokenStreamToAutomaton((char) sepLabel);
    } else {
      // When we're not preserving sep, we don't steal 0xff
      // byte, so we don't need to do any escaping:
      tsta = new TokenStreamToAutomaton();
    }
    tsta.setPreservePositionIncrements(preservePositionIncrements);
    return tsta;
  }

  private static class AnalyzingComparator implements Comparator<BytesRef> {

    private final boolean hasPayloads;

    public AnalyzingComparator(boolean hasPayloads) {
      this.hasPayloads = hasPayloads;
    }

    private final ByteArrayDataInput readerA = new ByteArrayDataInput();
    private final ByteArrayDataInput readerB = new ByteArrayDataInput();
    private final BytesRef scratchA = new BytesRef();
    private final BytesRef scratchB = new BytesRef();

    @Override
    public int compare(BytesRef a, BytesRef b) {

      // First by analyzed form:
      readerA.reset(a.bytes, a.offset, a.length);
      scratchA.length = readerA.readShort();
      scratchA.bytes = a.bytes;
      scratchA.offset = readerA.getPosition();

      readerB.reset(b.bytes, b.offset, b.length);
      scratchB.bytes = b.bytes;
      scratchB.length = readerB.readShort();
      scratchB.offset = readerB.getPosition();

      int cmp = scratchA.compareTo(scratchB);
      if (cmp != 0) {
        return cmp;
      }
      readerA.skipBytes(scratchA.length);
      readerB.skipBytes(scratchB.length);

      // Next by cost:
      long aCost = readerA.readInt();
      long bCost = readerB.readInt();
      if (aCost < bCost) {
        return -1;
      } else if (aCost > bCost) {
        return 1;
      }

      // Finally by surface form:
      if (hasPayloads) {
        scratchA.length = readerA.readShort();
        scratchA.offset = readerA.getPosition();
        scratchB.length = readerB.readShort();
        scratchB.offset = readerB.getPosition();
      } else {
        scratchA.offset = readerA.getPosition();
        scratchA.length = a.length - scratchA.offset;
        scratchB.offset = readerB.getPosition();
        scratchB.length = b.length - scratchB.offset;
      }

      return scratchA.compareTo(scratchB);
    }
  }

  @Override
  public void build(InputIterator iterator) throws IOException {
    String prefix = getClass().getSimpleName();
    Path directory = OfflineSorter.getDefaultTempDir();
    Path tempInput = Files.createTempFile(directory, prefix, ".input");
    Path tempSorted = Files.createTempFile(directory, prefix, ".sorted");

    hasPayloads = iterator.hasPayloads();

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    BytesRefBuilder scratch = new BytesRefBuilder();

    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();

    boolean success = false;
    count = 0;
    byte[] buffer = new byte[8];
    try {
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);

      for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null;) {
        LimitedFiniteStringsIterator finiteStrings =
            new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
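
        // Illustrative summary (added for clarity, derived from the writes
        // below): each record in the sorted temp file is laid out as
        //
        //   analyzedLength (short) | analyzed bytes | cost (int) |
        //     surfaceLength (short) | surface bytes | payload bytes   (hasPayloads)
        //     surface bytes                                           (!hasPayloads)
        //
        // AnalyzingComparator reads back this same layout, sorting by
        // analyzed form, then cost, then surface form.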
        for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
          Util.toBytesRef(string, scratch);

          // length of the analyzed text (FST input)
          if (scratch.length() > Short.MAX_VALUE-2) {
            throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) +
                " in length (got " + scratch.length() + ")");
          }
          short analyzedLength = (short) scratch.length();

          // compute the required length:
          // analyzed sequence + weight (4) + surface + analyzedLength (short)
          int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;

          BytesRef payload;

          if (hasPayloads) {
            if (surfaceForm.length > (Short.MAX_VALUE-2)) {
              throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) +
                  " in length (got " + surfaceForm.length + ")");
            }
            payload = iterator.payload();
            // payload + surfaceLength (short)
            requiredLength += payload.length + 2;
          } else {
            payload = null;
          }

          buffer = ArrayUtil.grow(buffer, requiredLength);

          output.reset(buffer);

          output.writeShort(analyzedLength);

          output.writeBytes(scratch.bytes(), 0, scratch.length());

          output.writeInt(encodeWeight(iterator.weight()));

          if (hasPayloads) {
            for(int i=0;i<surfaceForm.length;i++) {
              if (surfaceForm.bytes[i] == payloadSep) {
                throw new IllegalArgumentException(
                    "surface form cannot contain unit separator character U+001F; this character is reserved");
              }
            }
            output.writeShort((short) surfaceForm.length);
            output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
            output.writeBytes(payload.bytes, payload.offset, payload.length);
          } else {
            output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
          }

          assert output.getPosition() == requiredLength: output.getPosition() + " vs " + requiredLength;

          writer.write(buffer, 0, output.getPosition());
        }

        maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
      }
      writer.close();

      // Sort all input/output pairs (required by FST.Builder):
      new OfflineSorter(new AnalyzingComparator(hasPayloads)).sort(tempInput, tempSorted);

      // Free disk space:
      Files.delete(tempInput);

      reader = new OfflineSorter.ByteSequencesReader(tempSorted);

      PairOutputs<Long,BytesRef> outputs =
          new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
      Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

      // Build FST:
      BytesRefBuilder previousAnalyzed = null;
      BytesRefBuilder analyzed = new BytesRefBuilder();
      BytesRef surface = new BytesRef();
      IntsRefBuilder scratchInts = new IntsRefBuilder();
      ByteArrayDataInput input = new ByteArrayDataInput();
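
      // Illustrative note (added for clarity): duplicate analyzed forms
      // are disambiguated below by appending END_BYTE (0x00) plus a
      // per-duplicate counter byte to the FST input. For example, if two
      // surface forms both analyze to "foo", their FST keys become
      // foo 0x00 0x00 and foo 0x00 0x01.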
      // Used to remove duplicate surface forms (but we
      // still index the highest-weight one). We clear
      // this when we see a new analyzed form, so it cannot
      // grow unbounded (at most 256 entries):
      Set<BytesRef> seenSurfaceForms = new HashSet<>();

      int dedup = 0;
      while (reader.read(scratch)) {
        input.reset(scratch.bytes(), 0, scratch.length());
        short analyzedLength = input.readShort();
        analyzed.grow(analyzedLength+2);
        input.readBytes(analyzed.bytes(), 0, analyzedLength);
        analyzed.setLength(analyzedLength);

        long cost = input.readInt();

        surface.bytes = scratch.bytes();
        if (hasPayloads) {
          surface.length = input.readShort();
          surface.offset = input.getPosition();
        } else {
          surface.offset = input.getPosition();
          surface.length = scratch.length() - surface.offset;
        }

        if (previousAnalyzed == null) {
          previousAnalyzed = new BytesRefBuilder();
          previousAnalyzed.copyBytes(analyzed);
          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        } else if (analyzed.get().equals(previousAnalyzed.get())) {
          dedup++;
          if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
            // More than maxSurfaceFormsPerAnalyzedForm
            // dups: skip the rest:
            continue;
          }
          if (seenSurfaceForms.contains(surface)) {
            continue;
          }
          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        } else {
          dedup = 0;
          previousAnalyzed.copyBytes(analyzed);
          seenSurfaceForms.clear();
          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        }

        // TODO: I think we can avoid the extra 2 bytes when
        // there is no dup (dedup==0), but we'd have to fix
        // the exactFirst logic ... which would be sort of
        // hairy because we'd need to special case the two
        // (dup/not dup)...

        // NOTE: must be byte 0 so we sort before whatever
        // is next
        analyzed.append((byte) 0);
        analyzed.append((byte) dedup);

        Util.toIntsRef(analyzed.get(), scratchInts);
        //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
        if (!hasPayloads) {
          builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
        } else {
          int payloadOffset = input.getPosition() + surface.length;
          int payloadLength = scratch.length() - payloadOffset;
          BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
          System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
          br.bytes[surface.length] = (byte) payloadSep;
          System.arraycopy(scratch.bytes(), payloadOffset, br.bytes, surface.length+1, payloadLength);
          br.length = br.bytes.length;
          builder.add(scratchInts.get(), outputs.newPair(cost, br));
        }
      }
      fst = builder.finish();

      //PrintWriter pw = new PrintWriter("/tmp/out.dot");
      //Util.toDot(fst, pw, true, true);
      //pw.close();

      success = true;
    } finally {
      IOUtils.closeWhileHandlingException(reader, writer);

      if (success) {
        IOUtils.deleteFilesIfExist(tempInput, tempSorted);
      } else {
        IOUtils.deleteFilesIgnoringExceptions(tempInput, tempSorted);
      }
    }
  }

  @Override
  public boolean store(OutputStream output) throws IOException {
    DataOutput dataOut = new OutputStreamDataOutput(output);
    try {
      if (fst == null) {
        return false;
      }

      fst.save(dataOut);
      dataOut.writeVInt(maxAnalyzedPathsForOneInput);
      dataOut.writeByte((byte) (hasPayloads ? 1 : 0));
    } finally {
      IOUtils.close(output);
    }
    return true;
  }

  @Override
  public long getCount() {
    return count;
  }

  @Override
  public boolean load(InputStream input) throws IOException {
    DataInput dataIn = new InputStreamDataInput(input);
    try {
      this.fst = new FST<>(dataIn, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
      maxAnalyzedPathsForOneInput = dataIn.readVInt();
      hasPayloads = dataIn.readByte() == 1;
    } finally {
      IOUtils.close(input);
    }
    return true;
  }

  private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) {
    LookupResult result;
    if (hasPayloads) {
      int sepIndex = -1;
      for(int i=0;i<output2.length;i++) {
        if (output2.bytes[output2.offset+i] == payloadSep) {
          sepIndex = i;
          break;
        }
      }
      assert sepIndex != -1;
      final int payloadLen = output2.length - sepIndex - 1;
      spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex);
      BytesRef payload = new BytesRef(payloadLen);
      System.arraycopy(output2.bytes, sepIndex+1, payload.bytes, 0, payloadLen);
      payload.length = payloadLen;
      result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
    } else {
      spare.copyUTF8Bytes(output2);
      result = new LookupResult(spare.toString(), decodeWeight(output1));
    }

    return result;
  }

  private boolean sameSurfaceForm(BytesRef key, BytesRef output2) {
    if (hasPayloads) {
      // output2 has at least PAYLOAD_SEP byte:
      if (key.length >= output2.length) {
        return false;
      }
      for(int i=0;i<key.length;i++) {
        if (key.bytes[key.offset+i] != output2.bytes[output2.offset+i]) {
          return false;
        }
      }
      return output2.bytes[output2.offset + key.length] == payloadSep;
    } else {
      return key.bytesEquals(output2);
    }
  }

  @Override
  public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    assert num > 0;

    if (onlyMorePopular) {
      throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }

    if (fst == null) {
      return Collections.emptyList();
    }

    //System.out.println("lookup key=" + key + " num=" + num);
    for (int i = 0; i < key.length(); i++) {
      if (key.charAt(i) == holeCharacter) {
        throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
      }
      if (key.charAt(i) == sepLabel) {
        throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
      }
    }

    final BytesRef utf8Key = new BytesRef(key);
    try {

      Automaton lookupAutomaton = toLookupAutomaton(key);

      final CharsRefBuilder spare = new CharsRefBuilder();

      //System.out.println("  now intersect exactFirst=" + exactFirst);

      // Intersect automaton w/ suggest wFST and get all
      // prefix starting nodes & their outputs:
      //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

      //System.out.println("  prefixPaths: " + prefixPaths.size());

      BytesReader bytesReader = fst.getBytesReader();

      FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<>();

      final List<LookupResult> results = new ArrayList<>();

      List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths =
          FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);

      if (exactFirst) {

        int count = 0;
        for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
          if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
            // This node has END_BYTE arc leaving, meaning it's an
            // "exact" match:
            count++;
          }
        }

        // Searcher just to find the single exact only
        // match, if present:
        Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
        searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm,
            count * maxSurfaceFormsPerAnalyzedForm, weightComparator);

        // NOTE: we could almost get away with only using
        // the first start node. The only catch is if
        // maxSurfaceFormsPerAnalyzedForm had kicked in and
        // pruned our exact match from one of these nodes
        // ...:
        for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
          if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
            // This node has END_BYTE arc leaving, meaning it's an
            // "exact" match:
            searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
          }
        }

        Util.TopResults<Pair<Long,BytesRef>> completions = searcher.search();

        // NOTE: this is rather inefficient: we enumerate
        // every matching "exactly the same analyzed form"
        // path, and then do linear scan to see if one of
        // these exactly matches the input. It should be
        // possible (though hairy) to do something similar
        // to getByOutput, since the surface form is encoded
        // into the FST output, so we more efficiently hone
        // in on the exact surface-form match. Still, I
        // suspect very little time is spent in this linear
        // search: it's bounded by how many prefix start
        // nodes we have and the
        // maxSurfaceFormsPerAnalyzedForm:
        for(Result<Pair<Long,BytesRef>> completion : completions) {
          BytesRef output2 = completion.output.output2;
          if (sameSurfaceForm(utf8Key, output2)) {
            results.add(getLookupResult(completion.output.output1, output2, spare));
            break;
          }
        }

        if (results.size() == num) {
          // That was quick:
          return results;
        }
      }

      Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
      searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
                                                            num - results.size(),
                                                            num * maxAnalyzedPathsForOneInput,
                                                            weightComparator) {
        private final Set<BytesRef> seen = new HashSet<>();

        @Override
        protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {

          // Dedup: when the input analyzes to a graph we
          // can get duplicate surface forms:
          if (seen.contains(output.output2)) {
            return false;
          }
          seen.add(output.output2);

          if (!exactFirst) {
            return true;
          } else {
            // In exactFirst mode, don't accept any paths
            // matching the surface form since that will
            // create duplicate results:
            if (sameSurfaceForm(utf8Key, output.output2)) {
              // We found exact match, which means we should
              // have already found it in the first search:
              assert results.size() == 1;
              return false;
            } else {
              return true;
            }
          }
        }
      };

      prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

      for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
        searcher.addStartPaths(path.fstNode, path.output, true, path.input);
      }

      TopResults<Pair<Long,BytesRef>> completions = searcher.search();

      for(Result<Pair<Long,BytesRef>> completion : completions) {

        LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);

        // TODO: for fuzzy case would be nice to return
        // how many edits were required
        //System.out.println("  result=" + result);
        results.add(result);

        if (results.size() == num) {
          // In the exactFirst=true case the search may
          // produce one extra path
          break;
        }
      }

      return results;
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
  }

  @Override
  public boolean store(DataOutput output) throws IOException {
    output.writeVLong(count);
    if (fst == null) {
      return false;
    }

    fst.save(output);
    output.writeVInt(maxAnalyzedPathsForOneInput);
    output.writeByte((byte) (hasPayloads ? 1 : 0));
    return true;
  }

  @Override
  public boolean load(DataInput input) throws IOException {
    count = input.readVLong();
    this.fst = new FST<>(input, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
    maxAnalyzedPathsForOneInput = input.readVInt();
    hasPayloads = input.readByte() == 1;
    return true;
  }

  /** Returns all completion paths to initialize the search. */
  protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
                                                                       Automaton lookupAutomaton,
                                                                       FST<Pair<Long,BytesRef>> fst)
    throws IOException {
    return prefixPaths;
  }

  final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
    try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
      return toAutomaton(ts, ts2a);
    }
  }

  final Automaton toAutomaton(TokenStream ts, final TokenStreamToAutomaton ts2a) throws IOException {
    // Create corresponding automaton: labels are bytes
    // from each analyzed token, with byte 0 used as
    // separator between tokens:
    Automaton automaton = ts2a.toAutomaton(ts);

    automaton = replaceSep(automaton);
    automaton = convertAutomaton(automaton);

    // TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings
    // assert SpecialOperations.isFinite(automaton);

    // Get all paths from the automaton (there can be
    // more than one path, eg if the analyzer created a
    // graph using SynFilter or WDF):
    return automaton;
  }

  // EDIT: Adrien, needed by lookup providers
  // NOTE: these XForks are unmaintainable, we need to get rid of them...
  public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
    final TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
    Automaton automaton;
    try (TokenStream ts = stream) {
      automaton = toAutomaton(ts, ts2a);
    }
    LimitedFiniteStringsIterator finiteStrings =
        new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
    Set<IntsRef> set = new HashSet<>();
    for (IntsRef string = finiteStrings.next(); string != null; string = finiteStrings.next()) {
      set.add(IntsRef.deepCopyOf(string));
    }
    return Collections.unmodifiableSet(set);
  }
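
  // Illustrative note (added; the synonym mapping is hypothetical): if the
  // analyzer contains a SynonymFilter that maps "wifi" to "wireless", then
  // toFiniteStrings() on the token stream for "wifi network" returns one
  // IntsRef per path through the resulting token graph -- here, the byte
  // sequences for "wifi<SEP>network" and "wireless<SEP>network" -- capped
  // at maxGraphExpansions paths.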

  final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    // Turn tokenstream into automaton:
    Automaton automaton = null;

    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
      automaton = getTokenStreamToAutomaton().toAutomaton(ts);
    }

    automaton = replaceSep(automaton);

    // TODO: we can optimize this somewhat by determinizing
    // while we convert

    // This automaton should not blow up during determinize:
    automaton = Operations.determinize(automaton, Integer.MAX_VALUE);
    return automaton;
  }

  /**
   * Returns the weight associated with an input string, or null if it does not exist.
   *
   * Unsupported in this implementation (and will throw an {@link UnsupportedOperationException}).
   *
   * @param key input string
   * @return the weight associated with the input string, or {@code null} if it does not exist.
   */
  public Object get(CharSequence key) {
    throw new UnsupportedOperationException();
  }

  /**
   * cost -&gt; weight
   *
   * @param encoded Cost
   * @return Weight
   */
  public static int decodeWeight(long encoded) {
    return (int)(Integer.MAX_VALUE - encoded);
  }

  /**
   * weight -&gt; cost
   *
   * @param value Weight
   * @return Cost
   */
  public static int encodeWeight(long value) {
    if (value < 0 || value > Integer.MAX_VALUE) {
      throw new UnsupportedOperationException("cannot encode value: " + value);
    }
    return Integer.MAX_VALUE - (int)value;
  }

  static final Comparator<Pair<Long,BytesRef>> weightComparator = new Comparator<Pair<Long,BytesRef>>() {
    @Override
    public int compare(Pair<Long,BytesRef> left, Pair<Long,BytesRef> right) {
      return left.output1.compareTo(right.output1);
    }
  };
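
  // Worked example (added for clarity): weights are stored inverted so that
  // the FST's minimal-cost search visits the highest-weight completions
  // first. encodeWeight(5) == Integer.MAX_VALUE - 5, so a higher weight
  // yields a lower cost, and decodeWeight(encodeWeight(5)) == 5 round-trips.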

  public static class XBuilder {
    private Builder<Pair<Long, BytesRef>> builder;
    private int maxSurfaceFormsPerAnalyzedForm;
    private IntsRefBuilder scratchInts = new IntsRefBuilder();
    private final PairOutputs<Long, BytesRef> outputs;
    private boolean hasPayloads;
    private BytesRefBuilder analyzed = new BytesRefBuilder();
    private final SurfaceFormAndPayload[] surfaceFormsAndPayload;
    private int count;
    private ObjectIntHashMap<BytesRef> seenSurfaceForms = HppcMaps.Object.Integer.ensureNoNullKeys(256, 0.75f);
    private int payloadSep;

    public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads, int payloadSep) {
      this.payloadSep = payloadSep;
      this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
      this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
      this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
      this.hasPayloads = hasPayloads;
      surfaceFormsAndPayload = new SurfaceFormAndPayload[maxSurfaceFormsPerAnalyzedForm];
    }

    public void startTerm(BytesRef analyzed) {
      this.analyzed.grow(analyzed.length+2);
      this.analyzed.copyBytes(analyzed);
    }

    private final static class SurfaceFormAndPayload implements Comparable<SurfaceFormAndPayload> {
      BytesRef payload;
      long weight;

      public SurfaceFormAndPayload(BytesRef payload, long cost) {
        super();
        this.payload = payload;
        this.weight = cost;
      }

      @Override
      public int compareTo(SurfaceFormAndPayload o) {
        int res = compare(weight, o.weight);
        if (res == 0) {
          return payload.compareTo(o.payload);
        }
        return res;
      }

      public static int compare(long x, long y) {
        return (x < y) ? -1 : ((x == y) ? 0 : 1);
      }
    }

    public void addSurface(BytesRef surface, BytesRef payload, long cost) throws IOException {
      int surfaceIndex = -1;
      long encodedWeight = cost == -1 ? cost : encodeWeight(cost);
      /*
       * we need to check if we have seen this surface form, if so only use
       * the surface form with the highest weight and drop the rest no matter
       * if the payload differs.
       */
      if (count >= maxSurfaceFormsPerAnalyzedForm) {
        // More than maxSurfaceFormsPerAnalyzedForm
        // dups: skip the rest:
        return;
      }
      BytesRef surfaceCopy;
      final int keySlot;
      if (count > 0 && (keySlot = seenSurfaceForms.indexOf(surface)) >= 0) {
        surfaceIndex = seenSurfaceForms.indexGet(keySlot);
        SurfaceFormAndPayload surfaceFormAndPayload = surfaceFormsAndPayload[surfaceIndex];
        if (encodedWeight >= surfaceFormAndPayload.weight) {
          return;
        }
        surfaceCopy = BytesRef.deepCopyOf(surface);
      } else {
        surfaceIndex = count++;
        surfaceCopy = BytesRef.deepCopyOf(surface);
        seenSurfaceForms.put(surfaceCopy, surfaceIndex);
      }

      BytesRef payloadRef;
      if (!hasPayloads) {
        payloadRef = surfaceCopy;
      } else {
        int len = surface.length + 1 + payload.length;
        final BytesRef br = new BytesRef(len);
        System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
        br.bytes[surface.length] = (byte) payloadSep;
        System.arraycopy(payload.bytes, payload.offset, br.bytes, surface.length + 1, payload.length);
        br.length = len;
        payloadRef = br;
      }
      if (surfaceFormsAndPayload[surfaceIndex] == null) {
        surfaceFormsAndPayload[surfaceIndex] = new SurfaceFormAndPayload(payloadRef, encodedWeight);
      } else {
        surfaceFormsAndPayload[surfaceIndex].payload = payloadRef;
        surfaceFormsAndPayload[surfaceIndex].weight = encodedWeight;
      }
    }

    public void finishTerm(long defaultWeight) throws IOException {
      ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);
      int deduplicator = 0;
      analyzed.append((byte) 0);
      analyzed.setLength(analyzed.length() + 1);
      analyzed.grow(analyzed.length());
      for (int i = 0; i < count; i++) {
        analyzed.setByteAt(analyzed.length() - 1, (byte) deduplicator++);
        Util.toIntsRef(analyzed.get(), scratchInts);
        SurfaceFormAndPayload candidate = surfaceFormsAndPayload[i];
        long cost = candidate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candidate.weight;
        builder.add(scratchInts.get(), outputs.newPair(cost, candidate.payload));
      }
      seenSurfaceForms.clear();
      count = 0;
    }

    public FST<Pair<Long, BytesRef>> build() throws IOException {
      return builder.finish();
    }

    public boolean hasPayloads() {
      return hasPayloads;
    }

    public int maxSurfaceFormsPerAnalyzedForm() {
      return maxSurfaceFormsPerAnalyzedForm;
    }
  }
}