CompiledAutomaton.java example

Explorer
heliosearch-master
- lucene
- solr
package org.apache.lucene.util.automaton;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
  
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.PrefixTermsEnum;
import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.util.BytesRef;

/**
 * Immutable class holding compiled details for a given
 * Automaton.  The Automaton is deterministic, must not have
 * dead states but is not necessarily minimal.
 *
 * @lucene.experimental
 */
public class CompiledAutomaton {
  /**
   * Automata are compiled into different internal forms for the
   * most efficient execution depending upon the language they accept.
   */
  public enum AUTOMATON_TYPE {
    /** Automaton that accepts no strings. */
    NONE, 
    /** Automaton that accepts all possible strings. */
    ALL, 
    /** Automaton that accepts only a single fixed string. */
    SINGLE, 
    /** Automaton that matches all Strings with a constant prefix. */
    PREFIX, 
    /** Catch-all for any other automata. */
    NORMAL
  };
  public final AUTOMATON_TYPE type;

  /** 
   * For {@link AUTOMATON_TYPE#PREFIX}, this is the prefix term; 
   * for {@link AUTOMATON_TYPE#SINGLE} this is the singleton term.
   */
  public final BytesRef term;

  /** 
   * Matcher for quickly determining if a byte[] is accepted.
   * only valid for {@link AUTOMATON_TYPE#NORMAL}.
   */
  public final ByteRunAutomaton runAutomaton;
  // TODO: would be nice if these sortedTransitions had "int
  // to;" instead of "State to;" somehow:
  /**
   * Two dimensional array of transitions, indexed by state
   * number for traversal. The state numbering is consistent with
   * {@link #runAutomaton}. 
   * Only valid for {@link AUTOMATON_TYPE#NORMAL}.
   */
  public final Transition[][] sortedTransitions;
  /**
   * Shared common suffix accepted by the automaton. Only valid
   * for {@link AUTOMATON_TYPE#NORMAL}, and only when the
   * automaton accepts an infinite language.
   */
  public final BytesRef commonSuffixRef;
  /**
   * Indicates if the automaton accepts a finite set of strings.
   * Null if this was not computed.
   * Only valid for {@link AUTOMATON_TYPE#NORMAL}.
   */
  public final Boolean finite;

  public CompiledAutomaton(Automaton automaton) {
    this(automaton, null, true);
  }

  public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) {

    if (simplify) {
      // Test whether the automaton is a "simple" form and
      // if so, don't create a runAutomaton.  Note that on a
      // large automaton these tests could be costly:
      if (BasicOperations.isEmpty(automaton)) {
        // matches nothing
        type = AUTOMATON_TYPE.NONE;
        term = null;
        commonSuffixRef = null;
        runAutomaton = null;
        sortedTransitions = null;
        this.finite = null;
        return;
      } else if (BasicOperations.isTotal(automaton)) {
        // matches all possible strings
        type = AUTOMATON_TYPE.ALL;
        term = null;
        commonSuffixRef = null;
        runAutomaton = null;
        sortedTransitions = null;
        this.finite = null;
        return;
      } else {
        final String commonPrefix;
        final String singleton;
        if (automaton.getSingleton() == null) {
          commonPrefix = SpecialOperations.getCommonPrefix(automaton);
          if (commonPrefix.length() > 0 && BasicOperations.sameLanguage(automaton, BasicAutomata.makeString(commonPrefix))) {
            singleton = commonPrefix;
          } else {
            singleton = null;
          }
        } else {
          commonPrefix = null;
          singleton = automaton.getSingleton();
        }
      
        if (singleton != null) {
          // matches a fixed string in singleton or expanded
          // representation
          type = AUTOMATON_TYPE.SINGLE;
          term = new BytesRef(singleton);
          commonSuffixRef = null;
          runAutomaton = null;
          sortedTransitions = null;
          this.finite = null;
          return;
        } else if (BasicOperations.sameLanguage(automaton, BasicOperations.concatenate(
                                                                                       BasicAutomata.makeString(commonPrefix), BasicAutomata.makeAnyString()))) {
          // matches a constant prefix
          type = AUTOMATON_TYPE.PREFIX;
          term = new BytesRef(commonPrefix);
          commonSuffixRef = null;
          runAutomaton = null;
          sortedTransitions = null;
          this.finite = null;
          return;
        }
      }
    }

    type = AUTOMATON_TYPE.NORMAL;
    term = null;
    if (finite == null) {
      this.finite = SpecialOperations.isFinite(automaton);
    } else {
      this.finite = finite;
    }
    Automaton utf8 = new UTF32ToUTF8().convert(automaton);
    if (this.finite) {
      commonSuffixRef = null;
    } else {
      commonSuffixRef = SpecialOperations.getCommonSuffixBytesRef(utf8);
    }
    runAutomaton = new ByteRunAutomaton(utf8, true);
    sortedTransitions = utf8.getSortedTransitions();
  }
  
  //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;

  private BytesRef addTail(int state, BytesRef term, int idx, int leadLabel) {

    // Find biggest transition that's < label
    // TODO: use binary search here
    Transition maxTransition = null;
    for (Transition transition : sortedTransitions[state]) {
      if (transition.min < leadLabel) {
        maxTransition = transition;
      }
    }

    assert maxTransition != null;

    // Append floorLabel
    final int floorLabel;
    if (maxTransition.max > leadLabel-1) {
      floorLabel = leadLabel-1;
    } else {
      floorLabel = maxTransition.max;
    }
    if (idx >= term.bytes.length) {
      term.grow(1+idx);
    }
    //if (DEBUG) System.out.println("  add floorLabel=" + (char) floorLabel + " idx=" + idx);
    term.bytes[idx] = (byte) floorLabel;

    state = maxTransition.to.getNumber();
    idx++;

    // Push down to last accept state
    while (true) {
      Transition[] transitions = sortedTransitions[state];
      if (transitions.length == 0) {
        assert runAutomaton.isAccept(state);
        term.length = idx;
        //if (DEBUG) System.out.println("  return " + term.utf8ToString());
        return term;
      } else {
        // We are pushing "top" -- so get last label of
        // last transition:
        assert transitions.length != 0;
        Transition lastTransition = transitions[transitions.length-1];
        if (idx >= term.bytes.length) {
          term.grow(1+idx);
        }
        //if (DEBUG) System.out.println("  push maxLabel=" + (char) lastTransition.max + " idx=" + idx);
        term.bytes[idx] = (byte) lastTransition.max;
        state = lastTransition.to.getNumber();
        idx++;
      }
    }
  }

  // TODO: should this take startTerm too?  This way
  // Terms.intersect could forward to this method if type !=
  // NORMAL:
  public TermsEnum getTermsEnum(Terms terms) throws IOException {
    switch(type) {
    case NONE:
      return TermsEnum.EMPTY;
    case ALL:
      return terms.iterator(null);
    case SINGLE:
      return new SingleTermsEnum(terms.iterator(null), term);
    case PREFIX:
      // TODO: this is very likely faster than .intersect,
      // but we should test and maybe cutover
      return new PrefixTermsEnum(terms.iterator(null), term);
    case NORMAL:
      return terms.intersect(this, null);
    default:
      // unreachable
      throw new RuntimeException("unhandled case");
    }
  }

  /** Finds largest term accepted by this Automaton, that's
   *  <= the provided input term.  The result is placed in
   *  output; it's fine for output and input to point to
   *  the same BytesRef.  The returned result is either the
   *  provided output, or null if there is no floor term
   *  (ie, the provided input term is before the first term
   *  accepted by this Automaton). */
  public BytesRef floor(BytesRef input, BytesRef output) {

    output.offset = 0;
    //if (DEBUG) System.out.println("CA.floor input=" + input.utf8ToString());

    int state = runAutomaton.getInitialState();

    // Special case empty string:
    if (input.length == 0) {
      if (runAutomaton.isAccept(state)) {
        output.length = 0;
        return output;
      } else {
        return null;
      }
    }

    final List<Integer> stack = new ArrayList<>();

    int idx = 0;
    while (true) {
      int label = input.bytes[input.offset + idx] & 0xff;
      int nextState = runAutomaton.step(state, label);
      //if (DEBUG) System.out.println("  cycle label=" + (char) label + " nextState=" + nextState);

      if (idx == input.length-1) {
        if (nextState != -1 && runAutomaton.isAccept(nextState)) {
          // Input string is accepted
          if (idx >= output.bytes.length) {
            output.grow(1+idx);
          }
          output.bytes[idx] = (byte) label;
          output.length = input.length;
          //if (DEBUG) System.out.println("  input is accepted; return term=" + output.utf8ToString());
          return output;
        } else {
          nextState = -1;
        }
      }

      if (nextState == -1) {

        // Pop back to a state that has a transition
        // <= our label:
        while (true) {
          Transition[] transitions = sortedTransitions[state];
          if (transitions.length == 0) {
            assert runAutomaton.isAccept(state);
            output.length = idx;
            //if (DEBUG) System.out.println("  return " + output.utf8ToString());
            return output;
          } else if (label-1 < transitions[0].min) {

            if (runAutomaton.isAccept(state)) {
              output.length = idx;
              //if (DEBUG) System.out.println("  return " + output.utf8ToString());
              return output;
            }
            // pop
            if (stack.size() == 0) {
              //if (DEBUG) System.out.println("  pop ord=" + idx + " return null");
              return null;
            } else {
              state = stack.remove(stack.size()-1);
              idx--;
              //if (DEBUG) System.out.println("  pop ord=" + (idx+1) + " label=" + (char) label + " first trans.min=" + (char) transitions[0].min);
              label = input.bytes[input.offset + idx] & 0xff;
            }

          } else {
            //if (DEBUG) System.out.println("  stop pop ord=" + idx + " first trans.min=" + (char) transitions[0].min);
            break;
          }
        }

        //if (DEBUG) System.out.println("  label=" + (char) label + " idx=" + idx);

        return addTail(state, output, idx, label);
        
      } else {
        if (idx >= output.bytes.length) {
          output.grow(1+idx);
        }
        output.bytes[idx] = (byte) label;
        stack.add(state);
        state = nextState;
        idx++;
      }
    }
  }
  
  public String toDot() {
    StringBuilder b = new StringBuilder("digraph CompiledAutomaton {\n");
    b.append("  rankdir = LR;\n");
    int initial = runAutomaton.getInitialState();
    for (int i = 0; i < sortedTransitions.length; i++) {
      b.append("  ").append(i);
      if (runAutomaton.isAccept(i)) b.append(" [shape=doublecircle,label=\"\"];\n");
      else b.append(" [shape=circle,label=\"\"];\n");
      if (i == initial) {
        b.append("  initial [shape=plaintext,label=\"\"];\n");
        b.append("  initial -> ").append(i).append("\n");
      }
      for (int j = 0; j < sortedTransitions[i].length; j++) {
        b.append("  ").append(i);
        sortedTransitions[i][j].appendDot(b);
      }
    }
    return b.append("}\n").toString();
  }
}