SpecialOperations.java example

Explorer
heliosearch-master
- lucene
- solr
/*
 * dk.brics.automaton
 * 
 * Copyright (c) 2001-2009 Anders Moeller
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.apache.lucene.util.automaton;

import java.util.BitSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Util;

/**
 * Special automata operations.
 * 
 * @lucene.experimental
 */
final public class SpecialOperations {
  
  private SpecialOperations() {}
  
  /**
   * Finds the largest entry whose value is less than or equal to c, or 0 if
   * there is no such entry.
   */
  static int findIndex(int c, int[] points) {
    int a = 0;
    int b = points.length;
    while (b - a > 1) {
      int d = (a + b) >>> 1;
      if (points[d] > c) b = d;
      else if (points[d] < c) a = d;
      else return d;
    }
    return a;
  }
  
  /**
   * Returns true if the language of this automaton is finite.
   */
  public static boolean isFinite(Automaton a) {
    if (a.isSingleton()) return true;
    return isFinite(a.initial, new BitSet(a.getNumberOfStates()), new BitSet(a.getNumberOfStates()));
  }
  
  /**
   * Checks whether there is a loop containing s. (This is sufficient since
   * there are never transitions to dead states.)
   */
  // TODO: not great that this is recursive... in theory a
  // large automata could exceed java's stack
  private static boolean isFinite(State s, BitSet path, BitSet visited) {
    path.set(s.number);
    for (Transition t : s.getTransitions())
      if (path.get(t.to.number) || (!visited.get(t.to.number) && !isFinite(t.to, path, visited))) return false;
    path.clear(s.number);
    visited.set(s.number);
    return true;
  }
  
  /**
   * Returns the longest string that is a prefix of all accepted strings and
   * visits each state at most once.
   * 
   * @return common prefix
   */
  public static String getCommonPrefix(Automaton a) {
    if (a.isSingleton()) return a.singleton;
    StringBuilder b = new StringBuilder();
    HashSet<State> visited = new HashSet<>();
    State s = a.initial;
    boolean done;
    do {
      done = true;
      visited.add(s);
      if (!s.accept && s.numTransitions() == 1) {
        Transition t = s.getTransitions().iterator().next();
        if (t.min == t.max && !visited.contains(t.to)) {
          b.appendCodePoint(t.min);
          s = t.to;
          done = false;
        }
      }
    } while (!done);
    return b.toString();
  }
  
  // TODO: this currently requites a determinized machine,
  // but it need not -- we can speed it up by walking the
  // NFA instead.  it'd still be fail fast.
  public static BytesRef getCommonPrefixBytesRef(Automaton a) {
    if (a.isSingleton()) return new BytesRef(a.singleton);
    BytesRef ref = new BytesRef(10);
    HashSet<State> visited = new HashSet<>();
    State s = a.initial;
    boolean done;
    do {
      done = true;
      visited.add(s);
      if (!s.accept && s.numTransitions() == 1) {
        Transition t = s.getTransitions().iterator().next();
        if (t.min == t.max && !visited.contains(t.to)) {
          ref.grow(++ref.length);
          ref.bytes[ref.length - 1] = (byte)t.min;
          s = t.to;
          done = false;
        }
      }
    } while (!done);
    return ref;
  }
  
  /**
   * Returns the longest string that is a suffix of all accepted strings and
   * visits each state at most once.
   * 
   * @return common suffix
   */
  public static String getCommonSuffix(Automaton a) {
    if (a.isSingleton()) // if singleton, the suffix is the string itself.
      return a.singleton;
    
    // reverse the language of the automaton, then reverse its common prefix.
    Automaton r = a.clone();
    reverse(r);
    r.determinize();
    return new StringBuilder(SpecialOperations.getCommonPrefix(r)).reverse().toString();
  }
  
  public static BytesRef getCommonSuffixBytesRef(Automaton a) {
    if (a.isSingleton()) // if singleton, the suffix is the string itself.
      return new BytesRef(a.singleton);
    
    // reverse the language of the automaton, then reverse its common prefix.
    Automaton r = a.clone();
    reverse(r);
    r.determinize();
    BytesRef ref = SpecialOperations.getCommonPrefixBytesRef(r);
    reverseBytes(ref);
    return ref;
  }
  
  private static void reverseBytes(BytesRef ref) {
    if (ref.length <= 1) return;
    int num = ref.length >> 1;
    for (int i = ref.offset; i < ( ref.offset + num ); i++) {
      byte b = ref.bytes[i];
      ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1];
      ref.bytes[ref.offset * 2 + ref.length - i - 1] = b;
    }
  }
  
  /**
   * Reverses the language of the given (non-singleton) automaton while returning
   * the set of new initial states.
   */
  public static Set<State> reverse(Automaton a) {
    a.expandSingleton();
    // reverse all edges
    HashMap<State, HashSet<Transition>> m = new HashMap<>();
    State[] states = a.getNumberedStates();
    Set<State> accept = new HashSet<>();
    for (State s : states)
      if (s.isAccept())
        accept.add(s);
    for (State r : states) {
      m.put(r, new HashSet<Transition>());
      r.accept = false;
    }
    for (State r : states)
      for (Transition t : r.getTransitions())
        m.get(t.to).add(new Transition(t.min, t.max, r));
    for (State r : states) {
      Set<Transition> tr = m.get(r);
      r.setTransitions(tr.toArray(new Transition[tr.size()]));
    }
    // make new initial+final states
    a.initial.accept = true;
    a.initial = new State();
    for (State r : accept)
      a.initial.addEpsilon(r); // ensures that all initial states are reachable
    a.deterministic = false;
    a.clearNumberedStates();
    return accept;
  }

  // TODO: this is a dangerous method ... Automaton could be
  // huge ... and it's better in general for caller to
  // enumerate & process in a single walk:

  /**
   * Returns the set of accepted strings, assuming that at most
   * <code>limit</code> strings are accepted. If more than <code>limit</code> 
   * strings are accepted, the first limit strings found are returned. If <code>limit</code><0, then 
   * the limit is infinite.
   */
  public static Set<IntsRef> getFiniteStrings(Automaton a, int limit) {
    HashSet<IntsRef> strings = new HashSet<>();
    if (a.isSingleton()) {
      if (limit > 0) {
        strings.add(Util.toUTF32(a.singleton, new IntsRef()));
      }
    } else if (!getFiniteStrings(a.initial, new HashSet<State>(), strings, new IntsRef(), limit)) {
      return strings;
    }
    return strings;
  }
  
  /**
   * Returns the strings that can be produced from the given state, or
   * false if more than <code>limit</code> strings are found. 
   * <code>limit</code><0 means "infinite".
   */
  private static boolean getFiniteStrings(State s, HashSet<State> pathstates, 
      HashSet<IntsRef> strings, IntsRef path, int limit) {
    pathstates.add(s);
    for (Transition t : s.getTransitions()) {
      if (pathstates.contains(t.to)) {
        return false;
      }
      for (int n = t.min; n <= t.max; n++) {
        path.grow(path.length+1);
        path.ints[path.length] = n;
        path.length++;
        if (t.to.accept) {
          strings.add(IntsRef.deepCopyOf(path));
          if (limit >= 0 && strings.size() > limit) {
            return false;
          }
        }
        if (!getFiniteStrings(t.to, pathstates, strings, path, limit)) {
          return false;
        }
        path.length--;
      }
    }
    pathstates.remove(s);
    return true;
  }
}