/* * dk.brics.automaton * * Copyright (c) 2001-2009 Anders Moeller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package org.apache.lucene.util.automaton; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.RamUsageEstimator; /** * Automata operations. * * @lucene.experimental */ final public class Operations { /** * Default maximum number of states that {@link Operations#determinize} should create. */ public static final int DEFAULT_MAX_DETERMINIZED_STATES = 10000; private Operations() {} /** * Returns an automaton that accepts the concatenation of the languages of the * given automata. * <p> * Complexity: linear in total number of states. */ static public Automaton concatenate(Automaton a1, Automaton a2) { return concatenate(Arrays.asList(a1, a2)); } /** * Returns an automaton that accepts the concatenation of the languages of the * given automata. * <p> * Complexity: linear in total number of states. */ static public Automaton concatenate(List<Automaton> l) { Automaton result = new Automaton(); // First pass: create all states for(Automaton a : l) { if (a.getNumStates() == 0) { result.finishState(); return result; } int numStates = a.getNumStates(); for(int s=0;s<numStates;s++) { result.createState(); } } // Second pass: add transitions, carefully linking accept // states of A to init state of next A: int stateOffset = 0; Transition t = new Transition(); for(int i=0;i<l.size();i++) { Automaton a = l.get(i); int numStates = a.getNumStates(); Automaton nextA = (i == l.size()-1) ? null : l.get(i+1); for(int s=0;s<numStates;s++) { int numTransitions = a.initTransition(s, t); for(int j=0;j<numTransitions;j++) { a.getNextTransition(t); result.addTransition(stateOffset + s, stateOffset + t.dest, t.min, t.max); } if (a.isAccept(s)) { Automaton followA = nextA; int followOffset = stateOffset; int upto = i+1; while (true) { if (followA != null) { // Adds a "virtual" epsilon transition: numTransitions = followA.initTransition(0, t); for(int j=0;j<numTransitions;j++) { followA.getNextTransition(t); result.addTransition(stateOffset + s, followOffset + numStates + t.dest, t.min, t.max); } if (followA.isAccept(0)) { // Keep chaining if followA accepts empty string followOffset += followA.getNumStates(); followA = (upto == l.size()-1) ? null : l.get(upto+1); upto++; } else { break; } } else { result.setAccept(stateOffset + s, true); break; } } } } stateOffset += numStates; } if (result.getNumStates() == 0) { result.createState(); } result.finishState(); return result; } /** * Returns an automaton that accepts the union of the empty string and the * language of the given automaton. This may create a dead state. * <p> * Complexity: linear in number of states. */ static public Automaton optional(Automaton a) { Automaton result = new Automaton(); result.createState(); result.setAccept(0, true); if (a.getNumStates() > 0) { result.copy(a); result.addEpsilon(0, 1); } result.finishState(); return result; } /** * Returns an automaton that accepts the Kleene star (zero or more * concatenated repetitions) of the language of the given automaton. Never * modifies the input automaton language. * <p> * Complexity: linear in number of states. */ static public Automaton repeat(Automaton a) { if (a.getNumStates() == 0) { // Repeating the empty automata will still only accept the empty automata. return a; } Automaton.Builder builder = new Automaton.Builder(); builder.createState(); builder.setAccept(0, true); builder.copy(a); Transition t = new Transition(); int count = a.initTransition(0, t); for(int i=0;i<count;i++) { a.getNextTransition(t); builder.addTransition(0, t.dest+1, t.min, t.max); } int numStates = a.getNumStates(); for(int s=0;s<numStates;s++) { if (a.isAccept(s)) { count = a.initTransition(0, t); for(int i=0;i<count;i++) { a.getNextTransition(t); builder.addTransition(s+1, t.dest+1, t.min, t.max); } } } return builder.finish(); } /** * Returns an automaton that accepts <code>min</code> or more concatenated * repetitions of the language of the given automaton. * <p> * Complexity: linear in number of states and in <code>min</code>. */ static public Automaton repeat(Automaton a, int count) { if (count == 0) { return repeat(a); } List<Automaton> as = new ArrayList<>(); while (count-- > 0) { as.add(a); } as.add(repeat(a)); return concatenate(as); } /** * Returns an automaton that accepts between <code>min</code> and * <code>max</code> (including both) concatenated repetitions of the language * of the given automaton. * <p> * Complexity: linear in number of states and in <code>min</code> and * <code>max</code>. */ static public Automaton repeat(Automaton a, int min, int max) { if (min > max) { return Automata.makeEmpty(); } Automaton b; if (min == 0) { b = Automata.makeEmptyString(); } else if (min == 1) { b = new Automaton(); b.copy(a); } else { List<Automaton> as = new ArrayList<>(); for(int i=0;i<min;i++) { as.add(a); } b = concatenate(as); } Set<Integer> prevAcceptStates = toSet(b, 0); Automaton.Builder builder = new Automaton.Builder(); builder.copy(b); for(int i=min;i<max;i++) { int numStates = builder.getNumStates(); builder.copy(a); for(int s : prevAcceptStates) { builder.addEpsilon(s, numStates); } prevAcceptStates = toSet(a, numStates); } return builder.finish(); } private static Set<Integer> toSet(Automaton a, int offset) { int numStates = a.getNumStates(); BitSet isAccept = a.getAcceptStates(); Set<Integer> result = new HashSet<Integer>(); int upto = 0; while (upto < numStates && (upto = isAccept.nextSetBit(upto)) != -1) { result.add(offset+upto); upto++; } return result; } /** * Returns a (deterministic) automaton that accepts the complement of the * language of the given automaton. * <p> * Complexity: linear in number of states if already deterministic and * exponential otherwise. * @param maxDeterminizedStates maximum number of states determinizing the * automaton can result in. Set higher to allow more complex queries and * lower to prevent memory exhaustion. */ static public Automaton complement(Automaton a, int maxDeterminizedStates) { a = totalize(determinize(a, maxDeterminizedStates)); int numStates = a.getNumStates(); for (int p=0;p<numStates;p++) { a.setAccept(p, !a.isAccept(p)); } return removeDeadStates(a); } /** * Returns a (deterministic) automaton that accepts the intersection of the * language of <code>a1</code> and the complement of the language of * <code>a2</code>. As a side-effect, the automata may be determinized, if not * already deterministic. * <p> * Complexity: quadratic in number of states if a2 already deterministic and * exponential in number of a2's states otherwise. */ static public Automaton minus(Automaton a1, Automaton a2, int maxDeterminizedStates) { if (Operations.isEmpty(a1) || a1 == a2) { return Automata.makeEmpty(); } if (Operations.isEmpty(a2)) { return a1; } return intersection(a1, complement(a2, maxDeterminizedStates)); } /** * Returns an automaton that accepts the intersection of the languages of the * given automata. Never modifies the input automata languages. * <p> * Complexity: quadratic in number of states. */ static public Automaton intersection(Automaton a1, Automaton a2) { if (a1 == a2) { return a1; } if (a1.getNumStates() == 0) { return a1; } if (a2.getNumStates() == 0) { return a2; } Transition[][] transitions1 = a1.getSortedTransitions(); Transition[][] transitions2 = a2.getSortedTransitions(); Automaton c = new Automaton(); c.createState(); ArrayDeque<StatePair> worklist = new ArrayDeque<>(); HashMap<StatePair,StatePair> newstates = new HashMap<>(); StatePair p = new StatePair(0, 0, 0); worklist.add(p); newstates.put(p, p); while (worklist.size() > 0) { p = worklist.removeFirst(); c.setAccept(p.s, a1.isAccept(p.s1) && a2.isAccept(p.s2)); Transition[] t1 = transitions1[p.s1]; Transition[] t2 = transitions2[p.s2]; for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { while (b2 < t2.length && t2[b2].max < t1[n1].min) b2++; for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) if (t2[n2].max >= t1[n1].min) { StatePair q = new StatePair(t1[n1].dest, t2[n2].dest); StatePair r = newstates.get(q); if (r == null) { q.s = c.createState(); worklist.add(q); newstates.put(q, q); r = q; } int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min; int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max; c.addTransition(p.s, r.s, min, max); } } } c.finishState(); return removeDeadStates(c); } /** Returns true if these two automata accept exactly the * same language. This is a costly computation! Both automata * must be determinized and have no dead states! */ public static boolean sameLanguage(Automaton a1, Automaton a2) { if (a1 == a2) { return true; } return subsetOf(a2, a1) && subsetOf(a1, a2); } // TODO: move to test-framework? /** Returns true if this automaton has any states that cannot * be reached from the initial state or cannot reach an accept state. * Cost is O(numTransitions+numStates). */ public static boolean hasDeadStates(Automaton a) { BitSet liveStates = getLiveStates(a); int numLive = liveStates.cardinality(); int numStates = a.getNumStates(); assert numLive <= numStates: "numLive=" + numLive + " numStates=" + numStates + " " + liveStates; return numLive < numStates; } // TODO: move to test-framework? /** Returns true if there are dead states reachable from an initial state. */ public static boolean hasDeadStatesFromInitial(Automaton a) { BitSet reachableFromInitial = getLiveStatesFromInitial(a); BitSet reachableFromAccept = getLiveStatesToAccept(a); reachableFromInitial.andNot(reachableFromAccept); return reachableFromInitial.isEmpty() == false; } // TODO: move to test-framework? /** Returns true if there are dead states that reach an accept state. */ public static boolean hasDeadStatesToAccept(Automaton a) { BitSet reachableFromInitial = getLiveStatesFromInitial(a); BitSet reachableFromAccept = getLiveStatesToAccept(a); reachableFromAccept.andNot(reachableFromInitial); return reachableFromAccept.isEmpty() == false; } /** * Returns true if the language of <code>a1</code> is a subset of the language * of <code>a2</code>. Both automata must be determinized and must have no dead * states. * <p> * Complexity: quadratic in number of states. */ public static boolean subsetOf(Automaton a1, Automaton a2) { if (a1.isDeterministic() == false) { throw new IllegalArgumentException("a1 must be deterministic"); } if (a2.isDeterministic() == false) { throw new IllegalArgumentException("a2 must be deterministic"); } assert hasDeadStatesFromInitial(a1) == false; assert hasDeadStatesFromInitial(a2) == false; if (a1.getNumStates() == 0) { // Empty language is alwyas a subset of any other language return true; } else if (a2.getNumStates() == 0) { return isEmpty(a1); } // TODO: cutover to iterators instead Transition[][] transitions1 = a1.getSortedTransitions(); Transition[][] transitions2 = a2.getSortedTransitions(); ArrayDeque<StatePair> worklist = new ArrayDeque<>(); HashSet<StatePair> visited = new HashSet<>(); StatePair p = new StatePair(0, 0); worklist.add(p); visited.add(p); while (worklist.size() > 0) { p = worklist.removeFirst(); if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) { return false; } Transition[] t1 = transitions1[p.s1]; Transition[] t2 = transitions2[p.s2]; for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { while (b2 < t2.length && t2[b2].max < t1[n1].min) { b2++; } int min1 = t1[n1].min, max1 = t1[n1].max; for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { if (t2[n2].min > min1) { return false; } if (t2[n2].max < Character.MAX_CODE_POINT) { min1 = t2[n2].max + 1; } else { min1 = Character.MAX_CODE_POINT; max1 = Character.MIN_CODE_POINT; } StatePair q = new StatePair(t1[n1].dest, t2[n2].dest); if (!visited.contains(q)) { worklist.add(q); visited.add(q); } } if (min1 <= max1) { return false; } } } return true; } /** * Returns an automaton that accepts the union of the languages of the given * automata. * <p> * Complexity: linear in number of states. */ public static Automaton union(Automaton a1, Automaton a2) { return union(Arrays.asList(a1, a2)); } /** * Returns an automaton that accepts the union of the languages of the given * automata. * <p> * Complexity: linear in number of states. */ public static Automaton union(Collection<Automaton> l) { Automaton result = new Automaton(); // Create initial state: result.createState(); // Copy over all automata for(Automaton a : l) { result.copy(a); } // Add epsilon transition from new initial state int stateOffset = 1; for(Automaton a : l) { if (a.getNumStates() == 0) { continue; } result.addEpsilon(0, stateOffset); stateOffset += a.getNumStates(); } result.finishState(); return removeDeadStates(result); } // Simple custom ArrayList<Transition> private final static class TransitionList { // dest, min, max int[] transitions = new int[3]; int next; public void add(Transition t) { if (transitions.length < next+3) { transitions = ArrayUtil.grow(transitions, next+3); } transitions[next] = t.dest; transitions[next+1] = t.min; transitions[next+2] = t.max; next += 3; } } // Holds all transitions that start on this int point, or // end at this point-1 private final static class PointTransitions implements Comparable<PointTransitions> { int point; final TransitionList ends = new TransitionList(); final TransitionList starts = new TransitionList(); @Override public int compareTo(PointTransitions other) { return point - other.point; } public void reset(int point) { this.point = point; ends.next = 0; starts.next = 0; } @Override public boolean equals(Object other) { return ((PointTransitions) other).point == point; } @Override public int hashCode() { return point; } } private final static class PointTransitionSet { int count; PointTransitions[] points = new PointTransitions[5]; private final static int HASHMAP_CUTOVER = 30; private final HashMap<Integer,PointTransitions> map = new HashMap<>(); private boolean useHash = false; private PointTransitions next(int point) { // 1st time we are seeing this point if (count == points.length) { final PointTransitions[] newArray = new PointTransitions[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(points, 0, newArray, 0, count); points = newArray; } PointTransitions points0 = points[count]; if (points0 == null) { points0 = points[count] = new PointTransitions(); } points0.reset(point); count++; return points0; } private PointTransitions find(int point) { if (useHash) { final Integer pi = point; PointTransitions p = map.get(pi); if (p == null) { p = next(point); map.put(pi, p); } return p; } else { for(int i=0;i<count;i++) { if (points[i].point == point) { return points[i]; } } final PointTransitions p = next(point); if (count == HASHMAP_CUTOVER) { // switch to HashMap on the fly assert map.size() == 0; for(int i=0;i<count;i++) { map.put(points[i].point, points[i]); } useHash = true; } return p; } } public void reset() { if (useHash) { map.clear(); useHash = false; } count = 0; } public void sort() { // Tim sort performs well on already sorted arrays: if (count > 1) ArrayUtil.timSort(points, 0, count); } public void add(Transition t) { find(t.min).starts.add(t); find(1+t.max).ends.add(t); } @Override public String toString() { StringBuilder s = new StringBuilder(); for(int i=0;i<count;i++) { if (i > 0) { s.append(' '); } s.append(points[i].point).append(':').append(points[i].starts.next/3).append(',').append(points[i].ends.next/3); } return s.toString(); } } /** * Determinizes the given automaton. * <p> * Worst case complexity: exponential in number of states. * @param maxDeterminizedStates Maximum number of states created when * determinizing. Higher numbers allow this operation to consume more * memory but allow more complex automatons. Use * DEFAULT_MAX_DETERMINIZED_STATES as a decent default if you don't know * how many to allow. * @throws TooComplexToDeterminizeException if determinizing a creates an * automaton with more than maxDeterminizedStates */ public static Automaton determinize(Automaton a, int maxDeterminizedStates) { if (a.isDeterministic()) { // Already determinized return a; } if (a.getNumStates() <= 1) { // Already determinized return a; } // subset construction Automaton.Builder b = new Automaton.Builder(); //System.out.println("DET:"); //a.writeDot("/l/la/lucene/core/detin.dot"); SortedIntSet.FrozenIntSet initialset = new SortedIntSet.FrozenIntSet(0, 0); // Create state 0: b.createState(); ArrayDeque<SortedIntSet.FrozenIntSet> worklist = new ArrayDeque<>(); Map<SortedIntSet.FrozenIntSet,Integer> newstate = new HashMap<>(); worklist.add(initialset); b.setAccept(0, a.isAccept(0)); newstate.put(initialset, 0); // like Set<Integer,PointTransitions> final PointTransitionSet points = new PointTransitionSet(); // like SortedMap<Integer,Integer> final SortedIntSet statesSet = new SortedIntSet(5); Transition t = new Transition(); while (worklist.size() > 0) { SortedIntSet.FrozenIntSet s = worklist.removeFirst(); //System.out.println("det: pop set=" + s); // Collate all outgoing transitions by min/1+max: for(int i=0;i<s.values.length;i++) { final int s0 = s.values[i]; int numTransitions = a.getNumTransitions(s0); a.initTransition(s0, t); for(int j=0;j<numTransitions;j++) { a.getNextTransition(t); points.add(t); } } if (points.count == 0) { // No outgoing transitions -- skip it continue; } points.sort(); int lastPoint = -1; int accCount = 0; final int r = s.state; for(int i=0;i<points.count;i++) { final int point = points.points[i].point; if (statesSet.upto > 0) { assert lastPoint != -1; statesSet.computeHash(); Integer q = newstate.get(statesSet); if (q == null) { q = b.createState(); if (q >= maxDeterminizedStates) { throw new TooComplexToDeterminizeException(a, maxDeterminizedStates); } final SortedIntSet.FrozenIntSet p = statesSet.freeze(q); //System.out.println(" make new state=" + q + " -> " + p + " accCount=" + accCount); worklist.add(p); b.setAccept(q, accCount > 0); newstate.put(p, q); } else { assert (accCount > 0 ? true:false) == b.isAccept(q): "accCount=" + accCount + " vs existing accept=" + b.isAccept(q) + " states=" + statesSet; } // System.out.println(" add trans src=" + r + " dest=" + q + " min=" + lastPoint + " max=" + (point-1)); b.addTransition(r, q, lastPoint, point-1); } // process transitions that end on this point // (closes an overlapping interval) int[] transitions = points.points[i].ends.transitions; int limit = points.points[i].ends.next; for(int j=0;j<limit;j+=3) { int dest = transitions[j]; statesSet.decr(dest); accCount -= a.isAccept(dest) ? 1:0; } points.points[i].ends.next = 0; // process transitions that start on this point // (opens a new interval) transitions = points.points[i].starts.transitions; limit = points.points[i].starts.next; for(int j=0;j<limit;j+=3) { int dest = transitions[j]; statesSet.incr(dest); accCount += a.isAccept(dest) ? 1:0; } lastPoint = point; points.points[i].starts.next = 0; } points.reset(); assert statesSet.upto == 0: "upto=" + statesSet.upto; } Automaton result = b.finish(); assert result.isDeterministic(); return result; } /** * Returns true if the given automaton accepts no strings. */ public static boolean isEmpty(Automaton a) { if (a.getNumStates() == 0) { // Common case: no states return true; } if (a.isAccept(0) == false && a.getNumTransitions(0) == 0) { // Common case: just one initial state return true; } if (a.isAccept(0) == true) { // Apparently common case: it accepts the damned empty string return false; } ArrayDeque<Integer> workList = new ArrayDeque<>(); BitSet seen = new BitSet(a.getNumStates()); workList.add(0); seen.set(0); Transition t = new Transition(); while (workList.isEmpty() == false) { int state = workList.removeFirst(); if (a.isAccept(state)) { return false; } int count = a.initTransition(state, t); for(int i=0;i<count;i++) { a.getNextTransition(t); if (seen.get(t.dest) == false) { workList.add(t.dest); seen.set(t.dest); } } } return true; } /** * Returns true if the given automaton accepts all strings. The automaton must be minimized. */ public static boolean isTotal(Automaton a) { return isTotal(a, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); } /** * Returns true if the given automaton accepts all strings for the specified min/max * range of the alphabet. The automaton must be minimized. */ public static boolean isTotal(Automaton a, int minAlphabet, int maxAlphabet) { if (a.isAccept(0) && a.getNumTransitions(0) == 1) { Transition t = new Transition(); a.getTransition(0, 0, t); return t.dest == 0 && t.min == minAlphabet && t.max == maxAlphabet; } return false; } /** * Returns true if the given string is accepted by the automaton. The input must be deterministic. * <p> * Complexity: linear in the length of the string. * <p> * <b>Note:</b> for full performance, use the {@link RunAutomaton} class. */ public static boolean run(Automaton a, String s) { assert a.isDeterministic(); int state = 0; for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) { int nextState = a.step(state, cp = s.codePointAt(i)); if (nextState == -1) { return false; } state = nextState; } return a.isAccept(state); } /** * Returns true if the given string (expressed as unicode codepoints) is accepted by the automaton. The input must be deterministic. * <p> * Complexity: linear in the length of the string. * <p> * <b>Note:</b> for full performance, use the {@link RunAutomaton} class. */ public static boolean run(Automaton a, IntsRef s) { assert a.isDeterministic(); int state = 0; for (int i=0;i<s.length;i++) { int nextState = a.step(state, s.ints[s.offset+i]); if (nextState == -1) { return false; } state = nextState; } return a.isAccept(state); } /** * Returns the set of live states. A state is "live" if an accept state is * reachable from it and if it is reachable from the initial state. */ private static BitSet getLiveStates(Automaton a) { BitSet live = getLiveStatesFromInitial(a); live.and(getLiveStatesToAccept(a)); return live; } /** Returns bitset marking states reachable from the initial state. */ private static BitSet getLiveStatesFromInitial(Automaton a) { int numStates = a.getNumStates(); BitSet live = new BitSet(numStates); if (numStates == 0) { return live; } ArrayDeque<Integer> workList = new ArrayDeque<>(); live.set(0); workList.add(0); Transition t = new Transition(); while (workList.isEmpty() == false) { int s = workList.removeFirst(); int count = a.initTransition(s, t); for(int i=0;i<count;i++) { a.getNextTransition(t); if (live.get(t.dest) == false) { live.set(t.dest); workList.add(t.dest); } } } return live; } /** Returns bitset marking states that can reach an accept state. */ private static BitSet getLiveStatesToAccept(Automaton a) { Automaton.Builder builder = new Automaton.Builder(); // NOTE: not quite the same thing as what SpecialOperations.reverse does: Transition t = new Transition(); int numStates = a.getNumStates(); for(int s=0;s<numStates;s++) { builder.createState(); } for(int s=0;s<numStates;s++) { int count = a.initTransition(s, t); for(int i=0;i<count;i++) { a.getNextTransition(t); builder.addTransition(t.dest, s, t.min, t.max); } } Automaton a2 = builder.finish(); ArrayDeque<Integer> workList = new ArrayDeque<>(); BitSet live = new BitSet(numStates); BitSet acceptBits = a.getAcceptStates(); int s = 0; while (s < numStates && (s = acceptBits.nextSetBit(s)) != -1) { live.set(s); workList.add(s); s++; } while (workList.isEmpty() == false) { s = workList.removeFirst(); int count = a2.initTransition(s, t); for(int i=0;i<count;i++) { a2.getNextTransition(t); if (live.get(t.dest) == false) { live.set(t.dest); workList.add(t.dest); } } } return live; } /** * Removes transitions to dead states (a state is "dead" if it is not * reachable from the initial state or no accept state is reachable from it.) */ public static Automaton removeDeadStates(Automaton a) { int numStates = a.getNumStates(); BitSet liveSet = getLiveStates(a); int[] map = new int[numStates]; Automaton result = new Automaton(); //System.out.println("liveSet: " + liveSet + " numStates=" + numStates); for(int i=0;i<numStates;i++) { if (liveSet.get(i)) { map[i] = result.createState(); result.setAccept(map[i], a.isAccept(i)); } } Transition t = new Transition(); for (int i=0;i<numStates;i++) { if (liveSet.get(i)) { int numTransitions = a.initTransition(i, t); // filter out transitions to dead states: for(int j=0;j<numTransitions;j++) { a.getNextTransition(t); if (liveSet.get(t.dest)) { result.addTransition(map[i], map[t.dest], t.min, t.max); } } } } result.finishState(); assert hasDeadStates(result) == false; return result; } /** * Returns true if the language of this automaton is finite. The * automaton must not have any dead states. */ public static boolean isFinite(Automaton a) { if (a.getNumStates() == 0) { return true; } return isFinite(new Transition(), a, 0, new BitSet(a.getNumStates()), new BitSet(a.getNumStates())); } /** * Checks whether there is a loop containing state. (This is sufficient since * there are never transitions to dead states.) */ // TODO: not great that this is recursive... in theory a // large automata could exceed java's stack private static boolean isFinite(Transition scratch, Automaton a, int state, BitSet path, BitSet visited) { path.set(state); int numTransitions = a.initTransition(state, scratch); for(int t=0;t<numTransitions;t++) { a.getTransition(state, t, scratch); if (path.get(scratch.dest) || (!visited.get(scratch.dest) && !isFinite(scratch, a, scratch.dest, path, visited))) { return false; } } path.clear(state); visited.set(state); return true; } /** * Returns the longest string that is a prefix of all accepted strings and * visits each state at most once. The automaton must be deterministic. * * @return common prefix, which can be an empty (length 0) String (never null) */ public static String getCommonPrefix(Automaton a) { if (a.isDeterministic() == false) { throw new IllegalArgumentException("input automaton must be deterministic"); } StringBuilder b = new StringBuilder(); HashSet<Integer> visited = new HashSet<>(); int s = 0; boolean done; Transition t = new Transition(); do { done = true; visited.add(s); if (a.isAccept(s) == false && a.getNumTransitions(s) == 1) { a.getTransition(s, 0, t); if (t.min == t.max && !visited.contains(t.dest)) { b.appendCodePoint(t.min); s = t.dest; done = false; } } } while (!done); return b.toString(); } // TODO: this currently requites a determinized machine, // but it need not -- we can speed it up by walking the // NFA instead. it'd still be fail fast. /** * Returns the longest BytesRef that is a prefix of all accepted strings and * visits each state at most once. The automaton must be deterministic. * * @return common prefix, which can be an empty (length 0) BytesRef (never null) */ public static BytesRef getCommonPrefixBytesRef(Automaton a) { BytesRefBuilder builder = new BytesRefBuilder(); HashSet<Integer> visited = new HashSet<>(); int s = 0; boolean done; Transition t = new Transition(); do { done = true; visited.add(s); if (a.isAccept(s) == false && a.getNumTransitions(s) == 1) { a.getTransition(s, 0, t); if (t.min == t.max && !visited.contains(t.dest)) { builder.append((byte) t.min); s = t.dest; done = false; } } } while (!done); return builder.get(); } /** If this automaton accepts a single input, return it. Else, return null. * The automaton must be deterministic. */ public static IntsRef getSingleton(Automaton a) { if (a.isDeterministic() == false) { throw new IllegalArgumentException("input automaton must be deterministic"); } IntsRefBuilder builder = new IntsRefBuilder(); HashSet<Integer> visited = new HashSet<>(); int s = 0; Transition t = new Transition(); while (true) { visited.add(s); if (a.isAccept(s) == false) { if (a.getNumTransitions(s) == 1) { a.getTransition(s, 0, t); if (t.min == t.max && !visited.contains(t.dest)) { builder.append(t.min); s = t.dest; continue; } } } else if (a.getNumTransitions(s) == 0) { return builder.get(); } // Automaton accepts more than one string: return null; } } /** * Returns the longest BytesRef that is a suffix of all accepted strings. * Worst case complexity: exponential in number of states (this calls * determinize). * @param maxDeterminizedStates maximum number of states determinizing the * automaton can result in. Set higher to allow more complex queries and * lower to prevent memory exhaustion. * @return common suffix, which can be an empty (length 0) BytesRef (never null) */ public static BytesRef getCommonSuffixBytesRef(Automaton a, int maxDeterminizedStates) { // reverse the language of the automaton, then reverse its common prefix. Automaton r = Operations.determinize(reverse(a), maxDeterminizedStates); BytesRef ref = getCommonPrefixBytesRef(r); reverseBytes(ref); return ref; } private static void reverseBytes(BytesRef ref) { if (ref.length <= 1) return; int num = ref.length >> 1; for (int i = ref.offset; i < ( ref.offset + num ); i++) { byte b = ref.bytes[i]; ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1]; ref.bytes[ref.offset * 2 + ref.length - i - 1] = b; } } /** Returns an automaton accepting the reverse language. */ public static Automaton reverse(Automaton a) { return reverse(a, null); } /** Reverses the automaton, returning the new initial states. */ static Automaton reverse(Automaton a, Set<Integer> initialStates) { if (Operations.isEmpty(a)) { return new Automaton(); } int numStates = a.getNumStates(); // Build a new automaton with all edges reversed Automaton.Builder builder = new Automaton.Builder(); // Initial node; we'll add epsilon transitions in the end: builder.createState(); for(int s=0;s<numStates;s++) { builder.createState(); } // Old initial state becomes new accept state: builder.setAccept(1, true); Transition t = new Transition(); for (int s=0;s<numStates;s++) { int numTransitions = a.getNumTransitions(s); a.initTransition(s, t); for(int i=0;i<numTransitions;i++) { a.getNextTransition(t); builder.addTransition(t.dest+1, s+1, t.min, t.max); } } Automaton result = builder.finish(); int s = 0; BitSet acceptStates = a.getAcceptStates(); while (s < numStates && (s = acceptStates.nextSetBit(s)) != -1) { result.addEpsilon(0, s+1); if (initialStates != null) { initialStates.add(s+1); } s++; } result.finishState(); return result; } /** Returns a new automaton accepting the same language with added * transitions to a dead state so that from every state and every label * there is a transition. */ static Automaton totalize(Automaton a) { Automaton result = new Automaton(); int numStates = a.getNumStates(); for(int i=0;i<numStates;i++) { result.createState(); result.setAccept(i, a.isAccept(i)); } int deadState = result.createState(); result.addTransition(deadState, deadState, Character.MIN_CODE_POINT, Character.MAX_CODE_POINT); Transition t = new Transition(); for(int i=0;i<numStates;i++) { int maxi = Character.MIN_CODE_POINT; int count = a.initTransition(i, t); for(int j=0;j<count;j++) { a.getNextTransition(t); result.addTransition(i, t.dest, t.min, t.max); if (t.min > maxi) { result.addTransition(i, deadState, maxi, t.min-1); } if (t.max + 1 > maxi) { maxi = t.max + 1; } } if (maxi <= Character.MAX_CODE_POINT) { result.addTransition(i, deadState, maxi, Character.MAX_CODE_POINT); } } result.finishState(); return result; } /** Returns the topological sort of all states reachable from * the initial state. Behavior is undefined if this * automaton has cycles. CPU cost is O(numTransitions), * and the implementation is recursive so an automaton * matching long strings may exhaust the java stack. */ public static int[] topoSortStates(Automaton a) { if (a.getNumStates() == 0) { return new int[0]; } int numStates = a.getNumStates(); int[] states = new int[numStates]; final BitSet visited = new BitSet(numStates); int upto = topoSortStatesRecurse(a, visited, states, 0, 0); if (upto < states.length) { // There were dead states int[] newStates = new int[upto]; System.arraycopy(states, 0, newStates, 0, upto); states = newStates; } // Reverse the order: for(int i=0;i<states.length/2;i++) { int s = states[i]; states[i] = states[states.length-1-i]; states[states.length-1-i] = s; } return states; } private static int topoSortStatesRecurse(Automaton a, BitSet visited, int[] states, int upto, int state) { Transition t = new Transition(); int count = a.initTransition(state, t); for (int i=0;i<count;i++) { a.getNextTransition(t); if (!visited.get(t.dest)) { visited.set(t.dest); upto = topoSortStatesRecurse(a, visited, states, upto, t.dest); } } states[upto] = state; upto++; return upto; } }