/* * dk.brics.automaton * * Copyright (c) 2001-2009 Anders Moeller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package org.apache.lucene.util.automaton; import java.util.BitSet; import java.util.HashMap; import java.util.HashSet; import java.util.Set; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.Util; /** * Special automata operations. * * @lucene.experimental */ final public class SpecialOperations { private SpecialOperations() {} /** * Finds the largest entry whose value is less than or equal to c, or 0 if * there is no such entry. */ static int findIndex(int c, int[] points) { int a = 0; int b = points.length; while (b - a > 1) { int d = (a + b) >>> 1; if (points[d] > c) b = d; else if (points[d] < c) a = d; else return d; } return a; } /** * Returns true if the language of this automaton is finite. */ public static boolean isFinite(Automaton a) { if (a.isSingleton()) return true; return isFinite(a.initial, new BitSet(a.getNumberOfStates()), new BitSet(a.getNumberOfStates())); } /** * Checks whether there is a loop containing s. (This is sufficient since * there are never transitions to dead states.) */ // TODO: not great that this is recursive... in theory a // large automata could exceed java's stack private static boolean isFinite(State s, BitSet path, BitSet visited) { path.set(s.number); for (Transition t : s.getTransitions()) if (path.get(t.to.number) || (!visited.get(t.to.number) && !isFinite(t.to, path, visited))) return false; path.clear(s.number); visited.set(s.number); return true; } /** * Returns the longest string that is a prefix of all accepted strings and * visits each state at most once. * * @return common prefix */ public static String getCommonPrefix(Automaton a) { if (a.isSingleton()) return a.singleton; StringBuilder b = new StringBuilder(); HashSet<State> visited = new HashSet<>(); State s = a.initial; boolean done; do { done = true; visited.add(s); if (!s.accept && s.numTransitions() == 1) { Transition t = s.getTransitions().iterator().next(); if (t.min == t.max && !visited.contains(t.to)) { b.appendCodePoint(t.min); s = t.to; done = false; } } } while (!done); return b.toString(); } // TODO: this currently requites a determinized machine, // but it need not -- we can speed it up by walking the // NFA instead. it'd still be fail fast. public static BytesRef getCommonPrefixBytesRef(Automaton a) { if (a.isSingleton()) return new BytesRef(a.singleton); BytesRef ref = new BytesRef(10); HashSet<State> visited = new HashSet<>(); State s = a.initial; boolean done; do { done = true; visited.add(s); if (!s.accept && s.numTransitions() == 1) { Transition t = s.getTransitions().iterator().next(); if (t.min == t.max && !visited.contains(t.to)) { ref.grow(++ref.length); ref.bytes[ref.length - 1] = (byte)t.min; s = t.to; done = false; } } } while (!done); return ref; } /** * Returns the longest string that is a suffix of all accepted strings and * visits each state at most once. * * @return common suffix */ public static String getCommonSuffix(Automaton a) { if (a.isSingleton()) // if singleton, the suffix is the string itself. return a.singleton; // reverse the language of the automaton, then reverse its common prefix. Automaton r = a.clone(); reverse(r); r.determinize(); return new StringBuilder(SpecialOperations.getCommonPrefix(r)).reverse().toString(); } public static BytesRef getCommonSuffixBytesRef(Automaton a) { if (a.isSingleton()) // if singleton, the suffix is the string itself. return new BytesRef(a.singleton); // reverse the language of the automaton, then reverse its common prefix. Automaton r = a.clone(); reverse(r); r.determinize(); BytesRef ref = SpecialOperations.getCommonPrefixBytesRef(r); reverseBytes(ref); return ref; } private static void reverseBytes(BytesRef ref) { if (ref.length <= 1) return; int num = ref.length >> 1; for (int i = ref.offset; i < ( ref.offset + num ); i++) { byte b = ref.bytes[i]; ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1]; ref.bytes[ref.offset * 2 + ref.length - i - 1] = b; } } /** * Reverses the language of the given (non-singleton) automaton while returning * the set of new initial states. */ public static Set<State> reverse(Automaton a) { a.expandSingleton(); // reverse all edges HashMap<State, HashSet<Transition>> m = new HashMap<>(); State[] states = a.getNumberedStates(); Set<State> accept = new HashSet<>(); for (State s : states) if (s.isAccept()) accept.add(s); for (State r : states) { m.put(r, new HashSet<Transition>()); r.accept = false; } for (State r : states) for (Transition t : r.getTransitions()) m.get(t.to).add(new Transition(t.min, t.max, r)); for (State r : states) { Set<Transition> tr = m.get(r); r.setTransitions(tr.toArray(new Transition[tr.size()])); } // make new initial+final states a.initial.accept = true; a.initial = new State(); for (State r : accept) a.initial.addEpsilon(r); // ensures that all initial states are reachable a.deterministic = false; a.clearNumberedStates(); return accept; } // TODO: this is a dangerous method ... Automaton could be // huge ... and it's better in general for caller to // enumerate & process in a single walk: /** * Returns the set of accepted strings, assuming that at most * <code>limit</code> strings are accepted. If more than <code>limit</code> * strings are accepted, the first limit strings found are returned. If <code>limit</code><0, then * the limit is infinite. */ public static Set<IntsRef> getFiniteStrings(Automaton a, int limit) { HashSet<IntsRef> strings = new HashSet<>(); if (a.isSingleton()) { if (limit > 0) { strings.add(Util.toUTF32(a.singleton, new IntsRef())); } } else if (!getFiniteStrings(a.initial, new HashSet<State>(), strings, new IntsRef(), limit)) { return strings; } return strings; } /** * Returns the strings that can be produced from the given state, or * false if more than <code>limit</code> strings are found. * <code>limit</code><0 means "infinite". */ private static boolean getFiniteStrings(State s, HashSet<State> pathstates, HashSet<IntsRef> strings, IntsRef path, int limit) { pathstates.add(s); for (Transition t : s.getTransitions()) { if (pathstates.contains(t.to)) { return false; } for (int n = t.min; n <= t.max; n++) { path.grow(path.length+1); path.ints[path.length] = n; path.length++; if (t.to.accept) { strings.add(IntsRef.deepCopyOf(path)); if (limit >= 0 && strings.size() > limit) { return false; } } if (!getFiniteStrings(t.to, pathstates, strings, path, limit)) { return false; } path.length--; } } pathstates.remove(s); return true; } }