package org.apache.lucene.util.automaton; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.IdentityHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Random; import java.util.Set; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.UnicodeUtil; /** * Utilities for testing automata. * <p> * Capable of generating random regular expressions, * and automata, and also provides a number of very * basic unoptimized implementations (*slow) for testing. */ public class AutomatonTestUtil { /** Returns random string, including full unicode range. */ public static String randomRegexp(Random r) { while (true) { String regexp = randomRegexpString(r); // we will also generate some undefined unicode queries if (!UnicodeUtil.validUTF16String(regexp)) continue; try { new RegExp(regexp, RegExp.NONE); return regexp; } catch (Exception e) {} } } private static String randomRegexpString(Random r) { final int end = r.nextInt(20); if (end == 0) { // allow 0 length return ""; } final char[] buffer = new char[end]; for (int i = 0; i < end; i++) { int t = r.nextInt(15); if (0 == t && i < end - 1) { // Make a surrogate pair // High surrogate buffer[i++] = (char) TestUtil.nextInt(r, 0xd800, 0xdbff); // Low surrogate buffer[i] = (char) TestUtil.nextInt(r, 0xdc00, 0xdfff); } else if (t <= 1) buffer[i] = (char) r.nextInt(0x80); else if (2 == t) buffer[i] = (char) TestUtil.nextInt(r, 0x80, 0x800); else if (3 == t) buffer[i] = (char) TestUtil.nextInt(r, 0x800, 0xd7ff); else if (4 == t) buffer[i] = (char) TestUtil.nextInt(r, 0xe000, 0xffff); else if (5 == t) buffer[i] = '.'; else if (6 == t) buffer[i] = '?'; else if (7 == t) buffer[i] = '*'; else if (8 == t) buffer[i] = '+'; else if (9 == t) buffer[i] = '('; else if (10 == t) buffer[i] = ')'; else if (11 == t) buffer[i] = '-'; else if (12 == t) buffer[i] = '['; else if (13 == t) buffer[i] = ']'; else if (14 == t) buffer[i] = '|'; } return new String(buffer, 0, end); } /** picks a random int code point, avoiding surrogates; * throws IllegalArgumentException if this transition only * accepts surrogates */ private static int getRandomCodePoint(final Random r, final Transition t) { final int code; if (t.max < UnicodeUtil.UNI_SUR_HIGH_START || t.min > UnicodeUtil.UNI_SUR_HIGH_END) { // easy: entire range is before or after surrogates code = t.min+r.nextInt(t.max-t.min+1); } else if (t.min >= UnicodeUtil.UNI_SUR_HIGH_START) { if (t.max > UnicodeUtil.UNI_SUR_LOW_END) { // after surrogates code = 1+UnicodeUtil.UNI_SUR_LOW_END+r.nextInt(t.max-UnicodeUtil.UNI_SUR_LOW_END); } else { throw new IllegalArgumentException("transition accepts only surrogates: " + t); } } else if (t.max <= UnicodeUtil.UNI_SUR_LOW_END) { if (t.min < UnicodeUtil.UNI_SUR_HIGH_START) { // before surrogates code = t.min + r.nextInt(UnicodeUtil.UNI_SUR_HIGH_START - t.min); } else { throw new IllegalArgumentException("transition accepts only surrogates: " + t); } } else { // range includes all surrogates int gap1 = UnicodeUtil.UNI_SUR_HIGH_START - t.min; int gap2 = t.max - UnicodeUtil.UNI_SUR_LOW_END; int c = r.nextInt(gap1+gap2); if (c < gap1) { code = t.min + c; } else { code = UnicodeUtil.UNI_SUR_LOW_END + c - gap1 + 1; } } assert code >= t.min && code <= t.max && (code < UnicodeUtil.UNI_SUR_HIGH_START || code > UnicodeUtil.UNI_SUR_LOW_END): "code=" + code + " min=" + t.min + " max=" + t.max; return code; } /** * Lets you retrieve random strings accepted * by an Automaton. * <p> * Once created, call {@link #getRandomAcceptedString(Random)} * to get a new string (in UTF-32 codepoints). */ public static class RandomAcceptedStrings { private final Map<Transition,Boolean> leadsToAccept; private final Automaton a; private static class ArrivingTransition { final State from; final Transition t; public ArrivingTransition(State from, Transition t) { this.from = from; this.t = t; } } public RandomAcceptedStrings(Automaton a) { this.a = a; if (a.isSingleton()) { leadsToAccept = null; return; } // must use IdentityHashmap because two Transitions w/ // different start nodes can be considered the same leadsToAccept = new IdentityHashMap<>(); final Map<State,List<ArrivingTransition>> allArriving = new HashMap<>(); final LinkedList<State> q = new LinkedList<>(); final Set<State> seen = new HashSet<>(); // reverse map the transitions, so we can quickly look // up all arriving transitions to a given state for(State s: a.getNumberedStates()) { for(int i=0;i<s.numTransitions;i++) { final Transition t = s.transitionsArray[i]; List<ArrivingTransition> tl = allArriving.get(t.to); if (tl == null) { tl = new ArrayList<>(); allArriving.put(t.to, tl); } tl.add(new ArrivingTransition(s, t)); } if (s.accept) { q.add(s); seen.add(s); } } // Breadth-first search, from accept states, // backwards: while(!q.isEmpty()) { final State s = q.removeFirst(); List<ArrivingTransition> arriving = allArriving.get(s); if (arriving != null) { for(ArrivingTransition at : arriving) { final State from = at.from; if (!seen.contains(from)) { q.add(from); seen.add(from); leadsToAccept.put(at.t, Boolean.TRUE); } } } } } public int[] getRandomAcceptedString(Random r) { final List<Integer> soFar = new ArrayList<>(); if (a.isSingleton()) { // accepts only one final String s = a.singleton; int charUpto = 0; while(charUpto < s.length()) { final int cp = s.codePointAt(charUpto); charUpto += Character.charCount(cp); soFar.add(cp); } } else { State s = a.initial; while(true) { if (s.accept) { if (s.numTransitions == 0) { // stop now break; } else { if (r.nextBoolean()) { break; } } } if (s.numTransitions == 0) { throw new RuntimeException("this automaton has dead states"); } boolean cheat = r.nextBoolean(); final Transition t; if (cheat) { // pick a transition that we know is the fastest // path to an accept state List<Transition> toAccept = new ArrayList<>(); for(int i=0;i<s.numTransitions;i++) { final Transition t0 = s.transitionsArray[i]; if (leadsToAccept.containsKey(t0)) { toAccept.add(t0); } } if (toAccept.size() == 0) { // this is OK -- it means we jumped into a cycle t = s.transitionsArray[r.nextInt(s.numTransitions)]; } else { t = toAccept.get(r.nextInt(toAccept.size())); } } else { t = s.transitionsArray[r.nextInt(s.numTransitions)]; } soFar.add(getRandomCodePoint(r, t)); s = t.to; } } return ArrayUtil.toIntArray(soFar); } } /** return a random NFA/DFA for testing */ public static Automaton randomAutomaton(Random random) { // get two random Automata from regexps Automaton a1 = new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton(); if (random.nextBoolean()) a1 = BasicOperations.complement(a1); Automaton a2 = new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton(); if (random.nextBoolean()) a2 = BasicOperations.complement(a2); // combine them in random ways switch(random.nextInt(4)) { case 0: return BasicOperations.concatenate(a1, a2); case 1: return BasicOperations.union(a1, a2); case 2: return BasicOperations.intersection(a1, a2); default: return BasicOperations.minus(a1, a2); } } /** * below are original, unoptimized implementations of DFA operations for testing. * These are from brics automaton, full license (BSD) below: */ /* * dk.brics.automaton * * Copyright (c) 2001-2009 Anders Moeller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /** * Simple, original brics implementation of Brzozowski minimize() */ public static void minimizeSimple(Automaton a) { if (a.isSingleton()) return; determinizeSimple(a, SpecialOperations.reverse(a)); determinizeSimple(a, SpecialOperations.reverse(a)); } /** * Simple, original brics implementation of determinize() */ public static void determinizeSimple(Automaton a) { if (a.deterministic || a.isSingleton()) return; Set<State> initialset = new HashSet<>(); initialset.add(a.initial); determinizeSimple(a, initialset); } /** * Simple, original brics implementation of determinize() * Determinizes the given automaton using the given set of initial states. */ public static void determinizeSimple(Automaton a, Set<State> initialset) { int[] points = a.getStartPoints(); // subset construction Map<Set<State>, Set<State>> sets = new HashMap<>(); LinkedList<Set<State>> worklist = new LinkedList<>(); Map<Set<State>, State> newstate = new HashMap<>(); sets.put(initialset, initialset); worklist.add(initialset); a.initial = new State(); newstate.put(initialset, a.initial); while (worklist.size() > 0) { Set<State> s = worklist.removeFirst(); State r = newstate.get(s); for (State q : s) if (q.accept) { r.accept = true; break; } for (int n = 0; n < points.length; n++) { Set<State> p = new HashSet<>(); for (State q : s) for (Transition t : q.getTransitions()) if (t.min <= points[n] && points[n] <= t.max) p.add(t.to); if (!sets.containsKey(p)) { sets.put(p, p); worklist.add(p); newstate.put(p, new State()); } State q = newstate.get(p); int min = points[n]; int max; if (n + 1 < points.length) max = points[n + 1] - 1; else max = Character.MAX_CODE_POINT; r.addTransition(new Transition(min, max, q)); } } a.deterministic = true; a.clearNumberedStates(); a.removeDeadTransitions(); } /** * Returns true if the language of this automaton is finite. * <p> * WARNING: this method is slow, it will blow up if the automaton is large. * this is only used to test the correctness of our faster implementation. */ public static boolean isFiniteSlow(Automaton a) { if (a.isSingleton()) return true; return isFiniteSlow(a.initial, new HashSet<State>()); } /** * Checks whether there is a loop containing s. (This is sufficient since * there are never transitions to dead states.) */ // TODO: not great that this is recursive... in theory a // large automata could exceed java's stack private static boolean isFiniteSlow(State s, HashSet<State> path) { path.add(s); for (Transition t : s.getTransitions()) if (path.contains(t.to) || !isFiniteSlow(t.to, path)) return false; path.remove(s); return true; } /** * Checks that an automaton has no detached states that are unreachable * from the initial state. */ public static void assertNoDetachedStates(Automaton a) { int numStates = a.getNumberOfStates(); a.clearNumberedStates(); // force recomputation of cached numbered states assert numStates == a.getNumberOfStates() : "automaton has " + (numStates - a.getNumberOfStates()) + " detached states"; } }