package client.net.sf.saxon.ce.regex;

import client.net.sf.saxon.ce.expr.z.*;

import java.io.PrintStream;

/**
 * A class that holds compiled regular expressions.
 *
 * @see REMatcher
 * @see RECompiler
 *
 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
 * @version $Id: REProgram.java 518156 2007-03-14 14:31:26Z vgritsenko $
 */
public class REProgram {
    static final int OPT_HASBACKREFS = 1;
    static final int OPT_HASBOL = 2;

    Operation[] instructions;
    REFlags flags;
    UnicodeString prefix;       // Prefix string optimization
    int optimizationFlags;      // Optimization flags (REProgram.OPT_*)
    int maxParens = -1;
    boolean nullable = false;

    /**
     * Constructs a program object from an array of operations.
     *
     * @param instructions Array with RE opcode instructions in it. The "next"
     * pointers within the operations must already have been converted to absolute
     * offsets.
     * @param parens Count of parens in the program
     * @param flags The flags in effect for the regular expression; consulted for
     * case-independence when optimizing quantifiers
     */
    public REProgram(Operation[] instructions, int parens, REFlags flags) {
        // flags must be assigned first: setInstructions() reads it.
        this.flags = flags;
        setInstructions(instructions);
        this.maxParens = parens;
    }

    /**
     * Sets a new regular expression program to run. It is this method which
     * performs any special compile-time search optimizations:
     * <ul>
     * <li>finding a prefix anchor string (or a leading BOL) so that substantial
     * amounts of input can potentially be skipped without running the actual program;</li>
     * <li>checking for backreferences (so that they can be lazily allocated);</li>
     * <li>replacing unambiguous <code>*</code> and <code>+</code> quantifiers by
     * their non-backtracking ("confident") forms.</li>
     * </ul>
     *
     * @param instructions Program instruction buffer; may be null or empty, in
     * which case no optimization is attempted
     */
    private void setInstructions(Operation[] instructions) {
        // Save reference to instruction array
        this.instructions = instructions;

        // Initialize other program-related variables
        this.optimizationFlags = 0;
        this.prefix = null;

        // Try various compile-time optimizations only if there's a program
        if (instructions == null || instructions.length == 0) {
            return;
        }
        findPrefixOrAnchor();
        flagBackReferences();
        optimizeDeterministicQuantifiers(flags.isCaseIndependent());
    }

    /**
     * Records the literal prefix of the program, or the fact that it starts with BOL.
     */
    private void findPrefixOrAnchor() {
        final Operation first = instructions[0];
        if (first instanceof Operation.OpAtom) {
            prefix = ((Operation.OpAtom) first).atom;
        }

        // If the first node is a branch ...
        if (first instanceof Operation.OpBranch) {
            // ... to the end node (i.e. there is only one alternative)
            final int next = first.next;
            if (instructions.length > 1 && isValidIndex(next)
                    && instructions[next] instanceof Operation.OpEndProgram) {
                final Operation nextOp = instructions[1];
                if (nextOp instanceof Operation.OpAtom) {
                    // the branch starts with an atom: take that atom as a prefix
                    // because there's no other choice
                    this.prefix = ((Operation.OpAtom) nextOp).atom;
                } else if (nextOp instanceof Operation.OpBOL) {
                    // the branch starts with a BOL: set the flag indicating that BOL is present
                    this.optimizationFlags |= OPT_HASBOL;
                }
            }
        }
    }

    /**
     * Sets OPT_HASBACKREFS if any instruction in the program is a backreference.
     */
    private void flagBackReferences() {
        for (Operation op : instructions) {
            if (op instanceof Operation.OpBackReference) {
                optimizationFlags |= OPT_HASBACKREFS;
                break;
            }
        }
    }

    /**
     * Replaces deterministic quantifiers by their "confident" equivalents. The optimization
     * causes constructs such as A* or [0-9]+ to be evaluated using iteration rather than
     * recursion if there is no ambiguity about the ending condition, which means there will
     * never be any need to backtrack.
     *
     * <p>Every array access is range-checked first: an instruction whose neighbours or
     * "next" pointer fall outside the program is simply left unoptimized.</p>
     *
     * @param caseBlind true if matching is case-independent
     */
    private void optimizeDeterministicQuantifiers(boolean caseBlind) {
        for (int i = 0; i < instructions.length; i++) {
            final Operation op = instructions[i];
            if (op instanceof Operation.OpStar && op.next == i + 2 && isValidIndex(i + 2)
                    && isAtomOrCharClass(instructions[i + 1])) {
                if (noAmbiguity(instructions[i + 1], instructions[op.next], caseBlind)) {
                    //System.err.println("Optimizing *");
                    final Operation confident = new Operation.OpConfidentStar();
                    confident.next = op.next;
                    instructions[i] = confident;
                }
            } else if (op instanceof Operation.OpPlus && op.next == i - 2
                    && i >= 2 && isValidIndex(i + 1)
                    && isAtomOrCharClass(instructions[i - 1])
                    && instructions[i - 2].next == i + 1) {
                if (noAmbiguity(instructions[i - 1], instructions[i + 1], caseBlind)) {
                    //System.err.println("Optimizing +");
                    final Operation confident = new Operation.OpConfidentPlus();
                    confident.next = i + 1;
                    instructions[i] = confident;
                }
            }
        }
    }

    /**
     * Tests whether an offset designates an existing slot of the instruction array.
     */
    private boolean isValidIndex(int index) {
        return index >= 0 && index < instructions.length;
    }

    /**
     * Tests whether an operation is an atom or a character class.
     */
    private static boolean isAtomOrCharClass(Operation op) {
        return op instanceof Operation.OpAtom || op instanceof Operation.OpCharClass;
    }

    /**
     * Ask whether the regular expression matches a zero length string
     * @return true if the regex matches a zero length string
     */
    public boolean isNullable() {
        return nullable;
    }

    /**
     * Say whether the regular expression matches a zero length string
     * @param nullable true if the regex matches a zero length string
     */
    public void setNullable(boolean nullable) {
        this.nullable = nullable;
    }

    /**
     * Returns the prefix of the current regular expression program. If there is
     * no prefix, or there is no program compiled yet, <code>getPrefix</code>
     * will return null. The stored object itself is returned; no copy is made.
     * @return The prefix of the current compiled RE program, or null
     */
    public UnicodeString getPrefix() {
        return prefix;
    }

    /**
     * Output a human-readable printout of the program
     * @param out the stream to which one line per instruction is written
     */
    public void display(PrintStream out) {
        for (int i = 0; i < instructions.length; i++) {
            final int nextOffset = instructions[i].next;
            // A "next" of -1 is not printed
            final String suffix = (nextOffset == -1) ? "" : ", next = " + nextOffset;
            out.println(i + ". " + instructions[i].toString() + suffix);
        }
    }

    /**
     * Determine that there is no ambiguity between two branches, that is, if one of them matches then the
     * other cannot possibly match. (This is for optimization, so it does not have to detect all cases; but
     * if it returns true, then the result must be dependable.)
     * @param op0 the repeated operation; expected to be either an Atom or a CharClass
     * @param op1 the operation that follows the repetition; may be anything
     * @param caseBlind true if matching is case-independent
     * @return true if it can be established that there is no input sequence that will match both instructions
     */
    boolean noAmbiguity(Operation op0, Operation op1, boolean caseBlind) {
        // A closing paren consumes nothing: look at what follows it instead.
        if (op1 instanceof Operation.OpClose || op1 instanceof Operation.OpCloseCluster) {
            final int following = op1.next;
            if (!isValidIndex(following)) {
                // Nothing known about what comes next: give the conservative answer.
                return false;
            }
            op1 = instructions[following];
        }
        if (op1 instanceof Operation.OpEndProgram
                || op1 instanceof Operation.OpBOL
                || op1 instanceof Operation.OpEOL) {
            return true;
        }

        final IntSet set0 = leadingCharSet(op0, caseBlind);
        if (set0 == null) {
            return false;
        }
        final IntSet set1 = leadingCharSet(op1, caseBlind);
        if (set1 == null) {
            return false;
        }
        return isDisjoint(set0, set1);
    }

    /**
     * Computes the set of characters with which a match of the operation can start.
     * @return the set, or null if it cannot be determined for this kind of operation
     */
    private IntSet leadingCharSet(Operation op, boolean caseBlind) {
        if (op instanceof Operation.OpAtom) {
            return getInitialChars((Operation.OpAtom) op, caseBlind);
        }
        if (op instanceof Operation.OpCharClass) {
            final IntPredicate predicate = ((Operation.OpCharClass) op).predicate;
            if (predicate instanceof IntSetPredicate) {
                return ((IntSetPredicate) predicate).getIntSet();
            }
            if (predicate instanceof IntValuePredicate) {
                return new IntSingletonSet(((IntValuePredicate) predicate).getTarget());
            }
        }
        return null;
    }

    /**
     * Returns the set of characters that can match the first character of an atom:
     * the character itself, plus its case variants when matching is case-independent.
     */
    private IntSet getInitialChars(Operation.OpAtom op, boolean caseBlind) {
        final int ch = op.atom.charAt(0);
        if (!caseBlind) {
            return new IntSingletonSet(ch);
        }
        final IntSet set = new IntHashSet(10);
        set.add(ch);
        for (int v : CaseVariants.getCaseVariants(ch)) {
            set.add(v);
        }
        return set;
    }

    /**
     * Tests whether two sets have an empty intersection. Any failure while computing
     * the intersection yields false, which callers treat as "cannot be established".
     */
    boolean isDisjoint(IntSet set0, IntSet set1) {
        try {
            final IntSet intersection = set0.intersect(set1);
            return intersection.isEmpty();
        } catch (Throwable e) {
            // NOTE(review): the catch is deliberately left as broad as it was; presumably
            // some IntSet implementations cannot compute an intersection. Confirm what
            // they throw before narrowing this.
            return false;
        }
    }
}

// This class is derived from the Apache Jakarta project, with substantial
// modifications by Saxonica to make the regular expression dialect conform
// with XPath 2.0 specifications.

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */