/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * JFlex 1.4.1 * * Copyright (C) 1998-2004 Gerwin Klein <lsf@jflex.de> * * All rights reserved. * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License. See the file * * COPYRIGHT for more information. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License along * * with this program; if not, write to the Free Software Foundation, Inc., * * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ package weka.core.parser.JFlex; import java.util.*; import java.io.*; /** * NFA representation in JFlex. * * Contains algorithms RegExp -> NFA and NFA -> DFA. * * @author Gerwin Klein * @version JFlex 1.4.1, $Revision: 1.1 $, $Date: 2008/05/09 09:14:11 $ */ final public class NFA { // table[current_state][next_char] is the set of states that can be reached // from current_state with an input next_char StateSet [][] table; // epsilon[current_state] is the set of states that can be reached // from current_state via epsilon-edges StateSet [] epsilon; // isFinal[state] == true <=> state is a final state of the NFA boolean [] isFinal; // isPushback[state] == true <=> state is the final state of a regexp that // should only be matched when followed by a certain lookahead. boolean [] isPushback; // action[current_state]: the action associated with the state // current_state (null, if there is no action for the state) Action [] action; // the number of states in this NFA int numStates; // the current maximum number of input characters int numInput; // the number of lexical States. Lexical states have the indices // 0..numLexStates-1 in the transition table int numLexStates; // estimated size of the NFA (before actual construction) int estSize = 256; Macros macros; CharClasses classes; LexScan scanner; RegExps regExps; // will be reused by several methods (avoids excessive object creation) private static StateSetEnumerator states = new StateSetEnumerator(); private static StateSet tempStateSet = new StateSet(); public NFA(int numInput, int estSize) { this.numInput = numInput; this.estSize = estSize; numStates = 0; epsilon = new StateSet [estSize]; action = new Action [estSize]; isFinal = new boolean [estSize]; isPushback = new boolean [estSize]; table = new StateSet [estSize][numInput]; } public NFA(int numInput, LexScan scanner, RegExps regExps, Macros macros, CharClasses classes) { this(numInput, regExps.NFASize(macros)+2*scanner.states.number()); this.scanner = scanner; this.regExps = regExps; this.macros = macros; this.classes = classes; numLexStates = scanner.states.number(); ensureCapacity(2*numLexStates); numStates = 2*numLexStates; } public void addStandaloneRule() { // standalone rule has least priority, fires // transition on all characters and has "print it rule" int start = numStates; int end = numStates+1; for (int c = 0; c < classes.getNumClasses(); c++) addTransition(start, c, end); for (int i = 0; i < numLexStates*2; i++) addEpsilonTransition(i, start); action[end] = new Action("System.out.print(yytext());", Integer.MAX_VALUE); isFinal[end] = true; } public void addRegExp(int regExpNum) { if (Options.DEBUG) Out.debug("Adding nfa for regexp "+regExpNum+" :"+Out.NL+regExps.getRegExp(regExpNum)); IntPair nfa = insertNFA( regExps.getRegExp(regExpNum) ); Enumeration lexStates = regExps.getStates(regExpNum).elements(); if ( !lexStates.hasMoreElements() ) lexStates = scanner.states.getInclusiveStates(); while ( lexStates.hasMoreElements() ) { int stateNum = ((Integer)lexStates.nextElement()).intValue(); if ( !regExps.isBOL(regExpNum) ) addEpsilonTransition(2*stateNum, nfa.start); addEpsilonTransition(2*stateNum+1, nfa.start); } if ( regExps.getLookAhead(regExpNum) != null ) { IntPair look = insertNFA( regExps.getLookAhead(regExpNum) ); addEpsilonTransition(nfa.end, look.start); Action a = regExps.getAction(regExpNum); a.setLookAction(true); isPushback[nfa.end] = true; action[look.end] = a; isFinal[look.end] = true; } else { action[nfa.end] = regExps.getAction(regExpNum); isFinal[nfa.end] = true; } } private void ensureCapacity(int newNumStates) { int oldLength = epsilon.length; if ( newNumStates < oldLength ) return; int newStatesLength = Math.max(oldLength*2, newNumStates); boolean [] newFinal = new boolean [newStatesLength]; boolean [] newIsPush = new boolean [newStatesLength]; Action [] newAction = new Action [newStatesLength]; StateSet [] [] newTable = new StateSet [newStatesLength] [numInput]; StateSet [] newEpsilon = new StateSet [newStatesLength]; System.arraycopy(isFinal,0,newFinal,0,numStates); System.arraycopy(isPushback,0,newIsPush,0,numStates); System.arraycopy(action,0,newAction,0,numStates); System.arraycopy(epsilon,0,newEpsilon,0,numStates); System.arraycopy(table,0,newTable,0,numStates); isFinal = newFinal; isPushback = newIsPush; action = newAction; epsilon = newEpsilon; table = newTable; } public void addTransition(int start, int input, int dest) { Out.debug("Adding transition ("+start+", "+input+", "+dest+")"); int maxS = Math.max(start,dest)+1; ensureCapacity( maxS ); if (maxS > numStates) numStates = maxS; if ( table[start][input] != null ) table[start][input].addState(dest); else table[start][input] = new StateSet(estSize,dest); } public void addEpsilonTransition(int start, int dest) { int max = Math.max(start,dest)+1; ensureCapacity( max ); if (max > numStates) numStates = max; if (epsilon[start] != null) epsilon[start].addState(dest); else epsilon[start] = new StateSet(estSize,dest); } /** * Returns <code>true</code>, iff the specified set of states * contains a final state. * * @param set the set of states that is tested for final states. */ private boolean containsFinal(StateSet set) { states.reset(set); while ( states.hasMoreElements() ) if ( isFinal[states.nextElement()] ) return true; return false; } /** * Returns <code>true</code>, iff the specified set of states * contains a pushback-state. * * @param set the set of states that is tested for pushback-states. */ private boolean containsPushback(StateSet set) { states.reset(set); while ( states.hasMoreElements() ) if ( isPushback[states.nextElement()] ) return true; return false; } /** * Returns the action with highest priority in the specified * set of states. * * @param set the set of states for which to determine the action */ private Action getAction(StateSet set) { states.reset(set); Action maxAction = null; Out.debug("Determining action of : "+set); while ( states.hasMoreElements() ) { Action currentAction = action[ states.nextElement() ]; if ( currentAction != null ) { if (maxAction == null) maxAction = currentAction; else maxAction = maxAction.getHigherPriority(currentAction); } } return maxAction; } /** * Calculates the epsilon closure for a specified set of states. * * The epsilon closure for set a is the set of states that can be reached * by epsilon edges from a. * * @param set the set of states to calculate the epsilon closure for * * @return the epsilon closure of the specified set of states * in this NFA */ private StateSet closure(int startState) { // Out.debug("Calculating closure of "+set); StateSet notvisited = tempStateSet; StateSet closure = new StateSet(numStates,startState); notvisited.clear(); notvisited.addState(startState); while ( notvisited.containsElements() ) { // Out.debug("closure is now "+closure); // Out.debug("notvisited is "+notvisited); int state = notvisited.getAndRemoveElement(); // Out.debug("removed element "+state+" of "+notvisited); // Out.debug("epsilon[states] = "+epsilon[state]); notvisited.add(closure.complement(epsilon[state])); closure.add(epsilon[state]); } // Out.debug("Closure is : "+closure); return closure; } /** * Returns the epsilon closure of a set of states */ private StateSet closure(StateSet startStates) { StateSet result = new StateSet(numStates); if (startStates != null) { states.reset(startStates); while (states.hasMoreElements()) result.add( closure(states.nextElement()) ); } return result; } private void epsilonFill() { for (int i = 0; i < numStates; i++) { epsilon[i] = closure(i); } } /** * Calculates the set of states that can be reached from another * set of states <code>start</code> with an specified input * character <code>input</code> * * @param start the set of states to start from * @param input the input character for which to search the next states * * @return the set of states that are reached from <code>start</code> * via <code>input</code> */ private StateSet DFAEdge(StateSet start, char input) { // Out.debug("Calculating DFAEdge for state set "+start+" and input '"+input+"'"); tempStateSet.clear(); states.reset(start); while ( states.hasMoreElements() ) tempStateSet.add( table[states.nextElement()][input] ); StateSet result = new StateSet(tempStateSet); states.reset(tempStateSet); while ( states.hasMoreElements() ) result.add( epsilon[states.nextElement()] ); // Out.debug("DFAEdge is : "+result); return result; } /** * Returns an DFA that accepts the same language as this NFA. * This DFA is usualy not minimal. */ public DFA getDFA() { Hashtable dfaStates = new Hashtable(numStates); Vector dfaVector = new Vector(numStates); DFA dfa = new DFA(2*numLexStates, numInput); int numDFAStates = 0; int currentDFAState = 0; Out.println("Converting NFA to DFA : "); epsilonFill(); StateSet currentState, newState; for ( int i = 0; i < 2*numLexStates; i++ ) { newState = epsilon[i]; dfaStates.put(newState, new Integer(numDFAStates)); dfaVector.addElement(newState); dfa.setLexState( i, numDFAStates ); dfa.setFinal( numDFAStates, containsFinal(newState) ); dfa.setPushback( numDFAStates, containsPushback(newState) ); dfa.setAction( numDFAStates, getAction(newState) ); numDFAStates++; } numDFAStates--; if (Options.DEBUG) Out.debug("DFA start states are :"+Out.NL+dfaStates+Out.NL+Out.NL+"ordered :"+Out.NL+dfaVector); currentDFAState = 0; StateSet tempStateSet = NFA.tempStateSet; StateSetEnumerator states = NFA.states; // will be reused newState = new StateSet(numStates); while ( currentDFAState <= numDFAStates ) { currentState = (StateSet) dfaVector.elementAt(currentDFAState); for (char input = 0; input < numInput; input++) { // newState = DFAEdge(currentState, input); // inlining DFAEdge for performance: // Out.debug("Calculating DFAEdge for state set "+currentState+" and input '"+input+"'"); tempStateSet.clear(); states.reset(currentState); while ( states.hasMoreElements() ) tempStateSet.add( table[states.nextElement()][input] ); newState.copy(tempStateSet); states.reset(tempStateSet); while ( states.hasMoreElements() ) newState.add( epsilon[states.nextElement()] ); // Out.debug("DFAEdge is : "+newState); if ( newState.containsElements() ) { // Out.debug("DFAEdge for input "+(int)input+" and state set "+currentState+" is "+newState); // Out.debug("Looking for state set "+newState); Integer nextDFAState = (Integer) dfaStates.get(newState); if ( nextDFAState != null ) { // Out.debug("FOUND!"); dfa.addTransition(currentDFAState, input, nextDFAState.intValue()); } else { if (Options.progress) Out.print("."); // Out.debug("NOT FOUND!"); // Out.debug("Table was "+dfaStates); numDFAStates++; // make a new copy of newState to store in dfaStates StateSet storeState = new StateSet(newState); dfaStates.put(storeState, new Integer(numDFAStates)); dfaVector.addElement(storeState); dfa.addTransition(currentDFAState, input, numDFAStates); dfa.setFinal( numDFAStates, containsFinal(storeState) ); dfa.setPushback( numDFAStates, containsPushback(storeState) ); dfa.setAction( numDFAStates, getAction(storeState) ); } } } currentDFAState++; } if (Options.verbose) Out.println(""); return dfa; } public void dumpTable() { Out.dump(toString()); } public String toString() { StringBuffer result = new StringBuffer(); for (int i=0; i < numStates; i++) { result.append("State"); if ( isFinal[i] ) result.append("[FINAL]"); if ( isPushback[i] ) result.append(" [PUSHBACK]"); result.append(" "+i+Out.NL); for (char input = 0; input < numInput; input++) { if ( table[i][input] != null && table[i][input].containsElements() ) result.append(" with "+((int) input)+" in "+table[i][input]+Out.NL); } if ( epsilon[i] != null && epsilon[i].containsElements() ) result.append(" with epsilon in "+epsilon[i]+Out.NL); } return result.toString(); } public void writeDot(File file) { try { PrintWriter writer = new PrintWriter(new FileWriter(file)); writer.println(dotFormat()); writer.close(); } catch (IOException e) { Out.error(ErrorMessages.FILE_WRITE, file); throw new GeneratorException(); } } public String dotFormat() { StringBuffer result = new StringBuffer(); result.append("digraph NFA {"+Out.NL); result.append("rankdir = LR"+Out.NL); for (int i=0; i < numStates; i++) { if ( isFinal[i] || isPushback[i] ) result.append(i); if ( isFinal[i] ) result.append(" [shape = doublecircle]"); if ( isPushback[i] ) result.append(" [shape = box]"); if ( isFinal[i] || isPushback[i] ) result.append(Out.NL); } for (int i=0; i < numStates; i++) { for (int input = 0; input < numInput; input++) { if ( table[i][input] != null ) { StateSetEnumerator states = table[i][input].states(); while (states.hasMoreElements()) { int s = states.nextElement(); result.append(i+" -> "+s); result.append(" [label=\""+classes.toString(input)+"\"]"+Out.NL); } } } if ( epsilon[i] != null ) { StateSetEnumerator states = epsilon[i].states(); while (states.hasMoreElements()) { int s = states.nextElement(); result.append(i+" -> "+s+" [style=dotted]"+Out.NL); } } } result.append("}"+Out.NL); return result.toString(); } //----------------------------------------------------------------------- // Functions for constructing NFAs out of regular expressions. private void insertLetterNFA(boolean caseless, char letter, int start, int end) { if (caseless) { int lower = classes.getClassCode(Character.toLowerCase(letter)); int upper = classes.getClassCode(Character.toUpperCase(letter)); addTransition(start, lower, end); if (upper != lower) addTransition(start, upper, end); } else { addTransition(start, classes.getClassCode(letter), end); } } private IntPair insertStringNFA(boolean caseless, String letters) { int start = numStates; int i; for (i = 0; i < letters.length(); i++) { if (caseless) { char c = letters.charAt(i); int lower = classes.getClassCode(Character.toLowerCase(c)); int upper = classes.getClassCode(Character.toUpperCase(c)); addTransition(i+start, lower, i+start+1); if (upper != lower) addTransition(i+start, upper, i+start+1); } else { addTransition(i+start, classes.getClassCode(letters.charAt(i)), i+start+1); } } return new IntPair(start, i+start); } private void insertClassNFA(Vector intervalls, int start, int end) { // empty char class is ok: if (intervalls == null) return; int [] cl = classes.getClassCodes(intervalls); for (int i = 0; i < cl.length; i++) addTransition(start, cl[i], end); } private void insertNotClassNFA(Vector intervalls, int start, int end) { int [] cl = classes.getNotClassCodes(intervalls); for (int i = 0; i < cl.length; i++) addTransition(start, cl[i], end); } /** * Constructs an NFA accepting the complement of the language * of a given NFA. * * Converts the NFA into a DFA, then negates that DFA. * Exponential state blowup possible and common. * * @param the NFA to construct the complement for. * * @return a pair of integers denoting the index of start * and end state of the complement NFA. */ private IntPair complement(IntPair nfa) { if (Options.DEBUG) { Out.debug("complement for "+nfa); Out.debug("NFA is :"+Out.NL+this); } int dfaStart = nfa.end+1; // fixme: only need epsilon closure of states reachable from nfa.start epsilonFill(); Hashtable dfaStates = new Hashtable(numStates); Vector dfaVector = new Vector(numStates); int numDFAStates = 0; int currentDFAState = 0; StateSet currentState, newState; newState = epsilon[nfa.start]; dfaStates.put(newState, new Integer(numDFAStates)); dfaVector.addElement(newState); if (Options.DEBUG) Out.debug("pos DFA start state is :"+Out.NL+dfaStates+Out.NL+Out.NL+"ordered :"+Out.NL+dfaVector); currentDFAState = 0; while ( currentDFAState <= numDFAStates ) { currentState = (StateSet) dfaVector.elementAt(currentDFAState); for (char input = 0; input < numInput; input++) { newState = DFAEdge(currentState, input); if ( newState.containsElements() ) { // Out.debug("DFAEdge for input "+(int)input+" and state set "+currentState+" is "+newState); // Out.debug("Looking for state set "+newState); Integer nextDFAState = (Integer) dfaStates.get(newState); if ( nextDFAState != null ) { // Out.debug("FOUND!"); addTransition(dfaStart+currentDFAState, input, dfaStart+nextDFAState.intValue()); } else { if (Options.dump) Out.print("+"); // Out.debug("NOT FOUND!"); // Out.debug("Table was "+dfaStates); numDFAStates++; dfaStates.put(newState, new Integer(numDFAStates)); dfaVector.addElement(newState); addTransition(dfaStart+currentDFAState, input, dfaStart+numDFAStates); } } } currentDFAState++; } // We have a dfa accepting the positive regexp. // Now the complement: if (Options.DEBUG) Out.debug("dfa finished, nfa is now :"+Out.NL+this); int start = dfaStart+numDFAStates+1; int error = dfaStart+numDFAStates+2; int end = dfaStart+numDFAStates+3; addEpsilonTransition(start, dfaStart); for (int i = 0; i < numInput; i++) addTransition(error, i, error); addEpsilonTransition(error, end); for (int s = 0; s <= numDFAStates; s++) { currentState = (StateSet) dfaVector.elementAt(s); currentDFAState = dfaStart+s; // if it was not a final state, it is now in the complement if (!currentState.isElement(nfa.end)) addEpsilonTransition(currentDFAState, end); // all inputs not present (formerly leading to an implicit error) // now lead to an explicit (final) state accepting everything. for (int i = 0; i < numInput; i++) if (table[currentDFAState][i] == null) addTransition(currentDFAState, i, error); } // eliminate transitions leading to dead states if (live == null || live.length < numStates) { live = new boolean [2*numStates]; visited = new boolean [2*numStates]; } _end = end; _dfaStates = dfaVector; _dfaStart = dfaStart; removeDead(dfaStart); if (Options.DEBUG) Out.debug("complement finished, nfa ("+start+","+end+") is now :"+this); return new IntPair(start, end); } // "global" data for use in method removeDead only: // live[s] == false <=> no final state can be reached from s private boolean [] live; // = new boolean [estSize]; private boolean [] visited; // = new boolean [estSize]; private int _end; // final state of original nfa for dfa (nfa coordinates) private Vector _dfaStates; private int _dfaStart; // in nfa coordinates private void removeDead(int start) { // Out.debug("removeDead ("+start+")"); if ( visited[start] || live[start] ) return; visited[start] = true; // Out.debug("not yet visited"); if (closure(start).isElement(_end)) live[start] = true; // Out.debug("is final :"+live[start]); for (int i = 0; i < numInput; i++) { StateSet nextState = closure(table[start][i]); StateSetEnumerator states = nextState.states(); while (states.hasMoreElements()) { int next = states.nextElement(); if (next != start) { removeDead(next); if (live[next]) live[start] = true; else table[start][i] = null; } } } StateSet nextState = closure(epsilon[start]); StateSetEnumerator states = nextState.states(); while (states.hasMoreElements()) { int next = states.nextElement(); if (next != start) { removeDead(next); if (live[next]) live[start] = true; } } // Out.debug("state "+start+" is live :"+live[start]); } /** * Constructs a two state NFA for char class regexps, * such that the NFA has * * exactly one start state, * exactly one end state, * no transitions leading out of the end state * no transitions leading into the start state * * Assumes that regExp.isCharClass(macros) == true * * @param regExp the regular expression to construct the * NFA for * * @return a pair of integers denoting the index of start * and end state of the NFA. */ private void insertNFA(RegExp regExp, int start, int end) { switch (regExp.type) { case sym.BAR: RegExp2 r = (RegExp2) regExp; insertNFA(r.r1, start, end); insertNFA(r.r2, start, end); return; case sym.CCLASS: insertClassNFA( (Vector) ((RegExp1) regExp).content, start, end); return; case sym.CCLASSNOT: insertNotClassNFA( (Vector) ((RegExp1) regExp).content, start, end); return; case sym.CHAR: insertLetterNFA( false, ((Character) ((RegExp1) regExp).content).charValue(), start, end); return; case sym.CHAR_I: insertLetterNFA( true, ((Character) ((RegExp1) regExp).content).charValue(), start, end); return; case sym.MACROUSE: insertNFA(macros.getDefinition((String) ((RegExp1) regExp).content), start, end); return; } throw new Error("Unknown expression type "+regExp.type+" in NFA construction"); } /** * Constructs an NFA for regExp such that the NFA has * * exactly one start state, * exactly one end state, * no transitions leading out of the end state * no transitions leading into the start state * * @param regExp the regular expression to construct the * NFA for * * @return a pair of integers denoting the index of start * and end state of the NFA. */ public IntPair insertNFA(RegExp regExp) { IntPair nfa1, nfa2; int start, end; RegExp2 r; if (Options.DEBUG) Out.debug("Inserting RegExp : "+regExp); if (regExp.isCharClass(macros)) { start = numStates; end = numStates+1; ensureCapacity(end+1); if (end+1 > numStates) numStates = end+1; insertNFA(regExp, start, end); return new IntPair(start, end); } switch (regExp.type) { case sym.BAR: r = (RegExp2) regExp; nfa1 = insertNFA(r.r1); nfa2 = insertNFA(r.r2); start = nfa2.end+1; end = nfa2.end+2; addEpsilonTransition(start, nfa1.start); addEpsilonTransition(start, nfa2.start); addEpsilonTransition(nfa1.end, end); addEpsilonTransition(nfa2.end, end); return new IntPair(start, end); case sym.CONCAT: r = (RegExp2) regExp; nfa1 = insertNFA(r.r1); nfa2 = insertNFA(r.r2); addEpsilonTransition(nfa1.end, nfa2.start); return new IntPair(nfa1.start, nfa2.end); case sym.STAR: nfa1 = insertNFA( (RegExp) ((RegExp1) regExp).content ); start = nfa1.end+1; end = nfa1.end+2; addEpsilonTransition(nfa1.end, end); addEpsilonTransition(start, nfa1.start); addEpsilonTransition(start, end); addEpsilonTransition(nfa1.end, nfa1.start); return new IntPair(start, end); case sym.PLUS: nfa1 = insertNFA( (RegExp) ((RegExp1) regExp).content ); start = nfa1.end+1; end = nfa1.end+2; addEpsilonTransition(nfa1.end, end); addEpsilonTransition(start, nfa1.start); addEpsilonTransition(nfa1.end, nfa1.start); return new IntPair(start, end); case sym.QUESTION: nfa1 = insertNFA( (RegExp) ((RegExp1) regExp).content ); addEpsilonTransition(nfa1.start, nfa1.end); return new IntPair(nfa1.start, nfa1.end); case sym.BANG: return complement(insertNFA((RegExp) ((RegExp1) regExp).content)); case sym.TILDE: nfa1 = insertNFA((RegExp) ((RegExp1) regExp).content); start = nfa1.end+1; int s1 = start+1; int s2 = s1+1; end = s2+1; for (int i = 0; i < numInput; i++) { addTransition(s1,i,s1); addTransition(s2,i,s2); } addEpsilonTransition(start, s1); addEpsilonTransition(s1, nfa1.start); addEpsilonTransition(nfa1.end, s2); addEpsilonTransition(s2, end); nfa1 = complement(new IntPair(start,end)); nfa2 = insertNFA((RegExp) ((RegExp1) regExp).content); addEpsilonTransition(nfa1.end, nfa2.start); return new IntPair(nfa1.start, nfa2.end); case sym.STRING: return insertStringNFA(false, (String) ((RegExp1) regExp).content ); case sym.STRING_I: return insertStringNFA(true, (String) ((RegExp1) regExp).content ); case sym.MACROUSE: return insertNFA(macros.getDefinition((String) ((RegExp1) regExp).content)); } throw new Error("Unknown expression type "+regExp.type+" in NFA construction"); } }