package project.phase2.ll1parsergenerator.dfastuff;

import java.util.Scanner;
import java.util.LinkedList;

/**
 * Recursive descent parser that turns textual regular expressions into NFAs.
 *
 * <p>{@link #parseNFA(Scanner)} reads a specification made of predefined character class
 * lines, at least one line that does not start with {@code '$'}, and then the token lines
 * from which the NFA is generated. {@link #fromString(String)} parses a single regular
 * expression.
 *
 * <p>All methods are static and share the parsing state held in {@code line},
 * {@code index} and {@code predefined}, so the class must not be used from several
 * threads at once.
 */
public class Parser {
    /** The expression currently being parsed. */
    private static String line;
    /** Position of the next unread character of {@code line}. */
    private static int index;
    /** Enables trace output of the parser when set to {@code true}. */
    private static final boolean DEBUG = false;
    /** Character class used for {@code '.'}: the range 0x20 through 0x7e. */
    private static final CharacterClass DOT =
            CharacterClass.fromRange((char) 0x20 + "-" + (char) 0x7e);
    /** Named character classes that can be referenced from expressions. */
    private static LinkedList<CharacterClass> predefined;

    //TODO test cases
    /**
     * Prints the NFAs generated for a handful of sample expressions.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        predefined = new LinkedList<CharacterClass>();
        CharacterClass digit = CharacterClass.fromRange("0-9");
        digit.setName("$DIGIT");
        predefined.add(digit);

        // Testing repetition, union, and parenthesis
        runTestCase(1, "(a|b)*", "NFA for (a|b)* \n");
        runTestCase(2, "(a|b)+", "NFA for (a|b)+ \n");
        // testing for character classes
        runTestCase(3, "[ a-d0-3]", "NFA for [ a-d0-3] \n");
        // testing escaped characters
        runTestCase(4, "\\*[\\[-\\]]", "NFA for \\*[\\[-\\]] \n");
        // testing exclude sets
        runTestCase(5, "[^1-9] IN [0-9]", "NFA for [^1-9] IN [0-9] \n");
        // testing predefined classes
        runTestCase(6, "$DIGIT", "NFA for $DIGIT, where $DIGIT is [0-9]\n");
    }

    /**
     * Parses {@code regex} with the current predefined classes and prints the result.
     *
     * @param number      the test case number shown in the header
     * @param regex       the expression to parse
     * @param description the line printed before the generated NFA
     */
    private static void runTestCase(int number, String regex, String description) {
        line = regex;
        index = 0;
        System.out.println("Test Case " + number + ":");
        System.out.println(description);
        System.out.println(regEx() + "\n\n");
    }

    /**
     * Extracts the text enclosed in single quotes from {@code s} and hands it to
     * {@link #parseNFA(Scanner)}.
     *
     * <p>The closing quote is the first quote after the opening one that is not preceded
     * by a backslash escape, so an escaped quote inside the text does not end it.
     *
     * <p>NOTE(review): the quoted text is parsed as a complete specification by
     * {@code parseNFA}, not as a single expression; a lone regular expression will not
     * produce an NFA here. Confirm whether {@link #fromString(String)} was intended.
     *
     * @param s a string containing text enclosed in single quotes
     * @return the result of {@code parseNFA} on the quoted text
     * @throws IllegalArgumentException if {@code s} does not contain a quoted section
     */
    public static NFA fromString2(String s) {
        int open = s.indexOf('\'');
        int close = -1;
        if (open >= 0) {
            for (int i = open + 1; i < s.length(); i++) {
                char ch = s.charAt(i);
                if (ch == '\\') {
                    i++; // the escaped character can never be the closing quote
                } else if (ch == '\'') {
                    close = i;
                    break;
                }
            }
        }
        if (close < 0) {
            throw new IllegalArgumentException(
                    "Expected a regular expression enclosed in single quotes: " + s);
        }
        return parseNFA(new Scanner(s.substring(open + 1, close)));
    }

    /**
     * Parses a single regular expression. The list of predefined classes is reset to
     * empty before parsing.
     *
     * @param s the regular expression
     * @return the NFA generated for {@code s}
     */
    public static NFA fromString(String s) {
        line = s;
        index = 0;
        predefined = new LinkedList<CharacterClass>();
        return regEx();
    }

    /**
     * Generates a NFA that detects tokens as defined in the input stream.
     *
     * <p>Lines are skipped until the first line starting with {@code '$'}. Consecutive
     * {@code '$'} lines are then read as predefined character classes until a line that
     * does not start with {@code '$'} is found. Every following {@code '$'} line is read
     * as a token definition; the token NFAs are unioned together.
     *
     * @param input A scanner containing the input stream
     * @return The NFA generated from the scanner, or {@code null} if no token was defined
     * @throws java.util.NoSuchElementException if the input ends before a line starting
     *         with {@code '$'} is found
     * @throws RuntimeException if a definition line is malformed
     */
    public static NFA parseNFA(Scanner input) {
        NFA sum = null;
        String[] s = null;
        predefined = new LinkedList<CharacterClass>();

        // finding the first predefined class
        while (s == null) {
            s = split(input.nextLine());
        }

        // generating the predefined character classes
        while (input.hasNext() && s != null) {
            line = s[1];
            index = 0;
            CharacterClass def = charClass();
            if (def == null) {
                throw new RuntimeException(
                        "Improper line, no character class for: " + s[0]);
            }
            def.setName(s[0]);
            predefined.add(def);
            s = split(input.nextLine());
        }

        // printing out predefined character classes
        if (DEBUG) {
            for (CharacterClass c : predefined) {
                System.out.println(c.getName() + " " + c.getClassDescriptor());
            }
        }

        // Generating the NFA for each token and unioning them.
        while (input.hasNext()) {
            s = split(input.nextLine());
            if (s != null) {
                line = s[1];
                index = 0;
                NFA identifier = regEx();
                identifier.setGoalLabels(s[0]);
                if (sum == null) {
                    sum = identifier;
                } else {
                    sum = (NFA) sum.union(identifier);
                }
            }
        }
        return sum;
    }

    /**
     * &lt;reg-ex&gt;
     * Generates a NFA for a regular expression stored in 'line' starting at index 'index'.
     *
     * @return the generated NFA
     */
    private static NFA regEx() {
        return rExp();
    }

    /**
     * &lt;rexp&gt;
     * Generates a NFA for a regular expression stored in 'line' starting at index 'index'.
     *
     * @return the generated NFA
     */
    private static NFA rExp() {
        if (DEBUG) {
            System.out.println("rExp");
        }
        NFA exp = rExp1();
        // exp2 is null when no '|' follows; union is handed that null as-is.
        NFA exp2 = rExp_();
        return (NFA) exp.union(exp2);
    }

    /**
     * &lt;rexp'&gt;
     * Detects and consumes union operators.
     *
     * @return the generated NFA from the unioned regex, or {@code null} if the next
     *         character is not {@code '|'}
     */
    private static NFA rExp_() {
        if (DEBUG) {
            System.out.println("rExp'");
        }
        if (peek() == '|') {
            match('|');
            NFA exp = rExp1();
            NFA exp2 = rExp_();
            return (NFA) exp.union(exp2);
        }
        return null;
    }

    /**
     * &lt;rexp1&gt;
     * Detects and consumes everything other than {@code '|'}. An empty alternative is
     * represented by {@code NFA.nullNFA()}.
     *
     * @return the generated NFA
     */
    private static NFA rExp1() {
        if (DEBUG) {
            System.out.println("rExp1");
        }
        NFA exp1 = rExp2();
        if (exp1 == null) {
            exp1 = NFA.nullNFA();
        }
        // exp2 is null when nothing is concatenated; concat is handed that null as-is.
        NFA exp2 = rExp1_();
        return (NFA) exp1.concat(exp2);
    }

    /**
     * &lt;rexp1'&gt;
     * Detects concatenated expressions.
     *
     * @return the generated NFA, or {@code null} if no further expression follows
     */
    private static NFA rExp1_() {
        if (DEBUG) {
            System.out.println("rExp1'");
        }
        NFA exp1 = rExp2();
        if (exp1 != null) {
            NFA exp2 = rExp1_();
            return (NFA) exp1.concat(exp2);
        }
        return null;
    }

    /**
     * &lt;rexp2&gt;
     * Detects parenthesis, lone characters, and character classes.
     *
     * @return the generated NFA, or {@code null} if none of them starts at the current
     *         position
     */
    private static NFA rExp2() {
        if (DEBUG) {
            System.out.println("rExp2");
        }
        NFA exp;
        if (peek() == '(') {
            match('(');
            exp = rExp();
            match(')');
            exp = rExp2Tail(exp);
        } else if (peek() != '$' && isReChar()) {
            // isReChar has already consumed the backslash of an escaped character,
            // so peek() is the literal character in both cases.
            char c = peek();
            match(c);
            exp = CharacterClass.fromSet("" + c).getNFA();
            exp = rExp2Tail(exp);
        } else {
            exp = rExp3();
        }
        return exp;
    }

    /**
     * &lt;rexp2-tail&gt;
     * Detects a + or * operator following a nfa and returns the resulting nfa.
     *
     * @param nfa The preceding nfa
     * @return {@code nfa.concat(nfa.star())} for '+', {@code nfa.star()} for '*', or
     *         {@code nfa} itself when neither operator follows
     */
    public static NFA rExp2Tail(NFA nfa) {
        if (DEBUG) {
            System.out.println("rExp2Tail");
        }
        if (peek() == '+') {
            match('+');
            return (NFA) nfa.concat(nfa.star());
        }
        if (peek() == '*') {
            match('*');
            return (NFA) nfa.star();
        }
        return nfa;
    }

    /**
     * &lt;rexp3&gt;
     * Detects character classes and implied empty strings.
     *
     * @return the NFA of the detected character class, or {@code null} if there is none
     */
    public static NFA rExp3() {
        if (DEBUG) {
            System.out.println("rExp3");
        }
        CharacterClass chars = charClass();
        if (chars == null) {
            return null;
        }
        return chars.getNFA();
    }

    /**
     * &lt;char-class&gt;
     * Detects character classes: {@code '.'}, a bracketed class, or a predefined class.
     * A {@code '$'} that does not start a predefined class name is treated as the
     * literal character {@code '$'}.
     *
     * @return the detected character class, or {@code null} if none starts here
     */
    private static CharacterClass charClass() {
        if (DEBUG) {
            System.out.println("charClass");
        }
        if (peek() == '.') {
            match('.');
            return DOT;
        }
        if (peek() == '[') {
            // Skip the bracket directly: match() would also swallow a leading space,
            // which is significant inside a class.
            index++;
            return charClass1();
        }
        if (peek() == '$') {
            CharacterClass c = preClass();
            if (c == null) {
                match('$');
                c = CharacterClass.fromSet("$");
            }
            return c;
        }
        return null;
    }

    /**
     * &lt;char-class1&gt;
     * Detects exclude sets and ranges.
     *
     * @return the detected character class
     */
    private static CharacterClass charClass1() {
        if (DEBUG) {
            System.out.println("charClass1");
        }
        if (peek() == '^') {
            return excludeSet();
        }
        return charSetList(null);
    }

    /**
     * &lt;char-set-list&gt;
     * Detects characters within brackets.
     *
     * @param chars the characters found thus far, or {@code null} if none yet
     * @return the character class created from everything up to the closing bracket;
     *         {@code null} if the brackets were empty
     */
    private static CharacterClass charSetList(CharacterClass chars) {
        if (DEBUG) {
            System.out.println("charSetList");
        }
        if (peek() == ']') {
            match(']');
            return chars;
        }
        if (chars == null) {
            chars = charSet();
        } else {
            chars = chars.merge(charSet());
        }
        return charSetList(chars);
    }

    /**
     * &lt;char-set&gt;
     * Detects a character and returns the appropriate character class.
     * Will also detect character ranges.
     *
     * @return the character class
     * @throws RuntimeException if the next character is not a class character
     */
    private static CharacterClass charSet() {
        if (DEBUG) {
            System.out.println("charSet");
        }
        char c = peek();
        if (isClassChar()) {
            // isClassChar may have consumed a backslash, so read the character again.
            c = peek();
            matchClassChar(c);
        } else {
            throw new RuntimeException("" + c + " is not a class character");
        }
        return charSetTail(c);
    }

    /**
     * &lt;char-set-tail&gt;
     * Detects ranges following a detected character -
     * i.e. if 'a' was found then it looks for the "-z".
     *
     * @param c the detected char
     * @return a character class based on whether or not a range was detected
     * @throws RuntimeException if the range end is not a class character or lies
     *         before {@code c}
     */
    private static CharacterClass charSetTail(char c) {
        if (DEBUG) {
            System.out.println("charSetTail");
        }
        if (peek() == '-') {
            matchClassChar('-');
            char d;
            if (isClassChar()) {
                d = peek();
                matchClassChar(d);
            } else {
                throw new RuntimeException("" + peek() + " is not a class character");
            }
            if (d < c) {
                throw new RuntimeException("" + c + "-" + d + " is not a valid range.");
            }
            return CharacterClass.fromRange("" + c + '-' + d);
        }
        return CharacterClass.fromSet("" + c);
    }

    /**
     * &lt;exclude-set&gt;
     * Generates a character class from an exclude set of the form
     * {@code ^<char-set>] IN <exclude-set-tail>}.
     *
     * @return the character class.
     */
    private static CharacterClass excludeSet() {
        if (DEBUG) {
            System.out.println("excludeSet");
        }
        match('^');
        CharacterClass excluded = charSet();
        match(']');
        match('I');
        match('N');
        return CharacterClass.fromExclude(excluded, excludeSetTail());
    }

    /**
     * &lt;exclude-set-tail&gt;
     *
     * @return the character class from which characters are excluded; this is the result
     *         of {@link #preClass()} (possibly {@code null}) when the tail starts with
     *         {@code '$'}
     */
    private static CharacterClass excludeSetTail() {
        if (DEBUG) {
            System.out.println("excludeSetTail");
        }
        if (peek() == '$') {
            return preClass();
        }
        matchCharSetOpen();
        CharacterClass chars = charSet();
        match(']');
        return chars;
    }

    /**
     * Checks the expression for predefined classes starting at the current index.
     * When several class names match, the longest name wins, so that a class whose name
     * is a prefix of another one cannot shadow it; equally long names resolve to the one
     * defined first. Consumes the name and any following spaces if a class is matched.
     *
     * @return the predefined class, or null if none matched.
     */
    private static CharacterClass preClass() {
        if (DEBUG) {
            System.out.println("preClass");
        }
        CharacterClass best = null;
        for (CharacterClass candidate : predefined) {
            String name = candidate.getName();
            if (line.startsWith(name, index)
                    && (best == null || name.length() > best.getName().length())) {
                best = candidate;
            }
        }
        if (best == null) {
            return null;
        }
        index += best.getName().length();
        if (peek() == ' ') {
            match(' ');
        }
        if (DEBUG) {
            System.out.println("matched " + best.getName());
        }
        return best;
    }

    /**
     * Matches the char c with the next char. Will return false if they do not match;
     * the mismatch is reported on standard output and the offending character is skipped.
     * Also increments the index of the string. Consumes any subsequent spaces unless
     * {@code c} is a backslash.
     *
     * @param c the expected character
     * @return true if c is the next character, false otherwise
     * @throws StringIndexOutOfBoundsException if the expression is already exhausted
     */
    private static boolean match(char c) {
        if (DEBUG) {
            System.out.println("matched " + c);
        }
        if (line.charAt(index) == c) {
            index++;
            if (peek() == ' ' && c != '\\') {
                match(' ');
            }
            return true;
        }
        System.out.println("" + c + " expected, " + line.charAt(index) + " found");
        index++;
        return false;
    }

    /**
     * A match for {@code '['} which does not remove spaces within a newly opened
     * character class.
     *
     * @return true if {@code '['} is the next character, false otherwise
     * @throws StringIndexOutOfBoundsException if the expression is already exhausted
     */
    private static boolean matchCharSetOpen() {
        if (DEBUG) {
            System.out.println("matched [");
        }
        if (line.charAt(index) == '[') {
            index++;
            return true;
        }
        System.out.println("[ expected, " + line.charAt(index) + " found");
        index++;
        return false;
    }

    /**
     * Matches the char c with the next char. Will return false if they do not match.
     * Also increments the index of the string. Does NOT consume any subsequent whitespace.
     *
     * @param c the expected character
     * @return true if c is the next character, false otherwise
     * @throws StringIndexOutOfBoundsException if the expression is already exhausted
     */
    private static boolean matchClassChar(char c) {
        if (DEBUG) {
            System.out.println("matched in class def " + c);
        }
        if (line.charAt(index) == c) {
            index++;
            return true;
        }
        System.out.println("" + c + " expected in class def, " + line.charAt(index) + " found");
        index++;
        return false;
    }

    /**
     * @return the next char, or {@code (char)-1} when the expression is exhausted
     */
    private static char peek() {
        if (index < line.length()) {
            return line.charAt(index);
        }
        return (char) -1;
    }

    /**
     * Checks whether the next character is a reChar. If it is a backslash, the backslash
     * is consumed and the character after it must be one of {@link #escape}.
     *
     * @return whether or not the next character is a reChar.
     * @throws RuntimeException if a backslash is followed by a character that does not
     *         need escaping
     */
    private static boolean isReChar() {
        char c = peek();
        if (c == '\\') {
            index++;
            if (DEBUG) {
                System.out.println("Escape found");
            }
            if (!isEscape(peek())) {
                throw new RuntimeException("Invalid escape. " + peek()
                        + " was escaped but is not an escaped character.");
            }
            return true;
        }
        if (c < 0x20 || c > 0x7e) {
            return false;
        }
        return !isEscape(c);
    }

    /**
     * Checks whether the next character is a class character. If it is a backslash, the
     * backslash is consumed and the character after it must be one of
     * {@link #classEscape}.
     *
     * @return whether or not the next character is a class character. Consumes escapes.
     * @throws RuntimeException if a backslash is followed by a character that does not
     *         need escaping inside a class
     */
    private static boolean isClassChar() {
        char c = peek();
        if (c == '\\') {
            matchClassChar(c);
            if (!isClassEscape(peek())) {
                throw new RuntimeException("Invalid escape. " + peek()
                        + " was escaped in a class definition.");
            }
            return true;
        }
        if (c < 0x20 || c > 0x7e) {
            return false;
        }
        return !isClassEscape(c);
    }

    /**
     * Used for splitting a line into its identifier and regex. The identifier is
     * everything up to the first whitespace character; the regex is everything after the
     * whitespace that follows it.
     *
     * @param line the line to split
     * @return a string array where s[0] is the token and s[1] is the regex, or
     *         {@code null} if the line is null, empty, or does not start with '$'
     * @throws RuntimeException if the line holds an identifier but no regex
     */
    private static String[] split(String line) {
        if (line == null || line.length() == 0 || line.charAt(0) != '$') {
            return null;
        }
        // Use a local cursor only: the shared parser field 'index' belongs to whatever
        // expression was parsed last and says nothing about this line.
        int i = 0;
        while (i < line.length() && !Character.isWhitespace(line.charAt(i))) {
            i++;
        }
        String name = line.substring(0, i);
        while (i < line.length() && Character.isWhitespace(line.charAt(i))) {
            i++;
        }
        if (i == line.length()) {
            throw new RuntimeException(
                    "Improper line, no regular expression for token: " + name);
        }
        return new String[] {name, line.substring(i)};
    }

    /**
     * @param c the character to check
     * @return whether or not c must be escaped as a ReChar.
     */
    private static boolean isEscape(char c) {
        for (char d : escape) {
            if (c == d) {
                return true;
            }
        }
        return false;
    }

    /**
     * @param c the character to check
     * @return whether or not c must be escaped as a class char.
     */
    private static boolean isClassEscape(char c) {
        for (char d : classEscape) {
            if (c == d) {
                return true;
            }
        }
        return false;
    }

    /**
     * Class characters that must be escaped
     */
    public final static char[] classEscape = {'\\', '^', '[', ']', '-'};

    /**
     * ReChar that must be escaped
     */
    public final static char[] escape =
            {'\\', '*', '+', '?', '|', '[', ']', '(', ')', '.', '\'', '\"', ' '};
}