package edu.cmu.minorthird.text.mixup; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.log4j.Logger; public class Statement implements Serializable{ static private final long serialVersionUID=20080303L; private static Logger log=Logger.getLogger(Statement.class); public static int REGEX=1,MIXUP=2,FILTER=3,PROVIDE=4,REQUIRE=5,DECLARE=6, TRIE=7,ANNOTATE_WITH=8; //TODO: We should handle these properties better, possibly using a java properties object // encodes the statement properties private String keyword,property,type,startType,value; // set of words, for a dictionary private Set<String> wordSet=null; // file containing dictionary // private File dictFile=null; // Variable for whether to ignore case in dictionary private boolean ignoreCase; // split string for retokenizing textBase private String split,patt; // current tokenization level private String level; // Variables that define the level and type to be imported to the current textBase private String importLevel,importType,oldType; // encode generator private int statementType; // for statementType = MIXUP or FILTER private Mixup mixupExpr=null; // for statementType TRIE private List<String> phraseList; // for statementType = REGEX private String regex=null; private int regexGroup; // for statementType=PROVIDE,REQUIRE,ANNOTATEWITH,DICTIONARY private String annotationType,fileToLoad; private List<String> filesToLoad; // for parsing // private Matcher matcher; // for TRIE private int lastTokenStart; private String input; private static Set<String> generatorStart=new HashSet<String>(); private static Set<String> legalKeywords=new HashSet<String>(); private static Set<String> colonEqualsOrCase=new HashSet<String>(); private static Set<String> defLevelType=new HashSet<String>(); static{ legalKeywords.add("defTokenProp"); legalKeywords.add("defSpanProp"); legalKeywords.add("defSpanType"); legalKeywords.add("defDict"); legalKeywords.add("declareSpanType"); legalKeywords.add("provide"); legalKeywords.add("require"); legalKeywords.add("defLevel"); legalKeywords.add("onLevel"); legalKeywords.add("offLevel"); legalKeywords.add("importFromLevel"); } static{ colonEqualsOrCase.add(":"); colonEqualsOrCase.add("="); colonEqualsOrCase.add("case"); } static{ generatorStart.add(":"); generatorStart.add("~"); generatorStart.add("-"); } static{ defLevelType.add("re"); defLevelType.add("split"); defLevelType.add("filter"); defLevelType.add("pseudotoken"); } // // constructor and parser // Statement(Mixup.MixupTokenizer tok,String firstTok) throws Mixup.ParseException{ keyword=firstTok; if(keyword.equals("declareSpanType")){ statementType=DECLARE; type=tok.advance(null); tok.advance(null); // advance to end-of-statement marker return; } if(keyword.equals("provide")){ statementType=PROVIDE; annotationType=tok.advance(null); if(annotationType.charAt(0)=='\''){ annotationType=annotationType.substring(1,annotationType.length()-1); } // added to parse ";" -frank tok.advance(null); return; } if(keyword.equals("annotateWith")){ statementType=ANNOTATE_WITH; fileToLoad=tok.advance(null); if(fileToLoad.charAt(0)=='\''){ fileToLoad=fileToLoad.substring(1,fileToLoad.length()-1); } tok.advance(null); return; } if(keyword.equals("require")){ statementType=REQUIRE; annotationType=tok.advance(null); if(annotationType.charAt(0)=='\''){ annotationType=annotationType.substring(1,annotationType.length()-1); } String marker=tok.advance(null); //Collections.singleton(",")); log.debug("marker: "+marker); if(marker!=null){ fileToLoad=tok.advance(null); if(fileToLoad.charAt(0)=='\'') fileToLoad=fileToLoad.substring(1,fileToLoad.length()-1); tok.advance(null); } return; } if("onLevel".equals(keyword)||"offLevel".equals(keyword)){ level=tok.advance(null); tok.advance(null); return; } if("importFromLevel".equals(keyword)){ importLevel=tok.advance(null); // continue to parse NEWTYPE = OLDTYPE importType=tok.advance(null); // read property or type tok.advance(Collections.singleton("=")); oldType=tok.advance(null); tok.advance(null); // advance to end-of-statement marker return; } String propOrType=tok.advance(null); // read property or type // importType = propOrType; String token=tok.advance(colonEqualsOrCase); // read ':' or '=' if(":".equals(token)){ if(!"defSpanProp".equals(keyword)&&!"defTokenProp".equals(keyword)){ parseError("can't define properties here"); } property=propOrType; type=null; value=tok.advance(null); tok.advance(Collections.singleton("=")); }else if("case".equals(token)){ if(!"defDict".equals(keyword)) parseError("illegal keyword usage"); }else{ // token is '=' if(!"defSpanType".equals(keyword)&&!"defDict".equals(keyword)&& !"defLevel".equals(keyword)){ parseError("illegal keyword usage"); } if(!"=".equals(token)){ parseError("expected '='"); } type=propOrType; property=null; } if("defDict".equals(keyword)){ // syntax is "defDict [+case] dictName = ", so either // propOrType = dictName and token = '=', or else // propOrType = + and token = 'case', or else ignoreCase=true; if("case".equals(token)){ ignoreCase=false; if(!"+".equals(propOrType)) parseError("illegal defDict"); type=tok.advance(null); tok.advance(Collections.singleton("=")); }else{ type=propOrType; } wordSet=new HashSet<String>(); filesToLoad=new ArrayList<String>(); while(true){ String w=tok.advance(null); // read in each line of the file name embraced by double quotes if(w.equals("\"")){ StringBuffer defFile=new StringBuffer(""); while(!(w=tok.advance(null)).equals("\"")) defFile.append(w); fileToLoad=defFile.toString(); filesToLoad.add(fileToLoad); }else{ wordSet.add(ignoreCase?w.toLowerCase():w); } String sep=tok.advance(null); if(sep==null) break; else if(!",".equals(sep)) parseError("expected comma"); } }else if("defLevel".equals(keyword)){ split=tok.advance(defLevelType); patt=tok.advance(null); if(patt.charAt(0)=='\''&&patt.charAt(patt.length()-1)=='\'') patt=patt.substring(1,patt.length()-1); tok.advance(null); }else{ // GEN // should be at '=' sign or starttype token=tok.advance(null); if(generatorStart.contains(token)){ startType="top"; }else{ startType=token; token=tok.advance(generatorStart); } if(token.equals(":")){ statementType=MIXUP; //mixupExpr = new Mixup( tok.input.substring(tok.matcher.end(1),tok.input.length()) ); //if(tok.advance()) if(tok.advance()) mixupExpr=new Mixup(tok); }else if(token.equals("-")){ statementType=FILTER; //mixupExpr = new Mixup( tok.input.substring(tok.matcher.end(1),tok.input.length()) ); //if(tok.advance()) if(tok.advance()) mixupExpr=new Mixup(tok); }else if(token.equals("~")){ token=tok.advance(null); if("re".equals(token)){ statementType=REGEX; regex=tok.advance(null); if(regex.startsWith("'")){ regex=regex.substring(1,regex.length()-1); regex=regex.replaceAll("\\\\'","'"); } token=tok.advance(Collections.singleton(",")); token=tok.advance(null); try{ regexGroup=Integer.parseInt(token); token=tok.advance(null); }catch(NumberFormatException e){ parseError("expected a regex group number and saw "+token); } }else if("trie".equals(token)){ statementType=TRIE; phraseList=new ArrayList<String>(); String word=tok.advance(null); word.trim(); String fullWord=""; while(word!=null){ if(!word.equals(",")){ fullWord=fullWord+word+" "; }else{ fullWord.trim(); phraseList.add(fullWord); fullWord=""; } word=tok.advance(null); } phraseList.add(fullWord); //String[] phrases = (String[])phraseList.toArray(); }else{ parseError("expected 're' or 'trie'"); } }else{ throw new IllegalStateException("unexpected generatorStart '"+token+"'"); } } } /** convert a set to a string listing the elements */ // private String setContents(Set set){ // StringBuffer buf=new StringBuffer(""); // for(Iterator i=set.iterator();i.hasNext();){ // if(buf.length()>0) // buf.append(" "); // buf.append("'"+i.next().toString()+"'"); // } // return buf.toString(); // } // an error message private String parseError(String msg) throws Mixup.ParseException{ throw new Mixup.ParseException("statement error at char "+lastTokenStart+ ": "+msg+"\nin '"+input+"'"); } public String toString(){ if("defDict".equals(keyword)||"defLevel".equals(keyword)){ return keyword+" "+type+" = ... "; }else if("onLevel".equals(keyword)||"offLevel".equals(keyword)){ return keyword+" "+level; }else if("importFromLevel".equals(keyword)){ return keyword+" "+importLevel+" "+importType+" = "+oldType; }else if(statementType==DECLARE){ return keyword+" "+type; }else if(statementType==PROVIDE){ return keyword+" "+annotationType; }else if(statementType==REQUIRE){ return keyword+" "+annotationType+","+fileToLoad; }else if(statementType==ANNOTATE_WITH){ return keyword+" "+fileToLoad; }else{ String genString="???"; if(statementType==MIXUP){ genString=": "+mixupExpr.toString(); }else if(statementType==FILTER){ genString="- "+mixupExpr.toString(); }else if(statementType==REGEX){ genString="~ re '"+regex+"' ,"+regexGroup; }else if(statementType==TRIE){ genString="~ trie ..."; } if(type!=null){ return keyword+" "+type+" ="+startType+genString; }else{ return keyword+" "+property+":"+value+" ="+startType+genString; } } } // // From here down are public accessors to the properties of this Statement. In the future // this should be changed to use a better data store for less cumbersome access // /** * Returns an integer representing the type this Statement is. Valid types are: * DECLARE, PROVIDE, REQUIRE, ANNOTATE_WITH, MIXUP, FILTER, REGEX, and TRIE. */ public int getStatementType(){ return statementType; } /** * Returns the keyword that defines what this Statement does. */ public String getKeyword(){ return keyword; } /** * Returns a list of the files that need to be loaded if this Statement * defines a dictionary. */ public List<String> getFilesToLoad(){ return filesToLoad; } /** * Returns the file that needs to be loaded in this Statement is an * ANNOTATE_WITH or REQUIRE statement. */ public String getFileToLoad(){ return fileToLoad; } /** * Returns the type that this Statement matches. */ public String getType(){ return type; } /** * Returns the property that this statement matches */ public String getProperty(){ return property; } /** * Returns the value that this statement will match. */ public String getValue(){ return value; } /** * Returns whether or not this statement will ignore case when defining a dictionary. */ public boolean getIgnoreCase(){ return ignoreCase; } /** * Returns the set of words defining a dictionary in the case that this statement * defines a dictionary inline. */ public Set<String> getWordSet(){ return wordSet; } /** * Returns the type of level to create when this Statement is defining a level. */ public String getSplit(){ return split; } /** * Returns the pattern that is used to create a new level when this statement * is defining a new level. */ public String getPatt(){ return patt; } /** * Returns the level name to be used when this statement is performing a level * operation (onLevel, offLeve, defLevel, importFromLevel) */ public String getLevel(){ return level; } /** * Returns the type from the source level that should be imported when this statement * executes an importFromLevel call. */ public String getOldType(){ return oldType; } /** * Returns the type that imported spans should be called when this statement * executes an importFromLevel call. */ public String getImportType(){ return importType; } /** * Returns the level that this statement will import from in a call to importFromLevel. */ public String getImportLevel(){ return importLevel; } /** * Returns the type that this statement either provides or requires. */ public String getAnnotationType(){ return annotationType; } /** * Returns the starting type in the case that this statement is a generator statement. */ public String getStartType(){ return startType; } /** * Returns the mixup expression that this statement will execute. */ public Mixup getMixupExpr(){ return mixupExpr; } /** * Returns the phrase list for when this statement will define a trie. */ public List<String> getPhraseList(){ return phraseList; } /** * Returns the regex string that will be executed by this statement. */ public String getRegex(){ return regex; } /** * Returns the regex group that will be returned when this statement executes. */ public int getRegexGroup(){ return regexGroup; } }