// Statement.java example
//
// Explorer
// MinorThird-master
package edu.cmu.minorthird.text.mixup;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;

/**
 * One parsed statement of a Mixup program.
 *
 * <p>The package-private constructor consumes the tokens of a single statement
 * from a {@link Mixup.MixupTokenizer} and records what it found in the fields
 * exposed by the accessors below. Which fields are populated depends on the
 * statement keyword; fields that do not apply to a statement keep their default
 * values ({@code null}, {@code 0} or {@code false}).
 *
 * <p>The parser relies on {@code tok.advance(null)} returning {@code null} once
 * the end of the statement has been reached (see the null checks in the
 * {@code require}, {@code defDict} and {@code trie} branches).
 */
public class Statement implements Serializable{

	static private final long serialVersionUID=20080303L;

	private static final Logger log=Logger.getLogger(Statement.class);

	/** Statement type codes reported by {@link #getStatementType()}. */
	public static final int REGEX=1,MIXUP=2,FILTER=3,PROVIDE=4,REQUIRE=5,DECLARE=6,
			TRIE=7,ANNOTATE_WITH=8;

	//TODO: We should handle these properties better, possibly using a java properties object
	// encodes the statement properties
	private String keyword,property,type,startType,value;

	// set of words, for a dictionary
	private Set<String> wordSet=null;

	// Variable for whether to ignore case in dictionary
	private boolean ignoreCase;

	// for defLevel: the kind of level (one of defLevelType) and its pattern
	private String split,patt;

	// level named by an onLevel/offLevel statement
	private String level;

	// Variables that define the level and type to be imported to the current textBase
	private String importLevel,importType,oldType;

	// encode generator; stays 0 for statements that never assign a type code
	private int statementType;

	// for statementType = MIXUP or FILTER
	private Mixup mixupExpr=null;

	// for statementType TRIE
	private List<String> phraseList;

	// for statementType = REGEX
	private String regex=null;

	private int regexGroup;

	// for statementType=PROVIDE,REQUIRE,ANNOTATEWITH,DICTIONARY
	private String annotationType,fileToLoad;

	private List<String> filesToLoad;

	// Only read by parseError(); never assigned anywhere in this class, so the
	// error message currently reports the default values (0 and null).
	private int lastTokenStart;

	private String input;

	// tokens that introduce a generator (':' mixup, '~' regex/trie, '-' filter)
	private static final Set<String> generatorStart=new HashSet<String>();

	// known statement keywords; not consulted within this class
	private static final Set<String> legalKeywords=new HashSet<String>();

	// tokens allowed right after the first name of a def* statement
	private static final Set<String> colonEqualsOrCase=new HashSet<String>();

	// level kinds accepted by defLevel
	private static final Set<String> defLevelType=new HashSet<String>();
	static{
		legalKeywords.add("defTokenProp");
		legalKeywords.add("defSpanProp");
		legalKeywords.add("defSpanType");
		legalKeywords.add("defDict");
		legalKeywords.add("declareSpanType");
		legalKeywords.add("provide");
		legalKeywords.add("require");
		legalKeywords.add("defLevel");
		legalKeywords.add("onLevel");
		legalKeywords.add("offLevel");
		legalKeywords.add("importFromLevel");
	}
	static{
		colonEqualsOrCase.add(":");
		colonEqualsOrCase.add("=");
		colonEqualsOrCase.add("case");
	}
	static{
		generatorStart.add(":");
		generatorStart.add("~");
		generatorStart.add("-");
	}
	static{
		defLevelType.add("re");
		defLevelType.add("split");
		defLevelType.add("filter");
		defLevelType.add("pseudotoken");
	}

	//
	// constructor and parser
	//

	/**
	 * Parses one statement.
	 *
	 * @param tok tokenizer positioned just after the statement keyword
	 * @param firstTok the statement keyword, already read by the caller
	 * @throws Mixup.ParseException if the statement is malformed, including
	 *         when it ends before a required token has been read
	 */
	Statement(Mixup.MixupTokenizer tok,String firstTok)
			throws Mixup.ParseException{
		keyword=firstTok;
		if(keyword.equals("declareSpanType")){
			statementType=DECLARE;
			type=tok.advance(null);
			tok.advance(null); // advance to end-of-statement marker
			return;
		}
		if(keyword.equals("provide")){
			statementType=PROVIDE;
			annotationType=stripQuotes(expectToken(tok.advance(null),"an annotation type"));
			// added to parse ";" -frank
			tok.advance(null);
			return;
		}
		if(keyword.equals("annotateWith")){
			statementType=ANNOTATE_WITH;
			fileToLoad=stripQuotes(expectToken(tok.advance(null),"a file name"));
			tok.advance(null);
			return;
		}
		if(keyword.equals("require")){
			statementType=REQUIRE;
			annotationType=stripQuotes(expectToken(tok.advance(null),"an annotation type"));
			// a non-null token here means a file name follows
			String marker=tok.advance(null);
			log.debug("marker: "+marker);
			if(marker!=null){
				fileToLoad=stripQuotes(expectToken(tok.advance(null),"a file name"));
				tok.advance(null);
			}
			return;
		}
		if("onLevel".equals(keyword)||"offLevel".equals(keyword)){
			level=tok.advance(null);
			tok.advance(null);
			return;
		}
		if("importFromLevel".equals(keyword)){
			importLevel=tok.advance(null);
			// continue to parse NEWTYPE = OLDTYPE
			importType=tok.advance(null); // read property or type
			tok.advance(Collections.singleton("="));
			oldType=tok.advance(null);
			tok.advance(null); // advance to end-of-statement marker
			return;
		}

		// Remaining keywords share the prefix "KEYWORD name (':' value)? '='",
		// except "defDict + case name =".
		String propOrType=tok.advance(null); // read property or type
		String token=tok.advance(colonEqualsOrCase); // read ':', '=' or 'case'
		if(":".equals(token)){
			if(!"defSpanProp".equals(keyword)&&!"defTokenProp".equals(keyword)){
				parseError("can't define properties here");
			}
			property=propOrType;
			type=null;
			value=tok.advance(null);
			tok.advance(Collections.singleton("="));
		}else if("case".equals(token)){
			if(!"defDict".equals(keyword)){
				parseError("illegal keyword usage");
			}
		}else{
			// token is '='
			if(!"defSpanType".equals(keyword)&&!"defDict".equals(keyword)&&
					!"defLevel".equals(keyword)){
				parseError("illegal keyword usage");
			}
			if(!"=".equals(token)){
				parseError("expected '='");
			}
			type=propOrType;
			property=null;
		}

		if("defDict".equals(keyword)){
			// syntax is "defDict [+case] dictName = ", so either
			// propOrType = dictName and token = '=', or else
			// propOrType = + and token = 'case'
			ignoreCase=true;
			if("case".equals(token)){
				ignoreCase=false;
				if(!"+".equals(propOrType)){
					parseError("illegal defDict");
				}
				type=tok.advance(null);
				tok.advance(Collections.singleton("="));
			}else{
				type=propOrType;
			}
			wordSet=new HashSet<String>();
			filesToLoad=new ArrayList<String>();
			while(true){
				String w=expectToken(tok.advance(null),"a dictionary word or quoted file name");
				if(w.equals("\"")){
					// a file name: concatenate every token up to the closing double quote
					StringBuilder defFile=new StringBuilder();
					while(!(w=expectToken(tok.advance(null),
							"a closing '\"' for the dictionary file name")).equals("\"")){
						defFile.append(w);
					}
					fileToLoad=defFile.toString();
					filesToLoad.add(fileToLoad);
				}else{
					wordSet.add(ignoreCase?w.toLowerCase():w);
				}
				String sep=tok.advance(null);
				if(sep==null){
					break;
				}else if(!",".equals(sep)){
					parseError("expected comma");
				}
			}
		}else if("defLevel".equals(keyword)){
			split=tok.advance(defLevelType);
			patt=expectToken(tok.advance(null),"a level pattern");
			if(patt.charAt(0)=='\''&&patt.charAt(patt.length()-1)=='\''){
				patt=patt.substring(1,patt.length()-1);
			}
			tok.advance(null);
		}else{
			// GEN
			// should be at a generator start or at a start type
			token=tok.advance(null);
			if(generatorStart.contains(token)){
				startType="top";
			}else{
				startType=token;
				token=tok.advance(generatorStart);
			}
			if(token.equals(":")){
				statementType=MIXUP;
				if(tok.advance()){
					mixupExpr=new Mixup(tok);
				}
			}else if(token.equals("-")){
				statementType=FILTER;
				if(tok.advance()){
					mixupExpr=new Mixup(tok);
				}
			}else if(token.equals("~")){
				token=tok.advance(null);
				if("re".equals(token)){
					statementType=REGEX;
					regex=expectToken(tok.advance(null),"a regular expression");
					if(regex.startsWith("'")){
						// drop the surrounding quotes and unescape \' inside the pattern
						regex=regex.substring(1,regex.length()-1);
						regex=regex.replaceAll("\\\\'","'");
					}
					tok.advance(Collections.singleton(","));
					token=tok.advance(null);
					try{
						regexGroup=Integer.parseInt(token);
						tok.advance(null);
					}catch(NumberFormatException e){
						parseError("expected a regex group number and saw "+token);
					}
				}else if("trie".equals(token)){
					statementType=TRIE;
					phraseList=new ArrayList<String>();
					String word=expectToken(tok.advance(null),"at least one trie phrase");
					// Words are joined with single spaces; a ',' token ends the phrase.
					// Each phrase is trimmed so it carries no trailing separator.
					StringBuilder phrase=new StringBuilder();
					while(word!=null){
						if(",".equals(word)){
							phraseList.add(phrase.toString().trim());
							phrase.setLength(0);
						}else{
							phrase.append(word).append(' ');
						}
						word=tok.advance(null);
					}
					phraseList.add(phrase.toString().trim());
				}else{
					parseError("expected 're' or 'trie'");
				}
			}else{
				throw new IllegalStateException("unexpected generatorStart '"+token+"'");
			}
		}
	}

	/**
	 * Removes the first and last character of {@code s} when it starts with a
	 * single quote; otherwise returns {@code s} unchanged.
	 */
	private static String stripQuotes(String s){
		if(s.charAt(0)=='\''){
			return s.substring(1,s.length()-1);
		}
		return s;
	}

	/**
	 * Returns {@code token}, or reports a parse error when it is {@code null},
	 * i.e. when the statement ended where {@code what} was required.
	 */
	private String expectToken(String token,String what) throws Mixup.ParseException{
		if(token==null){
			parseError("expected "+what);
		}
		return token;
	}

	/**
	 * Always throws a {@link Mixup.ParseException} carrying {@code msg}; the
	 * declared return type only lets callers use it in expression position.
	 */
	private String parseError(String msg) throws Mixup.ParseException{
		throw new Mixup.ParseException("statement error at char "+lastTokenStart+
				": "+msg+"\nin '"+input+"'");
	}

	/**
	 * Returns an approximate textual form of the statement, chosen by keyword
	 * first and statement type second.
	 */
	@Override
	public String toString(){
		if("defDict".equals(keyword)||"defLevel".equals(keyword)){
			return keyword+" "+type+" = ... ";
		}else if("onLevel".equals(keyword)||"offLevel".equals(keyword)){
			return keyword+" "+level;
		}else if("importFromLevel".equals(keyword)){
			return keyword+" "+importLevel+" "+importType+" = "+oldType;
		}else if(statementType==DECLARE){
			return keyword+" "+type;
		}else if(statementType==PROVIDE){
			return keyword+" "+annotationType;
		}else if(statementType==REQUIRE){
			return keyword+" "+annotationType+","+fileToLoad;
		}else if(statementType==ANNOTATE_WITH){
			return keyword+" "+fileToLoad;
		}else{
			String genString="???";
			// mixupExpr may be null when the tokenizer had nothing after ':' or '-',
			// so rely on string concatenation rather than calling toString() on it
			if(statementType==MIXUP){
				genString=": "+mixupExpr;
			}else if(statementType==FILTER){
				genString="- "+mixupExpr;
			}else if(statementType==REGEX){
				genString="~ re '"+regex+"' ,"+regexGroup;
			}else if(statementType==TRIE){
				genString="~ trie ...";
			}
			if(type!=null){
				return keyword+" "+type+" ="+startType+genString;
			}else{
				return keyword+" "+property+":"+value+" ="+startType+genString;
			}
		}
	}

	//
	// From here down are public accessors to the properties of this Statement.  In the future
	// this should be changed to use a better data store for less cumbersome access
	//

	/**
	 * Returns an integer representing the type this Statement is.  Valid types are:
	 * DECLARE, PROVIDE, REQUIRE, ANNOTATE_WITH, MIXUP, FILTER, REGEX, and TRIE.
	 * Statements that set no type code (defDict, defLevel, onLevel, offLevel,
	 * importFromLevel) report 0.
	 */
	public int getStatementType(){
		return statementType;
	}

	/**
	 * Returns the keyword that defines what this Statement does.
	 */
	public String getKeyword(){
		return keyword;
	}

	/**
	 * Returns a list of the files that need to be loaded if this Statement
	 * defines a dictionary, or null for any other statement.
	 */
	public List<String> getFilesToLoad(){
		return filesToLoad;
	}

	/**
	 * Returns the file that needs to be loaded if this Statement is an
	 * ANNOTATE_WITH or REQUIRE statement.  For a dictionary statement this is
	 * the last file name that was parsed.
	 */
	public String getFileToLoad(){
		return fileToLoad;
	}

	/**
	 * Returns the type that this Statement matches.
	 */
	public String getType(){
		return type;
	}

	/**
	 * Returns the property that this statement matches
	 */
	public String getProperty(){
		return property;
	}

	/**
	 * Returns the value that this statement will match.
	 */
	public String getValue(){
		return value;
	}

	/**
	 * Returns whether or not this statement will ignore case when defining a dictionary.
	 */
	public boolean getIgnoreCase(){
		return ignoreCase;
	}

	/**
	 * Returns the set of words defining a dictionary in the case that this statement
	 * defines a dictionary inline.  Words are stored lower-cased when case is ignored.
	 */
	public Set<String> getWordSet(){
		return wordSet;
	}

	/**
	 * Returns the type of level to create when this Statement is defining a level.
	 */
	public String getSplit(){
		return split;
	}

	/**
	 * Returns the pattern that is used to create a new level when this statement
	 * is defining a new level.
	 */
	public String getPatt(){
		return patt;
	}

	/**
	 * Returns the level name of an onLevel or offLevel statement.  It is not set
	 * for other statements: defLevel stores its name as the type, and
	 * importFromLevel uses {@link #getImportLevel()}.
	 */
	public String getLevel(){
		return level;
	}

	/**
	 * Returns the type from the source level that should be imported when this statement
	 * executes an importFromLevel call.
	 */
	public String getOldType(){
		return oldType;
	}

	/**
	 * Returns the type that imported spans should be called when this statement
	 * executes an importFromLevel call.
	 */
	public String getImportType(){
		return importType;
	}

	/**
	 * Returns the level that this statement will import from in a call to importFromLevel.
	 */
	public String getImportLevel(){
		return importLevel;
	}

	/**
	 * Returns the type that this statement either provides or requires.
	 */
	public String getAnnotationType(){
		return annotationType;
	}

	/**
	 * Returns the starting type in the case that this statement is a generator statement.
	 * It is "top" when the statement names no start type.
	 */
	public String getStartType(){
		return startType;
	}

	/**
	 * Returns the mixup expression that this statement will execute.
	 */
	public Mixup getMixupExpr(){
		return mixupExpr;
	}

	/**
	 * Returns the phrase list for when this statement will define a trie.
	 * Each phrase is its words joined by single spaces.
	 */
	public List<String> getPhraseList(){
		return phraseList;
	}

	/**
	 * Returns the regex string that will be executed by this statement.
	 */
	public String getRegex(){
		return regex;
	}

	/**
	 * Returns the regex group that will be returned when this statement executes.
	 */
	public int getRegexGroup(){
		return regexGroup;
	}
}