MixupInterpreter.java example

Explorer
MinorThird-master
package edu.cmu.minorthird.text.mixup;

import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.Stack;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

import edu.cmu.minorthird.text.BasicTextLabels;
import edu.cmu.minorthird.text.MonotonicTextLabels;
import edu.cmu.minorthird.text.RegexTokenizer;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.SpanTypeTokenizer;
import edu.cmu.minorthird.text.SplitTokenizer;
import edu.cmu.minorthird.text.TextBase;
import edu.cmu.minorthird.text.TextBaseManager;
import edu.cmu.minorthird.text.TextLabels;
import edu.cmu.minorthird.text.TextToken;
import edu.cmu.minorthird.text.Tokenizer;

public class MixupInterpreter{

	private static Logger log=Logger.getLogger(MixupInterpreter.class);

	private MixupProgram program=null;

	private Map<String,MonotonicTextLabels> levelsToLabelsMap=
			new HashMap<String,MonotonicTextLabels>();

	private Stack<String> levelStack=new Stack<String>();

	private TextBaseManager tbManager;

	// Constructors
	public MixupInterpreter(){
		;
	}

	public MixupInterpreter(MixupProgram p){
		program=p;
	}

	public MixupInterpreter(MixupProgram p,MonotonicTextLabels rootLabels){
		program=p;
		tbManager=new TextBaseManager("root",rootLabels.getTextBase());
		levelsToLabelsMap.put("root",rootLabels);
		levelStack.push("root");
	}

	/** Effectively clears the current state and executes the current program on the
	 * specified TextLabels.  The specified labels will become the root level of the
	 * new labels hierarchy.
	 */
	public void eval(MonotonicTextLabels labels){
		// Check to make sure that there is an actual program to evaluate
		if(program==null)
			throw new IllegalStateException(
					"You must set the MixupProgram prior to calling eval.");

		// Clear out the state and start fresh with the specified labels
		tbManager=new TextBaseManager("root",labels.getTextBase());
		levelsToLabelsMap.clear();
		levelsToLabelsMap.put("root",labels);
		levelStack=new Stack<String>();
		levelStack.push("root");

		// Evaluate the program on this new state.
		this.eval();
	}

	/** Runs the current program on the current state of labels (current level).  This
	 * method is useful if you have already executed a program, called setProgram() and 
	 * want to run the new progam against the current state of the labels.
	 */
	public void eval(){
		// Check to make sure that there is an actual program to evaluate
		if(program==null)
			throw new IllegalStateException(
					"You must set the MixupProgram prior to calling eval.");

		// Make sure that at least a root level of labels has been specified
		if(this.getCurrentLevel()==null)
			throw new IllegalStateException(
					"There is no TextLabels heirarchy.  You must call eval(TextLabels) instead.");

		// If everything is in place, go ahead and evaluate the program statements on the current state.
		Statement[] statementList=program.getStatements();
		for(int i=0;i<statementList.length;i++){
			this.evaluate(statementList[i]);
		}
	}

	/** Sets the MixupProgram that this interpreter will execute when the eval method is called. */
	public void setProgram(MixupProgram p){
		program=p;
	}

	/** Returns the MixupProgram that this interpreter will execute if the eval method is called. */
	public MixupProgram getProgram(){
		return program;
	}

	/** Returns the TextLabels associated with the given level name or null if the level doesn't exist */
	public MonotonicTextLabels getLabelsForLevel(String level){
		return levelsToLabelsMap.get(level);
	}

	/** Returns the name of the current level or null if no TextLabels have been added or created because 
	 *  the program has not yet been evaluated. */
	public String getCurrentLevel(){
		if(levelStack.empty())
			return null;
		return levelStack.peek();
	}

	/** Returns the TextLabels associated with the current level */
	public MonotonicTextLabels getCurrentLabels(){
		return getLabelsForLevel(getCurrentLevel());
	}

	/** Makes the current level be the given level name.  Throws an exception if the given level name doesn't exist. */
	public void onLevel(String levelName){
		if(levelsToLabelsMap.get(levelName)==null)
			throw new IllegalArgumentException("There is no level named '"+levelName+
					"'");
		else
			levelStack.push(levelName);
	}

	/** Moves up one level in the stack of labels */
	public void offLevel(String levelName){
		if(levelStack.size()==1)
			throw new IllegalArgumentException("Already at the top level.");
		else if(!(levelStack.peek()).equals(levelName))
			throw new IllegalArgumentException("Not on level named '"+levelName+"'");
		else{
			levelStack.pop();
		}
	}

	/**
	 * Creates a new level.  This specified level type indicates what kind of level to be created.  Allowed types are: 
	 * pseudotoken, filter, re, and split. <br>
	 * <br>
	 * pseudotoken - This type of level creates a new text base with a different tokenization scheme.  The new scheme is
	 * the same as the original scheme except that tokens in the span type specified in pattern are all merged into a single
	 * token. <br>
	 * <br>
	 * filter - This type of level creates a new text base that only contains the text inside instances of the span type
	 * specified in pattern.  Each instance is placed in a separate document in the new text base. <br>
	 * <br>
	 * re - This type of level creates a new text base with a new tokenization scheme.  In this case the tokenization 
	 * scheme is defined by the regular expression specified in pattern.  Only matches to this regex are considered 
	 * tokens in the new text base. <br>
	 * <br>
	 * split - This type of level is similar to re except that matches to the regex are used to separate the tokens.  That 
	 * is everything in between matches is considered a single token.
	 */
	public void createLevel(String newLevelName,String levelType,String pattern){
		TextBase newTextBase=null;
		BasicTextLabels newLabels=null;
		String currentLevel=this.getCurrentLevel();
		MonotonicTextLabels parentLabels=
				levelsToLabelsMap.get(currentLevel);

		// Create a textBase where spans of a certain type are combined into a sigle token 
		if("pseudotoken".equals(levelType)){
			// First create the tokenizer
			SpanTypeTokenizer tokenizer=new SpanTypeTokenizer(pattern,parentLabels);
			// Next create the retokenized text base
			newTextBase=tbManager.retokenize(tokenizer,currentLevel,newLevelName);
			// Finally create the labels and add in the pseudotoken token properties.
			newLabels=new BasicTextLabels(newTextBase);
			Iterator<Span> typeInstances=parentLabels.instanceIterator(pattern);
			while(typeInstances.hasNext()){
				Span currInstance=typeInstances.next();
				Span matchingChildSpan=
						tbManager.getMatchingSpan(currInstance,currentLevel,newLevelName);
				for(int i=0;i<matchingChildSpan.size();i++){
					newLabels.setProperty(matchingChildSpan.getTextToken(i),
							"Pseudotoken","1");
				}
			}
		}
		// creates a textBase which filters out all spans not of a certain spanType
		else if("filter".equals(levelType)){
			newTextBase=
					tbManager.filter(currentLevel,parentLabels,newLevelName,pattern);
			newLabels=new BasicTextLabels(newTextBase);
		}
		// Creates a new text base retokenized by the given pattern
		else if("re".equals(levelType)||"split".equals(levelType)){
			Tokenizer tokenizer;
			if(levelType.equals("split")) // creates a tokenizer that splits the textBase at a certain token (e.g. split at ".")
				tokenizer=new SplitTokenizer(pattern);
			else
				tokenizer=new RegexTokenizer(pattern); //only things matching pattern will be tokens.
			newTextBase=tbManager.retokenize(tokenizer,currentLevel,newLevelName);
			newLabels=new BasicTextLabels(newTextBase);
		}else{
			throw new IllegalArgumentException("No level type: "+levelType+
					" new level created with old textBase and Labels");
		}

		// Add the new TextLabels to the list of labels for each level
		levelsToLabelsMap.put(newLevelName,newLabels);
	}

	/** imports labels from specified level to the current level */
	public void importLabelsFromLevel(String importLevel,String oldType,
			String newType){
		if(!tbManager.containsLevel(importLevel)){
			throw new IllegalArgumentException("Level: "+importLevel+
					" not defined for importFromLevel");
		}

		MonotonicTextLabels oldLabels=levelsToLabelsMap.get(importLevel);
		MonotonicTextLabels currLabels=this.getCurrentLabels();
		Iterator<Span> instances=oldLabels.instanceIterator(oldType);
		while(instances.hasNext()){
			Span currInstance=instances.next();
			Span newSpan=
					tbManager.getMatchingSpan(currInstance,importLevel,this
							.getCurrentLevel());
			currLabels.addToType(newSpan,newType);
		}
	}

	//
	// Evaluates a Statement instance against the current level's label set.
	//
	private void evaluate(Statement statement){
		log.info("Evaluating: "+statement);
		long start=System.currentTimeMillis();

		MonotonicTextLabels labels=this.getCurrentLabels();

		// The properties of this statement        
		int statementType=statement.getStatementType();
		String keyword=statement.getKeyword();
		List<String> filesToLoad=statement.getFilesToLoad();
		String fileToLoad=statement.getFileToLoad();
		String type=statement.getType();
		boolean ignoreCase=statement.getIgnoreCase();
		Set<String> wordSet=statement.getWordSet();
		String split=statement.getSplit();
		String patt=statement.getPatt();
		String level=statement.getLevel();
		String oldType=statement.getOldType();
		String importType=statement.getImportType();
		String importLevel=statement.getImportLevel();
		String annotationType=statement.getAnnotationType();
		String startType=statement.getStartType();
		Mixup mixupExpr=statement.getMixupExpr();
		List<String> phraseList=statement.getPhraseList();
		String regex=statement.getRegex();
		int regexGroup=statement.getRegexGroup();

		if("defDict".equals(keyword)){
			if(filesToLoad.size()>0){
				labels.defineDictionary(type,filesToLoad,ignoreCase);
				filesToLoad.clear();
			}else{
				log.debug("defining dictionary of: "+wordSet);
				labels.defineDictionary(type,wordSet);
			}
		}else if("defLevel".equals(keyword)){
			this.createLevel(type,split,patt);
		}else if("onLevel".equals(keyword)){
			this.onLevel(level);
		}else if("offLevel".equals(keyword)){
			this.offLevel(level);
		}else if("importFromLevel".equals(keyword)){
			this.importLabelsFromLevel(importLevel,oldType,importType);
		}else if("declareSpanType".equals(keyword)){
			labels.declareType(type);
		}else if(statementType==Statement.PROVIDE){
			labels.setAnnotatedBy(annotationType);
		}else if(statementType==Statement.REQUIRE){
			labels.require(annotationType,fileToLoad);
		}else if(statementType==Statement.ANNOTATE_WITH){
			labels.annotateWith(fileToLoad.substring(0,fileToLoad.length()-4),
					fileToLoad);
		}else{
			Iterator<Span> input=null;
			if("top".equals(startType)){
				input=labels.getTextBase().documentSpanIterator();
			}else if(labels.isType(startType)){
				input=labels.instanceIterator(startType);
			}else{
				throw new IllegalStateException("no type '"+startType+"' defined");
			}
			if(statementType==Statement.MIXUP){
				for(Iterator<Span> i=mixupExpr.extract(labels,input);i.hasNext();){
					Span span=i.next();
					extendLabels(labels,span,statement);
				}
				// make sure type is declared, even if nothing happened to be defined here
				if("defSpanType".equals(keyword)){
					labels.declareType(type);
				}
			}else if(statementType==Statement.FILTER){
				SortedSet<Span> accum=new TreeSet<Span>();
				for(Iterator<Span> i=input;i.hasNext();){
					Span span=i.next();
					if(!hasExtraction(mixupExpr,labels,span)){
						accum.add(span);
					}
				}
				for(Iterator<Span> i=accum.iterator();i.hasNext();){
					extendLabels(labels,i.next(),statement);
				}
			}else if(statementType==Statement.TRIE){
				labels.defineTrie(phraseList);
				while(input.hasNext()){
					Span span=input.next();
					Iterator<Span> output=labels.getTrie().lookup(span);
					while(output.hasNext()){
						extendLabels(labels,output.next(),statement);
					}
				}
			}else if(statementType==Statement.REGEX){
				Pattern pattern=Pattern.compile(regex);
				while(input.hasNext()){
					Span span=input.next();
					// Don't use this method as it drops leading and trailing spaces from the document text.
					//Matcher matcher = pattern.matcher( span.asString() );
					Matcher matcher=pattern.matcher(span.getDocumentContents());
					while(matcher.find()){
						try{
							Span subspan=
									span.charIndexProperSubSpan(matcher.start(regexGroup),matcher
											.end(regexGroup));
							extendLabels(labels,subspan,statement);
						}catch(IllegalArgumentException ex){
							/* there is no subspan that is properly contained by the regex match,
							   so don't add anything */
						}
					}
				}
			}else{
				throw new IllegalStateException("illegal statement type "+statementType);
			}
		}
		long end=System.currentTimeMillis();
		log.info("time: "+((end-start)/1000.0)+" sec");
	}

	// subroutine of eval - check if a mixup expression matches
	private boolean hasExtraction(final Mixup mixupExpr,final TextLabels labels,
			final Span span){
		Iterator<Span> input=Collections.singleton(span).iterator();
		Iterator<Span> output=mixupExpr.extract(labels,input);
		return output.hasNext();
	}

	// subroutine of eval - label the span  
	private void extendLabels(MonotonicTextLabels labels,Span span,
			Statement statement){
		String keyword=statement.getKeyword();
		String type=statement.getType();
		String property=statement.getProperty();
		String value=statement.getValue();

		if("defSpanType".equals(keyword))
			labels.addToType(span,type);
		else if("defSpanProp".equals(keyword))
			labels.setProperty(span,property,value);
		else if("defTokenProp".equals(keyword)){
			for(int j=0;j<span.size();j++){
				TextToken token=span.getTextToken(j);
				if(property==null)
					throw new IllegalStateException("null property");
				labels.setProperty(token,property,value);
			}
		}
	}
}