package edu.cmu.minorthird.text.mixup; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.Stack; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.log4j.Logger; import edu.cmu.minorthird.text.BasicTextLabels; import edu.cmu.minorthird.text.MonotonicTextLabels; import edu.cmu.minorthird.text.RegexTokenizer; import edu.cmu.minorthird.text.Span; import edu.cmu.minorthird.text.SpanTypeTokenizer; import edu.cmu.minorthird.text.SplitTokenizer; import edu.cmu.minorthird.text.TextBase; import edu.cmu.minorthird.text.TextBaseManager; import edu.cmu.minorthird.text.TextLabels; import edu.cmu.minorthird.text.TextToken; import edu.cmu.minorthird.text.Tokenizer; public class MixupInterpreter{ private static Logger log=Logger.getLogger(MixupInterpreter.class); private MixupProgram program=null; private Map<String,MonotonicTextLabels> levelsToLabelsMap= new HashMap<String,MonotonicTextLabels>(); private Stack<String> levelStack=new Stack<String>(); private TextBaseManager tbManager; // Constructors public MixupInterpreter(){ ; } public MixupInterpreter(MixupProgram p){ program=p; } public MixupInterpreter(MixupProgram p,MonotonicTextLabels rootLabels){ program=p; tbManager=new TextBaseManager("root",rootLabels.getTextBase()); levelsToLabelsMap.put("root",rootLabels); levelStack.push("root"); } /** Effectively clears the current state and executes the current program on the * specified TextLabels. The specified labels will become the root level of the * new labels hierarchy. */ public void eval(MonotonicTextLabels labels){ // Check to make sure that there is an actual program to evaluate if(program==null) throw new IllegalStateException( "You must set the MixupProgram prior to calling eval."); // Clear out the state and start fresh with the specified labels tbManager=new TextBaseManager("root",labels.getTextBase()); levelsToLabelsMap.clear(); levelsToLabelsMap.put("root",labels); levelStack=new Stack<String>(); levelStack.push("root"); // Evaluate the program on this new state. this.eval(); } /** Runs the current program on the current state of labels (current level). This * method is useful if you have already executed a program, called setProgram() and * want to run the new progam against the current state of the labels. */ public void eval(){ // Check to make sure that there is an actual program to evaluate if(program==null) throw new IllegalStateException( "You must set the MixupProgram prior to calling eval."); // Make sure that at least a root level of labels has been specified if(this.getCurrentLevel()==null) throw new IllegalStateException( "There is no TextLabels heirarchy. You must call eval(TextLabels) instead."); // If everything is in place, go ahead and evaluate the program statements on the current state. Statement[] statementList=program.getStatements(); for(int i=0;i<statementList.length;i++){ this.evaluate(statementList[i]); } } /** Sets the MixupProgram that this interpreter will execute when the eval method is called. */ public void setProgram(MixupProgram p){ program=p; } /** Returns the MixupProgram that this interpreter will execute if the eval method is called. */ public MixupProgram getProgram(){ return program; } /** Returns the TextLabels associated with the given level name or null if the level doesn't exist */ public MonotonicTextLabels getLabelsForLevel(String level){ return levelsToLabelsMap.get(level); } /** Returns the name of the current level or null if no TextLabels have been added or created because * the program has not yet been evaluated. */ public String getCurrentLevel(){ if(levelStack.empty()) return null; return levelStack.peek(); } /** Returns the TextLabels associated with the current level */ public MonotonicTextLabels getCurrentLabels(){ return getLabelsForLevel(getCurrentLevel()); } /** Makes the current level be the given level name. Throws an exception if the given level name doesn't exist. */ public void onLevel(String levelName){ if(levelsToLabelsMap.get(levelName)==null) throw new IllegalArgumentException("There is no level named '"+levelName+ "'"); else levelStack.push(levelName); } /** Moves up one level in the stack of labels */ public void offLevel(String levelName){ if(levelStack.size()==1) throw new IllegalArgumentException("Already at the top level."); else if(!(levelStack.peek()).equals(levelName)) throw new IllegalArgumentException("Not on level named '"+levelName+"'"); else{ levelStack.pop(); } } /** * Creates a new level. This specified level type indicates what kind of level to be created. Allowed types are: * pseudotoken, filter, re, and split. <br> * <br> * pseudotoken - This type of level creates a new text base with a different tokenization scheme. The new scheme is * the same as the original scheme except that tokens in the span type specified in pattern are all merged into a single * token. <br> * <br> * filter - This type of level creates a new text base that only contains the text inside instances of the span type * specified in pattern. Each instance is placed in a separate document in the new text base. <br> * <br> * re - This type of level creates a new text base with a new tokenization scheme. In this case the tokenization * scheme is defined by the regular expression specified in pattern. Only matches to this regex are considered * tokens in the new text base. <br> * <br> * split - This type of level is similar to re except that matches to the regex are used to separate the tokens. That * is everything in between matches is considered a single token. */ public void createLevel(String newLevelName,String levelType,String pattern){ TextBase newTextBase=null; BasicTextLabels newLabels=null; String currentLevel=this.getCurrentLevel(); MonotonicTextLabels parentLabels= levelsToLabelsMap.get(currentLevel); // Create a textBase where spans of a certain type are combined into a sigle token if("pseudotoken".equals(levelType)){ // First create the tokenizer SpanTypeTokenizer tokenizer=new SpanTypeTokenizer(pattern,parentLabels); // Next create the retokenized text base newTextBase=tbManager.retokenize(tokenizer,currentLevel,newLevelName); // Finally create the labels and add in the pseudotoken token properties. newLabels=new BasicTextLabels(newTextBase); Iterator<Span> typeInstances=parentLabels.instanceIterator(pattern); while(typeInstances.hasNext()){ Span currInstance=typeInstances.next(); Span matchingChildSpan= tbManager.getMatchingSpan(currInstance,currentLevel,newLevelName); for(int i=0;i<matchingChildSpan.size();i++){ newLabels.setProperty(matchingChildSpan.getTextToken(i), "Pseudotoken","1"); } } } // creates a textBase which filters out all spans not of a certain spanType else if("filter".equals(levelType)){ newTextBase= tbManager.filter(currentLevel,parentLabels,newLevelName,pattern); newLabels=new BasicTextLabels(newTextBase); } // Creates a new text base retokenized by the given pattern else if("re".equals(levelType)||"split".equals(levelType)){ Tokenizer tokenizer; if(levelType.equals("split")) // creates a tokenizer that splits the textBase at a certain token (e.g. split at ".") tokenizer=new SplitTokenizer(pattern); else tokenizer=new RegexTokenizer(pattern); //only things matching pattern will be tokens. newTextBase=tbManager.retokenize(tokenizer,currentLevel,newLevelName); newLabels=new BasicTextLabels(newTextBase); }else{ throw new IllegalArgumentException("No level type: "+levelType+ " new level created with old textBase and Labels"); } // Add the new TextLabels to the list of labels for each level levelsToLabelsMap.put(newLevelName,newLabels); } /** imports labels from specified level to the current level */ public void importLabelsFromLevel(String importLevel,String oldType, String newType){ if(!tbManager.containsLevel(importLevel)){ throw new IllegalArgumentException("Level: "+importLevel+ " not defined for importFromLevel"); } MonotonicTextLabels oldLabels=levelsToLabelsMap.get(importLevel); MonotonicTextLabels currLabels=this.getCurrentLabels(); Iterator<Span> instances=oldLabels.instanceIterator(oldType); while(instances.hasNext()){ Span currInstance=instances.next(); Span newSpan= tbManager.getMatchingSpan(currInstance,importLevel,this .getCurrentLevel()); currLabels.addToType(newSpan,newType); } } // // Evaluates a Statement instance against the current level's label set. // private void evaluate(Statement statement){ log.info("Evaluating: "+statement); long start=System.currentTimeMillis(); MonotonicTextLabels labels=this.getCurrentLabels(); // The properties of this statement int statementType=statement.getStatementType(); String keyword=statement.getKeyword(); List<String> filesToLoad=statement.getFilesToLoad(); String fileToLoad=statement.getFileToLoad(); String type=statement.getType(); boolean ignoreCase=statement.getIgnoreCase(); Set<String> wordSet=statement.getWordSet(); String split=statement.getSplit(); String patt=statement.getPatt(); String level=statement.getLevel(); String oldType=statement.getOldType(); String importType=statement.getImportType(); String importLevel=statement.getImportLevel(); String annotationType=statement.getAnnotationType(); String startType=statement.getStartType(); Mixup mixupExpr=statement.getMixupExpr(); List<String> phraseList=statement.getPhraseList(); String regex=statement.getRegex(); int regexGroup=statement.getRegexGroup(); if("defDict".equals(keyword)){ if(filesToLoad.size()>0){ labels.defineDictionary(type,filesToLoad,ignoreCase); filesToLoad.clear(); }else{ log.debug("defining dictionary of: "+wordSet); labels.defineDictionary(type,wordSet); } }else if("defLevel".equals(keyword)){ this.createLevel(type,split,patt); }else if("onLevel".equals(keyword)){ this.onLevel(level); }else if("offLevel".equals(keyword)){ this.offLevel(level); }else if("importFromLevel".equals(keyword)){ this.importLabelsFromLevel(importLevel,oldType,importType); }else if("declareSpanType".equals(keyword)){ labels.declareType(type); }else if(statementType==Statement.PROVIDE){ labels.setAnnotatedBy(annotationType); }else if(statementType==Statement.REQUIRE){ labels.require(annotationType,fileToLoad); }else if(statementType==Statement.ANNOTATE_WITH){ labels.annotateWith(fileToLoad.substring(0,fileToLoad.length()-4), fileToLoad); }else{ Iterator<Span> input=null; if("top".equals(startType)){ input=labels.getTextBase().documentSpanIterator(); }else if(labels.isType(startType)){ input=labels.instanceIterator(startType); }else{ throw new IllegalStateException("no type '"+startType+"' defined"); } if(statementType==Statement.MIXUP){ for(Iterator<Span> i=mixupExpr.extract(labels,input);i.hasNext();){ Span span=i.next(); extendLabels(labels,span,statement); } // make sure type is declared, even if nothing happened to be defined here if("defSpanType".equals(keyword)){ labels.declareType(type); } }else if(statementType==Statement.FILTER){ SortedSet<Span> accum=new TreeSet<Span>(); for(Iterator<Span> i=input;i.hasNext();){ Span span=i.next(); if(!hasExtraction(mixupExpr,labels,span)){ accum.add(span); } } for(Iterator<Span> i=accum.iterator();i.hasNext();){ extendLabels(labels,i.next(),statement); } }else if(statementType==Statement.TRIE){ labels.defineTrie(phraseList); while(input.hasNext()){ Span span=input.next(); Iterator<Span> output=labels.getTrie().lookup(span); while(output.hasNext()){ extendLabels(labels,output.next(),statement); } } }else if(statementType==Statement.REGEX){ Pattern pattern=Pattern.compile(regex); while(input.hasNext()){ Span span=input.next(); // Don't use this method as it drops leading and trailing spaces from the document text. //Matcher matcher = pattern.matcher( span.asString() ); Matcher matcher=pattern.matcher(span.getDocumentContents()); while(matcher.find()){ try{ Span subspan= span.charIndexProperSubSpan(matcher.start(regexGroup),matcher .end(regexGroup)); extendLabels(labels,subspan,statement); }catch(IllegalArgumentException ex){ /* there is no subspan that is properly contained by the regex match, so don't add anything */ } } } }else{ throw new IllegalStateException("illegal statement type "+statementType); } } long end=System.currentTimeMillis(); log.info("time: "+((end-start)/1000.0)+" sec"); } // subroutine of eval - check if a mixup expression matches private boolean hasExtraction(final Mixup mixupExpr,final TextLabels labels, final Span span){ Iterator<Span> input=Collections.singleton(span).iterator(); Iterator<Span> output=mixupExpr.extract(labels,input); return output.hasNext(); } // subroutine of eval - label the span private void extendLabels(MonotonicTextLabels labels,Span span, Statement statement){ String keyword=statement.getKeyword(); String type=statement.getType(); String property=statement.getProperty(); String value=statement.getValue(); if("defSpanType".equals(keyword)) labels.addToType(span,type); else if("defSpanProp".equals(keyword)) labels.setProperty(span,property,value); else if("defTokenProp".equals(keyword)){ for(int j=0;j<span.size();j++){ TextToken token=span.getTextToken(j); if(property==null) throw new IllegalStateException("null property"); labels.setProperty(token,property,value); } } } }