MixupProgram.java example

Explorer
MinorThird-master
/* Copyright 2003, Carnegie Mellon, All Rights Reserved */

package edu.cmu.minorthird.text.mixup;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import edu.cmu.minorthird.text.FancyLoader;
import edu.cmu.minorthird.text.MonotonicTextLabels;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.TextLabelsLoader;

/** Modify a textlabeling using a series of mixup expressions.

 <pre>
 BNF:
 STATEMENT -> declareSpanType TYPE
 STATEMENT -> provide ID
 STATEMENT -> require ID [,FILE]
 STATEMENT -> annotateWith FILE
 STATEMENT -> defDict [+case] NAME = ID, ... , ID
 STATEMENT -> defTokenProp PROP:VALUE = GEN
 STATEMENT -> defSpanProp PROP:VALUE = GEN
 STATEMENT -> defSpanType TYPE2 = GEN
 STATEMENT -> defLevel NAME = LEVELDEF
 STATEMENT -> onLevel NAME
 STATEMENT -> offLevel NAME
 STATEMENT -> importFromLevel NAME TYPE = TYPE

 LEVELDEF -> filter TYPE
 LEVELDEF -> pseudotoken TYPE
 LEVELDEF -> split TOKEN
 LEVELDEF -> re 'REGEX'

 GEN -> [TYPE]: MIXUP-EXPR
 GEN -> [TYPE]- MIXUP-EXPR
 GEN -> [TYPE]~ re 'REGEX',NUMBER
 GEN -> [TYPE]~ trie phrase1, phrase2, ... ;

 statements are semicolon-separated
 // and comments look like this (C++ style)

 SEMANTICS:
 execute each command in order, saving spans/tokens as types, and asserting properties
 '=:' can be replaced with '=TYPE:', in which case the expr will be applied to
 each span of the given type, rather than all top-level spans

 defDict FOO = bar,baz,bat stores a lowercase version of each word the dictionary
 defDict +case FOO = blah,Bar,baZ stores each word the dictionary, preserving case

 in dictionaries and tries, a double-quoted word "foo.txt" means to
 find foo.txt on the classpath and store all lines from the file as
 words (after trimming them).

 TYPE: MIXUP-EXPR finds all spans inside a span of type TYPE that match the expression
 TYPE- MIXUP-EXPR finds all spans inside a span of type TYPE that do not contain anything matching MIXUP-EXPR

 </pre> <p> Mixup is matching language for modifying TextLabels.  It
 can label spans with a given TYPE (the new label for that token span)
 and assign properties to spans (much like labels, but 'invisible').
 There is more documentation for Mixup programs in the <a
 href="package-summary.html">package-level documents for Mixup.</a>
 <p>
 Briefly, a Mixup program will look something like this:
 <pre>
 require "req1"; //requires that "abc" type spans have already been labeled.  If not, the default annoator
 //for "abc" will be used.
 require "req2", "req2.mixup"; 
 //file 'def.mixup' will be run to provide "def" labels if they are not already there
 //if  "def" labels were already generated by a different annotator, they will be used and
 //and 'def.mixup' won't be called.
 provide "xyz"; //this program will annotate the text with "xyz" labels
 defDict titleWord = mr, ms, mrs, dr; 
 //defines a dictionary (with scope of this program execution called 'titleWord'
 //containing the values "mr", "ms", "mrs", "dr" 
 defDict myDictionary = "dictionary.txt"; 
 //defines a dictionary called 'myDictionary' with values taken from the file "dictionary.txt"
 defTokenProp title:true =: ... [ai(titleWord)] ... ; //finds all spans matching a work in the dictionary titleWord
 //those spans are given the property "Name" with value "true" (a string, not boolean)
 //if the span previously had "Name" property with a different value, that is replaced
 // the "..." before and after indicate that it doesn't matter what comes before or after the token
 //to be labeled.  if I said "=: [ai(titleWord)];" the document would need to be JUST a titleword.
 defTokenProp titlePunc:1 =: ... title:true [','] ... || ... title:true ['.'] ... ;
 //spans "." or "," preceeded by a title are given the property titlePunc with value "1"
 //note that the entire '... title:true [','] ...' is an expression; or operators ("||") must be
 //<em> between</em> expressions, not within them
 defSpanType fullTitle =: ...[title:true titlePunc:1?R] ...;
 //label a span as "fullTitle" if there is a title span optionally followed b a titlePunc span
 //but not more than one (from the R)
 defSpanType the =: ... [eqi('the')] ...; 
 //labels occurances of "the" ignoring case (eq = equals, adding i ignores case)
 defTokenProp aProp:t =: ...[<title:true, req1>] ...; 
 /tokens which have the title=true property AND are labeled as req1
 //are given the property aProp=t
 defTokenProp address:x =: ... [@fullTitle any] !a(myDictionary) ...; 
 //label spans of one 'fullTitle' (the @ is needed
 //before types) and the following token, whatever it is, 
 // which are followed by something other than a myDictionary word
 defTokenProp capProp:on =req2: ... [re('^[A-Z]$')] ...; 
 //on spans of type req2, match tokens fitting the given regular expression
 defSpanType listSet =: ... [address+R] ...; 
 //label as header spans of 1 or more address tokens, going all the way to 
 //right most possible token - example: blah address1 address2 address3 blah 
 // - will return three spans: "address3", "address2 address3", and "address1 address2 address3"
 defSpanType adList =: ... [L address+ R] ...; //as above but only returns the longest span
 defSpanType header =: [L address* R] ...; 
 //label longest span of 0 or more address tokens at the beginning of the document
 defSpanType shortList =: ... [address{2,3}] ...; //label spans of 2 or 3 address tokens
 defSpanType xyz =header: ...[capProp] ...; //providing the promised xyz labeling
 //creates a new level where each document is a span with spanType
 defLevel newLevel = filter spanType;
 //creates a new level where tokens of spanType are combined into a single token
 defLevel newLevel = pseudotoken spanType;
 //creates a new level where the textBase is retokenized by splitting a a certain token
 defLevel newLevel = split '.';
 //create a new level where the textBase is retokenized using a regular expression
 defLevel newLevel = re '([^\n]+)';
 //switches current textBase and Labels to Level
 onLevel levelName;
 //returns to root (or original) level - levelName is the name of the child level which you are switching off
 offLevel childLevelName;
 //Imports spans of Type in the child level to spans of newType in the parent level
 importFromLevel childLevelName newType = type;
 </pre>
 *
 * @author William Cohen
 */

public class MixupProgram implements Serializable{

	static private final long serialVersionUID=20080303L;

//	private static Logger log=Logger.getLogger(MixupProgram.class);

	private List<Statement> statementList=new ArrayList<Statement>();

	// maps dictionary names to the sets they correspond to 
//	private Map<String,Set<String>> dictionaryMap=new HashMap<String,Set<String>>();

//	private static TextBase textBase=null;

//	private static MonotonicTextLabels labels=null;

//	private static HashMap textBases=new HashMap(); //List of TextBases with different tokenizations

//	private static HashMap textLabels=new HashMap(); //List of TextLabels with for textBases with different tokenizations

	public static Set<String> legalKeywords=new HashSet<String>();
	static{
		legalKeywords.add("defTokenProp");
		legalKeywords.add("defSpanProp");
		legalKeywords.add("defSpanType");
		legalKeywords.add("defDict");
		legalKeywords.add("declareSpanType");
		legalKeywords.add("provide");
		legalKeywords.add("require");
		legalKeywords.add("annotateWith");
		legalKeywords.add("defLevel");
		legalKeywords.add("onLevel");
		legalKeywords.add("offLevel");
		legalKeywords.add("importFromLevel");
		legalKeywords.add("//");
		legalKeywords.add("\n");
	}

	public MixupProgram(){
		;
	}

	/** Create a MixupProgram from an array of statements */
	public MixupProgram(String[] statements) throws Mixup.ParseException{
		String program="";
		for(int i=0;i<statements.length;i++){
			program=program+statements[i]+";\n";
		}
		startProgram(program);
	}

	/** Create a MixupProgram from single string with a bunch of semicolon-separated statements. */
	public MixupProgram(String program) throws Mixup.ParseException{
		String[] lines=program.split("\n");
		StringBuffer buf=new StringBuffer();
		String line;
		for(int i=0;i<lines.length;i++){
			int startComment=lines[i].indexOf("//");
			if(startComment>=0)
				line=lines[i].substring(0,startComment);
			else
				line=lines[i];
			buf.append(line);
			buf.append("\n");
		}
		program=buf.toString();
		startProgram(program);
	}

	/** Create a MixupProgram from the contents of a file. */
	public MixupProgram(File file) throws Mixup.ParseException,
			FileNotFoundException,IOException{
		//LineNumberReader in = new LineNumberReader(new FileReader(file));
		LineNumberReader in=
				file.exists()?mixupReader(file):mixupReader(file.getName());
		StringBuffer buf=new StringBuffer();
		String line;
		while((line=in.readLine())!=null){
			int startComment=line.indexOf("//");
			if(startComment>=0)
				line=line.substring(0,startComment);
			buf.append(line);
			buf.append("\n");
		}
		in.close();
		String program=buf.toString();
		startProgram(program);
	}

	private void startProgram(String program) throws Mixup.ParseException{
		program.trim();
		Mixup.MixupTokenizer tok=new Mixup.MixupTokenizer(program);
		String keyword=tok.advance(legalKeywords);
		while(keyword!=null){
			if(!keyword.startsWith("\n")){
				addStatement(tok,keyword);
			}
			keyword=tok.advance(legalKeywords);
		}
	}

//	/**
//	 * @deprecated  Use MixupInterpreter to evaluate mixup programs
//	 */
//	// Deprecated on 2/20/2007
//	public MonotonicTextLabels eval(MonotonicTextLabels labels,TextBase tb){
//		MixupInterpreter interpreter=new MixupInterpreter(this);
//		interpreter.eval(labels);
//		return interpreter.getCurrentLabels();
//	}
//
//	/**
//	 * @deprecated  Use MixupInterpreter to evaluate mixup programs
//	 */
//	// Deprecated on 2/20/2007
//	public void eval(MonotonicTextLabels labels){
//		MixupInterpreter interpreter=new MixupInterpreter(this);
//		ProgressCounter pc=
//				new ProgressCounter("mixup program","statement",statementList.size());
//		interpreter.eval(labels);
//		pc.finished();
//	}

	/** Add a single statement to the current mixup program. */
	public void addStatement(Mixup.MixupTokenizer tok,String keyword)
			throws Mixup.ParseException{
		statementList.add(new Statement(tok,keyword));
		//System.out.println("Added Statement: "+statementList.get(statementList.size()-1));
	}

	/** Add a single statement to the current mixup program. */
	public void addStatement(String statement) throws Mixup.ParseException{
		Mixup.MixupTokenizer tok=new Mixup.MixupTokenizer(statement);
		String keyword=tok.advance(legalKeywords);
		addStatement(tok,keyword);
	}

	public Statement[] getStatements(){
		return statementList.toArray(new Statement[0]);
	}

	/** List the program **/
	@Override
	public String toString(){
		StringBuffer buf=new StringBuffer("");
		for(int i=0;i<statementList.size();i++){
			buf.append(statementList.get(i).toString()+";\n");
		}
		return buf.toString();
	}

	/** Convert a string to an input stream, then a LineNumberReader. */
	static private LineNumberReader mixupReader(String fileName)
			throws IOException,FileNotFoundException{
		File file=new File(fileName);
		if(file.exists())
			return mixupReader(file);
		else{
			InputStream s;
			s=
					ClassLoader
							.getSystemResourceAsStream(fileName);
			if(s==null)
				s=ClassLoader.getSystemResourceAsStream(fileName);
			if(s==null)
				throw new IllegalArgumentException("No file named '"+fileName+
						"' found on classpath");
			return new LineNumberReader(new BufferedReader(new InputStreamReader(s)));
		}
	}

	static private LineNumberReader mixupReader(File file) throws IOException,
			FileNotFoundException{
		return new LineNumberReader(new BufferedReader(new FileReader(file)));
	}

	/**
	 * usage: programFile textFile/directory [outfile]
	 * evaluates the given program file against the specified data (either a file or directory of files)
	 * if an outfile is specified it outputs the types as operators to that file
	 */
	public static void main(String[] args){
		try{
			MixupProgram program=new MixupProgram(new File(args[0]));
			System.out.println("program:\n"+program.toString());
			if(args.length>1){
				MonotonicTextLabels labels=
						(MonotonicTextLabels)FancyLoader.loadTextLabels(args[1]);

				MixupInterpreter interpreter=new MixupInterpreter(program);
				interpreter.eval(labels);

				if(args.length>2){
					File outFile=new File(args[2]);
					new TextLabelsLoader().saveTypesAsOps(labels,outFile);
				}else
					for(Iterator<String> i=labels.getTypes().iterator();i.hasNext();){
						String type=i.next();
						System.out.println("Type "+type+":");
						for(Iterator<Span> j=labels.instanceIterator(type);j.hasNext();){
							Span span=j.next();
							System.out.println("\t'"+span.asString()+"'");
						}
					}
			}
		}catch(Exception e){
			System.out.println("usage: programFile textFile/directory [outfile]");
			e.printStackTrace();
		}
	}
}