/* Copyright 2003, Carnegie Mellon, All Rights Reserved */

package edu.cmu.minorthird.text.mixup;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import edu.cmu.minorthird.text.FancyLoader;
import edu.cmu.minorthird.text.MonotonicTextLabels;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.TextLabelsLoader;

/**
 * Modify a textlabeling using a series of mixup expressions.
 *
 * <pre>
 * BNF:
 *   STATEMENT -> declareSpanType TYPE
 *   STATEMENT -> provide ID
 *   STATEMENT -> require ID [,FILE]
 *   STATEMENT -> annotateWith FILE
 *   STATEMENT -> defDict [+case] NAME = ID, ... , ID
 *   STATEMENT -> defTokenProp PROP:VALUE = GEN
 *   STATEMENT -> defSpanProp PROP:VALUE = GEN
 *   STATEMENT -> defSpanType TYPE2 = GEN
 *   STATEMENT -> defLevel NAME = LEVELDEF
 *   STATEMENT -> onLevel NAME
 *   STATEMENT -> offLevel NAME
 *   STATEMENT -> importFromLevel NAME TYPE = TYPE
 *
 *   LEVELDEF -> filter TYPE
 *   LEVELDEF -> pseudotoken TYPE
 *   LEVELDEF -> split TOKEN
 *   LEVELDEF -> re 'REGEX'
 *
 *   GEN -> [TYPE]: MIXUP-EXPR
 *   GEN -> [TYPE]- MIXUP-EXPR
 *   GEN -> [TYPE]~ re 'REGEX',NUMBER
 *   GEN -> [TYPE]~ trie phrase1, phrase2, ...
 *
 *   ; statements are semicolon-separated
 *   // and comments look like this (C++ style)
 *
 * SEMANTICS:
 *   execute each command in order, saving spans/tokens as types, and asserting properties
 *
 *   '=:' can be replaced with '=TYPE:', in which case the expr will be applied to each
 *   span of the given type, rather than all top-level spans
 *
 *   defDict FOO = bar,baz,bat stores a lowercase version of each word in the dictionary
 *   defDict +case FOO = blah,Bar,baZ stores each word in the dictionary, preserving case
 *
 *   in dictionaries and tries, a double-quoted word "foo.txt" means to find foo.txt on the
 *   classpath and store all lines from the file as words (after trimming them).
 *
 *   TYPE: MIXUP-EXPR finds all spans inside a span of type TYPE that match the expression
 *   TYPE- MIXUP-EXPR finds all spans inside a span of type TYPE that do not contain
 *   anything matching MIXUP-EXPR
 * </pre>
 *
 * <p>
 * Mixup is a matching language for modifying TextLabels. It can label spans with a given TYPE
 * (the new label for that token span) and assign properties to spans (much like labels, but
 * 'invisible'). There is more documentation for Mixup programs in the
 * <a href="package-summary.html">package-level documents for Mixup.</a>
 *
 * <p>
 * A MixupProgram only holds the parsed statements; to evaluate a program against labels use
 * {@link MixupInterpreter} (see {@link #main(String[])} for an example).
 *
 * <p>
 * Briefly, a Mixup program will look something like this:
 *
 * <pre>
 * require "req1";
 *   //requires that "req1" type spans have already been labeled. If not, the default annotator
 *   //for "req1" will be used.
 * require "req2", "req2.mixup";
 *   //file 'req2.mixup' will be run to provide "req2" labels if they are not already there
 *   //if "req2" labels were already generated by a different annotator, they will be used
 *   //and 'req2.mixup' won't be called.
 * provide "xyz";
 *   //this program will annotate the text with "xyz" labels
 *
 * defDict titleWord = mr, ms, mrs, dr;
 *   //defines a dictionary (with scope of this program execution) called 'titleWord'
 *   //containing the values "mr", "ms", "mrs", "dr"
 * defDict myDictionary = "dictionary.txt";
 *   //defines a dictionary called 'myDictionary' with values taken from the file "dictionary.txt"
 *
 * defTokenProp title:true =: ... [ai(titleWord)] ... ;
 *   //finds all spans matching a word in the dictionary titleWord
 *   //those spans are given the property "title" with value "true" (a string, not boolean)
 *   //if the span previously had the "title" property with a different value, that is replaced
 *   // the "..." before and after indicate that it doesn't matter what comes before or after the token
 *   //to be labeled. if I said "=: [ai(titleWord)];" the document would need to be JUST a titleword.
 *
 * defTokenProp titlePunc:1 =: ... title:true [','] ... || ... title:true ['.'] ... ;
 *   //spans "." or "," preceded by a title are given the property titlePunc with value "1"
 *   //note that the entire '... title:true [','] ...' is an expression; or operators ("||") must be
 *   //<em> between</em> expressions, not within them
 *
 * defSpanType fullTitle =: ...[title:true titlePunc:1?R] ...;
 *   //label a span as "fullTitle" if there is a title span optionally followed by a titlePunc span
 *   //but not more than one (from the R)
 *
 * defSpanType the =: ... [eqi('the')] ...;
 *   //labels occurrences of "the" ignoring case (eq = equals, adding i ignores case)
 *
 * defTokenProp aProp:t =: ...[&lt;title:true, req1&gt;] ...;
 *   //tokens which have the title=true property AND are labeled as req1
 *   //are given the property aProp=t
 *
 * defTokenProp address:x =: ... [@fullTitle any] !a(myDictionary) ...;
 *   //label spans of one 'fullTitle' (the @ is needed
 *   //before types) and the following token, whatever it is,
 *   // which are followed by something other than a myDictionary word
 *
 * defTokenProp capProp:on =req2: ... [re('^[A-Z]$')] ...;
 *   //on spans of type req2, match tokens fitting the given regular expression
 *
 * defSpanType listSet =: ... [address+R] ...;
 *   //label as listSet spans of 1 or more address tokens, going all the way to
 *   //right most possible token - example: blah address1 address2 address3 blah
 *   // - will return three spans: "address3", "address2 address3", and "address1 address2 address3"
 * defSpanType adList =: ... [L address+ R] ...;
 *   //as above but only returns the longest span
 * defSpanType header =: [L address* R] ...;
 *   //label longest span of 0 or more address tokens at the beginning of the document
 * defSpanType shortList =: ... [address{2,3}] ...;
 *   //label spans of 2 or 3 address tokens
 *
 * defSpanType xyz =header: ...[capProp] ...;
 *   //providing the promised xyz labeling
 *
 * //creates a new level where each document is a span with spanType
 * defLevel newLevel = filter spanType;
 * //creates a new level where tokens of spanType are combined into a single token
 * defLevel newLevel = pseudotoken spanType;
 * //creates a new level where the textBase is retokenized by splitting at a certain token
 * defLevel newLevel = split '.';
 * //create a new level where the textBase is retokenized using a regular expression
 * defLevel newLevel = re '([^\n]+)';
 *
 * //switches current textBase and Labels to Level
 * onLevel levelName;
 * //returns to root (or original) level - levelName is the name of the child level which you are switching off
 * offLevel childLevelName;
 * //Imports spans of Type in the child level to spans of newType in the parent level
 * importFromLevel childLevelName newType = type;
 * </pre>
 *
 * @author William Cohen
 */
public class MixupProgram implements Serializable{

	static private final long serialVersionUID=20080303L;

	/** The parsed statements of this program, in the order they were added. */
	private List<Statement> statementList=new ArrayList<Statement>();

	/**
	 * The set of tokens handed to the tokenizer's <code>advance</code> call when looking
	 * for the keyword that starts the next statement. Besides the statement keywords it
	 * contains "//" and "\n"; keywords starting with "\n" are skipped when a whole
	 * program is parsed.
	 */
	public static Set<String> legalKeywords=new HashSet<String>();
	static{
		legalKeywords.add("defTokenProp");
		legalKeywords.add("defSpanProp");
		legalKeywords.add("defSpanType");
		legalKeywords.add("defDict");
		legalKeywords.add("declareSpanType");
		legalKeywords.add("provide");
		legalKeywords.add("require");
		legalKeywords.add("annotateWith");
		legalKeywords.add("defLevel");
		legalKeywords.add("onLevel");
		legalKeywords.add("offLevel");
		legalKeywords.add("importFromLevel");
		legalKeywords.add("//");
		legalKeywords.add("\n");
	}

	/** Create an empty MixupProgram; statements can be added with addStatement. */
	public MixupProgram(){
	}

	/**
	 * Create a MixupProgram from an array of statements.
	 * Each element is terminated with ";\n" and the concatenation is parsed as one program.
	 * Unlike the String and File constructors, "//" comments are not stripped here.
	 *
	 * @param statements the statements, without their trailing semicolons
	 * @throws Mixup.ParseException if the statements cannot be parsed
	 */
	public MixupProgram(String[] statements) throws Mixup.ParseException{
		StringBuilder buf=new StringBuilder();
		for(String statement:statements){
			buf.append(statement).append(";\n");
		}
		startProgram(buf.toString());
	}

	/**
	 * Create a MixupProgram from single string with a bunch of
	 * semicolon-separated statements. Everything from "//" to the end of
	 * each line is discarded before parsing.
	 *
	 * @param program the program text
	 * @throws Mixup.ParseException if the program cannot be parsed
	 */
	public MixupProgram(String program) throws Mixup.ParseException{
		StringBuilder buf=new StringBuilder();
		for(String line:program.split("\n")){
			buf.append(stripComment(line)).append("\n");
		}
		startProgram(buf.toString());
	}

	/**
	 * Create a MixupProgram from the contents of a file. If the file does not
	 * exist, its bare name is looked up instead (current directory, then classpath).
	 * Everything from "//" to the end of each line is discarded before parsing.
	 *
	 * @param file the file holding the program
	 * @throws Mixup.ParseException if the program cannot be parsed
	 * @throws FileNotFoundException if the file cannot be opened
	 * @throws IOException if reading the file fails
	 * @throws IllegalArgumentException if the file does not exist and no resource
	 *   with its name is found on the classpath
	 */
	public MixupProgram(File file) throws Mixup.ParseException,FileNotFoundException,IOException{
		LineNumberReader in=file.exists()?mixupReader(file):mixupReader(file.getName());
		StringBuilder buf=new StringBuilder();
		// close the reader even when readLine() fails, so the file handle is not leaked
		try{
			String line;
			while((line=in.readLine())!=null){
				buf.append(stripComment(line)).append("\n");
			}
		}finally{
			in.close();
		}
		startProgram(buf.toString());
	}

	/**
	 * Remove a trailing C++ style comment from one line of program text.
	 * NOTE(review): the first "//" on the line is treated as the comment start even if
	 * it occurs inside quoted text (e.g. a regex or a URL) - confirm this is acceptable.
	 *
	 * @param line a single line of program text
	 * @return the part of the line before the first "//", or the whole line if there is none
	 */
	private static String stripComment(String line){
		int startComment=line.indexOf("//");
		return startComment>=0?line.substring(0,startComment):line;
	}

	/**
	 * Tokenize the program text and add one statement per keyword found;
	 * keywords that start with a newline are skipped.
	 * The text is parsed as given: an earlier version called program.trim() here but
	 * discarded the result, so the program has never actually been trimmed.
	 */
	private void startProgram(String program) throws Mixup.ParseException{
		Mixup.MixupTokenizer tok=new Mixup.MixupTokenizer(program);
		String keyword=tok.advance(legalKeywords);
		while(keyword!=null){
			if(!keyword.startsWith("\n")){
				addStatement(tok,keyword);
			}
			keyword=tok.advance(legalKeywords);
		}
	}

	/**
	 * Add a single statement to the current mixup program.
	 *
	 * @param tok tokenizer from which the statement is built, after the keyword was read
	 * @param keyword the keyword that starts the statement
	 * @throws Mixup.ParseException if the statement cannot be parsed
	 */
	public void addStatement(Mixup.MixupTokenizer tok,String keyword) throws Mixup.ParseException{
		statementList.add(new Statement(tok,keyword));
	}

	/**
	 * Add a single statement to the current mixup program.
	 *
	 * @param statement the text of one statement, starting with its keyword
	 * @throws Mixup.ParseException if the statement cannot be parsed
	 */
	public void addStatement(String statement) throws Mixup.ParseException{
		Mixup.MixupTokenizer tok=new Mixup.MixupTokenizer(statement);
		String keyword=tok.advance(legalKeywords);
		addStatement(tok,keyword);
	}

	/**
	 * @return a new array holding the statements of this program, in order
	 */
	public Statement[] getStatements(){
		return statementList.toArray(new Statement[0]);
	}

	/** List the program, one statement per line, each followed by a semicolon. */
	@Override
	public String toString(){
		StringBuilder buf=new StringBuilder();
		for(Statement statement:statementList){
			buf.append(statement.toString()).append(";\n");
		}
		return buf.toString();
	}

	/**
	 * Open a mixup program by name: as a file if one with that name exists,
	 * otherwise as a classpath resource.
	 *
	 * @param fileName file name or classpath resource name
	 * @return a reader over the program text
	 * @throws IllegalArgumentException if neither a file nor a resource is found
	 */
	static private LineNumberReader mixupReader(String fileName) throws IOException,FileNotFoundException{
		File file=new File(fileName);
		if(file.exists()){
			return mixupReader(file);
		}
		InputStream s=ClassLoader.getSystemResourceAsStream(fileName);
		if(s==null){
			// The system class loader did not find it; fall back to the loader of this
			// class (the original code repeated the identical system lookup here).
			ClassLoader loader=MixupProgram.class.getClassLoader();
			if(loader!=null){
				s=loader.getResourceAsStream(fileName);
			}
		}
		if(s==null){
			throw new IllegalArgumentException("No file named '"+fileName+
					"' found on classpath");
		}
		return new LineNumberReader(new BufferedReader(new InputStreamReader(s)));
	}

	/** Open the given file as a LineNumberReader. */
	static private LineNumberReader mixupReader(File file) throws IOException,FileNotFoundException{
		return new LineNumberReader(new BufferedReader(new FileReader(file)));
	}

	/**
	 * usage: programFile textFile/directory [outfile]
	 * evaluates the given program file against the specified data (either a file or directory of files)
	 * if an outfile is specified it outputs the types as operators to that file
	 */
	public static void main(String[] args){
		if(args.length<1){
			// no program file given: report usage directly instead of relying on an
			// ArrayIndexOutOfBoundsException being caught below
			System.out.println("usage: programFile textFile/directory [outfile]");
			return;
		}
		try{
			MixupProgram program=new MixupProgram(new File(args[0]));
			System.out.println("program:\n"+program.toString());
			if(args.length>1){
				MonotonicTextLabels labels=
						(MonotonicTextLabels)FancyLoader.loadTextLabels(args[1]);
				MixupInterpreter interpreter=new MixupInterpreter(program);
				interpreter.eval(labels);
				if(args.length>2){
					File outFile=new File(args[2]);
					new TextLabelsLoader().saveTypesAsOps(labels,outFile);
				}else{
					// no outfile: dump every type and its spans to stdout
					for(Iterator<String> i=labels.getTypes().iterator();i.hasNext();){
						String type=i.next();
						System.out.println("Type "+type+":");
						for(Iterator<Span> j=labels.instanceIterator(type);j.hasNext();){
							Span span=j.next();
							System.out.println("\t'"+span.asString()+"'");
						}
					}
				}
			}
		}catch(Exception e){
			System.out.println("usage: programFile textFile/directory [outfile]");
			e.printStackTrace();
		}
	}
}