// InsideOutsideReduction.java example

package edu.cmu.minorthird.text.learn;

import java.io.Serializable;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import edu.cmu.minorthird.classify.ExampleSchema;
import edu.cmu.minorthird.text.MonotonicTextLabels;
import edu.cmu.minorthird.text.NestedTextLabels;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.TextLabels;
import edu.cmu.minorthird.text.mixup.Mixup;
import edu.cmu.minorthird.text.mixup.MixupInterpreter;
import edu.cmu.minorthird.text.mixup.MixupProgram;

/**
 * Reducing an extraction task to tagging tokens as inside the type to extract,
 * or outside the type to extract.
 * <p>
 * Each call to {@link #reduceExtraction2Tagging(AnnotationExample)} wraps the
 * example's labels in a fresh {@link NestedTextLabels}, sets the token
 * property returned by {@link #getTokenProp()} on every token of every target
 * span, and records the tag value that was used. The set of recorded tag
 * values accumulates across calls and is later used by
 * {@link #extractFromTags(String, MonotonicTextLabels)} to turn tagged tokens
 * back into spans.
 * 
 * @author William Cohen
 */

public class InsideOutsideReduction extends Extraction2TaggingReduction
		implements Serializable{

	private static final long serialVersionUID=1;

	// saves result of last reduction; transient, so it is not serialized
	transient private NestedTextLabels taggedLabels;

	// name of the token-level property that carries the tag
	private String tokenProp="_inside";

	// all tag values that were used, accumulated over every reduction performed
	private Set<String> tagset=new HashSet<String>();

	/**
	 * Reduces one annotated example to a token-tagging problem. The tagged
	 * labels are afterwards available from {@link #getTaggedLabels()}.
	 * 
	 * @param example supplies the document span, its labels, and the span type
	 *          or span property that identifies what is to be extracted
	 */
	@Override
	public void reduceExtraction2Tagging(AnnotationExample example){
		reduceDocument(example.getDocumentSpan(),example.getLabels(),example
				.getInputType(),example.getInputProp());
	}

	/**
	 * Tags the tokens of every target span in a single document.
	 * 
	 * @param doc the document span being reduced
	 * @param labels the labels holding the target annotations; they are wrapped,
	 *          and properties are set on the wrapper
	 * @param spanType the span type to extract, or null to use spanProp instead
	 * @param spanProp the span property to extract; consulted only when spanType
	 *          is null
	 */
	private void reduceDocument(Span doc,TextLabels labels,String spanType,String spanProp){
		taggedLabels=new NestedTextLabels(labels);
		// assignDefaultLabels is not defined in this file (presumably inherited
		// from Extraction2TaggingReduction); it runs before the target tokens are
		// tagged below.
		assignDefaultLabels(doc,taggedLabels,spanType,spanProp);
		// label the tokens inside a span to be extracted as POS if there's just one
		// type to extract, or with the property value, otherwise.
		final boolean bySpanType=spanType!=null;
		String id=doc.getDocumentId();
		Iterator<Span> i=
				bySpanType?taggedLabels.instanceIterator(spanType,id):taggedLabels
						.getSpansWithProperty(spanProp,id);
		while(i.hasNext()){
			Span span=i.next();
			String tag=
					bySpanType?ExampleSchema.POS_CLASS_NAME:taggedLabels.getProperty(
							span,spanProp);
			// remember the tag so extractFromTags can build a pattern for it
			tagset.add(tag);
			for(int j=0;j<span.size();j++){
				taggedLabels.setProperty(span.getToken(j),tokenProp,tag);
			}
		}
	}

	/**
	 * @return the name of the token property that holds the tag
	 */
	@Override
	public String getTokenProp(){
		return tokenProp;
	}

	/**
	 * @return the tag values assigned to target tokens so far. This is the
	 *         internal set itself, not a copy.
	 */
	@Override
	public Set<String> getNonDefaultTagValues(){
		return tagset;
	}

	/**
	 * @return the labels produced by the most recent reduction, or null if no
	 *         reduction has been performed on this instance
	 */
	@Override
	public TextLabels getTaggedLabels(){
		return taggedLabels;
	}

	/**
	 * Builds a Mixup program from the recorded tag values and evaluates it
	 * against the given labels, so that tagged tokens are used to solve the
	 * extraction problem.
	 * <p>
	 * If the only recorded tag is {@code ExampleSchema.POS_CLASS_NAME}, a single
	 * {@code defSpanType} statement defines {@code output} as a span type.
	 * Otherwise one {@code defSpanProp} statement per recorded tag defines
	 * {@code output} as a span property whose value is the tag.
	 * 
	 * @param output name of the span type or span property to define
	 * @param taggedLabels labels whose tokens already carry the tag property;
	 *          note that this parameter shadows the field of the same name
	 * @throws IllegalStateException if a generated statement cannot be parsed;
	 *           the parse exception is attached as the cause
	 */
	@Override
	public void extractFromTags(String output,MonotonicTextLabels taggedLabels){
		try{
			MixupProgram p=new MixupProgram();
			if(tagset.size()==1&&
					tagset.iterator().next().equals(ExampleSchema.POS_CLASS_NAME)){
				p.addStatement("defSpanType "+output+" =: "+
						makePattern(ExampleSchema.POS_CLASS_NAME));
			}else{
				for(String tag:tagset){
					p.addStatement("defSpanProp "+output+":"+tag+" =: "+makePattern(tag));
				}
			}
			MixupInterpreter interp=new MixupInterpreter(p);
			interp.eval(taggedLabels);
		}catch(Mixup.ParseException ex){
			// keep the original exception as the cause so its stack trace survives
			throw new IllegalStateException("mixup error: "+ex,ex);
		}
	}

	/**
	 * Builds the Mixup pattern used to pick out runs of tokens whose tag
	 * property equals the given value.
	 * 
	 * @param val the tag value to match
	 * @return the pattern text
	 */
	private String makePattern(String val){
		return "... [L "+tokenProp+":"+val+"+ R] ...";
	}
}