package edu.cmu.minorthird.text.learn;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import edu.cmu.minorthird.classify.ExampleSchema;
import edu.cmu.minorthird.text.MonotonicTextLabels;
import edu.cmu.minorthird.text.NestedTextLabels;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.TextLabels;
import edu.cmu.minorthird.text.mixup.Mixup;
import edu.cmu.minorthird.text.mixup.MixupInterpreter;
import edu.cmu.minorthird.text.mixup.MixupProgram;
/**
 * Reduces an extraction task to tagging tokens as inside the type to extract,
 * or outside the type to extract.
 *
 * <p>Each token covered by a span to be extracted receives the token property
 * returned by {@link #getTokenProp()}. The value of that property is
 * {@code ExampleSchema.POS_CLASS_NAME} when extracting a span type, or the
 * span's own property value when extracting a span property.
 *
 * <p>The set of tag values accumulates across calls to
 * {@link #reduceExtraction2Tagging(AnnotationExample)}; the tagged labels of
 * only the most recent reduction are retained, and they are {@code transient}
 * (not serialized with this object).
 *
 * @author William Cohen
 */
public class InsideOutsideReduction extends Extraction2TaggingReduction
    implements Serializable {

  private static final long serialVersionUID = 1;

  // saves result of last reduction; transient, so it is null after
  // deserialization until the next reduction is performed
  transient private NestedTextLabels taggedLabels;

  // name of the token-level property that carries the inside tag
  private String tokenProp = "_inside";

  // all tag values that were used, across every reduction so far
  private Set<String> tagset = new HashSet<String>();

  /**
   * Tags the tokens of the example's document, using the example's labels,
   * input span type and input span property.
   *
   * @param example supplies the document span, labels, and the span type or
   *        span property to be extracted
   */
  @Override
  public void reduceExtraction2Tagging(AnnotationExample example) {
    reduceDocument(example.getDocumentSpan(), example.getLabels(),
        example.getInputType(), example.getInputProp());
  }

  /**
   * Builds a fresh {@link NestedTextLabels} over {@code labels} and sets
   * {@code tokenProp} on every token of every span to be extracted in
   * {@code doc}.
   *
   * @param doc the document whose tokens are tagged
   * @param labels the labels holding the spans to extract
   * @param spanType span type to extract; when non-null it takes precedence
   *        over {@code spanProp} and every tag is
   *        {@code ExampleSchema.POS_CLASS_NAME}
   * @param spanProp span property to extract, consulted only when
   *        {@code spanType} is null; the tag is then the span's value for
   *        this property
   */
  private void reduceDocument(Span doc, TextLabels labels, String spanType,
      String spanProp) {
    taggedLabels = new NestedTextLabels(labels);
    // NOTE(review): assignDefaultLabels is not defined in this file;
    // presumably inherited from Extraction2TaggingReduction - confirm what
    // default it assigns.
    assignDefaultLabels(doc, taggedLabels, spanType, spanProp);

    // label the tokens inside a span to be extracted as POS if there's just
    // one type to extract, or with the property value, otherwise.
    final boolean byType = spanType != null;
    String id = doc.getDocumentId();
    Iterator<Span> spans =
        byType ? taggedLabels.instanceIterator(spanType, id)
               : taggedLabels.getSpansWithProperty(spanProp, id);
    while (spans.hasNext()) {
      Span span = spans.next();
      String tag =
          byType ? ExampleSchema.POS_CLASS_NAME
                 : taggedLabels.getProperty(span, spanProp);
      tagset.add(tag);
      for (int j = 0; j < span.size(); j++) {
        taggedLabels.setProperty(span.getToken(j), tokenProp, tag);
      }
    }
  }

  /** Returns the name of the token property that holds the inside tag. */
  @Override
  public String getTokenProp() {
    return tokenProp;
  }

  /**
   * Returns the tag values assigned so far. This is the live internal set,
   * not a copy, so later reductions will add to it.
   */
  @Override
  public Set<String> getNonDefaultTagValues() {
    return tagset;
  }

  /**
   * Returns the labels produced by the most recent reduction, or null if no
   * reduction has been performed on this instance.
   */
  @Override
  public TextLabels getTaggedLabels() {
    return taggedLabels;
  }

  /**
   * Uses the tagged tokens in the given labels to solve the extraction
   * problem, by building a Mixup program and evaluating it on those labels.
   *
   * <p>If the only tag value seen is {@code ExampleSchema.POS_CLASS_NAME}, a
   * single {@code defSpanType} statement for {@code output} is generated.
   * Otherwise one {@code defSpanProp} statement per tag value is generated,
   * assigning {@code output:tag}.
   *
   * @param output name of the span type or span property to define
   * @param taggedLabels labels whose tokens already carry the tag property;
   *        the program is evaluated on this argument, which shadows the field
   *        of the same name
   * @throws IllegalStateException if a generated Mixup statement fails to
   *         parse; the parse exception is attached as the cause
   */
  @Override
  public void extractFromTags(String output, MonotonicTextLabels taggedLabels) {
    try {
      MixupProgram program = new MixupProgram();
      boolean singlePositiveTag = tagset.size() == 1
          && tagset.iterator().next().equals(ExampleSchema.POS_CLASS_NAME);
      if (singlePositiveTag) {
        program.addStatement("defSpanType "+output+" =: "+
            makePattern(ExampleSchema.POS_CLASS_NAME));
      } else {
        for (String tag : tagset) {
          program.addStatement("defSpanProp "+output+":"+tag+" =: "+makePattern(tag));
        }
      }
      MixupInterpreter interp = new MixupInterpreter(program);
      interp.eval(taggedLabels);
    } catch (Mixup.ParseException ex) {
      // chain the cause so the original parse failure's stack trace survives
      throw new IllegalStateException("mixup error: "+ex, ex);
    }
  }

  /**
   * Builds the Mixup pattern that brackets one or more consecutive tokens
   * whose {@code tokenProp} equals {@code val}.
   * NOTE(review): the exact meaning of the L/R markers is defined by the
   * Mixup language - confirm against its documentation.
   */
  private String makePattern(String val) {
    return "... [L "+tokenProp+":"+val+"+ R] ...";
  }
}