package edu.cmu.minorthird.text.learn;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import edu.cmu.minorthird.text.MonotonicTextLabels;
import edu.cmu.minorthird.text.NestedTextLabels;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.TextLabels;
import edu.cmu.minorthird.text.mixup.Mixup;
import edu.cmu.minorthird.text.mixup.MixupInterpreter;
import edu.cmu.minorthird.text.mixup.MixupProgram;
/**
* Reduces an extraction task to tagging tokens as one of three
* categories. The categories are: a token beginning the type to
* extract, a non-initial token inside the type to extract, or a token
* outside the type to extract.
*
* @author William Cohen
*/
public class BeginContinueOutsideReduction extends Extraction2TaggingReduction
		implements Serializable{

	private static final long serialVersionUID=1;

	/** Suffix appended to a base tag to mark the first token of a span. */
	private static final String BEGIN_SUFFIX="Begin";

	/** Suffix appended to a base tag to mark every non-initial token of a span. */
	private static final String CONTINUE_SUFFIX="Continue";

	// saves result of last reduction; transient, so it is null on a
	// deserialized instance until reduceExtraction2Tagging is called again
	transient private NestedTextLabels taggedLabels;

	// name of the token property that carries the Begin/Continue tag
	private String tokenProp="_entityPart";

	// all tag values that were used (base tags, without the Begin/Continue suffix)
	private Set<String> tagset=new HashSet<String>();

	// true when the last reduction was driven by a span type,
	// false when it was driven by a span property
	private boolean useSpanType=true;

	/**
	 * Reduces the extraction task described by the example to a
	 * token-tagging task. The result can be read back with
	 * {@link #getTaggedLabels()}.
	 *
	 * @param example supplies the document span, the labels, and the
	 * input span type or input span property
	 */
	@Override
	public void reduceExtraction2Tagging(AnnotationExample example){
		reduceDocument(example.getDocumentSpan(),example.getLabels(),example
				.getInputType(),example.getInputProp());
	}

	/**
	 * Tags the tokens of one document. A non-null spanType selects
	 * span-type mode, otherwise the spans are found via spanProp and
	 * the property value of each span is used as its base tag.
	 */
	private void reduceDocument(Span doc,TextLabels labels,String spanType,
			String spanProp){
		useSpanType=spanType!=null;
		taggedLabels=new NestedTextLabels(labels);
		// label all tokens as NEG
		assignDefaultLabels(doc,taggedLabels,spanType,spanProp);
		String id=doc.getDocumentId();
		Iterator<Span> i=
				useSpanType?taggedLabels.instanceIterator(spanType,id):taggedLabels
						.getSpansWithProperty(spanProp,id);
		while(i.hasNext()){
			Span span=i.next();
			String baseTag=
					useSpanType?spanType:taggedLabels.getProperty(span,spanProp);
			tagset.add(baseTag);
			// first token of the span gets the Begin tag ...
			taggedLabels.setProperty(span.getToken(0),tokenProp,baseTag+BEGIN_SUFFIX);
			// ... and every later token gets the Continue tag
			String contTag=baseTag+CONTINUE_SUFFIX;
			for(int j=1;j<span.size();j++){
				taggedLabels.setProperty(span.getToken(j),tokenProp,contTag);
			}
		}
	}

	/** @return the name of the token property that holds the tags */
	@Override
	public String getTokenProp(){
		return tokenProp;
	}

	/**
	 * @return a new set holding, for every base tag seen so far, the
	 * corresponding Begin and Continue tag values
	 */
	@Override
	public Set<String> getNonDefaultTagValues(){
		Set<String> result=new HashSet<String>();
		for(String baseTag:tagset){
			result.add(baseTag+BEGIN_SUFFIX);
			result.add(baseTag+CONTINUE_SUFFIX);
		}
		return result;
	}

	/**
	 * @return the labels produced by the last reduction, or null if
	 * no reduction has been run on this instance
	 */
	@Override
	public TextLabels getTaggedLabels(){
		return taggedLabels;
	}

	/**
	 * Uses the token tags in the given labels to solve the extraction
	 * problem: builds a Mixup program that defines the span type
	 * (span-type mode) or span property (span-property mode) named by
	 * output, and evaluates it against the given labels.
	 *
	 * <p>If no base tag has been recorded yet, the program is empty in
	 * both modes.
	 *
	 * @param output name of the span type or span property to define
	 * @param taggedLabels labels to evaluate the program on; note that
	 * this parameter shadows the field of the same name
	 * @throws IllegalStateException if the generated Mixup statement
	 * cannot be parsed; the parse exception is attached as the cause
	 */
	@Override
	public void extractFromTags(String output,MonotonicTextLabels taggedLabels){
		try{
			MixupProgram p=new MixupProgram();
			if(useSpanType){
				// guard against an empty tagset, which would otherwise make
				// iterator().next() throw NoSuchElementException
				if(!tagset.isEmpty()){
					String baseTag=tagset.iterator().next();
					p.addStatement("defSpanType "+output+" =: "+makePattern(baseTag));
				}
			}else{
				for(String baseTag:tagset){
					p.addStatement("defSpanProp "+output+":"+baseTag+" =: "+
							makePattern(baseTag));
				}
			}
			MixupInterpreter interp=new MixupInterpreter(p);
			interp.eval(taggedLabels);
		}catch(Mixup.ParseException ex){
			// keep the original exception as the cause so its stack trace survives
			throw new IllegalStateException("mixup error: "+ex,ex);
		}
	}

	/**
	 * Builds the Mixup pattern text for one base tag: it references the
	 * Begin tag once and the Continue tag with a trailing '*', both as
	 * values of the token property.
	 */
	private String makePattern(String baseTag){
		String p=tokenProp+":"+baseTag;
		return "... ["+p+BEGIN_SUFFIX+" L "+p+CONTINUE_SUFFIX+"* R ] ... ";
	}
}