package edu.cmu.minorthird.text.learn;
import java.util.Iterator;
import org.apache.log4j.Logger;
import edu.cmu.minorthird.classify.ClassLabel;
import edu.cmu.minorthird.classify.Example;
import edu.cmu.minorthird.classify.sequential.SequenceDataset;
import edu.cmu.minorthird.text.Annotator;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.TextLabels;
import edu.cmu.minorthird.text.Token;
import edu.cmu.minorthird.ui.Recommended;
/**
 * Learn an annotation model using a sequence dataset and some sort of
 * batch learner.
 *
 * <p>Each answered query passed to {@link #setAnswer(AnnotationExample)} is
 * converted into one sequence of per-token examples and buffered in a
 * {@link SequenceDataset}. Subclasses implement {@link #getAnnotator()} to
 * produce the learned annotator.
 *
 * @author William Cohen
 */
public abstract class AbstractBatchAnnotatorLearner extends AnnotatorLearner
{
    private static final Logger log = Logger.getLogger(AbstractBatchAnnotatorLearner.class);

    /** Feature extractor applied to each single-token span. */
    protected SpanFeatureExtractor fe;

    /** The spanType of the annotation produced by the learned annotator. */
    protected String annotationType = "_prediction";

    /** Buffer of token sequences built from the answered queries. */
    protected SequenceDataset seqData;

    /** Scheme for reducing extraction to a token-classification problem. */
    protected Extraction2TaggingReduction reduction;

    /**
     * Create a learner using a {@link Recommended.TokenFE} feature extractor
     * and an {@link InsideOutsideReduction}.
     */
    public AbstractBatchAnnotatorLearner() {
        this(new Recommended.TokenFE(), new InsideOutsideReduction());
    }

    /**
     * Create a learner with an empty sequence dataset.
     *
     * @param fe feature extractor used for tokens
     * @param reduction scheme for reducing extraction to a token-classification problem
     */
    public AbstractBatchAnnotatorLearner(SpanFeatureExtractor fe, Extraction2TaggingReduction reduction) {
        this.reduction = reduction;
        this.fe = fe;
        seqData = new SequenceDataset();
    }

    /** Discard the buffered data by replacing it with a new, empty sequence dataset. */
    @Override
    public void reset() {
        seqData = new SequenceDataset();
    }

    /** Scheme for reducing extraction to a token-classification problem */
    public Extraction2TaggingReduction getTaggingReduction() {
        return reduction;
    }

    /**
     * Set the scheme for reducing extraction to a token-classification problem.
     *
     * @param reduction the reduction to use for subsequent answers
     */
    public void setTaggingReduction(Extraction2TaggingReduction reduction) {
        this.reduction = reduction;
    }

    /** Return a one-line help string describing the tagging-reduction property. */
    public String getTaggingReductionHelp() {
        return "Scheme for reducing extraction to a token-classification problem";
    }

    /** Feature extractor used for tokens */
    @Override
    public SpanFeatureExtractor getSpanFeatureExtractor() {
        return fe;
    }

    /**
     * Set the feature extractor used for tokens.
     *
     * @param fe the feature extractor to use for subsequent answers
     */
    @Override
    public void setSpanFeatureExtractor(SpanFeatureExtractor fe) {
        this.fe = fe;
    }

    /** The spanType of the annotation produced by the learned annotator. */
    @Override
    public void setAnnotationType(String s) {
        annotationType = s;
    }

    /** Return the spanType of the annotation produced by the learned annotator. */
    @Override
    public String getAnnotationType() {
        return annotationType;
    }

    //
    // buffer data
    //

    // temporary storage; stays null until setDocumentPool is called
    private Iterator<Span> documentLooper;

    /** Accept the pool of unlabeled documents. */
    @Override
    public void setDocumentPool(Iterator<Span> documentLooper) {
        this.documentLooper = documentLooper;
    }

    /**
     * Ask for labels on every document.
     *
     * @return true if the document pool has another document; false if the
     *         pool is exhausted or no pool has been set
     */
    @Override
    public boolean hasNextQuery() {
        // Without a pool there is nothing to query; report that instead of
        // failing with a NullPointerException.
        return documentLooper != null && documentLooper.hasNext();
    }

    /**
     * Return the next unlabeled document.
     *
     * @throws IllegalStateException if no document pool has been set
     */
    @Override
    public Span nextQuery() {
        if (documentLooper == null) {
            throw new IllegalStateException("setDocumentPool must be called before nextQuery");
        }
        return documentLooper.next();
    }

    /**
     * Accept the answer to the last query.
     *
     * <p>The answer is run through the tagging reduction, then every token
     * of the document is turned into an example whose features come from the
     * feature extractor and whose class label is the token property assigned
     * by the reduction. The resulting sequence is added to the sequence
     * dataset. If any token has no value for that property, a warning is
     * logged and the whole document is skipped.
     *
     * @param answeredQuery the labeled document
     */
    @Override
    public void setAnswer(AnnotationExample answeredQuery) {
        reduction.reduceExtraction2Tagging(answeredQuery);
        TextLabels answerLabels = reduction.getTaggedLabels();
        Span document = answeredQuery.getDocumentSpan();
        final int numTokens = document.size();
        Example[] sequence = new Example[numTokens];
        for (int i = 0; i < numTokens; i++) {
            Token tok = document.getToken(i);
            String value = answerLabels.getProperty(tok, reduction.getTokenProp());
            if (value == null) {
                // An unlabeled token makes the sequence unusable, so drop the
                // whole document rather than add a partial sequence.
                log.warn("ignoring "+document.getDocumentId()+" because token "+i+" is not labeled");
                return;
            }
            ClassLabel classLabel = new ClassLabel(value);
            Span tokenSpan = document.subSpan(i, 1);
            sequence[i] = new Example(fe.extractInstance(answeredQuery.getLabels(), tokenSpan), classLabel);
        }
        seqData.addSequence(sequence);
    }

    /** Return the learned annotator.
     */
    @Override
    abstract public Annotator getAnnotator();

    /** Get the constructed sequence data.
     */
    public SequenceDataset getSequenceDataset() {
        return seqData;
    }
}