/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.grmm.learning.extract;
import java.util.Iterator;
import cc.mallet.extract.*;
import cc.mallet.grmm.learning.ACRF;
import cc.mallet.grmm.util.SliceLabelsSequence;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.iterator.PipeInputIterator;
import cc.mallet.types.*;
/**
* Created: Mar 1, 2005
*
* @author <A HREF="mailto:casutton@cs.umass.edu>casutton@cs.umass.edu</A>
* @version $Id: ACRFExtractor.java,v 1.1 2007/10/22 21:38:02 mccallum Exp $
*/
public class ACRFExtractor implements Extractor {
private ACRF acrf;
private Pipe tokPipe;
private Pipe featurePipe;
private int slice = 0;
private String backgroundTag = "O";
private TokenizationFilter filter;
public ACRFExtractor (ACRF acrf, Pipe tokPipe, Pipe featurePipe)
{
this.acrf = acrf;
this.tokPipe = tokPipe;
this.featurePipe = featurePipe;
this.filter = new BIOTokenizationFilter ();
}
public Extraction extract (Object o)
{
throw new UnsupportedOperationException ("Not yet implemented");
}
public Extraction extract (Tokenization toks)
{
throw new UnsupportedOperationException ("Not yet implemented");
}
public Extraction extract (Iterator<Instance> source)
{
Extraction extraction = new Extraction (this, getTargetAlphabet ());
// Put all the instances through both pipes, then get viterbi path
InstanceList tokedList = new InstanceList (tokPipe);
tokedList.addThruPipe (source);
InstanceList pipedList = new InstanceList (getFeaturePipe ());
pipedList.addThruPipe (tokedList.iterator());
Iterator<Instance> it1 = tokedList.iterator ();
Iterator<Instance> it2 = pipedList.iterator ();
while (it1.hasNext()) {
Instance toked = it1.next();
Instance piped = it2.next();
Tokenization tok = (Tokenization) toked.getData();
String name = piped.getName().toString();
Sequence target = (Sequence) piped.getTarget ();
LabelsSequence output = acrf.getBestLabels (piped);
LabelSequence ls = SliceLabelsSequence.sliceLabelsSequence (output, slice);
LabelSequence lsTarget = SliceLabelsSequence.sliceLabelsSequence
((LabelsSequence) target, slice);
DocumentExtraction docseq = new DocumentExtraction (name, getTargetAlphabet (), tok,
ls, lsTarget, backgroundTag, filter);
extraction.addDocumentExtraction (docseq);
}
return extraction;
}
// Experimental: Extract from training lists
public Extraction extract (InstanceList testing)
{
Extraction extraction = new Extraction (this, getTargetAlphabet ());
for (int i = 0; i < testing.size(); i++) {
Instance instance = testing.get (i);
Tokenization tok = (Tokenization) instance.getProperty ("TOKENIZATION");
if (tok == null)
throw new IllegalArgumentException
("To use extract(InstanceList), must save the Tokenization!");
String name = instance.getName ().toString ();
Sequence target = (Sequence) instance.getTarget ();
Sequence output = acrf.getBestLabels (instance);
DocumentExtraction docseq = new DocumentExtraction (name, getTargetAlphabet (), tok,
output, target, backgroundTag, filter);
extraction.addDocumentExtraction (docseq);
}
return extraction;
}
public Pipe getFeaturePipe ()
{
return featurePipe;
}
public Pipe getTokenizationPipe ()
{
return tokPipe;
}
public void setTokenizationPipe (Pipe pipe)
{
tokPipe = pipe;
}
public Alphabet getInputAlphabet ()
{
return acrf.getInputAlphabet ();
}
public LabelAlphabet getTargetAlphabet ()
{
return (LabelAlphabet) acrf.getInputPipe ().getTargetAlphabet ();
}
public ACRF getAcrf ()
{
return acrf;
}
public void setSlice (int sl) { slice = sl; }
public void setTokenizationFilter (TokenizationFilter filter)
{
this.filter = filter;
}
}