import java.util.Iterator;
import edu.cmu.minorthird.classify.Feature;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.TextLabels;
import edu.cmu.minorthird.text.learn.SpanFE;
/**
* A sample feature extractor, intended for identifying person names.
* To try it out, compile it, put it on your classpath, and run the
* command:
*
* <pre>
* java edu.cmu.minorthird.ui.TrainTestClassifier -labels sample1.train -test sample1.test -fe "new MyFE()" -spanType trueName -candidateType bigram -showResult
* </pre>
*/
public class MyFE extends SpanFE
{
    static final long serialVersionUID=20080224L;

    /** Placeholder written into the noun-tag pattern for a token that has no noun-like tag. */
    private static final String NO_NOUN_TAG = "-";

    /**
     * Adds the features describing one candidate span to 'instance'.
     * Below, 'instance' is a MutableInstance that can be extended.
     *
     * @param labels the labels for the text containing the span; POS
     *               annotations are requested from it via require("pos",null)
     * @param span   the candidate span to extract features from
     */
    public void extractFeatures(TextLabels labels,Span span)
    {
        // some examples of the instance-extraction sublanguage

        // add features for lower-case versions of all tokens in the span
        from(span).tokens().eq().lc().emit();

        // add features for the tokens to the left and right:
        // -1 is the last token in the left() span,
        //  0 is the first token in the right() span
        from(span).left().token(-1).eq().emit();
        from(span).right().token(0).eq().emit();

        // the 'charTypePattern' for the span itself
        from(span).eq().charTypePattern().emit();

        // the first token in the span
        from(span).token(0).eq().emit();

        // the capitalization of the tokens in the span
        from(span).tokens().prop("cap").emit();

        // an example of a numeric feature
        instance.addNumeric( new Feature("lengthInChars"), lengthInChars(span) );

        // a complex feature based on POS tags: ask for POS annotations
        // first (null means use the default tagger), then add one binary
        // feature summarizing the noun-like tags found inside the span
        labels.require("pos",null);
        instance.addBinary( new Feature("nounTags."+nounTagPattern(labels,span)) );

        // another complex feature
        if (looksLikeName(span)) {
            instance.addBinary( new Feature("looksLikeName") );
        }
    }

    /**
     * Builds a string with one entry per token of the span: the first
     * type (in the iteration order of labels.getTypes()) that starts with
     * "N" and is held by that token, or "-" if there is none.
     *
     * POS annotations are stored as span types on one-token-long spans,
     * so each token is tested as its own sub-span.
     */
    private static String nounTagPattern(TextLabels labels,Span span)
    {
        // StringBuilder avoids re-copying the growing pattern on every token
        StringBuilder pattern = new StringBuilder();
        for (int i=0; i<span.size(); i++) {
            Span tokenSpan = span.subSpan(i,1);
            String nounTag = NO_NOUN_TAG;
            for (Iterator<String> types=labels.getTypes().iterator(); types.hasNext(); ) {
                String posType = types.next();
                if (posType.startsWith("N") && labels.hasType(tokenSpan,posType)) {
                    nounTag = posType;
                    break; // only the first matching tag is recorded per token
                }
            }
            pattern.append(nounTag);
        }
        return pattern.toString();
    }

    /**
     * Returns true when the span has exactly two tokens and both start
     * with an upper-case character.
     */
    private static boolean looksLikeName(Span span)
    {
        if (span.size()!=2) {
            return false;
        }
        return startsUpperCase(span.getToken(0).getValue())
            && startsUpperCase(span.getToken(1).getValue());
    }

    /**
     * Returns true when the text is non-empty and its first character is
     * upper case; the length check keeps charAt(0) from throwing on an
     * empty string.
     */
    private static boolean startsUpperCase(String text)
    {
        return text.length()>0 && Character.isUpperCase(text.charAt(0));
    }

    /** Returns the number of characters in the span's string form, as a double. */
    private double lengthInChars(Span span)
    {
        return span.asString().length();
    }
}