/* Copyright 2003, Carnegie Mellon, All Rights Reserved */
package edu.cmu.minorthird.text.learn;
import java.io.Serializable;
import java.util.Iterator;
import java.util.Set;
import edu.cmu.minorthird.classify.BasicDataset;
import edu.cmu.minorthird.classify.ClassLabel;
import edu.cmu.minorthird.classify.Dataset;
import edu.cmu.minorthird.classify.Example;
import edu.cmu.minorthird.classify.SampleDatasets;
import edu.cmu.minorthird.text.BasicTextBase;
import edu.cmu.minorthird.text.EmptyLabels;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.TextLabels;
import edu.cmu.minorthird.util.StringUtil;
import edu.cmu.minorthird.util.gui.ViewerFrame;
/**
* Some sample feature extractors.
*
* @author William Cohen
*/
public class SampleFE{
/**
 * Feature extractor behind {@link #BAG_OF_WORDS}: for each span it emits
 * the span's tokens as features (a plain bag of words), without consulting
 * the supplied labels.
 */
public static class BagOfWordsFE extends AnnotatedSpanFE
        implements Serializable{

    static final long serialVersionUID=20080306L;

    /**
     * Emits the tokens of <code>s</code> as features.
     *
     * @param labels not used by this extractor
     * @param s the span to extract features from
     */
    @Override
    public void extractFeatures(TextLabels labels,Span s){
        from(s).tokens().emit();
    }
}

/**
 * Simple bag of words feature extractor (shared instance).
 */
public static final AnnotatedSpanFE BAG_OF_WORDS=new BagOfWordsFE();
/**
 * Feature extractor behind {@link #BAG_OF_LC_WORDS}: a bag of words in
 * which every token is converted to lower case before being emitted. The
 * supplied labels are not consulted.
 */
public static class BagOfLowerCaseWordsFE extends AnnotatedSpanFE
        implements Serializable{

    static final long serialVersionUID=20080306L;

    /**
     * Emits the lower-cased tokens of <code>s</code> as features.
     *
     * @param labels not used by this extractor
     * @param s the span to extract features from
     */
    @Override
    public void extractFeatures(TextLabels labels,Span s){
        from(s).tokens().eq().lc().emit();
    }
}

/**
 * Simple bag of words feature extractor, with all tokens converted to lower
 * case (shared instance).
 */
public static final AnnotatedSpanFE BAG_OF_LC_WORDS=
        new BagOfLowerCaseWordsFE();
/**
 * Factory for a simple extraction-oriented feature extractor, meant to be
 * applied to one-token spans in extraction tasks.
 *
 * @param featureWindowSize number of tokens on each side of the span for
 *        which the returned extractor emits features
 * @return a new {@link ExtractionFE} whose feature window size is
 *         <code>featureWindowSize</code>; all other settings keep their
 *         defaults
 */
public static final AnnotatedSpanFE makeExtractionFE(int featureWindowSize){
    // The one-argument constructor stores the window size directly, which
    // leaves the extractor in the same state as default-construct + setter.
    return new ExtractionFE(featureWindowSize);
}
/**
 * An extraction-oriented feature extractor to apply to one-token spans, for
 * extraction tasks.
 *
 * <p>For the tokens inside the span, and for each of the
 * <code>windowSize</code> tokens to its left and right, it emits the
 * lower-cased token, optional character-type features, and one feature per
 * configured token property.
 */
public static class ExtractionFE extends AnnotatedSpanFE{

    static final long serialVersionUID=20080306L;

    /** Window size used by the no-argument constructor. */
    private static final int DEFAULT_WINDOW_SIZE=3;

    /** Number of tokens on each side of the span to emit features for. */
    protected int windowSize;

    /** Whether to emit features built with <code>charTypes()</code>. */
    protected boolean useCharType=true;

    /** Whether to emit features built with <code>charTypePattern()</code>. */
    protected boolean useCompressedCharType=true;

    /**
     * Names of the token properties used as features. <code>null</code>
     * means "all properties" ('*'); it is replaced by the concrete property
     * set of the first TextLabels passed to
     * {@link #extractFeatures(TextLabels,Span)}.
     */
    protected String[] tokenPropertyFeatures=new String[0];

    /** Creates an extractor with a feature window of 3 tokens per side. */
    public ExtractionFE(){
        this(DEFAULT_WINDOW_SIZE);
    }

    /**
     * Creates an extractor with the given feature window size.
     *
     * @param windowSize number of tokens before and after the span to emit
     *        features for
     */
    public ExtractionFE(int windowSize){
        this.windowSize=windowSize;
    }

    //
    // getters and setters
    //

    /**
     * Specify the number of tokens before and after the span to emit
     * features for.
     */
    public void setFeatureWindowSize(int n){
        windowSize=n;
    }

    /** @return the number of tokens on each side of the span that are used */
    public int getFeatureWindowSize(){
        return windowSize;
    }

    /**
     * If set to true, emit the exact character-type sequence of each token
     * (produced by <code>charTypes()</code>), e.g. a pattern like "Aaaa" for
     * the word "Bill".
     */
    public void setUseCharType(boolean flag){
        useCharType=flag;
    }

    public boolean getUseCharType(){
        return useCharType;
    }

    /**
     * If set to true, emit the compressed character-type pattern of each
     * token (produced by <code>charTypePattern()</code>), e.g. a pattern
     * like "Aa+" for the word "Bill".
     */
    public void setUseCompressedCharType(boolean flag){
        useCompressedCharType=flag;
    }

    public boolean getUseCompressedCharType(){
        return useCompressedCharType;
    }

    /**
     * Specify the token properties from the TextLabels environment that will
     * be used as features. A value of '*' means to use all defined token
     * properties.
     *
     * <p>Names are separated by commas; whitespace around a name is
     * ignored and empty entries are dropped, so <code>null</code>, "" and
     * "a,,b" never produce an empty property name.
     *
     * @param commaSeparatedTokenPropertyList comma-separated property
     *        names, or "*" for all properties; <code>null</code> is treated
     *        like the empty list
     */
    public void setTokenPropertyFeatures(String commaSeparatedTokenPropertyList){
        String spec=
            commaSeparatedTokenPropertyList==null?"":commaSeparatedTokenPropertyList.trim();
        if("*".equals(spec)){
            // resolved lazily against the first TextLabels seen
            tokenPropertyFeatures=null;
        }else{
            tokenPropertyFeatures=parsePropertyNames(spec);
        }
    }

    /**
     * @return the configured property names rendered by
     *         <code>StringUtil.toString</code>, or "*" while the extractor
     *         is still set to "all properties" and has not yet seen any
     *         labels
     */
    public String getTokenPropertyFeatures(){
        if(tokenPropertyFeatures==null){
            // Mirror the wildcard accepted by the setter instead of handing
            // a null array to StringUtil.
            return "*";
        }
        return StringUtil.toString(tokenPropertyFeatures);
    }

    /**
     * Use exactly the given property names as token-property features.
     *
     * @param propertySet names of the token properties to use
     */
    public void setTokenPropertyFeatures(Set<String> propertySet){
        tokenPropertyFeatures=propertySet.toArray(new String[0]);
    }

    /** Extracts features for <code>s</code> against an empty label set. */
    @Override
    public void extractFeatures(Span s){
        extractFeatures(new EmptyLabels(),s);
    }

    /**
     * Emits features for the tokens of <code>s</code> and for the
     * surrounding window of tokens.
     *
     * @param labels labels supplying token properties; also used to resolve
     *        the '*' property setting on first use
     * @param s the span to extract features from
     */
    @Override
    public void extractFeatures(TextLabels labels,Span s){
        requireMyAnnotation(labels);
        if(tokenPropertyFeatures==null){
            // '*' was requested: bind to the properties defined by these
            // labels. The binding is kept for all later calls.
            System.out.println("setTokenPropertyFeatures to the set "+
                labels.getTokenProperties());
            setTokenPropertyFeatures(labels.getTokenProperties());
        }
        final String[] props=tokenPropertyFeatures;

        // tokens in span
        from(s).tokens().eq().lc().emit();
        // simplified capitalization pattern
        if(useCompressedCharType){
            from(s).tokens().eq().charTypePattern().emit();
        }
        // exact capitalization pattern
        if(useCharType){
            from(s).tokens().eq().charTypes().emit();
        }
        // token properties
        for(String prop:props){
            from(s).tokens().prop(prop).emit();
        }

        // window: the i-th token to the left (index -i-1) and to the right
        // (index i) of the span
        for(int i=0;i<windowSize;i++){
            from(s).left().token(-i-1).eq().lc().emit();
            from(s).right().token(i).eq().lc().emit();
            for(String prop:props){
                from(s).left().token(-i-1).prop(prop).emit();
                from(s).right().token(i).prop(prop).emit();
            }
            if(useCompressedCharType){
                from(s).left().token(-i-1).eq().charTypePattern().emit();
                from(s).right().token(i).eq().charTypePattern().emit();
            }
            if(useCharType){
                from(s).left().token(-i-1).eq().charTypes().emit();
                from(s).right().token(i).eq().charTypes().emit();
            }
        }
    }

    /** Splits a comma-separated list into trimmed, non-empty names. */
    private static String[] parsePropertyNames(String spec){
        String[] pieces=spec.split(",");
        String[] kept=new String[pieces.length];
        int count=0;
        for(String piece:pieces){
            String name=piece.trim();
            if(name.length()>0){
                kept[count++]=name;
            }
        }
        String[] names=new String[count];
        System.arraycopy(kept,0,names,0,count);
        return names;
    }
}
/**
 * Common base class of the sample extractors in this file; described by its
 * author as a feature extractor that pre-loads a mixup file or some other
 * type of annotation.
 *
 * <p>The class itself declares nothing beyond its serialization id; any
 * annotation handling is presumably inherited from SpanFE (not visible
 * here).
 */
public abstract static class AnnotatedSpanFE extends SpanFE{

    static final long serialVersionUID=20081125L;

}
/**
 * Test driver to try out the feature extractors: loads the sample positive
 * and negative training documents into a text base, turns every document
 * span into a binary-labelled example using {@link #BAG_OF_LC_WORDS}, and
 * shows the resulting dataset in a viewer window.
 *
 * @param args ignored
 */
public static void main(String[] args){
    try{
        SpanFeatureExtractor extractor=BAG_OF_LC_WORDS;
        BasicTextBase textBase=new BasicTextBase();

        // Document ids carry the class: "pos<k>" or "neg<k>".
        int p=0;
        while(p<SampleDatasets.posTrain.length){
            textBase.loadDocument("pos"+p,SampleDatasets.posTrain[p]);
            p++;
        }
        int n=0;
        while(n<SampleDatasets.negTrain.length){
            textBase.loadDocument("neg"+n,SampleDatasets.negTrain[n]);
            n++;
        }

        Dataset examples=new BasicDataset();
        Iterator<Span> spans=textBase.documentSpanIterator();
        while(spans.hasNext()){
            Span docSpan=spans.next();
            String docId=docSpan.getDocumentId();
            // Recover the class from the id prefix assigned above.
            int polarity;
            if(docId.startsWith("pos")){
                polarity=+1;
            }else{
                polarity=-1;
            }
            ClassLabel classLabel=ClassLabel.binaryLabel(polarity);
            TextLabels noLabels=new EmptyLabels();
            examples.add(
                new Example(extractor.extractInstance(noLabels,docSpan),classLabel));
        }

        new ViewerFrame("Toy data",examples.toGUI());
    }catch(Exception e){
        e.printStackTrace();
    }
}
}