/* Copyright 2003, Carnegie Mellon, All Rights Reserved */ package edu.cmu.minorthird.text.learn; import java.io.Serializable; import java.util.Iterator; import java.util.Set; import edu.cmu.minorthird.classify.BasicDataset; import edu.cmu.minorthird.classify.ClassLabel; import edu.cmu.minorthird.classify.Dataset; import edu.cmu.minorthird.classify.Example; import edu.cmu.minorthird.classify.SampleDatasets; import edu.cmu.minorthird.text.BasicTextBase; import edu.cmu.minorthird.text.EmptyLabels; import edu.cmu.minorthird.text.Span; import edu.cmu.minorthird.text.TextLabels; import edu.cmu.minorthird.util.StringUtil; import edu.cmu.minorthird.util.gui.ViewerFrame; /** * Some sample feature extractors. * * @author William Cohen */ public class SampleFE{ /** * Simple bag of words feature extractor. */ public static final AnnotatedSpanFE BAG_OF_WORDS=new BagOfWordsFE(); public static class BagOfWordsFE extends AnnotatedSpanFE implements Serializable{ static final long serialVersionUID=20080306L; @Override public void extractFeatures(TextLabels labels,Span s){ from(s).tokens().emit(); } } /** * Simple bag of words feature extractor, with all tokens converted to lower * case. */ public static final AnnotatedSpanFE BAG_OF_LC_WORDS= new BagOfLowerCaseWordsFE(); public static class BagOfLowerCaseWordsFE extends AnnotatedSpanFE implements Serializable{ static final long serialVersionUID=20080306L; @Override public void extractFeatures(TextLabels labels,Span s){ from(s).tokens().eq().lc().emit(); } } /** * A simple extraction-oriented feature extractor to apply to one-token spans, * for extraction tasks. */ public static final AnnotatedSpanFE makeExtractionFE( final int featureWindowSize){ ExtractionFE fe=new ExtractionFE(); fe.setFeatureWindowSize(featureWindowSize); return fe; } /** * An extraction-oriented feature extractor to apply to one-token spans, for * extraction tasks. */ public static class ExtractionFE extends AnnotatedSpanFE{ static final long serialVersionUID=20080306L; protected int windowSize=5; protected boolean useCharType=true; protected boolean useCompressedCharType=true; protected String[] tokenPropertyFeatures=new String[0]; public ExtractionFE(){ this(3); } public ExtractionFE(int windowSize){ this.windowSize=windowSize; } // // getters and setters // /** * Specify the number of tokens on before and after the span to emit * features for. */ public void setFeatureWindowSize(int n){ windowSize=n; } public int getFeatureWindowSize(){ return windowSize; } /** * If set to true, produce features like "token.charTypePattern.Aaaa" for * the word "Bill" */ public void setUseCharType(boolean flag){ useCharType=flag; } public boolean getUseCharType(){ return useCharType; } /** * If set to true, produce features like "token.charTypePattern.Aa+" for the * word "Bill". */ public void setUseCompressedCharType(boolean flag){ useCompressedCharType=flag; } public boolean getUseCompressedCharType(){ return useCompressedCharType; } /** * Specify the token properties from the TextLabels environment that will be * used as features. A value of '*' means to use all defined token * properties. */ public void setTokenPropertyFeatures(String commaSeparatedTokenPropertyList){ if("*".equals(commaSeparatedTokenPropertyList)){ // System.out.println("setting properties to null"); tokenPropertyFeatures=null; }else{ tokenPropertyFeatures=commaSeparatedTokenPropertyList.split(",\\s*"); } } public String getTokenPropertyFeatures(){ return StringUtil.toString(tokenPropertyFeatures); } public void setTokenPropertyFeatures(Set<String> propertySet){ tokenPropertyFeatures= propertySet.toArray(new String[propertySet.size()]); } @Override public void extractFeatures(Span s){ extractFeatures(new EmptyLabels(),s); } @Override public void extractFeatures(TextLabels labels,Span s){ requireMyAnnotation(labels); if(tokenPropertyFeatures==null){ System.out.println("setTokenPropertyFeatures to the set "+ labels.getTokenProperties()); setTokenPropertyFeatures(labels.getTokenProperties()); } // tokens in span from(s).tokens().eq().lc().emit(); // simplified capitalization pattern if(useCompressedCharType){ from(s).tokens().eq().charTypePattern().emit(); } // exact capitalization pattern if(useCharType){ from(s).tokens().eq().charTypes().emit(); } // token properties for(int j=0;j<tokenPropertyFeatures.length;j++){ from(s).tokens().prop(tokenPropertyFeatures[j]).emit(); } // window for(int i=0;i<windowSize;i++){ from(s).left().token(-i-1).eq().lc().emit(); from(s).right().token(i).eq().lc().emit(); for(int j=0;j<tokenPropertyFeatures.length;j++){ // System.out.println("Property: "+tokenPropertyFeatures[j]); from(s).left().token(-i-1).prop(tokenPropertyFeatures[j]).emit(); from(s).right().token(i).prop(tokenPropertyFeatures[j]).emit(); } if(useCompressedCharType){ from(s).left().token(-i-1).eq().charTypePattern().emit(); from(s).right().token(i).eq().charTypePattern().emit(); } if(useCharType){ from(s).left().token(-i-1).eq().charTypes().emit(); from(s).right().token(i).eq().charTypes().emit(); } } } } /** * A feature extractor that pre-loads a mixup file or some other type of * annotation. */ public static abstract class AnnotatedSpanFE extends SpanFE{ static final long serialVersionUID=20081125L; } /** * Test case to try out the feature extractors */ public static void main(String[] args){ try{ SpanFeatureExtractor fe=BAG_OF_LC_WORDS; BasicTextBase base=new BasicTextBase(); for(int i=0;i<SampleDatasets.posTrain.length;i++){ base.loadDocument("pos"+i,SampleDatasets.posTrain[i]); } for(int i=0;i<SampleDatasets.negTrain.length;i++){ base.loadDocument("neg"+i,SampleDatasets.negTrain[i]); } Dataset dataset=new BasicDataset(); for(Iterator<Span> i=base.documentSpanIterator();i.hasNext();){ Span s=i.next(); String id=s.getDocumentId(); ClassLabel label=ClassLabel.binaryLabel(id.startsWith("pos")?+1:-1); TextLabels textLabels=new EmptyLabels(); dataset.add(new Example(fe.extractInstance(textLabels,s),label)); } new ViewerFrame("Toy data",dataset.toGUI()); }catch(Exception e){ e.printStackTrace(); } } }