package edu.cmu.minorthird.ui;

import java.io.Serializable;
import java.util.Set;

import edu.cmu.minorthird.classify.BatchVersion;
import edu.cmu.minorthird.classify.algorithms.trees.AdaBoost;
import edu.cmu.minorthird.classify.sequential.CMMLearner;
import edu.cmu.minorthird.classify.sequential.CRFLearner;
import edu.cmu.minorthird.classify.sequential.CollinsPerceptronLearner;
import edu.cmu.minorthird.classify.sequential.HMMLearner;
import edu.cmu.minorthird.classify.sequential.SegmentCRFLearner;
import edu.cmu.minorthird.classify.sequential.SegmentCollinsPerceptronLearner;
import edu.cmu.minorthird.classify.transform.TFIDFTransformLearner;
import edu.cmu.minorthird.classify.transform.TransformingBatchLearner;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.TextLabels;
import edu.cmu.minorthird.text.learn.BeginContinueEndUniqueReduction;
import edu.cmu.minorthird.text.learn.ConditionalSemiMarkovModel;
import edu.cmu.minorthird.text.learn.InsideOutsideReduction;
import edu.cmu.minorthird.text.learn.SegmentAnnotatorLearner;
import edu.cmu.minorthird.text.learn.SequenceAnnotatorLearner;
import edu.cmu.minorthird.text.learn.SpanFE;
import edu.cmu.minorthird.util.BasicCommandLineProcessor;
import edu.cmu.minorthird.util.CommandLineProcessor;
import edu.cmu.minorthird.util.JointCommandLineProcessor;
import edu.cmu.minorthird.util.StringUtil;

/**
 * In Minorthird it is possible to build up constructs like learners,
 * feature extractors, and so on compositionally, out of simpler
 * pieces.  This class contains pre-configured "recommended" instances
 * of some common learners, feature extractors, etc.
 *
 * @author William Cohen
 */
public class Recommended{

	//
	// classifier learners
	//

	/**
	 * K-NN learner following Yang and Chute.  This uses a TFIDF
	 * transformation of the features, and averages the scores of the 30
	 * nearest neighbors, with scores weighted by TFIDF distance to the
	 * instance being classified.  Unlike many of the learners in the
	 * 'recommended' package, this is a non-binary (multi-class)
	 * learner.
	 *
	 * <p>Training for this learner is very fast, but classification
	 * time is rather slow.
	 *
	 * <p>Reference: Y. Yang and C.G. Chute, <i>An example-based mapping
	 * method for text classification and retrieval</i>,
	 * ACM Transactions on Information Systems, 3(12), 1994.
	 */
	static public class KnnLearner extends TransformingBatchLearner{

		public KnnLearner(){
			// TFIDF-transform the instances, then classify with 30-NN
			// (one epoch of the batch wrapper).
			super(new TFIDFTransformLearner(),new BatchVersion(
					new edu.cmu.minorthird.classify.algorithms.knn.KnnLearner(30),1));
		}
	}

	/**
	 * Multinomial Naive Bayes, as in McCallum's Rainbow package.
	 *
	 * <p>This is one of the fastest learners, but because of the strong
	 * independence assumptions, it often has a higher-than-necessary
	 * error rate.
	 *
	 * <p>References: Andrew McCallum and Kamal Nigam, <i>A comparison
	 * of event models for naive bayes text classification</i>, AAAI-98
	 * Workshop on Learning for Text Categorization; Kamal Nigam et al,
	 * <i>Text classification from labeled and unlabeled documents using
	 * EM</i>, Machine Learning, 39(2/3), 2000.
	 */
	static public class NaiveBayes extends
			edu.cmu.minorthird.classify.algorithms.linear.NaiveBayes{

		static final long serialVersionUID=20080517L;

		public NaiveBayes(){
			super();
		}
	}

	/**
	 * A tweaked learner, with an optimization of precision vs. recall.
	 *
	 * <p>A learner whose score is optimized according to an F_beta()
	 * function, for a given beta.  This optimization is used to
	 * fine-tune the precision vs. recall for the underlying
	 * classification algorithm.  Values of beta&lt;1.0 favor precision
	 * over recall, while values of beta&gt;1.0 favor recall over
	 * precision; beta=1.0 grants equal weight to both precision and
	 * recall.
	 *
	 * <p>Note: currently, in a hard-coded manner, the learner takes a
	 * NaiveBayes instance as its inner learner and a value of beta=1.0.
	 *
	 * <p>Reference: Jason D. M. Rennie,
	 * <i>Derivation of the F-Measure</i>,
	 * http://people.csail.mit.edu/jrennie/writing/fmeasure.pdf
	 *
	 * @author Giora Unger
	 */
	static public class TweakedLearner extends
			edu.cmu.minorthird.classify.TweakedLearner{

		public TweakedLearner(){
			super(new NaiveBayes(),1.0);
		}
	}

	/**
	 * Voted perceptron learning following Freund &amp; Schapire.  This is
	 * a simple learning method which, like SVMs, has a bias towards
	 * large-margin linear classifiers.
	 *
	 * <p>Reference: Yoav Freund and Robert E. Schapire,
	 * <i>Large Margin Classification Using the Perceptron Algorithm</i>,
	 * Computational Learning Theory, 1998.
	 */
	static public class VotedPerceptronLearner extends BatchVersion{

		public VotedPerceptronLearner(){
			// 5 epochs over the training data.
			super(new edu.cmu.minorthird.classify.algorithms.linear.VotedPerceptron(),5);
		}
	}

	/** A simple SVM learner with a linear kernel. */
	static public class SVMLearner extends
			edu.cmu.minorthird.classify.algorithms.svm.SVMLearner{

		public SVMLearner(){
			super();
		}
	}

	/**
	 * A maximum entropy learner.
	 *
	 * <p>This is a wrapper around the CRF learner, which is built on
	 * the IIT CRF implementation.  Iterations of the optimization
	 * method are limited to 50, by default.
	 */
	static public class MaxEntLearner extends
			edu.cmu.minorthird.classify.algorithms.linear.MaxEntLearner{

		public MaxEntLearner(){
			super("maxIters 50");
		}
	}

	/** One-vs-all multiclass reduction over the recommended MaxEnt learner. */
	static public class OneVsAllLearner extends
			edu.cmu.minorthird.classify.OneVsAllLearner{

		public OneVsAllLearner(){
			super(new Recommended.MaxEntLearner());
		}
	}

	/** Most-frequent-first multiclass reduction over the recommended MaxEnt learner. */
	static public class MostFrequentFirstLearner extends
			edu.cmu.minorthird.classify.MostFrequentFirstLearner{

		public MostFrequentFirstLearner(){
			super(new Recommended.MaxEntLearner());
		}
	}

	/** Cascading-binary multiclass reduction over the recommended MaxEnt learner. */
	static public class CascadingBinaryLearner extends
			edu.cmu.minorthird.classify.CascadingBinaryLearner{

		public CascadingBinaryLearner(){
			super(new Recommended.MaxEntLearner());
		}
	}

	/**
	 * A simple decision tree learner.
	 *
	 * <p>This has no pruning, and limits decision trees to a depth of 5.
	 * The splitting criterion is modelled after the one used in
	 * Cohen &amp; Singer's SLIPPER system---it is designed to optimize
	 * performance of the metric being optimized by AdaBoost.
	 *
	 * <p>Related references: William W. Cohen and Yoram Singer, <i>A
	 * Simple, Fast, and Effective Rule Learner</i>, Proceedings of the
	 * Sixteenth National Conference on Artificial Intelligence
	 * (AAAI-99); J. Ross Quinlan, <i>C4.5: programs for machine
	 * learning</i>, Morgan Kaufmann, 1994.
	 */
	static public class DecisionTreeLearner extends
			edu.cmu.minorthird.classify.algorithms.trees.DecisionTreeLearner{

		public DecisionTreeLearner(){
			super();
		}
	}

	/**
	 * Uses AdaBoost to boost the default decision tree learner 10 times.
	 *
	 * <p>Reference: Yoav Freund and Robert E. Schapire,
	 * <i>Experiments with a New Boosting Algorithm</i>,
	 * Proc. of International Conference on Machine Learning,
	 * 1996.
	 */
	static public class BoostedDecisionTreeLearner extends AdaBoost{

		public BoostedDecisionTreeLearner(){
			super(new DecisionTreeLearner(),10);
		}
	}

	/**
	 * Uses AdaBoost to boost a decision tree learner 100 times.
	 *
	 * <p>NOTE(review): the original comment described this as boosting a
	 * "two-level" stump learner, but the code passes the
	 * default-configured DecisionTreeLearner (depth limited as documented
	 * on that class), not a depth-2 tree — confirm the intended depth.
	 *
	 * <p>Reference: Yoav Freund and Robert E. Schapire,
	 * <i>Experiments with a New Boosting Algorithm</i>,
	 * Proc. of International Conference on Machine Learning,
	 * 1996.
	 */
	static public class BoostedStumpLearner extends AdaBoost{

		public BoostedStumpLearner(){
			super(new edu.cmu.minorthird.classify.algorithms.trees.DecisionTreeLearner(),
					100);
		}
	}

	//
	// annotator learners
	//

	/**
	 * Uses the voted perceptron algorithm to learn the parameters for a
	 * hidden semi-Markov model (SMM).
	 *
	 * <p>This is a somewhat more expensive version of the VPHMMLearner,
	 * which allows features to describe properties of multi-token
	 * spans, rather than only properties of single tokens.  This
	 * implements the training algorithm described in the <i>initial</i>
	 * draft of Cohen &amp; Sarawagi's KDD paper.  This implementation is
	 * less memory-intensive than the VPSMMLearner2 class below, but
	 * slower, since the feature-extraction step is iterated many times.
	 *
	 * <p>Reference: William W. Cohen and Sunita Sarawagi, <i>Exploiting
	 * Dictionaries in Named Entity Extraction: Combining Semi-Markov
	 * Extraction Processes and Data Integration Methods</i>,
	 * Proceedings of the Tenth ACM SIGKDD International Conference on
	 * Knowledge Discovery and Data Mining (KDD-2004).
	 */
	static public class VPSMMLearner extends
			ConditionalSemiMarkovModel.CSMMLearner{

		static private final long serialVersionUID=1;

		/** Extracted entities must be of length 4 or less. */
		public VPSMMLearner(){
			super(20,4);
		}

		/** @param maxLength maximum length of an extracted entity */
		public VPSMMLearner(int maxLength){
			super(20,maxLength);
		}
	}

	/**
	 * Uses the voted perceptron algorithm to learn the parameters for a
	 * hidden semi-Markov model (SMM).
	 *
	 * <p>This is a somewhat more expensive version of the VPHMMLearner,
	 * which allows features to describe properties of multi-token
	 * spans, rather than only properties of single tokens.  This
	 * implements the training algorithm described in the final draft of
	 * Cohen &amp; Sarawagi's KDD paper.  This implementation is more
	 * memory-intensive than the VPSMMLearner class above, but
	 * faster, since the feature-extraction step is only performed once.
	 *
	 * <p>I generally prefer this method to the (older) VPHMMLearner.
	 *
	 * <p>Reference: William W. Cohen and Sunita Sarawagi, <i>Exploiting
	 * Dictionaries in Named Entity Extraction: Combining Semi-Markov
	 * Extraction Processes and Data Integration Methods</i>,
	 * Proceedings of the Tenth ACM SIGKDD International Conference on
	 * Knowledge Discovery and Data Mining (KDD-2004).
	 */
	static public class VPSMMLearner2 extends SegmentAnnotatorLearner{

		static private final long serialVersionUID=1;

		/** Extracted entities must be of length 4 or less. */
		public VPSMMLearner2(){
			super(new SegmentCollinsPerceptronLearner(),new MultitokenSpanFE(),4);
		}

		/**
		 * @param epochs number of training epochs
		 * @param maxLen maximum length of an extracted entity
		 */
		public VPSMMLearner2(int epochs,int maxLen){
			super(new SegmentCollinsPerceptronLearner(epochs),new MultitokenSpanFE(),
					maxLen);
		}
	}

	/**
	 * Uses the voted perceptron algorithm to learn the parameters of a
	 * hidden Markov model (HMM).  This method is similar to a CRF, but
	 * often less expensive to train.
	 *
	 * <p>This iterates over the data only 5 times.  Subsequent
	 * experiments suggest that iterating 20, 50, or 100 times often
	 * gives better performance.
	 *
	 * <p>Reference: Michael Collins, <i>Discriminative Training Methods
	 * for Hidden Markov Models: Theory and Experiments with Perceptron
	 * Algorithms</i>, Empirical Methods in Natural Language Processing
	 * (EMNLP), 2002.
	 */
	static public class VPHMMLearner extends SequenceAnnotatorLearner{

		public VPHMMLearner(){
			super(new CollinsPerceptronLearner(1,5),new Recommended.TokenFE());
		}
	}

	/**
	 * Uses logistic regression/maximum entropy to learn a conditional
	 * Markov model (CMM), aka "maxent Markov model" (MEMM).
	 *
	 * <p>Reference: Andrew McCallum and Dayne Freitag and Fernando Pereira,
	 * <i>Maximum Entropy Markov Models for Information Extraction and
	 * Segmentation</i>, Proceedings of the International Conference on
	 * Machine Learning (ICML-2000).
	 */
	static public class MEMMLearner extends SequenceAnnotatorLearner{

		public MEMMLearner(){
			super(new CMMLearner(new MaxEntLearner(),1),new Recommended.TokenFE());
		}
	}

	/**
	 * Uses the voted perceptron algorithm to learn a "conditional
	 * Markov model" (CMM).  This is analogous to an MEMM learner, and
	 * often (surprisingly!) competitive in terms of performance.
	 */
	static public class VPCMMLearner extends SequenceAnnotatorLearner{

		public VPCMMLearner(){
			super(new CMMLearner(new VotedPerceptronLearner(),1),
					new Recommended.TokenFE());
		}
	}

	/**
	 * Uses probabilistic SVM to learn a conditional Markov model (CMM).
	 * This is analogous to an MEMM learner.
	 */
	static public class SVMCMMLearner extends SequenceAnnotatorLearner{

		public SVMCMMLearner(){
			super(new CMMLearner(new SVMLearner(),1),new Recommended.TokenFE());
		}
	}

	/**
	 * Implements the CRF algorithm.  Based on the IIT CRF
	 * implementation, in which optimization is performed using the
	 * limited-memory BFGS technique of Liu and Nocedal (following Sha &amp;
	 * Pereira's recommendation.)
	 *
	 * <p>References: John Lafferty and Andrew McCallum and Fernando Pereira,
	 * <i>Conditional Random Fields: Probabilistic Models for Segmenting and
	 * Labeling Sequence Data</i>, Proc. 18th International Conf. on Machine
	 * Learning, 2001; F. Sha and F. Pereira, <i>Shallow parsing with
	 * conditional random fields</i>, Proceedings of HLT-NAACL, 2003.
	 */
	static public class CRFAnnotatorLearner extends SequenceAnnotatorLearner{

		public CRFAnnotatorLearner(){
			super(new CRFLearner(),new Recommended.TokenFE(),
					new BeginContinueEndUniqueReduction());
		}
	}

	/**
	 * Learns a semi-Markovian extension of CRFs.  Like the
	 * voted-perceptron SMM classes, this allows features to describe
	 * properties of multi-token spans, rather than only properties of
	 * single tokens.
	 *
	 * <p>Reference: Sunita Sarawagi and William W. Cohen,
	 * <i>Semi-Markov Conditional Random Fields for Information Extraction</i>,
	 * in Neural Information Proceedings Systems (NIPS) 2004.
	 */
	static public class SemiCRFAnnotatorLearner extends SegmentAnnotatorLearner{

		/** Extracted entities must be of length 4 or less. */
		public SemiCRFAnnotatorLearner(){
			super(new SegmentCRFLearner(""),new MultitokenSpanFE(),4);
		}

		/**
		 * @param maxIters maximum number of optimizer iterations
		 * @param maxLen maximum length of an extracted entity
		 */
		public SemiCRFAnnotatorLearner(int maxIters,int maxLen){
			super(new SegmentCRFLearner("maxIters "+maxIters),new MultitokenSpanFE(),
					maxLen);
		}
	}

	/** A hidden Markov model (HMM), by zkou. */
	static public class HMMAnnotatorLearner extends SequenceAnnotatorLearner{

		public HMMAnnotatorLearner(){
			super(new HMMLearner(),new Recommended.HMMTokenFE(),
					new InsideOutsideReduction());
		}
	}

	/**
	 * Feature extractor used by the HMM annotator learner: emits only the
	 * raw token value for each token of the span.
	 *
	 * <p>NOTE(review): the character-type flags and token-property list
	 * are configurable here (via setters and the command line) but
	 * currently have no effect — the corresponding emit calls were
	 * commented out in the original source.  Only the plain token value
	 * is emitted.
	 */
	public static class HMMTokenFE extends TokenPropUsingFE implements
			CommandLineProcessor.Configurable,Serializable{

		static final long serialVersionUID=20080517L;

		protected boolean useCharType=false;

		protected boolean useCharTypePattern=true;

		//
		// getters and setters, for gui-based configuration
		//

		/** If true, produce features like "token.charTypePattern.Aaaa" for the word "Bill". */
		public void setUseCharType(boolean flag){
			useCharType=flag;
		}

		public boolean getUseCharType(){
			return useCharType;
		}

		/** If true, produce features like "token.charTypePattern.Aa+" for the word "Bill". */
		public void setUseCharTypePattern(boolean flag){
			useCharTypePattern=flag;
		}

		public boolean getUseCharTypePattern(){
			return useCharTypePattern;
		}

		//
		// command-line based configuration
		//

		@Override
		public CommandLineProcessor getCLP(){
			return new MyCLP();
		}

		public class MyCLP extends BasicCommandLineProcessor{

			public void charTypes(){
				useCharType=true;
			}

			public void noCharTypes(){
				useCharType=false;
			}

			public void charTypePattern(){
				useCharTypePattern=true;
			}

			public void noCharTypePattern(){
				useCharTypePattern=false;
			}

			public void tokenProps(String s){
				setTokenPropertyFeatures(s);
			}
		}

		//
		// real code (i.e., not configuration code)
		//

		@Override
		public void extractFeatures(TextLabels labels,Span s){
			requireMyAnnotation(labels);
			setMyTokenPropertyList(labels);
			// deliberately emits only the (case-preserving) token value; see class note
			from(s).tokens().eq().emit();
		}
	}

	//
	// SequenceClassifierLearners
	//

	/** Voted-perceptron sequence tagger: 1-history, 5-epoch Collins perceptron. */
	public static class VPTagLearner extends CollinsPerceptronLearner{

		public VPTagLearner(){
			super(1,5);
		}
	}

	//
	// feature extractors
	//

	/**
	 * Base class for feature extractors that use token properties from
	 * the TextLabels environment as features.
	 */
	abstract public static class TokenPropUsingFE extends SpanFE implements
			Serializable{

		static final long serialVersionUID=20081125L;

		// null means "use all token properties defined in the labels"
		protected String[] tokenPropertyFeatures=null;

		/**
		 * tokenProperties depends on the requiredAnnotation, so override
		 * the default setRequiredAnnotation() method to reset the
		 * tokenPropertyFeatures to null when this changes.
		 */
		@Override
		public void setRequiredAnnotation(String requiredAnnotation,
				String annotationProvider){
			super.setRequiredAnnotation(requiredAnnotation,annotationProvider);
			tokenPropertyFeatures=null;
		}

		/**
		 * Specify the token properties from the TextLabels environment
		 * that will be used as features.  A value of '*' or null means to
		 * use all defined token properties.
		 */
		public void setTokenPropertyFeatures(String commaSeparatedTokenPropertyList){
			if("*".equals(commaSeparatedTokenPropertyList))
				tokenPropertyFeatures=null;
			else
				tokenPropertyFeatures=commaSeparatedTokenPropertyList.split(",\\s*");
		}

		public String getTokenPropertyFeatures(){
			return tokenPropertyFeatures==null?"*":StringUtil
					.toString(tokenPropertyFeatures);
		}

		/**
		 * Specify the token properties from the TextLabels environment
		 * that will be used as features.
		 */
		public void setTokenPropertyFeatures(Set<String> propertySet){
			tokenPropertyFeatures=
					propertySet.toArray(new String[propertySet.size()]);
		}

		/** Lazily fill the property list from the labels on first use. */
		protected void setMyTokenPropertyList(TextLabels labels){
			if(tokenPropertyFeatures==null){
				System.out.println("tokenPropertyFeatures: "+labels.getTokenProperties());
				setTokenPropertyFeatures(labels.getTokenProperties());
			}
		}
	}

	/** A simple bag-of-words feature extractor. */
	public static class DocumentFE extends TokenPropUsingFE implements
			Serializable{

		static final long serialVersionUID=20080517L;

		protected boolean foldCase=true;

		@Override
		public void extractFeatures(TextLabels labels,Span s){
			requireMyAnnotation(labels);
			setMyTokenPropertyList(labels);
			if(foldCase)
				from(s).tokens().eq().lc().emit();
			else
				from(s).tokens().eq().emit();
			for(int j=0;j<tokenPropertyFeatures.length;j++){
				from(s).tokens().prop(tokenPropertyFeatures[j]).emit();
			}
		}

		public boolean getFoldCase(){
			return foldCase;
		}

		/**
		 * If foldCase is true, then words will be converted to lower
		 * case before being used as features.
		 */
		public void setFoldCase(boolean flag){
			foldCase=flag;
		}
	}

	/**
	 * An extraction-oriented feature extractor, which should be
	 * applied to one-token spans.  By default this extracts features
	 * for: the lower-case version of the single word inside that span;
	 * lexical properties of the word; and analogous features for tokens
	 * in a small window to either side of the word.
	 */
	public static class TokenFE extends TokenPropUsingFE implements
			CommandLineProcessor.Configurable,Serializable{

		static final long serialVersionUID=20080517L;

		protected int windowSize=3;

		protected boolean useCharType=false;

		protected boolean useCharTypePattern=true;

		protected boolean useTokenValues=true;

		//
		// getters and setters, for gui-based configuration
		//

		/** Window size for features. */
		public void setFeatureWindowSize(int n){
			windowSize=n;
		}

		public int getFeatureWindowSize(){
			return windowSize;
		}

		/** If true, produce features like "token.charTypePattern.Aaaa" for the word "Bill". */
		public void setUseCharType(boolean flag){
			useCharType=flag;
		}

		public boolean getUseCharType(){
			return useCharType;
		}

		/** If true, produce features like "token.charTypePattern.Aa+" for the word "Bill". */
		public void setUseCharTypePattern(boolean flag){
			useCharTypePattern=flag;
		}

		public boolean getUseCharTypePattern(){
			return useCharTypePattern;
		}

		/** If true, produce features like "token.lc.bill" for the word "Bill". */
		public void setUseTokenValues(boolean flag){
			useTokenValues=flag;
		}

		public boolean getUseTokenValues(){
			return useTokenValues;
		}

		//
		// command-line based configuration
		//

		@Override
		public CommandLineProcessor getCLP(){
			return new MyCLP();
		}

		public class MyCLP extends BasicCommandLineProcessor{

			public void window(String s){
				windowSize=StringUtil.atoi(s);
				System.out.println("window=>"+s);
			}

			public void charTypes(){
				useCharType=true;
			}

			public void noCharTypes(){
				useCharType=false;
			}

			public void charTypePattern(){
				useCharTypePattern=true;
			}

			public void noCharTypePattern(){
				useCharTypePattern=false;
			}

			public void noTokenValues(){
				useTokenValues=false;
			}

			public void tokenProps(String s){
				setTokenPropertyFeatures(s);
			}
		}

		//
		// real code (i.e., not configuration code)
		//

		@Override
		public void extractFeatures(TextLabels labels,Span s){
			requireMyAnnotation(labels);
			setMyTokenPropertyList(labels);
			// features of the token(s) inside the span
			if(useTokenValues)
				from(s).tokens().eq().lc().emit();
			if(useCharTypePattern)
				from(s).tokens().eq().charTypePattern().emit();
			if(useCharType)
				from(s).tokens().eq().charTypes().emit();
			for(int j=0;j<tokenPropertyFeatures.length;j++){
				from(s).tokens().prop(tokenPropertyFeatures[j]).emit();
			}
			// analogous features for a window of tokens to either side
			for(int i=0;i<windowSize;i++){
				if(useTokenValues)
					from(s).left().token(-i-1).eq().lc().emit();
				if(useTokenValues)
					from(s).right().token(i).eq().lc().emit();
				for(int j=0;j<tokenPropertyFeatures.length;j++){
					from(s).left().token(-i-1).prop(tokenPropertyFeatures[j]).emit();
					from(s).right().token(i).prop(tokenPropertyFeatures[j]).emit();
				}
				if(useCharTypePattern){
					from(s).left().token(-i-1).eq().charTypePattern().emit();
					from(s).right().token(i).eq().charTypePattern().emit();
				}
				if(useCharType){
					from(s).left().token(-i-1).eq().charTypes().emit();
					from(s).right().token(i).eq().charTypes().emit();
				}
			}
		}
	}

	/**
	 * An extraction-oriented feature extractor to apply to multi-token
	 * spans.  By default this extracts features for: the lower-case
	 * version of the phrase inside that span; lexical properties of the
	 * phrase; the length of the span; features for tokens in a small
	 * window to either side of the phrase, analogous to those extracted
	 * by TokenFE; features for the first and last tokens of the phrase,
	 * analogous to those extracted by TokenFE.
	 */
	public static class MultitokenSpanFE extends TokenFE implements
			CommandLineProcessor.Configurable,Serializable{

		static final long serialVersionUID=20080517L;

		private boolean useFirst=true,useLast=true,useLength=true,useInternal=true;

		//
		// getters/setters for gui configuration
		//

		/** Generate features for the first token of the span. */
		public void setUseFirst(boolean flag){
			useFirst=flag;
		}

		public boolean getUseFirst(){
			return useFirst;
		}

		/** Generate features for the last token of the span. */
		public void setUseLast(boolean flag){
			useLast=flag;
		}

		public boolean getUseLast(){
			return useLast;
		}

		/** Generate features for the length of the span. */
		public void setUseLength(boolean flag){
			useLength=flag;
		}

		public boolean getUseLength(){
			return useLength;
		}

		/** Generate features for the span itself. */
		public void setUseInternal(boolean flag){
			useInternal=flag;
		}

		public boolean getUseInternal(){
			return useInternal;
		}

		//
		// command-line configuration
		//

		@Override
		public CommandLineProcessor getCLP(){
			return new JointCommandLineProcessor(new CommandLineProcessor[]{
					super.getCLP(),new MyCLP()});
		}

		public class MyCLP extends BasicCommandLineProcessor{

			public void first(){
				useFirst=true;
			}

			public void noFirst(){
				useFirst=false;
			}

			public void last(){
				useLast=true;
			}

			public void noLast(){
				useLast=false;
			}

			public void length(){
				useLength=true;
			}

			public void noLength(){
				useLength=false;
			}

			public void internal(){
				useInternal=true;
			}

			public void noInternal(){
				useInternal=false;
			}
		}

		//
		// 'real' code
		//

		@Override
		public void extractFeatures(TextLabels labels,Span span){
			// token-level and window features, as in TokenFE
			super.extractFeatures(labels,span);
			// text of span & its charTypePattern
			if(useInternal){
				from(span).eq().lc().emit();
				if(useCharType)
					from(span).eq().charTypes().emit();
				if(useCharTypePattern)
					from(span).eq().charTypePattern().emit();
			}
			// length properties of span
			if(useLength){
				from(span).size().emit();
				from(span).exactSize().emit();
			}
			// first and last tokens
			if(useFirst)
				from(span).token(0).eq().lc().emit();
			if(useLast)
				from(span).token(-1).eq().lc().emit();
			// NOTE(review): .lc() after charTypes()/charTypePattern() differs
			// from TokenFE's usage — confirm the lower-casing is intended here.
			if(useCharType){
				if(useFirst)
					from(span).token(0).eq().charTypes().lc().emit();
				if(useLast)
					from(span).token(-1).eq().charTypes().lc().emit();
			}
			if(useCharTypePattern){
				if(useFirst)
					from(span).token(0).eq().charTypePattern().lc().emit();
				if(useLast)
					from(span).token(-1).eq().charTypePattern().lc().emit();
			}
			// use marked properties of tokens for first & last tokens in span
			for(int i=0;i<tokenPropertyFeatures.length;i++){
				String p=tokenPropertyFeatures[i];
				if(useFirst)
					from(span).token(0).prop(p).emit();
				if(useLast)
					from(span).token(-1).prop(p).emit();
			}
		}
	}
}