/* Copyright 2003, Carnegie Mellon, All Rights Reserved */

package edu.cmu.minorthird.classify;

import java.util.Iterator;
import java.util.Random;
import java.util.StringTokenizer;

import org.apache.log4j.Logger;

import edu.cmu.minorthird.classify.semisupervised.SemiSupervisedDataset;
import edu.cmu.minorthird.classify.sequential.SequenceDataset;
import edu.cmu.minorthird.util.MathUtil;
import edu.cmu.minorthird.util.gui.ViewerFrame;
import edu.cmu.minorthird.util.gui.Visible;

/**
 * Some sample inputs for learners.
 *
 * <p>All datasets are either built from the fixed string arrays declared in
 * this class or generated from a caller-supplied {@link Random}, so a given
 * seed always yields the same data. {@link #sampleData(String, boolean)} maps
 * short names to the individual factories, and {@link #main(String[])} is a
 * small command-line driver around it.
 *
 * @author William Cohen
 */
public class SampleDatasets{

  private static final Logger log=Logger.getLogger(SampleDatasets.class);

  /** Usage line printed by {@link #main(String[])} on bad arguments or failure. */
  private static final String USAGE=
      "usage: [toy|num] edu.cmu.minorthird.classify.SomeLearner [active]";

  //
  // toy text data with binary (word present/absent) features
  //

  public static final String[] posTrain={
      "a pricy doll house",
      "a little red fire truck",
      "a red wagon",
      "a pricy red sports car",
      "punk queen barbie and ken",
      "a little red bike"};

  public static final String[] negTrain={
      "a a a a big 7-seater minivan with an automatic transmission",
      "a big house in the suburbs with a crushing mortgage",
      "a job for life at IBM",
      "a huge pile of tax forms, due yesterday",
      "huge pile of junk mail, bills, and catalogs"};

  public static final String[] posTest={
      "a pricy barbie doll",
      "a little yellow toy car",
      "a red 10 speed bike",
      "a red convertible porshe"};

  public static final String[] negTest={
      "a big pile of paperwork",
      "a huge backlog of email",
      "a life of woe and trouble"};

  /**
   * Builds a binary dataset: every string in <code>pos</code> becomes a +1
   * example and every string in <code>neg</code> a -1 example, positives first.
   */
  private static Dataset makeData(String[] pos,String[] neg){
    Dataset result=new BasicDataset();
    for(String text:pos){
      result.add(makeExample(+1,text));
    }
    for(String text:neg){
      result.add(makeExample(-1,text));
    }
    return result;
  }

  /**
   * Builds one example whose source object is the text itself and which has
   * one binary feature per whitespace-separated token of the text.
   */
  private static Example makeExample(double label,String text){
    MutableInstance instance=new MutableInstance(text);
    StringTokenizer tok=new StringTokenizer(text);
    while(tok.hasMoreTokens()){
      instance.addBinary(new Feature(tok.nextToken()));
    }
    return new Example(instance,ClassLabel.binaryLabel(label));
  }

  /** Training data for a trivial classification problem. */
  public static Dataset toyTrain(){
    return makeData(posTrain,negTrain);
  }

  /** Test data for a trivial classification problem. */
  public static Dataset toyTest(){
    return makeData(posTest,negTest);
  }

  //
  // toy text data with word-count features, for generative Bayesian models
  //

  private static final String[] posBayesTrain={
      "a a pricy doll house",
      "a a little red red fire truck",
      "a red wagon",
      "a pricy red sports car",
      "punk queen barbie and and ken",
      "a little red bike"};

  private static final String[] negBayesTrain={
      "a big big 7-seater minivan with with an an automatic transmission",
      "a big house in the suburbs with a a crushing mortgage",
      "a job for for life at at IBM",
      "a huge pile of of tax forms, due yesterday",
      "huge pile of of junk mail, bills, and catalogs"};

  private static final String[] posBayesTest={
      "a a pricy barbie doll",
      "a little yellow toy car",
      "a a red 10 speed bike",
      "a red convertible porshe"};

  private static final String[] negBayesTest={
      "a big pile of of paperwork",
      "a huge backlog of email",
      "a life of woe and and trouble"};

  private static final String[] posBayesExtremeTrain={
      "p1 p1 p1 p2 p2 p3 p3 p4 p4 p5 p5 n1 n2 n3 n4 n5",
      "p1 p1 p2 p2 p2 p3 p3 p4 p4 p5 p5 n1 n2 n3 n4 n5",
      "p1 p1 p2 p2 p3 p3 p3 p4 p4 p5 p5 n1 n2 n3 n4 n5",
      "p1 p1 p2 p2 p3 p3 p4 p4 p4 p5 p5 n1 n2 n3 n4 n5",
      "p1 p1 p2 p2 p3 p3 p4 p4 p5 p5 p5 n1 n2 n3 n4 n5"};

  private static final String[] negBayesExtremeTrain={
      "p1 p2 p3 p4 p5 n1 n1 n1 n2 n2 n3 n3 n4 n4 n5 n5",
      "p1 p2 p3 p4 p5 n1 n1 n2 n2 n2 n3 n3 n4 n4 n5 n5",
      "p1 p2 p3 p4 p5 n1 n1 n2 n2 n3 n3 n3 n4 n4 n5 n5",
      "p1 p2 p3 p4 p5 n1 n1 n2 n2 n3 n3 n4 n4 n4 n5 n5",
      "p1 p2 p3 p4 p5 n1 n1 n2 n2 n3 n3 n4 n4 n5 n5 n5"};

  private static final String[] posBayesExtremeTest={
      "p1 p1 n1","p2 p2 n2","p3 p3 n3","p4 p4 n4","p5 p5 n5"};

  private static final String[] negBayesExtremeTest={
      "p1 n1 n1","p2 n2 n2","p3 n3 n3","p4 n4 n4","p5 n5 n5"};

  private static final String[] unlabeledBayesExtreme={
      "p1 n1 n1","p2 n2 n2","p3 n3 n3","p1 p1 n1","p2 p2 n2","p3 p3 n3"};

  /** Training data over the synthetic p1..p5/n1..n5 vocabulary, with word counts. */
  public static Dataset toyBayesExtremeTrain(){
    return makeBayesData(posBayesExtremeTrain,negBayesExtremeTrain);
  }

  /** Test data over the synthetic p1..p5/n1..n5 vocabulary, with word counts. */
  public static Dataset toyBayesExtremeTest(){
    return makeBayesData(posBayesExtremeTest,negBayesExtremeTest);
  }

  /**
   * The same labeled strings as {@link #toyBayesExtremeTrain()}, labeled
   * "POS"/"NEG", plus a handful of unlabeled instances, packaged as a
   * {@link SemiSupervisedDataset}.
   */
  public static Dataset toyBayesExtremeUnlabeledTrain(){
    return makeUnlabeledBayesData(posBayesExtremeTrain,negBayesExtremeTrain,
        unlabeledBayesExtreme);
  }

  /**
   * Makes semi-supervised test-data for generative Bayesian models: labeled
   * "POS" examples, then labeled "NEG" examples, then unlabeled instances.
   */
  private static Dataset makeUnlabeledBayesData(String[] pos,String[] neg,
      String[] unlabeled){
    SemiSupervisedDataset result=new SemiSupervisedDataset();
    for(String text:pos){
      result.add(makeLabeledBayesExample(new ClassLabel("POS"),text));
    }
    for(String text:neg){
      result.add(makeLabeledBayesExample(new ClassLabel("NEG"),text));
    }
    for(String text:unlabeled){
      result.addUnlabeled(makeUnlabeledBayesExample(text));
    }
    return result;
  }

  /**
   * Builds the instance shared by all the Bayes example factories: one
   * feature per distinct whitespace-separated token. The first occurrence of
   * a token is added as a binary feature; each later occurrence re-adds the
   * feature numerically with the previously read weight plus one.
   */
  private static MutableInstance makeBayesInstance(String text){
    MutableInstance instance=new MutableInstance();
    StringTokenizer tok=new StringTokenizer(text);
    while(tok.hasMoreTokens()){
      Feature f=new Feature(tok.nextToken());
      // a weight of 0 is treated as "feature not seen yet"
      double w=instance.getWeight(f);
      if(w==0){
        instance.addBinary(f);
      }else{
        // presumably addNumeric overwrites the existing weight, turning this
        // into a running count -- confirm against MutableInstance
        instance.addNumeric(f,w+1);
      }
    }
    return instance;
  }

  /** Makes a test-example for generative Bayesian models with an explicit label. */
  private static Example makeLabeledBayesExample(ClassLabel label,String text){
    return new Example(makeBayesInstance(text),label);
  }

  /** Makes an unlabeled test-instance for generative Bayesian models. */
  private static Instance makeUnlabeledBayesExample(String text){
    return makeBayesInstance(text);
  }

  /**
   * Makes test-data for generative Bayesian models: +1 examples built from
   * <code>pos</code> followed by -1 examples built from <code>neg</code>.
   */
  private static Dataset makeBayesData(String[] pos,String[] neg){
    Dataset result=new BasicDataset();
    for(String text:pos){
      result.add(makeBayesExample(+1,text));
    }
    for(String text:neg){
      result.add(makeBayesExample(-1,text));
    }
    return result;
  }

  /** Makes a test-example for generative Bayesian models with a binary label. */
  private static Example makeBayesExample(double label,String text){
    return new Example(makeBayesInstance(text),ClassLabel.binaryLabel(label));
  }

  /** Training data for a trivial classification problem, with word counts as features. */
  public static Dataset toyBayesTrain(){
    return makeBayesData(posBayesTrain,negBayesTrain);
  }

  /** Test data for a trivial classification problem, with word counts as features. */
  public static Dataset toyBayesTest(){
    return makeBayesData(posBayesTest,negBayesTest);
  }

  //
  // randomly generated numeric data
  //

  /**
   * Sparse numeric data - some values are 1.0, and some are zero.
   *
   * <p>One uniform draw is made per example. If it exceeds 0.7 the example
   * gets feature "x" with value 1.0 and a positive label; otherwise the
   * example has no features at all and a negative label.
   *
   * @param r source of randomness
   * @param m number of examples to generate
   */
  public static Dataset makeSparseNumericData(Random r,int m){
    Dataset result=new BasicDataset();
    Feature fx=new Feature("x");
    for(int i=0;i<m;i++){
      MutableInstance instance=new MutableInstance();
      boolean positive=r.nextDouble()>0.7;
      if(positive){
        instance.addNumeric(fx,1.0);
      }
      result.add(new Example(instance,ClassLabel.binaryLabel(positive?+1:-1)));
    }
    return result;
  }

  /**
   * Random data, defined by a simple boolean combination of thresholds
   * over two dimensions, with up to 5 irrelevant dimensions, and m
   * examples.
   *
   * <p>Feature values are uniform draws scaled to [0,10). An example is
   * positive when x and y are both below 3 or both above 7.
   *
   * @param r source of randomness
   * @param dim number of features per example, at most 7
   * @param m number of examples to generate
   * @throws IllegalArgumentException if <code>dim</code> exceeds the number
   *   of available feature names
   */
  public static Dataset makeNumericData(Random r,int dim,int m){
    Feature fx=new Feature("x");
    Feature fy=new Feature("y");
    Dataset result=new BasicDataset();
    String[] vars={"x","y","z","t","u","v","w"};
    if(dim>vars.length){
      throw new IllegalArgumentException("dim to big!");
    }
    for(int i=0;i<m;i++){
      MutableInstance instance=new MutableInstance();
      for(int j=0;j<dim;j++){
        // for testing purposes, leave the 'x' feature out of the first
        // example, and the 'y' feature out of the second (and, in general,
        // the j-th feature out of the j-th example)
        if(j!=i){
          instance.addNumeric(new Feature(vars[j]),r.nextDouble()*10);
        }
      }
      // a feature that was left out is read back through getWeight as well
      double x=instance.getWeight(fx);
      double y=instance.getWeight(fy);
      double label=(x<3&&y<3||x>7&&y>7)?+1:-1;
      result.add(new Example(instance,ClassLabel.binaryLabel(label)));
    }
    return result;
  }

  /**
   * Data useful for testing univariate logistic regression. The
   * dataset will contain m examples, each with a single
   * uniformly-distributed numeric feature x. The probability of the
   * positive class will be chosen according to logistic(a*x + b).
   *
   * <p>Each example also carries a binary "bias" feature. A one-line summary
   * of the class counts is printed to standard output.
   *
   * @param rand source of randomness; two draws are made per example
   * @param m number of examples to generate
   * @param a slope of the linear term
   * @param b intercept of the linear term
   */
  public static Dataset makeLogisticRegressionData(Random rand,int m,double a,
      double b){
    int numPos=0;
    int numNeg=0;
    Dataset data=new BasicDataset();
    for(int i=0;i<m;i++){
      double x=rand.nextDouble();
      double p=MathUtil.logistic(a*x+b);
      // the example is positive with probability p
      boolean positive=p>rand.nextDouble();
      ClassLabel y;
      if(positive){
        y=ClassLabel.positiveLabel(1);
        numPos++;
      }else{
        y=ClassLabel.negativeLabel(-1);
        numNeg++;
      }
      MutableInstance instance=new MutableInstance();
      instance.addNumeric(new Feature("x"),x);
      instance.addBinary(new Feature("bias"));
      data.add(new Example(instance,y));
    }
    System.out.println(m+" examples: "+numPos+" pos, "+numNeg+" neg");
    return data;
  }

  //
  // toy sequence data
  //

  /** Training sequences for a toy "is this word capitalized" tagging task. */
  public static SequenceDataset makeToySequenceData(){
    return makeToySequenceData(new String[]{
        "you're a good man Charlie Brown",
        "where's Waldo?",
        "alas dear Yorick, I knew him well"});
  }

  /** Test sequences for a toy "is this word capitalized" tagging task. */
  public static SequenceDataset makeToySequenceTestData(){
    return makeToySequenceData(new String[]{
        "hello, World War III",
        "to be or 2B, that is a question"});
  }

  /**
   * Turns each line into one sequence with one example per space-separated
   * word. A word is labeled "POS" if its first character is upper case and
   * "NEG" otherwise. Features are the word itself, its neighbours, and a
   * case pattern in which runs of upper-case ASCII letters collapse to "A"
   * and runs of lower-case ASCII letters to "a".
   *
   * @param lines the text lines; each word must be non-empty, so lines with
   *   leading or consecutive spaces will fail on <code>charAt(0)</code>
   */
  public static SequenceDataset makeToySequenceData(String[] lines){
    SequenceDataset d=new SequenceDataset();
    for(int i=0;i<lines.length;i++){
      String[] w=lines[i].split(" ");
      Example[] seq=new Example[w.length];
      for(int j=0;j<w.length;j++){
        ClassLabel lab=
            Character.isUpperCase(w[j].charAt(0))?new ClassLabel("POS")
                :new ClassLabel("NEG");
        MutableInstance inst=new MutableInstance(lines[i]+":"+j,"line"+i);
        inst.addBinary(new Feature("here "+w[j]));
        // NOTE(review): with j>1 the word at index 1 gets no "prev" feature
        // even though w[0] exists; j>0 looks intended. Left unchanged because
        // it alters the generated fixtures -- confirm before changing.
        if(j>1){
          inst.addBinary(new Feature("prev "+w[j-1]));
        }
        if(j<w.length-1){
          inst.addBinary(new Feature("next "+w[j+1]));
        }
        String casePattern=
            w[j].replaceAll("[A-Z]+","A").replaceAll("[a-z]+","a");
        inst.addBinary(new Feature("casePattern "+casePattern));
        seq[j]=new Example(inst,lab);
      }
      d.addSequence(seq);
    }
    return d;
  }

  //
  // toy multi-class data
  //

  /**
   * Makes a sample 3 class dataset.
   *
   * <p>Each instance picks one of the classes "homer", "marge" or "bart"
   * uniformly, then draws 2 to 4 words (with replacement) from that class's
   * vocabulary and adds each as a binary feature.
   *
   * @param random A random number generator for building the dataset.
   * @param numInstances The number of instances to be created.
   */
  public static Dataset makeToy3ClassData(Random random,int numInstances){
    String[][] features=new String[][]{
        {"money","cash","sleep","booze","chocolate","fun","beer","pizza"},
        {"stocks","bonds","money","cash","influence","power","fame"},
        {"chocolate","beer","pizza","pringles","popcorn","spam","crisco"}};
    String[] labels=new String[]{"homer","marge","bart"};
    Dataset dataset=new BasicDataset();
    for(int i=0;i<numInstances;i++){
      int classLabel=random.nextInt(labels.length);
      int numFeatures=random.nextInt(3)+2;
      String[] vocabulary=features[classLabel];
      MutableInstance instance=new MutableInstance();
      for(int j=0;j<numFeatures;j++){
        String word=vocabulary[random.nextInt(vocabulary.length)];
        instance.addBinary(new Feature(new String[]{"word",word}));
      }
      dataset.add(new Example(instance,new ClassLabel(labels[classLabel])));
    }
    return dataset;
  }

  //
  // lookup by name, and command-line driver
  //

  /**
   * Returns one of the sample datasets by name. Randomly generated datasets
   * use seed 666 for the test split and 999 for the training split.
   *
   * @param name one of "toy", "bayes", "bayesExtreme", "bayesUnlabeled",
   *   "num", "logistic", "bigLogistic", "sparseNum", "toy3" or "toySeq"
   * @param isTest true for the test split, false for the training split
   * @throws IllegalArgumentException if the name is not recognized
   */
  public static Dataset sampleData(String name,boolean isTest){
    Random random=new Random(isTest?666:999);
    if("toy".equals(name)){
      return isTest?toyTest():toyTrain();
    }else if("bayes".equals(name)){
      return isTest?toyBayesTest():toyBayesTrain();
    }else if("bayesExtreme".equals(name)){
      return isTest?toyBayesExtremeTest():toyBayesExtremeTrain();
    }else if("bayesUnlabeled".equals(name)){
      // only the training split carries unlabeled instances
      return isTest?toyBayesExtremeTest():toyBayesExtremeUnlabeledTrain();
    }else if("num".equals(name)){
      return makeNumericData(random,2,20);
    }else if("logistic".equals(name)){
      return makeLogisticRegressionData(random,50,2,-2);
    }else if("bigLogistic".equals(name)){
      return makeLogisticRegressionData(random,1000,2,-2);
    }else if("sparseNum".equals(name)){
      return makeSparseNumericData(random,20);
    }else if("toy3".equals(name)){
      return makeToy3ClassData(random,50);
    }else if("toySeq".equals(name)){
      if(isTest){
        return makeToySequenceTestData();
      }
      return makeToySequenceData();
    }else{
      throw new IllegalArgumentException("illegal dataset name '"+name+"'");
    }
  }

  /**
   * Command-line driver. <code>args[0]</code> names a dataset accepted by
   * {@link #sampleData(String, boolean)}; both splits are logged at debug
   * level. If <code>args[1]</code> is given it is the class name of a
   * {@link ClassifierLearner}, which is trained on the training split and
   * traced on both splits; a third argument of "active" is passed on to the
   * teacher as its active flag.
   */
  public static void main(String[] args){
    if(args.length<1){
      // no dataset name: report usage directly instead of via an exception
      System.out.println(USAGE);
      return;
    }
    try{
      Dataset train=sampleData(args[0],false);
      Dataset test=sampleData(args[0],true);
      log.debug("Train dataset is: ");
      log.debug(train.toString());
      log.debug("Test dataset is:");
      log.debug(test.toString());
      // the learner is optional, so only read args[1] when it was supplied
      if(args.length>1){
        ClassifierLearner learner=
            (ClassifierLearner)Class.forName(args[1]).getDeclaredConstructor()
                .newInstance();
        boolean active=args.length>=3&&"active".equals(args[2]);
        ClassifierTeacher teacher=new DatasetClassifierTeacher(train,active);
        Classifier c=teacher.train(learner);
        log.info("Classifier: "+c);
        traceClassifier("Train",c,train);
        traceClassifier("Test",c,test);
        if(c instanceof Visible){
          new ViewerFrame(args[1]+" on "+args[0],((Visible)c).toGUI());
        }
      }
    }catch(Exception e){
      System.out.println(USAGE);
      e.printStackTrace();
    }
  }

  /**
   * Logs, for every example of the dataset, whether the classifier got it
   * right ("Y"/"N"), the prediction, the actual label and the example. For
   * a {@link BinaryClassifier} the prediction counts as right when the
   * product of the predicted positive weight and the numeric label is
   * non-negative; otherwise <code>ClassLabel.isCorrect</code> decides.
   */
  private static void traceClassifier(String datasetName,Classifier c,Dataset d){
    log.info("");
    log.info("Performance on dataset "+datasetName+":");
    boolean binary=c instanceof BinaryClassifier;
    for(Iterator<Example> i=d.iterator();i.hasNext();){
      Example e=i.next();
      if(binary){
        double actual=e.getLabel().numericLabel();
        double predicted=c.classification(e).posWeight();
        String ok=predicted*actual>=0?"Y":"N";
        log.info(ok+"\tpred="+predicted+"\tactual="+actual+"\t"+e);
      }else{
        ClassLabel actual=e.getLabel();
        ClassLabel predicted=c.classification(e);
        String ok=predicted.isCorrect(actual)?"Y":"N";
        log.info(ok+"\tpred="+predicted+"\tactual="+actual+"\t"+e);
      }
    }
  }
}