package eairoldi.experiments;

import edu.cmu.minorthird.classify.*;
import edu.cmu.minorthird.classify.algorithms.random.Estimators;
import edu.cmu.minorthird.classify.algorithms.random.Estimate;
import edu.cmu.minorthird.classify.transform.*;

import java.io.IOException;
import java.io.File;
import java.io.PrintStream;
import java.io.FileOutputStream;
import java.util.ArrayList;

/**
 * Exploratory data analysis of feature counts: for each class, compares the
 * per-feature sample variance v against the sample mean m, and counts how
 * often the negative-binomial dispersion parameter delta is positive.
 *
 * User: Edoardo M. Airoldi (eairoldi@cs.cmu.edu)
 * Date: Feb 23, 2005
 */
public class EDA {

    private Dataset data;
    private double SCALE = 10.0;
    private String MODEL;

    public EDA(Dataset data, String model) {
        this.data = data;
        this.MODEL = model;
    }

    public void analyzeFeatures(PrintStream out) {
        ExampleSchema schema = data.getSchema();

        // Retrieve the valid class names and their sizes, and allocate one
        // vector of feature counts and one vector of example totals per class.
        BasicFeatureIndex index = new BasicFeatureIndex(data);
        int numberOfClasses = schema.getNumberOfClasses();
        String[] classLabels = new String[numberOfClasses];
        int[] classSizes = new int[numberOfClasses];
        ArrayList featureMatrix = new ArrayList();       // one double[] per class
        ArrayList exampleWeightMatrix = new ArrayList(); // one double[] per class
        for (int i = 0; i < numberOfClasses; i++) {
            classLabels[i] = schema.getClassName(i);
            classSizes[i] = index.size(classLabels[i]);
            featureMatrix.add(new double[classSizes[i]]);
            exampleWeightMatrix.add(new double[classSizes[i]]);
        }

        // Accumulate the total feature mass per class and per example; SCALE
        // enters the estimation below through estimateNegativeBinomialMuDelta.
        double numberOfExamples = (double) data.size();
        double numberOfFeatures = (double) index.numberOfFeatures();
        double[] countsGivenClass = new double[numberOfClasses];
        double[] examplesGivenClass = new double[numberOfClasses];
        int[] excounter = new int[numberOfClasses];
        for (Example.Looper i = data.iterator(); i.hasNext(); ) {
            Example ex = i.nextExample();
            int idx = schema.getClassIndex(ex.getLabel().bestClassName().toString());
            examplesGivenClass[idx] += 1.0;
            for (Feature.Looper j = index.featureIterator(); j.hasNext(); ) {
                Feature f = j.nextFeature();
                countsGivenClass[idx] += ex.getWeight(f);
                ((double[]) exampleWeightMatrix.get(idx))[excounter[idx]] += ex.getWeight(f);
            }
            excounter[idx] += 1;
        }
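        // Added note on the per-feature analysis below: if counts within a
        // class followed a Poisson model, the sample variance v of a feature
        // would roughly match its sample mean m; v > m signals
        // over-dispersion, which the negative-binomial parameterization
        // absorbs through the extra parameter reported here as delta.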
        // Estimate parameters feature by feature.
        double[][] del = new double[numberOfClasses][(int) numberOfFeatures];
        int[][] results = new int[numberOfClasses][3]; // 0: v<m, 1: v=m, 2: v>m
        int ftCnt = 0;
        for (Feature.Looper floo = index.featureIterator(); floo.hasNext(); ) {
            // Load the vector of counts (by class) for feature ft.
            int[] counter = new int[numberOfClasses];
            Feature ft = floo.nextFeature();
            for (Example.Looper eloo = data.iterator(); eloo.hasNext(); ) {
                Example ex = eloo.nextExample();
                int idx = schema.getClassIndex(ex.getLabel().bestClassName().toString());
                if (MODEL.equals("Naive-Bayes")) {
                    // Binary presence/absence for the Naive-Bayes model.
                    ((double[]) featureMatrix.get(idx))[counter[idx]++] = Math.min(1.0, ex.getWeight(ft));
                } else {
                    ((double[]) featureMatrix.get(idx))[counter[idx]++] = ex.getWeight(ft);
                }
            }
            if (MODEL.equals("Negative-Binomial")) {
                for (int j = 0; j < numberOfClasses; j++) {
                    Estimate probabilityOfOccurrence = Estimators.estimateNaiveBayesMean(
                            1.0, (double) numberOfClasses, examplesGivenClass[j], numberOfExamples);
                    // classPrior is computed but currently unused.
                    double classPrior = ((Double) probabilityOfOccurrence.getPms().get("mean")).doubleValue();
                    double[] countsFeatureGivenClass = (double[]) featureMatrix.get(j);
                    double[] countsGivenExample = (double[]) exampleWeightMatrix.get(j);
                    Estimate mudelta = Estimators.estimateNegativeBinomialMuDelta(
                            countsFeatureGivenClass, countsGivenExample, 1.0 / numberOfFeatures, SCALE);
                    double delta = ((Double) mudelta.getPms().get("delta")).doubleValue();
                    del[j][ftCnt] = delta;

                    // Compare the sample variance against the sample mean.
                    double m = mean(countsFeatureGivenClass);
                    double v = variance(countsFeatureGivenClass);
                    if (v < m) {
                        results[j][0] += 1;
                    } else if (v == m) {
                        results[j][1] += 1; // sparsity :: sum of counts = 1 or 0
                    } else {
                        results[j][2] += 1;
                    }
                }
            }
            ftCnt += 1; // next feature
        }

        // Report results.
        try {
            for (int c = 0; c < numberOfClasses; c++) {
                double isDeltaPos = 0.0;
                double featuresInC = del[c].length;
                out.println("class :: " + schema.getClassName(c) + " has " + featuresInC + " features");
                for (int i = 0; i < del[c].length; i++) {
                    if (del[c][i] > 1e-7) { isDeltaPos += 1.0; }
                }
                out.println("v<m:" + (results[c][0] / featuresInC)
                        + " v=m:" + (results[c][1] / featuresInC)
                        + " v>m:" + (results[c][2] / featuresInC)
                        + " d>0:" + (isDeltaPos / featuresInC));
            }
        } catch (Exception x) {
            // ignore reporting errors
        }
    }

    private double mean(double[] vec) {
        double m = 0.0;
        double N = (double) vec.length;
        for (int i = 0; i < vec.length; i++) {
            m += vec[i];
        }
        return m / N;
    }

    /** Unbiased sample variance: (E[x^2] - E[x]^2) * N / (N - 1). */
    private double variance(double[] vec) {
        double m = 0.0;
        double m2 = 0.0;
        double N = (double) vec.length;
        for (int i = 0; i < vec.length; i++) {
            m += vec[i];
            m2 += Math.pow(vec[i], 2);
        }
        m = m / N;
        m2 = m2 / N;
        return (m2 - Math.pow(m, 2)) * N / (N - 1);
    }
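    /*
     * Usage sketch (hypothetical file path; the dataset format and the
     * "Negative-Binomial" model flag are the ones this class already uses):
     *
     *   Dataset data = DatasetLoader.loadFile(new File("/tmp/sample.3rd"));
     *   EDA eda = new EDA(data, "Negative-Binomial");
     *   eda.analyzeFeatures(System.out);
     */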
    // Runs the analysis over one dataset filtered by information gain at each
    // vocabulary size in levels, then once over the unfiltered dataset.
    private static void runInfoGain(PrintStream out, String path, String fileName, String tag, int[] levels) {
        try {
            File dataFile = new File(path, fileName + ".3rd");
            out.println("# " + tag + " & info-gain");
            for (int l = 0; l < levels.length; l++) {
                Dataset data = DatasetLoader.loadFile(dataFile);
                InfoGainTransformLearner2 filter = new InfoGainTransformLearner2(levels[l]);
                InstanceTransform t = filter.batchTrain(data);
                data = t.transform(data);
                DatasetLoader.save(data, new File(fileName + ".ig-" + levels[l] + ".3rd"));
                EDA eda = new EDA(data, "Negative-Binomial");
                eda.analyzeFeatures(out);
            }
            Dataset data = DatasetLoader.loadFile(dataFile);
            BasicFeatureIndex fid = new BasicFeatureIndex(data);
            out.println("# " + tag + " & info-gain :: " + fid.numberOfFeatures());
            EDA eda = new EDA(data, "Negative-Binomial");
            eda.analyzeFeatures(out);
        } catch (Exception x) {
            out.println("error: " + tag + " ig\n" + x.toString());
        }
        out.println("\n");
    }

    // Runs the analysis over one dataset filtered by the delta^2 statistic at
    // each vocabulary size in levels.
    private static void runD2(PrintStream out, String path, String fileName, String tag, int[] levels) {
        try {
            File dataFile = new File(path, fileName + ".3rd");
            out.println("# " + tag + " & delta^2 stat");
            for (int l = 0; l < levels.length; l++) {
                Dataset data = DatasetLoader.loadFile(dataFile);
                D2TransformLearner f = new D2TransformLearner();
                f.setREF_LENGTH(500.0);
                f.setSAMPLE(10000);
                f.setMIN_WORDS(levels[l] - 1);
                f.setMAX_WORDS(levels[l] - 1);
                InstanceTransform d2 = f.batchTrain(data);
                data = d2.transform(data);
                DatasetLoader.save(data, new File(fileName + ".d2-" + levels[l] + ".3rd"));
                EDA eda = new EDA(data, "Negative-Binomial");
                eda.analyzeFeatures(out);
            }
        } catch (Exception x) {
            out.println("error: " + tag + " d2\n" + x.toString());
        }
        out.println("\n");
    }
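    // Each dataset below is analyzed twice, at matching vocabulary sizes: once
    // with features selected by information gain and once with features
    // selected by the delta^2 statistic, so the dispersion summaries of the
    // two selection methods can be compared side by side.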
    //
    // Performs EDA on the selected datasets in MinorThird format.
    //
    public static void main(String[] argv) {
        // Define file locations here.
        String path = "/usr0/edo/m3rd-data"; // roxie.cald
        //String path = "C:\\Archive-Projects\\Text-Models\\m3rd-data"; // lab7.privacy
        try {
            File outFile = new File(path, "eda.txt");
            PrintStream out = new PrintStream(new FileOutputStream(outFile));
            out.println("class :: featuresInC :: v<m :: v=m :: v>m :: d>0");

            int[] largeLevels = new int[]{100, 500, 1000, 2500, 5000, 10000, 20000, 30000};

            runInfoGain(out, path, "webmaster", "webmaster", new int[]{100, 250, 500, 750, 1000});
            runD2(out, path, "webmaster", "webmaster", new int[]{100, 250, 500, 750, 1000});

            runInfoGain(out, path, "pr_data", "pr", new int[]{100, 200, 300, 400, 500, 600, 700});
            runD2(out, path, "pr_data", "pr", new int[]{100, 200, 300, 400, 500, 600, 700});

            runInfoGain(out, path, "online_data", "online", new int[]{25, 50, 75, 100, 125, 150, 175});
            runD2(out, path, "online_data", "online", new int[]{25, 50, 75, 100, 125, 150, 175});

            runInfoGain(out, path, "movie-data-all", "movie",
                    new int[]{100, 500, 1000, 2500, 5000, 10000, 15000, 20000, 30000});
            runD2(out, path, "movie-data-all", "movie",
                    new int[]{100, 500, 1000, 2500, 5000, 10000, 15000, 20000, 30000});

            runInfoGain(out, path, "roy-data-fin", "roy-data-fin", new int[]{100, 500, 1000, 2500, 5000});
            runD2(out, path, "roy-data-fin", "roy-data-fin", new int[]{100, 500, 1000, 2500, 5000});

            runInfoGain(out, path, "roy-data-ma", "roy-data-ma", new int[]{100, 500, 1000, 2500, 5000});
            runD2(out, path, "roy-data-ma", "roy-data-ma", new int[]{100, 500, 1000, 2500, 5000});

            runInfoGain(out, path, "roy-data-mix", "roy-data-mix", new int[]{100, 500, 1000, 2500, 5000, 10000});
            runD2(out, path, "roy-data-mix", "roy-data-mix", new int[]{100, 500, 1000, 2500, 5000, 10000});

            runInfoGain(out, path, "spamAss2002-3cat", "spamAss2002-3cat", largeLevels);
            runD2(out, path, "spamAss2002-3cat", "spamAss2002-3cat", largeLevels);

            runInfoGain(out, path, "fraud-3cat", "fraud-3cat", largeLevels);
            runD2(out, path, "fraud-3cat", "fraud-3cat", largeLevels);

            runInfoGain(out, path, "5news-nohead", "5news-nohead", largeLevels);
            runD2(out, path, "5news-nohead", "5news-nohead", largeLevels);

            runInfoGain(out, path, "reuters21578", "reuters21578", largeLevels);
            runD2(out, path, "reuters21578", "reuters21578", largeLevels);
        } catch (IOException e) {
            e.printStackTrace();
        }
        // it's all good =:-)
        System.exit(0);
    }
}