/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * RaceSearch.java * Copyright (C) 2000 Mark Hall * */ package weka.attributeSelection; import java.io.*; import java.util.*; import weka.core.*; import weka.experiment.PairedStats; import weka.experiment.Stats; /** * Class for performing a racing search. <p> * * For more information see: <br> * Moore, A. W. and Lee, M. S. (1994). Efficient algorithms for minimising * cross validation error. Proceedings of the Eleventh International * Conference on Machine Learning. pp 190--198. <p> * * Valid options are:<p> * * -R <race type><br> * 0 = forward, 1 = backward, 2 = schemata, 3 = rank. <p> * * -L <significance level> <br> * significance level to use for t-tests. <p> * * -T <threshold> <br> * threshold for considering mean errors of two subsets the same <p> * * -F <xval type> <br> * 0 = 10 fold, 1 = leave-one-out (selected automatically for schemata race) * <p> * * -A <attribute evaluator> <br> * the attribute evaluator to use when doing a rank search <p> * * -Q <br> * produce a ranked list of attributes. Selecting this option forces * the race type to be forward. Racing continues until *all* attributes * have been selected, thus producing a ranked list of attributes. <p> * * -N <number to retain> <br> * Specify the number of attributes to retain. Overrides any threshold. * Use in conjunction with -Q. <p> * * -J <threshold> <br> * Specify a threshold by which the AttributeSelection module can discard * attributes. Use in conjunction with -Q.
<p> * * -Z <br> * Turn on verbose output for monitoring the search <p> * * @author Mark Hall (mhall@cs.waikato.ac.nz) * @version $Revision: 1.1.1.1 $ */ public class RaceSearch extends ASSearch implements RankedOutputSearch, OptionHandler { /** the training instances */ private Instances m_Instances = null; /** search types */ private static final int FORWARD_RACE = 0; private static final int BACKWARD_RACE = 1; private static final int SCHEMATA_RACE = 2; private static final int RANK_RACE = 3; public static final Tag [] TAGS_SELECTION = { new Tag(FORWARD_RACE, "Forward selection race"), new Tag(BACKWARD_RACE, "Backward elimination race"), new Tag(SCHEMATA_RACE, "Schemata race"), new Tag(RANK_RACE, "Rank race") }; /** the selected search type */ private int m_raceType = FORWARD_RACE; /** xval types */ private static final int TEN_FOLD = 0; private static final int LEAVE_ONE_OUT = 1; public static final Tag [] XVALTAGS_SELECTION = { new Tag(TEN_FOLD, "10 Fold"), new Tag(LEAVE_ONE_OUT, "Leave-one-out"), }; /** the selected xval type */ private int m_xvalType = TEN_FOLD; /** the class index */ private int m_classIndex; /** the number of attributes in the data */ private int m_numAttribs; /** the total number of partially/fully evaluated subsets */ private int m_totalEvals; /** holds the merit of the best subset found */ private double m_bestMerit = -Double.MAX_VALUE; /** the subset evaluator to use */ private HoldOutSubsetEvaluator m_theEvaluator = null; /** the significance level for comparisons */ private double m_sigLevel = 0.001; /** threshold for comparisons */ private double m_delta = 0.001; /** the number of samples above which to begin testing for similarity between competing subsets */ private int m_samples = 20; /** number of cross validation folds---equal to the number of instances for leave-one-out cv */ private int m_numFolds = 10; /** the attribute evaluator to generate the initial ranking when doing a rank race */ private ASEvaluation m_ASEval = new GainRatioAttributeEval(); /** will hold the attribute ranking produced by the above attribute evaluator if doing a rank search */ private int [] m_Ranking; /** verbose output for monitoring the search and debugging */ private boolean m_debug = false; /** If true then produce a ranked list of attributes by fully traversing a forward hillclimb race */ private boolean m_rankingRequested = false; /** The ranked list of attributes produced if m_rankingRequested is true */ private double [][] m_rankedAtts; /** The number of attributes ranked so far (if ranking is requested) */ private int m_rankedSoFar; /** The number of attributes to retain if a ranking is requested. -1 indicates that all attributes are to be retained. Has precedence over m_threshold */ private int m_numToSelect = -1; /** the resolved number of attributes to retain (from m_numToSelect or m_threshold) */ private int m_calculatedNumToSelect = -1; /** the threshold for removing attributes if ranking is requested */ private double m_threshold = -Double.MAX_VALUE; /** * Returns a string describing this search method * @return a description of the search method suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Races the cross validation error of competing " +"attribute subsets. Use in conjunction with a ClassifierSubsetEval. " +"RaceSearch has four modes:\n\nforward selection " +"races all single attribute additions to a base set (initially " +"no attributes), selects the winner to become the new base set " +"and then iterates until there is no improvement over the base set. "
+"\n\nBackward elimination is similar but the initial base set has all " +"attributes included and races all single attribute deletions. " +"\n\nSchemata search is a bit different. In each iteration a series of " +"races is run in parallel. Each race in a set determines whether " +"a particular attribute should be included or not---ie the race is " +"between the attribute being \"in\" or \"out\". The other attributes " +"for this race are included or excluded randomly at each point in the " +"evaluation. As soon as one race " +"has a clear winner (ie it has been decided whether a particular " +"attribute should be in or not) then the next set of races begins, " +"using the result of the winning race from the previous iteration as " +"the new base set.\n\nRank race first ranks the attributes using an " +"attribute evaluator and then races the ranking. The race includes " +"no attributes, the top ranked attribute, the top two attributes, the " +"top three attributes, etc.\n\nIt is also possible to generate a " +"ranked list of attributes through the forward racing process. " +"If generateRanking is set to true then a complete forward race will " +"be run---that is, racing continues until all attributes have been " +"selected. The order in which they are added determines a complete " +"ranking of all the attributes.\n\nRacing uses paired and unpaired " +"t-tests on cross-validation errors of competing subsets. When there " +"is a significant difference between the means of the errors of two " +"competing subsets then the poorer of the two can be eliminated from " +"the race. Similarly, if there is no significant difference between " +"the mean errors of two competing subsets and they are within some " +"threshold of each other, then one can be eliminated from the race. 
"; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String raceTypeTipText() { return "Set the type of search."; } /** * Set the race type * * @param d the type of race */ public void setRaceType (SelectedTag d) { if (d.getTags() == TAGS_SELECTION) { m_raceType = d.getSelectedTag().getID(); } if (m_raceType == SCHEMATA_RACE && !m_rankingRequested) { try { setFoldsType(new SelectedTag(LEAVE_ONE_OUT, XVALTAGS_SELECTION)); setSignificanceLevel(0.01); } catch (Exception ex) { } } else { try { setFoldsType(new SelectedTag(TEN_FOLD, XVALTAGS_SELECTION)); setSignificanceLevel(0.001); } catch (Exception ex) { } } } /** * Get the race type * * @return the type of race */ public SelectedTag getRaceType() { return new SelectedTag(m_raceType, TAGS_SELECTION); } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String significanceLevelTipText() { return "Set the significance level to use for t-test comparisons."; } /** * Sets the significance level to use * @param sig the significance level */ public void setSignificanceLevel(double sig) { m_sigLevel = sig; } /** * Get the significance level * @return the current significance level */ public double getSignificanceLevel() { return m_sigLevel; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String thresholdTipText() { return "Set the error threshold by which to consider two subsets " +"equivalent."; } /** * Sets the threshold for comparisons * @param t the threshold to use */ public void setThreshold(double t) { m_delta = t; } /** * Get the threshold * @return the current threshold */ public double getThreshold() { return m_delta; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String foldsTipText() { return "Set the number of folds to use for x-val error estimation. " +"Leave-one-out is selected automatically for schemata search."; } /** * Set the xfold type * * @param d the type of xval */ public void setFoldsType (SelectedTag d) { if (d.getTags() == XVALTAGS_SELECTION) { m_xvalType = d.getSelectedTag().getID(); } } /** * Get the xfold type * * @return the type of xval */ public SelectedTag getFoldsType () { return new SelectedTag(m_xvalType, XVALTAGS_SELECTION); } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String debugTipText() { return "Turn on verbose output for monitoring the search's progress."; } /** * Set whether verbose output should be generated. * @param d true if output is to be verbose. */ public void setDebug(boolean d) { m_debug = d; } /** * Get whether output is to be verbose * @return true if output will be verbose */ public boolean getDebug() { return m_debug; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String attributeEvaluatorTipText() { return "Attribute evaluator to use for generating an initial ranking. " +"Use in conjunction with a rank race"; } /** * Set the attribute evaluator to use for generating the ranking. * @param newEvaluator the attribute evaluator to use. 
*/ public void setAttributeEvaluator(ASEvaluation newEvaluator) { m_ASEval = newEvaluator; } /** * Get the attribute evaluator used to generate the ranking. * @return the evaluator used to generate the ranking. */ public ASEvaluation getAttributeEvaluator() { return m_ASEval; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String generateRankingTipText() { return "Use the racing process to generate a ranked list of attributes. " +"Using this mode forces the race to be a forward type and then races " +"until all attributes have been added, thus giving a ranked list"; } /** * Records whether the user has requested a ranked list of attributes. * @param doRank true if ranking is requested */ public void setGenerateRanking(boolean doRank) { m_rankingRequested = doRank; if (m_rankingRequested) { try { setRaceType(new SelectedTag(FORWARD_RACE, TAGS_SELECTION)); } catch (Exception ex) { } } } /** * Gets whether ranking has been requested. This is used by the * AttributeSelection module to determine if rankedAttributes() * should be called. * @return true if ranking has been requested. */ public boolean getGenerateRanking() { return m_rankingRequested; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String numToSelectTipText() { return "Specify the number of attributes to retain. Use in conjunction " +"with generateRanking. The default value " +"(-1) indicates that all attributes are to be retained. Use either " +"this option or a threshold to reduce the attribute set."; } /** * Specify the number of attributes to select from the ranked list * (if generating a ranking). -1 * indicates that all attributes are to be retained. * @param n the number of attributes to retain */ public void setNumToSelect(int n) { m_numToSelect = n; } /** * Gets the number of attributes to be retained. * @return the number of attributes to retain */ public int getNumToSelect() { return m_numToSelect; } /** * Gets the calculated number of attributes to retain. This is the * actual number of attributes to retain. This is the same as * getNumToSelect if the user specifies a number which is not less * than zero. Otherwise it should be the number of attributes in the * (potentially transformed) data. */ public int getCalculatedNumToSelect() { if (m_numToSelect >= 0) { m_calculatedNumToSelect = m_numToSelect; } return m_calculatedNumToSelect; } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String selectionThresholdTipText() { return "Set threshold by which attributes can be discarded. Default value " + "results in no attributes being discarded. Use in conjunction with " + "generateRanking"; } /** * Set the threshold by which the AttributeSelection module can discard * attributes. * @param threshold the threshold. */ public void setSelectionThreshold(double threshold) { m_threshold = threshold; } /** * Returns the threshold so that the AttributeSelection module can * discard attributes from the ranking. */ public double getSelectionThreshold() { return m_threshold; } /** * Returns an enumeration describing the available options. * @return an enumeration of all the available options. 
**/ public Enumeration listOptions () { Vector newVector = new Vector(8); newVector.addElement(new Option("\tType of race to perform.\n\t" +"(default = 0).", "R", 1 ,"-R <0 = forward | 1 = backward " +"race | 2 = schemata | 3 = rank>")); newVector.addElement(new Option("\tSignificance level for comparisons" +"\n\t(default = 0.001(forward/backward/" +"rank)/0.01(schemata)).", "L",1,"-L <significance>")); newVector.addElement(new Option("\tThreshold for error comparison.\n\t" +"(default = 0.001).", "T",1,"-T <threshold>")); newVector.addElement(new Option("\tAttribute ranker to use if doing a " +"\n\trank search. Place any\n\t" +"evaluator options LAST on the" + "\n\tcommand line following a \"--\"." + "\n\teg. -A weka.attributeSelection." +"GainRatioAttributeEval ... " + "-- -M.\n\t(default = GainRatioAttributeEval)", "A", 1, "-A <attribute evaluator>")); newVector.addElement(new Option("\tFolds for cross validation\n\t" +"(default = 0 (1 if schemata race))", "F",1,"-F <0 = 10 fold | 1 = leave-one-out>")); newVector.addElement(new Option("\tGenerate a ranked list of attributes." +"\n\tForces the search to be forward\n" +"\tand races until all attributes have been\n" +"\tselected, thus producing a ranking.", "Q",0,"-Q")); newVector.addElement(new Option("\tSpecify number of attributes to retain from " +"\n\tthe ranking. Overrides -J. Use " +"in conjunction with -Q" ,"N",1 , "-N <num to select>")); newVector.addElement(new Option("\tSpecify a threshold by which attributes" + "\n\tmay be discarded from the ranking." +"\n\tUse in conjunction with -Q","J",1 , "-J <threshold>")); newVector.addElement(new Option("\tVerbose output for monitoring the " +"search.", "Z",0,"-Z")); if ((m_ASEval != null) && (m_ASEval instanceof OptionHandler)) { newVector.addElement(new Option("", "", 0, "\nOptions specific to " + "evaluator " + m_ASEval.getClass().getName() + ":")); Enumeration enu = ((OptionHandler)m_ASEval).listOptions(); while (enu.hasMoreElements()) { newVector.addElement(enu.nextElement()); } } return newVector.elements(); } /** * Parses a given list of options. * * Valid options are:<p> * * -R <race type><br> * 0 = forward, 1 = backward, 2 = schemata, 3 = rank. <p> * * -L <significance level> <br> * significance level to use for t-tests. <p> * * -T <threshold> <br> * threshold for considering mean errors of two subsets the same <p> * * -F <xval type> <br> * 0 = 10 fold, 1 = leave-one-out (selected automatically for schemata race) * <p> * * -A <attribute evaluator> <br> * the attribute evaluator to use when doing a rank search <p> * * -Q <br> * produce a ranked list of attributes. Selecting this option forces * the race type to be forward. Racing continues until *all* attributes * have been selected, thus producing a ranked list of attributes. <p> * * -N <number to retain> <br> * Specify the number of attributes to retain. Overrides any threshold. * Use in conjunction with -Q. <p> * * -J <threshold> <br> * Specify a threshold by which the AttributeSelection module can discard * attributes. Use in conjunction with -Q.
<p> * * -Z <br> * Turn on verbose output for monitoring the search <p> * * @param options the list of options as an array of strings * @exception Exception if an option is not supported * **/ public void setOptions (String[] options) throws Exception { String optionString; resetOptions(); optionString = Utils.getOption('R', options); if (optionString.length() != 0) { setRaceType(new SelectedTag(Integer.parseInt(optionString), TAGS_SELECTION)); } optionString = Utils.getOption('F', options); if (optionString.length() != 0) { setFoldsType(new SelectedTag(Integer.parseInt(optionString), XVALTAGS_SELECTION)); } optionString = Utils.getOption('L', options); if (optionString.length() !=0) { Double temp; temp = Double.valueOf(optionString); setSignificanceLevel(temp.doubleValue()); } optionString = Utils.getOption('T', options); if (optionString.length() !=0) { Double temp; temp = Double.valueOf(optionString); setThreshold(temp.doubleValue()); } optionString = Utils.getOption('A', options); if (optionString.length() != 0) { setAttributeEvaluator(ASEvaluation.forName(optionString, Utils.partitionOptions(options))); } setGenerateRanking(Utils.getFlag('Q', options)); optionString = Utils.getOption('J', options); if (optionString.length() != 0) { Double temp; temp = Double.valueOf(optionString); setSelectionThreshold(temp.doubleValue()); } optionString = Utils.getOption('N', options); if (optionString.length() != 0) { setNumToSelect(Integer.parseInt(optionString)); } setDebug(Utils.getFlag('Z', options)); } /** * Gets the current settings of RaceSearch. * @return an array of strings suitable for passing to setOptions() */ public String[] getOptions () { int current = 0; String[] evaluatorOptions = new String[0]; if ((m_ASEval != null) && (m_ASEval instanceof OptionHandler)) { evaluatorOptions = ((OptionHandler)m_ASEval).getOptions(); } String[] options = new String[17+evaluatorOptions.length]; options[current++] = "-R"; options[current++] = ""+m_raceType; options[current++] = "-L"; options[current++] = ""+getSignificanceLevel(); options[current++] = "-T"; options[current++] = ""+getThreshold(); options[current++] = "-F"; options[current++] = ""+m_xvalType; if (getGenerateRanking()) { options[current++] = "-Q"; } options[current++] = "-N"; options[current++] = ""+getNumToSelect(); options[current++] = "-J"; options[current++] = ""+getSelectionThreshold(); if (getDebug()) { options[current++] = "-Z"; } if (getAttributeEvaluator() != null) { options[current++] = "-A"; options[current++] = getAttributeEvaluator().getClass().getName(); options[current++] = "--"; System.arraycopy(evaluatorOptions, 0, options, current, evaluatorOptions.length); current += evaluatorOptions.length; } while (current < options.length) { options[current++] = ""; } return options; } /** * Searches the attribute subset space by racing cross validation * errors of competing subsets * * @param ASEval the attribute evaluator to guide the search * @param data the training instances. * @return an array (not necessarily ordered) of selected attribute indexes * @exception Exception if the search can't be completed */ public int[] search (ASEvaluation ASEval, Instances data) throws Exception { if (!(ASEval instanceof SubsetEvaluator)) { throw new Exception(ASEval.getClass().getName() + " is not a " + "Subset evaluator! (RaceSearch)"); }
if (ASEval instanceof UnsupervisedSubsetEvaluator) { throw new Exception("Can't use an unsupervised subset evaluator " +"(RaceSearch)."); } if (!(ASEval instanceof HoldOutSubsetEvaluator)) { throw new Exception("Must use a HoldOutSubsetEvaluator, eg. " +"weka.attributeSelection.ClassifierSubsetEval " +"(RaceSearch)"); } if (!(ASEval instanceof ErrorBasedMeritEvaluator)) { throw new Exception("Only error based subset evaluators can be used, " +"eg. weka.attributeSelection.ClassifierSubsetEval " +"(RaceSearch)"); } m_Instances = data; m_Instances.deleteWithMissingClass(); if (m_Instances.numInstances() == 0) { throw new Exception("All instances have missing class! (RaceSearch)"); } if (m_rankingRequested && m_numToSelect > m_Instances.numAttributes()-1) { throw new Exception("More attributes requested than exist in the data " +"(RaceSearch)."); } m_theEvaluator = (HoldOutSubsetEvaluator)ASEval; m_numAttribs = m_Instances.numAttributes(); m_classIndex = m_Instances.classIndex(); if (m_rankingRequested) { m_rankedAtts = new double[m_numAttribs-1][2]; m_rankedSoFar = 0; } if (m_xvalType == LEAVE_ONE_OUT) { m_numFolds = data.numInstances(); } else { m_numFolds = 10; } data.randomize(new Random(1)); int [] bestSubset=null; switch (m_raceType) { case FORWARD_RACE: case BACKWARD_RACE: bestSubset = hillclimbRace(data); break; case SCHEMATA_RACE: bestSubset = schemataRace(data); break; case RANK_RACE: bestSubset = rankRace(data); break; } return bestSubset; } /** * Produces the ranked list of attributes. The search must already have * been run with generateRanking set to true. * @return an array of attribute indexes and associated merits * @exception Exception if a ranking was not requested or no search * has been performed yet */ public double [][] rankedAttributes() throws Exception { if (!m_rankingRequested) { throw new Exception("Need to request a ranked list of attributes " +"before attributes can be ranked (RaceSearch)."); } if (m_rankedAtts == null) { throw new Exception("Search must be performed before attributes " +"can be ranked (RaceSearch)."); } double [][] final_rank = new double [m_rankedSoFar][2]; for (int i=0;i<m_rankedSoFar;i++) { final_rank[i][0] = m_rankedAtts[i][0]; final_rank[i][1] = m_rankedAtts[i][1]; } if (m_numToSelect <= 0) { if (m_threshold == -Double.MAX_VALUE) { m_calculatedNumToSelect = final_rank.length; } else { determineNumToSelectFromThreshold(final_rank); } } return final_rank; } /** * Counts how many ranked attributes have a merit above the selection * threshold. */ private void determineNumToSelectFromThreshold(double [][] ranking) { int count = 0; for (int i = 0; i < ranking.length; i++) { if (ranking[i][1] > m_threshold) { count++; } } m_calculatedNumToSelect = count; } /** * Prints a set of attribute subsets. * @param raceSets the attribute subset specifications to print * @return a String describing the sets */ private String printSets(char [][]raceSets) { StringBuffer temp = new StringBuffer(); for (int i=0;i<raceSets.length;i++) { for (int j=0;j<m_numAttribs;j++) { temp.append(raceSets[i][j]); } temp.append('\n'); } return temp.toString(); } /** * Performs a schemata race---a series of races in parallel. * @param data the instances to estimate accuracy over. * @return an array of selected attribute indices.
*/ private int [] schemataRace(Instances data) throws Exception { // # races, 2 (competitors in each race), # attributes char [][][] parallelRaces; int numRaces = m_numAttribs-1; Random r = new Random(42); int numInstances = data.numInstances(); Instances trainCV; Instances testCV; Instance testInstance; // statistics on the racers Stats [][] raceStats = new Stats[numRaces][2]; parallelRaces = new char [numRaces][2][m_numAttribs-1]; char [] base = new char [m_numAttribs]; for (int i=0;i<m_numAttribs;i++) { base[i] = '*'; } int count=0; // set up initial races for (int i=0;i<m_numAttribs;i++) { if (i != m_classIndex) { parallelRaces[count][0] = (char [])base.clone(); parallelRaces[count][1] = (char [])base.clone(); parallelRaces[count][0][i] = '1'; parallelRaces[count++][1][i] = '0'; } } if (m_debug) { System.err.println("Initial sets:\n"); for (int i=0;i<numRaces;i++) { System.err.print(printSets(parallelRaces[i])+"--------------\n"); } } BitSet randomB = new BitSet(m_numAttribs); char [] randomBC = new char [m_numAttribs]; // notes which bit positions have been decided boolean [] attributeConstraints = new boolean[m_numAttribs]; double error; int evaluationCount = 0; raceSet: while (numRaces > 0) { boolean won = false; for (int i=0;i<numRaces;i++) { raceStats[i][0] = new Stats(); raceStats[i][1] = new Stats(); } // keep an eye on how many test instances have been randomly sampled int sampleCount = 0; // run the current set of races while (!won) { // generate a random binary string for (int i=0;i<m_numAttribs;i++) { if (i != m_classIndex) { if (!attributeConstraints[i]) { if (r.nextDouble() < 0.5) { randomB.set(i); } else { randomB.clear(i); } } else { // this position has been decided from previous races if (base[i] == '1') { randomB.set(i); } else { randomB.clear(i); } } } } // randomly select an instance to test on int testIndex = Math.abs(r.nextInt() % numInstances); trainCV = data.trainCV(numInstances, testIndex); testCV = data.testCV(numInstances, testIndex); testInstance = testCV.instance(0); sampleCount++; /* if (sampleCount > numInstances) { throw new Exception("raceSchemata: No clear winner after sampling " +sampleCount+" instances."); } */ m_theEvaluator.buildEvaluator(trainCV); // the evaluator must retrain for every test point error = -((HoldOutSubsetEvaluator)m_theEvaluator). evaluateSubset(randomB, testInstance, true); evaluationCount++; // see which racers match this random subset for (int i=0;i<m_numAttribs;i++) { if (randomB.get(i)) { randomBC[i] = '1'; } else { randomBC[i] = '0'; } } // System.err.println("Random subset: "+(new String(randomBC))); checkRaces: for (int i=0;i<numRaces;i++) { // if a pair of racers has evaluated more than num instances // then bail out---unlikely that having any more atts is any // better than the current base set. if (((raceStats[i][0].count + raceStats[i][1].count) / 2) > (numInstances)) { break raceSet; } for (int j=0;j<2;j++) { boolean matched = true; for (int k =0;k<m_numAttribs;k++) { if (parallelRaces[i][j][k] != '*') { if (parallelRaces[i][j][k] != randomBC[k]) { matched = false; break; } } } if (matched) { // update the stats for this racer // System.err.println("Matched "+i+" "+j); raceStats[i][j].add(error); // does this race have a clear winner, meaning we can // terminate the whole set of parallel races? 
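// (Decision rule applied below: once both contenders in a race have more
// than m_samples error samples each, an unpaired t-test compares their
// mean errors; a probability below m_sigLevel declares a winner and ends
// the whole set of parallel races.)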
if (raceStats[i][0].count > m_samples && raceStats[i][1].count > m_samples) { raceStats[i][0].calculateDerived(); raceStats[i][1].calculateDerived(); // System.err.println(j+" : "+(new String(parallelRaces[i][j]))); // System.err.println(raceStats[i][0]); // System.err.println(raceStats[i][1]); // check the ttest double prob = ttest(raceStats[i][0], raceStats[i][1]); // System.err.println("Prob :"+prob); if (prob < m_sigLevel) { // stop the races, we have a winner! if (raceStats[i][0].mean < raceStats[i][1].mean) { base = (char [])parallelRaces[i][0].clone(); m_bestMerit = raceStats[i][0].mean; if (m_debug) { System.err.println("contender 0 won "); } } else { base = (char [])parallelRaces[i][1].clone(); m_bestMerit = raceStats[i][1].mean; if (m_debug) { System.err.println("contender 1 won"); } } if (m_debug) { System.err.println((new String(parallelRaces[i][0])) +" "+(new String(parallelRaces[i][1]))); System.err.println("Means : "+raceStats[i][0].mean +" vs "+raceStats[i][1].mean); System.err.println("Evaluations so far : " +evaluationCount); } won = true; break checkRaces; } } } } } } numRaces--; // set up the next set of races if necessary if (numRaces > 0 && won) { parallelRaces = new char [numRaces][2][m_numAttribs-1]; raceStats = new Stats[numRaces][2]; // update the attribute constraints for (int i=0;i<m_numAttribs;i++) { if (i != m_classIndex && !attributeConstraints[i] && base[i] != '*') { attributeConstraints[i] = true; break; } } count=0; for (int i=0;i<numRaces;i++) { parallelRaces[i][0] = (char [])base.clone(); parallelRaces[i][1] = (char [])base.clone(); for (int j=count;j<m_numAttribs;j++) { if (j != m_classIndex && parallelRaces[i][0][j] == '*') { parallelRaces[i][0][j] = '1'; parallelRaces[i][1][j] = '0'; count = j+1; break; } } } if (m_debug) { System.err.println("Next sets:\n"); for (int i=0;i<numRaces;i++) { System.err.print(printSets(parallelRaces[i])+"--------------\n"); } } } } if (m_debug) { System.err.println("Total evaluations : " +evaluationCount); } return attributeList(base); } // t-test for unequal sample sizes and same variance. Returns the probability // that the observed difference in means is due to chance. Computes // t = (mean1 - mean2) / sqrt(sp2 * (1/n1 + 1/n2)) with pooled variance // sp2 = ((n1-1)*var1 + (n2-1)*var2) / (n1 + n2 - 2); the two-tailed p-value // is obtained from the incomplete beta function with x = df / (df + t*t). private double ttest(Stats c1, Stats c2) throws Exception { double n1 = c1.count; double n2 = c2.count; double v1 = c1.stdDev * c1.stdDev; double v2 = c2.stdDev * c2.stdDev; double av1 = c1.mean; double av2 = c2.mean; double df = n1 + n2 - 2; double cv = (((n1 - 1) * v1) + ((n2 - 1) * v2)) /df; double t = (av1 - av2) / Math.sqrt(cv * ((1.0 / n1) + (1.0 / n2))); return Statistics.incompleteBeta(df / 2.0, 0.5, df / (df + (t * t))); } /** * Performs a rank race---race consisting of no attributes, the top * ranked attribute, the top two attributes etc. The initial ranking * is determined by an attribute evaluator. * @param data the instances to estimate accuracy over * @return an array of selected attribute indices.
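* <p>For example, if the evaluator ranks the attributes 3, 1, 4 then the
* competitors are {}, {3}, {3, 1} and {3, 1, 4}.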
*/ private int [] rankRace(Instances data) throws Exception { char [] baseSet = new char [m_numAttribs]; char [] bestSet; double bestSetError; for (int i=0;i<m_numAttribs;i++) { if (i == m_classIndex) { baseSet[i] = '-'; } else { baseSet[i] = '0'; } } int numCompetitors = m_numAttribs-1; char [][] raceSets = new char [numCompetitors+1][m_numAttribs]; int winner; if (m_ASEval instanceof AttributeEvaluator) { // generate the attribute ranking first Ranker ranker = new Ranker(); ((AttributeEvaluator)m_ASEval).buildEvaluator(data); m_Ranking = ranker.search((AttributeEvaluator)m_ASEval,data); } else { ForwardSelection fs = new ForwardSelection(); double [][]rankres; fs.setGenerateRanking(true); ((SubsetEvaluator)m_ASEval).buildEvaluator(data); fs.search(m_ASEval, data); rankres = fs.rankedAttributes(); m_Ranking = new int[rankres.length]; for (int i=0;i<rankres.length;i++) { m_Ranking[i] = (int)rankres[i][0]; } } // set up the race raceSets[0] = (char [])baseSet.clone(); for (int i=0;i<m_Ranking.length;i++) { raceSets[i+1] = (char [])raceSets[i].clone(); raceSets[i+1][m_Ranking[i]] = '1'; } if (m_debug) { System.err.println("Initial sets:\n"+printSets(raceSets)); } // run the race double [] winnerInfo = raceSubsets(raceSets, data, true); bestSetError = winnerInfo[1]; bestSet = (char [])raceSets[(int)winnerInfo[0]].clone(); m_bestMerit = bestSetError; return attributeList(bestSet); } /** * Performs a hill climbing race---all single attribute changes to a * base subset are raced in parallel. The winner is chosen and becomes * the new base subset and the process is repeated until there is no * improvement in error over the base subset. * @param data the instances to estimate accuracy over * @return an array of selected attribute indices. */ private int [] hillclimbRace(Instances data) throws Exception { double baseSetError; char [] baseSet = new char [m_numAttribs]; int rankCount = 0; for (int i=0;i<m_numAttribs;i++) { if (i != m_classIndex) { if (m_raceType == FORWARD_RACE) { baseSet[i] = '0'; } else { baseSet[i] = '1'; } } else { baseSet[i] = '-'; } } int numCompetitors = m_numAttribs-1; char [][] raceSets = new char [numCompetitors+1][m_numAttribs]; int winner; raceSets[0] = (char [])baseSet.clone(); int count = 1; // initialize the race sets---each differs from the base set by a single // attribute (one added for a forward race, one removed for a backward race) for (int i=0;i<m_numAttribs;i++) { if (i != m_classIndex) { raceSets[count] = (char [])baseSet.clone(); if (m_raceType == BACKWARD_RACE) { raceSets[count++][i] = '0'; } else { raceSets[count++][i] = '1'; } } } if (m_debug) { System.err.println("Initial sets:\n"+printSets(raceSets)); } // race the initial sets (base set either no or all features) double [] winnerInfo = raceSubsets(raceSets, data, true); baseSetError = winnerInfo[1]; m_bestMerit = baseSetError; baseSet = (char [])raceSets[(int)winnerInfo[0]].clone(); if (m_rankingRequested) { m_rankedAtts[m_rankedSoFar][0] = (int)(winnerInfo[0]-1); m_rankedAtts[m_rankedSoFar][1] = winnerInfo[1]; m_rankedSoFar++; } boolean improved = true; int j; // now race until there is no improvement over the base set or only // one competitor remains while (improved) { // generate the next set of competitors numCompetitors--; if (numCompetitors == 0) { //race finished! break; } j=0; // +1. we'll race against the base set---might be able to bail out // of the race if none from the new set are statistically better // than the base set. Base set is stored in loc 0.
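// Slot 0 holds the base set unchanged; each of the remaining slots differs
// from it by exactly one attribute (one added for a forward race, one
// deleted for a backward race).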
raceSets = new char [numCompetitors+1][m_numAttribs]; for (int i=0;i<numCompetitors+1;i++) { raceSets[i] = (char [])baseSet.clone(); if (i > 0) { for (int k=j;k<m_numAttribs;k++) { if (m_raceType == BACKWARD_RACE) { if (k != m_classIndex && raceSets[i][k] != '0') { raceSets[i][k] = '0'; j = k+1; break; } } else { if (k != m_classIndex && raceSets[i][k] != '1') { raceSets[i][k] = '1'; j = k+1; break; } } } } } if (m_debug) { System.err.println("Next set : \n"+printSets(raceSets)); } improved = false; winnerInfo = raceSubsets(raceSets, data, true); String bs = new String(baseSet); String win = new String(raceSets[(int)winnerInfo[0]]); if (bs.compareTo(win) == 0) { // race finished } else { if (winnerInfo[1] < baseSetError || m_rankingRequested) { improved = true; baseSetError = winnerInfo[1]; m_bestMerit = baseSetError; // find which att is different if (m_rankingRequested) { for (int i = 0; i < baseSet.length; i++) { if (win.charAt(i) != bs.charAt(i)) { m_rankedAtts[m_rankedSoFar][0] = i; m_rankedAtts[m_rankedSoFar][1] = winnerInfo[1]; m_rankedSoFar++; } } } baseSet = (char [])raceSets[(int)winnerInfo[0]].clone(); } else { // Will get here for a subset whose error is outside the delta // threshold but is not *significantly* worse than the base // subset //throw new Exception("RaceSearch: problem in hillClimbRace"); } } } return attributeList(baseSet); } /** * Converts an attribute set to an array of indices. * @param list the attribute set as an array of '1'/'0' characters * @return the indices of the attributes marked '1' */ private int [] attributeList(char [] list) { int count = 0; for (int i=0;i<m_numAttribs;i++) { if (list[i] == '1') { count++; } } int [] rlist = new int[count]; count = 0; for (int i=0;i<m_numAttribs;i++) { if (list[i] == '1') { rlist[count++] = i; } } return rlist; } /** * Races the cross validation errors of a set of * attribute subsets on a set of instances. * @param raceSets a set of attribute subset specifications * @param data the instances to use when cross validating * @param baseSetIncluded true if the first attribute set is a * base set generated from the previous race * @return a two element array containing the index of the winning * subset and its error * @exception Exception if an error occurs during cross validation */ private double [] raceSubsets(char [][]raceSets, Instances data, boolean baseSetIncluded) throws Exception { // the evaluators --- one for each subset ASEvaluation [] evaluators = m_theEvaluator.makeCopies(m_theEvaluator, raceSets.length); // array of subsets eliminated from the race boolean [] eliminated = new boolean [raceSets.length]; // individual statistics Stats [] individualStats = new Stats [raceSets.length]; // pairwise statistics PairedStats [][] testers = new PairedStats[raceSets.length][raceSets.length]; /** do we ignore the base set or not? */ int startPt = m_rankingRequested ?
1 : 0; for (int i=0;i<raceSets.length;i++) { individualStats[i] = new Stats(); for (int j=i+1;j<raceSets.length;j++) { testers[i][j] = new PairedStats(m_sigLevel); } } BitSet [] raceBitSets = new BitSet[raceSets.length]; for (int i=0;i<raceSets.length;i++) { raceBitSets[i] = new BitSet(m_numAttribs); for (int j=0;j<m_numAttribs;j++) { if (raceSets[i][j] == '1') { raceBitSets[i].set(j); } } } // now loop over the data points collecting cross-validation errors for // each attribute set Instances trainCV; Instances testCV; Instance testInst; double [] errors = new double [raceSets.length]; int eliminatedCount = 0; int processedCount = 0; // if there is one set left in the race then we need to continue to // evaluate it for the remaining instances in order to get an // accurate error estimate Stats clearWinner = null; int foldSize=1; processedCount = 0; race: for (int i=0;i<m_numFolds;i++) { trainCV = data.trainCV(m_numFolds, i); testCV = data.testCV(m_numFolds, i); foldSize = testCV.numInstances(); // loop over the surviving attribute sets building classifiers for this // training set for (int j=startPt;j<raceSets.length;j++) { if (!eliminated[j]) { evaluators[j].buildEvaluator(trainCV); } } for (int z=0;z<testCV.numInstances();z++) { testInst = testCV.instance(z); processedCount++; // loop over surviving attribute sets computing errors for this // test point for (int zz=startPt;zz<raceSets.length;zz++) { if (!eliminated[zz]) { if (z == 0) {// first test instance---make sure classifier is built errors[zz] = -((HoldOutSubsetEvaluator)evaluators[zz]). evaluateSubset(raceBitSets[zz], testInst, true); } else { // must be k fold rather than leave one out errors[zz] = -((HoldOutSubsetEvaluator)evaluators[zz]). evaluateSubset(raceBitSets[zz], testInst, false); } } } // now update the stats for (int j=startPt;j<raceSets.length;j++) { if (!eliminated[j]) { individualStats[j].add(errors[j]); for (int k=j+1;k<raceSets.length;k++) { if (!eliminated[k]) { testers[j][k].add(errors[j], errors[k]); } } } } // test for near identical models and models that are significantly // worse than some other model if (processedCount > m_samples-1 && (eliminatedCount < raceSets.length-1)) { for (int j=0;j<raceSets.length;j++) { if (!eliminated[j]) { for (int k=j+1;k<raceSets.length;k++) { if (!eliminated[k]) { testers[j][k].calculateDerived(); // near identical ? if ((testers[j][k].differencesSignificance == 0) && (Utils.eq(testers[j][k].differencesStats.mean, 0.0) || (Utils.gr(m_delta, Math.abs(testers[j][k]. differencesStats.mean))))) { // if they're exactly the same and there is a base set // in this race, make sure that the base set is NOT the // one eliminated.
if (Utils.eq(testers[j][k].differencesStats.mean, 0.0)) { if (baseSetIncluded) { if (j != 0) { eliminated[j] = true; } else { eliminated[k] = true; } } else { eliminated[j] = true; } eliminatedCount++; if (m_debug) { System.err.println("Eliminating (identical) " +j+" "+raceBitSets[j].toString() +" vs "+k+" " +raceBitSets[k].toString() +" after " +processedCount +" evaluations\n" +"\nerror "+j+" : " +testers[j][k].xStats.mean +" vs "+k+" : " +testers[j][k].yStats.mean +" diff : " +testers[j][k].differencesStats .mean); } } else { // eliminate the one with the higher error if (testers[j][k].xStats.mean > testers[j][k].yStats.mean) { eliminated[j] = true; eliminatedCount++; if (m_debug) { System.err.println("Eliminating (near identical) " +j+" "+raceBitSets[j].toString() +" vs "+k+" " +raceBitSets[k].toString() +" after " +processedCount +" evaluations\n" +"\nerror "+j+" : " +testers[j][k].xStats.mean +" vs "+k+" : " +testers[j][k].yStats.mean +" diff : " +testers[j][k].differencesStats .mean); } break; } else { eliminated[k] = true; eliminatedCount++; if (m_debug) { System.err.println("Eliminating (near identical) " +k+" "+raceBitSets[k].toString() +" vs "+j+" " +raceBitSets[j].toString() +" after " +processedCount +" evaluations\n" +"\nerror "+k+" : " +testers[j][k].yStats.mean +" vs "+j+" : " +testers[j][k].xStats.mean +" diff : " +testers[j][k].differencesStats .mean); } } } } else { // significantly worse ? if (testers[j][k].differencesSignificance != 0) { if (testers[j][k].differencesSignificance > 0) { eliminated[j] = true; eliminatedCount++; if (m_debug) { System.err.println("Eliminating (-worse) " +j+" "+raceBitSets[j].toString() +" vs "+k+" " +raceBitSets[k].toString() +" after " +processedCount +" evaluations" +"\nerror "+j+" : " +testers[j][k].xStats.mean +" vs "+k+" : " +testers[j][k].yStats.mean); } break; } else { eliminated[k] = true; eliminatedCount++; if (m_debug) { System.err.println("Eliminating (worse) " +k+" "+raceBitSets[k].toString() +" vs "+j+" " +raceBitSets[j].toString() +" after " +processedCount +" evaluations" +"\nerror "+k+" : " +testers[j][k].yStats.mean +" vs "+j+" : " +testers[j][k].xStats.mean); } } } } } } } } // if there is a base set from the previous race and it's the // only remaining subset then terminate the race.
if (eliminatedCount == raceSets.length-1 && baseSetIncluded && !eliminated[0] && !m_rankingRequested) { break race; } } } if (m_debug) { System.err.println("*****eliminated count: "+eliminatedCount); } double bestError = Double.MAX_VALUE; int bestIndex=0; // return the index of the winner for (int i=startPt;i<raceSets.length;i++) { if (!eliminated[i]) { individualStats[i].calculateDerived(); if (m_debug) { System.err.println("Remaining error: "+raceBitSets[i].toString() +" "+individualStats[i].mean); } if (individualStats[i].mean < bestError) { bestError = individualStats[i].mean; bestIndex = i; } } } double [] retInfo = new double[2]; retInfo[0] = bestIndex; retInfo[1] = bestError; if (m_debug) { System.err.print("Best set from race : "); for (int i=0;i<m_numAttribs;i++) { if (raceSets[bestIndex][i] == '1') { System.err.print('1'); } else { System.err.print('0'); } } System.err.println(" :"+bestError+" Processed : "+(processedCount) +"\n"+individualStats[bestIndex].toString()); } return retInfo; } public String toString() { StringBuffer text = new StringBuffer(); text.append("\tRaceSearch.\n\tRace type : "); switch (m_raceType) { case FORWARD_RACE: text.append("forward selection race\n\tBase set : no attributes"); break; case BACKWARD_RACE: text.append("backward elimination race\n\tBase set : all attributes"); break; case SCHEMATA_RACE: text.append("schemata race\n\tBase set : no attributes"); break; case RANK_RACE: text.append("rank race\n\tBase set : no attributes\n\t"); text.append("Attribute evaluator : " + getAttributeEvaluator().getClass().getName() +" "); if (m_ASEval instanceof OptionHandler) { String[] evaluatorOptions = new String[0]; evaluatorOptions = ((OptionHandler)m_ASEval).getOptions(); for (int i=0;i<evaluatorOptions.length;i++) { text.append(evaluatorOptions[i]+' '); } } text.append("\n"); text.append("\tAttribute ranking : \n"); int rlength = (int)(Math.log(m_Ranking.length) / Math.log(10) + 1); for (int i=0;i<m_Ranking.length;i++) { text.append("\t "+Utils.doubleToString((double)(m_Ranking[i]+1), rlength,0) +" "+m_Instances.attribute(m_Ranking[i]).name()+'\n'); } break; } text.append("\n\tCross validation mode : "); if (m_xvalType == TEN_FOLD) { text.append("10 fold"); } else { text.append("Leave-one-out"); } text.append("\n\tMerit of best subset found : "); int fieldwidth = 3; double precision = (m_bestMerit - (int)m_bestMerit); if (Math.abs(m_bestMerit) > 0) { fieldwidth = (int)Math.abs((Math.log(Math.abs(m_bestMerit)) / Math.log(10)))+2; } if (Math.abs(precision) > 0) { precision = Math.abs((Math.log(Math.abs(precision)) / Math.log(10)))+3; } else { precision = 2; } text.append(Utils.doubleToString(Math.abs(m_bestMerit), fieldwidth+(int)precision, (int)precision)+"\n"); return text.toString(); } /** * Reset the search method. */ protected void resetOptions () { m_sigLevel = 0.001; m_delta = 0.001; m_ASEval = new GainRatioAttributeEval(); m_Ranking = null; m_raceType = FORWARD_RACE; m_debug = false; m_theEvaluator = null; m_bestMerit = -Double.MAX_VALUE; m_numFolds = 10; } }
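/*
 * Example usage (a sketch, not part of the original source): runs a forward
 * selection race with ClassifierSubsetEval, the evaluator this class is
 * documented to work with. The variable "train" is assumed to hold the
 * training Instances with the class attribute already set.
 *
 *   RaceSearch search = new RaceSearch();
 *   search.setRaceType(new SelectedTag(0, RaceSearch.TAGS_SELECTION)); // 0 = forward
 *   ClassifierSubsetEval eval = new ClassifierSubsetEval();
 *   int [] selected = search.search(eval, train);
 *   System.out.println(search.toString());
 */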