/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* RankSearch.java
* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
*
*/
package weka.attributeSelection;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.Vector;
/**
<!-- globalinfo-start -->
* RankSearch : <br/>
* <br/>
* Uses an attribute/subset evaluator to rank all attributes. If a subset evaluator is specified,
* then a forward selection search is used to generate a ranked list. From the
* ranked list of attributes, subsets of increasing size are evaluated, ie.
* The best attribute, the best attribute plus the next best attribute, etc....
* The best attribute set is reported. RankSearch is linear in the number of
* attributes if a simple attribute evaluator is used such as GainRatioAttributeEval.
* For more information see:<br/>
* <br/>
* Mark Hall, Geoffrey Holmes (2003). Benchmarking attribute selection techniques
* for discrete class data mining. IEEE Transactions on Knowledge and Data Engineering. 15(6):1437-1447.
* <p/>
<!-- globalinfo-end -->
*
<!-- options-start -->
* Valid options are: <p/>
*
* <pre> -A <attribute evaluator>
* class name of attribute evaluator to use for ranking. Place any
* evaluator options LAST on the command line following a "--".
* eg.:
* -A weka.attributeSelection.GainRatioAttributeEval ... -- -M
* (default: weka.attributeSelection.GainRatioAttributeEval)</pre>
*
* <pre> -S <step size>
* number of attributes to be added from the
* ranking in each iteration (default = 1).</pre>
*
* <pre> -R <start point>
* point in the ranking to start evaluating from.
* (default = 0, ie. the head of the ranking).</pre>
*
* <pre>
* Options specific to evaluator weka.attributeSelection.GainRatioAttributeEval:
* </pre>
*
* <pre> -M
* treat missing values as a separate value.</pre>
*
<!-- options-end -->
*
* @author Mark Hall (mhall@cs.waikato.ac.nz)
* @version $Revision: 6253 $
*/
public class RankSearch
extends ASSearch
implements OptionHandler, TechnicalInformationHandler {
/** for serialization */
static final long serialVersionUID = -7992268736874353755L;
/** true if the data has a class attribute (i.e. a supervised evaluator is used) */
private boolean m_hasClass;
/** holds the index of the class attribute (only valid when m_hasClass is true) */
private int m_classIndex;
/** number of attributes in the training data */
private int m_numAttribs;
/** the best subset found */
// NOTE(review): never assigned in the visible code — search() keeps the best
// subset in a local variable instead; confirm whether this field is still needed
private BitSet m_best_group;
/** the attribute evaluator to use for generating the ranking */
private ASEvaluation m_ASEval;
/** the subset evaluator with which to evaluate the ranking */
private ASEvaluation m_SubsetEval;
/** the training instances */
private Instances m_Instances;
/** the merit of the best subset found by the last call to search() */
private double m_bestMerit;
/** will hold the attribute ranking produced by the attribute evaluator */
private int [] m_Ranking;
/** add this many attributes in each iteration from the ranking (option -S) */
protected int m_add = 1;
/** start from this point in the ranking (option -R) */
protected int m_startPoint = 0;
/**
 * Returns a string describing this search method.
 *
 * @return a description of the search method suitable for
 * displaying in the explorer/experimenter gui
 */
public String globalInfo() {
  // keep the description in sync with the class-level javadoc
  return "RankSearch : \n\n"
    +"Uses an attribute/subset evaluator to rank all attributes. "
    +"If a subset evaluator is specified, then a forward selection "
    +"search is used to generate a ranked list. From the ranked "
    +"list of attributes, subsets of increasing size are evaluated, ie. "
    +"The best attribute, the best attribute plus the next best attribute, "
    +"etc.... The best attribute set is reported. RankSearch is linear in "
    +"the number of attributes if a simple attribute evaluator is used "
    +"such as GainRatioAttributeEval. For more information see:\n\n"
    + getTechnicalInformation().toString();
}
/**
 * Returns an instance of a TechnicalInformation object, containing
 * detailed information about the technical background of this class,
 * e.g., paper reference or book this class is based on.
 *
 * @return the technical information about this class
 */
public TechnicalInformation getTechnicalInformation() {
  // reference for the paper describing the RankSearch approach
  TechnicalInformation info = new TechnicalInformation(Type.ARTICLE);
  info.setValue(Field.AUTHOR, "Mark Hall and Geoffrey Holmes");
  info.setValue(Field.YEAR, "2003");
  info.setValue(Field.TITLE,
      "Benchmarking attribute selection techniques for discrete class data mining");
  info.setValue(Field.JOURNAL, "IEEE Transactions on Knowledge and Data Engineering");
  info.setValue(Field.VOLUME, "15");
  info.setValue(Field.NUMBER, "6");
  info.setValue(Field.PAGES, "1437-1447");
  info.setValue(Field.PUBLISHER, "IEEE Computer Society");
  return info;
}
/**
 * Constructor. Initialises the search with default settings
 * (GainRatioAttributeEval as the ranking evaluator, step size 1,
 * start point 0).
 */
public RankSearch () {
  resetOptions();
}
/**
 * Returns the tip text for the attributeEvaluator property.
 *
 * @return tip text for this property suitable for
 * displaying in the explorer/experimenter gui
 */
public String attributeEvaluatorTipText() {
  return "Attribute evaluator to use for generating a ranking.";
}
/**
 * Set the attribute evaluator to use for generating the ranking.
 *
 * @param newEvaluator the attribute evaluator to use.
 */
public void setAttributeEvaluator(ASEvaluation newEvaluator) {
  m_ASEval = newEvaluator;
}
/**
 * Get the attribute evaluator used to generate the ranking.
 *
 * @return the evaluator used to generate the ranking.
 */
public ASEvaluation getAttributeEvaluator() {
  return m_ASEval;
}
/**
 * Returns the tip text for the stepSize property.
 *
 * @return tip text for this property suitable for
 * displaying in the explorer/experimenter gui
 */
public String stepSizeTipText() {
  return "Add this many attributes from the ranking in each iteration.";
}
/**
 * Set the number of attributes to add from the ranking
 * in each iteration. Non-positive values are silently ignored
 * and the current step size is retained.
 *
 * @param ss the number of attributes to add.
 */
public void setStepSize(int ss) {
  if (ss <= 0) {
    return; // reject nonsensical step sizes, keep the current value
  }
  m_add = ss;
}
/**
 * Get the number of attributes to add from the ranking
 * in each iteration.
 *
 * @return the number of attributes to add.
 */
public int getStepSize() {
  return m_add;
}
/**
 * Returns the tip text for the startPoint property.
 *
 * @return tip text for this property suitable for
 * displaying in the explorer/experimenter gui
 */
public String startPointTipText() {
  return "Start evaluating from this point in the ranking.";
}
/**
 * Set the point at which to start evaluating the ranking.
 * Negative values are silently ignored and the current start
 * point is retained.
 *
 * @param sp the position in the ranking to start at
 */
public void setStartPoint(int sp) {
  if (sp < 0) {
    return; // a negative position in the ranking is meaningless
  }
  m_startPoint = sp;
}
/**
 * Get the point at which to start evaluating the ranking.
 *
 * @return the position in the ranking to start at
 */
public int getStartPoint() {
  return m_startPoint;
}
/**
 * Returns an enumeration describing the available options.
 * If the current attribute evaluator is an OptionHandler, its
 * options are appended after the RankSearch-specific ones.
 *
 * @return an enumeration of all the available options.
 **/
public Enumeration listOptions () {
  Vector result = new Vector(4);

  result.addElement(new Option(
      "\tclass name of attribute evaluator to use for ranking. Place any\n"
      + "\tevaluator options LAST on the command line following a \"--\".\n"
      + "\teg.:\n"
      + "\t\t-A weka.attributeSelection.GainRatioAttributeEval ... -- -M\n"
      + "\t(default: weka.attributeSelection.GainRatioAttributeEval)",
      "A", 1, "-A <attribute evaluator>"));
  result.addElement(new Option(
      "\tnumber of attributes to be added from the"
      + "\n\tranking in each iteration (default = 1).",
      "S", 1, "-S <step size>"));
  result.addElement(new Option(
      "\tpoint in the ranking to start evaluating from. "
      + "\n\t(default = 0, ie. the head of the ranking).",
      "R", 1, "-R <start point>"));

  // expose the evaluator's own options as well (instanceof is null-safe)
  if (m_ASEval instanceof OptionHandler) {
    result.addElement(new Option("", "", 0,
        "\nOptions specific to evaluator "
        + m_ASEval.getClass().getName() + ":"));
    for (Enumeration e = ((OptionHandler)m_ASEval).listOptions();
         e.hasMoreElements(); ) {
      result.addElement(e.nextElement());
    }
  }
  return result.elements();
}
/**
* Parses a given list of options. <p/>
*
<!-- options-start -->
* Valid options are: <p/>
*
* <pre> -A <attribute evaluator>
* class name of attribute evaluator to use for ranking. Place any
* evaluator options LAST on the command line following a "--".
* eg.:
* -A weka.attributeSelection.GainRatioAttributeEval ... -- -M
* (default: weka.attributeSelection.GainRatioAttributeEval)</pre>
*
* <pre> -S <step size>
* number of attributes to be added from the
* ranking in each iteration (default = 1).</pre>
*
* <pre> -R <start point>
* point in the ranking to start evaluating from.
* (default = 0, ie. the head of the ranking).</pre>
*
* <pre>
* Options specific to evaluator weka.attributeSelection.GainRatioAttributeEval:
* </pre>
*
* <pre> -M
* treat missing values as a separate value.</pre>
*
<!-- options-end -->
*
* @param options the list of options as an array of strings
* @throws Exception if an option is not supported
*/
public void setOptions (String[] options)
  throws Exception {

  resetOptions();

  // step size through the ranking
  String value = Utils.getOption('S', options);
  if (value.length() > 0) {
    setStepSize(Integer.parseInt(value));
  }

  // where in the ranking to begin evaluating
  value = Utils.getOption('R', options);
  if (value.length() > 0) {
    setStartPoint(Integer.parseInt(value));
  }

  // ranking evaluator; anything after "--" is handed to it
  value = Utils.getOption('A', options);
  if (value.length() == 0) {
    value = GainRatioAttributeEval.class.getName();
  }
  setAttributeEvaluator(ASEvaluation.forName(value,
                                             Utils.partitionOptions(options)));
}
/**
 * Gets the current settings of RankSearch. Unused slots in the
 * returned array are padded with empty strings.
 *
 * @return an array of strings suitable for passing to setOptions()
 */
public String[] getOptions () {
  String[] evalOpts = new String[0];
  if (m_ASEval instanceof OptionHandler) {
    evalOpts = ((OptionHandler)m_ASEval).getOptions();
  }

  String[] result = new String[8 + evalOpts.length];
  int pos = 0;

  result[pos++] = "-S";
  result[pos++] = "" + getStepSize();
  result[pos++] = "-R";
  result[pos++] = "" + getStartPoint();

  if (getAttributeEvaluator() != null) {
    result[pos++] = "-A";
    result[pos++] = getAttributeEvaluator().getClass().getName();
  }

  // evaluator-specific options follow a "--" separator
  if (evalOpts.length > 0) {
    result[pos++] = "--";
    System.arraycopy(evalOpts, 0, result, pos, evalOpts.length);
    pos += evalOpts.length;
  }

  // pad any remaining slots with empty strings
  while (pos < result.length) {
    result[pos++] = "";
  }
  return result;
}
/**
 * Reset the search method to its default state: a
 * GainRatioAttributeEval ranking evaluator and no ranking.
 * Note: m_add and m_startPoint are NOT reset here; they keep
 * their field initialisers until explicitly set.
 */
protected void resetOptions () {
  m_ASEval = new GainRatioAttributeEval();
  m_Ranking = null;
}
/**
 * Ranks attributes using the specified attribute evaluator and then
 * searches the ranking using the supplied subset evaluator: subsets
 * consisting of increasingly long prefixes of the ranking are evaluated
 * and the best one is returned.
 *
 * @param ASEval the subset evaluator to guide the search
 * @param data the training instances.
 * @return an array (not necessarily ordered) of selected attribute indexes
 * @throws Exception if the search can't be completed (e.g. the supplied
 * evaluator is not a subset evaluator, or the start point lies beyond
 * the end of the ranking)
 */
public int[] search (ASEvaluation ASEval, Instances data)
  throws Exception {
  double best_merit = -Double.MAX_VALUE;
  double temp_merit;
  BitSet temp_group, best_group = null;

  if (!(ASEval instanceof SubsetEvaluator)) {
    throw new Exception(ASEval.getClass().getName()
                        + " is not a "
                        + "Subset evaluator!");
  }

  m_SubsetEval = ASEval;
  m_Instances = data;
  m_numAttribs = m_Instances.numAttributes();

  // an unsupervised ranking evaluator implies the data has no class attribute
  if (m_ASEval instanceof UnsupervisedAttributeEvaluator ||
      m_ASEval instanceof UnsupervisedSubsetEvaluator) {
    m_hasClass = false;
  } else {
    m_hasClass = true;
    m_classIndex = m_Instances.classIndex();
  }

  if (m_ASEval instanceof AttributeEvaluator) {
    // single-attribute evaluator: generate the attribute ranking directly
    Ranker ranker = new Ranker();
    m_ASEval.buildEvaluator(m_Instances);
    if (m_ASEval instanceof AttributeTransformer) {
      // get the transformed data and rebuild the subset evaluator on it
      m_Instances = ((AttributeTransformer)m_ASEval).
        transformedData(m_Instances);
      m_SubsetEval.buildEvaluator(m_Instances);
    }
    m_Ranking = ranker.search(m_ASEval, m_Instances);
  } else {
    // subset evaluator: use a forward selection to produce a ranked list
    GreedyStepwise fs = new GreedyStepwise();
    fs.setGenerateRanking(true);
    m_ASEval.buildEvaluator(m_Instances);
    fs.search(m_ASEval, m_Instances);
    double [][] rankres = fs.rankedAttributes();
    m_Ranking = new int[rankres.length];
    for (int i = 0; i < rankres.length; i++) {
      m_Ranking[i] = (int)rankres[i][0];  // column 0 holds the attribute index
    }
  }

  // now evaluate increasingly long prefixes of the attribute ranking
  for (int i = m_startPoint; i < m_Ranking.length; i += m_add) {
    temp_group = new BitSet(m_numAttribs);
    for (int j = 0; j <= i; j++) {
      temp_group.set(m_Ranking[j]);
    }
    temp_merit = ((SubsetEvaluator)m_SubsetEval).evaluateSubset(temp_group);
    if (temp_merit > best_merit) {
      best_merit = temp_merit;
      best_group = temp_group;
    }
  }

  // previously this fell through to a NullPointerException in
  // attributeList(null); report the misconfiguration explicitly instead
  if (best_group == null) {
    throw new Exception("Start point (" + m_startPoint
                        + ") lies beyond the end of the ranking ("
                        + m_Ranking.length + " attributes); no subsets "
                        + "were evaluated.");
  }

  m_bestMerit = best_merit;
  return attributeList(best_group);
}
/**
 * Converts a BitSet into a list of attribute indexes.
 * Uses BitSet.nextSetBit to visit only the set bits instead of
 * scanning every attribute position.
 *
 * @param group the BitSet to convert; only bits below m_numAttribs
 * are considered (matching the original bounded scan)
 * @return an array of the selected attribute indexes in ascending order
 **/
private int[] attributeList (BitSet group) {
  // first pass: count how many attributes were selected
  int count = 0;
  for (int i = group.nextSetBit(0);
       i >= 0 && i < m_numAttribs;
       i = group.nextSetBit(i + 1)) {
    count++;
  }
  // second pass: collect the selected indexes
  int[] list = new int[count];
  count = 0;
  for (int i = group.nextSetBit(0);
       i >= 0 && i < m_numAttribs;
       i = group.nextSetBit(i + 1)) {
    list[count++] = i;
  }
  return list;
}
/**
 * Returns a description of the search as a String: the evaluator used
 * (with its options), the attribute ranking (1-based indices) and the
 * merit of the best subset found.
 *
 * @return a description of the search
 */
public String toString () {
  StringBuffer text = new StringBuffer();
  text.append("\tRankSearch :\n");
  text.append("\tAttribute evaluator : "
              + getAttributeEvaluator().getClass().getName() +" ");
  if (m_ASEval instanceof OptionHandler) {
    String[] evaluatorOptions = new String[0];
    evaluatorOptions = ((OptionHandler)m_ASEval).getOptions();
    for (int i=0;i<evaluatorOptions.length;i++) {
      text.append(evaluatorOptions[i]+' ');
    }
  }
  text.append("\n");
  text.append("\tAttribute ranking : \n");
  // field width for the index column: number of decimal digits needed
  int rlength = (int)(Math.log(m_Ranking.length) / Math.log(10) + 1);
  for (int i=0;i<m_Ranking.length;i++) {
    // indices are displayed 1-based, as elsewhere in Weka output
    text.append("\t "+Utils.doubleToString((double)(m_Ranking[i]+1),
                                           rlength,0)
                +" "+m_Instances.attribute(m_Ranking[i]).name()+'\n');
  }
  text.append("\tMerit of best subset found : ");
  int fieldwidth = 3;
  // derive display precision from the fractional part of the merit;
  // the log10 arithmetic sizes the field to the magnitude of the value
  double precision = (m_bestMerit - (int)m_bestMerit);
  if (Math.abs(m_bestMerit) > 0) {
    fieldwidth = (int)Math.abs((Math.log(Math.abs(m_bestMerit)) / Math.log(10)))+2;
  }
  if (Math.abs(precision) > 0) {
    precision = Math.abs((Math.log(Math.abs(precision)) / Math.log(10)))+3;
  } else {
    precision = 2;
  }
  // NOTE(review): the absolute value of the merit is printed; the sign
  // of a negative merit is deliberately not shown here
  text.append(Utils.doubleToString(Math.abs(m_bestMerit),
                                   fieldwidth+(int)precision,
                                   (int)precision)+"\n");
  return text.toString();
}
/**
 * Returns the revision string.
 *
 * @return the revision
 */
public String getRevision() {
  return RevisionUtils.extract("$Revision: 6253 $");
}
}