/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * RELEASE INFORMATION (December 27, 2004) * * FCBF algorithm: * Template obtained from Weka * Developed for Weka by Zheng Alan Zhao * December 27, 2004 * * FCBF algorithm is a feature selection method based on Symmetrical Uncertainty Measurement for * relevance redundancy analysis. The details of FCBF algorithm are in: * <!-- technical-plaintext-start --> * Lei Yu, Huan Liu: Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution. In: Proceedings of the Twentieth International Conference on Machine Learning, 856-863, 2003. <!-- technical-plaintext-end --> * * * * CONTACT INFORMATION * * For algorithm implementation: * Zheng Zhao: zhaozheng at asu.edu * * For the algorithm: * Lei Yu: leiyu at asu.edu * Huan Liu: hliu at asu.edu * * Data Mining and Machine Learning Lab * Computer Science and Engineering Department * Fulton School of Engineering * Arizona State University * Tempe, AZ 85287 * * SymmetricalUncertAttributeSetEval.java * * Copyright (C) 2004 Data Mining and Machine Learning Lab, * Computer Science and Engineering Department, * Fulton School of Engineering, * Arizona State University * */ package weka.attributeSelection; import weka.core.*; import weka.core.Capabilities.Capability; import weka.core.TechnicalInformation.Field; import weka.core.TechnicalInformation.Type; import weka.filters.Filter; import weka.filters.supervised.attribute.Discretize; import java.util.Enumeration; import java.util.Vector; /** <!-- globalinfo-start --> * SymmetricalUncertAttributeSetEval :<br/> * <br/> * Evaluates the worth of a set attributes by measuring the symmetrical uncertainty with respect to another set of attributes. <br/> * <br/> * SymmU(AttributeSet2, AttributeSet1) = 2 * (H(AttributeSet2) - H(AttributeSet1 | AttributeSet2)) / H(AttributeSet2) + H(AttributeSet1).<br/> * <br/> * For more information see:<br/> * <br/> * Lei Yu, Huan Liu: Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution. In: Proceedings of the Twentieth International Conference on Machine Learning, 856-863, 2003. * <p/> <!-- globalinfo-end --> * <!-- technical-bibtex-start --> * BibTeX: * <pre> * @inproceedings{Yu2003, * author = {Lei Yu and Huan Liu}, * booktitle = {Proceedings of the Twentieth International Conference on Machine Learning}, * pages = {856-863}, * publisher = {AAAI Press}, * title = {Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution}, * year = {2003} * } * </pre> * <p/> <!-- technical-bibtex-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -M * treat missing values as a seperate value.</pre> * <!-- options-end --> * * @author Zheng Zhao: zhaozheng at asu.edu * @version $Revision: 5447 $ * @see Discretize */ public class SymmetricalUncertAttributeSetEval extends AttributeSetEvaluator implements OptionHandler, TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = 8351377335495873202L; /** The training instances */ private Instances m_trainInstances; /** The class index */ private int m_classIndex; /** The number of attributes */ private int m_numAttribs; /** The number of instances */ private int m_numInstances; /** The number of classes */ private int m_numClasses; /** Treat missing values as a seperate value */ private boolean m_missing_merge; /** * Returns a string describing this attribute evaluator * @return a description of the evaluator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "SymmetricalUncertAttributeSetEval :\n\nEvaluates the worth of a set attributes " +"by measuring the symmetrical uncertainty with respect to another set of attributes. " +"\n\n SymmU(AttributeSet2, AttributeSet1) = 2 * (H(AttributeSet2) - H(AttributeSet1 | AttributeSet2)) " +"/ H(AttributeSet2) + H(AttributeSet1).\n\n" + "For more information see:\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing * detailed information about the technical background of this class, * e.g., paper reference or book this class is based on. * * @return the technical information about this class */ public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "Lei Yu and Huan Liu"); result.setValue(Field.TITLE, "Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution"); result.setValue(Field.BOOKTITLE, "Proceedings of the Twentieth International Conference on Machine Learning"); result.setValue(Field.YEAR, "2003"); result.setValue(Field.PAGES, "856-863"); result.setValue(Field.PUBLISHER, "AAAI Press"); return result; } /** * Constructor */ public SymmetricalUncertAttributeSetEval () { resetOptions(); } /** * Returns an enumeration describing the available options. * @return an enumeration of all the available options. **/ public Enumeration listOptions () { Vector newVector = new Vector(1); newVector.addElement(new Option("\ttreat missing values as a seperate " + "value.", "M", 0, "-M")); return newVector.elements(); } /** * Parses a given list of options. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -M * treat missing values as a seperate value.</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions (String[] options) throws Exception { resetOptions(); setMissingMerge(!(Utils.getFlag('M', options))); } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String missingMergeTipText() { return "Distribute counts for missing values. Counts are distributed " +"across other values in proportion to their frequency. Otherwise, " +"missing is treated as a separate value."; } /** * distribute the counts for missing values across observed values * * @param b true=distribute missing values. */ public void setMissingMerge (boolean b) { m_missing_merge = b; } /** * get whether missing values are being distributed or not * * @return true if missing values are being distributed. */ public boolean getMissingMerge () { return m_missing_merge; } /** * Gets the current settings of WrapperSubsetEval. * @return an array of strings suitable for passing to setOptions() */ public String[] getOptions () { String[] options = new String[1]; int current = 0; if (!getMissingMerge()) { options[current++] = "-M"; } while (current < options.length) { options[current++] = ""; } return options; } /** * Returns the capabilities of this evaluator. * * @return the capabilities of this evaluator * @see Capabilities */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); // class result.enable(Capability.NOMINAL_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); return result; } /** * Initializes a symmetrical uncertainty attribute evaluator. * Discretizes all attributes that are numeric. * * @param data set of instances serving as training data * @throws Exception if the evaluator has not been * generated successfully */ public void buildEvaluator (Instances data) throws Exception { // can evaluator handle data? getCapabilities().testWithFail(data); m_trainInstances = data; m_classIndex = m_trainInstances.classIndex(); m_numAttribs = m_trainInstances.numAttributes(); m_numInstances = m_trainInstances.numInstances(); Discretize disTransform = new Discretize(); disTransform.setUseBetterEncoding(true); disTransform.setInputFormat(m_trainInstances); m_trainInstances = Filter.useFilter(m_trainInstances, disTransform); m_numClasses = m_trainInstances.attribute(m_classIndex).numValues(); } /** * set options to default values */ protected void resetOptions () { m_trainInstances = null; m_missing_merge = true; } /** * evaluates an individual attribute by measuring the symmetrical * uncertainty between it and the class. * * @param attribute the index of the attribute to be evaluated * @return the uncertainty * @throws Exception if the attribute could not be evaluated */ public double evaluateAttribute (int attribute) throws Exception { int i, j, ii, jj; int ni, nj; double sum = 0.0; ni = m_trainInstances.attribute(attribute).numValues() + 1; nj = m_numClasses + 1; double[] sumi, sumj; Instance inst; double temp = 0.0; sumi = new double[ni]; sumj = new double[nj]; double[][] counts = new double[ni][nj]; sumi = new double[ni]; sumj = new double[nj]; for (i = 0; i < ni; i++) { sumi[i] = 0.0; for (j = 0; j < nj; j++) { sumj[j] = 0.0; counts[i][j] = 0.0; } } // Fill the contingency table for (i = 0; i < m_numInstances; i++) { inst = m_trainInstances.instance(i); if (inst.isMissing(attribute)) { ii = ni - 1; } else { ii = (int)inst.value(attribute); } if (inst.isMissing(m_classIndex)) { jj = nj - 1; } else { jj = (int)inst.value(m_classIndex); } counts[ii][jj]++; } // get the row totals for (i = 0; i < ni; i++) { sumi[i] = 0.0; for (j = 0; j < nj; j++) { //there are how many happen of a special feature value sumi[i] += counts[i][j]; sum += counts[i][j]; } } // get the column totals for (j = 0; j < nj; j++) { sumj[j] = 0.0; for (i = 0; i < ni; i++) { //a class value include how many instance. sumj[j] += counts[i][j]; } } // distribute missing counts if (m_missing_merge && (sumi[ni-1] < m_numInstances) && (sumj[nj-1] < m_numInstances)) { double[] i_copy = new double[sumi.length]; double[] j_copy = new double[sumj.length]; double[][] counts_copy = new double[sumi.length][sumj.length]; for (i = 0; i < ni; i++) { System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length); } System.arraycopy(sumi, 0, i_copy, 0, sumi.length); System.arraycopy(sumj, 0, j_copy, 0, sumj.length); double total_missing = (sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]); // do the missing i's if (sumi[ni - 1] > 0.0) { //sumi[ni - 1]: missing value contains how many values. for (j = 0; j < nj - 1; j++) { if (counts[ni - 1][j] > 0.0) { for (i = 0; i < ni - 1; i++) { temp = ((i_copy[i]/(sum - i_copy[ni - 1])) * counts[ni - 1][j]); counts[i][j] += temp; //according to the probability of value i we distribute account of the missing degree of a class lable to it sumi[i] += temp; } counts[ni - 1][j] = 0.0; } } } sumi[ni - 1] = 0.0; // do the missing j's if (sumj[nj - 1] > 0.0) { for (i = 0; i < ni - 1; i++) { if (counts[i][nj - 1] > 0.0) { for (j = 0; j < nj - 1; j++) { temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]); counts[i][j] += temp; sumj[j] += temp; } counts[i][nj - 1] = 0.0; } } } sumj[nj - 1] = 0.0; // do the both missing if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) { for (i = 0; i < ni - 1; i++) { for (j = 0; j < nj - 1; j++) { temp = (counts_copy[i][j]/(sum - total_missing)) * counts_copy[ni - 1][nj - 1]; counts[i][j] += temp; sumi[i] += temp; sumj[j] += temp; } } counts[ni - 1][nj - 1] = 0.0; } } return ContingencyTables.symmetricalUncertainty(counts); } /** * calculate symmetrical uncertainty between sets of attributes * * @param attributes the indexes of the attributes * @param classAttributes the indexes of the attributes whose combination will * be used as class label * @return the uncertainty * @throws Exception if the attribute could not be evaluated */ public double evaluateAttribute (int[] attributes, int[] classAttributes) throws Exception { int i, j; //variable for looping. int p; //variable for looping. int ii, jj; //specifying the position in the contingency table. int nnj, nni; //counting base for attributes[]. int ni, nj; //the nubmer of rows and columns in the ContingencyTables. double sum = 0.0; boolean b_missing_attribute = false; boolean b_missing_classAtrribute = false; if(attributes.length==0) { throw new Exception("the parameter attributes[] is empty;SEQ:W-FS-Eval-SUAS-001"); } if(classAttributes.length==0) { throw new Exception("the parameter classAttributes[] is empty;SEQ:W-FS-Eval-SUAS-002"); } /*calculate the number of the rows in ContingencyTable*/ ni = m_trainInstances.attribute(attributes[0]).numValues(); if (ni == 0) { throw new Exception("an attribute is empty;SEQ:W-FS-Eval-SUAS-003;"+1); } for (i = 1;i<attributes.length;i++) { if (m_trainInstances.attribute(attributes[i]).numValues() == 0) { throw new Exception("an attribute is empty;SEQ:W-FS-Eval-SUAS-003;" + (i+1)); } ni = ni*m_trainInstances.attribute(attributes[i]).numValues(); } ni = ni+1; /*calculate the number of the colums in the ContingencyTable*/ nj = m_trainInstances.attribute(classAttributes[0]).numValues(); if (nj == 0) { throw new Exception("the a classAttribute is empty;SEQ:W-FS-Eval-SUAS-004;"+1); } for (i = 1;i<classAttributes.length;i++) { if (m_trainInstances.attribute(classAttributes[i]).numValues() == 0) { throw new Exception("the a classAttribute is empty;SEQ:W-FS-Eval-SUAS-004;" + (i+1)); } nj = nj*m_trainInstances.attribute(classAttributes[i]).numValues(); } nj = nj+1; double[] sumi, sumj; Instance inst; double temp = 0.0; sumi = new double[ni]; sumj = new double[nj]; double[][] counts = new double[ni][nj]; sumi = new double[ni]; sumj = new double[nj]; for (i = 0; i < ni; i++) { sumi[i] = 0.0; for (j = 0; j < nj; j++) { sumj[j] = 0.0; counts[i][j] = 0.0; } } // Fill the contingency table for (i = 0; i < m_numInstances; i++) { inst = m_trainInstances.instance(i); b_missing_attribute = false; b_missing_classAtrribute = false; /*get row position in contingency table*/ nni = 1; ii = 0; for (p=attributes.length-1; p>=0; p--) { if (inst.isMissing(attributes[p])) { b_missing_attribute = true; } ii = ((int)inst.value(attributes[p])*nni)+ii; if (p<attributes.length-1){ nni = nni * (m_trainInstances.attribute(attributes[p]).numValues()); } else { nni = m_trainInstances.attribute(attributes[p]).numValues(); } } if (b_missing_attribute) { ii = ni-1; } /*get colum position in contingency table*/ nnj = 1; jj = 0; for (p=classAttributes.length-1; p>=0; p--) { if (inst.isMissing(classAttributes[p])) { b_missing_classAtrribute = true; } jj = ((int)inst.value(classAttributes[p])*nnj)+jj; if (p<attributes.length-1){ nnj = nnj * (m_trainInstances.attribute(classAttributes[p]).numValues()); } else { nnj = m_trainInstances.attribute(classAttributes[p]).numValues(); } } if (b_missing_classAtrribute) { jj = nj-1; } counts[ii][jj]++; } // get the row totals for (i = 0; i < ni; i++) { sumi[i] = 0.0; for (j = 0; j < nj; j++) { //there are how many happen of a special feature value sumi[i] += counts[i][j]; sum += counts[i][j]; } } // get the column totals for (j = 0; j < nj; j++) { sumj[j] = 0.0; for (i = 0; i < ni; i++) { //a class value include how many instance. sumj[j] += counts[i][j]; } } // distribute missing counts if (m_missing_merge && (sumi[ni-1] < m_numInstances) && (sumj[nj-1] < m_numInstances)) { double[] i_copy = new double[sumi.length]; double[] j_copy = new double[sumj.length]; double[][] counts_copy = new double[sumi.length][sumj.length]; for (i = 0; i < ni; i++) { System.arraycopy(counts[i], 0, counts_copy[i], 0, sumj.length); } System.arraycopy(sumi, 0, i_copy, 0, sumi.length); System.arraycopy(sumj, 0, j_copy, 0, sumj.length); double total_missing = (sumi[ni - 1] + sumj[nj - 1] - counts[ni - 1][nj - 1]); // do the missing i's if (sumi[ni - 1] > 0.0) { //sumi[ni - 1]: missing value contains how many values. for (j = 0; j < nj - 1; j++) { if (counts[ni - 1][j] > 0.0) { for (i = 0; i < ni - 1; i++) { temp = ((i_copy[i]/(sum - i_copy[ni - 1])) * counts[ni - 1][j]); counts[i][j] += temp; //according to the probability of value i we distribute account of the missing degree of a class lable to it sumi[i] += temp; } counts[ni - 1][j] = 0.0; } } } sumi[ni - 1] = 0.0; // do the missing j's if (sumj[nj - 1] > 0.0) { for (i = 0; i < ni - 1; i++) { if (counts[i][nj - 1] > 0.0) { for (j = 0; j < nj - 1; j++) { temp = ((j_copy[j]/(sum - j_copy[nj - 1]))*counts[i][nj - 1]); counts[i][j] += temp; sumj[j] += temp; } counts[i][nj - 1] = 0.0; } } } sumj[nj - 1] = 0.0; // do the both missing if (counts[ni - 1][nj - 1] > 0.0 && total_missing != sum) { for (i = 0; i < ni - 1; i++) { for (j = 0; j < nj - 1; j++) { temp = (counts_copy[i][j]/(sum - total_missing)) * counts_copy[ni - 1][nj - 1]; counts[i][j] += temp; sumi[i] += temp; sumj[j] += temp; } } counts[ni - 1][nj - 1] = 0.0; } } return ContingencyTables.symmetricalUncertainty(counts); } /** * Return a description of the evaluator * @return description as a string */ public String toString () { StringBuffer text = new StringBuffer(); if (m_trainInstances == null) { text.append("\tSymmetrical Uncertainty evaluator has not been built"); } else { text.append("\tSymmetrical Uncertainty Ranking Filter"); if (!m_missing_merge) { text.append("\n\tMissing values treated as seperate"); } } text.append("\n"); return text.toString(); } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 5447 $"); } // ============ // Test method. // ============ /** * Main method for testing this class. * * @param argv should contain the following arguments: * -t training file */ public static void main (String[] argv) { runEvaluator(new SymmetricalUncertAttributeSetEval(), argv); } }