/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* RELEASE INFORMATION (December 27, 2004)
*
* FCBF algorithm:
* Template obtained from Weka
* Developed for Weka by Zheng Alan Zhao
* December 27, 2004
*
* FCBF algorithm is a feature selection method based on Symmetrical Uncertainty Measurement for
* relevance redundancy analysis. The details of FCBF algorithm are in:
*
<!-- technical-plaintext-start -->
* Lei Yu, Huan Liu: Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution. In: Proceedings of the Twentieth International Conference on Machine Learning, 856-863, 2003.
<!-- technical-plaintext-end -->
*
*
*
* CONTACT INFORMATION
*
* For algorithm implementation:
* Zheng Zhao: zhaozheng at asu.edu
*
* For the algorithm:
* Lei Yu: leiyu at asu.edu
* Huan Liu: hliu at asu.edu
*
* Data Mining and Machine Learning Lab
* Computer Science and Engineering Department
* Fulton School of Engineering
* Arizona State University
* Tempe, AZ 85287
*
* SymmetricalUncertAttributeSetEval.java
*
* Copyright (C) 2004 Data Mining and Machine Learning Lab,
* Computer Science and Engineering Department,
* Fulton School of Engineering,
* Arizona State University
*
*/
package weka.attributeSelection;
import weka.core.Capabilities;
import weka.core.ContingencyTables;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;
import java.util.Enumeration;
import java.util.Vector;
/**
 <!-- globalinfo-start -->
 * SymmetricalUncertAttributeSetEval :<br/>
 * <br/>
 * Evaluates the worth of a set attributes by measuring the symmetrical uncertainty with respect to another set of attributes. <br/>
 * <br/>
 * SymmU(AttributeSet2, AttributeSet1) = 2 * (H(AttributeSet2) - H(AttributeSet2 | AttributeSet1)) / (H(AttributeSet2) + H(AttributeSet1)).<br/>
 * <br/>
 * For more information see:<br/>
 * <br/>
 * Lei Yu, Huan Liu: Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution. In: Proceedings of the Twentieth International Conference on Machine Learning, 856-863, 2003.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * @inproceedings{Yu2003,
 * author = {Lei Yu and Huan Liu},
 * booktitle = {Proceedings of the Twentieth International Conference on Machine Learning},
 * pages = {856-863},
 * publisher = {AAAI Press},
 * title = {Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution},
 * year = {2003}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -M
 * treat missing values as a seperate value.</pre>
 *
 <!-- options-end -->
 *
 * @author Zheng Zhao: zhaozheng at asu.edu
 * @version $Revision: 5511 $
 * @see Discretize
 */
public class SymmetricalUncertAttributeSetEval
  extends AttributeSetEvaluator
  implements OptionHandler, TechnicalInformationHandler {

  /** for serialization */
  static final long serialVersionUID = 8351377335495873202L;

  /** The (discretized) training instances; null until the evaluator is built */
  private Instances m_trainInstances;

  /** The class index */
  private int m_classIndex;

  /** The number of attributes */
  private int m_numAttribs;

  /** The number of instances */
  private int m_numInstances;

  /** The number of classes */
  private int m_numClasses;

  /**
   * Whether counts for missing values are distributed across the observed
   * values (true) or missing is treated as a separate value (false).
   */
  private boolean m_missing_merge;

  /**
   * Returns a string describing this attribute evaluator
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "SymmetricalUncertAttributeSetEval :\n\nEvaluates the worth of a set attributes "
      + "by measuring the symmetrical uncertainty with respect to another set of attributes. "
      + "\n\n SymmU(AttributeSet2, AttributeSet1) = 2 * (H(AttributeSet2) - H(AttributeSet2 | AttributeSet1)) "
      + "/ (H(AttributeSet2) + H(AttributeSet1)).\n\n"
      + "For more information see:\n\n"
      + getTechnicalInformation().toString();
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing
   * detailed information about the technical background of this class,
   * e.g., paper reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "Lei Yu and Huan Liu");
    result.setValue(Field.TITLE, "Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution");
    result.setValue(Field.BOOKTITLE, "Proceedings of the Twentieth International Conference on Machine Learning");
    result.setValue(Field.YEAR, "2003");
    result.setValue(Field.PAGES, "856-863");
    result.setValue(Field.PUBLISHER, "AAAI Press");
    return result;
  }

  /**
   * Constructor
   */
  public SymmetricalUncertAttributeSetEval () {
    resetOptions();
  }

  /**
   * Returns an enumeration describing the available options.
   * @return an enumeration of all the available options.
   **/
  public Enumeration listOptions () {
    Vector newVector = new Vector(1);
    newVector.addElement(new Option("\ttreat missing values as a seperate "
                                    + "value.", "M", 0, "-M"));
    return newVector.elements();
  }

  /**
   * Parses a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -M
   * treat missing values as a seperate value.</pre>
   *
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions (String[] options)
    throws Exception {
    resetOptions();
    // -M means "keep missing as its own value", i.e. do NOT merge.
    setMissingMerge(!(Utils.getFlag('M', options)));
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String missingMergeTipText() {
    return "Distribute counts for missing values. Counts are distributed "
      + "across other values in proportion to their frequency. Otherwise, "
      + "missing is treated as a separate value.";
  }

  /**
   * distribute the counts for missing values across observed values
   *
   * @param b true=distribute missing values.
   */
  public void setMissingMerge (boolean b) {
    m_missing_merge = b;
  }

  /**
   * get whether missing values are being distributed or not
   *
   * @return true if missing values are being distributed.
   */
  public boolean getMissingMerge () {
    return m_missing_merge;
  }

  /**
   * Gets the current settings of SymmetricalUncertAttributeSetEval.
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions () {
    // Always one slot: "-M" when missing is a separate value, "" otherwise.
    return new String[] { getMissingMerge() ? "" : "-M" };
  }

  /**
   * Returns the capabilities of this evaluator.
   *
   * @return the capabilities of this evaluator
   * @see Capabilities
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);

    return result;
  }

  /**
   * Initializes a symmetrical uncertainty attribute evaluator.
   * Discretizes all attributes that are numeric.
   *
   * @param data set of instances serving as training data
   * @throws Exception if the evaluator has not been
   * generated successfully
   */
  public void buildEvaluator (Instances data)
    throws Exception {
    // can evaluator handle data?
    getCapabilities().testWithFail(data);

    m_trainInstances = data;
    m_classIndex = m_trainInstances.classIndex();
    m_numAttribs = m_trainInstances.numAttributes();
    m_numInstances = m_trainInstances.numInstances();

    // The contingency tables index cells by nominal value, so every
    // attribute is run through the supervised Discretize filter first.
    Discretize disTransform = new Discretize();
    disTransform.setUseBetterEncoding(true);
    disTransform.setInputFormat(m_trainInstances);
    m_trainInstances = Filter.useFilter(m_trainInstances, disTransform);

    m_numClasses = m_trainInstances.attribute(m_classIndex).numValues();
  }

  /**
   * set options to default values
   */
  protected void resetOptions () {
    m_trainInstances = null;
    m_missing_merge = true;
  }

  /**
   * evaluates an individual attribute by measuring the symmetrical
   * uncertainty between it and the class.
   *
   * @param attribute the index of the attribute to be evaluated
   * @return the uncertainty
   * @throws Exception if the attribute could not be evaluated
   * @throws IllegalStateException if the evaluator has not been built
   */
  public double evaluateAttribute (int attribute)
    throws Exception {
    ensureBuilt();

    // One extra row/column at the end collects the missing values.
    int ni = m_trainInstances.attribute(attribute).numValues() + 1;
    int nj = m_numClasses + 1;
    double[][] counts = new double[ni][nj];

    // Fill the contingency table
    for (int k = 0; k < m_numInstances; k++) {
      Instance inst = m_trainInstances.instance(k);
      int row = inst.isMissing(attribute)
        ? ni - 1 : (int) inst.value(attribute);
      int col = inst.isMissing(m_classIndex)
        ? nj - 1 : (int) inst.value(m_classIndex);
      counts[row][col]++;
    }

    if (m_missing_merge) {
      distributeMissingCounts(counts);
    }
    return ContingencyTables.symmetricalUncertainty(counts);
  }

  /**
   * calculate symmetrical uncertainty between sets of attributes.
   * Each set is treated as one compound variable whose value is the
   * combination of the values of its attributes; an instance that is
   * missing any attribute of a set counts as missing for the whole set.
   *
   * @param attributes the indexes of the attributes
   * @param classAttributes the indexes of the attributes whose combination will
   * be used as class label
   * @return the uncertainty
   * @throws Exception if a set is empty, an attribute has no values, or the
   * combined number of values does not fit into a contingency table
   * @throws IllegalStateException if the evaluator has not been built
   */
  public double evaluateAttribute (int[] attributes, int[] classAttributes)
    throws Exception {
    if (attributes.length == 0) {
      throw new Exception("the parameter attributes[] is empty;SEQ:W-FS-Eval-SUAS-001");
    }
    if (classAttributes.length == 0) {
      throw new Exception("the parameter classAttributes[] is empty;SEQ:W-FS-Eval-SUAS-002");
    }
    ensureBuilt();

    // One extra row/column at the end collects the missing values.
    int ni = jointCardinality(attributes,
        "an attribute is empty;SEQ:W-FS-Eval-SUAS-003;") + 1;
    int nj = jointCardinality(classAttributes,
        "the a classAttribute is empty;SEQ:W-FS-Eval-SUAS-004;") + 1;
    double[][] counts = new double[ni][nj];

    // Fill the contingency table
    for (int k = 0; k < m_numInstances; k++) {
      Instance inst = m_trainInstances.instance(k);
      int row = jointIndex(inst, attributes, ni - 1);
      int col = jointIndex(inst, classAttributes, nj - 1);
      counts[row][col]++;
    }

    if (m_missing_merge) {
      distributeMissingCounts(counts);
    }
    return ContingencyTables.symmetricalUncertainty(counts);
  }

  /**
   * Fails fast with a descriptive message when an evaluation is requested
   * before buildEvaluator has supplied the training data.
   *
   * @throws IllegalStateException if the evaluator has not been built
   */
  private void ensureBuilt () {
    if (m_trainInstances == null) {
      throw new IllegalStateException(
          "Symmetrical Uncertainty evaluator has not been built");
    }
  }

  /**
   * Computes the number of value combinations of a set of attributes,
   * i.e. the product of their numbers of values.
   *
   * @param indices the indexes of the attributes in the set
   * @param emptyMessage prefix of the error raised for an attribute without
   * values; the 1-based position of that attribute is appended
   * @return the number of value combinations
   * @throws Exception if an attribute has no values or the product is too
   * large to be used as an array dimension
   */
  private int jointCardinality (int[] indices, String emptyMessage)
    throws Exception {
    // Accumulate in a long: an int product would silently wrap around.
    long cells = 1;
    for (int k = 0; k < indices.length; k++) {
      int numValues = m_trainInstances.attribute(indices[k]).numValues();
      if (numValues == 0) {
        throw new Exception(emptyMessage + (k + 1));
      }
      cells *= numValues;
      // ">=" leaves room for the extra "missing" slot added by the caller.
      if (cells >= Integer.MAX_VALUE) {
        throw new Exception("the combined number of attribute values is too "
            + "large for a contingency table");
      }
    }
    return (int) cells;
  }

  /**
   * Maps the values an instance takes on a set of attributes to a single
   * position in the contingency table (mixed-radix encoding, the last
   * attribute of the set being the least significant digit).
   *
   * @param inst the instance to encode
   * @param indices the indexes of the attributes in the set
   * @param missingIndex the position to use when any attribute is missing
   * @return the position of the instance along one table dimension
   */
  private int jointIndex (Instance inst, int[] indices, int missingIndex) {
    int index = 0;
    int radix = 1;
    for (int p = indices.length - 1; p >= 0; p--) {
      if (inst.isMissing(indices[p])) {
        return missingIndex;
      }
      index += (int) inst.value(indices[p]) * radix;
      radix *= m_trainInstances.attribute(indices[p]).numValues();
    }
    return index;
  }

  /**
   * Redistributes the counts held in the "missing" row and column (the last
   * ones) over the observed cells, in proportion to the observed
   * frequencies. The table is modified in place. Nothing is done when every
   * instance is missing along one of the dimensions, because there are no
   * observed frequencies to distribute by.
   *
   * @param counts the contingency table whose last row and last column
   * collect the missing values
   */
  private static void distributeMissingCounts (double[][] counts) {
    int missingRow = counts.length - 1;
    int missingCol = counts[0].length - 1;

    // Totals of the table as filled, before anything is moved.
    double[] rowTotals = new double[counts.length];
    double[] colTotals = new double[counts[0].length];
    double total = 0.0;
    for (int i = 0; i <= missingRow; i++) {
      for (int j = 0; j <= missingCol; j++) {
        rowTotals[i] += counts[i][j];
        colTotals[j] += counts[i][j];
        total += counts[i][j];
      }
    }

    // The table holds one count per instance, so "total" is the number of
    // instances; bail out when a whole dimension is missing.
    if (rowTotals[missingRow] >= total || colTotals[missingCol] >= total) {
      return;
    }

    double bothMissing = counts[missingRow][missingCol];
    double totalMissing =
      rowTotals[missingRow] + colTotals[missingCol] - bothMissing;
    boolean spreadBothMissing = (bothMissing > 0.0) && (totalMissing != total);

    // The last step weights by the cells as originally observed, so take a
    // snapshot before the first two steps start adding to them.
    double[][] observed = null;
    if (spreadBothMissing) {
      observed = new double[counts.length][];
      for (int i = 0; i <= missingRow; i++) {
        observed[i] = (double[]) counts[i].clone();
      }
    }

    // do the missing i's: spread each cell of the missing row over the
    // observed rows according to the row frequencies
    if (rowTotals[missingRow] > 0.0) {
      double observedRows = total - rowTotals[missingRow];
      for (int j = 0; j < missingCol; j++) {
        double pending = counts[missingRow][j];
        if (pending > 0.0) {
          for (int i = 0; i < missingRow; i++) {
            counts[i][j] += (rowTotals[i] / observedRows) * pending;
          }
          counts[missingRow][j] = 0.0;
        }
      }
    }

    // do the missing j's: spread each cell of the missing column over the
    // observed columns according to the column frequencies
    if (colTotals[missingCol] > 0.0) {
      double observedCols = total - colTotals[missingCol];
      for (int i = 0; i < missingRow; i++) {
        double pending = counts[i][missingCol];
        if (pending > 0.0) {
          for (int j = 0; j < missingCol; j++) {
            counts[i][j] += (colTotals[j] / observedCols) * pending;
          }
          counts[i][missingCol] = 0.0;
        }
      }
    }

    // do the both missing: spread over all observed cells according to the
    // originally observed joint frequencies
    if (spreadBothMissing) {
      double observedTotal = total - totalMissing;
      for (int i = 0; i < missingRow; i++) {
        for (int j = 0; j < missingCol; j++) {
          counts[i][j] += (observed[i][j] / observedTotal) * bothMissing;
        }
      }
      counts[missingRow][missingCol] = 0.0;
    }
  }

  /**
   * Return a description of the evaluator
   * @return description as a string
   */
  public String toString () {
    StringBuffer text = new StringBuffer();

    if (m_trainInstances == null) {
      text.append("\tSymmetrical Uncertainty evaluator has not been built");
    }
    else {
      text.append("\tSymmetrical Uncertainty Ranking Filter");
      if (!m_missing_merge) {
        text.append("\n\tMissing values treated as seperate");
      }
    }

    text.append("\n");
    return text.toString();
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 5511 $");
  }

  // ============
  // Test method.
  // ============
  /**
   * Main method for testing this class.
   *
   * @param argv should contain the following arguments:
   * -t training file
   */
  public static void main (String[] argv) {
    runEvaluator(new SymmetricalUncertAttributeSetEval(), argv);
  }
}