/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* MergeNominalValues.java
* Copyright (C) 2013 University of Waikato, Hamilton, New Zealand
*
*/
package weka.filters.supervised.attribute;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.ContingencyTables;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.SpecialFunctions;
import weka.core.Statistics;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.filters.SimpleBatchFilter;
import weka.filters.SupervisedFilter;
/**
<!-- globalinfo-start -->
* Merges values of all nominal attributes among the specified attributes, excluding the class attribute, using the CHAID method, but without considering to re-split merged subsets. It implements Steps 1 and 2 described by Kass (1980), see<br/>
* <br/>
* Gordon V. Kass (1980). An Exploratory Technique for Investigating Large Quantities of Categorical Data. Applied Statistics. 29(2):119-127.<br/>
* <br/>
* Once attribute values have been merged, a chi-squared test using the Bonferroni correction is applied to check if the resulting attribute is a valid predictor, based on the Bonferroni multiplier in Equation 3.2 in Kass (1980). If an attribute does not pass this test, all remaining values (if any) are merged. Nevertheless, useless predictors can slip through without being fully merged, e.g. identifier attributes.<br/>
* <br/>
* The code applies the Yates correction when the chi-squared statistic is computed.<br/>
* <br/>
* Note that the algorithm is quadratic in the number of attribute values for an attribute.
* <p/>
<!-- globalinfo-end -->
*
<!-- options-start -->
* Valid options are: <p/>
*
* <pre> -D
* Turns on output of debugging information.</pre>
*
* <pre> -L <double>
* The significance level (default: 0.05).
* </pre>
*
* <pre> -R <range>
* Sets list of attributes to act on (or its inverse). 'first and 'last' are accepted as well.'
* E.g.: first-5,7,9,20-last
* (default: first-last)</pre>
*
* <pre> -V
* Invert matching sense (i.e. act on all attributes not specified in list)</pre>
*
* <pre> -O
* Use short identifiers for merged subsets.</pre>
*
<!-- options-end -->
*
* @author Eibe Frank
* @version $Revision: 9853 $
*/
public class MergeNominalValues extends SimpleBatchFilter implements
SupervisedFilter, WeightedInstancesHandler, TechnicalInformationHandler {
/** Version identifier used for serialization. */
static final long serialVersionUID = 7447337831221353842L;
/**
 * The significance level used by the chi-squared tests in
 * {@link #mergeValues(double[][])} (default: 0.05).
 */
protected double m_SigLevel = 0.05;
/** Stores which attributes to operate on (or not, if the range is inverted). */
protected Range m_SelectCols = new Range("first-last");
/**
 * Stores the indexes of the selected attributes in order; filled in by
 * {@link #determineOutputFormat(Instances)}.
 */
protected int[] m_SelectedAttributes;
/**
 * Indicators for which attributes need to be changed, indexed by attribute
 * index. An entry is true if at least one value of the attribute was merged.
 */
protected boolean[] m_AttToBeModified;
/**
 * The indicators used to map the old values: for each processed attribute,
 * entry [att][oldValueIndex] holds the index of the merged value. Entries for
 * attributes that were not processed remain null.
 */
protected int[][] m_Indicators;
/** Whether to use short identifiers instead of concatenated value names. */
protected boolean m_UseShortIdentifiers = false;
/**
 * Describes what this filter does. The text embeds the formatted reference
 * obtained from {@link #getTechnicalInformation()}.
 *
 * @return a description of the filter suitable for displaying in the
 *         explorer/experimenter gui
 */
@Override
public String globalInfo() {
  StringBuilder text = new StringBuilder();

  // What the filter does and where the method comes from
  text.append("Merges values of all nominal attributes among the specified attributes, excluding ");
  text.append("the class attribute, using the CHAID method, but without considering to re-split ");
  text.append("merged subsets. It implements Steps 1 and 2 described by Kass (1980), see\n\n");
  text.append(getTechnicalInformation().toString());
  text.append("\n\n");

  // The final significance check
  text.append("Once attribute values have been merged, a chi-squared test using the Bonferroni ");
  text.append("correction is applied to check if the resulting attribute is a valid predictor, ");
  text.append("based on the Bonferroni multiplier in Equation 3.2 in Kass (1980). If an attribute does ");
  text.append("not pass this test, all remaining values (if any) are merged. Nevertheless, useless ");
  text.append("predictors can slip through without being fully merged, e.g. identifier attributes.\n\n");

  // Implementation notes
  text.append("The code applies the Yates correction when the chi-squared statistic is computed.\n\n");
  text.append("Note that the algorithm is quadratic in the number of attribute values for an attribute.");

  return text.toString();
}
/**
 * Builds the bibliographic reference for the paper this filter is based on:
 * the 1980 article by Gordon V. Kass in Applied Statistics.
 *
 * @return the technical information about this class
 */
@Override
public TechnicalInformation getTechnicalInformation() {
  final TechnicalInformation article = new TechnicalInformation(Type.ARTICLE);

  article.setValue(Field.AUTHOR, "Gordon V. Kass");
  article.setValue(Field.TITLE,
    "An Exploratory Technique for Investigating Large Quantities of Categorical Data");
  article.setValue(Field.JOURNAL, "Applied Statistics");
  article.setValue(Field.YEAR, "1980");
  article.setValue(Field.VOLUME, "29");
  article.setValue(Field.NUMBER, "2");
  article.setValue(Field.PAGES, "119-127");

  return article;
}
/**
 * Returns an enumeration describing the available options: the options of
 * the superclass followed by -L (significance level), -R (attribute range),
 * -V (invert the range) and -O (short identifiers).
 *
 * @return an enumeration of all the available options.
 */
@Override
public Enumeration listOptions() {
  Vector<Option> result = new Vector<Option>();

  // Start with whatever the superclass offers
  Enumeration enm = super.listOptions();
  while (enm.hasMoreElements()) {
    result.addElement((Option) enm.nextElement());
  }

  // The option name is given without the leading dash, consistent with the
  // other options below; only the synopsis carries the dash.
  result.addElement(new Option("\tThe significance level (default: 0.05).\n",
    "L", 1, "-L <double>"));

  result.addElement(new Option(
    "\tSets list of attributes to act on (or its inverse). 'first and 'last' are accepted as well.'\n"
      + "\tE.g.: first-5,7,9,20-last\n" + "\t(default: first-last)",
    "R", 1, "-R <range>"));

  result.addElement(new Option(
    "\tInvert matching sense (i.e. act on all attributes not specified in list)",
    "V", 0, "-V"));

  result.addElement(new Option("\tUse short identifiers for merged subsets.",
    "O", 0, "-O"));

  return result.elements();
}
/**
 * Gets the current settings of the filter: the options of the superclass
 * followed by the significance level, the attribute range (only if one is
 * set) and the flags for inverting the selection and for short identifiers.
 *
 * @return an array of strings suitable for passing to setOptions
 */
@Override
public String[] getOptions() {
  Vector<String> result = new Vector<String>();

  String[] options = super.getOptions();
  for (int i = 0; i < options.length; i++) {
    result.add(options[i]);
  }

  result.add("-L");
  result.add("" + getSignificanceLevel());

  // Only emit -R when a range is actually set. (A stray semicolon after this
  // condition previously turned the guard into a no-op, so "-R" was always
  // emitted, even with an empty argument.)
  String attributeIndices = getAttributeIndices();
  if (!attributeIndices.equals("")) {
    result.add("-R");
    result.add(attributeIndices);
  }

  if (getInvertSelection()) {
    result.add("-V");
  }

  if (getUseShortIdentifiers()) {
    result.add("-O");
  }

  return result.toArray(new String[result.size()]);
}
/**
 * Configures the filter from the given options. Options that are not
 * supplied are reset to their defaults (significance level 0.05, attribute
 * range first-last, flags off). The remaining options are handed to the
 * superclass afterwards.
 * <p/>
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -D
 * Turns on output of debugging information.</pre>
 *
 * <pre> -L <double>
 * The significance level (default: 0.05).
 * </pre>
 *
 * <pre> -R <range>
 * Sets list of attributes to act on (or its inverse). 'first and 'last' are accepted as well.'
 * E.g.: first-5,7,9,20-last
 * (default: first-last)</pre>
 *
 * <pre> -V
 * Invert matching sense (i.e. act on all attributes not specified in list)</pre>
 *
 * <pre> -O
 * Use short identifiers for merged subsets.</pre>
 *
 <!-- options-end -->
 *
 * @param options the list of options as an array of strings
 * @throws Exception if an option is not supported
 */
@Override
public void setOptions(String[] options) throws Exception {
  // -L <double>: significance level
  String sigLevel = Utils.getOption('L', options);
  setSignificanceLevel(sigLevel.length() == 0 ? 0.05 : Double.parseDouble(sigLevel));

  // -R <range>: attributes to act on
  String range = Utils.getOption('R', options);
  setAttributeIndices(range.length() == 0 ? "first-last" : range);

  // -V and -O: simple flags
  boolean invert = Utils.getFlag('V', options);
  setInvertSelection(invert);
  boolean shortIds = Utils.getFlag('O', options);
  setUseShortIdentifiers(shortIds);

  super.setOptions(options);
}
/**
 * Provides the tip text for the significanceLevel property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String significanceLevelTipText() {
  final String tip =
    "The significance level for the chi-squared test used to decide when to stop merging.";
  return tip;
}
/**
 * Returns the significance level currently configured for this filter.
 *
 * @return the significance level as a double
 */
public double getSignificanceLevel() {
  return this.m_SigLevel;
}
/**
 * Stores a new significance level. The value is stored as given, without
 * any range check.
 *
 * @param sF the significance level as a double
 */
public void setSignificanceLevel(double sF) {
  this.m_SigLevel = sF;
}
/**
 * Provides the tip text for the attributeIndices property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String attributeIndicesTipText() {
  final String tip = "Specify range of attributes to act on (or its inverse)."
    + " This is a comma separated list of attribute indices, with"
    + " \"first\" and \"last\" valid values. Specify an inclusive"
    + " range with \"-\". E.g: \"first-3,5,6-10,last\".";
  return tip;
}
/**
 * Returns the attribute range this filter is configured with, as reported
 * by the underlying range object.
 *
 * @return a string containing a comma separated list of ranges
 */
public String getAttributeIndices() {
  final String ranges = m_SelectCols.getRanges();
  return ranges;
}
/**
 * Defines the attributes to act on (or to leave alone, if the selection is
 * inverted) by passing the range list on to the underlying range object.
 *
 * @param rangeList a string representing the list of attributes. Since the
 *          string will typically come from a user, attributes are indexed
 *          from 1. <br>
 *          eg: first-3,5,6-last
 */
public void setAttributeIndices(String rangeList) {
  this.m_SelectCols.setRanges(rangeList);
}
/**
 * Defines the attributes to act on (or to leave alone, if the selection is
 * inverted) from an array of indexes. The array is converted to a range
 * list and passed to {@link #setAttributeIndices(String)}.
 *
 * @param attributes an array containing indexes of attributes to select.
 *          Since the array will typically come from a program, attributes are
 *          indexed from 0.
 */
public void setAttributeIndicesArray(int[] attributes) {
  String rangeList = Range.indicesToRangeList(attributes);
  setAttributeIndices(rangeList);
}
/**
 * Provides the tip text for the invertSelection property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String invertSelectionTipText() {
  final String tip = "Determines whether selected attributes are to be acted "
    + "on or all other attributes are used instead.";
  return tip;
}
/**
 * Reports whether the attribute selection is inverted, i.e. whether the
 * filter acts on all attributes other than the ones in the range.
 *
 * @return true if the range selection is inverted
 */
public boolean getInvertSelection() {
  final boolean inverted = m_SelectCols.getInvert();
  return inverted;
}
/**
 * Chooses between acting on the selected attributes and acting on all other
 * attributes, by setting the invert flag of the underlying range object.
 *
 * @param invert the new invert setting
 */
public void setInvertSelection(boolean invert) {
  this.m_SelectCols.setInvert(invert);
}
/**
 * Provides the tip text for the useShortIdentifiers property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String useShortIdentifiersTipText() {
  final String tip = "Whether to use short identifiers for the merged values.";
  return tip;
}
/**
 * Reports whether merged values are given short identifiers rather than
 * names built from the original values.
 *
 * @return true if short IDs are output
 */
public boolean getUseShortIdentifiers() {
  return this.m_UseShortIdentifiers;
}
/**
 * Chooses whether merged values are given short identifiers rather than
 * names built from the original values.
 *
 * @param b if true, short IDs are output
 */
public void setUseShortIdentifiers(boolean b) {
  this.m_UseShortIdentifiers = b;
}
/**
 * Signals that determineOutputFormat needs the full input data, not just
 * the header, because the merged values are derived from value/class
 * frequencies counted over the instances.
 *
 * @return always true
 */
@Override
public boolean allowAccessToFullInputFormat() {
  final boolean needsFullInput = true;
  return needsFullInput;
}
/**
 * Determines the output format based on the input data and returns it.
 * <p/>
 * For every selected nominal attribute other than the class, the weighted
 * value/class frequencies are counted over the given instances and passed
 * to {@link #mergeValues(double[][])}. Attributes for which at least one
 * value was merged are replaced in the header by a new nominal attribute
 * named <code>&lt;name&gt;_merged_values</code>; all other attributes are
 * copied. As a side effect this fills m_SelectedAttributes,
 * m_AttToBeModified and m_Indicators.
 *
 * @param inputFormat the input data to base the output format on
 * @return the output format (a header without instances)
 */
@Override
protected Instances determineOutputFormat(Instances inputFormat) {
  final int numAtts = inputFormat.numAttributes();
  final int classIdx = inputFormat.classIndex();

  // Resolve the configured range against the actual number of attributes
  m_SelectCols.setUpper(numAtts - 1);
  m_SelectedAttributes = m_SelectCols.getSelection();

  // One frequency table (attribute value x class value) per attribute that
  // is selected, nominal and not the class; all other entries stay null
  double[][][] tables = new double[numAtts][][];
  for (int attIndex : m_SelectedAttributes) {
    Attribute candidate = inputFormat.attribute(attIndex);
    if (attIndex != classIdx && candidate.isNominal()) {
      tables[attIndex] = new double[candidate.numValues()][inputFormat.numClasses()];
    }
  }

  // Accumulate instance weights; instances with a missing attribute value or
  // a missing class value do not contribute to that attribute's table
  for (int n = 0; n < inputFormat.numInstances(); n++) {
    Instance row = inputFormat.instance(n);
    for (int attIndex : m_SelectedAttributes) {
      if (tables[attIndex] == null) {
        continue;
      }
      if (row.isMissing(attIndex) || row.classIsMissing()) {
        continue;
      }
      tables[attIndex][(int) row.value(attIndex)][(int) row.classValue()] += row.weight();
    }
  }

  // Merge the values of each eligible attribute in turn
  m_AttToBeModified = new boolean[numAtts];
  m_Indicators = new int[numAtts][];
  for (int attIndex : m_SelectedAttributes) {
    if (tables[attIndex] == null) {
      continue;
    }
    if (m_Debug) {
      System.err.println(inputFormat.attribute(attIndex));
    }

    int[] subsetOf = mergeValues(tables[attIndex]);
    m_Indicators[attIndex] = subsetOf;

    if (m_Debug) {
      for (int subset : subsetOf) {
        System.err.print(" - " + subset + " - ");
      }
      System.err.println();
    }

    // The attribute only changes if some value no longer maps to itself
    for (int k = 0; k < subsetOf.length; k++) {
      if (subsetOf[k] != k) {
        m_AttToBeModified[attIndex] = true;
        break;
      }
    }
  }

  // Assemble the new header
  ArrayList<Attribute> header = new ArrayList<Attribute>();
  for (int a = 0; a < numAtts; a++) {
    Attribute original = inputFormat.attribute(a);
    if (!m_AttToBeModified[a]) {
      header.add((Attribute) original.copy());
      continue;
    }

    int[] subsetOf = m_Indicators[a];

    // Number of merged values = largest subset index + 1
    int numSubsets = 0;
    for (int subset : subsetOf) {
      numSubsets = Math.max(numSubsets, subset + 1);
    }

    // Build one label per subset: either its 1-based number, or the names of
    // its member values joined by "_or_"
    StringBuilder[] labels = new StringBuilder[numSubsets];
    for (int j = 0; j < subsetOf.length; j++) {
      int subset = subsetOf[j];
      StringBuilder label = labels[subset];
      if (label == null) {
        labels[subset] = new StringBuilder(
          m_UseShortIdentifiers ? "" + (subset + 1) : original.value(j));
      } else if (!m_UseShortIdentifiers) {
        label.append("_or_").append(original.value(j));
      }
    }

    ArrayList<String> mergedValues = new ArrayList<String>(labels.length);
    for (StringBuilder label : labels) {
      mergedValues.add(label.toString());
    }
    header.add(new Attribute(original.name() + "_merged_values", mergedValues));
  }

  // Return the modified header with the class index carried over
  Instances outputFormat = new Instances(inputFormat.relationName(), header, 0);
  outputFormat.setClassIndex(classIdx);
  return outputFormat;
}
/**
 * Computes the multiplier for the Bonferroni correction, based on Equation
 * 3.2 in Kass (1980): an alternating sum over i = 0..r-1 of
 * (r-i)^c / (i! (r-i)!). Each term is evaluated in log space and then
 * exponentiated.
 *
 * @param c the exponent of the formula; mergeValues passes the original
 *          number of attribute values
 * @param r the number of terms; mergeValues passes the number of subsets
 *          remaining after merging
 * @return the Bonferroni multiplier
 */
protected double BFfactor(int c, int r) {
  double total = 0;
  for (int i = 0; i < r; i++) {
    double logTerm = c * Math.log(r - i)
      - (SpecialFunctions.lnFactorial(i) + SpecialFunctions.lnFactorial(r - i));
    // Terms alternate in sign, starting with a positive one
    double sign = (i % 2 == 0) ? 1.0 : -1.0;
    total += sign * Math.exp(logTerm);
  }
  return total;
}
/**
 * Merges values and returns the list of subset indicators for the values.
 * <p/>
 * Starting with every value in its own subset, the two rows with the
 * smallest chi-squared statistic are merged repeatedly for as long as that
 * smallest difference is not significant at the configured level. Once it is
 * significant, the whole reduced table is tested using the Bonferroni
 * multiplier from {@link #BFfactor(int, int)}; if that test fails, all values
 * are put into a single subset. If merging continues until one row is left,
 * all values end up in subset 0.
 * <p/>
 * The rows of the given table are not modified; merged rows are built as
 * copies.
 *
 * @param counts the frequency table, one row per attribute value and one
 *          column per class value
 * @return for each original value index, the index of the subset it belongs
 *         to after merging
 */
protected int[] mergeValues(double[][] counts) {

  int[] indicators = new int[counts.length];

  // Initially, each value is in its own subset
  for (int i = 0; i < indicators.length; i++) {
    indicators[i] = i;
  }

  // Can't merge further if only one subset remains
  while (counts.length > 1) {

    // Find two rows that differ the least according to chi-squared statistic
    double[][] reducedCounts = new double[2][];
    double minVal = Double.MAX_VALUE;
    int toMergeOne = -1;
    int toMergeTwo = -1;
    for (int i = 0; i < counts.length; i++) {
      reducedCounts[0] = counts[i];
      for (int j = i + 1; j < counts.length; j++) {
        reducedCounts[1] = counts[j];
        double val = ContingencyTables.chiVal(reducedCounts, true);
        if (val < minVal) {
          minVal = val;
          toMergeOne = i;
          toMergeTwo = j;
        }
      }
    }

    // Is least significant difference still significant?
    // (degrees of freedom for a two-row table: number of columns - 1)
    if (Statistics.chiSquaredProbability(minVal, reducedCounts[0].length - 1) <= m_SigLevel) {

      // Check whether overall split is insignificant using Bonferroni
      // correction
      double val = ContingencyTables.chiVal(counts, true);
      int df = (counts[0].length - 1) * (counts.length - 1);
      double originalSig = Statistics.chiSquaredProbability(val, df);
      double adjustedSig = originalSig
        * BFfactor(indicators.length, counts.length);
      if (m_Debug) {
        System.err.println("Original p-value: " + originalSig
          + "\tAdjusted p-value: " + adjustedSig);
      }
      // Written as a negation so that a NaN p-value also counts as
      // "not significant"
      if (!(adjustedSig <= m_SigLevel)) {

        // Not significant: merge all values
        for (int i = 0; i < indicators.length; i++) {
          indicators[i] = 0;
        }
      }
      break;
    }

    // Reduce table by merging: drop row toMergeTwo, shifting later rows up.
    // toMergeOne < toMergeTwo always holds, so row toMergeOne keeps its index.
    double[][] newCounts = new double[counts.length - 1][];
    for (int i = 0, target = 0; i < counts.length; i++) {
      if (i != toMergeTwo) {
        newCounts[target++] = counts[i];
      }
    }

    // Build the merged row as a copy, so that the rows of the table passed in
    // by the caller are never modified through a shared reference
    double[] mergedRow = counts[toMergeOne].clone();
    for (int k = 0; k < counts[toMergeTwo].length; k++) {
      mergedRow[k] += counts[toMergeTwo][k];
    }
    newCounts[toMergeOne] = mergedRow;

    // Update membership indicators
    for (int i = 0; i < indicators.length; i++) {

      // All row indices < toMergeTwo remain unmodified
      if (indicators[i] >= toMergeTwo) {
        if (indicators[i] == toMergeTwo) {

          // Need to change index for *all* indicator fields corresponding to
          // merged row
          indicators[i] = toMergeOne;
        } else {

          // We have one row less because toMergeTwo is gone
          indicators[i]--;
        }
      }
    }

    // Replace matrix
    counts = newCounts;
  }
  return indicators;
}
/**
 * Returns the Capabilities of this filter: all attribute types (with
 * missing values) and a nominal class (with missing class values).
 * <p/>
 * The class must be nominal because the frequency tables built in
 * determineOutputFormat have one column per class value and are indexed by
 * the integer class value of each instance.
 *
 * @return the capabilities of this object
 * @see Capabilities
 */
@Override
public Capabilities getCapabilities() {
  Capabilities result = super.getCapabilities();
  result.disableAll();

  // attributes
  result.enableAllAttributes();
  result.enable(Capability.MISSING_VALUES);

  // class: only a nominal class fits the value-by-class frequency tables
  result.enable(Capability.NOMINAL_CLASS);
  result.enable(Capability.MISSING_CLASS_VALUES);

  return result;
}
/**
 * Processes the given data: values of attributes marked in
 * m_AttToBeModified are replaced by the index of their merged value taken
 * from m_Indicators; missing values and all other attributes are passed
 * through unchanged. Each output instance keeps the weight of its input
 * instance.
 *
 * @param instances the data to process
 * @return the modified data
 * @throws Exception in case the processing goes wrong
 */
@Override
public Instances process(Instances instances) throws Exception {

  // Generate the output and return it
  Instances result = new Instances(getOutputFormat(),
    instances.numInstances());
  for (int i = 0; i < instances.numInstances(); i++) {
    Instance inst = instances.instance(i);
    double[] newData = new double[instances.numAttributes()];
    for (int j = 0; j < instances.numAttributes(); j++) {
      if (m_AttToBeModified[j] && !inst.isMissing(j)) {
        // Map the old value index to the index of its merged value
        newData[j] = m_Indicators[j][(int) inst.value(j)];
      } else {
        newData[j] = inst.value(j);
      }
    }

    // Carry over the instance weight instead of resetting it to 1.0
    DenseInstance instNew = new DenseInstance(inst.weight(), newData);
    instNew.setDataset(result);

    // copy possible strings, relational values...
    copyValues(instNew, false, inst.dataset(), getOutputFormat());

    // Add instance to output
    result.add(instNew);
  }
  return result;
}
/**
 * Returns the revision of this class, extracted from the version-control
 * keyword string.
 *
 * @return the revision
 */
@Override
public String getRevision() {
  final String keyword = "$Revision: 9853 $";
  return RevisionUtils.extract(keyword);
}
/**
 * Command-line entry point: creates a new filter instance and hands it,
 * together with the arguments, to runFilter.
 *
 * @param args the commandline arguments
 */
public static void main(String[] args) {
  MergeNominalValues filter = new MergeNominalValues();
  runFilter(filter, args);
}
}