/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    MergeNominalValues.java
 *    Copyright (C) 2013 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.filters.supervised.attribute;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Vector;

import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.ContingencyTables;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.SpecialFunctions;
import weka.core.Statistics;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.filters.SimpleBatchFilter;
import weka.filters.SupervisedFilter;

/**
 <!-- globalinfo-start -->
 * Merges values of all nominal attributes among the specified attributes, excluding the class attribute, using the CHAID method, but without considering to re-split merged subsets. It implements Steps 1 and 2 described by Kass (1980), see<br/>
 * <br/>
 * Gordon V. Kass (1980). An Exploratory Technique for Investigating Large Quantities of Categorical Data. Applied Statistics. 29(2):119-127.<br/>
 * <br/>
 * Once attribute values have been merged, a chi-squared test using the Bonferroni correction is applied to check if the resulting attribute is a valid predictor, based on the Bonferroni multiplier in Equation 3.2 in Kass (1980). If an attribute does not pass this test, all remaining values (if any) are merged. Nevertheless, useless predictors can slip through without being fully merged, e.g. identifier attributes.<br/>
 * <br/>
 * The code applies the Yates correction when the chi-squared statistic is computed.<br/>
 * <br/>
 * Note that the algorithm is quadratic in the number of attribute values for an attribute.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -D
 *  Turns on output of debugging information.</pre>
 *
 * <pre> -L <double>
 *  The significance level (default: 0.05).
 * </pre>
 *
 * <pre> -R <range>
 *  Sets list of attributes to act on (or its inverse). 'first and 'last' are accepted as well.'
 *  E.g.: first-5,7,9,20-last
 *  (default: first-last)</pre>
 *
 * <pre> -V
 *  Invert matching sense (i.e. act on all attributes not specified in list)</pre>
 *
 * <pre> -O
 *  Use short identifiers for merged subsets.</pre>
 *
 <!-- options-end -->
 *
 * @author Eibe Frank
 * @version $Revision: 9853 $
 */
public class MergeNominalValues extends SimpleBatchFilter implements
  SupervisedFilter, WeightedInstancesHandler, TechnicalInformationHandler {

  /** for serialization */
  static final long serialVersionUID = 7447337831221353842L;

  /** The significance level used by the chi-squared tests. */
  protected double m_SigLevel = 0.05;

  /** Stores which attributes to operate on (or not, if inverted). */
  protected Range m_SelectCols = new Range("first-last");

  /** Stores the indexes of the selected attributes in order. */
  protected int[] m_SelectedAttributes;

  /** Indicators for which attributes need to be changed. */
  protected boolean[] m_AttToBeModified;

  /**
   * The indicators used to map the old values: for attribute {@code a},
   * {@code m_Indicators[a][oldValueIndex]} is the new value index.
   */
  protected int[][] m_Indicators;

  /** Whether to use short identifiers for merged values. */
  protected boolean m_UseShortIdentifiers = false;

  /**
   * Returns a string describing this filter.
   *
   * @return a description of the filter suitable for displaying in the
   *         explorer/experimenter gui
   */
  @Override
  public String globalInfo() {
    return "Merges values of all nominal attributes among the specified attributes, excluding "
      + "the class attribute, using the CHAID method, but without considering to re-split "
      + "merged subsets. It implements Steps 1 and 2 described by Kass (1980), see\n\n"
      + getTechnicalInformation().toString()
      + "\n\n"
      + "Once attribute values have been merged, a chi-squared test using the Bonferroni "
      + "correction is applied to check if the resulting attribute is a valid predictor, "
      + "based on the Bonferroni multiplier in Equation 3.2 in Kass (1980). If an attribute does "
      + "not pass this test, all remaining values (if any) are merged. Nevertheless, useless "
      + "predictors can slip through without being fully merged, e.g. identifier attributes.\n\n"
      + "The code applies the Yates correction when the chi-squared statistic is computed.\n\n"
      + "Note that the algorithm is quadratic in the number of attribute values for an attribute.";
  }

  /**
   * Returns an instance of a TechnicalInformation object, containing detailed
   * information about the technical background of this class, e.g., paper
   * reference or book this class is based on.
   *
   * @return the technical information about this class
   */
  @Override
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result = new TechnicalInformation(Type.ARTICLE);

    result.setValue(Field.AUTHOR, "Gordon V. Kass");
    result.setValue(Field.TITLE,
      "An Exploratory Technique for Investigating Large Quantities of Categorical Data");
    result.setValue(Field.JOURNAL, "Applied Statistics");
    result.setValue(Field.YEAR, "1980");
    result.setValue(Field.VOLUME, "29");
    result.setValue(Field.NUMBER, "2");
    result.setValue(Field.PAGES, "119-127");

    return result;
  }

  /**
   * Returns an enumeration describing the available options: those of the
   * superclass followed by -L, -R, -V and -O.
   *
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration listOptions() {
    Vector<Option> result = new Vector<Option>();

    Enumeration enm = super.listOptions();
    while (enm.hasMoreElements()) {
      result.addElement((Option) enm.nextElement());
    }

    // The option name is given without the leading dash, like the options
    // registered below.
    result.addElement(new Option("\tThe significance level (default: 0.05).\n",
      "L", 1, "-L <double>"));

    result.addElement(new Option(
      "\tSets list of attributes to act on (or its inverse). 'first and 'last' are accepted as well.'\n"
        + "\tE.g.: first-5,7,9,20-last\n" + "\t(default: first-last)", "R", 1,
      "-R <range>"));

    result.addElement(new Option(
      "\tInvert matching sense (i.e. act on all attributes not specified in list)",
      "V", 0, "-V"));

    result.addElement(new Option("\tUse short identifiers for merged subsets.",
      "O", 0, "-O"));

    return result.elements();
  }

  /**
   * Gets the current settings of the filter.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  @Override
  public String[] getOptions() {
    ArrayList<String> result = new ArrayList<String>();

    String[] options = super.getOptions();
    for (int i = 0; i < options.length; i++) {
      result.add(options[i]);
    }

    result.add("-L");
    result.add("" + getSignificanceLevel());

    // Only emit -R when a range has actually been set.
    if (!getAttributeIndices().equals("")) {
      result.add("-R");
      result.add(getAttributeIndices());
    }

    if (getInvertSelection()) {
      result.add("-V");
    }

    if (getUseShortIdentifiers()) {
      result.add("-O");
    }

    return result.toArray(new String[result.size()]);
  }

  /**
   * Parses a given list of options.
   * <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -D
   *  Turns on output of debugging information.</pre>
   *
   * <pre> -L <double>
   *  The significance level (default: 0.05).
   * </pre>
   *
   * <pre> -R <range>
   *  Sets list of attributes to act on (or its inverse). 'first and 'last' are accepted as well.'
   *  E.g.: first-5,7,9,20-last
   *  (default: first-last)</pre>
   *
   * <pre> -V
   *  Invert matching sense (i.e. act on all attributes not specified in list)</pre>
   *
   * <pre> -O
   *  Use short identifiers for merged subsets.</pre>
   *
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  @Override
  public void setOptions(String[] options) throws Exception {

    String significanceLevelString = Utils.getOption('L', options);
    if (significanceLevelString.length() != 0) {
      setSignificanceLevel(Double.parseDouble(significanceLevelString));
    } else {
      setSignificanceLevel(0.05);
    }

    String tmpStr = Utils.getOption('R', options);
    if (tmpStr.length() != 0) {
      setAttributeIndices(tmpStr);
    } else {
      setAttributeIndices("first-last");
    }

    setInvertSelection(Utils.getFlag('V', options));

    setUseShortIdentifiers(Utils.getFlag('O', options));

    // The superclass handles whatever options remain.
    super.setOptions(options);
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String significanceLevelTipText() {
    return "The significance level for the chi-squared test used to decide when to stop merging.";
  }

  /**
   * Gets the significance level.
   *
   * @return the significance level.
   */
  public double getSignificanceLevel() {
    return m_SigLevel;
  }

  /**
   * Sets the significance level.
   *
   * @param sF the significance level.
   */
  public void setSignificanceLevel(double sF) {
    m_SigLevel = sF;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String attributeIndicesTipText() {
    return "Specify range of attributes to act on (or its inverse)."
      + " This is a comma separated list of attribute indices, with"
      + " \"first\" and \"last\" valid values. Specify an inclusive"
      + " range with \"-\". E.g: \"first-3,5,6-10,last\".";
  }

  /**
   * Get the current range selection.
   *
   * @return a string containing a comma separated list of ranges
   */
  public String getAttributeIndices() {
    return m_SelectCols.getRanges();
  }

  /**
   * Set which attributes are to be acted on (or not, if invert is true)
   *
   * @param rangeList a string representing the list of attributes. Since the
   *          string will typically come from a user, attributes are indexed
   *          from 1. <br>
   *          eg: first-3,5,6-last
   */
  public void setAttributeIndices(String rangeList) {
    m_SelectCols.setRanges(rangeList);
  }

  /**
   * Set which attributes are to be acted on (or not, if invert is true)
   *
   * @param attributes an array containing indexes of attributes to select.
   *          Since the array will typically come from a program, attributes are
   *          indexed from 0.
   */
  public void setAttributeIndicesArray(int[] attributes) {
    setAttributeIndices(Range.indicesToRangeList(attributes));
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String invertSelectionTipText() {
    return "Determines whether selected attributes are to be acted "
      + "on or all other attributes are used instead.";
  }

  /**
   * Get whether the supplied attributes are to be acted on or all other
   * attributes.
   *
   * @return true if the selection is inverted, i.e. all attributes other than
   *         the supplied ones are acted on
   */
  public boolean getInvertSelection() {
    return m_SelectCols.getInvert();
  }

  /**
   * Set whether selected attributes should be acted on or all other attributes.
   *
   * @param invert the new invert setting
   */
  public void setInvertSelection(boolean invert) {
    m_SelectCols.setInvert(invert);
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String useShortIdentifiersTipText() {
    return "Whether to use short identifiers for the merged values.";
  }

  /**
   * Get whether short identifiers are to be output.
   *
   * @return true if short IDs are output
   */
  public boolean getUseShortIdentifiers() {
    return m_UseShortIdentifiers;
  }

  /**
   * Set whether to output short identifiers for merged values.
   *
   * @param b if true, short IDs are output
   */
  public void setUseShortIdentifiers(boolean b) {
    m_UseShortIdentifiers = b;
  }

  /**
   * We need access to the full input data in determineOutputFormat.
   *
   * @return always true
   */
  @Override
  public boolean allowAccessToFullInputFormat() {
    return true;
  }

  /**
   * Tells whether the attribute at the given index is one whose values may be
   * merged: it must be nominal and must not be the class attribute.
   *
   * @param data the dataset the attribute belongs to
   * @param attIndex the 0-based index of the attribute
   * @return true if the attribute is nominal and not the class attribute
   */
  private static boolean isMergeCandidate(Instances data, int attIndex) {
    return (attIndex != data.classIndex())
      && data.attribute(attIndex).isNominal();
  }

  /**
   * Determines the output format based on the input format and returns this.
   * <p/>
   * As a side effect this computes, for every selected nominal non-class
   * attribute, the weighted value/class contingency table from the data in
   * {@code inputFormat}, merges values via {@link #mergeValues(double[][])},
   * and stores the results in {@link #m_SelectedAttributes},
   * {@link #m_Indicators} and {@link #m_AttToBeModified}.
   *
   * @param inputFormat the input format (including the data) to base the
   *          output format on
   * @return the output format
   */
  @Override
  protected Instances determineOutputFormat(Instances inputFormat) {

    // Set the upper limit of the range
    m_SelectCols.setUpper(inputFormat.numAttributes() - 1);

    // Get the selected attributes
    m_SelectedAttributes = m_SelectCols.getSelection();

    // Allocate arrays to store frequencies; entries for attributes that are
    // not merge candidates stay null
    double[][][] freqs = new double[inputFormat.numAttributes()][][];
    for (int i = 0; i < m_SelectedAttributes.length; i++) {
      int current = m_SelectedAttributes[i];
      if (isMergeCandidate(inputFormat, current)) {
        freqs[current] = new double[inputFormat.attribute(current).numValues()][inputFormat
          .numClasses()];
      }
    }

    // Go through all the instances and compute (weighted) frequencies;
    // instances with a missing attribute value or missing class are skipped
    for (Instance inst : inputFormat) {
      for (int i = 0; i < m_SelectedAttributes.length; i++) {
        int current = m_SelectedAttributes[i];
        if (isMergeCandidate(inputFormat, current)) {
          if (!inst.isMissing(current) && !inst.classIsMissing()) {
            freqs[current][(int) inst.value(current)][(int) inst.classValue()] += inst
              .weight();
          }
        }
      }
    }

    // For each attribute in turn merge values
    m_AttToBeModified = new boolean[inputFormat.numAttributes()];
    m_Indicators = new int[inputFormat.numAttributes()][];
    for (int i = 0; i < m_SelectedAttributes.length; i++) {
      int current = m_SelectedAttributes[i];
      if (isMergeCandidate(inputFormat, current)) {

        if (m_Debug) {
          System.err.println(inputFormat.attribute(current));
        }

        // Compute subset indicators
        m_Indicators[current] = mergeValues(freqs[current]);

        if (m_Debug) {
          for (int j = 0; j < m_Indicators[current].length; j++) {
            System.err.print(" - " + m_Indicators[current][j] + " - ");
          }
          System.err.println();
        }

        // Does attribute need to be modified? It does as soon as one value is
        // no longer mapped to itself.
        for (int k = 0; k < m_Indicators[current].length; k++) {
          if (m_Indicators[current][k] != k) {
            m_AttToBeModified[current] = true;
            break;
          }
        }
      }
    }

    // Create new header
    ArrayList<Attribute> atts = new ArrayList<Attribute>();
    for (int i = 0; i < inputFormat.numAttributes(); i++) {
      Attribute att = inputFormat.attribute(i);
      if (m_AttToBeModified[i]) {
        int[] indicators = m_Indicators[i];

        // Compute number of new values (largest subset index + 1)
        int numValues = 0;
        for (int j = 0; j < indicators.length; j++) {
          if (indicators[j] + 1 > numValues) {
            numValues = indicators[j] + 1;
          }
        }

        // Establish new values
        ArrayList<StringBuilder> vals = new ArrayList<StringBuilder>(numValues);
        for (int j = 0; j < numValues; j++) {
          vals.add(null);
        }
        for (int j = 0; j < indicators.length; j++) {
          int index = indicators[j];

          // Do we already have a value at the given index?
          StringBuilder val = vals.get(index);
          if (val == null) {
            if (m_UseShortIdentifiers) {
              // Short identifier is the 1-based subset index
              vals.set(index, new StringBuilder("" + (index + 1)));
            } else {
              vals.set(index, new StringBuilder(att.value(j)));
            }
          } else {
            if (!m_UseShortIdentifiers) {
              // Long identifier lists all original values in the subset
              val.append("_or_").append(att.value(j));
            }
          }
        }
        ArrayList<String> valsAsStrings = new ArrayList<String>(vals.size());
        for (StringBuilder val : vals) {
          valsAsStrings.add(val.toString());
        }
        atts.add(new Attribute(att.name() + "_merged_values", valsAsStrings));
      } else {
        atts.add((Attribute) att.copy());
      }
    }

    // Return modified header
    Instances data = new Instances(inputFormat.relationName(), atts, 0);
    data.setClassIndex(inputFormat.classIndex());
    return data;
  }

  /**
   * Compute factor for Bonferroni correction. This is based on Equation 3.2 in
   * Kass (1980). The value computed is the alternating sum, for i = 0..r-1, of
   * (-1)^i * (r - i)^c / (i! * (r - i)!), evaluated via logarithms.
   *
   * @param c first argument of the multiplier; called from mergeValues with
   *          the original number of attribute values
   * @param r second argument of the multiplier; called from mergeValues with
   *          the number of subsets remaining after merging
   * @return the Bonferroni multiplier
   */
  protected double BFfactor(int c, int r) {

    double sum = 0;
    double multiplier = 1.0;
    for (int i = 0; i < r; i++) {
      sum += multiplier
        * Math.exp(c * Math.log(r - i)
          - (SpecialFunctions.lnFactorial(i) + SpecialFunctions
            .lnFactorial(r - i)));
      multiplier *= -1.0;
    }
    return sum;
  }

  /**
   * Merges values and returns list of subset indicators for the values.
   * <p/>
   * Repeatedly merges the two rows of the contingency table with the smallest
   * (Yates-corrected) chi-squared statistic until the smallest pairwise
   * difference is significant at {@link #m_SigLevel} or only one row is left.
   * When merging stops because of a significant difference, the reduced table
   * as a whole is tested using the Bonferroni-adjusted p-value; if that is not
   * significant, all values are put into a single subset.
   * <p/>
   * Note that rows of {@code counts} are modified in place while merging.
   *
   * @param counts the contingency table, one row per attribute value and one
   *          column per class value
   * @return for each original value (row) the index of the subset it ends up in
   */
  protected int[] mergeValues(double[][] counts) {

    int[] indicators = new int[counts.length];

    // Initially, each value is in its own subset
    for (int i = 0; i < indicators.length; i++) {
      indicators[i] = i;
    }

    // Can't merge further if only one subset remains
    while (counts.length > 1) {

      // Find two rows that differ the least according to chi-squared statistic
      double[][] reducedCounts = new double[2][];
      double minVal = Double.MAX_VALUE;
      int toMergeOne = -1;
      int toMergeTwo = -1;
      for (int i = 0; i < counts.length; i++) {
        reducedCounts[0] = counts[i];
        for (int j = i + 1; j < counts.length; j++) {
          reducedCounts[1] = counts[j];
          double val = ContingencyTables.chiVal(reducedCounts, true);
          if (val < minVal) {
            minVal = val;
            toMergeOne = i;
            toMergeTwo = j;
          }
        }
      }

      // Is least significant difference still significant?
      if (Statistics.chiSquaredProbability(minVal,
        reducedCounts[0].length - 1) <= m_SigLevel) {

        // Check whether overall split is insignificant using Bonferroni
        // correction
        double val = ContingencyTables.chiVal(counts, true);
        int df = (counts[0].length - 1) * (counts.length - 1);
        double originalSig = Statistics.chiSquaredProbability(val, df);
        double adjustedSig = originalSig
          * BFfactor(indicators.length, counts.length);
        if (m_Debug) {
          System.err.println("Original p-value: " + originalSig
            + "\tAdjusted p-value: " + adjustedSig);
        }
        // Written as a negation so that a NaN p-value also counts as
        // "not significant"
        if (!(adjustedSig <= m_SigLevel)) {

          // Not significant: merge all values
          for (int i = 0; i < indicators.length; i++) {
            indicators[i] = 0;
          }
        }
        break;
      }

      // Reduce table by merging
      double[][] newCounts = new double[counts.length - 1][];
      for (int i = 0; i < counts.length; i++) {
        if (i < toMergeTwo) {

          // Can simply copy reference
          newCounts[i] = counts[i];
        } else if (i == toMergeTwo) {

          // Need to add counts; toMergeOne < toMergeTwo, so the target row has
          // already been copied (by reference) above
          for (int k = 0; k < counts[i].length; k++) {
            newCounts[toMergeOne][k] += counts[i][k];
          }
        } else {

          // Need to shift row
          newCounts[i - 1] = counts[i];
        }
      }

      // Update membership indicators
      for (int i = 0; i < indicators.length; i++) {

        // All row indices < toMergeTwo remain unmodified
        if (indicators[i] >= toMergeTwo) {
          if (indicators[i] == toMergeTwo) {

            // Need to change index for *all* indicator fields corresponding to
            // merged row
            indicators[i] = toMergeOne;
          } else {

            // We have one row less because toMergeTwo is gone
            indicators[i]--;
          }
        }
      }

      // Replace matrix
      counts = newCounts;
    }
    return indicators;
  }

  /**
   * Returns the Capabilities of this filter. All attribute types are accepted
   * (only nominal ones are merged), but the class must be nominal because the
   * class value is used as a column index into the contingency tables built
   * in determineOutputFormat.
   *
   * @return the capabilities of this object
   * @see Capabilities
   */
  @Override
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();

    // attributes
    result.enableAllAttributes();
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);

    return result;
  }

  /**
   * Processes the given data: values of modified attributes are replaced by
   * the index of the merged subset they belong to; all other values, missing
   * values and instance weights are carried over unchanged.
   *
   * @param instances the data to process
   * @return the modified data
   * @throws Exception in case the processing goes wrong
   */
  @Override
  public Instances process(Instances instances) throws Exception {

    // Generate the output and return it
    Instances result = new Instances(getOutputFormat(),
      instances.numInstances());
    for (int i = 0; i < instances.numInstances(); i++) {
      Instance inst = instances.instance(i);
      double[] newData = new double[instances.numAttributes()];
      for (int j = 0; j < instances.numAttributes(); j++) {
        if (m_AttToBeModified[j] && !inst.isMissing(j)) {
          newData[j] = m_Indicators[j][(int) inst.value(j)];
        } else {
          newData[j] = inst.value(j);
        }
      }

      // Keep the weight of the original instance
      DenseInstance instNew = new DenseInstance(inst.weight(), newData);
      instNew.setDataset(result);

      // copy possible strings, relational values...
      copyValues(instNew, false, inst.dataset(), getOutputFormat());

      // Add instance to output
      result.add(instNew);
    }
    return result;
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 9853 $");
  }

  /**
   * runs the filter with the given arguments
   *
   * @param args the commandline arguments
   */
  public static void main(String[] args) {
    runFilter(new MergeNominalValues(), args);
  }
}