/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * PartitionedMultiFilter.java * Copyright (C) 2006-2012 University of Waikato, Hamilton, New Zealand * */ package weka.filters.unsupervised.attribute; import java.util.ArrayList; import java.util.Enumeration; import java.util.Vector; import weka.core.Attribute; import weka.core.DenseInstance; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.OptionHandler; import weka.core.Range; import weka.core.RevisionUtils; import weka.core.SparseInstance; import weka.core.Utils; import weka.filters.AllFilter; import weka.filters.Filter; import weka.filters.SimpleBatchFilter; /** <!-- globalinfo-start --> * A filter that applies filters on subsets of attributes and assembles the output into a new dataset. Attributes that are not covered by any of the ranges can be either retained or removed from the output. * <p/> <!-- globalinfo-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -D * Turns on output of debugging information.</pre> * * <pre> -F <classname [options]> * A filter to apply (can be specified multiple times).</pre> * * <pre> -R <range> * An attribute range (can be specified multiple times). * For each filter a range must be supplied. 'first' and 'last' * are valid indices. 'inv(...)' around the range denotes an * inverted range.</pre> * * <pre> -U * Flag for leaving unused attributes out of the output, by default * these are included in the filter output.</pre> * <!-- options-end --> * * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision: 8034 $ * @see weka.filters.StreamableFilter */ public class PartitionedMultiFilter extends SimpleBatchFilter { /** for serialization. */ private static final long serialVersionUID = -6293720886005713120L; /** The filters. */ protected Filter m_Filters[] = {new AllFilter()}; /** The attribute ranges. */ protected Range m_Ranges[] = {new Range("first-last")}; /** Whether unused attributes are left out of the output. */ protected boolean m_RemoveUnused = false; /** the indices of the unused attributes. */ protected int[] m_IndicesUnused = new int[0]; /** * Returns a string describing this filter. * @return a description of the filter suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "A filter that applies filters on subsets of attributes and " + "assembles the output into a new dataset. Attributes that are " + "not covered by any of the ranges can be either retained or removed " + "from the output."; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ public Enumeration listOptions() { Vector result = new Vector(); Enumeration enm = super.listOptions(); while (enm.hasMoreElements()) result.add(enm.nextElement()); result.addElement(new Option( "\tA filter to apply (can be specified multiple times).", "F", 1, "-F <classname [options]>")); result.addElement(new Option( "\tAn attribute range (can be specified multiple times).\n" + "\tFor each filter a range must be supplied. 'first' and 'last'\n" + "\tare valid indices. 'inv(...)' around the range denotes an\n" + "\tinverted range.", "R", 1, "-R <range>")); result.addElement(new Option( "\tFlag for leaving unused attributes out of the output, by default\n" + "\tthese are included in the filter output.", "U", 0, "-U")); return result.elements(); } /** * Parses a list of options for this object. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -D * Turns on output of debugging information.</pre> * * <pre> -F <classname [options]> * A filter to apply (can be specified multiple times).</pre> * * <pre> -R <range> * An attribute range (can be specified multiple times). * For each filter a range must be supplied. 'first' and 'last' * are valid indices. 'inv(...)' around the range denotes an * inverted range.</pre> * * <pre> -U * Flag for leaving unused attributes out of the output, by default * these are included in the filter output.</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String tmpStr; String classname; String[] options2; Vector objects; Range range; super.setOptions(options); setRemoveUnused(Utils.getFlag("U", options)); objects = new Vector(); while ((tmpStr = Utils.getOption("F", options)).length() != 0) { options2 = Utils.splitOptions(tmpStr); classname = options2[0]; options2[0] = ""; objects.add(Utils.forName(Filter.class, classname, options2)); } // at least one filter if (objects.size() == 0) objects.add(new AllFilter()); setFilters((Filter[]) objects.toArray(new Filter[objects.size()])); objects = new Vector(); while ((tmpStr = Utils.getOption("R", options)).length() != 0) { if (tmpStr.startsWith("inv(") && tmpStr.endsWith(")")) { range = new Range(tmpStr.substring(4, tmpStr.length() - 1)); range.setInvert(true); } else { range = new Range(tmpStr); } objects.add(range); } // at least one Range if (objects.size() == 0) objects.add(new Range("first-last")); setRanges((Range[]) objects.toArray(new Range[objects.size()])); // is number of filters the same as ranges? checkDimensions(); } /** * Gets the current settings of the filter. * * @return an array of strings suitable for passing to setOptions */ public String[] getOptions() { Vector result; String[] options; int i; String tmpStr; result = new Vector(); options = super.getOptions(); for (i = 0; i < options.length; i++) result.add(options[i]); if (getRemoveUnused()) result.add("-U"); for (i = 0; i < getFilters().length; i++) { result.add("-F"); result.add(getFilterSpec(getFilter(i))); } for (i = 0; i < getRanges().length; i++) { tmpStr = getRange(i).getRanges(); if (getRange(i).getInvert()) tmpStr = "inv(" + tmpStr + ")"; result.add("-R"); result.add(tmpStr); } return (String[]) result.toArray(new String[result.size()]); } /** * checks whether the dimensions of filters and ranges fit together. * * @throws Exception if dimensions differ */ protected void checkDimensions() throws Exception { if (getFilters().length != getRanges().length) throw new IllegalArgumentException( "Number of filters (= " + getFilters().length + ") " + "and ranges (= " + getRanges().length + ") don't match!"); } /** * tests the data whether the filter can actually handle it. * * @param instanceInfo the data to test * @throws Exception if the test fails */ protected void testInputFormat(Instances instanceInfo) throws Exception { for (int i = 0; i < getRanges().length; i++) { Instances newi = new Instances(instanceInfo, 0); if (instanceInfo.size() > 0){ newi.add((Instance)instanceInfo.get(0).copy()); } Range range = getRanges()[i]; range.setUpper(instanceInfo.numAttributes() - 1); Instances subset = generateSubset(newi, range); getFilters()[i].setInputFormat(subset); } } /** * Sets whether unused attributes (ones that are not covered by any of the * ranges) are removed from the output. * * @param value if true then the unused attributes get removed */ public void setRemoveUnused(boolean value) { m_RemoveUnused = value; } /** * Gets whether unused attributes (ones that are not covered by any of the * ranges) are removed from the output. * * @return true if unused attributes are removed */ public boolean getRemoveUnused() { return m_RemoveUnused; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String removeUnusedTipText() { return "If true then unused attributes (ones that are not covered by any " + "of the ranges) will be removed from the output."; } /** * Sets the list of possible filters to choose from. * Also resets the state of the filter (this reset doesn't affect the * options). * * @param filters an array of filters with all options set. * @see #reset() */ public void setFilters(Filter[] filters) { m_Filters = filters; reset(); } /** * Gets the list of possible filters to choose from. * * @return the array of Filters */ public Filter[] getFilters() { return m_Filters; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String filtersTipText() { return "The base filters to be used."; } /** * Gets a single filter from the set of available filters. * * @param index the index of the filter wanted * @return the Filter */ public Filter getFilter(int index) { return m_Filters[index]; } /** * returns the filter classname and the options as one string. * * @param filter the filter to get the specs for * @return the classname plus options */ protected String getFilterSpec(Filter filter) { String result; if (filter == null) { result = ""; } else { result = filter.getClass().getName(); if (filter instanceof OptionHandler) result += " " + Utils.joinOptions(((OptionHandler) filter).getOptions()); } return result; } /** * Sets the list of possible Ranges to choose from. * Also resets the state of the Range (this reset doesn't affect the * options). * * @param Ranges an array of Ranges with all options set. * @see #reset() */ public void setRanges(Range[] Ranges) { m_Ranges = Ranges; reset(); } /** * Gets the list of possible Ranges to choose from. * * @return the array of Ranges */ public Range[] getRanges() { return m_Ranges; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String rangesTipText() { return "The attribute ranges to be used; 'inv(...)' denotes an inverted range."; } /** * Gets a single Range from the set of available Ranges. * * @param index the index of the Range wanted * @return the Range */ public Range getRange(int index) { return m_Ranges[index]; } /** * determines the indices of unused attributes (ones that are not covered * by any of the range). * * @param data the data to base the determination on * @see #m_IndicesUnused */ protected void determineUnusedIndices(Instances data) { Vector<Integer> indices; int i; int n; boolean covered; // traverse all ranges indices = new Vector<Integer>(); for (i = 0; i < data.numAttributes(); i++) { if (i == data.classIndex()) continue; covered = false; for (n = 0; n < getRanges().length; n++) { if (getRanges()[n].isInRange(i)) { covered = true; break; } } if (!covered) indices.add(new Integer(i)); } // create array m_IndicesUnused = new int[indices.size()]; for (i = 0; i < indices.size(); i++) m_IndicesUnused[i] = indices.get(i).intValue(); if (getDebug()) System.out.println( "Unused indices: " + Utils.arrayToString(m_IndicesUnused)); } /** * generates a subset of the dataset with only the attributes from the range * (class is always added if present). * * @param data the data to work on * @param range the range of attribute to use * @return the generated subset * @throws Exception if creation fails */ protected Instances generateSubset(Instances data, Range range) throws Exception { Remove filter; StringBuilder atts; Instances result; int[] indices; int i; // determine attributes indices = range.getSelection(); atts = new StringBuilder(); for (i = 0; i < indices.length; i++) { if (i > 0) atts.append(","); atts.append("" + (indices[i] + 1)); } if ((data.classIndex() > -1) && (!range.isInRange(data.classIndex()))) atts.append("," + (data.classIndex() + 1)); // setup filter filter = new Remove(); filter.setAttributeIndices(atts.toString()); filter.setInvertSelection(true); filter.setInputFormat(data); // generate output result = Filter.useFilter(data, filter); return result; } /** * renames all the attributes in the dataset (excluding the class if present) * by adding the prefix to the name. * * @param data the data to work on * @param prefix the prefix for the attributes * @return a copy of the data with the attributes renamed * @throws Exception if renaming fails */ protected Instances renameAttributes(Instances data, String prefix) throws Exception { Instances result; int i; ArrayList<Attribute> atts; // rename attributes atts = new ArrayList<Attribute>(); for (i = 0; i < data.numAttributes(); i++) { if (i == data.classIndex()) atts.add((Attribute) data.attribute(i).copy()); else atts.add(data.attribute(i).copy(prefix + data.attribute(i).name())); } // create new dataset result = new Instances(data.relationName(), atts, data.numInstances()); for (i = 0; i < data.numInstances(); i++) { result.add((Instance) data.instance(i).copy()); } // set class if present if (data.classIndex() > -1) result.setClassIndex(data.classIndex()); return result; } /** * Determines the output format based only on the full input dataset and * returns this otherwise null is returned. In case the output format cannot * be returned immediately, i.e., immediateOutputFormat() returns false, * then this method will be called from batchFinished(). * * @param inputFormat the input format to base the output format on * @return the output format * @throws Exception in case the determination goes wrong * @see #hasImmediateOutputFormat() * @see #batchFinished() */ protected Instances determineOutputFormat(Instances inputFormat) throws Exception { Instances result; Instances processed; int i; int n; ArrayList<Attribute> atts; Attribute att; if (!isFirstBatchDone()) { // we need the full dataset here, see process(Instances) if (inputFormat.numInstances() == 0) return null; checkDimensions(); // determine unused indices determineUnusedIndices(inputFormat); atts = new ArrayList<Attribute>(); for (i = 0; i < getFilters().length; i++) { if (!isFirstBatchDone()) { // generate subset processed = generateSubset(inputFormat, getRange(i)); // set input format if (!getFilter(i).setInputFormat(processed)) Filter.useFilter(processed, getFilter(i)); } // get output format processed = getFilter(i).getOutputFormat(); // rename attributes processed = renameAttributes(processed, "filtered-" + i + "-"); // add attributes for (n = 0; n < processed.numAttributes(); n++) { if (n == processed.classIndex()) continue; atts.add((Attribute) processed.attribute(n).copy()); } } // add unused attributes if (!getRemoveUnused()) { for (i = 0; i < m_IndicesUnused.length; i++) { att = inputFormat.attribute(m_IndicesUnused[i]); atts.add(att.copy("unfiltered-" + att.name())); } } // add class if present if (inputFormat.classIndex() > -1) atts.add((Attribute) inputFormat.classAttribute().copy()); // generate new dataset result = new Instances(inputFormat.relationName(), atts, 0); if (inputFormat.classIndex() > -1) result.setClassIndex(result.numAttributes() - 1); } else { result = getOutputFormat(); } return result; } /** * Processes the given data (may change the provided dataset) and returns * the modified version. This method is called in batchFinished(). * * @param instances the data to process * @return the modified data * @throws Exception in case the processing goes wrong * @see #batchFinished() */ public Instances process(Instances instances) throws Exception { Instances result; int i; int n; int m; int index; Instances[] processed; Instance inst; Instance newInst; double[] values; Vector errors; if (!isFirstBatchDone()) { checkDimensions(); // set upper limits for (i = 0; i < m_Ranges.length; i++) m_Ranges[i].setUpper(instances.numAttributes() - 1); // determine unused indices determineUnusedIndices(instances); } // pass data through all filters processed = new Instances[getFilters().length]; for (i = 0; i < getFilters().length; i++) { processed[i] = generateSubset(instances, getRange(i)); if (!isFirstBatchDone()) getFilter(i).setInputFormat(processed[i]); processed[i] = Filter.useFilter(processed[i], getFilter(i)); } // set output format (can only be determined with full dataset, hence here) if (!isFirstBatchDone()) { result = determineOutputFormat(instances); setOutputFormat(result); } else { result = getOutputFormat(); } // check whether all filters didn't change the number of instances errors = new Vector(); for (i = 0; i < processed.length; i++) { if (processed[i].numInstances() != instances.numInstances()) errors.add(new Integer(i)); } if (errors.size() > 0) throw new IllegalStateException( "The following filter(s) changed the number of instances: " + errors); // assemble data for (i = 0; i < instances.numInstances(); i++) { inst = instances.instance(i); values = new double[result.numAttributes()]; // filtered data index = 0; for (n = 0; n < processed.length; n++) { for (m = 0; m < processed[n].numAttributes(); m++) { if (m == processed[n].classIndex()) continue; if (result.attribute(index).isString()) values[index] = result.attribute(index).addStringValue(processed[n].instance(i).stringValue(m)); else if (result.attribute(index).isRelationValued()) values[index] = result.attribute(index).addRelation(processed[n].instance(i).relationalValue(m)); else values[index] = processed[n].instance(i).value(m); index++; } } // unused attributes if (!getRemoveUnused()) { for (n = 0; n < m_IndicesUnused.length; n++) { if (result.attribute(index).isString()) values[index] = result.attribute(index).addStringValue(inst.stringValue(m_IndicesUnused[n])); else if (result.attribute(index).isRelationValued()) values[index] = result.attribute(index).addRelation(inst.relationalValue(m_IndicesUnused[n])); else values[index] = inst.value(m_IndicesUnused[n]); index++; } } // class if (instances.classIndex() > -1) values[values.length - 1] = inst.value(instances.classIndex()); // generate and add instance if (inst instanceof SparseInstance) newInst = new SparseInstance(instances.instance(i).weight(), values); else newInst = new DenseInstance(instances.instance(i).weight(), values); result.add(newInst); } return result; } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 8034 $"); } /** * Main method for executing this class. * * @param args should contain arguments for the filter: use -h for help */ public static void main(String[] args) { runFilter(new PartitionedMultiFilter(), args); } }