NumericCleaner.java example

Explorer
TimeSeriesClassification-master
- TimeSeriesClassification
  - src
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * NumericCleaner.java
 * Copyright (C) 2006-2012 University of Waikato, Hamilton, New Zealand
 */

package weka.filters.unsupervised.attribute;

import java.util.Enumeration;
import java.util.Vector;

import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.filters.SimpleStreamFilter;


/**
 <!-- globalinfo-start -->
 * A filter that 'cleanses' the numeric data from values that are too small, too big or very close to a certain value (e.g., 0) and sets these values to a pre-defined default.
 * <p/>
 <!-- globalinfo-end -->
 * 
 <!-- options-start -->
 * Valid options are: <p/>
 * 
 * <pre> -D
 *  Turns on output of debugging information.</pre>
 * 
 * <pre> -min <double>
 *  The minimum threshold. (default -Double.MAX_VALUE)</pre>
 * 
 * <pre> -min-default <double>
 *  The replacement for values smaller than the minimum threshold.
 *  (default -Double.MAX_VALUE)</pre>
 * 
 * <pre> -max <double>
 *  The maximum threshold. (default Double.MAX_VALUE)</pre>
 * 
 * <pre> -max-default <double>
 *  The replacement for values larger than the maximum threshold.
 *  (default Double.MAX_VALUE)</pre>
 * 
 * <pre> -closeto <double>
 *  The number values are checked for closeness. (default 0)</pre>
 * 
 * <pre> -closeto-default <double>
 *  The replacement for values that are close to '-closeto'.
 *  (default 0)</pre>
 * 
 * <pre> -closeto-tolerance <double>
 *  The tolerance below which numbers are considered being close to 
 *  to each other. (default 1E-6)</pre>
 * 
 * <pre> -decimals <int>
 *  The number of decimals to round to, -1 means no rounding at all.
 *  (default -1)</pre>
 * 
 * <pre> -R <col1,col2,...>
 *  The list of columns to cleanse, e.g., first-last or first-3,5-last.
 *  (default first-last)</pre>
 * 
 * <pre> -V
 *  Inverts the matching sense.</pre>
 * 
 * <pre> -include-class
 *  Whether to include the class in the cleansing.
 *  The class column will always be skipped, if this flag is not
 *  present. (default no)</pre>
 * 
 <!-- options-end -->
 *
 * @author  fracpete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 8280 $
 */
public class NumericCleaner
  extends SimpleStreamFilter {

  /** for serialization */
  private static final long serialVersionUID = -352890679895066592L;

  /**
   * Magnitude limit for scaled values that may be passed to
   * {@link StrictMath#round(double)}. That method returns a <code>long</code>
   * and saturates at Long.MIN_VALUE/Long.MAX_VALUE, so anything at or beyond
   * this limit would be silently replaced by the saturation value.
   */
  private static final double MAX_ROUNDABLE = Long.MAX_VALUE;

  /** the minimum threshold */
  protected double m_MinThreshold = -Double.MAX_VALUE;

  /** the minimum default replacement value */
  protected double m_MinDefault = -Double.MAX_VALUE;

  /** the maximum threshold */
  protected double m_MaxThreshold = Double.MAX_VALUE;

  /** the maximum default replacement value */
  protected double m_MaxDefault = Double.MAX_VALUE;

  /** the number the values are checked for closeness to */
  protected double m_CloseTo = 0;

  /** the default replacement value for numbers "close-to" */
  protected double m_CloseToDefault = 0;

  /** the tolerance distance, below which numbers are considered being "close-to" */
  protected double m_CloseToTolerance = 1E-6;

  /** Stores which columns to cleanse */
  protected Range m_Cols = new Range("first-last");

  /** whether to include the class attribute */
  protected boolean m_IncludeClass = false;

  /** the number of decimals to round to (-1 means no rounding) */
  protected int m_Decimals = -1;

  /**
   * Returns a string describing this filter.
   *
   * @return      a description of the filter suitable for
   *              displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return
        "A filter that 'cleanses' the numeric data from values that are too "
      + "small, too big or very close to a certain value (e.g., 0) and sets "
      + "these values to a pre-defined default.";
  }

  /**
   * Returns an enumeration describing the available options: the options
   * of the superclass followed by the ones specific to this filter.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector<Option> result = new Vector<Option>();

    // options of the superclass come first
    Enumeration enm = super.listOptions();
    while (enm.hasMoreElements()) {
      result.addElement((Option) enm.nextElement());
    }

    result.addElement(new Option(
        "\tThe minimum threshold. (default -Double.MAX_VALUE)",
        "min", 1, "-min <double>"));

    result.addElement(new Option(
        "\tThe replacement for values smaller than the minimum threshold.\n"
        + "\t(default -Double.MAX_VALUE)",
        "min-default", 1, "-min-default <double>"));

    result.addElement(new Option(
        "\tThe maximum threshold. (default Double.MAX_VALUE)",
        "max", 1, "-max <double>"));

    result.addElement(new Option(
        "\tThe replacement for values larger than the maximum threshold.\n"
        + "\t(default Double.MAX_VALUE)",
        "max-default", 1, "-max-default <double>"));

    result.addElement(new Option(
        "\tThe number values are checked for closeness. (default 0)",
        "closeto", 1, "-closeto <double>"));

    result.addElement(new Option(
        "\tThe replacement for values that are close to '-closeto'.\n"
        + "\t(default 0)",
        "closeto-default", 1, "-closeto-default <double>"));

    result.addElement(new Option(
        "\tThe tolerance below which numbers are considered being close to \n"
        + "\tto each other. (default 1E-6)",
        "closeto-tolerance", 1, "-closeto-tolerance <double>"));

    result.addElement(new Option(
        "\tThe number of decimals to round to, -1 means no rounding at all.\n"
        + "\t(default -1)",
        "decimals", 1, "-decimals <int>"));

    result.addElement(new Option(
        "\tThe list of columns to cleanse, e.g., first-last or first-3,5-last.\n"
        + "\t(default first-last)",
        "R", 1, "-R <col1,col2,...>"));

    result.addElement(new Option(
        "\tInverts the matching sense.",
        "V", 0, "-V"));

    result.addElement(new Option(
        "\tWhether to include the class in the cleansing.\n"
        + "\tThe class column will always be skipped, if this flag is not\n"
        + "\tpresent. (default no)",
        "include-class", 0, "-include-class"));

    return result.elements();
  }

  /**
   * Gets the current settings of the filter: the options of the superclass
   * followed by the ones specific to this filter.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String[] getOptions() {
    Vector<String> result = new Vector<String>();

    for (String option : super.getOptions()) {
      result.add(option);
    }

    result.add("-min");
    result.add(Double.toString(m_MinThreshold));

    result.add("-min-default");
    result.add(Double.toString(m_MinDefault));

    result.add("-max");
    result.add(Double.toString(m_MaxThreshold));

    result.add("-max-default");
    result.add(Double.toString(m_MaxDefault));

    result.add("-closeto");
    result.add(Double.toString(m_CloseTo));

    result.add("-closeto-default");
    result.add(Double.toString(m_CloseToDefault));

    result.add("-closeto-tolerance");
    result.add(Double.toString(m_CloseToTolerance));

    result.add("-R");
    result.add("" + m_Cols.getRanges());

    if (m_Cols.getInvert()) {
      result.add("-V");
    }

    if (m_IncludeClass) {
      result.add("-include-class");
    }

    result.add("-decimals");
    result.add(Integer.toString(getDecimals()));

    return result.toArray(new String[result.size()]);
  }

  /**
   * Parses a given list of options. Every option that is not present is
   * reset to its default value. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   * 
   * <pre> -D
   *  Turns on output of debugging information.</pre>
   * 
   * <pre> -min <double>
   *  The minimum threshold. (default -Double.MAX_VALUE)</pre>
   * 
   * <pre> -min-default <double>
   *  The replacement for values smaller than the minimum threshold.
   *  (default -Double.MAX_VALUE)</pre>
   * 
   * <pre> -max <double>
   *  The maximum threshold. (default Double.MAX_VALUE)</pre>
   * 
   * <pre> -max-default <double>
   *  The replacement for values larger than the maximum threshold.
   *  (default Double.MAX_VALUE)</pre>
   * 
   * <pre> -closeto <double>
   *  The number values are checked for closeness. (default 0)</pre>
   * 
   * <pre> -closeto-default <double>
   *  The replacement for values that are close to '-closeto'.
   *  (default 0)</pre>
   * 
   * <pre> -closeto-tolerance <double>
   *  The tolerance below which numbers are considered being close to 
   *  to each other. (default 1E-6)</pre>
   * 
   * <pre> -decimals <int>
   *  The number of decimals to round to, -1 means no rounding at all.
   *  (default -1)</pre>
   * 
   * <pre> -R <col1,col2,...>
   *  The list of columns to cleanse, e.g., first-last or first-3,5-last.
   *  (default first-last)</pre>
   * 
   * <pre> -V
   *  Inverts the matching sense.</pre>
   * 
   * <pre> -include-class
   *  Whether to include the class in the cleansing.
   *  The class column will always be skipped, if this flag is not
   *  present. (default no)</pre>
   * 
   <!-- options-end -->
   * 
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported 
   */
  public void setOptions(String[] options) throws Exception {
    setMinThreshold(parseDoubleOption("min", options, -Double.MAX_VALUE));
    setMinDefault(parseDoubleOption("min-default", options, -Double.MAX_VALUE));
    setMaxThreshold(parseDoubleOption("max", options, Double.MAX_VALUE));
    setMaxDefault(parseDoubleOption("max-default", options, Double.MAX_VALUE));
    setCloseTo(parseDoubleOption("closeto", options, 0));
    setCloseToDefault(parseDoubleOption("closeto-default", options, 0));
    setCloseToTolerance(parseDoubleOption("closeto-tolerance", options, 1E-6));

    String columns = Utils.getOption("R", options);
    setAttributeIndices((columns.length() != 0) ? columns : "first-last");

    setInvertSelection(Utils.getFlag("V", options));

    setIncludeClass(Utils.getFlag("include-class", options));

    String decimals = Utils.getOption("decimals", options);
    setDecimals((decimals.length() != 0) ? Integer.parseInt(decimals) : -1);

    // the superclass parses its own options last, as before
    super.setOptions(options);
  }

  /**
   * Reads a double-valued option from the option array.
   *
   * @param name          the name of the option (without leading dash)
   * @param options       the option array to read from
   * @param defaultValue  the value to use if the option is not present
   * @return              the parsed value or the default
   * @throws Exception    if retrieving or parsing the option fails
   */
  private static double parseDoubleOption(String name, String[] options,
      double defaultValue) throws Exception {
    String text = Utils.getOption(name, options);
    if (text.length() == 0) {
      return defaultValue;
    }
    return Double.parseDouble(text);
  }

  /** 
   * Returns the Capabilities of this filter: all attribute and class types
   * (including missing values) as well as data without a class.
   *
   * @return            the capabilities of this object
   * @see               Capabilities
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();

    // attributes
    result.enableAllAttributes();
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enableAllClasses();
    result.enable(Capability.MISSING_CLASS_VALUES);
    result.enable(Capability.NO_CLASS);

    return result;
  }

  /**
   * Determines the output format based on the input format and returns 
   * this. In case the output format cannot be returned immediately, i.e.,
   * immediateOutputFormat() returns false, then this method will be called
   * from batchFinished(). The structure of the data is not changed by this
   * filter; only the upper limit of the column range gets initialized here.
   *
   * @param inputFormat     the input format to base the output format on
   * @return                the output format
   * @throws Exception      in case the determination goes wrong
   * @see   #hasImmediateOutputFormat()
   * @see   #batchFinished()
   */
  protected Instances determineOutputFormat(Instances inputFormat)
      throws Exception {

    // the range needs the highest valid index before isInRange can be used
    m_Cols.setUpper(inputFormat.numAttributes() - 1);

    return new Instances(inputFormat);
  }

  /**
   * Processes the given instance and returns a cleansed copy of it; the
   * provided instance itself is left untouched. For every selected numeric
   * attribute, the value is replaced by the respective default if it is
   * below the minimum threshold, above the maximum threshold or closer to
   * the "close to" number than the tolerance (without being equal to it).
   * Only the first matching rule is applied. Afterwards, non-missing values
   * get rounded if a number of decimals has been set.
   *
   * @param instance    the instance to process
   * @return            the modified data
   * @throws Exception  in case the processing goes wrong
   */
  protected Instance process(Instance instance) throws Exception {
    Instance result = (Instance) instance.copy();

    boolean rounding = (m_Decimals > -1);
    double factor = rounding ? StrictMath.pow(10, m_Decimals) : 1;

    for (int i = 0; i < result.numAttributes(); i++) {
      // only numeric attributes
      if (!result.attribute(i).isNumeric()) {
        continue;
      }

      // out of range?
      if (!m_Cols.isInRange(i)) {
        continue;
      }

      // skip class?
      if ((result.classIndex() == i) && !m_IncludeClass) {
        continue;
      }

      double current = result.value(i);

      if (current < m_MinThreshold) {
        // too small
        if (getDebug()) {
          System.out.println("Too small: " + current + " -> " + m_MinDefault);
        }
        result.setValue(i, m_MinDefault);
      }
      else if (current > m_MaxThreshold) {
        // too big
        if (getDebug()) {
          System.out.println("Too big: " + current + " -> " + m_MaxDefault);
        }
        result.setValue(i, m_MaxDefault);
      }
      else if (   (current - m_CloseTo < m_CloseToTolerance)
               && (m_CloseTo - current < m_CloseToTolerance)
               && (current != m_CloseTo) ) {
        // too close (values exactly equal to the number are left alone)
        if (getDebug()) {
          System.out.println("Too close: " + current + " -> " + m_CloseToDefault);
        }
        result.setValue(i, m_CloseToDefault);
      }

      // decimals? (uses the possibly replaced value)
      if (rounding && !result.isMissing(i)) {
        result.setValue(i, roundToDecimals(result.value(i), factor));
      }
    }

    return result;
  }

  /**
   * Rounds a value to the number of decimals represented by the factor.
   * <p/>
   * {@link StrictMath#round(double)} saturates at the limits of
   * <code>long</code>. Values whose scaled magnitude does not fit (e.g.,
   * the default replacement values of +/-Double.MAX_VALUE or infinite
   * values) are therefore returned unchanged instead of being collapsed to
   * the saturation value; doubles of that magnitude have no fractional
   * digits that could be rounded anyway.
   *
   * @param value   the value to round
   * @param factor  10 to the power of the number of decimals
   * @return        the rounded value, or the value itself if it cannot be
   *                rounded safely
   */
  private static double roundToDecimals(double value, double factor) {
    double scaled = value * factor;

    // negated comparison also catches a NaN product (e.g., 0 * infinity)
    if (!(StrictMath.abs(scaled) < MAX_ROUNDABLE)) {
      return value;
    }

    return StrictMath.round(scaled) / factor;
  }

  /**
   * Returns the tip text for this property
   * 
   * @return    tip text for this property suitable for
   *            displaying in the explorer/experimenter gui
   */
  public String minThresholdTipText() {
    return "The minimum threshold below values are replaced by a default.";
  }

  /**
   * Get the minimum threshold. 
   *
   * @return    the minimum threshold.
   */
  public double getMinThreshold() {
    return m_MinThreshold;
  }

  /**
   * Set the minimum threshold. 
   *
   * @param value   the minimum threshold to use.
   */
  public void setMinThreshold(double value) {
    m_MinThreshold = value;
  }

  /**
   * Returns the tip text for this property
   * 
   * @return    tip text for this property suitable for
   *            displaying in the explorer/experimenter gui
   */
  public String minDefaultTipText() {
    return "The default value to replace values that are below the minimum threshold.";
  }

  /**
   * Get the minimum default. 
   *
   * @return    the minimum default.
   */
  public double getMinDefault() {
    return m_MinDefault;
  }

  /**
   * Set the minimum default. 
   *
   * @param value   the minimum default to use.
   */
  public void setMinDefault(double value) {
    m_MinDefault = value;
  }

  /**
   * Returns the tip text for this property
   * 
   * @return    tip text for this property suitable for
   *            displaying in the explorer/experimenter gui
   */
  public String maxThresholdTipText() {
    return "The maximum threshold above values are replaced by a default.";
  }

  /**
   * Get the maximum threshold. 
   *
   * @return    the maximum threshold.
   */
  public double getMaxThreshold() {
    return m_MaxThreshold;
  }

  /**
   * Set the maximum threshold. 
   *
   * @param value   the maximum threshold to use.
   */
  public void setMaxThreshold(double value) {
    m_MaxThreshold = value;
  }

  /**
   * Returns the tip text for this property
   * 
   * @return    tip text for this property suitable for
   *            displaying in the explorer/experimenter gui
   */
  public String maxDefaultTipText() {
    return "The default value to replace values that are above the maximum threshold.";
  }

  /**
   * Get the maximum default. 
   *
   * @return    the maximum default.
   */
  public double getMaxDefault() {
    return m_MaxDefault;
  }

  /**
   * Set the maximum default. 
   *
   * @param value   the maximum default to use.
   */
  public void setMaxDefault(double value) {
    m_MaxDefault = value;
  }

  /**
   * Returns the tip text for this property
   * 
   * @return    tip text for this property suitable for
   *            displaying in the explorer/experimenter gui
   */
  public String closeToTipText() {
    return
        "The number values are checked for whether they are too close to "
      + "and get replaced by a default.";
  }

  /**
   * Get the "close to" number.
   *
   * @return    the "close to" number.
   */
  public double getCloseTo() {
    return m_CloseTo;
  }

  /**
   * Set the "close to" number.
   *
   * @param value   the number to use for checking closeness.
   */
  public void setCloseTo(double value) {
    m_CloseTo = value;
  }

  /**
   * Returns the tip text for this property
   * 
   * @return    tip text for this property suitable for
   *            displaying in the explorer/experimenter gui
   */
  public String closeToDefaultTipText() {
    return "The default value to replace values with that are too close.";
  }

  /**
   * Get the "close to" default.
   *
   * @return    the "close to" default.
   */
  public double getCloseToDefault() {
    return m_CloseToDefault;
  }

  /**
   * Set the "close to" default. 
   *
   * @param value   the "close to" default to use.
   */
  public void setCloseToDefault(double value) {
    m_CloseToDefault = value;
  }

  /**
   * Returns the tip text for this property
   * 
   * @return    tip text for this property suitable for
   *            displaying in the explorer/experimenter gui
   */
  public String closeToToleranceTipText() {
    return "The value below which values are considered close to.";
  }

  /**
   * Get the "close to" tolerance.
   *
   * @return    the "close to" tolerance.
   */
  public double getCloseToTolerance() {
    return m_CloseToTolerance;
  }

  /**
   * Set the "close to" tolerance. 
   *
   * @param value   the "close to" tolerance to use.
   */
  public void setCloseToTolerance(double value) {
    m_CloseToTolerance = value;
  }

  /**
   * Returns the tip text for this property
   *
   * @return    tip text for this property suitable for
   *            displaying in the explorer/experimenter gui
   */
  public String attributeIndicesTipText() {
    return "The selection of columns to use in the cleansing processs, first and last are valid indices.";
  }

  /**
   * Gets the selection of the columns, e.g., first-last or first-3,5-last
   *
   * @return    the selected indices
   */
  public String getAttributeIndices() {
    return m_Cols.getRanges();
  }

  /**
   * Sets the columns to use, e.g., first-last or first-3,5-last
   *
   * @param value   the columns to use
   */
  public void setAttributeIndices(String value) {
    m_Cols.setRanges(value);
  }

  /**
   * Returns the tip text for this property
   *
   * @return    tip text for this property suitable for
   *            displaying in the explorer/experimenter gui
   */
  public String invertSelectionTipText() {
    return "If enabled the selection of the columns is inverted.";
  }

  /**
   * Gets whether the selection of the columns is inverted
   *
   * @return    true if the selection is inverted
   */
  public boolean getInvertSelection() {
    return m_Cols.getInvert();
  }

  /**
   * Sets whether the selection of the indices is inverted or not
   *
   * @param value   the new invert setting
   */
  public void setInvertSelection(boolean value) {
    m_Cols.setInvert(value);
  }

  /**
   * Returns the tip text for this property
   * 
   * @return    tip text for this property suitable for
   *            displaying in the explorer/experimenter gui
   */
  public String includeClassTipText() {
    return "If disabled, the class attribute will be always left out of the cleaning process.";
  }

  /**
   * Gets whether the class is included in the cleaning process or always 
   * skipped.
   *
   * @return    true if the class can be considered for cleaning.
   */
  public boolean getIncludeClass() {
    return m_IncludeClass;
  }

  /**
   * Sets whether the class can be cleaned, too.
   *
   * @param value   true if the class can be cleansed, too
   */
  public void setIncludeClass(boolean value) {
    m_IncludeClass = value;
  }

  /**
   * Returns the tip text for this property
   * 
   * @return    tip text for this property suitable for
   *            displaying in the explorer/experimenter gui
   */
  public String decimalsTipText() {
    return "The number of decimals to round to, -1 means no rounding at all.";
  }

  /**
   * Get the number of decimals to round to. 
   *
   * @return    the number of decimals.
   */
  public int getDecimals() {
    return m_Decimals;
  }

  /**
   * Set the number of decimals to round to.
   *
   * @param value   the number of decimals.
   */
  public void setDecimals(int value) {
    m_Decimals = value;
  }

  /**
   * Returns the revision string.
   * 
   * @return    the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 8280 $");
  }

  /**
   * Runs the filter from commandline, use "-h" to see all options.
   * 
   * @param args the commandline options for the filter
   */
  public static void main(String[] args) {
    runFilter(new NumericCleaner(), args);
  }
}