// PrincipalComponents.java example
//
// Explorer
// jDenetX-master
// - src
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * PrincipalComponents.java
 * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand
 */

package weka.filters.unsupervised.attribute;

import weka.core.*;
import weka.core.Capabilities.Capability;
import weka.core.matrix.EigenvalueDecomposition;
import weka.core.matrix.Matrix;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;

import java.util.Enumeration;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * Performs a principal components analysis and transformation of the data.<br/>
 * Dimensionality reduction is accomplished by choosing enough eigenvectors to account for some percentage of the variance in the original data -- default 0.95 (95%).<br/>
 * Based on code of the attribute selection scheme 'PrincipalComponents' by Mark Hall and Gabi Schmidberger.
 * <p/>
 <!-- globalinfo-end -->
 * 
 <!-- options-start -->
 * Valid options are: <p/>
 * 
 * <pre> -D
 *  Don't normalize input data.</pre>
 * 
 * <pre> -R <num>
 *  Retain enough PC attributes to account
 *  for this proportion of variance in the original data.
 *  (default: 0.95)</pre>
 * 
 * <pre> -A <num>
 *  Maximum number of attributes to include in 
 *  transformed attribute names.
 *  (-1 = include all, default: 5)</pre>
 * 
 * <pre> -M <num>
 *  Maximum number of PC attributes to retain.
 *  (-1 = include all, default: -1)</pre>
 * 
 <!-- options-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz) -- attribute selection code
 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) -- attribute selection code
 * @author fracpete (fracpete at waikato dot ac dot nz) -- filter code
 * @version $Revision: 5987 $
 */
public class PrincipalComponents
  extends Filter
  implements OptionHandler, UnsupervisedFilter {

  /** for serialization. */
  private static final long serialVersionUID = 4626939780964387784L;

  /** The data to analyse/transform. */
  protected Instances m_TrainInstances;

  /** Keep a copy for the class attribute (if set). */
  protected Instances m_TrainCopy;

  /** The header for the transformed data format. */
  protected Instances m_TransformedFormat;

  /** Data has a class set. */
  protected boolean m_HasClass;

  /** Class index. */
  protected int m_ClassIndex;

  /** Number of attributes. */
  protected int m_NumAttribs;

  /** Number of instances. */
  protected int m_NumInstances;

  /** Correlation matrix for the original data. */
  protected double[][] m_Correlation;

  /** Will hold the unordered linear transformations of the (normalized)
      original data. */
  protected double[][] m_Eigenvectors;

  /** Eigenvalues for the corresponding eigenvectors. */
  protected double[] m_Eigenvalues = null;

  /** Indices into m_Eigenvalues in sorted order (as returned by Utils.sort). */
  protected int[] m_SortedEigens;

  /** sum of the eigenvalues. */
  protected double m_SumOfEigenValues = 0.0;

  /** Filters for replacing missing values. */
  protected ReplaceMissingValues m_ReplaceMissingFilter;
  
  /** Filter for normalizing the data. */
  protected Normalize m_NormalizeFilter;
  
  /** Filter for turning nominal values into numeric ones. */
  protected NominalToBinary m_NominalToBinaryFilter;
  
  /** Filter for removing the class attribute and attributes with at most one distinct value. */
  protected Remove m_AttributeFilter;

  /** The number of attributes in the pc transformed data. */
  protected int m_OutputNumAtts = -1;

  /** normalize the input data? */
  protected boolean m_Normalize = true;

  /** the amount of variance to cover in the original data when
      retaining the best n PC's. */
  protected double m_CoverVariance = 0.95;

  /** maximum number of attributes in the transformed attribute name. */
  protected int m_MaxAttrsInName = 5;

  /** maximum number of attributes in the transformed data (-1 for all). */
  protected int m_MaxAttributes = -1;

  /**
   * Returns a string describing this filter.
   *
   * @return 		a description of the filter suitable for
   * 			displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    // One append per sentence of the description; the assembled text is
    // character-for-character the same as the documented global info.
    StringBuilder info = new StringBuilder();
    info.append("Performs a principal components analysis and transformation of the data.\n");
    info.append("Dimensionality reduction is accomplished by choosing enough eigenvectors ")
        .append("to account for some percentage of the variance in the original data -- ")
        .append("default 0.95 (95%).\n");
    info.append("Based on code of the attribute selection scheme 'PrincipalComponents' ")
        .append("by Mark Hall and Gabi Schmidberger.");
    return info.toString();
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return 		an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector<Option> options = new Vector<Option>();

    // -D is a plain flag (zero arguments)
    String normalizeDesc = "\tDon't normalize input data.";
    options.addElement(new Option(normalizeDesc, "D", 0, "-D"));

    // -R takes the proportion of variance to cover
    String varianceDesc =
        "\tRetain enough PC attributes to account\n"
      + "\tfor this proportion of variance in the original data.\n"
      + "\t(default: 0.95)";
    options.addElement(new Option(varianceDesc, "R", 1, "-R <num>"));

    // -A limits how many source attributes show up in a generated name
    String maxNamesDesc =
        "\tMaximum number of attributes to include in \n"
      + "\ttransformed attribute names.\n"
      + "\t(-1 = include all, default: 5)";
    options.addElement(new Option(maxNamesDesc, "A", 1, "-A <num>"));

    // -M limits how many principal components are kept
    String maxAttsDesc =
        "\tMaximum number of PC attributes to retain.\n"
      + "\t(-1 = include all, default: -1)";
    options.addElement(new Option(maxAttsDesc, "M", 1, "-M <num>"));

    return options.elements();
  }

  /**
   * Parses a list of options for this object. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   * 
   * <pre> -D
   *  Don't normalize input data.</pre>
   * 
   * <pre> -R <num>
   *  Retain enough PC attributes to account
   *  for this proportion of variance in the original data.
   *  (default: 0.95)</pre>
   * 
   * <pre> -A <num>
   *  Maximum number of attributes to include in 
   *  transformed attribute names.
   *  (-1 = include all, default: 5)</pre>
   * 
   * <pre> -M <num>
   *  Maximum number of PC attributes to retain.
   *  (-1 = include all, default: -1)</pre>
   * 
   <!-- options-end -->
   *
   * @param options 	the list of options as an array of strings
   * @throws Exception 	if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    // Each valued option falls back to its documented default when it is
    // not present (Utils.getOption then yields an empty string).
    String variance = Utils.getOption('R', options);
    setVarianceCovered(variance.length() == 0 ? 0.95 : Double.parseDouble(variance));

    String maxNames = Utils.getOption('A', options);
    setMaximumAttributeNames(maxNames.length() == 0 ? 5 : Integer.parseInt(maxNames));

    String maxAtts = Utils.getOption('M', options);
    setMaximumAttributes(maxAtts.length() == 0 ? -1 : Integer.parseInt(maxAtts));

    // -D switches normalization OFF, hence the negation
    boolean skipNormalization = Utils.getFlag('D', options);
    setNormalize(!skipNormalization);
  }

  /**
   * Gets the current settings of the filter.
   *
   * @return 		an array of strings suitable for passing to setOptions
   */
  public String[] getOptions() {
    Vector<String> opts = new Vector<String>();

    // valued options are always emitted, each as a flag/value pair
    opts.add("-R");
    opts.add(String.valueOf(getVarianceCovered()));

    opts.add("-A");
    opts.add(String.valueOf(getMaximumAttributeNames()));

    opts.add("-M");
    opts.add(String.valueOf(getMaximumAttributes()));

    // the -D flag is only present when normalization is switched off
    if (!getNormalize()) {
      opts.add("-D");
    }

    String[] settings = new String[opts.size()];
    return opts.toArray(settings);
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return 		tip text for this property suitable for
   * 			displaying in the explorer/experimenter gui
   */
  public String normalizeTipText() {
    // tip text for the "normalize" property
    final String tip = "Normalize input data.";
    return tip;
  }

  /**
   * Set whether input data will be normalized.
   * 
   * @param value 	true if input data is to be normalized
   */
  public void setNormalize(boolean value) {
    // stored as-is; read by setup() and convertInstance()
    this.m_Normalize = value;
  }

  /**
   * Gets whether or not input data is to be normalized.
   * 
   * @return 		true if input data is to be normalized
   */
  public boolean getNormalize() {
    // current value of the normalization switch
    return this.m_Normalize;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return 		tip text for this property suitable for
   * 			displaying in the explorer/experimenter gui
   */
  public String varianceCoveredTipText() {
    // tip text for the "varianceCovered" property
    final String tip =
      "Retain enough PC attributes to account for this proportion of variance.";
    return tip;
  }

  /**
   * Sets the amount of variance to account for when retaining
   * principal components.
   * 
   * @param value 	the proportion of total variance to account for
   */
  public void setVarianceCovered(double value) {
    // no range check is applied; the value is compared against the
    // cumulative eigenvalue ratio when components are selected
    this.m_CoverVariance = value;
  }

  /**
   * Gets the proportion of total variance to account for when
   * retaining principal components.
   * 
   * @return 		the proportion of variance to account for
   */
  public double getVarianceCovered() {
    // proportion of variance currently requested
    return this.m_CoverVariance;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return 		tip text for this property suitable for
   * 			displaying in the explorer/experimenter gui
   */
  public String maximumAttributeNamesTipText() {
    // tip text for the "maximumAttributeNames" property
    final String tip =
      "The maximum number of attributes to include in transformed attribute names.";
    return tip;
  }

  /**
   * Sets maximum number of attributes to include in
   * transformed attribute names.
   * 
   * @param value 	the maximum number of attributes
   */
  public void setMaximumAttributeNames(int value) {
    // values <= 0 mean "include all attributes" when names are built
    this.m_MaxAttrsInName = value;
  }

  /**
   * Gets maximum number of attributes to include in
   * transformed attribute names.
   * 
   * @return 		the maximum number of attributes
   */
  public int getMaximumAttributeNames() {
    // current limit on attributes per generated name
    return this.m_MaxAttrsInName;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return 		tip text for this property suitable for
   * 			displaying in the explorer/experimenter gui
   */
  public String maximumAttributesTipText() {
    // tip text for the "maximumAttributes" property
    final String tip = "The maximum number of PC attributes to retain.";
    return tip;
  }

  /**
   * Sets maximum number of PC attributes to retain.
   * 
   * @param value 	the maximum number of attributes
   */
  public void setMaximumAttributes(int value) {
    // values <= 0 disable the limit on retained components
    this.m_MaxAttributes = value;
  }

  /**
   * Gets maximum number of PC attributes to retain.
   * 
   * @return 		the maximum number of attributes
   */
  public int getMaximumAttributes() {
    // current limit on retained components
    return this.m_MaxAttributes;
  }

  /**
   * Returns the capabilities of this filter.
   *
   * @return            the capabilities of this filter
   * @see               Capabilities
   */
  public Capabilities getCapabilities() {
    // start from the inherited capabilities with everything switched off
    Capabilities caps = super.getCapabilities();
    caps.disableAll();

    // attribute capabilities first, then class capabilities; enabled in
    // this fixed order
    Capability[] supported = {
      Capability.NOMINAL_ATTRIBUTES,
      Capability.NUMERIC_ATTRIBUTES,
      Capability.DATE_ATTRIBUTES,
      Capability.MISSING_VALUES,
      Capability.NOMINAL_CLASS,
      Capability.NUMERIC_CLASS,
      Capability.DATE_CLASS,
      Capability.MISSING_CLASS_VALUES,
      Capability.NO_CLASS
    };
    for (Capability capability : supported) {
      caps.enable(capability);
    }

    return caps;
  }

  /**
   * Determines the output format based on the input format and returns 
   * this. In case the output format cannot be returned immediately, i.e.,
   * hasImmediateOutputFormat() returns false, then this method will be called
   * from batchFinished().
   *
   * @param inputFormat     the input format to base the output format on
   * @return                the output format
   * @throws Exception      in case the determination goes wrong
   * @see   #hasImmediateOutputFormat()
   * @see   #batchFinished()
   */
  protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
    // No eigen decomposition is available yet (m_Eigenvalues is filled by
    // setup()), so the input format is handed back unchanged.
    if (m_Eigenvalues == null) {
      return inputFormat;
    }

    // Components are visited from the last entry of m_SortedEigens downwards;
    // a positive m_MaxAttributes limits how far down that walk may go.
    int numAttsLowerBound =
      (m_MaxAttributes > 0) ? Math.max(0, m_NumAttribs - m_MaxAttributes) : 0;

    // number of original attributes that appear in each generated name
    int numAttrsInName = (m_MaxAttrsInName > 0)
      ? Math.min(m_NumAttribs, m_MaxAttrsInName)
      : m_NumAttribs;

    double cumulative = 0.0;
    FastVector attributes = new FastVector();
    for (int i = m_NumAttribs - 1; i >= numAttsLowerBound; i--) {
      int eigenColumn = m_SortedEigens[i];

      // Order in which the coefficients are written into the name.
      // The original tested m_NumAttribs here, which made the
      // "original order" branch ineffective; the intended switch is
      // m_MaxAttrsInName.
      int[] coeffInds;
      if (m_MaxAttrsInName > 0) {
        // only a subset is shown: sort on negated magnitudes so that the
        // coefficients with the largest magnitude are listed first
        double[] coeffMags = new double[m_NumAttribs];
        for (int j = 0; j < m_NumAttribs; j++) {
          coeffMags[j] = -Math.abs(m_Eigenvectors[j][eigenColumn]);
        }
        coeffInds = Utils.sort(coeffMags);
      }
      else {
        // all coefficients are shown: keep the original attribute order
        coeffInds = new int[m_NumAttribs];
        for (int j = 0; j < m_NumAttribs; j++) {
          coeffInds[j] = j;
        }
      }

      // build the attribute name as a signed sum "<coeff><attr>+<coeff><attr>..."
      StringBuffer attName = new StringBuffer();
      for (int j = 0; j < numAttrsInName; j++) {
        double coeffValue = m_Eigenvectors[coeffInds[j]][eigenColumn];
        if (j > 0 && coeffValue >= 0) {
          attName.append("+");
        }
        attName.append(
            Utils.doubleToString(coeffValue, 5, 3)
            + inputFormat.attribute(coeffInds[j]).name());
      }
      if (numAttrsInName < m_NumAttribs) {
        // signal that the name was truncated
        attName.append("...");
      }

      attributes.addElement(new Attribute(attName.toString()));
      cumulative += m_Eigenvalues[eigenColumn];

      // stop as soon as the requested share of the variance is covered
      if ((cumulative / m_SumOfEigenValues) >= m_CoverVariance) {
        break;
      }
    }

    // the class attribute (if any) is carried over as the last attribute
    if (m_HasClass) {
      attributes.addElement(m_TrainCopy.classAttribute().copy());
    }

    Instances outputFormat =
      new Instances(
          m_TrainCopy.relationName() + "_principal components", attributes, 0);

    // set the class to be the last attribute if necessary
    if (m_HasClass) {
      outputFormat.setClassIndex(outputFormat.numAttributes() - 1);
    }

    // remembered so that convertInstance() can size its value array
    m_OutputNumAtts = outputFormat.numAttributes();

    return outputFormat;
  }

  /**
   * Fill the correlation matrix.
   */
  protected void fillCorrelation() {
    double[] colI = new double[m_NumInstances];
    double[] colJ = new double[m_NumInstances];

    m_Correlation = new double[m_NumAttribs][m_NumAttribs];

    for (int i = 0; i < m_NumAttribs; i++) {
      // an attribute is perfectly correlated with itself
      m_Correlation[i][i] = 1.0;

      // extract column i once per row instead of once per (i, j) pair
      for (int k = 0; k < m_NumInstances; k++) {
        colI[k] = m_TrainInstances.instance(k).value(i);
      }

      // Only the upper triangle is computed and then mirrored, which halves
      // the number of Utils.correlation calls.
      // NOTE(review): relies on Utils.correlation being symmetric in its two
      // array arguments (true for Pearson correlation) - confirm.
      for (int j = i + 1; j < m_NumAttribs; j++) {
        for (int k = 0; k < m_NumInstances; k++) {
          colJ[k] = m_TrainInstances.instance(k).value(j);
        }
        double corr = Utils.correlation(colI, colJ, m_NumInstances);
        m_Correlation[i][j] = corr;
        m_Correlation[j][i] = corr;
      }
    }
  }

  /**
   * Transform an instance in original (unnormalized) format.
   * 
   * @param instance 	an instance in the original (unnormalized) format
   * @return 		a transformed instance
   * @throws Exception 	if instance can't be transformed
   */
  protected Instance convertInstance(Instance instance) throws Exception {
    double[] transformed = new double[m_OutputNumAtts];
    Instance processed = (Instance) instance.copy();

    // Push the copy through the same preprocessing chain that setup()
    // built: missing values, optional normalization, nominal-to-binary,
    // optional attribute removal.
    m_ReplaceMissingFilter.input(processed);
    m_ReplaceMissingFilter.batchFinished();
    processed = m_ReplaceMissingFilter.output();

    if (m_Normalize) {
      m_NormalizeFilter.input(processed);
      m_NormalizeFilter.batchFinished();
      processed = m_NormalizeFilter.output();
    }

    m_NominalToBinaryFilter.input(processed);
    m_NominalToBinaryFilter.batchFinished();
    processed = m_NominalToBinaryFilter.output();

    if (m_AttributeFilter != null) {
      m_AttributeFilter.input(processed);
      m_AttributeFilter.batchFinished();
      processed = m_AttributeFilter.output();
    }

    // the class value is taken from the untouched input instance and
    // stored in the last slot
    if (m_HasClass) {
      transformed[m_OutputNumAtts - 1] = instance.value(instance.classIndex());
    }

    // a positive m_MaxAttributes limits how many components are visited
    int lowestRank =
      (m_MaxAttributes > 0) ? Math.max(0, m_NumAttribs - m_MaxAttributes) : 0;

    double covered = 0;
    for (int rank = m_NumAttribs - 1; rank >= lowestRank; rank--) {
      int column = m_SortedEigens[rank];

      // dot product of the preprocessed instance with the eigenvector
      double projection = 0.0;
      for (int att = 0; att < m_NumAttribs; att++) {
        projection += m_Eigenvectors[att][column] * processed.value(att);
      }

      transformed[m_NumAttribs - rank - 1] = projection;
      covered += m_Eigenvalues[column];

      // enough variance accounted for: remaining slots stay at 0
      if ((covered / m_SumOfEigenValues) >= m_CoverVariance) {
        break;
      }
    }

    // keep the sparse/dense representation of the incoming instance
    if (instance instanceof SparseInstance) {
      return new SparseInstance(instance.weight(), transformed);
    }
    return new DenseInstance(instance.weight(), transformed);
  }

  /**
   * Initializes the filter with the given input data.
   *
   * @param instances   the data to process
   * @throws Exception  in case the processing goes wrong
   * @see               #batchFinished()
   */
  protected void setup(Instances instances) throws Exception {
    m_TrainInstances = new Instances(instances);

    // keep a header-only copy so that the class attribute and relation
    // name are still available once the working copy has been filtered
    m_TrainCopy = new Instances(m_TrainInstances, 0);

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_ReplaceMissingFilter);

    if (m_Normalize) {
      m_NormalizeFilter = new Normalize();
      m_NormalizeFilter.setInputFormat(m_TrainInstances);
      m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NormalizeFilter);
    }

    m_NominalToBinaryFilter = new NominalToBinary();
    m_NominalToBinaryFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NominalToBinaryFilter);

    // Recompute the class state on every call. Previously m_HasClass was
    // only ever switched on, so a filter that had seen data with a class
    // kept reporting one after being re-initialised with class-less data.
    m_ClassIndex = m_TrainInstances.classIndex();
    m_HasClass   = (m_ClassIndex >= 0);

    // delete any attributes with only one distinct value or are all missing;
    // the class column is handled separately below so it is not listed twice
    Vector<Integer> deleteCols = new Vector<Integer>();
    for (int i = 0; i < m_TrainInstances.numAttributes(); i++) {
      if (i != m_ClassIndex && m_TrainInstances.numDistinctValues(i) <= 1) {
        deleteCols.addElement(i);
      }
    }

    // get rid of the class column
    if (m_HasClass) {
      deleteCols.addElement(m_ClassIndex);
    }

    // remove columns from the data if necessary
    if (deleteCols.size() > 0) {
      int[] todelete = new int[deleteCols.size()];
      for (int i = 0; i < todelete.length; i++) {
        todelete[i] = deleteCols.elementAt(i).intValue();
      }
      m_AttributeFilter = new Remove();
      m_AttributeFilter.setAttributeIndicesArray(todelete);
      m_AttributeFilter.setInvertSelection(false);
      m_AttributeFilter.setInputFormat(m_TrainInstances);
      m_TrainInstances = Filter.useFilter(m_TrainInstances, m_AttributeFilter);
    }

    // can the filter handle the processed data ? e.g., enough attributes?
    getCapabilities().testWithFail(m_TrainInstances);

    m_NumInstances = m_TrainInstances.numInstances();
    m_NumAttribs   = m_TrainInstances.numAttributes();

    fillCorrelation();

    // eigen decomposition of the correlation matrix
    Matrix corr = new Matrix(m_Correlation);
    EigenvalueDecomposition eig = corr.eig();
    Matrix V = eig.getV();

    // copy the eigenvector matrix into a plain array; the array is freshly
    // allocated here, so no additional clone is needed
    double[][] v = new double[m_NumAttribs][m_NumAttribs];
    for (int i = 0; i < m_NumAttribs; i++) {
      for (int j = 0; j < m_NumAttribs; j++) {
        v[i][j] = V.get(i, j);
      }
    }
    m_Eigenvectors = v;
    m_Eigenvalues  = (double[]) eig.getRealEigenvalues().clone();

    // any eigenvalues less than 0 are not worth anything --- change to 0
    for (int i = 0; i < m_Eigenvalues.length; i++) {
      if (m_Eigenvalues[i] < 0) {
        m_Eigenvalues[i] = 0.0;
      }
    }
    m_SortedEigens     = Utils.sort(m_Eigenvalues);
    m_SumOfEigenValues = Utils.sum(m_Eigenvalues);

    // the filtered training data supplies the attribute names for the
    // generated component names
    m_TransformedFormat = determineOutputFormat(m_TrainInstances);
    setOutputFormat(m_TransformedFormat);

    // the working copy is no longer needed once the model is built
    m_TrainInstances = null;
  }

  /**
   * Sets the format of the input instances.
   *
   * @param instanceInfo 	an Instances object containing the input 
   * 				instance structure (any instances contained 
   * 				in the object are ignored - only the structure 
   * 				is required).
   * @return 			true if the outputFormat may be collected 
   * 				immediately
   * @throws Exception 		if the input format can't be set successfully
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {
    super.setInputFormat(instanceInfo);

    // forget everything derived from a previous input format
    m_NominalToBinaryFilter = null;
    m_AttributeFilter       = null;
    m_Eigenvalues           = null;
    m_SumOfEigenValues      = 0.0;
    m_OutputNumAtts         = -1;

    // the output format is only known once the first batch has been seen
    final boolean outputFormatAvailable = false;
    return outputFormatAvailable;
  }

  /**
   * Input an instance for filtering. Filter requires all
   * training instances be read before producing output.
   *
   * @param instance 			the input instance
   * @return 				true if the filtered instance may now be
   * 					collected with output().
   * @throws IllegalStateException 	if no input format has been set
   * @throws Exception 			if conversion fails
   */
  public boolean input(Instance instance) throws Exception {
    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }

    if (isNewBatch()) {
      resetQueue();
      m_NewBatch = false;
    }

    // Until the first batch is complete, instances are only collected.
    if (!isFirstBatchDone()) {
      bufferInput(instance);
      return false;
    }

    // After the first batch, each instance is converted right away and
    // queued for output.
    Instance converted = convertInstance(instance);
    converted.setDataset(getOutputFormat());
    push(converted);
    return true;
  }

  /**
   * Signify that this batch of input to the filter is finished.
   *
   * @return true 			if there are instances pending output
   * @throws NullPointerException 	if no input structure has been defined,
   * @throws Exception 			if there was a problem finishing the batch.
   */
  public boolean batchFinished() throws Exception {
    if (getInputFormat() == null) {
      throw new NullPointerException("No input instance format defined");
    }

    Instances buffered = getInputFormat();

    // the first finished batch is the one the transformation is built from
    if (!isFirstBatchDone()) {
      setup(buffered);
    }

    // convert everything that was buffered and queue it for output
    int index = 0;
    while (index < buffered.numInstances()) {
      Instance converted = convertInstance(buffered.instance(index));
      converted.setDataset(getOutputFormat());
      push(converted);
      index++;
    }

    flushInput();
    m_FirstBatchDone = true;
    m_NewBatch       = true;

    boolean outputPending = (numPendingOutput() != 0);
    return outputPending;
  }
  
  /**
   * Returns the revision string.
   * 
   * @return		the revision
   */
  public String getRevision() {
    // the keyword string is passed to RevisionUtils.extract unchanged
    final String revisionKeyword = "$Revision: 5987 $";
    return RevisionUtils.extract(revisionKeyword);
  }

  /**
   * Main method for running this filter.
   *
   * @param args 	should contain arguments to the filter: use -h for help
   */
  public static void main(String[] args) {
    // hand a fresh filter instance and the raw arguments to runFilter
    PrincipalComponents filter = new PrincipalComponents();
    runFilter(filter, args);
  }
}