/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * PrincipalComponents.java
 * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand
 */

package weka.filters.unsupervised.attribute;

import weka.core.*;
import weka.core.Capabilities.Capability;
import weka.core.matrix.EigenvalueDecomposition;
import weka.core.matrix.Matrix;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;

import java.util.Enumeration;
import java.util.Vector;

/**
 <!-- globalinfo-start -->
 * Performs a principal components analysis and transformation of the data.<br/>
 * Dimensionality reduction is accomplished by choosing enough eigenvectors to account for some percentage of the variance in the original data -- default 0.95 (95%).<br/>
 * Based on code of the attribute selection scheme 'PrincipalComponents' by Mark Hall and Gabi Schmidberger.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -D
 *  Don't normalize input data.</pre>
 *
 * <pre> -R &lt;num&gt;
 *  Retain enough PC attributes to account
 *  for this proportion of variance in the original data.
 *  (default: 0.95)</pre>
 *
 * <pre> -A &lt;num&gt;
 *  Maximum number of attributes to include in
 *  transformed attribute names.
 *  (-1 = include all, default: 5)</pre>
 *
 * <pre> -M &lt;num&gt;
 *  Maximum number of PC attributes to retain.
 *  (-1 = include all, default: -1)</pre>
 *
 <!-- options-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz) -- attribute selection code
 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) -- attribute selection code
 * @author fracpete (fracpete at waikato dot ac dot nz) -- filter code
 * @version $Revision: 5987 $
 */
public class PrincipalComponents
  extends Filter
  implements OptionHandler, UnsupervisedFilter {

  /** for serialization. */
  private static final long serialVersionUID = 4626939780964387784L;

  /** The data to analyse/transform (released again at the end of setup). */
  protected Instances m_TrainInstances;

  /** Keep a copy of the header for the class attribute (if set). */
  protected Instances m_TrainCopy;

  /** The header for the transformed data format. */
  protected Instances m_TransformedFormat;

  /** Data has a class set. */
  protected boolean m_HasClass;

  /** Class index. */
  protected int m_ClassIndex;

  /** Number of attributes (after preprocessing and column removal). */
  protected int m_NumAttribs;

  /** Number of instances. */
  protected int m_NumInstances;

  /** Correlation matrix for the original data. */
  protected double[][] m_Correlation;

  /**
   * Will hold the unordered linear transformations of the (normalized)
   * original data.
   */
  protected double[][] m_Eigenvectors;

  /** Eigenvalues for the corresponding eigenvectors. */
  protected double[] m_Eigenvalues = null;

  /**
   * Indices into {@link #m_Eigenvalues} as returned by Utils.sort; the code
   * walks this array from its last element downwards.
   */
  protected int[] m_SortedEigens;

  /** Sum of the eigenvalues. */
  protected double m_SumOfEigenValues = 0.0;

  /** Filter for replacing missing values. */
  protected ReplaceMissingValues m_ReplaceMissingFilter;

  /** Filter for normalizing the data. */
  protected Normalize m_NormalizeFilter;

  /** Filter for turning nominal values into numeric ones. */
  protected NominalToBinary m_NominalToBinaryFilter;

  /**
   * Filter for removing the class attribute and attributes with at most
   * one distinct value.
   */
  protected Remove m_AttributeFilter;

  /** The number of attributes in the pc transformed data. */
  protected int m_OutputNumAtts = -1;

  /** Normalize the input data? */
  protected boolean m_Normalize = true;

  /**
   * The amount of variance to cover in the original data when retaining
   * the best n PC's.
   */
  protected double m_CoverVariance = 0.95;

  /** Maximum number of attributes in the transformed attribute name. */
  protected int m_MaxAttrsInName = 5;

  /** Maximum number of attributes in the transformed data (-1 for all). */
  protected int m_MaxAttributes = -1;

  /**
   * Returns a string describing this filter.
   *
   * @return a description of the filter suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return
        "Performs a principal components analysis and transformation of "
      + "the data.\n"
      + "Dimensionality reduction is accomplished by choosing enough eigenvectors "
      + "to account for some percentage of the variance in the original data -- "
      + "default 0.95 (95%).\n"
      + "Based on code of the attribute selection scheme 'PrincipalComponents' "
      + "by Mark Hall and Gabi Schmidberger.";
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector<Option> result = new Vector<Option>();

    result.addElement(new Option(
        "\tDon't normalize input data.",
        "D", 0, "-D"));

    result.addElement(new Option(
        "\tRetain enough PC attributes to account\n"
        + "\tfor this proportion of variance in the original data.\n"
        + "\t(default: 0.95)",
        "R", 1, "-R <num>"));

    result.addElement(new Option(
        "\tMaximum number of attributes to include in \n"
        + "\ttransformed attribute names.\n"
        + "\t(-1 = include all, default: 5)",
        "A", 1, "-A <num>"));

    result.addElement(new Option(
        "\tMaximum number of PC attributes to retain.\n"
        + "\t(-1 = include all, default: -1)",
        "M", 1, "-M <num>"));

    return result.elements();
  }

  /**
   * Parses a list of options for this object. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -D
   *  Don't normalize input data.</pre>
   *
   * <pre> -R &lt;num&gt;
   *  Retain enough PC attributes to account
   *  for this proportion of variance in the original data.
   *  (default: 0.95)</pre>
   *
   * <pre> -A &lt;num&gt;
   *  Maximum number of attributes to include in
   *  transformed attribute names.
   *  (-1 = include all, default: 5)</pre>
   *
   * <pre> -M &lt;num&gt;
   *  Maximum number of PC attributes to retain.
   *  (-1 = include all, default: -1)</pre>
   *
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String tmpStr;

    tmpStr = Utils.getOption('R', options);
    if (tmpStr.length() != 0) {
      setVarianceCovered(Double.parseDouble(tmpStr));
    } else {
      setVarianceCovered(0.95);
    }

    tmpStr = Utils.getOption('A', options);
    if (tmpStr.length() != 0) {
      setMaximumAttributeNames(Integer.parseInt(tmpStr));
    } else {
      setMaximumAttributeNames(5);
    }

    tmpStr = Utils.getOption('M', options);
    if (tmpStr.length() != 0) {
      setMaximumAttributes(Integer.parseInt(tmpStr));
    } else {
      setMaximumAttributes(-1);
    }

    setNormalize(!Utils.getFlag('D', options));
  }

  /**
   * Gets the current settings of the filter.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String[] getOptions() {
    Vector<String> result = new Vector<String>();

    result.add("-R");
    result.add("" + getVarianceCovered());

    result.add("-A");
    result.add("" + getMaximumAttributeNames());

    result.add("-M");
    result.add("" + getMaximumAttributes());

    if (!getNormalize()) {
      result.add("-D");
    }

    return result.toArray(new String[result.size()]);
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String normalizeTipText() {
    return "Normalize input data.";
  }

  /**
   * Set whether input data will be normalized.
   *
   * @param value true if input data is to be normalized
   */
  public void setNormalize(boolean value) {
    m_Normalize = value;
  }

  /**
   * Gets whether or not input data is to be normalized.
   *
   * @return true if input data is to be normalized
   */
  public boolean getNormalize() {
    return m_Normalize;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String varianceCoveredTipText() {
    return "Retain enough PC attributes to account for this proportion of variance.";
  }

  /**
   * Sets the amount of variance to account for when retaining
   * principal components.
   *
   * @param value the proportion of total variance to account for
   */
  public void setVarianceCovered(double value) {
    m_CoverVariance = value;
  }

  /**
   * Gets the proportion of total variance to account for when
   * retaining principal components.
   *
   * @return the proportion of variance to account for
   */
  public double getVarianceCovered() {
    return m_CoverVariance;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String maximumAttributeNamesTipText() {
    return "The maximum number of attributes to include in transformed attribute names.";
  }

  /**
   * Sets maximum number of attributes to include in
   * transformed attribute names.
   *
   * @param value the maximum number of attributes
   */
  public void setMaximumAttributeNames(int value) {
    m_MaxAttrsInName = value;
  }

  /**
   * Gets maximum number of attributes to include in
   * transformed attribute names.
   *
   * @return the maximum number of attributes
   */
  public int getMaximumAttributeNames() {
    return m_MaxAttrsInName;
  }

  /**
   * Returns the tip text for this property.
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String maximumAttributesTipText() {
    return "The maximum number of PC attributes to retain.";
  }

  /**
   * Sets maximum number of PC attributes to retain.
   *
   * @param value the maximum number of attributes
   */
  public void setMaximumAttributes(int value) {
    m_MaxAttributes = value;
  }

  /**
   * Gets maximum number of PC attributes to retain.
   *
   * @return the maximum number of attributes
   */
  public int getMaximumAttributes() {
    return m_MaxAttributes;
  }

  /**
   * Returns the capabilities of this filter.
   *
   * @return the capabilities of this filter
   * @see Capabilities
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.DATE_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    // class
    result.enable(Capability.NOMINAL_CLASS);
    result.enable(Capability.NUMERIC_CLASS);
    result.enable(Capability.DATE_CLASS);
    result.enable(Capability.MISSING_CLASS_VALUES);
    result.enable(Capability.NO_CLASS);

    return result;
  }

  /**
   * Computes the smallest index into {@link #m_SortedEigens} that may still
   * be retained, given the "maximum number of PC attributes" setting.
   *
   * @return <code>m_NumAttribs - m_MaxAttributes</code> clamped to 0 when a
   *         positive maximum is set, otherwise 0
   */
  private int retainedLowerBound() {
    if (m_MaxAttributes <= 0) {
      return 0;
    }
    return Math.max(0, m_NumAttribs - m_MaxAttributes);
  }

  /**
   * Determines the output format based on the input format and returns
   * this. If no eigenvalues have been computed yet, the input format is
   * returned unchanged.
   * <p/>
   * One numeric attribute is generated per retained component, walking
   * {@link #m_SortedEigens} from its last index downwards and stopping as
   * soon as the accumulated share of the eigenvalue sum reaches the
   * requested variance coverage. The attribute name lists the eigenvector
   * coefficients followed by the names of the corresponding input attributes.
   *
   * @param inputFormat the input format to base the output format on
   * @return            the output format
   * @throws Exception  in case the determination goes wrong
   * @see               #batchFinished()
   */
  protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
    if (m_Eigenvalues == null) {
      return inputFormat;
    }

    int numAttsLowerBound = retainedLowerBound();
    boolean limitNames = (m_MaxAttrsInName > 0);
    int numAttrsInName = limitNames
      ? Math.min(m_NumAttribs, m_MaxAttrsInName)
      : m_NumAttribs;

    double cumulative = 0.0;
    FastVector attributes = new FastVector();

    for (int i = m_NumAttribs - 1; i >= numAttsLowerBound; i--) {
      int eigenColumn = m_SortedEigens[i];

      // this array contains the indices of the coefficients in the order
      // in which they are written into the attribute name
      int[] coeffInds;
      if (limitNames) {
        // name is truncated: sort coefficients by decreasing magnitude
        // (magnitudes are negated before being handed to Utils.sort)
        double[] coeffMags = new double[m_NumAttribs];
        for (int j = 0; j < m_NumAttribs; j++) {
          coeffMags[j] = -Math.abs(m_Eigenvectors[j][eigenColumn]);
        }
        coeffInds = Utils.sort(coeffMags);
      } else {
        // no limit on the name: use all coefficients in original order
        coeffInds = new int[m_NumAttribs];
        for (int j = 0; j < m_NumAttribs; j++) {
          coeffInds[j] = j;
        }
      }

      // build final attribute name
      StringBuffer attName = new StringBuffer();
      for (int j = 0; j < numAttrsInName; j++) {
        double coeffValue = m_Eigenvectors[coeffInds[j]][eigenColumn];
        if (j > 0 && coeffValue >= 0) {
          attName.append("+");
        }
        attName.append(
            Utils.doubleToString(coeffValue, 5, 3)
            + inputFormat.attribute(coeffInds[j]).name());
      }
      if (numAttrsInName < m_NumAttribs) {
        attName.append("...");
      }

      attributes.addElement(new Attribute(attName.toString()));

      cumulative += m_Eigenvalues[eigenColumn];
      if ((cumulative / m_SumOfEigenValues) >= m_CoverVariance) {
        break;
      }
    }

    if (m_HasClass) {
      attributes.addElement(m_TrainCopy.classAttribute().copy());
    }

    Instances outputFormat =
      new Instances(
          m_TrainCopy.relationName() + "_principal components", attributes, 0);

    // set the class to be the last attribute if necessary
    if (m_HasClass) {
      outputFormat.setClassIndex(outputFormat.numAttributes() - 1);
    }

    m_OutputNumAtts = outputFormat.numAttributes();

    return outputFormat;
  }

  /**
   * Fills the correlation matrix from the (preprocessed) training data.
   * The diagonal is set to 1 and every off-diagonal pair is computed once
   * and mirrored.
   */
  protected void fillCorrelation() {
    m_Correlation = new double[m_NumAttribs][m_NumAttribs];
    double[] att1 = new double[m_NumInstances];
    double[] att2 = new double[m_NumInstances];

    for (int i = 0; i < m_NumAttribs; i++) {
      m_Correlation[i][i] = 1.0;

      // column i is the same for every j, so read it only once
      for (int k = 0; k < m_NumInstances; k++) {
        att1[k] = m_TrainInstances.instance(k).value(i);
      }

      // only j < i: the (higher index, lower index) argument order is the
      // one whose result ended up in the matrix before, so values are kept
      for (int j = 0; j < i; j++) {
        for (int k = 0; k < m_NumInstances; k++) {
          att2[k] = m_TrainInstances.instance(k).value(j);
        }
        double corr = Utils.correlation(att1, att2, m_NumInstances);
        m_Correlation[i][j] = corr;
        m_Correlation[j][i] = corr;
      }
    }
  }

  /**
   * Transform an instance in original (unnormalized) format.
   * <p/>
   * The instance is passed through the same preprocessing filters that were
   * set up on the training data and is then projected onto the retained
   * eigenvectors. The class value, if any, is copied to the last position.
   *
   * @param instance    an instance in the original (unnormalized) format
   * @return            a transformed instance
   * @throws Exception  if instance can't be transformed
   */
  protected Instance convertInstance(Instance instance) throws Exception {
    double[] newVals = new double[m_OutputNumAtts];
    Instance tempInst = (Instance) instance.copy();

    m_ReplaceMissingFilter.input(tempInst);
    m_ReplaceMissingFilter.batchFinished();
    tempInst = m_ReplaceMissingFilter.output();

    if (m_Normalize) {
      m_NormalizeFilter.input(tempInst);
      m_NormalizeFilter.batchFinished();
      tempInst = m_NormalizeFilter.output();
    }

    m_NominalToBinaryFilter.input(tempInst);
    m_NominalToBinaryFilter.batchFinished();
    tempInst = m_NominalToBinaryFilter.output();

    if (m_AttributeFilter != null) {
      m_AttributeFilter.input(tempInst);
      m_AttributeFilter.batchFinished();
      tempInst = m_AttributeFilter.output();
    }

    if (m_HasClass) {
      newVals[m_OutputNumAtts - 1] = instance.value(instance.classIndex());
    }

    int numAttsLowerBound = retainedLowerBound();
    double cumulative = 0;
    for (int i = m_NumAttribs - 1; i >= numAttsLowerBound; i--) {
      int eigenColumn = m_SortedEigens[i];

      double tempval = 0.0;
      for (int j = 0; j < m_NumAttribs; j++) {
        tempval += m_Eigenvectors[j][eigenColumn] * tempInst.value(j);
      }
      newVals[m_NumAttribs - i - 1] = tempval;

      // same stopping rule as in determineOutputFormat
      cumulative += m_Eigenvalues[eigenColumn];
      if ((cumulative / m_SumOfEigenValues) >= m_CoverVariance) {
        break;
      }
    }

    // create instance of the same flavour (sparse/dense) as the input
    Instance result;
    if (instance instanceof SparseInstance) {
      result = new SparseInstance(instance.weight(), newVals);
    } else {
      result = new DenseInstance(instance.weight(), newVals);
    }

    return result;
  }

  /**
   * Initializes the filter with the given input data: preprocesses the data,
   * computes the correlation matrix and its eigen decomposition, and sets
   * the output format.
   *
   * @param instances   the data to process
   * @throws Exception  in case the processing goes wrong
   * @see               #batchFinished()
   */
  protected void setup(Instances instances) throws Exception {
    m_TrainInstances = new Instances(instances);

    // make a copy of the training data so that we can get the class
    // column to append to the transformed data (if necessary)
    m_TrainCopy = new Instances(m_TrainInstances, 0);

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_ReplaceMissingFilter);

    if (m_Normalize) {
      m_NormalizeFilter = new Normalize();
      m_NormalizeFilter.setInputFormat(m_TrainInstances);
      m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NormalizeFilter);
    }

    m_NominalToBinaryFilter = new NominalToBinary();
    m_NominalToBinaryFilter.setInputFormat(m_TrainInstances);
    m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NominalToBinaryFilter);

    // delete any attributes with only one distinct value or are all missing
    Vector<Integer> deleteCols = new Vector<Integer>();
    for (int i = 0; i < m_TrainInstances.numAttributes(); i++) {
      if (m_TrainInstances.numDistinctValues(i) <= 1) {
        deleteCols.addElement(i);
      }
    }

    // recompute the flag on every setup so that a filter object that is
    // reused on class-less data does not keep a stale "true"
    m_HasClass = (m_TrainInstances.classIndex() >= 0);
    if (m_HasClass) {
      // get rid of the class column
      m_ClassIndex = m_TrainInstances.classIndex();
      deleteCols.addElement(m_ClassIndex);
    }

    // remove columns from the data if necessary
    if (deleteCols.size() > 0) {
      int[] todelete = new int[deleteCols.size()];
      for (int i = 0; i < todelete.length; i++) {
        todelete[i] = deleteCols.elementAt(i).intValue();
      }
      m_AttributeFilter = new Remove();
      m_AttributeFilter.setAttributeIndicesArray(todelete);
      m_AttributeFilter.setInvertSelection(false);
      m_AttributeFilter.setInputFormat(m_TrainInstances);
      m_TrainInstances = Filter.useFilter(m_TrainInstances, m_AttributeFilter);
    }

    // can filter handle the processed data ? e.g., enough attributes?
    getCapabilities().testWithFail(m_TrainInstances);

    m_NumInstances = m_TrainInstances.numInstances();
    m_NumAttribs   = m_TrainInstances.numAttributes();

    fillCorrelation();

    // get eigen vectors/values
    Matrix corr = new Matrix(m_Correlation);
    EigenvalueDecomposition eig = corr.eig();
    Matrix V = eig.getV();
    double[][] v = new double[m_NumAttribs][m_NumAttribs];
    for (int i = 0; i < m_NumAttribs; i++) {
      for (int j = 0; j < m_NumAttribs; j++) {
        v[i][j] = V.get(i, j);
      }
    }
    m_Eigenvectors = v;
    m_Eigenvalues  = eig.getRealEigenvalues().clone();

    // any eigenvalues less than 0 are not worth anything --- change to 0
    for (int i = 0; i < m_Eigenvalues.length; i++) {
      if (m_Eigenvalues[i] < 0) {
        m_Eigenvalues[i] = 0.0;
      }
    }
    m_SortedEigens     = Utils.sort(m_Eigenvalues);
    m_SumOfEigenValues = Utils.sum(m_Eigenvalues);

    m_TransformedFormat = determineOutputFormat(m_TrainInstances);
    setOutputFormat(m_TransformedFormat);

    // the training data is no longer needed once the format is known
    m_TrainInstances = null;
  }

  /**
   * Sets the format of the input instances and resets the state derived
   * from a previous run.
   *
   * @param instanceInfo an Instances object containing the input
   *                     instance structure (any instances contained
   *                     in the object are ignored - only the structure
   *                     is required).
   * @return             always false, the output format is only set by
   *                     setup, which is called from batchFinished
   * @throws Exception   if the input format can't be set successfully
   */
  public boolean setInputFormat(Instances instanceInfo) throws Exception {
    super.setInputFormat(instanceInfo);

    m_Eigenvalues           = null;
    m_OutputNumAtts         = -1;
    m_AttributeFilter       = null;
    m_NominalToBinaryFilter = null;
    m_SumOfEigenValues      = 0.0;

    return false;
  }

  /**
   * Input an instance for filtering. Filter requires all
   * training instances be read before producing output.
   *
   * @param instance    the input instance
   * @return            true if the filtered instance may now be
   *                    collected with output().
   * @throws IllegalStateException if no input format has been set
   * @throws Exception  if conversion fails
   */
  public boolean input(Instance instance) throws Exception {
    if (getInputFormat() == null) {
      throw new IllegalStateException("No input instance format defined");
    }

    if (isNewBatch()) {
      resetQueue();
      m_NewBatch = false;
    }

    if (!isFirstBatchDone()) {
      // still collecting the first batch
      bufferInput(instance);
      return false;
    }

    Instance inst = convertInstance(instance);
    inst.setDataset(getOutputFormat());
    push(inst);
    return true;
  }

  /**
   * Signify that this batch of input to the filter is finished. On the first
   * batch this initializes the filter from the buffered data; the buffered
   * instances are then converted and queued for output.
   *
   * @return true       if there are instances pending output
   * @throws NullPointerException if no input structure has been defined,
   * @throws Exception  if there was a problem finishing the batch.
   */
  public boolean batchFinished() throws Exception {
    if (getInputFormat() == null) {
      throw new NullPointerException("No input instance format defined");
    }

    Instances insts = getInputFormat();

    if (!isFirstBatchDone()) {
      setup(insts);
    }

    for (int i = 0; i < insts.numInstances(); i++) {
      Instance inst = convertInstance(insts.instance(i));
      inst.setDataset(getOutputFormat());
      push(inst);
    }

    flushInput();
    m_NewBatch       = true;
    m_FirstBatchDone = true;

    return (numPendingOutput() != 0);
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 5987 $");
  }

  /**
   * Main method for running this filter.
   *
   * @param args should contain arguments to the filter: use -h for help
   */
  public static void main(String[] args) {
    runFilter(new PrincipalComponents(), args);
  }
}