/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * PrincipalComponents.java * Copyright (C) 2007-2012 University of Waikato, Hamilton, New Zealand */ package weka.filters.unsupervised.attribute; import java.util.Enumeration; import java.util.Vector; import weka.core.Attribute; import weka.core.Capabilities; import weka.core.Capabilities.Capability; import weka.core.DenseInstance; import weka.core.FastVector; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.OptionHandler; import weka.core.RevisionUtils; import weka.core.SparseInstance; import weka.core.Utils; import weka.core.matrix.EigenvalueDecomposition; import weka.core.matrix.Matrix; import weka.filters.Filter; import weka.filters.UnsupervisedFilter; /** <!-- globalinfo-start --> * Performs a principal components analysis and transformation of the data.<br/> * Dimensionality reduction is accomplished by choosing enough eigenvectors to account for some percentage of the variance in the original data -- default 0.95 (95%).<br/> * Based on code of the attribute selection scheme 'PrincipalComponents' by Mark Hall and Gabi Schmidberger. * <p/> <!-- globalinfo-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -C * Center (rather than standardize) the * data and compute PCA using the covariance (rather * than the correlation) matrix.</pre> * * <pre> -R <num> * Retain enough PC attributes to account * for this proportion of variance in the original data. * (default: 0.95)</pre> * * <pre> -A <num> * Maximum number of attributes to include in * transformed attribute names. * (-1 = include all, default: 5)</pre> * * <pre> -M <num> * Maximum number of PC attributes to retain. * (-1 = include all, default: -1)</pre> * <!-- options-end --> * * @author Mark Hall (mhall@cs.waikato.ac.nz) -- attribute selection code * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) -- attribute selection code * @author fracpete (fracpete at waikato dot ac dot nz) -- filter code * @version $Revision: 8034 $ */ public class PrincipalComponents extends Filter implements OptionHandler, UnsupervisedFilter { /** for serialization. */ private static final long serialVersionUID = -5649876869480249303L; /** The data to transform analyse/transform. */ protected Instances m_TrainInstances; /** Keep a copy for the class attribute (if set). */ protected Instances m_TrainCopy; /** The header for the transformed data format. */ protected Instances m_TransformedFormat; /** Data has a class set. */ protected boolean m_HasClass; /** Class index. */ protected int m_ClassIndex; /** Number of attributes. */ protected int m_NumAttribs; /** Number of instances. */ protected int m_NumInstances; /** Correlation matrix for the original data. */ protected double[][] m_Correlation; /** * If true, center (rather than standardize) the data and * compute PCA from covariance (rather than correlation) * matrix. */ private boolean m_center = false; /** Will hold the unordered linear transformations of the (normalized) original data. */ protected double[][] m_Eigenvectors; /** Eigenvalues for the corresponding eigenvectors. */ protected double[] m_Eigenvalues = null; /** Sorted eigenvalues. */ protected int[] m_SortedEigens; /** sum of the eigenvalues. */ protected double m_SumOfEigenValues = 0.0; /** Filters for replacing missing values. */ protected ReplaceMissingValues m_ReplaceMissingFilter; /** Filter for turning nominal values into numeric ones. */ protected NominalToBinary m_NominalToBinaryFilter; /** Filter for removing class attribute, nominal attributes with 0 or 1 value. */ protected Remove m_AttributeFilter; /** Filter for standardizing the data */ protected Standardize m_standardizeFilter; /** Filter for centering the data */ protected Center m_centerFilter; /** The number of attributes in the pc transformed data. */ protected int m_OutputNumAtts = -1; /** the amount of varaince to cover in the original data when retaining the best n PC's. */ protected double m_CoverVariance = 0.95; /** maximum number of attributes in the transformed attribute name. */ protected int m_MaxAttrsInName = 5; /** maximum number of attributes in the transformed data (-1 for all). */ protected int m_MaxAttributes = -1; /** * Returns a string describing this filter. * * @return a description of the filter suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Performs a principal components analysis and transformation of " + "the data.\n" + "Dimensionality reduction is accomplished by choosing enough eigenvectors " + "to account for some percentage of the variance in the original data -- " + "default 0.95 (95%).\n" + "Based on code of the attribute selection scheme 'PrincipalComponents' " + "by Mark Hall and Gabi Schmidberger."; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. */ public Enumeration listOptions() { Vector result = new Vector(); result.addElement(new Option("\tCenter (rather than standardize) the" + "\n\tdata and compute PCA using the covariance (rather" + "\n\t than the correlation) matrix.", "C", 0, "-C")); result.addElement(new Option( "\tRetain enough PC attributes to account\n" +"\tfor this proportion of variance in the original data.\n" + "\t(default: 0.95)", "R", 1, "-R <num>")); result.addElement(new Option( "\tMaximum number of attributes to include in \n" + "\ttransformed attribute names.\n" + "\t(-1 = include all, default: 5)", "A", 1, "-A <num>")); result.addElement(new Option( "\tMaximum number of PC attributes to retain.\n" + "\t(-1 = include all, default: -1)", "M", 1, "-M <num>")); return result.elements(); } /** * Parses a list of options for this object. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -C * Center (rather than standardize) the * data and compute PCA using the covariance (rather * than the correlation) matrix.</pre> * * <pre> -R <num> * Retain enough PC attributes to account * for this proportion of variance in the original data. * (default: 0.95)</pre> * * <pre> -A <num> * Maximum number of attributes to include in * transformed attribute names. * (-1 = include all, default: 5)</pre> * * <pre> -M <num> * Maximum number of PC attributes to retain. * (-1 = include all, default: -1)</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String tmpStr; tmpStr = Utils.getOption('R', options); if (tmpStr.length() != 0) setVarianceCovered(Double.parseDouble(tmpStr)); else setVarianceCovered(0.95); tmpStr = Utils.getOption('A', options); if (tmpStr.length() != 0) setMaximumAttributeNames(Integer.parseInt(tmpStr)); else setMaximumAttributeNames(5); tmpStr = Utils.getOption('M', options); if (tmpStr.length() != 0) setMaximumAttributes(Integer.parseInt(tmpStr)); else setMaximumAttributes(-1); setCenterData(Utils.getFlag('C', options)); } /** * Gets the current settings of the filter. * * @return an array of strings suitable for passing to setOptions */ public String[] getOptions() { Vector<String> result; result = new Vector<String>(); result.add("-R"); result.add("" + getVarianceCovered()); result.add("-A"); result.add("" + getMaximumAttributeNames()); result.add("-M"); result.add("" + getMaximumAttributes()); if (getCenterData()) result.add("-C"); return result.toArray(new String[result.size()]); } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String centerDataTipText() { return "Center (rather than standardize) the data. PCA will " + "be computed from the covariance (rather than correlation) " + "matrix"; } /** * Set whether to center (rather than standardize) * the data. If set to true then PCA is computed * from the covariance rather than correlation matrix. * * @param center true if the data is to be * centered rather than standardized */ public void setCenterData(boolean center) { m_center = center; } /** * Get whether to center (rather than standardize) * the data. If true then PCA is computed * from the covariance rather than correlation matrix. * * @return true if the data is to be centered rather * than standardized. */ public boolean getCenterData() { return m_center; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String varianceCoveredTipText() { return "Retain enough PC attributes to account for this proportion of variance."; } /** * Sets the amount of variance to account for when retaining * principal components. * * @param value the proportion of total variance to account for */ public void setVarianceCovered(double value) { m_CoverVariance = value; } /** * Gets the proportion of total variance to account for when * retaining principal components. * * @return the proportion of variance to account for */ public double getVarianceCovered() { return m_CoverVariance; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String maximumAttributeNamesTipText() { return "The maximum number of attributes to include in transformed attribute names."; } /** * Sets maximum number of attributes to include in * transformed attribute names. * * @param value the maximum number of attributes */ public void setMaximumAttributeNames(int value) { m_MaxAttrsInName = value; } /** * Gets maximum number of attributes to include in * transformed attribute names. * * @return the maximum number of attributes */ public int getMaximumAttributeNames() { return m_MaxAttrsInName; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String maximumAttributesTipText() { return "The maximum number of PC attributes to retain."; } /** * Sets maximum number of PC attributes to retain. * * @param value the maximum number of attributes */ public void setMaximumAttributes(int value) { m_MaxAttributes = value; } /** * Gets maximum number of PC attributes to retain. * * @return the maximum number of attributes */ public int getMaximumAttributes() { return m_MaxAttributes; } /** * Returns the capabilities of this evaluator. * * @return the capabilities of this evaluator * @see Capabilities */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enable(Capability.NOMINAL_ATTRIBUTES); result.enable(Capability.NUMERIC_ATTRIBUTES); result.enable(Capability.DATE_ATTRIBUTES); result.enable(Capability.MISSING_VALUES); // class result.enable(Capability.NOMINAL_CLASS); result.enable(Capability.NUMERIC_CLASS); result.enable(Capability.DATE_CLASS); result.enable(Capability.MISSING_CLASS_VALUES); result.enable(Capability.NO_CLASS); return result; } /** * Determines the output format based on the input format and returns * this. In case the output format cannot be returned immediately, i.e., * immediateOutputFormat() returns false, then this method will be called * from batchFinished(). * * @param inputFormat the input format to base the output format on * @return the output format * @throws Exception in case the determination goes wrong * @see #hasImmediateOutputFormat() * @see #batchFinished() */ protected Instances determineOutputFormat(Instances inputFormat) throws Exception { double cumulative; FastVector attributes; int i; int j; StringBuffer attName; double[] coeff_mags; int num_attrs; int[] coeff_inds; double coeff_value; int numAttsLowerBound; if (m_Eigenvalues == null) return inputFormat; if (m_MaxAttributes > 0) numAttsLowerBound = m_NumAttribs - m_MaxAttributes; else numAttsLowerBound = 0; if (numAttsLowerBound < 0) numAttsLowerBound = 0; cumulative = 0.0; attributes = new FastVector(); for (i = m_NumAttribs - 1; i >= numAttsLowerBound; i--) { attName = new StringBuffer(); // build array of coefficients coeff_mags = new double[m_NumAttribs]; for (j = 0; j < m_NumAttribs; j++) coeff_mags[j] = -Math.abs(m_Eigenvectors[j][m_SortedEigens[i]]); num_attrs = (m_MaxAttrsInName > 0) ? Math.min(m_NumAttribs, m_MaxAttrsInName) : m_NumAttribs; // this array contains the sorted indices of the coefficients if (m_NumAttribs > 0) { // if m_maxAttrsInName > 0, sort coefficients by decreasing magnitude coeff_inds = Utils.sort(coeff_mags); } else { // if m_maxAttrsInName <= 0, use all coeffs in original order coeff_inds = new int[m_NumAttribs]; for (j = 0; j < m_NumAttribs; j++) coeff_inds[j] = j; } // build final attName string for (j = 0; j < num_attrs; j++) { coeff_value = m_Eigenvectors[coeff_inds[j]][m_SortedEigens[i]]; if (j > 0 && coeff_value >= 0) attName.append("+"); attName.append( Utils.doubleToString(coeff_value,5,3) + inputFormat.attribute(coeff_inds[j]).name()); } if (num_attrs < m_NumAttribs) attName.append("..."); attributes.addElement(new Attribute(attName.toString())); cumulative += m_Eigenvalues[m_SortedEigens[i]]; if ((cumulative / m_SumOfEigenValues) >= m_CoverVariance) break; } if (m_HasClass) attributes.addElement(m_TrainCopy.classAttribute().copy()); Instances outputFormat = new Instances( m_TrainCopy.relationName() + "_principal components", attributes, 0); // set the class to be the last attribute if necessary if (m_HasClass) outputFormat.setClassIndex(outputFormat.numAttributes() - 1); m_OutputNumAtts = outputFormat.numAttributes(); return outputFormat; } protected void fillCovariance() throws Exception { if (!m_center) { fillCorrelation(); return; } double[] att = new double[m_TrainInstances.numInstances()]; // now center the data by subtracting the mean m_centerFilter = new Center(); m_centerFilter.setInputFormat(m_TrainInstances); m_TrainInstances = Filter.useFilter(m_TrainInstances, m_centerFilter); // now compute the covariance matrix m_Correlation = new double[m_NumAttribs][m_NumAttribs]; for (int i = 0; i < m_NumAttribs; i++) { for (int j = 0; j < m_NumAttribs; j++) { double cov = 0; for (int k = 0; k < m_NumInstances; k++) { if (i == j) { cov += (m_TrainInstances.instance(k).value(i) * m_TrainInstances.instance(k).value(i)); } else { cov += (m_TrainInstances.instance(k).value(i) * m_TrainInstances.instance(k).value(j)); } } cov /= (double)(m_TrainInstances.numInstances() - 1); m_Correlation[i][j] = cov; m_Correlation[j][i] = cov; } } } /** * Fill the correlation matrix. */ protected void fillCorrelation() throws Exception { int i; int j; int k; double[] att1; double[] att2; double corr; m_Correlation = new double[m_NumAttribs][m_NumAttribs]; att1 = new double [m_NumInstances]; att2 = new double [m_NumInstances]; for (i = 0; i < m_NumAttribs; i++) { for (j = 0; j < m_NumAttribs; j++) { for (k = 0; k < m_NumInstances; k++) { att1[k] = m_TrainInstances.instance(k).value(i); att2[k] = m_TrainInstances.instance(k).value(j); } if (i == j) { m_Correlation[i][j] = 1.0; } else { corr = Utils.correlation(att1,att2,m_NumInstances); m_Correlation[i][j] = corr; m_Correlation[j][i] = corr; } } } // now standardize the input data m_standardizeFilter = new Standardize(); m_standardizeFilter.setInputFormat(m_TrainInstances); m_TrainInstances = Filter.useFilter(m_TrainInstances, m_standardizeFilter); } /** * Transform an instance in original (unormalized) format. * * @param instance an instance in the original (unormalized) format * @return a transformed instance * @throws Exception if instance can't be transformed */ protected Instance convertInstance(Instance instance) throws Exception { Instance result; double[] newVals; Instance tempInst; double cumulative; int i; int j; double tempval; int numAttsLowerBound; newVals = new double[m_OutputNumAtts]; tempInst = (Instance) instance.copy(); m_ReplaceMissingFilter.input(tempInst); m_ReplaceMissingFilter.batchFinished(); tempInst = m_ReplaceMissingFilter.output(); m_NominalToBinaryFilter.input(tempInst); m_NominalToBinaryFilter.batchFinished(); tempInst = m_NominalToBinaryFilter.output(); if (m_AttributeFilter != null) { m_AttributeFilter.input(tempInst); m_AttributeFilter.batchFinished(); tempInst = m_AttributeFilter.output(); } if (!m_center) { m_standardizeFilter.input(tempInst); m_standardizeFilter.batchFinished(); tempInst = m_standardizeFilter.output(); } else { m_centerFilter.input(tempInst); m_centerFilter.batchFinished(); tempInst = m_centerFilter.output(); } if (m_HasClass) newVals[m_OutputNumAtts - 1] = instance.value(instance.classIndex()); if (m_MaxAttributes > 0) numAttsLowerBound = m_NumAttribs - m_MaxAttributes; else numAttsLowerBound = 0; if (numAttsLowerBound < 0) numAttsLowerBound = 0; cumulative = 0; for (i = m_NumAttribs - 1; i >= numAttsLowerBound; i--) { tempval = 0.0; for (j = 0; j < m_NumAttribs; j++) tempval += m_Eigenvectors[j][m_SortedEigens[i]] * tempInst.value(j); newVals[m_NumAttribs - i - 1] = tempval; cumulative += m_Eigenvalues[m_SortedEigens[i]]; if ((cumulative / m_SumOfEigenValues) >= m_CoverVariance) break; } // create instance if (instance instanceof SparseInstance) result = new SparseInstance(instance.weight(), newVals); else result = new DenseInstance(instance.weight(), newVals); return result; } /** * Initializes the filter with the given input data. * * @param instances the data to process * @throws Exception in case the processing goes wrong * @see #batchFinished() */ protected void setup(Instances instances) throws Exception { int i; int j; Vector<Integer> deleteCols; int[] todelete; double[][] v; Matrix corr; EigenvalueDecomposition eig; Matrix V; m_TrainInstances = new Instances(instances); // make a copy of the training data so that we can get the class // column to append to the transformed data (if necessary) m_TrainCopy = new Instances(m_TrainInstances, 0); m_ReplaceMissingFilter = new ReplaceMissingValues(); m_ReplaceMissingFilter.setInputFormat(m_TrainInstances); m_TrainInstances = Filter.useFilter(m_TrainInstances, m_ReplaceMissingFilter); m_NominalToBinaryFilter = new NominalToBinary(); m_NominalToBinaryFilter.setInputFormat(m_TrainInstances); m_TrainInstances = Filter.useFilter(m_TrainInstances, m_NominalToBinaryFilter); // delete any attributes with only one distinct value or are all missing deleteCols = new Vector<Integer>(); for (i = 0; i < m_TrainInstances.numAttributes(); i++) { if (m_TrainInstances.numDistinctValues(i) <= 1) deleteCols.addElement(i); } if (m_TrainInstances.classIndex() >=0) { // get rid of the class column m_HasClass = true; m_ClassIndex = m_TrainInstances.classIndex(); deleteCols.addElement(new Integer(m_ClassIndex)); } // remove columns from the data if necessary if (deleteCols.size() > 0) { m_AttributeFilter = new Remove(); todelete = new int [deleteCols.size()]; for (i = 0; i < deleteCols.size(); i++) todelete[i] = ((Integer)(deleteCols.elementAt(i))).intValue(); m_AttributeFilter.setAttributeIndicesArray(todelete); m_AttributeFilter.setInvertSelection(false); m_AttributeFilter.setInputFormat(m_TrainInstances); m_TrainInstances = Filter.useFilter(m_TrainInstances, m_AttributeFilter); } // can evaluator handle the processed data ? e.g., enough attributes? getCapabilities().testWithFail(m_TrainInstances); m_NumInstances = m_TrainInstances.numInstances(); m_NumAttribs = m_TrainInstances.numAttributes(); //fillCorrelation(); fillCovariance(); // get eigen vectors/values corr = new Matrix(m_Correlation); eig = corr.eig(); V = eig.getV(); v = new double[m_NumAttribs][m_NumAttribs]; for (i = 0; i < v.length; i++) { for (j = 0; j < v[0].length; j++) v[i][j] = V.get(i, j); } m_Eigenvectors = (double[][]) v.clone(); m_Eigenvalues = (double[]) eig.getRealEigenvalues().clone(); // any eigenvalues less than 0 are not worth anything --- change to 0 for (i = 0; i < m_Eigenvalues.length; i++) { if (m_Eigenvalues[i] < 0) m_Eigenvalues[i] = 0.0; } m_SortedEigens = Utils.sort(m_Eigenvalues); m_SumOfEigenValues = Utils.sum(m_Eigenvalues); m_TransformedFormat = determineOutputFormat(m_TrainInstances); setOutputFormat(m_TransformedFormat); m_TrainInstances = null; } /** * Sets the format of the input instances. * * @param instanceInfo an Instances object containing the input * instance structure (any instances contained * in the object are ignored - only the structure * is required). * @return true if the outputFormat may be collected * immediately * @throws Exception if the input format can't be set successfully */ public boolean setInputFormat(Instances instanceInfo) throws Exception { super.setInputFormat(instanceInfo); m_Eigenvalues = null; m_OutputNumAtts = -1; m_AttributeFilter = null; m_NominalToBinaryFilter = null; m_SumOfEigenValues = 0.0; return false; } /** * Input an instance for filtering. Filter requires all * training instances be read before producing output. * * @param instance the input instance * @return true if the filtered instance may now be * collected with output(). * @throws IllegalStateException if no input format has been set * @throws Exception if conversion fails */ public boolean input(Instance instance) throws Exception { Instance inst; if (getInputFormat() == null) throw new IllegalStateException("No input instance format defined"); if (isNewBatch()) { resetQueue(); m_NewBatch = false; } if (isFirstBatchDone()) { inst = convertInstance(instance); inst.setDataset(getOutputFormat()); push(inst); return true; } else { bufferInput(instance); return false; } } /** * Signify that this batch of input to the filter is finished. * * @return true if there are instances pending output * @throws NullPointerException if no input structure has been defined, * @throws Exception if there was a problem finishing the batch. */ public boolean batchFinished() throws Exception { int i; Instances insts; Instance inst; if (getInputFormat() == null) throw new NullPointerException("No input instance format defined"); insts = getInputFormat(); if (!isFirstBatchDone()) setup(insts); for (i = 0; i < insts.numInstances(); i++) { inst = convertInstance(insts.instance(i)); inst.setDataset(getOutputFormat()); push(inst); } flushInput(); m_NewBatch = true; m_FirstBatchDone = true; return (numPendingOutput() != 0); } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 8034 $"); } /** * Main method for running this filter. * * @param args should contain arguments to the filter: use -h for help */ public static void main(String[] args) { runFilter(new PrincipalComponents(), args); } }