/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * RandomProjection.java * Copyright (C) 2003-2012 University of Waikato, Hamilton, New Zealand * */ package weka.filters.unsupervised.attribute; import java.util.Enumeration; import java.util.Random; import java.util.Vector; import weka.core.Attribute; import weka.core.Capabilities; import weka.core.Capabilities.Capability; import weka.core.DenseInstance; import weka.core.FastVector; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.OptionHandler; import weka.core.RevisionUtils; import weka.core.SelectedTag; import weka.core.Tag; import weka.core.TechnicalInformation; import weka.core.TechnicalInformation.Field; import weka.core.TechnicalInformation.Type; import weka.core.TechnicalInformationHandler; import weka.core.Utils; import weka.filters.Filter; import weka.filters.UnsupervisedFilter; /** <!-- globalinfo-start --> * Reduces the dimensionality of the data by projecting it onto a lower dimensional subspace using a random matrix with columns of unit length (i.e. It will reduce the number of attributes in the data while preserving much of its variation like PCA, but at a much less computational cost).<br/> * It first applies the NominalToBinary filter to convert all attributes to numeric before reducing the dimension. 
It preserves the class attribute.<br/>
 * <br/>
 * For more information, see:<br/>
 * <br/>
 * Dmitriy Fradkin, David Madigan: Experiments with random projections for machine learning. In: KDD '03: Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining, New York, NY, USA, 517-522, 2003.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;inproceedings{Fradkin2003,
 *    address = {New York, NY, USA},
 *    author = {Dmitriy Fradkin and David Madigan},
 *    booktitle = {KDD '03: Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining},
 *    pages = {517-522},
 *    publisher = {ACM Press},
 *    title = {Experiments with random projections for machine learning},
 *    year = {2003}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -N &lt;number&gt;
 *  The number of dimensions (attributes) the data should be reduced to
 *  (default 10; exclusive of the class attribute, if it is set).</pre>
 *
 * <pre> -D [SPARSE1|SPARSE2|GAUSSIAN]
 *  The distribution to use for calculating the random matrix.
 *  Sparse1 is:
 *    sqrt(3)*{-1 with prob(1/6), 0 with prob(2/3), +1 with prob(1/6)}
 *  Sparse2 is:
 *    {-1 with prob(1/2), +1 with prob(1/2)}
 * </pre>
 *
 * <pre> -P &lt;percent&gt;
 *  The percentage of dimensions (attributes) the data should
 *  be reduced to (exclusive of the class attribute, if it is set). This -N
 *  option is ignored if this option is present or is greater
 *  than zero.</pre>
 *
 * <pre> -M
 *  Replace missing values using the ReplaceMissingValues filter</pre>
 *
 * <pre> -R &lt;num&gt;
 *  The random seed for the random number generator used for
 *  calculating the random matrix (default 42).</pre>
 *
 <!-- options-end -->
 *
 * @author Ashraf M. Kibriya (amk14@cs.waikato.ac.nz)
 * @version $Revision: 8034 $ [1.0 - 22 July 2003 - Initial version (Ashraf M.
Kibriya)] */ public class RandomProjection extends Filter implements UnsupervisedFilter, OptionHandler, TechnicalInformationHandler { /** for serialization */ static final long serialVersionUID = 4428905532728645880L; /** Stores the number of dimensions to reduce the data to */ protected int m_k = 10; /** Stores the dimensionality the data should be reduced to as percentage of the original dimension */ protected double m_percent = 0.0; /** Is the random matrix will be computed using Gaussian distribution or not */ protected boolean m_useGaussian = false; /** distribution type: sparse 1 */ public static final int SPARSE1 = 1; /** distribution type: sparse 2 */ public static final int SPARSE2 = 2; /** distribution type: gaussian */ public static final int GAUSSIAN = 3; /** The types of distributions that can be used for calculating the random matrix */ public static final Tag [] TAGS_DSTRS_TYPE = { new Tag(SPARSE1, "Sparse1"), new Tag(SPARSE2, "Sparse2"), new Tag(GAUSSIAN, "Gaussian"), }; /** Stores the distribution to use for calculating the random matrix */ protected int m_distribution = SPARSE1; /** Should the missing values be replaced using unsupervised.ReplaceMissingValues filter */ protected boolean m_useReplaceMissing = false; /** Keeps track of output format if it is defined or not */ protected boolean m_OutputFormatDefined = false; /** The NominalToBinary filter applied to the data before this filter */ protected Filter m_ntob; // = new weka.filters.unsupervised.attribute.NominalToBinary(); /** The ReplaceMissingValues filter */ protected Filter m_replaceMissing; /** Stores the random seed used to generate the random matrix */ protected long m_rndmSeed = 42; /** The random matrix */ protected double m_rmatrix[][]; /** The random number generator used for generating the random matrix */ protected Random m_random; /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options. 
*/ public Enumeration listOptions() { Vector newVector = new Vector(2); newVector.addElement(new Option( "\tThe number of dimensions (attributes) the data should be reduced to\n" +"\t(default 10; exclusive of the class attribute, if it is set).", "N", 1, "-N <number>")); newVector.addElement(new Option( "\tThe distribution to use for calculating the random matrix.\n" +"\tSparse1 is:\n" +"\t sqrt(3)*{-1 with prob(1/6), 0 with prob(2/3), +1 with prob(1/6)}\n" +"\tSparse2 is:\n" +"\t {-1 with prob(1/2), +1 with prob(1/2)}\n", "D", 1, "-D [SPARSE1|SPARSE2|GAUSSIAN]")); //newVector.addElement(new Option( // "\tUse Gaussian distribution for calculating the random matrix.", // "G", 0, "-G")); newVector.addElement(new Option( "\tThe percentage of dimensions (attributes) the data should\n" +"\tbe reduced to (exclusive of the class attribute, if it is set). This -N\n" +"\toption is ignored if this option is present or is greater\n" +"\tthan zero.", "P", 1, "-P <percent>")); newVector.addElement(new Option( "\tReplace missing values using the ReplaceMissingValues filter", "M", 0, "-M")); newVector.addElement(new Option( "\tThe random seed for the random number generator used for\n" +"\tcalculating the random matrix (default 42).", "R", 0, "-R <num>")); return newVector.elements(); } /** * Parses a given list of options. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -N <number> * The number of dimensions (attributes) the data should be reduced to * (default 10; exclusive of the class attribute, if it is set).</pre> * * <pre> -D [SPARSE1|SPARSE2|GAUSSIAN] * The distribution to use for calculating the random matrix. * Sparse1 is: * sqrt(3)*{-1 with prob(1/6), 0 with prob(2/3), +1 with prob(1/6)} * Sparse2 is: * {-1 with prob(1/2), +1 with prob(1/2)} * </pre> * * <pre> -P <percent> * The percentage of dimensions (attributes) the data should * be reduced to (exclusive of the class attribute, if it is set). 
This -N * option is ignored if this option is present or is greater * than zero.</pre> * * <pre> -M * Replace missing values using the ReplaceMissingValues filter</pre> * * <pre> -R <num> * The random seed for the random number generator used for * calculating the random matrix (default 42).</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String mString = Utils.getOption('P', options); if (mString.length() != 0) { setPercent((double) Double.parseDouble(mString)); //setNumberOfAttributes((int) Integer.parseInt(mString)); } else { setPercent(0); mString = Utils.getOption('N', options); if (mString.length() != 0) setNumberOfAttributes(Integer.parseInt(mString)); else setNumberOfAttributes(10); } mString = Utils.getOption('R', options); if(mString.length()!=0) { setRandomSeed( Long.parseLong(mString) ); } mString = Utils.getOption('D', options); if(mString.length()!=0) { if(mString.equalsIgnoreCase("sparse1")) setDistribution( new SelectedTag(SPARSE1, TAGS_DSTRS_TYPE) ); else if(mString.equalsIgnoreCase("sparse2")) setDistribution( new SelectedTag(SPARSE2, TAGS_DSTRS_TYPE) ); else if(mString.equalsIgnoreCase("gaussian")) setDistribution( new SelectedTag(GAUSSIAN, TAGS_DSTRS_TYPE) ); } if(Utils.getFlag('M', options)) setReplaceMissingValues(true); else setReplaceMissingValues(false); //if(Utils.getFlag('G', options)) // setUseGaussian(true); //else // setUseGaussian(false); } /** * Gets the current settings of the filter. 
* * @return an array of strings suitable for passing to setOptions */ public String [] getOptions() { String [] options = new String [10]; int current = 0; //if (getUseGaussian()) { // options[current++] = "-G"; //} if (getReplaceMissingValues()) { options[current++] = "-M"; } if (getPercent() == 0) { options[current++] = "-N"; options[current++] = "" + getNumberOfAttributes(); } else { options[current++] = "-P"; options[current++] = "" + getPercent(); } options[current++] = "-R"; options[current++] = "" + getRandomSeed(); SelectedTag t = getDistribution(); options[current++] = "-D"; options[current++] = ""+t.getSelectedTag().getReadable(); while (current < options.length) { options[current++] = ""; } return options; } /** * Returns a string describing this filter * * @return a description of the filter suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Reduces the dimensionality of the data by projecting" + " it onto a lower dimensional subspace using a random" + " matrix with columns of unit length (i.e. It will reduce" + " the number of attributes in the data while preserving" + " much of its variation like PCA, but at a much less" + " computational cost).\n" + "It first applies the NominalToBinary filter to" + " convert all attributes to numeric before reducing the" + " dimension. It preserves the class attribute.\n\n" + "For more information, see:\n\n" + getTechnicalInformation().toString(); } /** * Returns an instance of a TechnicalInformation object, containing * detailed information about the technical background of this class, * e.g., paper reference or book this class is based on. 
* * @return the technical information about this class */ public TechnicalInformation getTechnicalInformation() { TechnicalInformation result; result = new TechnicalInformation(Type.INPROCEEDINGS); result.setValue(Field.AUTHOR, "Dmitriy Fradkin and David Madigan"); result.setValue(Field.TITLE, "Experiments with random projections for machine learning"); result.setValue(Field.BOOKTITLE, "KDD '03: Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining"); result.setValue(Field.YEAR, "003"); result.setValue(Field.PAGES, "517-522"); result.setValue(Field.PUBLISHER, "ACM Press"); result.setValue(Field.ADDRESS, "New York, NY, USA"); return result; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String numberOfAttributesTipText() { return "The number of dimensions (attributes) the data should" + " be reduced to."; } /** * Sets the number of attributes (dimensions) the data should be reduced to * * @param newAttNum the goal for the dimensions */ public void setNumberOfAttributes(int newAttNum) { m_k = newAttNum; } /** * Gets the current number of attributes (dimensionality) to which the data * will be reduced to. * * @return the number of dimensions */ public int getNumberOfAttributes() { return m_k; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String percentTipText() { return " The percentage of dimensions (attributes) the data should" + " be reduced to (inclusive of the class attribute). 
This " + " NumberOfAttributes option is ignored if this option is" + " present or is greater than zero."; } /** * Sets the percent the attributes (dimensions) of the data should be reduced to * * @param newPercent the percentage of attributes */ public void setPercent(double newPercent) { if(newPercent > 0) newPercent /= 100; m_percent = newPercent; } /** * Gets the percent the attributes (dimensions) of the data will be reduced to * * @return the percentage of attributes */ public double getPercent() { return m_percent * 100; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String randomSeedTipText() { return "The random seed used by the random" +" number generator used for generating" +" the random matrix "; } /** * Sets the random seed of the random number generator * * @param seed the random seed value */ public void setRandomSeed(long seed) { m_rndmSeed = seed; } /** * Gets the random seed of the random number generator * * @return the random seed value */ public long getRandomSeed() { return m_rndmSeed; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String distributionTipText() { return "The distribution to use for calculating the random matrix.\n" +"Sparse1 is:\n" +" sqrt(3) * { -1 with prob(1/6), \n" +" 0 with prob(2/3), \n" +" +1 with prob(1/6) } \n" +"Sparse2 is:\n" +" { -1 with prob(1/2), \n" +" +1 with prob(1/2) } "; } /** * Sets the distribution to use for calculating the random matrix * * @param newDstr the distribution to use */ public void setDistribution(SelectedTag newDstr) { if (newDstr.getTags() == TAGS_DSTRS_TYPE) { m_distribution = newDstr.getSelectedTag().getID(); } } /** * Returns the current distribution that'll be used for calculating the * random matrix * * @return the current distribution */ public SelectedTag getDistribution() 
{ return new SelectedTag(m_distribution, TAGS_DSTRS_TYPE); } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String replaceMissingValuesTipText() { return "If set the filter uses weka.filters.unsupervised.attribute.ReplaceMissingValues" + " to replace the missing values"; } /** * Sets either to use replace missing values filter or not * * @param t if true then the replace missing values is used */ public void setReplaceMissingValues(boolean t) { m_useReplaceMissing = t; } /** * Gets the current setting for using ReplaceMissingValues filter * * @return true if the replace missing values filter is used */ public boolean getReplaceMissingValues() { return m_useReplaceMissing; } /** * Returns the Capabilities of this filter. * * @return the capabilities of this object * @see Capabilities */ public Capabilities getCapabilities() { Capabilities result = super.getCapabilities(); result.disableAll(); // attributes result.enableAllAttributes(); result.enable(Capability.MISSING_VALUES); // class result.enableAllClasses(); result.enable(Capability.MISSING_CLASS_VALUES); result.enable(Capability.NO_CLASS); return result; } /** * Sets the format of the input instances. * * @param instanceInfo an Instances object containing the input * instance structure (any instances contained in the object are * ignored - only the structure is required). 
* @return true if the outputFormat may be collected immediately * @throws Exception if the input format can't be set * successfully */ public boolean setInputFormat(Instances instanceInfo) throws Exception { super.setInputFormat(instanceInfo); /* if (instanceInfo.classIndex() < 0) { throw new UnassignedClassException("No class has been assigned to the instances"); } */ for(int i=0; i<instanceInfo.numAttributes(); i++) { if( i!=instanceInfo.classIndex() && instanceInfo.attribute(i).isNominal() ) { if(instanceInfo.classIndex()>=0) m_ntob = new weka.filters.supervised.attribute.NominalToBinary(); else m_ntob = new weka.filters.unsupervised.attribute.NominalToBinary(); break; } } //r.setSeed(m_rndmSeed); //in case the setRandomSeed() is not //called we better set the seed to its //default value of 42. boolean temp=true; if(m_replaceMissing!=null) { m_replaceMissing = new weka.filters.unsupervised.attribute.ReplaceMissingValues(); if(m_replaceMissing.setInputFormat(instanceInfo)) temp=true; else temp=false; } if(m_ntob!=null) { if(m_ntob.setInputFormat(instanceInfo)) { setOutputFormat(); return temp && true; } else { return false; } } else { setOutputFormat(); return temp && true; } } /** * Input an instance for filtering. * * @param instance the input instance * @return true if the filtered instance may now be * collected with output(). 
* @throws IllegalStateException if no input format has been set */ public boolean input(Instance instance) throws Exception { Instance newInstance=null; if (getInputFormat()==null) { throw new IllegalStateException("No input instance format defined"); } if(m_NewBatch) { resetQueue(); //if(ntob!=null) // ntob.m_NewBatch=true; m_NewBatch = false; } boolean replaceDone=false; if(m_replaceMissing!=null) { if(m_replaceMissing.input(instance)) { if(m_OutputFormatDefined == false) setOutputFormat(); newInstance = m_replaceMissing.output(); replaceDone = true; } else return false;; } if(m_ntob!=null) { if(replaceDone==false) newInstance = instance; if(m_ntob.input(newInstance)) { if(m_OutputFormatDefined == false) setOutputFormat(); newInstance = m_ntob.output(); newInstance = convertInstance(newInstance); push(newInstance); return true; } else { return false; } } else { if(replaceDone==false) newInstance = instance; newInstance = convertInstance(newInstance); push(newInstance); return true; } } /** * Signify that this batch of input to the filter is finished. * * @return true if there are instances pending output * @throws NullPointerException if no input structure has been defined, * @throws Exception if there was a problem finishing the batch. 
*/ public boolean batchFinished() throws Exception { if (getInputFormat() == null) { throw new NullPointerException("No input instance format defined"); } boolean conversionDone=false; if(m_replaceMissing!=null) { if(m_replaceMissing.batchFinished()) { Instance newInstance, instance; while((instance=m_replaceMissing.output())!=null) { if(!m_OutputFormatDefined) setOutputFormat(); if(m_ntob!=null) { m_ntob.input(instance); } else { newInstance = convertInstance(instance); push(newInstance); } } if(m_ntob!=null) { if(m_ntob.batchFinished()) { //Instance newInstance, instance; while((instance=m_ntob.output())!=null) { if(!m_OutputFormatDefined) setOutputFormat(); newInstance = convertInstance(instance); push(newInstance); } m_ntob = null; } } m_replaceMissing = null; conversionDone=true; } } if(conversionDone==false && m_ntob!=null) { if(m_ntob.batchFinished()) { Instance newInstance, instance; while((instance=m_ntob.output())!=null) { if(!m_OutputFormatDefined) setOutputFormat(); newInstance = convertInstance(instance); push(newInstance); } m_ntob = null; } } m_OutputFormatDefined=false; return super.batchFinished(); } /** Sets the output format */ protected void setOutputFormat() { Instances currentFormat; if(m_ntob!=null) { currentFormat = m_ntob.getOutputFormat(); } else currentFormat = getInputFormat(); if(m_percent>0) { m_k = (int) ((getInputFormat().numAttributes()-1)*m_percent); // System.out.print("numAtts: "+currentFormat.numAttributes()); // System.out.print("percent: "+m_percent); // System.out.print("percent*numAtts: "+(currentFormat.numAttributes()*m_percent)); // System.out.println("m_k: "+m_k); } Instances newFormat; int newClassIndex=-1; FastVector attributes = new FastVector(); for(int i=0; i<m_k; i++) { attributes.addElement( new Attribute("K"+(i+1)) ); } if(currentFormat.classIndex()!=-1) { //if classindex is set //attributes.removeElementAt(attributes.size()-1); attributes.addElement(currentFormat.attribute(currentFormat.classIndex())); 
newClassIndex = attributes.size()-1; } newFormat = new Instances(currentFormat.relationName(), attributes, 0); if(newClassIndex!=-1) newFormat.setClassIndex(newClassIndex); m_OutputFormatDefined=true; m_random = new Random(); m_random.setSeed(m_rndmSeed); m_rmatrix = new double[m_k][currentFormat.numAttributes()]; if(m_distribution==GAUSSIAN) { for(int i=0; i<m_rmatrix.length; i++) for(int j=0; j<m_rmatrix[i].length; j++) m_rmatrix[i][j] = m_random.nextGaussian(); } else { boolean useDstrWithZero = (m_distribution==SPARSE1); for(int i=0; i<m_rmatrix.length; i++) for(int j=0; j<m_rmatrix[i].length; j++) m_rmatrix[i][j] = rndmNum(useDstrWithZero); } setOutputFormat(newFormat); } /** * converts a single instance to the required format * * @param currentInstance the instance to convert * @return the converted instance */ protected Instance convertInstance(Instance currentInstance) { Instance newInstance; double vals[] = new double[getOutputFormat().numAttributes()]; int classIndex = (m_ntob==null) ? 
getInputFormat().classIndex():m_ntob.getOutputFormat().classIndex(); for(int i = 0; i < m_k; i++) { vals[i] = computeRandomProjection(i,classIndex,currentInstance); } if (classIndex != -1) { vals[m_k] = currentInstance.value(classIndex); } newInstance = new DenseInstance(currentInstance.weight(), vals); newInstance.setDataset(getOutputFormat()); return newInstance; } /** * computes one random projection for a given instance (skip missing values) * * @param rpIndex offset the new random projection attribute * @param classIndex classIndex of the input instance * @param instance the instance to convert * @return the random sum */ protected double computeRandomProjection(int rpIndex, int classIndex, Instance instance) { double sum = 0.0; for(int i = 0; i < instance.numValues(); i++) { int index = instance.index(i); if (index != classIndex) { double value = instance.valueSparse(i); if (!Utils.isMissingValue(value)) { sum += m_rmatrix[rpIndex][index] * value; } } } return sum; } private static final int weights[] = {1, 1, 4}; private static final int vals[] = {-1, 1, 0}; private static final int weights2[] = {1, 1}; private static final int vals2[] = {-1, 1}; private static final double sqrt3 = Math.sqrt(3); /** * returns a double x such that <br/> * x = sqrt(3) * { -1 with prob. 1/6, 0 with prob. 2/3, 1 with prob. 1/6 } * * @param useDstrWithZero * @return the generated number */ protected double rndmNum(boolean useDstrWithZero) { if(useDstrWithZero) return sqrt3 * vals[weightedDistribution(weights)]; else return vals2[weightedDistribution(weights2)]; } /** * Calculates a weighted distribution * * @param weights the weights to use * @return */ protected int weightedDistribution(int [] weights) { int sum=0; for(int i=0; i<weights.length; i++) sum += weights[i]; int val = (int)Math.floor(m_random.nextDouble()*sum); for(int i=0; i<weights.length; i++) { val -= weights[i]; if(val<0) return i; } return -1; } /** * Returns the revision string. 
* * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 8034 $"); } /** * Main method for testing this class. * * @param argv should contain arguments to the filter: * use -h for help */ public static void main(String [] argv) { runFilter(new RandomProjection(), argv); } }