/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * SubspaceCluster.java * Copyright (C) 2001-2012 University of Waikato, Hamilton, New Zealand * */ package weka.datagenerators.clusterers; import java.util.Enumeration; import java.util.Random; import java.util.Vector; import weka.core.Attribute; import weka.core.DenseInstance; import weka.core.FastVector; import weka.core.Instance; import weka.core.Instances; import weka.core.Option; import weka.core.Range; import weka.core.RevisionUtils; import weka.core.Tag; import weka.core.Utils; import weka.datagenerators.ClusterDefinition; import weka.datagenerators.ClusterGenerator; /** <!-- globalinfo-start --> * A data generator that produces data points in hyperrectangular subspace clusters. * <p/> <!-- globalinfo-end --> * <!-- options-start --> * Valid options are: <p/> * * <pre> -h * Prints this help.</pre> * * <pre> -o <file> * The name of the output file, otherwise the generated data is * printed to stdout.</pre> * * <pre> -r <name> * The name of the relation.</pre> * * <pre> -d * Whether to print debug informations.</pre> * * <pre> -S * The seed for random function (default 1)</pre> * * <pre> -a <num> * The number of attributes (default 1).</pre> * * <pre> -c * Class Flag, if set, the cluster is listed in extra attribute.</pre> * * <pre> -b <range> * The indices for boolean attributes.</pre> * * <pre> -m <range> * The indices for nominal attributes.</pre> * * <pre> -P <num> * The noise rate in percent (default 0.0). * Can be between 0% and 30%. (Remark: The original * algorithm only allows noise up to 10%.)</pre> * * <pre> -C <cluster-definition> * A cluster definition of class 'SubspaceClusterDefinition' * (definition needs to be quoted to be recognized as * a single argument).</pre> * * <pre> * Options specific to weka.datagenerators.clusterers.SubspaceClusterDefinition: * </pre> * * <pre> -A <range> * Generates randomly distributed instances in the cluster.</pre> * * <pre> -U <range> * Generates uniformly distributed instances in the cluster.</pre> * * <pre> -G <range> * Generates gaussian distributed instances in the cluster.</pre> * * <pre> -D <num>,<num> * The attribute min/max (-A and -U) or mean/stddev (-G) for * the cluster.</pre> * * <pre> -N <num>..<num> * The range of number of instances per cluster (default 1..50).</pre> * * <pre> -I * Uses integer instead of continuous values (default continuous).</pre> * <!-- options-end --> * * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) * @author FracPete (fracpete at waikato dot ac dot nz) * @version $Revision: 8034 $ */ public class SubspaceCluster extends ClusterGenerator { /** for serialization */ static final long serialVersionUID = -3454999858505621128L; /** noise rate in percent (option P, between 0 and 30)*/ protected double m_NoiseRate; /** cluster list */ protected ClusterDefinition[] m_Clusters; /** if nominal, store number of values */ protected int[] m_numValues; /** store global min values */ protected double[] m_globalMinValue; /** store global max values */ protected double[] m_globalMaxValue; /** cluster type: uniform/random */ public static final int UNIFORM_RANDOM = 0; /** cluster type: total uniform */ public static final int TOTAL_UNIFORM = 1; /** cluster type: gaussian */ public static final int GAUSSIAN = 2; /** the tags for the cluster types */ public static final Tag[] TAGS_CLUSTERTYPE = { new Tag(UNIFORM_RANDOM, "uniform/random"), new Tag(TOTAL_UNIFORM, "total uniform"), new Tag(GAUSSIAN, "gaussian") }; /** cluster subtype: continuous */ public static final int CONTINUOUS = 0; /** cluster subtype: integer */ public static final int INTEGER = 1; /** the tags for the cluster types */ public static final Tag[] TAGS_CLUSTERSUBTYPE = { new Tag(CONTINUOUS, "continuous"), new Tag(INTEGER, "integer") }; /** * initializes the generator, sets the number of clusters to 0, since user * has to specify them explicitly */ public SubspaceCluster() { super(); setNoiseRate(defaultNoiseRate()); } /** * Returns a string describing this data generator. * * @return a description of the data generator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "A data generator that produces data points in " + "hyperrectangular subspace clusters."; } /** * Returns an enumeration describing the available options. * * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector result = enumToVector(super.listOptions()); result.addElement(new Option( "\tThe noise rate in percent (default " + defaultNoiseRate() + ").\n" + "\tCan be between 0% and 30%. (Remark: The original \n" + "\talgorithm only allows noise up to 10%.)", "P", 1, "-P <num>")); result.addElement(new Option( "\tA cluster definition of class '" + SubspaceClusterDefinition.class.getName().replaceAll(".*\\.", "") + "'\n" + "\t(definition needs to be quoted to be recognized as \n" + "\ta single argument).", "C", 1, "-C <cluster-definition>")); result.addElement(new Option( "", "", 0, "\nOptions specific to " + SubspaceClusterDefinition.class.getName() + ":")); result.addAll( enumToVector(new SubspaceClusterDefinition(this).listOptions())); return result.elements(); } /** * Parses a list of options for this object. <p/> * <!-- options-start --> * Valid options are: <p/> * * <pre> -h * Prints this help.</pre> * * <pre> -o <file> * The name of the output file, otherwise the generated data is * printed to stdout.</pre> * * <pre> -r <name> * The name of the relation.</pre> * * <pre> -d * Whether to print debug informations.</pre> * * <pre> -S * The seed for random function (default 1)</pre> * * <pre> -a <num> * The number of attributes (default 1).</pre> * * <pre> -c * Class Flag, if set, the cluster is listed in extra attribute.</pre> * * <pre> -b <range> * The indices for boolean attributes.</pre> * * <pre> -m <range> * The indices for nominal attributes.</pre> * * <pre> -P <num> * The noise rate in percent (default 0.0). * Can be between 0% and 30%. (Remark: The original * algorithm only allows noise up to 10%.)</pre> * * <pre> -C <cluster-definition> * A cluster definition of class 'SubspaceClusterDefinition' * (definition needs to be quoted to be recognized as * a single argument).</pre> * * <pre> * Options specific to weka.datagenerators.clusterers.SubspaceClusterDefinition: * </pre> * * <pre> -A <range> * Generates randomly distributed instances in the cluster.</pre> * * <pre> -U <range> * Generates uniformly distributed instances in the cluster.</pre> * * <pre> -G <range> * Generates gaussian distributed instances in the cluster.</pre> * * <pre> -D <num>,<num> * The attribute min/max (-A and -U) or mean/stddev (-G) for * the cluster.</pre> * * <pre> -N <num>..<num> * The range of number of instances per cluster (default 1..50).</pre> * * <pre> -I * Uses integer instead of continuous values (default continuous).</pre> * <!-- options-end --> * * @param options the list of options as an array of strings * @throws Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String tmpStr; SubspaceClusterDefinition cl; Vector list; int clCount; super.setOptions(options); m_numValues = new int[getNumAttributes()]; // numValues might be changed by a cluster definition // (only relevant for nominal data) for (int i = 0; i < getNumAttributes(); i++) m_numValues[i] = 1; tmpStr = Utils.getOption('P', options); if (tmpStr.length() != 0) setNoiseRate(Double.parseDouble(tmpStr)); else setNoiseRate(defaultNoiseRate()); // cluster definitions list = new Vector(); clCount = 0; do { tmpStr = Utils.getOption('C', options); if (tmpStr.length() != 0) { clCount++; cl = new SubspaceClusterDefinition(this); cl.setOptions(Utils.splitOptions(tmpStr)); list.add(cl); } } while (tmpStr.length() != 0); m_Clusters = (ClusterDefinition[]) list.toArray(new ClusterDefinition[list.size()]); // in case no cluster definition was provided, make sure that there's at // least one definition present -> see getClusters() getClusters(); } /** * Gets the current settings of the datagenerator. * * @return an array of strings suitable for passing to setOptions */ public String[] getOptions() { Vector result; String[] options; int i; result = new Vector(); options = super.getOptions(); for (i = 0; i < options.length; i++) result.add(options[i]); result.add("-P"); result.add("" + getNoiseRate()); for (i = 0; i < getClusters().length; i++) { result.add("-C"); result.add(Utils.joinOptions(getClusters()[i].getOptions())); } return (String[]) result.toArray(new String[result.size()]); } /** * returns the current cluster definitions, if necessary initializes them * * @return the current cluster definitions */ protected ClusterDefinition[] getClusters() { if ( (m_Clusters == null) || (m_Clusters.length == 0) ) { if (m_Clusters != null) System.out.println("NOTE: at least 1 cluster definition is necessary, " + "created default one."); m_Clusters = new ClusterDefinition[]{new SubspaceClusterDefinition(this)}; } return m_Clusters; } /** * returns the default number of attributes * * @return the default number of attributes */ protected int defaultNumAttributes() { return 1; } /** * Sets the number of attributes the dataset should have. * @param numAttributes the new number of attributes */ public void setNumAttributes(int numAttributes) { super.setNumAttributes(numAttributes); m_numValues = new int[getNumAttributes()]; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String numAttributesTipText() { return "The number of attributes the generated data will contain (Note: they must be covered by the cluster definitions!)"; } /** * returns the default noise rate * * @return the default noise rate */ protected double defaultNoiseRate() { return 0.0; } /** * Gets the percentage of noise set. * * @return the percentage of noise set */ public double getNoiseRate() { return m_NoiseRate; } /** * Sets the percentage of noise set. * * @param newNoiseRate new percentage of noise */ public void setNoiseRate(double newNoiseRate) { m_NoiseRate = newNoiseRate; } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String noiseRateTipText() { return "The noise rate to use."; } /** * returns the currently set clusters * * @return the currently set clusters */ public ClusterDefinition[] getClusterDefinitions() { return getClusters(); } /** * sets the clusters to use * * @param value the clusters do use * @throws Exception if clusters are not the correct class */ public void setClusterDefinitions(ClusterDefinition[] value) throws Exception { String indexStr; indexStr = ""; m_Clusters = value; for (int i = 0; i < getClusters().length; i++) { if (!(getClusters()[i] instanceof SubspaceClusterDefinition)) { if (indexStr.length() != 0) indexStr += ","; indexStr += "" + (i+1); } getClusters()[i].setParent(this); getClusters()[i].setOptions(getClusters()[i].getOptions()); // for initializing! } // any wrong classes encountered? if (indexStr.length() != 0) throw new Exception("These cluster definitions are not '" + SubspaceClusterDefinition.class.getName() + "': " + indexStr); } /** * Returns the tip text for this property * * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String clusterDefinitionsTipText() { return "The clusters to use."; } /** * Checks, whether all attributes are covered by cluster definitions and * returns TRUE in that case. * * @return whether all attributes are covered */ protected boolean checkCoverage() { int i; int n; int[] count; Range r; String attrIndex; SubspaceClusterDefinition cl; // check whether all the attributes are covered count = new int[getNumAttributes()]; for (i = 0; i < getNumAttributes(); i++) { for (n = 0; n < getClusters().length; n++) { cl = (SubspaceClusterDefinition) getClusters()[n]; r = new Range(cl.getAttrIndexRange()); r.setUpper(getNumAttributes()); if (r.isInRange(i)) count[i]++; } } // list all indices that are not covered attrIndex = ""; for (i = 0; i < count.length; i++) { if (count[i] == 0) { if (attrIndex.length() != 0) attrIndex += ","; attrIndex += (i+1); } } if (attrIndex.length() != 0) throw new IllegalArgumentException( "The following attributes are not covered by a cluster " + "definition: " + attrIndex + "\n"); return true; } /** * Gets the single mode flag. * * @return true if methode generateExample can be used. */ public boolean getSingleModeFlag() { return false; } /** * Initializes the format for the dataset produced. * * @return the output data format * @throws Exception data format could not be defined */ public Instances defineDataFormat() throws Exception { // initialize setOptions(getOptions()); checkCoverage(); Random random = new Random (getSeed()); setRandom(random); Instances dataset; FastVector attributes = new FastVector(3); Attribute attribute; boolean classFlag = getClassFlag(); FastVector classValues = null; if (classFlag) classValues = new FastVector(getClusters().length); FastVector boolValues = new FastVector(2); boolValues.addElement("false"); boolValues.addElement("true"); FastVector nomValues = null; // define dataset for (int i = 0; i < getNumAttributes(); i++) { // define boolean attribute if (m_booleanCols.isInRange(i)) { attribute = new Attribute("B" + i, boolValues); } else if (m_nominalCols.isInRange(i)) { // define nominal attribute nomValues = new FastVector(m_numValues[i]); for (int j = 0; j < m_numValues[i]; j++) nomValues.addElement("value-" + j); attribute = new Attribute("N" + i, nomValues); } else { // numerical attribute attribute = new Attribute("X" + i); } attributes.addElement(attribute); } if (classFlag) { for (int i = 0; i < getClusters().length; i++) classValues.addElement("c" + i); attribute = new Attribute ("class", classValues); attributes.addElement(attribute); } dataset = new Instances(getRelationNameToUse(), attributes, 0); if (classFlag) dataset.setClassIndex(m_NumAttributes); // set dataset format of this class Instances format = new Instances(dataset, 0); setDatasetFormat(format); for (int i = 0; i < getClusters().length; i++) { SubspaceClusterDefinition cl = (SubspaceClusterDefinition) getClusters()[i]; cl.setNumInstances(random); cl.setParent(this); } return dataset; } /** * Returns true if attribute is boolean *@param index of the attribute *@return true if the attribute is boolean */ public boolean isBoolean(int index) { return m_booleanCols.isInRange(index); } /** * Returns true if attribute is nominal *@param index of the attribute *@return true if the attribute is nominal */ public boolean isNominal(int index) { return m_nominalCols.isInRange(index); } /** * returns array that stores the number of values for a nominal attribute. * * @return the array that stores the number of values for a nominal attribute */ public int[] getNumValues() { return m_numValues; } /** * Generate an example of the dataset. * @return the instance generated * @throws Exception if format not defined or generating <br/> * examples one by one is not possible, because voting is chosen */ public Instance generateExample() throws Exception { throw new Exception("Examples cannot be generated one by one."); } /** * Generate all examples of the dataset. * @return the instance generated * @throws Exception if format not defined */ public Instances generateExamples() throws Exception { Instances format = getDatasetFormat(); Instance example = null; if (format == null) throw new Exception("Dataset format not defined."); // generate examples for one cluster after another for (int cNum = 0; cNum < getClusters().length; cNum++) { SubspaceClusterDefinition cl = (SubspaceClusterDefinition) getClusters()[cNum]; //get the number of instances to create int instNum = cl.getNumInstances(); //class value is c + cluster number String cName = "c" + cNum; switch (cl.getClusterType().getSelectedTag().getID()) { case (UNIFORM_RANDOM): for (int i = 0; i < instNum; i++) { // generate example example = generateExample(format, getRandom(), cl, cName); if (example != null) format.add(example); } break; case (TOTAL_UNIFORM): // generate examples if (!cl.isInteger()) generateUniformExamples(format, instNum, cl, cName); else generateUniformIntegerExamples(format, instNum, cl, cName); break; case (GAUSSIAN): // generate examples generateGaussianExamples(format, instNum, getRandom(), cl, cName); break; } } return format; } /** * Generate an example of the dataset. * * @param format the dataset format * @param randomG the random number generator to use * @param cl the cluster definition * @param cName the class value * @return the generated instance */ private Instance generateExample( Instances format, Random randomG, SubspaceClusterDefinition cl, String cName) { boolean makeInteger = cl.isInteger(); int num = -1; Instance example = null; int numAtts = m_NumAttributes; if (getClassFlag()) numAtts++; example = new DenseInstance(numAtts); example.setDataset(format); boolean[] attributes = cl.getAttributes(); double[] minValue = cl.getMinValue(); double[] maxValue = cl.getMaxValue(); double value; int clusterI = -1; for (int i = 0; i < m_NumAttributes; i++) { if (attributes[i]) { clusterI++; num++; // boolean or nominal attribute if (isBoolean(i) || isNominal(i)) { if (minValue[clusterI] == maxValue[clusterI]) { value = minValue[clusterI]; } else { int numValues = (int)(maxValue[clusterI] - minValue[clusterI] + 1.0); value = randomG.nextInt(numValues); value += minValue[clusterI]; } } else { // numeric attribute value = randomG.nextDouble() * (maxValue[num] - minValue[num]) + minValue[num]; if (makeInteger) value = Math.round(value); } example.setValue(i, value); } else { example.setMissing(i); } } if (getClassFlag()) example.setClassValue(cName); return example; } /** * Generate examples for a uniform cluster dataset. * * @param format the dataset format * @param numInstances the number of instances to generator * @param cl the cluster definition * @param cName the class value */ private void generateUniformExamples( Instances format, int numInstances, SubspaceClusterDefinition cl, String cName) { Instance example = null; int numAtts = m_NumAttributes; if (getClassFlag()) numAtts++; example = new DenseInstance(numAtts); example.setDataset(format); boolean[] attributes = cl.getAttributes(); double[] minValue = cl.getMinValue(); double[] maxValue = cl.getMaxValue(); double[] diff = new double[minValue.length]; for (int i = 0; i < minValue.length; i++) diff[i] = (maxValue[i] - minValue[i]); for (int j = 0; j < numInstances; j++) { int num = -1; for (int i = 0; i < m_NumAttributes; i++) { if (attributes[i]) { num++; double value = minValue[num] + (diff[num] * (double)((double)j / (double)(numInstances - 1))); example.setValue(i, value); } else { example.setMissing(i); } } if (getClassFlag()) example.setClassValue(cName); format.add(example); } } /** * Generate examples for a uniform cluster dataset. * * @param format the dataset format * @param numInstances the number of instances to generator * @param cl the cluster definition * @param cName the class value */ private void generateUniformIntegerExamples( Instances format, int numInstances, SubspaceClusterDefinition cl, String cName) { Instance example = null; int numAtts = m_NumAttributes; if (getClassFlag()) numAtts++; example = new DenseInstance(numAtts); example.setDataset(format); boolean[] attributes = cl.getAttributes(); double[] minValue = cl.getMinValue(); double[] maxValue = cl.getMaxValue(); int[] minInt = new int[minValue.length]; int[] maxInt = new int[maxValue.length]; int[] intValue = new int[maxValue.length]; int[] numInt = new int[minValue.length]; int num = 1; for (int i = 0; i < minValue.length; i++) { minInt[i] = (int)Math.ceil(minValue[i]); maxInt[i] = (int)Math.floor(maxValue[i]); numInt[i] = (maxInt[i] - minInt[i] + 1); num = num * numInt[i]; } int numEach = numInstances / num; int rest = numInstances - numEach * num; // initialize with smallest values combination for (int i = 0; i < m_NumAttributes; i++) { if (attributes[i]) { example.setValue(i, (double)minInt[i]); intValue[i] = minInt[i]; } else { example.setMissing(i); } } if (getClassFlag()) example.setClassValue(cName); int added = 0; int attr = 0; // do while not added all do { // add all for one value combination for (int k = 0; k < numEach; k++) { format.add(example); example = (Instance) example.copy(); added++; } if (rest > 0) { format.add(example); example = (Instance) example.copy(); added++; rest--; } if (added >= numInstances) break; // switch to the next value combination boolean done = false; do { if (attributes[attr] && (intValue[attr] + 1 <= maxInt[attr])) { intValue[attr]++; done = true; } else { attr++; } } while (!done); example.setValue(attr, (double)intValue[attr]); } while (added < numInstances); } /** * Generate examples for a uniform cluster dataset. * * @param format the dataset format * @param numInstances the number of instances to generate * @param random the random number generator * @param cl the cluster definition * @param cName the class value */ private void generateGaussianExamples( Instances format, int numInstances, Random random, SubspaceClusterDefinition cl, String cName) { boolean makeInteger = cl.isInteger(); Instance example = null; int numAtts = m_NumAttributes; if (getClassFlag()) numAtts++; example = new DenseInstance(numAtts); example.setDataset(format); boolean[] attributes = cl.getAttributes(); double[] meanValue = cl.getMeanValue(); double[] stddevValue = cl.getStddevValue(); for (int j = 0; j < numInstances; j++) { int num = -1; for (int i = 0; i < m_NumAttributes; i++) { if (attributes[i]) { num++; double value = meanValue[num] + (random.nextGaussian() * stddevValue[num]); if (makeInteger) value = Math.round(value); example.setValue(i, value); } else { example.setMissing(i); } } if (getClassFlag()) example.setClassValue(cName); format.add(example); } } /** * Compiles documentation about the data generation after * the generation process * * @return string with additional information about generated dataset * @throws Exception no input structure has been defined */ public String generateFinished() throws Exception { return ""; } /** * Compiles documentation about the data generation before * the generation process * * @return string with additional information */ public String generateStart() { StringBuffer docu = new StringBuffer(); int sumInst = 0; for (int cNum = 0; cNum < getClusters().length; cNum++) { SubspaceClusterDefinition cl = (SubspaceClusterDefinition) getClusters()[cNum]; docu.append("%\n"); docu.append("% Cluster: c"+ cNum + " "); switch (cl.getClusterType().getSelectedTag().getID()) { case UNIFORM_RANDOM: docu.append("Uniform Random"); break; case TOTAL_UNIFORM: docu.append("Total Random"); break; case GAUSSIAN: docu.append("Gaussian"); break; } if (cl.isInteger()) { docu.append(" / INTEGER"); } docu.append("\n% ----------------------------------------------\n"); docu.append("%"+cl.attributesToString()); docu.append("\n% Number of Instances: " + cl.getInstNums() + "\n"); docu.append( "% Generated Number of Instances: " + cl.getNumInstances() + "\n"); sumInst += cl.getNumInstances(); } docu.append("%\n% ----------------------------------------------\n"); docu.append("% Total Number of Instances: " + sumInst + "\n"); docu.append("% in " + getClusters().length + " Cluster(s)\n%"); return docu.toString(); } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 8034 $"); } /** * Main method for testing this class. * * @param args should contain arguments for the data producer: */ public static void main(String[] args) { runDataGenerator(new SubspaceCluster(), args); } }