/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * SubspaceClusterDefinition.java
 * Copyright (C) 2005-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.datagenerators.clusterers;

import java.util.Enumeration;
import java.util.Random;
import java.util.StringTokenizer;
import java.util.Vector;

import weka.core.Option;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.Utils;
import weka.datagenerators.ClusterDefinition;
import weka.datagenerators.ClusterGenerator;

/**
 <!-- globalinfo-start -->
 * A single cluster for the SubspaceCluster datagenerator
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -A &lt;range&gt;
 *  Generates randomly distributed instances in the cluster.</pre>
 *
 * <pre> -U &lt;range&gt;
 *  Generates uniformly distributed instances in the cluster.</pre>
 *
 * <pre> -G &lt;range&gt;
 *  Generates gaussian distributed instances in the cluster.</pre>
 *
 * <pre> -D &lt;num&gt;,&lt;num&gt;
 *  The attribute min/max (-A and -U) or mean/stddev (-G) for
 *  the cluster.</pre>
 *
 * <pre> -N &lt;num&gt;..&lt;num&gt;
 *  The range of number of instances per cluster (default 1..50).</pre>
 *
 * <pre> -I
 *  Uses integer instead of continuous values (default continuous).</pre>
 *
 <!-- options-end -->
 *
 * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz)
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 8034 $
 * @see SubspaceCluster
 */
public class SubspaceClusterDefinition
  extends ClusterDefinition {

  /** for serialization */
  static final long serialVersionUID = 3135678125044007231L;

  /** cluster type */
  protected int m_clustertype;

  /** cluster subtypes */
  protected int m_clustersubtype;

  /** number of attributes the cluster is defined for */
  protected int m_numClusterAttributes;

  /** number of instances for this cluster */
  protected int m_numInstances;

  /** minimal number of instances for this cluster */
  protected int m_MinInstNum;

  /** maximal number of instances for this cluster */
  protected int m_MaxInstNum;

  /** range of attributes */
  protected Range m_AttrIndexRange;

  /** attributes of this cluster (only allocated once a parent is known) */
  protected boolean[] m_attributes;

  /** global indices of the attributes of the cluster */
  protected int[] m_attrIndices;

  /** ranges of each attribute (min); not used if gaussian */
  protected double[] m_minValue;

  /** ranges of each attribute (max); not used if gaussian */
  protected double[] m_maxValue;

  /** mean; only used if gaussian */
  protected double[] m_meanValue;

  /** standard deviation; only used if gaussian */
  protected double[] m_stddevValue;

  /**
   * initializes the cluster, without a parent cluster (necessary for GOE)
   */
  public SubspaceClusterDefinition() {
    super();
  }

  /**
   * initializes the cluster with default values
   *
   * @param parent    the datagenerator this cluster belongs to
   */
  public SubspaceClusterDefinition(ClusterGenerator parent) {
    super(parent);
  }

  /**
   * sets the default values
   *
   * @throws Exception if setting of defaults fails
   */
  protected void setDefaults() throws Exception {
    setClusterType(defaultClusterType());
    setClusterSubType(defaultClusterSubType());
    setMinInstNum(defaultMinInstNum());
    setMaxInstNum(defaultMaxInstNum());
    setAttrIndexRange(defaultAttrIndexRange());
    // setAttrIndexRange resets the counter to 0 when there is no parent;
    // the default values list holds exactly one min/max pair.
    m_numClusterAttributes = 1;
    setValuesList(defaultValuesList());
  }

  /**
   * Returns a string describing this data generator.
   *
   * @return a description of the data generator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "A single cluster for the SubspaceCluster datagenerator";
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {
    Vector<Option> result = new Vector<Option>();

    result.addElement(new Option(
          "\tGenerates randomly distributed instances in the cluster.",
          "A", 1, "-A <range>"));

    result.addElement(new Option(
          "\tGenerates uniformly distributed instances in the cluster.",
          "U", 1, "-U <range>"));

    result.addElement(new Option(
          "\tGenerates gaussian distributed instances in the cluster.",
          "G", 1, "-G <range>"));

    result.addElement(new Option(
          "\tThe attribute min/max (-A and -U) or mean/stddev (-G) for\n"
          + "\tthe cluster.",
          "D", 1, "-D <num>,<num>"));

    result.addElement(new Option(
          "\tThe range of number of instances per cluster (default "
          + defaultMinInstNum() + ".." + defaultMaxInstNum() + ").",
          "N", 1, "-N <num>..<num>"));

    result.addElement(new Option(
          "\tUses integer instead of continuous values (default continuous).",
          "I", 0, "-I"));

    return result.elements();
  }

  /**
   * Parses a list of options for this object. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   *
   * <pre> -A &lt;range&gt;
   *  Generates randomly distributed instances in the cluster.</pre>
   *
   * <pre> -U &lt;range&gt;
   *  Generates uniformly distributed instances in the cluster.</pre>
   *
   * <pre> -G &lt;range&gt;
   *  Generates gaussian distributed instances in the cluster.</pre>
   *
   * <pre> -D &lt;num&gt;,&lt;num&gt;
   *  The attribute min/max (-A and -U) or mean/stddev (-G) for
   *  the cluster.</pre>
   *
   * <pre> -N &lt;num&gt;..&lt;num&gt;
   *  The range of number of instances per cluster (default 1..50).</pre>
   *
   * <pre> -I
   *  Uses integer instead of continuous values (default continuous).</pre>
   *
   <!-- options-end -->
   *
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String tmpStr;
    String fromToStr = "";
    int typeCount = 0;

    // -A, -U and -G are mutually exclusive; each carries the attribute range
    tmpStr = Utils.getOption('A', options);
    if (tmpStr.length() != 0) {
      fromToStr = tmpStr;
      setClusterType(
          new SelectedTag(
            SubspaceCluster.UNIFORM_RANDOM, SubspaceCluster.TAGS_CLUSTERTYPE));
      typeCount++;
    }

    tmpStr = Utils.getOption('U', options);
    if (tmpStr.length() != 0) {
      fromToStr = tmpStr;
      setClusterType(
          new SelectedTag(
            SubspaceCluster.TOTAL_UNIFORM, SubspaceCluster.TAGS_CLUSTERTYPE));
      typeCount++;
    }

    tmpStr = Utils.getOption('G', options);
    if (tmpStr.length() != 0) {
      fromToStr = tmpStr;
      setClusterType(
          new SelectedTag(
            SubspaceCluster.GAUSSIAN, SubspaceCluster.TAGS_CLUSTERTYPE));
      typeCount++;
    }

    // default is uniform/random
    if (typeCount == 0) {
      setClusterType(
          new SelectedTag(
            SubspaceCluster.UNIFORM_RANDOM, SubspaceCluster.TAGS_CLUSTERTYPE));
    } else if (typeCount > 1) {
      throw new Exception("Only one cluster type can be specified!");
    }

    setAttrIndexRange(fromToStr);

    // -D is interpreted as mean/stddev pairs for gaussian clusters and as
    // min/max pairs otherwise
    tmpStr = Utils.getOption('D', options);
    if (isGaussian()) {
      if (tmpStr.length() != 0) {
        setMeanStddev(tmpStr);
      } else {
        setMeanStddev(defaultMeanStddev());
      }
    } else {
      if (tmpStr.length() != 0) {
        setValuesList(tmpStr);
      } else {
        setValuesList(defaultValuesList());
      }
    }

    tmpStr = Utils.getOption('N', options);
    if (tmpStr.length() != 0) {
      setInstNums(tmpStr);
    } else {
      setInstNums(defaultMinInstNum() + ".." + defaultMaxInstNum());
    }

    if (Utils.getFlag('I', options)) {
      setClusterSubType(
          new SelectedTag(
            SubspaceCluster.INTEGER, SubspaceCluster.TAGS_CLUSTERSUBTYPE));
    } else {
      setClusterSubType(
          new SelectedTag(
            SubspaceCluster.CONTINUOUS, SubspaceCluster.TAGS_CLUSTERSUBTYPE));
    }
  }

  /**
   * Gets the current settings of this cluster definition.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String[] getOptions() {
    Vector<String> result = new Vector<String>();

    if (isRandom()) {
      result.add("-A");
      result.add("" + getAttrIndexRange());
      result.add("-D");
      result.add("" + getValuesList());
    } else if (isUniform()) {
      result.add("-U");
      result.add("" + getAttrIndexRange());
      result.add("-D");
      result.add("" + getValuesList());
    } else if (isGaussian()) {
      result.add("-G");
      result.add("" + getAttrIndexRange());
      result.add("-D");
      result.add("" + getMeanStddev());
    }

    result.add("-N");
    result.add("" + getInstNums());

    if (m_clustersubtype == SubspaceCluster.INTEGER) {
      result.add("-I");
    }

    return result.toArray(new String[result.size()]);
  }

  /**
   * Make a string from the attributes list.
   *
   * @return the attributes as string; empty if no attributes have been
   *         resolved yet (i.e., no parent generator was available)
   */
  public String attributesToString() {
    StringBuilder text = new StringBuilder();

    // m_attributes is only allocated by setAttrIndexRange when a parent
    // generator is set; without it there is nothing to describe.
    if (m_attributes == null) {
      return text.toString();
    }

    // j indexes the per-cluster arrays, i the attributes of the whole dataset
    int j = 0;
    for (int i = 0; i < m_attributes.length; i++) {
      if (m_attributes[i]) {
        if (isGaussian()) {
          text.append(" Attribute: " + i);
          text.append(" Mean: " + m_meanValue[j]);
          text.append(" StdDev: " + m_stddevValue[j] + "\n%");
        } else {
          text.append(" Attribute: " + i);
          text.append(" Range: " + m_minValue[j]);
          text.append(" - " + m_maxValue[j] + "\n%");
        }
        j++;
      }
    }

    return text.toString();
  }

  /**
   * Make a string from the cluster features.
   *
   * @return the cluster features as string
   */
  public String toString() {
    StringBuilder text = new StringBuilder();
    text.append("attributes " + attributesToString() + "\n");
    text.append("number of instances " + getInstNums());
    return text.toString();
  }

  /**
   * sets the parent datagenerator this cluster belongs to
   *
   * @param parent      the parent datagenerator
   */
  public void setParent(SubspaceCluster parent) {
    super.setParent(parent);
    m_AttrIndexRange.setUpper(getParent().getNumAttributes());
  }

  /**
   * returns the default attribute index range
   *
   * @return the default attribute index range
   */
  protected String defaultAttrIndexRange() {
    return "1";
  }

  /**
   * Sets which attributes are used in the cluster.
   * If a parent generator is set, the attribute membership flags and the
   * mapping from cluster-local to dataset-wide attribute indices are
   * recomputed; otherwise only the range string is stored and the number
   * of cluster attributes is reset to 0.
   *
   * @param rangeList a string representing the list of attributes. Since
   * the string will typically come from a user, attributes are indexed from
   * 1. <br/>
   * eg: first-3,5,6-last
   */
  public void setAttrIndexRange(String rangeList) {
    m_numClusterAttributes = 0;
    if (m_AttrIndexRange == null) {
      m_AttrIndexRange = new Range();
    }
    m_AttrIndexRange.setRanges(rangeList);

    if (getParent() != null) {
      m_AttrIndexRange.setUpper(getParent().getNumAttributes());
      m_attributes = new boolean[getParent().getNumAttributes()];
      for (int i = 0; i < m_attributes.length; i++) {
        if (m_AttrIndexRange.isInRange(i)) {
          m_numClusterAttributes++;
          m_attributes[i] = true;
        } else {
          m_attributes[i] = false;
        }
      }

      // store translation from attr in cluster to attr in whole dataset
      m_attrIndices = new int[m_numClusterAttributes];
      int clusterI = -1;
      for (int i = 0; i < m_attributes.length; i++) {
        if (m_AttrIndexRange.isInRange(i)) {
          clusterI++;
          m_attrIndices[clusterI] = i;
        }
      }
    }
  }

  /**
   * returns the attribute range(s).
   *
   * @return the attribute range(s).
   */
  public String getAttrIndexRange() {
    return m_AttrIndexRange.getRanges();
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String attrIndexRangeTipText() {
    return "The attribute range(s).";
  }

  /**
   * returns the membership flags of the dataset attributes
   *
   * @return the internal array (not a copy); may be null if no parent
   *         generator has been set
   */
  public boolean[] getAttributes() {
    return m_attributes;
  }

  /**
   * returns the lower bounds of the cluster attributes
   *
   * @return the internal array (not a copy) of minimum values
   */
  public double[] getMinValue() {
    return m_minValue;
  }

  /**
   * returns the upper bounds of the cluster attributes
   *
   * @return the internal array (not a copy) of maximum values
   */
  public double[] getMaxValue() {
    return m_maxValue;
  }

  /**
   * returns the means of the cluster attributes
   *
   * @return the internal array (not a copy) of mean values
   */
  public double[] getMeanValue() {
    return m_meanValue;
  }

  /**
   * returns the standard deviations of the cluster attributes
   *
   * @return the internal array (not a copy) of standard deviations
   */
  public double[] getStddevValue() {
    return m_stddevValue;
  }

  /**
   * returns the number of instances last determined by
   * {@link #setNumInstances(Random)}
   *
   * @return the number of instances for this cluster
   */
  public int getNumInstances() {
    return m_numInstances;
  }

  /**
   * returns the default cluster type
   *
   * @return the default cluster type
   */
  protected SelectedTag defaultClusterType() {
    return new SelectedTag(
        SubspaceCluster.UNIFORM_RANDOM, SubspaceCluster.TAGS_CLUSTERTYPE);
  }

  /**
   * Gets the cluster type.
   *
   * @return the cluster type
   * @see SubspaceCluster#TAGS_CLUSTERTYPE
   */
  public SelectedTag getClusterType() {
    return new SelectedTag(m_clustertype, SubspaceCluster.TAGS_CLUSTERTYPE);
  }

  /**
   * Sets the cluster type. Values that do not belong to
   * {@link SubspaceCluster#TAGS_CLUSTERTYPE} are ignored.
   *
   * @param value the new cluster type.
   * @see SubspaceCluster#TAGS_CLUSTERTYPE
   */
  public void setClusterType(SelectedTag value) {
    if (value.getTags() == SubspaceCluster.TAGS_CLUSTERTYPE) {
      m_clustertype = value.getSelectedTag().getID();
    }
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String clusterTypeTipText() {
    return "The type of cluster to use.";
  }

  /**
   * returns the default cluster sub type
   *
   * @return the default cluster sub type
   */
  protected SelectedTag defaultClusterSubType() {
    return new SelectedTag(
        SubspaceCluster.CONTINUOUS, SubspaceCluster.TAGS_CLUSTERSUBTYPE);
  }

  /**
   * Gets the cluster sub type.
   *
   * @return the cluster sub type
   * @see SubspaceCluster#TAGS_CLUSTERSUBTYPE
   */
  public SelectedTag getClusterSubType() {
    return new SelectedTag(
        m_clustersubtype, SubspaceCluster.TAGS_CLUSTERSUBTYPE);
  }

  /**
   * Sets the cluster sub type. Values that do not belong to
   * {@link SubspaceCluster#TAGS_CLUSTERSUBTYPE} are ignored.
   *
   * @param value the new cluster sub type.
   * @see SubspaceCluster#TAGS_CLUSTERSUBTYPE
   */
  public void setClusterSubType(SelectedTag value) {
    if (value.getTags() == SubspaceCluster.TAGS_CLUSTERSUBTYPE) {
      m_clustersubtype = value.getSelectedTag().getID();
    }
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String clusterSubTypeTipText() {
    return "The sub-type of cluster to use.";
  }

  /**
   * checks, whether cluster type is random
   *
   * @return true if cluster type is random
   */
  public boolean isRandom() {
    return (m_clustertype == SubspaceCluster.UNIFORM_RANDOM);
  }

  /**
   * checks, whether cluster type is uniform
   *
   * @return true if cluster type is uniform
   */
  public boolean isUniform() {
    return (m_clustertype == SubspaceCluster.TOTAL_UNIFORM);
  }

  /**
   * checks, whether cluster type is gaussian
   *
   * @return true if cluster type is gaussian
   */
  public boolean isGaussian() {
    return (m_clustertype == SubspaceCluster.GAUSSIAN);
  }

  /**
   * checks, whether cluster sub type is continuous
   *
   * @return true if cluster sub type is continuous
   */
  public boolean isContinuous() {
    // the sub type lives in m_clustersubtype, not in m_clustertype
    return (m_clustersubtype == SubspaceCluster.CONTINUOUS);
  }

  /**
   * checks, whether cluster sub type is integer
   *
   * @return true if cluster sub type is integer
   */
  public boolean isInteger() {
    // the sub type lives in m_clustersubtype, not in m_clustertype
    return (m_clustersubtype == SubspaceCluster.INTEGER);
  }

  /**
   * Sets the upper and lower boundary for instances for this cluster.
   * If the string contains no "..", the single number is used for both
   * boundaries.
   *
   * @param fromTo  the string containing the upper and lower boundary for
   *                instances per cluster separated by ..
   * @throws NumberFormatException if a boundary is not a parsable integer
   */
  protected void setInstNums(String fromTo) {
    int i = fromTo.indexOf("..");
    if (i == -1) {
      i = fromTo.length();
    }

    String from = fromTo.substring(0, i);
    m_MinInstNum = Integer.parseInt(from);

    if (i < fromTo.length()) {
      String to = fromTo.substring(i + 2, fromTo.length());
      m_MaxInstNum = Integer.parseInt(to);
    } else {
      m_MaxInstNum = m_MinInstNum;
    }
  }

  /**
   * Get a string with the upper and lower boundary for the
   * number of instances for this cluster.
   *
   * @return the string containing the upper and lower boundary for
   *         instances per cluster separated by ..
   */
  protected String getInstNums() {
    return "" + m_MinInstNum + ".." + m_MaxInstNum;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  protected String instNumsTipText() {
    return "The lower and upper boundary for the number of instances in this cluster.";
  }

  /**
   * returns the default min number of instances
   *
   * @return the default min number of instances
   */
  protected int defaultMinInstNum() {
    return 1;
  }

  /**
   * Gets the lower boundary for instances per cluster.
   *
   * @return the lower boundary for instances per cluster
   */
  public int getMinInstNum() {
    return m_MinInstNum;
  }

  /**
   * Sets the lower boundary for instances per cluster.
   *
   * @param newMinInstNum new lower boundary for instances per cluster
   */
  public void setMinInstNum(int newMinInstNum) {
    m_MinInstNum = newMinInstNum;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String minInstNumTipText() {
    return "The lower boundary for instances per cluster.";
  }

  /**
   * returns the default max number of instances
   *
   * @return the default max number of instances
   */
  protected int defaultMaxInstNum() {
    return 50;
  }

  /**
   * Gets the upper boundary for instances per cluster.
   *
   * @return the upper boundary for instances per cluster
   */
  public int getMaxInstNum() {
    return m_MaxInstNum;
  }

  /**
   * Sets the upper boundary for instances per cluster.
   *
   * @param newMaxInstNum new upper boundary for instances per cluster
   */
  public void setMaxInstNum(int newMaxInstNum) {
    m_MaxInstNum = newMaxInstNum;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String maxInstNumTipText() {
    return "The upper boundary for instances per cluster.";
  }

  /**
   * Sets the real number of instances for this cluster: a value drawn
   * between the lower and upper boundary if the upper boundary is larger
   * than the lower one, the lower boundary otherwise.
   *
   * @param r random number generator
   */
  public void setNumInstances(Random r) {
    if (m_MaxInstNum > m_MinInstNum) {
      m_numInstances = (int) (r.nextDouble()
          * (m_MaxInstNum - m_MinInstNum) + m_MinInstNum);
    } else {
      m_numInstances = m_MinInstNum;
    }
  }

  /**
   * returns the default values list
   *
   * @return the default values list
   */
  protected String defaultValuesList() {
    return "1,10";
  }

  /**
   * Sets the ranges for each attribute. If a parent generator is set, the
   * ranges are additionally validated against boolean and nominal
   * attributes, and the parent's number of values per attribute is
   * enlarged where a nominal range requires it.
   *
   * @param fromToList the comma-separated list of min,max pairs, one pair
   *                   per attribute of the cluster
   * @throws Exception if values are not correct in number or value
   */
  public void setValuesList(String fromToList) throws Exception {
    m_minValue = new double[m_numClusterAttributes];
    m_maxValue = new double[m_numClusterAttributes];
    setValuesList(fromToList, m_minValue, m_maxValue, "D");
    SubspaceCluster parent = (SubspaceCluster) getParent();

    for (int i = 0; i < m_numClusterAttributes; i++) {
      if (m_minValue[i] > m_maxValue[i]) {
        throw new Exception("Min must be smaller than max.");
      }

      if (parent != null) {
        // boolean values are only 0.0 and 1.0
        if (parent.isBoolean(m_attrIndices[i])) {
          parent.getNumValues()[m_attrIndices[i]] = 2;
          if (((m_minValue[i] != 0.0) && (m_minValue[i] != 1.0))
              || ((m_maxValue[i] != 0.0) && (m_maxValue[i] != 1.0))) {
            throw new Exception("Ranges for boolean must be 0 or 1 only.");
          }
        }

        if (parent.isNominal(m_attrIndices[i])) {
          // nominal values: attributes range might have to be enlarged
          double rest = m_minValue[i] - Math.rint(m_minValue[i]);
          if (rest != 0.0) {
            throw new Exception(" Ranges for nominal must be integer");
          }
          rest = m_maxValue[i] - Math.rint(m_maxValue[i]);
          if (rest != 0.0) {
            throw new Exception("Ranges for nominal must be integer");
          }
          if (m_minValue[i] < 0.0) {
            throw new Exception("Range for nominal must start with number 0.0 or higher");
          }
          if (m_maxValue[i] + 1 > parent.getNumValues()[m_attrIndices[i]]) {
            // add new values to attribute
            // (actual format is not yet defined)
            parent.getNumValues()[m_attrIndices[i]] = (int) m_maxValue[i] + 1;
          }
        }
      }
    }
  }

  /**
   * returns the range for each attribute as string
   *
   * @return the comma-separated min,max pairs; empty if no ranges are set
   */
  public String getValuesList() {
    String result = "";

    if (m_minValue != null) {
      for (int i = 0; i < m_minValue.length; i++) {
        if (i > 0) {
          result += ",";
        }
        result += "" + m_minValue[i] + "," + m_maxValue[i];
      }
    }

    return result;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String valuesListTipText() {
    return "The range for each each attribute as string.";
  }

  /**
   * returns the default mean/stddev list
   *
   * @return the default mean/stddev list
   */
  protected String defaultMeanStddev() {
    return "0,1.0";
  }

  /**
   * Sets mean and standard deviation.
   *
   * @param meanstddev the comma-separated list of mean,stddev pairs, one
   *                   pair per attribute of the cluster
   * @throws Exception if values are not correct in number or value
   */
  public void setMeanStddev(String meanstddev) throws Exception {
    m_meanValue = new double[m_numClusterAttributes];
    m_stddevValue = new double[m_numClusterAttributes];
    setValuesList(meanstddev, m_meanValue, m_stddevValue, "D");
  }

  /**
   * returns the current mean/stddev setup
   *
   * @return the comma-separated mean,stddev pairs; empty if none are set
   */
  public String getMeanStddev() {
    String result = "";

    if (m_meanValue != null) {
      for (int i = 0; i < m_meanValue.length; i++) {
        if (i > 0) {
          result += ",";
        }
        result += "" + m_meanValue[i] + "," + m_stddevValue[i];
      }
    }

    return result;
  }

  /**
   * Returns the tip text for this property
   *
   * @return tip text for this property suitable for
   *         displaying in the explorer/experimenter gui
   */
  public String meanStddevTipText() {
    return "The mean and stddev, in case of gaussian.";
  }

  /**
   * Parses a comma-separated list of value pairs into the two given arrays:
   * the first value of each pair goes into <code>first</code>, the second
   * into <code>second</code>.
   *
   * @param fromToList the comma-separated list of value pairs
   * @param first the "from's"
   * @param second the "to's"
   * @param optionLetter the option, from which the list came
   * @throws Exception if values are not correct in number or value
   */
  public void setValuesList(String fromToList, double[] first, double[] second,
      String optionLetter) throws Exception {

    StringTokenizer tok = new StringTokenizer(fromToList, ",");
    if (tok.countTokens() != first.length + second.length) {
      throw new Exception(
          "Wrong number of values for option '-" + optionLetter + "'.");
    }

    int index = 0;
    while (tok.hasMoreTokens()) {
      first[index] = Double.parseDouble(tok.nextToken());
      second[index] = Double.parseDouble(tok.nextToken());
      index++;
    }
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 8034 $");
  }
}