/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * XMeans.java * Copyright (C) 2000 Mark Hall, Malcolm Ware, Gabi Schmidberger * */ package weka.clusterers; import java.io.*; import java.util.*; import weka.core.AlgVector; import weka.core.AttributeStats; import weka.core.KDTree; import weka.core.DistanceFunction; import weka.core.EuclideanDistance; import weka.core.Instances; import weka.core.Instance; import weka.core.Attribute; import weka.core.Utils; import weka.core.Option; import weka.core.OptionHandler; import weka.filters.Filter; import weka.filters.unsupervised.attribute.ReplaceMissingValues; /** * XMeans clustering class. * * X-Means is K-Means extended by an Improve-Structure part In this part of * the algorithm the centers are attempted to be split in its region. * The decision between the children of * each center and itself is done comparing the BIC-values of * the two structures. * See also D. Pelleg and A. Moore's paper 'X-means: Extending * K-means with Efficient Estimation of the Number of Clusters'. <p> * * Valid options are:<p> * * -I <max iterations> <br> * Maximum number of iterations in the overall loop (default = 1). <p> * * -M <max iterations> <br> * Maximum number of iterations in the kMeans loop in <br> * the Improve-Parameter part (default = 1000).<p> * * -J <max iterations> <br> * Maximum number of iterations in the kMeans loop for the splitted <br> * centroids in the Improve-Structure part (default = 1000).<p> * * -L <minimal number of clusters> <br> * Specify the number of clusters to start with.<p> * * -H <maximal number of clusters> <br> * Specify the maximal number of clusters.<p> * * -B <value> <br> * Distance value between true and false of binary attributes and <br> * "same" and "different" of nominal attributes (default = 1.0).<p> * * -K <kdtree class><br> * KDTrees class and its options (can only use the same distance function * as XMeans).<p> * * -C <cutoff factor> <br> * If none of the children are better, percentage of the best splits<br> * to be taken.<p> * * -D <distance function class> * Distance function class to be used (default = Euclidean distance) * * -N <file name> <br> * Input starting cluster centers from file (ARFF-format). <p> * * -O <file name> <br> * Output cluster centers to file (ARFF-format). <p> * * -S <seed> <br> * Specify random number seed. <p> * * -U <debuglevel> <br> * Set debuglevel. <p> * * -Y <file name> <br> * Used for debugging: Input random vektors from file. 
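<p>
 *
 * An illustrative command line (the data file name and the chosen option
 * values are placeholders only) that runs X-Means via Weka's clusterer
 * evaluation, searching between 2 and 6 clusters with 5 overall iterations,
 * might look like: <p>
 *
 * <pre>
 *   java weka.clusterers.XMeans -t data.arff -L 2 -H 6 -I 5 -S 42
 * </pre>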
<p> * * major TODOS: * * make BIC-Score replaceable by other scores * * @author Gabi Schmidberger <gabi@cs.waikato.ac.nz) * @author Mark Hall (mhall@cs.waikato.ac.nz) * @author Malcolm Ware <mfw4@cs.waikato.ac.nz) * @version $Revision: 1.1.1.1 $ * @see Clusterer * @see OptionHandler */ public class XMeans extends Clusterer implements OptionHandler { private AlgVector algv; // TODO just a trick /* training instances */ private Instances m_Instances = null; /* model information, should increase readability */ private Instances m_Model = null; /* replace missing values in training instances */ private ReplaceMissingValues m_ReplaceMissingFilter; /** * Distance value between true and false of binary attributes and * "same" and "different" of nominal attributes (default = 1.0). */ private double m_BinValue = 1.0; /* BIC-Score of the current model */ double m_Bic = Double.MIN_VALUE; /* Distortion */ double [] m_Mle = null; /* maximum overall iterations */ private int m_MaxIterations = 1; /** * maximum iterations to perform Kmeans part * if negative, iterations are not checked */ private int m_MaxKMeans = 1000; /* see above, but for kMeans of splitted clusters */ private int m_MaxKMeansForChildren = 1000; /* The actual number of clusters */ private int m_NumClusters = 2; /* min number of clusters to generate */ private int m_MinNumClusters = 2; /* max number of clusters to generate */ private int m_MaxNumClusters = 4; /** the distance function used */ private DistanceFunction m_DistanceF = null; /* cluster centers */ private Instances m_ClusterCenters; /* file name of the output file for the cluster centers */ String m_InputCenterFile = null; /*--> DebugVektors - USED FOR DEBUGGING */ /* input file for the random vektors --> USED FOR DEBUGGING */ Reader m_DebugVektorsInput = null; int m_DebugVektorsIndex = 0; Instances m_DebugVektors = null; /* file name of the input file for the random vektors */ String m_DebugVektorsFile = null; /* input file for the cluster centers */ Reader m_CenterInput = null; /* file name of the output file for the cluster centers */ String m_OutputCenterFile = null; /* output file for the cluster centers */ PrintWriter m_CenterOutput = null; /** * temporary variable holding cluster assignments while iterating */ private int [] m_ClusterAssignments; /* cutoff factor - percentage of splits done in Improve-Structure part only relevant, if all children lost */ double m_CutOffFactor = 0.5; /** * random seed */ private int m_Seed = 10; /** * Ranges of the universe of data, lowest value, highest value and width */ double [][] m_Ranges; /** * Index in ranges for LOW and HIGH and WIDTH */ public static int R_LOW = 0; public static int R_HIGH = 1; public static int R_WIDTH = 2; /** * KDTrees class if KDTrees are used */ private KDTree m_KDTree = null; /* counts iterations done in main loop */ private int m_IterationCount = 0; /* counter to say how often kMeans was stopped by loop counter */ private int m_KMeansStopped = 0; /* Number of splits prepared */ private int m_NumSplits = 0; /* Number of splits accepted (including cutoff factor decisions) */ private int m_NumSplitsDone = 0; /* Number of splits accepted just because of cutoff factor */ private int m_NumSplitsStillDone = 0; /** * level of debug output, 0 is no output. 
*/ private int m_DebugLevel = 0; public static int D_PRINTCENTERS = 1; // follows the splitting of the centers public static int D_FOLLOWSPLIT = 2; // have a closer look at converge children public static int D_CONVCHCLOSER = 3; // check on random vektors public static int D_RANDOMVEKTOR = 4; // check on kdtree public static int D_KDTREE = 5; // follow iterations public static int D_ITERCOUNT = 6; // functions were maybe misused public static int D_METH_MISUSE = 80; // for current debug public static int D_CURR = 88; public static int D_GENERAL = 99; // Flag: I'm debugging public boolean m_CurrDebugFlag = true; /** * Returns a string describing this clusterer * @return a description of the evaluator suitable for * displaying in the explorer/experimenter gui */ public String globalInfo() { return "Cluster data using the X-means algorithm" + ", as described in D. Pelleg and A. Moore's paper 'X-means: Extending" + " K-means with Efficient Estimation of the Number of Clusters'."; } /** * Function should be in the Instances class!! * * Initializes the minimum and maximum values * based on all instances. * * @param instList list of indexes */ public static double [][] initializeRanges(Instances instances, int[] instList) { int numAtt = instances.numAttributes(); double [][] ranges = new double [numAtt][3]; // initialize ranges using the first instance updateRangesFirst(instances.instance(instList[0]), numAtt, ranges); // update ranges, starting from the second for (int i = 1; i < instList.length; i++) { updateRanges(instances.instance(instList[i]), numAtt, ranges); } return ranges; } /** * Function should be in the Instances class!! * * Prints a range. * * @param ranges the ranges to print */ public static void printRanges(Instances model, double[][] ranges) { System.out.println("printRanges"); for (int j = 0; j < model.numAttributes(); j++) { System.out.print("Attribute "+ j +" LOW: " + ranges[j][R_LOW]); System.out.print(" HIGH: " + ranges[j][R_HIGH]); System.out.print(" WIDTH: " + ranges[j][R_WIDTH]); System.out.println(" "); } } /** * Function should be in the Instances class!! * * Used to initialize the ranges. For this the values * of the first instance is used to save time. * Sets low and high to the values of the first instance and * width to zero. * * @param instance the new instance * @param numAtt number of attributes in the model */ public static void updateRangesFirst(Instance instance, int numAtt, double[][] ranges) { for (int j = 0; j < numAtt; j++) { if (!instance.isMissing(j)) { ranges[j][R_LOW] = instance.value(j); ranges[j][R_HIGH] = instance.value(j); ranges[j][R_WIDTH] = 0.0; } else { // if value was missing ranges[j][R_LOW] = Double.MIN_VALUE; ranges[j][R_HIGH] = Double.MAX_VALUE; ranges[j][R_WIDTH] = 0.0; //todo?? } } } /** * Function should be in the Instances class!! * * Updates the minimum and maximum and width values for all the attributes * based on a new instance. 
* * @param instance the new instance * @param numAtt number of attributes in the model * @param ranges low, high and width values for all attributes */ public static void updateRanges(Instance instance, int numAtt, double [][] ranges) { // updateRangesFirst must have been called on ranges for (int j = 0; j < numAtt; j++) { double value = instance.value(j); if (!instance.isMissing(j)) { if (value < ranges[j][R_LOW]) { ranges[j][R_LOW] = value; ranges[j][R_WIDTH] = ranges[j][R_HIGH] - ranges[j][R_LOW]; } else { if (instance.value(j) > ranges[j][R_HIGH]) { ranges[j][R_HIGH] = value; ranges[j][R_WIDTH] = ranges[j][R_HIGH] - ranges[j][R_LOW]; } } } } } /** * Generates the X-Means clusterer. * * @param data set of instances serving as training data * @exception Exception if the clusterer has not been * generated successfully */ public void buildClusterer(Instances data) throws Exception { if (data.checkForStringAttributes()) { throw new Exception("Can't handle string attributes!"); } // replace missing values m_ReplaceMissingFilter = new ReplaceMissingValues(); m_ReplaceMissingFilter.setInputFormat(data); data = Filter.useFilter(data, m_ReplaceMissingFilter); m_Instances = data; // initialize random function Random random0 = new Random(m_Seed); // num of clusters to start with m_NumClusters = m_MinNumClusters; // set distance function to default if (m_DistanceF == null) { m_DistanceF = new EuclideanDistance(data); checkInstances(); } if (m_DistanceF != null) { checkInstances(); } // if (m_DebugVektorsFile != null) initDebugVektorsInput(); // make list of indexes for m_Instances int [] allInstList = new int[m_Instances.numInstances()]; for (int i = 0; i < m_Instances.numInstances(); i++) { allInstList[i] = i; } // prepare the min and max value m_Ranges = m_Instances.initializeRanges(allInstList); // set model used (just for convenience) m_Model = new Instances(m_Instances, 0); // produce the starting centers if (m_CenterInput != null) { // read centers from file m_ClusterCenters = new Instances(m_CenterInput); m_NumClusters = m_ClusterCenters.numInstances(); } else // makes the first centers randomly m_ClusterCenters = makeCentersRandomly(random0, m_Instances, m_NumClusters); PFD(D_FOLLOWSPLIT, "\n*** Starting centers "); for (int k = 0; k < m_ClusterCenters.numInstances(); k++) { PFD(D_FOLLOWSPLIT, "Center " + k + ": " + m_ClusterCenters.instance(k)); } PrCentersFD(D_PRINTCENTERS); boolean finished = false; Instances children; // builds up a KDTree if (m_KDTree != null) { m_KDTree.buildKDTree(m_Instances, m_Ranges); //PFD(D_KDTREE, tree.toString()); } // loop counter of main loop m_IterationCount = 0; // is true for the beginning of first loop, // might false because ass. is already done in the following loops boolean firstAssignment = true; /** * "finished" does get true as soon as: * 1. number of clusters gets >= m_MaxClusters, * 2. in the last round, none of the centers have been split * * if number of clusters is already >= m_MaxClusters * part 1 (= Improve-Params) is done at least once. */ while (!finished && !stopIteration(m_IterationCount, m_MaxIterations)) { /* ==================================================================== * 1. Improve-Params * conventional K-means */ PFD(D_FOLLOWSPLIT, "\nBeginning of main loop - centers:"); PrCentersFD(D_FOLLOWSPLIT); PFD(D_ITERCOUNT, "\n*** 1. Improve-Params " + m_IterationCount + ". 
time"); m_IterationCount++; // prepare to converge boolean converged = false; // initialize assignments to -1 m_ClusterAssignments = initAssignments(m_Instances.numInstances()); // stores a list of indexes of instances belonging to each center int [][] instOfCent = new int[m_ClusterCenters.numInstances()][]; // KMeans loop counter int kMeansIteration = 0; // converge in conventional K-means ---------------------------------- PFD(D_FOLLOWSPLIT, "\nConverge in K-Means:"); while (!converged && !stopKMeansIteration(kMeansIteration, m_MaxKMeans)) { kMeansIteration++; converged = true; // assign instances to centers ------------------------------------- converged = assignToCenters(m_KDTree, m_ClusterCenters, instOfCent, allInstList, m_ClusterAssignments, kMeansIteration); /*gabifor (int l = 0; l < instOfCent.length; l++) { System.out.println(" " + instOfCent[l].length + "+++"); for (int m = 0; m < instOfCent[l].length; m++) { OOPS(" " + instOfCent[l][m]); } }*/ PFD(D_FOLLOWSPLIT, "\nMain loop - Assign - centers:"); PrCentersFD(D_FOLLOWSPLIT); //System.out.println(" "); // compute new centers = centers of mass of points converged = recomputeCenters(m_ClusterCenters, // clusters instOfCent, // their instances m_Model); // model information PFD(D_FOLLOWSPLIT, "\nMain loop - Recompute - centers:"); PrCentersFD(D_FOLLOWSPLIT); } PFD(D_FOLLOWSPLIT, ""); PFD(D_FOLLOWSPLIT, "End of Part: 1. Improve-Params - conventional K-means"); /* for (int m = 0; m < instOfCent.length; m++) { System.out.println("Center "+m+" : "+ m_ClusterCenters.instance(m)); for (int n = 0; n < instOfCent[m].length; n++) { System.out.print(instOfCent[m][n]+", "); } PFD(D_FOLLOWSPLIT, ""); }*/ /** ===================================================================== * 2. Improve-Structur */ // BIC before split distortioning the centres m_Mle = distortion(instOfCent, m_ClusterCenters); m_Bic = calculateBIC(instOfCent, m_ClusterCenters, m_Mle); PFD(D_FOLLOWSPLIT, "m_Bic " + m_Bic); int currNumCent = m_ClusterCenters.numInstances(); Instances splitCenters = new Instances(m_ClusterCenters, currNumCent * 2); // store BIC values of parent and children double [] pbic = new double [currNumCent]; double [] cbic = new double [currNumCent]; // split each center for (int i = 0; i < currNumCent // this could help to optimize the algorithm // && currNumCent + numSplits <= m_MaxNumClusters ; i++) { PFD(D_FOLLOWSPLIT, "\nsplit center " + i + " " + m_ClusterCenters.instance(i)); Instance currCenter = m_ClusterCenters.instance(i); int [] currInstList = instOfCent[i]; int currNumInst = instOfCent[i].length; double currMLE = m_Mle[i]; // not enough instances; than continue with next if (currNumInst <= 2) { pbic[i] = Double.MAX_VALUE; cbic[i] = 0.0; // add center itself as dummy splitCenters.add(currCenter); splitCenters.add(currCenter); continue; } // split centers ---------------------------------------------- double variance = m_Mle[i] / (double)currNumInst; children = splitCenter(random0, currCenter, variance, m_Model); // initialize assignments to -1 int[] oneCentAssignments = initAssignments(currNumInst); int[][] instOfChCent = new int [2][]; // todo maybe split didn't work // converge the children -------------------------------------- converged = false; int kMeansForChildrenIteration = 0; PFD(D_FOLLOWSPLIT, "\nConverge, K-Means for children: " + i); while (!converged && !stopKMeansIteration(kMeansForChildrenIteration, m_MaxKMeansForChildren)) { kMeansForChildrenIteration++; converged = assignToCenters(children, instOfChCent, currInstList, 
oneCentAssignments); if (!converged) { recomputeCentersFast(children, instOfChCent, m_Model); } } // store new centers for later decision if they are taken splitCenters.add(children.instance(0)); splitCenters.add(children.instance(1)); PFD(D_FOLLOWSPLIT, "\nconverged cildren "); PFD(D_FOLLOWSPLIT, " " + children.instance(0)); PFD(D_FOLLOWSPLIT, " " + children.instance(1)); // compare parent and children model by their BIC-value pbic[i] = calculateBIC(currInstList, currCenter, m_Mle[i], m_Model); double [] chMLE = distortion(instOfChCent, children); cbic[i] = calculateBIC(instOfChCent, children, chMLE); } // end of loop over clusters // decide which one to split and make new list of cluster centers Instances newClusterCenters = null; newClusterCenters = newCentersAfterSplit(pbic, cbic, m_CutOffFactor, splitCenters); /** * Compare with before Improve-Structure */ int newNumClusters = newClusterCenters.numInstances(); if (newNumClusters != m_NumClusters) { PFD(D_FOLLOWSPLIT, "Compare with non-split"); // initialize assignments to -1 int [] newClusterAssignments = initAssignments(m_Instances.numInstances()); // stores a list of indexes of instances belonging to each center int [][] newInstOfCent = new int[newClusterCenters.numInstances()][]; // assign instances to centers ------------------------------------- converged = assignToCenters(m_KDTree, newClusterCenters, newInstOfCent, allInstList, newClusterAssignments, m_IterationCount); double [] newMle = distortion(newInstOfCent, newClusterCenters); double newBic = calculateBIC(newInstOfCent, newClusterCenters, newMle); PFD(D_FOLLOWSPLIT, "newBic " + newBic); if (newBic > m_Bic) { PFD(D_FOLLOWSPLIT, "*** decide for new clusters"); m_Bic = newBic; m_ClusterCenters = newClusterCenters; m_ClusterAssignments = newClusterAssignments; } else { PFD(D_FOLLOWSPLIT, "*** keep old clusters"); } } newNumClusters = m_ClusterCenters.numInstances(); // decide if finished: max num cluster reached // or last centers where not split at all if ((newNumClusters >= m_MaxNumClusters) || (newNumClusters == m_NumClusters)) { finished = true; } m_NumClusters = newNumClusters; } } /** * Checks for nominal attributes in the dataset. * Class attribute is ignored. * @param data * @return false if no nominal attributes are present */ public boolean checkForNominalAttributes(Instances data) { int i = 0; while (i < data.numAttributes()) { if ((i != data.classIndex()) && data.attribute(i++).isNominal()) { return true; } } return false; } /** * Set array of int, used to store assignments, to -1. * @param ass integer array used for storing assignments * @return integer array used for storing assignments */ private int [] initAssignments(int [] ass) { for (int i = 0; i < ass.length; i++) ass[i] = -1; return ass; } /** * Creates and initializes integer array, used to store assignments. * @param numInstances length of array used for assignments * @return integer array used for storing assignments */ private int [] initAssignments(int numInstances) { int [] ass = new int[numInstances]; for (int i = 0; i < numInstances; i++) ass[i] = -1; return ass; } /** * Creates and initializes boolean array. * @param len length of new array * @return the new array */ boolean [] initBoolArray(int len) { boolean[] boolArray = new boolean [len]; for (int i = 0; i < len; i++) { boolArray[i] = false; } return boolArray; } /** * Returns new center list. * * The following steps 1. and 2. both take care that the number of centers * does not exceed maxCenters. * * 1. 
Compare BIC values of parent and children and takes the one as * new centers which do win (= BIC-value is smaller). * * 2. If in 1. none of the children are chosen * && and cutoff factor is > 0 * cutoff factor is taken as the percentage of "best" centers that are * still taken. * @param pbic array of parents BIC-values * @param cbic array of childrens BIC-values * @param cutoffFactor cutoff factor * @param splitCenters all children * @return the new centers */ private Instances newCentersAfterSplit(double [] pbic, double [] cbic, double cutoffFactor, Instances splitCenters) { // store if split won boolean splitPerCutoff = false; boolean takeSomeAway = false; boolean [] splitWon = initBoolArray(m_ClusterCenters.numInstances()); int numToSplit = 0; Instances newCenters = null; // how many would be split, because the children have a better bic value for (int i = 0; i < cbic.length; i++) { if (cbic[i] > pbic[i]) { // decide for splitting ---------------------------------------- splitWon[i] = true; numToSplit++; PFD(D_FOLLOWSPLIT, "Center " + i + " decide for children"); } else { // decide for parents and finished stays true ----------------- PFD(D_FOLLOWSPLIT, "Center " + i + " decide for parent"); } } // no splits yet so split per cutoff factor if ((numToSplit == 0) && (cutoffFactor > 0)) { splitPerCutoff = true; // how many to split per cutoff factor numToSplit = (int) ((double) m_ClusterCenters.numInstances() * m_CutOffFactor); } // prepare indexes of values in ascending order double [] diff = new double [m_NumClusters]; for (int j = 0; j < diff.length; j++) { diff[j] = pbic[j] - cbic[j]; } int [] sortOrder = Utils.sort(diff); // check if maxNumClusters would be exceeded int possibleToSplit = m_MaxNumClusters - m_NumClusters; if (possibleToSplit > numToSplit) { // still enough possible, do the whole amount possibleToSplit = numToSplit; } else takeSomeAway = true; // prepare for splitting the one that are supposed to be split if (splitPerCutoff) { for (int j = 0; (j < possibleToSplit) && (cbic[sortOrder[j]] > 0.0); j++) { splitWon[sortOrder[j]] = true; } m_NumSplitsStillDone += possibleToSplit; } else { // take some splits away if max number of clusters would be exceeded if (takeSomeAway) { int count = 0; int j = 0; for (;j < splitWon.length && count < possibleToSplit; j++){ if (splitWon[sortOrder[j]] == true) count++; } while (j < splitWon.length) { splitWon[sortOrder[j]] = false; j++; } } } // finally split if (possibleToSplit > 0) newCenters = newCentersAfterSplit(splitWon, splitCenters); else newCenters = m_ClusterCenters; return newCenters; } /** * Returns new centers. * Depending on splitWon: if true takes children, if false * takes parent = current center. * @param splitWon array of boolean to indicate to take split or not * @param splitCenters list of splitted centers * @return the new centers */ private Instances newCentersAfterSplit(boolean [] splitWon, Instances splitCenters) { Instances newCenters = new Instances(splitCenters, 0); int sIndex = 0; for (int i = 0; i < splitWon.length; i++) { if (splitWon[i]) { m_NumSplitsDone++; newCenters.add(splitCenters.instance(sIndex++)); newCenters.add(splitCenters.instance(sIndex++)); } else { sIndex++; sIndex++; newCenters.add(m_ClusterCenters.instance(i)); } } return newCenters; } /** * Controls that counter does not exceed max iteration value. * Special function for kmeans iterations. 
* @param iterationCount current value of counter * @param max maximum value for counter * @return true if iteration should be stopped */ private boolean stopKMeansIteration(int iterationCount, int max) { boolean stopIterate = false; if (max >= 0) stopIterate = (iterationCount >= max); if (stopIterate) m_KMeansStopped++; return stopIterate; } /** * Checks if iterationCount has to be checked and if yes * (this means m_MaxIterations is > 0) compares it with * m_MaxIteration */ private boolean stopIteration(int iterationCount, int max) { boolean stopIterate = false; if (max >= 0) stopIterate = (iterationCount >= max); return stopIterate; } /** * Recompute the new centers. New cluster center is center of mass of its * instances. Returns true if cluster stays the same. * @param centers the input and output centers * @param instancesOfCent the instances to the centers * @param model data model information * @return true if converged. */ private boolean recomputeCenters(Instances centers, int [][] instOfCent, Instances model) { boolean converged = true; for (int i = 0; i < centers.numInstances(); i++) { double val; for (int j = 0; j < model.numAttributes(); j++) { val = meanOrMode(m_Instances, instOfCent[i], j); for (int k = 0; k < instOfCent[i].length; k++) if (converged && m_ClusterCenters.instance(i).value(j) != val) converged = false; if (!converged) m_ClusterCenters.instance(i).setValue(j, val); } } return converged; } /** * Recompute the new centers - 2nd version * Same as recomputeCenters, but does not check if center stays the same. * * @param centers the input center and output centers * @param instOfCentIndexes the indexes of the instances to the centers * @param model data model information */ private void recomputeCentersFast(Instances centers, int [][] instOfCentIndexes, Instances model ) { for (int i = 0; i < centers.numInstances(); i++) { double val; for (int j = 0; j < model.numAttributes(); j++) { val = meanOrMode(m_Instances, instOfCentIndexes[i], j); centers.instance(i).setValue(j, val); } } } /** * Computes Mean Or Mode of one attribute on a subset of m_Instances. * The subset is defined by an index list. * @param instances all instances * @param instList the indexes of the instances the mean is computed from * @param attIndex the index of the attribute * @return mean value */ private double meanOrMode(Instances instances, int [] instList, int attIndex) { double result, found; int [] counts; int numInst = instList.length; if (instances.attribute(attIndex).isNumeric()) { result = found = 0; for (int j = 0; j < numInst; j++) { Instance currInst = instances.instance(instList[j]); if (!currInst.isMissing(attIndex)) { found += currInst.weight(); result += currInst.weight() * currInst.value(attIndex); } } if (Utils.eq(found, 0)) { return 0; } else { return result / found; } } else if (instances.attribute(attIndex).isNominal()) { counts = new int[instances.attribute(attIndex).numValues()]; for (int j = 0; j < numInst; j++) { Instance currInst = instances.instance(instList[j]); if (!currInst.isMissing(attIndex)) { counts[(int) currInst.value(attIndex)] += currInst.weight(); } } return (double)Utils.maxIndex(counts); } else { return 0; } } /** * Assigns instances to centers. 
* * @param tree KDTree on all instances * @param centers all the input centers * @param instOfCent the instances to each center * @param allInstList list of all instances * @param assignments assignments of instances to centers * @param iterationCount the number of iteration * @return true if converged */ private boolean assignToCenters(KDTree tree, Instances centers, int [][] instOfCent, int [] allInstList, int [] assignments, int iterationCount) throws Exception { boolean converged = true; if (tree != null) { // using KDTree structure for assigning converged = assignToCenters(tree, centers, instOfCent, assignments, iterationCount); } else { converged = assignToCenters(centers, instOfCent, allInstList, assignments); } return converged; } /** * Assign instances to centers using KDtree. * First part of conventionell K-Means, returns true if new assignment * is the same as the last one. * * @param tree KDTree on all instances * @param centers all the input centers * @param instOfCent the instances to each center * @param assignments assignments of instances to centers * @param iterationCount the number of iteration * @return true if converged */ private boolean assignToCenters(KDTree kdtree, Instances centers, int [][] instOfCent, int [] assignments, int iterationCount) throws Exception { int numCent = centers.numInstances(); int numInst = m_Instances.numInstances(); int [] oldAssignments = new int[numInst]; // WARNING: assignments is "input/output-parameter" // should not be null if (assignments == null) { OOPS(D_METH_MISUSE, "assignment was null"); assignments = new int[numInst]; for (int i = 0; i < numInst; i++) { assignments[0] = -1; } } // WARNING: instOfCent is "input/output-parameter" // should not be null if (instOfCent == null) { OOPS(D_METH_MISUSE, "inst of cent was null"); instOfCent = new int [numCent][]; } // save old assignments for (int i = 0; i < assignments.length; i++) { oldAssignments[i] = assignments[i]; } // use tree to get new assignments kdtree.centerInstances(centers, assignments, Math.pow(.8, iterationCount)); boolean converged = true; //PFD_CURR("assignments"); //for (int d = 0; d < assignments.length; d++) { // System.out.print(" "+assignments[d]+", "); //} //PFD(D_CURR, " "); // compare with previous assignment for (int i = 0; converged && (i < assignments.length); i++) { converged = (oldAssignments[i] == assignments[i]); if (assignments[i] == -1) throw new Exception("Instance " + i + " has not been assigned to cluster."); } if (!converged) { int [] numInstOfCent = new int[numCent]; for (int i = 0; i < numCent; i++) numInstOfCent[i] = 0; // count num of assignments per center for (int i = 0; i < numInst; i++) numInstOfCent[assignments[i]]++; // prepare instancelists per center for (int i = 0; i < numCent; i++){ instOfCent[i] = new int[numInstOfCent[i]]; } // write instance lists per center for (int i = 0; i < numCent; i++) { int index = -1; for (int j = 0; j < numInstOfCent[i]; j++) { index = nextAssignedOne(i, index, assignments); instOfCent[i][j] = index; } } /* for (int i = 0; i < numInst; i++) { int center = assignments[i]; instOfCent[center][numInstOfCent[center]++] = i; }*/ } return converged; } /** * Assign instances to centers. * Part of conventionell K-Means, returns true if new assignment * is the same as the last one. 
* * @param centers all the input centers * @param instOfCent the instances to each center * @param allInstList list of all indexes * @param assignments assignments of instances to centers * @return true if converged */ private boolean assignToCenters(Instances centers, int [][] instOfCent, int [] allInstList, int [] assignments) throws Exception { // todo: undecided situations boolean converged = true; // true if new assignment is the same // as the old one int numInst = allInstList.length; int numCent = centers.numInstances(); int [] numInstOfCent = new int [numCent]; for (int i = 0; i < numCent; i++) numInstOfCent[i] = 0; // WARNING: assignments is "input/output-parameter" // should not be null if (assignments == null) { OOPS(D_METH_MISUSE, "assignment was null"); assignments = new int[numInst]; for (int i = 0; i < numInst; i++) { assignments[i] = -1; } } // WARNING: instOfCent is "input/output-parameter" // should not be null if (instOfCent == null) { OOPS(D_METH_MISUSE, "inst of cent was null"); instOfCent = new int [numCent][]; } // set assignments for (int i = 0; i < numInst; i++) { Instance inst = m_Instances.instance(allInstList[i]); int newC = clusterProcessedInstance(inst, centers); if (converged && newC != assignments[i]) { converged = false; } numInstOfCent[newC]++; if (!converged) assignments[i] = newC; } // the following is only done // if assignments are not the same, because too much effort if (!converged) { PFD(D_FOLLOWSPLIT, "assignToCenters -> it has NOT converged"); for (int i = 0; i < numCent; i++) { instOfCent[i] = new int [numInstOfCent[i]]; } for (int i = 0; i < numCent; i++) { int index = -1; for (int j = 0; j < numInstOfCent[i]; j++) { index = nextAssignedOne(i, index, assignments); instOfCent[i][j] = allInstList[index]; } } } else PFD(D_FOLLOWSPLIT, "assignToCenters -> it has converged"); return converged; } /** * Searches along the assignment array for the next entry of the center * in question. * @param cent index of the center * @param lasIndex index to start searching * @param assignments assignments * @return index of the instance the center cent is assigned to */ private int nextAssignedOne(int cent, int lastIndex, int [] assignments) { int len = assignments.length; int index = lastIndex + 1; while (index < len) { if (assignments[index] == cent) { return (index); } index++; } return (-1); } /** * Split centers in their region. Generates random vektor of * length = variance and * adds and substractsx to cluster vektor to get two new clusters. 
 *
 * @param random random function
 * @param center the center that is split here
 * @param variance variance of the cluster
 * @param model the data model
 * @return a pair of new centers
 * @exception Exception if something in AlgVector goes wrong
 */
private Instances splitCenter(Random random,
                              Instance center,
                              double variance,
                              Instances model) throws Exception {
  m_NumSplits++;
  AlgVector r = null;
  Instances children = new Instances(model, 2);

  if (m_DebugVektorsFile != null) {
    Instance nextVektor = getNextDebugVektorsInstance(model);
    PFD(D_RANDOMVEKTOR, "Random Vector from File " + nextVektor);
    r = new AlgVector(nextVektor);
  } else {
    //OOPS("before split variance "+ variance);
    //OOPS("center to be split "+ center);
    // random vector of length = variance
    r = new AlgVector(model, random);
  }
  r.changeLength(Math.pow(variance, 0.5));
  //OOPS(D_FOLLOWSPLIT, "variance " + variance +
  //     " sqrt-variance " + Math.pow(variance, 0.5));
  PFD(D_RANDOMVEKTOR, "random vector *variance "+ r);

  // add the random vector to the center to get the first child
  AlgVector c = new AlgVector(center);
  AlgVector c2 = (AlgVector) c.clone();
  c.add(r);
  Instance newCenter = c.getAsInstance(model, random);
  children.add(newCenter);
  PFD(D_FOLLOWSPLIT, "first child "+ newCenter);

  // subtract the random vector from the center to get the second child
  c2.substract(r);
  newCenter = c2.getAsInstance(model, random);
  children.add(newCenter);
  PFD(D_FOLLOWSPLIT, "second child "+ newCenter);

  return children;
}

/**
 * Splits a center in its region.
 * (Alternative version of splitCenter(): picks two distinct instances
 * of the region as the new centers.)
 * @param random random function
 * @param instances the instances of the region
 * @param model the data model
 * @return a pair of new centers
 */
private Instances splitCenters(Random random,
                               Instances instances,
                               Instances model) {
  Instances children = new Instances(model, 2);
  int instIndex = Math.abs(random.nextInt()) % instances.numInstances();
  children.add(instances.instance(instIndex));
  int instIndex2 = instIndex;
  int count = 0;
  while ((instIndex2 == instIndex) && count < 10) {
    count++;
    instIndex2 = Math.abs(random.nextInt()) % instances.numInstances();
  }
  children.add(instances.instance(instIndex2));

  return children;
}

/**
 * Generates new centers randomly. Used for the starting centers.
 *
 * @param random0 random number generator
 * @param model data model of the instances
 * @param numClusters number of clusters
 * @return new centers
 */
private Instances makeCentersRandomly(Random random0,
                                      Instances model,
                                      int numClusters) {
  Instances clusterCenters = new Instances(model, numClusters);
  m_NumClusters = numClusters;

  // makes the new centers randomly
  for (int i = 0; i < numClusters; i++) {
    int instIndex = Math.abs(random0.nextInt()) % m_Instances.numInstances();
    clusterCenters.add(m_Instances.instance(instIndex));
  }
  return clusterCenters;
}

/**
 * Returns the BIC-value for the given center and instances.
 * @param instList the indices of the instances that belong to the center
 * @param center the center
 * @param mle maximum likelihood
 * @param model the data model
 * @return the BIC value
 */
private double calculateBIC(int [] instList, Instance center, double mle,
                            Instances model) {
  int [][] w1 = new int[1][instList.length];
  for (int i = 0; i < instList.length; i++) {
    w1[0][i] = instList[i];
  }
  double [] m = {mle};
  Instances w2 = new Instances(model, 1);
  w2.add(center);
  return calculateBIC(w1, w2, m);
}

/**
 * Calculates the BIC for the given set of centers and instances.
 * @param instOfCent the instances that belong to their respective centers
 * @param centers the centers
 * @param mle maximum likelihood
 * @return The BIC for the input.
*/ private double calculateBIC(int [][] instOfCent, Instances centers, double [] mle) { double loglike = 0.0; int numInstTotal = 0; int numCenters = centers.numInstances(); int numDimensions = centers.numAttributes(); int numParameters = (numCenters - 1) + //probabilities numCenters * numDimensions + //means numCenters; // variance params for (int i = 0; i < centers.numInstances(); i++) { loglike += logLikelihoodEstimate(instOfCent[i].length, centers.instance(i), mle[i], centers.numInstances() * 2); numInstTotal += instOfCent[i].length; } /* diff thats how we did it loglike -= ((centers.numAttributes() + 1.0) * centers.numInstances() * 1) * Math.log(count); */ loglike -= numInstTotal * Math.log(numInstTotal); //System.out.println ("numInstTotal " + numInstTotal + // "calculateBIC res " + loglike); loglike -= (numParameters / 2.0) * Math.log(numInstTotal); //System.out.println ("numParam " + // + numParameters + // " calculateBIC res " + loglike); return loglike; } /** * Calculates the log-likelihood of the data for the given model, taken * at the maximum likelihood point. * * @param numInst number of instances that belong to the center * @param center the center * @param distortion distortion * @param numCent number of centers * @return the likelihood estimate */ private double logLikelihoodEstimate(int numInst, Instance center, double distortion, int numCent) { // R(n) num of instances of the center -> numInst // K num of centers -> not used // //todo take the diff comments away double loglike = 0; /* if is new */ if (numInst > 1) { /* diff variance is new */ // // distortion = Sum over instances x of the center(x-center) // different to paper; sum should be squared // // (Sum of distances to center) / R(n) - 1.0 // different to paper; should be R(n)-K double variance = distortion / (numInst - 1.0); // // -R(n)/2 * log(pi*2) // double p1 = - (numInst / 2.0) * Math.log(Math.PI * 2.0); /* diff thats how we had it double p2 = -((ni * center.numAttributes()) / 2) * distortion; */ // // -(R(n)*M)/2 * log(variance) // double p2 = - (numInst * center.numAttributes()) / 2 * Math.log(variance); /* diff thats how we had it, the difference is a bug in x-means double p3 = - (numInst - numCent) / 2; */ // // -(R(n)-1)/2 // double p3 = - (numInst - 1.0) / 2.0; // // R(n)*log(R(n)) // double p4 = numInst * Math.log(numInst); /* diff x-means doesn't have this part double p5 = - numInst * Math.log(numInstTotal); */ /* loglike = -(ni / 2) * Math.log(Math.PI * 2) - (ni * center.numAttributes()) / 2.0) * logdistortion - (ni - k) / 2.0 + ni * Math.log(ni) - ni * Math.log(r); */ //OOPS("distortion " + distortion); //OOPS("variance " + variance); //OOPS("p1 " + p1); //OOPS("p2 " + p2); //OOPS("p3 " + p3); //OOPS("p4 " + p4); //OOPS(p1 + " " + p2 + " " + p3 + " " + p4 + " " + p5 + " " + // distortion); loglike = p1 + p2 + p3 + p4; // diff + p5; //OOPS("loglike " + loglike); //the log(r) is something that can be reused. //as is the log(2 PI), these could provide extra speed up later on. //since distortion is so expensive to compute, I only do that once. } return loglike; } /** * Calculates the maximum likelihood estimate for the variance. * @param instOfCent indices of instances to each center * @param centers the centers * @return the list of distortions distortion. 
*/ private double [] distortion(int[][] instOfCent, Instances centers) throws Exception { double [] distortion = new double [centers.numInstances()]; for (int i = 0; i < centers.numInstances(); i++) { distortion[i] = 0.0; for (int j = 0; j < instOfCent[i].length; j++) { distortion[i] += m_DistanceF.distance( m_Instances.instance(instOfCent[i][j]), centers.instance(i)); } } /* diff not done in x-means res *= 1.0 / (count - centers.numInstances()); */ return distortion; } /** * Clusters an instance. * @param instance the instance to assign a cluster to. * @param centers the centers to cluster the instance to. * @return a cluster index. */ private int clusterProcessedInstance(Instance instance, Instances centers) throws Exception{ double minDist = Integer.MAX_VALUE; int bestCluster = 0; for (int i = 0; i < centers.numInstances(); i++) { double dist = m_DistanceF.distance(instance, centers.instance(i)); if (dist < minDist) { minDist = dist; bestCluster = i; } } ; return bestCluster; } /** * Clusters an instance that has been through the filters. * * @param instance the instance to assign a cluster to * @return a cluster number */ private int clusterProcessedInstance(Instance instance) throws Exception { double minDist = Integer.MAX_VALUE; int bestCluster = 0; for (int i = 0; i < m_NumClusters; i++) { double dist = m_DistanceF.distance(instance, m_ClusterCenters.instance(i)); if (dist < minDist) { minDist = dist; bestCluster = i; } } return bestCluster; } /** * Classifies a given instance. * * @param instance the instance to be assigned to a cluster * @return the number of the assigned cluster as an integer * if the class is enumerated, otherwise the predicted value * @exception if instance could not be classified * successfully */ public int clusterInstance(Instance instance) throws Exception { m_ReplaceMissingFilter.input(instance); Instance inst = m_ReplaceMissingFilter.output(); return clusterProcessedInstance(inst); } /** * Returns the number of clusters. * * @return the number of clusters generated for a training dataset. */ public int numberOfClusters() { return m_NumClusters; } /** * Returns an enumeration describing the available options. 
 * @return an enumeration of all the available options
 **/
public Enumeration listOptions() {
  Vector newVector = new Vector(12);

  newVector.addElement(new Option(
    "\tmaximum number of overall iterations" +
    " (default = 1).",
    "I", 1, "-I <num>"));
  newVector.addElement(new Option(
    "\tmaximum number of iterations in the kMeans loop in" +
    " the Improve-Parameter part " +
    " (default = 1000).",
    "M", 1, "-M <num>"));
  newVector.addElement(new Option(
    "\tmaximum number of iterations in the kMeans loop" +
    " for the split centroids in the Improve-Structure part " +
    " (default = 1000).",
    "J", 1, "-J <num>"));
  newVector.addElement(new Option(
    "\tminimum number of clusters" +
    " (default = 2).",
    "L", 1, "-L <num>"));
  newVector.addElement(new Option(
    "\tmaximum number of clusters" +
    " (default = 4).",
    "H", 1, "-H <num>"));
  newVector.addElement(new Option(
    "\tdistance value for binary attributes" +
    " (default = 1.0).",
    "B", 1, "-B <value>"));
  newVector.addElement(new Option(
    "\tFull class name of KDTree class to use, followed\n" +
    "\tby scheme options.\n" +
    "\teg: \"weka.core.KDTree -P\"\n" +
    "\t(default = no KDTree class used).",
    "K", 1, "-K <KDTree class specification>"));
  newVector.addElement(new Option(
    "\tcutoff factor, takes the given percentage of the split\n" +
    "\tcentroids if none of the children win\n" +
    "\t(default = 0.5).",
    "C", 1, "-C <value>"));
  newVector.addElement(new Option(
    "\tFull class name of Distance function class to use, followed\n" +
    "\tby scheme options.\n" +
    "\teg: \"weka.core.MahalanobisDistance\"\n" +
    "\t(default = weka.core.EuclideanDistance).",
    "D", 1, "-D <distance function class specification>"));
  newVector.addElement(new Option(
    "\tfile to read starting centers from (ARFF format).",
    "N", 1, "-N <file name>"));
  newVector.addElement(new Option(
    "\tfile to write centers to (ARFF format).",
    "O", 1, "-O <file name>"));
  newVector.addElement(new Option(
    "\trandom number seed (default 10).",
    "S", 1, "-S <num>"));

  return newVector.elements();
}

/**
 * Returns the tip text for this property.
 * @return tip text for this property
 */
public String minNumClustersTipText() {
  return "set minimum number of clusters";
}

/**
 * Returns the tip text for this property.
 * @return tip text for this property
 */
public String maxNumClustersTipText() {
  return "set maximum number of clusters";
}

/**
 * Sets the maximum number of overall iterations to perform.
 * @param i the number of iterations
 * @exception Exception if i is negative
 */
public void setMaxIterations(int i) throws Exception {
  if (i < 0)
    throw new Exception("Only positive values for iteration number" +
                        " allowed (Option I).");
  m_MaxIterations = i;
}

/**
 * Gets the maximum number of overall iterations.
 * @return the number of iterations
 */
public int getMaxIterations() {
  return m_MaxIterations;
}

/**
 * Sets the maximum number of iterations to perform in KMeans.
 * Also sets the maximum for the KMeans loop on the child centers.
 * @param i the number of iterations
 */
public void setMaxKMeans(int i) {
  m_MaxKMeans = i;
  m_MaxKMeansForChildren = i;
}

/**
 * Gets the maximum number of iterations in KMeans.
 * @return the number of iterations
 */
public int getMaxKMeans() {
  return m_MaxKMeans;
}

/**
 * Sets the maximum number of KMeans iterations that are performed
 * on the child centers.
 * @param i the number of iterations
 */
public void setMaxKMeansForChildren(int i) throws Exception {
  m_MaxKMeansForChildren = i;
}

/**
 * Gets the maximum number of KMeans iterations performed on the child centers.
 * @return the number of iterations
 */
public int getMaxKMeansForChildren() {
  return m_MaxKMeansForChildren;
}

/**
 * Sets a new cutoff factor.
* @param i the new cutoff factor */ public void setCutOffFactor(double i) throws Exception { m_CutOffFactor = i; } /** * Gets the cutoff factor. * @return the cutoff factor */ public double getCutOffFactor() { return m_CutOffFactor; } /** * Sets the minimum number of clusters to generate. * * @param n the minimum number of clusters to generate */ public void setMinNumClusters(int n) { if (n <= m_MaxNumClusters) { m_MinNumClusters = n; } } /** * Sets the maximum number of clusters to generate. * @param n the maximum number of clusters to generate */ public void setMaxNumClusters(int n) { if (n >= m_MinNumClusters) { m_MaxNumClusters = n; } } /** * Returns the tip text for this property * @return tip text for this property suitable for * displaying in the explorer/experimenter gui */ public String binValueTipText() { return "Set the value that represents true in the new attributes."; } /** * Gets value that represents true in a new numeric attribute. * (False is always represented by 0.0.) * @return the value that represents true in a new numeric attribute */ public double getBinValue() { return m_BinValue; } /** * Sets the distance e value between true and false of binary attributes * and "same" and "different" of nominal attributes * @param double value */ public void setBinValue(double value) { m_BinValue = value; } /** * gets the "binary" distance value * @param distanceF the distance function with all options set */ public void setDistanceF(DistanceFunction distanceF) { m_DistanceF = distanceF; } /** * Gets the distance function. * @return the distance function */ public DistanceFunction getDistanceF() { return m_DistanceF; } /** * Gets the distance function specification string, which contains the * class name of the distance function class and any options to it * * @return the distance function specification string */ protected String getDistanceFSpec() { DistanceFunction d = getDistanceF(); if (d instanceof OptionHandler) { return d.getClass().getName() + " " + Utils.joinOptions(((OptionHandler) d).getOptions()); } return d.getClass().getName(); } /** * Sets a file name for a file that has the random vektors stored. * Just used for debugging reasons. * @param fileName file name for the file to read the random vektors from */ public void setDebugVektorsFile(String fileName) { m_DebugVektorsFile = fileName; } /** * Initialises the debug vektor input. */ public void initDebugVektorsInput() throws Exception { m_DebugVektorsInput = new BufferedReader(new FileReader(m_DebugVektorsFile)); m_DebugVektors = new Instances(m_DebugVektorsInput); m_DebugVektorsIndex = 0; } /** * Read an instance from debug vektors file. * @param model the data model for the instance */ public Instance getNextDebugVektorsInstance(Instances model) throws Exception { if (m_DebugVektorsIndex >= m_DebugVektors.numInstances()) throw new Exception("no more prefabricated Vektors"); Instance nex = m_DebugVektors.instance(m_DebugVektorsIndex); nex.setDataset(model); m_DebugVektorsIndex++; return nex; } /** * Sets the name of the file to read the list of centers from. * * @param fileName file name of file to read centers from */ public void setInputCenterFile(String fileName) { m_InputCenterFile = fileName; } /** * Sets the name of the file to write the list of centers to. * * @param fileName file to write centers to */ public void setOutputCenterFile(String fileName) { m_OutputCenterFile = fileName; } /** * Gets the name of the file to read the list of centers from. 
* * @return filename of the file to read the centers from */ public String getInputCenterFile() { return m_InputCenterFile; } /** * Gets the name of the file to write the list of centers to. * @return filename of the file to write centers to */ public String getOutputCenterFile() { return m_OutputCenterFile; } /** * Sets the KDTree class. * @param k a KDTree object with all options set */ public void setKDTree(KDTree k) { m_KDTree = k; } /** * Gets the KDTree class. * @return flag if KDTrees are used */ public KDTree getKDTree() { return m_KDTree; } /** * Gets the KDTree specification string, which contains the class name of * the KDTree class and any options to the KDTree * * @return the KDTree string. */ protected String getKDTreeSpec() { KDTree c = getKDTree(); if (c instanceof OptionHandler) { return c.getClass().getName() + " " + Utils.joinOptions(((OptionHandler)c).getOptions()); } return c.getClass().getName(); } /** * Sets the debug level. * debug level = 0, means no output * @param d debuglevel */ public void setDebugLevel(int d) { m_DebugLevel = d; } /** * Gets the debug level. * @return debug level */ public int getDebugLevel() { return m_DebugLevel; } /** * Gets the minimum number of clusters to generate. * @return the minimum number of clusters to generate */ public int getMinNumClusters() { return m_MinNumClusters; } /** * Gets the maximum number of clusters to generate. * @return the maximum number of clusters to generate */ public int getMaxNumClusters() { return m_MaxNumClusters; } /** * Returns the tip text for this property. * @return tip text for this property */ public String seedTipText() { return "random number seed"; } /** * Sets the random number seed. * @param s the seed */ public void setSeed(int s) { m_Seed = s; } /** * Gets the random number seed. * @return the seed */ public int getSeed() { return m_Seed; } /** * Checks the instances. * No checks in this KDTree but it calls the check of the distance function. */ private void checkInstances () throws Exception { m_DistanceF.checkInstances(); } /** * Parses a given list of options. 
 * @param options the list of options as an array of strings
 * @exception Exception if an option is not supported
 **/
public void setOptions(String[] options) throws Exception {
  String optionString = Utils.getOption('I', options);
  if (optionString.length() != 0) {
    setMaxIterations(Integer.parseInt(optionString));
  }
  optionString = Utils.getOption('M', options);
  if (optionString.length() != 0) {
    setMaxKMeans(Integer.parseInt(optionString));
  }
  optionString = Utils.getOption('J', options);
  if (optionString.length() != 0) {
    setMaxKMeansForChildren(Integer.parseInt(optionString));
  }
  optionString = Utils.getOption('L', options);
  if (optionString.length() != 0) {
    setMinNumClusters(Integer.parseInt(optionString));
  }
  optionString = Utils.getOption('H', options);
  if (optionString.length() != 0) {
    setMaxNumClusters(Integer.parseInt(optionString));
  }
  optionString = Utils.getOption('B', options);
  if (optionString.length() != 0) {
    setBinValue(Double.parseDouble(optionString));
  }
  String funcString = Utils.getOption('K', options);
  if (funcString.length() != 0) {
    String [] funcSpec = Utils.splitOptions(funcString);
    if (funcSpec.length == 0) {
      throw new Exception("Invalid KDTree specification string");
    }
    String funcName = funcSpec[0];
    funcSpec[0] = "";
    setKDTree((KDTree) Utils.forName(KDTree.class, funcName, funcSpec));
  }
  optionString = Utils.getOption('C', options);
  if (optionString.length() != 0) {
    setCutOffFactor(Double.parseDouble(optionString));
  }
  funcString = Utils.getOption('D', options);
  if (funcString.length() != 0) {
    String [] funcSpec = Utils.splitOptions(funcString);
    if (funcSpec.length == 0) {
      throw new Exception("Invalid distance function specification string");
    }
    String funcName = funcSpec[0];
    funcSpec[0] = "";
    setDistanceF((DistanceFunction)
                 Utils.forName(DistanceFunction.class, funcName, funcSpec));
  }
  optionString = Utils.getOption('N', options);
  if (optionString.length() != 0) {
    setInputCenterFile(optionString);
    m_CenterInput = new BufferedReader(new FileReader(optionString));
  }
  optionString = Utils.getOption('O', options);
  if (optionString.length() != 0) {
    setOutputCenterFile(optionString);
    m_CenterOutput = new PrintWriter(new FileOutputStream(optionString));
  }
  optionString = Utils.getOption('S', options);
  if (optionString.length() != 0) {
    setSeed(Integer.parseInt(optionString));
  }
  optionString = Utils.getOption('U', options);
  int debugLevel = 0;
  if (optionString.length() != 0) {
    try {
      debugLevel = Integer.parseInt(optionString);
    } catch (NumberFormatException e) {
      throw new Exception(optionString +
                          " is an illegal value for option U");
    }
  }
  setDebugLevel(debugLevel);
  optionString = Utils.getOption('Y', options);
  if (optionString.length() != 0) {
    setDebugVektorsFile(optionString);
  }
}

/**
 * Gets the current settings of XMeans.
* @return an array of strings suitable for passing to setOptions */ public String[] getOptions() { String[] options = new String[27]; int current = 0; options[current++] = "-I"; options[current++] = "" + getMaxIterations(); options[current++] = "-M"; options[current++] = "" + getMaxKMeans(); options[current++] = "-J"; options[current++] = "" + getMaxKMeansForChildren(); options[current++] = "-L"; options[current++] = "" + getMinNumClusters(); options[current++] = "-H"; options[current++] = "" + getMaxNumClusters(); options[current++] = "-B"; options[current++] = "" + getBinValue(); if (getKDTree() != null) { options[current++] = "-K"; options[current++] = "" + getKDTreeSpec(); } options[current++] = "-C"; options[current++] = "" + getCutOffFactor(); if (getDistanceF() != null) { options[current++] = "-D"; options[current++] = "" + getDistanceFSpec(); } options[current++] = "-N"; options[current++] = "" + getInputCenterFile(); options[current++] = "-O"; options[current++] = "" + getOutputCenterFile(); options[current++] = "-S"; options[current++] = "" + getSeed(); int dL = getDebugLevel(); if (dL > 0) { options[current++] = "-U"; options[current++] = "" + getDebugLevel(); } while (current < options.length) { options[current++] = ""; } return options; } /** * Return a string describing this clusterer. * @return a description of the clusterer as a string */ public String toString() { StringBuffer temp = new StringBuffer(); temp.append("\nkMeans\n======\n"); temp.append("Requested iterations : " + m_MaxIterations + "\n"); temp.append("Iterations performed : " + m_IterationCount+ "\n"); temp.append("kMeans did not converge\n"); temp.append(" but was stopped by max-loops " + m_KMeansStopped + " times (max kMeans-iter) = \n\n"); temp.append("Splits prepared : " + m_NumSplits + "\n"); temp.append("Splits performed : " + m_NumSplitsDone + "\n"); temp.append("Cutoff factor : " + m_CutOffFactor + "\n"); double perc; if (m_NumSplitsDone > 0) perc = (((double)m_NumSplitsStillDone)/((double) m_NumSplitsDone)) * 100.0; else perc = 0.0; temp.append("Percentage of splits accepted \n" + "by cutoff factor : " + Utils.doubleToString(perc,2) + " %\n"); temp.append("------\n"); temp.append("Cutoff factor : " + m_CutOffFactor + "\n"); temp.append("------\n"); temp.append("\nCluster centers : " + m_NumClusters + " centers\n"); for (int i = 0; i < m_NumClusters; i++) { temp.append("\nCluster "+i+"\n "); for (int j = 0; j < m_ClusterCenters.numAttributes(); j++) { if (m_ClusterCenters.attribute(j).isNominal()) { temp.append(" "+m_ClusterCenters.attribute(j). value((int)m_ClusterCenters.instance(i).value(j))); } else { temp.append(" "+m_ClusterCenters.instance(i).value(j)); } } } if (m_Mle != null) temp.append("\n\nDistortion: " + Utils.doubleToString(Utils.sum(m_Mle),6) + "\n"); temp.append("BIC-Value : " + Utils.doubleToString(m_Bic,6) + "\n"); return temp.toString(); } /** * Used for debug println's. * @param output string that is printed */ private void OOPS(int debugLevel, String output) { if (debugLevel == m_DebugLevel) System.out.println(output); } private void OOPS(String output) { System.out.println(output); } /** * Print centers for debug. * @param debugLevel level that gives according messages * @return true if debug level is set */ private void PrCentersFD(int debugLevel) { if (debugLevel == m_DebugLevel) { for (int i = 0; i < m_ClusterCenters.numInstances(); i++) { System.out.println(m_ClusterCenters.instance(i)); } } } /** * Tests on debug status. 
* @param debugLevel level that gives according messages * @return true if debug level is set */ private boolean TFD(int debugLevel) { return (debugLevel == m_DebugLevel); } /** * Does debug printouts. * @param debugLevel level that gives according messages * @param output string that is printed */ private void PFD(int debugLevel, String output) { if (debugLevel == m_DebugLevel) System.out.println(output); } /** * Does debug printouts. * @param debugLevel level that gives according messages * @param output string that is printed */ private void PFD_CURR(String output) { if (m_CurrDebugFlag) System.out.println(output); } /** * Main method for testing this class. * @param argv should contain options */ public static void main(String[] argv) { try { System.out.println(ClusterEvaluation. evaluateClusterer(new XMeans(), argv)); } catch (Exception e) { System.out.println(e.getMessage()); e.printStackTrace(); } } }
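/*
 * Illustrative usage sketch (not part of the original class): shows how the
 * clusterer above might be configured and built programmatically. The ARFF
 * file name and the chosen option values are placeholders only.
 */
class XMeansUsageExample {

  public static void main(String[] args) throws Exception {
    // load a dataset in ARFF format (placeholder path)
    Instances data =
      new Instances(new BufferedReader(new FileReader("data.arff")));

    // search for between 2 and 6 clusters with a fixed random seed
    XMeans xmeans = new XMeans();
    xmeans.setMinNumClusters(2);
    xmeans.setMaxNumClusters(6);
    xmeans.setSeed(42);

    // build the model and print centers, distortion and BIC value
    xmeans.buildClusterer(data);
    System.out.println(xmeans);
  }
}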