/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    ConstrainedKMeans.java
 *    Copyright (C) 2009-2010 Aristotle University of Thessaloniki, Thessaloniki, Greece
 */
package mulan.classifier.meta;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Random;
import java.util.Vector;

import weka.classifiers.rules.DecisionTableHashKey;
import weka.clusterers.NumberOfClustersRequestable;
import weka.clusterers.RandomizableClusterer;
import weka.clusterers.SimpleKMeans;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.core.Capabilities.Capability;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

/**
 <!-- globalinfo-start -->
 * Cluster data using the k means algorithm, constraining each cluster to hold
 * at most ceil(N/k) of the N training instances so that cluster sizes remain
 * balanced.
 * <p/>
 <!-- globalinfo-end -->
 *
 <!-- options-start -->
 * Valid options are: <p/>
 *
 * <pre> -N &lt;num&gt;
 *  number of clusters.
 *  (default 2).</pre>
 *
 * <pre> -S &lt;num&gt;
 *  Random number seed.
 *  (default 10)</pre>
 *
 <!-- options-end -->
 *
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version $Revision: 1.29 $
 * @see RandomizableClusterer
 */
public class ConstrainedKMeans extends RandomizableClusterer
        implements NumberOfClustersRequestable, WeightedInstancesHandler {

    /** for serialization */
    static final long serialVersionUID = -3235809600124455376L;

    /** one bucket (the list of currently assigned instances) per cluster */
    private ArrayList[] bucket;

    /** maximum number of instances each bucket may hold: ceil(N/k) */
    private int bucketSize;

    /** maximum number of k means iterations */
    private int maxIterations;

    @Override
    public String getRevision() {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    /**
     * Wraps an instance's distances to all cluster centroids together with the
     * distance to its currently assigned centroid, so that bucket members can
     * be kept sorted by distance.
     */
    static public class bucketInstance implements Comparable {

        double[] distances;
        double distance;

        public bucketInstance() {
        }

        public void setDistances(double[] x) {
            distances = new double[x.length];
            System.arraycopy(x, 0, distances, 0, x.length);
        }

        public void setDistance(double x) {
            distance = x;
        }

        public double[] getDistances() {
            return distances;
        }

        public double getDistance() {
            return distance;
        }

        public int compareTo(Object ci) {
            double d = ((bucketInstance) ci).getDistance();
            if ((this.distance - d) < 0) {
                return -1;
            } else if (this.distance == d) {
                return 0;
            } else {
                return 1;
            }
        }
    }

    /**
     * replace missing values in training instances
     */
    private ReplaceMissingValues m_ReplaceMissingFilter;

    /**
     * number of clusters to generate
     */
    private int m_NumClusters = 2;

    /**
     * holds the cluster centroids
     */
    private Instances m_ClusterCentroids;

    /**
     * Holds the standard deviations of the numeric attributes in each cluster
     */
    private Instances m_ClusterStdDevs;

    /**
     * For each cluster, holds the frequency counts for the values of each
     * nominal attribute
     */
    private int[][][] m_ClusterNominalCounts;

    /**
     * The number of instances in each cluster
     */
    private int[] m_ClusterSizes;

    /**
     * attribute min values
     */
    private double[] m_Min;

    /**
     * attribute max values
     */
    private double[] m_Max;

    /**
     * Keep track of the number of iterations completed before convergence
     */
    private int m_Iterations = 0;

    /**
     * Holds the squared errors for all clusters
     */
    private double[] m_squaredErrors;

    /**
     * the default constructor
     */
    public ConstrainedKMeans() {
        super();

        m_SeedDefault = 10;
        setSeed(m_SeedDefault);
    }

    /**
     * Returns a string describing this clusterer.
     *
     * @return a description of the clusterer suitable for
     * displaying in the explorer/experimenter gui
     */
    public String globalInfo() {
        return "Cluster data using the k means algorithm, constraining each "
                + "cluster to hold at most ceil(N/k) of the N training "
                + "instances so that cluster sizes remain balanced.";
    }

    /**
     * Returns default capabilities of the clusterer.
     *
     * @return the capabilities of this clusterer
     */
    @Override
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        result.disableAll();
        result.enable(Capability.NO_CLASS);

        // attributes
        result.enable(Capability.NOMINAL_ATTRIBUTES);
        result.enable(Capability.NUMERIC_ATTRIBUTES);
        result.enable(Capability.MISSING_VALUES);

        return result;
    }

    /**
     * Sets the maximum number of k means iterations.
     *
     * @param x the maximum number of iterations
     */
    public void setMaxIterations(int x) {
        maxIterations = x;
    }

    /**
     * Generates a clusterer. Has to initialize all fields of the clusterer
     * that are not being set via options.
     *
     * @param data set of instances serving as training data
     * @throws Exception if the clusterer has not been
     * generated successfully
     */
    public void buildClusterer(Instances data) throws Exception {
        // allocate the buckets here as well, in case setNumClusters(int)
        // (which normally creates them) has not been called
        if (bucket == null || bucket.length != m_NumClusters) {
            bucket = new ArrayList[m_NumClusters];
        }
        for (int i = 0; i < m_NumClusters; i++) {
            bucket[i] = new ArrayList<bucketInstance>();
        }

        // calculate bucket size
        bucketSize = (int) Math.ceil(data.numInstances() / (double) m_NumClusters);
        //System.out.print("bucketSize = " + bucketSize + "\n");

        // can clusterer handle the data?
        getCapabilities().testWithFail(data);

        m_Iterations = 0;

        m_ReplaceMissingFilter = new ReplaceMissingValues();
        Instances instances = new Instances(data);
        instances.setClassIndex(-1);
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);

        m_Min = new double[instances.numAttributes()];
        m_Max = new double[instances.numAttributes()];
        for (int i = 0; i < instances.numAttributes(); i++) {
            m_Min[i] = m_Max[i] = Double.NaN;
        }

        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        int[] clusterAssignments = new int[instances.numInstances()];

        for (int i = 0; i < instances.numInstances(); i++) {
            updateMinMax(instances.instance(i));
        }

        Random RandomO = new Random(getSeed());
        int instIndex;
        HashMap initC = new HashMap();
        DecisionTableHashKey hk = null;

        for (int j = instances.numInstances() - 1; j >= 0; j--) {
            instIndex = RandomO.nextInt(j + 1);
            hk = new DecisionTableHashKey(instances.instance(instIndex),
                    instances.numAttributes(), true);
            if (!initC.containsKey(hk)) {
                m_ClusterCentroids.add(instances.instance(instIndex));
                initC.put(hk, null);
            }
            instances.swap(j, instIndex);

            if (m_ClusterCentroids.numInstances() == m_NumClusters) {
                break;
            }
        }

        m_NumClusters = m_ClusterCentroids.numInstances();

        int i;
        boolean converged = false;
        int emptyClusterCount;
        Instances[] tempI = new Instances[m_NumClusters];
        m_squaredErrors = new double[m_NumClusters];
        m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
        while (!converged) {
            // reset buckets
            for (int j = 0; j < m_NumClusters; j++) {
                bucket[j] = new ArrayList<bucketInstance>();
            }

            emptyClusterCount = 0;
            m_Iterations++;
            //System.out.println(">>Iterations: " + m_Iterations);
            converged = true;
            for (i = 0; i < instances.numInstances(); i++) {
                //System.out.println("processing instance: " + i);
                Instance toCluster = instances.instance(i);
                int newC = clusterProcessedInstance(toCluster, true);
                if (newC != clusterAssignments[i]) {
                    converged = false;
                }
                clusterAssignments[i] = newC;
            }

            if (m_Iterations > maxIterations) {
                converged = true;
            }

            // update centroids
            m_ClusterCentroids = new Instances(instances, m_NumClusters);
            for (i = 0; i < m_NumClusters; i++) {
                tempI[i] = new Instances(instances, 0);
            }
            for (i = 0; i < instances.numInstances(); i++) {
                tempI[clusterAssignments[i]].add(instances.instance(i));
            }
            for (i = 0; i < m_NumClusters; i++) {
                double[] vals = new double[instances.numAttributes()];
                if (tempI[i].numInstances() == 0) {
                    // empty cluster
                    emptyClusterCount++;
                } else {
                    for (int j = 0; j < instances.numAttributes(); j++) {
                        vals[j] = tempI[i].meanOrMode(j);
                        m_ClusterNominalCounts[i][j] = tempI[i].attributeStats(j).nominalCounts;
                    }
                    m_ClusterCentroids.add(new DenseInstance(1.0, vals));
                }
                //System.out.println("centroid: " + i + " " + m_ClusterCentroids.instance(i).toString());
            }

            if (emptyClusterCount > 0) {
                m_NumClusters -= emptyClusterCount;
                tempI = new Instances[m_NumClusters];
            }
            if (!converged) {
                m_squaredErrors = new double[m_NumClusters];
                m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
            }
        }

        // reset buckets
        for (int j = 0; j < m_NumClusters; j++) {
            bucket[j] = new ArrayList<bucketInstance>();
        }

        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
        m_ClusterSizes = new int[m_NumClusters];
        for (i = 0; i < m_NumClusters; i++) {
            double[] vals2 = new double[instances.numAttributes()];
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(tempI[i].variance(j));
                } else {
                    vals2[j] =
                            Utils.missingValue();
                }
            }
            m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
            m_ClusterSizes[i] = tempI[i].numInstances();
        }
    }

    /**
     * Clusters an instance that has already been passed through the filters.
     *
     * @param instance the instance to assign a cluster to
     * @param updateErrors if true, update the within clusters sum of errors
     * @return a cluster number
     */
    private int clusterProcessedInstance(Instance instance, boolean updateErrors) {
        // calculate distance from bucket centers
        double[] distance = new double[m_NumClusters];
        for (int i = 0; i < m_NumClusters; i++) {
            distance[i] = distance(instance, m_ClusterCentroids.instance(i));
        }

        // create a bucket item from the instance
        bucketInstance ci = new bucketInstance();
        ci.setDistances(distance);

        // assign item to closest bucket
        int bestCluster;
        boolean finished;
        do {
            finished = true;

            // add to closest bucket
            bestCluster = Utils.minIndex(distance);
            //System.out.print("closest bucket: " + bestCluster + "\n");
            ci.setDistance(distance[bestCluster]);

            //* insert sort: advance j to the first member farther away than ci
            int j;
            for (j = 0; j < bucket[bestCluster].size()
                    && ((bucketInstance) bucket[bestCluster].get(j)).compareTo(ci) < 0; j++) {
            }
            bucket[bestCluster].add(j, ci);
            //*/
            /* simple insert
            bucket[closestBucket].add(ci);
            //*/

            // if the bucket overflows, evict its farthest member and force it
            // to its next-closest cluster
            if (bucket[bestCluster].size() > bucketSize) {
                //System.out.println("removing an instance");
                ci = (bucketInstance) bucket[bestCluster].remove(bucket[bestCluster].size() - 1);
                distance = ci.getDistances();
                //System.out.print("distances: " + Arrays.toString(distance) + "\n");
                distance[bestCluster] = Double.MAX_VALUE;
                ci.setDistances(distance);
                finished = false;
            }
        } while (!finished);

        if (updateErrors) {
            m_squaredErrors[bestCluster] += distance[bestCluster];
        }
        return bestCluster;
    }

    /**
     * Classifies a given instance.
     *
     * @param instance the instance to be assigned to a cluster
     * @return the index of the cluster the instance is assigned to
     * @throws Exception if instance could not be classified
     * successfully
     */
    @Override
    public int clusterInstance(Instance instance) throws Exception {
        m_ReplaceMissingFilter.input(instance);
        m_ReplaceMissingFilter.batchFinished();
        Instance inst = m_ReplaceMissingFilter.output();

        return clusterProcessedInstance(inst, false);
    }

    /**
     * Calculates the distance between two instances
     *
     * @param first the first instance
     * @param second the second instance
     * @return the distance between the two given instances
     */
    private double distance(Instance first, Instance second) {

        double distance = 0;
        int firstI, secondI;

        for (int p1 = 0, p2 = 0; p1 < first.numValues() || p2 < second.numValues();) {
            if (p1 >= first.numValues()) {
                firstI = m_ClusterCentroids.numAttributes();
            } else {
                firstI = first.index(p1);
            }
            if (p2 >= second.numValues()) {
                secondI = m_ClusterCentroids.numAttributes();
            } else {
                secondI = second.index(p2);
            }

            /*
            if (firstI == m_ClusterCentroids.classIndex()) {
                p1++;
                continue;
            }
            if (secondI == m_ClusterCentroids.classIndex()) {
                p2++;
                continue;
            }
            */

            double diff;

            if (firstI == secondI) {
                diff = difference(firstI,
                        first.valueSparse(p1),
                        second.valueSparse(p2));
                p1++;
                p2++;
            } else if (firstI > secondI) {
                diff = difference(secondI,
                        0, second.valueSparse(p2));
                p2++;
            } else {
                diff = difference(firstI,
                        first.valueSparse(p1), 0);
                p1++;
            }
            distance += diff * diff;
        }

        //return Math.sqrt(distance / m_ClusterCentroids.numAttributes());
        return distance;
    }

    /**
     * Computes the difference between two given attribute
     * values.
     *
     * @param index the attribute index
     * @param val1 the first value
     * @param val2 the second value
     * @return the difference
     */
    private double difference(int index, double val1, double val2) {

        switch (m_ClusterCentroids.attribute(index).type()) {
            case Attribute.NOMINAL:

                // If attribute is nominal
                if (Utils.isMissingValue(val1) || Utils.isMissingValue(val2)
                        || ((int) val1 != (int) val2)) {
                    return 1;
                } else {
                    return 0;
                }
            case Attribute.NUMERIC:

                // If attribute is numeric
                if (Utils.isMissingValue(val1) || Utils.isMissingValue(val2)) {
                    if (Utils.isMissingValue(val1) && Utils.isMissingValue(val2)) {
                        return 1;
                    } else {
                        double diff;
                        if (Utils.isMissingValue(val2)) {
                            diff = norm(val1, index);
                        } else {
                            diff = norm(val2, index);
                        }
                        if (diff < 0.5) {
                            diff = 1.0 - diff;
                        }
                        return diff;
                    }
                } else {
                    return norm(val1, index) - norm(val2, index);
                }
            default:
                return 0;
        }
    }

    /**
     * Normalizes a given value of a numeric attribute.
     *
     * @param x the value to be normalized
     * @param i the attribute's index
     * @return the normalized value
     */
    private double norm(double x, int i) {
        if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i], m_Min[i])) {
            return 0;
        } else {
            return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
        }
    }

    /**
     * Updates the minimum and maximum values for all the attributes
     * based on a new instance.
     *
     * @param instance the new instance
     */
    private void updateMinMax(Instance instance) {
        for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
            if (!instance.isMissing(j)) {
                if (Double.isNaN(m_Min[j])) {
                    m_Min[j] = instance.value(j);
                    m_Max[j] = instance.value(j);
                } else {
                    if (instance.value(j) < m_Min[j]) {
                        m_Min[j] = instance.value(j);
                    } else {
                        if (instance.value(j) > m_Max[j]) {
                            m_Max[j] = instance.value(j);
                        }
                    }
                }
            }
        }
    }

    /**
     * Returns the number of clusters.
     *
     * @return the number of clusters generated for a training dataset.
     * @throws Exception if number of clusters could not be returned
     * successfully
     */
    public int numberOfClusters() throws Exception {
        return m_NumClusters;
    }

    /**
     * Returns an enumeration describing the available options.
     *
     * @return an enumeration of all the available options.
     */
    @Override
    public Enumeration listOptions() {
        Vector result = new Vector();

        result.addElement(new Option(
                "\tnumber of clusters.\n"
                + "\t(default 2).",
                "N", 1, "-N <num>"));

        Enumeration en = super.listOptions();
        while (en.hasMoreElements()) {
            result.addElement(en.nextElement());
        }

        return result.elements();
    }

    /**
     * Returns the tip text for this property.
     *
     * @return tip text for this property suitable for
     * displaying in the explorer/experimenter gui
     */
    public String numClustersTipText() {
        return "set number of clusters";
    }

    /**
     * Sets the number of clusters to generate.
     *
     * @param n the number of clusters to generate
     * @throws Exception if the number of clusters is not positive
     */
    public void setNumClusters(int n) throws Exception {
        if (n <= 0) {
            throw new Exception("Number of clusters must be > 0");
        }
        m_NumClusters = n;
        bucket = new ArrayList[n];
    }

    /**
     * Gets the number of clusters to generate.
     *
     * @return the number of clusters to generate
     */
    public int getNumClusters() {
        return m_NumClusters;
    }

    /**
     * Parses a given list of options. <p/>
     *
     <!-- options-start -->
     * Valid options are: <p/>
     *
     * <pre> -N &lt;num&gt;
     *  number of clusters.
     *  (default 2).</pre>
     *
     * <pre> -S &lt;num&gt;
     *  Random number seed.
     *  (default 10)</pre>
     *
     <!-- options-end -->
     *
     * @param options the list of options as an array of strings
     * @throws Exception if an option is not supported
     */
    @Override
    public void setOptions(String[] options) throws Exception {
        String optionString = Utils.getOption('N', options);

        if (optionString.length() != 0) {
            setNumClusters(Integer.parseInt(optionString));
        }

        super.setOptions(options);
    }

    /**
     * Gets the current settings of ConstrainedKMeans.
     *
     * @return an array of strings suitable for passing to setOptions()
     */
    @Override
    public String[] getOptions() {
        int i;
        Vector result;
        String[] options;

        result = new Vector();

        result.add("-N");
        result.add("" + getNumClusters());

        options = super.getOptions();
        for (i = 0; i < options.length; i++) {
            result.add(options[i]);
        }

        return (String[]) result.toArray(new String[result.size()]);
    }

    /**
     * Returns a string describing this clusterer.
     *
     * @return a description of the clusterer as a string
     */
    @Override
    public String toString() {
        int maxWidth = 0;
        for (int i = 0; i < m_NumClusters; i++) {
            for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
                if (m_ClusterCentroids.attribute(j).isNumeric()) {
                    double width = Math.log(Math.abs(m_ClusterCentroids.instance(i).value(j)))
                            / Math.log(10.0);
                    width += 1.0;
                    if ((int) width > maxWidth) {
                        maxWidth = (int) width;
                    }
                }
            }
        }

        StringBuffer temp = new StringBuffer();
        String naString = "N/A";
        for (int i = 0; i < maxWidth + 2; i++) {
            naString += " ";
        }
        temp.append("\nkMeans\n======\n");
        temp.append("\nNumber of iterations: " + m_Iterations + "\n");
        temp.append("Within cluster sum of squared errors: "
                + Utils.sum(m_squaredErrors));

        temp.append("\n\nCluster centroids:\n");
        for (int i = 0; i < m_NumClusters; i++) {
            temp.append("\nCluster " + i + "\n\t");
            temp.append("Mean/Mode: ");
            for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
                if (m_ClusterCentroids.attribute(j).isNominal()) {
                    temp.append(" " + m_ClusterCentroids.attribute(j).value(
                            (int) m_ClusterCentroids.instance(i).value(j)));
                } else {
                    temp.append(" " + Utils.doubleToString(
                            m_ClusterCentroids.instance(i).value(j), maxWidth + 5, 4));
                }
            }
            temp.append("\n\tStd Devs: ");
            for (int j = 0; j < m_ClusterStdDevs.numAttributes(); j++) {
                if (m_ClusterStdDevs.attribute(j).isNumeric()) {
                    temp.append(" " + Utils.doubleToString(
                            m_ClusterStdDevs.instance(i).value(j), maxWidth + 5, 4));
                } else {
                    temp.append(" " + naString);
                }
            }
        }
        temp.append("\n\n");
        return temp.toString();
    }

    /**
     * Gets the cluster centroids
     *
     * @return the cluster centroids
     */
    public Instances getClusterCentroids() {
        return m_ClusterCentroids;
    }

    /**
     * Gets the standard deviations of the numeric attributes in each cluster
     *
     * @return the standard deviations of the numeric attributes
     * in each cluster
     */
    public Instances getClusterStandardDevs() {
        return m_ClusterStdDevs;
    }

    /**
     * Returns for each cluster the frequency counts for the values of each
     * nominal attribute
     *
     * @return the counts
     */
    public int[][][] getClusterNominalCounts() {
        return m_ClusterNominalCounts;
    }

    /**
     * Gets the squared error for all clusters
     *
     * @return the squared error
     */
    public double getSquaredError() {
        return Utils.sum(m_squaredErrors);
    }

    /**
     * Gets the number of instances in each cluster
     *
     * @return The number of instances in each cluster
     */
    public int[] getClusterSizes() {
        return m_ClusterSizes;
    }

    /**
     * Main method for testing this class.
     *
     * @param argv should contain the following arguments: <p>
     * -t training file [-N number of clusters]
     */
    public static void main(String[] argv) {
        runClusterer(new ConstrainedKMeans(), argv);
    }
}
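
/*
 * Usage sketch (not part of the original class): a minimal example of driving
 * this clusterer programmatically rather than through main(). Assumptions are
 * illustrative only: "data.arff" is a hypothetical ARFF file with numeric or
 * nominal attributes and no class attribute, and the values 3 (clusters) and
 * 100 (iterations) are arbitrary. Note that maxIterations defaults to 0, which
 * effectively limits training to a single assignment pass, so calling
 * setMaxIterations(int) is usually desirable.
 *
 *     Instances data = new Instances(new java.io.BufferedReader(
 *             new java.io.FileReader("data.arff")));   // hypothetical file
 *     ConstrainedKMeans ckm = new ConstrainedKMeans();
 *     ckm.setNumClusters(3);       // also sizes the per-cluster buckets
 *     ckm.setMaxIterations(100);   // allow the k means loop to iterate
 *     ckm.buildClusterer(data);
 *     for (int i = 0; i < data.numInstances(); i++) {
 *         int c = ckm.clusterInstance(data.instance(i));
 *         System.out.println("instance " + i + " -> cluster " + c);
 *     }
 */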