SimpleKMeans.java example

Explorer
TimeSeriesClassification-master
- TimeSeriesClassification
  - src
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    SimpleKMeans.java
 *    Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
 *
 */
package weka.clusterers;

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Random;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import weka.classifiers.rules.DecisionTableHashKey;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.DenseInstance;
import weka.core.DistanceFunction;
import weka.core.EuclideanDistance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.ManhattanDistance;
import weka.core.Option;
import weka.core.RevisionUtils;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.WeightedInstancesHandler;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;

/**
 <!-- globalinfo-start -->
 * Cluster data using the k means algorithm. Can use either the Euclidean distance (default) or the Manhattan distance. If the Manhattan distance is used, then centroids are computed as the component-wise median rather than mean. For more information see:<br/>
 * <br/>
 * D. Arthur, S. Vassilvitskii: k-means++: the advantages of carefull seeding. In: Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms, 1027-1035, 2007.
 * <p/>
 <!-- globalinfo-end -->
 * 
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * @inproceedings{Arthur2007,
 *    author = {D. Arthur and S. Vassilvitskii},
 *    booktitle = {Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms},
 *    pages = {1027-1035},
 *    title = {k-means++: the advantages of carefull seeding},
 *    year = {2007}
 * }
 * </pre>
 * <p/>
 <!-- technical-bibtex-end -->
 * 
 <!-- options-start -->
 * Valid options are: <p/>
 * 
 * <pre> -N <num>
 *  number of clusters.
 *  (default 2).</pre>
 * 
 * <pre> -P
 *  Initialize using the k-means++ method.
 * </pre>
 * 
 * <pre> -V
 *  Display std. deviations for centroids.
 * </pre>
 * 
 * <pre> -M
 *  Replace missing values with mean/mode.
 * </pre>
 * 
 * <pre> -A <classname and options>
 *  Distance function to use.
 *  (default: weka.core.EuclideanDistance)</pre>
 * 
 * <pre> -I <num>
 *  Maximum number of iterations.
 * </pre>
 * 
 * <pre> -O
 *  Preserve order of instances.
 * </pre>
 * 
 * <pre> -fast
 *  Enables faster distance calculations, using cut-off values.
 *  Disables the calculation/output of squared errors/distances.
 * </pre>
 * 
 * <pre> -num-slots <num>
 *  Number of execution slots.
 *  (default 1 - i.e. no parallelism)</pre>
 * 
 * <pre> -S <num>
 *  Random number seed.
 *  (default 10)</pre>
 * 
 <!-- options-end -->
 * 
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @author Eibe Frank (eibe@cs.waikato.ac.nz)
 * @version $Revision: 9756 $
 * @see RandomizableClusterer
 */
public class SimpleKMeans extends RandomizableClusterer implements
    NumberOfClustersRequestable, WeightedInstancesHandler,
    TechnicalInformationHandler {

  /** for serialization. */
  static final long serialVersionUID = -3235809600124455376L;

  /**
   * replace missing values in training instances.
   */
  private ReplaceMissingValues m_ReplaceMissingFilter;

  /**
   * number of clusters to generate.
   */
  private int m_NumClusters = 2;

  /**
   * holds the cluster centroids.
   */
  private Instances m_ClusterCentroids;

  /**
   * Holds the standard deviations of the numeric attributes in each cluster.
   */
  private Instances m_ClusterStdDevs;

  /**
   * For each cluster, holds the frequency counts for the values of each nominal
   * attribute.
   */
  private int[][][] m_ClusterNominalCounts;
  private int[][] m_ClusterMissingCounts;

  /**
   * Stats on the full data set for comparison purposes. In case the attribute
   * is numeric the value is the mean if is being used the Euclidian distance or
   * the median if Manhattan distance and if the attribute is nominal then it's
   * mode is saved.
   */
  private double[] m_FullMeansOrMediansOrModes;
  private double[] m_FullStdDevs;
  private int[][] m_FullNominalCounts;
  private int[] m_FullMissingCounts;

  /**
   * Display standard deviations for numeric atts.
   */
  private boolean m_displayStdDevs;

  /**
   * Replace missing values globally?
   */
  private boolean m_dontReplaceMissing = false;

  /**
   * The number of instances in each cluster.
   */
  private int[] m_ClusterSizes;

  /**
   * Maximum number of iterations to be executed.
   */
  private int m_MaxIterations = 500;

  /**
   * Keep track of the number of iterations completed before convergence.
   */
  private int m_Iterations = 0;

  /**
   * Holds the squared errors for all clusters.
   */
  private double[] m_squaredErrors;

  /** the distance function used. */
  protected DistanceFunction m_DistanceFunction = new EuclideanDistance();

  /**
   * Preserve order of instances.
   */
  private boolean m_PreserveOrder = false;

  /**
   * Assignments obtained.
   */
  protected int[] m_Assignments = null;

  /** whether to use fast calculation of distances (using a cut-off). */
  protected boolean m_FastDistanceCalc = false;

  /** Whether to initialize cluster centers using the k-means++ method */
  protected boolean m_initializeWithKMeansPlusPlus = false;

  protected int m_executionSlots = 1;

  /** For parallel execution mode */
  protected transient ExecutorService m_executorPool;

  /**
   * the default constructor.
   */
  public SimpleKMeans() {
    super();

    m_SeedDefault = 10;
    setSeed(m_SeedDefault);
  }

  /**
   * Start the pool of execution threads
   */
  protected void startExecutorPool() {
    if (m_executorPool != null) {
      m_executorPool.shutdownNow();
    }

    m_executorPool = Executors.newFixedThreadPool(m_executionSlots);
  }

  protected int m_completed;
  protected int m_failed;

  @Override
  public TechnicalInformation getTechnicalInformation() {
    TechnicalInformation result;

    result = new TechnicalInformation(Type.INPROCEEDINGS);
    result.setValue(Field.AUTHOR, "D. Arthur and S. Vassilvitskii");
    result.setValue(Field.TITLE,
        "k-means++: the advantages of carefull seeding");
    result.setValue(Field.BOOKTITLE, "Proceedings of the eighteenth annual "
        + "ACM-SIAM symposium on Discrete algorithms");
    result.setValue(Field.YEAR, "2007");
    result.setValue(Field.PAGES, "1027-1035");

    return result;
  }

  /**
   * Returns a string describing this clusterer.
   * 
   * @return a description of the evaluator suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String globalInfo() {
    return "Cluster data using the k means algorithm. Can use either "
        + "the Euclidean distance (default) or the Manhattan distance."
        + " If the Manhattan distance is used, then centroids are computed "
        + "as the component-wise median rather than mean."
        + " For more information see:\n\n"
        + getTechnicalInformation().toString();
  }

  /**
   * Returns default capabilities of the clusterer.
   * 
   * @return the capabilities of this clusterer
   */
  @Override
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();
    result.disableAll();
    result.enable(Capability.NO_CLASS);

    // attributes
    result.enable(Capability.NOMINAL_ATTRIBUTES);
    result.enable(Capability.NUMERIC_ATTRIBUTES);
    result.enable(Capability.MISSING_VALUES);

    return result;
  }

  private class KMeansComputeCentroidTask implements Callable<double[]> {

    protected Instances m_cluster;
    protected int m_centroidIndex;

    public KMeansComputeCentroidTask(int centroidIndex, Instances cluster) {
      m_cluster = cluster;
      m_centroidIndex = centroidIndex;
    }

    @Override
    public double[] call() {
      return moveCentroid(m_centroidIndex, m_cluster, true, false);
    }
  }

  /**
   * Launch the move centroids tasks
   * 
   * @param clusters the cluster centroids
   * @return the number of empty clusters
   */
  protected int launchMoveCentroids(Instances[] clusters) {
    int emptyClusterCount = 0;
    List<Future<double[]>> results = new ArrayList<Future<double[]>>();

    for (int i = 0; i < m_NumClusters; i++) {
      if (clusters[i].numInstances() == 0) {
        emptyClusterCount++;
      } else {
        Future<double[]> futureCentroid = m_executorPool
            .submit(new KMeansComputeCentroidTask(i, clusters[i]));
        results.add(futureCentroid);
      }
    }

    try {
      for (Future<double[]> d : results) {
        m_ClusterCentroids.add(new DenseInstance(1.0, d.get()));
      }
    } catch (Exception ex) {
      ex.printStackTrace();
    }

    return emptyClusterCount;
  }

  private class KMeansClusterTask implements Callable<Boolean> {

    protected int m_start;
    protected int m_end;
    protected Instances m_inst;
    protected int[] m_clusterAssignments;

    public KMeansClusterTask(Instances inst, int start, int end,
        int[] clusterAssignments) {
      m_start = start;
      m_end = end;
      m_inst = inst;
      m_clusterAssignments = clusterAssignments;
    }

    @Override
    public Boolean call() {
      boolean converged = true;
      for (int i = m_start; i < m_end; i++) {
        Instance toCluster = m_inst.instance(i);
        int newC = clusterInstance(toCluster);
        if (newC != m_clusterAssignments[i]) {
          converged = false;
        }
        m_clusterAssignments[i] = newC;
      }

      return converged;
    }

    protected int clusterInstance(Instance inst) {
      double minDist = Integer.MAX_VALUE;
      int bestCluster = 0;
      for (int i = 0; i < m_NumClusters; i++) {
        double dist;
        dist = m_DistanceFunction.distance(inst,
            m_ClusterCentroids.instance(i), minDist);
        if (dist < minDist) {
          minDist = dist;
          bestCluster = i;
        }
      }

      return bestCluster;
    }
  }

  /**
   * Launch the tasks that assign instances to clusters
   * 
   * @param insts the instances to be clustered
   * @param clusterAssignments the array of cluster assignments
   * @return true if k means has converged
   * @throws Exception if a problem occurs
   */
  protected boolean launchAssignToClusters(Instances insts,
      int[] clusterAssignments) throws Exception {
    int numPerTask = insts.numInstances() / m_executionSlots;

    List<Future<Boolean>> results = new ArrayList<Future<Boolean>>();
    for (int i = 0; i < m_executionSlots; i++) {
      int start = i * numPerTask;
      int end = start + numPerTask;
      if (i == m_executionSlots - 1) {
        end = insts.numInstances();
      }

      Future<Boolean> futureKM = m_executorPool.submit(new KMeansClusterTask(
          insts, start, end, clusterAssignments));
      results.add(futureKM);
    }

    boolean converged = true;
    for (Future<Boolean> f : results) {
      if (!f.get()) {
        converged = false;
      }
    }

    return converged;
  }

  /**
   * Generates a clusterer. Has to initialize all fields of the clusterer that
   * are not being set via options.
   * 
   * @param data set of instances serving as training data
   * @throws Exception if the clusterer has not been generated successfully
   */
  @Override
  public void buildClusterer(Instances data) throws Exception {

    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    Instances instances = new Instances(data);

    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
      m_ReplaceMissingFilter.setInputFormat(instances);
      instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
      m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];

    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
      m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
      if (instances.attribute(i).isNumeric()) {
        if (m_displayStdDevs) {
          m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
        }
        if (m_FullMissingCounts[i] == instances.numInstances()) {
          m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
        }
      } else {
        m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
        if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils
            .maxIndex(m_FullNominalCounts[i])]) {
          m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common
                                               // value
        }
      }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];

    if (m_PreserveOrder)
      m_Assignments = clusterAssignments;

    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;

    Instances initInstances = null;
    if (m_PreserveOrder)
      initInstances = new Instances(instances);
    else
      initInstances = instances;

    if (m_initializeWithKMeansPlusPlus) {
      kMeansPlusPlusInit(initInstances);
    } else {
      for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
        instIndex = RandomO.nextInt(j + 1);
        hk = new DecisionTableHashKey(initInstances.instance(instIndex),
            initInstances.numAttributes(), true);
        if (!initC.containsKey(hk)) {
          m_ClusterCentroids.add(initInstances.instance(instIndex));
          initC.put(hk, null);
        }
        initInstances.swap(j, instIndex);

        if (m_ClusterCentroids.numInstances() == m_NumClusters) {
          break;
        }
      }
    }

    m_NumClusters = m_ClusterCentroids.numInstances();

    // removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
    startExecutorPool();

    while (!converged) {
      emptyClusterCount = 0;
      m_Iterations++;
      converged = true;

      if (m_executionSlots <= 1
          || instances.numInstances() < 2 * m_executionSlots) {
        for (i = 0; i < instances.numInstances(); i++) {
          Instance toCluster = instances.instance(i);
          int newC = clusterProcessedInstance(toCluster, false, true);
          if (newC != clusterAssignments[i]) {
            converged = false;
          }
          clusterAssignments[i] = newC;
        }
      } else {
        converged = launchAssignToClusters(instances, clusterAssignments);
      }

      // update centroids
      m_ClusterCentroids = new Instances(instances, m_NumClusters);
      for (i = 0; i < m_NumClusters; i++) {
        tempI[i] = new Instances(instances, 0);
      }
      for (i = 0; i < instances.numInstances(); i++) {
        tempI[clusterAssignments[i]].add(instances.instance(i));
      }
      if (m_executionSlots <= 1
          || instances.numInstances() < 2 * m_executionSlots) {
        for (i = 0; i < m_NumClusters; i++) {
          if (tempI[i].numInstances() == 0) {
            // empty cluster
            emptyClusterCount++;
          } else {
            moveCentroid(i, tempI[i], true, true);
          }
        }
      } else {
        emptyClusterCount = launchMoveCentroids(tempI);
      }

      if (m_Iterations == m_MaxIterations)
        converged = true;

      if (emptyClusterCount > 0) {
        m_NumClusters -= emptyClusterCount;
        if (converged) {
          Instances[] t = new Instances[m_NumClusters];
          int index = 0;
          for (int k = 0; k < tempI.length; k++) {
            if (tempI[k].numInstances() > 0) {
              t[index++] = tempI[k];
            }
          }
          tempI = t;
        } else {
          tempI = new Instances[m_NumClusters];
        }
      }

      if (!converged) {
        m_ClusterNominalCounts = new int[m_NumClusters][instances
            .numAttributes()][0];
      }
    }

    // calculate errors
    if (!m_FastDistanceCalc) {
      for (i = 0; i < instances.numInstances(); i++) {
        clusterProcessedInstance(instances.instance(i), true, false);
      }
    }

    if (m_displayStdDevs) {
      m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new int[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
      if (m_displayStdDevs) {
        double[] vals2 = new double[instances.numAttributes()];
        for (int j = 0; j < instances.numAttributes(); j++) {
          if (instances.attribute(j).isNumeric()) {
            vals2[j] = Math.sqrt(tempI[i].variance(j));
          } else {
            vals2[j] = Utils.missingValue();
          }
        }
        m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
      }
      m_ClusterSizes[i] = tempI[i].numInstances();
    }

    m_executorPool.shutdown();
  }

  protected void kMeansPlusPlusInit(Instances data) throws Exception {
    Random randomO = new Random(getSeed());
    HashMap<DecisionTableHashKey, String> initC = new HashMap<DecisionTableHashKey, String>();

    // choose initial center uniformly at random
    int index = randomO.nextInt(data.numInstances());
    m_ClusterCentroids.add(data.instance(index));
    DecisionTableHashKey hk = new DecisionTableHashKey(data.instance(index),
        data.numAttributes(), true);
    initC.put(hk, null);

    int iteration = 0;
    int remainingInstances = data.numInstances() - 1;
    if (m_NumClusters > 1) {
      // proceed with selecting the rest

      // distances to the initial randomly chose center
      double[] distances = new double[data.numInstances()];
      double[] cumProbs = new double[data.numInstances()];
      for (int i = 0; i < data.numInstances(); i++) {
        distances[i] = m_DistanceFunction.distance(data.instance(i),
            m_ClusterCentroids.instance(iteration));
      }

      // now choose the remaining cluster centers
      for (int i = 1; i < m_NumClusters; i++) {

        // distances converted to probabilities
        double[] weights = new double[data.numInstances()];
        System.arraycopy(distances, 0, weights, 0, distances.length);
        Utils.normalize(weights);

        double sumOfProbs = 0;
        for (int k = 0; k < data.numInstances(); k++) {
          sumOfProbs += weights[k];
          cumProbs[k] = sumOfProbs;
        }

        cumProbs[data.numInstances() - 1] = 1.0; // make sure there are no
                                                 // rounding issues

        // choose a random instance
        double prob = randomO.nextDouble();
        for (int k = 0; k < cumProbs.length; k++) {
          if (prob < cumProbs[k]) {
            Instance candidateCenter = data.instance(k);
            hk = new DecisionTableHashKey(candidateCenter,
                data.numAttributes(), true);
            if (!initC.containsKey(hk)) {
              initC.put(hk, null);
              m_ClusterCentroids.add(candidateCenter);
            } else {
              // we shouldn't get here because any instance that is a duplicate
              // of
              // an already chosen cluster center should have zero distance (and
              // hence
              // zero probability of getting chosen) to that center.
              System.err.println("We shouldn't get here....");
            }
            remainingInstances--;
            break;
          }
        }
        iteration++;

        if (remainingInstances == 0) {
          break;
        }

        // prepare to choose the next cluster center.
        // check distances against the new cluster center to see if it is closer
        for (int k = 0; k < data.numInstances(); k++) {
          if (distances[k] > 0) {
            double newDist = m_DistanceFunction.distance(data.instance(k),
                m_ClusterCentroids.instance(iteration));
            if (newDist < distances[k]) {
              distances[k] = newDist;
            }
          }
        }
      }
    }
  }

  /**
   * Move the centroid to it's new coordinates. Generate the centroid
   * coordinates based on it's members (objects assigned to the cluster of the
   * centroid) and the distance function being used.
   * 
   * @param centroidIndex index of the centroid which the coordinates will be
   *          computed
   * @param members the objects that are assigned to the cluster of this
   *          centroid
   * @param updateClusterInfo if the method is supposed to update the m_Cluster
   *          arrays
   * @param addToCentroidInstances true if the method is to add the computed
   *          coordinates to the Instances holding the centroids
   * @return the centroid coordinates
   */
  protected double[] moveCentroid(int centroidIndex, Instances members,
      boolean updateClusterInfo, boolean addToCentroidInstances) {
    double[] vals = new double[members.numAttributes()];

    // used only for Manhattan Distance
    Instances sortedMembers = null;
    int middle = 0;
    boolean dataIsEven = false;

    if (m_DistanceFunction instanceof ManhattanDistance) {
      middle = (members.numInstances() - 1) / 2;
      dataIsEven = ((members.numInstances() % 2) == 0);
      if (m_PreserveOrder) {
        sortedMembers = members;
      } else {
        sortedMembers = new Instances(members);
      }
    }

    for (int j = 0; j < members.numAttributes(); j++) {

      // in case of Euclidian distance the centroid is the mean point
      // in case of Manhattan distance the centroid is the median point
      // in both cases, if the attribute is nominal, the centroid is the mode
      if (m_DistanceFunction instanceof EuclideanDistance
          || members.attribute(j).isNominal()) {
        vals[j] = members.meanOrMode(j);
      } else if (m_DistanceFunction instanceof ManhattanDistance) {
        // singleton special case
        if (members.numInstances() == 1) {
          vals[j] = members.instance(0).value(j);
        } else {
          vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
          if (dataIsEven) {
            vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
          }
        }
      }

      if (updateClusterInfo) {
        m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
        m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
        if (members.attribute(j).isNominal()) {
          if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils
              .maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
            vals[j] = Utils.missingValue(); // mark mode as missing
          }
        } else {
          if (m_ClusterMissingCounts[centroidIndex][j] == members
              .numInstances()) {
            vals[j] = Utils.missingValue(); // mark mean as missing
          }
        }
      }
    }
    if (addToCentroidInstances) {
      m_ClusterCentroids.add(new DenseInstance(1.0, vals));
    }
    return vals;
  }

  /**
   * clusters an instance that has been through the filters.
   * 
   * @param instance the instance to assign a cluster to
   * @param updateErrors if true, update the within clusters sum of errors
   * @param useFastDistCalc whether to use the fast distance calculation or not
   * @return a cluster number
   */
  private int clusterProcessedInstance(Instance instance, boolean updateErrors,
      boolean useFastDistCalc) {
    double minDist = Integer.MAX_VALUE;
    int bestCluster = 0;
    for (int i = 0; i < m_NumClusters; i++) {
      double dist;
      if (useFastDistCalc)
        dist = m_DistanceFunction.distance(instance,
            m_ClusterCentroids.instance(i), minDist);
      else
        dist = m_DistanceFunction.distance(instance,
            m_ClusterCentroids.instance(i));
      if (dist < minDist) {
        minDist = dist;
        bestCluster = i;
      }
    }
    if (updateErrors) {
      if (m_DistanceFunction instanceof EuclideanDistance) {
        // Euclidean distance to Squared Euclidean distance
        minDist *= minDist;
      }
      m_squaredErrors[bestCluster] += minDist;
    }
    return bestCluster;
  }

  /**
   * Classifies a given instance.
   * 
   * @param instance the instance to be assigned to a cluster
   * @return the number of the assigned cluster as an interger if the class is
   *         enumerated, otherwise the predicted value
   * @throws Exception if instance could not be classified successfully
   */
  @Override
  public int clusterInstance(Instance instance) throws Exception {
    Instance inst = null;
    if (!m_dontReplaceMissing) {
      m_ReplaceMissingFilter.input(instance);
      m_ReplaceMissingFilter.batchFinished();
      inst = m_ReplaceMissingFilter.output();
    } else {
      inst = instance;
    }

    return clusterProcessedInstance(inst, false, true);
  }

  /**
   * Returns the number of clusters.
   * 
   * @return the number of clusters generated for a training dataset.
   * @throws Exception if number of clusters could not be returned successfully
   */
  @Override
  public int numberOfClusters() throws Exception {
    return m_NumClusters;
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options.
   */
  @Override
  public Enumeration listOptions() {
    Vector result = new Vector();

    result.addElement(new Option("\tnumber of clusters.\n" + "\t(default 2).",
        "N", 1, "-N <num>"));

    result.addElement(new Option("\tInitialize using the k-means++ method.\n",
        "P", 0, "-P"));

    result.addElement(new Option("\tDisplay std. deviations for centroids.\n",
        "V", 0, "-V"));
    result.addElement(new Option("\tDon't replace missing values with mean/mode.\n",
        "M", 0, "-M"));

    result.add(new Option("\tDistance function to use.\n"
        + "\t(default: weka.core.EuclideanDistance)", "A", 1,
        "-A <classname and options>"));

    result.add(new Option("\tMaximum number of iterations.\n", "I", 1,
        "-I <num>"));

    result.addElement(new Option("\tPreserve order of instances.\n", "O", 0,
        "-O"));

    result
        .addElement(new Option(
            "\tEnables faster distance calculations, using cut-off values.\n"
                + "\tDisables the calculation/output of squared errors/distances.\n",
            "fast", 0, "-fast"));

    result.addElement(new Option("\tNumber of execution slots.\n"
        + "\t(default 1 - i.e. no parallelism)", "num-slots", 1,
        "-num-slots <num>"));

    Enumeration en = super.listOptions();
    while (en.hasMoreElements())
      result.addElement(en.nextElement());

    return result.elements();
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String numClustersTipText() {
    return "set number of clusters";
  }

  /**
   * set the number of clusters to generate.
   * 
   * @param n the number of clusters to generate
   * @throws Exception if number of clusters is negative
   */
  @Override
  public void setNumClusters(int n) throws Exception {
    if (n <= 0) {
      throw new Exception("Number of clusters must be > 0");
    }
    m_NumClusters = n;
  }

  /**
   * gets the number of clusters to generate.
   * 
   * @return the number of clusters to generate
   */
  public int getNumClusters() {
    return m_NumClusters;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String initializeUsingKMeansPlusPlusMethodTipText() {
    return "Initialize cluster centers using the probabilistic "
        + " farthest first method of the k-means++ algorithm";
  }

  /**
   * Set whether to initialize using the probabilistic farthest first like
   * method of the k-means++ algorithm (rather than the standard random
   * selection of initial cluster centers).
   * 
   * @param k true if the k-means++ method is to be used to select initial
   *          cluster centers.
   */
  public void setInitializeUsingKMeansPlusPlusMethod(boolean k) {
    m_initializeWithKMeansPlusPlus = k;
  }

  /**
   * Get whether to initialize using the probabilistic farthest first like
   * method of the k-means++ algorithm (rather than the standard random
   * selection of initial cluster centers).
   * 
   * @return true if the k-means++ method is to be used to select initial
   *         cluster centers.
   */
  public boolean getInitializeUsingKMeansPlusPlusMethod() {
    return m_initializeWithKMeansPlusPlus;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String maxIterationsTipText() {
    return "set maximum number of iterations";
  }

  /**
   * set the maximum number of iterations to be executed.
   * 
   * @param n the maximum number of iterations
   * @throws Exception if maximum number of iteration is smaller than 1
   */
  public void setMaxIterations(int n) throws Exception {
    if (n <= 0) {
      throw new Exception("Maximum number of iterations must be > 0");
    }
    m_MaxIterations = n;
  }

  /**
   * gets the number of maximum iterations to be executed.
   * 
   * @return the number of clusters to generate
   */
  public int getMaxIterations() {
    return m_MaxIterations;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String displayStdDevsTipText() {
    return "Display std deviations of numeric attributes "
        + "and counts of nominal attributes.";
  }

  /**
   * Sets whether standard deviations and nominal count. Should be displayed in
   * the clustering output.
   * 
   * @param stdD true if std. devs and counts should be displayed
   */
  public void setDisplayStdDevs(boolean stdD) {
    m_displayStdDevs = stdD;
  }

  /**
   * Gets whether standard deviations and nominal count. Should be displayed in
   * the clustering output.
   * 
   * @return true if std. devs and counts should be displayed
   */
  public boolean getDisplayStdDevs() {
    return m_displayStdDevs;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String dontReplaceMissingValuesTipText() {
    return "Replace missing values globally with mean/mode.";
  }

  /**
   * Sets whether missing values are to be replaced.
   * 
   * @param r true if missing values are to be replaced
   */
  public void setDontReplaceMissingValues(boolean r) {
    m_dontReplaceMissing = r;
  }

  /**
   * Gets whether missing values are to be replaced.
   * 
   * @return true if missing values are to be replaced
   */
  public boolean getDontReplaceMissingValues() {
    return m_dontReplaceMissing;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String distanceFunctionTipText() {
    return "The distance function to use for instances comparison "
        + "(default: weka.core.EuclideanDistance). ";
  }

  /**
   * returns the distance function currently in use.
   * 
   * @return the distance function
   */
  public DistanceFunction getDistanceFunction() {
    return m_DistanceFunction;
  }

  /**
   * sets the distance function to use for instance comparison.
   * 
   * @param df the new distance function to use
   * @throws Exception if instances cannot be processed
   */
  public void setDistanceFunction(DistanceFunction df) throws Exception {
    if (!(df instanceof EuclideanDistance)
        && !(df instanceof ManhattanDistance)) {
      throw new Exception(
          "SimpleKMeans currently only supports the Euclidean and Manhattan distances.");
    }
    m_DistanceFunction = df;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String preserveInstancesOrderTipText() {
    return "Preserve order of instances.";
  }

  /**
   * Sets whether order of instances must be preserved.
   * 
   * @param r true if missing values are to be replaced
   */
  public void setPreserveInstancesOrder(boolean r) {
    m_PreserveOrder = r;
  }

  /**
   * Gets whether order of instances must be preserved.
   * 
   * @return true if missing values are to be replaced
   */
  public boolean getPreserveInstancesOrder() {
    return m_PreserveOrder;
  }

  /**
   * Returns the tip text for this property.
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String fastDistanceCalcTipText() {
    return "Uses cut-off values for speeding up distance calculation, but "
        + "suppresses also the calculation and output of the within cluster sum "
        + "of squared errors/sum of distances.";
  }

  /**
   * Sets whether to use faster distance calculation.
   * 
   * @param value true if faster calculation to be used
   */
  public void setFastDistanceCalc(boolean value) {
    m_FastDistanceCalc = value;
  }

  /**
   * Gets whether to use faster distance calculation.
   * 
   * @return true if faster calculation is used
   */
  public boolean getFastDistanceCalc() {
    return m_FastDistanceCalc;
  }

  /**
   * Returns the tip text for this property
   * 
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String numExecutionSlotsTipText() {
    return "The number of execution slots (threads) to use. "
        + "Set equal to the number of available cpu/cores";
  }

  /**
   * Set the degree of parallelism to use.
   * 
   * @param slots the number of tasks to run in parallel when computing the
   *          nearest neighbors and evaluating different values of k between the
   *          lower and upper bounds
   */
  public void setNumExecutionSlots(int slots) {
    m_executionSlots = slots;
  }

  /**
   * Get the degree of parallelism to use.
   * 
   * @return the number of tasks to run in parallel when computing the nearest
   *         neighbors and evaluating different values of k between the lower
   *         and upper bounds
   */
  public int getNumExecutionSlots() {
    return m_executionSlots;
  }

  /**
   * Parses a given list of options.
   * <p/>
   * 
   <!-- options-start -->
   * Valid options are: <p/>
   * 
   * <pre> -N <num>
   *  number of clusters.
   *  (default 2).</pre>
   * 
   * <pre> -P
   *  Initialize using the k-means++ method.
   * </pre>
   * 
   * <pre> -V
   *  Display std. deviations for centroids.
   * </pre>
   * 
   * <pre> -M
   *  Replace missing values with mean/mode.
   * </pre>
   * 
   * <pre> -A <classname and options>
   *  Distance function to use.
   *  (default: weka.core.EuclideanDistance)</pre>
   * 
   * <pre> -I <num>
   *  Maximum number of iterations.
   * </pre>
   * 
   * <pre> -O
   *  Preserve order of instances.
   * </pre>
   * 
   * <pre> -fast
   *  Enables faster distance calculations, using cut-off values.
   *  Disables the calculation/output of squared errors/distances.
   * </pre>
   * 
   * <pre> -num-slots <num>
   *  Number of execution slots.
   *  (default 1 - i.e. no parallelism)</pre>
   * 
   * <pre> -S <num>
   *  Random number seed.
   *  (default 10)</pre>
   * 
   <!-- options-end -->
   * 
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported
   */
  @Override
  public void setOptions(String[] options) throws Exception {

    m_displayStdDevs = Utils.getFlag("V", options);
    m_dontReplaceMissing = Utils.getFlag("M", options);
    m_initializeWithKMeansPlusPlus = Utils.getFlag('P', options);

    String optionString = Utils.getOption('N', options);

    if (optionString.length() != 0) {
      setNumClusters(Integer.parseInt(optionString));
    }

    optionString = Utils.getOption("I", options);
    if (optionString.length() != 0) {
      setMaxIterations(Integer.parseInt(optionString));
    }

    String distFunctionClass = Utils.getOption('A', options);
    if (distFunctionClass.length() != 0) {
      String distFunctionClassSpec[] = Utils.splitOptions(distFunctionClass);
      if (distFunctionClassSpec.length == 0) {
        throw new Exception("Invalid DistanceFunction specification string.");
      }
      String className = distFunctionClassSpec[0];
      distFunctionClassSpec[0] = "";

      setDistanceFunction((DistanceFunction) Utils.forName(
          DistanceFunction.class, className, distFunctionClassSpec));
    } else {
      setDistanceFunction(new EuclideanDistance());
    }

    m_PreserveOrder = Utils.getFlag("O", options);

    m_FastDistanceCalc = Utils.getFlag("fast", options);

    String slotsS = Utils.getOption("num-slots", options);
    if (slotsS.length() > 0) {
      setNumExecutionSlots(Integer.parseInt(slotsS));
    }

    super.setOptions(options);
  }

  /**
   * Gets the current settings of SimpleKMeans.
   * 
   * @return an array of strings suitable for passing to setOptions()
   */
  @Override
  public String[] getOptions() {
    int i;
    Vector result;
    String[] options;

    result = new Vector();

    if (m_initializeWithKMeansPlusPlus) {
      result.add("-P");
    }

    if (m_displayStdDevs) {
      result.add("-V");
    }

    if (m_dontReplaceMissing) {
      result.add("-M");
    }

    result.add("-N");
    result.add("" + getNumClusters());

    result.add("-A");
    result.add((m_DistanceFunction.getClass().getName() + " " + Utils
        .joinOptions(m_DistanceFunction.getOptions())).trim());

    result.add("-I");
    result.add("" + getMaxIterations());

    if (m_PreserveOrder) {
      result.add("-O");
    }

    if (m_FastDistanceCalc) {
      result.add("-fast");
    }

    result.add("-num-slots");
    result.add("" + getNumExecutionSlots());

    options = super.getOptions();
    for (i = 0; i < options.length; i++)
      result.add(options[i]);

    return (String[]) result.toArray(new String[result.size()]);
  }

  /**
   * return a string describing this clusterer.
   * 
   * @return a description of the clusterer as a string
   */
  @Override
  public String toString() {
    if (m_ClusterCentroids == null) {
      return "No clusterer built yet!";
    }

    int maxWidth = 0;
    int maxAttWidth = 0;
    boolean containsNumeric = false;
    for (int i = 0; i < m_NumClusters; i++) {
      for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
        if (m_ClusterCentroids.attribute(j).name().length() > maxAttWidth) {
          maxAttWidth = m_ClusterCentroids.attribute(j).name().length();
        }
        if (m_ClusterCentroids.attribute(j).isNumeric()) {
          containsNumeric = true;
          double width = Math.log(Math.abs(m_ClusterCentroids.instance(i)
              .value(j))) / Math.log(10.0);
          // System.err.println(m_ClusterCentroids.instance(i).value(j)+" "+width);
          if (width < 0) {
            width = 1;
          }
          // decimal + # decimal places + 1
          width += 6.0;
          if ((int) width > maxWidth) {
            maxWidth = (int) width;
          }
        }
      }
    }

    for (int i = 0; i < m_ClusterCentroids.numAttributes(); i++) {
      if (m_ClusterCentroids.attribute(i).isNominal()) {
        Attribute a = m_ClusterCentroids.attribute(i);
        for (int j = 0; j < m_ClusterCentroids.numInstances(); j++) {
          String val = a.value((int) m_ClusterCentroids.instance(j).value(i));
          if (val.length() > maxWidth) {
            maxWidth = val.length();
          }
        }
        for (int j = 0; j < a.numValues(); j++) {
          String val = a.value(j) + " ";
          if (val.length() > maxAttWidth) {
            maxAttWidth = val.length();
          }
        }
      }
    }

    if (m_displayStdDevs) {
      // check for maximum width of maximum frequency count
      for (int i = 0; i < m_ClusterCentroids.numAttributes(); i++) {
        if (m_ClusterCentroids.attribute(i).isNominal()) {
          int maxV = Utils.maxIndex(m_FullNominalCounts[i]);
          /*
           * int percent = (int)((double)m_FullNominalCounts[i][maxV] /
           * Utils.sum(m_ClusterSizes) * 100.0);
           */
          int percent = 6; // max percent width (100%)
          String nomV = "" + m_FullNominalCounts[i][maxV];
          // + " (" + percent + "%)";
          if (nomV.length() + percent > maxWidth) {
            maxWidth = nomV.length() + 1;
          }
        }
      }
    }

    // check for size of cluster sizes
    for (int i = 0; i < m_ClusterSizes.length; i++) {
      String size = "(" + m_ClusterSizes[i] + ")";
      if (size.length() > maxWidth) {
        maxWidth = size.length();
      }
    }

    if (m_displayStdDevs && maxAttWidth < "missing".length()) {
      maxAttWidth = "missing".length();
    }

    String plusMinus = "+/-";
    maxAttWidth += 2;
    if (m_displayStdDevs && containsNumeric) {
      maxWidth += plusMinus.length();
    }
    if (maxAttWidth < "Attribute".length() + 2) {
      maxAttWidth = "Attribute".length() + 2;
    }

    if (maxWidth < "Full Data".length()) {
      maxWidth = "Full Data".length() + 1;
    }

    if (maxWidth < "missing".length()) {
      maxWidth = "missing".length() + 1;
    }

    StringBuffer temp = new StringBuffer();
    temp.append("\nkMeans\n======\n");
    temp.append("\nNumber of iterations: " + m_Iterations);

    if (!m_FastDistanceCalc) {
      temp.append("\n");
      if (m_DistanceFunction instanceof EuclideanDistance) {
        temp.append("Within cluster sum of squared errors: "
            + Utils.sum(m_squaredErrors));
      } else {
        temp.append("Sum of within cluster distances: "
            + Utils.sum(m_squaredErrors));
      }
    }

    if (!m_dontReplaceMissing) {
      temp.append("\nMissing values globally replaced with mean/mode");
    }

    temp.append("\n\nCluster centroids:\n");
    temp.append(pad("Cluster#", " ", (maxAttWidth + (maxWidth * 2 + 2))
        - "Cluster#".length(), true));

    temp.append("\n");
    temp.append(pad("Attribute", " ", maxAttWidth - "Attribute".length(), false));

    temp.append(pad("Full Data", " ", maxWidth + 1 - "Full Data".length(), true));

    // cluster numbers
    for (int i = 0; i < m_NumClusters; i++) {
      String clustNum = "" + i;
      temp.append(pad(clustNum, " ", maxWidth + 1 - clustNum.length(), true));
    }
    temp.append("\n");

    // cluster sizes
    String cSize = "(" + Utils.sum(m_ClusterSizes) + ")";
    temp.append(pad(cSize, " ", maxAttWidth + maxWidth + 1 - cSize.length(),
        true));
    for (int i = 0; i < m_NumClusters; i++) {
      cSize = "(" + m_ClusterSizes[i] + ")";
      temp.append(pad(cSize, " ", maxWidth + 1 - cSize.length(), true));
    }
    temp.append("\n");

    temp.append(pad("", "=",
        maxAttWidth
            + (maxWidth * (m_ClusterCentroids.numInstances() + 1)
                + m_ClusterCentroids.numInstances() + 1), true));
    temp.append("\n");

    for (int i = 0; i < m_ClusterCentroids.numAttributes(); i++) {
      String attName = m_ClusterCentroids.attribute(i).name();
      temp.append(attName);
      for (int j = 0; j < maxAttWidth - attName.length(); j++) {
        temp.append(" ");
      }

      String strVal;
      String valMeanMode;
      // full data
      if (m_ClusterCentroids.attribute(i).isNominal()) {
        if (m_FullMeansOrMediansOrModes[i] == -1) { // missing
          valMeanMode = pad("missing", " ", maxWidth + 1 - "missing".length(),
              true);
        } else {
          valMeanMode = pad(
              (strVal = m_ClusterCentroids.attribute(i).value(
                  (int) m_FullMeansOrMediansOrModes[i])), " ", maxWidth + 1
                  - strVal.length(), true);
        }
      } else {
        if (Double.isNaN(m_FullMeansOrMediansOrModes[i])) {
          valMeanMode = pad("missing", " ", maxWidth + 1 - "missing".length(),
              true);
        } else {
          valMeanMode = pad(
              (strVal = Utils.doubleToString(m_FullMeansOrMediansOrModes[i],
                  maxWidth, 4).trim()), " ", maxWidth + 1 - strVal.length(),
              true);
        }
      }
      temp.append(valMeanMode);

      for (int j = 0; j < m_NumClusters; j++) {
        if (m_ClusterCentroids.attribute(i).isNominal()) {
          if (m_ClusterCentroids.instance(j).isMissing(i)) {
            valMeanMode = pad("missing", " ",
                maxWidth + 1 - "missing".length(), true);
          } else {
            valMeanMode = pad(
                (strVal = m_ClusterCentroids.attribute(i).value(
                    (int) m_ClusterCentroids.instance(j).value(i))), " ",
                maxWidth + 1 - strVal.length(), true);
          }
        } else {
          if (m_ClusterCentroids.instance(j).isMissing(i)) {
            valMeanMode = pad("missing", " ",
                maxWidth + 1 - "missing".length(), true);
          } else {
            valMeanMode = pad(
                (strVal = Utils.doubleToString(
                    m_ClusterCentroids.instance(j).value(i), maxWidth, 4)
                    .trim()), " ", maxWidth + 1 - strVal.length(), true);
          }
        }
        temp.append(valMeanMode);
      }
      temp.append("\n");

      if (m_displayStdDevs) {
        // Std devs/max nominal
        String stdDevVal = "";

        if (m_ClusterCentroids.attribute(i).isNominal()) {
          // Do the values of the nominal attribute
          Attribute a = m_ClusterCentroids.attribute(i);
          for (int j = 0; j < a.numValues(); j++) {
            // full data
            String val = "  " + a.value(j);
            temp.append(pad(val, " ", maxAttWidth + 1 - val.length(), false));
            int count = m_FullNominalCounts[i][j];
            int percent = (int) ((double) m_FullNominalCounts[i][j]
                / Utils.sum(m_ClusterSizes) * 100.0);
            String percentS = "" + percent + "%)";
            percentS = pad(percentS, " ", 5 - percentS.length(), true);
            stdDevVal = "" + count + " (" + percentS;
            stdDevVal = pad(stdDevVal, " ", maxWidth + 1 - stdDevVal.length(),
                true);
            temp.append(stdDevVal);

            // Clusters
            for (int k = 0; k < m_NumClusters; k++) {
              count = m_ClusterNominalCounts[k][i][j];
              percent = (int) ((double) m_ClusterNominalCounts[k][i][j]
                  / m_ClusterSizes[k] * 100.0);
              percentS = "" + percent + "%)";
              percentS = pad(percentS, " ", 5 - percentS.length(), true);
              stdDevVal = "" + count + " (" + percentS;
              stdDevVal = pad(stdDevVal, " ",
                  maxWidth + 1 - stdDevVal.length(), true);
              temp.append(stdDevVal);
            }
            temp.append("\n");
          }
          // missing (if any)
          if (m_FullMissingCounts[i] > 0) {
            // Full data
            temp.append(pad("  missing", " ",
                maxAttWidth + 1 - "  missing".length(), false));
            int count = m_FullMissingCounts[i];
            int percent = (int) ((double) m_FullMissingCounts[i]
                / Utils.sum(m_ClusterSizes) * 100.0);
            String percentS = "" + percent + "%)";
            percentS = pad(percentS, " ", 5 - percentS.length(), true);
            stdDevVal = "" + count + " (" + percentS;
            stdDevVal = pad(stdDevVal, " ", maxWidth + 1 - stdDevVal.length(),
                true);
            temp.append(stdDevVal);

            // Clusters
            for (int k = 0; k < m_NumClusters; k++) {
              count = m_ClusterMissingCounts[k][i];
              percent = (int) ((double) m_ClusterMissingCounts[k][i]
                  / m_ClusterSizes[k] * 100.0);
              percentS = "" + percent + "%)";
              percentS = pad(percentS, " ", 5 - percentS.length(), true);
              stdDevVal = "" + count + " (" + percentS;
              stdDevVal = pad(stdDevVal, " ",
                  maxWidth + 1 - stdDevVal.length(), true);
              temp.append(stdDevVal);
            }

            temp.append("\n");
          }

          temp.append("\n");
        } else {
          // Full data
          if (Double.isNaN(m_FullMeansOrMediansOrModes[i])) {
            stdDevVal = pad("--", " ", maxAttWidth + maxWidth + 1 - 2, true);
          } else {
            stdDevVal = pad(
                (strVal = plusMinus
                    + Utils.doubleToString(m_FullStdDevs[i], maxWidth, 4)
                        .trim()), " ",
                maxWidth + maxAttWidth + 1 - strVal.length(), true);
          }
          temp.append(stdDevVal);

          // Clusters
          for (int j = 0; j < m_NumClusters; j++) {
            if (m_ClusterCentroids.instance(j).isMissing(i)) {
              stdDevVal = pad("--", " ", maxWidth + 1 - 2, true);
            } else {
              stdDevVal = pad(
                  (strVal = plusMinus
                      + Utils.doubleToString(
                          m_ClusterStdDevs.instance(j).value(i), maxWidth, 4)
                          .trim()), " ", maxWidth + 1 - strVal.length(), true);
            }
            temp.append(stdDevVal);
          }
          temp.append("\n\n");
        }
      }
    }

    temp.append("\n\n");
    return temp.toString();
  }

  private String pad(String source, String padChar, int length, boolean leftPad) {
    StringBuffer temp = new StringBuffer();

    if (leftPad) {
      for (int i = 0; i < length; i++) {
        temp.append(padChar);
      }
      temp.append(source);
    } else {
      temp.append(source);
      for (int i = 0; i < length; i++) {
        temp.append(padChar);
      }
    }
    return temp.toString();
  }

  /**
   * Gets the the cluster centroids.
   * 
   * @return the cluster centroids
   */
  public Instances getClusterCentroids() {
    return m_ClusterCentroids;
  }

  /**
   * Gets the standard deviations of the numeric attributes in each cluster.
   * 
   * @return the standard deviations of the numeric attributes in each cluster
   */
  public Instances getClusterStandardDevs() {
    return m_ClusterStdDevs;
  }

  /**
   * Returns for each cluster the frequency counts for the values of each
   * nominal attribute.
   * 
   * @return the counts
   */
  public int[][][] getClusterNominalCounts() {
    return m_ClusterNominalCounts;
  }

  /**
   * Gets the squared error for all clusters.
   * 
   * @return the squared error, NaN if fast distance calculation is used
   * @see #m_FastDistanceCalc
   */
  public double getSquaredError() {
    if (m_FastDistanceCalc)
      return Double.NaN;
    else
      return Utils.sum(m_squaredErrors);
  }

  /**
   * Gets the number of instances in each cluster.
   * 
   * @return The number of instances in each cluster
   */
  public int[] getClusterSizes() {
    return m_ClusterSizes;
  }

  /**
   * Gets the assignments for each instance.
   * 
   * @return Array of indexes of the centroid assigned to each instance
   * @throws Exception if order of instances wasn't preserved or no assignments
   *           were made
   */
  public int[] getAssignments() throws Exception {
    if (!m_PreserveOrder) {
      throw new Exception(
          "The assignments are only available when order of instances is preserved (-O)");
    }
    if (m_Assignments == null) {
      throw new Exception("No assignments made.");
    }
    return m_Assignments;
  }

  /**
   * Returns the revision string.
   * 
   * @return the revision
   */
  @Override
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 9756 $");
  }

  /**
   * Main method for executing this class.
   * 
   * @param args use -h to list all parameters
   */
  public static void main(String[] args) {
    runClusterer(new SimpleKMeans(), args);
  }
}