/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * PCKMeans.java * Copyright (C) 2003 Sugato Basu * */ package weka.clusterers; import java.io.*; import java.util.*; import weka.core.*; import weka.core.metrics.*; import weka.filters.Filter; import weka.filters.unsupervised.attribute.Remove; /** * Pairwise constrained k means clustering class. * * Valid options are:<p> * * -N <number of clusters> <br> * Specify the number of clusters to generate. <p> * * -R <random seed> <br> * Specify random number seed <p> * * -A <algorithm> <br> * The algorithm can be "Simple" (simple KMeans) or "Spherical" (spherical KMeans) * * -M <metric-class> <br> * Specifies the name of the distance metric class that should be used * * .... etc. * * @author Sugato Basu(sugato@cs.utexas.edu) * @see Clusterer * @see OptionHandler */ public class PCKMeans extends Clusterer implements OptionHandler,SemiSupClusterer,ActiveLearningClusterer { /** Name of clusterer */ String m_name = "PCKMeans"; /** holds the instances in the clusters */ protected ArrayList m_Clusters = null; /** holds the instance indices in the clusters */ protected HashSet[] m_IndexClusters = null; /** holds the ([instance pair] -> [type of constraint]) mapping. Note that the instance pairs stored in the hash always have constraint type InstancePair.DONT_CARE_LINK, the actual link type is stored in the hashed value */ protected HashMap m_ConstraintsHash = null; /** holds the ([instance i] -> [Arraylist of constraints involving i]) mapping. Note that the instance pairs stored in the Arraylist have the actual link type */ protected HashMap m_instanceConstraintHash = null; /** adjacency list for neighborhoods */ protected HashSet[] m_AdjacencyList; /** colors required for keeping track of DFS visit */ final int WHITE = 300; final int GRAY = 301; final int BLACK = 302; /** holds the points involved in the constraints */ protected HashSet m_SeedHash = null; /** weight to be given to each constraint */ protected double m_CannotLinkWeight = 1; /** weight to be given to each constraint */ protected double m_MustLinkWeight = 1; /** the maximum number of cannot-link constraints allowed */ protected final static int m_MaxConstraintsAllowed = 10000; /** verbose? */ protected boolean m_verbose = false; /** distance Metric */ protected Metric m_metric = new WeightedEuclidean(); /** has the metric has been constructed? a fix for multiple buildClusterer's */ protected boolean m_metricBuilt = false; /** indicates whether instances are sparse */ protected boolean m_isSparseInstance = false; /** Is the objective function increasing or decreasing? 
      Depends on the type of metric used: for a similarity-based metric - increasing,
      for a distance-based metric - decreasing */
  protected boolean m_objFunDecreasing = true;

  /** Seedable or not (true by default) */
  protected boolean m_Seedable = true;

  /** Round robin or Random in active Phase Two */
  protected boolean m_PhaseTwoRandom = false;

  /** Two-phase active learning or All Explore */
  protected boolean m_AllExplore = false;

  /** keeps track of the number of iterations completed before convergence */
  protected int m_Iterations = 0;

  /** Define possible algorithms */
  public static final int ALGORITHM_SIMPLE = 1;
  public static final int ALGORITHM_SPHERICAL = 2;
  public static final Tag[] TAGS_ALGORITHM = {
    new Tag(ALGORITHM_SIMPLE, "Simple"),
    new Tag(ALGORITHM_SPHERICAL, "Spherical")
  };

  /** algorithm, simple k-means by default */
  protected int m_Algorithm = ALGORITHM_SIMPLE;

  /** min difference of objective function values for convergence */
  protected double m_ObjFunConvergenceDifference = 1e-5;

  /** value of objective function */
  protected double m_Objective;

  /** returns objective function */
  public double objectiveFunction() {
    return m_Objective;
  }

  /** training instances with labels */
  protected Instances m_TotalTrainWithLabels;

  /** training instances */
  protected Instances m_Instances;

  /** A hash where the instance checksums are hashed */
  protected HashMap m_checksumHash = null;
  protected double [] m_checksumCoeffs = null;

  /** test data -- required to make sure that test points are not selected during active learning */
  protected int m_StartingIndexOfTest = -1;

  /** number of pairs to seed with */
  protected int m_NumActive;

  /** active mode? */
  protected boolean m_Active = false;

  /** number of clusters to generate; default is 3, overridden by the number of
      classes in the labeled data when clustering with constraints */
  protected int m_NumClusters = 3;

  /** number of clusters currently formed in the process */
  protected int m_NumCurrentClusters = 0;

  /** holds the cluster centroids */
  protected Instances m_ClusterCentroids;

  /** holds the global centroid */
  protected Instance m_GlobalCentroid;

  /** holds the default perturbation value for randomPerturbInit */
  protected double m_DefaultPerturb = 0.7;

  /** holds the default merge threshold for matchMergeStep */
  protected double m_MergeThreshold = 0.15;

  /** temporary variable holding cluster assignments while iterating */
  protected int [] m_ClusterAssignments;

  /** temporary variable holding cluster sums while iterating */
  protected Instance [] m_SumOfClusterInstances;

  /** holds the random seed used to seed the random number generator */
  protected int m_RandomSeed = 42;

  /** holds the random number generator used in various parts of the code */
  protected Random m_RandomNumberGenerator = null;

  /** Define possible orderings */
  public static final int ORDERING_DEFAULT = 1;
  public static final int ORDERING_RANDOM = 2;
  public static final int ORDERING_SORTED = 3;
  public static final Tag[] TAGS_ORDERING = {
    new Tag(ORDERING_DEFAULT, "Default-Ordering"),
    new Tag(ORDERING_RANDOM, "Random-Ordering"),
    new Tag(ORDERING_SORTED, "Sorted-Ordering")
  };
  protected int m_InstanceOrdering = ORDERING_DEFAULT;

  /** Move points in assignment step till stabilization?
   */
  protected boolean m_MovePointsTillAssignmentStabilizes = false;

  /** neighbor list for active learning: points in each cluster neighborhood */
  protected HashSet[] m_NeighborSets;

  /** assigned set for active learning: whether a point has been assigned or not */
  HashSet m_AssignedSet;

  /* Constructor */
  public PCKMeans() {
  }

  /* Constructor */
  public PCKMeans(Metric metric) {
    m_metric = metric;
    m_objFunDecreasing = metric.isDistanceBased();
  }

  /**
   * We always want to implement SemiSupClusterer from a class extending Clusterer.
   * We want to be able to return the underlying parent class.
   * @return parent Clusterer class
   */
  public Clusterer getThisClusterer() {
    return this;
  }

  /**
   * Cluster given instances to form the specified number of clusters.
   *
   * @param data instances to be clustered
   * @param num_clusters number of clusters to create
   * @exception Exception if something goes wrong.
   */
  public void buildClusterer(Instances data, int num_clusters) throws Exception {
    m_NumClusters = num_clusters;
    if (m_Algorithm == ALGORITHM_SPHERICAL && m_metric instanceof WeightedDotP) {
      // since instances and clusters are already normalized, we don't need to
      // normalize again while computing similarity - saves time
      ((WeightedDotP)m_metric).setLengthNormalized(false);
    }
    if (data.instance(0) instanceof SparseInstance) {
      m_isSparseInstance = true;
    }
    buildClusterer(data);
  }

  /**
   * Generates the clustering using labeled seeds
   *
   * @param labeledData set of labeled instances to use as seeds
   * @param unlabeledData set of unlabeled instances
   * @param classIndex attribute index in labeledData which holds class info
   * @param numClusters number of clusters to create
   * @param startingIndexOfTest index where test data starts in unlabeledData, useful if
   *        clustering is transductive; set to -1 if not relevant
   * @exception Exception if something is wrong
   */
  public void buildClusterer(Instances labeledData, Instances unlabeledData, int classIndex,
                             int numClusters, int startingIndexOfTest) throws Exception {
    // Dummy function
    throw new Exception("Not implemented for PCKMeans, only here for compatibility to the SemiSupClusterer interface");
  }

  /**
   * Clusters unlabeledData and labeledTrain (with labels removed),
   * using labeledPairs as pairwise constraints
   *
   * @param labeledPairs labeled pairs of instances to be used as constraints
   * @param unlabeledData unlabeled training (+ test for transductive) instances
   * @param labeledTrain labeled training instances
   * @param startingIndexOfTest starting index of test set in unlabeled data
   * @exception Exception if something goes wrong.
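   *
   * <p> A rough calling sketch (hypothetical surrounding code; only the
   * InstancePair constants, the metric class, and this method's signature are
   * taken from this class, and the index values are made up for illustration):
   * <pre>
   *   ArrayList labeledPairs = new ArrayList();
   *   // pairs must satisfy first &lt; second; the link type records the oracle's answer
   *   labeledPairs.add(new InstancePair(3, 7, InstancePair.MUST_LINK));
   *   labeledPairs.add(new InstancePair(2, 9, InstancePair.CANNOT_LINK));
   *
   *   PCKMeans pckmeans = new PCKMeans(new WeightedEuclidean());
   *   pckmeans.buildClusterer(labeledPairs, unlabeledData, labeledTrain, startingIndexOfTest);
   * </pre>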
*/ public void buildClusterer(ArrayList labeledPairs, Instances unlabeledData, Instances labeledTrain, int startingIndexOfTest) throws Exception { int classIndex = labeledTrain.numAttributes(); // assuming that the last attribute is always the class m_TotalTrainWithLabels = labeledTrain; m_SeedHash = new HashSet((int) (unlabeledData.numInstances()/0.75 + 10)) ; m_ConstraintsHash = new HashMap(m_MaxConstraintsAllowed); m_instanceConstraintHash = new HashMap(m_MaxConstraintsAllowed); if (!m_Active && labeledPairs != null) { for (int i=0; i<labeledPairs.size(); i++) { InstancePair pair = (InstancePair) labeledPairs.get(i); Integer firstInt = new Integer(pair.first); Integer secondInt = new Integer(pair.second); // for first point if(!m_SeedHash.contains(firstInt)) { // add instances with constraints to seedHash m_SeedHash.add(firstInt); } // for second point if(!m_SeedHash.contains(secondInt)) { m_SeedHash.add(secondInt); } if (pair.first >= pair.second) { throw new Exception("Ordering reversed - something wrong!!"); } else { InstancePair newPair = new InstancePair(pair.first, pair.second, InstancePair.DONT_CARE_LINK); m_ConstraintsHash.put(newPair, new Integer(pair.linkType)); // WLOG first < second // hash the constraints for the instances involved Object constraintList1 = m_instanceConstraintHash.get(firstInt); if (constraintList1 == null) { ArrayList constraintList = new ArrayList(); constraintList.add(pair); m_instanceConstraintHash.put(firstInt, constraintList); } else { ((ArrayList)constraintList1).add(pair); } Object constraintList2 = m_instanceConstraintHash.get(secondInt); if (constraintList2 == null) { ArrayList constraintList = new ArrayList(); constraintList.add(pair); m_instanceConstraintHash.put(secondInt, constraintList); } else { ((ArrayList)constraintList2).add(pair); } } } } else { m_NumActive = labeledPairs.size(); } // normalize all data for SPKMeans if (m_Algorithm == ALGORITHM_SPHERICAL) { for (int i=0; i<unlabeledData.numInstances(); i++) { normalize(unlabeledData.instance(i)); } } m_StartingIndexOfTest = startingIndexOfTest; if (m_verbose) { System.out.println("Starting index of test: " + m_StartingIndexOfTest); } // learn metric using labeled data, then cluster both the labeled and unlabeled data m_metric.buildMetric(unlabeledData.numAttributes()); m_metricBuilt = true; buildClusterer(unlabeledData, labeledTrain.numClasses()); } /** * Generates a clusterer. Instances in data have to be * either all sparse or all non-sparse * * @param data set of instances serving as training data * @exception Exception if the clusterer has not been * generated successfully */ public void buildClusterer(Instances data) throws Exception { System.out.println("Must link weight: " + m_MustLinkWeight); System.out.println("Cannot link weight: " + m_CannotLinkWeight); m_RandomNumberGenerator = new Random(m_RandomSeed); setInstances(data); // Don't rebuild the metric if it was already trained if (!m_metricBuilt) { m_metric.buildMetric(data.numAttributes()); } m_ClusterCentroids = new Instances(m_Instances, m_NumClusters); m_ClusterAssignments = new int [m_Instances.numInstances()]; if (m_Instances.checkForNominalAttributes() && m_Instances.checkForStringAttributes()) { throw new UnsupportedAttributeTypeException("Cannot handle nominal attributes\n"); } System.out.println("Initializing clustering ... 
"); if (m_Active) { bestPairsForActiveLearning(m_NumActive); } else { nonActivePairwiseInit(); } System.out.println("Done initializing clustering ..."); if (m_verbose) { if (m_Seedable) { System.out.println("Initial assignments of seed points:"); getIndexClusters(); printIndexClusters(); } for (int i=0; i<m_NumClusters; i++) { System.out.println("Centroid " + i + ": " + m_ClusterCentroids.instance(i)); } } runKMeans(); } /** * Reset all values that have been learned */ public void resetClusterer() throws Exception{ if (m_metric instanceof LearnableMetric) { ((LearnableMetric)m_metric).resetMetric(); } m_SeedHash = null; m_ConstraintsHash = null; m_instanceConstraintHash = null; } /** Set default perturbation value * @param p perturbation fraction */ public void setDefaultPerturb(double p) { m_DefaultPerturb = p; } /** Get default perturbation value * @return perturbation fraction */ public double getDefaultPerturb(){ return m_DefaultPerturb; } /** Turn seeding on and off * @param seedable should seeding be done? */ public void setSeedable(boolean seedable) { m_Seedable = seedable; } /** Is seeding performed? * @return is seeding being done? */ public boolean getSeedable() { return m_Seedable; } /** * We can have clusterers that don't utilize seeding */ public boolean seedable() { return m_Seedable; } /** Phase 1 code for active learning */ protected int activePhaseOne(int numQueries) throws Exception { int numInstances = m_Instances.numInstances(); int X, Y, Z; int query, lambda, Label, CLcount; boolean MLmode = true; System.out.println("In Explore phase, with numqueries: " + numQueries); // these are the main data-structures to be updated here m_NeighborSets = new HashSet[m_NumClusters]; // set of points in each cluster neighborhood m_SumOfClusterInstances = new Instance[m_NumClusters]; m_ClusterAssignments = new int[numInstances]; m_AssignedSet = new HashSet((int) (numQueries/0.75+10)); // whether a point has been assigned or not for (int i=0; i<m_Instances.numInstances(); i++) { m_ClusterAssignments[i] = -1; } query = 0; // current num queries lambda = -1; // curent number of disjoint neighborhoods X = 0; // current point under investigation while( query < numQueries ){ if( m_NeighborSets[0] == null ){ // start the first neighborhood from the first point lambda++; if (m_verbose) System.out.println("Setting cluster of " + X + " to " + lambda); // update data structures m_NeighborSets[lambda] = new HashSet(); m_NeighborSets[lambda].add(new Integer(X)); m_SumOfClusterInstances[lambda] = sumWithInstance(m_SumOfClusterInstances[lambda],m_Instances.instance(X)); m_ClusterAssignments[X] = lambda; m_AssignedSet.add(new Integer(X)); } else if( lambda == m_NumClusters-1 && !m_AllExplore) { // NOTE: this condition is fired only if we are doing 2 phase (Explore + Consolidate) System.out.println("Explore phase over after " + query + " queries"); m_NumCurrentClusters = lambda+1; return query; } else { Z = (int) farthestFromSet(m_AssignedSet, null); CLcount = -1; for( int h = 0; h <= lambda; h++ ){ if (m_verbose) System.out.println("Starting for loop CLcount: " + CLcount); Iterator NbrIt = null; if (m_NeighborSets[h] != null) { NbrIt = m_NeighborSets[h].iterator(); } if( NbrIt != null && NbrIt.hasNext() ){ X = ((Integer) NbrIt.next()).intValue(); if (m_verbose) System.out.println("Inside iterator next ... 
X: " + X); Label = askOracle(X,Z); query++; System.out.println("Making query: " + query); if( Label == InstancePair.CANNOT_LINK ){ // Cannot-link, update CLcount CLcount++; } else{ // Must-link, add to neighborset // update data structures m_NeighborSets[h].add(new Integer(Z)); m_SumOfClusterInstances[h] = sumWithInstance(m_SumOfClusterInstances[h],m_Instances.instance(Z)); m_ClusterAssignments[Z] = h; m_AssignedSet.add(new Integer(Z)); break; // get out of for loop } if(query >= numQueries){ if (m_verbose) System.out.println("Run out of queries"); m_NumCurrentClusters = lambda+1; return query; } } } if (m_verbose) { System.out.println("Out of for loop"); } if( CLcount == lambda ){ // found a point cannot-linked to all current clusters lambda++; // update data structures m_NeighborSets[lambda] = new HashSet(); m_NeighborSets[lambda].add(new Integer(Z)); m_SumOfClusterInstances[lambda] = sumWithInstance(m_SumOfClusterInstances[lambda],m_Instances.instance(Z)); m_ClusterAssignments[Z] = lambda; m_AssignedSet.add(new Integer(Z)); } } // close else } // close while if (m_verbose) System.out.println("Number of queries: " + query); m_NumCurrentClusters = lambda+1; return query; } /** Phase 2 code for active learning, with round robin */ protected void activePhaseTwoRoundRobin(int numQueries) throws Exception { int numInstances = m_Instances.numInstances(); int X,Y; int query = 0, Label; System.out.println("In Consolidate phase, with numqueries: " + numQueries); while( query < numQueries ){ if (m_verbose) System.out.println("Starting round robin"); // starting round robin Instance[] clusterCentroids = new Instance[m_NumClusters]; // find cluster with smallest size int smallestSize = Integer.MAX_VALUE, smallestCluster = -1; for (int i=0; i<m_NumClusters; i++) { if (m_NeighborSets[i].size() < smallestSize) { smallestSize = m_NeighborSets[i].size(); smallestCluster = i; } } // compute centroid for smallest cluster if (m_isSparseInstance) { clusterCentroids[smallestCluster] = new SparseInstance(m_SumOfClusterInstances[smallestCluster]); } else { clusterCentroids[smallestCluster] = new Instance(m_SumOfClusterInstances[smallestCluster]); } clusterCentroids[smallestCluster].setDataset(m_Instances); if (!m_objFunDecreasing) { normalize(clusterCentroids[smallestCluster]); } else { normalizeByWeight(clusterCentroids[smallestCluster]); } // find next point, closest to centroid of smallest cluster X = nearestFromPoint(clusterCentroids[smallestCluster], m_AssignedSet); if (X == -1) { if (m_verbose) System.out.println("No more points left unassigned, we are DONE!!"); createGlobalCentroids(); addMLAndCLTransitiveClosure(null); return; } if (m_verbose) System.out.println("Nearest point is " + X); if (X >= m_StartingIndexOfTest) { // Sanity Check throw new Exception ("Test point selected, something went wrong!"); } Iterator NbrIt = m_NeighborSets[smallestCluster].iterator(); Y = ((Integer) NbrIt.next()).intValue(); // get any point from the smallest neighborhood Label = askOracle(X,Y); query++; System.out.println("Making query:" + query); if (m_verbose) System.out.println("Number of queries: " + query); if( Label == InstancePair.MUST_LINK ){ // update data structures m_NeighborSets[smallestCluster].add(new Integer(X)); m_SumOfClusterInstances[smallestCluster] = sumWithInstance(m_SumOfClusterInstances[smallestCluster], m_Instances.instance(X)); m_ClusterAssignments[X] = smallestCluster; if (m_verbose) System.out.println("Adding " + X + " to cluster: " + smallestCluster); m_AssignedSet.add(new Integer(X)); if( 
query >= numQueries ){ if (m_verbose) System.out.println("Ran out of queries"); System.out.println("Consolidate phase over after " + query + " queries"); createGlobalCentroids(); addMLAndCLTransitiveClosure(null); return; } } else { // must-link not found with smallest neighborhood, process other neighborhoods now if (m_verbose) System.out.println("Processing other centroids now"); // compute centroids of other clusters for (int i=0; i<m_NumClusters; i++) { if (i != smallestCluster) { // already made query for smallest cluster if (m_isSparseInstance) { clusterCentroids[i] = new SparseInstance(m_SumOfClusterInstances[i]); } else { clusterCentroids[i] = new Instance(m_SumOfClusterInstances[i]); } clusterCentroids[i].setDataset(m_Instances); if (!m_objFunDecreasing) { normalize(clusterCentroids[i]); } else { normalizeByWeight(clusterCentroids[i]); } } } double[] similaritiesToCentroids = new double[m_NumClusters]; for (int i=0; i<m_NumClusters; i++) { if (i != smallestCluster) { // already made query for smallestCluster similaritiesToCentroids[i] = m_metric.similarity(clusterCentroids[i], m_Instances.instance(X)); } } // handles both Euclidean and WeightedDotP if (m_verbose) { System.out.println("Before sort"); for (int i=0; i<m_NumClusters; i++) { System.out.println(similaritiesToCentroids[i]); } } int[] indices = Utils.sort(similaritiesToCentroids); // sorts in ascending order of similarity if (m_verbose) { System.out.println("After sort"); for (int i=0; i<m_NumClusters; i++) { System.out.println(indices[i]); } } for(int h = m_NumClusters-1; h >=0; h-- ){ // since sort is ascending, and we want descending sort of similarity values int index = indices[h]; if (index != smallestCluster) { // already made query for smallest cluster NbrIt = m_NeighborSets[index].iterator(); Y = ((Integer) NbrIt.next()).intValue(); // get any point from the neighborhood Label = askOracle(X,Y); query++; System.out.println("Making query:" + query); if (m_verbose) System.out.println("Number of queries: " + query); if( Label == InstancePair.MUST_LINK ){ // update data structures m_NeighborSets[index].add(new Integer(X)); m_SumOfClusterInstances[index] = sumWithInstance(m_SumOfClusterInstances[index], m_Instances.instance(X)); m_ClusterAssignments[X] = index; if (m_verbose) System.out.println("Adding " + X + " to cluster: " + index); m_AssignedSet.add(new Integer(X)); if (m_verbose) System.out.println("Exiting phase 2 for loop"); break; // exit from for } if( query >= numQueries ){ if (m_verbose) System.out.println("Ran out of queries"); createGlobalCentroids(); addMLAndCLTransitiveClosure(null); return; } } } // end reverse for } // end else } // end while createGlobalCentroids(); addMLAndCLTransitiveClosure(null); return; } /** Phase 2 code for active learning, random */ protected void activePhaseTwoRandom(int numQueries) throws Exception { int numInstances = m_Instances.numInstances(); int X,Y; int query = 0, Label; System.out.println("In Phase 2 with random, with numqueries: " + numQueries); while( query < numQueries ){ if (m_verbose) System.out.println("Starting phase 2"); Instance[] clusterCentroids = new Instance[m_NumClusters]; if (m_AssignedSet.size() == m_StartingIndexOfTest) { if (m_verbose) System.out.println("No more points left unassigned, we are DONE!!"); createGlobalCentroids(); addMLAndCLTransitiveClosure(null); return; } // find next point at random X = m_RandomNumberGenerator.nextInt(m_StartingIndexOfTest); while (m_AssignedSet != null && m_AssignedSet.contains(new Integer(X))) { X = 
m_RandomNumberGenerator.nextInt(m_StartingIndexOfTest); } if (m_verbose) System.out.println("X = " + X + ", finding distances to centroids now"); // compute centroids of other clusters for (int i=0; i<m_NumClusters; i++) { if (m_isSparseInstance) { clusterCentroids[i] = new SparseInstance(m_SumOfClusterInstances[i]); } else { clusterCentroids[i] = new Instance(m_SumOfClusterInstances[i]); } clusterCentroids[i].setDataset(m_Instances); if (!m_objFunDecreasing) { normalize(clusterCentroids[i]); } else { normalizeByWeight(clusterCentroids[i]); } } double[] similaritiesToCentroids = new double[m_NumClusters]; for (int i=0; i<m_NumClusters; i++) { similaritiesToCentroids[i] = m_metric.similarity(clusterCentroids[i], m_Instances.instance(X)); } // handles both Euclidean and WeightedDotP if (m_verbose) { System.out.println("Before sort"); for (int i=0; i<m_NumClusters; i++) { System.out.println(similaritiesToCentroids[i]); } } int[] indices = Utils.sort(similaritiesToCentroids); if (m_verbose) { System.out.println("After sort"); for (int i=0; i<m_NumClusters; i++) { System.out.println(indices[i]); } } for(int h = m_NumClusters-1; h >=0; h-- ){ // since sort is ascending, and we want descending sort of similarity values int index = indices[h]; Iterator NbrIt = m_NeighborSets[index].iterator(); Y = ((Integer) NbrIt.next()).intValue(); // get any point from neighborhood Label = askOracle(X,Y); query++; System.out.println("Making query:" + query); if (m_verbose) System.out.println("Number of queries: " + query); if( Label == InstancePair.MUST_LINK ){ // update data structures m_NeighborSets[index].add(new Integer(X)); m_SumOfClusterInstances[index] = sumWithInstance(m_SumOfClusterInstances[index], m_Instances.instance(X)); m_ClusterAssignments[X] = index; if (m_verbose) System.out.println("Adding " + X + " to cluster: " + index); m_AssignedSet.add(new Integer(X)); if (m_verbose) System.out.println("Exiting phase 2 for loop"); break; // exit from for } else { if (m_verbose) System.out.println(X + " is CANNOT-LINKed to cluster " + index); } if( query >= numQueries ){ if (m_verbose) System.out.println("Ran out of queries"); createGlobalCentroids(); addMLAndCLTransitiveClosure(null); return; } } } // end reverse for createGlobalCentroids(); addMLAndCLTransitiveClosure(null); return; } /** Creates the global cluster centroid */ protected void createGlobalCentroids() throws Exception { // initialize using m_NumCurrentClusters neighborhoods (< m_NumClusters), make random for rest System.out.println("Creating centroids"); if (m_verbose) System.out.println("Current number of clusters: " + m_NumCurrentClusters); // compute centroids of all clusters m_ClusterCentroids = new Instances(m_Instances, m_NumClusters); for (int i=0; i<m_NumCurrentClusters; i++) { if (m_SumOfClusterInstances[i] != null) { if (m_verbose) { System.out.println("Normalizing cluster center " + i); } if (!m_objFunDecreasing) { normalize(m_SumOfClusterInstances[i]); } else { normalizeByWeight(m_SumOfClusterInstances[i]); } } m_SumOfClusterInstances[i].setDataset(m_Instances); m_ClusterCentroids.add(m_SumOfClusterInstances[i]); } // fill up remaining by randomPerturbInit if (m_NumCurrentClusters < m_NumClusters) { // find global centroid System.out.println("Creating global centroid"); double [] globalValues = new double[m_Instances.numAttributes()]; if (m_isSparseInstance) { globalValues = meanOrMode(m_Instances); // uses fast meanOrMode } else { for (int j = 0; j < m_Instances.numAttributes(); j++) { globalValues[j] = 
m_Instances.meanOrMode(j); // uses usual meanOrMode } } // global centroid is dense in SPKMeans m_GlobalCentroid = new Instance(1.0, globalValues); m_GlobalCentroid.setDataset(m_Instances); // normalize before random perturbation if (!m_objFunDecreasing) { normalizeInstance(m_GlobalCentroid); } System.out.println("Creating " + (m_NumClusters - m_NumCurrentClusters) + " random centroids"); for (int i=m_NumCurrentClusters; i<m_NumClusters; i++) { double [] values = new double[m_Instances.numAttributes()]; double normalizer = 0; for (int j = 0; j < m_Instances.numAttributes(); j++) { values[j] = m_GlobalCentroid.value(j) * (1 + m_DefaultPerturb * (m_RandomNumberGenerator.nextFloat() - 0.5)); normalizer += values[j] * values[j]; } if (!m_objFunDecreasing) { normalizer = Math.sqrt(normalizer); for (int j = 0; j < m_Instances.numAttributes(); j++) { values[j] /= normalizer; } } // values suitably normalized at this point if required if (m_isSparseInstance) { m_ClusterCentroids.add(new SparseInstance(1.0, values)); // sparse for consistency with other cluster centroids } else { m_ClusterCentroids.add(new Instance(1.0, values)); } } } System.out.println("Finished creating centroids"); m_NumCurrentClusters = m_NumClusters; } /** adding other inferred ML and CL links to m_ConstraintsHash, from * m_NeighborSets */ protected void addMLAndCLTransitiveClosure(int[] indices) throws Exception { // add all ML links within clusters if (m_verbose) { for (int j=0; j<m_NumCurrentClusters; j++) { int i = j; if (indices != null) { i = indices[j]; } System.out.println("Neighborhood list " + j + " is:"); System.out.println(m_NeighborSets[i]); } } for (int j=0; j<m_NumCurrentClusters; j++) { int i = j; if (indices != null) { i = indices[j]; } if (m_NeighborSets[i] != null) { Iterator iter1 = m_NeighborSets[i].iterator(); while (iter1.hasNext()) { int first = ((Integer) iter1.next()).intValue(); Iterator iter2 = m_NeighborSets[i].iterator(); while (iter2.hasNext()) { int second = ((Integer) iter2.next()).intValue(); if (first < second) { InstancePair pair = null; pair = new InstancePair(first, second, InstancePair.DONT_CARE_LINK); if (!m_ConstraintsHash.containsKey(pair)) { m_ConstraintsHash.put(pair, new Integer(InstancePair.MUST_LINK)); if (m_verbose) { System.out.println("Adding inferred ML (" + pair.first +","+pair.second+")"); } // hash the constraints for the instances involved Integer firstInt = new Integer(first); Integer secondInt = new Integer(second); InstancePair pairML = null; pairML = new InstancePair(first, second, InstancePair.MUST_LINK); Object constraintList1 = m_instanceConstraintHash.get(firstInt); if (constraintList1 == null) { ArrayList constraintList = new ArrayList(); constraintList.add(pairML); m_instanceConstraintHash.put(firstInt, constraintList); } else { ((ArrayList)constraintList1).add(pairML); } Object constraintList2 = m_instanceConstraintHash.get(secondInt); if (constraintList2 == null) { ArrayList constraintList = new ArrayList(); constraintList.add(pairML); m_instanceConstraintHash.put(secondInt, constraintList); } else { ((ArrayList)constraintList2).add(pairML); } if (m_verbose) { System.out.println("Adding inferred ML link: " + pair); } if (!m_SeedHash.contains(new Integer(first))) { m_SeedHash.add(new Integer(first)); } if (!m_SeedHash.contains(new Integer(second))) { m_SeedHash.add(new Integer(second)); } } } } } } } // add all CL links between clusters for (int ii=0; ii<m_NumCurrentClusters; ii++) { int i = ii; if (indices != null) { i = indices[ii]; } if (m_NeighborSets[i] != 
null) { for (int jj=ii+1; jj<m_NumCurrentClusters; jj++) { int j = jj; if (indices != null) { j = indices[jj]; } // check if there is at least one CL between neighborhoods ii & jj boolean existsCL = false; Iterator iter1 = m_NeighborSets[i].iterator(); while (iter1.hasNext()) { int index1 = ((Integer) iter1.next()).intValue(); if (m_NeighborSets[j] != null) { Iterator iter2 = m_NeighborSets[j].iterator(); while (iter2.hasNext()) { int index2 = ((Integer) iter2.next()).intValue(); int first = (index1 < index2)? index1:index2; int second = (index1 >= index2)? index1:index2; if (first == second) { throw new Exception(" Same instance " + first + " cannot be in cluster: " + i + " and cluster " + j); } else if (first < second) { InstancePair pair; pair = new InstancePair(first, second, InstancePair.DONT_CARE_LINK); if (m_ConstraintsHash.containsKey(pair)) { // found one CL between the neighborhoods existsCL = true; break; // out of inner while } } } } if (existsCL) { break; // out of outer while } } // now add the inferred CLs if (existsCL) { iter1 = m_NeighborSets[i].iterator(); while (iter1.hasNext()) { int index1 = ((Integer) iter1.next()).intValue(); if (m_NeighborSets[j] != null) { Iterator iter2 = m_NeighborSets[j].iterator(); while (iter2.hasNext()) { int index2 = ((Integer) iter2.next()).intValue(); int first = (index1 < index2)? index1:index2; int second = (index1 >= index2)? index1:index2; if (first == second) { throw new Exception(" Same instance " + first + " cannot be in cluster: " + i + " and cluster " + j); } else if (first < second) { InstancePair pair; pair = new InstancePair(first, second, InstancePair.DONT_CARE_LINK); // add new constraint if (!m_ConstraintsHash.containsKey(pair)) { m_ConstraintsHash.put(pair, new Integer(InstancePair.CANNOT_LINK)); if (m_verbose) { System.out.println("Adding inferred CL (" + pair.first +","+pair.second+")"); } // hash the constraints for the instances involved Integer firstInt = new Integer(first); Integer secondInt = new Integer(second); InstancePair pairCL; pairCL = new InstancePair(first, second, InstancePair.CANNOT_LINK); Object constraintList1 = m_instanceConstraintHash.get(firstInt); if (constraintList1 == null) { ArrayList constraintList = new ArrayList(); constraintList.add(pairCL); m_instanceConstraintHash.put(firstInt, constraintList); } else { ((ArrayList)constraintList1).add(pairCL); } Object constraintList2 = m_instanceConstraintHash.get(secondInt); if (constraintList2 == null) { ArrayList constraintList = new ArrayList(); constraintList.add(pairCL); m_instanceConstraintHash.put(secondInt, constraintList); } else { ((ArrayList)constraintList2).add(pairCL); } if (m_verbose) { System.out.println("Adding inferred CL link: " + pair); } if (!m_SeedHash.contains(new Integer(first))) { m_SeedHash.add(new Integer(first)); } if (!m_SeedHash.contains(new Integer(second))) { m_SeedHash.add(new Integer(second)); } } } } } } } } } } } /** Main Depth First Search routine */ protected void DFS() throws Exception { int [] vertexColor = new int[m_Instances.numInstances()]; m_NumCurrentClusters = 0; for(int u=0; u<m_Instances.numInstances(); u++){ vertexColor[u] = WHITE; } for(int u=0; u<m_Instances.numInstances(); u++){ if (m_AdjacencyList[u] != null && vertexColor[u] == WHITE) { m_NeighborSets[m_NumCurrentClusters] = new HashSet(); DFS_VISIT(u, vertexColor); // finds whole neighbourhood of u m_NumCurrentClusters++; } } } /** Recursive subroutine for DFS */ protected void DFS_VISIT(int u, int[] vertexColor) throws Exception { vertexColor[u] = 
GRAY; Iterator iter = null; if (m_AdjacencyList[u] != null) { iter = m_AdjacencyList[u].iterator(); while (iter.hasNext()) { int j = ((Integer) iter.next()).intValue(); if(vertexColor[j] == WHITE){ // if the vertex is still undiscovered DFS_VISIT(j, vertexColor); } } } // update stats for u m_ClusterAssignments[u] = m_NumCurrentClusters; m_NeighborSets[m_NumCurrentClusters].add(new Integer(u)); m_SumOfClusterInstances[m_NumCurrentClusters] = sumWithInstance(m_SumOfClusterInstances[m_NumCurrentClusters], m_Instances.instance(u)); vertexColor[u] = BLACK; } /** Initialization routine for non-active algorithm */ protected void nonActivePairwiseInit() throws Exception { m_NeighborSets = new HashSet[m_Instances.numInstances()]; m_SumOfClusterInstances = new Instance[m_Instances.numInstances()]; m_AdjacencyList = new HashSet[m_Instances.numInstances()]; for (int i=0; i<m_Instances.numInstances(); i++) { m_ClusterAssignments[i] = -1; } if (m_ConstraintsHash != null) { Set pointPairs = (Set) m_ConstraintsHash.keySet(); Iterator pairItr = pointPairs.iterator(); System.out.println("In non-active init"); // iterate over the pairs in ConstraintHash to create Adjacency List while( pairItr.hasNext() ){ InstancePair pair = (InstancePair) pairItr.next(); int linkType = ((Integer) m_ConstraintsHash.get(pair)).intValue(); if (m_verbose) System.out.println(pair + ": type = " + linkType); if( linkType == InstancePair.MUST_LINK ){ // concerned with MUST-LINK in Adjacency List if (m_AdjacencyList[pair.first] == null) { m_AdjacencyList[pair.first] = new HashSet(); } if (!m_AdjacencyList[pair.first].contains(new Integer(pair.second))) { m_AdjacencyList[pair.first].add(new Integer(pair.second)); } if (m_AdjacencyList[pair.second] == null) { m_AdjacencyList[pair.second] = new HashSet(); } if (!m_AdjacencyList[pair.second].contains(new Integer(pair.first))) { m_AdjacencyList[pair.second].add(new Integer(pair.first)); } } } // DFS for finding connected components in Adjacency List, updates required stats DFS(); } if (!m_Seedable) { // don't perform any seeding, initialize from random m_NumCurrentClusters = 0; } // System.out.println("Need to make " + m_NumClusters + " clusters, already made " + m_NumCurrentClusters); // if the required number of clusters has been obtained, wrap-up if( m_NumCurrentClusters >= m_NumClusters ){ if (m_verbose) { System.out.println("Got the required number of clusters ..."); System.out.println("num clusters: " + m_NumClusters + ", num current clusters: " + m_NumCurrentClusters); } int clusterSizes[] = new int[m_NumCurrentClusters]; for (int i=0; i<m_NumCurrentClusters; i++) { if (m_verbose) { System.out.println("Neighbor set: " + i + " has size: " + m_NeighborSets[i].size()); } clusterSizes[i] = -m_NeighborSets[i].size(); // for reverse sort (decreasing order) } int[] indices = Utils.sort(clusterSizes); Instance[] clusterCentroids = new Instance[m_NumClusters]; // compute centroids of m_NumClusters clusters m_ClusterCentroids = new Instances(m_Instances, m_NumClusters); for (int j=0; j < m_NumClusters; j++) { int i = indices[j]; System.out.println("Neighborhood selected: " + m_NeighborSets[i].size() + "(" + m_TotalTrainWithLabels.instance(((Integer)(m_NeighborSets[i].iterator().next())).intValue()).classValue()+ ")\t"); if (m_SumOfClusterInstances[i] != null) { if (m_verbose) { System.out.println("Normalizing instance " + i); } if (!m_objFunDecreasing) { normalize(m_SumOfClusterInstances[i]); } else { normalizeByWeight(m_SumOfClusterInstances[i]); } } Iterator iter = 
m_NeighborSets[i].iterator(); while (iter.hasNext()) { // assign points of new cluster int instNumber = ((Integer) iter.next()).intValue(); if (m_verbose) { System.out.println("Assigning " + instNumber + " to cluster: " + j); } // have to re-assign after sorting m_ClusterAssignments[instNumber] = j; } m_SumOfClusterInstances[j].setDataset(m_Instances); // m_SumOfClusterInstances suitably normalized now m_ClusterCentroids.add(m_SumOfClusterInstances[i]); } for (int j=m_NumClusters; j < m_NumCurrentClusters; j++) { int i = indices[j]; Iterator iter = m_NeighborSets[i].iterator(); while (iter.hasNext()) { int instNumber = ((Integer) iter.next()).intValue(); if (m_verbose) { System.out.println("Assigning " + instNumber + " to cluster -1"); } m_ClusterAssignments[instNumber] = -1; } } m_NumCurrentClusters = m_NumClusters; // adding other inferred ML and CL links addMLAndCLTransitiveClosure(indices); return; } else if( m_NumCurrentClusters < m_NumClusters ){ createGlobalCentroids(); addMLAndCLTransitiveClosure(null); } } // Query: oracle replies on link, added to m_ConstraintsHash protected int askOracle(int X, int Y) { Instance first = m_TotalTrainWithLabels.instance(X); Instance second = m_TotalTrainWithLabels.instance(Y); int linkType; if (m_verbose) { System.out.print("["+X+","+Y); } if (first.classValue() == second.classValue()) { if (m_verbose) { System.out.println(",MUST]"); } linkType = InstancePair.MUST_LINK; } else if (first.classValue() != second.classValue()) { if (m_verbose) { System.out.println(",CANNOT]"); } linkType = InstancePair.CANNOT_LINK; } else { if (m_verbose) { System.out.println(",DONT_CARE]"); } linkType = InstancePair.DONT_CARE_LINK; } // add to constraintHash and seedHash int firstIndex = (X<Y)? X:Y; int secondIndex = (X>=Y)? X:Y; InstancePair newPair = new InstancePair(firstIndex, secondIndex, InstancePair.DONT_CARE_LINK); if (!m_ConstraintsHash.containsKey(newPair)) { m_ConstraintsHash.put(newPair, new Integer(linkType)); } Integer firstInt = new Integer(firstIndex); Integer secondInt = new Integer(secondIndex); // for first point if(!m_SeedHash.contains(firstInt)) { // add instances with constraints to seedHash m_SeedHash.add(firstInt); } // for second point if(!m_SeedHash.contains(secondInt)) { m_SeedHash.add(secondInt); } return linkType; } /** Finds point which has max min-distance from set visitedPoints, does not consider points from eliminationSet */ int farthestFromSet(HashSet visitedPoints, HashSet eliminationSet) throws Exception { // implements farthest-first search algorithm: /* for (each datapoint x not in visitedPoints) { distance of x to visitedPoints = min{d(x,f):f \in visitedPoints} } select the point x with maximum distance as new center; */ if (visitedPoints.size() == 0) { int point; if (m_StartingIndexOfTest < m_Instances.numInstances()) { point = m_RandomNumberGenerator.nextInt(m_StartingIndexOfTest); // takes care not to select test example } else { point = m_RandomNumberGenerator.nextInt(m_Instances.numInstances()); } // Note: no need to check for labeled data now, since we have no visitedPoints // => no labeled data if (m_verbose) System.out.println("First point selected: " + point); return point; } else { if (m_verbose) { Iterator iter = visitedPoints.iterator(); if (eliminationSet != null) { iter = eliminationSet.iterator(); while(iter.hasNext()) { System.out.println("In elimination set: " + ((Integer) iter.next()).intValue()); } } } } double minSimilaritySoFar = Double.POSITIVE_INFINITY; double maxDistanceSoFar = Double.NEGATIVE_INFINITY; 
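    // The loop below implements the farthest-first criterion for both metric
    // types used in this class: with a distance-based metric, each candidate i
    // keeps min{d(i,f) : f in visitedPoints} and the candidate maximizing that
    // value is selected; with a similarity-based metric, each candidate keeps
    // max{sim(i,f) : f in visitedPoints} and the candidate minimizing it is
    // selected. Ties are collected in bestPointArray and broken at random afterwards.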
ArrayList bestPointArray = null; int bestPoint = -1; for (int i=0; i<m_Instances.numInstances() && i<m_StartingIndexOfTest; i++) { // point should not belong to test set if (visitedPoints == null || !visitedPoints.contains(new Integer(i))) { // point should not belong to visitedPoints if (eliminationSet == null || !eliminationSet.contains(new Integer(i))) { // point should not belong to eliminationSet Instance inst = m_Instances.instance(i); Iterator iter = visitedPoints.iterator(); double minDistanceFromSet = Double.POSITIVE_INFINITY; double maxSimilarityFromSet = Double.NEGATIVE_INFINITY; while (iter.hasNext()) { Instance pointInSet = m_Instances.instance(((Integer) iter.next()).intValue()); if (!m_objFunDecreasing) { double sim = m_metric.similarity(inst, pointInSet); if (sim > maxSimilarityFromSet) { maxSimilarityFromSet = sim; } } else { double dist = m_metric.distance(inst, pointInSet); if (dist < minDistanceFromSet) { minDistanceFromSet = dist; } } } if (!m_objFunDecreasing) { if (maxSimilarityFromSet == minSimilaritySoFar) { minSimilaritySoFar = maxSimilarityFromSet; bestPointArray.add(new Integer(i)); } else if (maxSimilarityFromSet < minSimilaritySoFar) { minSimilaritySoFar = maxSimilarityFromSet; bestPointArray = new ArrayList(); bestPointArray.add(new Integer(i)); } } else { if (minDistanceFromSet == maxDistanceSoFar) { minDistanceFromSet = maxDistanceSoFar; bestPointArray.add(new Integer(i)); if (m_verbose) { System.out.println("Additional point added: " + i + " with similarity: " + minSimilaritySoFar); } } else if (minDistanceFromSet > maxDistanceSoFar) { maxDistanceSoFar = minDistanceFromSet; bestPointArray = new ArrayList(); bestPointArray.add(new Integer(i)); if (m_verbose) { System.out.println("Farthest point from set is: " + i + " with distance: " + maxDistanceSoFar); } } } } } } if (bestPointArray == null) { System.out.println("\n\nAttention!! No more points left, all assigned\n\n"); } else { if (m_verbose) System.out.println("Have " + bestPointArray.size() + " points in bestPointArray"); int index = m_RandomNumberGenerator.nextInt(bestPointArray.size()); // select one of the bestPoints at random bestPoint = ((Integer) bestPointArray.get(index)).intValue(); } if (m_verbose) { if (!m_objFunDecreasing) { System.out.println("Selected " + bestPoint + " with similarity: " + minSimilaritySoFar); } else { System.out.println("Selected " + bestPoint + " with distance: " + maxDistanceSoFar); } } return bestPoint; } /** Finds point which is nearest to center. 
This point should not be * a test point and should not belong to visitedPoints */ int nearestFromPoint(Instance center, HashSet visitedPoints) throws Exception { double maxSimilarity = Double.NEGATIVE_INFINITY; double minDistance = Double.POSITIVE_INFINITY; int bestPoint = -1; for (int i=0; i<m_Instances.numInstances() && i<m_StartingIndexOfTest; i++) { // bestPoint should not be a test point if (!visitedPoints.contains(new Integer(i))) { // bestPoint should not belong to visitedPoints Instance inst = m_Instances.instance(i); if (!m_objFunDecreasing) { double sim = m_metric.similarity(inst, center); if (sim > maxSimilarity) { bestPoint = i; maxSimilarity = sim; } } else { double dist = m_metric.distance(inst, center); if (dist < minDistance) { bestPoint = i; minDistance = dist; if (m_verbose) { System.out.println("Nearest point is: " + bestPoint + " with dist: " + minDistance); } } } } } return bestPoint; } /** This function divides every attribute value in an instance by * the instance weight -- useful to find the mean of a cluster in * Euclidean space * @param inst Instance passed in for normalization (destructive update) */ protected void normalizeByWeight(Instance inst) { double weight = inst.weight(); if (m_verbose) { System.out.println("Before weight normalization: " + inst); } if (inst instanceof SparseInstance) { for (int i=0; i<inst.numValues(); i++) { inst.setValueSparse(i, inst.valueSparse(i)/weight); } } else if (!(inst instanceof SparseInstance)) { for (int i=0; i<inst.numAttributes(); i++) { inst.setValue(i, inst.value(i)/weight); } } if (m_verbose) { System.out.println("After weight normalization: " + inst); } } /** Finds the sum of instance sum with instance inst */ Instance sumWithInstance(Instance sum, Instance inst) throws Exception { Instance newSum; if (sum == null) { if (m_isSparseInstance) { newSum = new SparseInstance(inst); newSum.setDataset(m_Instances); } else { newSum = new Instance(inst); newSum.setDataset(m_Instances); } } else { newSum = sumInstances(sum, inst); } return newSum; } /** Finds sum of 2 instances (handles sparse and non-sparse) */ protected Instance sumInstances(Instance inst1, Instance inst2) throws Exception { int numAttributes = inst1.numAttributes(); if (inst2.numAttributes() != numAttributes) { throw new Exception ("Error!! inst1 and inst2 should have same number of attributes."); } // if (m_verbose) { // System.out.println("Instance 1 is: " + inst1 + ", instance 2 is: " + inst2); // } double weight1 = inst1.weight(), weight2 = inst2.weight(); double [] values = new double[numAttributes]; Instance newInst; for (int i=0; i<numAttributes; i++) { values[i] = 0; } if (inst1 instanceof SparseInstance && inst2 instanceof SparseInstance) { for (int i=0; i<inst1.numValues(); i++) { int indexOfIndex = inst1.index(i); values[indexOfIndex] = inst1.valueSparse(i); } for (int i=0; i<inst2.numValues(); i++) { int indexOfIndex = inst2.index(i); values[indexOfIndex] += inst2.valueSparse(i); } newInst = new SparseInstance(weight1+weight2, values); newInst.setDataset(m_Instances); } else if (!(inst1 instanceof SparseInstance) && !(inst2 instanceof SparseInstance)){ for (int i=0; i<numAttributes; i++) { values[i] = inst1.value(i) + inst2.value(i); } newInst = new Instance(weight1+weight2, values); newInst.setDataset(m_Instances); } else { throw new Exception ("Error!! 
inst1 and inst2 should be both of same type -- sparse or non-sparse"); } // if (m_verbose) { // System.out.println("Sum instance is: " + newInst); // } return newInst; } /** Updates the clusterAssignments for all points after clustering. * Map assignments from [0,numInstances-1] to [0,numClusters-1] * i.e. from [0 2 2 0 6 6 2] -> [0 1 1 0 2 2 0] * **** NOTE: THIS FUNCTION IS NO LONGER USED!!! **** */ protected void updateClusterAssignments() throws Exception { // **** DEPRECATED: THIS FUNCTION IS NO LONGER USED!!! **** int numInstances = m_Instances.numInstances(); HashMap clusterNumberHash = new HashMap((int) (m_NumClusters/0.75+10)); int clusterNumber = 0; if (m_verbose) { System.out.println("Mapping cluster assignments. Initial cluster assignments:"); for (int i=0; i<numInstances; i++) { System.out.print(m_ClusterAssignments[i] + " "); } System.out.println(); } for (int i=0; i<numInstances; i++) { if (m_ClusterAssignments[i]!=-1) { Integer clusterNum = new Integer(m_ClusterAssignments[i]); if (!clusterNumberHash.containsKey(clusterNum)) { clusterNumberHash.put(clusterNum, new Integer(clusterNumber)); clusterNumber++; } } } if (clusterNumber != m_NumClusters) { throw new Exception("Number of clusters do not match"); } for (int i=0; i<numInstances; i++) { if (m_ClusterAssignments[i]!=-1) { int newCluster = ((Integer) clusterNumberHash.get(new Integer(m_ClusterAssignments[i]))).intValue(); m_ClusterAssignments[i] = newCluster; } } if (m_verbose) { System.out.println("Done updating cluster assignments. New cluster assignments:"); for (int i=0; i<numInstances; i++) { System.out.print(m_ClusterAssignments[i] + " "); } System.out.println(); } clusterNumberHash.clear(); clusterNumberHash = null; //free memory } /** Outputs the current clustering * * @exception Exception if something goes wrong */ public void printIndexClusters() throws Exception { if (m_IndexClusters == null) throw new Exception ("Clusters were not created"); for (int i = 0; i < m_NumClusters; i++) { HashSet cluster = m_IndexClusters[i]; if (cluster == null) { System.out.println("Cluster " + i + " is null"); } else { System.out.println ("Cluster " + i + " consists of " + cluster.size() + " elements"); Iterator iter = cluster.iterator(); while(iter.hasNext()) { int idx = ((Integer) iter.next()).intValue(); System.out.println("\t\t" + idx); } } } } /** E-step of the KMeans clustering algorithm -- find best cluster assignments */ protected void findBestAssignments() throws Exception{ int moved = 0; int numInstances = m_Instances.numInstances(); int [] indices = new int[numInstances]; for (int i=0; i<numInstances; i++) { indices[i] = i; // initialize } if (m_InstanceOrdering == ORDERING_DEFAULT) { for (int i=0; i<numInstances; i++) { try { // Update number of points moved moved += assignInstanceToClusterWithConstraints(i); } catch (Exception e) { System.out.println("Could not find distance. 
Exception: " + e); e.printStackTrace(); } } if (m_MovePointsTillAssignmentStabilizes) { int newMoved = -1; for (int t=0; t<100 && newMoved != 0; t++) { // move points till assignment stabilizes newMoved = 0; for (int i=0; i<numInstances; i++) { newMoved += assignInstanceToClusterWithConstraints(i); } if (newMoved > 0) { System.out.println(newMoved + " points moved on changing order in t=" + t); } } } } else if (m_InstanceOrdering == ORDERING_RANDOM) { // randomize instance ordering m_RandomNumberGenerator = new Random(m_RandomSeed); // initialize random number generator again for (int i = numInstances - 1; i > 0; i--) { int indexToSwap = m_RandomNumberGenerator.nextInt(i+1); int temp = indices[i]; // swap indices[i] = indices[indexToSwap]; indices[indexToSwap] = temp; } for (int i=0; i<numInstances; i++) { try { // Update number of points moved moved += assignInstanceToClusterWithConstraints(indices[i]); } catch (Exception e) { System.out.println("Could not find distance. Exception: " + e); e.printStackTrace(); } } if (m_MovePointsTillAssignmentStabilizes) { int newMoved = -1; for (int t=0; t<100 && newMoved != 0; t++) { // move points till assignment stabilizes newMoved = 0; for (int i=0; i<numInstances; i++) { newMoved += assignInstanceToClusterWithConstraints(indices[i]); } if (newMoved > 0) { System.out.println(newMoved + " points moved on changing order in t=" + t); } } } } else if (m_InstanceOrdering == ORDERING_SORTED) { int [] sortedIndices = null; double bestSquareDistance = Integer.MAX_VALUE; double bestSimilarity = Integer.MIN_VALUE; double [] distances = new double[numInstances]; // find closest cluster centroid for each instance for (int i = 0; i < numInstances; i++) { for (int j = 0; j < m_NumClusters; j++) { double squareDistance = 0, similarity = 0; if (!m_objFunDecreasing) { similarity = similarityInPottsModel(i,j); if (similarity > bestSimilarity) { bestSimilarity = similarity; distances[i] = -similarity; // hacked distance conversion for sorting } } else { squareDistance = squareDistanceInPottsModel(i,j); if (squareDistance < bestSquareDistance) { bestSquareDistance = squareDistance; distances[i] = squareDistance; } } } } sortedIndices = Utils.sort(distances); // sort in ascending order for (int i=0; i<numInstances; i++) { try { // Update number of points moved moved += assignInstanceToClusterWithConstraints(sortedIndices[i]); } catch (Exception e) { System.out.println("Could not find distance. 
Exception: " + e); e.printStackTrace(); } } if (m_MovePointsTillAssignmentStabilizes) { int newMoved = -1; for (int t=0; t<100 && newMoved != 0; t++) { // move points till assignment stabilizes newMoved = 0; for (int i=0; i<numInstances; i++) { newMoved += assignInstanceToClusterWithConstraints(sortedIndices[i]); } if (newMoved > 0) { System.out.println(newMoved + " points moved on changing order in t=" + t); } } } } else { throw new Exception ("Unknown instance ordering!!"); } System.out.println("\t" + moved + " points moved in this E-step"); } /** * Classifies the instance using the current clustering considering * constraints, updates cluster assignments * * @param instance the instance to be assigned to a cluster * @return 1 if the point is moved, 0 otherwise * @exception Exception if instance could not be classified * successfully */ public int assignInstanceToClusterWithConstraints(int instIdx) throws Exception { int bestCluster = 0; double bestSquareDistance = Integer.MAX_VALUE; double bestSimilarity = Integer.MIN_VALUE; int moved = 0; for (int i = 0; i < m_NumClusters; i++) { double squareDistance = 0, similarity = 0; if (!m_objFunDecreasing) { similarity = similarityInPottsModel(instIdx, i); // System.out.println("Sim between instance " + instIdx + " and cluster " + i + " = " + similarity); if (similarity > bestSimilarity) { bestSimilarity = similarity; bestCluster = i; } } else { squareDistance = squareDistanceInPottsModel(instIdx, i); if (squareDistance < bestSquareDistance) { bestSquareDistance = squareDistance; bestCluster = i; } } } if (m_ClusterAssignments[instIdx] != bestCluster) { if (m_verbose) { System.out.println("Moving instance " + instIdx + " from cluster " + m_ClusterAssignments[instIdx] + " to cluster " + bestCluster); } moved = 1; // // remove instIdx from old cluster // if (m_ClusterAssignments[instIdx] < m_NumClusters && m_ClusterAssignments[instIdx] != -1 && m_IndexClusters[m_ClusterAssignments[instIdx]] != null ) { // m_IndexClusters[m_ClusterAssignments[instIdx]].remove(new Integer(instIdx)); // } // // add instIdx to new cluster // if (m_IndexClusters[bestCluster] == null) { // m_IndexClusters[bestCluster] = new HashSet(); // } // m_IndexClusters[bestCluster].add(new Integer (instIdx)); // updates cluster Assignments m_ClusterAssignments[instIdx] = bestCluster; } if (m_verbose) { System.out.println("Assigning instance " + instIdx + " to cluster " + bestCluster); } return moved; } /** finds similarity between instance and centroid in Potts Model */ double similarityInPottsModel(int instIdx, int centroidIdx) throws Exception{ double sim = m_metric.similarity(m_Instances.instance(instIdx), m_ClusterCentroids.instance(centroidIdx)); Object list = m_instanceConstraintHash.get(new Integer(instIdx)); if (list != null) { // there are constraints associated with this instance ArrayList constraintList = (ArrayList) list; for (int i = 0; i < constraintList.size(); i++) { InstancePair pair = (InstancePair) constraintList.get(i); int firstIdx = pair.first; int secondIdx = pair.second; Instance instance1 = m_Instances.instance(firstIdx); Instance instance2 = m_Instances.instance(secondIdx); int otherIdx = (firstIdx == instIdx) ? 
m_ClusterAssignments[secondIdx] : m_ClusterAssignments[firstIdx]; // check whether the constraint is violated if (otherIdx != -1) { if (otherIdx != centroidIdx && pair.linkType == InstancePair.MUST_LINK) { sim -= m_MustLinkWeight; } else if (otherIdx == centroidIdx && pair.linkType == InstancePair.CANNOT_LINK) { sim -= m_CannotLinkWeight; } } } } if(m_verbose) { System.out.println("Final similarity between instance " + instIdx + " and centroid " + centroidIdx + " is: " + sim); } return sim; } /** finds squaredistance between instance and centroid in Potts Model */ double squareDistanceInPottsModel(int instIdx, int centroidIdx) throws Exception{ double dist = m_metric.distance(m_Instances.instance(instIdx), m_ClusterCentroids.instance(centroidIdx)); dist *= dist; // doing the squaring here itself if(m_verbose) { System.out.println("Unconstrained distance between instance " + instIdx + " and centroid " + centroidIdx + " is: " + dist); } Object list = m_instanceConstraintHash.get(new Integer(instIdx)); if (list != null) { // there are constraints associated with this instance ArrayList constraintList = (ArrayList) list; for (int i = 0; i < constraintList.size(); i++) { InstancePair pair = (InstancePair) constraintList.get(i); int firstIdx = pair.first; int secondIdx = pair.second; Instance instance1 = m_Instances.instance(firstIdx); Instance instance2 = m_Instances.instance(secondIdx); int otherIdx = (firstIdx == instIdx) ? m_ClusterAssignments[secondIdx] : m_ClusterAssignments[firstIdx]; // check whether the constraint is violated if (otherIdx != -1) { if (otherIdx != centroidIdx && pair.linkType == InstancePair.MUST_LINK) { dist += m_MustLinkWeight; } else if (otherIdx == centroidIdx && pair.linkType == InstancePair.CANNOT_LINK) { dist += m_CannotLinkWeight; } } } } if(m_verbose) { System.out.println("Final distance between instance " + instIdx + " and centroid " + centroidIdx + " is: " + dist); } return dist; } /** M-step of the KMeans clustering algorithm -- updates cluster centroids */ protected void updateClusterCentroids() throws Exception { // M-step: update cluster centroids Instances [] tempI = new Instances[m_NumClusters]; m_ClusterCentroids = new Instances(m_Instances, m_NumClusters); for (int i = 0; i < m_NumClusters; i++) { tempI[i] = new Instances(m_Instances, 0); // tempI[i] stores the cluster instances for cluster i } for (int i = 0; i < m_Instances.numInstances(); i++) { tempI[m_ClusterAssignments[i]].add(m_Instances.instance(i)); } // Calculates cluster centroids for (int i = 0; i < m_NumClusters; i++) { double [] values = new double[m_Instances.numAttributes()]; if (m_isSparseInstance) { values = meanOrMode(tempI[i]); // uses fast meanOrMode } else { for (int j = 0; j < m_Instances.numAttributes(); j++) { values[j] = tempI[i].meanOrMode(j); // uses usual meanOrMode } } // cluster centroids are dense in SPKMeans m_ClusterCentroids.add(new Instance(1.0, values)); if (m_Algorithm == ALGORITHM_SPHERICAL) { try { normalize(m_ClusterCentroids.instance(i)); } catch (Exception e) { e.printStackTrace(); } } } for (int i = 0; i < m_NumClusters; i++) { tempI[i] = null; // free memory for garbage collector to pick up } } /** calculates objective function */ protected void calculateObjectiveFunction() throws Exception { if (m_verbose) { System.out.println("Calculating objective function ..."); } m_Objective = 0; double tempML = m_MustLinkWeight; double tempCL = m_CannotLinkWeight; m_MustLinkWeight = tempML/2; m_CannotLinkWeight = tempCL/2; // adjust weights to take care of double 
counting of constraints if (m_verbose) { System.out.println("Must link weight: " + m_MustLinkWeight); System.out.println("Cannot link weight: " + m_CannotLinkWeight); } for (int i=0; i<m_Instances.numInstances(); i++) { if (m_objFunDecreasing) { m_Objective += squareDistanceInPottsModel(i, m_ClusterAssignments[i]); } else { m_Objective += similarityInPottsModel(i, m_ClusterAssignments[i]); } } m_MustLinkWeight = tempML; m_CannotLinkWeight = tempCL; // reset the values of the constraint weights if (m_verbose) { System.out.println("Must link weight: " + m_MustLinkWeight); System.out.println("Cannot link weight: " + m_CannotLinkWeight); } } /** Actual KMeans function */ protected void runKMeans() throws Exception { boolean converged = false; m_Iterations = 0; double oldObjective = m_objFunDecreasing ? Double.POSITIVE_INFINITY : Double.NEGATIVE_INFINITY; while (!converged) { // E-step: updates m_Objective if (m_verbose) { System.out.println("Doing E-step ..."); } // to find the instance indices in the clusters, for constraint calculation in E-step findBestAssignments(); // Find objective function if (m_Iterations > 0) { calculateObjectiveFunction(); System.out.println("Objective function after point assignment: " + m_Objective); } // M-step if (m_verbose) { System.out.println("Doing M-step ..."); } updateClusterCentroids(); calculateObjectiveFunction(); System.out.println("Objective function after centroid estimation: " + m_Objective); m_Iterations++; // Convergence check if(Math.abs(oldObjective - m_Objective) > m_ObjFunConvergenceDifference) { converged = false; } else { converged = true; System.out.println("Final Objective function is: " + m_Objective); } if ((!m_objFunDecreasing && oldObjective > m_Objective) || (m_objFunDecreasing && oldObjective < m_Objective)) { throw new Exception("Oscillations => bug in objective function/EM step!!"); } oldObjective = m_Objective; } } /** Dummy: not implemented for PCKMeans */ public int[] bestInstancesForActiveLearning(int numActive) throws Exception{ throw new Exception("Not implemented for PCKMeans"); } /** Returns the indices of the best numActive instances for active learning */ public InstancePair[] bestPairsForActiveLearning(int numActive) throws Exception{ int usedQueries = activePhaseOne(numActive); if (m_PhaseTwoRandom) { activePhaseTwoRandom(numActive-usedQueries); } else { activePhaseTwoRoundRobin(numActive-usedQueries); } return null; } /** * Checks if instance has to be normalized and classifies the * instance using the current clustering * * @param instance the instance to be assigned to a cluster * @return the number of the assigned cluster as an integer * if the class is enumerated, otherwise the predicted value * @exception Exception if instance could not be classified * successfully */ public int clusterInstance(Instance instance) throws Exception { if (m_Algorithm == ALGORITHM_SPHERICAL) { // check here, since evaluateModel calls this function on test data normalize(instance); } return assignInstanceToCluster(instance); } /** lookup the instance in the checksum hash * @param instance instance to be looked up * @return the index of the cluster to which the instance was assigned, -1 if the instance has not been clustered */ protected int lookupInstanceCluster(Instance instance) { int classIdx = instance.classIndex(); double[] values1 = instance.toDoubleArray(); double checksum = 0; for (int i = 0; i < values1.length; i++) { if (i != classIdx) { checksum += m_checksumCoeffs[i] * values1[i]; } } Object list
= m_checksumHash.get(new Double(checksum)); if (list != null) { // go through the list of instances with the same checksum and find the one that is equivalent ArrayList checksumList = (ArrayList) list; for (int i = 0; i < checksumList.size(); i++) { int instanceIdx = ((Integer) checksumList.get(i)).intValue(); Instance listInstance = m_Instances.instance(instanceIdx); double[] values2 = listInstance.toDoubleArray(); boolean equal = true; for (int j = 0; j < values1.length && equal == true; j++) { if (j != classIdx) { if (values1[j] != values2[j]) { equal = false; } } } if (equal == true) { return m_ClusterAssignments[instanceIdx]; } } } return -1; } /** * Classifies the instance using the current clustering, without considering constraints * * @param instance the instance to be assigned to a cluster * @return the number of the assigned cluster as an integer * if the class is enumerated, otherwise the predicted value * @exception Exception if instance could not be classified * successfully */ public int assignInstanceToCluster(Instance instance) throws Exception { // lookup the cluster assignment of the instance int lookupCluster = lookupInstanceCluster(instance); if (lookupCluster >= 0) { return lookupCluster; } System.out.println("Something's wrong, we're supposed to look it up but couldn't find it; size=" + m_checksumHash.size()); throw new Exception("WARNING!!!\n\nCouldn't lookup the instance!!!!\n\n"); } /** Set the cannot link constraint weight */ public void setCannotLinkWeight(double w) { m_CannotLinkWeight = w; } /** Return the cannot link constraint weight */ public double getCannotLinkWeight() { return m_CannotLinkWeight; } /** Set the must link constraint weight */ public void setMustLinkWeight(double w) { m_MustLinkWeight = w; } /** Return the must link constraint weight */ public double getMustLinkWeight() { return m_MustLinkWeight; } /** Return m_PhaseTwoRandom */ public boolean getPhaseTwoRandom() { return m_PhaseTwoRandom; } /** Set m_PhaseTwoRandom */ public void setPhaseTwoRandom(boolean w) { m_PhaseTwoRandom = w; } /** Return m_AllExplore */ public boolean getAllExplore() { return m_AllExplore; } /** Set m_AllExplore */ public void setAllExplore(boolean b) { m_AllExplore = b; } /** Return the number of clusters */ public int getNumClusters() { return m_NumClusters; } /** A duplicate function to conform to Clusterer abstract class.
* @return the number of clusters */ public int numberOfClusters() { return getNumClusters(); } /** Set the m_SeedHash */ public void setSeedHash(HashMap seedhash) { System.err.println("Not implemented here"); } /** * Set the random number seed * @param s the seed */ public void setRandomSeed (int s) { m_RandomSeed = s; } /** Return the random number seed */ public int getRandomSeed () { return m_RandomSeed; } /** * Set m_MovePointsTillAssignmentStabilizes * @param b truth value */ public void setMovePointsTillAssignmentStabilizes (boolean b) { m_MovePointsTillAssignmentStabilizes = b; } /** Return m_MovePointsTillAssignmentStabilizes */ public boolean getMovePointsTillAssignmentStabilizes () { return m_MovePointsTillAssignmentStabilizes; } /** * Set the minimum value of the objective function difference required for convergence * @param objFunConvergenceDifference the minimum value of the objective function difference required for convergence */ public void setObjFunConvergenceDifference(double objFunConvergenceDifference) { m_ObjFunConvergenceDifference = objFunConvergenceDifference; } /** * Get the minimum value of the objective function difference required for convergence * @return the minimum value of the objective function difference required for convergence */ public double getObjFunConvergenceDifference() { return m_ObjFunConvergenceDifference; } /** Sets training instances */ public void setInstances(Instances instances) { m_Instances = instances; // create the checksum coefficients m_checksumCoeffs = new double[instances.numAttributes()]; for (int i = 0; i < m_checksumCoeffs.length; i++) { m_checksumCoeffs[i] = m_RandomNumberGenerator.nextDouble(); } // hash the instance checksums m_checksumHash = new HashMap(instances.numInstances()); int classIdx = instances.classIndex(); for (int i = 0; i < instances.numInstances(); i++) { Instance instance = instances.instance(i); double[] values = instance.toDoubleArray(); double checksum = 0; for (int j = 0; j < values.length; j++) { if (j != classIdx) { checksum += m_checksumCoeffs[j] * values[j]; } } // take care of chaining Object list = m_checksumHash.get(new Double(checksum)); ArrayList idxList = null; if (list == null) { idxList = new ArrayList(); m_checksumHash.put(new Double(checksum), idxList); } else { // chaining idxList = (ArrayList) list; } idxList.add(new Integer(i)); } } /** Return training instances */ public Instances getInstances() { return m_Instances; } /** * Set the number of clusters to generate * * @param n the number of clusters to generate */ public void setNumClusters(int n) { m_NumClusters = n; if (m_verbose) { System.out.println("Number of clusters: " + n); } } /** * Set the distance metric * * @param m the metric */ public void setMetric (LearnableMetric m) { m_metric = m; String metricName = m_metric.getClass().getName(); System.out.println("Setting m_metric to " + metricName); m_objFunDecreasing = m.isDistanceBased(); } /** * Get the distance metric * * @return the distance metric used */ public Metric getMetric () { return m_metric; } /** * Set the KMeans algorithm. Values other than * ALGORITHM_SIMPLE or ALGORITHM_SPHERICAL will be ignored * * @param algo algorithm type */ public void setAlgorithm (SelectedTag algo) { if (algo.getTags() == TAGS_ALGORITHM) { if (m_verbose) { System.out.println("Algorithm: " + algo.getSelectedTag().getReadable()); } m_Algorithm = algo.getSelectedTag().getID(); } } /** * Get the KMeans algorithm type.
Will be one of * ALGORITHM_SIMPLE or ALGORITHM_SPHERICAL * * @return algorithm type */ public SelectedTag getAlgorithm () { return new SelectedTag(m_Algorithm, TAGS_ALGORITHM); } /** * Set the instance ordering * * @param order instance ordering */ public void setInstanceOrdering (SelectedTag order) { if (order.getTags() == TAGS_ORDERING) { if (m_verbose) { System.out.println("Ordering: " + order.getSelectedTag().getReadable()); } m_InstanceOrdering = order.getSelectedTag().getID(); } } /** * Get the instance ordering * * @return ordering type */ public SelectedTag getInstanceOrdering () { return new SelectedTag(m_InstanceOrdering, TAGS_ORDERING); } /** Read the seeds from a hashtable, where every key is an instance and every value is the cluster assignment of that instance * @param seedHash the (instance -> cluster assignment) mapping */ public void seedClusterer(HashMap seedHash) { System.err.println("Not implemented here"); } /** Prints clusters */ public void printClusters () throws Exception{ ArrayList clusters = getClusters(); for (int i=0; i<clusters.size(); i++) { Cluster currentCluster = (Cluster) clusters.get(i); if (currentCluster == null) { System.out.println("\nCluster " + i + ": (empty)"); } else { System.out.println("\nCluster " + i + ": " + currentCluster.size() + " instances"); for (int j=0; j<currentCluster.size(); j++) { Instance instance = (Instance) currentCluster.get(j); System.out.println("Instance: " + instance); } } } } /** * Computes the clusters from the cluster assignments, for external access * * @exception Exception if clusters could not be computed successfully */ public ArrayList getClusters() throws Exception { m_Clusters = new ArrayList(); Cluster [] clusterArray = new Cluster[m_NumClusters]; for (int i=0; i < m_Instances.numInstances(); i++) { Instance inst = m_Instances.instance(i); if(clusterArray[m_ClusterAssignments[i]] == null) clusterArray[m_ClusterAssignments[i]] = new Cluster(); clusterArray[m_ClusterAssignments[i]].add(inst, 1); } for (int j =0; j< m_NumClusters; j++) m_Clusters.add(clusterArray[j]); return m_Clusters; } /** * Computes the clusters from the cluster assignments, for external access * * @exception Exception if clusters could not be computed successfully */ public HashSet[] getIndexClusters() throws Exception { m_IndexClusters = new HashSet[m_NumClusters]; for (int i=0; i < m_Instances.numInstances(); i++) { // if (m_verbose) { // System.out.println("In getIndexClusters, " + i + " assigned to cluster " + m_ClusterAssignments[i]); // } if (m_ClusterAssignments[i]!=-1 && m_ClusterAssignments[i] < m_NumCurrentClusters) { if (m_IndexClusters[m_ClusterAssignments[i]] == null) { m_IndexClusters[m_ClusterAssignments[i]] = new HashSet(); } m_IndexClusters[m_ClusterAssignments[i]].add(new Integer(i)); } } return m_IndexClusters; } public Enumeration listOptions () { Vector newVector = new Vector(10); newVector.addElement(new Option("\tnumber of clusters (default = 3)."
, "N", 1, "-N <num>")); newVector.addElement(new Option("\trandom number seed (default 1)" , "R", 1, "-R <num>")); newVector.addElement(new Option("\tperform no seeding (default false)" , "NS", 1, "-NS")); newVector.addElement(new Option("\tperform active learning (default false)" , "A", 1, "-A")); newVector.addElement(new Option("\tphase two of active learning is random (default false)" , "P2", 1, "-P2")); newVector.addElement(new Option("\tdo only Explore phase in active learning (default false)" , "E", 1, "-E")); newVector.addElement(new Option("\tmetric type (default WeightedEuclidean)" , "M", 1, "-M <string> (WeightedEuclidean or WeightedDotP)")); newVector.addElement(new Option("\tconstraints file" , "C", 1, "-C <string> (each line is of the form \"firstID\\tsecondID\\t<+1/-1>\", where +1=>must-link, -1=>cannot-link)")); newVector.addElement(new Option("\tmust link weight (default 1)" , "ML", 1, "-ML <double>")); newVector.addElement(new Option("\tcannot link weight (default 1)" , "CL", 1, "-CL <double>")); newVector.addElement(new Option("\talgorithm type (default Simple)" , "A", 1, "-A <string> (Simple => Simple-KMeans, Spherical => Spherical-KMeans)")); return newVector.elements(); } public String [] getOptions () { String[] options = new String[80]; int current = 0; if (!m_Seedable) { options[current++] = "-NS"; } if (m_MovePointsTillAssignmentStabilizes) { options[current++] = "-Stable"; } options[current++] = "-IO"; options[current++] = "" + getInstanceOrdering().getSelectedTag().getID(); options[current++] = "-A"; options[current++] = "" + getAlgorithm().getSelectedTag().getID(); if (getActive()) { options[current++] = "-active"; } options[current++] = "-N"; options[current++] = "" + getNumClusters(); options[current++] = "-E"; options[current++] = "" + getAllExplore(); options[current++] = "-P2"; options[current++] = "" + getPhaseTwoRandom(); options[current++] = "-R"; options[current++] = "" + getRandomSeed(); options[current++] = "-ML"; options[current++] = "" + m_MustLinkWeight; options[current++] = "-CL"; options[current++] = "" + m_CannotLinkWeight; options[current++] = "-M"; options[current++] = Utils.removeSubstring(m_metric.getClass().getName(), "weka.core.metrics."); if (m_metric instanceof OptionHandler) { String[] metricOptions = ((OptionHandler)m_metric).getOptions(); for (int i = 0; i < metricOptions.length; i++) { options[current++] = metricOptions[i]; } } while (current < options.length) { options[current++] = ""; } return options; } /** * Parses a given list of options. 
* @param options the list of options as an array of strings * @exception Exception if an option is not supported */ public void setOptions (String[] options) throws Exception { String optionString = Utils.getOption('N', options); if (optionString.length() != 0) { setNumClusters(Integer.parseInt(optionString)); } optionString = Utils.getOption('R', options); if (optionString.length() != 0) { setRandomSeed(Integer.parseInt(optionString)); } optionString = Utils.getOption('A', options); if (optionString.length() != 0) { setAlgorithm(new SelectedTag(Integer.parseInt(optionString), TAGS_ALGORITHM)); } optionString = Utils.getOption('M', options); if (optionString.length() != 0) { String[] metricSpec = Utils.splitOptions(optionString); String metricName = metricSpec[0]; metricSpec[0] = ""; setMetric((LearnableMetric)LearnableMetric.forName(metricName, metricSpec)); } } /** * return a string describing this clusterer * * @return a description of the clusterer as a string */ public String toString() { StringBuffer temp = new StringBuffer(); temp.append("\nkMeans\n======\n"); temp.append("\nNumber of iterations: " + m_Iterations+"\n"); // temp.append("\nCluster centroids:\n"); // for (int i = 0; i < m_NumClusters; i++) { // temp.append("\nCluster "+i+"\n\t"); // } // temp.append("\n"); return temp.toString(); } /** * set the active level of the clusterer * @param active true to turn on active learning */ public void setActive (boolean active) { m_Active = active; } /** * get the active level of the clusterer * @return active */ public boolean getActive () { return m_Active; } /** * set the verbosity level of the clusterer * @param verbose messages on (true) or off (false) */ public void setVerbose (boolean verbose) { m_verbose = verbose; } /** * get the verbosity level of the clusterer * @return messages on (true) or off (false) */ public boolean getVerbose () { return m_verbose; } /** * Train the clusterer using specified parameters * * @param instances Instances to be used for training */ public void trainClusterer (Instances instances) throws Exception { if (m_metric instanceof LearnableMetric) { if (((LearnableMetric)m_metric).getTrainable()) { ((LearnableMetric)m_metric).learnMetric(instances); } else { throw new Exception ("Metric is not trainable"); } } else { throw new Exception ("Metric is not trainable"); } } /** Normalizes Instance or SparseInstance * * @author Sugato Basu * @param inst Instance to be normalized */ public void normalize(Instance inst) throws Exception { if (inst instanceof SparseInstance) { normalizeSparseInstance(inst); } else { normalizeInstance(inst); } } /** Normalizes the values of a normal Instance in L2 norm * * @author Sugato Basu * @param inst Instance to be normalized */ public void normalizeInstance(Instance inst) throws Exception{ double norm = 0; double values [] = inst.toDoubleArray(); if (inst instanceof SparseInstance) { System.err.println("Is SparseInstance, using normalizeSparseInstance function instead"); normalizeSparseInstance(inst); return; } for (int i=0; i<values.length; i++) { if (i != inst.classIndex()) { // don't normalize the class index norm += values[i] * values[i]; } } norm = Math.sqrt(norm); for (int i=0; i<values.length; i++) { if (i != inst.classIndex()) { // don't normalize the class index values[i] /= norm; } } inst.setValueArray(values); } /** Normalizes the values of a SparseInstance in L2 norm * * @author Sugato Basu * @param inst SparseInstance to be normalized */ public void normalizeSparseInstance(Instance inst) throws Exception{ double norm=0; int length =
inst.numValues(); if (!(inst instanceof SparseInstance)) { System.err.println("Not SparseInstance, using normalizeInstance function instead"); normalizeInstance(inst); return; } for (int i=0; i<length; i++) { if (inst.index(i) != inst.classIndex()) { // don't normalize the class index norm += inst.valueSparse(i) * inst.valueSparse(i); } } norm = Math.sqrt(norm); for (int i=0; i<length; i++) { // don't normalize the class index if (inst.index(i) != inst.classIndex()) { inst.setValueSparse(i, inst.valueSparse(i)/norm); } } } /** Fast version of meanOrMode - streamlined from Instances.meanOrMode for efficiency * Does not check for missing attributes, assumes numeric attributes, assumes Sparse instances */ protected double[] meanOrMode(Instances insts) { int numAttributes = insts.numAttributes(); double [] value = new double[numAttributes]; double weight = 0; for (int i=0; i<numAttributes; i++) { value[i] = 0; } for (int j=0; j<insts.numInstances(); j++) { SparseInstance inst = (SparseInstance) (insts.instance(j)); weight += inst.weight(); for (int i=0; i<inst.numValues(); i++) { int indexOfIndex = inst.index(i); value[indexOfIndex] += inst.weight() * inst.valueSparse(i); } } if (Utils.eq(weight, 0)) { for (int k=0; k<numAttributes; k++) { value[k] = 0; } } else { for (int k=0; k<numAttributes; k++) { value[k] = value[k] / weight; } } return value; } /** * Gets a Double representing the current time in milliseconds since the epoch (UTC). * * @return a value of type Double */ public static Double getTimeStamp() { Calendar now = Calendar.getInstance(TimeZone.getTimeZone("UTC")); double timestamp = now.getTimeInMillis(); return new Double(timestamp); } /** * Main method for testing this class. * */ public static void main (String[] args) { try { testCase(); //System.out.println(ClusterEvaluation.evaluateClusterer(new PCKMeans(), args)); } catch (Exception e) { System.out.println(e.getMessage()); e.printStackTrace(); } } protected static void testCase() { try { //String dataset = new String("lowd"); String dataset = new String("highd"); if (dataset.equals("lowd")) { //////// Low-D data String datafile = "/u/ml/software/weka-latest/data/iris.arff"; // set up the data FileReader reader = new FileReader (datafile); Instances data = new Instances (reader); // Make the last attribute be the class int classIndex = data.numAttributes()-1; data.setClassIndex(classIndex); // starts with 0 System.out.println("ClassIndex is: " + classIndex); // Remove the class labels before clustering Instances clusterData = new Instances(data); clusterData.deleteClassAttribute(); // create random constraints from the labeled training data int numPairs = 100, num=0; ArrayList labeledPairs = new ArrayList(numPairs); Random rand = new Random(42); System.out.println("Initializing constraint matrix:"); while (num < numPairs) { int i = (int) (data.numInstances()*rand.nextFloat()); int j = (int) (data.numInstances()*rand.nextFloat()); int first = (i<j)? i:j; int second = (i>=j)? i:j; int linkType = (data.instance(first).classValue() == data.instance(second).classValue())?
InstancePair.MUST_LINK:InstancePair.CANNOT_LINK; InstancePair pair = new InstancePair(first, second, linkType); if (first!=second && !labeledPairs.contains(pair)) { labeledPairs.add(pair); num++; } } System.out.println("Finished initializing constraints"); // create clusterer PCKMeans pckmeans = new PCKMeans(); System.out.println("\nClustering the iris data using PCKmeans...\n"); pckmeans.setAlgorithm(new SelectedTag(ALGORITHM_SIMPLE, TAGS_ALGORITHM)); WeightedEuclidean euclidean = new WeightedEuclidean(); euclidean.setExternal(false); pckmeans.setMetric(euclidean); pckmeans.setVerbose(false); pckmeans.setActive(false); pckmeans.setSeedable(true); pckmeans.setNumClusters(data.numClasses()); // do clustering pckmeans.buildClusterer(labeledPairs, clusterData, data, data.numInstances()); pckmeans.getIndexClusters(); // pckmeans.printIndexClusters(); SemiSupClustererEvaluation eval = new SemiSupClustererEvaluation(pckmeans.m_TotalTrainWithLabels, pckmeans.m_TotalTrainWithLabels.numClasses(), pckmeans.m_TotalTrainWithLabels.numClasses()); eval.evaluateModel(pckmeans, pckmeans.m_TotalTrainWithLabels, pckmeans.m_Instances); System.out.println("MI=" + eval.mutualInformation()); System.out.print("FM=" + eval.pairwiseFMeasure()); System.out.print("\tP=" + eval.pairwisePrecision()); System.out.print("\tR=" + eval.pairwiseRecall()); } else if (dataset.equals("highd")) { //////// Newsgroup data // String datafile = "/u/ml/data/CCSfiles/arffFromCCS/cmu-newsgroup-clean-1000_fromCCS.arff"; String datafile = "/u/ml/data/CCSfiles/arffFromCCS/different-100_fromCCS.arff"; // set up the data FileReader reader = new FileReader (datafile); Instances data = new Instances (reader); // Make the last attribute be the class int classIndex = data.numAttributes()-1; data.setClassIndex(classIndex); // starts with 0 System.out.println("ClassIndex is: " + classIndex); // Remove the class labels before clustering Instances clusterData = new Instances(data); clusterData.deleteClassAttribute(); // create random constraints from the labeled training data int numPairs = 100, num=0; ArrayList labeledPairs = new ArrayList(numPairs); Random rand = new Random(42); System.out.println("Initializing constraint matrix:"); while (num < numPairs) { int i = (int) (data.numInstances()*rand.nextFloat()); int j = (int) (data.numInstances()*rand.nextFloat()); int first = (i<j)? i:j; int second = (i>=j)? i:j; int linkType = (data.instance(first).classValue() == data.instance(second).classValue())? 
InstancePair.MUST_LINK:InstancePair.CANNOT_LINK; InstancePair pair = new InstancePair(first, second, linkType); if (first!=second && !labeledPairs.contains(pair)) { labeledPairs.add(pair); num++; } } System.out.println("Finished initializing constraints"); // create clusterer PCKMeans pckmeans = new PCKMeans(); System.out.println("\nClustering the news data using PCKmeans...\n"); pckmeans.resetClusterer(); pckmeans.setAlgorithm(new SelectedTag(ALGORITHM_SPHERICAL, TAGS_ALGORITHM)); pckmeans.setInstanceOrdering(new SelectedTag(ORDERING_SORTED, TAGS_ORDERING)); pckmeans.setMovePointsTillAssignmentStabilizes(false); WeightedDotP dotp = new WeightedDotP(); dotp.setExternal(false); dotp.setLengthNormalized(true); pckmeans.setMetric(dotp); pckmeans.setVerbose(false); pckmeans.setActive(false); //pckmeans.setActive(true); // uncomment to run Active Learning pckmeans.setSeedable(true); pckmeans.setNumClusters(data.numClasses()); // do clustering pckmeans.buildClusterer(labeledPairs, clusterData, data, clusterData.numInstances()); pckmeans.getIndexClusters(); // pckmeans.printIndexClusters(); SemiSupClustererEvaluation eval = new SemiSupClustererEvaluation(pckmeans.m_TotalTrainWithLabels, pckmeans.m_TotalTrainWithLabels.numClasses(), pckmeans.m_TotalTrainWithLabels.numClasses()); eval.evaluateModel(pckmeans, pckmeans.m_TotalTrainWithLabels, pckmeans.m_Instances); System.out.println("MI=" + eval.mutualInformation()); System.out.print("FM=" + eval.pairwiseFMeasure()); System.out.print("\tP=" + eval.pairwisePrecision()); System.out.print("\tR=" + eval.pairwiseRecall()); } } catch (Exception e) { e.printStackTrace(); } } } // TODO: Add reading constraints from file // TODO: Add all the options to setOptions // TODO: Add all the options in comment on top of class
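/*
 * Illustrative sketch, not part of the original PCKMeans implementation: a
 * minimal, self-contained example of the penalized assignment score computed
 * in squareDistanceInPottsModel above, using plain double[] points and a
 * squared Euclidean distance instead of a weka.core.metrics Metric. All names
 * in this class (PCKMeansScoreSketch, penalizedScore, ...) are hypothetical
 * and exist only for illustration.
 */
class PCKMeansScoreSketch {

  /** Squared Euclidean distance between a point and a centroid */
  static double squaredDistance(double[] x, double[] centroid) {
    double d = 0;
    for (int i = 0; i < x.length; i++) {
      double diff = x[i] - centroid[i];
      d += diff * diff;
    }
    return d;
  }

  /**
   * Penalized score of assigning point x to cluster h: squared distance to the
   * centroid, plus mustLinkWeight for every must-link partner currently assigned
   * to a different cluster, plus cannotLinkWeight for every cannot-link partner
   * currently assigned to cluster h. Partners with assignment -1 (unassigned)
   * incur no penalty, mirroring the otherIdx != -1 check in the methods above.
   */
  static double penalizedScore(double[] x, double[] centroid, int h,
                               int[] mustLinkPartners, int[] cannotLinkPartners,
                               int[] assignments,
                               double mustLinkWeight, double cannotLinkWeight) {
    double score = squaredDistance(x, centroid);
    for (int i = 0; i < mustLinkPartners.length; i++) {
      int other = assignments[mustLinkPartners[i]];
      if (other != -1 && other != h) {
        score += mustLinkWeight;   // violated must-link
      }
    }
    for (int i = 0; i < cannotLinkPartners.length; i++) {
      if (assignments[cannotLinkPartners[i]] == h) {
        score += cannotLinkWeight; // violated cannot-link
      }
    }
    return score;
  }

  /** Toy example: instance 0 is must-linked to instance 1 and cannot-linked to instance 2 */
  public static void main(String[] args) {
    double[][] centroids = { {0.0, 0.0}, {4.0, 4.0} };
    double[] x = { 3.0, 3.0 };         // instance 0, not yet assigned
    int[] assignments = { -1, 0, 1 };  // current assignments of instances 0, 1, 2
    int[] mustLinkPartners = { 1 };
    int[] cannotLinkPartners = { 2 };
    for (int h = 0; h < centroids.length; h++) {
      double s = penalizedScore(x, centroids[h], h, mustLinkPartners,
                                cannotLinkPartners, assignments, 1.0, 1.0);
      System.out.println("Cluster " + h + ": penalized score = " + s);
    }
    // Cluster 0: 18.0 + 0 (must-link satisfied) + 0 (cannot-link satisfied) = 18.0
    // Cluster 1:  2.0 + 1.0 (must-link violated) + 1.0 (cannot-link violated) = 4.0
    // so instance 0 would still be assigned to cluster 1, which has the lower score
  }
}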