FarthestFirst.java example

Explorer
wekax-master
- weka-3-6-2
- wekaUT
  - GetAllSubPackages.java
  - weka
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    FarthestFirst.java
 *    Copyright (C) 2002 Bernhard Pfahringer
 *    based on SimpleKMeans which is 
 *     Copyright (C) 2000 Mark Hall (mhall@cs.waikato.ac.nz)
 *
 */
package weka.clusterers;

import  java.io.*;
import  java.util.*;
import  weka.core.*;
import  weka.filters.Filter;
import  weka.filters.unsupervised.attribute.ReplaceMissingValues;

/**
 * Implements the "Farthest First Traversal Algorithm" by 
 * Hochbaum and Shmoys 1985: A best possible heuristic for the
 * k-center problem, Mathematics of Operations Research, 10(2):180-184,
 * as cited by Sanjoy Dasgupta "performance guarantees for hierarchical
 * clustering", colt 2002, sydney
 *
 * works as a fast simple approximate clusterer
 *
 * modelled after SimpleKMeans, might be a useful initializer for it
 *
 * Valid options are:<p>
 *
 * -N <number of clusters> <br>
 * Specify the number of clusters to generate. <p>
 *
 * -S <seed> <br>
 * Specify random number seed. <p>
 *
 * @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
 * @version $Revision: 1.1.1.1 $
 * @see Clusterer
 * @see OptionHandler
 */

// Todo: rewrite to be fully incremental
//       cleanup, like deleting m_instances 
public class FarthestFirst extends Clusterer implements OptionHandler {

  /**
   * training instances, not necessary to keep, 
   * could be replaced by m_ClusterCentroids where needed for header info
   */
  protected Instances m_instances;

  /**
   * replace missing values in training instances
   */
  protected ReplaceMissingValues m_ReplaceMissingFilter;

  /**
   * number of clusters to generate
   */
  protected int m_NumClusters = 2;

  /**
   * holds the cluster centroids
   */
  protected Instances m_ClusterCentroids;

  /**
   * attribute min values
   */
  private double [] m_Min;
  
  /**
   * attribute max values
   */
  private double [] m_Max;

  /**
   * random seed
   */
  protected int m_Seed = 1;

  /**
   * Returns a string describing this clusterer
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Cluster data using the FarthestFirst algorithm";
  }

  /**
   * Generates a clusterer. Has to initialize all fields of the clusterer
   * that are not being set via options.
   *
   * @param data set of instances serving as training data 
   * @exception Exception if the clusterer has not been 
   * generated successfully
   */
  public void buildClusterer(Instances data) throws Exception {

    //long start = System.currentTimeMillis();
    if (data.checkForStringAttributes()) {
      throw  new Exception("Can't handle string attributes!");
    }

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(data);
    m_instances = Filter.useFilter(data, m_ReplaceMissingFilter);

    initMinMax(m_instances);

    m_ClusterCentroids = new Instances(m_instances, m_NumClusters);

    int n = m_instances.numInstances();
    Random r = new Random(m_Seed);
    boolean[] selected = new boolean[n];
    double[] minDistance = new double[n];

    for(int i = 0; i<n; i++) minDistance[i] = Double.MAX_VALUE;

    int firstI = r.nextInt(n);
    m_ClusterCentroids.add(m_instances.instance(firstI));
    selected[firstI] = true;

    updateMinDistance(minDistance,selected,m_instances,m_instances.instance(firstI));

    if (m_NumClusters > n) m_NumClusters = n;

    for(int i = 1; i < m_NumClusters; i++) {
      int nextI =  farthestAway(minDistance, selected);
      m_ClusterCentroids.add(m_instances.instance(nextI));
      selected[nextI] = true;
      updateMinDistance(minDistance,selected,m_instances,m_instances.instance(nextI));
    }

    m_instances = new Instances(m_instances,0);
    //long end = System.currentTimeMillis();
    //System.out.println("Clustering Time = " + (end-start));
  }


  protected void updateMinDistance(double[] minDistance, boolean[] selected, 
				   Instances data, Instance center) {
    for(int i = 0; i<selected.length; i++) 
      if (!selected[i]) {
	double d = distance(center,data.instance(i));
	if (d<minDistance[i]) 
	  minDistance[i] = d;
      }
  }

  protected int farthestAway(double[] minDistance, boolean[] selected) {
    double maxDistance = -1.0;
    int maxI = -1;
    for(int i = 0; i<selected.length; i++) 
      if (!selected[i]) 
	if (maxDistance < minDistance[i]) {
	  maxDistance = minDistance[i];
	  maxI = i;
	}
    return maxI;
  }

  protected void initMinMax(Instances data) {
    m_Min = new double [data.numAttributes()];
    m_Max = new double [data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
      m_Min[i] = m_Max[i] = Double.NaN;
    }

    for (int i = 0; i < data.numInstances(); i++) {
      updateMinMax(data.instance(i));
    }
  }


  /**
   * Updates the minimum and maximum values for all the attributes
   * based on a new instance.
   *
   * @param instance the new instance
   */
  private void updateMinMax(Instance instance) {  

    for (int j = 0;j < instance.numAttributes(); j++) {
      if (Double.isNaN(m_Min[j])) {
	m_Min[j] = instance.value(j);
	m_Max[j] = instance.value(j);
      } else {
	if (instance.value(j) < m_Min[j]) {
	  m_Min[j] = instance.value(j);
	} else {
	  if (instance.value(j) > m_Max[j]) {
	    m_Max[j] = instance.value(j);
	  }
	}
      }
    }
  }


  /**
   * clusters an instance that has been through the filters
   *
   * @param instance the instance to assign a cluster to
   * @return a cluster number
   */
  protected int clusterProcessedInstance(Instance instance) {
    double minDist = Double.MAX_VALUE;
    int bestCluster = 0;
    for (int i = 0; i < m_NumClusters; i++) {
      double dist = distance(instance, m_ClusterCentroids.instance(i));
      if (dist < minDist) {
	minDist = dist;
	bestCluster = i;
      }
    }
    return bestCluster;
  }

  /**
   * Classifies a given instance.
   *
   * @param instance the instance to be assigned to a cluster
   * @return the number of the assigned cluster as an integer
   * if the class is enumerated, otherwise the predicted value
   * @exception Exception if instance could not be classified
   * successfully
   */
  public int clusterInstance(Instance instance) throws Exception {
    m_ReplaceMissingFilter.input(instance);
    m_ReplaceMissingFilter.batchFinished();
    Instance inst = m_ReplaceMissingFilter.output();

    return clusterProcessedInstance(inst);
  }

  /**
   * Calculates the distance between two instances
   *
   * @param test the first instance
   * @param train the second instance
   * @return the distance between the two given instances, between 0 and 1
   */          
  protected double distance(Instance first, Instance second) {  

    double distance = 0;
    int firstI, secondI;

    for (int p1 = 0, p2 = 0; 
	 p1 < first.numValues() || p2 < second.numValues();) {
      if (p1 >= first.numValues()) {
	firstI = m_instances.numAttributes();
      } else {
	firstI = first.index(p1); 
      }
      if (p2 >= second.numValues()) {
	secondI = m_instances.numAttributes();
      } else {
	secondI = second.index(p2);
      }
      if (firstI == m_instances.classIndex()) {
	p1++; continue;
      } 
      if (secondI == m_instances.classIndex()) {
	p2++; continue;
      } 
      double diff;
      if (firstI == secondI) {
	diff = difference(firstI, 
			  first.valueSparse(p1),
			  second.valueSparse(p2));
	p1++; p2++;
      } else if (firstI > secondI) {
	diff = difference(secondI, 
			  0, second.valueSparse(p2));
	p2++;
      } else {
	diff = difference(firstI, 
			  first.valueSparse(p1), 0);
	p1++;
      }
      distance += diff * diff;
    }
    
    return Math.sqrt(distance / m_instances.numAttributes());
  }

  /**
   * Computes the difference between two given attribute
   * values.
   */
  protected double difference(int index, double val1, double val2) {

    switch (m_instances.attribute(index).type()) {
    case Attribute.NOMINAL:
      
      // If attribute is nominal
      if (Instance.isMissingValue(val1) || 
	  Instance.isMissingValue(val2) ||
	  ((int)val1 != (int)val2)) {
	return 1;
      } else {
	return 0;
      }
    case Attribute.NUMERIC:

      // If attribute is numeric
      if (Instance.isMissingValue(val1) || 
	  Instance.isMissingValue(val2)) {
	if (Instance.isMissingValue(val1) && 
	    Instance.isMissingValue(val2)) {
	  return 1;
	} else {
	  double diff;
	  if (Instance.isMissingValue(val2)) {
	    diff = norm(val1, index);
	  } else {
	    diff = norm(val2, index);
	  }
	  if (diff < 0.5) {
	    diff = 1.0 - diff;
	  }
	  return diff;
	}
      } else {
	return norm(val1, index) - norm(val2, index);
      }
    default:
      return 0;
    }
  }

  /**
   * Normalizes a given value of a numeric attribute.
   *
   * @param x the value to be normalized
   * @param i the attribute's index
   */
  protected double norm(double x, int i) {

    if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i],m_Min[i])) {
      return 0;
    } else {
      return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
    }
  }

  /**
   * Returns the number of clusters.
   *
   * @return the number of clusters generated for a training dataset.
   * @exception Exception if number of clusters could not be returned
   * successfully
   */
  public int numberOfClusters() throws Exception {
    return m_NumClusters;
  }

  /**
   * Returns an enumeration describing the available options.. <p>
   *
   * Valid options are:<p>
   *
   * -N <number of clusters> <br>
   * Specify the number of clusters to generate. If omitted,
   * FarthestFirst will use cross validation to select the number of clusters
   * automatically. <p>
   *
   * -S <seed> <br>
   * Specify random number seed. <p>
   *
   * @return an enumeration of all the available options.
   *
   **/
  public Enumeration listOptions () {
    Vector newVector = new Vector(2);

     newVector.addElement(new Option("\tnumber of clusters. (default = 2)." 
				    , "N", 1, "-N <num>"));
     newVector.addElement(new Option("\trandom number seed.\n (default 10)"
				     , "S", 1, "-S <num>"));

     return  newVector.elements();
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String numClustersTipText() {
    return "set number of clusters";
  }

  /**
   * set the number of clusters to generate
   *
   * @param n the number of clusters to generate
   */
  public void setNumClusters(int n) {
    m_NumClusters = n;
  }

  /**
   * gets the number of clusters to generate
   *
   * @return the number of clusters to generate
   */
  public int getNumClusters() {
    return m_NumClusters;
  }
    
  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String seedTipText() {
    return "random number seed";
  }


  /**
   * Set the random number seed
   *
   * @param s the seed
   */
  public void setSeed (int s) {
    m_Seed = s;
  }


  /**
   * Get the random number seed
   *
   * @return the seed
   */
  public int getSeed () {
    return  m_Seed;
  }

  /**
   * Parses a given list of options.
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   *
   **/
  public void setOptions (String[] options)
    throws Exception {

    String optionString = Utils.getOption('N', options);

    if (optionString.length() != 0) {
      setNumClusters(Integer.parseInt(optionString));
    }

    optionString = Utils.getOption('S', options);
    
    if (optionString.length() != 0) {
      setSeed(Integer.parseInt(optionString));
    }
  }

  /**
   * Gets the current settings of FarthestFirst
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions () {
    String[] options = new String[4];
    int current = 0;
    
    options[current++] = "-N";
    options[current++] = "" + getNumClusters();
    options[current++] = "-S";
    options[current++] = "" + getSeed();
    
    while (current < options.length) {
      options[current++] = "";
    }

    return  options;
  }

  /**
   * return a string describing this clusterer
   *
   * @return a description of the clusterer as a string
   */
  public String toString() {
    StringBuffer temp = new StringBuffer();

    temp.append("\n FarthestFirst\n==============\n");

    temp.append("\nCluster centroids:\n");
    for (int i = 0; i < m_NumClusters; i++) {
      temp.append("\nCluster "+i+"\n\t");
      for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
	if (m_ClusterCentroids.attribute(j).isNominal()) {
	  temp.append(" "+m_ClusterCentroids.attribute(j).
		      value((int)m_ClusterCentroids.instance(i).value(j)));
	} else {
	  temp.append(" "+m_ClusterCentroids.instance(i).value(j));
	}
      }
    }
    temp.append("\n\n");
    return temp.toString();
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain the following arguments: <p>
   * -t training file [-N number of clusters]
   */
  public static void main (String[] argv) {
    try {
      System.out.println(ClusterEvaluation.
			 evaluateClusterer(new FarthestFirst(), argv));
    }
    catch (Exception e) {
      System.out.println(e.getMessage());
      e.printStackTrace();
    }
  }
}