/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* FarthestFirst.java
* Copyright (C) 2002 Bernhard Pfahringer
* based on SimpleKMeans which is
* Copyright (C) 2000 Mark Hall (mhall@cs.waikato.ac.nz)
*
*/
package weka.clusterers;
import java.io.*;
import java.util.*;
import weka.core.*;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.ReplaceMissingValues;
/**
 * Implements the "Farthest First Traversal Algorithm" by
 * Hochbaum and Shmoys 1985: A best possible heuristic for the
 * k-center problem, Mathematics of Operations Research, 10(2):180-184,
 * as cited by Sanjoy Dasgupta "performance guarantees for hierarchical
 * clustering", colt 2002, sydney
 *
 * works as a fast simple approximate clusterer
 *
 * modelled after SimpleKMeans, might be a useful initializer for it
 *
 * Valid options are:<p>
 *
 * -N &lt;number of clusters&gt; <br>
 * Specify the number of clusters to generate (default 2). <p>
 *
 * -S &lt;seed&gt; <br>
 * Specify random number seed (default 1). <p>
 *
 * @author Bernhard Pfahringer (bernhard@cs.waikato.ac.nz)
 * @version $Revision: 1.1.1.1 $
 * @see Clusterer
 * @see OptionHandler
 */
// Todo: rewrite to be fully incremental
//       cleanup, like deleting m_instances
public class FarthestFirst extends Clusterer implements OptionHandler {

  /**
   * training instances, not necessary to keep,
   * could be replaced by m_ClusterCentroids where needed for header info.
   * After buildClusterer() has finished this only holds the (empty) header.
   */
  protected Instances m_instances;

  /**
   * replace missing values in training instances
   */
  protected ReplaceMissingValues m_ReplaceMissingFilter;

  /**
   * number of clusters to generate (user option)
   */
  protected int m_NumClusters = 2;

  /**
   * holds the cluster centroids; null until buildClusterer() has run
   */
  protected Instances m_ClusterCentroids;

  /**
   * attribute min values, as seen in the training data
   */
  private double [] m_Min;

  /**
   * attribute max values, as seen in the training data
   */
  private double [] m_Max;

  /**
   * random seed
   */
  protected int m_Seed = 1;

  /**
   * Returns a string describing this clusterer
   * @return a description of the evaluator suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return "Cluster data using the FarthestFirst algorithm";
  }

  /**
   * Generates a clusterer. Has to initialize all fields of the clusterer
   * that are not being set via options.
   *
   * The first centre is drawn at random (using the seed); every further
   * centre is the training instance whose distance to its closest already
   * chosen centre is largest.
   *
   * @param data set of instances serving as training data
   * @exception Exception if the clusterer has not been
   * generated successfully, e.g. because the data contains string
   * attributes or no instances at all
   */
  public void buildClusterer(Instances data) throws Exception {

    if (data.checkForStringAttributes()) {
      throw new Exception("Can't handle string attributes!");
    }

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    m_ReplaceMissingFilter.setInputFormat(data);
    m_instances = Filter.useFilter(data, m_ReplaceMissingFilter);

    int n = m_instances.numInstances();
    if (n == 0) {
      // Random.nextInt(0) would fail below with an unhelpful message;
      // report the real problem instead (same exception type as before).
      throw new IllegalArgumentException(
        "Can't build clusterer: no training instances!");
    }

    // There cannot be more centres than instances. Done before the
    // traversal starts so that the whole build sees one consistent value.
    if (m_NumClusters > n) {
      m_NumClusters = n;
    }

    initMinMax(m_instances);

    m_ClusterCentroids = new Instances(m_instances, m_NumClusters);

    Random r = new Random(m_Seed);
    boolean[] selected = new boolean[n];
    double[] minDistance = new double[n];
    Arrays.fill(minDistance, Double.MAX_VALUE);

    // the first centre is always chosen, whatever m_NumClusters says
    int firstI = r.nextInt(n);
    m_ClusterCentroids.add(m_instances.instance(firstI));
    selected[firstI] = true;
    updateMinDistance(minDistance, selected, m_instances,
                      m_instances.instance(firstI));

    for (int i = 1; i < m_NumClusters; i++) {
      int nextI = farthestAway(minDistance, selected);
      m_ClusterCentroids.add(m_instances.instance(nextI));
      selected[nextI] = true;
      updateMinDistance(minDistance, selected, m_instances,
                        m_instances.instance(nextI));
    }

    // keep only the header; distance() still needs attribute information
    m_instances = new Instances(m_instances, 0);
  }

  /**
   * Lowers the recorded "distance to the closest centre" of every not yet
   * selected instance if the given new centre is closer than any
   * previous one.
   *
   * @param minDistance per-instance distance to the closest centre so far,
   * updated in place
   * @param selected flags the instances that already are centres
   * @param data the instances the two arrays refer to
   * @param center the centre that has just been added
   */
  protected void updateMinDistance(double[] minDistance, boolean[] selected,
                                   Instances data, Instance center) {
    for (int i = 0; i < selected.length; i++) {
      if (!selected[i]) {
        double d = distance(center, data.instance(i));
        if (d < minDistance[i]) {
          minDistance[i] = d;
        }
      }
    }
  }

  /**
   * Finds the not yet selected instance that is farthest away from its
   * closest centre. Ties are resolved in favour of the lowest index.
   *
   * @param minDistance per-instance distance to the closest centre
   * @param selected flags the instances that already are centres
   * @return the index of the farthest instance, or -1 if every instance
   * has been selected already
   */
  protected int farthestAway(double[] minDistance, boolean[] selected) {
    double maxDistance = -1.0;
    int maxI = -1;
    for (int i = 0; i < selected.length; i++) {
      if (!selected[i] && maxDistance < minDistance[i]) {
        maxDistance = minDistance[i];
        maxI = i;
      }
    }
    return maxI;
  }

  /**
   * (Re-)computes the minimum and maximum value of every attribute
   * over the given data. Attributes of an empty dataset stay at NaN.
   *
   * @param data the instances to scan
   */
  protected void initMinMax(Instances data) {
    int numAtts = data.numAttributes();
    m_Min = new double [numAtts];
    m_Max = new double [numAtts];
    for (int i = 0; i < numAtts; i++) {
      // NaN marks "no value seen yet", see updateMinMax()
      m_Min[i] = m_Max[i] = Double.NaN;
    }

    for (int i = 0; i < data.numInstances(); i++) {
      updateMinMax(data.instance(i));
    }
  }

  /**
   * Updates the minimum and maximum values for all the attributes
   * based on a new instance.
   *
   * @param instance the new instance
   */
  private void updateMinMax(Instance instance) {
    for (int j = 0; j < instance.numAttributes(); j++) {
      double value = instance.value(j);
      if (Double.isNaN(m_Min[j])) {
        // first value seen for this attribute
        m_Min[j] = value;
        m_Max[j] = value;
      } else if (value < m_Min[j]) {
        m_Min[j] = value;
      } else if (value > m_Max[j]) {
        m_Max[j] = value;
      }
    }
  }

  /**
   * clusters an instance that has been through the filters
   *
   * @param instance the instance to assign a cluster to
   * @return the index of the closest centre (the lowest index on ties)
   */
  protected int clusterProcessedInstance(Instance instance) {
    double minDist = Double.MAX_VALUE;
    int bestCluster = 0;
    // iterate over the centres that actually exist; m_NumClusters is an
    // option and may have been changed since the model was built
    int numCentroids = m_ClusterCentroids.numInstances();
    for (int i = 0; i < numCentroids; i++) {
      double dist = distance(instance, m_ClusterCentroids.instance(i));
      if (dist < minDist) {
        minDist = dist;
        bestCluster = i;
      }
    }
    return bestCluster;
  }

  /**
   * Classifies a given instance.
   *
   * @param instance the instance to be assigned to a cluster
   * @return the number of the assigned cluster as an integer
   * @exception Exception if instance could not be classified
   * successfully, in particular if no clusterer has been built yet
   */
  public int clusterInstance(Instance instance) throws Exception {
    if (m_ReplaceMissingFilter == null || m_ClusterCentroids == null) {
      throw new IllegalStateException("No clusterer built yet!");
    }
    m_ReplaceMissingFilter.input(instance);
    m_ReplaceMissingFilter.batchFinished();
    Instance inst = m_ReplaceMissingFilter.output();

    return clusterProcessedInstance(inst);
  }

  /**
   * Calculates the distance between two instances. Both instances are
   * walked in parallel over their stored (sparse) values; an attribute
   * that is not stored in one of them is compared as value 0. The class
   * attribute, if set, is ignored.
   *
   * @param first the first instance
   * @param second the second instance
   * @return the distance between the two given instances; within 0 and 1
   * as long as all numeric values lie inside the ranges seen in training
   */
  protected double distance(Instance first, Instance second) {

    double distance = 0;
    int numAtts = m_instances.numAttributes();
    int classIndex = m_instances.classIndex();
    int firstI, secondI;

    for (int p1 = 0, p2 = 0;
         p1 < first.numValues() || p2 < second.numValues();) {
      // an exhausted instance gets an index beyond every real attribute
      if (p1 >= first.numValues()) {
        firstI = numAtts;
      } else {
        firstI = first.index(p1);
      }
      if (p2 >= second.numValues()) {
        secondI = numAtts;
      } else {
        secondI = second.index(p2);
      }
      if (firstI == classIndex) {
        p1++; continue;
      }
      if (secondI == classIndex) {
        p2++; continue;
      }
      double diff;
      if (firstI == secondI) {
        diff = difference(firstI,
                          first.valueSparse(p1),
                          second.valueSparse(p2));
        p1++; p2++;
      } else if (firstI > secondI) {
        // attribute only stored in the second instance
        diff = difference(secondI,
                          0, second.valueSparse(p2));
        p2++;
      } else {
        // attribute only stored in the first instance
        diff = difference(firstI,
                          first.valueSparse(p1), 0);
        p1++;
      }
      distance += diff * diff;
    }

    return Math.sqrt(distance / numAtts);
  }

  /**
   * Computes the difference between two given attribute
   * values.
   *
   * @param index the index of the attribute both values belong to
   * @param val1 the first value
   * @param val2 the second value
   * @return for nominal attributes 0 if both values are present and equal,
   * 1 otherwise; for numeric attributes the (signed) difference of the
   * normalized values, or a pessimistic estimate if a value is missing;
   * 0 for any other attribute type
   */
  protected double difference(int index, double val1, double val2) {

    switch (m_instances.attribute(index).type()) {
    case Attribute.NOMINAL:

      // If attribute is nominal
      if (Instance.isMissingValue(val1) ||
          Instance.isMissingValue(val2) ||
          ((int)val1 != (int)val2)) {
        return 1;
      } else {
        return 0;
      }
    case Attribute.NUMERIC:

      // If attribute is numeric
      if (Instance.isMissingValue(val1) ||
          Instance.isMissingValue(val2)) {
        if (Instance.isMissingValue(val1) &&
            Instance.isMissingValue(val2)) {
          return 1;
        } else {
          // exactly one value is known: assume the missing one is as
          // far away from it as possible within the normalized range
          double diff;
          if (Instance.isMissingValue(val2)) {
            diff = norm(val1, index);
          } else {
            diff = norm(val2, index);
          }
          if (diff < 0.5) {
            diff = 1.0 - diff;
          }
          return diff;
        }
      } else {
        return norm(val1, index) - norm(val2, index);
      }
    default:
      return 0;
    }
  }

  /**
   * Normalizes a given value of a numeric attribute, using the
   * minimum and maximum seen in the training data.
   *
   * @param x the value to be normalized
   * @param i the attribute's index
   * @return the normalized value, or 0 if the attribute's range is
   * unknown or empty
   */
  protected double norm(double x, int i) {

    if (Double.isNaN(m_Min[i]) || Utils.eq(m_Max[i], m_Min[i])) {
      return 0;
    } else {
      return (x - m_Min[i]) / (m_Max[i] - m_Min[i]);
    }
  }

  /**
   * Returns the number of clusters.
   *
   * @return the number of clusters generated for a training dataset, or
   * the requested number of clusters if no clusterer has been built yet
   * @exception Exception if number of clusters could not be returned
   * successfully
   */
  public int numberOfClusters() throws Exception {
    if (m_ClusterCentroids != null) {
      // report what the model really contains, not the current option
      return m_ClusterCentroids.numInstances();
    }
    return m_NumClusters;
  }

  /**
   * Returns an enumeration describing the available options. <p>
   *
   * Valid options are:<p>
   *
   * -N &lt;number of clusters&gt; <br>
   * Specify the number of clusters to generate (default 2). <p>
   *
   * -S &lt;seed&gt; <br>
   * Specify random number seed (default 1). <p>
   *
   * @return an enumeration of all the available options.
   *
   **/
  public Enumeration listOptions () {
    Vector newVector = new Vector(2);

    newVector.addElement(new Option("\tnumber of clusters. (default = 2)."
                                    , "N", 1, "-N <num>"));
    newVector.addElement(new Option("\trandom number seed.\n (default 1)"
                                    , "S", 1, "-S <num>"));

    return newVector.elements();
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String numClustersTipText() {
    return "set number of clusters";
  }

  /**
   * set the number of clusters to generate
   *
   * @param n the number of clusters to generate
   */
  public void setNumClusters(int n) {
    m_NumClusters = n;
  }

  /**
   * gets the number of clusters to generate
   *
   * @return the number of clusters to generate
   */
  public int getNumClusters() {
    return m_NumClusters;
  }

  /**
   * Returns the tip text for this property
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String seedTipText() {
    return "random number seed";
  }

  /**
   * Set the random number seed
   *
   * @param s the seed
   */
  public void setSeed (int s) {
    m_Seed = s;
  }

  /**
   * Get the random number seed
   *
   * @return the seed
   */
  public int getSeed () {
    return m_Seed;
  }

  /**
   * Parses a given list of options. Options that are not given
   * leave the corresponding setting untouched.
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   *
   **/
  public void setOptions (String[] options)
    throws Exception {

    String optionString = Utils.getOption('N', options);

    if (optionString.length() != 0) {
      setNumClusters(Integer.parseInt(optionString));
    }

    optionString = Utils.getOption('S', options);

    if (optionString.length() != 0) {
      setSeed(Integer.parseInt(optionString));
    }
  }

  /**
   * Gets the current settings of FarthestFirst
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions () {
    String[] options = new String[4];
    int current = 0;

    options[current++] = "-N";
    options[current++] = "" + getNumClusters();
    options[current++] = "-S";
    options[current++] = "" + getSeed();

    return options;
  }

  /**
   * return a string describing this clusterer
   *
   * @return a description of the clusterer as a string
   */
  public String toString() {
    if (m_ClusterCentroids == null) {
      return "No clusterer built yet!";
    }

    StringBuffer temp = new StringBuffer();

    temp.append("\n FarthestFirst\n==============\n");

    temp.append("\nCluster centroids:\n");
    // walk the stored centres, not the (possibly changed) option value
    int numCentroids = m_ClusterCentroids.numInstances();
    for (int i = 0; i < numCentroids; i++) {
      Instance centroid = m_ClusterCentroids.instance(i);
      temp.append("\nCluster "+i+"\n\t");
      for (int j = 0; j < m_ClusterCentroids.numAttributes(); j++) {
        if (m_ClusterCentroids.attribute(j).isNominal()) {
          temp.append(" "+m_ClusterCentroids.attribute(j).
                      value((int)centroid.value(j)));
        } else {
          temp.append(" "+centroid.value(j));
        }
      }
    }
    temp.append("\n\n");
    return temp.toString();
  }

  /**
   * Main method for testing this class.
   *
   * @param argv should contain the following arguments: <p>
   * -t training file [-N number of clusters]
   */
  public static void main (String[] argv) {
    try {
      System.out.println(ClusterEvaluation.
                         evaluateClusterer(new FarthestFirst(), argv));
    }
    catch (Exception e) {
      System.out.println(e.getMessage());
      e.printStackTrace();
    }
  }
}