package ca.pfv.spmf.algorithms.clustering.kmeans;
/* This file is copyright (c) 2008-2015 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction;
import ca.pfv.spmf.patterns.cluster.ClusterWithMean;
import ca.pfv.spmf.patterns.cluster.DoubleArray;
import ca.pfv.spmf.tools.MemoryLogger;
/**
 * An implementation of the Bisecting K-means algorithm (Steinbach et al, 2000).
 * <br/><br/>
 *
 * "A comparison of document clustering techniques", M. Steinbach, G. Karypis
 * and V. Kumar. Workshop on Text Mining, KDD, 2000.<br/><br/>
 *
 * The Bisecting K-Means algorithm is a variation of the regular K-Means algorithm.
 * It consists of the following steps:
 * (1) pick a cluster, (2) find 2 sub-clusters using the basic K-Means algorithm
 * (bisecting step), (3) repeat step 2, the bisecting step, ITER times and keep
 * the split that produces the best clustering, (4) repeat steps 1, 2 and 3 until
 * the desired number of clusters is reached.
 * <br/><br/>
 *
 * In this implementation, the Sum of Squared Errors (SSE) is used to decide which
 * split is the best one (lower is better). Moreover, the largest cluster is always
 * the one chosen to be split, as suggested by Steinbach et al. An alternative
 * would be to always choose the cluster with the highest SSE, but this is not
 * what this implementation does.
 * <br/><br/>
 *
 * @author Philippe Fournier-Viger
 * @see AlgoKMeans
 */
public class AlgoBisectingKMeans extends AlgoKMeans{

    /**
     * the number of times a split should be repeated to choose the best one
     * (values below 1 are treated as 1 by {@code applyAlgorithm})
     */
    int iter = -1;

    /**
     * Default constructor
     */
    public AlgoBisectingKMeans() {
    }

    /**
     * Run the Bisecting K-Means algorithm.
     * This method stores the number of repetitions and then delegates to the
     * inherited three-parameter {@code runAlgorithm}.
     *
     * @param inputFile an input file path containing a list of vectors of double values
     * @param k the parameter k (the desired number of clusters)
     * @param distanceFunction a distance function
     * @param iter the number of times a split should be repeated to choose the best one
     * @return a list of clusters (some of them may be empty)
     * @throws NumberFormatException propagated from the inherited {@code runAlgorithm}
     *         (presumably when a value of the input file is not a valid number)
     * @throws IOException propagated from the inherited {@code runAlgorithm}
     *         (presumably when the input file cannot be accessed)
     */
    public List<ClusterWithMean> runAlgorithm(String inputFile, int k,
            DistanceFunction distanceFunction, int iter) throws NumberFormatException, IOException {
        this.iter = iter;
        return runAlgorithm(inputFile, k, distanceFunction);
    }

    /**
     * Apply the Bisecting K-Means algorithm and store the result in {@code clusters}.
     * Starting from all the vectors, the largest cluster is repeatedly replaced by
     * its best bisection until at least k clusters have been obtained or until no
     * further bisection is possible.
     *
     * @param k the parameter k (the desired number of clusters)
     * @param distanceFunction a distance function
     * @param vectors the list of initial vectors
     * @param minValue the min value
     * @param maxValue the max value
     * @param vectorsSize the vector size
     */
    void applyAlgorithm(int k, DistanceFunction distanceFunction,
            List<DoubleArray> vectors, double minValue, double maxValue,
            int vectorsSize) {
        clusters = new ArrayList<ClusterWithMean>();

        // A bisection always yields two clusters, so fewer than two clusters cannot
        // be produced by bisecting. In that case the plain K-Means step is used.
        // NOTE(review): relies on the inherited applyKMeans accepting k < 2 - confirm.
        if (k < 2) {
            clusters.addAll(applyKMeans(k, distanceFunction, vectors, minValue, maxValue, vectorsSize));
            return;
        }

        // "iter" keeps its default of -1 when the inherited runAlgorithm is called
        // directly; at least one attempt is needed to obtain a split at all.
        int attempts = Math.max(1, iter);

        List<DoubleArray> currentVectors = vectors;
        while (true) {
            // bisect the current vectors and keep the split having the lowest SSE
            List<ClusterWithMean> bestSplit = findBestBisection(distanceFunction,
                    currentVectors, minValue, maxValue, vectorsSize, attempts);

            // add the best split to the list of all clusters until now
            clusters.addAll(bestSplit);

            // if we have enough clusters, we stop
            if (clusters.size() >= k) {
                break;
            }

            // If the split did not produce two clusters, the number of clusters
            // cannot grow anymore and continuing would loop forever.
            if (bestSplit.size() < 2) {
                break;
            }

            // otherwise, we choose the next cluster to be bisected: the largest one
            int biggestClusterIndex = indexOfLargestCluster();
            currentVectors = clusters.get(biggestClusterIndex).getVectors();

            // remove the cluster from the list of clusters because we will split it
            clusters.remove(biggestClusterIndex);
        }
    }

    /**
     * Apply K-Means with K = 2 several times on the same vectors and return
     * the partition having the lowest SSE (Sum of Squared Errors).
     *
     * @param distanceFunction a distance function
     * @param vectorsToSplit the vectors to be bisected
     * @param minValue the min value
     * @param maxValue the max value
     * @param vectorsSize the vector size
     * @param attempts the number of times K-Means is applied (at least 1)
     * @return the clusters of the best attempt
     */
    private List<ClusterWithMean> findBestBisection(DistanceFunction distanceFunction,
            List<DoubleArray> vectorsToSplit, double minValue, double maxValue,
            int vectorsSize, int attempts) {
        List<ClusterWithMean> bestClustersUntilNow = null;
        double smallestSSE = Double.MAX_VALUE;
        for (int i = 0; i < attempts; i++) {
            List<ClusterWithMean> newClusters = applyKMeans(2, distanceFunction,
                    vectorsToSplit, minValue, maxValue, vectorsSize);
            double sse = getSSE(newClusters);
            // The first result is always accepted so that a split is returned even
            // if its SSE is not smaller than the initial value (e.g. NaN or infinite).
            if (bestClustersUntilNow == null || sse < smallestSSE) {
                bestClustersUntilNow = newClusters;
                smallestSSE = sse;
            }
        }
        return bestClustersUntilNow;
    }

    /**
     * Find the position in {@code clusters} of the cluster containing the most vectors.
     * If several clusters have the largest size, the first one is chosen.
     *
     * @return the index of the largest cluster, or -1 if there is no cluster
     */
    private int indexOfLargestCluster() {
        int biggestClusterSize = -1;
        int biggestClusterIndex = -1;
        for (int i = 0; i < clusters.size(); i++) {
            int size = clusters.get(i).getVectors().size();
            // if the biggest cluster until now, we remember it
            if (size > biggestClusterSize) {
                biggestClusterIndex = i;
                biggestClusterSize = size;
            }
        }
        return biggestClusterIndex;
    }

    /**
     * Print statistics of the latest execution to System.out.
     */
    public void printStatistics() {
        System.out.println("========== BISECTING KMEANS - STATS ============");
        System.out.println(" Distance function: " + distanceFunction.getName());
        System.out.println(" Total time ~: " + (endTimestamp - startTimestamp)
                + " ms");
        System.out.println(" SSE (Sum of Squared Errors) (lower is better) : " + getSSE(clusters));
        System.out.println(" Max memory:" + MemoryLogger.getInstance().getMaxMemory() + " mb ");
        System.out.println("=====================================");
    }
}