package ca.pfv.spmf.algorithms.clustering.hierarchical_clustering; /* This file is copyright (c) 2008-2012 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.List; import ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction; import ca.pfv.spmf.patterns.cluster.ClusterWithMean; import ca.pfv.spmf.patterns.cluster.DoubleArray; import ca.pfv.spmf.tools.MemoryLogger; /** * This is an implementation of generic Hierarchical Clustering Algorithm as described * in this webpage: * http://home.dei.polimi.it/matteucc/Clustering/tutorial_html/hierarchical.html * <br/><br/> * * This is a Hierarchical Clustering with a constant "threshold" that indicate * the maximal distance between two clusters to group them. The algorithm stops * when no cluster can be merged. * <br/><br/> * * The distance between two clusters is calculated as the distance between the * medians of the two clusters. * * @author Philippe Fournier-Viger */ public class AlgoHierarchicalClustering { // parameter private double maxDistance =0; // maximum distance allowed for merging two clusters // list of clusters List<ClusterWithMean> clusters = null; // for statistics private long startTimestamp; // start time of latest execution private long endTimestamp; // end time of latest execution private long iterationCount; // number of iterations performed /* The distance function to be used for clustering */ private DistanceFunction distanceFunction = null; /** * Default constructor */ public AlgoHierarchicalClustering() { } /** * Run the algorithm. * @param inputFile an input file containing vectors of doubles * @param maxDistance the maximum distance allowed for merging two clusters * @param distanceFunction * @return a list of Clusters * @throws IOException exception if error while reading the file */ public List<ClusterWithMean> runAlgorithm(String inputFile, double maxDistance, DistanceFunction distanceFunction) throws NumberFormatException, IOException { // record start time startTimestamp = System.currentTimeMillis(); // save the parameter this.maxDistance = maxDistance; // save the distance function this.distanceFunction = distanceFunction; // create an empty list of clusters clusters = new ArrayList<ClusterWithMean>(); // Read the vectors from the input file // and add each vector to an individual cluster. BufferedReader reader = new BufferedReader(new FileReader(inputFile)); String line; // for each line until the end of file while (((line = reader.readLine()) != null)) { // if the line is a comment, is empty or is a // kind of metadata if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { continue; } // split the line by spaces String[] lineSplited = line.split(" "); // convert the values to double values and put them in // a vector of doubles double [] vector = new double[lineSplited.length]; for (int i=0; i< lineSplited.length; i++) { double value = Double.parseDouble(lineSplited[i]); vector[i] = value; // System.out.println("val"); } // create a DoubleArray object with the vector DoubleArray theVector = new DoubleArray(vector); // Initiallly we create a cluster for each vector ClusterWithMean cluster = new ClusterWithMean(vector.length); cluster.addVector(theVector); cluster.setMean(theVector.clone()); clusters.add(cluster); } reader.close(); // close the input file // (2) Loop to combine the two closest clusters into a bigger cluster // until no clusters can be combined. boolean changed = false; do { // merge the two closest clusters changed = mergeTheClosestCluster(); // record memory usage MemoryLogger.getInstance().checkMemory(); } while (changed); // record end time endTimestamp = System.currentTimeMillis(); // return the clusters return clusters; } /** * Merge the two closest clusters in terms of distance. * @return true if a merge was done, otherwise false. */ private boolean mergeTheClosestCluster() { // These variables will contain the two closest clusters that // can be merged ClusterWithMean clusterToMerge1 = null; ClusterWithMean clusterToMerge2 = null; double minClusterDistance = Integer.MAX_VALUE; // find the two closest clusters with distance > threshold // by comparing all pairs of clusters i and j for (int i = 0; i < clusters.size(); i++) { for (int j = i + 1; j < clusters.size(); j++) { // calculate the distance between i and j double distance = distanceFunction.calculateDistance(clusters.get(i).getmean(), clusters.get(j).getmean()); // if the distance is less than the max distance allowed // and if it is the smallest distance until now if (distance < minClusterDistance && distance <= maxDistance) { // record this pair of clusters minClusterDistance = distance; clusterToMerge1 = clusters.get(i); clusterToMerge2 = clusters.get(j); } } } // if no close clusters were found, return false if (clusterToMerge1 == null) { return false; } // else, merge the two closest clusters for(DoubleArray vector : clusterToMerge2.getVectors()){ clusterToMerge1.addVector(vector); } // after mergint, we need to recompute the mean of the resulting cluster clusterToMerge1.recomputeClusterMean(); // we delete the cluster that was merged clusters.remove(clusterToMerge2); // increase iteration count for statistics iterationCount++; return true; } /** * Save the clusters to an output file * @param output the output file path * @throws IOException exception if there is some writing error. */ public void saveToFile(String output) throws IOException { BufferedWriter writer = new BufferedWriter(new FileWriter(output)); // for each cluster for(int i=0; i< clusters.size(); i++){ // if the cluster is not empty if(clusters.get(i).getVectors().size() >= 1){ // write the cluster writer.write(clusters.get(i).toString()); // if not the last cluster, add a line return if(i < clusters.size()-1){ writer.newLine(); } } } // close the file writer.close(); } private double getSSE() { double sse = 0; for(ClusterWithMean cluster : clusters) { for(DoubleArray vector : cluster.getVectors()) { sse += Math.pow(distanceFunction.calculateDistance(vector, cluster.getmean()), 2); } } return sse; } /** * Print statistics about the latest execution to System.out. */ public void printStatistics() { System.out.println("========== HIERARCHICAL CLUSTERING - STATS ============"); System.out.println(" Distance function: " + distanceFunction.getName()); System.out.println(" Total time ~: " + (endTimestamp - startTimestamp) + " ms"); System.out.println(" SSE (Sum of Squared Errors) (lower is better) : " + getSSE()); System.out.println(" Max memory:" + MemoryLogger.getInstance().getMaxMemory() + " mb "); System.out.println(" Iteration count: " + iterationCount); System.out.println("====================================="); } }