package ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.kmeans_for_fournier08; /* This file is copyright (c) 2008-2013 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.util.ArrayList; import java.util.List; import java.util.Random; import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.AlgoFournierViger08; import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.ItemValued; /** * This is the implementation of the K-means algorithm used by the Fournier-Viger-2008 algorithm * for sequential pattern mining. * <br/><br/> * This implementation should only be used for the Fournier-Viger 2008 as it is specially designed * for it (e.g. it clusters valued items and it keep the * min max and avg values of each clusters). For other purposes, one should use the general KMeans implementation * in the package "clustering". This latter implementation is more general (uses vector of doubles instead * of items) and is more optimized. * <br/><br/> * * @see AlgoFournierViger08 * @see ItemValued * @author Philippe Fournier-Viger */ public class AlgoKMeans_forFournier08 { // the parameter k indicates the number of clusters to be found private int k; // a random number generator because k-means is a randomized algorithm private final static Random random = new Random(System.currentTimeMillis()); /** * Contructor * @param k the parameter k of K-Means, which represents the desired * number of clusters. */ public AlgoKMeans_forFournier08(int k) { this.k = k; } /** * Run the algorithm * @param input a list of items to be clustered * number of clusters. * @return a list of clusters */ public List<Cluster> runAlgorithm(List<ItemValued> input) { // Create a list of clusters List<Cluster> clusters = new ArrayList<Cluster>(k); // If onely 1 item if (input.size() == 1) { // create a cluster with that item ItemValued item = input.get(0); Cluster cluster = new Cluster(item); cluster.addItem(item); clusters.add(cluster); // return that cluster return clusters; } // (1) Randomly generate k empty clusters with a random average (cluster // center) // (1.1) Find the smallest value and largest value double higher = input.get(0).getId(); double lower = input.get(0).getId(); // for each item for (ItemValued item : input) { // if the largest item until now, remember it if (item.getValue() > higher) { higher = item.getValue(); } // if the smallest item until now, remember it if (item.getValue() < lower) { lower = item.getValue(); } } // If all items have the same values, we return only one // cluster. if (higher == lower) { // Create a cluster with all items and return it Cluster cluster = new Cluster(input); clusters.add(cluster); return clusters; } // (1.2) Generate the k empty clusters with a random average // between the smallest and largest values. for (int i = 0; i < k; i++) { // generate random average double average = random.nextInt((int) (higher - lower)) + lower; // create the cluster Cluster cluster = new Cluster(average); clusters.add(cluster); } // (2) Repeat the two next steps until the assignment hasn't changed boolean changed; do { changed = false; // (2.1) Assign each point to the nearest cluster center. // / for each item for (ItemValued item : input) { // find the nearest cluster and the cluster containing the item Cluster nearestCluster = null; Cluster containingCluster = null; double distanceToNearestCluster = Double.MAX_VALUE; // for each cluster for (Cluster cluster : clusters) { // calculate the distance to the current item double distance = averageDistance(cluster, item); // if the smallest distance until now, remember // that cluster if (distance < distanceToNearestCluster) { nearestCluster = cluster; distanceToNearestCluster = distance; } // if the cluster contains that item, // then note that this is the cluster // containing the item. if (cluster.containsItem(item)) { containingCluster = cluster; } } // if the closest cluster to the current item // is not the cluster containing the item if (containingCluster != nearestCluster) { // if the item is in a cluster if (containingCluster != null) { // remove item from the cluster removeItem(containingCluster.getItems(), item); } // add the item to the nearest cluster nearestCluster.addItem(item); changed = true; } } // (2.2) For each cluster, recompute the new cluster average for (Cluster cluster : clusters) { cluster.recomputeClusterAverage(); } } while (changed); // Computer min and max for all clusters for (Cluster cluster : clusters) { cluster.computeHigherAndLower(); } // return the set of clusters return clusters; } /** * Remove an item from a list of items * @param items a list of items * @param item the item to be removed */ private void removeItem(List<ItemValued> items, ItemValued item) { // for each item in the list for (int i = 0; i < items.size(); i++) { // if the item to be removed is found if (items.get(i) == item) { // then, remove it items.remove(i); } } } /** * Calculate the distance between the average of a cluster and * a given item * @param cluster1 the cluster * @param item the item * @return the distance as a double */ private double averageDistance(Cluster cluster1, ItemValued item) { return Math.abs(cluster1.getaverage() - item.getValue()); } /** * Set the parameter k for the k-means algorithm. * @param k an integer. */ public void setK(int k) { this.k = k; } }