package ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.kmeans_for_fournier08;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.AlgoFournierViger08;
import ca.pfv.spmf.algorithms.sequentialpatterns.fournier2008_seqdim.ItemValued;
/**
* This algorithm is a modified K-Means algorithm to be used with the Fournier-Viger-2008 algorithm. It
* adds the constraint of a minimum number of items per cluster (minimum support).
* <br/><br/>
* This implementation should only be used with the Fournier-Viger 2008 algorithm, as it is specially designed
* for it (e.g. it clusters valued items and it keeps the
* min, max and avg values of each cluster). For other purposes, one should use the general KMeans implementation
* in the package "clustering". This latter implementation is more general (it uses vectors of doubles instead
* of items) and is more optimized.
* <br/><br/>
* This algorithm works as follows: <br/>
* We specify a maximum K.<br/>
* The algorithm executes K-Means from K=1 to K=Kmax and tries to find
* the largest number of clusters such that each cluster has a size that is at least
* the minimum support (as an integer).<br/>
* The algorithm returns this set of clusters.<br/>
* The algorithm stops once every K from 1 to Kmax has been tried. (This implementation
* does not stop early when the number of clusters fails to increase for two successive values of K.)
*
* @see AlgoFournierViger08
* @see ItemValued
* @author Philippe Fournier-Viger
*/
public class AlgoKMeansWithSupport{

    /** The maximum number of clusters to be found (upper bound for k). */
    private final int maxK;

    /**
     * The minimum support threshold as an integer value (always at least 1).
     * It is the minimum number of distinct sequence IDs that the items of a
     * cluster must cover for the cluster to be kept.
     */
    private final int minsuppRelative;

    /** The number of times that K-Means is executed for each value of k. */
    private final int numberOfTriesForEachK;

    /** The regular K-Means implementation that is executed for each k. */
    private final AlgoKMeans_forFournier08 algoKMeans;

    /**
     * Constructor
     * @param maxK the maximum number of clusters to be found
     * @param relativeMinsup the minimum support threshold as an integer value;
     *        a value of 0 or less is replaced by 1
     * @param algoKMeans an implementation of the regular K-Means
     * @param numberOfTriesForEachK the number of times that K-Means should
     *        be executed for each value of k.
     */
    public AlgoKMeansWithSupport(int maxK, int relativeMinsup, AlgoKMeans_forFournier08 algoKMeans, int numberOfTriesForEachK){
        this.maxK = maxK;
        // If the minimum support is 0 (or less), we use 1 instead
        // so that no empty cluster is found.
        this.minsuppRelative = Math.max(1, relativeMinsup);
        this.algoKMeans = algoKMeans;
        this.numberOfTriesForEachK = numberOfTriesForEachK;
    }

    /**
     * Constructor
     * @param maxK the maximum number of clusters to be found
     * @param minsup minimum support threshold as a percentage (double)
     * @param transactioncount the database size; it is multiplied by minsup
     *        (and rounded up) to obtain the integer minimum support
     * @param algoKMeans an implementation of the regular K-Means
     * @param numberOfTriesForEachK the number of times that K-Means should
     *        be executed for each value of k.
     */
    public AlgoKMeansWithSupport(int maxK, double minsup, int transactioncount, AlgoKMeans_forFournier08 algoKMeans, int numberOfTriesForEachK){
        // Convert the percentage to an integer threshold by multiplying by the
        // database size, then reuse the other constructor (which also replaces
        // a threshold of 0 or less by 1).
        this(maxK, (int) Math.ceil(minsup * transactioncount), algoKMeans, numberOfTriesForEachK);
    }

    /**
     * Run the algorithm. K-Means is executed numberOfTriesForEachK times for
     * each k from 1 to the smaller of maxK and the number of items. After each
     * execution the clusters that are not frequent are removed, and the
     * execution that leaves the most clusters is kept (the earliest one wins
     * in case of a tie).
     * <br/><br/>
     * Note that this method changes the k of the K-Means implementation that
     * was given to the constructor, and sets the cluster of the items that
     * belong to the clusters that are returned.
     *
     * @param items the values to be clustered
     * @return the list of clusters found. If K-Means was never executed
     *         (e.g. there is no item), an empty list is returned instead of null.
     */
    public List<Cluster> runAlgorithm(List<ItemValued> items){
        // If the maximum number of clusters is larger than the number of
        // items, we use the number of items instead. This is kept in a local
        // variable so that a call with few items does not lower the maximum
        // for the following calls made on the same object.
        int largestK = Math.min(maxK, items.size());

        // The number of clusters in the best execution so far
        // (-1 so that the first execution is always kept)
        int nbClustersFound = -1;
        // The clusters of the best execution so far
        List<Cluster> clustersFound = null;

        // For each k
        for(int k = 1; k <= largestK; k++){
            // we try numberOfTriesForEachK times.
            for(int j = 0; j < numberOfTriesForEachK; j++){
                // We execute K-Means with k
                algoKMeans.setK(k);
                List<Cluster> clusters = algoKMeans.runAlgorithm(items);

                // We remove the clusters that are not frequent, so that
                // the size of the list is the number of frequent clusters.
                removeInfrequentClusters(clusters);

                // If more frequent clusters were found than by the other
                // executions of K-Means, we keep the clusters from this one.
                if(clusters.size() > nbClustersFound){
                    nbClustersFound = clusters.size();
                    clustersFound = clusters;
                }
            }
        }

        // K-Means was never executed (no items, maxK <= 0 or
        // numberOfTriesForEachK <= 0): there is no cluster to return.
        if(clustersFound == null){
            return new ArrayList<Cluster>();
        }

        // We associate the items to their respective clusters because we called
        // K-Means many times with different k and it is possible that items
        // are not associated to the set of clusters that was kept.
        // NOTE(review): this assumes that the items of the clusters are the
        // objects of the list "items" (K-Means was executed on that list) --
        // to be confirmed against AlgoKMeans_forFournier08.
        for(Cluster cluster : clustersFound){
            for(ItemValued item : cluster.getItems()){
                item.setCluster(cluster);
            }
        }

        // We return the list of clusters found.
        return clustersFound;
    }

    /**
     * Remove from a list of clusters all the clusters that are not frequent.
     * The order of the remaining clusters is preserved.
     * @param clusters the list of clusters (it is modified by this method)
     */
    private void removeInfrequentClusters(List<Cluster> clusters){
        // The index is only increased when the current cluster is kept,
        // because a removal shifts the next cluster to the current index.
        for(int i = 0; i < clusters.size();){
            if(isAFrequentCluster(clusters.get(i))){
                i++;
            }else{
                clusters.remove(i);
            }
        }
    }

    /**
     * Check if the support of a cluster is at least minsupp.
     * To do this, we should not count two times the items that have
     * the same SequenceID.
     * @param cluster the cluster to be checked
     * @return true if the items of the cluster have at least minsupp
     *         distinct sequence IDs, otherwise false
     */
    private boolean isAFrequentCluster(Cluster cluster) {
        // Create a set to store the sequence IDs where
        // each item appears (a set ignores the duplicates)
        Set<Integer> sequenceIds = new HashSet<Integer>();
        // for each item
        for(ItemValued item : cluster.getItems()){
            // store the sequence ID in the set
            sequenceIds.add(item.getSequenceID());
        }
        // check if the number of distinct sequence IDs is >= minsup
        return sequenceIds.size() >= minsuppRelative;
    }
}