/*
* Copyright (c) 2004, the JUNG Project and the Regents of the University
* of California
* All rights reserved.
*
* This software is open-source under the BSD license; see either
* "license.txt" or
* http://jung.sourceforge.net/license.txt for a description.
*
* Created on Aug 12, 2004
*/
package edu.uci.ics.jung.algorithms.cluster;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import edu.uci.ics.jung.algorithms.scoring.VoltageScorer;
import edu.uci.ics.jung.algorithms.util.DiscreteDistribution;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer.NotEnoughClustersException;
import edu.uci.ics.jung.graph.Graph;
/**
* <p>
* Clusters vertices of a <code>Graph</code> based on their ranks as calculated
* by <code>VoltageScorer</code>. This algorithm is based on, but not identical
* with, the method described in the paper below. The primary difference is that
* Wu and Huberman assume a priori that the clusters are of approximately the
* same size, and therefore use a more complex method than k-means (which is
* used here) for determining cluster membership based on co-occurrence data.
* </p>
*
* <p>
* The algorithm proceeds as follows:
 * <ul>
 * <li>first, generate a set of candidate clusters as follows:
 * <ul>
 * <li>pick (widely separated) vertex pair, run VoltageScorer</li>
 * <li>group the vertices in two clusters according to their voltages</li>
 * <li>store resulting candidate clusters</li>
 * </ul></li>
 * <li>second, generate k-1 clusters as follows:
 * <ul>
 * <li>pick a vertex v as a cluster 'seed' <br>
 * (Wu/Huberman: most frequent vertex in candidate clusters)</li>
 * <li>calculate co-occurrence over all candidate clusters of v with each other
 * vertex</li>
 * <li>separate co-occurrence counts into high/low; high vertices constitute a
 * cluster</li>
 * <li>remove v's vertices from candidate clusters; continue</li>
 * </ul></li>
 * <li>finally, remaining unassigned vertices are assigned to the kth
 * ("garbage") cluster.</li>
 * </ul>
* </p>
*
* <p>
* <b>NOTE</b>: Depending on how the co-occurrence data splits the data into
* clusters, the number of clusters returned by this algorithm may be less than
* the number of clusters requested. The number of clusters will never be more
* than the number requested, however.
* </p>
*
* @author Joshua O'Madadhain
* @see "'Finding communities in linear time: a physics approach', Fang Wu and Bernardo Huberman, http://www.hpl.hp.com/research/idl/papers/linear/"
* @see VoltageScorer
* @see KMeansClusterer
*/
public class VoltageClusterer<V, E> {
    /** The number of candidate clusters to generate per clustering run. */
    protected int num_candidates;
    /** k-means clusterer used for both the voltage and co-occurrence splits. */
    protected KMeansClusterer<V> kmc;
    /** Source of randomness for source/target vertex-pair selection. */
    protected Random rand;
    /** The graph whose vertices are to be clustered. */
    protected Graph<V, E> g;

    /**
     * Creates an instance of a VoltageClusterer with the specified parameters.
     * These are mostly parameters that are passed directly to VoltageScorer
     * and KMeansClusterer.
     *
     * @param g
     *            the graph whose vertices are to be clustered
     * @param num_candidates
     *            the number of candidate clusters to create
     * @throws IllegalArgumentException
     *             if <code>num_candidates</code> is less than 1
     */
    public VoltageClusterer(Graph<V, E> g, int num_candidates) {
        if (num_candidates < 1) {
            throw new IllegalArgumentException("must generate >=1 candidates");
        }
        this.num_candidates = num_candidates;
        this.kmc = new KMeansClusterer<V>();
        this.rand = new Random();
        this.g = g;
    }

    /**
     * Seeds the internal random number generator, making subsequent
     * clustering runs reproducible.
     *
     * @param random_seed
     *            the seed for the random number generator
     */
    protected void setRandomSeed(int random_seed) {
        rand = new Random(random_seed);
    }

    /**
     * Returns a community (cluster) centered around <code>v</code>.
     *
     * @param v
     *            the vertex whose community we wish to discover
     * @return a collection containing v's community and (if any vertices
     *         remain unassigned) a second "garbage" cluster
     */
    public Collection<Set<V>> getCommunity(V v) {
        return cluster_internal(v, 2);
    }

    /**
     * Clusters the vertices of <code>g</code> into <code>num_clusters</code>
     * clusters, based on their connectivity.
     *
     * @param num_clusters
     *            the number of clusters to identify
     * @return the identified clusters; may contain fewer than
     *         <code>num_clusters</code> elements (see class documentation)
     */
    public Collection<Set<V>> cluster(int num_clusters) {
        return cluster_internal(null, num_clusters);
    }

    /**
     * Does the work of <code>getCommunity</code> and <code>cluster</code>.
     *
     * @param origin
     *            the vertex around which clustering is to be done, or
     *            <code>null</code> for unconstrained clustering
     * @param num_clusters
     *            the (maximum) number of clusters to find
     * @return the identified clusters
     * @throws IllegalArgumentException
     *             if the graph contains fewer than two vertices
     */
    protected Collection<Set<V>> cluster_internal(V origin, int num_clusters) {
        // generate candidate clusters
        // repeat the following 'num_candidates' times:
        // * pick (widely separated) vertex pair, run VoltageScorer
        // * use k-means to identify 2 communities in ranked graph
        // * store resulting candidate communities
        ArrayList<V> v_array = new ArrayList<V>(g.getVertices());
        // A source/target pair needs two distinct vertices; with fewer, the
        // target-selection loop below would never terminate.
        if (v_array.size() < 2) {
            throw new IllegalArgumentException(
                    "graph must contain at least two vertices");
        }
        LinkedList<Set<V>> candidates = new LinkedList<Set<V>>();
        for (int j = 0; j < num_candidates; j++) {
            V source = (origin != null) ? origin
                    : v_array.get(rand.nextInt(v_array.size()));
            V target;
            do {
                target = v_array.get(rand.nextInt(v_array.size()));
            } while (source == target);
            VoltageScorer<V, E> vs = new VoltageScorer<V, E>(g, source, target);
            vs.evaluate();
            // wrap each vertex's voltage in a 1-element array, as required
            // by the KMeansClusterer input format
            Map<V, double[]> voltage_ranks = new HashMap<V, double[]>();
            for (V v : g.getVertices()) {
                voltage_ranks.put(v, new double[] { vs.getVertexScore(v) });
            }
            addTwoCandidateClusters(candidates, voltage_ranks);
        }
        // repeat the following k-1 times:
        // * pick a vertex v as a cluster seed
        // (Wu/Huberman: most frequent vertex in candidates)
        // * calculate co-occurrence (in candidate clusters)
        // of this vertex with all others
        // * use k-means to separate co-occurrence counts into high/low;
        // high vertices are a cluster
        // * remove v's vertices from candidate clusters
        Collection<Set<V>> clusters = new LinkedList<Set<V>>();
        Set<V> remaining = new HashSet<V>(g.getVertices());
        List<V> seed_candidates = getSeedCandidates(candidates);
        int seed_index = 0;
        for (int j = 0; j < (num_clusters - 1); j++) {
            if (remaining.isEmpty()) {
                break; // every vertex has been assigned to a cluster
            }
            V seed;
            // Use the origin as the seed only on the first pass. (Testing
            // seed_index == 0 here, as earlier revisions did, re-used the
            // already-clustered origin on every pass, because the origin
            // branch never advances seed_index.)
            if (j == 0 && origin != null) {
                seed = origin;
            } else {
                // advance to the most frequent candidate vertex that has not
                // yet been assigned to a cluster
                do {
                    seed = seed_candidates.get(seed_index++);
                } while (!remaining.contains(seed));
            }
            Map<V, double[]> occur_counts = getObjectCounts(candidates, seed);
            if (occur_counts.size() < 2) {
                break; // too few vertices left to split into high/low
            }
            // now that we have the counts, cluster them...
            try {
                Collection<Map<V, double[]>> high_low =
                        kmc.cluster(occur_counts, 2);
                // ...get the cluster with the highest-valued centroid...
                Iterator<Map<V, double[]>> h_iter = high_low.iterator();
                Map<V, double[]> cluster1 = h_iter.next();
                Map<V, double[]> cluster2 = h_iter.next();
                double[] centroid1 =
                        DiscreteDistribution.mean(cluster1.values());
                double[] centroid2 =
                        DiscreteDistribution.mean(cluster2.values());
                Set<V> new_cluster = (centroid1[0] >= centroid2[0])
                        ? cluster1.keySet()
                        : cluster2.keySet();
                // ...remove the elements of new_cluster from each candidate...
                for (Set<V> cluster : candidates) {
                    cluster.removeAll(new_cluster);
                }
                clusters.add(new_cluster);
                remaining.removeAll(new_cluster);
            } catch (NotEnoughClustersException nece) {
                // all remaining vertices are in the same cluster
                break;
            }
        }
        // identify remaining vertices (if any) as a 'garbage' cluster
        if (!remaining.isEmpty()) {
            clusters.add(remaining);
        }
        return clusters;
    }

    /**
     * Do k-means with three intervals and pick the smaller two clusters
     * (presumed to be on the ends); this is closer to the Wu-Huberman method.
     * If k-means cannot find three clusters, no candidates are added.
     *
     * @param candidates
     *            the list to which the two candidate clusters are appended
     * @param voltage_ranks
     *            each vertex's voltage, as a 1-element array
     */
    protected void addTwoCandidateClusters(LinkedList<Set<V>> candidates,
            Map<V, double[]> voltage_ranks) {
        try {
            List<Map<V, double[]>> clusters = new ArrayList<Map<V, double[]>>(
                    kmc.cluster(voltage_ranks, 3));
            // determine the largest of the three clusters by pairwise
            // size comparison, and add the other two as candidates
            boolean b01 = clusters.get(0).size() > clusters.get(1).size();
            boolean b02 = clusters.get(0).size() > clusters.get(2).size();
            boolean b12 = clusters.get(1).size() > clusters.get(2).size();
            if (b01 && b02) {
                candidates.add(clusters.get(1).keySet());
                candidates.add(clusters.get(2).keySet());
            } else if (!b01 && b12) {
                candidates.add(clusters.get(0).keySet());
                candidates.add(clusters.get(2).keySet());
            } else if (!b02 && !b12) {
                candidates.add(clusters.get(0).keySet());
                candidates.add(clusters.get(1).keySet());
            }
        } catch (NotEnoughClustersException e) {
            // no valid candidates, continue
        }
    }

    /**
     * Alternative to addTwoCandidateClusters(): cluster vertices by voltages
     * into 2 clusters. We only consider the smaller of the two clusters
     * returned by k-means to be a 'true' cluster candidate; the other is a
     * garbage cluster. If k-means cannot find two clusters, no candidate
     * is added.
     *
     * @param candidates
     *            the list to which the candidate cluster is appended
     * @param voltage_ranks
     *            each vertex's voltage, as a 1-element array
     */
    protected void addOneCandidateCluster(LinkedList<Set<V>> candidates,
            Map<V, double[]> voltage_ranks) {
        try {
            List<Map<V, double[]>> clusters;
            clusters = new ArrayList<Map<V, double[]>>(
                    kmc.cluster(voltage_ranks, 2));
            if (clusters.get(0).size() < clusters.get(1).size()) {
                candidates.add(clusters.get(0).keySet());
            } else {
                candidates.add(clusters.get(1).keySet());
            }
        } catch (NotEnoughClustersException e) {
            // no valid candidates, continue
        }
    }

    /**
     * Returns a list of cluster seeds, ranked in decreasing order of number
     * of appearances in the specified collection of candidate clusters.
     *
     * @param candidates
     *            the candidate clusters to derive seed rankings from
     * @return all vertices of the graph, most frequently occurring first
     */
    protected List<V> getSeedCandidates(Collection<Set<V>> candidates) {
        final Map<V, double[]> occur_counts = getObjectCounts(candidates, null);
        ArrayList<V> occurrences = new ArrayList<V>(occur_counts.keySet());
        Collections.sort(occurrences,
                new MapValueArrayComparator(occur_counts));
        return occurrences;
    }

    /**
     * Counts, for each vertex of the graph, how many candidate clusters it
     * co-occurs in with <code>seed</code>.
     *
     * @param candidates
     *            the candidate clusters to count occurrences in
     * @param seed
     *            the vertex whose co-occurrences are counted; if
     *            <code>null</code>, every candidate cluster is counted
     * @return a map from each vertex to its count, as a 1-element array
     *         (the format expected by KMeansClusterer)
     */
    protected Map<V, double[]> getObjectCounts(Collection<Set<V>> candidates,
            V seed) {
        Map<V, double[]> occur_counts = new HashMap<V, double[]>();
        for (V v : g.getVertices()) {
            occur_counts.put(v, new double[] { 0 });
        }
        for (Set<V> candidate : candidates) {
            if (seed == null || candidate.contains(seed)) {
                for (V element : candidate) {
                    // increment in place; the array is shared with the map
                    occur_counts.get(element)[0]++;
                }
            }
        }
        return occur_counts;
    }

    /**
     * Orders vertices by decreasing value of their 1-element count arrays
     * in the supplied map.
     */
    protected class MapValueArrayComparator implements Comparator<V> {
        private final Map<V, double[]> map;

        protected MapValueArrayComparator(Map<V, double[]> map) {
            this.map = map;
        }

        @Override
        public int compare(V o1, V o2) {
            // reversed argument order yields descending sort
            return Double.compare(map.get(o2)[0], map.get(o1)[0]);
        }
    }
}