/*
 * Copyright (c) 2004, the JUNG Project and the Regents of the University
 * of California
 * All rights reserved.
 *
 * This software is open-source under the BSD license; see either
 * "license.txt" or
 * http://jung.sourceforge.net/license.txt for a description.
 *
 * Created on Aug 12, 2004
 */
package edu.uci.ics.jung.algorithms.cluster;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import edu.uci.ics.jung.algorithms.scoring.VoltageScorer;
import edu.uci.ics.jung.algorithms.util.DiscreteDistribution;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer.NotEnoughClustersException;
import edu.uci.ics.jung.graph.Graph;

/**
 * <p>
 * Clusters vertices of a <code>Graph</code> based on their ranks as calculated
 * by <code>VoltageScorer</code>. This algorithm is based on, but not identical
 * with, the method described in the paper below. The primary difference is that
 * Wu and Huberman assume a priori that the clusters are of approximately the
 * same size, and therefore use a more complex method than k-means (which is
 * used here) for determining cluster membership based on co-occurrence data.
 * </p>
 *
 * <p>
 * The algorithm proceeds as follows:
 * <ul>
 * <li/>first, generate a set of candidate clusters as follows:
 * <ul>
 * <li/>pick (widely separated) vertex pair, run VoltageScorer
 * <li/>group the vertices in two clusters according to their voltages
 * <li/>store resulting candidate clusters
 * </ul>
 * <li/>second, generate k-1 clusters as follows:
 * <ul>
 * <li/>pick a vertex v as a cluster 'seed' <br>
 * (Wu/Huberman: most frequent vertex in candidate clusters)
 * <li/>calculate co-occurrence over all candidate clusters of v with each other
 * vertex
 * <li/>separate co-occurrence counts into high/low; high vertices constitute a
 * cluster
 * <li/>remove v's vertices from candidate clusters; continue
 * </ul>
 * <li/>finally, remaining unassigned vertices are assigned to the kth
 * ("garbage") cluster.
 * </ul>
 * </p>
 *
 * <p>
 * <b>NOTE</b>: Depending on how the co-occurrence data splits the data into
 * clusters, the number of clusters returned by this algorithm may be less than
 * the number of clusters requested. The number of clusters will never be more
 * than the number requested, however.
 * </p>
 *
 * @author Joshua O'Madadhain
 * @see "'Finding communities in linear time: a physics approach', Fang Wu and Bernardo Huberman, http://www.hpl.hp.com/research/idl/papers/linear/"
 * @see VoltageScorer
 * @see KMeansClusterer
 */
public class VoltageClusterer<V, E> {
    /** The number of candidate clusters generated during phase 1. */
    protected int num_candidates;

    /** k-means clusterer used for both voltage splits and co-occurrence splits. */
    protected KMeansClusterer<V> kmc;

    /** Source of randomness for source/target vertex-pair selection. */
    protected Random rand;

    /** The graph whose vertices are to be clustered. */
    protected Graph<V, E> g;

    /**
     * Creates an instance of a VoltageCluster with the specified parameters.
     * These are mostly parameters that are passed directly to VoltageScorer and
     * KMeansClusterer.
     *
     * @param g
     *            the graph whose vertices are to be clustered; must be non-null
     * @param num_candidates
     *            the number of candidate clusters to create
     * @throws IllegalArgumentException
     *             if <code>g</code> is null or <code>num_candidates</code> &lt; 1
     */
    public VoltageClusterer(Graph<V, E> g, int num_candidates) {
        if (g == null) {
            throw new IllegalArgumentException("g must be non-null");
        }
        if (num_candidates < 1) {
            throw new IllegalArgumentException("must generate >=1 candidates");
        }

        this.num_candidates = num_candidates;
        this.kmc = new KMeansClusterer<V>();
        this.rand = new Random();
        this.g = g;
    }

    /**
     * Sets the random seed, making subsequent runs reproducible.
     *
     * @param random_seed
     *            seed for the internal random number generator
     */
    protected void setRandomSeed(int random_seed) {
        rand = new Random(random_seed);
    }

    /**
     * Returns a community (cluster) centered around <code>v</code>.
     *
     * @param v
     *            the vertex whose community we wish to discover
     */
    public Collection<Set<V>> getCommunity(V v) {
        return cluster_internal(v, 2);
    }

    /**
     * Clusters the vertices of <code>g</code> into <code>num_clusters</code>
     * clusters, based on their connectivity.
     *
     * @param num_clusters
     *            the number of clusters to identify
     */
    public Collection<Set<V>> cluster(int num_clusters) {
        return cluster_internal(null, num_clusters);
    }

    /**
     * Does the work of <code>getCommunity</code> and <code>cluster</code>.
     *
     * @param origin
     *            the vertex around which clustering is to be done, or null to
     *            cluster the whole graph
     * @param num_clusters
     *            the (maximum) number of clusters to find
     */
    protected Collection<Set<V>> cluster_internal(V origin, int num_clusters) {
        ArrayList<V> v_array = new ArrayList<V>(g.getVertices());

        // A graph with fewer than 2 vertices cannot supply a distinct
        // source/target pair; handle it up front rather than spinning forever
        // in the target-selection loop below.
        if (v_array.size() < 2) {
            Collection<Set<V>> trivial = new LinkedList<Set<V>>();
            if (!v_array.isEmpty()) {
                trivial.add(new HashSet<V>(v_array));
            }
            return trivial;
        }

        // Phase 1: generate candidate clusters. num_candidates times:
        // * pick a (widely separated) vertex pair, run VoltageScorer
        // * use k-means to identify communities in the ranked graph
        // * store the resulting candidate communities
        LinkedList<Set<V>> candidates = new LinkedList<Set<V>>();

        for (int j = 0; j < num_candidates; j++) {
            V source = (origin == null)
                    ? v_array.get(rand.nextInt(v_array.size()))
                    : origin;
            V target;
            do {
                target = v_array.get(rand.nextInt(v_array.size()));
            } while (source == target);

            VoltageScorer<V, E> vs = new VoltageScorer<V, E>(g, source, target);
            vs.evaluate();
            Map<V, double[]> voltage_ranks = new HashMap<V, double[]>();
            for (V v : g.getVertices()) {
                voltage_ranks.put(v, new double[] { vs.getVertexScore(v) });
            }

            // addOneCandidateCluster(candidates, voltage_ranks);
            addTwoCandidateClusters(candidates, voltage_ranks);
        }

        // Phase 2: repeat up to (num_clusters - 1) times:
        // * pick a vertex v as a cluster seed
        //   (Wu/Huberman: most frequent vertex in candidates)
        // * calculate co-occurrence (in candidate clusters) of this vertex
        //   with all others
        // * use k-means to separate co-occurrence counts into high/low;
        //   high vertices are a cluster
        // * remove v's vertices from candidate clusters
        Collection<Set<V>> clusters = new LinkedList<Set<V>>();
        Set<V> remaining = new HashSet<V>(g.getVertices());

        List<V> seed_candidates = getSeedCandidates(candidates);
        int seed_index = 0;

        for (int j = 0; j < (num_clusters - 1); j++) {
            if (remaining.isEmpty()) {
                break;
            }

            V seed = null;
            if (seed_index == 0 && origin != null) {
                seed = origin;
            } else {
                // skip seeds that have already been assigned to a cluster;
                // stop entirely if every seed candidate has been consumed
                while (seed_index < seed_candidates.size()) {
                    V s = seed_candidates.get(seed_index++);
                    if (remaining.contains(s)) {
                        seed = s;
                        break;
                    }
                }
                if (seed == null) {
                    break;
                }
            }

            Map<V, double[]> occur_counts = getObjectCounts(candidates, seed);
            if (occur_counts.size() < 2) {
                break;
            }

            // now that we have the counts, cluster them...
            try {
                Collection<Map<V, double[]>> high_low = kmc.cluster(
                        occur_counts, 2);

                // ...get the cluster with the highest-valued centroid...
                Iterator<Map<V, double[]>> h_iter = high_low.iterator();
                Map<V, double[]> cluster1 = h_iter.next();
                Map<V, double[]> cluster2 = h_iter.next();
                double[] centroid1 = DiscreteDistribution.mean(cluster1.values());
                double[] centroid2 = DiscreteDistribution.mean(cluster2.values());
                Set<V> new_cluster = (centroid1[0] >= centroid2[0])
                        ? cluster1.keySet()
                        : cluster2.keySet();

                // ...remove the elements of new_cluster from each candidate...
                for (Set<V> cluster : candidates) {
                    cluster.removeAll(new_cluster);
                }
                clusters.add(new_cluster);
                remaining.removeAll(new_cluster);
            } catch (NotEnoughClustersException nece) {
                // all remaining vertices are in the same cluster
                break;
            }
        }

        // identify remaining vertices (if any) as a 'garbage' cluster
        if (!remaining.isEmpty()) {
            clusters.add(remaining);
        }

        return clusters;
    }

    /**
     * Do k-means with three intervals and pick the smaller two clusters
     * (presumed to be on the ends); this is closer to the Wu-Huberman method.
     * If k-means cannot produce three clusters, no candidates are added.
     *
     * @param candidates
     *            the list to which new candidate clusters are appended
     * @param voltage_ranks
     *            map from vertex to its (single-element) voltage score
     */
    protected void addTwoCandidateClusters(LinkedList<Set<V>> candidates,
            Map<V, double[]> voltage_ranks) {
        try {
            List<Map<V, double[]>> clusters = new ArrayList<Map<V, double[]>>(
                    kmc.cluster(voltage_ranks, 3));
            boolean b01 = clusters.get(0).size() > clusters.get(1).size();
            boolean b02 = clusters.get(0).size() > clusters.get(2).size();
            boolean b12 = clusters.get(1).size() > clusters.get(2).size();
            // add the two clusters other than the (strictly) largest one;
            // on a size tie among the largest, no branch fires (original behavior)
            if (b01 && b02) {
                candidates.add(clusters.get(1).keySet());
                candidates.add(clusters.get(2).keySet());
            } else if (!b01 && b12) {
                candidates.add(clusters.get(0).keySet());
                candidates.add(clusters.get(2).keySet());
            } else if (!b02 && !b12) {
                candidates.add(clusters.get(0).keySet());
                candidates.add(clusters.get(1).keySet());
            }
        } catch (NotEnoughClustersException e) {
            // no valid candidates, continue
        }
    }

    /**
     * Alternative to addTwoCandidateClusters(): cluster vertices by voltages
     * into 2 clusters. We only consider the smaller of the two clusters
     * returned by k-means to be a 'true' cluster candidate; the other is a
     * garbage cluster.
     *
     * @param candidates
     *            the list to which the new candidate cluster is appended
     * @param voltage_ranks
     *            map from vertex to its (single-element) voltage score
     */
    protected void addOneCandidateCluster(LinkedList<Set<V>> candidates,
            Map<V, double[]> voltage_ranks) {
        try {
            List<Map<V, double[]>> clusters = new ArrayList<Map<V, double[]>>(
                    kmc.cluster(voltage_ranks, 2));
            if (clusters.get(0).size() < clusters.get(1).size()) {
                candidates.add(clusters.get(0).keySet());
            } else {
                candidates.add(clusters.get(1).keySet());
            }
        } catch (NotEnoughClustersException e) {
            // no valid candidates, continue
        }
    }

    /**
     * Returns an array of cluster seeds, ranked in decreasing order of number
     * of appearances in the specified collection of candidate clusters.
     *
     * @param candidates
     *            the candidate clusters whose members are to be ranked
     */
    protected List<V> getSeedCandidates(Collection<Set<V>> candidates) {
        final Map<V, double[]> occur_counts = getObjectCounts(candidates, null);

        List<V> occurrences = new ArrayList<V>(occur_counts.keySet());
        Collections.sort(occurrences, new MapValueArrayComparator(occur_counts));

        return occurrences;
    }

    /**
     * Counts, for each vertex of <code>g</code>, the number of candidate
     * clusters it appears in. If <code>seed</code> is non-null, only candidate
     * clusters containing <code>seed</code> are counted (i.e. the result is the
     * co-occurrence count of each vertex with the seed).
     *
     * @param candidates
     *            the candidate clusters to tally
     * @param seed
     *            the vertex to measure co-occurrence against, or null to count
     *            all candidate clusters
     * @return map from vertex to a single-element array holding its count
     */
    protected Map<V, double[]> getObjectCounts(Collection<Set<V>> candidates,
            V seed) {
        Map<V, double[]> occur_counts = new HashMap<V, double[]>();
        for (V v : g.getVertices()) {
            occur_counts.put(v, new double[] { 0 });
        }

        for (Set<V> candidate : candidates) {
            if (seed == null || candidate.contains(seed)) {
                for (V element : candidate) {
                    // count arrays are mutated in place
                    occur_counts.get(element)[0]++;
                }
            }
        }

        return occur_counts;
    }

    /**
     * Orders vertices in decreasing order of the (single-element) count array
     * associated with them in the supplied map.
     */
    protected class MapValueArrayComparator implements Comparator<V> {
        private Map<V, double[]> map;

        protected MapValueArrayComparator(Map<V, double[]> map) {
            this.map = map;
        }

        @Override
        public int compare(V o1, V o2) {
            // arguments swapped to produce descending order
            return Double.compare(map.get(o2)[0], map.get(o1)[0]);
        }
    }
}