/*
 * Copyright (c) 2004, the JUNG Project and the Regents of the University
 * of California
 * All rights reserved.
 *
 * This software is open-source under the BSD license; see either
 * "license.txt" or
 * http://jung.sourceforge.net/license.txt for a description.
 *
 * Created on Aug 12, 2004
 */
package edu.uci.ics.jung.algorithms.cluster;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import edu.uci.ics.jung.algorithms.scoring.VoltageScorer;
import edu.uci.ics.jung.algorithms.util.DiscreteDistribution;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer;
import edu.uci.ics.jung.algorithms.util.KMeansClusterer.NotEnoughClustersException;
import edu.uci.ics.jung.graph.Graph;

/**
 * <p>
 * Clusters vertices of a <code>Graph</code> based on their ranks as calculated
 * by <code>VoltageScorer</code>. This algorithm is based on, but not identical
 * with, the method described in the paper below. The primary difference is that
 * Wu and Huberman assume a priori that the clusters are of approximately the
 * same size, and therefore use a more complex method than k-means (which is
 * used here) for determining cluster membership based on co-occurrence data.
 * </p>
 *
 * <p>
 * The algorithm proceeds as follows:
 * <ul>
 * <li/>first, generate a set of candidate clusters as follows:
 * <ul>
 * <li/>pick (widely separated) vertex pair, run VoltageScorer
 * <li/>group the vertices in two clusters according to their voltages
 * <li/>store resulting candidate clusters
 * </ul>
 * <li/>second, generate k-1 clusters as follows:
 * <ul>
 * <li/>pick a vertex v as a cluster 'seed' <br>
 * (Wu/Huberman: most frequent vertex in candidate clusters)
 * <li/>calculate co-occurrence over all candidate clusters of v with each other
 * vertex
 * <li/>separate co-occurrence counts into high/low; high vertices constitute a
 * cluster
 * <li/>remove v's vertices from candidate clusters; continue
 * </ul>
 * <li/>finally, remaining unassigned vertices are assigned to the kth
 * ("garbage") cluster.
 * </ul>
 * </p>
 *
 * <p>
 * <b>NOTE</b>: Depending on how the co-occurrence data splits the data into
 * clusters, the number of clusters returned by this algorithm may be less than
 * the number of clusters requested. The number of clusters will never be more
 * than the number requested, however.
 * </p>
 *
 * @author Joshua O'Madadhain
 * @see "'Finding communities in linear time: a physics approach', Fang Wu and Bernardo Huberman, http://www.hpl.hp.com/research/idl/papers/linear/"
 * @see VoltageScorer
 * @see KMeansClusterer
 */
public class VoltageClusterer<V, E> {
    /** The number of candidate clusters generated during phase 1. */
    protected int num_candidates;

    /** k-means clusterer used for both voltage splits and co-occurrence splits. */
    protected KMeansClusterer<V> kmc;

    /** Source of randomness for source/target vertex-pair selection. */
    protected Random rand;

    /** The graph whose vertices are to be clustered. */
    protected Graph<V, E> g;

    /**
     * Creates an instance of a VoltageCluster with the specified parameters.
     * These are mostly parameters that are passed directly to VoltageScorer and
     * KMeansClusterer.
     *
     * @param g
     *            the graph whose vertices are to be clustered; must be non-null
     * @param num_candidates
     *            the number of candidate clusters to create
     * @throws IllegalArgumentException
     *             if <code>g</code> is null or <code>num_candidates</code> &lt; 1
     */
    public VoltageClusterer(Graph<V, E> g, int num_candidates) {
        if (g == null) {
            throw new IllegalArgumentException("g must be non-null");
        }
        if (num_candidates < 1) {
            throw new IllegalArgumentException("must generate >=1 candidates");
        }

        this.num_candidates = num_candidates;
        this.kmc = new KMeansClusterer<V>();
        this.rand = new Random();
        this.g = g;
    }

    /**
     * Sets the random seed, making subsequent runs reproducible.
     *
     * @param random_seed
     *            seed for the internal random number generator
     */
    protected void setRandomSeed(int random_seed) {
        rand = new Random(random_seed);
    }

    /**
     * Returns a community (cluster) centered around <code>v</code>.
     *
     * @param v
     *            the vertex whose community we wish to discover
     */
    public Collection<Set<V>> getCommunity(V v) {
        return cluster_internal(v, 2);
    }

    /**
     * Clusters the vertices of <code>g</code> into <code>num_clusters</code>
     * clusters, based on their connectivity.
     *
     * @param num_clusters
     *            the number of clusters to identify
     */
    public Collection<Set<V>> cluster(int num_clusters) {
        return cluster_internal(null, num_clusters);
    }

    /**
     * Does the work of <code>getCommunity</code> and <code>cluster</code>.
     *
     * @param origin
     *            the vertex around which clustering is to be done, or null to
     *            cluster the whole graph
     * @param num_clusters
     *            the (maximum) number of clusters to find
     */
    protected Collection<Set<V>> cluster_internal(V origin, int num_clusters) {
        ArrayList<V> v_array = new ArrayList<V>(g.getVertices());

        // A graph with fewer than 2 vertices cannot supply a distinct
        // source/target pair; handle it up front rather than spinning forever
        // in the target-selection loop below.
        if (v_array.size() < 2) {
            Collection<Set<V>> trivial = new LinkedList<Set<V>>();
            if (!v_array.isEmpty()) {
                trivial.add(new HashSet<V>(v_array));
            }
            return trivial;
        }

        // Phase 1: generate candidate clusters. num_candidates times:
        // * pick a (widely separated) vertex pair, run VoltageScorer
        // * use k-means to identify communities in the ranked graph
        // * store the resulting candidate communities
        LinkedList<Set<V>> candidates = new LinkedList<Set<V>>();

        for (int j = 0; j < num_candidates; j++) {
            V source = (origin == null)
                    ? v_array.get(rand.nextInt(v_array.size()))
                    : origin;
            V target;
            do {
                target = v_array.get(rand.nextInt(v_array.size()));
            } while (source == target);

            VoltageScorer<V, E> vs = new VoltageScorer<V, E>(g, source, target);
            vs.evaluate();
            Map<V, double[]> voltage_ranks = new HashMap<V, double[]>();
            for (V v : g.getVertices()) {
                voltage_ranks.put(v, new double[] { vs.getVertexScore(v) });
            }

            // addOneCandidateCluster(candidates, voltage_ranks);
            addTwoCandidateClusters(candidates, voltage_ranks);
        }

        // Phase 2: repeat up to (num_clusters - 1) times:
        // * pick a vertex v as a cluster seed
        //   (Wu/Huberman: most frequent vertex in candidates)
        // * calculate co-occurrence (in candidate clusters) of this vertex
        //   with all others
        // * use k-means to separate co-occurrence counts into high/low;
        //   high vertices are a cluster
        // * remove v's vertices from candidate clusters
        Collection<Set<V>> clusters = new LinkedList<Set<V>>();
        Set<V> remaining = new HashSet<V>(g.getVertices());

        List<V> seed_candidates = getSeedCandidates(candidates);
        int seed_index = 0;

        for (int j = 0; j < (num_clusters - 1); j++) {
            if (remaining.isEmpty()) {
                break;
            }

            V seed = null;
            if (seed_index == 0 && origin != null) {
                seed = origin;
            } else {
                // skip seeds that have already been assigned to a cluster;
                // stop entirely if every seed candidate has been consumed
                while (seed_index < seed_candidates.size()) {
                    V s = seed_candidates.get(seed_index++);
                    if (remaining.contains(s)) {
                        seed = s;
                        break;
                    }
                }
                if (seed == null) {
                    break;
                }
            }

            Map<V, double[]> occur_counts = getObjectCounts(candidates, seed);
            if (occur_counts.size() < 2) {
                break;
            }

            // now that we have the counts, cluster them...
            try {
                Collection<Map<V, double[]>> high_low = kmc.cluster(
                        occur_counts, 2);

                // ...get the cluster with the highest-valued centroid...
                Iterator<Map<V, double[]>> h_iter = high_low.iterator();
                Map<V, double[]> cluster1 = h_iter.next();
                Map<V, double[]> cluster2 = h_iter.next();
                double[] centroid1 = DiscreteDistribution.mean(cluster1.values());
                double[] centroid2 = DiscreteDistribution.mean(cluster2.values());
                Set<V> new_cluster = (centroid1[0] >= centroid2[0])
                        ? cluster1.keySet()
                        : cluster2.keySet();

                // ...remove the elements of new_cluster from each candidate...
                for (Set<V> cluster : candidates) {
                    cluster.removeAll(new_cluster);
                }
                clusters.add(new_cluster);
                remaining.removeAll(new_cluster);
            } catch (NotEnoughClustersException nece) {
                // all remaining vertices are in the same cluster
                break;
            }
        }

        // identify remaining vertices (if any) as a 'garbage' cluster
        if (!remaining.isEmpty()) {
            clusters.add(remaining);
        }

        return clusters;
    }

    /**
     * Do k-means with three intervals and pick the smaller two clusters
     * (presumed to be on the ends); this is closer to the Wu-Huberman method.
     * If k-means cannot produce three clusters, no candidates are added.
     *
     * @param candidates
     *            the list to which new candidate clusters are appended
     * @param voltage_ranks
     *            map from vertex to its (single-element) voltage score
     */
    protected void addTwoCandidateClusters(LinkedList<Set<V>> candidates,
            Map<V, double[]> voltage_ranks) {
        try {
            List<Map<V, double[]>> clusters = new ArrayList<Map<V, double[]>>(
                    kmc.cluster(voltage_ranks, 3));
            boolean b01 = clusters.get(0).size() > clusters.get(1).size();
            boolean b02 = clusters.get(0).size() > clusters.get(2).size();
            boolean b12 = clusters.get(1).size() > clusters.get(2).size();
            // add the two clusters other than the (strictly) largest one;
            // on a size tie among the largest, no branch fires (original behavior)
            if (b01 && b02) {
                candidates.add(clusters.get(1).keySet());
                candidates.add(clusters.get(2).keySet());
            } else if (!b01 && b12) {
                candidates.add(clusters.get(0).keySet());
                candidates.add(clusters.get(2).keySet());
            } else if (!b02 && !b12) {
                candidates.add(clusters.get(0).keySet());
                candidates.add(clusters.get(1).keySet());
            }
        } catch (NotEnoughClustersException e) {
            // no valid candidates, continue
        }
    }

    /**
     * Alternative to addTwoCandidateClusters(): cluster vertices by voltages
     * into 2 clusters. We only consider the smaller of the two clusters
     * returned by k-means to be a 'true' cluster candidate; the other is a
     * garbage cluster.
     *
     * @param candidates
     *            the list to which the new candidate cluster is appended
     * @param voltage_ranks
     *            map from vertex to its (single-element) voltage score
     */
    protected void addOneCandidateCluster(LinkedList<Set<V>> candidates,
            Map<V, double[]> voltage_ranks) {
        try {
            List<Map<V, double[]>> clusters = new ArrayList<Map<V, double[]>>(
                    kmc.cluster(voltage_ranks, 2));
            if (clusters.get(0).size() < clusters.get(1).size()) {
                candidates.add(clusters.get(0).keySet());
            } else {
                candidates.add(clusters.get(1).keySet());
            }
        } catch (NotEnoughClustersException e) {
            // no valid candidates, continue
        }
    }

    /**
     * Returns an array of cluster seeds, ranked in decreasing order of number
     * of appearances in the specified collection of candidate clusters.
     *
     * @param candidates
     *            the candidate clusters whose members are to be ranked
     */
    protected List<V> getSeedCandidates(Collection<Set<V>> candidates) {
        final Map<V, double[]> occur_counts = getObjectCounts(candidates, null);

        List<V> occurrences = new ArrayList<V>(occur_counts.keySet());
        Collections.sort(occurrences, new MapValueArrayComparator(occur_counts));

        return occurrences;
    }

    /**
     * Counts, for each vertex of <code>g</code>, the number of candidate
     * clusters it appears in. If <code>seed</code> is non-null, only candidate
     * clusters containing <code>seed</code> are counted (i.e. the result is the
     * co-occurrence count of each vertex with the seed).
     *
     * @param candidates
     *            the candidate clusters to tally
     * @param seed
     *            the vertex to measure co-occurrence against, or null to count
     *            all candidate clusters
     * @return map from vertex to a single-element array holding its count
     */
    protected Map<V, double[]> getObjectCounts(Collection<Set<V>> candidates,
            V seed) {
        Map<V, double[]> occur_counts = new HashMap<V, double[]>();
        for (V v : g.getVertices()) {
            occur_counts.put(v, new double[] { 0 });
        }

        for (Set<V> candidate : candidates) {
            if (seed == null || candidate.contains(seed)) {
                for (V element : candidate) {
                    // count arrays are mutated in place
                    occur_counts.get(element)[0]++;
                }
            }
        }

        return occur_counts;
    }

    /**
     * Orders vertices in decreasing order of the (single-element) count array
     * associated with them in the supplied map.
     */
    protected class MapValueArrayComparator implements Comparator<V> {
        private Map<V, double[]> map;

        protected MapValueArrayComparator(Map<V, double[]> map) {
            this.map = map;
        }

        @Override
        public int compare(V o1, V o2) {
            // arguments swapped to produce descending order
            return Double.compare(map.get(o2)[0], map.get(o1)[0]);
        }
    }
}