package uk.ac.rhul.cs.cl1;
import java.util.*;
import java.util.concurrent.*;
import uk.ac.rhul.cs.cl1.growth.ClusterGrowthWorker;
import uk.ac.rhul.cs.cl1.merging.AbstractNodeSetMerger;
import uk.ac.rhul.cs.cl1.seeding.Seed;
import uk.ac.rhul.cs.cl1.seeding.SeedGenerator;
import uk.ac.rhul.cs.cl1.seeding.SeedIterator;
import uk.ac.rhul.cs.cl1.support.OrderMaintainingQueue;
import uk.ac.rhul.cs.cl1.support.UsedNodeSet;
import uk.ac.rhul.cs.collections.IntObjectHashMap;
import uk.ac.rhul.cs.graph.Graph;
import uk.ac.rhul.cs.graph.GraphAlgorithm;
import uk.ac.rhul.cs.graph.TransitivityCalculator;
import uk.ac.rhul.cs.utils.ArrayUtils;
import uk.ac.rhul.cs.utils.Ordered;
/**
* Main class for the ClusterONE algorithm.
*
* This class represents an instance of the algorithm along with all its
* necessary parameters. The main entry point of the algorithm is the
* run() method which executes the clustering algorithm on the graph
* set earlier using the setGraph() method. The algorithm can also be
* run in a separate thread as it implements the Callable interface.
*
* @author Tamas Nepusz <tamas@cs.rhul.ac.uk>
*/
public class ClusterONE extends GraphAlgorithm implements Callable<Void>, TaskMonitorSupport {
/** The name of the application that will appear on the user interface */
public static final String applicationName = "ClusterONE";
/** The version number of the application */
public static final String version = "1.1";
/** A thread pool used for asynchronous operations within ClusterONE */
private static Executor threadPool = null;
/** The clustering result as a list of {@link ValuedNodeSet} objects */
protected ValuedNodeSetList result = null;
/** Algorithm settings for this instance */
protected ClusterONEAlgorithmParameters parameters = null;
/** A task monitor where the algorithm will report its progress */
protected TaskMonitor monitor = new NullTaskMonitor();
/** Whether we are running on a Mac or not */
protected static boolean runningOnMac = false;
static {
runningOnMac = System.getProperty("os.name").toLowerCase().startsWith("mac os x");
}
/**
 * Internal enum that stores the state of the main loop of the algorithm.
 *
 * The main loop in run() keeps iterating until it reaches a state whose
 * {@link #isTerminal} flag is set.
 */
enum State {
    /** Initial state; the main loop moves on to seed generation right away */
    START,
    /** Seeds are being taken from the seed iterator and offered to the workers */
    GENERATING_SEEDS,
    /** There are no more seeds; the end-of-seeds marker still has to be posted to the workers */
    NOTIFYING_WORKERS_NO_MORE_SEEDS,
    /** The end-of-seeds marker has been posted; clusters are still being collected */
    WAITING_FOR_CLUSTERS,
    /** Every posted seed has been answered with a cluster */
    FINISHED(true),
    /** The algorithm was asked to stop before it could finish */
    CANCELLED(true);

    /**
     * Whether the main loop has to stop when it reaches this state.
     * Final because enum constants are shared by every user of the enum.
     */
    final boolean isTerminal;

    /** Creates a non-terminal state. */
    State() {
        this(false);
    }

    /**
     * Creates a state.
     *
     * @param isTerminal whether the main loop has to stop in this state
     */
    State(boolean isTerminal) {
        this.isTerminal = isTerminal;
    }
}
/**
 * Creates an algorithm instance that is configured with a freshly constructed,
 * default {@link ClusterONEAlgorithmParameters} object.
 */
public ClusterONE() {
    this(new ClusterONEAlgorithmParameters());
}
/**
 * Creates an algorithm instance that is configured with the given parameters.
 *
 * @param algorithmParameters the {@link ClusterONEAlgorithmParameters} instance
 *                            controlling the algorithm; when null, a default
 *                            parameter object is created and used instead
 */
public ClusterONE(ClusterONEAlgorithmParameters algorithmParameters) {
    this.setParameters(algorithmParameters != null
            ? algorithmParameters
            : new ClusterONEAlgorithmParameters());
}
/**
 * Runs the algorithm by delegating to {@link #run()}, so that an instance can be
 * handed to anything that accepts a {@link Callable}.
 *
 * The call itself is synchronous: it executes in whichever thread invokes it and
 * only returns once run() has returned.
 *
 * @return always null
 * @throws ClusterONEException if run() throws it
 */
@Override
public Void call() throws ClusterONEException {
    run();
    return null;
}
/**
 * Gives access to the parameter object this instance is currently configured with.
 *
 * @return the current algorithm parameters
 */
public ClusterONEAlgorithmParameters getParameters() {
    return this.parameters;
}
/**
 * Gives access to the stored clustering result.
 *
 * @return the stored list of clusters, or null if no result has been stored yet
 */
public List<ValuedNodeSet> getResults() {
    return this.result;
}
/**
 * Returns the thread pool used by ClusterONE for asynchronous operations.
 *
 * The pool is a single-thread executor that is created on the first call and
 * shared afterwards. The method is synchronized so that concurrent first calls
 * cannot each create (and then lose) an executor of their own.
 *
 * @return the shared executor
 */
public static synchronized Executor getThreadPool() {
    if (threadPool == null)
        threadPool = Executors.newSingleThreadExecutor();
    return threadPool;
}
/**
 * Tells whether the application is running on a Mac.
 *
 * @return the value of the static runningOnMac flag
 */
public static boolean isRunningOnMac() {
    return ClusterONE.runningOnMac;
}
/**
 * Executes the algorithm on the graph set earlier by setGraph().
 *
 * The method checks its input, chooses a density threshold if the parameters
 * do not specify one, and then starts a fixed-size pool of
 * {@link ClusterGrowthWorker} instances. Seeds coming from the configured
 * {@link SeedGenerator} are offered to the workers through a bounded queue;
 * the clusters sent back by the workers are passed through an
 * {@link OrderMaintainingQueue} before they are accepted or dropped. When the
 * main loop ends in the FINISHED state, the accepted clusters are handed to
 * the merger and the outcome is stored as the result of this instance. When
 * the loop is cancelled, the stored result is left untouched.
 *
 * The worker pool is shut down on every exit path, including exceptions.
 * An interrupt of the calling thread does not abort the run, but the
 * interrupt status of the thread is restored before the method exits.
 *
 * @throws ClusterONEException if no graph has been set, if an edge weight is
 *         negative, or if the merging method named in the parameters cannot
 *         be instantiated
 */
public void run() throws ClusterONEException {
    /* Fail with a descriptive error instead of a NullPointerException */
    if (graph == null)
        throw new ClusterONEException("No graph has been set; call setGraph() before run()");

    Double minDensity = parameters.getMinDensity();
    AbstractNodeSetMerger merger;
    Seed seed;
    Seed pendingSeed = null;
    ValuedNodeSet cluster;
    Ordered<ValuedNodeSet> orderedCluster;
    State state = State.START;
    UsedNodeSet usedNodes;
    int numGeneratedSeeds = 0;
    int numPostedSeeds = 0;
    int numProcessedClusters = 0;
    /* Clusters accepted so far; kept apart from the result field until merging succeeds */
    ValuedNodeSetList acceptedClusters = new ValuedNodeSetList();
    /* Seeds posted to the workers, keyed by the sequence number they were posted with */
    IntObjectHashMap submittedSeeds = new IntObjectHashMap();
    OrderMaintainingQueue<ValuedNodeSet> receivedClusters = new OrderMaintainingQueue<ValuedNodeSet>();

    /* Simple sanity checks */
    if (ArrayUtils.min(graph.getEdgeWeights()) < 0.0)
        throw new ClusterONEException("Edge weights must all be non-negative");

    try {
        merger = AbstractNodeSetMerger.fromString(
                parameters.getMergingMethodName());
    } catch (InstantiationException ex) {
        throw new ClusterONEException(ex.getMessage());
    }

    /* Set the minimum density automatically if needed */
    if (minDensity == null) {
        monitor.setStatus("Choosing density threshold...");
        monitor.setPercentCompleted(0);
        if (graph.isWeighted())
            minDensity = 0.3;
        else {
            TransitivityCalculator calc = new TransitivityCalculator(graph);
            calc.setTaskMonitor(monitor);
            if (calc.getGlobalTransitivity() < 0.1)
                minDensity = 0.6;
            else
                minDensity = 0.5;
        }
        monitor.setPercentCompleted(100);
    }

    /* Create an executor service that will run the workers */
    int numThreads = parameters.getNumThreads();
    if (numThreads <= 0) {
        numThreads = Math.max(1, Runtime.getRuntime().availableProcessors());
    }
    ExecutorService executor = Executors.newFixedThreadPool(numThreads);

    /* Set once the main loop (and merging, if any) got through without an exception */
    boolean completedWithoutException = false;
    /* Remembers that an InterruptedException was caught so the status can be restored */
    boolean interrupted = false;

    try {
        // Create the input and output queue for the workers.
        // Limit the size of the seed queue so it does not run too much "ahead" the worker
        // threads. This is useful for seed generators that depend on the clusters produced
        // by the workers.
        LinkedBlockingQueue<Ordered<Seed>> seedQueue = new LinkedBlockingQueue<Ordered<Seed>>(numThreads);
        LinkedBlockingQueue<Ordered<ValuedNodeSet>> clusterQueue = new LinkedBlockingQueue<Ordered<ValuedNodeSet>>();

        /* Create the workers and post them to the executor */
        for (int i = 0; i < numThreads; i++) {
            ClusterGrowthWorker worker = new ClusterGrowthWorker(graph, parameters, minDensity,
                    seedQueue, clusterQueue);
            worker.setDebugMode(debugMode);
            executor.execute(worker);
        }

        // Get the seed generator from the parameters
        SeedGenerator seedGenerator = parameters.getSeedGenerator();
        seedGenerator.setGraph(graph);

        // Create a used node set where we will mark nodes that have been used in clusters
        usedNodes = new UsedNodeSet(graph);

        // Set up the task monitor
        if (numThreads > 1) {
            monitor.setStatus("Growing clusters from seeds using " + numThreads + " threads...");
        } else {
            monitor.setStatus("Growing clusters from seeds...");
        }
        monitor.setPercentCompleted(0);

        // Set up the seed iterator
        SeedIterator it = seedGenerator.iterator();

        // Start iterating over the seeds and collecting the clusters
        while (!state.isTerminal) {
            switch (state) {
            case START:
                state = State.GENERATING_SEEDS;
                break;

            case GENERATING_SEEDS:
                // Try to fill the seed queue with seeds
                boolean shouldEnqueue = true;
                while (shouldEnqueue) {
                    // Get the next seed that is acceptable
                    boolean seedAccepted = false;
                    seed = null;
                    while (!seedAccepted) {
                        if (pendingSeed != null) {
                            // Retry the seed that did not fit into the queue last time
                            seed = pendingSeed;
                            numGeneratedSeeds++;
                            pendingSeed = null;
                        } else if (it.hasNext()) {
                            seed = it.next();
                            numGeneratedSeeds++;
                        } else {
                            seed = null;
                        }
                        seedAccepted = (seed == null || !parameters.shouldRejectSeedsWithOnlyUsedNodes() ||
                                !usedNodes.areAllNodesUsedFromSeed(seed));
                    }

                    if (seed == null) {
                        state = State.NOTIFYING_WORKERS_NO_MORE_SEEDS;
                        shouldEnqueue = false;
                    } else {
                        // Offer the seed to the workers; if the queue is full, do nothing
                        if (seedQueue.offer(new Ordered<Seed>(numPostedSeeds, seed))) {
                            // Store the seed and increase the number of posted seeds
                            submittedSeeds.add(numPostedSeeds, seed);
                            numPostedSeeds++;
                        } else {
                            // Queue is full now. Store the seed so we can try it again in the next iteration.
                            pendingSeed = seed;
                            // Undo the count; the seed is counted again when it is retried
                            numGeneratedSeeds--;
                            shouldEnqueue = false;
                        }
                    }
                }
                break;

            case NOTIFYING_WORKERS_NO_MORE_SEEDS:
                // Iterator has just became null, so inform workers that there will
                // be no more seeds.
                if (seedQueue.offer(new Ordered<Seed>(numPostedSeeds, ClusterGrowthWorker.NO_MORE_SEEDS))) {
                    state = State.WAITING_FOR_CLUSTERS;
                }
                break;

            case WAITING_FOR_CLUSTERS:
                // If we have processed all the seeds, switch to the FINISHED state
                if (numPostedSeeds == numProcessedClusters) {
                    state = State.FINISHED;
                }
                break;

            case FINISHED:
            case CANCELLED:
                // Nothing to do here; we should not get here anyway.
            }

            // Check for termination
            if (shouldStop) {
                state = State.CANCELLED;
            }

            // In GENERATING_SEEDS, NOTIFYING_WORKERS_NO_MORE_SEEDS and WAITING_FOR_CLUSTERS states,
            // try to read a cluster from the cluster queue if we still expect one.
            if (state == State.GENERATING_SEEDS || state == State.NOTIFYING_WORKERS_NO_MORE_SEEDS ||
                    state == State.WAITING_FOR_CLUSTERS) {
                // Try to get clusters from the incoming queue if we expect them
                while (numProcessedClusters < numPostedSeeds) {
                    orderedCluster = null;
                    try {
                        orderedCluster = clusterQueue.take();
                    } catch (InterruptedException ex) {
                        // Keep going as before, but remember the interrupt so that it
                        // can be handed back to the caller when the method exits.
                        interrupted = true;
                    }
                    if (orderedCluster == null)
                        break;

                    // Add the cluster to the queue that will restore the ordering
                    // according to the sequence numbers
                    receivedClusters.add(orderedCluster);
                    numProcessedClusters++;

                    // Try to retrieve a few clusters from receivedClusters; note that
                    // even though we have added a cluster above, the queue might still
                    // appear empty if the cluster with the _next_ sequence number we
                    // are waiting for has not arrived yet
                    while (!receivedClusters.isEmpty()) {
                        orderedCluster = receivedClusters.remove();
                        cluster = orderedCluster.object;

                        // Drop the bookkeeping entry of the seed in every case, so seeds
                        // that produced an empty cluster do not pile up in the map.
                        Seed originalSeed = (Seed) submittedSeeds.remove(orderedCluster.sequenceNumber);

                        if (cluster != ClusterGrowthWorker.EMPTY_CLUSTER) {
                            // Check whether the cluster would have been generated at all if we were working
                            // sequentially.
                            if (!usedNodes.areAllNodesUsedFromSeed(originalSeed)) {
                                // Yes, so mark the nodes in the seed and the cluster as used and store the
                                // cluster.
                                acceptedClusters.add(cluster);
                                usedNodes.markSeedAsUsed(originalSeed);
                                usedNodes.markNodeSetAsUsed(cluster);
                            }
                        }
                    }

                    // We try to keep all our workers busy so we break out of the while
                    // loop here if the queue through which we feed the workers has some
                    // empty slots and we are still generating seeds.
                    if (state == State.GENERATING_SEEDS && seedQueue.remainingCapacity() > 0)
                        break;
                }
            }

            // Report progress.
            // Progress has to be calculated from numGeneratedSeeds and not numPostedSeeds
            // because some seeds may be skipped before posting them to workers
            monitor.setPercentCompleted((int) (numGeneratedSeeds * 100.0 / it.getEstimatedLength()));

            // Check for termination
            if (shouldStop) {
                state = State.CANCELLED;
            }
        }

        if (state == State.FINISHED) {
            // Merge highly overlapping clusters
            merger.setTaskMonitor(monitor);
            this.result = merger.mergeOverlapping(acceptedClusters, parameters.getSimilarityFunction(),
                    parameters.getOverlapThreshold());
        }

        completedWithoutException = true;
    } finally {
        if (!completedWithoutException) {
            // An exception is on its way to the caller. Stop the workers so their
            // threads are not left behind, but do not wait for them: that could
            // hold the exception back for a long time.
            executor.shutdownNow();
        } else {
            if (state == State.FINISHED) {
                // The end-of-seeds marker has been posted, so an orderly shutdown is enough
                executor.shutdown();
            } else {
                // Cancelled: the end-of-seeds marker may never have been posted, so the
                // workers may still be waiting for seeds. Interrupt them.
                // NOTE(review): assumes ClusterGrowthWorker exits when interrupted - confirm.
                executor.shutdownNow();
            }

            // Wait for the workers to terminate. 1 day is a reasonable upper bound on the timeout ;)
            while (true) {
                try {
                    executor.awaitTermination(1, TimeUnit.DAYS);
                    break;
                } catch (InterruptedException ex) {
                    // Keep waiting, the status is restored below
                    interrupted = true;
                }
            }
        }

        // Hand the interrupt back to the caller instead of swallowing it
        if (interrupted)
            Thread.currentThread().interrupt();
    }
}
/**
 * Convenience method that stores the given graph with setGraph() and then
 * starts the algorithm with run().
 *
 * @param graph the graph to be clustered
 * @throws ClusterONEException if run() throws it
 */
public void runOnGraph(Graph graph) throws ClusterONEException {
    this.setGraph(graph);
    this.run();
}
/**
 * Replaces the parameter object this instance is configured with.
 *
 * @param parameters the parameter settings to use from now on
 */
public void setParameters(ClusterONEAlgorithmParameters parameters) {
    this.parameters = parameters;
}
/**
 * Sets the task monitor where the algorithm will report its progress.
 *
 * run() reports to the monitor without checking it for null, so a null
 * argument is replaced by a {@link NullTaskMonitor}, the same kind of
 * monitor the field is initialised with.
 *
 * @param monitor the task monitor to use; null selects a NullTaskMonitor
 */
public void setTaskMonitor(TaskMonitor monitor) {
    this.monitor = (monitor != null) ? monitor : new NullTaskMonitor();
}
}