/**
* Copyright (c) 2012 Cloudsmith Inc. and other contributors, as listed below.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Cloudsmith
*
*/
package org.cloudsmith.geppetto.common.stats;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import com.google.common.collect.Lists;
import com.google.common.collect.Range;
import com.google.common.collect.Ranges;
/**
* <p>
* IntegerCluster utility that implements a simplistic hierarchical clustering algorithm.
* </p>
* <p>
* This class solves the following problem: given a series of integers, find the smallest number of clusters (each a min-max inclusive range of
* values) such that no range is wider than a given max.
* </p>
* <p>
* If all values are within the given max, one cluster is produced, and if no two values are within max of each other, then there will be as many
* clusters as there are unique values in the observed set of values.
* </p>
* <p>
* The algorithm orders the set of observed values into a set of clusters of 0 size (min = max = value), and searches for the two adjacent clusters
* that produce the smallest resulting cluster if merged. If the smallest available merge is bigger than the max, the work is done. If the range is
* no bigger than the max, the merged cluster replaces the two inputs. The algorithm now loops back to check for the next two adjacent clusters with
* the smallest distance.
* </p>
* <p>
* The implementation is primarily intended for a fairly small number of observations/clusters, as the final step of mapping observations to clusters
* searches (binary search) for a cluster per value. To use this class with larger data sets, it would be better to keep a map from observations to
* clusters.
* </p>
*/
public class IntegerCluster {
    /**
     * An inclusive range {@code [min, max]} covering one or more observed values.
     * Instances are immutable; merging two nodes produces a new node.
     */
    private static class ClusterNode {
        private final int min;

        private final int max;

        /**
         * Creates the node that spans both given nodes.
         */
        private ClusterNode(ClusterNode a, ClusterNode b) {
            this.min = Math.min(a.min(), b.min());
            this.max = Math.max(a.max(), b.max());
        }

        /**
         * Creates a zero-width node for a single observation (min = max = value).
         */
        private ClusterNode(int value) {
            this.min = value;
            this.max = value;
        }

        public int max() {
            return this.max;
        }

        public int min() {
            return this.min;
        }
    }

    /**
     * Compares nodes; the node that starts first is smaller. If starting on the same value, the node that ends first is smaller.
     * <p>
     * The ordering is a proper total order (min first, then max), which is required by both {@link Collections#sort} and
     * {@link Collections#binarySearch}. Explicit comparisons are used instead of subtraction to avoid int overflow.
     * </p>
     */
    private static class ClusterNodeComparator implements Comparator<ClusterNode> {
        @Override
        public int compare(ClusterNode o1, ClusterNode o2) {
            if(o1.min() != o2.min())
                return o1.min() < o2.min() ? -1 : 1;
            if(o1.max() != o2.max())
                return o1.max() < o2.max() ? -1 : 1;
            return 0;
        }
    }

    private final static ClusterNodeComparator comparator = new ClusterNodeComparator();

    /** Holds raw observations (as zero-width nodes) and/or already merged clusters. Sorted whenever {@link #dirty} is false. */
    private final List<ClusterNode> clusterList = Lists.newArrayList();

    /** True when observations have been added since the last clustering pass. */
    private boolean dirty;

    /** The widest range (max - min) a single cluster is allowed to span. */
    private final int maxDistance;

    /**
     * Creates an empty cluster set.
     *
     * @param maxDistance
     *            the maximum allowed width (max - min) of a cluster; two adjacent clusters are merged only if the merged width does not exceed
     *            this value
     */
    public IntegerCluster(int maxDistance) {
        this.maxDistance = maxDistance;
    }

    /**
     * Records one observation. Clustering is deferred until the next query.
     *
     * @param observation
     *            the value to record
     */
    public void add(int observation) {
        clusterList.add(new ClusterNode(observation));
        dirty = true;
    }

    /**
     * Records all given observations by calling {@link #add(int)} for each element.
     *
     * @param iterable
     *            the values to record; a {@code null} element fails with a {@link NullPointerException} when unboxed
     */
    public void addAll(Iterable<Integer> iterable) {
        for(Integer i : iterable)
            add(i);
    }

    /**
     * Sorts the nodes and then repeatedly merges the pair of adjacent nodes that yields the narrowest merged range, until the narrowest
     * possible merge would exceed {@link #maxDistance} or only one node remains.
     */
    private void cluster() {
        Collections.sort(clusterList, comparator);
        while(clusterList.size() > 1) {
            int limit = clusterList.size() - 1;
            long smallestSpan = Long.MAX_VALUE;
            int smallestIndex = -1;
            for(int i = 0; i < limit; i++) {
                long span = distance(clusterList.get(i), clusterList.get(i + 1));
                // strict '<' keeps the first (lowest) pair on ties
                if(span < smallestSpan) {
                    smallestSpan = span;
                    smallestIndex = i;
                }
            }
            // the two closest clusters would produce a range that is bigger than max allowed - done.
            // (break, not return, so that the dirty flag below is cleared and the work is not repeated on every query)
            if(smallestSpan > maxDistance)
                break;

            // join the two clusters closest to each other.
            ClusterNode merged = new ClusterNode(clusterList.get(smallestIndex), clusterList.get(smallestIndex + 1));
            clusterList.set(smallestIndex, merged);
            clusterList.remove(smallestIndex + 1);
        }
        dirty = false;
    }

    /**
     * Finds the cluster that contains x.
     *
     * @param x
     *            a value expected to be one of the observed values
     * @return the cluster whose inclusive range contains x
     * @throws IllegalStateException
     *             if no cluster contains x (no observations at all, or x lies below, above or between the clusters)
     */
    private ClusterNode clusterForValue(int x) {
        lazyCluster();
        int pos = Collections.binarySearch(clusterList, new ClusterNode(x), comparator);
        if(pos < 0) {
            // -pos - 1 is the index of the first cluster ordered after [x, x] (or the size if there is none).
            int insertionPoint = -pos - 1;
            // Clusters are primarily ordered on min value: the cluster at the insertion point contains x only if it
            // starts exactly on x, otherwise x can only belong to the cluster just before the insertion point.
            boolean startsOnX = insertionPoint < clusterList.size() && clusterList.get(insertionPoint).min() == x;
            pos = startsOnX
                    ? insertionPoint
                    : insertionPoint - 1;
        }
        // pos is -1 when the list is empty or x is smaller than every cluster
        if(pos >= 0) {
            ClusterNode candidate = clusterList.get(pos);
            if(candidate.min() <= x && x <= candidate.max())
                return candidate;
        }
        // ouch, the value is outside all clusters - should not happen if used correctly
        throw new IllegalStateException("The given value was not included in the set of observed values: " + x);
    }

    /**
     * Returns the max value for the cluster x is a member of.
     *
     * @param x
     *            an observed value
     * @return the upper (inclusive) bound of the cluster containing x
     * @throws IllegalStateException
     *             if x is not contained in any cluster
     */
    public int clusterMax(int x) {
        return clusterForValue(x).max();
    }

    /**
     * Returns the min value for the cluster x is a member of.
     *
     * @param x
     *            an observed value
     * @return the lower (inclusive) bound of the cluster containing x
     * @throws IllegalStateException
     *             if x is not contained in any cluster
     */
    public int clusterMin(int x) {
        return clusterForValue(x).min();
    }

    /**
     * Computes the width of the range that would result from merging a and b. The result is a long since the difference between two
     * ints may not fit in an int.
     */
    private long distance(ClusterNode a, ClusterNode b) {
        int low = Math.min(a.min(), b.min());
        int high = Math.max(a.max(), b.max());
        return (long) high - low;
    }

    /**
     * Return the number of clusters.
     *
     * @return the number of clusters after clustering all observations added so far (0 if nothing has been added)
     */
    public int getClusterCount() {
        lazyCluster();
        return clusterList.size();
    }

    /**
     * Runs the clustering pass only if observations were added since the last pass.
     */
    private void lazyCluster() {
        if(dirty)
            cluster();
    }

    /**
     * Returns the clusters as closed ranges.
     *
     * @return a new list with one closed range {@code [min, max]} per cluster, in cluster order
     */
    public List<Range<Integer>> toListOfRanges() {
        lazyCluster();
        List<Range<Integer>> result = Lists.newArrayListWithExpectedSize(clusterList.size());
        for(ClusterNode n : clusterList) {
            result.add(Ranges.closed(n.min(), n.max()));
        }
        return result;
    }
}