/**
 * GeDBIT.index.algorithms.PivotWisePartition.java 2006.06.28
 *
 * Copyright Information:
 *
 * Change Log:
 * 2006.06.28: Created, by Rui Mao
 */
package GeDBIT.index.algorithms;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Logger;
import java.util.Collections;
import GeDBIT.dist.Metric;
import GeDBIT.index.VPInternalNode;
import GeDBIT.type.IndexObject;
import GeDBIT.util.Debug;
import GeDBIT.util.Histogram;
/**
 * A utility class implementing a data-partition algorithm. It partitions the
 * data pivot by pivot; which pivot to partition with at each step is decided
 * by the quality of the partition that pivot would produce.
 *
 * @author Rui Mao
 * @version 2006.06.28
 */
class PivotWisePartition implements PartitionMethod {
    // Maximum leaf size: a cluster with at most this many points is not split
    // further. Set from the maxLS argument of partition().
    int MaxLS = 0;
    // Logger for debug tracing; assigned in partition(double[][], ...).
    Logger logger = null;
    // Split factor: the number of partitions induced by each vantage point.
    // Stored but not read again in this class after assignment.
    int SVF = 0;
    // distance[p][j] = distance from the j-th data point (relative to the
    // partitioned sub-list) to pivot p. Shared state: sort() swaps its columns
    // in place to keep it aligned with the reordered data list.
    double[][] distance = null;
    // Radius R used to size the middle band of a 3-way split (width 2*R).
    double MaxRadius = 0;
    // Histogram granularity: bins are presumably R / HistogramScale wide —
    // TODO confirm against GeDBIT.util.Histogram.completeOneDHistogram.
    double HistogramScale = 10;

    /**
     * Sets the radius used when searching for a 3-way split along a pivot.
     *
     * @param R the new radius
     */
    public void setMaxRadius(double R) {
        this.MaxRadius = R;
    }

    /**
     * Partitions the whole data list with the given pivots. Convenience
     * overload that delegates to the (first, size) variant with first = 0 and
     * size = data.size().
     *
     * @param metric        the metric used to compute distances
     * @param pivots        the pivot objects
     * @param data          the data list to partition
     * @param numPartitions number of partitions induced by each pivot (SVF)
     * @param maxLS         maximum leaf size
     * @return the partition result (sub-lists plus an internal-node predicate)
     */
    public PartitionResults partition(Metric metric, IndexObject[] pivots,
            List<? extends IndexObject> data, int numPartitions, int maxLS) {
        return partition(metric, pivots, data, 0, data.size(), numPartitions,
                maxLS);
    }

    /**
     * Computes the pivot-to-point distance matrix for the range
     * [first, first + size) of the data list, then partitions that range.
     *
     * @param metric        the metric used to compute distances
     * @param pivots        the pivot objects
     * @param data          the data list; only [first, first + size) is used
     * @param first         offset of the first point, inclusive
     * @param size          number of points to partition
     * @param numPartitions number of partitions induced by each pivot (SVF)
     * @param maxLS         maximum leaf size
     * @return the partition result (sub-lists plus an internal-node predicate)
     */
    public PartitionResults partition(Metric metric, IndexObject[] pivots,
            List<? extends IndexObject> data, int first, int size,
            int numPartitions, int maxLS) {
        // Row p holds the distances of all points in the range to pivot p;
        // column indices are relative to `first`.
        double[][] distance = new double[pivots.length][size];
        for (int i = first; i < first + size; i++)
            for (int j = 0; j < pivots.length; j++)
                distance[j][i - first] = metric.getDistance(data.get(i),
                        pivots[j]);
        // Pass a subList view so that in-place swaps performed by sort()
        // write through to the caller's list.
        return partition(distance, pivots, data.subList(first, first + size),
                numPartitions, maxLS, Logger.getLogger("GeDBIT.index"),
                this.MaxRadius);
    }

    /**
     * Given pivots, this method partitions the dataset pivot by pivot.
     *
     * @param distance
     *            distances from each data point (column) to each pivot (row)
     * @param pivot
     *            the pivots array, each element can be computed distance on
     * @param data
     *            the source data list to split, each element is a
     *            {@link RecordObject}
     * @param SVF
     *            partition number induced by each vantage point
     * @param maxLS
     *            max leaf size; if a cluster has no more points than this, it
     *            is not partitioned further
     * @param logger
     *            logger for debug output
     * @param R
     *            radius used to size the middle band of a 3-way split
     * @return a {@link PartitionResults} holding the list of child data
     *         sub-lists and a {@link VPInternalNode} predicate whose
     *         lower/upper range matrices give, for each child, the min/max
     *         distance of that child's points to every pivot (child x pivot)
     */
    public PartitionResults partition(double[][] distance, IndexObject[] pivot,
            List<? extends IndexObject> data, final int SVF, final int maxLS,
            Logger logger, double R) {
        if (Debug.debug)
            logger.finer("Pivot-wise Partition");
        this.logger = logger;
        this.MaxLS = maxLS;
        this.SVF = SVF;
        this.distance = distance;
        this.MaxRadius = R;
        final int numP = pivot.length;
        // Work list of pending partition tasks. Each task records:
        //   1. the [first, last) offsets of its cluster in the data list, and
        //   2. the distance ranges (lower/upper bounds) to all pivots;
        //      an upper bound of -1 marks a pivot as not used yet.
        LinkedList<PartitionTask> taskList = new LinkedList<PartitionTask>();
        taskList.addFirst(new PartitionTask(data, pivot));
        // Finished tasks; each becomes one child of the index node built below.
        LinkedList<PartitionTask> completedTask = new LinkedList<PartitionTask>();
        // Depth-first refinement loop: take a task, finish it or split it.
        while (!taskList.isEmpty()) {
            PartitionTask task = taskList.removeFirst();
            // A task is complete when all pivots are used, or when it is small
            // enough to be a leaf (isLeaf also finalizes its distance bounds).
            if (task.isDone() || task.isLeaf(distance, maxLS)) {
                completedTask.add(task);
                continue;
            }
            // Otherwise split the task:
            //   1. select the best remaining pivot,
            //   2. partition the cluster on that pivot,
            //   3. reorder the data list and distance matrix accordingly,
            //   4. push the resulting sub-tasks (prepended: depth-first).
            taskList.addAll(0, processTask(task));
        }
        // Partitioning is done; convert completed tasks into the result format.
        // NOTE(review): childrenNumber == 1 (cluster could not be partitioned)
        // is not treated specially here — verify downstream handles it.
        final int childrenNumber = completedTask.size();
        List<List<? extends IndexObject>> subDataList = new ArrayList<List<? extends IndexObject>>(
                childrenNumber);
        double[][] allLower = new double[childrenNumber][numP];
        double[][] allUpper = new double[childrenNumber][numP];
        for (int i = 0; i < childrenNumber; i++) {
            PartitionTask task = completedTask.get(i);
            // sort() has grouped each cluster contiguously, so a subList view
            // over [first, last) is exactly the child's data.
            subDataList.add(data.subList(task.first, task.last));
            for (int j = 0; j < numP; j++) {
                allLower[i][j] = task.lower[j];
                allUpper[i][j] = task.upper[j];
            }
        }
        VPInternalNode predicate = new VPInternalNode(pivot, allLower,
                allUpper, data.size(), new long[childrenNumber]);
        PartitionResults partitionResult = new PartitionResults(subDataList,
                predicate);
        return partitionResult;
    }

    /**
     * Processes one partition task. Note 1: the task should be checked as to
     * whether it can be a leaf node before calling this method. Note 2: the
     * task should also be checked as to whether all the points are identical.
     * Steps: 1. select a best pivot; 2. partition based on this pivot;
     * 3. sort the data list and distance array so that data and distances
     * belonging to the same sub-cluster are contiguous; 4. create the new
     * sub-tasks and return them.
     */
    private List<PartitionTask> processTask(PartitionTask task) {
        final int pivotNum = task.pivot.length;
        // Objective-function value of the best partition seen so far; larger
        // is better. NEGATIVE_INFINITY means no usable partition found yet.
        double obj = Double.NEGATIVE_INFINITY;
        int pivot = 0;
        double largestRange = 0;
        double[] clusterLeftBound = null; // inclusive
        double[] clusterRightBound = null; // inclusive
        double[] clusterFirstOffsetDouble = null; // inclusive; cast to int later
        // Select the pivot to partition with. If no pivot yields a usable
        // partition at the current radius, retry with a smaller radius.
        double tempR = this.MaxRadius;
        while ((obj == Double.NEGATIVE_INFINITY) && !task.isDone()) {
            for (int i = 0; i < pivotNum; i++) {
                // Skip pivots already used (upper bound != -1 sentinel).
                if (task.upper[i] != -1)
                    continue;
                // Partition by one pivot; returns a 2-d double array (no empty
                // cluster allowed):
                //   row 0: single element, the objective value (larger is
                //          better, e.g. the pruning rate)
                //   row 1: cluster left bounds, inclusive
                //   row 2: overall range, {left, right}
                //   row 3: cluster first offsets
                //   row 4: cluster right bounds, inclusive
                double[][] result = partitionByOnePivot(i, tempR, task);
                if (obj < result[0][0]) {
                    obj = result[0][0];
                    clusterLeftBound = result[1];
                    clusterRightBound = result[4];
                    clusterFirstOffsetDouble = result[3];
                    pivot = i;
                }
                if (result[2][0] == result[2][1]) {
                    // All points are equidistant from this pivot: record the
                    // degenerate bounds, which also marks the pivot as used.
                    task.lower[i] = result[2][0];
                    task.upper[i] = result[2][1];
                } else {
                    // Track the widest distance range over all unused pivots,
                    // used to derive the fallback radius below.
                    largestRange = (largestRange > result[2][1] - result[2][0]) ? largestRange
                            : result[2][1] - result[2][0];
                }
            }
            if (obj != Double.NEGATIVE_INFINITY)
                break;
            // No pivot produced a usable split at tempR: shrink the radius so
            // the 2*R middle band fits inside the widest observed range.
            tempR = largestRange / 4;
        }
        // Partition by the selected pivot.
        // NOTE(review): if the while loop exits via task.isDone() with obj
        // still NEGATIVE_INFINITY, clusterFirstOffsetDouble is null and the
        // next line throws NPE — this appears unreachable because marking a
        // pivot used coincides with a finite objective value, but confirm.
        int[] clusterFirstOffset = new int[clusterFirstOffsetDouble.length];
        for (int i = 0; i < clusterFirstOffset.length; i++)
            clusterFirstOffset[i] = (int) clusterFirstOffsetDouble[i];
        // Group each cluster's points (and distance columns) contiguously.
        sort(clusterLeftBound, clusterFirstOffset, task, pivot);
        // Create the child partition tasks and return them.
        ArrayList<PartitionTask> children = new ArrayList<PartitionTask>(
                clusterFirstOffset.length);
        for (int i = 0; i < clusterFirstOffset.length; i++) {
            // Skip empty clusters (a cluster is empty when its first offset
            // equals the next cluster's first offset, or task.last for the
            // final cluster).
            if (((i == clusterFirstOffset.length - 1) && (clusterFirstOffset[i] == task.last))
                    || ((i < clusterFirstOffset.length - 1) && (clusterFirstOffset[i] == clusterFirstOffset[i + 1])))
                continue;
            // Children inherit the parent's bounds, narrowed on the chosen
            // pivot; arrays are cloned so siblings do not share state.
            double[] l = (double[]) task.lower.clone();
            double[] u = (double[]) task.upper.clone();
            l[pivot] = clusterLeftBound[i];
            u[pivot] = clusterRightBound[i];
            if (i == clusterFirstOffset.length - 1)
                children.add(new PartitionTask(task.data,
                        clusterFirstOffset[i], task.last, task.pivot, l, u));
            else
                children.add(new PartitionTask(task.data,
                        clusterFirstOffset[i], clusterFirstOffset[i + 1],
                        task.pivot, l, u));
        }
        return children;
    }

    /**
     * Partitions by the distances to one pivot; returns a 2-d double array (no
     * empty cluster allowed):
     * row 0: a single element, the objective-function value — the larger the
     * better (e.g. the pruning rate); row 1: cluster left bounds, left
     * inclusive, right exclusive; row 2: overall range, {left, right}; row 3:
     * cluster first offsets; row 4: cluster right bounds, left exclusive,
     * right inclusive.
     */
    double[][] partitionByOnePivot(int pivot, double R, PartitionTask task) {
        // Build a 1-D histogram of the distances to this pivot over the
        // task's range. Presumably the first argument is the bin origin and
        // the second the bin width (R / HistogramScale) — TODO confirm against
        // GeDBIT.util.Histogram.
        ArrayList<Histogram.BinInfo> bin = Histogram.completeOneDHistogram(-R
                / this.HistogramScale / 2, R / this.HistogramScale,
                this.distance[pivot], task.first, task.last);
        double[][] result = new double[5][];
        // Row 2: overall distance range covered by the histogram.
        result[2] = new double[] { bin.get(0).lower(),
                bin.get(bin.size() - 1).upper() };
        // If the range (excluding the outermost bins) is not wider than the
        // 2*R middle band, a 3-way split is impossible: fall back.
        if ((bin.size() < 3)
                || (bin.get(bin.size() - 2).upper() - bin.get(1).lower()) <= 2 * R) {
            partitionSmallRange(result, bin, task);
            return result;
        }
        // The range is large enough: evaluate the pruning rate of every
        // possible 3-partition. If the three cluster sizes are a, b, c, where
        // the middle part b has width 2*R, the pruning rate is
        // r = 2ac/(a+b+c)^2; since a+b+c is constant across partitions, r = ac
        // suffices for comparison.
        int bestLeftBoundary = 0; // offset of the first bin in the middle part
        int bestRightBoundary = 0; // offset of the last bin in the middle part
        int bestA = 0, bestB = 0; // cluster sizes of the best partition
        double maxR = -1; // best r = ac so far; larger is better
        double a = 0, b = 0;
        int rightBoundary = 0;
        for (int leftBoundary = 1; leftBoundary < bin.size() - 1; leftBoundary++) {
            // Stop once the remaining span to the right cannot hold the 2*R
            // middle band.
            if (bin.get(bin.size() - 2).upper() - bin.get(leftBoundary).lower() < 2 * R)
                break;
            // a = number of points strictly left of the middle part.
            a = 0;
            for (int i = 0; i < leftBoundary; i++)
                a += bin.get(i).size();
            // Grow the middle part rightwards until it is at least 2*R wide;
            // b accumulates the points inside it.
            rightBoundary = leftBoundary;
            b = bin.get(rightBoundary).size();
            while ((rightBoundary < bin.size() - 2)
                    && ((bin.get(rightBoundary).upper() - bin.get(leftBoundary)
                            .lower()) < 2 * R)) {
                rightBoundary++;
                b += bin.get(rightBoundary).size();
            }
            // Middle part reached the right end — already guarded at the top
            // of the loop; this check is kept for safety.
            if (rightBoundary == bin.size() - 1)
                break;
            // Keep the partition maximizing r = a*c (c = total - a - b).
            if (maxR < a * (task.last - task.first - b - a)) {
                maxR = a * (task.last - task.first - b - a);
                bestLeftBoundary = leftBoundary;
                bestRightBoundary = rightBoundary;
                bestA = (int) a;
                bestB = (int) b;
            }
        }
        // Fill in the three clusters' bounds and first offsets, then return.
        result[0] = new double[] { maxR };
        result[1] = new double[3];
        result[3] = new double[3];
        result[4] = new double[3];
        result[1][0] = bin.get(0).lower();
        result[3][0] = task.first;
        result[4][0] = bin.get(bestLeftBoundary - 1).upper();
        result[1][1] = bin.get(bestLeftBoundary).lower();
        result[3][1] = task.first + bestA;
        result[4][1] = bin.get(bestRightBoundary).upper();
        result[1][2] = bin.get(bestRightBoundary + 1).lower();
        result[3][2] = task.first + bestA + bestB;
        result[4][2] = bin.get(bin.size() - 1).upper();
        return result;
    }

    /**
     * Fallback when the distance range to a pivot is too narrow for a 3-way
     * split. If every bin is degenerate (upper == lower, i.e. the distances
     * are discrete), each distinct value becomes its own cluster with
     * objective value 0; otherwise the pivot is reported unusable by setting
     * the objective value to NEGATIVE_INFINITY. Results are written into the
     * caller-supplied {@code result} array (rows 0, 1, 3, 4; row 2 is assumed
     * to be filled by the caller).
     */
    void partitionSmallRange(double[][] result,
            ArrayList<Histogram.BinInfo> bin, PartitionTask task) {
        boolean isDiscrete = true;
        for (Histogram.BinInfo b : bin)
            if (b.upper() != b.lower()) {
                isDiscrete = false;
                break;
            }
        if (isDiscrete) // if discrete, return each distinct value as a cluster
        {
            result[0] = new double[] { 0 };
            result[1] = new double[bin.size()];
            for (int i = 0; i < bin.size(); i++)
                result[1][i] = bin.get(i).lower();
            // Degenerate bins: each cluster's right bound equals its left.
            result[4] = (double[]) result[1].clone();
            result[3] = new double[bin.size()];
            result[3][0] = task.first;
            // First offsets are the running sum of the preceding bin sizes.
            for (int i = 1; i < bin.size(); i++)
                result[3][i] = result[3][i - 1] + bin.get(i - 1).size();
        } else {
            // Not usable at this radius: report a single cluster spanning the
            // whole range with objective NEGATIVE_INFINITY so the caller will
            // retry with a smaller radius.
            result[0] = new double[] { Double.NEGATIVE_INFINITY };
            result[1] = new double[] { result[2][0] };
            result[4] = new double[] { result[2][1] };
            result[3] = new double[] { task.first };
        }
    }

    /**
     * Reorders the task's data range (and, in lockstep, the columns of the
     * shared distance matrix) into contiguous groups, based on the given
     * cluster split values and first offsets. Uses a cycle-chasing in-place
     * exchange: each misplaced element is swapped into its destination
     * cluster until the current slot holds an element that belongs here.
     *
     * @param clusterLeftBound   left bound (inclusive) of each cluster along
     *                           the chosen pivot
     * @param clusterFirstOffset first offset of each cluster in the data list
     * @param task               the task whose range is being reordered
     * @param pivot              index of the pivot the clusters are based on
     */
    void sort(double[] clusterLeftBound, int[] clusterFirstOffset,
            PartitionTask task, int pivot) {
        double temp = 0;
        int toCluster = 0;
        final int clusterNum = clusterFirstOffset.length;
        // currentOffset[c] = next slot to fill in cluster c; starts at each
        // cluster's first offset (cloned so the input array is untouched).
        int[] currentOffset = (int[]) clusterFirstOffset.clone();
        for (int cluster = 0; cluster < clusterNum; cluster++) {
            for (; currentOffset[cluster] < ((cluster == clusterNum - 1) ? task.last
                    : clusterFirstOffset[cluster + 1]); currentOffset[cluster]++) {
                // Seed with any value != cluster so the while loop runs once.
                toCluster = cluster + 1;
                while (toCluster != cluster) {
                    // Destination cluster of the element currently in this
                    // slot: the first cluster whose right neighbor's left
                    // bound exceeds the element's distance to the pivot.
                    for (toCluster = 0; toCluster < clusterNum - 1; toCluster++) {
                        if (this.distance[pivot][currentOffset[cluster]] < clusterLeftBound[toCluster + 1])
                            break;
                    }
                    if (toCluster != cluster) // exchange
                    {
                        // Swap the data elements and every pivot's distance
                        // column so the matrix stays aligned with the list.
                        Collections.swap(task.data, currentOffset[cluster],
                                currentOffset[toCluster]);
                        for (int i = 0; i < task.pivot.length; i++) {
                            temp = distance[i][currentOffset[cluster]];
                            distance[i][currentOffset[cluster]] = distance[i][currentOffset[toCluster]];
                            distance[i][currentOffset[toCluster]] = temp;
                        }
                        // The destination slot is now correctly filled.
                        currentOffset[toCluster]++;
                    } // end of exchange
                } // end of while
            } // end of one cluster
        }
    }

    /**
     * One unit of partitioning work: a contiguous cluster of the data list
     * together with the per-pivot distance bounds established so far.
     * An upper bound of -1 is the sentinel meaning "pivot not used yet".
     */
    class PartitionTask {
        List<? extends IndexObject> data; // data to partition
        final int first; // offset of the first point in the data list,
        // inclusive
        final int last; // offset of the last point in the data list, exclusive
        IndexObject[] pivot; // pivots based on distance to which to partition
        // the data
        double[] upper; // upper.length == lower.length == pivot.length
        double[] lower; // upper and lower bounds to used pivots, computed by
        // previous partition steps;
        // if upper[i] == -1, then pivot[i] is not used yet

        /**
         * Constructor of PartitionTask. Assumes no pivots were used: every
         * upper bound is initialized to the -1 "unused" sentinel.
         *
         * @param data
         *            data to partition, copied by reference
         * @param pivot
         *            pivots to use, copied by reference
         */
        public PartitionTask(List<? extends IndexObject> data,
                IndexObject[] pivot) {
            this(data, 0, data.size(), pivot, new double[pivot.length],
                    new double[pivot.length]);
            for (int i = 0; i < pivot.length; i++)
                upper[i] = -1;
        }

        /**
         * Constructor of PartitionTask.
         *
         * @param data
         *            data to partition, copied by reference
         * @param first
         *            offset of the first point in the data list, inclusive
         * @param last
         *            offset of the last point in the data list, exclusive
         * @param pivot
         *            pivots to use, copied by reference
         * @param upper
         *            upper bounds to used pivots, copied by value
         * @param lower
         *            lower bounds to used pivots, copied by value
         * @throws IllegalArgumentException
         *             if any argument is null, the range is empty, or the
         *             array lengths disagree
         */
        public PartitionTask(List<? extends IndexObject> data, int first,
                int last, IndexObject[] pivot, double[] lower, double[] upper) {
            if ((data == null) || (pivot == null) || (upper == null)
                    || (lower == null))
                throw new IllegalArgumentException("Null argument!");
            if (first >= last)
                throw new IllegalArgumentException(
                        "Empty data list to partition!");
            if ((pivot.length != upper.length)
                    || (upper.length != lower.length))
                throw new IllegalArgumentException(
                        "Arrays of inconsistent size!");
            this.data = data;
            this.first = first;
            this.last = last;
            this.pivot = pivot;
            // Bounds are cloned so this task can narrow them independently.
            this.upper = (double[]) upper.clone();
            this.lower = (double[]) lower.clone();
        }

        /**
         * Checks whether there are still pivots left to use.
         *
         * @return true if no pivots remain unused
         */
        boolean isDone() {
            for (int i = 0; i < upper.length; i++)
                if (upper[i] == -1)
                    return false;
            return true;
        }

        /**
         * Checks whether this partition task is small enough to form a leaf
         * node. If yes, this method has a deliberate side effect: it computes
         * the tight [min, max] distance range to every still-unused pivot and
         * stores it in lower/upper (marking those pivots used).
         *
         * @param distance
         *            distances from each data point (column) to each pivot
         *            (row)
         * @param mls
         *            maximum leaf node size
         * @return true if the task holds at most mls points
         */
        boolean isLeaf(double[][] distance, int mls) {
            if (last - first > mls)
                return false;
            for (int i = 0; i < pivot.length; i++) {
                if (upper[i] != -1) // pivot i is already used
                    continue;
                // Start from inverted infinities so the first point
                // initializes both bounds.
                upper[i] = Double.NEGATIVE_INFINITY;
                lower[i] = Double.POSITIVE_INFINITY;
                for (int j = first; j < last; j++) {
                    upper[i] = (upper[i] > distance[i][j]) ? upper[i]
                            : distance[i][j];
                    lower[i] = (lower[i] < distance[i][j]) ? lower[i]
                            : distance[i][j];
                }
            }
            return true;
        }
    }
}