PivotWisePartition.java example

Explorer
GeDBIT-master
- src
  - GeDBIT
/**
 * GeDBIT.index.algorithms.PivotWisePartition.java 2006.06.28
 *
 * Copyright Information:
 *
 * Change Log:
 * 2006.06.28: Created, by Rui Mao
 */
package GeDBIT.index.algorithms;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Logger;
import java.util.Collections;

import GeDBIT.dist.Metric;
import GeDBIT.index.VPInternalNode;
import GeDBIT.type.IndexObject;
import GeDBIT.util.Debug;
import GeDBIT.util.Histogram;

/**
 * This is a utility class implementing a pivot-wise data partition algorithm.
 * It partitions the data one pivot at a time: at each step every unused pivot
 * is tried, and the pivot whose partition has the best objective value is the
 * one actually used to split the data.
 * 
 * @author Rui Mao
 * @version 2006.06.28
 */
class PivotWisePartition implements PartitionMethod {
    /** Max leaf size given to the most recent partition(double[][], ...) call. */
    int MaxLS = 0;
    /** Logger given to the most recent partition(double[][], ...) call. */
    Logger logger = null;
    /** SVF argument given to the most recent partition(double[][], ...) call. */
    int SVF = 0;
    /**
     * Distance matrix of the current partition run, indexed as
     * distance[pivot][data offset]. Its columns are permuted in place by
     * sort() together with the data list.
     */
    double[][] distance = null;
    /** Radius used as the initial value of R when processTask() selects a pivot. */
    double MaxRadius = 0;
    /**
     * Divisor applied to R in partitionByOnePivot() when building the
     * histogram (R / HistogramScale is passed to the histogram builder;
     * presumably the bin width - confirm against Histogram).
     */
    double HistogramScale = 10;

    /**
     * Sets the radius that pivot selection starts from.
     *
     * @param R
     *            the new value of the maximum radius
     */
    public void setMaxRadius(double R) {
        MaxRadius = R;
    }

    /**
     * Partitions the complete data list; shorthand for the range-based
     * overload called with the range [0, data.size()).
     *
     * @param metric
     *            metric used to compute data-to-pivot distances
     * @param pivots
     *            the pivots to partition with
     * @param data
     *            the data to partition
     * @param numPartitions
     *            forwarded unchanged to the range-based overload
     * @param maxLS
     *            max leaf size, forwarded unchanged
     * @return the result of the range-based overload
     */
    public PartitionResults partition(Metric metric, IndexObject[] pivots,
            List<? extends IndexObject> data, int numPartitions, int maxLS) {
        final int wholeSize = data.size();
        return partition(metric, pivots, data, 0, wholeSize, numPartitions,
                maxLS);
    }

    /**
     * Computes the distance from every data point in
     * [first, first + size) to every pivot, then delegates to the
     * distance-matrix based overload on the corresponding sub-list.
     *
     * @param metric
     *            metric used to compute the distances
     * @param pivots
     *            the pivots to partition with
     * @param data
     *            the source data list
     * @param first
     *            offset of the first data point to partition, inclusive
     * @param size
     *            number of data points to partition
     * @param numPartitions
     *            passed on as the SVF argument
     * @param maxLS
     *            max leaf size
     * @return the result of the distance-matrix based overload
     */
    public PartitionResults partition(Metric metric, IndexObject[] pivots,
            List<? extends IndexObject> data, int first, int size,
            int numPartitions, int maxLS) {
        // one row per pivot, one column per data point of the range
        final double[][] pivotToData = new double[pivots.length][size];

        for (int column = 0; column < size; column++) {
            for (int row = 0; row < pivots.length; row++) {
                pivotToData[row][column] = metric.getDistance(
                        data.get(first + column), pivots[row]);
            }
        }

        final List<? extends IndexObject> range = data.subList(first, first
                + size);
        final Logger indexLogger = Logger.getLogger("GeDBIT.index");
        return partition(pivotToData, pivots, range, numPartitions, maxLS,
                indexLogger, this.MaxRadius);
    }

    /**
     * Given pivots and a pre-computed distance matrix, partitions the data
     * set pivot by pivot.
     * <p>
     * The arguments are recorded in the instance fields of this object, and
     * both {@code data} and the columns of {@code distance} are reordered in
     * place while partitioning, so that every resulting cluster occupies a
     * contiguous range.
     * 
     * @param distance
     *            distances from each data point (column) to each pivot (row)
     * @param pivot
     *            the pivots array
     * @param data
     *            the source data list to split; must support in-place element
     *            swaps
     * @param SVF
     *            partition number induced by each vantage point; stored in
     *            this object, not otherwise consulted by this method
     * @param maxLS
     *            max leaf size; a cluster that is not larger than this is not
     *            partitioned further
     * @param logger
     *            logger that receives the debug message
     * @param R
     *            radius that pivot selection starts from; overwrites the
     *            value set through setMaxRadius
     * @return a {@link PartitionResults} built from (1) one sub-list of
     *         {@code data} per final cluster and (2) a {@link VPInternalNode}
     *         created from the pivots and, for every cluster, its lower and
     *         upper distance bound to every pivot (cluster * pivot)
     */
    public PartitionResults partition(double[][] distance, IndexObject[] pivot,
            List<? extends IndexObject> data, final int SVF, final int maxLS,
            Logger logger, double R) {
        if (Debug.debug)
            logger.finer("Pivot-wise Partition");

        this.logger = logger;
        this.MaxLS = maxLS;
        this.SVF = SVF;
        this.distance = distance;
        this.MaxRadius = R;

        final int numP = pivot.length;

        // Work list of clusters that still have to be examined. Each task
        // holds the [first, last) range of its cluster in the data list and
        // the lower/upper distance bounds to every pivot; an upper bound of
        // -1 marks a pivot that has not been used yet.
        LinkedList<PartitionTask> taskList = new LinkedList<PartitionTask>();
        taskList.addFirst(new PartitionTask(data, pivot));

        // Clusters that will not be split any further, in the order in which
        // they were completed. They become the children of the index node.
        // An ArrayList is used because this list is only appended to and then
        // traversed once.
        List<PartitionTask> completedTask = new ArrayList<PartitionTask>();

        while (!taskList.isEmpty()) {
            PartitionTask task = taskList.removeFirst();

            // a task that has used all pivots, or is small enough to be a
            // leaf, is final
            if (task.isDone() || task.isLeaf(distance, maxLS)) {
                completedTask.add(task);
                continue;
            }

            // otherwise split it by the best pivot and examine the resulting
            // sub-clusters next, ahead of the remaining tasks
            taskList.addAll(0, processTask(task));
        }

        // partitioning is done, assemble the result in the required format
        final int childrenNumber = completedTask.size();

        List<List<? extends IndexObject>> subDataList = new ArrayList<List<? extends IndexObject>>(
                childrenNumber);
        double[][] allLower = new double[childrenNumber][numP];
        double[][] allUpper = new double[childrenNumber][numP];

        int child = 0;
        for (PartitionTask task : completedTask) {
            subDataList.add(data.subList(task.first, task.last));
            System.arraycopy(task.lower, 0, allLower[child], 0, numP);
            System.arraycopy(task.upper, 0, allUpper[child], 0, numP);
            child++;
        }

        VPInternalNode predicate = new VPInternalNode(pivot, allLower,
                allUpper, data.size(), new long[childrenNumber]);

        return new PartitionResults(subDataList, predicate);
    }

    /**
     * Splits one partition task into sub-tasks.
     * <p>
     * Steps: (1) try every unused pivot and keep the one with the largest
     * objective value; (2) reorder the data list and the distance matrix so
     * that the clusters induced by that pivot are contiguous; (3) create one
     * new task per non-empty cluster.
     * <p>
     * Note 1: the caller should already have checked that the task cannot be
     * a leaf node. Note 2: the caller should also have checked whether all
     * the points are identical.
     * 
     * @param task
     *            the task to split; the bounds of any pivot whose distance
     *            range is a single value are written into it
     * @return the sub-tasks, ordered by their position in the data list
     */
    private List<PartitionTask> processTask(PartitionTask task) {
        final int pivotCount = task.pivot.length;

        // objective value of the best partition so far, the larger the better
        double bestScore = Double.NEGATIVE_INFINITY;
        int chosenPivot = 0;
        double widestRange = 0;
        double[] leftBounds = null; // inclusive
        double[] rightBounds = null; // inclusive
        double[] firstOffsetsAsDouble = null; // inclusive, cast to int below

        // Try all unused pivots with the current radius. If none of them
        // yields a partition, retry with a quarter of the widest distance
        // range that was seen.
        double radius = this.MaxRadius;
        while ((bestScore == Double.NEGATIVE_INFINITY) && !task.isDone()) {
            for (int p = 0; p < pivotCount; p++) {
                // an upper bound other than -1 marks a used pivot
                if (task.upper[p] != -1)
                    continue;

                // rows of the candidate: 0 = objective value, 1 = cluster
                // left bounds, 2 = {range left, range right}, 3 = cluster
                // first offsets, 4 = cluster right bounds
                final double[][] candidate = partitionByOnePivot(p, radius,
                        task);

                if (bestScore < candidate[0][0]) {
                    bestScore = candidate[0][0];
                    leftBounds = candidate[1];
                    rightBounds = candidate[4];
                    firstOffsetsAsDouble = candidate[3];
                    chosenPivot = p;
                }

                final double rangeLeft = candidate[2][0];
                final double rangeRight = candidate[2][1];
                if (rangeLeft == rangeRight) {
                    // all distances to this pivot are equal: record the
                    // bounds, which also marks the pivot as used
                    task.lower[p] = rangeLeft;
                    task.upper[p] = rangeRight;
                } else {
                    final double width = rangeRight - rangeLeft;
                    if (!(widestRange > width))
                        widestRange = width;
                }
            }

            if (bestScore == Double.NEGATIVE_INFINITY)
                radius = widestRange / 4;
        }

        // partition by the selected pivot
        final int clusterCount = firstOffsetsAsDouble.length;
        final int[] firstOffsets = new int[clusterCount];
        for (int c = 0; c < clusterCount; c++)
            firstOffsets[c] = (int) firstOffsetsAsDouble[c];

        sort(leftBounds, firstOffsets, task, chosenPivot);

        // create one task per non-empty cluster
        final ArrayList<PartitionTask> children = new ArrayList<PartitionTask>(
                clusterCount);
        for (int c = 0; c < clusterCount; c++) {
            final int begin = firstOffsets[c];
            // the last cluster ends where the task ends, every other cluster
            // ends where its successor starts
            final int end = (c == clusterCount - 1) ? task.last
                    : firstOffsets[c + 1];
            if (begin == end)
                continue; // empty cluster

            final double[] childLower = task.lower.clone();
            final double[] childUpper = task.upper.clone();
            childLower[chosenPivot] = leftBounds[c];
            childUpper[chosenPivot] = rightBounds[c];
            children.add(new PartitionTask(task.data, begin, end, task.pivot,
                    childLower, childUpper));
        }

        return children;
    }

    /**
     * Partitions the data of a task by their distances to one pivot.
     * <p>
     * A histogram of the distances is built first. If the histogram has
     * fewer than three bins, or the range covered by its inner bins (all but
     * the first and the last one) is not wider than 2R, the work is delegated
     * to partitionSmallRange. Otherwise the best 3-way split into a left
     * part, a middle part and a right part is searched for.
     * 
     * @param pivot
     *            row of the distance matrix to partition by
     * @param R
     *            radius; the middle part is grown until it is at least 2R
     *            wide or reaches the last inner bin
     * @param task
     *            the task whose range [first, last) is partitioned
     * @return an array of five rows:
     *         <ul>
     *         <li>row 0: a single element, the objective function value, the
     *         larger the better</li>
     *         <li>row 1: left bound of each cluster</li>
     *         <li>row 2: two elements, the lower bound of the first bin and
     *         the upper bound of the last bin</li>
     *         <li>row 3: offset of the first element of each cluster</li>
     *         <li>row 4: right bound of each cluster</li>
     *         </ul>
     */
    double[][] partitionByOnePivot(int pivot, double R, PartitionTask task) {
        ArrayList<Histogram.BinInfo> bin = Histogram.completeOneDHistogram(-R
                / this.HistogramScale / 2, R / this.HistogramScale,
                this.distance[pivot], task.first, task.last);

        final int binCount = bin.size();

        double[][] result = new double[5][];
        result[2] = new double[] { bin.get(0).lower(),
                bin.get(binCount - 1).upper() };

        // return if range is not large enough
        if ((binCount < 3)
                || (bin.get(binCount - 2).upper() - bin.get(1).lower()) <= 2 * R) {
            partitionSmallRange(result, bin, task);
            return result;
        }

        // Range is large enough, find the pruning rate for all the possible
        // 3-partitions. If the sizes of the 3 clusters are a, b, c, where b
        // has width 2R, then the pruning rate is r = 2ac/(a+b+c)^2. Since
        // a+b+c is constant for all partitions, r = ac is used for
        // comparison.
        int bestLeftBoundary = 0; // offset of the first bin of the middle part
        int bestRightBoundary = 0; // offset of the last bin of the middle part
        int bestA = 0, bestB = 0; // cluster sizes of the best partition
        double maxR = -1; // max value of r = ac so far, the larger the better

        // Size of the left part. It is kept as a running sum: moving the left
        // boundary one bin to the right adds exactly one bin to it, so there
        // is no need to sum all the bins again for every boundary.
        double a = 0;
        for (int leftBoundary = 1; leftBoundary < binCount - 1; leftBoundary++) {
            // no middle part of width 2R fits any more
            if (bin.get(binCount - 2).upper() - bin.get(leftBoundary).lower() < 2 * R)
                break;

            a += bin.get(leftBoundary - 1).size();

            // grow the middle part until it is 2R wide; it never goes beyond
            // the last inner bin, so the right part is never empty of bins
            int rightBoundary = leftBoundary;
            double b = bin.get(rightBoundary).size();
            while ((rightBoundary < binCount - 2)
                    && ((bin.get(rightBoundary).upper() - bin.get(leftBoundary)
                            .lower()) < 2 * R)) {
                rightBoundary++;
                b += bin.get(rightBoundary).size();
            }

            // comparison with the best-so-far
            final double r = a * (task.last - task.first - b - a);
            if (maxR < r) {
                maxR = r;
                bestLeftBoundary = leftBoundary;
                bestRightBoundary = rightBoundary;
                bestA = (int) a;
                bestB = (int) b;
            }
        }

        // set the cluster information and return
        result[0] = new double[] { maxR };
        result[1] = new double[3];
        result[3] = new double[3];
        result[4] = new double[3];

        // left part
        result[1][0] = bin.get(0).lower();
        result[3][0] = task.first;
        result[4][0] = bin.get(bestLeftBoundary - 1).upper();

        // middle part
        result[1][1] = bin.get(bestLeftBoundary).lower();
        result[3][1] = task.first + bestA;
        result[4][1] = bin.get(bestRightBoundary).upper();

        // right part
        result[1][2] = bin.get(bestRightBoundary + 1).lower();
        result[3][2] = task.first + bestA + bestB;
        result[4][2] = bin.get(binCount - 1).upper();

        return result;
    }

    /**
     * Fills in the partition result for a distance range that is too small
     * for a 3-way split.
     * <p>
     * If every bin has equal lower and upper bounds, each bin becomes a
     * cluster of its own and the objective value is 0. Otherwise a single
     * cluster covering the whole range is reported with an objective value
     * of negative infinity.
     * 
     * @param result
     *            the result array to complete; row 2 (the range) must
     *            already be set, rows 0, 1, 3 and 4 are written
     * @param bin
     *            the histogram bins of the distances
     * @param task
     *            the task being partitioned, supplies the first offset
     */
    void partitionSmallRange(double[][] result,
            ArrayList<Histogram.BinInfo> bin, PartitionTask task) {
        // the values are discrete if every bin is a single point
        boolean discrete = true;
        for (int k = 0; discrete && k < bin.size(); k++) {
            final Histogram.BinInfo info = bin.get(k);
            discrete = (info.upper() == info.lower());
        }

        if (!discrete) {
            // one cluster spanning the whole range, marked as "no partition"
            result[0] = new double[] { Double.NEGATIVE_INFINITY };
            result[1] = new double[] { result[2][0] };
            result[4] = new double[] { result[2][1] };
            result[3] = new double[] { task.first };
            return;
        }

        // discrete: each distinct value is a cluster
        final int clusterCount = bin.size();
        result[0] = new double[] { 0 };

        final double[] values = new double[clusterCount];
        result[1] = values;
        for (int k = 0; k < clusterCount; k++)
            values[k] = bin.get(k).lower();
        result[4] = values.clone();

        // a cluster starts where the previous one ends
        final double[] offsets = new double[clusterCount];
        result[3] = offsets;
        offsets[0] = task.first;
        for (int k = 1; k < clusterCount; k++)
            offsets[k] = offsets[k - 1] + bin.get(k - 1).size();
    }

    /**
     * Rearranges, in place, the data list of a task and the columns of the
     * distance matrix so that the elements of each cluster become
     * contiguous. Every swap of two data elements is mirrored by a swap of
     * the two corresponding columns in all rows of the distance matrix.
     * <p>
     * The cluster of an element is the first cluster c for which the
     * element's distance to the given pivot is smaller than
     * clusterLeftBound[c + 1]; if there is no such cluster, it is the last
     * one.
     * 
     * @param clusterLeftBound
     *            left bound of each cluster
     * @param clusterFirstOffset
     *            offset of the first element of each cluster in the data
     *            list; not modified, a copy is advanced instead
     * @param task
     *            supplies the data list to reorder, the number of pivots and
     *            the end offset of the last cluster
     * @param pivot
     *            row of the distance matrix that decides the cluster of an
     *            element
     */
    void sort(double[] clusterLeftBound, int[] clusterFirstOffset,
	    PartitionTask task, int pivot) {
	double temp = 0;
	int toCluster = 0;
	final int clusterNum = clusterFirstOffset.length;
	// currentOffset[c] is the next position of cluster c that has not been
	// confirmed to hold an element of cluster c
	int[] currentOffset = (int[]) clusterFirstOffset.clone();
	for (int cluster = 0; cluster < clusterNum; cluster++) {
	    // walk over all positions of this cluster; the last cluster ends at
	    // task.last, any other one where the next cluster starts
	    for (; currentOffset[cluster] < ((cluster == clusterNum - 1) ? task.last
		    : clusterFirstOffset[cluster + 1]); currentOffset[cluster]++) {
		// any value different from cluster, to enter the loop below
		toCluster = cluster + 1;
		// repeat until the element at the current position belongs here
		while (toCluster != cluster) {
		    // compute the cluster the current element belongs to
		    for (toCluster = 0; toCluster < clusterNum - 1; toCluster++) {
			if (this.distance[pivot][currentOffset[cluster]] < clusterLeftBound[toCluster + 1])
			    break;
		    }

		    if (toCluster != cluster) // exchange
		    {
			// move the element to the next open position of its own
			// cluster; the element that was there comes here and is
			// examined in the next round
			Collections.swap(task.data, currentOffset[cluster],
				currentOffset[toCluster]);

			// keep the distance columns aligned with the data list
			for (int i = 0; i < task.pivot.length; i++) {
			    temp = distance[i][currentOffset[cluster]];
			    distance[i][currentOffset[cluster]] = distance[i][currentOffset[toCluster]];
			    distance[i][currentOffset[toCluster]] = temp;
			}
			// that position of the target cluster is now settled
			currentOffset[toCluster]++;
		    }// end of exchange
		}// end of while
	    }// end of one cluster
	}

    }

    /**
     * A cluster of data that is waiting to be partitioned, or that has been
     * partitioned completely. It covers the range [first, last) of the data
     * list and carries the distance bounds of the cluster to the pivots.
     */
    class PartitionTask {
        /** data to partition */
        List<? extends IndexObject> data;

        /** offset of the first point in the data list, inclusive */
        final int first;

        /** offset of the last point in the data list, exclusive */
        final int last;

        /** pivots based on distance to which the data is partitioned */
        IndexObject[] pivot;

        /**
         * upper bounds of the distances to the pivots, one per pivot;
         * upper[i] == -1 means that pivot[i] is not used yet
         */
        double[] upper;

        /** lower bounds of the distances to the pivots, one per pivot */
        double[] lower;

        /**
         * Creates a task that covers the whole data list and has not used any
         * pivot yet.
         * 
         * @param data
         *            data to partition, copy by reference
         * @param pivot
         *            pivots to use, copy by reference
         */
        public PartitionTask(List<? extends IndexObject> data,
                IndexObject[] pivot) {
            this(data, 0, data.size(), pivot, new double[pivot.length],
                    new double[pivot.length]);
            // mark every pivot as unused
            for (int p = 0; p < pivot.length; p++) {
                this.upper[p] = -1;
            }
        }

        /**
         * Creates a task for the range [first, last) of the data list.
         * 
         * @param data
         *            data to partition, copy by reference
         * @param first
         *            offset of the first point in the data list, inclusive
         * @param last
         *            offset of the last point in the data list, exclusive
         * @param pivot
         *            pivots to use, copy by reference
         * @param lower
         *            lower bounds to used pivots, copy by value
         * @param upper
         *            upper bounds to used pivots, copy by value
         * @throws IllegalArgumentException
         *             if a reference argument is null, the range is empty, or
         *             the three arrays differ in length
         */
        public PartitionTask(List<? extends IndexObject> data, int first,
                int last, IndexObject[] pivot, double[] lower, double[] upper) {
            final boolean anyNull = (data == null) || (pivot == null)
                    || (upper == null) || (lower == null);
            if (anyNull) {
                throw new IllegalArgumentException("Null argument!");
            }

            if (first >= last) {
                throw new IllegalArgumentException(
                        "Empty data list to partition!");
            }

            final boolean sameLength = (pivot.length == upper.length)
                    && (upper.length == lower.length);
            if (!sameLength) {
                throw new IllegalArgumentException(
                        "Arrays of inconsistent size!");
            }

            this.data = data;
            this.first = first;
            this.last = last;
            this.pivot = pivot;
            this.upper = upper.clone();
            this.lower = lower.clone();
        }

        /**
         * Checks whether there are still pivots to use.
         * 
         * @return true if no pivots are left to use
         */
        boolean isDone() {
            for (double bound : upper) {
                if (bound == -1) {
                    return false;
                }
            }
            return true;
        }

        /**
         * Checks whether this partition task is small enough to form a leaf
         * node. If yes, the distance range to every unused pivot is computed
         * and stored, which also marks all pivots as used.
         * 
         * @param distance
         *            distances from each data point (column) to each pivot
         *            (row)
         * @param mls
         *            maximum leaf node size
         * @return true if the task holds at most mls points
         */
        boolean isLeaf(double[][] distance, int mls) {
            if (last - first > mls) {
                return false;
            }

            for (int p = 0; p < pivot.length; p++) {
                if (upper[p] != -1) {
                    continue; // pivot p is already used
                }

                // min and max distance of the cluster to pivot p
                upper[p] = Double.NEGATIVE_INFINITY;
                lower[p] = Double.POSITIVE_INFINITY;
                for (int k = first; k < last; k++) {
                    final double d = distance[p][k];
                    if (!(upper[p] > d)) {
                        upper[p] = d;
                    }
                    if (!(lower[p] < d)) {
                        lower[p] = d;
                    }
                }
            }

            return true;
        }

    }
}