package GeDBIT.index.algorithms;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Logger;

import GeDBIT.dist.Metric;
import GeDBIT.index.VPInternalNode;
import GeDBIT.util.Debug;
import GeDBIT.type.IndexObject;
import GeDBIT.type.DoubleIndexObjectPair;

/**
 * All the built-in data partition methods.
 *
 * <ul>
 * <li>{@code BALANCED}: all the partitions have similar sizes. Fast, but
 * performs worse.</li>
 * <li>{@code CLUSTERINGKMEANS}: partition the data according to the intrinsic
 * clustering, using k-means for each dimension.</li>
 * <li>{@code CLUSTERINGBOUNDARY}: partition according to the intrinsic
 * clustering, using a boundary-oriented algorithm for each dimension.</li>
 * </ul>
 *
 * Several constants interpret {@link #setMaxRadius(double)} as a mode switch
 * (GHTDegree): -1 selects GHT behavior, -2 selects CGHT behavior, in which
 * case exactly two pivots are required and per-point distances are transformed
 * to (d1 + d2, d1 - d2).
 */
public enum PartitionMethods implements PartitionMethod {

    BALANCED { // by Honglong Xu

        // Mode switch, set through setMaxRadius(): 0 = plain balanced
        // partition, -1 = GHT, -2 = CGHT (see class javadoc).
        private int GHTDegree = 0;

        /**
         * Repurposed here as a mode switch: the radius value is truncated to
         * int and stored as GHTDegree (-1: GHT, -2: CGHT).
         *
         * @param R the "radius"; actually the GHT degree code
         */
        public void setMaxRadius(double R) { // by Honglong Xu
            this.GHTDegree = (int) R;
        }

        /**
         * Partitions the whole data list.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size (unused by this method variant)
         * @return the partition result (children lists plus internal-node predicate)
         */
        public PartitionResults partition(final Metric metric, final IndexObject[] pivots,
                List<? extends IndexObject> data, final int numPartitions, int maxLS) {
            return partition(metric, pivots, data, 0, data.size(), numPartitions, maxLS);
        }

        /**
         * Partitions {@code size} elements of the data list starting at
         * {@code first}, logging to the default "GeDBIT.index" logger.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param first         offset of the first element to partition
         * @param size          number of elements to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size (unused by this method variant)
         * @return the partition result
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions, int maxLS) {
            return partition(metric, pivots, data, first, size, numPartitions, maxLS,
                    Logger.getLogger("GeDBIT.index"));
        }

        /**
         * Core balanced partition: for each pivot in turn, sorts every current
         * cluster by distance to that pivot and splits it into
         * {@code numPartitions} sub-clusters of near-equal size, never
         * splitting a run of identical distance values across two children.
         * The final fanout is {@code numPartitions ^ pivots.length}; empty
         * children are removed at the end.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param first         offset of the first element to partition
         * @param size          number of elements to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size (unused here)
         * @param logger        destination for debug traces
         * @return the partition result
         * @throws IllegalArgumentException in CGHT mode when there are not
         *         exactly two pivots
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions,
                int maxLS, Logger logger) {
            final int numPivots = pivots.length;
            final int fanout = (int) Math.pow(numPartitions, numPivots);
            if (Debug.debug)
                logger.finer("Start of splitData(), data size= " + size + ", VPNumber= "
                        + numPivots + ", fanout= " + fanout);

            // the lower and upper bound of distances from each child to each
            // vantage point
            double[][] lower = new double[fanout][numPivots];
            double[][] upper = new double[fanout][numPivots];

            // Pair each element with a mutable distance slot so sorting moves
            // the element and its current distance together.
            // FIX: the original looped from `first` to `size` and fetched
            // data.get(i), leaving wrapper[0..first-1] null and reading the
            // wrong elements whenever first > 0. Indices into `wrapper` are
            // always 0-based relative to `first`.
            DoubleIndexObjectPair[] wrapper = new DoubleIndexObjectPair[size];
            for (int i = 0; i < size; i++)
                wrapper[i] = new DoubleIndexObjectPair(0, data.get(first + i));

            // split data.
            int clusterNumber = 1; // total cluster number when partitioning based
                                   // on each vp, SVF ^ i
            int clusterCardinality = fanout; // number of final clusters in each
                                             // of the current clusters
            // offset of the first point in current cluster in wrapper, the
            // whole data list.
            // this array has an additional element at the end of value size,
            // serving as a loop guard; the first element is always 0.
            int[] clusterOffset = new int[2];
            clusterOffset[0] = 0;
            clusterOffset[1] = size;

            if (this.GHTDegree < 0) {
                // for now, GHT/CGHT modes require exactly two pivots
                if (numPivots != 2) {
                    throw new IllegalArgumentException(
                            "for CGHT partition, there should be only two pivots!");
                }
            }

            for (int i = 0; i < numPivots; i++) {
                if (Debug.debug)
                    logger.finer("\nStart spliting vp:" + i + ", cluster number:" + clusterNumber
                            + ", clusterCardinality =" + clusterCardinality
                            + ", computing distances to the vp...");

                // compute distance to the current VP.
                // hack: in CGHT mode, transform distances to d1+d2 (first
                // pass) and d1-d2 (second pass). // by Honglong Xu
                if (this.GHTDegree == -2) // cght
                {
                    double distance1, distance2;
                    for (int j = 0; j < size; j++) {
                        distance1 = metric.getDistance(pivots[0],
                                ((IndexObject) wrapper[j].getObject()));
                        distance2 = metric.getDistance(pivots[1],
                                ((IndexObject) wrapper[j].getObject()));
                        // distance1 becomes d1+d2; distance2 becomes d1-d2
                        distance1 = distance1 + distance2;
                        distance2 = distance1 - distance2 * 2;
                        wrapper[j].setDouble(i == 0 ? distance1 : distance2);
                    }
                } else {
                    for (int j = 0; j < size; j++)
                        wrapper[j].setDouble(metric.getDistance(pivots[i],
                                ((IndexObject) wrapper[j].getObject())));
                }

                if (Debug.debug)
                    logger.finer("Sorting the new computed distances...:");

                // sort each current cluster independently by the new distance
                for (int j = 0; j < clusterNumber; j++) {
                    if (Debug.debug)
                        logger.finer("[" + j + ": " + clusterOffset[j] + ", "
                                + clusterOffset[j + 1] + "], ");
                    Arrays.sort(wrapper, clusterOffset[j], clusterOffset[j + 1],
                            DoubleIndexObjectPair.DoubleComparator);
                }

                final int nextClusterNumber = clusterNumber * numPartitions;
                int[] nextClusterOffset = new int[nextClusterNumber + 1];
                nextClusterOffset[0] = 0;
                nextClusterOffset[nextClusterNumber] = size;
                int nextClusterCardinality = clusterCardinality / numPartitions;

                // split each current cluster into SVF sub-clusters based on the
                // distance to the current VP
                for (int j = 0; j < clusterNumber; j++) {
                    // size of current cluster (number of points)
                    final int clusterSize = clusterOffset[j + 1] - clusterOffset[j];

                    // if this cluster is empty, mark all its sub-clusters empty
                    if (clusterSize == 0) {
                        for (int k = 0; k < numPartitions; k++)
                            nextClusterOffset[j * numPartitions + k + 1] = clusterOffset[j + 1];
                        continue; // jump to next cluster
                    }

                    if (Debug.debug) {
                        logger.finer("Partitioning the " + j + "th cluster, size=" + clusterSize
                                + ", Distances: ");
                        for (int temp = clusterOffset[j]; temp < clusterOffset[j + 1]; temp++)
                            logger.finer(wrapper[temp].getDouble() + ", ");
                        logger.finer("");
                    }

                    // find the last indices of each distinct distance value in
                    // wrapper, which is already sorted
                    ArrayList<Integer> tempIndex = new ArrayList<Integer>();
                    ArrayList<Double> tempValue = new ArrayList<Double>();
                    // the distinct distance value in check, and the number of
                    // points seen so far
                    double currentDistance = wrapper[clusterOffset[j]].getDouble();
                    int sum = 0;
                    for (int k = clusterOffset[j]; k < clusterOffset[j + 1]; k++) {
                        final double nextDistance = wrapper[k].getDouble();
                        if (nextDistance != currentDistance) // next distinct value
                        {
                            tempIndex.add(sum);
                            tempValue.add(currentDistance);
                            currentDistance = nextDistance;
                        }
                        sum++;
                    }
                    // put the last distinct value into the list
                    tempIndex.add(sum);
                    tempValue.add(currentDistance);

                    final int distinctSize = tempIndex.size();
                    // offset (within the cluster, not within wrapper) of the
                    // first point carrying each distinct distance value
                    int[] firstPointWithDistinctDistance = new int[distinctSize + 1];
                    double[] distinctDistance = new double[distinctSize];
                    firstPointWithDistinctDistance[0] = 0;
                    firstPointWithDistinctDistance[distinctSize] = clusterSize;
                    distinctDistance[0] = wrapper[clusterOffset[j]].getDouble();
                    for (int k = 1; k < distinctSize; k++) {
                        firstPointWithDistinctDistance[k] = tempIndex.get(k - 1);
                        distinctDistance[k] = tempValue.get(k);
                    }

                    if (Debug.debug) {
                        logger.finer("distinct distances(" + distinctSize + "): ");
                        for (int temp = 0; temp < distinctSize; temp++)
                            logger.finer("[" + temp + ": " + distinctDistance[temp] + ", "
                                    + firstPointWithDistinctDistance[temp] + "], ");
                        logger.finer("");
                    }

                    // assign the distinctSize runs of identical distances to
                    // at most SVF sub-clusters (points with equal distance
                    // never straddle two children).
                    // number of distinct sets already assigned:
                    int startingDistinctSet = 0;
                    int k = 0; // k is the current sub-cluster to assign distinct sets to
                    // greedy loop: while more distinct sets remain than
                    // remaining sub-clusters, cut near the balanced median
                    while ((k < numPartitions - 1)
                            && (distinctSize - startingDistinctSet > numPartitions - k)) {
                        // target size for this sub-cluster, preferring balance
                        final int median = (clusterSize
                                - firstPointWithDistinctDistance[startingDistinctSet])
                                / (numPartitions - k);
                        // find the distinct set that contains the median point
                        int t = startingDistinctSet;
                        while (firstPointWithDistinctDistance[t + 1] < median
                                + firstPointWithDistinctDistance[startingDistinctSet])
                            t++;
                        // if the median falls inside the first distinct set we
                        // must keep it; otherwise pick whichever cut is closer
                        // to the median
                        if (t != startingDistinctSet)
                            t = (firstPointWithDistinctDistance[t + 1] - median
                                    - firstPointWithDistinctDistance[startingDistinctSet] >= median
                                    + firstPointWithDistinctDistance[startingDistinctSet]
                                    - firstPointWithDistinctDistance[t]) ? t - 1 : t;

                        // sets [startingDistinctSet, t] go to sub-cluster k:
                        // record offset and distance bounds for all its final
                        // descendants
                        nextClusterOffset[j * numPartitions + k + 1] = clusterOffset[j]
                                + firstPointWithDistinctDistance[t + 1];
                        final int firstChild = j * clusterCardinality + k * nextClusterCardinality;
                        for (int temp = firstChild; temp < firstChild
                                + nextClusterCardinality; temp++) {
                            lower[temp][i] = distinctDistance[startingDistinctSet];
                            upper[temp][i] = distinctDistance[t];
                        }

                        if (Debug.debug) {
                            logger.finer("computing " + k + "th sub-cluster, median=" + median
                                    + ", assigned distinct set:" + startingDistinctSet
                                    + ", last set:" + t + ", first child =" + firstChild + ", i="
                                    + i + ", j=" + j + ", k=" + k);
                            logger.finer("next cluster offset:");
                            for (int temp = 0; temp < nextClusterOffset.length; temp++)
                                logger.finer("[" + temp + ":" + nextClusterOffset[temp] + "],");
                            logger.finer("\nlower, upper:");
                            for (int temp = 0; temp < fanout; temp++)
                                logger.finer("[" + temp + ": " + lower[temp][i] + ", "
                                        + upper[temp][i] + "], ");
                            logger.finer("");
                        }

                        startingDistinctSet = t + 1;
                        k++;
                    }

                    // if we reached the last sub-cluster, it takes all the
                    // remaining sets
                    if (k == numPartitions - 1) {
                        nextClusterOffset[j * numPartitions + k + 1] = clusterOffset[j + 1];
                        final int firstChild = j * clusterCardinality + k * nextClusterCardinality;
                        for (int temp = firstChild; temp < firstChild
                                + nextClusterCardinality; temp++) {
                            lower[temp][i] = distinctDistance[startingDistinctSet];
                            upper[temp][i] = distinctDistance[distinctSize - 1];
                        }
                    }
                    // otherwise the remaining sets are not more numerous than
                    // the remaining sub-clusters: one set per sub-cluster,
                    // trailing sub-clusters stay empty
                    else {
                        if (Debug.debug) {
                            logger.finer("less distinct set:"
                                    + (distinctSize - startingDistinctSet)
                                    + ", remain sub-cluster:" + (numPartitions - k));
                        }
                        for (int t = startingDistinctSet; t < distinctSize; t++) {
                            nextClusterOffset[j * numPartitions + k + 1] = clusterOffset[j]
                                    + firstPointWithDistinctDistance[t + 1];
                            final int firstChild = j * clusterCardinality
                                    + k * nextClusterCardinality;
                            for (int temp = firstChild; temp < firstChild
                                    + nextClusterCardinality; temp++) {
                                lower[temp][i] = distinctDistance[t];
                                upper[temp][i] = distinctDistance[t];
                            }
                            k++;
                        }
                        if (k < numPartitions) // remaining sub-clusters are empty
                        {
                            for (; k < numPartitions; k++)
                                nextClusterOffset[j * numPartitions + k + 1] = clusterOffset[j + 1];
                        }
                    }
                } // end of loop for each cluster

                clusterOffset = nextClusterOffset;
                clusterCardinality = nextClusterCardinality;
                clusterNumber = nextClusterNumber;
            } // end of loop for each vantage point

            // compute non-empty cluster number
            int childrenNumber = 0;
            for (int i = 0; i < fanout; i++) {
                if (clusterOffset[i] < clusterOffset[i + 1])
                    childrenNumber++;
            }
            if (Debug.debug)
                logger.finer("final children number: " + childrenNumber + ", fanout=" + fanout);

            // if there are some empty clusters, delete them
            if (childrenNumber < fanout) {
                double[][] newLower = new double[childrenNumber][];
                double[][] newUpper = new double[childrenNumber][];
                int[] newOffset = new int[childrenNumber + 1];
                newOffset[childrenNumber] = size;
                int j = 0;
                for (int i = 0; i < fanout; i++) {
                    if (clusterOffset[i] < clusterOffset[i + 1]) {
                        newLower[j] = lower[i];
                        newUpper[j] = upper[i];
                        newOffset[j] = clusterOffset[i];
                        j++;
                    }
                }
                lower = newLower;
                upper = newUpper;
                clusterOffset = newOffset;
            }

            // assign data to subDataList
            List<List<? extends IndexObject>> subDataList =
                    new ArrayList<List<? extends IndexObject>>(childrenNumber);
            for (int i = 0; i < childrenNumber; i++) {
                ArrayList<IndexObject> subList = new ArrayList<IndexObject>(
                        clusterOffset[i + 1] - clusterOffset[i]);
                for (int j = clusterOffset[i]; j < clusterOffset[i + 1]; j++)
                    subList.add((IndexObject) wrapper[j].getObject());
                if (subList.size() == 0)
                    logger.warning("sub list :" + i + " is empty!");
                subDataList.add(subList);
            }

            // NOTE(review): the node is built with data.size(), not `size`;
            // the two differ when partitioning a sub-range — confirm intended.
            VPInternalNode predicate = new VPInternalNode(pivots, lower, upper, data.size(),
                    new long[childrenNumber], this.GHTDegree);
            PartitionResults partitionResult = new PartitionResults(subDataList, predicate);
            return partitionResult;
        }
    },

    CLUSTERINGKMEANS {

        // Mode switch, set through setMaxRadius(): 0 = plain k-means
        // clustering partition, -1 = GHT, -2 = CGHT.
        private int GHTDegree = 0;

        /**
         * Repurposed as a mode switch: the radius value is truncated to int
         * and stored as GHTDegree (-1: GHT, -2: CGHT).
         *
         * @param R the "radius"; actually the GHT degree code
         */
        public void setMaxRadius(double R) {
            this.GHTDegree = (int) R;
        }

        /**
         * One pending cluster to (possibly) partition further: its inclusive
         * [first, last] range in the data list, its per-pivot distance
         * bounds, and which pivots are still available to split on.
         *
         * @author Rui Mao
         */
        class ClusteringKMeansTask {
            private int first;       // inclusive start offset in the data list
            private int last;        // inclusive end offset in the data list
            private double[] lower;  // per-pivot lower distance bounds
            private double[] upper;  // per-pivot upper distance bounds
            private boolean[] toUse; // per-pivot flag: still available to split on

            public ClusteringKMeansTask(int first, int last, double[] lower, double[] upper,
                    boolean[] toUse) {
                this.first = first;
                this.last = last;
                this.lower = lower;
                this.upper = upper;
                this.toUse = toUse;
            }
        }

        /**
         * Partitions the whole data list.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size: clusters at most this big are
         *                      not partitioned further
         * @return the partition result
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int numPartitions, int maxLS) {
            return partition(metric, pivots, data, 0, data.size(), numPartitions, maxLS);
        }

        /**
         * Partitions {@code size} elements starting at {@code first}, logging
         * to the default "GeDBIT.index" logger.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param first         offset of the first element to partition
         * @param size          number of elements to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size
         * @return the partition result
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions,
                int maxLS) {
            return partition(metric, pivots, data, first, size, numPartitions, maxLS,
                    Logger.getLogger("GeDBIT.index"));
        }

        /**
         * Given vantage points, partitions the dataset based on its intrinsic
         * clustering (1-d k-means per pivot), or — in GHT mode — splits into
         * two halves by the sign of d1-d2.
         *
         * NOTE(review): this implementation indexes the data and distance
         * arrays from 0 and ignores {@code first}; every caller in this file
         * passes first = 0 — confirm before calling with a non-zero offset.
         *
         * @param metric the {@link Metric} to compute distance with
         * @param pivots the vantage points array
         * @param data   the source data list to split
         * @param first  offset of the first element (effectively assumed 0)
         * @param size   number of elements to partition
         * @param SVF    partition number induced by each vantage point
         * @param maxLS  max leaf size: smaller clusters are not partitioned further
         * @param logger destination for debug traces
         * @return the partition result: children data lists plus a
         *         {@link VPInternalNode} carrying per-child per-pivot
         *         lower/upper distance bounds
         * @throws IllegalArgumentException in GHT/CGHT mode when there are
         *         not exactly two pivots
         */
        PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, final int SVF,
                final int maxLS, Logger logger) {
            if (Debug.debug)
                logger.finer("clusteringPartition");

            // compute all the distances: distance[pivot][point]
            final int numPivots = pivots.length;
            double[][] distance = new double[numPivots][size];
            for (int i = 0; i < size; i++)
                for (int j = 0; j < numPivots; j++)
                    distance[j][i] = metric.getDistance(data.get(i), pivots[j]);

            // hack: in GHT/CGHT mode, transform distances to d1+d2 and d1-d2
            if (this.GHTDegree < 0) {
                // for now, must be only two pivots
                if (distance.length != 2)
                    throw new IllegalArgumentException(
                            "for CGHT partition, there should be only two pivots!");
                // transformation to d1+d2, d1-d2
                for (int i = 0; i < size; i++) {
                    distance[0][i] = distance[0][i] + distance[1][i];
                    distance[1][i] = distance[0][i] - distance[1][i] * 2;
                }
            }

            // Maintain a work list of clusters still to be partitioned. Each
            // task holds: the [first, last] offsets of the cluster, the
            // per-pivot lower/upper distance bounds, and a per-pivot boolean
            // (true = still available to split on). Finished tasks (all
            // booleans false) are appended at the end, unfinished ones at the
            // head — so when the head task is finished, everything is.
            if (this.GHTDegree != -1) {
                LinkedList<ClusteringKMeansTask> taskList =
                        new LinkedList<ClusteringKMeansTask>();
                boolean[] toUse = new boolean[numPivots];
                for (int i = 0; i < numPivots; i++) {
                    toUse[i] = true;
                }
                ClusteringKMeansTask ckmTask = new ClusteringKMeansTask(0, size - 1,
                        new double[numPivots], new double[numPivots], toUse);
                taskList.addFirst(ckmTask);

                boolean done = false;
                // the loop to partition each cluster
                while (true) {
                    ckmTask = taskList.getFirst();
                    done = true;
                    for (int i = 0; i < numPivots; i++)
                        if (ckmTask.toUse[i]) {
                            done = false;
                            break;
                        }
                    // head task finished => the whole partition is finished
                    if (done)
                        break;
                    // otherwise split the head cluster on its best remaining
                    // pivot; sub-clusters go back into the task list and the
                    // data list / distance arrays are rearranged in place
                    partitionACluster(data, distance, taskList, SVF, maxLS, logger);
                }

                // partition is done; return results in the required format
                final int childrenNumber = taskList.size();
                List<List<? extends IndexObject>> subDataList =
                        new ArrayList<List<? extends IndexObject>>(childrenNumber);
                double[][] allLower = new double[childrenNumber][numPivots];
                double[][] allUpper = new double[childrenNumber][numPivots];
                final int taskListSize = taskList.size();
                for (int i = 0; i < taskListSize; i++) {
                    ckmTask = taskList.get(i);
                    subDataList.add(data.subList(ckmTask.first, ckmTask.last + 1));
                    for (int j = 0; j < numPivots; j++) {
                        allLower[i][j] = ckmTask.lower[j];
                        allUpper[i][j] = ckmTask.upper[j];
                    }
                }
                VPInternalNode predicate = new VPInternalNode(pivots, allLower, allUpper, size,
                        new long[childrenNumber], this.GHTDegree);
                PartitionResults partitionResult = new PartitionResults(subDataList, predicate);
                return partitionResult;
            }
            // else: GHT partition — two children, split by the sign of
            // distance[1] (= d1-d2 after the transform above)
            {
                final int childrenNumber = 2;
                List<List<? extends IndexObject>> subDataList =
                        new ArrayList<List<? extends IndexObject>>(childrenNumber);
                // bounds are left at zero in GHT mode
                double[][] allLower = new double[childrenNumber][numPivots];
                double[][] allUpper = new double[childrenNumber][numPivots];

                // two-pointer in-place partition: d1-d2 <= 0 to the front,
                // d1-d2 > 0 to the back
                int head = 0, tail = size - 1;
                double temp;
                while (head <= tail) {
                    while ((head < size) && (distance[1][head] <= 0))
                        head++;
                    while ((tail >= 0) && (distance[1][tail] > 0))
                        tail--;
                    if (head <= tail) {
                        Collections.swap(data, head, tail);
                        temp = distance[1][head];
                        distance[1][head] = distance[1][tail];
                        distance[1][tail] = temp;
                    }
                }
                subDataList.add(data.subList(0, head));
                subDataList.add(data.subList(head, size));
                VPInternalNode predicate = new VPInternalNode(pivots, allLower, allUpper, size,
                        new long[childrenNumber], this.GHTDegree);
                PartitionResults partitionResult = new PartitionResults(subDataList, predicate);
                return partitionResult;
            }
        }

        /**
         * Partitions the first cluster in the task list: selects the best
         * remaining pivot (most buckets, then min variance proxy), partitions
         * on it, and puts the sub-clusters back into the task list — appended
         * at the end if they need no further partition, inserted at the head
         * otherwise. The data list and distance arrays are rearranged in
         * place so each sub-cluster's points are contiguous.
         *
         * @param data     list of the data set
         * @param distance distance values from each data point to each vantage point
         * @param taskList a {@link LinkedList} of all the clusters to be partitioned
         * @param SVF      single fanout
         * @param maxLS    max leaf size: a cluster at most this big is not
         *                 partitioned further
         * @param logger   destination for debug traces
         */
        private void partitionACluster(List<? extends IndexObject> data, double[][] distance,
                LinkedList<ClusteringKMeansTask> taskList, final int SVF, final int maxLS,
                Logger logger) {
            ClusteringKMeansTask task = taskList.removeFirst();
            final int first = task.first;
            final int last = task.last;
            boolean[] toUse = task.toUse;

            // if the current cluster fits in a leaf node, don't partition
            // further: just record the distance range for every unused pivot
            if (last - first + 1 <= maxLS) {
                double[] lower = task.lower;
                double[] upper = task.upper;
                for (int i = 0; i < toUse.length; i++)
                    if (toUse[i]) {
                        // FIX: reset min/max per pivot; the original
                        // initialized them once outside the loop, so bounds
                        // for later pivots accumulated earlier pivots' ranges
                        double min = Double.POSITIVE_INFINITY;
                        double max = Double.NEGATIVE_INFINITY;
                        for (int j = first; j <= last; j++) {
                            if (min > distance[i][j])
                                min = distance[i][j];
                            if (max < distance[i][j])
                                max = distance[i][j];
                        }
                        lower[i] = min;
                        upper[i] = max;
                        toUse[i] = false;
                    }
                taskList.addLast(task);
                return;
            }

            // the cluster does not fit in a leaf node: evaluate every unused
            // pivot and keep the one producing the most sub-clusters, ties
            // broken by the smallest sum of squared bucket sizes (a balance
            // proxy for variance: var = E(x^2) - (Ex)^2 with Ex fixed)
            int childrenNumber = 1;
            int minVP = 0;                              // pivot with the min variance
            double minVar = Double.POSITIVE_INFINITY;   // min value of variance
            double var = 0;
            double[] means = null;                      // means of k-means
            double[] split = null;                      // split values
            double[] minSplit = null;                   // split values of the best pivot
            int[] bucketSize = null;
            int[] minBucketSize = null;                 // bucket sizes of the best pivot
            double[] lower = null;
            double[] minLower = null;
            double[] upper = null;
            double[] minUpper = null;                   // per-bucket distance bounds

            for (int i = 0; i < toUse.length; i++) {
                // if the vp is already used, go to the next one
                if (!toUse[i])
                    continue;

                // seed k-means with distinct existing distance values; if
                // fewer than SVF distinct values exist, the seeds ARE the
                // final clusters and k-means is skipped
                means = bucketInitialClustering(distance[i], first, last, SVF);
                if (means.length < SVF) {
                    if (childrenNumber > means.length)
                        // some other vp already partitions the cluster into
                        // more sub-clusters; ignore the current vp
                        continue;
                } else {
                    // run k-means with means as the initial clustering
                    kMeans(distance[i], first, last, means, logger);
                }

                // split values are midpoints between consecutive sorted means
                split = new double[means.length - 1];
                Arrays.sort(means);
                for (int j = 0; j < split.length; j++)
                    split[j] = (means[j] + means[j + 1]) / 2;

                // bucket each point: left bound inclusive, right exclusive
                bucketSize = new int[split.length + 1];
                for (int j = 0; j < bucketSize.length; j++)
                    bucketSize[j] = 0;
                lower = new double[split.length + 1];
                upper = new double[split.length + 1];
                for (int j = 0; j < lower.length; j++) {
                    lower[j] = Double.POSITIVE_INFINITY;
                    upper[j] = Double.NEGATIVE_INFINITY;
                }
                for (int j = first; j <= last; j++) {
                    int k = 0;
                    while ((k < split.length) && (distance[i][j] >= split[k]))
                        k++;
                    bucketSize[k]++;
                    if (lower[k] > distance[i][j])
                        lower[k] = distance[i][j];
                    if (upper[k] < distance[i][j])
                        upper[k] = distance[i][j];
                }

                var = 0;
                for (int j = 0; j < bucketSize.length; j++)
                    var += bucketSize[j] * bucketSize[j];

                // compare with the current best: more buckets always wins;
                // equal bucket count wins on smaller variance proxy
                if ((bucketSize.length > childrenNumber)
                        || ((bucketSize.length == childrenNumber) && (minVar > var))) {
                    minVar = var;
                    minSplit = split;
                    minVP = i;
                    minBucketSize = bucketSize;
                    minLower = lower;
                    minUpper = upper;
                }
                if (bucketSize.length > childrenNumber) {
                    childrenNumber = bucketSize.length;
                }
            } // end of loop for each vp

            // if childrenNumber == 1, no pivot can split this cluster: mark
            // all pivots used and append a finished task
            if (childrenNumber == 1) {
                lower = task.lower;
                upper = task.upper;
                for (int i = 0; i < toUse.length; i++)
                    if (toUse[i]) {
                        lower[i] = distance[i][first];
                        upper[i] = distance[i][first];
                        toUse[i] = false;
                    }
                ClusteringKMeansTask newCKMTask =
                        new ClusteringKMeansTask(first, last, lower, upper, toUse);
                taskList.addLast(newCKMTask);
                return;
            }

            // bucketFirst[i] = offset (in data) of the first element of bucket i;
            // built by prefix-summing the bucket sizes, then shifting right
            int[] bucketFirst = new int[childrenNumber + 1];
            System.arraycopy(minBucketSize, 0, bucketFirst, 0, childrenNumber);
            for (int i = 1; i < childrenNumber; i++)
                bucketFirst[i] += bucketFirst[i - 1];
            System.arraycopy(bucketFirst, 0, bucketFirst, 1, childrenNumber);
            bucketFirst[0] = 0;
            for (int i = 0; i <= childrenNumber; i++)
                bucketFirst[i] += first;

            // bucketPointer[i] points to the first not-yet-placed slot of bucket i
            int[] bucketPointer = new int[childrenNumber];
            System.arraycopy(bucketFirst, 0, bucketPointer, 0, childrenNumber);

            // cycle-sort style in-place rearrangement: for each bucket, each
            // misplaced element is swapped into its correct bucket until the
            // slot holds an element belonging here
            double tempDouble;
            for (int i = 0; i < childrenNumber; i++) {
                for (int j = bucketPointer[i]; j < bucketFirst[i + 1]; j++) {
                    while (true) {
                        // compute the correct bucket id of the current point
                        int k = 0;
                        while ((k < minSplit.length) && (distance[minVP][j] >= minSplit[k]))
                            k++;
                        // already in the right bucket: go to the next point
                        if (k == i)
                            break;
                        // exchange the point object
                        Collections.swap(data, j, bucketPointer[k]);
                        // exchange the distance values to all vps
                        for (int t = 0; t < distance.length; t++) {
                            tempDouble = distance[t][j];
                            distance[t][j] = distance[t][bucketPointer[k]];
                            distance[t][bucketPointer[k]] = tempDouble;
                        }
                        bucketPointer[k]++;
                    } // end of while
                }
            }

            // create sub-cluster tasks; minVP is now consumed
            toUse[minVP] = false;
            boolean done = true;
            // check whether the sub-clusters need further partition
            for (int i = 0; i < toUse.length; i++)
                if (toUse[i]) {
                    done = false;
                    break;
                }
            if (done) // all vps used: finished tasks go to the end of the list
            {
                for (int i = 0; i < childrenNumber; i++) {
                    lower = task.lower.clone();
                    lower[minVP] = minLower[i];
                    upper = task.upper.clone();
                    upper[minVP] = minUpper[i];
                    ClusteringKMeansTask newCKMTask = new ClusteringKMeansTask(bucketFirst[i],
                            bucketFirst[i + 1] - 1, lower, upper, toUse.clone());
                    taskList.addLast(newCKMTask);
                }
            } else // further partition needed: add to the head of the list
            {
                for (int i = childrenNumber - 1; i >= 0; i--) {
                    lower = task.lower.clone();
                    lower[minVP] = minLower[i];
                    upper = task.upper.clone();
                    upper[minVP] = minUpper[i];
                    ClusteringKMeansTask newCKMTask = new ClusteringKMeansTask(bucketFirst[i],
                            bucketFirst[i + 1] - 1, lower, upper, toUse.clone());
                    taskList.addFirst(newCKMTask);
                }
            }
        }

        /**
         * Finds the initial clustering to run k-means on, using a histogram
         * of the distance values: bucket the values, find local-maximum
         * buckets, and return (up to) SVF representative values. Each item in
         * the returned array is distinct; if the returned length is less than
         * SVF, ALL distinct distance values have been returned.
         *
         * @param distance the array containing all the double values to find means on
         * @param first    offset of the first element
         * @param last     offset of the last element
         * @param SVF      number of means/distinct values to find
         * @return an array of means, each element different from the others;
         *         if shorter than SVF, it holds all distinct values
         */
        private double[] bucketInitialClustering(double[] distance, final int first,
                final int last, final int SVF) {
            // histogram resolution: ~1 bucket per 10 points, clamped to [SVF, 50*SVF]
            final int bucketNumber = Math.max(Math.min((last - first + 1) / 10, 50 * SVF), SVF);

            // min, max distance to the vp
            double min, max;
            min = Double.POSITIVE_INFINITY;
            max = Double.NEGATIVE_INFINITY;
            for (int j = first; j <= last; j++) {
                if (distance[j] > max)
                    max = distance[j];
                if (distance[j] < min)
                    min = distance[j];
            }
            // if min == max, this vp cannot partition the cluster
            if (max == min) {
                double[] result = new double[1];
                result[0] = distance[first];
                return result;
            }

            // compute the bucket sizes
            int[] bucketSize = new int[bucketNumber];
            for (int i = 0; i < bucketNumber; i++)
                bucketSize[i] = 0;
            final double bucketWidth = (max - min) / bucketNumber;
            for (int i = first; i <= last; i++) {
                int temp = (int) ((distance[i] - min) / bucketWidth);
                if (temp >= bucketNumber) // the max value rounds into the last bucket
                    temp = bucketNumber - 1;
                bucketSize[temp]++;
            }

            // find the buckets whose size is a local max.
            // NOTE(review): assumes bucketNumber >= 2 (i.e. SVF >= 2);
            // bucketNumber == 1 would index out of bounds here — confirm SVF
            // is always at least 2 at the call sites.
            boolean[] isLocalMax = new boolean[bucketNumber];
            isLocalMax[0] = bucketSize[0] >= bucketSize[1];
            isLocalMax[bucketNumber - 1] =
                    bucketSize[bucketNumber - 1] >= bucketSize[bucketNumber - 2];
            for (int i = 1; i <= bucketNumber - 2; i++)
                isLocalMax[i] = (bucketSize[i] >= bucketSize[i - 1])
                        && (bucketSize[i] >= bucketSize[i + 1]);

            // collapse each run of consecutive local-max buckets to its middle one
            int loop = 0;
            while (loop < bucketNumber) {
                if (!isLocalMax[loop])
                    loop++;
                else {
                    int lastMax = loop + 1;
                    while ((lastMax < bucketNumber) && isLocalMax[lastMax])
                        lastMax++;
                    for (int i = loop; i < lastMax; i++)
                        isLocalMax[i] = false;
                    isLocalMax[(loop + lastMax - 1) / 2] = true;
                    loop = lastMax + 1;
                }
            }

            // number of positive-size local max buckets
            int localMaxBucketNumber = 0;
            for (int i = 0; i < bucketNumber; i++)
                if (isLocalMax[i] && (bucketSize[i] > 0))
                    localMaxBucketNumber++;

            if (localMaxBucketNumber >= SVF) // enough bins: take the SVF
                                             // largest and return their midpoints
            {
                boolean[] isLargest = new boolean[bucketNumber];
                for (int i = 0; i < bucketNumber; i++)
                    isLargest[i] = false;
                for (int i = 0; i < SVF; i++) {
                    int maxSize = 0;
                    int maxId = 0;
                    for (int j = 0; j < bucketNumber; j++) {
                        if (isLocalMax[j] && !isLargest[j] && (bucketSize[j] > maxSize)) {
                            maxSize = bucketSize[j];
                            maxId = j;
                        }
                    }
                    isLargest[maxId] = true;
                }
                double[] result = new double[SVF];
                int counter = 0;
                for (int i = 0; i < bucketNumber; i++) {
                    if (isLargest[i]) {
                        // bucket midpoint (not necessarily an existing value)
                        result[counter] = min + (i + 0.5) * bucketWidth;
                        counter++;
                    }
                }
                return result;
            } else // not enough local max buckets: take one real value from
                   // each, then pad with other distinct values
            {
                double[] result = new double[SVF];
                int counter = 0;
                // for each local max bucket, find a value in it
                for (int i = first; i <= last; i++) {
                    int temp = (int) ((distance[i] - min) / bucketWidth);
                    if (temp >= bucketNumber)
                        temp = bucketNumber - 1;
                    if (isLocalMax[temp]) {
                        result[counter] = distance[i];
                        isLocalMax[temp] = false;
                        counter++;
                        if (counter >= localMaxBucketNumber)
                            break;
                    }
                }
                // pad with further distinct values until SVF (or exhaustion)
                for (int i = first; i <= last; i++) {
                    boolean isDistinct = true;
                    for (int j = 0; j < counter; j++)
                        if (distance[i] == result[j]) {
                            isDistinct = false;
                            break;
                        }
                    if (isDistinct) {
                        result[counter] = distance[i];
                        counter++;
                        if (counter >= SVF)
                            break;
                    }
                }
                if (counter < SVF) // not enough distinct values: shrink the array
                {
                    double[] finalResult = new double[counter];
                    System.arraycopy(result, 0, finalResult, 0, counter);
                    return finalResult;
                } else
                    return result;
            }
        }

        /**
         * Runs 1-d k-means given the initial clustering; the results are
         * stored back into the {@code means} argument. Iterates until the
         * relative change of the sum of means drops below 10% (at least two
         * iterations are always run).
         *
         * NOTE(review): if a cluster becomes empty during an iteration,
         * {@code clusterSum[i] / clusterSize[i]} is 0/0 = NaN and the
         * corresponding mean degenerates — confirm the seeds from
         * bucketInitialClustering prevent this in practice.
         *
         * @param distance the array containing all the double values to run on
         * @param first    offset of the first element
         * @param last     offset of the last element
         * @param means    initial means on input; final means on output
         * @param logger   destination for debug traces
         */
        private void kMeans(double[] distance, final int first, final int last, double[] means,
                Logger logger) {
            final double stop = 0.1; // relative-change convergence threshold
            final int size = last - first + 1;
            final int clusterNumber = means.length;
            short[] clusterId = new short[size];
            double[] split = new double[clusterNumber - 1];
            double sum = 0, newSum = 0;
            double[] clusterSum = new double[clusterNumber];
            int[] clusterSize = new int[clusterNumber];
            int counter = 0;
            while ((counter < 2) || Math.abs(newSum - sum) / sum > stop) {
                sum = newSum;
                // assignment step: split points at midpoints of sorted means
                Arrays.sort(means);
                for (int j = 0; j < split.length; j++)
                    split[j] = (means[j] + means[j + 1]) / 2;
                for (int i = 0; i < size; i++) {
                    clusterId[i] = 0;
                    while ((clusterId[i] < split.length)
                            && (distance[first + i] >= split[clusterId[i]]))
                        clusterId[i]++;
                }
                // update step: recompute each cluster's mean and the sum of means
                for (int i = 0; i < clusterNumber; i++) {
                    clusterSum[i] = 0;
                    clusterSize[i] = 0;
                }
                for (int i = 0; i < size; i++) {
                    clusterSum[clusterId[i]] += distance[first + i];
                    clusterSize[clusterId[i]]++;
                }
                newSum = 0;
                for (int i = 0; i < clusterNumber; i++) {
                    means[i] = clusterSum[i] / clusterSize[i];
                    newSum += means[i];
                }
                counter++;
                if ((counter > 100) && (counter % 100 == 0))
                    logger.warning("counter= " + counter + ", too large!");
                if (Debug.debug)
                    logger.finer("counter= " + counter + ", sum= " + sum + ", new sum= "
                            + newSum);
            }
        }
    },

    EXCLUDEDMIDDLE {

        // exclusion radius forwarded to PivotWisePartition
        double maxR = 0;

        /**
         * Stores the exclusion radius used by the delegated partitioner.
         *
         * @param R the max radius
         */
        public void setMaxRadius(double R) {
            this.maxR = R;
        }

        /**
         * Partitions the whole data list.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size
         * @return the partition result
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int numPartitions, int maxLS) {
            return partition(metric, pivots, data, 0, data.size(), numPartitions, maxLS);
        }

        /**
         * Delegates to {@link PivotWisePartition} configured with the stored
         * max radius.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param first         offset of the first element to partition
         * @param size          number of elements to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size
         * @return the partition result
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions,
                int maxLS) {
            PivotWisePartition pm = new PivotWisePartition();
            pm.setMaxRadius(maxR);
            return pm.partition(metric, pivots, data, first, size, numPartitions, maxLS);
        }
    },

    CGHT {

        /** No-op: CGHT is a placeholder constant. */
        public void setMaxRadius(double R) {
        }

        /**
         * Not implemented: always returns null.
         *
         * @return null
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int numPartitions, int maxLS) {
            return null;
        }

        /**
         * Not implemented: always returns null.
         *
         * @return null
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions,
                int maxLS) {
            return null;
        }
    },

    GHT {

        /** No-op: GHT is a placeholder constant. */
        public void setMaxRadius(double R) {
        }

        /**
         * Not implemented: always returns null.
         *
         * @return null
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int numPartitions, int maxLS) {
            return null;
        }

        /**
         * Not implemented: always returns null.
         *
         * @return null
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions,
                int maxLS) {
            return null;
        }
    },

    CGHTBALANCED {

        /** No-op: CGHTBALANCED is a placeholder constant. */
        public void setMaxRadius(double R) {
        }

        /**
         * Not implemented: always returns null.
         *
         * @return null
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int numPartitions, int maxLS) {
            return null;
        }

        /**
         * Not implemented: always returns null.
         *
         * @return null
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions,
                int maxLS) {
            return null;
        }
    };

    /*
     * Constants used to hold input parameters.
     * Added by Kewei Ma.
     */
    public static String pm;       // selected partition method name
    public static double r;        // radius / GHT-degree input parameter
    public static int countRN = 0; // range-query counter
}