package GeDBIT.index.algorithms;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Logger;

import GeDBIT.dist.Metric;
import GeDBIT.index.VPInternalNode;
import GeDBIT.util.Debug;
import GeDBIT.type.IndexObject;
import GeDBIT.type.DoubleIndexObjectPair;

/**
 * All the built-in data partition methods.
 *
 * <ul>
 * <li>{@code BALANCED}: all the partitions have similar sizes. Fast, but
 * performs worse.</li>
 * <li>{@code CLUSTERINGKMEANS}: partition the data according to the intrinsic
 * clustering, using k-means for each dimension.</li>
 * <li>{@code CLUSTERINGBOUNDARY}: partition according to the intrinsic
 * clustering, using a boundary-oriented algorithm for each dimension.</li>
 * </ul>
 *
 * Several constants interpret {@link #setMaxRadius(double)} as a mode switch
 * (GHTDegree): -1 selects GHT behavior, -2 selects CGHT behavior, in which
 * case exactly two pivots are required and per-point distances are transformed
 * to (d1 + d2, d1 - d2).
 */
public enum PartitionMethods implements PartitionMethod {

    BALANCED { // by Honglong Xu

        // Mode switch, set through setMaxRadius(): 0 = plain balanced
        // partition, -1 = GHT, -2 = CGHT (see class javadoc).
        private int GHTDegree = 0;

        /**
         * Repurposed here as a mode switch: the radius value is truncated to
         * int and stored as GHTDegree (-1: GHT, -2: CGHT).
         *
         * @param R the "radius"; actually the GHT degree code
         */
        public void setMaxRadius(double R) { // by Honglong Xu
            this.GHTDegree = (int) R;
        }

        /**
         * Partitions the whole data list.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size (unused by this method variant)
         * @return the partition result (children lists plus internal-node predicate)
         */
        public PartitionResults partition(final Metric metric, final IndexObject[] pivots,
                List<? extends IndexObject> data, final int numPartitions, int maxLS) {
            return partition(metric, pivots, data, 0, data.size(), numPartitions, maxLS);
        }

        /**
         * Partitions {@code size} elements of the data list starting at
         * {@code first}, logging to the default "GeDBIT.index" logger.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param first         offset of the first element to partition
         * @param size          number of elements to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size (unused by this method variant)
         * @return the partition result
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions, int maxLS) {
            return partition(metric, pivots, data, first, size, numPartitions, maxLS,
                    Logger.getLogger("GeDBIT.index"));
        }

        /**
         * Core balanced partition: for each pivot in turn, sorts every current
         * cluster by distance to that pivot and splits it into
         * {@code numPartitions} sub-clusters of near-equal size, never
         * splitting a run of identical distance values across two children.
         * The final fanout is {@code numPartitions ^ pivots.length}; empty
         * children are removed at the end.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param first         offset of the first element to partition
         * @param size          number of elements to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size (unused here)
         * @param logger        destination for debug traces
         * @return the partition result
         * @throws IllegalArgumentException in CGHT mode when there are not
         *         exactly two pivots
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions,
                int maxLS, Logger logger) {
            final int numPivots = pivots.length;
            final int fanout = (int) Math.pow(numPartitions, numPivots);
            if (Debug.debug)
                logger.finer("Start of splitData(), data size= " + size + ", VPNumber= "
                        + numPivots + ", fanout= " + fanout);

            // the lower and upper bound of distances from each child to each
            // vantage point
            double[][] lower = new double[fanout][numPivots];
            double[][] upper = new double[fanout][numPivots];

            // Pair each element with a mutable distance slot so sorting moves
            // the element and its current distance together.
            // FIX: the original looped from `first` to `size` and fetched
            // data.get(i), leaving wrapper[0..first-1] null and reading the
            // wrong elements whenever first > 0. Indices into `wrapper` are
            // always 0-based relative to `first`.
            DoubleIndexObjectPair[] wrapper = new DoubleIndexObjectPair[size];
            for (int i = 0; i < size; i++)
                wrapper[i] = new DoubleIndexObjectPair(0, data.get(first + i));

            // split data.
            int clusterNumber = 1; // total cluster number when partitioning based
                                   // on each vp, SVF ^ i
            int clusterCardinality = fanout; // number of final clusters in each
                                             // of the current clusters
            // offset of the first point in current cluster in wrapper, the
            // whole data list.
            // this array has an additional element at the end of value size,
            // serving as a loop guard; the first element is always 0.
            int[] clusterOffset = new int[2];
            clusterOffset[0] = 0;
            clusterOffset[1] = size;

            if (this.GHTDegree < 0) {
                // for now, GHT/CGHT modes require exactly two pivots
                if (numPivots != 2) {
                    throw new IllegalArgumentException(
                            "for CGHT partition, there should be only two pivots!");
                }
            }

            for (int i = 0; i < numPivots; i++) {
                if (Debug.debug)
                    logger.finer("\nStart spliting vp:" + i + ", cluster number:" + clusterNumber
                            + ", clusterCardinality =" + clusterCardinality
                            + ", computing distances to the vp...");

                // compute distance to the current VP.
                // hack: in CGHT mode, transform distances to d1+d2 (first
                // pass) and d1-d2 (second pass). // by Honglong Xu
                if (this.GHTDegree == -2) // cght
                {
                    double distance1, distance2;
                    for (int j = 0; j < size; j++) {
                        distance1 = metric.getDistance(pivots[0],
                                ((IndexObject) wrapper[j].getObject()));
                        distance2 = metric.getDistance(pivots[1],
                                ((IndexObject) wrapper[j].getObject()));
                        // distance1 becomes d1+d2; distance2 becomes d1-d2
                        distance1 = distance1 + distance2;
                        distance2 = distance1 - distance2 * 2;
                        wrapper[j].setDouble(i == 0 ? distance1 : distance2);
                    }
                } else {
                    for (int j = 0; j < size; j++)
                        wrapper[j].setDouble(metric.getDistance(pivots[i],
                                ((IndexObject) wrapper[j].getObject())));
                }

                if (Debug.debug)
                    logger.finer("Sorting the new computed distances...:");

                // sort each current cluster independently by the new distance
                for (int j = 0; j < clusterNumber; j++) {
                    if (Debug.debug)
                        logger.finer("[" + j + ": " + clusterOffset[j] + ", "
                                + clusterOffset[j + 1] + "], ");
                    Arrays.sort(wrapper, clusterOffset[j], clusterOffset[j + 1],
                            DoubleIndexObjectPair.DoubleComparator);
                }

                final int nextClusterNumber = clusterNumber * numPartitions;
                int[] nextClusterOffset = new int[nextClusterNumber + 1];
                nextClusterOffset[0] = 0;
                nextClusterOffset[nextClusterNumber] = size;
                int nextClusterCardinality = clusterCardinality / numPartitions;

                // split each current cluster into SVF sub-clusters based on the
                // distance to the current VP
                for (int j = 0; j < clusterNumber; j++) {
                    // size of current cluster (number of points)
                    final int clusterSize = clusterOffset[j + 1] - clusterOffset[j];

                    // if this cluster is empty, mark all its sub-clusters empty
                    if (clusterSize == 0) {
                        for (int k = 0; k < numPartitions; k++)
                            nextClusterOffset[j * numPartitions + k + 1] = clusterOffset[j + 1];
                        continue; // jump to next cluster
                    }

                    if (Debug.debug) {
                        logger.finer("Partitioning the " + j + "th cluster, size=" + clusterSize
                                + ", Distances: ");
                        for (int temp = clusterOffset[j]; temp < clusterOffset[j + 1]; temp++)
                            logger.finer(wrapper[temp].getDouble() + ", ");
                        logger.finer("");
                    }

                    // find the last indices of each distinct distance value in
                    // wrapper, which is already sorted
                    ArrayList<Integer> tempIndex = new ArrayList<Integer>();
                    ArrayList<Double> tempValue = new ArrayList<Double>();
                    // the distinct distance value in check, and the number of
                    // points seen so far
                    double currentDistance = wrapper[clusterOffset[j]].getDouble();
                    int sum = 0;
                    for (int k = clusterOffset[j]; k < clusterOffset[j + 1]; k++) {
                        final double nextDistance = wrapper[k].getDouble();
                        if (nextDistance != currentDistance) // next distinct value
                        {
                            tempIndex.add(sum);
                            tempValue.add(currentDistance);
                            currentDistance = nextDistance;
                        }
                        sum++;
                    }
                    // put the last distinct value into the list
                    tempIndex.add(sum);
                    tempValue.add(currentDistance);

                    final int distinctSize = tempIndex.size();
                    // offset (within the cluster, not within wrapper) of the
                    // first point carrying each distinct distance value
                    int[] firstPointWithDistinctDistance = new int[distinctSize + 1];
                    double[] distinctDistance = new double[distinctSize];
                    firstPointWithDistinctDistance[0] = 0;
                    firstPointWithDistinctDistance[distinctSize] = clusterSize;
                    distinctDistance[0] = wrapper[clusterOffset[j]].getDouble();
                    for (int k = 1; k < distinctSize; k++) {
                        firstPointWithDistinctDistance[k] = tempIndex.get(k - 1);
                        distinctDistance[k] = tempValue.get(k);
                    }

                    if (Debug.debug) {
                        logger.finer("distinct distances(" + distinctSize + "): ");
                        for (int temp = 0; temp < distinctSize; temp++)
                            logger.finer("[" + temp + ": " + distinctDistance[temp] + ", "
                                    + firstPointWithDistinctDistance[temp] + "], ");
                        logger.finer("");
                    }

                    // assign the distinctSize runs of identical distances to
                    // at most SVF sub-clusters (points with equal distance
                    // never straddle two children).
                    // number of distinct sets already assigned:
                    int startingDistinctSet = 0;
                    int k = 0; // k is the current sub-cluster to assign distinct sets to
                    // greedy loop: while more distinct sets remain than
                    // remaining sub-clusters, cut near the balanced median
                    while ((k < numPartitions - 1)
                            && (distinctSize - startingDistinctSet > numPartitions - k)) {
                        // target size for this sub-cluster, preferring balance
                        final int median = (clusterSize
                                - firstPointWithDistinctDistance[startingDistinctSet])
                                / (numPartitions - k);
                        // find the distinct set that contains the median point
                        int t = startingDistinctSet;
                        while (firstPointWithDistinctDistance[t + 1] < median
                                + firstPointWithDistinctDistance[startingDistinctSet])
                            t++;
                        // if the median falls inside the first distinct set we
                        // must keep it; otherwise pick whichever cut is closer
                        // to the median
                        if (t != startingDistinctSet)
                            t = (firstPointWithDistinctDistance[t + 1] - median
                                    - firstPointWithDistinctDistance[startingDistinctSet] >= median
                                    + firstPointWithDistinctDistance[startingDistinctSet]
                                    - firstPointWithDistinctDistance[t]) ? t - 1 : t;

                        // sets [startingDistinctSet, t] go to sub-cluster k:
                        // record offset and distance bounds for all its final
                        // descendants
                        nextClusterOffset[j * numPartitions + k + 1] = clusterOffset[j]
                                + firstPointWithDistinctDistance[t + 1];
                        final int firstChild = j * clusterCardinality + k * nextClusterCardinality;
                        for (int temp = firstChild; temp < firstChild
                                + nextClusterCardinality; temp++) {
                            lower[temp][i] = distinctDistance[startingDistinctSet];
                            upper[temp][i] = distinctDistance[t];
                        }

                        if (Debug.debug) {
                            logger.finer("computing " + k + "th sub-cluster, median=" + median
                                    + ", assigned distinct set:" + startingDistinctSet
                                    + ", last set:" + t + ", first child =" + firstChild + ", i="
                                    + i + ", j=" + j + ", k=" + k);
                            logger.finer("next cluster offset:");
                            for (int temp = 0; temp < nextClusterOffset.length; temp++)
                                logger.finer("[" + temp + ":" + nextClusterOffset[temp] + "],");
                            logger.finer("\nlower, upper:");
                            for (int temp = 0; temp < fanout; temp++)
                                logger.finer("[" + temp + ": " + lower[temp][i] + ", "
                                        + upper[temp][i] + "], ");
                            logger.finer("");
                        }

                        startingDistinctSet = t + 1;
                        k++;
                    }

                    // if we reached the last sub-cluster, it takes all the
                    // remaining sets
                    if (k == numPartitions - 1) {
                        nextClusterOffset[j * numPartitions + k + 1] = clusterOffset[j + 1];
                        final int firstChild = j * clusterCardinality + k * nextClusterCardinality;
                        for (int temp = firstChild; temp < firstChild
                                + nextClusterCardinality; temp++) {
                            lower[temp][i] = distinctDistance[startingDistinctSet];
                            upper[temp][i] = distinctDistance[distinctSize - 1];
                        }
                    }
                    // otherwise the remaining sets are not more numerous than
                    // the remaining sub-clusters: one set per sub-cluster,
                    // trailing sub-clusters stay empty
                    else {
                        if (Debug.debug) {
                            logger.finer("less distinct set:"
                                    + (distinctSize - startingDistinctSet)
                                    + ", remain sub-cluster:" + (numPartitions - k));
                        }
                        for (int t = startingDistinctSet; t < distinctSize; t++) {
                            nextClusterOffset[j * numPartitions + k + 1] = clusterOffset[j]
                                    + firstPointWithDistinctDistance[t + 1];
                            final int firstChild = j * clusterCardinality
                                    + k * nextClusterCardinality;
                            for (int temp = firstChild; temp < firstChild
                                    + nextClusterCardinality; temp++) {
                                lower[temp][i] = distinctDistance[t];
                                upper[temp][i] = distinctDistance[t];
                            }
                            k++;
                        }
                        if (k < numPartitions) // remaining sub-clusters are empty
                        {
                            for (; k < numPartitions; k++)
                                nextClusterOffset[j * numPartitions + k + 1] = clusterOffset[j + 1];
                        }
                    }
                } // end of loop for each cluster

                clusterOffset = nextClusterOffset;
                clusterCardinality = nextClusterCardinality;
                clusterNumber = nextClusterNumber;
            } // end of loop for each vantage point

            // compute non-empty cluster number
            int childrenNumber = 0;
            for (int i = 0; i < fanout; i++) {
                if (clusterOffset[i] < clusterOffset[i + 1])
                    childrenNumber++;
            }
            if (Debug.debug)
                logger.finer("final children number: " + childrenNumber + ", fanout=" + fanout);

            // if there are some empty clusters, delete them
            if (childrenNumber < fanout) {
                double[][] newLower = new double[childrenNumber][];
                double[][] newUpper = new double[childrenNumber][];
                int[] newOffset = new int[childrenNumber + 1];
                newOffset[childrenNumber] = size;
                int j = 0;
                for (int i = 0; i < fanout; i++) {
                    if (clusterOffset[i] < clusterOffset[i + 1]) {
                        newLower[j] = lower[i];
                        newUpper[j] = upper[i];
                        newOffset[j] = clusterOffset[i];
                        j++;
                    }
                }
                lower = newLower;
                upper = newUpper;
                clusterOffset = newOffset;
            }

            // assign data to subDataList
            List<List<? extends IndexObject>> subDataList =
                    new ArrayList<List<? extends IndexObject>>(childrenNumber);
            for (int i = 0; i < childrenNumber; i++) {
                ArrayList<IndexObject> subList = new ArrayList<IndexObject>(
                        clusterOffset[i + 1] - clusterOffset[i]);
                for (int j = clusterOffset[i]; j < clusterOffset[i + 1]; j++)
                    subList.add((IndexObject) wrapper[j].getObject());
                if (subList.size() == 0)
                    logger.warning("sub list :" + i + " is empty!");
                subDataList.add(subList);
            }

            // NOTE(review): the node is built with data.size(), not `size`;
            // the two differ when partitioning a sub-range — confirm intended.
            VPInternalNode predicate = new VPInternalNode(pivots, lower, upper, data.size(),
                    new long[childrenNumber], this.GHTDegree);
            PartitionResults partitionResult = new PartitionResults(subDataList, predicate);
            return partitionResult;
        }
    },

    CLUSTERINGKMEANS {

        // Mode switch, set through setMaxRadius(): 0 = plain k-means
        // clustering partition, -1 = GHT, -2 = CGHT.
        private int GHTDegree = 0;

        /**
         * Repurposed as a mode switch: the radius value is truncated to int
         * and stored as GHTDegree (-1: GHT, -2: CGHT).
         *
         * @param R the "radius"; actually the GHT degree code
         */
        public void setMaxRadius(double R) {
            this.GHTDegree = (int) R;
        }

        /**
         * One pending cluster to (possibly) partition further: its inclusive
         * [first, last] range in the data list, its per-pivot distance
         * bounds, and which pivots are still available to split on.
         *
         * @author Rui Mao
         */
        class ClusteringKMeansTask {
            private int first;       // inclusive start offset in the data list
            private int last;        // inclusive end offset in the data list
            private double[] lower;  // per-pivot lower distance bounds
            private double[] upper;  // per-pivot upper distance bounds
            private boolean[] toUse; // per-pivot flag: still available to split on

            public ClusteringKMeansTask(int first, int last, double[] lower, double[] upper,
                    boolean[] toUse) {
                this.first = first;
                this.last = last;
                this.lower = lower;
                this.upper = upper;
                this.toUse = toUse;
            }
        }

        /**
         * Partitions the whole data list.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size: clusters at most this big are
         *                      not partitioned further
         * @return the partition result
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int numPartitions, int maxLS) {
            return partition(metric, pivots, data, 0, data.size(), numPartitions, maxLS);
        }

        /**
         * Partitions {@code size} elements starting at {@code first}, logging
         * to the default "GeDBIT.index" logger.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param first         offset of the first element to partition
         * @param size          number of elements to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size
         * @return the partition result
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions,
                int maxLS) {
            return partition(metric, pivots, data, first, size, numPartitions, maxLS,
                    Logger.getLogger("GeDBIT.index"));
        }

        /**
         * Given vantage points, partitions the dataset based on its intrinsic
         * clustering (1-d k-means per pivot), or — in GHT mode — splits into
         * two halves by the sign of d1-d2.
         *
         * NOTE(review): this implementation indexes the data and distance
         * arrays from 0 and ignores {@code first}; every caller in this file
         * passes first = 0 — confirm before calling with a non-zero offset.
         *
         * @param metric the {@link Metric} to compute distance with
         * @param pivots the vantage points array
         * @param data   the source data list to split
         * @param first  offset of the first element (effectively assumed 0)
         * @param size   number of elements to partition
         * @param SVF    partition number induced by each vantage point
         * @param maxLS  max leaf size: smaller clusters are not partitioned further
         * @param logger destination for debug traces
         * @return the partition result: children data lists plus a
         *         {@link VPInternalNode} carrying per-child per-pivot
         *         lower/upper distance bounds
         * @throws IllegalArgumentException in GHT/CGHT mode when there are
         *         not exactly two pivots
         */
        PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, final int SVF,
                final int maxLS, Logger logger) {
            if (Debug.debug)
                logger.finer("clusteringPartition");

            // compute all the distances: distance[pivot][point]
            final int numPivots = pivots.length;
            double[][] distance = new double[numPivots][size];
            for (int i = 0; i < size; i++)
                for (int j = 0; j < numPivots; j++)
                    distance[j][i] = metric.getDistance(data.get(i), pivots[j]);

            // hack: in GHT/CGHT mode, transform distances to d1+d2 and d1-d2
            if (this.GHTDegree < 0) {
                // for now, must be only two pivots
                if (distance.length != 2)
                    throw new IllegalArgumentException(
                            "for CGHT partition, there should be only two pivots!");
                // transformation to d1+d2, d1-d2
                for (int i = 0; i < size; i++) {
                    distance[0][i] = distance[0][i] + distance[1][i];
                    distance[1][i] = distance[0][i] - distance[1][i] * 2;
                }
            }

            // Maintain a work list of clusters still to be partitioned. Each
            // task holds: the [first, last] offsets of the cluster, the
            // per-pivot lower/upper distance bounds, and a per-pivot boolean
            // (true = still available to split on). Finished tasks (all
            // booleans false) are appended at the end, unfinished ones at the
            // head — so when the head task is finished, everything is.
            if (this.GHTDegree != -1) {
                LinkedList<ClusteringKMeansTask> taskList =
                        new LinkedList<ClusteringKMeansTask>();
                boolean[] toUse = new boolean[numPivots];
                for (int i = 0; i < numPivots; i++) {
                    toUse[i] = true;
                }
                ClusteringKMeansTask ckmTask = new ClusteringKMeansTask(0, size - 1,
                        new double[numPivots], new double[numPivots], toUse);
                taskList.addFirst(ckmTask);

                boolean done = false;
                // the loop to partition each cluster
                while (true) {
                    ckmTask = taskList.getFirst();
                    done = true;
                    for (int i = 0; i < numPivots; i++)
                        if (ckmTask.toUse[i]) {
                            done = false;
                            break;
                        }
                    // head task finished => the whole partition is finished
                    if (done)
                        break;
                    // otherwise split the head cluster on its best remaining
                    // pivot; sub-clusters go back into the task list and the
                    // data list / distance arrays are rearranged in place
                    partitionACluster(data, distance, taskList, SVF, maxLS, logger);
                }

                // partition is done; return results in the required format
                final int childrenNumber = taskList.size();
                List<List<? extends IndexObject>> subDataList =
                        new ArrayList<List<? extends IndexObject>>(childrenNumber);
                double[][] allLower = new double[childrenNumber][numPivots];
                double[][] allUpper = new double[childrenNumber][numPivots];
                final int taskListSize = taskList.size();
                for (int i = 0; i < taskListSize; i++) {
                    ckmTask = taskList.get(i);
                    subDataList.add(data.subList(ckmTask.first, ckmTask.last + 1));
                    for (int j = 0; j < numPivots; j++) {
                        allLower[i][j] = ckmTask.lower[j];
                        allUpper[i][j] = ckmTask.upper[j];
                    }
                }
                VPInternalNode predicate = new VPInternalNode(pivots, allLower, allUpper, size,
                        new long[childrenNumber], this.GHTDegree);
                PartitionResults partitionResult = new PartitionResults(subDataList, predicate);
                return partitionResult;
            }
            // else: GHT partition — two children, split by the sign of
            // distance[1] (= d1-d2 after the transform above)
            {
                final int childrenNumber = 2;
                List<List<? extends IndexObject>> subDataList =
                        new ArrayList<List<? extends IndexObject>>(childrenNumber);
                // bounds are left at zero in GHT mode
                double[][] allLower = new double[childrenNumber][numPivots];
                double[][] allUpper = new double[childrenNumber][numPivots];

                // two-pointer in-place partition: d1-d2 <= 0 to the front,
                // d1-d2 > 0 to the back
                int head = 0, tail = size - 1;
                double temp;
                while (head <= tail) {
                    while ((head < size) && (distance[1][head] <= 0))
                        head++;
                    while ((tail >= 0) && (distance[1][tail] > 0))
                        tail--;
                    if (head <= tail) {
                        Collections.swap(data, head, tail);
                        temp = distance[1][head];
                        distance[1][head] = distance[1][tail];
                        distance[1][tail] = temp;
                    }
                }
                subDataList.add(data.subList(0, head));
                subDataList.add(data.subList(head, size));
                VPInternalNode predicate = new VPInternalNode(pivots, allLower, allUpper, size,
                        new long[childrenNumber], this.GHTDegree);
                PartitionResults partitionResult = new PartitionResults(subDataList, predicate);
                return partitionResult;
            }
        }

        /**
         * Partitions the first cluster in the task list: selects the best
         * remaining pivot (most buckets, then min variance proxy), partitions
         * on it, and puts the sub-clusters back into the task list — appended
         * at the end if they need no further partition, inserted at the head
         * otherwise. The data list and distance arrays are rearranged in
         * place so each sub-cluster's points are contiguous.
         *
         * @param data     list of the data set
         * @param distance distance values from each data point to each vantage point
         * @param taskList a {@link LinkedList} of all the clusters to be partitioned
         * @param SVF      single fanout
         * @param maxLS    max leaf size: a cluster at most this big is not
         *                 partitioned further
         * @param logger   destination for debug traces
         */
        private void partitionACluster(List<? extends IndexObject> data, double[][] distance,
                LinkedList<ClusteringKMeansTask> taskList, final int SVF, final int maxLS,
                Logger logger) {
            ClusteringKMeansTask task = taskList.removeFirst();
            final int first = task.first;
            final int last = task.last;
            boolean[] toUse = task.toUse;

            // if the current cluster fits in a leaf node, don't partition
            // further: just record the distance range for every unused pivot
            if (last - first + 1 <= maxLS) {
                double[] lower = task.lower;
                double[] upper = task.upper;
                for (int i = 0; i < toUse.length; i++)
                    if (toUse[i]) {
                        // FIX: reset min/max per pivot; the original
                        // initialized them once outside the loop, so bounds
                        // for later pivots accumulated earlier pivots' ranges
                        double min = Double.POSITIVE_INFINITY;
                        double max = Double.NEGATIVE_INFINITY;
                        for (int j = first; j <= last; j++) {
                            if (min > distance[i][j])
                                min = distance[i][j];
                            if (max < distance[i][j])
                                max = distance[i][j];
                        }
                        lower[i] = min;
                        upper[i] = max;
                        toUse[i] = false;
                    }
                taskList.addLast(task);
                return;
            }

            // the cluster does not fit in a leaf node: evaluate every unused
            // pivot and keep the one producing the most sub-clusters, ties
            // broken by the smallest sum of squared bucket sizes (a balance
            // proxy for variance: var = E(x^2) - (Ex)^2 with Ex fixed)
            int childrenNumber = 1;
            int minVP = 0;                              // pivot with the min variance
            double minVar = Double.POSITIVE_INFINITY;   // min value of variance
            double var = 0;
            double[] means = null;                      // means of k-means
            double[] split = null;                      // split values
            double[] minSplit = null;                   // split values of the best pivot
            int[] bucketSize = null;
            int[] minBucketSize = null;                 // bucket sizes of the best pivot
            double[] lower = null;
            double[] minLower = null;
            double[] upper = null;
            double[] minUpper = null;                   // per-bucket distance bounds

            for (int i = 0; i < toUse.length; i++) {
                // if the vp is already used, go to the next one
                if (!toUse[i])
                    continue;

                // seed k-means with distinct existing distance values; if
                // fewer than SVF distinct values exist, the seeds ARE the
                // final clusters and k-means is skipped
                means = bucketInitialClustering(distance[i], first, last, SVF);
                if (means.length < SVF) {
                    if (childrenNumber > means.length)
                        // some other vp already partitions the cluster into
                        // more sub-clusters; ignore the current vp
                        continue;
                } else {
                    // run k-means with means as the initial clustering
                    kMeans(distance[i], first, last, means, logger);
                }

                // split values are midpoints between consecutive sorted means
                split = new double[means.length - 1];
                Arrays.sort(means);
                for (int j = 0; j < split.length; j++)
                    split[j] = (means[j] + means[j + 1]) / 2;

                // bucket each point: left bound inclusive, right exclusive
                bucketSize = new int[split.length + 1];
                for (int j = 0; j < bucketSize.length; j++)
                    bucketSize[j] = 0;
                lower = new double[split.length + 1];
                upper = new double[split.length + 1];
                for (int j = 0; j < lower.length; j++) {
                    lower[j] = Double.POSITIVE_INFINITY;
                    upper[j] = Double.NEGATIVE_INFINITY;
                }
                for (int j = first; j <= last; j++) {
                    int k = 0;
                    while ((k < split.length) && (distance[i][j] >= split[k]))
                        k++;
                    bucketSize[k]++;
                    if (lower[k] > distance[i][j])
                        lower[k] = distance[i][j];
                    if (upper[k] < distance[i][j])
                        upper[k] = distance[i][j];
                }

                var = 0;
                for (int j = 0; j < bucketSize.length; j++)
                    var += bucketSize[j] * bucketSize[j];

                // compare with the current best: more buckets always wins;
                // equal bucket count wins on smaller variance proxy
                if ((bucketSize.length > childrenNumber)
                        || ((bucketSize.length == childrenNumber) && (minVar > var))) {
                    minVar = var;
                    minSplit = split;
                    minVP = i;
                    minBucketSize = bucketSize;
                    minLower = lower;
                    minUpper = upper;
                }
                if (bucketSize.length > childrenNumber) {
                    childrenNumber = bucketSize.length;
                }
            } // end of loop for each vp

            // if childrenNumber == 1, no pivot can split this cluster: mark
            // all pivots used and append a finished task
            if (childrenNumber == 1) {
                lower = task.lower;
                upper = task.upper;
                for (int i = 0; i < toUse.length; i++)
                    if (toUse[i]) {
                        lower[i] = distance[i][first];
                        upper[i] = distance[i][first];
                        toUse[i] = false;
                    }
                ClusteringKMeansTask newCKMTask =
                        new ClusteringKMeansTask(first, last, lower, upper, toUse);
                taskList.addLast(newCKMTask);
                return;
            }

            // bucketFirst[i] = offset (in data) of the first element of bucket i;
            // built by prefix-summing the bucket sizes, then shifting right
            int[] bucketFirst = new int[childrenNumber + 1];
            System.arraycopy(minBucketSize, 0, bucketFirst, 0, childrenNumber);
            for (int i = 1; i < childrenNumber; i++)
                bucketFirst[i] += bucketFirst[i - 1];
            System.arraycopy(bucketFirst, 0, bucketFirst, 1, childrenNumber);
            bucketFirst[0] = 0;
            for (int i = 0; i <= childrenNumber; i++)
                bucketFirst[i] += first;

            // bucketPointer[i] points to the first not-yet-placed slot of bucket i
            int[] bucketPointer = new int[childrenNumber];
            System.arraycopy(bucketFirst, 0, bucketPointer, 0, childrenNumber);

            // cycle-sort style in-place rearrangement: for each bucket, each
            // misplaced element is swapped into its correct bucket until the
            // slot holds an element belonging here
            double tempDouble;
            for (int i = 0; i < childrenNumber; i++) {
                for (int j = bucketPointer[i]; j < bucketFirst[i + 1]; j++) {
                    while (true) {
                        // compute the correct bucket id of the current point
                        int k = 0;
                        while ((k < minSplit.length) && (distance[minVP][j] >= minSplit[k]))
                            k++;
                        // already in the right bucket: go to the next point
                        if (k == i)
                            break;
                        // exchange the point object
                        Collections.swap(data, j, bucketPointer[k]);
                        // exchange the distance values to all vps
                        for (int t = 0; t < distance.length; t++) {
                            tempDouble = distance[t][j];
                            distance[t][j] = distance[t][bucketPointer[k]];
                            distance[t][bucketPointer[k]] = tempDouble;
                        }
                        bucketPointer[k]++;
                    } // end of while
                }
            }

            // create sub-cluster tasks; minVP is now consumed
            toUse[minVP] = false;
            boolean done = true;
            // check whether the sub-clusters need further partition
            for (int i = 0; i < toUse.length; i++)
                if (toUse[i]) {
                    done = false;
                    break;
                }
            if (done) // all vps used: finished tasks go to the end of the list
            {
                for (int i = 0; i < childrenNumber; i++) {
                    lower = task.lower.clone();
                    lower[minVP] = minLower[i];
                    upper = task.upper.clone();
                    upper[minVP] = minUpper[i];
                    ClusteringKMeansTask newCKMTask = new ClusteringKMeansTask(bucketFirst[i],
                            bucketFirst[i + 1] - 1, lower, upper, toUse.clone());
                    taskList.addLast(newCKMTask);
                }
            } else // further partition needed: add to the head of the list
            {
                for (int i = childrenNumber - 1; i >= 0; i--) {
                    lower = task.lower.clone();
                    lower[minVP] = minLower[i];
                    upper = task.upper.clone();
                    upper[minVP] = minUpper[i];
                    ClusteringKMeansTask newCKMTask = new ClusteringKMeansTask(bucketFirst[i],
                            bucketFirst[i + 1] - 1, lower, upper, toUse.clone());
                    taskList.addFirst(newCKMTask);
                }
            }
        }

        /**
         * Finds the initial clustering to run k-means on, using a histogram
         * of the distance values: bucket the values, find local-maximum
         * buckets, and return (up to) SVF representative values. Each item in
         * the returned array is distinct; if the returned length is less than
         * SVF, ALL distinct distance values have been returned.
         *
         * @param distance the array containing all the double values to find means on
         * @param first    offset of the first element
         * @param last     offset of the last element
         * @param SVF      number of means/distinct values to find
         * @return an array of means, each element different from the others;
         *         if shorter than SVF, it holds all distinct values
         */
        private double[] bucketInitialClustering(double[] distance, final int first,
                final int last, final int SVF) {
            // histogram resolution: ~1 bucket per 10 points, clamped to [SVF, 50*SVF]
            final int bucketNumber = Math.max(Math.min((last - first + 1) / 10, 50 * SVF), SVF);

            // min, max distance to the vp
            double min, max;
            min = Double.POSITIVE_INFINITY;
            max = Double.NEGATIVE_INFINITY;
            for (int j = first; j <= last; j++) {
                if (distance[j] > max)
                    max = distance[j];
                if (distance[j] < min)
                    min = distance[j];
            }
            // if min == max, this vp cannot partition the cluster
            if (max == min) {
                double[] result = new double[1];
                result[0] = distance[first];
                return result;
            }

            // compute the bucket sizes
            int[] bucketSize = new int[bucketNumber];
            for (int i = 0; i < bucketNumber; i++)
                bucketSize[i] = 0;
            final double bucketWidth = (max - min) / bucketNumber;
            for (int i = first; i <= last; i++) {
                int temp = (int) ((distance[i] - min) / bucketWidth);
                if (temp >= bucketNumber) // the max value rounds into the last bucket
                    temp = bucketNumber - 1;
                bucketSize[temp]++;
            }

            // find the buckets whose size is a local max.
            // NOTE(review): assumes bucketNumber >= 2 (i.e. SVF >= 2);
            // bucketNumber == 1 would index out of bounds here — confirm SVF
            // is always at least 2 at the call sites.
            boolean[] isLocalMax = new boolean[bucketNumber];
            isLocalMax[0] = bucketSize[0] >= bucketSize[1];
            isLocalMax[bucketNumber - 1] =
                    bucketSize[bucketNumber - 1] >= bucketSize[bucketNumber - 2];
            for (int i = 1; i <= bucketNumber - 2; i++)
                isLocalMax[i] = (bucketSize[i] >= bucketSize[i - 1])
                        && (bucketSize[i] >= bucketSize[i + 1]);

            // collapse each run of consecutive local-max buckets to its middle one
            int loop = 0;
            while (loop < bucketNumber) {
                if (!isLocalMax[loop])
                    loop++;
                else {
                    int lastMax = loop + 1;
                    while ((lastMax < bucketNumber) && isLocalMax[lastMax])
                        lastMax++;
                    for (int i = loop; i < lastMax; i++)
                        isLocalMax[i] = false;
                    isLocalMax[(loop + lastMax - 1) / 2] = true;
                    loop = lastMax + 1;
                }
            }

            // number of positive-size local max buckets
            int localMaxBucketNumber = 0;
            for (int i = 0; i < bucketNumber; i++)
                if (isLocalMax[i] && (bucketSize[i] > 0))
                    localMaxBucketNumber++;

            if (localMaxBucketNumber >= SVF) // enough bins: take the SVF
                                             // largest and return their midpoints
            {
                boolean[] isLargest = new boolean[bucketNumber];
                for (int i = 0; i < bucketNumber; i++)
                    isLargest[i] = false;
                for (int i = 0; i < SVF; i++) {
                    int maxSize = 0;
                    int maxId = 0;
                    for (int j = 0; j < bucketNumber; j++) {
                        if (isLocalMax[j] && !isLargest[j] && (bucketSize[j] > maxSize)) {
                            maxSize = bucketSize[j];
                            maxId = j;
                        }
                    }
                    isLargest[maxId] = true;
                }
                double[] result = new double[SVF];
                int counter = 0;
                for (int i = 0; i < bucketNumber; i++) {
                    if (isLargest[i]) {
                        // bucket midpoint (not necessarily an existing value)
                        result[counter] = min + (i + 0.5) * bucketWidth;
                        counter++;
                    }
                }
                return result;
            } else // not enough local max buckets: take one real value from
                   // each, then pad with other distinct values
            {
                double[] result = new double[SVF];
                int counter = 0;
                // for each local max bucket, find a value in it
                for (int i = first; i <= last; i++) {
                    int temp = (int) ((distance[i] - min) / bucketWidth);
                    if (temp >= bucketNumber)
                        temp = bucketNumber - 1;
                    if (isLocalMax[temp]) {
                        result[counter] = distance[i];
                        isLocalMax[temp] = false;
                        counter++;
                        if (counter >= localMaxBucketNumber)
                            break;
                    }
                }
                // pad with further distinct values until SVF (or exhaustion)
                for (int i = first; i <= last; i++) {
                    boolean isDistinct = true;
                    for (int j = 0; j < counter; j++)
                        if (distance[i] == result[j]) {
                            isDistinct = false;
                            break;
                        }
                    if (isDistinct) {
                        result[counter] = distance[i];
                        counter++;
                        if (counter >= SVF)
                            break;
                    }
                }
                if (counter < SVF) // not enough distinct values: shrink the array
                {
                    double[] finalResult = new double[counter];
                    System.arraycopy(result, 0, finalResult, 0, counter);
                    return finalResult;
                } else
                    return result;
            }
        }

        /**
         * Runs 1-d k-means given the initial clustering; the results are
         * stored back into the {@code means} argument. Iterates until the
         * relative change of the sum of means drops below 10% (at least two
         * iterations are always run).
         *
         * NOTE(review): if a cluster becomes empty during an iteration,
         * {@code clusterSum[i] / clusterSize[i]} is 0/0 = NaN and the
         * corresponding mean degenerates — confirm the seeds from
         * bucketInitialClustering prevent this in practice.
         *
         * @param distance the array containing all the double values to run on
         * @param first    offset of the first element
         * @param last     offset of the last element
         * @param means    initial means on input; final means on output
         * @param logger   destination for debug traces
         */
        private void kMeans(double[] distance, final int first, final int last, double[] means,
                Logger logger) {
            final double stop = 0.1; // relative-change convergence threshold
            final int size = last - first + 1;
            final int clusterNumber = means.length;
            short[] clusterId = new short[size];
            double[] split = new double[clusterNumber - 1];
            double sum = 0, newSum = 0;
            double[] clusterSum = new double[clusterNumber];
            int[] clusterSize = new int[clusterNumber];
            int counter = 0;
            while ((counter < 2) || Math.abs(newSum - sum) / sum > stop) {
                sum = newSum;
                // assignment step: split points at midpoints of sorted means
                Arrays.sort(means);
                for (int j = 0; j < split.length; j++)
                    split[j] = (means[j] + means[j + 1]) / 2;
                for (int i = 0; i < size; i++) {
                    clusterId[i] = 0;
                    while ((clusterId[i] < split.length)
                            && (distance[first + i] >= split[clusterId[i]]))
                        clusterId[i]++;
                }
                // update step: recompute each cluster's mean and the sum of means
                for (int i = 0; i < clusterNumber; i++) {
                    clusterSum[i] = 0;
                    clusterSize[i] = 0;
                }
                for (int i = 0; i < size; i++) {
                    clusterSum[clusterId[i]] += distance[first + i];
                    clusterSize[clusterId[i]]++;
                }
                newSum = 0;
                for (int i = 0; i < clusterNumber; i++) {
                    means[i] = clusterSum[i] / clusterSize[i];
                    newSum += means[i];
                }
                counter++;
                if ((counter > 100) && (counter % 100 == 0))
                    logger.warning("counter= " + counter + ", too large!");
                if (Debug.debug)
                    logger.finer("counter= " + counter + ", sum= " + sum + ", new sum= "
                            + newSum);
            }
        }
    },

    EXCLUDEDMIDDLE {

        // exclusion radius forwarded to PivotWisePartition
        double maxR = 0;

        /**
         * Stores the exclusion radius used by the delegated partitioner.
         *
         * @param R the max radius
         */
        public void setMaxRadius(double R) {
            this.maxR = R;
        }

        /**
         * Partitions the whole data list.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size
         * @return the partition result
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int numPartitions, int maxLS) {
            return partition(metric, pivots, data, 0, data.size(), numPartitions, maxLS);
        }

        /**
         * Delegates to {@link PivotWisePartition} configured with the stored
         * max radius.
         *
         * @param metric        metric used to compute distances
         * @param pivots        the vantage points
         * @param data          the data to partition
         * @param first         offset of the first element to partition
         * @param size          number of elements to partition
         * @param numPartitions fanout induced by each single pivot
         * @param maxLS         max leaf size
         * @return the partition result
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions,
                int maxLS) {
            PivotWisePartition pm = new PivotWisePartition();
            pm.setMaxRadius(maxR);
            return pm.partition(metric, pivots, data, first, size, numPartitions, maxLS);
        }
    },

    CGHT {

        /** No-op: CGHT is a placeholder constant. */
        public void setMaxRadius(double R) {
        }

        /**
         * Not implemented: always returns null.
         *
         * @return null
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int numPartitions, int maxLS) {
            return null;
        }

        /**
         * Not implemented: always returns null.
         *
         * @return null
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions,
                int maxLS) {
            return null;
        }
    },

    GHT {

        /** No-op: GHT is a placeholder constant. */
        public void setMaxRadius(double R) {
        }

        /**
         * Not implemented: always returns null.
         *
         * @return null
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int numPartitions, int maxLS) {
            return null;
        }

        /**
         * Not implemented: always returns null.
         *
         * @return null
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions,
                int maxLS) {
            return null;
        }
    },

    CGHTBALANCED {

        /** No-op: CGHTBALANCED is a placeholder constant. */
        public void setMaxRadius(double R) {
        }

        /**
         * Not implemented: always returns null.
         *
         * @return null
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int numPartitions, int maxLS) {
            return null;
        }

        /**
         * Not implemented: always returns null.
         *
         * @return null
         */
        public PartitionResults partition(Metric metric, IndexObject[] pivots,
                List<? extends IndexObject> data, int first, int size, int numPartitions,
                int maxLS) {
            return null;
        }
    };

    /*
     * Constants used to hold input parameters.
     * Added by Kewei Ma.
     */
    public static String pm;       // selected partition method name
    public static double r;        // radius / GHT-degree input parameter
    public static int countRN = 0; // range-query counter
}