/**
 * GeDBIT.index.VPIndex 2006.05.09
 *
 * Copyright Information:
 *
 * Change Log:
 * 2006.05.09: Created, by Rui Mao, Willard
 * 2006.07.09: added KNN search, by Weijia Xu
 */
package GeDBIT.index;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.logging.Level;

import GeDBIT.dist.Metric;
import GeDBIT.index.algorithms.PartitionMethod;
import GeDBIT.index.algorithms.PartitionMethods;
import GeDBIT.index.algorithms.PartitionResults;
import GeDBIT.index.algorithms.PivotSelectionMethod;
import GeDBIT.index.Query;
import GeDBIT.index.RangeQuery;
import GeDBIT.type.IndexObject;
import GeDBIT.util.Debug;

/**
 * A Vantage Point Tree (VPT) index.
 *
 * @author Rui Mao, Willard, Weijia Xu
 * @version 2006.05.09
 */
public class VPIndex extends AbstractIndex {
    private static final long serialVersionUID = 6928847050089693231L;

    private PivotSelectionMethod psm;
    private int numPivot;
    private PartitionMethod pm;
    private int singlePivotFanout;
    private int maxLeafSize;
    private int maxPathLength;

    private transient ArrayList<LoadTask> taskList;

    public transient int numLeaf;
    public transient int numInternal;

    private transient int[] partitionSizes;

    /**
     * Builds an index over the specified Table.
     *
     * @param fileName
     *            the fileName to give the index.
     * @param data
     *            the {@link List} of {@link IndexObject}s to build the index
     *            over
     * @param metric
     *            the {@link Metric} to use when building the index.
     * @param psm
     *            the {@link PivotSelectionMethod} to use when building the
     *            index.
     * @param numPivot
     *            the number of pivots to use.
     * @param pm
     *            the {@link PartitionMethod} to use when building the index.
     * @param singlePivotFanout
     *            the fanout for a single pivot. The total fanout is
     *            singlePivotFanout^numPivot.
     * @param maxLeafSize
     *            the maximum number of data points in a leaf node.
     * @param maxPathLength
     *            the number of previous distance calculations to store in a
     *            given node.
     * @param debugLevel
     *            the debug level.
     */
    public VPIndex(String fileName, List<? extends IndexObject> data,
            Metric metric, PivotSelectionMethod psm, int numPivot,
            PartitionMethod pm, int singlePivotFanout, int maxLeafSize,
            int maxPathLength, Level debugLevel) {
        // 1.a. initialize the object io manager
        super(fileName, data, metric, debugLevel);

        // 1.b. initialize fields
        this.psm = psm;
        this.numPivot = numPivot;
        this.pm = pm;
        this.singlePivotFanout = singlePivotFanout;
        this.maxLeafSize = maxLeafSize;
        this.maxPathLength = maxPathLength;

        // 1.c. initialize the task list
        taskList = new ArrayList<LoadTask>();

        // 2. do the actual loading
        this.numInternal = 0;
        this.numLeaf = 0;
        this.root = load(data);

        // 3. set oiom to be read only
        try {
            close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        initOIOM(true);
        openOIOM();
    }

    public VPIndex(String fileName, List<? extends IndexObject> data,
            Metric metric, PivotSelectionMethod psm, int numPivot,
            PartitionMethod pm, int singlePivotFanout, int maxLeafSize,
            int maxPathLength, Level debugLevel, int subtree) {
        // 1.a. initialize the object io manager
        super(fileName, data, metric, debugLevel, subtree);

        // 1.b. initialize fields
        this.psm = psm;
        this.numPivot = numPivot;
        this.pm = pm;
        this.singlePivotFanout = singlePivotFanout;
        this.maxLeafSize = maxLeafSize;
        this.maxPathLength = maxPathLength;

        // 1.c. initialize the task list
        taskList = new ArrayList<LoadTask>();

        // 2. do the actual loading
        this.numInternal = 0;
        this.numLeaf = 0;
        this.root = loadRoot(data);

        // 3. set each subtree's oiom to be read only
        try {
            for (int i = 0; i < subtree + 1; i++) {
                close(i);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        for (int i = 0; i < subtree + 1; i++) {
            initOIOM(true, i);
            openOIOM(i);
        }
    }
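
    // Usage sketch (illustrative only): building a VPT and issuing a range
    // query. The data list, metric, and the RangeQuery (query object, radius)
    // constructor shown here are assumptions about the caller's setup, not
    // guarantees of this class:
    //
    //   List<IndexObject> data = ...;   // application data
    //   Metric metric = ...;            // the distance function to index under
    //   VPIndex index = new VPIndex("my.index", data, metric,
    //           psm,   // a PivotSelectionMethod
    //           2,     // numPivot
    //           pm,    // a PartitionMethod
    //           3,     // singlePivotFanout: total fanout is 3^2 = 9
    //           100,   // maxLeafSize
    //           5,     // maxPathLength
    //           Level.INFO);
    //   Cursor cursor = index.search(new RangeQuery(queryObject, radius));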

    /**
     * Bulk-loads the tree from the given data, depth first, using an explicit
     * stack of {@link LoadTask}s.
     *
     * @param data
     *            the objects to index
     * @return the address of the root node in the ObjectIOManager
     */
    private long load(List<? extends IndexObject> data) {
        // step 1: initialize the stack, put the root task into it.
        LoadTask task = new LoadTask(data, "0", -1, 0,
                new ArrayList<IndexObject>(maxPathLength));
        logger.info("building " + task.description + ", size: " + task.size
                + ", heap size: " + Runtime.getRuntime().totalMemory() / 1024
                + "K, used size: "
                + (Runtime.getRuntime().totalMemory() - Runtime.getRuntime()
                        .freeMemory()) / 1024 + "K");

        // at the root, you always want to create an internal node
        // step 2.a: select pivots
        int numPivots = (numPivot >= task.size) ? task.size : numPivot;
        int[] pivots = psm.selectPivots(metric, task.compressedData, numPivots);

        // step 2.b: rearrange the data list, putting the pivots at the end
        task.groupPivotsAtEnd(pivots);
        taskList.add(task);
        createAnInternalNode(task, pm, singlePivotFanout, maxLeafSize, 0);

        if (0 != PartitionMethods.countRN) { // CountRN added by Kewei Ma
            countRN(data, task, PartitionMethods.r, PartitionMethods.pm);
            System.out.println("Please ignore the exceptions below; they are"
                    + " caused by manually returning the bogus root address 0"
                    + " to end the rest of the building process");
            return 0;
        } else {
            // step 2: do the DFS bulkloading
            int taskToGet;
            while ((taskToGet = taskList.size() - 1) > 0) {
                task = taskList.get(taskToGet);
                if (task.node == null) {
                    // step 2.a: select pivots
                    numPivots = (numPivot >= task.size) ? task.size : numPivot;
                    logger.finer("VP Selection Method:" + psm.toString());
                    logger.finer("selecting VP using " + psm.toString()
                            + " selection..., data size= " + task.size
                            + ", vp number= " + numPivots);
                    pivots = psm.selectPivots(metric, task.compressedData,
                            numPivots);

                    // step 2.b: rearrange the data list, putting the pivots
                    // at the end
                    task.groupPivotsAtEnd(pivots);

                    // step 2.c: create either an internal node or a leaf
                    // node, depending on the maximum leaf size allowed
                    if (task.size - task.numPivots > maxLeafSize)
                        createAnInternalNode(task, pm, singlePivotFanout,
                                maxLeafSize, taskToGet);
                    else
                        createAndWriteLeafNode(taskList.remove(taskToGet));
                } else {
                    writeInternalNode(taskList.remove(taskToGet));
                }
            }

            // step 3: save the root node to the ObjectIOManager
            return writeRootNode(taskList.remove(0));
        }
    }
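
    // The loop above simulates a recursive build with an explicit stack: each
    // task is visited twice. On the first visit (task.node == null) its pivots
    // are chosen and its children are pushed; on the second visit, once all
    // children have been written and removed, the internal node itself is
    // written. A minimal sketch of the same pattern, with hypothetical
    // Task/expand/write helpers standing in for LoadTask and the node writers:
    //
    //   Deque<Task> stack = new ArrayDeque<Task>();
    //   stack.push(rootTask);
    //   while (!stack.isEmpty()) {
    //       Task t = stack.peek();
    //       if (t.node == null && !t.isLeaf()) {
    //           expand(t);          // creates t.node, pushes its children
    //       } else {
    //           write(stack.pop()); // children done: persist this node
    //       }
    //   }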

    private long loadRoot(List<? extends IndexObject> data) {
        LoadTask task = new LoadTask(data, "0", -1, 0,
                new ArrayList<IndexObject>(maxPathLength));
        int numPivots = (numPivot >= task.size) ? task.size : numPivot;
        int[] pivots = psm.selectPivots(metric, task.compressedData, numPivots);
        task.groupPivotsAtEnd(pivots);
        taskList.add(task);
        createAnInternalNode(task, pm, singlePivotFanout, maxLeafSize, 0);

        // each child of the root becomes a subtree; subtrees are distributed
        // round-robin over the available ObjectIOManagers
        int myIndex;
        int taskToGet;
        HashMap<Integer, Long> subtrees = new HashMap<Integer, Long>();
        while ((taskToGet = taskList.size() - 1) > 0) {
            task = taskList.remove(taskToGet);
            myIndex = task.getMyIndex() % super.getSubtree();
            subtrees.put(task.getMyIndex(), loadSubtree(task, myIndex));
        }
        return writeAllSubRoots(taskList.remove(0), subtrees);
    }

    private long writeAllSubRoots(LoadTask task, HashMap<Integer, Long> subtrees) {
        final InternalNode iNode = (InternalNode) task.node;
        final int childNumber = iNode.numChildren();
        for (int j = 0; j < childNumber; j++)
            if (iNode.getChildAddress(j) == -1)
                iNode.setChildAddress(j, subtrees.get(j));

        long pointer = -1;
        try {
            pointer = oioms[0].writeObject(iNode);
        } catch (Exception e) {
            System.out.println("Error while writing root address!");
            e.printStackTrace();
        }
        return pointer;
    }
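
    // For example (illustrative numbers), with 4 root children and
    // getSubtree() == 2, children 0, 1, 2, 3 map to subtree slots 0, 1, 0, 1:
    //
    //   int slot = childIndex % getSubtree(); // 0, 1, 0, 1
    //   long addr = oioms[slot + 1].writeObject(node);
    //
    // Slot k is persisted through oioms[k + 1]; oioms[0] holds the root node
    // itself. (This arithmetic is read off the code in this file, not a
    // documented contract of AbstractIndex.)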

    private long loadSubtree(LoadTask task, int myIndex) {
        // builds one subtree with its own local task stack ("lists"); assumes
        // the subtree root is large enough to become an internal node
        int i;
        int[] pivots;
        int numPivots;
        ArrayList<LoadTask> lists = new ArrayList<LoadTask>();
        lists.add(task);
        boolean first = true;
        while ((i = lists.size() - 1) > 0 || first) {
            first = false;
            task = lists.get(i);
            if (task.node == null) {
                numPivots = (numPivot >= task.size) ? task.size : numPivot;
                pivots = psm.selectPivots(metric, task.compressedData,
                        numPivots);
                task.groupPivotsAtEnd(pivots);
                if (task.size - task.numPivots > maxLeafSize) {
                    createAnSubtreeInternalNode(task, pm, singlePivotFanout,
                            maxLeafSize, i, lists);
                } else {
                    final long pointer = createAndWriteSubtreeLeafNode(
                            lists.remove(i), myIndex);
                    LoadTask parentNodeTask = lists.get(task.parentNodeIndex);
                    if (parentNodeTask != null) {
                        InternalNode parentNode = (InternalNode) parentNodeTask.node;
                        parentNode.setChildAddress(task.getMyIndex(), pointer);
                    }
                }
            } else {
                final long pointer = writeSubtreeInternalNode(lists.remove(i),
                        myIndex);
                LoadTask parentNodeTask = lists.get(task.parentNodeIndex);
                if (parentNodeTask != null) {
                    InternalNode parent = (InternalNode) parentNodeTask.node;
                    parent.setChildAddress(task.getMyIndex(), pointer);
                }
            }
        }
        return writeSubtreeRootNode(lists.remove(0), myIndex);
    }

    private long writeSubtreeRootNode(LoadTask task, int subtree) {
        final InternalNode iNode = (InternalNode) task.node;
        // check whether each child address has already been set
        final int childNumber = iNode.numChildren();
        for (int j = 0; j < childNumber; j++)
            if (iNode.getChildAddress(j) == -1)
                System.out.println("current node: " + iNode.toString()
                        + ", child " + j + " is null!");
        long pointer = -1;
        try {
            pointer = oioms[subtree + 1].writeObject(iNode);
        } catch (Exception e) {
            System.out.println("error writing index node to ObjectIOManager!");
            e.printStackTrace();
        }
        return pointer;
    }

    private long createAndWriteSubtreeLeafNode(LoadTask task, int subtree) {
        IndexObject[] children = task.getDataPoints();
        IndexObject[] pivots = task.getPivots();

        // precompute the distance from every data point to every pivot of
        // this leaf...
        double[][] distance = new double[children.length][task.numPivots];
        for (int i = 0; i < children.length; i++) {
            final IndexObject currentChild = children[i];
            for (int j = 0; j < task.numPivots; j++)
                distance[i][j] = metric.getDistance(currentChild, pivots[j]);
        }

        // ...and to every pivot on the path from the root (the pivot history)
        final int pivotHistoryListSize = task.pivotHistoryList.size();
        double[][] pathDistance = new double[children.length][pivotHistoryListSize];
        for (int i = 0; i < children.length; i++)
            for (int j = 0; j < pivotHistoryListSize; j++)
                pathDistance[i][j] = metric.getDistance(children[i],
                        task.pivotHistoryList.get(j));

        long pointer = -1;
        try {
            VPLeafNode node = new VPLeafNode(pivots, children, task.size,
                    distance, pathDistance);
            pointer = oioms[subtree + 1].writeObject(node);
        } catch (Exception e) {
            System.out.print("Exception in creating VPLeafNode: "
                    + e.toString());
            e.printStackTrace();
        }
        return pointer;
    }

    private long writeSubtreeInternalNode(LoadTask task, int subtree) {
        final InternalNode iNode = (InternalNode) task.node;
        // check whether each child address has already been set
        final int childNumber = iNode.numChildren();
        for (int j = 0; j < childNumber; j++)
            if (iNode.getChildAddress(j) == -1)
                System.out.println("current node: " + iNode.toString()
                        + ", child " + j + " is null!");
        long pointer = -1;
        try {
            pointer = oioms[subtree + 1].writeObject(iNode);
        } catch (Exception e) {
            System.out.println("error writing index node to ObjectIOManager!");
            e.printStackTrace();
        }
        return pointer;
    }
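
    // The distance and pathDistance matrices exist so a range query can
    // discard a data point x without evaluating the metric again: by the
    // triangle inequality, |d(q, p) - d(p, x)| <= d(q, x) for any pivot p.
    // A minimal sketch of that filter (hypothetical names; the real pruning
    // logic lives in VPRangeCursor):
    //
    //   boolean canPrune(double[] queryToPivot,  // d(q, p_j), computed once
    //                    double[] pointToPivot,  // a stored row of distance[][]
    //                    double radius) {
    //       for (int j = 0; j < queryToPivot.length; j++)
    //           if (Math.abs(queryToPivot[j] - pointToPivot[j]) > radius)
    //               return true;                 // d(q, x) > radius: skip x
    //       return false;
    //   }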

    private void createAnSubtreeInternalNode(LoadTask task, PartitionMethod pm,
            int singlePivotFanout, int maxLeafSize, int nodeIndex,
            ArrayList<LoadTask> lists) {
        IndexObject[] pivots = task.getPivots();
        for (int i = 0; i < pivots.length; i++) {
            if (task.pivotHistoryList.size() < maxPathLength)
                task.pivotHistoryList.add(pivots[i]);
            else
                break;
        }

        PartitionResults partitionResults = pm.partition(metric, pivots,
                task.compressedData, 0, (task.size - task.numPivots),
                singlePivotFanout, maxLeafSize);
        int childrenNumber = partitionResults.size();

        // step 3: create an internal node, and save it to the task
        task.node = partitionResults.getInternalNode();

        // step 4: add a task for each child onto the stack
        for (int i = childrenNumber - 1; i >= 0; i--) {
            task.node.setChildAddress(i, -1);
            LoadTask newTask;
            if (pivots.length < maxPathLength)
                newTask = new LoadTask(partitionResults.getPartition(i),
                        task.description + "." + i, nodeIndex, i,
                        new ArrayList<IndexObject>(task.pivotHistoryList));
            else
                newTask = new LoadTask(partitionResults.getPartition(i),
                        task.description + "." + i, nodeIndex, i,
                        task.pivotHistoryList);
            lists.add(newTask);
        }
    }

    private void createAnInternalNode(LoadTask task, PartitionMethod pm,
            int singlePivotFanout, int maxLeafSize, int nodeIndex) {
        logger.finest("Create an internal node...");
        this.numInternal++; // counter for internal nodes

        // get pivots
        IndexObject[] pivots = task.getPivots();
        for (int i = 0; i < pivots.length; i++) {
            if (task.pivotHistoryList.size() < maxPathLength)
                task.pivotHistoryList.add(pivots[i]);
            else
                break;
        }

        PartitionResults partitionResults = pm.partition(metric, pivots,
                task.compressedData, 0, (task.size - task.numPivots),
                singlePivotFanout, maxLeafSize);

        partitionSizes = new int[partitionResults.size()]; // added by Kewei Ma
        for (int i = 0; i < partitionResults.size(); i++)
            partitionSizes[i] = partitionResults.getPartition(i).size();

        int childrenNumber = partitionResults.size();

        // step 3: create an internal node, and save it to the task
        task.node = partitionResults.getInternalNode();

        // step 4: add a task for each child onto the stack
        for (int i = childrenNumber - 1; i >= 0; i--) {
            task.node.setChildAddress(i, -1);
            LoadTask newTask;
            // TODO why this?
            // if (pivots.length < maxPathLength)
            if (task.pivotHistoryList.size() < maxPathLength)
                newTask = new LoadTask(partitionResults.getPartition(i),
                        task.description + "." + i, nodeIndex, i,
                        new ArrayList<IndexObject>(task.pivotHistoryList));
            else
                newTask = new LoadTask(partitionResults.getPartition(i),
                        task.description + "." + i, nodeIndex, i,
                        task.pivotHistoryList);
            taskList.add(newTask);
        }
    }

    private void createAndWriteLeafNode(LoadTask task) {
        this.numLeaf++; // counter for leaf nodes
        IndexObject[] children = task.getDataPoints();
        IndexObject[] pivots = task.getPivots();
        double[][] distance = new double[children.length][task.numPivots];
        for (int i = 0; i < children.length; i++) {
            final IndexObject currentChild = children[i];
            for (int j = 0; j < task.numPivots; j++)
                distance[i][j] = metric.getDistance(currentChild, pivots[j]);
        }

        if (Debug.debug) {
            logger.finest("create a leaf node, distances(VP*children):");
            for (int i = 0; i < task.numPivots; i++) {
                for (int j = 0; j < children.length; j++)
                    // distance is indexed [child][pivot]
                    logger.finest(distance[j][i] + ", ");
                logger.finest("");
            }
        }

        final int pivotHistoryListSize = task.pivotHistoryList.size();
        double[][] pathDistance = new double[children.length][pivotHistoryListSize];
        for (int i = 0; i < children.length; i++)
            for (int j = 0; j < pivotHistoryListSize; j++)
                pathDistance[i][j] = metric.getDistance(children[i],
                        task.pivotHistoryList.get(j));

        try {
            VPLeafNode node = new VPLeafNode(pivots, children, task.size,
                    distance, pathDistance);
            // write this node to the index file
            final long pointer = oiom.writeObject(node);
            InternalNode parentNode = taskList.get(task.parentNodeIndex).node;
            parentNode.setChildAddress(task.myIndex, pointer);
        } catch (Exception e) {
            System.out.print("Exception in creating VPLeafNode: "
                    + e.toString());
            e.printStackTrace();
        }
    }

    private long writeInternalNode(LoadTask task) {
        final InternalNode iNode = (InternalNode) task.node;
        // check whether each child address has already been set
        final int childNumber = iNode.numChildren();
        for (int j = 0; j < childNumber; j++)
            if (iNode.getChildAddress(j) == -1)
                System.out.println("current node: " + iNode.toString()
                        + ", child " + j + " is null!");

        // write the current node, and set its address in its parent
        long pointer = -1;
        try {
            pointer = oiom.writeObject(iNode);
        } catch (Exception e) {
            System.out.println("error writing index node to ObjectIOManager!");
            e.printStackTrace();
        }
        LoadTask parentNodeTask = taskList.get(task.parentNodeIndex);
        InternalNode parent = (InternalNode) parentNodeTask.node;
        parent.setChildAddress(task.myIndex, pointer);
        return pointer;
    }
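
    // The partition step above determines the tree's shape: with numPivot
    // pivots and singlePivotFanout partitions per pivot, an internal node has
    // up to singlePivotFanout^numPivot children (see the constructor Javadoc).
    // For example (illustrative numbers):
    //
    //   numPivot = 2, singlePivotFanout = 3  ->  up to 3^2 = 9 children
    //   numPivot = 3, singlePivotFanout = 2  ->  up to 2^3 = 8 children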

    /**
     * Counts the average R-neighborhood of the tree's top layer; only executed
     * once, at build time.
     *
     * @param data
     *            all data objects
     * @param task
     *            the {@link LoadTask}, which carries build-time information
     * @param r
     *            the query radius
     * @param dpm
     *            the name of the partition method
     * @author Kewei Ma
     */
    private void countRN(List<? extends IndexObject> data, LoadTask task,
            double r, String dpm) {
        final double[][] lows = ((VPInternalNode) task.node).getLowerRange();
        final double[][] highs = ((VPInternalNode) task.node).getUpperRange();
        double result = 0;

        // for all data except pivots
        for (int i = 0; i < task.size - numPivot; i++) {
            double avgPTraveled = 0;
            // for all partitions
            for (int j = 0; j < lows.length; j++) {
                boolean search = true;
                if ("CLUSTERINGKMEANS".equalsIgnoreCase(dpm)) { // MVPT
                    // for all pivots
                    for (int k = 0; k < numPivot; k++) {
                        // distance between the current data point and the
                        // current pivot
                        final double dist = metric.getDistance(data.get(i),
                                task.getPivots()[k]);
                        if (dist < lows[j][k] - r || dist > highs[j][k] + r)
                            search = false;
                    }
                    if (search)
                        avgPTraveled += partitionSizes[j]
                                / (double) (task.size - numPivot);
                } else if ("CGHT".equalsIgnoreCase(dpm)) { // CGHT
                    // d1 + d2
                    final double dist1 = metric.getDistance(data.get(i),
                            task.getPivots()[0])
                            + metric.getDistance(data.get(i),
                                    task.getPivots()[1]);
                    // d1 - d2
                    final double dist2 = metric.getDistance(data.get(i),
                            task.getPivots()[0])
                            - metric.getDistance(data.get(i),
                                    task.getPivots()[1]);
                    if (dist1 < lows[j][0] - r * 2
                            || dist1 > highs[j][0] + r * 2
                            || dist2 < lows[j][1] - r * 2
                            || dist2 > highs[j][1] + r * 2)
                        search = false;
                    if (search)
                        avgPTraveled += partitionSizes[j]
                                / (double) (task.size - numPivot);
                } else { // GHT
                    // d1 - d2
                    final double dist = metric.getDistance(data.get(i),
                            task.getPivots()[0])
                            - metric.getDistance(data.get(i),
                                    task.getPivots()[1]);
                    if (dist >= -2 * r && dist <= 2 * r)
                        avgPTraveled += 1;
                    else if (dist < -2 * r)
                        avgPTraveled += partitionSizes[0]
                                / (double) (task.size - numPivot);
                    else
                        avgPTraveled += partitionSizes[1]
                                / (double) (task.size - numPivot);
                    break;
                }
            }
            result += avgPTraveled;
        }
        result /= (task.size - numPivot);
        System.out.println("R neighborhood: " + result);
    }

    private long writeRootNode(LoadTask task) {
        final InternalNode iNode = (InternalNode) task.node;
        // check whether each child address has already been set
        final int childNumber = iNode.numChildren();
        for (int j = 0; j < childNumber; j++)
            if (iNode.getChildAddress(j) == -1)
                System.out.println("current node: " + iNode.toString()
                        + ", child " + j + " is null!");

        // write the current node
        long pointer = -1;
        try {
            pointer = oiom.writeObject(iNode);
        } catch (Exception e) {
            System.out.println("error writing index node to ObjectIOManager!");
            e.printStackTrace();
        }
        return pointer;
    }
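
    // LoadTask, below, bundles a pending partition's data with its build-time
    // bookkeeping. Its groupPivotsAtEnd(int[]) moves the chosen pivots to the
    // tail of the data list by swapping. Worked example (illustrative data):
    //
    //   compressedData = [a, b, c, d, e], pivots = {1, 4}  (i.e. b and e)
    //   i = 0: swap positions 4 and 1  ->  [a, e, c, d, b]
    //          pivots[1] == 4, the slot just filled, so pivots[1] := 1
    //   i = 1: swap positions 3 and 1  ->  [a, d, c, e, b]
    //   result: the two pivots e and b occupy the last two slots.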

    private class LoadTask {
        private List<? extends IndexObject> compressedData;
        private String description;
        private int parentNodeIndex;
        private int myIndex;
        private int size;
        private int numPivots;
        public List<IndexObject> pivotHistoryList;
        private InternalNode node;

        LoadTask(List<? extends IndexObject> compressedData,
                String description, int parentIndex, int myIndex,
                List<IndexObject> pivotHistoryList) {
            this.compressedData = compressedData;
            this.description = description;
            this.parentNodeIndex = parentIndex;
            this.myIndex = myIndex;
            this.pivotHistoryList = pivotHistoryList;
            this.size = compressedData.size();
            this.numPivots = -1;
        }

        private void groupPivotsAtEnd(int[] pivots) {
            numPivots = pivots.length;
            if (numPivots < size) {
                int futurePivotIndex = size - 1;
                // put all the pivots at the end
                for (int i = 0; i < numPivots; i++) {
                    Collections.swap(compressedData, futurePivotIndex,
                            pivots[i]);
                    // if we just swapped this pivot with another pivot, then
                    // we need to update that pivot's index
                    for (int j = i + 1; j < numPivots; j++) {
                        if (pivots[j] == futurePivotIndex) {
                            pivots[j] = pivots[i];
                        }
                    }
                    // no need to set pivots[i] = futurePivotIndex, since
                    // pivots[] is not used after this
                    futurePivotIndex--;
                }
            }
        }

        // get the pivots (sans rowIDs); used as input for
        // PivotSelectionMethod.selectPivots()
        public IndexObject[] getPivots() {
            // TODO replace with a context-specific exception
            if (numPivots == -1) {
                throw new Error("pivots have not yet been chosen!");
            }
            IndexObject[] pivots = new IndexObject[numPivots];
            int start = size - numPivots;
            for (int i = 0; i < numPivots; i++) {
                pivots[i] = compressedData.get(start);
                start++;
            }
            return pivots;
        }

        public int getMyIndex() {
            return this.myIndex;
        }

        public IndexObject[] getDataPoints() {
            final int mySize;
            if (numPivots == -1)
                mySize = size;
            else
                mySize = size - numPivots;
            IndexObject[] dataPoints = new IndexObject[mySize];
            for (int i = 0; i < mySize; i++) {
                dataPoints[i] = compressedData.get(i);
            }
            return dataPoints;
        }
    }

    public Cursor search(Query q) {
        if (q instanceof RangeQuery) {
            if (super.getFlag()) {
                return new VPRangeCursor((RangeQuery) q, oioms, metric, root);
            } else {
                return new VPRangeCursor((RangeQuery) q, oiom, metric, root);
            }
        } else if (q instanceof KNNQuery)
            return new VPKNNCursor((KNNQuery) q, oiom, metric, root);
        else
            throw new UnsupportedOperationException("Unsupported query type "
                    + q.getClass());
    }
}