/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * KDTree.java * Copyright (C) 2000 University of Waikato * */ package weka.core; import weka.core.*; import java.io.Serializable; import java.util.*; import java.lang.Math; /** * This is a KD-Tree structure that stores instances using a divide and conquer * method. * The connection to dataset is only a reference. For the tree structure the * indexes are stored in a dynamic array * Building the tree: * If a node has <maxleaf> (option -L) instances no further splitting is done. * Also if the split would leave one side empty, the branch is not split any * further even if the instances in the resulting node are more than <maxleaf> * instances. * * -P <br> * Pruning flag. <p> * * -W <minimal-box-relative-width> <br> * minimal width of a box * * -L <maximal-inst-number> <br> * maximal instance number in a leaf * * -D <distance function> * Distance function to be used (default = Euclidean distance) * * -N <br> * Set Normalization. Used when building the tree and selecting the * widest dimension, each dimension is 'normalized' to the universe range. * * -U <debuglevel> <br> * Set debuglevel. <p> * * @author Gabi Schmidberger (gabi@cs.waikato.ac.nz) * @author Malcolm Ware (mfw4@cs.waikato.ac.nz) * @version $Revision: 1.1.1.1 $ */ public class KDTree implements OptionHandler, Serializable{ /** flag for pruning */ boolean m_Prune = false; /** flag for normalizing */ boolean m_Normalize = false; /** for debugging: debug level */ private int m_DebugLevel = 0; /* * Ranges of the whole KDTree. * lowest and highest value and width (= high - low) for each * dimension */ private double[][] m_Universe; /** the distance function used */ private DistanceFunction m_DistanceFunction = null; /** value to split on. */ private double m_SplitValue; /** attribute to split on. */ private int m_SplitDim; /** root node */ private KDTreeNode m_Root = null; /** * Indexlist of the instances of this kdtree. * Instances get sorted according to the splits. * the nodes of the KDTree just hold their start and end indices */ private DynamicArrayOfPosInt m_InstList; /** Reference to the instances of the universe. */ private Instances m_Instances; /** * Flag that can be set to signalize that the KDTree is not valid anymore for the * dataset given, * the flag is false after the tree is newly * initialized and therefore empty * or if the dataset has changed in a way that couldn't be followed up for * the KDTree, for instance if an instance was deleted. **/ private boolean m_Valid = false; /* minimal relative width of a KDTree rectangle */ double m_MinBoxRelWidth = 1.0E-2; /** maximal number of leaves in a KDTree */ // int m_MaxLeafNumber = 100; not yet implemented /** maximal number of instances in a leaf */ int m_MaxInstInLeaf = 40; /** This will cache the pruning value for future use. */ private double m_pruneValue = Double.NaN; //todo not used /** * Index in ranges for LOW and HIGH and WIDTH */ public static int R_MIN = 0; public static int R_MAX = 1; public static int R_WIDTH = 2; /** * Default Constructor */ public KDTree() { } /** * Constructor, copies all options from an existing KDTree. * @param tree the KDTree to copy from */ public KDTree(KDTree tree) { m_Universe = tree.m_Universe; m_Instances = tree.m_Instances; m_MinBoxRelWidth = tree.m_MinBoxRelWidth; m_MaxInstInLeaf = tree.m_MaxInstInLeaf; m_Valid = tree.m_Valid; } /************************************************************************** * * A class for storing a KDTree node. * **************************************************************************/ private class KDTreeNode implements Serializable { /** node number (only for debug) */ private int m_NodeNumber; /** left subtree; contains instances with smaller or equal to split value. */ private KDTreeNode m_Left = null; /** right subtree; contains instances with larger than split value. */ private KDTreeNode m_Right = null; /** value to split on. */ private double m_SplitValue; /** attribute to split on. */ private int m_SplitDim; /** * Every subtree stores the beginning index and the end index of the range * in the main instancelist, that contains its own instances */ private int m_Start = 0; private int m_End = 0; /* * lowest and highest value and width (= high - low) for each * dimension */ private double[][] m_NodeRanges; /** * Gets the splitting dimension. * @return splitting dimension */ public int getSplitDim() { return m_SplitDim; } /** * Gets the splitting value. * @return splitting value */ public double getSplitValue() { return m_SplitValue; } /** * Checks if node is a leaf. * @return true if it is a leaf */ public boolean isALeaf () { return (m_Left == null); } /** * Tidies up after delete. This means it changes start and end ranges. * @param deleted index of the already deleted instance */ public void tidyUpAfterDelete(int deleted) { boolean changed = false; // test, if one of the childrens last instance gets deleted if (!isALeaf()) { boolean deleteLastOne = false; if ((m_Left.m_Start == deleted) && (m_Left.m_End == deleted)) { deleteLastOne = true; } if ((m_Right.m_Start == deleted) && (m_Right.m_End == deleted)) { deleteLastOne = true; } if (deleteLastOne) { // make a leaf m_Right = null; m_Left = null; } } // test if start or/and end needs to be changed if (deleted <= m_End) { m_End--; changed = true; if (deleted < m_Start) { m_Start--; } } if (changed) { // prepare local instance list to work with int numInst = m_End - m_Start + 1; int [] instList = new int[numInst]; int index = 0; for (int i = m_Start; i <= m_End; i++) { instList[index++] = m_InstList.get(i); } // set ranges and split parameter m_NodeRanges = m_Instances.initializeRanges(instList); } } /** * Makes a KDTreeNode. * Use this, if ranges are already defined. * @param num number of the current node * @param ranges the ranges * @param start start index of the instances * @param end index of the instances * @exception thrown if instance couldn't be retrieved */ private void makeKDTreeNode(int[] num, double[][] ranges, int start, int end) throws Exception { m_NodeRanges = ranges; makeKDTreeNode(num, start, end); } /** * Makes a KDTreeNode. * @param num the node number * @param start the start index of the instances in the index list * @param end the end index of the instances in the index list * @exception thrown if instance couldn't be retrieved */ private void makeKDTreeNode(int [] num, int start, int end) throws Exception { num[0]++; m_NodeNumber = num[0]; m_Start = start; m_End = end; m_Left = null; m_Right = null; m_SplitDim = -1; m_SplitValue = -1; double relWidth = 0.0; boolean makeALeaf = false; int numInst = end - start + 1; // if number of instances is under a maximum, then the node is a leaf if (numInst <= m_MaxInstInLeaf) { makeALeaf = true; } // prepare local instance list to work with int [] instList = new int[numInst]; int index = 0; for (int i = start; i <= end; i++) { instList[index++] = m_InstList.get(i); } // set ranges and split parameter if (m_NodeRanges == null) m_NodeRanges = m_Instances.initializeRanges(instList); // set outer ranges if (m_Universe == null) { m_Universe = m_NodeRanges; } m_SplitDim = widestDim(m_Normalize); if (m_SplitDim >= 0) { m_SplitValue = splitValue(m_SplitDim); // set relative width relWidth = m_NodeRanges[m_SplitDim][R_WIDTH] / m_Universe[m_SplitDim][R_WIDTH]; } /* calculate bias double bias = 0.0; for (int i = 0; i < m_Instances.numAttributes(); i++) { double tmp = m_NodeRanges[i][R_WIDTH] / m_Universe[i][R_WIDTH]; bias += tmp * tmp; } m_Bias = bias * numInst; */ // check if thin enough to make a leaf if (relWidth <= m_MinBoxRelWidth) { makeALeaf = true; } // split instance list into two // first define which one have to go left and right.. int numLeft = 0; boolean [] left = new boolean[numInst]; if (!makeALeaf) { numLeft = checkSplitInstances(left, instList, m_SplitDim, m_SplitValue); // if one of the sides would be empty, make a leaf // which means, do nothing if ((numLeft == 0) || (numLeft == numInst)) { makeALeaf = true; //OOPS("makeKDTreeNode: " + m_NodeNumber // + " one side was empty after split"); } } if (makeALeaf) { //TODO I think we don't need any of the following: // sum = // sum is a row vector that has added up all rows // summags = // is one double that contains the sum of the scalar product // of all row vectors with themselves } else { // and now really make two lists int [] leftInstList = new int[numLeft]; int [] rightInstList = new int[numInst - numLeft]; int startLeft = start; int startRight = start + numLeft; splitInstances(left, instList, startLeft, startRight); /** for (int i = 0; i < m_InstList.length(); i++) { if (i == startLeft) System.out.print(" /LE/ "); if (i == startRight) System.out.print(" /RI/ "); System.out.print(" / " + m_InstList.get(i)); } int[] debugInstList = new int[m_InstList.length()]; for (int i = 0; i < debugInstList.length; i++) { debugInstList[i] = m_InstList.get(i); } boolean [] debugleft = new boolean[m_InstList.length()]; int debugnumLeft = checkSplitInstances(debugleft, debugInstList, m_SplitDim, m_SplitValue); boolean first = true; for (int i = start; i < end; i++) { if (first && !debugleft[i]) { first = false; } System.out.print(" " + debugleft[i]); } OOPS(" "); end debug **/ // make left subKDTree int endLeft = startLeft + numLeft - 1; m_Left = new KDTreeNode(); m_Left.makeKDTreeNode(num, startLeft, endLeft); // m_Sum += m_Left.getSum(); // m_SumMags += m_Left.getSumMags(); // make right subKDTree int endRight = end; m_Right = new KDTreeNode(); m_Right.makeKDTreeNode(num, startRight, endRight); // m_Sum += m_Right.getSum(); // m_SumMags += m_Right.getSumMags(); } } /** * Returns the widest dimension. * @param normalize if true normalization is used * @return attribute index that has widest range */ private int widestDim(boolean normalize) { double widest = 0.0; int w = -1; if (normalize) { for (int i = 0; i < m_NodeRanges.length; i++) { double newWidest = m_NodeRanges[i][R_WIDTH] / m_Universe[i][R_WIDTH]; if (newWidest > widest) { widest = newWidest; w = i; } } } else { for (int i = 0; i < m_NodeRanges.length; i++) { if (m_NodeRanges[i][R_WIDTH] > widest) { widest = m_NodeRanges[i][R_WIDTH]; w = i; } } } return w; } /** * Returns the split value of a given dimension. * @param dim dimension where split happens * @return the split value */ private double splitValue(int dim) { double split = m_DistanceFunction.getMiddle(m_NodeRanges[dim]); // split = m_NodeRanges[dim][R_MIN] + m_NodeRanges[dim][R_WIDTH] * 0.5; return split; } /** * Add an instance to the node or subnode. Returns false if adding cannot * be done. * Looks for the subnode the instance actually belongs to. * Corrects the end boundary of the instance list by coming up * @param instance the instance to add * @return true if adding was done **/ public boolean addInstance(Instance instance) throws Exception { boolean success = false; if (!isALeaf()) { // go further down the tree to look for the leaf the instance should be in double instanceValue = instance.value(m_SplitDim); boolean instanceInLeft = instanceValue <= m_SplitValue; if (instanceInLeft) { success = m_Left.addInstance(instance); if (success) { // go into right branch to correct instance list boundaries m_Right.afterAddInstance(); } } else { success = m_Right.addInstance(instance); } // instance was included if (success) { // correct end index of instance list of this node m_End++; // correct ranges m_NodeRanges = Instances.updateRanges(instance, m_NodeRanges); } } else { // found the leaf to insert instance // ranges have been updated m_NodeRanges = Instances.updateRanges(instance, m_NodeRanges); int index = m_Instances.numInstances() - 1; m_InstList.squeezeIn(m_End, index); m_End++; int numInst = m_End - m_Start + 1; // leaf did get too big? if (numInst > m_MaxInstInLeaf) { //split leaf int [] num = new int[1]; num[0] = m_NodeNumber; this.makeKDTreeNode(num, m_NodeRanges, m_Start, m_End); } success = true; } return success; } /** * Corrects the boundaries of all nodes to the right of the leaf where * the instance was added to. **/ public void afterAddInstance() { m_Start++; m_End++; if (!isALeaf()) { m_Left.afterAddInstance(); m_Right.afterAddInstance(); } } /** * Deletes one instance in this KDTree node or its subsequent nodes. * @param index the index of the instance to be deleted * @return true if instance was deleted */ public boolean deleteInstance(int index) throws Exception { int pos = m_InstList.deleteOneIndex(index); if (pos >= 0) { m_Root.tidyUpAfterDelete(pos); return true; } else { return false; } } /** * Returns statistics about the KDTree. * @param num give number of nodes * @param leaves give number of leaves * @return a text string that contains the statistics to the KDTree */ public String statToString (boolean nodes, boolean leaves) { int count = 1; int stats[] = new int [2]; if (this.m_Left != null) count = this.m_Left.collectStats(count, stats); if (this.m_Right != null) count = this.m_Right.collectStats(count, stats); StringBuffer text = new StringBuffer(); if (nodes) text.append("\n Number of nodes in the tree " + count + " \n"); if (leaves) text.append(" Number of leaves in the tree " + stats[0] + " \n"); return text.toString(); } /** * Returns statistics about the KDTree. * @param count number of nodes so far * @param stats array with stats info * @return the number of nodes */ public int collectStats (int count, int[] stats) { count++; if (this.m_Left != null) count = this.m_Left.collectStats(count, stats); if (this.m_Right != null) count = this.m_Right.collectStats(count, stats); else // is a leaf stats[0]++; return count; } /** * Returns the KDTree node and its underlying branches as string. * @param leaves adds the instances of the leaves * @return a string representing the node */ public String nodeToString (boolean leaves) { StringBuffer text = new StringBuffer(); text.append("NODE-Nr: " + m_NodeNumber + "\n"); int len = m_End - m_Start + 1; text.append("Num of instances: " + len + "\n"); text.append("start " + m_Start + " == end " + m_End + "\n"); if (!isALeaf()) { text.append("attribute: " + this.m_SplitDim); text.append("split at: " + this.m_SplitValue + "\n"); } else { text.append("is a leaf\n"); if (leaves) { for (int i = m_Start; i <= m_End; i++) { int instIndex = m_InstList.get(i); text.append(instIndex + ": "); text.append(m_Instances.instance(instIndex).toString() + "\n"); } } } text.append("------------------\n"); if (this.m_Left != null) text.append(this.m_Left.nodeToString(leaves)); if (this.m_Right != null) text.append(this.m_Right.nodeToString(leaves)); return text.toString(); } /** * Assigns instances to the current centers called candidates. * * @param centers all the current centers * @param candidates the current centers the method works on * @param assignments the center index for each instance * @param pc the threshold value for pruning * @param p True if pruning should be used */ private void determineAssignments(Instances centers, int[] candidates, int[] assignments, double pc, boolean p) throws Exception { // reduce number of owners for current hyper rectangle int [] owners = refineOwners(centers, candidates); // only one owner if (owners.length == 1) { // all instances of this node are owned by one center for (int i = m_Start; i <= m_End; i++) { assignments[m_InstList.get(i)] // the assignment of this instance = owners[0]; // is the current owner } } else if (!this.isALeaf()) { // more than one owner and it is not a leaf m_Left.determineAssignments(centers, owners, assignments, pc, p); m_Right.determineAssignments(centers, owners, assignments, pc, p); } else { // this is a leaf and there are more than 1 owner //XMeans. assignSubToCenters(m_NodeRanges, centers, owners, assignments); } } /** * Refines the ownerlist. * @param centers all centers * @param candidates the indexes of those centers that are candidates * @return list of owners */ private int [] refineOwners(Instances centers, int [] candidates) throws Exception { int [] owners = new int [candidates.length]; double minDistance = Double.MAX_VALUE; int ownerIndex = -1; Instance owner; int numCand = candidates.length; double [] distance = new double[numCand]; boolean [] inside = new boolean[numCand]; for (int i = 0; i < numCand; i++) { distance[i] = distanceToHrect(centers.instance(candidates[i])); inside[i] = (distance[i] == 0.0); if (distance[i] < minDistance) { minDistance = distance[i]; ownerIndex = i; } } owner = new Instance(centers.instance(candidates[ownerIndex])); // are there other owners // loop also goes over already found owner, keeps order // in owner list int index = 0; for (int i = 0; i < numCand; i++) { // 1. all centers that are points within rectangle are owners if ((inside[i]) // 2. take all points with same distance to the rect. as the owner || (distance[i] == distance[ownerIndex])) { // add competitor to owners list owners[index++] = candidates[i]; } else { Instance competitor = new Instance(centers.instance(candidates[i])); if // 3. point has larger distance to rectangle but still can compete // with owner for some points in the rectangle (!candidateIsFullOwner(owner, competitor)) { // also add competitor to owners list owners[index++] = candidates[i]; } } } int [] result = new int [index]; for (int i = 0; i < index; i++) result[i] = owners[i]; return result; } /* * Returns true if candidate is a full owner in respect to a * competitor.<p> * * The candidate has been the closer point to the current rectangle * or even has been a point within the rectangle. * The competitor is competing with the candidate for a few points * out of the rectangle although it is a point further away * from the rectangle then the candidate. * The extrem point is the corner of the rectangle that is furthest * away from the candidate towards the direction of the competitor. * * If the distance candidate to this extreme point is smaller * then the distance competitor to this extreme point, then it is * proven that none of the points in the rectangle can be owned be * the competitor and the candidate is full owner of the rectangle * in respect to this competitor. * See also D. Pelleg and A. Moore's paper 'Accelerating exact k-means * Algorithms with Geometric Reasoning'. <p> * * @param candidate instance that is candidate to be owner * @param competitor instance that competes against the candidate * @return true if candidate is full owner */ private boolean candidateIsFullOwner(Instance candidate, Instance competitor) throws Exception { // get extreme point Instance extreme = new Instance(candidate); for (int i = 0; i < m_Instances.numAttributes(); i++) { if ((competitor.value(i) - candidate.value(i)) > 0) { extreme.setValue(i, m_NodeRanges[i][R_MAX]); } else { extreme.setValue(i, m_NodeRanges[i][R_MIN]); } } boolean isFullOwner = // m_DistanceFunction.distance(extreme, candidate) < m_DistanceFunction.distance(extreme, competitor); return isFullOwner; } /** * Returns the distance between a point and an hyperrectangle. * @param x the point * @return the distance */ private double distanceToHrect(Instance x) throws Exception { double distance = 0.0; Instance closestPoint = new Instance(x); boolean inside; inside = clipToInsideHrect(closestPoint); if (!inside) distance = m_DistanceFunction.distance(closestPoint, x); return distance; } /** * Finds the closest point in the hyper rectangle to a given point. * Change the given point to this closest point by clipping of * at all the dimensions to be clipped of. * If the point is inside the rectangle it stays unchanged. * The return value is true if the point was not changed, so the * the return value is true if the point was inside the rectangle. * * @param x a point * @return true if the input point stayed unchanged. */ private boolean clipToInsideHrect(Instance x) { boolean inside = true; for (int i = 0; i < m_Instances.numAttributes(); i++) { //TODO treat nominals differently!?? if (x.value(i) < m_NodeRanges[i][R_MIN]) { x.setValue(i, m_NodeRanges[i][R_MIN]); inside = false; } else if (x.value(i) > m_NodeRanges[i][R_MAX]) { x.setValue(i, m_NodeRanges[i][R_MAX]); inside = false; } } return inside; } /** * Assigns instances of this node to center. Center to be assign to * is decided by the distance function. * * @param ranges min's and max's of attributes * @param centers all the input centers * @param centList the list of centers to work with * @param assignments index list of last assignments */ public void assignSubToCenters(double [][] ranges, Instances centers, int [] centList, int [] assignments) throws Exception { //todo: undecided situations int numCent = centList.length; // WARNING: assignments is "input/output-parameter" // should not be null and the following should not happen if (assignments == null) { assignments = new int[m_Instances.numInstances()]; for (int i = 0; i < assignments.length; i++) { assignments[i] = -1; } } // set assignments for all instances of this node for (int i = m_Start; i <= m_End; i++) { int instIndex = m_InstList.get(i); Instance inst = m_Instances.instance(instIndex); //if (instList[i] == 664) System.out.println("664***"); int newC = m_DistanceFunction.closestPoint(inst, centers, centList); // int newC = clusterProcessedInstance(inst, centers); assignments[instIndex] = newC; } } /** * Find k nearest neighbours to target by simply searching through all instances * in the leaf. * No check on missing class. * @param target the instance to find nearest neighbour for * @param minDist the minimal distance found so far * @return the minimal distance found */ public double simpleKNearestNeighbour(Instance target, double minDist) throws Exception { double dist = 0; int currIndex; // sets and uses: // double m_MinDist // double m_MaxMinDist // int m_FurthestNear int i = 0; int index = m_Start; // if no instances, return max value as distance if (m_End < m_Start) return Double.MAX_VALUE; if (m_NearestListLength == 0) { for (;(index <= m_End) && (i < m_kNN);) { currIndex = m_InstList.get(index); Instance trainInstance = m_Instances.instance(m_InstList.get(index)); if (target != trainInstance) { // for hold-one-out cross-validation dist = m_DistanceFunction.distance(target, trainInstance); m_NearestList[i] = currIndex; m_DistanceList[i] = dist; if (dist < minDist) minDist = dist; i++; } index++; } m_NearestListLength = m_kNN; } /* System.out.print("dist: "); for (int j = 0; j < m_kNN; j++) { System.out.print(" " + m_DistanceList[j]); } System.out.println(); System.out.print("near: "); for (int j = 0; j < m_kNN; j++) { System.out.print(" " + m_NearestList[j]); } System.out.println(); */ // set the new furthest nearest m_FurthestNear = checkFurthestNear(); m_MaxMinDist = m_DistanceList[m_FurthestNear]; // check all or rest of instances if nearer for (; index <= m_End; index++) { currIndex = m_InstList.get(index); Instance trainInstance = m_Instances.instance(currIndex); if (target != trainInstance) { // for hold-one-out cross-validation dist = m_DistanceFunction.distance(target, trainInstance); // is instance one of the nearest? if (dist < m_MaxMinDist) { // set instance as one of the nearest, // replacing the last furthest nearest m_NearestList[m_FurthestNear] = currIndex; m_DistanceList[m_FurthestNear] = dist; if (m_MultipleFurthest) { // remove multiple entries of old furthest nearest m_NearestListLength = m_kNN; m_MultipleFurthest = false; } // set the new furthest nearest m_FurthestNear = checkFurthestNear(); m_MaxMinDist = m_DistanceList[m_FurthestNear]; // minimal value of distances did change too if (dist < minDist) minDist = dist; } else { if (dist == m_MaxMinDist) { // instance is at same distance as furthest nearest m_MultipleFurthest = true; m_NearestList[m_NearestListLength] = currIndex; m_DistanceList[m_NearestListLength] = dist; m_NearestListLength++; } } } } return minDist; } /** * Finds the nearest neighbour to target, this method is called recursively. * @param target the instance to find nearest neighbour for * @param maxDist the distance to the nearest neighbour so far * @return the minimal distance found */ private double kNearestNeighbour(Instance target, double maxDist) throws Exception { double newDist; KDTreeNode nearer, further; // if is a leaf then the instance is in this hyperrectangle if (this.isALeaf()) { // return distance to nearest (and index of all // all k nearest in m_NearestList) return this.simpleKNearestNeighbour(target, maxDist); } boolean targetInLeft = m_DistanceFunction.valueIsSmallerEqual( target, m_SplitDim, m_SplitValue); if (targetInLeft) { nearer = m_Left; further = m_Right; } else { nearer = m_Right; further = m_Left; } // look for nearer neighbours in nearer half maxDist = nearer.kNearestNeighbour(target, maxDist); // ... now look in further half if maxDist reaches into it Instance splitPoint = new Instance(target); splitPoint.setValue(m_SplitDim, m_SplitValue); double distanceToSplit = m_DistanceFunction.distance(target, splitPoint); boolean lookInSecondHalf = maxDist >= distanceToSplit; if (lookInSecondHalf) { // look for nearer neighbours in further half maxDist = further.kNearestNeighbour(target, maxDist); } return maxDist; } } // // End of class KDTreeNode /** * Adds one instance to KDTree loosly. It only changes the ranges. The ranges are * important for the distance function. * @param instance the new instance */ public void addLooslyInstance(Instance instance) { m_DistanceFunction.updateRanges(instance); } /** * Builds the KDTree. * @param instances instances to build the tree of */ public void buildKDTree(Instances instances) throws Exception { double [][] ranges = instances.initializeRanges(); buildKDTree(instances, ranges); } /** * Builds the KDTree. * It is adviseable to run the replace missing attributes filter on the * passed instances first. * @param instances instances to build the tree of * @ranges the ranges of this instances */ public void buildKDTree(Instances instances, double [][] ranges) throws Exception { m_Instances = instances; int numInst = m_Instances.numInstances(); // Make the global index list m_InstList = new DynamicArrayOfPosInt(numInst); for (int i = 0; i < numInst; i++) { m_InstList.set(i, i); } // make the tree starting with the roor node m_Root = new KDTreeNode(); if (ranges == null) { ranges = instances.initializeRanges(); } // set global ranges m_Universe = ranges; // set distance function m_DistanceFunction = new weka.core.EuclideanDistance(m_Instances, m_Universe); checkInstances(); // build the tree int [] num = new int[1]; num[0] = 0; m_Root.makeKDTreeNode(num, ranges, 0, // index of first instance index numInst - 1); // index of last instance index // tree is valid for instances m_Valid = true; } /** * Checks the instances. * No checks in this KDTree but it calls the check of the distance function. */ private void checkInstances () throws Exception { m_DistanceFunction.checkInstances(); } /** * Adds one instance to the KDTree. * @param instance the instance to be added */ public void updateKDTree(Instance instance) throws Exception { boolean success = m_Root.addInstance(instance); if (!success) { // make a new tree double [][] dummyRanges = null; buildKDTree(m_Instances, dummyRanges); } } /** * Deletes one instance in the KDTree. * @param instance the instance to be deleted */ public void deleteInstance(Instance instance) throws Exception { int index =0; // for (int deleteInstance(index); } /** * Deletes one instance in the KDTree. * @param index the index of the instance to be deleted * @return true if instance was deleted */ public boolean deleteInstance(int index) throws Exception { boolean success = false; int pos = m_InstList.deleteOneIndex(index); if (pos >= 0) { m_Root.tidyUpAfterDelete(pos); success = true; } if (!success) { // make a new tree double [][] dummyRanges = null; buildKDTree(m_Instances, dummyRanges); } return success; } /** * toString * @return string representing the tree */ public String toString() { StringBuffer text = new StringBuffer(); KDTreeNode tree = m_Root; int[] num = new int[1]; num[0] = 0; // index list in string format: //for (int i = 0; i <= m_InstList.length(); i++) { // int instIndex = m_InstList.get(i); // text.append(instIndex + "/ "); //} text.append("\nKDTree build:"); text.append(tree.statToString(true, true)); // tree in string format: text.append(tree.nodeToString(true)); return text.toString(); } /** * Assigns instances to centers using KDTree. * * @param centers the current centers * @param assignments the centerindex for each instance * @param pc the threshold value for pruning. * @param p True if pruning should be used. */ public void centerInstances(Instances centers, int [] assignments, double pc) throws Exception { int [] centList = new int[centers.numInstances()]; for (int i = 0; i < centers.numInstances(); i++) centList[i] = i; m_Root.determineAssignments(centers, centList, assignments, pc, m_Prune); } /** * Used for debug println's. * @param output string that is printed */ private void OOPS(String output) { System.out.println(output); } /** * Normalizes a given value of a numeric attribute. * * @param x the value to be normalized * @param i the attribute's index * @param r the ranges for each attribute */ private double norm(double x, int i, double[][] r) { if (Double.isNaN(r[i][0]) || (r[i][1]) == 0) { return 0; } else { return (x - r[i][0]) / (r[i][1]); } } /** * Returns array of boolean set true or false if instance is part * of next left kdtree. * @param left list of boolean values, true if instance belongs to left * @param instList list of indexes of instances of this node * @param splitDim index of splitting attribute * @param splitValue value at which the node is split * @return number of instances that belong to the left */ private int checkSplitInstances(boolean [] left, int [] instList, int splitDim, double splitValue) { // length of left should be same as length of instList int numLeft = 0; for (int i = 0; i < instList.length; i++) { // value <= splitValue if (m_DistanceFunction.valueIsSmallerEqual( m_Instances.instance(instList[i]), splitDim, splitValue)) { left[i] = true; numLeft++; } else { left[i] = false; } } return numLeft; } /** * Sorts instances newly into left and right part. * @param left list of flags, set true by this method if instance * should go to the left follow node * @param instList list of instances of this node * @param iLeft index to the left * @param iLeft index to the left */ private void splitInstances(boolean [] left, int [] instList, int iLeft, int iRight) { for (int i = 0; i < instList.length; i++) { if (left[i]) { m_InstList.set(iLeft++, instList[i]); } else { m_InstList.set(iRight++, instList[i]); } } } /** -------------------------------------------------------------------------------- ** variables for nearest neighbour search * --------------------------------------------------------------------------------*/ /** index/indices of current target */ private int [] m_NearestList; /** length of nearest list (can be larger than k) */ private int m_NearestListLength = 0; /** true if more than of k nearest neighbours */ private boolean m_MultipleFurthest = false; /** number of nearest neighbours k */ private int m_kNN = 0; /** distance to current nearest neighbour */ private double m_MinDist = Double.MAX_VALUE; /** distance to current furthest of the neighbours */ private double m_MaxMinDist = Double.MAX_VALUE; /** index of the furthest of the neighbours in m_NearestList */ private int m_FurthestNear = 0; /** distance to current nearest neighbour */ private double [] m_DistanceList; /** * Find k nearest neighbours to target. This is the main method. * * @param target the instance to find nearest neighbour for * @param maxDist the distance to the nearest neighbor so far * @param nearest the index of the nearest neighbor (second return value) */ public int findKNearestNeighbour(Instance target, int kNN, int [] nearestList, double [] distanceList) throws Exception { m_kNN = kNN; double maxDist = Double.MAX_VALUE; m_NearestList = nearestList; m_DistanceList = distanceList; m_NearestListLength = 0; int[] num = new int[1]; num[0] = 0; double minDist = m_Root.kNearestNeighbour(target, maxDist); return m_NearestListLength; } /** * Get the distance of the furthest of the nearest neighbour * the index of this instance * in the index list. * @param nearest index of k nearest instances */ private int checkFurthestNear() { double max = 0.0; int furthestNear = 0; for (int i = 0; i < m_kNN; i++) { if (m_DistanceList[i] > max) { max = m_DistanceList[i]; furthestNear = i; } } return furthestNear; } /** * the GET and SET - functions =============================================== **/ /** * Sets KDTree to be valid for dataset in m_Instances. * @param flag if KDtree is valid */ public void setValid(boolean valid) { m_Valid = valid; } /** * Returns true if valid flag is true. * @return flag if KDtree is valid */ public boolean isValid() { return m_Valid; } /** * Gets number of instances in KDTree. * @return number of instances */ public int numInstances() { return m_Instances.numInstances(); } /** * Gets instances used in the tree. * @return model information */ public Instances getInstances() { return m_Instances; } /** * Gets instance list used in the tree. * @return instance list */ public DynamicArrayOfPosInt getInstList() { return m_InstList; } /** * Gets the distance function specification string, * which contains the class name of distance function * the filter and any options to the filter * * @return the filter string. */ protected String getDistanceFunctionSpec() { DistanceFunction c = getDistanceFunction(); if (c instanceof OptionHandler) { return c.getClass().getName() + " " + Utils.joinOptions(((OptionHandler)c).getOptions()); } return c.getClass().getName(); } /** * Sets the distance function. * @param distanceF the distance function with all options set */ public void setDistanceFunction(DistanceFunction distanceF) { m_DistanceFunction = distanceF; } /** * Gets the distance function. * @return the distance function */ public DistanceFunction getDistanceFunction() { return m_DistanceFunction; } /** * Sets the minimum relative box width. * @param i the minimum relative box width */ public void setMinBoxRelWidth(double i) throws Exception { m_MinBoxRelWidth = i; } /** * Gets the minimum relative box width. * @return the minimum relative box width */ public double getMinBoxRelWidth() { return m_MinBoxRelWidth; } /** * Sets the maximum number of instances in a leaf. * @param i the maximum number of instances in a leaf */ public void setMaxInstInLeaf(int i) throws Exception { m_MaxInstInLeaf = i; } /** * Get the maximum number of instances in a leaf. * @return the maximum number of instances in a leaf */ public int getMaxInstInLeaf() { return m_MaxInstInLeaf; } /** * Sets the flag for pruning of the blacklisting algorithm. * @param p true to use pruning. */ public void setPrune(boolean p) { m_Prune = p; } /** * Gets the pruning flag. * @return True if pruning */ public boolean getPrune() { return m_Prune; } /** * Sets the flag for normalizing the widths of a KDTree Node by the width * of the dimension in the universe. * @param n true to use normalizing. */ public void setNormalize(boolean n) { m_Normalize = n; } /** * Gets the normalize flag. * @return True if normalizing */ public boolean getNormalize() { return m_Normalize; } /** * Sets the debug level. * debug level = 0, means no output * @param d debuglevel */ public void setDebugLevel(int d) { m_DebugLevel = d; } /** * Gets the debug level. * @return debug level */ public int getDebugLevel() { return m_DebugLevel; } /** * Returns an enumeration describing the available options. * @return an enumeration of all the available options. */ public Enumeration listOptions() { Vector newVector = new Vector(5); newVector.addElement(new Option( "\tPruning will be done\n" +"\t(Use this to prune).", "P", 0,"-P")); newVector.addElement(new Option( "\tSet minimal width of a box\n" +"\t(default = 1.0E-2).", "W", 0,"-W <value>")); newVector.addElement(new Option( "\tMaximal number of instances in a leaf\n" +"\t(default = 40).", "L", 0,"-L")); newVector.addElement(new Option( "\tDistance function\n" +"\t(default = Euclidean Distance).", "D", 0,"-D")); newVector.addElement(new Option( "\tNormalizing will be done\n" +"\t(Select dimension for split, with normalising to universe).", "N", 0,"-N")); return newVector.elements(); } /** * Parses a given list of options. * @param options the list of options as an array of strings * @exception Exception if an option is not supported * **/ public void setOptions(String[] options) throws Exception { if (Utils.getFlag('P', options)) { setPrune(true); } else { setPrune(false); } String optionString = Utils.getOption('W', options); if (optionString.length() != 0) { setMinBoxRelWidth(Double.parseDouble(optionString)); } optionString = Utils.getOption('L', options); if (optionString.length() != 0) { setMaxInstInLeaf(Integer.parseInt(optionString)); } String funcString = Utils.getOption('D', options); if (funcString.length() != 0) { String [] funcSpec = Utils.splitOptions(funcString); String funcName = funcSpec[0]; funcSpec[0] = ""; setDistanceFunction((DistanceFunction) Utils.forName(DistanceFunction.class, funcName, funcSpec)); } if (Utils.getFlag('N', options)) { setNormalize(true); } else { setNormalize(false); } optionString = Utils.getOption('U', options); int debugLevel = 0; if (optionString.length() != 0) { try { debugLevel = Integer.parseInt(optionString); } catch (NumberFormatException e) { throw new Exception(optionString + "is an illegal value for option U"); } } setDebugLevel(debugLevel); } /** * Gets the current settings of KDtree. * @return an array of strings suitable for passing to setOptions */ public String[] getOptions() { String[] options = new String[10]; int current = 0; if (getPrune()) { options[current++] = "-P"; } options[current++] = "-W"; options[current++] = "" + getMinBoxRelWidth(); options[current++] = "-L"; options[current++] = "" + getMaxInstInLeaf(); options[current++] = "-D"; options[current++] = "" + getDistanceFunctionSpec(); if (getNormalize()) { options[current++] = "-N"; } int dL = getDebugLevel(); if (dL > 0) { options[current++] = "-U"; options[current++] = "" + dL; } while (current < options.length) { options[current++] = ""; } return options; } /** * Main method for testing this class */ public static void main(String [] args) { try { if (args.length < 1 ) { System.err.println("Usage : weka.gui.visualize.VisualizePanel " +"<dataset> [<dataset> <dataset>...]"); System.exit(1); } }catch (Exception ex) { ex.printStackTrace(); System.err.println(ex.getMessage()); } } }