package tr.gov.ulakbim.jDenetX.clusterers.streamkm; /** * @author Marcel R. Ackermann, Christiane Lammersen, Marcus Maertens, Christoph Raupach, * Christian Sohler, Kamil Swierkot */ public class TreeCoreset { /** * datastructure representing a node within a tree */ protected class treeNode { //number of points in this node int n; //array with pointers on points Point[] points; //pointer on the centre of the treenode Point centre; //pointer on the left childnode treeNode lc; //pointer on the right childnode treeNode rc; //pointer on the parent node treeNode parent; //cost of the treenode double cost; void free() { this.parent = null; this.lc = null; this.rc = null; this.points = null; this.centre = null; } public treeNode(int n, Point[] points, Point centre, treeNode parent) { this.n = n; this.points = points; this.centre = centre; this.lc = null; this.rc = null; this.parent = parent; this.cost = treeNodeTargetFunctionValue(); } /** * initalizes root as a treenode with the union of setA and setB as pointset and centre as centre */ public treeNode(Point[] setA, Point[] setB, int n_1, int n_2, Point centre, int centreIndex) { //loop counter variable int i; //the root has no parent and no child nodes in the beginning this.parent = null; this.lc = null; this.rc = null; //array with points to the points this.points = new Point[n_1 + n_2]; this.n = n_1 + n_2; for (i = 0; i < this.n; i++) { if (i < n_1) { this.points[i] = setA[i]; this.points[i].centreIndex = centreIndex; } else { this.points[i] = setB[i - n_1]; this.points[i].centreIndex = centreIndex; } } //set the centre this.centre = centre; //calculate costs this.cost = treeNodeTargetFunctionValue(); } /** * Computes the target function value of the n points of the treenode. Differs from the function "targetFunctionValue" in three things: * <p/> * 1. only the centre of the treenode is used as a centre * <p/> * 2. works on arrays of pointers instead on arrays of points * <p/> * 3. stores the cost in the treenode */ double treeNodeTargetFunctionValue() { //loop counter variable int i; //stores the cost double sum = 0.0; for (i = 0; i < this.n; i++) { //stores the distance double distance = 0.0; //loop counter variable int l; for (l = 0; l < this.points[i].dimension; l++) { //centroid coordinate of the point double centroidCoordinatePoint; if (this.points[i].weight != 0.0) { centroidCoordinatePoint = this.points[i].coordinates[l] / this.points[i].weight; } else { centroidCoordinatePoint = this.points[i].coordinates[l]; } //centroid coordinate of the centre double centroidCoordinateCentre; if (this.centre.weight != 0.0) { centroidCoordinateCentre = this.centre.coordinates[l] / this.centre.weight; } else { centroidCoordinateCentre = this.centre.coordinates[l]; } distance += (centroidCoordinatePoint - centroidCoordinateCentre) * (centroidCoordinatePoint - centroidCoordinateCentre); } sum += distance * this.points[i].weight; } return sum; } } /** * computes the hypothetical cost if the node would be split with new centers centreA, centreB */ double treeNodeSplitCost(treeNode node, Point centreA, Point centreB) { //loop counter variable int i; //stores the cost double sum = 0.0; for (i = 0; i < node.n; i++) { //loop counter variable int l; //stores the distance between p and centreA double distanceA = 0.0; for (l = 0; l < node.points[i].dimension; l++) { //centroid coordinate of the point double centroidCoordinatePoint; if (node.points[i].weight != 0.0) { centroidCoordinatePoint = node.points[i].coordinates[l] / node.points[i].weight; } else { centroidCoordinatePoint = node.points[i].coordinates[l]; } //centroid coordinate of the centre double centroidCoordinateCentre; if (centreA.weight != 0.0) { centroidCoordinateCentre = centreA.coordinates[l] / centreA.weight; } else { centroidCoordinateCentre = centreA.coordinates[l]; } distanceA += (centroidCoordinatePoint - centroidCoordinateCentre) * (centroidCoordinatePoint - centroidCoordinateCentre); } //stores the distance between p and centreB double distanceB = 0.0; for (l = 0; l < node.points[i].dimension; l++) { //centroid coordinate of the point double centroidCoordinatePoint; if (node.points[i].weight != 0.0) { centroidCoordinatePoint = node.points[i].coordinates[l] / node.points[i].weight; } else { centroidCoordinatePoint = node.points[i].coordinates[l]; } //centroid coordinate of the centre double centroidCoordinateCentre; if (centreB.weight != 0.0) { centroidCoordinateCentre = centreB.coordinates[l] / centreB.weight; } else { centroidCoordinateCentre = centreB.coordinates[l]; } distanceB += (centroidCoordinatePoint - centroidCoordinateCentre) * (centroidCoordinatePoint - centroidCoordinateCentre); } //add the cost of the closest centre to the sum if (distanceA < distanceB) { sum += distanceA * node.points[i].weight; } else { sum += distanceB * node.points[i].weight; } } //return the total cost return sum; } /** * computes the cost of point p with the centre of treenode node */ double treeNodeCostOfPoint(treeNode node, Point p) { if (p.weight == 0.0) { return 0.0; } //stores the distance between centre and p double distance = 0.0; //loop counter variable int l; for (l = 0; l < p.dimension; l++) { //centroid coordinate of the point double centroidCoordinatePoint; if (p.weight != 0.0) { centroidCoordinatePoint = p.coordinates[l] / p.weight; } else { centroidCoordinatePoint = p.coordinates[l]; } //centroid coordinate of the centre double centroidCoordinateCentre; if (node.centre.weight != 0.0) { centroidCoordinateCentre = node.centre.coordinates[l] / node.centre.weight; } else { centroidCoordinateCentre = node.centre.coordinates[l]; } distance += (centroidCoordinatePoint - centroidCoordinateCentre) * (centroidCoordinatePoint - centroidCoordinateCentre); } return distance * p.weight; } /** * tests if a node is a leaf */ boolean isLeaf(treeNode node) { if (node.lc == null && node.rc == null) { return true; } else { return false; } } /** * selects a leaf node (using the kMeans++ distribution) */ treeNode selectNode(treeNode root, MTRandom clustererRandom) { //random number between 0 and 1 double random = clustererRandom.nextDouble(); while (!isLeaf(root)) { if (root.lc.cost == 0 && root.rc.cost == 0) { if (root.lc.n == 0) { root = root.rc; } else if (root.rc.n == 0) { root = root.lc; } else if (random < 0.5) { random = clustererRandom.nextDouble(); root = root.lc; } else { random = clustererRandom.nextDouble(); root = root.rc; } } else { if (random < root.lc.cost / root.cost) { root = root.lc; } else { root = root.rc; } } } return root; } /** * selects a new centre from the treenode (using the kMeans++ distribution) */ Point chooseCentre(treeNode node, MTRandom clustererRandom) { //How many times should we try to choose a centre ?? int times = 3; //stores the nodecost if node is split with the best centre double minCost = node.cost; Point bestCentre = null; //loop counter variable int i; int j; for (j = 0; j < times; j++) { //sum of the relativ cost of the points double sum = 0.0; //random number between 0 and 1 double random = clustererRandom.nextDouble(); for (i = 0; i < node.n; i++) { sum += treeNodeCostOfPoint(node, node.points[i]) / node.cost; if (sum >= random) { if (node.points[i].weight == 0.0) { //printf("ERROR: CHOOSEN DUMMY NODE THOUGH OTHER AVAILABLE \n"); return null; } double curCost = treeNodeSplitCost(node, node.centre, node.points[i]); if (curCost < minCost) { bestCentre = node.points[i]; minCost = curCost; } break; } } } if (bestCentre == null) { return node.points[0]; } else { return bestCentre; } } /** * returns the next centre */ Point determineClosestCentre(Point p, Point centreA, Point centreB) { //loop counter variable int l; //stores the distance between p and centreA double distanceA = 0.0; for (l = 0; l < p.dimension; l++) { //centroid coordinate of the point double centroidCoordinatePoint; if (p.weight != 0.0) { centroidCoordinatePoint = p.coordinates[l] / p.weight; } else { centroidCoordinatePoint = p.coordinates[l]; } //centroid coordinate of the centre double centroidCoordinateCentre; if (centreA.weight != 0.0) { centroidCoordinateCentre = centreA.coordinates[l] / centreA.weight; } else { centroidCoordinateCentre = centreA.coordinates[l]; } distanceA += (centroidCoordinatePoint - centroidCoordinateCentre) * (centroidCoordinatePoint - centroidCoordinateCentre); } //stores the distance between p and centreB double distanceB = 0.0; for (l = 0; l < p.dimension; l++) { //centroid coordinate of the point double centroidCoordinatePoint; if (p.weight != 0.0) { centroidCoordinatePoint = p.coordinates[l] / p.weight; } else { centroidCoordinatePoint = p.coordinates[l]; } //centroid coordinate of the centre double centroidCoordinateCentre; if (centreB.weight != 0.0) { centroidCoordinateCentre = centreB.coordinates[l] / centreB.weight; } else { centroidCoordinateCentre = centreB.coordinates[l]; } distanceB += (centroidCoordinatePoint - centroidCoordinateCentre) * (centroidCoordinatePoint - centroidCoordinateCentre); } //return the nearest centre if (distanceA < distanceB) { return centreA; } else { return centreB; } } /** * splits the parent node and creates two child nodes (one with the old centre and one with the new one) */ void split(treeNode parent, Point newCentre, int newCentreIndex) { //loop counter variable int i; //1. Counts how many points belong to the new and how many points belong to the old centre int nOld = 0; int nNew = 0; for (i = 0; i < parent.n; i++) { Point centre = determineClosestCentre(parent.points[i], parent.centre, newCentre); if (centre == newCentre) { nNew++; } else { nOld++; } } //2. initalizes the arrays for the pointer //array for pointer on the points belonging to the old centre Point[] oldPoints = new Point[nOld]; //array for pointer on the points belonging to the new centre Point[] newPoints = new Point[nNew]; int indexOld = 0; int indexNew = 0; for (i = 0; i < parent.n; i++) { Point centre = determineClosestCentre(parent.points[i], parent.centre, newCentre); if (centre == newCentre) { newPoints[indexNew] = parent.points[i]; newPoints[indexNew].centreIndex = newCentreIndex; indexNew++; } else if (centre == parent.centre) { oldPoints[indexOld] = parent.points[i]; indexOld++; } else { //printf("ERROR !!! NO CENTER NEAREST !! \n"); } } //left child: old centre treeNode lc = new treeNode(nOld, oldPoints, parent.centre, parent); /*lc.centre = parent.centre; lc.points = oldPoints; lc.n = nOld; lc.lc = null; lc.rc = null; lc.parent = parent; treeNodeTargetFunctionValue(lc);*/ //right child: new centre treeNode rc = new treeNode(nNew, newPoints, newCentre, parent); /*rc.centre = newCentre; rc.points = newPoints; rc.n = nNew; rc.lc = null; rc.rc = null; rc.parent = parent; treeNodeTargetFunctionValue(rc);*/ //set childs of the parent node parent.lc = lc; parent.rc = rc; //propagate the cost changes to the parent nodes while (parent != null) { parent.cost = parent.lc.cost + parent.rc.cost; parent = parent.parent; } } /** * Checks if the storage is completly freed */ boolean treeFinished(treeNode root) { return (root.parent == null && root.lc == null && root.rc == null); } /** * frees a tree of its storage */ void freeTree(treeNode root) { while (!treeFinished(root)) { if (root.lc == null && root.rc == null) { root = root.parent; } else if (root.lc == null && root.rc != null) { //Schau ob rc ein Blatt ist if (isLeaf(root.rc)) { //Gebe rechtes Kind frei root.rc.free(); root.rc = null; } else { //Fahre mit rechtem Kind fort root = root.rc; } } else if (root.lc != null) { if (isLeaf(root.lc)) { root.lc.free(); root.lc = null; } else { root = root.lc; } } } root.free(); } /** * Constructs a coreset of size k from the union of setA and setB */ void unionTreeCoreset(int k, int n_1, int n_2, int d, Point[] setA, Point[] setB, Point[] centres, MTRandom clustererRandom) { //printf("Computing coreset...\n"); //total number of points int n = n_1 + n_2; //choose the first centre (each point has the same probability of being choosen) //stores, how many centres have been choosen yet int choosenPoints = 0; //only choose from the n-i points not already choosen int j = clustererRandom.nextInt(n - choosenPoints); //copy the choosen point if (j < n_1) { //copyPointWithoutInit(&setA[j],¢res[choosenPoints]); centres[choosenPoints] = setA[j].clone(); } else { j = j - n_1; //copyPointWithoutInit(&setB[j],¢res[choosenPoints]); centres[choosenPoints] = setB[j].clone(); } treeNode root = new treeNode(setA, setB, n_1, n_2, centres[choosenPoints], choosenPoints); //?? choosenPoints = 1; //choose the remaining points while (choosenPoints < k) { if (root.cost > 0.0) { treeNode leaf = selectNode(root, clustererRandom); Point centre = chooseCentre(leaf, clustererRandom); split(leaf, centre, choosenPoints); //copyPointWithoutInit(centre,¢res[choosenPoints]); centres[choosenPoints] = centre; } else { //create a dummy point //copyPointWithoutInit(root.centre,¢res[choosenPoints]); centres[choosenPoints] = root.centre; int l; for (l = 0; l < root.centre.dimension; l++) { centres[choosenPoints].coordinates[l] = -1 * 1000000; } centres[choosenPoints].id = -1; centres[choosenPoints].weight = 0.0; centres[choosenPoints].squareSum = 0.0; } choosenPoints++; } //free the tree freeTree(root); //recalculate clustering features int i; for (i = 0; i < n; i++) { if (i < n_1) { int index = setA[i].centreIndex; if (centres[index].id != setA[i].id) { centres[index].weight += setA[i].weight; centres[index].squareSum += setA[i].squareSum; int l; for (l = 0; l < centres[index].dimension; l++) { if (setA[i].weight != 0.0) { centres[index].coordinates[l] += setA[i].coordinates[l]; } } } } else { int index = setB[i - n_1].centreIndex; if (centres[index].id != setB[i - n_1].id) { centres[index].weight += setB[i - n_1].weight; centres[index].squareSum += setB[i - n_1].squareSum; int l; for (l = 0; l < centres[index].dimension; l++) { if (setB[i - n_1].weight != 0.0) { centres[index].coordinates[l] += setB[i - n_1].coordinates[l]; } } } } } } }