package tr.gov.ulakbim.jDenetX.clusterers.streamkm; import tr.gov.ulakbim.jDenetX.cluster.Cluster; import tr.gov.ulakbim.jDenetX.cluster.Clustering; import tr.gov.ulakbim.jDenetX.clusterers.AbstractClusterer; import tr.gov.ulakbim.jDenetX.core.Measurement; import tr.gov.ulakbim.jDenetX.options.IntOption; import weka.core.Instance; /** * @author Marcel R. Ackermann, Christiane Lammersen, Marcus Maertens, Christoph Raupach, * Christian Sohler, Kamil Swierkot */ public class StreamKM extends AbstractClusterer { public IntOption sizeCoresetOption = new IntOption("sizeCoreset", 's', "Size of the coreset.", 100); public IntOption numClustersOption = new IntOption( "numClusters", 'k', "Number of clusters to compute.", 5); public IntOption widthOption = new IntOption("width", 'w', "Size of Window for training learner.", 1000, 0, Integer.MAX_VALUE); public IntOption randomSeedOption = new IntOption("randomSeed", 'r', "Seed for random behaviour of the classifier.", 1); protected MTRandom clustererRandom; protected Point[] centresStreamingCoreset; protected int numberInstances; protected int dimension; protected int length; protected int numberOfCentres; protected int coresetsize; protected BucketManager manager; protected boolean initialized = false; protected boolean clustersComputed = false; //protected Point[] points; protected Point[] tmpCentresStreamingCoreset; private final static double THRESHOLD = 1.000; @Override public void resetLearningImpl() { this.initialized = false; this.coresetsize = sizeCoresetOption.getValue(); this.numberOfCentres = numClustersOption.getValue(); this.length = widthOption.getValue(); this.centresStreamingCoreset = new Point[this.numberOfCentres]; //initalize random generator with seed this.clustererRandom = new MTRandom(this.randomSeedOption.getValue()); //this.points = new Point[1000]; } @Override public void trainOnInstanceImpl(Instance inst) { if (this.initialized == false) { this.dimension = inst.numAttributes(); manager = new BucketManager(this.length, this.dimension, this.coresetsize, this.clustererRandom); this.initialized = true; } manager.insertPoint(new Point(inst, this.numberInstances)); //this.points[this.numberInstances % widthOption.getValue()] = new Point(inst, this.numberInstances); //if ((this.numberInstances < 15) ) // System.out.println(this.points[this.numberInstances].coordinates[0]+" "+this.points[this.numberInstances].coordinates[1]); this.numberInstances++; if ((this.numberInstances) % widthOption.getValue() == 0) { this.clustersComputed = true; Point[] streamingCoreset = manager.getCoresetFromManager(dimension); //compute 5 clusterings of the coreset with kMeans++ and take the best double minCost = 0.0; double curCost = 0.0; minCost = lloydPlusPlus(numberOfCentres, coresetsize, dimension, streamingCoreset); curCost = minCost; centresStreamingCoreset = this.tmpCentresStreamingCoreset.clone(); for (int i = 1; i < 5; i++) { curCost = lloydPlusPlus(numberOfCentres, coresetsize, dimension, streamingCoreset); //System.out.println(i+" "+curCost+" "+tmpCentresStreamingCoreset.length); if (curCost < minCost) { minCost = curCost; centresStreamingCoreset = this.tmpCentresStreamingCoreset.clone(); } } } } @Override protected Measurement[] getModelMeasurementsImpl() { throw new UnsupportedOperationException("Not supported yet."); } @Override public void getModelDescription(StringBuilder out, int indent) { throw new UnsupportedOperationException("Not supported yet."); } public boolean isRandomizable() { return true; } public double[] getVotesForInstance(Instance inst) { throw new UnsupportedOperationException("Not supported yet."); } @Override public Clustering getClusteringResult() { if (!this.clustersComputed) { return new Clustering(new Cluster[0]); } Cluster[] res = new Cluster[centresStreamingCoreset.length]; for (int i = 0; i < centresStreamingCoreset.length; i++) { res[i] = centresStreamingCoreset[i].toCluster(); //System.out.println(i+" "+res[i].getCenter()[0]+" "+res[i].getCenter()[1]+" "); } return new Clustering(res); } public double lloydPlusPlus(int k, int n, int d, Point points[]) { //System.out.println("starting kMeans++"); //choose random centres this.tmpCentresStreamingCoreset = chooseRandomCentres(k, n, d, points); double cost = targetFunctionValue(k, n, this.tmpCentresStreamingCoreset, points); double newCost = cost; Point[] massCentres = new Point[k]; double[] numberOfPoints = new double[k]; do { cost = newCost; //reset centres of mass int i = 0; for (i = 0; i < k; i++) { massCentres[i] = new Point(d); numberOfPoints[i] = 0.0; } //compute centres of mass for (i = 0; i < n; i++) { int centre = points[i].determineClusterCentreKMeans(k, this.tmpCentresStreamingCoreset); for (int l = 0; l < massCentres[centre].dimension; l++) { if (points[i].weight != 0.0) massCentres[centre].coordinates[l] += points[i].coordinates[l]; } numberOfPoints[centre] += points[i].weight; } //move centres for (i = 0; i < k; i++) { for (int l = 0; l < this.tmpCentresStreamingCoreset[i].dimension; l++) { this.tmpCentresStreamingCoreset[i].coordinates[l] = massCentres[i].coordinates[l]; this.tmpCentresStreamingCoreset[i].weight = numberOfPoints[i]; } } //calculate costs newCost = targetFunctionValue(k, n, this.tmpCentresStreamingCoreset, points); //System.out.println("old cost: "+cost+", new cost: "+newCost); } while (newCost < THRESHOLD * cost); //System.out.println("Centres: \n"); int i = 0; for (i = 0; i < k; i++) { //System.out.print("("); int l = 0; for (l = 0; l < this.tmpCentresStreamingCoreset[i].dimension; l++) { // System.out.print(this.tmpCentresStreamingCoreset[i].coordinates[l] / this.tmpCentresStreamingCoreset[i].weight); this.tmpCentresStreamingCoreset[i].coordinates[l] /= this.tmpCentresStreamingCoreset[i].weight; // System.out.print(","); } //System.out.println(")"); } //System.out.println("kMeans++ finished"); //System.out.println(i+" "+newCost+" "+this.tmpCentresStreamingCoreset.length);*/ return newCost; } private Point[] chooseRandomCentres(int k, int n, int d, Point points[]) { //array to store the choosen centres Point[] centres = new Point[k]; //choose the first centre (each point has the same probability of being choosen) int i = 0; int next = 0; int j = 0; do { //only choose from the n-i points not already choosen next = this.clustererRandom.nextInt(n - 1); //check if the choosen point is not a dummy } while (points[next].weight < 1); //set j to next unchoosen point j = next; //copy the choosen point to the array centres[i] = points[j].clone(); //set the current centre for all points to the choosen centre for (i = 0; i < n; i++) { points[i].centreIndex = 0; points[i].curCost = points[i].costOfPointToCenter(centres[0]); } //choose centre 1 to k-1 with the kMeans++ distribution for (i = 1; i < k; i++) { double cost = 0.0; for (j = 0; j < n; j++) { cost += points[j].curCost; } double random = 0; double sum = 0.0; int pos = -1; do { random = this.clustererRandom.nextDouble();//genrand_real3(); sum = 0.0; pos = -1; for (j = 0; j < n; j++) { sum = sum + points[j].curCost; if (random <= sum / cost) { pos = j; break; } } } while (points[pos].weight < 1); //copy the choosen centre centres[i] = points[pos].clone(); //check which points are closest to the new centre for (j = 0; j < n; j++) { double newCost = points[j].costOfPointToCenter(centres[i]); if (points[j].curCost > newCost) { points[j].curCost = newCost; points[j].centreIndex = i; } } } /*printf("random centres: \n"); for(i = 0; i < k; i++){ //printf("%d: (",i); int l = 0; for(l = 0; l < centres[i].dimension; l++){ printf("%f,",centres[i].coordinates[l] / centres[i].weight); } printf(")\n"); }*/ return centres; } /** * computes the target function for the given pointarray points[] (of size n) with the given array of * centres centres[] (of size k) */ public double targetFunctionValue(int k, int n, Point[] centres, Point[] points) { int i = 0; double sum = 0.0; for (i = 0; i < n; i++) { double nearestCost = -1.0; int j = 0; for (j = 0; j < k; j++) { double distance = 0.0; int l = 0; for (l = 0; l < points[i].dimension; l++) { //Centroid coordinate of the point double centroidCoordinatePoint; if (points[i].weight != 0.0) { centroidCoordinatePoint = points[i].coordinates[l] / points[i].weight; } else { centroidCoordinatePoint = points[i].coordinates[l]; } //Centroid coordinate of the centre double centroidCoordinateCentre; if (centres[j].weight != 0.0) { centroidCoordinateCentre = centres[j].coordinates[l] / centres[j].weight; } else { centroidCoordinateCentre = centres[j].coordinates[l]; } distance += (centroidCoordinatePoint - centroidCoordinateCentre) * (centroidCoordinatePoint - centroidCoordinateCentre); } if (nearestCost < 0 || distance < nearestCost) { nearestCost = distance; } } sum += nearestCost * points[i].weight; } return sum; } }