package kmeans;
import java.util.Arrays;
import com.ibm.apgas.Pool;
import com.ibm.apgas.Task;
/**
 * K-means clustering benchmark written against the APGAS {@code Pool}/{@code Task} API.
 *
 * <p>The input points are split across places. On every iteration each place assigns
 * its local points to the nearest current centroid, accumulates per-cluster coordinate
 * sums and counts, and ships those partial results to place 0, which turns the
 * accumulated sums into the next set of centroids.
 *
 * <p>All working state lives in static fields. NOTE(review): this only works if every
 * place has its own copy of these statics (e.g. one JVM per place) -- confirm against
 * the APGAS runtime in use.
 */
public class KMeansAPGAS {
    // Statics used to hold place-local data needed by asyncs

    /** Local points, flattened: point p, dimension d is at {@code p*numDimensions + d}. */
    private static float[] points;
    /** Number of local points assigned to each cluster in the current iteration. */
    private static int[] clusterCounts;
    /** Index of the closest cluster for each local point. */
    private static int[] closestCluster;
    /** Accumulated per-cluster coordinate sums received from the places (allocated in main). */
    private static float[] incomingClusterPoints;
    /** Accumulated per-cluster point counts received from the places (allocated in main). */
    private static int[] incomingClusterCounts;
    /** Number of clusters. */
    private static int myK;
    /** Number of points held locally. */
    private static int numPoints;
    /** Number of coordinates per point. */
    private static int numDimensions;

    // per iteration timing data
    /** Index of the next iteration to be executed by {@link #computeNewLocalClusters}. */
    private static int iteration;
    private static long[] kernelNanos;
    private static long[] kernel2Nanos;
    private static long[] allToAllNanos;
    private static long[] localReduceNanos;

    /**
     * Initialize data structures needed for the kMeans computation.
     *
     * @param _myK           number of clusters
     * @param numIterations  number of iterations that will be run; sizes the timing arrays
     * @param _numPoints     number of points stored in {@code _points}
     * @param _numDimensions number of coordinates per point
     * @param _points        flattened point coordinates (stored by reference, not copied)
     */
    public static void initialize(int _myK, int numIterations, int _numPoints, int _numDimensions, float[] _points) {
        points = _points;
        myK = _myK;
        numPoints = _numPoints;
        numDimensions = _numDimensions;
        clusterCounts = new int[myK];
        closestCluster = new int[numPoints];
        kernelNanos = new long[numIterations];
        kernel2Nanos = new long[numIterations];
        allToAllNanos = new long[numIterations];
        localReduceNanos = new long[numIterations];
        // The timing arrays are fresh, so the iteration cursor must restart with them.
        iteration = 0;
    }

    // Declared but never called anywhere in this class.
    private static native void nativeInit(int myK, int numPoints, int numDimensions, float[] points);

    /**
     * Given the current cluster, compute a new cluster.
     *
     * <p>Phase 1 finds the closest centroid for every local point. Phase 2 overwrites
     * {@code clusterPoints} with the per-cluster coordinate sums of the local points and
     * fills {@code clusterCounts}. Phase 3 hands both arrays to an async on place 0 that
     * adds them into the incoming buffers. Each phase is timed into the slot selected by
     * {@code iteration}, which is advanced by one before returning.
     *
     * @param clusterPoints flattened current centroids ({@code myK * numDimensions} floats);
     *                      overwritten in place with the local coordinate sums
     */
    public static void computeNewLocalClusters(float[] clusterPoints) {
        kernelNanos[iteration] = - System.nanoTime();
        for (int pointNumber = 0; pointNumber < numPoints; pointNumber++) {
            final int pointBase = pointNumber * numDimensions;
            int closest = -1;
            float closestDist = Float.MAX_VALUE;
            for (int k = 0; k < myK; k++) {
                final int clusterBase = k * numDimensions;
                // Squared Euclidean distance; the square root is not needed for comparison.
                float dist = 0;
                for (int dim = 0; dim < numDimensions; dim++) {
                    float tmp = clusterPoints[clusterBase + dim] - points[pointBase + dim];
                    dist += tmp*tmp;
                }
                if (dist < closestDist) {
                    closestDist = dist;
                    closest = k;
                }
            }
            closestCluster[pointNumber] = closest;
        }
        long now = System.nanoTime();
        kernelNanos[iteration] += now;
        kernel2Nanos[iteration] = -now;

        // Now that we know the closest cluster for each point, compute the new cluster centers
        Arrays.fill(clusterCounts, 0);
        Arrays.fill(clusterPoints, 0.0f);
        for (int pointNumber = 0; pointNumber < numPoints; pointNumber++) {
            final int closest = closestCluster[pointNumber];
            final int clusterBase = closest * numDimensions;
            final int pointBase = pointNumber * numDimensions;
            for (int dim = 0; dim < numDimensions; dim++) {
                clusterPoints[clusterBase + dim] += points[pointBase + dim];
            }
            clusterCounts[closest]++;
        }
        now = System.nanoTime();
        kernel2Nanos[iteration] += now;
        allToAllNanos[iteration] = -now;

        // This really should be an all-to-all collective using Teams, but instead we
        // will do a point-to-point send to place 0, which will receive the
        // data, accumulate it, reduce it, then scatter it back as the argument
        // to an async to start the next iteration.
        final float[] outgoingClusterPoints = clusterPoints;
        final int[] outgoingClusterCounts = clusterCounts;
        Pool.runAsync(0, new Task(){
            public void body() {
                // Each buffer is locked while it is updated so that contributions
                // arriving concurrently do not interleave their additions.
                synchronized(incomingClusterPoints) {
                    for (int i = 0; i < outgoingClusterPoints.length; i++) {
                        incomingClusterPoints[i] += outgoingClusterPoints[i];
                    }
                }
                synchronized(incomingClusterCounts) {
                    for (int i = 0; i < outgoingClusterCounts.length; i++) {
                        incomingClusterCounts[i] += outgoingClusterCounts[i];
                    }
                }
            }});
        // Only the time taken to launch the async is measured here.
        now = System.nanoTime();
        allToAllNanos[iteration] += now;
        iteration += 1;
    }

    /** Converts a duration in nanoseconds to milliseconds. */
    private static double toMillis(long nanoTime) {
        return ((double)nanoTime)/1e6;
    }

    /**
     * Writes the new centroids into {@code centroids}: the coordinate sum divided by the
     * point count for every non-empty cluster. A cluster that received no points keeps
     * its previous centroid instead of becoming NaN (0/0), which would otherwise make it
     * unreachable for every later iteration.
     *
     * @param centroids destination, {@code counts.length * dims} floats
     * @param previous  centroids used for the iteration that produced {@code sums}
     * @param sums      accumulated per-cluster coordinate sums
     * @param counts    accumulated per-cluster point counts
     * @param dims      number of coordinates per point
     */
    private static void updateCentroids(float[] centroids, float[] previous, float[] sums, int[] counts, int dims) {
        for (int k = 0; k < counts.length; k++) {
            final int base = k * dims;
            if (counts[k] > 0) {
                final float divisor = (float) counts[k];
                for (int dim = 0; dim < dims; dim++) {
                    centroids[base + dim] = sums[base + dim] / divisor;
                }
            } else {
                System.arraycopy(previous, base, centroids, base, dims);
            }
        }
    }

    /** Prints the per-iteration phase timings recorded in the static timing arrays, then their totals. */
    private static void printTimings(int iterations) {
        long totalKernelNanos = 0;
        long totalKernel2Nanos = 0;
        long totalAllToAllNanos = 0;
        long totalLocalReduceNanos = 0;
        System.out.println("Per iteration phase timings (kernel1, kernel2, collective, localReduce)");
        for (int i = 0; i < iterations; i++) {
            System.out.printf("%3.5f %3.5f %3.5f %3.5f\n",toMillis(kernelNanos[i]), toMillis(kernel2Nanos[i]), toMillis(allToAllNanos[i]), toMillis(localReduceNanos[i]));
            totalKernelNanos += kernelNanos[i];
            totalKernel2Nanos += kernel2Nanos[i];
            totalAllToAllNanos += allToAllNanos[i];
            totalLocalReduceNanos += localReduceNanos[i];
        }
        System.out.println("-------------------------------------------------------");
        System.out.printf("%3.5f %3.5f %3.5f %3.5f\n", toMillis(totalKernelNanos), toMillis(totalKernel2Nanos), toMillis(totalAllToAllNanos), toMillis(totalLocalReduceNanos));
    }

    /**
     * Program entry point.
     *
     * <p>Arguments: {@code -k <clusters>} (default 4), {@code -i <iterations>}
     * (default 50); any other argument is taken as the input file name
     * (default {@code points.dat}).
     *
     * @param args command-line arguments
     */
    public static void main (final String[] args) {
        Pool p = new Pool(new Task() {
            public void body() {
                String fileName = "points.dat";
                int K = 4;
                int iterations = 50;
                int argIndex = 0;
                while (argIndex < args.length) {
                    String arg = args[argIndex++];
                    if (arg.equals("-k")) {
                        K = Integer.parseInt(args[argIndex++]);
                    } else if (arg.equals("-i")) {
                        iterations = Integer.parseInt(args[argIndex++]);
                    } else {
                        fileName = arg;
                    }
                }

                final KMeansDataSet data = KMeansDataSet.readPointsFromFile(fileName);
                final int dims = data.numDimensions;

                // The first K points of the data set serve as the initial centroids.
                final float[] currentCluster = new float[K*dims];
                System.arraycopy(data.points, 0, currentCluster, 0, currentCluster.length);
                // Snapshot buffer for the centroids used by the iteration in flight.
                final float[] previousCluster = new float[currentCluster.length];

                // Only place zero needs these arrays (incoming buffers for "collective")
                incomingClusterCounts = new int[K];
                incomingClusterPoints = new float[K*dims];

                // Initialize all places, evenly splitting data.points between them
                final int placeCount = Pool.numPlaces();
                final int pointsPerPlace = data.numPoints/placeCount;
                final int capturedK = K;
                final int capturedIterations = iterations;
                Pool.runFinish(new Task(){
                    public void body() {
                        for (int i = 0; i < placeCount; i++) {
                            final int start = i * pointsPerPlace;
                            // The last place also takes the remainder left by the integer
                            // division, so that no point is silently dropped.
                            final int stop = (i == placeCount - 1)
                                    ? data.numPoints - 1
                                    : start + pointsPerPlace - 1;
                            final int localNumPoints = stop - start + 1;
                            final float[] localPoints = new float[localNumPoints*dims];
                            // start is a point index; data.points is indexed in floats.
                            System.arraycopy(data.points, start*dims, localPoints, 0, localPoints.length);
                            System.out.println("Sending points "+start+"..."+stop+" to place "+i);
                            Pool.runAsync(i, new Task(){
                                public void body() {
                                    initialize(capturedK, capturedIterations, localNumPoints, dims, localPoints);
                                }});
                        }
                    }});

                // Do the requested number of iterations.
                long start = System.nanoTime();
                for (int iter = 0; iter < iterations; iter++) {
                    // computeNewLocalClusters overwrites the array it is given, so keep a
                    // copy of the centroids for clusters that end up with no points.
                    System.arraycopy(currentCluster, 0, previousCluster, 0, currentCluster.length);
                    Pool.atEach(new Task(){
                        public void body() {
                            computeNewLocalClusters(currentCluster);
                        }});

                    // local reduction to get the new clusters
                    // Adjust cluster coordinates by dividing each point value
                    // by the number of points in the cluster
                    localReduceNanos[iter] = -System.nanoTime();
                    updateCentroids(currentCluster, previousCluster, incomingClusterPoints, incomingClusterCounts, dims);
                    Arrays.fill(incomingClusterCounts, 0);
                    Arrays.fill(incomingClusterPoints, 0.0f);
                    localReduceNanos[iter] += System.nanoTime();
                }
                long stop = System.nanoTime();

                // All done. Print the results
                for (int k = 0; k < K; k++) {
                    for (int j = 0; j < dims; j++) {
                        if (j>0) System.out.print(" ");
                        System.out.print(currentCluster[k*dims+j]);
                    }
                    System.out.println();
                }
                System.out.println();
                printTimings(iterations);
                System.out.println("Total time (seconds)"+((double)(stop-start)/1e9));
            }
        });
        p.start();
    }
}