package samples.expert; import water.Job; import water.Key; import water.MRTask2; import water.fvec.*; import water.util.Utils; import java.io.File; import java.text.DecimalFormat; import java.util.Random; /** * Simplified version of H2O's k-means algorithm. Shows how to write your own algorithm. */ public class MapReduceKMeans extends Job { public static void main(String[] args) throws Exception { samples.launchers.CloudProcess.launch(MapReduceKMeans.class, 2); } @Override protected void execImpl() { // Load and parse a file. Data is distributed to other nodes in a round-robin way Key file = NFSFileVec.make(new File("../lib/resources/datasets/gaussian.csv")); Frame frame = ParseDataset2.parse(Key.make("test"), new Key[] { file }); // Optionally create a frame with less columns, e.g. skip first frame = new Frame(Utils.remove(frame._names, 0), Utils.remove(frame.vecs(), 0)); // Create k clusters as arrays of doubles int k = 7; double[][] clusters = new double[k][frame.vecs().length]; // Initialize first cluster to random row Random rand = new Random(); for( int cluster = 0; cluster < clusters.length; cluster++ ) { long row = Math.max(0, (long) (rand.nextDouble() * frame.vecs().length) - 1); for( int i = 0; i < frame.vecs().length; i++ ) { Vec v = frame.vecs()[i]; clusters[cluster][i] = v.at(row); } } // Iterate over the dataset and show error for each step for( int i = 0; i < 10; i++ ) { KMeans task = new KMeans(); task.clusters = clusters; task.doAll(frame); for( int c = 0; c < clusters.length; c++ ) { if( task.counts[c] > 0 ) { for( int v = 0; v < frame.vecs().length; v++ ) { double value = task.sums[c][v] / task.counts[c]; clusters[c][v] = value; } } } System.out.println("Error is " + task.error); } System.out.println("Clusters:"); DecimalFormat df = new DecimalFormat("#.00"); for( int c = 0; c < clusters.length; c++ ) { for( int v = 0; v < frame.vecs().length; v++ ) System.out.print(df.format(clusters[c][v]) + ", "); System.out.println(""); } } /** * For more complex tasks like this one, it is useful to marks fields that are provided by the * caller (IN), and fields generated by the task (OUT). IN fields can then be set to null when the * task is done using them, so that they do not get serialized back to the caller. */ public static class KMeans extends MRTask2<KMeans> { double[][] clusters; // IN: Centroids/clusters double[][] sums; // OUT: Sum of features in each cluster int[] counts; // OUT: Count of rows in cluster double error; // OUT: Total sqr distance @Override public void map(Chunk[] chunks) { sums = new double[clusters.length][chunks.length]; counts = new int[clusters.length]; // Find nearest cluster for each row for( int row = 0; row < chunks[0]._len; row++ ) { int nearest = -1; double minSqr = Double.MAX_VALUE; for( int cluster = 0; cluster < clusters.length; cluster++ ) { double sqr = 0; // Sum of dimensional distances for( int column = 0; column < chunks.length; column++ ) { double delta = chunks[column].at0(row) - clusters[cluster][column]; sqr += delta * delta; } if( sqr < minSqr ) { nearest = cluster; minSqr = sqr; } } error += minSqr; // Add values and increment counter for chosen cluster for( int column = 0; column < chunks.length; column++ ) sums[nearest][column] += chunks[column].at0(row); counts[nearest]++; } clusters = null; } @Override public void reduce(KMeans task) { for( int cluster = 0; cluster < counts.length; cluster++ ) { for( int column = 0; column < sums[0].length; column++ ) sums[cluster][column] += task.sums[cluster][column]; counts[cluster] += task.counts[cluster]; } error += task.error; } } }