package hex;

import water.*;
import water.Job.ColumnsJob;
import water.api.DocGen;
import water.api.Progress2;
import water.api.Request;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.util.Log;
import water.util.RString;
import water.util.Utils;

import java.util.ArrayList;
import java.util.Random;

/**
 * Scalable K-Means++ (KMeans||)<br>
 * http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf<br>
 * http://www.youtube.com/watch?v=cigXAxV3XcY
 */
public class KMeans2 extends ColumnsJob {
  static final int API_WEAVER = 1;
  static public DocGen.FieldDoc[] DOC_FIELDS;
  static final String DOC_GET = "k-means";

  public enum Initialization { None, PlusPlus, Furthest }

  @API(help = "Cluster initialization: None - chooses initial centers at random; PlusPlus - chooses the first center at random, then draws subsequent centers from a probability distribution weighted so that points farther from the centers chosen so far are more likely to be selected; Furthest - chooses the initial point at random, then takes each subsequent point as the one farthest from the points chosen so far.", filter = Default.class, json = true)
  // Furthest is the default: it gives better results on hard cases, especially with
  // just one trial in the browser.  PlusPlus can be biased, so Furthest can be best,
  // again especially with just one trial.  Random should never beat Furthest.
  public Initialization initialization = Initialization.Furthest;

  @API(help = "Number of clusters", required = true, filter = Default.class, lmin = 1, lmax = 100000, json = true)
  public int k = 2;

  @API(help = "Maximum number of iterations before stopping", required = true, filter = Default.class, lmin = 1, lmax = 100000, json = true)
  public int max_iter = 100;

  @API(help = "Whether data should be normalized", filter = Default.class, json = true)
  public boolean normalize;

  @API(help = "Seed for the random number generator", filter = Default.class, json = true)
  public long seed = new Random().nextLong();

  @API(help = "Drop columns with more than 20% missing values", filter = Default.class, json = true)
  public boolean drop_na_cols = true;

  // Number of categorical columns
  private int _ncats;

  // Number of reinitialization attempts for preventing empty clusters
  transient private int reinit_attempts;

  // Make a link that lands on this page
  public static String link(Key k, String content) {
    RString rs = new RString("<a href='KMeans2.query?source=%$key'>%content</a>");
    rs.replace("key", k.toString());
    rs.replace("content", content);
    return rs.toString();
  }

  public KMeans2() {
    description = "K-means";
  }

  // ----------------------
  @Override public void execImpl() {
    Frame fr;
    KMeans2Model model = null;
    try {
      logStart();
      source.read_lock(self());
      if( source.numRows() < k )
        throw new IllegalArgumentException("Cannot make " + k + " clusters out of " + source.numRows() + " rows.");

      // Drop ignored cols and, if the user asks for it, cols with too many NAs
      fr = FrameTask.DataInfo.prepareFrame(source, ignored_cols, false, drop_na_cols);
      if( fr.numCols() == 0 )
        throw new IllegalArgumentException("No columns left to work with.");

      // Sort columns, so the categoricals are all up front.  They use a
      // different distance metric than numeric columns.
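      // For example (hypothetical column layout): a frame ordered as
      //   [age, color, height, state]   (color and state are enums)
      // is swapped in place to
      //   [color, state, age, height]
      // leaving _ncats == 2, so distance() can treat columns [0,_ncats) as
      // categorical and [_ncats,N) as numeric without further bookkeeping.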
      Vec vecs[] = fr.vecs();
      final int N = vecs.length; // Feature count
      int ncats = 0, len = N;
      while( ncats != len ) {
        while( ncats < len && vecs[ncats].isEnum() ) ncats++;
        while( len > 0 && !vecs[len-1].isEnum() ) len--;
        if( ncats < len-1 ) fr.swap(ncats, len-1);
      }
      _ncats = ncats;

      // The model to be built
      model = new KMeans2Model(this, dest(), fr._key, fr.names(), fr.domains());
      model.delete_and_lock(self());

      // Column means are used to impute NAs
      double[] means = new double[N];
      for( int i = 0; i < N; i++ ) means[i] = vecs[i].mean();

      // mults & means for normalization
      double[] mults = null;
      if( normalize ) {
        mults = new double[N];
        for( int i = 0; i < N; i++ ) {
          double sigma = vecs[i].sigma();
          mults[i] = normalize(sigma) ? 1.0 / sigma : 1.0;
        }
      }

      // Initialize clusters
      Random rand = Utils.getRNG(seed - 1);
      double clusters[][]; // Normalized cluster centers
      if( initialization == Initialization.None ) {
        // Initialize all clusters to random rows.  Get 3x the number needed.
        clusters = model.centers = new double[k*3][fr.numCols()];
        for( double[] cluster : clusters )
          randomRow(vecs, rand, cluster, means, mults);
        // Recluster down to K normalized clusters.
        clusters = recluster(clusters, rand);
      } else {
        clusters = new double[1][vecs.length];
        // Initialize the first cluster to a random row
        randomRow(vecs, rand, clusters[0], means, mults);
        while( model.iterations < 5 ) { // A fixed number of oversampling rounds
          // Sum of squared distances to the nearest cluster
          SumSqr sqr = new SumSqr(clusters, means, mults, _ncats).doAll(vecs);
          // Sample with probability proportional to squared distance.
          // Note: '(long) rand.nextDouble()' always truncates to 0, giving every
          // round the same sampling seed; draw a proper long instead.
          long randomSeed = rand.nextLong();
          Sampler sampler = new Sampler(clusters, means, mults, _ncats, sqr._sqr, k * 3, randomSeed).doAll(vecs);
          clusters = Utils.append(clusters, sampler._sampled);

          // Fill in the sampled clusters into the model
          if( !isRunning() ) return; // Stopped/cancelled
          model.centers = denormalize(clusters, ncats, means, mults);
          model.total_within_SS = sqr._sqr; // A sum of squared error (see the model's Note)
          model.iterations++;               // One iteration done

          // Don't count these iterations as work for model building
          model.update(self()); // Early version of model is visible

          // Recluster down to K normalized clusters each round; otherwise _sqr
          // (the total cost over all sampled centers) grows too large relative
          // to the per-point sqr, no further samples are taken, and the centers
          // stop changing during init.
          clusters = recluster(clusters, rand);
        }
      }
      model.iterations = 0; // Reset iteration count

      // ---
      // Run the main KMeans Clustering loop.
      // Stop after enough iterations.
      boolean done;
      LOOP:
      for( ; model.iterations < max_iter; model.iterations++ ) {
        if( !isRunning() ) return; // Stopped/cancelled
        Lloyds task = new Lloyds(clusters, means, mults, _ncats, k).doAll(vecs);
        // Pick the most common categorical level for each cluster center
        max_cats(task._cMeans, task._cats);

        // Handle the case where some clusters go dry.  Rescue only 1 cluster
        // per iteration ('cause we only tracked the 1 worst row).
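        // Re-seeding an empty cluster at the single worst-fitting row (the row
        // with the largest squared distance to its assigned center) is a common
        // rescue heuristic: it places the new center in the region currently
        // contributing the most error.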
        boolean badrow = false;
        for( int clu = 0; clu < k; clu++ ) {
          if( task._rows[clu] == 0 ) {
            // If we see 2 or more bad rows, just re-run Lloyds to get the
            // next-worst row.  We don't count this as an iteration, because
            // we're not really adjusting the centers, we're trying to get
            // some centers *at-all*.
            if( badrow ) {
              Log.warn("KMeans: Re-running Lloyds to re-init another cluster");
              model.iterations--; // Do not count against iterations
              if( reinit_attempts++ < k ) {
                continue LOOP; // Rerun Lloyds, and assign points to centroids
              } else {
                reinit_attempts = 0;
                break; // Give up and accept an empty cluster
              }
            }
            long row = task._worst_row;
            Log.warn("KMeans: Re-initializing cluster " + clu + " to row " + row);
            data(clusters[clu] = task._cMeans[clu], vecs, row, means, mults);
            task._rows[clu] = 1;
            badrow = true;
          }
        }

        // Fill in the model; denormalized centers
        model.centers = denormalize(task._cMeans, ncats, means, mults);
        model.size = task._rows;
        model.within_cluster_variances = task._cSqr;
        double ssq = 0; // Total within-cluster sum of squares
        for( int i = 0; i < k; i++ )
          ssq += model.within_cluster_variances[i]; // Sum over all clusters
        // (Dividing by task._rows[i] per cluster and by fr.numRows() overall
        // would give MSEs instead; the R wrapper expects raw sums of squares.)
        model.total_within_SS = ssq;
        model.update(self()); // Update model in K/V store
        reinit_attempts = 0;

        // Compute the change in cluster centers
        double sum = 0;
        for( int clu = 0; clu < k; clu++ )
          sum += distance(clusters[clu], task._cMeans[clu], ncats);
        sum /= N; // Average change per feature
        Log.info("KMeans: Change in cluster centers=" + sum);
        done = (sum < 1e-6 || model.iterations == max_iter-1);
        if( done ) {
          Log.info("Writing clusters to key " + model._clustersKey);
          Clusters cc = new Clusters();
          cc._clusters = clusters;
          cc._means = means;
          cc._mults = mults;
          cc.doAll(1, vecs);
          Frame fr2 = cc.outputFrame(model._clustersKey,
                                     new String[]{"Cluster ID"},
                                     new String[][]{Utils.toStringMap(0, cc._clusters.length - 1)});
          fr2.delete_and_lock(self()).unlock(self());
          break;
        }
        clusters = task._cMeans; // Update cluster centers

        StringBuilder sb = new StringBuilder();
        sb.append("KMeans: iter: ").append(model.iterations).append(", total within-SS=").append(model.total_within_SS);
        for( int i = 0; i < k; i++ )
          sb.append(", ").append(task._cSqr[i]).append("/").append(task._rows[i]);
        Log.info(sb);
      }
    } catch( Throwable t ) {
      t.printStackTrace();
      cancel(t);
    } finally {
      remove(); // Remove Job
      if( model != null ) model.unlock(self());
      source.unlock(self());
      state = UKV.<Job>get(self()).state;
      new TAtomic<KMeans2Model>() {
        @Override public KMeans2Model atomic(KMeans2Model m) {
          if( m != null ) m.get_params().state = state;
          return m;
        }
      }.invoke(dest());
    }
  }

  @Override protected Response redirect() {
    return KMeans2Progress.redirect(this, job_key, destination_key);
  }

  public static class KMeans2Progress extends Progress2 {
    static final int API_WEAVER = 1;
    static public DocGen.FieldDoc[] DOC_FIELDS;

    @Override protected Response jobDone(Key dst) {
      return KMeans2ModelView.redirect(this, destination_key);
    }

    public static Response redirect(Request req, Key job_key, Key destination_key) {
      return Response.redirect(req, new KMeans2Progress().href(), JOB_KEY, job_key, DEST_KEY, destination_key);
    }
  }

  public static class KMeans2ModelView extends Request2 {
    static final int API_WEAVER = 1;
    static public DocGen.FieldDoc[] DOC_FIELDS;

    @API(help = "KMeans2 Model", json = true, filter = Default.class)
    public KMeans2Model model;
    @API(help = "KMeans2 Model Key", required = true, filter = KMeans2Filter.class)
    Key _modelKey;

    class KMeans2Filter extends H2OKey {
      public KMeans2Filter() { super("", true); }
    }

    public static String link(String txt, Key model) {
      return "<a href='" + new KMeans2ModelView().href() + ".html?_modelKey=" + model + "'>" + txt + "</a>";
    }

    public static Response redirect(Request req, Key model) {
      return Response.redirect(req, "/2/KMeans2ModelView", "_modelKey", model);
    }

    @Override protected Response serve() {
      model = DKV.get(_modelKey).get();
      return Response.done(this);
    }

    @Override public boolean toHTML(StringBuilder sb) {
      if( model != null && model.centers != null && model.within_cluster_variances != null ) {
        model.parameters.makeJsonBox(sb);
        DocGen.HTML.section(sb, "Cluster Centers: ");
        table(sb, "Clusters", model._names, model.centers);
        double[][] rows = new double[model.within_cluster_variances.length][1];
        for( int i = 0; i < rows.length; i++ )
          rows[i][0] = model.within_cluster_variances[i];
        columnHTMLlong(sb, "Cluster Size", model.size);
        DocGen.HTML.section(sb, "Cluster Variances: ");
        table(sb, "Clusters", new String[]{"Within Cluster Sum of Squares"}, rows);
        sb.append("<br />");
        DocGen.HTML.section(sb, "Overall Totals: ");
        double[] row = new double[]{model.total_within_SS};
        rowHTML(sb, new String[]{"Total Within Cluster Sum of Squares"}, row);
        DocGen.HTML.section(sb, "Cluster Assignments by Observation: ");
        RString rs = new RString("<a href='Inspect2.html?src_key=%$key'>%content</a>");
        rs.replace("key", model._key + "_clusters");
        rs.replace("content", "View the row-by-row cluster assignments");
        sb.append(rs.toString());
        return true;
      }
      return false;
    }

    private static void rowHTML(StringBuilder sb, String[] header, double[] ro) {
      sb.append("<span style='display: inline-block;'>");
      sb.append("<table class='table table-striped table-bordered'>");
      sb.append("<tr>");
      for( String aHeader : header )
        sb.append("<th>").append(aHeader).append("</th>");
      sb.append("</tr>");
      sb.append("<tr>");
      for( double row : ro )
        sb.append("<td>").append(ElementBuilder.format(row)).append("</td>");
      sb.append("</tr>");
      sb.append("</table></span>");
    }

    private static void columnHTML(StringBuilder sb, String name, double[] rows) {
      sb.append("<span style='display: inline-block;'>");
      sb.append("<table class='table table-striped table-bordered'>");
      sb.append("<tr>");
      sb.append("<th>").append(name).append("</th>");
      // Note: the stray extra "<tr>" that used to be appended here produced an
      // unclosed row; the per-row loop below emits its own <tr>...</tr> pairs.
      sb.append("</tr>");
      for( double row : rows ) {
        sb.append("<tr>");
        sb.append("<td>").append(ElementBuilder.format(row)).append("</td>");
        sb.append("</tr>");
      }
      sb.append("</table></span>");
    }

    private static void columnHTMLlong(StringBuilder sb, String name, long[] rows) {
      sb.append("<span style='display: inline-block;'>");
      sb.append("<table class='table table-striped table-bordered'>");
      sb.append("<tr>");
      sb.append("<th>").append(name).append("</th>");
sb.append("</tr>"); sb.append("<tr>"); for (double row : rows) { sb.append("<tr>"); sb.append("<td>").append(ElementBuilder.format(row)).append("</td>"); sb.append("</tr>"); } sb.append("</table></span>"); } private static void table(StringBuilder sb, String title, String[] names, double[][] rows) { sb.append("<span style='display: inline-block;'>"); sb.append("<table class='table table-striped table-bordered'>"); sb.append("<tr>"); sb.append("<th>").append(title).append("</th>"); for( int i = 0; names != null && i < rows[0].length; i++ ) sb.append("<th>").append(names[i]).append("</th>"); sb.append("</tr>"); for( int r = 0; r < rows.length; r++ ) { sb.append("<tr>"); sb.append("<td>").append(r).append("</td>"); for( int c = 0; c < rows[r].length; c++ ) sb.append("<td>").append(ElementBuilder.format(rows[r][c])).append("</td>"); sb.append("</tr>"); } sb.append("</table></span>"); } } public static class KMeans2Model extends Model implements Progress { static final int API_WEAVER = 1; static public DocGen.FieldDoc[] DOC_FIELDS; @API(help = "Model parameters") private final KMeans2 parameters; // This is used purely for printing values out. @API(help = "Cluster centers, always denormalized") public double[][] centers; @API(help = "Sum of within cluster sum of squares") public double total_within_SS; // @API(help = "Between cluster sum of square distances") // public double between_cluster_SS; // @API(help = "Total Sum of squares = total_within_SS + betwen_cluster_SS") // public double total_SS; @API(help = "Number of clusters") public int k; @API(help = "Numbers of observations in each cluster.") public long[] size; @API(help = "Whether data was normalized") public boolean normalized; @API(help = "Maximum number of iterations before stopping") public int max_iter = 100; @API(help = "Iterations the algorithm ran") public int iterations; @API(help = "Within cluster sum of squares per cluster") public double[] within_cluster_variances; //Warning: See note below //Note: The R wrapper interprets this as withinss (sum of squares), so that's what we compute here, and NOT the variances. 
    // FIXME: wrong name; should be within_cluster_sum_of_squares, but kept
    // for backward compatibility with the REST API.

    @API(help = "The row-by-row cluster assignments")
    public final Key _clustersKey;

    private transient int _ncats;

    public KMeans2Model(KMeans2 params, Key selfKey, Key dataKey, String names[], String domains[][]) {
      super(selfKey, dataKey, names, domains,
            /* priorClassDistribution */ null, /* modelClassDistribution */ null);
      _ncats = params._ncats;
      parameters = params; // Only for backward compatibility of the JSON response
      k = params.k;
      normalized = params.normalize;
      max_iter = params.max_iter;
      _clustersKey = Key.make(selfKey.toString() + "_clusters");
    }

    @Override public final KMeans2 get_params() { return parameters; }
    @Override public final Request2 job() { return get_params(); }
    @Override public double mse() { return total_within_SS; }

    @Override public float progress() {
      return Math.min(1f, iterations / (float) parameters.max_iter);
    }

    @Override protected float[] score0(Chunk[] chunks, int rowInChunk, double[] tmp, float[] preds) {
      assert chunks.length >= _names.length;
      for( int i = 0; i < _names.length; i++ )
        tmp[i] = chunks[i].at0(rowInChunk);
      return score0(tmp, preds);
    }

    @Override protected float[] score0(double[] data, float[] preds) {
      preds[0] = closest(centers, data, _ncats);
      return preds;
    }

    @Override public int nfeatures() { return _names.length; }
    @Override public boolean isSupervised() { return false; }
    @Override public String responseName() {
      throw new IllegalArgumentException("KMeans doesn't have a response.");
    }

    /** Remove any Model internal Keys */
    @Override public Futures delete_impl(Futures fs) {
      Lockable.delete(_clustersKey);
      return fs;
    }
  }

  public class Clusters extends MRTask2<Clusters> {
    // IN
    double[][] _clusters;    // Cluster centers
    double[] _means, _mults; // Normalization
    int _ncats, _nnums;

    @Override public void map(Chunk[] cs, NewChunk ncs) {
      double[] values = new double[_clusters[0].length];
      ClusterDist cd = new ClusterDist();
      for( int row = 0; row < cs[0]._len; row++ ) {
        data(values, cs, row, _means, _mults);
        closest(_clusters, values, _ncats, cd);
        int clu = cd._cluster;
        ncs.addEnum(clu);
      }
    }
  }

  // -------------------------------------------------------------------------
  // Initial sum-of-square-distance to nearest cluster
  private static class SumSqr extends MRTask2<SumSqr> {
    // IN
    double[][] _clusters;
    double[] _means, _mults; // Normalization
    final int _ncats;
    // OUT
    double _sqr;

    SumSqr( double[][] clusters, double[] means, double[] mults, int ncats ) {
      _clusters = clusters;
      _means = means;
      _mults = mults;
      _ncats = ncats;
    }

    @Override public void map(Chunk[] cs) {
      double[] values = new double[cs.length];
      ClusterDist cd = new ClusterDist();
      for( int row = 0; row < cs[0].len(); row++ ) {
        data(values, cs, row, _means, _mults);
        _sqr += minSqr(_clusters, values, _ncats, cd);
      }
      _means = _mults = null;
      _clusters = null;
    }

    @Override public void reduce(SumSqr other) { _sqr += other._sqr; }
  }

  // -------------------------------------------------------------------------
  // Sample rows with increasing probability the farther they are from any
  // cluster.
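  //
  // This mirrors the KMeans|| oversampling step of the paper cited in the
  // class javadoc: each row x is kept with probability
  //     p(x) = l * d^2(x, C) / phi_X(C)
  // where l is the oversampling factor (_probability, k*3 here), d^2(x, C) is
  // the squared distance to the nearest current center, and phi_X(C) (_sqr) is
  // the total cost summed over all rows (values above 1 simply always pass).
  // The test in map() below,
  //     _probability * sqr > rand.nextDouble() * _sqr
  // is the same inequality with both sides multiplied by _sqr.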
  private static class Sampler extends MRTask2<Sampler> {
    // IN
    double[][] _clusters;
    double[] _means, _mults;   // Normalization
    final int _ncats;
    final double _sqr;         // Total min-square-error over all rows
    final double _probability; // Oversampling factor: expected points picked per round
    final long _seed;
    // OUT
    double[][] _sampled; // New clusters

    Sampler( double[][] clusters, double[] means, double[] mults, int ncats, double sqr, double prob, long seed ) {
      _clusters = clusters;
      _means = means;
      _mults = mults;
      _ncats = ncats;
      _sqr = sqr;
      _probability = prob;
      _seed = seed;
    }

    @Override public void map(Chunk[] cs) {
      double[] values = new double[cs.length];
      ArrayList<double[]> list = new ArrayList<double[]>();
      Random rand = Utils.getRNG(_seed + cs[0]._start);
      ClusterDist cd = new ClusterDist();
      for( int row = 0; row < cs[0].len(); row++ ) {
        data(values, cs, row, _means, _mults);
        double sqr = minSqr(_clusters, values, _ncats, cd);
        if( _probability * sqr > rand.nextDouble() * _sqr )
          list.add(values.clone());
      }
      _sampled = new double[list.size()][];
      list.toArray(_sampled);
      _clusters = null;
      _means = _mults = null;
    }

    @Override public void reduce(Sampler other) {
      _sampled = Utils.append(_sampled, other._sampled);
    }
  }

  public static class Lloyds extends MRTask2<Lloyds> {
    // IN
    double[][] _clusters;
    double[] _means, _mults; // Normalization
    final int _ncats, _K;
    // OUT
    double[][] _cMeans;             // Means for each cluster
    long[/*K*/][/*ncats*/][] _cats; // Histogram of cat levels
    double[] _cSqr;                 // Sum of squares for each cluster
    long[] _rows;                   // Rows per cluster
    long _worst_row;                // Row with max err
    double _worst_err;              // Max-err-row's max-err

    Lloyds( double[][] clusters, double[] means, double[] mults, int ncats, int K ) {
      _clusters = clusters;
      _means = means;
      _mults = mults;
      _ncats = ncats;
      _K = K;
    }

    @Override public void map(Chunk[] cs) {
      int N = cs.length;
      assert _clusters[0].length == N;
      _cMeans = new double[_K][N];
      _cSqr = new double[_K];
      _rows = new long[_K];
      // Space for cat histograms
      _cats = new long[_K][_ncats][];
      for( int clu = 0; clu < _K; clu++ )
        for( int col = 0; col < _ncats; col++ )
          _cats[clu][col] = new long[cs[col]._vec.cardinality()];
      _worst_err = 0;

      // Find the closest cluster for each row
      double[] values = new double[N];
      ClusterDist cd = new ClusterDist();
      for( int row = 0; row < cs[0].len(); row++ ) {
        data(values, cs, row, _means, _mults);
        closest(_clusters, values, _ncats, cd);
        int clu = cd._cluster;
        assert clu != -1; // No broken rows
        _cSqr[clu] += cd._dist;
        // Add values and increment counter for the chosen cluster
        for( int col = 0; col < _ncats; col++ )
          _cats[clu][col][(int) values[col]]++; // Histogram the cats
        for( int col = _ncats; col < N; col++ )
          _cMeans[clu][col] += values[col];
        _rows[clu]++;
        // Track the worst row
        if( cd._dist > _worst_err ) {
          _worst_err = cd._dist;
          _worst_row = cs[0]._start + row;
        }
      }
      // Scale back down to the local mean
      for( int clu = 0; clu < _K; clu++ )
        if( _rows[clu] != 0 ) Utils.div(_cMeans[clu], _rows[clu]);
      _clusters = null;
      _means = _mults = null;
    }

    @Override public void reduce(Lloyds mr) {
      for( int clu = 0; clu < _K; clu++ ) {
        long ra = _rows[clu];
        long rb = mr._rows[clu];
        double[] ma = _cMeans[clu];
        double[] mb = mr._cMeans[clu];
        for( int c = 0; c < ma.length; c++ ) // Recursive mean
          if( ra + rb > 0 ) ma[c] = (ma[c] * ra + mb[c] * rb) / (ra + rb);
      }
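      // The merge above is exact: combining a mean ma over ra rows with a mean
      // mb over rb rows as (ma*ra + mb*rb)/(ra+rb) yields the mean over all
      // ra+rb rows, so the reduction is independent of chunk and reduce order.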
      Utils.add(_cats, mr._cats);
      Utils.add(_cSqr, mr._cSqr);
      Utils.add(_rows, mr._rows);
      // Track the global worst row
      if( _worst_err < mr._worst_err ) {
        _worst_err = mr._worst_err;
        _worst_row = mr._worst_row;
      }
    }
  }

  // A pair result: the nearest cluster, and the square distance to it
  private static final class ClusterDist {
    int _cluster;
    double _dist;
  }

  private static double minSqr(double[][] clusters, double[] point, int ncats, ClusterDist cd) {
    return closest(clusters, point, ncats, cd, clusters.length)._dist;
  }

  private static double minSqr(double[][] clusters, double[] point, int ncats, ClusterDist cd, int count) {
    return closest(clusters, point, ncats, cd, count)._dist;
  }

  private static ClusterDist closest(double[][] clusters, double[] point, int ncats, ClusterDist cd) {
    return closest(clusters, point, ncats, cd, clusters.length);
  }

  private static double distance(double[] cluster, double[] point, int ncats) {
    double sqr = 0;         // Sum of dimensional distances
    int pts = point.length; // Count of valid dimensions

    // Categorical columns first.  Only equal/unequal matters,
    // i.e. the distance contribution is either 0 or 1.
    for( int column = 0; column < ncats; column++ ) {
      double d = point[column];
      if( Double.isNaN(d) ) pts--;
      else if( d != cluster[column] ) sqr += 1.0;
    }
    // Numeric column distance
    for( int column = ncats; column < cluster.length; column++ ) {
      double d = point[column];
      if( Double.isNaN(d) ) pts--; // Do not count
      else {
        double delta = d - cluster[column];
        sqr += delta * delta;
      }
    }
    // Scale the distance by the ratio of all dimensions to valid dimensions -
    // since we did not add any error term for the missing columns, the sum of
    // errors is too small; ratio it up "as if" each missing error term equals
    // the average of the present ones.  Same math another way:
    //   double avg_dist = sqr / pts;   // Average distance per dimension
    //   sqr = avg_dist * point.length; // Total dist is average * #dimensions
    // Note the (double) cast: 'point.length / pts' in int math truncates the ratio.
    if( 0 < pts && pts < point.length )
      sqr *= (double) point.length / pts;
    return sqr;
  }
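  // Worked example of the missing-value scaling above (hypothetical numbers):
  // with point.length == 4 dimensions of which pts == 3 are valid and a
  // partial sum of sqr == 0.6, the result is 0.6 * 4/3 == 0.8 -- as if the
  // missing dimension contributed the average per-dimension error of 0.2.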
  /** Return both the nearest of N clusters/centroids, and the square distance to it. */
  private static ClusterDist closest(double[][] clusters, double[] point, int ncats, ClusterDist cd, int count) {
    int min = -1;
    double minSqr = Double.MAX_VALUE;
    for( int cluster = 0; cluster < count; cluster++ ) {
      double sqr = distance(clusters[cluster], point, ncats);
      if( sqr < minSqr ) { // Record nearest cluster
        min = cluster;
        minSqr = sqr;
      }
    }
    cd._cluster = min; // Record nearest cluster
    cd._dist = minSqr; // Record square-distance
    return cd;         // Return for flow-coding
  }

  // For KMeans2Model scoring; just the closest cluster
  static int closest(double[][] clusters, double[] point, int ncats) {
    int min = -1;
    double minSqr = Double.MAX_VALUE;
    for( int cluster = 0; cluster < clusters.length; cluster++ ) {
      double sqr = distance(clusters[cluster], point, ncats);
      if( sqr < minSqr ) { // Record nearest cluster
        min = cluster;
        minSqr = sqr;
      }
    }
    return min;
  }

  // KMeans++ re-clustering
  private double[][] recluster(double[][] points, Random rand) {
    double[][] res = new double[k][];
    res[0] = points[0];
    int count = 1;
    ClusterDist cd = new ClusterDist();
    switch( initialization ) {
    case PlusPlus: { // k-means++
      while( count < res.length ) {
        double sum = 0;
        for( double[] point : points )
          sum += minSqr(res, point, _ncats, cd, count);
        for( double[] point : points ) {
          if( minSqr(res, point, _ncats, cd, count) >= rand.nextDouble() * sum ) {
            res[count++] = point;
            break;
          }
        }
      }
      break;
    }
    // If we oversampled for initialization=None, recluster down to k using
    // the Furthest criterion.
    case None:
    case Furthest: { // Take the point farthest from any already-chosen center
      while( count < res.length ) {
        double max = 0;
        int index = 0;
        for( int i = 0; i < points.length; i++ ) {
          double sqr = minSqr(res, points[i], _ncats, cd, count);
          if( sqr > max ) {
            max = sqr;
            index = i;
          }
        }
        res[count++] = points[index];
      }
      break;
    }
    default:
      throw H2O.fail();
    }
    return res;
  }

  private void randomRow(Vec[] vecs, Random rand, double[] cluster, double[] means, double[] mults) {
    long row = Math.max(0, (long) (rand.nextDouble() * vecs[0].length()) - 1);
    data(cluster, vecs, row, means, mults);
  }

  private static boolean normalize(double sigma) {
    // TODO: unify handling of constant columns
    return sigma > 1e-6;
  }

  // Pick the most common cat level for each cluster center's cat columns
  private static double[][] max_cats(double[][] clusters, long[][][] cats) {
    int K = cats.length;
    int ncats = cats[0].length;
    for( int clu = 0; clu < K; clu++ )
      for( int col = 0; col < ncats; col++ )
        // Cats use the max level for the cluster center
        clusters[clu][col] = Utils.maxIndex(cats[clu][col]);
    return clusters;
  }

  private static double[][] denormalize(double[][] clusters, int ncats, double[] means, double[] mults) {
    int K = clusters.length;
    int N = clusters[0].length;
    double[][] value = new double[K][N];
    for( int clu = 0; clu < K; clu++ ) {
      System.arraycopy(clusters[clu], 0, value[clu], 0, N);
      if( mults != null ) // Reverse normalization
        for( int col = ncats; col < N; col++ )
          value[clu][col] = value[clu][col] / mults[col] + means[col];
    }
    return value;
  }

  private static void data(double[] values, Vec[] vecs, long row, double[] means, double[] mults) {
    for( int i = 0; i < values.length; i++ ) {
      double d = vecs[i].at(row);
      values[i] = data(d, i, means, mults, vecs[i].cardinality());
    }
  }

  private static void data(double[] values, Chunk[] chks, int row, double[] means, double[] mults) {
    for( int i = 0; i < values.length; i++ ) {
      double d = chks[i].at0(row);
      values[i] = data(d, i, means, mults, chks[i]._vec.cardinality());
    }
  }
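  // Normalization detail: with normalize on, mults[i] == 1/sigma_i (or 1.0 for
  // near-constant columns), so the helper below maps a numeric value d to
  // (d - mean_i) / sigma_i, i.e. the standard z-score.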
  /** Takes the column mean if the value is NaN; normalizes if requested. */
  private static double data(double d, int i, double[] means, double[] mults, int cardinality) {
    if( cardinality == -1 ) { // Numeric column
      if( Double.isNaN(d) ) d = means[i];
      if( mults != null ) {
        d -= means[i];
        d *= mults[i];
      }
    } else {
      // TODO: if NaN, replace with the majority class instead?
      if( Double.isNaN(d) )
        d = Math.min(Math.round(means[i]), cardinality - 1);
    }
    return d;
  }
}