package hex.pca;

import Jama.Matrix;
import Jama.SingularValueDecomposition;
import hex.FrameTask.DataInfo;
import hex.gram.Gram.GramTask;
import water.Job.ColumnsJob;
import water.*;
import water.api.DocGen;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.Log;
import water.util.RString;

import java.util.ArrayList;

/**
 * Principal Components Analysis
 * This is an algorithm for dimensionality reduction of numerical data.
 * <a href = "http://en.wikipedia.org/wiki/Principal_component_analysis">PCA on Wikipedia</a>
 * @author anqi_fu
 *
 */
public class PCA extends ColumnsJob {
  static final int API_WEAVER = 1;
  static public DocGen.FieldDoc[] DOC_FIELDS;
  static final String DOC_GET = "pca";
  static final int MAX_COL = 5000;

  @API(help = "The PCA Model")
  public PCAModel pca_model;

  @API(help = "Maximum number of principal components to return.", filter = Default.class, lmin = 1, lmax = 5000, json = true)
  int max_pc = 5000;

  @API(help = "Omit components with std dev <= tol times std dev of first component.", filter = Default.class, lmin = 0, lmax = 1, json = true)
  double tolerance = 0;

  @API(help = "If true, data will be standardized on the fly when computing the model.", filter = Default.class, json = true)
  boolean standardize = true;

  public PCA() {}

  public PCA(String desc, Key dest, Frame src, double tolerance, boolean standardize) {
    this(desc, dest, src, 5000, tolerance, standardize);
  }

  public PCA(String desc, Key dest, Frame src, int max_pc, double tolerance, boolean standardize) {
    description = desc;
    destination_key = dest;
    source = src;
    this.max_pc = max_pc;
    this.tolerance = tolerance;
    this.standardize = standardize;
  }

  @Override public boolean toHTML(StringBuilder sb) {
    return makeJsonBox(sb);
  }

  @Override protected void execImpl() {
    Frame fr = selectFrame(source);
    Vec[] vecs = fr.vecs();

    // Remove constant cols and cols with too many NAs
    ArrayList<Integer> removeCols = new ArrayList<Integer>();
    for (int i = 0; i < vecs.length; i++) {
      if (vecs[i].min() == vecs[i].max() || vecs[i].naCnt() > vecs[i].length() * 0.2)
        // if(vecs[i].min() == vecs[i].max() || vecs[i].naCnt() > vecs[i].length()*0.2 || vecs[i].domain() != null)
        removeCols.add(i);
    }
    if (!removeCols.isEmpty()) {
      int[] cols = new int[removeCols.size()];
      for (int i = 0; i < cols.length; i++) cols[i] = removeCols.get(i);
      fr.remove(cols);
    }
    if (fr.numCols() < 2)
      throw new IllegalArgumentException("Need more than one column to run PCA");
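    // What follows: wrap the filtered frame in a DataInfo (optionally standardizing
    // columns), run one distributed pass to accumulate the Gram matrix X'X/n, and
    // hand the result to buildModel() for the eigendecomposition.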
    DataInfo dinfo = new DataInfo(fr, 0, false, false, standardize ? DataInfo.TransformType.STANDARDIZE : DataInfo.TransformType.NONE);
    GramTask tsk = new GramTask(self(), dinfo, false, false).doAll(dinfo._adaptedFrame);
    PCAModel myModel = buildModel(dinfo, tsk);
    myModel.delete_and_lock(self());
    myModel.unlock(self());
    remove();   // Close/remove job
    final JobState state = UKV.<Job>get(self()).state;
    new TAtomic<PCAModel>() {
      @Override public PCAModel atomic(PCAModel m) {
        if (m != null) m.get_params().state = state;
        return m;
      }
    }.invoke(dest());
  }

  @Override protected void init() {
    super.init();
    int num_ecols = selectFrame(source).numExpCols();
    Log.info("Running PCA on dataset with " + num_ecols + " expanded columns in Gram matrix");
    if (num_ecols > MAX_COL)
      throw new IllegalArgumentException("Cannot process more than " + MAX_COL + " columns, taking into account expanded categoricals");
  }

  @Override protected Response redirect() {
    return PCAProgressPage.redirect(this, self(), dest());
  }

  public PCAModel buildModel(DataInfo dinfo, GramTask tsk) {
    logStart();
    Matrix myGram = new Matrix(tsk._gram.getXX());   // X'X/n where n = num rows
    SingularValueDecomposition mySVD = myGram.svd();

    // Extract eigenvalues and eigenvectors
    // Note: singular values are ordered in weakly descending order by the algorithm
    double[] Sval = mySVD.getSingularValues();
    double[][] eigVec = mySVD.getV().getArray();     // rows = features, cols = principal components
    assert Sval.length == eigVec.length;
    // DKV.put(EigenvectorMatrix.makeKey(input("source"), destination_key), new EigenvectorMatrix(eigVec));

    // Compute standard deviation
    double[] sdev = new double[Sval.length];
    double totVar = 0;
    double dfcorr = dinfo._adaptedFrame.numRows() / (dinfo._adaptedFrame.numRows() - 1.0);
    for (int i = 0; i < Sval.length; i++) {
      // if(standardize)
        Sval[i] = dfcorr * Sval[i];   // Correct since degrees of freedom = n-1
      sdev[i] = Math.sqrt(Sval[i]);
      totVar += Sval[i];
    }

    double[] propVar = new double[Sval.length];   // Proportion of total variance
    double[] cumVar = new double[Sval.length];    // Cumulative proportion of total variance
    for (int i = 0; i < Sval.length; i++) {
      propVar[i] = Sval[i] / totVar;
      cumVar[i] = i == 0 ? propVar[0] : cumVar[i - 1] + propVar[i];
    }

    Key dataKey = input("source") == null ? null : Key.make(input("source"));
    int ncomp = Math.min(getNumPC(sdev, tolerance), max_pc);
    return new PCAModel(this, destination_key, dataKey, dinfo, tsk, sdev, propVar, cumVar, eigVec, mySVD.rank(), ncomp);
  }

  public static int getNumPC(double[] sdev, double tol) {
    if (sdev == null) return 0;
    double cutoff = tol * sdev[0];
    for (int i = 0; i < sdev.length; i++)
      if (sdev[i] < cutoff) return i;
    return sdev.length;
  }

  public static String link(Key src_key, String content) {
    RString rs = new RString("<a href='/2/PCA.query?%key_param=%$key'>%content</a>");
    rs.replace("key_param", "source");
    rs.replace("key", src_key.toString());
    rs.replace("content", content);
    return rs.toString();
  }
}
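
/*
 * Illustrative sketch, not part of the original job: the class below replays the
 * core math of buildModel() on a hand-built 2x2 Gram matrix, using only the Jama
 * calls already imported above (Matrix, SingularValueDecomposition). The class
 * name and the matrix values are invented for demonstration; the real job builds
 * its Gram via the distributed GramTask pass and applies the n/(n-1) degrees-of-
 * freedom correction, which is omitted here.
 */
class PCAMathSketch {
  public static void main(String[] args) {
    // A symmetric positive semi-definite matrix standing in for X'X/n.
    double[][] gram = { { 2.0, 0.8 },
                        { 0.8, 1.0 } };
    SingularValueDecomposition svd = new Matrix(gram).svd();

    double[] sval = svd.getSingularValues();     // eigenvalues of the Gram, weakly descending
    double[][] eigVec = svd.getV().getArray();   // rows = features, cols = principal components

    double totVar = 0;
    for (double s : sval) totVar += s;           // total variance = sum of eigenvalues

    double cumVar = 0;
    for (int i = 0; i < sval.length; i++) {
      double sdev = Math.sqrt(sval[i]);          // std dev of component i
      double propVar = sval[i] / totVar;         // proportion of total variance explained
      cumVar += propVar;
      System.out.println("PC" + (i + 1) + ": sdev=" + sdev
          + " propVar=" + propVar + " cumVar=" + cumVar
          + " direction=(" + eigVec[0][i] + ", " + eigVec[1][i] + ")");
    }
  }
}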