package hex;
import water.*;
import water.H2O.FJWThr;
import water.H2O.H2OCallback;
import water.H2O.H2OCountedCompleter;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.NewChunk.Value;
import water.fvec.Vec;
import water.util.ArrayUtils;
import water.util.Log;
import water.util.StringUtils;
import java.util.Arrays;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;
/**
* Created by tomasnykodym on 11/13/14.
*
* Distributed matrix operations such as (sparse) multiplication and transpose.
*/
public class DMatrix {
/**
 * Transposes the frame as if it were a matrix (i.e. rows become columns).
 *
 * The target frame is allocated here on a freshly created Vec key. Its row layout
 * splits the {@code src.numCols()} target rows into {@code max(1, src.numCols()/10000)}
 * chunks of (nearly) equal size; the first {@code numCols % nchunks} chunks get one
 * extra row. The actual work and the input validation are done by
 * {@link #transpose(Frame, Frame)}.
 *
 * @param src frame to transpose; its row count must fit into an int
 * @return newly created frame holding the transpose of {@code src}
 */
public static Frame transpose(Frame src){
  final long nRows = src.numRows();
  if ((int) nRows != nRows)   // row count does not fit into an int
    throw H2O.unimpl();
  final int nCols = src.numCols();
  final int nchunks = Math.max(1, nCols / 10000);
  final int baseLen = nCols / nchunks;
  final int longerChunks = nCols % nchunks;
  // espc[i] is the first row of chunk i, espc[nchunks] the total number of rows.
  final long[] espc = new long[nchunks + 1];
  long offset = 0;
  for (int i = 0; i < nchunks; ++i) {
    espc[i] = offset;
    offset += (i < longerChunks) ? baseLen + 1 : baseLen;
  }
  espc[nchunks] = offset;
  final Key key = Vec.newKey();
  final int rowLayout = Vec.ESPC.rowLayout(key, espc);
  final Vec layoutVec = new Vec(key, rowLayout);
  final Frame tgt = new Frame(layoutVec.makeZeros((int) nRows));
  return transpose(src, tgt);
}
/**
 * Transposes the frame as if it were a matrix (rows <-> columns) into a
 * caller-supplied target frame.
 *
 * The result is written into {@code tgt}, so it keeps the vectors and chunking
 * of the target frame.
 *
 * @param src frame to transpose; every vec must be non-categorical and have
 *            at most 1,000,000 rows
 * @param tgt pre-allocated frame with {@code src.numRows()} columns and
 *            {@code src.numCols()} rows
 * @return {@code tgt}, filled with the transpose of {@code src}
 * @throws IllegalArgumentException if the dimensions do not match, a vec is
 *         categorical, or a vec is too long
 */
public static Frame transpose(Frame src, Frame tgt){
  final boolean rowsMatch = src.numRows() == tgt.numCols();
  if (!rowsMatch || src.numCols() != tgt.numRows()) {
    throw new IllegalArgumentException("dimension do not match!");
  }
  final Vec[] srcVecs = src.vecs();
  for (int i = 0; i < srcVecs.length; ++i) {
    final Vec column = srcVecs[i];
    if (column.isCategorical()) {
      throw new IllegalArgumentException("transpose can only be applied to all-numeric frames (representing a matrix)");
    }
    if (column.length() > 1000000) {
      throw new IllegalArgumentException("too many rows, transpose only works for frames with < 1M rows.");
    }
  }
  final TransposeTsk task = new TransposeTsk(tgt);
  task.doAll(src);
  return tgt;
}
/**
 * (MR)Task performing the matrix transpose.
 * It is run over the source frame; the target frame has to be allocated up front
 * (e.g. via Vec.makeZeros()) and handed to the constructor.
 *
 * Source chunks reporting {@code isSparseZero()} are walked through their non-zero
 * rows only; skipped positions are filled in the target with {@code addZeros}.
 */
public static class TransposeTsk extends MRTask<TransposeTsk> {
  final Frame _tgt; // Target dataset, allocated by the caller up front, e.g. via Vec.makeZeros(n).

  public TransposeTsk(Frame tgt) {
    _tgt = tgt;
  }

  /**
   * Transposes one block of source rows: row k of the block becomes target
   * column (chks[0].start() + k), written one target chunk at a time.
   *
   * @param chks chunks of all source columns for the current block of rows
   */
  public void map(final Chunk[] chks) {
    final long[] tgtEspc = _tgt.anyVec().espc();
    final int firstSrcRow = (int) chks[0].start();
    final int nTgtChunks = tgtEspc.length - 1;
    for (int tgtCidx = 0; tgtCidx < nTgtChunks; ++tgtCidx) {
      // Source columns [colLo, colHi) are the rows of target chunk tgtCidx.
      final long colLo = tgtEspc[tgtCidx];
      final long colHi = tgtEspc[tgtCidx + 1];
      // One output chunk per source row of this block.
      final NewChunk[] out = new NewChunk[chks[0]._len];
      for (int row = 0; row < out.length; ++row) {
        out[row] = new NewChunk(_tgt.vec(row + firstSrcRow), tgtCidx);
      }
      for (int col = (int) colLo; col < (int) colHi; ++col) {
        final Chunk in = chks[col];
        // Position of this source column inside the target chunk.
        final int pos = (int) (col - colLo);
        if (in.isSparseZero()) {
          // Visit only the non-zero rows; pad each output up to pos before appending.
          int row = in.nextNZ(-1);
          while (row < in._len) {
            out[row].addZeros(pos - out[row]._len);
            in.extractRows(out[row], row);
            row = in.nextNZ(row);
          }
        } else {
          for (int row = 0; row < in._len; ++row) {
            out[row].addZeros(pos - out[row]._len);
            in.extractRows(out[row], row);
          }
        }
      }
      // Pad every output chunk to the full target chunk length, then close it.
      final int tgtChunkLen = (int) (colHi - colLo);
      for (int row = 0; row < out.length; ++row) {
        out[row].addZeros(tgtChunkLen - out[row]._len);
        out[row].close(_fs);
        out[row] = null;
      }
    }
  }
}
/**
 * Progress record of a matrix multiplication.
 *
 * Holds timing information and statistics about the result chunks
 * that have been reported so far (see UpdateProgress).
 */
public static class MatrixMulStats extends Iced {
  public final Key jobKey;
  /** Total number of chunks, as passed to the constructor. */
  public final long chunksTotal;
  /** Wall-clock time (ms) at which this record was created. */
  public final long _startTime;
  /** Wall-clock time (ms) of the most recent update. */
  public long lastUpdateAt;
  /** Number of chunks reported so far. */
  public long chunksDone;
  /** Sum of the reported chunk sizes. */
  public long size;
  /** Distinct chunk types reported so far. */
  public int [] chunkTypes = new int[0];
  /** Number of reported chunks per type; parallel to chunkTypes. */
  public long [] chunkCnts = new long[0];

  public MatrixMulStats(long n, Key jobKey) {
    this.chunksTotal = n;
    this._startTime = System.currentTimeMillis();
    this.jobKey = jobKey;
  }

  /** @return fraction chunksDone / chunksTotal, computed in double and narrowed to float */
  public float progress() {
    final double fraction = (double) chunksDone / chunksTotal;
    return (float) fraction;
  }
}
/**
 * Multiplies two frames as matrices and blocks until the product is ready.
 *
 * The work is done by a MatrixMulTsk without a completer and without a progress key.
 * When called from an FJWThr worker thread the task is forked directly,
 * otherwise it is handed to H2O.submitTask.
 *
 * @param x left operand
 * @param y right operand; its row count must equal the column count of x
 * @return frame holding the product
 */
public static Frame mmul(Frame x, Frame y) {
  final MatrixMulTsk task = new MatrixMulTsk(null, null, x, y);
  final boolean onWorkerThread = Thread.currentThread() instanceof FJWThr;
  if (!onWorkerThread) {
    H2O.submitTask(task).join();
  } else {
    task.fork().join();
  }
  return task._z;
}
/**
 * Task computing the matrix product z = x * y, one column of z at a time.
 *
 * For every column i of y a GetNonZerosTsk collects the non-zero entries of that
 * column; a VecTsk then combines the matching columns of x into column i of z.
 * Only a bounded number of columns (maxP) is in flight at once; each finished
 * VecTsk starts the next outstanding column.
 */
public static class MatrixMulTsk extends H2OCountedCompleter {
  final transient Frame _x;
  Frame _y;
  Frame _z;
  final Key _progressKey;
  // Index of the most recently started column; advanced atomically by Callback.
  AtomicInteger _cntr;

  /**
   * @param cmp         completer to notify when done, may be null
   * @param progressKey key of the MatrixMulStats record to update, may be null
   * @param x           left operand
   * @param y           right operand
   * @throws IllegalArgumentException if x.numCols() != y.numRows()
   */
  public MatrixMulTsk(H2OCountedCompleter cmp, Key progressKey, Frame x, Frame y) {
    super(cmp);
    if(x.numCols() != y.numRows())
      throw new IllegalArgumentException("dimensions do not match! x.numcols = " + x.numCols() + ", y.numRows = " + y.numRows());
    _x = x;
    _y = y;
    _progressKey = progressKey;
  }

  /**
   * Allocates the result frame (zeros, created from a vec of x) and starts the
   * first batch of column tasks.
   */
  @Override
  public void compute2() {
    _z = new Frame(_x.anyVec().makeZeros(_y.numCols()));
    int total_cores = H2O.CLOUD.size()*H2O.NUMCPUS;
    int chunksPerCol = _y.anyVec().nChunks();
    // The integer division yields 0 as soon as chunksPerCol > 256*total_cores.
    // With maxP == 0 no column task would ever be forked below, so the pending
    // count would never be decremented and the task could not complete.
    // Always keep at least one column in flight.
    int maxP = Math.max(1, 256*total_cores/chunksPerCol);
    Log.info("maxP = " + maxP);
    // Columns 0..maxP-1 are started right here; Callback continues from maxP.
    _cntr = new AtomicInteger(maxP-1);
    // Presumably two completions arrive per column (GetNonZerosTsk and VecTsk);
    // the last one completes this task - confirm against H2OCallback.
    // NOTE(review): a y with zero columns forks nothing either; not handled here.
    addToPendingCount(2*_y.numCols()-1);
    for(int i = 0; i < Math.min(_y.numCols(),maxP); ++i)
      forkVecTask(i);
  }

  /**
   * Starts the computation of column i of z: collects the non-zeros of y's
   * column i, then forks a VecTsk over the x vecs selected by the non-zero
   * indices, with column i of z appended as the last vec.
   */
  private void forkVecTask(final int i) {
    new GetNonZerosTsk(new H2OCallback<GetNonZerosTsk>(this) {
      @Override
      public void callback(GetNonZerosTsk gnz) {
        new VecTsk(new Callback(), _progressKey, gnz._vals).dfork(ArrayUtils.append(_x.vecs(gnz._idxs), _z.vec(i)));
      }
    }).dfork(_y.vec(i));
  }

  /** Invoked when a VecTsk finishes; starts the next column that has not been started yet. */
  private class Callback extends H2OCallback{
    public Callback(){super(MatrixMulTsk.this);}
    @Override
    public void callback(H2OCountedCompleter h2OCountedCompleter) {
      int i = _cntr.incrementAndGet();
      if(i < _y.numCols())
        forkVecTask(i);
    }
  }
}
// NOTE(review): not referenced by any code visible in this file; presumably a leftover counter - confirm before removing.
static int cnt = 0;
// to be invoked from R expression
/**
 * Collects the non-zero entries of a single vec as two parallel arrays:
 * row indices (_idxs) and values (_vals). Fails once more than _maxsz
 * non-zeros have been gathered.
 */
private static class GetNonZerosTsk extends MRTask<GetNonZerosTsk>{
  final int _maxsz;   // upper bound on the number of non-zeros
  int [] _idxs;       // row indices of the non-zeros
  double [] _vals;    // values of the non-zeros, parallel to _idxs

  public GetNonZerosTsk(H2OCountedCompleter cmp) {
    this(cmp, 10000000);
  }

  public GetNonZerosTsk(H2OCountedCompleter cmp, int maxsz) {
    super(cmp);
    _maxsz = maxsz;
  }

  /** Extracts the non-zeros of one chunk, storing row indices relative to the whole vec. */
  @Override public void map(Chunk c) {
    final int rowOffset = (int) c.start();
    // Guards against the chunk's row range not fitting into an int.
    assert (c.start() + c._len) == (rowOffset + c._len);
    final int nnz = c.sparseLenZero();
    _idxs = MemoryManager.malloc4(nnz);
    _vals = MemoryManager.malloc8d(nnz);
    int filled = 0;
    int row = c.nextNZ(-1);
    while (row < c._len) {
      _idxs[filled] = row + rowOffset;
      _vals[filled] = c.atd(row);
      row = c.nextNZ(row);
      ++filled;
    }
    assert filled == nnz;
    if (_idxs.length > _maxsz) {
      throw new RuntimeException("too many nonzeros! found at least " + _idxs.length + " nonzeros.");
    }
  }

  /** Merges the non-zeros of another partial result into this one via ArrayUtils.sortedMerge. */
  @Override public void reduce(GetNonZerosTsk gnz) {
    final int mergedLen = _idxs.length + gnz._idxs.length;
    if (mergedLen > _maxsz) {
      throw new RuntimeException("too many nonzeros! found at least " + mergedLen + " nonzeros.");
    }
    final int[] mergedIdxs = MemoryManager.malloc4(mergedLen);
    final double[] mergedVals = MemoryManager.malloc8d(_vals.length + gnz._vals.length);
    ArrayUtils.sortedMerge(_idxs, _vals, gnz._idxs, gnz._vals, mergedIdxs, mergedVals);
    _idxs = mergedIdxs;
    _vals = mergedVals;
  }
}
// Computes a single vec of the matrix-multiply output.
// The task runs over a frame whose last vec is the output column; the vecs before it
// are paired one-to-one with the coefficients in _y.
private static class VecTsk extends MRTask<VecTsk> {
  double [] _y;        // one coefficient per input vec
  Key _progressKey;    // key of the progress record, may be null

  public VecTsk(H2OCountedCompleter cmp, Key progressKey, double [] y) {
    super(cmp);
    _y = y;
    _progressKey = progressKey;
  }

  @Override public void setupLocal() {
    _fr.lastVec().preWriting();
  }

  // Accumulates sum_i _y[i] * chks[i] row by row and stores the compressed result
  // as the chunk of the last (output) vec.
  @Override public void map(Chunk [] chks) {
    final Chunk outChunk = chks[chks.length - 1];
    final double[] acc = MemoryManager.malloc8d(chks[0]._len);
    for (int col = 0; col < _y.length; ++col) {
      final double coef = _y[col];
      final Chunk in = chks[col];
      // Only the non-zero rows of the input chunk contribute.
      int row = in.nextNZ(-1);
      while (row < acc.length) {
        try {
          acc[row] += coef * in.atd(row);
        } catch (Throwable t) {
          t.printStackTrace();
          throw new RuntimeException(t);
        }
        row = in.nextNZ(row);
      }
    }
    final Chunk compressed = new NewChunk(acc).setSparseRatio(2).compress();
    if (_progressKey != null) {
      final int nBytes = compressed.getBytes().length;
      new UpdateProgress(nBytes, compressed.frozenType()).fork(_progressKey);
    }
    final Key outKey = outChunk.vec().chunkKey(outChunk.cidx());
    DKV.put(outKey, compressed, _fs);
  }

  @Override public void closeLocal() {
    // drop inputs
    _progressKey = null;
    _y = null;
  }
}
/**
 * Atomic update of a MatrixMulStats record: registers one finished chunk
 * together with its size and its type.
 */
private static class UpdateProgress extends TAtomic<MatrixMulStats> {
  final int _chunkSz;
  final int _chunkType;

  public UpdateProgress(int sz, int type) {
    _chunkType = type;
    _chunkSz = sz;
  }

  @Override
  public MatrixMulStats atomic(MatrixMulStats old) {
    // Work on a copy of the counters, they are modified in place below.
    old.chunkCnts = old.chunkCnts.clone();
    // Find the slot of this chunk type; slot == length means "not seen yet".
    int slot = 0;
    while (slot < old.chunkTypes.length && old.chunkTypes[slot] != _chunkType) {
      ++slot;
    }
    if (slot == old.chunkTypes.length) {
      // New type: grow both arrays by one and register the type in the last slot.
      old.chunkTypes = Arrays.copyOf(old.chunkTypes, slot + 1);
      old.chunkCnts = Arrays.copyOf(old.chunkCnts, old.chunkCnts.length + 1);
      old.chunkTypes[slot] = _chunkType;
    }
    old.chunksDone++;
    old.chunkCnts[slot]++;
    old.lastUpdateAt = System.currentTimeMillis();
    old.size += _chunkSz;
    return old;
  }
}
}