/**
* (C) Copyright IBM Corp. 2010, 2015
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
 */
package com.ibm.bi.dml.runtime.matrix.data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.commons.math3.util.FastMath;
import com.ibm.bi.dml.lops.MapMultChain.ChainType;
import com.ibm.bi.dml.lops.WeightedCrossEntropy.WCeMMType;
import com.ibm.bi.dml.lops.WeightedDivMM.WDivMMType;
import com.ibm.bi.dml.lops.WeightedSigmoid.WSigmoidType;
import com.ibm.bi.dml.lops.WeightedSquaredLoss.WeightsType;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.DMLUnsupportedOperationException;
import com.ibm.bi.dml.runtime.functionobjects.SwapIndex;
import com.ibm.bi.dml.runtime.matrix.operators.ReorgOperator;
import com.ibm.bi.dml.runtime.util.UtilFunctions;
/**
* MB:
* Library for matrix multiplications including MM, MV, VV for all
* combinations of dense, sparse, ultrasparse representations and special
* operations such as transpose-self matrix multiplication.
*
* In general, all implementations internally use dense outputs for direct access,
* but convert the final result to sparse if necessary.
* The only exceptions are ultra-sparse matrix mult, wsloss and wsigmoid.
*
* NOTES on BLAS:
* * Experiments in 04/2013 showed that even on dense-dense this implementation
* is 3x faster than f2j-BLAS-DGEMM, 2x faster than f2c-BLAS-DGEMM, and
* on par (+10% after JIT) with a native C implementation.
* * Calling native BLAS would lose platform independence and would require
* JNI calls including data transfer. Furthermore, BLAS does not support sparse
* matrices (except Sparse BLAS, with dedicated function calls and matrix formats)
* and would be an external dependency.
* * Experiments in 02/2014 showed that on dense-dense this implementation now achieves
* almost 30% peak FP performance. Compared to Intel MKL 11.1 (dgemm, N=1000) it is
* just 3.2x (sparsity=1.0) and 1.9x (sparsity=0.5) slower, respectively.
*
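* Usage sketch (hypothetical driver code; it assumes the MatrixBlock(int rlen,
* int clen, boolean sparse) constructor and quickSetValue for populating inputs,
* as used elsewhere in this package):
* <pre>{@code
* MatrixBlock m1  = new MatrixBlock(1000, 500, false); //dense 1000x500
* MatrixBlock m2  = new MatrixBlock(500, 200, false);  //dense 500x200
* //... populate m1 and m2, e.g., via quickSetValue(i, j, v) ...
* MatrixBlock ret = new MatrixBlock(1000, 200, false);
* LibMatrixMult.matrixMult(m1, m2, ret);    //single-threaded
* LibMatrixMult.matrixMult(m1, m2, ret, 8); //multi-threaded, k=8
* }</pre>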
*/
public class LibMatrixMult
{
//internal configuration
public static final boolean LOW_LEVEL_OPTIMIZATION = true;
public static final long MEM_OVERHEAD_THRESHOLD = 2L*1024*1024; //MAX 2 MB
private static final long PAR_MINFLOP_THRESHOLD = 2L*1024*1024; //MIN 2 MFLOP
private LibMatrixMult() {
//prevent instantiation via private constructor
}
////////////////////////////////
// public matrix mult interface
////////////////////////////////
/**
* Performs a matrix multiplication and stores the result in the output matrix.
*
* All variants use an IKJ access pattern and internally use dense output. After the
* actual computation, we recompute nnz and check for sparse/dense representation.
*
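* For reference, the IKJ loop order over dense inputs corresponds to the
* following simplified pattern (an illustrative sketch only; the actual kernels
* below are blocked, unrolled, and sparsity-aware):
* <pre>{@code
* for( int i=0; i<m; i++ )          //rows of m1
*   for( int k=0; k<cd; k++ ) {     //common dimension
*     double val = a[i*cd+k];
*     if( val != 0 )
*       for( int j=0; j<n; j++ )    //columns of m2
*         c[i*n+j] += val * b[k*n+j];
*   }
* }</pre>
*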
* @param m1 first matrix
* @param m2 second matrix
* @param ret result matrix
* @throws DMLRuntimeException
*/
public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret)
throws DMLRuntimeException
{
//check inputs / outputs
if( m1.isEmptyBlock(false) || m2.isEmptyBlock(false) ) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//Timing time = new Timing(true);
//pre-processing: output allocation
boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
m2 = prepMatrixMultRightInput(m1, m2);
ret.sparse = (m1.isUltraSparse() || m2.isUltraSparse());
if( !ret.sparse )
ret.allocateDenseBlock();
//prepare row-upper for special cases of vector-matrix
boolean pm2 = checkParMatrixMultRightInput(m1, m2, Integer.MAX_VALUE);
int ru = pm2 ? m2.rlen : m1.rlen;
//core matrix mult computation
if( m1.isUltraSparse() || m2.isUltraSparse() )
matrixMultUltraSparse(m1, m2, ret, 0, ru);
else if(!m1.sparse && !m2.sparse)
matrixMultDenseDense(m1, m2, ret, tm2, pm2, 0, ru);
else if(m1.sparse && m2.sparse)
matrixMultSparseSparse(m1, m2, ret, pm2, 0, ru);
else if(m1.sparse)
matrixMultSparseDense(m1, m2, ret, pm2, 0, ru);
else
matrixMultDenseSparse(m1, m2, ret, pm2, 0, ru);
//post-processing: nnz/representation
if( !ret.sparse )
ret.recomputeNonZeros();
ret.examSparsity();
//System.out.println("MM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" +
// "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
}
/**
* Performs a multi-threaded matrix multiplication and stores the result in the output matrix.
* The parameter k (k>=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen).
*
* @param m1
* @param m2
* @param ret
* @param k
* @throws DMLRuntimeException
*/
public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k)
throws DMLRuntimeException
{
//check inputs / outputs
if( m1.isEmptyBlock(false) || m2.isEmptyBlock(false) ) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//check too high additional vector-matrix memory requirements (fallback to sequential)
//check too small workload in terms of flops (fallback to sequential too)
if( m1.rlen == 1 && (8L * m2.clen * k > MEM_OVERHEAD_THRESHOLD || !LOW_LEVEL_OPTIMIZATION || m2.clen==1 || m1.isUltraSparse() || m2.isUltraSparse())
|| 2L * m1.rlen * m1.clen * m2.clen < PAR_MINFLOP_THRESHOLD )
{
matrixMult(m1, m2, ret);
return;
}
//Timing time = new Timing(true);
//pre-processing: output allocation (in contrast to single-threaded,
//we need to allocate sparse as well in order to prevent synchronization)
boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
m2 = prepMatrixMultRightInput(m1, m2);
ret.sparse = (m1.isUltraSparse() || m2.isUltraSparse());
if( !ret.sparse )
ret.allocateDenseBlock();
else
ret.allocateSparseRowsBlock();
//prepare row-upper for special cases of vector-matrix / matrix-matrix
boolean pm2 = checkParMatrixMultRightInput(m1, m2, k);
int ru = pm2 ? m2.rlen : m1.rlen;
//core multi-threaded matrix mult computation
//(currently: always parallelization over number of rows)
try {
ExecutorService pool = Executors.newFixedThreadPool( k );
ArrayList<MatrixMultTask> tasks = new ArrayList<MatrixMultTask>();
int blklen = (int)(Math.ceil((double)ru/k));
for( int i=0; i<k && i*blklen<ru; i++ )
tasks.add(new MatrixMultTask(m1, m2, ret, tm2, pm2, i*blklen, Math.min((i+1)*blklen, ru)));
pool.invokeAll(tasks);
pool.shutdown();
//aggregate partial results (nnz, ret for vector/matrix)
ret.nonZeros = 0; //reset after execute
for( MatrixMultTask task : tasks ) {
if( pm2 )
vectAdd(task.getResult().denseBlock, ret.denseBlock, 0, 0, ret.rlen*ret.clen);
else
ret.nonZeros += task.getPartialNnz();
}
if( pm2 )
ret.recomputeNonZeros();
}
catch(Exception ex) {
throw new DMLRuntimeException(ex);
}
//post-processing (nnz maintained in parallel)
ret.examSparsity();
//System.out.println("MM k="+k+" ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" +
// "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
}
/**
* Performs a matrix multiplication chain operation of type t(X)%*%(X%*%v) or t(X)%*%(w*(X%*%v)).
*
* All variants use an IKJ access pattern and internally use dense output. After the
* actual computation, we recompute nnz and check for sparse/dense representation.
*
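* Per row i of X, the fused computation is q = dot(X[i,:], v), optionally
* q *= w[i], and ret += q * X[i,:]. Illustrative sketch in terms of this
* class' own helpers (the actual dense kernel is cache-blocked and unrolled):
* <pre>{@code
* for( int i=0; i<m; i++ ) {
*   double q = dotProduct(a, b, i*cd, 0, cd); //q = (X %*% v)[i]
*   if( weights )
*     q *= w[i];                              //q = (w * (X %*% v))[i]
*   vectMultiplyAdd(q, a, c, i*cd, 0, cd);    //ret += q * X[i,:]
* }
* }</pre>
*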
* @param mX
* @param mV
* @param mW
* @param ret
* @param ct
* @throws DMLRuntimeException
* @throws DMLUnsupportedOperationException
*/
public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct)
throws DMLRuntimeException, DMLUnsupportedOperationException
{
//check inputs / outputs (after that mV and mW guaranteed to be dense)
if( mX.isEmptyBlock(false) || mV.isEmptyBlock(false) || (mW !=null && mW.isEmptyBlock(false)) ) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//Timing time = new Timing(true);
//pre-processing: output allocation
ret.sparse = false;
ret.allocateDenseBlock();
//core matrix mult chain computation
if( mX.sparse )
matrixMultChainSparse(mX, mV, mW, ret, ct, 0, mX.rlen);
else
matrixMultChainDense(mX, mV, mW, ret, ct, 0, mX.rlen);
//post-processing
ret.recomputeNonZeros();
ret.examSparsity();
//System.out.println("MMChain "+ct.toString()+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
}
/**
* Performs a parallel matrix multiplication chain operation of type t(X)%*%(X%*%v) or t(X)%*%(w*(X%*%v)).
* The parameter k (k>=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen).
*
* NOTE: This multi-threaded mmchain operation has additional memory requirements of k*ncol(X)*8bytes
* for partial aggregation. Current max memory: 256KB; otherwise we fall back to sequential
* execution (e.g., k=16 threads and ncol(X)=2048 yield 16*2048*8B = 256KB).
*
* @param mX
* @param mV
* @param mW
* @param ret
* @param ct
* @param k
* @throws DMLRuntimeException
* @throws DMLUnsupportedOperationException
*/
public static void matrixMultChain(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int k)
throws DMLRuntimeException, DMLUnsupportedOperationException
{
//check inputs / outputs (after that mV and mW guaranteed to be dense)
if( mX.isEmptyBlock(false) || mV.isEmptyBlock(false) || (mW !=null && mW.isEmptyBlock(false)) ) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//check too high additional memory requirements (fallback to sequential)
//check too small workload in terms of flops (fallback to sequential too)
if( 8L * mV.rlen * k > MEM_OVERHEAD_THRESHOLD
|| 4L * mX.rlen * mX.clen < PAR_MINFLOP_THRESHOLD )
{
matrixMultChain(mX, mV, mW, ret, ct);
return;
}
//Timing time = new Timing(true);
//pre-processing
ret.sparse = false;
ret.allocateDenseBlock();
//core matrix mult chain computation
//(currently: always parallelization over number of rows)
try {
ExecutorService pool = Executors.newFixedThreadPool( k );
ArrayList<MatrixMultChainTask> tasks = new ArrayList<MatrixMultChainTask>();
int blklen = (int)(Math.ceil((double)mX.rlen/k));
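//round blklen up to a multiple of 24 so that task boundaries align with
//the mmchain cache blocksize (see matrixMultChainDense/Sparse)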
blklen += (blklen%24 != 0)?24-blklen%24:0;
for( int i=0; i<k && i*blklen<mX.rlen; i++ )
tasks.add(new MatrixMultChainTask(mX, mV, mW, ret, ct, i*blklen, Math.min((i+1)*blklen, mX.rlen)));
pool.invokeAll(tasks);
pool.shutdown();
//aggregate partial results
for( MatrixMultChainTask task : tasks )
vectAdd(task.getResult().denseBlock, ret.denseBlock, 0, 0, mX.clen);
}
catch(Exception ex) {
throw new DMLRuntimeException(ex);
}
//post-processing
ret.recomputeNonZeros();
ret.examSparsity();
//System.out.println("MMChain "+ct.toString()+" k="+k+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
}
/**
* Performs a transpose-self matrix multiplication of type t(X)%*%X or X%*%t(X),
* computing only the upper triangle due to symmetry and mirroring it afterwards.
*
* @param m1
* @param ret
* @param leftTranspose
* @throws DMLRuntimeException
* @throws DMLUnsupportedOperationException
*/
public static void matrixMultTransposeSelf( MatrixBlock m1, MatrixBlock ret, boolean leftTranspose )
throws DMLUnsupportedOperationException, DMLRuntimeException
{
//check inputs / outputs
if( m1.isEmptyBlock(false) ) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//Timing time = new Timing(true);
//pre-processing
m1 = prepMatrixMultTransposeSelfInput(m1, leftTranspose);
ret.sparse = false;
ret.allocateDenseBlock();
if( m1.sparse )
matrixMultTransposeSelfSparse(m1, ret, leftTranspose, 0, ret.rlen);
else
matrixMultTransposeSelfDense(m1, ret, leftTranspose, 0, ret.rlen );
//post-processing
copyUpperToLowerTriangle( ret );
ret.recomputeNonZeros();
ret.examSparsity();
//System.out.println("TSMM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+","+leftTranspose+") in "+time.stop());
}
/**
* Performs a multi-threaded transpose-self matrix multiplication of type
* t(X)%*%X or X%*%t(X). The parameter k (k>=1) determines the max parallelism.
*
* @param m1
* @param ret
* @param leftTranspose
* @param k
* @throws DMLUnsupportedOperationException
* @throws DMLRuntimeException
*/
public static void matrixMultTransposeSelf( MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int k )
throws DMLUnsupportedOperationException, DMLRuntimeException
{
//check inputs / outputs
if( m1.isEmptyBlock(false) ) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//check no parallelization benefit (fallback to sequential)
//check too small workload in terms of flops (fallback to sequential too)
if( ret.rlen == 1
|| leftTranspose && 1L * m1.rlen * m1.clen * m1.clen < PAR_MINFLOP_THRESHOLD
|| !leftTranspose && 1L * m1.clen * m1.rlen * m1.rlen < PAR_MINFLOP_THRESHOLD )
{
matrixMultTransposeSelf(m1, ret, leftTranspose);
return;
}
//Timing time = new Timing(true);
//pre-processing
m1 = prepMatrixMultTransposeSelfInput(m1, leftTranspose);
ret.sparse = false;
ret.allocateDenseBlock();
//core multi-threaded matrix mult computation
try {
ExecutorService pool = Executors.newFixedThreadPool( k );
ArrayList<MatrixMultTransposeTask> tasks = new ArrayList<MatrixMultTransposeTask>();
//load balance via #tasks=2k due to triangular shape
int blklen = (int)(Math.ceil((double)ret.rlen/(2*k)));
for( int i=0; i<2*k && i*blklen<ret.rlen; i++ )
tasks.add(new MatrixMultTransposeTask(m1, ret, leftTranspose, i*blklen, Math.min((i+1)*blklen, ret.rlen)));
pool.invokeAll(tasks);
pool.shutdown();
}
catch(Exception ex) {
throw new DMLRuntimeException(ex);
}
//post-processing
copyUpperToLowerTriangle( ret );
ret.recomputeNonZeros();
ret.examSparsity();
//System.out.println("TSMM k="+k+" ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+","+leftTranspose+") in "+time.stop());
}
/**
* Performs a permutation matrix multiplication of permutation vector pm1
* with matrix m2, writing into up to two output blocks ret1 and ret2.
*
* @param pm1
* @param m2
* @param ret1
* @param ret2
* @throws DMLUnsupportedOperationException
* @throws DMLRuntimeException
*/
public static void matrixMultPermute( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2 )
throws DMLUnsupportedOperationException, DMLRuntimeException
{
//check inputs / outputs
if( pm1.isEmptyBlock(false) || m2.isEmptyBlock(false) )
return;
//Timing time = new Timing(true);
//pre-processing
ret1.sparse = (m2.sparse || ret1.sparse);
if( ret1.sparse )
ret1.allocateSparseRowsBlock();
else
ret1.allocateDenseBlock();
//core permutation mm computation
if( m2.sparse )
matrixMultPermuteSparse(pm1, m2, ret1, ret2, 0, pm1.rlen);
else if( ret1.sparse )
matrixMultPermuteDenseSparse(pm1, m2, ret1, ret2, 0, pm1.rlen);
else
matrixMultPermuteDense(pm1, m2, ret1, ret2, 0, pm1.rlen);
//post-processing
ret1.recomputeNonZeros();
ret1.examSparsity();
if( ret2 != null ) { //optional second output
ret2.recomputeNonZeros();
ret2.examSparsity();
}
//System.out.println("PMM Seq ("+pm1.isInSparseFormat()+","+pm1.getNumRows()+","+pm1.getNumColumns()+","+pm1.getNonZeros()+")x" +
// "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
}
/**
* Performs a multi-threaded permutation matrix multiplication.
*
* @param pm1
* @param m2
* @param ret1
* @param ret2
* @param k
* @throws DMLUnsupportedOperationException
* @throws DMLRuntimeException
*/
public static void matrixMultPermute( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int k)
throws DMLUnsupportedOperationException, DMLRuntimeException
{
//check inputs / outputs
if( pm1.isEmptyBlock(false) || m2.isEmptyBlock(false) )
return;
//check no parallelization benefit (fallback to sequential)
if (pm1.rlen == 1) {
matrixMultPermute(pm1, m2, ret1, ret2);
return;
}
//Timing time = new Timing(true);
//allocate first output block (second allocated if needed)
ret1.sparse = false;
ret1.allocateDenseBlock();
try
{
ExecutorService pool = Executors.newFixedThreadPool(k);
ArrayList<MatrixMultPermuteTask> tasks = new ArrayList<MatrixMultPermuteTask>();
int blklen = (int)(Math.ceil((double)pm1.rlen/k));
for( int i=0; i<k && i*blklen<pm1.rlen; i++ )
tasks.add(new MatrixMultPermuteTask(pm1, m2, ret1, ret2, i*blklen, Math.min((i+1)*blklen, pm1.rlen)));
pool.invokeAll(tasks);
pool.shutdown();
}
catch (InterruptedException e) {
throw new DMLRuntimeException(e);
}
//post-processing
ret1.recomputeNonZeros();
ret1.examSparsity();
if( ret2 != null ) { //optional second output
ret2.recomputeNonZeros();
ret2.examSparsity();
}
// System.out.println("PMM Par ("+pm1.isInSparseFormat()+","+pm1.getNumRows()+","+pm1.getNumColumns()+","+pm1.getNonZeros()+")x" +
// "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
}
/**
* Performs a weighted squared loss computation of one of the following forms:
* sum(W*(X-U%*%t(V))^2) (post weighting), sum((X!=0)*(X-U%*%t(V))^2) (post_nz
* weighting), sum((X-W*(U%*%t(V)))^2) (pre weighting), or sum((X-U%*%t(V))^2)
* (no weighting).
*
* @param mX
* @param mU
* @param mV
* @param mW
* @param ret
* @param wt
* @throws DMLRuntimeException
*/
public static void matrixMultWSLoss(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt)
throws DMLRuntimeException
{
//check for empty result
if( wt==WeightsType.POST && mW.isEmptyBlock(false)
|| wt==WeightsType.POST_NZ && mX.isEmptyBlock(false) ) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//Timing time = new Timing(true);
//core weighted squared loss mm computation
if( !mX.sparse && !mU.sparse && !mV.sparse && (mW==null || !mW.sparse)
&& !mX.isEmptyBlock() && !mU.isEmptyBlock() && !mV.isEmptyBlock() && (mW==null || !mW.isEmptyBlock()))
matrixMultWSLossDense(mX, mU, mV, mW, ret, wt, 0, mX.rlen);
else if( mX.sparse && !mU.sparse && !mV.sparse && (mW==null || mW.sparse)
&& !mX.isEmptyBlock() && !mU.isEmptyBlock() && !mV.isEmptyBlock() && (mW==null || !mW.isEmptyBlock()))
matrixMultWSLossSparseDense(mX, mU, mV, mW, ret, wt, 0, mX.rlen);
else
matrixMultWSLossGeneric(mX, mU, mV, mW, ret, wt, 0, mX.rlen);
//System.out.println("MMWSLoss " +wt.toString()+ " ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
}
/**
* Performs a multi-threaded weighted squared loss computation.
*
* @param mX
* @param mU
* @param mV
* @param mW
* @param ret
* @param wt
* @param k
* @throws DMLRuntimeException
*/
public static void matrixMultWSLoss(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt, int k)
throws DMLRuntimeException
{
//check for empty result
if( wt==WeightsType.POST && mW.isEmptyBlock(false)
|| wt==WeightsType.POST_NZ && mX.isEmptyBlock(false) ) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//check no parallelization benefit (fallback to sequential)
if (mX.rlen == 1) {
matrixMultWSLoss(mX, mU, mV, mW, ret, wt);
return;
}
//Timing time = new Timing(true);
try
{
ExecutorService pool = Executors.newFixedThreadPool(k);
ArrayList<ScalarResultTask> tasks = new ArrayList<ScalarResultTask>();
int blklen = (int)(Math.ceil((double)mX.rlen/k));
for( int i=0; i<k && i*blklen<mX.rlen; i++ )
tasks.add(new MatrixMultWSLossTask(mX, mU, mV, mW, wt, i*blklen, Math.min((i+1)*blklen, mX.rlen)));
pool.invokeAll(tasks);
pool.shutdown();
//aggregate partial results
sumScalarResults(tasks, ret);
}
catch (InterruptedException e) {
throw new DMLRuntimeException(e);
}
//System.out.println("MMWSLoss "+wt.toString()+" k="+k+" ("+mX.isInSparseFormat()+","+mX.getNumRows()+","+mX.getNumColumns()+","+mX.getNonZeros()+")x" +
// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
}
/**
* Performs a weighted sigmoid matrix multiplication over W, U, and V
* according to the given WSigmoidType.
*
* @param mW
* @param mU
* @param mV
* @param ret
* @param wt
* @throws DMLRuntimeException
*/
public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt)
throws DMLRuntimeException
{
//check for empty result
if( mW.isEmptyBlock(false) ) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//Timing time = new Timing(true);
//pre-processing
ret.sparse = mW.sparse;
ret.allocateDenseOrSparseBlock();
//core weighted sigmoid mm computation
if( !mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock() )
matrixMultWSigmoidDense(mW, mU, mV, ret, wt, 0, mW.rlen);
else if( mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
matrixMultWSigmoidSparseDense(mW, mU, mV, ret, wt, 0, mW.rlen);
else
matrixMultWSigmoidGeneric(mW, mU, mV, ret, wt, 0, mW.rlen);
//post-processing
ret.recomputeNonZeros();
ret.examSparsity();
//System.out.println("MMWSig "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
}
/**
* Performs a multi-threaded weighted sigmoid matrix multiplication.
*
* @param mW
* @param mU
* @param mV
* @param ret
* @param wt
* @param k
* @throws DMLRuntimeException
*/
public static void matrixMultWSigmoid(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int k)
throws DMLRuntimeException
{
//check for empty result
if( mW.isEmptyBlock(false) ) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//check no parallelization benefit (fallback to sequential)
if (mW.rlen == 1) {
matrixMultWSigmoid(mW, mU, mV, ret, wt);
return;
}
//Timing time = new Timing(true);
//pre-processing
ret.sparse = mW.sparse;
ret.allocateDenseOrSparseBlock();
try
{
ExecutorService pool = Executors.newFixedThreadPool(k);
ArrayList<MatrixMultWSigmoidTask> tasks = new ArrayList<MatrixMultWSigmoidTask>();
int blklen = (int)(Math.ceil((double)mW.rlen/k));
for( int i=0; i<k && i*blklen<mW.rlen; i++ )
tasks.add(new MatrixMultWSigmoidTask(mW, mU, mV, ret, wt, i*blklen, Math.min((i+1)*blklen, mW.rlen)));
pool.invokeAll(tasks);
pool.shutdown();
ret.nonZeros = 0; //reset after execute
for( MatrixMultWSigmoidTask task : tasks )
ret.nonZeros += task.getPartialNnz();
}
catch (InterruptedException e) {
throw new DMLRuntimeException(e);
}
//post-processing (nnz maintained in parallel)
ret.examSparsity();
//System.out.println("MMWSig "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop() + ".");
}
/**
* NOTE: This operation has limited NaN support, which is acceptable because all our sparse-safe operations
* have only limited NaN support. If this is not intended behavior, please disable the rewrite. In detail,
* this operator will produce for W/(U%*%t(V)) a zero intermediate for each zero in W (even if UVij is zero,
* which would give 0/0=NaN) but INF/-INF for non-zero entries in W where the corresponding cell in (U%*%t(V))
* is zero.
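* For example, a cell with W[i][j]==0 and (U%*%t(V))[i][j]==0 produces the
* intermediate 0 (not NaN), whereas W[i][j]==2 over (U%*%t(V))[i][j]==0
* produces Infinity.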
*
* @param mW
* @param mU
* @param mV
* @param ret
* @param wt
* @throws DMLRuntimeException
*/
public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WDivMMType wt)
throws DMLRuntimeException
{
//check for empty result
if( mW.isEmptyBlock(false)
|| (wt.isLeft() && mU.isEmptyBlock(false))
|| (wt.isRight() && mV.isEmptyBlock(false))
|| (wt.isBasic() && mW.isEmptyBlock(false))) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//Timing time = new Timing(true);
//pre-processing
ret.sparse = wt.isBasic()?mW.sparse:false;
ret.allocateDenseOrSparseBlock();
//core weighted div mm computation
if( !mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock() )
matrixMultWDivMMDense(mW, mU, mV, ret, wt, 0, mW.rlen, 0, mW.clen);
else if( mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
matrixMultWDivMMSparseDense(mW, mU, mV, ret, wt, 0, mW.rlen, 0, mW.clen);
else
matrixMultWDivMMGeneric(mW, mU, mV, ret, wt, 0, mW.rlen, 0, mW.clen);
//post-processing
ret.recomputeNonZeros();
ret.examSparsity();
//System.out.println("MMWDiv "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
}
/**
* NOTE: This operation has limited NaN support, which is acceptable because all our sparse-safe operations
* have only limited NaN support. If this is not intended behavior, please disable the rewrite. In detail,
* this operator will produce for W/(U%*%t(V)) a zero intermediate for each zero in W (even if UVij is zero,
* which would give 0/0=NaN) but INF/-INF for non-zero entries in W where the corresponding cell in (U%*%t(V))
* is zero.
*
* @param mW
* @param mU
* @param mV
* @param ret
* @param wt
* @param k
* @throws DMLRuntimeException
*/
public static void matrixMultWDivMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WDivMMType wt, int k)
throws DMLRuntimeException
{
//check for empty result
if( mW.isEmptyBlock(false)
|| (wt.isLeft() && mU.isEmptyBlock(false))
|| (wt.isRight() && mV.isEmptyBlock(false))
|| (wt.isBasic() && mW.isEmptyBlock(false))) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//Timing time = new Timing(true);
//pre-processing
ret.sparse = wt.isBasic()?mW.sparse:false;
ret.allocateDenseOrSparseBlock();
try
{
ExecutorService pool = Executors.newFixedThreadPool(k);
ArrayList<MatrixMultWDivTask> tasks = new ArrayList<MatrixMultWDivTask>();
//create tasks (for wdivmm-left, parallelization over columns;
//for wdivmm-right, parallelization over rows; both ensure disjoint results)
if( wt.isLeft() ) {
int blklen = (int)(Math.ceil((double)mW.clen/k));
for( int j=0; j<k && j*blklen<mW.clen; j++ )
tasks.add(new MatrixMultWDivTask(mW, mU, mV, ret, wt, 0, mW.rlen, j*blklen, Math.min((j+1)*blklen, mW.clen)));
}
else { //basic/right
int blklen = (int)(Math.ceil((double)mW.rlen/k));
for( int i=0; i<k && i*blklen<mW.rlen; i++ )
tasks.add(new MatrixMultWDivTask(mW, mU, mV, ret, wt, i*blklen, Math.min((i+1)*blklen, mW.rlen), 0, mW.clen));
}
//execute tasks
pool.invokeAll(tasks);
pool.shutdown();
//aggregate partial nnz
for( MatrixMultWDivTask task : tasks )
ret.nonZeros += task.getPartialNnz();
}
catch (InterruptedException e) {
throw new DMLRuntimeException(e);
}
//post-processing
ret.examSparsity();
//System.out.println("MMWDiv "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
}
/**
* Performs a weighted cross-entropy matrix multiplication over W, U, and V.
*
* @param mW
* @param mU
* @param mV
* @param ret
* @param wt
* @throws DMLRuntimeException
*/
public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WCeMMType wt)
throws DMLRuntimeException
{
//check for empty result
if( mW.isEmptyBlock(false) ) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//Timing time = new Timing(true);
//pre-processing
ret.sparse = false;
ret.allocateDenseBlock();
//core weighted cross-entropy mm computation
if( !mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock() )
matrixMultWCeMMDense(mW, mU, mV, ret, wt, 0, mW.rlen);
else if( mW.sparse && !mU.sparse && !mV.sparse && !mU.isEmptyBlock() && !mV.isEmptyBlock())
matrixMultWCeMMSparseDense(mW, mU, mV, ret, wt, 0, mW.rlen);
else
matrixMultWCeMMGeneric(mW, mU, mV, ret, wt, 0, mW.rlen);
//System.out.println("MMWCe "+wt.toString()+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
}
/**
* Performs a multi-threaded weighted cross-entropy matrix multiplication.
*
* @param mW
* @param mU
* @param mV
* @param ret
* @param wt
* @param k
* @throws DMLRuntimeException
*/
public static void matrixMultWCeMM(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WCeMMType wt, int k)
throws DMLRuntimeException
{
//check for empty result
if( mW.isEmptyBlock(false) ) {
ret.examSparsity(); //turn empty dense into sparse
return;
}
//Timing time = new Timing(true);
//pre-processing
ret.sparse = false;
ret.allocateDenseBlock();
try
{
ExecutorService pool = Executors.newFixedThreadPool(k);
ArrayList<ScalarResultTask> tasks = new ArrayList<ScalarResultTask>();
int blklen = (int)(Math.ceil((double)mW.rlen/k));
for( int i=0; i<k && i*blklen<mW.rlen; i++ )
tasks.add(new MatrixMultWCeTask(mW, mU, mV, wt, i*blklen, Math.min((i+1)*blklen, mW.rlen)));
pool.invokeAll(tasks);
pool.shutdown();
//aggregate partial results
sumScalarResults(tasks, ret);
}
catch (InterruptedException e) {
throw new DMLRuntimeException(e);
}
//System.out.println("MMWCe "+wt.toString()+" k="+k+" ("+mW.isInSparseFormat()+","+mW.getNumRows()+","+mW.getNumColumns()+","+mW.getNonZeros()+")x" +
// "("+mV.isInSparseFormat()+","+mV.getNumRows()+","+mV.getNumColumns()+","+mV.getNonZeros()+") in "+time.stop());
}
//////////////////////////////////////////
// optimized matrix mult implementation //
//////////////////////////////////////////
/**
*
* @param m1
* @param m2
* @param ret
* @param tm2
* @param pm2
* @param rl
* @param ru
* @throws DMLRuntimeException
*/
private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru)
throws DMLRuntimeException
{
double[] a = m1.denseBlock;
double[] b = m2.denseBlock;
double[] c = ret.denseBlock;
final int m = m1.rlen;
final int n = m2.clen;
final int cd = m1.clen;
if( LOW_LEVEL_OPTIMIZATION )
{
if( m==1 && n==1 ) //DOT PRODUCT
{
c[0] = dotProduct(a, b, cd);
}
else if( n>1 && cd == 1 ) //OUTER PRODUCT
{
for( int i=rl, cix=rl*n; i < ru; i++, cix+=n) {
if( a[i] == 1 )
System.arraycopy(b, 0, c, cix, n);
else if( a[i] != 0 )
vectMultiplyWrite(a[i], b, c, 0, cix, n);
else
Arrays.fill(c, cix, cix+n, 0);
}
}
else if( n==1 && cd == 1 ) //VECTOR-SCALAR
{
vectMultiplyWrite(b[0], a, c, rl, rl, ru-rl);
}
else if( n==1 ) //MATRIX-VECTOR
{
for( int i=rl, aix=rl*cd; i < ru; i++, aix+=cd)
c[ i ] = dotProduct(a, b, aix, 0, cd);
}
else if( pm2 && m==1 ) //VECTOR-MATRIX
{
//parallelization over rows in rhs matrix
//rest not aligned to blocks of 2 rows
final int kn = (ru-rl)%2;
if( kn == 1 && a[rl] != 0 )
vectMultiplyAdd(a[rl], b, c, rl*n, 0, n);
//compute blocks of 2 rows (2 instead of 4 for small n<64)
for( int k=rl+kn, bix=(rl+kn)*n; k<ru; k+=2, bix+=2*n ){
if( a[k] != 0 && a[k+1] != 0 )
vectMultiplyAdd2(a[k], a[k+1], b, c, bix, bix+n, 0, n);
else if( a[k] != 0 )
vectMultiplyAdd(a[k], b, c, bix, 0, n);
else if( a[k+1] != 0 )
vectMultiplyAdd(a[k+1], b, c, bix+n, 0, n);
}
}
else if( pm2 && m<=16 ) //MATRIX-MATRIX (short lhs)
{
//parallelization over rows in rhs matrix
final int kn = (ru-rl)%2;
//rest not aligned to blocks of 2 rows
if( kn == 1 )
for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n )
if( a[aix+rl] != 0 )
vectMultiplyAdd(a[aix+rl], b, c, rl*n, cix, n);
//compute blocks of 2 rows (w/ repeated scan for each row in lhs)
for( int k=rl+kn, bix=(rl+kn)*n; k<ru; k+=2, bix+=2*n )
for( int i=0, aix=0, cix=0; i<m; i++, aix+=cd, cix+=n ){
if( a[aix+k] != 0 && a[aix+k+1] != 0 )
vectMultiplyAdd2(a[aix+k], a[aix+k+1], b, c, bix, bix+n, cix, n);
else if( a[aix+k] != 0 )
vectMultiplyAdd(a[aix+k], b, c, bix, cix, n);
else if( a[aix+k+1] != 0 )
vectMultiplyAdd(a[aix+k+1], b, c, bix+n, cix, n);
}
}
else if( tm2 ) //MATRIX-MATRIX (skinny rhs)
{
//note: prepared rhs input via transpose for: m > n && cd > 64 && n < 64
//however, an explicit flag is required since the dimensions of m2 changed
final int n2 = m2.rlen;
for( int i=rl, aix=rl*cd, cix=rl*n2; i < ru; i++, aix+=cd, cix+=n2 )
for( int j=0, bix=0; j<n2; j++, bix+=cd )
c[cix+j] = dotProduct(a, b, aix, bix, cd);
}
else //MATRIX-MATRIX
{
//1) Unrolled inner loop (for better instruction-level parallelism)
//2) Blocked execution (for less cache thrashing in parallel exec)
//3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block
final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c
final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan
//temporary arrays (nnz a, b index)
double[] ta = new double[ blocksizeK ];
int[] tbi = new int[ blocksizeK ];
//blocked execution
for( int bi = rl; bi < ru; bi+=blocksizeI )
for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK )
for( int bj = 0, bkmin = Math.min(cd, bk+blocksizeK); bj < n; bj+=blocksizeJ )
{
int bklen = bkmin-bk;
int bjlen = Math.min(n, bj+blocksizeJ)-bj;
//core sub block matrix multiplication
for( int i = bi; i < bimin; i++)
{
int aixi = i * cd + bk; //start index on a
int cixj = i * n + bj; //scan index on c
//determine nnz of a (for sparsity-aware skipping of rows)
int knnz = copyNonZeroElements(a, aixi, bk, bj, n, ta, tbi, bklen);
//if( knnz > 0 ) //for skipping empty rows
//rest not aligned to blocks of 4 rows
final int bn = knnz % 4;
switch( bn ){
case 1: vectMultiplyAdd(ta[0], b, c, tbi[0], cixj, bjlen); break;
case 2: vectMultiplyAdd2(ta[0],ta[1], b, c, tbi[0], tbi[1], cixj, bjlen); break;
case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], b, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
}
//compute blocks of 4 rows (core inner loop)
for( int k = bn; k<knnz; k+=4 ){
vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], b, c,
tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
}
}
}
}
}
else
{
double val;
for( int i = rl, aix=rl*cd, cix=rl*n; i < ru; i++, cix+=n)
for( int k = 0, bix=0; k < cd; k++, aix++, bix+=n)
{
val = a[ aix ];
if( val != 0 )
for( int j = 0; j < n; j++)
c[ cix+j ] += val * b[ bix+j ];
}
}
}
/**
*
* @param m1
* @param m2
* @param ret
* @param pm2
* @param rl
* @param ru
* @throws DMLRuntimeException
*/
private static void matrixMultDenseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru)
throws DMLRuntimeException
{
double[] a = m1.denseBlock;
double[] c = ret.denseBlock;
int m = m1.rlen;
int cd = m1.clen;
int n = m2.clen;
// MATRIX-MATRIX (VV, MV not applicable here because V always dense)
if( LOW_LEVEL_OPTIMIZATION )
{
final int blocksizeI = 32; //256KB c block (typical L2 size per core), 32KB a block
final int blocksizeK = 32;
//note: in contrast to dense-dense, no blocking over j (would require maintaining blocksizeK indexes, counter-productive on skew)
SparseRow[] b = m2.sparseRows;
if( pm2 && m==1 ) //VECTOR-MATRIX
{
//parallelization over rows in rhs matrix
for( int k=rl; k<ru; k++ )
if( a[k] != 0 && b[k] != null && !b[k].isEmpty() ) {
int[] bix = b[k].getIndexContainer();
double[] bvals = b[k].getValueContainer();
vectMultiplyAdd(a[k], bvals, c, bix, 0, b[k].size());
}
}
else //MATRIX-MATRIX
{
//blocked execution
for( int bi = rl; bi < ru; bi+=blocksizeI )
for( int bk = 0, bimin = Math.min(ru, bi+blocksizeI); bk < cd; bk+=blocksizeK )
{
int bklen = Math.min(cd, bk+blocksizeK)-bk;
//core sub block matrix multiplication
for( int i = bi; i < bimin; i++)
{
int aixi = i * cd + bk; //start index on a
int cixj = i * n + 0; //scan index on c
for( int k = 0; k < bklen; k++ )
{
double val = a[aixi+k];
SparseRow brow = b[ bk+k ];
if( val != 0 && brow != null && !brow.isEmpty() ) {
int blen = brow.size();
int[] bix = brow.getIndexContainer();
double[] bvals = brow.getValueContainer();
vectMultiplyAdd(val, bvals, c, bix, cixj, blen);
}
}
}
}
}
}
else
{
for( int i=rl, aix=rl*cd, cix=rl*n; i < ru; i++, cix+=n )
for(int k = 0; k < cd; k++, aix++ )
{
double val = a[aix];
if( val!=0 )
{
SparseRow brow = m2.sparseRows[ k ];
if( brow != null && !brow.isEmpty() )
{
int blen = brow.size();
int[] bix = brow.getIndexContainer();
double[] bvals = brow.getValueContainer();
for(int j = 0; j < blen; j++)
c[cix+bix[j]] += val * bvals[j];
}
}
}
}
}
/**
*
* @param m1
* @param m2
* @param ret
* @param pm2
* @param rl
* @param ru
* @throws DMLRuntimeException
*/
private static void matrixMultSparseDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru)
throws DMLRuntimeException
{
double[] b = m2.denseBlock;
double[] c = ret.denseBlock;
final int m = m1.rlen;
final int n = m2.clen;
if( LOW_LEVEL_OPTIMIZATION )
{
if( m==1 && n==1 ) //DOT PRODUCT
{
SparseRow arow = m1.sparseRows[0];
if( arow != null && !arow.isEmpty() )
{
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
c[0] = dotProduct(avals, b, aix, 0, alen);
}
}
else if( n==1 ) //MATRIX-VECTOR
{
for( int i=rl; i<Math.min(ru, m1.sparseRows.length); i++ )
{
SparseRow arow = m1.sparseRows[i];
if( arow != null && !arow.isEmpty() )
{
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
c[i] = dotProduct(avals, b, aix, 0, alen);
}
}
}
else if( pm2 && m==1 ) //VECTOR-MATRIX
{
//parallelization over rows in rhs matrix
SparseRow arow = m1.sparseRows[0];
if( arow != null && !arow.isEmpty() )
{
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
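//find first sparse index >= rl (a negative search result means no such
//index exists; mapping it to alen skips this row for the given range)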
int rlix = (rl==0) ? 0 : arow.searchIndexesFirstGTE(rl);
rlix = (rlix>=0) ? rlix : alen;
for( int k=rlix; k<alen && aix[k]<ru; k++ ) {
if( k+1<alen && aix[k+1]<ru )
vectMultiplyAdd2(avals[k], avals[k+1], b, c, aix[k]*n, aix[++k]*n, 0, n);
else
vectMultiplyAdd(avals[k], b, c, aix[k]*n, 0, n);
}
}
}
else //MATRIX-MATRIX
{
for( int i=rl, cix=rl*n; i<Math.min(ru, m1.sparseRows.length); i++, cix+=n )
{
SparseRow arow = m1.sparseRows[i];
if( arow != null && !arow.isEmpty() )
{
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
if( alen==1 && avals[0]==1 ) //ROW SELECTION
{
//plain memcopy for permutation matrices
System.arraycopy(b, aix[0]*n, c, cix, n);
}
else //GENERAL CASE
{
//rest not aligned to blocks of 4 rows
final int bn = alen % 4;
switch( bn ){
case 1: vectMultiplyAdd(avals[0], b, c, aix[0]*n, cix, n); break;
case 2: vectMultiplyAdd2(avals[0],avals[1], b, c, aix[0]*n, aix[1]*n, cix, n); break;
case 3: vectMultiplyAdd3(avals[0],avals[1],avals[2], b, c, aix[0]*n, aix[1]*n, aix[2]*n, cix, n); break;
}
//compute blocks of 4 rows (core inner loop)
for( int k = bn; k<alen; k+=4 ) {
vectMultiplyAdd4( avals[k], avals[k+1], avals[k+2], avals[k+3], b, c,
aix[k]*n, aix[k+1]*n, aix[k+2]*n, aix[k+3]*n, cix, n );
}
}
}
}
}
}
else
{
for( int i=rl, cix=rl*n; i<Math.min(ru, m1.sparseRows.length); i++, cix+=n )
{
SparseRow arow = m1.sparseRows[i];
if( arow != null && !arow.isEmpty() )
{
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
for(int k = 0; k < alen; k++)
{
double val = avals[k];
for(int j = 0, bix=aix[k]*n; j < n; j++)
c[cix+j] += val * b[bix+j];
}
}
}
}
}
/**
*
* @param m1
* @param m2
* @param ret
* @param pm2
* @param rl
* @param ru
* @throws DMLRuntimeException
*/
private static void matrixMultSparseSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru)
throws DMLRuntimeException
{
SparseRow[] b = m2.sparseRows;
double[] c = ret.denseBlock;
int m = m1.rlen;
int n = m2.clen;
// MATRIX-MATRIX (VV, MV not applicable here because V always dense)
if(LOW_LEVEL_OPTIMIZATION)
{
if( pm2 && m==1 ) //VECTOR-MATRIX
{
//parallelization over rows in rhs matrix
SparseRow arow = m1.sparseRows[0];
if( arow != null && !arow.isEmpty() )
{
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
int rlix = (rl==0) ? 0 : arow.searchIndexesFirstGTE(rl);
rlix = (rlix>=0) ? rlix : alen;
for( int k=rlix; k<alen && aix[k]<ru; k++ )
if( b[aix[k]] != null && !b[aix[k]].isEmpty() ) {
SparseRow brow = b[aix[k]];
int blen = brow.size();
int[] bix = brow.getIndexContainer();
double[] bvals = brow.getValueContainer();
vectMultiplyAdd(avals[k], bvals, c, bix, 0, blen);
}
}
}
else //MATRIX-MATRIX
{
for( int i=rl, cix=rl*n; i<Math.min(ru, m1.sparseRows.length); i++, cix+=n )
{
SparseRow arow = m1.sparseRows[i];
if( arow != null && !arow.isEmpty() )
{
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
for(int k = 0; k < alen; k++)
{
double val = avals[k];
SparseRow brow = b[ aix[k] ];
if( brow != null && !brow.isEmpty() )
{
int blen = brow.size();
int[] bix = brow.getIndexContainer();
double[] bvals = brow.getValueContainer();
vectMultiplyAdd(val, bvals, c, bix, cix, blen);
}
}
}
}
}
}
else
{
for( int i=rl, cix=rl*n; i<Math.min(ru, m1.sparseRows.length); i++, cix+=n )
{
SparseRow arow = m1.sparseRows[i];
if( arow != null && !arow.isEmpty() )
{
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
for(int k = 0; k < alen; k++)
{
double val = avals[k];
SparseRow brow = m2.sparseRows[ aix[k] ];
if( brow != null && !brow.isEmpty() )
{
int blen = brow.size();
int[] bix = brow.getIndexContainer();
double[] bvals = brow.getValueContainer();
for(int j = 0; j < blen; j++)
c[cix+bix[j]] += val * bvals[j];
}
}
}
}
}
}
/**
* This implementation applies to any combination of dense/sparse if at least one
* input is ultra-sparse (sparse with very few nnz). In that case, most importantly,
* we want to create a sparse output and iterate only over the few nnz as the major
* dimension. Low-level optimizations are less important in that case, and this
* generic implementation reduces the number of required kernels from (2+1)^2=9
* (dense, sparse, and ultra-sparse per input) to 2^2+1=5.
*
* @param m1
* @param m2
* @param ret
* @param rl
* @param ru
* @throws DMLRuntimeException
*/
private static void matrixMultUltraSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru)
throws DMLRuntimeException
{
boolean leftUS = m1.isUltraSparse();
final int m = m1.rlen;
final int cd = m1.clen;
final int n = m2.clen;
if( leftUS ) //left is ultra-sparse (IKJ)
{
boolean rightSparse = m2.sparse;
for( int i=rl; i<ru; i++ )
{
SparseRow arow = m1.sparseRows[ i ];
if( arow != null && !arow.isEmpty() )
{
int alen = arow.size();
int[] aixs = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
if( alen==1 && avals[0]==1 ) //ROW SELECTION (no aggregation)
{
int aix = aixs[0];
if( rightSparse ) { //sparse right matrix (full row copy)
if( m2.sparseRows!=null && m2.sparseRows[aix]!=null ) {
ret.rlen=m;
ret.allocateSparseRowsBlock(false); //allocation on demand
ret.sparseRows[i] = new SparseRow(m2.sparseRows[aix]);
ret.nonZeros += ret.sparseRows[i].size();
}
}
else { //dense right matrix (append all values)
for( int j=0; j<n; j++ )
ret.appendValue(i, j, m2.quickGetValue(aix, j));
}
}
else //GENERAL CASE
{
for( int k=0; k<alen; k++ )
{
double aval = avals[k];
int aix = aixs[k];
for( int j=0; j<n; j++ )
{
double cval = ret.quickGetValue(i, j);
double cvald = aval*m2.quickGetValue(aix, j);
if( cvald != 0 )
ret.quickSetValue(i, j, cval+cvald);
}
}
}
}
}
}
else //right is ultra-sparse (KJI)
{
for(int k = 0; k < cd; k++ )
{
SparseRow brow = m2.sparseRows[ k ];
if( brow != null && !brow.isEmpty() )
{
int blen = brow.size();
int[] bixs = brow.getIndexContainer();
double[] bvals = brow.getValueContainer();
for( int j=0; j<blen; j++ )
{
double bval = bvals[j];
int bix = bixs[j];
for( int i=rl; i<ru; i++ )
{
double cvald = bval*m1.quickGetValue(i, k);
if( cvald != 0 ){
double cval = ret.quickGetValue(i, bix);
ret.quickSetValue(i, bix, cval+cvald);
}
}
}
}
}
}
//no need to recompute nonzeros because maintained internally
}
/**
*
* @param mX
* @param mV
* @param mW
* @param ret
* @param ct
* @param rl
* @param ru
*/
private static void matrixMultChainDense(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int rl, int ru)
{
double[] a = mX.denseBlock;
double[] b = mV.denseBlock;
double[] w = (mW!=null) ? mW.denseBlock : null;
double[] c = ret.denseBlock;
final int cd = mX.clen; //features in X
boolean weights = (ct == ChainType.XtwXv);
//temporary array for cache blocking
//(blocksize chosen to fit b+v in L2 (256KB) for default 1k blocks)
final int blocksize = 24; // constraint: multiple of 4 (4-way unrolled loop below)
double[] tmp = new double[blocksize];
//blockwise mmchain computation
final int bn = ru - ru % blocksize; //ru rounded down to blocksize (rl assumed blocksize-aligned)
for( int bi=rl; bi < bn; bi+=blocksize )
{
//compute 1st matrix-vector for row block
for( int j=0, aix=bi*cd; j < blocksize; j++, aix+=cd)
tmp[j] = dotProduct(a, b, aix, 0, cd);
//multiply weights (in-place), if required
if( weights )
vectMultiply(w, tmp, bi, 0, blocksize);
//compute 2nd matrix vector for row block and aggregate
for (int j=0, aix=bi*cd; j < blocksize; j+=4, aix+=4*cd)
vectMultiplyAdd4(tmp[j], tmp[j+1], tmp[j+2], tmp[j+3], a, c, aix, aix+cd, aix+2*cd, aix+3*cd, 0, cd);
}
//compute rest (not aligned to blocksize)
for( int i=bn, aix=bn*cd; i < ru; i++, aix+=cd ) {
double val = dotProduct(a, b, aix, 0, cd);
val *= (weights) ? w[i] : 1;
vectMultiplyAdd(val, a, c, aix, 0, cd);
}
}
/**
*
* @param mX
* @param mV
* @param mW
* @param ret
* @param ct
* @param rl
* @param ru
*/
private static void matrixMultChainSparse(MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int rl, int ru)
{
SparseRow[] a = mX.sparseRows;
double[] b = mV.denseBlock;
double[] w = (mW!=null) ? mW.denseBlock : null;
double[] c = ret.denseBlock;
boolean weights = (ct == ChainType.XtwXv);
//temporary array for cache blocking
//(blocksize chosen to fit b+v in L2 (256KB) for default 1k blocks)
final int blocksize = 24;
double[] tmp = new double[blocksize];
//blockwise mmchain computation
for( int bi=rl; bi < ru; bi+=blocksize )
{
//determine length of row block intermediate
int tmplen = Math.min(blocksize, ru-bi);
//compute 1st matrix-vector for row block
for( int j=0; j < tmplen; j++) {
SparseRow arow = a[bi+j];
if( arow != null && !arow.isEmpty() ) {
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
tmp[j] = dotProduct(avals, b, aix, 0, alen);
}
}
//multiply weights (in-place), if required
if( weights )
vectMultiply(w, tmp, bi, 0, tmplen);
//compute 2nd matrix vector for row block and aggregate
for( int j=0; j < tmplen; j++) {
SparseRow arow = a[bi+j];
if( arow != null && !arow.isEmpty() && tmp[j] != 0 ) {
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
vectMultiplyAdd(tmp[j], avals, c, aix, 0, alen);
}
}
}
}
/**
*
* @param m1
* @param ret
* @param leftTranspose
* @param rl
* @param ru
* @throws DMLRuntimeException
*/
private static void matrixMultTransposeSelfDense( MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int rl, int ru )
throws DMLRuntimeException
{
//transpose-self matrix multiply dense
// (compute only upper-triangular matrix due to symmetry)
double[] a = m1.denseBlock;
double[] c = ret.denseBlock;
int m = m1.rlen;
int n = m1.clen;
if( leftTranspose ) // t(X)%*%X
{
if( LOW_LEVEL_OPTIMIZATION )
{
if( n==1 ) //VECTOR (col)
{
c[0] = dotProduct(a, a, m);
}
else //MATRIX
{
//1) Unrolled inner loop (for better instruction-level parallelism)
//2) Blocked execution (for less cache thrashing in parallel exec)
//3) Asymmetric block sizes (for less misses in inner loop, yet blocks in L1/L2)
final int blocksizeI = 32; //64//256KB c block (typical L2 size per core), 32KB a block
final int blocksizeK = 24; //64//256KB b block (typical L2 size per core), used while read 512B of a / read/write 4KB of c
final int blocksizeJ = 1024; //512//4KB (typical main-memory page size), for scan
//temporary arrays (nnz a, b index)
double[] ta = new double[ blocksizeK ];
int[] tbi = new int[ blocksizeK ];
final int mx = ru;
final int cdx = m;
final int nx = n;
//blocked execution
for( int bi = rl; bi < mx; bi+=blocksizeI ) //from bi due to symmetry
for( int bk = 0, bimin = Math.min(mx, bi+blocksizeI); bk < cdx; bk+=blocksizeK )
for( int bj = bi, bkmin = Math.min(cdx, bk+blocksizeK); bj < nx; bj+=blocksizeJ )
{
int bklen = bkmin-bk;
int bjlen = Math.min(nx, bj+blocksizeJ)-bj;
//core sub block matrix multiplication
for( int i = bi; i < bimin; i++)
{
int aixi = bk*n +i; //start index on a (logical t(X))
int cixj = i * nx + bj; //scan index on c
//determine nnz of a (for sparsity-aware skipping of rows)
int knnz = copyNonZeroElements(a, aixi, bk, bj, n, nx, ta, tbi, bklen);
//rest not aligned to blocks of 4 rows
final int bn = knnz % 4;
switch( bn ){
case 1: vectMultiplyAdd(ta[0], a, c, tbi[0], cixj, bjlen); break;
case 2: vectMultiplyAdd2(ta[0],ta[1], a, c, tbi[0], tbi[1], cixj, bjlen); break;
case 3: vectMultiplyAdd3(ta[0],ta[1],ta[2], a, c, tbi[0], tbi[1],tbi[2], cixj, bjlen); break;
}
//compute blocks of 4 rows (core inner loop)
for( int k = bn; k<knnz; k+=4 ){
vectMultiplyAdd4( ta[k], ta[k+1], ta[k+2], ta[k+3], a, c,
tbi[k], tbi[k+1], tbi[k+2], tbi[k+3], cixj, bjlen );
}
}
}
}
}
else
{
for(int k = 0, ix1 = 0; k < m; k++, ix1+=n)
for(int i = rl, ix3 = 0; i < ru; i++, ix3+=n)
{
double val = a[ ix1+i ];
if( val != 0 )
{
for(int j = i; j < n; j++) //from i due to symmetry
c[ ix3+j ] += val * a[ ix1+j ];
}
}
}
}
else // X%*%t(X)
{
if(LOW_LEVEL_OPTIMIZATION)
{
if( m==1 ) //VECTOR
{
c[0] = dotProduct(a, a, n);
}
else //MATRIX
{
//algorithm: scan c, foreach ci,j: scan row of a and t(a) (IJK)
//1) Unrolled inner loop, for better ILP
//2) Blocked execution, for less cache thrashing in parallel exec
// (smaller block sizes would be slightly better, but consistent as is)
//3) Single write in inner loop (transient intermediates)
int blocksize = 64;
for( int bi = rl; bi<ru; bi+=blocksize )
for( int bj = bi; bj<m; bj+=blocksize )
{
final int bimin = Math.min(ru, bi+blocksize);
final int bjmin = Math.min(m, bj+blocksize);
for(int i = bi, ix1 = bi*n, ix3 = bi*m; i < bimin; i++, ix1+=n, ix3+=m)
{
final int bjmax = Math.max(i,bj);
for(int j = bjmax, ix2 = bjmax*n; j <bjmin; j++, ix2+=n) //from i due to symmetry
{
c[ ix3+j ] = dotProduct(a, a, ix1, ix2, n);
}
}
}
}
}
else
{
for(int i = rl, ix1 = 0, ix3 = 0; i < ru; i++, ix1+=n, ix3+=m)
for(int j = i, ix2 = i*n; j < m; j++, ix2+=n) //from i due to symmetry
{
double val = 0;
for(int k = 0; k < n; k++)
val += a[ ix1+k ] * a[ix2+k];
c[ ix3+j ] = val;
}
}
}
}
/**
*
* @param m1
* @param ret
* @param leftTranspose
* @param rl
* @param ru
* @throws DMLRuntimeException
*/
private static void matrixMultTransposeSelfSparse( MatrixBlock m1, MatrixBlock ret, boolean leftTranspose, int rl, int ru )
throws DMLRuntimeException
{
//transpose-self matrix multiply sparse
// (compute only upper-triangular matrix due to symmetry)
double[] c = ret.denseBlock;
int m = m1.rlen;
int n = m1.clen;
if( leftTranspose ) // t(X)%*%X
{
//only general case (because vectors always dense)
//algorithm: scan rows, foreach row self join (KIJ)
if( LOW_LEVEL_OPTIMIZATION )
{
for( SparseRow arow : m1.sparseRows )
if( arow != null && !arow.isEmpty() )
{
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
int rlix = (rl==0) ? 0 : arow.searchIndexesFirstGTE(rl);
rlix = (rlix>=0) ? rlix : alen;
for(int i = rlix; i < alen && aix[i]<ru; i++)
{
double val = avals[i];
if( val != 0 ) {
int ix2 = aix[i]*n;
vectMultiplyAdd(val, avals, c, aix, i, ix2, alen);
}
}
}
}
else
{
for( SparseRow arow : m1.sparseRows )
if( arow != null && !arow.isEmpty() )
{
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
int rlix = (rl==0) ? 0 : arow.searchIndexesFirstGTE(rl);
rlix = (rlix>=0) ? rlix : alen;
for(int i = rlix; i < alen && aix[i]<ru; i++)
{
double val = avals[i];
if( val != 0 )
for(int j = i, ix2 = aix[i]*n; j < alen; j++)
c[ix2+aix[j]] += val * avals[j];
}
}
}
}
else // X%*%t(X)
{
if( m==1 ) //VECTOR
{
SparseRow arow = m1.sparseRows[0];
if( arow !=null && !arow.isEmpty() )
{
int alen = arow.size();
double[] avals = arow.getValueContainer();
c[0] = dotProduct(avals, avals, alen);
}
}
else //MATRIX
{
//note: reorg to similar layout as t(X)%*%X because faster than
//direct computation with IJK (no dependencies/branches in inner loop)
//see prepMatrixMultTransposeSelfInput m1<-tmpBlock
m = m1.clen;
n = m1.rlen;
//algorithm: scan rows, foreach row self join (KIJ)
if( LOW_LEVEL_OPTIMIZATION )
{
for( SparseRow arow : m1.sparseRows )
if( arow != null && !arow.isEmpty() )
{
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
int rlix = (rl==0) ? 0 : arow.searchIndexesFirstGTE(rl);
rlix = (rlix>=0) ? rlix : alen;
for(int i = rlix; i < alen && aix[i]<ru; i++)
{
double val = avals[i];
if( val != 0 ) {
int ix2 = aix[i]*m;
vectMultiplyAdd(val, avals, c, aix, i, ix2, alen);
}
}
}
}
else
{
for( SparseRow arow : m1.sparseRows )
if( arow != null && !arow.isEmpty() )
{
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
int rlix = (rl==0) ? 0 : arow.searchIndexesFirstGTE(rl);
rlix = (rlix>=0) ? rlix : alen;
for(int i = rlix; i < alen && aix[i]<ru; i++)
{
double val = avals[i];
if( val != 0 )
for(int j = i, ix2 = aix[i]*m; j < alen; j++)
c[ix2+aix[j]] += val * avals[j];
}
}
}
}
}
}
/**
*
* @param pm1
* @param m2
* @param ret1
* @param ret2
* @param rl
* @param ru
* @throws DMLRuntimeException
*/
private static void matrixMultPermuteDense(MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru)
throws DMLRuntimeException
{
double[] a = pm1.denseBlock;
double[] b = m2.denseBlock;
double[] c = ret1.denseBlock;
final int n = m2.clen;
final int brlen = ret1.getNumRows();
int lastblk = -1;
for( int i=rl, bix=rl*n; i<ru; i++, bix+=n )
{
//compute block index and in-block indexes
int pos = UtilFunctions.toInt( a[ i ]); //safe cast
if( pos > 0 ) //selected row
{
int bpos = (pos-1) % brlen;
int blk = (pos-1) / brlen;
//allocate and switch to second output block
//(never happens in cp, correct for multi-threaded usage)
if( lastblk!=-1 && lastblk<blk ){
ret2.sparse = false;
ret2.allocateDenseBlock();
c = ret2.denseBlock;
}
//memcopy entire dense row into target position
System.arraycopy(b, bix, c, bpos*n, n);
lastblk = blk;
}
}
}
/**
*
* @param pm1
* @param m2
* @param ret1
* @param ret2
* @param rl
* @param ru
*/
private static void matrixMultPermuteDenseSparse( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru)
{
double[] a = pm1.denseBlock;
double[] b = m2.denseBlock;
SparseRow[] c = ret1.sparseRows;
final int n = m2.clen;
final int brlen = ret1.getNumRows();
int lastblk = -1;
for( int i=rl, bix=rl*n; i<ru; i++, bix+=n )
{
//compute block index and in-block indexes
int pos = UtilFunctions.toInt( a[ i ]); //safe cast
if( pos > 0 ) //selected row
{
int bpos = (pos-1) % brlen;
int blk = (pos-1) / brlen;
//allocate and switch to second output block
//(never happens in cp, correct for multi-threaded usage)
if( lastblk!=-1 && lastblk<blk ){
ret2.sparse = true;
ret2.rlen=ret1.rlen;
ret2.allocateSparseRowsBlock();
c = ret2.sparseRows;
}
//append entire dense row into sparse target position
c[bpos] = new SparseRow( n );
for( int j=0; j<n; j++ )
c[bpos].append(j, b[bix+j]);
lastblk = blk;
}
}
}
/**
*
* @param pm1
* @param m2
* @param ret1
* @param ret2
* @param rl
* @param ru
*/
private static void matrixMultPermuteSparse( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru)
{
double[] a = pm1.denseBlock;
SparseRow[] b = m2.sparseRows;
SparseRow[] c = ret1.sparseRows;
final int brlen = ret1.getNumRows();
int lastblk = -1;
for( int i=rl; i<ru; i++ )
{
//compute block index and in-block indexes
int pos = UtilFunctions.toInt( a[ i ]); //safe cast
if( pos > 0 ) //selected row
{
int bpos = (pos-1) % brlen;
int blk = (pos-1) / brlen;
//allocate and switch to second output block
//(never happens in cp, correct for multi-threaded usage)
if( lastblk!=-1 && lastblk<blk ){
ret2.sparse = true;
ret2.allocateSparseRowsBlock();
c = ret2.sparseRows;
}
//memcopy entire sparse row into target position
if( b[i] != null )
c[bpos] = new SparseRow( b[i] );
lastblk = blk;
}
}
}
/**
*
* @param mX
* @param mU
* @param mV
* @param mW
* @param ret
* @param wt
* @param rl
* @param ru
*/
private static void matrixMultWSLossDense(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt, int rl, int ru)
{
double[] x = mX.denseBlock;
double[] u = mU.denseBlock;
double[] v = mV.denseBlock;
double[] w = (mW!=null)? mW.denseBlock : null;
final int n = mX.clen;
final int cd = mU.clen;
double wsloss = 0;
// approach: iterate over all cells of X
//cache-conscious blocking: due to blocksize constraint (default 1000),
//a blocksize of 16 allows fitting blocks of U and V into the L2 cache (256KB)
final int blocksizeIJ = 16; //u/v block (max at typical L2 size)
//blocked execution
for( int bi = rl; bi < ru; bi+=blocksizeIJ )
for( int bj = 0, bimin = Math.min(ru, bi+blocksizeIJ); bj < n; bj+=blocksizeIJ )
{
int bjmin = Math.min(n, bj+blocksizeIJ);
// Pattern 1) sum (W * (X - U %*% t(V)) ^ 2) (post weighting)
if( wt==WeightsType.POST )
{
for( int i=bi, ix=bi*n, uix=bi*cd; i<bimin; i++, ix+=n, uix+=cd )
for( int j=bj, vix=bj*cd; j<bjmin; j++, vix+=cd) {
double wij = w[ix+j];
if( wij != 0 ) {
double uvij = dotProduct(u, v, uix, vix, cd);
wsloss += wij*(x[ix+j]-uvij)*(x[ix+j]-uvij); //^2
}
}
}
// Pattern 1b) sum ((X!=0) * (X - U %*% t(V)) ^ 2) (post_nz weighting)
else if( wt==WeightsType.POST_NZ )
{
for( int i=bi, ix=bi*n, uix=bi*cd; i<bimin; i++, ix+=n, uix+=cd )
for( int j=bj, vix=bj*cd; j<bjmin; j++, vix+=cd) {
double xij = x[ix+j];
if( xij != 0 ) {
double uvij = dotProduct(u, v, uix, vix, cd);
wsloss += (xij-uvij)*(xij-uvij); //^2
}
}
}
// Pattern 2) sum ((X - W * (U %*% t(V))) ^ 2) (pre weighting)
else if( wt==WeightsType.PRE )
{
for( int i=bi, ix=bi*n, uix=bi*cd; i<bimin; i++, ix+=n, uix+=cd )
for( int j=bj, vix=bj*cd; j<bjmin; j++, vix+=cd) {
double wij = w[ix+j];
double uvij = 0;
if( wij != 0 )
uvij = dotProduct(u, v, uix, vix, cd);
wsloss += (x[ix+j]-wij*uvij)*(x[ix+j]-wij*uvij); //^2
}
}
// Pattern 3) sum ((X - (U %*% t(V))) ^ 2) (no weighting)
else if( wt==WeightsType.NONE )
{
for( int i=bi, ix=bi*n, uix=bi*cd; i<bimin; i++, ix+=n, uix+=cd )
for( int j=bj, vix=bj*cd; j<bjmin; j++, vix+=cd) {
double uvij = dotProduct(u, v, uix, vix, cd);
wsloss += (x[ix+j]-uvij)*(x[ix+j]-uvij); //^2
}
}
}
ret.quickSetValue(0, 0, wsloss);
}
/**
* Computes the weighted squared loss as in the dense case, for sparse X
* (and sparse W, if present) with dense factors U and V.
*
* @param mX sparse data matrix X
* @param mU dense left factor U
* @param mV dense right factor V
* @param mW sparse weight matrix W, null if the weights type requires none
* @param ret 1x1 output block for the partial aggregate
* @param wt weights type (POST, POST_NZ, PRE, NONE)
* @param rl row lower index, inclusive
* @param ru row upper index, exclusive
*/
private static void matrixMultWSLossSparseDense(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt, int rl, int ru)
{
SparseRow[] x = mX.sparseRows;
SparseRow[] w = (mW!=null)? mW.sparseRows : null;
double[] u = mU.denseBlock;
double[] v = mV.denseBlock;
final int n = mX.clen;
final int cd = mU.clen;
double wsloss = 0;
// Pattern 1) sum (W * (X - U %*% t(V)) ^ 2) (post weighting)
if( wt==WeightsType.POST )
{
// approach: iterate over W, point-wise in order to exploit sparsity
for( int i=rl, uix=rl*cd; i<ru; i++, uix+=cd )
if( w[i] != null && !w[i].isEmpty() ) {
int wlen = w[i].size();
int[] wix = w[i].getIndexContainer();
double[] wval = w[i].getValueContainer();
for( int k=0; k<wlen; k++ ) {
double xi = mX.quickGetValue(i, wix[k]);
double uvij = dotProduct(u, v, uix, wix[k]*cd, cd);
wsloss += wval[k]*(xi-uvij)*(xi-uvij);
}
}
}
// Pattern 1b) sum ((X!=0) * (X - U %*% t(V)) ^ 2) (post_nz weighting)
else if( wt==WeightsType.POST_NZ )
{
// approach: iterate over X, point-wise in order to exploit sparsity
for( int i=rl, uix=rl*cd; i<ru; i++, uix+=cd )
if( x[i] != null && !x[i].isEmpty() ) {
int xlen = x[i].size();
int[] xix = x[i].getIndexContainer();
double[] xval = x[i].getValueContainer();
for( int k=0; k<xlen; k++ ) {
double uvij = dotProduct(u, v, uix, xix[k]*cd, cd);
wsloss += (xval[k]-uvij)*(xval[k]-uvij);
}
}
}
// Pattern 2) sum ((X - W * (U %*% t(V))) ^ 2) (pre weighting)
else if( wt==WeightsType.PRE )
{
// approach: iterate over all cells of X and W (any sparse/dense combination)
// (note: tuning similar to pattern 3 possible but more complex)
for( int i=rl, uix=rl*cd; i<ru; i++, uix+=cd )
for( int j=0, vix=0; j<n; j++, vix+=cd)
{
double xij = mX.quickGetValue(i, j);
double wij = mW.quickGetValue(i, j);
double uvij = 0;
if( wij != 0 )
uvij = dotProduct(u, v, uix, vix, cd);
wsloss += (xij-wij*uvij)*(xij-wij*uvij);
}
}
// Pattern 3) sum ((X - (U %*% t(V))) ^ 2) (no weighting)
else if( wt==WeightsType.NONE )
{
// approach: iterate over all cells of X and compute the squared deltas to U %*% t(V)
for( int i=rl, uix=rl*cd; i<ru; i++, uix+=cd )
{
if( x[i]==null || x[i].isEmpty() ) { //empty row
for( int j=0, vix=0; j<n; j++, vix+=cd) {
double uvij = dotProduct(u, v, uix, vix, cd);
wsloss += (-uvij)*(-uvij);
}
}
else { //non-empty row
int xlen = x[i].size();
int[] xix = x[i].getIndexContainer();
double[] xval = x[i].getValueContainer();
int last = -1;
for( int k=0; k<xlen; k++ ) {
//process last nnz til current nnz
for( int k2=last+1; k2<xix[k]; k2++ ){
double uvij = dotProduct(u, v, uix, k2*cd, cd);
wsloss += (-uvij)*(-uvij);
}
//process current nnz
double uvij = dotProduct(u, v, uix, xix[k]*cd, cd);
wsloss += (xval[k]-uvij)*(xval[k]-uvij);
last = xix[k];
}
//process last nnz til end of row
for( int k2=last+1; k2<n; k2++ ) {
double uvij = dotProduct(u, v, uix, k2*cd, cd);
wsloss += (-uvij)*(-uvij);
}
}
}
}
ret.quickSetValue(0, 0, wsloss);
}
/**
* Generic weighted-squared-loss fallback for arbitrary combinations of
* sparse and dense inputs, accessing cells via quickGetValue.
*
* @param mX data matrix X
* @param mU left factor U
* @param mV right factor V
* @param mW weight matrix W, null if the weights type requires none
* @param ret 1x1 output block for the partial aggregate
* @param wt weights type (POST, POST_NZ, PRE, NONE)
* @param rl row lower index, inclusive
* @param ru row upper index, exclusive
*/
private static void matrixMultWSLossGeneric (MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, WeightsType wt, int rl, int ru)
{
final int n = mX.clen;
final int cd = mU.clen;
double wsloss = 0;
// Pattern 1) sum (W * (X - U %*% t(V)) ^ 2) (post weighting)
if( wt==WeightsType.POST )
{
// approach: iterate over W, point-wise in order to exploit sparsity
if( mW.sparse ) //SPARSE
{
SparseRow[] wrows = mW.sparseRows;
for( int i=rl; i<ru; i++ )
if( wrows[i] != null && !wrows[i].isEmpty() ){
int wlen = wrows[i].size();
int[] wix = wrows[i].getIndexContainer();
double[] wval = wrows[i].getValueContainer();
for( int k=0; k<wlen; k++ ) {
double uvij = dotProductGeneric(mU, mV, i, wix[k], cd);
double xi = mX.quickGetValue(i, wix[k]);
wsloss += wval[k]*(xi-uvij)*(xi-uvij);
}
}
}
else //DENSE
{
double[] w = mW.denseBlock;
for( int i=rl, wix=rl*n; i<ru; i++, wix+=n )
for( int j=0; j<n; j++)
if( w[wix+j] != 0 ) {
double uvij = dotProductGeneric(mU, mV, i, j, cd);
double xij = mX.quickGetValue(i, j);
wsloss += w[wix+j]*(xij-uvij)*(xij-uvij);
}
}
}
// Pattern 1b) sum ((X!=0) * (X - U %*% t(V)) ^ 2) (post_nz weighting)
else if( wt==WeightsType.POST_NZ )
{
// approach: iterate over X, point-wise in order to exploit sparsity
if( mX.sparse ) //SPARSE
{
SparseRow[] xrows = mX.sparseRows;
for( int i=rl; i<ru; i++ )
if( xrows[i] != null && !xrows[i].isEmpty() ){
int xlen = xrows[i].size();
int[] xix = xrows[i].getIndexContainer();
double[] xval = xrows[i].getValueContainer();
for( int k=0; k<xlen; k++ ) {
double uvij = dotProductGeneric(mU, mV, i, xix[k], cd);
wsloss += (xval[k]-uvij)*(xval[k]-uvij);
}
}
}
else //DENSE
{
double[] x = mX.denseBlock;
for( int i=rl, xix=rl*n; i<ru; i++, xix+=n )
for( int j=0; j<n; j++)
if( x[xix+j] != 0 ) {
double uvij = dotProductGeneric(mU, mV, i, j, cd);
wsloss += (x[xix+j]-uvij)*(x[xix+j]-uvij);
}
}
}
// Pattern 2) sum ((X - W * (U %*% t(V))) ^ 2) (pre weighting)
else if( wt==WeightsType.PRE )
{
// approach: iterate over all cells of X and W (any sparse/dense combination)
for( int i=rl; i<ru; i++ )
for( int j=0; j<n; j++)
{
double xij = mX.quickGetValue(i, j);
double wij = mW.quickGetValue(i, j);
double uvij = 0;
if( wij != 0 )
uvij = dotProductGeneric(mU, mV, i, j, cd);
wsloss += (xij-wij*uvij)*(xij-wij*uvij);
}
}
// Pattern 3) sum ((X - (U %*% t(V))) ^ 2) (no weighting)
else if( wt==WeightsType.NONE )
{
// approach: iterate over all cells of X and compute the squared deltas to U %*% t(V)
for( int i=rl; i<ru; i++ )
for( int j=0; j<n; j++)
{
double xij = mX.quickGetValue(i, j);
double uvij = dotProductGeneric(mU, mV, i, j, cd);
wsloss += (xij-uvij)*(xij-uvij);
}
}
ret.quickSetValue(0, 0, wsloss);
}
/**
* Computes the weighted sigmoid W * sigmoid(U %*% t(V)), with optional
* minus/log variants, over the row range [rl,ru) for dense inputs.
*
* @param mW weight/data matrix W
* @param mU left factor U
* @param mV right factor V
* @param ret output matrix
* @param wt wsigmoid type (BASIC, MINUS, LOG, LOG_MINUS)
* @param rl row lower index, inclusive
* @param ru row upper index, exclusive
* @throws DMLRuntimeException
*/
private static void matrixMultWSigmoidDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru)
throws DMLRuntimeException
{
double[] w = mW.denseBlock;
double[] c = ret.denseBlock;
double[] u = mU.denseBlock;
double[] v = mV.denseBlock;
final int n = mW.clen;
final int cd = mU.clen;
//note: cannot compute U %*% t(V) in-place of result w/ regular mm because
//t(V) comes in transformed form and hence would require additional memory
boolean flagminus = (wt==WSigmoidType.MINUS || wt==WSigmoidType.LOG_MINUS);
boolean flaglog = (wt==WSigmoidType.LOG || wt==WSigmoidType.LOG_MINUS);
//approach: iterate over non-zeros of w, selective mm computation
//cache-conscious blocking: due to the blocksize constraint (default 1000),
//a blocksize of 16 allows fitting blocks of U and V into the L2 cache (256KB)
final int blocksizeIJ = 16; //u/v block (max at typical L2 size)
//blocked execution
for( int bi = rl; bi < ru; bi+=blocksizeIJ )
for( int bj = 0, bimin = Math.min(ru, bi+blocksizeIJ); bj < n; bj+=blocksizeIJ )
{
int bjmin = Math.min(n, bj+blocksizeIJ);
//core wsigmoid computation
for( int i=bi, ix=bi*n, uix=bi*cd; i<bimin; i++, ix+=n, uix+=cd )
for( int j=bj, vix=bj*cd; j<bjmin; j++, vix+=cd) {
double wij = w[ix+j];
if( wij != 0 )
c[ix+j] = wsigmoid(wij, u, v, uix, vix, flagminus, flaglog, cd);
}
}
}
/**
* Computes the weighted sigmoid for sparse W and dense U/V; the output
* is sparse, in the same representation as W.
*
* @param mW sparse weight/data matrix W
* @param mU dense left factor U
* @param mV dense right factor V
* @param ret sparse output matrix
* @param wt wsigmoid type (BASIC, MINUS, LOG, LOG_MINUS)
* @param rl row lower index, inclusive
* @param ru row upper index, exclusive
* @throws DMLRuntimeException
*/
private static void matrixMultWSigmoidSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru)
throws DMLRuntimeException
{
SparseRow[] w = mW.sparseRows;
SparseRow[] c = ret.sparseRows;
double[] u = mU.denseBlock;
double[] v = mV.denseBlock;
final int n = mW.clen;
final int cd = mU.clen;
boolean flagminus = (wt==WSigmoidType.MINUS || wt==WSigmoidType.LOG_MINUS);
boolean flaglog = (wt==WSigmoidType.LOG || wt==WSigmoidType.LOG_MINUS);
//approach: iterate over non-zeros of w, selective mm computation
for( int i=rl, uix=rl*cd; i<ru; i++, uix+=cd )
if( w[i] != null && !w[i].isEmpty() ) {
int wlen = w[i].size();
int[] wix = w[i].getIndexContainer();
double[] wval = w[i].getValueContainer();
c[i] = new SparseRow(wlen, n);
for( int k=0; k<wlen; k++ ) {
double cval = wsigmoid(wval[k], u, v, uix, wix[k]*cd, flagminus, flaglog, cd);
c[i].append(wix[k], cval);
}
}
}
/**
* Generic weighted-sigmoid fallback for arbitrary sparse/dense inputs;
* W and the output are always in the same representation.
*
* @param mW weight/data matrix W
* @param mU left factor U
* @param mV right factor V
* @param ret output matrix
* @param wt wsigmoid type (BASIC, MINUS, LOG, LOG_MINUS)
* @param rl row lower index, inclusive
* @param ru row upper index, exclusive
* @throws DMLRuntimeException
*/
private static void matrixMultWSigmoidGeneric (MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru)
throws DMLRuntimeException
{
final int n = mW.clen;
final int cd = mU.clen;
boolean flagminus = (wt==WSigmoidType.MINUS || wt==WSigmoidType.LOG_MINUS);
boolean flaglog = (wt==WSigmoidType.LOG || wt==WSigmoidType.LOG_MINUS);
//approach: iterate over non-zeros of w, selective mm computation
if( mW.sparse ) //SPARSE
{
//w and c always in same representation
SparseRow[] w = mW.sparseRows;
SparseRow[] c = ret.sparseRows;
for( int i=rl; i<ru; i++ )
if( w[i] != null && !w[i].isEmpty() ) {
int wlen = w[i].size();
int[] wix = w[i].getIndexContainer();
double[] wval = w[i].getValueContainer();
c[i] = new SparseRow(wlen, n);
for( int k=0; k<wlen; k++ ) {
double cval = wsigmoid(wval[k], mU, mV, i, wix[k], flagminus, flaglog, cd);
c[i].append(wix[k], cval);
}
}
}
else //DENSE
{
//w and c always in same representation
double[] w = mW.denseBlock;
double[] c = ret.denseBlock;
for( int i=rl, ix=rl*n; i<ru; i++ )
for( int j=0; j<n; j++, ix++) {
double wij = w[ix];
if( wij != 0 ) {
c[ix] = wsigmoid(wij, mU, mV, i, j, flagminus, flaglog, cd);
}
}
}
}
/**
* Computes the weighted divide matrix multiply (wdivmm) over the row
* range [rl,ru) and column range [cl,cu) of W for dense inputs.
*
* @param mW dense weight matrix W
* @param mU dense left factor U
* @param mV dense right factor V
* @param ret dense output matrix
* @param wt wdivmm type (basic, left/right, mult/minus variants)
* @param rl row lower index, inclusive
* @param ru row upper index, exclusive
* @param cl column lower index, inclusive
* @param cu column upper index, exclusive
* @throws DMLRuntimeException
*/
private static void matrixMultWDivMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu)
throws DMLRuntimeException
{
final boolean basic = wt.isBasic();
final boolean left = wt.isLeft();
final boolean mult = wt.isMult();
final boolean minus = wt.isMinus();
final int n = mW.clen;
final int cd = mU.clen;
double[] w = mW.denseBlock;
double[] u = mU.denseBlock;
double[] v = mV.denseBlock;
double[] c = ret.denseBlock;
//approach: iterate over non-zeros of w, selective mm computation
//cache-conscious blocking: due to the blocksize constraint (default 1000),
//a blocksize of 16 allows fitting blocks of U and V into the L2 cache (256KB)
final int blocksizeIJ = 16; //u/v block (max at typical L2 size)
//blocked execution
for( int bi = rl; bi < ru; bi+=blocksizeIJ )
for( int bj = cl, bimin = Math.min(ru, bi+blocksizeIJ); bj < cu; bj+=blocksizeIJ )
{
int bjmin = Math.min(cu, bj+blocksizeIJ);
//core wdivmm computation
for( int i=bi, ix=bi*n, uix=bi*cd; i<bimin; i++, ix+=n, uix+=cd )
for( int j=bj, vix=bj*cd; j<bjmin; j++, vix+=cd)
if( w[ix+j] != 0 ) {
if( basic )
c[ix+j] = w[ix+j] * dotProduct(u, v, uix, vix, cd);
else //left/right minus/default
wdivmm(w[ix+j], u, v, c, uix, vix, left, mult, minus, cd);
}
}
}
/**
* Computes the weighted divide matrix multiply (wdivmm) for sparse W
* and dense U/V over the row range [rl,ru) and column range [cl,cu).
*
* @param mW sparse weight matrix W
* @param mU dense left factor U
* @param mV dense right factor V
* @param ret dense output matrix
* @param wt wdivmm type (basic, left/right, mult/minus variants)
* @param rl row lower index, inclusive
* @param ru row upper index, exclusive
* @param cl column lower index, inclusive
* @param cu column upper index, exclusive
* @throws DMLRuntimeException
*/
private static void matrixMultWDivMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu)
throws DMLRuntimeException
{
final boolean basic = wt.isBasic();
final boolean left = wt.isLeft();
final boolean mult = wt.isMult();
final boolean minus = wt.isMinus();
final int cd = mU.clen;
SparseRow[] w = mW.sparseRows;
double[] u = mU.denseBlock;
double[] v = mV.denseBlock;
double[] c = ret.denseBlock;
//approach: iterate over non-zeros of w, selective mm computation
for( int i=rl, uix=rl*cd; i<ru; i++, uix+=cd ) {
if( w[i] != null && !w[i].isEmpty() ) {
int wlen = w[i].size();
int[] wix = w[i].getIndexContainer();
double[] wval = w[i].getValueContainer();
if( basic ) {
for( int k=0; k<wlen; k++ )
ret.appendValue( i, wix[k], wval[k] * dotProduct(u, v, uix, wix[k]*cd, cd));
}
else { //left/right minus default
int k = (cl==0) ? 0 : w[i].searchIndexesFirstGTE(cl);
k = (k>=0) ? k : wlen;
for( ; k<wlen && wix[k]<cu; k++ )
wdivmm(wval[k], u, v, c, uix, wix[k]*cd, left, mult, minus, cd);
}
}
}
}
/**
* Generic wdivmm fallback for arbitrary sparse/dense inputs; the output
* is always in dense representation.
*
* @param mW weight matrix W
* @param mU left factor U
* @param mV right factor V
* @param ret dense output matrix
* @param wt wdivmm type (basic, left/right, mult/minus variants)
* @param rl row lower index, inclusive
* @param ru row upper index, exclusive
* @param cl column lower index, inclusive
* @param cu column upper index, exclusive
* @throws DMLRuntimeException
*/
private static void matrixMultWDivMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu)
throws DMLRuntimeException
{
final boolean basic = wt.isBasic();
final boolean left = wt.isLeft();
final boolean mult = wt.isMult();
final boolean minus = wt.isMinus();
final int n = mW.clen;
final int cd = mU.clen;
//output always in dense representation
double[] c = ret.denseBlock;
//approach: iterate over non-zeros of w, selective mm computation
if( mW.sparse ) //SPARSE
{
SparseRow[] w = mW.sparseRows;
for( int i=rl; i<ru; i++ ) {
if( w[i] != null && !w[i].isEmpty() ) {
int wlen = w[i].size();
int[] wix = w[i].getIndexContainer();
double[] wval = w[i].getValueContainer();
int k = (cl==0) ? 0 : w[i].searchIndexesFirstGTE(cl);
k = (k>=0) ? k : wlen;
for( ; k<wlen && wix[k]<cu; k++ ) {
if( basic ) {
double uvij = dotProductGeneric(mU,mV, i, wix[k], cd);
ret.appendValue(i, wix[k], wval[k] * uvij); //basic: W * (U %*% t(V))
}
else { //left/right minus/default
wdivmm(wval[k], mU, mV, c, i, wix[k], left, mult, minus, cd);
}
}
}
}
}
else //DENSE
{
double[] w = mW.denseBlock;
for( int i=rl, ix=rl*n; i<ru; i++, ix+=n )
for( int j=cl; j<cu; j++)
if( w[ix+j] != 0 ) {
if( basic ) {
c[ix+j] = w[ix+j] * dotProductGeneric(mU,mV, i, j, cd); //basic: W * (U %*% t(V))
}
else { //left/right minus/default
wdivmm(w[ix+j], mU, mV, c, i, j, left, mult, minus, cd);
}
}
}
}
/**
* Computes the weighted cross-entropy wcemm = sum(W * log(U %*% t(V)))
* over the row range [rl,ru) for dense inputs.
*
* @param mW dense weight matrix W
* @param mU dense left factor U
* @param mV dense right factor V
* @param ret 1x1 output block for the partial aggregate
* @param wt wcemm type
* @param rl row lower index, inclusive
* @param ru row upper index, exclusive
*/
private static void matrixMultWCeMMDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WCeMMType wt, int rl, int ru)
{
double[] w = mW.denseBlock;
double[] u = mU.denseBlock;
double[] v = mV.denseBlock;
final int n = mW.clen;
final int cd = mU.clen;
double wceval = 0;
// approach: iterate over all cells of W
//cache-conscious blocking: due to the blocksize constraint (default 1000),
//a blocksize of 16 allows fitting blocks of U and V into the L2 cache (256KB)
final int blocksizeIJ = 16; //u/v block (max at typical L2 size)
//blocked execution
for( int bi = rl; bi < ru; bi+=blocksizeIJ )
for( int bj = 0, bimin = Math.min(ru, bi+blocksizeIJ); bj < n; bj+=blocksizeIJ )
{
int bjmin = Math.min(n, bj+blocksizeIJ);
for( int i=bi, ix=bi*n, uix=bi*cd; i<bimin; i++, ix+=n, uix+=cd )
for( int j=bj, vix=bj*cd; j<bjmin; j++, vix+=cd) {
double wij = w[ix+j];
if( wij != 0 ) {
double uvij = dotProduct(u, v, uix, vix, cd);
wceval += wij * FastMath.log(uvij);
}
}
}
ret.quickSetValue(0, 0, wceval);
}
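/* A note on the aggregated quantity: all wcemm variants compute the
* weighted cross-entropy wcemm = sum(W * log(U %*% t(V))), restricted
* to the non-zero cells of W (zero weights contribute nothing).
*/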
/**
* Computes the weighted cross-entropy for sparse W with dense factors
* U and V, iterating over the non-zeros of W.
*
* @param mW sparse weight matrix W
* @param mU dense left factor U
* @param mV dense right factor V
* @param ret 1x1 output block for the partial aggregate
* @param wt wcemm type
* @param rl row lower index, inclusive
* @param ru row upper index, exclusive
*/
private static void matrixMultWCeMMSparseDense(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WCeMMType wt, int rl, int ru)
{
SparseRow[] w = mW.sparseRows;
double[] u = mU.denseBlock;
double[] v = mV.denseBlock;
final int cd = mU.clen;
double wceval = 0;
// approach: iterate over non-zeros of W in order to exploit sparsity
for( int i=rl, uix=rl*cd; i<ru; i++, uix+=cd ) {
if( w[i]!=null && !w[i].isEmpty() ) {
int wlen = w[i].size();
int[] wix = w[i].getIndexContainer();
double[] wval = w[i].getValueContainer();
for( int k=0; k<wlen; k++ ) {
double uvij = dotProduct(u, v, uix, wix[k]*cd, cd);
wceval += wval[k] * FastMath.log(uvij);
}
}
}
ret.quickSetValue(0, 0, wceval);
}
/**
* Generic wcemm fallback for arbitrary sparse/dense inputs, accessing
* factor cells via quickGetValue.
*
* @param mW weight matrix W
* @param mU left factor U
* @param mV right factor V
* @param ret 1x1 output block for the partial aggregate
* @param wt wcemm type
* @param rl row lower index, inclusive
* @param ru row upper index, exclusive
*/
private static void matrixMultWCeMMGeneric(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WCeMMType wt, int rl, int ru)
{
final int n = mW.clen;
final int cd = mU.clen;
double wceval = 0;
//approach: iterate over non-zeros of w, selective mm computation
if( mW.sparse ) //SPARSE
{
SparseRow[] w = mW.sparseRows;
for( int i=rl; i<ru; i++ )
if( w[i] != null && !w[i].isEmpty() ) {
int wlen = w[i].size();
int[] wix = w[i].getIndexContainer();
double[] wval = w[i].getValueContainer();
for( int k=0; k<wlen; k++ ) {
double uvij = dotProductGeneric(mU, mV, i, wix[k], cd);
wceval += wval[k] * FastMath.log(uvij);
}
}
}
else //DENSE
{
double[] w = mW.denseBlock;
for( int i=rl, ix=rl*n; i<ru; i++ )
for( int j=0; j<n; j++, ix++) {
double wij = w[ix];
if( wij != 0 ) {
double uvij = dotProductGeneric(mU, mV, i, j, cd);
wceval += wij * FastMath.log(uvij);
}
}
}
ret.quickSetValue(0, 0, wceval);
}
////////////////////////////////////////////
// performance-relevant utility functions //
////////////////////////////////////////////
/**
* Computes the dot-product of two vectors. Experiments (on long vectors of
* 10^7 values) showed that this generic function provides equivalent performance
* even for the specific case of dotProduct(a,a,len) as used for TSMM.
*
* @param a first dense vector
* @param b second dense vector
* @param len number of values to process
* @return dot product of a and b
*/
private static double dotProduct( double[] a, double[] b, final int len )
{
double val = 0;
final int bn = len%8;
//compute rest
for( int i = 0; i < bn; i++ )
val += a[ i ] * b[ i ];
//unrolled 8-block (for better instruction-level parallelism)
for( int i = bn; i < len; i+=8 )
{
//read 64B cachelines of a and b
//compute cval' = sum(a * b) + cval
val += a[ i+0 ] * b[ i+0 ]
+ a[ i+1 ] * b[ i+1 ]
+ a[ i+2 ] * b[ i+2 ]
+ a[ i+3 ] * b[ i+3 ]
+ a[ i+4 ] * b[ i+4 ]
+ a[ i+5 ] * b[ i+5 ]
+ a[ i+6 ] * b[ i+6 ]
+ a[ i+7 ] * b[ i+7 ];
}
//scalar result
return val;
}
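/* For reference, the unrolled kernel above is equivalent (up to
* floating-point summation order) to the following scalar loop; the
* 8-way unrolling merely exposes independent multiply-adds for better
* instruction-level parallelism:
*
*   double val = 0;
*   for( int i=0; i<len; i++ )
*       val += a[i] * b[i];
*   return val;
*/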
/**
* Computes the dot-product of two dense vectors starting at offsets
* ai and bi, respectively.
*
* @param a first dense vector
* @param b second dense vector
* @param ai starting offset in a
* @param bi starting offset in b
* @param len number of values to process
* @return dot product
*/
private static double dotProduct( double[] a, double[] b, int ai, int bi, final int len )
{
double val = 0;
final int bn = len%8;
//compute rest
for( int i = 0; i < bn; i++, ai++, bi++ )
val += a[ ai ] * b[ bi ];
//unrolled 8-block (for better instruction-level parallelism)
for( int i = bn; i < len; i+=8, ai+=8, bi+=8 )
{
//read 64B cachelines of a and b
//compute cval' = sum(a * b) + cval
val += a[ ai+0 ] * b[ bi+0 ]
+ a[ ai+1 ] * b[ bi+1 ]
+ a[ ai+2 ] * b[ bi+2 ]
+ a[ ai+3 ] * b[ bi+3 ]
+ a[ ai+4 ] * b[ bi+4 ]
+ a[ ai+5 ] * b[ bi+5 ]
+ a[ ai+6 ] * b[ bi+6 ]
+ a[ ai+7 ] * b[ bi+7 ];
}
//scalar result
return val;
}
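/**
* Computes the dot-product of a sparse vector (values a, indexes aix)
* with a dense vector b, where bi is the starting offset of the dense
* row in b.
*/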
private static double dotProduct( double[] a, double[] b, int[] aix, final int bi, final int len )
{
double val = 0;
final int bn = len%8;
//compute rest
for( int i = 0; i < bn; i++ )
val += a[ i ] * b[ bi+aix[i] ];
//unrolled 8-block (for better instruction-level parallelism)
for( int i = bn; i < len; i+=8 )
{
//read 64B cacheline of a
//read 64B of b via 'gather'
//compute cval' = sum(a * b) + cval
val += a[ i+0 ] * b[ bi+aix[i+0] ]
+ a[ i+1 ] * b[ bi+aix[i+1] ]
+ a[ i+2 ] * b[ bi+aix[i+2] ]
+ a[ i+3 ] * b[ bi+aix[i+3] ]
+ a[ i+4 ] * b[ bi+aix[i+4] ]
+ a[ i+5 ] * b[ bi+aix[i+5] ]
+ a[ i+6 ] * b[ bi+aix[i+6] ]
+ a[ i+7 ] * b[ bi+aix[i+7] ];
}
//scalar result
return val;
}
/**
* Computes c += aval * b (a daxpy-style scaled vector addition) over
* len values, starting at offsets bi and ci.
*
* @param aval scalar multiplier
* @param b input vector
* @param c output vector, updated in place
* @param bi starting offset in b
* @param ci starting offset in c
* @param len number of values to process
*/
private static void vectMultiplyAdd( final double aval, double[] b, double[] c, int bi, int ci, final int len )
{
final int bn = len%8;
//rest, not aligned to 8-blocks
for( int j = 0; j < bn; j++, bi++, ci++)
c[ ci ] += aval * b[ bi ];
//unrolled 8-block (for better instruction-level parallelism)
for( int j = bn; j < len; j+=8, bi+=8, ci+=8)
{
//read 64B cachelines of b and c
//compute c' = aval * b + c
//write back 64B cacheline of c = c'
c[ ci+0 ] += aval * b[ bi+0 ];
c[ ci+1 ] += aval * b[ bi+1 ];
c[ ci+2 ] += aval * b[ bi+2 ];
c[ ci+3 ] += aval * b[ bi+3 ];
c[ ci+4 ] += aval * b[ bi+4 ];
c[ ci+5 ] += aval * b[ bi+5 ];
c[ ci+6 ] += aval * b[ bi+6 ];
c[ ci+7 ] += aval * b[ bi+7 ];
}
}
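/* Ignoring the unrolling, vectMultiplyAdd is a daxpy-style kernel:
*
*   for( int j=0; j<len; j++ )
*       c[ci+j] += aval * b[bi+j];
*
* This is the inner workhorse of the row-major matrix multiplies above:
* one output row is accumulated as a sum of scaled rows of the rhs.
*/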
/**
* Computes c += aval1 * b1 + aval2 * b2 for two rows of b at once,
* reusing the loaded cachelines of c.
*
* @param aval1 first scalar multiplier
* @param aval2 second scalar multiplier
* @param b input vector
* @param c output vector, updated in place
* @param bi1 starting offset of the first row in b
* @param bi2 starting offset of the second row in b
* @param ci starting offset in c
* @param len number of values to process
*/
private static void vectMultiplyAdd2( final double aval1, final double aval2, double[] b, double[] c, int bi1, int bi2, int ci, final int len )
{
final int bn = len%8;
//rest, not aligned to 8-blocks
for( int j = 0; j < bn; j++, bi1++, bi2++, ci++ )
c[ ci ] += aval1 * b[ bi1 ] + aval2 * b[ bi2 ];
//unrolled 8-block (for better instruction-level parallelism)
for( int j = bn; j < len; j+=8, bi1+=8, bi2+=8, ci+=8 )
{
//read 64B cachelines of b (2x) and c
//compute c' = aval_1 * b_1 + aval_2 * b_2 + c
//write back 64B cacheline of c = c'
c[ ci+0 ] += aval1 * b[ bi1+0 ] + aval2 * b[ bi2+0 ];
c[ ci+1 ] += aval1 * b[ bi1+1 ] + aval2 * b[ bi2+1 ];
c[ ci+2 ] += aval1 * b[ bi1+2 ] + aval2 * b[ bi2+2 ];
c[ ci+3 ] += aval1 * b[ bi1+3 ] + aval2 * b[ bi2+3 ];
c[ ci+4 ] += aval1 * b[ bi1+4 ] + aval2 * b[ bi2+4 ];
c[ ci+5 ] += aval1 * b[ bi1+5 ] + aval2 * b[ bi2+5 ];
c[ ci+6 ] += aval1 * b[ bi1+6 ] + aval2 * b[ bi2+6 ];
c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ];
}
}
/**
* Computes c += aval1 * b1 + aval2 * b2 + aval3 * b3 for three rows of
* b at once, reusing the loaded cachelines of c.
*
* @param aval1 first scalar multiplier
* @param aval2 second scalar multiplier
* @param aval3 third scalar multiplier
* @param b input vector
* @param c output vector, updated in place
* @param bi1 starting offset of the first row in b
* @param bi2 starting offset of the second row in b
* @param bi3 starting offset of the third row in b
* @param ci starting offset in c
* @param len number of values to process
*/
private static void vectMultiplyAdd3( final double aval1, final double aval2, final double aval3, double[] b, double[] c, int bi1, int bi2, int bi3, int ci, final int len )
{
final int bn = len%8;
//rest, not aligned to 8-blocks
for( int j = 0; j < bn; j++, bi1++, bi2++, bi3++, ci++ )
c[ ci ] += aval1 * b[ bi1 ] + aval2 * b[ bi2 ] + aval3 * b[ bi3 ];
//unrolled 8-block (for better instruction-level parallelism)
for( int j = bn; j < len; j+=8, bi1+=8, bi2+=8, bi3+=8, ci+=8 )
{
//read 64B cachelines of b (3x) and c
//compute c' = aval_1 * b_1 + aval_2 * b_2 + aval_3 * b_3 + c
//write back 64B cacheline of c = c'
c[ ci+0 ] += aval1 * b[ bi1+0 ] + aval2 * b[ bi2+0 ] + aval3 * b[ bi3+0 ];
c[ ci+1 ] += aval1 * b[ bi1+1 ] + aval2 * b[ bi2+1 ] + aval3 * b[ bi3+1 ];
c[ ci+2 ] += aval1 * b[ bi1+2 ] + aval2 * b[ bi2+2 ] + aval3 * b[ bi3+2 ];
c[ ci+3 ] += aval1 * b[ bi1+3 ] + aval2 * b[ bi2+3 ] + aval3 * b[ bi3+3 ];
c[ ci+4 ] += aval1 * b[ bi1+4 ] + aval2 * b[ bi2+4 ] + aval3 * b[ bi3+4 ];
c[ ci+5 ] += aval1 * b[ bi1+5 ] + aval2 * b[ bi2+5 ] + aval3 * b[ bi3+5 ];
c[ ci+6 ] += aval1 * b[ bi1+6 ] + aval2 * b[ bi2+6 ] + aval3 * b[ bi3+6 ];
c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ] + aval3 * b[ bi3+7 ];
}
}
/**
* Computes c += aval1 * b1 + aval2 * b2 + aval3 * b3 + aval4 * b4 for
* four rows of b at once, reusing the loaded cachelines of c.
*
* @param aval1 first scalar multiplier
* @param aval2 second scalar multiplier
* @param aval3 third scalar multiplier
* @param aval4 fourth scalar multiplier
* @param b input vector
* @param c output vector, updated in place
* @param bi1 starting offset of the first row in b
* @param bi2 starting offset of the second row in b
* @param bi3 starting offset of the third row in b
* @param bi4 starting offset of the fourth row in b
* @param ci starting offset in c
* @param len number of values to process
*/
private static void vectMultiplyAdd4( final double aval1, final double aval2, final double aval3, final double aval4, double[] b, double[] c, int bi1, int bi2, int bi3, int bi4, int ci, final int len )
{
final int bn = len%8;
//rest, not aligned to 8-blocks
for( int j = 0; j < bn; j++, bi1++, bi2++, bi3++, bi4++, ci++ )
c[ ci ] += aval1 * b[ bi1 ] + aval2 * b[ bi2 ] + aval3 * b[ bi3 ] + aval4 * b[ bi4 ];
//unrolled 8-block (for better instruction-level parallelism)
for( int j = bn; j < len; j+=8, bi1+=8, bi2+=8, bi3+=8, bi4+=8, ci+=8)
{
//read 64B cachelines of b (4x) and c
//compute c' = aval_1 * b_1 + aval_2 * b_2 + aval_3 * b_3 + aval_4 * b_4 + c
//write back 64B cacheline of c = c'
c[ ci+0 ] += aval1 * b[ bi1+0 ] + aval2 * b[ bi2+0 ] + aval3 * b[ bi3+0 ] + aval4 * b[ bi4+0 ];
c[ ci+1 ] += aval1 * b[ bi1+1 ] + aval2 * b[ bi2+1 ] + aval3 * b[ bi3+1 ] + aval4 * b[ bi4+1 ];
c[ ci+2 ] += aval1 * b[ bi1+2 ] + aval2 * b[ bi2+2 ] + aval3 * b[ bi3+2 ] + aval4 * b[ bi4+2 ];
c[ ci+3 ] += aval1 * b[ bi1+3 ] + aval2 * b[ bi2+3 ] + aval3 * b[ bi3+3 ] + aval4 * b[ bi4+3 ];
c[ ci+4 ] += aval1 * b[ bi1+4 ] + aval2 * b[ bi2+4 ] + aval3 * b[ bi3+4 ] + aval4 * b[ bi4+4 ];
c[ ci+5 ] += aval1 * b[ bi1+5 ] + aval2 * b[ bi2+5 ] + aval3 * b[ bi3+5 ] + aval4 * b[ bi4+5 ];
c[ ci+6 ] += aval1 * b[ bi1+6 ] + aval2 * b[ bi2+6 ] + aval3 * b[ bi3+6 ] + aval4 * b[ bi4+6 ];
c[ ci+7 ] += aval1 * b[ bi1+7 ] + aval2 * b[ bi2+7 ] + aval3 * b[ bi3+7 ] + aval4 * b[ bi4+7 ];
}
}
/**
* Computes c += aval * b with scattered writes into c at positions
* ci + bix[j], for a sparse row given as values b and indexes bix.
*
* @param aval scalar multiplier
* @param b value array of the sparse row
* @param c output vector, updated in place
* @param bix column index array of the sparse row
* @param ci starting offset in c
* @param len number of values to process
*/
private static void vectMultiplyAdd( final double aval, double[] b, double[] c, int[] bix, final int ci, final int len )
{
final int bn = len%8;
//rest, not aligned to 8-blocks
for( int j = 0; j < bn; j++ )
c[ ci + bix[j] ] += aval * b[ j ];
//unrolled 8-block (for better instruction-level parallelism)
for( int j = bn; j < len; j+=8 )
{
//read 64B cacheline of b
//read 64B of c via 'gather'
//compute c' = aval * b + c
//write back 64B of c = c' via 'scatter'
c[ ci+bix[j+0] ] += aval * b[ j+0 ];
c[ ci+bix[j+1] ] += aval * b[ j+1 ];
c[ ci+bix[j+2] ] += aval * b[ j+2 ];
c[ ci+bix[j+3] ] += aval * b[ j+3 ];
c[ ci+bix[j+4] ] += aval * b[ j+4 ];
c[ ci+bix[j+5] ] += aval * b[ j+5 ];
c[ ci+bix[j+6] ] += aval * b[ j+6 ];
c[ ci+bix[j+7] ] += aval * b[ j+7 ];
}
}
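/**
* Variant of the scattered vectMultiplyAdd above that processes the
* index range [bi,len) of a sparse row (values b, indexes bix).
*/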
private static void vectMultiplyAdd( final double aval, double[] b, double[] c, int[] bix, final int bi, final int ci, final int len )
{
final int bn = (len-bi)%8;
//rest, not aligned to 8-blocks
for( int j = bi; j < bi+bn; j++ )
c[ ci + bix[j] ] += aval * b[ j ];
//unrolled 8-block (for better instruction-level parallelism)
for( int j = bi+bn; j < len; j+=8 )
{
//read 64B cacheline of b
//read 64B of c via 'gather'
//compute c' = aval * b + c
//write back 64B of c = c' via 'scatter'
c[ ci+bix[j+0] ] += aval * b[ j+0 ];
c[ ci+bix[j+1] ] += aval * b[ j+1 ];
c[ ci+bix[j+2] ] += aval * b[ j+2 ];
c[ ci+bix[j+3] ] += aval * b[ j+3 ];
c[ ci+bix[j+4] ] += aval * b[ j+4 ];
c[ ci+bix[j+5] ] += aval * b[ j+5 ];
c[ ci+bix[j+6] ] += aval * b[ j+6 ];
c[ ci+bix[j+7] ] += aval * b[ j+7 ];
}
}
/**
* Computes c = aval * b, overwriting c instead of accumulating.
*
* @param aval scalar multiplier
* @param b input vector
* @param c output vector, overwritten
* @param bi starting offset in b
* @param ci starting offset in c
* @param len number of values to process
*/
private static void vectMultiplyWrite( final double aval, double[] b, double[] c, int bi, int ci, final int len )
{
final int bn = len%8;
//rest, not aligned to 8-blocks
for( int j = 0; j < bn; j++, bi++, ci++)
c[ ci ] = aval * b[ bi ];
//unrolled 8-block (for better instruction-level parallelism)
for( int j = bn; j < len; j+=8, bi+=8, ci+=8)
{
//read 64B cachelines of b and c
//compute c' = aval * b
//write back 64B cacheline of c = c'
c[ ci+0 ] = aval * b[ bi+0 ];
c[ ci+1 ] = aval * b[ bi+1 ];
c[ ci+2 ] = aval * b[ bi+2 ];
c[ ci+3 ] = aval * b[ bi+3 ];
c[ ci+4 ] = aval * b[ bi+4 ];
c[ ci+5 ] = aval * b[ bi+5 ];
c[ ci+6 ] = aval * b[ bi+6 ];
c[ ci+7 ] = aval * b[ bi+7 ];
}
}
/**
* Computes c = a * b cell-wise, overwriting c.
*
* @param a first input vector
* @param b second input vector
* @param c output vector, overwritten
* @param ai starting offset in a
* @param bi starting offset in b
* @param ci starting offset in c
* @param len number of values to process
*/
@SuppressWarnings("unused")
private static void vectMultiplyWrite( double[] a, double[] b, double[] c, int ai, int bi, int ci, final int len )
{
final int bn = len%8;
//rest, not aligned to 8-blocks
for( int j = 0; j < bn; j++, ai++, bi++, ci++)
c[ ci ] = a[ ai ] * b[ bi ];
//unrolled 8-block (for better instruction-level parallelism)
for( int j = bn; j < len; j+=8, ai+=8, bi+=8, ci+=8)
{
//read 64B cachelines of a and b
//compute c' = a * b
//write back 64B cacheline of c = c'
c[ ci+0 ] = a[ ai+0 ] * b[ bi+0 ];
c[ ci+1 ] = a[ ai+1 ] * b[ bi+1 ];
c[ ci+2 ] = a[ ai+2 ] * b[ bi+2 ];
c[ ci+3 ] = a[ ai+3 ] * b[ bi+3 ];
c[ ci+4 ] = a[ ai+4 ] * b[ bi+4 ];
c[ ci+5 ] = a[ ai+5 ] * b[ bi+5 ];
c[ ci+6 ] = a[ ai+6 ] * b[ bi+6 ];
c[ ci+7 ] = a[ ai+7 ] * b[ bi+7 ];
}
}
/**
* Computes c *= a cell-wise, in place.
*
* @param a input vector
* @param c output vector, updated in place
* @param ai starting offset in a
* @param ci starting offset in c
* @param len number of values to process
*/
private static void vectMultiply( double[] a, double[] c, int ai, int ci, final int len )
{
final int bn = len%8;
//rest, not aligned to 8-blocks
for( int j = 0; j < bn; j++, ai++, ci++)
c[ ci ] *= a[ ai ];
//unrolled 8-block (for better instruction-level parallelism)
for( int j = bn; j < len; j+=8, ai+=8, ci+=8)
{
//read 64B cachelines of a and c
//compute c' = c * a
//write back 64B cacheline of c = c'
c[ ci+0 ] *= a[ ai+0 ];
c[ ci+1 ] *= a[ ai+1 ];
c[ ci+2 ] *= a[ ai+2 ];
c[ ci+3 ] *= a[ ai+3 ];
c[ ci+4 ] *= a[ ai+4 ];
c[ ci+5 ] *= a[ ai+5 ];
c[ ci+6 ] *= a[ ai+6 ];
c[ ci+7 ] *= a[ ai+7 ];
}
}
/**
* Computes c += a cell-wise, in place.
*
* @param a input vector
* @param c output vector, updated in place
* @param ai starting offset in a
* @param ci starting offset in c
* @param len number of values to process
*/
private static void vectAdd( double[] a, double[] c, int ai, int ci, final int len )
{
final int bn = len%8;
//rest, not aligned to 8-blocks
for( int j = 0; j < bn; j++, ai++, ci++)
c[ ci ] += a[ ai ];
//unrolled 8-block (for better instruction-level parallelism)
for( int j = bn; j < len; j+=8, ai+=8, ci+=8)
{
//read 64B cachelines of a and c
//compute c' = c + a
//write back 64B cacheline of c = c'
c[ ci+0 ] += a[ ai+0 ];
c[ ci+1 ] += a[ ai+1 ];
c[ ci+2 ] += a[ ai+2 ];
c[ ci+3 ] += a[ ai+3 ];
c[ ci+4 ] += a[ ai+4 ];
c[ ci+5 ] += a[ ai+5 ];
c[ ci+6 ] += a[ ai+6 ];
c[ ci+7 ] += a[ ai+7 ];
}
}
/**
* Computes c += a1 + a2 + a3 + a4 cell-wise, in place (four partial
* results at a time for better reuse of the loaded cachelines of c).
*
* @param a1 first input vector
* @param a2 second input vector
* @param a3 third input vector
* @param a4 fourth input vector
* @param c output vector, updated in place
* @param ai starting offset in the input vectors
* @param ci starting offset in c
* @param len number of values to process
*/
private static void vectAdd4( double[] a1, double[] a2, double[] a3, double[] a4, double[] c, int ai, int ci, final int len )
{
final int bn = len%8;
//rest, not aligned to 8-blocks
for( int j = 0; j < bn; j++, ai++, ci++)
c[ ci ] += a1[ ai ] + a2[ ai ] + a3[ ai ] + a4[ ai ];
//unrolled 8-block (for better instruction-level parallelism)
for( int j = bn; j < len; j+=8, ai+=8, ci+=8)
{
//read 64B cachelines of a (4x) and c
//compute c' = c + a1 + a2 + a3 + a4
//write back 64B cacheline of c = c'
c[ ci+0 ] += a1[ ai+0 ] + a2[ ai+0 ] + a3[ ai+0 ] + a4[ ai+0 ];
c[ ci+1 ] += a1[ ai+1 ] + a2[ ai+1 ] + a3[ ai+1 ] + a4[ ai+1 ];
c[ ci+2 ] += a1[ ai+2 ] + a2[ ai+2 ] + a3[ ai+2 ] + a4[ ai+2 ];
c[ ci+3 ] += a1[ ai+3 ] + a2[ ai+3 ] + a3[ ai+3 ] + a4[ ai+3 ];
c[ ci+4 ] += a1[ ai+4 ] + a2[ ai+4 ] + a3[ ai+4 ] + a4[ ai+4 ];
c[ ci+5 ] += a1[ ai+5 ] + a2[ ai+5 ] + a3[ ai+5 ] + a4[ ai+5 ];
c[ ci+6 ] += a1[ ai+6 ] + a2[ ai+6 ] + a3[ ai+6 ] + a4[ ai+6 ];
c[ ci+7 ] += a1[ ai+7 ] + a2[ ai+7 ] + a3[ ai+7 ] + a4[ ai+7 ];
}
}
/**
* Computes a single weighted-sigmoid value wij * sigmoid(u_i dot v_j),
* with optional minus/log variants, for dense factor arrays.
*
* @param wij weight value
* @param u dense row-major factor U
* @param v dense row-major factor V
* @param uix row offset into u
* @param vix row offset into v
* @param flagminus negate the dot product before applying the sigmoid
* @param flaglog apply log to the sigmoid value
* @param len dot-product length (common dimension)
* @return weighted sigmoid value
*/
private static double wsigmoid( final double wij, double[] u, double[] v, final int uix, final int vix, final boolean flagminus, final boolean flaglog, final int len )
{
//compute dot product over ui vj
double uvij = dotProduct(u, v, uix, vix, len);
//compute core sigmoid function
double cval = flagminus ?
1 / (1 + FastMath.exp(uvij)) :
1 / (1 + FastMath.exp(-uvij));
//compute weighted output
return wij * ((flaglog) ? FastMath.log(cval) : cval);
}
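/* With sigma(x) = 1/(1+exp(-x)), the four WSigmoidType variants thus
* compute per non-zero wij:
*   BASIC:     wij * sigma( u_i dot v_j)
*   MINUS:     wij * sigma(-u_i dot v_j)
*   LOG:       wij * log(sigma( u_i dot v_j))
*   LOG_MINUS: wij * log(sigma(-u_i dot v_j))
*/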
/**
* Computes a single weighted-sigmoid value as above, for generic
* (sparse or dense) factor blocks, where uix and vix are row indexes.
*
* @param wij weight value
* @param u factor block U
* @param v factor block V
* @param uix row index into u
* @param vix row index into v
* @param flagminus negate the dot product before applying the sigmoid
* @param flaglog apply log to the sigmoid value
* @param len dot-product length (common dimension)
* @return weighted sigmoid value
*/
private static double wsigmoid( final double wij, MatrixBlock u, MatrixBlock v, final int uix, final int vix, final boolean flagminus, final boolean flaglog, final int len )
{
//compute dot product over ui vj
double uvij = dotProductGeneric(u, v, uix, vix, len);
//compute core sigmoid function
double cval = flagminus ?
1 / (1 + FastMath.exp(uvij)) :
1 / (1 + FastMath.exp(-uvij));
//compute weighted output
return wij * ((flaglog) ? FastMath.log(cval) : cval);
}
/**
* Computes one wdivmm contribution for a single non-zero wij and
* accumulates the resulting scaled row into the output c.
*
* @param wij weight value
* @param u dense row-major factor U
* @param v dense row-major factor V
* @param c dense output array, updated in place
* @param uix row offset into u
* @param vix row offset into v
* @param left true for left wdivmm (accumulate u into the v-indexed row)
* @param mult true for the mult variant (wij * uvij)
* @param minus true for the minus variant (uvij - wij)
* @param len dot-product length (common dimension)
*/
private static void wdivmm( final double wij, double[] u, double[] v, double[] c, final int uix, final int vix, final boolean left, final boolean mult, final boolean minus, final int len )
{
//compute dot product over ui vj
double uvij = dotProduct(u, v, uix, vix, len);
//compute core wdivmm
double tmpval = minus ? uvij - wij :
mult ? wij * uvij : wij / uvij;
//prepare inputs for final mm
int bix = left ? uix : vix;
int cix = left ? vix : uix;
double[] b = left ? u : v;
//compute final mm output
vectMultiplyAdd(tmpval, b, c, bix, cix, len);
}
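/* In matrix terms, a sketch of the non-basic wdivmm patterns computed
* cell-wise above, with the element-wise transform T(W) applied only
* where W != 0 (T(W) = W/(U %*% t(V)) by default, W*(U %*% t(V)) for
* mult, (U %*% t(V))-W for minus):
*   right: c += T(W) %*% V    (rows of c indexed by i)
*   left:  c += t(T(W)) %*% U (rows of c indexed by j)
*/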
/**
* Computes one wdivmm contribution as above, for generic (sparse or
* dense) factor blocks, where uix and vix are row indexes.
*
* @param wij weight value
* @param u factor block U
* @param v factor block V
* @param c dense output array, updated in place
* @param uix row index into u
* @param vix row index into v
* @param left true for left wdivmm (accumulate u into the v-indexed row)
* @param mult true for the mult variant (wij * uvij)
* @param minus true for the minus variant (uvij - wij)
* @param len dot-product length (common dimension)
*/
private static void wdivmm( final double wij, MatrixBlock u, MatrixBlock v, double[] c, final int uix, final int vix, final boolean left, boolean mult, final boolean minus, final int len )
{
//compute dot product over ui vj
double uvij = dotProductGeneric(u, v, uix, vix, len);
//compute core wdivmm
double wtmp = minus ? uvij - wij :
mult ? wij * uvij : wij / uvij;
//prepare inputs for final mm
int bix = left ? uix : vix;
int cix = left ? vix*len : uix*len;
MatrixBlock b = left ? u : v;
//compute final mm
for( int k2=0; k2<len; k2++ )
c[cix+k2] += b.quickGetValue(bix, k2) * wtmp;
}
/**
* Computes a generic dot product over row ai of a and row bi of b,
* accessing values via quickGetValue (any sparse/dense representation).
*
* @param a first matrix
* @param b second matrix
* @param ai row index into a
* @param bi row index into b
* @param len number of columns to process
* @return dot product
*/
private static double dotProductGeneric(MatrixBlock a, MatrixBlock b, final int ai, final int bi, int len)
{
double val = 0;
for( int k2=0; k2<len; k2++ )
val += a.quickGetValue(ai, k2) * b.quickGetValue(bi, k2);
return val;
}
/**
* Used for all versions of TSMM where the result is known to be symmetric.
* Hence, we compute only the upper triangular matrix and copy this partial
* result to the lower triangular matrix once at the end.
*
* @param ret dense result matrix, modified in place
*/
private static void copyUpperToLowerTriangle( MatrixBlock ret )
{
double[] c = ret.denseBlock;
final int m = ret.rlen;
final int n = ret.clen;
//copy symmetric values
for( int i=0, uix=0; i<m; i++, uix+=n )
for( int j=i+1, lix=j*n+i; j<n; j++, lix+=n )
c[ lix ] = c[ uix+j ];
}
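/* Index arithmetic sketch: for a cell (i,j) with j>i, the upper-triangle
* offset is uix+j = i*n+j and its mirror in the lower triangle is
* lix = j*n+i; e.g., for n=4, cell (0,2) at offset 2 is copied to
* cell (2,0) at offset 8.
*/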
/**
* Prepares the input of a transpose-self matrix multiply: for the sparse
* X %*% t(X) case, X is transposed up-front in order to preserve the
* sparse row-wise access pattern.
*
* @param m1 input matrix
* @param leftTranspose true for t(X) %*% X, false for X %*% t(X)
* @return original or transposed input
* @throws DMLRuntimeException
*/
private static MatrixBlock prepMatrixMultTransposeSelfInput( MatrixBlock m1, boolean leftTranspose )
throws DMLRuntimeException
{
MatrixBlock ret = m1;
if( !leftTranspose && m1.sparse && m1.rlen > 1) //X%*%t(X) SPARSE MATRIX
{
//directly via LibMatrixReorg in order to prevent sparsity change
MatrixBlock tmpBlock = new MatrixBlock(m1.clen, m1.rlen, m1.sparse);
LibMatrixReorg.reorg(m1, tmpBlock, new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
ret = tmpBlock;
}
return ret;
}
/**
* Checks if the right-hand-side input should be transposed up-front
* (dense-dense with a skinny, non-vector rhs).
*
* @param m1 left input matrix
* @param m2 right input matrix
* @return true if m2 should be transposed
*/
private static boolean checkPrepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2 )
{
//transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output
return (!m1.sparse && !m2.sparse && m1.rlen>m2.clen && m2.rlen > 64 && m2.clen > 1 && m2.clen < 64);
}
/**
* Checks if we should parallelize over the rows of the right-hand-side
* input, which is useful if the left-hand side has very few rows.
*
* @param m1 left input matrix
* @param m2 right input matrix
* @param k degree of parallelism
* @return true if parallelization over m2 rows applies
*/
private static boolean checkParMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2, int k )
{
//parallelize over rows in rhs matrix if number of rows in lhs/output is very small
return (m1.rlen==1 && LOW_LEVEL_OPTIMIZATION && m2.clen>1 && !(m1.isUltraSparse()||m2.isUltraSparse()))
|| (m1.rlen<=16 && LOW_LEVEL_OPTIMIZATION && m2.clen>1 && m2.rlen > m1.rlen && !m1.sparse && !m2.sparse
&& (long)k * 8 * m1.rlen * m2.clen < MEM_OVERHEAD_THRESHOLD );
}
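/* Memory-guard sketch with hypothetical values: for k=16 threads,
* m1.rlen=16 and m2.clen=1000, each task allocates a local 16 x 1000
* dense output (125KB), i.e., 16*8B*16*1000 = ~2MB in total, just below
* MEM_OVERHEAD_THRESHOLD; larger outputs fall back to parallelization
* over the lhs rows.
*/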
/**
* Transposes the right-hand-side input if beneficial
* (see checkPrepMatrixMultRightInput).
*
* @param m1 left input matrix
* @param m2 right input matrix
* @return original or transposed m2
* @throws DMLRuntimeException
*/
private static MatrixBlock prepMatrixMultRightInput( MatrixBlock m1, MatrixBlock m2 )
throws DMLRuntimeException
{
MatrixBlock ret = m2;
//transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output
if( checkPrepMatrixMultRightInput(m1, m2) ) {
MatrixBlock tmpBlock = new MatrixBlock(m2.clen, m2.rlen, m2.sparse);
LibMatrixReorg.reorg(m2, tmpBlock, new ReorgOperator(SwapIndex.getSwapIndexFnObject()));
ret = tmpBlock;
}
return ret;
}
/**
* Copies the non-zero values of a contiguous row fragment of a into tmpa
* and records the corresponding scan indexes on b in tmpbi.
*
* @param a dense input array
* @param aixi starting offset in a
* @param bk row block offset on b
* @param bj column offset on b
* @param n number of columns of b
* @param tmpa output buffer for non-zero values
* @param tmpbi output buffer for scan indexes on b
* @param bklen number of values to scan
* @return number of copied non-zeros
*/
private static int copyNonZeroElements( double[] a, final int aixi, final int bk, final int bj, final int n, double[] tmpa, int[] tmpbi, final int bklen )
{
int knnz = 0;
for( int k = 0; k < bklen; k++ )
if( a[ aixi+k ] != 0 ) {
tmpa[ knnz ] = a[ aixi+k ];
tmpbi[ knnz ] = (bk+k) * n + bj; //scan index on b
knnz ++;
}
return knnz;
}
/**
* Copies the non-zero values of a column fragment of a (stride n) into
* tmpa and records the corresponding scan indexes on b in tmpbi.
*
* @param a dense input array
* @param aixi starting offset in a
* @param bk row block offset on b
* @param bj column offset on b
* @param n row stride in a
* @param nx number of columns of b
* @param tmpa output buffer for non-zero values
* @param tmpbi output buffer for scan indexes on b
* @param bklen number of values to scan
* @return number of copied non-zeros
*/
private static int copyNonZeroElements( double[] a, int aixi, final int bk, final int bj, final int n, final int nx, double[] tmpa, int[] tmpbi, final int bklen )
{
int knnz = 0;
for( int k = 0; k < bklen; k++, aixi+=n )
if( a[ aixi ] != 0 ) {
tmpa[ knnz ] = a[ aixi ];
tmpbi[ knnz ] = (bk+k) * nx + bj; //scan index on b
knnz ++;
}
return knnz;
}
/**
* Aggregates the scalar partial results of all executed tasks into ret.
*
* @param tasks executed scalar-result tasks
* @param ret 1x1 output block
*/
private static void sumScalarResults(ArrayList<ScalarResultTask> tasks, MatrixBlock ret)
{
//aggregate partial results
double val = 0;
for(ScalarResultTask task : tasks)
val += task.getScalarResult();
ret.quickSetValue(0, 0, val);
}
/**
* Aggregates dense partial results into ret in a cache-conscious,
* blocked manner.
*
* @param partret partial result arrays
* @param ret output array, updated in place
*/
@SuppressWarnings("unused")
private static void sumDenseResults( double[][] partret, double[] ret )
{
final int len = ret.length;
final int k = partret.length;
final int bk = k % 4;
final int blocksize = 2 * 1024; //16KB (half of common L1 data)
//cache-conscious aggregation to prevent repeated scans/writes of ret
for( int bi=0; bi<len; bi+=blocksize ) {
int llen = Math.min(len-bi, blocksize);
//aggregate next block from all partial results
for( int j=0; j<bk; j++ ) //rest (not aligned to 4)
vectAdd(partret[j], ret, bi, bi, llen);
for( int j=bk; j<k; j+=4 ) //4 partial results at a time
vectAdd4(partret[j], partret[j+1], partret[j+2], partret[j+3], ret, bi, bi, llen);
}
}
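/* Blocking arithmetic (a sketch, assuming a common 32KB L1 data cache):
* each block of 2*1024 doubles is 16KB, so the current block of ret can
* stay L1-resident while the corresponding blocks of all partial results
* are streamed through and accumulated, before moving to the next block.
*/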
/////////////////////////////////////////////////////////
// Task Implementations for Multi-Threaded Operations //
/////////////////////////////////////////////////////////
/**
* Task for multi-threaded matrix multiplication over a row partition
* [rl,ru), optionally with a thread-local output for parallelization
* over the rows of m2.
*/
private static class MatrixMultTask implements Callable<Object>
{
private MatrixBlock _m1 = null;
private MatrixBlock _m2 = null;
private MatrixBlock _ret = null;
private boolean _tm2 = false; //transposed m2
private boolean _pm2 = false; //par over m2
private int _rl = -1;
private int _ru = -1;
private long _nnz = -1;
protected MatrixMultTask( MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru )
{
_m1 = m1;
_m2 = m2;
_tm2 = tm2;
_pm2 = pm2;
_rl = rl;
_ru = ru;
if( pm2 ) { //vector-matrix / matrix-matrix
//allocate local result for partial aggregation
_ret = new MatrixBlock(ret.rlen, ret.clen, false);
}
else { //default case
_ret = ret;
}
}
@Override
public Object call() throws DMLRuntimeException
{
//thread-local allocation
if( _pm2 )
_ret.allocateDenseBlock();
//compute block matrix multiplication
if( _m1.isUltraSparse() || _m2.isUltraSparse() )
matrixMultUltraSparse(_m1, _m2, _ret, _rl, _ru);
else if(!_m1.sparse && !_m2.sparse)
matrixMultDenseDense(_m1, _m2, _ret, _tm2, _pm2, _rl, _ru);
else if(_m1.sparse && _m2.sparse)
matrixMultSparseSparse(_m1, _m2, _ret, _pm2, _rl, _ru);
else if(_m1.sparse)
matrixMultSparseDense(_m1, _m2, _ret, _pm2, _rl, _ru);
else
matrixMultDenseSparse(_m1, _m2, _ret, _pm2, _rl, _ru);
//maintain block nnz (upper bounds inclusive)
if( !_pm2 )
_nnz = _ret.recomputeNonZeros(_rl, _ru-1, 0, _ret.getNumColumns()-1);
return null;
}
public long getPartialNnz(){
return _nnz;
}
public MatrixBlock getResult(){
return _ret;
}
}
/**
* Task for multi-threaded matrix multiplication chains (mmchain),
* aggregating into a thread-local result.
*/
private static class MatrixMultChainTask implements Callable<Object>
{
private MatrixBlock _m1 = null;
private MatrixBlock _m2 = null;
private MatrixBlock _m3 = null;
private MatrixBlock _ret = null;
private ChainType _ct = null;
private int _rl = -1;
private int _ru = -1;
protected MatrixMultChainTask( MatrixBlock mX, MatrixBlock mV, MatrixBlock mW, MatrixBlock ret, ChainType ct, int rl, int ru )
throws DMLRuntimeException
{
_m1 = mX;
_m2 = mV;
_m3 = mW;
_ct = ct;
_rl = rl;
_ru = ru;
//allocate local result for partial aggregation
_ret = new MatrixBlock(ret.rlen, ret.clen, false);
_ret.allocateDenseBlock();
}
@Override
public Object call() throws DMLRuntimeException
{
if( _m1.sparse )
matrixMultChainSparse(_m1, _m2, _m3, _ret, _ct, _rl, _ru);
else
matrixMultChainDense(_m1, _m2, _m3, _ret, _ct, _rl, _ru);
//NOTE: we don't do global aggregation from concurrent tasks in order
//to prevent synchronization (sequential aggregation led to better
//performance after JIT)
return null;
}
public MatrixBlock getResult() {
return _ret;
}
}
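/**
* Task for multi-threaded transpose-self matrix multiply (TSMM) over a
* row/column partition.
*/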
private static class MatrixMultTransposeTask implements Callable<Object>
{
private MatrixBlock _m1 = null;
private MatrixBlock _ret = null;
private boolean _left = true;
private int _rl = -1;
private int _ru = -1;
protected MatrixMultTransposeTask( MatrixBlock m1, MatrixBlock ret, boolean left, int rl, int ru )
{
_m1 = m1;
_ret = ret;
_left = left;
_rl = rl;
_ru = ru;
}
@Override
public Object call() throws DMLRuntimeException
{
if( _m1.sparse )
matrixMultTransposeSelfSparse(_m1, _ret, _left, _rl, _ru);
else
matrixMultTransposeSelfDense(_m1, _ret, _left, _rl, _ru);
return null;
}
}
/**
* Task for multi-threaded permutation matrix multiplies.
*/
private static class MatrixMultPermuteTask implements Callable<Object>
{
private MatrixBlock _pm1 = null;
private MatrixBlock _m2 = null;
private MatrixBlock _ret1 = null;
private MatrixBlock _ret2 = null;
private int _rl = -1;
private int _ru = -1;
protected MatrixMultPermuteTask( MatrixBlock pm1, MatrixBlock m2, MatrixBlock ret1, MatrixBlock ret2, int rl, int ru)
{
_pm1 = pm1;
_m2 = m2;
_ret1 = ret1;
_ret2 = ret2;
_rl = rl;
_ru = ru;
}
@Override
public Object call() throws DMLRuntimeException
{
if( _m2.sparse )
matrixMultPermuteSparse(_pm1, _m2, _ret1, _ret2, _rl, _ru);
else if( _ret1.sparse )
matrixMultPermuteDenseSparse(_pm1, _m2, _ret1, _ret2, _rl, _ru);
else
matrixMultPermuteDense(_pm1, _m2, _ret1, _ret2, _rl, _ru);
return null;
}
}
/**
* Common interface for tasks that aggregate into a scalar partial result.
*/
private static interface ScalarResultTask extends Callable<Object>{
public double getScalarResult();
}
/**
* Task for multi-threaded weighted squared loss (wsloss) computations,
* aggregating into a local 1x1 result.
*/
private static class MatrixMultWSLossTask implements ScalarResultTask
{
private MatrixBlock _mX = null;
private MatrixBlock _mU = null;
private MatrixBlock _mV = null;
private MatrixBlock _mW = null;
private MatrixBlock _ret = null;
private WeightsType _wt = null;
private int _rl = -1;
private int _ru = -1;
protected MatrixMultWSLossTask(MatrixBlock mX, MatrixBlock mU, MatrixBlock mV, MatrixBlock mW, WeightsType wt, int rl, int ru)
throws DMLRuntimeException
{
_mX = mX;
_mU = mU;
_mV = mV;
_mW = mW;
_wt = wt;
_rl = rl;
_ru = ru;
//allocate local result for partial aggregation
_ret = new MatrixBlock(1, 1, false);
_ret.allocateDenseBlock();
}
@Override
public Object call() throws DMLRuntimeException
{
if( !_mX.sparse && !_mU.sparse && !_mV.sparse && (_mW==null || !_mW.sparse)
&& !_mX.isEmptyBlock() && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()
&& (_mW==null || !_mW.isEmptyBlock()))
matrixMultWSLossDense(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru);
else if( _mX.sparse && !_mU.sparse && !_mV.sparse && (_mW==null || _mW.sparse)
&& !_mX.isEmptyBlock() && !_mU.isEmptyBlock() && !_mV.isEmptyBlock()
&& (_mW==null || !_mW.isEmptyBlock()))
matrixMultWSLossSparseDense(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru);
else
matrixMultWSLossGeneric(_mX, _mU, _mV, _mW, _ret, _wt, _rl, _ru);
return null;
}
@Override
public double getScalarResult() {
return _ret.quickGetValue(0, 0);
}
}
/**
* Task for multi-threaded weighted sigmoid (wsigmoid) computations.
*/
private static class MatrixMultWSigmoidTask implements Callable<Object>
{
private MatrixBlock _mW = null;
private MatrixBlock _mU = null;
private MatrixBlock _mV = null;
private MatrixBlock _ret = null;
private WSigmoidType _wt = null;
private int _rl = -1;
private int _ru = -1;
private long _nnz = -1;
protected MatrixMultWSigmoidTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WSigmoidType wt, int rl, int ru)
throws DMLRuntimeException
{
_mW = mW;
_mU = mU;
_mV = mV;
_ret = ret;
_wt = wt;
_rl = rl;
_ru = ru;
}
@Override
public Object call() throws DMLRuntimeException
{
//core weighted sigmoid mm computation
if( !_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() )
matrixMultWSigmoidDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
else if( _mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock())
matrixMultWSigmoidSparseDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
else
matrixMultWSigmoidGeneric(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
//maintain block nnz (upper bounds inclusive)
_nnz = _ret.recomputeNonZeros(_rl, _ru-1, 0, _ret.getNumColumns()-1);
return null;
}
public long getPartialNnz(){
return _nnz;
}
}
/**
* Task for multi-threaded weighted divide matrix multiply (wdivmm)
* computations over a row and column partition.
*/
private static class MatrixMultWDivTask implements Callable<Object>
{
private MatrixBlock _mW = null;
private MatrixBlock _mU = null;
private MatrixBlock _mV = null;
private MatrixBlock _ret = null;
private WDivMMType _wt = null;
private int _rl = -1;
private int _ru = -1;
private int _cl = -1;
private int _cu = -1;
private long _nnz = -1;
protected MatrixMultWDivTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, MatrixBlock ret, WDivMMType wt, int rl, int ru, int cl, int cu)
throws DMLRuntimeException
{
_mW = mW;
_mU = mU;
_mV = mV;
_wt = wt;
_rl = rl;
_ru = ru;
_cl = cl;
_cu = cu;
_ret = ret;
}
@Override
public Object call() throws DMLRuntimeException
{
//core weighted div mm computation
if( !_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() )
matrixMultWDivMMDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru, _cl, _cu);
else if( _mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock())
matrixMultWDivMMSparseDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru, _cl, _cu);
else
matrixMultWDivMMGeneric(_mW, _mU, _mV, _ret, _wt, _rl, _ru, _cl, _cu);
//maintain partial nnz for right (upper bounds inclusive)
int rl = _wt.isLeft() ? _cl : _rl;
int ru = _wt.isLeft() ? _cu : _ru;
_nnz = _ret.recomputeNonZeros(rl, ru-1, 0, _ret.getNumColumns()-1);
return null;
}
/**
* Returns the partial nnz of the output row range (for wdivmm left,
* the column range [cl,cu) of W corresponds to the output rows).
* @return partial number of non-zeros
*/
public long getPartialNnz(){
return _nnz;
}
}
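/**
* Task for multi-threaded weighted cross-entropy (wcemm) computations,
* aggregating into a local 1x1 result.
*/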
private static class MatrixMultWCeTask implements ScalarResultTask
{
private MatrixBlock _mW = null;
private MatrixBlock _mU = null;
private MatrixBlock _mV = null;
private MatrixBlock _ret = null;
private WCeMMType _wt = null;
private int _rl = -1;
private int _ru = -1;
protected MatrixMultWCeTask(MatrixBlock mW, MatrixBlock mU, MatrixBlock mV, WCeMMType wt, int rl, int ru)
throws DMLRuntimeException
{
_mW = mW;
_mU = mU;
_mV = mV;
_wt = wt;
_rl = rl;
_ru = ru;
//allocate local result for partial aggregation
_ret = new MatrixBlock(1, 1, false);
_ret.allocateDenseBlock();
}
@Override
public Object call() throws DMLRuntimeException
{
//core weighted cross-entropy mm computation
if( !_mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock() )
matrixMultWCeMMDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
else if( _mW.sparse && !_mU.sparse && !_mV.sparse && !_mU.isEmptyBlock() && !_mV.isEmptyBlock())
matrixMultWCeMMSparseDense(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
else
matrixMultWCeMMGeneric(_mW, _mU, _mV, _ret, _wt, _rl, _ru);
return null;
}
@Override
public double getScalarResult() {
return _ret.quickGetValue(0, 0);
}
}
}