/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.compress;

import java.util.Arrays;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.compress.utils.ConverterUtils;
import org.apache.sysml.runtime.compress.utils.LinearAlgebraUtils;
import org.apache.sysml.runtime.functionobjects.Builtin;
import org.apache.sysml.runtime.functionobjects.KahanFunction;
import org.apache.sysml.runtime.functionobjects.KahanPlus;
import org.apache.sysml.runtime.instructions.cp.KahanObject;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.operators.ScalarOperator;

/**
 * Class to encapsulate information about a column group that is encoded with
 * simple lists of offsets for each set of distinct values.
 *
 * <p>As read by the methods of this class, the offset list of value {@code k}
 * starts at {@code _data[_ptr[k]]} and has {@code len(k)} entries. It is a
 * sequence of segments, each consisting of a length field followed by that
 * many row offsets relative to the start of the segment; consecutive segments
 * are {@link BitmapEncoder#BITMAP_BLOCK_SZ} rows apart.
 */
public class ColGroupOLE extends ColGroupOffset
{
    private static final long serialVersionUID = -9157676271360528008L;

    private static final Log LOG = LogFactory.getLog(ColGroupOLE.class.getName());

    /**
     * Default constructor; only delegates to the superclass.
     */
    public ColGroupOLE() {
        super();
    }

    /**
     * Main constructor. Constructs and stores the necessary bitmaps.
     *
     * @param colIndices
     *            indices (within the block) of the columns included in this
     *            column group
     * @param numRows
     *            total number of rows in the parent block
     * @param ubm
     *            Uncompressed bitmap representation of the block
     */
    public ColGroupOLE(int[] colIndices, int numRows, UncompressedBitmap ubm)
    {
        super(colIndices, numRows, ubm);

        // compress the bitmaps
        final int numVals = ubm.getNumValues();
        char[][] lbitmaps = new char[numVals][];
        int totalLen = 0;
        for (int i = 0; i < numVals; i++) {
            lbitmaps[i] = BitmapEncoder.genOffsetBitmap(
                ubm.getOffsetsList(i).extractValues(), ubm.getNumOffsets(i));
            totalLen += lbitmaps[i].length;
        }

        // compact bitmaps to linearized representation
        createCompressedBitmaps(numVals, totalLen, lbitmaps);

        // optional skip list: per value, the data position of the segment
        // that starts at the (block-aligned) middle row of the group
        if (LOW_LEVEL_OPT && CREATE_SKIPLIST
            && numRows > 2 * BitmapEncoder.BITMAP_BLOCK_SZ)
        {
            int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
            _skiplist = new int[numVals];
            int rl = (getNumRows() / 2 / blksz) * blksz;
            for (int k = 0; k < numVals; k++) {
                int boff = _ptr[k];
                int blen = len(k);
                int bix = 0;
                for (int i = 0; i < rl && bix < blen; i += blksz) {
                    bix += _data[boff + bix] + 1;
                }
                _skiplist[k] = bix;
            }
        }

        // debug output
        double ucSize = MatrixBlock.estimateSizeDenseInMemory(numRows, colIndices.length);
        if (estimateInMemorySize() > ucSize) {
            LOG.warn("OLE group larger than UC dense: "+estimateInMemorySize()+" "+ucSize);
        }
    }

    /**
     * Constructs a column group from already encoded offset lists. The given
     * arrays are stored by reference (no copy) and no skip list is created.
     *
     * @param colIndices indices of the columns included in this column group
     * @param numRows total number of rows
     * @param zeros zero indicator passed through to the superclass
     * @param values distinct value tuples, passed through to the superclass
     * @param bitmaps linearized encoded offset lists of all values
     * @param bitmapOffs start position of each value's offset list in {@code bitmaps}
     */
    public ColGroupOLE(int[] colIndices, int numRows, boolean zeros,
        double[] values, char[] bitmaps, int[] bitmapOffs)
    {
        super(colIndices, numRows, zeros, values);
        _data = bitmaps;
        _ptr = bitmapOffs;
    }

    /**
     * @return {@link CompressionType#OLE_BITMAP}
     */
    @Override
    public CompressionType getCompType() {
        return CompressionType.OLE_BITMAP;
    }

    /**
     * Creates a decoder over the offset list of a single value.
     *
     * @param k index of the distinct value
     * @return iterator created from the encoded offset list of value {@code k}
     */
    @Override
    public Iterator<Integer> getDecodeIterator(int k) {
        return new BitmapDecoderOLE(_data, _ptr[k], len(k));
    }

    /**
     * Appends the non-zero values of rows {@code [rl, ru)} to the target block
     * at this group's column indexes.
     *
     * @param target output block
     * @param rl row lower bound (inclusive)
     * @param ru row upper bound (exclusive)
     */
    @Override
    public void decompressToBlock(MatrixBlock target, int rl, int ru)
    {
        if (LOW_LEVEL_OPT && getNumValues() > 1)
        {
            final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
            final int numCols = getNumCols();
            final int numVals = getNumValues();

            // cache blocking config and position array
            int[] apos = skipScan(numVals, rl);

            // cache conscious append via horizontal scans
            for (int bi = rl; bi < ru; bi += blksz) {
                for (int k = 0, off = 0; k < numVals; k++, off += numCols) {
                    int boff = _ptr[k];
                    int blen = len(k);
                    int bix = apos[k];
                    if (bix >= blen) {
                        continue;
                    }
                    int len = _data[boff + bix];
                    int pos = boff + bix + 1;
                    for (int i = pos; i < pos + len; i++) {
                        for (int j = 0, rix = bi + _data[i]; j < numCols; j++) {
                            if (_values[off + j] != 0) {
                                target.appendValue(rix, _colIndexes[j], _values[off + j]);
                            }
                        }
                    }
                    apos[k] += len + 1;
                }
            }
        }
        else
        {
            // call generic decompression with decoder
            super.decompressToBlock(target, rl, ru);
        }
    }

    /**
     * Appends the non-zero values of all rows to the target block, mapping
     * each column of this group through {@code colixTargets}.
     *
     * @param target output block
     * @param colixTargets target column index per source column index
     */
    @Override
    public void decompressToBlock(MatrixBlock target, int[] colixTargets)
    {
        if (LOW_LEVEL_OPT && getNumValues() > 1)
        {
            final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
            final int numCols = getNumCols();
            final int numVals = getNumValues();
            final int n = getNumRows();

            // cache blocking config and position array
            int[] apos = new int[numVals];
            int[] cix = new int[numCols];

            // prepare target col indexes
            for (int j = 0; j < numCols; j++) {
                cix[j] = colixTargets[_colIndexes[j]];
            }

            // cache conscious append via horizontal scans
            for (int bi = 0; bi < n; bi += blksz) {
                for (int k = 0, off = 0; k < numVals; k++, off += numCols) {
                    int boff = _ptr[k];
                    int blen = len(k);
                    int bix = apos[k];
                    if (bix >= blen) {
                        continue;
                    }
                    int len = _data[boff + bix];
                    int pos = boff + bix + 1;
                    for (int i = pos; i < pos + len; i++) {
                        for (int j = 0, rix = bi + _data[i]; j < numCols; j++) {
                            if (_values[off + j] != 0) {
                                target.appendValue(rix, cix[j], _values[off + j]);
                            }
                        }
                    }
                    apos[k] += len + 1;
                }
            }
        }
        else
        {
            // call generic decompression with decoder
            super.decompressToBlock(target, colixTargets);
        }
    }

    /**
     * Writes a single column of this group into the dense block of the target
     * and sets the target's number of non-zeros.
     *
     * @param target output block; its dense block is overwritten for rows
     *               {@code [0, getNumRows())}
     * @param colpos position of the column within this group
     */
    @Override
    public void decompressToBlock(MatrixBlock target, int colpos)
    {
        final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
        final int numCols = getNumCols();
        final int numVals = getNumValues();
        final int n = getNumRows();
        double[] c = target.getDenseBlock();

        // cache blocking config and position array
        int[] apos = allocIVector(numVals, true);

        // cache conscious append via horizontal scans
        int nnz = 0;
        for (int bi = 0; bi < n; bi += blksz) {
            Arrays.fill(c, bi, Math.min(bi + blksz, n), 0);
            for (int k = 0, off = 0; k < numVals; k++, off += numCols) {
                int boff = _ptr[k];
                int blen = len(k);
                int bix = apos[k];
                if (bix >= blen) {
                    continue;
                }
                int len = _data[boff + bix];
                int pos = boff + bix + 1;
                final double val = _values[off + colpos];
                for (int i = pos; i < pos + len; i++) {
                    c[bi + _data[i]] = val;
                }
                // a value tuple may be zero in this particular column;
                // such cells must not be counted as non-zeros
                if (val != 0) {
                    nnz += len;
                }
                apos[k] += len + 1;
            }
        }
        target.setNonZeros(nnz);
    }

    /**
     * Counts the number of offsets stored for each distinct value.
     *
     * @return array with one count per distinct value
     */
    @Override
    public int[] getCounts() {
        final int numVals = getNumValues();
        int[] counts = new int[numVals];
        for (int k = 0; k < numVals; k++) {
            counts[k] = countOffsetsOLE(k);
        }
        return counts;
    }

    /**
     * Applies a scalar operation to this column group and returns the result
     * as a new column group.
     *
     * @param op scalar operator
     * @return new column group; on the fast path it shares the encoded offset
     *         lists with this group
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    @Override
    public ColGroup scalarOperation(ScalarOperator op)
        throws DMLRuntimeException
    {
        double val0 = op.executeScalar(0);

        // fast path: sparse-safe operations
        // Note that bitmaps don't change and are shallow-copied
        if (op.sparseSafe || val0 == 0) {
            return new ColGroupOLE(_colIndexes, _numRows, _zeros,
                applyScalarOp(op), _data, _ptr);
        }

        // slow path: sparse-unsafe operations (potentially create new bitmap)
        // note: for efficiency, we currently don't drop values that become 0
        boolean[] lind = computeZeroIndicatorVector();
        int[] loff = computeOffsets(lind);
        if (loff.length == 0) { // empty offset list: go back to fast path
            return new ColGroupOLE(_colIndexes, _numRows, true,
                applyScalarOp(op), _data, _ptr);
        }

        // append one additional value (result of op on 0) with its own offset list
        double[] rvalues = applyScalarOp(op, val0, getNumCols());
        char[] lbitmap = BitmapEncoder.genOffsetBitmap(loff, loff.length);
        char[] rbitmaps = Arrays.copyOf(_data, _data.length + lbitmap.length);
        System.arraycopy(lbitmap, 0, rbitmaps, _data.length, lbitmap.length);
        int[] rbitmapOffs = Arrays.copyOf(_ptr, _ptr.length + 1);
        rbitmapOffs[rbitmapOffs.length - 1] = rbitmaps.length;

        return new ColGroupOLE(_colIndexes, _numRows, loff.length < _numRows,
            rvalues, rbitmaps, rbitmapOffs);
    }

    /**
     * Multiplies this column group with a vector from the right and adds the
     * result for rows {@code [rl, ru)} to the dense block of {@code result}.
     *
     * @param vector right-hand-side vector
     * @param result output block (dense), updated in place
     * @param rl row lower bound (inclusive)
     * @param ru row upper bound (exclusive)
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    @Override
    public void rightMultByVector(MatrixBlock vector, MatrixBlock result, int rl, int ru)
        throws DMLRuntimeException
    {
        double[] b = ConverterUtils.getDenseVector(vector);
        double[] c = result.getDenseBlock();
        final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
        final int numCols = getNumCols();
        final int numVals = getNumValues();

        // prepare reduced rhs w/ relevant values
        double[] sb = new double[numCols];
        for (int j = 0; j < numCols; j++) {
            sb[j] = b[_colIndexes[j]];
        }

        if (LOW_LEVEL_OPT && numVals > 1 && _numRows > blksz)
        {
            // since single segment scans already exceed typical L2 cache sizes
            // and because there is some overhead associated with blocking, the
            // best configuration aligns with L3 cache size (x*vcores*64K*8B < L3)
            // x=4 leads to a good yet slightly conservative compromise for single-/
            // multi-threaded and typical number of cores and L3 cache sizes
            final int blksz2 = ColGroupOffset.WRITE_CACHE_BLKSZ;

            // step 1: prepare position and value arrays
            int[] apos = skipScan(numVals, rl);
            double[] aval = preaggValues(numVals, sb);

            // step 2: cache conscious matrix-vector via horizontal scans
            for (int bi = rl; bi < ru; bi += blksz2)
            {
                int bimax = Math.min(bi + blksz2, ru);

                // horizontal segment scan, incl pos maintenance
                for (int k = 0; k < numVals; k++) {
                    int boff = _ptr[k];
                    int blen = len(k);
                    double val = aval[k];
                    int bix = apos[k];

                    for (int ii = bi; ii < bimax && bix < blen; ii += blksz) {
                        // prepare length, start, and end pos
                        int len = _data[boff + bix];
                        int pos = boff + bix + 1;

                        // compute partial results
                        LinearAlgebraUtils.vectAdd(val, c, _data, pos, ii, len);
                        bix += len + 1;
                    }

                    apos[k] = bix;
                }
            }
        }
        else
        {
            // iterate over all values and their bitmaps
            for (int k = 0; k < numVals; k++)
            {
                // prepare value-to-add for entire value bitmap
                int boff = _ptr[k];
                int blen = len(k);
                double val = sumValues(k, sb);

                // iterate over bitmap blocks and add values
                if (val != 0) {
                    int bix = 0;
                    int off = 0;
                    int slen = -1;

                    // scan to beginning offset if necessary
                    if (rl > 0) {
                        for (; bix < blen && off < rl; bix += slen + 1, off += blksz) {
                            slen = _data[boff + bix];
                        }
                    }

                    // compute partial results
                    for (; bix < blen && off < ru; bix += slen + 1, off += blksz) {
                        slen = _data[boff + bix];
                        for (int blckIx = 1; blckIx <= slen; blckIx++) {
                            c[off + _data[boff + bix + blckIx]] += val;
                        }
                    }
                }
            }
        }
    }

    /**
     * Multiplies a row vector with this column group from the left and adds
     * the result to the dense block of {@code result} at this group's column
     * indexes.
     *
     * @param vector left-hand-side row vector
     * @param result output block (dense), updated in place
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    @Override
    public void leftMultByRowVector(MatrixBlock vector, MatrixBlock result)
        throws DMLRuntimeException
    {
        double[] a = ConverterUtils.getDenseVector(vector);
        double[] c = result.getDenseBlock();
        final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
        final int numCols = getNumCols();
        final int numVals = getNumValues();
        final int n = getNumRows();

        if (LOW_LEVEL_OPT && numVals > 1 && _numRows > blksz)
        {
            // cache blocking config (see matrix-vector mult for explanation)
            final int blksz2 = ColGroupOffset.READ_CACHE_BLKSZ;

            // step 1: prepare position and value arrays

            // current pos per OLs / output values
            int[] apos = allocIVector(numVals, true);
            double[] cvals = allocDVector(numVals, true);

            // step 2: cache conscious matrix-vector via horizontal scans
            for (int ai = 0; ai < n; ai += blksz2)
            {
                int aimax = Math.min(ai + blksz2, n);

                // horizontal segment scan, incl pos maintenance
                for (int k = 0; k < numVals; k++) {
                    int boff = _ptr[k];
                    int blen = len(k);
                    int bix = apos[k];
                    double vsum = 0;

                    for (int ii = ai; ii < aimax && bix < blen; ii += blksz) {
                        // prepare length, start, and end pos
                        int len = _data[boff + bix];
                        int pos = boff + bix + 1;

                        // iterate over bitmap blocks and compute partial results (a[i]*1)
                        vsum += LinearAlgebraUtils.vectSum(a, _data, ii, pos, len);
                        bix += len + 1;
                    }

                    apos[k] = bix;
                    cvals[k] += vsum;
                }
            }

            // step 3: scale partial results by values and write to global output
            for (int k = 0, valOff = 0; k < numVals; k++, valOff += numCols) {
                for (int j = 0; j < numCols; j++) {
                    c[_colIndexes[j]] += cvals[k] * _values[valOff + j];
                }
            }
        }
        else
        {
            // iterate over all values and their bitmaps
            for (int k = 0, valOff = 0; k < numVals; k++, valOff += numCols)
            {
                int boff = _ptr[k];
                int blen = len(k);

                // iterate over bitmap blocks and add partial results
                double vsum = 0;
                for (int bix = 0, off = 0; bix < blen; bix += _data[boff + bix] + 1, off += blksz) {
                    vsum += LinearAlgebraUtils.vectSum(a, _data, off, boff + bix + 1, _data[boff + bix]);
                }

                // scale partial results by values and write results
                for (int j = 0; j < numCols; j++) {
                    c[_colIndexes[j]] += vsum * _values[valOff + j];
                }
            }
        }
    }

    /**
     * Multiplies a DDC-encoded row vector with this column group from the
     * left. Only the first segment of each offset list is read.
     *
     * @param a left-hand-side vector as DDC column group
     * @param result output block (dense), updated in place
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    @Override
    public void leftMultByRowVector(ColGroupDDC a, MatrixBlock result)
        throws DMLRuntimeException
    {
        // note: this method is only applicable for numrows < blocksize
        double[] c = result.getDenseBlock();
        final int numCols = getNumCols();
        final int numVals = getNumValues();

        // iterate over all values and their bitmaps
        for (int k = 0, valOff = 0; k < numVals; k++, valOff += numCols)
        {
            int boff = _ptr[k];

            // iterate over bitmap blocks and add partial results
            double vsum = 0;
            for (int j = boff + 1; j < boff + 1 + _data[boff]; j++) {
                vsum += a.getData(_data[j], 0);
            }

            // scale partial results by values and write results
            for (int j = 0; j < numCols; j++) {
                c[_colIndexes[j]] += vsum * _values[valOff + j];
            }
        }
    }

    /**
     * Adds the sum over all cells of this group to the running sum held in
     * cells (0,0) (sum) and (0,1) (correction) of {@code result}.
     *
     * @param result output block holding sum and correction
     * @param kplus Kahan function used to accumulate value-times-count
     */
    @Override
    protected final void computeSum(MatrixBlock result, KahanFunction kplus)
    {
        KahanObject kbuff = new KahanObject(result.quickGetValue(0, 0), result.quickGetValue(0, 1));

        // iterate over all values and their bitmaps
        final int numVals = getNumValues();
        final int numCols = getNumCols();

        for (int k = 0; k < numVals; k++)
        {
            int valOff = k * numCols;

            // iterate over bitmap blocks and count partial lengths
            int count = countOffsetsOLE(k);

            // scale counts by all values
            for (int j = 0; j < numCols; j++) {
                kplus.execute3(kbuff, _values[valOff + j], count);
            }
        }

        result.quickSetValue(0, 0, kbuff._sum);
        result.quickSetValue(0, 1, kbuff._correction);
    }

    /**
     * Adds the row sums of rows {@code [rl, ru)} to the dense block of
     * {@code result}, which holds a (sum, correction) pair per row at
     * positions {@code 2*row} and {@code 2*row+1}.
     *
     * @param result output block (dense), updated in place
     * @param kplus Kahan function used to aggregate the values of a tuple
     * @param rl row lower bound (inclusive)
     * @param ru row upper bound (exclusive)
     */
    @Override
    protected final void computeRowSums(MatrixBlock result, KahanFunction kplus, int rl, int ru)
    {
        KahanObject kbuff = new KahanObject(0, 0);
        KahanPlus kplus2 = KahanPlus.getKahanPlusFnObject();

        final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
        final int numVals = getNumValues();
        double[] c = result.getDenseBlock();

        if (ALLOW_CACHE_CONSCIOUS_ROWSUMS
            && LOW_LEVEL_OPT && numVals > 1 && _numRows > blksz)
        {
            final int blksz2 = ColGroupOffset.WRITE_CACHE_BLKSZ / 2;

            // step 1: prepare position and value arrays
            int[] apos = skipScan(numVals, rl);
            double[] aval = sumAllValues(kplus, kbuff, false);

            // step 2: cache conscious row sums via horizontal scans
            for (int bi = rl; bi < ru; bi += blksz2)
            {
                int bimax = Math.min(bi + blksz2, ru);

                // horizontal segment scan, incl pos maintenance
                for (int k = 0; k < numVals; k++) {
                    int boff = _ptr[k];
                    int blen = len(k);
                    double val = aval[k];
                    int bix = apos[k];

                    for (int ii = bi; ii < bimax && bix < blen; ii += blksz) {
                        // prepare length, start, and end pos
                        int len = _data[boff + bix];
                        int pos = boff + bix + 1;

                        // compute partial results
                        for (int i = 0; i < len; i++) {
                            int rix = ii + _data[pos + i];
                            kbuff.set(c[2 * rix], c[2 * rix + 1]);
                            kplus2.execute2(kbuff, val);
                            c[2 * rix] = kbuff._sum;
                            c[2 * rix + 1] = kbuff._correction;
                        }
                        bix += len + 1;
                    }

                    apos[k] = bix;
                }
            }
        }
        else
        {
            // iterate over all values and their bitmaps
            for (int k = 0; k < numVals; k++)
            {
                // prepare value-to-add for entire value bitmap
                int boff = _ptr[k];
                int blen = len(k);
                double val = sumValues(k, kplus, kbuff);

                // iterate over bitmap blocks and add values
                if (val != 0) {
                    int slen;
                    int bix = skipScanVal(k, rl);
                    for (int off = ((rl + 1) / blksz) * blksz; bix < blen && off < ru; bix += slen + 1, off += blksz) {
                        slen = _data[boff + bix];
                        for (int i = 1; i <= slen; i++) {
                            int rix = off + _data[boff + bix + i];
                            kbuff.set(c[2 * rix], c[2 * rix + 1]);
                            kplus2.execute2(kbuff, val);
                            c[2 * rix] = kbuff._sum;
                            c[2 * rix + 1] = kbuff._correction;
                        }
                    }
                }
            }
        }
    }

    /**
     * Adds the column sums of this group to {@code result}, which holds the
     * sums in row 0 and the corrections in row 1.
     *
     * @param result output block, updated in place
     * @param kplus Kahan function used to accumulate value-times-count
     */
    @Override
    protected final void computeColSums(MatrixBlock result, KahanFunction kplus)
    {
        KahanObject kbuff = new KahanObject(0, 0);

        // iterate over all values and their bitmaps
        final int numVals = getNumValues();
        final int numCols = getNumCols();

        for (int k = 0; k < numVals; k++)
        {
            int valOff = k * numCols;

            // iterate over bitmap blocks and count partial lengths
            int count = countOffsetsOLE(k);

            // scale counts by all values
            for (int j = 0; j < numCols; j++) {
                kbuff.set(result.quickGetValue(0, _colIndexes[j]), result.quickGetValue(1, _colIndexes[j]));
                kplus.execute3(kbuff, _values[valOff + j], count);
                result.quickSetValue(0, _colIndexes[j], kbuff._sum);
                result.quickSetValue(1, _colIndexes[j], kbuff._correction);
            }
        }
    }

    /**
     * Combines, for every row in {@code [rl, ru)} covered by an offset list,
     * the current result cell with the aggregate of the row's value tuple
     * using the given builtin function.
     *
     * @param result output block (dense), updated in place
     * @param builtin builtin function applied as {@code execute2(current, val)}
     * @param rl row lower bound (inclusive)
     * @param ru row upper bound (exclusive)
     */
    @Override
    protected final void computeRowMxx(MatrixBlock result, Builtin builtin, int rl, int ru)
    {
        // NOTE: zeros handled once for all column groups outside
        final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
        final int numVals = getNumValues();
        double[] c = result.getDenseBlock();

        // iterate over all values and their bitmaps
        for (int k = 0; k < numVals; k++)
        {
            // prepare value-to-add for entire value bitmap
            int boff = _ptr[k];
            int blen = len(k);
            double val = mxxValues(k, builtin);

            // iterate over bitmap blocks and add values
            int slen;
            int bix = skipScanVal(k, rl);
            // the starting row offset is derived from rl (same as in
            // computeRowSums); bix is a position inside the encoded data and
            // must not be used as a segment index
            for (int off = ((rl + 1) / blksz) * blksz; bix < blen && off < ru; bix += slen + 1, off += blksz) {
                slen = _data[boff + bix];
                for (int i = 1; i <= slen; i++) {
                    int rix = off + _data[boff + bix + i];
                    c[rix] = builtin.execute2(c[rix], val);
                }
            }
        }
    }

    /**
     * Utility function of sparse-unsafe operations.
     *
     * @return zero indicator vector: entry {@code i} is {@code true} iff row
     *         {@code i} is not contained in any offset list
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    private boolean[] computeZeroIndicatorVector()
        throws DMLRuntimeException
    {
        boolean[] ret = new boolean[_numRows];
        final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
        final int numVals = getNumValues();

        // initialize everything with zero
        Arrays.fill(ret, true);

        // iterate over all values and their bitmaps
        for (int k = 0; k < numVals; k++)
        {
            int boff = _ptr[k];
            int blen = len(k);

            // iterate over bitmap blocks and clear the indicator of covered rows
            int off = 0;
            int slen;
            for (int bix = 0; bix < blen; bix += slen + 1, off += blksz) {
                slen = _data[boff + bix];
                for (int i = 1; i <= slen; i++) {
                    ret[off + _data[boff + bix + i]] = false;
                }
            }
        }

        return ret;
    }

    /**
     * Adds, for every row in {@code [rl, ru)} covered by an offset list, the
     * number of columns of this group to the row's counter.
     *
     * @param rnnz per-row counters, indexed relative to {@code rl}
     * @param rl row lower bound (inclusive)
     * @param ru row upper bound (exclusive)
     */
    @Override
    protected void countNonZerosPerRow(int[] rnnz, int rl, int ru)
    {
        final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
        final int blksz2 = ColGroupOffset.WRITE_CACHE_BLKSZ;
        final int numVals = getNumValues();
        final int numCols = getNumCols();

        // current pos per OLs / output values
        int[] apos = skipScan(numVals, rl);

        // cache conscious count via horizontal scans
        for (int bi = rl; bi < ru; bi += blksz2)
        {
            int bimax = Math.min(bi + blksz2, ru);

            // iterate over all values and their bitmaps
            for (int k = 0; k < numVals; k++)
            {
                int boff = _ptr[k];
                int blen = len(k);
                int bix = apos[k];

                // iterate over bitmap blocks and add values
                for (int off = bi, slen = 0; bix < blen && off < bimax; bix += slen + 1, off += blksz) {
                    slen = _data[boff + bix];
                    for (int blckIx = 1; blckIx <= slen; blckIx++) {
                        rnnz[off + _data[boff + bix + blckIx] - rl] += numCols;
                    }
                }

                apos[k] = bix;
            }
        }
    }

    /////////////////////////////////
    // internal helper functions

    /**
     * Counts the offsets stored for a single value by summing the length
     * fields of all segments of its offset list.
     *
     * @param k index of the distinct value
     * @return number of offsets of value {@code k}
     */
    private int countOffsetsOLE(int k) {
        final int boff = _ptr[k];
        final int blen = len(k);
        int count = 0;
        for (int bix = 0; bix < blen; bix += _data[boff + bix] + 1) {
            count += _data[boff + bix];
        }
        return count;
    }

    /**
     * Scans to given row_lower position by exploiting any existing
     * skip list and scanning segment length fields. Returns array
     * of positions for all values.
     *
     * @param numVals number of values
     * @param rl row lower position
     * @return array of positions for all values
     */
    private int[] skipScan(int numVals, int rl) {
        int[] ret = allocIVector(numVals, rl == 0);

        if (rl > 0) { // rl aligned with blksz
            for (int k = 0; k < numVals; k++) {
                ret[k] = skipScanVal(k, rl);
            }
        }

        return ret;
    }

    /**
     * Scans the offset list of a single value to the given row_lower position
     * by scanning segment length fields, starting from the skip list entry if
     * a skip list exists and {@code rl} is at or beyond its row position.
     *
     * @param k index of the distinct value
     * @param rl row lower position
     * @return position within the offset list of value {@code k}; 0 if
     *         {@code rl} is not positive
     */
    private int skipScanVal(int k, int rl) {
        if (rl <= 0) {
            return 0;
        }

        // rl aligned with blksz
        final int blksz = BitmapEncoder.BITMAP_BLOCK_SZ;
        int rskip = (getNumRows() / 2 / blksz) * blksz;

        // the skip list is only allocated by the bitmap constructor under
        // certain conditions; without it, scan from the beginning
        boolean useSkipList = (_skiplist != null && rl >= rskip);

        int boff = _ptr[k];
        int blen = len(k);
        int start = useSkipList ? rskip : 0;
        int bix = useSkipList ? _skiplist[k] : 0;
        for (int i = start; i < rl && bix < blen; i += blksz) {
            bix += _data[boff + bix] + 1;
        }
        return bix;
    }
}