/**
* (C) Copyright IBM Corp. 2010, 2015
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.ibm.bi.dml.runtime.util;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import com.ibm.bi.dml.parser.Expression.ValueType;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.controlprogram.caching.MatrixObject;
import com.ibm.bi.dml.runtime.io.MatrixReader;
import com.ibm.bi.dml.runtime.io.MatrixReaderFactory;
import com.ibm.bi.dml.runtime.io.MatrixWriter;
import com.ibm.bi.dml.runtime.io.MatrixWriterFactory;
import com.ibm.bi.dml.runtime.io.ReadProperties;
import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics;
import com.ibm.bi.dml.runtime.matrix.data.CTableMap;
import com.ibm.bi.dml.runtime.matrix.data.FileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.IJV;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.matrix.data.SparseRowsIterator;
import com.ibm.bi.dml.udf.Matrix;
/**
 * This class provides methods to read and write matrix blocks from/to HDFS using different data formats.
* Those functionalities are used especially for CP read/write and exporting in-memory matrices to HDFS
* (before executing MR jobs).
*
*/
public class DataConverter
{
//////////////
// READING and WRITING of matrix blocks to/from HDFS
// (textcell, binarycell, binaryblock)
///////
/**
*
* @param mat
* @param dir
* @param outputinfo
* @param rlen
* @param clen
* @param brlen
* @param bclen
* @throws IOException
*/
public static void writeMatrixToHDFS(MatrixBlock mat, String dir, OutputInfo outputinfo, MatrixCharacteristics mc )
throws IOException
{
writeMatrixToHDFS(mat, dir, outputinfo, mc, -1, null);
}
/**
 * Writes the given matrix block to the given location using a writer created
 * for the requested output format.
 *
 * Dimensions, block sizes and the number of non-zeros are taken from the
 * given matrix characteristics.
 *
 * @param mat matrix block to write
 * @param dir target file or directory name
 * @param outputinfo output format descriptor used to create the writer
 * @param mc matrix characteristics (rows, cols, rows/cols per block, non-zeros)
 * @param replication replication value handed to the writer factory
 * @param formatProperties format-specific properties handed to the writer factory (may be null)
 * @throws IOException if writing fails; an IOException raised by the writer is
 *         propagated unchanged, any other exception is wrapped as its cause
 */
public static void writeMatrixToHDFS(MatrixBlock mat, String dir, OutputInfo outputinfo, MatrixCharacteristics mc, int replication, FileFormatProperties formatProperties)
    throws IOException
{
    try {
        MatrixWriter writer = MatrixWriterFactory.createMatrixWriter( outputinfo, replication, formatProperties );
        writer.writeMatrixToHDFS(mat, dir, mc.getRows(), mc.getCols(), mc.getRowsPerBlock(), mc.getColsPerBlock(), mc.getNonZeros());
    }
    catch(Exception e)
    {
        //already the declared exception type: rethrow as-is instead of
        //hiding the original message behind a second IOException
        if( e instanceof IOException )
            throw (IOException) e;

        throw new IOException(e);
    }
}
/**
 * Reads a matrix from the given location, optionally from the local file system.
 *
 * The expected sparsity is not set here, so the ReadProperties default applies
 * (the original author notes that the expected matrix is sparse, the default
 * SystemML use case).
 *
 * @param dir file or directory name to read from
 * @param inputinfo input format descriptor
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @param localFS value stored in ReadProperties.localFS
 * @return the matrix block that was read
 * @throws IOException if reading fails
 */
public static MatrixBlock readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen, boolean localFS)
    throws IOException
{
    final ReadProperties props = new ReadProperties();

    //source location and format
    props.localFS = localFS;
    props.inputInfo = inputinfo;
    props.path = dir;

    //matrix and block dimensions
    props.bclen = bclen;
    props.brlen = brlen;
    props.clen = clen;
    props.rlen = rlen;

    return readMatrixFromHDFS(props);
}
/**
 * Reads a matrix from the given location.
 *
 * Neither the expected sparsity nor the localFS flag is set here, so the
 * ReadProperties defaults apply (the original author notes that the expected
 * matrix is sparse, the default SystemML use case).
 *
 * @param dir file or directory name to read from
 * @param inputinfo input format descriptor
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @return the matrix block that was read
 * @throws IOException if reading fails
 */
public static MatrixBlock readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen)
    throws IOException
{
    final ReadProperties props = new ReadProperties();

    //source location and format
    props.inputInfo = inputinfo;
    props.path = dir;

    //matrix and block dimensions
    props.bclen = bclen;
    props.brlen = brlen;
    props.clen = clen;
    props.rlen = rlen;

    return readMatrixFromHDFS(props);
}
/**
 * Reads a matrix from the given location with an explicit expected sparsity.
 *
 * @param dir file or directory name to read from
 * @param inputinfo input format descriptor
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @param expectedSparsity expected sparsity, used to estimate the number of non-zeros
 * @return the matrix block that was read
 * @throws IOException if reading fails
 */
public static MatrixBlock readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen, double expectedSparsity)
    throws IOException
{
    final ReadProperties props = new ReadProperties();

    //source location and format
    props.inputInfo = inputinfo;
    props.path = dir;

    //matrix and block dimensions
    props.bclen = bclen;
    props.brlen = brlen;
    props.clen = clen;
    props.rlen = rlen;

    //sparsity hint
    props.expectedSparsity = expectedSparsity;

    return readMatrixFromHDFS(props);
}
/**
 * Reads a matrix from the given location with an explicit expected sparsity,
 * optionally from the local file system.
 *
 * @param dir file or directory name to read from
 * @param inputinfo input format descriptor
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @param expectedSparsity expected sparsity, used to estimate the number of non-zeros
 * @param localFS value stored in ReadProperties.localFS
 * @return the matrix block that was read
 * @throws IOException if reading fails
 */
public static MatrixBlock readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen,
    int brlen, int bclen, double expectedSparsity, boolean localFS)
    throws IOException
{
    final ReadProperties props = new ReadProperties();

    //source location and format
    props.localFS = localFS;
    props.inputInfo = inputinfo;
    props.path = dir;

    //matrix and block dimensions
    props.bclen = bclen;
    props.brlen = brlen;
    props.clen = clen;
    props.rlen = rlen;

    //sparsity hint
    props.expectedSparsity = expectedSparsity;

    return readMatrixFromHDFS(props);
}
/**
 * Reads a matrix from the given location with an explicit expected sparsity
 * and format-specific properties.
 *
 * @param dir file or directory name to read from
 * @param inputinfo input format descriptor
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @param expectedSparsity expected sparsity, used to estimate the number of non-zeros
 * @param formatProperties format-specific properties stored in ReadProperties.formatProperties
 * @return the matrix block that was read
 * @throws IOException if reading fails
 */
public static MatrixBlock readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen,
    int brlen, int bclen, double expectedSparsity, FileFormatProperties formatProperties)
    throws IOException
{
    final ReadProperties props = new ReadProperties();

    //source location and format
    props.formatProperties = formatProperties;
    props.inputInfo = inputinfo;
    props.path = dir;

    //matrix and block dimensions
    props.bclen = bclen;
    props.brlen = brlen;
    props.clen = clen;
    props.rlen = rlen;

    //sparsity hint
    props.expectedSparsity = expectedSparsity;

    return readMatrixFromHDFS(props);
}
/**
 * Core method for reading matrices in format textcell, matrixmarket, binarycell, or binaryblock
 * from HDFS into main memory. For expected dense matrices we directly copy value- or block-at-a-time
 * into the target matrix. In contrast, for sparse matrices, we append (column-value)-pairs and do a
 * final sort if required in order to prevent large reorg overheads and increased memory consumption
 * in case of unordered inputs.
 *
 * DENSE MxN input:
 *  * best/average/worst: O(M*N)
 * SPARSE MxN input
 *  * best (ordered, or binary block w/ clen<=bclen): O(M*N)
 *  * average (unordered): O(M*N*log(N))
 *  * worst (descending order per row): O(M * N^2)
 *
 * NOTE: providing an exact estimate of 'expected sparsity' can prevent a full copy of the result
 * matrix block (required for changing sparse->dense, or vice versa)
 *
 * @param prop read properties; path, rlen, clen, brlen, bclen and expectedSparsity are read here,
 *             and the whole object is handed to the reader factory
 * @return the matrix block produced by the reader
 * @throws IOException if reading fails; a DMLRuntimeException is wrapped as the cause
 */
public static MatrixBlock readMatrixFromHDFS(ReadProperties prop)
    throws IOException
{
    //estimated number of non-zeros, truncated to long
    final long estimatedNnz = (long)(prop.expectedSparsity*prop.rlen*prop.clen);

    //core matrix reading
    try {
        MatrixReader reader = MatrixReaderFactory.createMatrixReader(prop);
        return reader.readMatrixFromHDFS(prop.path, prop.rlen, prop.clen, prop.brlen, prop.bclen, estimatedNnz);
    }
    catch(DMLRuntimeException rex)
    {
        throw new IOException(rex);
    }
}
//////////////
// Utils for CREATING and COPYING matrix blocks
///////
/**
 * Copies the content of the given matrix block into a newly allocated
 * two-dimensional double array of size rows x cols.
 *
 * @param mb input matrix block (sparse or dense)
 * @return a rows x cols array; cells not present in the block are 0
 */
public static double[][] convertToDoubleMatrix( MatrixBlock mb )
{
    final int nrow = mb.getNumRows();
    final int ncol = mb.getNumColumns();
    final double[][] out = new double[nrow][ncol]; //0-initialized

    //nothing to copy if the block reports no non-zeros
    if( mb.getNonZeros() <= 0 )
        return out;

    if( !mb.isInSparseFormat() )
    {
        //dense: copy cell by cell
        for( int r=0; r<nrow; r++ )
        {
            final double[] row = out[r];
            for( int c=0; c<ncol; c++ )
                row[c] = mb.getValueDenseUnsafe(r, c);
        }
        return out;
    }

    //sparse: scatter the cells delivered by the iterator
    for( SparseRowsIterator it = mb.getSparseRowsIterator(); it.hasNext(); )
    {
        IJV cell = it.next();
        out[cell.i][cell.j] = cell.v;
    }
    return out;
}
/**
 * Converts the given matrix block into a boolean vector of length rows*cols
 * in row-major order (cell (i,j) maps to index i*cols+j). A cell is true
 * iff its value is different from 0.
 *
 * @param mb input matrix block (sparse or dense)
 * @return row-major boolean vector; cells not present in the block are false
 */
public static boolean [] convertToBooleanVector(MatrixBlock mb)
{
    int rows = mb.getNumRows();
    int cols = mb.getNumColumns();
    boolean[] ret = new boolean[rows*cols]; //false-initialized

    if( mb.getNonZeros() > 0 )
    {
        if( mb.isInSparseFormat() )
        {
            SparseRowsIterator iter = mb.getSparseRowsIterator();
            while( iter.hasNext() )
            {
                IJV cell = iter.next();
                //row-major offset uses the row length (cols), consistent with the dense path
                ret[cell.i*cols+cell.j] = (cell.v != 0.0);
            }
        }
        else
        {
            if( !mb.isEmptyBlock(false) )
            {
                for( int i=0; i<rows; i++ )
                    for( int j=0; j<cols; j++ )
                        ret[i*cols+j] = (mb.getValueDenseUnsafe(i, j) != 0.0);
            }
        }
    }

    return ret;
}
/**
 * Converts the given matrix block into an int vector of length rows*cols
 * in row-major order (cell (i,j) maps to index i*cols+j). Values are
 * narrowed with a plain (int) cast.
 *
 * @param mb input matrix block (sparse or dense)
 * @return row-major int vector; cells not present in the block are 0
 */
public static int[] convertToIntVector( MatrixBlock mb)
{
    int rows = mb.getNumRows();
    int cols = mb.getNumColumns();
    int[] ret = new int[rows*cols]; //0-initialized

    if( mb.getNonZeros() > 0 )
    {
        if( mb.isInSparseFormat() )
        {
            SparseRowsIterator iter = mb.getSparseRowsIterator();
            while( iter.hasNext() )
            {
                IJV cell = iter.next();
                //row-major offset uses the row length (cols), consistent with the dense path
                ret[cell.i*cols+cell.j] = (int)cell.v;
            }
        }
        else
        {
            //copy cell by cell if the block is not empty
            if( !mb.isEmptyBlock(false) )
                for( int i=0; i<rows; i++ )
                    for( int j=0; j<cols; j++ )
                        ret[i*cols+j] = (int)(mb.getValueDenseUnsafe(i, j));
        }
    }

    return ret;
}
/**
 * Converts the given matrix block into a double vector of length rows*cols
 * in row-major order (cell (i,j) maps to index i*cols+j).
 *
 * @param mb input matrix block (sparse or dense)
 * @return row-major double vector; cells not present in the block are 0
 */
public static double[] convertToDoubleVector( MatrixBlock mb )
{
    int rows = mb.getNumRows();
    int cols = mb.getNumColumns();
    double[] ret = new double[rows*cols]; //0-initialized

    if( mb.getNonZeros() > 0 )
    {
        if( mb.isInSparseFormat() )
        {
            SparseRowsIterator iter = mb.getSparseRowsIterator();
            while( iter.hasNext() )
            {
                IJV cell = iter.next();
                //row-major offset uses the row length (cols), so that the sparse
                //result has the same layout as the dense array copy below
                ret[cell.i*cols+cell.j] = cell.v;
            }
        }
        else
        {
            //memcopy row major representation if at least 1 non-zero
            if( !mb.isEmptyBlock(false) )
                System.arraycopy(mb.getDenseArray(), 0, ret, 0, rows*cols);
        }
    }

    return ret;
}
/**
 * Collects the values of the given matrix block into a list with rows*cols entries.
 *
 * NOTE: the order of values depends on the block format. For dense blocks the
 * values are added in row-major order. For sparse blocks the values delivered
 * by the sparse iterator are added first, followed by (rows*cols - nnz) zeros,
 * so list positions do not correspond to cell positions.
 *
 * @param mb input matrix block (sparse or dense)
 * @return list of cell values
 */
public static List<Double> convertToDoubleList( MatrixBlock mb )
{
    final int nrow = mb.getNumRows();
    final int ncol = mb.getNumColumns();
    final long nonZeros = mb.getNonZeros();
    final List<Double> values = new ArrayList<Double>();

    if( !mb.isInSparseFormat() )
    {
        //dense: all cells in row-major order
        for( int r=0; r<nrow; r++ )
            for( int c=0; c<ncol; c++ )
                values.add( mb.getValueDenseUnsafe(r, c) );
        return values;
    }

    //sparse: stored cells first ...
    for( SparseRowsIterator it = mb.getSparseRowsIterator(); it.hasNext(); )
        values.add( it.next().v );

    //... then pad with zeros for the remaining cells
    final long padding = (long)nrow*ncol - nonZeros;
    for( long k=0; k<padding; k++ )
        values.add( 0d );

    return values;
}
/**
 * Creates a dense matrix block, copies the given double matrix into it, and
 * finally lets the block re-examine its sparsity (which may change the
 * internal representation).
 *
 * The number of columns is taken from the first row; an empty array yields a
 * 0x0 block.
 *
 * @param data input matrix as array of rows
 * @return matrix block holding a copy of the data
 * @throws DMLRuntimeException if the block cannot be initialized from the data
 *         (previously such failures were silently ignored) or if the sparsity
 *         examination fails
 */
public static MatrixBlock convertToMatrixBlock( double[][] data )
    throws DMLRuntimeException
{
    int rows = data.length;
    int cols = (rows > 0)? data[0].length : 0;
    MatrixBlock mb = new MatrixBlock(rows, cols, false);
    try
    {
        //copy data to mb (can be used because we create a dense matrix)
        mb.init( data, rows, cols );
    }
    catch (Exception e)
    {
        //do not hide init failures: returning a partially filled block would be silently wrong
        if( e instanceof DMLRuntimeException )
            throw (DMLRuntimeException) e;
        throw new DMLRuntimeException("Failed to initialize " + rows + "x" + cols + " matrix block from double matrix.", e);
    }

    //check and convert internal representation
    mb.examSparsity();

    return mb;
}
/**
 * Creates a dense matrix block, copies the given double vector into it, and
 * finally lets the block re-examine its sparsity (which may change the
 * internal representation).
 *
 * @param data input vector
 * @param columnVector if true the result is a data.length x 1 block,
 *        otherwise a 1 x data.length block
 * @return matrix block holding a copy of the data
 * @throws DMLRuntimeException if the block cannot be initialized from the data
 *         (previously such failures were silently ignored) or if the sparsity
 *         examination fails
 */
public static MatrixBlock convertToMatrixBlock( double[] data, boolean columnVector )
    throws DMLRuntimeException
{
    int rows = columnVector ? data.length : 1;
    int cols = columnVector ? 1 : data.length;
    MatrixBlock mb = new MatrixBlock(rows, cols, false);

    try
    {
        //copy data to mb (can be used because we create a dense matrix)
        mb.init( data, rows, cols );
    }
    catch (Exception e)
    {
        //do not hide init failures: returning a partially filled block would be silently wrong
        if( e instanceof DMLRuntimeException )
            throw (DMLRuntimeException) e;
        throw new DMLRuntimeException("Failed to initialize " + rows + "x" + cols + " matrix block from double vector.", e);
    }

    //check and convert internal representation
    mb.examSparsity();

    return mb;
}
/**
 * Converts a map of cell indexes to values into a matrix block whose
 * dimensions are the largest row and column index found among the keys
 * (0x0 for an empty map).
 *
 * @param map cell values keyed by matrix indexes
 * @return matrix block created by the (map, rlen, clen) variant
 */
public static MatrixBlock convertToMatrixBlock( HashMap<MatrixIndexes,Double> map )
{
    // derive the dimensions from the largest indexes in the map
    long maxRow = 0;
    long maxCol = 0;
    for( MatrixIndexes ix : map.keySet() )
    {
        final long r = ix.getRowIndex();
        final long c = ix.getColumnIndex();
        if( r > maxRow )
            maxRow = r;
        if( c > maxCol )
            maxCol = c;
    }

    // convert to matrix block
    final int rlen = (int)maxRow;
    final int clen = (int)maxCol;
    return convertToMatrixBlock(map, rlen, clen);
}
/**
 * Converts a map of cell indexes to values into a matrix block of the given
 * dimensions. The map indexes are treated as 1-based (they are shifted by -1
 * when written into the block).
 *
 * NOTE: this method also ensures the specified matrix dimensions, i.e., cells
 * with a row index larger than rlen or a column index larger than clen are
 * skipped, as are cells with value 0.
 *
 * The target format (sparse/dense) is decided by
 * MatrixBlock.evalSparseFormatInMemory using the map size as number of non-zeros.
 *
 * @param map cell values keyed by matrix indexes
 * @param rlen number of rows of the result
 * @param clen number of columns of the result
 * @return matrix block of size rlen x clen
 */
public static MatrixBlock convertToMatrixBlock( HashMap<MatrixIndexes,Double> map, int rlen, int clen )
{
    final int nnz = map.size();
    final boolean sparse = MatrixBlock.evalSparseFormatInMemory(rlen, clen, nnz);
    final MatrixBlock out = new MatrixBlock(rlen, clen, sparse, nnz);

    // copy map values into new block
    for( Entry<MatrixIndexes,Double> entry : map.entrySet() )
    {
        final MatrixIndexes ix = entry.getKey();
        final double v = entry.getValue();
        final int row = (int)ix.getRowIndex();
        final int col = (int)ix.getColumnIndex();

        //skip zeros and cells outside the requested dimensions
        if( v == 0 || row > rlen || col > clen )
            continue;

        if( sparse ) //SPARSE <- cell: append (prevent shifting)
            out.appendValue( row-1, col-1, v );
        else //DENSE <- cell: direct insert
            out.quickSetValue( row-1, col-1, v );
    }

    //appended sparse rows are unordered, sort them once at the end
    if( sparse )
        out.sortSparseRows();

    return out;
}
/**
 * Converts a ctable map into a matrix block whose dimensions are the
 * maximum row and maximum column reported by the map.
 *
 * @param map ctable map to convert
 * @return matrix block created by the (map, rlen, clen) variant
 */
public static MatrixBlock convertToMatrixBlock( CTableMap map )
{
    // dimensions come from the map itself
    return convertToMatrixBlock(map, (int)map.getMaxRow(), (int)map.getMaxColumn());
}
/**
 * Converts a ctable map into a matrix block of the given dimensions by
 * delegating to CTableMap.toMatrixBlock.
 *
 * NOTE: this method also ensures the specified matrix dimensions
 *
 * @param map ctable map to convert
 * @param rlen number of rows of the result
 * @param clen number of columns of the result
 * @return matrix block returned by the map
 */
public static MatrixBlock convertToMatrixBlock( CTableMap map, int rlen, int clen )
{
    final MatrixBlock converted = map.toMatrixBlock(rlen, clen);
    return converted;
}
/**
 * Splits the given matrix block into one block per column (colwise) or one
 * block per row.
 *
 * Column partitions are allocated as rows x 1 blocks in dense format and
 * filled by appending the cells of the input. Row partitions are allocated
 * as 1 x cols blocks in the format of the input, with an estimated number
 * of non-zeros of cols * (nnz / (rows*cols)), and filled via row slicing.
 *
 * @param mb input matrix block
 * @param colwise true for column partitions, false for row partitions
 * @return array of cols (colwise) or rows partitions
 * @throws DMLRuntimeException if appending or slicing fails
 */
public static MatrixBlock[] convertToMatrixBlockPartitions( MatrixBlock mb, boolean colwise )
    throws DMLRuntimeException
{
    MatrixBlock[] ret = null;
    int rows = mb.getNumRows();
    int cols = mb.getNumColumns();
    long nnz = mb.getNonZeros();
    boolean sparse = mb.isInSparseFormat();

    if( colwise ) //COL PARTITIONS
    {
        //allocate output partitions
        ret = new MatrixBlock[ cols ];
        for( int j=0; j<cols; j++ )
            ret[j] = new MatrixBlock(rows, 1, false);

        //cache-friendly sequential read/append
        if( !mb.isEmptyBlock(false) ) {
            if( sparse ){ //SPARSE
                SparseRowsIterator iter = mb.getSparseRowsIterator();
                while( iter.hasNext() ) {
                    IJV cell = iter.next();
                    ret[cell.j].appendValue(cell.i, 0, cell.v);
                }
            }
            else { //DENSE
                for( int i=0; i<rows; i++ )
                    for( int j=0; j<cols; j++ )
                        ret[j].appendValue(i, 0, mb.getValueDenseUnsafe(i, j));
            }
        }
    }
    else //ROW PARTITIONS
    {
        //number of cells computed in long arithmetic: rows*cols as int would
        //overflow for large blocks and yield a wrong (even negative) sparsity
        double sparsity = ((double)nnz)/((long)rows*cols);

        //allocate output partitions
        ret = new MatrixBlock[ rows ];
        for( int i=0; i<rows; i++ )
            ret[i] = new MatrixBlock(1, cols, sparse, (long)(cols*sparsity));

        //cache-friendly sparse/dense row slicing
        if( !mb.isEmptyBlock(false) ) {
            for( int i=0; i<rows; i++ )
                mb.sliceOperations(i, i, 0, cols-1, ret[i]);
        }
    }

    return ret;
}
/**
 * Helper method that converts a SystemML matrix object into an Array2DRowRealMatrix,
 * which is useful in invoking Apache CommonsMath.
 *
 * The matrix object is wrapped into a UDF Matrix (typed Double if the object's
 * value type is DOUBLE, Integer otherwise), obtained as a double[][], and handed
 * to Array2DRowRealMatrix with copyArray=false.
 *
 * @param mo matrix object to convert
 * @return Array2DRowRealMatrix built on the obtained double array
 * @throws DMLRuntimeException if the matrix data cannot be obtained
 */
public static Array2DRowRealMatrix convertToArray2DRowRealMatrix(MatrixObject mo)
    throws DMLRuntimeException
{
    //map the SystemML value type onto the UDF matrix value type
    final Matrix.ValueType udfType;
    if( mo.getValueType() == ValueType.DOUBLE )
        udfType = Matrix.ValueType.Double;
    else
        udfType = Matrix.ValueType.Integer;

    final Matrix wrapper = new Matrix(mo.getFileName(), mo.getNumRows(), mo.getNumColumns(), udfType);
    wrapper.setMatrixObject(mo);

    return new Array2DRowRealMatrix(wrapper.getMatrixAsDoubleArray(), false);
}
}