/**
* (C) Copyright IBM Corp. 2010, 2015
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.ibm.bi.dml.runtime.io;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobConf;
import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.hops.OptimizerUtils;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration;
/**
 * Multi-threaded reader for matrices in binary block format: one read task is
 * created per sequence file part and all tasks append into a shared, pre-allocated
 * output {@link MatrixBlock}. Non-zero counts are aggregated per task and set once
 * at the end, which avoids contended nnz maintenance during the parallel read.
 */
public class ReaderBinaryBlockParallel extends ReaderBinaryBlock
{
	//degree of parallelism for the read; static because it is consumed by the
	//static read method below. NOTE(review): mutating a static field from an
	//instance constructor races if multiple readers with different configured
	//parallelism are created concurrently — TODO confirm readers are created
	//sequentially, or make the field/method instance-scoped.
	private static int _numThreads = 1;
	
	public ReaderBinaryBlockParallel( boolean localFS )
	{
		super(localFS);
		_numThreads = OptimizerUtils.getParallelBinaryReadParallelism();
	}
	
	/**
	 * Reads an entire binary block matrix from HDFS (or the local file system,
	 * depending on how this reader was constructed) into a single matrix block.
	 * 
	 * @param fname file or directory name of the binary block matrix
	 * @param rlen number of rows of the overall matrix
	 * @param clen number of columns of the overall matrix
	 * @param brlen number of rows per block
	 * @param bclen number of columns per block
	 * @param estnnz estimated number of non-zeros (drives sparse/dense allocation)
	 * @return the read matrix, with nnz set and sparse/dense representation adjusted
	 * @throws IOException if the file is missing/empty or the read fails
	 * @throws DMLRuntimeException if block allocation fails
	 */
	@Override
	public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
		throws IOException, DMLRuntimeException
	{
		//allocate output matrix block (incl block allocation for parallel)
		MatrixBlock ret = createOutputMatrixBlock(rlen, clen, estnnz, true, true);
		
		//prepare file access
		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
		FileSystem fs = _localFS ? FileSystem.getLocal(job) : FileSystem.get(job);
		Path path = new Path( (_localFS ? "file:///" : "") + fname);
		
		//check existence and non-empty file
		checkValidInputFile(fs, path);
		
		//core read
		readBinaryBlockMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen);
		
		//finally check if change of sparse/dense block representation required
		//(per-task nnz aggregation already set the exact count unless disabled)
		if( !AGGREGATE_BLOCK_NNZ )
			ret.recomputeNonZeros();
		ret.examSparsity();
		
		return ret;
	}
	
	/**
	 * Reads all sequence file parts under the given path in parallel into dest,
	 * then aggregates nnz and sorts sparse rows if append order was not guaranteed.
	 * 
	 * @param path input path (file or directory of part files)
	 * @param job job configuration (serialization framework is registered on it)
	 * @param fs file system handle
	 * @param dest pre-allocated output block, shared across read tasks
	 * @param rlen number of rows of the overall matrix
	 * @param clen number of columns of the overall matrix
	 * @param brlen number of rows per block
	 * @param bclen number of columns per block
	 * @throws IOException if any read task fails
	 * @throws DMLRuntimeException on invalid runtime state
	 */
	private static void readBinaryBlockMatrixFromHDFS( Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen )
		throws IOException, DMLRuntimeException
	{
		//set up preferred custom serialization framework for binary block format
		if( MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION )
			MRJobConfiguration.addBinaryBlockSerializationFramework( job );
		
		ExecutorService pool = Executors.newFixedThreadPool(_numThreads);
		try
		{
			//create read tasks for all files
			ArrayList<ReadFileTask> tasks = new ArrayList<ReadFileTask>();
			for( Path lpath : getSequenceFilePaths(fs, path) ) {
				tasks.add(new ReadFileTask(lpath, job, fs, dest, rlen, clen, brlen, bclen));
			}
			
			//wait until all tasks have been executed
			List<Future<Long>> rt = pool.invokeAll(tasks);
			
			//check for exceptions and aggregate nnz
			long lnnz = 0;
			for( Future<Long> task : rt )
				lnnz += task.get();
			
			//post-processing
			dest.setNonZeros( lnnz );
			if( dest.isInSparseFormat() && clen>bclen ){
				//no need to sort if 1 column block since always sorted
				dest.sortSparseRows();
			}
		}
		catch (Exception e) {
			throw new IOException("Failed parallel read of binary block input.", e);
		}
		finally {
			//always release worker threads, also on failure of invokeAll/get
			//(previously only shut down on the happy path, leaking the pool)
			pool.shutdown();
		}
	}
	
	/**
	 * Reads a single sequence file part into the shared destination block and
	 * returns the number of non-zeros it contributed.
	 */
	private static class ReadFileTask implements Callable<Long>
	{
		private Path _path = null;
		private JobConf _job = null;
		private FileSystem _fs = null;
		private MatrixBlock _dest = null;
		private long _rlen = -1;
		private long _clen = -1;
		private int _brlen = -1;
		private int _bclen = -1;
		
		public ReadFileTask(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen)
		{
			_path = path;
			_fs = fs;
			_job = job;
			_dest = dest;
			_rlen = rlen;
			_clen = clen;
			_brlen = brlen;
			_bclen = bclen;
		}

		@Override
		@SuppressWarnings({ "deprecation", "resource" })
		public Long call() throws Exception 
		{
			boolean sparse = _dest.isInSparseFormat();
			MatrixIndexes key = new MatrixIndexes(); 
			MatrixBlock value = new MatrixBlock();
			long lnnz = 0; //aggregate block nnz
			
			//directly read from sequence files (individual partfiles)
			SequenceFile.Reader reader = new SequenceFile.Reader(_fs,_path,_job);
			
			try
			{
				//note: next(key, value) does not yet exploit the given serialization classes, record reader does but is generally slower.
				while( reader.next(key, value) )
				{	
					//empty block filter (skip entire block)
					if( value.isEmptyBlock(false) )
						continue;
					
					int row_offset = (int)(key.getRowIndex()-1)*_brlen;
					int col_offset = (int)(key.getColumnIndex()-1)*_bclen;
					int rows = value.getNumRows();
					int cols = value.getNumColumns();
					
					//bound check per block
					if( row_offset + rows < 0 || row_offset + rows > _rlen || col_offset + cols<0 || col_offset + cols > _clen )
					{
						throw new IOException("Matrix block ["+(row_offset+1)+":"+(row_offset+rows)+","+(col_offset+1)+":"+(col_offset+cols)+"] " +
								              "out of overall matrix range [1:"+_rlen+",1:"+_clen+"].");
					}
			
					//copy block into result; dense copies are disjoint per block,
					//so only the sparse multi-column-block case needs a lock
					if( sparse )
					{
						//note: append requires final sort
						if (cols < _clen ) {
							synchronized( _dest ){ //sparse requires lock, when matrix is wider than one block
								_dest.appendToSparse(value, row_offset, col_offset);
							}
						}
						else {
							//rows are block-row-disjoint across tasks — no lock needed
							_dest.appendToSparse(value, row_offset, col_offset);
						}
					} 
					else
					{
						_dest.copy( row_offset, row_offset+rows-1, 
								    col_offset, col_offset+cols-1,
								    value, false );
					}
					
					//aggregate nnz
					lnnz += value.getNonZeros();
				}
			}
			finally
			{
				IOUtilFunctions.closeSilently(reader);
			}
			
			return lnnz;
		}
	}
}