/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ibm.bi.dml.runtime.io;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.JobConf;

import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.mapred.IndexedMatrixValue;
import com.ibm.bi.dml.runtime.matrix.mapred.MRJobConfiguration;

public class ReaderBinaryBlock extends MatrixReader
{
    protected boolean _localFS = false;

    public ReaderBinaryBlock( boolean localFS ) {
        _localFS = localFS;
    }

    public void setLocalFS(boolean flag) {
        _localFS = flag;
    }

    @Override
    public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
        throws IOException, DMLRuntimeException
    {
        //allocate output matrix block
        MatrixBlock ret = createOutputMatrixBlock(rlen, clen, estnnz, false, false);

        //prepare file access
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = _localFS ? FileSystem.getLocal(job) : FileSystem.get(job);
        Path path = new Path( (_localFS ? "file:///" : "") + fname );

        //check existence and non-empty file
        checkValidInputFile(fs, path);

        //core read
        readBinaryBlockMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen);

        //finally check if change of sparse/dense block representation required
        if( !AGGREGATE_BLOCK_NNZ )
            ret.recomputeNonZeros();
        ret.examSparsity();

        return ret;
    }

    /**
     * Reads an entire binary-block matrix as a collection of individual blocks,
     * keyed by their 1-based block indexes, without assembling them into a
     * single output matrix block.
     *
     * @param fname file name (single sequence file or directory of part files)
     * @param rlen number of rows of the overall matrix
     * @param clen number of columns of the overall matrix
     * @param brlen number of rows per block
     * @param bclen number of columns per block
     * @return list of matrix blocks together with their block indexes
     * @throws IOException
     * @throws DMLRuntimeException
     */
    public ArrayList<IndexedMatrixValue> readIndexedMatrixBlocksFromHDFS(String fname, long rlen, long clen, int brlen, int bclen)
        throws IOException, DMLRuntimeException
    {
        //allocate output matrix block collection
        ArrayList<IndexedMatrixValue> ret = new ArrayList<IndexedMatrixValue>();

        //prepare file access
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = _localFS ? FileSystem.getLocal(job) : FileSystem.get(job);
        Path path = new Path( (_localFS ? "file:///" : "") + fname );

        //check existence and non-empty file
        checkValidInputFile(fs, path);

        //core read
        readBinaryBlockMatrixBlocksFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen);

        return ret;
    }
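    /*
     * Usage sketch (illustrative only; the path, dimensions, and block sizes
     * below are hypothetical): read a 10,000 x 10,000 matrix stored in
     * binary-block format with 1,000 x 1,000 blocks from HDFS. The value
     * estnnz=-1 is assumed here to denote an unknown number of non-zeros.
     *
     *   ReaderBinaryBlock reader = new ReaderBinaryBlock(false); //false: HDFS, true: local FS
     *   MatrixBlock mb = reader.readMatrixFromHDFS("/tmp/X", 10000, 10000, 1000, 1000, -1);
     */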
    /**
     * Note: For efficiency, we directly use SequenceFile.Reader instead of SequenceFileInputFormat-
     * InputSplits-RecordReader (SequenceFileRecordReader). First, this has no drawbacks since the
     * SequenceFileRecordReader internally uses SequenceFile.Reader as well. Second, it is
     * advantageous if the actual sequence files are larger than the file splits created by
     * informat.getSplits (which is usually aligned to the HDFS block size), because the record
     * reader would then incur overhead for finding the actual split boundaries between our
     * 1k-1k blocks. This case happens if the read matrix was created by CP or when jobs directly
     * write to large output files (e.g., parfor matrix partitioning).
     *
     * @param path path to the sequence file or directory of part files
     * @param job job configuration
     * @param fs file system (HDFS or local)
     * @param dest pre-allocated output matrix block
     * @param rlen number of rows of the overall matrix
     * @param clen number of columns of the overall matrix
     * @param brlen number of rows per block
     * @param bclen number of columns per block
     * @throws IOException
     * @throws DMLRuntimeException
     */
    @SuppressWarnings("deprecation")
    private static void readBinaryBlockMatrixFromHDFS( Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen )
        throws IOException, DMLRuntimeException
    {
        boolean sparse = dest.isInSparseFormat();
        MatrixIndexes key = new MatrixIndexes();
        MatrixBlock value = new MatrixBlock();
        long lnnz = 0; //aggregate block nnz

        //set up preferred custom serialization framework for binary block format
        if( MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION )
            MRJobConfiguration.addBinaryBlockSerializationFramework( job );

        for( Path lpath : getSequenceFilePaths(fs, path) ) //1..N files
        {
            //directly read from sequence files (individual part files)
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

            try
            {
                //note: next(key, value) does not yet exploit the given serialization classes;
                //the record reader does, but is generally slower.
                while( reader.next(key, value) )
                {
                    //empty block filter (skip entire block)
                    if( value.isEmptyBlock(false) )
                        continue;

                    int row_offset = (int)(key.getRowIndex()-1)*brlen;
                    int col_offset = (int)(key.getColumnIndex()-1)*bclen;

                    int rows = value.getNumRows();
                    int cols = value.getNumColumns();

                    //bound check per block
                    if( row_offset + rows < 0 || row_offset + rows > rlen
                        || col_offset + cols < 0 || col_offset + cols > clen )
                    {
                        throw new IOException("Matrix block ["+(row_offset+1)+":"+(row_offset+rows)+","
                            +(col_offset+1)+":"+(col_offset+cols)+"] "
                            + "out of overall matrix range [1:"+rlen+",1:"+clen+"].");
                    }

                    //copy block to result
                    if( sparse ) {
                        //note: append requires final sort (but prevents repeated shifting)
                        dest.appendToSparse(value, row_offset, col_offset);
                    }
                    else {
                        dest.copy( row_offset, row_offset+rows-1,
                                   col_offset, col_offset+cols-1, value, false );
                    }

                    //maintain nnz as aggregate of block nnz
                    lnnz += value.getNonZeros();
                }
            }
            finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }

        //post-processing
        dest.setNonZeros( lnnz );
        if( sparse && clen > bclen ) {
            //no need to sort if 1 column block since always sorted
            dest.sortSparseRows();
        }
    }
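    /*
     * Worked example of the offset computation above (numbers are illustrative):
     * block keys are 1-based, so with brlen=bclen=1000 the block with key
     * (rowIndex=2, columnIndex=3) yields row_offset=(2-1)*1000=1000 and
     * col_offset=(3-1)*1000=2000, i.e., it covers cells [1001:2000, 2001:3000]
     * of the overall matrix in 1-based coordinates.
     */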
    /**
     * @param path path to the sequence file or directory of part files
     * @param job job configuration
     * @param fs file system (HDFS or local)
     * @param dest collection to which the indexed blocks are appended
     * @param rlen number of rows of the overall matrix
     * @param clen number of columns of the overall matrix
     * @param brlen number of rows per block
     * @param bclen number of columns per block
     * @throws IOException
     */
    @SuppressWarnings("deprecation")
    private void readBinaryBlockMatrixBlocksFromHDFS( Path path, JobConf job, FileSystem fs, Collection<IndexedMatrixValue> dest, long rlen, long clen, int brlen, int bclen )
        throws IOException
    {
        MatrixIndexes key = new MatrixIndexes();
        MatrixBlock value = new MatrixBlock();

        //set up preferred custom serialization framework for binary block format
        if( MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION )
            MRJobConfiguration.addBinaryBlockSerializationFramework( job );

        for( Path lpath : getSequenceFilePaths(fs, path) ) //1..N files
        {
            //directly read from sequence files (individual part files)
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

            try
            {
                while( reader.next(key, value) )
                {
                    int row_offset = (int)(key.getRowIndex()-1)*brlen;
                    int col_offset = (int)(key.getColumnIndex()-1)*bclen;

                    int rows = value.getNumRows();
                    int cols = value.getNumColumns();

                    //bound check per block
                    if( row_offset + rows < 0 || row_offset + rows > rlen
                        || col_offset + cols < 0 || col_offset + cols > clen )
                    {
                        throw new IOException("Matrix block ["+(row_offset+1)+":"+(row_offset+rows)+","
                            +(col_offset+1)+":"+(col_offset+cols)+"] "
                            + "out of overall matrix range [1:"+rlen+",1:"+clen+"].");
                    }

                    //copy block to result (deep copy of key and value, since both
                    //objects are reused by the reader across next() calls)
                    dest.add(new IndexedMatrixValue(new MatrixIndexes(key), new MatrixBlock(value)));
                }
            }
            finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
    }
}
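/*
 * Usage sketch for the block-wise reader (illustrative only; the path,
 * dimensions, and block sizes are hypothetical): obtain all blocks of a
 * matrix together with their 1-based block indexes, e.g., for block-level
 * processing without materializing the full matrix as one MatrixBlock.
 *
 *   ReaderBinaryBlock reader = new ReaderBinaryBlock(false);
 *   ArrayList<IndexedMatrixValue> blocks =
 *       reader.readIndexedMatrixBlocksFromHDFS("/tmp/X", 10000, 10000, 1000, 1000);
 *   for( IndexedMatrixValue imv : blocks ) {
 *       MatrixIndexes ix = imv.getIndexes();           //block indexes (1-based)
 *       MatrixBlock mb = (MatrixBlock) imv.getValue(); //block data
 *   }
 */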