/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */

package com.ibm.bi.dml.runtime.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.util.FastStringTokenizer;

/**
 * Matrix reader for the text cell format, where every data line holds
 * one cell as <code>row col value</code> with 1-based indexes separated
 * by a single blank. When constructed with the MatrixMarket input info,
 * single files are additionally expected to start with a MatrixMarket
 * header, which is validated against the expected dimensions.
 */
public class ReaderTextCell extends MatrixReader
{
	//true if raw files are to be parsed with a leading MatrixMarket header
	private boolean _isMMFile = false;
	
	/**
	 * @param info input info of the matrix to read; only its identity with
	 *             {@link InputInfo#MatrixMarketInputInfo} is evaluated
	 */
	public ReaderTextCell(InputInfo info)
	{
		_isMMFile = (info == InputInfo.MatrixMarketInputInfo);
	}
	
	/**
	 * Reads a text cell matrix from the given file or directory.
	 * Directories are read split-wise via the Hadoop text input format,
	 * single files are read as a raw stream (the only path that honors
	 * the MatrixMarket flag).
	 * 
	 * @param fname file or directory name
	 * @param rlen expected number of rows
	 * @param clen expected number of columns
	 * @param brlen number of rows per block (only passed through)
	 * @param bclen number of columns per block (only passed through)
	 * @param estnnz estimated number of non-zeros, forwarded to the 
	 *               output block allocation
	 * @return the matrix block holding all cells read
	 * @throws IOException if the input is invalid or cannot be parsed
	 * @throws DMLRuntimeException
	 */
	@Override
	public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz) 
		throws IOException, DMLRuntimeException 
	{
		//allocate output matrix block
		MatrixBlock ret = createOutputMatrixBlock(rlen, clen, estnnz, true, false);
		
		//prepare file access
		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
		FileSystem fs = FileSystem.get(job);
		Path path = new Path( fname );
		
		//check existence and non-empty file
		checkValidInputFile(fs, path); 
		
		//core read 
		if( fs.isDirectory(path) )
			readTextCellMatrixFromHDFS(path, job, ret, rlen, clen, brlen, bclen);
		else
			readRawTextCellMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen, _isMMFile);
		
		//finally check if change of sparse/dense block representation required
		if( !ret.isInSparseFormat() )
			ret.recomputeNonZeros();
		ret.examSparsity();
		
		return ret;
	}
	
	/**
	 * Reads a text cell matrix from an already opened input stream.
	 * The stream is closed by this call, on success as well as on failure.
	 * 
	 * @param is input stream providing the text cell data
	 * @param rlen expected number of rows
	 * @param clen expected number of columns
	 * @param brlen number of rows per block (only passed through)
	 * @param bclen number of columns per block (only passed through)
	 * @param estnnz estimated number of non-zeros, forwarded to the 
	 *               output block allocation
	 * @return the matrix block holding all cells read
	 * @throws IOException if the input is invalid or cannot be parsed
	 * @throws DMLRuntimeException
	 */
	public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long clen, int brlen, int bclen, long estnnz) 
		throws IOException, DMLRuntimeException 
	{
		//allocate output matrix block
		MatrixBlock ret = createOutputMatrixBlock(rlen, clen, estnnz, true, false);
		
		//core read 
		readRawTextCellMatrixFromInputStream(is, ret, rlen, clen, brlen, bclen, _isMMFile);
		
		//finally check if change of sparse/dense block representation required
		if( !ret.isInSparseFormat() )
			ret.recomputeNonZeros();
		ret.examSparsity();
		
		return ret;
	}
	
	/**
	 * Reads all input splits of the given path via the Hadoop text input
	 * format and writes every parsed cell into the destination block.
	 * 
	 * @param path input path, added to the job as input path
	 * @param job job configuration used to create splits and record readers
	 * @param dest destination block; its current sparse/dense format 
	 *             determines how cells are inserted
	 * @param rlen expected number of rows (used for error reporting)
	 * @param clen expected number of columns (used for error reporting)
	 * @param brlen not used by this method
	 * @param bclen not used by this method
	 * @throws IOException if reading or parsing fails, or if a cell lies 
	 *                     outside of [1:rlen,1:clen]
	 */
	private void readTextCellMatrixFromHDFS( Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen )
		throws IOException
	{
		boolean sparse = dest.isInSparseFormat();
		FileInputFormat.addInputPath(job, path);
		TextInputFormat informat = new TextInputFormat();
		informat.configure(job);
		InputSplit[] splits = informat.getSplits(job, 1);
		
		LongWritable key = new LongWritable();
		Text value = new Text();
		//last parsed cell position, kept outside the try for error reporting
		int row = -1;
		int col = -1;
		
		try
		{
			FastStringTokenizer st = new FastStringTokenizer(' ');
			
			for(InputSplit split: splits)
			{
				RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
				
				try
				{
					if( sparse ) //SPARSE<-value
					{
						while( reader.next(key, value) )
						{
							st.reset( value.toString() ); //reinit tokenizer
							row = st.nextInt() - 1;
							col = st.nextInt() - 1;
							double lvalue = st.nextDouble();
							dest.appendValue(row, col, lvalue);
						}
						
						dest.sortSparseRows();
					} 
					else //DENSE<-value
					{
						while( reader.next(key, value) )
						{
							st.reset( value.toString() ); //reinit tokenizer
							row = st.nextInt()-1;
							col = st.nextInt()-1;
							double lvalue = st.nextDouble();
							dest.setValueDenseUnsafe( row, col, lvalue );
						}
					}
				}
				finally
				{
					if( reader != null )
						reader.close();
				}
			}
		}
		catch(Exception ex)
		{
			//post-mortem error handling and bounds checking
			if( row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen )
			{
				//keep the root cause attached (consistent with the raw text cell reader)
				throw new IOException("Matrix cell ["+(row+1)+","+(col+1)+"] " +
									  "out of overall matrix range [1:"+rlen+",1:"+clen+"].", ex);
			}
			else
			{
				throw new IOException( "Unable to read matrix in text cell format.", ex );
			}
		}
	}
	
	/**
	 * Opens the given file and reads it as raw text cell data. The opened
	 * stream is handed over to, and closed by, the stream-based reader.
	 * 
	 * @param path file to read
	 * @param job not used by this method
	 * @param fs file system used to open the path
	 * @param dest destination block
	 * @param rlen expected number of rows
	 * @param clen expected number of columns
	 * @param brlen number of rows per block (only passed through)
	 * @param bclen number of columns per block (only passed through)
	 * @param matrixMarket true if the file starts with a MatrixMarket header
	 * @throws IOException if the file cannot be opened, read or parsed
	 */
	private void readRawTextCellMatrixFromHDFS( Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean matrixMarket )
		throws IOException
	{
		//create input stream for path
		InputStream inputStream = fs.open(path);
		
		//actual read
		readRawTextCellMatrixFromInputStream(inputStream, dest, rlen, clen, brlen, bclen, matrixMarket);
	}
	
	/**
	 * Reads raw text cell data line by line from the given stream into the
	 * destination block; optionally validates a leading MatrixMarket header
	 * first. The stream is always closed before this method returns or
	 * throws, including when the header is invalid.
	 * 
	 * @param is input stream providing the text cell data
	 * @param dest destination block; its current sparse/dense format 
	 *             determines how cells are inserted
	 * @param rlen expected number of rows
	 * @param clen expected number of columns
	 * @param brlen not used by this method
	 * @param bclen not used by this method
	 * @param matrixMarket true if the stream starts with a MatrixMarket header
	 * @throws IOException if the header is invalid, if reading or parsing 
	 *                     fails, or if a cell lies outside of [1:rlen,1:clen]
	 */
	private void readRawTextCellMatrixFromInputStream( InputStream is, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean matrixMarket )
		throws IOException
	{
		BufferedReader br = new BufferedReader(new InputStreamReader( is ));
		
		boolean sparse = dest.isInSparseFormat();
		String value = null;
		//last parsed cell position, kept outside the try for error reporting
		int row = -1;
		int col = -1;
		
		try
		{
			// Read the header lines, if reading from a matrixMarket file
			// (inside the outer try so the reader is closed on header errors,
			// but outside the inner try so header errors keep their message)
			if ( matrixMarket )
				checkMatrixMarketHeader(br, rlen, clen);
			
			try
			{
				FastStringTokenizer st = new FastStringTokenizer(' ');
				
				if( sparse ) //SPARSE<-value
				{
					while( (value=br.readLine())!=null )
					{
						st.reset( value ); //reinit tokenizer
						row = st.nextInt()-1;
						col = st.nextInt()-1;
						double lvalue = st.nextDouble();
						dest.appendValue(row, col, lvalue);
					}
					
					dest.sortSparseRows();
				} 
				else //DENSE<-value
				{
					while( (value=br.readLine())!=null )
					{
						st.reset( value ); //reinit tokenizer
						row = st.nextInt()-1;
						col = st.nextInt()-1;
						double lvalue = st.nextDouble();
						dest.setValueDenseUnsafe( row, col, lvalue );
					}
				}
			}
			catch(Exception ex)
			{
				//post-mortem error handling and bounds checking
				if( row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen ) 
				{
					throw new IOException("Matrix cell ["+(row+1)+","+(col+1)+"] " +
										  "out of overall matrix range [1:"+rlen+",1:"+clen+"].", ex);
				}
				else
				{
					throw new IOException( "Unable to read matrix in raw text cell format.", ex );
				}
			}
		}
		finally
		{
			IOUtilFunctions.closeSilently(br);
		}
	}
	
	/**
	 * Consumes the MatrixMarket header from the reader: the banner line
	 * (starting with "%%"), all following comment lines (starting with '%')
	 * and the size line, whose first two fields are compared against the
	 * expected dimensions. On return, the reader is positioned at the first
	 * line after the size line.
	 * 
	 * @param br reader positioned at the first line of the input
	 * @param rlen expected number of rows
	 * @param clen expected number of columns
	 * @throws IOException if the banner or size line is missing or 
	 *                     malformed, or if the dimensions do not match
	 */
	private static void checkMatrixMarketHeader( BufferedReader br, long rlen, long clen )
		throws IOException
	{
		String line = br.readLine(); // header line
		if ( line==null || !line.startsWith("%%") ) {
			throw new IOException("Error while reading file in MatrixMarket format. Expecting a header line, but encountered, \"" + line +"\".");
		}
		
		// skip until end-of-comments; blank lines are skipped as well instead
		// of failing on charAt(0) (the MatrixMarket format permits blank lines
		// after the first line)
		while( (line = br.readLine())!=null 
			&& (line.trim().isEmpty() || line.charAt(0) == '%') ) 
		{
			//do nothing just skip comments
		}
		
		// the first line after comments is the one w/ matrix dimensions
		if ( line == null ) {
			throw new IOException("Error while reading file in MatrixMarket format. Expecting a line with matrix dimensions, but reached the end of the input.");
		}
		
		// validate (rlen clen nnz)
		String[] fields = line.trim().split("\\s+");
		if ( fields.length < 2 ) {
			throw new IOException("Error while reading file in MatrixMarket format. Expecting a line with matrix dimensions, but encountered, \"" + line +"\".");
		}
		
		long mm_rlen = -1;
		long mm_clen = -1;
		try {
			mm_rlen = Long.parseLong(fields[0]);
			mm_clen = Long.parseLong(fields[1]);
		}
		catch(NumberFormatException ex) {
			throw new IOException("Error while reading file in MatrixMarket format. Expecting a line with matrix dimensions, but encountered, \"" + line +"\".", ex);
		}
		
		if ( rlen != mm_rlen || clen != mm_clen ) {
			throw new IOException("Unexpected matrix dimensions while reading file in MatrixMarket format. Expecting dimensions [" + rlen + " rows, " + clen + " cols] but encountered [" + mm_rlen + " rows, " + mm_clen + " cols].");
		}
	}
}