/**
* (C) Copyright IBM Corp. 2010, 2015
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.ibm.bi.dml.runtime.io;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.util.FastStringTokenizer;
/**
 * Matrix reader for the text cell format, where every line of the input
 * holds one cell as {@code row col value}. Row and column indexes in the
 * file are 1-based and are shifted to 0-based positions while reading.
 * Lines are tokenized on the space character.
 *
 * If the reader is created with {@code InputInfo.MatrixMarketInputInfo},
 * a single (non-directory) input is additionally expected to start with a
 * MatrixMarket header, comment lines, and a size line that must match the
 * expected dimensions.
 */
public class ReaderTextCell extends MatrixReader
{
    /** True if raw inputs carry a MatrixMarket header that must be validated. */
    private final boolean _isMMFile;

    /**
     * Creates a text cell reader.
     *
     * @param info input format descriptor; MatrixMarket handling is enabled
     *             only for {@code InputInfo.MatrixMarketInputInfo}
     */
    public ReaderTextCell(InputInfo info)
    {
        _isMMFile = (info == InputInfo.MatrixMarketInputInfo);
    }

    /**
     * Reads a text cell matrix from the given HDFS file or directory.
     *
     * @param fname  path of the input file or directory
     * @param rlen   expected number of rows
     * @param clen   expected number of columns
     * @param brlen  rows per block (not used by this reader)
     * @param bclen  columns per block (not used by this reader)
     * @param estnnz estimated number of non-zeros, used for output allocation
     * @return the matrix block populated with all cells of the input
     * @throws IOException if the input is invalid or cannot be read
     * @throws DMLRuntimeException declared by the reader interface
     */
    @Override
    public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
        throws IOException, DMLRuntimeException
    {
        //allocate output matrix block
        MatrixBlock ret = createOutputMatrixBlock(rlen, clen, estnnz, true, false);

        //prepare file access (work on a private copy of the cached job conf)
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path( fname );

        //check existence and non-empty file
        checkValidInputFile(fs, path);

        //core read: directories go through the input format (splits),
        //single files are read as one raw stream
        if( fs.isDirectory(path) ) {
            readTextCellMatrixFromHDFS(path, job, ret, rlen, clen, brlen, bclen);
        }
        else {
            readRawTextCellMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen, _isMMFile);
        }

        //dense blocks were filled via unsafe sets, so recompute their nnz
        if( !ret.isInSparseFormat() ) {
            ret.recomputeNonZeros();
        }

        //finally check if change of sparse/dense block representation required
        ret.examSparsity();

        return ret;
    }

    /**
     * Reads a text cell matrix from the given input stream. The stream is
     * closed by this call, both on success and on failure.
     *
     * @param is     stream providing the text cell data
     * @param rlen   expected number of rows
     * @param clen   expected number of columns
     * @param brlen  rows per block (not used by this reader)
     * @param bclen  columns per block (not used by this reader)
     * @param estnnz estimated number of non-zeros, used for output allocation
     * @return the matrix block populated with all cells of the input
     * @throws IOException if the input is invalid or cannot be read
     * @throws DMLRuntimeException declared for consistency with readMatrixFromHDFS
     */
    public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long clen, int brlen, int bclen, long estnnz)
        throws IOException, DMLRuntimeException
    {
        //allocate output matrix block
        MatrixBlock ret = createOutputMatrixBlock(rlen, clen, estnnz, true, false);

        //core read
        readRawTextCellMatrixFromInputStream(is, ret, rlen, clen, brlen, bclen, _isMMFile);

        //dense blocks were filled via unsafe sets, so recompute their nnz
        if( !ret.isInSparseFormat() ) {
            ret.recomputeNonZeros();
        }

        //finally check if change of sparse/dense block representation required
        ret.examSparsity();

        return ret;
    }

    /**
     * Reads all splits of the given path via {@code TextInputFormat} and
     * writes every cell into the destination block.
     *
     * @param path  input path registered with the job
     * @param job   job configuration (modified: the input path is added)
     * @param dest  destination block to fill
     * @param rlen  expected number of rows, used for error reporting
     * @param clen  expected number of columns, used for error reporting
     * @param brlen rows per block (not used)
     * @param bclen columns per block (not used)
     * @throws IOException if reading or parsing fails; the original failure
     *                     is always attached as cause
     */
    private void readTextCellMatrixFromHDFS( Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int brlen, int bclen )
        throws IOException
    {
        boolean sparse = dest.isInSparseFormat();
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);

        LongWritable key = new LongWritable();
        Text value = new Text();

        //last parsed cell position, kept outside the try for error reporting
        int row = -1;
        int col = -1;

        try
        {
            FastStringTokenizer st = new FastStringTokenizer(' ');

            for( InputSplit split : splits )
            {
                RecordReader<LongWritable,Text> reader = informat.getRecordReader(split, job, Reporter.NULL);

                try
                {
                    if( sparse ) //SPARSE<-value
                    {
                        while( reader.next(key, value) )
                        {
                            st.reset( value.toString() ); //reinit tokenizer
                            row = st.nextInt() - 1;
                            col = st.nextInt() - 1;
                            double lvalue = st.nextDouble();
                            dest.appendValue(row, col, lvalue);
                        }

                        //appended values arrive in file order, hence sort
                        dest.sortSparseRows();
                    }
                    else //DENSE<-value
                    {
                        while( reader.next(key, value) )
                        {
                            st.reset( value.toString() ); //reinit tokenizer
                            row = st.nextInt() - 1;
                            col = st.nextInt() - 1;
                            double lvalue = st.nextDouble();
                            dest.setValueDenseUnsafe( row, col, lvalue );
                        }
                    }
                }
                finally
                {
                    if( reader != null ) {
                        reader.close();
                    }
                }
            }
        }
        catch(Exception ex)
        {
            //post-mortem error handling and bounds checking
            throw createReadException(ex, row, col, rlen, clen,
                "Unable to read matrix in text cell format.");
        }
    }

    /**
     * Opens the given single file and reads it as a raw text cell stream.
     * The opened stream is closed by the delegated stream reader.
     *
     * @param path         file to open
     * @param job          job configuration (not used)
     * @param fs           file system used to open the path
     * @param dest         destination block to fill
     * @param rlen         expected number of rows
     * @param clen         expected number of columns
     * @param brlen        rows per block (not used)
     * @param bclen        columns per block (not used)
     * @param matrixMarket true if a MatrixMarket header precedes the cells
     * @throws IOException if the file cannot be opened, read, or parsed
     */
    private void readRawTextCellMatrixFromHDFS( Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean matrixMarket )
        throws IOException
    {
        //create input stream for path
        InputStream inputStream = fs.open(path);

        //actual read
        readRawTextCellMatrixFromInputStream(inputStream, dest, rlen, clen, brlen, bclen, matrixMarket);
    }

    /**
     * Reads text cells line by line from the given stream into the
     * destination block, optionally validating a leading MatrixMarket
     * header. The stream is always closed before this method returns,
     * including when the header is invalid.
     *
     * @param is           stream providing the text cell data
     * @param dest         destination block to fill
     * @param rlen         expected number of rows
     * @param clen         expected number of columns
     * @param brlen        rows per block (not used)
     * @param bclen        columns per block (not used)
     * @param matrixMarket true if a MatrixMarket header precedes the cells
     * @throws IOException if the header is invalid or reading/parsing fails
     */
    private void readRawTextCellMatrixFromInputStream( InputStream is, MatrixBlock dest, long rlen, long clen, int brlen, int bclen, boolean matrixMarket )
        throws IOException
    {
        BufferedReader br = new BufferedReader(new InputStreamReader( is ));
        boolean sparse = dest.isInSparseFormat();
        String value = null;

        //last parsed cell position, kept outside the try for error reporting
        int row = -1;
        int col = -1;

        try
        {
            // Read the header lines, if reading from a matrixMarket file.
            // Done inside the outer try so the reader is closed on header
            // errors, but outside the inner try so header errors keep
            // their specific messages.
            if( matrixMarket ) {
                validateMatrixMarketHeader(br, rlen, clen);
            }

            try
            {
                FastStringTokenizer st = new FastStringTokenizer(' ');

                if( sparse ) //SPARSE<-value
                {
                    while( (value=br.readLine())!=null )
                    {
                        st.reset( value ); //reinit tokenizer
                        row = st.nextInt()-1;
                        col = st.nextInt()-1;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }

                    //appended values arrive in file order, hence sort
                    dest.sortSparseRows();
                }
                else //DENSE<-value
                {
                    while( (value=br.readLine())!=null )
                    {
                        st.reset( value ); //reinit tokenizer
                        row = st.nextInt()-1;
                        col = st.nextInt()-1;
                        double lvalue = st.nextDouble();
                        dest.setValueDenseUnsafe( row, col, lvalue );
                    }
                }
            }
            catch(Exception ex)
            {
                //post-mortem error handling and bounds checking
                throw createReadException(ex, row, col, rlen, clen,
                    "Unable to read matrix in raw text cell format.");
            }
        }
        finally
        {
            IOUtilFunctions.closeSilently(br);
        }
    }

    /**
     * Consumes and validates the MatrixMarket preamble: a header line
     * starting with {@code %%}, any number of comment lines (starting with
     * {@code %}) or blank lines, and the size line whose first two fields
     * must equal the expected dimensions. On return the reader is
     * positioned at the first cell line.
     *
     * @param br   reader positioned at the start of the file
     * @param rlen expected number of rows
     * @param clen expected number of columns
     * @throws IOException if the header or size line is missing, malformed,
     *                     or inconsistent with the expected dimensions
     */
    private static void validateMatrixMarketHeader( BufferedReader br, long rlen, long clen )
        throws IOException
    {
        String line = br.readLine(); // header line
        if ( line==null || !line.startsWith("%%") ) {
            throw new IOException("Error while reading file in MatrixMarket format. Expecting a header line, but encountered, \"" + line +"\".");
        }

        // skip until end-of-comments; blank lines are skipped as well since
        // they carry no size information and charAt(0) would fail on them
        while( (line = br.readLine())!=null
            && (line.trim().isEmpty() || line.charAt(0) == '%') ) {
            //do nothing just skip comments
        }

        // the first line after comments is the one w/ matrix dimensions
        if ( line==null ) {
            throw new IOException("Error while reading file in MatrixMarket format. Expecting a size line with matrix dimensions, but reached the end of the file.");
        }

        // validate (rlen clen nnz)
        String[] fields = line.trim().split("\\s+");
        if ( fields.length < 2 ) {
            throw new IOException("Error while reading file in MatrixMarket format. Expecting a size line with matrix dimensions, but encountered, \"" + line +"\".");
        }

        long mm_rlen;
        long mm_clen;
        try {
            mm_rlen = Long.parseLong(fields[0]);
            mm_clen = Long.parseLong(fields[1]);
        }
        catch(NumberFormatException ex) {
            throw new IOException("Error while reading file in MatrixMarket format. Expecting a size line with matrix dimensions, but encountered, \"" + line +"\".", ex);
        }

        if ( rlen != mm_rlen || clen != mm_clen ) {
            throw new IOException("Unexpected matrix dimensions while reading file in MatrixMarket format. Expecting dimensions [" + rlen + " rows, " + clen + " cols] but encountered [" + mm_rlen + " rows, " + mm_clen + "cols].");
        }
    }

    /**
     * Builds the exception reported for a failed cell read. If the last
     * parsed cell position lies outside the expected matrix range, the
     * message names that cell; otherwise the given generic message is used.
     * The original failure is attached as cause in both cases.
     *
     * @param ex         failure raised while reading or parsing
     * @param row        0-based row of the last parsed cell (-1 if none)
     * @param col        0-based column of the last parsed cell (-1 if none)
     * @param rlen       expected number of rows
     * @param clen       expected number of columns
     * @param genericMsg message used when the cell position is in range
     * @return the exception to throw
     */
    private static IOException createReadException( Exception ex, int row, int col, long rlen, long clen, String genericMsg )
    {
        if( row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen )
        {
            return new IOException("Matrix cell ["+(row+1)+","+(col+1)+"] " +
                "out of overall matrix range [1:"+rlen+",1:"+clen+"].", ex);
        }

        return new IOException( genericMsg, ex );
    }
}