/**
* (C) Copyright IBM Corp. 2010, 2015
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.ibm.bi.dml.runtime.io;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.matrix.CSVReblockMR;
import com.ibm.bi.dml.runtime.matrix.data.CSVFileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.util.UtilFunctions;
/**
 * Single-threaded reader that parses a matrix stored as delimited text (CSV)
 * into a {@link MatrixBlock}. The input is accessed through the Hadoop
 * {@link FileSystem} API and may be a single file or a directory; for a
 * directory, the entries accepted by {@code CSVReblockMR.hiddenFileFilter}
 * are read in sorted path order.
 */
public class ReaderTextCSV extends MatrixReader
{
    /** CSV format settings (header flag, delimiter, fill flag, fill value) applied to every read. */
    private final CSVFileFormatProperties _props;

    /**
     * Creates a reader for the given CSV format.
     *
     * @param props CSV format properties; dereferenced on every read, so it must not be null
     */
    public ReaderTextCSV(CSVFileFormatProperties props)
    {
        _props = props;
    }

    /**
     * Reads the CSV matrix stored under {@code fname}.
     *
     * <p>If both {@code rlen} and {@code clen} are positive, the output block
     * is allocated up front. Otherwise the dimensions are derived from the
     * input by an additional pass over the file(s).
     *
     * @param fname  path of the CSV file or directory
     * @param rlen   number of rows, non-positive if unknown
     * @param clen   number of columns, non-positive if unknown
     * @param brlen  rows per block (forwarded, but not used by the CSV read)
     * @param bclen  columns per block (forwarded, but not used by the CSV read)
     * @param estnnz estimated number of non-zeros, forwarded to the output allocation
     * @return the matrix block holding the parsed values
     * @throws IOException if the input cannot be read or a line fails validation
     * @throws DMLRuntimeException propagated from the called matrix utilities
     */
    @Override
    public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
        throws IOException, DMLRuntimeException
    {
        //allocate output matrix block only for known dimensions; for unknown
        //dimensions the core read derives the size from the file content
        MatrixBlock ret = null;
        if( rlen>0 && clen>0 )
            ret = createOutputMatrixBlock(rlen, clen, estnnz, true, false);

        //prepare file access
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path( fname );

        //check existence and non-empty file
        checkValidInputFile(fs, path);

        //core read
        ret = readCSVMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen,
                _props.hasHeader(), _props.getDelim(), _props.isFill(), _props.getFillValue() );

        //finally check if change of sparse/dense block representation required
        //(nnz explicitly maintained during read)
        ret.examSparsity();

        return ret;
    }

    /**
     * Parses all input files line by line and writes every non-zero cell into
     * {@code dest}. Rows are numbered consecutively across files.
     *
     * @param path      input file or directory
     * @param job       job configuration, forwarded to the size computation
     * @param fs        file system used to list and open the input
     * @param dest      pre-allocated output block, or null to size it from the input
     * @param rlen      number of rows (not used here)
     * @param clen      expected number of columns; replaced by the computed
     *                  column count when {@code dest} is null
     * @param brlen     rows per block (not used here)
     * @param bclen     columns per block (not used here)
     * @param hasHeader whether the first line of the first file is a header to skip
     * @param delim     field delimiter
     * @param fill      passed to the empty-field check together with the
     *                  per-line "empty value seen" flag
     * @param fillValue value stored in place of an empty field
     * @return the populated block with its non-zero count set
     * @throws IOException if reading fails or a line is rejected by the sanity checks
     */
    //unchecked: needed for Collections.sort on Path (presumably a raw
    //Comparable in the targeted Hadoop version -- confirm before removing)
    @SuppressWarnings("unchecked")
    private MatrixBlock readCSVMatrixFromHDFS( Path path, JobConf job, FileSystem fs, MatrixBlock dest,
            long rlen, long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue )
        throws IOException
    {
        //collect input files: a directory contributes its filtered entries in sorted order
        ArrayList<Path> files = new ArrayList<Path>();
        if( fs.isDirectory(path) ) {
            for( FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter) )
                files.add(stat.getPath());
            Collections.sort(files);
        }
        else
            files.add(path);

        //unknown dimensions: determine them by scanning the input once
        if( dest == null ) {
            dest = computeCSVSize(files, job, fs, hasHeader, delim, fill, fillValue);
            clen = dest.getNumColumns();
        }

        final boolean sparse = dest.isInSparseFormat();
        int row = 0;   //global row index, continues across files
        long lnnz = 0; //number of non-zero cells written

        for( int fileNo=0; fileNo<files.size(); fileNo++ )
        {
            Path file = files.get(fileNo);
            //NOTE(review): InputStreamReader without explicit charset decodes
            //with the platform default; kept as is to not change parsing
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(file)));
            try
            {
                //only the first file carries the header; skipped inside the try
                //so that the reader is closed even if this read fails
                if( fileNo==0 && hasHeader )
                    br.readLine();

                String line;
                while( (line = br.readLine()) != null ) //foreach line
                {
                    String cellStr = line.trim();
                    String[] parts = IOUtilFunctions.split(cellStr, delim);

                    //sanity check for number of columns, done before any cell
                    //is written so that a line with an unexpected field count
                    //never reaches the block with a column index >= clen;
                    //reports the file actually being read
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(file.toString(), cellStr, parts, clen);

                    boolean emptyValuesFound = false;
                    for( int col=0; col<parts.length; col++ ) //foreach cell
                    {
                        String part = parts[col].trim();
                        double cellValue;
                        if( part.isEmpty() ) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        }
                        else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }

                        //zeros are skipped; nnz is maintained explicitly
                        if( cellValue != 0 ) {
                            if( sparse )
                                dest.appendValue(row, col, cellValue);
                            else
                                dest.setValueDenseUnsafe(row, col, cellValue);
                            lnnz++;
                        }
                    }

                    //sanity check for empty values
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    row++;
                }
            }
            finally {
                IOUtilFunctions.closeSilently(br);
            }
        }

        //post processing
        dest.setNonZeros( lnnz );

        return dest;
    }

    /**
     * Determines the matrix dimensions by scanning the input: the column
     * count is taken from the first data line found in any file (number of
     * delimiter occurrences in the trimmed line plus one) and the row count
     * is the total number of data lines over all files. If there is no data
     * line at all, both dimensions are reported as -1.
     *
     * @param files     input files in read order
     * @param job       job configuration (not used here)
     * @param fs        file system used to open the files
     * @param hasHeader whether the first line of the first file is a header to skip
     * @param delim     field delimiter
     * @param fill      not used here
     * @param fillValue not used here
     * @return a new block of the computed size, created in sparse format
     * @throws IOException if a file cannot be opened or read
     */
    private MatrixBlock computeCSVSize ( List<Path> files, JobConf job, FileSystem fs, boolean hasHeader, String delim, boolean fill, double fillValue)
        throws IOException
    {
        int nrow = 0;
        int ncol = -1; //unknown until the first data line is seen

        for( int fileNo=0; fileNo<files.size(); fileNo++ )
        {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
            try
            {
                //only the first file carries the header
                if( fileNo==0 && hasHeader )
                    br.readLine(); //ignore header

                String line;
                while( (line = br.readLine()) != null ) {
                    //the first data line may live in a later file if the
                    //leading file(s) contain no data
                    if( ncol < 0 )
                        ncol = StringUtils.countMatches(line.trim(), delim) + 1;
                    nrow++;
                }
            }
            finally {
                IOUtilFunctions.closeSilently(br);
            }
        }

        //no data line at all: keep reporting unknown (-1 x -1) dimensions
        if( ncol < 0 )
            nrow = -1;

        //create new matrix block (assume sparse for consistency w/ compiler)
        return new MatrixBlock(nrow, ncol, true);
    }
}