FrameReaderTextCSV.java example

Explorer
incubator-systemml-master
- dev
  - release
    - src
      - test
        java
        org
        apache
        sysml
        validation
        Constants.java
        Utility.java
        ValidateLicAndNotice.java
- src
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.io;

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.Pair;
import org.apache.sysml.runtime.transform.TfUtils;
import org.apache.sysml.runtime.util.InputStreamInputFormat;
import org.apache.sysml.runtime.util.UtilFunctions;

/**
 * Single-threaded frame text csv reader.
 * <p>
 * Reads delimited text, either from an HDFS path or from an input stream,
 * into a {@link FrameBlock}. Header handling, delimiter, and fill settings
 * are taken from the {@link CSVFileFormatProperties} given at construction.
 */
public class FrameReaderTextCSV extends FrameReader
{
	protected CSVFileFormatProperties _props = null;
	
	/**
	 * Creates a reader bound to the given csv format properties.
	 * 
	 * @param props csv properties consulted for header, delimiter, and fill settings
	 */
	public FrameReaderTextCSV(CSVFileFormatProperties props) {
		_props = props;
	}

	/**
	 * Reads a csv file into a newly allocated frame block.
	 * <p>
	 * If {@code rlen} or {@code clen} is not positive, both dimensions are
	 * determined via {@link #computeCSVSize(Path, JobConf, FileSystem)} before
	 * the output is allocated.
	 * 
	 * @param fname  file name of the csv input
	 * @param schema value types handed to {@code createOutputSchema}
	 * @param names  column names handed to {@code createOutputNames}
	 * @param rlen   number of rows, or a non-positive value if unknown
	 * @param clen   number of columns, or a non-positive value if unknown
	 * @return the populated frame block
	 * @throws IOException if raised while accessing or parsing the input
	 * @throws DMLRuntimeException if raised by one of the invoked helpers
	 */
	@Override
	public final FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names,
			long rlen, long clen)
		throws IOException, DMLRuntimeException 
	{
		//prepare file access
		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());	
		FileSystem fs = FileSystem.get(job);
		Path path = new Path( fname );
		FileInputFormat.addInputPath(job, path);
		
		//check existence and non-empty file
		checkValidInputFile(fs, path); 
		
		//compute size if necessary
		if( rlen <= 0 || clen <= 0 ) {
			Pair<Integer,Integer> size = computeCSVSize(path, job, fs);
			rlen = size.getKey();
			clen = size.getValue();
		}
		
		//allocate output frame block
		ValueType[] lschema = createOutputSchema(schema, clen);
		String[] lnames = createOutputNames(names, clen);
		FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen);
	
		//core read (sequential/parallel) 
		readCSVFrameFromHDFS(path, job, fs, ret, lschema, lnames, rlen, clen);
		
		return ret;
	}
	
	/**
	 * Reads csv data from an input stream into a newly allocated frame block.
	 * The stream is wrapped into a single input split; dimensions are used as
	 * given (no size computation is performed here).
	 * 
	 * @param is     input stream providing the csv text
	 * @param schema value types handed to {@code createOutputSchema}
	 * @param names  column names handed to {@code createOutputNames}
	 * @param rlen   number of rows
	 * @param clen   number of columns
	 * @return the populated frame block
	 * @throws IOException if raised while reading or parsing the input
	 * @throws DMLRuntimeException if raised by one of the invoked helpers
	 */
	@Override
	public FrameBlock readFrameFromInputStream(InputStream is, ValueType[] schema, String[] names, 
			long rlen, long clen)
		throws IOException, DMLRuntimeException 
	{
		//allocate output frame block
		ValueType[] lschema = createOutputSchema(schema, clen);
		String[] lnames = createOutputNames(names, clen);
		FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen);
	
		//core read (sequential/parallel); use the output schema/names (as done
		//in readFrameFromHDFS) because cells are parsed via schema[col]
		InputStreamInputFormat informat = new InputStreamInputFormat(is);
		InputSplit split = informat.getSplits(null, 1)[0];
		readCSVFrameFromInputSplit(split, informat, null, ret, lschema, lnames, rlen, clen, 0, true);
		
		return ret;
	}

	/**
	 * Reads all input splits of the configured job sequentially into the
	 * given frame block. Splits are processed in the order returned by
	 * {@code IOUtilFunctions.sortInputSplits}; each split continues at the
	 * row position where the previous split stopped.
	 * 
	 * @param path   input path (not used directly; the job carries the input path)
	 * @param job    job configuration used to compute splits and create record readers
	 * @param fs     file system handle (not used directly)
	 * @param dest   frame block receiving the parsed cells
	 * @param schema per-column value types used for parsing
	 * @param names  column names (not used directly)
	 * @param rlen   number of rows (not used directly)
	 * @param clen   number of columns, used for the column-count sanity check
	 * @throws IOException if raised while reading or parsing the input
	 */
	protected void readCSVFrameFromHDFS( Path path, JobConf job, FileSystem fs, 
			FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) 
		throws IOException
	{
		TextInputFormat informat = new TextInputFormat();
		informat.configure(job);
		InputSplit[] splits = informat.getSplits(job, 1);
		splits = IOUtilFunctions.sortInputSplits(splits);
		
		//carry the row position across splits; restarting at row 0 per split
		//would overwrite the rows written by preceding splits
		int rpos = 0;
		for( int i=0; i<splits.length; i++ )
			rpos = readSplitRows(splits[i], informat, job, dest, schema, clen, rpos, i==0);
	}

	/**
	 * Reads a single input split into the given frame block, starting at row
	 * {@code rl}.
	 * 
	 * @param split    input split to read
	 * @param informat input format used to create the record reader
	 * @param job      job configuration passed to the input format (may be null, as done for input streams)
	 * @param dest     frame block receiving the parsed cells
	 * @param schema   per-column value types used for parsing
	 * @param names    column names (not used)
	 * @param rlen     number of rows (not used)
	 * @param clen     number of columns, used for the column-count sanity check
	 * @param rl       row index in {@code dest} at which the first data line is written
	 * @param first    true if this split may start with the header line
	 * @throws IOException if raised while reading or parsing the input
	 */
	protected final void readCSVFrameFromInputSplit( InputSplit split, InputFormat<LongWritable,Text> informat, JobConf job, 
			FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen, int rl, boolean first)
		throws IOException
	{
		readSplitRows(split, informat, job, dest, schema, clen, rl, first);
	}

	/**
	 * Core split reader: parses all lines of one split into {@code dest} and
	 * reports the row position after the last written data line.
	 * 
	 * @param split    input split to read
	 * @param informat input format used to create the record reader
	 * @param job      job configuration passed to the input format
	 * @param dest     frame block receiving the parsed cells
	 * @param schema   per-column value types used for parsing
	 * @param clen     number of columns, used for the column-count sanity check
	 * @param rl       row index in {@code dest} at which the first data line is written
	 * @param first    true if this split may start with the header line
	 * @return row index following the last data line written by this call
	 * @throws IOException if raised while reading or parsing the input
	 */
	private int readSplitRows( InputSplit split, InputFormat<LongWritable,Text> informat, JobConf job, 
			FrameBlock dest, ValueType[] schema, long clen, int rl, boolean first)
		throws IOException
	{
		boolean hasHeader = _props.hasHeader();
		boolean isFill = _props.isFill();
		double dfillValue = _props.getFillValue();
		String sfillValue = String.valueOf(_props.getFillValue());
		String delim = _props.getDelim();
		
		//create record reader
		RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
		LongWritable key = new LongWritable();
		Text value = new Text();
		int row = rl;
		
		try
		{
			//handle header if existing (inside try to close the reader on failure);
			//String.split takes a regular expression, hence the delimiter is quoted
			//so that delimiters such as "|" or "." are matched as plain text
			if( first && hasHeader && reader.next(key, value) )
				dest.setColumnNames(value.toString().split(java.util.regex.Pattern.quote(delim)));
			
			//read the data
			while( reader.next(key, value) ) //foreach line
			{
				String cellStr = value.toString().trim();
				String[] parts = IOUtilFunctions.splitCSV(cellStr, delim);
				
				//parse frame meta data (missing values / num distinct)
				boolean mvLine = parts[0].equals(TfUtils.TXMTD_MVPREFIX);
				boolean ndLine = parts[0].equals(TfUtils.TXMTD_NDPREFIX);
				if( mvLine || ndLine ) {
					//meta data values follow the prefix, hence the offset of one
					for( int j=0; j<dest.getNumColumns(); j++ ) {
						if( mvLine )
							dest.getColumnMetadata(j).setMvValue(parts[j+1]);
						else
							dest.getColumnMetadata(j).setNumDistinct(Long.parseLong(parts[j+1]));
					}
					continue;
				}
				
				//sanity check for number of columns, done before writing cells so that
				//over-long rows do not index past the schema (the check presumably
				//raises an IOException on mismatch - confirm in IOUtilFunctions)
				IOUtilFunctions.checkAndRaiseErrorCSVNumColumns("", cellStr, parts, clen);
				
				boolean emptyValuesFound = false;
				int col = 0;
				for( String part : parts ) //foreach cell
				{
					part = part.trim();
					if ( part.isEmpty() ) {
						//empty cells are only written for a non-zero fill value
						if( isFill && dfillValue!=0 )
							dest.set(row, col, UtilFunctions.stringToObject(schema[col], sfillValue));
						emptyValuesFound = true;
					}
					else {
						dest.set(row, col, UtilFunctions.stringToObject(schema[col], part));
					}
					col++;
				}
				
				//sanity check for empty values
				IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, isFill, emptyValuesFound);
				row++;
			}
		}
		finally {
			IOUtilFunctions.closeSilently(reader);
		}
		
		return row;
	}

	/**
	 * Determines the dimensions of the csv input by scanning all splits.
	 * The column count is obtained from {@code IOUtilFunctions.countNumColumnsCSV};
	 * the row count is the number of lines, excluding the header of the first
	 * split (if configured) and lines starting with a meta data prefix.
	 * 
	 * @param path input path (not used directly; the job carries the input path)
	 * @param job  job configuration used to compute splits and create record readers
	 * @param fs   file system handle (not used directly)
	 * @return pair of (number of rows, number of columns)
	 * @throws IOException if raised while reading the input
	 */
	protected Pair<Integer,Integer> computeCSVSize( Path path, JobConf job, FileSystem fs) 
		throws IOException 
	{	
		TextInputFormat informat = new TextInputFormat();
		informat.configure(job);
		InputSplit[] splits = informat.getSplits(job, 1);
		splits = IOUtilFunctions.sortInputSplits(splits);
		
		//compute number of columns
		int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
		
		//compute number of rows
		int nrow = 0;
		for( int i=0; i<splits.length; i++ ) 
		{
			RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[i], job, Reporter.NULL);
			LongWritable key = new LongWritable();
			Text value = new Text();
			
			try
			{
				//ignore header of first split
				if( i==0 && _props.hasHeader() )
					reader.next(key, value);
				
				//count remaining number of rows, ignore meta data
				while ( reader.next(key, value) ) {
					String val = value.toString();
					boolean isMeta = val.startsWith(TfUtils.TXMTD_MVPREFIX)
						|| val.startsWith(TfUtils.TXMTD_NDPREFIX);
					if( !isMeta )
						nrow++;
				}
			}
			finally {
				IOUtilFunctions.closeSilently(reader);
			}
		}
		
		return new Pair<Integer,Integer>(nrow, ncol);
	}
}