/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.sysml.runtime.io; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.RecordReader; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.TextInputFormat; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties; import org.apache.sysml.runtime.matrix.data.FrameBlock; import org.apache.sysml.runtime.matrix.data.Pair; import org.apache.sysml.runtime.transform.TfUtils; /** * Multi-threaded frame text csv reader. * */ public class FrameReaderTextCSVParallel extends FrameReaderTextCSV { public FrameReaderTextCSVParallel(CSVFileFormatProperties props) { super(props); } @Override protected void readCSVFrameFromHDFS( Path path, JobConf job, FileSystem fs, FrameBlock dest, ValueType[] schema, String[] names, long rlen, long clen) throws IOException { int numThreads = OptimizerUtils.getParallelTextReadParallelism(); TextInputFormat informat = new TextInputFormat(); informat.configure(job); InputSplit[] splits = informat.getSplits(job, numThreads); splits = IOUtilFunctions.sortInputSplits(splits); try { ExecutorService pool = Executors.newFixedThreadPool(numThreads); //compute num rows per split ArrayList<CountRowsTask> tasks = new ArrayList<CountRowsTask>(); for( int i=0; i<splits.length; i++ ) tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i==0)); List<Future<Long>> cret = pool.invokeAll(tasks); //compute row offset per split via cumsum on row counts long offset = 0; List<Long> offsets = new ArrayList<Long>(); for( Future<Long> count : cret ) { offsets.add(offset); offset += count.get(); } //read individual splits ArrayList<ReadRowsTask> tasks2 = new ArrayList<ReadRowsTask>(); for( int i=0; i<splits.length; i++ ) tasks2.add( new ReadRowsTask(splits[i], informat, job, dest, offsets.get(i).intValue(), i==0)); List<Future<Object>> rret = pool.invokeAll(tasks2); pool.shutdown(); //error handling for( Future<Object> read : rret ) read.get(); } catch (Exception e) { throw new IOException("Failed parallel read of text csv input.", e); } } @Override protected Pair<Integer,Integer> computeCSVSize( Path path, JobConf job, FileSystem fs) throws IOException { int numThreads = OptimizerUtils.getParallelTextReadParallelism(); TextInputFormat informat = new TextInputFormat(); informat.configure(job); InputSplit[] splits = informat.getSplits(job, numThreads); //compute number of columns int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim()); //compute number of rows int nrow = 0; ExecutorService pool = Executors.newFixedThreadPool(numThreads); try { ArrayList<CountRowsTask> tasks = new ArrayList<CountRowsTask>(); for( int i=0; i<splits.length; i++ ) tasks.add(new CountRowsTask(splits[i], informat, job, _props.hasHeader(), i==0)); List<Future<Long>> cret = pool.invokeAll(tasks); for( Future<Long> count : cret ) nrow += count.get().intValue(); } catch (Exception e) { throw new IOException("Failed parallel read of text csv input.", e); } return new Pair<Integer,Integer>(nrow, ncol); } private static class CountRowsTask implements Callable<Long> { private InputSplit _split = null; private TextInputFormat _informat = null; private JobConf _job = null; private boolean _hasHeader = false; private boolean _firstSplit = false; public CountRowsTask(InputSplit split, TextInputFormat informat, JobConf job, boolean hasHeader, boolean first) { _split = split; _informat = informat; _job = job; _hasHeader = hasHeader; _firstSplit = first; } @Override public Long call() throws Exception { RecordReader<LongWritable, Text> reader = _informat.getRecordReader(_split, _job, Reporter.NULL); LongWritable key = new LongWritable(); Text value = new Text(); long nrows = 0; // count rows from the first non-header row try { if ( _firstSplit && _hasHeader ) reader.next(key, value); while ( reader.next(key, value) ) { String val = value.toString(); nrows += ( val.startsWith(TfUtils.TXMTD_MVPREFIX) || val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1; } } finally { IOUtilFunctions.closeSilently(reader); } return nrows; } } private class ReadRowsTask implements Callable<Object> { private InputSplit _split = null; private TextInputFormat _informat = null; private JobConf _job = null; private FrameBlock _dest = null; private int _offset = -1; private boolean _isFirstSplit = false; public ReadRowsTask(InputSplit split, TextInputFormat informat, JobConf job, FrameBlock dest, int offset, boolean first) { _split = split; _informat = informat; _job = job; _dest = dest; _offset = offset; _isFirstSplit = first; } @Override public Object call() throws Exception { readCSVFrameFromInputSplit(_split, _informat, _job, _dest, _dest.getSchema(), _dest.getColumnNames(), _dest.getNumRows(), _dest.getNumColumns(), _offset, _isFirstSplit); return null; } } }