/**
* (C) Copyright IBM Corp. 2010, 2015
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.ibm.bi.dml.runtime.io;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import com.ibm.bi.dml.conf.DMLConfig;
import com.ibm.bi.dml.hops.OptimizerUtils;
import com.ibm.bi.dml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import com.ibm.bi.dml.runtime.matrix.data.CSVFileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.matrix.data.SparseRow;
import com.ibm.bi.dml.runtime.util.MapReduceTool;

/**
 * Multi-threaded writer for matrix blocks in CSV text format on HDFS.
 * The output is written as a directory of part files, one per worker
 * thread; small outputs fall back to the sequential {@link WriterTextCSV}.
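 *
 * <p>A minimal usage sketch (the matrix block {@code mb}, target {@code path},
 * and {@code job} are assumed to be provided by the caller; the write method
 * is protected, so this applies to same-package or subclass callers):
 * <pre>
 * CSVFileFormatProperties props = new CSVFileFormatProperties();
 * WriterTextCSVParallel writer = new WriterTextCSVParallel(props);
 * writer.writeCSVMatrixToHDFS(path, job, mb, mb.getNumRows(),
 *     mb.getNumColumns(), mb.getNonZeros(), props);
 * </pre>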
*/
public class WriterTextCSVParallel extends WriterTextCSV
{
public WriterTextCSVParallel( CSVFileFormatProperties props ) {
super( props );
}

	/**
	 * Writes the given matrix block to HDFS in CSV format, using multiple
	 * threads when the estimated output is large enough to span multiple
	 * HDFS blocks.
	 *
	 * @param path target HDFS path (becomes a directory of part files)
	 * @param job job configuration used to obtain the file system
	 * @param src matrix block to write
	 * @param rlen number of rows
	 * @param clen number of columns
	 * @param nnz number of non-zero values
	 * @param props CSV format properties (header, delimiter, sparse output)
	 * @throws IOException if creating or writing the output fails
	 */
@Override
protected void writeCSVMatrixToHDFS( Path path, JobConf job, MatrixBlock src, long rlen, long clen, long nnz, CSVFileFormatProperties props )
throws IOException
{
//estimate output size and number of output blocks (min 1)
int numPartFiles = (int)(OptimizerUtils.estimateSizeTextOutput(src.getNumRows(), src.getNumColumns(), src.getNonZeros(),
OutputInfo.CSVOutputInfo) / InfrastructureAnalyzer.getHDFSBlockSize());
numPartFiles = Math.max(numPartFiles, 1);
//determine degree of parallelism
int numThreads = OptimizerUtils.getParallelTextWriteParallelism();
numThreads = Math.min(numThreads, numPartFiles);
		//fall back to sequential write if the degree of parallelism is 1 (e.g., output <128MB) in order to create a single file
if( numThreads <= 1 ) {
super.writeCSVMatrixToHDFS(path, job, src, rlen, clen, nnz, props);
return;
}
//create directory for concurrent tasks
MapReduceTool.createDirIfNotExistOnHDFS(path.toString(), DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
//create and execute tasks
try
{
ExecutorService pool = Executors.newFixedThreadPool(numThreads);
ArrayList<WriteCSVTask> tasks = new ArrayList<WriteCSVTask>();
int blklen = (int)Math.ceil((double)rlen / numThreads);
			for( int i=0; i<numThreads && i*blklen<rlen; i++ ) {
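				// one part file per task, named 0-m-00000, 0-m-00001, ...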
Path newPath = new Path(path, String.format("0-m-%05d",i));
tasks.add(new WriteCSVTask(newPath, job, src, i*blklen, (int)Math.min((i+1)*blklen, rlen), props));
}
//wait until all tasks have been executed
List<Future<Object>> rt = pool.invokeAll(tasks);
pool.shutdown();
//check for exceptions
for( Future<Object> task : rt )
task.get();
}
catch (Exception e) {
			throw new IOException("Failed parallel write of CSV output.", e);
}
}

	/**
	 * Task that writes a contiguous row range [rl, ru) of the source matrix
	 * to a single part file.
	 */
private static class WriteCSVTask implements Callable<Object>
{
private JobConf _job = null;
private MatrixBlock _src = null;
		private Path _path = null;
private int _rl = -1;
private int _ru = -1;
private CSVFileFormatProperties _props = null;
public WriteCSVTask(Path path, JobConf job, MatrixBlock src, int rl, int ru, CSVFileFormatProperties props)
{
_path = path;
_job = job;
_src = src;
_rl = rl;
_ru = ru;
_props = props;
}
@Override
public Object call() throws Exception
{
			FileSystem fs = FileSystem.get(_job);
BufferedWriter bw = null;
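			// cache format and dimensions; they stay fixed for the whole task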
boolean sparse = _src.isInSparseFormat();
int cols = _src.getNumColumns();
try
{
//for obj reuse and preventing repeated buffer re-allocations
StringBuilder sb = new StringBuilder();
				bw = new BufferedWriter(new OutputStreamWriter(fs.create(_path, true)));
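				// use default CSV properties if none were provided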
				_props = (_props == null) ? new CSVFileFormatProperties() : _props;
				String delim = _props.getDelim();
boolean csvsparse = _props.isSparse();
// Write header line, if needed
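				// only the task that covers row 0 (i.e., the first part file) writes the header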
if( _props.hasHeader() && _rl == 0 )
{
//write row chunk-wise to prevent OOM on large number of columns
for( int bj=0; bj<cols; bj+=WriterTextCSV.BLOCKSIZE_J )
{
for( int j=bj; j < Math.min(cols,bj+WriterTextCSV.BLOCKSIZE_J); j++)
{
sb.append("C"+ (j+1));
if ( j < cols-1 )
sb.append(delim);
}
bw.write( sb.toString() );
sb.setLength(0);
}
sb.append('\n');
bw.write( sb.toString() );
sb.setLength(0);
}
// Write data lines
if( sparse ) //SPARSE
{
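					// walk the sparse rows directly so only non-zero entries are visited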
SparseRow[] sparseRows = _src.getSparseRows();
for( int i=_rl; i<_ru; i++ )
{
//write row chunk-wise to prevent OOM on large number of columns
int prev_jix = -1;
if( sparseRows!=null && i<sparseRows.length
&& sparseRows[i]!=null && !sparseRows[i].isEmpty() )
{
SparseRow arow = sparseRows[i];
int alen = arow.size();
int[] aix = arow.getIndexContainer();
double[] avals = arow.getValueContainer();
for(int j=0; j < alen; j++)
{
int jix = aix[j];
// output empty fields, if needed
for( int j2=prev_jix; j2<jix-1; j2++ ) {
if( !csvsparse )
sb.append('0');
sb.append(delim);
//flush buffered string
if( j2%WriterTextCSV.BLOCKSIZE_J==0 ){
bw.write( sb.toString() );
sb.setLength(0);
}
}
// output the value (non-zero)
sb.append( avals[j] );
if( jix < cols-1)
sb.append(delim);
								bw.write( sb.toString() );
								sb.setLength(0);
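								// remember the last written column for padding trailing zeros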
prev_jix = jix;
}
}
// Output empty fields at the end of the row.
// In case of an empty row, output (clen-1) empty fields
for( int bj=prev_jix+1; bj<cols; bj+=WriterTextCSV.BLOCKSIZE_J )
{
for( int j = bj; j < Math.min(cols,bj+WriterTextCSV.BLOCKSIZE_J); j++) {
if( !csvsparse )
sb.append('0');
if( j < cols-1 )
sb.append(delim);
}
bw.write( sb.toString() );
sb.setLength(0);
}
sb.append('\n');
bw.write( sb.toString() );
sb.setLength(0);
}
}
else //DENSE
{
for( int i=_rl; i<_ru; i++ )
{
//write row chunk-wise to prevent OOM on large number of columns
for( int bj=0; bj<cols; bj+=WriterTextCSV.BLOCKSIZE_J )
{
for( int j=bj; j<Math.min(cols,bj+WriterTextCSV.BLOCKSIZE_J); j++ )
{
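							// unsafe accessor: direct dense lookup without bounds checking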
double lvalue = _src.getValueDenseUnsafe(i, j);
if( lvalue != 0 ) //for nnz
sb.append(lvalue);
else if( !csvsparse )
sb.append('0');
if( j != cols-1 )
sb.append(delim);
}
bw.write( sb.toString() );
sb.setLength(0);
}
sb.append('\n');
					bw.write( sb.toString() );
sb.setLength(0);
}
}
}
finally
{
IOUtilFunctions.closeSilently(bw);
}
return null;
}
}
}