/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.sysml.runtime.matrix.mapred; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.nio.ByteBuffer; import java.util.HashMap; import java.util.Iterator; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.IJV; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.TaggedFirstSecondIndexes; import org.apache.sysml.runtime.matrix.mapred.CSVWriteReducer.RowBlockForTextOutput; import org.apache.sysml.runtime.matrix.mapred.CSVWriteReducer.RowBlockForTextOutput.Situation; import org.apache.sysml.runtime.util.MapReduceTool; public class CSVWriteReducer extends ReduceBase implements Reducer<TaggedFirstSecondIndexes, MatrixBlock, NullWritable, RowBlockForTextOutput> { private NullWritable nullKey = NullWritable.get(); private RowBlockForTextOutput outValue = new RowBlockForTextOutput(); private RowBlockForTextOutput zeroBlock = new RowBlockForTextOutput(); private long[] rowIndexes=null; private long[] minRowIndexes=null; private long[] maxRowIndexes=null; private long[] colIndexes=null; private long[] numColBlocks=null; private int[] colsPerBlock=null; private int[] lastBlockNCols=null; private String[] delims=null; private boolean[] sparses=null; private int[] tagToResultIndex=null; private void addEndingMissingValues(byte tag, Reporter reporter) throws IOException { long col=colIndexes[tag]+1; for(;col<numColBlocks[tag]; col++) { zeroBlock.setNumColumns(colsPerBlock[tag]); zeroBlock.setSituation(Situation.MIDDLE); collectFinalMultipleOutputs.directOutput(nullKey, zeroBlock, tagToResultIndex[tag], reporter); } //the last block if(col<=numColBlocks[tag]) { zeroBlock.setNumColumns(lastBlockNCols[tag]); zeroBlock.setSituation(Situation.MIDDLE); collectFinalMultipleOutputs.directOutput(nullKey, zeroBlock, tagToResultIndex[tag], reporter); colIndexes[tag]=0; } } private Situation addMissingRows(byte tag, long stoppingRow, Situation sit, Reporter reporter) throws IOException { for(long row=rowIndexes[tag]+1; row<stoppingRow; row++) { for(long c=1; c<numColBlocks[tag]; c++) { zeroBlock.setNumColumns(colsPerBlock[tag]); zeroBlock.setSituation(sit); collectFinalMultipleOutputs.directOutput(nullKey, zeroBlock, tagToResultIndex[tag], reporter); sit=Situation.MIDDLE; } zeroBlock.setNumColumns(lastBlockNCols[tag]); zeroBlock.setSituation(sit); collectFinalMultipleOutputs.directOutput(nullKey, zeroBlock, tagToResultIndex[tag], reporter); colIndexes[tag]=0; sit=Situation.NEWLINE; } colIndexes[tag]=0; return sit; } private void addNewlineCharacter(byte tag, Reporter reporter) throws IOException { zeroBlock.setNumColumns(0); zeroBlock.setSituation(Situation.NEWLINE); collectFinalMultipleOutputs.directOutput(nullKey, zeroBlock, tagToResultIndex[tag], reporter); } @Override public void reduce(TaggedFirstSecondIndexes inkey, Iterator<MatrixBlock> inValue, OutputCollector<NullWritable, RowBlockForTextOutput> out, Reporter reporter) throws IOException { long begin = System.currentTimeMillis(); cachedReporter = reporter; byte tag = inkey.getTag(); zeroBlock.setFormatParameters(delims[tag], sparses[tag]); outValue.setFormatParameters(delims[tag], sparses[tag]); Situation sit = Situation.MIDDLE; if(rowIndexes[tag]==minRowIndexes[tag]) sit=Situation.START; else if(rowIndexes[tag]!=inkey.getFirstIndex()) sit=Situation.NEWLINE; //check whether need to fill in missing values in previous rows if(sit==Situation.NEWLINE) { //if the previous row has not finished addEndingMissingValues(tag, reporter); } if(sit==Situation.NEWLINE||sit==Situation.START) { //if a row is completely missing sit=addMissingRows(tag, inkey.getFirstIndex(), sit, reporter); } //add missing value at the beginning of this row for(long col=colIndexes[tag]+1; col<inkey.getSecondIndex(); col++) { zeroBlock.setNumColumns(colsPerBlock[tag]); zeroBlock.setSituation(sit); collectFinalMultipleOutputs.directOutput(nullKey, zeroBlock, tagToResultIndex[tag], reporter); sit=Situation.MIDDLE; } colIndexes[tag]=inkey.getSecondIndex(); while(inValue.hasNext()) { MatrixBlock block = inValue.next(); outValue.setData(block); outValue.setNumColumns(block.getNumColumns()); outValue.setSituation(sit); collectFinalMultipleOutputs.directOutput(nullKey, outValue, tagToResultIndex[tag], reporter); resultsNonZeros[tagToResultIndex[tag]] += block.getNonZeros(); sit = Situation.MIDDLE; } rowIndexes[tag]=inkey.getFirstIndex(); reporter.incrCounter(Counters.COMBINE_OR_REDUCE_TIME, (System.currentTimeMillis()-begin)); } @Override public void configure(JobConf job) { super.configure(job); byte maxIndex=0; HashMap<Byte, CSVWriteInstruction> out2Ins=new HashMap<Byte, CSVWriteInstruction>(); try { CSVWriteInstruction[] ins = MRJobConfiguration.getCSVWriteInstructions(job); for(CSVWriteInstruction in: ins) { out2Ins.put(in.output, in); if(in.output>maxIndex) maxIndex=in.output; } } catch (Exception e) { throw new RuntimeException(e); } int numParitions=job.getNumReduceTasks(); int taskID=MapReduceTool.getUniqueTaskId(job); //LOG.info("## taks id: "+taskID); //for efficiency only, the arrays may have missing values rowIndexes=new long[maxIndex+1]; colIndexes=new long[maxIndex+1]; maxRowIndexes=new long[maxIndex+1]; minRowIndexes=new long[maxIndex+1]; numColBlocks=new long[maxIndex+1]; lastBlockNCols=new int[maxIndex+1]; colsPerBlock=new int[maxIndex+1]; delims=new String[maxIndex+1]; sparses=new boolean[maxIndex+1]; tagToResultIndex=new int[maxIndex+1]; for(int i=0; i<resultIndexes.length; i++) { byte ri=resultIndexes[i]; tagToResultIndex[ri]=i; CSVWriteInstruction in=out2Ins.get(ri); MatrixCharacteristics dim=MRJobConfiguration.getMatrixCharacteristicsForInput(job, in.input); delims[ri]=in.delim; sparses[ri]=in.sparse; numColBlocks[ri]=(long)Math.ceil((double)dim.getCols()/(double) dim.getColsPerBlock()); lastBlockNCols[ri]=(int) (dim.getCols()%dim.getColsPerBlock()); colsPerBlock[ri]=dim.getColsPerBlock(); long rstep=(long)Math.ceil((double)dim.getRows()/(double)numParitions); minRowIndexes[ri]=rowIndexes[ri]=rstep*taskID; maxRowIndexes[ri]=Math.min(rstep*(taskID+1), dim.getRows()); colIndexes[ri]=0; } zeroBlock.setData(new MatrixBlock()); } @Override public void close() throws IOException { for( byte tag : resultIndexes ) { //if the previous row has not finished addEndingMissingValues(tag, cachedReporter); //if a row is completely missing addMissingRows(tag, maxRowIndexes[tag]+1, Situation.NEWLINE, cachedReporter); // add a newline character at the end of file addNewlineCharacter(tag, cachedReporter); } super.close(); } /** * Custom output writable to prevent automatic newline after each partial block. * Writing partial blocks is important for robustness in case of very large rows * (otherwise there would be potential to run OOM). * */ public static class RowBlockForTextOutput implements Writable { public static enum Situation{ START, NEWLINE, MIDDLE }; private MatrixBlock _data = null; private int _numCols = 0; private Situation _sit = Situation.START; private String delim=","; private boolean sparse=true; private StringBuilder _buffer = new StringBuilder(); public RowBlockForTextOutput() { } public void setData(MatrixBlock block) { _data = block; } public void setNumColumns(int cols) { _numCols = cols; } public void setSituation(Situation s) { _sit = s; } public void setFormatParameters(String del, boolean sps) { delim=del; sparse=sps; } @Override public void readFields(DataInput arg0) throws IOException { throw new IOException("this is not supposed to be called!"); } @Override public void write(DataOutput out) throws IOException { _buffer.setLength(0); switch( _sit ) { case START: break; case NEWLINE: _buffer.append('\n'); break; case MIDDLE: _buffer.append(delim); break; default: throw new RuntimeException("Unrecognized situation "+_sit); } //serialize data if required (not newline) if ( _numCols > 0 ) { if( _data.isEmptyBlock(false) ) //EMPTY BLOCK { appendZero(_buffer, sparse, delim, false, _numCols); } else if( _data.isInSparseFormat() ) //SPARSE BLOCK { Iterator<IJV> iter = _data.getSparseBlockIterator(); int j = -1; while( iter.hasNext() ) { IJV cell = iter.next(); appendZero(_buffer, sparse, delim, true, cell.getJ()-j-1); j = cell.getJ(); //current col if( cell.getV() != 0 ) //for nnz _buffer.append(cell.getV()); else if( !sparse ) _buffer.append('0'); if( j < _numCols-1 ) _buffer.append(delim); } appendZero(_buffer, sparse, delim, false, _numCols-j-1); } else //DENSE BLOCK { for(int j=0; j<_numCols; j++) { double val = _data.getValueDenseUnsafe(0, j); if( val!=0 ) //for nnz _buffer.append(val); else if( !sparse ) _buffer.append('0'); if( j < _numCols-1 ) _buffer.append(delim); } } } ByteBuffer bytes = Text.encode(_buffer.toString()); int length = bytes.limit(); out.write(bytes.array(), 0, length); } private static void appendZero( StringBuilder buffer, boolean sparse, String delim, boolean alwaysDelim, int len ) { if( len <= 0 ) return; for( int i=0; i<len; i++ ) { if( !sparse ) //single character buffer.append('0'); if( alwaysDelim || i < len-1 ) buffer.append(delim); } } } }