/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ibm.bi.dml.runtime.instructions.spark;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Random;

import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.Accumulator;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

import com.ibm.bi.dml.parser.Expression.ValueType;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.DMLUnsupportedOperationException;
import com.ibm.bi.dml.runtime.controlprogram.caching.MatrixObject;
import com.ibm.bi.dml.runtime.controlprogram.context.ExecutionContext;
import com.ibm.bi.dml.runtime.controlprogram.context.SparkExecutionContext;
import com.ibm.bi.dml.runtime.instructions.Instruction;
import com.ibm.bi.dml.runtime.instructions.InstructionUtils;
import com.ibm.bi.dml.runtime.instructions.cp.CPOperand;
import com.ibm.bi.dml.runtime.instructions.spark.functions.ComputeBinaryBlockNnzFunction;
import com.ibm.bi.dml.runtime.instructions.spark.functions.ConvertMatrixBlockToIJVLines;
import com.ibm.bi.dml.runtime.instructions.spark.utils.RDDConverterUtils;
import com.ibm.bi.dml.runtime.instructions.spark.utils.SparkUtils;
import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics;
import com.ibm.bi.dml.runtime.matrix.data.CSVFileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.FileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.util.MapReduceTool;

/**
 * Spark instruction that writes a matrix variable to HDFS in text cell,
 * matrix market, csv, or binary block format. For csv output produced by
 * transform(), pre-formatted text lines are passed through directly.
 */
public class WriteSPInstruction extends SPInstruction
{
    private CPOperand input1 = null;
    private CPOperand input2 = null;
    private CPOperand input3 = null;
    private FileFormatProperties formatProperties;

    //scalars might occur for transform
    private boolean isInputMatrixBlock = true;

    public WriteSPInstruction(String opcode, String istr) {
        super(opcode, istr);
    }

    public WriteSPInstruction(CPOperand in1, CPOperand in2, CPOperand in3, String opcode, String str) {
        super(opcode, str);
        input1 = in1;
        input2 = in2;
        input3 = in3;
        formatProperties = null; // set in case of csv
    }

    public static Instruction parseInstruction( String str )
        throws DMLRuntimeException
    {
        String[] parts = InstructionUtils.getInstructionPartsWithValueType( str );
        String opcode = parts[0];

        if( !opcode.equals("write") ) {
            throw new DMLRuntimeException("Unsupported opcode");
        }

        // All write instructions have 3 parameters, except in case of delimited/csv file.
        // Write instructions for csv files also include four additional parameters
        // (hasHeader, delimiter, sparse, isInputMatrixBlock).
        if( parts.length != 4 && parts.length != 8 ) {
            throw new DMLRuntimeException("Invalid number of operands in write instruction: " + str);
        }

        // Example instruction (operands separated by the ° delimiter, value types by ·):
        // SPARK°write°_mVar2·MATRIX·DOUBLE°./src/test/scripts/functions/data/out/B·SCALAR·STRING·true°matrixmarket·SCALAR·STRING·true
        CPOperand in1 = null, in2 = null, in3 = null;
        in1 = new CPOperand(parts[1]);
        in2 = new CPOperand(parts[2]);
        in3 = new CPOperand(parts[3]);

        WriteSPInstruction inst = new WriteSPInstruction(in1, in2, in3, opcode, str);

        if( in3.getName().equalsIgnoreCase("csv") ) {
            boolean hasHeader = Boolean.parseBoolean(parts[4]);
            String delim = parts[5];
            boolean sparse = Boolean.parseBoolean(parts[6]);
            FileFormatProperties formatProperties = new CSVFileFormatProperties(hasHeader, delim, sparse);
            inst.setFormatProperties(formatProperties);
            boolean isInputMB = Boolean.parseBoolean(parts[7]);
            inst.setInputMatrixBlock(isInputMB);
        }

        return inst;
    }

    public FileFormatProperties getFormatProperties() {
        return formatProperties;
    }

    public void setFormatProperties(FileFormatProperties prop) {
        formatProperties = prop;
    }

    public void setInputMatrixBlock(boolean isMB) {
        isInputMatrixBlock = isMB;
    }

    public boolean isInputMatrixBlock() {
        return isInputMatrixBlock;
    }

    @Override
    public void processInstruction(ExecutionContext ec)
        throws DMLRuntimeException, DMLUnsupportedOperationException
    {
        SparkExecutionContext sec = (SparkExecutionContext) ec;

        //get filename (literal or variable expression)
        String fname = ec.getScalarInput(input2.getName(), ValueType.STRING, input2.isLiteral()).getStringValue();

        try
        {
            //if the file already exists on HDFS, remove it.
            MapReduceTool.deleteFileIfExistOnHDFS( fname );

            //prepare output info according to meta data
            String outFmt = input3.getName();
            OutputInfo oi = OutputInfo.stringToOutputInfo(outFmt);

            //get input rdd
            JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable( input1.getName() );
            MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());

            if( oi == OutputInfo.MatrixMarketOutputInfo || oi == OutputInfo.TextCellOutputInfo )
            {
                //recompute nnz if necessary (required for header if matrix market)
                if( isInputMatrixBlock && !mc.nnzKnown() )
                    mc.setNonZeros( SparkUtils.computeNNZFromBlocks(in1) );

                JavaRDD<String> header = null;
                if( outFmt.equalsIgnoreCase("matrixmarket") ) {
                    ArrayList<String> headerContainer = new ArrayList<String>(1);
                    // First output the MM header: format line, then
                    // number of rows, number of columns, and number of nnz
                    String headerStr = "%%MatrixMarket matrix coordinate real general\n" +
                            mc.getRows() + " " + mc.getCols() + " " + mc.getNonZeros();
                    headerContainer.add(headerStr);
                    header = sec.getSparkContext().parallelize(headerContainer);
                }

                JavaRDD<String> ijv = in1.flatMap(new ConvertMatrixBlockToIJVLines(mc.getRowsPerBlock(), mc.getColsPerBlock()));
                if( header != null )
                    customSaveTextFile(header.union(ijv), fname, true);
                else
                    customSaveTextFile(ijv, fname, false);
            }
            else if( oi == OutputInfo.CSVOutputInfo )
            {
                JavaRDD<String> out = null;
                Accumulator<Double> aNnz = null;

                if( isInputMatrixBlock ) {
                    //piggyback nnz computation on actual write
                    if( !mc.nnzKnown() ) {
                        aNnz = sec.getSparkContext().accumulator(0L);
                        in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
                    }

                    out = RDDConverterUtils.binaryBlockToCsv(in1, mc, (CSVFileFormatProperties) formatProperties, true);
                }
                else {
                    // This case is applicable when the CSV output from transform() is written out
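                    // (the variable is bound to pre-formatted CSV lines keyed by row index,
                    // i.e. a JavaPairRDD<Long,String>, so the values are written out directly
                    // without binary-block conversion)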
@SuppressWarnings("unchecked") JavaPairRDD<Long,String> rdd = (JavaPairRDD<Long, String>) ((MatrixObject) sec.getVariable(input1.getName())).getRDDHandle().getRDD(); out = rdd.values(); String sep = ","; boolean hasHeader = false; if(formatProperties != null) { sep = ((CSVFileFormatProperties) formatProperties).getDelim(); hasHeader = ((CSVFileFormatProperties) formatProperties).hasHeader(); } if(hasHeader) { StringBuffer buf = new StringBuffer(); for(int j = 1; j < mc.getCols(); j++) { if(j != 1) { buf.append(sep); } buf.append("C" + j); } ArrayList<String> headerContainer = new ArrayList<String>(1); headerContainer.add(0, buf.toString()); JavaRDD<String> header = sec.getSparkContext().parallelize(headerContainer); out = header.union(out); } } customSaveTextFile(out, fname, false); if( isInputMatrixBlock && !mc.nnzKnown() ) mc.setNonZeros((long)aNnz.value().longValue()); } else if( oi == OutputInfo.BinaryBlockOutputInfo ) { //piggyback nnz computation on actual write Accumulator<Double> aNnz = null; if( !mc.nnzKnown() ) { aNnz = sec.getSparkContext().accumulator(0L); in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz)); } //save binary block rdd on hdfs in1.saveAsHadoopFile(fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class); if( !mc.nnzKnown() ) mc.setNonZeros((long)aNnz.value().longValue()); } else { //unsupported formats: binarycell (not externalized) throw new DMLRuntimeException("Unexpected data format: " + outFmt); } // write meta data file MapReduceTool.writeMetaDataFile (fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties); } catch(IOException ex) { throw new DMLRuntimeException("Failed to process write instruction", ex); } } /** * * @param rdd * @param fname * @param inSingleFile * @throws DMLRuntimeException */ private void customSaveTextFile(JavaRDD<String> rdd, String fname, boolean inSingleFile) throws DMLRuntimeException { if(inSingleFile) { Random rand = new Random(); String randFName = fname + "_" + rand.nextLong() + "_" + rand.nextLong(); try { while(MapReduceTool.existsFileOnHDFS(randFName)) { randFName = fname + "_" + rand.nextLong() + "_" + rand.nextLong(); } rdd.saveAsTextFile(randFName); MapReduceTool.mergeIntoSingleFile(randFName, fname); // Faster version :) // rdd.coalesce(1, true).saveAsTextFile(randFName); // MapReduceTool.copyFileOnHDFS(randFName + "/part-00000", fname); } catch (IOException e) { throw new DMLRuntimeException("Cannot merge the output into single file: " + e.getMessage()); } finally { try { // This is to make sure that we donot create random files on HDFS MapReduceTool.deleteFileIfExistOnHDFS( randFName ); } catch (IOException e) { throw new DMLRuntimeException("Cannot merge the output into single file: " + e.getMessage()); } } } else { rdd.saveAsTextFile(fname); } } }