/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.ibm.bi.dml.runtime.instructions.spark;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Random;

import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.Accumulator;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

import com.ibm.bi.dml.parser.Expression.ValueType;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.DMLUnsupportedOperationException;
import com.ibm.bi.dml.runtime.controlprogram.caching.MatrixObject;
import com.ibm.bi.dml.runtime.controlprogram.context.ExecutionContext;
import com.ibm.bi.dml.runtime.controlprogram.context.SparkExecutionContext;
import com.ibm.bi.dml.runtime.instructions.Instruction;
import com.ibm.bi.dml.runtime.instructions.InstructionUtils;
import com.ibm.bi.dml.runtime.instructions.cp.CPOperand;
import com.ibm.bi.dml.runtime.instructions.spark.functions.ComputeBinaryBlockNnzFunction;
import com.ibm.bi.dml.runtime.instructions.spark.functions.ConvertMatrixBlockToIJVLines;
import com.ibm.bi.dml.runtime.instructions.spark.utils.RDDConverterUtils;
import com.ibm.bi.dml.runtime.instructions.spark.utils.SparkUtils;
import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics;
import com.ibm.bi.dml.runtime.matrix.data.CSVFileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.FileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.util.MapReduceTool;

/**
 * Spark instruction that writes a matrix variable to HDFS in text cell,
 * matrix market, csv, or binary block format. For csv output produced by
 * transform(), pre-formatted text lines are passed through directly.
 */
public class WriteSPInstruction extends SPInstruction
{
    private CPOperand input1 = null;
    private CPOperand input2 = null;
    private CPOperand input3 = null;
    private FileFormatProperties formatProperties;

    //scalars might occur for transform
    private boolean isInputMatrixBlock = true;

    public WriteSPInstruction(String opcode, String istr) {
        super(opcode, istr);
    }

    public WriteSPInstruction(CPOperand in1, CPOperand in2, CPOperand in3, String opcode, String str) {
        super(opcode, str);
        input1 = in1;
        input2 = in2;
        input3 = in3;
        formatProperties = null; // set in case of csv
    }

    public static Instruction parseInstruction( String str )
        throws DMLRuntimeException
    {
        String[] parts = InstructionUtils.getInstructionPartsWithValueType( str );
        String opcode = parts[0];

        if( !opcode.equals("write") ) {
            throw new DMLRuntimeException("Unsupported opcode");
        }

        // All write instructions have 3 parameters, except in case of delimited/csv file.
        // Write instructions for csv files also include four additional parameters
        // (hasHeader, delimiter, sparse, isInputMatrixBlock).
        if( parts.length != 4 && parts.length != 8 ) {
            throw new DMLRuntimeException("Invalid number of operands in write instruction: " + str);
        }

        // Example instruction (operands separated by the ° delimiter, value types by ·):
        // SPARK°write°_mVar2·MATRIX·DOUBLE°./src/test/scripts/functions/data/out/B·SCALAR·STRING·true°matrixmarket·SCALAR·STRING·true
        CPOperand in1 = null, in2 = null, in3 = null;
        in1 = new CPOperand(parts[1]);
        in2 = new CPOperand(parts[2]);
        in3 = new CPOperand(parts[3]);

        WriteSPInstruction inst = new WriteSPInstruction(in1, in2, in3, opcode, str);

        if( in3.getName().equalsIgnoreCase("csv") ) {
            boolean hasHeader = Boolean.parseBoolean(parts[4]);
            String delim = parts[5];
            boolean sparse = Boolean.parseBoolean(parts[6]);
            FileFormatProperties formatProperties = new CSVFileFormatProperties(hasHeader, delim, sparse);
            inst.setFormatProperties(formatProperties);
            boolean isInputMB = Boolean.parseBoolean(parts[7]);
            inst.setInputMatrixBlock(isInputMB);
        }

        return inst;
    }

    public FileFormatProperties getFormatProperties() {
        return formatProperties;
    }

    public void setFormatProperties(FileFormatProperties prop) {
        formatProperties = prop;
    }

    public void setInputMatrixBlock(boolean isMB) {
        isInputMatrixBlock = isMB;
    }

    public boolean isInputMatrixBlock() {
        return isInputMatrixBlock;
    }

    @Override
    public void processInstruction(ExecutionContext ec)
        throws DMLRuntimeException, DMLUnsupportedOperationException
    {
        SparkExecutionContext sec = (SparkExecutionContext) ec;

        //get filename (literal or variable expression)
        String fname = ec.getScalarInput(input2.getName(), ValueType.STRING, input2.isLiteral()).getStringValue();

        try
        {
            //if the file already exists on HDFS, remove it.
            MapReduceTool.deleteFileIfExistOnHDFS( fname );

            //prepare output info according to meta data
            String outFmt = input3.getName();
            OutputInfo oi = OutputInfo.stringToOutputInfo(outFmt);

            //get input rdd
            JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable( input1.getName() );
            MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());

            if( oi == OutputInfo.MatrixMarketOutputInfo || oi == OutputInfo.TextCellOutputInfo )
            {
                //recompute nnz if necessary (required for header if matrix market)
                if( isInputMatrixBlock && !mc.nnzKnown() )
                    mc.setNonZeros( SparkUtils.computeNNZFromBlocks(in1) );

                JavaRDD<String> header = null;
                if( outFmt.equalsIgnoreCase("matrixmarket") ) {
                    ArrayList<String> headerContainer = new ArrayList<String>(1);
                    // First output the MM header: format line, then
                    // number of rows, number of columns, and number of nnz
                    String headerStr = "%%MatrixMarket matrix coordinate real general\n" +
                            mc.getRows() + " " + mc.getCols() + " " + mc.getNonZeros();
                    headerContainer.add(headerStr);
                    header = sec.getSparkContext().parallelize(headerContainer);
                }

                JavaRDD<String> ijv = in1.flatMap(new ConvertMatrixBlockToIJVLines(mc.getRowsPerBlock(), mc.getColsPerBlock()));
                if( header != null )
                    customSaveTextFile(header.union(ijv), fname, true);
                else
                    customSaveTextFile(ijv, fname, false);
            }
            else if( oi == OutputInfo.CSVOutputInfo )
            {
                JavaRDD<String> out = null;
                Accumulator<Double> aNnz = null;

                if( isInputMatrixBlock ) {
                    //piggyback nnz computation on actual write
                    if( !mc.nnzKnown() ) {
                        aNnz = sec.getSparkContext().accumulator(0L);
                        in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
                    }

                    out = RDDConverterUtils.binaryBlockToCsv(in1, mc, (CSVFileFormatProperties) formatProperties, true);
                }
                else {
                    // This case is applicable when the CSV output from transform() is written out
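                    // (the variable is bound to pre-formatted CSV lines keyed by row index,
                    // i.e. a JavaPairRDD<Long,String>, so the values are written out directly
                    // without binary-block conversion)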
@SuppressWarnings("unchecked") JavaPairRDD<Long,String> rdd = (JavaPairRDD<Long, String>) ((MatrixObject) sec.getVariable(input1.getName())).getRDDHandle().getRDD(); out = rdd.values(); String sep = ","; boolean hasHeader = false; if(formatProperties != null) { sep = ((CSVFileFormatProperties) formatProperties).getDelim(); hasHeader = ((CSVFileFormatProperties) formatProperties).hasHeader(); } if(hasHeader) { StringBuffer buf = new StringBuffer(); for(int j = 1; j < mc.getCols(); j++) { if(j != 1) { buf.append(sep); } buf.append("C" + j); } ArrayList<String> headerContainer = new ArrayList<String>(1); headerContainer.add(0, buf.toString()); JavaRDD<String> header = sec.getSparkContext().parallelize(headerContainer); out = header.union(out); } } customSaveTextFile(out, fname, false); if( isInputMatrixBlock && !mc.nnzKnown() ) mc.setNonZeros((long)aNnz.value().longValue()); } else if( oi == OutputInfo.BinaryBlockOutputInfo ) { //piggyback nnz computation on actual write Accumulator<Double> aNnz = null; if( !mc.nnzKnown() ) { aNnz = sec.getSparkContext().accumulator(0L); in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz)); } //save binary block rdd on hdfs in1.saveAsHadoopFile(fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class); if( !mc.nnzKnown() ) mc.setNonZeros((long)aNnz.value().longValue()); } else { //unsupported formats: binarycell (not externalized) throw new DMLRuntimeException("Unexpected data format: " + outFmt); } // write meta data file MapReduceTool.writeMetaDataFile (fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties); } catch(IOException ex) { throw new DMLRuntimeException("Failed to process write instruction", ex); } } /** * * @param rdd * @param fname * @param inSingleFile * @throws DMLRuntimeException */ private void customSaveTextFile(JavaRDD<String> rdd, String fname, boolean inSingleFile) throws DMLRuntimeException { if(inSingleFile) { Random rand = new Random(); String randFName = fname + "_" + rand.nextLong() + "_" + rand.nextLong(); try { while(MapReduceTool.existsFileOnHDFS(randFName)) { randFName = fname + "_" + rand.nextLong() + "_" + rand.nextLong(); } rdd.saveAsTextFile(randFName); MapReduceTool.mergeIntoSingleFile(randFName, fname); // Faster version :) // rdd.coalesce(1, true).saveAsTextFile(randFName); // MapReduceTool.copyFileOnHDFS(randFName + "/part-00000", fname); } catch (IOException e) { throw new DMLRuntimeException("Cannot merge the output into single file: " + e.getMessage()); } finally { try { // This is to make sure that we donot create random files on HDFS MapReduceTool.deleteFileIfExistOnHDFS( randFName ); } catch (IOException e) { throw new DMLRuntimeException("Cannot merge the output into single file: " + e.getMessage()); } } } else { rdd.saveAsTextFile(fname); } } }