/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.instructions.spark;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Random;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.util.LongAccumulator;
import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.instructions.cp.CPOperand;
import org.apache.sysml.runtime.instructions.spark.functions.ComputeBinaryBlockNnzFunction;
import org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils;
import org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction;
import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties;
import org.apache.sysml.runtime.matrix.data.FileFormatProperties;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.util.MapReduceTool;
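/**
 * Spark instruction for the 'write' opcode. Persists a matrix or frame rdd to
 * the file system in the requested output format (text cell, matrix market,
 * csv, or binary block) and writes the accompanying .mtd meta data file.
 * <p>
 * For illustration (assuming spark execution mode), a DML statement such as
 * write(X, "./out/X", format="csv") is compiled into a write instruction
 * that is processed by this class.
 */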
public class WriteSPInstruction extends SPInstruction
{
private CPOperand input1 = null;
private CPOperand input2 = null;
private CPOperand input3 = null;
private CPOperand input4 = null;
private FileFormatProperties formatProperties;
//for csv output of transform, the input rdd holds csv strings instead of binary matrix blocks
// TODO remove once transform over frames supported
private boolean isInputMatrixBlock = true;
public WriteSPInstruction(CPOperand in1, CPOperand in2, CPOperand in3, String opcode, String str) {
super(opcode, str);
input1 = in1;
input2 = in2;
input3 = in3;
formatProperties = null; // set in case of csv
}
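/**
 * Parses a serialized write instruction. Besides the opcode, the instruction
 * carries the input variable, output file name, format, and format description;
 * csv writes add four more fields (hasHeader, delimiter, sparse, isInputMatrixBlock),
 * as shown in the example string inside the method body.
 */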
public static WriteSPInstruction parseInstruction ( String str )
throws DMLRuntimeException
{
String[] parts = InstructionUtils.getInstructionPartsWithValueType ( str );
String opcode = parts[0];
if( !opcode.equals("write") ) {
throw new DMLRuntimeException("Unsupported opcode");
}
// All write instructions have 4 operands: input, output file name, format, and format description.
// Write instructions for delimited/csv files carry 4 additional parameters (hasHeader, delimiter, sparse, isInputMatrixBlock).
if ( parts.length != 5 && parts.length != 9 ) {
throw new DMLRuntimeException("Invalid number of operands in write instruction: " + str);
}
//SPARK°write°_mVar2·MATRIX·DOUBLE°./src/test/scripts/functions/data/out/B·SCALAR·STRING·true°matrixmarket·SCALAR·STRING·true
// _mVar2·MATRIX·DOUBLE
CPOperand in1 = new CPOperand(parts[1]);
CPOperand in2 = new CPOperand(parts[2]);
CPOperand in3 = new CPOperand(parts[3]);
WriteSPInstruction inst = new WriteSPInstruction(in1, in2, in3, opcode, str);
if ( in3.getName().equalsIgnoreCase("csv") ) {
boolean hasHeader = Boolean.parseBoolean(parts[4]);
String delim = parts[5];
boolean sparse = Boolean.parseBoolean(parts[6]);
FileFormatProperties formatProperties = new CSVFileFormatProperties(hasHeader, delim, sparse);
inst.setFormatProperties(formatProperties);
boolean isInputMB = Boolean.parseBoolean(parts[7]);
inst.setInputMatrixBlock(isInputMB);
CPOperand in4 = new CPOperand(parts[8]);
inst.input4 = in4;
} else {
FileFormatProperties ffp = new FileFormatProperties();
CPOperand in4 = new CPOperand(parts[4]);
inst.input4 = in4;
inst.setFormatProperties(ffp);
}
return inst;
}
public FileFormatProperties getFormatProperties() {
return formatProperties;
}
public void setFormatProperties(FileFormatProperties prop) {
formatProperties = prop;
}
public void setInputMatrixBlock(boolean isMB) {
isInputMatrixBlock = isMB;
}
public boolean isInputMatrixBlock() {
return isInputMatrixBlock;
}
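/**
 * Resolves the output file name and format description, deletes a pre-existing
 * output file on HDFS, and dispatches to the matrix or frame write path based
 * on the data type of the input variable.
 */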
@Override
public void processInstruction(ExecutionContext ec)
throws DMLRuntimeException
{
SparkExecutionContext sec = (SparkExecutionContext) ec;
//get filename (literal or variable expression)
String fname = ec.getScalarInput(input2.getName(), ValueType.STRING, input2.isLiteral()).getStringValue();
String desc = ec.getScalarInput(input4.getName(), ValueType.STRING, input4.isLiteral()).getStringValue();
formatProperties.setDescription(desc);
ValueType[] schema = (input1.getDataType()==DataType.FRAME) ?
sec.getFrameObject(input1.getName()).getSchema() : null;
try
{
//if the file already exists on HDFS, remove it.
MapReduceTool.deleteFileIfExistOnHDFS( fname );
//prepare output info according to meta data
String outFmt = input3.getName();
OutputInfo oi = OutputInfo.stringToOutputInfo(outFmt);
//core matrix/frame write
if( input1.getDataType()==DataType.MATRIX )
processMatrixWriteInstruction(sec, fname, oi);
else
processFrameWriteInstruction(sec, fname, oi, schema);
}
catch(IOException ex)
{
throw new DMLRuntimeException("Failed to process write instruction", ex);
}
}
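/**
 * Writes a binary block matrix rdd in the requested output format. Where the
 * number of non-zeros is unknown, its computation is piggybacked onto the
 * write via a long accumulator, so the meta data file can report an exact nnz
 * without an extra pass over the data.
 */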
protected void processMatrixWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi)
throws DMLRuntimeException, IOException
{
//get input rdd
JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable( input1.getName() );
MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
if( oi == OutputInfo.MatrixMarketOutputInfo
|| oi == OutputInfo.TextCellOutputInfo )
{
//piggyback nnz maintenance on write
LongAccumulator aNnz = null;
if ( isInputMatrixBlock && !mc.nnzKnown() ) {
aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
}
JavaRDD<String> header = null;
if( oi == OutputInfo.MatrixMarketOutputInfo ) {
ArrayList<String> headerContainer = new ArrayList<String>(1);
// First output MM header
String headerStr = "%%MatrixMarket matrix coordinate real general\n" +
// output number of rows, number of columns and number of nnz
mc.getRows() + " " + mc.getCols() + " " + mc.getNonZeros();
headerContainer.add(headerStr);
header = sec.getSparkContext().parallelize(headerContainer);
}
JavaRDD<String> ijv = RDDConverterUtils.binaryBlockToTextCell(in1, mc);
if(header != null)
customSaveTextFile(header.union(ijv), fname, true);
else
customSaveTextFile(ijv, fname, false);
if ( isInputMatrixBlock && !mc.nnzKnown() )
mc.setNonZeros( aNnz.value() );
}
else if( oi == OutputInfo.CSVOutputInfo )
{
JavaRDD<String> out = null;
LongAccumulator aNnz = null;
if ( isInputMatrixBlock ) {
//piggyback nnz computation on actual write
if( !mc.nnzKnown() ) {
aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
}
out = RDDConverterUtils.binaryBlockToCsv(in1, mc,
(CSVFileFormatProperties) formatProperties, true);
}
else
{
// This case is applicable when the CSV output from transform() is written out
// TODO remove once transform over frames supported
@SuppressWarnings("unchecked")
JavaPairRDD<Long,String> rdd = (JavaPairRDD<Long, String>) (sec.getMatrixObject(input1.getName())).getRDDHandle().getRDD();
out = rdd.values();
String sep = ",";
boolean hasHeader = false;
if(formatProperties != null) {
sep = ((CSVFileFormatProperties) formatProperties).getDelim();
hasHeader = ((CSVFileFormatProperties) formatProperties).hasHeader();
}
if(hasHeader) {
//generate default column names C1,...,Cn for all columns
StringBuilder buf = new StringBuilder();
for(int j = 1; j <= mc.getCols(); j++) {
if(j != 1) {
buf.append(sep);
}
buf.append("C" + j);
}
ArrayList<String> headerContainer = new ArrayList<String>(1);
headerContainer.add(buf.toString());
JavaRDD<String> header = sec.getSparkContext().parallelize(headerContainer);
out = header.union(out);
}
}
customSaveTextFile(out, fname, false);
if( isInputMatrixBlock && !mc.nnzKnown() )
mc.setNonZeros( aNnz.value() );
}
else if( oi == OutputInfo.BinaryBlockOutputInfo ) {
//piggyback nnz computation on actual write
LongAccumulator aNnz = null;
if( !mc.nnzKnown() ) {
aNnz = sec.getSparkContext().sc().longAccumulator("nnz");
in1 = in1.mapValues(new ComputeBinaryBlockNnzFunction(aNnz));
}
//save binary block rdd on hdfs
in1.saveAsHadoopFile(fname, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
if( !mc.nnzKnown() )
mc.setNonZeros( aNnz.value() );
}
else {
//unsupported formats: binarycell (not externalized)
throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
}
// write meta data file
MapReduceTool.writeMetaDataFile (fname + ".mtd", ValueType.DOUBLE, mc, oi, formatProperties);
}
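/**
 * Writes a binary block frame rdd in the requested output format (text cell,
 * csv, or binary block) and creates the frame meta data file including the
 * value type schema.
 */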
@SuppressWarnings("unchecked")
protected void processFrameWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi, ValueType[] schema)
throws DMLRuntimeException, IOException
{
//get input rdd
JavaPairRDD<Long,FrameBlock> in1 = (JavaPairRDD<Long,FrameBlock>)sec
.getRDDHandleForVariable( input1.getName(), InputInfo.BinaryBlockInputInfo );
MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
if( oi == OutputInfo.TextCellOutputInfo )
{
JavaRDD<String> out = FrameRDDConverterUtils.binaryBlockToTextCell(in1, mc);
customSaveTextFile(out, fname, false);
}
else if( oi == OutputInfo.CSVOutputInfo )
{
CSVFileFormatProperties props = (formatProperties!=null) ?
(CSVFileFormatProperties) formatProperties : null;
JavaRDD<String> out = FrameRDDConverterUtils.binaryBlockToCsv(in1, mc, props, true);
customSaveTextFile(out, fname, false);
}
else if( oi == OutputInfo.BinaryBlockOutputInfo )
{
JavaPairRDD<LongWritable,FrameBlock> out = in1.mapToPair(new LongFrameToLongWritableFrameFunction());
out.saveAsHadoopFile(fname, LongWritable.class, FrameBlock.class, SequenceFileOutputFormat.class);
}
else {
//unsupported formats: binarycell (not externalized)
throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
}
// write meta data file
MapReduceTool.writeMetaDataFile(fname + ".mtd", input1.getValueType(), schema, DataType.FRAME, mc, oi, formatProperties);
}
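/**
 * Saves a text rdd to HDFS. If a single output file is requested, the rdd is
 * first written to a temporary, randomized directory and the resulting part
 * files are merged into the target file; the temporary directory is removed
 * in a finally block.
 */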
private void customSaveTextFile(JavaRDD<String> rdd, String fname, boolean inSingleFile)
throws DMLRuntimeException
{
if(inSingleFile) {
Random rand = new Random();
String randFName = fname + "_" + rand.nextLong() + "_" + rand.nextLong();
try {
while(MapReduceTool.existsFileOnHDFS(randFName)) {
randFName = fname + "_" + rand.nextLong() + "_" + rand.nextLong();
}
rdd.saveAsTextFile(randFName);
MapReduceTool.mergeIntoSingleFile(randFName, fname); //faster than coalescing the rdd to a single partition
//alternative: rdd.coalesce(1, true).saveAsTextFile(randFName);
//             MapReduceTool.copyFileOnHDFS(randFName + "/part-00000", fname);
} catch (IOException e) {
throw new DMLRuntimeException("Cannot merge the output into single file: " + e.getMessage(), e);
}
finally {
try {
// cleanup to make sure that we do not leave temporary files on HDFS
MapReduceTool.deleteFileIfExistOnHDFS( randFName );
} catch (IOException e) {
throw new DMLRuntimeException("Cannot delete temporary output file: " + e.getMessage(), e);
}
}
}
else {
rdd.saveAsTextFile(fname);
}
}
}