/** * (C) Copyright IBM Corp. 2010, 2015 * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *  */ package com.ibm.bi.dml.runtime.instructions.spark; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.spark.api.java.JavaPairRDD; import com.ibm.bi.dml.hops.recompile.Recompiler; import com.ibm.bi.dml.runtime.DMLRuntimeException; import com.ibm.bi.dml.runtime.DMLUnsupportedOperationException; import com.ibm.bi.dml.runtime.controlprogram.caching.MatrixObject; import com.ibm.bi.dml.runtime.controlprogram.context.ExecutionContext; import com.ibm.bi.dml.runtime.controlprogram.context.SparkExecutionContext; import com.ibm.bi.dml.runtime.instructions.Instruction; import com.ibm.bi.dml.runtime.instructions.InstructionUtils; import com.ibm.bi.dml.runtime.instructions.cp.CPOperand; import com.ibm.bi.dml.runtime.instructions.spark.utils.RDDConverterUtils; import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics; import com.ibm.bi.dml.runtime.matrix.MatrixFormatMetaData; import com.ibm.bi.dml.runtime.matrix.data.InputInfo; import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock; import com.ibm.bi.dml.runtime.matrix.data.MatrixIndexes; import com.ibm.bi.dml.runtime.matrix.operators.Operator; public class CSVReblockSPInstruction extends UnarySPInstruction { private int _brlen; private int _bclen; private boolean _hasHeader; private String _delim; private boolean _fill; private double _missingValue; public CSVReblockSPInstruction(Operator op, CPOperand in, CPOperand out, int br, int bc, boolean hasHeader, String delim, boolean fill, double missingValue, String opcode, String instr) { super(op, in, out, opcode, instr); _brlen = br; _bclen = bc; _hasHeader = hasHeader; _delim = delim; _fill = fill; _missingValue = missingValue; } public static Instruction parseInstruction(String str) throws DMLRuntimeException { String opcode = InstructionUtils.getOpCode(str); if( !opcode.equals("csvrblk") ) { throw new DMLRuntimeException( "Incorrect opcode for CSVReblockSPInstruction:" + opcode); } // Example parts of CSVReblockSPInstruction: // [csvrblk, pREADmissing_val_maps·MATRIX·DOUBLE, _mVar37·MATRIX·DOUBLE, // 1000, 1000, false, ,, true, 0.0] String parts[] = InstructionUtils.getInstructionPartsWithValueType(str); CPOperand in = new CPOperand(parts[1]); CPOperand out = new CPOperand(parts[2]); int brlen = Integer.parseInt(parts[3]); int bclen = Integer.parseInt(parts[4]); boolean hasHeader = Boolean.parseBoolean(parts[5]); String delim = parts[6]; boolean fill = Boolean.parseBoolean(parts[7]); double missingValue = Double.parseDouble(parts[8]); return new CSVReblockSPInstruction(null, in, out, brlen, bclen, hasHeader, delim, fill, missingValue, opcode, str); } @Override @SuppressWarnings("unchecked") public void processInstruction(ExecutionContext ec) throws DMLRuntimeException, DMLUnsupportedOperationException { SparkExecutionContext sec = (SparkExecutionContext) ec; //sanity check input info MatrixObject mo = sec.getMatrixObject(input1.getName()); MatrixFormatMetaData iimd = (MatrixFormatMetaData) mo.getMetaData(); if (iimd.getInputInfo() != InputInfo.CSVInputInfo) { throw new DMLRuntimeException("The given InputInfo is not implemented for " + "CSVReblockSPInstruction:" + iimd.getInputInfo()); } //set output characteristics MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName()); MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName()); mcOut.set(mcIn.getRows(), mcIn.getCols(), _brlen, _bclen); //check for in-memory reblock (w/ lazy spark context, potential for latency reduction) if( Recompiler.checkCPReblock(sec, input1.getName()) ) { Recompiler.executeInMemoryReblock(sec, input1.getName(), output.getName()); return; } //check jdk version (prevent double.parseDouble contention on <jdk8) sec.checkAndRaiseValidationWarningJDKVersion(); //get input rdd (needs to be longwritable/text for consistency with meta data, in case of //serialization issues create longwritableser/textser as serializable wrappers JavaPairRDD<LongWritable, Text> in = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForVariable(input1.getName(), iimd.getInputInfo()); //reblock csv to binary block JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDConverterUtils.csvToBinaryBlock( sec.getSparkContext(), in, mcOut, _hasHeader, _delim, _fill, _missingValue); // put output RDD handle into symbol table sec.setRDDHandleForVariable(output.getName(), out); sec.addLineageRDD(output.getName(), input1.getName()); } }