/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.transform;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

import scala.Tuple2;

import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties;

public class ApplyTfCSVSPARK {

	/**
	 * Apply transformation metadata and generate the result in CSV format, as a
	 * JavaRDD of Strings.
	 *
	 * @param sec spark execution context
	 * @param inputRDD input rdd
	 * @param tfMtdPath transform metadata path
	 * @param spec transform specification as json string
	 * @param tmpPath temporary file path
	 * @param prop csv file format properties
	 * @param numCols number of columns
	 * @param headerLine header line
	 * @return JavaPairRDD of long-strings
	 * @throws IOException if IOException occurs
	 * @throws ClassNotFoundException if ClassNotFoundException occurs
	 * @throws InterruptedException if InterruptedException occurs
	 * @throws IllegalArgumentException if IllegalArgumentException occurs
	 * @throws JSONException if JSONException occurs
	 */
	public static JavaPairRDD<Long, String> runSparkJob(
			SparkExecutionContext sec, JavaRDD<Tuple2<LongWritable, Text>> inputRDD,
			String tfMtdPath, String spec, String tmpPath, CSVFileFormatProperties prop,
			int numCols, String headerLine)
		throws IOException, ClassNotFoundException, InterruptedException, IllegalArgumentException, JSONException
	{
		// Load transformation metadata and broadcast it
		String[] naStrings = TfUtils.parseNAStrings(prop.getNAStrings());
		JSONObject jspec = new JSONObject(spec);
		TfUtils _tfmapper = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(),
				naStrings, jspec, numCols, tfMtdPath, null, tmpPath);

		_tfmapper.loadTfMetadata();

		Broadcast<TfUtils> bcast_tf = sec.getSparkContext().broadcast(_tfmapper);

		/*
		 * Construct transformation metadata (map-side) -- the logic is similar
		 * to GTFMTDMapper
		 *
		 * Note: The result of mapPartitionsWithIndex is cached so that the
		 * transformed data is not redundantly computed multiple times
		 */
		JavaPairRDD<Long, String> applyRDD = inputRDD
			.mapPartitionsWithIndex(new ApplyTfCSVMap(bcast_tf), true)
			.mapToPair(
				new PairFunction<String, Long, String>() {
					private static final long serialVersionUID = 3868143093999082931L;

					@Override
					public Tuple2<Long, String> call(String t) throws Exception {
						return new Tuple2<Long, String>(new Long(1), t);
					}
				}).cache();

		/*
		 * An action to force execution of apply()
		 *
		 * We need to trigger the execution of this RDD so as to ensure the
		 * creation of a few metadata files (headers, dummycoded information,
		 * etc.), which are referenced in the caller function.
		 */
		applyRDD.count();

		return applyRDD;
	}

	public static class ApplyTfCSVMap implements Function2<Integer, Iterator<Tuple2<LongWritable, Text>>, Iterator<String>> {

		private static final long serialVersionUID = 1496686437276906911L;

		TfUtils _tfmapper = null;

		ApplyTfCSVMap(Broadcast<TfUtils> tf) throws IllegalArgumentException, IOException, JSONException {
			_tfmapper = tf.getValue();
		}

		@Override
		public Iterator<String> call(Integer partitionID, Iterator<Tuple2<LongWritable, Text>> csvLines) throws Exception {

			boolean first = true;
			Tuple2<LongWritable, Text> rec = null;
			ArrayList<String> outLines = new ArrayList<String>();

			while (csvLines.hasNext()) {
				rec = csvLines.next();

				if (first && partitionID == 0) {
					first = false;

					_tfmapper.processHeaderLine();

					if (_tfmapper.hasHeader())
						continue;
				}

				// parse the input line and apply transformation
				String[] words = _tfmapper.getWords(rec._2());

				if (!_tfmapper.omit(words)) {
					try {
						words = _tfmapper.apply(words);
						String outStr = _tfmapper.checkAndPrepOutputString(words);
						outLines.add(outStr);
					} catch (DMLRuntimeException e) {
						throw new RuntimeException(e.getMessage() + ": " + rec._2().toString());
					}
				}
			}

			return outLines.iterator();
		}
	}
}
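
/*
 * Usage sketch (illustrative only): how a caller might drive this job.
 * The variables sec, inputRDD, specJson, prop, numCols, and headerLine are
 * assumed to be prepared elsewhere, and the HDFS paths below are
 * hypothetical placeholders, not values defined by this class:
 *
 *   JavaPairRDD<Long, String> out = ApplyTfCSVSPARK.runSparkJob(
 *       sec, inputRDD, "hdfs:/tmp/tf-metadata", specJson,
 *       "hdfs:/tmp/tf-work", prop, numCols, headerLine);
 *
 * By the time runSparkJob returns, the result RDD is cached and has already
 * been materialized via count(), so the metadata files it produces (headers,
 * dummycoded information, etc.) exist for the caller to read.
 */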