/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.spark.app;

import co.cask.cdap.api.TaskLocalizationContext;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.spark.JavaSparkExecutionContext;
import co.cask.cdap.api.spark.JavaSparkMain;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.io.File;
import java.net.URI;
import java.util.Iterator;
import java.util.Map;

import static co.cask.cdap.spark.app.SparkAppUsingLocalFiles.OUTPUT_DATASET_NAME;

/**
 * Spark program that uses local files in Java.
 */
public class SparkUsingLocalFilesMain implements JavaSparkMain {

  @Override
  public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();
    Map<String, String> args = sec.getRuntimeArguments();
    Preconditions.checkArgument(args.containsKey(SparkAppUsingLocalFiles.LOCAL_FILE_RUNTIME_ARG),
                                "Runtime argument %s must be set.",
                                SparkAppUsingLocalFiles.LOCAL_FILE_RUNTIME_ARG);
    final String localFilePath =
      URI.create(args.get(SparkAppUsingLocalFiles.LOCAL_FILE_RUNTIME_ARG)).getPath();

    // Read the localized file as an RDD of lines; each line is expected to be "key=value".
    JavaRDD<String> fileContents = jsc.textFile(localFilePath, 1);
    final TaskLocalizationContext taskLocalizationContext = sec.getLocalizationContext();

    JavaPairRDD<byte[], byte[]> rows = fileContents.mapToPair(new PairFunction<String, byte[], byte[]>() {
      @Override
      public Tuple2<byte[], byte[]> call(String line) throws Exception {
        // Verify that the expected file and archive were localized to this task.
        Map<String, File> localFiles = taskLocalizationContext.getAllLocalFiles();
        Preconditions.checkState(localFiles.containsKey(SparkAppUsingLocalFiles.LOCAL_FILE_ALIAS),
                                 "File %s should have been localized with the name %s.",
                                 localFilePath, SparkAppUsingLocalFiles.LOCAL_FILE_ALIAS);
        Preconditions.checkState(localFiles.containsKey(SparkAppUsingLocalFiles.LOCAL_ARCHIVE_ALIAS),
                                 "A temporary archive should have been localized with the name %s.",
                                 SparkAppUsingLocalFiles.LOCAL_ARCHIVE_ALIAS);

        boolean localFileFound = false;
        for (File localFile : localFiles.values()) {
          if (SparkAppUsingLocalFiles.LOCAL_FILE_ALIAS.equals(localFile.getName())) {
            localFileFound = true;
            break;
          }
        }
        Preconditions.checkState(localFileFound, "Local file must be found.");

        File localFile = taskLocalizationContext.getLocalFile(SparkAppUsingLocalFiles.LOCAL_FILE_ALIAS);
        Preconditions.checkState(localFile.exists(), "Local file %s must exist.", localFile);

        // Archives are extracted on localization, so the alias must resolve to a directory.
        File localArchive = taskLocalizationContext.getLocalFile(SparkAppUsingLocalFiles.LOCAL_ARCHIVE_ALIAS);
        Preconditions.checkState(localArchive.exists(), "Local archive %s must exist.",
                                 SparkAppUsingLocalFiles.LOCAL_ARCHIVE_ALIAS);
        Preconditions.checkState(localArchive.isDirectory(),
                                 "Local archive %s should have been extracted to a directory.",
                                 SparkAppUsingLocalFiles.LOCAL_ARCHIVE_ALIAS);

        // Parse the "key=value" line into a byte[] key/value pair for the output dataset.
        Iterator<String> splitter =
          Splitter.on("=").omitEmptyStrings().trimResults().split(line).iterator();
        Preconditions.checkArgument(splitter.hasNext());
        String key = splitter.next();
        Preconditions.checkArgument(splitter.hasNext());
        String value = splitter.next();
        return new Tuple2<>(Bytes.toBytes(key), Bytes.toBytes(value));
      }
    });

    sec.saveAsDataset(rows, OUTPUT_DATASET_NAME);
  }
}