/* * Copyright © 2016 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.spark.app; import co.cask.cdap.api.data.format.FormatSpecification; import co.cask.cdap.api.data.format.StructuredRecord; import co.cask.cdap.api.data.schema.Schema; import co.cask.cdap.api.spark.AbstractSpark; import co.cask.cdap.api.spark.JavaSparkExecutionContext; import co.cask.cdap.api.spark.JavaSparkMain; import co.cask.cdap.api.stream.GenericStreamEventData; import com.google.common.collect.ImmutableList; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; import scala.Tuple2; import java.io.Serializable; /** * Spark program for testing stream format specification usage. */ public class StreamFormatSpecSpark extends AbstractSpark implements JavaSparkMain { @Override protected void configure() { setMainClass(StreamFormatSpecSpark.class); } @Override public void run(JavaSparkExecutionContext sec) throws Exception { JavaSparkContext jsc = new JavaSparkContext(); SQLContext sqlContext = new SQLContext(jsc); // Read from CSV stream and turn it into a DataFrame String streamName = sec.getRuntimeArguments().get("stream.name"); Schema schema = Schema.recordOf("record", ImmutableList.of( Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("age", Schema.of(Schema.Type.INT)) )); FormatSpecification formatSpec = new FormatSpecification("csv", schema); JavaPairRDD<Long, GenericStreamEventData<StructuredRecord>> rdd = sec.fromStream(streamName, formatSpec, StructuredRecord.class); JavaRDD<Person> personRDD = rdd.values().map(new Function<GenericStreamEventData<StructuredRecord>, Person>() { @Override public Person call(GenericStreamEventData<StructuredRecord> data) throws Exception { StructuredRecord record = data.getBody(); return new Person(record.<String>get("name"), record.<Integer>get("age")); } }); sqlContext.createDataFrame(personRDD, Person.class).registerTempTable("people"); // Execute a SQL on the table and save the result JavaPairRDD<String, Integer> resultRDD = sqlContext.sql(sec.getRuntimeArguments().get("sql.statement")) .toJavaRDD() .mapToPair(new PairFunction<Row, String, Integer>() { @Override public Tuple2<String, Integer> call(Row row) throws Exception { return new Tuple2<>(row.getString(0), row.getInt(1)); } }); sec.saveAsDataset(resultRDD, sec.getRuntimeArguments().get("output.dataset")); } public static class Person implements Serializable { private String name; private int age; public Person() { // For serialization } public Person(String name, int age) { this.name = name; this.age = age; } public String getName() { return name; } public int getAge() { return age; } } }