/*
* Copyright © 2014-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.examples.sparkpagerank;
import co.cask.cdap.api.Resources;
import co.cask.cdap.api.annotation.UseDataSet;
import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.data.schema.UnsupportedTypeException;
import co.cask.cdap.api.data.stream.Stream;
import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.lib.ObjectStore;
import co.cask.cdap.api.dataset.lib.ObjectStores;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.service.Service;
import co.cask.cdap.api.service.http.AbstractHttpServiceHandler;
import co.cask.cdap.api.service.http.HttpServiceRequest;
import co.cask.cdap.api.service.http.HttpServiceResponder;
import co.cask.cdap.api.spark.AbstractSpark;
import co.cask.cdap.api.spark.SparkClientContext;
import co.cask.cdap.api.workflow.AbstractWorkflow;
import com.google.common.base.Charsets;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.spark.SparkConf;
import java.io.IOException;
import java.net.HttpURLConnection;
import javax.ws.rs.GET;
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
/**
 * An application that calculates the page rank of URLs from an input stream.
*/
public class SparkPageRankApp extends AbstractApplication {
public static final String SERVICE_HANDLERS = "SparkPageRankService";
public static final String BACKLINK_URL_STREAM = "backlinkURLStream";
@Override
public void configure() {
setName("SparkPageRank");
setDescription("Spark page rank application.");
// Ingest data into the Application via a Stream
addStream(new Stream(BACKLINK_URL_STREAM));
// Run a Spark program on the acquired data
addSpark(new PageRankSpark());
    // Run a MapReduce program on the data emitted by the Spark program
addMapReduce(new RanksCounter());
    // Run the Spark program followed by the MapReduce program in a Workflow
addWorkflow(new PageRankWorkflow());
    // Add a Service to retrieve the processed data
addService(SERVICE_HANDLERS, new SparkPageRankServiceHandler());
// Store input and processed data in ObjectStore Datasets
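    // 'ranks' maps a URL to its page rank; 'rankscount' maps a page rank to the number of pages with that rank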
try {
ObjectStores.createObjectStore(getConfigurer(), "ranks", Integer.class,
DatasetProperties.builder().setDescription("Ranks Dataset").build());
ObjectStores.createObjectStore(getConfigurer(), "rankscount", Integer.class,
DatasetProperties.builder().setDescription("Ranks Count Dataset").build());
} catch (UnsupportedTypeException e) {
      // This exception is thrown by ObjectStore if its parameter type cannot be
      // (de)serialized (for example, if it is an interface and not a class, then there is
      // no auto-magic way to deserialize an object). In this case it cannot happen,
      // because Integer is an actual class.
throw new RuntimeException(e);
}
}
/**
   * A Workflow that runs the PageRankSpark program followed by the RanksCounter MapReduce program.
*/
public static class PageRankWorkflow extends AbstractWorkflow {
@Override
public void configure() {
setDescription("Runs PageRankSpark program followed by RanksCounter MapReduce program");
addSpark(PageRankSpark.class.getSimpleName());
addMapReduce(RanksCounter.class.getSimpleName());
}
}
/**
* A Spark program that calculates page rank.
*/
public static final class PageRankSpark extends AbstractSpark {
@Override
public void configure() {
setDescription("Spark page rank program");
setMainClass(SparkPageRankProgram.class);
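      // Request 1024 MB of memory for the Spark driver and for each executor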
setDriverResources(new Resources(1024));
setExecutorResources(new Resources(1024));
}
@Override
public void beforeSubmit(SparkClientContext context) throws Exception {
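      // Give the driver extra PermGen space; on pre-Java-8 JVMs, Spark's class loading can
      // exhaust the default MaxPermSize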
context.setSparkConf(new SparkConf().set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m"));
}
}
/**
   * A {@link Service} with handlers to get the rank of a URL, get the total number of pages with
   * a given rank, and transform a page rank to a scale of 1 to 10.
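   * <p>
   * Example calls against a local CDAP instance (the host and port are assumptions for a
   * standalone CDAP; the path follows CDAP's service-method routing):
   * <pre>{@code
   * curl -d '{"url":"http://example.com/"}' \
   *   'http://localhost:10000/v3/namespaces/default/apps/SparkPageRank/services/SparkPageRankService/methods/rank'
   * curl 'http://localhost:10000/v3/namespaces/default/apps/SparkPageRank/services/SparkPageRankService/methods/total/10'
   * }</pre>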
*/
public static final class SparkPageRankServiceHandler extends AbstractHttpServiceHandler {
private static final Gson GSON = new Gson();
public static final String URL_KEY = "url";
public static final String RANKS_PATH = "rank";
public static final String TOTAL_PAGES_PATH = "total";
public static final String TRANSFORM_PATH = "transform";
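    // Maps a page rank to the total number of pages with that rank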
@UseDataSet("rankscount")
private ObjectStore<Integer> store;
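    // Maps a URL to its computed page rank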
@UseDataSet("ranks")
private ObjectStore<Integer> ranks;
@Path(RANKS_PATH)
@POST
public void getRank(HttpServiceRequest request, HttpServiceResponder responder) {
      String urlRequest = Charsets.UTF_8.decode(request.getContent()).toString();
      // Parse the body defensively: respond with 400 if the JSON is missing the "url" key,
      // rather than throwing a NullPointerException from getAsString()
      JsonObject requestJson = GSON.fromJson(urlRequest, JsonObject.class);
      if (requestJson == null || requestJson.get(URL_KEY) == null) {
        responder.sendError(HttpURLConnection.HTTP_BAD_REQUEST,
                            "The url must be specified with \"url\" as key in JSON.");
        return;
      }
      String url = requestJson.get(URL_KEY).getAsString();
// Get the rank from the ranks dataset
Integer rank = ranks.read(url.getBytes(Charsets.UTF_8));
if (rank == null) {
        responder.sendError(HttpURLConnection.HTTP_NO_CONTENT, String.format("No rank found for %s", url));
} else {
responder.sendString(rank.toString());
}
}
@Path(TOTAL_PAGES_PATH + "/{pr}")
@GET
    public void getTotalPages(HttpServiceRequest request, HttpServiceResponder responder,
                              @PathParam("pr") Integer pageRank) {
Integer totalPages = store.read(Bytes.toBytes(pageRank));
if (totalPages == null) {
responder.sendString(HttpURLConnection.HTTP_NO_CONTENT,
String.format("No pages found with pr: %s", pageRank), Charsets.UTF_8);
} else {
responder.sendString(HttpURLConnection.HTTP_OK, totalPages.toString(), Charsets.UTF_8);
}
}
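    /**
     * Transforms a page rank to the 1-to-10 scale by multiplying the decimal rank by 10 and
     * rounding to the nearest integer.
     */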
@Path(TRANSFORM_PATH + "/{pr}")
@GET
public void transform(HttpServiceRequest request, HttpServiceResponder responder, @PathParam("pr") String pr) {
responder.sendString(String.valueOf((int) (Math.round(Double.parseDouble(pr) * 10))));
}
}
/**
   * A MapReduce program that counts the total number of pages for each unique page rank.
*/
public static class RanksCounter extends AbstractMapReduce {
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
Job job = context.getHadoopJob();
job.setMapperClass(Emitter.class);
job.setReducerClass(Counter.class);
job.setNumReduceTasks(1);
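      // Read <URL, rank> pairs from the 'ranks' dataset; write <rank, count> pairs to 'rankscount'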
context.addInput(Input.ofDataset("ranks"));
context.addOutput(Output.ofDataset("rankscount"));
}
/**
     * A mapper that emits each URL's page rank with a value of 1.
*/
public static class Emitter extends Mapper<byte[], Integer, IntWritable, IntWritable> {
private static final IntWritable ONE = new IntWritable(1);
@Override
protected void map(byte[] key, Integer value, Context context)
throws IOException, InterruptedException {
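        // Ignore the key (the URL bytes) and emit <rank, 1> so the reducer can count pages per rank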
context.write(new IntWritable(value), ONE);
}
}
/**
* A reducer that sums up the counts for each key.
*/
public static class Counter extends Reducer<IntWritable, IntWritable, byte[], Integer> {
@Override
public void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
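        // Sum the 1s emitted by the mapper to get the total number of pages with this rank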
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
}
context.write(Bytes.toBytes(key.get()), sum);
}
}
}
}