package com.mongodb.spark.examples.enron;

import com.mongodb.hadoop.MongoInputFormat;
import com.mongodb.hadoop.MongoOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.bson.BSONObject;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;

public class Enron {

    public void run() {
        JavaSparkContext sc = new JavaSparkContext(new SparkConf());
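        // The SparkConf is populated at submit time (e.g. master URL and app
        // name passed to spark-submit), so nothing needs to be hard-coded here.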
        // Set configuration options for the MongoDB Hadoop Connector.
        Configuration mongodbConfig = new Configuration();
        // MongoInputFormat allows us to read from a live MongoDB instance.
        // We could also use BSONFileInputFormat to read BSON snapshots.
        mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat");

        // MongoDB connection string naming a collection to use.
        // If using BSON, use "mapred.input.dir" to configure the directory
        // where BSON files are located instead.
        mongodbConfig.set("mongo.input.uri",
                "mongodb://localhost:27017/enron_mail.messages");

        // Create an RDD backed by the MongoDB collection.
        JavaPairRDD<Object, BSONObject> documents = sc.newAPIHadoopRDD(
                mongodbConfig,          // Configuration
                MongoInputFormat.class, // InputFormat: read from a live cluster.
                Object.class,           // Key class
                BSONObject.class        // Value class
        );
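        // Each record is an (_id, document) pair. The flatMap below walks the
        // "headers" sub-document of every message and emits one "from|to"
        // string per recipient.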
        JavaRDD<String> edges = documents.flatMap(
            new FlatMapFunction<Tuple2<Object, BSONObject>, String>() {
                @Override
                public Iterable<String> call(final Tuple2<Object, BSONObject> t) throws Exception {
                    // Each emitted string is an individual "from|to" pair.
                    List<String> edgeList = new ArrayList<String>();
                    BSONObject header = (BSONObject) t._2().get("headers");
                    if (header == null) {
                        // Skip messages that have no headers sub-document.
                        return edgeList;
                    }
                    String to = (String) header.get("To");
                    String from = (String) header.get("From");
                    if (to != null && !to.isEmpty()) {
                        for (String recipient : to.split(",")) {
                            String s = recipient.trim();
                            if (s.length() > 0) {
                                edgeList.add(from + "|" + s);
                            }
                        }
                    }
                    return edgeList;
                }
            }
        );
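        // Standard counting pattern: map each "from|to" edge to an (edge, 1)
        // pair so the occurrences can be summed per key.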
        JavaPairRDD<String, Integer> pairs = edges.mapToPair(
            new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(final String s) {
                    return new Tuple2<String, Integer>(s, 1);
                }
            }
        );
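        // Sum the counts for each distinct sender/recipient pair.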
        JavaPairRDD<String, Integer> counts = pairs.reduceByKey(
            new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(final Integer a, final Integer b) {
                    return a + b;
                }
            }
        );

        // Create a separate Configuration for saving data back to MongoDB.
        Configuration outputConfig = new Configuration();
        outputConfig.set("mongo.output.uri",
                "mongodb://localhost:27017/enron_mail.message_pairs");

        // Save this RDD as a Hadoop "file".
        // The path argument is unused; all documents will go to "mongo.output.uri".
        counts.saveAsNewAPIHadoopFile(
                "file:///this-is-completely-unused",
                Object.class,
                BSONObject.class,
                MongoOutputFormat.class,
                outputConfig
        );
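        // Stop the context so the application releases its cluster resources.
        sc.stop();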
    }

    public static void main(final String[] args) {
        new Enron().run();
    }
}