package com.mongodb.spark.examples.enron; import com.mongodb.hadoop.MongoInputFormat; import org.apache.hadoop.conf.Configuration; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.SQLContext; import org.bson.BSONObject; import scala.Tuple2; public class DataframeExample { public void run() { JavaSparkContext sc = new JavaSparkContext(new SparkConf()); // Set configuration options for the MongoDB Hadoop Connector. Configuration mongodbConfig = new Configuration(); // MongoInputFormat allows us to read from a live MongoDB instance. // We could also use BSONFileInputFormat to read BSON snapshots. mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat"); // MongoDB connection string naming a collection to use. // If using BSON, use "mapred.input.dir" to configure the directory // where BSON files are located instead. mongodbConfig.set("mongo.input.uri", "mongodb://localhost:27017/enron_mail.messages"); // Create an RDD backed by the MongoDB collection. JavaPairRDD<Object, BSONObject> documents = sc.newAPIHadoopRDD( mongodbConfig, // Configuration MongoInputFormat.class, // InputFormat: read from a live cluster. Object.class, // Key class BSONObject.class // Value class ); JavaRDD<Message> messages = documents.map( new Function<Tuple2<Object, BSONObject>, Message>() { public Message call(final Tuple2<Object, BSONObject> tuple) { Message m = new Message(); BSONObject header = (BSONObject) tuple._2().get("headers"); m.setTo((String) header.get("To")); m.setxFrom((String) header.get("From")); m.setMessageID((String) header.get("Message-ID")); m.setBody((String) tuple._2().get("body")); return m; } } ); SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc); DataFrame messagesSchema = sqlContext.createDataFrame(messages, Message.class); messagesSchema.registerTempTable("messages"); DataFrame ericsMessages = sqlContext.sql( "SELECT to, body FROM messages WHERE to = \"eric.bass@enron.com\""); ericsMessages.show(); messagesSchema.printSchema(); } public static void main(final String[] args) { new DataframeExample().run(); } }