package com.github.projectflink.spark;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

/**
 * Word count implemented with groupByKey followed by a manual sum,
 * rather than the usual reduceByKey aggregation.
 *
 * Arguments: &lt;master&gt; &lt;input path&gt; &lt;output path&gt; &lt;number of partitions&gt;
 */
public class WordCountGrouping {

	public static void main(String[] args) {
		String master = args[0];
		String inFile = args[1];
		String outFile = args[2];
		int numParts = Integer.valueOf(args[3]);

		System.out.println("Starting spark with master=" + master + " in=" + inFile
				+ " out=" + outFile + " numParts=" + numParts + " length=" + args.length);

		SparkConf conf = new SparkConf().setAppName("WordCountGrouping").setMaster(master);
		JavaSparkContext sc = new JavaSparkContext(conf);

		JavaRDD<String> file = sc.textFile(inFile);

		// Split each line into lowercase tokens on non-word characters.
		JavaRDD<String> words = file.flatMap(new FlatMapFunction<String, String>() {
			@Override
			public Iterable<String> call(String s) {
				return Arrays.asList(s.toLowerCase().split("\\W+"));
			}
		});

		// Pair each word with a count of 1, dropping the empty tokens
		// that split() produces for leading separators.
		JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
			@Override
			public Tuple2<String, Integer> call(String s) {
				return new Tuple2<String, Integer>(s, 1);
			}
		}).filter(new Function<Tuple2<String, Integer>, Boolean>() {
			@Override
			public Boolean call(Tuple2<String, Integer> v1) throws Exception {
				return !v1._1.equals("");
			}
		});

		// Group all 1s for a word at the requested parallelism, then sum them explicitly.
		JavaPairRDD<String, Iterable<Integer>> countsGrouped = pairs.groupByKey(numParts);
		JavaPairRDD<String, Integer> counts = countsGrouped.mapToPair(
				new PairFunction<Tuple2<String, Iterable<Integer>>, String, Integer>() {
			@Override
			public Tuple2<String, Integer> call(Tuple2<String, Iterable<Integer>> group) throws Exception {
				int count = 0;
				for (Integer a : group._2()) {
					count += a;
				}
				return new Tuple2<String, Integer>(group._1(), count);
			}
		});

		counts.saveAsTextFile(outFile);

		sc.stop();
	}
}
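/*
 * For comparison only: groupByKey ships every (word, 1) pair across the
 * shuffle and materializes the full Iterable per key before summing. A
 * reduceByKey version pre-aggregates partial counts within each partition
 * before shuffling, which is usually much cheaper. Below is a minimal
 * sketch of that variant; the class and method names are illustrative
 * and not part of the original benchmark.
 */
class WordCountReducingSketch {

	static JavaPairRDD<String, Integer> count(JavaPairRDD<String, Integer> pairs, int numParts) {
		// Sum counts per word; Spark combines map-side before the shuffle.
		return pairs.reduceByKey(
				new org.apache.spark.api.java.function.Function2<Integer, Integer, Integer>() {
			@Override
			public Integer call(Integer a, Integer b) {
				return a + b;
			}
		}, numParts);
	}
}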