package thinkbig;
import java.util.StringTokenizer;
import org.apache.hadoop.util.GenericOptionsParser;
import tap.CommandOptions;
import tap.Pipe;
import tap.Tap;
import tap.TapMapper;
import tap.TapReducer;
import thinkbig.examples.messages.*;
/**
 * Classic word-count pipeline built on the Tap map/reduce API.
 *
 * <p>The mapper tokenizes each input line into (word, 1) records; records are
 * grouped by the {@code word} field and the reducer sums the counts per word.
 */
public class WordCount {

    /**
     * Entry point: parses the standard Tap command-line options and wires up a
     * single phase reading {@code o.input}, mapping, grouping by "word", and
     * reducing into {@code o.output}.
     *
     * @param args command-line arguments handed to {@link CommandOptions}
     * @throws Exception if pipeline construction or submission fails
     */
    public static void main(String[] args) throws Exception {
        CommandOptions o = new CommandOptions(args);
        /* Set up a basic pipeline of map reduce */
        Tap wordcount = new Tap(o).named("wordcount");
        /* Parse options - just use the standard options - input and output location, time window, etc. */
        if (o.input == null) {
            System.err.println("Must specify input directory");
            return;
        }
        if (o.output == null) {
            System.err.println("Must specify output directory");
            return;
        }
        // Ship the jar containing this class so the cluster can load the mapper/reducer.
        wordcount.getConf().setJarByClass(WordCount.class);
        wordcount.createPhase().reads(o.input).writes(o.output).map(Mapper.class)
                .groupBy("word").reduce(Reducer.class);
        wordcount.make();
    }

    /** Splits each input line on whitespace and emits one (word, 1) record per token. */
    public static class Mapper extends TapMapper<String, Wordcountmsg.WordCountRecord> {
        @Override
        public void map(String line, Pipe<Wordcountmsg.WordCountRecord> out) {
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                // Protobuf messages are immutable: build a fresh record per token
                // rather than reusing a shared mutable field.
                out.put(Wordcountmsg.WordCountRecord.newBuilder()
                        .setWord(tokenizer.nextToken())
                        .setCount(1)
                        .build());
            }
        }
    }

    /** Sums the counts of all records sharing a word and emits a single total record. */
    public static class Reducer
            extends TapReducer<Wordcountmsg.WordCountRecord, Wordcountmsg.WordCountRecord> {
        @Override
        public void reduce(Pipe<Wordcountmsg.WordCountRecord> in,
                Pipe<Wordcountmsg.WordCountRecord> out) {
            int count = 0;
            String word = null;
            for (Wordcountmsg.WordCountRecord rec : in) {
                word = rec.getWord();
                // Sum incoming counts instead of counting records: the original
                // count++ only worked because the mapper always emits count=1 and
                // would produce wrong totals if this class were used as a combiner
                // or fed pre-aggregated records.
                count += rec.getCount();
            }
            if (word == null) {
                // Empty group: nothing to emit (setWord(null) would throw NPE).
                // NOTE(review): the framework presumably never invokes reduce on an
                // empty group, but the guard costs nothing — confirm against Tap docs.
                return;
            }
            out.put(Wordcountmsg.WordCountRecord.newBuilder()
                    .setWord(word)
                    .setCount(count)
                    .build());
        }
    }
}