package com.mongodb.hadoop.examples.shakespeare;

import com.mongodb.hadoop.GridFSInputFormat;
import com.mongodb.hadoop.MongoOutputFormat;
import com.mongodb.hadoop.input.GridFSSplit;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.util.MongoConfigUtil;
import com.mongodb.hadoop.util.MongoTool;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.ToolRunner;
import org.bson.BSONObject;
import org.bson.BasicBSONObject;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * MapReduce job that counts the most common exclamations in Shakespeare's
 * complete works.
 *
 * <p>Input is read from GridFS and split into sentences; the mapper emits each
 * sentence that qualifies as an exclamation keyed by its text, and the reducer
 * writes one document per exclamation that occurs at least
 * {@link #MIN_OCCURRENCES} times.
 */
public class Shakespeare extends MongoTool {

    /** A sentence with at most this many words counts as a brief exclamation. */
    public static final int MAX_EXCLAMATION_WORDS = 3;

    /** Exclamations seen fewer times than this are dropped by the reducer. */
    public static final int MIN_OCCURRENCES = 5;

    /** Word separator shared by the mapper and the reducer; compiled once. */
    private static final Pattern WHITESPACE = Pattern.compile("[\r\n\t ]+");

    /** Matches ASCII punctuation glued to the start or end of a word. */
    private static final Pattern EDGE_PUNCTUATION =
        Pattern.compile("^\\p{Punct}+|\\p{Punct}+$");

    /**
     * Builds the job configuration: GridFS input from
     * {@code mongo_hadoop.fs}, output to {@code mongo_hadoop.shakespeare.out},
     * both on {@code localhost:27017}.
     */
    public Shakespeare() {
        JobConf conf = new JobConf(new Configuration());
        if (MongoTool.isMapRedV1()) {
            // TODO: no input/output format is configured for the MapReduce v1
            // API yet, so the job is only fully set up on the v2 API.
        } else {
            MongoConfigUtil.setInputFormat(conf, GridFSInputFormat.class);
            MongoConfigUtil.setOutputFormat(conf, MongoOutputFormat.class);
        }
        MongoConfigUtil.setInputURI(
            conf, "mongodb://localhost:27017/mongo_hadoop.fs");
        // End-of-sentence punctuation with lookbehind, to keep delimiter.
        MongoConfigUtil.setGridFSDelimiterPattern(conf, "(?<=[.?!])");
        MongoConfigUtil.setMapper(conf, ShakespeareMapper.class);
        MongoConfigUtil.setMapperOutputKey(conf, Text.class);
        MongoConfigUtil.setMapperOutputValue(conf, Text.class);
        MongoConfigUtil.setReducer(conf, ShakespeareReducer.class);
        MongoConfigUtil.setOutputKey(conf, NullWritable.class);
        MongoConfigUtil.setOutputValue(conf, BSONWritable.class);
        MongoConfigUtil.setOutputURI(
            conf, "mongodb://localhost:27017/mongo_hadoop.shakespeare.out");
        setConf(conf);
    }

    /**
     * Runs the job through {@link ToolRunner} and exits the JVM with the
     * status code it returns.
     *
     * @param args command-line arguments forwarded to {@link ToolRunner}
     * @throws Exception if the tool fails to run
     */
    public static void main(final String[] args) throws Exception {
        System.exit(ToolRunner.run(new Shakespeare(), args));
    }

    /**
     * Emits {@code (exclamation, work title)} for every input sentence that
     * qualifies as an exclamation.
     */
    static class ShakespeareMapper
        extends Mapper<NullWritable, Text, Text, Text> {

        /** Lower-case second-person pronouns marking a sentence as addressed to the listener. */
        private static final Set<String> SECOND_PERSON_PRONOUNS =
            Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(
                "you", "your", "yours", "ye",
                "thou", "thy", "thine", "thee")));

        // Output objects are reused across map() calls instead of being
        // allocated per record.
        private final Text exclamation;
        private final Text foundIn;

        public ShakespeareMapper() {
            super();
            exclamation = new Text();
            foundIn = new Text();
        }

        /**
         * Decides whether a sentence should be emitted.
         *
         * @param test the trimmed, lower-cased sentence
         * @return {@code true} if the sentence ends with {@code "!"} and
         *         either contains a second-person pronoun or has at most
         *         {@link #MAX_EXCLAMATION_WORDS} words
         */
        private boolean isExclamation(final String test) {
            // Exclamations must end!
            if (!test.endsWith("!")) {
                return false;
            }

            String[] words = WHITESPACE.split(test);
            // We figure the most entertaining exclamations will be directed at
            // the listener.
            for (String word : words) {
                // Words are split on whitespace only, so punctuation is still
                // attached ("you!", "thee,"); strip it before the lookup or
                // the trailing word of every exclamation could never match.
                String bare = EDGE_PUNCTUATION.matcher(word).replaceAll("");
                if (SECOND_PERSON_PRONOUNS.contains(bare)) {
                    return true;
                }
            }

            // Exclamations be brief!
            return words.length <= MAX_EXCLAMATION_WORDS;
        }

        /**
         * Normalizes one sentence and, if it is an exclamation, writes it
         * keyed by its text with the title of the work as the value.
         *
         * @param key unused
         * @param value one sentence as delimited by the input format
         * @param context the task context; its input split supplies the title
         * @throws IOException if decoding or writing fails
         * @throws InterruptedException if the write is interrupted
         */
        @Override
        protected void map(final NullWritable key, final Text value,
                           final Context context)
            throws IOException, InterruptedException {
            GridFSSplit gridSplit = (GridFSSplit) context.getInputSplit();
            // The title of the work is emitted as the value for each
            // exclamation found in it.
            String workTitle = (String) gridSplit.get("filename");

            // Only the first getLength() bytes of the backing array are valid;
            // decoding the whole array can pick up stale bytes from a previous,
            // longer record. Locale.ROOT keeps lower-casing independent of the
            // JVM's default locale.
            String sentence =
                Text.decode(value.getBytes(), 0, value.getLength())
                    .trim()
                    .toLowerCase(Locale.ROOT);

            if (isExclamation(sentence)) {
                foundIn.set(workTitle);
                exclamation.set(sentence);
                context.write(exclamation, foundIn);
            }
        }
    }

    /**
     * Tallies, per exclamation, how often it occurs in each work and writes a
     * summary document for exclamations that occur often enough.
     */
    static class ShakespeareReducer
        extends Reducer<Text, Text, NullWritable, BSONWritable> {

        // Reused across reduce() calls; its document is replaced before each
        // write.
        private final BSONWritable bsonWritable;

        public ShakespeareReducer() {
            super();
            bsonWritable = new BSONWritable();
        }

        /**
         * Counts occurrences of one exclamation per work title and, when the
         * total reaches {@link #MIN_OCCURRENCES}, writes a document with the
         * fields {@code totalCount}, {@code exclamation}, {@code counts} and
         * {@code wordCount}.
         *
         * @param key the exclamation text
         * @param values one work title per occurrence of the exclamation
         * @param context the task context used to write the result
         * @throws IOException if writing fails
         * @throws InterruptedException if the write is interrupted
         */
        @Override
        protected void reduce(final Text key, final Iterable<Text> values,
                              final Context context)
            throws IOException, InterruptedException {
            Map<String, Integer> foundInMap = new HashMap<String, Integer>();
            int totalCount = 0;
            for (Text foundIn : values) {
                String title = foundIn.toString();
                Integer seen = foundInMap.get(title);
                foundInMap.put(title, seen == null ? 1 : seen + 1);
                ++totalCount;
            }

            if (totalCount < MIN_OCCURRENCES) {
                return;
            }

            String exclamation = key.toString();
            BSONObject result = new BasicBSONObject("totalCount", totalCount);
            result.put("exclamation", exclamation);
            // NOTE(review): work titles become field names of the "counts"
            // sub-document; confirm they are always valid keys for the target
            // server (e.g. contain no '.').
            result.put("counts", foundInMap);
            result.put("wordCount", WHITESPACE.split(exclamation).length);
            bsonWritable.setDoc(result);
            context.write(NullWritable.get(), bsonWritable);
        }
    }
}