package com.manning.hip.ch1;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
/**
 * MapReduce job that builds an inverted index: for every whitespace-delimited
 * token found in the input files it emits the token together with the
 * comma-separated set of file names in which that token occurs.
 */
public final class InvertedIndexMapReduce {

  /**
   * Command-line entry point.
   *
   * @param args one or more input paths followed by exactly one output path
   *             (the last argument)
   * @throws IllegalArgumentException if fewer than two arguments are supplied
   * @throws Exception if the job cannot be configured or does not succeed
   */
  public static void main(String... args) throws Exception {
    // At least one input plus the output path is required; fail with a
    // readable usage message instead of an index/range error further down.
    if (args.length < 2) {
      throw new IllegalArgumentException(
          "Usage: InvertedIndexMapReduce <input path>... <output path>");
    }
    int outputIndex = args.length - 1;
    runJob(Arrays.copyOfRange(args, 0, outputIndex), args[outputIndex]);
  }

  /**
   * Configures and runs the inverted-index job, blocking until it finishes.
   *
   * <p><b>Note:</b> the output path is deleted recursively before the job is
   * submitted, so any existing data under it is lost.
   *
   * @param input  input paths; joined with commas and handed to
   *               {@link FileInputFormat#setInputPaths(Job, String)}
   * @param output directory the job writes its results to
   * @throws IllegalArgumentException if {@code input} is null or empty
   * @throws IOException if the job completes unsuccessfully
   * @throws Exception on any other configuration or execution failure
   */
  public static void runJob(String[] input, String output)
      throws Exception {
    if (input == null || input.length == 0) {
      throw new IllegalArgumentException(
          "At least one input path is required");
    }

    Configuration conf = new Configuration();

    Job job = new Job(conf);
    job.setJarByClass(InvertedIndexMapReduce.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    // Declare the reducer's output types as well so the job configuration
    // matches what Reduce actually emits.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Path outputPath = new Path(output);

    FileInputFormat.setInputPaths(job, StringUtils.join(input, ","));
    FileOutputFormat.setOutputPath(job, outputPath);

    // Remove output left over from a previous run before submitting.
    outputPath.getFileSystem(conf).delete(outputPath, true);

    // waitForCompletion reports success as a boolean; surface a failure to
    // the caller instead of returning as if the job had succeeded.
    if (!job.waitForCompletion(true)) {
      throw new IOException(
          "Inverted index job failed; output path: " + output);
    }
  }

  /**
   * Emits one {@code (token, documentId)} pair per whitespace-delimited token
   * of each input line. The document id is the file name (final path
   * component) of the split being processed, so files that share a name in
   * different directories are treated as the same document.
   */
  public static class Map
      extends Mapper<LongWritable, Text, Text, Text> {

    /** File name of the current split; populated once in {@link #setup}. */
    private final Text documentId = new Text();

    /** Reusable output key, overwritten for every token. */
    private final Text word = new Text();

    /**
     * Captures the name of the file backing this task's input split.
     * The split is cast to {@link FileSplit}, so a file-based input format
     * is required.
     */
    @Override
    protected void setup(Context context) {
      String filename =
          ((FileSplit) context.getInputSplit()).getPath().getName();
      documentId.set(filename);
    }

    /**
     * Splits the line on whitespace and writes each token with the
     * current document id.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one line of input text
     * @param context sink for the emitted pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      for (String token : StringUtils.split(value.toString())) {
        word.set(token);
        context.write(word, documentId);
      }
    }
  }

  /**
   * Collapses all document ids seen for a token into a de-duplicated,
   * comma-separated list. The order of ids within the list is unspecified
   * (it follows {@link HashSet} iteration order).
   */
  public static class Reduce
      extends Reducer<Text, Text, Text, Text> {

    /** Reusable output value holding the joined document ids. */
    private final Text docIds = new Text();

    /**
     * @param key     the token
     * @param values  document ids emitted by the mappers for this token
     * @param context sink for the {@code (token, docId list)} pair
     */
    @Override
    public void reduce(Text key, Iterable<Text> values,
                       Context context)
        throws IOException, InterruptedException {
      HashSet<Text> uniqueDocIds = new HashSet<Text>();
      for (Text docId : values) {
        // Copy each value: Hadoop reuses the same Text instance across
        // iterations, so storing the reference would corrupt the set.
        uniqueDocIds.add(new Text(docId));
      }
      docIds.set(StringUtils.join(uniqueDocIds, ","));
      context.write(key, docIds);
    }
  }
}