package nlp.com.knowledgebooks.mapreduce;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import nlp.com.knowledgebooks.nlp.util.ScoredList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import nlp.com.knowledgebooks.nlp.ExtractNames;
/**
* Created by IntelliJ IDEA.
* User: markw
* <p/>
* This is an example Hadoop Map/Reduce application derived from Apache Hadoop examples.
* It reads the input files, breaks each line into words, checks to see if words are part
* of proper (human) names and counts them. The output is a sorted list of human names
* that occur some minimum number of times in the input files.
* <p/>
* To run:
* <p/>
* bin/hadoop jar namefinder.jar namefinder [-m <i>maps</i>] [-r <i>reduces</i>] <i>in-dir</i> <i>out-dir</i>
* <p/>
* Copyright 2002-2013 by Mark Watson. All rights reserved.
* <p/>
* This software is not public domain. It can be legally
* used under the following licenses: LGPL version 3 or Apache 2
* <p/>
*/
public class NameFinder extends Configured implements Tool {
private static ExtractNames extractNames = new ExtractNames();
/**
* Finds human names and emits them with the document name that they are in.
*/
public static class MapClass extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
private Text human_name = new Text();
private Text doc = new Text();
public void map(LongWritable key, Text value,
OutputCollector<Text, Text> output,
Reporter reporter) throws IOException {
String line = value.toString();
System.err.println("NameFInder: map: key="+key+" line="+line);
int index = line.indexOf(" ");
if (index > -1) {
String doc2 = line.substring(0, index);
System.err.println("NameFInder: map: doc2="+doc2);
doc.set(doc2);
ScoredList [] names_scored_list = extractNames.getProperNames(line.substring(index));
for (ScoredList name_sc : names_scored_list) {
for (String name : name_sc.strings) {
Text human_name = new Text();
human_name.set(name);
output.collect(human_name, doc);
}
}
}
}
}
/**
* A reducer class that just emits the sum of the input values.
*/
public static class Reduce extends MapReduceBase
implements Reducer<Text, Text, Text, Text> {
public void reduce(Text person_name, Iterator<Text> documents,
OutputCollector<Text, Text> output,
Reporter reporter) throws IOException {
String person = person_name.toString();
List<String> doc_list = new ArrayList<String>();
while (documents.hasNext()) {
Text document = documents.next();
String document_str = document.toString();
if (document_str.substring(0, 1).equals("["))
document_str = document_str.substring(1, document_str.length() - 1);
doc_list.add(document_str);
}
output.collect(new Text(person), new Text(doc_list.toString()));
}
}
/**
* The main driver for name finder map/reduce program.
* <p/>
* NOTE: copied with modifications from Hadoppjava example programs
* <p/>
* Invoke this method to submit the map/reduce job.
*
* @throws IOException When there is communication problems with the
* job tracker.
*/
public int run(String[] args) throws Exception {
JobConf conf = new JobConf(getConf(), NameFinder.class);
conf.setJobName("namefinder");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(Text.class);
conf.setMapperClass(MapClass.class);
//conf.setCombinerClass(Reduce.class);
conf.setReducerClass(Reduce.class);
List<String> other_args = new ArrayList<String>();
for (int i = 0; i < args.length; ++i) {
try {
if ("-m".equals(args[i])) {
conf.setNumMapTasks(Integer.parseInt(args[++i]));
} else if ("-r".equals(args[i])) {
conf.setNumReduceTasks(Integer.parseInt(args[++i]));
} else {
other_args.add(args[i]);
}
} catch (Exception ex) {
System.err.println("ERROR: " + ex);
}
}
FileInputFormat.setInputPaths(conf, other_args.get(0));
FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new NameFinder(), args);
System.exit(res);
}
}