package edu.isi.karma.mapreduce.driver;
import java.io.File;
import java.io.FileInputStream;
import java.security.InvalidParameterException;
import java.util.Properties;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import edu.isi.karma.rdf.CommandLineArgumentParser;
import edu.isi.karma.rdf.OfflineRdfGenerator;
public class JSONIdentityReducerProcessor extends Configured implements Tool {
public Job configure(Properties p ) throws Exception
{
Configuration conf = getConf();
Job job = Job.getInstance(conf);
if(p.getProperty("file.type").equalsIgnoreCase("JL"))
{
job.setInputFormatClass(TextInputFormat.class);
}
else
{
job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
}
job.setJarByClass(JSONIdentityReducerProcessor.class);
job.setMapperClass(IdentityJSONMapper.class);
job.setCombinerClass(JSONReducer.class);
if(p.getProperty("file.type").equalsIgnoreCase("JL"))
{
job.setReducerClass(ValueOnlyJSONReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputFormatClass(TextOutputFormat.class);
}
else
{
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setReducerClass(JSONReducer.class);
job.setOutputKeyClass(Text.class);
}
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputValueClass(Text.class);
String[] paths = p.getProperty("input.directory").split(",");
Path[] array = new Path[paths.length];
int i = 0;
for (String path : paths) {
array[i++] = new Path(path);
}
FileInputFormat.setInputPaths(job, array);
FileOutputFormat.setOutputPath(job, new Path(p.getProperty("output.directory")));
job.setNumReduceTasks(1);
return job;
}
public int run(String[] args) throws Exception {
Properties p = new Properties();
if(args.length > 1)
{
Options options = createCommandLineOptions();
CommandLine cl = CommandLineArgumentParser.parse(args, options, OfflineRdfGenerator.class.getSimpleName());
if(cl == null)
{
return -1;
}
try {
if(cl.getOptionValue("filetype") == null)
{
throw new InvalidParameterException("Missing argument --filetype");
}
if(cl.getOptionValue("inputdirectory") == null)
{
throw new InvalidParameterException("Missing argument --inputdirectory");
}
if(cl.getOptionValue("outputdirectory") == null)
{
throw new InvalidParameterException("Missing argument --outputdirectory");
}
p.setProperty("fs.default.name", cl.getOptionValue("fsdefaultname", "file:///"));
p.setProperty("mapred.job.tracker", cl.getOptionValue("mapredjobtracker", "local"));
p.setProperty("input.directory", cl.getOptionValue("inputdirectory"));
p.setProperty("output.directory", cl.getOptionValue("outputdirectory"));
p.setProperty("file.type", cl.getOptionValue("filetype", "SEQ"));
}
catch(Exception e )
{
System.err.println("Invalid arguments: " + e.getMessage());
return -1;
}
}
else
{
p.load(new FileInputStream(new File(args[0])));
}
Job job = configure(p);
if(!job.waitForCompletion(false))
{
System.err.println("Unable to finish job");
return -1;
}
return 0;
}
public static void main(String[] args) throws Exception {
System.exit(ToolRunner.run(new Configuration(), new JSONIdentityReducerProcessor(), args));
}
private static Options createCommandLineOptions() {
Options options = new Options();
options.addOption(new Option("inputdirectory", "inputdirectory", true, "input directory"));
options.addOption(new Option("outputdirectory","outputdirectory", true, "output directory"));
options.addOption(new Option("filetype","filetype",true, "file type. one of [SEQ, JL]"));
options.addOption(new Option("fsdefaultname","fsdefaultname",true, "file system name for hdfs"));
options.addOption(new Option("mapredjobtracker","mapredjobtracker",true, "location of mapreduce job tracker"));
options.addOption(new Option("help", "help", false, "print this message"));
return options;
}
}