package edu.isi.karma.mapreduce.tripleparser;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Configures and runs the MapReduce job that parses triples with
 * {@link TripleMapper} and {@link TripleReducer}.
 */
public class TripleProcessor {

    private static final Logger logger = LoggerFactory.getLogger(TripleProcessor.class);

    private TripleProcessor() {
    }

    public static void parseTriples(Configuration conf, Path input, Path output) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        try {
            Job job = Job.getInstance(conf, "parse triples");
            // Locate the job jar via a class that ships in the same module.
            job.setJarByClass(Neo4jCSVGenerator.class);
            job.setMapperClass(TripleMapper.class);
            job.setReducerClass(TripleReducer.class);
            job.setNumReduceTasks(6);

            // Map output: the triple as the key, no value.
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(NullWritable.class);

            // Final output: Text key/value pairs written as a sequence file.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);

            // Remove output from a previous run so the job can start cleanly.
            if (fs.exists(output)) {
                fs.delete(output, true);
            }

            job.setInputFormatClass(KeyValueTextInputFormat.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);
            FileInputFormat.addInputPath(job, input);
            FileOutputFormat.setOutputPath(job, output);
            job.waitForCompletion(true);
        } catch (IOException e) {
            logger.error("I/O exception loading data sources", e);
        } catch (InterruptedException e) {
            logger.error("Hadoop job interrupted", e);
            // Restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
        } catch (ClassNotFoundException e) {
            logger.error("Cannot find mapper/reducer class", e);
        }
    }
}
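
// A minimal driver sketch, not part of the original source: it shows one way
// parseTriples() might be invoked from the command line. The class name
// TripleProcessorDriver and the argument layout (input path, output path)
// are assumptions for illustration only.
class TripleProcessorDriver {

    public static void main(String[] args) throws IOException {
        if (args.length != 2) {
            System.err.println("Usage: TripleProcessorDriver <input path> <output path>");
            System.exit(2);
        }
        // A default Configuration picks up core-site.xml/hdfs-site.xml from the classpath.
        Configuration conf = new Configuration();
        TripleProcessor.parseTriples(conf, new Path(args[0]), new Path(args[1]));
    }
}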