package ldbc.snb.datagen.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;

/**
 * Created by aprat on 11/17/14.
 *
 * Rewrites a sequence file so that every record is re-keyed with a TupleKey
 * derived from its value by a pluggable {@link KeySetter} implementation.
 */
public class HadoopFileKeyChanger {

    /** Strategy for deriving the new key from a record's value. */
    public interface KeySetter<K> {
        K getKey(Object object);
    }

    private String keySetterName;
    private Configuration conf;
    private Class<?> K;
    private Class<?> V;

    public HadoopFileKeyChanger(Configuration conf, Class<?> K, Class<?> V, String keySetterName) {
        this.keySetterName = keySetterName;
        this.conf = conf;
        this.K = K;
        this.V = V;
    }

    public static class HadoopFileKeyChangerReducer<K, V> extends Reducer<K, V, TupleKey, V> {

        /** Instantiated reflectively from the "keySetterClassName" configuration entry. */
        private KeySetter<TupleKey> keySetter;

        @Override
        public void setup(Context context) {
            try {
                String className = context.getConfiguration().get("keySetterClassName");
                keySetter = (KeySetter<TupleKey>) Class.forName(className).newInstance();
            } catch (ClassNotFoundException | IllegalAccessException | InstantiationException e) {
                System.err.println(e.getMessage());
                e.printStackTrace();
            }
        }

        @Override
        public void reduce(K key, Iterable<V> valueSet, Context context)
                throws IOException, InterruptedException {
            // Re-emit every value under the key produced by the configured KeySetter.
            for (V v : valueSet) {
                context.write(keySetter.getKey(v), v);
            }
        }
    }

    public void run(String inputFileName, String outputFileName) throws Exception {
        int numThreads = conf.getInt("ldbc.snb.datagen.generator.numThreads", 1);
        System.out.println("Running HadoopFileKeyChanger with " + numThreads + " reduce tasks");
        conf.set("keySetterClassName", keySetterName);

        /** Single job that re-keys every record of the input sequence file. **/
        Job job = Job.getInstance(conf, "Changing keys of " + inputFileName);
        FileInputFormat.setInputPaths(job, new Path(inputFileName));
        FileOutputFormat.setOutputPath(job, new Path(outputFileName));
        job.setMapOutputKeyClass(K);
        job.setMapOutputValueClass(V);
        job.setOutputKeyClass(TupleKey.class);
        job.setOutputValueClass(V);
        job.setNumReduceTasks(numThreads);
        job.setReducerClass(HadoopFileKeyChangerReducer.class);
        job.setJarByClass(V);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        if (!job.waitForCompletion(true)) {
            throw new Exception("HadoopFileKeyChanger job failed for input " + inputFileName);
        }
    }
}
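
/*
 * Usage sketch (illustrative only, not part of the original source). The class
 * name ExampleKeySetter, the value type ExampleWritable, the getId() accessor,
 * and the HDFS paths are hypothetical; how a TupleKey is actually built from a
 * value depends on TupleKey's real API and on the KeySetter implementations
 * shipped with the datagen.
 *
 *   public class ExampleKeySetter implements HadoopFileKeyChanger.KeySetter<TupleKey> {
 *       public TupleKey getKey(Object object) {
 *           long id = ((ExampleWritable) object).getId();   // hypothetical accessor
 *           return new TupleKey(id, 0);                      // assumes such a constructor exists
 *       }
 *   }
 *
 *   Configuration conf = new Configuration();
 *   conf.setInt("ldbc.snb.datagen.generator.numThreads", 4);
 *   new HadoopFileKeyChanger(conf, LongWritable.class, ExampleWritable.class,
 *                            "ldbc.snb.datagen.hadoop.ExampleKeySetter")
 *       .run("hdfs:///tmp/keychanger/input", "hdfs:///tmp/keychanger/output");
 */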