package ldbc.snb.datagen.hadoop;

import ldbc.snb.datagen.dictionary.Dictionaries;
import ldbc.snb.datagen.generator.DatagenParams;
import ldbc.snb.datagen.generator.LDBCDatagen;
import ldbc.snb.datagen.objects.Knows;
import ldbc.snb.datagen.objects.Person;
import ldbc.snb.datagen.serializer.PersonSerializer;
import ldbc.snb.datagen.serializer.UpdateEventSerializer;
import ldbc.snb.datagen.vocabulary.SN;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;

/**
 * Hadoop job that serializes the generated persons and their knows edges. Persons and edges
 * created before the update threshold go to the static person serializer; later ones are
 * routed to the person update stream.
 *
 * Created by aprat on 10/15/14.
 */
public class HadoopPersonSerializer {

    public static class HadoopPersonSerializerReducer extends Reducer<TupleKey, Person, LongWritable, Person> {

        private int reducerId;                           /** The id of the reducer. **/
        private PersonSerializer personSerializer_;      /** The person serializer. **/
        private UpdateEventSerializer updateSerializer_; /** The person update stream serializer. **/

        @Override
        protected void setup(Context context) {
            Configuration conf = context.getConfiguration();
            reducerId = context.getTaskAttemptID().getTaskID().getId();
            LDBCDatagen.init(conf);
            try {
                // Instantiate the person serializer class configured by the driver.
                personSerializer_ = (PersonSerializer) Class.forName(conf.get("ldbc.snb.datagen.serializer.personSerializer")).newInstance();
                personSerializer_.initialize(conf, reducerId);
                if (DatagenParams.updateStreams) {
                    updateSerializer_ = new UpdateEventSerializer(conf,
                                                                  DatagenParams.hadoopDir + "/temp_updateStream_person_" + reducerId,
                                                                  reducerId,
                                                                  DatagenParams.numUpdatePartitions);
                }
            } catch (Exception e) {
                System.err.println(e.getMessage());
                e.printStackTrace();
            }
        }

        @Override
        public void reduce(TupleKey key, Iterable<Person> valueSet, Context context)
                throws IOException, InterruptedException {
            //SN.machineId = key.block;
            personSerializer_.reset();
            for (Person p : valueSet) {
                if (p.creationDate() < Dictionaries.dates.getUpdateThreshold() || !DatagenParams.updateStreams) {
                    // Person belongs to the static dataset.
                    personSerializer_.export(p);
                } else {
                    // Person is created after the update threshold: emit it as an update event.
                    updateSerializer_.export(p);
                    updateSerializer_.changePartition();
                }
                for (Knows k : p.knows()) {
                    if (k.creationDate() < Dictionaries.dates.getUpdateThreshold() || !DatagenParams.updateStreams) {
                        personSerializer_.export(p, k);
                    }
                }
            }
        }

        @Override
        protected void cleanup(Context context) {
            personSerializer_.close();
            if (DatagenParams.updateStreams) {
                updateSerializer_.close();
            }
        }
    }

    private Configuration conf;

    public HadoopPersonSerializer(Configuration conf) {
        this.conf = new Configuration(conf);
    }

    public void run(String inputFileName) throws Exception {

        FileSystem fs = FileSystem.get(conf);

        /*String rankedFileName = conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/ranked";
        HadoopFileRanker hadoopFileRanker = new HadoopFileRanker( conf, TupleKey.class, Person.class, null );
        hadoopFileRanker.run(inputFileName,rankedFileName);*/

        int numThreads = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.numThreads"));
        Job job = Job.getInstance(conf, "Person Serializer");
        //job.setMapOutputKeyClass(BlockKey.class);
        job.setMapOutputKeyClass(TupleKey.class);
        job.setMapOutputValueClass(Person.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Person.class);
        job.setJarByClass(HadoopBlockMapper.class);
        //job.setMapperClass(HadoopBlockMapper.class);
        job.setReducerClass(HadoopPersonSerializerReducer.class);
        job.setNumReduceTasks(numThreads);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setPartitionerClass(HadoopTuplePartitioner.class);
        /*job.setSortComparatorClass(BlockKeyComparator.class);
        job.setGroupingComparatorClass(BlockKeyGroupComparator.class);
        job.setPartitionerClass(HadoopBlockPartitioner.class);*/

        //FileInputFormat.setInputPaths(job, new Path(rankedFileName));
        FileInputFormat.setInputPaths(job, new Path(inputFileName));
        FileOutputFormat.setOutputPath(job, new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"));
        if (!job.waitForCompletion(true)) {
            throw new Exception("HadoopPersonSerializer job failed");
        }

        try {
            //fs.delete(new Path(rankedFileName), true);
            fs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"), true);
        } catch (IOException e) {
            System.err.println(e.getMessage());
        }
    }
}
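
// Usage sketch (not part of the original file): how a driver is assumed to invoke this job.
// The configuration keys below are the ones read by this class; the serializer class name and
// the paths are hypothetical placeholders, not values confirmed by this source.
//
//     Configuration conf = new Configuration();
//     conf.set("ldbc.snb.datagen.serializer.personSerializer",
//              "my.custom.PersonSerializerImpl");                     // hypothetical PersonSerializer subclass
//     conf.set("ldbc.snb.datagen.generator.numThreads", "4");         // number of reduce tasks
//     conf.set("ldbc.snb.datagen.serializer.hadoopDir", "/tmp/ldbc"); // scratch dir for the job's aux output
//     new HadoopPersonSerializer(conf).run("/tmp/ldbc/persons");      // hypothetical SequenceFile of persons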