package ldbc.snb.datagen.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.zip.GZIPOutputStream;

/**
 * Hadoop job that sorts update stream events and has each reducer serialize its
 * (reducer, partition) group into a CSV file, gzip-compressed when requested.
 *
 * Created by aprat on 10/15/14.
 */
public class HadoopUpdateStreamSorterAndSerializer {

    public static class HadoopUpdateStreamSorterAndSerializerReducer
            extends Reducer<UpdateEventKey, Text, UpdateEventKey, Text> {

        private boolean compressed = false;
        private Configuration conf;
        private String streamType;

        @Override
        protected void setup(Context context) {
            conf = context.getConfiguration();
            streamType = conf.get("streamType");
            try {
                compressed = Boolean.parseBoolean(conf.get("ldbc.snb.datagen.serializer.compressed"));
            } catch (Exception e) {
                System.err.println(e.getMessage());
            }
        }

        @Override
        public void reduce(UpdateEventKey key, Iterable<Text> valueSet, Context context)
                throws IOException, InterruptedException {
            try {
                FileSystem fs = FileSystem.get(conf);
                String baseName = conf.get("ldbc.snb.datagen.serializer.socialNetworkDir")
                        + "/updateStream_" + key.reducerId + "_" + key.partition + "_" + streamType;
                // Write the events of this (reducer, partition) group to their own output
                // file, gzip-compressed if requested through the configuration.
                OutputStream out;
                if (compressed) {
                    out = new GZIPOutputStream(fs.create(new Path(baseName + ".csv.gz")));
                } else {
                    out = fs.create(new Path(baseName + ".csv"));
                }
                for (Text t : valueSet) {
                    out.write(t.toString().getBytes(StandardCharsets.UTF_8));
                }
                out.close();
            } catch (Exception e) {
                System.err.println(e.getMessage());
            }
        }
    }

    private Configuration conf;

    public HadoopUpdateStreamSorterAndSerializer(Configuration conf) {
        this.conf = new Configuration(conf);
    }

    public void run(List<String> inputFileNames, String type) throws Exception {
        int numThreads = conf.getInt("ldbc.snb.datagen.generator.numThreads", 1);
        conf.set("streamType", type);

        Job job = Job.getInstance(conf, "Update Stream Serializer");
        job.setMapOutputKeyClass(UpdateEventKey.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(UpdateEventKey.class);
        job.setOutputValueClass(Text.class);
        job.setJarByClass(HadoopUpdateStreamSorterAndSerializerReducer.class);
        job.setReducerClass(HadoopUpdateStreamSorterAndSerializerReducer.class);
        job.setNumReduceTasks(numThreads);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        // Route events to reducers by key and group them so that each output file is
        // produced by a single reduce() call.
        job.setPartitionerClass(HadoopUpdateEventKeyPartitioner.class);
        job.setGroupingComparatorClass(UpdateEventKeyGroupComparator.class);
        //job.setSortComparatorClass(UpdateEventKeySortComparator.class);

        for (String s : inputFileNames) {
            FileInputFormat.addInputPath(job, new Path(s));
        }
        // The reducer writes the real output itself; the job output path is only auxiliary
        // and is deleted once the job finishes.
        FileOutputFormat.setOutputPath(job, new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"));
        if (!job.waitForCompletion(true)) {
            throw new Exception("Error executing HadoopUpdateStreamSorterAndSerializer job");
        }

        try {
            FileSystem fs = FileSystem.get(conf);
            fs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"), true);
        } catch (IOException e) {
            System.err.println(e.getMessage());
        }
    }
}
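
/*
 * Usage sketch (not part of the original class; the file lists and stream type labels
 * below are assumptions for illustration): a driver would construct the wrapper with the
 * datagen Configuration and run it once per update stream type over the sequence files
 * written by the earlier generation steps.
 *
 *   HadoopUpdateStreamSorterAndSerializer sorter =
 *           new HadoopUpdateStreamSorterAndSerializer(conf);
 *   sorter.run(personStreamFileNames, "person"); // hypothetical file list and type label
 *   sorter.run(forumStreamFileNames, "forum");   // hypothetical file list and type label
 */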