package ldbc.snb.datagen.hadoop;

import ldbc.snb.datagen.serializer.PersonSerializer;
import ldbc.snb.datagen.serializer.UpdateEventSerializer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPOutputStream;

/**
 * Created by aprat on 10/15/14.
 *
 * Runs a single-reducer MapReduce job that writes one sorted update stream
 * partition to a CSV file (optionally gzip-compressed), named
 * updateStream_&lt;reducerId&gt;_&lt;partitionId&gt;_&lt;streamType&gt;.csv[.gz].
 */
public class HadoopUpdateStreamSerializer {

    public static class HadoopUpdateStreamSerializerReducer
            extends Reducer<LongWritable, Text, LongWritable, Text> {

        private int reducerId;                           /** The id of the reducer. **/
        private PersonSerializer personSerializer_;      /** The person serializer. **/
        private UpdateEventSerializer updateSerializer_;
        private OutputStream out;                        /** The output stream of the update stream file. **/

        @Override
        protected void setup(Context context) {
            Configuration conf = context.getConfiguration();
            reducerId = Integer.parseInt(conf.get("reducerId"));
            int partitionId = Integer.parseInt(conf.get("partitionId"));
            String streamType = conf.get("streamType");
            try {
                FileSystem fs = FileSystem.get(conf);
                String baseName = conf.get("ldbc.snb.datagen.serializer.socialNetworkDir")
                        + "/updateStream_" + reducerId + "_" + partitionId + "_" + streamType + ".csv";
                if (Boolean.parseBoolean(conf.get("ldbc.snb.datagen.serializer.compressed"))) {
                    out = new GZIPOutputStream(fs.create(new Path(baseName + ".gz")));
                } else {
                    out = fs.create(new Path(baseName));
                }
            } catch (Exception e) {
                System.err.println(e.getMessage());
                e.printStackTrace();
            }
        }

        @Override
        public void reduce(LongWritable key, Iterable<Text> valueSet, Context context)
                throws IOException, InterruptedException {
            // Values arrive sorted by key; append each serialized update event to the stream file.
            for (Text t : valueSet) {
                out.write(t.toString().getBytes(StandardCharsets.UTF_8));
            }
        }

        @Override
        protected void cleanup(Context context) {
            try {
                out.close();
            } catch (Exception e) {
                System.err.println(e.getMessage());
            }
        }
    }

    private Configuration conf;

    public HadoopUpdateStreamSerializer(Configuration conf) {
        this.conf = new Configuration(conf);
    }

    public void run(String inputFileName, int reducer, int partition, String type) throws Exception {
        conf.setInt("reducerId", reducer);
        conf.setInt("partitionId", partition);
        conf.set("streamType", type);

        Job job = Job.getInstance(conf, "Update Stream Serializer");
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setJarByClass(HadoopUpdateStreamSerializerReducer.class);
        job.setReducerClass(HadoopUpdateStreamSerializerReducer.class);
        job.setNumReduceTasks(1);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(inputFileName));
        FileOutputFormat.setOutputPath(job, new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"));
        if (!job.waitForCompletion(true)) {
            throw new Exception("HadoopUpdateStreamSerializer job failed");
        }

        // The reducer writes the stream file directly; the job's own output directory is only auxiliary and is removed here.
        try {
            FileSystem fs = FileSystem.get(conf);
            fs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"), true);
        } catch (IOException e) {
            System.err.println(e.getMessage());
        }
    }
}