/*
 * Copyright 2015 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.hpg.bigdata.tools.alignment.stats;

import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.ga4gh.models.LinearAlignment;
import org.ga4gh.models.ReadAlignment;
import org.opencb.hpg.bigdata.tools.utils.ChunkKey;

import java.io.IOException;
import java.io.OutputStream;

/**
 * MapReduce job that sorts Avro-encoded GA4GH ReadAlignment records by
 * reference name and position, using ChunkKey as the intermediate sort key.
 */
public class ReadAlignmentSortMR {

    public static class ReadAlignmentSortMapper extends
            Mapper<AvroKey<ReadAlignment>, NullWritable, ChunkKey, AvroValue<ReadAlignment>> {

        @Override
        public void map(AvroKey<ReadAlignment> key, NullWritable value, Context context)
                throws IOException, InterruptedException {
            ChunkKey newKey;
            LinearAlignment la = (LinearAlignment) key.datum().getAlignment();
            if (la == null) {
                // Unmapped read: group under the conventional "*" reference name
                newKey = new ChunkKey("*", 0);
            } else {
                newKey = new ChunkKey(la.getPosition().getReferenceName().toString(),
                        la.getPosition().getPosition().intValue());
            }
            context.write(newKey, new AvroValue<>(key.datum()));
        }
    }

    public static class ReadAlignmentSortReducer extends
            Reducer<ChunkKey, AvroValue<ReadAlignment>, AvroKey<ReadAlignment>, NullWritable> {

        @Override
        public void reduce(ChunkKey key, Iterable<AvroValue<ReadAlignment>> values, Context context)
                throws IOException, InterruptedException {
            // Keys arrive sorted by reference name and position; emit the
            // alignments in that order
            for (AvroValue<ReadAlignment> value : values) {
                context.write(new AvroKey<>(value.datum()), NullWritable.get());
            }
        }
    }

    public static int run(String input, String output) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "ReadAlignmentSortMR");
        job.setJarByClass(ReadAlignmentSortMR.class);

        // input
        AvroJob.setInputKeySchema(job, ReadAlignment.SCHEMA$);
        FileInputFormat.setInputPaths(job, new Path(input));
        job.setInputFormatClass(AvroKeyInputFormat.class);

        // output
        FileOutputFormat.setOutputPath(job, new Path(output));
        job.setOutputFormatClass(AvroKeyOutputFormat.class);

        // mapper
        job.setMapperClass(ReadAlignmentSortMapper.class);
        AvroJob.setMapOutputValueSchema(job, ReadAlignment.SCHEMA$);
        job.setMapOutputKeyClass(ChunkKey.class);

        // reducer
        job.setReducerClass(ReadAlignmentSortReducer.class);
        AvroJob.setOutputKeySchema(job, ReadAlignment.SCHEMA$);
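
        // Job.getInstance(conf, ...) takes a private copy of the Configuration, so
        // any properties set on 'conf' after this point are invisible to the job;
        // compression is therefore configured through job.getConfiguration() below.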
        // Compress map output and job output with Snappy (current mapreduce.*
        // property names; the older mapred.* names are deprecated)
        job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
        job.getConfiguration().set("mapreduce.map.output.compress.codec",
                "org.apache.hadoop.io.compress.SnappyCodec");
        job.getConfiguration().setBoolean("mapreduce.output.fileoutputformat.compress", true);
        // AvroKeyOutputFormat writes an Avro container file and picks its codec
        // from the Avro output codec setting ("snappy" is the Avro codec name)
        job.getConfiguration().set(AvroJob.CONF_OUTPUT_CODEC, "snappy");

        if (!job.waitForCompletion(true)) {
            return 1;
        }

        // copy the header to the output folder
        FileSystem fs = FileSystem.get(conf);

        // read the companion header file stored next to the input
        Path srcHeaderPath = new Path(input + ".header");
        FileStatus status = fs.getFileStatus(srcHeaderPath);
        byte[] data = new byte[(int) status.getLen()];
        FSDataInputStream dis = fs.open(srcHeaderPath);
        dis.readFully(data);   // read() may return fewer bytes than requested
        dis.close();

        // write header
        OutputStream os = fs.create(new Path(output + "/part-r-00000.avro.header"));
        os.write(data);
        os.close();

        return 0;
    }
}
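
/*
 * Usage sketch (hypothetical driver; the HDFS paths are placeholders):
 *
 *   int exitCode = ReadAlignmentSortMR.run(
 *           "hdfs:///user/example/reads.avro",      // Avro-encoded ReadAlignment records
 *           "hdfs:///user/example/reads.sorted");   // output directory, must not pre-exist
 *   System.exit(exitCode);
 *
 * A companion header file named "<input>.header" is expected next to the input;
 * after a successful job it is copied to "<output>/part-r-00000.avro.header".
 */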