/* * Copyright 2015 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.hpg.bigdata.tools.alignment; import htsjdk.samtools.*; import htsjdk.samtools.seekablestream.SeekableStream; import org.apache.avro.mapred.AvroKey; import org.apache.avro.mapred.AvroValue; import org.apache.avro.mapreduce.AvroJob; import org.apache.avro.mapreduce.AvroKeyOutputFormat; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.ga4gh.models.ReadAlignment; import org.opencb.hpg.bigdata.core.converters.SAMRecord2ReadAlignmentConverter; import org.opencb.hpg.bigdata.tools.utils.ChunkKey; import org.opencb.hpg.bigdata.tools.utils.CompressionUtils; import org.seqdoop.hadoop_bam.AnySAMInputFormat; import org.seqdoop.hadoop_bam.SAMRecordWritable; import org.seqdoop.hadoop_bam.util.WrapSeekable; import java.io.BufferedWriter; import java.io.IOException; import java.io.OutputStreamWriter; public class Bam2AvroMR { public static final String ADJUST_QUALITY = "adjustQuality"; public static class Bam2GaMapper extends Mapper<LongWritable, SAMRecordWritable, AvroKey<ReadAlignment>, NullWritable> { private SAMRecord2ReadAlignmentConverter samRecord2ReadAlignmentConverter; @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); boolean adjustQuality = context.getConfiguration().getBoolean(ADJUST_QUALITY, false); samRecord2ReadAlignmentConverter = new SAMRecord2ReadAlignmentConverter(adjustQuality); } @Override public void map(LongWritable key, SAMRecordWritable value, Context context) throws IOException, InterruptedException { // ChunkKey newKey; SAMRecord sam = value.get(); if (sam.getReadUnmappedFlag()) { System.out.println("Empty block"); // do nothing // newKey = new ChunkKey(new String("*"), (long) 0); } else { // long start_chunk = sam.getAlignmentStart() / RegionDepth.CHUNK_SIZE; // long end_chunk = sam.getAlignmentEnd() / RegionDepth.CHUNK_SIZE; // newKey = new ChunkKey(sam.getReferenceName(), start_chunk); SAMRecord2ReadAlignmentConverter converter = samRecord2ReadAlignmentConverter; ReadAlignment readAlignment = converter.forward(sam); AvroKey<ReadAlignment> newKey = new AvroKey<>(readAlignment); //context.write(newKey, value); context.write(newKey, NullWritable.get()); } } } public static class Bam2GaReducer extends Reducer<ChunkKey, AvroValue<ReadAlignment>, AvroKey<ReadAlignment>, NullWritable> { public void reduce(ChunkKey key, Iterable<AvroValue<ReadAlignment>> values, Context context) throws IOException, InterruptedException { for (AvroValue<ReadAlignment> value : values) { context.write(new AvroKey<>(value.datum()), NullWritable.get()); } } } public static int run(String input, String output, String codecName, boolean adjustQuality) throws Exception { return run(input, output, codecName, adjustQuality, new Configuration()); } public static int run(String input, String output, String codecName, boolean adjQuality, Configuration conf) throws Exception { // read header, and save sequence index/name in conf final Path p = new Path(input); final SeekableStream seekableStream = WrapSeekable.openPath(conf, p); final SamReader reader = SamReaderFactory.make().open(SamInputResource.of(seekableStream)); final SAMFileHeader header = reader.getFileHeader(); int i = 0; SAMSequenceRecord sr; while ((sr = header.getSequence(i)) != null) { conf.set("" + i, sr.getSequenceName()); i++; } Job job = Job.getInstance(conf, "Bam2AvroMR"); job.setJarByClass(Bam2AvroMR.class); // Avro problem fix job.getConfiguration().set("mapreduce.job.user.classpath.first", "true"); job.getConfiguration().set(ADJUST_QUALITY, Boolean.toString(adjQuality)); // We call setOutputSchema first so we can override the configuration // parameters it sets AvroJob.setOutputKeySchema(job, ReadAlignment.getClassSchema()); job.setOutputValueClass(NullWritable.class); AvroJob.setMapOutputValueSchema(job, ReadAlignment.getClassSchema()); // point to input data FileInputFormat.setInputPaths(job, new Path(input)); job.setInputFormatClass(AnySAMInputFormat.class); // set the output format FileOutputFormat.setOutputPath(job, new Path(output)); if (codecName != null) { FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, CompressionUtils.getHadoopCodec(codecName)); } job.setOutputFormatClass(AvroKeyOutputFormat.class); job.setMapOutputKeyClass(AvroKey.class); job.setMapOutputValueClass(Void.class); job.setMapperClass(Bam2GaMapper.class); job.setNumReduceTasks(0); job.waitForCompletion(true); // write header Path headerPath = new Path(output + "/part-m-00000.avro.header"); FileSystem fs = FileSystem.get(conf); BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(headerPath, true))); br.write(header.getTextHeader()); br.close(); return 0; } }