// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

/*********************************************************************
 * This class is incomplete.  Don't use it.
 *********************************************************************/

package it.crs4.seal.common;

import it.crs4.seal.common.AbstractTaggedMapping.TagDataType;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;

import org.seqdoop.hadoop_bam.FileVirtualSplit;
import org.seqdoop.hadoop_bam.SAMRecordWritable;
//import net.sf.samtools.SAMRecord;

/**
 * InputFormat that reads BAM files by delegating to Hadoop-BAM's
 * BAMInputFormat and re-presenting each record as a ReadPair value.
 */
public class BamInputFormat extends FileInputFormat<LongWritable, ReadPair>
{
    private static final Log LOG = LogFactory.getLog(BamInputFormat.class);

    /** Delegate input format that does the actual BAM parsing. */
    private org.seqdoop.hadoop_bam.BAMInputFormat bamImpl;

    public static class BamRecordReader extends RecordReader<LongWritable, ReadPair>
    {
        private ReadPair value;
        private WritableMapping mapping;
        private RecordReader<LongWritable, SAMRecordWritable> rrImpl;

        public BamRecordReader(org.seqdoop.hadoop_bam.BAMRecordReader finBamRR)
        {
            rrImpl = finBamRR;
        }

        @Override
        public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException
        {
            rrImpl.initialize(genericSplit, context);
            value = new ReadPair();
            mapping = new WritableMapping();
        }

        @Override
        public void close() throws IOException
        {
            rrImpl.close();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException
        {
            return rrImpl.getProgress();
        }

        @Override
        public LongWritable getCurrentKey() throws IOException, InterruptedException
        {
            return rrImpl.getCurrentKey();
        }

        @Override
        public ReadPair getCurrentValue()
        {
            return value;
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException
        {
            // ***** not implemented ****
            //if (rrImpl.nextKeyValue())
            //{
            //    value.clear();
            //    SAMRecord sam = rrImpl.getCurrentValue().get();
            //    // copy data from SAMRecord to our mapping
            //    readSamRecord(sam, mapping);
            //    if (mapping.isRead2())
            //        value.setRead2(mapping);
            //    else // anything that's not explicitly labelled as "read 2" goes in as read 1.
            //        value.setRead1(mapping);
            //    return true;
            //}
            //else
                return false;
        }

        /******* not implemented
        protected void readSamRecord(SAMRecord sam, WritableMapping mapping)
        {
            mapping.clear();

            mapping.setName(sam.getReadName());
            mapping.setSequence(ByteBuffer.wrap(sam.getReadBases()));
            mapping.setFlag(sam.getFlags());

            // We have to map base qualities from raw Phred-scaled scores to
            // Phred+33, as per our convention.
            byte[] originalQ = sam.getBaseQualities();
            ByteBuffer q = ByteBuffer.allocate(originalQ.length);
            byte[] newQ = q.array();
            int newQStart = q.position();

            for (int i = 0; i < originalQ.length; ++i)
            {
                if (originalQ[i] < 0 || originalQ[i] > Utils.SANGER_MAX)
                {
                    throw new FormatException(
                        "base quality score out of range for BAM format (found " + originalQ[i] +
                        " but acceptable range is [0," + Utils.SANGER_MAX + "]).");
                }
                newQ[newQStart + i] = (byte)(originalQ[i] + Utils.SANGER_OFFSET);
            }
            q.rewind().mark();
            mapping.setBaseQualities(q);

            // now get the rest of the fields
            if (mapping.isMapped())
            {
                mapping.setContig(sam.getReferenceName());
                mapping.set5Position(sam.getAlignmentStart());
                mapping.setMapQ(sam.getMappingQuality());
                mapping.setAlignment(AlignOp.scanCigar(sam.getCigarString()));
            }

            if (mapping.isPaired() && mapping.isMateMapped())
                mapping.setTemplateLength(Math.abs(sam.getInferredInsertSize()));

            for (SAMRecord.SAMTagAndValue tag: sam.getAttributes())
            {
                Object value = tag.value;
                TagDataType type = null;

                if (value instanceof String)
                    type = TagDataType.String;
                else if (value instanceof Character)
                    type = TagDataType.Char;
                else if (value instanceof Float)
                    type = TagDataType.Float;
                else if (value instanceof Integer || value instanceof Byte || value instanceof Short)
                    type = TagDataType.Int;
                else
                {
                    LOG.warn("dropping tag of unsupported type " + value.getClass().toString());
                    continue;
                }

                mapping.setTag(tag.tag, type, value.toString());
            }
        }
        */
    }

    public BamInputFormat()
    {
        // **** not implemented
        // bamImpl = new org.seqdoop.hadoop_bam.BAMInputFormat();
        throw new UnsupportedOperationException("This class is not implemented");
    }

    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException
    {
        return bamImpl.getSplits(job);
    }

    /**
     * Public method to create FileVirtualSplits.
     *
     * Useful for unit testing.
     */
    public List<InputSplit> getVirtualSplits(List<InputSplit> fileSplits, Configuration conf) throws IOException
    {
        return bamImpl.getSplits(fileSplits, conf);
    }

    @Override
    public RecordReader<LongWritable, ReadPair> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException
    {
        return new BamRecordReader((org.seqdoop.hadoop_bam.BAMRecordReader)bamImpl.createRecordReader(split, context));
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file)
    {
        return bamImpl.isSplitable(context, file);
    }
}
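
/*
 * A minimal usage sketch (hypothetical, not part of Seal): once this class is
 * implemented, it would be configured on a Hadoop job like any other
 * FileInputFormat.  "ReadPairMapper" below is an assumed placeholder for a
 * Mapper<LongWritable, ReadPair, ?, ?>; the Job and FileInputFormat calls are
 * the standard org.apache.hadoop.mapreduce API.
 *
 *     Configuration conf = new Configuration();
 *     Job job = Job.getInstance(conf, "BAM to ReadPair");
 *     job.setJarByClass(BamInputFormat.class);
 *     job.setInputFormatClass(BamInputFormat.class);
 *     job.setMapperClass(ReadPairMapper.class);  // placeholder mapper
 *     FileInputFormat.addInputPath(job, new Path("alignments.bam"));
 *     // ...configure the output format/path, then job.waitForCompletion(true)
 */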