package org.seqcode.deepseq.hitloaders; import htsjdk.samtools.AlignmentBlock; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; import htsjdk.samtools.util.CloseableIterator; import java.io.File; import java.io.IOException; import java.util.List; import org.seqcode.deepseq.HitPair; import org.seqcode.deepseq.Read; import org.seqcode.deepseq.ReadHit; /** * TophatFileHitLoader: A FileHitLoader for Tophat SAM/BAM output. * Each alignment block is loaded as a separate hit. Ignores uniqueness. * @author mahony * */ public class TophatFileHitLoader extends FileHitLoader{ public TophatFileHitLoader(File f, boolean nonUnique, boolean loadT1Reads, boolean loadT2Reads, boolean loadRead2, boolean loadPairs) { super(f, nonUnique, true, false, loadRead2, loadPairs); if(!loadT1Reads || loadT2Reads) System.err.println("TophatFileHitLoader: You asked to load only Type1 or Type2 reads, we do not load this information from Tophat SAM format."); } /** * Get the reads from the appropriate source (implementation-specific). * Loads data to the fivePrimesList and hitsCountList * Loads pairs to hitPairsList */ public void sourceAllHits() { this.initialize(); SamReaderFactory factory = SamReaderFactory.makeDefault() .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS) .validationStringency(ValidationStringency.SILENT); SamReader reader = factory.open(file); CloseableIterator<SAMRecord> iter = reader.iterator(); while (iter.hasNext()) { SAMRecord record = iter.next(); if (record.getReadUnmappedFlag()) {continue; } if(record.getReadPairedFlag() && record.getSecondOfPairFlag() && !loadRead2){continue;} float weight = 1/(float)record.getIntegerAttribute("NH"); Read currRead = new Read(); List<AlignmentBlock> blocks = record.getAlignmentBlocks(); for(int a=0; a<blocks.size(); a++){ //Iterate over alignment blocks AlignmentBlock currBlock = blocks.get(a); int aStart = currBlock.getReferenceStart(); int aEnd = aStart + currBlock.getLength()-1; int aLen = currBlock.getLength(); boolean nearbyBlocks=true; while(nearbyBlocks && a<blocks.size()-1){ if(blocks.get(a+1).getReferenceStart() - currBlock.getReferenceStart() < record.getReadLength()){ aEnd = blocks.get(a+1).getReferenceStart() + blocks.get(a+1).getLength()-1; aLen += blocks.get(a+1).getLength(); a++; }else{ nearbyBlocks=false; } } ReadHit currHit = new ReadHit( record.getReferenceName().replaceFirst("^chromosome", "").replaceFirst("^chrom", "").replaceFirst("^chr", ""), aStart, aEnd, record.getReadNegativeStrandFlag() ? '-' : '+', weight); currRead.addHit(currHit); } addHits(currRead); //load pair if this is a first mate, congruent, proper pair if(record.getFirstOfPairFlag() && record.getProperPairFlag()){ boolean neg = record.getReadNegativeStrandFlag(); boolean mateneg = record.getMateNegativeStrandFlag(); HitPair hp = new HitPair((neg ? record.getAlignmentEnd() : record.getAlignmentStart()), record.getMateReferenceName(), (mateneg ? record.getMateAlignmentStart()+record.getReadLength()-1 : record.getMateAlignmentStart()), mateneg ? 1 : 0, 1); addPair(record.getReferenceName(), neg ? '-':'+', hp); } } iter.close(); try { reader.close(); } catch (IOException e) { e.printStackTrace(); } }//end of sourceAllHits method }