package org.seqcode.data.readdb.tools; import java.io.*; import java.util.*; import org.apache.commons.cli.*; import htsjdk.samtools.AlignmentBlock; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamInputResource; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; import htsjdk.samtools.util.CloseableIterator; /** * Reads SAM or BAM data on stdin. * Produces a file on stdout in the format expected by ImportHits. * The weight for a hit is 1/(# of hits for that read) * * Options: --nosuboptimal (flag to only take the hits with the minimum number of mismatches) * --uniquehits (flag to only print 1:1 read to hit mappings) * --pairedend (flag to print pairs) * --junctions (flag to print junction mapping reads as pairs) * * nosuboptimal is applied before uniquehits */ public class TophatSAMToReadDB { public static boolean uniqueOnly; public static boolean filterSubOpt; public static boolean inclPairedEnd; public static boolean inclJunction; public static void main(String args[]) throws IOException, ParseException { Options options = new Options(); options.addOption("u","uniquehits",false,"only output hits with a single mapping"); options.addOption("s","nosuboptimal",false,"do not include hits whose score is not equal to the best score for the read"); options.addOption("p","pairedend",false,"output paired-end hits"); options.addOption("j","junctions",false,"output junction mapping reads (reads with gaps)"); CommandLineParser parser = new GnuParser(); CommandLine cl = parser.parse( options, args, false ); uniqueOnly = cl.hasOption("uniquehits"); filterSubOpt = cl.hasOption("nosuboptimal"); inclPairedEnd = cl.hasOption("pairedend"); inclJunction = cl.hasOption("junctions"); SamReaderFactory factory = SamReaderFactory.makeDefault() .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS) .validationStringency(ValidationStringency.SILENT); SamReader reader = factory.open(SamInputResource.of(System.in)); CloseableIterator<SAMRecord> iter = reader.iterator(); while (iter.hasNext()) { SAMRecord record = iter.next(); if (record.getReadUnmappedFlag()) {continue; } processRecord(record); } iter.close(); reader.close(); } public static void processRecord(SAMRecord record) { //NH seems to be an accurate count of hits for Tophat. //Grouping hits will not work, as Tophat BAM output is sorted by genomic location if (uniqueOnly && record.getIntegerAttribute("NH") > 1) { return; } float weight = 1/(float)record.getIntegerAttribute("NH"); if(inclPairedEnd || inclJunction){ /* * Okay, so the paired-end part is just hacked together for now. * It only accepts true pairs. * It also assumes that the left and right mates have the same length, * and that there are no gaps in the second mate alignment (SAM doesn't store the paired read's end) * Note: if you change this, you may have to change the SAMStats output also */ if(inclPairedEnd){ if(record.getReadPairedFlag() && !record.getNotPrimaryAlignmentFlag() && record.getFirstOfPairFlag() && record.getProperPairFlag() && record.getReadPairedFlag()){ boolean neg = record.getReadNegativeStrandFlag(); boolean mateneg = record.getMateNegativeStrandFlag(); String len = record.getReadLength() + "\t"; System.out.println( record.getReferenceName() + "\t" + (neg ? record.getAlignmentEnd() : record.getAlignmentStart()) + "\t" + (neg ? "-\t" : "+\t") + len + record.getMateReferenceName() + "\t" + (mateneg ? record.getMateAlignmentStart()+record.getReadLength()-1 : record.getMateAlignmentStart()) + "\t" + (mateneg ? "-\t" : "+\t") + len + weight +"\t"+ 1); } } /* * Outputs as paired alignments those reads that are aligned in >2 blocks * Note: if you change this, you may have to change the SAMStats output also */ if(inclJunction){ List<AlignmentBlock> blocks = record.getAlignmentBlocks(); if(blocks.size()>=2){ for(int ab=0; ab<blocks.size()-1; ab++){ AlignmentBlock lBlock = blocks.get(ab); int lStart = lBlock.getReferenceStart(); int lEnd = lStart + lBlock.getLength()-1; int lLen = lBlock.getLength(); AlignmentBlock rBlock = blocks.get(ab+1); int rStart = rBlock.getReferenceStart(); int rEnd = rStart + rBlock.getLength()-1; int rLen = rBlock.getLength(); boolean neg = record.getReadNegativeStrandFlag(); String refname = record.getReferenceName() + "\t"; System.out.println( refname + (neg ? lEnd : lStart) + "\t" + (neg ? "-\t" : "+\t") + lLen + "\t" + refname + (neg ? rEnd : rStart) + "\t" + (neg ? "-\t" : "+\t") + rLen + "\t" + weight +"\t"+ 0); } } } }else{ //Just output reads (or read parts) List<AlignmentBlock> blocks = record.getAlignmentBlocks(); for(int a=0; a<blocks.size(); a++){ //Iterate over alignment blocks AlignmentBlock currBlock = blocks.get(a); int aStart = currBlock.getReferenceStart(); int aEnd = aStart + currBlock.getLength()-1; int aLen = currBlock.getLength(); boolean nearbyBlocks=true; while(nearbyBlocks && a<blocks.size()-1){ if(blocks.get(a+1).getReferenceStart() - currBlock.getReferenceStart() < record.getReadLength()){ aEnd = blocks.get(a+1).getReferenceStart() + blocks.get(a+1).getLength()-1; aLen += blocks.get(a+1).getLength(); a++; }else{ nearbyBlocks=false; } } boolean neg = record.getReadNegativeStrandFlag(); System.out.println( record.getReferenceName() + "\t" + (neg ? aEnd : aStart) + "\t" + (neg ? "-\t" : "+\t") + aLen + "\t" + weight); } } } }