package org.seqcode.data.readdb.tools; import htsjdk.samtools.AlignmentBlock; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamInputResource; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; import htsjdk.samtools.util.CloseableIterator; import java.io.*; import java.util.*; import org.apache.commons.cli.*; /** * Reads SAM or BAM data on stdin. * Produces a file on stdout in the format expected by ImportHits. * Ignores supplementary (i.e. chimeric) alignments. * * Options: --uniquehits (flag to only print 1:1 read to hit mappings) * --pairedend (flag to print pairs) * --junctions (flag to print junction mapping reads as pairs) * --concordantonly (flag to print only concordant pairs) * --read1 (flag to print only read 1 hits) * --read2 (flag to print only read 2 hits) */ public class Bowtie2SAMToReadDB { public static boolean uniqueOnly; public static boolean concordantOnly; public static boolean inclPairedEnd; public static boolean inclJunction; public static boolean read1, read2; public static void main(String args[]) throws IOException, ParseException { Options options = new Options(); options.addOption("u","uniquehits",false,"only output hits with a single mapping"); options.addOption("cc","concordantonly",false,"only output concordant pairs"); options.addOption("p","pairedend",false,"output paired-end hits"); options.addOption("j","junctions",false,"output junction mapping reads (reads with a single gap)"); options.addOption("1","read1",false,"output only read 1 hits"); options.addOption("2","read2",false,"output only read 2 hits"); CommandLineParser parser = new GnuParser(); CommandLine cl = parser.parse( options, args, false ); uniqueOnly = cl.hasOption("uniquehits"); concordantOnly = cl.hasOption("concordantonly"); inclPairedEnd = cl.hasOption("pairedend"); inclJunction = cl.hasOption("junctions"); read1 = cl.hasOption("read1"); read2 = cl.hasOption("read2"); SamReaderFactory factory = SamReaderFactory.makeDefault() .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS) .validationStringency(ValidationStringency.SILENT); SamReader reader = factory.open(SamInputResource.of(System.in)); CloseableIterator<SAMRecord> iter = reader.iterator(); List<SAMRecord> records = new ArrayList<SAMRecord>(); String lastName = ""; while (iter.hasNext()) { //Group neighboring reads by name (note this will group read pair hits too) SAMRecord record = iter.next(); if (record.getReadUnmappedFlag()) {continue; } if(record.getSupplementaryAlignmentFlag()){continue;} if(!record.getReadName().equals(lastName)){ if(records.size()>0){ processRecord(records); records.clear(); } } records.add(record); lastName =record.getReadName(); } if(records.size()>0) processRecord(records); iter.close(); reader.close(); } public static void processRecord(List<SAMRecord> records) { int lcount = 0, rcount=0; for(SAMRecord record : records) //get weights for L & R reads separately if(!record.getReadPairedFlag() || record.getFirstOfPairFlag()) lcount++; else rcount++; for(SAMRecord record : records){ int count = lcount; if(record.getReadPairedFlag() && record.getSecondOfPairFlag()) count=rcount; float weight = 1/(float)count; int primAScore = record.getIntegerAttribute("AS"); int secAScore=-1000000; if(record.getIntegerAttribute("XS")!=null) secAScore = record.getIntegerAttribute("XS"); if(inclPairedEnd || inclJunction){ boolean currUnique = primAScore > secAScore ? true : false; /* * Only accept proper pairs, optionally only concordant. * It also assumes that the left and right mates have the same length, * and that there are no gaps in the second mate alignment (SAM doesn't store the paired read's end) * Note: if you change this, you may have to change the SAMStats output also */ if(inclPairedEnd){ //Check this if using bowtie2 to produce multiple mappings for each read pair. could be problematic if(record.getReadPairedFlag() && record.getFirstOfPairFlag() && record.getProperPairFlag() && (!concordantOnly || record.getStringAttribute("YT").equals("CP"))){ if(!uniqueOnly || currUnique){ //Print boolean neg = record.getReadNegativeStrandFlag(); boolean mateneg = record.getMateNegativeStrandFlag(); String len = record.getReadLength() + "\t"; System.out.println( record.getReferenceName() + "\t" + (neg ? record.getAlignmentEnd() : record.getAlignmentStart()) + "\t" + (neg ? "-\t" : "+\t") + len + record.getMateReferenceName() + "\t" + (mateneg ? record.getMateAlignmentStart()+record.getReadLength()-1 : record.getMateAlignmentStart()) + "\t" + (mateneg ? "-\t" : "+\t") + len + weight +"\t"+ 1); } } } /* * Outputs as paired alignments those reads that are aligned in >2 blocks * Note: if you change this, you may have to change the SAMStats output also */ if(inclJunction){ if(!uniqueOnly || currUnique){ List<AlignmentBlock> blocks = record.getAlignmentBlocks(); if(blocks.size()>=2){ for(int ab=0; ab<blocks.size()-1; ab++){ AlignmentBlock lBlock = blocks.get(ab); int lStart = lBlock.getReferenceStart(); int lEnd = lStart + lBlock.getLength()-1; int lLen = lBlock.getLength(); AlignmentBlock rBlock = blocks.get(ab+1); int rStart = rBlock.getReferenceStart(); int rEnd = rStart + rBlock.getLength()-1; int rLen = rBlock.getLength(); boolean neg = record.getReadNegativeStrandFlag(); String refname = record.getReferenceName() + "\t"; System.out.println( refname + (neg ? lEnd : lStart) + "\t" + (neg ? "-\t" : "+\t") + lLen + "\t" + refname + (neg ? rEnd : rStart) + "\t" + (neg ? "-\t" : "+\t") + rLen + "\t" + weight +"\t"+ 0); } } } } }else{ //Just output reads (ignore alignment blocks for now) if (uniqueOnly && primAScore == secAScore) { return; } if((!read1 && !read2) || (!record.getReadPairedFlag()) || (read1 && record.getFirstOfPairFlag()) || (read2 && record.getSecondOfPairFlag())){ System.out.println(String.format("%s\t%d\t%s\t%d\t%f", record.getReferenceName(), record.getReadNegativeStrandFlag() ? record.getAlignmentEnd() : record.getAlignmentStart(), record.getReadNegativeStrandFlag() ? "-" : "+", record.getReadLength(), weight)); } } } } }