package org.seqcode.data.readdb.tools; import java.io.IOException; import java.util.List; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.seqcode.gseutils.RealValuedHistogram; import htsjdk.samtools.AlignmentBlock; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamInputResource; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; import htsjdk.samtools.util.CloseableIterator; public class SAMStats { private Double totalReads = 0.0, totalHits=0.0, LHits=0.0, RHits=0.0; private Double singleEnd=0.0, properPair=0.0, pairMapped=0.0, notPrimary=0.0; private Double junctions=0.0; private Double uniquelyMapped=0.0; private Double weight=0.0; private Integer readLen =-1; private Double pairedEndSameChr=0.0, pairedEndDiffChr=0.0; private Boolean bowtie1=false, bowtie2=false; private RealValuedHistogram histo; public static void main(String args[]) throws IOException, ParseException { Options options = new Options(); options.addOption("i","insdist",false,"print insert size distribution"); options.addOption("s","stats",false,"print mapping stats"); options.addOption("l","readlen",false,"read length"); options.addOption("c","readcount",false,"read count"); options.addOption("bt2",false,"input is from bowtie2"); options.addOption("bt1",false,"input is from bowtie1 (tested for --best --strata -m 1)"); CommandLineParser parser = new GnuParser(); CommandLine cl = parser.parse( options, args, false ); SAMStats s = new SAMStats(cl.hasOption("bt1"), cl.hasOption("bt2")); if(cl.hasOption("insdist")) s.printInsertDistrib(); if(cl.hasOption("stats")) s.printStats(); if(cl.hasOption("readlen")) s.printReadLength(); if(cl.hasOption("readcount")) s.printReadCount(); } public SAMStats(boolean bowtie1, boolean bowtie2){ this.bowtie1 = bowtie1; this.bowtie2 = bowtie2; histo = new RealValuedHistogram(0, 1000, 100); SamReaderFactory factory = SamReaderFactory.makeDefault() .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS) .validationStringency(ValidationStringency.SILENT); SamReader reader = factory.open(SamInputResource.of(System.in)); CloseableIterator<SAMRecord> iter = reader.iterator(); while (iter.hasNext()) { SAMRecord record = iter.next(); if(readLen==-1) readLen = record.getReadLength(); if(bowtie1) processBT1SAMRecord(record); else if(bowtie2) processBT2SAMRecord(record); else processSAMRecord(record); } iter.close(); try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } public void processSAMRecord(SAMRecord r){ totalReads++; if(!r.getReadUnmappedFlag()){ totalHits++; int count = 1; //Have to figure out something for BWA when reporting multiple alignments if(r.getIntegerAttribute("NH")!=null) count = r.getIntegerAttribute("NH"); if(count==1 && r.getMappingQuality()!=0) //Second clause for BWA uniquelyMapped++; weight += 1/(float)count; if(r.getReadPairedFlag()){ if(r.getMateUnmappedFlag()){ singleEnd++; }else{ pairMapped++; if(r.getMateReferenceName().equals(r.getReferenceName())){ pairedEndSameChr++; }else{ pairedEndDiffChr++; } } }else{ singleEnd++; } List<AlignmentBlock> blocks = r.getAlignmentBlocks(); if(blocks.size()>=2){ junctions+=blocks.size()-1; } if(!r.getNotPrimaryAlignmentFlag()){ if(!r.getReadPairedFlag() || r.getFirstOfPairFlag()){ LHits++; if(r.getReadPairedFlag() && r.getProperPairFlag()){ properPair++; if(!r.getReadNegativeStrandFlag() && r.getMateNegativeStrandFlag()){ double dist = (r.getMateAlignmentStart()+r.getReadLength())-r.getAlignmentStart(); histo.addValue(dist); } } }else if(r.getSecondOfPairFlag()){ RHits++; } }else{ notPrimary++; } } } public void processBT1SAMRecord(SAMRecord r){ totalReads++; if(!r.getReadUnmappedFlag()) if(r.getIntegerAttribute("XM")!=null){ int xm = r.getIntegerAttribute("XM"); if(xm!=0) weight++; } else{ totalHits++; int count =1; //TODO: Fix this if using bowtie for multi-mapping reads boolean currUnique = true; if(count==1 && currUnique){ uniquelyMapped++; } weight += 1/(float)count; if(r.getReadPairedFlag()){ if(r.getMateUnmappedFlag()){ singleEnd++; }else{ pairMapped++; if(r.getMateReferenceName().equals(r.getReferenceName())){ pairedEndSameChr++; }else{ pairedEndDiffChr++; } } }else{ singleEnd++; } List<AlignmentBlock> blocks = r.getAlignmentBlocks(); if(blocks.size()>=2){ junctions+=blocks.size()-1; } if(!r.getNotPrimaryAlignmentFlag()){ if(!r.getReadPairedFlag() || r.getFirstOfPairFlag()){ LHits++; if(r.getReadPairedFlag() && r.getProperPairFlag()){ properPair++; if(!r.getReadNegativeStrandFlag() && r.getMateNegativeStrandFlag()){ double dist = (r.getMateAlignmentStart()+r.getReadLength())-r.getAlignmentStart(); histo.addValue(dist); } } }else if(r.getSecondOfPairFlag()){ RHits++; } }else{ notPrimary++; } } } public void processBT2SAMRecord(SAMRecord r){ totalReads++; if(!r.getReadUnmappedFlag()){ totalHits++; int count =1; //TODO: Fix this if using bowtie2 for multi-mapping reads int primAScore = r.getIntegerAttribute("AS"); int secAScore=-1000000; if(r.getIntegerAttribute("XS")!=null) secAScore = r.getIntegerAttribute("XS"); boolean currUnique = primAScore > secAScore ? true : false; if(count==1 && currUnique){ uniquelyMapped++; } weight += 1/(float)count; if(r.getReadPairedFlag()){ if(r.getMateUnmappedFlag()){ singleEnd++; }else{ pairMapped++; if(r.getMateReferenceName().equals(r.getReferenceName())){ pairedEndSameChr++; }else{ pairedEndDiffChr++; } } }else{ singleEnd++; } List<AlignmentBlock> blocks = r.getAlignmentBlocks(); if(blocks.size()>=2){ junctions+=blocks.size()-1; } if(!r.getNotPrimaryAlignmentFlag()){ if(!r.getReadPairedFlag() || r.getFirstOfPairFlag()){ LHits++; if(r.getReadPairedFlag() && r.getProperPairFlag()){ properPair++; if(!r.getReadNegativeStrandFlag() && r.getMateNegativeStrandFlag()){ double dist = (r.getMateAlignmentStart()+r.getReadLength())-r.getAlignmentStart(); histo.addValue(dist); } } }else if(r.getSecondOfPairFlag()){ RHits++; } }else{ notPrimary++; } } } public void printReadLength(){ System.out.println(readLen); } public void printReadCount(){ System.out.println(String.format("%.0f",+totalReads)); } public void printInsertDistrib(){ histo.printContents(); } public void printStats(){ System.out.println("\nTotalHits:\t"+String.format("%.0f",+totalHits)); System.out.println("MappedSeq:\t"+String.format("%.0f",weight)); System.out.println("UniquelyMapped:\t"+String.format("%.0f",uniquelyMapped)); System.out.println("SingleEndMapped:\t"+String.format("%.0f",singleEnd)); if(pairMapped>0){ System.out.println("LeftHits:\t"+String.format("%.0f",LHits)); System.out.println("RightHits:\t"+String.format("%.0f",RHits)); System.out.println("PairedEndMapped:\t"+String.format("%.0f",pairMapped)); System.out.println("ProperPairs:\t"+String.format("%.0f",properPair)); System.out.println("Junctions:\t"+String.format("%.0f",junctions)); System.out.println("PairedEndMapped_SameChr:\t"+String.format("%.0f",pairedEndSameChr)); System.out.println("PairedEndMapped_DiffChr:\t"+String.format("%.0f",pairedEndDiffChr)); System.out.println("NotPrimary:\t"+String.format("%.0f",notPrimary)); } } }