/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.analysis; import java.io.File; import java.lang.IllegalStateException;import java.lang.Integer;import java.lang.Math;import java.lang.String;import java.lang.System;import java.util.ArrayList; import java.util.Iterator; import java.util.List; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.programgroups.Metrics; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.metrics.MetricsFile; import picard.sam.DuplicationMetrics; import htsjdk.samtools.SamPairUtil; import htsjdk.samtools.util.Histogram; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamPairUtil.PairOrientation; import htsjdk.samtools.SAMTag; /** * Command-line program to compute metrics about outward-facing pairs, inward-facing * pairs, and chimeras in a jumping library. * * @author ktibbett@broadinstitute.org */ @CommandLineProgramProperties( usage = "Computes jumping library metrics. Gets all data for computation from the first" + "read in each pair and assumes that the MQ tag is set with the mate's mapping quality. If the " + "MQ tag is not set, then the program assumes that the mate's mapping quality is >= MINIMUM_MAPPING_QUALITY", usageShort = "Produces jumping library metrics for the provided SAM/BAMs", programGroup = Metrics.class ) public class CollectJumpingLibraryMetrics extends CommandLineProgram { // Usage and parameters @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc="BAM file(s) of reads with duplicates marked") public List<File> INPUT = new ArrayList<File>(); @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="File to which metrics should be written") public File OUTPUT; @Option(shortName = StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc="Mapping quality minimum cutoff") public Integer MINIMUM_MAPPING_QUALITY = 0; @Option(shortName="T", doc="When calculating mean and stdev stop when the bins in the tail of the distribution " + "contain fewer than mode/TAIL_LIMIT items") public int TAIL_LIMIT = 10000; @Option(doc="Jumps greater than or equal to the greater of this value or 2 times the mode of the " + "outward-facing pairs are considered chimeras") public int CHIMERA_KB_MIN = 100000; private static final int SAMPLE_FOR_MODE = 50000; // How many outward-facing pairs to sample to determine the mode /** Stock main method. */ public static void main(String[] args) { System.exit(new CollectJumpingLibraryMetrics().instanceMain(args)); } /** * Calculates the detailed statistics about the jumping library and then generates the results. */ protected int doWork() { for (File f : INPUT) { IOUtil.assertFileIsReadable(f); } IOUtil.assertFileIsWritable(OUTPUT); Histogram<Integer> innieHistogram = new Histogram<Integer>(); Histogram<Integer> outieHistogram = new Histogram<Integer>(); int fragments = 0; int innies = 0; int outies = 0; int innieDupes = 0; int outieDupes = 0; int crossChromPairs = 0; int superSized = 0; int tandemPairs = 0; double chimeraSizeMinimum = Math.max(getOutieMode(), (double)CHIMERA_KB_MIN); for (File f : INPUT) { SAMFileReader reader = new SAMFileReader(f); if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) { throw new PicardException("SAM file must " + f.getName() + " must be sorted in coordintate order"); } for (SAMRecord sam : reader) { // We're getting all our info from the first of each pair. if (!sam.getFirstOfPairFlag()) { continue; } // Ignore unmapped read pairs if (sam.getReadUnmappedFlag()) { if (!sam.getMateUnmappedFlag()) { fragments++; continue; } // If both ends are unmapped and we've hit unaligned reads we're done if (sam.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { break; } continue; } if (sam.getMateUnmappedFlag()) { fragments++; continue; } // Ignore low-quality reads. If we don't have the mate mapping quality, assume it's OK if ((sam.getAttribute(SAMTag.MQ.name()) != null && sam.getIntegerAttribute(SAMTag.MQ.name()) < MINIMUM_MAPPING_QUALITY) || sam.getMappingQuality() < MINIMUM_MAPPING_QUALITY) { continue; } final int absInsertSize = Math.abs(sam.getInferredInsertSize()); if (absInsertSize > chimeraSizeMinimum) { superSized++; } else if (sam.getMateNegativeStrandFlag() == sam.getReadNegativeStrandFlag()) { tandemPairs++; } else if (!sam.getMateReferenceIndex().equals(sam.getReferenceIndex())) { crossChromPairs++; } else { final PairOrientation pairOrientation = SamPairUtil.getPairOrientation(sam); if (pairOrientation == PairOrientation.RF) { outieHistogram.increment(absInsertSize); outies++; if (sam.getDuplicateReadFlag()) { outieDupes++; } } else if(pairOrientation == PairOrientation.FR) { innieHistogram.increment(absInsertSize); innies++; if (sam.getDuplicateReadFlag()) { innieDupes++; } } else { throw new IllegalStateException("This should never happen"); } } } reader.close(); } MetricsFile<JumpingLibraryMetrics,Integer> metricsFile = getMetricsFile(); JumpingLibraryMetrics metrics = new JumpingLibraryMetrics(); metrics.JUMP_PAIRS = outies; metrics.JUMP_DUPLICATE_PAIRS = outieDupes; metrics.JUMP_DUPLICATE_PCT = outies != 0 ? outieDupes/(double)outies : 0; metrics.JUMP_LIBRARY_SIZE = (outies > 0 && outieDupes > 0) ? DuplicationMetrics.estimateLibrarySize(outies, outies-outieDupes) : 0; outieHistogram.trimByTailLimit(TAIL_LIMIT); metrics.JUMP_MEAN_INSERT_SIZE = outieHistogram.getMean(); metrics.JUMP_STDEV_INSERT_SIZE = outieHistogram.getStandardDeviation(); metrics.NONJUMP_PAIRS = innies; metrics.NONJUMP_DUPLICATE_PAIRS = innieDupes; metrics.NONJUMP_DUPLICATE_PCT = innies != 0 ? innieDupes/(double)innies : 0; metrics.NONJUMP_LIBRARY_SIZE = (innies > 0 && innieDupes > 0) ? DuplicationMetrics.estimateLibrarySize(innies, innies-innieDupes) : 0; innieHistogram.trimByTailLimit(TAIL_LIMIT); metrics.NONJUMP_MEAN_INSERT_SIZE = innieHistogram.getMean(); metrics.NONJUMP_STDEV_INSERT_SIZE = innieHistogram.getStandardDeviation(); metrics.CHIMERIC_PAIRS = crossChromPairs + superSized + tandemPairs; metrics.FRAGMENTS = fragments; double totalPairs = outies + innies + metrics.CHIMERIC_PAIRS; metrics.PCT_JUMPS = totalPairs != 0 ? outies/totalPairs : 0; metrics.PCT_NONJUMPS = totalPairs != 0 ? innies/totalPairs : 0; metrics.PCT_CHIMERAS = totalPairs != 0 ? metrics.CHIMERIC_PAIRS/totalPairs : 0; metricsFile.addMetric(metrics); metricsFile.write(OUTPUT); return 0; } /** * Calculates the mode for outward-facing pairs, using the first SAMPLE_FOR_MODE * outward-facing pairs found in INPUT */ private double getOutieMode() { int samplePerFile = SAMPLE_FOR_MODE/INPUT.size(); Histogram<Integer> histo = new Histogram<Integer>(); for (File f : INPUT) { SAMFileReader reader = new SAMFileReader(f); int sampled = 0; for (Iterator<SAMRecord> it = reader.iterator(); it.hasNext() && sampled < samplePerFile;) { SAMRecord sam = it.next(); if (!sam.getFirstOfPairFlag()) { continue; } // If we get here we've hit the end of the aligned reads if (sam.getReadUnmappedFlag() && sam.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { break; } else if (sam.getReadUnmappedFlag() || sam.getMateUnmappedFlag()) { continue; } else { if ( (sam.getAttribute(SAMTag.MQ.name()) == null || sam.getIntegerAttribute(SAMTag.MQ.name()) >= MINIMUM_MAPPING_QUALITY) && sam.getMappingQuality() >= MINIMUM_MAPPING_QUALITY && sam.getMateNegativeStrandFlag() != sam.getReadNegativeStrandFlag() && sam.getMateReferenceIndex().equals(sam.getReferenceIndex())) { if (SamPairUtil.getPairOrientation(sam) == PairOrientation.RF) { histo.increment(Math.abs(sam.getInferredInsertSize())); sampled++; } } } } reader.close(); } return histo.size() > 0 ? histo.getMode() : 0; } }