/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.sam.markduplicates; import picard.PicardException; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.Histogram; import htsjdk.samtools.util.Log; import picard.cmdline.programgroups.Metrics; import picard.sam.DuplicationMetrics; import htsjdk.samtools.util.PeekableIterator; import htsjdk.samtools.util.ProgressLogger; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.SequenceUtil; import htsjdk.samtools.util.SortingCollection; import htsjdk.samtools.util.StringUtil; import picard.sam.markduplicates.util.AbstractOpticalDuplicateFinderCommandLineProgram; import picard.sam.markduplicates.util.OpticalDuplicateFinder; import java.io.*; import java.util.*; import static java.lang.Math.pow; /** * <p>Attempts to estimate library complexity from sequence alone. Does so by sorting all reads * by the first N bases (5 by default) of each read and then comparing reads with the first * N bases identical to each other for duplicates. Reads are considered to be duplicates if * they match each other with no gaps and an overall mismatch rate less than or equal to * MAX_DIFF_RATE (0.03 by default).</p> * <p/> * <p>Reads of poor quality are filtered out so as to provide a more accurate estimate. The filtering * removes reads with any no-calls in the first N bases or with a mean base quality lower than * MIN_MEAN_QUALITY across either the first or second read.</p> * <p/> * <p>The algorithm attempts to detect optical duplicates separately from PCR duplicates and excludes * these in the calculation of library size. Also, since there is no alignment to screen out technical * reads one further filter is applied on the data. After examining all reads a Histogram is built of * [#reads in duplicate set -> #of duplicate sets]; all bins that contain exactly one duplicate set are * then removed from the Histogram as outliers before library size is estimated.</p> * * @author Tim Fennell */ @CommandLineProgramProperties( usage = "Attempts to estimate library complexity from sequence of read pairs alone. Does so by sorting all reads " + "by the first N bases (5 by default) of each read and then comparing reads with the first " + "N bases identical to each other for duplicates. Reads are considered to be duplicates if " + "they match each other with no gaps and an overall mismatch rate less than or equal to " + "MAX_DIFF_RATE (0.03 by default).\n\n" + "Reads of poor quality are filtered out so as to provide a more accurate estimate. The filtering " + "removes reads with any no-calls in the first N bases or with a mean base quality lower than " + "MIN_MEAN_QUALITY across either the first or second read.\n\n" + "Unpaired reads are ignored in this computation.\n\n" + "The algorithm attempts to detect optical duplicates separately from PCR duplicates and excludes " + "these in the calculation of library size. Also, since there is no alignment to screen out technical " + "reads one further filter is applied on the data. After examining all reads a Histogram is built of " + "[#reads in duplicate set -> #of duplicate sets] all bins that contain exactly one duplicate set are " + "then removed from the Histogram as outliers before library size is estimated.", usageShort = "Estimates library complexity from the sequence of read pairs", programGroup = Metrics.class ) public class EstimateLibraryComplexity extends AbstractOpticalDuplicateFinderCommandLineProgram { @Option(shortName= StandardOptionDefinitions.INPUT_SHORT_NAME, doc="One or more files to combine and " + "estimate library complexity from. Reads can be mapped or unmapped.") public List<File> INPUT; @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output file to writes per-library metrics to.") public File OUTPUT; @Option(doc = "The minimum number of bases at the starts of reads that must be identical for reads to " + "be grouped together for duplicate detection. In effect total_reads / 4^max_id_bases reads will " + "be compared at a time, so lower numbers will produce more accurate results but consume " + "exponentially more memory and CPU.") public int MIN_IDENTICAL_BASES = 5; @Option(doc = "The maximum rate of differences between two reads to call them identical.") public double MAX_DIFF_RATE = 0.03; @Option(doc = "The minimum mean quality of the bases in a read pair for the read to be analyzed. Reads with " + "lower average quality are filtered out and not considered in any calculations.") public int MIN_MEAN_QUALITY = 20; @Option(doc = "Do not process self-similar groups that are this many times over the mean expected group size. " + "I.e. if the input contains 10m read pairs and MIN_IDENTICAL_BASES is set to 5, then the mean expected " + "group size would be approximately 10 reads.") public int MAX_GROUP_RATIO = 500; private final Log log = Log.getInstance(EstimateLibraryComplexity.class); /** * Little class to hold the sequence of a pair of reads and tile location information. */ static class PairedReadSequence implements OpticalDuplicateFinder.PhysicalLocation { static int size_in_bytes = 2 + 1 + 4 + 1 + 300; // rough guess at memory footprint short readGroup = -1; short tile = -1; short x = -1, y = -1; boolean qualityOk = true; byte[] read1; byte[] read2; short libraryId; public short getReadGroup() { return this.readGroup; } public void setReadGroup(final short readGroup) { this.readGroup = readGroup; } public short getTile() { return this.tile; } public void setTile(final short tile) { this.tile = tile; } public short getX() { return this.x; } public void setX(final short x) { this.x = x; } public short getY() { return this.y; } public void setY(final short y) { this.y = y; } public short getLibraryId() { return this.libraryId; } public void setLibraryId(final short libraryId) { this.libraryId = libraryId; } } /** * Codec class for writing and read PairedReadSequence objects. */ static class PairedReadCodec implements SortingCollection.Codec<PairedReadSequence> { private DataOutputStream out; private DataInputStream in; public void setOutputStream(final OutputStream out) { this.out = new DataOutputStream(out); } public void setInputStream(final InputStream in) { this.in = new DataInputStream(in); } public void encode(final PairedReadSequence val) { try { this.out.writeShort(val.readGroup); this.out.writeShort(val.tile); this.out.writeShort(val.x); this.out.writeShort(val.y); this.out.writeInt(val.read1.length); this.out.write(val.read1); this.out.writeInt(val.read2.length); this.out.write(val.read2); } catch (IOException ioe) { throw new PicardException("Error write out read pair.", ioe); } } public PairedReadSequence decode() { try { final PairedReadSequence val = new PairedReadSequence(); try { val.readGroup = this.in.readShort(); } catch (EOFException eof) { return null; } val.tile = this.in.readShort(); val.x = this.in.readShort(); val.y = this.in.readShort(); int length = this.in.readInt(); val.read1 = new byte[length]; if (this.in.read(val.read1) != length) { throw new PicardException("Could not read " + length + " bytes from temporary file."); } length = this.in.readInt(); val.read2 = new byte[length]; if (this.in.read(val.read2) != length) { throw new PicardException("Could not read " + length + " bytes from temporary file."); } return val; } catch (IOException ioe) { throw new PicardException("Exception reading read pair.", ioe); } } @Override public SortingCollection.Codec<PairedReadSequence> clone() { return new PairedReadCodec(); } } /** * Comparator that orders read pairs on the first N bases of both reads. */ class PairedReadComparator implements Comparator<PairedReadSequence> { final int BASES = EstimateLibraryComplexity.this.MIN_IDENTICAL_BASES; public int compare(final PairedReadSequence lhs, final PairedReadSequence rhs) { // First compare the first N bases of the first read for (int i = 0; i < BASES; ++i) { final int retval = lhs.read1[i] - rhs.read1[i]; if (retval != 0) return retval; } // Then compare the first N bases of the second read for (int i = 0; i < BASES; ++i) { final int retval = lhs.read2[i] - rhs.read2[i]; if (retval != 0) return retval; } return System.identityHashCode(lhs) - System.identityHashCode(rhs); } } /** Stock main method. */ public static void main(final String[] args) { new EstimateLibraryComplexity().instanceMainWithExit(args); } public EstimateLibraryComplexity() { MAX_RECORDS_IN_RAM = (int) (Runtime.getRuntime().maxMemory() / PairedReadSequence.size_in_bytes) / 2; } /** * Method that does most of the work. Reads through the input BAM file and extracts the * read sequences of each read pair and sorts them via a SortingCollection. Then traverses * the sorted reads and looks at small groups at a time to find duplicates. */ @Override protected int doWork() { for (final File f : INPUT) IOUtil.assertFileIsReadable(f); log.info("Will store " + MAX_RECORDS_IN_RAM + " read pairs in memory before sorting."); final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>(); int recordsRead = 0; final SortingCollection<PairedReadSequence> sorter = SortingCollection.newInstance(PairedReadSequence.class, new PairedReadCodec(), new PairedReadComparator(), MAX_RECORDS_IN_RAM, TMP_DIR); // Loop through the input files and pick out the read sequences etc. final ProgressLogger progress = new ProgressLogger(log, (int) 1e6, "Read"); for (final File f : INPUT) { final Map<String, PairedReadSequence> pendingByName = new HashMap<String, PairedReadSequence>(); final SAMFileReader in = new SAMFileReader(f); readGroups.addAll(in.getFileHeader().getReadGroups()); for (final SAMRecord rec : in) { if (!rec.getReadPairedFlag()) continue; if (!rec.getFirstOfPairFlag() && !rec.getSecondOfPairFlag()) { continue; } PairedReadSequence prs = pendingByName.remove(rec.getReadName()); if (prs == null) { // Make a new paired read object and add RG and physical location information to it prs = new PairedReadSequence(); if (opticalDuplicateFinder.addLocationInformation(rec.getReadName(), prs)) { final SAMReadGroupRecord rg = rec.getReadGroup(); if (rg != null) prs.setReadGroup((short) readGroups.indexOf(rg)); } pendingByName.put(rec.getReadName(), prs); } // Read passes quality check if both ends meet the mean quality criteria final boolean passesQualityCheck = passesQualityCheck(rec.getReadBases(), rec.getBaseQualities(), MIN_IDENTICAL_BASES, MIN_MEAN_QUALITY); prs.qualityOk = prs.qualityOk && passesQualityCheck; // Get the bases and restore them to their original orientation if necessary final byte[] bases = rec.getReadBases(); if (rec.getReadNegativeStrandFlag()) SequenceUtil.reverseComplement(bases); if (rec.getFirstOfPairFlag()) { prs.read1 = bases; } else { prs.read2 = bases; } if (prs.read1 != null && prs.read2 != null && prs.qualityOk) { sorter.add(prs); } progress.record(rec); } } log.info("Finished reading - moving on to scanning for duplicates."); // Now go through the sorted reads and attempt to find duplicates final PeekableIterator<PairedReadSequence> iterator = new PeekableIterator<PairedReadSequence>(sorter.iterator()); final Map<String, Histogram<Integer>> duplicationHistosByLibrary = new HashMap<String, Histogram<Integer>>(); final Map<String, Histogram<Integer>> opticalHistosByLibrary = new HashMap<String, Histogram<Integer>>(); int groupsProcessed = 0; long lastLogTime = System.currentTimeMillis(); final int meanGroupSize = Math.max(1, (recordsRead / 2) / (int) pow(4, MIN_IDENTICAL_BASES * 2)); while (iterator.hasNext()) { // Get the next group and split it apart by library final List<PairedReadSequence> group = getNextGroup(iterator); if (group.size() > meanGroupSize * MAX_GROUP_RATIO) { final PairedReadSequence prs = group.get(0); log.warn("Omitting group with over " + MAX_GROUP_RATIO + " times the expected mean number of read pairs. " + "Mean=" + meanGroupSize + ", Actual=" + group.size() + ". Prefixes: " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES) + " / " + StringUtil.bytesToString(prs.read1, 0, MIN_IDENTICAL_BASES)); } else { final Map<String, List<PairedReadSequence>> sequencesByLibrary = splitByLibrary(group, readGroups); // Now process the reads by library for (final Map.Entry<String, List<PairedReadSequence>> entry : sequencesByLibrary.entrySet()) { final String library = entry.getKey(); final List<PairedReadSequence> seqs = entry.getValue(); Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library); Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library); if (duplicationHisto == null) { duplicationHisto = new Histogram<Integer>("duplication_group_count", library); opticalHisto = new Histogram<Integer>("duplication_group_count", "optical_duplicates"); duplicationHistosByLibrary.put(library, duplicationHisto); opticalHistosByLibrary.put(library, opticalHisto); } // Figure out if any reads within this group are duplicates of one another for (int i = 0; i < seqs.size(); ++i) { final PairedReadSequence lhs = seqs.get(i); if (lhs == null) continue; final List<PairedReadSequence> dupes = new ArrayList<PairedReadSequence>(); for (int j = i + 1; j < seqs.size(); ++j) { final PairedReadSequence rhs = seqs.get(j); if (rhs == null) continue; if (matches(lhs, rhs, MAX_DIFF_RATE)) { dupes.add(rhs); seqs.set(j, null); } } if (dupes.size() > 0) { dupes.add(lhs); final int duplicateCount = dupes.size(); duplicationHisto.increment(duplicateCount); final boolean[] flags = opticalDuplicateFinder.findOpticalDuplicates(dupes); for (final boolean b : flags) { if (b) opticalHisto.increment(duplicateCount); } } else { duplicationHisto.increment(1); } } } ++groupsProcessed; if (lastLogTime < System.currentTimeMillis() - 60000) { log.info("Processed " + groupsProcessed + " groups."); lastLogTime = System.currentTimeMillis(); } } } iterator.close(); sorter.cleanup(); final MetricsFile<DuplicationMetrics, Integer> file = getMetricsFile(); for (final String library : duplicationHistosByLibrary.keySet()) { final Histogram<Integer> duplicationHisto = duplicationHistosByLibrary.get(library); final Histogram<Integer> opticalHisto = opticalHistosByLibrary.get(library); final DuplicationMetrics metrics = new DuplicationMetrics(); metrics.LIBRARY = library; // Filter out any bins that have only a single entry in them and calcu for (final Integer bin : duplicationHisto.keySet()) { final double duplicateGroups = duplicationHisto.get(bin).getValue(); final double opticalDuplicates = opticalHisto.get(bin) == null ? 0 : opticalHisto.get(bin).getValue(); if (duplicateGroups > 1) { metrics.READ_PAIRS_EXAMINED += (bin * duplicateGroups); metrics.READ_PAIR_DUPLICATES += ((bin - 1) * duplicateGroups); metrics.READ_PAIR_OPTICAL_DUPLICATES += opticalDuplicates; } } metrics.calculateDerivedMetrics(); file.addMetric(metrics); file.addHistogram(duplicationHisto); } file.write(OUTPUT); return 0; } /** * Checks to see if two reads pairs have sequence that are the same, give or take a few * errors/diffs as dictated by the maxDiffRate. */ private boolean matches(final PairedReadSequence lhs, final PairedReadSequence rhs, final double maxDiffRate) { final int read1Length = Math.min(lhs.read1.length, rhs.read1.length); final int read2Length = Math.min(lhs.read2.length, rhs.read2.length); final int maxErrors = (int) Math.floor((read1Length + read2Length) * maxDiffRate); int errors = 0; // The loop can start from MIN_IDENTICAL_BASES because we've already confirmed that // at least those first few bases are identical when sorting. for (int i = MIN_IDENTICAL_BASES; i < read1Length; ++i) { if (lhs.read1[i] != rhs.read1[i]) { if (++errors > maxErrors) return false; } } for (int i = MIN_IDENTICAL_BASES; i < read2Length; ++i) { if (lhs.read2[i] != rhs.read2[i]) { if (++errors > maxErrors) return false; } } return true; } /** * Pulls out of the iterator the next group of reads that can be compared to each other to * identify duplicates. */ List<PairedReadSequence> getNextGroup(final PeekableIterator<PairedReadSequence> iterator) { final List<PairedReadSequence> group = new ArrayList<PairedReadSequence>(); final PairedReadSequence first = iterator.next(); group.add(first); outer: while (iterator.hasNext()) { final PairedReadSequence next = iterator.peek(); for (int i = 0; i < MIN_IDENTICAL_BASES; ++i) { if (first.read1[i] != next.read1[i] || first.read2[i] != next.read2[i]) break outer; } group.add(iterator.next()); } return group; } /** * Takes a list of PairedReadSequence objects and splits them into lists by library. */ Map<String, List<PairedReadSequence>> splitByLibrary(final List<PairedReadSequence> input, final List<SAMReadGroupRecord> rgs) { final Map<String, List<PairedReadSequence>> out = new HashMap<String, List<PairedReadSequence>>(); for (final PairedReadSequence seq : input) { String library = null; if (seq.getReadGroup() != -1) { library = rgs.get(seq.getReadGroup()).getLibrary(); if (library == null) library = "Unknown"; } else { library = "Unknown"; } List<PairedReadSequence> librarySeqs = out.get(library); if (librarySeqs == null) { librarySeqs = new ArrayList<PairedReadSequence>(); out.put(library, librarySeqs); } librarySeqs.add(seq); } return out; } /** * Checks that the average quality over the entire read is >= min, and that the first N bases do * not contain any no-calls. */ boolean passesQualityCheck(final byte[] bases, final byte[] quals, final int seedLength, final int minQuality) { if (bases.length < seedLength) return false; for (int i = 0; i < seedLength; ++i) { if (SequenceUtil.isNoCall(bases[i])) return false; } int total = 0; for (final byte b : quals) total += b; return total / quals.length >= minQuality; } }