/* * The MIT License * * Copyright (c) 2014 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.sam.markduplicates; import htsjdk.samtools.util.SamRecordWithOrdinal; import htsjdk.samtools.util.SamRecordTrackingBuffer; import picard.PicardException; import htsjdk.samtools.util.Histogram; import picard.sam.DuplicationMetrics; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.PeekableIterator; import htsjdk.samtools.*; import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy; import htsjdk.samtools.util.CloseableIterator; import picard.sam.markduplicates.util.*; import java.io.File; import java.util.*; /** * This will iterate through a coordinate sorted SAM file (iterator) and either mark or * remove duplicates as appropriate. This class relies on the coordinate sort order as * well as the mate cigar (MC) optional SAM tag. */ public class MarkDuplicatesWithMateCigarIterator implements SAMRecordIterator { private SAMFileHeader header = null; /** The iterator from which records are read. */ private PeekableIterator<SAMRecord> backingIterator = null; /** The ordinal of the next record to be read from the backing iterator */ private int backingIteratorRecordIndex = 0; private boolean removeDuplicates = false; /** Should we skip pairs with no mate cigars or should be throw an error? */ private boolean skipPairsWithNoMateCigar = true; private int numRecordsWithNoMateCigar = 0; /** When we hit unmapped reads that are just before the EOF, we can greedily process them as they will not have coordinates */ private boolean foundUnmappedEOFReads = false; /** We can flush our queues and buffers if we move to a different reference index */ private int referenceIndex = 0; /** * This buffer contains all the records read from input in the same order. Nonetheless, each record * must be examined for duplicate marking, and so we may need to wait for this process to occur. This * buffer stores the records in coordinate order, whether or not they can be emitted, and their associated * duplicate marking flag. By definition, any record in the toMarkQueue will also be in the outputBuffer, * so we can omit checking the size of the toMarkQueue in some cases. */ private SamRecordTrackingBuffer outputBuffer = null; /** * The queue that stores the records that currently are not marked as duplicates. These need to be kept until * they cannot proven not to be duplicates, with the latter records having greater coordinate. The queue is stored in 5' unclipped * ordering, along with keeping the record with the best score, defined by the scoring strategies. If any record * is added to this queue and can be identified as a duplicate, the outputBuffer is notified of its * status and it can be emitted. Therefore, we limit the amount of records in this queue to only those that will NOT * be duplicates. */ private final MarkQueue toMarkQueue; /** The next record to be returned by next * */ private SAMRecord nextRecord = null; /** This gets various information about the library id for a given record */ private final LibraryIdGenerator libraryIdGenerator; /** This is used to identify optical duplicates among sets of duplicates */ private OpticalDuplicateFinder opticalDuplicateFinder = null; /** We use this to check that the input data was in coordinate sort order */ private final SAMRecordCoordinateComparator sortComparator = new SAMRecordCoordinateComparator(); boolean isClosed = false; /** * Initializes the mark duplicates iterator. * * @param header the SAM header * @param iterator an iterator over the SAM records to consider * @param opticalDuplicateFinder the algorithm for optical duplicate detection * @param duplicateScoringStrategy the scoring strategy for choosing duplicates. This cannot be SUM_OF_BASE_QUALITIES. * @param toMarkQueueMinimumDistance minimum distance for which to buffer * @param removeDuplicates true to remove duplicates, false to mark duplicates * @param skipPairsWithNoMateCigar true to not return mapped pairs with no mate cigar, false otherwise * @param blockSize the size of the blocks in the underlying buffer/queue * @param tmpDirs the temporary directories to use if we spill records to disk * @throws PicardException if the inputs are not in coordinate sort order */ public MarkDuplicatesWithMateCigarIterator(final SAMFileHeader header, final CloseableIterator<SAMRecord> iterator, final OpticalDuplicateFinder opticalDuplicateFinder, final ScoringStrategy duplicateScoringStrategy, final int toMarkQueueMinimumDistance, final boolean removeDuplicates, final boolean skipPairsWithNoMateCigar, final int maxRecordsInRam, final int blockSize, final List<File> tmpDirs) throws PicardException { if (header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) { throw new PicardException(getClass().getName() + " expects the input to be in coordinate sort order."); } this.header = header; backingIterator = new PeekableIterator<SAMRecord>(iterator); outputBuffer = new SamRecordTrackingBuffer<SamRecordWithOrdinalAndSetDuplicateReadFlag>(maxRecordsInRam, blockSize, tmpDirs, header, SamRecordWithOrdinalAndSetDuplicateReadFlag.class); this.removeDuplicates = removeDuplicates; this.skipPairsWithNoMateCigar = skipPairsWithNoMateCigar; this.opticalDuplicateFinder = opticalDuplicateFinder; toMarkQueue = new MarkQueue(duplicateScoringStrategy); libraryIdGenerator = new LibraryIdGenerator(header); // Check for supported scoring strategies if (duplicateScoringStrategy == ScoringStrategy.SUM_OF_BASE_QUALITIES) throw new PicardException("SUM_OF_BASE_QUALITIES not supported as this may cause inconsistencies across ends in a pair. Please use a different scoring strategy."); // set up metrics for (final SAMReadGroupRecord readGroup : header.getReadGroups()) { final String library = readGroup.getLibrary(); DuplicationMetrics metrics = libraryIdGenerator.getMetricsByLibrary(library); if (metrics == null) { metrics = new DuplicationMetrics(); metrics.LIBRARY = library; libraryIdGenerator.addMetricsByLibrary(library, metrics); } } // This sets the window size we need to keep to guarantee we can mark duplicates correctly toMarkQueue.setToMarkQueueMinimumDistance(toMarkQueueMinimumDistance); // get the first samRecordWithOrdinal nextRecord = markDuplicatesAndGetTheNextAvailable(); // get one directly, or null } public void logMemoryStats(final Log log) { System.gc(); final Runtime runtime = Runtime.getRuntime(); log.info("freeMemory: " + runtime.freeMemory() + "; totalMemory: " + runtime.totalMemory() + "; maxMemory: " + runtime.maxMemory() + "; output buffer size: " + outputBuffer.size() + "; duplicate queue size: " + toMarkQueue.size() ); } /** * Establishes that records returned by this iterator are expected to * be in the specified sort order. If this method has been called, * then implementers must throw an IllegalStateException from tmpReadEnds() * when a samRecordWithOrdinal is read that violates the sort order. This method * may be called multiple times over the course of an iteration, * changing the expected sort, if desired -- from the time it is called, * it validates whatever sort is set, or stops validating if it * is set to null or SAMFileHeader.SortOrder.unsorted. If this method * is not called, then no validation of the iterated records is done. * * @param sortOrder The order in which records are expected to be returned * @return This SAMRecordIterator */ @Override public SAMRecordIterator assertSorted(final SAMFileHeader.SortOrder sortOrder) { if (sortOrder != SAMFileHeader.SortOrder.coordinate) { throw new IllegalStateException("Cannot assort " + sortOrder + " when expecting coordinate sorted input"); } return this; } @Override public boolean hasNext() { // fast succeed if (null != nextRecord) return true; // We would need to get another record, so check if we can either a record read from the input to the mark queue, or we have more that we should return. // There should be at no time records in the mark queue that are not tracked in the output buffer. return (backingIterator.hasNext() || !outputBuffer.isEmpty()); } @Override public SAMRecord next() throws PicardException { final SAMRecord toReturn = nextRecord; // save for return // This should always return an element // NB: it should be the case that nextRecord != null if (null == toReturn) { throw new NoSuchElementException(); } // Get the next record, if possible // NB: it should be the case that (nextRecord != null), due to the (null == toReturn) above if (hasNext()) { nextRecord = markDuplicatesAndGetTheNextAvailable(); // get one more, if possible } else { nextRecord = null; } // Check for sorted order if (null != nextRecord && 0 < sortComparator.fileOrderCompare(toReturn, nextRecord)) { System.err.print("Previous record: " + toReturn.getSAMString()); System.err.print("Current record:" + nextRecord.getSAMString()); throw new PicardException("Records were not found coordinate sort order"); } return toReturn; } /** * Handles records that are paired with both ends mapped, but lacking a mate cigar. This returns true if we * can ignore this record after calling this method (when reading input), false otherwise. */ private boolean ignoreDueToMissingMateCigar(final SamRecordWithOrdinal samRecordWithOrdinal) { final SAMRecord record = samRecordWithOrdinal.getRecord(); // ignore/except-on paired records with mapped mate and no mate cigar if (record.getReadPairedFlag() && !record.getMateUnmappedFlag() && null == SAMUtils.getMateCigar(record)) { // paired with one end unmapped and no mate cigar // NB: we are not truly examining these records. Do we want to count them? if (!record.isSecondaryOrSupplementary()) { // update metrics final DuplicationMetrics metrics = getMetrics(record); if (record.getReadUnmappedFlag()) { ++metrics.UNMAPPED_READS; } else if (!record.getReadPairedFlag() || record.getMateUnmappedFlag()) { ++metrics.UNPAIRED_READS_EXAMINED; } else { ++metrics.READ_PAIRS_EXAMINED; } } if (skipPairsWithNoMateCigar) { // pseudo-silently ignores them // NB: need to addRecordToTheOutputBuffer/set-flag as chunking/flushing of the toMarkQueue may need to occur addRecordToTheOutputBuffer(samRecordWithOrdinal); // now samRecordWithOrdinal will be stored in outputBuffer for return backingIteratorRecordIndex++; outputBuffer.setResultState(samRecordWithOrdinal, false); // indicate the present wrapped samRecordWithOrdinal is available for return numRecordsWithNoMateCigar++; backingIterator.next(); // remove it, since we called backingIterator.peek() return true; } else { throw new PicardException("Read " + record.getReadName() + " was mapped and had a mapped mate, but no mate cigar (\"MC\") tag."); } } return false; } /** * This handles unmapped records at the end of the file. If this is the first time we have found them, then we * can empty the toMarkQueue and call markDuplicatesAndGetTheNextAvailable, otherwise we can just emit them. The * duplication metrics will be updated. */ private SAMRecord nextIfRecordIsUnmappedAtEOF(final SAMRecord record) { // when we find unmapped reads with -1 as their reference index, there should be no mapped reads later in the file. if (foundUnmappedEOFReads) { // previously found unmapped reads at the end of the file final SAMRecord unmappedRecord = backingIterator.next(); // since we called backingIterator.peek() if (!record.isSecondaryOrSupplementary()) { // update metrics final DuplicationMetrics metrics = getMetrics(record); ++metrics.UNMAPPED_READS; } // We should have no more in the queue if (!outputBuffer.isEmpty()) { throw new PicardException("Encountered unmapped reads at the end of the file, but the alignment start buffer was not empty."); } return unmappedRecord; // unmapped end of file records can simply be emitted - no need to duplicate mark them } else { foundUnmappedEOFReads = true; // move past all mapped reads referenceIndex = header.getSequenceDictionary().getSequences().size(); // do the final round of duplicate marking tryPollingTheToMarkQueue(true, null); // NB: we do not call next here since we will recurse and perhaps hit the flush, or re-enter the if with unmapped EOF reads return markDuplicatesAndGetTheNextAvailable(); // this should flush the buffer } } /** * Check that we are not incorrectly performing any duplicate marking, by having too few of the records. This * can happen if the alignment start is increasing but 5' soft-clipping is increasing such that we miss reads with * the same 5' unclipped alignment start. This is especially true for RNAseq. */ private void checkForMinimumDistanceFailure(final ReadEndsForMateCigar current) { if (!toMarkQueue.isEmpty()) { final ReadEndsForMateCigar other = toMarkQueue.peek(); if (other.read1ReferenceIndex == current.read1ReferenceIndex && toMarkQueue.getToMarkQueueMinimumDistance() <= other.read1Coordinate - current.read1Coordinate) { if (checkCigarForSkips(other.getRecord().getCigar())) { throw new PicardException("Found a samRecordWithOrdinal with sufficiently large code length that we may have\n" + " missed including it in an early duplicate marking iteration. Alignment contains skipped" + " reference bases (N's). If this is an\n RNAseq aligned bam, please use MarkDuplicates instead," + " as this tool does not work well with spliced reads.\n Minimum distance set to " + toMarkQueue.getToMarkQueueMinimumDistance() + " but " + (other.read1Coordinate - current.read1Coordinate - 1) + " would be required.\n" + "Record was: " + other.getRecord().getSAMString()); } else { System.err.print("record #1: " + other.getRecord().getSAMString()); System.err.print("record #2: " + current.getRecord().getSAMString()); throw new PicardException("Found a samRecordWithOrdinal with sufficiently large clipping that we may have\n" + " missed including it in an early duplicate marking iteration. Please increase the" + " minimum distance to at least " + (other.read1Coordinate - current.read1Coordinate - 1) + "bp\nto ensure it is considered (was " + toMarkQueue.getToMarkQueueMinimumDistance() + ").\n" + "Record was: " + other.getRecord().getSAMString()); } } } } /** * This tries to get a record that has been evaluated for duplicate marking. It does this by first seeing if there * are any records that have been through duplicate marking. If none are available, it will try to get more records * from the input iterator until there are reads available that have been duplicate marked. If there are no more * records available from the input iterator, it will duplicate mark the final chunk of records. Finally, if there * are no more records, it will return null; */ private SAMRecord markDuplicatesAndGetTheNextAvailable() { // Check if there are any we can flush output buffer { // NB: braces to limit the scope of 'record' final SAMRecord record = flush(); if (null != record) return record; } // Check if there are any more records to read in if (!backingIterator.hasNext()) { // no more records to read in // Check if there are any more to mark if (toMarkQueue.isEmpty()) { // check if there are any that can be outputted if (outputBuffer.isEmpty()) { return null; } // no need to flush; no records in the queue and buffer } else { // force marking duplicates on the remaining records tryPollingTheToMarkQueue(true, null); } /** Since we have no more records to read in, and no more records that need duplicate marking run, we * update our coordinate to past the end of the reference */ referenceIndex = header.getSequenceDictionary().getSequences().size(); /** Now we recurse, so that we can flush from the outputBuffer until it is empty, then return null when * all of the input, queue, and buffer are empty */ return markDuplicatesAndGetTheNextAvailable(); } /** We need to retrieve more records from the input iterator and duplicate mark, until we can return one that * has been through duplicate marking. */ while (backingIterator.hasNext()) { // NB: we could get rid of this if we made nextRecord into a list... // NB: we do not actually remove this record from the backing iterator until much later, to help with processing unmapped reads at the EOF SAMRecord record = backingIterator.peek(); // peek: used for unmapped reads final SamRecordWithOrdinal samRecordWithOrdinal = new SamRecordWithOrdinalAndSetDuplicateReadFlag(record, backingIteratorRecordIndex); ReadEndsForMateCigar readEnds = null; boolean performedChunkAndMarkTheDuplicates = false; // remove duplicate information record.setDuplicateReadFlag(false); /** Check for pairs that have both ends mapped and missing mate cigar. */ if (ignoreDueToMissingMateCigar(samRecordWithOrdinal)) { continue; } // check for an unmapped read if (record.getReadUnmappedFlag()) { // unmapped reads at the end of the file! if (-1 == record.getReferenceIndex()) { // NB: this may call markDuplicatesAndGetTheNextAvailable if this is the first time a EOF unmapped record has been seen return nextIfRecordIsUnmappedAtEOF(record); } else if (!record.isSecondaryOrSupplementary()) { // update metrics final DuplicationMetrics metrics = getMetrics(record); ++metrics.UNMAPPED_READS; } // we will check for unmapped reads later so as not to add them to mark queue } else { // If not already set, this sets the minimum distance to twice the read length, or 100, whichever is larger if (-1 == toMarkQueue.getToMarkQueueMinimumDistance()) { // use twice the first read's length toMarkQueue.setToMarkQueueMinimumDistance(Math.max(2 * record.getReadBases().length, 100)); } // build a read end for use in the toMarkQueue readEnds = new ReadEndsForMateCigar(header, samRecordWithOrdinal, opticalDuplicateFinder, libraryIdGenerator.getLibraryId(samRecordWithOrdinal.getRecord())); // check that the minimumDistance was not too small checkForMinimumDistanceFailure(readEnds); /** * If we can do some duplicate marking, lets do it! * IMPORTANT: this does not flush the to-mark-queue, so the minimum distance needs to be set for us to infer * which records will never be supplemented (i.e. are non-duplicate). */ performedChunkAndMarkTheDuplicates = tryPollingTheToMarkQueue(false, readEnds); } // We can now remove the record from the input backingIterator.next(); // Add this to the outputBuffer so it can be tracked. It will not be available to emit until it has been through duplicate marking. addRecordToTheOutputBuffer(samRecordWithOrdinal); backingIteratorRecordIndex++; // Each record is has an index and is emitted in the same order. This helps that. // We do not consider secondary, supplementary, or unmapped alignments for duplicate marking. We can thus mark that duplicate marking on them has been completed. if (record.isSecondaryOrSupplementary() || record.getReadUnmappedFlag()) { outputBuffer.setResultState(samRecordWithOrdinal, false); } else { // Bring the simple metrics up to date final DuplicationMetrics metrics = getMetrics(record); if (!record.getReadPairedFlag() || record.getMateUnmappedFlag()) { ++metrics.UNPAIRED_READS_EXAMINED; } else { ++metrics.READ_PAIRS_EXAMINED; // will need to be divided by 2 at the end } // Add the record for duplicate marking, which may in fact cause it to be duplicate marked or stored for later toMarkQueue.add(readEnds, outputBuffer, getMetrics(readEnds.getRecord())); } // Check if there are any we can flush, which happens if we just performed duplicate marking if (performedChunkAndMarkTheDuplicates) { record = flush(); if (null != record) return record; } } // try again, as we may have records we can flush, or we want to see if we are at the EOF return markDuplicatesAndGetTheNextAvailable(); } @Override public void remove() { throw new UnsupportedOperationException(); } @Override public void close() { // close the input and output backingIterator.close(); outputBuffer.close(); isClosed = true; } /** * Checks a Cigar for the presence of N operators. Reads with skipped bases may be spliced RNAseq reads * * @param cigar */ private boolean checkCigarForSkips(final Cigar cigar) { final List<CigarElement> elements = cigar.getCigarElements(); for (final CigarElement el : elements) { if (el.getOperator() == CigarOperator.N) return true; } return false; } private void enforceClosed() { if (!isClosed) throw new PicardException("Calling a method that assumes the iterator is closed"); } /** Useful for statistics after the iterator has been exhausted and closed. */ public int getNumRecordsWithNoMateCigar() { enforceClosed(); return numRecordsWithNoMateCigar; } public int getNumDuplicates() { enforceClosed(); return toMarkQueue.getNumDuplicates(); } public LibraryIdGenerator getLibraryIdGenerator() { enforceClosed(); return libraryIdGenerator; } public Histogram<Short> getOpticalDupesByLibraryId() { enforceClosed(); return libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap(); } /** * Gets a SAMRecord if one is available after marking. This enforces that we return records in the original * coordinate sort order in a stable fashion. * * @return record representing the head of the alignment-start sorted buffer, or null if the head record has not yet been duplicate marked */ private SAMRecord flush() { // Check that there is at least one record in the coordinate-sorted buffer, and that the head record has been through duplicate-marking while (!outputBuffer.isEmpty() && outputBuffer.canEmit()) { // the buffer contains wrapped SAMRecords, which we want to unwrap final SAMRecord record = outputBuffer.next().getRecord(); // If this read is a duplicate, do we want to remove it (continue the loop) or return it for emission? if (!removeDuplicates || !record.getDuplicateReadFlag()) { return record; } } return null; } /** * Adds a samRecordWithOrdinal to the output buffer. This does not mean that it is ready to be emitted, since it may need to be * duplicate marked. * * @param samRecordWithOrdinal the index of the record of which to track. * @throws PicardException if the records are added out of order */ private void addRecordToTheOutputBuffer(final SamRecordWithOrdinal samRecordWithOrdinal) throws PicardException { final int recordReferenceIndex = samRecordWithOrdinal.getRecord().getReferenceIndex(); if (recordReferenceIndex < referenceIndex) { throw new PicardException("Records out of order: " + recordReferenceIndex + " < " + referenceIndex); } else if (referenceIndex < recordReferenceIndex) { // new reference, so we need to mark duplicates on the current ones // NB: we will not miss inter-chromosomal alignments since presumably one end will have been mapped to this chromosome and processed, and we do not need the other end to do so. tryPollingTheToMarkQueue(true, null); // update genomic coordinate to the next reference index referenceIndex = recordReferenceIndex; } // add the samRecordWithOrdinal to the output buffer so that it can be tracked outputBuffer.add(samRecordWithOrdinal); } /** * Tries to get a record from the toMarkQueue that has been successfully through duplicate marking. Note, either flush is true or * current must be non-null. * * @param flush true if we should empty the toMarkQueue fully. * @param current the current end to ensure we consider all possible ends for a duplicate * @return true if we did get at least one record, false otherwise */ private boolean tryPollingTheToMarkQueue(final boolean flush, final ReadEndsForMateCigar current) { boolean performedChunkAndMarkTheDuplicates = false; if (!flush && null == current) throw new PicardException("Flush cannot be false and current be null"); if (toMarkQueue.isEmpty()) return false; if (!toMarkQueue.isEmpty() && outputBuffer.isEmpty()) { throw new PicardException("0 < toMarkQueue && outputBuffer.isEmpty()"); } /** * Try to poll the toMarkQueue. If we are flushing all the records from it, just do so until empty. Otherwise, we need to * make sure we only poll those a certain distance away from current. */ while (!toMarkQueue.isEmpty() && (flush || referenceIndex != current.read1ReferenceIndex || toMarkQueue.getToMarkQueueMinimumDistance() < current.read1Coordinate - toMarkQueue.peek().read1Coordinate)) { // Poll will track that this samRecordWithOrdinal has been through duplicate marking. It is not marked as a duplicate :) final ReadEndsForMateCigar next = toMarkQueue.poll(outputBuffer, header, opticalDuplicateFinder, libraryIdGenerator); // get the first one! performedChunkAndMarkTheDuplicates = true; // track optical duplicates using only those reads that are the first end... if (toMarkQueue.shouldBeInLocations(next) && next.getRecord().getFirstOfPairFlag()) { final Set<ReadEnds> locations = toMarkQueue.getLocations(next); if (!locations.isEmpty()) { AbstractMarkDuplicatesCommandLineProgram.trackOpticalDuplicates(new ArrayList<ReadEnds>(locations), opticalDuplicateFinder, libraryIdGenerator); } } // NB: we could try to greedily return a record if one is available here. Instead we continue processing the mark queue */ } return performedChunkAndMarkTheDuplicates; } /** Get the duplication metrics for the library associated with end. */ private DuplicationMetrics getMetrics(final SAMRecord record) { final String library = LibraryIdGenerator.getLibraryName(header, record); DuplicationMetrics metrics = libraryIdGenerator.getMetricsByLibrary(library); if (metrics == null) { metrics = new DuplicationMetrics(); metrics.LIBRARY = library; libraryIdGenerator.addMetricsByLibrary(library, metrics); } return metrics; } }