/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam.markduplicates;
import picard.PicardException;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.SamOrBam;
import picard.sam.DuplicationMetrics;
import htsjdk.samtools.ReservedTagConstants;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.*;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.SortingCollection;
import htsjdk.samtools.util.SortingLongCollection;
import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy;
import picard.sam.markduplicates.util.*;
import picard.sam.util.RepresentativeReadIndexer;
import java.io.*;
import java.util.*;
/**
* A better duplication marking algorithm that handles all cases including clipped
* and gapped alignments.
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
usage = MarkDuplicates.USAGE_SUMMARY + MarkDuplicates.USAGE_DETAILS,
usageShort = MarkDuplicates.USAGE_SUMMARY,
programGroup = SamOrBam.class
)
public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram {
static final String USAGE_SUMMARY = "Identifies duplicate reads. ";
static final String USAGE_DETAILS = "<p>This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are " +
"defined as originating from a single fragment of DNA. Duplicates can arise during sample preparation e.g. library " +
"construction using PCR. See also " +
"<a href='https://broadinstitute.github.io/picard/command-line-overview.html#EstimateLibraryComplexity'>EstimateLibraryComplexity</a>" +
" for additional notes on PCR duplication artifacts. Duplicate reads can also result from a single amplification cluster, " +
"incorrectly detected as multiple clusters by the optical sensor of the sequencing instrument. These duplication artifacts are " +
"referred to as optical duplicates.</p>" +
"" +
"<p>The MarkDuplicates tool works by comparing sequences in the 5 prime positions of both reads and read-pairs in a SAM/BAM file. " +
"An BARCODE_TAG option is available to facilitate duplicate marking using molecular barcodes. After duplicate reads are" +
" collected, the tool differentiates the primary and duplicate reads using an algorithm that ranks reads by the sums " +
"of their base-quality scores (default method).</p> " +
"<p>The tool's main output is a new SAM or BAM file, in which duplicates have been identified in the SAM flags field for each" +
" read. Duplicates are marked with the hexadecimal value of 0x0400, which corresponds to a decimal value of 1024. " +
"If you are not familiar with this type of annotation, please see the following " +
"<a href='https://www.broadinstitute.org/gatk/blog?id=7019'>blog post</a> for additional information.</p>" +
"" +
"<p>Although the bitwise flag annotation indicates whether a read was marked as a duplicate, it does not identify the type of " +
"duplicate. To do this, a new tag called the duplicate type (DT) tag was recently added as an optional output in " +
"the 'optional field' section of a SAM/BAM file. Invoking the TAGGING_POLICY option," +
" you can instruct the program to mark all the duplicates (All), only the optical duplicates (OpticalOnly), or no " +
"duplicates (DontTag). The records within the output of a SAM/BAM file will have values for the 'DT' tag (depending on the invoked " +
"TAGGING_POLICY), as either library/PCR-generated duplicates (LB), or sequencing-platform artifact duplicates (SQ). " +
"This tool uses the READ_NAME_REGEX and the OPTICAL_DUPLICATE_PIXEL_DISTANCE options as the primary methods to identify " +
"and differentiate duplicate types. Set READ_NAME_REGEX to null to skip optical duplicate detection, e.g. for RNA-seq " +
"or other data where duplicate sets are extremely large and estimating library complexity is not an aim. " +
"Note that without optical duplicate counts, library size estimation will be inaccurate.</p> "+
"<p>MarkDuplicates also produces a metrics file indicating the numbers of duplicates for both single- and paired-end reads.</p> "+
"<p>The program can take either coordinate-sorted or query-sorted inputs, however the behavior is slightly different. " +
"When the input is coordinate-sorted, unmapped mates of mapped records and supplementary/secondary alignments are not " +
"marked as duplicates. However, when the input is query-sorted (actually query-grouped), " +
"then unmapped mates and secondary/supplementary reads are not excluded from the duplication test and can be" +
" marked as duplicate reads.</p> " +
"<p>If desired, duplicates can be removed using the REMOVE_DUPLICATE and REMOVE_SEQUENCING_DUPLICATES options.</p>" +
"" +
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar MarkDuplicates \\<br />" +
" I=input.bam \\<br />" +
" O=marked_duplicates.bam \\<br />" +
" M=marked_dup_metrics.txt" +
"</pre>" +
"" +
"Please see " +
"<a href='http://broadinstitute.github.io/picard/picard-metric-definitions.html#DuplicationMetrics'>MarkDuplicates</a> " +
"for detailed explanations of the output metrics." +
"<hr />";
/** Enum used to control how duplicates are flagged in the DT optional tag on each read. */
public enum DuplicateTaggingPolicy { DontTag, OpticalOnly, All }
/** The optional attribute in SAM/BAM files used to store the duplicate type. */
public static final String DUPLICATE_TYPE_TAG = "DT";
/** The duplicate type tag value for duplicate type: library. */
public static final String DUPLICATE_TYPE_LIBRARY = "LB";
/** The duplicate type tag value for duplicate type: sequencing (optical & pad-hopping, or "co-localized"). */
public static final String DUPLICATE_TYPE_SEQUENCING = "SQ";
/** The attribute in the SAM/BAM file used to store which read was selected as representative out of a duplicate set */
public static final String DUPLICATE_SET_INDEX_TAG = "DI";
/** The attribute in the SAM/BAM file used to store the size of a duplicate set */
public static final String DUPLICATE_SET_SIZE_TAG = "DS";
/** Enum for the possible values that a duplicate read can be tagged with in the DT attribute. */
public enum DuplicateType {
LIBRARY(DUPLICATE_TYPE_LIBRARY),
SEQUENCING(DUPLICATE_TYPE_SEQUENCING);
private final String code;
DuplicateType(final String code) { this.code = code; }
public String code() { return this.code; }
}
private final Log log = Log.getInstance(MarkDuplicates.class);
/**
* If more than this many sequences in SAM file, don't spill to disk because there will not
* be enough file handles.
*/
@Option(shortName = "MAX_SEQS",
doc = "This option is obsolete. ReadEnds will always be spilled to disk.")
public int MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP = 50000;
@Option(shortName = "MAX_FILE_HANDLES",
doc = "Maximum number of file handles to keep open when spilling read ends to disk. " +
"Set this number a little lower than the per-process maximum number of file that may be open. " +
"This number can be found by executing the 'ulimit -n' command on a Unix system.")
public int MAX_FILE_HANDLES_FOR_READ_ENDS_MAP = 8000;
@Option(doc = "This number, plus the maximum RAM available to the JVM, determine the memory footprint used by " +
"some of the sorting collections. If you are running out of memory, try reducing this number.")
public double SORTING_COLLECTION_SIZE_RATIO = 0.25;
@Option(doc = "Barcode SAM tag (ex. BC for 10X Genomics)", optional = true)
public String BARCODE_TAG = null;
@Option(doc = "Read one barcode SAM tag (ex. BX for 10X Genomics)", optional = true)
public String READ_ONE_BARCODE_TAG = null;
@Option(doc = "Read two barcode SAM tag (ex. BX for 10X Genomics)", optional = true)
public String READ_TWO_BARCODE_TAG = null;
@Option(doc = "If a read appears in a duplicate set, add two tags. The first tag, DUPLICATE_SET_SIZE_TAG (DS), " +
"indicates the size of the duplicate set. The smallest possible DS value is 2 which occurs when two " +
"reads map to the same portion of the reference only one of which is marked as duplicate. The second " +
"tag, DUPLICATE_SET_INDEX_TAG (DI), represents a unique identifier for the duplicate set to which the " +
"record belongs. This identifier is the index-in-file of the representative read that was selected out " +
"of the duplicate set.", optional = true)
public boolean TAG_DUPLICATE_SET_MEMBERS = false;
@Option(doc = "If true remove 'optical' duplicates and other duplicates that appear to have arisen from the " +
"sequencing process instead of the library preparation process, even if REMOVE_DUPLICATES is false. " +
"If REMOVE_DUPLICATES is true, all duplicates are removed and this option is ignored.")
public boolean REMOVE_SEQUENCING_DUPLICATES = false;
@Option(doc= "Determines how duplicate types are recorded in the DT optional attribute.")
public DuplicateTaggingPolicy TAGGING_POLICY = DuplicateTaggingPolicy.DontTag;
private SortingCollection<ReadEndsForMarkDuplicates> pairSort;
private SortingCollection<ReadEndsForMarkDuplicates> fragSort;
private SortingLongCollection duplicateIndexes;
private SortingLongCollection opticalDuplicateIndexes;
private SortingCollection<RepresentativeReadIndexer> representativeReadIndicesForDuplicates;
private int numDuplicateIndices = 0;
static private final long NO_SUCH_INDEX = Long.MAX_VALUE; // needs to be large so that that >= test fails for query-sorted traversal
protected LibraryIdGenerator libraryIdGenerator = null; // this is initialized in buildSortedReadEndLists
private int getBarcodeValue(final SAMRecord record) {
return EstimateLibraryComplexity.getReadBarcodeValue(record, BARCODE_TAG);
}
private int getReadOneBarcodeValue(final SAMRecord record) {
return EstimateLibraryComplexity.getReadBarcodeValue(record, READ_ONE_BARCODE_TAG);
}
private int getReadTwoBarcodeValue(final SAMRecord record) {
return EstimateLibraryComplexity.getReadBarcodeValue(record, READ_TWO_BARCODE_TAG);
}
public MarkDuplicates() {
DUPLICATE_SCORING_STRATEGY = ScoringStrategy.SUM_OF_BASE_QUALITIES;
}
/** Stock main method. */
public static void main(final String[] args) {
new MarkDuplicates().instanceMainWithExit(args);
}
/**
* Main work method. Reads the BAM file once and collects sorted information about
* the 5' ends of both ends of each read (or just one end in the case of pairs).
* Then makes a pass through those determining duplicates before re-reading the
* input file and writing it out with duplication flags set correctly.
*/
protected int doWork() {
IOUtil.assertInputsAreValid(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
IOUtil.assertFileIsWritable(METRICS_FILE);
final boolean useBarcodes = (null != BARCODE_TAG || null != READ_ONE_BARCODE_TAG || null != READ_TWO_BARCODE_TAG);
reportMemoryStats("Start of doWork");
log.info("Reading input file and constructing read end information.");
buildSortedReadEndLists(useBarcodes);
reportMemoryStats("After buildSortedReadEndLists");
generateDuplicateIndexes(useBarcodes, this.REMOVE_SEQUENCING_DUPLICATES || this.TAGGING_POLICY != DuplicateTaggingPolicy.DontTag);
reportMemoryStats("After generateDuplicateIndexes");
log.info("Marking " + this.numDuplicateIndices + " records as duplicates.");
if (this.READ_NAME_REGEX == null) {
log.warn("Skipped optical duplicate cluster discovery; library size estimation may be inaccurate!");
} else {
log.info("Found " + (this.libraryIdGenerator.getNumberOfOpticalDuplicateClusters()) + " optical duplicate clusters.");
}
final SamHeaderAndIterator headerAndIterator = openInputs();
final SAMFileHeader header = headerAndIterator.header;
final SAMFileHeader.SortOrder sortOrder = header.getSortOrder();
final SAMFileHeader outputHeader = header.clone();
log.info("Reads are assumed to be ordered by: " + sortOrder);
if (sortOrder != SAMFileHeader.SortOrder.coordinate && sortOrder != SAMFileHeader.SortOrder.queryname) {
throw new PicardException("This program requires input that are either coordinate or query sorted. " +
"Found "+ sortOrder);
}
COMMENT.forEach(outputHeader::addComment);
// Key: previous PG ID on a SAM Record (or null). Value: New PG ID to replace it.
final Map<String, String> chainedPgIds = getChainedPgIds(outputHeader);
final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outputHeader,
true,
OUTPUT);
// Now copy over the file while marking all the necessary indexes as duplicates
long recordInFileIndex = 0;
long nextOpticalDuplicateIndex = this.opticalDuplicateIndexes != null && this.opticalDuplicateIndexes.hasNext() ? this.opticalDuplicateIndexes.next() : NO_SUCH_INDEX;
long nextDuplicateIndex = (this.duplicateIndexes.hasNext() ? this.duplicateIndexes.next() : NO_SUCH_INDEX);
// initialize variables for optional representative read tagging
CloseableIterator<RepresentativeReadIndexer> representativeReadIterator = null;
RepresentativeReadIndexer rri = null;
int representativeReadIndexInFile = -1;
int duplicateSetSize = -1;
int nextRepresentativeIndex = -1;
if (TAG_DUPLICATE_SET_MEMBERS) {
representativeReadIterator = this.representativeReadIndicesForDuplicates.iterator();
if (representativeReadIterator.hasNext()) {
rri = representativeReadIterator.next();
nextRepresentativeIndex = rri.readIndexInFile;
representativeReadIndexInFile = rri.representativeReadIndexInFile;
duplicateSetSize = rri.setSize;
}
}
final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Written");
final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
String duplicateQueryName = null;
String opticalDuplicateQueryName = null;
while (iterator.hasNext()) {
final SAMRecord rec = iterator.next();
final String library = LibraryIdGenerator.getLibraryName(header, rec);
DuplicationMetrics metrics = libraryIdGenerator.getMetricsByLibrary(library);
if (metrics == null) {
metrics = new DuplicationMetrics();
metrics.LIBRARY = library;
libraryIdGenerator.addMetricsByLibrary(library, metrics);
}
// First bring the simple metrics up to date
if (rec.getReadUnmappedFlag()) {
++metrics.UNMAPPED_READS;
} else if(rec.isSecondaryOrSupplementary()) {
++metrics.SECONDARY_OR_SUPPLEMENTARY_RDS;
} else if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) {
++metrics.UNPAIRED_READS_EXAMINED;
} else {
++metrics.READ_PAIRS_EXAMINED; // will need to be divided by 2 at the end
}
// Now try and figure out the next duplicate index (if going by coordinate. if going by query name, only do this
// if the query name has changed.
final boolean needNextDuplicateIndex = recordInFileIndex > nextDuplicateIndex &&
(sortOrder == SAMFileHeader.SortOrder.coordinate || !rec.getReadName().equals(duplicateQueryName));
if (needNextDuplicateIndex) {
nextDuplicateIndex = (this.duplicateIndexes.hasNext() ? this.duplicateIndexes.next() : NO_SUCH_INDEX);
}
final boolean isDuplicate = recordInFileIndex == nextDuplicateIndex ||
(sortOrder == SAMFileHeader.SortOrder.queryname &&
recordInFileIndex > nextDuplicateIndex && rec.getReadName().equals(duplicateQueryName));
if (isDuplicate) {
duplicateQueryName = rec.getReadName();
rec.setDuplicateReadFlag(true);
// only update duplicate counts for "decider" reads, not tag-a-long reads
if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) {
// Update the duplication metrics
if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) {
++metrics.UNPAIRED_READ_DUPLICATES;
} else {
++metrics.READ_PAIR_DUPLICATES;// will need to be divided by 2 at the end
}
}
} else {
rec.setDuplicateReadFlag(false);
}
// Manage the flagging of optical/sequencing duplicates
final boolean needNextOpticalDuplicateIndex = recordInFileIndex > nextOpticalDuplicateIndex &&
(sortOrder == SAMFileHeader.SortOrder.coordinate || !rec.getReadName().equals(opticalDuplicateQueryName));
// Possibly figure out the next opticalDuplicate index (if going by coordinate, if going by query name, only do this
// if the query name has changed)
if (needNextOpticalDuplicateIndex) {
nextOpticalDuplicateIndex = (this.opticalDuplicateIndexes.hasNext() ? this.opticalDuplicateIndexes.next() : NO_SUCH_INDEX);
}
final boolean isOpticalDuplicate = sortOrder == SAMFileHeader.SortOrder.queryname &&
recordInFileIndex > nextOpticalDuplicateIndex &&
rec.getReadName().equals(opticalDuplicateQueryName) ||
recordInFileIndex == nextOpticalDuplicateIndex;
rec.setAttribute(DUPLICATE_TYPE_TAG, null);
if (this.TAGGING_POLICY != DuplicateTaggingPolicy.DontTag && rec.getDuplicateReadFlag()) {
if (isOpticalDuplicate) {
opticalDuplicateQueryName = rec.getReadName();
rec.setAttribute(DUPLICATE_TYPE_TAG, DuplicateType.SEQUENCING.code());
} else if (this.TAGGING_POLICY == DuplicateTaggingPolicy.All) {
rec.setAttribute(DUPLICATE_TYPE_TAG, DuplicateType.LIBRARY.code());
}
}
// Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name
if (TAG_DUPLICATE_SET_MEMBERS) {
final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex;
if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) {
rri = representativeReadIterator.next();
nextRepresentativeIndex = rri.readIndexInFile;
representativeReadIndexInFile = rri.representativeReadIndexInFile;
duplicateSetSize = rri.setSize;
}
final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex ||
(sortOrder == SAMFileHeader.SortOrder.queryname &&
recordInFileIndex > nextDuplicateIndex);
if (isInDuplicateSet) {
if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) {
if (TAG_DUPLICATE_SET_MEMBERS) {
rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile);
rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize);
}
}
}
}
// Output the record if desired and bump the record index
recordInFileIndex++;
if (this.REMOVE_DUPLICATES && rec.getDuplicateReadFlag()) continue;
if (this.REMOVE_SEQUENCING_DUPLICATES && isOpticalDuplicate) continue;
if (PROGRAM_RECORD_ID != null) rec.setAttribute(SAMTag.PG.name(), chainedPgIds.get(rec.getStringAttribute(SAMTag.PG.name())));
out.addAlignment(rec);
progress.record(rec);
}
// remember to close the inputs
iterator.close();
this.duplicateIndexes.cleanup();
if (TAG_DUPLICATE_SET_MEMBERS) {
this.representativeReadIndicesForDuplicates.cleanup();
}
reportMemoryStats("Before output close");
out.close();
reportMemoryStats("After output close");
// Write out the metrics
finalizeAndWriteMetrics(libraryIdGenerator);
return 0;
}
/**
* package-visible for testing
*/
long numOpticalDuplicates() { return ((long) this.libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap().getSumOfValues()); } // cast as long due to returning a double
/** Print out some quick JVM memory stats. */
private void reportMemoryStats(final String stage) {
System.gc();
final Runtime runtime = Runtime.getRuntime();
log.info(stage + " freeMemory: " + runtime.freeMemory() + "; totalMemory: " + runtime.totalMemory() +
"; maxMemory: " + runtime.maxMemory());
}
/**
* Goes through all the records in a file and generates a set of ReadEndsForMarkDuplicates objects that
* hold the necessary information (reference sequence, 5' read coordinate) to do
* duplication, caching to disk as necessary to sort them.
*/
private void buildSortedReadEndLists(final boolean useBarcodes) {
final int sizeInBytes;
if (useBarcodes) {
sizeInBytes = ReadEndsForMarkDuplicatesWithBarcodes.getSizeOf();
} else {
sizeInBytes = ReadEndsForMarkDuplicates.getSizeOf();
}
MAX_RECORDS_IN_RAM = (int) (Runtime.getRuntime().maxMemory() / sizeInBytes) / 2;
final int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * SORTING_COLLECTION_SIZE_RATIO) / sizeInBytes);
log.info("Will retain up to " + maxInMemory + " data points before spilling to disk.");
final ReadEndsForMarkDuplicatesCodec fragCodec, pairCodec, diskCodec;
if (useBarcodes) {
fragCodec = new ReadEndsForMarkDuplicatesWithBarcodesCodec();
pairCodec = new ReadEndsForMarkDuplicatesWithBarcodesCodec();
diskCodec = new ReadEndsForMarkDuplicatesWithBarcodesCodec();
} else {
fragCodec = new ReadEndsForMarkDuplicatesCodec();
pairCodec = new ReadEndsForMarkDuplicatesCodec();
diskCodec = new ReadEndsForMarkDuplicatesCodec();
}
this.pairSort = SortingCollection.newInstance(ReadEndsForMarkDuplicates.class,
pairCodec,
new ReadEndsMDComparator(useBarcodes),
maxInMemory,
TMP_DIR);
this.fragSort = SortingCollection.newInstance(ReadEndsForMarkDuplicates.class,
fragCodec,
new ReadEndsMDComparator(useBarcodes),
maxInMemory,
TMP_DIR);
final SamHeaderAndIterator headerAndIterator = openInputs();
final SAMFileHeader.SortOrder assumedSortOrder = headerAndIterator.header.getSortOrder();
final SAMFileHeader header = headerAndIterator.header;
final ReadEndsForMarkDuplicatesMap tmp = new DiskBasedReadEndsForMarkDuplicatesMap(MAX_FILE_HANDLES_FOR_READ_ENDS_MAP, diskCodec);
long index = 0;
final ProgressLogger progress = new ProgressLogger(log, (int) 1e6, "Read");
final CloseableIterator<SAMRecord> iterator = headerAndIterator.iterator;
if (null == this.libraryIdGenerator) {
this.libraryIdGenerator = new LibraryIdGenerator(header);
}
String duplicateQueryName = null;
long duplicateIndex = NO_SUCH_INDEX;
while (iterator.hasNext()) {
final SAMRecord rec = iterator.next();
// This doesn't have anything to do with building sorted ReadEnd lists, but it can be done in the same pass
// over the input
if (PROGRAM_RECORD_ID != null) {
// Gather all PG IDs seen in merged input files in first pass. These are gathered for two reasons:
// - to know how many different PG records to create to represent this program invocation.
// - to know what PG IDs are already used to avoid collisions when creating new ones.
// Note that if there are one or more records that do not have a PG tag, then a null value
// will be stored in this set.
pgIdsSeen.add(rec.getStringAttribute(SAMTag.PG.name()));
}
// Of working in query-sorted, need to keep index of first record with any given query-name.
if(assumedSortOrder == SAMFileHeader.SortOrder.queryname && !rec.getReadName().equals(duplicateQueryName)) {
duplicateQueryName = rec.getReadName();
duplicateIndex = index;
}
if (rec.getReadUnmappedFlag()) {
if (rec.getReferenceIndex() == -1 && assumedSortOrder == SAMFileHeader.SortOrder.coordinate) {
// When we hit the unmapped reads with no coordinate, no reason to continue (only in coordinate sort).
break;
}
// If this read is unmapped but sorted with the mapped reads, just skip it.
} else if (!rec.isSecondaryOrSupplementary()) {
final long indexForRead = assumedSortOrder == SAMFileHeader.SortOrder.queryname ? duplicateIndex : index;
final ReadEndsForMarkDuplicates fragmentEnd = buildReadEnds(header, indexForRead, rec, useBarcodes);
this.fragSort.add(fragmentEnd);
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
final String key = rec.getAttribute(ReservedTagConstants.READ_GROUP_ID) + ":" + rec.getReadName();
ReadEndsForMarkDuplicates pairedEnds = tmp.remove(rec.getReferenceIndex(), key);
// See if we've already seen the first end or not
if (pairedEnds == null) {
// at this point pairedEnds and fragmentEnd are the same, but we need to make
// a copy since pairedEnds will be modified when the mate comes along.
pairedEnds = fragmentEnd.clone();
tmp.put(pairedEnds.read2ReferenceIndex, key, pairedEnds);
} else {
final int matesRefIndex = fragmentEnd.read1ReferenceIndex;
final int matesCoordinate = fragmentEnd.read1Coordinate;
// Set orientationForOpticalDuplicates, which always goes by the first then the second end for the strands. NB: must do this
// before updating the orientation later.
if (rec.getFirstOfPairFlag()) {
pairedEnds.orientationForOpticalDuplicates = ReadEnds.getOrientationByte(rec.getReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds.R);
if (useBarcodes)
((ReadEndsForMarkDuplicatesWithBarcodes) pairedEnds).readOneBarcode = getReadOneBarcodeValue(rec);
} else {
pairedEnds.orientationForOpticalDuplicates = ReadEnds.getOrientationByte(pairedEnds.orientation == ReadEnds.R, rec.getReadNegativeStrandFlag());
if (useBarcodes)
((ReadEndsForMarkDuplicatesWithBarcodes) pairedEnds).readTwoBarcode = getReadTwoBarcodeValue(rec);
}
// If the other read is actually later, simply add the other read's data as read2, else flip the reads
if (matesRefIndex > pairedEnds.read1ReferenceIndex ||
(matesRefIndex == pairedEnds.read1ReferenceIndex && matesCoordinate >= pairedEnds.read1Coordinate)) {
pairedEnds.read2ReferenceIndex = matesRefIndex;
pairedEnds.read2Coordinate = matesCoordinate;
pairedEnds.read2IndexInFile = indexForRead;
pairedEnds.orientation = ReadEnds.getOrientationByte(pairedEnds.orientation == ReadEnds.R,
rec.getReadNegativeStrandFlag());
// if the two read ends are in the same position, pointing in opposite directions,
// the orientation is undefined and the procedure above
// will depend on the order of the reads in the file.
// To avoid this, we set it explicitly (to FR):
if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex &&
pairedEnds.read2Coordinate == pairedEnds.read1Coordinate &&
pairedEnds.orientation == ReadEnds.RF) {
pairedEnds.orientation = ReadEnds.FR;
}
} else {
pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex;
pairedEnds.read2Coordinate = pairedEnds.read1Coordinate;
pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile;
pairedEnds.read1ReferenceIndex = matesRefIndex;
pairedEnds.read1Coordinate = matesCoordinate;
pairedEnds.read1IndexInFile = indexForRead;
pairedEnds.orientation = ReadEnds.getOrientationByte(rec.getReadNegativeStrandFlag(),
pairedEnds.orientation == ReadEnds.R);
}
pairedEnds.score += DuplicateScoringStrategy.computeDuplicateScore(rec, this.DUPLICATE_SCORING_STRATEGY);
this.pairSort.add(pairedEnds);
}
}
}
// Print out some stats every 1m reads
++index;
if (progress.record(rec)) {
log.info("Tracking " + tmp.size() + " as yet unmatched pairs. " + tmp.sizeInRam() + " records in RAM.");
}
}
log.info("Read " + index + " records. " + tmp.size() + " pairs never matched.");
iterator.close();
// Tell these collections to free up memory if possible.
this.pairSort.doneAdding();
this.fragSort.doneAdding();
}
/** Builds a read ends object that represents a single read. */
private ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, final long index, final SAMRecord rec, final boolean useBarcodes) {
final ReadEndsForMarkDuplicates ends;
if (useBarcodes) {
ends = new ReadEndsForMarkDuplicatesWithBarcodes();
} else {
ends = new ReadEndsForMarkDuplicates();
}
ends.read1ReferenceIndex = rec.getReferenceIndex();
ends.read1Coordinate = rec.getReadNegativeStrandFlag() ? rec.getUnclippedEnd() : rec.getUnclippedStart();
ends.orientation = rec.getReadNegativeStrandFlag() ? ReadEnds.R : ReadEnds.F;
ends.read1IndexInFile = index;
ends.score = DuplicateScoringStrategy.computeDuplicateScore(rec, this.DUPLICATE_SCORING_STRATEGY);
// Doing this lets the ends object know that it's part of a pair
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
ends.read2ReferenceIndex = rec.getMateReferenceIndex();
}
// Fill in the library ID
ends.libraryId = libraryIdGenerator.getLibraryId(rec);
// Fill in the location information for optical duplicates
if (this.opticalDuplicateFinder.addLocationInformation(rec.getReadName(), ends)) {
// calculate the RG number (nth in list)
ends.readGroup = 0;
final String rg = (String) rec.getAttribute("RG");
final List<SAMReadGroupRecord> readGroups = header.getReadGroups();
if (rg != null && readGroups != null) {
for (final SAMReadGroupRecord readGroup : readGroups) {
if (readGroup.getReadGroupId().equals(rg)) break;
else ends.readGroup++;
}
}
}
if (useBarcodes) {
final ReadEndsForMarkDuplicatesWithBarcodes endsWithBarcode = (ReadEndsForMarkDuplicatesWithBarcodes) ends;
endsWithBarcode.barcode = getBarcodeValue(rec);
if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) {
endsWithBarcode.readOneBarcode = getReadOneBarcodeValue(rec);
} else {
endsWithBarcode.readTwoBarcode = getReadTwoBarcodeValue(rec);
}
}
return ends;
}
/**
* Goes through the accumulated ReadEndsForMarkDuplicates objects and determines which of them are
* to be marked as duplicates.
*
* @return an array with an ordered list of indexes into the source file
*/
private void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates) {
int entryOverhead;
if (TAG_DUPLICATE_SET_MEMBERS) {
// Memory requirements for RepresentativeReadIndexer:
// three int entries + overhead: (3 * 4) + 4 = 16 bytes
entryOverhead = 16;
}
else {
entryOverhead = SortingLongCollection.SIZEOF;
}
// Keep this number from getting too large even if there is a huge heap.
int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / entryOverhead, (double) (Integer.MAX_VALUE - 5));
// If we're also tracking optical duplicates, reduce maxInMemory, since we'll need two sorting collections
if (indexOpticalDuplicates) {
maxInMemory /= ((entryOverhead + SortingLongCollection.SIZEOF) / entryOverhead);
this.opticalDuplicateIndexes = new SortingLongCollection(maxInMemory, TMP_DIR.toArray(new File[TMP_DIR.size()]));
}
log.info("Will retain up to " + maxInMemory + " duplicate indices before spilling to disk.");
this.duplicateIndexes = new SortingLongCollection(maxInMemory, TMP_DIR.toArray(new File[TMP_DIR.size()]));
if (TAG_DUPLICATE_SET_MEMBERS) {
final RepresentativeReadIndexerCodec representativeIndexCodec = new RepresentativeReadIndexerCodec();
this.representativeReadIndicesForDuplicates = SortingCollection.newInstance(RepresentativeReadIndexer.class,
representativeIndexCodec,
new RepresentativeReadComparator(),
maxInMemory,
TMP_DIR);
}
ReadEndsForMarkDuplicates firstOfNextChunk = null;
final List nextChunk = new ArrayList<ReadEndsForMarkDuplicates>(200);
// First just do the pairs
log.info("Traversing read pair information and detecting duplicates.");
for (final ReadEndsForMarkDuplicates next : this.pairSort) {
if (firstOfNextChunk != null && areComparableForDuplicates(firstOfNextChunk, next, true, useBarcodes)) {
nextChunk.add(next);
} else {
if (nextChunk.size() > 1) {
markDuplicatePairs(nextChunk);
if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk);
}
nextChunk.clear();
nextChunk.add(next);
firstOfNextChunk = next;
}
}
if (nextChunk.size() > 1) {
markDuplicatePairs(nextChunk);
if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk);
}
this.pairSort.cleanup();
this.pairSort = null;
// Now deal with the fragments
log.info("Traversing fragment information and detecting duplicates.");
boolean containsPairs = false;
boolean containsFrags = false;
firstOfNextChunk = null;
for (final ReadEndsForMarkDuplicates next : this.fragSort) {
if (firstOfNextChunk != null && areComparableForDuplicates(firstOfNextChunk, next, false, useBarcodes)) {
nextChunk.add(next);
containsPairs = containsPairs || next.isPaired();
containsFrags = containsFrags || !next.isPaired();
} else {
if (nextChunk.size() > 1 && containsFrags) {
markDuplicateFragments(nextChunk, containsPairs);
}
nextChunk.clear();
nextChunk.add(next);
firstOfNextChunk = next;
containsPairs = next.isPaired();
containsFrags = !next.isPaired();
}
}
markDuplicateFragments(nextChunk, containsPairs);
this.fragSort.cleanup();
this.fragSort = null;
log.info("Sorting list of duplicate records.");
this.duplicateIndexes.doneAddingStartIteration();
if (this.opticalDuplicateIndexes != null) this.opticalDuplicateIndexes.doneAddingStartIteration();
if (TAG_DUPLICATE_SET_MEMBERS) this.representativeReadIndicesForDuplicates.doneAdding();
}
private boolean areComparableForDuplicates(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean compareRead2, final boolean useBarcodes) {
boolean areComparable = lhs.libraryId == rhs.libraryId;
if (useBarcodes && areComparable) { // areComparable is useful here to avoid the casts below
final ReadEndsForMarkDuplicatesWithBarcodes lhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) lhs;
final ReadEndsForMarkDuplicatesWithBarcodes rhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) rhs;
areComparable = lhsWithBarcodes.barcode == rhsWithBarcodes.barcode &&
lhsWithBarcodes.readOneBarcode == rhsWithBarcodes.readOneBarcode &&
lhsWithBarcodes.readTwoBarcode == rhsWithBarcodes.readTwoBarcode;
}
if (areComparable) {
areComparable = lhs.read1ReferenceIndex == rhs.read1ReferenceIndex &&
lhs.read1Coordinate == rhs.read1Coordinate &&
lhs.orientation == rhs.orientation;
}
if (areComparable && compareRead2) {
areComparable = lhs.read2ReferenceIndex == rhs.read2ReferenceIndex &&
lhs.read2Coordinate == rhs.read2Coordinate;
}
return areComparable;
}
private void addIndexAsDuplicate(final long bamIndex) {
this.duplicateIndexes.add(bamIndex);
++this.numDuplicateIndices;
}
private void addRepresentativeReadOfDuplicateSet(final long representativeReadIndexInFile, final int setSize, final long read1IndexInFile) {
final RepresentativeReadIndexer rri = new RepresentativeReadIndexer();
rri.representativeReadIndexInFile = (int) representativeReadIndexInFile;
rri.setSize = setSize;
rri.readIndexInFile = (int) read1IndexInFile;
this.representativeReadIndicesForDuplicates.add(rri);
}
/**
* Takes a list of ReadEndsForMarkDuplicates objects and identify the representative read based on
* quality score. For all members of the duplicate set, add the read1 index-in-file of the representative
* read to the records of the first and second in a pair. This value becomes is used for
* the 'DI' tag.
*
* @param list
*/
private void addRepresentativeReadIndex(final List<ReadEndsForMarkDuplicates> list) {
short maxScore = 0;
ReadEndsForMarkDuplicates best = null;
/** All read ends should have orientation FF, FR, RF, or RR **/
for (final ReadEndsForMarkDuplicates end : list) {
if (end.score > maxScore || best == null) {
maxScore = end.score;
best = end;
}
}
// for read name (for representative read name), add the last of the pair that was examined
for (final ReadEndsForMarkDuplicates end : list) {
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read1IndexInFile);
addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile);
}
}
/**
* Takes a list of ReadEndsForMarkDuplicates objects and removes from it all objects that should
* not be marked as duplicates. This assumes that the list contains objects representing pairs.
*
* @param list
*/
private void markDuplicatePairs(final List<ReadEndsForMarkDuplicates> list) {
short maxScore = 0;
ReadEndsForMarkDuplicates best = null;
/** All read ends should have orientation FF, FR, RF, or RR **/
for (final ReadEndsForMarkDuplicates end : list) {
if (end.score > maxScore || best == null) {
maxScore = end.score;
best = end;
}
}
if (this.READ_NAME_REGEX != null) {
AbstractMarkDuplicatesCommandLineProgram.trackOpticalDuplicates(list, best, opticalDuplicateFinder, libraryIdGenerator);
}
for (final ReadEndsForMarkDuplicates end : list) {
if (end != best) {
addIndexAsDuplicate(end.read1IndexInFile);
// in query-sorted case, these will be the same.
// TODO: also in coordinate sorted, when one read is unmapped
if(end.read2IndexInFile != end.read1IndexInFile) addIndexAsDuplicate(end.read2IndexInFile);
if (end.isOpticalDuplicate && this.opticalDuplicateIndexes != null) {
this.opticalDuplicateIndexes.add(end.read1IndexInFile);
this.opticalDuplicateIndexes.add(end.read2IndexInFile);
}
}
}
}
/**
* Takes a list of ReadEndsForMarkDuplicates objects and removes from it all objects that should
* not be marked as duplicates. This will set the duplicate index for only list items are fragments.
*
* @param list
* @param containsPairs true if the list also contains objects containing pairs, false otherwise.
*/
private void markDuplicateFragments(final List<ReadEndsForMarkDuplicates> list, final boolean containsPairs) {
if (containsPairs) {
for (final ReadEndsForMarkDuplicates end : list) {
if (!end.isPaired()) addIndexAsDuplicate(end.read1IndexInFile);
}
} else {
short maxScore = 0;
ReadEndsForMarkDuplicates best = null;
for (final ReadEndsForMarkDuplicates end : list) {
if (end.score > maxScore || best == null) {
maxScore = end.score;
best = end;
}
}
for (final ReadEndsForMarkDuplicates end : list) {
if (end != best) {
addIndexAsDuplicate(end.read1IndexInFile);
}
}
}
}
// To avoid overflows or underflows when subtracting two large (positive and negative) numbers
static int compareInteger(final int x, final int y) {
return (x < y) ? -1 : ((x == y) ? 0 : 1);
}
/** Comparator for ReadEndsForMarkDuplicates that orders by read1 position then pair orientation then read2 position. */
static class ReadEndsMDComparator implements Comparator<ReadEndsForMarkDuplicates> {
final boolean useBarcodes;
public ReadEndsMDComparator(final boolean useBarcodes) {
this.useBarcodes = useBarcodes;
}
public int compare(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs) {
int compareDifference = lhs.libraryId - rhs.libraryId;
if (useBarcodes) {
final ReadEndsForMarkDuplicatesWithBarcodes lhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) lhs;
final ReadEndsForMarkDuplicatesWithBarcodes rhsWithBarcodes = (ReadEndsForMarkDuplicatesWithBarcodes) rhs;
if (compareDifference == 0) compareDifference = compareInteger(lhsWithBarcodes.barcode, rhsWithBarcodes.barcode);
if (compareDifference == 0) compareDifference = compareInteger(lhsWithBarcodes.readOneBarcode, rhsWithBarcodes.readOneBarcode);
if (compareDifference == 0) compareDifference = compareInteger(lhsWithBarcodes.readTwoBarcode, rhsWithBarcodes.readTwoBarcode);
}
if (compareDifference == 0) compareDifference = lhs.read1ReferenceIndex - rhs.read1ReferenceIndex;
if (compareDifference == 0) compareDifference = lhs.read1Coordinate - rhs.read1Coordinate;
if (compareDifference == 0) compareDifference = lhs.orientation - rhs.orientation;
if (compareDifference == 0) compareDifference = lhs.read2ReferenceIndex - rhs.read2ReferenceIndex;
if (compareDifference == 0) compareDifference = lhs.read2Coordinate - rhs.read2Coordinate;
if (compareDifference == 0) compareDifference = (int) (lhs.read1IndexInFile - rhs.read1IndexInFile);
if (compareDifference == 0) compareDifference = (int) (lhs.read2IndexInFile - rhs.read2IndexInFile);
return compareDifference;
}
}
// order representative read entries based on the record index
static class RepresentativeReadComparator implements Comparator<RepresentativeReadIndexer> {
public RepresentativeReadComparator() {}
public int compare(final RepresentativeReadIndexer lhs, final RepresentativeReadIndexer rhs) {
int compareDifference = lhs.readIndexInFile - rhs.readIndexInFile;
return compareDifference;
}
}
}