/*
* The MIT License
*
* Copyright (c) 2014 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam.markduplicates.util;
import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy;
import htsjdk.samtools.MergingSamRecordIterator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMProgramRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamFileHeaderMerger;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.Histogram;
import picard.PicardException;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.sam.DuplicationMetrics;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* Abstract class that holds parameters and methods common to classes that perform duplicate
* detection and/or marking within SAM/BAM files.
*
* @author Nils Homer
*/
public abstract class AbstractMarkDuplicatesCommandLineProgram extends AbstractOpticalDuplicateFinderCommandLineProgram {
@Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
doc = "One or more input SAM or BAM files to analyze. Must be coordinate sorted.")
public List<String> INPUT;
@Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
doc = "The output file to write marked records to")
public File OUTPUT;
@Option(shortName = "M",
doc = "File to write duplication metrics to")
public File METRICS_FILE;
@Option(doc = "If true do not write duplicates to the output file instead of writing them with appropriate flags set.")
public boolean REMOVE_DUPLICATES = false;
@Deprecated
@Option(shortName = StandardOptionDefinitions.ASSUME_SORTED_SHORT_NAME,
doc = "If true, assume that the input file is coordinate sorted even if the header says otherwise. " +
"Deprecated, used ASSUME_SORT_ORDER=coordinate instead.", mutex = {"ASSUME_SORT_ORDER"})
public boolean ASSUME_SORTED = false;
@Option(shortName = StandardOptionDefinitions.ASSUME_SORT_ORDER_SHORT_NAME,
doc = "If not null, assume that the input file has this order even if the header says otherwise.",
optional = true, mutex = {"ASSUME_SORTED"})
public SAMFileHeader.SortOrder ASSUME_SORT_ORDER = null;
@Option(shortName = "DS", doc = "The scoring strategy for choosing the non-duplicate among candidates.")
public ScoringStrategy DUPLICATE_SCORING_STRATEGY = ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH;
@Option(shortName = StandardOptionDefinitions.PROGRAM_RECORD_ID_SHORT_NAME,
doc = "The program record ID for the @PG record(s) created by this program. Set to null to disable " +
"PG record creation. This string may have a suffix appended to avoid collision with other " +
"program record IDs.",
optional = true)
public String PROGRAM_RECORD_ID = "MarkDuplicates";
@Option(shortName = "PG_VERSION",
doc = "Value of VN tag of PG record to be created. If not specified, the version will be detected automatically.",
optional = true)
public String PROGRAM_GROUP_VERSION;
@Option(shortName = "PG_COMMAND",
doc = "Value of CL tag of PG record to be created. If not supplied the command line will be detected automatically.",
optional = true)
public String PROGRAM_GROUP_COMMAND_LINE;
@Option(shortName = "PG_NAME",
doc = "Value of PN tag of PG record to be created.")
public String PROGRAM_GROUP_NAME = getClass().getSimpleName();
@Option(shortName = "CO",
doc = "Comment(s) to include in the output file's header.",
optional = true)
public List<String> COMMENT = new ArrayList<>();
/** The program groups that have been seen during the course of examining the input records. */
protected final Set<String> pgIdsSeen = new HashSet<>();
/**
* We have to re-chain the program groups based on this algorithm. This returns the map from existing program group ID
* to new program group ID.
*/
protected Map<String, String> getChainedPgIds(final SAMFileHeader outputHeader) {
final Map<String, String> chainedPgIds;
// Generate new PG record(s)
if (PROGRAM_RECORD_ID != null) {
final SAMFileHeader.PgIdGenerator pgIdGenerator = new SAMFileHeader.PgIdGenerator(outputHeader);
if (PROGRAM_GROUP_VERSION == null) {
PROGRAM_GROUP_VERSION = this.getVersion();
}
if (PROGRAM_GROUP_COMMAND_LINE == null) {
PROGRAM_GROUP_COMMAND_LINE = this.getCommandLine();
}
chainedPgIds = new HashMap<>();
for (final String existingId : this.pgIdsSeen) {
final String newPgId = pgIdGenerator.getNonCollidingId(PROGRAM_RECORD_ID);
chainedPgIds.put(existingId, newPgId);
final SAMProgramRecord programRecord = new SAMProgramRecord(newPgId);
programRecord.setProgramVersion(PROGRAM_GROUP_VERSION);
programRecord.setCommandLine(PROGRAM_GROUP_COMMAND_LINE);
programRecord.setProgramName(PROGRAM_GROUP_NAME);
programRecord.setPreviousProgramGroupId(existingId);
outputHeader.addProgramRecord(programRecord);
}
} else {
chainedPgIds = null;
}
return chainedPgIds;
}
/**
* Writes the metrics given by the libraryIdGenerator to the METRICS_FILE.
*
* @param libraryIdGenerator
*/
protected void finalizeAndWriteMetrics(final LibraryIdGenerator libraryIdGenerator) {
final Map<String, DuplicationMetrics> metricsByLibrary = libraryIdGenerator.getMetricsByLibraryMap();
final Histogram<Short> opticalDuplicatesByLibraryId = libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap();
final Map<String, Short> libraryIds = libraryIdGenerator.getLibraryIdsMap();
// Write out the metrics
final MetricsFile<DuplicationMetrics, Double> file = getMetricsFile();
for (final Map.Entry<String, DuplicationMetrics> entry : metricsByLibrary.entrySet()) {
final String libraryName = entry.getKey();
final DuplicationMetrics metrics = entry.getValue();
metrics.READ_PAIRS_EXAMINED = metrics.READ_PAIRS_EXAMINED / 2;
metrics.READ_PAIR_DUPLICATES = metrics.READ_PAIR_DUPLICATES / 2;
// Add the optical dupes to the metrics
final Short libraryId = libraryIds.get(libraryName);
if (libraryId != null) {
final Histogram.Bin<Short> bin = opticalDuplicatesByLibraryId.get(libraryId);
if (bin != null) {
metrics.READ_PAIR_OPTICAL_DUPLICATES = (long) bin.getValue();
}
}
metrics.calculateDerivedFields();
file.addMetric(metrics);
}
if (metricsByLibrary.size() == 1) {
file.setHistogram(metricsByLibrary.values().iterator().next().calculateRoiHistogram());
}
file.write(METRICS_FILE);
}
/** Little class used to package up a header and an iterable/iterator. */
public static final class SamHeaderAndIterator {
public final SAMFileHeader header;
public final CloseableIterator<SAMRecord> iterator;
public SamHeaderAndIterator(final SAMFileHeader header, final CloseableIterator<SAMRecord> iterator) {
this.header = header;
this.iterator = iterator;
}
}
/**
* Since this may read its inputs more than once this method does all the opening
* and checking of the inputs.
*/
protected SamHeaderAndIterator openInputs() {
final List<SAMFileHeader> headers = new ArrayList<>(INPUT.size());
final List<SamReader> readers = new ArrayList<>(INPUT.size());
for (final String input : INPUT) {
SamReader reader = SamReaderFactory.makeDefault()
.enable(SamReaderFactory.Option.EAGERLY_DECODE)
.open(SamInputResource.of(input));
final SAMFileHeader header = reader.getFileHeader();
headers.add(header);
readers.add(reader);
}
if (ASSUME_SORT_ORDER != null || ASSUME_SORTED) {
if (ASSUME_SORT_ORDER == null) {
ASSUME_SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
ASSUME_SORTED = false; // to maintain the "mutex" regarding these two arguments.
}
//if we assume a particular order, then the output will have that order in the header
headers.get(0).setSortOrder(ASSUME_SORT_ORDER);
}
if (headers.size() == 1) {
return new SamHeaderAndIterator(headers.get(0), readers.get(0).iterator());
} else {
final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(headers.get(0).getSortOrder(), headers, false);
final MergingSamRecordIterator iterator = new MergingSamRecordIterator(headerMerger, readers, ASSUME_SORT_ORDER != null);
return new SamHeaderAndIterator(headerMerger.getMergedHeader(), iterator);
}
}
/**
* Looks through the set of reads and identifies how many of the duplicates are
* in fact optical duplicates, and stores the data in the instance level histogram.
* Additionally sets the transient isOpticalDuplicate flag on each read end that is
* identified as an optical duplicate.
*/
public static void trackOpticalDuplicates(List<? extends ReadEnds> ends,
final ReadEnds keeper,
final OpticalDuplicateFinder opticalDuplicateFinder,
final LibraryIdGenerator libraryIdGenerator) {
boolean hasFR = false, hasRF = false;
// Check to see if we have a mixture of FR/RF
for (final ReadEnds end : ends) {
if (ReadEnds.FR == end.orientationForOpticalDuplicates) {
hasFR = true;
} else if (ReadEnds.RF == end.orientationForOpticalDuplicates) {
hasRF = true;
}
}
// Check if we need to partition since the orientations could have changed
if (hasFR && hasRF) { // need to track them independently
// Variables used for optical duplicate detection and tracking
final List<ReadEnds> trackOpticalDuplicatesF = new ArrayList<>();
final List<ReadEnds> trackOpticalDuplicatesR = new ArrayList<>();
// Split into two lists: first of pairs and second of pairs, since they must have orientation and same starting end
for (final ReadEnds end : ends) {
if (ReadEnds.FR == end.orientationForOpticalDuplicates) {
trackOpticalDuplicatesF.add(end);
} else if (ReadEnds.RF == end.orientationForOpticalDuplicates) {
trackOpticalDuplicatesR.add(end);
} else {
throw new PicardException("Found an unexpected orientation: " + end.orientation);
}
}
// track the duplicates
trackOpticalDuplicates(trackOpticalDuplicatesF, keeper, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap());
trackOpticalDuplicates(trackOpticalDuplicatesR, keeper, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap());
} else { // No need to partition
AbstractMarkDuplicatesCommandLineProgram.trackOpticalDuplicates(ends, keeper, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap());
}
}
/**
* Looks through the set of reads and identifies how many of the duplicates are
* in fact optical duplicates, and stores the data in the instance level histogram.
*
* We expect only reads with FR or RF orientations, not a mixture of both.
*
* In PCR duplicate detection, a duplicates can be a have FR and RF when fixing the orientation order to the first end of the mate. In
* optical duplicate detection, we do not consider them duplicates if one read as FR and the other RF when we order orientation by the
* first mate sequenced (read #1 of the pair).
*/
private static void trackOpticalDuplicates(final List<? extends ReadEnds> list,
final ReadEnds keeper,
final OpticalDuplicateFinder opticalDuplicateFinder,
final Histogram<Short> opticalDuplicatesByLibraryId) {
final boolean[] opticalDuplicateFlags = opticalDuplicateFinder.findOpticalDuplicates(list, keeper);
int opticalDuplicates = 0;
for (int i=0; i<opticalDuplicateFlags.length; ++i) {
if (opticalDuplicateFlags[i]) {
++opticalDuplicates;
list.get(i).isOpticalDuplicate = true;
}
}
if (opticalDuplicates > 0) {
opticalDuplicatesByLibraryId.increment(list.get(0).getLibraryId(), opticalDuplicates);
}
}
}