/* * The MIT License * * Copyright (c) 2014 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.sam.markduplicates.util; import picard.PicardException; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.MergingSamRecordIterator; import htsjdk.samtools.SamFileHeaderMerger; import htsjdk.samtools.util.Histogram; import htsjdk.samtools.*; import htsjdk.samtools.util.CloseableIterator; import picard.cmdline.programgroups.SamOrBam; import picard.sam.DuplicationMetrics; import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy; import java.io.File; import java.util.*; /** * Abstract class that holds parameters and methods common to classes that perform duplicate * detection and/or marking within SAM/BAM files. 
 *
 * @author Nils Homer
 */
public abstract class AbstractMarkDuplicatesCommandLineProgram extends AbstractOpticalDuplicateFinderCommandLineProgram {

    @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
            doc = "One or more input SAM or BAM files to analyze. Must be coordinate sorted.")
    public List<File> INPUT;

    @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME,
            doc = "The output file to write marked records to")
    public File OUTPUT;

    @Option(shortName = "M",
            doc = "File to write duplication metrics to")
    public File METRICS_FILE;

    @Option(shortName = StandardOptionDefinitions.PROGRAM_RECORD_ID_SHORT_NAME,
            doc = "The program record ID for the @PG record(s) created by this program. Set to null to disable " +
                    "PG record creation.  This string may have a suffix appended to avoid collision with other " +
                    "program record IDs.",
            optional = true)
    public String PROGRAM_RECORD_ID = "MarkDuplicates";

    @Option(shortName = "PG_VERSION",
            doc = "Value of VN tag of PG record to be created. If not specified, the version will be detected automatically.",
            optional = true)
    public String PROGRAM_GROUP_VERSION;

    @Option(shortName = "PG_COMMAND",
            doc = "Value of CL tag of PG record to be created. If not supplied the command line will be detected automatically.",
            optional = true)
    public String PROGRAM_GROUP_COMMAND_LINE;

    @Option(shortName = "PG_NAME",
            doc = "Value of PN tag of PG record to be created.")
    public String PROGRAM_GROUP_NAME = getClass().getSimpleName();

    @Option(shortName = "CO",
            doc = "Comment(s) to include in the output file's header.",
            optional = true)
    public List<String> COMMENT = new ArrayList<String>();

    // NOTE(review): the help text below reads awkwardly ("do not write ... instead of writing");
    // the behavior is: true => drop duplicate records from OUTPUT, false => write them with the
    // duplicate flag set. The string is user-visible CLI help, so it is left unchanged here.
    @Option(doc = "If true do not write duplicates to the output file instead of writing them with appropriate flags set.")
    public boolean REMOVE_DUPLICATES = false;

    @Option(shortName = StandardOptionDefinitions.ASSUME_SORTED_SHORT_NAME,
            doc = "If true, assume that the input file is coordinate sorted even if the header says otherwise.")
    public boolean ASSUME_SORTED = false;

    @Option(shortName = "DS",
            doc = "The scoring strategy for choosing the non-duplicate among candidates.")
    public ScoringStrategy DUPLICATE_SCORING_STRATEGY = ScoringStrategy.TOTAL_MAPPED_REFERENCE_LENGTH;

    /** The program groups that have been seen during the course of examining the input records. */
    protected final Set<String> pgIdsSeen = new HashSet<String>();

    /**
     * We have to re-chain the program groups based on this algorithm.  This returns the map from existing program group ID
     * to new program group ID.
     *
     * <p>For each PG ID previously observed in the inputs (accumulated in {@link #pgIdsSeen}), a new
     * non-colliding PG record is created in {@code outputHeader} whose PP (previous program group)
     * points back at the existing ID, chaining this program's record onto each input's PG history.
     * Also lazily fills in {@link #PROGRAM_GROUP_VERSION} and {@link #PROGRAM_GROUP_COMMAND_LINE}
     * when the user did not supply them.
     *
     * @param outputHeader the header that will receive the newly created PG record(s); mutated in place
     * @return map from existing program group ID to the newly created program group ID, or
     *         {@code null} when {@link #PROGRAM_RECORD_ID} is null (PG record creation disabled)
     */
    protected Map<String, String> getChainedPgIds(final SAMFileHeader outputHeader) {
        final Map<String, String> chainedPgIds;
        // Generate new PG record(s)
        if (PROGRAM_RECORD_ID != null) {
            final PgIdGenerator pgIdGenerator = new PgIdGenerator(outputHeader);
            if (PROGRAM_GROUP_VERSION == null) {
                PROGRAM_GROUP_VERSION = this.getVersion();
            }
            if (PROGRAM_GROUP_COMMAND_LINE == null) {
                PROGRAM_GROUP_COMMAND_LINE = this.getCommandLine();
            }
            chainedPgIds = new HashMap<String, String>();
            for (final String existingId : this.pgIdsSeen) {
                final String newPgId = pgIdGenerator.getNonCollidingId(PROGRAM_RECORD_ID);
                chainedPgIds.put(existingId, newPgId);
                final SAMProgramRecord programRecord = new SAMProgramRecord(newPgId);
                programRecord.setProgramVersion(PROGRAM_GROUP_VERSION);
                programRecord.setCommandLine(PROGRAM_GROUP_COMMAND_LINE);
                programRecord.setProgramName(PROGRAM_GROUP_NAME);
                // PP tag: chain the new record onto the existing program group
                programRecord.setPreviousProgramGroupId(existingId);
                outputHeader.addProgramRecord(programRecord);
            }
        } else {
            chainedPgIds = null;
        }
        return chainedPgIds;
    }

    /**
     * Writes the metrics given by the libraryIdGenerator to the METRICS_FILE.
     *
     * <p>Mutates each library's {@link DuplicationMetrics} (halves the pair counts, fills in
     * optical-duplicate counts, computes derived metrics) before adding it to the output file.
     * When there is exactly one library, the ROI histogram is attached to the metrics file as well.
     *
     * @param libraryIdGenerator source of the per-library metrics, library-name-to-ID map, and
     *                           the per-library optical-duplicate histogram
     */
    protected void finalizeAndWriteMetrics(final LibraryIdGenerator libraryIdGenerator) {
        final Map<String, DuplicationMetrics> metricsByLibrary = libraryIdGenerator.getMetricsByLibraryMap();
        final Histogram<Short> opticalDuplicatesByLibraryId = libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap();
        final Map<String, Short> libraryIds = libraryIdGenerator.getLibraryIdsMap();

        // Write out the metrics
        final MetricsFile<DuplicationMetrics, Double> file = getMetricsFile();
        for (final Map.Entry<String, DuplicationMetrics> entry : metricsByLibrary.entrySet()) {
            final String libraryName = entry.getKey();
            final DuplicationMetrics metrics = entry.getValue();

            // NOTE(review): halving presumably converts end-counts into pair-counts (each pair
            // contributes two ends upstream) — confirm against the code that increments these.
            metrics.READ_PAIRS_EXAMINED = metrics.READ_PAIRS_EXAMINED / 2;
            metrics.READ_PAIR_DUPLICATES = metrics.READ_PAIR_DUPLICATES / 2;

            // Add the optical dupes to the metrics
            final Short libraryId = libraryIds.get(libraryName);
            if (libraryId != null) {
                final Histogram<Short>.Bin bin = opticalDuplicatesByLibraryId.get(libraryId);
                if (bin != null) {
                    metrics.READ_PAIR_OPTICAL_DUPLICATES = (long) bin.getValue();
                }
            }
            metrics.calculateDerivedMetrics();
            file.addMetric(metrics);
        }

        // Histograms are only meaningful for a single library; skip when inputs span libraries.
        if (metricsByLibrary.size() == 1) {
            file.setHistogram(metricsByLibrary.values().iterator().next().calculateRoiHistogram());
        }

        file.write(METRICS_FILE);
    }

    /** Little class to generate program group IDs */
    static class PgIdGenerator {
        // Monotonically increasing counter seeded with the number of pre-existing PG IDs;
        // used to derive collision-avoiding suffixes.
        private int recordCounter;

        // Every PG ID already present in the header, plus every ID handed out by this generator.
        private final Set<String> idsThatAreAlreadyTaken = new HashSet<String>();

        PgIdGenerator(final SAMFileHeader header) {
            for (final SAMProgramRecord pgRecord : header.getProgramRecords()) {
                idsThatAreAlreadyTaken.add(pgRecord.getProgramGroupId());
            }
            recordCounter = idsThatAreAlreadyTaken.size();
        }

        /**
         * Returns {@code recordId} itself if it is not yet taken, otherwise the first
         * {@code recordId + "." + <base36 suffix>} that does not collide with a known ID.
         */
        String getNonCollidingId(final String recordId) {
            if (!idsThatAreAlreadyTaken.contains(recordId)) {
                // don't remap 1st record. If there are more records
                // with this id, they will be remapped in the 'else'.
                idsThatAreAlreadyTaken.add(recordId);
                ++recordCounter;
                return recordId;
            } else {
                String newId;
                // Below we tack on one of roughly 1.7 million possible 4 digit base36 at random. We do this because
                // our old process of just counting from 0 upward and adding that to the previous id led to 1000s of
                // calls idsThatAreAlreadyTaken.contains() just to resolve 1 collision when merging 1000s of similarly
                // processed bams.
                while (idsThatAreAlreadyTaken.contains(newId = recordId + "." + SamFileHeaderMerger.positiveFourDigitBase36Str(recordCounter++)))
                    ; // empty body: the side effects in the condition do all the work
                idsThatAreAlreadyTaken.add(newId);
                return newId;
            }
        }
    }

    /** Little class used to package up a header and an iterable/iterator. */
    public static final class SamHeaderAndIterator {
        public final SAMFileHeader header;
        public final CloseableIterator<SAMRecord> iterator;

        public SamHeaderAndIterator(final SAMFileHeader header, final CloseableIterator<SAMRecord> iterator) {
            this.header = header;
            this.iterator = iterator;
        }
    }

    /**
     * Since this may read its inputs more than once this method does all the opening
     * and checking of the inputs.
     *
     * <p>Opens every file in {@link #INPUT}, verifies coordinate sort order (unless
     * {@link #ASSUME_SORTED}), and returns either the single reader's header/iterator or a
     * merged header with a {@link MergingSamRecordIterator} over all readers.
     *
     * @return the (possibly merged) header paired with an iterator over all input records
     * @throws PicardException if an input is not coordinate sorted and ASSUME_SORTED is false
     */
    protected SamHeaderAndIterator openInputs() {
        final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(INPUT.size());
        final List<SAMFileReader> readers = new ArrayList<SAMFileReader>(INPUT.size());

        // NOTE(review): readers opened before a sort-order failure are never closed here,
        // and no caller-visible close hook exists for the multi-input case — consider
        // closing already-opened readers before throwing.
        for (final File f : INPUT) {
            final SAMFileReader reader = new SAMFileReader(f, true); // eager decode
            final SAMFileHeader header = reader.getFileHeader();

            if (!ASSUME_SORTED && header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
                throw new PicardException("Input file " + f.getAbsolutePath() + " is not coordinate sorted.");
            }

            headers.add(header);
            readers.add(reader);
        }

        if (headers.size() == 1) {
            return new SamHeaderAndIterator(headers.get(0), readers.get(0).iterator());
        } else {
            final SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, false);
            final MergingSamRecordIterator iterator = new MergingSamRecordIterator(headerMerger, readers, ASSUME_SORTED);
            return new SamHeaderAndIterator(headerMerger.getMergedHeader(), iterator);
        }
    }

    /**
     * Looks through the set of reads and identifies how many of the duplicates are
     * in fact optical duplicates, and stores the data in the instance level histogram.
     *
     * <p>If the ends mix FR and RF orientations they are partitioned and tracked separately,
     * since optical-duplicate comparison is only meaningful within one orientation class.
     *
     * @param ends                   the candidate duplicate set to inspect
     * @param opticalDuplicateFinder finder used to flag optical duplicates within a set
     * @param libraryIdGenerator     owner of the per-library optical-duplicate histogram to update
     */
    public static void trackOpticalDuplicates(List<? extends ReadEnds> ends,
                                              final OpticalDuplicateFinder opticalDuplicateFinder,
                                              final LibraryIdGenerator libraryIdGenerator) {
        boolean hasFR = false, hasRF = false;

        // Check to see if we have a mixture of FR/RF
        for (final ReadEnds end : ends) {
            if (ReadEnds.FR == end.orientationForOpticalDuplicates) {
                hasFR = true;
            } else if (ReadEnds.RF == end.orientationForOpticalDuplicates) {
                hasRF = true;
            }
        }

        // Check if we need to partition since the orientations could have changed
        if (hasFR && hasRF) { // need to track them independently
            // Variables used for optical duplicate detection and tracking
            final List<ReadEnds> trackOpticalDuplicatesF = new ArrayList<ReadEnds>();
            final List<ReadEnds> trackOpticalDuplicatesR = new ArrayList<ReadEnds>();

            // Split into two lists: first of pairs and second of pairs, since they must have orientation and same starting end
            for (final ReadEnds end : ends) {
                if (ReadEnds.FR == end.orientationForOpticalDuplicates) {
                    trackOpticalDuplicatesF.add(end);
                } else if (ReadEnds.RF == end.orientationForOpticalDuplicates) {
                    trackOpticalDuplicatesR.add(end);
                } else {
                    // NOTE(review): the check above is on orientationForOpticalDuplicates but the
                    // message prints end.orientation — confirm which field is intended in the message.
                    throw new PicardException("Found an unexpected orientation: " + end.orientation);
                }
            }

            // track the duplicates
            trackOpticalDuplicates(trackOpticalDuplicatesF, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap());
            trackOpticalDuplicates(trackOpticalDuplicatesR, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap());
        } else { // No need to partition
            AbstractMarkDuplicatesCommandLineProgram.trackOpticalDuplicates(ends, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap());
        }
    }

    /**
     * Looks through the set of reads and identifies how many of the duplicates are
     * in fact optical duplicates, and stores the data in the instance level histogram.
     *
     * <p>Counts the flags returned by the finder and increments the histogram bin for the
     * library of the first element. Assumes all elements share one library ID — TODO confirm.
     */
    private static void trackOpticalDuplicates(final List<? extends OpticalDuplicateFinder.PhysicalLocation> list,
                                               final OpticalDuplicateFinder opticalDuplicateFinder,
                                               final Histogram<Short> opticalDuplicatesByLibraryId) {
        final boolean[] opticalDuplicateFlags = opticalDuplicateFinder.findOpticalDuplicates(list);

        int opticalDuplicates = 0;
        for (final boolean b : opticalDuplicateFlags) if (b) ++opticalDuplicates;
        if (opticalDuplicates > 0) {
            opticalDuplicatesByLibraryId.increment(list.get(0).getLibraryId(), opticalDuplicates);
        }
    }
}