/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.sam; import htsjdk.samtools.BAMRecordCodec; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileHeader.SortOrder; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMFileWriterFactory; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMRecordQueryNameComparator; import htsjdk.samtools.SAMTag; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.ValidationStringency; import htsjdk.samtools.filter.FilteringSamIterator; import htsjdk.samtools.filter.SamRecordFilter; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.FastqQualityFormat; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.PeekableIterator; import htsjdk.samtools.util.ProgressLogger; import htsjdk.samtools.util.QualityEncodingDetector; import htsjdk.samtools.util.SolexaQualityConverter; import htsjdk.samtools.util.SortingCollection; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.SamOrBam; import picard.util.TabbedTextFileWithHeaderParser; import java.io.File; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.text.DecimalFormat; import java.text.NumberFormat; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; /** * Reverts a SAM file by optionally restoring original quality scores and by removing * all alignment information. */ @CommandLineProgramProperties( usage = RevertSam.USAGE_SUMMARY + RevertSam.USAGE_DETAILS, usageShort = RevertSam.USAGE_SUMMARY, programGroup = SamOrBam.class ) public class RevertSam extends CommandLineProgram { static final String USAGE_SUMMARY ="Reverts SAM or BAM files to a previous state. "; static final String USAGE_DETAILS ="This tool removes or restores certain properties of the SAM records, including alignment " + "information, which can be used to produce an unmapped BAM (uBAM) from a previously aligned BAM. It is also capable of " + "restoring the original quality scores of a BAM file that has already undergone base quality score recalibration (BQSR) if the" + "original qualities were retained." + "<h4>Example with single output:</h4>" + "<pre>" + "java -jar picard.jar RevertSam \\<br />" + " I=input.bam \\<br />" + " O=reverted.bam" + "</pre>" + "Output format is BAM by default, or SAM or CRAM if the input path ends with '.sam' or '.cram', respectively." + "<h4>Example outputting by read group with output map:</h4>" + "<pre>" + "java -jar picard.jar RevertSam \\<br />" + " I=input.bam \\<br />" + " OUTPUT_BY_READGROUP=true \\<br />" + " OUTPUT_MAP=reverted_bam_paths.tsv" + "</pre>" + "Will output a BAM/SAM file per read group. By default, all outputs will be in BAM format. " + "However, a SAM file will be produced instead for any read group mapped in OUTPUT_MAP to a path ending with '.sam'. " + "A CRAM file will be produced for any read group mapped to a path ending with '.cram'. " + "<h4>Example outputting by read group without output map:</h4>" + "<pre>" + "java -jar picard.jar RevertSam \\<br />" + " I=input.bam \\<br />" + " OUTPUT_BY_READGROUP=true \\<br />" + " O=/write/reverted/read/group/bams/in/this/dir" + "</pre>" + "Will output a BAM/SAM file per read group. By default, all outputs will be in BAM format. " + "However, outputs will be in SAM format if the input path ends with '.sam', or CRAM format if it ends with '.cram'." + " This behaviour can be overriden with OUTPUT_BY_READGROUP_FILE_FORMAT option."+ "<hr />"; @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM/BAM file to revert the state of.") public File INPUT; @Option(mutex = {"OUTPUT_MAP"}, shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output SAM/BAM file to create, or an output directory if OUTPUT_BY_READGROUP is true.") public File OUTPUT; @Option(mutex = {"OUTPUT"}, shortName = "OM", doc = "Tab separated file with two columns, READ_GROUP_ID and OUTPUT, providing file mapping only used if OUTPUT_BY_READGROUP is true.") public File OUTPUT_MAP; @Option(shortName = "OBR", doc = "When true, outputs each read group in a separate file.") public boolean OUTPUT_BY_READGROUP = false; public static enum FileType {sam, bam, cram,dynamic} @Option(shortName = "OBRFF", doc = "When using OUTPUT_BY_READGROUP, the output file format can be set to a certain format." ) public FileType OUTPUT_BY_READGROUP_FILE_FORMAT=FileType.dynamic; @Option(shortName = "SO", doc = "The sort order to create the reverted output file with.") public SortOrder SORT_ORDER = SortOrder.queryname; @Option(shortName = StandardOptionDefinitions.USE_ORIGINAL_QUALITIES_SHORT_NAME, doc = "True to restore original qualities from the OQ field to the QUAL field if available.") public boolean RESTORE_ORIGINAL_QUALITIES = true; @Option(doc = "Remove duplicate read flags from all reads. Note that if this is true and REMOVE_ALIGNMENT_INFORMATION==false, " + " the output may have the unusual but sometimes desirable trait of having unmapped reads that are marked as duplicates.") public boolean REMOVE_DUPLICATE_INFORMATION = true; @Option(doc = "Remove all alignment information from the file.") public boolean REMOVE_ALIGNMENT_INFORMATION = true; @Option(doc = "When removing alignment information, the set of optional tags to remove.") public List<String> ATTRIBUTE_TO_CLEAR = new ArrayList<String>() {{ add(SAMTag.NM.name()); add(SAMTag.UQ.name()); add(SAMTag.PG.name()); add(SAMTag.MD.name()); add(SAMTag.MQ.name()); add(SAMTag.SA.name()); // Supplementary alignment metadata add(SAMTag.MC.name()); // Mate Cigar add(SAMTag.AS.name()); }}; @Option(doc = "WARNING: This option is potentially destructive. If enabled will discard reads in order to produce " + "a consistent output BAM. Reads discarded include (but are not limited to) paired reads with missing " + "mates, duplicated records, records with mismatches in length of bases and qualities. This option can " + "only be enabled if the output sort order is queryname and will always cause sorting to occur.") public boolean SANITIZE = false; @Option(doc = "If SANITIZE=true and higher than MAX_DISCARD_FRACTION reads are discarded due to sanitization then" + "the program will exit with an Exception instead of exiting cleanly. Output BAM will still be valid.") public double MAX_DISCARD_FRACTION = 0.01; @Option(doc = "The sample alias to use in the reverted output file. This will override the existing " + "sample alias in the file and is used only if all the read groups in the input file have the " + "same sample alias ", shortName = StandardOptionDefinitions.SAMPLE_ALIAS_SHORT_NAME, optional = true) public String SAMPLE_ALIAS; @Option(doc = "The library name to use in the reverted output file. This will override the existing " + "sample alias in the file and is used only if all the read groups in the input file have the " + "same library name ", shortName = StandardOptionDefinitions.LIBRARY_NAME_SHORT_NAME, optional = true) public String LIBRARY_NAME; private final static Log log = Log.getInstance(RevertSam.class); /** Default main method impl. */ public static void main(final String[] args) { new RevertSam().instanceMainWithExit(args); } /** * Enforce that output ordering is queryname when sanitization is turned on since it requires a queryname sort. */ @Override protected String[] customCommandLineValidation() { final List<String> errors = new ArrayList<>(); ValidationUtil.validateSanitizeSortOrder(SANITIZE, SORT_ORDER, errors); ValidationUtil.validateOutputParams(OUTPUT_BY_READGROUP, OUTPUT, OUTPUT_MAP, errors); if (!errors.isEmpty()) { return errors.toArray(new String[errors.size()]); } return null; } protected int doWork() { IOUtil.assertFileIsReadable(INPUT); ValidationUtil.assertWritable(OUTPUT, OUTPUT_BY_READGROUP); final boolean sanitizing = SANITIZE; final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).validationStringency(VALIDATION_STRINGENCY).open(INPUT); final SAMFileHeader inHeader = in.getFileHeader(); ValidationUtil.validateHeaderOverrides(inHeader, SAMPLE_ALIAS, LIBRARY_NAME); //////////////////////////////////////////////////////////////////////////// // Build the output writer with an appropriate header based on the options //////////////////////////////////////////////////////////////////////////// final boolean presorted = isPresorted(inHeader, SORT_ORDER, sanitizing); if (SAMPLE_ALIAS != null) overwriteSample(inHeader.getReadGroups(), SAMPLE_ALIAS); if (LIBRARY_NAME != null) overwriteLibrary(inHeader.getReadGroups(), LIBRARY_NAME); final SAMFileHeader singleOutHeader = createOutHeader(inHeader, SORT_ORDER, REMOVE_ALIGNMENT_INFORMATION); inHeader.getReadGroups().forEach(readGroup -> singleOutHeader.addReadGroup(readGroup)); final Map<String, File> outputMap; final Map<String, SAMFileHeader> headerMap; if (OUTPUT_BY_READGROUP) { if (inHeader.getReadGroups().isEmpty()) { throw new PicardException(INPUT + " does not contain Read Groups"); } final String defaultExtension; if (OUTPUT_BY_READGROUP_FILE_FORMAT==FileType.dynamic) { defaultExtension = getDefaultExtension(INPUT.toString()); } else { defaultExtension = "." + OUTPUT_BY_READGROUP_FILE_FORMAT.toString(); } outputMap = createOutputMap(OUTPUT_MAP, OUTPUT, defaultExtension, inHeader.getReadGroups()); ValidationUtil.assertAllReadGroupsMapped(outputMap, inHeader.getReadGroups()); headerMap = createHeaderMap(inHeader, SORT_ORDER, REMOVE_ALIGNMENT_INFORMATION); } else { outputMap = null; headerMap = null; } final SAMFileWriterFactory factory = new SAMFileWriterFactory(); final RevertSamWriter out = new RevertSamWriter(OUTPUT_BY_READGROUP, headerMap, outputMap, singleOutHeader, OUTPUT, presorted, factory, REFERENCE_SEQUENCE); //////////////////////////////////////////////////////////////////////////// // Build a sorting collection to use if we are sanitizing //////////////////////////////////////////////////////////////////////////// final RevertSamSorter sorter; if (sanitizing) sorter = new RevertSamSorter(OUTPUT_BY_READGROUP, headerMap, singleOutHeader, MAX_RECORDS_IN_RAM); else sorter = null; final ProgressLogger progress = new ProgressLogger(log, 1000000, "Reverted"); for (final SAMRecord rec : in) { // Weed out non-primary and supplemental read as we don't want duplicates in the reverted file! if (rec.isSecondaryOrSupplementary()) continue; // log the progress before you revert because otherwise the "last read position" might not be accurate progress.record(rec); // Actually do the reverting of the remaining records revertSamRecord(rec); if (sanitizing) sorter.add(rec); else out.addAlignment(rec); } //////////////////////////////////////////////////////////////////////////// // Now if we're sanitizing, clean up the records and write them to the output //////////////////////////////////////////////////////////////////////////// if (!sanitizing) { out.close(); } else { final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat; try { readGroupToFormat = createReadGroupFormatMap(inHeader, REFERENCE_SEQUENCE, VALIDATION_STRINGENCY, INPUT, RESTORE_ORIGINAL_QUALITIES); } catch (final PicardException e) { log.error(e.getMessage()); return -1; } final long[] sanitizeResults = sanitize(readGroupToFormat, sorter, out); final long discarded = sanitizeResults[0]; final long total = sanitizeResults[1]; out.close(); final double discardRate = discarded / (double) total; final NumberFormat fmt = new DecimalFormat("0.000%"); log.info("Discarded " + discarded + " out of " + total + " (" + fmt.format(discardRate) + ") reads in order to sanitize output."); if (discardRate > MAX_DISCARD_FRACTION) { throw new PicardException("Discarded " + fmt.format(discardRate) + " which is above MAX_DISCARD_FRACTION of " + fmt.format(MAX_DISCARD_FRACTION)); } } CloserUtil.close(in); return 0; } static String getDefaultExtension(final String input) { if (input.endsWith(".sam")) { return ".sam"; } if (input.endsWith(".cram")) { return ".cram"; } return ".bam"; } private boolean isPresorted(final SAMFileHeader inHeader, final SortOrder sortOrder, final boolean sanitizing) { return (inHeader.getSortOrder() == sortOrder) || (sortOrder == SortOrder.queryname && sanitizing); } /** * Takes an individual SAMRecord and applies the set of changes/reversions to it that * have been requested by program level options. */ public void revertSamRecord(final SAMRecord rec) { if (RESTORE_ORIGINAL_QUALITIES) { final byte[] oq = rec.getOriginalBaseQualities(); if (oq != null) { rec.setBaseQualities(oq); rec.setOriginalBaseQualities(null); } } if (REMOVE_DUPLICATE_INFORMATION) { rec.setDuplicateReadFlag(false); } if (REMOVE_ALIGNMENT_INFORMATION) { if (rec.getReadNegativeStrandFlag()) { rec.reverseComplement(true); rec.setReadNegativeStrandFlag(false); } // Remove all alignment based information about the read itself rec.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); rec.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); rec.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR); rec.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY); rec.setInferredInsertSize(0); rec.setNotPrimaryAlignmentFlag(false); rec.setProperPairFlag(false); rec.setReadUnmappedFlag(true); // Then remove any mate flags and info related to alignment rec.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START); rec.setMateNegativeStrandFlag(false); rec.setMateReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); rec.setMateUnmappedFlag(rec.getReadPairedFlag()); // And then remove any tags that are calculated from the alignment ATTRIBUTE_TO_CLEAR.forEach(tag -> rec.setAttribute(tag, null)); } } private long[] sanitize(final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat, final RevertSamSorter sorter, final RevertSamWriter out) { long total = 0, discarded = 0; final ProgressLogger sanitizerProgress = new ProgressLogger(log, 1000000, "Sanitized"); final List<PeekableIterator<SAMRecord>> iterators = sorter.iterators(); for (final PeekableIterator<SAMRecord> iterator : iterators) { readNameLoop: while (iterator.hasNext()) { final List<SAMRecord> recs = fetchByReadName(iterator); total += recs.size(); // Check that all the reads have bases and qualities of the same length for (final SAMRecord rec : recs) { if (rec.getReadBases().length != rec.getBaseQualities().length) { log.debug("Discarding " + recs.size() + " reads with name " + rec.getReadName() + " for mismatching bases and quals length."); discarded += recs.size(); continue readNameLoop; } } // Check that if the first read is marked as unpaired that there is in fact only one read if (!recs.get(0).getReadPairedFlag() && recs.size() > 1) { log.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because they claim to be unpaired."); discarded += recs.size(); continue readNameLoop; } // Check that if we have paired reads there is exactly one first of pair and one second of pair if (recs.get(0).getReadPairedFlag()) { int firsts = 0, seconds = 0, unpaired = 0; for (final SAMRecord rec : recs) { if (!rec.getReadPairedFlag()) ++unpaired; if (rec.getFirstOfPairFlag()) ++firsts; if (rec.getSecondOfPairFlag()) ++seconds; } if (unpaired > 0 || firsts != 1 || seconds != 1) { log.debug("Discarding " + recs.size() + " reads with name " + recs.get(0).getReadName() + " because pairing information in corrupt."); discarded += recs.size(); continue readNameLoop; } } // If we've made it this far spit the records into the output! for (final SAMRecord rec : recs) { // The only valid quality score encoding scheme is standard; if it's not standard, change it. final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup()); if (!recordFormat.equals(FastqQualityFormat.Standard)) { final byte[] quals = rec.getBaseQualities(); for (int i = 0; i < quals.length; i++) { quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND; } rec.setBaseQualities(quals); } out.addAlignment(rec); sanitizerProgress.record(rec); } } } return new long[]{discarded, total}; } /** * Generates a list by consuming from the iterator in order starting with the first available * read and continuing while subsequent reads share the same read name. If there are no reads * remaining returns an empty list. */ private List<SAMRecord> fetchByReadName(final PeekableIterator<SAMRecord> iterator) { final List<SAMRecord> out = new ArrayList<>(); if (iterator.hasNext()) { final SAMRecord first = iterator.next(); out.add(first); while (iterator.hasNext() && iterator.peek().getReadName().equals(first.getReadName())) { out.add(iterator.next()); } } return out; } private void overwriteSample(final List<SAMReadGroupRecord> readGroups, final String sampleAlias) { readGroups.forEach(rg -> rg.setSample(sampleAlias)); } private void overwriteLibrary(final List<SAMReadGroupRecord> readGroups, final String libraryName) { readGroups.forEach(rg -> rg.setLibrary(libraryName)); } static Map<String, File> createOutputMap( final File outputMapFile, final File outputDir, final String defaultExtension, final List<SAMReadGroupRecord> readGroups) { final Map<String, File> outputMap; if (outputMapFile != null) { outputMap = createOutputMapFromFile(outputMapFile); } else { outputMap = createOutputMap(readGroups, outputDir, defaultExtension); } return outputMap; } private static Map<String, File> createOutputMapFromFile(final File outputMapFile) { final Map<String, File> outputMap = new HashMap<>(); final TabbedTextFileWithHeaderParser parser = new TabbedTextFileWithHeaderParser(outputMapFile); for (final TabbedTextFileWithHeaderParser.Row row : parser) { final String id = row.getField("READ_GROUP_ID"); final String output = row.getField("OUTPUT"); final File outputPath = new File(output); outputMap.put(id, outputPath); } CloserUtil.close(parser); return outputMap; } private static Map<String, File> createOutputMap(final List<SAMReadGroupRecord> readGroups, final File outputDir, final String extension) { final Map<String, File> outputMap = new HashMap<>(); for (final SAMReadGroupRecord readGroup : readGroups) { final String id = readGroup.getId(); final String fileName = id + extension; final Path outputPath = Paths.get(outputDir.toString(), fileName); outputMap.put(id, outputPath.toFile()); } return outputMap; } private Map<String, SAMFileHeader> createHeaderMap( final SAMFileHeader inHeader, final SortOrder sortOrder, final boolean removeAlignmentInformation) { final Map<String, SAMFileHeader> headerMap = new HashMap<>(); for (final SAMReadGroupRecord readGroup : inHeader.getReadGroups()) { final SAMFileHeader header = createOutHeader(inHeader, sortOrder, removeAlignmentInformation); header.addReadGroup(readGroup); headerMap.put(readGroup.getId(), header); } return headerMap; } private SAMFileHeader createOutHeader( final SAMFileHeader inHeader, final SAMFileHeader.SortOrder sortOrder, final boolean removeAlignmentInformation) { final SAMFileHeader outHeader = new SAMFileHeader(); outHeader.setSortOrder(sortOrder); if (!removeAlignmentInformation) { outHeader.setSequenceDictionary(inHeader.getSequenceDictionary()); outHeader.setProgramRecords(inHeader.getProgramRecords()); } return outHeader; } private Map<SAMReadGroupRecord, FastqQualityFormat> createReadGroupFormatMap( final SAMFileHeader inHeader, final File referenceSequence, final ValidationStringency validationStringency, final File input, final boolean restoreOriginalQualities) { final Map<SAMReadGroupRecord, FastqQualityFormat> readGroupToFormat = new HashMap<>(); // Figure out the quality score encoding scheme for each read group. for (final SAMReadGroupRecord rg : inHeader.getReadGroups()) { final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(referenceSequence).validationStringency(validationStringency).open(input); final SamRecordFilter filter = new SamRecordFilter() { public boolean filterOut(final SAMRecord rec) { return !rec.getReadGroup().getId().equals(rg.getId()); } public boolean filterOut(final SAMRecord first, final SAMRecord second) { throw new UnsupportedOperationException(); } }; readGroupToFormat.put(rg, QualityEncodingDetector.detect(QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, new FilteringSamIterator(reader.iterator(), filter), restoreOriginalQualities)); CloserUtil.close(reader); } for (final SAMReadGroupRecord r : readGroupToFormat.keySet()) { log.info("Detected quality format for " + r.getReadGroupId() + ": " + readGroupToFormat.get(r)); } if (readGroupToFormat.values().contains(FastqQualityFormat.Solexa)) { throw new PicardException("No quality score encoding conversion implemented for " + FastqQualityFormat.Solexa); } return readGroupToFormat; } /** * Contains a map of writers used when OUTPUT_BY_READGROUP=true * and a single writer used when OUTPUT_BY_READGROUP=false. */ private static class RevertSamWriter { private final Map<String, SAMFileWriter> writerMap = new HashMap<>(); private final SAMFileWriter singleWriter; private final boolean outputByReadGroup; RevertSamWriter( final boolean outputByReadGroup, final Map<String, SAMFileHeader> headerMap, final Map<String, File> outputMap, final SAMFileHeader singleOutHeader, final File singleOutput, final boolean presorted, final SAMFileWriterFactory factory, final File referenceFasta) { this.outputByReadGroup = outputByReadGroup; if (outputByReadGroup) { singleWriter = null; for (final Map.Entry<String, File> outputMapEntry : outputMap.entrySet()) { final String readGroupId = outputMapEntry.getKey(); final File output = outputMapEntry.getValue(); final SAMFileHeader header = headerMap.get(readGroupId); final SAMFileWriter writer = factory.makeWriter(header, presorted, output, referenceFasta); writerMap.put(readGroupId, writer); } } else { singleWriter = factory.makeWriter(singleOutHeader, presorted, singleOutput, referenceFasta); } } void addAlignment(final SAMRecord rec) { final SAMFileWriter writer; if (outputByReadGroup) { writer = writerMap.get(rec.getReadGroup().getId()); } else { writer = singleWriter; } writer.addAlignment(rec); } void close() { if (outputByReadGroup) { writerMap.values().forEach(SAMFileWriter::close); } else { singleWriter.close(); } } } /** * Contains a map of sorters used when OUTPUT_BY_READGROUP=true * and a single sorter used when OUTPUT_BY_READGROUP=false. */ private static class RevertSamSorter { private final Map<String, SortingCollection<SAMRecord>> sorterMap = new HashMap<>(); private final SortingCollection<SAMRecord> singleSorter; private final boolean outputByReadGroup; RevertSamSorter( final boolean outputByReadGroup, final Map<String, SAMFileHeader> headerMap, final SAMFileHeader singleOutHeader, final int maxRecordsInRam) { this.outputByReadGroup = outputByReadGroup; if (outputByReadGroup) { for (final Map.Entry<String, SAMFileHeader> entry : headerMap.entrySet()) { final String readGroupId = entry.getKey(); final SAMFileHeader outHeader = entry.getValue(); final SortingCollection<SAMRecord> sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), new SAMRecordQueryNameComparator(), maxRecordsInRam); sorterMap.put(readGroupId, sorter); } singleSorter = null; } else { singleSorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(singleOutHeader), new SAMRecordQueryNameComparator(), maxRecordsInRam); } } void add(final SAMRecord rec) { final SortingCollection<SAMRecord> sorter; if (outputByReadGroup) { sorter = sorterMap.get(rec.getReadGroup().getId()); } else { sorter = singleSorter; } sorter.add(rec); } List<PeekableIterator<SAMRecord>> iterators() { final List<PeekableIterator<SAMRecord>> iterators = new ArrayList<>(); if (outputByReadGroup) { for (final SortingCollection<SAMRecord> sorter : sorterMap.values()) { final PeekableIterator<SAMRecord> iterator = new PeekableIterator<>(sorter.iterator()); iterators.add(iterator); } } else { final PeekableIterator<SAMRecord> iterator = new PeekableIterator<>(singleSorter.iterator()); iterators.add(iterator); } return iterators; } } /** * Methods used for validating parameters to RevertSam. */ static class ValidationUtil { static void validateSanitizeSortOrder(final boolean sanitize, final SAMFileHeader.SortOrder sortOrder, final List<String> errors) { if (sanitize && sortOrder != SAMFileHeader.SortOrder.queryname) { errors.add("SORT_ORDER must be queryname when sanitization is enabled with SANITIZE=true."); } } static void validateOutputParams(final boolean outputByReadGroup, final File output, final File outputMap, final List<String> errors) { if (outputByReadGroup) { validateOutputParamsByReadGroup(output, outputMap, errors); } else { validateOutputParamsNotByReadGroup(output, outputMap, errors); } } static void validateOutputParamsByReadGroup(final File output, final File outputMap, final List<String> errors) { if (output != null) { if (!Files.isDirectory(output.toPath())) { errors.add("When OUTPUT_BY_READGROUP=true and OUTPUT is provided, it must be a directory: " + output); } return; } // output is null if we reached here if (outputMap == null) { errors.add("Must provide either OUTPUT or OUTPUT_MAP when OUTPUT_BY_READGROUP=true."); return; } if (!Files.isReadable(outputMap.toPath())) { errors.add("Cannot read OUTPUT_MAP " + outputMap); return; } final TabbedTextFileWithHeaderParser parser = new TabbedTextFileWithHeaderParser(outputMap); if (!ValidationUtil.isOutputMapHeaderValid(parser.columnLabelsList())) { errors.add("Invalid header: " + outputMap + ". Must be a tab-separated file with READ_GROUP_ID as first column and OUTPUT as second column."); } } static void validateOutputParamsNotByReadGroup(final File output, final File outputMap, final List<String> errors) { if (outputMap != null) { errors.add("Cannot provide OUTPUT_MAP when OUTPUT_BY_READGROUP=false. Provide OUTPUT instead."); } if (output == null) { errors.add("OUTPUT is required when OUTPUT_BY_READGROUP=false"); return; } if (Files.isDirectory(output.toPath())) { errors.add("OUTPUT " + output + " should not be a directory when OUTPUT_BY_READGROUP=false"); } } /** * If we are going to override SAMPLE_ALIAS or LIBRARY_NAME, make sure all the read * groups have the same values. */ static void validateHeaderOverrides( final SAMFileHeader inHeader, final String sampleAlias, final String libraryName) { final List<SAMReadGroupRecord> rgs = inHeader.getReadGroups(); if (sampleAlias != null || libraryName != null) { boolean allSampleAliasesIdentical = true; boolean allLibraryNamesIdentical = true; for (int i = 1; i < rgs.size(); i++) { if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) { allSampleAliasesIdentical = false; } if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) { allLibraryNamesIdentical = false; } } if (sampleAlias != null && !allSampleAliasesIdentical) { throw new PicardException("Read groups have multiple values for sample. " + "A value for SAMPLE_ALIAS cannot be supplied."); } if (libraryName != null && !allLibraryNamesIdentical) { throw new PicardException("Read groups have multiple values for library name. " + "A value for library name cannot be supplied."); } } } static void assertWritable(final File output, final boolean outputByReadGroup) { if (outputByReadGroup) { if (output != null) { IOUtil.assertDirectoryIsWritable(output); } } else { IOUtil.assertFileIsWritable(output); } } static void assertAllReadGroupsMapped(final Map<String, File> outputMap, final List<SAMReadGroupRecord> readGroups) { for (final SAMReadGroupRecord readGroup : readGroups) { final String id = readGroup.getId(); final File output = outputMap.get(id); if (output == null) { throw new PicardException("Read group id " + id + " not found in OUTPUT_MAP " + outputMap); } } } static boolean isOutputMapHeaderValid(final List<String> columnLabels) { if (columnLabels.size() < 2) { return false; } if (!"READ_GROUP_ID".equals(columnLabels.get(0))) { return false; } if (!"OUTPUT".equals(columnLabels.get(1))) { return false; } return true; } } }