/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.sam; import htsjdk.samtools.SAMFileHeader.SortOrder; import htsjdk.samtools.SAMProgramRecord; import htsjdk.samtools.SamPairUtil; import htsjdk.samtools.util.Log; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.SamOrBam; import java.io.File; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * A command-line tool to merge BAM/SAM alignment info from a third-party aligner with the data in an * unmapped BAM file, producing a third BAM file that has alignment data and all the additional data * from the unmapped BAM * * @author ktibbett@broadinstitute.org */ @CommandLineProgramProperties( usage = "Merges alignment data from a SAM or BAM " + "file with additional data stored in an unmapped BAM file and produces a third SAM " + "or BAM file of aligned and unaligned reads. The purpose is to use information from the " + "unmapped BAM to fix up aligner output, so that the resulting file is valid for use by other " + "Picard programs. For simple BAM file merges, use MergeSamFiles. NOTE that MergeBamAlignment expects to " + "find a sequence dictionary in the same directory as REFERENCE_SEQUENCE and expects it " + "to have the same base name as the reference fasta except with the extension '.dict'", usageShort = "Merges alignment data from a SAM or BAM with data in an unmapped BAM file", programGroup = SamOrBam.class ) public class MergeBamAlignment extends CommandLineProgram { @Option(shortName="UNMAPPED", doc="Original SAM or BAM file of unmapped reads, which must be in queryname order.") public File UNMAPPED_BAM; @Option(shortName="ALIGNED", doc="SAM or BAM file(s) with alignment data.", mutex={"READ1_ALIGNED_BAM","READ2_ALIGNED_BAM"}, optional=true) public List<File> ALIGNED_BAM; @Option(shortName="R1_ALIGNED", doc="SAM or BAM file(s) with alignment data from the first read of a pair.", mutex={"ALIGNED_BAM"}, optional=true) public List<File> READ1_ALIGNED_BAM ; @Option(shortName="R2_ALIGNED", doc="SAM or BAM file(s) with alignment data from the second read of a pair.", mutex={"ALIGNED_BAM"}, optional=true) public List<File> READ2_ALIGNED_BAM; @Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="Merged SAM or BAM file to write to.") public File OUTPUT; @Option(shortName=StandardOptionDefinitions.REFERENCE_SHORT_NAME, doc="Path to the fasta file for the reference sequence.") public File REFERENCE_SEQUENCE; @Option(shortName=StandardOptionDefinitions.PROGRAM_RECORD_ID_SHORT_NAME, doc="The program group ID of the aligner (if not supplied by the aligned file).", optional=true) public String PROGRAM_RECORD_ID; @Option(shortName="PG_VERSION", doc="The version of the program group (if not supplied by the aligned file).", optional=true) public String PROGRAM_GROUP_VERSION; @Option(shortName="PG_COMMAND", doc="The command line of the program group (if not supplied by the aligned file).", optional=true) public String PROGRAM_GROUP_COMMAND_LINE; @Option(shortName="PG_NAME", doc="The name of the program group (if not supplied by the aligned file).", optional=true) public String PROGRAM_GROUP_NAME; @Deprecated @Option(doc="This argument is ignored and will be removed.", shortName="PE") public Boolean PAIRED_RUN; @Option(doc="The expected jump size (required if this is a jumping library). Deprecated. Use EXPECTED_ORIENTATIONS instead", shortName="JUMP", mutex="EXPECTED_ORIENTATIONS", optional=true) public Integer JUMP_SIZE; @Option(doc="Whether to clip adapters where identified.") public boolean CLIP_ADAPTERS = true; @Option(doc="Whether the lane is bisulfite sequence (used when caculating the NM tag).") public boolean IS_BISULFITE_SEQUENCE = false; @Option(doc="Whether to output only aligned reads. ") public boolean ALIGNED_READS_ONLY = false; @Option(doc="The maximum number of insertions or deletions permitted for an alignment to be " + "included. Alignments with more than this many insertions or deletions will be ignored. " + "Set to -1 to allow any number of insertions or deletions.", shortName="MAX_GAPS") public int MAX_INSERTIONS_OR_DELETIONS = 1; @Option(doc="Reserved alignment attributes (tags starting with X, Y, or Z) that should be " + "brought over from the alignment data when merging.") public List<String> ATTRIBUTES_TO_RETAIN = new ArrayList<String>(); @Option(doc="Attributes from the alignment record that should be removed when merging." + " This overrides ATTRIBUTES_TO_RETAIN if they share common tags.") public List<String> ATTRIBUTES_TO_REMOVE = new ArrayList<String>(); @Option(shortName="R1_TRIM", doc="The number of bases trimmed from the beginning of read 1 prior to alignment") public int READ1_TRIM = 0; @Option(shortName="R2_TRIM", doc="The number of bases trimmed from the beginning of read 2 prior to alignment") public int READ2_TRIM = 0; @Option(shortName="ORIENTATIONS", doc="The expected orientation of proper read pairs. Replaces JUMP_SIZE", mutex = "JUMP_SIZE", optional=true) public List<SamPairUtil.PairOrientation> EXPECTED_ORIENTATIONS; @Option(doc="Use the aligner's idea of what a proper pair is rather than computing in this program.") public boolean ALIGNER_PROPER_PAIR_FLAGS = false; @Option(shortName=StandardOptionDefinitions.SORT_ORDER_SHORT_NAME, doc="The order in which the merged reads should be output.") public SortOrder SORT_ORDER = SortOrder.coordinate; @Option(doc="Strategy for selecting primary alignment when the aligner has provided more than one alignment " + "for a pair or fragment, and none are marked as primary, more than one is marked as primary, or the primary " + "alignment is filtered out for some reason. " + "BestMapq expects that multiple alignments will be correlated with HI tag, and prefers the pair of " + "alignments with the largest MAPQ, in the absence of a primary selected by the aligner. " + "EarliestFragment prefers the alignment which maps the earliest base in the read. Note that EarliestFragment " + "may not be used for paired reads. " + "BestEndMapq is appropriate for cases in which the aligner is not pair-aware, and does not output the HI tag. " + "It simply picks the alignment for each end with the highest MAPQ, and makes those alignments primary, regardless " + "of whether the two alignments make sense together." + "MostDistant is also for a non-pair-aware aligner, and picks the alignment pair with the largest insert size. " + "If all alignments would be chimeric, it picks the alignments for each end with the best MAPQ. For all algorithms, " + "ties are resolved arbitrarily.") public PrimaryAlignmentStrategy PRIMARY_ALIGNMENT_STRATEGY = PrimaryAlignmentStrategy.BestMapq; @Option(doc="For paired reads, soft clip the 3' end of each read if necessary so that it does not extend past the 5' end of its mate.") public boolean CLIP_OVERLAPPING_READS = true; @Option(doc="If false, do not write secondary alignments to output.") public boolean INCLUDE_SECONDARY_ALIGNMENTS = true; @Option(shortName="MC", optional=true, doc="Adds the mate CIGAR tag (MC) if true, does not if false.") public Boolean ADD_MATE_CIGAR = true; private static final Log log = Log.getInstance(MergeBamAlignment.class); /** * Mechanism to bridge between command line option and PrimaryAlignmentSelectionStrategy implementation. */ enum PrimaryAlignmentStrategy { BestMapq(BestMapqPrimaryAlignmentSelectionStrategy.class), EarliestFragment(EarliestFragmentPrimaryAlignmentSelectionStrategy.class), BestEndMapq(BestEndMapqPrimaryAlignmentStrategy.class), MostDistant(MostDistantPrimaryAlignmentSelectionStrategy.class); private final Class<PrimaryAlignmentSelectionStrategy> clazz; PrimaryAlignmentStrategy(final Class<?> clazz) { this.clazz = (Class<PrimaryAlignmentSelectionStrategy>)clazz; } PrimaryAlignmentSelectionStrategy newInstance() { try { return clazz.newInstance(); } catch (Exception e) { throw new PicardException("Trouble instantiating " + clazz.getName(), e); } } } /** Required main method implementation. */ public static void main(final String[] argv) { System.exit(new MergeBamAlignment().instanceMain(argv)); } @Override protected int doWork() { // Check the files are readable/writable SAMProgramRecord prod = null; if (PROGRAM_RECORD_ID != null) { prod = new SAMProgramRecord(PROGRAM_RECORD_ID); prod.setProgramVersion(PROGRAM_GROUP_VERSION); prod.setCommandLine(PROGRAM_GROUP_COMMAND_LINE); prod.setProgramName(PROGRAM_GROUP_NAME); } // TEMPORARY FIX until internal programs all specify EXPECTED_ORIENTATIONS if (JUMP_SIZE != null) { EXPECTED_ORIENTATIONS = Arrays.asList(SamPairUtil.PairOrientation.RF); } else if (EXPECTED_ORIENTATIONS == null || EXPECTED_ORIENTATIONS.isEmpty()) { EXPECTED_ORIENTATIONS = Arrays.asList(SamPairUtil.PairOrientation.FR); } final SamAlignmentMerger merger = new SamAlignmentMerger(UNMAPPED_BAM, OUTPUT, REFERENCE_SEQUENCE, prod, CLIP_ADAPTERS, IS_BISULFITE_SEQUENCE, ALIGNED_READS_ONLY, ALIGNED_BAM, MAX_INSERTIONS_OR_DELETIONS, ATTRIBUTES_TO_RETAIN, ATTRIBUTES_TO_REMOVE, READ1_TRIM, READ2_TRIM, READ1_ALIGNED_BAM, READ2_ALIGNED_BAM, EXPECTED_ORIENTATIONS, SORT_ORDER, PRIMARY_ALIGNMENT_STRATEGY.newInstance(), ADD_MATE_CIGAR); merger.setClipOverlappingReads(CLIP_OVERLAPPING_READS); merger.setMaxRecordsInRam(MAX_RECORDS_IN_RAM); merger.setKeepAlignerProperPairFlags(ALIGNER_PROPER_PAIR_FLAGS); merger.setIncludeSecondaryAlignments(INCLUDE_SECONDARY_ALIGNMENTS); merger.mergeAlignment(); merger.close(); return 0; } /** * Put any custom command-line validation in an override of this method. * clp is initialized at this point and can be used to print usage and access argv. * Any options set by command-line parser can be validated. * @return null if command line is valid. If command line is invalid, returns * an array of error messages to be written to the appropriate place. */ protected String[] customCommandLineValidation() { if ((PROGRAM_RECORD_ID != null || PROGRAM_GROUP_VERSION != null || PROGRAM_GROUP_COMMAND_LINE != null) && (PROGRAM_RECORD_ID == null || PROGRAM_GROUP_VERSION == null || PROGRAM_GROUP_COMMAND_LINE == null)) { return new String[] {"PROGRAM_RECORD_ID, PROGRAM_GROUP_VERSION, and " + "PROGRAM_GROUP_COMMAND_LINE must all be supplied or none should " + "be included."}; } final boolean r1sExist = READ1_ALIGNED_BAM != null && READ1_ALIGNED_BAM.size() > 0; final boolean r2sExist = READ2_ALIGNED_BAM != null && READ2_ALIGNED_BAM.size() > 0; if ((r1sExist && !r2sExist) || (r2sExist && !r1sExist)) { return new String[] {"READ1_ALIGNED_BAM and READ2_ALIGNED_BAM " + "must both be supplied or neither should be included. For " + "single-end read use ALIGNED_BAM."}; } if (ALIGNED_BAM == null || ALIGNED_BAM.size() == 0 && !(r1sExist && r2sExist)) { return new String[] {"Either ALIGNED_BAM or the combination of " + "READ1_ALIGNED_BAM and READ2_ALIGNED_BAM must be supplied."}; } return null; } }