/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.illumina;
import htsjdk.samtools.ReservedTagConstants;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.Illumina;
import picard.util.AdapterMarker;
import picard.util.AdapterPair;
import picard.util.ClippingUtility;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import static picard.util.IlluminaUtil.IlluminaAdapterPair;
/**
* Command line program to mark the location of adapter sequences.
* This also outputs a Histogram of metrics describing the clipped bases
*
* @author Tim Fennell (adapted by mborkan@broadinstitute.org)
*/
@CommandLineProgramProperties(
usage = MarkIlluminaAdapters.USAGE_SUMMARY + MarkIlluminaAdapters.USAGE_DETAILS,
usageShort = MarkIlluminaAdapters.USAGE_SUMMARY,
programGroup = Illumina.class
)
public class MarkIlluminaAdapters extends CommandLineProgram {
static final String USAGE_SUMMARY = "Reads a SAM or BAM file and rewrites it with new adapter-trimming tags. ";
static final String USAGE_DETAILS = "<p>This tool clears any existing adapter-trimming tags (XT:i:) in the optional tag region of " +
"a SAM file. The SAM/BAM file must be sorted by query name.</p> "+
"<p>Outputs a metrics file histogram showing counts of bases_clipped per read." +
"" +
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar MarkIlluminaAdapters \\<br /> " +
"INPUT=input.sam \\<br />" +
"METRICS=metrics.txt " +
"</pre>" +
"<hr />"
;
// The following attributes define the command-line arguments
@Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME)
public File INPUT;
@Option(doc = "If output is not specified, just the metrics are generated",
shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, optional = true)
public File OUTPUT;
@Option(doc = "Histogram showing counts of bases_clipped in how many reads", shortName = "M")
public File METRICS;
@Option(doc = "The minimum number of bases to match over when clipping single-end reads.")
public int MIN_MATCH_BASES_SE = ClippingUtility.MIN_MATCH_BASES;
@Option(doc = "The minimum number of bases to match over (per-read) when clipping paired-end reads.")
public int MIN_MATCH_BASES_PE = ClippingUtility.MIN_MATCH_PE_BASES;
@Option(doc = "The maximum mismatch error rate to tolerate when clipping single-end reads.")
public double MAX_ERROR_RATE_SE = ClippingUtility.MAX_ERROR_RATE;
@Option(doc = "The maximum mismatch error rate to tolerate when clipping paired-end reads.")
public double MAX_ERROR_RATE_PE = ClippingUtility.MAX_PE_ERROR_RATE;
@Option(doc = "DEPRECATED. Whether this is a paired-end run. No longer used.", shortName = "PE", optional = true)
public Boolean PAIRED_RUN;
@Option(doc = "Which adapters sequences to attempt to identify and clip.")
public List<IlluminaAdapterPair> ADAPTERS =
CollectionUtil.makeList(IlluminaAdapterPair.INDEXED,
IlluminaAdapterPair.DUAL_INDEXED,
IlluminaAdapterPair.PAIRED_END
);
@Option(doc = "For specifying adapters other than standard Illumina", optional = true)
public String FIVE_PRIME_ADAPTER;
@Option(doc = "For specifying adapters other than standard Illumina", optional = true)
public String THREE_PRIME_ADAPTER;
@Option(doc = "Adapters are truncated to this length to speed adapter matching. Set to a large number to effectively disable truncation.")
public int ADAPTER_TRUNCATION_LENGTH = AdapterMarker.DEFAULT_ADAPTER_LENGTH;
@Option(doc = "If looking for multiple adapter sequences, then after having seen this many adapters, shorten the list of sequences. " +
"Keep the adapters that were found most frequently in the input so far. " +
"Set to -1 if the input has a heterogeneous mix of adapters so shortening is undesirable.",
shortName = "APT")
public int PRUNE_ADAPTER_LIST_AFTER_THIS_MANY_ADAPTERS_SEEN = AdapterMarker.DEFAULT_PRUNE_ADAPTER_LIST_AFTER_THIS_MANY_ADAPTERS_SEEN;
@Option(doc = "If pruning the adapter list, keep only this many adapter sequences when pruning the list (plus any adapters that " +
"were tied with the adapters being kept).")
public int NUM_ADAPTERS_TO_KEEP = AdapterMarker.DEFAULT_NUM_ADAPTERS_TO_KEEP;
private static final Log log = Log.getInstance(MarkIlluminaAdapters.class);
// Stock main method
public static void main(final String[] args) {
System.exit(new MarkIlluminaAdapters().instanceMain(args));
}
@Override
protected String[] customCommandLineValidation() {
if ((FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER == null) || (THREE_PRIME_ADAPTER != null && FIVE_PRIME_ADAPTER == null)) {
return new String[]{"THREE_PRIME_ADAPTER and FIVE_PRIME_ADAPTER must either both be null or both be set."};
} else {
return null;
}
}
@Override
protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(METRICS);
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
final SAMFileHeader.SortOrder order = in.getFileHeader().getSortOrder();
SAMFileWriter out = null;
if (OUTPUT != null) {
IOUtil.assertFileIsWritable(OUTPUT);
out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT);
}
final Histogram<Integer> histo = new Histogram<Integer>("clipped_bases", "read_count");
// Combine any adapters and custom adapter pairs from the command line into an array for use in clipping
final AdapterPair[] adapters;
{
final List<AdapterPair> tmp = new ArrayList<AdapterPair>();
tmp.addAll(ADAPTERS);
if (FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER != null) {
tmp.add(new CustomAdapterPair(FIVE_PRIME_ADAPTER, THREE_PRIME_ADAPTER));
}
adapters = tmp.toArray(new AdapterPair[tmp.size()]);
}
////////////////////////////////////////////////////////////////////////
// Main loop that consumes reads, clips them and writes them to the output
////////////////////////////////////////////////////////////////////////
final ProgressLogger progress = new ProgressLogger(log, 1000000, "Read");
final SAMRecordIterator iterator = in.iterator();
final AdapterMarker adapterMarker = new AdapterMarker(ADAPTER_TRUNCATION_LENGTH, adapters).
setMaxPairErrorRate(MAX_ERROR_RATE_PE).setMinPairMatchBases(MIN_MATCH_BASES_PE).
setMaxSingleEndErrorRate(MAX_ERROR_RATE_SE).setMinSingleEndMatchBases(MIN_MATCH_BASES_SE).
setNumAdaptersToKeep(NUM_ADAPTERS_TO_KEEP).
setThresholdForSelectingAdaptersToKeep(PRUNE_ADAPTER_LIST_AFTER_THIS_MANY_ADAPTERS_SEEN);
while (iterator.hasNext()) {
final SAMRecord rec = iterator.next();
final SAMRecord rec2 = rec.getReadPairedFlag() && iterator.hasNext() ? iterator.next() : null;
rec.setAttribute(ReservedTagConstants.XT, null);
// Do the clipping one way for PE and another for SE reads
if (rec.getReadPairedFlag()) {
// Assert that the input file is in query name order only if we see some PE reads
if (order != SAMFileHeader.SortOrder.queryname) {
throw new PicardException("Input BAM file must be sorted by queryname");
}
if (rec2 == null) throw new PicardException("Missing mate pair for paired read: " + rec.getReadName());
rec2.setAttribute(ReservedTagConstants.XT, null);
// Assert that we did in fact just get two mate pairs
if (!rec.getReadName().equals(rec2.getReadName())) {
throw new PicardException("Adjacent reads expected to be mate-pairs have different names: " +
rec.getReadName() + ", " + rec2.getReadName());
}
// establish which of pair is first and which second
final SAMRecord first, second;
if (rec.getFirstOfPairFlag() && rec2.getSecondOfPairFlag()) {
first = rec;
second = rec2;
} else if (rec.getSecondOfPairFlag() && rec2.getFirstOfPairFlag()) {
first = rec2;
second = rec;
} else {
throw new PicardException("Two reads with same name but not correctly marked as 1st/2nd of pair: " + rec.getReadName());
}
adapterMarker.adapterTrimIlluminaPairedReads(first, second);
} else {
adapterMarker.adapterTrimIlluminaSingleRead(rec);
}
// Then output the records, update progress and metrics
for (final SAMRecord r : new SAMRecord[]{rec, rec2}) {
if (r != null) {
progress.record(r);
if (out != null) out.addAlignment(r);
final Integer clip = r.getIntegerAttribute(ReservedTagConstants.XT);
if (clip != null) histo.increment(r.getReadLength() - clip + 1);
}
}
}
if (out != null) out.close();
// Lastly output the metrics to file
final MetricsFile<?, Integer> metricsFile = getMetricsFile();
metricsFile.setHistogram(histo);
metricsFile.write(METRICS);
CloserUtil.close(in);
return 0;
}
}