package picard.analysis.artifacts;
import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.filter.*;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.*;
import picard.PicardException;
import picard.analysis.SinglePassSamProgram;
import picard.analysis.artifacts.SequencingArtifactMetrics.BaitBiasDetailMetrics;
import picard.analysis.artifacts.SequencingArtifactMetrics.BaitBiasSummaryMetrics;
import picard.analysis.artifacts.SequencingArtifactMetrics.PreAdapterDetailMetrics;
import picard.analysis.artifacts.SequencingArtifactMetrics.PreAdapterSummaryMetrics;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Metrics;
import picard.util.DbSnpBitSetUtil;
import picard.util.VariantType;
import java.io.File;
import java.util.*;
import java.util.stream.Collectors;
import static htsjdk.samtools.util.CodeUtil.getOrElse;
import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;
/**
 * Quantify substitution errors caused by mismatched base pairings during various
 * stages of sample / library prep.
 *
 * We measure two distinct error types: artifacts that are introduced before
 * the addition of the read1/read2 adapters ("pre-adapter") and those that are
 * introduced during or after target selection ("bait bias"). For each of these,
 * we provide summary metrics as well as detail metrics broken down by reference
 * context (the ref bases surrounding the substitution event).
 *
 * For a deeper explanation, see Costello et al. 2013:
 * http://www.ncbi.nlm.nih.gov/pubmed/23303777
 *
 * @author mattsooknah
 */
@CommandLineProgramProperties(
usage = CollectSequencingArtifactMetrics.USAGE_SUMMARY + CollectSequencingArtifactMetrics.USAGE_DETAILS,
usageShort = CollectSequencingArtifactMetrics.USAGE_SUMMARY,
programGroup = Metrics.class
)
public class CollectSequencingArtifactMetrics extends SinglePassSamProgram {
static final String USAGE_SUMMARY = "Collect metrics to quantify single-base sequencing artifacts. ";
    static final String USAGE_DETAILS = "<p>This tool examines two sources of sequencing errors associated with hybrid selection "+
            "protocols. These errors are divided into two broad categories, pre-adapter and bait-bias. Pre-adapter errors can arise "+
            "from laboratory manipulations of a nucleic acid sample (e.g. shearing) and occur prior to the ligation of adapters for "+
            "PCR amplification (hence the name pre-adapter). </p>" +
"<p>Bait-bias artifacts occur during or after the target selection step, and correlate with substitution rates that are "+
"'biased', or higher for sites having one base on the reference/positive strand relative to sites having the complementary "+
"base on that strand. For example, during the target selection step, a (G>T) artifact might result in a higher substitution "+
"rate at sites with a G on the positive strand (and C on the negative), relative to sites with the flip (C positive)/(G negative)." +
" This is known as the 'G-Ref' artifact. </p>" +
"" +
"<p>For additional information on these types of artifacts, please see the corresponding GATK dictionary entries on "+
"<a href='https://www.broadinstitute.org/gatk/guide/article?id=6333'>bait-bias</a> and "+
"<a href='https://www.broadinstitute.org/gatk/guide/article?id=6332'>pre-adapter artifacts</a>.</p>"+
""+
"<p>This tool produces four files; summary and detail metrics files for both pre-adapter and bait-bias artifacts. The detailed "+
"metrics show the error rates for each type of base substitution within every possible triplet base configuration. Error " +
"rates associated with these substitutions are Phred-scaled and provided as quality scores, the lower the value, the more " +
"likely it is that an alternate base call is due to an artifact. The summary metrics provide likelihood information on the " +
"'worst-case' errors. </p>" +
""+
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar CollectSequencingArtifactMetrics \\<br />" +
" I=input.bam \\<br />" +
" O=artifact_metrics.txt \\<br />" +
" R=reference_sequence.fasta" +
"</pre>" +
"Please see the metrics at the following links " +
"<a href='http://broadinstitute.github.io/picard/picard-metric-definitions.html#SequencingArtifactMetrics.PreAdapterDetailMetrics'>PreAdapterDetailMetrics</a>, "+
"<a href='http://broadinstitute.github.io/picard/picard-metric-definitions.html#SequencingArtifactMetrics.PreAdapterSummaryMetrics'>PreAdapterSummaryMetrics</a>, "+
"<a href='http://broadinstitute.github.io/picard/picard-metric-definitions.html#SequencingArtifactMetrics.BaitBiasDetailMetrics'>BaitBiasDetailMetrics</a>, and "+
"<a href='http://broadinstitute.github.io/picard/picard-metric-definitions.html#SequencingArtifactMetrics.BaitBiasSummaryMetrics'>BaitBiasSummaryMetrics</a> "+
"for complete descriptions of the output metrics produced by this tool. "+
"<hr />"
;
@Option(doc = "An optional list of intervals to restrict analysis to.", optional = true)
public File INTERVALS;
@Option(doc = "VCF format dbSNP file, used to exclude regions around known polymorphisms from analysis.", optional = true)
public File DB_SNP;
@Option(shortName = "Q", doc = "The minimum base quality score for a base to be included in analysis.")
public int MINIMUM_QUALITY_SCORE = 20;
@Option(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "The minimum mapping quality score for a base to be included in analysis.")
public int MINIMUM_MAPPING_QUALITY = 30;
@Option(shortName = "MIN_INS", doc = "The minimum insert size for a read to be included in analysis.")
public int MINIMUM_INSERT_SIZE = 60;
@Option(shortName = "MAX_INS", doc = "The maximum insert size for a read to be included in analysis. Set to 0 to have no maximum.")
public int MAXIMUM_INSERT_SIZE = 600;
@Option(shortName = "UNPAIRED", doc = "Include unpaired reads. If set to true then all paired reads will be included as well - " +
"MINIMUM_INSERT_SIZE and MAXIMUM_INSERT_SIZE will be ignored.")
public boolean INCLUDE_UNPAIRED = false;
@Option(shortName = "DUPES", doc = "Include duplicate reads. If set to true then all reads flagged as duplicates will be included as well.")
public boolean INCLUDE_DUPLICATES = false;
@Option(shortName = "NON_PF", doc = "Whether or not to include non-PF reads.")
public boolean INCLUDE_NON_PF_READS = false;
@Option(shortName = "TANDEM", doc = "Set to true if mate pairs are being sequenced from the same strand, " +
"i.e. they're expected to face the same direction.")
public boolean TANDEM_READS = false;
@Option(doc = "When available, use original quality scores for filtering.")
public boolean USE_OQ = true;
@Option(doc = "The number of context bases to include on each side of the assayed base.")
public int CONTEXT_SIZE = 1;
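    // Illustrative example: with CONTEXT_SIZE = 1 each context is a 3-base string, so passing
    // e.g. CONTEXTS_TO_PRINT=ACA below would restrict the detail output to the ACA reference context.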
@Option(doc = "If specified, only print results for these contexts in the detail metrics output. " +
"However, the summary metrics output will still take all contexts into consideration.")
public Set<String> CONTEXTS_TO_PRINT = new HashSet<String>();
@Option(shortName = "EXT", doc="Append the given file extension to all metric file names (ex. OUTPUT.pre_adapter_summary_metrics.EXT). None if null", optional=true)
public String FILE_EXTENSION = null;
private static final String UNKNOWN_LIBRARY = "UnknownLibrary";
private static final String UNKNOWN_SAMPLE = "UnknownSample";
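    // Output metric files; their paths are derived from OUTPUT in setup().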
private File preAdapterSummaryOut;
private File preAdapterDetailsOut;
private File baitBiasSummaryOut;
private File baitBiasDetailsOut;
private File errorSummaryFile;
private IntervalListReferenceSequenceMask intervalMask;
private DbSnpBitSetUtil dbSnpMask;
private SamRecordFilter recordFilter;
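    // Cache of the current reference contig as an upper-cased String (see getRefContext).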
private String currentRefString = null;
private int currentRefIndex = -1;
private final Set<String> samples = new HashSet<String>();
private final Set<String> libraries = new HashSet<String>();
private final Map<String, ArtifactCounter> artifactCounters = new HashMap<String, ArtifactCounter>();
private static final Log log = Log.getInstance(CollectSequencingArtifactMetrics.class);
@Override
protected String[] customCommandLineValidation() {
final List<String> messages = new ArrayList<String>();
final int contextFullLength = 2 * CONTEXT_SIZE + 1;
if (CONTEXT_SIZE < 0) messages.add("CONTEXT_SIZE cannot be negative");
for (final String context : CONTEXTS_TO_PRINT) {
if (context.length() != contextFullLength) {
messages.add("Context " + context + " is not the length implied by CONTEXT_SIZE: " + contextFullLength);
}
}
if (MINIMUM_INSERT_SIZE < 0) messages.add("MINIMUM_INSERT_SIZE cannot be negative");
if (MAXIMUM_INSERT_SIZE < 0) messages.add("MAXIMUM_INSERT_SIZE cannot be negative");
if (MAXIMUM_INSERT_SIZE > 0 && MAXIMUM_INSERT_SIZE < MINIMUM_INSERT_SIZE) {
messages.add("MAXIMUM_INSERT_SIZE cannot be less than MINIMUM_INSERT_SIZE unless set to 0");
}
if (REFERENCE_SEQUENCE == null) messages.add("REFERENCE_SEQUENCE must be provided.");
return messages.isEmpty() ? null : messages.toArray(new String[messages.size()]);
}
@Override
protected void setup(final SAMFileHeader header, final File samFile) {
final String outext = (null != FILE_EXTENSION) ? FILE_EXTENSION : ""; // Add a file extension if desired
preAdapterSummaryOut = new File(OUTPUT + SequencingArtifactMetrics.PRE_ADAPTER_SUMMARY_EXT + outext);
preAdapterDetailsOut = new File(OUTPUT + SequencingArtifactMetrics.PRE_ADAPTER_DETAILS_EXT + outext);
baitBiasSummaryOut = new File(OUTPUT + SequencingArtifactMetrics.BAIT_BIAS_SUMMARY_EXT + outext);
baitBiasDetailsOut = new File(OUTPUT + SequencingArtifactMetrics.BAIT_BIAS_DETAILS_EXT + outext);
errorSummaryFile = new File(OUTPUT + SequencingArtifactMetrics.ERROR_SUMMARY_EXT + outext);
IOUtil.assertFilesAreWritable(Arrays.asList(preAdapterSummaryOut, preAdapterDetailsOut, baitBiasSummaryOut, baitBiasDetailsOut, errorSummaryFile));
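        // Gather the sample and library names declared in the read groups; read groups
        // missing a sample or library fall back to placeholder names.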
for (final SAMReadGroupRecord rec : header.getReadGroups()) {
samples.add(getOrElse(rec.getSample(), UNKNOWN_SAMPLE));
libraries.add(getOrElse(rec.getLibrary(), UNKNOWN_LIBRARY));
}
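        // Build the optional masks: an interval mask if INTERVALS was given, and a dbSNP mask
        // if DB_SNP was given (restricted to the intervals when both are present).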
if (INTERVALS != null) {
IOUtil.assertFileIsReadable(INTERVALS);
final IntervalList intervalList = IntervalList.fromFile(INTERVALS).uniqued();
intervalMask = new IntervalListReferenceSequenceMask(intervalList);
if (DB_SNP != null) {
IOUtil.assertFileIsReadable(DB_SNP);
dbSnpMask = new DbSnpBitSetUtil(DB_SNP, header.getSequenceDictionary(), EnumSet.noneOf(VariantType.class), intervalList, Optional.of(log));
}
}
else if (DB_SNP != null) {
IOUtil.assertFileIsReadable(DB_SNP);
dbSnpMask = new DbSnpBitSetUtil(DB_SNP, header.getSequenceDictionary(), EnumSet.noneOf(VariantType.class), null, Optional.of(log));
}
// set record-level filters
final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
if (!INCLUDE_NON_PF_READS) filters.add(new FailsVendorReadQualityFilter());
filters.add(new NotPrimaryAlignmentFilter());
if (!INCLUDE_DUPLICATES) filters.add(new DuplicateReadFilter());
filters.add(new AlignedFilter(true)); // discard unmapped reads
filters.add(new MappingQualityFilter(MINIMUM_MAPPING_QUALITY));
if (!INCLUDE_UNPAIRED) {
final int effectiveMaxInsertSize = (MAXIMUM_INSERT_SIZE == 0) ? Integer.MAX_VALUE : MAXIMUM_INSERT_SIZE;
filters.add(new InsertSizeFilter(MINIMUM_INSERT_SIZE, effectiveMaxInsertSize));
}
recordFilter = new AggregateFilter(filters);
// set up the artifact counters
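        // (one counter per library; the sample alias is the comma-joined set of all samples)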
final String sampleAlias = StringUtil.join(",", new ArrayList<String>(samples));
for (final String library : libraries) {
artifactCounters.put(library, new ArtifactCounter(sampleAlias, library, CONTEXT_SIZE, TANDEM_READS));
}
}
@Override
protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) {
// see if the whole read should be skipped
if (recordFilter.filterOut(rec)) return;
// check read group + library
final String library = (rec.getReadGroup() == null) ? UNKNOWN_LIBRARY : getOrElse(rec.getReadGroup().getLibrary(), UNKNOWN_LIBRARY);
if (!libraries.contains(library)) {
// should never happen if SAM is valid
throw new PicardException("Record contains library that is missing from header: " + library);
}
// set up some constants that don't change in the loop below
final int contextFullLength = 2 * CONTEXT_SIZE + 1;
final ArtifactCounter counter = artifactCounters.get(library);
final byte[] readBases = rec.getReadBases();
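        // Prefer the original (OQ) base qualities when requested and present; otherwise
        // fall back to the current base qualities.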
final byte[] readQuals;
if (USE_OQ) {
final byte[] tmp = rec.getOriginalBaseQualities();
readQuals = tmp == null ? rec.getBaseQualities() : tmp;
} else {
readQuals = rec.getBaseQualities();
}
// iterate over aligned positions
for (final AlignmentBlock block : rec.getAlignmentBlocks()) {
for (int offset = 0; offset < block.getLength(); offset++) {
// remember, these are 1-based!
final int readPos = block.getReadStart() + offset;
final int refPos = block.getReferenceStart() + offset;
// skip low BQ sites
final byte qual = readQuals[readPos - 1];
if (qual < MINIMUM_QUALITY_SCORE) continue;
// skip N bases in read
final char readBase = Character.toUpperCase((char)readBases[readPos - 1]);
if (readBase == 'N') continue;
                /*
                 * Skip regions outside of intervals.
                 *
                 * NB: IntervalListReferenceSequenceMask.get() has side effects that assume
                 * successive ReferenceSequences passed to this method arrive in order
                 * (e.g. it will break if you call acceptRead() with chr1, then chr2, then chr1
                 * again). So this only works if the underlying iteration is coordinate-sorted.
                 */
if (intervalMask != null && !intervalMask.get(ref.getContigIndex(), refPos)) continue;
// skip dbSNP sites
if (dbSnpMask != null && dbSnpMask.isDbSnpSite(ref.getName(), refPos)) continue;
// skip the ends of the reference
final int contextStartIndex = refPos - CONTEXT_SIZE - 1;
if (contextStartIndex < 0 || contextStartIndex + contextFullLength > ref.length()) continue;
// skip contexts with N bases
final String context = getRefContext(ref, contextStartIndex, contextFullLength);
if (context.contains("N")) continue;
// count the base!
counter.countRecord(context, readBase, rec);
}
}
}
private String getRefContext(final ReferenceSequence ref, final int contextStartIndex, final int contextFullLength) {
// cache the upper-cased string version of this reference so we don't need to create a string for every base in every read
if (currentRefIndex != ref.getContigIndex()) {
currentRefString = new String(ref.getBases()).toUpperCase();
currentRefIndex = ref.getContigIndex();
}
return currentRefString.substring(contextStartIndex, contextStartIndex + contextFullLength);
}
@Override
protected void finish() {
final MetricsFile<PreAdapterSummaryMetrics, Integer> preAdapterSummaryMetricsFile = getMetricsFile();
final MetricsFile<PreAdapterDetailMetrics, Integer> preAdapterDetailMetricsFile = getMetricsFile();
final MetricsFile<BaitBiasSummaryMetrics, Integer> baitBiasSummaryMetricsFile = getMetricsFile();
final MetricsFile<BaitBiasDetailMetrics, Integer> baitBiasDetailMetricsFile = getMetricsFile();
final MetricsFile<ErrorSummaryMetrics,?> errorSummaryMetricsFile = getMetricsFile();
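        // Merge the metrics from each per-library counter into the shared metrics files.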
for (final ArtifactCounter counter : artifactCounters.values()) {
// build metrics
counter.finish();
// write metrics
preAdapterSummaryMetricsFile.addAllMetrics(counter.getPreAdapterSummaryMetrics());
baitBiasSummaryMetricsFile.addAllMetrics(counter.getBaitBiasSummaryMetrics());
for (final PreAdapterDetailMetrics preAdapterDetailMetrics : counter.getPreAdapterDetailMetrics()) {
if (CONTEXTS_TO_PRINT.isEmpty() || CONTEXTS_TO_PRINT.contains(preAdapterDetailMetrics.CONTEXT)) {
preAdapterDetailMetricsFile.addMetric(preAdapterDetailMetrics);
}
}
for (final BaitBiasDetailMetrics baitBiasDetailMetrics : counter.getBaitBiasDetailMetrics()) {
if (CONTEXTS_TO_PRINT.isEmpty() || CONTEXTS_TO_PRINT.contains(baitBiasDetailMetrics.CONTEXT)) {
baitBiasDetailMetricsFile.addMetric(baitBiasDetailMetrics);
}
}
}
preAdapterDetailMetricsFile.write(preAdapterDetailsOut);
preAdapterSummaryMetricsFile.write(preAdapterSummaryOut);
baitBiasDetailMetricsFile.write(baitBiasDetailsOut);
baitBiasSummaryMetricsFile.write(baitBiasSummaryOut);
// Calculate the summary error rates - it's CRITICAL that the other files are written out
// first as this code modifies the pre-adapter detail metrics!
if (!preAdapterDetailMetricsFile.getMetrics().isEmpty()) {
final List<PreAdapterDetailMetrics> in = preAdapterDetailMetricsFile.getMetrics();
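            // Canonicalize each substitution so that the reference base is A or C; this folds
            // each artifact together with its complement (e.g. G>T counts are pooled with C>A).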
in.forEach(m -> {
if (m.REF_BASE == 'G' || m.REF_BASE == 'T') {
m.REF_BASE = (char) SequenceUtil.complement((byte) m.REF_BASE);
m.ALT_BASE = (char) SequenceUtil.complement((byte) m.ALT_BASE);
}
});
// Group the metrics by error type
final Map<String,List<PreAdapterDetailMetrics>> byError =
in.stream().collect(Collectors.groupingBy(m -> m.REF_BASE + ">" + m.ALT_BASE));
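            // Emit one summary row per substitution type, iterating in sorted key order.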
for (final String error : new TreeSet<>(byError.keySet())) {
final List<PreAdapterDetailMetrics> ms = byError.get(error);
final ErrorSummaryMetrics summary = new ErrorSummaryMetrics();
summary.REF_BASE = ms.get(0).REF_BASE;
summary.ALT_BASE = ms.get(0).ALT_BASE;
summary.SUBSTITUTION = error;
summary.REF_COUNT = ms.stream().mapToLong(m -> m.PRO_REF_BASES + m.CON_REF_BASES).sum();
summary.ALT_COUNT = ms.stream().mapToLong(m -> m.PRO_ALT_BASES + m.CON_ALT_BASES).sum();
summary.calculateDerivedFields();
errorSummaryMetricsFile.addMetric(summary);
}
}
errorSummaryMetricsFile.write(errorSummaryFile);
}
@Override
protected boolean usesNoRefReads() { return false; }
}