/*
* The MIT License
*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.analysis;
import htsjdk.samtools.*;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.StringUtil;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import picard.PicardException;
import picard.cmdline.CommandLineProgramTest;
import picard.annotation.RefFlatReader.RefFlatColumns;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
public class CollectRnaSeqMetricsTest extends CommandLineProgramTest {
public String getCommandLineProgramName() {
return CollectRnaSeqMetrics.class.getSimpleName();
}
@Test
public void testBasic() throws Exception {
final String sequence = "chr1";
final String ignoredSequence = "chrM";
// Create some alignments that hit the ribosomal sequence, various parts of the gene, and intergenic.
final SAMRecordSetBuilder builder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate);
// Set seed so that strandedness is consistent among runs.
builder.setRandomSeed(0);
final int sequenceIndex = builder.getHeader().getSequenceIndex(sequence);
builder.addPair("pair1", sequenceIndex, 45, 475);
builder.addPair("pair2", sequenceIndex, 90, 225);
builder.addPair("pair3", sequenceIndex, 120, 600);
builder.addFrag("frag1", sequenceIndex, 150, true);
builder.addFrag("frag2", sequenceIndex, 450, true);
builder.addFrag("frag3", sequenceIndex, 225, false);
builder.addPair("rrnaPair", sequenceIndex, 400, 500);
builder.addFrag("ignoredFrag", builder.getHeader().getSequenceIndex(ignoredSequence), 1, false);
final File samFile = File.createTempFile("tmp.collectRnaSeqMetrics.", ".sam");
samFile.deleteOnExit();
final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMWriter(builder.getHeader(), false, samFile);
for (final SAMRecord rec: builder.getRecords()) samWriter.addAlignment(rec);
samWriter.close();
// Create an interval list with one ribosomal interval.
final Interval rRnaInterval = new Interval(sequence, 300, 520, true, "rRNA");
final IntervalList rRnaIntervalList = new IntervalList(builder.getHeader());
rRnaIntervalList.add(rRnaInterval);
final File rRnaIntervalsFile = File.createTempFile("tmp.rRna.", ".interval_list");
rRnaIntervalsFile.deleteOnExit();
rRnaIntervalList.write(rRnaIntervalsFile);
// Generate the metrics.
final File metricsFile = File.createTempFile("tmp.", ".rna_metrics");
metricsFile.deleteOnExit();
final String[] args = new String[] {
"INPUT=" + samFile.getAbsolutePath(),
"OUTPUT=" + metricsFile.getAbsolutePath(),
"REF_FLAT=" + getRefFlatFile(sequence).getAbsolutePath(),
"RIBOSOMAL_INTERVALS=" + rRnaIntervalsFile.getAbsolutePath(),
"STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND",
"IGNORE_SEQUENCE=" + ignoredSequence
};
Assert.assertEquals(runPicardCommandLine(args), 0);
final MetricsFile<RnaSeqMetrics, Comparable<?>> output = new MetricsFile<RnaSeqMetrics, Comparable<?>>();
output.read(new FileReader(metricsFile));
final RnaSeqMetrics metrics = output.getMetrics().get(0);
Assert.assertEquals(metrics.PF_ALIGNED_BASES, 396);
Assert.assertEquals(metrics.PF_BASES, 432);
Assert.assertEquals(metrics.RIBOSOMAL_BASES.longValue(), 108L);
Assert.assertEquals(metrics.CODING_BASES, 136);
Assert.assertEquals(metrics.UTR_BASES, 51);
Assert.assertEquals(metrics.INTRONIC_BASES, 50);
Assert.assertEquals(metrics.INTERGENIC_BASES, 51);
Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3);
Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 4);
Assert.assertEquals(metrics.IGNORED_READS, 1);
Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 1);
Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2);
Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 2);
Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.333333);
Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.666667);
}
@DataProvider(name = "rRnaIntervalsFiles")
public static Object[][] rRnaIntervalsFiles() throws IOException {
return new Object[][] {
{null},
{File.createTempFile("tmp.rRna.", ".interval_list")}
};
}
@Test(dataProvider = "rRnaIntervalsFiles")
public void testNoIntevalsNoFragPercentage(final File rRnaIntervalsFile) throws Exception {
final SAMRecordSetBuilder builder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate);
// Add a header but no intervals
if ( rRnaIntervalsFile != null ) {
final IntervalList rRnaIntervalList = new IntervalList(builder.getHeader());
rRnaIntervalList.write(rRnaIntervalsFile);
rRnaIntervalsFile.deleteOnExit();
}
// Create some alignments
final String sequence = "chr1";
final String ignoredSequence = "chrM";
// Set seed so that strandedness is consistent among runs.
builder.setRandomSeed(0);
final int sequenceIndex = builder.getHeader().getSequenceIndex(sequence);
builder.addPair("pair1", sequenceIndex, 45, 475);
builder.addFrag("ignoredFrag", builder.getHeader().getSequenceIndex(ignoredSequence), 1, false);
final File samFile = File.createTempFile("tmp.collectRnaSeqMetrics.", ".sam");
samFile.deleteOnExit();
final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMWriter(builder.getHeader(), false, samFile);
for (final SAMRecord rec : builder.getRecords()) samWriter.addAlignment(rec);
samWriter.close();
// Generate the metrics.
final File metricsFile = File.createTempFile("tmp.", ".rna_metrics");
metricsFile.deleteOnExit();
final String rRnaIntervalsPath = rRnaIntervalsFile != null ? rRnaIntervalsFile.getAbsolutePath() : null;
final String[] args = new String[]{
"INPUT=" + samFile.getAbsolutePath(),
"OUTPUT=" + metricsFile.getAbsolutePath(),
"REF_FLAT=" + getRefFlatFile(sequence).getAbsolutePath(),
"RIBOSOMAL_INTERVALS=" + rRnaIntervalsPath,
"RRNA_FRAGMENT_PERCENTAGE=" + 0.0,
"STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND",
"IGNORE_SEQUENCE=" + ignoredSequence
};
try {
Assert.assertEquals(runPicardCommandLine(args), 0);
} catch(final Exception e) {
if (rRnaIntervalsFile != null) {
Assert.assertEquals(e.getCause().getClass(), PicardException.class);
}
}
}
@Test
public void testMultiLevel() throws Exception {
final String sequence = "chr1";
final String ignoredSequence = "chrM";
// Create some alignments that hit the ribosomal sequence, various parts of the gene, and intergenic.
final SAMRecordSetBuilder builder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate, false);
// Set seed so that strandedness is consistent among runs.
builder.setRandomSeed(0);
final int sequenceIndex = builder.getHeader().getSequenceIndex(sequence);
final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("2");
rg1.setSample("Sample");
rg1.setLibrary("foo");
builder.setReadGroup(rg1);
builder.addPair("pair1", sequenceIndex, 45, 475);
builder.addPair("pair2", sequenceIndex, 90, 225);
builder.addFrag("frag1", sequenceIndex, 150, true);
builder.addFrag("frag2", sequenceIndex, 450, true);
final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("3");
rg2.setSample("Sample");
rg2.setLibrary("bar");
builder.setReadGroup(rg2);
builder.addPair("pair3", sequenceIndex, 120, 600);
builder.addFrag("frag3", sequenceIndex, 225, false);
builder.addPair("rrnaPair", sequenceIndex, 400, 500);
builder.addFrag("ignoredFrag", builder.getHeader().getSequenceIndex(ignoredSequence), 1, false);
final File samFile = File.createTempFile("tmp.collectRnaSeqMetrics.", ".sam");
samFile.deleteOnExit();
final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMWriter(builder.getHeader(), false, samFile);
for (final SAMRecord rec: builder.getRecords()) samWriter.addAlignment(rec);
samWriter.close();
// Create an interval list with one ribosomal interval.
final Interval rRnaInterval = new Interval(sequence, 300, 520, true, "rRNA");
final IntervalList rRnaIntervalList = new IntervalList(builder.getHeader());
rRnaIntervalList.add(rRnaInterval);
final File rRnaIntervalsFile = File.createTempFile("tmp.rRna.", ".interval_list");
rRnaIntervalsFile.deleteOnExit();
rRnaIntervalList.write(rRnaIntervalsFile);
// Generate the metrics.
final File metricsFile = File.createTempFile("tmp.", ".rna_metrics");
metricsFile.deleteOnExit();
final String[] args = new String[] {
"INPUT=" + samFile.getAbsolutePath(),
"OUTPUT=" + metricsFile.getAbsolutePath(),
"REF_FLAT=" + getRefFlatFile(sequence).getAbsolutePath(),
"RIBOSOMAL_INTERVALS=" + rRnaIntervalsFile.getAbsolutePath(),
"STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND",
"IGNORE_SEQUENCE=" + ignoredSequence,
"LEVEL=null",
"LEVEL=SAMPLE",
"LEVEL=LIBRARY"
};
Assert.assertEquals(runPicardCommandLine(args), 0);
final MetricsFile<RnaSeqMetrics, Comparable<?>> output = new MetricsFile<RnaSeqMetrics, Comparable<?>>();
output.read(new FileReader(metricsFile));
for (final RnaSeqMetrics metrics : output.getMetrics()) {
if (metrics.LIBRARY == null) {
Assert.assertEquals(metrics.PF_ALIGNED_BASES, 396);
Assert.assertEquals(metrics.PF_BASES, 432);
Assert.assertEquals(metrics.RIBOSOMAL_BASES.longValue(), 108L);
Assert.assertEquals(metrics.CODING_BASES, 136);
Assert.assertEquals(metrics.UTR_BASES, 51);
Assert.assertEquals(metrics.INTRONIC_BASES, 50);
Assert.assertEquals(metrics.INTERGENIC_BASES, 51);
Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3);
Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 4);
Assert.assertEquals(metrics.IGNORED_READS, 1);
Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 1);
Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2);
Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 2);
Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.333333);
Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.666667);
}
else if (metrics.LIBRARY.equals("foo")) {
Assert.assertEquals(metrics.PF_ALIGNED_BASES, 216);
Assert.assertEquals(metrics.PF_BASES, 216);
Assert.assertEquals(metrics.RIBOSOMAL_BASES.longValue(), 36L);
Assert.assertEquals(metrics.CODING_BASES, 89);
Assert.assertEquals(metrics.UTR_BASES, 51);
Assert.assertEquals(metrics.INTRONIC_BASES, 25);
Assert.assertEquals(metrics.INTERGENIC_BASES, 15);
Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3);
Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 2);
Assert.assertEquals(metrics.IGNORED_READS, 0);
Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 0);
Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2);
Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 1);
Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.0);
Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 1.0);
}
else if (metrics.LIBRARY.equals("bar")) {
Assert.assertEquals(metrics.PF_ALIGNED_BASES, 180);
Assert.assertEquals(metrics.PF_BASES, 216);
Assert.assertEquals(metrics.RIBOSOMAL_BASES.longValue(), 72L);
Assert.assertEquals(metrics.CODING_BASES, 47);
Assert.assertEquals(metrics.UTR_BASES, 0);
Assert.assertEquals(metrics.INTRONIC_BASES, 25);
Assert.assertEquals(metrics.INTERGENIC_BASES, 36);
Assert.assertEquals(metrics.CORRECT_STRAND_READS, 0);
Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 2);
Assert.assertEquals(metrics.IGNORED_READS, 1);
Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 1);
Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 0);
Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 1);
Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 1.0);
Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.0);
}
}
}
@Test
public void testTranscriptionStrandMetrics() throws Exception {
final String sequence = "chr1";
final String ignoredSequence = "chrM";
// Create some alignments that hit the ribosomal sequence, various parts of the gene, and intergenic.
final SAMRecordSetBuilder builder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate);
// Set seed so that strandedness is consistent among runs.
builder.setRandomSeed(0);
builder.setReadLength(50);
final int sequenceIndex = builder.getHeader().getSequenceIndex(sequence);
// Reads that start *before* the gene: count as unexamined
builder.addPair("pair_prior_1", sequenceIndex, 45, 50, false, false, "50M", "50M", true, false, -1);
builder.addPair("pair_prior_2", sequenceIndex, 49, 50, false, false, "50M", "50M", true, false, -1);
// Read that is enclosed in the gene, but one end does not map: count as unexamined
builder.addPair("read_one_end_unmapped", sequenceIndex, 50, 51, false, true, "50M", null, false, false, -1);
// Reads that are enclosed in the gene, paired and frag: one count per template
builder.addFrag("frag_enclosed_forward", sequenceIndex, 150, false);
builder.addFrag("frag_enclosed_reverse", sequenceIndex, 150, true);
builder.addPair("pair_enclosed_forward", sequenceIndex, 150, 200, false, false, "50M", "50M", false, true, -1);
builder.addPair("pair_enclosed_reverse", sequenceIndex, 200, 150, false, false, "50M", "50M", true, false, -1);
// Read that starts *after* the gene: not counted
builder.addPair("pair_after_1", sequenceIndex, 545, 550, false, false, "50M", "50M", true, false, -1);
builder.addPair("pair_after_2", sequenceIndex, 549, 550, false, false, "50M", "50M", true, false, -1);
// A read that uses the read length instead of the mate cigar
builder.addFrag("frag_enclosed_forward_no_mate_cigar", sequenceIndex, 150, false).setAttribute(SAMTag.MC.name(), null);
// Duplicate reads are counted
builder.addFrag("frag_duplicate", sequenceIndex, 150, false).setDuplicateReadFlag(true);
// These reads should be ignored.
builder.addFrag("frag_secondary", sequenceIndex, 150, false).setNotPrimaryAlignmentFlag(true);
builder.addFrag("frag_supplementary", sequenceIndex, 150, false).setSupplementaryAlignmentFlag(true);
builder.addFrag("frag_qc_failure", sequenceIndex, 150, false).setReadFailsVendorQualityCheckFlag(true);
final File samFile = File.createTempFile("tmp.collectRnaSeqMetrics.", ".sam");
samFile.deleteOnExit();
final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMWriter(builder.getHeader(), false, samFile);
for (final SAMRecord rec: builder.getRecords()) samWriter.addAlignment(rec);
samWriter.close();
// Create an interval list with one ribosomal interval.
final Interval rRnaInterval = new Interval(sequence, 300, 520, true, "rRNA");
final IntervalList rRnaIntervalList = new IntervalList(builder.getHeader());
rRnaIntervalList.add(rRnaInterval);
final File rRnaIntervalsFile = File.createTempFile("tmp.rRna.", ".interval_list");
rRnaIntervalsFile.deleteOnExit();
rRnaIntervalList.write(rRnaIntervalsFile);
// Generate the metrics.
final File metricsFile = File.createTempFile("tmp.", ".rna_metrics");
metricsFile.deleteOnExit();
final String[] args = new String[] {
"INPUT=" + samFile.getAbsolutePath(),
"OUTPUT=" + metricsFile.getAbsolutePath(),
"REF_FLAT=" + getRefFlatFile(sequence).getAbsolutePath(),
"RIBOSOMAL_INTERVALS=" + rRnaIntervalsFile.getAbsolutePath(),
"STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND",
"IGNORE_SEQUENCE=" + ignoredSequence
};
Assert.assertEquals(runPicardCommandLine(args), 0);
final MetricsFile<RnaSeqMetrics, Comparable<?>> output = new MetricsFile<RnaSeqMetrics, Comparable<?>>();
output.read(new FileReader(metricsFile));
final RnaSeqMetrics metrics = output.getMetrics().get(0);
Assert.assertEquals(metrics.PF_ALIGNED_BASES, 900);
Assert.assertEquals(metrics.PF_BASES, 950);
Assert.assertEquals(metrics.RIBOSOMAL_BASES.longValue(), 0L);
Assert.assertEquals(metrics.CODING_BASES, 471);
Assert.assertEquals(metrics.UTR_BASES, 125);
Assert.assertEquals(metrics.INTRONIC_BASES, 98);
Assert.assertEquals(metrics.INTERGENIC_BASES, 206);
Assert.assertEquals(metrics.CORRECT_STRAND_READS, 7);
Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 6);
Assert.assertEquals(metrics.IGNORED_READS, 0);
Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 4);
Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2);
Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 3);
Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.666667);
Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.333333);
}
public File getRefFlatFile(String sequence) throws Exception {
// Create a refFlat file with a single gene containing two exons, one of which is overlapped by the
// ribosomal interval.
final String[] refFlatFields = new String[RefFlatColumns.values().length];
refFlatFields[RefFlatColumns.GENE_NAME.ordinal()] = "myGene";
refFlatFields[RefFlatColumns.TRANSCRIPT_NAME.ordinal()] = "myTranscript";
refFlatFields[RefFlatColumns.CHROMOSOME.ordinal()] = sequence;
refFlatFields[RefFlatColumns.STRAND.ordinal()] = "+";
refFlatFields[RefFlatColumns.TX_START.ordinal()] = "49";
refFlatFields[RefFlatColumns.TX_END.ordinal()] = "500";
refFlatFields[RefFlatColumns.CDS_START.ordinal()] = "74";
refFlatFields[RefFlatColumns.CDS_END.ordinal()] = "400";
refFlatFields[RefFlatColumns.EXON_COUNT.ordinal()] = "2";
refFlatFields[RefFlatColumns.EXON_STARTS.ordinal()] = "49,249";
refFlatFields[RefFlatColumns.EXON_ENDS.ordinal()] = "200,500";
final File refFlatFile = File.createTempFile("tmp.", ".refFlat");
refFlatFile.deleteOnExit();
final PrintStream refFlatStream = new PrintStream(refFlatFile);
refFlatStream.println(StringUtil.join("\t", refFlatFields));
refFlatStream.close();
return refFlatFile;
}
}