/*
* The MIT License
*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.fingerprint;
import htsjdk.samtools.BamFileIoUtils;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.FormatUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.SAMReadGroupRecord;
import picard.cmdline.programgroups.Fingerprinting;
import java.io.File;
import java.io.PrintStream;
import java.util.*;
import java.util.concurrent.TimeUnit;
/**
* Program to check that all read groups within the set of BAM files appear to come from the same
* individual.
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
usage = "Checks if all read groups within a set of BAM files appear to come from the same individual",
usageShort = "Checks if all read groups appear to come from the same individual",
programGroup = Fingerprinting.class
)
public class CrosscheckReadGroupFingerprints extends CommandLineProgram {
@Option(shortName= StandardOptionDefinitions.INPUT_SHORT_NAME,
doc="One or more input BAM files (or lists of BAM files) to compare fingerprints for.")
public List<File> INPUT;
@Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, optional=true,
doc="Optional output file to write metrics to. Default is to write to stdout.")
public File OUTPUT;
@Option(shortName="H", doc="The file of haplotype data to use to pick SNPs to fingerprint")
public File HAPLOTYPE_MAP;
@Option(shortName="LOD",
doc="If any two read groups match with a LOD score lower than the threshold the program will exit " +
"with a non-zero code to indicate error. 0 means equal probability the read groups match vs. " +
"come from different individuals, negative numbers mean N logs more likely that the read groups " +
"are from different individuals and positive numbers mean N logs more likely that the read groups " +
"are from the sample individual.")
public double LOD_THRESHOLD = 0;
@Option(doc="Instead of producing the normal comparison of read-groups, roll fingerprints up to the sample level " +
"and print out a sample x sample matrix with LOD scores.")
public boolean CROSSCHECK_SAMPLES = false;
@Option(doc="Instead of producing the normal comparison of read-groups, roll fingerprints up to the library level " +
"and print out a library x library matrix with LOD scores.")
public boolean CROSSCHECK_LIBRARIES = false;
@Option(doc="The number of threads to use to process BAM files and generate Fingerprints.")
public int NUM_THREADS = 1;
@Option(doc="Allow the use of duplicate reads in performing the comparison. Can be useful when duplicate " +
"marking has been overly aggressive and coverage is low.")
public boolean ALLOW_DUPLICATE_READS = false;
@Option(doc="Assumed genotyping error rate that provides a floor on the probability that a genotype comes from" +
" the expected sample.")
public double GENOTYPING_ERROR_RATE = 0.01;
@Option(doc="If true then only read groups that do not relate to each other as expected will have their LODs reported.")
public boolean OUTPUT_ERRORS_ONLY = false;
@Option(doc ="The rate at which a het in a normal sample turns into a hom in the tumor.", optional = true)
public double LOSS_OF_HET_RATE = 0.5;
@Option(doc="Expect all read groups' fingerprints to match, irrespective of their sample names. By default (with this value set to " +
"false), read groups with different sample names are expected to mismatch, and those with the same sample name are expected " +
"to match.")
public boolean EXPECT_ALL_READ_GROUPS_TO_MATCH = false;
@Option(doc="When one or more mismatches between read groups are detected, exit with this value instead of 0.")
public int EXIT_CODE_WHEN_MISMATCH = 1;
private final Log log = Log.getInstance(CrosscheckReadGroupFingerprints.class);
private final FormatUtil formatUtil = new FormatUtil();
// These are public so that other programs can parse status from the crosscheck file
public static final String EXPECTED_MATCH = "EXPECTED MATCH";
public static final String EXPECTED_MISMATCH = "EXPECTED MISMATCH";
public static final String UNEXPECTED_MATCH = "UNEXPECTED MATCH";
public static final String UNEXPECTED_MISMATCH = "UNEXPECTED MISMATCH";
/** Stock main method. */
public static void main(final String[] args) {
new CrosscheckReadGroupFingerprints().instanceMainWithExit(args);
}
@Override protected int doWork() {
// Check inputs
for (final File f : INPUT) IOUtil.assertFileIsReadable(f);
IOUtil.assertFileIsReadable(HAPLOTYPE_MAP);
if (OUTPUT != null) IOUtil.assertFileIsWritable(OUTPUT);
final HaplotypeMap map = new HaplotypeMap(HAPLOTYPE_MAP);
final FingerprintChecker checker = new FingerprintChecker(map);
checker.setAllowDuplicateReads(ALLOW_DUPLICATE_READS);
log.info("Done checking input files, moving onto fingerprinting files.");
List<File> unrolledFiles = IOUtil.unrollFiles(INPUT, BamFileIoUtils.BAM_FILE_EXTENSION, IOUtil.SAM_FILE_EXTENSION);
final Map<SAMReadGroupRecord, Fingerprint> fpMap = checker.fingerprintSamFiles(unrolledFiles, NUM_THREADS, 1, TimeUnit.DAYS);
final List<Fingerprint> fingerprints = new ArrayList<>(fpMap.values());
log.info("Finished generating fingerprints from BAM files, moving on to cross-checking.");
// Setup the output
final PrintStream out;
if (OUTPUT != null) {
out = new PrintStream(IOUtil.openFileForWriting(OUTPUT), true);
}
else {
out = System.out;
}
if (this.CROSSCHECK_SAMPLES) {
crossCheckSamples(fingerprints, out);
return 0;
}
else if (this.CROSSCHECK_LIBRARIES) {
crossCheckLibraries(fpMap, out);
return 0;
}
else {
return crossCheckReadGroups(fpMap, out);
}
}
/**
* Method that combines the fingerprint evidence across all the read groups for the same sample
* and then produces a matrix of LOD scores for comparing every sample with every other sample.
*/
private void crossCheckSamples(final List<Fingerprint> fingerprints, final PrintStream out) {
final SortedMap<String,Fingerprint> sampleFps = FingerprintChecker.mergeFingerprintsBySample(fingerprints);
final SortedSet<String> samples = (SortedSet<String>) sampleFps.keySet();
// Print header row
out.print("\t");
for (final String sample : samples) { out.print(sample); out.print("\t"); }
out.println();
// Print results rows
for (final String sample : samples) {
out.print(sample);
final Fingerprint fp = sampleFps.get(sample);
for (final String otherSample : samples) {
final MatchResults results = FingerprintChecker.calculateMatchResults(fp, sampleFps.get(otherSample), GENOTYPING_ERROR_RATE, LOSS_OF_HET_RATE);
out.print("\t");
out.print(formatUtil.format(results.getLOD()));
}
out.println();
}
}
/**
* Method that combines the fingerprint evidence across all the read groups for the same library
* and then produces a matrix of LOD scores for comparing every library with every other library.
*/
private void crossCheckLibraries(final Map<SAMReadGroupRecord,Fingerprint> fingerprints, final PrintStream out) {
final List<Fingerprint> fixedFps = new ArrayList<>();
for (final SAMReadGroupRecord rg : fingerprints.keySet()) {
final Fingerprint old = fingerprints.get(rg);
final String name = rg.getSample() + "::" + rg.getLibrary();
final Fingerprint newFp = new Fingerprint(name, old.getSource(), old.getInfo());
newFp.putAll(old);
fixedFps.add(newFp);
}
crossCheckSamples(fixedFps, out);
}
/**
* Method that pairwise checks every pair of read groups and reports a LOD score for the two read groups
* coming from the same sample.
*/
private int crossCheckReadGroups(final Map<SAMReadGroupRecord,Fingerprint> fingerprints, final PrintStream out) {
int mismatches = 0;
int unexpectedMatches = 0;
final List<SAMReadGroupRecord> readGroupRecords = new ArrayList<>(fingerprints.keySet());
final List<String> output = new ArrayList<>();
for (int i = 0; i < readGroupRecords.size(); i++) {
final SAMReadGroupRecord lhsRg = readGroupRecords.get(i);
for (int j= i+1; j < readGroupRecords.size(); j++) {
final SAMReadGroupRecord rhsRg = readGroupRecords.get(j);
final boolean expectedToMatch = EXPECT_ALL_READ_GROUPS_TO_MATCH || lhsRg.getSample().equals(rhsRg.getSample());
final MatchResults results = FingerprintChecker.calculateMatchResults(fingerprints.get(lhsRg), fingerprints.get(rhsRg), GENOTYPING_ERROR_RATE, LOSS_OF_HET_RATE);
if (expectedToMatch) {
if (results.getLOD() < LOD_THRESHOLD) {
mismatches++;
output.add(getMatchDetails(UNEXPECTED_MISMATCH, results, lhsRg, rhsRg));
} else {
if (!OUTPUT_ERRORS_ONLY) {
output.add(getMatchDetails(EXPECTED_MATCH, results, lhsRg, rhsRg));
}
}
} else {
if (results.getLOD() > -LOD_THRESHOLD) {
unexpectedMatches++;
output.add(getMatchDetails(UNEXPECTED_MATCH, results, lhsRg, rhsRg));
} else {
if (!OUTPUT_ERRORS_ONLY) {
output.add(getMatchDetails(EXPECTED_MISMATCH, results, lhsRg, rhsRg));
}
}
}
}
}
if (!output.isEmpty()) {
out.println("RESULT\tLOD_SCORE\tLOD_SCORE_TUMOR_NORMAL\tLOD_SCORE_NORMAL_TUMOR\tLEFT_RUN_BARCODE\tLEFT_LANE\tLEFT_MOLECULAR_BARCODE_SEQUENCE\tLEFT_LIBRARY\tLEFT_SAMPLE\t" +
"RIGHT_RUN_BARCODE\tRIGHT_LANE\tRIGHT_MOLECULAR_BARCODE_SEQUENCE\tRIGHT_LIBRARY\tRIGHT_SAMPLE");
out.println(String.join("\n", output));
}
if (mismatches + unexpectedMatches > 0) {
log.info("WARNING: At least two read groups did not relate as expected.");
return EXIT_CODE_WHEN_MISMATCH;
}
else {
log.info("All read groups related as expected.");
return 0;
}
}
/**
* Generates tab delimited string containing details about a possible match between fingerprints on two different SAMReadGroupRecords
* @param matchResult String describing the match type.
* @param results MatchResults object
* @param left left hand side SAMReadGroupRecord
* @param right right hand side SAMReadGroupRecord
* @return tab delimited string containing details about a possible match
*/
private String getMatchDetails(final String matchResult, final MatchResults results, final SAMReadGroupRecord left, final SAMReadGroupRecord right) {
final List<String> elements = new ArrayList<>(4);
elements.add(matchResult);
elements.add(formatUtil.format(results.getLOD()));
elements.add(formatUtil.format(results.getLodTN()));
elements.add(formatUtil.format(results.getLodNT()));
elements.add(getReadGroupDetails(left));
elements.add(getReadGroupDetails(right));
return String.join("\t", elements);
}
/**
* Generates tab delimited string containing details about the passed SAMReadGroupRecord
* @param readGroupRecord record
* @return tab delimited string containing details about the SAMReadGroupRecord
*/
private String getReadGroupDetails(final SAMReadGroupRecord readGroupRecord) {
final List<String> elements = new ArrayList<>(5);
final String[] tmp = readGroupRecord.getPlatformUnit().split("\\."); // Expect to look like: D047KACXX110901.1.ACCAACTG
String runBarcode = "?";
String lane = "?";
String molBarcode = "?";
if ((tmp.length == 3) || (tmp.length == 2)) {
runBarcode = tmp[0];
lane = tmp[1];
molBarcode = (tmp.length == 3) ? tmp[2] : ""; // In older BAMS there may be no molecular barcode sequence
} else {
log.error("Unexpected format " + readGroupRecord.getPlatformUnit() + " for PU attribute");
}
elements.add(runBarcode);
elements.add(lane);
elements.add(molBarcode);
elements.add(readGroupRecord.getLibrary());
elements.add(readGroupRecord.getSample());
return String.join("\t", elements);
}
}