/*
* The MIT License
*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.fingerprint;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.NotPrimaryAlignmentFilter;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.util.*;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import picard.PicardException;
import java.io.File;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
/**
* Major class that coordinates the activities involved in comparing genetic fingerprint
* data whether the source is from a genotyping platform or derived from sequence data.
*
* @author Tim Fennell
*/
public class FingerprintChecker {
// Defaults for the tunable parameters; each has a corresponding setter below.
public static final double DEFAULT_GENOTYPING_ERROR_RATE = 0.01;
public static final int DEFAULT_MINIMUM_MAPPING_QUALITY = 10;
public static final int DEFAULT_MINIMUM_BASE_QUALITY = 20;
public static final int DEFAULT_MAXIMAL_PL_DIFFERENCE = 30;
// Haplotype map that defines the SNPs / haplotype blocks used for fingerprinting.
private final HaplotypeMap haplotypes;
private int minimumBaseQuality = DEFAULT_MINIMUM_BASE_QUALITY;
private int minimumMappingQuality = DEFAULT_MINIMUM_MAPPING_QUALITY;
// Assumed genotyping error rate used when converting hard genotype calls to probabilities.
private double genotypingErrorRate = DEFAULT_GENOTYPING_ERROR_RATE;
// Cap applied to PL values read from VCFs so one site cannot dominate the likelihoods.
private int maximalPLDifference = DEFAULT_MAXIMAL_PL_DIFFERENCE;
private boolean allowDuplicateReads = false;
// Probability that a genotype underwent loss of heterozygosity (relevant for tumor samples).
private double pLossofHet = 0;
private final Log log = Log.getInstance(FingerprintChecker.class);
/**
 * Creates a fingerprint checker that will work with the set of haplotypes stored in
 * the supplied file.
 *
 * @param haplotypeData file from which the haplotype map is loaded
 */
public FingerprintChecker(final File haplotypeData) {
this.haplotypes = new HaplotypeMap(haplotypeData);
}
/** Creates a fingerprint checker that will work with the set of haplotypes provided. */
public FingerprintChecker(final HaplotypeMap haplotypes) {
this.haplotypes = haplotypes;
}
/**
 * Sets the minimum base quality for bases used when computing a fingerprint from sequence data.
 *
 * @param minimumBaseQuality bases with quality below this threshold are ignored
 */
public void setMinimumBaseQuality(final int minimumBaseQuality) {
this.minimumBaseQuality = minimumBaseQuality;
}
/**
 * Sets the minimum mapping quality for reads used when computing fingerprints from sequence data.
 *
 * @param minimumMappingQuality reads with mapping quality below this threshold are ignored
 */
public void setMinimumMappingQuality(final int minimumMappingQuality) {
this.minimumMappingQuality = minimumMappingQuality;
}
/**
 * Sets the assumed genotyping error rate used when accurate error rates are not available.
 *
 * @param genotypingErrorRate per-genotype error probability used when converting hard calls
 */
public void setGenotypingErrorRate(final double genotypingErrorRate) {
this.genotypingErrorRate = genotypingErrorRate;
}
/**
 * Sets the maximal difference in PL scores considered when reading PLs from a VCF.
 * PL values larger than this are capped so a single site cannot dominate the likelihoods.
 *
 * @param maximalPLDifference cap applied to each PL value read from a VCF
 */
public void setMaximalPLDifference(final int maximalPLDifference) {
    this.maximalPLDifference = maximalPLDifference;
}

/** @deprecated use {@link #setMaximalPLDifference(int)}; retained for backwards compatibility. */
@Deprecated
public void setmaximalPLDifference(final int maximalPLDifference) {
    setMaximalPLDifference(maximalPLDifference);
}
/** Returns the SAMFileHeader of the haplotype map used by this checker. */
public SAMFileHeader getHeader(){
return haplotypes.getHeader();
}
/**
 * Sets whether duplicate reads should be allowed when calling genotypes from SAM files. This is
 * useful when comparing read groups within a SAM file and individual read groups show artifactually
 * high duplication (e.g. a single-ended read group mixed in with paired-end read groups).
 *
 * @param allowDuplicateReads if true, reads flagged as duplicates are still used as fingerprint evidence
 */
public void setAllowDuplicateReads(final boolean allowDuplicateReads) {
this.allowDuplicateReads = allowDuplicateReads;
}
/**
 * Sets the probability that a genotype underwent a loss of heterozygosity (for tumor samples).
 *
 * @param pLossofHet probability of LoH used when comparing tumor and normal fingerprints
 */
public void setpLossofHet(final double pLossofHet) {
this.pLossofHet = pLossofHet;
}
/**
 * Loads genotypes from the supplied file into one or more Fingerprint objects and returns them in a
 * Map of Sample->Fingerprint.
 *
 * @param fingerprintFile - VCF file containing genotypes for one or more samples
 * @param specificSample - null to load genotypes for all samples contained in the file or the name
 *                       of an individual sample to load (and exclude all others).
 * @return a Map of Sample name to Fingerprint
 * @throws IllegalArgumentException if a requested sample is not present in the file
 */
public Map<String, Fingerprint> loadFingerprints(final File fingerprintFile, final String specificSample) {
    final Map<String, Fingerprint> fingerprints = new HashMap<>();
    // try-with-resources: the reader and its iterator were previously leaked on every call
    try (final VCFFileReader reader = new VCFFileReader(fingerprintFile, false);
         final CloseableIterator<VariantContext> iterator = reader.iterator()) {
        SequenceUtil.assertSequenceDictionariesEqual(this.haplotypes.getHeader().getSequenceDictionary(),
                reader.getSequenceDictionary(fingerprintFile));
        Set<String> samples = null;
        if (specificSample != null) {
            samples = new HashSet<>();
            samples.add(specificSample);
        }
        while (iterator.hasNext()) {
            final VariantContext ctx = iterator.next();
            // Lazily initialize the sample set from the first record when no specific sample was requested
            if (samples == null) samples = ctx.getSampleNames();
            if (!isUsableSnp(ctx)) continue;
            final HaplotypeBlock h = this.haplotypes.getHaplotype(ctx.getContig(), ctx.getStart());
            final Snp snp = this.haplotypes.getSnp(ctx.getContig(), ctx.getStart());
            if (h == null) continue;   // site is not part of the haplotype map
            // Check the alleles from the file against the expected alleles for this SNP
            boolean allelesOk = true;
            for (final Allele allele : ctx.getAlleles()) {
                final byte[] bases = allele.getBases();
                if (bases.length > 1 || (bases[0] != snp.getAllele1() && bases[0] != snp.getAllele2())) {
                    allelesOk = false;
                }
            }
            if (!allelesOk) {
                log.warn("Problem with genotype file '" + fingerprintFile.getName() + "': Alleles "
                        + ctx.getAlleles() + " do not match to alleles for SNP " + snp
                        + " with alleles " + snp.getAlleleString());
                continue;
            }
            for (final String sample : samples) {
                // Find or construct the fingerprint for this sample
                Fingerprint fp = fingerprints.get(sample);
                if (fp == null) {
                    fp = new Fingerprint(sample, fingerprintFile, null);
                    fingerprints.put(sample, fp);
                }
                //PLs are preferred over GTs
                //TODO: this code is replicated in various places (ReconstructTriosFromVCF for example). Needs refactoring.
                //TODO: add a way to force using GTs when both are available (why?)
                final Genotype genotype = ctx.getGenotype(sample);
                if (genotype == null) {
                    throw new IllegalArgumentException("Cannot find sample " + sample + " in provided file: " + fingerprintFile);
                }
                if (genotype.hasPL()) {
                    final HaplotypeProbabilitiesFromGenotypeLikelihoods hFp = new HaplotypeProbabilitiesFromGenotypeLikelihoods(h);
                    // Do not modify the PL array returned by htsjdk directly -- it may be cached internally.
                    final int[] pls = genotype.getPL();
                    final int[] newPLs = new int[pls.length];
                    for (int i = 0; i < pls.length; i++) {
                        // Cap each PL so one site cannot dominate the overall likelihood
                        newPLs[i] = Math.min(maximalPLDifference, pls[i]);
                    }
                    hFp.addToLogLikelihoods(snp, ctx.getAlleles(), GenotypeLikelihoods.fromPLs(newPLs).getAsVector());
                    fp.add(hFp);
                } else {
                    if (genotype.isNoCall()) continue;
                    // TODO: when multiple genotypes are available for a Haplotype check that they
                    // TODO: agree. Not urgent since DownloadGenotypes already does this.
                    if (fp.containsKey(h)) continue;
                    // Convert the hard call into genotype probabilities using the assumed error rate
                    final boolean hom = genotype.isHom();
                    final byte allele = StringUtil.toUpperCase(genotype.getAllele(0).getBases()[0]);
                    final double halfError = this.genotypingErrorRate / 2;
                    final double accuracy = 1 - this.genotypingErrorRate;
                    final double[] probs = new double[]{
                            (hom && allele == snp.getAllele1()) ? accuracy : halfError,
                            (!hom) ? accuracy : halfError,
                            (hom && allele == snp.getAllele2()) ? accuracy : halfError
                    };
                    fp.add(new HaplotypeProbabilitiesFromGenotype(snp, h, probs[0], probs[1], probs[2]));
                }
            }
        }
    }
    return fingerprints;
}
/**
 * Quick method to check and see if the variant context represents a usable SNP variant. Unfortunately
 * ctx.isSnp doesn't always work if the genotype(s) are all monomorphic and the alternate allele isn't
 * listed.
 *
 * @param ctx the variant context to examine
 * @return true if the context is an unfiltered, non-indel, non-mixed variant whose alleles are all single bases
 */
public static boolean isUsableSnp(final VariantContext ctx) {
    // Reject filtered records and anything that is not a simple substitution
    if (ctx.isFiltered() || ctx.isIndel() || ctx.isMixed()) {
        return false;
    }
    // Every allele must be exactly one base long to count as a SNP
    for (final Allele allele : ctx.getAlleles()) {
        if (allele.length() != 1) {
            return false;
        }
    }
    return true;
}
/**
 * Takes a set of fingerprints and returns an IntervalList containing all the loci that
 * can be productively examined in sequencing data to compare to one or more of the
 * fingerprints.
 *
 * @param fingerprints the fingerprints whose haplotype SNPs should be genotyped
 * @return a uniqued IntervalList with one single-base interval per SNP
 */
public IntervalList getLociToGenotype(final Collection<Fingerprint> fingerprints) {
    final IntervalList loci = new IntervalList(this.haplotypes.getHeader());
    for (final Fingerprint fingerprint : fingerprints) {
        for (final HaplotypeProbabilities probabilities : fingerprint.values()) {
            // Every SNP in the haplotype block is a potential source of evidence
            for (final Snp snp : probabilities.getHaplotype().getSnps()) {
                loci.add(new Interval(snp.getChrom(), snp.getPos(), snp.getPos(), false, snp.getName()));
            }
        }
    }
    return loci.uniqued();
}
/**
 * Generates a Fingerprint per read group in the supplied SAM file using the loci provided in
 * the interval list.
 *
 * @param samFile the SAM/BAM file to fingerprint
 * @param loci the loci at which to gather fingerprint evidence
 * @return a Map of read group to the Fingerprint assembled from that read group's reads
 * @throws PicardException if a read belongs to an unknown (or absent) read group
 */
public Map<SAMReadGroupRecord, Fingerprint> fingerprintSamFile(final File samFile, final IntervalList loci) {
    final SamReader in = SamReaderFactory.makeDefault()
            .enable(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES)
            .open(samFile);
    try {
        SequenceUtil.assertSequenceDictionariesEqual(this.haplotypes.getHeader().getSequenceDictionary(),
                in.getFileHeader().getSequenceDictionary());
        final SamLocusIterator iterator = new SamLocusIterator(in, loci, in.hasIndex());
        iterator.setEmitUncoveredLoci(true);
        iterator.setMappingQualityScoreCutoff(this.minimumMappingQuality);
        iterator.setQualityScoreCutoff(this.minimumBaseQuality);
        // In some cases it is useful to allow duplicate reads to be used - the most common is in single-end
        // sequence data where the duplicate marking may have been overly aggressive, and there is useful
        // non-redundant data in the reads marked as "duplicates".
        if (this.allowDuplicateReads) {
            final List<SamRecordFilter> filters = new ArrayList<>(1);
            filters.add(new NotPrimaryAlignmentFilter());
            iterator.setSamFilters(filters);
        }
        // Pre-build an empty fingerprint for every read group in the header
        final Map<SAMReadGroupRecord, Fingerprint> fingerprintsByReadGroup = new HashMap<>();
        for (final SAMReadGroupRecord rg : in.getFileHeader().getReadGroups()) {
            final Fingerprint fingerprint = new Fingerprint(rg.getSample(),
                    samFile,
                    rg.getPlatformUnit() != null ? rg.getPlatformUnit() : rg.getId());
            fingerprintsByReadGroup.put(rg, fingerprint);
            for (final HaplotypeBlock h : this.haplotypes.getHaplotypes()) {
                fingerprint.add(new HaplotypeProbabilitiesFromSequence(h));
            }
        }
        // Set of read/template names from which we have already sampled a base and a qual. Since we assume
        // that all evidence for a haplotype is independent we can't sample two or more bases from a single
        // read or read-pair because they would not be independent!
        final Set<String> usedReadNames = new HashSet<>(10000);
        // Now go through the data at each locus and accumulate evidence
        for (final SamLocusIterator.LocusInfo info : iterator) {
            // TODO: Filter out the locus if the allele balance doesn't make sense for either a
            // TODO: 50/50 het or a hom with some errors; in HS data with deep coverage any base
            // TODO: with major strand bias could cause errors
            final HaplotypeBlock haplotypeBlock = this.haplotypes.getHaplotype(info.getSequenceName(), info.getPosition());
            final Snp snp = this.haplotypes.getSnp(info.getSequenceName(), info.getPosition());
            for (final SamLocusIterator.RecordAndOffset rec : info.getRecordAndOffsets()) {
                final SAMReadGroupRecord rg = rec.getRecord().getReadGroup();
                if (rg == null || !fingerprintsByReadGroup.containsKey(rg)) {
                    final PicardException e = new PicardException("Unknown read group: " + rg);
                    log.error(e);
                    throw e;
                }
                final String readName = rec.getRecord().getReadName();
                // Set.add returns false when the template was already sampled
                if (usedReadNames.add(readName)) {
                    final HaplotypeProbabilitiesFromSequence probs =
                            (HaplotypeProbabilitiesFromSequence) fingerprintsByReadGroup.get(rg).get(haplotypeBlock);
                    final byte base = StringUtil.toUpperCase(rec.getReadBase());
                    final byte qual = rec.getBaseQuality();
                    probs.addToProbs(snp, base, qual);
                }
            }
        }
        return fingerprintsByReadGroup;
    } finally {
        // The reader was previously never closed, leaking a file handle per call
        CloserUtil.close(in);
    }
}
/**
 * Generates a per-sample Fingerprint for the contaminant in the supplied SAM file.
 * Data is aggregated by sample, not read-group.
 *
 * @param samFile the SAM/BAM file to examine
 * @param contamination the assumed contamination fraction
 * @param locusMaxReads maximum number of (randomly chosen) reads to use per locus
 * @return a Map of sample name to the contaminant Fingerprint for that sample
 * @throws PicardException if a read belongs to a sample not declared in the header
 */
public Map<String, Fingerprint> identifyContaminant(final File samFile, final double contamination, final int locusMaxReads) {
    final SamReader in = SamReaderFactory.makeDefault().enable(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES).open(samFile);
    try {
        SequenceUtil.assertSequenceDictionariesEqual(this.haplotypes.getHeader().getSequenceDictionary(),
                in.getFileHeader().getSequenceDictionary());
        final SamLocusIterator iterator = new SamLocusIterator(in, haplotypes.getIntervalList(), in.hasIndex());
        iterator.setEmitUncoveredLoci(true);
        iterator.setMappingQualityScoreCutoff(this.minimumMappingQuality);
        iterator.setQualityScoreCutoff(this.minimumBaseQuality);
        // In some cases it is useful to allow duplicate reads to be used - the most common is in single-end
        // sequence data where the duplicate marking may have been overly aggressive, and there is useful
        // non-redundant data in the reads marked as "duplicates".
        if (this.allowDuplicateReads) {
            final List<SamRecordFilter> filters = new ArrayList<>(1);
            filters.add(new NotPrimaryAlignmentFilter());
            iterator.setSamFilters(filters);
        }
        // One fingerprint per sample, regardless of how many read groups carry that sample
        final Map<String, Fingerprint> fingerprintsBySample = new HashMap<>();
        for (final SAMReadGroupRecord rg : in.getFileHeader().getReadGroups()) {
            if (!fingerprintsBySample.containsKey(rg.getSample())) {
                final Fingerprint fingerprint = new Fingerprint(rg.getSample(),
                        samFile,
                        rg.getSample());
                for (final HaplotypeBlock h : this.haplotypes.getHaplotypes()) {
                    fingerprint.add(new HaplotypeProbabilitiesFromContaminatorSequence(h, contamination));
                }
                fingerprintsBySample.put(rg.getSample(), fingerprint);
            }
        }
        // Set of read/template names from which we have already sampled a base and a qual. Since we assume
        // that all evidence for a haplotype is independent we can't sample two or more bases from a single
        // read or read-pair because they would not be independent!
        final Set<String> usedReadNames = new HashSet<>(10000);
        for (final SamLocusIterator.LocusInfo info : iterator) {
            final HaplotypeBlock haplotypeBlock = this.haplotypes.getHaplotype(info.getSequenceName(), info.getPosition());
            final Snp snp = this.haplotypes.getSnp(info.getSequenceName(), info.getPosition());
            // Randomly select at most locusMaxReads reads at this locus (order-preserving)
            final List<SamLocusIterator.RecordAndOffset> recordAndOffsetList = randomSublist(info.getRecordAndOffsets(), locusMaxReads);
            for (final SamLocusIterator.RecordAndOffset rec : recordAndOffsetList) {
                final SAMReadGroupRecord rg = rec.getRecord().getReadGroup();
                if (rg == null || !fingerprintsBySample.containsKey(rg.getSample())) {
                    final PicardException e = new PicardException("Unknown sample: " + (rg != null ? rg.getSample() : "(null readgroup)"));
                    log.error(e);
                    throw e;
                }
                final String readName = rec.getRecord().getReadName();
                // Set.add returns false when the template was already sampled
                if (usedReadNames.add(readName)) {
                    final HaplotypeProbabilitiesFromContaminatorSequence probs =
                            (HaplotypeProbabilitiesFromContaminatorSequence) fingerprintsBySample.get(rg.getSample()).get(haplotypeBlock);
                    final byte base = StringUtil.toUpperCase(rec.getReadBase());
                    final byte qual = rec.getBaseQuality();
                    probs.addToProbs(snp, base, qual);
                }
            }
        }
        return fingerprintsBySample;
    } finally {
        // The reader was previously never closed, leaking a file handle per call
        CloserUtil.close(in);
    }
}
/**
 * A small utility function to choose n random elements (un-shuffled) from a list using
 * selection sampling (Knuth's Algorithm S): each element is kept with probability
 * stillNeeded/remaining, which yields exactly n picks in the original order.
 *
 * @param list A list of elements
 * @param n a number of elements requested from list
 * @return a list of n randomly chosen (but in the original order) elements from list.
 * If the list has n or fewer elements it is returned in its entirety.
 */
protected static <T> List<T> randomSublist(final List<T> list, final int n) {
    int remaining = list.size();
    if (remaining <= n) return list;
    // ThreadLocalRandom avoids allocating a new Random on every call and is contention-free
    final Random rng = ThreadLocalRandom.current();
    int stillNeeded = n;
    final List<T> chosen = new ArrayList<>(n);
    for (final T element : list) {
        if (rng.nextDouble() < stillNeeded / (double) remaining) {
            chosen.add(element);
            stillNeeded--;
        }
        if (stillNeeded == 0) break; // fast out once no more elements are needed
        remaining--;
    }
    return chosen;
}
/**
 * Fingerprints one or more SAM/BAM files at all available loci within the haplotype map, using multiple threads
 * to speed up the processing.
 *
 * @param files the SAM/BAM files to fingerprint
 * @param threads number of worker threads to use
 * @param waitTime maximum time to wait for all files to be processed
 * @param waitTimeUnit unit of waitTime
 * @return a Map of read group to Fingerprint aggregated across all input files
 */
public Map<SAMReadGroupRecord, Fingerprint> fingerprintSamFiles(final Collection<File> files, final int threads,
                                                                final int waitTime, final TimeUnit waitTimeUnit) {
    final AtomicInteger filesRead = new AtomicInteger(0);
    final ExecutorService executor = Executors.newFixedThreadPool(threads);
    final IntervalList intervals = this.haplotypes.getIntervalList();
    final Map<SAMReadGroupRecord, Fingerprint> retval = new ConcurrentHashMap<>();
    for (final File f : files) {
        executor.submit(() -> {
            try {
                retval.putAll(fingerprintSamFile(f, intervals));
                if (filesRead.incrementAndGet() % 100 == 0) {
                    log.info("Processed " + filesRead.get() + " out of " + files.size());
                }
            } catch (final RuntimeException e) {
                // The returned Future is discarded, so log here or the failure vanishes silently
                log.error(e, "Failed to fingerprint " + f.getAbsolutePath());
                throw e;
            }
        });
    }
    executor.shutdown();
    try {
        if (!executor.awaitTermination(waitTime, waitTimeUnit)) {
            log.warn("Executor did not terminate within " + waitTime + " " + waitTimeUnit + "; results may be incomplete.");
        }
    } catch (final InterruptedException ie) {
        // Restore the interrupt flag so callers can observe the interruption
        Thread.currentThread().interrupt();
        log.warn(ie, "Interrupted while waiting for executor to terminate.");
    }
    return retval;
}
/**
 * Takes a collection of fingerprints and, assuming that they are independent, merges the fingerprints
 * by sample and totals up the probabilities.
 *
 * @param inputs the fingerprints to merge
 * @return a sorted map of sample name to the merged fingerprint for that sample
 */
static public SortedMap<String, Fingerprint> mergeFingerprintsBySample(final Collection<Fingerprint> inputs) {
    final SortedMap<String, Fingerprint> merged = new TreeMap<>();
    for (final Fingerprint fingerprint : inputs) {
        final String sample = fingerprint.getSample();
        // Lazily create the per-sample accumulator, then fold this fingerprint into it
        merged.computeIfAbsent(sample, s -> new Fingerprint(s, null, s)).merge(fingerprint);
    }
    return merged;
}
/**
 * Top level method to take a set of one or more SAM files and one or more Genotype files and compare
 * each read group in each SAM file to each set of fingerprint genotypes.
 *
 * @param samFiles the list of SAM files to fingerprint
 * @param genotypeFiles the list of genotype files from which to pull fingerprint genotypes
 * @param specificSample an optional single sample whose genotypes to load from the supplied files
 * @param ignoreReadGroups aggregate data into one fingerprint per file, instead of splitting by RG
 * @return one FingerprintResults per (file, read group) pair, or per file when ignoreReadGroups is set
 * @throws IllegalStateException if no fingerprints could be loaded from the genotype files
 */
public List<FingerprintResults> checkFingerprints(final List<File> samFiles,
                                                  final List<File> genotypeFiles,
                                                  final String specificSample,
                                                  final boolean ignoreReadGroups) {
    // Load the fingerprint genotypes
    final List<Fingerprint> expectedFingerprints = new LinkedList<>();
    for (final File f : genotypeFiles) {
        expectedFingerprints.addAll(loadFingerprints(f, specificSample).values());
    }
    if (expectedFingerprints.isEmpty()) {
        throw new IllegalStateException("Could not find any fingerprints in: " + genotypeFiles);
    }
    final List<FingerprintResults> resultsList = new ArrayList<>();
    final IntervalList intervals = getLociToGenotype(expectedFingerprints);
    // Fingerprint the SAM files and calculate the results
    for (final File f : samFiles) {
        final Map<SAMReadGroupRecord, Fingerprint> fingerprintsByReadGroup = fingerprintSamFile(f, intervals);
        if (ignoreReadGroups) {
            // Merge all read groups' evidence into a single per-file fingerprint
            final Fingerprint combinedFp = new Fingerprint(specificSample, f, null);
            fingerprintsByReadGroup.values().forEach(combinedFp::merge);
            final FingerprintResults results = new FingerprintResults(f, null, specificSample);
            for (final Fingerprint expectedFp : expectedFingerprints) {
                results.addResults(calculateMatchResults(combinedFp, expectedFp, 0, pLossofHet));
            }
            resultsList.add(results);
        } else {
            // entrySet iteration avoids the redundant lookup of keySet() + get()
            for (final Map.Entry<SAMReadGroupRecord, Fingerprint> entry : fingerprintsByReadGroup.entrySet()) {
                final FingerprintResults results = new FingerprintResults(f, entry.getKey().getPlatformUnit(), specificSample);
                for (final Fingerprint expectedFp : expectedFingerprints) {
                    results.addResults(calculateMatchResults(entry.getValue(), expectedFp, 0, pLossofHet));
                }
                resultsList.add(results);
            }
        }
    }
    return resultsList;
}
/**
 * Top level method to take a set of one or more observed genotype (VCF) files and one or more expected genotype (VCF) files and compare
 * one or more sample in the observed genotype file with one or more in the expected file and generate results for each set.
 *
 * @param observedGenotypeFiles The list of genotype files containing observed calls, from which to pull fingerprint genotypes
 * @param expectedGenotypeFiles The list of genotype files containing expected calls, from which to pull fingerprint genotypes
 * @param observedSample an optional single sample whose genotypes to load from the observed genotype file (if null, use all)
 * @param expectedSample an optional single sample whose genotypes to load from the expected genotype file (if null, use all)
 * @return one FingerprintResults per observed (file, sample) pair
 * @throws IllegalStateException if no expected or observed fingerprints could be loaded
 */
public List<FingerprintResults> checkFingerprints(final List<File> observedGenotypeFiles,
                                                  final List<File> expectedGenotypeFiles,
                                                  final String observedSample,
                                                  final String expectedSample) {
    // Load the expected fingerprint genotypes
    final List<Fingerprint> expectedFingerprints = new ArrayList<>();
    for (final File f : expectedGenotypeFiles) {
        expectedFingerprints.addAll(loadFingerprints(f, expectedSample).values());
    }
    if (expectedFingerprints.isEmpty()) {
        throw new IllegalStateException("Could not find any fingerprints in: " + expectedGenotypeFiles);
    }
    final List<FingerprintResults> resultsList = new ArrayList<>();
    for (final File f : observedGenotypeFiles) {
        final Map<String, Fingerprint> observedFingerprintsBySample = loadFingerprints(f, observedSample);
        if (observedFingerprintsBySample.isEmpty()) {
            // Report the specific file at fault, not the whole input list
            throw new IllegalStateException("Found no fingerprints in observed genotypes file: " + f);
        }
        // entrySet iteration avoids the redundant lookup of keySet() + get()
        for (final Map.Entry<String, Fingerprint> entry : observedFingerprintsBySample.entrySet()) {
            final FingerprintResults results = new FingerprintResults(f, null, entry.getKey());
            for (final Fingerprint expectedFp : expectedFingerprints) {
                results.addResults(calculateMatchResults(entry.getValue(), expectedFp, 0, pLossofHet));
            }
            resultsList.add(results);
        }
    }
    return resultsList;
}
/**
 * Compares two fingerprints and calculates a MatchResults object which contains detailed
 * information about the match (or mismatch) between fingerprints including the LOD score
 * for whether or not the two are likely from the same sample.
 *
 * If comparing sequencing data to genotype data then the sequencing data should be passed
 * as the observedFp and the genotype data as the expectedFp in order to get the best output.
 *
 * In the cases where the most likely genotypes from the two fingerprints do not match the
 * lExpectedSample is Max(actualpExpectedSample, minPExpected).
 *
 * @param observedFp the observed fingerprint (e.g. from sequencing data)
 * @param expectedFp the expected fingerprint (e.g. from genotype data)
 * @param minPExpected floor on the per-locus expected-sample probability (0 disables the floor,
 *                     since log10(0) = -Infinity never wins the max below)
 * @param pLoH probability that a genotype underwent loss of heterozygosity (tumor samples)
 * @return a MatchResults summarizing per-locus and aggregate log-likelihoods
 */
public static MatchResults calculateMatchResults(final Fingerprint observedFp, final Fingerprint expectedFp, final double minPExpected, final double pLoH) {
final List<LocusResult> locusResults = new ArrayList<>();
// Aggregate log-likelihoods across all shared haplotype blocks
double llThisSample = 0;
double llOtherSample = 0;
double lodExpectedSampleTumorNormal = 0;
double lodExpectedSampleNormalTumor = 0;
final double lminPExpected = Math.log10(minPExpected);
// Iterate over expected haplotypes; only blocks present in BOTH fingerprints contribute
for (final HaplotypeProbabilities probs2 : expectedFp.values()) {
final HaplotypeBlock haplotypeBlock = probs2.getHaplotype();
final HaplotypeProbabilities probs1 = observedFp.get(haplotypeBlock);
if (probs1 == null) continue;
// Tumor-aware views of each side's probabilities, accounting for possible LoH
final HaplotypeProbabilityOfNormalGivenTumor prob1AssumingDataFromTumor = new HaplotypeProbabilityOfNormalGivenTumor(probs1, pLoH);
final HaplotypeProbabilityOfNormalGivenTumor prob2AssumingDataFromTumor = new HaplotypeProbabilityOfNormalGivenTumor(probs2, pLoH);
// If one is from genotype data we'd like to report the output relative
// to the genotyped SNP instead of against a random SNP from the haplotype
final Snp snp = probs2.getRepresentativeSnp();
final DiploidGenotype externalGenotype = probs2.getMostLikelyGenotype(snp);
final LocusResult lr = new LocusResult(snp,
externalGenotype,
probs1.getMostLikelyGenotype(snp),
probs1.getObsAllele1(),
probs1.getObsAllele2(),
probs1.getLodMostProbableGenotype(),
// expected sample log-likelihood
probs1.shiftedLogEvidenceProbabilityGivenOtherEvidence(probs2),
// random sample log-likelihood
probs1.shiftedLogEvidenceProbability(),
// probs1 is tumor probs2 is normal, correct sample lod
prob1AssumingDataFromTumor.shiftedLogEvidenceProbabilityGivenOtherEvidence(probs2) -
prob1AssumingDataFromTumor.shiftedLogEvidenceProbability(),
// probs1 is normal probs2 is tumor, correct sample lod
probs1.shiftedLogEvidenceProbabilityGivenOtherEvidence(prob2AssumingDataFromTumor) -
probs1.shiftedLogEvidenceProbability());
locusResults.add(lr);
// Only loci with evidence on both sides contribute to the aggregate scores
if (probs1.hasEvidence() && probs2.hasEvidence()) {
final double lRandom = lr.lRandomSample();
//TODO: what's the mathematics behind the lminPexpected?
final double lExpected = Math.max(lminPExpected, lr.lExpectedSample());
llThisSample += lExpected;
llOtherSample += lRandom;
lodExpectedSampleTumorNormal += lr.getLodExpectedSampleTumorNormal();
lodExpectedSampleNormalTumor += lr.getLodExpectedSampleNormalTumor();
}
}
// TODO: prune the set of LocusResults for things that are too close together?
return new MatchResults(expectedFp.getSource(), expectedFp.getSample(), llThisSample, llOtherSample, lodExpectedSampleTumorNormal, lodExpectedSampleNormalTumor, locusResults);
}
/**
 * Compares two fingerprints and calculates a MatchResults object which contains detailed
 * information about the match (or mismatch) between fingerprints including the LOD score
 * for whether or not the two are likely from the same sample.
 *
 * If comparing sequencing data to genotype data then the sequencing data should be passed
 * as the observedFp and the genotype data as the expectedFp in order to get the best output.
 *
 * @param observedFp the observed fingerprint (e.g. from sequencing data)
 * @param expectedFp the expected fingerprint (e.g. from genotype data)
 * @return a MatchResults computed with no expected-probability floor and no LoH assumption
 */
public static MatchResults calculateMatchResults(final Fingerprint observedFp, final Fingerprint expectedFp) {
return calculateMatchResults(observedFp, expectedFp, 0, 0);
}
}