/* * The MIT License * * Copyright (c) 2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.vcf.MendelianViolations; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.CollectionUtil; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.Lazy; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.ProgressLogger; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextComparator; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import htsjdk.variant.vcf.VCFFileReader; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.VcfOrBcf; import picard.pedigree.PedFile; import picard.vcf.processor.VariantProcessor; import java.io.File; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Set; import java.util.TreeSet; import static htsjdk.variant.variantcontext.writer.Options.INDEX_ON_THE_FLY; /** * Finds mendelian violations of all types within a VCF. Takes in VCF or BCF and a pedigree file and looks * for high confidence calls where the genotype of the offspring is incompatible with the genotypes of the * parents. Key features: * 1) Checks for regular MVs in diploid regions and invalid transmissions in haploid regions * 2) Outputs metrics about the different kinds of MVs found * 3) Can output a per-trio VCF with violations; INFO field will contain a MV= attribute with the type of violation * * This CLP ignores variants that are: * - Not SNPs * - Filtered * - Multiallelic * - Monomorphic * - Within the SKIP_CHROMS contigs * * @author Tim Fennell */ @CommandLineProgramProperties( usage = "Finds mendelian violations of all types within a VCF. " + "Takes in VCF or BCF and a pedigree file and looks for high confidence calls " + "where the genotype of the offspring is incompatible with the genotypes of the parents. " + "Assumes the existence of format fields AD, DP, GT, GQ, and PL fields. " + "\n" + "Take note that the implementation assumes that reads from the PAR will be mapped to the female chromosome" + "rather than the male. This requires that the PAR in the male chromosome be masked so that the aligner " + "has a single coting to map to. This is normally done for the public releases of the human reference."+ "\n" + "Usage example: java -jar picard.jar FindMendelianViolations I=input.vcf \\\n" + " TRIO=family.ped \\\n" + " OUTPUT=mendelian.txt \\\n" + " MIN_DP=20 \n" + "\n" , usageShort = "Finds mendelian violations of all types within a VCF", programGroup = VcfOrBcf.class ) public class FindMendelianViolations extends CommandLineProgram { @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input VCF or BCF with genotypes.") public File INPUT; @Option(shortName = "PED", doc = "File of Trio information in PED format (with no genotype columns).") public File TRIOS; @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output metrics file.") public File OUTPUT; @Option(shortName = "GQ", doc = "Minimum genotyping quality (or non-ref likelihood) to perform tests.") public int MIN_GQ = 30; @Option(shortName = "DP", doc="Minimum depth in each sample to consider possible mendelian violations.") public int MIN_DP = 0; @Option(shortName = "MINHET", doc = "Minimum allele balance at sites that are heterozygous in the offspring.") public double MIN_HET_FRACTION = 0.3; @Option(optional = true, doc = "If provided, output per-family VCFs of mendelian violations into this directory.") public File VCF_DIR; @Option(doc = "List of chromosome names to skip entirely.") public Set<String> SKIP_CHROMS = CollectionUtil.makeSet("MT", "chrM"); @Option(doc = "List of possible names for male sex chromosome(s)") public Set<String> MALE_CHROMS = CollectionUtil.makeSet("Y", "chrY"); @Option(doc = "List of possible names for female sex chromosome(s)") public Set<String> FEMALE_CHROMS = CollectionUtil.makeSet("X", "chrX"); @Option(doc = "List of chr:start-end for pseudo-autosomal regions on the female sex chromosome. Defaults to HG19/b37 & HG38 coordinates.") public Set<String> PSEUDO_AUTOSOMAL_REGIONS = CollectionUtil.makeSet("X:10001-2649520", "X:59034050-59373566", "chrX:10000-2781479", "chrX:155701382-156030895"); @Option(doc = "The number of threads that will be used to collect the metrics. ") public int THREAD_COUNT = 1; @Option(doc = "If true then fields need to be delimited by a single tab. If false the delimiter is one or more whitespace characters." + " Note that tab mode does not strictly follow the PED spec") public boolean TAB_MODE = false; private final Log LOG = Log.getInstance(FindMendelianViolations.class); private final ProgressLogger progressLogger = new ProgressLogger(LOG, 10000, "variants analyzed"); // The following two members are "lazy" since they need to wait until the commandline program populates // the files from the arguments before they can be realized (while keeping them final) private final Lazy<VCFHeader> inputHeader = new Lazy<>( () -> { try (final VCFFileReader in = new VCFFileReader(INPUT)) { return in.getFileHeader(); } }); private final Lazy<PedFile> pedFile = new Lazy<>(()-> PedFile.fromFile(TRIOS, TAB_MODE)); private final Lazy<Set<Interval>> parIntervals = new Lazy<>(() -> Collections.unmodifiableSet(parseIntervalLists(PSEUDO_AUTOSOMAL_REGIONS))); private Set<Interval> parseIntervalLists(final Set<String> intervalLists){ // Parse the PARs final Set<Interval> intervals = new HashSet<>(intervalLists.size()); for (final String par : intervalLists) { final String[] splits1 = par.split(":"); final String[] splits2 = splits1[1].split("-"); intervals.add(new Interval(splits1[0], Integer.parseInt(splits2[0]), Integer.parseInt(splits2[1]))); } return intervals; } /** * Validates that the sex chromosomes don't overlap and parses the pseudo-autosomal regions into usable * objects to ensure their parsability * * @return */ @Override protected String[] customCommandLineValidation() { final List<String> errors = new ArrayList<>(); // Check that the sex chromosomes are not overlapping final Set<String> sexChroms = new HashSet<>(); sexChroms.addAll(FEMALE_CHROMS); sexChroms.retainAll(MALE_CHROMS); if (!sexChroms.isEmpty()) errors.add("The following chromosomes were listed as both male and female sex chromosomes: " + sexChroms); if (errors.isEmpty()) return null; else return errors.toArray(new String[errors.size()]); } @Override protected int doWork() { /////////////////////////////////////////////////////////////////////// // Test and then load up the inputs /////////////////////////////////////////////////////////////////////// final boolean outputVcfs = VCF_DIR != null; IOUtil.assertFileIsReadable(INPUT); IOUtil.assertFileIsReadable(TRIOS); IOUtil.assertFileIsWritable(OUTPUT); if (outputVcfs) IOUtil.assertDirectoryIsWritable(VCF_DIR); LOG.info("Loading and filtering trios."); final MendelianViolationDetector.Result result = VariantProcessor.Builder .generatingAccumulatorsBy(this::buildDetector) .withInput(INPUT) .combiningResultsBy(MendelianViolationDetector.Result::merge) .multithreadingBy(THREAD_COUNT) .build() .process(); // Write out the metrics! final MetricsFile<MendelianViolationMetrics, ?> metricsFile = getMetricsFile(); for (final MendelianViolationMetrics m : result.metrics()) { m.calculateDerivedFields(); metricsFile.addMetric(m); } metricsFile.write(OUTPUT); writeAllViolations(result); return 0; } private void writeAllViolations(final MendelianViolationDetector.Result result) { if (VCF_DIR != null) { LOG.info(String.format("Writing family violation VCFs to %s/", VCF_DIR.getAbsolutePath())); final VariantContextComparator vcComparator = new VariantContextComparator(inputHeader.get().getContigLines()); final Set<VCFHeaderLine> headerLines = new LinkedHashSet<>(inputHeader.get().getMetaDataInInputOrder()); headerLines.add(new VCFInfoHeaderLine(MendelianViolationDetector.MENDELIAN_VIOLATION_KEY, 1, VCFHeaderLineType.String, "Type of mendelian violation.")); headerLines.add(new VCFInfoHeaderLine(MendelianViolationDetector.ORIGINAL_AC, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Original AC")); headerLines.add(new VCFInfoHeaderLine(MendelianViolationDetector.ORIGINAL_AF, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Original AF")); headerLines.add(new VCFInfoHeaderLine(MendelianViolationDetector.ORIGINAL_AN, 1, VCFHeaderLineType.Integer, "Original AN")); for (final PedFile.PedTrio trio : pedFile.get().values()) { final File outputFile = new File(VCF_DIR, IOUtil.makeFileNameSafe(trio.getFamilyId() + ".vcf")); LOG.info(String.format("Writing %s violation VCF to %s", trio.getFamilyId(), outputFile.getAbsolutePath())); final VariantContextWriter out = new VariantContextWriterBuilder() .setOutputFile(outputFile) .unsetOption(INDEX_ON_THE_FLY) .build(); final VCFHeader newHeader = new VCFHeader(headerLines, CollectionUtil.makeList(trio.getMaternalId(), trio.getPaternalId(), trio.getIndividualId())); final TreeSet<VariantContext> orderedViolations = new TreeSet<>(vcComparator); orderedViolations.addAll(result.violations().get(trio.getFamilyId())); out.writeHeader(newHeader); orderedViolations.forEach(out::add); out.close(); } } } private MendelianViolationDetector buildDetector() { return new MendelianViolationDetector( ImmutableSet.copyOf(SKIP_CHROMS), ImmutableSet.copyOf(MALE_CHROMS), ImmutableSet.copyOf(FEMALE_CHROMS), MIN_HET_FRACTION, MIN_GQ, MIN_DP, generateTrioMetricsBase(), ImmutableList.copyOf(parIntervals.get()), progressLogger ); } /** Generates a {@link MendelianViolationMetrics} objects for each trio. */ private List<MendelianViolationMetrics> generateTrioMetricsBase() { final List<MendelianViolationMetrics> metrics = new ArrayList<>(); for (final PedFile.PedTrio trio : pedFile.get().values()) { final MendelianViolationMetrics m = new MendelianViolationMetrics(); m.MOTHER = trio.getMaternalId(); m.FATHER = trio.getPaternalId(); m.OFFSPRING = trio.getIndividualId(); m.FAMILY_ID = trio.getFamilyId(); m.OFFSPRING_SEX = trio.getSex(); metrics.add(m); } /////////////////////////////////////////////////////////////////////// // Filter out trios where one or more of the samples is not in the VCF /////////////////////////////////////////////////////////////////////// final Set<String> allSamples = new HashSet<>(inputHeader.get().getSampleNamesInOrder()); final Iterator<MendelianViolationMetrics> trioIterator = metrics.iterator(); while (trioIterator.hasNext()) { final MendelianViolationMetrics m = trioIterator.next(); final Set<String> trio = CollectionUtil.makeSet(m.MOTHER, m.FATHER, m.OFFSPRING); trio.removeAll(allSamples); if (!trio.isEmpty()) { LOG.warn("Removing trio due to the following missing samples in VCF: " + trio); trioIterator.remove(); } } return metrics; } }