/* * Copyright (c) 2012 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package htsjdk.variant.vcf; import htsjdk.tribble.TribbleException; import htsjdk.variant.utils.GeneralUtils; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; /** * Manages header lines for standard VCF INFO and FORMAT fields * * Provides simple mechanisms for registering standard lines, * looking them up, and adding them to headers * * @author Mark DePristo * @since 6/12 */ public class VCFStandardHeaderLines { /** * Enabling this causes us to repair header lines even if only their descriptions differ */ private final static boolean REPAIR_BAD_DESCRIPTIONS = false; private static Standards<VCFFormatHeaderLine> formatStandards = new Standards<VCFFormatHeaderLine>(); private static Standards<VCFInfoHeaderLine> infoStandards = new Standards<VCFInfoHeaderLine>(); /** * Walks over the VCF header and repairs the standard VCF header lines in it, returning a freshly * allocated VCFHeader with standard VCF header lines repaired as necessary * * @param header * @return */ public static VCFHeader repairStandardHeaderLines(final VCFHeader header) { final Set<VCFHeaderLine> newLines = new LinkedHashSet<VCFHeaderLine>(header.getMetaDataInInputOrder().size()); for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) { if ( line instanceof VCFFormatHeaderLine ) { line = formatStandards.repair((VCFFormatHeaderLine) line); } else if ( line instanceof VCFInfoHeaderLine) { line = infoStandards.repair((VCFInfoHeaderLine) line); } newLines.add(line); } return new VCFHeader(newLines, header.getGenotypeSamples()); } /** * Adds header lines for each of the format fields in IDs to header, returning the set of * IDs without standard descriptions, unless throwErrorForMissing is true, in which * case this situation results in a TribbleException * * @param IDs * @return */ public static Set<String> addStandardFormatLines(final Set<VCFHeaderLine> headerLines, final boolean throwErrorForMissing, final Collection<String> IDs) { return formatStandards.addToHeader(headerLines, IDs, throwErrorForMissing); } /** * @see #addStandardFormatLines(java.util.Set, boolean, java.util.Collection) * * @param headerLines * @param throwErrorForMissing * @param IDs * @return */ public static Set<String> addStandardFormatLines(final Set<VCFHeaderLine> headerLines, final boolean throwErrorForMissing, final String ... IDs) { return addStandardFormatLines(headerLines, throwErrorForMissing, Arrays.asList(IDs)); } /** * Returns the standard format line for ID. If none exists, return null or throw an exception, depending * on throwErrorForMissing * * @param ID * @param throwErrorForMissing * @return */ public static VCFFormatHeaderLine getFormatLine(final String ID, final boolean throwErrorForMissing) { return formatStandards.get(ID, throwErrorForMissing); } /** * Returns the standard format line for ID. If none exists throw an exception * * @param ID * @return */ public static VCFFormatHeaderLine getFormatLine(final String ID) { return formatStandards.get(ID, true); } private static void registerStandard(final VCFFormatHeaderLine line) { formatStandards.add(line); } /** * Adds header lines for each of the info fields in IDs to header, returning the set of * IDs without standard descriptions, unless throwErrorForMissing is true, in which * case this situation results in a TribbleException * * @param IDs * @return */ public static Set<String> addStandardInfoLines(final Set<VCFHeaderLine> headerLines, final boolean throwErrorForMissing, final Collection<String> IDs) { return infoStandards.addToHeader(headerLines, IDs, throwErrorForMissing); } /** * @see #addStandardFormatLines(java.util.Set, boolean, java.util.Collection) * * @param IDs * @return */ public static Set<String> addStandardInfoLines(final Set<VCFHeaderLine> headerLines, final boolean throwErrorForMissing, final String ... IDs) { return addStandardInfoLines(headerLines, throwErrorForMissing, Arrays.asList(IDs)); } /** * Returns the standard info line for ID. If none exists, return null or throw an exception, depending * on throwErrorForMissing * * @param ID * @param throwErrorForMissing * @return */ public static VCFInfoHeaderLine getInfoLine(final String ID, final boolean throwErrorForMissing) { return infoStandards.get(ID, throwErrorForMissing); } /** * Returns the standard info line for ID. If none exists throw an exception * * @param ID * @return */ public static VCFInfoHeaderLine getInfoLine(final String ID) { return getInfoLine(ID, true); } private static void registerStandard(final VCFInfoHeaderLine line) { infoStandards.add(line); } // // VCF header line constants // static { // FORMAT lines registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, 1, VCFHeaderLineType.String, "Genotype-level filter")); // INFO lines registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); registerStandard(new VCFInfoHeaderLine(VCFConstants.MLE_ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed")); registerStandard(new VCFInfoHeaderLine(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed")); registerStandard(new VCFInfoHeaderLine(VCFConstants.DOWNSAMPLED_KEY, 0, VCFHeaderLineType.Flag, "Were any of the samples downsampled?")); registerStandard(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); registerStandard(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); registerStandard(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); registerStandard(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); registerStandard(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); registerStandard(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); } private static class Standards<T extends VCFCompoundHeaderLine> { private final Map<String, T> standards = new HashMap<String, T>(); public T repair(final T line) { final T standard = get(line.getID(), false); if ( standard != null ) { final boolean badCountType = line.getCountType() != standard.getCountType(); final boolean badCount = line.isFixedCount() && ! badCountType && line.getCount() != standard.getCount(); final boolean badType = line.getType() != standard.getType(); final boolean badDesc = ! line.getDescription().equals(standard.getDescription()); final boolean needsRepair = badCountType || badCount || badType || (REPAIR_BAD_DESCRIPTIONS && badDesc); if ( needsRepair ) { if ( GeneralUtils.DEBUG_MODE_ENABLED ) { System.err.println("Repairing standard header line for field " + line.getID() + " because" + (badCountType ? " -- count types disagree; header has " + line.getCountType() + " but standard is " + standard.getCountType() : "") + (badType ? " -- type disagree; header has " + line.getType() + " but standard is " + standard.getType() : "") + (badCount ? " -- counts disagree; header has " + line.getCount() + " but standard is " + standard.getCount() : "") + (badDesc ? " -- descriptions disagree; header has '" + line.getDescription() + "' but standard is '" + standard.getDescription() + "'": "")); } return standard; } else return line; } else return line; } public Set<String> addToHeader(final Set<VCFHeaderLine> headerLines, final Collection<String> IDs, final boolean throwErrorForMissing) { final Set<String> missing = new HashSet<String>(); for ( final String ID : IDs ) { final T line = get(ID, throwErrorForMissing); if ( line == null ) missing.add(ID); else headerLines.add(line); } return missing; } public void add(final T line) { if ( standards.containsKey(line.getID()) ) throw new TribbleException("Attempting to add multiple standard header lines for ID " + line.getID()); standards.put(line.getID(), line); } public T get(final String ID, final boolean throwErrorForMissing) { final T x = standards.get(ID); if ( throwErrorForMissing && x == null ) throw new TribbleException("Couldn't find a standard VCF header line for field " + ID); return x; } } }