/* * Copyright (c) 2012 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package htsjdk.variant.vcf; import htsjdk.tribble.TribbleException; import htsjdk.tribble.readers.LineIterator; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.LinkedList; import java.util.List; /** * A feature codec for the VCF 4 specification * * <p> * VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a * header line, and then data lines each containing information about a position in the genome. * </p> * <p>One of the main uses of next-generation sequencing is to discover variation amongst large populations * of related samples. Recently the format for storing next-generation read alignments has been * standardised by the SAM/BAM file format specification. This has significantly improved the * interoperability of next-generation tools for alignment, visualisation, and variant calling. * We propose the Variant Call Format (VCF) as a standarised format for storing the most prevalent * types of sequence variation, including SNPs, indels and larger structural variants, together * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for * fast data retrieval of variants from a range of positions on the reference genome. * The format was developed for the 1000 Genomes Project, and has also been adopted by other projects * such as UK10K, dbSNP, or the NHLBI Exome Project. VCFtools is a software suite that implements * various utilities for processing VCF files, including validation, merging and comparing, * and also provides a general Perl and Python API. * The VCF specification and VCFtools are available from http://vcftools.sourceforge.net.</p> * * <p> * See also: @see <a href="http://vcftools.sourceforge.net/specs.html">VCF specification</a><br> * See also: @see <a href="http://www.ncbi.nlm.nih.gov/pubmed/21653522">VCF spec. publication</a> * </p> * * <h2>File format example</h2> * <pre> * ##fileformat=VCFv4.0 * #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 * chr1 109 . A T 0 PASS AC=1 GT:AD:DP:GL:GQ 0/1:610,327:308:-316.30,-95.47,-803.03:99 * chr1 147 . C A 0 PASS AC=1 GT:AD:DP:GL:GQ 0/1:294,49:118:-57.87,-34.96,-338.46:99 * </pre> * * @author Mark DePristo * @since 2010 */ public class VCFCodec extends AbstractVCFCodec { // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; /** * Reads all of the header from the provided iterator, but no reads no further. * @param lineIterator the line reader to take header lines from * @return The parsed header */ @Override public Object readActualHeader(final LineIterator lineIterator) { final List<String> headerStrings = new ArrayList<String>(); String line; boolean foundHeaderVersion = false; while (lineIterator.hasNext()) { line = lineIterator.peek(); lineNo++; if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { final String[] lineFields = line.substring(2).split("="); if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); foundHeaderVersion = true; version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); if ( ! version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) ) throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]); if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 ) throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]); } headerStrings.add(lineIterator.next()); } else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { if (!foundHeaderVersion) { throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); } headerStrings.add(lineIterator.next()); super.parseHeaderFromLines(headerStrings, version); return this.header; } else { throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); } } throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); } /** * parse the filter string, first checking to see if we already have parsed it in a previous attempt * * @param filterString the string to parse * @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF) */ protected List<String> parseFilters(final String filterString) { // null for unfiltered if ( filterString.equals(VCFConstants.UNFILTERED) ) return null; if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) return Collections.emptyList(); if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); if ( filterString.length() == 0 ) generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); // do we have the filter string cached? if ( filterHash.containsKey(filterString) ) return filterHash.get(filterString); // empty set for passes filters final List<String> fFields = new LinkedList<String>(); // otherwise we have to parse and cache the value if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) fFields.add(filterString); else fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); filterHash.put(filterString, Collections.unmodifiableList(fFields)); return fFields; } @Override public boolean canDecode(final String potentialInput) { return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); } }