/* * Copyright (c) 2012 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package htsjdk.variant.vcf; import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.tribble.AsciiFeatureCodec; import htsjdk.tribble.Feature; import htsjdk.tribble.NameAwareCodec; import htsjdk.tribble.TribbleException; import htsjdk.tribble.util.ParsingUtils; import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.GenotypeLikelihoods; import htsjdk.variant.variantcontext.LazyGenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.StringTokenizer; import java.util.zip.GZIPInputStream; public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext> implements NameAwareCodec { public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20); protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column // we have to store the list of strings that make up the header until they're needed protected VCFHeader header = null; protected VCFHeaderVersion version = null; // a mapping of the allele protected Map<String, List<Allele>> alleleMap = new HashMap<String, List<Allele>>(3); // for ParsingUtils.split protected String[] GTValueArray = new String[100]; protected String[] genotypeKeyArray = new String[100]; protected String[] infoFieldArray = new String[1000]; protected String[] infoValueArray = new String[1000]; // for performance testing purposes public static boolean validate = true; // a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over // todo: make this thread safe? protected String[] parts = null; protected String[] genotypeParts = null; protected final String[] locParts = new String[6]; // for performance we cache the hashmap of filter encodings for quick lookup protected HashMap<String,List<String>> filterHash = new HashMap<String,List<String>>(); // we store a name to give to each of the variant contexts we emit protected String name = "Unknown"; protected int lineNo = 0; protected Map<String, String> stringCache = new HashMap<String, String>(); protected boolean warnedAboutNoEqualsForNonFlag = false; /** * If true, then we'll magically fix up VCF headers on the fly when we read them in */ protected boolean doOnTheFlyModifications = true; /** * If non-null, we will replace the sample name read from the VCF header with this sample name. This feature works * only for single-sample VCFs. */ protected String remappedSampleName = null; protected AbstractVCFCodec() { super(VariantContext.class); } /** * Creates a LazyParser for a LazyGenotypesContext to use to decode * our genotypes only when necessary. We do this instead of eagarly * decoding the genotypes just to turn around and reencode in the frequent * case where we don't actually want to manipulate the genotypes */ class LazyVCFGenotypesParser implements LazyGenotypesContext.LazyParser { final List<Allele> alleles; final String contig; final int start; LazyVCFGenotypesParser(final List<Allele> alleles, final String contig, final int start) { this.alleles = alleles; this.contig = contig; this.start = start; } @Override public LazyGenotypesContext.LazyData parse(final Object data) { //System.out.printf("Loading genotypes... %s:%d%n", contig, start); return createGenotypeMap((String) data, alleles, contig, start); } } /** * parse the filter string, first checking to see if we already have parsed it in a previous attempt * @param filterString the string to parse * @return a set of the filters applied */ protected abstract List<String> parseFilters(String filterString); /** * create a VCF header from a set of header record lines * * @param headerStrings a list of strings that represent all the ## and # entries * @return a VCFHeader object */ protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, final VCFHeaderVersion version ) { this.version = version; Set<VCFHeaderLine> metaData = new LinkedHashSet<VCFHeaderLine>(); Set<String> sampleNames = new LinkedHashSet<String>(); int contigCounter = 0; // iterate over all the passed in strings for ( String str : headerStrings ) { if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) { String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR); if ( strings.length < VCFHeader.HEADER_FIELDS.values().length ) throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str); int arrayIndex = 0; for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { try { if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex])) throw new TribbleException.InvalidHeader("we were expecting column name '" + field + "' but we saw '" + strings[arrayIndex] + "'"); } catch (IllegalArgumentException e) { throw new TribbleException.InvalidHeader("unknown column name '" + strings[arrayIndex] + "'; it does not match a legal column header name."); } arrayIndex++; } boolean sawFormatTag = false; if ( arrayIndex < strings.length ) { if ( !strings[arrayIndex].equals("FORMAT") ) throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'"); sawFormatTag = true; arrayIndex++; } while ( arrayIndex < strings.length ) sampleNames.add(strings[arrayIndex++]); if ( sawFormatTag && sampleNames.size() == 0 ) throw new TribbleException.InvalidHeader("The FORMAT field was provided but there is no genotype/sample data"); // If we're performing sample name remapping and there is exactly one sample specified in the header, replace // it with the remappedSampleName. Throw an error if there are 0 or multiple samples and remapping was requested // for this file. if ( remappedSampleName != null ) { // We currently only support on-the-fly sample name remapping for single-sample VCFs if ( sampleNames.isEmpty() || sampleNames.size() > 1 ) { throw new TribbleException(String.format("Cannot remap sample name to %s because %s samples are specified in the VCF header, and on-the-fly sample name remapping is only supported for single-sample VCFs", remappedSampleName, sampleNames.isEmpty() ? "no" : "multiple")); } sampleNames.clear(); sampleNames.add(remappedSampleName); } } else { if ( str.startsWith(VCFConstants.INFO_HEADER_START) ) { final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7), version); metaData.add(info); } else if ( str.startsWith(VCFConstants.FILTER_HEADER_START) ) { final VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9), version); metaData.add(filter); } else if ( str.startsWith(VCFConstants.FORMAT_HEADER_START) ) { final VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9), version); metaData.add(format); } else if ( str.startsWith(VCFConstants.CONTIG_HEADER_START) ) { final VCFContigHeaderLine contig = new VCFContigHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), contigCounter++); metaData.add(contig); } else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) { final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description")); metaData.add(alt); } else { int equals = str.indexOf("="); if ( equals != -1 ) metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1))); } } } this.header = new VCFHeader(metaData, sampleNames); if ( doOnTheFlyModifications ) this.header = VCFStandardHeaderLines.repairStandardHeaderLines(this.header); return this.header; } /** * Explicitly set the VCFHeader on this codec. This will overwrite the header read from the file * and the version state stored in this instance; conversely, reading the header from a file will * overwrite whatever is set here. The returned header may not be identical to the header argument * since the header lines may be "repaired" (i.e., rewritten) if doOnTheFlyModifications is set. */ public VCFHeader setVCFHeader(final VCFHeader header, final VCFHeaderVersion version) { this.version = version; if (this.doOnTheFlyModifications) this.header = VCFStandardHeaderLines.repairStandardHeaderLines(header); else this.header = header; return this.header; } /** * the fast decode function * @param line the line of text for the record * @return a feature, (not guaranteed complete) that has the correct start and stop */ public Feature decodeLoc(String line) { return decodeLine(line, false); } /** * decode the line into a feature (VariantContext) * @param line the line * @return a VariantContext */ public VariantContext decode(String line) { return decodeLine(line, true); } private VariantContext decodeLine(final String line, final boolean includeGenotypes) { // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null; // our header cannot be null, we need the genotype sample names and counts if (header == null) throw new TribbleException("VCF Header cannot be null when decoding a record"); if (parts == null) parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)]; final int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true); // if we have don't have a header, or we have a header with no genotyping data check that we // have eight columns. Otherwise check that we have nine (normal columns + genotyping data) if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) || (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) ) throw new TribbleException("Line " + lineNo + ": there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) + " tokens, and saw " + nParts + " )"); return parseVCFLine(parts, includeGenotypes); } /** * parse out the VCF line * * @param parts the parts split up * @return a variant context object */ private VariantContext parseVCFLine(final String[] parts, final boolean includeGenotypes) { VariantContextBuilder builder = new VariantContextBuilder(); builder.source(getName()); // increment the line count // TODO -- because of the way the engine utilizes Tribble, we can parse a line multiple times (especially when // TODO -- the first record is far along the contig) and the line counter can get out of sync lineNo++; // parse out the required fields final String chr = getCachedString(parts[0]); builder.chr(chr); int pos = -1; try { pos = Integer.valueOf(parts[1]); } catch (NumberFormatException e) { generateException(parts[1] + " is not a valid start position in the VCF format"); } builder.start(pos); if ( parts[2].length() == 0 ) generateException("The VCF specification requires a valid ID field"); else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) builder.noID(); else builder.id(parts[2]); final String ref = getCachedString(parts[3].toUpperCase()); final String alts = getCachedString(parts[4]); builder.log10PError(parseQual(parts[5])); final List<String> filters = parseFilters(getCachedString(parts[6])); if ( filters != null ) builder.filters(new HashSet<String>(filters)); final Map<String, Object> attrs = parseInfo(parts[7]); builder.attributes(attrs); if ( attrs.containsKey(VCFConstants.END_KEY) ) { // update stop with the end key if provided try { builder.stop(Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString())); } catch (Exception e) { generateException("the END value in the INFO field is not valid"); } } else { builder.stop(pos + ref.length() - 1); } // get our alleles, filters, and setup an attribute map final List<Allele> alleles = parseAlleles(ref, alts, lineNo); builder.alleles(alleles); // do we have genotyping data if (parts.length > NUM_STANDARD_FIELDS && includeGenotypes) { final LazyGenotypesContext.LazyParser lazyParser = new LazyVCFGenotypesParser(alleles, chr, pos); final int nGenotypes = header.getNGenotypeSamples(); LazyGenotypesContext lazy = new LazyGenotypesContext(lazyParser, parts[8], nGenotypes); // did we resort the sample names? If so, we need to load the genotype data if ( !header.samplesWereAlreadySorted() ) lazy.decode(); builder.genotypesNoValidation(lazy); } VariantContext vc = null; try { vc = builder.make(); } catch (Exception e) { generateException(e.getMessage()); } return vc; } /** * get the name of this codec * @return our set name */ public String getName() { return name; } /** * set the name of this codec * @param name new name */ public void setName(String name) { this.name = name; } /** * Return a cached copy of the supplied string. * * @param str string * @return interned string */ protected String getCachedString(String str) { String internedString = stringCache.get(str); if ( internedString == null ) { internedString = new String(str); stringCache.put(internedString, internedString); } return internedString; } /** * parse out the info fields * @param infoField the fields * @return a mapping of keys to objects */ private Map<String, Object> parseInfo(String infoField) { Map<String, Object> attributes = new HashMap<String, Object>(); if ( infoField.length() == 0 ) generateException("The VCF specification requires a valid (non-zero length) info field"); if ( !infoField.equals(VCFConstants.EMPTY_INFO_FIELD) ) { if ( infoField.indexOf("\t") != -1 || infoField.indexOf(" ") != -1 ) generateException("The VCF specification does not allow for whitespace in the INFO field. Offending field value was \"" + infoField + "\""); int infoFieldSplitSize = ParsingUtils.split(infoField, infoFieldArray, VCFConstants.INFO_FIELD_SEPARATOR_CHAR, false); for (int i = 0; i < infoFieldSplitSize; i++) { String key; Object value; int eqI = infoFieldArray[i].indexOf("="); if ( eqI != -1 ) { key = infoFieldArray[i].substring(0, eqI); String valueString = infoFieldArray[i].substring(eqI+1); // split on the INFO field separator int infoValueSplitSize = ParsingUtils.split(valueString, infoValueArray, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR, false); if ( infoValueSplitSize == 1 ) { value = infoValueArray[0]; final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key); if ( headerLine != null && headerLine.getType() == VCFHeaderLineType.Flag && value.equals("0") ) { // deal with the case where a flag field has =0, such as DB=0, by skipping the add continue; } } else { ArrayList<String> valueList = new ArrayList<String>(infoValueSplitSize); for ( int j = 0; j < infoValueSplitSize; j++ ) valueList.add(infoValueArray[j]); value = valueList; } } else { key = infoFieldArray[i]; final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key); if ( headerLine != null && headerLine.getType() != VCFHeaderLineType.Flag ) { if ( GeneralUtils.DEBUG_MODE_ENABLED && ! warnedAboutNoEqualsForNonFlag ) { System.err.println("Found info key " + key + " without a = value, but the header says the field is of type " + headerLine.getType() + " but this construct is only value for FLAG type fields"); warnedAboutNoEqualsForNonFlag = true; } value = VCFConstants.MISSING_VALUE_v4; } else { value = true; } } // this line ensures that key/value pairs that look like key=; are parsed correctly as MISSING if ( "".equals(value) ) value = VCFConstants.MISSING_VALUE_v4; attributes.put(key, value); } } return attributes; } /** * create a an allele from an index and an array of alleles * @param index the index * @param alleles the alleles * @return an Allele */ protected static Allele oneAllele(String index, List<Allele> alleles) { if ( index.equals(VCFConstants.EMPTY_ALLELE) ) return Allele.NO_CALL; final int i; try { i = Integer.valueOf(index); } catch ( NumberFormatException e ) { throw new TribbleException.InternalCodecException("The following invalid GT allele index was encountered in the file: " + index); } if ( i >= alleles.size() ) throw new TribbleException.InternalCodecException("The allele with index " + index + " is not defined in the REF/ALT columns in the record"); return alleles.get(i); } /** * parse genotype alleles from the genotype string * @param GT GT string * @param alleles list of possible alleles * @param cache cache of alleles for GT * @return the allele list for the GT string */ protected static List<Allele> parseGenotypeAlleles(String GT, List<Allele> alleles, Map<String, List<Allele>> cache) { // cache results [since they are immutable] and return a single object for each genotype List<Allele> GTAlleles = cache.get(GT); if ( GTAlleles == null ) { StringTokenizer st = new StringTokenizer(GT, VCFConstants.PHASING_TOKENS); GTAlleles = new ArrayList<Allele>(st.countTokens()); while ( st.hasMoreTokens() ) { String genotype = st.nextToken(); GTAlleles.add(oneAllele(genotype, alleles)); } cache.put(GT, GTAlleles); } return GTAlleles; } /** * parse out the qual value * @param qualString the quality string * @return return a double */ protected static Double parseQual(String qualString) { // if we're the VCF 4 missing char, return immediately if ( qualString.equals(VCFConstants.MISSING_VALUE_v4)) return VariantContext.NO_LOG10_PERROR; Double val = Double.valueOf(qualString); // check to see if they encoded the missing qual score in VCF 3 style, with either the -1 or -1.0. check for val < 0 to save some CPU cycles if ((val < 0) && (Math.abs(val - VCFConstants.MISSING_QUALITY_v3_DOUBLE) < VCFConstants.VCF_ENCODING_EPSILON)) return VariantContext.NO_LOG10_PERROR; // scale and return the value return val / -10.0; } /** * parse out the alleles * @param ref the reference base * @param alts a string of alternates to break into alleles * @param lineNo the line number for this record * @return a list of alleles, and a pair of the shortest and longest sequence */ protected static List<Allele> parseAlleles(String ref, String alts, int lineNo) { List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic // ref checkAllele(ref, true, lineNo); Allele refAllele = Allele.create(ref, true); alleles.add(refAllele); if ( alts.indexOf(",") == -1 ) // only 1 alternatives, don't call string split parseSingleAltAllele(alleles, alts, lineNo); else for ( String alt : alts.split(",") ) parseSingleAltAllele(alleles, alt, lineNo); return alleles; } /** * check to make sure the allele is an acceptable allele * @param allele the allele to check * @param isRef are we the reference allele? * @param lineNo the line number for this record */ private static void checkAllele(String allele, boolean isRef, int lineNo) { if ( allele == null || allele.length() == 0 ) generateException(generateExceptionTextForBadAlleleBases(""), lineNo); if ( GeneralUtils.DEBUG_MODE_ENABLED && MAX_ALLELE_SIZE_BEFORE_WARNING != -1 && allele.length() > MAX_ALLELE_SIZE_BEFORE_WARNING ) { System.err.println(String.format("Allele detected with length %d exceeding max size %d at approximately line %d, likely resulting in degraded VCF processing performance", allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo)); } if ( isSymbolicAllele(allele) ) { if ( isRef ) { generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo); } } else { // check for VCF3 insertions or deletions if ( (allele.charAt(0) == VCFConstants.DELETION_ALLELE_v3) || (allele.charAt(0) == VCFConstants.INSERTION_ALLELE_v3) ) generateException("Insertions/Deletions are not supported when reading 3.x VCF's. Please" + " convert your file to VCF4 using VCFTools, available at http://vcftools.sourceforge.net/index.html", lineNo); if (!Allele.acceptableAlleleBases(allele)) generateException(generateExceptionTextForBadAlleleBases(allele), lineNo); if ( isRef && allele.equals(VCFConstants.EMPTY_ALLELE) ) generateException("The reference allele cannot be missing", lineNo); } } /** * Generates the exception text for the case where the allele string contains unacceptable bases. * * @param allele non-null allele string * @return non-null exception text string */ private static String generateExceptionTextForBadAlleleBases(final String allele) { if ( allele.length() == 0 ) return "empty alleles are not permitted in VCF records"; if ( allele.contains("[") || allele.contains("]") || allele.contains(":") || allele.contains(".") ) return "VCF support for complex rearrangements with breakends has not yet been implemented"; return "unparsable vcf record with allele " + allele; } /** * return true if this is a symbolic allele (e.g. <SOMETAG>) or * structural variation breakend (with [ or ]), otherwise false * @param allele the allele to check * @return true if the allele is a symbolic allele, otherwise false */ private static boolean isSymbolicAllele(String allele) { return (allele != null && allele.length() > 2 && ((allele.startsWith("<") && allele.endsWith(">")) || (allele.contains("[") || allele.contains("]")))); } /** * parse a single allele, given the allele list * @param alleles the alleles available * @param alt the allele to parse * @param lineNo the line number for this record */ private static void parseSingleAltAllele(List<Allele> alleles, String alt, int lineNo) { checkAllele(alt, false, lineNo); Allele allele = Allele.create(alt, false); if ( ! allele.isNoCall() ) alleles.add(allele); } public final static boolean canDecodeFile(final String potentialInput, final String MAGIC_HEADER_LINE) { try { return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) || isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE) || isVCFStream(new BlockCompressedInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE); } catch ( FileNotFoundException e ) { return false; } catch ( IOException e ) { return false; } } private final static boolean isVCFStream(final InputStream stream, final String MAGIC_HEADER_LINE) { try { byte[] buff = new byte[MAGIC_HEADER_LINE.length()]; int nread = stream.read(buff, 0, MAGIC_HEADER_LINE.length()); boolean eq = Arrays.equals(buff, MAGIC_HEADER_LINE.getBytes()); return eq; // String firstLine = new String(buff); // return firstLine.startsWith(MAGIC_HEADER_LINE); } catch ( IOException e ) { return false; } catch ( RuntimeException e ) { return false; } finally { try { stream.close(); } catch ( IOException e ) {} } } /** * create a genotype map * * @param str the string * @param alleles the list of alleles * @return a mapping of sample name to genotype object */ public LazyGenotypesContext.LazyData createGenotypeMap(final String str, final List<Allele> alleles, final String chr, final int pos) { if (genotypeParts == null) genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS]; int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); if ( nParts != genotypeParts.length ) generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records at " + chr + ":" + pos, lineNo); ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts); // get the format keys int nGTKeys = ParsingUtils.split(genotypeParts[0], genotypeKeyArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); // cycle through the sample names Iterator<String> sampleNameIterator = header.getGenotypeSamples().iterator(); // clear out our allele mapping alleleMap.clear(); // cycle through the genotype strings for (int genotypeOffset = 1; genotypeOffset < nParts; genotypeOffset++) { int GTValueSplitSize = ParsingUtils.split(genotypeParts[genotypeOffset], GTValueArray, VCFConstants.GENOTYPE_FIELD_SEPARATOR_CHAR); final String sampleName = sampleNameIterator.next(); final GenotypeBuilder gb = new GenotypeBuilder(sampleName); // check to see if the value list is longer than the key list, which is a problem if (nGTKeys < GTValueSplitSize) generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]); int genotypeAlleleLocation = -1; if (nGTKeys >= 1) { gb.maxAttributes(nGTKeys - 1); for (int i = 0; i < nGTKeys; i++) { final String gtKey = genotypeKeyArray[i]; boolean missing = i >= GTValueSplitSize; // todo -- all of these on the fly parsing of the missing value should be static constants if (gtKey.equals(VCFConstants.GENOTYPE_KEY)) { genotypeAlleleLocation = i; } else if ( missing ) { // if its truly missing (there no provided value) skip adding it to the attributes } else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) { final List<String> filters = parseFilters(getCachedString(GTValueArray[i])); if ( filters != null ) gb.filters(filters); } else if ( GTValueArray[i].equals(VCFConstants.MISSING_VALUE_v4) ) { // don't add missing values to the map } else { if (gtKey.equals(VCFConstants.GENOTYPE_QUALITY_KEY)) { if ( GTValueArray[i].equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) ) gb.noGQ(); else gb.GQ((int)Math.round(Double.valueOf(GTValueArray[i]))); } else if (gtKey.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) { gb.AD(decodeInts(GTValueArray[i])); } else if (gtKey.equals(VCFConstants.GENOTYPE_PL_KEY)) { gb.PL(decodeInts(GTValueArray[i])); } else if (gtKey.equals(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) { gb.PL(GenotypeLikelihoods.fromGLField(GTValueArray[i]).getAsPLs()); } else if (gtKey.equals(VCFConstants.DEPTH_KEY)) { gb.DP(Integer.valueOf(GTValueArray[i])); } else { gb.attribute(gtKey, GTValueArray[i]); } } } } // check to make sure we found a genotype field if our version is less than 4.1 file if ( ! version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_1) && genotypeAlleleLocation == -1 ) generateException("Unable to find the GT field for the record; the GT field is required before VCF4.1"); if ( genotypeAlleleLocation > 0 ) generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present"); final List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? new ArrayList<Allele>(0) : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap)); gb.alleles(GTalleles); gb.phased(genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1); // add it to the list try { genotypes.add(gb.make()); } catch (TribbleException e) { throw new TribbleException.InternalCodecException(e.getMessage() + ", at position " + chr+":"+pos); } } return new LazyGenotypesContext.LazyData(genotypes, header.getSampleNamesInOrder(), header.getSampleNameToOffset()); } private final String[] INT_DECODE_ARRAY = new String[10000]; private final int[] decodeInts(final String string) { final int nValues = ParsingUtils.split(string, INT_DECODE_ARRAY, ','); final int[] values = new int[nValues]; try { for ( int i = 0; i < nValues; i++ ) values[i] = Integer.valueOf(INT_DECODE_ARRAY[i]); } catch (final NumberFormatException e) { return null; } return values; } /** * Forces all VCFCodecs to not perform any on the fly modifications to the VCF header * of VCF records. Useful primarily for raw comparisons such as when comparing * raw VCF records */ public final void disableOnTheFlyModifications() { doOnTheFlyModifications = false; } /** * Replaces the sample name read from the VCF header with the remappedSampleName. Works * only for single-sample VCFs -- attempting to perform sample name remapping for multi-sample * VCFs will produce an Exception. * * @param remappedSampleName replacement sample name for the sample specified in the VCF header */ public void setRemappedSampleName( final String remappedSampleName ) { this.remappedSampleName = remappedSampleName; } protected void generateException(String message) { throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); } protected static void generateException(String message, int lineNo) { throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); } }