/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package htsjdk.variant.vcf;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.utils.GeneralUtils;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
public class VCFUtils {
public static Set<VCFHeaderLine> smartMergeHeaders(final Collection<VCFHeader> headers, final boolean emitWarnings) throws IllegalStateException {
// We need to maintain the order of the VCFHeaderLines, otherwise they will be scrambled in the returned Set.
// This will cause problems for VCFHeader.getSequenceDictionary and anything else that implicitly relies on the line ordering.
final TreeMap<String, VCFHeaderLine> map = new TreeMap<String, VCFHeaderLine>(); // from KEY.NAME -> line
final HeaderConflictWarner conflictWarner = new HeaderConflictWarner(emitWarnings);
// todo -- needs to remove all version headers from sources and add its own VCF version line
for ( final VCFHeader source : headers ) {
//System.out.printf("Merging in header %s%n", source);
for ( final VCFHeaderLine line : source.getMetaDataInSortedOrder()) {
String key = line.getKey();
if ( line instanceof VCFIDHeaderLine )
key = key + "-" + ((VCFIDHeaderLine)line).getID();
if ( map.containsKey(key) ) {
final VCFHeaderLine other = map.get(key);
if ( line.equals(other) ) {
// continue;
} else if ( ! line.getClass().equals(other.getClass()) ) {
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
} else if ( line instanceof VCFFilterHeaderLine ) {
final String lineName = ((VCFFilterHeaderLine) line).getID();
final String otherName = ((VCFFilterHeaderLine) other).getID();
if ( ! lineName.equals(otherName) )
throw new IllegalStateException("Incompatible header types: " + line + " " + other );
} else if ( line instanceof VCFCompoundHeaderLine ) {
final VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine)line;
final VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine)other;
// if the names are the same, but the values are different, we need to quit
if (! (compLine).equalsExcludingDescription(compOther) ) {
if ( compLine.getType().equals(compOther.getType()) ) {
// The Number entry is an Integer that describes the number of values that can be
// included with the INFO field. For example, if the INFO field contains a single
// number, then this value should be 1. However, if the INFO field describes a pair
// of numbers, then this value should be 2 and so on. If the number of possible
// values varies, is unknown, or is unbounded, then this value should be '.'.
conflictWarner.warn(line, "Promoting header field Number to . due to number differences in header lines: " + line + " " + other);
compOther.setNumberToUnbounded();
} else if ( compLine.getType() == VCFHeaderLineType.Integer && compOther.getType() == VCFHeaderLineType.Float ) {
// promote key to Float
conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther);
map.put(key, compOther);
} else if ( compLine.getType() == VCFHeaderLineType.Float && compOther.getType() == VCFHeaderLineType.Integer ) {
// promote key to Float
conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther);
} else {
throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other );
}
}
if ( ! compLine.getDescription().equals(compOther.getDescription()) )
conflictWarner.warn(line, "Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine);
} else {
// we are not equal, but we're not anything special either
conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + line + " already present header = " + other);
}
} else {
map.put(key, line);
//System.out.printf("Adding header line %s%n", line);
}
}
}
// returning a LinkedHashSet so that ordering will be preserved. Ensures the contig lines do not get scrambled.
return new LinkedHashSet<VCFHeaderLine>(map.values());
}
/**
* Add / replace the contig header lines in the VCFHeader with the in the reference file and master reference dictionary
*
* @param oldHeader the header to update
* @param referenceFile the file path to the reference sequence used to generate this vcf
* @param refDict the SAM formatted reference sequence dictionary
*/
public static VCFHeader withUpdatedContigs(final VCFHeader oldHeader, final File referenceFile, final SAMSequenceDictionary refDict) {
return new VCFHeader(withUpdatedContigsAsLines(oldHeader.getMetaDataInInputOrder(), referenceFile, refDict), oldHeader.getGenotypeSamples());
}
public static Set<VCFHeaderLine> withUpdatedContigsAsLines(final Set<VCFHeaderLine> oldLines, final File referenceFile, final SAMSequenceDictionary refDict) {
return withUpdatedContigsAsLines(oldLines, referenceFile, refDict, false);
}
public static Set<VCFHeaderLine> withUpdatedContigsAsLines(final Set<VCFHeaderLine> oldLines, final File referenceFile, final SAMSequenceDictionary refDict, final boolean referenceNameOnly) {
final Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>(oldLines.size());
for ( final VCFHeaderLine line : oldLines ) {
if ( line instanceof VCFContigHeaderLine )
continue; // skip old contig lines
if ( line.getKey().equals(VCFHeader.REFERENCE_KEY) )
continue; // skip the old reference key
lines.add(line);
}
for ( final VCFHeaderLine contigLine : makeContigHeaderLines(refDict, referenceFile) )
lines.add(contigLine);
final String referenceValue;
if (referenceFile != null) {
if (referenceNameOnly) {
final int extensionStart = referenceFile.getName().lastIndexOf(".");
referenceValue = extensionStart == -1 ? referenceFile.getName() : referenceFile.getName().substring(0, extensionStart);
}
else {
referenceValue = "file://" + referenceFile.getAbsolutePath();
}
lines.add(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, referenceValue));
}
return lines;
}
/**
* Create VCFHeaderLines for each refDict entry, and optionally the assembly if referenceFile != null
* @param refDict reference dictionary
* @param referenceFile for assembly name. May be null
* @return list of vcf contig header lines
*/
public static List<VCFContigHeaderLine> makeContigHeaderLines(final SAMSequenceDictionary refDict,
final File referenceFile) {
final List<VCFContigHeaderLine> lines = new ArrayList<VCFContigHeaderLine>();
final String assembly = referenceFile != null ? getReferenceAssembly(referenceFile.getName()) : null;
for ( final SAMSequenceRecord contig : refDict.getSequences() )
lines.add(makeContigHeaderLine(contig, assembly));
return lines;
}
private static VCFContigHeaderLine makeContigHeaderLine(final SAMSequenceRecord contig, final String assembly) {
final Map<String, String> map = new LinkedHashMap<String, String>(3);
map.put("ID", contig.getSequenceName());
map.put("length", String.valueOf(contig.getSequenceLength()));
if ( assembly != null ) map.put("assembly", assembly);
return new VCFContigHeaderLine(map, contig.getSequenceIndex());
}
private static String getReferenceAssembly(final String refPath) {
// This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot
String assembly = null;
if (refPath.contains("b37") || refPath.contains("v37"))
assembly = "b37";
else if (refPath.contains("b36"))
assembly = "b36";
else if (refPath.contains("hg18"))
assembly = "hg18";
else if (refPath.contains("hg19"))
assembly = "hg19";
return assembly;
}
/** Only displays a warning if warnings are enabled and an identical warning hasn't been already issued */
private static final class HeaderConflictWarner {
boolean emitWarnings;
Set<String> alreadyIssued = new HashSet<String>();
private HeaderConflictWarner( final boolean emitWarnings ) {
this.emitWarnings = emitWarnings;
}
public void warn(final VCFHeaderLine line, final String msg) {
if ( GeneralUtils.DEBUG_MODE_ENABLED && emitWarnings && ! alreadyIssued.contains(line.getKey()) ) {
alreadyIssued.add(line.getKey());
System.err.println(msg);
}
}
}
}