/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package htsjdk.variant.vcf;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.util.ParsingUtils;
import htsjdk.variant.utils.GeneralUtils;
import htsjdk.variant.variantcontext.VariantContextComparator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
/**
* Class VCFHeader: a class representing the VCF header.
* <p>
* NOTE: This class allows duplicate entries in the metadata and stores header lines in
* lots of places. The original author noted that this should be cleaned up at some point
* in the future (jgentry - 5/2013).
*
* @author aaron
*/
public class VCFHeader {

    /** The mandatory fixed columns of a VCF record, in the order {@link #getHeaderFields()} returns them. */
    public enum HEADER_FIELDS {
        CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
    }

    // every header line held by this header, in insertion order; version lines are stripped by
    // loadVCFVersion() and a VCF4_1 version line is re-synthesized by makeGetMetaDataSet()
    private final Set<VCFHeaderLine> mMetaData = new LinkedHashSet<VCFHeaderLine>();

    // lookup structures derived from mMetaData by loadMetaDataMaps()
    private final Map<String, VCFInfoHeaderLine> mInfoMetaData = new LinkedHashMap<String, VCFInfoHeaderLine>();
    private final Map<String, VCFFormatHeaderLine> mFormatMetaData = new LinkedHashMap<String, VCFFormatHeaderLine>();
    private final Map<String, VCFFilterHeaderLine> mFilterMetaData = new LinkedHashMap<String, VCFFilterHeaderLine>();
    private final Map<String, VCFHeaderLine> mOtherMetaData = new LinkedHashMap<String, VCFHeaderLine>();
    private final List<VCFContigHeaderLine> contigMetaData = new ArrayList<VCFContigHeaderLine>();

    // the genotype sample (column) names, in the order they were supplied to the constructor
    private final List<String> mGenotypeSampleNames = new ArrayList<String>();

    // the character string that indicates meta data
    public static final String METADATA_INDICATOR = "##";

    // the header string indicator
    public static final String HEADER_INDICATOR = "#";

    public static final String SOURCE_KEY = "source";
    public static final String REFERENCE_KEY = "reference";
    public static final String CONTIG_KEY = "contig";
    public static final String INTERVALS_KEY = "intervals";
    public static final String EXCLUDE_INTERVALS_KEY = "excludeIntervals";
    public static final String INTERVAL_MERGING_KEY = "interval_merging";
    public static final String INTERVAL_SET_RULE_KEY = "interval_set_rule";
    public static final String INTERVAL_PADDING_KEY = "interval_padding";

    // were the input samples sorted originally (or are we sorting them)?
    private boolean samplesWereAlreadySorted = true;

    // caches for efficient conversion of VCF -> VariantContext; both stay null unless the
    // header was built through a constructor that takes sample names
    private ArrayList<String> sampleNamesInOrder = null;
    private HashMap<String, Integer> sampleNameToOffset = null;

    private boolean writeEngineHeaders = true;
    private boolean writeCommandLine = true;

    /**
     * Create an empty VCF header with no header lines and no samples
     */
    public VCFHeader() {
        this(Collections.<VCFHeaderLine>emptySet(), Collections.<String>emptySet());
    }

    /**
     * Create a VCF header from a set of meta data lines, without any sample names.
     * <p>
     * Headers built with this constructor never populate the sample caches, so
     * {@link #getSampleNamesInOrder()} and {@link #getSampleNameToOffset()} return null.
     *
     * @param metaData the meta data associated with this header
     */
    public VCFHeader(final Set<VCFHeaderLine> metaData) {
        mMetaData.addAll(metaData);
        loadVCFVersion();
        loadMetaDataMaps();
    }

    /**
     * Creates a copy of the given VCFHeader from its meta data lines and sample names.
     * <p>
     * The collections are copied, but the header line objects themselves are shared with
     * {@code toCopy}. The write-engine-headers and write-command-line flags are not copied;
     * the new header starts with their default value (true).
     *
     * @param toCopy the header to copy
     */
    public VCFHeader(final VCFHeader toCopy) {
        this(toCopy.mMetaData, toCopy.mGenotypeSampleNames);
    }

    /**
     * Create a VCF header, given a set of meta data lines and a set of sample names.
     *
     * @param metaData            the meta data associated with this header
     * @param genotypeSampleNames the sample names, taken in the set's iteration order
     */
    public VCFHeader(final Set<VCFHeaderLine> metaData, final Set<String> genotypeSampleNames) {
        this(metaData, new ArrayList<String>(genotypeSampleNames));
    }

    /**
     * Create a VCF header, given a set of meta data lines and an ordered list of sample names.
     *
     * @param metaData            the meta data associated with this header
     * @param genotypeSampleNames the sample names, in column order
     * @throws TribbleException.InvalidHeader if the same sample name occurs more than once
     */
    public VCFHeader(final Set<VCFHeaderLine> metaData, final List<String> genotypeSampleNames) {
        this(metaData);

        if (genotypeSampleNames.size() != new HashSet<String>(genotypeSampleNames).size()) {
            throw new TribbleException.InvalidHeader("BUG: VCF header has duplicate sample names");
        }

        mGenotypeSampleNames.addAll(genotypeSampleNames);
        samplesWereAlreadySorted = ParsingUtils.isSorted(genotypeSampleNames);
        buildVCFReaderMaps(genotypeSampleNames);
    }

    /**
     * Tell this VCF header to use pre-calculated sample name ordering and the
     * sample name -> offset map. This assumes that all VariantContext created
     * using this header (i.e., read by the VCFCodec) will have genotypes
     * occurring in the same order.
     * <p>
     * The offset map records each name's position in the supplied iteration order, while
     * {@code sampleNamesInOrder} is sorted into natural String order before this method returns.
     *
     * @param genotypeSampleNamesInAppearenceOrder genotype sample names, must iterate in order of appearance
     */
    private void buildVCFReaderMaps(final Collection<String> genotypeSampleNamesInAppearenceOrder) {
        sampleNamesInOrder = new ArrayList<String>(genotypeSampleNamesInAppearenceOrder.size());
        sampleNameToOffset = new HashMap<String, Integer>(genotypeSampleNamesInAppearenceOrder.size());

        int i = 0;
        for (final String name : genotypeSampleNamesInAppearenceOrder) {
            sampleNamesInOrder.add(name);
            sampleNameToOffset.put(name, i++);
        }
        Collections.sort(sampleNamesInOrder);
    }

    /**
     * Adds a header line to the header metadata. This is essentially a Set.add call, which means that
     * equals() and hashCode() are used to determine whether the line is added: if an equal line is
     * already present, the existing line is kept and the set is left unchanged. The lookup maps are
     * refreshed afterwards in either case.
     *
     * @param headerLine the line to add
     */
    public void addMetaDataLine(final VCFHeaderLine headerLine) {
        mMetaData.add(headerLine);
        loadMetaDataMaps();
    }

    /**
     * @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present.
     *         The returned list is an unmodifiable view of this header's contig list.
     */
    public List<VCFContigHeaderLine> getContigLines() {
        return Collections.unmodifiableList(contigMetaData);
    }

    /**
     * Returns the contigs in this VCF file as a SAMSequenceDictionary. Returns null if contig lines are
     * not present in the header. The original documentation states that a SAMException is thrown if one
     * or more contig lines do not have length information; that check is not performed in this class
     * (presumably in VCFContigHeaderLine.getSAMSequenceRecord() -- verify there).
     *
     * @return the sequence dictionary built from the contig lines, or null if there are none
     */
    public SAMSequenceDictionary getSequenceDictionary() {
        final List<VCFContigHeaderLine> contigHeaderLines = this.getContigLines();
        if (contigHeaderLines.isEmpty()) {
            return null;
        }

        final List<SAMSequenceRecord> sequenceRecords = new ArrayList<SAMSequenceRecord>(contigHeaderLines.size());
        for (final VCFContigHeaderLine contigHeaderLine : contigHeaderLines) {
            sequenceRecords.add(contigHeaderLine.getSAMSequenceRecord());
        }
        return new SAMSequenceDictionary(sequenceRecords);
    }

    /**
     * Completely replaces the contig records in this header with those in the given SAMSequenceDictionary.
     *
     * @param dictionary the dictionary whose sequences become the new contig lines; must not be null
     */
    public void setSequenceDictionary(final SAMSequenceDictionary dictionary) {
        // Build the replacement lines before touching any state, so that a failure here
        // (e.g. a null dictionary) leaves the existing contig records intact.
        final List<VCFContigHeaderLine> newContigLines = new ArrayList<VCFContigHeaderLine>();
        for (final SAMSequenceRecord record : dictionary.getSequences()) {
            newContigLines.add(new VCFContigHeaderLine(record, record.getAssembly()));
        }

        // The old contig lines live both in the master set and in the contig list; drop them from both.
        final List<VCFHeaderLine> toRemove = new ArrayList<VCFHeaderLine>();
        for (final VCFHeaderLine line : mMetaData) {
            if (line instanceof VCFContigHeaderLine) {
                toRemove.add(line);
            }
        }
        mMetaData.removeAll(toRemove);
        contigMetaData.clear();

        contigMetaData.addAll(newContigLines);
        mMetaData.addAll(newContigLines);
    }

    /**
     * @return a new VariantContextComparator built from this header's contig lines
     */
    public VariantContextComparator getVCFRecordComparator() {
        return new VariantContextComparator(this.getContigLines());
    }

    /**
     * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present
     */
    public List<VCFFilterHeaderLine> getFilterLines() {
        final List<VCFFilterHeaderLine> filters = new ArrayList<VCFFilterHeaderLine>();
        for (final VCFHeaderLine line : mMetaData) {
            if (line instanceof VCFFilterHeaderLine) {
                filters.add((VCFFilterHeaderLine) line);
            }
        }
        return filters;
    }

    /**
     * @return all of the header lines that are instances of VCFIDHeaderLine, in the order they are held
     *         by this header, or an empty list if none were present
     */
    public List<VCFIDHeaderLine> getIDHeaderLines() {
        final List<VCFIDHeaderLine> idLines = new ArrayList<VCFIDHeaderLine>();
        for (final VCFHeaderLine line : mMetaData) {
            if (line instanceof VCFIDHeaderLine) {
                idLines.add((VCFIDHeaderLine) line);
            }
        }
        return idLines;
    }

    /**
     * Removes from the metadata every header line whose key is recognized as a VCF format (version)
     * string by {@code VCFHeaderVersion.isFormatString}. No validation of the version is performed
     * here; a VCF4_1 version line is put back in front of the other lines whenever the metadata is
     * retrieved through {@link #getMetaDataInInputOrder()} or {@link #getMetaDataInSortedOrder()}.
     */
    public void loadVCFVersion() {
        final List<VCFHeaderLine> toRemove = new ArrayList<VCFHeaderLine>();
        for (final VCFHeaderLine line : mMetaData) {
            if (VCFHeaderVersion.isFormatString(line.getKey())) {
                toRemove.add(line);
            }
        }
        // remove old header lines for now,
        mMetaData.removeAll(toRemove);
    }

    /**
     * Load the info/format/filter/contig/other lookup structures from the master metadata set
     * (these are used for quick lookup by key name).
     * <p>
     * If a GL format line is present without a PL format line, a PL format line is added to the
     * header, which triggers another pass through this method.
     */
    private void loadMetaDataMaps() {
        // This method runs again after every addMetaDataLine() call. The keyed maps tolerate that
        // because re-inserting the same lines is idempotent, but the contig list is append-only
        // and would otherwise collect one extra copy of every contig line per call.
        contigMetaData.clear();

        for (final VCFHeaderLine line : mMetaData) {
            if (line instanceof VCFInfoHeaderLine) {
                final VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine) line;
                addMetaDataMapBinding(mInfoMetaData, infoLine);
            } else if (line instanceof VCFFormatHeaderLine) {
                final VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine) line;
                addMetaDataMapBinding(mFormatMetaData, formatLine);
            } else if (line instanceof VCFFilterHeaderLine) {
                // unlike INFO/FORMAT, a later FILTER line with the same ID overwrites the earlier one
                final VCFFilterHeaderLine filterLine = (VCFFilterHeaderLine) line;
                mFilterMetaData.put(filterLine.getID(), filterLine);
            } else if (line instanceof VCFContigHeaderLine) {
                contigMetaData.add((VCFContigHeaderLine) line);
            } else {
                mOtherMetaData.put(line.getKey(), line);
            }
        }

        if (hasFormatLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && !hasFormatLine(VCFConstants.GENOTYPE_PL_KEY)) {
            if (GeneralUtils.DEBUG_MODE_ENABLED) {
                System.err.println("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no "
                        + VCFConstants.GENOTYPE_PL_KEY + " field.  We now only manage PL fields internally"
                        + " automatically adding a corresponding PL field to your VCF header");
            }
            // addMetaDataLine() re-enters this method; on that pass the PL line exists, so recursion stops
            addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"));
        }
    }

    /**
     * Add line to map under its ID unless the map already has an entry for that ID, in which case
     * the existing entry is kept and (in debug mode only) a warning is printed to standard error.
     *
     * @param map  the lookup map to add to
     * @param line the compound header line to bind under its ID
     * @param <T>  the concrete compound header line type stored in the map
     */
    private <T extends VCFCompoundHeaderLine> void addMetaDataMapBinding(final Map<String, T> map, final T line) {
        final String key = line.getID();
        if (map.containsKey(key)) {
            if (GeneralUtils.DEBUG_MODE_ENABLED) {
                System.err.println("Found duplicate VCF header lines for " + key + "; keeping the first only");
            }
        } else {
            map.put(key, line);
        }
    }

    /**
     * get the header fields in order they're presented in the input file (which is now required to be
     * the order presented in the spec).
     *
     * @return a set of the header fields, in order
     */
    public Set<HEADER_FIELDS> getHeaderFields() {
        return new LinkedHashSet<HEADER_FIELDS>(Arrays.asList(HEADER_FIELDS.values()));
    }

    /**
     * get the meta data associated with this header, in the order the lines were added, preceded by
     * a VCF4_1 version line
     *
     * @return an unmodifiable set of the meta data
     */
    public Set<VCFHeaderLine> getMetaDataInInputOrder() {
        return makeGetMetaDataSet(mMetaData);
    }

    /**
     * get the meta data associated with this header, sorted by the natural ordering of the header
     * lines (via a TreeSet), preceded by a VCF4_1 version line
     *
     * @return an unmodifiable set of the meta data
     */
    public Set<VCFHeaderLine> getMetaDataInSortedOrder() {
        return makeGetMetaDataSet(new TreeSet<VCFHeaderLine>(mMetaData));
    }

    /**
     * Copies the given lines, in their iteration order, into a new set that starts with a VCF4_1
     * version line.
     *
     * @param headerLinesInSomeOrder the lines to copy
     * @return an unmodifiable set holding the version line followed by the given lines
     */
    private static Set<VCFHeaderLine> makeGetMetaDataSet(final Set<VCFHeaderLine> headerLinesInSomeOrder) {
        final Set<VCFHeaderLine> lines = new LinkedHashSet<VCFHeaderLine>();
        lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_1.getFormatString(), VCFHeaderVersion.VCF4_1.getVersionString()));
        lines.addAll(headerLinesInSomeOrder);
        return Collections.unmodifiableSet(lines);
    }

    /**
     * Get the first VCFHeaderLine (in insertion order) whose key equals key.
     *
     * @param key the header line key to look for
     * @return the matching line, or null if no such line exists
     */
    public VCFHeaderLine getMetaDataLine(final String key) {
        for (final VCFHeaderLine line : mMetaData) {
            if (line.getKey().equals(key)) {
                return line;
            }
        }
        return null;
    }

    /**
     * get the genotyping sample names
     *
     * @return a list of the genotype column names, which may be empty if hasGenotypingData() returns false.
     *         This is the header's own list, not a copy.
     */
    public List<String> getGenotypeSamples() {
        return mGenotypeSampleNames;
    }

    /** @return the number of genotype sample names held by this header */
    public int getNGenotypeSamples() {
        return mGenotypeSampleNames.size();
    }

    /**
     * do we have genotyping data?
     *
     * @return true if we have genotyping columns, false otherwise
     */
    public boolean hasGenotypingData() {
        return getNGenotypeSamples() > 0;
    }

    /**
     * were the input samples sorted originally?
     *
     * @return true if the input samples were sorted originally, false otherwise
     */
    public boolean samplesWereAlreadySorted() {
        return samplesWereAlreadySorted;
    }

    /**
     * @return the column count: the number of mandatory header fields, plus, when there are samples,
     *         one column per sample and one extra column (presumably the FORMAT column)
     */
    public int getColumnCount() {
        return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
    }

    /**
     * Returns the INFO HeaderLines in their original ordering
     */
    public Collection<VCFInfoHeaderLine> getInfoHeaderLines() {
        return mInfoMetaData.values();
    }

    /**
     * Returns the FORMAT HeaderLines in their original ordering
     */
    public Collection<VCFFormatHeaderLine> getFormatHeaderLines() {
        return mFormatMetaData.values();
    }

    /**
     * @param id the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFInfoHeaderLine getInfoHeaderLine(final String id) {
        return mInfoMetaData.get(id);
    }

    /**
     * @param id the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFFormatHeaderLine getFormatHeaderLine(final String id) {
        return mFormatMetaData.get(id);
    }

    /**
     * @param id the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFFilterHeaderLine getFilterHeaderLine(final String id) {
        return mFilterMetaData.get(id);
    }

    /** @return true if an INFO line with the given id is present */
    public boolean hasInfoLine(final String id) {
        return getInfoHeaderLine(id) != null;
    }

    /** @return true if a FORMAT line with the given id is present */
    public boolean hasFormatLine(final String id) {
        return getFormatHeaderLine(id) != null;
    }

    /** @return true if a FILTER line with the given id is present */
    public boolean hasFilterLine(final String id) {
        return getFilterHeaderLine(id) != null;
    }

    /**
     * @param key the header key name
     * @return the meta data line, or null if there is none
     */
    public VCFHeaderLine getOtherHeaderLine(final String key) {
        return mOtherMetaData.get(key);
    }

    /**
     * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output.
     * @return true if additional engine headers will be written to the VCF
     */
    public boolean isWriteEngineHeaders() {
        return writeEngineHeaders;
    }

    /**
     * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output.
     * @param writeEngineHeaders true if additional engine headers will be written to the VCF
     */
    public void setWriteEngineHeaders(final boolean writeEngineHeaders) {
        this.writeEngineHeaders = writeEngineHeaders;
    }

    /**
     * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF.
     * @return true if the command line will be written to the VCF
     */
    public boolean isWriteCommandLine() {
        return writeCommandLine;
    }

    /**
     * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF.
     * @param writeCommandLine true if the command line will be written to the VCF
     */
    public void setWriteCommandLine(final boolean writeCommandLine) {
        this.writeCommandLine = writeCommandLine;
    }

    /**
     * @return the sample names sorted in natural String order, or null if this header was built
     *         without sample names (see {@link #VCFHeader(Set)})
     */
    public ArrayList<String> getSampleNamesInOrder() {
        return sampleNamesInOrder;
    }

    /**
     * @return a map from sample name to its zero-based position in the order the names were supplied,
     *         or null if this header was built without sample names (see {@link #VCFHeader(Set)})
     */
    public HashMap<String, Integer> getSampleNameToOffset() {
        return sampleNameToOffset;
    }

    @Override
    public String toString() {
        final StringBuilder b = new StringBuilder();
        b.append("[VCFHeader:");
        for (final VCFHeaderLine line : mMetaData) {
            b.append("\n\t").append(line);
        }
        return b.append("\n]").toString();
    }
}