/*******************************************************************************
* GenPlay, Einstein Genome Analyzer
* Copyright (C) 2009, 2014 Albert Einstein College of Medicine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu>
* Nicolas Fourel <nicolas.fourel@einstein.yu.edu>
* Eric Bouhassira <eric.bouhassira@einstein.yu.edu>
*
* Website: <http://genplay.einstein.yu.edu>
******************************************************************************/
package edu.yu.einstein.genplay.core.multiGenome.VCF.VCFFile;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFHeaderType.VCFHeaderAdvancedType;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFHeaderType.VCFHeaderAltType;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFHeaderType.VCFHeaderBasicType;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFHeaderType.VCFHeaderFilterType;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFHeaderType.VCFHeaderFormatType;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFHeaderType.VCFHeaderInfoType;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFHeaderType.VCFHeaderType;
import edu.yu.einstein.genplay.core.multiGenome.utils.FormattedMultiGenomeName;
import edu.yu.einstein.genplay.core.multiGenome.utils.VCFGenomeIndexer;
import edu.yu.einstein.genplay.dataStructure.enums.VCFColumnName;
import edu.yu.einstein.genplay.util.Utils;
/**
* @author Nicolas Fourel
* @version 0.1
*/
public class VCFHeader implements VCFGenomeIndexer, Serializable {
/** Default generated serial version ID */
private static final long serialVersionUID = 5071204705996276780L;
private static final int SAVED_FORMAT_VERSION_NUMBER = 0; // saved format version
private BufferedReader headerReader;
private Map<String, String> headerInfo; // Header main information
private Map<String, Class<?>> fieldType; // Association between field type and java class
private List<String> fixedColumn; // Fixed header names included in the VCF file
private List<VCFHeaderType> basicHeader; // Header without ID for ALT, QUAL and FILTER
private List<VCFHeaderType> altHeader; // Header for the ALT field
private List<VCFHeaderType> filterHeader; // Header for the FILTER field
private List<VCFHeaderAdvancedType> infoHeader; // Header for the INFO field
private List<VCFHeaderAdvancedType> formatHeader; // Header for the FORMAT field
private List<String> columnNames; // All column header names
private List<String> genomeRawNames; // Dynamic header names included in the VCF file (raw genome names)
private List<String> genomeNames; // Full genome names list
private Map<String, Integer> genomeMap; // map between genome names and their related index according to their location on the column line
/**
* Constructor of {@link VCFHeader}.
*/
protected VCFHeader () {
initFieldType();
initFixedColumnList();
genomeNames = new ArrayList<String>();
genomeMap = new HashMap<String, Integer>();
}
/**
* Add a genome name to the list of genome name
* @param genomeName a full genome name
*/
protected void addGenomeName (String genomeName) {
if (!genomeNames.contains(genomeName)) {
genomeNames.add(genomeName);
}
}
/**
* @return all the header type
*/
public List<VCFHeaderType> getAllHeader () {
List<VCFHeaderType> result = new ArrayList<VCFHeaderType>();
for (VCFHeaderType header: basicHeader) {
result.add(header);
}
for (VCFHeaderType header: altHeader) {
result.add(header);
}
for (VCFHeaderType header: filterHeader) {
result.add(header);
}
for (VCFHeaderType header: infoHeader) {
result.add(header);
}
for (VCFHeaderType header: formatHeader) {
result.add(header);
}
return result;
}
/**
* @return all the header type
*/
public List<VCFHeaderType> getAllNumberHeader () {
List<VCFHeaderType> result = new ArrayList<VCFHeaderType>();
for (VCFHeaderType header: basicHeader) {
if (header.getColumnCategory() == VCFColumnName.QUAL) {
result.add(header);
}
}
for (VCFHeaderType header: infoHeader) {
VCFHeaderAdvancedType advancedHeader = (VCFHeaderAdvancedType) header;
if ((advancedHeader.getType() == Integer.class) || (advancedHeader.getType() == Float.class)) {
result.add(header);
}
}
for (VCFHeaderType header: formatHeader) {
VCFHeaderAdvancedType advancedHeader = (VCFHeaderAdvancedType) header;
if ((advancedHeader.getType() == Integer.class) || (advancedHeader.getType() == Float.class)) {
result.add(header);
}
}
return result;
}
/**
* @return all sorted header types
*/
public List<VCFHeaderType> getAllSortedHeader () {
List<VCFHeaderType> result = new ArrayList<VCFHeaderType>();
for (VCFHeaderType header: basicHeader) {
result.add(header);
}
for (VCFHeaderType header: altHeader) {
result.add(header);
}
for (VCFHeaderType header: filterHeader) {
result.add(header);
}
for (VCFHeaderType header: infoHeader) {
if (((VCFHeaderAdvancedType)header).getType() == Boolean.class) {
result.add(header);
}
}
for (VCFHeaderType header: formatHeader) {
result.add(header);
}
for (VCFHeaderType header: infoHeader) {
if (((VCFHeaderAdvancedType)header).getType() != Boolean.class) {
result.add(header);
}
}
return result;
}
/**
* @return all sorted header types
*/
public List<VCFHeaderType> getAllSortedNumberHeader () {
List<VCFHeaderType> sortedList = getAllSortedHeader();
List<VCFHeaderType> result = new ArrayList<VCFHeaderType>();
for (VCFHeaderType header: sortedList) {
if (header.getColumnCategory() == VCFColumnName.QUAL) {
result.add(header);
} else {
if (header instanceof VCFHeaderAdvancedType) {
VCFHeaderAdvancedType advancedHeader = (VCFHeaderAdvancedType) header;
if ((advancedHeader.getType() == Integer.class) || (advancedHeader.getType() == Float.class)) {
result.add(header);
}
}
}
}
return result;
}
/**
* @return the altHeader
*/
public List<VCFHeaderType> getAltHeader() {
return altHeader;
}
/**
* Looks for a ALT header using an ID.
* @param id the ID
* @return the header or null
*/
public VCFHeaderType getAltHeaderFromID (String id) {
for (VCFHeaderType header: altHeader) {
if (header.getId().equals(id)) {
return header;
}
}
return null;
}
/**
* @return the basicHeader
*/
public List<VCFHeaderType> getBasicHeader() {
return basicHeader;
}
/**
* @return the columnNames
*/
public List<String> getColumnNames() {
return columnNames;
}
/**
* @return the filterHeader
*/
public List<VCFHeaderType> getFilterHeader() {
return filterHeader;
}
/**
* @return the fixedColumn
*/
public List<String> getFixedColumn() {
List<String> list = new ArrayList<String>(fixedColumn);
return list;
}
/**
* @return the formatHeader
*/
public List<VCFHeaderAdvancedType> getFormatHeader() {
return formatHeader;
}
/**
* Looks for a FORMAT header using an ID.
* @param id the ID
* @return the header or null
*/
public VCFHeaderAdvancedType getFormatHeaderFromID (String id) {
for (VCFHeaderAdvancedType header: formatHeader) {
if (header.getId().equals(id)) {
return header;
}
}
return null;
}
/**
* @return the genomeNames
*/
public List<String> getGenomeNames() {
return genomeNames;
}
@Override
public String getGenomeRawName(int index) {
for (String rawName: genomeMap.keySet()) {
if (genomeMap.get(rawName) == index) {
return rawName;
}
}
return null;
}
/**
* @return the columnNames
*/
@Override
public List<String> getGenomeRawNames() {
if (genomeRawNames == null) {
List<String> list = new ArrayList<String>();
for (String s: columnNames) {
if (!fixedColumn.contains(s)) {
list.add(s);
}
}
Collections.sort(list);
genomeRawNames = list;
}
return genomeRawNames;
}
/**
* @return the headerInfo
*/
public Map<String, String> getHeaderInfo() {
return headerInfo;
}
@Override
public int getIndexFromFullGenomeName(String genomeFullName) {
return getIndexFromRawGenomeName(FormattedMultiGenomeName.getRawName(genomeFullName));
}
@Override
public int getIndexFromRawGenomeName(String genomeRawName) {
try {
return genomeMap.get(genomeRawName);
} catch (Exception e) {
return -1;
}
}
/**
* @return the infoHeader
*/
public List<VCFHeaderAdvancedType> getInfoHeader() {
return infoHeader;
}
/**
* Looks for a INFO header using an ID.
* @param id the ID
* @return the header or null
*/
public VCFHeaderAdvancedType getInfoHeaderFromID (String id) {
for (VCFHeaderAdvancedType header: infoHeader) {
if (header.getId().equals(id)) {
return header;
}
}
return null;
}
/**
* Initializes field/java class association.
*/
protected void initFieldType () {
fieldType = new HashMap<String, Class<?>>();
fieldType.put("Integer", Integer.class);
fieldType.put("Float", Float.class);
fieldType.put("Flag", Boolean.class);
fieldType.put("Character", char.class);
fieldType.put("String", String.class);
}
/**
* Initializes column header list.
*/
private void initFixedColumnList () {
fixedColumn = new ArrayList<String>();
fixedColumn.add(VCFColumnName.CHROM.toString());
fixedColumn.add(VCFColumnName.POS.toString());
fixedColumn.add(VCFColumnName.ID.toString());
fixedColumn.add(VCFColumnName.REF.toString());
fixedColumn.add(VCFColumnName.ALT.toString());
fixedColumn.add(VCFColumnName.QUAL.toString());
fixedColumn.add(VCFColumnName.FILTER.toString());
fixedColumn.add(VCFColumnName.INFO.toString());
fixedColumn.add(VCFColumnName.FORMAT.toString());
}
/**
* This method parses the content of attributes header information.
* @param line
* @return the parsed line
*/
private Map<String, String> parseVCFHeaderInfo (String line) {
Map<String, String> info = new HashMap<String, String>();
String[] details = Utils.split(line, ',');
String detail[];
for (String s: details) {
detail = Utils.split(s, '=');
if ((detail.length > 1) && !detail[0].equals("Description")) {
info.put(detail[0], detail[1]);
}
}
String descriptionPattern = "Description=\"";
int descriptionStart = line.indexOf(descriptionPattern) + descriptionPattern.length();
int descriptionStop = line.indexOf("\"", descriptionStart);
String description = line.substring(descriptionStart, descriptionStop);
info.put("Description", description);
return info;
}
/**
* This method reads and saves the vcf header information
* @param reader the VCF file reader
* @throws FileNotFoundException
* @throws IOException
* @throws URISyntaxException
*/
public void processHeader (VCFReader reader) throws FileNotFoundException, IOException, URISyntaxException {
headerReader = new BufferedReader(new InputStreamReader(reader.getVCFParser().getmFp()));
boolean valid = true;
headerInfo = new HashMap<String, String>();
basicHeader = new ArrayList<VCFHeaderType>();
altHeader = new ArrayList<VCFHeaderType>();
filterHeader = new ArrayList<VCFHeaderType>();
infoHeader = new ArrayList<VCFHeaderAdvancedType>();
formatHeader = new ArrayList<VCFHeaderAdvancedType>();
VCFHeaderBasicType basicAlternativeHeader = new VCFHeaderBasicType();
basicAlternativeHeader.setColumnCategory(VCFColumnName.ALT);
basicAlternativeHeader.setId(VCFColumnName.ALT.toString());
basicAlternativeHeader.setDescription("Alternative value");
VCFHeaderBasicType basicQualityHeader = new VCFHeaderBasicType();
basicQualityHeader.setColumnCategory(VCFColumnName.QUAL);
basicQualityHeader.setId(VCFColumnName.QUAL.toString());
basicQualityHeader.setDescription("Quality value");
VCFHeaderBasicType basicFilterHeader = new VCFHeaderBasicType();
basicFilterHeader.setColumnCategory(VCFColumnName.FILTER);
basicFilterHeader.setId(VCFColumnName.FILTER.toString());
basicFilterHeader.setDescription("Filter value");
basicFilterHeader.addElement("PASS");
basicFilterHeader.addElement(".");
basicHeader.add(basicAlternativeHeader);
basicHeader.add(basicQualityHeader);
basicHeader.add(basicFilterHeader);
while (valid) {
String line = reader.getVCFParser().readLine(headerReader);
if (line == null) {
valid = false;
} else {
if ((line != null) && (line.length() > 0)) {
if (line.substring(0, 2).equals("##")) {
int equalChar = line.indexOf("=");
String type = line.substring(2, equalChar);
if (type.equals(VCFColumnName.INFO.toString()) ||
type.equals(VCFColumnName.ALT.toString()) ||
type.equals(VCFColumnName.FILTER.toString()) ||
type.equals(VCFColumnName.FORMAT.toString())) {
Map<String, String> info = parseVCFHeaderInfo(line.substring(equalChar + 2, line.length() - 1));
VCFHeaderType headerType = null;
if (type.equals(VCFColumnName.ALT.toString())) {
headerType = new VCFHeaderAltType();
altHeader.add(headerType);
} else if (type.equals(VCFColumnName.FILTER.toString())) {
headerType = new VCFHeaderFilterType();
filterHeader.add(headerType);
basicFilterHeader.addElement(headerType);
} else if (type.equals(VCFColumnName.INFO.toString())) {
headerType = new VCFHeaderInfoType();
infoHeader.add((VCFHeaderAdvancedType) headerType);
} else if (type.equals(VCFColumnName.FORMAT.toString())) {
headerType = new VCFHeaderFormatType();
formatHeader.add((VCFHeaderAdvancedType) headerType);
}
headerType.setId(info.get(VCFColumnName.ID.toString()));
headerType.setDescription(info.get("Description"));
if (headerType instanceof VCFHeaderAdvancedType) {
((VCFHeaderAdvancedType) headerType).setNumber(info.get("Number"));
((VCFHeaderAdvancedType) headerType).setType(fieldType.get(info.get("Type")));
}
} else {
headerInfo.put(type, line.substring(equalChar + 1, line.length() - 1));
}
} else {
valid = false;
if (line.substring(0, 1).equals("#")) {
columnNames = new ArrayList<String>();
String[] details = Utils.splitWithTab(line.substring(1, line.length()));
//for (String name: line.substring(1, line.length()).split("[\t]")) {
for (int i = 0; i < details.length; i++) {
columnNames.add(details[i].trim());
if (i > 8) {
genomeMap.put(details[i].trim(), i);
//genomeRawNames.add(details[i].trim());
}
}
}
}
}
}
}
}
/**
* Method used for unserialization
* @param in
* @throws IOException
* @throws ClassNotFoundException
*/
@SuppressWarnings("unchecked")
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
in.readInt();
headerInfo = (Map<String, String>) in.readObject();
fieldType = (Map<String, Class<?>>) in.readObject();
fixedColumn = (List<String>) in.readObject();
basicHeader = (List<VCFHeaderType>) in.readObject();
altHeader = (List<VCFHeaderType>) in.readObject();
filterHeader = (List<VCFHeaderType>) in.readObject();
infoHeader = (ArrayList<VCFHeaderAdvancedType>) in.readObject();
formatHeader = (ArrayList<VCFHeaderAdvancedType>) in.readObject();
columnNames = (List<String>) in.readObject();
genomeRawNames = (List<String>) in.readObject();
genomeNames = (List<String>) in.readObject();
genomeMap = (Map<String, Integer>) in.readObject();
}
/**
* Shows the column names
*/
public void showColumnNames () {
System.out.println("===== Column names");
String line = "";
for (String name: columnNames) {
line = line + name + "\t";
}
System.out.println(line);
}
/**
* Shows the main header information
*/
public void showHeaderInfo () {
System.out.println("===== Header information");
for (String key: headerInfo.keySet()) {
System.out.println(key + ": " + headerInfo.get(key));
}
}
/**
* Method used for serialization
* @param out
* @throws IOException
*/
private void writeObject(ObjectOutputStream out) throws IOException {
out.writeInt(SAVED_FORMAT_VERSION_NUMBER);
out.writeObject(headerInfo);
out.writeObject(fieldType);
out.writeObject(fixedColumn);
out.writeObject(basicHeader);
out.writeObject(altHeader);
out.writeObject(filterHeader);
out.writeObject(infoHeader);
out.writeObject(formatHeader);
out.writeObject(columnNames);
out.writeObject(genomeRawNames);
out.writeObject(genomeNames);
out.writeObject(genomeMap);
}
}