package org.broad.igv.variant.New;
import htsjdk.tribble.AsciiFeatureCodec;
import htsjdk.tribble.readers.LineIterator;
import org.apache.log4j.Logger;
import org.broad.igv.Globals;
import org.broad.igv.feature.genome.Genome;
import org.broad.igv.util.ParsingUtils;
import org.broad.igv.util.ResourceLocator;
import org.broad.igv.util.StringUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Created by jrobinso on 7/29/16.
*/
public class VariantCodec extends AsciiFeatureCodec<Variant> {
private static Logger log = Logger.getLogger(VariantCodec.class);
Genome genome;
VCFHeader header;
protected VariantCodec(Genome genome, ResourceLocator locator) throws IOException {
super(Variant.class);
this.genome = genome;
BufferedReader reader = ParsingUtils.openBufferedReader(locator);
header = parseHeader(reader);
}
@Override
public Variant decode(String line) {
String[] tokens = Globals.tabPattern.split(line);
if (tokens.length > 7) {
Variant variant = new Variant();
variant.chr = genome == null ? tokens[0] : genome.getCanonicalChrName(tokens[0]);
variant.pos = Integer.parseInt(tokens[1]) - 1;
variant.names = tokens[2]; // id in VCF
variant.referenceBases = tokens[3];
variant.alternateBases = tokens[4];
variant.quality = Integer.parseInt(tokens[5]);
variant.filter = tokens[6];
variant.info = tokens[7]; // TODO -- parse special reserved values, e.g. AF
String[] altTokens = variant.alternateBases.split(",");
variant.alleles = new String[altTokens.length + 1];
variant.alleles[0] = variant.referenceBases;
for (int i = 0; i < altTokens.length; i++) {
variant.alleles[i + 1] = altTokens[i];
}
computeStart(variant);
// TODO -- Calls
return variant;
} else {
// invalid line
return null;
}
}
private static void computeStart(Variant variant) {
if (variant.alleles.length > 1) {
variant.start = variant.pos;
variant.end = 0;
int refBasesLength = variant.alleles[0].length();
for (int i = 1; i < variant.alleles.length; i++) {
int altLength = variant.alleles[i].length();
if (altLength > 0) {
int s, e;
int diff = refBasesLength - altLength;
if (diff > 0) {
// deletion, assume left padded
s = variant.pos + altLength;
e = s + diff;
} else if (diff < 0) {
// Insertion, assume left padded, insertion begins to "right" of last ref base
s = variant.pos + refBasesLength;
e = s + 1; // Insertion between s & 3
} else {
// Substitution, SNP if seq.length == 1
s = variant.pos;
e = s + altLength;
}
// variant.alleles.push({allele: alt, start: s, end: e});
variant.start = Math.min(variant.start, s);
variant.end = Math.max(variant.end, e);
}
}
} else {
// Is this even legal VCF? (NO alt alleles)
variant.start = variant.pos - 1;
variant.end = variant.pos;
}
}
@Override
public Object readActualHeader(LineIterator lineIterator) {
return null; // Who actuall calls this?
}
@Override
public boolean canDecode(String s) {
return false;
}
static class VCFHeader {
String version;
Map<String, Map<String, String>> infoFields;
Map<String, Map<String, String>> formatFields;
Map<String, Map<String, String>> filterFields;
Map<Integer, String> callSetNames;
public VCFHeader() {
infoFields = new HashMap<>();
formatFields = new HashMap<>();
filterFields = new HashMap<>();
}
public void addMetaInfo(String type, String id, Map<String, String> info) {
Map<String, Map<String, String>> dict;
if ("##INFO".equals(type)) {
dict = infoFields;
} else if ("##FORMAT".equals(type)) {
dict = formatFields;
} else if ("##FILTER".equals(type)) {
dict = filterFields;
} else {
// ignored
return;
}
dict.put(type, info);
}
}
private VCFHeader parseHeader(BufferedReader reader) throws IOException {
VCFHeader header = new VCFHeader();
String line = reader.readLine();
// First line must be file format
if (line != null && line.startsWith("##fileformat")) {
header.version = line.substring(13);
} else {
throw new Error("Invalid VCF file: missing fileformat line");
}
while ((line = reader.readLine()) != null) {
if (line.startsWith("#")) {
if (line.startsWith("##")) {
if (line.startsWith("##INFO") || line.startsWith("##FILTER") || line.startsWith("##FORMAT")) {
int ltIdx = line.indexOf("<");
int gtIdx = line.lastIndexOf(">");
if (!(ltIdx > 2 && gtIdx > 0)) {
log.error("Malformed VCF header line: " + line);
continue;
}
//##INFO=<ID=AF,Number=A,Type=Float,Description="Allele frequency based on Flow Evaluator observation counts">
// ##FILTER=<ID=NOCALL,Description="Generic filter. Filtering details stored in FR info tag.">
// ##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele frequency based on Flow Evaluator observation counts">
String type = line.substring(2, ltIdx - 1);
List<String> tokens = StringUtils.breakQuotedString(line.substring(ltIdx + 1, gtIdx - 1), ',');
String id = null;
Map<String, String> values = new HashMap<>();
for (String token : tokens) {
String[] kv = token.split("=");
if (kv.length > 1) {
if ("ID".equals(kv[0])) {
id = kv[1];
} else {
values.put(kv[0], kv[1]);
}
}
}
;
if (id != null) {
header.addMetaInfo(type, id, values);
}
} else {
// Ignoring other ## header lines
}
} else if (line.startsWith("#CHROM")) {
String[] tokens = line.split("\t");
if (tokens.length > 8) {
// call set names -- create column # -> cs name table
header.callSetNames = new HashMap<>();
for (int j = 9; j < tokens.length; j++) {
header.callSetNames.put(j, tokens[j]);
}
}
break;
}
} else {
break;
}
}
return header;
}
//
// function extractCallFields(tokens) {
//
// var callFields = {
// genotypeIndex:-1,
// genotypeLikelihoodIndex:-1,
// phasesetIndex:-1,
// fields:tokens
// },
// i;
//
// for (i = 0; i < tokens.length; i++) {
//
// if ("GT" == = tokens[i]) {
// callFields.genotypeIndex = i;
// } else if ("GL" == = tokens[i]) {
// callFields.genotypeLikelihoodIndex = i;
// } else if ("PS" == = tokens[i]) {
// callFields.phasesetIndex = i;
// }
// }
// return callFields;
//
// }
}