/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.core.variant.io;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.*;
import org.opencb.biodata.models.variant.stats.VariantStats;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.io.DataWriter;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptorUtils;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBIterator;
import org.opencb.opencga.storage.core.variant.adaptors.VariantSourceDBAdaptor;
import org.opencb.opencga.storage.core.variant.io.db.VariantDBReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.text.DecimalFormat;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* Created by jmmut on 2015-06-25.
*
* @author Jose Miguel Mut Lopez <jmmut@ebi.ac.uk>
* @author Matthias Haimel
*/
public class VariantVcfDataWriter implements DataWriter<Variant> {
private static final DecimalFormat DECIMAL_FORMAT_7 = new DecimalFormat("#.#######");
private static final DecimalFormat DECIMAL_FORMAT_3 = new DecimalFormat("#.###");
private final Logger logger = LoggerFactory.getLogger(VariantVcfDataWriter.class);
private static final String DEFAULT_ANNOTATIONS = "allele|gene|ensemblGene|ensemblTranscript|biotype|consequenceType|phastCons|phylop"
+ "|populationFrequency|cDnaPosition|cdsPosition|proteinPosition|sift|polyphen|clinvar|cosmic|gwas|drugInteraction";
private static final String ALL_ANNOTATIONS = "allele|gene|ensemblGene|ensemblTranscript|biotype|consequenceType|phastCons|phylop"
+ "|populationFrequency|cDnaPosition|cdsPosition|proteinPosition|sift|polyphen|clinvar|cosmic|gwas|drugInteraction";
private final StudyConfiguration studyConfiguration;
private final VariantSourceDBAdaptor sourceDBAdaptor;
private final OutputStream outputStream;
private final Query query;
private final QueryOptions queryOptions;
private final AtomicReference<Function<String, String>> sampleNameConverter = new AtomicReference<>(s -> s);
private final int studyId;
private VariantContextWriter writer;
private List<String> annotations;
private int failedVariants;
private final List<String> sampleNames = new ArrayList<>();
private final Map<String, String> sampleNameMapping = new ConcurrentHashMap<>();
private final AtomicReference<BiConsumer<Variant, RuntimeException>> converterErrorListener = new AtomicReference<>((v, r) -> { });
private final AtomicBoolean exportGenotype = new AtomicBoolean(true);
public VariantVcfDataWriter(StudyConfiguration studyConfiguration, VariantSourceDBAdaptor sourceDBAdaptor, OutputStream outputStream,
Query query, QueryOptions queryOptions) {
this.studyConfiguration = studyConfiguration;
this.sourceDBAdaptor = sourceDBAdaptor;
this.outputStream = outputStream;
this.query = query == null ? new Query() : query;
this.queryOptions = queryOptions == null ? new QueryOptions() : queryOptions;
studyId = this.studyConfiguration.getStudyId();
}
public void setSampleNameConverter(Function<String, String> converter) {
sampleNameConverter.set(converter);
}
public void setConverterErrorListener(BiConsumer<Variant, RuntimeException> converterErrorListener) {
this.converterErrorListener.set(converterErrorListener);
}
public void setExportGenotype(boolean exportGenotype) {
this.exportGenotype.set(exportGenotype);
}
/**
* Uses a reader and a writer to dump a vcf.
* TODO jmmut: use studyConfiguration to know the order of
*
* @param adaptor The query adaptor to execute the query
* @param studyConfiguration Configuration object
* @param outputUri The destination file
* @param query The query object
* @param options The options
*/
@Deprecated
public static void vcfExport(VariantDBAdaptor adaptor, StudyConfiguration studyConfiguration, URI outputUri, Query query,
QueryOptions options) {
// Default objects
VariantDBReader reader = new VariantDBReader(studyConfiguration, adaptor, query, options);
org.opencb.biodata.formats.variant.vcf4.io.VariantVcfDataWriter writer =
new org.opencb.biodata.formats.variant.vcf4.io.VariantVcfDataWriter(reader, outputUri.getPath());
int batchSize = 100;
// user tuning
// if (options != null) {
// batchSize = options.getInt(VariantStorageEngine.BATCH_SIZE, batchSize);
// }
// setup
reader.open();
reader.pre();
writer.open();
writer.pre();
// actual loop
List<Variant> variants = reader.read(batchSize);
// while (!(variants = reader.read(batchSize)).isEmpty()) {
// writer.write(variants);
// }
while (!variants.isEmpty()) {
writer.write(variants);
variants = reader.read(batchSize);
}
// tear down
reader.post();
reader.close();
writer.post();
writer.close();
}
@Deprecated
public static int htsExport(VariantDBIterator iterator, StudyConfiguration studyConfiguration, VariantSourceDBAdaptor sourceDBAdaptor,
OutputStream outputStream, QueryOptions queryOptions) {
VariantVcfDataWriter exporter = new VariantVcfDataWriter(studyConfiguration, sourceDBAdaptor, outputStream, new Query(),
queryOptions);
exporter.open();
exporter.pre();
iterator.forEachRemaining(exporter::write);
exporter.post();
exporter.close();
return exporter.failedVariants;
}
@Override
public boolean pre() {
LinkedHashSet<VCFHeaderLine> meta = new LinkedHashSet<>();
sampleNames.clear();
sampleNames.addAll(getSamples());
logger.info("Use {} samples for export ... ", this.sampleNames.size());
sampleNameMapping.putAll(
sampleNames.stream().collect(Collectors.toMap(s -> s, s -> sampleNameConverter.get().apply(s))));
List<String> names = sampleNames.stream().map(s -> sampleNameMapping.get(s)).collect(Collectors.toList());
logger.info("Samples mapped: {} ... ", names.size());
// Iterator<VariantSource> iterator = sourceDBAdaptor.iterator(
// new Query(VariantStorageEngine.Options.STUDY_ID.key(), studyConfiguration.getStudyId()),
// new QueryOptions());
// if (iterator.hasNext()) {
// VariantSource source = iterator.next();
// fileHeader = source.getMetadata().get(VariantFileUtils.VARIANT_FILE_HEADER).toString();
// } else {
// throw new IllegalStateException("file headers not available for study " + studyConfiguration.getStudyName()
// + ". note: check files: " + studyConfiguration.getFileIds().values().toString());
// }
/* FILTER */
meta.add(new VCFFilterHeaderLine("PASS", "Valid variant"));
meta.add(new VCFFilterHeaderLine(".", "No FILTER info"));
/* INFO */
meta.add(new VCFInfoHeaderLine("PR", 1, VCFHeaderLineType.Float, "Pass rate"));
meta.add(new VCFInfoHeaderLine("CR", 1, VCFHeaderLineType.Float, "Call rate"));
meta.add(new VCFInfoHeaderLine("OPR", 1, VCFHeaderLineType.Float, "Overall Pass rate"));
addCohortInfo(meta);
addAnnotationInfo(meta);
/* FORMAT */
meta.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
meta.add(new VCFFormatHeaderLine("PF", 1, VCFHeaderLineType.Integer,
"Variant was PASS (1) filter in original vcf"));
final VCFHeader header = new VCFHeader(meta, names);
final SAMSequenceDictionary sequenceDictionary = header.getSequenceDictionary();
// setup writer
VariantContextWriterBuilder builder = new VariantContextWriterBuilder()
.setOutputStream(outputStream)
.setReferenceDictionary(sequenceDictionary)
.unsetOption(Options.INDEX_ON_THE_FLY);
if (sampleNames.isEmpty() || !this.exportGenotype.get()) {
builder.setOption(Options.DO_NOT_WRITE_GENOTYPES);
}
List<String> formatFields = studyConfiguration.getAttributes()
.getAsStringList(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key());
List<String> formatFieldsType = studyConfiguration.getAttributes()
.getAsStringList(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS_TYPE.key());
for (int i = 0; i < formatFields.size(); i++) {
String id = formatFields.get(i);
if (header.getFormatHeaderLine(id) == null) {
header.addMetaDataLine(new VCFFormatHeaderLine(id, 1, VCFHeaderLineType.valueOf(formatFieldsType.get(i)), ""));
}
}
writer = builder.build();
writer.writeHeader(header);
return true;
}
private void addAnnotationInfo(LinkedHashSet<VCFHeaderLine> meta) {
// check if variant annotations are exported in the INFO column
annotations = null;
if (queryOptions != null && queryOptions.getString("annotations") != null && !queryOptions.getString("annotations").isEmpty()) {
String annotationString;
switch (queryOptions.getString("annotations")) {
case "all":
annotationString = ALL_ANNOTATIONS.replaceAll(",", "|");
break;
case "default":
annotationString = DEFAULT_ANNOTATIONS.replaceAll(",", "|");
break;
default:
annotationString = queryOptions.getString("annotations").replaceAll(",", "|");
break;
}
// String annotationString = queryOptions.getString("annotations", DEFAULT_ANNOTATIONS).replaceAll(",", "|");
annotations = Arrays.asList(annotationString.split("\\|"));
meta.add(new VCFInfoHeaderLine("CSQ", 1, VCFHeaderLineType.String, "Consequence annotations from CellBase. "
+ "Format: " + annotationString));
}
}
private void addCohortInfo(LinkedHashSet<VCFHeaderLine> meta) {
for (String cohortName : studyConfiguration.getCohortIds().keySet()) {
if (cohortName.equals(StudyEntry.DEFAULT_COHORT)) {
meta.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A,
VCFHeaderLineType.Integer, "Total number of alternate alleles in called genotypes,"
+ " for each ALT allele, in the same order as listed"));
meta.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A,
VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, calculated from AC and AN, in the range (0,1),"
+ " in the same order as listed"));
meta.add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1,
VCFHeaderLineType.Integer, "Total number of alleles in called genotypes"));
continue;
}
// header.addMetaDataLine(new VCFInfoHeaderLine(cohortName + VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A,
// VCFHeaderLineType.Integer, "Total number of alternate alleles in called genotypes,"
// + " for each ALT allele, in the same order as listed"));
meta.add(new VCFInfoHeaderLine(cohortName + "_" + VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A,
VCFHeaderLineType.Float,
"Allele frequency in the " + cohortName + " cohort calculated from AC and AN, in the range (0,1),"
+ " in the same order as listed"));
// header.addMetaDataLine(new VCFInfoHeaderLine(cohortName + VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer,
// "Total number of alleles in called genotypes"));
}
}
@Override
public boolean write(List<Variant> batch) {
for (Variant variant : batch) {
try {
VariantContext variantContext = convertVariantToVariantContext(variant, annotations);
if (variantContext != null) {
writer.add(variantContext);
}
} catch (RuntimeException e) {
logger.error("Error exporting variant " + variant, e);
failedVariants++;
converterErrorListener.get().accept(variant, e);
}
}
return true;
}
@Override
public boolean post() {
if (failedVariants > 0) {
logger.warn(failedVariants + " variants were not written due to errors");
}
return true;
}
@Override
public boolean close() {
writer.close();
return true;
}
// private VCFHeader getVcfHeader(StudyConfiguration studyConfiguration, QueryOptions options) throws IOException {
// List<String> returnedSamples = getReturnedSamples(studyConfiguration, options);
// // get header from studyConfiguration
// Collection<String> headers = studyConfiguration.getHeaders().values();
// String fileHeader;
// if (headers.isEmpty()) {
// Iterator<VariantSource> iterator = sourceDBAdaptor.iterator(
// new Query(VariantStorageEngine.Options.STUDY_ID.key(), studyConfiguration.getStudyId()),
// new QueryOptions());
// if (iterator.hasNext()) {
// VariantSource source = iterator.next();
// fileHeader = source.getMetadata().get(VariantFileUtils.VARIANT_FILE_HEADER).toString();
// } else {
// fileHeader = null;
// }
//// else {
//// throw new IllegalStateException("file headers not available for study " + studyConfiguration.getStudyName()
//// + ". note: check files: " + studyConfiguration.getFileIds().values().toString());
//// }
// } else {
// fileHeader = headers.iterator().next();
// }
//
// if (fileHeader != null) {
// int lastLineIndex = fileHeader.lastIndexOf("#CHROM");
// if (lastLineIndex >= 0) {
// String substring = fileHeader.substring(0, lastLineIndex);
//
// String samples = String.join("\t", returnedSamples);
// logger.debug("export will be done on samples: [{}]", samples);
//
// if (returnedSamples.isEmpty()) {
// fileHeader = substring + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\t";
// } else {
// fileHeader = substring + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + samples;
// }
// }
//
// return VariantFileMetadataToVCFHeaderConverter.parseVcfHeader(fileHeader);
// } else {
// HashSet<VCFHeaderLine> metaData = new HashSet<>();
// VCFHeader vcfHeader = new VCFHeader(metaData, returnedSamples);
// return vcfHeader;
// }
// }
private List<String> getReturnedSamples(StudyConfiguration studyConfiguration, QueryOptions options) {
Map<Integer, List<Integer>> returnedSamplesMap =
VariantDBAdaptorUtils.getReturnedSamples(new Query(options), options, Collections.singletonList(studyConfiguration));
List<String> returnedSamples = returnedSamplesMap.get(studyConfiguration.getStudyId()).stream()
.map(sampleId -> studyConfiguration.getSampleIds().inverse().get(sampleId))
.collect(Collectors.toList());
return returnedSamples;
}
protected List<String> getSamples() {
if (!this.exportGenotype.get()) {
logger.info("Do NOT export genotype -> sample list empty!!!");
return Collections.emptyList();
}
// Get Sample names from query & study configuration
List<String> sampleNames = VariantDBAdaptorUtils.getSamplesMetadata(query, studyConfiguration)
.get(studyConfiguration.getStudyName());
return sampleNames;
}
/**
* Convert org.opencb.biodata.models.variant.Variant into a htsjdk.variant.variantcontext.VariantContext
* some assumptions:
* * splitted multiallelic variants will produce only one variantContexts. Merging is done
* * If some normalization has been applied, the source entries may have an attribute ORI like: "POS:REF:ALT_0(,ALT_N)*:ALT_IDX"
*
* @param variant A variant object to be converted
* @param studyConfiguration StudyConfiguration
* @param annotations Variant annotation
* @return The variant in HTSJDK format
* TODO: Move to a separated converter
*/
public VariantContext convertVariantToVariantContext(Variant variant, StudyConfiguration studyConfiguration,
List<String> annotations) { //, StudyConfiguration
return convertVariantToVariantContext(variant, annotations);
}
public List<String> buildAlleles(Variant variant, Pair<Integer, Integer> adjustedRange) {
String reference = variant.getReference();
String alternate = variant.getAlternate();
List<AlternateCoordinate> secAlts = variant.getStudy(this.studyConfiguration.getStudyName()).getSecondaryAlternates();
List<String> alleles = new ArrayList<>(secAlts.size() + 2);
Integer origStart = variant.getStart();
Integer origEnd = variant.getEnd();
alleles.add(buildAllele(variant.getChromosome(), origStart, origEnd, reference, adjustedRange));
alleles.add(buildAllele(variant.getChromosome(), origStart, origEnd, alternate, adjustedRange));
secAlts.forEach(alt -> {
alleles.add(buildAllele(variant.getChromosome(), alt.getStart(), alt.getEnd(), alt.getAlternate(), adjustedRange));
});
return alleles;
}
public String buildAllele(String chromosome, Integer start, Integer end, String allele, Pair<Integer, Integer> adjustedRange) {
if (start.equals(adjustedRange.getLeft()) && end.equals(adjustedRange.getRight())) {
return allele; // same start / end
}
if (StringUtils.startsWith(allele, "*")) {
return allele; // no need
}
return getReferenceBase(chromosome, adjustedRange.getLeft(), start) + allele
+ getReferenceBase(chromosome, end, adjustedRange.getRight());
}
/**
* Get bases from reference sequence.
* @param chromosome Chromosome.
* @param from Start ( inclusive) position.
* @param to End (exclusive) position.
* @return String Reference sequence of length to - from.
*/
private String getReferenceBase(String chromosome, Integer from, Integer to) {
int length = to - from;
if (length < 0) {
throw new IllegalStateException(
"Sequence length is negative: chromosome " + chromosome + " from " + from + " to " + to);
}
return StringUtils.repeat('N', length); // current return default base TODO load reference sequence
}
public VariantContext convertVariantToVariantContext(Variant variant, List<String> annotations) { //, StudyConfiguration
final String noCallAllele = String.valueOf(VCFConstants.NO_CALL_ALLELE);
VariantContextBuilder variantContextBuilder = new VariantContextBuilder();
VariantType type = variant.getType();
Pair<Integer, Integer> adjustedRange = adjustedVariantStart(variant);
StudyEntry studyEntry = variant.getStudy(this.studyConfiguration.getStudyName());
String[] ori = getOri(studyEntry);
List<String> originalAlleles = getOriginalAlleles(ori);
List<String> allelesArray;
if (originalAlleles != null) {
allelesArray = originalAlleles;
} else {
allelesArray = buildAlleles(variant, adjustedRange);
}
Set<Integer> nocallAlleles = IntStream.range(0, allelesArray.size()).boxed()
.filter(i -> noCallAllele.equals(allelesArray.get(i)))
.collect(Collectors.toSet());
String filter = "PASS";
String prk = "PR";
String crk = "CR";
String oprk = "OPR";
//Attributes for INFO column
ObjectMap attributes = new ObjectMap();
ArrayList<Genotype> genotypes = new ArrayList<>();
// Integer originalPosition = null;
// List<String> originalAlleles = null;
// TODO work out properly how to deal with multi allelic sites.
// Integer auxOriginalPosition = getOriginalPosition(ori);
// if (originalPosition != null && auxOriginalPosition != null && !originalPosition.equals(auxOriginalPosition)) {
// throw new IllegalStateException("Two or more VariantSourceEntries have different origin. Unable to merge");
// }
// originalPosition = auxOriginalPosition;
// if (originalAlleles == null) {
// originalAlleles = allelesArray;
// }
//
//Only print those variants in which the alternate is the first alternate from the multiallelic alternatives
if (originalAlleles != null && originalAlleles.size() > 2 && !"0".equals(getOriginalAlleleIndex(ori))) {
logger.debug("Skip multi allelic variant! {}", variant);
return null;
}
String sourceFilter = studyEntry.getAttribute("FILTER");
if (sourceFilter != null && !filter.equals(sourceFilter)) {
filter = "."; // write PASS iff all sources agree that the filter is "PASS" or assumed if not present, otherwise write "."
}
if (studyEntry.getFiles() != null && studyEntry.getFiles().size() == 1) {
Map<String, String> fileAttributes = studyEntry.getFiles().get(0).getAttributes();
if (fileAttributes.containsKey("PR")) {
attributes.putIfNotNull(prk, DECIMAL_FORMAT_7.format(Double.valueOf(fileAttributes.get("PR"))));
}
if (fileAttributes.containsKey("CR")) {
attributes.putIfNotNull(crk, DECIMAL_FORMAT_7.format(Double.valueOf(fileAttributes.get("CR"))));
}
if (fileAttributes.containsKey("OPR")) {
attributes.putIfNotNull(oprk, DECIMAL_FORMAT_7.format(Double.valueOf(fileAttributes.get("OPR"))));
}
}
String refAllele = allelesArray.get(0);
for (String sampleName : this.sampleNames) {
String gtStr = studyEntry.getSampleData(sampleName, "GT");
String genotypeFilter = studyEntry.getSampleData(sampleName, "FT");
if (Objects.isNull(gtStr)) {
gtStr = noCallAllele;
genotypeFilter = noCallAllele;
}
List<String> gtSplit = new ArrayList<>(Arrays.asList(gtStr.split(",")));
List<String> ftSplit = new ArrayList<>(Arrays.asList(
(StringUtils.isBlank(genotypeFilter) ? "" : genotypeFilter).split(",")));
while (gtSplit.size() > 1) {
int idx = gtSplit.indexOf(noCallAllele);
if (idx < 0) {
idx = gtSplit.indexOf("0/0");
}
if (idx < 0) {
break;
}
gtSplit.remove(idx);
ftSplit.remove(idx);
}
String gt = gtSplit.get(0);
String ft = ftSplit.get(0);
org.opencb.biodata.models.feature.Genotype genotype =
new org.opencb.biodata.models.feature.Genotype(gt, refAllele, allelesArray.subList(1, allelesArray.size()));
List<Allele> alleles = new ArrayList<>();
for (int gtIdx : genotype.getAllelesIdx()) {
if (gtIdx < allelesArray.size() && gtIdx >= 0 && !nocallAlleles.contains(gtIdx)) { // .. AND NOT a nocall allele
alleles.add(Allele.create(allelesArray.get(gtIdx), gtIdx == 0)); // allele is ref. if the alleleIndex is 0
} else {
alleles.add(Allele.create(noCallAllele, false)); // genotype of a secondary alternate, or an actual missing
}
}
if (StringUtils.isBlank(ft)) {
genotypeFilter = null;
} else if (StringUtils.equals("PASS", ft)) {
genotypeFilter = "1";
} else {
genotypeFilter = "0";
}
GenotypeBuilder builder = new GenotypeBuilder()
.name(this.sampleNameMapping.get(sampleName));
if (studyEntry.getFormatPositions().containsKey("GT")) {
builder.alleles(alleles)
.phased(genotype.isPhased());
}
if (genotypeFilter != null) {
builder.attribute("PF", genotypeFilter);
}
for (String id : studyEntry.getFormat()) {
if (id.equals("GT") || id.equals("FT")) {
continue;
}
String value = studyEntry.getSampleData(sampleName, id);
builder.attribute(id, value);
}
genotypes.add(builder.make());
}
addStats(studyEntry, attributes);
variantContextBuilder.start(adjustedRange.getLeft())
.stop(adjustedRange.getLeft() + refAllele.length() - 1) //TODO mh719: check what happens for Insertions
.chr(variant.getChromosome())
.filter(filter); // TODO jmmut: join attributes from different source entries? what to do on a collision?
if (genotypes.isEmpty()) {
variantContextBuilder.noGenotypes();
} else {
variantContextBuilder.genotypes(genotypes);
}
if (type.equals(VariantType.NO_VARIATION) && allelesArray.get(1).isEmpty()) {
variantContextBuilder.alleles(refAllele);
} else {
variantContextBuilder.alleles(allelesArray.stream().filter(a -> !a.equals(noCallAllele)).collect(Collectors.toList()));
}
// if asked variant annotations are exported
if (annotations != null) {
addAnnotations(variant, annotations, attributes);
}
variantContextBuilder.attributes(attributes);
if (StringUtils.isNotEmpty(variant.getId()) && !variant.toString().equals(variant.getId())) {
StringBuilder ids = new StringBuilder();
ids.append(variant.getId());
if (variant.getNames() != null) {
for (String name : variant.getNames()) {
ids.append(VCFConstants.ID_FIELD_SEPARATOR).append(name);
}
}
variantContextBuilder.id(ids.toString());
} else {
variantContextBuilder.id(VCFConstants.EMPTY_ID_FIELD);
}
return variantContextBuilder.make();
}
/**
* Adjust start/end if a reference base is required due to an empty allele. All variants are checked due to SecAlts.
* @param variant {@link Variant} object.
* @return Pair<Integer, Integer> The adjusted (or same) start/end position e.g. SV and MNV as SecAlt, INDEL, etc.
*/
protected Pair<Integer, Integer> adjustedVariantStart(Variant variant) {
Integer start = variant.getStart();
Integer end = variant.getEnd();
if (StringUtils.isBlank(variant.getReference()) || StringUtils.isBlank(variant.getAlternate())) {
start = start - 1;
}
for (AlternateCoordinate alternateCoordinate : variant.getStudy(this.studyConfiguration.getStudyName()).getSecondaryAlternates()) {
start = Math.min(start, alternateCoordinate.getStart());
end = Math.max(end, alternateCoordinate.getEnd());
if (StringUtils.isBlank(alternateCoordinate.getAlternate()) || StringUtils.isBlank(alternateCoordinate.getReference())) {
start = Math.min(start, alternateCoordinate.getStart() - 1);
}
}
return new ImmutablePair<>(start, end);
}
private Map<String, Object> addAnnotations(Variant variant, List<String> annotations, Map<String, Object> attributes) {
StringBuilder stringBuilder = new StringBuilder();
if (variant.getAnnotation() == null) {
return attributes;
}
// for (ConsequenceType consequenceType : variant.getAnnotation().getConsequenceTypes()) {
for (int i = 0; i < variant.getAnnotation().getConsequenceTypes().size(); i++) {
ConsequenceType consequenceType = variant.getAnnotation().getConsequenceTypes().get(i);
for (int j = 0; j < annotations.size(); j++) {
switch (annotations.get(j)) {
case "allele":
stringBuilder.append(variant.getAlternate());
break;
case "consequenceType":
stringBuilder.append(consequenceType.getSequenceOntologyTerms().stream()
.map(SequenceOntologyTerm::getName).collect(Collectors.joining(",")));
break;
case "gene":
if (consequenceType.getGeneName() != null) {
stringBuilder.append(consequenceType.getGeneName());
}
break;
case "ensemblGene":
if (consequenceType.getEnsemblGeneId() != null) {
stringBuilder.append(consequenceType.getEnsemblGeneId());
}
break;
case "ensemblTranscript":
if (consequenceType.getEnsemblTranscriptId() != null) {
stringBuilder.append(consequenceType.getEnsemblTranscriptId());
}
break;
case "biotype":
if (consequenceType.getBiotype() != null) {
stringBuilder.append(consequenceType.getBiotype());
}
break;
case "phastCons":
if (variant.getAnnotation().getConservation() != null) {
List<Double> phastCons = variant.getAnnotation().getConservation().stream()
.filter(t -> t.getSource().equalsIgnoreCase("phastCons"))
.map(Score::getScore)
.collect(Collectors.toList());
if (phastCons.size() > 0) {
stringBuilder.append(DECIMAL_FORMAT_3.format(phastCons.get(0)));
}
}
break;
case "phylop":
if (variant.getAnnotation().getConservation() != null) {
List<Double> phylop = variant.getAnnotation().getConservation().stream()
.filter(t -> t.getSource().equalsIgnoreCase("phylop"))
.map(Score::getScore)
.collect(Collectors.toList());
if (phylop.size() > 0) {
stringBuilder.append(DECIMAL_FORMAT_3.format(phylop.get(0)));
}
}
break;
case "populationFrequency":
List<PopulationFrequency> populationFrequencies = variant.getAnnotation().getPopulationFrequencies();
if (populationFrequencies != null) {
stringBuilder.append(populationFrequencies.stream()
.map(t -> t.getPopulation() + ":" + t.getAltAlleleFreq())
.collect(Collectors.joining(",")));
}
break;
case "cDnaPosition":
stringBuilder.append(consequenceType.getCdnaPosition());
break;
case "cdsPosition":
stringBuilder.append(consequenceType.getCdsPosition());
break;
case "proteinPosition":
if (consequenceType.getProteinVariantAnnotation() != null) {
stringBuilder.append(consequenceType.getProteinVariantAnnotation().getPosition());
}
break;
case "sift":
if (consequenceType.getProteinVariantAnnotation() != null
&& consequenceType.getProteinVariantAnnotation().getSubstitutionScores() != null) {
List<Double> sift = consequenceType.getProteinVariantAnnotation().getSubstitutionScores().stream()
.filter(t -> t.getSource().equalsIgnoreCase("sift"))
.map(Score::getScore)
.collect(Collectors.toList());
if (sift.size() > 0) {
stringBuilder.append(DECIMAL_FORMAT_3.format(sift.get(0)));
}
}
break;
case "polyphen":
if (consequenceType.getProteinVariantAnnotation() != null
&& consequenceType.getProteinVariantAnnotation().getSubstitutionScores() != null) {
List<Double> polyphen = consequenceType.getProteinVariantAnnotation().getSubstitutionScores().stream()
.filter(t -> t.getSource().equalsIgnoreCase("polyphen"))
.map(Score::getScore)
.collect(Collectors.toList());
if (polyphen.size() > 0) {
stringBuilder.append(DECIMAL_FORMAT_3.format(polyphen.get(0)));
}
}
break;
case "clinvar":
if (variant.getAnnotation().getVariantTraitAssociation() != null
&& variant.getAnnotation().getVariantTraitAssociation().getClinvar() != null) {
stringBuilder.append(variant.getAnnotation().getVariantTraitAssociation().getClinvar().stream()
.map(ClinVar::getTraits).flatMap(Collection::stream)
.collect(Collectors.joining(",")));
}
break;
case "cosmic":
if (variant.getAnnotation().getVariantTraitAssociation() != null
&& variant.getAnnotation().getVariantTraitAssociation().getCosmic() != null) {
stringBuilder.append(variant.getAnnotation().getVariantTraitAssociation().getCosmic().stream()
.map(Cosmic::getPrimarySite)
.collect(Collectors.joining(",")));
}
break;
case "gwas":
if (variant.getAnnotation().getVariantTraitAssociation() != null
&& variant.getAnnotation().getVariantTraitAssociation().getGwas() != null) {
stringBuilder.append(variant.getAnnotation().getVariantTraitAssociation().getGwas().stream()
.map(Gwas::getTraits).flatMap(Collection::stream)
.collect(Collectors.joining(",")));
}
break;
case "drugInteraction":
stringBuilder.append(variant.getAnnotation().getGeneDrugInteraction().stream()
.map(GeneDrugInteraction::getDrugName).collect(Collectors.joining(",")));
break;
default:
logger.error("Unknown annotation: " + annotations.get(j));
break;
}
if (j < annotations.size() - 1) {
stringBuilder.append("|");
}
}
if (i < variant.getAnnotation().getConsequenceTypes().size() - 1) {
stringBuilder.append("&");
}
}
attributes.put("CSQ", stringBuilder.toString());
// infoAnnotations.put("CSQ", stringBuilder.toString().replaceAll("&|$", ""));
return attributes;
}
private void addStats(StudyEntry studyEntry, Map<String, Object> attributes) {
if (studyEntry.getStats() == null) {
return;
}
for (Map.Entry<String, VariantStats> entry : studyEntry.getStats().entrySet()) {
String cohortName = entry.getKey();
VariantStats stats = entry.getValue();
if (cohortName.equals(StudyEntry.DEFAULT_COHORT)) {
cohortName = "";
int an = stats.getAltAlleleCount() + stats.getRefAlleleCount();
if (an >= 0) {
attributes.put(cohortName + VCFConstants.ALLELE_NUMBER_KEY, String.valueOf(an));
}
if (stats.getAltAlleleCount() >= 0) {
attributes.put(cohortName + VCFConstants.ALLELE_COUNT_KEY, String.valueOf(stats.getAltAlleleCount()));
}
} else {
cohortName = cohortName + "_";
}
attributes.put(cohortName + VCFConstants.ALLELE_FREQUENCY_KEY, DECIMAL_FORMAT_7.format(stats.getAltAlleleFreq()));
}
}
/**
* Assumes that ori is in the form "POS:REF:ALT_0(,ALT_N)*:ALT_IDX".
* ALT_N is the n-th allele if this is the n-th variant resultant of a multiallelic vcf row
*
* @param ori
* @return
*/
private static List<String> getOriginalAlleles(String[] ori) {
if (ori != null && ori.length == 4) {
String[] multiAllele = ori[2].split(",");
if (multiAllele.length != 1) {
ArrayList<String> alleles = new ArrayList<>(multiAllele.length + 1);
alleles.add(ori[1]);
alleles.addAll(Arrays.asList(multiAllele));
return alleles;
} else {
return Arrays.asList(ori[1], ori[2]);
}
}
return null;
}
private static String getOriginalAlleleIndex(String[] ori) {
if (ori != null && ori.length == 4) {
return ori[3];
}
return null;
}
/**
* Assumes that ori is in the form "POS:REF:ALT_0(,ALT_N)*:ALT_IDX".
*
* @param ori
* @return
*/
private static Integer getOriginalPosition(String[] ori) {
if (ori != null && ori.length == 4) {
return Integer.parseInt(ori[0]);
}
return null;
}
private static String[] getOri(StudyEntry studyEntry) {
List<FileEntry> files = studyEntry.getFiles();
Set<String> calls = new HashSet<>();
String call = null;
if (!files.isEmpty()) {
for (FileEntry file : files) {
call = file.getCall();
// if (call != null) {
calls.add(call);
// }
}
}
if (calls.size() == 1 && StringUtils.isNotEmpty(call)) {
// Return this CALL only if all the files have the same one
return calls.iterator().next().split(":");
}
return null;
}
/**
* Unclosable output stream.
*
* Avoid passing System.out directly to HTSJDK, because it will close it at the end.
*
* http://stackoverflow.com/questions/8941298/system-out-closed-can-i-reopen-it/23791138#23791138
*/
public static class UnclosableOutputStream extends FilterOutputStream {
public UnclosableOutputStream(OutputStream os) {
super(os);
}
@Override
public void close() throws IOException {
super.flush();
}
}
}