/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.hadoop.variant.converters;
import com.google.common.collect.BiMap;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hbase.client.Result;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.AlternateCoordinate;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.avro.VariantAnnotation;
import org.opencb.biodata.models.variant.avro.VariantType;
import org.opencb.biodata.models.variant.protobuf.VariantProto;
import org.opencb.biodata.models.variant.stats.VariantStats;
import org.opencb.biodata.tools.variant.converters.Converter;
import org.opencb.biodata.tools.variant.merge.VariantMerger;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.datastore.core.QueryResult;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.metadata.StudyConfigurationManager;
import org.opencb.opencga.storage.core.variant.adaptors.VariantField;
import org.opencb.opencga.storage.hadoop.variant.GenomeHelper;
import org.opencb.opencga.storage.hadoop.variant.index.VariantTableHelper;
import org.opencb.opencga.storage.hadoop.variant.index.VariantTableStudyRow;
import org.opencb.opencga.storage.hadoop.variant.metadata.HBaseStudyConfigurationManager;
import org.opencb.opencga.storage.hadoop.variant.converters.annotation.HBaseToVariantAnnotationConverter;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.VariantPhoenixHelper;
import org.opencb.opencga.storage.hadoop.variant.converters.stats.HBaseToVariantStatsConverter;
import org.opencb.opencga.storage.hadoop.variant.models.protobuf.SampleList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
import java.util.Map.Entry;
/**
 * Converts rows of the HBase variant table into {@link Variant} objects.
 *
 * <p>Three inputs are supported: a raw HBase {@link Result}, a Phoenix/JDBC {@link ResultSet} row and an
 * already parsed {@link VariantTableStudyRow}. For every study row found, one {@link StudyEntry} is
 * added to the variant with a {@code GT} / filter sample format, the secondary alternates, the
 * pass/call rate attributes and the cohort statistics available for that study.
 *
 * <p>The converter keeps per-instance mutable state (configuration flags and a per-study cache of
 * sample positions) without any synchronization.
 *
 * Created on 20/11/15.
 *
 * @author Jacobo Coll <jacobo167@gmail.com>
 */
public class HBaseToVariantConverter implements Converter<Result, Variant> {

    private final StudyConfigurationManager scm;
    private final HBaseToVariantAnnotationConverter annotationConverter;
    private final HBaseToVariantStatsConverter statsConverter;
    private final GenomeHelper genomeHelper;
    // Options used for every study configuration lookup: read only and cached.
    private final QueryOptions scmOptions = new QueryOptions(StudyConfigurationManager.READ_ONLY, true)
            .append(StudyConfigurationManager.CACHED, true);
    // Cache: study id -> (sample name -> position in the samples data) for the currently selected samples.
    private final Map<Integer, LinkedHashMap<String, Integer>> returnedSamplesPositionMap = new HashMap<>();
    private final Logger logger = LoggerFactory.getLogger(HBaseToVariantConverter.class);

    private List<String> returnedSamples = null;

    private static boolean failOnWrongVariants = false; //FIXME
    private boolean studyNameAsStudyId = false;
    private boolean mutableSamplesPosition = true;
    private boolean failOnEmptyVariants = false;
    private boolean simpleGenotypes = false;
    private Set<VariantField> variantFields = null;

    /**
     * Creates a converter that reads study configurations from the output table of the given helper.
     *
     * @param variantTableHelper helper providing the output table name and the Hadoop configuration
     * @throws IOException if the {@link HBaseStudyConfigurationManager} can not be created
     */
    public HBaseToVariantConverter(VariantTableHelper variantTableHelper) throws IOException {
        this(variantTableHelper, new HBaseStudyConfigurationManager(variantTableHelper.getOutputTableAsString(),
                variantTableHelper.getConf(), new ObjectMap()));
    }

    /**
     * Creates a converter using an explicit study configuration manager.
     *
     * @param genomeHelper helper used to decode row keys and columns
     * @param scm          source of the {@link StudyConfiguration} of each converted study
     */
    public HBaseToVariantConverter(GenomeHelper genomeHelper, StudyConfigurationManager scm) {
        this.genomeHelper = genomeHelper;
        this.scm = scm;
        this.annotationConverter = new HBaseToVariantAnnotationConverter(genomeHelper);
        this.statsConverter = new HBaseToVariantStatsConverter(genomeHelper);
    }

    /**
     * Selects the samples to be included in the converted study entries.
     *
     * <p>The per-study sample positions computed for the previous selection are discarded, so the new
     * selection is honoured even for studies that were already converted by this instance.
     *
     * @param returnedSamples sample names to return; {@code null} is passed through to
     *                        {@code StudyConfiguration.getReturnedSamplesPosition} as "no explicit selection"
     * @return this converter
     */
    public HBaseToVariantConverter setReturnedSamples(List<String> returnedSamples) {
        this.returnedSamples = returnedSamples;
        // The cached positions were computed for the previous selection; keeping them would return stale samples.
        returnedSamplesPositionMap.clear();
        return this;
    }

    /**
     * Sets the variant fields to return. The same set is forwarded to the annotation converter.
     *
     * @param fields fields to return
     * @return this converter
     */
    public HBaseToVariantConverter setReturnedFields(Set<VariantField> fields) {
        variantFields = fields;
        annotationConverter.setReturnedFields(fields);
        return this;
    }

    /**
     * Chooses how study entries are identified.
     *
     * @param studyNameAsStudyId {@code true} to use the study name as study entry id,
     *                           {@code false} to use the numeric study id as a string
     * @return this converter
     */
    public HBaseToVariantConverter setStudyNameAsStudyId(boolean studyNameAsStudyId) {
        this.studyNameAsStudyId = studyNameAsStudyId;
        return this;
    }

    /**
     * Chooses whether every study entry gets its own copy of the samples position map.
     *
     * @param mutableSamplesPosition {@code true} to hand a fresh copy to each study entry,
     *                               {@code false} to share the cached instance between entries
     * @return this converter
     */
    public HBaseToVariantConverter setMutableSamplesPosition(boolean mutableSamplesPosition) {
        this.mutableSamplesPosition = mutableSamplesPosition;
        return this;
    }

    /**
     * Chooses whether converting a variant without study rows (or without studies) is an error.
     *
     * @param failOnEmptyVariants {@code true} to throw {@link IllegalStateException} in that case
     * @return this converter
     */
    public HBaseToVariantConverter setFailOnEmptyVariants(boolean failOnEmptyVariants) {
        this.failOnEmptyVariants = failOnEmptyVariants;
        return this;
    }

    /**
     * Chooses whether complex genotypes holding a comma separated list are reduced to their first element.
     *
     * @param simpleGenotypes {@code true} to return only the first genotype of the list
     * @return this converter
     */
    public HBaseToVariantConverter setSimpleGenotypes(boolean simpleGenotypes) {
        this.simpleGenotypes = simpleGenotypes;
        return this;
    }

    /**
     * Converts an HBase row into a variant, including annotation, statistics and study entries.
     *
     * @param result HBase row of the variant table
     * @return the converted variant
     */
    @Override
    public Variant convert(Result result) {
        VariantAnnotation annotation = annotationConverter.convert(result);
        Map<Integer, Map<Integer, VariantStats>> stats = statsConverter.convert(result);
        Variant variant = genomeHelper.extractVariantFromVariantRowKey(result.getRow());
        return convert(variant, VariantTableStudyRow.parse(result, genomeHelper), stats, annotation);
    }

    /**
     * Converts the current row of a Phoenix result set into a variant.
     *
     * @param resultSet result set positioned on the row to convert
     * @return the converted variant
     * @throws SQLException if a column can not be read from the result set
     */
    public Variant convert(ResultSet resultSet) throws SQLException {
        Variant variant = new Variant(
                resultSet.getString(VariantPhoenixHelper.VariantColumn.CHROMOSOME.column()),
                resultSet.getInt(VariantPhoenixHelper.VariantColumn.POSITION.column()),
                resultSet.getString(VariantPhoenixHelper.VariantColumn.REFERENCE.column()),
                resultSet.getString(VariantPhoenixHelper.VariantColumn.ALTERNATE.column())
        );
        String type = resultSet.getString(VariantPhoenixHelper.VariantColumn.TYPE.column());
        if (StringUtils.isNotBlank(type)) {
            variant.setType(VariantType.valueOf(type));
        }
        try {
            Map<Integer, Map<Integer, VariantStats>> stats = statsConverter.convert(resultSet);
            VariantAnnotation annotation = annotationConverter.convert(resultSet);
            return convert(variant, VariantTableStudyRow.parse(variant, resultSet, genomeHelper), stats, annotation);
        } catch (RuntimeException e) {
            // Only identify the offending variant here; the exception itself is propagated to the caller.
            logger.error("Fail to parse variant: {}", variant);
            throw e;
        }
    }

    /**
     * Converts a single study row into a variant with no statistics and an empty annotation.
     *
     * @param row parsed study row
     * @return the converted variant
     */
    public Variant convert(VariantTableStudyRow row) {
        Variant variant = new Variant(row.getChromosome(), row.getPos(), row.getRef(), row.getAlt());
        return convert(variant, Collections.singletonList(row), Collections.emptyMap(), null);
    }

    /**
     * Fills the given variant with one study entry per row, the annotation and the variant id.
     *
     * @param variant    variant to complete; it is modified and returned
     * @param rows       study rows of the variant
     * @param stats      statistics indexed by study id and cohort id
     * @param annotation annotation of the variant; when {@code null} an annotation with no consequence types is used
     * @return the same {@code variant} instance
     * @throws IllegalStateException if a study configuration is missing, if empty variants are rejected and there
     *                               are no rows or studies, or if a wrong variant is found while
     *                               {@link #isFailOnWrongVariants()} is enabled
     */
    protected Variant convert(Variant variant, List<VariantTableStudyRow> rows, Map<Integer, Map<Integer, VariantStats>> stats,
                              VariantAnnotation annotation) {
        if (annotation == null) {
            annotation = new VariantAnnotation();
            annotation.setConsequenceTypes(Collections.emptyList());
        }
        if (failOnEmptyVariants && rows.isEmpty()) {
            throw new IllegalStateException("No Row columns supplied for row " + variant);
        }
        for (VariantTableStudyRow row : rows) {
            variant.addStudyEntry(convertStudyRow(variant, row, stats));
        }
        variant.setAnnotation(annotation);
        if (StringUtils.isNotEmpty(annotation.getId())) {
            variant.setId(annotation.getId());
        } else {
            variant.setId(variant.toString());
        }
        if (failOnEmptyVariants && variant.getStudies().isEmpty()) {
            throw new IllegalStateException("No Studies registered for variant!!! " + variant);
        }
        return variant;
    }

    // Builds the study entry (samples data, attributes, secondary alternates and stats) of one study row.
    private StudyEntry convertStudyRow(Variant variant, VariantTableStudyRow row, Map<Integer, Map<Integer, VariantStats>> stats) {
        Map<String, String> attributesMap = new HashMap<>();
        Integer studyId = row.getStudyId();
        QueryResult<StudyConfiguration> queryResult = scm.getStudyConfiguration(studyId, scmOptions);
        if (queryResult.getResult().isEmpty()) {
            throw new IllegalStateException("No study found for study ID: " + studyId);
        }
        StudyConfiguration studyConfiguration = queryResult.first();

        // An empty map is accepted on purpose: the study may not be loaded yet, or no samples may be required.
        LinkedHashMap<String, Integer> returnedSamplesPosition = getReturnedSamplesPosition(studyConfiguration);
        if (mutableSamplesPosition) {
            returnedSamplesPosition = new LinkedHashMap<>(returnedSamplesPosition);
        }

        BiMap<String, Integer> loadedSamples = StudyConfiguration.getIndexedSamples(studyConfiguration);
        List<String> format = Arrays.asList(VariantMerger.GT_KEY, VariantMerger.GENOTYPE_FILTER_KEY);
        int ftIdx = format.indexOf(VariantMerger.GENOTYPE_FILTER_KEY);
        int loadedSamplesSize = loadedSamples.size();
        calculatePassCallRates(row, attributesMap, loadedSamplesSize);

        int nSamples = returnedSamplesPosition.size();
        @SuppressWarnings("unchecked")
        List<String>[] samplesDataArray = new List[nSamples];
        // Ids of every sample with a non HOM_REF call, whether the sample is returned or not.
        Set<Integer> sampleWithVariant = new HashSet<>();
        BiMap<Integer, String> mapSampleIds = studyConfiguration.getSampleIds().inverse();

        // Load plain genotypes
        for (String genotype : row.getGenotypes()) {
            sampleWithVariant.addAll(row.getSampleIds(genotype));
            if (genotype.equals(VariantTableStudyRow.OTHER)) {
                continue; // skip OTHER -> see Complex type
            }
            for (Integer sampleId : row.getSampleIds(genotype)) {
                Integer samplePosition = getSamplePosition(returnedSamplesPosition, mapSampleIds, sampleId);
                if (samplePosition == null) {
                    continue; // Sample may not be required. Ignore this sample.
                }
                // Arrays.asList is fixed-size but writable: the filter value may be overwritten below.
                samplesDataArray[samplePosition] = Arrays.asList(genotype, VariantMerger.PASS_VALUE);
            }
        }

        // Load Secondary Index
        List<AlternateCoordinate> secondaryAlternates = buildSecondaryAlternates(variant, row);

        // Load complex genotypes
        for (Entry<Integer, String> entry : row.getComplexVariant().getSampleToGenotype().entrySet()) {
            sampleWithVariant.add(entry.getKey());
            Integer samplePosition = getSamplePosition(returnedSamplesPosition, mapSampleIds, entry.getKey());
            if (samplePosition == null) {
                continue; // Sample may not be required. Ignore this sample.
            }
            String genotype = entry.getValue();
            String returnedGenotype;
            // FIXME: Decide what to do with lists of genotypes
            if (simpleGenotypes) {
                returnedGenotype = getSimpleGenotype(genotype);
                logger.debug("Return simplified genotype: {} -> {}", genotype, returnedGenotype);
            } else {
                returnedGenotype = genotype;
            }
            samplesDataArray[samplePosition] = Arrays.asList(returnedGenotype, VariantMerger.PASS_VALUE);
        }

        // Fill gaps (with HOM_REF)
        for (int i = 0; i < samplesDataArray.length; i++) {
            if (samplesDataArray[i] == null) {
                samplesDataArray[i] = Arrays.asList(VariantTableStudyRow.HOM_REF, VariantMerger.PASS_VALUE);
            }
        }

        // Set pass field. Every loaded sample is counted as PASS unless listed under a non-pass filter.
        int passCount = loadedSamplesSize;
        for (Entry<String, SampleList> entry : row.getComplexFilter().getFilterNonPass().entrySet()) {
            String filterString = entry.getKey();
            passCount -= entry.getValue().getSampleIdsCount();
            for (Integer id : entry.getValue().getSampleIdsList()) {
                Integer samplePosition = getSamplePosition(returnedSamplesPosition, mapSampleIds, id);
                if (samplePosition == null) {
                    continue; // Sample may not be required. Ignore this sample.
                }
                samplesDataArray[samplePosition].set(ftIdx, filterString);
            }
        }

        // Check pass count
        if (passCount != row.getPassCount()) {
            String message = String.format(
                    "Error parsing variant %s. Pass count %s does not match filter fill count: %s using %s loaded samples.",
                    row.toString(), row.getPassCount(), passCount, loadedSamplesSize);
            wrongVariant(message);
        }

        // Check homRef count
        checkHomRefCount(variant, row, loadedSamplesSize - sampleWithVariant.size(), samplesDataArray.length);

        StudyEntry studyEntry;
        if (studyNameAsStudyId) {
            studyEntry = new StudyEntry(studyConfiguration.getStudyName());
        } else {
            studyEntry = new StudyEntry(Integer.toString(studyConfiguration.getStudyId()));
        }
        studyEntry.setSortedSamplesPosition(returnedSamplesPosition);
        studyEntry.setSamplesData(Arrays.asList(samplesDataArray));
        studyEntry.setFormat(format);
        studyEntry.setFiles(Collections.singletonList(new FileEntry("", "", attributesMap)));
        studyEntry.setSecondaryAlternates(secondaryAlternates);
        addStats(studyEntry, studyConfiguration, stats);
        return studyEntry;
    }

    // Converts the secondary alternates of the row, taking unset coordinates from the variant itself.
    private List<AlternateCoordinate> buildSecondaryAlternates(Variant variant, VariantTableStudyRow row) {
        List<VariantProto.AlternateCoordinate> protoAlternates = row.getComplexVariant().getSecondaryAlternatesList();
        int secondaryAlternatesCount = row.getComplexVariant().getSecondaryAlternatesCount();
        List<AlternateCoordinate> alternates = new ArrayList<>(secondaryAlternatesCount);
        for (VariantProto.AlternateCoordinate altcoord : protoAlternates) {
            VariantType type = VariantType.valueOf(altcoord.getType().name());
            // An empty chromosome or a zero start/end is treated as "same as the main variant".
            String chr = StringUtils.isEmpty(altcoord.getChromosome()) ? variant.getChromosome() : altcoord.getChromosome();
            Integer start = altcoord.getStart() == 0 ? variant.getStart() : altcoord.getStart();
            Integer end = altcoord.getEnd() == 0 ? variant.getEnd() : altcoord.getEnd();
            String reference = StringUtils.isEmpty(altcoord.getReference()) ? "" : altcoord.getReference();
            String alternate = StringUtils.isEmpty(altcoord.getAlternate()) ? "" : altcoord.getAlternate();
            alternates.add(new AlternateCoordinate(chr, start, end, reference, alternate, type));
        }
        return alternates;
    }

    // Reports a wrong variant when the computed number of HOM_REF samples differs from the one stored in the row.
    private void checkHomRefCount(Variant variant, VariantTableStudyRow row, int homRefCount, int samplesNumber) {
        if (homRefCount != row.getHomRefCount()) {
            StringBuilder message = new StringBuilder()
                    .append("Wrong number of HomRef samples for variant ").append(variant)
                    .append(". Got ").append(homRefCount)
                    .append(", expect ").append(row.getHomRefCount())
                    .append(". Samples number: ").append(samplesNumber).append(" , ")
                    .append("'").append(VariantTableStudyRow.HOM_REF).append("':").append(row.getHomRefCount()).append(" , ");
            for (String studyColumn : VariantTableStudyRow.GENOTYPE_COLUMNS) {
                message.append("'").append(studyColumn).append("':").append(row.getSampleIds(studyColumn)).append(" , ");
            }
            wrongVariant(message.toString());
        }
    }

    // Attaches the stats of the study, if any, re-indexed by cohort name instead of cohort id.
    private void addStats(StudyEntry studyEntry, StudyConfiguration studyConfiguration,
                          Map<Integer, Map<Integer, VariantStats>> stats) {
        Map<Integer, VariantStats> convertedStatsMap = stats.get(studyConfiguration.getStudyId());
        if (convertedStatsMap != null) {
            BiMap<Integer, String> cohortNames = studyConfiguration.getCohortIds().inverse();
            Map<String, VariantStats> statsMap = new HashMap<>(convertedStatsMap.size());
            for (Entry<Integer, VariantStats> entry : convertedStatsMap.entrySet()) {
                statsMap.put(cohortNames.get(entry.getKey()), entry.getValue());
            }
            studyEntry.setStats(statsMap);
        }
    }

    // Stores the pass/call counts and rates of the row as file attributes.
    private void calculatePassCallRates(VariantTableStudyRow row, Map<String, String> attributesMap, int
            loadedSamplesSize) {
        attributesMap.put("PASS", row.getPassCount().toString());
        attributesMap.put("CALL", row.getCallCount().toString());
        // Floating point division: with zero loaded samples the rates become NaN or Infinity, no exception is thrown.
        double passRate = row.getPassCount().doubleValue() / loadedSamplesSize;
        double callRate = row.getCallCount().doubleValue() / loadedSamplesSize;
        double opr = passRate * callRate;
        attributesMap.put("PR", String.valueOf(passRate));
        attributesMap.put("CR", String.valueOf(callRate));
        attributesMap.put("OPR", String.valueOf(opr)); // OVERALL pass rate
        attributesMap.put("NS", String.valueOf(loadedSamplesSize)); // Number of Samples
    }

    // Returns the first element of a comma separated genotype list, or the genotype itself if there is no comma.
    private String getSimpleGenotype(String genotype) {
        if (genotype.contains(",")) {
            return genotype.split(",")[0];
        } else {
            return genotype;
        }
    }

    // Throws or just logs a warning, depending on the failOnWrongVariants flag.
    private void wrongVariant(String message) {
        if (failOnWrongVariants) {
            throw new IllegalStateException(message);
        } else {
            logger.warn(message);
        }
    }

    // Position of the sample in the returned samples data, or null if the sample is unknown or not returned.
    private Integer getSamplePosition(LinkedHashMap<String, Integer> returnedSamplesPosition, BiMap<Integer, String> mapSampleIds,
                                      Integer sampleId) {
        String sampleName = mapSampleIds.get(sampleId);
        return returnedSamplesPosition.get(sampleName);
    }

    /**
     * Creates a SORTED MAP with the required samples position.
     * The map is computed once per study and cached until the returned samples are changed.
     *
     * @param studyConfiguration Study Configuration
     * @return Sorted linked hash map
     */
    private LinkedHashMap<String, Integer> getReturnedSamplesPosition(StudyConfiguration studyConfiguration) {
        return returnedSamplesPositionMap.computeIfAbsent(studyConfiguration.getStudyId(),
                id -> StudyConfiguration.getReturnedSamplesPosition(studyConfiguration,
                        returnedSamples == null ? null : new LinkedHashSet<>(returnedSamples),
                        StudyConfiguration::getIndexedSamples));
    }

    /**
     * Tells whether inconsistent variants abort the conversion.
     *
     * @return {@code true} if a wrong variant throws {@link IllegalStateException}, {@code false} if it is only logged
     */
    public static boolean isFailOnWrongVariants() {
        return failOnWrongVariants;
    }

    /**
     * Sets, for every converter instance, whether inconsistent variants abort the conversion.
     *
     * @param b {@code true} to throw on wrong variants, {@code false} to log a warning
     */
    public static void setFailOnWrongVariants(boolean b) {
        failOnWrongVariants = b;
    }
}