/*
* Copyright 2015 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.hpg.bigdata.tools.converters.mr.hbase;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.ga4gh.models.Call;
import org.ga4gh.models.Variant;
import org.opencb.hpg.bigdata.tools.utils.HBaseUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.Map.Entry;
import java.util.stream.Collectors;
/**
 * Mapper that turns each incoming {@link Variant} into one or more variants, groups them into
 * position buckets of {@link #BUCKET_SIZE} and emits one {@code (bucketId, payload)} pair per bucket.
 *
 * <p>Records without alternate bases are treated as reference blocks and are expanded into one
 * variant per position; records with alternate bases are passed through unchanged (only job
 * counters are updated for them).
 *
 * <p>The payload is currently the concatenated {@code toString()} of the grouped variants, encoded
 * as UTF-8 (see the TODO in {@link #convert(List)}).
 *
 * @author Matthias Haimel mh719+git@cam.ac.uk
 */
public class GenomeVariantConverter extends Mapper<AvroKey<Variant>, NullWritable, String, ByteBuffer> {

    private static final Logger LOG = LoggerFactory.getLogger(GenomeVariantConverter.class);

    /** INFO key holding the end position of a reference block. */
    private static final String ILLUMINA_GVCF_BLOCK_END = "END";

    /** Number of positions that share one bucket id (see {@link #generateBlockId(Variant)}). */
    public static final Integer BUCKET_SIZE = 100;

    /** Group name used for all job counters written by this mapper. */
    private static final String COUNTER_GROUP = "VCF";

    // Not used yet: kept for the planned switch from the toString() payload to a proper format.
    private DatumWriter<Variant> variantDatumWriter = new SpecificDatumWriter<Variant>(Variant.class);

    public GenomeVariantConverter() {
        // Nothing to initialise.
    }

    /**
     * Processes one variant record: expands it, groups the result by bucket, serialises each
     * bucket and writes it to the context.
     *
     * @param key     wrapper around the variant to convert
     * @param value   unused
     * @param context Hadoop context used for counters and output
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(AvroKey<Variant> key, NullWritable value, Context context)
            throws IOException, InterruptedException {
        Variant variant = key.datum();
        List<Variant> expanded = process(context, variant);
        Map<String, List<Variant>> byBucket = groupVariants(expanded);
        submit(context, packageVariants(byBucket));
    }

    /** Writes every {@code (bucketId, payload)} entry to the context. */
    private void submit(Context context, Map<String, ByteBuffer> packageVariants) throws IOException, InterruptedException {
        for (Entry<String, ByteBuffer> entry : packageVariants.entrySet()) {
            context.write(entry.getKey(), entry.getValue());
        }
    }

    /**
     * Serialises a list of variants into a single buffer.
     *
     * @param list variants belonging to one bucket
     * @return the concatenated {@code toString()} of all variants, encoded as UTF-8
     */
    private ByteBuffer convert(List<Variant> list) {
        StringBuilder sb = new StringBuilder();
        for (Variant variant : list) {
            sb.append(variant); // as JSON for testing TODO change to proper format
        }
        // Explicit charset: the platform default may differ between cluster nodes.
        return ByteBuffer.wrap(sb.toString().getBytes(StandardCharsets.UTF_8));
    }

    /** Serialises each group of variants, keeping the group key. */
    private Map<String, ByteBuffer> packageVariants(Map<String, List<Variant>> groups) {
        Map<String, ByteBuffer> pack = new HashMap<>();
        for (Entry<String, List<Variant>> group : groups.entrySet()) {
            pack.put(group.getKey(), convert(group.getValue()));
        }
        return pack;
    }

    /**
     * Builds the bucket id of a variant as {@code <referenceName>_<start / BUCKET_SIZE>}.
     */
    private String generateBlockId(Variant var) {
        long bucket = var.getStart().longValue() / BUCKET_SIZE;
        return new StringBuilder(var.getReferenceName()).append("_").append(bucket).toString();
    }

    /** Groups variants by their bucket id. */
    private Map<String, List<Variant>> groupVariants(List<Variant> varList) {
        return varList.stream().collect(Collectors.groupingBy(this::generateBlockId));
    }

    /**
     * Expands a variant record depending on whether it carries alternate bases.
     *
     * @param context Hadoop context used for counters
     * @param variant record to process
     * @return the per-position variants of a reference block, or a single-element list holding
     *         {@code variant} itself when it has alternate bases
     */
    public List<Variant> process(Context context, Variant variant) {
        if (isReference(variant)) { // is just reference
            return expandReferenceRegion(context, variant);
        }
        // is a variant (not just coverage info)
        /* Ignored fields: getAlleleIds() (graph mode -> not supported) */
        return expandAltRegion(context, variant);
    }

    /**
     * Handles a record that has alternate bases: updates counters and returns the record as is.
     *
     * @param context Hadoop context used for counters
     * @param variant record with at least one alternate allele
     * @return a list containing only {@code variant}
     */
    private List<Variant> expandAltRegion(Context context, Variant variant) {
        String refBases = variant.getReferenceBases();
        List<String> altBasesList = variant.getAlternateBases();
        List<Call> calls = nonull(variant.getCalls());

        if (altBasesList.size() > 1) {
            // NOTE: the counter is incremented for records with more than one alternate allele;
            // its name is kept unchanged so existing counter consumers keep working.
            // e.g.
            // 1 10409 . ACCCTAACCCTAACCCTAACCCTAACCCTAAC A,ACCCTAACCCTAACCCTAACCCTAACCCTAA
            // GT:GQ:GQX:DPI:AD 1/2:265:12:14:4,6,6
            context.getCounter(COUNTER_GROUP, "biallelic_COUNT").increment(1);
        }
        if (calls.isEmpty()) { // no call made - still store it.
            context.getCounter(COUNTER_GROUP, "NO_CALL_COUNT").increment(1);
        }
        boolean longRef = refBases.length() >= HBaseUtils.SV_THRESHOLD;
        for (String altBases : altBasesList) {
            // variant.getAlleleIds() TODO
            if (altBases.length() >= HBaseUtils.SV_THRESHOLD || longRef) {
                // KEEP SV information for the moment - evaluate if there are issues.
                // TODO change if needed
                context.getCounter(COUNTER_GROUP, "SV_LIMIT_REACHED_COUNT").increment(1);
            }
        }

        List<Variant> varList = new ArrayList<Variant>(1);
        varList.add(variant);
        return varList;
    }

    /** Returns {@code true} when the variant has no alternate bases (null or empty list). */
    private boolean isReference(Variant variant) {
        List<String> alternates = variant.getAlternateBases();
        return null == alternates || alternates.isEmpty();
    }

    /**
     * Expands a reference block into one variant per position.
     *
     * <p>The end position is read from the {@code END} INFO entry, which is removed from the INFO
     * copy handed to the expanded variants. Without that entry the block has size 1.
     *
     * @param context Hadoop context used for counters
     * @param variant reference record to expand
     * @return one variant per position in {@code [start, end)}
     * @throws NumberFormatException    if the {@code END} entry is not a valid number
     * @throws IllegalArgumentException if the end position lies before the start
     */
    protected List<Variant> expandReferenceRegion(Context context, Variant variant) {
        long start = variant.getStart();
        long endPos = start + 1;
        List<Call> calls = nonull(variant.getCalls());
        String suffix = calls.isEmpty() ? "_NOCALL" : "";
        context.getCounter(COUNTER_GROUP, "REG_EXPAND" + suffix).increment(1);

        // Work on a copy so that removing END does not modify the input record;
        // a missing INFO map is treated like an empty one.
        Map<String, List<String>> info = new HashMap<String, List<String>>();
        if (variant.getInfo() != null) {
            info.putAll(variant.getInfo());
        }
        List<String> endLst = nonull(info.remove(ILLUMINA_GVCF_BLOCK_END)); // Get End position
        if (endLst.isEmpty()) {
            // Region of size 1
            context.getCounter(COUNTER_GROUP, "REF_END_EMPTY" + suffix).increment(1);
        } else {
            endPos = Long.parseLong(endLst.get(0).toString());
        }
        context.getCounter(COUNTER_GROUP, "REG_EXPAND_CNT" + suffix).increment(endPos - start);
        return expand(variant, start, endPos, info, calls);
    }

    /**
     * Creates one single-position copy of {@code variant} for every position in {@code [start, end)}.
     *
     * <p>All copies share the same {@code info} map and {@code calls} list instances.
     *
     * @param variant template providing names, ids, timestamps, reference name/bases and alternates
     * @param start   first position (inclusive)
     * @param end     last position (exclusive)
     * @param info    INFO map set on every copy
     * @param calls   calls set on every copy
     * @return the expanded variants, empty when {@code start == end}
     * @throws IllegalArgumentException if {@code end < start} or the span exceeds {@link Integer#MAX_VALUE}
     */
    protected List<Variant> expand(Variant variant, Long start, Long end, Map<String, List<String>> info, List<Call> calls) {
        long first = start;
        long last = end;
        long span = last - first;
        if (span < 0 || span > Integer.MAX_VALUE) {
            // Previously a negative span surfaced as an opaque "Illegal Capacity" error and an
            // oversized span silently wrapped around when narrowed to int.
            throw new IllegalArgumentException(
                    "Invalid region for expansion: start=" + first + ", end=" + last);
        }
        List<Variant> varList = new ArrayList<>((int) span);

        List<String> names = variant.getNames();
        String setId = variant.getVariantSetId();
        Long created = variant.getCreated();
        String id = variant.getId();
        Long updated = variant.getUpdated();
        String refName = variant.getReferenceName();
        String refBases = variant.getReferenceBases();
        // the following parameters shouldn't really matter
        // (alleleIds are intentionally not copied)
        List<String> alternateBases = variant.getAlternateBases();

        for (long pos = first; pos < last; ++pos) {
            Variant var = new Variant();
            /* from parameter */
            var.setStart(pos);
            var.setEnd(pos + 1);
            var.setInfo(info);
            var.setCalls(calls);
            /* from variant */
            var.setNames(names);
            var.setVariantSetId(setId);
            var.setCreated(created);
            var.setId(id);
            var.setUpdated(updated);
            var.setReferenceName(refName);
            var.setReferenceBases(refBases);
            var.setAlternateBases(alternateBases);
            /* ADD */
            varList.add(var);
        }
        return varList;
    }

    /** Returns {@code list}, or an immutable empty list when {@code list} is {@code null}. */
    private <T> List<T> nonull(List<T> list) {
        if (null == list) {
            return Collections.emptyList();
        }
        return list;
    }
}