package org.opencb.opencga.storage.hadoop.variant.annotation;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.phoenix.schema.types.PDataType;
import org.apache.phoenix.schema.types.PhoenixArray;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.VariantAnnotation;
import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.opencga.storage.core.config.StorageConfiguration;
import org.opencb.opencga.storage.core.variant.annotation.VariantAnnotatorException;
import org.opencb.opencga.storage.core.variant.annotation.annotators.VariantAnnotator;
import org.opencb.opencga.storage.core.variant.annotation.annotators.VariantAnnotatorFactory;
import org.opencb.opencga.storage.hadoop.variant.AbstractHBaseMapReduce;
import org.opencb.opencga.storage.hadoop.variant.GenomeHelper;
import org.opencb.opencga.storage.hadoop.variant.converters.annotation.VariantAnnotationToHBaseConverter;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.PhoenixHelper;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.VariantPhoenixHelper;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Array;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CopyOnWriteArrayList;
/**
 * Created by mh719 on 15/12/2016.
 *
 * <p>Mapper that converts each input HBase row into a {@link Variant}, buffers the variants that
 * still need annotation, annotates them in batches through a {@link VariantAnnotator} and emits one
 * {@link PhoenixVariantAnnotationWritable} per returned annotation.
 */
public class AnalysisAnnotateMapper extends AbstractHBaseMapReduce<NullWritable, PhoenixVariantAnnotationWritable> {

    /** Job configuration key: when {@code true}, every variant is annotated, even if already annotated. */
    public static final String CONFIG_VARIANT_TABLE_ANNOTATE_FORCE = "opencga.variant.table.annotate.force";

    /** Classpath resource the storage configuration is loaded from. */
    private static final String STORAGE_CONFIGURATION_FILE = "storage-configuration.yml";
    /** Storage engine name handed to the annotator factory. */
    private static final String STORAGE_ENGINE = "hadoop";
    /** Minimum number of buffered variants before a non-forced annotation round is started. */
    private static final int ANNOTATION_BATCH_SIZE = 200;
    /** Counter group shared by all counters reported by this mapper. */
    private static final String COUNTER_GROUP = "opencga";

    /** Variants collected by {@link #map} and waiting for the next annotation round. */
    private final CopyOnWriteArrayList<Variant> variantsToAnnotate = new CopyOnWriteArrayList<>();

    private VariantAnnotator variantAnnotator;
    private byte[] studiesRow;
    private boolean forceAnnotation;
    private VariantAnnotationToHBaseConverter annotationConverter;
    private VariantPhoenixHelper.VariantColumn[] columnsOrdered;

    /**
     * Reads the "force" flag from the job configuration, prepares the annotation converter and the
     * column order, and builds the variant annotator from the storage configuration found on the
     * classpath.
     *
     * @param context mapper context
     * @throws IOException              propagated from the parent setup
     * @throws InterruptedException     propagated from the parent setup
     * @throws IllegalStateException    if the storage configuration cannot be found or loaded, or the
     *                                  annotator cannot be built
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
        this.forceAnnotation = context.getConfiguration().getBoolean(CONFIG_VARIANT_TABLE_ANNOTATE_FORCE, false);
        studiesRow = getHelper().generateVariantRowKey(GenomeHelper.DEFAULT_METADATA_ROW_KEY, 0);

        /* Annotation -> Phoenix converter */
        annotationConverter = new VariantAnnotationToHBaseConverter(getHelper());
        columnsOrdered = VariantPhoenixHelper.VariantColumn.values();

        /* Annotator config */
        ObjectMap options = new ObjectMap(); // empty
        ClassLoader classLoader = StorageConfiguration.class.getClassLoader();
        // try-with-resources: the configuration stream is closed once the annotator has been built.
        try (InputStream configStream = classLoader.getResourceAsStream(STORAGE_CONFIGURATION_FILE)) {
            if (configStream == null) {
                // getResourceAsStream signals a missing resource with null; report that explicitly.
                throw new IOException("Resource " + STORAGE_CONFIGURATION_FILE + " not found on the classpath");
            }
            StorageConfiguration storageConfiguration = StorageConfiguration.load(configStream);
            this.variantAnnotator = VariantAnnotatorFactory.buildVariantAnnotator(storageConfiguration, STORAGE_ENGINE, options);
        } catch (Exception e) {
            throw new IllegalStateException("Problems loading storage configuration from " + STORAGE_CONFIGURATION_FILE, e);
        }
    }

    /**
     * Annotates the buffered variants and writes one output record per returned annotation.
     * Nothing happens when the buffer is empty, or when it holds fewer than
     * {@link #ANNOTATION_BATCH_SIZE} variants and {@code force} is {@code false}.
     * The buffer is cleared after all records have been written.
     *
     * @param context mapper context used for counters and output
     * @param force   annotate even if the buffer has not reached the batch size
     * @throws IOException                 if writing to the context fails
     * @throws InterruptedException        if writing to the context is interrupted
     * @throws VariantAnnotatorException   if the annotator fails
     */
    private void annotateVariants(Context context, boolean force)
            throws IOException, InterruptedException, VariantAnnotatorException {
        int pending = this.variantsToAnnotate.size();
        if (pending == 0) {
            return;
        }
        // not enough data
        if (pending < ANNOTATION_BATCH_SIZE && !force) {
            return;
        }

        long start = System.nanoTime();
        getLog().info("Annotate {} variants ... ", pending);
        List<VariantAnnotation> annotations = this.variantAnnotator.annotate(this.variantsToAnnotate);
        getLog().info("Submit {} [annot time: {}] ... ", annotations.size(), System.nanoTime() - start);

        start = System.nanoTime();
        for (VariantAnnotation annotation : annotations) {
            Map<PhoenixHelper.Column, ?> columnMap = annotationConverter.convert(annotation);
            PhoenixVariantAnnotationWritable writable = new PhoenixVariantAnnotationWritable(toOrderedList(columnMap));
            context.getCounter(COUNTER_GROUP, "variant.annotate.submit").increment(1);
            context.write(NullWritable.get(), writable);
        }
        getLog().info("Done [submit time: {}] ... ", System.nanoTime() - start);
        this.variantsToAnnotate.clear();
    }

    /**
     * Drives the mapper: after every mapped row a non-forced annotation round is attempted, and
     * once the input is exhausted the remaining buffered variants are annotated unconditionally.
     * {@code cleanup} is always invoked once {@code setup} has completed.
     *
     * @param context mapper context
     * @throws IOException              on I/O failures while reading or writing
     * @throws InterruptedException     if the task is interrupted
     * @throws IllegalStateException    if the annotator fails
     */
    @Override
    public void run(Context context) throws IOException, InterruptedException {
        this.setup(context);
        try {
            while (context.nextKeyValue()) {
                this.map(context.getCurrentKey(), context.getCurrentValue(), context);
                annotateVariants(context, false);
            }
            // Flush whatever is left, regardless of the batch size.
            annotateVariants(context, true);
        } catch (VariantAnnotatorException e) {
            throw new IllegalStateException("Problems annotating variants", e);
        } finally {
            this.cleanup(context);
        }
    }

    /**
     * Converts one HBase row into a {@link Variant} and buffers it if it requires annotation.
     * Rows with fewer than two cells are counted as empty and skipped; rows whose key starts with
     * the metadata row key are ignored.
     *
     * @param key     row key
     * @param value   row content
     * @param context mapper context used for counters
     * @throws IOException              declared by the overridden method
     * @throws InterruptedException     declared by the overridden method
     * @throws IllegalStateException    if anything fails while processing the row
     */
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException,
            InterruptedException {
        // rawCells() may return null when the result holds nothing.
        Cell[] cells = value.rawCells();
        try {
            if (cells == null || cells.length < 2) {
                context.getCounter(COUNTER_GROUP, "row.empty").increment(1);
                return;
            }
            if (Bytes.startsWith(value.getRow(), this.studiesRow)) {
                return; // ignore _METADATA row
            }
            context.getCounter(COUNTER_GROUP, "variant.read").increment(1);
            // Per-row messages are logged at debug level to keep task logs small.
            getLog().debug("Convert ... ");
            long start = System.nanoTime();
            Variant variant = this.getHbaseToVariantConverter().convert(value);
            if (!requireAnnotation(variant)) {
                context.getCounter(COUNTER_GROUP, "variant.no-annotation-required").increment(1);
                return; // No annotation needed
            }
            getLog().debug("Add to annotate set {} [convert time: {}] ... ", variant, System.nanoTime() - start);
            variantsToAnnotate.add(variant);
        } catch (Exception e) {
            // The hex key is only needed for the error message, so it is built on failure only.
            int cellCount = cells == null ? 0 : cells.length;
            throw new IllegalStateException("Problems with row [hex:" + Bytes.toHex(key.get()) + "] for cells " + cellCount, e);
        }
    }

    /**
     * Arranges the converted column values in the order of {@link #columnsOrdered}.
     * Collections destined for array-typed columns are wrapped into SQL arrays. For columns
     * without a value the SQL type code of the column is added instead.
     *
     * @param columnMap converted annotation, keyed by column
     * @return one entry per known column, in column order
     * @throws IllegalArgumentException if an array-typed column holds a value that is not a collection
     */
    private List<Object> toOrderedList(Map<PhoenixHelper.Column, ?> columnMap) {
        List<Object> orderedValues = new ArrayList<>(columnsOrdered.length);
        for (VariantPhoenixHelper.VariantColumn column : columnsOrdered) {
            Object columnValue = columnMap.get(column);
            if (columnValue == null) {
                // NOTE(review): presumably PhoenixVariantAnnotationWritable interprets this type code
                // as a typed NULL for the column - confirm against that class.
                orderedValues.add(column.getPDataType().getSqlType());
                continue;
            }
            if (column.getPDataType().isArrayType()) {
                if (!(columnValue instanceof Collection)) {
                    throw new IllegalArgumentException("Column " + column + " is not a collection " + columnValue);
                }
                columnValue = toArray(column.getPDataType(), (Collection<?>) columnValue);
            }
            orderedValues.add(columnValue);
        }
        return orderedValues;
    }

    /**
     * Wraps a collection into a {@link PhoenixArray}.
     *
     * @param elementDataType element type of the array; an array type is reduced to its base type
     * @param input           values to wrap
     * @return SQL array holding the values of {@code input}
     */
    private Array toArray(PDataType elementDataType, Collection<?> input) {
        PDataType baseType = elementDataType.isArrayType()
                ? PDataType.arrayBaseType(elementDataType)
                : elementDataType;
        // An explicit Object[] target guarantees the runtime component type of the array.
        return new PhoenixArray(baseType, input.toArray(new Object[input.size()]));
    }

    /**
     * Decides whether a variant has to be annotated.
     *
     * @param variant variant read from the table
     * @return {@code true} if annotation is forced, the variant has no annotation, or the
     *         annotation has no chromosome set
     */
    private boolean requireAnnotation(Variant variant) {
        if (this.forceAnnotation) {
            return true;
        }
        VariantAnnotation annotation = variant.getAnnotation();
        // Chromosome not set -> require annotation !!!!
        return annotation == null || StringUtils.isEmpty(annotation.getChromosome());
    }
}