/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.hadoop.variant.index;
import com.google.common.collect.BiMap;
import com.google.protobuf.InvalidProtocolBufferException;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.tools.variant.merge.VariantMerger;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.hadoop.variant.AbstractHBaseMapReduce;
import org.opencb.opencga.storage.hadoop.variant.GenomeHelper;
import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveResultToVariantConverter;
import org.opencb.opencga.storage.hadoop.variant.converters.HBaseToVariantConverter;
import org.opencb.opencga.storage.hadoop.variant.models.protobuf.VariantTableStudyRowsProto;
import java.io.IOException;
import java.util.*;
import java.util.Map.Entry;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Abstract variant table map reduce.
 *
 * <p>Base mapper that reads one archive-table row (a "slice") at a time, works out which files and
 * samples the row contains, and hands the prepared {@link VariantMapReduceContext} to the concrete
 * implementation through {@link #doMap(VariantMapReduceContext)}. It also offers helpers to parse
 * stored {@link VariantTableStudyRow}s and to emit {@link Put}s to the output and archive tables.
 *
 * @author Matthias Haimel mh719+git@cam.ac.uk
 */
public abstract class AbstractVariantTableMapReduce extends AbstractHBaseMapReduce<ImmutableBytesWritable, Mutation> {

    /** Counter group used for every counter incremented by this class. */
    public static final String COUNTER_GROUP_NAME = "OPENCGA.HBASE";

    /**
     * Boolean configuration key (default {@code true}). When enabled and new sample ids are supplied,
     * {@code createSpecificPut} is used instead of {@code createPut} for variant rows.
     */
    public static final String SPECIFIC_PUT = "opencga.storage.hadoop.hbase.merge.use_specific_put";

    /** Integer configuration key (default 500) loaded into {@link #archiveBatchSize} during setup. */
    public static final String ARCHIVE_GET_BATCH_SIZE = "opencga.storage.hadoop.hbase.merge.archive.scan.batchsize";

    protected ArchiveResultToVariantConverter resultConverter;
    protected VariantMerger variantMerger;
    /** Names of the samples belonging to the files that are being indexed by this job. */
    protected Set<String> currentIndexingSamples;
    protected Integer archiveBatchSize;

    /**
     * @return the archive result converter created in {@link #setup(Context)}
     */
    protected ArchiveResultToVariantConverter getResultConverter() {
        return resultConverter;
    }

    /**
     * @return the variant merger created in {@link #setup(Context)}
     */
    protected VariantMerger getVariantMerger() {
        return variantMerger;
    }

    /**
     * Extracts file Ids from column names - ignoring _V columns.
     *
     * <p>Only cells of the helper's column family are considered. Every remaining qualifier is parsed
     * as a decimal integer, so a non-numeric qualifier raises a {@link NumberFormatException}.
     *
     * @param value archive row to inspect
     * @return Set of file IDs
     */
    private Set<Integer> extractFileIds(Result value) {
        byte[] columnFamily = getHelper().getColumnFamily();
        return Arrays.stream(value.rawCells())
                .filter(c -> Bytes.equals(CellUtil.cloneFamily(c), columnFamily))
                // Clone the qualifier once and reuse it for both the prefix test and the parsing.
                .map(CellUtil::cloneQualifier)
                .filter(qualifier -> !Bytes.startsWith(qualifier, GenomeHelper.VARIANT_COLUMN_B_PREFIX))
                .map(qualifier -> Integer.parseInt(Bytes.toString(qualifier)))
                .collect(Collectors.toSet());
    }

    /**
     * Parses the stored study rows from the given cells and converts each of them into a {@link Variant}.
     *
     * @param variantCells cells holding serialized study rows
     * @param chromosome   chromosome passed on to the parsed rows
     * @return one variant per parsed study row, in parsing order
     */
    protected List<Variant> parseCurrentVariantsRegion(List<Cell> variantCells, String chromosome) {
        List<VariantTableStudyRow> tableStudyRows = parseVariantStudyRowsFromArchive(variantCells, chromosome);
        HBaseToVariantConverter converter = getHbaseToVariantConverter();
        List<Variant> variants = new ArrayList<>(tableStudyRows.size());
        for (VariantTableStudyRow tableStudyRow : tableStudyRows) {
            variants.add(converter.convert(tableStudyRow));
        }
        return variants;
    }

    /**
     * Decodes the value of every cell as a {@link VariantTableStudyRowsProto} and flattens the
     * contained rows into a single list. Cells with an empty value are skipped.
     *
     * @param variantCells cells holding serialized study rows
     * @param chr          chromosome passed on to the parsed rows
     * @return all study rows found in the cells
     * @throws IllegalStateException if a cell value is not a valid protobuf message
     */
    protected List<VariantTableStudyRow> parseVariantStudyRowsFromArchive(List<Cell> variantCells, String chr) {
        return variantCells.stream().flatMap(c -> {
            try {
                byte[] protoData = CellUtil.cloneValue(c);
                if (protoData != null && protoData.length > 0) {
                    List<VariantTableStudyRow> tableStudyRows =
                            parseVariantStudyRowsFromArchive(chr, VariantTableStudyRowsProto.parseFrom(protoData));
                    return tableStudyRows.stream();
                }
                return Stream.empty();
            } catch (InvalidProtocolBufferException e) {
                throw new IllegalStateException(e);
            }
        }).collect(Collectors.toList());
    }

    /**
     * Wraps every row of the protobuf message into a {@link VariantTableStudyRow} for the current study.
     *
     * @param chr                        chromosome passed on to the created rows
     * @param variantTableStudyRowsProto decoded protobuf message
     * @return one study row per protobuf row
     */
    protected List<VariantTableStudyRow> parseVariantStudyRowsFromArchive(String chr, VariantTableStudyRowsProto
            variantTableStudyRowsProto) {
        return variantTableStudyRowsProto.getRowsList().stream()
                .map(v -> new VariantTableStudyRow(v, chr, getStudyConfiguration().getStudyId()))
                .collect(Collectors.toList());
    }

    /**
     * Converts every variant into a study row, writes it to the output table and appends the
     * created row to {@code rows}.
     *
     * @param context      Context
     * @param analysisVar  Analysis variants
     * @param rows         Variant Table rows; the rows created here are added to this list
     * @param newSampleIds Sample Ids currently processed; may be {@code null}
     */
    protected void updateOutputTable(Context context, Collection<Variant> analysisVar,
                                     List<VariantTableStudyRow> rows, Set<Integer> newSampleIds) {
        int studyId = getStudyConfiguration().getStudyId();
        BiMap<String, Integer> idMapping = getStudyConfiguration().getSampleIds();
        for (Variant variant : analysisVar) {
            VariantTableStudyRow row = updateOutputTable(context, studyId, idMapping, variant, newSampleIds);
            rows.add(row);
        }
    }

    /**
     * Builds the study row for a single variant and writes the resulting put to the output table.
     *
     * <p>A put restricted to {@code newSampleIds} is created when {@link #SPECIFIC_PUT} is enabled
     * and {@code newSampleIds} is not {@code null}; otherwise a regular put is created. Nothing is
     * written when no put is produced.
     *
     * @param context      Context
     * @param studyId      study id of the row
     * @param idMapping    sample name to sample id mapping
     * @param variant      variant to store
     * @param newSampleIds Sample Ids currently processed; may be {@code null}
     * @return the study row created for the variant
     * @throws IllegalStateException if creating or writing the row fails
     */
    protected VariantTableStudyRow updateOutputTable(Context context, int studyId, BiMap<String, Integer> idMapping,
                                                     Variant variant, Set<Integer> newSampleIds) {
        try {
            VariantTableStudyRow row = new VariantTableStudyRow(variant, studyId, idMapping);
            boolean specificPut = context.getConfiguration().getBoolean(SPECIFIC_PUT, true);
            Put put;
            if (specificPut && null != newSampleIds) {
                put = row.createSpecificPut(getHelper(), newSampleIds);
            } else {
                put = row.createPut(getHelper());
            }
            if (put != null) {
                context.write(new ImmutableBytesWritable(getHelper().getOutputTable()), put);
                context.getCounter(COUNTER_GROUP_NAME, "VARIANT_TABLE_ROW-put").increment(1);
            }
            return row;
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the task runner can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new IllegalStateException("Problems updating " + variant, e);
        } catch (RuntimeException | IOException e) {
            throw new IllegalStateException("Problems updating " + variant, e);
        }
    }

    /**
     * Writes a regular put for each of the given study rows to the output table.
     *
     * @param context  Context
     * @param variants study rows to store
     * @throws IllegalStateException if writing fails
     */
    protected void updateOutputTable(Context context, Collection<VariantTableStudyRow> variants) {
        for (VariantTableStudyRow variant : variants) {
            Put put = variant.createPut(getHelper());
            if (put != null) {
                try {
                    context.write(new ImmutableBytesWritable(getHelper().getOutputTable()), put);
                } catch (InterruptedException e) {
                    // Restore the interrupt flag so the task runner can still observe the interruption.
                    Thread.currentThread().interrupt();
                    throw new IllegalStateException(e);
                } catch (IOException e) {
                    throw new IllegalStateException(e);
                }
                context.getCounter(COUNTER_GROUP_NAME, "VARIANT_TABLE_ROW-put").increment(1);
            }
        }
    }

    /**
     * Stores the given study rows in the archive (input) table as a single put on {@code rowKey},
     * one column per study row. Does nothing but log when the list is empty.
     *
     * @param rowKey         row key of the archive row to update
     * @param context        Context
     * @param tableStudyRows study rows to store
     * @throws IllegalStateException if writing fails
     */
    protected void updateArchiveTable(byte[] rowKey, Context context, List<VariantTableStudyRow> tableStudyRows) {
        if (tableStudyRows.isEmpty()) {
            getLog().info("No new data - tableStudyRows emtpy");
            return;
        }
        getLog().info("Store variants: " + tableStudyRows.size());
        Put put = new Put(rowKey);
        byte[] columnFamily = getHelper().getColumnFamily();
        for (VariantTableStudyRow row : tableStudyRows) {
            byte[] value = VariantTableStudyRow.toProto(Collections.singletonList(row), getTimestamp()).toByteArray();
            String column = GenomeHelper.getVariantcolumn(row);
            put.addColumn(columnFamily, Bytes.toBytes(column), value);
        }
        try {
            context.write(new ImmutableBytesWritable(getHelper().getIntputTable()), put);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the task runner can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new IllegalStateException(e);
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
        context.getCounter(COUNTER_GROUP_NAME, "ARCHIVE_TABLE_ROW_PUT").increment(1);
        context.getCounter(COUNTER_GROUP_NAME, "ARCHIVE_TABLE_ROWS_IN_PUT").increment(tableStudyRows.size());
    }

    /**
     * Initialises the converter and the merger, and registers the expected samples: the already
     * indexed samples plus the samples of the files listed under
     * {@code AbstractVariantTableDriver.CONFIG_VARIANT_FILE_IDS}.
     *
     * @throws IllegalStateException if no file ids are configured
     */
    @Override
    protected void setup(Context context) throws IOException,
            InterruptedException {
        super.setup(context);
        this.archiveBatchSize = context.getConfiguration().getInt(ARCHIVE_GET_BATCH_SIZE, 500);

        // Load VCF meta data for columns
        int studyId = getStudyConfiguration().getStudyId();
        resultConverter = new ArchiveResultToVariantConverter(studyId, getHelper().getColumnFamily(), this.getStudyConfiguration());
        variantMerger = new VariantMerger(true);
        variantMerger.setStudyId(Integer.toString(studyId));

        String[] toIdxFileIds = context.getConfiguration().getStrings(AbstractVariantTableDriver.CONFIG_VARIANT_FILE_IDS, new String[0]);
        if (toIdxFileIds.length == 0) {
            throw new IllegalStateException(
                    "File IDs to be indexed not found in configuration: " + AbstractVariantTableDriver.CONFIG_VARIANT_FILE_IDS);
        }
        Set<String> toIndexSampleNames = new HashSet<>();
        // getStrings does not strip whitespace around the comma separated values, so trim before parsing.
        Set<Integer> toIndexFileIdSet = Arrays.stream(toIdxFileIds)
                .map(id -> Integer.valueOf(id.trim()))
                .collect(Collectors.toSet());
        BiMap<Integer, String> sampleIdToSampleName = StudyConfiguration.inverseMap(getStudyConfiguration().getSampleIds());
        for (Entry<Integer, LinkedHashSet<Integer>> entry : getStudyConfiguration().getSamplesInFiles().entrySet()) {
            if (toIndexFileIdSet.contains(entry.getKey())) {
                entry.getValue().forEach(sid -> toIndexSampleNames.add(sampleIdToSampleName.get(sid)));
            }
        }
        getVariantMerger().setExpectedSamples(getIndexedSamples().keySet());
        // Add all samples which are currently being indexed.
        this.currentIndexingSamples = new HashSet<>(toIndexSampleNames);
        getVariantMerger().addExpectedSamples(toIndexSampleNames);
    }

    /**
     * Closes the helper, if there is one.
     */
    @Override
    protected void cleanup(Context context) throws IOException,
            InterruptedException {
        if (null != this.getHelper()) {
            this.getHelper().close();
        }
    }

    /**
     * Prepares one archive row and delegates the actual work to {@link #doMap(VariantMapReduceContext)}.
     *
     * <p>Empty results and the meta row are skipped. For any other row the chromosome and the slice
     * boundaries are derived from the row key, the file ids are read from the column names and the
     * sample ids of these files are collected from the study configuration. After {@code doMap}
     * returns, the recorded timers are published as counters and cleared.
     *
     * @throws IllegalStateException if the row contains a file id without samples in the study configuration
     */
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
        getLog().info("Start mapping key: " + Bytes.toString(key.get()));
        startTime();
        if (value.isEmpty()) {
            context.getCounter(COUNTER_GROUP_NAME, "VCF_RESULT_EMPTY").increment(1);
            return; // TODO search backwards?
        }
        if (Bytes.equals(key.get(), getHelper().getMetaRowKey())) {
            return; // ignore metadata column
        }
        context.getCounter(COUNTER_GROUP_NAME, "VCF_BLOCK_READ").increment(1);

        // Calculate various positions
        byte[] currRowKey = key.get();
        String sliceKey = Bytes.toString(currRowKey);
        VariantTableHelper h = getHelper();
        String chr = h.extractChromosomeFromBlockId(sliceKey);
        Long sliceReg = h.extractSliceFromBlockId(sliceKey);
        long startPos = h.getStartPositionFromSlice(sliceReg);
        long nextStartPos = h.getStartPositionFromSlice(sliceReg + 1);

        Set<Integer> fileIds = extractFileIds(value);
        if (getLog().isDebugEnabled()) {
            getLog().debug("Results contain file IDs : " + StringUtils.join(fileIds, ','));
        }
        Map<Integer, LinkedHashSet<Integer>> samplesInFiles = getStudyConfiguration().getSamplesInFiles();
        Set<Integer> sampleIds = new HashSet<>();
        for (Integer fid : fileIds) {
            LinkedHashSet<Integer> sids = samplesInFiles.get(fid);
            if (sids == null) {
                // Fail with a message naming the culprit instead of an anonymous NullPointerException.
                throw new IllegalStateException("No samples registered in the study configuration for file id "
                        + fid + " found in row " + sliceKey);
            }
            sampleIds.addAll(sids);
        }

        getLog().debug("Processing slice {}", sliceKey);

        VariantMapReduceContext ctx = new VariantMapReduceContext(currRowKey, context, value, fileIds,
                sampleIds, chr, startPos, nextStartPos);

        endTime("1 Prepare slice");

        /* *********************************** */
        /* ********* CALL concrete class ***** */
        doMap(ctx);
        /* *********************************** */

        // Clean up of this slice
        for (Entry<String, Long> entry : this.getTimes().entrySet()) {
            context.getCounter(COUNTER_GROUP_NAME, "VCF_TIMER_" + entry.getKey().replace(' ', '_')).increment(entry.getValue());
        }
        this.getTimes().clear();
        getLog().info("Finished mapping key: " + Bytes.toString(key.get()));
    }

    /**
     * Processes one prepared archive row.
     *
     * @param ctx row key, result, file ids, sample ids and slice boundaries of the current row
     * @throws IOException          propagated to {@link #map(ImmutableBytesWritable, Result, Context)}
     * @throws InterruptedException propagated to {@link #map(ImmutableBytesWritable, Result, Context)}
     */
    abstract void doMap(VariantMapReduceContext ctx) throws IOException, InterruptedException;

    /**
     * Immutable holder for everything {@code map} prepares for a single archive row.
     */
    protected static class VariantMapReduceContext {

        protected final byte[] currRowKey;
        protected final Context context;
        protected final Result value;
        protected final Set<Integer> fileIds;
        protected final Set<Integer> sampleIds;
        private final String chr;
        protected final long startPos;
        protected final long nextStartPos;

        /**
         * @param currRowKey   row key of the current archive row
         * @param context      Hadoop context of the running task
         * @param value        content of the current archive row
         * @param fileIds      file ids found in the row
         * @param sampleIds    sample ids belonging to {@code fileIds}
         * @param chr          chromosome extracted from the row key
         * @param startPos     start position of the current slice
         * @param nextStartPos start position of the following slice
         */
        public VariantMapReduceContext(byte[] currRowKey, Context context, Result value, Set<Integer> fileIds,
                                       Set<Integer> sampleIds, String chr, long startPos, long nextStartPos) {
            this.currRowKey = currRowKey;
            this.context = context;
            this.value = value;
            this.fileIds = fileIds;
            this.sampleIds = sampleIds;
            this.chr = chr;
            this.startPos = startPos;
            this.nextStartPos = nextStartPos;
        }

        /** @return row key of the current archive row */
        public byte[] getCurrRowKey() {
            return currRowKey;
        }

        /** @return Hadoop context of the running task */
        public Context getContext() {
            return context;
        }

        /** @return content of the current archive row */
        public Result getValue() {
            return value;
        }

        /** @return file ids found in the row */
        public Set<Integer> getFileIds() {
            return fileIds;
        }

        /** @return sample ids belonging to the files of the row */
        public Set<Integer> getSampleIds() {
            return sampleIds;
        }

        /** @return chromosome extracted from the row key */
        public String getChromosome() {
            return chr;
        }

        /** @return start position of the current slice */
        public long getStartPos() {
            return startPos;
        }

        /** @return start position of the following slice */
        public long getNextStartPos() {
            return nextStartPos;
        }
    }
}