/*
 * Copyright 2015-2016 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.opencb.opencga.storage.hadoop.variant.archive;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.protobuf.VcfSliceProtos.VcfSlice;
import org.opencb.biodata.tools.variant.converters.proto.VariantToProtoVcfRecord;
import org.opencb.biodata.tools.variant.converters.proto.VariantToVcfSliceConverter;
import org.opencb.commons.run.ParallelTaskRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;

/**
 * @author Matthias Haimel mh719+git@cam.ac.uk
 */
public class VariantHbaseTransformTask implements ParallelTaskRunner.Task<Variant, VcfSlice> {

    protected final Logger logger = LoggerFactory.getLogger(VariantHbaseTransformTask.class);
    private static final List<VcfSlice> EMPTY_LIST = Collections.emptyList();
    private final VariantToVcfSliceConverter converter;
    private final ArchiveHelper helper;
    private final Set<String> storedChr;
    private final Set<String> lookup;
    private final Map<String, List<Variant>> buffer;
    private final LinkedList<String> lookupOrder;
    private final AtomicLong timeProto = new AtomicLong(0);
    private final AtomicLong timeIndex = new AtomicLong(0);
    private final AtomicLong timePut = new AtomicLong(0);
    private final AtomicInteger bufferSize = new AtomicInteger(200);
    private final TableName tableName;
    private Connection connection;
    private BufferedMutator tableMutator;

    /**
     * @param helper {@link ArchiveHelper}
     * @param table  {@link String} HBase table name
     */
    public VariantHbaseTransformTask(ArchiveHelper helper, String table) {
        converter = new VariantToVcfSliceConverter();
        this.helper = helper;
        storedChr = new HashSet<>();
        lookup = new HashSet<>();
        buffer = new HashMap<>();
        lookupOrder = new LinkedList<>();
        this.tableName = table == null ? null : TableName.valueOf(table);
    }
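    /*
     * Usage sketch (illustrative only): how this task is typically driven by a
     * ParallelTaskRunner-style loop. The variable names (archiveHelper,
     * variantBatch) are hypothetical and not part of this codebase.
     *
     *   VariantHbaseTransformTask task = new VariantHbaseTransformTask(archiveHelper, "archive_table");
     *   task.pre();                                        // open HBase connection and BufferedMutator
     *   List<VcfSlice> slices = task.apply(variantBatch);  // buffer variants, flush completed slices
     *   slices.addAll(task.drain());                       // flush anything still buffered
     *   task.post();                                       // log timings, close HBase resources
     */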
    public void setBufferSize(Integer size) {
        this.bufferSize.set(size);
    }

    public int getBufferSize() {
        return bufferSize.get();
    }

    @Override
    public List<VcfSlice> apply(List<Variant> batch) {
        return encodeVariants(batch);
    }

    protected List<VcfSlice> encodeVariants(List<Variant> variants) {
        long curr = System.currentTimeMillis();
        variants.forEach(var -> addVariant(var));
        this.timeIndex.addAndGet(System.currentTimeMillis() - curr);
        List<VcfSlice> data = checkSlices(getBufferSize());
        curr = System.currentTimeMillis();
        submit(data);
        this.timePut.addAndGet(System.currentTimeMillis() - curr);
        return data;
    }

    @Override
    public List<VcfSlice> drain() {
        List<VcfSlice> data = checkSlices(0);
        submit(data);
        return data;
    }

    private void submit(List<VcfSlice> data) {
        if (null != this.tableName) {
            List<Put> putList = data.stream().map(s -> this.getHelper().wrap(s)).collect(Collectors.toList());
            try {
                this.tableMutator.mutate(putList);
            } catch (IOException e) {
                throw new RuntimeException(String.format("Problems submitting %s data to HBase table %s",
                        putList.size(), this.tableName.getNameAsString()), e);
            }
        }
    }

    private List<VcfSlice> checkSlices(int limit) {
        if (lookupOrder.size() < limit) {
            return EMPTY_LIST;
        }
        SortedMap<Long, String> keys = orderKeys(lookupOrder, storedChr, getHelper().getChunkSize());
        List<VcfSlice> retSlice = new ArrayList<>();
        while (lookupOrder.size() > limit) { // drain down to 'limit' buffered slices
            if (keys.isEmpty()) {
                keys = orderKeys(lookupOrder, storedChr, getHelper().getChunkSize()); // next chromosome starts
            }
            Long firstKey = keys.firstKey();
            String key = keys.remove(firstKey);
            lookupOrder.remove(key);
            List<Variant> data = buffer.remove(key);
            if (!data.isEmpty()) {
                String chr = data.get(0).getChromosome();
                if (storedChr.add(chr)) {
                    logger.debug("Flush for {}: {}", chr, lookupOrder);
                    lookup.clear(); // clear for each chromosome
                    lookup.addAll(lookupOrder);
                }
                long sliceStart = getHelper().extractPositionFromBlockId(key);
                long curr = System.currentTimeMillis();
                VcfSlice slice = converter.convert(data, (int) sliceStart);
                this.timeProto.addAndGet(System.currentTimeMillis() - curr);
                retSlice.add(slice);
            }
        }
        return retSlice;
    }

    private SortedMap<Long, String> orderKeys(LinkedList<String> lookupOrder, Set<String> storedChr, int chunkSize) {
        // first finish off the current chromosome
        List<String> currentChr = lookupOrder.stream()
                .filter(s -> storedChr.contains(getHelper().splitBlockId(s)[0]))
                .collect(Collectors.toList());
        if (currentChr.isEmpty()) {
            currentChr = new ArrayList<>(lookupOrder);
        }
        // find the minimal position
        TreeMap<Long, String> idx = new TreeMap<>();
        for (String slice : currentChr) {
            Long extr = getHelper().extractPositionFromBlockId(slice);
            idx.put(extr, slice);
        }
        return idx;
    }

    private void addVariant(Variant var) {
        String chromosome = var.getChromosome();
        long[] coveredSlicePositions = getCoveredSlicePositions(var);
        for (long slicePos : coveredSlicePositions) {
            String blockKey = getHelper().generateBlockId(chromosome, slicePos);
            addVariant(blockKey, var);
        }
    }

    private void addVariant(String blockKey, Variant var) {
        List<Variant> list = null;
        if (!lookup.contains(blockKey)) {
            lookup.add(blockKey);
            lookupOrder.add(blockKey);
            list = new ArrayList<>();
            buffer.put(blockKey, list);
        } else {
            list = buffer.get(blockKey);
        }
        if (list == null) {
            logger.error("Current lookup queue: " + StringUtils.join(lookupOrder, ','));
            logger.error(String.format("Current variant: %s", var.getImpl()));
            throw new IllegalStateException("Input file not sorted!!!: " + blockKey);
        }
        list.add(var);
    }
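    /*
     * Worked example of the slice arithmetic below (assuming
     * VariantToProtoVcfRecord.getSlicePosition floors a position to its chunk
     * boundary, as the loop in getCoveredSlicePositions relies on): with
     * chunkSize = 1000, a variant spanning 999-1001 covers the slice positions
     * {0, 1000}, so addVariant(Variant) buffers it under two block keys.
     */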
    private long[] getCoveredSlicePositions(Variant var) {
        return getCoveredSlicePositions(var.getChromosome(), var.getStart(), var.getEnd(), getHelper().getChunkSize());
    }

    public static long[] getCoveredSlicePositions(String chromosome, long start, long end, int chunkSize) {
        long startChunk = VariantToProtoVcfRecord.getSlicePosition((int) start, chunkSize);
        long endChunk = VariantToProtoVcfRecord.getSlicePosition((int) end, chunkSize);
        if (endChunk == startChunk) {
            return new long[]{startChunk};
        }
        int len = (int) ((endChunk - startChunk) / chunkSize) + 1;
        long[] ret = new long[len];
        for (int i = 0; i < len; ++i) {
            ret[i] = startChunk + (((long) i) * chunkSize);
        }
        return ret;
    }

    @Override
    public void pre() {
        if (null != this.tableName) {
            try {
                logger.info("Open connection using " + getHelper().getConf());
                connection = ConnectionFactory.createConnection(getHelper().getConf());
                tableMutator = connection.getBufferedMutator(this.tableName);
            } catch (IOException e) {
                throw new RuntimeException("Failed to connect to HBase", e);
            }
        }
    }

    @Override
    public void post() {
        logger.info(String.format("Time norm2proto: %s", this.timeProto.get()));
        logger.info(String.format("Time idx: %s", this.timeIndex.get()));
        if (null != this.tableName) {
            if (null != this.tableMutator) {
                try {
                    this.tableMutator.close();
                } catch (IOException e) {
                    logger.error("Problem closing Table mutator from HBase", e);
                } finally {
                    this.tableMutator = null;
                }
            }
            if (null != connection) {
                try {
                    connection.close();
                } catch (IOException e) {
                    logger.error("Issue with closing DB connection", e);
                } finally {
                    connection = null;
                }
                logger.info(String.format("Time put: %s", this.timePut.get()));
            }
        }
    }

    private ArchiveHelper getHelper() {
        return this.helper;
    }
}