/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
*
*/
package org.opencb.opencga.storage.hadoop.variant;
import com.google.protobuf.MessageLite;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.phoenix.query.QueryConstants;
import org.apache.phoenix.schema.types.PUnsignedInt;
import org.apache.phoenix.schema.types.PVarchar;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.opencga.storage.hadoop.utils.HBaseManager;
import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveDriver;
import org.opencb.opencga.storage.hadoop.variant.index.VariantTableStudyRow;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiFunction;
import java.util.stream.Collectors;
/**
* @author Matthias Haimel mh719+git@cam.ac.uk.
*/
public class GenomeHelper implements AutoCloseable {
// Per-instance logger (declared non-static).
private final Logger logger = LoggerFactory.getLogger(GenomeHelper.class);
// Configuration key holding the study id; read by the constructor with a default of -1.
public static final String CONFIG_STUDY_ID = "opencga.study.id";
// Upload HBase jars and jars for any of the configured job classes via the distributed cache (tmpjars).
public static final String CONFIG_HBASE_ADD_DEPENDENCY_JARS = "opencga.hbase.addDependencyJars";
// Configuration key for the HBase column family name; falls back to DEFAULT_COLUMN_FAMILY.
public static final String CONFIG_HBASE_COLUMN_FAMILY = "opencga.hbase.column_family";
public static final String METADATA_PREFIX = "_";
// Row key used for the metadata row (see getMetaRowKey / getMetaRowKeyString).
public static final String DEFAULT_METADATA_ROW_KEY = "_METADATA";
// Default separator between chromosome and slice in block ids; only its first character is used.
public static final String DEFAULT_ROWKEY_SEPARATOR = "_";
public static final String DEFAULT_COLUMN_FAMILY = "0"; // MUST BE UPPER CASE!!!
// Prefix of the column qualifiers that hold variants (see getVariantColumns / getVariantcolumn).
public static final String VARIANT_COLUMN_PREFIX = "_V";
public static final byte[] VARIANT_COLUMN_B_PREFIX = Bytes.toBytes(VARIANT_COLUMN_PREFIX);
// Chunk (slice) size; initialised to the archive default and overwritten from the configuration in the constructor.
private final AtomicInteger chunkSize = new AtomicInteger(ArchiveDriver.DEFAULT_CHUNK_SIZE);
// Separator character used when building and splitting block ids.
private final char separator;
// Column family name as bytes.
private final byte[] columnFamily;
// Metadata row key as bytes and as String (both derived from DEFAULT_METADATA_ROW_KEY).
private final byte[] metaRowKey;
private final String metaRowKeyString;
private final Configuration conf;
// Closed by close().
protected final HBaseManager hBaseManager;
// Study id read from the configuration; -1 when not configured.
private final int studyId;
public GenomeHelper(Configuration conf, Connection connection) {
this.conf = conf;
this.separator = conf.get(ArchiveDriver.CONFIG_ARCHIVE_ROW_KEY_SEPARATOR, DEFAULT_ROWKEY_SEPARATOR).charAt(0);
// TODO: Check if columnFamily is upper case
// Phoenix local indexes fail if the default_column_family is lower case
// TODO: Report this bug to phoenix JIRA
this.columnFamily = Bytes.toBytes(conf.get(CONFIG_HBASE_COLUMN_FAMILY, DEFAULT_COLUMN_FAMILY));
this.metaRowKeyString = DEFAULT_METADATA_ROW_KEY;
this.metaRowKey = Bytes.toBytes(metaRowKeyString);
this.chunkSize.set(conf.getInt(ArchiveDriver.CONFIG_ARCHIVE_CHUNK_SIZE, ArchiveDriver.DEFAULT_CHUNK_SIZE));
this.studyId = conf.getInt(CONFIG_STUDY_ID, -1);
this.hBaseManager = new HBaseManager(conf, connection);
}
/**
 * Creates a helper from the configuration only; a {@code null} connection is passed on to
 * {@link #GenomeHelper(Configuration, Connection)}.
 *
 * @param conf Configuration to read the settings from
 */
public GenomeHelper(Configuration conf) {
this(conf, null);
}
/**
 * Creates a helper from another helper, reusing its configuration.
 * <p>
 * The other helper's connection is reused only when its {@link HBaseManager} reports
 * {@code getCloseConnection() == false}; otherwise {@code null} is passed on.
 * NOTE(review): presumably {@code getCloseConnection()} means the manager owns (and will close)
 * its connection, so it must not be shared - confirm against HBaseManager.
 *
 * @param other Helper to copy the configuration (and possibly the connection) from
 */
public GenomeHelper(GenomeHelper other) {
this(other.getConf(), other.getHBaseManager().getCloseConnection() ? null : other.getHBaseManager()
.getConnection());
}
/**
 * @return the configuration this helper was created with
 */
public Configuration getConf() {
    return this.conf;
}
/**
 * @return the {@link HBaseManager} created by this helper's constructor
 */
public HBaseManager getHBaseManager() {
    return this.hBaseManager;
}
/**
 * Describes where a class was loaded from, as two newline-terminated lines:
 * the code source location followed by the URL of the class file resource.
 * <p>
 * Classes without a code source (e.g. JDK classes loaded by the bootstrap loader) yield
 * {@code null} as first line instead of failing with a {@link NullPointerException}.
 *
 * @param clazz Class to describe
 * @return Two lines, each terminated by {@code \n}
 */
public static String printClassJarPath(Class<?> clazz) {
    final String nl = "\n";
    // getCodeSource() is null for classes defined by the bootstrap class loader.
    final java.security.CodeSource codeSource = clazz.getProtectionDomain().getCodeSource();
    final Object location = codeSource == null ? null : codeSource.getLocation();
    final StringBuilder sb = new StringBuilder();
    sb.append(location).append(nl);
    sb.append(clazz.getResource('/' + clazz.getName().replace('.', '/') + ".class")).append(nl);
    return sb.toString();
}
/**
 * Renders all JVM system properties, one {@code key - value} pair per line.
 *
 * @return Text with one newline-terminated line per system property
 */
public static String printSystemProperties() {
    final StringBuilder out = new StringBuilder();
    System.getProperties().forEach((key, value) -> {
        out.append(key).append(" - ").append(value);
        out.append("\n");
    });
    return out.toString();
}
/**
 * Renders every entry of the configuration, one {@code key - value} pair per line.
 *
 * @param conf Configuration to print
 * @return Text with one newline-terminated line per configuration entry
 */
public static String printConfig(Configuration conf) {
    final StringBuilder out = new StringBuilder();
    final Iterator<Map.Entry<String, String>> entries = conf.iterator();
    while (entries.hasNext()) {
        final Map.Entry<String, String> entry = entries.next();
        out.append(entry.getKey()).append(" - ").append(entry.getValue()).append("\n");
    }
    return out.toString();
}
/**
 * Stores the archive chunk size in the configuration.
 *
 * @param conf Configuration to update
 * @param size Chunk size; must not be {@code null}
 */
public static void setChunkSize(Configuration conf, Integer size) {
    final int value = size.intValue();
    conf.setInt(ArchiveDriver.CONFIG_ARCHIVE_CHUNK_SIZE, value);
}
/**
 * Stores the study id in the configuration under {@link #CONFIG_STUDY_ID}.
 *
 * @param conf    Configuration to update
 * @param studyId Study id; must not be {@code null}
 */
public static void setStudyId(Configuration conf, Integer studyId) {
    final int value = studyId.intValue();
    conf.setInt(CONFIG_STUDY_ID, value);
}
/**
 * @return the study id read from the configuration, or -1 when none was configured
 */
public int getStudyId() {
    return studyId;
}
/**
 * @return the character separating chromosome and slice in block ids
 */
public char getSeparator() {
    return this.separator;
}
/**
 * @return the column family name as bytes (the internal array, not a copy)
 */
public byte[] getColumnFamily() {
    return this.columnFamily;
}
/**
 * @return the current chunk (slice) size
 */
public int getChunkSize() {
    return this.chunkSize.get();
}
/**
 * Maps a genomic position to its slice id.
 *
 * @param position Genomic position
 * @return {@code position / chunkSize} for a positive chunk size; the position itself otherwise
 */
public long getSliceId(long position) {
    final int size = getChunkSize();
    if (size <= 0) {
        // No chunking configured: every position is its own slice.
        return position;
    }
    return position / size;
}
/**
 * Maps a slice id back to the first genomic position of that slice; the inverse of
 * {@link #getSliceId(long)}.
 *
 * @param slice Slice id
 * @return {@code slice * chunkSize} for a positive chunk size; the slice itself otherwise
 */
public long getStartPositionFromSlice(long slice) {
    final int size = getChunkSize();
    // getSliceId(long) uses the position as slice id when the chunk size is not positive,
    // so the inverse has to be the identity in that case (instead of 0 or a negative value).
    return size > 0 ? slice * (long) size : slice;
}
/**
 * @return the metadata row key as bytes (the internal array, not a copy)
 */
public byte[] getMetaRowKey() {
    return this.metaRowKey;
}
/**
 * @return the metadata row key as String
 */
public String getMetaRowKeyString() {
    return this.metaRowKeyString;
}
/**
 * Builds the block id (row key) of the slice that contains a position.
 * <ul>
 * <li>The position is turned into a slice id with {@link #getSliceId(long)}
 * <li>Chromosome and slice id are combined by {@link #generateBlockIdFromSlice(String, long)},
 * which zero-pads the slice id to 12 digits
 * </ul>
 * e.g. with chunk size 100 and separator _, chromosome 2 at position 1234 results in
 * 2_000000000012
 *
 * @param chrom    Chromosome name
 * @param position Genomic position
 * @return {@link String} Row key string
 */
public String generateBlockId(String chrom, long position) {
    final long slice = getSliceId(position);
    return generateBlockIdFromSlice(chrom, slice);
}
/**
 * Builds a block id from a chromosome and an already computed slice id:
 * standard chromosome name, separator, slice id zero-padded to 12 digits.
 * <p>
 * The slice id is formatted with {@link Locale#ROOT} so the digits of the row key do not depend
 * on the default locale of the JVM that generates it.
 *
 * @param chrom Chromosome name; normalised with {@link #standardChromosome(String)}
 * @param slice Slice id
 * @return {@link String} Row key string
 */
public String generateBlockIdFromSlice(String chrom, long slice) {
    StringBuilder sb = new StringBuilder(standardChromosome(chrom));
    sb.append(getSeparator());
    // "%d" localises digits when formatted with the default locale; row keys must be stable.
    sb.append(String.format(Locale.ROOT, "%012d", slice));
    return sb.toString();
}
/**
 * Byte form of the block id produced by {@link #generateBlockId(String, long)}.
 *
 * @param chrom Chromosome
 * @param start Position
 * @return Block id converted with {@code Bytes.toBytes}
 */
public byte[] generateBlockIdAsBytes(String chrom, int start) {
    final String blockId = generateBlockId(chrom, start);
    return Bytes.toBytes(blockId);
}
/**
 * Extracts the chromosome part of a block id.
 *
 * @param blockId Block id as created by {@link #generateBlockId(String, long)}
 * @return Chromosome part of the block id
 * @throws IllegalStateException if the block id does not contain the separator
 */
public String extractChromosomeFromBlockId(String blockId) {
    final String[] parts = splitBlockId(blockId);
    return extractChromosomeFromBlockId(parts);
}
/**
 * Extracts the chromosome from an already split block id.
 *
 * @param strings Block id parts as returned by {@link #splitBlockId(String)}
 * @return First element of the array
 */
public String extractChromosomeFromBlockId(String[] strings) {
    final String chromosome = strings[0];
    return chromosome;
}
/**
 * Extracts the slice id of a block id.
 *
 * @param blockId Block id as created by {@link #generateBlockId(String, long)}
 * @return Slice id parsed from the last part of the block id
 * @throws IllegalStateException if the block id does not contain the separator
 * @throws NumberFormatException if the last part is not a number
 */
public Long extractSliceFromBlockId(String blockId) {
    final String[] parts = splitBlockId(blockId);
    final String slice = parts[1];
    return Long.valueOf(slice);
}
/**
 * Extracts the start position of the slice a block id refers to.
 *
 * @param blockId Block id as created by {@link #generateBlockId(String, long)}
 * @return Start position, computed by {@link #getStartPositionFromSlice(long)} from the slice id
 * @throws IllegalStateException if the block id does not contain the separator
 * @throws NumberFormatException if the slice part is not a number
 */
public Long extractPositionFromBlockId(String blockId) {
    final Long slice = extractSliceFromBlockId(blockId);
    return getStartPositionFromSlice(slice);
}
/**
 * Splits a block id into contig name and slice.
 * <p>
 * Only the last separator splits the id, so contig names that contain the separator themselves
 * (e.g. NC_007605) are kept intact.
 *
 * @param blockId Block id to split
 * @return Array of two elements: contig name and slice
 * @throws IllegalStateException if the block id does not contain the separator
 */
public String[] splitBlockId(String blockId) {
    final char sep = getSeparator();
    final String[] tokens = StringUtils.splitPreserveAllTokens(blockId, sep);
    final int last = tokens.length - 1;
    if (last < 1) {
        throw new IllegalStateException(
                String.format("Block ID is not valid - expected 2 or more blocks separated by `%s`; value `%s`", sep, blockId));
    }
    // Everything before the final token belongs to the contig name; re-join it with the separator.
    final String contig = String.join(String.valueOf(sep), Arrays.asList(tokens).subList(0, last));
    return new String[]{contig, tokens[last]};
}
/* ***************
* Variant Row Key helper methods
*
* Generators and extractors
*
*/
/**
 * Generates a variant row key for a chromosome and position with empty reference and alternate.
 *
 * @param chrom    Chromosome name
 * @param position Genomic position
 * @return Row key bytes
 */
public byte[] generateVariantRowKey(String chrom, int position) {
    final String noAllele = "";
    return generateVariantRowKey(chrom, position, noAllele, noAllele);
}
/**
 * Generates the variant row key from the chromosome, start, reference and alternate of a variant.
 *
 * @param var Variant to generate the row key for
 * @return Row key bytes
 */
public byte[] generateVariantRowKey(Variant var) {
    final String chromosome = var.getChromosome();
    final int start = var.getStart();
    final String reference = var.getReference();
    final String alternate = var.getAlternate();
    return generateVariantRowKey(chromosome, start, reference, alternate);
}
/**
 * Generates a variant row key from chromosome, position, reference and alternate. <br>
 * Layout of the key:
 * <ul>
 * <li>chromosome (varchar), written as given
 * <li>separator byte
 * <li>position (unsigned int, fixed width, so no separator follows)
 * <li>reference (varchar)
 * <li>separator byte and alternate (varchar), only when the alternate is not empty
 * </ul>
 *
 * @param chrom    Chromosome name
 * @param position Genomic position
 * @param ref      Reference allele
 * @param alt      Alternate allele; may be empty
 * @return Row key bytes
 */
public byte[] generateVariantRowKey(String chrom, int position, String ref, String alt) {
    final int separatorLength = QueryConstants.SEPARATOR_BYTE_ARRAY.length;

    int length = PVarchar.INSTANCE.estimateByteSizeFromLength(chrom.length());
    length += separatorLength;
    length += PUnsignedInt.INSTANCE.estimateByteSize(position);
    length += PVarchar.INSTANCE.estimateByteSizeFromLength(ref.length());

    // If the last element is empty, neither the element nor its separator is written.
    final boolean hasAlternate = !alt.isEmpty();
    if (hasAlternate) {
        length += separatorLength;
        length += PVarchar.INSTANCE.estimateByteSizeFromLength(alt.length());
    }

    final byte[] rowKey = new byte[length];
    int cursor = PVarchar.INSTANCE.toBytes(chrom, rowKey, 0);
    rowKey[cursor] = QueryConstants.SEPARATOR_BYTE;
    cursor++;
    cursor += PUnsignedInt.INSTANCE.toBytes(position, rowKey, cursor);
    // Separator not needed. PUnsignedInt.INSTANCE.isFixedWidth() = true
    cursor += PVarchar.INSTANCE.toBytes(ref, rowKey, cursor);
    if (hasAlternate) {
        rowKey[cursor] = QueryConstants.SEPARATOR_BYTE;
        cursor++;
        PVarchar.INSTANCE.toBytes(alt, rowKey, cursor);
    }
    return rowKey;
}
/**
 * Generates pre-split row keys using a hard-coded table of human chromosomes (1-22, X, Y) and
 * their lengths.
 * NOTE(review): the lengths look like the GRCh37 assembly - confirm.
 * TODO: Query CellBase to get the chromosomes and sizes!
 *
 * @param numberOfSplits Number of splits
 * @param keyGenerator   Function to generate the rowKeys given a chromosome and a start
 * @return List of splits
 */
public static List<byte[]> generateBootPreSplitsHuman(int numberOfSplits, BiFunction<String, Integer, byte[]> keyGenerator) {
    final String[] chromosomes = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
            "16", "17", "18", "19", "20", "21", "22", "X", "Y", };
    final long[] lengths = {249250621, 243199373, 198022430, 191154276, 180915260, 171115067, 159138663,
            146364022, 141213431, 135534747, 135006516, 133851895, 115169878, 107349540, 102531392, 90354753,
            81195210, 78077248, 59128983, 63025520, 48129895, 51304566, 155270560, 59373566, };

    // Both arrays are parallel: lengths[n] is the length of chromosomes[n].
    final Map<String, Long> lengthByChromosome = new HashMap<>();
    int idx = 0;
    for (String chromosome : chromosomes) {
        lengthByChromosome.put(chromosome, lengths[idx]);
        idx++;
    }
    return generateBootPreSplits(numberOfSplits, keyGenerator, lengthByChromosome);
}
/**
 * Generates evenly spaced pre-split row keys over a set of regions.
 * <p>
 * The regions are laid out one after another in the order of their row keys, and a split key is
 * generated every {@code total / numberOfSplits} bases (but never less than one base, so the
 * loop always advances).
 *
 * @param numberOfSplits Number of splits; must be positive
 * @param keyGenerator   Function to generate the rowKeys given a chromosome and a start
 * @param regionsMap     Length of each region, by region (chromosome) name
 * @return List of split row keys; empty when the regions have no length
 * @throws IllegalArgumentException if {@code numberOfSplits} is not positive
 */
static List<byte[]> generateBootPreSplits(int numberOfSplits, BiFunction<String, Integer, byte[]> keyGenerator,
                                          Map<String, Long> regionsMap) {
    // Zero would fail with an ArithmeticException and a negative value would never terminate.
    if (numberOfSplits <= 0) {
        throw new IllegalArgumentException("Number of splits must be positive; value `" + numberOfSplits + "`");
    }
    // Create a sorted map for the regions that sorts as will sort HBase given the row_key generator
    // In archive table, chr1 goes after chr19, and in Variants table, chr1 is always the first
    SortedMap<String, Long> sortedRegions = new TreeMap<>((s1, s2) ->
            Bytes.compareTo(keyGenerator.apply(s1, 0), keyGenerator.apply(s2, 0)));
    sortedRegions.putAll(regionsMap);
    long total = sortedRegions.values().stream().mapToLong(Long::longValue).sum();
    // A step of 0 (total < numberOfSplits) would loop forever; advance by at least one base.
    long chunkSize = Math.max(1L, total / numberOfSplits);
    List<byte[]> splitList = new ArrayList<>();
    for (long splitPos = chunkSize; splitPos < total; splitPos += chunkSize) {
        // Find the region that contains the absolute split position.
        long tmpPos = 0;
        String chr = null;
        for (Map.Entry<String, Long> entry : sortedRegions.entrySet()) {
            long v = entry.getValue();
            if ((tmpPos + v) > splitPos) {
                chr = entry.getKey();
                break;
            }
            tmpPos += v;
        }
        // Position relative to the start of the region found.
        splitList.add(keyGenerator.apply(chr, (int) (splitPos - tmpPos)));
    }
    return splitList;
}
/**
 * Generates the row key prefix shared by all variants at a chromosome and position:
 * chromosome (varchar), separator byte, position (unsigned int).
 *
 * @param chrom    Chromosome name, written as given
 * @param position Genomic position; narrowed to {@code int}
 * @return Row key prefix bytes
 */
public byte[] generateVariantPositionPrefix(String chrom, Long position) {
    final int start = position.intValue();

    int length = PVarchar.INSTANCE.estimateByteSizeFromLength(chrom.length());
    length += QueryConstants.SEPARATOR_BYTE_ARRAY.length;
    length += PUnsignedInt.INSTANCE.estimateByteSize(start);

    final byte[] prefix = new byte[length];
    int cursor = PVarchar.INSTANCE.toBytes(chrom, prefix, 0);
    prefix[cursor] = QueryConstants.SEPARATOR_BYTE;
    cursor++;
    PUnsignedInt.INSTANCE.toBytes(start, prefix, cursor);
    return prefix;
}
/**
 * Decodes a variant row key back into a {@link Variant}.
 * <p>
 * Expects the layout: chromosome, 0 byte, position (unsigned int), reference and, optionally,
 * 0 byte plus alternate. A key without a second 0 byte yields an empty alternate.
 *
 * @param variantRowKey Row key bytes
 * @return Variant built from chromosome, position, reference and alternate
 * @throws IllegalStateException if the {@link Variant} constructor fails with a RuntimeException;
 *                               the message contains the decoded fields and the key as hex string
 */
public Variant extractVariantFromVariantRowKey(byte[] variantRowKey) {
// The first 0 byte ends the chromosome.
int chrPosSeparator = ArrayUtils.indexOf(variantRowKey, (byte) 0);
String chromosome = (String) PVarchar.INSTANCE.toObject(variantRowKey, 0, chrPosSeparator, PVarchar.INSTANCE);
// The position is read as a fixed number of bytes right after the separator.
Integer intSize = PUnsignedInt.INSTANCE.getByteSize();
int position = (Integer) PUnsignedInt.INSTANCE.toObject(variantRowKey, chrPosSeparator + 1, intSize, PUnsignedInt.INSTANCE);
int referenceOffset = chrPosSeparator + 1 + intSize;
// Search starts after the position bytes, so 0 bytes inside the position are not taken as separator.
int refAltSeparator = ArrayUtils.indexOf(variantRowKey, (byte) 0, referenceOffset);
String reference;
String alternate;
if (refAltSeparator < 0) {
// No separator: the rest of the key is the reference and there is no alternate.
reference = (String) PVarchar.INSTANCE.toObject(variantRowKey, referenceOffset, variantRowKey.length - referenceOffset,
PVarchar.INSTANCE);
alternate = "";
} else {
reference = (String) PVarchar.INSTANCE.toObject(variantRowKey, referenceOffset, refAltSeparator - referenceOffset,
PVarchar.INSTANCE);
alternate = (String) PVarchar.INSTANCE.toObject(variantRowKey, refAltSeparator + 1,
variantRowKey.length - (refAltSeparator + 1), PVarchar.INSTANCE);
}
try {
return new Variant(chromosome, position, reference, alternate);
} catch (RuntimeException e) {
// Rethrow with the decoded values and the raw key to ease debugging.
throw new IllegalStateException("Problems creating variant using [chr:"
+ chromosome + ", pos:" + position + ", ref:" + reference + ", alt:" + alternate + "];[hexstring:"
+ Bytes.toHex(variantRowKey) + "]", e);
}
}
//
// public String[] splitVariantRowkey (String rowkey) {
// char sep = getSeparator();
// String[] split = StringUtils.splitPreserveAllTokens(rowkey, sep);
// if (split.length < 2)
// throw new IllegalStateException(String.format("Variant rowkey is not valid - expected >2 blocks separated by `%s`; value
// `%s`", sep,
// rowkey));
// return split;
// }
/**
 * Creates a standard chromosome name from the provided string by removing a leading
 * {@code chr} prefix, e.g. {@code chr2} becomes {@code 2}. Names without the prefix are
 * returned unchanged.
 *
 * @param chrom Chromosome string
 * @return String chromosome name
 */
public String standardChromosome(String chrom) {
    final String prefix = "chr";
    if (chrom.startsWith(prefix)) {
        // Drop the whole prefix; cutting only two characters would leave "r2" for "chr2".
        return chrom.substring(prefix.length());
    } // TODO MT, X, Y, ...
    return chrom;
}
/**
 * Wraps a protobuf message into a {@link Put} for one cell of this helper's column family.
 *
 * @param column Column qualifier
 * @param row    Row key
 * @param meta   Message to store; serialised with {@code toByteArray()}
 * @param <T>    Message type
 * @return Put holding the serialised message
 */
public <T extends MessageLite> Put wrapAsPut(byte[] column, byte[] row, T meta) {
    final byte[] payload = meta.toByteArray();
    final Put result = new Put(row);
    result.addColumn(getColumnFamily(), column, payload);
    return result;
}
/**
 * Closes the underlying {@link HBaseManager}.
 *
 * @throws IOException if closing the manager fails
 */
@Override
public void close() throws IOException {
    hBaseManager.close();
}
/**
 * Selects the cells whose column qualifier starts with {@link #VARIANT_COLUMN_B_PREFIX}.
 *
 * @param cells Cells to filter
 * @return Matching cells, in their original order
 */
public static List<Cell> getVariantColumns(Cell[] cells) {
    final List<Cell> variantCells = new ArrayList<>();
    for (Cell cell : cells) {
        final byte[] qualifier = CellUtil.cloneQualifier(cell);
        if (Bytes.startsWith(qualifier, VARIANT_COLUMN_B_PREFIX)) {
            variantCells.add(cell);
        }
    }
    return variantCells;
}
/**
 * Builds the variant column name of a row: {@link #VARIANT_COLUMN_PREFIX} followed by position,
 * reference and alternate, each preceded by {@code _}.
 *
 * @param row Row to build the column name for
 * @return Column name
 */
public static String getVariantcolumn(VariantTableStudyRow row) {
    final StringBuilder name = new StringBuilder(VARIANT_COLUMN_PREFIX);
    name.append("_").append(row.getPos());
    name.append("_").append(row.getRef());
    name.append("_").append(row.getAlt());
    return name.toString();
}
}