/*
* Copyright 2015-2016 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.opencga.storage.hadoop.variant.index.phoenix;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.phoenix.schema.PTable;
import org.apache.phoenix.schema.PTableType;
import org.apache.phoenix.schema.types.*;
import org.apache.phoenix.util.SchemaUtil;
import org.opencb.opencga.storage.core.metadata.StudyConfiguration;
import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptorUtils;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryException;
import org.opencb.opencga.storage.hadoop.variant.GenomeHelper;
import org.opencb.opencga.storage.hadoop.variant.index.VariantTableStudyRow;
import org.opencb.opencga.storage.hadoop.variant.index.phoenix.PhoenixHelper.Column;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.*;
import static org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor.VariantQueryParams.ANNOT_CONSERVATION;
import static org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor.VariantQueryParams.ANNOT_FUNCTIONAL_SCORE;
import static org.opencb.opencga.storage.hadoop.variant.index.phoenix.VariantPhoenixHelper.VariantColumn.*;
/**
* Created on 15/12/15.
*
* @author Jacobo Coll <jacobo167@gmail.com>
*/
public class VariantPhoenixHelper {
/** Prefix prepended to the stats column names built by getStatsColumn, getMafColumn and getMgfColumn; currently empty. */
public static final String STATS_PREFIX = "";
/** Byte form of {@link #STATS_PREFIX}. */
public static final byte[] STATS_PREFIX_BYTES = Bytes.toBytes(STATS_PREFIX);
/** Prefix shared by the annotation column names declared in {@link VariantColumn}. */
public static final String ANNOTATION_PREFIX = "A_";
/** Prefix of the columns built by the getPopulationFrequencyColumn methods. */
public static final String POPULATION_FREQUENCY_PREFIX = ANNOTATION_PREFIX + "PF_";
/** Prefix of the functional score columns (the CADD columns and the ones built by getFunctionalScoreColumn). */
public static final String FUNCTIONAL_SCORE_PREFIX = ANNOTATION_PREFIX + "FS_";
/** Suffix of the VARBINARY stats column built by getStatsColumn (presumably a protobuf payload; confirm with the writer). */
public static final String STATS_PROTOBUF_SUFIX = "_PB";
/** Byte form of {@link #STATS_PROTOBUF_SUFIX}. */
public static final byte[] STATS_PROTOBUF_SUFIX_BYTES = Bytes.toBytes(STATS_PROTOBUF_SUFIX);
/** Suffix of the float column built by getMafColumn. */
public static final String MAF_SUFIX = "_MAF";
/** Suffix of the float column built by getMgfColumn. */
public static final String MGF_SUFIX = "_MGF";
/** Separator placed between study and population in population frequency column names. */
private static final String STUDY_POP_FREQ_SEPARATOR = "_";
/** Phoenix helper created in the constructor from the genome helper's configuration. */
private final PhoenixHelper phoenixHelper;
/** Source of the Hadoop configuration and of the column family used when building CREATE statements. */
private final GenomeHelper genomeHelper;
/** Class logger. NOTE(review): declared protected and non-final; confirm whether subclasses rely on reassigning it. */
protected static Logger logger = LoggerFactory.getLogger(VariantPhoenixHelper.class);
/**
 * Fixed columns of the variants table. These are the columns written into the CREATE statement
 * by {@code buildCreate} and added by {@code updateAnnotationColumns}.
 *
 * <p>{@link #nullable()} reports {@code false} for every constant.
 * NOTE(review): {@code buildCreate} only marks CHROMOSOME and POSITION as NOT NULL; confirm the
 * flag is intentionally uniform.
 */
public enum VariantColumn implements Column {
    CHROMOSOME("CHROMOSOME", PVarchar.INSTANCE),
    POSITION("POSITION", PUnsignedInt.INSTANCE),
    REFERENCE("REFERENCE", PVarchar.INSTANCE),
    ALTERNATE("ALTERNATE", PVarchar.INSTANCE),
    TYPE("TYPE", PVarchar.INSTANCE),
    SO(ANNOTATION_PREFIX + "SO", PIntegerArray.INSTANCE),
    GENES(ANNOTATION_PREFIX + "GENES", PVarcharArray.INSTANCE),
    BIOTYPE(ANNOTATION_PREFIX + "BIOTYPE", PVarcharArray.INSTANCE),
    TRANSCRIPTS(ANNOTATION_PREFIX + "TRANSCRIPTS", PVarcharArray.INSTANCE),
    TRANSCRIPTION_FLAGS(ANNOTATION_PREFIX + "FLAGS", PVarcharArray.INSTANCE),
    GENE_TRAITS_NAME(ANNOTATION_PREFIX + "GT_NAME", PVarcharArray.INSTANCE),
    GENE_TRAITS_ID(ANNOTATION_PREFIX + "GT_ID", PVarcharArray.INSTANCE),
    HPO(ANNOTATION_PREFIX + "HPO", PVarcharArray.INSTANCE),
    PROTEIN_KEYWORDS(ANNOTATION_PREFIX + "PROT_KW", PVarcharArray.INSTANCE),
    DRUG(ANNOTATION_PREFIX + "DRUG", PVarcharArray.INSTANCE),
    XREFS(ANNOTATION_PREFIX + "XREFS", PVarcharArray.INSTANCE),
    //Protein substitution scores
    POLYPHEN(ANNOTATION_PREFIX + "POLYPHEN", PFloatArray.INSTANCE),
    POLYPHEN_DESC(ANNOTATION_PREFIX + "POLYPHEN_DESC", PVarcharArray.INSTANCE),
    SIFT(ANNOTATION_PREFIX + "SIFT", PFloatArray.INSTANCE),
    SIFT_DESC(ANNOTATION_PREFIX + "SIFT_DESC", PVarcharArray.INSTANCE),
    //Conservation Scores
    PHASTCONS(ANNOTATION_PREFIX + "PHASTCONS", PFloat.INSTANCE),
    PHYLOP(ANNOTATION_PREFIX + "PHYLOP", PFloat.INSTANCE),
    GERP(ANNOTATION_PREFIX + "GERP", PFloat.INSTANCE),
    //Functional Scores
    CADD_SCALLED(FUNCTIONAL_SCORE_PREFIX + "CADD_S", PFloat.INSTANCE),
    CADD_RAW(FUNCTIONAL_SCORE_PREFIX + "CADD_R", PFloat.INSTANCE),
    FULL_ANNOTATION(ANNOTATION_PREFIX + "FULL", PVarchar.INSTANCE);

    /**
     * Column name to constant lookup. It is built once in the static initializer, which runs
     * after all constants are constructed, so it is fully populated and safely published before
     * any caller can read it. The previous lazy initialization wrote a plain static field
     * without synchronization.
     */
    private static final Map<String, Column> COLUMNS_BY_NAME;

    static {
        Map<String, Column> byName = new HashMap<>();
        for (VariantColumn column : values()) {
            byName.put(column.column(), column);
        }
        COLUMNS_BY_NAME = Collections.unmodifiableMap(byName);
    }

    private final String columnName;
    private final byte[] columnNameBytes;
    private final PDataType pDataType;
    private final String sqlTypeName;
    private final boolean nullable;

    VariantColumn(String columnName, PDataType pDataType) {
        this.columnName = columnName;
        this.pDataType = pDataType;
        this.sqlTypeName = pDataType.getSqlTypeName();
        this.columnNameBytes = Bytes.toBytes(columnName);
        this.nullable = false;
    }

    /** @return the column name */
    @Override
    public String column() {
        return columnName;
    }

    /** @return the column name as bytes; the internal array is returned without copying */
    @Override
    public byte[] bytes() {
        return columnNameBytes;
    }

    /** @return the Phoenix data type of the column */
    @Override
    public PDataType getPDataType() {
        return pDataType;
    }

    /** @return the SQL type name reported by the Phoenix data type */
    @Override
    public String sqlType() {
        return sqlTypeName;
    }

    /** @return always {@code false} for these constants */
    @Override
    public boolean nullable() {
        return nullable;
    }

    /** @return the column name, so a constant can be appended directly into SQL text */
    @Override
    public String toString() {
        return columnName;
    }

    /**
     * Finds a constant by its column name (exact, case-sensitive match).
     *
     * @param columnName column name as returned by {@link #column()}
     * @return the matching constant, or {@code null} if no constant has that name
     */
    public static Column getColumn(String columnName) {
        return COLUMNS_BY_NAME.get(columnName);
    }
}
/** Study name replacements applied (after upper-casing) when building population frequency column names. */
private static final Map<String, String> MAPPING_POPULATION_SUDIES;
/** Unmodifiable list of the predefined population frequency columns, in declaration order. */
private static final List<Column> HUMAN_POPULATION_FREQUENCIES_COLUMNS;

static {
    Map<String, String> studyAliases = new HashMap<>(2);
    studyAliases.put("1000GENOMES_PHASE_3", "1KG_PHASE3");
    studyAliases.put("ESP_6500", "ESP6500");
    // Must be assigned before the columns below are built: getPopulationFrequencyColumn reads it.
    MAPPING_POPULATION_SUDIES = Collections.unmodifiableMap(studyAliases);

    // Each row: study name followed by its populations.
    String[][] studyPopulations = {
        {"1kG_phase3",
            "ALL", "AFR", "AMR", "EAS", "EUR", "SAS", "ACB", "ASW", "BEB", "CDX", "CEU",
            "CHB", "CHD", "CHS", "CLM", "ESN", "FIN", "GBR", "GIH", "GWD", "IBS", "ITU",
            "JPT", "KHV", "LWK", "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI"},
        {"ESP6500", "ALL", "EA", "AA"},
        {"EXAC", "ALL", "AFR", "AMR", "EAS", "FIN", "NFE", "OTH", "SAS"},
        {"GONL", "ALL"},
        {"UK10K_ALSPAC", "ALL"},
        {"UK10K_TWINSUK", "ALL"},
    };
    List<Column> popFreqColumns = new ArrayList<>();
    for (String[] row : studyPopulations) {
        String study = row[0];
        for (int i = 1; i < row.length; i++) {
            popFreqColumns.add(getPopulationFrequencyColumn(study, row[i]));
        }
    }
    HUMAN_POPULATION_FREQUENCIES_COLUMNS = Collections.unmodifiableList(popFreqColumns);
}
/**
 * Returns the predefined population frequency columns.
 *
 * @return the shared list built in the static initializer; it is wrapped as unmodifiable
 */
public static List<Column> getHumanPopulationFrequenciesColumns() {
return HUMAN_POPULATION_FREQUENCIES_COLUMNS;
}
/**
 * Creates a helper bound to the given genome helper. The underlying {@link PhoenixHelper}
 * is built from the genome helper's configuration.
 *
 * @param genomeHelper provider of the configuration and column family
 */
public VariantPhoenixHelper(GenomeHelper genomeHelper) {
    Configuration conf = genomeHelper.getConf();
    this.phoenixHelper = new PhoenixHelper(conf);
    this.genomeHelper = genomeHelper;
}
/**
 * Opens a JDBC connection using the genome helper's configuration.
 *
 * @return the connection created by the Phoenix helper
 * @throws SQLException           propagated from the Phoenix helper
 * @throws ClassNotFoundException propagated from the Phoenix helper
 */
public Connection newJdbcConnection() throws SQLException, ClassNotFoundException {
    Configuration defaultConf = genomeHelper.getConf();
    return phoenixHelper.newJdbcConnection(defaultConf);
}
/**
 * Opens a JDBC connection using the given configuration instead of the genome helper's one.
 *
 * @param conf configuration handed to the Phoenix helper
 * @return the connection created by the Phoenix helper
 * @throws SQLException           propagated from the Phoenix helper
 * @throws ClassNotFoundException propagated from the Phoenix helper
 */
public Connection newJdbcConnection(Configuration conf) throws SQLException, ClassNotFoundException {
return phoenixHelper.newJdbcConnection(conf);
}
/**
 * @return the Phoenix helper created in the constructor
 */
public PhoenixHelper getPhoenixHelper() {
return phoenixHelper;
}
/**
 * Asks the Phoenix helper to add any {@link VariantColumn} that is missing from the table.
 *
 * @param con       open connection
 * @param tableName table to update
 * @throws SQLException propagated from the Phoenix helper
 */
public void updateAnnotationColumns(Connection con, String tableName) throws SQLException {
    VariantColumn[] allColumns = VariantColumn.values();
    List<Column> expected = Arrays.<Column>asList(allColumns);
    phoenixHelper.addMissingColumns(con, tableName, expected, true);
}
/**
 * Asks the Phoenix helper to add the stats columns (see {@code getStatsColumns}) of every
 * cohort of the study that are missing from the table.
 *
 * @param con                open connection
 * @param tableName          table to update
 * @param studyConfiguration study whose cohort ids and study id name the columns
 * @throws SQLException propagated from the Phoenix helper
 */
public void updateStatsColumns(Connection con, String tableName, StudyConfiguration studyConfiguration) throws SQLException {
    List<Column> statsColumns = new ArrayList<>();
    for (Integer cohortId : studyConfiguration.getCohortIds().values()) {
        statsColumns.addAll(getStatsColumns(studyConfiguration.getStudyId(), cohortId));
    }
    phoenixHelper.addMissingColumns(con, tableName, statsColumns, true);
}
/**
 * Creates the table if needed, adds the per-study columns for the given study and commits.
 * Columns are added in three groups, one per SQL type: unsigned int, unsigned int array and
 * varbinary.
 *
 * @param con     open connection; {@code commit()} is called on it at the end
 * @param table   table name
 * @param studyId study whose columns are added
 * @throws SQLException if table creation, any ALTER statement or the commit fails
 */
public void registerNewStudy(Connection con, String table, Integer studyId) throws SQLException {
    createTableIfNeeded(con, table);

    String[] unsignedIntFields = {
        VariantTableStudyRow.HOM_REF, VariantTableStudyRow.PASS_CNT, VariantTableStudyRow.CALL_CNT};
    addColumns(con, table, studyId, PUnsignedInt.INSTANCE, unsignedIntFields);

    String[] unsignedIntArrayFields = {
        VariantTableStudyRow.HET_REF, VariantTableStudyRow.HOM_VAR,
        VariantTableStudyRow.OTHER, VariantTableStudyRow.NOCALL};
    addColumns(con, table, studyId, PUnsignedIntArray.INSTANCE, unsignedIntArrayFields);

    String[] varbinaryFields = {VariantTableStudyRow.COMPLEX, VariantTableStudyRow.FILTER_OTHER};
    addColumns(con, table, studyId, PVarbinary.INSTANCE, varbinaryFields);

    con.commit();
}
/**
 * Logs and executes a {@code CREATE SCHEMA IF NOT EXISTS} statement for the given schema.
 * The schema name is wrapped in double quotes as-is; quotes inside the name are not escaped.
 *
 * @param con    open connection
 * @param schema schema name
 * @throws SQLException propagated from the Phoenix helper
 */
public void createSchemaIfNeeded(Connection con, String schema) throws SQLException {
    String statement = new StringBuilder("CREATE SCHEMA IF NOT EXISTS \"")
            .append(schema)
            .append("\"")
            .toString();
    logger.info(statement);
    phoenixHelper.execute(con, statement);
}
/**
 * Creates the variants table unless it already exists.
 *
 * <p>If executing the CREATE statement fails but the table exists afterwards, the failure is
 * logged at debug level and swallowed; otherwise it is rethrown.
 *
 * @param con   open connection
 * @param table table name
 * @throws SQLException if the existence check fails, or if creation fails and the table is
 *                      still missing
 */
public void createTableIfNeeded(Connection con, String table) throws SQLException {
    if (phoenixHelper.tableExists(con, table)) {
        logger.info("Table {} already exists", table);
        return;
    }

    String createStatement = buildCreate(table);
    logger.info(createStatement);
    try {
        phoenixHelper.execute(con, createStatement);
    } catch (Exception e) {
        // Re-check: the table may have appeared between the first check and the execute.
        if (phoenixHelper.tableExists(con, table)) {
            logger.info("Table {} already exists", table);
            logger.debug("Table " + table + " already exists. Hide exception", e);
        } else {
            throw e;
        }
    }
}
/**
 * Executes one ALTER ... ADD statement per given field, in order. Each column is named with
 * {@code VariantTableStudyRow.buildColumnKey(studyId, field)} and typed with the SQL type name
 * of {@code dataType}.
 *
 * @param con       open connection
 * @param tableName table to alter
 * @param studyId   study id used to build the column keys
 * @param dataType  Phoenix type shared by all the columns
 * @param columns   field names to add
 * @throws SQLException propagated from the Phoenix helper
 */
private void addColumns(Connection con, String tableName, Integer studyId, PDataType<?> dataType, String ... columns)
        throws SQLException {
    for (int i = 0; i < columns.length; i++) {
        String alterStatement = phoenixHelper.buildAlterAddColumn(
                tableName,
                VariantTableStudyRow.buildColumnKey(studyId, columns[i]),
                dataType.getSqlTypeName());
        phoenixHelper.execute(con, alterStatement);
    }
}
/**
 * Builds the CREATE statement using the genome helper's column family and
 * {@code PhoenixHelper.DEFAULT_TABLE_TYPE}.
 *
 * @param tableName table name
 * @return the CREATE statement text
 */
public String buildCreate(String tableName) {
    String columnFamily = Bytes.toString(genomeHelper.getColumnFamily());
    return buildCreate(tableName, columnFamily, PhoenixHelper.DEFAULT_TABLE_TYPE);
}
/**
 * Builds a CREATE VIEW statement using the genome helper's column family.
 *
 * @param tableName view name
 * @return the CREATE VIEW statement text
 */
public String buildCreateView(String tableName) {
    String columnFamily = Bytes.toString(genomeHelper.getColumnFamily());
    return buildCreateView(tableName, columnFamily);
}
/**
 * Builds a CREATE statement of type {@code PTableType.VIEW}.
 *
 * @param tableName    view name
 * @param columnFamily column family, forwarded unchanged
 * @return the CREATE VIEW statement text
 */
public static String buildCreateView(String tableName, String columnFamily) {
    PTableType viewType = PTableType.VIEW;
    return buildCreate(tableName, columnFamily, viewType);
}
/**
 * Builds a CREATE TABLE statement using the genome helper's column family.
 *
 * @param tableName table name
 * @return the CREATE TABLE statement text
 */
public String buildCreateTable(String tableName) {
    String columnFamily = Bytes.toString(genomeHelper.getColumnFamily());
    return buildCreateTable(tableName, columnFamily);
}
/**
 * Builds a CREATE statement of type {@code PTableType.TABLE}.
 *
 * @param tableName    table name
 * @param columnFamily column family, forwarded unchanged
 * @return the CREATE TABLE statement text
 */
public static String buildCreateTable(String tableName, String columnFamily) {
    PTableType tableType = PTableType.TABLE;
    return buildCreate(tableName, columnFamily, tableType);
}
/**
 * Builds the {@code CREATE <type> IF NOT EXISTS} statement for the variants table.
 *
 * <p>Every {@link VariantColumn} is declared with its SQL type; CHROMOSOME and POSITION are
 * additionally marked NOT NULL. The primary key is (CHROMOSOME, POSITION, REFERENCE, ALTERNATE).
 * The predefined population frequency columns are not declared here.
 *
 * @param tableName    table name, escaped through {@code SchemaUtil.getEscapedFullTableName}
 * @param columnFamily not used by this method
 * @param tableType    Phoenix table type written after CREATE
 * @return the statement text
 */
public static String buildCreate(String tableName, String columnFamily, PTableType tableType) {
    StringBuilder sql = new StringBuilder();
    sql.append("CREATE ").append(tableType).append(" IF NOT EXISTS ");
    sql.append(SchemaUtil.getEscapedFullTableName(tableName)).append(" ").append("(");

    for (VariantColumn column : VariantColumn.values()) {
        boolean notNull = column == CHROMOSOME || column == POSITION;
        sql.append(" ").append(column).append(" ").append(column.sqlType());
        sql.append(notNull ? " NOT NULL , " : " , ");
    }

    sql.append(" ").append("CONSTRAINT PK PRIMARY KEY (");
    sql.append(CHROMOSOME).append(", ");
    sql.append(POSITION).append(", ");
    sql.append(REFERENCE).append(", ");
    sql.append(ALTERNATE).append(") ");
    sql.append(") ");
    return sql.toString();
}
/**
 * Asks the Phoenix helper to create the indices returned by {@code getIndices(tableName)}.
 *
 * @param con       open connection
 * @param tableName table to index
 * @throws SQLException propagated from the Phoenix helper
 */
public void createVariantIndexes(Connection con, String tableName) throws SQLException {
    phoenixHelper.createIndexes(con, tableName, getIndices(tableName), false);
}
/**
 * Builds the population frequency indices for the "ALL" population of the "1kG_phase3" and
 * "EXAC" studies, in that order.
 *
 * @param tableName table the indices belong to
 * @return a fixed-size list with the two index definitions
 */
public static List<PhoenixHelper.Index> getPopFreqIndices(String tableName) {
    PhoenixHelper.Index thousandGenomesAll = getPopFreqIndex(tableName, "1kG_phase3", "ALL");
    PhoenixHelper.Index exacAll = getPopFreqIndex(tableName, "EXAC", "ALL");
    return Arrays.asList(thousandGenomesAll, exacAll);
}
/**
 * Builds a LOCAL index definition over elements [2] and [1] (in that order) of the population
 * frequency array column of the given study and population, including GENES and SO.
 *
 * @param tableName  table the index belongs to
 * @param study      study name, resolved through {@code getPopulationFrequencyColumn}
 * @param population population name
 * @return the index definition
 */
public static PhoenixHelper.Index getPopFreqIndex(String tableName, String study, String population) {
    TableName hbaseTable = TableName.valueOf(tableName);
    String quotedColumn = "\"" + getPopulationFrequencyColumn(study, population).column() + "\"";
    List<Column> included = Arrays.asList(GENES, SO);
    return new PhoenixHelper.Index(hbaseTable, PTable.IndexType.LOCAL,
            Arrays.asList(quotedColumn + "[2]", quotedColumn + "[1]"), included);
}
/**
 * Builds the default LOCAL index definitions of the variants table: one each for PHASTCONS,
 * PHYLOP, GERP, CADD_RAW, CADD_SCALLED, element [1] of POLYPHEN, element [2] of SIFT, and TYPE.
 * Every index includes the GENES and SO columns.
 *
 * @param tableName table the indices belong to
 * @return a fixed-size list of index definitions, in the order listed above
 */
public static List<PhoenixHelper.Index> getIndices(String tableName) {
TableName table = TableName.valueOf(tableName);
// Columns included in every index.
List<Column> defaultInclude = Arrays.asList(GENES, SO);
return Arrays.asList(
new PhoenixHelper.Index(table, PTable.IndexType.LOCAL, Arrays.asList(PHASTCONS), defaultInclude),
new PhoenixHelper.Index(table, PTable.IndexType.LOCAL, Arrays.asList(PHYLOP), defaultInclude),
new PhoenixHelper.Index(table, PTable.IndexType.LOCAL, Arrays.asList(GERP), defaultInclude),
new PhoenixHelper.Index(table, PTable.IndexType.LOCAL, Arrays.asList(CADD_RAW), defaultInclude),
new PhoenixHelper.Index(table, PTable.IndexType.LOCAL, Arrays.asList(CADD_SCALLED), defaultInclude),
// Index the min value
new PhoenixHelper.Index(table, PTable.IndexType.LOCAL, Arrays.asList("\"" + POLYPHEN + "\"[1]"), defaultInclude),
// Index the max value
new PhoenixHelper.Index(table, PTable.IndexType.LOCAL, Arrays.asList("\"" + SIFT + "\"[2]"), defaultInclude),
new PhoenixHelper.Index(table, PTable.IndexType.LOCAL, Arrays.asList(TYPE), defaultInclude)
// Disabled alternative: primary-key-prefixed indices over POLYPHEN and SIFT.
// new PhoenixHelper.Index("POLYPHEN_IDX", PTable.IndexType.LOCAL,
// Arrays.asList(CHROMOSOME.column(), POSITION.column(), REFERENCE.column(), ALTERNATE.column(), POLYPHEN.column()),
// Arrays.asList(TYPE.column())),
// new PhoenixHelper.Index("SIFT_IDX", PTable.IndexType.LOCAL,
// Arrays.asList(CHROMOSOME.column(), POSITION.column(), REFERENCE.column(), ALTERNATE.column(), SIFT.column()),
// Arrays.asList(TYPE.column()))
);
}
/**
 * Resolves a functional score column, failing on unknown sources. The source itself is used
 * as the raw value reported in the exception.
 *
 * @param source functional score source name
 * @return the matching column
 */
public static Column getFunctionalScoreColumn(String source) {
    final boolean failOnUnknown = true;
    return getFunctionalScoreColumn(source, failOnUnknown, source);
}
/**
 * Resolves a functional score column, failing on unknown sources.
 *
 * @param source   functional score source name
 * @param rawValue original query value reported in the exception
 * @return the matching column
 */
public static Column getFunctionalScoreColumn(String source, String rawValue) {
    final boolean failOnUnknown = true;
    return getFunctionalScoreColumn(source, failOnUnknown, rawValue);
}
/**
 * Resolves the column of a functional score source (case-insensitive).
 *
 * <p>"CADD_RAW" and "CADD_SCALED" map to the predefined columns. For any other source, either a
 * malformed-parameter exception is thrown, or a warning is logged and a float column named
 * {@code FUNCTIONAL_SCORE_PREFIX + SOURCE} (upper-cased) is built.
 *
 * @param source         functional score source name
 * @param throwException whether an unknown source raises an exception
 * @param rawValue       original query value reported in the exception
 * @return the matching or newly built column
 * @throws VariantQueryException if the source is unknown and {@code throwException} is true
 */
public static Column getFunctionalScoreColumn(String source, boolean throwException, String rawValue) {
    // Locale.ROOT keeps the case mapping independent of the JVM default locale.
    String upperSource = source.toUpperCase(Locale.ROOT);
    switch (upperSource) {
        case "CADD_RAW":
            return CADD_RAW;
        case "CADD_SCALED":
            return CADD_SCALLED;
        default:
            break;
    }
    if (throwException) {
        throw VariantQueryException.malformedParam(ANNOT_FUNCTIONAL_SCORE, rawValue);
    }
    // The message previously said "Conservation", copied from the conservation score lookup.
    logger.warn("Unknown functional score source {}", source);
    return Column.build(FUNCTIONAL_SCORE_PREFIX + upperSource, PFloat.INSTANCE);
}
/**
 * Builds the float-array column of a population frequency.
 *
 * <p>The column name is {@code POPULATION_FREQUENCY_PREFIX + STUDY + "_" + POPULATION}, with
 * both parts upper-cased and known study aliases (e.g. "1000GENOMES_PHASE_3") replaced by
 * their short form.
 *
 * @param study      study name
 * @param population population name
 * @return the column definition
 */
public static Column getPopulationFrequencyColumn(String study, String population) {
    // Locale.ROOT: with the default locale, names containing 'i' (e.g. "fin") would be
    // upper-cased to a dotted capital I under a Turkish locale and name a different column.
    String normalizedStudy = study.toUpperCase(Locale.ROOT);
    for (Map.Entry<String, String> alias : MAPPING_POPULATION_SUDIES.entrySet()) {
        normalizedStudy = normalizedStudy.replace(alias.getKey(), alias.getValue());
    }
    String columnName = POPULATION_FREQUENCY_PREFIX + normalizedStudy + STUDY_POP_FREQ_SEPARATOR
            + population.toUpperCase(Locale.ROOT);
    return Column.build(columnName, PFloatArray.INSTANCE);
}
/**
 * Builds the float-array column of a population frequency from a combined study and
 * population value.
 *
 * <p>The value is upper-cased, known study aliases are replaced by their short form, and every
 * occurrence of {@code VariantDBAdaptorUtils.STUDY_POP_FREQ_SEPARATOR} is replaced by "_"
 * before {@code POPULATION_FREQUENCY_PREFIX} is prepended.
 *
 * @param studyPopulation study and population joined by the query separator
 * @return the column definition
 */
public static Column getPopulationFrequencyColumn(String studyPopulation) {
    // Locale.ROOT: with the default locale, names containing 'i' would be upper-cased to a
    // dotted capital I under a Turkish locale and name a different column.
    String normalized = studyPopulation.toUpperCase(Locale.ROOT);
    for (Map.Entry<String, String> alias : MAPPING_POPULATION_SUDIES.entrySet()) {
        normalized = normalized.replace(alias.getKey(), alias.getValue());
    }
    String studyPopFreq = normalized.replace(VariantDBAdaptorUtils.STUDY_POP_FREQ_SEPARATOR, STUDY_POP_FREQ_SEPARATOR);
    return Column.build(POPULATION_FREQUENCY_PREFIX + studyPopFreq, PFloatArray.INSTANCE);
}
/**
 * Resolves a conservation score column, failing on unknown sources. The source itself is used
 * as the raw value reported in the exception.
 *
 * @param source conservation score source name
 * @return the matching column
 * @throws VariantQueryException if the source is unknown
 */
public static Column getConservationScoreColumn(String source)
        throws VariantQueryException {
    final boolean failOnUnknown = true;
    return getConservationScoreColumn(source, source, failOnUnknown);
}
/**
 * Resolves the column of a conservation score source (case-insensitive): "PHASTCONS",
 * "PHYLOP" or "GERP".
 *
 * @param source         conservation score source name
 * @param rawValue       original query value, reported in the exception or the warning
 * @param throwException whether an unknown source raises an exception
 * @return the matching column, or {@code null} if the source is unknown and
 *         {@code throwException} is false
 * @throws VariantQueryException if the source is unknown and {@code throwException} is true
 */
public static Column getConservationScoreColumn(String source, String rawValue, boolean throwException)
        throws VariantQueryException {
    String upperSource = source.toUpperCase();
    if ("PHASTCONS".equals(upperSource)) {
        return PHASTCONS;
    }
    if ("PHYLOP".equals(upperSource)) {
        return PHYLOP;
    }
    if ("GERP".equals(upperSource)) {
        return GERP;
    }
    if (throwException) {
        throw VariantQueryException.malformedParam(ANNOT_CONSERVATION, rawValue);
    }
    logger.warn("Unknown Conservation source {}", rawValue);
    return null;
}
/**
 * Builds the three stats columns of a cohort: the stats column, the MAF column and the MGF
 * column, in that order.
 *
 * @param studyId  study id
 * @param cohortId cohort id
 * @return a fixed-size list with the three columns
 */
public static List<Column> getStatsColumns(int studyId, int cohortId) {
    Column stats = getStatsColumn(studyId, cohortId);
    Column maf = getMafColumn(studyId, cohortId);
    Column mgf = getMgfColumn(studyId, cohortId);
    return Arrays.asList(stats, maf, mgf);
}
/**
 * Builds the varbinary stats column of a cohort, named
 * {@code STATS_PREFIX + studyId + "_" + cohortId + STATS_PROTOBUF_SUFIX}.
 *
 * @param studyId  study id
 * @param cohortId cohort id
 * @return the column definition
 */
public static Column getStatsColumn(int studyId, int cohortId) {
    String columnName = STATS_PREFIX + studyId + "_" + cohortId + STATS_PROTOBUF_SUFIX;
    return Column.build(columnName, PVarbinary.INSTANCE);
}
/**
 * Builds the unsigned-int column keyed by the study id and the HOM_REF field of
 * {@code VariantTableStudyRow}.
 *
 * @param studyId study id
 * @return the column definition
 */
public static Column getStudyColumn(int studyId) {
    String homRefField = VariantTableStudyRow.HOM_REF;
    return Column.build(VariantTableStudyRow.buildColumnKey(studyId, homRefField), PUnsignedInt.INSTANCE);
}
/**
 * Builds the float MAF column of a cohort, named
 * {@code STATS_PREFIX + studyId + "_" + cohortId + MAF_SUFIX}.
 *
 * @param studyId  study id
 * @param cohortId cohort id
 * @return the column definition
 */
public static Column getMafColumn(int studyId, int cohortId) {
    String columnName = STATS_PREFIX + studyId + "_" + cohortId + MAF_SUFIX;
    return Column.build(columnName, PFloat.INSTANCE);
}
/**
 * Builds the float MGF column of a cohort, named
 * {@code STATS_PREFIX + studyId + "_" + cohortId + MGF_SUFIX}.
 *
 * @param studyId  study id
 * @param cohortId cohort id
 * @return the column definition
 */
public static Column getMgfColumn(int studyId, int cohortId) {
    String columnName = STATS_PREFIX + studyId + "_" + cohortId + MGF_SUFIX;
    return Column.build(columnName, PFloat.INSTANCE);
}
}