package org.gbif.occurrence.persistence.hbase;
import org.gbif.api.vocabulary.Extension;
import org.gbif.api.vocabulary.OccurrenceIssue;
import org.gbif.dwc.terms.GbifInternalTerm;
import org.gbif.dwc.terms.GbifTerm;
import org.gbif.dwc.terms.Term;
import org.gbif.dwc.terms.TermFactory;
import org.gbif.dwc.terms.UnknownTerm;
import org.gbif.occurrence.common.TermUtils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import org.apache.hadoop.hbase.util.Bytes;
import static com.google.common.base.Preconditions.checkNotNull;
/**
* Utility class to deal with occurrence hbase columns.
* Primarily translate from Terms to their corresponding HBase column name (in the occurrence table), but
* also deals with any other names used, e.g. identifiers, issue columns, etc.
*/
public class Columns {
// the one column family for all columns of the occurrence table
public static final String OCCURRENCE_COLUMN_FAMILY = "o";
public static final byte[] CF = Bytes.toBytes(OCCURRENCE_COLUMN_FAMILY);
private static final Pattern PREFIX_REPLACE = Pattern.compile(":");
// a prefix required for all non term based columns
private static final String INTERNAL_PREFIX = "_";
// the counter table is a single cell that is the "autoincrement" number for new keys, with column family, column,
// and key ("row" in hbase speak)
public static final String COUNTER_COLUMN = "id";
// the lookup table is a secondary index of unique ids (holy triplet or publisher-provided) to GBIF integer keys
public static final String LOOKUP_KEY_COLUMN = "i";
public static final String LOOKUP_LOCK_COLUMN = "l";
public static final String LOOKUP_STATUS_COLUMN = "s";
// each UnknownTerm is prefixed differently
private static final String VERBATIM_TERM_PREFIX = "v_";
// a single occurrence will have 0 or more OccurrenceIssues. Once column per issue, each one prefixed
private static final String ISSUE_PREFIX = INTERNAL_PREFIX + "iss_";
// An occurrence can have 0-n identifiers, each of a certain type. Their column names look like _t1, _i1, _t2, _i2,
// etc.
private static final String IDENTIFIER_TYPE_COLUMN = INTERNAL_PREFIX + "t";
private static final String IDENTIFIER_COLUMN = INTERNAL_PREFIX + "i";
public static final String EXTENSION_CANT_BE_NULL_MSG = "extension can't be null";
/**
* Should never be instantiated.
*/
private Columns() {
}
/**
* Returns the column for the given term.
* If an interpreted column exists for the given term it will be returned, otherwise the verbatim column will be
* used.
* Not that GbifInternalTerm are always interpreted and do not exist as verbatim columns.
* Asking for a "secondary" interpreted term like country which is used during interpretation but not stored
* will result in an IllegalArgumentException. dwc:countryCode is the right term in this case.
* Key terms like taxonID or occurrenceID are considered verbatim terms and do not map to the respective GBIF
* columns.
* Please use the GbifTerm enum for those!
*/
public static String column(Term term) {
if (term instanceof GbifInternalTerm || TermUtils.isOccurrenceJavaProperty(term) || GbifTerm.mediaType == term) {
return column(term, "");
} else if (TermUtils.isInterpretedSourceTerm(term)) {
// "secondary" terms used in interpretation but not used to store the interpreted values should never be asked for
throw new IllegalArgumentException("The term " + term + " is interpreted and only relevant for verbatim values");
} else {
return verbatimColumn(term);
}
}
/**
* Return the column for the given extension. There will always be both verbatim and interpreted versions of each
* extension. This is the interpreted extension's column.
*
* @param extension the column to build
*
* @return the extension's column name
*/
public static String column(Extension extension) {
checkNotNull(extension, EXTENSION_CANT_BE_NULL_MSG);
return column(extension, "");
}
/**
* Returns the verbatim column for a term.
* GbifInternalTerm is not permitted and will result in an IllegalArgumentException!
*/
public static String verbatimColumn(Term term) {
if (term instanceof GbifInternalTerm) {
throw new IllegalArgumentException(
"Internal terms (like the tried [" + term.simpleName() + "]) do not exist as verbatim columns");
}
return column(term, VERBATIM_TERM_PREFIX);
}
/**
* Return the verbatim column for the given extension. There will always be both verbatim and interpreted versions of
* each extension. This is the verbatim extension's column.
*
* @param extension the column to build
*
* @return the extension's column name
*/
public static String verbatimColumn(Extension extension) {
checkNotNull(extension, EXTENSION_CANT_BE_NULL_MSG);
return column(extension, VERBATIM_TERM_PREFIX);
}
private static String column(Extension extension, String colPrefix) {
return colPrefix + PREFIX_REPLACE.matcher(extension.getRowType()).replaceAll("_");
}
private static String column(Term term, String colPrefix) {
checkNotNull(term, "term can't be null");
// unknown terms will never be mapped in Hive, and we can't replace : with anything and guarantee that it will
// be reversible
if (term instanceof UnknownTerm) {
return colPrefix + term.qualifiedName();
}
// known terms are mapped to their unique simple name with an optional (v_) prefix
return colPrefix + term.simpleName();
}
public static String idColumn(int index) {
return IDENTIFIER_COLUMN + index;
}
public static String idTypeColumn(int index) {
return IDENTIFIER_TYPE_COLUMN + index;
}
/**
* Returns the term for a strictly verbatim column.
* If the column given is not a verbatim column, null will be returned.
*/
@Nullable
public static Term termFromVerbatimColumn(byte[] qualifier) {
checkNotNull(qualifier, "qualifier can't be null");
String colName = Bytes.toString(qualifier);
if (!colName.startsWith(VERBATIM_TERM_PREFIX)) {
// we asked for a verbatim column but this one lacks the verbatim prefix!
return null;
}
// this is a verbatim term column
return TermFactory.instance().findTerm(colName.substring(VERBATIM_TERM_PREFIX.length()));
}
public static String column(OccurrenceIssue issue) {
checkNotNull(issue, "issue can't be null");
return ISSUE_PREFIX + issue.name();
}
}