package org.gbif.occurrence.download.hive;
import org.gbif.api.vocabulary.Extension;
import org.gbif.api.vocabulary.OccurrenceIssue;
import org.gbif.dwc.terms.GbifInternalTerm;
import org.gbif.dwc.terms.GbifTerm;
import org.gbif.dwc.terms.Term;
import org.gbif.occurrence.persistence.hbase.Columns;
import java.util.List;
import java.util.Set;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
/**
* This provides the definition of the HBase occurrence table, for use as a Hive table.
*/
public class OccurrenceHBaseTableDefinition {
private static final String HBASE_KEY_MAPPING = ":key"; // mapping to the HBase row key
/**
* Assemble the mapping for verbatim fields.
*
* @return the list of fields that are used in the verbatim context
*/
private static List<HBaseField> verbatimFields() {
Set<Term> exclusions =
ImmutableSet.<Term>of(GbifTerm.gbifID, GbifTerm.mediaType // stripped explicitly as it is handled as an array
);
ImmutableList.Builder<HBaseField> builder = ImmutableList.builder();
for (Term t : Terms.verbatimTerms()) {
if (!exclusions.contains(t)) {
builder.add(verbatimField(t));
}
}
return builder.build();
}
/**
* Assemble the mapping for interpreted fields, taking note that in reality, many are mounted onto the vebatim
* HBase columns.
*
* @return the list of fields that are used in the interpreted context
*/
private static List<HBaseField> interpretedFields() {
Set<Term> exclusions = ImmutableSet.<Term>of(GbifTerm.gbifID, // treated as a special field (primary key)
GbifTerm.mediaType, // stripped explicitly as it is handled as an array
GbifTerm.issue // stripped explicitly as it is handled as an array
);
ImmutableList.Builder<HBaseField> builder = ImmutableList.builder();
for (Term t : Terms.interpretedTerms()) {
if (!exclusions.contains(t)) {
builder.add(interpretedField(t));
}
}
return builder.build();
}
/**
* The internal fields stored in HBase which we wish to expose through Hive. The fragment and fragment hash
* are removed and not present.
*
* @return the list of fields that are exposed through Hive
*/
private static List<HBaseField> internalFields() {
Set<GbifInternalTerm> exclusions = ImmutableSet.of(GbifInternalTerm.fragmentHash, GbifInternalTerm.fragment);
ImmutableList.Builder<HBaseField> builder = ImmutableList.builder();
for (GbifInternalTerm t : GbifInternalTerm.values()) {
if (!exclusions.contains(t)) {
// they are mapped the same as interpreted terms in HBase
builder.add(interpretedField(t));
}
}
return builder.build();
}
/**
* The fields stored in HBase which represent occurrence issues.
*
* @return the list of issue fields that are exposed through Hive
*/
private static List<HBaseField> issueFields() {
ImmutableList.Builder<HBaseField> builder = ImmutableList.builder();
for (OccurrenceIssue issue : OccurrenceIssue.values()) {
builder.add(new HBaseField(GbifTerm.issue, // repeated for all, as they become an array
HiveColumns.columnFor(issue), HiveDataTypes.TYPE_INT, // always
Columns.OCCURRENCE_COLUMN_FAMILY + ":" + Columns.column(issue)));
}
return builder.build();
}
/**
* The fields stored in HBase which represent an extension.
*
* @return the list of fields that are exposed through Hive
*/
private static List<HBaseField> extensions() {
// only MULTIMEDIA is supported
Set<Extension> extensions = ImmutableSet.of(Extension.MULTIMEDIA);
ImmutableList.Builder<HBaseField> builder = ImmutableList.builder();
for (Extension e : extensions) {
builder.add(new HBaseField(GbifTerm.Multimedia,
HiveColumns.columnFor(e),
HiveDataTypes.TYPE_STRING,
// always, as it has a custom serialization
Columns.OCCURRENCE_COLUMN_FAMILY + ':' + Columns.column(e)));
}
return builder.build();
}
/**
* Constructs the field for the primary key, which is a special case in that it needs a special mapping.
*/
private static HBaseField keyField() {
return new HBaseField(GbifTerm.gbifID,
HiveColumns.columnFor(GbifTerm.gbifID),
HiveDataTypes.typeForTerm(GbifTerm.gbifID, true),
HBASE_KEY_MAPPING
// special(!) mapping just for key
);
}
/**
* Generates the conceptual definition for the occurrence tables when used in hive.
*
* @return a list of fields, with the types.
*/
public static List<HBaseField> definition() {
return ImmutableList.<HBaseField>builder()
.add(keyField())
.addAll(verbatimFields())
.addAll(internalFields())
.addAll(interpretedFields())
.addAll(issueFields())
.addAll(extensions())
.build();
}
/**
* Constructs a Field for the given term, when used in the verbatim context.
*/
private static HBaseField verbatimField(Term term) {
return new HBaseField(term,
HiveColumns.VERBATIM_COL_PREFIX + term.simpleName().toLowerCase(),
// no escape needed, due to prefix
HiveDataTypes.typeForTerm(term, true),
// verbatim context
Columns.OCCURRENCE_COLUMN_FAMILY + ':' + Columns.verbatimColumn(term));
}
/**
* Constructs a Field for the given term, when used in the interpreted context context.
*/
private static HBaseField interpretedField(Term term) {
return new HBaseField(term, HiveColumns.columnFor(term),
// note that Columns takes care of whether this is mounted on a verbatim or an interpreted
// column in HBase for us
HiveDataTypes.typeForTerm(term, false), // not verbatim context
Columns.OCCURRENCE_COLUMN_FAMILY + ':' + Columns.column(term));
}
/**
* Hidden constructor.
*/
private OccurrenceHBaseTableDefinition() {
//empty constructor
}
}