package org.gbif.occurrence.common;
import org.gbif.api.vocabulary.Extension;
import org.gbif.dwc.terms.DcTerm;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.GbifInternalTerm;
import org.gbif.dwc.terms.GbifTerm;
import org.gbif.dwc.terms.Term;
import org.gbif.dwc.terms.TermFactory;
import java.util.List;
import java.util.Set;
import javax.annotation.Nullable;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
/**
 * Static utils class to deal with Term enumeration for occurrences.
 * <p>
 * All term collections are computed once when the class is initialised and are immutable.
 * <p>
 * Note to developers:
 * If you modify this class, make sure to have a look at org.gbif.occurrence.download.hive.Terms in the
 * occurrence-hdfs-table module.
 */
public class TermUtils {

  /**
   * Terms resolved through the TermFactory from the row type of every known Extension.
   */
  private static final Set<Term> EXTENSION_TERMS = buildExtensionTerms();

  /**
   * Terms reported by {@link #isInterpretedDate(Term)}.
   */
  private static final Set<? extends Term> INTERPRETED_DATES = ImmutableSet.of(DwcTerm.eventDate,
    DwcTerm.dateIdentified,
    GbifTerm.lastInterpreted,
    GbifTerm.lastParsed,
    GbifTerm.lastCrawled,
    DcTerm.modified,
    GbifInternalTerm.fragmentCreated);

  /**
   * Terms reported by {@link #isInterpretedNumerical(Term)}.
   */
  private static final Set<? extends Term> INTERPRETED_NUM = ImmutableSet.of(DwcTerm.year,
    DwcTerm.month,
    DwcTerm.day,
    DwcTerm.individualCount,
    GbifTerm.taxonKey,
    GbifTerm.kingdomKey,
    GbifTerm.phylumKey,
    GbifTerm.classKey,
    GbifTerm.orderKey,
    GbifTerm.familyKey,
    GbifTerm.genusKey,
    GbifTerm.subgenusKey,
    GbifTerm.speciesKey,
    GbifInternalTerm.crawlId,
    GbifInternalTerm.identifierCount);

  /**
   * Terms reported by {@link #isInterpretedBoolean(Term)}.
   */
  private static final Set<? extends Term> INTERPRETED_BOOLEAN =
    ImmutableSet.of(GbifTerm.hasCoordinate, GbifTerm.hasGeospatialIssues);

  /**
   * Terms reported by {@link #isComplexType(Term)}.
   */
  private static final Set<? extends Term> COMPLEX_TYPE = ImmutableSet.of(GbifTerm.mediaType, GbifTerm.issue);

  /**
   * Terms reported by {@link #isInterpretedDouble(Term)}.
   */
  private static final Set<? extends Term> INTERPRETED_DOUBLE = ImmutableSet.of(DwcTerm.decimalLatitude,
    DwcTerm.decimalLongitude,
    GbifTerm.coordinateAccuracy,
    GbifTerm.elevation,
    GbifTerm.elevationAccuracy,
    GbifTerm.depth,
    GbifTerm.depthAccuracy,
    DwcTerm.coordinateUncertaintyInMeters,
    DwcTerm.coordinatePrecision);

  /**
   * Terms excluded from the verbatim and interpreted term listings: the DwC MeasurementOrFact and
   * ResourceRelationship groups plus a fixed selection of GBIF terms.
   * Only used for membership checks, so the iteration order carries no meaning.
   */
  private static final Set<Term> NON_OCCURRENCE_TERMS = ImmutableSet.<Term>builder()
    .addAll(DwcTerm.listByGroup(DwcTerm.GROUP_MEASUREMENTORFACT))
    .addAll(DwcTerm.listByGroup(DwcTerm.GROUP_RESOURCERELATIONSHIP))
    .add(GbifTerm.infraspecificMarker,
      GbifTerm.isExtinct,
      GbifTerm.isFreshwater,
      GbifTerm.isHybrid,
      GbifTerm.isMarine,
      GbifTerm.isPlural,
      GbifTerm.isPreferredName,
      GbifTerm.isTerrestrial,
      GbifTerm.livingPeriod,
      GbifTerm.lifeForm,
      GbifTerm.ageInDays,
      GbifTerm.sizeInMillimeter,
      GbifTerm.massInGram,
      GbifTerm.organismPart,
      GbifTerm.appendixCITES,
      GbifTerm.typeDesignatedBy,
      GbifTerm.typeDesignationType,
      GbifTerm.canonicalName,
      GbifTerm.nameType,
      GbifTerm.verbatimLabel)
    .build();

  /**
   * Interpreted terms that exist as java properties on Occurrence.
   */
  private static final Set<? extends Term> JAVA_PROPERTY_TERMS = ImmutableSet.of(DwcTerm.decimalLatitude,
    DwcTerm.decimalLongitude,
    DwcTerm.continent,
    DwcTerm.waterBody,
    DwcTerm.stateProvince,
    DwcTerm.countryCode,
    DwcTerm.dateIdentified,
    DwcTerm.eventDate,
    DwcTerm.year,
    DwcTerm.month,
    DwcTerm.day,
    DwcTerm.kingdom,
    DwcTerm.phylum,
    DwcTerm.class_,
    DwcTerm.order,
    DwcTerm.family,
    DwcTerm.genus,
    DwcTerm.subgenus,
    GbifTerm.species,
    DwcTerm.scientificName,
    DwcTerm.taxonRank,
    GbifTerm.genericName,
    DwcTerm.specificEpithet,
    DwcTerm.infraspecificEpithet,
    DwcTerm.basisOfRecord,
    DwcTerm.individualCount,
    DwcTerm.sex,
    DwcTerm.lifeStage,
    DwcTerm.establishmentMeans,
    GbifTerm.taxonKey,
    DwcTerm.typeStatus,
    GbifTerm.typifiedName,
    GbifTerm.kingdomKey,
    GbifTerm.phylumKey,
    GbifTerm.classKey,
    GbifTerm.orderKey,
    GbifTerm.familyKey,
    GbifTerm.genusKey,
    GbifTerm.subgenusKey,
    GbifTerm.speciesKey,
    GbifTerm.datasetKey,
    GbifTerm.publishingCountry,
    GbifTerm.lastInterpreted,
    DcTerm.modified,
    DwcTerm.coordinateUncertaintyInMeters,
    DwcTerm.coordinatePrecision,
    GbifTerm.elevation,
    GbifTerm.elevationAccuracy,
    GbifTerm.depth,
    GbifTerm.depthAccuracy,
    GbifInternalTerm.unitQualifier,
    GbifTerm.issue,
    DcTerm.references,
    GbifTerm.protocol,
    GbifTerm.lastCrawled,
    GbifTerm.lastParsed,
    DcTerm.license);

  /**
   * TODO: is this correct? -> Terms used during interpretation and superseded by an interpreted property
   * <p>
   * Contains every java property term followed by the source terms listed here. Entries that are already
   * java property terms are ignored by the builder, so the iteration order is: java property terms first,
   * then the remaining source terms in the order given.
   */
  private static final Set<Term> INTERPRETED_SOURCE_TERMS = ImmutableSet.<Term>builder()
    .addAll(JAVA_PROPERTY_TERMS)
    .add(DwcTerm.decimalLatitude,
      DwcTerm.decimalLongitude,
      DwcTerm.verbatimLatitude,
      DwcTerm.verbatimLongitude,
      DwcTerm.verbatimCoordinates,
      DwcTerm.geodeticDatum,
      DwcTerm.coordinateUncertaintyInMeters,
      DwcTerm.coordinatePrecision,
      DwcTerm.continent,
      DwcTerm.waterBody,
      DwcTerm.stateProvince,
      DwcTerm.country,
      DwcTerm.countryCode,
      DwcTerm.scientificName,
      DwcTerm.scientificNameAuthorship,
      DwcTerm.taxonRank,
      DwcTerm.kingdom,
      DwcTerm.phylum,
      DwcTerm.class_,
      DwcTerm.order,
      DwcTerm.family,
      DwcTerm.genus,
      DwcTerm.subgenus,
      GbifTerm.genericName,
      DwcTerm.specificEpithet,
      DwcTerm.infraspecificEpithet,
      DcTerm.modified,
      DwcTerm.dateIdentified,
      DwcTerm.eventDate,
      DwcTerm.year,
      DwcTerm.month,
      DwcTerm.day,
      DwcTerm.minimumDepthInMeters,
      DwcTerm.maximumDepthInMeters,
      DwcTerm.minimumElevationInMeters,
      DwcTerm.maximumElevationInMeters,
      DwcTerm.associatedMedia)
    .build();

  /**
   * Term list of the extension excluding the coreid just as defined by:
   * http://rs.gbif.org/extension/gbif/1.0/multimedia.xml
   */
  private static final List<DcTerm> MULTIMEDIA_TERMS = ImmutableList.of(DcTerm.type,
    DcTerm.format,
    DcTerm.identifier,
    DcTerm.references,
    DcTerm.title,
    DcTerm.description,
    DcTerm.created,
    DcTerm.creator,
    DcTerm.contributor,
    DcTerm.publisher,
    DcTerm.audience,
    DcTerm.source,
    DcTerm.license,
    DcTerm.rightsHolder);

  // The three lists below are derived from the sets above. Static fields are initialised in textual
  // order, so these declarations must stay after the sets they read.

  /**
   * Precomputed result of {@link #interpretedTerms()}.
   */
  private static final List<Term> INTERPRETED_TERMS = buildInterpretedTerms();

  /**
   * Precomputed result of {@link #verbatimTerms()}.
   */
  private static final List<Term> VERBATIM_TERMS = buildVerbatimTerms();

  /**
   * Precomputed result of {@link #multimediaTerms()}.
   */
  private static final List<Term> MULTIMEDIA_RECORD_TERMS = ImmutableList.<Term>builder()
    .add(GbifTerm.gbifID)
    .addAll(MULTIMEDIA_TERMS)
    .build();

  private TermUtils() {
    // private constructor
  }

  /**
   * Lists all terms that have been used during interpretation and are superseded by an interpreted,
   * typed java Occurrence property.
   *
   * @return iterable of terms that have been used during interpretation
   */
  public static Iterable<? extends Term> interpretedSourceTerms() {
    return INTERPRETED_SOURCE_TERMS;
  }

  /**
   * @return true if the term is used during interpretation and superseded by an interpreted property
   */
  public static boolean isInterpretedSourceTerm(Term term) {
    return INTERPRETED_SOURCE_TERMS.contains(term);
  }

  /**
   * @return true if the term is an interpreted value stored as a java property on Occurrence.
   */
  public static boolean isOccurrenceJavaProperty(Term term) {
    return JAVA_PROPERTY_TERMS.contains(term);
  }

  /**
   * Lists all terms relevant for an interpreted occurrence record, starting with gbifID as the key.
   * UnknownTerms are not included as they are open ended.
   *
   * @return immutable list of terms, computed once: gbifID, then the retained DcTerm, DwcTerm and
   *         GbifTerm values in their declaration order
   */
  public static Iterable<? extends Term> interpretedTerms() {
    return INTERPRETED_TERMS;
  }

  /**
   * Lists all terms relevant for a verbatim occurrence record.
   * gbifID is included and comes first as its the foreign key to the core record.
   * UnknownTerms are not included as they are open ended.
   *
   * @return immutable list of terms, computed once: gbifID, then the retained DcTerm and DwcTerm values
   *         in their declaration order
   */
  public static Iterable<? extends Term> verbatimTerms() {
    return VERBATIM_TERMS;
  }

  /**
   * Lists all terms relevant for a multimedia extension record.
   * gbifID is included and comes first as its the foreign key to the core record.
   *
   * @return immutable list of gbifID followed by the multimedia extension terms
   */
  public static Iterable<? extends Term> multimediaTerms() {
    return MULTIMEDIA_RECORD_TERMS;
  }

  /**
   * @return true if the term is an interpreted date and stored as a binary in HBase
   */
  public static boolean isInterpretedDate(Term term) {
    return INTERPRETED_DATES.contains(term);
  }

  /**
   * @return true if the term is an interpreted numerical and stored as a binary in HBase
   */
  public static boolean isInterpretedNumerical(Term term) {
    return INTERPRETED_NUM.contains(term);
  }

  /**
   * @return true if the term is an interpreted double and stored as a binary in HBase
   */
  public static boolean isInterpretedDouble(Term term) {
    return INTERPRETED_DOUBLE.contains(term);
  }

  /**
   * @return true if the term is an interpreted boolean and stored as a binary in HBase
   */
  public static boolean isInterpretedBoolean(Term term) {
    return INTERPRETED_BOOLEAN.contains(term);
  }

  /**
   * @return true if the term is a complex type in Hive or HBase: array, struct, json, etc.
   */
  public static boolean isComplexType(Term term) {
    return COMPLEX_TYPE.contains(term);
  }

  /**
   * @return true if the term is the one the TermFactory resolves for the row type of a known Extension
   */
  public static boolean isExtensionTerm(Term term) {
    return EXTENSION_TERMS.contains(term);
  }

  /**
   * Resolves the row type of every Extension value to a term.
   * Reads no other static state, so it is safe to call from any static initialiser.
   */
  private static Set<Term> buildExtensionTerms() {
    ImmutableSet.Builder<Term> terms = ImmutableSet.builder();
    for (Extension extension : Extension.values()) {
      terms.add(TermFactory.instance().findTerm(extension.getRowType()));
    }
    return terms.build();
  }

  /**
   * @return true if the term is an interpretation source term that is not itself a java property on
   *         Occurrence, i.e. a term that is replaced by an interpreted property
   */
  private static boolean isSupersededSourceTerm(Term term) {
    return INTERPRETED_SOURCE_TERMS.contains(term) && !JAVA_PROPERTY_TERMS.contains(term);
  }

  /**
   * Assembles the list returned by {@link #interpretedTerms()}.
   */
  private static List<Term> buildInterpretedTerms() {
    ImmutableList.Builder<Term> terms = ImmutableList.builder();
    terms.add(GbifTerm.gbifID);
    for (DcTerm term : DcTerm.values()) {
      if (!term.isClass() && !isSupersededSourceTerm(term)) {
        terms.add(term);
      }
    }
    for (DwcTerm term : DwcTerm.values()) {
      if (!term.isClass() && !NON_OCCURRENCE_TERMS.contains(term) && !isSupersededSourceTerm(term)) {
        terms.add(term);
      }
    }
    for (GbifTerm term : GbifTerm.values()) {
      // gbifID has already been added as the first entry; GbifTerm.coordinateAccuracy is deprecated.
      // GBIF terms are deliberately not checked against the interpreted source terms.
      if (!term.isClass() && !NON_OCCURRENCE_TERMS.contains(term) && term != GbifTerm.gbifID
          && term != GbifTerm.coordinateAccuracy) {
        terms.add(term);
      }
    }
    return terms.build();
  }

  /**
   * Assembles the list returned by {@link #verbatimTerms()}.
   */
  private static List<Term> buildVerbatimTerms() {
    ImmutableList.Builder<Term> terms = ImmutableList.builder();
    terms.add(GbifTerm.gbifID);
    for (DcTerm term : DcTerm.values()) {
      if (!term.isClass()) {
        terms.add(term);
      }
    }
    for (DwcTerm term : DwcTerm.values()) {
      if (!term.isClass() && !NON_OCCURRENCE_TERMS.contains(term)) {
        terms.add(term);
      }
    }
    return terms.build();
  }
}