package org.gbif.occurrence.persistence.util; import org.gbif.api.model.common.Identifier; import org.gbif.api.model.common.MediaObject; import org.gbif.api.model.occurrence.Occurrence; import org.gbif.api.model.occurrence.VerbatimOccurrence; import org.gbif.api.util.ClassificationUtils; import org.gbif.api.util.VocabularyUtils; import org.gbif.api.vocabulary.BasisOfRecord; import org.gbif.api.vocabulary.Continent; import org.gbif.api.vocabulary.Country; import org.gbif.api.vocabulary.EndpointType; import org.gbif.api.vocabulary.EstablishmentMeans; import org.gbif.api.vocabulary.Extension; import org.gbif.api.vocabulary.IdentifierType; import org.gbif.api.vocabulary.License; import org.gbif.api.vocabulary.LifeStage; import org.gbif.api.vocabulary.OccurrenceIssue; import org.gbif.api.vocabulary.OccurrenceSchemaType; import org.gbif.api.vocabulary.Rank; import org.gbif.api.vocabulary.Sex; import org.gbif.api.vocabulary.TypeStatus; import org.gbif.dwc.terms.DcTerm; import org.gbif.dwc.terms.DwcTerm; import org.gbif.dwc.terms.GbifInternalTerm; import org.gbif.dwc.terms.GbifTerm; import org.gbif.dwc.terms.Term; import org.gbif.hbase.util.ResultReader; import org.gbif.occurrence.common.TermUtils; import org.gbif.occurrence.common.json.ExtensionSerDeserUtils; import org.gbif.occurrence.common.json.MediaSerDeserUtils; import org.gbif.occurrence.persistence.api.Fragment; import org.gbif.occurrence.persistence.hbase.Columns; import org.gbif.occurrence.persistence.hbase.ExtResultReader; import java.util.Date; import java.util.EnumSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.UUID; import javax.annotation.Nullable; import javax.validation.ValidationException; import com.google.common.base.Strings; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.CellUtil; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.util.Bytes; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A utility class to build object models from the HBase occurrence "row". */ public class OccurrenceBuilder { private static final Logger LOG = LoggerFactory.getLogger(OccurrenceBuilder.class); // TODO: move these maps to Classification, Term or RankUtils public static final Map<Rank, Term> rank2taxonTerm = ImmutableMap.<Rank, Term>builder().put(Rank.KINGDOM, DwcTerm.kingdom).put(Rank.PHYLUM, DwcTerm.phylum) .put(Rank.CLASS, DwcTerm.class_).put(Rank.ORDER, DwcTerm.order).put(Rank.FAMILY, DwcTerm.family) .put(Rank.GENUS, DwcTerm.genus).put(Rank.SUBGENUS, DwcTerm.subgenus).put(Rank.SPECIES, GbifTerm.species).build(); public static final Map<Rank, Term> rank2KeyTerm = ImmutableMap.<Rank, Term>builder().put(Rank.KINGDOM, GbifTerm.kingdomKey).put(Rank.PHYLUM, GbifTerm.phylumKey) .put(Rank.CLASS, GbifTerm.classKey).put(Rank.ORDER, GbifTerm.orderKey).put(Rank.FAMILY, GbifTerm.familyKey) .put(Rank.GENUS, GbifTerm.genusKey).put(Rank.SUBGENUS, GbifTerm.subgenusKey) .put(Rank.SPECIES, GbifTerm.speciesKey).build(); // should never be instantiated private OccurrenceBuilder() { } /** * Builds a Fragment object from the given result, assigning the passed in key. * * @param result an HBase scan/get Result * @return the Fragment or null if the passed in Result is null * @throws ValidationException if the fragment as stored in the table is invalid */ public static Fragment buildFragment(@Nullable Result result) { if (result == null) { return null; } int key = Bytes.toInt(result.getRow()); String rawDatasetKey = ExtResultReader.getString(result, GbifTerm.datasetKey); if (rawDatasetKey == null) { throw new ValidationException("Fragment with key [" + key + "] has no datasetKey."); } UUID datasetKey = UUID.fromString(rawDatasetKey); Integer crawlId = ExtResultReader.getInteger(result, GbifInternalTerm.crawlId); if (crawlId == null) { throw new ValidationException("Fragment with key [" + key + "] has no crawlId."); } Long harvested = ExtResultReader.getLong(result, GbifTerm.lastCrawled); if (harvested == null) { throw new ValidationException("Fragment with key [" + key + "] has no harvestedDate."); } Date harvestedDate = new Date(harvested); String unitQualifier = ExtResultReader.getString(result, GbifInternalTerm.unitQualifier); byte[] data = ExtResultReader.getBytes(result, GbifInternalTerm.fragment); byte[] dataHash = ExtResultReader.getBytes(result, GbifInternalTerm.fragmentHash); Long created = ExtResultReader.getLong(result, GbifInternalTerm.fragmentCreated); String rawSchema = ExtResultReader.getString(result, GbifInternalTerm.xmlSchema); OccurrenceSchemaType schema; if (rawSchema == null) { // this is typically called just before updating the fragment, meaning schemaType will then be correctly set LOG.debug("Fragment with key [{}] has no schema type - assuming DWCA.", key); schema = OccurrenceSchemaType.DWCA; } else { schema = OccurrenceSchemaType.valueOf(rawSchema); } String rawProtocol = ExtResultReader.getString(result, GbifTerm.protocol); EndpointType protocol = rawProtocol == null ? null : EndpointType.valueOf(rawProtocol); Fragment frag; if (schema == null || schema == OccurrenceSchemaType.DWCA) { frag = new Fragment(datasetKey, data, dataHash, Fragment.FragmentType.JSON, protocol, harvestedDate, crawlId, schema, null, created); } else { frag = new Fragment(datasetKey, data, dataHash, Fragment.FragmentType.XML, protocol, harvestedDate, crawlId, schema, unitQualifier, created); } frag.setKey(key); return frag; } /** * Utility to build an API Occurrence from an HBase row. * * @return A complete occurrence, or null */ public static Occurrence buildOccurrence(@Nullable Result row) { if (row == null || row.isEmpty()) { return null; } else { Occurrence occ = new Occurrence(buildVerbatimOccurrence(row, false)); // filter out verbatim terms that have been interpreted for (Term t : TermUtils.interpretedSourceTerms()) { occ.getVerbatimFields().remove(t); } Integer key = Bytes.toInt(row.getRow()); occ.setKey(key); // taxonomy terms occ.setTaxonKey(ExtResultReader.getInteger(row, GbifTerm.taxonKey)); occ.setScientificName(ExtResultReader.getString(row, DwcTerm.scientificName)); occ.setGenericName(ExtResultReader.getString(row, GbifTerm.genericName)); occ.setSpecificEpithet(ExtResultReader.getString(row, DwcTerm.specificEpithet)); occ.setInfraspecificEpithet(ExtResultReader.getString(row, DwcTerm.infraspecificEpithet)); occ.setTaxonRank(ExtResultReader.getEnum(row, DwcTerm.taxonRank, Rank.class)); for (Rank r : Rank.DWC_RANKS) { ClassificationUtils .setHigherRankKey(occ, r, ExtResultReader.getInteger(row, OccurrenceBuilder.rank2KeyTerm.get(r))); ClassificationUtils .setHigherRank(occ, r, ExtResultReader.getString(row, OccurrenceBuilder.rank2taxonTerm.get(r))); } // other java properties occ.setBasisOfRecord(ExtResultReader.getEnum(row, DwcTerm.basisOfRecord, BasisOfRecord.class)); occ.setElevation(ExtResultReader.getDouble(row, GbifTerm.elevation)); occ.setElevationAccuracy(ExtResultReader.getDouble(row, GbifTerm.elevationAccuracy)); occ.setDepth(ExtResultReader.getDouble(row, GbifTerm.depth)); occ.setDepthAccuracy(ExtResultReader.getDouble(row, GbifTerm.depthAccuracy)); occ.setDatasetKey(ExtResultReader.getUuid(row, GbifTerm.datasetKey)); occ.setPublishingOrgKey(ExtResultReader.getUuid(row, GbifInternalTerm.publishingOrgKey)); occ.setPublishingCountry(Country.fromIsoCode(ExtResultReader.getString(row, GbifTerm.publishingCountry))); occ.setLastInterpreted(ExtResultReader.getDate(row, GbifTerm.lastInterpreted)); occ.setModified(ExtResultReader.getDate(row, DcTerm.modified)); occ.setDateIdentified(ExtResultReader.getDate(row, DwcTerm.dateIdentified)); occ.setProtocol(ExtResultReader.getEnum(row, GbifTerm.protocol, EndpointType.class)); occ.setDecimalLatitude(ExtResultReader.getDouble(row, DwcTerm.decimalLatitude)); occ.setDecimalLongitude(ExtResultReader.getDouble(row, DwcTerm.decimalLongitude)); // TODO removed after complete reintepretation 2016-04-27 occ.setCoordinateAccuracy(ExtResultReader.getDouble(row, GbifTerm.coordinateAccuracy)); occ.setCoordinatePrecision(ExtResultReader.getDouble(row, DwcTerm.coordinatePrecision)); occ.setCoordinateUncertaintyInMeters(ExtResultReader.getDouble(row, DwcTerm.coordinateUncertaintyInMeters)); occ.setCountry(Country.fromIsoCode(ExtResultReader.getString(row, DwcTerm.countryCode))); occ.setStateProvince(ExtResultReader.getString(row, DwcTerm.stateProvince)); occ.setContinent(ExtResultReader.getEnum(row, DwcTerm.continent, Continent.class)); occ.setWaterBody(ExtResultReader.getString(row, DwcTerm.waterBody)); occ.setEventDate(ExtResultReader.getDate(row, DwcTerm.eventDate)); occ.setYear(ExtResultReader.getInteger(row, DwcTerm.year)); occ.setMonth(ExtResultReader.getInteger(row, DwcTerm.month)); occ.setDay(ExtResultReader.getInteger(row, DwcTerm.day)); occ.setIndividualCount(ExtResultReader.getInteger(row, DwcTerm.individualCount)); occ.setEstablishmentMeans(ExtResultReader.getEnum(row, DwcTerm.establishmentMeans, EstablishmentMeans.class)); occ.setLifeStage(ExtResultReader.getEnum(row, DwcTerm.lifeStage, LifeStage.class)); occ.setSex(ExtResultReader.getEnum(row, DwcTerm.sex, Sex.class)); occ.setTypeStatus(ExtResultReader.getEnum(row, DwcTerm.typeStatus, TypeStatus.class)); occ.setTypifiedName(ExtResultReader.getString(row, GbifTerm.typifiedName)); occ.setReferences(ExtResultReader.getUri(row, DcTerm.references)); occ.setIdentifiers(extractIdentifiers(key, row)); occ.setIssues(extractIssues(row)); occ.setMedia(buildMedia(row)); //It should be replaced by License.fromString(value).orNull() but conflicts of Guava versions avoid its usage occ.setLicense(VocabularyUtils.lookupEnum(ExtResultReader.getString(row, DcTerm.license), License.class)); return occ; } } /** * Utility to build an API Occurrence from an HBase row. * * @return A complete verbatim occurrence, or null */ public static VerbatimOccurrence buildVerbatimOccurrence(@Nullable Result row) { return buildVerbatimOccurrence(row, true); } /** * Utility to build an API Occurrence from an HBase row. * * @param readExtensions if true reads verbatim extension data into extensions map * @return A complete verbatim occurrence, or null */ private static VerbatimOccurrence buildVerbatimOccurrence(@Nullable Result row, boolean readExtensions) { if (row == null || row.isEmpty()) { return null; } VerbatimOccurrence verb = new VerbatimOccurrence(); verb.setKey(Bytes.toInt(row.getRow())); verb.setDatasetKey(ExtResultReader.getUuid(row, GbifTerm.datasetKey)); verb.setPublishingOrgKey(ExtResultReader.getUuid(row, GbifInternalTerm.publishingOrgKey)); verb.setPublishingCountry(Country.fromIsoCode(ExtResultReader.getString(row, GbifTerm.publishingCountry))); verb.setLastCrawled(ExtResultReader.getDate(row, GbifTerm.lastCrawled)); verb.setLastParsed(ExtResultReader.getDate(row, GbifTerm.lastParsed)); verb.setProtocol(EndpointType.fromString(ExtResultReader.getString(row, GbifTerm.protocol))); verb.setCrawlId(ExtResultReader.getInteger(row, GbifInternalTerm.crawlId)); for (Cell cell : row.rawCells()) { // all verbatim Term fields in row are prefixed. Columns without that prefix return null! // extensions are also kept with a v_ prefix, so explicitly ignore them. Term term = Columns.termFromVerbatimColumn(CellUtil.cloneQualifier(cell)); if (term != null && !TermUtils.isExtensionTerm(term)) { verb.setVerbatimField(term, Bytes.toString(CellUtil.cloneValue(cell))); } } if (readExtensions) { verb.setExtensions(readVerbatimExtensions(row)); } return verb; } /** * Reads the extensions from a result row. */ private static Map<Extension, List<Map<Term, String>>> readVerbatimExtensions(@Nullable Result row) { Map<Extension, List<Map<Term, String>>> extensions = Maps.newHashMap(); for (Extension extension : Extension.values()) { String jsonExtensions = ExtResultReader.getString(row, Columns.verbatimColumn(extension)); if (!Strings.isNullOrEmpty(jsonExtensions)) { extensions.put(extension, ExtensionSerDeserUtils.fromJson(jsonExtensions)); } } return extensions; } private static List<Identifier> extractIdentifiers(Integer key, Result result) { List<Identifier> records = Lists.newArrayList(); Integer maxCount = ExtResultReader.getInteger(result, GbifInternalTerm.identifierCount); if (maxCount != null) { for (int count = 0; count < maxCount; count++) { String idCol = Columns.idColumn(count); String idTypeCol = Columns.idTypeColumn(count); String id = ResultReader.getString(result, Columns.OCCURRENCE_COLUMN_FAMILY, idCol, null); String rawType = ResultReader.getString(result, Columns.OCCURRENCE_COLUMN_FAMILY, idTypeCol, null); if (id != null && rawType != null) { IdentifierType idType = null; try { idType = IdentifierType.valueOf(rawType); } catch (IllegalArgumentException e) { LOG.warn("Unrecognized value for IdentifierType from field [{}] - data is corrupt.", rawType); } if (idType != null) { Identifier record = new Identifier(); record.setIdentifier(id); record.setType(idType); records.add(record); } } } } return records; } private static Set<OccurrenceIssue> extractIssues(Result result) { Set<OccurrenceIssue> issues = EnumSet.noneOf(OccurrenceIssue.class); for (OccurrenceIssue issue : OccurrenceIssue.values()) { String column = Columns.column(issue); byte[] val = result.getValue(Columns.CF, Bytes.toBytes(column)); if (val != null) { issues.add(issue); } } return issues; } /** * Builds the list of media objects. */ public static List<MediaObject> buildMedia(Result result) { List<MediaObject> media = null; String mediaJson = ExtResultReader.getString(result, Columns.column(Extension.MULTIMEDIA)); if (mediaJson != null && !mediaJson.isEmpty()) { try { media = MediaSerDeserUtils.fromJson(mediaJson); } catch (Exception e) { LOG.warn("Unable to deserialize media objects from hbase", e); } } return media; } }