package org.gbif.occurrence.persistence; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; import com.google.inject.Inject; import com.google.inject.Singleton; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.*; import org.apache.hadoop.hbase.filter.CompareFilter; import org.apache.hadoop.hbase.filter.SingleColumnValueFilter; import org.apache.hadoop.hbase.util.Bytes; import org.gbif.api.exception.ServiceUnavailableException; import org.gbif.api.model.occurrence.Occurrence; import org.gbif.api.model.occurrence.VerbatimOccurrence; import org.gbif.api.util.ClassificationUtils; import org.gbif.api.vocabulary.Extension; import org.gbif.api.vocabulary.OccurrenceIssue; import org.gbif.api.vocabulary.Rank; import org.gbif.dwc.terms.*; import org.gbif.occurrence.common.TermUtils; import org.gbif.occurrence.common.config.OccHBaseConfiguration; import org.gbif.occurrence.common.json.ExtensionSerDeserUtils; import org.gbif.occurrence.common.json.MediaSerDeserUtils; import org.gbif.occurrence.persistence.api.OccurrencePersistenceService; import org.gbif.occurrence.persistence.hbase.Columns; import org.gbif.occurrence.persistence.hbase.ExtResultReader; import org.gbif.occurrence.persistence.hbase.RowUpdate; import org.gbif.occurrence.persistence.util.OccurrenceBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.io.IOException; import java.math.BigDecimal; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; import static com.google.common.base.Preconditions.checkNotNull; /** * An implementation of OccurrenceService and OccurrenceWriter for persisting and retrieving Occurrence objects in * HBase. */ @Singleton public class OccurrencePersistenceServiceImpl implements OccurrencePersistenceService { private static final Logger LOG = LoggerFactory.getLogger(OccurrencePersistenceServiceImpl.class); private static final int SCANNER_CACHE_SIZE = 50; private final String occurrenceTableName; private final Connection connection; @Inject public OccurrencePersistenceServiceImpl(OccHBaseConfiguration cfg, Connection connection) { occurrenceTableName = checkNotNull(cfg.occTable, "tableName can't be null"); this.connection = checkNotNull(connection, "connection can't be null"); } /** * Note that the returned fragment here is a String that holds the actual xml or json snippet for this occurrence, * and not the Fragment object that is used elsewhere. * * @param key that identifies an occurrence * @return a String holding the original xml or json snippet for this occurrence */ @Override public String getFragment(int key) { String fragment = null; try (Table table = connection.getTable(TableName.valueOf(occurrenceTableName))) { Get get = new Get(Bytes.toBytes(key)); Result result = table.get(get); if (result == null || result.isEmpty()) { LOG.info("Couldn't find occurrence for id [{}], returning null", key); return null; } byte[] rawFragment = ExtResultReader.getBytes(result, Columns.column(GbifInternalTerm.fragment)); if (rawFragment != null) { fragment = Bytes.toString(rawFragment); } } catch (IOException e) { throw new ServiceUnavailableException("Could not read from HBase", e); } return fragment; } @Nullable @Override public VerbatimOccurrence getVerbatim(@Nullable Integer key) { if (key == null) { return null; } VerbatimOccurrence verb = null; try (Table table = connection.getTable(TableName.valueOf(occurrenceTableName))) { Get get = new Get(Bytes.toBytes(key)); Result result = table.get(get); if (result == null || result.isEmpty()) { LOG.debug("Couldn't find occurrence for key [{}], returning null", key); return null; } verb = OccurrenceBuilder.buildVerbatimOccurrence(result); } catch (IOException e) { throw new ServiceUnavailableException("Could not read from HBase", e); } return verb; } @Override public Occurrence get(@Nullable Integer key) { if (key == null) { return null; } Occurrence occ = null; try (Table table = connection.getTable(TableName.valueOf(occurrenceTableName))) { Get get = new Get(Bytes.toBytes(key)); Result result = table.get(get); if (result == null || result.isEmpty()) { LOG.debug("Couldn't find occurrence for key [{}], returning null", key); return null; } occ = OccurrenceBuilder.buildOccurrence(result); } catch (Exception e) { throw new ServiceUnavailableException("Could not read from HBase", e); } return occ; } @Override public Iterator<Integer> getKeysByColumn(byte[] columnValue, String columnName) { byte[] col = Bytes.toBytes(columnName); Scan scan = new Scan(); scan.setCaching(SCANNER_CACHE_SIZE); scan.addColumn(Columns.CF, col); scan.setFilter(new SingleColumnValueFilter(Columns.CF, col, CompareFilter.CompareOp.EQUAL, columnValue)); return new OccurrenceKeyIterator(connection, occurrenceTableName, scan); } @Override public void update(VerbatimOccurrence occurrence) { updateOcc(occurrence); } @Override public void update(Occurrence occ) { updateOcc(occ); } @Override public Occurrence delete(int occurrenceKey) { Occurrence occurrence = get(occurrenceKey); if (occurrence == null) { LOG.debug("Occurrence for key [{}] not found, ignoring delete request.", occurrenceKey); } else { delete(new ImmutableList.Builder<Integer>().add(occurrenceKey).build()); } LOG.debug("<< delete [{}]", occurrenceKey); return occurrence; } @Override public void delete(List<Integer> occurrenceKeys) { checkNotNull(occurrenceKeys, "occurrenceKeys can't be null"); try (Table table = connection.getTable(TableName.valueOf(occurrenceTableName))) { List<Delete> deletes = Lists.newArrayListWithExpectedSize(occurrenceKeys.size()); for (Integer occurrenceKey : occurrenceKeys) { if (occurrenceKey != null) { deletes.add(new Delete(Bytes.toBytes(occurrenceKey))); } } LOG.debug("Deleting [{}] occurrences", occurrenceKeys.size()); table.delete(deletes); } catch (IOException e) { throw new ServiceUnavailableException("Could not access HBase", e); } } <T extends VerbatimOccurrence> RowUpdate buildRowUpdate(T occ) { checkNotNull(occ, "occurrence can't be null"); checkNotNull(occ.getKey(), "occurrence's key can't be null"); RowUpdate upd = new RowUpdate(occ.getKey()); try (Table table = connection.getTable(TableName.valueOf(occurrenceTableName))) { if (occ instanceof Occurrence) { populateVerbatimPutDelete(table, upd, occ, false); populateInterpretedPutDelete(upd, (Occurrence) occ); } else { populateVerbatimPutDelete(table, upd, occ, true); } } catch (IOException e) { throw new ServiceUnavailableException("Could not access HBase", e); } return upd; } private <T extends VerbatimOccurrence> void updateOcc(T occ) { checkNotNull(occ, "occurrence can't be null"); checkNotNull(occ.getKey(), "occurrence's key can't be null"); RowUpdate upd = buildRowUpdate(occ); try (Table table = connection.getTable(TableName.valueOf(occurrenceTableName))) { upd.execute(table); } catch (IOException e) { throw new ServiceUnavailableException("Could not access HBase", e); } } /** * Populates the put and delete for a verbatim record. * * @param deleteInterpretedVerbatimColumns if true deletes also the verbatim columns removed during interpretation * (typically true when updating an Occurrence and false for * VerbatimOccurrence) */ private void populateVerbatimPutDelete(Table occTable, RowUpdate upd, VerbatimOccurrence occ, boolean deleteInterpretedVerbatimColumns) throws IOException { // adding the mutations to the HTable is quite expensive, hence worth all these comparisons VerbatimOccurrence oldVerb = getVerbatim(occ.getKey()); // schedule delete of any fields that are on the oldVerb but not on the updated verb, but only if we've been // explicitly asked to delete empty term columns (deleteInterpretedVerbatimColumns) or if the column is one that is // used equally by the verbatim and interp occurrences (e.g. changes to catalogNumber on either of verb or interp // should be reflected here (it is not an InterpretedSourceTerm), but not something like verbatimLatitude // (which is an InterpretedSourceTerm)). // for (Term term : oldVerb.getVerbatimFields().keySet()) { if ((!occ.hasVerbatimField(term) || occ.getVerbatimField(term) == null) && (deleteInterpretedVerbatimColumns || !TermUtils.isInterpretedSourceTerm(term))) { upd.deleteVerbatimField(term); } } // schedule the updates for any verbatim field that has changed for (Map.Entry<Term, String> field : occ.getVerbatimFields().entrySet()) { String oldValue = oldVerb.getVerbatimField(field.getKey()); String newValue = field.getValue(); if (newValue != null && !newValue.equals(oldValue)) { upd.setVerbatimField(field.getKey(), field.getValue()); } } if (!Objects.equals(oldVerb.getDatasetKey(), occ.getDatasetKey())) { upd.setInterpretedField(GbifTerm.datasetKey, occ.getDatasetKey()); } if (!Objects.equals(oldVerb.getPublishingCountry(), occ.getPublishingCountry())) { upd.setInterpretedField(GbifTerm.publishingCountry, occ.getPublishingCountry()); } if (!Objects.equals(oldVerb.getPublishingOrgKey(), occ.getPublishingOrgKey())) { upd.setInterpretedField(GbifInternalTerm.publishingOrgKey, occ.getPublishingOrgKey()); } if (!Objects.equals(oldVerb.getProtocol(), occ.getProtocol())) { upd.setInterpretedField(GbifTerm.protocol, occ.getProtocol()); } if (!Objects.equals(oldVerb.getLastCrawled(), occ.getLastCrawled())) { upd.setInterpretedField(GbifTerm.lastCrawled, occ.getLastCrawled()); } if (!Objects.equals(oldVerb.getLastParsed(), occ.getLastParsed())) { upd.setInterpretedField(GbifTerm.lastParsed, occ.getLastParsed()); } updateExtensions(oldVerb, occ, upd); } /** * Updates the extensions map of the newOcc object into the upd object. */ private void updateExtensions(VerbatimOccurrence oldOcc, VerbatimOccurrence newOcc, RowUpdate upd) throws IOException { for (Extension extension : Extension.values()) { String newExtensions = getExtensionAsJson(newOcc, extension); if (!Objects.equals(getExtensionAsJson(oldOcc, extension), newExtensions)) { upd.setVerbatimExtension(extension, newExtensions); } } } /** * Returns the JSON object of verbatimOccurrence.getExtensions().get(extension). * If verbatimOccurrence is null or the requested extension doesn't exist returns null. */ private String getExtensionAsJson(VerbatimOccurrence verbatimOccurrence, Extension extension) { String jsonExtensions = null; if (verbatimOccurrence.getExtensions() != null && verbatimOccurrence.getExtensions().containsKey(extension)) { jsonExtensions = ExtensionSerDeserUtils.toJson(verbatimOccurrence.getExtensions().get(extension)); } return jsonExtensions; } /** * Populates put and delete for the occurrence specific interpreted columns, leaving any verbatim columns untouched. * TODO: use reflection to get values from the java properties now that we have corresponding terms? */ private void populateInterpretedPutDelete(RowUpdate upd, Occurrence occ) throws IOException { Occurrence oldOcc = get(occ.getKey()); if (!Objects.equals(oldOcc.getBasisOfRecord(), occ.getBasisOfRecord())) { upd.setInterpretedField(DwcTerm.basisOfRecord, occ.getBasisOfRecord()); } if (!Objects.equals(oldOcc.getTaxonKey(), occ.getTaxonKey())) { upd.setInterpretedField(GbifTerm.taxonKey, occ.getTaxonKey()); } for (Rank r : Rank.DWC_RANKS) { if (!Objects.equals(oldOcc.getHigherRankKey(r), occ.getHigherRankKey(r))) { upd.setInterpretedField(OccurrenceBuilder.rank2KeyTerm.get(r), ClassificationUtils.getHigherRankKey(occ, r)); } if (!Objects.equals(oldOcc.getHigherRank(r), occ.getHigherRank(r))) { upd.setInterpretedField(OccurrenceBuilder.rank2taxonTerm.get(r), ClassificationUtils.getHigherRank(occ, r)); } } if (!Objects.equals(oldOcc.getDepth(), occ.getDepth())) { upd.setInterpretedField(GbifTerm.depth, occ.getDepth()); } if (!Objects.equals(oldOcc.getDepthAccuracy(), occ.getDepthAccuracy())) { upd.setInterpretedField(GbifTerm.depthAccuracy, occ.getDepthAccuracy()); } if (!Objects.equals(oldOcc.getElevation(), occ.getElevation())) { upd.setInterpretedField(GbifTerm.elevation, occ.getElevation()); } if (!Objects.equals(oldOcc.getElevationAccuracy(), occ.getElevationAccuracy())) { upd.setInterpretedField(GbifTerm.elevationAccuracy, occ.getElevationAccuracy()); } if (!Objects.equals(oldOcc.getDecimalLatitude(), occ.getDecimalLatitude())) { upd.setInterpretedField(DwcTerm.decimalLatitude, occ.getDecimalLatitude()); } if (!Objects.equals(oldOcc.getDecimalLongitude(), occ.getDecimalLongitude())) { upd.setInterpretedField(DwcTerm.decimalLongitude, occ.getDecimalLongitude()); } if (!Objects.equals(oldOcc.getCountry(), occ.getCountry())) { upd.setInterpretedField(DwcTerm.countryCode, occ.getCountry()); } if (!Objects.equals(oldOcc.getModified(), occ.getModified())) { upd.setInterpretedField(DcTerm.modified, occ.getModified()); } if (!Objects.equals(oldOcc.getEventDate(), occ.getEventDate())) { upd.setInterpretedField(DwcTerm.eventDate, occ.getEventDate()); } if (!Objects.equals(oldOcc.getYear(), occ.getYear())) { upd.setInterpretedField(DwcTerm.year, occ.getYear()); } if (!Objects.equals(oldOcc.getMonth(), occ.getMonth())) { upd.setInterpretedField(DwcTerm.month, occ.getMonth()); } if (!Objects.equals(oldOcc.getDay(), occ.getDay())) { upd.setInterpretedField(DwcTerm.day, occ.getDay()); } if (!Objects.equals(oldOcc.getScientificName(), occ.getScientificName())) { upd.setInterpretedField(DwcTerm.scientificName, occ.getScientificName()); } if (!Objects.equals(oldOcc.getGenericName(), occ.getGenericName())) { upd.setInterpretedField(GbifTerm.genericName, occ.getGenericName()); } if (!Objects.equals(oldOcc.getSpecificEpithet(), occ.getSpecificEpithet())) { upd.setInterpretedField(DwcTerm.specificEpithet, occ.getSpecificEpithet()); } if (!Objects.equals(oldOcc.getInfraspecificEpithet(), occ.getInfraspecificEpithet())) { upd.setInterpretedField(DwcTerm.infraspecificEpithet, occ.getInfraspecificEpithet()); } if (!Objects.equals(oldOcc.getTaxonRank(), occ.getTaxonRank())) { upd.setInterpretedField(DwcTerm.taxonRank, occ.getTaxonRank()); } if (!Objects.equals(oldOcc.getCoordinateUncertaintyInMeters(), occ.getCoordinateUncertaintyInMeters())) { upd.setInterpretedField(DwcTerm.coordinateUncertaintyInMeters, occ.getCoordinateUncertaintyInMeters()); } if (!Objects.equals(oldOcc.getCoordinatePrecision(), occ.getCoordinatePrecision())) { upd.setInterpretedField(DwcTerm.coordinatePrecision, occ.getCoordinatePrecision()); } if (!Objects.equals(oldOcc.getContinent(), occ.getContinent())) { upd.setInterpretedField(DwcTerm.continent, occ.getContinent()); } if (!Objects.equals(oldOcc.getDateIdentified(), occ.getDateIdentified())) { upd.setInterpretedField(DwcTerm.dateIdentified, occ.getDateIdentified()); } if (!Objects.equals(oldOcc.getEstablishmentMeans(), occ.getEstablishmentMeans())) { upd.setInterpretedField(DwcTerm.establishmentMeans, occ.getEstablishmentMeans()); } if (!Objects.equals(oldOcc.getIndividualCount(), occ.getIndividualCount())) { upd.setInterpretedField(DwcTerm.individualCount, occ.getIndividualCount()); } if (!Objects.equals(oldOcc.getLifeStage(), occ.getLifeStage())) { upd.setInterpretedField(DwcTerm.lifeStage, occ.getLifeStage()); } if (!Objects.equals(oldOcc.getSex(), occ.getSex())) { upd.setInterpretedField(DwcTerm.sex, occ.getSex()); } if (!Objects.equals(oldOcc.getStateProvince(), occ.getStateProvince())) { upd.setInterpretedField(DwcTerm.stateProvince, occ.getStateProvince()); } if (!Objects.equals(oldOcc.getWaterBody(), occ.getWaterBody())) { upd.setInterpretedField(DwcTerm.waterBody, occ.getWaterBody()); } if (!Objects.equals(oldOcc.getTypeStatus(), occ.getTypeStatus())) { upd.setInterpretedField(DwcTerm.typeStatus, occ.getTypeStatus()); } if (!Objects.equals(oldOcc.getTypifiedName(), occ.getTypifiedName())) { upd.setInterpretedField(GbifTerm.typifiedName, occ.getTypifiedName()); } if (!Objects.equals(oldOcc.getLastInterpreted(), occ.getLastInterpreted())) { upd.setInterpretedField(GbifTerm.lastInterpreted, occ.getLastInterpreted()); } if (!Objects.equals(oldOcc.getReferences(), occ.getReferences())) { upd.setInterpretedField(DcTerm.references, occ.getReferences()); } if (!Objects.equals(oldOcc.getLicense(), occ.getLicense())) { upd.setInterpretedField(DcTerm.license, occ.getLicense()); } // Multimedia extension String newMediaJson = MediaSerDeserUtils.toJson(occ.getMedia()); if (!Objects.equals(MediaSerDeserUtils.toJson(oldOcc.getMedia()), newMediaJson)) { upd.setInterpretedExtension(Extension.MULTIMEDIA, newMediaJson); } // OccurrenceIssues for (OccurrenceIssue issue : oldOcc.getIssues()) { if (!occ.getIssues().contains(issue)) { upd.setField(Columns.column(issue), null); } } for (OccurrenceIssue issue : occ.getIssues()) { if (!oldOcc.getIssues().contains(issue)) { upd.setField(Columns.column(issue), Bytes.toBytes(1)); } } } /** * Used to round (with half up) a BigDecimal to only keep a certain number of digit(s). */ private static BigDecimal nullSafeRoundHalfUp(BigDecimal value, int scale){ if (value == null) { return null; } return value.setScale(scale, BigDecimal.ROUND_HALF_UP); } }