package org.gbif.occurrence.cli.registry.sync;

import org.gbif.api.model.occurrence.Occurrence;
import org.gbif.api.model.registry.Dataset;
import org.gbif.api.model.registry.Organization;
import org.gbif.common.messaging.api.messages.DeleteDatasetOccurrencesMessage;
import org.gbif.common.messaging.api.messages.OccurrenceDeletionReason;
import org.gbif.common.messaging.api.messages.OccurrenceMutatedMessage;

import java.io.IOException;
import java.util.Map;
import java.util.Set;
import java.util.UUID;

import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * To be replaced by {@link OccurrenceRegistryMapper}.
 *
 * <p>A MapReduce mapper that synchronizes occurrences with the registry. It checks for changes detected by
 * {@link RegistryBasedOccurrenceMutator} and for dataset deletions. For organization changes the new values
 * are written to the occurrence HBase table via occurrence persistence, and an {@link OccurrenceMutatedMessage}
 * is sent. For dataset deletions a {@link DeleteDatasetOccurrencesMessage} is sent.
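 *
 * <p>A minimal sketch of how this mapper might be wired into a scan job; the table name, scan and job
 * setup below are illustrative assumptions, not configuration taken from this project:
 * <pre>{@code
 * Job job = Job.getInstance(conf, "occurrence-registry-sync");
 * Scan scan = new Scan();
 * scan.addFamily(SyncCommon.OCC_CF); // only the occurrence column family is read
 * // no reducer and no map output: the mapper persists updates and sends messages as side effects
 * TableMapReduceUtil.initTableMapperJob("occurrence", scan, OccurrenceScanMapper.class, null, null, job);
 * job.setNumReduceTasks(0);
 * }</pre>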
*/
public class OccurrenceScanMapper extends AbstractOccurrenceRegistryMapper {

  private static final Logger LOG = LoggerFactory.getLogger(OccurrenceScanMapper.class);

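  // Caches shared across map() calls within this JVM: datasets already seen to be deleted, datasets
  // whose registry values already match their occurrences, and the publishing organization of datasets
  // known to need updating. The default Mapper is single-threaded, so unsynchronized collections suffice.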
  private static final Set<UUID> DEAD_DATASETS = Sets.newHashSet();
  private static final Set<UUID> UNCHANGED_DATASETS = Sets.newHashSet();
  private static final Map<UUID, Organization> DATASET_TO_OWNING_ORG = Maps.newHashMap();

  private int numRecords = 0;

  @Override
  public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
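    // every occurrence row is expected to carry its dataset key; a missing column would surface here
    // as a NullPointerException from UUID.fromString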
    UUID datasetKey = UUID.fromString(Bytes.toString(values.getValue(SyncCommon.OCC_CF, SyncCommon.DK_COL)));
    if (DEAD_DATASETS.contains(datasetKey) || UNCHANGED_DATASETS.contains(datasetKey)) {
      return;
    }

    Dataset dataset = datasetService.get(datasetKey);
    if (dataset.getDeleted() != null) {
      DEAD_DATASETS.add(datasetKey);
      try {
        LOG.info("Sending delete dataset message for dataset [{}]", datasetKey);
        messagePublisher.send(new DeleteDatasetOccurrencesMessage(datasetKey, OccurrenceDeletionReason.DATASET_MANUAL));
      } catch (IOException e) {
        LOG.warn("Failed to send delete dataset message", e);
      }
      return;
    }

    // dataset exists, now compare with values we have on the occurrence
    Organization publishingOrg;
    boolean needsUpdate;
    if (DATASET_TO_OWNING_ORG.containsKey(datasetKey)) {
      // seen it before, no need to do comparisons - record needs updating
      publishingOrg = DATASET_TO_OWNING_ORG.get(datasetKey);
      needsUpdate = true;
    } else {
      publishingOrg = orgService.get(dataset.getPublishingOrganizationKey());
      if (occurrenceMutator.requiresUpdate(dataset, publishingOrg, values)) {
        needsUpdate = true;
        DATASET_TO_OWNING_ORG.put(datasetKey, publishingOrg);
      } else {
        needsUpdate = false;
        UNCHANGED_DATASETS.add(datasetKey);
      }
    }
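
    // registry values changed: rewrite the stored occurrence and notify downstream processors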
    if (needsUpdate) {
      // Occurrence has no clone or other easy copy method, so fetch the record a second time: one
      // copy stays untouched for the message, the other is mutated and persisted
      Occurrence origOcc = occurrencePersistenceService.get(Bytes.toInt(row.get()));
      Occurrence updatedOcc = occurrencePersistenceService.get(Bytes.toInt(row.get()));
      occurrenceMutator.mutateOccurrence(updatedOcc, dataset, publishingOrg);
      occurrencePersistenceService.update(updatedOcc);
      int crawlId = Bytes.toInt(values.getValue(SyncCommon.OCC_CF, SyncCommon.CI_COL));
      OccurrenceMutatedMessage msg =
          OccurrenceMutatedMessage.buildUpdateMessage(datasetKey, origOcc, updatedOcc, crawlId);
      try {
        LOG.info(
            "Sending update for dataset [{}], publishing org changed from [{}] to [{}] and publishing country from [{}] to [{}]",
            datasetKey, origOcc.getPublishingOrgKey(), updatedOcc.getPublishingOrgKey(), origOcc.getPublishingCountry(),
            updatedOcc.getPublishingCountry());
        messagePublisher.send(msg);
      } catch (IOException e) {
        LOG.warn("Failed to send update message", e);
      }
    }

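    // count records and report status periodically; setStatus also signals liveness to the framework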
    numRecords++;
    if (numRecords % 10000 == 0) {
      context.setStatus("mapper processed " + numRecords + " records so far");
    }
  }
}