package org.gbif.checklistbank.cli.importer;

import org.gbif.api.model.Constants;
import org.gbif.api.model.checklistbank.NameUsage;
import org.gbif.api.model.checklistbank.NameUsageMetrics;
import org.gbif.api.model.checklistbank.ParsedName;
import org.gbif.api.model.checklistbank.VerbatimNameUsage;
import org.gbif.api.service.checklistbank.NameUsageService;
import org.gbif.api.util.ClassificationUtils;
import org.gbif.api.vocabulary.Origin;
import org.gbif.api.vocabulary.Rank;
import org.gbif.api.vocabulary.TaxonomicStatus;
import org.gbif.checklistbank.cli.model.UsageFacts;
import org.gbif.checklistbank.kryo.CliKryoFactory;
import org.gbif.checklistbank.model.UsageExtensions;
import org.gbif.checklistbank.model.UsageForeignKeys;
import org.gbif.checklistbank.neo.ImportDb;
import org.gbif.checklistbank.neo.Labels;
import org.gbif.checklistbank.neo.NeoProperties;
import org.gbif.checklistbank.neo.RelType;
import org.gbif.checklistbank.neo.UsageDao;
import org.gbif.checklistbank.neo.traverse.ChunkingEvaluator;
import org.gbif.checklistbank.neo.traverse.MultiRootNodeIterator;
import org.gbif.checklistbank.neo.traverse.Traversals;
import org.gbif.checklistbank.neo.traverse.TreeIterablesSorted;
import org.gbif.checklistbank.nub.model.NubUsage;
import org.gbif.checklistbank.service.DatasetImportService;
import org.gbif.checklistbank.service.ImporterCallback;
import org.gbif.checklistbank.service.UsageService;

import java.io.ByteArrayOutputStream;
import java.util.Calendar;
import java.util.Iterator;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import com.esotericsoftware.kryo.pool.KryoPool;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.inject.Inject;
import org.neo4j.graphdb.Direction;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.graphdb.Transaction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Importer that reads a neo database and syncs it with a postgres checklistbank db and solr index.
 * It understands pro parte synonym relations and creates multiple postgres usages for each accepted parent.
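 * <p>
 * Typical usage, as a sketch only (the CLI normally wires the configuration and services through Guice):
 * <pre>{@code
 *   Importer importer = Importer.create(cfg, datasetKey, nameUsageService, usageService, sqlService, solrService);
 *   importer.run(); // blocks until the checklist has been synced or an exception is thrown
 * }</pre>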
 */
public class Importer extends ImportDb implements Runnable, ImporterCallback {

  private static final Logger LOG = LoggerFactory.getLogger(Importer.class);
  private static final int SELF_ID = -1;

  private final ImporterConfiguration cfg;
  private int syncCounterMain;
  private int syncCounterBatches;
  private int syncCounterProParte;
  private int delCounter;
  private final DatasetImportService sqlService;
  private final DatasetImportService solrService;
  private final NameUsageService nameUsageService;
  private final UsageService usageService;
  // neo internal ids to clb usage keys
  private ConcurrentHashMap<Integer, Integer> clbKeys = new ConcurrentHashMap<Integer, Integer>();
  // map based around internal neo4j node ids:
  private ConcurrentHashMap<Integer, UsageForeignKeys> postKeys = new ConcurrentHashMap<Integer, UsageForeignKeys>();
  // set of pro parte synonym neo node ids
  private Set<Long> proParteNodes = Sets.newHashSet();
  private int maxExistingNubKey = -1;
  private volatile int firstUsageKey = -1;
  private Future<List<NameUsage>> proParteFuture;
  private Queue<Future<List<Integer>>> usageFutures = new ConcurrentLinkedQueue<Future<List<Integer>>>();
  private Queue<Future<?>> otherFutures = new ConcurrentLinkedQueue<Future<?>>();
  private final KryoPool kryoPool = new KryoPool.Builder(new CliKryoFactory()).build();

  private enum KeyType {PARENT, ACCEPTED, BASIONYM, CLASSIFICATION}

  @Inject
  private Importer(UUID datasetKey, UsageDao dao, NameUsageService nameUsageService, UsageService usageService,
                   DatasetImportService sqlService, DatasetImportService solrService, ImporterConfiguration cfg) {
    super(datasetKey, dao);
    this.cfg = cfg;
    this.nameUsageService = nameUsageService;
    this.usageService = usageService;
    this.sqlService = sqlService;
    this.solrService = solrService;
  }

  /**
   * @param usageService only needed if you are going to sync the backbone dataset. Tests can usually just pass in null.
   */
  public static Importer create(ImporterConfiguration cfg, UUID datasetKey, NameUsageService nameUsageService,
                                UsageService usageService,
                                DatasetImportService sqlService, DatasetImportService solrService) {
    return new Importer(datasetKey,
        UsageDao.persistentDao(cfg.neo, datasetKey, true, null, false),
        nameUsageService, usageService, sqlService, solrService, cfg);
  }

  @Override
  public void run() {
    LOG.info("Start importing checklist");
    try {
      syncDataset();
      LOG.info("Waiting for threads to finish {} sql and {} solr jobs", usageFutures.size(), otherFutures.size());
      awaitUsageFutures();
      awaitProParteFuture();
      // wait for extensions and solr jobs to finish
      awaitOtherFutures();
      LOG.info("Importing succeeded. {} main, {} subtree chunk and {} pro parte usages synced",
          syncCounterMain, syncCounterBatches, syncCounterProParte);
    } catch (InterruptedException e) {
      LOG.error("Job interrupted, data is likely to be inconsistent.");
      Thread.currentThread().interrupt();
      Throwables.propagate(e);
    } catch (ExecutionException e) {
      LOG.error("Error executing job", e.getCause());
      Throwables.propagate(e);
    } finally {
      LOG.debug("Shutting down graph database");
      dao.close();
      LOG.info("Neo database shut down.");
    }
  }

  /**
   * Iterates over all accepted taxa in taxonomic order including all synonyms and syncs each usage individually
   * with Checklist Bank Postgres. As basionym relations can crosslink basically any record we first set the basionym
   * key to null and update just those keys in a second iteration. Most usages will not have a basionymKey, so
   * performance should only be badly impacted in rare cases.
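   * <p>
   * Conceptually the two passes look like this (a simplified sketch, not the exact call sequence used below;
   * {@code foreignKeys} stands for the collected {@code UsageForeignKeys}):
   * <pre>{@code
   *   // pass 1: sync usages; unresolved basionym/parent keys are remembered in postKeys
   *   usageFutures.add(sqlService.sync(datasetKey, this, batch));
   *   // pass 2: once every usage exists in postgres, patch the remembered keys
   *   sqlService.updateForeignKeys(datasetKey, foreignKeys);
   * }</pre>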
   *
   * The taxonomic tree is traversed and whenever possible subtrees are processed in separate batches.
   * The main traversal happens more or less synchronously as its usages are required to exist before the subtrees can be synced.
   *
   * @throws EmptyImportException if no records at all have been imported
   */
  private void syncDataset() throws EmptyImportException, ExecutionException, InterruptedException {
    if (datasetKey.equals(Constants.NUB_DATASET_KEY)) {
      // remember the current highest nub key so we know if incoming ones are inserts or updates
      Integer high = usageService.maxUsageKey(Constants.NUB_DATASET_KEY);
      maxExistingNubKey = high == null ? -1 : high;
      LOG.info("Sync GBIF backbone. Current max nub usageKey={}", maxExistingNubKey);
    }
    // we keep the very first usage key to retrieve the exact last modified timestamp from the database
    // in order to avoid clock differences between machines and threads.
    firstUsageKey = -1;
    int chunks = 0;

    // traverse the tree
    try (Transaction tx = dao.beginTx()) {
      LOG.info("Chunking imports into slices of {} to {}", cfg.chunkMinSize, cfg.chunkSize);
      ChunkingEvaluator chunkingEvaluator = new ChunkingEvaluator(dao, cfg.chunkMinSize, cfg.chunkSize);
      List<Integer> batch = Lists.newArrayList();
      for (Node n : MultiRootNodeIterator.create(TreeIterablesSorted.findRoot(dao.getNeo()),
          Traversals.TREE_WITHOUT_PRO_PARTE.evaluator(chunkingEvaluator))) {
        if (chunkingEvaluator.isChunk(n.getId())) {
          LOG.debug("chunk node {} found", n.getId());
          Future<List<Integer>> f = null;
          if (!batch.isEmpty()) {
            LOG.debug("submit {} main nodes for concurrent syncing starting with node {}", batch.size(), batch.get(0));
            f = sqlService.sync(datasetKey, this, batch);
          }
          // while the main nodes sync we can already read in the new subtree
          batch = subtreeBatch(n);
          chunks++;
          syncCounterBatches = syncCounterBatches + batch.size();
          // wait for the main future to finish and submit the solr update ...
          if (f != null) {
            otherFutures.add(solrService.sync(datasetKey, this, f.get()));
            LOG.debug("main nodes synced. Submit solr update");
          }
          // main nodes are in postgres. Now we can submit the sync task for the subtree
          LOG.debug("submit subtree chunk with {} usages starting with {}", batch.size(), n);
          usageFutures.add(sqlService.sync(datasetKey, this, batch));
          // reset main batch for new usages
          batch = Lists.newArrayList();
          clearFinishedUsageTasks();
        } else {
          // add to main batch
          batch.add((int) n.getId());
          if (isProParteNode(n)) {
            proParteNodes.add(n.getId());
          }
          syncCounterMain++;
        }
      }
      if (!batch.isEmpty()) {
        LOG.debug("submit final {} main nodes for concurrent syncing", batch.size());
        usageFutures.add(sqlService.sync(datasetKey, this, batch));
      }
    }

    // wait for the main sql usage imports to be done so we don't break foreign key constraints
    LOG.info("Wait for usage import tasks to finish.");
    awaitUsageFutures();
    LOG.info("Core usage import completed. {} chunk jobs synced with {} main usages and {} subtree batch usages.",
        chunks, syncCounterMain, syncCounterBatches);
    if (clbKeys.size() != syncCounterMain + syncCounterBatches) {
      LOG.warn("{} clb usage keys known for {} neo nodes ({} main, {} chunk). Expecting \"NodeId not in CLB exceptions\" ...",
          clbKeys.size(), syncCounterMain + syncCounterBatches, syncCounterMain, syncCounterBatches);
    }

    // update foreign keys that did not exist during the initial inserts
    updateForeignKeys();

    // finally import the extra pro parte usages
    syncProParte();

    // make sure we have imported at least one record
    if (firstUsageKey < 0) {
      LOG.warn("No records imported. Keep all existing data!");
      throw new EmptyImportException(datasetKey, "No records imported for dataset " + datasetKey);
    }

    // remove old usages
    deleteOldUsages();
  }

  private void updateForeignKeys() {
    if (!postKeys.isEmpty()) {
      // update neo ids to clb usage keys
      LOG.info("Updating foreign keys for {} usages", postKeys.size());
      for (UsageForeignKeys fk : postKeys.values()) {
        fk.setUsageKey(clbKey(fk.getUsageKey()));
        fk.setParentKey(clbKey(fk.getParentKey()));
        fk.setBasionymKey(clbKey(fk.getBasionymKey()));
      }
      List<UsageForeignKeys> fks = ImmutableList.copyOf(postKeys.values());
      sqlService.updateForeignKeys(datasetKey, fks);
      solrService.updateForeignKeys(datasetKey, fks);
    }
  }

  private void syncProParte() {
    if (!proParteNodes.isEmpty()) {
      LOG.info("Syncing {} pro parte usages", proParteNodes.size());
      for (List<Long> ids : Iterables.partition(proParteNodes, cfg.chunkSize)) {
        List<NameUsage> usages = Lists.newArrayList();
        List<ParsedName> names = Lists.newArrayList();
        try (Transaction tx = dao.getNeo().beginTx()) {
          for (Long id : ids) {
            Node n = dao.getNeo().getNodeById(id);
            NameUsage primary = readUsage(n);
            ParsedName pn = readName(id);
            // modify as a template for all cloned pro parte usages
            primary.setProParteKey(primary.getKey());
            primary.setOrigin(Origin.PROPARTE);
            primary.setTaxonID(null); // if we keep the original id we will do an update, not an insert
            primary.setParentKey(null);
            for (Relationship rel : n.getRelationships(RelType.PROPARTE_SYNONYM_OF, Direction.OUTGOING)) {
              // pro parte synonyms keep their id in the relation, read it
              // http://dev.gbif.org/issues/browse/POR-2872
              NameUsage u = clone(primary);
              u.setKey((Integer) rel.getProperty(NeoProperties.USAGE_KEY, null));
              Node accN = rel.getEndNode();
              // all nodes should be synced by now, so clb keys must be known
              u.setAcceptedKey(clbKeys.get((int) accN.getId()));
              // use the accepted taxon classification for this synonym record
              applyClbClassification(u, accN.getId());
              usages.add(u);
              names.add(pn);
            }
          }
        }
        // submit sync job
        syncCounterProParte = syncCounterProParte + usages.size();
        proParteFuture = sqlService.sync(datasetKey, this, usages, names);
      }
    }
  }

  /**
   * Applies the classification from another node, transforming the neo node ids into existing clb usage keys.
   *
   * @param u the usage to modify
   * @param classificationNodeId the neo node id whose classification should be applied
   */
  private void applyClbClassification(NameUsage u, long classificationNodeId) {
    UsageFacts facts = dao.readFacts(classificationNodeId);
    // apply classification if existing
    if (facts != null && facts.classification != null) {
      ClassificationUtils.copyLinneanClassificationKeys(facts.classification, u);
      ClassificationUtils.copyLinneanClassification(facts.classification, u);
    }
    // convert to clb keys
    for (Rank r : Rank.DWC_RANKS) {
      ClassificationUtils.setHigherRankKey(u, r, clbKey(u.getHigherRankKey(r)));
    }
  }

  private List<Integer> subtreeBatch(Node startNode) {
    List<Integer> ids = Lists.newArrayList();
    try (Transaction tx = dao.beginTx()) {
      // returns all descendant nodes, accepted and synonyms, but excludes pro parte relations!
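      // pro parte synonym nodes found here are only remembered in proParteNodes;
      // their additional usages are created later in syncProParte() once all regular usages exist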
      for (Node n : MultiRootNodeIterator.create(startNode, Traversals.TREE_WITHOUT_PRO_PARTE)) {
        ids.add((int) n.getId());
        if (isProParteNode(n)) {
          proParteNodes.add(n.getId());
        }
      }
    }
    LOG.debug("Created batch of {} nodes starting with {}", ids.size(), startNode);
    return ids;
  }

  private boolean isProParteNode(Node n) {
    return n.hasRelationship(Direction.OUTGOING, RelType.PROPARTE_SYNONYM_OF);
  }

  private void deleteOldUsages() {
    NameUsage first = nameUsageService.get(firstUsageKey, null);
    if (first == null || first.getLastInterpreted() == null) {
      LOG.error("First synced name usage with id {} not found", firstUsageKey);
      throw new EmptyImportException(datasetKey, "Error importing name usages for dataset " + datasetKey);
    }
    Calendar cal = Calendar.getInstance();
    cal.setTime(first.getLastInterpreted());
    // use 2 seconds before the first insert/update as the threshold to remove records
    cal.add(Calendar.SECOND, -2);
    LOG.info("Deleting all usages before {}", cal.getTime());
    // iterate over all ids to be deleted and remove them from solr first
    List<Integer> ids = usageService.listOldUsages(datasetKey, cal.getTime());
    otherFutures.add(solrService.deleteUsages(datasetKey, ids));
    otherFutures.add(sqlService.deleteUsages(datasetKey, ids));
    delCounter = ids.size();
  }

  /**
   * Blocks until all currently listed futures are completed.
   */
  private void awaitOtherFutures() throws ExecutionException, InterruptedException {
    for (Future<?> f : otherFutures) {
      f.get();
    }
    otherFutures.clear();
  }

  private void awaitProParteFuture() throws ExecutionException, InterruptedException {
    if (proParteFuture != null) {
      // wait for the pro parte pg sync.
      // solr doesn't need the parsed names
      otherFutures.add(solrService.sync(datasetKey, this, proParteFuture.get(), null));
    }
  }

  /**
   * Waits for all core usage jobs to finish and submits solr updates for all of them once completed.
   */
  private void awaitUsageFutures() throws ExecutionException, InterruptedException {
    for (Future<List<Integer>> f : usageFutures) {
      List<Integer> ids = f.get();
      otherFutures.add(solrService.sync(datasetKey, this, ids));
    }
    usageFutures.clear();
  }

  private void clearFinishedUsageTasks() throws ExecutionException, InterruptedException {
    Iterator<Future<List<Integer>>> iter = usageFutures.iterator();
    while (iter.hasNext()) {
      Future<List<Integer>> f = iter.next();
      if (f.isDone()) {
        List<Integer> ids = f.get();
        otherFutures.add(solrService.sync(datasetKey, this, ids));
        iter.remove();
      }
    }
  }

  /**
   * @return list of parental clb usage keys
   */
  private List<Integer> buildClbParents(Node n) {
    // we copy the transformed, short list as it is still backed by some neo transaction
    return StreamSupport.stream(n.getRelationships(RelType.PARENT_OF, Direction.INCOMING).spliterator(), false)
        .map(rel -> rel != null ? clbKey((int) rel.getStartNode().getId()) : null)
        .collect(Collectors.toList());
  }

  /**
   * Maps a neo node id to an already created clb postgres id.
   * If the mapping does not exist an IllegalStateException is thrown.
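   * <p>
   * For example (hypothetical values): once {@code reportUsageKey(42L, 1234)} has been called for a node,
   * {@code clbKey(42)} returns 1234, while calling it for a node that has not been synced yet throws.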
   */
  private Integer clbKey(Integer nodeId) {
    if (nodeId == null) {
      return null;
    }
    if (clbKeys.containsKey(nodeId)) {
      return clbKeys.get(nodeId);
    } else {
      // missing key
      try (Transaction tx = dao.getNeo().beginTx()) {
        Node n = dao.getNeo().getNodeById(nodeId);
        LOG.error("Clb usage key missing for {}: {}", n, NeoProperties.getScientificName(n));
        NubUsage nub = dao.readNub(n);
        if (nub != null) {
          LOG.info("Nub usage for missing key: {}", nub.toStringComplete());
        } else {
          LOG.warn("Nub usage for missing key {} not found", nodeId);
        }
      } catch (Exception e) {
        // ignore, we throw anyway
      }
      throw new IllegalStateException("NodeId not in CLB yet: " + nodeId);
    }
  }

  /**
   * Maps a neo node id of a foreign key to an already created clb postgres id.
   * If the requested nodeFk actually refers to the current node id, then -1 will be returned to indicate to the mybatis
   * mapper that it should use the newly generated sequence value.
   *
   * @param nodeId the neo node id of the currently processed name usage record
   * @param nodeFk the foreign key to the node id we want to set up the relation to
   */
  private Integer clbForeignKey(long nodeId, Integer nodeFk, KeyType type) {
    if (nodeFk == null) return null;

    if (clbKeys.containsKey(nodeFk)) {
      // we have already imported the node and know the clb key
      return clbKeys.get(nodeFk);
    } else if (nodeId == (long) nodeFk) {
      // tell postgres to use the newly generated key of the inserted record
      return SELF_ID;
    } else if (KeyType.CLASSIFICATION == type) {
      // should not happen as we process the usages in a taxonomic hierarchy from top down.
      // if you see this it looks like the normalizer did a bad job somewhere
      throw new IllegalStateException("Higher classification NodeId not in CLB yet: " + nodeFk);
    } else {
      // remember non classification keys for update after all records have been synced once
      int nid = (int) nodeId;
      if (!postKeys.containsKey(nid)) {
        postKeys.put(nid, new UsageForeignKeys(nid));
      }
      setFK(postKeys.get(nid), nodeFk, type);
      return null;
    }
  }

  private UsageForeignKeys setFK(UsageForeignKeys fk, Integer key, KeyType type) {
    if (key != null && type != null) {
      switch (type) {
        case BASIONYM:
          fk.setBasionymKey(key);
          break;
        case PARENT:
        case ACCEPTED:
          fk.setParentKey(key);
          break;
        case CLASSIFICATION:
          throw new IllegalArgumentException();
      }
    }
    return fk;
  }

  private NameUsage clone(NameUsage u) {
    Kryo kryo = kryoPool.borrow();
    try {
      // write
      ByteArrayOutputStream buffer = new ByteArrayOutputStream(256);
      Output output = new Output(buffer, 256);
      kryo.writeObject(output, u);
      output.close();
      // read
      return kryo.readObject(new Input(buffer.toByteArray()), NameUsage.class);
    } finally {
      kryoPool.release(kryo);
    }
  }

  @Override
  public NameUsage readUsage(long id) {
    try (Transaction tx = dao.beginTx()) {
      Node n = dao.getNeo().getNodeById(id);
      return readUsage(n);
    }
  }

  @Override
  public ParsedName readName(long id) {
    if (Constants.NUB_DATASET_KEY.equals(datasetKey)) {
      return dao.readNubName(id);
    } else {
      return dao.readName(id);
    }
  }

  /**
   * Reads the full name usage from neo and updates all foreign keys to use CLB usage keys.
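   * <p>
   * Parent, accepted and basionym keys that cannot be resolved yet are recorded via
   * {@link #clbForeignKey} and patched later by {@link #updateForeignKeys()}; higher classification
   * keys must already be known and otherwise cause an IllegalStateException.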
   */
  private NameUsage readUsage(Node n) {
    // this is using neo4j internal node ids as keys:
    NameUsage u = dao.readUsage(n, true);
    Preconditions.checkNotNull(u, "Node %s not found in kvp store", n.getId());
    Integer id = (int) n.getId();
    if (clbKeys.containsKey(id)) {
      u.setKey(clbKeys.get(id));
    }
    UsageFacts facts;
    if (n.hasLabel(Labels.SYNONYM)) {
      // use the classification of the parent in case of synonyms
      facts = dao.readFacts(u.getAcceptedKey());
      u.setTaxonomicStatus(u.getTaxonomicStatus() == null ? TaxonomicStatus.SYNONYM : u.getTaxonomicStatus());
      u.setAcceptedKey(clbForeignKey(n.getId(), u.getAcceptedKey(), KeyType.ACCEPTED));
    } else {
      facts = dao.readFacts(n.getId());
      u.setTaxonomicStatus(u.getTaxonomicStatus() == null ? TaxonomicStatus.ACCEPTED : u.getTaxonomicStatus());
      u.setParentKey(clbForeignKey(n.getId(), u.getParentKey(), KeyType.PARENT));
    }
    if (facts != null && facts.classification != null) {
      ClassificationUtils.copyLinneanClassificationKeys(facts.classification, u);
      ClassificationUtils.copyLinneanClassification(facts.classification, u);
    }
    u.setBasionymKey(clbForeignKey(n.getId(), u.getBasionymKey(), KeyType.BASIONYM));
    for (Rank r : Rank.DWC_RANKS) {
      try {
        ClassificationUtils.setHigherRankKey(u, r, clbForeignKey(n.getId(), u.getHigherRankKey(r), KeyType.CLASSIFICATION));
      } catch (IllegalStateException e) {
        LOG.error("{} (nodeID={}) has unprocessed {} reference to nodeId {}",
            n.getProperty(NeoProperties.SCIENTIFIC_NAME, "no name"), n.getId(), r, u.getHigherRankKey(r));
        throw e;
      }
    }
    u.setDatasetKey(datasetKey);
    // update usage status and ppkey for primary pro parte usages
    if (isProParteNode(n)) {
      u.setTaxonomicStatus(TaxonomicStatus.PROPARTE_SYNONYM);
      u.setProParteKey(SELF_ID);
    }
    return u;
  }

  @Override
  public boolean isInsert(NameUsage u) {
    if (datasetKey.equals(Constants.NUB_DATASET_KEY)) {
      // for nub builds we generate the usageKey in code already, both for inserts and updates, so check the key range
      return u.getKey() == null || u.getKey() > maxExistingNubKey;
    } else {
      return false;
    }
  }

  @Override
  public UsageExtensions readExtensions(long id) {
    return dao.readExtensions(id);
  }

  @Override
  public NameUsageMetrics readMetrics(long id) {
    UsageFacts facts = dao.readFacts(id);
    if (facts != null) {
      return facts.metrics;
    }
    return new NameUsageMetrics();
  }

  @Override
  public VerbatimNameUsage readVerbatim(long id) {
    return dao.readVerbatim(id);
  }

  @Override
  public List<Integer> readParentKeys(long id) {
    try (Transaction tx = dao.beginTx()) {
      Node n = dao.getNeo().getNodeById(id);
      return buildClbParents(n);
    }
  }

  @Override
  public void reportUsageKey(long nodeId, int usageKey) {
    if (datasetKey.equals(Constants.NUB_DATASET_KEY)) {
      Preconditions.checkArgument(usageKey < Constants.NUB_MAXIMUM_KEY,
          "New usage key %s for node %s is larger than allowed maximum %s", usageKey, nodeId, Constants.NUB_MAXIMUM_KEY);
    }
    // keep map of node ids to clb usage keys
    clbKeys.put((int) nodeId, usageKey);
    // keep a reference to the first synced record so we can read its modified timestamp from the db later.
    // this does not have to be exact so we need not worry much about concurrent access
    if (firstUsageKey < 0) {
      firstUsageKey = usageKey;
      LOG.info("First synced usage key is {}", firstUsageKey);
    }
  }

  @Override
  public void reportNewFuture(Future<List<Integer>> future) {
    otherFutures.add(future);
  }

  public int getSyncCounter() {
    return syncCounterMain + syncCounterBatches + syncCounterProParte;
  }

  public int getDelCounter() {
    return delCounter;
  }
}