package org.gbif.checklistbank.cli.normalizer;

import org.gbif.api.model.checklistbank.NameUsage;
import org.gbif.api.model.checklistbank.ParsedName;
import org.gbif.api.model.checklistbank.VerbatimNameUsage;
import org.gbif.api.model.common.LinneanClassification;
import org.gbif.api.service.checklistbank.NameParser;
import org.gbif.api.util.ClassificationUtils;
import org.gbif.api.vocabulary.NameUsageIssue;
import org.gbif.api.vocabulary.Origin;
import org.gbif.api.vocabulary.Rank;
import org.gbif.api.vocabulary.TaxonomicStatus;
import org.gbif.checklistbank.cli.common.Metrics;
import org.gbif.checklistbank.cli.model.NameUsageNode;
import org.gbif.checklistbank.cli.model.RankedName;
import org.gbif.checklistbank.neo.ImportDb;
import org.gbif.checklistbank.neo.Labels;
import org.gbif.checklistbank.neo.NeoInserter;
import org.gbif.checklistbank.neo.NeoProperties;
import org.gbif.checklistbank.neo.NotUniqueException;
import org.gbif.checklistbank.neo.NotUniqueRuntimeException;
import org.gbif.checklistbank.neo.RelType;
import org.gbif.checklistbank.neo.UsageDao;
import org.gbif.checklistbank.neo.traverse.NubMatchHandler;
import org.gbif.checklistbank.neo.traverse.Traversals;
import org.gbif.checklistbank.neo.traverse.TreeWalker;
import org.gbif.checklistbank.neo.traverse.UsageMetricsHandler;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.nameparser.GBIFNameParser;
import org.gbif.nub.lookup.straight.IdLookup;
import org.gbif.nub.lookup.straight.IdLookupPassThru;

import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import javax.annotation.Nullable;

import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.lang3.ObjectUtils;
import org.neo4j.graphdb.*;
import org.neo4j.helpers.collection.Iterators;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reads a dwc archive with good, stable ids and produces a neo4j graph from it.
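 * <p>
 * Typical usage, sketched here with hypothetical configuration and dataset key values; only
 * the {@code create}, {@code run} and {@code getStats} calls are taken from this class:
 * <pre>{@code
 * NormalizerConfiguration cfg = new NormalizerConfiguration(); // assumes cfg.neo etc are set up elsewhere
 * UUID datasetKey = UUID.randomUUID();                         // hypothetical dataset key
 * Normalizer normalizer = Normalizer.create(cfg, datasetKey);  // uses a pass-thru nub matcher
 * normalizer.run();                                            // normalizes the archive and closes the dao
 * NormalizerStats stats = normalizer.getStats();
 * }</pre>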
 */
public class Normalizer extends ImportDb implements Runnable {

  private static final Logger LOG = LoggerFactory.getLogger(Normalizer.class);
  private static final List<Splitter> COMMON_SPLITTER = Lists.newArrayList();
  private static final Set<Rank> UNKNOWN_RANKS = ImmutableSet.of(Rank.UNRANKED, Rank.OTHER);
  private static final List<Rank> DWC_RANKS_REVERSE = ImmutableList.copyOf(Lists.reverse(Rank.DWC_RANKS));
  private static final NameParser PARSER = new GBIFNameParser();

  static {
    for (char del : "[|;, ]".toCharArray()) {
      COMMON_SPLITTER.add(Splitter.on(del).trimResults().omitEmptyStrings());
    }
  }
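  // Note: the delimiter string above is iterated character by character, so '[' and ']'
  // themselves act as delimiters in addition to '|', ';', ',' and the space character.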
  private final IdLookup lookup;
  private final Map<String, UUID> constituents;
  private final File dwca;
  private final Meter relationMeter;
  private final Meter denormedMeter;
  private final Meter metricsMeter;
  private final int batchSize;
  private InsertMetadata meta;
  private List<String> cycles = Lists.newArrayList();
  private UsageMetricsHandler metricsHandler;
  private NubMatchHandler matchHandler;

  private Normalizer(UUID datasetKey, UsageDao dao, File dwca, int batchSize, MetricRegistry registry,
                     Map<String, UUID> constituents, IdLookup lookup) {
    super(datasetKey, dao);
    this.constituents = constituents;
    this.relationMeter = registry.meter(Metrics.RELATION_METER);
    this.metricsMeter = registry.meter(Metrics.METRICS_METER);
    this.denormedMeter = registry.meter(Metrics.DENORMED_METER);
    this.dwca = dwca;
    this.lookup = lookup;
    this.batchSize = batchSize;
  }

  /**
   * Creates a dataset specific normalizer using the configuration {@link NormalizerConfiguration#archiveDir(UUID)}
   * to load the archive.
   */
  public static Normalizer create(NormalizerConfiguration cfg, UUID datasetKey, MetricRegistry registry,
                                  Map<String, UUID> constituents, IdLookup lookup) {
    return new Normalizer(datasetKey,
        UsageDao.persistentDao(cfg.neo, datasetKey, false, registry, true),
        cfg.archiveDir(datasetKey),
        cfg.neo.batchSize,
        registry,
        constituents,
        lookup);
  }

  public static Normalizer create(UUID datasetKey, UsageDao dao, File dwca, IdLookup lookup, int batchSize) {
    return new Normalizer(datasetKey, dao, dwca, batchSize, new MetricRegistry(), Maps.newHashMap(), lookup);
  }

  /**
   * Creates a dataset specific normalizer with an internal metrics registry and a pass-thru nub matcher.
   */
  public static Normalizer create(NormalizerConfiguration cfg, UUID datasetKey) {
    MetricRegistry registry = new MetricRegistry();
    return Normalizer.create(cfg, registry, datasetKey);
  }

  /**
   * Creates a dataset specific normalizer with a pass-thru nub matcher.
   */
  public static Normalizer create(NormalizerConfiguration cfg, MetricRegistry registry, UUID datasetKey) {
    return Normalizer.create(cfg, datasetKey, registry, Maps.<String, UUID>newHashMap(), new IdLookupPassThru());
  }

  /**
   * Simple wrapper class that lazily loads a name usage from the dao if needed.
   * Used for logging in some cases only.
   */
  private class LazyUsage {
    private final Node n;
    private NameUsage u;

    private LazyUsage(Node n) {
      this.n = n;
    }

    public NameUsage getUsage() {
      if (u == null) {
        u = dao.readUsage(n, false);
      }
      return u;
    }

    public String scientificName() {
      return getUsage().getScientificName();
    }
  }

  /**
   * Run the normalizer and close the dao.
   *
   * @throws NormalizationFailedException
   */
  public void run() throws NormalizationFailedException {
    // default behavior
    run(true);
  }

  /**
   * Run the normalizer.
   *
   * @param closeDao should the dao be closed after running or on exception?
   *
   * @throws NormalizationFailedException
   */
  public void run(boolean closeDao) throws NormalizationFailedException {
    LOG.info("Start normalization of checklist {}", datasetKey);
    try {
      // batch import uses its own batch db
      batchInsertData();
      // insert neo db relations, create implicit nodes if needed and parse names
      normalize();
      // parse all names
      parseNames();
      // match to nub and build metrics
      buildMetricsAndMatchBackbone();
      LOG.info("Normalization succeeded");
    } finally {
      if (closeDao) {
        dao.close();
        LOG.info("Normalizer database shut down");
      }
    }
  }

  public NormalizerStats getStats() {
    return metricsHandler.getStats(meta == null ? 0 : meta.getIgnored(), cycles);
  }

  private void batchInsertData() throws NormalizationFailedException {
    try {
      NeoInserter inserter = dao.createBatchInserter(batchSize);
      try {
        meta = inserter.insert(dwca, constituents);
      } finally {
        // closing the batch inserter opens the neo db again for regular access via the DAO
        inserter.close();
      }
    } catch (NotUniqueRuntimeException e) {
      throw new NormalizationFailedException(e.getProperty() + " values not unique: " + e.getKey(), e);
    } catch (IOException e) {
      throw new NormalizationFailedException("IO error: " + e.getMessage(), e);
    }
  }

  /**
   * Applies the classification given as denormalized higher taxa terms
   * after the parent / accepted relations have been applied.
   * It also removes the ROOT label if new parents are assigned.
   * We need to be careful as the classification coming in first via the parentNameUsage(ID) terms
   * is variable and does not always include a rank.
   */
  private void applyDenormedClassification() {
    LOG.info("Start processing higher denormalized classification ...");
    if (!meta.isDenormedClassificationMapped()) {
      LOG.info("No higher classification mapped");
      return;
    }

    int counter = 0;
    Transaction tx = dao.getNeo().beginTx();
    try {
      for (Node n : dao.getNeo().getAllNodes()) {
        // commit and renew the transaction every batchSize * 10 nodes
        if (counter % (batchSize * 10) == 0) {
          tx.success();
          tx.close();
          LOG.info("Higher classifications processed for {} taxa", counter);
          tx = dao.getNeo().beginTx();
        }
        applyClassification(n);
        counter++;
        denormedMeter.mark();
      }
    } finally {
      tx.success();
      tx.close();
    }
    LOG.info("Classification processing completed, {} nodes processed", counter);
  }
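  /*
   * Illustrative example for the classification application below, using hypothetical values:
   * a usage that only carries the denormalized terms kingdom=Plantae and family=Asteraceae
   * ends up in the parent chain
   *   Plantae -PARENT_OF-> Asteraceae -PARENT_OF-> usage
   * where Plantae and Asteraceae are aligned to existing nodes with the same canonical name
   * and rank, or created as new nodes with Origin.DENORMED_CLASSIFICATION if none exist yet.
   */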
  private void applyClassification(Node n) {
    RankedName highest;
    if (meta.isParentNameMapped()) {
      // verify that a classification we might already have ends with a known rank
      highest = getDirectParent(n);
      if (highest.node != n && (highest.rank == null || UNKNOWN_RANKS.contains(highest.rank))) {
        LOG.debug("Node {} already has a classification which ends in an uncomparable rank.", n.getId());
        addIssueRemark(n, null, NameUsageIssue.CLASSIFICATION_NOT_APPLIED);
        return;
      }
    } else {
      // use this node
      highest = dao.readRankedName(n);
    }

    // shortcut: exit if highest is already a kingdom, the denormed classification cannot add anything to it!
    if (highest.rank != null && highest.rank == Rank.KINGDOM) {
      return;
    }

    LinneanClassification lc = dao.readUsage(n, false);
    applyClassification(highest, lc);
  }

  @VisibleForTesting
  protected static Rank getLowestExistingRank(LinneanClassification lc) {
    for (Rank r : DWC_RANKS_REVERSE) {
      if (lc.getHigherRank(r) != null) {
        return r;
      }
    }
    return null;
  }

  private void removeGenusAndBelow(LinneanClassification lc) {
    ClassificationUtils.setHigherRank(lc, Rank.GENUS, null);
    ClassificationUtils.setHigherRank(lc, Rank.SUBGENUS, null);
    ClassificationUtils.setHigherRank(lc, Rank.SPECIES, null);
  }

  private void applyClassification(RankedName taxon, LinneanClassification lc) {
    Node parent = null;
    Rank parentRank = null;
    // exclude the lowest rank from the classification to be applied if this taxon is rankless and has the same name
    if (taxon.rank == null || taxon.rank.isUncomparable()) {
      Rank lowest = getLowestExistingRank(lc);
      if (lowest != null && lc.getHigherRank(lowest).equalsIgnoreCase(taxon.name)) {
        ClassificationUtils.setHigherRank(lc, lowest, null);
      }
    }
    // ignore the same rank from the classification if accepted
    if (!taxon.node.hasLabel(Labels.SYNONYM) && taxon.rank != null) {
      ClassificationUtils.setHigherRank(lc, taxon.rank, null);
    }
    // ignore genus and below for synonyms
    // http://dev.gbif.org/issues/browse/POR-2992
    if (taxon.node.hasLabel(Labels.SYNONYM)) {
      removeGenusAndBelow(lc);
    }

    // from kingdom to genus
    for (Rank hr : Rank.DWC_RANKS) {
      if ((taxon.rank == null || !taxon.rank.higherThan(hr)) && lc.getHigherRank(hr) != null) {
        // test for an existing usage with that name & rank
        boolean found = false;
        for (Node n : nodesByCanonicalAndRank(lc.getHigherRank(hr), hr)) {
          if (parent == null) {
            // make sure the node does not have a higher Linnean rank parent either
            Node p = getLinneanRankParent(n);
            if (p == null) {
              // aligns!
              parent = n;
              parentRank = hr;
              found = true;
              break;
            }
          } else {
            // verify that the parents for the next higher rank are the same
            try {
              Node p = n.getSingleRelationship(RelType.PARENT_OF, Direction.INCOMING).getStartNode();
              Node p2 = Traversals.findParentWithRank(n, parentRank);
              if (p.equals(parent) || (p2 != null && p2.equals(parent))) {
                parent = n;
                parentRank = hr;
                found = true;
                break;
              }
            } catch (Exception e) {
              // multiple or missing parent relations, this node does not align
            }
          }
        }
        if (!found) {
          // create a new higher taxon if none was found
          Node lowerParent = create(Origin.DENORMED_CLASSIFICATION, lc.getHigherRank(hr), hr, TaxonomicStatus.ACCEPTED,
              parent == null).node;
          // insert the parent relationship
          assignParent(parent, lowerParent);
          parent = lowerParent;
          parentRank = hr;
        }
      }
    }
    // finally apply the parent to the initial node
    assignParent(parent, taxon.node);
  }
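  /*
   * Note on synonyms, illustrated with hypothetical values: applyClassification() above drops
   * genus, subgenus and species from the denormed classification of synonyms (see POR-2992),
   * so a synonym with family=Felidae and genus=Felis only gets the Felidae parent applied.
   */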
  private void assignParent(Node parent, Node child) {
    if (parent != null) {
      parent.createRelationshipTo(child, RelType.PARENT_OF);
      child.removeLabel(Labels.ROOT);
    }
  }
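  /*
   * Illustrative synonym cycle handled by cleanupRelations() below, using hypothetical names:
   * given A -SYNONYM_OF-> B -SYNONYM_OF-> A, one SYNONYM_OF relation is deleted, its start
   * node is flagged with the CHAINED_SYNOYM and PARENT_CYCLE issues and relinked to a newly
   * created doubtful placeholder taxon instead.
   */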
  /**
   * Sanitizes relations and does the following cleanup:
   * <ul>
   * <li>Relink synonyms of synonyms to make sure synonyms always point to a directly accepted taxon.</li>
   * <li>(Re)move parent relationships of synonyms.</li>
   * <li>Break eternal classification loops at the lowest rank.</li>
   * </ul>
   */
  private void cleanupRelations() {
    LOG.info("Cleanup relations ...");
    // cut synonym cycles
    int counter = 0;
    while (true) {
      try (Transaction tx = dao.getNeo().beginTx()) {
        Result result = dao.getNeo().execute("MATCH (s:TAXON)-[sr:SYNONYM_OF]->(x)-[:SYNONYM_OF*]->(s) RETURN sr LIMIT 1");
        if (result.hasNext()) {
          Relationship sr = (Relationship) result.next().get("sr");
          Node syn = sr.getStartNode();

          NameUsage su = dao.readUsage(syn, false);
          su.addIssue(NameUsageIssue.CHAINED_SYNOYM);
          su.addIssue(NameUsageIssue.PARENT_CYCLE);
          dao.store(syn.getId(), su, false);

          String taxonID = (String) syn.getProperty(NeoProperties.TAXON_ID, null);
          cycles.add(taxonID);

          NameUsageNode acc = create(Origin.MISSING_ACCEPTED, NormalizerConstants.PLACEHOLDER_NAME, null,
              TaxonomicStatus.DOUBTFUL, true, null, "Synonym cycle cut for taxonID " + taxonID);
          createSynonymRel(syn, acc.node);
          sr.delete();
          tx.success();

          if (counter++ % 100 == 0) {
            LOG.debug("Synonym cycles cut so far: {}", counter);
          }
        } else {
          break;
        }
      }
    }

    // relink synonym chains to the single accepted taxon at their end
    int chainedSynonyms = 0;
    while (true) {
      try (Transaction tx = dao.getNeo().beginTx()) {
        Result result = dao.getNeo().execute("MATCH (s:TAXON)-[sr:SYNONYM_OF*]->(x)-[:SYNONYM_OF]->(t:TAXON) "
            + "WHERE NOT (t)-[:SYNONYM_OF]->() "
            + "RETURN sr, t LIMIT 1");
        if (result.hasNext()) {
          Map<String, Object> row = result.next();
          Node acc = (Node) row.get("t");
          for (Relationship sr : (Collection<Relationship>) row.get("sr")) {
            Node syn = sr.getStartNode();
            addIssueRemark(syn, null, NameUsageIssue.CHAINED_SYNOYM);
            createSynonymRel(syn, acc);
            sr.delete();
            chainedSynonyms++;
          }
          tx.success();

          if (counter++ % 100 == 0) {
            LOG.debug("Chained synonyms relinked so far: {}", chainedSynonyms);
          }
        } else {
          break;
        }
      }
    }

    // removes parent relations for synonyms
    // if synonyms are parents of other taxa, relink the relationship to the accepted taxon
    // the presence of both confuses subsequent imports, see http://dev.gbif.org/issues/browse/POR-2755
    int parentOfRelDeleted = 0;
    int parentOfRelRelinked = 0;
    int childOfRelDeleted = 0;
    int childOfRelRelinkedToAccepted = 0;
    try (Transaction tx = dao.getNeo().beginTx()) {
      for (Node syn : Iterators.loop(dao.allSynonyms())) {
        Node accepted = syn.getSingleRelationship(RelType.SYNONYM_OF, Direction.OUTGOING).getEndNode();
        LazyUsage synU = new LazyUsage(syn);
        LazyUsage accU = new LazyUsage(accepted);

        // if the synonym is a parent of another child taxon - relink accepted as parent of child
        for (Relationship rel : syn.getRelationships(RelType.PARENT_OF, Direction.OUTGOING)) {
          Node child = rel.getOtherNode(syn);
          if (child.equals(accepted)) {
            // accepted is also the parent. Delete the parent rel in this case
            rel.delete();
            parentOfRelDeleted++;
          } else {
            rel.delete();
            accepted.createRelationshipTo(child, RelType.PARENT_OF);
            parentOfRelRelinked++;
            addIssueRemark(child, "Parent relation taken from synonym " + synU.scientificName());
          }
        }
        // remove parent rels for synonyms
        for (Relationship rel : syn.getRelationships(RelType.PARENT_OF, Direction.INCOMING)) {
          // before we delete the relation make sure the accepted does have a parent rel or is ROOT
          if (accepted.hasRelationship(RelType.PARENT_OF, Direction.INCOMING)) {
            // delete
            childOfRelDeleted++;
            rel.delete();
          } else {
            Node parent = rel.getOtherNode(syn);
            // relink if the parent is not the accepted taxon and the parent rank is higher than the accepted one or null
            if (!parent.equals(accepted)) {
              NameUsage parentU = dao.readUsage(parent, false);
              if (parentU.getRank() == null || (accU.getUsage().getRank() != null
                  && parentU.getRank().higherThan(accU.getUsage().getRank()))) {
                LOG.debug("Relink parent rel of synonym {}", synU.scientificName());
                childOfRelRelinkedToAccepted++;
                parent.createRelationshipTo(accepted, RelType.PARENT_OF);
                addIssueRemark(accepted, "Parent relation taken from synonym " + synU.scientificName());
              }
            }
            rel.delete();
          }
        }
      }
      tx.success();
    }
    LOG.info("Relations cleaned up, {} synonym cycles detected, {} chained synonyms relinked",
        cycles.size(), chainedSynonyms);
    LOG.info("Synonym relations cleaned up. "
        + "{} childOf relations deleted, {} childOf rels relinked to accepted, "
        + "{} parentOf relations deleted, {} parentOf rels moved from synonym to accepted",
        childOfRelDeleted, childOfRelRelinkedToAccepted, parentOfRelDeleted, parentOfRelRelinked);
  }

  /**
   * Reads a name usage from the kvp store, adds issues and/or remarks and persists it again.
   * Only use this method if you just have a node and no usage instance at hand yet.
   */
  private NameUsageNode addIssueRemark(Node n, @Nullable String remark, NameUsageIssue... issues) {
    NameUsageNode nn = new NameUsageNode(n, dao.readUsage(n, false), true);
    nn.addIssue(issues);
    if (remark != null) {
      nn.addRemark(remark);
    }
    dao.store(nn, false);
    return nn;
  }

  /**
   * Creates a synonym relationship between the given synonym and the accepted node, updating labels accordingly
   * and also moving potentially existing parent_of relations.
   */
  private void createSynonymRel(Node synonym, Node accepted) {
    synonym.createRelationshipTo(accepted, RelType.SYNONYM_OF);
    if (synonym.hasRelationship(RelType.PARENT_OF)) {
      try {
        Relationship rel = synonym.getSingleRelationship(RelType.PARENT_OF, Direction.INCOMING);
        if (rel != null) {
          // check if the accepted node already has a parent relation
          if (!accepted.hasRelationship(RelType.PARENT_OF, Direction.INCOMING)) {
            rel.getStartNode().createRelationshipTo(accepted, RelType.PARENT_OF);
            accepted.removeLabel(Labels.ROOT);
          }
        }
      } catch (RuntimeException e) {
        // more than one parent relationship exists, should never be the case, sth wrong!
        LOG.warn("Synonym {} has multiple parent relationships!", synonym.getId());
        // deleting them all is disabled for now:
        //for (Relationship r : synonym.getRelationships(RelType.PARENT_OF)) {
        //  r.delete();
        //}
      }
    }
  }
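  /*
   * Illustrative chained synonym relink performed in cleanupRelations() above, using
   * hypothetical names: with A -SYNONYM_OF-> B -SYNONYM_OF-> C and C accepted, A is relinked
   * to point directly at C, its relation to B is deleted and A is flagged with CHAINED_SYNOYM.
   */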
  /**
   * Matches every node to the backbone and calculates a usage metric.
   * This is done jointly as both need the full Linnean classification for every node.
   */
  private void buildMetricsAndMatchBackbone() throws NormalizationFailedException {
    checkInterrupted();
    LOG.info("Walk all accepted taxa, build metrics and match to the GBIF backbone");
    metricsHandler = new UsageMetricsHandler(dao);
    matchHandler = new NubMatchHandler(lookup, dao);
    final long before = metricsMeter.getCount();
    TreeWalker.walkAcceptedTree(dao.getNeo(), null, null, metricsMeter, metricsHandler, matchHandler);
    final long after = metricsMeter.getCount();
    LOG.info("Walked all {} accepted taxa and built metrics", after - before);
  }

  /**
   * @return the split values if two or more can be found, otherwise the original value alone, unless it is an empty string
   */
  @VisibleForTesting
  protected static List<String> splitByCommonDelimiters(String val) {
    if (Strings.isNullOrEmpty(val)) {
      return Lists.newArrayList();
    }
    for (Splitter splitter : COMMON_SPLITTER) {
      List<String> vals = splitter.splitToList(val);
      if (vals.size() > 1) {
        return vals;
      }
    }
    return Lists.newArrayList(val);
  }
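  // Illustrative inputs for splitByCommonDelimiters(), all hypothetical: "123|456" yields
  // ["123", "456"], "123" alone yields ["123"], and null or "" yield an empty list.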
  /**
   * Checks if this usage is a pro parte synonym, i.e. whether its verbatim acceptedNameUsageID
   * value refers to multiple accepted taxa.
   *
   * @return the list of accepted taxonIDs the usage refers to
   */
  private List<String> parseAcceptedIDs(NameUsageNode nn, @Nullable VerbatimNameUsage v) {
    List<String> acceptedIds = Lists.newArrayList();
    final String unsplitIds = v.getCoreField(DwcTerm.acceptedNameUsageID);
    if (unsplitIds != null) {
      if (unsplitIds.equals(nn.usage.getTaxonID())) {
        acceptedIds.add(unsplitIds);
      } else {
        if (meta.getMultiValueDelimiters().containsKey(DwcTerm.acceptedNameUsageID)) {
          acceptedIds = meta.getMultiValueDelimiters().get(DwcTerm.acceptedNameUsageID).splitToList(unsplitIds);
        } else {
          // lookup by taxon id to see if this is an existing identifier or if we should try to split it
          Node a = nodeByTaxonId(unsplitIds);
          if (a != null) {
            acceptedIds.add(unsplitIds);
          } else {
            acceptedIds = splitByCommonDelimiters(unsplitIds);
          }
        }
      }
    }
    return acceptedIds;
  }
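  /*
   * Illustrative pro parte case with hypothetical ids: acceptedNameUsageID="12;13;14" is split
   * into three ids when no single usage with taxonID "12;13;14" exists. In setupAcceptedRel()
   * the first id becomes the regular SYNONYM_OF target and the remaining ids get
   * PROPARTE_SYNONYM_OF relations.
   */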
  private Transaction renewTx(Transaction tx) {
    tx.success();
    tx.close();
    return dao.getNeo().beginTx();
  }

  /**
   * Creates implicit nodes and sets up relations between taxa.
   */
  private void normalize() throws NormalizationFailedException {
    LOG.info("Start processing explicit relations ...");
    int counter = 0;

    Transaction tx = dao.getNeo().beginTx();
    try {
      // This iterates over ALL NODES, even the ones created within this loop which trigger a transaction commit!
      // Iteration is by node id, starting from node id 1 up to the highest.
      // If nodes are created within this loop they receive the highest node id and thus are added to the end of this loop.
      for (Node n : dao.getNeo().getAllNodes()) {
        setupRelation(n);
        // inc counters & commit batch
        counter++;
        relationMeter.mark();
        tx = renewTx(tx);
        if (counter % 10000 == 0) {
          LOG.debug("Processed relations for {} nodes", counter);
          // interrupted? then let's get out of here
          checkInterrupted();
        }
      }
    } finally {
      tx.success();
      tx.close();
    }

    // now process the denormalized classifications
    applyDenormedClassification();

    // finally resolve cycles and other bad relations
    cleanupRelations();

    LOG.info("Relation setup completed, {} nodes processed. Setup rate: {}", counter, relationMeter.getMeanRate());
  }

  /**
   * Parses all names and stores them in the DAO.
   * The NubMatchHandler relies on this!
   *
   * @throws NormalizationFailedException
   */
  private void parseNames() throws NormalizationFailedException {
    LOG.info("Start parsing all names ...");
    try (Transaction tx = dao.beginTx()) {
      int counter = 0;
      for (Node n : dao.allNodes()) {
        // parse the name
        ParsedName pn = PARSER.parseQuietly(NeoProperties.getScientificName(n), NeoProperties.getRank(n, null));
        dao.store(n.getId(), pn);
        counter++;
        if (counter % 10000 == 0) {
          LOG.debug("Parsed names for {} nodes", counter);
          // interrupted? then let's get out of here
          checkInterrupted();
        }
      }
      tx.success();
    }
  }

  private NameUsage setupRelation(Node n) {
    final NameUsageNode nn = new NameUsageNode(n, dao.readUsage(n, false), true);
    final VerbatimNameUsage v = dao.readVerbatim(n.getId());
    setupAcceptedRel(nn, v);
    setupParentRel(nn, v);
    setupBasionymRel(nn, v);
    dao.store(nn, false);
    return nn.usage;
  }

  /**
   * Creates the synonym_of relationship based on the verbatim dwc:acceptedNameUsageID and dwc:acceptedNameUsage term values.
   * Assumes pro parte synonyms have been dealt with before and the remaining accepted identifier refers to a single taxon only.
   *
   * @param nn the usage to process
   */
  private void setupAcceptedRel(NameUsageNode nn, @Nullable VerbatimNameUsage v) {
    Node accepted = null;
    if (v != null && meta.isAcceptedNameMapped()) {
      List<String> acceptedIds = parseAcceptedIDs(nn, v);
      if (!acceptedIds.isEmpty()) {
        String id = acceptedIds.get(0);
        // make sure this is not an accepted taxon pointing to itself
        if (!id.equals(nn.usage.getTaxonID())) {
          accepted = nodeByTaxonId(id);
          if (accepted == null) {
            nn.addIssue(NameUsageIssue.ACCEPTED_NAME_USAGE_ID_INVALID);
            LOG.debug("acceptedNameUsageID {} not existing", id);
            // is the accepted name also mapped?
            String name = ObjectUtils.firstNonNull(v.getCoreField(DwcTerm.acceptedNameUsage), NormalizerConstants.PLACEHOLDER_NAME);
            accepted = createTaxonWithClassification(Origin.MISSING_ACCEPTED, name, nn.usage.getRank(),
                TaxonomicStatus.DOUBTFUL, nn, id,
                "Placeholder for the missing accepted taxonID for synonym " + nn.usage.getScientificName(), v);
          }
          // create pro parte relations if needed
          Iterator<String> additionalIds = acceptedIds.listIterator(1);
          while (additionalIds.hasNext()) {
            final String id2 = additionalIds.next();
            Node accepted2 = nodeByTaxonId(id2);
            if (accepted2 == null) {
              nn.addIssue(NameUsageIssue.ACCEPTED_NAME_USAGE_ID_INVALID);
              LOG.debug("acceptedNameUsageID {} not existing", id2);
            } else {
              nn.node.createRelationshipTo(accepted2, RelType.PROPARTE_SYNONYM_OF);
            }
          }
        }
      } else {
        final String name = v.getCoreField(DwcTerm.acceptedNameUsage);
        if (name != null && !name.equals(nn.usage.getScientificName())) {
          try {
            accepted = nodeBySciname(name);
            if (accepted == null && !name.equals(nn.usage.getCanonicalName())) {
              accepted = nodeByCanonical(name);
              if (accepted == null) {
                LOG.debug("acceptedNameUsage {} not existing, materialize it", name);
                accepted = createTaxonWithClassification(Origin.VERBATIM_ACCEPTED, name, null, TaxonomicStatus.DOUBTFUL, nn, null, null, v);
              }
            }
          } catch (NotUniqueException e) {
            nn.addIssue(NameUsageIssue.ACCEPTED_NAME_NOT_UNIQUE);
            LOG.warn("acceptedNameUsage {} not unique, duplicate accepted name for synonym {} and taxonID {}",
                name, nn.usage.getScientificName(), nn.usage.getTaxonID());
            accepted = createTaxonWithClassification(Origin.VERBATIM_ACCEPTED, name, null, TaxonomicStatus.DOUBTFUL, nn, null, null, v);
          }
        }
      }
    }

    // if the status is synonym but we have no idea of the accepted taxon, insert an incertae sedis record of the same rank
    if (nn.usage.isSynonym() && accepted == null) {
      nn.addIssue(NameUsageIssue.ACCEPTED_NAME_MISSING);
      accepted = createTaxonWithClassification(Origin.MISSING_ACCEPTED, NormalizerConstants.PLACEHOLDER_NAME,
          nn.usage.getRank(), TaxonomicStatus.DOUBTFUL, nn, null,
          "Placeholder for the missing accepted taxon for synonym " + nn.usage.getScientificName(), v);
    }

    if (accepted != null && !accepted.equals(nn.node)) {
      // make sure the taxonomic status reflects the synonym relation
      if (!nn.usage.isSynonym()) {
        nn.usage.setTaxonomicStatus(TaxonomicStatus.SYNONYM);
      }
      nn.node.createRelationshipTo(accepted, RelType.SYNONYM_OF);
      nn.node.addLabel(Labels.SYNONYM);
    }
  }

  /**
   * Sets up the parent relations using the parentNameUsage(ID) term values.
   * The denormed, flat classification is used in a later step.
   */
  private void setupParentRel(NameUsageNode nn, @Nullable VerbatimNameUsage v) {
    Node parent = null;
    if (v != null) {
      final String id = v.getCoreField(DwcTerm.parentNameUsageID);
      if (id != null) {
        if (nn.usage.getTaxonID() == null || !id.equals(nn.usage.getTaxonID())) {
          parent = nodeByTaxonId(id);
          if (parent == null) {
            nn.addIssue(NameUsageIssue.PARENT_NAME_USAGE_ID_INVALID);
            LOG.debug("parentNameUsageID {} not existing", id);
          }
        }
      } else {
        final String name = v.getCoreField(DwcTerm.parentNameUsage);
        if (name != null && !name.equals(nn.usage.getScientificName())) {
          try {
            parent = nodeBySciname(name);
            if (parent == null && !name.equals(nn.usage.getCanonicalName())) {
              parent = nodeByCanonical(name);
            }
            if (parent == null) {
              LOG.debug("parentNameUsage {} not existing, materialize it", name);
              parent = create(Origin.VERBATIM_PARENT, name, null, TaxonomicStatus.DOUBTFUL, true).node;
            }
          } catch (NotUniqueException e) {
            nn.addIssue(NameUsageIssue.PARENT_NAME_NOT_UNIQUE);
            LOG.warn("parentNameUsage {} not unique, create new parent for name {} and taxonID {}",
                name, nn.usage.getScientificName(), nn.usage.getTaxonID());
            parent = create(Origin.VERBATIM_PARENT, name, null, TaxonomicStatus.DOUBTFUL, true).node;
          }
        }
      }
    }
    if (parent != null && !parent.equals(nn.node)) {
      parent.createRelationshipTo(nn.node, RelType.PARENT_OF);
    } else if (!nn.usage.isSynonym()) {
      nn.node.addLabel(Labels.ROOT);
    }
  }

  private void setupBasionymRel(NameUsageNode nn, @Nullable VerbatimNameUsage v) {
    if (meta.isOriginalNameMapped() && v != null) {
      Node basionym = null;
      final String id = v.getCoreField(DwcTerm.originalNameUsageID);
      if (id != null) {
        if (!id.equals(nn.usage.getTaxonID())) {
          basionym = nodeByTaxonId(id);
          if (basionym == null) {
            nn.addIssue(NameUsageIssue.ORIGINAL_NAME_USAGE_ID_INVALID);
            LOG.debug("originalNameUsageID {} not existing", id);
          }
        }
      } else {
        final String name = v.getCoreField(DwcTerm.originalNameUsage);
        if (name != null && !name.equals(nn.usage.getScientificName())) {
          try {
            basionym = nodeBySciname(name);
            if (basionym == null && !name.equals(nn.usage.getCanonicalName())) {
              basionym = nodeByCanonical(name);
            }
            if (basionym == null) {
              LOG.debug("originalNameUsage {} not existing, materialize it", name);
              basionym = create(Origin.VERBATIM_BASIONYM, name, null, TaxonomicStatus.DOUBTFUL, true).node;
            }
          } catch (NotUniqueException e) {
            nn.addIssue(NameUsageIssue.ORIGINAL_NAME_NOT_UNIQUE);
            LOG.warn("originalNameUsage {} not unique, ignore relationship for name {} and taxonID {}",
                name, nn.usage.getScientificName(), nn.usage.getTaxonID());
          }
        }
      }
      if (basionym != null && !basionym.equals(nn.node)) {
        basionym.createRelationshipTo(nn.node, RelType.BASIONYM_OF);
      }
    }
  }
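  /*
   * The three setup*Rel methods above share one materialization pattern: if a verbatim
   * acceptedNameUsage, parentNameUsage or originalNameUsage name cannot be resolved to an
   * existing node, it is created on the fly as a doubtful usage with the corresponding
   * VERBATIM_* origin, so the explicit relation still gets a concrete target node.
   */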
  /**
   * Creates a new taxon in neo and the name usage kvp store, using the source usage as a template for the classification properties.
   * Only copies the classification above genus and ignores genus and below!
   * A verbatim usage is created with just the parentNameUsage(ID) values so they can be resolved into proper neo relations later.
   *
   * @param taxonID the optional taxonID to apply to the new node
   */
  private Node createTaxonWithClassification(Origin origin, String sciname, Rank rank, TaxonomicStatus status,
                                             NameUsageNode source, @Nullable String taxonID, @Nullable String remarks,
                                             VerbatimNameUsage sourceVerbatim) {
    NameUsage u = new NameUsage();
    u.setScientificName(sciname);
    u.setCanonicalName(sciname);
    u.setRank(rank);
    u.setOrigin(origin);
    u.setTaxonomicStatus(status);
    u.setTaxonID(taxonID);
    u.setRemarks(remarks);
    // copy the verbatim classification from the source
    ClassificationUtils.copyLinneanClassification(source.usage, u);
    removeGenusAndBelow(u);
    Node n = create(u, false).node;
    // copy the parent props from the source
    VerbatimNameUsage v = new VerbatimNameUsage();
    v.setCoreField(DwcTerm.parentNameUsageID, sourceVerbatim.getCoreField(DwcTerm.parentNameUsageID));
    v.setCoreField(DwcTerm.parentNameUsage, sourceVerbatim.getCoreField(DwcTerm.parentNameUsage));
    dao.store(n.getId(), v);
    return n;
  }

  private void checkInterrupted() throws NormalizationFailedException {
    if (Thread.interrupted()) {
      LOG.warn("Normalizer interrupted, exit {} early with incomplete parsing", datasetKey);
      throw new NormalizationFailedException("Normalizer interrupted");
    }
  }
}