package org.gbif.checklistbank.nub; import org.apache.commons.lang3.StringUtils; import org.gbif.api.exception.UnparsableException; import org.gbif.api.model.Constants; import org.gbif.api.model.checklistbank.ParsedName; import org.gbif.api.service.checklistbank.NameParser; import org.gbif.api.vocabulary.Kingdom; import org.gbif.api.vocabulary.NameType; import org.gbif.api.vocabulary.NameUsageIssue; import org.gbif.api.vocabulary.Origin; import org.gbif.api.vocabulary.Rank; import org.gbif.api.vocabulary.TaxonomicStatus; import org.gbif.checklistbank.authorship.AuthorComparator; import org.gbif.checklistbank.authorship.BasionymGroup; import org.gbif.checklistbank.authorship.BasionymSorter; import org.gbif.checklistbank.cli.normalizer.NormalizerStats; import org.gbif.checklistbank.cli.nubbuild.NubConfiguration; import org.gbif.checklistbank.iterable.CloseableIterator; import org.gbif.checklistbank.model.Equality; import org.gbif.checklistbank.neo.Labels; import org.gbif.checklistbank.neo.NeoProperties; import org.gbif.checklistbank.neo.RelType; import org.gbif.checklistbank.neo.UsageDao; import org.gbif.checklistbank.neo.traverse.Traversals; import org.gbif.checklistbank.neo.traverse.TreeWalker; import org.gbif.checklistbank.neo.traverse.UsageMetricsHandler; import org.gbif.checklistbank.nub.model.NubUsage; import org.gbif.checklistbank.nub.model.NubUsageMatch; import org.gbif.checklistbank.nub.model.SrcUsage; import org.gbif.checklistbank.nub.source.ClbSource; import org.gbif.checklistbank.nub.source.ClbSourceList; import org.gbif.checklistbank.nub.source.NubSource; import org.gbif.checklistbank.nub.source.NubSourceList; import org.gbif.checklistbank.nub.validation.NubAssertions; import org.gbif.checklistbank.nub.validation.NubTreeValidation; import org.gbif.checklistbank.nub.validation.NubValidation; import org.gbif.checklistbank.utils.SciNameNormalizer; import org.gbif.nameparser.GBIFNameParser; import org.gbif.nub.lookup.straight.IdLookup; import 
org.gbif.nub.lookup.straight.IdLookupImpl; import org.gbif.utils.collection.MapUtils; import java.io.File; import java.io.IOException; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.google.common.base.Function; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.base.Stopwatch; import com.google.common.base.Throwables; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Ordering; import com.google.common.collect.Sets; import com.google.common.collect.UnmodifiableIterator; import it.unimi.dsi.fastutil.ints.Int2LongMap; import it.unimi.dsi.fastutil.ints.Int2LongOpenHashMap; import it.unimi.dsi.fastutil.longs.Long2IntMap; import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import it.unimi.dsi.fastutil.longs.LongSet; import it.unimi.dsi.fastutil.objects.Object2LongMap; import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; import org.neo4j.graphdb.Direction; import org.neo4j.graphdb.Node; import org.neo4j.graphdb.NotFoundException; import org.neo4j.graphdb.Relationship; import org.neo4j.graphdb.ResourceIterable; import org.neo4j.graphdb.ResourceIterator; import org.neo4j.graphdb.Transaction; import org.neo4j.graphdb.traversal.Evaluators; import org.neo4j.helpers.collection.Iterators; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class NubBuilder implements Runnable { private static final Logger LOG = LoggerFactory.getLogger(NubBuilder.class); private static final Joiner SEMICOLON_JOIN = Joiner.on("; ").skipNulls(); public static final Set<Rank> 
NUB_RANKS; static { List<Rank> ranks = Lists.newArrayList(Rank.LINNEAN_RANKS); ranks.add(Rank.SUBSPECIES); ranks.add(Rank.VARIETY); ranks.add(Rank.FORM); ranks.remove(Rank.KINGDOM); // we only use kingdoms from our enum NUB_RANKS = ImmutableSet.copyOf(ranks); } private final Set<NameType> ignoredNameTypes = Sets.newHashSet( NameType.CANDIDATUS, NameType.CULTIVAR, NameType.INFORMAL, NameType.NO_NAME, NameType.PLACEHOLDER, NameType.NO_NAME ); private final static ImmutableMap<TaxonomicStatus, Integer> STATUS_ORDER = ImmutableMap.of( TaxonomicStatus.HOMOTYPIC_SYNONYM, 1, TaxonomicStatus.HETEROTYPIC_SYNONYM, 2, TaxonomicStatus.SYNONYM, 3, TaxonomicStatus.ACCEPTED, 4, TaxonomicStatus.DOUBTFUL, 5 ); private static final Pattern EX_AUTHOR = Pattern.compile("^(.+) ex ", Pattern.CASE_INSENSITIVE); private final Set<Rank> allowedRanks = Sets.newHashSet(); private final NubDb db; private final boolean closeDao; private final NubSourceList sources; private final NameParser parser; private final NubConfiguration cfg; private NubSource currSrc; private ParentStack parents; private int sourceUsageCounter = 0; private final AuthorComparator authorComparator; private final IdGenerator idGen; private final Int2LongMap src2NubKey = new Int2LongOpenHashMap(); private final Long2IntMap basionymRels = new Long2IntOpenHashMap(); // node.id -> src.usageKey private final Map<UUID, Integer> priorities = Maps.newHashMap(); private Integer maxPriority = 0; private int datasetCounter = 1; private NubBuilder(UsageDao dao, NubSourceList sources, IdLookup idLookup, AuthorComparator authorComparator, int newIdStart, boolean closeDao, NubConfiguration cfg) { db = NubDb.create(dao, authorComparator); this.sources = sources; this.authorComparator = authorComparator; idGen = new IdGenerator(idLookup, newIdStart); this.closeDao = closeDao; this.cfg = cfg; this.parser = new GBIFNameParser(cfg.parserTimeout); } public static NubBuilder create(NubConfiguration cfg) { UsageDao dao = 
UsageDao.persistentDao(cfg.neo, Constants.NUB_DATASET_KEY, false, null, true); try { IdLookupImpl idLookup = IdLookupImpl.temp().load(cfg.clb, true); return new NubBuilder(dao, ClbSourceList.create(cfg), idLookup, idLookup.getAuthorComparator(), idLookup.getKeyMax() + 1, true, cfg); } catch (Exception e) { throw new IllegalStateException("Failed to load existing backbone ids", e); } } /** * @param dao the dao to persistent the nub. Will be left open after run() is called. */ public static NubBuilder create(UsageDao dao, NubSourceList sources, IdLookup idLookup, int newIdStart, int parserTimeout) { NubConfiguration cfg = new NubConfiguration(); cfg.groupBasionyms = true; cfg.validate = true; cfg.runAssertions = false; cfg.autoImport = false; cfg.neo.batchSize = 5000; cfg.parserTimeout = parserTimeout; return new NubBuilder(dao, sources, idLookup, idLookup.getAuthorComparator(), newIdStart, false, cfg); } /** * Builds a new neo4j based backbone with metrics and stable ids already mapped. * The DAO is kept open if you provided it explicitly, otherwise its being closed. 
   */
  @Override
  public void run() {
    try {
      // phase 1: seed the tree with the fixed kingdom enum as roots
      addKingdoms();
      parents = new ParentStack(db.kingdom(Kingdom.INCERTAE_SEDIS));
      // main work importing all source checklists
      addDatasets();
      // change current datasource to nub algorithm, avoiding using the last source for algorithmically generated usages
      currSrc = new ClbSource(null, Constants.NUB_DATASET_KEY, "Backbone algorithm");
      // detect and group basionyms
      groupByBasionym();
      // extract synonyms from ex authors
      synonymizeExAuthors();
      // flagging of suspicous usages
      flagParentMismatch();
      flagEmptyGenera();
      cleanImplicitTaxa();
      flagDuplicateAcceptedNames();
      flagSimilarNames();
      flagDoubtfulOriginalNames();
      // persist missing autonyms
      fixInfraspeciesHierarchy();
      manageAutonyms();
      // basic neo tree checks, fail fast
      if (cfg.validate) {
        validate(new NubTreeValidation(db));
      }
      // add extra data
      addPublicationDois();
      addExtensionData();
      // match to old nub and assign (stable) usage keys for postgres
      assignUsageKeys();
      // final validation with often reported issues
      if (cfg.runAssertions) {
        validate(new NubAssertions(db));
      }
      // convert usages for the importer and build metrics
      db.dao().convertNubUsages();
      builtUsageMetrics();
      LOG.info("New backbone built successfully!");
    } catch (AssertionError e) {
      // thrown by validate() when a validation fails
      LOG.error("Backbone invalid, build failed!", e);
      throw e;
    } catch (RuntimeException e) {
      LOG.error("Fatal error. Backbone build failed!", e);
      // dump a consistency report to help debugging before rethrowing
      db.dao().consistencyNubReport();
      throw e;
    } finally {
      sources.close();
      if (closeDao) {
        db.dao().close();
        LOG.info("Backbone dao closed orderly");
      } else {
        // dao was handed in by the caller who remains responsible for closing it
        LOG.warn("Backbone dao not closed!");
      }
    }
  }

  /**
   * Goes through all names with ex authors (99% botanical) and creates homotypic synonyms with the ex authorship.
   * Ex authors are publications which published a name earlier than the regular author, but which are illegitimate according to the code, for example a nomen nudum.
   *
   * See http://dev.gbif.org/issues/browse/POR-3147
   */
  private void synonymizeExAuthors() {
    // species and infraspecies are processed in separate transactions to keep them small
    LOG.info("Extract ex author species synonyms");
    try (Transaction tx = db.beginTx()) {
      synonymizeExAuthors(db.dao().allSpecies());
      tx.success();
    }
    LOG.info("Extract ex author infraspecies synonyms");
    try (Transaction tx = db.beginTx()) {
      synonymizeExAuthors(db.dao().allInfraSpecies());
      tx.success();
    }
  }

  /**
   * Scans the given usages for authorships of the form "A ex B" and adds a homotypic
   * synonym carrying only the ex author where no such synonym exists yet.
   */
  private void synonymizeExAuthors(ResourceIterator<Node> iter) {
    for (Node n : Iterators.loop(iter)) {
      NubUsage nub = read(n);
      if (!StringUtils.isBlank(nub.parsedName.getAuthorship())) {
        Matcher m = EX_AUTHOR.matcher(nub.parsedName.getAuthorship());
        if (m.find()) {
          try {
            // create synonym if not already existing
            SrcUsage syn = new SrcUsage();
            // NOTE(review): syn.parsedName aliases nub.parsedName, so the setters below
            // mutate the nub's in-memory parsed name too (authorship replaced, year nulled).
            // nub is not stored afterwards, so this looks unpersisted — but confirm a
            // defensive copy isn't needed before relying on nub.parsedName later.
            syn.parsedName = nub.parsedName;
            syn.parsedName.setAuthorship(m.group(1));
            syn.parsedName.setYear(null);
            syn.parsedName.setScientificName(syn.parsedName.canonicalNameComplete());
            syn.rank = nub.rank;
            syn.status = TaxonomicStatus.HOMOTYPIC_SYNONYM;
            // the parent nub does not matter as we always do a qualified author based matching
            NubUsageMatch match = db.findNubUsage(nub.datasetKey, syn, nub.kingdom, null);
            if (!match.isMatch() || !match.usage.parsedName.hasAuthorship()) {
              // create a new synonym
              NubUsage accepted = nub.status.isAccepted() ? nub : db.parent(nub);
              LOG.debug("Create ex author synonym {}", syn.parsedName.fullName());
              db.addUsage(accepted, syn, Origin.EX_AUTHOR_SYNONYM, currSrc.key);
            }
          } catch (IgnoreSourceUsageException e) {
            // swallow: a source usage that cannot be matched is simply skipped here
          }
        }
      }
    }
  }

  /**
   * If there are several accepted infraspecific ranks for a given species
   * this method makes sure the species subtree makes sense.
   *
   * The same epithet can be used again within a species, at whatever level, only if the names with the re-used epithet
   * are attached to the same type. Thus there can be a form called Poa secunda f. juncifolia as well as
   * the subspecies Poa secunda subsp. juncifolia if, and only if, the type specimen of Poa secunda f. juncifolia
   * is the same as the type specimen of Poa secunda subsp. juncifolia.
   * In other words, if there is a single type specimen whose classification is Poa secunda subsp. juncifolia f. juncifolia.
   */
  private void fixInfraspeciesHierarchy() {
    //TODO
  }

  /**
   * Writes a file based report about deleted, resurrected and newly added taxa.
   */
  public void report(File reportingDir) {
    try {
      idGen.writeReports(reportingDir);
    } catch (IOException e) {
      // reporting is best effort; never fail the build over it
      LOG.warn("Failed to write ID report", e);
    }
  }

  /**
   * Flags accepted names that share the same canonical name within a kingdom,
   * demoting the one from the less trusted source to doubtful (one transaction per kingdom).
   */
  private void flagDuplicateAcceptedNames() {
    for (Kingdom k : Kingdom.values()) {
      try (Transaction tx = db.beginTx()) {
        LOG.info("Start flagging doubtful duplicate names in {}", k);
        NubUsage ku = db.kingdom(k);
        markDuplicatesRedundant(Traversals.ACCEPTED_TREE.traverse(ku.node).nodes());
        tx.success();
      }
    }
  }

  /**
   * http://dev.gbif.org/issues/browse/POR-2815
   */
  private void flagDoubtfulOriginalNames() {
    LOG.info("Start flagging doubtful original names");
    try (Transaction tx = db.beginTx()) {
      for (Node gn : Iterators.loop(db.dao().allGenera())) {
        NubUsage genus = read(gn);
        Integer gYear = genus.parsedName.getYearInt();
        if (gYear != null) {
          // all accepted included taxa should have been described after the genus
          // flag the ones that have an earlier publication date!
          for (Node n : Traversals.SORTED_ACCEPTED_TREE.traverse(gn).nodes()) {
            NubUsage u = read(n);
            Integer year = u.parsedName.getYearInt();
            if (year != null && year < gYear) {
              u.issues.add(NameUsageIssue.PUBLISHED_BEFORE_GENUS);
              db.store(u);
            }
          }
        }
      }
      tx.success();
    }
  }

  /**
   * Goes through all accepted species and infraspecies and flags suspicious similar names.
   * Adds a NameUsageIssue.ORTHOGRAPHIC_VARIANT to all similar names.
   */
  private void flagSimilarNames() {
    // species and infraspecies handled in separate transactions to keep them small
    LOG.info("Start flagging similar species");
    try (Transaction tx = db.beginTx()) {
      flagSimilarNames(db.dao().allSpecies());
      tx.success();
    }
    LOG.info("Start flagging similar infraspecies");
    try (Transaction tx = db.beginTx()) {
      flagSimilarNames(db.dao().allInfraSpecies());
      tx.success();
    }
  }

  /**
   * Small null safe wrapper around our DAO that logs missing nub usages for existing nodes.
   */
  private NubUsage read(Node n) {
    NubUsage u = db.dao().readNub(n);
    if (u == null) {
      LOG.error("Missing kvp nub usage for node {} {}", n.getId(), NeoProperties.getScientificName(n));
      throw new IllegalStateException("Missing kvp nub usage for node " + n.getId());
    }
    return u;
  }

  /**
   * Flags accepted usages whose normalized name collides with one seen earlier in the
   * iteration as probable orthographic variants; the first occurrence is kept unflagged.
   */
  private void flagSimilarNames(ResourceIterator<Node> iter) {
    // normalized name -> first seen canonical name (used in the remark of later variants)
    Map<String, String> names = Maps.newHashMap();
    for (Node n : Iterators.loop(iter)) {
      if (!n.hasLabel(Labels.SYNONYM)) {
        NubUsage u = read(n);
        String normedName = db.dao().canonicalOrScientificName(u.parsedName, false);
        if (!StringUtils.isBlank(normedName)) {
          if (names.containsKey(normedName)) {
            u.issues.add(NameUsageIssue.ORTHOGRAPHIC_VARIANT);
            u.addRemark("Possible variant of " + names.get(normedName));
            db.store(u);
          } else {
            names.put(normedName, u.parsedName.canonicalNameComplete());
          }
        }
      }
    }
  }

  /**
   * Assigns a doubtful status to accepted names that only differ in authorship
   *
   * @param nodes any node iterable to check for names
   */
  private void markDuplicatesRedundant(ResourceIterable<Node> nodes) {
    // rank-prefixed canonical name -> node id of the currently preferred usage
    Object2LongMap<String> names = new Object2LongOpenHashMap<>();
    for (Node n : nodes) {
      if (!n.hasLabel(Labels.SYNONYM)) {
        NubUsage u = read(n);
        String name = u.parsedName.canonicalName();
        if (u.status == TaxonomicStatus.ACCEPTED && !StringUtils.isBlank(name)) {
          // prefix with rank ordinal to become unique across ranks (ordinal is shorter than full name to save mem)
          String indexedName = u.rank.ordinal() + name;
          if (names.containsKey(indexedName)) {
            // duplicate accepted canonical name. Check which has priority
            Node n1 = db.getNode(names.get(indexedName));
            NubUsage u1 = read(n1);
            int p1 = priorities.get(u1.datasetKey);
            int p2 = priorities.get(u.datasetKey);
            if (p2 < p1) {
              // the old usage is from a less trusted source
              u1.status = TaxonomicStatus.DOUBTFUL;
              db.store(u1);
              names.put(indexedName, n.getId());
            } else {
              // the old usage is from a higher trusted source, keep it
              u.status = TaxonomicStatus.DOUBTFUL;
              db.store(u);
            }
          } else {
            names.put(indexedName, n.getId());
          }
        }
      }
    }
  }

  /**
   * Incorporate Rod Pages IPNI name DOIs from https://github.com/rdmpage/ipni-names
   */
  private void addPublicationDois() {
    // not implemented yet
  }

  /**
   * Runs the given validator in its own transaction, failing the build with an
   * AssertionError if validation does not pass.
   */
  private void validate(NubValidation validator) throws AssertionError {
    try (Transaction tx = db.beginTx()) {
      boolean valid = validator.validate();
      if (valid) {
        LOG.info("{} passed", validator.getClass().getSimpleName());
      } else {
        LOG.error("Backbone is not valid! {} failed", validator.getClass().getSimpleName());
        throw new AssertionError("Backbone is not valid!");
      }
    }
  }

  /**
   * Goes through all accepted infraspecies and checks if a matching autonym exists,
   * creating missing autonyms where needed.
   * An autonym is an infraspecific taxon that has the same species and infraspecific epithet.
   * We do this last to not persistent autonyms that we dont need after basionyms are grouped or status has changed for some other reason.
   */
  private void manageAutonyms() {
    if (!cfg.keepLonelyAutonyms) {
      LOG.info("Delete lonely autonyms");
      try (Transaction tx = db.beginTx()) {
        for (Node n : Iterators.loop(db.dao().allAutonyms())) {
          Rank rank = NeoProperties.getRank(n, Rank.UNRANKED);
          if (!n.hasLabel(Labels.SYNONYM)) {
            Node p = db.parent(n);
            // count all childs of same rank
            int count = 0;
            for (Node c : Traversals.CHILDREN.traverse(p).nodes()) {
              if (NeoProperties.getRank(c, Rank.UNRANKED) == rank) {
                count++;
              }
            }
            if (count == 1) {
              // only this accepted autonym, try to remove!!!
              LOG.info("Removing lonely {} autonym {} {}", rank, n, NeoProperties.getScientificName(n));
              removeTaxonIfEmpty(db.dao().readNub(n));
            }
          }
        }
        tx.success();
      }
    }
    LOG.info("Start creating missing autonyms");
    try (Transaction tx = db.beginTx()) {
      int counter = 0;
      for (Node n : Iterators.loop(db.dao().allInfraSpecies())) {
        if (!n.hasLabel(Labels.SYNONYM)) {
          NubUsage u = read(n);
          // check for autonyms
          if (!u.parsedName.isAutonym()) {
            // build the autonym name: infraspecific epithet equals the specific epithet
            ParsedName pn = new ParsedName();
            pn.setType(NameType.SCIENTIFIC);
            pn.setGenusOrAbove(u.parsedName.getGenusOrAbove());
            pn.setSpecificEpithet(u.parsedName.getSpecificEpithet());
            pn.setInfraSpecificEpithet(u.parsedName.getSpecificEpithet());
            pn.setScientificName(pn.canonicalName());
            pn.setRank(u.rank);
            try {
              NubUsageMatch autoMatch = db.findNubUsage(pn.canonicalName(), u.rank, u.kingdom, true);
              if (!autoMatch.isMatch()) {
                NubUsage parent = db.parent(u);
                SrcUsage autonym = new SrcUsage();
                autonym.rank = u.rank;
                autonym.scientificName = pn.canonicalName();
                autonym.parsedName = pn;
                autonym.status = TaxonomicStatus.ACCEPTED;
                try {
                  createNubUsage(autonym, Origin.AUTONYM, parent);
                  counter++;
                } catch (IgnoreSourceUsageException e) {
                  LOG.warn("Fail to persistent missing autonym {}", pn.canonicalName());
                }
              }
            } catch (HomonymException e) {
              LOG.error("Homonym autonym found: {}", e.getName());
            }
          }
        }
      }
      tx.success();
      LOG.info("Created {} missing autonyms", counter);
    }
  }

  /**
   * Loads all family nodes into a list inside a single transaction so callers can
   * process each family in its own transaction later.
   */
  private List<Node> listFamilies() {
    List<Node> families;
    try (Transaction tx = db.beginTx()) {
      families = Iterators.asList(db.dao().allFamilies());
    }
    return families;
  }

  /**
   * Goes through all usages and tries to discover basionyms by comparing the specific or infraspecific epithet and the authorships within a family.
   * As we often see missing brackets from author names we must code defensively and allow several original names in the data for a single epithet.
   */
  private void detectBasionyms() {
    try {
      LOG.info("Discover basionyms");
      int newBasionyms = 0;
      int newRelations = 0;
      final BasionymSorter basSorter = new BasionymSorter(authorComparator);
      // load all family nodes into list so we can process them seach in a separate transaction later on
      List<Node> families = listFamilies();
      for (Node n : families) {
        try (Transaction tx = db.beginTx()) {
          NubUsage fam = read(n);
          if (!fam.status.isSynonym()) {
            // stemmed terminal epithet -> usages carrying it within this family
            Map<String, List<NubUsage>> epithets = Maps.newHashMap();
            // epithet -> other epithets already linked to it via explicit basionym relations
            Map<String, Set<String>> epithetBridges = Maps.newHashMap();
            LOG.debug("Discover basionyms in family {}", fam.parsedName.canonicalNameComplete());
            // key all names by their terminal epithet
            for (Node c : Traversals.DESCENDANTS.traverse(n).nodes()) {
              NubUsage nub = read(c);
              // ignore all supra specific names and autonyms
              if (nub.rank.isSpeciesOrBelow() && !c.hasLabel(Labels.AUTONYM)) {
                String epithet = SciNameNormalizer.stemEpithet(nub.parsedName.getTerminalEpithet());
                if (!epithets.containsKey(epithet)) {
                  epithets.put(epithet, Lists.newArrayList(nub));
                } else {
                  epithets.get(epithet).add(nub);
                }
                // now check if a basionym relation exists already that reaches out to some other epithet, e.g. due to gender changes
                for (Node bg : Traversals.BASIONYM_GROUP.evaluator(Evaluators.excludeStartPosition()).traverse(c).nodes()) {
                  NubUsage bgu = read(bg);
                  String epithet2 = SciNameNormalizer.stemEpithet(bgu.parsedName.getTerminalEpithet());
                  if (epithet2 != null && !epithet2.equals(epithet)) {
                    if (!epithetBridges.containsKey(epithet)) {
                      epithetBridges.put(epithet, Sets.newHashSet(epithet2));
                    } else {
                      epithetBridges.get(epithet).add(epithet2);
                    }
                  }
                }
              }
            }
            LOG.debug("{} distinct epithets found in family {}", epithets.size(), fam.parsedName.canonicalNameComplete());
            // merge epithet groups based on existing basionym relations, catching some gender changes
            LOG.debug("{} epithets are connected with explicit basionym relations", epithetBridges.size());
            for (Map.Entry<String, Set<String>> bridge : epithetBridges.entrySet()) {
              if (epithets.containsKey(bridge.getKey())) {
                List<NubUsage> usages = epithets.get(bridge.getKey());
                for (String epi2 : bridge.getValue()) {
                  if (epithets.containsKey(epi2)) {
                    LOG.debug("Merging {} usages of epithet {} into epithet group {}", epithets.get(epi2).size(), epi2, bridge.getKey());
                    usages.addAll(epithets.remove(epi2));
                  }
                }
              }
            }
            // now compare authorships for each epithet group
            for (Map.Entry<String, List<NubUsage>> epithetGroup : epithets.entrySet()) {
              Collection<BasionymGroup<NubUsage>> groups = basSorter.groupBasionyms(epithetGroup.getValue(), new Function<NubUsage, ParsedName>() {
                @Override
                public ParsedName apply(NubUsage nub) {
                  return nub.parsedName;
                }
              });
              // go through groups and persistent basionym relations where needed
              for (BasionymGroup<NubUsage> group : groups) {
                // we only need to process groups that contain recombinations
                if (group.hasRecombinations()) {
                  // if we have a basionym creating relations is straight forward
                  NubUsage basionym = null;
                  if (group.hasBasionym()) {
                    basionym = group.getBasionym();
                  } else if (group.getRecombinations().size() > 1) {
                    // we need to persistent a placeholder basionym to group the 2 or more recombinations
                    newBasionyms++;
                    basionym = createBasionymPlaceholder(fam, group);
                  }
                  // persistent basionym relations
                  if (basionym != null) {
                    for (NubUsage u : group.getRecombinations()) {
                      if (createBasionymRelationIfNotExisting(basionym.node, u.node)) {
                        newRelations++;
                        u.issues.add(NameUsageIssue.ORIGINAL_NAME_DERIVED);
                        db.store(u);
                      }
                    }
                  }
                }
              }
            }
          }
          tx.success();
        } catch (Exception e) {
          // keep processing the remaining families even if one fails
          LOG.error("Error detecting basionyms for family {}", n.getProperty(NeoProperties.SCIENTIFIC_NAME, "no name"), e);
        }
      }
      LOG.info("Discovered {} new basionym relations and created {} basionym placeholders", newRelations, newBasionyms);
    } catch (Throwable e) {
      // basionym detection is an enrichment step; never abort the whole build over it
      LOG.error("Error detecting basionyms", e);
    }
  }

  /**
   * Creates a doubtful, unranked placeholder usage ("? epithet author, year") under the given
   * family to act as the shared basionym of a group of recombinations whose true basionym is unknown.
   */
  private NubUsage createBasionymPlaceholder(NubUsage family, BasionymGroup group) {
    NubUsage basionym = new NubUsage();
    basionym.datasetKey = null;
    basionym.origin = Origin.BASIONYM_PLACEHOLDER;
    basionym.rank = Rank.UNRANKED;
    basionym.status = TaxonomicStatus.DOUBTFUL;
    basionym.parsedName = new ParsedName();
    basionym.parsedName.setGenusOrAbove("?");
    basionym.parsedName.setSpecificEpithet(group.getEpithet());
    basionym.parsedName.setAuthorship(group.getAuthorship());
    basionym.parsedName.setYear(group.getYear());
    basionym.parsedName.setType(NameType.PLACEHOLDER);
    basionym.parsedName.setScientificName(basionym.parsedName.fullName());
    LOG.debug("creating basionym placeholder {} in family {}", basionym.parsedName.canonicalNameComplete(), family.parsedName.canonicalName());
    return db.addUsage(family, basionym);
  }

  /**
   * Adds all kingdoms from the Kingdom enum as root usages, reissuing their fixed stable keys.
   */
  private void addKingdoms() {
    try (Transaction tx = db.beginTx()) {
      LOG.info("Adding kingdom");
      currSrc = new ClbSource(null, Constants.NUB_DATASET_KEY, "Backbone kingdoms");
      for (Kingdom k : Kingdom.values()) {
        NubUsage ku = new NubUsage();
        ku.usageKey = idGen.reissue(k.nubUsageKey());
        ku.kingdom = k;
        ku.datasetKey = Constants.NUB_DATASET_KEY;
        ku.origin = Origin.SOURCE;
        ku.rank = Rank.KINGDOM;
        ku.status = TaxonomicStatus.ACCEPTED;
        ku.parsedName = new ParsedName();
        ku.parsedName.setType(NameType.SCIENTIFIC);
        ku.parsedName.setGenusOrAbove(k.scientificName());
        ku.parsedName.setScientificName(k.scientificName());
        // treat incertae sedis placeholder kingdom different
        if (k == Kingdom.INCERTAE_SEDIS) {
          ku.status = TaxonomicStatus.DOUBTFUL;
          ku.parsedName.setType(NameType.PLACEHOLDER);
        }
        db.addRoot(ku);
      }
      tx.success();
    }
  }

  /**
   * TODO: to be implemented.
   * Now clb still dynamically retrieves extension data from all checklists, but in the future we like to control
   * which extension record is attached to a backbone usage.
   * Adds all extension data, e.g. vernacular names, to the backbone directly.
   * TODO:
   * - build map from source usage key to nub node id
   * - stream (jdbc copy) through all extension data in postgres and attach to relevant nub node
   */
  private void addExtensionData() {
    LOG.warn("NOT IMPLEMENTED: Copy extension data to backbone");
    //Joiner commaJoin = Joiner.on(", ").skipNulls();
    //for (Node n : Iterators.loop(db.allTaxa())) {
    //  NubUsage nub = read(n);
    //  if (!nub.sourceIds.isEmpty()) {
    //    LOG.debug("Add extension data from source ids {}", commaJoin.join(nub.sourceIds));
    //  }
    //}
  }

  /**
   * Flags emtpy genera or removes them if they have an IMPLICIT origin
   */
  private void flagEmptyGenera() {
    LOG.info("flag empty genera as doubtful");
    try (Transaction tx = db.beginTx()) {
      for (Node gen : Iterators.loop(db.dao().allGenera())) {
        // a genus without outgoing PARENT_OF relations has no children at all
        if (!gen.hasRelationship(RelType.PARENT_OF, Direction.OUTGOING)) {
          NubUsage nub = read(gen);
          if (nub.origin == Origin.IMPLICIT_NAME) {
            // remove this genus as it was created by the nub builder as an implicit genus name for a species we seem to have moved or deleted since
            if (removeTaxonIfEmpty(nub)) {
              continue;
            }
          }
          if (!nub.status.isSynonym()) {
            nub.issues.add(NameUsageIssue.NO_SPECIES);
            if (TaxonomicStatus.ACCEPTED == nub.status) {
              nub.status = TaxonomicStatus.DOUBTFUL;
            }
            db.store(nub);
          }
        }
      }
      tx.success();
    }
  }

  /**
   * Updates implicit names to be accepted (not doubtful) and removes implicit taxa with no
children if configured to do so.
   */
  private void cleanImplicitTaxa() {
    LOG.info("Clean implicit taxa");
    try (Transaction tx = db.beginTx()) {
      for (Node n : Iterators.loop(db.dao().allImplicitNames())) {
        NubUsage nub = read(n);
        if (!cfg.keepEmptyImplicitNames) {
          if (removeTaxonIfEmpty(nub)) {
            // mark as gone so we dont touch the deleted usage below
            nub = null;
          }
        }
        // update status if still existing
        if (nub != null) {
          if (nub.status == TaxonomicStatus.DOUBTFUL) {
            nub.status = TaxonomicStatus.ACCEPTED;
            db.store(nub);
          }
        }
      }
      tx.success();
    }
  }

  /**
   * Goes through all accepted species and infraspecies and makes sure the name matches the genus, species classification.
   * For example an accepted species Picea alba with a parent genus of Abies is taxonomic nonsense.
   * Badly classified names are assigned the doubtful status and an NameUsageIssue.NAME_PARENT_MISMATCH is flagged
   */
  private void flagParentMismatch() {
    LOG.info("flag classification name mismatches");
    try (Transaction tx = db.beginTx()) {
      for (Node gn : Iterators.loop(db.dao().allGenera())) {
        if (!gn.hasLabel(Labels.SYNONYM)) {
          NubUsage gen = read(gn);
          if (gen.kingdom == Kingdom.VIRUSES) {
            // virus names are unparsable...
            continue;
          }
          if (gen.parsedName == null || gen.parsedName.getGenusOrAbove() == null) {
            LOG.warn("Genus {} without genus name part: {} {}", gn, gen.rank, NeoProperties.getScientificName(gn));
            continue;
          }
          String genus = gen.parsedName.getGenusOrAbove();
          // flag non matching names
          for (Node spn : Traversals.CHILDREN.traverse(gn).nodes()) {
            NubUsage sp = db.dao().readNub(spn);
            if (sp.rank != Rank.SPECIES) {
              LOG.warn("Genus child {} is not a species: {} {}", spn, sp.rank, NeoProperties.getScientificName(spn));
              continue;
            }
            if (sp.parsedName == null || sp.parsedName.getGenusOrAbove() == null) {
              LOG.warn("Genus child {} without genus name part: {} {}", spn, sp.rank, NeoProperties.getScientificName(spn));
              continue;
            }
            if (!genus.equals(sp.parsedName.getGenusOrAbove())) {
              sp.issues.add(NameUsageIssue.NAME_PARENT_MISMATCH);
            }
            // NOTE(review): this store is unconditional, i.e. every species is rewritten even
            // when no issue was added — the infraspecies loop below only stores on mismatch.
            // Confirm whether the store belongs inside the if above.
            db.store(sp);
            // check infraspecific names
            String species = sp.parsedName.getSpecificEpithet();
            for (Node ispn : Traversals.CHILDREN.traverse(spn).nodes()) {
              NubUsage isp = db.dao().readNub(ispn);
              if (isp.parsedName.getInfraSpecificEpithet() == null) {
                LOG.warn("Species child {} without an infraspecific epithet: {} {}", ispn, isp.rank, NeoProperties.getScientificName(ispn));
                continue;
              }
              if (!genus.equals(isp.parsedName.getGenusOrAbove()) || !species.equals(isp.parsedName.getInfraSpecificEpithet())) {
                isp.issues.add(NameUsageIssue.NAME_PARENT_MISMATCH);
                db.store(isp);
              }
            }
          }
        }
      }
    }
  }

  /**
   * Imports every configured source checklist in order; a failure in one source is
   * logged and the remaining sources are still processed.
   */
  private void addDatasets() {
    LOG.info("Start adding backbone sources");
    for (NubSource src : sources) {
      try {
        addDataset(src);
      } catch (Exception e) {
        LOG.error("Error processing source {}", src.name, e);
      } finally {
        Stopwatch sw = Stopwatch.createStarted();
        src.close();
        LOG.debug("Closing source {} took {}ms", src.name, sw.elapsed(TimeUnit.MILLISECONDS));
      }
    }
  }

  /**
   * Imports a single source checklist in batched transactions, then resolves the
   * explicit basionym relations collected while streaming its usages.
   */
  private void addDataset(NubSource source) {
    LOG.info("Adding {}th source {}", datasetCounter++, source.name);
    currSrc = source;
    // sources added later get a higher priority counter (lower trust)
    priorities.put(source.key, ++maxPriority);
    // clear dataset wide caches
    parents.clear();
    basionymRels.clear();
    src2NubKey.clear();
    allowedRanks.clear();
    // prepare set of allowed ranks for this source
    for (Rank r : Rank.values()) {
      if (NUB_RANKS.contains(r) && r.ordinal() >= source.ignoreRanksAbove.ordinal()) {
        allowedRanks.add(r);
      }
    }
    int start = sourceUsageCounter;
    // do transactions in batches to dont slow down neo too much
    int batchCounter = 1;
    // makes sure to close the iterator - important for releasing neo resources, slows down considerably otherwise!
    try (CloseableIterator<SrcUsage> iter = source.iterator()) {
      UnmodifiableIterator<List<SrcUsage>> batchIter = com.google.common.collect.Iterators.partition(iter, cfg.neo.batchSize);
      while (batchIter.hasNext()) {
        try (Transaction tx = db.beginTx()) {
          List<SrcUsage> batch = batchIter.next();
          LOG.debug("process batch {} with {} usages", batchCounter++, batch.size());
          for (SrcUsage u : batch) {
            // catch errors processing individual records too
            try {
              LOG.debug("process {} {} {}", u.status, u.rank, u.scientificName);
              sourceUsageCounter++;
              parents.add(u);
              NubUsage parent = parents.nubParent();
              // replace accepted taxa with doubtful ones for all nomenclators and for synonym parents
              // http://dev.gbif.org/issues/browse/POR-2780
              if (TaxonomicStatus.ACCEPTED == u.status && (currSrc.nomenclator || parent.status.isSynonym())) {
                u.status = TaxonomicStatus.DOUBTFUL;
              }
              if (parent.status.isSynonym()) {
                // use accepted instead
                parent = db.parent(parent);
              }
              NubUsage nub = processSourceUsage(u, Origin.SOURCE, parent);
              if (nub != null) {
                parents.put(nub);
              }
            } catch (IgnoreSourceUsageException e) {
              LOG.debug("Ignore usage {} >{}< {}", u.key, u.scientificName, e.getMessage());
            } catch (StackOverflowError e) {
              // if this happens its time to fix some code!
              LOG.error("CODE BUG: StackOverflowError processing {} from source {}", u.scientificName, source.name, e);
              LOG.error("CAUSE: {}", u.parsedName);
            } catch (RuntimeException e) {
              LOG.error("RuntimeException processing {} from source {}", u.scientificName, source.name, e);
            }
          }
          tx.success();
        }
      }
    } catch (Exception e) {
      // NOTE(review): Throwables.propagate is deprecated in recent Guava; consider
      // throwing new RuntimeException(e) directly when upgrading.
      Throwables.propagate(e);
    }
    // process explicit basionym relations
    processExplicitBasionymRels();
    LOG.info("Processed {} source usages for {}", sourceUsageCounter - start, source.name);
  }

  /**
   * Resolves the basionym relations declared by the current source (collected in
   * basionymRels/src2NubKey) into BASIONYM_OF relations between nub nodes.
   */
  private void processExplicitBasionymRels() {
    try (Transaction tx = db.beginTx()) {
      LOG.info("Processing {} explicit basionym relations from {}", basionymRels.size(), currSrc.name);
      for (Map.Entry<Long, Integer> entry : basionymRels.entrySet()) {
        Node n = db.getNode(entry.getKey());
        Node bas = db.getNode(src2NubKey.get(entry.getValue()));
        // find basionym node by sourceKey
        if (n != null && bas != null) {
          // basionym has not been verified yet, make sure its of rank <= genus and its name type is no placeholder
          NubUsage basUsage = read(bas);
          if (!basUsage.rank.isSpeciesOrBelow()) {
            LOG.warn("Ignore explicit basionym {} of rank {}", basUsage.parsedName.getScientificName(), basUsage.rank);
            continue;
          }
          if (!basUsage.parsedName.getType().isBackboneType()) {
            LOG.warn("Ignore explicit basionym {} with name type {}", basUsage.parsedName.getScientificName(), basUsage.parsedName.getType());
            continue;
          }
          if (!createBasionymRelationIfNotExisting(bas, n)) {
            LOG.warn("Nub usage {} already contains a contradicting basionym relation. Ignore basionym {} from source {}", n.getProperty(NeoProperties.SCIENTIFIC_NAME, n.getId()), bas.getProperty(NeoProperties.SCIENTIFIC_NAME, bas.getId()), currSrc.name);
          }
        } else {
          LOG.warn("Could not resolve basionym relation for nub {} to source usage {}", entry.getKey(), entry.getValue());
        }
      }
      tx.success();
    }
  }

  /**
   * Looks for all implicit names and tries to find a match with author to replace them.
   * This is done after we added an entire dataset so we can prefer accepted names over doubtful ones.
   * If we instead would do this on the fly as we add new names we might prefer a doubtful name.
   */
  private void replaceImplicitNames() {
    // NOTE(review): the actual replacement store is commented out below, so this method
    // currently only logs candidate replacements without modifying the backbone — confirm intent.
    LOG.info("Replace implicit names for dataset {}", currSrc.name);
    try (Transaction tx = db.beginTx()) {
      for (Node n : Iterators.loop(db.dao().allImplicitNames())) {
        NubUsage nub = read(n);
        try {
          NubUsageMatch match = db.findNubUsage(nub.datasetKey, nub.parsedName, nub.rank, nub.status, nub.kingdom, db.parent(nub));
          if (match.isMatch()) {
            // replace implicit name with match
            LOG.debug("Replace implicit name {} with {}", nub.parsedName.fullName(), match.usage.parsedName.fullName());
            //db.store(nub);
          }
        } catch (IgnoreSourceUsageException e) {
          LOG.warn(e.name, e);
        }
      }
      tx.success();
    }
  }

  // moves all children of source under target
  private void swapName(NubUsage target, NubUsage source) {
    db.transferChildren(target, source);
  }

  /**
   * @return true if basionym relationship was created
   */
  private boolean createBasionymRelationIfNotExisting(Node basionym, Node n) {
    // never self-relate; never add a second basionym relation in either direction
    if (!basionym.equals(n) && !n.hasRelationship(RelType.BASIONYM_OF, Direction.BOTH)) {
      basionym.createRelationshipTo(n, RelType.BASIONYM_OF);
      basionym.addLabel(Labels.BASIONYM);
      return true;
    }
    return false;
  }

  /**
   * Core merge routine: matches a single source usage against the existing nub,
   * creating, updating or ignoring it depending on match quality, status and source trust.
   *
   * @return the matched or created nub usage (may be null when nothing matched/created)
   */
  private NubUsage processSourceUsage(SrcUsage u, Origin origin, NubUsage parent) throws IgnoreSourceUsageException {
    Preconditions.checkNotNull(u.status);
    Preconditions.checkArgument(parent.status.isAccepted());
    // try to parse name
    addParsedNameIfNull(u);
    // match to existing usages
    NubUsageMatch match = db.findNubUsage(currSrc.key, u, parents.nubKingdom(), parent);
    // process only usages not to be ignored and with desired ranks
    if (!match.ignore && u.rank != null && allowedRanks.contains(u.rank)) {
      // from now on a rank is guaranteed!
      if (!match.isMatch()) {
        // remember if we had a doubtful match
        NubUsage doubtful = match.doubtfulUsage;
        // persistent new nub usage if there wasnt any yet
        match = createNubUsage(u, origin, parent);
        // check if we had a doubtful or implicit & accepted name match
        if (doubtful != null && u.status == TaxonomicStatus.ACCEPTED) {
          db.transferChildren(match.usage, doubtful);
        }
      } else {
        if (origin == Origin.IMPLICIT_NAME) {
          // do not update or change usages with implicit names
          return match.usage;
        }
        Equality authorEq = authorComparator.compare(match.usage.parsedName, u.parsedName);
        if (match.usage.status.isSynonym() == u.status.isSynonym()) {
          // update nub usage if status matches
          updateNub(match.usage, u, origin, parent);
        } else if (Equality.DIFFERENT == authorEq) {
          // persistent new nub usage with different status and authorship as before
          match = createNubUsage(u, origin, parent);
        } else if (fromCurrentSource(match.usage) && !u.status.isSynonym()) {
          // prefer accepted over synonym if from the same source
          LOG.debug("prefer accepted {} over synonym usage from the same source", u.scientificName);
          delete(match.usage);
          match = createNubUsage(u, origin, parent);
        } else if (fromCurrentSource(match.usage) && u.parsedName.hasAuthorship() && Equality.EQUAL != authorEq) {
          // allow new synonyms with non equal authorship to be created
          match = createNubUsage(u, origin, parent);
        } else if (currSrc.nomenclator) {
          updateNomenclature(match.usage, u);
        } else {
          LOG.debug("Ignore source usage. Status {} is different from nub ({}): {}", u.status, match.usage.status, u.scientificName);
        }
      }
      if (match.isMatch()) {
        if (u.key != null) {
          // remember all original source usage key to nub id mappings per dataset
          src2NubKey.put((int) u.key, match.usage.node.getId());
        }
        if (u.originalNameKey != null) {
          // remember basionym relation.
          // Basionyms do not follow the taxnomic hierarchy, so we might not have seen some source keys yet
          // we will process all basionyms at the end of each source dataset
          basionymRels.put(match.usage.node.getId(), (int) u.originalNameKey);
        }
      }
    } else {
      LOG.debug("Ignore {} source usage: {}", u.rank, u.scientificName);
    }
    return match.usage;
  }

  /**
   * Deletes a nub usage and purges its entries from the per-dataset source key caches.
   */
  private void delete(NubUsage nub) {
    for (int sourceId : nub.sourceIds) {
      src2NubKey.remove(sourceId);
    }
    basionymRels.remove(nub.node.getId());
    db.dao().delete(nub);
  }

  /**
   * Removes a taxon if it has no accepted children or synonyms
   * @return true if usage was deleted
   */
  private boolean removeTaxonIfEmpty(NubUsage u) {
    if (u != null
        && !u.node.hasRelationship(Direction.INCOMING, RelType.SYNONYM_OF, RelType.PROPARTE_SYNONYM_OF)
        && !u.node.hasRelationship(Direction.OUTGOING, RelType.PARENT_OF)) {
      delete(u);
      return true;
    }
    return false;
  }

  private NubUsageMatch createNubUsage(SrcUsage u, Origin origin, NubUsage p) throws IgnoreSourceUsageException {
    addParsedNameIfNull(u);
    // if this is a synonym but the parent is not part of the nub (e.g. cause its a placeholder name) ignore it!
// http://dev.gbif.org/issues/browse/POR-2990 if (u.status.isSynonym() && !parents.parentInNub()) { throw new IgnoreSourceUsageException("Ignoring synonym as accepted parent is not part of the nub", u.scientificName); } // ignore synonyms of low rank for higher taxa // http://dev.gbif.org/issues/browse/POR-3169 if (u.status.isSynonym() && !u.rank.higherThan(Rank.GENUS) && p.rank.higherThan(Rank.FAMILY)) { String message = String.format("Ignoring %s synonym %s for %s %s", u.rank, u.scientificName, p.rank, p.parsedName.fullName()); throw new IgnoreSourceUsageException(message, u.scientificName); } // make sure parent is accepted if (p.status.isSynonym()) { LOG.warn("Parent {} of {} is a synonym", p.parsedName.canonicalNameComplete(), u.parsedName.canonicalNameComplete()); throw new IllegalStateException("Parent is a synonym"+u.scientificName); } // make sure we have a parsed genus to deal with implicit names and the kingdom is not viruses as these have no structured name if (p.kingdom != Kingdom.VIRUSES) { if (u.status.isAccepted()) { // skip badly organized rank hierarchies if (!p.rank.higherThan(u.rank)) { LOG.warn("Source {} {} with inversed parent {} {}", u.rank, u.scientificName, p.rank, p.parsedName.canonicalNameComplete()); throw new IgnoreSourceUsageException("Ignoring source with inverted rank order", u.scientificName); } // we want the parent of any infraspecies ranks to be the species if (p.rank.isInfraspecific()) { p = findParentSpecies(p); } // check if implicit species or genus parents are needed SrcUsage implicit = new SrcUsage(); NubUsage implicitParent = null; try { if (u.parsedName.getGenusOrAbove() != null) { if (u.rank == Rank.SPECIES && p.rank != Rank.GENUS) { implicit.rank = Rank.GENUS; implicit.scientificName = u.parsedName.getGenusOrAbove(); implicit.status = TaxonomicStatus.DOUBTFUL; implicitParent = processSourceUsage(implicit, Origin.IMPLICIT_NAME, p); } else if (u.rank.isInfraspecific() && p.rank != Rank.SPECIES) { implicit.rank = 
Rank.SPECIES; implicit.scientificName = u.parsedName.canonicalSpeciesName(); implicit.status = TaxonomicStatus.DOUBTFUL; implicitParent = processSourceUsage(implicit, Origin.IMPLICIT_NAME, p); } } else { LOG.warn("Missing genus in parsed name for {}", u.scientificName); } } catch (IgnoreSourceUsageException e) { LOG.warn("Ignore implicit {} {}", implicit.rank, implicit.scientificName); } catch (Exception e) { LOG.error("Failed to persistent implicit {} {}", implicit.rank, implicit.scientificName, e); } if (implicitParent != null) { // in case the implicit parent species is a synonym turn the infraspecies also into a synonym if (implicitParent.status.isSynonym() && implicitParent.rank == Rank.SPECIES) { // http://dev.gbif.org/issues/browse/POR-2780 u.status = TaxonomicStatus.SYNONYM; p = db.parent(implicitParent); } else { // use the implicit parent p = implicitParent; } } } else { // a synonym // avoid cases where synonyms for a binnomial are monomials of rank genus or even higher! if (p.parsedName.isBinomial() && u.rank.ordinal() < Rank.INFRAGENERIC_NAME.ordinal()) { LOG.warn("Source synonym {} {} with accepted binomial name {} {}", u.rank, u.scientificName, p.rank, p.parsedName.canonicalNameComplete()); throw new IgnoreSourceUsageException("Ignoring source with inverted rank order", u.scientificName); } } } // add to nub db return NubUsageMatch.match(db.addUsage(p, u, origin, currSrc.key)); } /** * moves up the parent_of rels to the species or first taxon above. 
* Returns original usage in case rank was at species level or above already
 */
private NubUsage findParentSpecies(NubUsage p) {
  while (p.rank.isInfraspecific()) {
    p = db.parent(p);
  }
  return p;
}

/**
 * Parses the scientific name if the source usage does not carry a parsed name yet,
 * filtering out unwanted names and reconciling the parsed rank with the given rank.
 *
 * @throws IgnoreSourceUsageException for indet, incomplete, taxon concept, null-epithet,
 *                                    rank-mismatching or unparsable (non virus) names
 */
private void addParsedNameIfNull(SrcUsage u) throws IgnoreSourceUsageException {
  if (u.parsedName == null) {
    try {
      u.parsedName = parser.parse(u.scientificName, u.rank);
      // avoid indet names
      if (ignoredNameTypes.contains(u.parsedName.getType())) {
        throw new IgnoreSourceUsageException("Ignore " + u.parsedName.getType() + " name", u.scientificName);
      }
      // avoid incomplete names: an infraspecific epithet without a specific one,
      // or a specific epithet without a genus
      if ((!StringUtils.isBlank(u.parsedName.getInfraSpecificEpithet()) && StringUtils.isBlank(u.parsedName.getSpecificEpithet())) || !StringUtils.isBlank(u.parsedName.getSpecificEpithet()) && StringUtils.isBlank(u.parsedName.getGenusOrAbove())) {
        throw new IgnoreSourceUsageException("Ignore incomplete name", u.scientificName);
      }
      // avoid taxon concept names
      if (!StringUtils.isBlank(u.parsedName.getSensu())) {
        throw new IgnoreSourceUsageException("Ignore taxon concept names", u.scientificName);
      }
      // avoid names with nulls in epithets
      if ("null".equals(u.parsedName.getSpecificEpithet()) || "null".equals(u.parsedName.getInfraSpecificEpithet())) {
        throw new IgnoreSourceUsageException("Ignore names with null epithets", u.scientificName);
      }
      // consider infraspecific names subspecies
      if (u.parsedName.getRank() == Rank.INFRASPECIFIC_NAME && u.parsedName.isBinomial() && u.parsedName.getInfraSpecificEpithet() != null) {
        u.parsedName.setRank(Rank.SUBSPECIES);
      }
      // consider parsed rank only for bi/trinomials
      Rank pRank = u.parsedName.isBinomial() ? u.parsedName.getRank() : null;
      if (pRank != null && pRank != u.rank && !pRank.isUncomparable()) {
        if (u.rank == null) {
          LOG.debug("Use parsed rank {}", pRank);
          u.rank = pRank;
        } else if (u.rank.isUncomparable()) {
          LOG.debug("Prefer parsed rank {} over {}", pRank, u.rank);
          u.rank = pRank;
        } else {
          LOG.debug("Rank {} does not match parsed rank {}. Ignore {}", u.rank, pRank, u.scientificName);
          throw new IgnoreSourceUsageException("Parsed rank mismatch", u.scientificName);
        }
      } else if (Rank.INFRAGENERIC_NAME == u.rank && u.parsedName.isBinomial()) {
        // this is an aggregate species rank as we have a binomial & rank=INFRAGENERIC - treat as a species!
        u.rank = Rank.SPECIES;
        LOG.debug("Treat infrageneric name {} as species", u.scientificName);
      }
      // strip author names from higher taxa
      if (u.rank != null && u.rank.higherThan(Rank.GENUS)) {
        clearAuthorship(u.parsedName);
      }
      //TODO: rebuild name in canonical form - e.g. removes subgenus references
      //u.parsedName.setScientificName(u.parsedName.canonicalNameComplete());
    } catch (UnparsableException e) {
      // allow virus names in the nub
      if (e.type == NameType.VIRUS) {
        u.parsedName = new ParsedName();
        u.parsedName.setScientificName(u.scientificName);
        u.parsedName.setType(e.type);
      } else {
        throw new IgnoreSourceUsageException("Unparsable " + e.type, u.scientificName);
      }
    }
  }
}

/**
 * Clears all authorship related fields of the parsed name.
 */
private void clearAuthorship(ParsedName pn) {
  pn.setAuthorship(null);
  pn.setYear(null);
  pn.setBracketAuthorship(null);
  pn.setBracketYear(null);
}

/**
 * Updates authorship, publishedIn and nomenclatural status of the nub usage from the source usage.
 * Existing nub values are only overwritten when the current source is a nomenclator.
 */
private void updateNomenclature(NubUsage nub, SrcUsage u) {
  LOG.debug("Updating nomenclature for {} from source {}", nub.parsedName.getScientificName(), u.parsedName.getScientificName());
  // authorship
  if (!u.parsedName.authorshipComplete().isEmpty() && (nub.parsedName.authorshipComplete().isEmpty() || currSrc.nomenclator)) {
    nub.parsedName.setAuthorship(u.parsedName.getAuthorship());
    nub.parsedName.setYear(u.parsedName.getYear());
    nub.parsedName.setBracketAuthorship(u.parsedName.getBracketAuthorship());
    nub.parsedName.setBracketYear(u.parsedName.getBracketYear());
    nub.parsedName.setAuthorsParsed(true);
    nub.parsedName.setScientificName(u.parsedName.canonicalNameComplete());
  }
  // publishedIn
  if (u.publishedIn != null && (nub.publishedIn == null || currSrc.nomenclator)) {
    nub.publishedIn = u.publishedIn;
  }
  // nom status
  if (u.nomStatus != null && u.nomStatus.length > 0 && (nub.nomStatus.isEmpty() || currSrc.nomenclator)) {
    nub.nomStatus = Sets.newHashSet(u.nomStatus);
  }
}

/**
 * Updates an existing, matched nub usage from a source usage: nomenclature, status,
 * origin, pro parte synonym relations and - where applicable - its classification.
 *
 * @param nub    the existing nub usage to update (stored at the end)
 * @param u      the matching source usage
 * @param origin the origin of the source record
 * @param parent the nub parent proposed by the source classification
 */
private void updateNub(NubUsage nub, SrcUsage u, Origin origin, NubUsage parent) {
  LOG.debug("Updating {} from source {}", nub.parsedName.getScientificName(), u.parsedName.getScientificName());
  NubUsage currNubParent = db.parent(nub);
  // update nomenclature and status only from source usages
  if (u.key != null) {
    nub.sourceIds.add(u.key);
    // update author, publication and nom status
    updateNomenclature(nub, u);
    // prefer accepted version over doubtful if its coming from the same dataset!
    if (nub.status == TaxonomicStatus.DOUBTFUL && u.status == TaxonomicStatus.ACCEPTED && fromCurrentSource(nub)) {
      nub.status = u.status;
      if (isNewParentApplicable(nub, currNubParent, parent) && !db.existsInClassification(currNubParent.node, parent.node, false)) {
        // current classification doesn't have that parent yet, lets apply it
        LOG.debug("Update doubtful {} classification with new parent {} {}", nub.parsedName.getScientificName(), parent.rank, parent.parsedName.getScientificName());
        db.createParentRelation(nub, parent);
      }
    }
    if (origin == Origin.SOURCE) {
      // only override original origin value if we update from a true source
      nub.origin = Origin.SOURCE;
    }
  }
  if (nub.status.isSynonym()) {
    // maybe we have a proparte synonym from the same dataset?
    if (fromCurrentSource(nub) && !parent.node.equals(currNubParent.node)) {
      nub.status = TaxonomicStatus.PROPARTE_SYNONYM;
      // persist a new pro parte relation
      LOG.debug("New accepted name {} found for pro parte synonym {}", parent.parsedName.getScientificName(), nub.parsedName.getScientificName());
      db.setSingleFromRelationship(nub.node, parent.node, RelType.PROPARTE_SYNONYM_OF);
    } else {
      // this might be a more exact kind of synonym status
      if (nub.status == TaxonomicStatus.SYNONYM) {
        nub.status = u.status;
      }
    }
  } else {
    // ACCEPTED
    if (isNewParentApplicable(nub, currNubParent, parent)
        && (currNubParent.kingdom == Kingdom.INCERTAE_SEDIS
            || db.existsInClassification(parent.node, currNubParent.node, false) && currNubParent.rank != parent.rank)) {
      LOG.debug("Update {} classification with new parent {} {}", nub.parsedName.getScientificName(), parent.rank, parent.parsedName.getScientificName());
      db.createParentRelation(nub, parent);
    }
  }
  db.store(nub);
}

/**
 * @return true if the new parent can replace the current one: it must differ from the current
 *         parent, not be of lower rank than it, and be of higher rank than the usage itself
 */
private boolean isNewParentApplicable(NubUsage nub, NubUsage currParent, NubUsage newParent) {
  return newParent != null
      && !currParent.equals(newParent)
      && (currParent.rank.higherThan(newParent.rank) || currParent.rank == newParent.rank)
      && newParent.rank.higherThan(nub.rank);
}

/**
 * @return true if the nub usage originates from the currently processed source dataset
 */
private boolean fromCurrentSource(NubUsage nub) {
  return nub.datasetKey.equals(currSrc.key);
}

/**
 * Runs the basionym consolidation phase if enabled in the configuration.
 */
private void groupByBasionym() {
  if (cfg.groupBasionyms) {
    LOG.info("Start basionym consolidation");
    verifyBasionyms();
    detectBasionyms();
    consolidateBasionymGroups();
  } else {
    LOG.info("Skip basionym consolidation");
  }
}

/**
 * Verifies existing basionyms by checking that the basionym does not have an original author
 */
private void verifyBasionyms() {
  LOG.info("Verify existing basionyms - TO BE IMPLEMENTED!");
}

/**
 * Make sure we only have at most one accepted name for each homotypical basionym group!
 * An entire group can consist of synonyms without a problem, but they must all refer to the same accepted name.
* If a previously accepted name needs to be turned into a synonym it will be of type homotypical_synonym.
 *
 * As we merge names from different taxonomies it is possible there are multiple accepted names (maybe via a synonym relation) in such a group.
 * We always stick to the first combination with the highest priority and make all others
 * a) synonyms of this if it is accepted
 * b) synonyms of the primary's accepted name if it was a synonym itself
 *
 * In case of conflicting accepted names we also flag these names with CONFLICTING_BASIONYM_COMBINATION
 */
private void consolidateBasionymGroups() {
  int counter = 0;
  int counterModified = 0;
  // first load all basionym node ids into a set so we can process them individually in separate transactions
  LongSet basIds = new LongOpenHashSet();
  try (Transaction tx = db.beginTx()) {
    for (Node bas : Iterators.loop(db.dao().allBasionyms())) {
      basIds.add(bas.getId());
    }
    LOG.info("Found {} basionyms to consolidate", basIds.size());
  }
  // now consolidate each basionym group in its own transaction
  for (long basId : basIds) {
    try (Transaction tx = db.beginTx()) {
      Node bas = db.getNode(basId);
      counter++;
      // sort all usage by source dataset priority, placing doubtful names last
      List<NubUsage> group = db.listBasionymGroup(bas);
      if (group.size() > 1) {
        // we stick to the first combination with the highest priority and make all others
        // a) synonyms of this if it is accepted
        // b) synonyms of the primary's accepted name if it was a synonym itself
        // if there are several usages with the same priority select one according to some defined rules
        final NubUsage primary = findPrimaryUsage(group);
        // get the accepted usage in case of synonyms
        final NubUsage accepted = primary.status.isSynonym() ? db.parent(primary) : primary;
        final TaxonomicStatus synStatus = primary.status.isSynonym() ? primary.status : TaxonomicStatus.HOMOTYPIC_SYNONYM;
        Set<Node> parents = ImmutableSet.copyOf(db.parents(accepted.node));
        LOG.debug("Consolidating basionym group with {} primary usage {}: {}", primary.status, primary.parsedName.canonicalNameComplete(), names(group));
        int modified = 0;
        for (NubUsage u : group) {
          if (u.equals(primary)) continue;
          if (parents.contains(u.node)) {
            // never convert a parent of the accepted usage into its synonym
            LOG.debug("Exclude parent {} from basionym consolidation of {}", u.parsedName.canonicalNameComplete(), primary.parsedName.canonicalNameComplete());
          } else if (!hasAccepted(u, accepted)) {
            modified++;
            NubUsage previousParent = db.parent(u);
            if (previousParent != null) {
              u.addRemark(String.format("Originally found in sources as %s %s %s", u.status.toString().toLowerCase().replaceAll("_", " "), u.status.isSynonym() ? "of" : "taxon within", previousParent.parsedName.canonicalNameComplete()));
            }
            db.convertToSynonym(u, accepted, synStatus, NameUsageIssue.CONFLICTING_BASIONYM_COMBINATION);
          }
        }
        counterModified = counterModified + modified;
      }
      tx.success();
    } catch (NotFoundException e) {
      LOG.info("Basionym {} was removed. Ignore for consolidation", basId, e);
    }
  }
  LOG.info("Consolidated {} usages from {} basionyms in total", counterModified, counter);
}

/**
 * @return the priority of the usage's source dataset, or maxPriority + 1 for unknown datasets
 */
private int priority(NubUsage usage) {
  return priorities.containsKey(usage.datasetKey) ? priorities.get(usage.datasetKey) : maxPriority + 1;
}

/**
 * From a list of equally trusted usages of a basionym group select one primary usage to trust most.
 * Selection keeps shrinking the candidate list in this order:
 * 1) highest source dataset priority
 * 2) the accepted usage which most usages link to (from a synonym)
 * 3) status synonym > accepted > doubtful
 * 4) highest rank
 */
private NubUsage findPrimaryUsage(List<NubUsage> basionymGroup) {
  if (basionymGroup == null || basionymGroup.isEmpty()) {
    return null;
  }
  // a single usage only
  if (basionymGroup.size() == 1) {
    return basionymGroup.get(0);
  }
  // keep shrinking this list until we get one!
  List<NubUsage> candidates = Lists.newArrayList();
  // 1. by dataset priority
  int highestPriority = Integer.MAX_VALUE;
  for (NubUsage u : basionymGroup) {
    int datasetPriority = priority(u);
    if (datasetPriority < highestPriority) {
      highestPriority = datasetPriority;
    }
  }
  for (NubUsage u : basionymGroup) {
    if (priority(u) == highestPriority) {
      candidates.add(u);
    }
  }
  if (candidates.size() > 1) {
    // 2. accepted with most usages
    Map<NubUsage, NubUsage> u2acc = Maps.newHashMap();
    Map<NubUsage, Integer> accCounts = Maps.newHashMap();
    for (NubUsage u : candidates) {
      final NubUsage accepted = u.status.isSynonym() ? db.parent(u) : u;
      u2acc.put(u, accepted);
      if (!accCounts.containsKey(accepted)) {
        accCounts.put(accepted, 1);
      } else {
        accCounts.put(accepted, accCounts.get(accepted)+1);
      }
    }
    int maxCount = MapUtils.sortByValue(accCounts, Ordering.<Integer>natural().reverse()).values().iterator().next();
    Iterator<NubUsage> iter = candidates.iterator();
    while (iter.hasNext()) {
      NubUsage u = iter.next();
      if (accCounts.get(u2acc.get(u)) != maxCount) {
        iter.remove();
      }
    }
    if (candidates.size() > 1) {
      // 3. by status: syn > acc > doubtful
      Map<NubUsage, Integer> score = Maps.newHashMap();
      for (NubUsage u : candidates) {
        score.put(u, MapUtils.getOrDefault(STATUS_ORDER, u.status, 10));
      }
      int maxScore = MapUtils.sortByValue(score).values().iterator().next();
      iter = candidates.iterator();
      while (iter.hasNext()) {
        NubUsage u = iter.next();
        if (score.get(u) != maxScore) {
          iter.remove();
        }
      }
      if (candidates.size() > 1) {
        // 4. by higher rank
        List<Rank> ranks = Lists.newArrayList();
        for (NubUsage u : candidates) {
          ranks.add(u.rank);
        }
        Collections.sort(ranks);
        Rank minRank = ranks.get(0);
        iter = candidates.iterator();
        while (iter.hasNext()) {
          NubUsage u = iter.next();
          if (minRank != u.rank) {
            iter.remove();
          }
        }
      }
    }
  }
  return candidates.get(0);
}

/**
 * @return true if the given usage u has a SYNONYM_OF relation to the given acc usage
 */
private boolean hasAccepted(NubUsage u, NubUsage acc) {
  if (u.node.equals(acc.node)) return true;
  try (ResourceIterator<Node> iter = Traversals.ACCEPTED.traverse(u.node).nodes().iterator()) {
    while (iter.hasNext()) {
      if (iter.next().equals(acc.node)) {
        return true;
      }
    }
  }
  return false;
}

/**
 * @return the full names of all given usages joined with semicolons, for logging
 */
private String names(Collection<NubUsage> usages) {
  return SEMICOLON_JOIN.join(usages.stream().map(u -> u.parsedName.fullName()).iterator());
}

/**
 * Assigns a unique usageKey to all nodes by matching a usage to the previous backbone to keep stable identifiers.
 */
private void assignUsageKeys() {
  LOG.info("Assigning final clb ids to all nub usages...");
  for (Map.Entry<Long, NubUsage> entry : db.dao().nubUsages()) {
    NubUsage u = entry.getValue();
    if (u.rank != Rank.KINGDOM) {
      u.usageKey = idGen.issue(u.parsedName.canonicalName(), u.parsedName.getAuthorship(), u.parsedName.getYear(), u.rank, u.kingdom);
      db.dao().update(entry.getKey(), u);
    }
  }
  // for pro parte synonyms we need to assign extra keys, one per relation!
  // http://dev.gbif.org/issues/browse/POR-2872
  try (Transaction tx = db.beginTx()) {
    try (ResourceIterator<Relationship> rels = db.dao().listAllRelationships(RelType.PROPARTE_SYNONYM_OF)) {
      while (rels.hasNext()) {
        Relationship rel = rels.next();
        NubUsage u = db.dao().readNub(rel.getStartNode());
        NubUsage acc = db.dao().readNub(rel.getEndNode());
        if (acc.usageKey <= 0) {
          LOG.warn("No usage key assigned to {}", acc);
        }
        int ppKey = idGen.issue(u.parsedName.canonicalName(), u.parsedName.getAuthorship(), u.parsedName.getYear(), u.rank, u.kingdom, acc.usageKey);
        LOG.debug("Assign id {} for pro parte relation of primary usage {} {}", ppKey, u.usageKey, u.parsedName.canonicalNameComplete());
        rel.setProperty(NeoProperties.USAGE_KEY, ppKey);
      }
    }
    tx.success();
  }
}

/**
 * Walks all accepted taxa and builds the usage metrics for each.
 */
private void builtUsageMetrics() {
  LOG.info("Walk all accepted taxa and build usage metrics");
  UsageMetricsHandler metricsHandler = new UsageMetricsHandler(db.dao());
  // TaxonWalker deals with transactions
  TreeWalker.walkAcceptedTree(db.dao().getNeo(), metricsHandler);
  NormalizerStats normalizerStats = metricsHandler.getStats(0, null);
  LOG.info("Walked all taxa (root={}, total={}, synonyms={}) and built usage metrics", normalizerStats.getRoots(), normalizerStats.getCount(), normalizerStats.getSynonyms());
}
}