package org.gbif.checklistbank.nub.source;

import org.gbif.api.service.checklistbank.NameParser;
import org.gbif.api.vocabulary.NomenclaturalStatus;
import org.gbif.api.vocabulary.Rank;
import org.gbif.api.vocabulary.TaxonomicStatus;
import org.gbif.checklistbank.cli.common.NeoConfiguration;
import org.gbif.checklistbank.iterable.CloseableIterable;
import org.gbif.checklistbank.iterable.CloseableIterator;
import org.gbif.checklistbank.neo.Labels;
import org.gbif.checklistbank.neo.NeoProperties;
import org.gbif.checklistbank.neo.RelType;
import org.gbif.checklistbank.neo.UsageDao;
import org.gbif.checklistbank.neo.traverse.TreeIterables;
import org.gbif.checklistbank.nub.NubBuilder;
import org.gbif.checklistbank.nub.model.SrcUsage;
import org.gbif.checklistbank.postgres.TabMapperBase;
import org.gbif.nameparser.GBIFNameParser;

import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.UUID;
import java.util.concurrent.TimeUnit;

import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.base.Strings;
import com.google.common.io.Files;
import it.unimi.dsi.fastutil.ints.Int2IntMap;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.graphdb.Transaction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A backbone source dataset with some basic metadata that allows iterating over its source usages.
 * An intermediate neo4j db is created by reading from a postgres checklistbank db using the native postgres jdbc copy manager.
 * The init() method connects to a CLB instance and copies all the minimal information needed to build a
 * taxonomic tree into an embedded, persistent neo db. No extension data is copied, just core taxonomic information.
 * This abstract class reads a tab delimited text stream expected with the following columns:
 * <ul>
 * <li>usageKey</li>
 * <li>parentKey</li>
 * <li>basionymKey</li>
 * <li>rank (enum)</li>
 * <li>taxonomicStatus (enum)</li>
 * <li>nomenclaturalStatus (enum[])</li>
 * <li>scientificName</li>
 * <li>namePublishedIn</li>
 * </ul>
 * Implement the abstract initNeo method to supply such a tab delimited stream to the NeoUsageWriter instance.
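 * <p>
 * For illustration only, a minimal in-memory subclass could feed hand-written rows to the writer.
 * This is a sketch, not part of the codebase: it assumes TabMapperBase exposes the java.io.Writer
 * write methods (as its use with the postgres jdbc copy manager suggests), that the subclass lives
 * in this package so it can override the package private initNeo, and the two rows are made up:
 * <pre>{@code
 * class InlineSource extends NubSource {
 *   InlineSource() {
 *     super(UUID.randomUUID(), "inline test source", true);
 *   }
 *
 *   void initNeo(NeoUsageWriter writer) throws Exception {
 *     // usageKey, parentKey, basionymKey, rank, taxonomicStatus, nomenclaturalStatus, scientificName, namePublishedIn
 *     writer.write("1\t\t\tKINGDOM\tACCEPTED\t\tPlantae\t\n");
 *     writer.write("2\t1\t\tFAMILY\tACCEPTED\t\tAsteraceae\t\n");
 *   }
 * }
 *
 * NubSource src = new InlineSource();
 * src.init(false, false, true, false);
 * try (CloseableIterator<SrcUsage> iter = src.iterator()) {
 *   while (iter.hasNext()) {
 *     SrcUsage u = iter.next();
 *     // u.parsedName is populated because parseNames=true
 *   }
 * }
 * src.close();
 * }</pre>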
 */
public abstract class NubSource implements CloseableIterable<SrcUsage> {
  private static final Logger LOG = LoggerFactory.getLogger(NubSource.class);
  private static final NeoConfiguration cfg = new NeoConfiguration();

  private final Stopwatch watch = Stopwatch.createUnstarted();

  static {
    cfg.neoRepository = Files.createTempDir();
    cfg.mappedMemory = 256;
  }

  public UUID key;
  public String name;
  public Rank ignoreRanksAbove = Rank.FAMILY;
  public Date created;
  public boolean nomenclator = false;
  public boolean ignoreSynonyms = false;

  private UsageDao dao;
  private final boolean useTmpDao;

  /**
   * @param useTmpDao if true uses a temporary DAO that is not closed at the end of init.
   *                  If too many sources are created this can result in a large number of open files!
   *                  Do not use this for production.
   */
  public NubSource(UUID key, String name, boolean useTmpDao) {
    this.key = key;
    this.name = name;
    this.useTmpDao = useTmpDao;
  }

  /**
   * Loads data into the source and does any other initialization needed before the usages can be iterated over.
   * Make sure to call this method once before the usage iterator is used!
   *
   * @param writeNeoProperties if true the scientific name and rank will also be added to the neo node properties
   * @param nubRanksOnly       if true skip usages with non nub ranks
   * @param parseNames         if true parse names and populate SrcUsage.parsedName, which will be null otherwise!
   * @param ignoreSynonyms     if true skip all synonym usages
   */
  public void init(boolean writeNeoProperties, boolean nubRanksOnly, boolean parseNames, boolean ignoreSynonyms) throws Exception {
    // load data into neo4j
    LOG.debug("Start loading source data from {} into neo", name);
    watch.reset().start();
    UsageDao initDao;
    if (useTmpDao) {
      initDao = UsageDao.temporaryDao(128);
      // reuse the dao for reading
      dao = initDao;
    } else {
      initDao = open(false, true);
    }
    try (NeoUsageWriter writer = new NeoUsageWriter(initDao, writeNeoProperties, nubRanksOnly, parseNames, ignoreSynonyms)) {
      initNeo(writer);
      LOG.info("Loaded nub source data {} with {} usages into neo4j in {}ms, skipping {}",
          name, writer.getCounter(), watch.elapsed(TimeUnit.MILLISECONDS), writer.getSkipped());
    }
  }

  public void setNeoRepository(File repository) {
    Preconditions.checkArgument(repository.isDirectory());
    cfg.neoRepository = repository;
  }

  abstract void initNeo(NeoUsageWriter writer) throws Exception;

  public class NeoUsageWriter extends TabMapperBase {
    private int counter = 0;
    private int skipped = 0;
    private Transaction tx;
    private Int2IntMap ids = new Int2IntOpenHashMap();
    private Int2ObjectMap<Integer> nonNubRankUsages = new Int2ObjectOpenHashMap<>();
    private final UsageDao dao;
    private final boolean writeNeoProperties;
    private final boolean nubRanksOnly;
    private final boolean parseNames;
    private final boolean ignoreSynonyms;
    private final NameParser parser;

    /**
     * @param writeNeoProperties if true the scientific name and rank will also be added to the neo node properties
     * @param nubRanksOnly       if true skip usages with non nub ranks
     * @param parseNames         if true parse names and populate SrcUsage.parsedName, which will be null otherwise!
     * @param ignoreSynonyms     if true skip all synonym usages
     */
    public NeoUsageWriter(UsageDao dao, boolean writeNeoProperties, boolean nubRanksOnly, boolean parseNames, boolean ignoreSynonyms) {
      // the number of columns in our query to consume
      super(8);
      this.dao = dao;
      this.writeNeoProperties = writeNeoProperties;
      this.nubRanksOnly = nubRanksOnly;
      this.parseNames = parseNames;
      // keep the flag locally so the writer does not depend on the outer instance field being set
      this.ignoreSynonyms = ignoreSynonyms;
      // we only need a parser in case we need to write neo properties or parse names
      parser = writeNeoProperties || parseNames ? new GBIFNameParser() : null;
      tx = dao.beginTx();
    }
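    /**
     * Maps one tab delimited row, already split into its 8 columns, onto a SrcUsage instance
     * and persists it as a neo4j node with parent, synonym and basionym relations.
     * As a made-up example, the row {@code 2\t1\t\tFAMILY\tACCEPTED\t\tAsteraceae\t} creates
     * usage 2 as an accepted family named Asteraceae with a PARENT_OF relation from usage 1,
     * leaving basionym, nomenclatural status and publishedIn empty.
     */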
    @Override
    protected void addRow(String[] row) {
      SrcUsage u = new SrcUsage();
      u.key = toInt(row[0]);
      u.parentKey = toInt(row[1]);
      u.originalNameKey = toInt(row[2]);
      u.rank = row[3] == null ? null : Rank.valueOf(row[3]);
      u.status = row[4] == null ? null : TaxonomicStatus.valueOf(row[4]);
      u.nomStatus = toNomStatus(row[5]);
      u.scientificName = row[6];
      u.publishedIn = row[7];
      if (parseNames) {
        u.parsedName = parser.parseQuietly(u.scientificName, u.rank);
      }
      if (ignoreSynonyms && u.status != null && u.status.isSynonym()) {
        skipped++;
        return;
      }
      if (nubRanksOnly) {
        if (u.rank == null || !NubBuilder.NUB_RANKS.contains(u.rank)) {
          // do not persist a node, just keep the id mapped to the next higher parent with a nub rank
          nonNubRankUsages.put(u.key, u.parentKey);
          // we might have created a node already, delete it if there is one
          if (ids.containsKey(u.key)) {
            Node n = dao.getNeo().getNodeById(ids.get(u.key));
            // delete all relations and relink the parent rel to the next nub rank
            while (u.parentKey != null && nonNubRankUsages.containsKey(u.parentKey)) {
              u.parentKey = nonNubRankUsages.get(u.parentKey);
            }
            Node nubParent = null;
            if (u.parentKey != null) {
              nubParent = getOrCreate(u.parentKey);
            }
            for (Relationship rel : n.getRelationships()) {
              if (rel.isType(RelType.PARENT_OF)) {
                if (nubParent != null) {
                  Node child = rel.getOtherNode(n);
                  nubParent.createRelationshipTo(child, RelType.PARENT_OF);
                }
              }
              rel.delete();
            }
            n.delete();
          }
          skipped++;
          return;
        } else {
          // make sure the parent and basionym are nub ranks
          while (u.parentKey != null && nonNubRankUsages.containsKey(u.parentKey)) {
            u.parentKey = nonNubRankUsages.get(u.parentKey);
          }
          if (u.originalNameKey != null && nonNubRankUsages.containsKey(u.originalNameKey)) {
            u.originalNameKey = null;
          }
        }
      }
      counter++;
      Node n = getOrCreate(u.key);
      dao.storeSourceUsage(n, u);
      // also add neo properties?
      if (writeNeoProperties) {
        n.setProperty(NeoProperties.SCIENTIFIC_NAME, u.scientificName);
        String canonical = parser.parseToCanonical(u.scientificName, u.rank);
        if (canonical != null) {
          n.setProperty(NeoProperties.CANONICAL_NAME, canonical);
        }
        if (u.rank != null) {
          n.setProperty(NeoProperties.RANK, u.rank.ordinal());
        }
      }
      // root?
      if (u.parentKey == null) {
        n.addLabel(Labels.ROOT);
      } else {
        int pid = u.parentKey;
        Node p = getOrCreate(pid);
        // guard against null status, which the synonym skipping above shows is possible
        if (u.status != null && u.status.isSynonym()) {
          n.createRelationshipTo(p, RelType.SYNONYM_OF);
          n.addLabel(Labels.SYNONYM);
        } else {
          p.createRelationshipTo(n, RelType.PARENT_OF);
        }
      }
      // establish a basionym relation?
      if (u.originalNameKey != null) {
        Node o = getOrCreate(u.originalNameKey);
        o.createRelationshipTo(n, RelType.BASIONYM_OF);
        o.addLabel(Labels.BASIONYM);
      }
      if (counter % 10000 == 0) {
        renewTx();
      }
    }

    private Node getOrCreate(int key) {
      if (ids.containsKey(key)) {
        return dao.getNeo().getNodeById(ids.get(key));
      } else {
        Node n = dao.createTaxon();
        ids.put(key, (int) n.getId());
        return n;
      }
    }
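    /**
     * Converts the raw nomenclatural status column into an array of enum values.
     * Currently not implemented and always returns null.
     * The column presumably arrives as a postgres array literal such as {@code {DOUBTFUL,ILLEGITIMATE}};
     * assuming the literal holds the NomenclaturalStatus constant names, one possible approach
     * (a sketch only, untested against real column values) would be:
     * <pre>{@code
     * if (Strings.isNullOrEmpty(x)) {
     *   return null;
     * }
     * return Arrays.stream(x.replaceAll("[{}]", "").split(","))
     *     .map(String::trim)
     *     .filter(s -> !s.isEmpty())
     *     .map(NomenclaturalStatus::valueOf)
     *     .toArray(NomenclaturalStatus[]::new);
     * }</pre>
     */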
    protected NomenclaturalStatus[] toNomStatus(String x) {
      // TODO: implement {NOM, NOM} parsing
      return null;
    }

    private Integer toInt(String x) {
      return Strings.isNullOrEmpty(x) ? null : Integer.valueOf(x);
    }

    @Override
    public void close() throws IOException {
      tx.success();
      tx.close();
      if (!useTmpDao) {
        dao.close();
      }
    }

    private void renewTx() {
      tx.success();
      tx.close();
      tx = dao.beginTx();
    }

    public int getCounter() {
      return counter;
    }

    public int getSkipped() {
      return skipped;
    }
  }

  public class SrcUsageIterator implements CloseableIterator<SrcUsage> {
    private final Transaction tx;
    private final Iterator<Node> nodes;

    public SrcUsageIterator(UsageDao dao) {
      tx = dao.beginTx();
      this.nodes = TreeIterables.allNodes(dao.getNeo(), null, null, true).iterator();
    }

    @Override
    public boolean hasNext() {
      return nodes.hasNext();
    }

    @Override
    public SrcUsage next() {
      return dao.readSourceUsage(nodes.next());
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException("Not implemented");
    }

    @Override
    public void close() {
      tx.success();
      tx.close();
    }
  }

  /**
   * Returns a neo db backed iterator over all usages.
   * The iteration is in taxonomic order, starting with the highest root taxa and walking
   * the taxonomic tree depth first, including synonyms.
   */
  @Override
  public CloseableIterator<SrcUsage> iterator() {
    if (dao == null) {
      dao = open(true, false);
    }
    return new SrcUsageIterator(dao);
  }

  public UsageDao getDao() {
    return dao;
  }

  /**
   * @param readOnly      if true open the neo db in read only mode
   * @param eraseExisting if true erase any previously existing neo db files
   * @return a new dao for this source's neo repository
   */
  public UsageDao open(boolean readOnly, boolean eraseExisting) {
    if (useTmpDao) {
      throw new IllegalStateException("Temporary DAOs cannot be opened again");
    }
    // use a local stopwatch so timings taken in init() are not reset
    Stopwatch w = Stopwatch.createStarted();
    UsageDao d = UsageDao.persistentDao(cfg, key, readOnly, null, eraseExisting);
    LOG.debug("Opened DAO in {}ms for dataset {}", w.elapsed(TimeUnit.MILLISECONDS), key);
    return d;
  }

  /**
   * Closes the dao and deletes all intermediate persistence files.
   */
  @Override
  public void close() {
    if (dao != null) {
      dao.closeAndDelete();
    }
  }
}