package org.gbif.checklistbank.neo.traverse;
import org.gbif.api.model.checklistbank.NameUsage;
import org.gbif.api.model.checklistbank.NameUsageMetrics;
import org.gbif.api.util.ClassificationUtils;
import org.gbif.api.vocabulary.Origin;
import org.gbif.api.vocabulary.Rank;
import org.gbif.checklistbank.cli.normalizer.NormalizerStats;
import org.gbif.checklistbank.neo.UsageDao;
import org.gbif.checklistbank.model.Classification;
import org.gbif.checklistbank.cli.model.UsageFacts;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.base.Preconditions;
import org.neo4j.graphdb.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Builds higher classification keys (not the verbatim names) and NameUsageMetrics for all accepted usages.
* Synonym usages do not need a data record as all of their values would be zero.
* The handler works on taxonomic neo relations and the NameUsage instances in the kvp store, so make sure they exist!
*/
public class UsageMetricsHandler implements StartEndHandler {
  private static final Logger LOG = LoggerFactory.getLogger(UsageMetricsHandler.class);
  // compile-time switch for verbose per-node tracing in start() and end()
  private static final boolean DEBUG = false;

  // higher classification of the node currently being visited; keys are neo node ids
  private final Classification classification = new Classification();
  // one metrics instance per node that has been started but not yet ended;
  // new entries are appended, so the last entry belongs to the most recently started node
  private final LinkedList<NameUsageMetrics> parentCounts = Lists.newLinkedList();
  // number of nodes passed to start() so far (synonyms are not included)
  private int counter;
  // number of nodes that were started at depth 1
  private int roots;
  // largest depth seen so far
  private int maxDepth;
  // number of nodes currently started but not yet ended
  private int depth;
  // total number of synonyms seen by processSynonyms()
  private int synonyms;
  private final Map<Origin, Integer> countByOrigin = Maps.newHashMap();
  private final Map<Rank, Integer> countByRank = Maps.newHashMap();
  private final UsageDao dao;

  /**
   * @param dao used to read the name usage of each visited node and to store the resulting usage facts
   */
  public UsageMetricsHandler(UsageDao dao) {
    this.dao = dao;
  }

  /**
   * Registers a node when the traversal enters it: updates the global counters, puts the node into the
   * current classification if its rank is linnean, bumps the rank and children counts of the open
   * ancestors and finally opens a fresh metrics instance for the node itself.
   *
   * @param n the node being entered
   * @throws NullPointerException if no name usage can be read for the node
   * @throws IllegalArgumentException if the node has a linnean rank and its id does not fit into an int key
   */
  @Override
  public void start(Node n) {
    NameUsage u = dao.readUsage(n, false);
    Preconditions.checkNotNull(u, "node " + n.getId() + " with missing name usage found");
    // increase counters
    count(u);
    counter++;
    depth++;
    if (depth > maxDepth) {
      maxDepth = depth;
    }
    if (depth == 1) {
      roots++;
    }

    Rank rank = u.getRank();
    boolean linnean = rank != null && rank.isLinnean();
    if (linnean) {
      ClassificationUtils.setHigherRankKey(classification, rank, toIntKey(n.getId()));
      ClassificationUtils.setHigherRank(classification, rank, u.getCanonicalOrScientificName());
    }
    // for linnean ranks below kingdom increase the matching rank counter of every open ancestor
    // NOTE(review): setNumByRank also handles SUBGENUS, which is only ever counted here if
    // Rank.isLinnean() is true for it - confirm against the Rank vocabulary
    if (linnean && rank != Rank.KINGDOM) {
      for (NameUsageMetrics ancestor : parentCounts) {
        setNumByRank(ancestor, rank, ancestor.getNumByRank(rank) + 1);
      }
    }
    // increase direct parents children counter by one
    if (!parentCounts.isEmpty()) {
      NameUsageMetrics parent = parentCounts.getLast();
      parent.setNumChildren(parent.getNumChildren() + 1);
    }
    // add new data to list of parent data
    NameUsageMetrics m = new NameUsageMetrics();
    // keep current total counter state so we can calculate the difference for the num descendants when coming up again
    m.setNumDescendants(counter);
    parentCounts.add(m);
    if (DEBUG) {
      LOG.info("start: {} {} {} # {}-0-{}", u.getTaxonID(), u.getRank(), u.getScientificName(), counter, parentCounts.size());
    }
  }

  /**
   * Finishes a node when the traversal leaves it: finalizes its metrics (synonym and descendant counts),
   * stores them together with the current classification and then removes the node's own rank from the
   * classification again.
   *
   * @param n the node being left
   * @throws NullPointerException if no name usage can be read for the node or one of its synonyms
   */
  @Override
  public void end(Node n) {
    depth--;
    NameUsage u = dao.readUsage(n, false);
    Preconditions.checkNotNull(u, "node " + n.getId() + " with missing name usage found");
    // final data update
    NameUsageMetrics metrics = parentCounts.removeLast();
    metrics.setNumSynonyms(processSynonyms(n));
    // the metrics hold the counter value from start(); the difference is the number of nodes started since
    metrics.setNumDescendants(counter - metrics.getNumDescendants());
    // persist data and classification with nub key
    // NOTE(review): the shared, mutable classification instance is handed over by reference; this is only
    // correct if dao.store() serializes or copies it before it gets modified below - confirm
    UsageFacts facts = new UsageFacts();
    facts.metrics = metrics;
    facts.classification = classification;
    dao.store(n.getId(), facts);
    // remove this rank from current classification
    Rank rank = u.getRank();
    if (rank != null && rank.isLinnean()) {
      ClassificationUtils.setHigherRankKey(classification, rank, null);
      ClassificationUtils.setHigherRank(classification, rank, null);
    }
    if (DEBUG) {
      LOG.info("end: {} {} {} # {}-{}-{}", u.getTaxonID(), u.getRank(), u.getScientificName(), metrics.getNumDescendants(), metrics.getNumSynonyms(), parentCounts.size());
    }
  }

  /**
   * Bundles the statistics gathered so far. The origin and rank count maps are passed to the
   * NormalizerStats constructor directly, without copying them here.
   *
   * @param ignored number of ignored records, passed through unchanged
   * @param cycles  detected cycles, passed through unchanged
   * @return the stats built from the root count, maximum depth, synonym count and the given arguments
   */
  public NormalizerStats getStats(int ignored, List<String> cycles) {
    return new NormalizerStats(roots, maxDepth, synonyms, ignored, countByOrigin, countByRank, cycles);
  }

  /**
   * Visits all synonyms of the given node, adding each of them to the per origin and per rank totals
   * and to the overall synonym count.
   *
   * @param n the node whose synonyms should be processed
   * @return the number of synonyms found for the given node
   * @throws NullPointerException if no name usage can be read for a synonym node
   */
  private int processSynonyms(Node n) {
    int synCounter = 0;
    for (Node syn : Traversals.SYNONYMS.traverse(n).nodes()) {
      NameUsage u = dao.readUsage(syn, false);
      // fail with a telling message, just like start() does, instead of a bare NPE inside count()
      Preconditions.checkNotNull(u, "synonym node " + syn.getId() + " with missing name usage found");
      synCounter++;
      count(u);
    }
    synonyms += synCounter;
    return synCounter;
  }

  /**
   * Sets the counter matching the given rank on the metrics instance.
   * Ranks without a dedicated counter below are silently ignored.
   *
   * @param u     the metrics to update
   * @param rank  the rank selecting the counter, must not be null
   * @param count the new counter value
   */
  private static void setNumByRank(NameUsageMetrics u, Rank rank, int count) {
    switch (rank) {
      case PHYLUM:
        u.setNumPhylum(count);
        break;
      case CLASS:
        u.setNumClass(count);
        break;
      case ORDER:
        u.setNumOrder(count);
        break;
      case FAMILY:
        u.setNumFamily(count);
        break;
      case GENUS:
        u.setNumGenus(count);
        break;
      case SUBGENUS:
        u.setNumSubgenus(count);
        break;
      case SPECIES:
        u.setNumSpecies(count);
        break;
      default:
        // no counter for any other rank
        break;
    }
  }

  /**
   * Adds the usage to the per origin and per rank totals. A missing origin or rank is not counted.
   */
  private void count(NameUsage u) {
    if (u.getOrigin() != null) {
      increment(countByOrigin, u.getOrigin());
    }
    if (u.getRank() != null) {
      increment(countByRank, u.getRank());
    }
  }

  /**
   * Increases the count stored under the given key by one, starting at 1 for unseen keys.
   */
  private static <K> void increment(Map<K, Integer> counts, K key) {
    Integer current = counts.get(key);
    counts.put(key, current == null ? 1 : current + 1);
  }

  /**
   * Converts a neo node id into an int classification key, refusing ids that a plain cast would truncate.
   *
   * @throws IllegalArgumentException if the id is outside of the int range
   */
  private static int toIntKey(long nodeId) {
    Preconditions.checkArgument((int) nodeId == nodeId,
        "node id %s does not fit into an int classification key", nodeId);
    return (int) nodeId;
  }
}