package org.gbif.nub.lookup.similarity; import org.gbif.checklistbank.utils.SciNameNormalizer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Apply normalizations to scientific names before scoring them for similarity * using edit distance applied to each epithet. */ public class ScientificNameSimilarity implements StringSimilarity { private static final Logger LOG = LoggerFactory.getLogger(ScientificNameSimilarity.class); private static final int MUST_MATCH = 4; ModifiedDamerauLevenshtein mdl1 = new ModifiedDamerauLevenshtein(1); ModifiedDamerauLevenshtein mdl3 = new ModifiedDamerauLevenshtein(3); @Override public double getSimilarity(String x1, String x2) { if (x1.equals(x2)) return 100d; LOG.debug("‘{}’\twas previously {}% like ‘{}’", x1, mdl3.getSimilarity(x1, x2), x2); x1 = SciNameNormalizer.normalize(x1); x2 = SciNameNormalizer.normalize(x2); String[] x1s = x1.split(" "); String[] x2s = x2.split(" "); // Compare the whole name if they don't have the same number of tokens. if (x1s.length != x2s.length) { double sim = mdl3.getSimilarity(x1, x2); LOG.debug("‘{}’\tis {}% like ‘{}’ (but lengths differ)\n", x1, sim, x2); return sim; } boolean bad = false; double overallSim = 0; int i; for (i = 0; i < x1s.length; i++) { double sim = similarity(x1s[i], x2s[i]); // The score of the first epithet (e.g. genus) is scaled down, 100→100, 50→0, <50→0 if (i == 0) { sim = Math.max(0, (2*sim - 100)); } overallSim += sim; if (sim == 0) bad = true; } overallSim = overallSim/i; // Any epithet that doesn't match enough makes the whole match bad. if (bad && overallSim > 5) { overallSim = 5; } LOG.debug("‘{}’\tis {}% like ‘{}’\n", x1, overallSim, x2); return overallSim; } private double similarity(String x1, String x2) { // First letter much match if (x1.charAt(0) != x2.charAt(0)) { LOG.debug("\t‘{}’\tis not at all like ‘{}’ (‘{}’≠‘{}’)", x1, x2, x1.charAt(0), x2.charAt(0)); return 0; } // Very short epithets must match exactly if (x1.length() < MUST_MATCH || x2.length() < MUST_MATCH) { if (x1.equals(x2)) { LOG.debug("\t‘{}’\tis short and exactly like ‘{}’", x1, x2); return 100; } else { LOG.debug("\t‘{}’\tis short and nothing like ‘{}’", x1, x2); return 0; } } // Longer ones can have one change in the first MUST_MATCH letters // TODO: Consider whether the first letters must match exactly. String x1head = x1.substring(0, MUST_MATCH); String x2head = x2.substring(0, MUST_MATCH); int dist; if ((dist = mdl1.getEditDistance(x1head, x2head)) > 1) { LOG.debug("\t‘{}’\tis nothing like ‘{}’ (‘{}’≠‘{}’, dist={})", x1, x2, x1head, x2head, dist); return 0; } // And up to two changes in the whole epithet // TODO: Use Markus’ distance utility thing to take account of length. dist = mdl1.getEditDistance(x1, x2); double r = (dist == 0 ? 100 : (dist == 1 ? 90 : (dist <= 2 ? 80 : 0))); LOG.debug("\t‘{}’\tis {}% like ‘{}’", x1, r, x2); return r; } // // // }