package org.gbif.checklistbank.utils;

import java.util.regex.Pattern;

import static org.gbif.utils.text.StringUtils.foldToAscii;

/**
 * A scientific name normalizer that replaces common misspellings and epithet gender changes.
 *
 * <p>All methods are static and operate on plain strings. Blank or {@code null} input to the
 * normalize methods yields an empty string.
 */
public class SciNameNormalizer {

  /** Terminal "on", "um", "us" or "a" of the string; replaced by "a" when stemming. */
  private static final Pattern SUFFIX_A = Pattern.compile("(?:on|um|us|a)$");
  /** Terminal "ei" of the string; replaced by "i" when stemming. */
  private static final Pattern SUFFIX_I = Pattern.compile("ei$");
  /** Runs of j, y or i that do not start at a word boundary; collapsed to a single "i". */
  private static final Pattern I_VARIANTS = Pattern.compile("(?<!\\b)[jyi]+");
  /** A "t" or "r" followed by "h" (case-insensitive); the "h" is dropped. */
  private static final Pattern TRH = Pattern.compile("([tr])h", Pattern.CASE_INSENSITIVE);
  /**
   * Any run of whitespace. Matching single characters too (not only runs of two or more) makes
   * sure a lone tab or newline becomes a plain space, which the genus/epithet split relies on.
   */
  private static final Pattern WHITESPACE = Pattern.compile("\\s+");
  /** Apostrophes, underscores and hyphens; removed entirely. */
  private static final Pattern REMOVED_CHARS = Pattern.compile("['_-]");
  /** A letter immediately repeated one or more times; reduced to a single occurrence. */
  private static final Pattern REPEATED_LETTER = Pattern.compile("(\\p{L})\\1+");
  /** A leading hybrid marker (×, x or X) directly before an upper case ASCII letter. */
  private static final Pattern HYBRID_SIGN_GENUS = Pattern.compile("^\\s*[×xX]\\s*([A-Z])");
  /**
   * A hybrid marker at the start or after whitespace that precedes a character other than an
   * upper case ASCII letter: either "×" with optional whitespace, or x/X with mandatory whitespace.
   */
  private static final Pattern HYBRID_SIGN_EPITHET = Pattern.compile("(?:^|\\s)(?:×\\s*|[xX]\\s+)([^A-Z])");

  // Don't use Guava or Commons here so we don't have to bundle them for the Solr cloud plugin.

  /**
   * Tells whether a string has any non-whitespace content.
   *
   * @param s the string to test, may be null
   * @return true if {@code s} is not null and not empty after trimming
   */
  public static boolean hasContent(String s) {
    return s != null && !(s.trim().isEmpty());
  }

  /**
   * Converts null into the empty string.
   *
   * @param s the string, may be null
   * @return {@code s} itself, or "" if {@code s} is null
   */
  public static String nullToEmpty(String s) {
    return (s == null) ? "" : s;
  }

  /**
   * Normalizes an entire scientific name, keeping monomials or the first genus part rather unchanged,
   * applying the more drastic normalization incl stemming to the remainder of the name only.
   *
   * @param s the name to normalize, may be null
   * @return the normalized name, or "" for null or blank input
   */
  public static String normalize(String s) {
    return normalize(s, false, true);
  }

  /**
   * Normalizes an entire scientific name, keeping monomials or the first genus part rather unchanged,
   * applying the more drastic normalization to the remainder of the name only.
   *
   * @param s the name to normalize, may be null
   * @param stemming whether the epithet suffix stemming should be applied
   * @return the normalized name, or "" for null or blank input
   */
  public static String normalize(String s, boolean stemming) {
    return normalize(s, false, stemming);
  }

  /**
   * Normalizes an entire name string including monomials and genus parts of a name,
   * with stemming enabled.
   *
   * @param s the name to normalize, may be null
   * @return the normalized name, or "" for null or blank input
   */
  public static String normalizeAll(String s) {
    return normalize(s, true, true);
  }

  /**
   * Normalizes an entire name string including monomials and genus parts of a name.
   *
   * @param s the name to normalize, may be null
   * @param stemming whether the epithet suffix stemming should be applied
   * @return the normalized name, or "" for null or blank input
   */
  public static String normalizeAll(String s, boolean stemming) {
    return normalize(s, true, stemming);
  }

  /**
   * Shared implementation behind the public normalize methods.
   *
   * @param s the name to normalize, may be null
   * @param normMonomials if true the strong normalization is applied to the whole string,
   *                      otherwise only to the part after the first space
   * @param stemming whether the epithet suffix stemming should be applied
   * @return the normalized, trimmed name, or "" for null or blank input
   */
  private static String normalize(String s, boolean normMonomials, boolean stemming) {
    if (!hasContent(s)) {
      return "";
    }

    s = s.trim();

    // Remove a hybrid cross, or a likely hybrid cross.
    s = HYBRID_SIGN_GENUS.matcher(s).replaceAll("$1");
    s = HYBRID_SIGN_EPITHET.matcher(s).replaceAll(" $1");

    // Normalize letters and ligatures to their ASCII equivalent
    s = foldToAscii(s);

    // Drop apostrophes, underscores and hyphens, then turn every whitespace run into one space
    s = REMOVED_CHARS.matcher(s).replaceAll("");
    s = WHITESPACE.matcher(s).replaceAll(" ");

    // Only for bi/trinomials, otherwise we mix up ranks.
    if (normMonomials) {
      s = normStrongly(s, stemming);

    } else if (s.indexOf(' ') > 2) {
      // The first part must be at least 3 characters long; it is kept as it is
      // and only the remainder is normalized strongly.
      String[] parts = s.split(" ", 2);
      s = parts[0] + " " + normStrongly(parts[1], stemming);
    }

    return s.trim();
  }

  /**
   * Applies the drastic normalization steps in a fixed order: repeated letter removal,
   * optional suffix stemming, i/j/y unification, optional "ei" ending reduction and
   * finally dropping an "h" that follows "t" or "r".
   *
   * @param s the string to normalize
   * @param stemming whether the suffix based stemming steps should be applied
   * @return the strongly normalized string
   */
  private static String normStrongly(String s, boolean stemming) {
    // remove repeated letters→leters in binomials
    s = REPEATED_LETTER.matcher(s).replaceAll("$1");

    if (stemming) {
      s = stemEpithet(s);
    }
    // normalize frequent variations of i
    s = I_VARIANTS.matcher(s).replaceAll("i");
    if (stemming) {
      s = SUFFIX_I.matcher(s).replaceAll("i");
    }
    // normalize frequent variations of t/r sometimes followed by an 'h'
    return TRH.matcher(s).replaceAll("$1");
  }

  /**
   * Does a stemming of a latin epithet and returns the female version ending with 'a'.
   * Only a terminal "on", "um", "us" or "a" at the end of the given string is replaced.
   *
   * @param epithet the epithet to stem, may be null
   * @return the stemmed epithet, or "" for null or blank input
   */
  public static String stemEpithet(String epithet) {
    if (!hasContent(epithet)) {
      return "";
    }
    return SUFFIX_A.matcher(epithet).replaceFirst("a");
  }
}