package org.gbif.checklistbank.authorship;
import org.gbif.api.model.checklistbank.ParsedName;
import org.gbif.checklistbank.model.Equality;
import org.gbif.utils.ObjectUtils;
import org.gbif.utils.file.FileUtils;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Resources;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Utility to compare scientific name authorships, i.e. the recombination and basionym author and publishing year.
 * Author strings are normalized (see {@link #normalize(String)}) and then compared. As authors are often abbreviated
 * in all kinds of ways, a shared common prefix of sufficient length is accepted as a positive equality.
 * If any of the names given has an empty author and year the result will always be Equality.UNKNOWN.
 *
 * The class exposes two kinds of compare methods: a strict one always requiring both year and author to match,
 * and a more lax default comparison that only looks at years when the authors differ
 * (as it is quite hard to compare authors).
 */
public class AuthorComparator {
  private static final Logger LOG = LoggerFactory.getLogger(AuthorComparator.class);

  // The escaped form "&amp;" has to be listed before the bare "&": alternation is ordered, so otherwise only the
  // ampersand would be consumed and "amp;" would be left behind in the author string.
  private static final Pattern AND = Pattern.compile("( et | and |&amp;|&)", Pattern.CASE_INSENSITIVE);
  // everything from " in " to the end of the string (the publication an author published "in")
  private static final Pattern IN = Pattern.compile(" in .+$", Pattern.CASE_INSENSITIVE);
  // everything from the start of the string up to and including the last " ex "
  private static final Pattern EX = Pattern.compile("^.+ ex ", Pattern.CASE_INSENSITIVE);
  // a capitalized name followed by "f" or "fil" with optional dots, e.g. "Hook.f." or "Hook fil."
  private static final Pattern FIL = Pattern.compile("([A-Z][a-z]*)[\\. ]\\s*f(?:il)?\\.?\\b");
  // ae, ue, oe are reduced to their first letter
  private static final Pattern TRANSLITERATIONS = Pattern.compile("([auo])e", Pattern.CASE_INSENSITIVE);
  // the last lower case word of a normalized author, ignoring a trailing " filius"
  private static final Pattern SURNAME = Pattern.compile("([a-z]+)(?: filius)?$");
  // one or more leading single letters, each followed by a whitespace character
  private static final Pattern FIRST_INITIALS = Pattern.compile("^([a-z]\\s)+");
  private static final String AUTHOR_MAP_FILENAME = "/authorship/authormap.txt";
  // any run of punctuation characters except for the comma
  private static final Pattern PUNCTUATION = Pattern.compile("[\\p{Punct}&&[^,]]+");
  private static final Pattern COMMA = Pattern.compile("\\s*,\\s*");
  private static final Splitter AUTHOR_SPLITTER = Splitter.on(",").omitEmptyStrings();
  // authors shorter than this are expanded via the author map before the first comparison round
  private static final int MIN_AUTHOR_LENGTH_WITHOUT_LOOKUP = 4;

  // normalized abbreviation -> normalized replacement
  private final Map<String, String> authorMap;
  private final int minCommonSubstring;

  /**
   * @param authors map of author abbreviations to their replacement. Keys and values are normalized before use
   *                and entries for which either side normalizes to null are dropped.
   */
  private AuthorComparator(Map<String, String> authors) {
    this.minCommonSubstring = 4;
    Map<String, String> map = Maps.newHashMap();
    for (Map.Entry<String, String> entry : authors.entrySet()) {
      String key = normalize(entry.getKey());
      String val = normalize(entry.getValue());
      if (key != null && val != null) {
        map.put(key, val);
      }
    }
    this.authorMap = ImmutableMap.copyOf(map);
    // report the real number of entries; different raw keys can collapse into the same normalized key
    LOG.info("Created author comparator with {} abbreviation entries", authorMap.size());
  }

  /**
   * @return a comparator with an empty author map, i.e. no abbreviation lookups take effect
   */
  public static AuthorComparator createWithoutAuthormap() {
    return new AuthorComparator(Maps.<String, String>newHashMap());
  }

  /**
   * Creates a comparator using the author map bundled on the classpath at {@value #AUTHOR_MAP_FILENAME}.
   *
   * @throws RuntimeException if the author map cannot be read
   */
  public static AuthorComparator createWithAuthormap() {
    try {
      InputStream stream = Resources.asByteSource(AuthorComparator.class.getResource(AUTHOR_MAP_FILENAME)).openStream();
      try {
        return new AuthorComparator(FileUtils.streamToMap(stream, Maps.<String, String>newHashMap(), 0, 2, true));
      } finally {
        // NOTE(review): it is not visible here whether streamToMap closes the stream itself;
        // closing it again is harmless, not closing it at all would leak the resource.
        stream.close();
      }
    } catch (IOException e) {
      throw new RuntimeException("Failed to load author map from classpath", e);
    }
  }

  /**
   * @param authorMap map of author abbreviations to their replacement
   * @return a comparator using the given author map
   */
  public static AuthorComparator createWithAuthormap(Map<String, String> authorMap) {
    return new AuthorComparator(authorMap);
  }

  /**
   * Compares the author and year of two names by first evaluating equivalence of the authors.
   * Only if the authors are not found to be equal a year comparison is done as well. Unless the year comparison
   * is UNKNOWN its result replaces the author result, so matching years can still yield an overall EQUAL.
   */
  public Equality compare(@Nullable String authors1, @Nullable String year1, @Nullable String authors2, @Nullable String year2) {
    // compare authors first
    Equality result = compareAuthorteam(authors1, authors2, minCommonSubstring, MIN_AUTHOR_LENGTH_WITHOUT_LOOKUP);
    if (result != Equality.EQUAL) {
      // if authors are not the same we allow a year comparison to override it as author comparison is very difficult
      Equality yresult = new YearComparator(year1, year2).compare();
      if (yresult != Equality.UNKNOWN) {
        result = yresult;
      }
    }
    return result;
  }

  /**
   * Does a comparison of recombination and basionym authorship using the author compare method
   * once for the recombination authorship and once for the basionym.
   * Names without parsed authors are copied and their authorship is extracted from the scientific name first,
   * the instances passed in are not modified.
   */
  public Equality compare(ParsedName n1, ParsedName n2) {
    if (!n1.isAuthorsParsed()) {
      // copy parsed name to not alter the original
      n1 = clone(n1);
      parseAuthorship(n1);
    }
    if (!n2.isAuthorsParsed()) {
      // copy parsed name to not alter the original
      n2 = clone(n2);
      parseAuthorship(n2);
    }

    Equality recomb = compare(n1.getAuthorship(), n1.getYear(), n2.getAuthorship(), n2.getYear());
    if (recomb == Equality.DIFFERENT) {
      // in case the recomb author differs we are done, no need for basionym authorship comparison
      return recomb;
    }

    Equality original = compare(n1.getBracketAuthorship(), n1.getBracketYear(), n2.getBracketAuthorship(), n2.getBracketYear());
    if (recomb == Equality.UNKNOWN && original == Equality.UNKNOWN) {
      // a common error is missing brackets, so if all is unknown we compare authorship across brackets
      // and return a possible match
      Equality across = Equality.UNKNOWN;
      if (Strings.isNullOrEmpty(n1.getAuthorship()) && Strings.isNullOrEmpty(n1.getYear())) {
        across = compare(n1.getBracketAuthorship(), n1.getBracketYear(), n2.getAuthorship(), n2.getYear());
      } else if (Strings.isNullOrEmpty(n1.getBracketAuthorship()) && Strings.isNullOrEmpty(n1.getBracketYear())) {
        across = compare(n1.getAuthorship(), n1.getYear(), n2.getBracketAuthorship(), n2.getBracketYear());
      }
      // only a positive match across brackets counts, anything else stays unknown
      return across == Equality.EQUAL ? Equality.EQUAL : Equality.UNKNOWN;
    }
    return recomb.and(original);
  }

  /**
   * Compares two sets of author and year for equality.
   * This is more strict than the normal compare method and requires both authors and year to match.
   * Author matching is still done fuzzily. Two missing years are accepted as a match.
   *
   * @return true if both sets match
   */
  public boolean compareStrict(String author1, @Nullable String year1, String author2, @Nullable String year2) {
    // compare authors first; Integer.MAX_VALUE makes every author go through the author map lookup
    Equality result = compareAuthorteam(author1, author2, minCommonSubstring, Integer.MAX_VALUE);
    if (result != Equality.EQUAL) {
      return false;
    }
    // now also compare the year
    if (year1 == null && year2 == null) {
      return true;
    }
    return Equality.EQUAL == new YearComparator(year1, year2).compare();
  }

  /**
   * Normalizes an authorship string for comparison: "in" publications and "ex" authors are removed,
   * filius abbreviations are expanded, "and" variants become commas, ae/ue/oe transliterations are reduced
   * to a single letter, the string is folded to ASCII, all punctuation but commas is replaced by whitespace
   * and whitespace is normalized.
   *
   * @return lower cased, normalized string with commas separating the authors,
   *         or null if the input is blank or nothing remains after normalization
   */
  @VisibleForTesting
  protected static String normalize(String x) {
    if (StringUtils.isBlank(x)) {
      return null;
    }
    // remove in publications
    x = IN.matcher(x).replaceFirst("");
    // remove ex authors
    x = EX.matcher(x).replaceFirst("");
    // normalize filius
    x = FIL.matcher(x).replaceAll("$1 filius");
    // normalize and
    x = AND.matcher(x).replaceAll(", ");
    // reduce umlaut transliterations (ae, ue, oe) to a single letter
    x = TRANSLITERATIONS.matcher(x).replaceAll("$1");
    // fold to ascii
    x = org.gbif.utils.text.StringUtils.foldToAscii(x);
    // replace all punctuation but commas
    x = PUNCTUATION.matcher(x).replaceAll(" ");
    // normalize commas
    x = COMMA.matcher(x).replaceAll(",");
    x = StringUtils.normalizeSpace(x);
    if (StringUtils.isBlank(x)) {
      return null;
    }
    // locale independent lower casing, the [a-z] based patterns rely on plain ascii letters
    return x.toLowerCase(Locale.ROOT);
  }

  /**
   * Looks up a single normalized author in the author map.
   *
   * @return the mapped author if there is an entry for the given one, otherwise the given author unchanged
   */
  @VisibleForTesting
  protected String lookup(String normalizedAuthor) {
    if (normalizedAuthor == null) {
      return null;
    }
    // the author map is an ImmutableMap built from non null values, so null means "no entry"
    String mapped = authorMap.get(normalizedAuthor);
    return mapped == null ? normalizedAuthor : mapped;
  }

  /**
   * @return new list with every author of the team replaced by its author map lookup
   */
  private List<String> lookup(List<String> authorTeam) {
    List<String> authors = Lists.newArrayList();
    for (String author : authorTeam) {
      authors.add(lookup(author));
    }
    return authors;
  }

  /**
   * Splits a normalized author team on commas and expands short authors via the author map.
   *
   * @param normalizedAuthorTeam normalized authorship string, may be null
   * @param minAuthorLengthWithoutLookup authors shorter than this are looked up in the author map,
   *                                     zero or negative values disable the lookup
   * @return list of authors, empty if the team is null
   */
  private List<String> splitAndLookup(String normalizedAuthorTeam, int minAuthorLengthWithoutLookup) {
    List<String> authors = Lists.newArrayList();
    if (normalizedAuthorTeam != null) {
      for (String author : AUTHOR_SPLITTER.split(normalizedAuthorTeam)) {
        if (minAuthorLengthWithoutLookup > 0 && author.length() < minAuthorLengthWithoutLookup) {
          authors.add(lookup(author));
        } else {
          authors.add(author);
        }
      }
    }
    return authors;
  }

  /**
   * @return a new ParsedName with all bean properties copied from the given one
   */
  private ParsedName clone(ParsedName pn) {
    ParsedName pn2 = new ParsedName();
    try {
      BeanUtils.copyProperties(pn2, pn);
    } catch (IllegalAccessException e) {
      throw Throwables.propagate(e);
    } catch (InvocationTargetException e) {
      throw Throwables.propagate(e);
    }
    return pn2;
  }

  /**
   * Extract authorship from the name itself as best as we can to at least do some common string comparison.
   * The authorship is set to whatever follows the last occurrence of the lowest epithet in the scientific name.
   */
  private void parseAuthorship(ParsedName pn) {
    // try to use full sciname minus the epithets
    String lastEpithet = ObjectUtils.coalesce(pn.getInfraSpecificEpithet(), pn.getSpecificEpithet(), pn.getGenusOrAbove());
    if (lastEpithet != null && pn.getScientificName() != null) {
      int idx = pn.getScientificName().lastIndexOf(lastEpithet);
      if (idx >= 0) {
        pn.setAuthorship(pn.getScientificName().substring(idx + lastEpithet.length()));
      }
    }
    // copy full name to year, will be extracted/normalized in year comparison
    pn.setYear(pn.getScientificName());
    pn.setAuthorsParsed(true);
  }

  /**
   * Does an author comparison by normalizing both strings, splitting them into individual authors and
   * 1) comparing the two teams, with authors shorter than minAuthorLengthWithoutLookup already expanded
   * 2) if that is not EQUAL, doing an author map lookup for all authors and, if the lookup changed anything,
   *    comparing the expanded teams again
   *
   * @return UNKNOWN if either team is empty, otherwise the result of the last team comparison done
   */
  private Equality compareAuthorteam(@Nullable String a1, @Nullable String a2, int minCommonSubstring, int minAuthorLengthWithoutLookup) {
    // convert to all lower case, no punctuation but commas separating authors and normed whitespace
    List<String> authorTeam1 = splitAndLookup(normalize(a1), minAuthorLengthWithoutLookup);
    List<String> authorTeam2 = splitAndLookup(normalize(a2), minAuthorLengthWithoutLookup);
    if (authorTeam1.isEmpty() || authorTeam2.isEmpty()) {
      return Equality.UNKNOWN;
    }

    Equality equality = compareNormalizedAuthorteam(authorTeam1, authorTeam2, minCommonSubstring);
    if (equality != Equality.EQUAL) {
      List<String> authorTeam1l = lookup(authorTeam1);
      List<String> authorTeam2l = lookup(authorTeam2);
      // only worth comparing again if the lookup changed at least one author
      if (!authorTeam1.equals(authorTeam1l) || !authorTeam2.equals(authorTeam2l)) {
        equality = compareNormalizedAuthorteam(authorTeam1l, authorTeam2l, minCommonSubstring);
      }
    }
    return equality;
  }

  private int lengthWithoutWhitespace(String x) {
    return StringUtils.deleteWhitespace(x).length();
  }

  /**
   * Compares entire author teams. The teams are EQUAL if the lists are identical
   * or if any author of the first team matches any author of the second team.
   */
  private Equality compareNormalizedAuthorteam(final List<String> authorTeam1, final List<String> authorTeam2, final int minCommonStart) {
    // quick check avoiding subsequent heavier processing
    if (authorTeam1.equals(authorTeam2)) {
      return Equality.EQUAL;
    }
    // compare all authors to each other - a single match is good enough!
    for (String author1 : authorTeam1) {
      for (String author2 : authorTeam2) {
        if (Equality.EQUAL == compareNormalizedAuthor(author1, author2, minCommonStart)) {
          return Equality.EQUAL;
        }
      }
    }
    return Equality.DIFFERENT;
  }

  /**
   * @return the last lower case word of the given normalized author, ignoring a trailing " filius",
   *         or the unchanged input if there is no such word
   */
  @VisibleForTesting
  protected String extractSurname(String name) {
    Matcher m = SURNAME.matcher(name);
    if (m.find()) {
      return m.group(1);
    }
    return name;
  }

  /**
   * Compares a single normalized author, potentially with initials, to another one.
   */
  private Equality compareNormalizedAuthor(final String a1, final String a2, final int minCommonStart) {
    if (a1.equals(a2)) {
      // we can stop here, authors are equal, that is enough
      return Equality.EQUAL;
    }

    final String surname1 = extractSurname(a1);
    final String surname2 = extractSurname(a2);
    final String common = StringUtils.getCommonPrefix(surname1, surname2);
    if (surname1.equals(surname2) || common.length() >= minCommonStart) {
      // do both names have initials which differ?
      // this is often the case when authors are relatives like brothers or son & father
      return firstInitialsDiffer(a1, a2) ? Equality.DIFFERENT : Equality.EQUAL;
    }
    if ((a1.equals(common) && surname2.startsWith(common))
        || (a2.equals(common) && surname1.startsWith(common))) {
      // the common surname prefix is the same as one of the entire inputs and also starts the other surname.
      // The entire string is the best match we can have then, likely a short abbreviation
      return Equality.EQUAL;
    }
    if (lengthWithoutWhitespace(StringUtils.getCommonPrefix(a1, a2)) > minCommonStart) {
      // the author strings incl initials but without whitespace share at least minCommonStart+1 leading characters
      return Equality.EQUAL;
    }
    return Equality.DIFFERENT;
  }

  /**
   * Gracefully compares the leading initials of two normalized authors.
   * Initials are only considered different if both authors have some and neither set of initials is
   * contained in the other one.
   *
   * @return true if they differ
   */
  @VisibleForTesting
  protected boolean firstInitialsDiffer(String a1, String a2) {
    Matcher m1 = FIRST_INITIALS.matcher(a1);
    Matcher m2 = FIRST_INITIALS.matcher(a2);
    if (m1.find() && m2.find()) {
      String i1 = m1.group(0);
      String i2 = m2.group(0);
      if (i1.equals(i2)) {
        return false;
      }
      // if one set of chars is a subset of the other we consider this a match
      List<Character> smaller = Lists.charactersOf(StringUtils.deleteWhitespace(i1));
      List<Character> larger = Lists.charactersOf(StringUtils.deleteWhitespace(i2));
      if (smaller.size() > larger.size()) {
        // swap, the sub collection check needs the smaller list first
        List<Character> tmp = smaller;
        smaller = larger;
        larger = tmp;
      }
      if (CollectionUtils.isSubCollection(smaller, larger)) {
        // one is a subset of the other
        return false;
      }
      // they seem to differ
      return true;
    }
    // at least one author has no initials
    return false;
  }
}