/***************************************************************************
* Copyright 2010 Global Biodiversity Information Facility Secretariat
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
***************************************************************************/
package org.gbif.nub.lookup.fuzzy;
import org.gbif.api.model.common.LinneanClassification;
import org.gbif.api.vocabulary.Kingdom;
import org.gbif.api.vocabulary.Rank;
import org.gbif.nub.utils.RsGbifOrg;
import org.gbif.utils.file.FileUtils;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Collection;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Pattern;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import com.google.common.io.Resources;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A tool to lookup synonyms for higher taxa names. It does a rather strong match and tries to normalize names
* a lot so we can use them for comparisons of entire higher classifications.
*
* The class uses file based dictionaries which are hosted on http://rs.gbif.org/dictionaries/synonyms/
* and which are maintained with OpenRefine based on real data found in occurrence and checklist records in github:
* https://github.com/gbif/rs.gbif.org/tree/master/dictionaries/synonyms
*/
public class HigherTaxaComparator {
private final static Map<Rank, String> SYNONYM_FILENAMES = Maps.newHashMap();
static {
SYNONYM_FILENAMES.put(Rank.KINGDOM, "kingdom.txt");
SYNONYM_FILENAMES.put(Rank.PHYLUM, "phylum.txt");
SYNONYM_FILENAMES.put(Rank.CLASS, "class.txt");
SYNONYM_FILENAMES.put(Rank.ORDER, "order.txt");
SYNONYM_FILENAMES.put(Rank.FAMILY, "family.txt");
}
private static final Set<String> NON_NAMES = Sets.newHashSet();
private Logger log = LoggerFactory.getLogger(HigherTaxaComparator.class);
private Map<Rank, Map<String, String>> syn = Maps.newHashMap();
private Map<String, Kingdom> kingdoms = Maps.newHashMap();
/**
*
*/
public HigherTaxaComparator() {
for (Kingdom k : Kingdom.values()) {
this.kingdoms.put(norm(k.name()), k);
}
}
/**
* Compares a single higher rank and returns the matching confidence supplied.
* @param rank the rank to be compared
* @param query the classification of the query
* @param ref the classification of the nub reference usage
* @param match confidence returned if the classifications match for the given rank
* @param mismatch confidence returned if the classifications do not match for the given rank
* @param missing confidence returned if one or both classifications have missing information for the given rank
* @return match, mismatch or missing confidence depending on match
*/
public int compareHigherRank(Rank rank, LinneanClassification query, LinneanClassification ref, int match, int mismatch, int missing) {
if (!StringUtils.isBlank(query.getHigherRank(rank)) && !StringUtils.isBlank(ref.getHigherRank(rank))) {
String querySyn = lookup(query.getHigherRank(rank), rank);
String refSyn = lookup(ref.getHigherRank(rank), rank);
if (!StringUtils.isBlank(querySyn) && !StringUtils.isBlank(refSyn) && querySyn.equalsIgnoreCase(refSyn)){
return match;
} else {
return mismatch;
}
}
return missing;
}
public boolean isInKingdoms(LinneanClassification n, Kingdom ... kingdoms){
String syn = lookup(n.getKingdom(), Rank.KINGDOM);
if (!Strings.isNullOrEmpty(syn)){
for (Kingdom kingdom : kingdoms){
if (syn.equalsIgnoreCase(kingdom.name())){
return true;
}
}
}
return false;
}
/**
* Lookup higher taxa synonym dictionary across all ranks and return the first match found
*
* @param higherTaxon
* @return the looked up accepted name or the original higherTaxon
*/
@VisibleForTesting
protected String lookup(String higherTaxon) {
if (higherTaxon == null) {
return null;
}
for (Rank r : syn.keySet()) {
String result = lookup(higherTaxon, r);
if (result != null) {
return result;
}
}
return higherTaxon;
}
/**
* Lookup synonym for given higher rank.
* Can be null.
*
* @param higherTaxon higher rank name, case insensitive
* @param rank the rank to lookup for
* @return the looked up accepted name, null for blacklisted names or the original higherTaxon if no synonym is known
*/
@VisibleForTesting
protected String lookup(String higherTaxon, Rank rank) {
if (higherTaxon == null) {
return null;
}
if (isBlacklisted(higherTaxon)) {
return null;
}
if (syn.containsKey(rank)) {
String normedHT = norm(higherTaxon);
Map<String, String> x = syn.get(rank);
if (syn.get(rank).containsKey(normedHT)) {
return syn.get(rank).get(normedHT);
}
}
return higherTaxon;
}
/**
* Check for obvious, blacklisted garbage and return true if thats the case.
* The underlying set is hosted at http://rs.gbif.org/dictionaries/authority/blacklisted.txt
*/
public boolean isBlacklisted(String name) {
if (name != null) {
name = norm(name);
if (NON_NAMES.contains(name)) {
return true;
}
}
return false;
}
/**
* @return non empty uppercased string with normalized whitespace and all non latin letters replaced. Or null
*/
@VisibleForTesting
protected static String norm(String x) {
Pattern REMOVE_NON_LETTERS = Pattern.compile("[\\W\\d]+");
x = Strings.nullToEmpty(x);
x = REMOVE_NON_LETTERS.matcher(x).replaceAll(" ");
x = StringUtils.normalizeSpace(x).toUpperCase();
return StringUtils.trimToNull(x);
}
private Map<String, String> readSynonymStream(Rank rank, InputStream in) {
Map<String, String> synonyms = Maps.newHashMap();
try {
synonyms = FileUtils.streamToMap(in, 0, 1, true);
} catch (IOException e) {
log.warn("Cannot read synonym map from stream for {}. Use empty map instead.", rank, e);
} finally {
Closeables.closeQuietly(in);
}
log.debug("loaded " + synonyms.size() + " synonyms for " + rank);
return synonyms;
}
/**
*
* @param file the synonym file on rs.gbif.org
* @return
*/
private Map<String, String> readSynonymUrl(Rank rank, String file) {
try {
URL url = RsGbifOrg.synonymUrl(file);
log.debug("Reading " + url.toString());
return readSynonymStream(rank, url.openStream());
} catch (IOException e) {
log.warn("Cannot read synonym map from " + file + ". Use empty map instead.", e);
}
return Maps.newHashMap();
}
/**
*
* @param file the local file to read
* @return
*/
private Map<String, String> readSynonymFile(Rank rank, File file) {
try {
log.debug("Reading " + file.getAbsolutePath());
return readSynonymStream(rank, new FileInputStream(file));
} catch (IOException e) {
log.warn("Cannot read synonym map from " + file + ". Use empty map instead: ", e.getMessage());
}
return Maps.newHashMap();
}
/**
* Reads blacklisted names from rs.gbif.org
*/
private void readOnlineBlacklist() {
try {
URL url = RsGbifOrg.authorityUrl(RsGbifOrg.FILENAME_BLACKLIST);
log.debug("Reading " + url.toString());
readBlacklistStream(url.openStream());
} catch (IOException e) {
log.warn("Cannot read online blacklist.", e);
}
}
/**
* Reads blacklisted names from file
*/
private void readBlacklistFile(File folder) {
File blacklist = new File(folder, RsGbifOrg.FILENAME_BLACKLIST);
try {
readBlacklistStream(new FileInputStream(blacklist));
} catch (IOException e) {
log.warn("Cannot read local blacklist {}. {}", blacklist.getAbsoluteFile(), e.getMessage());
}
}
/**
* Reads blacklisted names from stream
*/
private void readBlacklistStream(InputStream in) {
NON_NAMES.clear();
try {
NON_NAMES.addAll(FileUtils.streamToSet(in));
} catch (IOException e) {
log.warn("Cannot read blacklist. Use empty set instead.", e);
} finally {
Closeables.closeQuietly(in);
}
log.debug("loaded " + NON_NAMES.size() + " blacklisted names");
}
/**
* Reads synonym dicts from given folder.
* File names must be the same as on rs.gbif.org.
*/
public void loadLocalDicts(File folder) {
log.info("Reloading dictionary files from rs.gbif.org ...");
for (Rank rank : SYNONYM_FILENAMES.keySet()) {
Map<String, String> synonyms = readSynonymFile(rank, new File(folder, SYNONYM_FILENAMES.get(rank)));
setSynonyms(rank, synonyms);
}
// read blacklisted names
readBlacklistFile(folder);
}
/**
* Reads synonym dicts from given classpath root path.
* File names must be the same as on rs.gbif.org.
*/
public void loadClasspathDicts(String classpathFolder) throws IOException {
log.info("Reloading dictionary files from classpath ...");
for (Rank rank : SYNONYM_FILENAMES.keySet()) {
InputStream synIn = Resources.asByteSource(Resources.getResource(classpathFolder + "/" + SYNONYM_FILENAMES.get(rank))).openStream();
Map<String, String> synonyms = readSynonymStream(rank, synIn);
setSynonyms(rank, synonyms);
}
// read blacklisted names
InputStream blackIn = Resources.asByteSource(Resources.getResource(classpathFolder + "/" + RsGbifOrg.FILENAME_BLACKLIST)).openStream();
readBlacklistStream(blackIn);
}
/**
* Reloads all synonym files found on rs.gbif.org replacing existing mappings.
*/
public void loadOnlineDicts() {
log.info("Reloading dictionary files from rs.gbif.org ...");
for (Rank rank : SYNONYM_FILENAMES.keySet()) {
Map<String, String> synonyms = readSynonymUrl(rank, SYNONYM_FILENAMES.get(rank));
setSynonyms(rank, synonyms);
}
// read blacklisted names
readOnlineBlacklist();
}
/**
* Sets the synonym lookup map for a given rank.
* Names will be normalised and checked for existance of the same entry as key or value.
*
* @param rank
* @param synonyms
*/
public void setSynonyms(Rank rank, Map<String, String> synonyms) {
Map<String, String> synonymsNormed = Maps.newHashMap();
// normalise keys
for (Entry<String, String> entry : synonyms.entrySet()) {
synonymsNormed.put(norm(entry.getKey()), entry.getValue());
}
// test if synonyms show up as accepted too
Collection<String> syns = Sets.newHashSet(synonymsNormed.keySet());
for (String syn : syns) {
if (synonymsNormed.containsKey(synonymsNormed.get(syn))) {
log.warn(syn + " is both synonym and accepted - ignore synonym.");
synonymsNormed.remove(syn);
}
}
syn.put(rank, synonymsNormed);
log.debug("Loaded " + synonyms.size() + " " + rank.name() + " synonyms ");
// also insert kingdom enum lookup in case of kingdom synonyms
if (Rank.KINGDOM == rank) {
Map<String, String> map = syn.get(Rank.KINGDOM);
if (map != null) {
for (String syn : map.keySet()) {
Kingdom k = null;
String key = map.get(syn);
if (key != null) {
key = key.toLowerCase();
key = StringUtils.capitalize(key);
try {
k = Kingdom.valueOf(key);
} catch (Exception e) {
}
}
this.kingdoms.put(norm(syn), k);
}
}
for (Kingdom k : Kingdom.values()) {
this.kingdoms.put(norm(k.name()), k);
}
}
}
/**
* @return the number of entries across all ranks
*/
public int size() {
int all = 0;
for (Rank r : syn.keySet()) {
all += size(r);
}
return all;
}
/**
* @return the number of entries for a given rank
*/
public int size(Rank rank) {
if (syn.containsKey(rank)) {
return syn.get(rank).size();
}
return 0;
}
public Kingdom toKingdom(String kingdom) {
if (kingdom == null) {
return null;
}
return kingdoms.get(kingdom.trim().toUpperCase());
}
}