package org.gbif.occurrence.processor.interpreting; import org.gbif.api.exception.UnparsableException; import org.gbif.api.model.checklistbank.NameUsageMatch; import org.gbif.api.model.checklistbank.ParsedName; import org.gbif.api.model.occurrence.Occurrence; import org.gbif.api.model.occurrence.VerbatimOccurrence; import org.gbif.api.vocabulary.Extension; import org.gbif.api.vocabulary.Kingdom; import org.gbif.api.vocabulary.OccurrenceIssue; import org.gbif.api.vocabulary.Rank; import org.gbif.common.parsers.RankParser; import org.gbif.common.parsers.core.OccurrenceParseResult; import org.gbif.common.parsers.core.ParseResult; import org.gbif.common.parsers.utils.ClassificationUtils; import org.gbif.dwc.terms.DwcTerm; import org.gbif.dwc.terms.GbifTerm; import org.gbif.dwc.terms.Term; import org.gbif.nameparser.GBIFNameParser; import org.gbif.occurrence.processor.guice.ApiClientConfiguration; import org.gbif.occurrence.processor.interpreting.util.RetryingWebserviceClient; import java.io.Serializable; import java.util.Collection; import java.util.Map; import java.util.concurrent.TimeUnit; import javax.ws.rs.core.MultivaluedMap; import com.google.common.base.Strings; import com.google.common.cache.CacheBuilder; import com.google.common.cache.LoadingCache; import com.google.inject.Inject; import com.sun.jersey.api.client.WebResource; import com.sun.jersey.core.util.MultivaluedMapImpl; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Takes a VerbatimOccurrence and does nub lookup on its provided taxonomy, then writes the result to the passed in * Occurrence. */ public class TaxonomyInterpreter implements Serializable { private static final Logger LOG = LoggerFactory.getLogger(TaxonomyInterpreter.class); private static final GBIFNameParser PARSER = new GBIFNameParser(); private static final RankParser RANK_PARSER = RankParser.getInstance(); private static final String MATCH_PATH = "species/match"; // The repetitive nature of our data encourages use of a light cache to reduce WS load private static final LoadingCache<WebResource, NameUsageMatch> CACHE = CacheBuilder.newBuilder() .maximumSize(10000) .expireAfterAccess(120, TimeUnit.MINUTES) .build(RetryingWebserviceClient.newInstance(NameUsageMatch.class, 5, 2000)); private final WebResource matchingWs; @Inject public TaxonomyInterpreter(WebResource apiBaseWs) { matchingWs = apiBaseWs.path(MATCH_PATH); } public TaxonomyInterpreter(ApiClientConfiguration cfg) { this(cfg.newApiClient()); } /** * Assembles the most complete scientific name based on full and individual name parts. * @param scientificName the full scientific name * @param genericName see GbifTerm.genericName * @param genus see DwcTerm.genus * @param specificEpithet see DwcTerm.specificEpithet * @param infraspecificEpithet see DwcTerm.infraspecificEpithet */ public static String buildScientificName(String scientificName, String authorship, String genericName, String genus, String specificEpithet, String infraspecificEpithet) { String sciname = ClassificationUtils.clean(scientificName); if (sciname == null) { // handle case when the scientific name is null and only given as atomized fields: genus & speciesEpitheton ParsedName pn = new ParsedName(); if (!StringUtils.isBlank(genericName)) { pn.setGenusOrAbove(genericName); } else { pn.setGenusOrAbove(genus); } pn.setSpecificEpithet(specificEpithet); pn.setInfraSpecificEpithet(infraspecificEpithet); pn.setAuthorship(authorship); sciname = pn.canonicalNameComplete(); } else if (!Strings.isNullOrEmpty(authorship) && !sciname.toLowerCase().contains(authorship.toLowerCase())) { sciname = sciname + " " + authorship; } return sciname; } private OccurrenceParseResult<NameUsageMatch> match(Map<Term, String> terms) { Rank rank = interpretRank(terms); return match( value(terms, DwcTerm.kingdom), value(terms, DwcTerm.phylum), value(terms, DwcTerm.class_), value(terms, DwcTerm.order), value(terms, DwcTerm.family), value(terms, DwcTerm.genus), value(terms, DwcTerm.scientificName), value(terms, DwcTerm.scientificNameAuthorship), value(terms, GbifTerm.genericName), value(terms, DwcTerm.specificEpithet), value(terms, DwcTerm.infraspecificEpithet), rank); } public OccurrenceParseResult<NameUsageMatch> match(String kingdom, String phylum, String clazz, String order, String family, String genus, String scientificName, String authorship, String genericName, String specificEpithet, String infraspecificEpithet, Rank rank) { String cleanGenus = ClassificationUtils.clean(genus); String cleanGenericName = ClassificationUtils.clean(genericName); String cleanSpecificEpithet = ClassificationUtils.cleanAuthor(specificEpithet); String cleanInfraspecificEpithet = ClassificationUtils.cleanAuthor(infraspecificEpithet); String cleanAuthorship = ClassificationUtils.cleanAuthor(authorship); String sciname = buildScientificName(scientificName, cleanAuthorship, cleanGenericName, cleanGenus, cleanSpecificEpithet, cleanInfraspecificEpithet); OccurrenceParseResult<NameUsageMatch> result; MultivaluedMap<String, String> queryParams = new MultivaluedMapImpl(); queryParams.add("kingdom", ClassificationUtils.clean(kingdom)); queryParams.add("phylum", ClassificationUtils.clean(phylum)); queryParams.add("class", ClassificationUtils.clean(clazz)); queryParams.add("order", ClassificationUtils.clean(order)); queryParams.add("family", ClassificationUtils.clean(family)); queryParams.add("genus", cleanGenus); queryParams.add("name", sciname); if (rank != null) { queryParams.add("rank", rank.name()); } LOG.debug("Attempt to match name [{}]", sciname); WebResource res = matchingWs.queryParams(queryParams); LOG.debug("WS call with: {}", res.getURI()); try { NameUsageMatch lookup = CACHE.get(res); result = OccurrenceParseResult.success(ParseResult.CONFIDENCE.DEFINITE, lookup); switch (lookup.getMatchType()) { case NONE: result = OccurrenceParseResult.fail(lookup, OccurrenceIssue.TAXON_MATCH_NONE); LOG.info("match for [{}] returned no match. Lookup note: [{}]", scientificName, lookup.getNote()); break; case FUZZY: result.addIssue(OccurrenceIssue.TAXON_MATCH_FUZZY); LOG.debug("match for [{}] was fuzzy. Match note: [{}]", scientificName, lookup.getNote()); break; case HIGHERRANK: result.addIssue(OccurrenceIssue.TAXON_MATCH_HIGHERRANK); LOG.debug("match for [{}] was to higher rank only. Match note: [{}]", scientificName, lookup.getNote()); break; } } catch (Exception e) { // Log the error LOG.error("Failed WS call with: {}", res.getURI()); result = OccurrenceParseResult.error(e); } return result; } private static void applyMatch(Occurrence occ, NameUsageMatch match, Collection<OccurrenceIssue> issues) { occ.setTaxonKey(match.getUsageKey()); occ.setScientificName(match.getScientificName()); occ.setTaxonRank(match.getRank()); // copy issues occ.getIssues().addAll(issues); // parse name into pieces - we dont get them from the nub lookup try { ParsedName pn = PARSER.parse(match.getScientificName(), match.getRank()); occ.setGenericName(pn.getGenusOrAbove()); occ.setSpecificEpithet(pn.getSpecificEpithet()); occ.setInfraspecificEpithet(pn.getInfraSpecificEpithet()); } catch (UnparsableException e) { if (e.type.isParsable()) { LOG.warn("Fail to parse backbone {} name for occurrence {}: {}", e.type, occ.getKey(), e.name); } } for (Rank r : Rank.DWC_RANKS) { org.gbif.api.util.ClassificationUtils.setHigherRank(occ, r, match.getHigherRank(r)); org.gbif.api.util.ClassificationUtils.setHigherRankKey(occ, r, match.getHigherRankKey(r)); } LOG.debug("Occurrence {} matched to nub {} [{}]", occ.getKey(), occ.getScientificName(), occ.getTaxonKey()); } private static String value(Map<Term, String> terms, Term term) { return terms.get(term); } private static boolean hasTerm(Map<Term, String> terms, Term term) { return !Strings.isNullOrEmpty(value(terms, term)); } public void interpretTaxonomy(VerbatimOccurrence verbatim, Occurrence occ) { // try core taxon fields first OccurrenceParseResult<NameUsageMatch> matchPR = match(verbatim.getVerbatimFields()); // try the identification extension if no core match if (!matchPR.isSuccessful() && verbatim.getExtensions().containsKey(Extension.IDENTIFICATION)) { // there may be many identifications but we only want the latest, current one //TODO: use latest identification only sorting records by their dwc:dateIdentified for (Map<Term, String> rec : verbatim.getExtensions().get(Extension.IDENTIFICATION)) { matchPR = match(rec); if (matchPR.isSuccessful()) { // TODO: copy other identification terms to core??? // identifiedBy // dateIdentified // identificationReferences // identificationRemarks // identificationQualifier // identificationVerificationStatus // typeStatus // taxonID // taxonConceptID // nameAccordingTo // nameAccordingToID // taxonRemarks break; } } } // apply taxonomy if we got a match if (matchPR.isSuccessful()) { applyMatch(occ, matchPR.getPayload(), matchPR.getIssues()); } else { LOG.debug("No backbone match for occurrence {}", occ.getKey()); occ.addIssue(OccurrenceIssue.TAXON_MATCH_NONE); // assign unknown kingdom applyKingdom(occ, Kingdom.INCERTAE_SEDIS); } } private static void applyKingdom(Occurrence occ, Kingdom k){ occ.setTaxonKey(k.nubUsageKey()); occ.setScientificName(k.scientificName()); occ.setTaxonRank(Rank.KINGDOM); } private static Rank interpretRank(Map<Term, String> terms){ Rank rank = null; if (hasTerm(terms, DwcTerm.taxonRank)) { rank = RANK_PARSER.parse(value(terms, DwcTerm.taxonRank)).getPayload(); } // try again with verbatim if it exists if (rank == null && hasTerm(terms, DwcTerm.verbatimTaxonRank)) { rank = RANK_PARSER.parse(value(terms, DwcTerm.verbatimTaxonRank)).getPayload(); } // derive from atomized fields if (rank == null && hasTerm(terms, DwcTerm.genus)) { if (hasTerm(terms, DwcTerm.specificEpithet)) { if (hasTerm(terms, DwcTerm.infraspecificEpithet)) { rank = Rank.INFRASPECIFIC_NAME; } else { rank = Rank.SPECIES; } } else { rank = Rank.GENUS; } } return rank; } }