package org.gbif.checklistbank.cli.normalizer; import org.gbif.api.model.checklistbank.Description; import org.gbif.api.model.checklistbank.Distribution; import org.gbif.api.model.checklistbank.NameUsage; import org.gbif.api.model.checklistbank.NameUsageMediaObject; import org.gbif.api.model.checklistbank.Reference; import org.gbif.api.model.checklistbank.SpeciesProfile; import org.gbif.api.model.checklistbank.TypeSpecimen; import org.gbif.api.model.checklistbank.VerbatimNameUsage; import org.gbif.api.model.checklistbank.VernacularName; import org.gbif.api.model.common.Identifier; import org.gbif.api.vocabulary.Extension; import org.gbif.api.vocabulary.IdentifierType; import org.gbif.api.vocabulary.MediaType; import org.gbif.api.vocabulary.NameUsageIssue; import org.gbif.checklistbank.model.UsageExtensions; import org.gbif.checklistbank.neo.NeoInserter; import org.gbif.common.parsers.BooleanParser; import org.gbif.common.parsers.CitesAppendixParser; import org.gbif.common.parsers.CountryParser; import org.gbif.common.parsers.EstablishmentMeansParser; import org.gbif.common.parsers.LanguageParser; import org.gbif.common.parsers.LifeStageParser; import org.gbif.common.parsers.MediaParser; import org.gbif.common.parsers.MediaTypeParser; import org.gbif.common.parsers.NumberParser; import org.gbif.common.parsers.OccurrenceStatusParser; import org.gbif.common.parsers.SexParser; import org.gbif.common.parsers.ThreatStatusParser; import org.gbif.common.parsers.TypeStatusParser; import org.gbif.common.parsers.UrlParser; import org.gbif.common.parsers.core.EnumParser; import org.gbif.common.parsers.date.DateParseUtils; import org.gbif.dwc.terms.AcTerm; import org.gbif.dwc.terms.DcElement; import org.gbif.dwc.terms.DcTerm; import org.gbif.dwc.terms.DwcTerm; import org.gbif.dwc.terms.GbifTerm; import org.gbif.dwc.terms.IucnTerm; import org.gbif.dwc.terms.Term; import org.gbif.dwc.terms.XmpRightsTerm; import org.gbif.dwc.terms.XmpTerm; import java.net.URI; import java.util.Iterator; import java.util.Map; import javax.annotation.Nullable; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.annotations.VisibleForTesting; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class ExtensionInterpreter { private static final Logger LOG = LoggerFactory.getLogger(ExtensionInterpreter.class); private final CountryParser countryParser = CountryParser.getInstance(); private final LanguageParser languageParser = LanguageParser.getInstance(); private final LifeStageParser lifeStageParser = LifeStageParser.getInstance(); private final SexParser sexParser = SexParser.getInstance(); private final EstablishmentMeansParser establishmentMeansParser = EstablishmentMeansParser.getInstance(); private final BooleanParser booleanParser = BooleanParser.getInstance(); private final MediaParser mediaParser = MediaParser.getInstance(); private final OccurrenceStatusParser occurrenceStatusParser = OccurrenceStatusParser.getInstance(); private final ThreatStatusParser threatStatusParser = ThreatStatusParser.getInstance(); private final CitesAppendixParser citesAppendixParser= CitesAppendixParser.getInstance(); private final TypeStatusParser typeStatusParser = TypeStatusParser.getInstance(); private final MediaTypeParser mediaTypeParser = MediaTypeParser.getInstance(); /** * Tries various terms in given order until it finds a non empty value. * @param rec * @param terms * @return non empty, cleaned value or null */ private String value(Map<Term, String> rec, Term ... terms) { for (Term t : terms) { if (rec.containsKey(t)) { String val = NeoInserter.clean(rec.get(t)); if (val != null) { return val; } } } return null; } /** * Tries to parse the verbatim value for the given term into a boolean. * If a value exists but cannot be parsed tan issue is added to the usage container. * @param rec * @param issue kind of issue to add if parsing fails * @param u * @param term * @return */ private Boolean bool(Map<Term, String> rec, NameUsageIssue issue, NameUsage u, Term term) { Boolean result = null; String val = value(rec, term); if (val != null) { result = booleanParser.parse(val).getPayload(); if (result == null) { u.addIssue(issue); } } return result; } private Integer integer(Map<Term, String> rec, NameUsageIssue issue, NameUsage u, Term term) { Integer i = null; String val = value(rec, term); if (val != null) { i = NumberParser.parseInteger(val); if (i == null) { u.addIssue(issue); } } return i; } private <T extends Enum<T>> T enumify(Map<Term, String> rec, @Nullable NameUsageIssue issue, EnumParser<T> parser, NameUsage u, Term ... terms) { boolean valuesFound = false; for (Term t : terms) { if (rec.containsKey(t)) { String val = NeoInserter.clean(rec.get(t)); if (val != null) { valuesFound = true; T result = parser.parse(val).getPayload(); if (result != null) { return result; } } } } // nothing found, raise issue? if (valuesFound && issue != null) { u.addIssue(issue); } return null; } private void interpretVernacularNames(NameUsage u, UsageExtensions e, VerbatimNameUsage v) { if (v.hasExtension(Extension.VERNACULAR_NAME)) { for (Map<Term, String> rec : v.getExtensions().get(Extension.VERNACULAR_NAME)) { VernacularName vn = new VernacularName(); vn.setVernacularName(value(rec, DwcTerm.vernacularName)); if (vn.getVernacularName() == null) { u.addIssue(NameUsageIssue.VERNACULAR_NAME_INVALID); continue; } // locationID > locality vn.setArea(value(rec, DwcTerm.locationID, DwcTerm.locality)); vn.setCountry(enumify(rec, null, countryParser, u, DwcTerm.countryCode, DwcTerm.country, DwcTerm.locationID, DwcTerm.locality)); vn.setLanguage(enumify(rec, NameUsageIssue.VERNACULAR_NAME_INVALID, languageParser, u, DcTerm.language)); vn.setLifeStage(enumify(rec, NameUsageIssue.VERNACULAR_NAME_INVALID, lifeStageParser, u, DwcTerm.lifeStage)); vn.setPlural(bool(rec, NameUsageIssue.VERNACULAR_NAME_INVALID, u, GbifTerm.isPlural)); vn.setPreferred(bool(rec, NameUsageIssue.VERNACULAR_NAME_INVALID, u, GbifTerm.isPreferredName)); vn.setSex(enumify(rec, NameUsageIssue.VERNACULAR_NAME_INVALID, sexParser, u, DwcTerm.sex)); vn.setSource(value(rec, DcTerm.source)); // interpret rec e.vernacularNames.add(vn); } } } /** * We only keep type names and ignore specimens... */ private void interpretTypes(NameUsage u, UsageExtensions e, VerbatimNameUsage v) { if (v.hasExtension(Extension.TYPES_AND_SPECIMEN)) { for (Map<Term, String> rec : v.getExtensions().get(Extension.TYPES_AND_SPECIMEN)) { TypeSpecimen t = new TypeSpecimen(); // interpret t.setScientificName(expandGenus(value(rec, DwcTerm.scientificName), u.getScientificName())); if (t.getScientificName() == null || t.getScientificName().equalsIgnoreCase(u.getScientificName())) { LOG.debug("Ignore type name for {} as the name is the same as the taxon", u.getScientificName()); continue; } t.setTypeDesignatedBy(value(rec, GbifTerm.typeDesignatedBy)); t.setTypeStatus(enumify(rec, null, typeStatusParser, u, DwcTerm.typeStatus)); t.setSource(value(rec, DcTerm.source)); //t.setCitation(value(rec, DcTerm.bibliographicCitation)); //t.setTypeDesignationType(value(rec, GbifTerm.typeDesignatedType)); e.typeSpecimens.add(t); } } } /** * Expands abbreviated genus names with the full genus * @param abbreviatedName the potentially abbreviated scientific name, e.g. "A. alba" * @param scientificName the full scientific name for the main taxon */ @VisibleForTesting protected static String expandGenus(String abbreviatedName, String scientificName) { // test if name has an abbreviated genus if (abbreviatedName != null && abbreviatedName.length() > 1 && scientificName != null && scientificName.length() > 2) { String[] parts = abbreviatedName.split("\\s+", 2); String genus = scientificName.split("\\s+", 2)[0]; String abbrev = parts[0].replaceAll("\\.$", ""); if (parts.length == 2 && abbrev.length() < 4 && genus.startsWith(abbrev)) { return genus + " " + parts[1]; } } return abbreviatedName; } private void interpretSpeciesProfiles(NameUsage u, UsageExtensions e, VerbatimNameUsage v) { if (v.hasExtension(Extension.SPECIES_PROFILE)) { for (Map<Term, String> rec : v.getExtensions().get(Extension.SPECIES_PROFILE)) { SpeciesProfile s = new SpeciesProfile(); // interpret rec s.setSource(value(rec, DcTerm.source)); s.setAgeInDays(integer(rec, NameUsageIssue.SPECIES_PROFILE_INVALID, u, GbifTerm.ageInDays)); s.setMassInGram(integer(rec, NameUsageIssue.SPECIES_PROFILE_INVALID, u, GbifTerm.massInGram)); s.setSizeInMillimeter(integer(rec, NameUsageIssue.SPECIES_PROFILE_INVALID, u, GbifTerm.sizeInMillimeter)); s.setHybrid(bool(rec, NameUsageIssue.SPECIES_PROFILE_INVALID, u, GbifTerm.isHybrid)); s.setMarine(bool(rec, NameUsageIssue.SPECIES_PROFILE_INVALID, u, GbifTerm.isMarine)); s.setFreshwater(bool(rec, NameUsageIssue.SPECIES_PROFILE_INVALID, u, GbifTerm.isFreshwater)); s.setTerrestrial(bool(rec, NameUsageIssue.SPECIES_PROFILE_INVALID, u, GbifTerm.isTerrestrial)); s.setExtinct(bool(rec, NameUsageIssue.SPECIES_PROFILE_INVALID, u, GbifTerm.isExtinct)); s.setLivingPeriod(value(rec, GbifTerm.livingPeriod)); s.setLifeForm(value(rec, GbifTerm.lifeForm)); s.setHabitat(value(rec, DwcTerm.habitat)); e.speciesProfiles.add(s); } } } private void interpretReference(NameUsage u, UsageExtensions e, VerbatimNameUsage v) { if (v.hasExtension(Extension.REFERENCE)) { for (Map<Term, String> rec : v.getExtensions().get(Extension.REFERENCE)) { Reference r = new Reference(); // interpret rec r.setType(value(rec, DcTerm.type)); r.setCitation(value(rec, DcTerm.bibliographicCitation)); r.setTitle(value(rec, DcTerm.title)); r.setAuthor(value(rec, DcTerm.creator)); r.setDate(value(rec, DcTerm.date, DcTerm.created)); r.setSource(value(rec, DcTerm.source)); r.setRemarks(value(rec, DwcTerm.taxonRemarks)); // TODO: need to check this mapping! r.setDoi(value(rec, DcTerm.identifier)); r.setLink(value(rec, DcTerm.references)); e.referenceList.add(r); } } } private void extractMedia(NameUsage u, UsageExtensions e, VerbatimNameUsage v, Extension ext, boolean requireType) { if (v.hasExtension(ext)) { for (Map<Term, String> rec : v.getExtensions().get(ext)) { URI uri = UrlParser.parse(value(rec, AcTerm.accessURI, DcTerm.identifier, DcElement.identifier)); URI link = UrlParser.parse(value(rec, AcTerm.furtherInformationURL, DcTerm.references, AcTerm.attributionLinkURL)); // EOL media extension is also used to publish text descriptions - avoid those MediaType type = enumify(rec, null, mediaTypeParser, u, DcTerm.type, DcElement.type); if (requireType && type == null) { continue; } // link or media uri must exist if (uri == null && link == null) { u.addIssue(NameUsageIssue.MULTIMEDIA_INVALID); } else { NameUsageMediaObject m = new NameUsageMediaObject(); m.setType(type); m.setIdentifier(uri); m.setReferences(link); m.setTitle(value(rec, DcTerm.title, AcTerm.caption)); m.setDescription(value(rec, DcTerm.description)); m.setLicense(value(rec, DcTerm.license, XmpRightsTerm.UsageTerms, DcTerm.rights)); m.setPublisher(value(rec, DcTerm.publisher)); m.setContributor(value(rec, DcTerm.contributor)); m.setSource(value(rec, AcTerm.derivedFrom, DcTerm.source)); m.setAudience(value(rec, DcTerm.audience)); m.setRightsHolder(value(rec, XmpRightsTerm.Owner, DcTerm.rightsHolder)); m.setCreator(value(rec, DcTerm.creator)); m.setFormat(mediaParser.parseMimeType(value(rec, DcTerm.format))); String created = value(rec, XmpTerm.CreateDate, DcTerm.created, DcTerm.date); if (created != null) { m.setCreated(DateParseUtils.parse(created).getPayload()); } mediaParser.detectType(m); e.media.add(m); } } } } private void interpretMultimedia(NameUsage u, UsageExtensions e, VerbatimNameUsage v) { extractMedia(u, e, v, Extension.IMAGE, false); extractMedia(u, e, v, Extension.MULTIMEDIA, false); extractMedia(u, e, v, Extension.EOL_MEDIA, true); extractMedia(u, e, v, Extension.AUDUBON, false); extractMediaCore(u, e, v); /** * merges media records if the same image URL or link is given several times. * Remove any media that has not either a file or webpage uri. */ Map<String, NameUsageMediaObject> media = Maps.newLinkedHashMap(); for (NameUsageMediaObject m : e.media) { // we can get file uris or weblinks. Prefer file URIs as they clearly identify a single image URI uri = m.getIdentifier() != null ? m.getIdentifier() : m.getReferences(); if (uri != null) { String url = uri.toString(); if (media.containsKey(url)) { // TODO: merge infos about the same image? } else { media.put(url, m); } } } e.media = Lists.newArrayList(media.values()); } private void extractMediaCore(NameUsage u, UsageExtensions e, VerbatimNameUsage v) { if (v.hasCoreField(DwcTerm.associatedMedia)) { for (URI uri : UrlParser.parseUriList(v.getCoreField(DwcTerm.associatedMedia))) { if (uri != null) { NameUsageMediaObject m = new NameUsageMediaObject(); m.setIdentifier(uri); mediaParser.detectType(m); e.media.add(m); } } } } private void interpretIdentifier(NameUsage u, UsageExtensions e, VerbatimNameUsage v) { if (v.hasExtension(Extension.IDENTIFIER)) { for (Map<Term, String> rec : v.getExtensions().get(Extension.IDENTIFIER)) { Identifier i = new Identifier(); // interpret rec i.setIdentifier(value(rec, DcTerm.identifier)); if (i.getIdentifier() == null) { u.addIssue(NameUsageIssue.ALT_IDENTIFIER_INVALID); continue; } i.setTitle(value(rec, DcTerm.title)); i.setType(IdentifierType.inferFrom(i.getIdentifier())); e.identifiers.add(i); } } } private void interpretDistribution(NameUsage u, UsageExtensions e, VerbatimNameUsage v) { if (v.hasExtension(Extension.DISTRIBUTION)) { for (Map<Term, String> rec : v.getExtensions().get(Extension.DISTRIBUTION)) { Distribution d = new Distribution(); // interpret rec d.setLocality(value(rec, DwcTerm.locality)); d.setLocationId(value(rec, DwcTerm.locationID)); d.setCountry(enumify(rec, NameUsageIssue.DISTRIBUTION_INVALID, countryParser, u, DwcTerm.country, DwcTerm.countryCode)); // some location is required, otherwise its pointless if (d.getLocality() == null && d.getLocationId() == null && d.getCountry() == null) { u.addIssue(NameUsageIssue.DISTRIBUTION_INVALID); continue; } d.setStatus(enumify(rec, NameUsageIssue.DISTRIBUTION_INVALID, occurrenceStatusParser, u, DwcTerm.occurrenceStatus)); d.setEstablishmentMeans(enumify(rec, NameUsageIssue.DISTRIBUTION_INVALID, establishmentMeansParser, u, DwcTerm.establishmentMeans)); d.setAppendixCites(enumify(rec, NameUsageIssue.DISTRIBUTION_INVALID, citesAppendixParser, u, GbifTerm.appendixCITES)); d.setThreatStatus(enumify(rec, NameUsageIssue.DISTRIBUTION_INVALID, threatStatusParser, u, IucnTerm.threatStatus)); d.setLifeStage(enumify(rec, NameUsageIssue.DISTRIBUTION_INVALID, lifeStageParser, u, DwcTerm.lifeStage)); d.setTemporal(value(rec, DwcTerm.eventDate, DcTerm.temporal)); d.setEndDayOfYear(integer(rec, NameUsageIssue.DISTRIBUTION_INVALID, u, DwcTerm.endDayOfYear)); d.setStartDayOfYear(integer(rec, NameUsageIssue.DISTRIBUTION_INVALID, u, DwcTerm.startDayOfYear)); d.setRemarks(value(rec, DwcTerm.occurrenceRemarks, DwcTerm.taxonRemarks)); d.setSource(value(rec, DcTerm.source)); e.distributions.add(d); } } } private void interpretDescription(NameUsage u, UsageExtensions e, VerbatimNameUsage v) { if (v.hasExtension(Extension.DESCRIPTION)) { for (Map<Term, String> rec : v.getExtensions().get(Extension.DESCRIPTION)) { Description d = new Description(); // interpret rec d.setDescription(value(rec, DcTerm.description, DcTerm.abstract_)); d.setType(value(rec, DcTerm.type)); d.setSource(value(rec, DcTerm.source)); d.setContributor(value(rec, DcTerm.contributor)); d.setCreator(value(rec, DcTerm.creator, DcTerm.rightsHolder)); d.setLanguage(enumify(rec, NameUsageIssue.DESCRIPTION_INVALID, languageParser, u, DcTerm.language)); d.setLicense(value(rec, DcTerm.license, DcTerm.rights)); e.descriptions.add(d); } } // EOL MULTIMEDIA else if (v.hasExtension(Extension.EOL_MEDIA)) { for (Map<Term, String> rec : v.getExtensions().get(Extension.EOL_MEDIA)) { // ignore non text type records if (!isTextType(value(rec, DcTerm.type))) { continue; } Description d = new Description(); // interpret rec d.setType(value(rec, DcTerm.title)); d.setDescription(value(rec, DcTerm.description, DcTerm.abstract_)); // make sure we have some description if (d.getDescription() == null) { u.addIssue(NameUsageIssue.DESCRIPTION_INVALID); continue; } d.setSource(value(rec, DcTerm.source, DcTerm.bibliographicCitation)); d.setContributor(value(rec, DcTerm.contributor)); d.setCreator(value(rec, DcTerm.creator, DcTerm.rightsHolder, XmpRightsTerm.Owner, DcTerm.publisher)); d.setLanguage(enumify(rec, NameUsageIssue.DESCRIPTION_INVALID, languageParser, u, DcTerm.language)); d.setLicense(value(rec, DcTerm.license, DcTerm.rights, XmpRightsTerm.UsageTerms)); e.descriptions.add(d); } } // verify descriptions // make sure we have some description Iterator<Description> iter = e.descriptions.iterator(); while(iter.hasNext()) { Description d = iter.next(); if (StringUtils.isBlank(d.getDescription())) { u.addIssue(NameUsageIssue.DESCRIPTION_INVALID); iter.remove(); } } } private boolean isTextType(String type) { return type.equalsIgnoreCase("http://purl.org/dc/dcmitype/Text") || type.equalsIgnoreCase("purl.org/dc/dcmitype/Text") || type.equalsIgnoreCase("dcmitype:Text") || type.equalsIgnoreCase("dctype:Text") || type.equalsIgnoreCase("dc:Text") || type.equalsIgnoreCase("Text"); } public UsageExtensions interpret(NameUsage u, VerbatimNameUsage v) { UsageExtensions ext = new UsageExtensions(); interpretDescription(u, ext,v); interpretDistribution(u ,ext, v); interpretIdentifier(u, ext, v); interpretMultimedia(u, ext, v); interpretReference(u, ext, v); interpretSpeciesProfiles(u, ext, v); interpretTypes(u, ext, v); interpretVernacularNames(u, ext, v); return ext; } }