package org.gbif.checklistbank.index; import com.google.common.base.Function; import com.google.common.base.Strings; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; import org.gbif.api.model.checklistbank.*; import org.gbif.api.model.checklistbank.search.NameUsageSearchResult; import org.gbif.api.model.checklistbank.search.NameUsageSuggestResult; import org.gbif.api.model.common.LinneanClassification; import org.gbif.api.model.common.LinneanClassificationKeys; import org.gbif.api.util.ClassificationUtils; import org.gbif.api.vocabulary.*; import org.gbif.checklistbank.model.UsageExtensions; import org.gbif.common.parsers.HabitatParser; import org.gbif.common.parsers.core.ParseResult; import org.jsoup.Jsoup; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.util.List; import java.util.UUID; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Threadsafe class that transforms a {@link NameUsage} object into {@link SolrInputDocument} and vice versa. */ public class NameUsageDocConverter { private static final Logger LOG = LoggerFactory.getLogger(NameUsageDocConverter.class); private static final String CONCAT = " # "; private static final Pattern LANG_SPLIT = Pattern.compile("^([a-zA-Z]*)" + CONCAT + "(.*)$"); /** * Takes a name usage instance and its associated information and transform it into a {@link SolrInputDocument}. * * @param usage container to be transformed into a {@link SolrInputDocument}. * vernacular names, descriptions, species profiles and distributions are used, so populate them! * * @return a {@link SolrInputDocument} using the Object parameter. */ public SolrInputDocument toDoc(NameUsage usage, List<Integer> parents, @Nullable UsageExtensions extensions) { try { SolrInputDocument doc = new SolrInputDocument(); doc.addField("key", usage.getKey()); doc.addField("name_key", usage.getNameKey()); doc.addField("nub_key", usage.getNubKey()); doc.addField("dataset_key", str(usage.getDatasetKey())); doc.addField("constituent_key", str(usage.getConstituentKey())); doc.addField("parent_key", usage.getParentKey()); doc.addField("parent", usage.getParent()); doc.addField("accepted_key", usage.getAcceptedKey()); doc.addField("accepted", usage.getAccepted()); doc.addField("basionym_key", usage.getBasionymKey()); doc.addField("basionym", usage.getBasionym()); doc.addField("scientific_name", usage.getScientificName()); doc.addField("canonical_name", usage.getCanonicalName()); doc.addField("name_type", ordinal(usage.getNameType())); doc.addField("authorship", usage.getAuthorship()); doc.addField("origin_key", ordinal(usage.getOrigin())); doc.addField("taxonomic_status_key", ordinal(usage.getTaxonomicStatus())); if (usage.getNomenclaturalStatus() != null) { for (NomenclaturalStatus ns : usage.getNomenclaturalStatus()) { doc.addField("nomenclatural_status_key", ns.ordinal()); } } doc.addField("rank_key", ordinal(usage.getRank())); doc.addField("published_in", usage.getPublishedIn()); doc.addField("according_to", usage.getAccordingTo()); doc.addField("num_descendants", usage.getNumDescendants()); doc.addField("source_id", usage.getTaxonID()); // classification fields addClassification(doc, usage); // higher_taxon_key addHigherTaxonKeys(parents, doc); // issues addIssues(usage, doc); // extract extension infos if (extensions != null) { // vernacular_name, vernacular_name_lang addVernacularNames(doc, extensions); // description addDescriptions(doc, extensions); // threat_status_key addDistributionsAndThreatStatus(doc, extensions); // habitat_key, extinct addSpeciesProfiles(doc, extensions); } return doc; } catch (Exception e) { LOG.error("Error converting usage {} to solr document: {}", usage.getKey(), e.getMessage()); throw new RuntimeException(e); } } public NameUsageSearchResult toSearchUsage(SolrDocument doc, boolean addExtensionData) { NameUsageSearchResult u = new NameUsageSearchResult(); u.setKey((Integer)doc.getFieldValue("key")); u.setNameKey((Integer)doc.getFieldValue("name_key")); u.setNubKey((Integer)doc.getFieldValue("nub_key")); u.setDatasetKey(toUUID(doc.getFieldValue("dataset_key"))); u.setConstituentKey(toUUID(doc.getFieldValue("constituent_key"))); u.setParentKey((Integer)doc.getFieldValue("parent_key")); u.setParent((String)doc.getFieldValue("parent")); u.setAcceptedKey((Integer)doc.getFieldValue("accepted_key")); u.setAccepted((String)doc.getFieldValue("accepted")); u.setBasionymKey((Integer)doc.getFieldValue("basionym_key")); u.setBasionym((String)doc.getFieldValue("basionym")); u.setScientificName((String)doc.getFieldValue("scientific_name")); u.setCanonicalName((String)doc.getFieldValue("canonical_name")); u.setNameType(toEnum(doc, NameType.class, "name_type")); u.setAuthorship((String)doc.getFieldValue("authorship")); u.setOrigin(toEnum(doc, Origin.class, "origin_key")); u.setTaxonomicStatus(toEnum(doc, TaxonomicStatus.class, "taxonomic_status_key")); addEnumList(NomenclaturalStatus.class, u.getNomenclaturalStatus(), doc, "nomenclatural_status_key"); u.setRank(toEnum(doc, Rank.class, "rank_key")); u.setPublishedIn((String)doc.getFieldValue("published_in")); u.setAccordingTo((String)doc.getFieldValue("according_to")); addClassification(doc, u, u); u.setNumDescendants((Integer)doc.getFieldValue("num_descendants")); u.setTaxonID((String)doc.getFieldValue("source_id")); if (addExtensionData) { // habitat_key,extinct addEnumList(Habitat.class, u.getHabitats(), doc, "habitat_key"); u.setExtinct((Boolean) doc.getFieldValue("extinct")); // threat_status_key addEnumList(ThreatStatus.class, u.getThreatStatuses(), doc, "threat_status_key"); // vernacular_name, vernacular_name_lang addObjList(u.getVernacularNames(), doc, "vernacular_name_lang", new Function<Object, VernacularName>() { @Nullable @Override public VernacularName apply(@Nullable Object input) { return deserializeVernacularName((String)input); } }); // description addObjList(u.getDescriptions(), doc, "description", new Function<Object, Description>() { @Nullable @Override public Description apply(@Nullable Object input) { return deserializeDescription((String)input); } }); } return u; } private static <T extends Enum<?>> void addEnumList(Class<T> vocab, List<T> data, SolrDocument doc, String field) { if (doc.getFieldValues(field) != null) { for (Object val : doc.getFieldValues(field)) { data.add(vocab.getEnumConstants()[(Integer) val]); } } } private static <T> void addObjList(List<T> data, SolrDocument doc, String field, Function<Object, T> func) { if (doc.getFieldValues(field) != null) { for (Object val : doc.getFieldValues(field)) { data.add(func.apply(val)); } } } public NameUsageSuggestResult toSuggestUsage(SolrDocument doc) { NameUsageSuggestResult u = new NameUsageSuggestResult(); u.setKey((Integer)doc.getFieldValue("key")); u.setNameKey((Integer)doc.getFieldValue("name_key")); u.setNubKey((Integer)doc.getFieldValue("nub_key")); u.setParentKey((Integer)doc.getFieldValue("parent_key")); u.setParent((String)doc.getFieldValue("parent")); u.setScientificName((String)doc.getFieldValue("scientific_name")); u.setCanonicalName((String)doc.getFieldValue("canonical_name")); u.setStatus(toEnum(doc, TaxonomicStatus.class, "taxonomic_status_key")); u.setRank(toEnum(doc, Rank.class, "rank_key")); addClassification(doc, u, u); return u; } private void addClassification(SolrInputDocument doc, NameUsage usage) { for (Rank r : Rank.DWC_RANKS) { doc.addField(r.name().toLowerCase(), usage.getHigherRank(r)); doc.addField(r.name().toLowerCase()+"_key", usage.getHigherRankKey(r)); } } private void addClassification(SolrDocument doc, LinneanClassification lc, LinneanClassificationKeys lck) { for (Rank r : Rank.DWC_RANKS) { ClassificationUtils.setHigherRank(lc, r, (String) doc.getFieldValue(r.name().toLowerCase())); ClassificationUtils.setHigherRankKey(lck, r, (Integer) doc.getFieldValue(r.name().toLowerCase()+"_key")); } } private static Description deserializeDescription(String description) { Description d = new Description(); d.setDescription(description); return d; } private static VernacularName deserializeVernacularName(String vernacularName) { Matcher m = LANG_SPLIT.matcher(vernacularName); VernacularName vn = new VernacularName(); if (m.find()) { vn.setLanguage(Language.fromIsoCode(m.group(1))); vn.setVernacularName(m.group(2)); } else { vn.setVernacularName(vernacularName); } return vn; } private static String serializeDescription(Description description) { return stripHtml(description.getDescription()); } private static String stripHtml(String html) { if (!Strings.isNullOrEmpty(html)) { try { return Jsoup.parse(html).text(); } catch (RuntimeException e) { LOG.error("Failed to read description input"); } } return null; } private static String serializeVernacularName(VernacularName vn) { return vn.getLanguage() == null ? vn.getVernacularName() : vn.getLanguage().getIso2LetterCode() + CONCAT + vn.getVernacularName(); } private static Integer ordinal(Enum val) { if (val != null) { return val.ordinal(); } return null; } private static String str(@Nullable Object obj) { return obj == null ? null : obj.toString(); } private static <T extends Enum<?>> T toEnum(SolrDocument doc, Class<T> vocab, String field) { return toEnum(vocab, (Integer) doc.getFieldValue(field)); } private static <T extends Enum<?>> T toEnum(Class<T> vocab, Integer ordinal) { if (ordinal != null) { T[] values = vocab.getEnumConstants(); return values[ordinal]; } return null; } private static UUID toUUID(Object value) { if (value != null) { return UUID.fromString(value.toString()); } return null; } private void addIssues(NameUsage nameUsage, SolrInputDocument doc) { try { if (!nameUsage.getIssues().isEmpty()) { for (NameUsageIssue issue : nameUsage.getIssues()) { // Uses the converter to get the key and name values doc.addField("issues", issue.ordinal()); } } } catch (Exception e) { LOG.error("Error converting issues for usage {}", nameUsage.getKey(), e); } } /** * Utility method that iterates over all the {@link Description} objects of a {@link NameUsage}. * * @param doc to be modified by adding the description fields */ private void addDescriptions(SolrInputDocument doc, UsageExtensions ext) { if (ext.descriptions == null) { return; } for (Description description : ext.descriptions) { doc.addField("description", serializeDescription(description)); } } /** * Utility method that iterates over all the {@link Distribution} objects of a {@link NameUsage}. * * @param doc to be modified by adding the distributions fields */ private void addDistributionsAndThreatStatus(SolrInputDocument doc, UsageExtensions ext) { if (ext.distributions == null) { return; } for (Distribution distribution : ext.distributions) { if (distribution.getThreatStatus() != null) { doc.addField("threat_status_key", distribution.getThreatStatus().ordinal()); } } } /** * Adds the multivalued field higher_taxon_key field. * @param parents * @param doc to be modified by adding the higher taxon fields. */ private void addHigherTaxonKeys(List<Integer> parents, SolrInputDocument doc) { if (parents != null) { doc.addField("higher_taxon_key", parents); } } /** * Utility method that iterates over all the {@link SpeciesProfile} objects of a {@link NameUsage}. * * @param doc to be modified by adding the species profiles(extinct & marine) fields */ private void addSpeciesProfiles(SolrInputDocument doc, UsageExtensions ext) { if (ext.speciesProfiles == null) { return; } // use container logic to build a single value NameUsageContainer usage = new NameUsageContainer(); usage.setSpeciesProfiles(ext.speciesProfiles); doc.addField("extinct", usage.isExtinct()); // derive habitat values from boolean flags addHabitat(doc, usage.isFreshwater(), Habitat.FRESHWATER); addHabitat(doc, usage.isMarine(), Habitat.MARINE); addHabitat(doc, usage.isTerrestrial(), Habitat.TERRESTRIAL); // see if we can make use of uncontrolled habitat values with the parser, CoL uses it a lot! HabitatParser hp = HabitatParser.getInstance(); for (String habitat : usage.getHabitats()) { ParseResult<Habitat> result = hp.parse(habitat); if (result.isSuccessful()) { addHabitat(doc, result.getPayload()); } } } private void addHabitat(SolrInputDocument doc, Boolean add, Habitat habitat) { if (add != null && add) { addHabitat(doc, habitat); } } private void addHabitat(SolrInputDocument doc, Habitat habitat) { if (habitat != null) { doc.addField("habitat_key", habitat.ordinal()); } } /** * Utility method that iterates over all the {@link VernacularName} objects of a {@link NameUsage}. * * @param doc to be modified by adding the vernacular name fields */ private void addVernacularNames(SolrInputDocument doc, UsageExtensions ext) { if (ext.vernacularNames == null) { return; } for (VernacularName vernacularName : ext.vernacularNames) { doc.addField("vernacular_name", vernacularName.getVernacularName()); doc.addField("vernacular_name_lang", serializeVernacularName(vernacularName)); } } }