package org.gbif.checklistbank.index.backfill;

import org.gbif.api.model.checklistbank.Description;
import org.gbif.api.model.checklistbank.Distribution;
import org.gbif.api.model.checklistbank.NameUsage;
import org.gbif.api.model.checklistbank.NameUsageContainer;
import org.gbif.api.model.checklistbank.VernacularName;
import org.gbif.api.vocabulary.Habitat;
import org.gbif.checklistbank.index.model.NameUsageAvro;
import org.gbif.checklistbank.model.UsageExtensions;
import org.gbif.common.parsers.HabitatParser;
import org.gbif.common.parsers.core.ParseResult;

import java.util.Collection;
import java.util.List;
import javax.annotation.Nullable;

import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Threadsafe class that transforms a {@link org.gbif.api.model.checklistbank.NameUsage} object into a
 * {@link org.gbif.checklistbank.index.model.NameUsageAvro} record.
 */
public class NameUsageAvroConverter {

  private static final String CONCAT = " # ";

  /**
   * Logger for the {@link NameUsageAvroConverter} class.
   */
  protected static final Logger LOG = LoggerFactory.getLogger(NameUsageAvroConverter.class);

  public static String serializeDescription(Description description) {
    return stripHtml(description.getDescription());
  }

  private static String stripHtml(String html) {
    if (!Strings.isNullOrEmpty(html)) {
      try {
        return Jsoup.parse(html).text();
      } catch (RuntimeException e) {
        LOG.error("Failed to read description input", e);
      }
    }
    return null;
  }

  public static String serializeVernacularName(VernacularName vernacularName) {
    return vernacularName.getLanguage().getIso2LetterCode() + CONCAT + vernacularName.getVernacularName();
  }
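  // Illustration only (hypothetical values): serializeVernacularName produces a single string of the form
  // "<iso 2-letter language code> # <vernacular name>", e.g. "en # Atlantic cod", using CONCAT as the separator.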
  /**
   * Takes a {@link NameUsage} and transforms it into a {@link NameUsageAvro} record.
   *
   * @param usage      name usage to be transformed
   * @param parents    keys of the higher taxa of the usage
   * @param extensions extension data of the usage; vernacular names, descriptions, species profiles and
   *                   distributions are used, so populate them!
   *
   * @return a {@link NameUsageAvro} built from the given parameters
   */
  public static NameUsageAvro toObject(NameUsage usage, List<Integer> parents, @Nullable UsageExtensions extensions) {
    try {
      NameUsageAvro nameUsageAvro = new NameUsageAvro();
      nameUsageAvro.setKey(usage.getKey());
      nameUsageAvro.setNubKey(usage.getNubKey());
      if (usage.getDatasetKey() != null) {
        nameUsageAvro.setDatasetKey(usage.getDatasetKey().toString());
      }
      if (usage.getConstituentKey() != null) {
        nameUsageAvro.setConstituentKey(usage.getConstituentKey().toString());
      }
      nameUsageAvro.setParent(usage.getParent());
      nameUsageAvro.setParentKey(usage.getParentKey());
      nameUsageAvro.setAccepted(usage.getAccepted());
      nameUsageAvro.setAcceptedKey(usage.getAcceptedKey());
      nameUsageAvro.setBasionym(usage.getBasionym());
      nameUsageAvro.setBasionymKey(usage.getBasionymKey());
      nameUsageAvro.setScientificName(usage.getScientificName());
      nameUsageAvro.setCanonicalName(usage.getCanonicalName());
      nameUsageAvro.setAuthorship(usage.getAuthorship());
      nameUsageAvro.setPublishedIn(usage.getPublishedIn());
      nameUsageAvro.setAccordingTo(usage.getAccordingTo());
      nameUsageAvro.setKingdom(usage.getKingdom());
      nameUsageAvro.setKingdomKey(usage.getKingdomKey());
      nameUsageAvro.setPhylum(usage.getPhylum());
      nameUsageAvro.setPhylumKey(usage.getPhylumKey());
      nameUsageAvro.setClazz(usage.getClazz());
      nameUsageAvro.setClassKey(usage.getClassKey());
      nameUsageAvro.setOrder(usage.getOrder());
      nameUsageAvro.setOrderKey(usage.getOrderKey());
      nameUsageAvro.setFamily(usage.getFamily());
      nameUsageAvro.setFamilyKey(usage.getFamilyKey());
      nameUsageAvro.setGenus(usage.getGenus());
      nameUsageAvro.setGenusKey(usage.getGenusKey());
      nameUsageAvro.setSubgenus(usage.getSubgenus());
      nameUsageAvro.setSubgenusKey(usage.getSubgenusKey());
      nameUsageAvro.setSpecies(usage.getSpecies());
      nameUsageAvro.setSpeciesKey(usage.getSpeciesKey());
      nameUsageAvro.setNumDescendants(usage.getNumDescendants());
      nameUsageAvro.setIsSynonym(usage.isSynonym());

      // higher taxa
      nameUsageAvro.setHigherTaxonKey(parents);

      // enums
      if (usage.getNameType() != null) {
        nameUsageAvro.setNameType(usage.getNameType().ordinal());
      }
      nameUsageAvro.setIssues(getOrdinals(usage.getIssues()));
      nameUsageAvro.setNomenclaturalStatusKey(getOrdinals(usage.getNomenclaturalStatus()));
      if (usage.getOrigin() != null) {
        nameUsageAvro.setOriginKey(usage.getOrigin().ordinal());
      }
      if (usage.getTaxonomicStatus() != null) {
        nameUsageAvro.setTaxonomicStatusKey(usage.getTaxonomicStatus().ordinal());
      }
      if (usage.getRank() != null) {
        nameUsageAvro.setRankKey(usage.getRank().ordinal());
      }

      // extract extension infos
      if (extensions != null) {
        addVernacularNames(nameUsageAvro, extensions);
        addDescriptions(nameUsageAvro, extensions);
        addDistributionsAndThreatStatus(nameUsageAvro, extensions);
        addSpeciesProfiles(nameUsageAvro, extensions);
      }
      return nameUsageAvro;

    } catch (Exception e) {
      LOG.error("Error converting usage {} extension {} and parent {} to avro", usage, extensions, parents, e);
      throw new RuntimeException(e);
    }
  }

  private static List<Integer> getOrdinals(Collection<? extends Enum<?>> enums) {
    List<Integer> ordinals = null;
    try {
      if (enums != null && !enums.isEmpty()) {
        ordinals = Lists.newArrayList();
        for (Enum<?> literal : enums) {
          if (literal != null) {
            ordinals.add(literal.ordinal());
          }
        }
      }
    } catch (Exception e) {
      LOG.error("Error converting ordinals for enum", e);
    }
    return ordinals;
  }
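  /*
   * Example usage (a minimal sketch; the surrounding variables are hypothetical): assuming the
   * NameUsage, the keys of its higher taxa and an optional UsageExtensions object have been loaded
   * elsewhere, conversion is a single static call:
   *
   *   NameUsageAvro avro = NameUsageAvroConverter.toObject(usage, parentKeys, extensions);
   *
   * The extensions argument may be null, in which case only the core usage fields are mapped.
   */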
  /**
   * Utility method that iterates over all the {@link org.gbif.api.model.checklistbank.Description} objects of a
   * {@link org.gbif.api.model.checklistbank.NameUsage}.
   *
   * @param nameUsageAvro to be modified by adding the description fields
   */
  private static void addDescriptions(NameUsageAvro nameUsageAvro, UsageExtensions ext) {
    if (ext.descriptions == null) {
      return;
    }
    List<String> descriptions = Lists.newArrayList();
    for (Description description : ext.descriptions) {
      descriptions.add(serializeDescription(description));
    }
    nameUsageAvro.setDescription(descriptions);
  }

  /**
   * Utility method that iterates over all the {@link org.gbif.api.model.checklistbank.Distribution} objects of a
   * {@link org.gbif.api.model.checklistbank.NameUsage}.
   *
   * @param nameUsageAvro to be modified by adding the distribution fields
   */
  private static void addDistributionsAndThreatStatus(NameUsageAvro nameUsageAvro, UsageExtensions ext) {
    if (ext.distributions == null) {
      return;
    }
    List<Integer> threatStatusKeys = Lists.newArrayList();
    for (Distribution distribution : ext.distributions) {
      if (distribution.getThreatStatus() != null) {
        threatStatusKeys.add(distribution.getThreatStatus().ordinal());
      }
    }
    nameUsageAvro.setThreatStatusKey(threatStatusKeys);
  }

  /**
   * Utility method that iterates over all the {@link org.gbif.api.model.checklistbank.SpeciesProfile} objects of a
   * {@link org.gbif.api.model.checklistbank.NameUsage}.
   *
   * @param nameUsageAvro to be modified by adding the species profile (extinct & marine) fields
   */
  private static void addSpeciesProfiles(NameUsageAvro nameUsageAvro, UsageExtensions ext) {
    if (ext.speciesProfiles == null) {
      return;
    }

    // use container logic to build a single value
    NameUsageContainer usage = new NameUsageContainer();
    usage.setSpeciesProfiles(ext.speciesProfiles);

    nameUsageAvro.setExtinct(usage.isExtinct());
    nameUsageAvro.setHabitatKey(getHabitatsKeys(usage));
  }

  private static List<Integer> getHabitatsKeys(NameUsageContainer usage) {
    List<Integer> habitats = Lists.newArrayList();
    // derive habitat values from boolean flags
    addHabitat(habitats, usage.isFreshwater(), Habitat.FRESHWATER);
    addHabitat(habitats, usage.isMarine(), Habitat.MARINE);
    addHabitat(habitats, usage.isTerrestrial(), Habitat.TERRESTRIAL);
    // see if we can make use of uncontrolled habitat values with the parser, CoL uses it a lot!
    HabitatParser hp = HabitatParser.getInstance();
    for (String habitat : usage.getHabitats()) {
      ParseResult<Habitat> result = hp.parse(habitat);
      if (result.isSuccessful()) {
        addHabitat(habitats, result.getPayload());
      }
    }
    return habitats;
  }

  private static void addHabitat(List<Integer> habitats, Boolean add, Habitat habitat) {
    if (add != null && add) {
      addHabitat(habitats, habitat);
    }
  }

  private static void addHabitat(List<Integer> habitats, Habitat habitat) {
    if (habitat != null) {
      habitats.add(habitat.ordinal());
    }
  }
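  /*
   * Illustration only (hypothetical data): a species profile flagged marine=true plus an uncontrolled
   * habitat string such as "freshwater lakes" that HabitatParser resolves to FRESHWATER would yield the
   * habitat keys [Habitat.MARINE.ordinal(), Habitat.FRESHWATER.ordinal()]. The list is not de-duplicated,
   * so the same key can appear more than once if both the boolean flags and the parser report it.
   */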
  /**
   * Utility method that iterates over all the {@link org.gbif.api.model.checklistbank.VernacularName} objects of a
   * {@link org.gbif.api.model.checklistbank.NameUsage}.
   *
   * @param nameUsageAvro to be modified by adding the vernacular name fields
   */
  private static void addVernacularNames(NameUsageAvro nameUsageAvro, UsageExtensions ext) {
    if (ext.vernacularNames == null) {
      return;
    }
    List<String> vernacularNames = Lists.newArrayList();
    List<String> vernacularLang = Lists.newArrayList();
    List<String> vernacularNamesLang = Lists.newArrayList();
    for (VernacularName vernacularName : ext.vernacularNames) {
      vernacularNames.add(vernacularName.getVernacularName());
      if (vernacularName.getLanguage() != null) {
        vernacularLang.add(vernacularName.getLanguage().getIso2LetterCode());
        vernacularNamesLang.add(serializeVernacularName(vernacularName));
      }
    }
    nameUsageAvro.setVernacularName(vernacularNames);
    nameUsageAvro.setVernacularLang(vernacularLang);
    nameUsageAvro.setVernacularNameLang(vernacularNamesLang);
  }

}