package org.gbif.occurrence.hive.udf; import org.gbif.api.model.checklistbank.NameUsageMatch; import org.gbif.api.vocabulary.Rank; import org.gbif.common.parsers.RankParser; import org.gbif.common.parsers.core.ParseResult; import org.gbif.common.parsers.utils.ClassificationUtils; import org.gbif.occurrence.processor.guice.ApiClientConfiguration; import org.gbif.occurrence.processor.interpreting.TaxonomyInterpreter; import java.net.URI; import java.util.Arrays; import java.util.List; import com.beust.jcommander.internal.Lists; import com.google.common.base.Joiner; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.MapredContext; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A UDF to run a backbone species match against the GBIF API. * The UDF is lazily initialized with the base URL of the API to be used. * Within the same JVM the UDF will only ever use the first URL used and ignores subsequently changed URLs. */ @Description( name = "match", value = "_FUNC_(apiUrl, kingdom, phylum, class, order, family, genus, scientificName, specificEpithet, infraspecificEpithet, rank)") public class SpeciesMatchUDF extends GenericUDF { private static final Logger LOG = LoggerFactory.getLogger(SpeciesMatchUDF.class); private static final int ARG_LENGTH = 11; private static final Joiner JOIN_COMMA = Joiner.on(",").useForNull("-"); private static final RankParser RANK_PARSER = RankParser.getInstance(); private TaxonomyInterpreter taxonomyInterpreter; private Object lock = new Object(); public TaxonomyInterpreter getInterpreter(URI apiWs) { TaxonomyInterpreter ti = taxonomyInterpreter; if (ti == null) { synchronized (lock) { // while we were waiting for the lock, another thread may have instantiated the object ti = taxonomyInterpreter; if (ti == null) { LOG.info("Create new species match client using API at {}", apiWs); ApiClientConfiguration cfg = new ApiClientConfiguration(); cfg.url = apiWs; ti = new TaxonomyInterpreter(cfg); taxonomyInterpreter = ti; } } } return ti; } private String clean(Object object) { if (object != null) { return ClassificationUtils.clean(object.toString()); } return null; } @Override public void configure(MapredContext context) { super.configure(context); } @Override public Object evaluate(DeferredObject[] arguments) throws HiveException { assert arguments.length == ARG_LENGTH; URI api = URI.create(arguments[0].get().toString()); String k = clean(arguments[1].get()); String p = clean(arguments[2].get()); String c = clean(arguments[3].get()); String o = clean(arguments[4].get()); String f = clean(arguments[5].get()); String g = clean(arguments[6].get()); String name = clean(arguments[7].get()); String sp = clean(arguments[8].get()); String ssp = clean(arguments[9].get()); Rank rank = null; Object rankInput = arguments[10].get(); if (rankInput != null) { rank = RANK_PARSER.parse(rankInput.toString()).getPayload(); } List<Object> result = Lists.newArrayList(21); //TODO: add authorship as a standalone parameter ParseResult<NameUsageMatch> response = getInterpreter(api).match(k, p, c, o, f, g, name, null, null, sp, ssp, rank); if (response != null) { result.add(response.getStatus()); if (response.getPayload() != null) { NameUsageMatch lookup = response.getPayload(); result.add(lookup.getUsageKey()); result.add(lookup.getScientificName()); result.add(lookup.getRank()); result.add(lookup.getStatus()); result.add(lookup.getMatchType()); result.add(lookup.getConfidence()); result.add(lookup.getKingdomKey()); result.add(lookup.getPhylumKey()); result.add(lookup.getClassKey()); result.add(lookup.getOrderKey()); result.add(lookup.getFamilyKey()); result.add(lookup.getGenusKey()); result.add(lookup.getSpeciesKey()); result.add(lookup.getKingdom()); result.add(lookup.getPhylum()); result.add(lookup.getClazz()); result.add(lookup.getOrder()); result.add(lookup.getFamily()); result.add(lookup.getGenus()); result.add(lookup.getSpecies()); result.add(lookup.getStatus()); } else if (response.getError() != null) { LOG.error("Error finding species match", response.getError()); } } return result; } @Override public String getDisplayString(String[] strings) { assert strings.length == ARG_LENGTH; return "match(" + JOIN_COMMA.join(strings) + ')'; } @Override public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { if (arguments.length != ARG_LENGTH) { throw new UDFArgumentException("nubLookup takes " + ARG_LENGTH + " arguments"); } return ObjectInspectorFactory.getStandardStructObjectInspector(Arrays .asList("queryStatus", "taxonKey", "scientificName", "rank", "status", "matchType", "confidence", "kingdomKey", "phylumKey", "classKey", "orderKey", "familyKey", "genusKey", "speciesKey", "kingdom", "phylum", "class_", "order_", "family", "genus", "species", "taxonomicStatus"), Arrays.<ObjectInspector>asList( PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaIntObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaIntObjectInspector, PrimitiveObjectInspectorFactory.javaIntObjectInspector, PrimitiveObjectInspectorFactory.javaIntObjectInspector, PrimitiveObjectInspectorFactory.javaIntObjectInspector, PrimitiveObjectInspectorFactory.javaIntObjectInspector, PrimitiveObjectInspectorFactory.javaIntObjectInspector, PrimitiveObjectInspectorFactory.javaIntObjectInspector, PrimitiveObjectInspectorFactory.javaIntObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector, PrimitiveObjectInspectorFactory.javaStringObjectInspector ) ); } }