/* * Copyright 2012 Global Biodiversity Information Facility (GBIF) * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.gbif.occurrence.download.query; import org.gbif.api.model.occurrence.predicate.CompoundPredicate; import org.gbif.api.model.occurrence.predicate.ConjunctionPredicate; import org.gbif.api.model.occurrence.predicate.DisjunctionPredicate; import org.gbif.api.model.occurrence.predicate.EqualsPredicate; import org.gbif.api.model.occurrence.predicate.GreaterThanOrEqualsPredicate; import org.gbif.api.model.occurrence.predicate.GreaterThanPredicate; import org.gbif.api.model.occurrence.predicate.InPredicate; import org.gbif.api.model.occurrence.predicate.IsNotNullPredicate; import org.gbif.api.model.occurrence.predicate.LessThanOrEqualsPredicate; import org.gbif.api.model.occurrence.predicate.LessThanPredicate; import org.gbif.api.model.occurrence.predicate.LikePredicate; import org.gbif.api.model.occurrence.predicate.NotPredicate; import org.gbif.api.model.occurrence.predicate.Predicate; import org.gbif.api.model.occurrence.predicate.SimplePredicate; import org.gbif.api.model.occurrence.predicate.WithinPredicate; import org.gbif.api.model.occurrence.search.OccurrenceSearchParameter; import org.gbif.api.util.IsoDateParsingUtils; import org.gbif.api.util.IsoDateParsingUtils.IsoDateFormat; import org.gbif.api.util.SearchTypeValidator; import org.gbif.dwc.terms.DcTerm; import org.gbif.dwc.terms.DwcTerm; import org.gbif.dwc.terms.GbifInternalTerm; import org.gbif.dwc.terms.GbifTerm; import org.gbif.dwc.terms.Term; import org.gbif.occurrence.common.HiveColumnsUtils; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.Map; import com.google.common.base.CharMatcher; import com.google.common.base.Throwables; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Range; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * This class builds a WHERE clause for a Hive query from a {@link org.gbif.api.model.occurrence.predicate.Predicate} * object. * </p> * This is not thread-safe but one instance can be reused. It is package-local and should usually be accessed through * {@link org.gbif.api.service.occurrence.DownloadRequestService}. All {@code visit} methods have to be public for the * {@link Class#getMethod(String, Class[])} call to work. This is the primary reason for this class being * package-local. * </p> * The only entry point into this class is the {@code getHiveQuery} method! */ // TODO: We should check somewhere for the length of the string to avoid possible attacks/oom situations (OCC-35) public class HiveQueryVisitor { private static final Logger LOG = LoggerFactory.getLogger(HiveQueryVisitor.class); private static final String CONJUNCTION_OPERATOR = " AND "; private static final String DISJUNCTION_OPERATOR = " OR "; private static final String EQUALS_OPERATOR = " = "; private static final String GREATER_THAN_OPERATOR = " > "; private static final String GREATER_THAN_EQUALS_OPERATOR = " >= "; private static final String LESS_THAN_OPERATOR = " < "; private static final String LESS_THAN_EQUALS_OPERATOR = " <= "; private static final String NOT_OPERATOR = "NOT "; private static final String LIKE_OPERATOR = " LIKE "; private static final String IS_NOT_NULL_OPERATOR = " IS NOT NULL "; private static final String IS_NOT_NULL_ARRAY_OPERATOR = "(%1$s IS NOT NULL AND size(%1$s) > 0)"; private static final CharMatcher APOSTROPHE_MATCHER = CharMatcher.is('\''); // where query to execute a select all private static final String ALL_QUERY = "true"; private static final String MEDIATYPE_CONTAINS_FMT = "array_contains(" + HiveColumnsUtils.getHiveColumn(GbifTerm.mediaType) + ",'%s')"; private static final String ISSUE_CONTAINS_FMT = "array_contains(" + HiveColumnsUtils.getHiveColumn(GbifTerm.issue) + ",'%s')"; private static final String HIVE_ARRAY_PRE = "ARRAY"; private static final List<GbifTerm> NUB_KEYS = ImmutableList.of(GbifTerm.taxonKey, GbifTerm.kingdomKey, GbifTerm.phylumKey, GbifTerm.classKey, GbifTerm.orderKey, GbifTerm.familyKey, GbifTerm.genusKey, GbifTerm.subgenusKey, GbifTerm.speciesKey); // parameter that map directly to Hive, boundingBox, coordinate and taxonKey are treated special ! private static final Map<OccurrenceSearchParameter, ? extends Term> PARAM_TO_TERM = ImmutableMap.<OccurrenceSearchParameter, Term>builder() .put(OccurrenceSearchParameter.DATASET_KEY, GbifTerm.datasetKey) .put(OccurrenceSearchParameter.YEAR, DwcTerm.year) .put(OccurrenceSearchParameter.MONTH, DwcTerm.month) .put(OccurrenceSearchParameter.DECIMAL_LATITUDE, DwcTerm.decimalLatitude) .put(OccurrenceSearchParameter.DECIMAL_LONGITUDE, DwcTerm.decimalLongitude) .put(OccurrenceSearchParameter.ELEVATION, GbifTerm.elevation) .put(OccurrenceSearchParameter.DEPTH, GbifTerm.depth) .put(OccurrenceSearchParameter.INSTITUTION_CODE, DwcTerm.institutionCode) .put(OccurrenceSearchParameter.COLLECTION_CODE, DwcTerm.collectionCode) .put(OccurrenceSearchParameter.CATALOG_NUMBER, DwcTerm.catalogNumber) .put(OccurrenceSearchParameter.SCIENTIFIC_NAME, DwcTerm.scientificName) .put(OccurrenceSearchParameter.OCCURRENCE_ID, DwcTerm.occurrenceID).put(OccurrenceSearchParameter.ESTABLISHMENT_MEANS, DwcTerm.establishmentMeans) // the following need some value transformation .put(OccurrenceSearchParameter.EVENT_DATE, DwcTerm.eventDate) .put(OccurrenceSearchParameter.LAST_INTERPRETED, GbifTerm.lastInterpreted) .put(OccurrenceSearchParameter.BASIS_OF_RECORD, DwcTerm.basisOfRecord) .put(OccurrenceSearchParameter.COUNTRY, DwcTerm.countryCode) .put(OccurrenceSearchParameter.CONTINENT, DwcTerm.continent) .put(OccurrenceSearchParameter.PUBLISHING_COUNTRY, GbifTerm.publishingCountry) .put(OccurrenceSearchParameter.RECORDED_BY, DwcTerm.recordedBy) .put(OccurrenceSearchParameter.RECORD_NUMBER, DwcTerm.recordNumber) .put(OccurrenceSearchParameter.TYPE_STATUS, DwcTerm.typeStatus) .put(OccurrenceSearchParameter.HAS_COORDINATE, GbifTerm.hasCoordinate) .put(OccurrenceSearchParameter.HAS_GEOSPATIAL_ISSUE, GbifTerm.hasGeospatialIssues) .put(OccurrenceSearchParameter.MEDIA_TYPE, GbifTerm.mediaType) .put(OccurrenceSearchParameter.ISSUE, GbifTerm.issue) .put(OccurrenceSearchParameter.KINGDOM_KEY, GbifTerm.kingdomKey) .put(OccurrenceSearchParameter.PHYLUM_KEY, GbifTerm.phylumKey) .put(OccurrenceSearchParameter.CLASS_KEY, GbifTerm.classKey) .put(OccurrenceSearchParameter.ORDER_KEY, GbifTerm.orderKey) .put(OccurrenceSearchParameter.FAMILY_KEY, GbifTerm.familyKey) .put(OccurrenceSearchParameter.GENUS_KEY, GbifTerm.genusKey) .put(OccurrenceSearchParameter.SUBGENUS_KEY, GbifTerm.subgenusKey) .put(OccurrenceSearchParameter.SPECIES_KEY, GbifTerm.speciesKey) .put(OccurrenceSearchParameter.REPATRIATED, GbifTerm.repatriated) .put(OccurrenceSearchParameter.ORGANISM_ID, DwcTerm.organismID) .put(OccurrenceSearchParameter.LOCALITY, DwcTerm.locality) .put(OccurrenceSearchParameter.STATE_PROVINCE, DwcTerm.stateProvince) .put(OccurrenceSearchParameter.WATER_BODY, DwcTerm.waterBody) .put(OccurrenceSearchParameter.PROTOCOL, GbifTerm.protocol) .put(OccurrenceSearchParameter.LICENSE, DcTerm.license) .put(OccurrenceSearchParameter.PUBLISHING_ORG, GbifInternalTerm.publishingOrgKey) .put(OccurrenceSearchParameter.CRAWL_ID, GbifInternalTerm.crawlId) .build(); private StringBuilder builder; private static String toHiveField(OccurrenceSearchParameter param) { if (PARAM_TO_TERM.containsKey(param)) { return HiveColumnsUtils.getHiveColumn(PARAM_TO_TERM.get(param)); } // QueryBuildingException requires an underlying exception throw new IllegalArgumentException("Search parameter " + param + " is not mapped to Hive"); } /** * Converts a value to the form expected by Hive/Hbase based on the OccurrenceSearchParameter. * Most values pass by unaltered. Quotes are added for values that need to be quoted, escaping any existing quotes. * * @param param the type of parameter defining the expected type * @param value the original query value * * @return the converted value expected by HBase */ private static String toHiveValue(OccurrenceSearchParameter param, String value) { if (Enum.class.isAssignableFrom(param.type())) { // all enum parameters are uppercase return '\'' + value.toUpperCase() + '\''; } if (Date.class.isAssignableFrom(param.type())) { // use longs for timestamps expressed as ISO dates Date d = IsoDateParsingUtils.parseDate(value); return String.valueOf(d.getTime()); } else if (Number.class.isAssignableFrom(param.type()) || Boolean.class.isAssignableFrom(param.type())) { // dont quote numbers return value; } else { // quote value, escape existing quotes return '\'' + APOSTROPHE_MATCHER.replaceFrom(value, "\\\'") + '\''; } } /** * Translates a valid {@link org.gbif.api.model.occurrence.Download} object and translates it into a * strings that can be used as the <em>WHERE</em> clause for a Hive download. * * @param predicate to translate * * @return WHERE clause */ public String getHiveQuery(Predicate predicate) throws QueryBuildingException { String hiveQuery = ALL_QUERY; if (predicate != null) { // null predicate means a SELECT ALL builder = new StringBuilder(); visit(predicate); hiveQuery = builder.toString(); } // Set to null to prevent old StringBuilders hanging around in case this class is reused somewhere else builder = null; return hiveQuery; } public void visit(ConjunctionPredicate predicate) throws QueryBuildingException { visitCompoundPredicate(predicate, CONJUNCTION_OPERATOR); } public void visit(DisjunctionPredicate predicate) throws QueryBuildingException { visitCompoundPredicate(predicate, DISJUNCTION_OPERATOR); } /** * Supports all parameters incl taxonKey expansion for higher taxa. */ public void visit(EqualsPredicate predicate) throws QueryBuildingException { if (OccurrenceSearchParameter.TAXON_KEY == predicate.getKey()) { appendTaxonKeyFilter(predicate.getValue()); } else if (OccurrenceSearchParameter.MEDIA_TYPE == predicate.getKey()) { builder.append(String.format(MEDIATYPE_CONTAINS_FMT, predicate.getValue().toUpperCase())); } else if (OccurrenceSearchParameter.ISSUE == predicate.getKey()) { builder.append(String.format(ISSUE_CONTAINS_FMT, predicate.getValue().toUpperCase())); } else { visitSimplePredicate(predicate, EQUALS_OPERATOR); } } public void visit(GreaterThanOrEqualsPredicate predicate) throws QueryBuildingException { visitSimplePredicate(predicate, GREATER_THAN_EQUALS_OPERATOR); } public void visit(GreaterThanPredicate predicate) throws QueryBuildingException { visitSimplePredicate(predicate, GREATER_THAN_OPERATOR); } public void visit(InPredicate predicate) throws QueryBuildingException { builder.append('('); Iterator<String> iterator = predicate.getValues().iterator(); while (iterator.hasNext()) { String value = iterator.next(); builder.append('('); builder.append(toHiveField(predicate.getKey())); builder.append(EQUALS_OPERATOR); builder.append(toHiveValue(predicate.getKey(), value)); builder.append(')'); if (iterator.hasNext()) { builder.append(DISJUNCTION_OPERATOR); } } builder.append(')'); } public void visit(LessThanOrEqualsPredicate predicate) throws QueryBuildingException { visitSimplePredicate(predicate, LESS_THAN_EQUALS_OPERATOR); } public void visit(LessThanPredicate predicate) throws QueryBuildingException { visitSimplePredicate(predicate, LESS_THAN_OPERATOR); } // TODO: This probably won't work without a bit more intelligence public void visit(LikePredicate predicate) throws QueryBuildingException { visitSimplePredicate(predicate, LIKE_OPERATOR); } public void visit(NotPredicate predicate) throws QueryBuildingException { builder.append(NOT_OPERATOR); visit(predicate.getPredicate()); } public void visit(IsNotNullPredicate predicate) throws QueryBuildingException { if (isHiveArray(predicate.getParameter())) { builder.append(String.format(IS_NOT_NULL_ARRAY_OPERATOR, toHiveField(predicate.getParameter()))); } else { builder.append(toHiveField(predicate.getParameter())); builder.append(IS_NOT_NULL_OPERATOR); } } public void visit(WithinPredicate within) { // the geometry must be valid - it was validated in the predicates constructor builder.append("contains(\""); builder.append(within.getGeometry()); builder.append("\", "); builder.append(HiveColumnsUtils.getHiveColumn(DwcTerm.decimalLatitude)); builder.append(", "); builder.append(HiveColumnsUtils.getHiveColumn(DwcTerm.decimalLongitude)); builder.append(')'); } /** * Builds a list of predicates joined by 'op' statements. * The final statement will look like this: * <p/> * <pre> * ((predicate) op (predicate) ... op (predicate)) * </pre> */ public void visitCompoundPredicate(CompoundPredicate predicate, String op) throws QueryBuildingException { builder.append('('); Iterator<Predicate> iterator = predicate.getPredicates().iterator(); while (iterator.hasNext()) { Predicate subPredicate = iterator.next(); builder.append('('); visit(subPredicate); builder.append(')'); if (iterator.hasNext()) { builder.append(op); } } builder.append(')'); } public void visitSimplePredicate(SimplePredicate predicate, String op) throws QueryBuildingException { if (OccurrenceSearchParameter.ISSUE == predicate.getKey()) { // ignore - there's no way to actually request this in the interface, nor is it indexed in solr } else { if (Date.class.isAssignableFrom(predicate.getKey().type())) { if (SearchTypeValidator.isRange(predicate.getValue())) { visit(toDateRangePredicate(IsoDateParsingUtils.parseDateRange(predicate.getValue()), predicate.getKey())); return; } else { IsoDateFormat isoDateFormat = IsoDateParsingUtils.getFirstDateFormatMatch(predicate.getValue()); if (IsoDateFormat.FULL != isoDateFormat) { visit(toDatePredicateQuery(predicate.getKey(), predicate.getValue(), isoDateFormat)); return; } } } builder.append(toHiveField(predicate.getKey())); builder.append(op); builder.append(toHiveValue(predicate.getKey(), predicate.getValue())); } } /** * Determines if the type of a parameter it'a a Hive array. */ private static boolean isHiveArray(OccurrenceSearchParameter parameter) { return HiveColumnsUtils.getHiveType(PARAM_TO_TERM.get(parameter)).startsWith(HIVE_ARRAY_PRE); } /** * Searches any of the nub keys in HBase of any rank. * * @param taxonKey to append as filter */ private void appendTaxonKeyFilter(String taxonKey) { builder.append('('); boolean first = true; for (Term term : NUB_KEYS) { if (!first) { builder.append(DISJUNCTION_OPERATOR); } builder.append(HiveColumnsUtils.getHiveColumn(term)); builder.append(EQUALS_OPERATOR); builder.append(taxonKey); first = false; } builder.append(')'); } /** * Converts a Date query into conjunction predicate. * If the value has the forms 'yyyy'/'yyyy-MM' it's translated to: * field >= firstDateOfYear/Month(value) and field <= lastDateOfYear/Month(value). * If the value is a range it is translated to: field >= range.lower AND field <= range.upper. */ private static CompoundPredicate toDatePredicateQuery(OccurrenceSearchParameter key, String value, IsoDateFormat dateFormat) { Date lowerDate = IsoDateParsingUtils.parseDate(value); return toDateRangePredicate(Range.closed(lowerDate, IsoDateParsingUtils.toLastDayOf(lowerDate, dateFormat)), key); } /** * Converts date range into a conjunction predicate with the form: field >= range.lower AND field <= range.upper. */ private static CompoundPredicate toDateRangePredicate(Range<Date> range, OccurrenceSearchParameter key) { ImmutableList<Predicate> predicates = new ImmutableList.Builder<Predicate>().add(new GreaterThanOrEqualsPredicate( key, IsoDateFormat.FULL.getDateFormat().format(range.lowerEndpoint().getTime()))) .add(new LessThanOrEqualsPredicate(key, IsoDateFormat.FULL.getDateFormat().format(range.upperEndpoint()))) .build(); return new ConjunctionPredicate(predicates); } private void visit(Object object) throws QueryBuildingException { Method method = null; try { method = getClass().getMethod("visit", new Class[] {object.getClass()}); } catch (NoSuchMethodException e) { LOG.warn("Visit method could not be found. That means a unknown Predicate has been passed", e); throw new IllegalArgumentException("Unknown Predicate", e); } try { method.invoke(this, object); } catch (IllegalAccessException e) { LOG.error("This error shouldn't occurr if all visit methods are public. Probably a programming error", e); Throwables.propagate(e); } catch (InvocationTargetException e) { LOG.info("Exception thrown while building the query", e); throw new QueryBuildingException(e); } } }