/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.druid.serde;

import java.io.IOException;
import java.io.InputStream;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;

import org.apache.calcite.adapter.druid.DruidTable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.Constants;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.druid.DruidStorageHandler;
import org.apache.hadoop.hive.druid.DruidStorageHandlerUtils;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.ShortWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.joda.time.Period;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.core.type.TypeReference;
import com.google.common.base.Function;
import com.google.common.collect.Lists;
import com.metamx.common.lifecycle.Lifecycle;
import com.metamx.http.client.HttpClient;
import com.metamx.http.client.HttpClientConfig;
import com.metamx.http.client.HttpClientInit;

import io.druid.query.Druids;
import io.druid.query.Druids.SegmentMetadataQueryBuilder;
import io.druid.query.Query;
import io.druid.query.aggregation.AggregatorFactory;
import io.druid.query.aggregation.PostAggregator;
import io.druid.query.dimension.DimensionSpec;
import io.druid.query.groupby.GroupByQuery;
import io.druid.query.metadata.metadata.ColumnAnalysis;
import io.druid.query.metadata.metadata.SegmentAnalysis;
import io.druid.query.metadata.metadata.SegmentMetadataQuery;
import io.druid.query.select.SelectQuery;
import io.druid.query.timeseries.TimeseriesQuery;
import io.druid.query.topn.TopNQuery;

/**
 * DruidSerDe that is used to deserialize objects from a Druid data source.
 */
@SerDeSpec(schemaProps = { Constants.DRUID_DATA_SOURCE })
public class DruidSerDe extends AbstractSerDe {

  protected static final Logger LOG = LoggerFactory.getLogger(DruidSerDe.class);

  private String[] columns;
  private PrimitiveTypeInfo[] types;
  private ObjectInspector inspector;

  @Override
  public void initialize(Configuration configuration, Properties properties) throws SerDeException {
    final List<String> columnNames = new ArrayList<>();
    final List<PrimitiveTypeInfo> columnTypes = new ArrayList<>();
    List<ObjectInspector> inspectors = new ArrayList<>();

    // Druid query
    String druidQuery = properties.getProperty(Constants.DRUID_QUERY_JSON);
    if (druidQuery == null) {
      // No query. Either it is a CTAS, or we need to create a Druid
      // Segment Metadata query that retrieves all columns present in
      // the data source (dimensions and metrics).
      if (!org.apache.commons.lang3.StringUtils
              .isEmpty(properties.getProperty(serdeConstants.LIST_COLUMNS))
          && !org.apache.commons.lang3.StringUtils
              .isEmpty(properties.getProperty(serdeConstants.LIST_COLUMN_TYPES))) {
        columnNames.addAll(Utilities.getColumnNames(properties));
        if (!columnNames.contains(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
          throw new SerDeException("Timestamp column ('" + DruidTable.DEFAULT_TIMESTAMP_COLUMN +
              "') not specified in create table; list of columns is: " +
              properties.getProperty(serdeConstants.LIST_COLUMNS));
        }
        columnTypes.addAll(Lists.transform(Utilities.getColumnTypes(properties),
            new Function<String, PrimitiveTypeInfo>() {
              @Override
              public PrimitiveTypeInfo apply(String type) {
                return TypeInfoFactory.getPrimitiveTypeInfo(type);
              }
            }
        ));
        inspectors.addAll(Lists.transform(columnTypes,
            new Function<PrimitiveTypeInfo, ObjectInspector>() {
              @Override
              public ObjectInspector apply(PrimitiveTypeInfo type) {
                return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type);
              }
            }
        ));
        columns = columnNames.toArray(new String[columnNames.size()]);
        types = columnTypes.toArray(new PrimitiveTypeInfo[columnTypes.size()]);
        inspector = ObjectInspectorFactory
            .getStandardStructObjectInspector(columnNames, inspectors);
      } else {
        String dataSource = properties.getProperty(Constants.DRUID_DATA_SOURCE);
        if (dataSource == null) {
          throw new SerDeException("Druid data source not specified; use " +
              Constants.DRUID_DATA_SOURCE + " in table properties");
        }
        SegmentMetadataQueryBuilder builder = new Druids.SegmentMetadataQueryBuilder();
        builder.dataSource(dataSource);
        builder.merge(true);
        builder.analysisTypes();
        SegmentMetadataQuery query = builder.build();

        // Execute query in Druid
        String address = HiveConf.getVar(configuration,
            HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
        if (org.apache.commons.lang3.StringUtils.isEmpty(address)) {
          throw new SerDeException("Druid broker address not specified in configuration");
        }
        // Infer schema
        SegmentAnalysis schemaInfo;
        try {
          schemaInfo = submitMetadataRequest(address, query);
        } catch (IOException e) {
          throw new SerDeException(e);
        }
        for (Entry<String, ColumnAnalysis> columnInfo : schemaInfo.getColumns().entrySet()) {
          if (columnInfo.getKey().equals(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
            // Special handling for timestamp column
            columnNames.add(columnInfo.getKey()); // field name
            PrimitiveTypeInfo type = TypeInfoFactory.timestampTypeInfo; // field type
            columnTypes.add(type);
            inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type));
            continue;
          }
          columnNames.add(columnInfo.getKey()); // field name
          PrimitiveTypeInfo type = DruidSerDeUtils.convertDruidToHiveType(
              columnInfo.getValue().getType()); // field type
          columnTypes.add(type);
          inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type));
        }
        columns = columnNames.toArray(new String[columnNames.size()]);
        types = columnTypes.toArray(new PrimitiveTypeInfo[columnTypes.size()]);
        inspector = ObjectInspectorFactory
            .getStandardStructObjectInspector(columnNames, inspectors);
      }
    } else {
      // Query is specified, we can extract the results schema from the query
      Query<?> query;
      try {
        query = DruidStorageHandlerUtils.JSON_MAPPER.readValue(druidQuery, Query.class);
        switch (query.getType()) {
          case Query.TIMESERIES:
            inferSchema((TimeseriesQuery) query, columnNames, columnTypes);
            break;
          case Query.TOPN:
            inferSchema((TopNQuery) query, columnNames, columnTypes);
            break;
          case Query.SELECT:
            String address = HiveConf.getVar(configuration,
                HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
            if (org.apache.commons.lang3.StringUtils.isEmpty(address)) {
              throw new SerDeException("Druid broker address not specified in configuration");
            }
            inferSchema((SelectQuery) query, columnNames, columnTypes, address);
            break;
          case Query.GROUP_BY:
            inferSchema((GroupByQuery) query, columnNames, columnTypes);
            break;
          default:
            throw new SerDeException("Unsupported Druid query");
        }
      } catch (Exception e) {
        throw new SerDeException(e);
      }
      columns = new String[columnNames.size()];
      types = new PrimitiveTypeInfo[columnNames.size()];
      for (int i = 0; i < columnTypes.size(); ++i) {
        columns[i] = columnNames.get(i);
        types[i] = columnTypes.get(i);
        inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(types[i]));
      }
      inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, inspectors);
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("DruidSerDe initialized with\n"
          + "\t columns: " + columnNames
          + "\n\t types: " + columnTypes);
    }
  }

  /* Submits the Segment Metadata query to the Druid broker and returns the merged result */
  protected SegmentAnalysis submitMetadataRequest(String address, SegmentMetadataQuery query)
      throws SerDeException, IOException {
    InputStream response;
    try {
      response = DruidStorageHandlerUtils.submitRequest(DruidStorageHandler.getHttpClient(),
          DruidStorageHandlerUtils.createRequest(address, query));
    } catch (Exception e) {
      throw new SerDeException(StringUtils.stringifyException(e));
    }

    // Retrieve results
    List<SegmentAnalysis> resultsList;
    try {
      resultsList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(response,
          new TypeReference<List<SegmentAnalysis>>() {
          });
    } catch (Exception e) {
      response.close();
      throw new SerDeException(StringUtils.stringifyException(e));
    }
    if (resultsList == null || resultsList.isEmpty()) {
      throw new SerDeException("Connected to Druid but could not retrieve datasource information");
    }
    if (resultsList.size() != 1) {
      throw new SerDeException("Information about segments should have been merged");
    }
    return resultsList.get(0);
  }

  /* Timeseries query */
  private void inferSchema(TimeseriesQuery query, List<String> columnNames,
      List<PrimitiveTypeInfo> columnTypes) {
    // Timestamp column
    columnNames.add(DruidTable.DEFAULT_TIMESTAMP_COLUMN);
    columnTypes.add(TypeInfoFactory.timestampTypeInfo);
    // Aggregator columns
    for (AggregatorFactory af : query.getAggregatorSpecs()) {
      columnNames.add(af.getName());
      columnTypes.add(DruidSerDeUtils.convertDruidToHiveType(af.getTypeName()));
    }
    // Post-aggregator columns
    // TODO: Currently Calcite only infers avg for post-aggregate,
    // but once we recognize other functions, we will need to infer
    // different types for post-aggregation functions
    for (PostAggregator pa : query.getPostAggregatorSpecs()) {
      columnNames.add(pa.getName());
      columnTypes.add(TypeInfoFactory.floatTypeInfo);
    }
  }
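
  // Illustrative note (not in the original source): for a hypothetical timeseries query with one
  // aggregation named "added" and one post-aggregation named "avg_added", the method above would
  // derive the columns (DruidTable.DEFAULT_TIMESTAMP_COLUMN as timestamp, "added" with the Hive
  // type returned by DruidSerDeUtils.convertDruidToHiveType for that aggregator, "avg_added" as
  // float). Post-aggregator columns are always typed as float until the TODO above is addressed.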
  /* TopN query */
  private void inferSchema(TopNQuery query, List<String> columnNames,
      List<PrimitiveTypeInfo> columnTypes) {
    // Timestamp column
    columnNames.add(DruidTable.DEFAULT_TIMESTAMP_COLUMN);
    columnTypes.add(TypeInfoFactory.timestampTypeInfo);
    // Dimension column
    columnNames.add(query.getDimensionSpec().getOutputName());
    columnTypes.add(TypeInfoFactory.stringTypeInfo);
    // Aggregator columns
    for (AggregatorFactory af : query.getAggregatorSpecs()) {
      columnNames.add(af.getName());
      columnTypes.add(DruidSerDeUtils.convertDruidToHiveType(af.getTypeName()));
    }
    // Post-aggregator columns
    // TODO: Currently Calcite only infers avg for post-aggregate,
    // but once we recognize other functions, we will need to infer
    // different types for post-aggregation functions
    for (PostAggregator pa : query.getPostAggregatorSpecs()) {
      columnNames.add(pa.getName());
      columnTypes.add(TypeInfoFactory.floatTypeInfo);
    }
  }

  /* Select query */
  private void inferSchema(SelectQuery query, List<String> columnNames,
      List<PrimitiveTypeInfo> columnTypes, String address) throws SerDeException {
    // Timestamp column
    columnNames.add(DruidTable.DEFAULT_TIMESTAMP_COLUMN);
    columnTypes.add(TypeInfoFactory.timestampTypeInfo);
    // Dimension columns
    for (DimensionSpec ds : query.getDimensions()) {
      columnNames.add(ds.getOutputName());
      columnTypes.add(TypeInfoFactory.stringTypeInfo);
    }
    // The type for metric columns is not explicit in the query, thus in this case
    // we need to emit a metadata query to know their type
    SegmentMetadataQueryBuilder builder = new Druids.SegmentMetadataQueryBuilder();
    builder.dataSource(query.getDataSource());
    builder.merge(true);
    builder.analysisTypes();
    SegmentMetadataQuery metadataQuery = builder.build();
    // Execute query in Druid
    SegmentAnalysis schemaInfo;
    try {
      schemaInfo = submitMetadataRequest(address, metadataQuery);
    } catch (IOException e) {
      throw new SerDeException(e);
    }
    if (schemaInfo == null) {
      throw new SerDeException("Connected to Druid but could not retrieve datasource information");
    }
    for (String metric : query.getMetrics()) {
      columnNames.add(metric);
      columnTypes.add(DruidSerDeUtils.convertDruidToHiveType(
          schemaInfo.getColumns().get(metric).getType()));
    }
  }

  /* GroupBy query */
  private void inferSchema(GroupByQuery query, List<String> columnNames,
      List<PrimitiveTypeInfo> columnTypes) {
    // Timestamp column
    columnNames.add(DruidTable.DEFAULT_TIMESTAMP_COLUMN);
    columnTypes.add(TypeInfoFactory.timestampTypeInfo);
    // Dimension columns
    for (DimensionSpec ds : query.getDimensions()) {
      columnNames.add(ds.getOutputName());
      columnTypes.add(TypeInfoFactory.stringTypeInfo);
    }
    // Aggregator columns
    for (AggregatorFactory af : query.getAggregatorSpecs()) {
      columnNames.add(af.getName());
      columnTypes.add(DruidSerDeUtils.convertDruidToHiveType(af.getTypeName()));
    }
    // Post-aggregator columns
    // TODO: Currently Calcite only infers avg for post-aggregate,
    // but once we recognize other functions, we will need to infer
    // different types for post-aggregation functions
    for (PostAggregator pa : query.getPostAggregatorSpecs()) {
      columnNames.add(pa.getName());
      columnTypes.add(TypeInfoFactory.floatTypeInfo);
    }
  }

  @Override
  public Class<? extends Writable> getSerializedClass() {
    return DruidWritable.class;
  }
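
  /**
   * Added documentation (not in the original source): serializes a Hive row into a
   * {@link DruidWritable}. Besides the declared columns, the incoming struct is expected to carry
   * one extra trailing timestamp field, which is stored under
   * {@code Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME}; timestamps are converted to epoch
   * milliseconds and decimals to double.
   */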
  @Override
  public Writable serialize(Object o, ObjectInspector objectInspector) throws SerDeException {
    if (objectInspector.getCategory() != ObjectInspector.Category.STRUCT) {
      throw new SerDeException(getClass().toString()
          + " can only serialize struct types, but we got: "
          + objectInspector.getTypeName());
    }

    // Prepare the field ObjectInspectors
    StructObjectInspector soi = (StructObjectInspector) objectInspector;
    List<? extends StructField> fields = soi.getAllStructFieldRefs();
    List<Object> values = soi.getStructFieldsDataAsList(o);

    // Build the Druid row from the struct fields
    Map<String, Object> value = new HashMap<>();
    for (int i = 0; i < columns.length; i++) {
      if (values.get(i) == null) {
        // null, we just add it
        value.put(columns[i], null);
        continue;
      }
      final Object res;
      switch (types[i].getPrimitiveCategory()) {
        case TIMESTAMP:
          res = ((TimestampObjectInspector) fields.get(i).getFieldObjectInspector())
              .getPrimitiveJavaObject(values.get(i)).getTime();
          break;
        case BYTE:
          res = ((ByteObjectInspector) fields.get(i).getFieldObjectInspector()).get(values.get(i));
          break;
        case SHORT:
          res = ((ShortObjectInspector) fields.get(i).getFieldObjectInspector()).get(values.get(i));
          break;
        case INT:
          res = ((IntObjectInspector) fields.get(i).getFieldObjectInspector()).get(values.get(i));
          break;
        case LONG:
          res = ((LongObjectInspector) fields.get(i).getFieldObjectInspector()).get(values.get(i));
          break;
        case FLOAT:
          res = ((FloatObjectInspector) fields.get(i).getFieldObjectInspector()).get(values.get(i));
          break;
        case DOUBLE:
          res = ((DoubleObjectInspector) fields.get(i).getFieldObjectInspector()).get(values.get(i));
          break;
        case DECIMAL:
          res = ((HiveDecimalObjectInspector) fields.get(i).getFieldObjectInspector())
              .getPrimitiveJavaObject(values.get(i)).doubleValue();
          break;
        case STRING:
          res = ((StringObjectInspector) fields.get(i).getFieldObjectInspector())
              .getPrimitiveJavaObject(values.get(i));
          break;
        default:
          throw new SerDeException("Unknown type: " + types[i].getPrimitiveCategory());
      }
      value.put(columns[i], res);
    }
    // Extra trailing field holding the granularity timestamp
    value.put(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME,
        ((TimestampObjectInspector) fields.get(columns.length).getFieldObjectInspector())
            .getPrimitiveJavaObject(values.get(columns.length)).getTime());
    return new DruidWritable(value);
  }

  @Override
  public SerDeStats getSerDeStats() {
    // no support for statistics
    return null;
  }

  @Override
  public Object deserialize(Writable writable) throws SerDeException {
    DruidWritable input = (DruidWritable) writable;
    List<Object> output = Lists.newArrayListWithExpectedSize(columns.length);
    for (int i = 0; i < columns.length; i++) {
      final Object value = input.getValue().get(columns[i]);
      if (value == null) {
        output.add(null);
        continue;
      }
      switch (types[i].getPrimitiveCategory()) {
        case TIMESTAMP:
          output.add(new TimestampWritable(new Timestamp((Long) value)));
          break;
        case BYTE:
          output.add(new ByteWritable(((Number) value).byteValue()));
          break;
        case SHORT:
          output.add(new ShortWritable(((Number) value).shortValue()));
          break;
        case INT:
          output.add(new IntWritable(((Number) value).intValue()));
          break;
        case LONG:
          output.add(new LongWritable(((Number) value).longValue()));
          break;
        case FLOAT:
          output.add(new FloatWritable(((Number) value).floatValue()));
          break;
        case DOUBLE:
          output.add(new DoubleWritable(((Number) value).doubleValue()));
          break;
        case DECIMAL:
          output.add(new HiveDecimalWritable(HiveDecimal.create(((Number) value).doubleValue())));
          break;
        case STRING:
          output.add(new Text(value.toString()));
          break;
        default:
          throw new SerDeException("Unknown type: " + types[i].getPrimitiveCategory());
      }
    }
    return output;
  }

  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return inspector;
  }

}
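
// Usage note (illustrative sketch, not part of the original source): this SerDe is normally not
// instantiated directly but wired in through the Druid storage handler, roughly as in the DDL
// below. The table and datasource names are hypothetical; the table property key is the one
// referenced by Constants.DRUID_DATA_SOURCE. With no druid.query.json and no column list, the
// Segment Metadata path in initialize() infers the schema from the broker configured via
// HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS.
//
//   CREATE EXTERNAL TABLE druid_wikipedia
//   STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler'
//   TBLPROPERTIES ("druid.datasource" = "wikipedia");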