/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.druid.serde;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import org.apache.calcite.adapter.druid.DruidTable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.Constants;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.druid.DruidStorageHandler;
import org.apache.hadoop.hive.druid.DruidStorageHandlerUtils;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.ShortWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.joda.time.Period;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.type.TypeReference;
import com.google.common.base.Function;
import com.google.common.collect.Lists;
import com.metamx.common.lifecycle.Lifecycle;
import com.metamx.http.client.HttpClient;
import com.metamx.http.client.HttpClientConfig;
import com.metamx.http.client.HttpClientInit;
import io.druid.query.Druids;
import io.druid.query.Druids.SegmentMetadataQueryBuilder;
import io.druid.query.Query;
import io.druid.query.aggregation.AggregatorFactory;
import io.druid.query.aggregation.PostAggregator;
import io.druid.query.dimension.DimensionSpec;
import io.druid.query.groupby.GroupByQuery;
import io.druid.query.metadata.metadata.ColumnAnalysis;
import io.druid.query.metadata.metadata.SegmentAnalysis;
import io.druid.query.metadata.metadata.SegmentMetadataQuery;
import io.druid.query.select.SelectQuery;
import io.druid.query.timeseries.TimeseriesQuery;
import io.druid.query.topn.TopNQuery;
/**
* DruidSerDe that is used to serialize rows to be stored in, and deserialize rows read
* from, a Druid data source.
*/
@SerDeSpec(schemaProps = { Constants.DRUID_DATA_SOURCE })
public class DruidSerDe extends AbstractSerDe {
/** Logger for this SerDe. */
protected static final Logger LOG = LoggerFactory.getLogger(DruidSerDe.class);
/** Column names of the table, assigned by initialize(). */
private String[] columns;
/** Primitive Hive types of the columns; types[i] is the type of columns[i]. */
private PrimitiveTypeInfo[] types;
/** Struct object inspector built over the columns; returned by getObjectInspector(). */
private ObjectInspector inspector;
/**
 * Initializes the SerDe by resolving the column names, their primitive types and the
 * struct object inspector that exposes them.
 *
 * <p>The schema is resolved from the first applicable source:
 * <ol>
 * <li>the Druid query stored under {@code Constants.DRUID_QUERY_JSON}, when present;</li>
 * <li>the column names and types declared in the table properties (e.g. a CTAS),
 * when both are non-empty;</li>
 * <li>a segment metadata query submitted to the configured Druid broker.</li>
 * </ol>
 *
 * @param configuration configuration from which the Druid broker address is read
 * @param properties table properties (Druid query, data source, declared columns)
 * @throws SerDeException if the timestamp column is missing from the declared columns,
 *         the data source or broker address is not specified, the query type is not
 *         supported, or the schema cannot be inferred
 */
@Override
public void initialize(Configuration configuration, Properties properties) throws SerDeException {
  final List<String> columnNames = new ArrayList<>();
  final List<PrimitiveTypeInfo> columnTypes = new ArrayList<>();

  final String druidQuery = properties.getProperty(Constants.DRUID_QUERY_JSON);
  if (druidQuery != null) {
    // Query is specified, we can extract the results schema from the query
    try {
      final Query<?> query =
          DruidStorageHandlerUtils.JSON_MAPPER.readValue(druidQuery, Query.class);
      switch (query.getType()) {
        case Query.TIMESERIES:
          inferSchema((TimeseriesQuery) query, columnNames, columnTypes);
          break;
        case Query.TOPN:
          inferSchema((TopNQuery) query, columnNames, columnTypes);
          break;
        case Query.SELECT: {
          final String address = HiveConf.getVar(configuration,
              HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
          if (org.apache.commons.lang3.StringUtils.isEmpty(address)) {
            throw new SerDeException("Druid broker address not specified in configuration");
          }
          inferSchema((SelectQuery) query, columnNames, columnTypes, address);
          break;
        }
        case Query.GROUP_BY:
          inferSchema((GroupByQuery) query, columnNames, columnTypes);
          break;
        default:
          throw new SerDeException("Not supported Druid query");
      }
    } catch (SerDeException e) {
      // Already carries a meaningful message; do not bury it inside another SerDeException
      throw e;
    } catch (Exception e) {
      throw new SerDeException(e);
    }
  } else if (!org.apache.commons.lang3.StringUtils
      .isEmpty(properties.getProperty(serdeConstants.LIST_COLUMNS))
      && !org.apache.commons.lang3.StringUtils
      .isEmpty(properties.getProperty(serdeConstants.LIST_COLUMN_TYPES))) {
    // No query, but the columns are declared in the table properties (e.g. CTAS)
    columnNames.addAll(Utilities.getColumnNames(properties));
    if (!columnNames.contains(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
      throw new SerDeException("Timestamp column ('" + DruidTable.DEFAULT_TIMESTAMP_COLUMN +
          "') not specified in create table; list of columns is : " +
          properties.getProperty(serdeConstants.LIST_COLUMNS));
    }
    for (String typeName : Utilities.getColumnTypes(properties)) {
      columnTypes.add(TypeInfoFactory.getPrimitiveTypeInfo(typeName));
    }
  } else {
    // No query and no declared columns: create a Druid Segment Metadata query that
    // retrieves all columns present in the data source (dimensions and metrics)
    final String dataSource = properties.getProperty(Constants.DRUID_DATA_SOURCE);
    if (dataSource == null) {
      throw new SerDeException("Druid data source not specified; use " +
          Constants.DRUID_DATA_SOURCE + " in table properties");
    }
    final SegmentMetadataQueryBuilder builder = new Druids.SegmentMetadataQueryBuilder();
    builder.dataSource(dataSource);
    builder.merge(true);
    builder.analysisTypes();
    final SegmentMetadataQuery metadataQuery = builder.build();

    // Execute query in Druid
    final String address = HiveConf.getVar(configuration,
        HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
    if (org.apache.commons.lang3.StringUtils.isEmpty(address)) {
      throw new SerDeException("Druid broker address not specified in configuration");
    }

    // Infer schema
    final SegmentAnalysis schemaInfo;
    try {
      schemaInfo = submitMetadataRequest(address, metadataQuery);
    } catch (IOException e) {
      throw new SerDeException(e);
    }
    for (Entry<String, ColumnAnalysis> columnInfo : schemaInfo.getColumns().entrySet()) {
      final String columnName = columnInfo.getKey();
      final PrimitiveTypeInfo columnType;
      if (columnName.equals(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
        // Special handling for timestamp column
        columnType = TypeInfoFactory.timestampTypeInfo;
      } else {
        columnType = DruidSerDeUtils.convertDruidToHiveType(columnInfo.getValue().getType());
      }
      columnNames.add(columnName);
      columnTypes.add(columnType);
    }
  }

  // Common tail: one writable inspector per column, then publish the schema
  final List<ObjectInspector> inspectors = new ArrayList<>(columnTypes.size());
  for (PrimitiveTypeInfo columnType : columnTypes) {
    inspectors.add(
        PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(columnType));
  }
  columns = columnNames.toArray(new String[columnNames.size()]);
  types = columnTypes.toArray(new PrimitiveTypeInfo[columnTypes.size()]);
  inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, inspectors);

  if (LOG.isDebugEnabled()) {
    LOG.debug("DruidSerDe initialized with\n"
        + "\t columns: " + columnNames
        + "\n\t types: " + columnTypes);
  }
}
/**
 * Submits the segment metadata query to the Druid broker and returns the single,
 * merged analysis contained in the response.
 *
 * @param address address of the Druid broker the request is sent to
 * @param query segment metadata query to execute
 * @return the only {@link SegmentAnalysis} present in the response
 * @throws SerDeException if the request cannot be submitted, the response cannot be
 *         read, or the response does not contain exactly one result
 * @throws IOException kept in the signature for callers and overriding implementations
 */
protected SegmentAnalysis submitMetadataRequest(String address, SegmentMetadataQuery query)
    throws SerDeException, IOException {
  final InputStream response;
  try {
    response = DruidStorageHandlerUtils.submitRequest(DruidStorageHandler.getHttpClient(),
        DruidStorageHandlerUtils.createRequest(address, query));
  } catch (Exception e) {
    throw new SerDeException(StringUtils.stringifyException(e));
  }

  // Retrieve results. try-with-resources closes the stream on every path, tolerates a
  // null response, and keeps a failure of close() from masking the parsing error.
  List<SegmentAnalysis> resultsList;
  try (InputStream in = response) {
    resultsList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(in,
        new TypeReference<List<SegmentAnalysis>>() {
        });
  } catch (Exception e) {
    throw new SerDeException(StringUtils.stringifyException(e));
  }

  if (resultsList == null || resultsList.isEmpty()) {
    throw new SerDeException("Connected to Druid but could not retrieve datasource information");
  }
  if (resultsList.size() != 1) {
    throw new SerDeException("Information about segments should have been merged");
  }
  return resultsList.get(0);
}
/**
 * Infers the result schema of a timeseries query: the timestamp column, followed by one
 * column per aggregator and one per post-aggregator.
 *
 * @param query timeseries query whose result schema is inferred
 * @param columnNames output list the column names are appended to
 * @param columnTypes output list the column types are appended to
 */
private void inferSchema(TimeseriesQuery query, List<String> columnNames,
    List<PrimitiveTypeInfo> columnTypes) {
  // Results always start with the timestamp
  columnNames.add(DruidTable.DEFAULT_TIMESTAMP_COLUMN);
  columnTypes.add(TypeInfoFactory.timestampTypeInfo);

  // Aggregators: the Hive type is derived from the Druid type name
  for (AggregatorFactory aggregator : query.getAggregatorSpecs()) {
    columnNames.add(aggregator.getName());
    final PrimitiveTypeInfo hiveType =
        DruidSerDeUtils.convertDruidToHiveType(aggregator.getTypeName());
    columnTypes.add(hiveType);
  }

  // Post-aggregators
  // TODO: Currently Calcite only infers avg for post-aggregate,
  // but once we recognize other functions, we will need to infer
  // different types for post-aggregation functions
  for (PostAggregator postAggregator : query.getPostAggregatorSpecs()) {
    final String outputName = postAggregator.getName();
    columnNames.add(outputName);
    columnTypes.add(TypeInfoFactory.floatTypeInfo);
  }
}
/**
 * Infers the result schema of a topN query: the timestamp column, the single dimension
 * (as string), then one column per aggregator and one per post-aggregator.
 *
 * @param query topN query whose result schema is inferred
 * @param columnNames output list the column names are appended to
 * @param columnTypes output list the column types are appended to
 */
private void inferSchema(TopNQuery query, List<String> columnNames,
    List<PrimitiveTypeInfo> columnTypes) {
  // Results always start with the timestamp
  columnNames.add(DruidTable.DEFAULT_TIMESTAMP_COLUMN);
  columnTypes.add(TypeInfoFactory.timestampTypeInfo);

  // The single dimension of the topN
  final String dimensionName = query.getDimensionSpec().getOutputName();
  columnNames.add(dimensionName);
  columnTypes.add(TypeInfoFactory.stringTypeInfo);

  // Aggregators: the Hive type is derived from the Druid type name
  for (AggregatorFactory aggregator : query.getAggregatorSpecs()) {
    columnNames.add(aggregator.getName());
    final PrimitiveTypeInfo hiveType =
        DruidSerDeUtils.convertDruidToHiveType(aggregator.getTypeName());
    columnTypes.add(hiveType);
  }

  // Post-aggregators
  // TODO: Currently Calcite only infers avg for post-aggregate,
  // but once we recognize other functions, we will need to infer
  // different types for post-aggregation functions
  for (PostAggregator postAggregator : query.getPostAggregatorSpecs()) {
    final String outputName = postAggregator.getName();
    columnNames.add(outputName);
    columnTypes.add(TypeInfoFactory.floatTypeInfo);
  }
}
/**
 * Infers the result schema of a select query: the timestamp column, one string column
 * per dimension, and one column per metric. Metric types are not explicit in the query,
 * so they are looked up through a segment metadata query against the Druid broker.
 *
 * @param query select query whose result schema is inferred
 * @param columnNames output list the column names are appended to
 * @param columnTypes output list the column types are appended to
 * @param address address of the Druid broker used for the metadata query
 * @throws SerDeException if the metadata cannot be retrieved or does not describe one of
 *         the metrics referenced by the query
 */
private void inferSchema(SelectQuery query, List<String> columnNames,
    List<PrimitiveTypeInfo> columnTypes, String address) throws SerDeException {
  // Timestamp column
  columnNames.add(DruidTable.DEFAULT_TIMESTAMP_COLUMN);
  columnTypes.add(TypeInfoFactory.timestampTypeInfo);
  // Dimension columns
  for (DimensionSpec ds : query.getDimensions()) {
    columnNames.add(ds.getOutputName());
    columnTypes.add(TypeInfoFactory.stringTypeInfo);
  }
  // The type for metric columns is not explicit in the query, thus in this case
  // we need to emit a metadata query to know their type
  final SegmentMetadataQueryBuilder builder = new Druids.SegmentMetadataQueryBuilder();
  builder.dataSource(query.getDataSource());
  builder.merge(true);
  builder.analysisTypes();
  final SegmentMetadataQuery metadataQuery = builder.build();
  // Execute query in Druid
  final SegmentAnalysis schemaInfo;
  try {
    schemaInfo = submitMetadataRequest(address, metadataQuery);
  } catch (IOException e) {
    throw new SerDeException(e);
  }
  if (schemaInfo == null) {
    throw new SerDeException("Connected to Druid but could not retrieve datasource information");
  }
  final Map<String, ColumnAnalysis> analyzedColumns = schemaInfo.getColumns();
  for (String metric : query.getMetrics()) {
    // A metric unknown to the metadata would otherwise surface as a bare NullPointerException
    final ColumnAnalysis analysis =
        analyzedColumns == null ? null : analyzedColumns.get(metric);
    if (analysis == null) {
      throw new SerDeException("Metric '" + metric
          + "' not found in the metadata retrieved from Druid for data source "
          + query.getDataSource());
    }
    columnNames.add(metric);
    columnTypes.add(DruidSerDeUtils.convertDruidToHiveType(analysis.getType()));
  }
}
/**
 * Infers the result schema of a groupBy query: the timestamp column, one string column
 * per dimension, then one column per aggregator and one per post-aggregator.
 *
 * @param query groupBy query whose result schema is inferred
 * @param columnNames output list the column names are appended to
 * @param columnTypes output list the column types are appended to
 */
private void inferSchema(GroupByQuery query, List<String> columnNames,
    List<PrimitiveTypeInfo> columnTypes) {
  // Results always start with the timestamp
  columnNames.add(DruidTable.DEFAULT_TIMESTAMP_COLUMN);
  columnTypes.add(TypeInfoFactory.timestampTypeInfo);

  // Grouping dimensions are always exposed as strings
  for (DimensionSpec dimension : query.getDimensions()) {
    final String outputName = dimension.getOutputName();
    columnNames.add(outputName);
    columnTypes.add(TypeInfoFactory.stringTypeInfo);
  }

  // Aggregators: the Hive type is derived from the Druid type name
  for (AggregatorFactory aggregator : query.getAggregatorSpecs()) {
    columnNames.add(aggregator.getName());
    final PrimitiveTypeInfo hiveType =
        DruidSerDeUtils.convertDruidToHiveType(aggregator.getTypeName());
    columnTypes.add(hiveType);
  }

  // Post-aggregators
  // TODO: Currently Calcite only infers avg for post-aggregate,
  // but once we recognize other functions, we will need to infer
  // different types for post-aggregation functions
  for (PostAggregator postAggregator : query.getPostAggregatorSpecs()) {
    final String outputName = postAggregator.getName();
    columnNames.add(outputName);
    columnTypes.add(TypeInfoFactory.floatTypeInfo);
  }
}
/**
 * Returns the Writable class produced by {@code serialize}.
 *
 * @return {@code DruidWritable.class}
 */
@Override
public Class<? extends Writable> getSerializedClass() {
  final Class<? extends Writable> serializedClass = DruidWritable.class;
  return serializedClass;
}
/**
 * Serializes a struct row into a {@code DruidWritable} keyed by column name.
 *
 * <p>The row is expected to carry one field per table column, in the same order as the
 * columns resolved by {@code initialize}, followed by one extra timestamp field whose
 * value is stored under {@code Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME}.
 * Timestamps are written as epoch milliseconds and decimals as doubles.
 *
 * @param o the row to serialize
 * @param objectInspector inspector describing the row; must be a struct inspector
 * @return a {@code DruidWritable} holding the column values
 * @throws SerDeException if the inspector is not a struct inspector, the row does not
 *         carry the expected fields, the granularity value is null, or a column has an
 *         unsupported type
 */
@Override
public Writable serialize(Object o, ObjectInspector objectInspector) throws SerDeException {
  if (objectInspector.getCategory() != ObjectInspector.Category.STRUCT) {
    throw new SerDeException(getClass().toString()
        + " can only serialize struct types, but we got: "
        + objectInspector.getTypeName());
  }

  // Prepare the field ObjectInspectors
  final StructObjectInspector soi = (StructObjectInspector) objectInspector;
  final List<? extends StructField> fields = soi.getAllStructFieldRefs();
  final List<Object> values = soi.getStructFieldsDataAsList(o);

  // The granularity timestamp is read from the field right after the table columns;
  // fail with a clear message instead of an index/null error when it is absent
  final int granularityIndex = columns.length;
  if (values == null || fields.size() <= granularityIndex
      || values.size() <= granularityIndex) {
    throw new SerDeException("Expected a row with " + (granularityIndex + 1)
        + " fields (" + granularityIndex + " columns plus the timestamp granularity), but got "
        + (values == null ? "a null row" : values.size() + " values and "
        + fields.size() + " fields"));
  }

  // We serialize the row
  final Map<String, Object> value = new HashMap<>();
  for (int i = 0; i < columns.length; i++) {
    final Object fieldData = values.get(i);
    if (fieldData == null) {
      // null, we just add it
      value.put(columns[i], null);
      continue;
    }
    final ObjectInspector fieldInspector = fields.get(i).getFieldObjectInspector();
    final Object res;
    switch (types[i].getPrimitiveCategory()) {
      case TIMESTAMP:
        res = ((TimestampObjectInspector) fieldInspector)
            .getPrimitiveJavaObject(fieldData).getTime();
        break;
      case BYTE:
        res = ((ByteObjectInspector) fieldInspector).get(fieldData);
        break;
      case SHORT:
        res = ((ShortObjectInspector) fieldInspector).get(fieldData);
        break;
      case INT:
        res = ((IntObjectInspector) fieldInspector).get(fieldData);
        break;
      case LONG:
        res = ((LongObjectInspector) fieldInspector).get(fieldData);
        break;
      case FLOAT:
        res = ((FloatObjectInspector) fieldInspector).get(fieldData);
        break;
      case DOUBLE:
        res = ((DoubleObjectInspector) fieldInspector).get(fieldData);
        break;
      case DECIMAL:
        res = ((HiveDecimalObjectInspector) fieldInspector)
            .getPrimitiveJavaObject(fieldData).doubleValue();
        break;
      case STRING:
        res = ((StringObjectInspector) fieldInspector).getPrimitiveJavaObject(fieldData);
        break;
      default:
        throw new SerDeException("Unknown type: " + types[i].getPrimitiveCategory());
    }
    value.put(columns[i], res);
  }

  // Trailing granularity timestamp
  final Object granularityData = values.get(granularityIndex);
  if (granularityData == null) {
    throw new SerDeException("Value of " + Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME
        + " must not be null");
  }
  value.put(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME,
      ((TimestampObjectInspector) fields.get(granularityIndex).getFieldObjectInspector())
          .getPrimitiveJavaObject(granularityData).getTime());
  return new DruidWritable(value);
}
/**
 * Statistics are not supported by this SerDe.
 *
 * @return always {@code null}
 */
@Override
public SerDeStats getSerDeStats() {
  // no support for statistics
  final SerDeStats unsupported = null;
  return unsupported;
}
/**
 * Converts a {@code DruidWritable} into a row: a list with one Hadoop/Hive writable per
 * column, in the order of the columns resolved by {@code initialize}. Values missing
 * from the writable (or null) become null entries.
 *
 * @param writable the record to deserialize; must be a {@code DruidWritable}
 * @return the list of column values
 * @throws SerDeException if a column has an unsupported type
 */
@Override
public Object deserialize(Writable writable) throws SerDeException {
  final DruidWritable input = (DruidWritable) writable;
  final List<Object> output = Lists.newArrayListWithExpectedSize(columns.length);
  for (int i = 0; i < columns.length; i++) {
    final Object value = input.getValue().get(columns[i]);
    if (value == null) {
      output.add(null);
      continue;
    }
    switch (types[i].getPrimitiveCategory()) {
      case TIMESTAMP:
        // Go through Number like the other numeric branches, so an epoch value that is
        // not boxed as Long (e.g. an Integer) does not fail with a ClassCastException
        output.add(new TimestampWritable(new Timestamp(((Number) value).longValue())));
        break;
      case BYTE:
        output.add(new ByteWritable(((Number) value).byteValue()));
        break;
      case SHORT:
        output.add(new ShortWritable(((Number) value).shortValue()));
        break;
      case INT:
        output.add(new IntWritable(((Number) value).intValue()));
        break;
      case LONG:
        output.add(new LongWritable(((Number) value).longValue()));
        break;
      case FLOAT:
        output.add(new FloatWritable(((Number) value).floatValue()));
        break;
      case DOUBLE:
        output.add(new DoubleWritable(((Number) value).doubleValue()));
        break;
      case DECIMAL:
        output.add(new HiveDecimalWritable(HiveDecimal.create(((Number) value).doubleValue())));
        break;
      case STRING:
        output.add(new Text(value.toString()));
        break;
      default:
        throw new SerDeException("Unknown type: " + types[i].getPrimitiveCategory());
    }
  }
  return output;
}
/**
 * Returns the struct object inspector assigned by {@code initialize}.
 *
 * @return the object inspector for the rows handled by this SerDe
 * @throws SerDeException declared by the overridden method; not thrown here
 */
@Override
public ObjectInspector getObjectInspector() throws SerDeException {
  final ObjectInspector rowInspector = this.inspector;
  return rowInspector;
}
}