/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import java.io.IOException;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData;
import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.Date;
import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
import org.apache.hadoop.hive.metastore.api.Decimal;
import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData;
import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.SetPartitionsStatsRequest;
import org.apache.hadoop.hive.metastore.api.StringColumnStatsData;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.DriverContext;
import org.apache.hadoop.hive.ql.QueryPlan;
import org.apache.hadoop.hive.ql.QueryState;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.ExplainConfiguration.AnalyzeState;
import org.apache.hadoop.hive.ql.plan.ColumnStatsWork;
import org.apache.hadoop.hive.ql.plan.api.StageType;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.StringUtils;
/**
* ColumnStatsTask implementation.
**/
public class ColumnStatsTask extends Task<ColumnStatsWork> implements Serializable {
private static final long serialVersionUID = 1L;
private FetchOperator ftOp;
private static transient final Logger LOG = LoggerFactory.getLogger(ColumnStatsTask.class);
public ColumnStatsTask() {
super();
}
@Override
public void initialize(QueryState queryState, QueryPlan queryPlan, DriverContext ctx,
CompilationOpContext opContext) {
super.initialize(queryState, queryPlan, ctx, opContext);
work.initializeForFetch(opContext);
try {
JobConf job = new JobConf(conf);
ftOp = new FetchOperator(work.getfWork(), job);
} catch (Exception e) {
LOG.error(StringUtils.stringifyException(e));
throw new RuntimeException(e);
}
}
private void unpackBooleanStats(ObjectInspector oi, Object o, String fName,
ColumnStatisticsObj statsObj) {
long v = ((LongObjectInspector) oi).get(o);
if (fName.equals("counttrues")) {
statsObj.getStatsData().getBooleanStats().setNumTrues(v);
} else if (fName.equals("countfalses")) {
statsObj.getStatsData().getBooleanStats().setNumFalses(v);
} else if (fName.equals("countnulls")) {
statsObj.getStatsData().getBooleanStats().setNumNulls(v);
}
}
@SuppressWarnings("serial")
class UnsupportedDoubleException extends Exception {
}
private void unpackDoubleStats(ObjectInspector oi, Object o, String fName,
ColumnStatisticsObj statsObj) throws UnsupportedDoubleException {
if (fName.equals("countnulls")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getDoubleStats().setNumNulls(v);
} else if (fName.equals("numdistinctvalues")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getDoubleStats().setNumDVs(v);
} else if (fName.equals("max")) {
double d = ((DoubleObjectInspector) oi).get(o);
if (Double.isInfinite(d) || Double.isNaN(d)) {
throw new UnsupportedDoubleException();
}
statsObj.getStatsData().getDoubleStats().setHighValue(d);
} else if (fName.equals("min")) {
double d = ((DoubleObjectInspector) oi).get(o);
if (Double.isInfinite(d) || Double.isNaN(d)) {
throw new UnsupportedDoubleException();
}
statsObj.getStatsData().getDoubleStats().setLowValue(d);
} else if (fName.equals("ndvbitvector")) {
PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
String v = ((StringObjectInspector) poi).getPrimitiveJavaObject(o);
statsObj.getStatsData().getDoubleStats().setBitVectors(v);;
}
}
private void unpackDecimalStats(ObjectInspector oi, Object o, String fName,
ColumnStatisticsObj statsObj) {
if (fName.equals("countnulls")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getDecimalStats().setNumNulls(v);
} else if (fName.equals("numdistinctvalues")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getDecimalStats().setNumDVs(v);
} else if (fName.equals("max")) {
HiveDecimal d = ((HiveDecimalObjectInspector) oi).getPrimitiveJavaObject(o);
statsObj.getStatsData().getDecimalStats().setHighValue(convertToThriftDecimal(d));
} else if (fName.equals("min")) {
HiveDecimal d = ((HiveDecimalObjectInspector) oi).getPrimitiveJavaObject(o);
statsObj.getStatsData().getDecimalStats().setLowValue(convertToThriftDecimal(d));
} else if (fName.equals("ndvbitvector")) {
PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
String v = ((StringObjectInspector) poi).getPrimitiveJavaObject(o);
statsObj.getStatsData().getDecimalStats().setBitVectors(v);;
}
}
private Decimal convertToThriftDecimal(HiveDecimal d) {
return new Decimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), (short)d.scale());
}
private void unpackLongStats(ObjectInspector oi, Object o, String fName,
ColumnStatisticsObj statsObj) {
if (fName.equals("countnulls")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getLongStats().setNumNulls(v);
} else if (fName.equals("numdistinctvalues")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getLongStats().setNumDVs(v);
} else if (fName.equals("max")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getLongStats().setHighValue(v);
} else if (fName.equals("min")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getLongStats().setLowValue(v);
} else if (fName.equals("ndvbitvector")) {
PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
String v = ((StringObjectInspector) poi).getPrimitiveJavaObject(o);
statsObj.getStatsData().getLongStats().setBitVectors(v);;
}
}
private void unpackStringStats(ObjectInspector oi, Object o, String fName,
ColumnStatisticsObj statsObj) {
if (fName.equals("countnulls")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getStringStats().setNumNulls(v);
} else if (fName.equals("numdistinctvalues")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getStringStats().setNumDVs(v);
} else if (fName.equals("avglength")) {
double d = ((DoubleObjectInspector) oi).get(o);
statsObj.getStatsData().getStringStats().setAvgColLen(d);
} else if (fName.equals("maxlength")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getStringStats().setMaxColLen(v);
} else if (fName.equals("ndvbitvector")) {
PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
String v = ((StringObjectInspector) poi).getPrimitiveJavaObject(o);
statsObj.getStatsData().getStringStats().setBitVectors(v);;
}
}
private void unpackBinaryStats(ObjectInspector oi, Object o, String fName,
ColumnStatisticsObj statsObj) {
if (fName.equals("countnulls")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getBinaryStats().setNumNulls(v);
} else if (fName.equals("avglength")) {
double d = ((DoubleObjectInspector) oi).get(o);
statsObj.getStatsData().getBinaryStats().setAvgColLen(d);
} else if (fName.equals("maxlength")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getBinaryStats().setMaxColLen(v);
}
}
private void unpackDateStats(ObjectInspector oi, Object o, String fName,
ColumnStatisticsObj statsObj) {
if (fName.equals("countnulls")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getDateStats().setNumNulls(v);
} else if (fName.equals("numdistinctvalues")) {
long v = ((LongObjectInspector) oi).get(o);
statsObj.getStatsData().getDateStats().setNumDVs(v);
} else if (fName.equals("max")) {
DateWritable v = ((DateObjectInspector) oi).getPrimitiveWritableObject(o);
statsObj.getStatsData().getDateStats().setHighValue(new Date(v.getDays()));
} else if (fName.equals("min")) {
DateWritable v = ((DateObjectInspector) oi).getPrimitiveWritableObject(o);
statsObj.getStatsData().getDateStats().setLowValue(new Date(v.getDays()));
} else if (fName.equals("ndvbitvector")) {
PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
String v = ((StringObjectInspector) poi).getPrimitiveJavaObject(o);
statsObj.getStatsData().getDateStats().setBitVectors(v);;
}
}
private void unpackPrimitiveObject (ObjectInspector oi, Object o, String fieldName,
ColumnStatisticsObj statsObj) throws UnsupportedDoubleException {
if (o == null) {
return;
}
// First infer the type of object
if (fieldName.equals("columntype")) {
PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
String s = ((StringObjectInspector) poi).getPrimitiveJavaObject(o);
ColumnStatisticsData statsData = new ColumnStatisticsData();
if (s.equalsIgnoreCase("long")) {
LongColumnStatsData longStats = new LongColumnStatsData();
statsData.setLongStats(longStats);
statsObj.setStatsData(statsData);
} else if (s.equalsIgnoreCase("double")) {
DoubleColumnStatsData doubleStats = new DoubleColumnStatsData();
statsData.setDoubleStats(doubleStats);
statsObj.setStatsData(statsData);
} else if (s.equalsIgnoreCase("string")) {
StringColumnStatsData stringStats = new StringColumnStatsData();
statsData.setStringStats(stringStats);
statsObj.setStatsData(statsData);
} else if (s.equalsIgnoreCase("boolean")) {
BooleanColumnStatsData booleanStats = new BooleanColumnStatsData();
statsData.setBooleanStats(booleanStats);
statsObj.setStatsData(statsData);
} else if (s.equalsIgnoreCase("binary")) {
BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
statsData.setBinaryStats(binaryStats);
statsObj.setStatsData(statsData);
} else if (s.equalsIgnoreCase("decimal")) {
DecimalColumnStatsData decimalStats = new DecimalColumnStatsData();
statsData.setDecimalStats(decimalStats);
statsObj.setStatsData(statsData);
} else if (s.equalsIgnoreCase("date")) {
DateColumnStatsData dateStats = new DateColumnStatsData();
statsData.setDateStats(dateStats);
statsObj.setStatsData(statsData);
}
} else {
// invoke the right unpack method depending on data type of the column
if (statsObj.getStatsData().isSetBooleanStats()) {
unpackBooleanStats(oi, o, fieldName, statsObj);
} else if (statsObj.getStatsData().isSetLongStats()) {
unpackLongStats(oi, o, fieldName, statsObj);
} else if (statsObj.getStatsData().isSetDoubleStats()) {
unpackDoubleStats(oi,o,fieldName, statsObj);
} else if (statsObj.getStatsData().isSetStringStats()) {
unpackStringStats(oi, o, fieldName, statsObj);
} else if (statsObj.getStatsData().isSetBinaryStats()) {
unpackBinaryStats(oi, o, fieldName, statsObj);
} else if (statsObj.getStatsData().isSetDecimalStats()) {
unpackDecimalStats(oi, o, fieldName, statsObj);
} else if (statsObj.getStatsData().isSetDateStats()) {
unpackDateStats(oi, o, fieldName, statsObj);
}
}
}
private void unpackStructObject(ObjectInspector oi, Object o, String fName,
ColumnStatisticsObj cStatsObj) throws UnsupportedDoubleException {
if (oi.getCategory() != ObjectInspector.Category.STRUCT) {
throw new RuntimeException("Invalid object datatype : " + oi.getCategory().toString());
}
StructObjectInspector soi = (StructObjectInspector) oi;
List<? extends StructField> fields = soi.getAllStructFieldRefs();
List<Object> list = soi.getStructFieldsDataAsList(o);
for (int i = 0; i < fields.size(); i++) {
// Get the field objectInspector, fieldName and the field object.
ObjectInspector foi = fields.get(i).getFieldObjectInspector();
Object f = (list == null ? null : list.get(i));
String fieldName = fields.get(i).getFieldName();
if (foi.getCategory() == ObjectInspector.Category.PRIMITIVE) {
unpackPrimitiveObject(foi, f, fieldName, cStatsObj);
} else {
unpackStructObject(foi, f, fieldName, cStatsObj);
}
}
}
private List<ColumnStatistics> constructColumnStatsFromPackedRows(
Hive db) throws HiveException, MetaException, IOException {
String currentDb = SessionState.get().getCurrentDatabase();
String tableName = work.getColStats().getTableName();
String partName = null;
List<String> colName = work.getColStats().getColName();
List<String> colType = work.getColStats().getColType();
boolean isTblLevel = work.getColStats().isTblLevel();
List<ColumnStatistics> stats = new ArrayList<ColumnStatistics>();
InspectableObject packedRow;
Table tbl = db.getTable(currentDb, tableName);
while ((packedRow = ftOp.getNextRow()) != null) {
if (packedRow.oi.getCategory() != ObjectInspector.Category.STRUCT) {
throw new HiveException("Unexpected object type encountered while unpacking row");
}
List<ColumnStatisticsObj> statsObjs = new ArrayList<ColumnStatisticsObj>();
StructObjectInspector soi = (StructObjectInspector) packedRow.oi;
List<? extends StructField> fields = soi.getAllStructFieldRefs();
List<Object> list = soi.getStructFieldsDataAsList(packedRow.o);
List<FieldSchema> partColSchema = tbl.getPartCols();
// Partition columns are appended at end, we only care about stats column
int numOfStatCols = isTblLevel ? fields.size() : fields.size() - partColSchema.size();
for (int i = 0; i < numOfStatCols; i++) {
// Get the field objectInspector, fieldName and the field object.
ObjectInspector foi = fields.get(i).getFieldObjectInspector();
Object f = (list == null ? null : list.get(i));
String fieldName = fields.get(i).getFieldName();
ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
statsObj.setColName(colName.get(i));
statsObj.setColType(colType.get(i));
try {
unpackStructObject(foi, f, fieldName, statsObj);
statsObjs.add(statsObj);
} catch (UnsupportedDoubleException e) {
// due to infinity or nan.
LOG.info("Because " + colName.get(i) + " is infinite or NaN, we skip stats.");
}
}
if (!isTblLevel) {
List<String> partVals = new ArrayList<String>();
// Iterate over partition columns to figure out partition name
for (int i = fields.size() - partColSchema.size(); i < fields.size(); i++) {
Object partVal = ((PrimitiveObjectInspector)fields.get(i).getFieldObjectInspector()).
getPrimitiveJavaObject(list.get(i));
partVals.add(partVal == null ? // could be null for default partition
this.conf.getVar(ConfVars.DEFAULTPARTITIONNAME) : partVal.toString());
}
partName = Warehouse.makePartName(partColSchema, partVals);
}
String [] names = Utilities.getDbTableName(currentDb, tableName);
ColumnStatisticsDesc statsDesc = getColumnStatsDesc(names[0], names[1], partName, isTblLevel);
ColumnStatistics colStats = new ColumnStatistics();
colStats.setStatsDesc(statsDesc);
colStats.setStatsObj(statsObjs);
if (!statsObjs.isEmpty()) {
stats.add(colStats);
}
}
ftOp.clearFetchContext();
return stats;
}
private ColumnStatisticsDesc getColumnStatsDesc(String dbName, String tableName,
String partName, boolean isTblLevel)
{
ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc();
statsDesc.setDbName(dbName);
statsDesc.setTableName(tableName);
statsDesc.setIsTblLevel(isTblLevel);
if (!isTblLevel) {
statsDesc.setPartName(partName);
} else {
statsDesc.setPartName(null);
}
return statsDesc;
}
private int persistColumnStats(Hive db) throws HiveException, MetaException, IOException {
// Construct a column statistics object from the result
List<ColumnStatistics> colStats = constructColumnStatsFromPackedRows(db);
// Persist the column statistics object to the metastore
// Note, this function is shared for both table and partition column stats.
if (colStats.isEmpty()) {
return 0;
}
SetPartitionsStatsRequest request = new SetPartitionsStatsRequest(colStats);
if (work.getColStats() != null && work.getColStats().getNumBitVector() > 0) {
request.setNeedMerge(true);
}
db.setPartitionColumnStatistics(request);
return 0;
}
@Override
public int execute(DriverContext driverContext) {
if (driverContext.getCtx().getExplainAnalyze() == AnalyzeState.RUNNING) {
return 0;
}
try {
Hive db = getHive();
return persistColumnStats(db);
} catch (Exception e) {
LOG.error("Failed to run column stats task", e);
}
return 1;
}
@Override
public StageType getType() {
return StageType.COLUMNSTATS;
}
@Override
public String getName() {
return "COLUMNSTATS TASK";
}
}