/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hadoop.hive.metastore.hbase.stats;
import java.util.List;
import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.StringColumnStatsData;
public class StringColumnStatsAggregator extends ColumnStatsAggregator {
@Override
public ColumnStatisticsObj aggregate(String colName, List<String> partNames,
List<ColumnStatistics> css) throws MetaException {
ColumnStatisticsObj statsObj = null;
// check if all the ColumnStatisticsObjs contain stats and all the ndv are
// bitvectors. Only when both of the conditions are true, we merge bit
// vectors. Otherwise, just use the maximum function.
boolean doAllPartitionContainStats = partNames.size() == css.size();
boolean isNDVBitVectorSet = true;
String colType = null;
for (ColumnStatistics cs : css) {
if (cs.getStatsObjSize() != 1) {
throw new MetaException(
"The number of columns should be exactly one in aggrStats, but found "
+ cs.getStatsObjSize());
}
ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
if (statsObj == null) {
colType = cso.getColType();
statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso
.getStatsData().getSetField());
}
if (numBitVectors <= 0 || !cso.getStatsData().getStringStats().isSetBitVectors()
|| cso.getStatsData().getStringStats().getBitVectors().length() == 0) {
isNDVBitVectorSet = false;
break;
}
}
ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
if (doAllPartitionContainStats && isNDVBitVectorSet) {
StringColumnStatsData aggregateData = null;
NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
for (ColumnStatistics cs : css) {
ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
StringColumnStatsData newData = cso.getStatsData().getStringStats();
ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(),
ndvEstimator.getnumBitVectors()));
if (aggregateData == null) {
aggregateData = newData.deepCopy();
} else {
aggregateData
.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
aggregateData
.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
}
}
aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
columnStatisticsData.setStringStats(aggregateData);
} else {
StringColumnStatsData aggregateData = null;
for (ColumnStatistics cs : css) {
ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
StringColumnStatsData newData = cso.getStatsData().getStringStats();
if (aggregateData == null) {
aggregateData = newData.deepCopy();
} else {
aggregateData
.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
aggregateData
.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
}
}
columnStatisticsData.setStringStats(aggregateData);
}
statsObj.setStatsData(columnStatisticsData);
return statsObj;
}
}