/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.index.bitmap;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.Index;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.Driver;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.index.HiveIndexQueryContext;
import org.apache.hadoop.hive.ql.index.HiveIndexedInputFormat;
import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer;
import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
import org.apache.hadoop.hive.ql.index.TableBasedIndexHandler;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.optimizer.IndexUtils;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan;
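
/*
 * Rough layout of a bitmap index table (illustrative summary, derived from
 * analyzeIndexDefinition and getIndexBuilderMapRedTask below):
 *
 *   <indexed key columns>        -- taken from the index storage descriptor
 *   _bucketname  string          -- file in the base table that contains the rows
 *   _offset      bigint          -- block offset of those rows within that file
 *   _bitmaps     array<bigint>   -- EWAH-compressed bitmap of matching row offsets in the block
 */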

/**
 * Index handler for the bitmap index. Bitmap index uses an EWAH-compressed
 * bitmap to represent the values in a table.
 */
public class BitmapIndexHandler extends TableBasedIndexHandler {

  private Configuration configuration;
  private static final Logger LOG = LoggerFactory.getLogger(BitmapIndexHandler.class.getName());
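
  // generateIndexQuery (below) rewrites the pushed predicate into a re-entrant QL query over the
  // bitmap index tables. Illustrative shape only; the text between FROM and WHERE is produced by
  // BitmapInnerQuery/BitmapOuterQuery and is shown here schematically:
  //
  //   INSERT OVERWRITE DIRECTORY "<MR scratch dir>"
  //   SELECT bucketname AS `_bucketname` , COLLECT_SET(offset) AS `_offsets` FROM
  //     (SELECT `_bucketname` AS bucketname , `_offset` AS offset FROM
  //        <nested bitmap sub-queries over the index tables, aliased ind0, ind1, ... / oind1, ...>
  //      WHERE NOT EWAH_BITMAP_EMPTY(<outermost alias>.`_bitmaps`) ) tmp_index
  //   GROUP BY bucketname
  //
  // The intermediate file produced by this query lists, per base-table file, the block offsets
  // that can contain matching rows, so that HiveIndexedInputFormat can restrict the original
  // query to those blocks.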

  @Override
  public void generateIndexQuery(List<Index> indexes, ExprNodeDesc predicate,
      ParseContext pctx, HiveIndexQueryContext queryContext) {

    Map<Index, ExprNodeDesc> indexPredicates = decomposePredicate(
        predicate, indexes, queryContext);

    if (indexPredicates == null) {
      LOG.info("No decomposed predicate found");
      queryContext.setQueryTasks(null);
      return; // abort if we couldn't pull out anything from the predicate
    }

    List<BitmapInnerQuery> iqs = new ArrayList<BitmapInnerQuery>(indexes.size());
    int i = 0;
    for (Index index : indexes) {
      ExprNodeDesc indexPredicate = indexPredicates.get(index);
      if (indexPredicate != null) {
        iqs.add(new BitmapInnerQuery(
            index.getIndexTableName(),
            indexPredicate,
            "ind" + i++));
      }
    }

    // set up TableScanOperator to change input format for original query
    queryContext.setIndexInputFormat(HiveIndexedInputFormat.class.getName());

    // Build reentrant QL for index query
    StringBuilder qlCommand = new StringBuilder("INSERT OVERWRITE DIRECTORY ");

    String tmpFile = pctx.getContext().getMRTmpPath().toUri().toString();
    qlCommand.append("\"" + tmpFile + "\" "); // QL includes " around file name
    qlCommand.append("SELECT bucketname AS `_bucketname` , COLLECT_SET(offset) AS `_offsets` FROM ");
    qlCommand.append("(SELECT `_bucketname` AS bucketname , `_offset` AS offset FROM ");

    BitmapQuery head = iqs.get(0);
    for (i = 1; i < iqs.size(); i++) {
      head = new BitmapOuterQuery("oind" + i, head, iqs.get(i));
    }
    qlCommand.append(head.toString());
    qlCommand.append(" WHERE NOT EWAH_BITMAP_EMPTY(" + head.getAlias()
        + ".`_bitmaps`) ) tmp_index GROUP BY bucketname");

    // generate tasks from index query string
    LOG.info("Generating tasks for re-entrant QL query: " + qlCommand.toString());
    HiveConf queryConf = new HiveConf(pctx.getConf(), BitmapIndexHandler.class);
    HiveConf.setBoolVar(queryConf, HiveConf.ConfVars.COMPRESSRESULT, false);
    Driver driver = new Driver(queryConf);
    driver.compile(qlCommand.toString(), false);

    queryContext.setIndexIntermediateFile(tmpFile);
    queryContext.addAdditionalSemanticInputs(driver.getPlan().getInputs());
    queryContext.setQueryTasks(driver.getPlan().getRootTasks());
  }

  /**
   * Split the predicate into the piece we can deal with (pushed) and the one we can't (residual).
   * @param predicate the full query predicate
   * @param indexes the bitmap indexes that are candidates for answering the query
   * @param queryContext context that receives the overall residual predicate
   * @return a map from each index to the predicate fragment it can evaluate, or null if
   *         nothing could be pushed
   */
  private Map<Index, ExprNodeDesc> decomposePredicate(ExprNodeDesc predicate, List<Index> indexes,
      HiveIndexQueryContext queryContext) {

    Map<Index, ExprNodeDesc> indexPredicates = new HashMap<Index, ExprNodeDesc>();

    // compute overall residual
    IndexPredicateAnalyzer analyzer =
        getIndexPredicateAnalyzer(indexes, queryContext.getQueryPartitions());
    List<IndexSearchCondition> searchConditions = new ArrayList<IndexSearchCondition>();
    ExprNodeDesc residualPredicate = analyzer.analyzePredicate(predicate, searchConditions);
    // pass residual predicate back out for further processing
    queryContext.setResidualPredicate(residualPredicate);

    if (searchConditions.size() == 0) {
      return null;
    }

    for (Index index : indexes) {
      ArrayList<Index> in = new ArrayList<Index>(1);
      in.add(index);
      analyzer = getIndexPredicateAnalyzer(in, queryContext.getQueryPartitions());
      searchConditions = new ArrayList<IndexSearchCondition>();
      // split predicate into pushed (what we can handle) and residual (what we can't handle);
      // the pushed predicate from translateSearchConditions is stored for the current index.
      // This ensures that we apply all possible predicates to each index.
      analyzer.analyzePredicate(predicate, searchConditions);
      if (searchConditions.size() == 0) {
        indexPredicates.put(index, null);
      } else {
        indexPredicates.put(index, analyzer.translateSearchConditions(searchConditions));
      }
    }

    return indexPredicates;
  }

  /**
   * Instantiate a new predicate analyzer suitable for determining
   * whether we can use an index, based on the rules for indexes in
   * WHERE clauses that we support.
   *
   * @return preconfigured predicate analyzer for WHERE queries
   */
  private IndexPredicateAnalyzer getIndexPredicateAnalyzer(List<Index> indexes,
      Set<Partition> queryPartitions) {
    IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();

    analyzer.addComparisonOp(GenericUDFOPEqual.class.getName());
    analyzer.addComparisonOp(GenericUDFOPLessThan.class.getName());
    analyzer.addComparisonOp(GenericUDFOPEqualOrLessThan.class.getName());
    analyzer.addComparisonOp(GenericUDFOPGreaterThan.class.getName());
    analyzer.addComparisonOp(GenericUDFOPEqualOrGreaterThan.class.getName());

    // only return results for columns in the list of indexes
    for (Index index : indexes) {
      List<FieldSchema> columnSchemas = index.getSd().getCols();
      for (FieldSchema column : columnSchemas) {
        analyzer.allowColumnName(column.getName());
      }
    }

    // partitioned columns are treated as if they have indexes so that the partitions
    // are used during the index query generation
    for (Partition part : queryPartitions) {
      if (part.getSpec().isEmpty()) {
        continue; // empty partitions are from whole tables, so we don't want to add them in
      }
      for (String column : part.getSpec().keySet()) {
        analyzer.allowColumnName(column);
      }
    }

    return analyzer;
  }

  @Override
  public void analyzeIndexDefinition(Table baseTable, Index index,
      Table indexTable) throws HiveException {
    StorageDescriptor storageDesc = index.getSd();
    if (this.usesIndexTable() && indexTable != null) {
      StorageDescriptor indexTableSd = storageDesc.deepCopy();
      List<FieldSchema> indexTblCols = indexTableSd.getCols();
      FieldSchema bucketFileName = new FieldSchema("_bucketname", "string", "");
      indexTblCols.add(bucketFileName);
      FieldSchema offSets = new FieldSchema("_offset", "bigint", "");
      indexTblCols.add(offSets);
      FieldSchema bitmaps = new FieldSchema("_bitmaps", "array<bigint>", "");
      indexTblCols.add(bitmaps);
      indexTable.setSd(indexTableSd);
    }
  }
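
  // getIndexBuilderMapRedTask (below) builds the index-population query. Illustrative shape only;
  // identifiers are escaped with HiveUtils.unparseIdentifier, and <FILENAME>, <BLOCKOFFSET> and
  // <ROWOFFSET> are placeholders for the corresponding VirtualColumn names:
  //
  //   INSERT OVERWRITE TABLE <db>.<index table> [PARTITION ( <partition spec> )]
  //   SELECT <indexed columns>, <FILENAME>, <BLOCKOFFSET>, EWAH_BITMAP(<ROWOFFSET>)
  //   FROM <db>.<base table>
  //   [WHERE <base table partition key/value pairs>]
  //   GROUP BY <FILENAME>, <BLOCKOFFSET>, <indexed columns>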

  @Override
  protected Task<?> getIndexBuilderMapRedTask(Set<ReadEntity> inputs, Set<WriteEntity> outputs,
      List<FieldSchema> indexField, boolean partitioned,
      PartitionDesc indexTblPartDesc, String indexTableName,
      PartitionDesc baseTablePartDesc, String baseTableName, String dbName) throws HiveException {

    HiveConf builderConf = new HiveConf(getConf(), BitmapIndexHandler.class);
    HiveConf.setBoolVar(builderConf, HiveConf.ConfVars.HIVEROWOFFSET, true);

    String indexCols = HiveUtils.getUnparsedColumnNamesFromFieldSchema(indexField);

    // form a new insert overwrite query.
    StringBuilder command = new StringBuilder();
    LinkedHashMap<String, String> partSpec = indexTblPartDesc.getPartSpec();

    command.append("INSERT OVERWRITE TABLE "
        + HiveUtils.unparseIdentifier(dbName) + "." + HiveUtils.unparseIdentifier(indexTableName));
    if (partitioned && indexTblPartDesc != null) {
      command.append(" PARTITION ( ");
      List<String> ret = getPartKVPairStringArray(partSpec);
      for (int i = 0; i < ret.size(); i++) {
        String partKV = ret.get(i);
        command.append(partKV);
        if (i < ret.size() - 1) {
          command.append(",");
        }
      }
      command.append(" ) ");
    }

    command.append(" SELECT ");
    command.append(indexCols);
    command.append(",");
    command.append(VirtualColumn.FILENAME.getName());
    command.append(",");
    command.append(VirtualColumn.BLOCKOFFSET.getName());
    command.append(",");
    command.append("EWAH_BITMAP(");
    command.append(VirtualColumn.ROWOFFSET.getName());
    command.append(")");
    command.append(" FROM "
        + HiveUtils.unparseIdentifier(dbName) + "." + HiveUtils.unparseIdentifier(baseTableName));
    LinkedHashMap<String, String> basePartSpec = baseTablePartDesc.getPartSpec();
    if (basePartSpec != null) {
      command.append(" WHERE ");
      List<String> pkv = getPartKVPairStringArray(basePartSpec);
      for (int i = 0; i < pkv.size(); i++) {
        String partKV = pkv.get(i);
        command.append(partKV);
        if (i < pkv.size() - 1) {
          command.append(" AND ");
        }
      }
    }
    command.append(" GROUP BY ");
    command.append(VirtualColumn.FILENAME.getName());
    command.append(",");
    command.append(VirtualColumn.BLOCKOFFSET.getName());
    for (FieldSchema fieldSchema : indexField) {
      command.append(",");
      command.append(HiveUtils.unparseIdentifier(fieldSchema.getName()));
    }

    // Require CLUSTER BY ROWOFFSET if map-side aggregation is off.
    // TODO: Make this work without map side aggregation
    if (!"true".equalsIgnoreCase(builderConf.get("hive.map.aggr"))) {
      throw new HiveException("Cannot construct index without map-side aggregation");
    }

    Task<?> rootTask = IndexUtils.createRootTask(builderConf, inputs, outputs, command,
        partSpec, indexTableName, dbName);
    return rootTask;
  }

  /**
   * No lower bound on bitmap index query size, so this will always return true.
   */
  @Override
  public boolean checkQuerySize(long querySize, HiveConf hiveConf) {
    return true;
  }

  @Override
  public boolean usesIndexTable() {
    return true;
  }
}