/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.parquet;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ProjectionPusher {

  private static final Logger LOG = LoggerFactory.getLogger(ProjectionPusher.class);

  private final Map<Path, PartitionDesc> pathToPartitionInfo = new LinkedHashMap<>();

  /**
   * MapWork is the Hive object which describes input files,
   * column projections, and filters.
   */
  private MapWork mapWork;

  /**
   * Sets the mapWork variable based on the current JobConf in order to get all partitions.
   *
   * @param job the current JobConf, which carries the serialized query plan
   */
  private void updateMrWork(final JobConf job) {
    final String plan = HiveConf.getVar(job, HiveConf.ConfVars.PLAN);
    if (mapWork == null && plan != null && plan.length() > 0) {
      mapWork = Utilities.getMapWork(job);
      pathToPartitionInfo.clear();
      for (final Map.Entry<Path, PartitionDesc> entry :
          mapWork.getPathToPartitionInfo().entrySet()) {
        // The key contains the scheme (such as pfile://), and we want only the
        // path portion; fix in HIVE-6366.
        pathToPartitionInfo.put(Path.getPathWithoutSchemeAndAuthority(entry.getKey()),
            entry.getValue());
      }
    }
  }
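
  /**
   * Collects the aliases of every TableScanOperator whose input path matches the
   * given split path, unions their needed (possibly nested) column IDs, ORs their
   * filter expressions together, and pushes the resulting projection and filter
   * into the given JobConf.
   */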
  private void pushProjectionsAndFilters(final JobConf jobConf, final String splitPath,
      final String splitPathWithNoSchema) {
    if (mapWork == null || mapWork.getPathToAliases() == null) {
      return;
    }

    // Find every alias whose input path matches this split's path.
    final Set<String> aliases = new HashSet<String>();
    final Iterator<Entry<Path, ArrayList<String>>> iterator =
        mapWork.getPathToAliases().entrySet().iterator();
    while (iterator.hasNext()) {
      final Entry<Path, ArrayList<String>> entry = iterator.next();
      final String key = entry.getKey().toUri().getPath();
      if (splitPath.equals(key) || splitPathWithNoSchema.equals(key)) {
        aliases.addAll(entry.getValue());
      }
    }

    // Collect the needed columns from all the aliases and create an ORed filter
    // expression for the table.
    boolean allColumnsNeeded = false;
    boolean noFilters = false;
    Set<Integer> neededColumnIDs = new HashSet<Integer>();
    // To support nested column pruning, we need to track the path from the top
    // to the nested fields.
    Set<String> neededNestedColumnPaths = new HashSet<String>();
    List<ExprNodeGenericFuncDesc> filterExprs = new ArrayList<ExprNodeGenericFuncDesc>();
    RowSchema rowSchema = null;

    for (String alias : aliases) {
      final Operator<? extends Serializable> op = mapWork.getAliasToWork().get(alias);
      if (op instanceof TableScanOperator) {
        final TableScanOperator ts = (TableScanOperator) op;
        if (ts.getNeededColumnIDs() == null) {
          allColumnsNeeded = true;
        } else {
          neededColumnIDs.addAll(ts.getNeededColumnIDs());
          neededNestedColumnPaths.addAll(ts.getNeededNestedColumnPaths());
        }
        rowSchema = ts.getSchema();
        ExprNodeGenericFuncDesc filterExpr =
            ts.getConf() == null ? null : ts.getConf().getFilterExpr();
        // No filter for the table if any TS has no filter expression.
        noFilters = noFilters || filterExpr == null;
        filterExprs.add(filterExpr);
      }
    }

    ExprNodeGenericFuncDesc tableFilterExpr = null;
    if (!noFilters) {
      try {
        // OR the per-alias filters together: a row must be read if any scan needs it.
        for (ExprNodeGenericFuncDesc filterExpr : filterExprs) {
          if (tableFilterExpr == null) {
            tableFilterExpr = filterExpr;
          } else {
            tableFilterExpr = ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPOr(),
                Arrays.<ExprNodeDesc>asList(tableFilterExpr, filterExpr));
          }
        }
      } catch (UDFArgumentException ex) {
        LOG.debug("Turning off filtering due to " + ex);
        tableFilterExpr = null;
      }
    }

    // Push down projections.
    if (!allColumnsNeeded) {
      if (!neededColumnIDs.isEmpty()) {
        ColumnProjectionUtils.appendReadColumns(jobConf,
            new ArrayList<Integer>(neededColumnIDs));
        ColumnProjectionUtils.appendNestedColumnPaths(jobConf,
            new ArrayList<String>(neededNestedColumnPaths));
      }
    } else {
      ColumnProjectionUtils.setReadAllColumns(jobConf);
    }

    pushFilters(jobConf, rowSchema, tableFilterExpr);
  }

  private void pushFilters(final JobConf jobConf, RowSchema rowSchema,
      ExprNodeGenericFuncDesc filterExpr) {
    // Construct the column name list for reference by filter push down.
    Utilities.setColumnNameList(jobConf, rowSchema);

    // Push down filters.
    if (filterExpr == null) {
      LOG.debug("Not pushing filters because FilterExpr is null");
      return;
    }

    final String filterText = filterExpr.getExprString();
    final String filterExprSerialized = SerializationUtilities.serializeExpression(filterExpr);
    jobConf.set(TableScanDesc.FILTER_TEXT_CONF_STR, filterText);
    jobConf.set(TableScanDesc.FILTER_EXPR_CONF_STR, filterExprSerialized);
  }

  public JobConf pushProjectionsAndFilters(JobConf jobConf, Path path)
      throws IOException {
    updateMrWork(jobConf);  // TODO: refactor this in HIVE-6366

    final JobConf cloneJobConf = new JobConf(jobConf);
    final PartitionDesc part = pathToPartitionInfo.get(path);

    try {
      if ((part != null) && (part.getTableDesc() != null)) {
        Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), cloneJobConf);
      }
    } catch (Exception e) {
      throw new IOException(e);
    }

    pushProjectionsAndFilters(cloneJobConf, path.toString(), path.toUri().getPath());
    return cloneJobConf;
  }
}
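
// Usage sketch (illustrative only, not part of this class): callers such as
// Hive's Parquet record reader wrapper typically construct a ProjectionPusher
// per reader and invoke the public pushProjectionsAndFilters with the split
// file's parent directory, which is the partition path keyed in
// pathToPartitionInfo. Variable names below are hypothetical.
//
//   ProjectionPusher pusher = new ProjectionPusher();
//   JobConf pushedConf = pusher.pushProjectionsAndFilters(jobConf, filePath.getParent());
//   // pushedConf now carries the projected column IDs plus the filter text and
//   // serialized filter expression (TableScanDesc.FILTER_EXPR_CONF_STR) for the
//   // split's partition, ready for a downstream Parquet reader.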