/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.StringUtils;

import com.google.common.annotations.VisibleForTesting;

/**
 * Map operator. This triggers overall map side processing. This is a little
 * different from regular operators in that it starts off by processing a
 * Writable data structure from a Table (instead of a Hive Object).
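 *
 * <p>A simplified usage sketch (for illustration only; the real wiring lives in
 * ExecMapper and related drivers, and {@code jobConf}, {@code mapWork} and {@code value}
 * are assumed to be supplied by the framework):
 * <pre>{@code
 *   MapOperator mo = new MapOperator(new CompilationOpContext());
 *   mo.setConf(mapWork);                               // attach the MapWork plan
 *   mo.setChildren(jobConf);                           // build per-path operator contexts
 *   mo.passExecContext(new ExecMapperContext(jobConf));
 *   mo.initializeMapOperator(jobConf);                 // initialize children with their OIs
 *   mo.process(value);                                 // one Writable row from the record reader
 *   mo.close(false);                                   // flush and close the operator tree
 * }</pre>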
 **/
@SuppressWarnings("deprecation")
public class MapOperator extends AbstractMapOperator {

  private static final long serialVersionUID = 1L;

  protected transient long cntr = 1;
  protected transient long logEveryNRows = 0;

  // input path --> {operator --> context}
  private final Map<String, Map<Operator<?>, MapOpCtx>> opCtxMap =
      new HashMap<String, Map<Operator<?>, MapOpCtx>>();
  // child operator --> object inspector (converted OI if it's needed)
  private final Map<Operator<?>, StructObjectInspector> childrenOpToOI =
      new HashMap<Operator<?>, StructObjectInspector>();

  // context for current input file
  protected transient MapOpCtx[] currentCtxs;

  protected static class MapOpCtx {

    final String alias;
    final Operator<?> op;
    final PartitionDesc partDesc;

    StructObjectInspector partObjectInspector;    // partition columns
    StructObjectInspector vcsObjectInspector;     // virtual columns
    StructObjectInspector rowObjectInspector;

    Converter partTblObjectInspectorConverter;

    Object[] rowWithPart;
    Object[] rowWithPartAndVC;
    Deserializer deserializer;

    String tableName;
    String partName;
    List<VirtualColumn> vcs;
    Object[] vcValues;

    public MapOpCtx(String alias, Operator<?> op, PartitionDesc partDesc) {
      this.alias = alias;
      this.op = op;
      this.partDesc = partDesc;
    }

    private boolean isPartitioned() {
      return partObjectInspector != null;
    }

    private boolean hasVC() {
      return vcsObjectInspector != null;
    }

    private Object readRow(Writable value, ExecMapperContext context) throws SerDeException {
      Object deserialized = deserializer.deserialize(value);
      Object row = partTblObjectInspectorConverter.convert(deserialized);
      if (hasVC()) {
        rowWithPartAndVC[0] = row;
        if (context != null) {
          populateVirtualColumnValues(context, vcs, vcValues, deserializer);
        }
        int vcPos = isPartitioned() ? 2 : 1;
        rowWithPartAndVC[vcPos] = vcValues;
        return rowWithPartAndVC;
      } else if (isPartitioned()) {
        rowWithPart[0] = row;
        return rowWithPart;
      }
      return row;
    }

    public boolean forward(Object row) throws HiveException {
      if (op.getDone()) {
        return false;
      }
      op.process(row, 0);
      return true;
    }
  }

  /**
   * Initializes this map op as the root of the tree. It sets the JobConf and
   * MapWork and starts initialization of the operator tree rooted at this
   * op.
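   *
   * <p>It is equivalent to calling, in order:
   * <pre>{@code
   *   setConf(mapWork);
   *   setChildren(hconf);
   *   passExecContext(new ExecMapperContext(hconf));
   *   initializeMapOperator(hconf);
   * }</pre>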
   *
   * @param hconf
   * @param mapWork
   * @throws HiveException
   */
  @VisibleForTesting
  void initializeAsRoot(JobConf hconf, MapWork mapWork) throws Exception {
    setConf(mapWork);
    setChildren(hconf);
    passExecContext(new ExecMapperContext(hconf));
    initializeMapOperator(hconf);
  }

  private MapOpCtx initObjectInspector(Configuration hconf, MapOpCtx opCtx,
      StructObjectInspector tableRowOI) throws Exception {
    PartitionDesc pd = opCtx.partDesc;
    TableDesc td = pd.getTableDesc();

    // Use table properties in case of unpartitioned tables,
    // and the union of table properties and partition properties, with partition
    // taking precedence, in the case of partitioned tables
    Properties overlayedProps =
        SerDeUtils.createOverlayedProperties(td.getProperties(), pd.getProperties());

    Map<String, String> partSpec = pd.getPartSpec();

    opCtx.tableName = String.valueOf(overlayedProps.getProperty("name"));
    opCtx.partName = String.valueOf(partSpec);
    opCtx.deserializer = pd.getDeserializer(hconf);

    StructObjectInspector partRawRowObjectInspector;
    boolean isAcid = AcidUtils.isTablePropertyTransactional(td.getProperties());
    if (Utilities.isSchemaEvolutionEnabled(hconf, isAcid) && Utilities.isInputFileFormatSelfDescribing(pd)) {
      partRawRowObjectInspector = tableRowOI;
    } else {
      partRawRowObjectInspector =
          (StructObjectInspector) opCtx.deserializer.getObjectInspector();
    }

    opCtx.partTblObjectInspectorConverter =
        ObjectInspectorConverters.getConverter(partRawRowObjectInspector, tableRowOI);

    // Next check if this table has partitions and if so
    // get the list of partition names as well as allocate
    // the serdes for the partition columns
    String pcols = overlayedProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);

    if (pcols != null && pcols.length() > 0) {
      String[] partKeys = pcols.trim().split("/");
      String pcolTypes = overlayedProps
          .getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES);
      String[] partKeyTypes = pcolTypes.trim().split(":");

      if (partKeys.length > partKeyTypes.length) {
        throw new HiveException("Internal error : partKeys length, " + partKeys.length +
            " greater than partKeyTypes length, " + partKeyTypes.length);
      }

      List<String> partNames = new ArrayList<String>(partKeys.length);
      Object[] partValues = new Object[partKeys.length];
      List<ObjectInspector> partObjectInspectors = new ArrayList<ObjectInspector>(partKeys.length);

      for (int i = 0; i < partKeys.length; i++) {
        String key = partKeys[i];
        partNames.add(key);
        ObjectInspector oi = PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector
            (TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i]));

        // Partitions do not exist for this table
        if (partSpec == null) {
          // for partitionless table, initialize partValue to null
          partValues[i] = null;
        } else {
          partValues[i] = ObjectInspectorConverters.getConverter(PrimitiveObjectInspectorFactory.
              javaStringObjectInspector, oi).convert(partSpec.get(key));
        }
        partObjectInspectors.add(oi);
      }
      opCtx.rowWithPart = new Object[] {null, partValues};
      opCtx.partObjectInspector = ObjectInspectorFactory
          .getStandardStructObjectInspector(partNames, partObjectInspectors);
    }

    // The op may not be a TableScan for mapjoins
    // Consider the query: select /*+MAPJOIN(a)*/ count(*) FROM T1 a JOIN T2 b ON a.key = b.key;
    // In that case, it will be a Select, but the rowOI need not be amended
    if (opCtx.op instanceof TableScanOperator) {
      TableScanOperator tsOp = (TableScanOperator) opCtx.op;
      TableScanDesc tsDesc = tsOp.getConf();
      if (tsDesc != null && tsDesc.hasVirtualCols()) {
        opCtx.vcs = tsDesc.getVirtualCols();
        opCtx.vcValues = new Object[opCtx.vcs.size()];
        opCtx.vcsObjectInspector = VirtualColumn.getVCSObjectInspector(opCtx.vcs);
        if (opCtx.isPartitioned()) {
          opCtx.rowWithPartAndVC = Arrays.copyOfRange(opCtx.rowWithPart, 0, 3);
        } else {
          opCtx.rowWithPartAndVC = new Object[2];
        }
      }
    }
    if (!opCtx.hasVC() && !opCtx.isPartitioned()) {
      opCtx.rowObjectInspector = tableRowOI;
      return opCtx;
    }
    List<StructObjectInspector> inspectors = new ArrayList<StructObjectInspector>();
    inspectors.add(tableRowOI);
    if (opCtx.isPartitioned()) {
      inspectors.add(opCtx.partObjectInspector);
    }
    if (opCtx.hasVC()) {
      inspectors.add(opCtx.vcsObjectInspector);
    }
    opCtx.rowObjectInspector = ObjectInspectorFactory.getUnionStructObjectInspector(inspectors);
    return opCtx;
  }

  // Return the mapping for table descriptor to the expected table OI
  /**
   * Traverse all the partitions for a table, and get the OI for the table.
   * Note that a conversion is required if any of the partition OIs is different
   * from the table OI. For example, if the query references table T (partitions P1, P2),
   * and P1's schema is the same as T, whereas P2's schema is different from T, conversion
   * might be needed for both P1 and P2, since a SettableOI might be needed for T.
   */
  private Map<TableDesc, StructObjectInspector> getConvertedOI(Map<String, Configuration> tableToConf)
      throws HiveException {
    Map<TableDesc, StructObjectInspector> tableDescOI =
        new HashMap<TableDesc, StructObjectInspector>();
    Set<TableDesc> identityConverterTableDesc = new HashSet<TableDesc>();
    try {
      Map<ObjectInspector, Boolean> oiSettableProperties = new HashMap<ObjectInspector, Boolean>();

      for (Path onefile : conf.getPathToAliases().keySet()) {
        PartitionDesc pd = conf.getPathToPartitionInfo().get(onefile);
        TableDesc tableDesc = pd.getTableDesc();
        Configuration hconf = tableToConf.get(tableDesc.getTableName());
        Deserializer partDeserializer = pd.getDeserializer(hconf);

        StructObjectInspector partRawRowObjectInspector;
        boolean isAcid = AcidUtils.isTablePropertyTransactional(tableDesc.getProperties());
        if (Utilities.isSchemaEvolutionEnabled(hconf, isAcid) && Utilities.isInputFileFormatSelfDescribing(pd)) {
          Deserializer tblDeserializer = tableDesc.getDeserializer(hconf);
          partRawRowObjectInspector = (StructObjectInspector) tblDeserializer.getObjectInspector();
        } else {
          partRawRowObjectInspector =
              (StructObjectInspector) partDeserializer.getObjectInspector();
        }

        StructObjectInspector tblRawRowObjectInspector = tableDescOI.get(tableDesc);
        if ((tblRawRowObjectInspector == null) ||
            (identityConverterTableDesc.contains(tableDesc))) {
          Deserializer tblDeserializer = tableDesc.getDeserializer(hconf);
          tblRawRowObjectInspector =
              (StructObjectInspector) ObjectInspectorConverters.getConvertedOI(
                  partRawRowObjectInspector,
                  tblDeserializer.getObjectInspector(),
                  oiSettableProperties);

          if (identityConverterTableDesc.contains(tableDesc)) {
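            // This table has been identity-converted so far; if this partition's raw OI no
            // longer matches the table-level OI recomputed above, stop treating the
            // conversion as an identity one.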
            if (!partRawRowObjectInspector.equals(tblRawRowObjectInspector)) {
              identityConverterTableDesc.remove(tableDesc);
            }
          } else if (partRawRowObjectInspector.equals(tblRawRowObjectInspector)) {
            identityConverterTableDesc.add(tableDesc);
          }

          tableDescOI.put(tableDesc, tblRawRowObjectInspector);
        }
      }
    } catch (Exception e) {
      throw new HiveException(e);
    }
    return tableDescOI;
  }

  /**
   * For each source table, combine the nested column pruning information from all its
   * table scan descriptors and set it in a configuration copy. This is necessary since
   * the configuration property "READ_NESTED_COLUMN_PATH_CONF_STR" is set on a per-table
   * basis, so we can't just use a single configuration for all the tables.
   */
  private Map<String, Configuration> cloneConfsForNestedColPruning(Configuration hconf) {
    Map<String, Configuration> tableNameToConf = new HashMap<>();

    for (Map.Entry<Path, ArrayList<String>> e : conf.getPathToAliases().entrySet()) {
      List<String> aliases = e.getValue();
      if (aliases == null || aliases.isEmpty()) {
        continue;
      }

      String tableName = conf.getPathToPartitionInfo().get(e.getKey()).getTableName();
      for (String alias : aliases) {
        Operator<?> rootOp = conf.getAliasToWork().get(alias);
        if (!(rootOp instanceof TableScanOperator)) {
          continue;
        }
        TableScanDesc tableScanDesc = ((TableScanOperator) rootOp).getConf();
        List<String> nestedColumnPaths = tableScanDesc.getNeededNestedColumnPaths();
        if (nestedColumnPaths == null || nestedColumnPaths.isEmpty()) {
          continue;
        }
        if (!tableNameToConf.containsKey(tableName)) {
          Configuration clonedConf = new Configuration(hconf);
          clonedConf.unset(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR);
          tableNameToConf.put(tableName, clonedConf);
        }
        Configuration newConf = tableNameToConf.get(tableName);
        ColumnProjectionUtils.appendNestedColumnPaths(newConf, nestedColumnPaths);
      }
    }

    // Assign tables without nested column pruning info to the default conf
    for (PartitionDesc pd : conf.getPathToPartitionInfo().values()) {
      if (!tableNameToConf.containsKey(pd.getTableName())) {
        tableNameToConf.put(pd.getTableName(), hconf);
      }
    }

    for (PartitionDesc pd : conf.getAliasToPartnInfo().values()) {
      if (!tableNameToConf.containsKey(pd.getTableName())) {
        tableNameToConf.put(pd.getTableName(), hconf);
      }
    }

    return tableNameToConf;
  }

  /*
   * This is the same as the setChildren method below, but for empty tables.
   * It takes care of the following:
   * 1. Creating the right object inspector.
   * 2. Setting up childrenOpToOI with that object inspector,
   *    so that initialization happens correctly.
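   * Each child is expected to be a TableScanOperator; since there is no partition data to
   * read, the object inspector comes from the table descriptor's deserializer.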
   */
  public void initEmptyInputChildren(List<Operator<?>> children, Configuration hconf)
      throws SerDeException, Exception {
    setChildOperators(children);

    Map<String, Configuration> tableNameToConf = cloneConfsForNestedColPruning(hconf);

    for (Operator<?> child : children) {
      TableScanOperator tsOp = (TableScanOperator) child;
      StructObjectInspector soi = null;
      PartitionDesc partDesc = conf.getAliasToPartnInfo().get(tsOp.getConf().getAlias());
      Configuration newConf = tableNameToConf.get(partDesc.getTableDesc().getTableName());
      Deserializer serde = partDesc.getTableDesc().getDeserializer();
      partDesc.setProperties(partDesc.getProperties());
      MapOpCtx opCtx = new MapOpCtx(tsOp.getConf().getAlias(), child, partDesc);
      StructObjectInspector tableRowOI = (StructObjectInspector) serde.getObjectInspector();
      initObjectInspector(newConf, opCtx, tableRowOI);
      soi = opCtx.rowObjectInspector;
      child.getParentOperators().add(this);
      childrenOpToOI.put(child, soi);
    }
  }

  public void setChildren(Configuration hconf) throws Exception {
    List<Operator<? extends OperatorDesc>> children =
        new ArrayList<Operator<? extends OperatorDesc>>();

    Map<String, Configuration> tableNameToConf = cloneConfsForNestedColPruning(hconf);

    Map<TableDesc, StructObjectInspector> convertedOI = getConvertedOI(tableNameToConf);

    for (Map.Entry<Path, ArrayList<String>> entry : conf.getPathToAliases().entrySet()) {
      Path onefile = entry.getKey();
      List<String> aliases = entry.getValue();
      PartitionDesc partDesc = conf.getPathToPartitionInfo().get(onefile);
      TableDesc tableDesc = partDesc.getTableDesc();
      Configuration newConf = tableNameToConf.get(tableDesc.getTableName());

      for (String alias : aliases) {
        Operator<? extends OperatorDesc> op = conf.getAliasToWork().get(alias);
        if (isLogDebugEnabled) {
          LOG.debug("Adding alias " + alias + " to work list for file " + onefile);
        }
        Map<Operator<?>, MapOpCtx> contexts = opCtxMap.get(onefile.toString());
        if (contexts == null) {
          opCtxMap.put(onefile.toString(), contexts = new LinkedHashMap<Operator<?>, MapOpCtx>());
        }
        if (contexts.containsKey(op)) {
          continue;
        }
        MapOpCtx context = new MapOpCtx(alias, op, partDesc);
        StructObjectInspector tableRowOI = convertedOI.get(partDesc.getTableDesc());
        contexts.put(op, initObjectInspector(newConf, context, tableRowOI));

        if (children.contains(op) == false) {
          op.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>(1));
          op.getParentOperators().add(this);
          children.add(op);
        }
      }
    }

    initOperatorContext(children);

    // we found all the operators that we are supposed to process.
    setChildOperators(children);
  }

  private void initOperatorContext(List<Operator<? extends OperatorDesc>> children)
      throws HiveException {
    for (Map<Operator<?>, MapOpCtx> contexts : opCtxMap.values()) {
      for (MapOpCtx context : contexts.values()) {
        if (!children.contains(context.op)) {
          continue;
        }
        StructObjectInspector prev =
            childrenOpToOI.put(context.op, context.rowObjectInspector);
        if (prev != null && !prev.equals(context.rowObjectInspector)) {
          throw new HiveException("Conflict on row inspector for " + context.alias);
        }
        if (isLogDebugEnabled) {
          LOG.debug("dump " + context.op + " " + context.rowObjectInspector.getTypeName());
        }
      }
    }
  }

  /** Kryo ctor.
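   * Used by Kryo to instantiate this operator when deserializing the plan;
   * no initialization happens here.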
   */
  protected MapOperator() {
    super();
  }

  public MapOperator(CompilationOpContext ctx) {
    super(ctx);
  }

  @Override
  public void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
  }

  public void initializeMapOperator(Configuration hconf) throws HiveException {
    super.initializeMapOperator(hconf);

    cntr = 1;
    logEveryNRows = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVE_LOG_N_RECORDS);

    for (Entry<Operator<?>, StructObjectInspector> entry : childrenOpToOI.entrySet()) {
      Operator<?> child = entry.getKey();
      child.initialize(hconf, new ObjectInspector[] {entry.getValue()});
    }
  }

  // Find context for current input file
  @Override
  public void cleanUpInputFileChangedOp() throws HiveException {
    super.cleanUpInputFileChangedOp();
    Path fpath = getExecContext().getCurrentInputPath();
    String nominalPath = getNominalPath(fpath);
    Map<Operator<?>, MapOpCtx> contexts = opCtxMap.get(nominalPath);
    if (isLogInfoEnabled) {
      StringBuilder builder = new StringBuilder();
      for (MapOpCtx context : contexts.values()) {
        if (builder.length() > 0) {
          builder.append(", ");
        }
        builder.append(context.alias);
      }
      if (isLogDebugEnabled) {
        LOG.debug("Processing alias(es) " + builder.toString() + " for file " + fpath);
      }
    }
    // Add alias, table name, and partitions to hadoop conf so that their
    // children will inherit these
    for (Entry<Operator<?>, MapOpCtx> entry : contexts.entrySet()) {
      Operator<?> operator = entry.getKey();
      MapOpCtx context = entry.getValue();
      operator.setInputContext(context.tableName, context.partName);
    }
    currentCtxs = contexts.values().toArray(new MapOpCtx[contexts.size()]);
  }

  public void process(Writable value) throws HiveException {
    // A mapper can span multiple files/partitions.
    // The serializers need to be reset if the input file changed
    ExecMapperContext context = getExecContext();
    if (context != null && context.inputFileChanged()) {
      // The child operators cleanup if input file has changed
      cleanUpInputFileChanged();
    }
    int childrenDone = 0;
    for (MapOpCtx current : currentCtxs) {
      Object row = null;
      try {
        row = current.readRow(value, context);
        if (!current.forward(row)) {
          childrenDone++;
        }
      } catch (Exception e) {
        // TODO: policy on deserialization errors
        String message = null;
        try {
          message = toErrorMessage(value, row, current.rowObjectInspector);
        } catch (Throwable t) {
          message = "[" + row + ", " + value + "]: cannot get error message " + t.getMessage();
        }
        if (row == null) {
          deserialize_error_count.set(deserialize_error_count.get() + 1);
          throw new HiveException("Hive Runtime Error while processing writable " + message, e);
        }
        throw new HiveException("Hive Runtime Error while processing row " + message, e);
      }
    }
    rowsForwarded(childrenDone, 1);
  }

  protected final void rowsForwarded(int childrenDone, int rows) {
    numRows += rows;
    if (isLogInfoEnabled) {
      while (numRows >= cntr) {
        cntr = logEveryNRows == 0 ?
            cntr * 10 : numRows + logEveryNRows;
        if (cntr < 0 || numRows < 0) {
          cntr = 1;
          numRows = 0;
        }
        LOG.info(toString() + ": records read - " + numRows);
      }
    }
    if (childrenDone == currentCtxs.length) {
      setDone(true);
    }
  }

  private String toErrorMessage(Writable value, Object row, ObjectInspector inspector) {
    try {
      if (row != null) {
        return SerDeUtils.getJSONString(row, inspector);
      }
      return String.valueOf(value);
    } catch (Exception e) {
      return "[Error getting row data with exception " +
          StringUtils.stringifyException(e) + " ]";
    }
  }

  public static Object[] populateVirtualColumnValues(ExecMapperContext ctx,
      List<VirtualColumn> vcs, Object[] vcValues, Deserializer deserializer) {
    if (vcs == null) {
      return vcValues;
    }
    if (vcValues == null) {
      vcValues = new Object[vcs.size()];
    }
    for (int i = 0; i < vcs.size(); i++) {
      switch (vcs.get(i)) {
        case FILENAME:
          if (ctx.inputFileChanged()) {
            vcValues[i] = new Text(ctx.getCurrentInputPath().toString());
          }
          break;
        case BLOCKOFFSET: {
          long current = ctx.getIoCxt().getCurrentBlockStart();
          LongWritable old = (LongWritable) vcValues[i];
          if (old == null) {
            old = new LongWritable(current);
            vcValues[i] = old;
            continue;
          }
          if (current != old.get()) {
            old.set(current);
          }
        }
        break;
        case ROWOFFSET: {
          long current = ctx.getIoCxt().getCurrentRow();
          LongWritable old = (LongWritable) vcValues[i];
          if (old == null) {
            old = new LongWritable(current);
            vcValues[i] = old;
            continue;
          }
          if (current != old.get()) {
            old.set(current);
          }
        }
        break;
        case RAWDATASIZE:
          long current = 0L;
          SerDeStats stats = deserializer.getSerDeStats();
          if (stats != null) {
            current = stats.getRawDataSize();
          }
          LongWritable old = (LongWritable) vcValues[i];
          if (old == null) {
            old = new LongWritable(current);
            vcValues[i] = old;
            continue;
          }
          if (current != old.get()) {
            old.set(current);
          }
          break;
        case ROWID:
          if (ctx.getIoCxt().getRecordIdentifier() == null) {
            vcValues[i] = null;
          } else {
            if (vcValues[i] == null) {
              vcValues[i] = new Object[RecordIdentifier.Field.values().length];
            }
            RecordIdentifier.StructInfo.toArray(ctx.getIoCxt().getRecordIdentifier(),
                (Object[]) vcValues[i]);
            ctx.getIoCxt().setRecordIdentifier(null); // so we don't accidentally cache the value; shouldn't
            // happen since the IO layer either knows how to produce ROW__ID or not - but to be safe
          }
          break;
      }
    }
    return vcValues;
  }

  @Override
  public void process(Object row, int tag) throws HiveException {
    throw new HiveException("Hive 2 Internal error: should not be called!");
  }

  @Override
  public String getName() {
    return MapOperator.getOperatorName();
  }

  static public String getOperatorName() {
    return "MAP";
  }

  @Override
  public OperatorType getType() {
    return null;
  }

  public void initializeContexts() {
    Path fpath = getExecContext().getCurrentInputPath();
    String nominalPath = getNominalPath(fpath);
    Map<Operator<?>, MapOpCtx> contexts = opCtxMap.get(nominalPath);
    currentCtxs = contexts.values().toArray(new MapOpCtx[contexts.size()]);
  }

  public Deserializer getCurrentDeserializer() {
    return currentCtxs[0].deserializer;
  }
}