/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.IOContext;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;

/**
 * Map operator. This triggers overall map side processing. This is a little
 * different from regular operators in that it starts off by processing a
 * Writable data structure from a Table (instead of a Hive Object).
 **/
public class MapOperator extends Operator<MapredWork> implements Serializable {

  private static final long serialVersionUID = 1L;

  /**
   * Counter.
   */
  public static enum Counter {
    DESERIALIZE_ERRORS
  }

  private final transient LongWritable deserialize_error_count = new LongWritable();
  private transient Deserializer deserializer;

  private transient Object[] rowWithPart;
  private transient Writable[] vcValues;
  private transient List<VirtualColumn> vcs;
  private transient Object[] rowWithPartAndVC;
  private transient StructObjectInspector rowObjectInspector;
  private transient boolean isPartitioned;
  private transient boolean hasVC;

  private Map<MapInputPath, MapOpCtx> opCtxMap;
  private final Set<MapInputPath> listInputPaths = new HashSet<MapInputPath>();
  private Map<Operator<? extends Serializable>, ArrayList<String>> operatorToPaths;

  private final Map<Operator<? extends Serializable>, MapOpCtx> childrenOpToOpCtxMap =
    new HashMap<Operator<? extends Serializable>, MapOpCtx>();

  private ArrayList<Operator<? extends Serializable>> extraChildrenToClose = null;
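  /*
   * A minimal usage sketch (illustrative only, not part of this class): it mirrors how a
   * map-side driver such as ExecMapper might wire the operator, assuming the plan is read back
   * from the job configuration. Helper and reader names outside this file are assumptions.
   *
   *   MapredWork mrwork = Utilities.getMapRedWork(jobConf); // plan attached to the job
   *   MapOperator mo = new MapOperator();
   *   mo.initializeAsRoot(jobConf, mrwork);                 // setConf + setChildren + initialize
   *   while (reader.next(key, value)) {
   *     mo.process(value);                                  // one Writable record at a time
   *   }
   *   mo.close(false);                                      // also closes extra children (closeOp)
   */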
  private static class MapInputPath {
    String path;
    String alias;
    Operator<? extends Serializable> op;

    /**
     * @param path
     * @param alias
     * @param op
     */
    public MapInputPath(String path, String alias,
        Operator<? extends Serializable> op) {
      this.path = path;
      this.alias = alias;
      this.op = op;
    }

    @Override
    public boolean equals(Object o) {
      if (o instanceof MapInputPath) {
        MapInputPath mObj = (MapInputPath) o;
        return path.equals(mObj.path) && alias.equals(mObj.alias)
            && op.equals(mObj.op);
      }
      return false;
    }

    @Override
    public int hashCode() {
      int ret = (path == null) ? 0 : path.hashCode();
      ret += (alias == null) ? 0 : alias.hashCode();
      ret += (op == null) ? 0 : op.hashCode();
      return ret;
    }

    public Operator<? extends Serializable> getOp() {
      return op;
    }

    public void setOp(Operator<? extends Serializable> op) {
      this.op = op;
    }
  }

  private static class MapOpCtx {
    boolean isPartitioned;
    StructObjectInspector rawRowObjectInspector; // without partition
    StructObjectInspector partObjectInspector; // partition
    StructObjectInspector rowObjectInspector;
    Object[] rowWithPart;
    Deserializer deserializer;
    public String tableName;
    public String partName;

    /**
     * @param isPartitioned
     * @param rowObjectInspector
     * @param rowWithPart
     */
    public MapOpCtx(boolean isPartitioned,
        StructObjectInspector rowObjectInspector,
        StructObjectInspector rawRowObjectInspector,
        StructObjectInspector partObjectInspector,
        Object[] rowWithPart, Deserializer deserializer) {
      this.isPartitioned = isPartitioned;
      this.rowObjectInspector = rowObjectInspector;
      this.rawRowObjectInspector = rawRowObjectInspector;
      this.partObjectInspector = partObjectInspector;
      this.rowWithPart = rowWithPart;
      this.deserializer = deserializer;
    }

    /**
     * @return the isPartitioned
     */
    public boolean isPartitioned() {
      return isPartitioned;
    }

    /**
     * @return the rowObjectInspector
     */
    public StructObjectInspector getRowObjectInspector() {
      return rowObjectInspector;
    }

    /**
     * @return the rowWithPart
     */
    public Object[] getRowWithPart() {
      return rowWithPart;
    }

    /**
     * @return the deserializer
     */
    public Deserializer getDeserializer() {
      return deserializer;
    }
  }
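  /*
   * For reference, the row layouts forwarded by this operator (as built in setInspectorInput
   * and process below) are:
   *   non-partitioned, no virtual columns:  row                                  (SerDe output)
   *   partitioned, no virtual columns:      rowWithPart      = { row, partValues }
   *   non-partitioned, with virtual cols:   rowWithPartAndVC = { row, vcValues }
   *   partitioned, with virtual cols:       rowWithPartAndVC = { row, partValues, vcValues }
   * The matching rowObjectInspector is a union struct inspector over the raw row, partition and
   * virtual-column inspectors.
   */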
  /**
   * Initializes this map op as the root of the tree. It sets JobConf &
   * MapRedWork and starts initialization of the operator tree rooted at this
   * op.
   *
   * @param hconf
   * @param mrwork
   * @throws HiveException
   */
  public void initializeAsRoot(Configuration hconf, MapredWork mrwork)
      throws HiveException {
    setConf(mrwork);
    setChildren(hconf);
    initialize(hconf, null);
  }

  private static MapOpCtx initObjectInspector(MapredWork conf,
      Configuration hconf, String onefile) throws HiveException,
      ClassNotFoundException, InstantiationException, IllegalAccessException,
      SerDeException {
    PartitionDesc td = conf.getPathToPartitionInfo().get(onefile);
    LinkedHashMap<String, String> partSpec = td.getPartSpec();
    Properties tblProps = td.getProperties();

    Class sdclass = td.getDeserializerClass();
    if (sdclass == null) {
      String className = td.getSerdeClassName();
      if (className == null || className.isEmpty()) {
        throw new HiveException(
            "SerDe class or the SerDe class name is not set for table: "
            + td.getProperties().getProperty("name"));
      }
      sdclass = hconf.getClassByName(className);
    }

    String tableName = String.valueOf(tblProps.getProperty("name"));
    String partName = String.valueOf(partSpec);
    // HiveConf.setVar(hconf, HiveConf.ConfVars.HIVETABLENAME, tableName);
    // HiveConf.setVar(hconf, HiveConf.ConfVars.HIVEPARTITIONNAME, partName);
    Deserializer deserializer = (Deserializer) sdclass.newInstance();
    deserializer.initialize(hconf, tblProps);
    StructObjectInspector rawRowObjectInspector =
      (StructObjectInspector) deserializer.getObjectInspector();

    MapOpCtx opCtx = null;
    // Next check if this table has partitions and if so
    // get the list of partition names as well as allocate
    // the serdes for the partition columns
    String pcols = tblProps.getProperty(
        org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);
    // Log LOG = LogFactory.getLog(MapOperator.class.getName());
    if (pcols != null && pcols.length() > 0) {
      String[] partKeys = pcols.trim().split("/");
      List<String> partNames = new ArrayList<String>(partKeys.length);
      Object[] partValues = new Object[partKeys.length];
      List<ObjectInspector> partObjectInspectors =
        new ArrayList<ObjectInspector>(partKeys.length);
      for (int i = 0; i < partKeys.length; i++) {
        String key = partKeys[i];
        partNames.add(key);
        // Partitions do not exist for this table
        if (partSpec == null) {
          partValues[i] = new Text();
        } else {
          partValues[i] = new Text(partSpec.get(key));
        }
        partObjectInspectors
            .add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
      }
      StructObjectInspector partObjectInspector = ObjectInspectorFactory
          .getStandardStructObjectInspector(partNames, partObjectInspectors);

      Object[] rowWithPart = new Object[2];
      rowWithPart[1] = partValues;
      StructObjectInspector rowObjectInspector = ObjectInspectorFactory
          .getUnionStructObjectInspector(Arrays
              .asList(new StructObjectInspector[] {rawRowObjectInspector, partObjectInspector}));
      // LOG.info("dump " + tableName + " " + partName + " " +
      // rowObjectInspector.getTypeName());
      opCtx = new MapOpCtx(true, rowObjectInspector, rawRowObjectInspector,
          partObjectInspector, rowWithPart, deserializer);
    } else {
      // LOG.info("dump2 " + tableName + " " + partName + " " +
      // rowObjectInspector.getTypeName());
      opCtx = new MapOpCtx(false, rawRowObjectInspector, rawRowObjectInspector,
          null, null, deserializer);
    }
    opCtx.tableName = tableName;
    opCtx.partName = partName;
    return opCtx;
  }
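  /*
   * A worked example for initObjectInspector above (values are illustrative): for a table
   * partitioned by (ds, hr), the table properties carry the partition columns as a '/'-joined
   * list, e.g. "ds/hr", and a partition such as .../ds=2011-01-01/hr=12 arrives with
   * partSpec = {ds=2011-01-01, hr=12}. The loop then builds
   *   partValues = { new Text("2011-01-01"), new Text("12") }
   * and the returned MapOpCtx exposes a union inspector over (SerDe columns, ds, hr).
   */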
  /**
   * Sets the inspectors for a given input. Since a mapper can span multiple
   * partitions, the inspectors need to be changed if the input changes.
   **/
  private void setInspectorInput(MapInputPath inp) {
    Operator<? extends Serializable> op = inp.getOp();

    deserializer = opCtxMap.get(inp).getDeserializer();
    isPartitioned = opCtxMap.get(inp).isPartitioned();
    rowWithPart = opCtxMap.get(inp).getRowWithPart();
    rowObjectInspector = opCtxMap.get(inp).getRowObjectInspector();
    if (listInputPaths.contains(inp)) {
      return;
    }

    listInputPaths.add(inp);
    StructObjectInspector rawRowObjectInspector = opCtxMap.get(inp).rawRowObjectInspector;
    StructObjectInspector partObjectInspector = opCtxMap.get(inp).partObjectInspector;
    if (op instanceof TableScanOperator) {
      TableScanOperator tsOp = (TableScanOperator) op;
      TableScanDesc tsDesc = tsOp.getConf();
      if (tsDesc != null) {
        this.vcs = tsDesc.getVirtualCols();
        if (vcs != null && vcs.size() > 0) {
          this.hasVC = true;
          List<String> vcNames = new ArrayList<String>(vcs.size());
          this.vcValues = new Writable[vcs.size()];
          List<ObjectInspector> vcsObjectInspectors = new ArrayList<ObjectInspector>(vcs.size());
          for (int i = 0; i < vcs.size(); i++) {
            VirtualColumn vc = vcs.get(i);
            vcsObjectInspectors.add(
                PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
                    ((PrimitiveTypeInfo) vc.getTypeInfo()).getPrimitiveCategory()));
            vcNames.add(vc.getName());
          }
          StructObjectInspector vcStructObjectInspector = ObjectInspectorFactory
              .getStandardStructObjectInspector(vcNames, vcsObjectInspectors);
          if (isPartitioned) {
            this.rowWithPartAndVC = new Object[3];
            this.rowWithPartAndVC[1] = this.rowWithPart[1];
          } else {
            this.rowWithPartAndVC = new Object[2];
          }
          if (partObjectInspector == null) {
            this.rowObjectInspector = ObjectInspectorFactory.getUnionStructObjectInspector(Arrays
                .asList(new StructObjectInspector[] {rowObjectInspector, vcStructObjectInspector}));
          } else {
            this.rowObjectInspector = ObjectInspectorFactory.getUnionStructObjectInspector(Arrays
                .asList(new StructObjectInspector[] {rawRowObjectInspector, partObjectInspector,
                    vcStructObjectInspector}));
          }
          opCtxMap.get(inp).rowObjectInspector = this.rowObjectInspector;
        }
      }
    }
  }
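  /*
   * Path matching used in setChildren and cleanUpInputFileChangedOp below, shown on
   * illustrative paths: with
   *   onepath = /user/hive/warehouse/t/ds=1        (a path from the plan)
   *   fpath   = /user/hive/warehouse/t/ds=1/file0  (the current input file)
   * onepath.toUri().relativize(fpath.toUri()) returns the relative URI "file0", which is not
   * equal to fpath, so the aliases registered under onepath claim this file. If onepath is not
   * a prefix of fpath, relativize returns fpath unchanged and the entry is skipped.
   */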
  public void setChildren(Configuration hconf) throws HiveException {
    Path fpath = new Path((new Path(HiveConf.getVar(hconf,
        HiveConf.ConfVars.HADOOPMAPFILENAME))).toUri().getPath());

    ArrayList<Operator<? extends Serializable>> children =
      new ArrayList<Operator<? extends Serializable>>();
    opCtxMap = new HashMap<MapInputPath, MapOpCtx>();
    operatorToPaths = new HashMap<Operator<? extends Serializable>, ArrayList<String>>();

    statsMap.put(Counter.DESERIALIZE_ERRORS, deserialize_error_count);

    try {
      for (String onefile : conf.getPathToAliases().keySet()) {
        MapOpCtx opCtx = initObjectInspector(conf, hconf, onefile);
        Path onepath = new Path(new Path(onefile).toUri().getPath());
        List<String> aliases = conf.getPathToAliases().get(onefile);

        for (String onealias : aliases) {
          Operator<? extends Serializable> op = conf.getAliasToWork().get(onealias);
          LOG.info("Adding alias " + onealias + " to work list for file " + onefile);
          MapInputPath inp = new MapInputPath(onefile, onealias, op);
          opCtxMap.put(inp, opCtx);
          if (operatorToPaths.get(op) == null) {
            operatorToPaths.put(op, new ArrayList<String>());
          }
          operatorToPaths.get(op).add(onefile);
          op.setParentOperators(new ArrayList<Operator<? extends Serializable>>());
          op.getParentOperators().add(this);
          // check for the operators that will process rows coming to this Map
          // Operator
          if (!onepath.toUri().relativize(fpath.toUri()).equals(fpath.toUri())) {
            children.add(op);
            childrenOpToOpCtxMap.put(op, opCtx);
            LOG.info("dump " + op.getName() + " "
                + opCtxMap.get(inp).getRowObjectInspector().getTypeName());
          }
          setInspectorInput(inp);
        }
      }
      if (children.size() == 0) {
        // didn't find match for input file path in configuration!
        // serious problem ..
        LOG.error("Configuration does not have any alias for path: "
            + fpath.toUri().getPath());
        throw new HiveException("Configuration and input path are inconsistent");
      }

      // we found all the operators that we are supposed to process.
      setChildOperators(children);
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }

  @Override
  public void initializeOp(Configuration hconf) throws HiveException {
    // set that parent initialization is done and call initialize on children
    state = State.INIT;
    List<Operator<? extends Serializable>> children = getChildOperators();

    for (Entry<Operator<? extends Serializable>, MapOpCtx> entry : childrenOpToOpCtxMap
        .entrySet()) {
      Operator<? extends Serializable> child = entry.getKey();
      MapOpCtx mapOpCtx = entry.getValue();
      // Add alias, table name, and partitions to hadoop conf so that their
      // children will inherit these
      HiveConf.setVar(hconf, HiveConf.ConfVars.HIVETABLENAME, mapOpCtx.tableName);
      HiveConf.setVar(hconf, HiveConf.ConfVars.HIVEPARTITIONNAME, mapOpCtx.partName);
      child.initialize(hconf, new ObjectInspector[] {mapOpCtx.getRowObjectInspector()});
    }

    for (Entry<MapInputPath, MapOpCtx> entry : opCtxMap.entrySet()) {
      // Add alias, table name, and partitions to hadoop conf so that their
      // children will inherit these
      HiveConf.setVar(hconf, HiveConf.ConfVars.HIVETABLENAME, entry.getValue().tableName);
      HiveConf.setVar(hconf, HiveConf.ConfVars.HIVEPARTITIONNAME, entry.getValue().partName);
      MapInputPath input = entry.getKey();
      Operator<? extends Serializable> op = input.op;
      // op is not in the children list, so need to remember it and close it
      // afterwards
      if (children.indexOf(op) == -1) {
        if (extraChildrenToClose == null) {
          extraChildrenToClose = new ArrayList<Operator<? extends Serializable>>();
        }
        extraChildrenToClose.add(op);
        op.initialize(hconf, new ObjectInspector[] {entry.getValue().getRowObjectInspector()});
      }
    }
  }

  /**
   * Close extra child operators that are initialized but are not executed.
   */
  @Override
  public void closeOp(boolean abort) throws HiveException {
    if (extraChildrenToClose != null) {
      for (Operator<? extends Serializable> op : extraChildrenToClose) {
        op.close(abort);
      }
    }
  }
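  /*
   * When a split spans multiple files or partitions, process() notices the file change via
   * ExecMapperContext.inputFileChanged(), cleanUpInputFileChanged() is invoked, and the
   * override below re-resolves the first alias registered for the new path and re-points the
   * deserializer and object inspectors through setInspectorInput().
   */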
  // Change the deserializer etc. since it is a new file, and a split can span
  // multiple files/partitions.
  @Override
  public void cleanUpInputFileChangedOp() throws HiveException {
    Path fpath = new Path((new Path(this.getExecContext().getCurrentInputFile()))
        .toUri().getPath());

    for (String onefile : conf.getPathToAliases().keySet()) {
      Path onepath = new Path(new Path(onefile).toUri().getPath());
      // check for the operators that will process rows coming to this Map
      // Operator
      if (!onepath.toUri().relativize(fpath.toUri()).equals(fpath.toUri())) {
        String onealias = conf.getPathToAliases().get(onefile).get(0);
        Operator<? extends Serializable> op = conf.getAliasToWork().get(onealias);

        LOG.info("Processing alias " + onealias + " for file " + onefile);

        MapInputPath inp = new MapInputPath(onefile, onealias, op);
        setInspectorInput(inp);
        break;
      }
    }
  }

  public void process(Writable value) throws HiveException {
    // A mapper can span multiple files/partitions.
    // The deserializers need to be reset if the input file changed.
    if ((this.getExecContext() != null) && this.getExecContext().inputFileChanged()) {
      // The child operators clean up if the input file has changed
      cleanUpInputFileChanged();
    }

    Object row = null;
    try {
      if (this.hasVC) {
        this.rowWithPartAndVC[0] = deserializer.deserialize(value);
        int vcPos = isPartitioned ? 2 : 1;
        populateVirtualColumnValues();
        this.rowWithPartAndVC[vcPos] = this.vcValues;
      } else if (!isPartitioned) {
        row = deserializer.deserialize(value);
      } else {
        rowWithPart[0] = deserializer.deserialize(value);
      }
    } catch (Exception e) {
      // Serialize the row and output.
      String rawRowString;
      try {
        rawRowString = value.toString();
      } catch (Exception e2) {
        rawRowString = "[Error getting row data with exception "
            + StringUtils.stringifyException(e2) + " ]";
      }

      // TODO: policy on deserialization errors
      deserialize_error_count.set(deserialize_error_count.get() + 1);
      throw new HiveException("Hive Runtime Error while processing writable "
          + rawRowString, e);
    }

    try {
      if (this.hasVC) {
        forward(this.rowWithPartAndVC, this.rowObjectInspector);
      } else if (!isPartitioned) {
        forward(row, rowObjectInspector);
      } else {
        forward(rowWithPart, rowObjectInspector);
      }
    } catch (Exception e) {
      // Serialize the row and output the error message.
      String rowString;
      try {
        if (this.hasVC) {
          rowString = SerDeUtils.getJSONString(rowWithPartAndVC, rowObjectInspector);
        } else if (!isPartitioned) {
          rowString = SerDeUtils.getJSONString(row, rowObjectInspector);
        } else {
          rowString = SerDeUtils.getJSONString(rowWithPart, rowObjectInspector);
        }
      } catch (Exception e2) {
        rowString = "[Error getting row data with exception "
            + StringUtils.stringifyException(e2) + " ]";
      }
      throw new HiveException("Hive Runtime Error while processing row "
          + rowString, e);
    }
  }

  private void populateVirtualColumnValues() {
    if (this.vcs != null) {
      ExecMapperContext mapExecCxt = this.getExecContext();
      IOContext ioCxt = mapExecCxt.getIoCxt();
      for (int i = 0; i < vcs.size(); i++) {
        VirtualColumn vc = vcs.get(i);
        if (vc.equals(VirtualColumn.FILENAME) && mapExecCxt.inputFileChanged()) {
          this.vcValues[i] = new Text(mapExecCxt.getCurrentInputFile());
        } else if (vc.equals(VirtualColumn.BLOCKOFFSET)) {
          long current = ioCxt.getCurrentBlockStart();
          LongWritable old = (LongWritable) this.vcValues[i];
          if (old == null) {
            old = new LongWritable(current);
            this.vcValues[i] = old;
            continue;
          }
          if (current != old.get()) {
            old.set(current);
          }
        } else if (vc.equals(VirtualColumn.ROWOFFSET)) {
          long current = ioCxt.getCurrentRow();
          LongWritable old = (LongWritable) this.vcValues[i];
          if (old == null) {
            old = new LongWritable(current);
            this.vcValues[i] = old;
            continue;
          }
          if (current != old.get()) {
            old.set(current);
          }
        } else if (vc.equals(VirtualColumn.RAWDATASIZE)) {
          long current = 0L;
          SerDeStats stats = this.deserializer.getSerDeStats();
          if (stats != null) {
            current = stats.getRawDataSize();
          }
          LongWritable old = (LongWritable) this.vcValues[i];
          if (old == null) {
            old = new LongWritable(current);
            this.vcValues[i] = old;
            continue;
          }
          if (current != old.get()) {
            old.set(current);
          }
        }
      }
    }
  }
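  /*
   * The values populated above back Hive's virtual columns, e.g. a query such as
   *   SELECT INPUT__FILE__NAME, BLOCK__OFFSET__INSIDE__FILE, key FROM src
   * (shown for illustration). The LongWritable instances in vcValues are reused across rows and
   * only mutated when the underlying offset or size actually changes, avoiding per-row
   * allocations.
   */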
  @Override
  public void processOp(Object row, int tag) throws HiveException {
    throw new HiveException("Hive 2 Internal error: should not be called!");
  }

  @Override
  public String getName() {
    return "MAP";
  }

  @Override
  public OperatorType getType() {
    return null;
  }
}