/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.tez;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorDeserializeRow;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory;
import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe;
import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead;
import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.tez.runtime.api.Reader;
import org.apache.tez.runtime.library.api.KeyValueReader;
import org.apache.tez.runtime.library.api.KeyValuesReader;
import com.google.common.base.Preconditions;
/**
 * Process input from a Tez LogicalInput on the reduce side and push the
 * deserialized records through the reduce-side operator plan.
 *
 * Supports both row-mode processing (one deserialized row at a time via
 * {@link GroupIterator}) and vectorized processing (whole key-groups packed
 * into {@link VectorizedRowBatch}es).
 */
@SuppressWarnings("deprecation")
public class ReduceRecordSource implements RecordSource {

  public static final Logger l4j = LoggerFactory.getLogger(ReduceRecordSource.class);

  private static final String CLASS_NAME = ReduceRecordSource.class.getName();

  // Tag identifying which reducer input (parent) this source feeds; passed to
  // reducer.process() so multi-input operators can tell the sources apart.
  private byte tag;

  // Set when any fatal error occurs; reported back to the caller via close().
  private boolean abort = false;

  private Deserializer inputKeyDeserializer;

  // Input value serde needs to be an array to support different SerDe
  // for different tags
  private AbstractSerDe inputValueDeserializer;

  private TableDesc keyTableDesc;
  private TableDesc valueTableDesc;

  // Object inspector for the full (key + value) reduce row handed to the reducer.
  private ObjectInspector rowObjectInspector;
  private Operator<?> reducer;

  // Deserialized key of the row currently being pushed (row mode only).
  private Object keyObject = null;
  // Serialized bytes of the current group's key; null until the first group is seen.
  private BytesWritable groupKey;

  private boolean vectorized = false;

  // Vectorized-mode deserializers: keys arrive BinarySortable-encoded, values
  // LazyBinary-encoded (see init()).
  private VectorDeserializeRow<BinarySortableDeserializeRead> keyBinarySortableDeserializeToRow;

  private VectorDeserializeRow<LazyBinaryDeserializeRead> valueLazyBinaryDeserializeToRow;

  private VectorizedRowBatch batch;

  // number of columns pertaining to keys in a vectorized row batch
  private int firstValueColumnOffset;

  // Flush threshold: cap on accumulated serialized bytes per vectorized batch.
  private final int BATCH_BYTES = VectorizedRowBatch.DEFAULT_BYTES;

  private StructObjectInspector keyStructInspector;
  private StructObjectInspector valueStructInspectors;

  /* this is only used in the error code path */
  private List<VectorExpressionWriter> valueStringWriters;

  // Adapter that presents either a KeyValueReader or a KeyValuesReader as
  // (key, values) groups.
  private KeyValuesAdapter reader;

  // When true, this source drives reducer.startGroup()/endGroup() on key changes.
  private boolean handleGroupKey;

  private ObjectInspector valueObjectInspector;

  private final PerfLogger perfLogger = SessionState.getPerfLogger();

  // Values of the current group, as handed out by the reader.
  private Iterable<Object> valueWritables;

  private final GroupIterator groupIterator = new GroupIterator();

  private long vectorizedVertexNum;

  /**
   * Initialize this source: wrap the Tez reader, instantiate and initialize the
   * key/value deserializers, and (in vectorized mode) set up the row batch and
   * the fast key/value deserializers.
   *
   * @param jconf job configuration (currently unused here)
   * @param reducer root operator of the reduce-side plan to push rows into
   * @param vectorized whether to use the vectorized code path
   * @param keyTableDesc descriptor for the key serde (BinarySortable in vectorized mode)
   * @param valueTableDesc descriptor for the value serde
   * @param reader Tez reader; either a KeyValueReader or a KeyValuesReader
   * @param handleGroupKey whether this source should signal group boundaries to the reducer
   * @param tag input tag for rows pushed from this source
   * @param batchContext context used to create the vectorized row batch (vectorized mode)
   * @param vectorizedVertexNum vertex number, included in vectorized error messages
   * @throws Exception on any initialization failure (wrapped as RuntimeException unless OOM)
   */
  void init(JobConf jconf, Operator<?> reducer, boolean vectorized, TableDesc keyTableDesc,
      TableDesc valueTableDesc, Reader reader, boolean handleGroupKey, byte tag,
      VectorizedRowBatchCtx batchContext, long vectorizedVertexNum)
      throws Exception {
    this.vectorizedVertexNum = vectorizedVertexNum;
    ObjectInspector keyObjectInspector;

    this.reducer = reducer;
    this.vectorized = vectorized;
    this.keyTableDesc = keyTableDesc;
    // Normalize both Tez reader flavors behind the KeyValuesAdapter interface.
    if (reader instanceof KeyValueReader) {
      this.reader = new KeyValuesFromKeyValue((KeyValueReader) reader);
    } else {
      this.reader = new KeyValuesFromKeyValues((KeyValuesReader) reader);
    }
    this.handleGroupKey = handleGroupKey;
    this.tag = tag;

    try {
      inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc
          .getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
      keyObjectInspector = inputKeyDeserializer.getObjectInspector();

      if(vectorized) {
        keyStructInspector = (StructObjectInspector) keyObjectInspector;
        // Key columns occupy the first slots of the batch; values start after them.
        firstValueColumnOffset = keyStructInspector.getAllStructFieldRefs().size();
      }

      // We should initialize the SerDe with the TypeInfo when available.
      this.valueTableDesc = valueTableDesc;
      inputValueDeserializer = (AbstractSerDe) ReflectionUtils.newInstance(
          valueTableDesc.getDeserializerClass(), null);
      SerDeUtils.initializeSerDe(inputValueDeserializer, null,
          valueTableDesc.getProperties(), null);
      valueObjectInspector = inputValueDeserializer.getObjectInspector();

      ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>();

      if(vectorized) {
        /* vectorization only works with struct object inspectors */
        valueStructInspectors = (StructObjectInspector) valueObjectInspector;

        final int totalColumns = firstValueColumnOffset +
            valueStructInspectors.getAllStructFieldRefs().size();
        // Writers over key then value columns; only used to render rows in
        // error messages (see field comment on valueStringWriters).
        valueStringWriters = new ArrayList<VectorExpressionWriter>(totalColumns);
        valueStringWriters.addAll(Arrays
            .asList(VectorExpressionWriterFactory
                .genVectorStructExpressionWritables(keyStructInspector)));
        valueStringWriters.addAll(Arrays
            .asList(VectorExpressionWriterFactory
                .genVectorStructExpressionWritables(valueStructInspectors)));

        rowObjectInspector = Utilities.constructVectorizedReduceRowOI(keyStructInspector,
            valueStructInspectors);
        batch = batchContext.createVectorizedRowBatch();

        // Setup vectorized deserialization for the key and value.
        // Keys are BinarySortable-encoded: this cast enforces that requirement.
        BinarySortableSerDe binarySortableSerDe = (BinarySortableSerDe) inputKeyDeserializer;

        keyBinarySortableDeserializeToRow =
            new VectorDeserializeRow<BinarySortableDeserializeRead>(
                new BinarySortableDeserializeRead(
                    VectorizedBatchUtil.typeInfosFromStructObjectInspector(
                        keyStructInspector),
                    /* useExternalBuffer */ true,
                    binarySortableSerDe.getSortOrders(),
                    binarySortableSerDe.getNullMarkers(),
                    binarySortableSerDe.getNotNullMarkers()));
        // Key columns start at batch column 0.
        keyBinarySortableDeserializeToRow.init(0);

        final int valuesSize = valueStructInspectors.getAllStructFieldRefs().size();
        if (valuesSize > 0) {
          valueLazyBinaryDeserializeToRow =
              new VectorDeserializeRow<LazyBinaryDeserializeRead>(
                  new LazyBinaryDeserializeRead(
                      VectorizedBatchUtil.typeInfosFromStructObjectInspector(
                          valueStructInspectors),
                      /* useExternalBuffer */ true));
          // Value columns start right after the key columns.
          valueLazyBinaryDeserializeToRow.init(firstValueColumnOffset);

          // Create data buffers for value bytes column vectors.
          for (int i = firstValueColumnOffset; i < batch.numCols; i++) {
            ColumnVector colVector = batch.cols[i];
            if (colVector instanceof BytesColumnVector) {
              BytesColumnVector bytesColumnVector = (BytesColumnVector) colVector;
              bytesColumnVector.initBuffer();
            }
          }
        }
      } else {
        // Row mode: the reduce row is a struct of { key, value }.
        ois.add(keyObjectInspector);
        ois.add(valueObjectInspector);
        rowObjectInspector =
            ObjectInspectorFactory.getStandardStructObjectInspector(Utilities.reduceFieldNameList,
                ois);
      }
    } catch (Throwable e) {
      abort = true;
      if (e instanceof OutOfMemoryError) {
        // Don't create a new object if we are already out of memory
        throw (OutOfMemoryError) e;
      } else {
        throw new RuntimeException("Reduce operator initialization failed", e);
      }
    }
    // NOTE(review): closes the TEZ_INIT_OPERATORS timer; assumes the matching
    // PerfLogBegin was issued by the caller (record processor) — confirm.
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
  }

  /**
   * In vectorized mode each pushRecord() call forwards a whole key-group as
   * batches, so the output is grouped; in row mode rows are pushed one at a time.
   */
  @Override
  public final boolean isGrouped() {
    return vectorized;
  }

  /**
   * Push the next record (row mode) or the next whole key-group (vectorized
   * mode) into the reducer.
   *
   * @return true if something was pushed, false when the input is exhausted
   * @throws HiveException on processing errors (OOM is rethrown as-is)
   */
  @Override
  public boolean pushRecord() throws HiveException {

    if (vectorized) {
      return pushRecordVector();
    }

    if (groupIterator.hasNext()) {
      // if we have records left in the group we push one of those
      groupIterator.next();
      return true;
    }

    try {
      if (!reader.next()) {
        // Input exhausted.
        return false;
      }

      BytesWritable keyWritable = (BytesWritable) reader.getCurrentKey();
      valueWritables = reader.getCurrentValues();

      //Set the key, check if this is a new group or same group
      try {
        keyObject = inputKeyDeserializer.deserialize(keyWritable);
      } catch (Exception e) {
        throw new HiveException("Hive Runtime Error: Unable to deserialize reduce input key from "
            + Utilities.formatBinaryString(keyWritable.getBytes(), 0, keyWritable.getLength())
            + " with properties " + keyTableDesc.getProperties(), e);
      }

      if (handleGroupKey && !keyWritable.equals(this.groupKey)) {
        // If a operator wants to do some work at the beginning of a group
        if (groupKey == null) { // the first group
          // groupKey == null is the "no group seen yet" sentinel; allocate once
          // and reuse the same BytesWritable for all subsequent groups.
          this.groupKey = new BytesWritable();
        } else {
          // If a operator wants to do some work at the end of a group
          reducer.endGroup();
        }

        groupKey.set(keyWritable.getBytes(), 0, keyWritable.getLength());
        reducer.startGroup();
        reducer.setGroupKeyObject(keyObject);
      }

      groupIterator.initialize(valueWritables, keyObject, tag);
      if (groupIterator.hasNext()) {
        groupIterator.next(); // push first record of group
      }
      return true;
    } catch (Throwable e) {
      abort = true;
      if (e instanceof OutOfMemoryError) {
        // Don't create a new object if we are already out of memory
        throw (OutOfMemoryError) e;
      } else {
        l4j.error(StringUtils.stringifyException(e));
        throw new RuntimeException(e);
      }
    }
  }

  /**
   * Deserialize one reduce input value, wrapping any serde failure in a
   * HiveException that includes the raw bytes and serde properties.
   *
   * @param valueWritable serialized value bytes
   * @param tag input tag, included in the error message only
   * @return the deserialized value object
   * @throws HiveException if deserialization fails
   */
  private Object deserializeValue(BytesWritable valueWritable, byte tag)
      throws HiveException {

    try {
      return inputValueDeserializer.deserialize(valueWritable);
    } catch (SerDeException e) {
      throw new HiveException(
          "Hive Runtime Error: Unable to deserialize reduce input value (tag="
              + tag
              + ") from "
              + Utilities.formatBinaryString(valueWritable.getBytes(), 0,
                  valueWritable.getLength())
              + " with properties " + valueTableDesc.getProperties(), e);
    }
  }

  /**
   * Iterates the values of the current key-group, assembling each
   * { key, value } row and pushing it into the reducer.
   */
  private class GroupIterator {
    // Reused row buffer: element 0 is the key, element 1 the value.
    private final List<Object> row = new ArrayList<Object>(Utilities.reduceFieldNameList.size());
    // Standard-object copy of the key, created lazily for CommonMergeJoinOperator
    // (see next()); null otherwise.
    private List<Object> passDownKey = null;
    private Iterator<Object> values;
    private byte tag;
    private Object keyObject;

    /**
     * Reset the iterator for a new key-group.
     */
    public void initialize(Iterable<Object> values, Object keyObject, byte tag) {
      this.passDownKey = null;
      this.values = values.iterator();
      this.tag = tag;
      this.keyObject = keyObject;
    }

    public boolean hasNext() {
      return values != null && values.hasNext();
    }

    /**
     * Build the next { key, value } row and push it into the reducer.
     *
     * @throws HiveException if deserialization or operator processing fails;
     *         the failing row is rendered as JSON in the message when possible
     */
    public void next() throws HiveException {
      row.clear();
      Object value = values.next();
      BytesWritable valueWritable = (BytesWritable) value;

      if (passDownKey == null) {
        row.add(this.keyObject);
      } else {
        row.add(passDownKey.get(0));
      }
      if ((passDownKey == null) && (reducer instanceof CommonMergeJoinOperator)) {
        // Merge join keeps key references across rows, so hand it a standard
        // (writable-copied) key object instead of the serde-owned one; the copy
        // is made once per group and reused for the remaining rows.
        passDownKey =
            (List<Object>) ObjectInspectorUtils.copyToStandardObject(row,
                reducer.getInputObjInspectors()[tag], ObjectInspectorCopyOption.WRITABLE);
        row.remove(0);
        row.add(0, passDownKey.get(0));
      }

      row.add(deserializeValue(valueWritable, tag));

      try {
        reducer.process(row, tag);
      } catch (Exception e) {
        String rowString = null;
        try {
          rowString = SerDeUtils.getJSONString(row, rowObjectInspector);
        } catch (Exception e2) {
          rowString = "[Error getting row data with exception "
              + StringUtils.stringifyException(e2) + " ]";
        }
        throw new HiveException("Hive Runtime Error while processing row (tag="
            + tag + ") " + rowString, e);
      }
    }
  }

  /**
   * Vectorized counterpart of pushRecord(): read the next key-group, maintain
   * the group-key lifecycle, and forward the whole group as row batches.
   *
   * @return true if a group was processed, false when the input is exhausted
   */
  private boolean pushRecordVector() {
    try {
      if (!reader.next()) {
        return false;
      }

      BytesWritable keyWritable = (BytesWritable) reader.getCurrentKey();
      valueWritables = reader.getCurrentValues();

      // Check if this is a new group or same group
      if (handleGroupKey && !keyWritable.equals(this.groupKey)) {
        // If a operator wants to do some work at the beginning of a group
        if (groupKey == null) { // the first group
          // First group ever seen: allocate the reusable group-key holder.
          this.groupKey = new BytesWritable();
        } else {
          // If a operator wants to do some work at the end of a group
          reducer.endGroup();
        }

        groupKey.set(keyWritable.getBytes(), 0, keyWritable.getLength());
        reducer.startGroup();
      }

      processVectorGroup(keyWritable, valueWritables, tag);
      return true;
    } catch (Throwable e) {
      abort = true;
      if (e instanceof OutOfMemoryError) {
        // Don't create a new object if we are already out of memory
        throw (OutOfMemoryError) e;
      } else {
        l4j.error(StringUtils.stringifyException(e));
        throw new RuntimeException(e);
      }
    }
  }

  /**
   * Deserialize one complete key-group into vectorized row batches and push
   * each (possibly partial) batch into the reducer. The key is deserialized
   * once into row 0 of the key columns, which are then marked repeating; the
   * values fill the remaining columns row by row. A batch is flushed when it
   * reaches its row capacity or accumulates BATCH_BYTES of serialized input.
   *
   * @param keyWritable the serialized (BinarySortable) group key
   * @param values the serialized values sharing this key
   * @param tag input tag the batches are forwarded under
   * @throws HiveException if deserialization or operator processing fails
   * @throws IOException on read errors from the underlying reader
   */
  private void processVectorGroup(BytesWritable keyWritable,
      Iterable<Object> values, byte tag) throws HiveException, IOException {

    // Deserialize key into vector row columns.
    // Since we referencing byte column vector byte arrays by reference, we don't need
    // a data buffer.
    byte[] keyBytes = keyWritable.getBytes();
    int keyLength = keyWritable.getLength();

    // l4j.info("ReduceRecordSource processVectorGroup keyBytes " + keyLength + " " +
    //     VectorizedBatchUtil.displayBytes(keyBytes, 0, keyLength));

    keyBinarySortableDeserializeToRow.setBytes(keyBytes, 0, keyLength);
    try {
      keyBinarySortableDeserializeToRow.deserialize(batch, 0);
    } catch (Exception e) {
      throw new HiveException(
          "\nDeserializeRead details: " +
              keyBinarySortableDeserializeToRow.getDetailedReadPositionString(),
          e);
    }
    // The key is constant within the group: mark key columns repeating so only
    // row 0 needs to hold it.
    for(int i = 0; i < firstValueColumnOffset; i++) {
      VectorizedBatchUtil.setRepeatingColumn(batch, i);
    }

    final int maxSize = batch.getMaxSize();
    Preconditions.checkState(maxSize > 0);
    int rowIdx = 0;
    int batchBytes = keyBytes.length;
    try {
      for (Object value : values) {
        if (valueLazyBinaryDeserializeToRow != null) {
          // Deserialize value into vector row columns.
          BytesWritable valueWritable = (BytesWritable) value;
          byte[] valueBytes = valueWritable.getBytes();
          int valueLength = valueWritable.getLength();
          batchBytes += valueLength;

          // l4j.info("ReduceRecordSource processVectorGroup valueBytes " + valueLength + " " +
          //     VectorizedBatchUtil.displayBytes(valueBytes, 0, valueLength));

          valueLazyBinaryDeserializeToRow.setBytes(valueBytes, 0, valueLength);
          valueLazyBinaryDeserializeToRow.deserialize(batch, rowIdx);
        }
        rowIdx++;
        if (rowIdx >= maxSize || batchBytes >= BATCH_BYTES) {
          // Batch is full.
          batch.size = rowIdx;
          reducer.process(batch, tag);

          // Reset just the value columns and value buffer.
          // (Key columns stay as-is: they are repeating for the whole group.)
          for (int i = firstValueColumnOffset; i < batch.numCols; i++) {
            // Note that reset also resets the data buffer for bytes column vectors.
            batch.cols[i].reset();
          }
          rowIdx = 0;
          batchBytes = 0;
        }
      }
      if (rowIdx > 0) {
        // Flush final partial batch.
        VectorizedBatchUtil.setBatchSize(batch, rowIdx);
        reducer.process(batch, tag);
      }
      batch.reset();
    } catch (Exception e) {
      String rowString = null;
      try {
        rowString = batch.toString();
      } catch (Exception e2) {
        rowString = "[Error getting row data with exception "
            + StringUtils.stringifyException(e2) + " ]";
      }
      throw new HiveException("Hive Runtime Error while processing vector batch (tag="
          + tag + ") (vectorizedVertexNum " + vectorizedVertexNum + ") " +
          rowString, e);
    }
  }

  /**
   * Finish the last open group (if any) and report whether processing aborted.
   *
   * @return the abort flag: true if a fatal error occurred during processing
   * @throws Exception declared for interface compatibility; endGroup failures
   *         are wrapped as RuntimeException unless we already aborted
   */
  boolean close() throws Exception {
    try {
      if (handleGroupKey && groupKey != null) {
        // If a operator wants to do some work at the end of a group
        reducer.endGroup();
      }
    } catch (Exception e) {
      if (!abort) {
        // signal new failure to map-reduce
        throw new RuntimeException("Hive Runtime Error while closing operators: "
            + e.getMessage(), e);
      }
    }
    return abort;
  }

  /** Returns the object inspector describing rows pushed by this source. */
  public ObjectInspector getObjectInspector() {
    return rowObjectInspector;
  }
}