/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;

import com.google.common.collect.Lists;

import org.apache.hadoop.hive.llap.DebugUtils;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.RecordReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A record reader wrapper that presents a VectorizedRowBatch (VRB) reader as a row-by-row,
 * ObjectInspector-based reader. Because changing table OIs in the plan after compilation is
 * nearly impossible, this is made an abstract class where type-specific implementations can
 * plug in certain details, so that the data produced by wrapping a vectorized reader conforms
 * to the original OIs.
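 *
 * <p>For illustration, a minimal sketch of a concrete subclass, assuming structs are
 * represented as plain Java lists and unions as a two-slot {tag, value} array; the class name
 * and representation choices below are hypothetical, not something Hive ships:
 * <pre>{@code
 * class ListBatchToRowReader extends BatchToRowReader<List<Object>, Object[]> {
 *   ListBatchToRowReader(RecordReader<NullWritable, VectorizedRowBatch> vrbReader,
 *       VectorizedRowBatchCtx vrbCtx, List<Integer> includedCols) {
 *     super(vrbReader, vrbCtx, includedCols);
 *   }
 *   protected List<Object> createStructObject(Object previous, List<TypeInfo> childrenTypes) {
 *     // A real implementation would reuse 'previous'; this sketch always reallocates.
 *     return Arrays.asList(new Object[childrenTypes.size()]);
 *   }
 *   protected void setStructCol(List<Object> structObj, int i, Object value) {
 *     structObj.set(i, value);
 *   }
 *   protected Object getStructCol(List<Object> structObj, int i) {
 *     return structObj.get(i);
 *   }
 *   protected Object[] createUnionObject(List<TypeInfo> childrenTypes, Object previous) {
 *     return new Object[2]; // slot 0 holds the tag, slot 1 holds the value
 *   }
 *   protected void setUnion(Object[] unionObj, byte tag, Object object) {
 *     unionObj[0] = tag;
 *     unionObj[1] = object;
 *   }
 *   protected Object getUnionField(Object[] unionObj) {
 *     return unionObj[1];
 *   }
 * }
 * }</pre>
 *
 * <p>Callers then drive the reader like any mapred RecordReader (process() below is a
 * placeholder for caller logic):
 * <pre>{@code
 * NullWritable key = reader.createKey();
 * Object row = reader.createValue();
 * while (reader.next(key, row)) {
 *   process(row);
 * }
 * }</pre>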
 */
public abstract class BatchToRowReader<StructType, UnionType>
    implements RecordReader<NullWritable, Object> {
  protected static final Logger LOG = LoggerFactory.getLogger(BatchToRowReader.class);

  private final NullWritable key;
  private final VectorizedRowBatch batch;
  private final RecordReader<NullWritable, VectorizedRowBatch> vrbReader;

  private final List<TypeInfo> schema;
  private final boolean[] included;
  private int rowInBatch = 0;

  public BatchToRowReader(RecordReader<NullWritable, VectorizedRowBatch> vrbReader,
      VectorizedRowBatchCtx vrbCtx, List<Integer> includedCols) {
    this.vrbReader = vrbReader;
    this.key = vrbReader.createKey();
    this.batch = vrbReader.createValue();
    this.schema = Lists.<TypeInfo>newArrayList(vrbCtx.getRowColumnTypeInfos());
    // TODO: does this include partition columns?
    boolean[] included = new boolean[schema.size()];
    if (includedCols != null) {
      for (int colIx : includedCols) {
        included[colIx] = true;
      }
    } else {
      Arrays.fill(included, true);
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Including the columns " + DebugUtils.toString(included));
    }
    this.included = included;
  }

  // Plug-in points that let subclasses choose the runtime representation of structs and unions.
  protected abstract StructType createStructObject(Object previous, List<TypeInfo> childrenTypes);
  protected abstract void setStructCol(StructType structObj, int i, Object value);
  protected abstract Object getStructCol(StructType structObj, int i);
  protected abstract UnionType createUnionObject(List<TypeInfo> childrenTypes, Object previous);
  protected abstract void setUnion(UnionType unionObj, byte tag, Object object);
  protected abstract Object getUnionField(UnionType unionObj);

  @Override
  public NullWritable createKey() {
    return key;
  }

  @Override
  public Object createValue() {
    return createStructObject(null, schema);
  }

  @Override
  public long getPos() throws IOException {
    return -1;
  }

  @Override
  public float getProgress() throws IOException {
    return 0;
  }

  @Override
  public boolean next(NullWritable key, Object previous) throws IOException {
    if (!ensureBatch()) {
      return false;
    }
    @SuppressWarnings("unchecked")
    StructType value = (StructType) previous;
    for (int i = 0; i < schema.size(); ++i) {
      if (!included[i]) continue; // TODO: shortcut for last col below length?
      try {
        setStructCol(value, i,
            nextValue(batch.cols[i], rowInBatch, schema.get(i), getStructCol(value, i)));
      } catch (Throwable t) {
        LOG.error("Error at row " + rowInBatch + "/" + batch.size + ", column "
            + i + "/" + schema.size() + " " + batch.cols[i], t);
        throw (t instanceof IOException) ? (IOException) t : new IOException(t);
      }
    }
    ++rowInBatch;
    return true;
  }

  /**
   * If the current batch has been fully consumed, fetch a new one.
   * @return true if we have rows available.
   */
  private boolean ensureBatch() throws IOException {
    if (rowInBatch >= batch.size) {
      rowInBatch = 0;
      return vrbReader.next(key, batch) && batch.size > 0;
    }
    return true;
  }

  @Override
  public void close() throws IOException {
    vrbReader.close();
    batch.cols = null;
  }

  /* Routines for stubbing into Writables */

  public static BooleanWritable nextBoolean(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      BooleanWritable result;
      if (previous == null || previous.getClass() != BooleanWritable.class) {
        result = new BooleanWritable();
      } else {
        result = (BooleanWritable) previous;
      }
      result.set(((LongColumnVector) vector).vector[row] != 0);
      return result;
    } else {
      return null;
    }
  }

  public static ByteWritable nextByte(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      ByteWritable result;
      if (previous == null || previous.getClass() != ByteWritable.class) {
        result = new ByteWritable();
      } else {
        result = (ByteWritable) previous;
      }
      result.set((byte) ((LongColumnVector) vector).vector[row]);
      return result;
    } else {
      return null;
    }
  }

  public static ShortWritable nextShort(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      ShortWritable result;
      if (previous == null || previous.getClass() != ShortWritable.class) {
        result = new ShortWritable();
      } else {
        result = (ShortWritable) previous;
      }
      result.set((short) ((LongColumnVector) vector).vector[row]);
      return result;
    } else {
      return null;
    }
  }

  public static IntWritable nextInt(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      IntWritable result;
      if (previous == null || previous.getClass() != IntWritable.class) {
        result = new IntWritable();
      } else {
        result = (IntWritable) previous;
      }
      result.set((int) ((LongColumnVector) vector).vector[row]);
      return result;
    } else {
      return null;
    }
  }

  public static LongWritable nextLong(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      LongWritable result;
      if (previous == null || previous.getClass() != LongWritable.class) {
        result = new LongWritable();
      } else {
        result = (LongWritable) previous;
      }
      result.set(((LongColumnVector) vector).vector[row]);
      return result;
    } else {
      return null;
    }
  }

  public static FloatWritable nextFloat(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      FloatWritable result;
      if (previous == null || previous.getClass() != FloatWritable.class) {
        result = new FloatWritable();
      } else {
        result = (FloatWritable) previous;
      }
      result.set((float) ((DoubleColumnVector) vector).vector[row]);
      return result;
    } else {
      return null;
    }
  }

  public static DoubleWritable nextDouble(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      DoubleWritable result;
      if (previous == null || previous.getClass() != DoubleWritable.class) {
        result = new DoubleWritable();
      } else {
        result = (DoubleWritable) previous;
      }
      result.set(((DoubleColumnVector) vector).vector[row]);
      return result;
    } else {
      return null;
    }
  }

  public static Text nextString(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      Text result;
      if (previous == null ||
          previous.getClass() != Text.class) {
        result = new Text();
      } else {
        result = (Text) previous;
      }
      BytesColumnVector bytes = (BytesColumnVector) vector;
      result.set(bytes.vector[row], bytes.start[row], bytes.length[row]);
      return result;
    } else {
      return null;
    }
  }

  public static HiveCharWritable nextChar(
      ColumnVector vector, int row, int size, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      HiveCharWritable result;
      if (previous == null || previous.getClass() != HiveCharWritable.class) {
        result = new HiveCharWritable();
      } else {
        result = (HiveCharWritable) previous;
      }
      BytesColumnVector bytes = (BytesColumnVector) vector;
      result.set(bytes.toString(row), size);
      return result;
    } else {
      return null;
    }
  }

  public static HiveVarcharWritable nextVarchar(
      ColumnVector vector, int row, int size, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      HiveVarcharWritable result;
      if (previous == null || previous.getClass() != HiveVarcharWritable.class) {
        result = new HiveVarcharWritable();
      } else {
        result = (HiveVarcharWritable) previous;
      }
      BytesColumnVector bytes = (BytesColumnVector) vector;
      result.set(bytes.toString(row), size);
      return result;
    } else {
      return null;
    }
  }

  public static BytesWritable nextBinary(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      BytesWritable result;
      if (previous == null || previous.getClass() != BytesWritable.class) {
        result = new BytesWritable();
      } else {
        result = (BytesWritable) previous;
      }
      BytesColumnVector bytes = (BytesColumnVector) vector;
      result.set(bytes.vector[row], bytes.start[row], bytes.length[row]);
      return result;
    } else {
      return null;
    }
  }

  public static HiveDecimalWritable nextDecimal(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      HiveDecimalWritable result;
      if (previous == null || previous.getClass() != HiveDecimalWritable.class) {
        result = new HiveDecimalWritable();
      } else {
        result = (HiveDecimalWritable) previous;
      }
      result.set(((DecimalColumnVector) vector).vector[row]);
      return result;
    } else {
      return null;
    }
  }

  public static DateWritable nextDate(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      DateWritable result;
      if (previous == null || previous.getClass() != DateWritable.class) {
        result = new DateWritable();
      } else {
        result = (DateWritable) previous;
      }
      int date = (int) ((LongColumnVector) vector).vector[row];
      result.set(date);
      return result;
    } else {
      return null;
    }
  }

  public static TimestampWritable nextTimestamp(ColumnVector vector, int row, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      TimestampWritable result;
      if (previous == null || previous.getClass() != TimestampWritable.class) {
        result = new TimestampWritable();
      } else {
        result = (TimestampWritable) previous;
      }
      TimestampColumnVector tcv = (TimestampColumnVector) vector;
      result.setInternal(tcv.time[row], tcv.nanos[row]);
      return result;
    } else {
      return null;
    }
  }

  public StructType nextStruct(
      ColumnVector vector, int row, StructTypeInfo schema, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      List<TypeInfo> childrenTypes = schema.getAllStructFieldTypeInfos();
      StructType result = createStructObject(previous, childrenTypes);
      StructColumnVector struct =
          (StructColumnVector) vector;
      for (int f = 0; f < childrenTypes.size(); ++f) {
        setStructCol(result, f,
            nextValue(struct.fields[f], row, childrenTypes.get(f), getStructCol(result, f)));
      }
      return result;
    } else {
      return null;
    }
  }

  private UnionType nextUnion(
      ColumnVector vector, int row, UnionTypeInfo schema, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      List<TypeInfo> childrenTypes = schema.getAllUnionObjectTypeInfos();
      UnionType result = createUnionObject(childrenTypes, previous);
      UnionColumnVector union = (UnionColumnVector) vector;
      byte tag = (byte) union.tags[row];
      setUnion(result, tag,
          nextValue(union.fields[tag], row, childrenTypes.get(tag), getUnionField(result)));
      return result;
    } else {
      return null;
    }
  }

  private ArrayList<Object> nextList(
      ColumnVector vector, int row, ListTypeInfo schema, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      ArrayList<Object> result;
      if (previous == null || previous.getClass() != ArrayList.class) {
        result = new ArrayList<>();
      } else {
        result = (ArrayList<Object>) previous;
      }
      ListColumnVector list = (ListColumnVector) vector;
      int length = (int) list.lengths[row];
      int offset = (int) list.offsets[row];
      result.ensureCapacity(length);
      int oldLength = result.size();
      int idx = 0;
      TypeInfo childType = schema.getListElementTypeInfo();
      // Reuse as many of the previous row's element objects as possible.
      while (idx < length && idx < oldLength) {
        result.set(idx, nextValue(list.child, offset + idx, childType, result.get(idx)));
        idx += 1;
      }
      if (length < oldLength) {
        // The previous list was longer; trim the leftover elements.
        for (int i = oldLength - 1; i >= length; --i) {
          result.remove(i);
        }
      } else if (oldLength < length) {
        while (idx < length) {
          result.add(nextValue(list.child, offset + idx, childType, null));
          idx += 1;
        }
      }
      return result;
    } else {
      return null;
    }
  }

  private HashMap<Object, Object> nextMap(
      ColumnVector vector, int row, MapTypeInfo schema, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      MapColumnVector map = (MapColumnVector) vector;
      int length = (int) map.lengths[row];
      int offset = (int) map.offsets[row];
      TypeInfo keyType = schema.getMapKeyTypeInfo();
      TypeInfo valueType = schema.getMapValueTypeInfo();
      HashMap<Object, Object> result;
      if (previous == null || previous.getClass() != HashMap.class) {
        result = new HashMap<Object, Object>(length);
      } else {
        result = (HashMap<Object, Object>) previous;
        // I couldn't think of a good way to reuse the keys and value objects
        // without even more allocations, so take the easy and safe approach.
        result.clear();
      }
      for (int e = 0; e < length; ++e) {
        result.put(nextValue(map.keys, e + offset, keyType, null),
            nextValue(map.values, e + offset, valueType, null));
      }
      return result;
    } else {
      return null;
    }
  }

  /** Dispatches to the type-specific reader based on the TypeInfo category. */
  private Object nextValue(ColumnVector vector, int row, TypeInfo schema, Object previous) {
    switch (schema.getCategory()) {
      case STRUCT:
        return nextStruct(vector, row, (StructTypeInfo) schema, previous);
      case UNION:
        return nextUnion(vector, row, (UnionTypeInfo) schema, previous);
      case LIST:
        return nextList(vector, row, (ListTypeInfo) schema, previous);
      case MAP:
        return nextMap(vector, row, (MapTypeInfo) schema, previous);
      case PRIMITIVE: {
        PrimitiveTypeInfo pschema = (PrimitiveTypeInfo) schema;
        switch (pschema.getPrimitiveCategory()) {
          case BOOLEAN:
            return nextBoolean(vector, row, previous);
          case BYTE:
            return nextByte(vector, row, previous);
          case SHORT:
            return nextShort(vector, row, previous);
          case INT:
            return nextInt(vector, row, previous);
          case LONG:
            return nextLong(vector, row, previous);
          case FLOAT:
            return nextFloat(vector, row, previous);
          case DOUBLE:
            return nextDouble(vector, row, previous);
          case STRING:
            return nextString(vector, row, previous);
          case CHAR:
            return nextChar(vector, row, ((CharTypeInfo) pschema).getLength(), previous);
          case VARCHAR:
            return nextVarchar(vector, row, ((VarcharTypeInfo) pschema).getLength(), previous);
          case BINARY:
            return nextBinary(vector, row, previous);
          case DECIMAL:
            return nextDecimal(vector, row, previous);
          case DATE:
            return nextDate(vector, row, previous);
          case TIMESTAMP:
            return nextTimestamp(vector, row, previous);
          default:
            throw new IllegalArgumentException("Unknown primitive type " + pschema);
        }
      }
      default:
        throw new IllegalArgumentException("Unknown type " + schema);
    }
  }
}