ColumnarStruct.java example

Explorer
howl-master
- howl-howl
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.serde2.columnar;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef;
import org.apache.hadoop.hive.serde2.lazy.LazyFactory;
import org.apache.hadoop.hive.serde2.lazy.LazyObject;
import org.apache.hadoop.hive.serde2.lazy.LazyUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.Text;

/**
 * ColumnarStruct is different from LazyStruct in that ColumnarStruct's field
 * Object get parsed at its initialize time when call
 * {@link #init(BytesRefArrayWritable cols)}, while LazyStruct parse fields in a
 * lazy way.
 *
 */
public class ColumnarStruct {

  private static final Log LOG = LogFactory.getLog(ColumnarStruct.class);

  int[] prjColIDs = null; // list of projected column IDs

  Text nullSequence;
  int lengthNullSequence;

  /**
   * Construct a ColumnarStruct object with the TypeInfo. It creates the first
   * level object at the first place
   *
   * @param oi
   *          the ObjectInspector representing the type of this LazyStruct.
   */
  public ColumnarStruct(ObjectInspector oi) {
    this(oi, null, null);
  }

  /**
   * Construct a ColumnarStruct object with the TypeInfo. It creates the first
   * level object at the first place
   *
   * @param oi
   *          the ObjectInspector representing the type of this LazyStruct.
   * @param notSkippedColumnIDs
   *          the column ids that should not be skipped
   */
  public ColumnarStruct(ObjectInspector oi,
      ArrayList<Integer> notSkippedColumnIDs, Text nullSequence) {
    List<? extends StructField> fieldRefs = ((StructObjectInspector) oi)
        .getAllStructFieldRefs();
    int num = fieldRefs.size();

    fieldInfoList = new FieldInfo[num];

    if (nullSequence != null) {
      this.nullSequence = nullSequence;
      this.lengthNullSequence = nullSequence.getLength();
    }

    // if no columns is set to be skipped, add all columns in
    // 'notSkippedColumnIDs'
    if (notSkippedColumnIDs == null || notSkippedColumnIDs.size() == 0) {
      for (int i = 0; i < num; i++) {
        notSkippedColumnIDs.add(i);
      }
    }

    for (int i = 0; i < num; i++) {
      fieldInfoList[i] = new FieldInfo(
          LazyFactory.createLazyObject(fieldRefs.get(i)
          .getFieldObjectInspector()),
          !notSkippedColumnIDs.contains(i));
    }

    // maintain a list of non-NULL column IDs
    int min = notSkippedColumnIDs.size() > num ? num : notSkippedColumnIDs
        .size();
    prjColIDs = new int[min];
    for (int i = 0, index = 0; i < notSkippedColumnIDs.size(); ++i) {
      int readCol = notSkippedColumnIDs.get(i).intValue();
      if (readCol < num) {
        prjColIDs[index] = readCol;
        index++;
      }
    }
  }

  /**
   * Get one field out of the struct.
   *
   * If the field is a primitive field, return the actual object. Otherwise
   * return the LazyObject. This is because PrimitiveObjectInspector does not
   * have control over the object used by the user - the user simply directly
   * use the Object instead of going through Object
   * PrimitiveObjectInspector.get(Object).
   *
   * NOTE: separator and nullSequence has to be the same each time this method
   * is called. These two parameters are used only once to parse each record.
   *
   * @param fieldID
   *          The field ID
   * @param nullSequence
   *          The sequence for null value
   * @return The field as a LazyObject
   */
  public Object getField(int fieldID) {
    return fieldInfoList[fieldID].uncheckedGetField();
  }

  class FieldInfo {
    LazyObject field;
    /*
     * use an array instead of only one object in case in future hive does not do
     * the byte copy.
     */
    ByteArrayRef cachedByteArrayRef;
    BytesRefWritable rawBytesField;
    boolean inited;
    boolean fieldSkipped;

    public FieldInfo(LazyObject lazyObject, boolean fieldSkipped) {
      field = lazyObject;
      cachedByteArrayRef = new ByteArrayRef();
      if (fieldSkipped) {
        this.fieldSkipped = true;
        inited = true;
      } else {
        inited = false;
      }
    }

    /*
     * ============================ [PERF] ===================================
     * This function is called for every row. Setting up the selected/projected
     * columns at the first call, and don't do that for the following calls.
     * Ideally this should be done in the constructor where we don't need to
     * branch in the function for each row.
     * =========================================================================
     */
    public void init(BytesRefWritable col) {
        if (col != null) {
          rawBytesField= col;
          inited = false;
        } else {
          // select columns that actually do not exist in the file.
          fieldSkipped = true;
        }
    }

    /**
     * Get the field out of the row without checking parsed. This is called by
     * both getField and getFieldsAsList.
     *
     * @param fieldID
     *          The id of the field starting from 0.
     * @param nullSequence
     *          The sequence representing NULL value.
     * @return The value of the field
     */
    protected Object uncheckedGetField() {
      if (fieldSkipped) {
        return null;
      }
      if (!inited) {
        try {
          cachedByteArrayRef.setData(rawBytesField.getData());
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
        field.init(cachedByteArrayRef, rawBytesField
            .getStart(), rawBytesField.getLength());
        inited = true;
      }


      int fieldLen = rawBytesField.length;
      if (fieldLen == lengthNullSequence) {
        byte[] data = cachedByteArrayRef.getData();

        if (LazyUtils.compare(data, rawBytesField.getStart(), fieldLen,
            nullSequence.getBytes(), 0, lengthNullSequence) == 0) {
          return null;
        }
      }

      return field.getObject();

    }
  }

  FieldInfo[] fieldInfoList = null;


  /*
   * ============================ [PERF] ===================================
   * This function is called for every row. Setting up the selected/projected
   * columns at the first call, and don't do that for the following calls.
   * Ideally this should be done in the constructor where we don't need to
   * branch in the function for each row.
   * =========================================================================
   */
  public void init(BytesRefArrayWritable cols) {
    for (int i = 0; i < prjColIDs.length; ++i) {
      int fieldIndex = prjColIDs[i];
      if (fieldIndex < cols.size()) {
        fieldInfoList[fieldIndex].init(cols.unCheckedGet(fieldIndex));
      } else {
        // select columns that actually do not exist in the file.
        fieldInfoList[fieldIndex].init(null);
      }
    }
  }

  ArrayList<Object> cachedList;

  /**
   * Get the values of the fields as an ArrayList.
   *
   * @param nullSequence
   *          The sequence for the NULL value
   * @return The values of the fields as an ArrayList.
   */
  public ArrayList<Object> getFieldsAsList() {
    if (cachedList == null) {
      cachedList = new ArrayList<Object>();
    } else {
      cachedList.clear();
    }
    for (int i = 0; i < fieldInfoList.length; i++) {
      cachedList.add(fieldInfoList[i].uncheckedGetField());
    }
    return cachedList;
  }
}