/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec;

import java.io.Serializable;
import java.util.Iterator;
import java.util.List;
import java.util.Stack;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.PTFPartition.PTFPartitionIterator;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.LeadLagInfo;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.PTFDesc;
import org.apache.hadoop.hive.ql.plan.PTFDeserializer;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.plan.ptf.PTFExpressionDef;
import org.apache.hadoop.hive.ql.plan.ptf.PTFInputDef;
import org.apache.hadoop.hive.ql.plan.ptf.PartitionDef;
import org.apache.hadoop.hive.ql.plan.ptf.PartitionedTableFunctionDef;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag;
import org.apache.hadoop.hive.ql.udf.ptf.TableFunctionEvaluator;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;

public class PTFOperator extends Operator<PTFDesc> implements Serializable {

  private static final long serialVersionUID = 1L;

  boolean isMapOperator;

  transient KeyWrapperFactory keyWrapperFactory;
  protected transient KeyWrapper currentKeys;
  protected transient KeyWrapper newKeys;
  /*
   * For map-side invocation of PTFs, we cannot rely on the currentKeys null check
   * to decide on invoking startPartition in streaming mode. Hence this extra flag.
   */
  transient boolean firstMapRow;
  transient Configuration hiveConf;
  transient PTFInvocation ptfInvocation;

  /** Kryo ctor. */
  protected PTFOperator() {
    super();
  }

  public PTFOperator(CompilationOpContext ctx) {
    super(ctx);
  }
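  /*
   * For orientation (an illustrative note, not part of the original source): a
   * PTFOperator typically appears in plans for windowing or PTF queries such as
   *
   *   SELECT p_mfgr, p_name,
   *          rank() OVER (PARTITION BY p_mfgr ORDER BY p_name)
   *   FROM part;
   *
   * The table and column names above are hypothetical. On the reduce side the
   * operator receives rows clustered by the PARTITION BY keys and drives the
   * PTF chain once per key group.
   */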
  /*
   * 1. Find out if the operator is invoked at Map-Side or Reduce-side
   * 2. Get the deserialized QueryDef
   * 3. Reconstruct the transient variables in QueryDef
   * 4. Create input partition to store rows coming from previous operator
   */
  @Override
  protected void initializeOp(Configuration jobConf) throws HiveException {
    super.initializeOp(jobConf);
    hiveConf = jobConf;
    isMapOperator = conf.isMapSide();
    currentKeys = null;

    reconstructQueryDef(hiveConf);

    if (isMapOperator) {
      PartitionedTableFunctionDef tDef = conf.getStartOfChain();
      outputObjInspector = tDef.getRawInputShape().getOI();
    } else {
      outputObjInspector = conf.getFuncDef().getOutputShape().getOI();
    }

    setupKeysWrapper(inputObjInspectors[0]);

    ptfInvocation = setupChain();
    ptfInvocation.initializeStreaming(jobConf, isMapOperator);
    firstMapRow = true;
  }

  @Override
  protected void closeOp(boolean abort) throws HiveException {
    super.closeOp(abort);
    ptfInvocation.finishPartition();
    ptfInvocation.close();
  }

  @Override
  public void process(Object row, int tag) throws HiveException {
    if (!isMapOperator) {
      /*
       * Check if the current row belongs to the current accumulated Partition:
       * - If not:
       *   - process the current Partition
       *   - reset the input Partition
       * - Set currentKeys to newKeys if it is null or has changed.
       */
      newKeys.getNewKey(row, inputObjInspectors[0]);
      boolean keysAreEqual = (currentKeys != null && newKeys != null) ?
          newKeys.equals(currentKeys) : false;

      if (currentKeys != null && !keysAreEqual) {
        ptfInvocation.finishPartition();
      }

      if (currentKeys == null || !keysAreEqual) {
        ptfInvocation.startPartition();
        if (currentKeys == null) {
          currentKeys = newKeys.copyKey();
        } else {
          currentKeys.copyKey(newKeys);
        }
      }
    } else if (firstMapRow) {
      ptfInvocation.startPartition();
      firstMapRow = false;
    }

    ptfInvocation.processRow(row);
  }
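  /*
   * Illustrative sketch (not part of the original source) of how process() detects a
   * partition boundary on the reduce side. Assuming a single PARTITION BY key p_mfgr,
   * rows arrive sorted on that key:
   *
   *   row(p_mfgr=A) -> currentKeys == null         -> startPartition, copy key
   *   row(p_mfgr=A) -> newKeys.equals(currentKeys) -> just processRow
   *   row(p_mfgr=B) -> keys differ                 -> finishPartition, startPartition, copy key
   *
   * The final partition is flushed by closeOp(), which calls finishPartition() once more.
   */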
  /**
   * Initialize the visitor to use the QueryDefDeserializer. Use the order
   * defined in QueryDefWalker to visit the QueryDef.
   *
   * @param hiveConf
   * @throws HiveException
   */
  protected void reconstructQueryDef(Configuration hiveConf) throws HiveException {
    PTFDeserializer dS =
        new PTFDeserializer(conf, (StructObjectInspector) inputObjInspectors[0], hiveConf);
    dS.initializePTFChain(conf.getFuncDef());
  }

  protected void setupKeysWrapper(ObjectInspector inputOI) throws HiveException {
    PartitionDef pDef = conf.getStartOfChain().getPartition();
    List<PTFExpressionDef> exprs = pDef.getExpressions();
    int numExprs = exprs.size();
    ExprNodeEvaluator[] keyFields = new ExprNodeEvaluator[numExprs];
    ObjectInspector[] keyOIs = new ObjectInspector[numExprs];
    ObjectInspector[] currentKeyOIs = new ObjectInspector[numExprs];

    for (int i = 0; i < numExprs; i++) {
      PTFExpressionDef exprDef = exprs.get(i);
      /*
       * Why can't we just use the ExprNodeEvaluator on the column?
       * Because on the reduce side it is initialized based on the rowOI of the HiveTable
       * and not the OI of the parent of this Operator on the reduce side.
       */
      keyFields[i] = ExprNodeEvaluatorFactory.get(exprDef.getExprNode());
      keyOIs[i] = keyFields[i].initialize(inputOI);
      currentKeyOIs[i] =
          ObjectInspectorUtils.getStandardObjectInspector(keyOIs[i],
              ObjectInspectorCopyOption.WRITABLE);
    }

    keyWrapperFactory = new KeyWrapperFactory(keyFields, keyOIs, currentKeyOIs);
    newKeys = keyWrapperFactory.getKeyWrapper();
  }

  /**
   * @return the name of the operator
   */
  @Override
  public String getName() {
    return PTFOperator.getOperatorName();
  }

  static public String getOperatorName() {
    return "PTF";
  }

  @Override
  public OperatorType getType() {
    return OperatorType.PTF;
  }

  private PTFInvocation setupChain() {
    Stack<PartitionedTableFunctionDef> fnDefs = new Stack<PartitionedTableFunctionDef>();
    PTFInputDef iDef = conf.getFuncDef();

    while (iDef instanceof PartitionedTableFunctionDef) {
      fnDefs.push((PartitionedTableFunctionDef) iDef);
      iDef = ((PartitionedTableFunctionDef) iDef).getInput();
    }

    PTFInvocation curr = null, first = null;

    while (!fnDefs.isEmpty()) {
      PartitionedTableFunctionDef currFn = fnDefs.pop();
      curr = new PTFInvocation(curr, currFn.getTFunction());
      if (first == null) {
        first = curr;
      }
    }
    return first;
  }

  public static void connectLeadLagFunctionsToPartition(LeadLagInfo leadLagInfo,
      PTFPartitionIterator<Object> pItr) throws HiveException {
    if (leadLagInfo == null || leadLagInfo.getLeadLagExprs() == null) {
      return;
    }
    for (ExprNodeGenericFuncDesc llFnDesc : leadLagInfo.getLeadLagExprs()) {
      GenericUDFLeadLag llFn = (GenericUDFLeadLag) llFnDesc.getGenericUDF();
      llFn.setpItr(pItr);
    }
  }

  /*
   * Responsible for the flow of rows through the PTF Chain.
   * An Invocation wraps a TableFunction.
   * The PTFOp hands the chain each row through the processRow call.
   * It also notifies the chain of when a Partition starts/finishes.
   *
   * There are several combinations depending on
   * whether the TableFunction and its successor support Streaming or Batch mode.
   *
   * Combination 1: Streaming + Streaming
   * - Start Partition: invoke startPartition on tabFn.
   * - Process Row: invoke processRow on tabFn.
   *   Any output rows are handed to the next tabFn in the chain, or forwarded to the next Operator.
   * - Finish Partition: invoke finishPartition on tabFn.
   *   Any output rows are handed to the next tabFn in the chain, or forwarded to the next Operator.
   *
   * Combination 2: Streaming + Batch
   * Same as Combination 1.
   *
   * Combination 3: Batch + Batch
   * - Start Partition: create or reset the Input Partition for the tabFn.
   *   Caveat: if prev is also batch and is not providing an Output Iterator,
   *   then we can just use its Output Partition.
   * - Process Row: collect the row in the Input Partition.
   * - Finish Partition: invoke evaluate on tabFn on the Input Partition.
   *   If the function gives an Output Partition: set it as the next Invocation's Input Partition.
   *   If the function gives an Output Iterator: iterate and call processRow on the next Invocation.
   *   For the last Invocation in the chain: forward rows to the next Operator.
   *
   * Combination 4: Batch + Streaming
   * Similar to Combination 3, except the Finish Partition behavior differs slightly:
   * - Finish Partition: invoke evaluate on tabFn on the Input Partition;
   *   iterate the output rows, handing each to the next tabFn in the chain,
   *   or forwarding to the next Operator.
   */
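  /*
   * Illustrative example (not part of the original source): a chain of two PTFs, e.g.
   *
   *   SELECT * FROM noop(ON noopwithmap(ON part PARTITION BY p_mfgr ORDER BY p_name));
   *
   * (table and column names hypothetical) is represented as nested
   * PartitionedTableFunctionDefs. setupChain() unwinds the nesting with a stack and
   * links one PTFInvocation per function, innermost first, so rows flow
   * noopwithmap -> noop -> next Operator.
   */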
  class PTFInvocation {

    PTFInvocation prev;
    PTFInvocation next;
    TableFunctionEvaluator tabFn;
    PTFPartition inputPart;
    PTFPartition outputPart;
    Iterator<Object> outputPartRowsItr;

    public PTFInvocation(PTFInvocation prev, TableFunctionEvaluator tabFn) {
      this.prev = prev;
      this.tabFn = tabFn;
      if (prev != null) {
        prev.next = this;
      }
    }

    boolean isOutputIterator() {
      return tabFn.canAcceptInputAsStream() || tabFn.canIterateOutput();
    }

    boolean isStreaming() {
      return tabFn.canAcceptInputAsStream();
    }

    void initializeStreaming(Configuration cfg, boolean isMapSide) throws HiveException {
      PartitionedTableFunctionDef tabDef = tabFn.getTableDef();
      PTFInputDef inputDef = tabDef.getInput();
      ObjectInspector inputOI = conf.getStartOfChain() == tabDef ?
          inputObjInspectors[0] : inputDef.getOutputShape().getOI();

      tabFn.initializeStreaming(cfg, (StructObjectInspector) inputOI, isMapSide);

      if (next != null) {
        next.initializeStreaming(cfg, isMapSide);
      }
    }

    void startPartition() throws HiveException {
      if (isStreaming()) {
        tabFn.startPartition();
      } else {
        if (prev == null || prev.isOutputIterator()) {
          if (inputPart == null) {
            createInputPartition();
          } else {
            inputPart.reset();
          }
        }
      }
      if (next != null) {
        next.startPartition();
      }
    }

    void processRow(Object row) throws HiveException {
      if (isStreaming()) {
        handleOutputRows(tabFn.processRow(row));
      } else {
        inputPart.append(row);
      }
    }

    void handleOutputRows(List<Object> outRows) throws HiveException {
      if (outRows != null) {
        for (Object orow : outRows) {
          if (next != null) {
            next.processRow(orow);
          } else {
            forward(orow, outputObjInspector);
          }
        }
      }
    }

    void finishPartition() throws HiveException {
      if (isStreaming()) {
        handleOutputRows(tabFn.finishPartition());
      } else {
        if (tabFn.canIterateOutput()) {
          outputPartRowsItr = inputPart == null ? null :
              tabFn.iterator(inputPart.iterator());
        } else {
          outputPart = inputPart == null ? null : tabFn.execute(inputPart);
          outputPartRowsItr = outputPart == null ? null : outputPart.iterator();
        }
        if (next != null) {
          if (!next.isStreaming() && !isOutputIterator()) {
            next.inputPart = outputPart;
          } else {
            if (outputPartRowsItr != null) {
              while (outputPartRowsItr.hasNext()) {
                next.processRow(outputPartRowsItr.next());
              }
            }
          }
        }
      }

      if (next != null) {
        next.finishPartition();
      } else {
        if (!isStreaming()) {
          if (outputPartRowsItr != null) {
            while (outputPartRowsItr.hasNext()) {
              forward(outputPartRowsItr.next(), outputObjInspector);
            }
          }
        }
      }
    }
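    /*
     * Illustrative trace (not part of the original source) of one partition through a
     * batch-mode Invocation whose successor is also batch and not an output iterator:
     *
     *   startPartition()  -> create or reset this.inputPart
     *   processRow(r)     -> inputPart.append(r)
     *   finishPartition() -> outputPart = tabFn.execute(inputPart);
     *                        next.inputPart = outputPart;  // handed over directly, no copy
     *                        next.finishPartition();
     *
     * When this is the last Invocation in the chain, the rows of outputPart are instead
     * forwarded to the next Operator via forward(row, outputObjInspector).
     */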
    /**
     * Create a new Partition.
     * A partition has 2 OIs: the OI for the rows being put in and the OI for the rows
     * coming out. You specify the output OI by giving the SerDe to use for serialization.
     * Typically these 2 OIs are the same, but not always. For the
     * first PTF in a chain, the OI of the incoming rows is dictated by the parent Op
     * of this PTFOp. The output OI from the Partition is typically LazyBinaryStruct, but
     * not always. In the case of Noop/NoopMap we keep the structure the same as
     * what is given to us.
     * <p>
     * The Partition we want to create here is for feeding the first table function in the chain.
     * So for map-side processing use the SerDe from the output Shape of its InputDef.
     * For reduce-side processing use the SerDe from its RawInputShape (the shape
     * after map-side processing).
     *
     * @throws HiveException
     */
    private void createInputPartition() throws HiveException {
      PartitionedTableFunctionDef tabDef = tabFn.getTableDef();
      PTFInputDef inputDef = tabDef.getInput();
      ObjectInspector inputOI = conf.getStartOfChain() == tabDef ?
          inputObjInspectors[0] : inputDef.getOutputShape().getOI();

      AbstractSerDe serde = conf.isMapSide() ?
          tabDef.getInput().getOutputShape().getSerde() :
          tabDef.getRawInputShape().getSerde();
      StructObjectInspector outputOI = conf.isMapSide() ?
          tabDef.getInput().getOutputShape().getOI() :
          tabDef.getRawInputShape().getOI();

      inputPart = PTFPartition.create(conf.getCfg(), serde,
          (StructObjectInspector) inputOI, outputOI);
    }

    void close() {
      if (inputPart != null) {
        inputPart.close();
        inputPart = null;
      }
      tabFn.close();
      if (next != null) {
        next.close();
      }
    }
  }
}