/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.physical.impl.partitionsender;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicIntegerArray;

import org.apache.drill.common.expression.ErrorCollector;
import org.apache.drill.common.expression.ErrorCollectorImpl;
import org.apache.drill.common.expression.LogicalExpression;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.exception.ClassTransformationException;
import org.apache.drill.exec.exception.OutOfMemoryException;
import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.expr.ClassGenerator;
import org.apache.drill.exec.expr.CodeGenerator;
import org.apache.drill.exec.expr.ExpressionTreeMaterializer;
import org.apache.drill.exec.ops.AccountingDataTunnel;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.ops.MetricDef;
import org.apache.drill.exec.ops.OperatorStats;
import org.apache.drill.exec.physical.MinorFragmentEndpoint;
import org.apache.drill.exec.physical.config.HashPartitionSender;
import org.apache.drill.exec.physical.impl.BaseRootExec;
import org.apache.drill.exec.planner.physical.PlannerSettings;
import org.apache.drill.exec.proto.ExecProtos.FragmentHandle;
import org.apache.drill.exec.record.BatchSchema;
import org.apache.drill.exec.record.BatchSchema.SelectionVectorMode;
import org.apache.drill.exec.record.FragmentWritableBatch;
import org.apache.drill.exec.record.RecordBatch;
import org.apache.drill.exec.record.RecordBatch.IterOutcome;
import org.apache.drill.exec.record.VectorWrapper;
import org.apache.drill.exec.server.options.OptionManager;
import org.apache.drill.exec.vector.CopyUtil;

import com.carrotsearch.hppc.IntArrayList;
import com.google.common.annotations.VisibleForTesting;
import com.sun.codemodel.JExpr;
import com.sun.codemodel.JExpression;
import com.sun.codemodel.JType;

public class PartitionSenderRootExec extends BaseRootExec {
  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(PartitionSenderRootExec.class);

  private RecordBatch incoming;
  private HashPartitionSender operator;
  private PartitionerDecorator partitioner;
  private FragmentContext context;
  private boolean ok = true;
  private final int outGoingBatchCount;
  private final HashPartitionSender popConfig;
  private final double cost;
  private final AtomicIntegerArray remainingReceivers;
  private final AtomicInteger remainingReceiverCount;
  private volatile boolean done = false;
  private boolean first = true;
  long minReceiverRecordCount = Long.MAX_VALUE;
  long maxReceiverRecordCount = Long.MIN_VALUE;
  protected final int numberPartitions;
  protected final int actualPartitions;

  private IntArrayList terminations = new IntArrayList();
  public enum Metric implements MetricDef {
    BATCHES_SENT,
    RECORDS_SENT,
    MIN_RECORDS,
    MAX_RECORDS,
    N_RECEIVERS,
    BYTES_SENT,
    SENDING_THREADS_COUNT,
    COST;

    @Override
    public int metricId() {
      return ordinal();
    }
  }

  public PartitionSenderRootExec(FragmentContext context,
                                 RecordBatch incoming,
                                 HashPartitionSender operator) throws OutOfMemoryException {
    super(context, context.newOperatorContext(operator, null), operator);
    this.incoming = incoming;
    this.operator = operator;
    this.context = context;
    outGoingBatchCount = operator.getDestinations().size();
    popConfig = operator;
    remainingReceivers = new AtomicIntegerArray(outGoingBatchCount);
    remainingReceiverCount = new AtomicInteger(outGoingBatchCount);
    stats.setLongStat(Metric.N_RECEIVERS, outGoingBatchCount);

    // Algorithm to figure out the number of threads used to parallelize the output:
    //   numberOfRows / sliceTarget / numReceivers / threadFactor
    this.cost = operator.getChild().getCost();
    final OptionManager optMgr = context.getOptions();
    long sliceTarget = optMgr.getOption(ExecConstants.SLICE_TARGET).num_val;
    int threadFactor = optMgr.getOption(PlannerSettings.PARTITION_SENDER_THREADS_FACTOR.getOptionName()).num_val.intValue();
    int tmpParts = 1;
    if (sliceTarget != 0 && outGoingBatchCount != 0) {
      tmpParts = (int) Math.round((((cost / (sliceTarget * 1.0)) / (outGoingBatchCount * 1.0)) / (threadFactor * 1.0)));
      if (tmpParts < 1) {
        tmpParts = 1;
      }
    }
    final int imposedThreads = optMgr.getOption(PlannerSettings.PARTITION_SENDER_SET_THREADS.getOptionName()).num_val.intValue();
    if (imposedThreads > 0) {
      this.numberPartitions = imposedThreads;
    } else {
      this.numberPartitions = Math.min(tmpParts,
          optMgr.getOption(PlannerSettings.PARTITION_SENDER_MAX_THREADS.getOptionName()).num_val.intValue());
    }
    logger.info("Preliminary number of sending threads is: {}", numberPartitions);
    this.actualPartitions = outGoingBatchCount > numberPartitions ? numberPartitions : outGoingBatchCount;
    this.stats.setLongStat(Metric.SENDING_THREADS_COUNT, actualPartitions);
    this.stats.setDoubleStat(Metric.COST, this.cost);
  }
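  /*
   * Illustrative sizing example (hypothetical numbers, not defaults of this code base): with a
   * child cost of 10,000,000 rows, a slice target of 100,000, 10 receivers and a thread factor
   * of 2, tmpParts = round(10,000,000 / 100,000 / 10 / 2) = 5. If the max-threads option allows
   * it and no explicit thread count is imposed, numberPartitions = 5 and actualPartitions =
   * min(10, 5) = 5, so createPartitioner() below gives each sending thread divisor = 2 outgoing
   * batches (longTail = 0).
   */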
  @Override
  public boolean innerNext() {
    if (!ok) {
      return false;
    }

    IterOutcome out;
    if (!done) {
      out = next(incoming);
    } else {
      incoming.kill(true);
      out = IterOutcome.NONE;
    }

    logger.debug("Partitioner.next(): got next record batch with status {}", out);
    if (first && out == IterOutcome.OK) {
      out = IterOutcome.OK_NEW_SCHEMA;
    }

    switch (out) {
    case NONE:
      try {
        // send any pending batches
        if (partitioner != null) {
          partitioner.flushOutgoingBatches(true, false);
        } else {
          sendEmptyBatch(true);
        }
      } catch (IOException e) {
        incoming.kill(false);
        logger.error("Error while creating partitioning sender or flushing outgoing batches", e);
        context.fail(e);
      }
      return false;

    case OUT_OF_MEMORY:
      throw new OutOfMemoryException();

    case STOP:
      if (partitioner != null) {
        partitioner.clear();
      }
      return false;

    case OK_NEW_SCHEMA:
      try {
        // send all existing batches
        if (partitioner != null) {
          partitioner.flushOutgoingBatches(false, true);
          partitioner.clear();
        }
        createPartitioner();

        if (first) {
          // Send an empty batch for fast schema
          first = false;
          sendEmptyBatch(false);
        }
      } catch (IOException e) {
        incoming.kill(false);
        logger.error("Error while flushing outgoing batches", e);
        context.fail(e);
        return false;
      } catch (SchemaChangeException e) {
        incoming.kill(false);
        logger.error("Error while setting up partitioner", e);
        context.fail(e);
        return false;
      }
      // fall through: the batch that carried the new schema still has to be partitioned
    case OK:
      try {
        partitioner.partitionBatch(incoming);
      } catch (IOException e) {
        context.fail(e);
        incoming.kill(false);
        return false;
      }
      for (VectorWrapper<?> v : incoming) {
        v.clear();
      }
      return true;

    case NOT_YET:
    default:
      throw new IllegalStateException();
    }
  }

  @VisibleForTesting
  protected void createPartitioner() throws SchemaChangeException {
    // Divide the outgoing batches evenly among the sending threads; the first
    // longTail threads receive one extra batch when the division is not exact.
    final int divisor = Math.max(1, outGoingBatchCount / actualPartitions);
    final int longTail = outGoingBatchCount % actualPartitions;
    final List<Partitioner> subPartitioners = createClassInstances(actualPartitions);

    int startIndex = 0;
    int endIndex = 0;
    boolean success = false;
    try {
      for (int i = 0; i < actualPartitions; i++) {
        startIndex = endIndex;
        endIndex = (i < actualPartitions - 1) ? startIndex + divisor : outGoingBatchCount;
        if (i < longTail) {
          endIndex++;
        }
        final OperatorStats partitionStats = new OperatorStats(stats, true);
        subPartitioners.get(i).setup(context, incoming, popConfig, partitionStats, oContext, startIndex, endIndex);
      }

      synchronized (this) {
        partitioner = new PartitionerDecorator(subPartitioners, stats, context);
        for (int index = 0; index < terminations.size(); index++) {
          partitioner.getOutgoingBatches(terminations.buffer[index]).terminate();
        }
        terminations.clear();
      }
      success = true;
    } finally {
      if (!success) {
        for (Partitioner p : subPartitioners) {
          p.clear();
        }
      }
    }
  }

  private List<Partitioner> createClassInstances(int actualPartitions) throws SchemaChangeException {
    // set up partitioning function
    final LogicalExpression expr = operator.getExpr();
    final ErrorCollector collector = new ErrorCollectorImpl();
    final ClassGenerator<Partitioner> cg;

    cg = CodeGenerator.getRoot(Partitioner.TEMPLATE_DEFINITION, context.getFunctionRegistry(), context.getOptions());
    cg.getCodeGenerator().plainJavaCapable(true);
    // Uncomment this line to debug the generated code.
    // cg.getCodeGenerator().saveCodeForDebugging(true);
    ClassGenerator<Partitioner> cgInner = cg.getInnerGenerator("OutgoingRecordBatch");

    final LogicalExpression materializedExpr = ExpressionTreeMaterializer.materialize(expr, incoming, collector, context.getFunctionRegistry());
    if (collector.hasErrors()) {
      throw new SchemaChangeException(String.format(
          "Failure while trying to materialize incoming schema. Errors:\n %s.", collector.toErrorString()));
    }

    // generate code to copy from an incoming value vector to the destination partition's outgoing value vector
    JExpression bucket = JExpr.direct("bucket");

    // generate evaluate expression to determine the hash
    ClassGenerator.HoldingContainer exprHolder = cg.addExpr(materializedExpr);
    cg.getEvalBlock().decl(JType.parse(cg.getModel(), "int"), "bucket", exprHolder.getValue().mod(JExpr.lit(outGoingBatchCount)));
    cg.getEvalBlock()._return(cg.getModel().ref(Math.class).staticInvoke("abs").arg(bucket));

    CopyUtil.generateCopies(cgInner, incoming, incoming.getSchema().getSelectionVectorMode() == SelectionVectorMode.FOUR_BYTE);

    try {
      // compile and setup generated code
      List<Partitioner> subPartitioners = context.getImplementationClass(cg, actualPartitions);
      return subPartitioners;
    } catch (ClassTransformationException | IOException e) {
      throw new SchemaChangeException("Failure while attempting to load generated class", e);
    }
  }

  /**
   * Find the min and max record counts seen across the outgoing batches and put them in stats.
   */
  private void updateAggregateStats() {
    for (Partitioner part : partitioner.getPartitioners()) {
      for (PartitionOutgoingBatch o : part.getOutgoingBatches()) {
        long totalRecords = o.getTotalRecords();
        minReceiverRecordCount = Math.min(minReceiverRecordCount, totalRecords);
        maxReceiverRecordCount = Math.max(maxReceiverRecordCount, totalRecords);
      }
    }
    stats.setLongStat(Metric.MIN_RECORDS, minReceiverRecordCount);
    stats.setLongStat(Metric.MAX_RECORDS, maxReceiverRecordCount);
  }

  @Override
  public void receivingFragmentFinished(FragmentHandle handle) {
    final int id = handle.getMinorFragmentId();
    if (remainingReceivers.compareAndSet(id, 0, 1)) {
      synchronized (this) {
        if (partitioner == null) {
          terminations.add(id);
        } else {
          partitioner.getOutgoingBatches(id).terminate();
        }
      }

      int remaining = remainingReceiverCount.decrementAndGet();
      if (remaining == 0) {
        done = true;
      }
    }
  }

  @Override
  public void close() throws Exception {
    logger.debug("Partition sender stopping.");
    super.close();
    ok = false;
    if (partitioner != null) {
      updateAggregateStats();
      partitioner.clear();
    }
  }

  public void sendEmptyBatch(boolean isLast) {
    BatchSchema schema = incoming.getSchema();
    if (schema == null) {
      // If the incoming batch has no schema (possible when there are no input records),
      // create an empty schema to avoid NPE.
      schema = BatchSchema.newBuilder().build();
    }

    FragmentHandle handle = context.getHandle();
    for (MinorFragmentEndpoint destination : popConfig.getDestinations()) {
      AccountingDataTunnel tunnel = context.getDataTunnel(destination.getEndpoint());
      FragmentWritableBatch writableBatch = FragmentWritableBatch.getEmptyBatchWithSchema(
          isLast,
          handle.getQueryId(),
          handle.getMajorFragmentId(),
          handle.getMinorFragmentId(),
          operator.getOppositeMajorFragmentId(),
          destination.getId(),
          schema);
      stats.startWait();
      try {
        tunnel.sendRecordBatch(writableBatch);
      } finally {
        stats.stopWait();
      }
    }
    stats.addLongStat(Metric.BATCHES_SENT, 1);
  }

  @VisibleForTesting
  protected PartitionerDecorator getPartitioner() {
    return partitioner;
  }
}