// Copyright 2014 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.devtools.build.lib.concurrent;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Throwables;
import com.google.common.util.concurrent.AtomicLongMap;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.devtools.build.lib.concurrent.ErrorClassifier.ErrorClassification;
import com.google.devtools.build.lib.util.Preconditions;

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;

/** A {@link QuiescingExecutor} implementation that wraps an {@link ExecutorService}. */
public class AbstractQueueVisitor implements QuiescingExecutor {

  /**
   * Default factory function for constructing {@link ThreadPoolExecutor}s. The {@link
   * ThreadPoolExecutor}s this creates have the same value for {@code corePoolSize} and {@code
   * maximumPoolSize} because that results in a fixed-size thread pool, and the current use cases
   * for {@link AbstractQueueVisitor} don't require any more sophisticated thread pool size
   * management.
   *
   * <p>If client use cases change, they may invoke one of the {@link
   * AbstractQueueVisitor#AbstractQueueVisitor} constructors that accepts a pre-constructed {@link
   * ThreadPoolExecutor}.
   */
  public static final Function<ExecutorParams, ThreadPoolExecutor> EXECUTOR_FACTORY =
      new Function<ExecutorParams, ThreadPoolExecutor>() {
        @Override
        public ThreadPoolExecutor apply(ExecutorParams p) {
          return new ThreadPoolExecutor(
              /*corePoolSize=*/ p.getParallelism(),
              /*maximumPoolSize=*/ p.getParallelism(),
              p.getKeepAliveTime(),
              p.getUnits(),
              p.getWorkQueue(),
              new ThreadFactoryBuilder().setNameFormat(p.getPoolName() + " %d").build());
        }
      };

  /**
   * The most severe unhandled exception thrown by a worker thread, according to {@link
   * #errorClassifier}. This exception gets propagated to the calling thread of {@link
   * #awaitQuiescence}. We use the most severe error for the sake of not masking e.g. crashes in
   * worker threads after the first critical error that can occur due to race conditions in client
   * code.
   *
   * <p>Field updates happen only in blocks that are synchronized on the {@link
   * AbstractQueueVisitor} object.
   *
   * <p>If {@link AbstractQueueVisitor} clients don't like the semantics of storing and propagating
   * the most severe error, then they should provide an {@link ErrorClassifier} that does the right
   * thing (e.g. to cause the _first_ error to be propagated, you'd want to provide an {@link
   * ErrorClassifier} that gives all errors the exact same {@link ErrorClassification}).
   *
   * <p>Note that this is not a performance-critical path.
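   *
   * <p>Illustrative sketch only (not part of this class): a classifier that gives every error the
   * same classification, yielding "propagate the first error seen" semantics. This assumes {@link
   * ErrorClassifier#classify} can be overridden directly and that a {@code NOT_CRITICAL} constant
   * exists; adapt to the actual extension point of {@link ErrorClassifier} if it differs.
   * <pre>{@code
   * ErrorClassifier firstErrorWins = new ErrorClassifier() {
   *   public ErrorClassification classify(Throwable e) {
   *     return ErrorClassification.NOT_CRITICAL;  // every error gets the same classification
   *   }
   * };
   * }</pre>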
   */
  private volatile Throwable unhandled = null;

  /**
   * An uncaught exception when submitting a job to the {@link ExecutorService} is catastrophic,
   * and usually indicates a lack of stack space on which to allocate a native thread. The {@link
   * ExecutorService} may reach an inconsistent state in such circumstances, so we avoid blocking
   * on its termination when this field is non-{@code null}.
   */
  private volatile Throwable catastrophe;

  /**
   * An object used in the manner of a {@link java.util.concurrent.locks.Condition} object, for the
   * condition {@code remainingTasks.get() == 0 || jobsMustBeStopped}.
   *
   * <p>TODO(bazel-team): Replace with an actual {@link java.util.concurrent.locks.Condition}
   * object.
   */
  private final Object zeroRemainingTasks = new Object();

  /** The number of {@link Runnable}s {@link #execute}-d that have not finished evaluation. */
  private final AtomicLong remainingTasks = new AtomicLong(0);

  /**
   * Flag used to record when all threads were killed by failed action execution. Only ever
   * transitions from {@code false} to {@code true}.
   *
   * <p>Except for {@link #mustJobsBeStopped}, may only be accessed in a block that is synchronized
   * on {@link #zeroRemainingTasks}.
   */
  private volatile boolean jobsMustBeStopped = false;

  /** Map from thread to number of jobs executing in the thread. Used for interrupt handling. */
  private final AtomicLongMap<Thread> jobs = AtomicLongMap.create();

  private final ExecutorService executorService;

  /**
   * Flag used to record when the main thread (the thread which called {@link #awaitQuiescence})
   * is interrupted.
   *
   * <p>When this is {@code true}, adding tasks to the {@link ExecutorService} will fail quietly
   * as a part of the process of shutting down the worker threads.
   */
  private volatile boolean threadInterrupted = false;

  /**
   * Latches used to signal when the visitor has been interrupted or seen an exception. Used only
   * for testing.
   */
  private final CountDownLatch interruptedLatch = new CountDownLatch(1);
  private final CountDownLatch exceptionLatch = new CountDownLatch(1);

  /** If {@code true}, don't run new actions after an uncaught exception. */
  private final boolean failFastOnException;

  /** If {@code true}, shut down the {@link ExecutorService} on completion. */
  private final boolean ownExecutorService;

  private final ErrorClassifier errorClassifier;

  private static final Logger logger = Logger.getLogger(AbstractQueueVisitor.class.getName());

  private static ExecutorService createExecutorService(
      int parallelism,
      long keepAliveTime,
      TimeUnit units,
      String poolName,
      Function<ExecutorParams, ? extends ExecutorService> executorFactory) {
    return Preconditions.checkNotNull(executorFactory)
        .apply(
            new ExecutorParams(
                parallelism,
                keepAliveTime,
                units,
                Preconditions.checkNotNull(poolName),
                new BlockingStack<Runnable>()));
  }

  /**
   * Create the {@link AbstractQueueVisitor}.
   *
   * @param parallelism a measure of parallelism for the {@link ExecutorService}, such as {@code
   *     parallelism} in {@link java.util.concurrent.ForkJoinPool}, or both {@code corePoolSize}
   *     and {@code maximumPoolSize} in {@link ThreadPoolExecutor}.
   * @param keepAliveTime the keep-alive time for the {@link ExecutorService}, if applicable.
   * @param units the time units of keepAliveTime.
   * @param failFastOnException if {@code true}, don't run new actions after an uncaught exception.
   * @param poolName sets the name of threads spawned by the {@link ExecutorService}. If {@code
   *     null}, default thread naming will be used.
   * @param executorFactory the factory for constructing the executor service.
   * @param errorClassifier an error classifier used to determine whether to log and/or stop jobs.
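   *
   * <p>A construction sketch, for illustration only; the pool size, timing values, pool name, and
   * the {@code someErrorClassifier} and {@code someRunnable} variables are hypothetical, and
   * {@link #awaitQuiescence} may throw {@link InterruptedException}:
   * <pre>{@code
   * QuiescingExecutor executor =
   *     new AbstractQueueVisitor(
   *         200,                  // parallelism
   *         3, TimeUnit.SECONDS,  // keepAliveTime and its units
   *         true,                 // failFastOnException
   *         "example-pool",       // poolName
   *         AbstractQueueVisitor.EXECUTOR_FACTORY,
   *         someErrorClassifier);
   * executor.execute(someRunnable);
   * executor.awaitQuiescence(true);  // interruptWorkers
   * }</pre>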
   */
  public AbstractQueueVisitor(
      int parallelism,
      long keepAliveTime,
      TimeUnit units,
      boolean failFastOnException,
      String poolName,
      Function<ExecutorParams, ? extends ExecutorService> executorFactory,
      ErrorClassifier errorClassifier) {
    this(
        createExecutorService(parallelism, keepAliveTime, units, poolName, executorFactory),
        true,
        failFastOnException,
        errorClassifier);
  }

  /**
   * Create the {@link AbstractQueueVisitor}.
   *
   * @param executorService the {@link ExecutorService} to use.
   * @param shutdownOnCompletion if {@code true}, pass ownership of the {@link ExecutorService} to
   *     this class. The service will be shut down after a call to {@link #awaitQuiescence}.
   *     Callers must not shut down the {@link ExecutorService} while queue visitors use it.
   * @param failFastOnException if {@code true}, don't run new actions after an uncaught exception.
   * @param errorClassifier an error classifier used to determine whether to log and/or stop jobs.
   */
  protected AbstractQueueVisitor(
      ExecutorService executorService,
      boolean shutdownOnCompletion,
      boolean failFastOnException,
      ErrorClassifier errorClassifier) {
    this.failFastOnException = failFastOnException;
    this.ownExecutorService = shutdownOnCompletion;
    this.executorService = Preconditions.checkNotNull(executorService);
    this.errorClassifier = Preconditions.checkNotNull(errorClassifier);
  }

  @Override
  public final void awaitQuiescence(boolean interruptWorkers) throws InterruptedException {
    awaitTermination(interruptWorkers);
  }

  /** Schedules a call. Called in a worker thread. */
  @Override
  public final void execute(Runnable runnable) {
    if (runConcurrently()) {
      WrappedRunnable wrappedRunnable = new WrappedRunnable(runnable);
      try {
        // It's impossible for this increment to result in remainingTasks.get() <= 0 because
        // remainingTasks is never negative. Therefore it isn't necessary to check its value for
        // the purpose of updating zeroRemainingTasks.
        long tasks = remainingTasks.incrementAndGet();
        Preconditions.checkState(
            tasks > 0,
            "Incrementing remaining tasks counter resulted in impossible non-positive number.");
        executeRunnable(wrappedRunnable);
      } catch (Throwable e) {
        if (!wrappedRunnable.ran) {
          // Note that keeping track of 'ran' is necessary to disambiguate the case where
          // execute() itself failed, vs. a caller-runs policy on pool exhaustion, where the
          // runnable threw. To be extra cautious, we decrement the task count in a finally
          // block, even though the decrement itself is unlikely to throw.
          recordError(e);
        }
      }
    } else {
      runnable.run();
    }
  }

  /**
   * Subclasses may override this to make dynamic decisions about whether to run tasks
   * asynchronously versus in-thread.
   */
  protected boolean runConcurrently() {
    return true;
  }

  /**
   * Returns an approximate count of how many threads in the queue visitor's thread pool are
   * occupied with tasks.
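   *
   * <p>The count is only approximate because worker threads add and remove entries in {@link
   * #jobs} concurrently with this read.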
   */
  protected final int activeParallelTasks() {
    return jobs.asMap().size();
  }

  protected void executeRunnable(Runnable runnable) {
    executorService.execute(runnable);
  }

  private synchronized void maybeSaveUnhandledThrowable(Throwable e, boolean markToStopJobs) {
    boolean critical = false;
    ErrorClassification errorClassification = errorClassifier.classify(e);
    switch (errorClassification) {
      case AS_CRITICAL_AS_POSSIBLE:
      case CRITICAL_AND_LOG:
        critical = true;
        logger.log(Level.WARNING, "Found critical error in queue visitor", e);
        break;
      case CRITICAL:
        critical = true;
        break;
      default:
        break;
    }
    if (unhandled == null
        || errorClassification.compareTo(errorClassifier.classify(unhandled)) > 0) {
      // Save the most severe error.
      unhandled = e;
      exceptionLatch.countDown();
    }
    if (markToStopJobs) {
      synchronized (zeroRemainingTasks) {
        if (critical && !jobsMustBeStopped) {
          jobsMustBeStopped = true;
          // This introduces a benign race, but it's the best we can do. When we have multiple
          // errors of the same severity (at least CRITICAL), we'll end up saving (above) and
          // propagating (in 'awaitQuiescence') the most severe one we see, but the set of errors
          // we see is non-deterministic and is at the mercy of how quickly the calling thread of
          // 'awaitQuiescence' can do its thing after this 'notify' call.
          zeroRemainingTasks.notify();
        }
      }
    }
  }

  private void recordError(Throwable e) {
    try {
      // If threadInterrupted is true, then RejectedExecutionExceptions are expected. There's no
      // need to remember them, but there is a need to call decrementRemainingTasks, which is
      // satisfied by the finally block below.
      if (e instanceof RejectedExecutionException && threadInterrupted) {
        return;
      }
      catastrophe = e;
      maybeSaveUnhandledThrowable(e, /*markToStopJobs=*/ false);
    } finally {
      decrementRemainingTasks();
    }
  }

  /**
   * A wrapped {@link Runnable} that:
   *
   * <ul>
   *   <li>Sets {@link #ran} to {@code true} when the {@code WrappedRunnable} is run,
   *   <li>Records the thread evaluating {@link #originalRunnable} in {@link #jobs} while it is
   *       evaluated,
   *   <li>Prevents {@link #originalRunnable} from being invoked if {@link #blockNewActions}
   *       returns {@code true},
   *   <li>Synchronously invokes {@code originalRunnable.run()},
   *   <li>Catches any {@link Throwable} thrown by {@code originalRunnable.run()}, and if it is the
   *       most severe {@link Throwable} seen by this {@link AbstractQueueVisitor}, assigns it to
   *       {@link #unhandled}, and sets {@link #jobsMustBeStopped} if necessary,
   *   <li>And, lastly, calls {@link #decrementRemainingTasks}.
   * </ul>
   */
  private final class WrappedRunnable implements Runnable {
    private final Runnable originalRunnable;
    private volatile boolean ran;

    private WrappedRunnable(Runnable originalRunnable) {
      this.originalRunnable = originalRunnable;
    }

    @Override
    public void run() {
      ran = true;
      Thread thread = null;
      boolean addedJob = false;
      try {
        thread = Thread.currentThread();
        addJob(thread);
        addedJob = true;
        if (blockNewActions()) {
          // Make any newly enqueued tasks quickly die. We check after adding to the jobs map so
          // that if another thread is racing to kill this thread and didn't make it before this
          // conditional, it will be able to find and kill this thread anyway.
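          // Even on this early return, the finally block below still removes the job and calls
          // decrementRemainingTasks, so the quiescence accounting stays correct.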
          return;
        }
        originalRunnable.run();
      } catch (Throwable e) {
        maybeSaveUnhandledThrowable(e, /*markToStopJobs=*/ true);
      } finally {
        try {
          if (thread != null && addedJob) {
            removeJob(thread);
          }
        } finally {
          decrementRemainingTasks();
        }
      }
    }
  }

  private void addJob(Thread thread) {
    jobs.incrementAndGet(thread);
  }

  private void removeJob(Thread thread) {
    if (jobs.decrementAndGet(thread) == 0) {
      jobs.remove(thread);
    }
  }

  /** Set an internal flag to show that an interrupt was detected. */
  protected final void setInterrupted() {
    threadInterrupted = true;
  }

  private void decrementRemainingTasks() {
    // This decrement statement may result in remainingTasks.get() == 0, so it must be checked
    // and the zeroRemainingTasks condition object notified if that condition is obtained.
    long tasks = remainingTasks.decrementAndGet();
    Preconditions.checkState(
        tasks >= 0,
        "Decrementing remaining tasks counter resulted in impossible negative number.");
    if (tasks == 0) {
      synchronized (zeroRemainingTasks) {
        zeroRemainingTasks.notify();
      }
    }
  }

  /** If this returns true, don't enqueue new actions. */
  protected boolean blockNewActions() {
    return isInterrupted() || (unhandled != null && failFastOnException);
  }

  @VisibleForTesting
  @Override
  public final CountDownLatch getExceptionLatchForTestingOnly() {
    return exceptionLatch;
  }

  @VisibleForTesting
  @Override
  public final CountDownLatch getInterruptionLatchForTestingOnly() {
    return interruptedLatch;
  }

  /** Get the value of the interrupted flag. */
  @ThreadSafety.ThreadSafe
  protected final boolean isInterrupted() {
    return threadInterrupted;
  }

  /**
   * Get the number of jobs remaining. Note that this can increase in value if running tasks
   * submit further jobs.
   */
  public final long getTaskCount() {
    return remainingTasks.get();
  }

  /**
   * Whether all running and pending jobs will be stopped or cancelled. Also, newly submitted
   * tasks will be rejected if this is true.
   *
   * <p>This function returns the current state of whether jobs should be stopped. If the value is
   * false right now, it may be changed to true by another thread later.
   */
  protected final boolean mustJobsBeStopped() {
    return jobsMustBeStopped;
  }

  /**
   * Waits for the task queue to drain. Then, if {@code ownExecutorService} is true, shuts down
   * the {@link ExecutorService} and waits for it to terminate. Throws (the same) unchecked
   * exception if any worker thread failed unexpectedly.
   */
  protected final void awaitTermination(boolean interruptWorkers) throws InterruptedException {
    Throwables.propagateIfPossible(catastrophe);
    try {
      synchronized (zeroRemainingTasks) {
        while (remainingTasks.get() != 0 && !jobsMustBeStopped) {
          zeroRemainingTasks.wait();
        }
      }
    } catch (InterruptedException e) {
      // Mark the visitor, so that it's known to be interrupted, and then break out of here, stop
      // the worker threads and return ASAP, sending the interruption to the parent thread.
      setInterrupted();
    }

    reallyAwaitTermination(interruptWorkers);

    if (isInterrupted()) {
      // Set the interrupted bit on the current thread so that callers can see that it was
      // interrupted. Note that if the thread was interrupted while awaiting termination, we might
      // not hit this codepath, but then the current thread's interrupt bit is already set, so we
      // are fine.
      Thread.currentThread().interrupt();
    }
    // Rethrow the most severe unhandled (worker thread) exception in the main thread. We throw an
    // unchecked exception instead of InterruptedException if both are present because an
    // unchecked exception may indicate a catastrophic failure that should shut down the program.
    // The caller can check the interrupted bit if they will handle the unchecked exception
    // without crashing.
    Throwables.propagateIfPossible(unhandled);
    if (Thread.interrupted()) {
      throw new InterruptedException();
    }
  }

  private void reallyAwaitTermination(boolean interruptWorkers) {
    // TODO(bazel-team): verify that interrupt() is safe for every use of AbstractQueueVisitor
    // and remove the interruptWorkers flag.
    if (interruptWorkers && !jobs.isEmpty()) {
      interruptInFlightTasks();
    }

    if (isInterrupted()) {
      interruptedLatch.countDown();
    }

    Throwables.propagateIfPossible(catastrophe);
    synchronized (zeroRemainingTasks) {
      while (remainingTasks.get() != 0) {
        try {
          zeroRemainingTasks.wait();
        } catch (InterruptedException e) {
          setInterrupted();
        }
      }
    }

    if (ownExecutorService) {
      executorService.shutdown();
      for (;;) {
        try {
          Throwables.propagateIfPossible(catastrophe);
          executorService.awaitTermination(Integer.MAX_VALUE, TimeUnit.SECONDS);
          break;
        } catch (InterruptedException e) {
          setInterrupted();
        }
      }
    }
  }

  private void interruptInFlightTasks() {
    Thread thisThread = Thread.currentThread();
    for (Thread thread : jobs.asMap().keySet()) {
      if (thisThread != thread) {
        thread.interrupt();
      }
    }
  }
}