package com.linkedin.thirdeye.rootcause;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * Container class for configuring and executing a root cause search with multiple pipelines.
 * The framework is instantiated with a set of (named) pipelines and an executor. The run()
 * method then executes the configured pipelines for arbitrary inputs without maintaining
 * any additional state within the RCAFramework.
 *
 * RCAFramework supports parallel DAG execution and requires pipelines to form a valid path
 * from {@code INPUT} to {@code OUTPUT}. The execution order of pipelines is guaranteed to be
 * compatible with a serial execution in a single thread.
 */
/*
 *                    /-> pipeline.run() --> pipeline.run() \
 *                   /                                       \
 * INPUT --> run() ---> pipeline.run() ---> pipeline.run() --> OUTPUT
 *                   \                                       /
 *                    \-> pipeline.run() -------------------/
 */
public class RCAFramework {
  private static final Logger LOG = LoggerFactory.getLogger(RCAFramework.class);

  public static final String INPUT = "INPUT";
  public static final String OUTPUT = "OUTPUT";

  // Timeout for retrieving each pipeline result, in milliseconds
  public static final long TIMEOUT = 600000;

  private final Map<String, Pipeline> pipelines;
  private final ExecutorService executor;

  public RCAFramework(Collection<Pipeline> pipelines, ExecutorService executor) {
    this.executor = executor;

    if(!isValidDAG(pipelines))
      throw new IllegalArgumentException(String.format("Invalid DAG. Output '%s' not reachable from input '%s'", OUTPUT, INPUT));

    this.pipelines = new HashMap<>();
    for(Pipeline p : pipelines) {
      if(INPUT.equals(p.getOutputName()))
        throw new IllegalArgumentException(String.format("Must not contain a pipeline with output name '%s'", INPUT));
      if(this.pipelines.containsKey(p.getOutputName()))
        throw new IllegalArgumentException(String.format("Already contains pipeline with output name '%s'", p.getOutputName()));
      this.pipelines.put(p.getOutputName(), p);
    }

    if(!this.pipelines.containsKey(OUTPUT))
      throw new IllegalArgumentException(String.format("Must contain a pipeline with output name '%s'", OUTPUT));
  }

  /**
   * Performs root cause search for a user-specified set of input entities.
   * Fans out entities to individual pipelines, collects results, and aggregates them.
   *
   * @param input user-specified search entities
   * @return aggregated results
   */
  public RCAFrameworkExecutionResult run(Set<Entity> input) throws Exception {
    Map<String, Pipeline> pipelines = new HashMap<>(this.pipelines);

    // Seed the DAG with a static pipeline that emits the user-provided entities under INPUT
    pipelines.put(INPUT, new StaticPipeline(INPUT, Collections.<String>emptySet(), input));

    LOG.info("Constructing flow for input '{}'", input);
    Map<String, Future<PipelineResult>> flow = constructDAG(pipelines);

    Map<String, PipelineResult> results = new HashMap<>();
    for(Map.Entry<String, Future<PipelineResult>> e : flow.entrySet()) {
      PipelineResult r = e.getValue().get(TIMEOUT, TimeUnit.MILLISECONDS);
      if(LOG.isDebugEnabled())
        logResultDetails(r);
      results.put(e.getKey(), r);
    }

    return new RCAFrameworkExecutionResult(results.get(OUTPUT).getEntities(), results);
  }

  static void logResultDetails(PipelineResult result) {
    // Log entities in descending order of score, rounded to three decimal places
    List<Entity> entities = new ArrayList<>(result.getEntities());
    Collections.sort(entities, new Comparator<Entity>() {
      @Override
      public int compare(Entity o1, Entity o2) {
        return -Double.compare(o1.getScore(), o2.getScore());
      }
    });

    for(Entity e : entities) {
      LOG.debug("{} [{}] {}", Math.round(e.getScore() * 1000) / 1000.0, e.getClass().getSimpleName(), e.getUrn());
    }
  }

  static boolean isValidDAG(Collection<Pipeline> pipelines) {
    // Fixpoint iteration: starting from INPUT, mark every pipeline output whose
    // inputs are all reachable, until no new outputs can be added
    Set<String> visited = new HashSet<>();
    visited.add(INPUT);

    int prevSize = 0;
    while(prevSize < visited.size()) {
      prevSize = visited.size();
      for (Pipeline p : pipelines) {
        if (visited.containsAll(p.getInputNames()))
          visited.add(p.getOutputName());
      }
    }

    return visited.contains(OUTPUT);
  }

  Map<String, Future<PipelineResult>> constructDAG(Map<String, Pipeline> pipelines) {
    // TODO purge pipelines not on critical path
    Map<String, Future<PipelineResult>> tasks = new HashMap<>();

    Pipeline input = pipelines.get(INPUT);
    PipelineCallable inputCallable = new PipelineCallable(Collections.<String, Future<PipelineResult>>emptyMap(), input);
    tasks.put(INPUT, this.executor.submit(inputCallable));

    // Submit each pipeline once all of its dependencies have been submitted;
    // iterate until no further pipelines become schedulable
    int prevSize = 0;
    while(prevSize < tasks.size()) {
      prevSize = tasks.size();
      for(Pipeline p : pipelines.values()) {
        if(!tasks.containsKey(p.getOutputName()) && tasks.keySet().containsAll(p.getInputNames())) {
          Map<String, Future<PipelineResult>> dependencies = new HashMap<>();
          for(String inputName : p.getInputNames()) {
            dependencies.put(inputName, tasks.get(inputName));
          }
          PipelineCallable c = new PipelineCallable(dependencies, p);
          tasks.put(p.getOutputName(), this.executor.submit(c));
        }
      }
    }

    return tasks;
  }
}
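
/*
 * Minimal usage sketch (illustrative only, not part of the framework): the example class,
 * thread pool size, and wiring below are assumptions; it relies solely on the StaticPipeline
 * constructor already used by run() above. A single pipeline that reads from INPUT and
 * writes to OUTPUT forms the smallest DAG the RCAFramework constructor accepts.
 */
class RCAFrameworkUsageExample {
  public static void main(String[] args) throws Exception {
    ExecutorService executor = java.util.concurrent.Executors.newFixedThreadPool(2);
    try {
      // Pipeline emitting a fixed (here empty) entity set under the name OUTPUT,
      // depending only on INPUT; real deployments register domain-specific pipelines
      Pipeline outputPipeline = new StaticPipeline(RCAFramework.OUTPUT,
          Collections.singleton(RCAFramework.INPUT), Collections.<Entity>emptySet());

      RCAFramework framework = new RCAFramework(Collections.singletonList(outputPipeline), executor);

      // Run the search; real callers pass their seed search entities instead of an empty set
      RCAFrameworkExecutionResult result = framework.run(Collections.<Entity>emptySet());
    } finally {
      executor.shutdown();
    }
  }
}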