/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * <p/>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p/>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred.gridmix;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.gridmix.Gridmix.Component;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.tools.rumen.JobStory;

import java.io.IOException;
import java.security.PrivilegedExceptionAction;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

/**
 * Component collecting the stats required by other components
 * to make decisions.
 * Single thread collector tries to collect the stats (currently cluster stats)
 * and caches it internally.
* Components interested in these stats need to register themselves and will get * notified either on every job completion event or some fixed time interval. */ public class Statistics implements Component<Statistics.JobStats> { public static final Log LOG = LogFactory.getLog(Statistics.class); private final StatCollector statistics = new StatCollector(); private JobClient cluster; //List of cluster status listeners. private final List<StatListener<ClusterStats>> clusterStatlisteners = new CopyOnWriteArrayList<StatListener<ClusterStats>>(); //List of job status listeners. private final List<StatListener<JobStats>> jobStatListeners = new CopyOnWriteArrayList<StatListener<JobStats>>(); // A map of job-sequence-id to job-stats of submitted jobs private static final Map<Integer, JobStats> submittedJobsMap = new ConcurrentHashMap<Integer, JobStats>(); // total number of map tasks submitted private static volatile int numMapsSubmitted = 0; // total number of reduce tasks submitted private static volatile int numReducesSubmitted = 0; private int completedJobsInCurrentInterval = 0; private final int jtPollingInterval; private volatile boolean shutdown = false; private final int maxJobCompletedInInterval; private static final String MAX_JOBS_COMPLETED_IN_POLL_INTERVAL_KEY = "gridmix.max-jobs-completed-in-poll-interval"; private final ReentrantLock lock = new ReentrantLock(); private final Condition jobCompleted = lock.newCondition(); private final CountDownLatch startFlag; public Statistics( final Configuration conf, int pollingInterval, CountDownLatch startFlag) throws IOException, InterruptedException { UserGroupInformation ugi = UserGroupInformation.getLoginUser(); this.cluster = ugi.doAs(new PrivilegedExceptionAction<JobClient>() { public JobClient run() throws IOException { return new JobClient(new JobConf(conf)); } }); this.jtPollingInterval = pollingInterval; maxJobCompletedInInterval = conf.getInt( MAX_JOBS_COMPLETED_IN_POLL_INTERVAL_KEY, 1); this.startFlag = 
startFlag; } /** * Generates a job stats. */ public static JobStats generateJobStats(Job job, JobStory jobdesc) { int seq = GridmixJob.getJobSeqId(job); // bail out if job description is missing for a job to be simulated if (seq >= 0 && jobdesc == null) { throw new IllegalArgumentException("JobStory not available for job " + job.getJobID()); } int maps = -1; int reds = -1; if (jobdesc != null) { // Note that the ZombieJob will return a >= 0 value maps = jobdesc.getNumberMaps(); reds = jobdesc.getNumberReduces(); } return new JobStats(maps, reds, job); } /** * Add a submitted job for monitoring. */ public void addJobStats(JobStats stats) { int seq = GridmixJob.getJobSeqId(stats.getJob()); if (seq < 0) { LOG.info("Not tracking job " + stats.getJob().getJobName() + " as seq id is less than zero: " + seq); return; } submittedJobsMap.put(seq, stats); numMapsSubmitted += stats.getNoOfMaps(); numReducesSubmitted += stats.getNoOfReds(); } /** * Used by JobMonitor to add the completed job. */ @Override public void add(Statistics.JobStats job) { //This thread will be notified initially by job-monitor incase of //data generation. Ignore that as we are getting once the input is //generated. if (!statistics.isAlive()) { return; } JobStats stat = submittedJobsMap.remove(GridmixJob.getJobSeqId(job.getJob())); // stat cannot be null if (stat == null) { LOG.error("[Statistics] Missing entry for job " + job.getJob().getJobID()); return; } // update the total number of submitted map/reduce task count numMapsSubmitted -= stat.getNoOfMaps(); numReducesSubmitted -= stat.getNoOfReds(); completedJobsInCurrentInterval++; //check if we have reached the maximum level of job completions. if (completedJobsInCurrentInterval >= maxJobCompletedInInterval) { if (LOG.isDebugEnabled()) { LOG.debug( " Reached maximum limit of jobs in a polling interval " + completedJobsInCurrentInterval); } completedJobsInCurrentInterval = 0; lock.lock(); try { //Job is completed notify all the listeners. 
for (StatListener<JobStats> l : jobStatListeners) { l.update(stat); } this.jobCompleted.signalAll(); } finally { lock.unlock(); } } } //TODO: We have just 2 types of listeners as of now . If no of listeners //increase then we should move to map kind of model. public void addClusterStatsObservers(StatListener<ClusterStats> listener) { clusterStatlisteners.add(listener); } public void addJobStatsListeners(StatListener<JobStats> listener) { this.jobStatListeners.add(listener); } /** * Attempt to start the service. */ @Override public void start() { statistics.start(); } private class StatCollector extends Thread { StatCollector() { super("StatsCollectorThread"); } public void run() { try { startFlag.await(); if (Thread.currentThread().isInterrupted()) { return; } } catch (InterruptedException ie) { LOG.error( "Statistics Error while waiting for other threads to get ready ", ie); return; } while (!shutdown) { lock.lock(); try { jobCompleted.await(jtPollingInterval, TimeUnit.MILLISECONDS); } catch (InterruptedException ie) { if (!shutdown) { LOG.error("Statistics interrupt while waiting for completion of " + "a job.", ie); } return; } finally { lock.unlock(); } //Fetch cluster data only if required.i.e . // only if there are clusterStats listener. if (clusterStatlisteners.size() > 0) { try { ClusterStatus clusterStatus = cluster.getClusterStatus(); updateAndNotifyClusterStatsListeners(clusterStatus); } catch (IOException e) { LOG.error( "Statistics io exception while polling JT ", e); return; } } } } private void updateAndNotifyClusterStatsListeners( ClusterStatus clusterStatus) { ClusterStats stats = ClusterStats.getClusterStats(); stats.setClusterMetric(clusterStatus); for (StatListener<ClusterStats> listener : clusterStatlisteners) { listener.update(stats); } } } /** * Wait until the service completes. It is assumed that either a * {@link #shutdown} or {@link #abort} has been requested. 
*/ @Override public void join(long millis) throws InterruptedException { statistics.join(millis); } @Override public void shutdown() { shutdown = true; submittedJobsMap.clear(); clusterStatlisteners.clear(); jobStatListeners.clear(); statistics.interrupt(); } @Override public void abort() { shutdown = true; submittedJobsMap.clear(); clusterStatlisteners.clear(); jobStatListeners.clear(); statistics.interrupt(); } /** * Class to encapsulate the JobStats information. * Current we just need information about completedJob. * TODO: In future we need to extend this to send more information. */ static class JobStats { private final int noOfMaps; private final int noOfReds; private JobStatus currentStatus; private final Job job; public JobStats(int noOfMaps,int numOfReds, Job job){ this.job = job; this.noOfMaps = noOfMaps; this.noOfReds = numOfReds; } public int getNoOfMaps() { return noOfMaps; } public int getNoOfReds() { return noOfReds; } /** * Returns the job , * We should not use job.getJobID it returns null in 20.1xx. * Use (GridmixJob.getJobSeqId(job)) instead * @return job */ public Job getJob() { return job; } /** * Update the job statistics. */ public synchronized void updateJobStatus(JobStatus status) { this.currentStatus = status; } /** * Get the current job status. 
*/ public synchronized JobStatus getJobStatus() { return currentStatus; } } static class ClusterStats { private ClusterStatus status = null; private static ClusterStats stats = new ClusterStats(); private ClusterStats() { } /** * @return stats */ static ClusterStats getClusterStats() { return stats; } /** * @param metrics */ void setClusterMetric(ClusterStatus metrics) { this.status = metrics; } /** * @return metrics */ public ClusterStatus getStatus() { return status; } int getNumRunningJob() { return submittedJobsMap.size(); } /** * @return runningWatitingJobs */ static Collection<JobStats> getRunningJobStats() { return submittedJobsMap.values(); } /** * Returns the total number of submitted map tasks */ static int getSubmittedMapTasks() { return numMapsSubmitted; } /** * Returns the total number of submitted reduce tasks */ static int getSubmittedReduceTasks() { return numReducesSubmitted; } } }