ParseJobsFromLogs.java example

Explorer
white-elephant-master
- hadoop
  - src
    - java
      - com
        linkedin
        whiteelephant
        ProcessLogs.java
        analysis
        ComputeUsagePerHour.java
        mapreduce
        MyAvroMultipleOutputs.java
        lib
        input
        CombineDocumentFileFormat.java
        CombinedTextInputFormat.java
        job
        StagedOutputJob.java
        StagedOutputJobExecutor.java
        parsing
        LineParsing.java
        ParseJobConfs.java
        ParseJobsFromLogs.java
        util
        JobStatsProcessing.java
- server
  - src
    - java
      - com
        linkedin
        whiteelephant
        TimeZoneConversion.java
/*
 * Copyright 2012 LinkedIn, Inc
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.linkedin.whiteelephant.parsing;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.TimeZone;
import java.util.concurrent.ExecutionException;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapred.AvroWrapper;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyValueOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.log4j.Logger;
import org.json.JSONObject;

import com.google.common.base.Predicate;
import com.google.common.collect.Collections2;
import com.linkedin.whiteelephant.parsing.Attempt;
import com.linkedin.whiteelephant.parsing.DerivedAttemptData;
import com.linkedin.whiteelephant.parsing.Job;
import com.linkedin.whiteelephant.parsing.LogData;
import com.linkedin.whiteelephant.parsing.Task;
import com.linkedin.whiteelephant.mapreduce.MyAvroMultipleOutputs;
import com.linkedin.whiteelephant.mapreduce.lib.input.CombinedTextInputFormat;
import com.linkedin.whiteelephant.mapreduce.lib.job.StagedOutputJob;
import com.linkedin.whiteelephant.mapreduce.lib.job.StagedOutputJobExecutor;
import com.linkedin.whiteelephant.parsing.LineParsing;
import com.linkedin.whiteelephant.util.JobStatsProcessing;

public class ParseJobsFromLogs
{  
  private static final String CPU_MILLISECONDS = "CPU_MILLISECONDS";
  
  private final Logger _log;
  private final Properties _props;
  private final FileSystem _fs;
  private final String _name;
  
  private final String _jobsOutputPathRoot;
  private final String _logsRoot;
  private final String _clusterNames;
  private final int _numDays;
  private final int _numDaysForced;
  private final boolean _incremental;
  
  public ParseJobsFromLogs(String name, Properties props) throws IOException
  {
    _log = Logger.getLogger(name);
    _name = name;
    _props = props;    
    
    Configuration conf = StagedOutputJob.createConfigurationFromProps(_props);
    
    System.out.println("fs.default.name: " + conf.get("fs.default.name"));
    
    _fs = FileSystem.get(conf);
    
    if (_props.get("cluster.names") == null) {
      throw new IllegalArgumentException("cluster.names is not specified.");
    }
    
    if (_props.get("jobs.output.path") == null) {
       throw new IllegalArgumentException("attempts.output.path is not specified.");
    }
    
    if (_props.get("num.days") == null) {
      throw new IllegalArgumentException("num.days is not specified");
    }
    
    if (_props.get("num.days.forced") == null) {
      throw new IllegalArgumentException("num.days.forced is not specified");
    }
    
    if (_props.get("incremental") == null) {
      throw new IllegalArgumentException("incremental is not specified.");
    }

    if (_props.get("logs.root") == null) {
      throw new IllegalArgumentException("logs.root is not specified.");
    }

    _jobsOutputPathRoot = (String)_props.get("jobs.output.path");
    _logsRoot = (String)_props.get("logs.root");
    _clusterNames = (String)_props.get("cluster.names");
    _numDays = Integer.parseInt((String)_props.get("num.days"));
    _numDaysForced = Integer.parseInt((String)_props.get("num.days.forced"));
    _incremental = Boolean.parseBoolean((String)_props.get("incremental"));
  }
  
  public void execute(StagedOutputJobExecutor executor) throws IOException, InterruptedException, ExecutionException
  {
    for (String clusterName : _clusterNames.split(","))
    {
      System.out.println("Processing cluster " + clusterName);
            
      List<JobStatsProcessing.ProcessingTask> processingTasks = JobStatsProcessing.getTasks(_fs, _logsRoot, clusterName, _jobsOutputPathRoot, "log", _incremental, _numDays, _numDaysForced);
      
      for (JobStatsProcessing.ProcessingTask task : processingTasks)
      {      
        List<String> inputPaths = new ArrayList<String>();
        inputPaths.add(task.inputPathFormat);
        
        String outputPath = task.outputPath;
        
        final StagedOutputJob job = StagedOutputJob.createStagedJob(
           _props,
           _name + "-parse-jobs-" + task.id,
           inputPaths,
           "/tmp" + outputPath,
           outputPath,
           _log);
        
        job.getConfiguration().set("jobs.output.path", _jobsOutputPathRoot);
        job.getConfiguration().set("logs.cluster.name", clusterName);
                
        // 1 reducer per 12 GB of input data
        long numReduceTasks = (int)Math.ceil(((double)task.totalLength) / 1024 / 1024 / 1024 / 12);
                
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(BytesWritable.class);
  
        job.setInputFormatClass(CombinedTextInputFormat.class);
        job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
  
        AvroJob.setOutputKeySchema(job, Schema.create(Type.STRING));
        AvroJob.setOutputValueSchema(job, LogData.SCHEMA$);
        
        job.setNumReduceTasks((int)numReduceTasks);
   
        job.setMapperClass(ParseJobsFromLogs.TheMapper.class);
        job.setReducerClass(ParseJobsFromLogs.TheReducer.class);
         
        AvroJob.setMapOutputKeySchema(job, Schema.create(Type.STRING));
        AvroJob.setMapOutputValueSchema(job, LogData.SCHEMA$);
        
        MyAvroMultipleOutputs.addNamedOutput(job, "logs", AvroKeyValueOutputFormat.class, Schema.create(Type.STRING), LogData.SCHEMA$);
        
        executor.submit(job);
      }
      
      executor.waitForCompletion();
    }
  }
  
  public static class TheMapper extends Mapper<LongWritable, Text, AvroWrapper<String>, AvroWrapper<LogData>> 
  {
    String _clusterName;
    
    @Override
    protected void setup(Context context)
    {
      _clusterName = context.getConfiguration().get("logs.cluster.name");
    }
    
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException 
    {
      List<CharSequence> inputSplits = new ArrayList<CharSequence>();
      
      CombineFileSplit fileSplit = (CombineFileSplit) context.getInputSplit();
                  
      for (Path path : fileSplit.getPaths())
      {        
        inputSplits.add(path.toString());
      }
      
      String line = value.toString(); 
      
      Job job = null;
      Attempt attempt = null;
      Task task = null;
      
      job = LineParsing.tryParseJob(line);
      
      if (job == null)
      {      
        attempt = LineParsing.tryParseAttempt(line);
        
        if (attempt == null)
        {
          task = LineParsing.tryParseTask(line);
        }
      }
      
      LogData data = new LogData();
      String jobId = null;
      
      data.setCluster(_clusterName);
      
      try
      {
        if (job != null)
        {
          // log lines are sometimes truncated, so data may be missing - just ignore these
          if (job.getJobId() != null)
          {
            jobId = job.getJobId().toString();
            job.setTasks(new ArrayList<Task>());
            data.setEntry(job);
            data.setPath(findInputSplitForJob(jobId,inputSplits));
            context.write(new AvroKey<String>(jobId), new AvroValue<LogData>(data));
          }
        }
        else if (attempt != null)
        {
          // log lines are sometimes truncated, so data may be missing - just ignore these
          if (attempt.getJobId() != null && attempt.getTaskId() != null && attempt.getTaskAttemptId() != null && attempt.getType() != null)
          {
            jobId = attempt.getJobId().toString();
            data.setEntry(attempt);
            data.setPath(findInputSplitForJob(jobId,inputSplits));
            context.write(new AvroKey<String>(jobId), new AvroValue<LogData>(data));
          }
        }
        else if (task != null)
        {
          // log lines are sometimes truncated, so data may be missing - just ignore these
          if (task.getJobId() != null && task.getTaskId() != null && task.getType() != null)
          {
            jobId = task.getJobId().toString();
            task.setAttempts(new ArrayList<Attempt>());
            data.setEntry(task);
            data.setPath(findInputSplitForJob(jobId,inputSplits));
            context.write(new AvroKey<String>(jobId), new AvroValue<LogData>(data));
          }
        }
      }
      catch (Exception e)
      {
        System.out.println("Exception writing log data: " + e.toString());
        if (jobId != null)
        {
          System.out.println("jobId: " + jobId);
          CharSequence path = data.getPath();
          if (path != null)
          {
            System.out.println("input: " + path);
          }
        }
        throw new IOException(e);
      }
    }
    
    private CharSequence findInputSplitForJob(String jobId, List<CharSequence> inputSplits)
    {
      if (jobId != null)
      {
        for (CharSequence inputSplit : inputSplits)
        {
          if (inputSplit.toString().contains(jobId))
          {
            return inputSplit;
          }
        }
      }
      
      return null;
    }
  }
  
  public static class TheReducer extends Reducer<AvroKey<String>, AvroValue<LogData>, AvroWrapper<String>, AvroWrapper<LogData>> 
  {    
    private String jobOutputPath;
    
    @Override
    protected void setup(Context context)
    {
      jobOutputPath = context.getConfiguration().get("jobs.output.path");
      System.out.println("Job output path: " + jobOutputPath);
    }
    
    @Override
    protected void reduce(AvroKey<String> key, Iterable<AvroValue<LogData>> values, final Context context) throws IOException, InterruptedException 
    {      
      String jobId = key.datum();
      
      String inputPath = null;
      
      List<Job> jobEntries = new ArrayList<Job>();
      List<Attempt> attemptEntries = new ArrayList<Attempt>();
      List<Task> taskEntries = new ArrayList<Task>();
      for (AvroValue<LogData> value : values)
      {        
        inputPath = value.datum().getPath().toString();
        
        if (value.datum().getEntry() instanceof Job)
        {
          jobEntries.add(Job.newBuilder((Job)value.datum().getEntry()).build());
        }
        else if (value.datum().getEntry() instanceof Task)
        {
          taskEntries.add(Task.newBuilder((Task)value.datum().getEntry()).build());
        }
        else if (value.datum().getEntry() instanceof Attempt)
        {
          attemptEntries.add(Attempt.newBuilder((Attempt)value.datum().getEntry()).build());
        }
      }
      
      Job job = new Job();
      
      mergeJobEntries(job, jobEntries);
      mergeTaskEntries(job, taskEntries);
      mergeTaskAttemptEntries(job, attemptEntries);      
      
      LogData data = new LogData();
      data.setPath(inputPath);
      data.setEntry(job);
      
      try
      {
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy/MMdd");     
        dateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
        
//        Date submitTime = new Date(job.getSubmitTime());
//        String baseOutputPath = String.format("%s/%s/part",cluster,dateFormat.format(submitTime));                        
//        amos.write("logs", new AvroWrapper<String>(jobId), new AvroWrapper<LogData>(data), baseOutputPath);
        
        context.write(new AvroWrapper<String>(jobId), new AvroWrapper<LogData>(data));
      }
      catch (Exception e)
      {
        System.out.println("Exception writing log data: " + e.toString());
        
        if (inputPath != null)
        {
          System.out.println("input: " + inputPath);
        }
        
        System.out.println("key: " + jobId);
        
        try
        {
          System.out.println("value:\n" + (new JSONObject(GenericData.get().toString(data)).toString(2)));
        }
        catch (Exception e1)
        {
          System.out.println("Error generating JSON value: " + e1.toString());
        }
        throw new IOException(e);
      }
    }
    
    public static void mergeJobEntries(Job parsedJob, List<Job> jobEntries/*, final Context context*/)
    {      
      for (Job jobLine : jobEntries)
      {        
        // ignore status unless it's the last one, status actually shows up multiple times as the job passes through various states
        if (jobLine.getJobStatus() != null && jobLine.getFinishTime() != null)
        {
          parsedJob.setJobStatus(jobLine.getJobStatus());
          parsedJob.setFinishTime(jobLine.getFinishTime());
        }
        
        if (jobLine.getUser() != null)
        {
          parsedJob.setUser(jobLine.getUser());
        }
        
        if (jobLine.getJobId() != null)
        {
          parsedJob.setJobId(jobLine.getJobId());
        }
        
        if (jobLine.getUser() != null)
        {
          parsedJob.setUser(jobLine.getUser());
        }
        
        if (jobLine.getJobName() != null)
        {
          parsedJob.setJobName(jobLine.getJobName());
        }
        
        if (jobLine.getJobQueue() != null)
        {
          parsedJob.setJobQueue(jobLine.getJobQueue());
        }
        
        if (jobLine.getSubmitTime() != null)
        {
          parsedJob.setSubmitTime(jobLine.getSubmitTime());
        }
        
        if (jobLine.getLaunchTime() != null)
        {
          parsedJob.setLaunchTime(jobLine.getLaunchTime());
        }
        
        if (jobLine.getTotalMaps() != null)
        {
          parsedJob.setTotalMaps(jobLine.getTotalMaps());
        }
        
        if (jobLine.getTotalReduces() != null)
        {
          parsedJob.setTotalReduces(jobLine.getTotalReduces());
        }
        
        if (jobLine.getFinishedMaps() != null)
        {
          parsedJob.setFinishedMaps(jobLine.getFinishedMaps());
        }
        
        if (jobLine.getFinishedReduces() != null)
        {
          parsedJob.setFinishedReduces(jobLine.getFinishedReduces());
        }
        
        if (jobLine.getFailedMaps() != null)
        {
          parsedJob.setFailedMaps(jobLine.getFailedMaps());
        }
        
        if (jobLine.getFailedReduces() != null)
        {
          parsedJob.setFailedReduces(jobLine.getFailedReduces());
        }
      }
    }
    
    public static void mergeTaskEntries(Job job, List<Task> taskEntries)
    {
      Map<String,Task> taskIdToTask = new HashMap<String,Task>();
            
      for (Task entry : taskEntries)
      {
        String taskId = entry.getTaskId().toString();
        
        if (taskId == null)
        {
          throw new RuntimeException("Missing task ID");
        }
        
        Task task = null;
        
        if (!taskIdToTask.containsKey(taskId))
        { 
          taskIdToTask.put(taskId,new Task());
        }
        
        task = taskIdToTask.get(taskId);
        
        if (task.getAttempts() == null)
        {
          task.setAttempts(new ArrayList<Attempt>());
        }
        
        if (entry.getTaskId() != null)
        {
          task.setTaskId(entry.getTaskId());
        }

        if (entry.getTaskStatus() != null)
        {
          task.setTaskStatus(entry.getTaskStatus());
        }
        
        if (entry.getStartTime() != null)
        {
          task.setStartTime(entry.getStartTime());
        }
        
        if (entry.getFinishTime() != null)
        {
          task.setFinishTime(entry.getFinishTime());
        }
        
        if (entry.getJobId() != null)
        {
          task.setJobId(entry.getJobId());
        }
        
        if (entry.getType() != null)
        {
          task.setType(entry.getType());
        }
      }
      
      List<Task> tasks = new ArrayList<Task>(taskIdToTask.values());
      
      Collections.sort(tasks, new Comparator<Task>() {
        @Override
        public int compare(Task o1, Task o2)
        {
          return o1.getTaskId().toString().compareTo(o2.getTaskId().toString());
        }
      });
      
      job.setTasks(tasks);
    }
    
    /**
     * Merges together attempt data having the same task attempt ID so all the data for an attempt is in a single record.
     * 
     * @param values
     * @param context
     * @return
     */
    public static void mergeTaskAttemptEntries(Job job, Iterable<Attempt> attemptEntries/*, final Context context*/)
    {
      // merge together the entries for each task attempt
      Map<String,Attempt> taskAttemptIdToAttempt = new HashMap<String,Attempt>();      
      for (Attempt attempt : attemptEntries)
      {        
        Attempt mergedAttempt;
        if (!taskAttemptIdToAttempt.containsKey(attempt.getTaskAttemptId().toString()))
        {
          mergedAttempt = new Attempt();
          mergedAttempt.setCounters(new HashMap<CharSequence,Long>());
          mergedAttempt.setDerived(new DerivedAttemptData());
          taskAttemptIdToAttempt.put(attempt.getTaskAttemptId().toString(),mergedAttempt);
        }
        else
        {
          mergedAttempt = taskAttemptIdToAttempt.get(attempt.getTaskAttemptId().toString());
        }              
        
        if (attempt.getType() != null)
        {
          mergedAttempt.setType(attempt.getType());
        }
        
        if (attempt.getJobId() != null)
        {
          mergedAttempt.setJobId(attempt.getJobId());
        }
        
        if (attempt.getTaskId() != null)
        {
          mergedAttempt.setTaskId(attempt.getTaskId());
        }
        
        if (attempt.getTaskAttemptId() != null)
        {
          mergedAttempt.setTaskAttemptId(attempt.getTaskAttemptId());
        }
        
        if (attempt.getStartTime() != null)
        {
          // take the later start time in case there are multiple
          if (mergedAttempt.getStartTime() == null || mergedAttempt.getStartTime() < attempt.getStartTime())
          {
            mergedAttempt.setStartTime(attempt.getStartTime());
          }
        }
        
        if (attempt.getFinishTime() != null)
        {
          // take the later finish time in case there are multiple
          if (mergedAttempt.getFinishTime() == null || mergedAttempt.getFinishTime() < attempt.getFinishTime())
          {
            mergedAttempt.setFinishTime(attempt.getFinishTime());
          }
        }
        
        if (attempt.getShuffleFinished() != null)
        {
          // take the later finish time in case there are multiple
          if (mergedAttempt.getShuffleFinished() == null || mergedAttempt.getShuffleFinished() < attempt.getShuffleFinished())
          {
            mergedAttempt.setShuffleFinished(attempt.getShuffleFinished());
          }
        }
        
        if (attempt.getSortFinished() != null)
        {
          // take the later finish time in case there are multiple
          if (mergedAttempt.getSortFinished() == null || mergedAttempt.getSortFinished() < attempt.getSortFinished())
          {
            mergedAttempt.setSortFinished(attempt.getSortFinished());
          }
        }
                
        if (attempt.getTaskStatus() != null)
        {
          mergedAttempt.setTaskStatus(attempt.getTaskStatus());
        }
        
        if (attempt.getCounters() != null && attempt.getCounters().size() > 0)
        {
          mergedAttempt.setCounters(attempt.getCounters());
        }
      }
      
      // filter out bad data
      Collection<Attempt> filteredAttempts = Collections2.filter(taskAttemptIdToAttempt.values(), new Predicate<Attempt>() {
        @Override
        public boolean apply(Attempt attempt)
        {
          if (attempt.getTaskAttemptId() == null)
          {
            System.out.println("Did not find task attempt ID");
            return false;
          }
          
          if (attempt.getTaskStatus() == null)
          {
            // The logs can sometimes be cut off, just count this and hopefully it isn't significant.  
            // The task probably didn't execute in this case.
            //context.getCounter("Job Parsing", "Missing status").increment(1);
            System.out.println("Did not find status for attempt " + attempt.getTaskAttemptId());
            return false;
          }
          
          if (attempt.getStartTime() == null)
          {
            // The logs can sometimes be cut off, just count this and hopefully it isn't significant.  
            // The task probably didn't execute in this case.
            //context.getCounter("Job Parsing", "Missing startTime").increment(1);
            System.out.println("Did not find startTime for attempt " + attempt.getTaskAttemptId());
            return false;
          }
          
          if (attempt.getFinishTime() == null)
          {
            // The logs can sometimes be cut off, just count this and hopefully it isn't significant.  
            // The task probably didn't execute in this case.
            //context.getCounter("Job Parsing", "Missing finishTime").increment(1);
            System.out.println("Did not find finishTime for attempt " + attempt.getTaskAttemptId());
            return false;
          }
          
          if (attempt.getFinishTime() < attempt.getStartTime())
          {
            //context.getCounter("Job Parsing", "Finish time before start time").increment(1);
            System.out.println("Finish time is before start for attempt " + attempt.getTaskAttemptId());
            return false;
          }
          
          return true;
        }
      });
      
      // map to look up task by task id
      Map<String,Task> taskIdToTask = new HashMap<String,Task>();
      for (Task task : job.getTasks())
      {
        taskIdToTask.put(task.getTaskId().toString(), task);
      }
      
      // collect each of the attempts and add to the corresponding task
      for (Attempt attempt : filteredAttempts)
      {
        Task task = taskIdToTask.get(attempt.getTaskId().toString());
        
        if (task == null)
        {
          throw new RuntimeException("Could not find task");
        }
        
        if (task.getAttempts() == null)
        {
          task.setAttempts(new ArrayList<Attempt>());
        }
        
        task.getAttempts().add(attempt);
      }
      
      for (Task task : job.getTasks())
      {
        if (task.getAttempts().size() > 0)
        {
          // sort attempts by start time
          Collections.sort(task.getAttempts(),new Comparator<Attempt>() {
            @Override
            public int compare(Attempt o1, Attempt o2)
            {
              return o1.getStartTime().compareTo(o2.getStartTime());
            }
          });
          
          boolean foundSuccess = false;
          
          // For simplicity we'll say that all attempts which are not successful are excess.
          // In reality there could be some overlapping successful attempts, but we'll ignore this
          // because it should be rare.
          
          for (Attempt attempt : task.getAttempts())
          { 
            if (attempt.getStartTime() == 0 || attempt.getFinishTime() == 0)
            {
              attempt.setStartTime(null);
              attempt.setFinishTime(null);
              //context.getCounter("Job Parsing", "startTime or finishTime zero").increment(1);
            }
            else
            {
              ((DerivedAttemptData)attempt.getDerived()).setMinutes((attempt.getFinishTime() - attempt.getStartTime())/1000.0/60.0);
            }
            
            if (attempt.getCounters().containsKey(CPU_MILLISECONDS))
            {
              attempt.getDerived().setCpuMinutes(attempt.getCounters().get(CPU_MILLISECONDS)/1000.0/60.0);
            }
            
            if (attempt.getTaskStatus().equals("SUCCESS"))
            {
              ((DerivedAttemptData)attempt.getDerived()).setExcess(false);
              foundSuccess = true;
            }
            else
            {
              ((DerivedAttemptData)attempt.getDerived()).setExcess(true);
            }
          }
          
          // If none were successful then mark the first attempt as the non-excess one.
          if (task.getAttempts().size() > 0 && !foundSuccess)
          {
            ((DerivedAttemptData)task.getAttempts().get(0).getDerived()).setExcess(false);
          }
          
          // sort by task attempt id
          Collections.sort(task.getAttempts(),new Comparator<Attempt>() {
            @Override
            public int compare(Attempt o1, Attempt o2)
            {
              return o1.getTaskAttemptId().toString().compareTo(o2.getTaskAttemptId().toString());
            }
          });
        }
      }
    }
  }
}