/* * Copyright 2012 LinkedIn, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package com.linkedin.whiteelephant.parsing; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.TimeZone; import java.util.concurrent.ExecutionException; import org.apache.avro.Schema; import org.apache.avro.Schema.Type; import org.apache.avro.generic.GenericData; import org.apache.avro.mapred.AvroKey; import org.apache.avro.mapred.AvroValue; import org.apache.avro.mapred.AvroWrapper; import org.apache.avro.mapreduce.AvroJob; import org.apache.avro.mapreduce.AvroKeyValueOutputFormat; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit; import org.apache.log4j.Logger; import org.json.JSONObject; import com.google.common.base.Predicate; import com.google.common.collect.Collections2; import com.linkedin.whiteelephant.parsing.Attempt; import com.linkedin.whiteelephant.parsing.DerivedAttemptData; import com.linkedin.whiteelephant.parsing.Job; import com.linkedin.whiteelephant.parsing.LogData; import com.linkedin.whiteelephant.parsing.Task; import com.linkedin.whiteelephant.mapreduce.MyAvroMultipleOutputs; import com.linkedin.whiteelephant.mapreduce.lib.input.CombinedTextInputFormat; import com.linkedin.whiteelephant.mapreduce.lib.job.StagedOutputJob; import com.linkedin.whiteelephant.mapreduce.lib.job.StagedOutputJobExecutor; import com.linkedin.whiteelephant.parsing.LineParsing; import com.linkedin.whiteelephant.util.JobStatsProcessing; public class ParseJobsFromLogs { private static final String CPU_MILLISECONDS = "CPU_MILLISECONDS"; private final Logger _log; private final Properties _props; private final FileSystem _fs; private final String _name; private final String _jobsOutputPathRoot; private final String _logsRoot; private final String _clusterNames; private final int _numDays; private final int _numDaysForced; private final boolean _incremental; public ParseJobsFromLogs(String name, Properties props) throws IOException { _log = Logger.getLogger(name); _name = name; _props = props; Configuration conf = StagedOutputJob.createConfigurationFromProps(_props); System.out.println("fs.default.name: " + conf.get("fs.default.name")); _fs = FileSystem.get(conf); if (_props.get("cluster.names") == null) { throw new IllegalArgumentException("cluster.names is not specified."); } if (_props.get("jobs.output.path") == null) { throw new IllegalArgumentException("attempts.output.path is not specified."); } if (_props.get("num.days") == null) { throw new IllegalArgumentException("num.days is not specified"); } if (_props.get("num.days.forced") == null) { throw new IllegalArgumentException("num.days.forced is not specified"); } if (_props.get("incremental") == null) { throw new IllegalArgumentException("incremental is not specified."); } if (_props.get("logs.root") == null) { throw new IllegalArgumentException("logs.root is not specified."); } _jobsOutputPathRoot = (String)_props.get("jobs.output.path"); _logsRoot = (String)_props.get("logs.root"); _clusterNames = (String)_props.get("cluster.names"); _numDays = Integer.parseInt((String)_props.get("num.days")); _numDaysForced = Integer.parseInt((String)_props.get("num.days.forced")); _incremental = Boolean.parseBoolean((String)_props.get("incremental")); } public void execute(StagedOutputJobExecutor executor) throws IOException, InterruptedException, ExecutionException { for (String clusterName : _clusterNames.split(",")) { System.out.println("Processing cluster " + clusterName); List<JobStatsProcessing.ProcessingTask> processingTasks = JobStatsProcessing.getTasks(_fs, _logsRoot, clusterName, _jobsOutputPathRoot, "log", _incremental, _numDays, _numDaysForced); for (JobStatsProcessing.ProcessingTask task : processingTasks) { List<String> inputPaths = new ArrayList<String>(); inputPaths.add(task.inputPathFormat); String outputPath = task.outputPath; final StagedOutputJob job = StagedOutputJob.createStagedJob( _props, _name + "-parse-jobs-" + task.id, inputPaths, "/tmp" + outputPath, outputPath, _log); job.getConfiguration().set("jobs.output.path", _jobsOutputPathRoot); job.getConfiguration().set("logs.cluster.name", clusterName); // 1 reducer per 12 GB of input data long numReduceTasks = (int)Math.ceil(((double)task.totalLength) / 1024 / 1024 / 1024 / 12); job.setOutputKeyClass(BytesWritable.class); job.setOutputValueClass(BytesWritable.class); job.setInputFormatClass(CombinedTextInputFormat.class); job.setOutputFormatClass(AvroKeyValueOutputFormat.class); AvroJob.setOutputKeySchema(job, Schema.create(Type.STRING)); AvroJob.setOutputValueSchema(job, LogData.SCHEMA$); job.setNumReduceTasks((int)numReduceTasks); job.setMapperClass(ParseJobsFromLogs.TheMapper.class); job.setReducerClass(ParseJobsFromLogs.TheReducer.class); AvroJob.setMapOutputKeySchema(job, Schema.create(Type.STRING)); AvroJob.setMapOutputValueSchema(job, LogData.SCHEMA$); MyAvroMultipleOutputs.addNamedOutput(job, "logs", AvroKeyValueOutputFormat.class, Schema.create(Type.STRING), LogData.SCHEMA$); executor.submit(job); } executor.waitForCompletion(); } } public static class TheMapper extends Mapper<LongWritable, Text, AvroWrapper<String>, AvroWrapper<LogData>> { String _clusterName; @Override protected void setup(Context context) { _clusterName = context.getConfiguration().get("logs.cluster.name"); } @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { List<CharSequence> inputSplits = new ArrayList<CharSequence>(); CombineFileSplit fileSplit = (CombineFileSplit) context.getInputSplit(); for (Path path : fileSplit.getPaths()) { inputSplits.add(path.toString()); } String line = value.toString(); Job job = null; Attempt attempt = null; Task task = null; job = LineParsing.tryParseJob(line); if (job == null) { attempt = LineParsing.tryParseAttempt(line); if (attempt == null) { task = LineParsing.tryParseTask(line); } } LogData data = new LogData(); String jobId = null; data.setCluster(_clusterName); try { if (job != null) { // log lines are sometimes truncated, so data may be missing - just ignore these if (job.getJobId() != null) { jobId = job.getJobId().toString(); job.setTasks(new ArrayList<Task>()); data.setEntry(job); data.setPath(findInputSplitForJob(jobId,inputSplits)); context.write(new AvroKey<String>(jobId), new AvroValue<LogData>(data)); } } else if (attempt != null) { // log lines are sometimes truncated, so data may be missing - just ignore these if (attempt.getJobId() != null && attempt.getTaskId() != null && attempt.getTaskAttemptId() != null && attempt.getType() != null) { jobId = attempt.getJobId().toString(); data.setEntry(attempt); data.setPath(findInputSplitForJob(jobId,inputSplits)); context.write(new AvroKey<String>(jobId), new AvroValue<LogData>(data)); } } else if (task != null) { // log lines are sometimes truncated, so data may be missing - just ignore these if (task.getJobId() != null && task.getTaskId() != null && task.getType() != null) { jobId = task.getJobId().toString(); task.setAttempts(new ArrayList<Attempt>()); data.setEntry(task); data.setPath(findInputSplitForJob(jobId,inputSplits)); context.write(new AvroKey<String>(jobId), new AvroValue<LogData>(data)); } } } catch (Exception e) { System.out.println("Exception writing log data: " + e.toString()); if (jobId != null) { System.out.println("jobId: " + jobId); CharSequence path = data.getPath(); if (path != null) { System.out.println("input: " + path); } } throw new IOException(e); } } private CharSequence findInputSplitForJob(String jobId, List<CharSequence> inputSplits) { if (jobId != null) { for (CharSequence inputSplit : inputSplits) { if (inputSplit.toString().contains(jobId)) { return inputSplit; } } } return null; } } public static class TheReducer extends Reducer<AvroKey<String>, AvroValue<LogData>, AvroWrapper<String>, AvroWrapper<LogData>> { private String jobOutputPath; @Override protected void setup(Context context) { jobOutputPath = context.getConfiguration().get("jobs.output.path"); System.out.println("Job output path: " + jobOutputPath); } @Override protected void reduce(AvroKey<String> key, Iterable<AvroValue<LogData>> values, final Context context) throws IOException, InterruptedException { String jobId = key.datum(); String inputPath = null; List<Job> jobEntries = new ArrayList<Job>(); List<Attempt> attemptEntries = new ArrayList<Attempt>(); List<Task> taskEntries = new ArrayList<Task>(); for (AvroValue<LogData> value : values) { inputPath = value.datum().getPath().toString(); if (value.datum().getEntry() instanceof Job) { jobEntries.add(Job.newBuilder((Job)value.datum().getEntry()).build()); } else if (value.datum().getEntry() instanceof Task) { taskEntries.add(Task.newBuilder((Task)value.datum().getEntry()).build()); } else if (value.datum().getEntry() instanceof Attempt) { attemptEntries.add(Attempt.newBuilder((Attempt)value.datum().getEntry()).build()); } } Job job = new Job(); mergeJobEntries(job, jobEntries); mergeTaskEntries(job, taskEntries); mergeTaskAttemptEntries(job, attemptEntries); LogData data = new LogData(); data.setPath(inputPath); data.setEntry(job); try { SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy/MMdd"); dateFormat.setTimeZone(TimeZone.getTimeZone("GMT")); // Date submitTime = new Date(job.getSubmitTime()); // String baseOutputPath = String.format("%s/%s/part",cluster,dateFormat.format(submitTime)); // amos.write("logs", new AvroWrapper<String>(jobId), new AvroWrapper<LogData>(data), baseOutputPath); context.write(new AvroWrapper<String>(jobId), new AvroWrapper<LogData>(data)); } catch (Exception e) { System.out.println("Exception writing log data: " + e.toString()); if (inputPath != null) { System.out.println("input: " + inputPath); } System.out.println("key: " + jobId); try { System.out.println("value:\n" + (new JSONObject(GenericData.get().toString(data)).toString(2))); } catch (Exception e1) { System.out.println("Error generating JSON value: " + e1.toString()); } throw new IOException(e); } } public static void mergeJobEntries(Job parsedJob, List<Job> jobEntries/*, final Context context*/) { for (Job jobLine : jobEntries) { // ignore status unless it's the last one, status actually shows up multiple times as the job passes through various states if (jobLine.getJobStatus() != null && jobLine.getFinishTime() != null) { parsedJob.setJobStatus(jobLine.getJobStatus()); parsedJob.setFinishTime(jobLine.getFinishTime()); } if (jobLine.getUser() != null) { parsedJob.setUser(jobLine.getUser()); } if (jobLine.getJobId() != null) { parsedJob.setJobId(jobLine.getJobId()); } if (jobLine.getUser() != null) { parsedJob.setUser(jobLine.getUser()); } if (jobLine.getJobName() != null) { parsedJob.setJobName(jobLine.getJobName()); } if (jobLine.getJobQueue() != null) { parsedJob.setJobQueue(jobLine.getJobQueue()); } if (jobLine.getSubmitTime() != null) { parsedJob.setSubmitTime(jobLine.getSubmitTime()); } if (jobLine.getLaunchTime() != null) { parsedJob.setLaunchTime(jobLine.getLaunchTime()); } if (jobLine.getTotalMaps() != null) { parsedJob.setTotalMaps(jobLine.getTotalMaps()); } if (jobLine.getTotalReduces() != null) { parsedJob.setTotalReduces(jobLine.getTotalReduces()); } if (jobLine.getFinishedMaps() != null) { parsedJob.setFinishedMaps(jobLine.getFinishedMaps()); } if (jobLine.getFinishedReduces() != null) { parsedJob.setFinishedReduces(jobLine.getFinishedReduces()); } if (jobLine.getFailedMaps() != null) { parsedJob.setFailedMaps(jobLine.getFailedMaps()); } if (jobLine.getFailedReduces() != null) { parsedJob.setFailedReduces(jobLine.getFailedReduces()); } } } public static void mergeTaskEntries(Job job, List<Task> taskEntries) { Map<String,Task> taskIdToTask = new HashMap<String,Task>(); for (Task entry : taskEntries) { String taskId = entry.getTaskId().toString(); if (taskId == null) { throw new RuntimeException("Missing task ID"); } Task task = null; if (!taskIdToTask.containsKey(taskId)) { taskIdToTask.put(taskId,new Task()); } task = taskIdToTask.get(taskId); if (task.getAttempts() == null) { task.setAttempts(new ArrayList<Attempt>()); } if (entry.getTaskId() != null) { task.setTaskId(entry.getTaskId()); } if (entry.getTaskStatus() != null) { task.setTaskStatus(entry.getTaskStatus()); } if (entry.getStartTime() != null) { task.setStartTime(entry.getStartTime()); } if (entry.getFinishTime() != null) { task.setFinishTime(entry.getFinishTime()); } if (entry.getJobId() != null) { task.setJobId(entry.getJobId()); } if (entry.getType() != null) { task.setType(entry.getType()); } } List<Task> tasks = new ArrayList<Task>(taskIdToTask.values()); Collections.sort(tasks, new Comparator<Task>() { @Override public int compare(Task o1, Task o2) { return o1.getTaskId().toString().compareTo(o2.getTaskId().toString()); } }); job.setTasks(tasks); } /** * Merges together attempt data having the same task attempt ID so all the data for an attempt is in a single record. * * @param values * @param context * @return */ public static void mergeTaskAttemptEntries(Job job, Iterable<Attempt> attemptEntries/*, final Context context*/) { // merge together the entries for each task attempt Map<String,Attempt> taskAttemptIdToAttempt = new HashMap<String,Attempt>(); for (Attempt attempt : attemptEntries) { Attempt mergedAttempt; if (!taskAttemptIdToAttempt.containsKey(attempt.getTaskAttemptId().toString())) { mergedAttempt = new Attempt(); mergedAttempt.setCounters(new HashMap<CharSequence,Long>()); mergedAttempt.setDerived(new DerivedAttemptData()); taskAttemptIdToAttempt.put(attempt.getTaskAttemptId().toString(),mergedAttempt); } else { mergedAttempt = taskAttemptIdToAttempt.get(attempt.getTaskAttemptId().toString()); } if (attempt.getType() != null) { mergedAttempt.setType(attempt.getType()); } if (attempt.getJobId() != null) { mergedAttempt.setJobId(attempt.getJobId()); } if (attempt.getTaskId() != null) { mergedAttempt.setTaskId(attempt.getTaskId()); } if (attempt.getTaskAttemptId() != null) { mergedAttempt.setTaskAttemptId(attempt.getTaskAttemptId()); } if (attempt.getStartTime() != null) { // take the later start time in case there are multiple if (mergedAttempt.getStartTime() == null || mergedAttempt.getStartTime() < attempt.getStartTime()) { mergedAttempt.setStartTime(attempt.getStartTime()); } } if (attempt.getFinishTime() != null) { // take the later finish time in case there are multiple if (mergedAttempt.getFinishTime() == null || mergedAttempt.getFinishTime() < attempt.getFinishTime()) { mergedAttempt.setFinishTime(attempt.getFinishTime()); } } if (attempt.getShuffleFinished() != null) { // take the later finish time in case there are multiple if (mergedAttempt.getShuffleFinished() == null || mergedAttempt.getShuffleFinished() < attempt.getShuffleFinished()) { mergedAttempt.setShuffleFinished(attempt.getShuffleFinished()); } } if (attempt.getSortFinished() != null) { // take the later finish time in case there are multiple if (mergedAttempt.getSortFinished() == null || mergedAttempt.getSortFinished() < attempt.getSortFinished()) { mergedAttempt.setSortFinished(attempt.getSortFinished()); } } if (attempt.getTaskStatus() != null) { mergedAttempt.setTaskStatus(attempt.getTaskStatus()); } if (attempt.getCounters() != null && attempt.getCounters().size() > 0) { mergedAttempt.setCounters(attempt.getCounters()); } } // filter out bad data Collection<Attempt> filteredAttempts = Collections2.filter(taskAttemptIdToAttempt.values(), new Predicate<Attempt>() { @Override public boolean apply(Attempt attempt) { if (attempt.getTaskAttemptId() == null) { System.out.println("Did not find task attempt ID"); return false; } if (attempt.getTaskStatus() == null) { // The logs can sometimes be cut off, just count this and hopefully it isn't significant. // The task probably didn't execute in this case. //context.getCounter("Job Parsing", "Missing status").increment(1); System.out.println("Did not find status for attempt " + attempt.getTaskAttemptId()); return false; } if (attempt.getStartTime() == null) { // The logs can sometimes be cut off, just count this and hopefully it isn't significant. // The task probably didn't execute in this case. //context.getCounter("Job Parsing", "Missing startTime").increment(1); System.out.println("Did not find startTime for attempt " + attempt.getTaskAttemptId()); return false; } if (attempt.getFinishTime() == null) { // The logs can sometimes be cut off, just count this and hopefully it isn't significant. // The task probably didn't execute in this case. //context.getCounter("Job Parsing", "Missing finishTime").increment(1); System.out.println("Did not find finishTime for attempt " + attempt.getTaskAttemptId()); return false; } if (attempt.getFinishTime() < attempt.getStartTime()) { //context.getCounter("Job Parsing", "Finish time before start time").increment(1); System.out.println("Finish time is before start for attempt " + attempt.getTaskAttemptId()); return false; } return true; } }); // map to look up task by task id Map<String,Task> taskIdToTask = new HashMap<String,Task>(); for (Task task : job.getTasks()) { taskIdToTask.put(task.getTaskId().toString(), task); } // collect each of the attempts and add to the corresponding task for (Attempt attempt : filteredAttempts) { Task task = taskIdToTask.get(attempt.getTaskId().toString()); if (task == null) { throw new RuntimeException("Could not find task"); } if (task.getAttempts() == null) { task.setAttempts(new ArrayList<Attempt>()); } task.getAttempts().add(attempt); } for (Task task : job.getTasks()) { if (task.getAttempts().size() > 0) { // sort attempts by start time Collections.sort(task.getAttempts(),new Comparator<Attempt>() { @Override public int compare(Attempt o1, Attempt o2) { return o1.getStartTime().compareTo(o2.getStartTime()); } }); boolean foundSuccess = false; // For simplicity we'll say that all attempts which are not successful are excess. // In reality there could be some overlapping successful attempts, but we'll ignore this // because it should be rare. for (Attempt attempt : task.getAttempts()) { if (attempt.getStartTime() == 0 || attempt.getFinishTime() == 0) { attempt.setStartTime(null); attempt.setFinishTime(null); //context.getCounter("Job Parsing", "startTime or finishTime zero").increment(1); } else { ((DerivedAttemptData)attempt.getDerived()).setMinutes((attempt.getFinishTime() - attempt.getStartTime())/1000.0/60.0); } if (attempt.getCounters().containsKey(CPU_MILLISECONDS)) { attempt.getDerived().setCpuMinutes(attempt.getCounters().get(CPU_MILLISECONDS)/1000.0/60.0); } if (attempt.getTaskStatus().equals("SUCCESS")) { ((DerivedAttemptData)attempt.getDerived()).setExcess(false); foundSuccess = true; } else { ((DerivedAttemptData)attempt.getDerived()).setExcess(true); } } // If none were successful then mark the first attempt as the non-excess one. if (task.getAttempts().size() > 0 && !foundSuccess) { ((DerivedAttemptData)task.getAttempts().get(0).getDerived()).setExcess(false); } // sort by task attempt id Collections.sort(task.getAttempts(),new Comparator<Attempt>() { @Override public int compare(Attempt o1, Attempt o2) { return o1.getTaskAttemptId().toString().compareTo(o2.getTaskAttemptId().toString()); } }); } } } } }