/*
* Copyright 2012 LinkedIn, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.linkedin.whiteelephant.util;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.TimeZone;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * Builds the list of per-day processing tasks for a cluster's job logs.
 *
 * Log files are looked up with a glob of the form
 * {@code <logsRoot>/<clusterName>/daily/*}{@code /<yyyy>/<MMdd>/*<suffix>} and the
 * output for a day is expected under {@code <outputPathRoot>/<clusterName>/<yyyy>/<MMdd>}.
 */
public class JobStatsProcessing
{
  // The logs are uploaded to directories named according to approximately the GMT time of the job submission.
  private static final TimeZone GMT = TimeZone.getTimeZone("GMT");

  /**
   * Determines which days of logs should be processed.
   *
   * Days are walked backwards starting from yesterday (GMT). A day is scheduled when its
   * input glob matches at least one file and any of the following holds: the run is not
   * incremental, the day is within the first {@code numDaysForced} days, or the day's
   * output path does not exist yet. Progress messages are written to standard output.
   *
   * @param fs file system used to glob inputs and check for existing outputs
   * @param logsRoot root directory of the uploaded logs
   * @param clusterName name of the cluster; used in the input path, output path and task id
   * @param outputPathRoot root directory of the processed output
   * @param suffix file name suffix the input files must end with
   * @param incremental when true, days whose output already exists are skipped (unless forced)
   * @param numDays number of days to look back; raised to {@code numDaysForced} if smaller
   * @param numDaysForced number of most recent days that are processed even if output exists
   * @return the tasks to run, most recent day first
   * @throws IOException if a file system call fails
   */
  public static List<ProcessingTask> getTasks(FileSystem fs, String logsRoot, String clusterName, String outputPathRoot, String suffix, boolean incremental, int numDays, int numDaysForced) throws IOException
  {
    Calendar cal = Calendar.getInstance(GMT);

    SimpleDateFormat yearFormat = new SimpleDateFormat("yyyy");
    SimpleDateFormat dayFormat = new SimpleDateFormat("MMdd");
    SimpleDateFormat idFormat = new SimpleDateFormat("yyyy-MM-dd");

    yearFormat.setTimeZone(GMT);
    dayFormat.setTimeZone(GMT);
    idFormat.setTimeZone(GMT);

    List<ProcessingTask> processingTasks = new ArrayList<ProcessingTask>();

    // Forced days must always be covered, even if the caller asked for fewer days.
    numDays = Math.max(numDays, numDaysForced);

    // Start processing previous day of data since current day isn't yet finished. Unless we are aggregating hourly data there is no point.
    cal.add(Calendar.DAY_OF_MONTH, -1);

    int numPaths = 0;
    long grandTotalLength = 0;

    for (int i=0; i<numDays; i++, cal.add(Calendar.DAY_OF_MONTH, -1))
    {
      Date date = cal.getTime();
      String year = yearFormat.format(date);
      String day = dayFormat.format(date);

      String pathFormat = String.format("%s/%s/daily/*/%s/%s/*%s",logsRoot,clusterName,year,day, suffix);
      String outputPathForDay = String.format("%s/%s/%s/%s",outputPathRoot,clusterName,year,day);

      // globStatus may return null rather than an empty array; treat that as "no files".
      FileStatus[] stats = fs.globStatus(new Path(pathFormat));
      if (stats == null || stats.length == 0)
      {
        continue;
      }

      StringBuilder msg = new StringBuilder(pathFormat + " => " + stats.length + " files");

      // Cheap checks first so the file system is only queried when the answer matters.
      boolean shouldProcess = !incremental || i<numDaysForced || !fs.exists(new Path(outputPathForDay));

      if (shouldProcess)
      {
        // Size of this day's input only; the running total is tracked separately so that
        // each task reports its own input size rather than the sum of all earlier days.
        long lengthForDay = 0;
        for (FileStatus stat : stats)
        {
          lengthForDay += stat.getLen();
          numPaths++;
        }
        grandTotalLength += lengthForDay;

        String id = clusterName + "-" + idFormat.format(date);

        System.out.println(msg);

        processingTasks.add(new ProcessingTask(id,pathFormat,outputPathForDay, lengthForDay));
      }
      else
      {
        // Incremental run and the output for this day already exists.
        msg.append(" (skipping)");
        System.out.println(msg);
      }
    }

    System.out.println("Found " + numPaths + " paths to process, totalling " + grandTotalLength + " bytes (" + (grandTotalLength/(1024*1024*1024)) + " gigabytes)");

    return processingTasks;
  }

  /**
   * Describes one day's worth of log input to be processed.
   */
  public static class ProcessingTask
  {
    /** Task identifier, built by {@code getTasks} as {@code <clusterName>-<yyyy-MM-dd>}. */
    public final String id;

    /** Glob pattern matching the input files for this task. */
    public final String inputPathFormat;

    /** Directory the task's output is written to. */
    public final String outputPath;

    /** Combined length in bytes of the input files matched for this task. */
    public final long totalLength;

    /**
     * @param id task identifier
     * @param inputPathFormat glob pattern matching the input files
     * @param outputPath output directory for this task
     * @param totalLength combined length in bytes of the input files
     */
    public ProcessingTask(String id, String inputPathFormat, String outputPath, long totalLength)
    {
      this.id = id;
      this.inputPathFormat = inputPathFormat;
      this.outputPath = outputPath;
      this.totalLength = totalLength;
    }
  }
}