package org.openstack.atlas.logs.hadoop.jobs;
import com.hadoop.mapreduce.LzoTextInputFormat;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.joda.time.DateTime;
import org.openstack.atlas.logs.hadoop.comparators.LogGroupComparator;
import org.openstack.atlas.logs.hadoop.comparators.LogSortComparator;
import org.openstack.atlas.logs.hadoop.counters.CounterUtils;
import org.openstack.atlas.logs.hadoop.counters.LogCounters;
import org.openstack.atlas.logs.hadoop.mappers.LogMapper;
import org.openstack.atlas.logs.hadoop.partitioners.LogPartitioner;
import org.openstack.atlas.logs.hadoop.reducers.LogReducer;
import org.openstack.atlas.logs.hadoop.writables.LogMapperOutputKey;
import org.openstack.atlas.logs.hadoop.writables.LogMapperOutputValue;
import org.openstack.atlas.logs.hadoop.writables.LogReducerOutputKey;
import org.openstack.atlas.logs.hadoop.writables.LogReducerOutputValue;
import org.openstack.atlas.util.staticutils.StaticDateTimeUtils;
import org.openstack.atlas.util.staticutils.StaticFileUtils;
import org.openstack.atlas.util.staticutils.StaticStringUtils;
import org.openstack.atlas.util.common.VerboseLogger;
/**
 * Hadoop driver that splits hourly LZO-compressed load-balancer access logs
 * into per-loadbalancer zip files via the LogMapper/LogReducer pipeline.
 *
 * Expected arguments (see usage message in {@link #run(String[])}):
 *   jarPath outDir histDir fileHour nReducers userName lzoFiles...
 */
public class HadoopLogSplitterJob extends HadoopJob {

    private static final VerboseLogger vlog = new VerboseLogger(HadoopLogSplitterJob.class);
    private static final Log LOG = LogFactory.getLog(HadoopLogSplitterJob.class);

    /**
     * Configures and synchronously runs the log-splitter MapReduce job.
     *
     * @param args positional arguments as described in the usage message;
     *             anything from index 6 onward is treated as an input LZO file
     * @return 0 on job success, -1 on failure or bad arguments
     * @throws Exception propagated from job configuration/submission
     *                   (IOException, ClassNotFoundException, InterruptedException, ...)
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 6) {
            vlog.log("usage is <jarPath> <outDir> <histDir> <fileHour> <nReducers> <userName> <lzoFiles...>");
            return -1;
        }
        Path jarPath = new Path(args[0]);
        String outDir = args[1];
        // NOTE(review): histDir is accepted but currently unused — the line that
        // consumed it ("hadoop.jop.history.user.location") is commented out below.
        // Kept so the CLI contract (argument positions) stays stable.
        String histDir = args[2];
        String fileHour = args[3];
        int nReducers = Integer.parseInt(args[4]) + 1; // The extra reducer is for the unknown.zip file
        String userName = args[5];
        List<String> lzoFiles = new ArrayList<String>();
        for (int i = 6; i < args.length; i++) {
            lzoFiles.add(args[i]);
        }
        Job job = new Job(conf);
        // NOTE(review): this property is set after the Job is constructed; it appears
        // to rely on submit-time user resolution. Confirm it should not precede
        // new Job(conf) — the explicit FileSystem.get(..., userName) below does not
        // depend on it.
        System.setProperty("HADOOP_USER_NAME", userName);
        DateTime dt = StaticDateTimeUtils.nowDateTime(true);
        long dateOrd = StaticDateTimeUtils.dateTimeToOrdinalMillis(dt);
        // Job name encodes the hour being processed and a timestamp ordinal so
        // concurrent/repeated runs are distinguishable in the job tracker.
        String jobName = "LB_STATS" + ":" + fileHour + ":" + dateOrd;
        vlog.log(String.format("jobName=%s", jobName));
        job.setJarByClass(HadoopLogSplitterJob.class);
        job.setJobName(jobName);
        String hdfsZipDir = StaticFileUtils.joinPath(outDir, "zips");
        // Pass job-scoped settings to mappers/reducers through the Configuration.
        job.getConfiguration().set("fileHour", fileHour);
        job.getConfiguration().set("hdfs_user_name", userName);
        job.getConfiguration().set("hdfs_zip_dir", hdfsZipDir);
        URI defaultHdfsUri = FileSystem.getDefaultUri(conf);
        FileSystem fs = FileSystem.get(defaultHdfsUri, conf, userName);
        // Ship the job jar to the task classpath via the distributed cache.
        //DistributedCache.addCacheFile(jarPath.toUri(), job.getConfiguration());
        DistributedCache.addFileToClassPath(jarPath, job.getConfiguration(), fs);
        //DistributedCache.createSymlink(job.getConfiguration());
        vlog.log(String.format("jobJar = %s", job.getJar()));
        job.setMapperClass(LogMapper.class);
        job.setMapOutputKeyClass(LogMapperOutputKey.class);
        job.setMapOutputValueClass(LogMapperOutputValue.class);
        job.setReducerClass(LogReducer.class);
        job.setOutputKeyClass(LogReducerOutputKey.class);
        job.setOutputValueClass(LogReducerOutputValue.class);
        // Custom partitioner/sort/grouping route each loadbalancer's entries,
        // in order, to a single reduce call.
        job.setPartitionerClass(LogPartitioner.class);
        job.setSortComparatorClass(LogSortComparator.class);
        job.setGroupingComparatorClass(LogGroupComparator.class);
        job.setInputFormatClass(LzoTextInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        //job.getConfiguration().set("hadoop.jop.history.user.location", histDir);
        for (String lzoFileName : lzoFiles) {
            LzoTextInputFormat.addInputPath(job, new Path(lzoFileName));
        }
        FileOutputFormat.setOutputPath(job, new Path(outDir));
        // Compress intermediate map output with LZOP. asSubclass both fixes the
        // raw-type usage and fails fast with ClassCastException if the named
        // class is not a CompressionCodec.
        String codecClassName = "com.hadoop.compression.lzo.LzopCodec";
        Class<? extends CompressionCodec> codecClass =
                Class.forName(codecClassName).asSubclass(CompressionCodec.class);
        job.getConfiguration().setClass("mapred.map.output.compression.codec", codecClass, CompressionCodec.class);
        job.getConfiguration().setBoolean("mapred.compress.map.output", true);
        job.setNumReduceTasks(nReducers);
        int exit;
        // Block until the job completes; true => stream progress to the client log.
        if (job.waitForCompletion(true)) {
            exit = 0;
            vlog.log("LogSplitter job finished");
        } else {
            exit = -1;
            vlog.log("LogSplitter job failed");
        }
        vlog.log(String.format("%s\n", CounterUtils.showCounters(job, LogCounters.values())));
        return exit;
    }
}