package com.linkedin.camus.etl.kafka.mapred;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.log4j.Logger;
import org.joda.time.format.DateTimeFormatter;
import com.linkedin.camus.etl.Partitioner;
import com.linkedin.camus.etl.RecordWriterProvider;
import com.linkedin.camus.etl.kafka.common.AvroRecordWriterProvider;
import com.linkedin.camus.etl.kafka.common.DateUtils;
import com.linkedin.camus.etl.kafka.common.EtlKey;
import com.linkedin.camus.etl.kafka.partitioner.DefaultPartitioner;
/**
 * An output format that routes each record to a working file derived from its
 * {@link EtlKey} (topic, leader id, Kafka partition, and encoded time partition).
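 *
 * <p>A minimal driver sketch (illustrative; the job setup shown is an assumption,
 * only the {@code EtlMultiOutputFormat} setters are defined by this class):
 * <pre>{@code
 * Job job = Job.getInstance(new Configuration(), "camus-etl");
 * job.setOutputFormatClass(EtlMultiOutputFormat.class);
 * EtlMultiOutputFormat.setDestinationPath(job, new Path("/data/camus"));
 * EtlMultiOutputFormat.setDefaultTimeZone(job, "UTC");
 * EtlMultiOutputFormat.setEtlOutputCodec(job, "deflate");
 * }</pre>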
*/
public class EtlMultiOutputFormat extends FileOutputFormat<EtlKey, Object> {
public static final String ETL_DESTINATION_PATH = "etl.destination.path";
public static final String ETL_DESTINATION_PATH_TOPIC_SUBDIRECTORY = "etl.destination.path.topic.sub.dir";
public static final String ETL_DESTINATION_PATH_TOPIC_SUBDIRFORMAT = "etl.destination.path.topic.sub.dirformat";
public static final String ETL_DESTINATION_PATH_TOPIC_SUBDIRFORMAT_LOCALE = "etl.destination.path.topic.sub.dirformat.locale";
public static final String ETL_RUN_MOVE_DATA = "etl.run.move.data";
public static final String ETL_RUN_TRACKING_POST = "etl.run.tracking.post";
public static final String ETL_DEFAULT_TIMEZONE = "etl.default.timezone";
public static final String ETL_DEFLATE_LEVEL = "etl.deflate.level";
public static final String ETL_AVRO_WRITER_SYNC_INTERVAL = "etl.avro.writer.sync.interval";
public static final String ETL_OUTPUT_FILE_TIME_PARTITION_MINS = "etl.output.file.time.partition.mins";
public static final String KAFKA_MONITOR_TIME_GRANULARITY_MS = "kafka.monitor.time.granularity";
public static final String ETL_DEFAULT_PARTITIONER_CLASS = "etl.partitioner.class";
public static final String ETL_OUTPUT_CODEC = "etl.output.codec";
public static final String ETL_DEFAULT_OUTPUT_CODEC = "deflate";
public static final String ETL_RECORD_WRITER_PROVIDER_CLASS = "etl.record.writer.provider.class";
public static final DateTimeFormatter FILE_DATE_FORMATTER = DateUtils.getDateTimeFormatter("YYYYMMddHH");
public static final String OFFSET_PREFIX = "offsets";
public static final String ERRORS_PREFIX = "errors";
public static final String COUNTS_PREFIX = "counts";
public static final String REQUESTS_FILE = "requests.previous";
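  // The committer and partitioner cache are deliberately static so that the record
  // writer and the output committer in a task JVM share one instance;
  // resetPartitioners() below clears the cache between runs.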
private static EtlMultiOutputCommitter committer = null;
private static Map<String, Partitioner> partitionersByTopic = new HashMap<String, Partitioner>();
private static Logger log = Logger.getLogger(EtlMultiOutputFormat.class);
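  // Lazily creates the shared committer. Unlike getOutputCommitter() below, this path
  // is not synchronized; that is only safe on the assumption that a task attempt
  // requests its record writer from a single thread.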
@Override
public RecordWriter<EtlKey, Object> getRecordWriter(TaskAttemptContext context) throws IOException,
InterruptedException {
if (committer == null)
committer = new EtlMultiOutputCommitter(getOutputPath(context), context, log);
return new EtlMultiOutputRecordWriter(context, committer);
}
@Override
public synchronized OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException {
if (committer == null)
committer = new EtlMultiOutputCommitter(getOutputPath(context), context, log);
return committer;
}
public static void setRecordWriterProviderClass(JobContext job, Class<RecordWriterProvider> recordWriterProviderClass) {
job.getConfiguration().setClass(ETL_RECORD_WRITER_PROVIDER_CLASS, recordWriterProviderClass,
RecordWriterProvider.class);
}
  @SuppressWarnings("unchecked")
  public static Class<RecordWriterProvider> getRecordWriterProviderClass(JobContext job) {
    return (Class<RecordWriterProvider>) job.getConfiguration().getClass(ETL_RECORD_WRITER_PROVIDER_CLASS,
        AvroRecordWriterProvider.class);
  }
  public static RecordWriterProvider getRecordWriterProvider(JobContext job) {
    try {
      // Instantiate via the same lookup as getRecordWriterProviderClass (default: Avro).
      return getRecordWriterProviderClass(job).newInstance();
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
public static void setDefaultTimeZone(JobContext job, String tz) {
job.getConfiguration().set(ETL_DEFAULT_TIMEZONE, tz);
}
public static String getDefaultTimeZone(JobContext job) {
return job.getConfiguration().get(ETL_DEFAULT_TIMEZONE, "America/Los_Angeles");
}
public static void setDestinationPath(JobContext job, Path dest) {
job.getConfiguration().set(ETL_DESTINATION_PATH, dest.toString());
}
public static Path getDestinationPath(JobContext job) {
return new Path(job.getConfiguration().get(ETL_DESTINATION_PATH));
}
public static void setDestPathTopicSubDir(JobContext job, String subPath) {
job.getConfiguration().set(ETL_DESTINATION_PATH_TOPIC_SUBDIRECTORY, subPath);
}
public static Path getDestPathTopicSubDir(JobContext job) {
return new Path(job.getConfiguration().get(ETL_DESTINATION_PATH_TOPIC_SUBDIRECTORY, "hourly"));
}
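  // Note: despite the _MS suffix on the constant, kafka.monitor.time.granularity is
  // stored in minutes; getMonitorTimeGranularityMs() converts it to milliseconds.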
public static void setMonitorTimeGranularityMins(JobContext job, int mins) {
job.getConfiguration().setInt(KAFKA_MONITOR_TIME_GRANULARITY_MS, mins);
}
public static int getMonitorTimeGranularityMins(JobContext job) {
return job.getConfiguration().getInt(KAFKA_MONITOR_TIME_GRANULARITY_MS, 10);
}
public static long getMonitorTimeGranularityMs(JobContext job) {
return job.getConfiguration().getInt(KAFKA_MONITOR_TIME_GRANULARITY_MS, 10) * 60000L;
}
public static void setEtlAvroWriterSyncInterval(JobContext job, int val) {
job.getConfiguration().setInt(ETL_AVRO_WRITER_SYNC_INTERVAL, val);
}
public static int getEtlAvroWriterSyncInterval(JobContext job) {
return job.getConfiguration().getInt(ETL_AVRO_WRITER_SYNC_INTERVAL, 16000);
}
public static void setEtlDeflateLevel(JobContext job, int val) {
job.getConfiguration().setInt(ETL_DEFLATE_LEVEL, val);
}
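  // The codec name is interpreted by the configured RecordWriterProvider; with the
  // default Avro provider, "deflate" (tuned by ETL_DEFLATE_LEVEL above) is the default,
  // and "snappy" is a plausible alternative where the Avro writer supports it (an
  // assumption about the deployed provider, not something this class enforces).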
public static void setEtlOutputCodec(JobContext job, String codec) {
job.getConfiguration().set(ETL_OUTPUT_CODEC, codec);
}
public static String getEtlOutputCodec(JobContext job) {
return job.getConfiguration().get(ETL_OUTPUT_CODEC, ETL_DEFAULT_OUTPUT_CODEC);
}
public static int getEtlDeflateLevel(JobContext job) {
return job.getConfiguration().getInt(ETL_DEFLATE_LEVEL, 6);
}
public static int getEtlOutputFileTimePartitionMins(JobContext job) {
return job.getConfiguration().getInt(ETL_OUTPUT_FILE_TIME_PARTITION_MINS, 60);
}
public static void setEtlOutputFileTimePartitionMins(JobContext job, int val) {
job.getConfiguration().setInt(ETL_OUTPUT_FILE_TIME_PARTITION_MINS, val);
}
public static boolean isRunMoveData(JobContext job) {
return job.getConfiguration().getBoolean(ETL_RUN_MOVE_DATA, true);
}
public static void setRunMoveData(JobContext job, boolean value) {
job.getConfiguration().setBoolean(ETL_RUN_MOVE_DATA, value);
}
public static boolean isRunTrackingPost(JobContext job) {
return job.getConfiguration().getBoolean(ETL_RUN_TRACKING_POST, false);
}
public static void setRunTrackingPost(JobContext job, boolean value) {
job.getConfiguration().setBoolean(ETL_RUN_TRACKING_POST, value);
}
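  /**
   * Builds the working file name for a record by delegating to the topic's
   * {@link Partitioner}, which combines the encoded partition value with the
   * topic, leader id, and Kafka partition from the key.
   */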
public static String getWorkingFileName(JobContext context, EtlKey key) throws IOException {
Partitioner partitioner = getPartitioner(context, key.getTopic());
return partitioner.getWorkingFileName(context, key.getTopic(), key.getLeaderId(), key.getPartition(),
partitioner.encodePartition(context, key));
}
public static void setDefaultPartitioner(JobContext job, Class<?> cls) {
job.getConfiguration().setClass(ETL_DEFAULT_PARTITIONER_CLASS, cls, Partitioner.class);
}
public static Partitioner getDefaultPartitioner(JobContext job) {
return ReflectionUtils.newInstance(
job.getConfiguration().getClass(ETL_DEFAULT_PARTITIONER_CLASS, DefaultPartitioner.class, Partitioner.class),
job.getConfiguration());
}
  public static Partitioner getPartitioner(JobContext job, String topicName) throws IOException {
    String customPartitionerProperty = ETL_DEFAULT_PARTITIONER_CLASS + "." + topicName;
    if (partitionersByTopic.get(customPartitionerProperty) == null) {
      // Instantiate the topic-specific partitioner configured under
      // etl.partitioner.class.<topicName>, falling back to the job-wide default
      // when none is configured.
      List<Partitioner> partitioners =
          job.getConfiguration().getInstances(customPartitionerProperty, Partitioner.class);
      if (partitioners.isEmpty()) {
        return getDefaultPartitioner(job);
      } else {
        partitionersByTopic.put(customPartitionerProperty, partitioners.get(0));
      }
    }
    return partitionersByTopic.get(customPartitionerProperty);
  }
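  // Per-topic override example (topic and class names are illustrative):
  //   etl.partitioner.class.page_views=com.example.HourlyPartitioner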
public static void resetPartitioners() {
partitionersByTopic = new HashMap<String, Partitioner>();
}
}