package com.linkedin.camus.etl.kafka.mapred;

import com.google.common.base.Strings;
import com.linkedin.camus.coders.CamusWrapper;
import com.linkedin.camus.coders.MessageDecoder;
import com.linkedin.camus.etl.kafka.CamusJob;
import com.linkedin.camus.etl.kafka.coders.KafkaAvroMessageDecoder;
import com.linkedin.camus.etl.kafka.coders.MessageDecoderFactory;
import com.linkedin.camus.etl.kafka.common.EmailClient;
import com.linkedin.camus.etl.kafka.common.EtlKey;
import com.linkedin.camus.etl.kafka.common.EtlRequest;
import com.linkedin.camus.etl.kafka.common.LeaderInfo;
import com.linkedin.camus.workallocater.CamusRequest;
import com.linkedin.camus.workallocater.WorkAllocator;

import java.io.IOException;
import java.net.URI;
import java.security.InvalidParameterException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;

import kafka.api.PartitionOffsetRequestInfo;
import kafka.common.ErrorMapping;
import kafka.common.TopicAndPartition;
import kafka.javaapi.OffsetRequest;
import kafka.javaapi.OffsetResponse;
import kafka.javaapi.PartitionMetadata;
import kafka.javaapi.TopicMetadata;
import kafka.javaapi.TopicMetadataRequest;
import kafka.javaapi.consumer.SimpleConsumer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;


/**
 * Input format for a Kafka pull job.
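 * <p>
 * Fetches topic metadata and the earliest/latest offsets from the Kafka brokers, reconciles them
 * with the offsets persisted by the previous run, and hands the resulting per-partition
 * {@link EtlRequest}s to the configured {@link WorkAllocator} to produce the input splits.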
 */
public class EtlInputFormat extends InputFormat<EtlKey, CamusWrapper> {

  public static final String KAFKA_BLACKLIST_TOPIC = "kafka.blacklist.topics";
  public static final String KAFKA_WHITELIST_TOPIC = "kafka.whitelist.topics";

  public static final String KAFKA_MOVE_TO_LAST_OFFSET_LIST = "kafka.move.to.last.offset.list";
  public static final String KAFKA_MOVE_TO_EARLIEST_OFFSET = "kafka.move.to.earliest.offset";

  public static final String KAFKA_CLIENT_BUFFER_SIZE = "kafka.client.buffer.size";
  public static final String KAFKA_CLIENT_SO_TIMEOUT = "kafka.client.so.timeout";

  public static final String KAFKA_MAX_PULL_HRS = "kafka.max.pull.hrs";
  public static final String KAFKA_MAX_PULL_MINUTES_PER_TASK = "kafka.max.pull.minutes.per.task";
  public static final String KAFKA_MAX_HISTORICAL_DAYS = "kafka.max.historical.days";

  public static final String CAMUS_MESSAGE_DECODER_CLASS = "camus.message.decoder.class";
  public static final String ETL_IGNORE_SCHEMA_ERRORS = "etl.ignore.schema.errors";
  public static final String ETL_AUDIT_IGNORE_SERVICE_TOPIC_LIST = "etl.audit.ignore.service.topic.list";

  public static final String CAMUS_WORK_ALLOCATOR_CLASS = "camus.work.allocator.class";
  public static final String CAMUS_WORK_ALLOCATOR_DEFAULT = "com.linkedin.camus.workallocater.BaseAllocator";

  private static final int BACKOFF_UNIT_MILLISECONDS = 1000;

  public static final int NUM_TRIES_PARTITION_METADATA = 3;
  public static final int NUM_TRIES_FETCH_FROM_LEADER = 3;
  public static final int NUM_TRIES_TOPIC_METADATA = 3;

  public static boolean reportJobFailureDueToOffsetOutOfRange = false;
  public static boolean reportJobFailureUnableToGetOffsetFromKafka = false;
  public static boolean reportJobFailureDueToLeaderNotAvailable = false;

  private static Logger log = null;

  public EtlInputFormat() {
    if (log == null)
      log = Logger.getLogger(getClass());
  }

  public static void setLogger(Logger log) {
    EtlInputFormat.log = log;
  }

  @Override
  public RecordReader<EtlKey, CamusWrapper> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    return new EtlRecordReader(this, split, context);
  }

  /**
   * Gets the metadata from Kafka.
   *
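   * <p>
   * The configured brokers are shuffled and tried in turn; each broker is asked up to
   * {@link #NUM_TRIES_TOPIC_METADATA} times with a randomized backoff before moving on to the
   * next broker, and a RuntimeException is thrown if every broker fails.
   *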
   * @param context the job context
   * @param metaRequestTopics the list of topics to get topicMetadata for; an empty list means
   *          get the TopicMetadata for all topics
   * @return the list of TopicMetadata
   */
  public List<TopicMetadata> getKafkaMetadata(JobContext context, List<String> metaRequestTopics) {
    CamusJob.startTiming("kafkaSetupTime");
    String brokerString = CamusJob.getKafkaBrokers(context);
    if (brokerString.isEmpty())
      throw new InvalidParameterException("kafka.brokers must contain at least one node");
    List<String> brokers = Arrays.asList(brokerString.split("\\s*,\\s*"));
    Collections.shuffle(brokers);
    boolean fetchMetaDataSucceeded = false;
    int i = 0;
    List<TopicMetadata> topicMetadataList = null;
    Exception savedException = null;
    while (i < brokers.size() && !fetchMetaDataSucceeded) {
      SimpleConsumer consumer = createBrokerConsumer(context, brokers.get(i));
      log.info(String.format("Fetching metadata from broker %s with client id %s for %d topic(s) %s", brokers.get(i),
          consumer.clientId(), metaRequestTopics.size(), metaRequestTopics));
      try {
        for (int iter = 0; iter < NUM_TRIES_TOPIC_METADATA; iter++) {
          try {
            topicMetadataList = consumer.send(new TopicMetadataRequest(metaRequestTopics)).topicsMetadata();
            fetchMetaDataSucceeded = true;
            break;
          } catch (Exception e) {
            savedException = e;
            log.warn(String.format(
                "Fetching topic metadata with client id %s for topics [%s] from broker [%s] failed, iter[%s]",
                consumer.clientId(), metaRequestTopics, brokers.get(i), iter), e);
            try {
              Thread.sleep((long) (Math.random() * (iter + 1) * 1000));
            } catch (InterruptedException ex) {
              log.warn("Caught InterruptedException: " + ex);
            }
          }
        }
      } finally {
        consumer.close();
        i++;
      }
    }
    if (!fetchMetaDataSucceeded) {
      throw new RuntimeException("Failed to obtain metadata!", savedException);
    }
    CamusJob.stopTiming("kafkaSetupTime");
    return topicMetadataList;
  }

  private SimpleConsumer createBrokerConsumer(JobContext context, String broker) {
    if (!broker.matches(".+:\\d+"))
      throw new InvalidParameterException("The kafka broker " + broker + " must follow address:port pattern");
    String[] hostPort = broker.split(":");
    return createSimpleConsumer(context, hostPort[0], Integer.valueOf(hostPort[1]));
  }

  public SimpleConsumer createSimpleConsumer(JobContext context, String host, int port) {
    SimpleConsumer consumer =
        new SimpleConsumer(host, port, CamusJob.getKafkaTimeoutValue(context), CamusJob.getKafkaBufferSize(context),
            CamusJob.getKafkaClientName(context));
    return consumer;
  }

  /**
   * Gets the latest offsets and creates the requests as needed.
   *
   * @param context the job context
   * @param offsetRequestInfo the topic partitions to request, grouped by their leader
   * @return one CamusRequest per topic partition, populated with its earliest and latest offsets
   */
  public ArrayList<CamusRequest> fetchLatestOffsetAndCreateEtlRequests(JobContext context,
      HashMap<LeaderInfo, ArrayList<TopicAndPartition>> offsetRequestInfo) {
    ArrayList<CamusRequest> finalRequests = new ArrayList<CamusRequest>();
    for (LeaderInfo leader : offsetRequestInfo.keySet()) {
      SimpleConsumer consumer = createSimpleConsumer(context, leader.getUri().getHost(), leader.getUri().getPort());
      // Latest Offset
      PartitionOffsetRequestInfo partitionLatestOffsetRequestInfo =
          new PartitionOffsetRequestInfo(kafka.api.OffsetRequest.LatestTime(), 1);
      // Earliest Offset
      PartitionOffsetRequestInfo partitionEarliestOffsetRequestInfo =
          new PartitionOffsetRequestInfo(kafka.api.OffsetRequest.EarliestTime(), 1);
      Map<TopicAndPartition, PartitionOffsetRequestInfo> latestOffsetInfo =
          new HashMap<TopicAndPartition, PartitionOffsetRequestInfo>();
      Map<TopicAndPartition, PartitionOffsetRequestInfo> earliestOffsetInfo =
          new HashMap<TopicAndPartition, PartitionOffsetRequestInfo>();
      ArrayList<TopicAndPartition> topicAndPartitions = offsetRequestInfo.get(leader);
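      // Ask this leader for one offset per partition at both LatestTime and EarliestTime.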
      for (TopicAndPartition topicAndPartition : topicAndPartitions) {
        latestOffsetInfo.put(topicAndPartition, partitionLatestOffsetRequestInfo);
        earliestOffsetInfo.put(topicAndPartition, partitionEarliestOffsetRequestInfo);
      }

      OffsetResponse latestOffsetResponse = getLatestOffsetResponse(consumer, latestOffsetInfo, context);
      OffsetResponse earliestOffsetResponse = null;
      if (latestOffsetResponse != null) {
        earliestOffsetResponse = getLatestOffsetResponse(consumer, earliestOffsetInfo, context);
      }
      consumer.close();
      if (earliestOffsetResponse == null) {
        log.warn(generateLogWarnForSkippedTopics(earliestOffsetInfo, consumer));
        reportJobFailureUnableToGetOffsetFromKafka = true;
        continue;
      }

      for (TopicAndPartition topicAndPartition : topicAndPartitions) {
        long latestOffset = latestOffsetResponse.offsets(topicAndPartition.topic(), topicAndPartition.partition())[0];
        long earliestOffset =
            earliestOffsetResponse.offsets(topicAndPartition.topic(), topicAndPartition.partition())[0];

        //TODO: factor out kafka specific request functionality
        CamusRequest etlRequest =
            new EtlRequest(context, topicAndPartition.topic(), Integer.toString(leader.getLeaderId()),
                topicAndPartition.partition(), leader.getUri());
        etlRequest.setLatestOffset(latestOffset);
        etlRequest.setEarliestOffset(earliestOffset);
        finalRequests.add(etlRequest);
      }
    }
    return finalRequests;
  }

  protected OffsetResponse getLatestOffsetResponse(SimpleConsumer consumer,
      Map<TopicAndPartition, PartitionOffsetRequestInfo> offsetInfo, JobContext context) {
    for (int i = 0; i < NUM_TRIES_FETCH_FROM_LEADER; i++) {
      try {
        OffsetResponse offsetResponse =
            consumer.getOffsetsBefore(new OffsetRequest(offsetInfo, kafka.api.OffsetRequest.CurrentVersion(), CamusJob
                .getKafkaClientName(context)));
        if (offsetResponse.hasError()) {
          throw new RuntimeException("offsetResponse has error.");
        }
        return offsetResponse;
      } catch (Exception e) {
        log.warn("Fetching offset from leader " + consumer.host() + ":" + consumer.port() + " has failed " + (i + 1)
            + " time(s). Reason: " + e.getMessage() + " " + (NUM_TRIES_FETCH_FROM_LEADER - i - 1) + " retries left.");
        if (i < NUM_TRIES_FETCH_FROM_LEADER - 1) {
          try {
            Thread.sleep((long) (Math.random() * (i + 1) * 1000));
          } catch (InterruptedException e1) {
            log.error("Caught interrupted exception between retries of getting latest offsets. " + e1.getMessage());
          }
        }
      }
    }
    return null;
  }

  private String generateLogWarnForSkippedTopics(Map<TopicAndPartition, PartitionOffsetRequestInfo> offsetInfo,
      SimpleConsumer consumer) {
    StringBuilder sb = new StringBuilder();
    sb.append("The following topics will be skipped due to failure in fetching latest offsets from leader "
        + consumer.host() + ":" + consumer.port());
    for (TopicAndPartition topicAndPartition : offsetInfo.keySet()) {
      sb.append(" " + topicAndPartition.topic());
    }
    return sb.toString();
  }

  public String createTopicRegEx(HashSet<String> topicsSet) {
    String regex = "";
    StringBuilder stringbuilder = new StringBuilder();
    for (String whiteList : topicsSet) {
      stringbuilder.append(whiteList);
      stringbuilder.append("|");
    }
    regex = "(" + stringbuilder.substring(0, stringbuilder.length() - 1) + ")";
    Pattern.compile(regex);
    return regex;
  }

  public List<TopicMetadata> filterWhitelistTopics(List<TopicMetadata> topicMetadataList,
      HashSet<String> whiteListTopics) {
    ArrayList<TopicMetadata> filteredTopics = new ArrayList<TopicMetadata>();
    String regex = createTopicRegEx(whiteListTopics);
    for (TopicMetadata topicMetadata : topicMetadataList) {
      if (Pattern.matches(regex, topicMetadata.topic())) {
        filteredTopics.add(topicMetadata);
      } else {
        log.info("Discarding topic : " + topicMetadata.topic());
      }
    }
    return filteredTopics;
  }
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    CamusJob.startTiming("getSplits");
    ArrayList<CamusRequest> finalRequests;
    HashMap<LeaderInfo, ArrayList<TopicAndPartition>> offsetRequestInfo =
        new HashMap<LeaderInfo, ArrayList<TopicAndPartition>>();
    try {
      // Get Metadata for all topics
      List<TopicMetadata> topicMetadataList = getKafkaMetadata(context, new ArrayList<String>());

      // Filter any white list topics
      HashSet<String> whiteListTopics = new HashSet<String>(Arrays.asList(getKafkaWhitelistTopic(context)));
      if (!whiteListTopics.isEmpty()) {
        topicMetadataList = filterWhitelistTopics(topicMetadataList, whiteListTopics);
      }

      // Filter all blacklist topics
      HashSet<String> blackListTopics = new HashSet<String>(Arrays.asList(getKafkaBlacklistTopic(context)));
      String regex = "";
      if (!blackListTopics.isEmpty()) {
        regex = createTopicRegEx(blackListTopics);
      }

      for (TopicMetadata topicMetadata : topicMetadataList) {
        if (Pattern.matches(regex, topicMetadata.topic())) {
          log.info("Discarding topic (blacklisted): " + topicMetadata.topic());
        } else if (!createMessageDecoder(context, topicMetadata.topic())) {
          log.info("Discarding topic (Decoder generation failed) : " + topicMetadata.topic());
        } else if (topicMetadata.errorCode() != ErrorMapping.NoError()) {
          log.info("Skipping the creation of ETL request for Whole Topic : " + topicMetadata.topic() + " Exception : "
              + ErrorMapping.exceptionFor(topicMetadata.errorCode()));
        } else {
          for (PartitionMetadata partitionMetadata : topicMetadata.partitionsMetadata()) {
            // We only care about LeaderNotAvailableCode error on partitionMetadata level
            // Error codes such as ReplicaNotAvailableCode should not stop us.
            partitionMetadata =
                this.refreshPartitionMetadataOnLeaderNotAvailable(partitionMetadata, topicMetadata, context,
                    NUM_TRIES_PARTITION_METADATA);
            if (partitionMetadata.errorCode() == ErrorMapping.LeaderNotAvailableCode()) {
              log.info("Skipping the creation of ETL request for Topic : " + topicMetadata.topic()
                  + " and Partition : " + partitionMetadata.partitionId() + " Exception : "
                  + ErrorMapping.exceptionFor(partitionMetadata.errorCode()));
              reportJobFailureDueToLeaderNotAvailable = true;
            } else {
              if (partitionMetadata.errorCode() != ErrorMapping.NoError()) {
                log.warn("Received a non-fatal error code, continuing the creation of ETL request for Topic : "
                    + topicMetadata.topic() + " and Partition : " + partitionMetadata.partitionId() + " Exception : "
                    + ErrorMapping.exceptionFor(partitionMetadata.errorCode()));
              }
              LeaderInfo leader =
                  new LeaderInfo(new URI("tcp://" + partitionMetadata.leader().getConnectionString()),
                      partitionMetadata.leader().id());
              if (offsetRequestInfo.containsKey(leader)) {
                ArrayList<TopicAndPartition> topicAndPartitions = offsetRequestInfo.get(leader);
                topicAndPartitions.add(new TopicAndPartition(topicMetadata.topic(), partitionMetadata.partitionId()));
                offsetRequestInfo.put(leader, topicAndPartitions);
              } else {
                ArrayList<TopicAndPartition> topicAndPartitions = new ArrayList<TopicAndPartition>();
                topicAndPartitions.add(new TopicAndPartition(topicMetadata.topic(), partitionMetadata.partitionId()));
                offsetRequestInfo.put(leader, topicAndPartitions);
              }
            }
          }
        }
      }
    } catch (Exception e) {
      log.error("Unable to pull requests from Kafka brokers. Exiting the program", e);
      throw new IOException("Unable to pull requests from Kafka brokers.", e);
    }

    // Get the latest offsets and generate the EtlRequests
    finalRequests = fetchLatestOffsetAndCreateEtlRequests(context, offsetRequestInfo);

    Collections.sort(finalRequests, new Comparator<CamusRequest>() {
      @Override
      public int compare(CamusRequest r1, CamusRequest r2) {
        return r1.getTopic().compareTo(r2.getTopic());
      }
    });

    writeRequests(finalRequests, context);
    Map<CamusRequest, EtlKey> offsetKeys = getPreviousOffsets(FileInputFormat.getInputPaths(context), context);
    Set<String> moveLatest = getMoveToLatestTopicsSet(context);
    String camusRequestEmailMessage = "";
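    // Reconcile each request with the offsets persisted by the previous run: optionally jump to
    // the latest offset, restore the stored offset and average message size, and sanity-check
    // that the stored offset still falls inside the partition's current offset range.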
    for (CamusRequest request : finalRequests) {
      if (moveLatest.contains(request.getTopic()) || moveLatest.contains("all")) {
        log.info("Moving to latest for topic: " + request.getTopic());
        //TODO: factor out kafka specific request functionality
        EtlKey oldKey = offsetKeys.get(request);
        EtlKey newKey =
            new EtlKey(request.getTopic(), ((EtlRequest) request).getLeaderId(), request.getPartition(), 0,
                request.getLastOffset());

        if (oldKey != null)
          newKey.setMessageSize(oldKey.getMessageSize());

        offsetKeys.put(request, newKey);
      }

      EtlKey key = offsetKeys.get(request);

      if (key != null) {
        request.setOffset(key.getOffset());
        request.setAvgMsgSize(key.getMessageSize());
      }

      if (request.getEarliestOffset() > request.getOffset() || request.getOffset() > request.getLastOffset()) {
        if (request.getEarliestOffset() > request.getOffset()) {
          log.error("The earliest offset was found to be more than the current offset: " + request);
        } else {
          log.error("The current offset was found to be more than the latest offset: " + request);
        }

        boolean move_to_earliest_offset = context.getConfiguration().getBoolean(KAFKA_MOVE_TO_EARLIEST_OFFSET, false);
        boolean offsetUnset = request.getOffset() == EtlRequest.DEFAULT_OFFSET;
        log.info("move_to_earliest: " + move_to_earliest_offset + " offset_unset: " + offsetUnset);
        // When the offset is unset, this is a new topic/partition, so we also start consuming
        // from the earliest offset.
        if (move_to_earliest_offset || offsetUnset) {
          log.error("Moving to the earliest offset available");
          request.setOffset(request.getEarliestOffset());
          offsetKeys.put(
              request,
              //TODO: factor out kafka specific request functionality
              new EtlKey(request.getTopic(), ((EtlRequest) request).getLeaderId(), request.getPartition(), 0, request
                  .getOffset()));
        } else {
          log.error("Offset range from kafka metadata is outside the previously persisted offset, " + request + "\n"
              + " Topic " + request.getTopic() + " will be skipped.\n"
              + " Please check whether kafka cluster configuration is correct."
              + " You can also specify config parameter: " + KAFKA_MOVE_TO_EARLIEST_OFFSET
              + " to start processing from earliest kafka metadata offset.");
          reportJobFailureDueToOffsetOutOfRange = true;
        }
      } else if (3 * (request.getOffset() - request.getEarliestOffset()) < request.getLastOffset()
          - request.getOffset()) {
        camusRequestEmailMessage +=
            "The current offset is too close to the earliest offset, Camus might be falling behind: " + request + "\n";
      }

      log.info(request);
    }

    if (!Strings.isNullOrEmpty(camusRequestEmailMessage)) {
      EmailClient.sendEmail(camusRequestEmailMessage);
    }

    writePrevious(offsetKeys.values(), context);

    CamusJob.stopTiming("getSplits");
    CamusJob.startTiming("hadoop");
    CamusJob.setTime("hadoop_start");

    WorkAllocator allocator = getWorkAllocator(context);
    Properties props = new Properties();
    props.putAll(context.getConfiguration().getValByRegex(".*"));

    allocator.init(props);

    return allocator.allocateWork(finalRequests, context);
  }

  private Set<String> getMoveToLatestTopicsSet(JobContext context) {
    Set<String> topics = new HashSet<String>();

    String[] arr = getMoveToLatestTopics(context);

    if (arr != null) {
      for (String topic : arr) {
        topics.add(topic);
      }
    }

    return topics;
  }

  private boolean createMessageDecoder(JobContext context, String topic) {
    try {
      MessageDecoderFactory.createMessageDecoder(context, topic);
      return true;
    } catch (Exception e) {
      log.error("failed to create decoder", e);
      return false;
    }
  }

  private void writePrevious(Collection<EtlKey> missedKeys, JobContext context) throws IOException {
    FileSystem fs = FileSystem.get(context.getConfiguration());
    Path output = FileOutputFormat.getOutputPath(context);

    if (!fs.exists(output)) {
      fs.mkdirs(output);
    }

    output = new Path(output, EtlMultiOutputFormat.OFFSET_PREFIX + "-previous");
    SequenceFile.Writer writer =
        SequenceFile.createWriter(fs, context.getConfiguration(), output, EtlKey.class, NullWritable.class);

    for (EtlKey key : missedKeys) {
      writer.append(key, NullWritable.get());
    }

    writer.close();
  }

  protected void writeRequests(List<CamusRequest> requests, JobContext context) throws IOException {
    FileSystem fs = FileSystem.get(context.getConfiguration());
    Path output = FileOutputFormat.getOutputPath(context);

    if (!fs.exists(output)) {
      fs.mkdirs(output);
    }

    output = new Path(output, EtlMultiOutputFormat.REQUESTS_FILE);
    SequenceFile.Writer writer =
        SequenceFile.createWriter(fs, context.getConfiguration(), output, EtlRequest.class, NullWritable.class);

    for (CamusRequest r : requests) {
      //TODO: factor out kafka specific request functionality
      writer.append(r, NullWritable.get());
    }
    writer.close();
  }
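  /**
   * Reads the offset files written by previous executions from the given input paths and returns,
   * for every topic partition seen, the EtlKey carrying the highest committed offset.
   */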
  private Map<CamusRequest, EtlKey> getPreviousOffsets(Path[] inputs, JobContext context) throws IOException {
    Map<CamusRequest, EtlKey> offsetKeysMap = new HashMap<CamusRequest, EtlKey>();
    for (Path input : inputs) {
      FileSystem fs = input.getFileSystem(context.getConfiguration());
      for (FileStatus f : fs.listStatus(input, new OffsetFileFilter())) {
        log.info("previous offset file:" + f.getPath().toString());
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), context.getConfiguration());
        EtlKey key = new EtlKey();
        while (reader.next(key, NullWritable.get())) {
          //TODO: factor out kafka specific request functionality
          CamusRequest request = new EtlRequest(context, key.getTopic(), key.getLeaderId(), key.getPartition());
          if (offsetKeysMap.containsKey(request)) {
            EtlKey oldKey = offsetKeysMap.get(request);
            if (oldKey.getOffset() < key.getOffset()) {
              offsetKeysMap.put(request, key);
            }
          } else {
            offsetKeysMap.put(request, key);
          }
          key = new EtlKey();
        }
        reader.close();
      }
    }
    return offsetKeysMap;
  }
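  /**
   * Re-fetches the topic metadata for the given partition while its error code is
   * LeaderNotAvailableCode, trying up to numTries times with a randomized backoff, and returns
   * the refreshed PartitionMetadata (or the original one if the leader never became available).
   */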
  public PartitionMetadata refreshPartitionMetadataOnLeaderNotAvailable(PartitionMetadata partitionMetadata,
      TopicMetadata topicMetadata, JobContext context, int numTries) throws InterruptedException {
    int tryCounter = 0;
    while (tryCounter < numTries && partitionMetadata.errorCode() == ErrorMapping.LeaderNotAvailableCode()) {
      log.info("Retrying to refresh the topicMetadata on LeaderNotAvailable...");
      List<TopicMetadata> topicMetadataList =
          this.getKafkaMetadata(context, Collections.singletonList(topicMetadata.topic()));
      if (topicMetadataList == null || topicMetadataList.size() == 0) {
        log.warn("The topicMetadataList for topic " + topicMetadata.topic() + " is empty.");
      } else {
        topicMetadata = topicMetadataList.get(0);
        boolean partitionFound = false;
        for (PartitionMetadata metadataPerPartition : topicMetadata.partitionsMetadata()) {
          if (metadataPerPartition.partitionId() == partitionMetadata.partitionId()) {
            partitionFound = true;
            if (metadataPerPartition.errorCode() != ErrorMapping.LeaderNotAvailableCode()) {
              return metadataPerPartition;
            } else { //retry again.
              if (tryCounter < numTries - 1) {
                Thread.sleep((long) (Math.random() * (tryCounter + 1) * BACKOFF_UNIT_MILLISECONDS));
              }
              break;
            }
          }
        }
        if (!partitionFound) {
          log.error("No matching partition found in the topicMetadata for Partition: "
              + partitionMetadata.partitionId());
        }
      }
      tryCounter++;
    }
    return partitionMetadata;
  }

  public static void setWorkAllocator(JobContext job, Class<WorkAllocator> val) {
    job.getConfiguration().setClass(CAMUS_WORK_ALLOCATOR_CLASS, val, WorkAllocator.class);
  }
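  /**
   * Instantiates the {@link WorkAllocator} configured under CAMUS_WORK_ALLOCATOR_CLASS, falling
   * back to the default BaseAllocator when none is set.
   */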
  public static WorkAllocator getWorkAllocator(JobContext job) {
    try {
      return (WorkAllocator) job.getConfiguration()
          .getClass(CAMUS_WORK_ALLOCATOR_CLASS, Class.forName(CAMUS_WORK_ALLOCATOR_DEFAULT)).newInstance();
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  public static void setMoveToLatestTopics(JobContext job, String val) {
    job.getConfiguration().set(KAFKA_MOVE_TO_LAST_OFFSET_LIST, val);
  }

  public static String[] getMoveToLatestTopics(JobContext job) {
    return job.getConfiguration().getStrings(KAFKA_MOVE_TO_LAST_OFFSET_LIST);
  }

  public static void setKafkaClientBufferSize(JobContext job, int val) {
    job.getConfiguration().setInt(KAFKA_CLIENT_BUFFER_SIZE, val);
  }

  public static int getKafkaClientBufferSize(JobContext job) {
    return job.getConfiguration().getInt(KAFKA_CLIENT_BUFFER_SIZE, 2 * 1024 * 1024);
  }

  public static void setKafkaClientTimeout(JobContext job, int val) {
    job.getConfiguration().setInt(KAFKA_CLIENT_SO_TIMEOUT, val);
  }

  public static int getKafkaClientTimeout(JobContext job) {
    return job.getConfiguration().getInt(KAFKA_CLIENT_SO_TIMEOUT, 60000);
  }

  public static void setKafkaMaxPullHrs(JobContext job, int val) {
    job.getConfiguration().setInt(KAFKA_MAX_PULL_HRS, val);
  }

  public static int getKafkaMaxPullHrs(JobContext job) {
    return job.getConfiguration().getInt(KAFKA_MAX_PULL_HRS, -1);
  }

  public static void setKafkaMaxPullMinutesPerTask(JobContext job, int val) {
    job.getConfiguration().setInt(KAFKA_MAX_PULL_MINUTES_PER_TASK, val);
  }

  public static int getKafkaMaxPullMinutesPerTask(JobContext job) {
    return job.getConfiguration().getInt(KAFKA_MAX_PULL_MINUTES_PER_TASK, -1);
  }

  public static void setKafkaMaxHistoricalDays(JobContext job, int val) {
    job.getConfiguration().setInt(KAFKA_MAX_HISTORICAL_DAYS, val);
  }

  public static int getKafkaMaxHistoricalDays(JobContext job) {
    return job.getConfiguration().getInt(KAFKA_MAX_HISTORICAL_DAYS, -1);
  }

  public static void setKafkaBlacklistTopic(JobContext job, String val) {
    job.getConfiguration().set(KAFKA_BLACKLIST_TOPIC, val);
  }

  public static String[] getKafkaBlacklistTopic(JobContext job) {
    return getKafkaBlacklistTopic(job.getConfiguration());
  }

  public static String[] getKafkaBlacklistTopic(Configuration conf) {
    final String blacklistStr = conf.get(KAFKA_BLACKLIST_TOPIC);
    if (blacklistStr != null && !blacklistStr.isEmpty()) {
      return conf.getStrings(KAFKA_BLACKLIST_TOPIC);
    } else {
      return new String[] {};
    }
  }

  public static void setKafkaWhitelistTopic(JobContext job, String val) {
    job.getConfiguration().set(KAFKA_WHITELIST_TOPIC, val);
  }

  public static String[] getKafkaWhitelistTopic(JobContext job) {
    return getKafkaWhitelistTopic(job.getConfiguration());
  }

  public static String[] getKafkaWhitelistTopic(Configuration conf) {
    final String whitelistStr = conf.get(KAFKA_WHITELIST_TOPIC);
    if (whitelistStr != null && !whitelistStr.isEmpty()) {
      return conf.getStrings(KAFKA_WHITELIST_TOPIC);
    } else {
      return new String[] {};
    }
  }

  public static void setEtlIgnoreSchemaErrors(JobContext job, boolean val) {
    job.getConfiguration().setBoolean(ETL_IGNORE_SCHEMA_ERRORS, val);
  }

  public static boolean getEtlIgnoreSchemaErrors(JobContext job) {
    return job.getConfiguration().getBoolean(ETL_IGNORE_SCHEMA_ERRORS, false);
  }

  public static void setEtlAuditIgnoreServiceTopicList(JobContext job, String topics) {
    job.getConfiguration().set(ETL_AUDIT_IGNORE_SERVICE_TOPIC_LIST, topics);
  }

  public static String[] getEtlAuditIgnoreServiceTopicList(JobContext job) {
    return job.getConfiguration().getStrings(ETL_AUDIT_IGNORE_SERVICE_TOPIC_LIST, "");
  }

  public static void setMessageDecoderClass(JobContext job, Class<MessageDecoder> cls) {
    job.getConfiguration().setClass(CAMUS_MESSAGE_DECODER_CLASS, cls, MessageDecoder.class);
  }

  public static Class<MessageDecoder> getMessageDecoderClass(JobContext job) {
    return (Class<MessageDecoder>) job.getConfiguration().getClass(CAMUS_MESSAGE_DECODER_CLASS,
        KafkaAvroMessageDecoder.class);
  }

  public static Class<MessageDecoder> getMessageDecoderClass(JobContext job, String topicName) {
    Class<MessageDecoder> topicDecoder =
        (Class<MessageDecoder>) job.getConfiguration().getClass(CAMUS_MESSAGE_DECODER_CLASS + "." + topicName, null);
    return topicDecoder == null ? getMessageDecoderClass(job) : topicDecoder;
  }

  private class OffsetFileFilter implements PathFilter {

    @Override
    public boolean accept(Path arg0) {
      return arg0.getName().startsWith(EtlMultiOutputFormat.OFFSET_PREFIX);
    }
  }
}