/** * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.pinot.core.data.manager.realtime; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.tuple.Pair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Preconditions; import com.google.common.util.concurrent.Uninterruptibles; import com.linkedin.pinot.common.config.AbstractTableConfig; import com.linkedin.pinot.common.config.IndexingConfig; import com.linkedin.pinot.common.config.SegmentPartitionConfig; import com.linkedin.pinot.common.data.Schema; import com.linkedin.pinot.common.data.StarTreeIndexSpec; import com.linkedin.pinot.common.metadata.instance.InstanceZKMetadata; import com.linkedin.pinot.common.metadata.segment.LLCRealtimeSegmentZKMetadata; import com.linkedin.pinot.common.metadata.segment.RealtimeSegmentZKMetadata; import com.linkedin.pinot.common.metadata.stream.KafkaStreamMetadata; import com.linkedin.pinot.common.metrics.ServerGauge; import com.linkedin.pinot.common.metrics.ServerMeter; import com.linkedin.pinot.common.metrics.ServerMetrics; import com.linkedin.pinot.common.protocols.SegmentCompletionProtocol; import com.linkedin.pinot.common.utils.CommonConstants; import com.linkedin.pinot.common.utils.LLCSegmentName; import com.linkedin.pinot.common.utils.NetUtil; import com.linkedin.pinot.common.utils.TarGzCompressionUtils; import com.linkedin.pinot.core.data.GenericRow; import com.linkedin.pinot.core.data.extractors.FieldExtractorFactory; import com.linkedin.pinot.core.data.extractors.PlainFieldExtractor; import com.linkedin.pinot.core.data.manager.offline.SegmentDataManager; import com.linkedin.pinot.core.indexsegment.IndexSegment; import com.linkedin.pinot.core.indexsegment.generator.SegmentVersion; import com.linkedin.pinot.core.realtime.converter.RealtimeSegmentConverter; import com.linkedin.pinot.core.realtime.impl.RealtimeSegmentImpl; import com.linkedin.pinot.core.realtime.impl.kafka.KafkaLowLevelStreamProviderConfig; import com.linkedin.pinot.core.realtime.impl.kafka.KafkaMessageDecoder; import com.linkedin.pinot.core.realtime.impl.kafka.KafkaSimpleConsumerFactoryImpl; import com.linkedin.pinot.core.realtime.impl.kafka.SimpleConsumerWrapper; import com.linkedin.pinot.core.segment.index.loader.IndexLoadingConfig; import com.linkedin.pinot.server.realtime.ServerSegmentCompletionProtocolHandler; import com.yammer.metrics.core.Meter; import kafka.message.MessageAndOffset; /** * Segment data manager for low level consumer realtime segments, which manages consumption and segment completion. 
 */
public class LLRealtimeSegmentDataManager extends SegmentDataManager {
  protected enum State {
    // The state machine starts off with this state. While in this state we consume kafka events
    // and index them in memory. We continue to be in this state until the end criteria is satisfied
    // (time or number of rows)
    INITIAL_CONSUMING,
    // In this state, we consume from kafka until we reach the _finalOffset (exclusive)
    CATCHING_UP,
    // In this state, we sleep for MAX_HOLDING_TIME_MS, and then make a segmentConsumed() call to the
    // controller.
    HOLDING,
    // We have been asked to go Online from Consuming state, and are trying a last attempt to catch up to the
    // target offset. In this state, we have a time constraint as well as a final offset constraint to look into
    // before we stop consuming.
    CONSUMING_TO_ONLINE,
    // We have been asked by the controller to retain the segment we have in memory at the current offset.
    // We should build the segment, and replace the in-memory segment with the built one.
    RETAINING,
    // We have been asked by the controller to commit the segment at the current offset. Build the segment
    // and make a segmentCommit() call to the controller.
    COMMITTING,
    // We have been asked to discard the in-memory segment we have. We will be serving queries, but not consuming
    // any more rows from Kafka. We wait for a helix transition to go ONLINE, at which point, we can download the
    // segment from the controller and replace the in-memory segment with the downloaded one.
    DISCARDED,
    // We have replaced our in-memory segment with the segment that has been built locally.
    RETAINED,
    // We have committed our segment to the controller, and also replaced it locally with the constructed segment.
    COMMITTED,
    // Something went wrong, we need to download the segment when we get the ONLINE transition.
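    //
    // A rough sketch of the common state sequence, pieced together from the comments above and from
    // PartitionConsumer.run() below (illustrative only, not exhaustive):
    //   INITIAL_CONSUMING -> HOLDING -> (CATCHING_UP -> HOLDING)* -> COMMITTING -> COMMITTED  (successful commit)
    //   ... -> HOLDING -> RETAINING -> RETAINED                                               (controller asks us to keep the local build)
    //   ... -> HOLDING -> DISCARDED, or -> ERROR on failures                                  (segment will be downloaded instead)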
ERROR; public boolean shouldConsume() { if (this.equals(INITIAL_CONSUMING) || this.equals(CATCHING_UP) || this.equals(CONSUMING_TO_ONLINE)) { return true; } return false; } public boolean isFinal() { if (this.equals(ERROR) || this.equals(COMMITTED) || this.equals(RETAINED) || this.equals(DISCARDED)) { return true; } return false; } } private static final Logger LOGGER = LoggerFactory.getLogger(LLRealtimeSegmentDataManager.class); private static final long TIME_THRESHOLD_FOR_LOG_MINUTES = 1; private static final long TIME_EXTENSION_ON_EMPTY_SEGMENT_HOURS = 1; private static final int MSG_COUNT_THRESHOLD_FOR_LOG = 100000; private final int MAX_CONSECUTIVE_ERROR_COUNT = 5; private final LLCRealtimeSegmentZKMetadata _segmentZKMetadata; private final AbstractTableConfig _tableConfig; private final RealtimeTableDataManager _realtimeTableDataManager; private final KafkaMessageDecoder _messageDecoder; private final int _segmentMaxRowCount; private final String _resourceDataDir; private final IndexLoadingConfig _indexLoadingConfig; private final Schema _schema; private final String _metricKeyName; private final ServerMetrics _serverMetrics; private final RealtimeSegmentImpl _realtimeSegment; private volatile long _currentOffset; private volatile State _state; private volatile int _numRowsConsumed = 0; private volatile int consecutiveErrorCount = 0; private long _startTimeMs = 0; private final String _segmentNameStr; private final SegmentVersion _segmentVersion; private final SegmentBuildTimeLeaseExtender _leaseExtender; // Segment end criteria private volatile long _consumeEndTime = 0; private volatile long _finalOffset = -1; private volatile boolean _shouldStop = false; // It takes 30s to locate controller leader, and more if there are multiple controller failures. // For now, we let 31s pass for this state transition. private final int _maxTimeForConsumingToOnlineSec = 31; private Thread _consumerThread; private final String _kafkaTopic; private final int _kafkaPartitionId; final String _clientId; private final LLCSegmentName _segmentName; private final PlainFieldExtractor _fieldExtractor; private SimpleConsumerWrapper _consumerWrapper = null; private final File _resourceTmpDir; private final String _tableName; private final List<String> _invertedIndexColumns; private final List<String> _noDictionaryColumns; private final StarTreeIndexSpec _starTreeIndexSpec; private final String _sortedColumn; private Logger segmentLogger = LOGGER; private final String _tableStreamName; private AtomicLong _lastUpdatedRawDocuments = new AtomicLong(0); private final String _instanceId; private final ServerSegmentCompletionProtocolHandler _protocolHandler; private final long _consumeStartTime; private final long _startOffset; private final KafkaStreamMetadata _kafkaStreamMetadata; private final String _kafkaBootstrapNodes; private long _lastLogTime = 0; private int _lastConsumedCount = 0; private String _stopReason = null; // TODO each time this method is called, we print reason for stop. Good to print only once. private boolean endCriteriaReached() { Preconditions.checkState(_state.shouldConsume(), "Incorrect state %s", _state); long now = now(); switch(_state) { case INITIAL_CONSUMING: // The segment has been created, and we have not posted a segmentConsumed() message on the controller yet. // We need to consume as much data as available, until we have either reached the max number of rows or // the max time we are allowed to consume. 
        if (now >= _consumeEndTime) {
          if (_realtimeSegment.getRawDocumentCount() == 0) {
            segmentLogger.info("No events came in, extending time by {} hours", TIME_EXTENSION_ON_EMPTY_SEGMENT_HOURS);
            _consumeEndTime += TimeUnit.HOURS.toMillis(TIME_EXTENSION_ON_EMPTY_SEGMENT_HOURS);
            return false;
          }
          segmentLogger.info("Stopping consumption due to time limit start={} now={} numRows={}", _startTimeMs, now, _numRowsConsumed);
          _stopReason = SegmentCompletionProtocol.REASON_TIME_LIMIT;
          return true;
        } else if (_numRowsConsumed >= _segmentMaxRowCount) {
          segmentLogger.info("Stopping consumption due to row limit nRows={} maxNRows={}", _numRowsConsumed, _segmentMaxRowCount);
          _stopReason = SegmentCompletionProtocol.REASON_ROW_LIMIT;
          return true;
        }
        return false;

      case CATCHING_UP:
        _stopReason = null;
        // We have posted segmentConsumed() at least once, and the controller is asking us to catch up to a certain offset.
        // There is no time limit here, so just check to see that we are still within the offset we need to reach.
        // Going past the offset is an exception.
        if (_currentOffset == _finalOffset) {
          segmentLogger.info("Caught up to offset={}, state={}", _finalOffset, _state.toString());
          return true;
        }
        if (_currentOffset > _finalOffset) {
          segmentLogger.error("Offset higher in state={}, current={}, final={}", _state.toString(), _currentOffset, _finalOffset);
          throw new RuntimeException("Past max offset");
        }
        return false;

      case CONSUMING_TO_ONLINE:
        // We are attempting to go from CONSUMING to ONLINE state. We are making a last attempt to catch up to the
        // target offset. We have a time constraint, and need to stop consuming if we cannot get to the target offset
        // within that time.
        if (_currentOffset == _finalOffset) {
          segmentLogger.info("Caught up to offset={}, state={}", _finalOffset, _state.toString());
          return true;
        } else if (now >= _consumeEndTime) {
          segmentLogger.info("Past max time budget: offset={}, state={}", _currentOffset, _state.toString());
          return true;
        }
        if (_currentOffset > _finalOffset) {
          segmentLogger.error("Offset higher in state={}, current={}, final={}", _state.toString(), _currentOffset, _finalOffset);
          throw new RuntimeException("Past max offset");
        }
        return false;

      default:
        segmentLogger.error("Illegal state {}", _state.toString());
        throw new RuntimeException("Illegal state to consume");
    }
  }

  private void handleTransientKafkaErrors(Exception e) throws Exception {
    consecutiveErrorCount++;
    if (consecutiveErrorCount > MAX_CONSECUTIVE_ERROR_COUNT) {
      segmentLogger.warn("Kafka transient exception when fetching messages, stopping consumption after {} attempts", consecutiveErrorCount, e);
      throw e;
    } else {
      segmentLogger.warn("Kafka transient exception when fetching messages, retrying (count={})", consecutiveErrorCount, e);
      Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
      makeConsumerWrapper("Too many transient errors");
    }
  }

  protected boolean consumeLoop() throws Exception {
    _fieldExtractor.resetCounters();
    final long idlePipeSleepTimeMillis = 100;
    final long maxIdleCountBeforeStatUpdate = (3 * 60 * 1000) / idlePipeSleepTimeMillis;  // 3 minute count
    long lastUpdatedOffset = _currentOffset;  // so that we always update the metric when we enter this method.
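    // A short summary of the loop below (descriptive only): fetch a batch of messages from Kafka, index them
    // into the in-memory segment, and advance _currentOffset; transient fetch errors are retried with a fresh
    // consumer wrapper, while permanent errors abort consumption.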
long idleCount = 0; final long _endOffset = Long.MAX_VALUE; // No upper limit on Kafka offset segmentLogger.info("Starting consumption loop start offset {}, finalOffset {}", _currentOffset, _finalOffset); while(!_shouldStop && !endCriteriaReached()) { // Consume for the next _kafkaReadTime ms, or we get to final offset, whichever happens earlier, // Update _currentOffset upon return from this method Iterable<MessageAndOffset> messagesAndOffsets = null; Long highWatermark = null; try { Pair<Iterable<MessageAndOffset>, Long> messagesAndWatermark = _consumerWrapper.fetchMessagesAndHighWatermark(_currentOffset, _endOffset, _kafkaStreamMetadata.getKafkaFetchTimeoutMillis()); consecutiveErrorCount = 0; messagesAndOffsets = messagesAndWatermark.getLeft(); highWatermark = messagesAndWatermark.getRight(); } catch (TimeoutException e) { handleTransientKafkaErrors(e); continue; } catch (SimpleConsumerWrapper.TransientConsumerException e) { handleTransientKafkaErrors(e); continue; } catch (SimpleConsumerWrapper.PermanentConsumerException e) { segmentLogger.warn("Kafka permanent exception when fetching messages, stopping consumption", e); throw e; } catch (Exception e) { // Unknown exception from Kafka. Treat as a transient exception. // One such exception seen so far is java.net.SocketTimeoutException handleTransientKafkaErrors(e); continue; } processKafkaEvents(messagesAndOffsets, highWatermark, idlePipeSleepTimeMillis); if (_currentOffset != lastUpdatedOffset) { // We consumed something. Update the highest kafka offset as well as partition-consuming metric. _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.HIGHEST_KAFKA_OFFSET_CONSUMED, _currentOffset); _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.LLC_PARTITION_CONSUMING, 1); lastUpdatedOffset = _currentOffset; } else { // We did not consume any rows. Update the partition-consuming metric only if we have been idling for a long time. // Create a new kafka consumer wrapper, in case we are stuck on something. if (++idleCount > maxIdleCountBeforeStatUpdate) { _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.LLC_PARTITION_CONSUMING, 1); idleCount = 0; makeConsumerWrapper("Idle for too long"); } } } _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.ROWS_WITH_ERRORS, (long) _fieldExtractor.getTotalErrors()); _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.ROWS_NEEDING_CONVERSIONS, (long) _fieldExtractor.getTotalConversions()); _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.ROWS_WITH_NULL_VALUES, (long) _fieldExtractor.getTotalNulls()); _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.COLUMNS_WITH_NULL_VALUES, (long) _fieldExtractor.getTotalNullCols()); return true; } private void processKafkaEvents(Iterable<MessageAndOffset> messagesAndOffsets, Long highWatermark, long idlePipeSleepTimeMillis) { Meter realtimeRowsConsumedMeter = null; Meter realtimeRowsDroppedMeter = null; Iterator<MessageAndOffset> msgIterator = messagesAndOffsets.iterator(); int indexedMessageCount = 0; int kafkaMessageCount = 0; boolean canTakeMore = true; GenericRow decodedRow = null; GenericRow transformedRow = null; while (!_shouldStop && !endCriteriaReached() && msgIterator.hasNext()) { if (!canTakeMore) { // The RealtimeSegmentImpl that we are pushing rows into has indicated that it cannot accept any more // rows. This can happen in one of two conditions: // 1. 
We are in INITIAL_CONSUMING state, and we somehow exceeded the max number of rows we are allowed to consume // for this row. Something is seriously wrong, because endCriteriaReached() should have returned true when // we hit the row limit. // Throw an exception. // // 2. We are in CATCHING_UP state, and we legally hit this error due to Kafka unclean leader election where // offsets get changed with higher generation numbers for some pinot servers but not others. So, if another // server (who got a larger kafka offset) asked us to catch up to that offset, but we are connected to a // broker who has smaller offsets, then we may try to push more rows into the buffer than maximum. This // is a rare case, and we really don't know how to handle this at this time. // Throw an exception. // segmentLogger.error("Buffer full with {} rows consumed (row limit {})", _numRowsConsumed, _segmentMaxRowCount); throw new RuntimeException("Realtime segment full"); } // Index each message MessageAndOffset messageAndOffset = msgIterator.next(); byte[] array = messageAndOffset.message().payload().array(); int offset = messageAndOffset.message().payload().arrayOffset(); int length = messageAndOffset.message().payloadSize(); decodedRow = GenericRow.createOrReuseRow(decodedRow); decodedRow = _messageDecoder.decode(array, offset, length, decodedRow); // Update lag metric on the first message of each batch if (kafkaMessageCount == 0) { long messageOffset = messageAndOffset.offset(); long offsetDifference = highWatermark - messageOffset; _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.KAFKA_PARTITION_OFFSET_LAG, offsetDifference); } if (decodedRow != null) { transformedRow = GenericRow.createOrReuseRow(transformedRow); transformedRow = _fieldExtractor.transform(decodedRow, transformedRow); if (transformedRow != null) { realtimeRowsConsumedMeter = _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.REALTIME_ROWS_CONSUMED, 1, realtimeRowsConsumedMeter); indexedMessageCount++; } else { realtimeRowsDroppedMeter = _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.INVALID_REALTIME_ROWS_DROPPED, 1, realtimeRowsDroppedMeter); } canTakeMore = _realtimeSegment.index(transformedRow); } else { realtimeRowsDroppedMeter = _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.INVALID_REALTIME_ROWS_DROPPED, 1, realtimeRowsDroppedMeter); } _currentOffset = messageAndOffset.nextOffset(); _numRowsConsumed++; kafkaMessageCount++; } updateCurrentDocumentCountMetrics(); if (kafkaMessageCount != 0) { segmentLogger.debug("Indexed {} messages ({} messages read from Kafka) current offset {}", indexedMessageCount, kafkaMessageCount, _currentOffset); } else { // If there were no messages to be fetched from Kafka, wait for a little bit as to avoid hammering the // Kafka broker Uninterruptibles.sleepUninterruptibly(idlePipeSleepTimeMillis, TimeUnit.MILLISECONDS); } } public class PartitionConsumer implements Runnable { public void run() { long initialConsumptionEnd = 0L; long lastCatchUpStart = 0L; long catchUpTimeMillis = 0L; _startTimeMs = now(); try { while (!_state.isFinal()) { if (_state.shouldConsume()) { consumeLoop(); // Consume until we reached the end criteria, or we are stopped. 
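            // Note (summary of the code below): once consumption pauses, the server moves to HOLDING,
            // posts segmentConsumed() to the controller, and then acts on the controller's response
            // (HOLD, CATCH_UP, DISCARD, KEEP, COMMIT, or NOT_LEADER) in the switch statement that follows.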
} if (_shouldStop) { break; } if (_state == State.INITIAL_CONSUMING) { initialConsumptionEnd = now(); _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.LAST_REALTIME_SEGMENT_INITIAL_CONSUMPTION_DURATION_SECONDS, TimeUnit.MILLISECONDS.toSeconds(initialConsumptionEnd - _startTimeMs)); } else if (_state == State.CATCHING_UP) { catchUpTimeMillis += now() - lastCatchUpStart; _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.LAST_REALTIME_SEGMENT_CATCHUP_DURATION_SECONDS, TimeUnit.MILLISECONDS.toSeconds(catchUpTimeMillis)); } // If we are sending segmentConsumed() to the controller, we are in HOLDING state. _state = State.HOLDING; SegmentCompletionProtocol.Response response = postSegmentConsumedMsg(); SegmentCompletionProtocol.ControllerResponseStatus status = response.getStatus(); long rspOffset = response.getOffset(); boolean success; switch (status) { case NOT_LEADER: _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.LLC_CONTROLLER_RESPONSE_NOT_LEADER, 1); // Retain the same state segmentLogger.warn("Got not leader response"); hold(); break; case CATCH_UP: _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.LLC_CONTROLLER_RESPONSE_CATCH_UP, 1); if (rspOffset <= _currentOffset) { // Something wrong with the controller. Back off and try again. segmentLogger.error("Invalid catchup offset {} in controller response, current offset {}", rspOffset, _currentOffset); hold(); } else { _state = State.CATCHING_UP; _finalOffset = rspOffset; lastCatchUpStart = now(); // We will restart consumption when we loop back above. } break; case HOLD: _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.LLC_CONTROLLER_RESPONSE_HOLD, 1); hold(); break; case DISCARD: _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.LLC_CONTROLLER_RESPONSE_DISCARD, 1); // Keep this in memory, but wait for the online transition, and download when it comes in. _state = State.DISCARDED; break; case KEEP: _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.LLC_CONTROLLER_RESPONSE_KEEP, 1); _state = State.RETAINING; success = buildSegmentAndReplace(); if (success) { _state = State.RETAINED; } else { // Could not build segment for some reason. We can only download it. _state = State.ERROR; } break; case COMMIT: _serverMetrics.addMeteredTableValue(_metricKeyName, ServerMeter.LLC_CONTROLLER_RESPONSE_COMMIT, 1); _state = State.COMMITTING; long buildTimeSeconds = response.getBuildTimeSeconds(); success = buildSegment(true, buildTimeSeconds*1000L); if (!success) { // We could not build the segment. Go into error state. _state = State.ERROR; } else { success = commitSegment(); if (success) { _state = State.COMMITTED; } else { // If for any reason commit failed, we don't want to be in COMMITTING state when we hold. // Change the state to HOLDING before looping around. _state = State.HOLDING; segmentLogger.info("Could not commit segment. 
Retrying after hold"); hold(); } } break; default: segmentLogger.error("Holding after response from Controller: {}", response.toJsonString()); hold(); break; } } } catch (Exception e) { segmentLogger.error("Exception while in work", e); postStopConsumedMsg(e.getClass().getName()); _state = State.ERROR; _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.LLC_PARTITION_CONSUMING, 0); return; } if (initialConsumptionEnd != 0L) { _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.LAST_REALTIME_SEGMENT_COMPLETION_DURATION_SECONDS, TimeUnit.MILLISECONDS.toSeconds(now() - initialConsumptionEnd)); } _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.LLC_PARTITION_CONSUMING, 0); } } private File makeSegmentDirPath() { return new File(_resourceDataDir, _segmentZKMetadata.getSegmentName()); } /** * * @param forCommit true if you want the method to also build tgz file for committing * @param buildTimeLeaseMs * @return true if all succeeds. */ protected boolean buildSegment(boolean forCommit, long buildTimeLeaseMs) { try { if (buildTimeLeaseMs <= 0) { buildTimeLeaseMs = SegmentCompletionProtocol.getDefaultMaxSegmentCommitTimeSeconds() * 1000L; } if (forCommit) { _leaseExtender.addSegment(_segmentNameStr, buildTimeLeaseMs, _currentOffset); } return buildSegmentInternal(forCommit); } finally { if (forCommit) { _leaseExtender.removeSegment(_segmentNameStr); } } } private boolean buildSegmentInternal(boolean forCommit) { long startTimeMillis = System.currentTimeMillis(); // Build a segment from in-memory rows.If buildTgz is true, then build the tar.gz file as well // TODO Use an auto-closeable object to delete temp resources. File tempSegmentFolder = new File(_resourceTmpDir, "tmp-" + _segmentNameStr + "-" + String.valueOf(now())); // lets convert the segment now RealtimeSegmentConverter converter = new RealtimeSegmentConverter(_realtimeSegment, tempSegmentFolder.getAbsolutePath(), _schema, _segmentZKMetadata.getTableName(), _segmentZKMetadata.getSegmentName(), _sortedColumn, _invertedIndexColumns, _noDictionaryColumns, _starTreeIndexSpec); logStatistics(); segmentLogger.info("Trying to build segment"); final long buildStartTime = now(); try { converter.build(_segmentVersion); } catch (Exception e) { segmentLogger.error("Could not build segment", e); FileUtils.deleteQuietly(tempSegmentFolder); return false; } final long buildEndTime = now(); segmentLogger.info("Successfully built segment in {} ms", (buildEndTime - buildStartTime)); File destDir = makeSegmentDirPath(); FileUtils.deleteQuietly(destDir); try { FileUtils.moveDirectory(tempSegmentFolder.listFiles()[0], destDir); if (forCommit) { TarGzCompressionUtils.createTarGzOfDirectory(destDir.getAbsolutePath()); } } catch (IOException e) { segmentLogger.error("Exception during move/tar segment", e); FileUtils.deleteQuietly(tempSegmentFolder); return false; } FileUtils.deleteQuietly(tempSegmentFolder); long endTimeMillis = System.currentTimeMillis(); _serverMetrics.setValueOfTableGauge(_metricKeyName, ServerGauge.LAST_REALTIME_SEGMENT_CREATION_DURATION_SECONDS, TimeUnit.MILLISECONDS.toSeconds(endTimeMillis - startTimeMillis)); return true; } protected boolean commitSegment() { // Send segmentCommit() to the controller // if that succeeds, swap in-memory segment with the one built. 
File destSeg = makeSegmentDirPath(); final String segTarFileName = destSeg.getAbsolutePath() + TarGzCompressionUtils.TAR_GZ_FILE_EXTENTION; try { SegmentCompletionProtocol.Response response = _protocolHandler.segmentCommit(_currentOffset, _segmentNameStr, new File(segTarFileName)); if (!response.getStatus().equals(SegmentCompletionProtocol.ControllerResponseStatus.COMMIT_SUCCESS)) { segmentLogger.warn("Received controller response {}", response); return false; } _realtimeTableDataManager.replaceLLSegment(_segmentNameStr, _indexLoadingConfig); } catch (FileNotFoundException e) { segmentLogger.error("Tar file {} not found", segTarFileName, e); return false; } finally { FileUtils.deleteQuietly(new File(segTarFileName)); } return true; } protected boolean buildSegmentAndReplace() { boolean success = buildSegmentInternal(false); if (!success) { return success; } _realtimeTableDataManager.replaceLLSegment(_segmentNameStr, _indexLoadingConfig); return true; } protected void hold() { try { Thread.sleep(SegmentCompletionProtocol.MAX_HOLD_TIME_MS); } catch (InterruptedException e) { segmentLogger.warn("Interrupted while holding"); } } // Inform the controller that the server had to stop consuming due to an error. protected void postStopConsumedMsg(String reason) { do { SegmentCompletionProtocol.Request.Params params = new SegmentCompletionProtocol.Request.Params(); params.withOffset(_currentOffset).withReason(reason).withSegmentName(_segmentNameStr); SegmentCompletionProtocol.Response response = _protocolHandler.segmentStoppedConsuming(params); if (response.getStatus() == SegmentCompletionProtocol.ControllerResponseStatus.PROCESSED) { LOGGER.info("Got response {}", response.toJsonString()); break; } Uninterruptibles.sleepUninterruptibly(10, TimeUnit.SECONDS); LOGGER.info("Retrying after response {}", response.toJsonString()); } while (!_shouldStop); } protected SegmentCompletionProtocol.Response postSegmentConsumedMsg() { // Post segmentConsumed to current leader. // Retry maybe once if leader is not found. SegmentCompletionProtocol.Request.Params params = new SegmentCompletionProtocol.Request.Params(); params.withOffset(_currentOffset).withSegmentName(_segmentNameStr).withReason(_stopReason); return _protocolHandler.segmentConsumed(params); } public void goOnlineFromConsuming(RealtimeSegmentZKMetadata metadata) throws InterruptedException { LLCRealtimeSegmentZKMetadata llcMetadata = (LLCRealtimeSegmentZKMetadata)metadata; _leaseExtender.removeSegment(_segmentNameStr); final long endOffset = llcMetadata.getEndOffset(); segmentLogger.info("State: {}, transitioning from CONSUMING to ONLINE (startOffset: {}, endOffset: {})", _state.toString(), _startOffset, endOffset); stop(); segmentLogger.info("Consumer thread stopped in state {}", _state.toString()); switch (_state) { case COMMITTED: case RETAINED: // Nothing to do. we already built local segment and swapped it with in-memory data. segmentLogger.info("State {}. Nothing to do", _state.toString()); break; case DISCARDED: case ERROR: segmentLogger.info("State {}. Downloading to replace", _state.toString()); downloadSegmentAndReplace(llcMetadata); break; case CATCHING_UP: case HOLDING: case INITIAL_CONSUMING: // Allow to catch up upto final offset, and then replace. if (_currentOffset > endOffset) { // We moved ahead of the offset that is committed in ZK. segmentLogger.warn("Current offset {} ahead of the offset in zk {}. 
Downloading to replace", _currentOffset, endOffset); downloadSegmentAndReplace(llcMetadata); } else if (_currentOffset == endOffset) { segmentLogger.info("Current offset {} matches offset in zk {}. Replacing segment", _currentOffset, endOffset); buildSegmentAndReplace(); } else { segmentLogger.info("Attempting to catch up from offset {} to {} ", _currentOffset, endOffset); boolean success = catchupToFinalOffset(endOffset, TimeUnit.MILLISECONDS.convert(_maxTimeForConsumingToOnlineSec, TimeUnit.SECONDS)); if (success) { segmentLogger.info("Caught up to offset {}", _currentOffset); buildSegmentAndReplace(); } else { segmentLogger.info("Could not catch up to offset (current = {}). Downloading to replace", _currentOffset); downloadSegmentAndReplace(llcMetadata); } } break; default: segmentLogger.info("Downloading to replace segment while in state {}", _state.toString()); downloadSegmentAndReplace(llcMetadata); break; } } protected void downloadSegmentAndReplace(LLCRealtimeSegmentZKMetadata metadata) { _realtimeTableDataManager.downloadAndReplaceSegment(_segmentNameStr, metadata, _indexLoadingConfig); } protected long now() { return System.currentTimeMillis(); } private boolean catchupToFinalOffset(long endOffset, long timeoutMs) { _finalOffset = endOffset; _consumeEndTime = now() + timeoutMs; _state = State.CONSUMING_TO_ONLINE; _shouldStop = false; try { consumeLoop(); } catch (Exception e) { // We will end up downloading the segment, so this is not a serious problem segmentLogger.warn("Exception when catching up to final offset", e); return false; } if (_currentOffset != endOffset) { // Timeout? segmentLogger.error("Could not consume up to {} (current offset {})", endOffset, _currentOffset); return false; } return true; } public void destroy() { try { stop(); } catch (InterruptedException e) { segmentLogger.error("Could not stop consumer thread"); } _realtimeSegment.destroy(); try { _consumerWrapper.close(); } catch (Exception e) { segmentLogger.warn("Could not close consumer wrapper", e); } } protected void start() { _consumerThread = new Thread(new PartitionConsumer(), _segmentNameStr); segmentLogger.info("Created new consumer thread {} for {}", _consumerThread, this.toString()); _consumerThread.start(); } /** * Stop the consuming thread. */ public void stop() throws InterruptedException { _shouldStop = true; // This method could be called either when we get an ONLINE transition or // when we commit a segment and replace the realtime segment with a committed // one. In the latter case, we don't want to call join. if (Thread.currentThread() != _consumerThread) { Uninterruptibles.joinUninterruptibly(_consumerThread, 10, TimeUnit.MINUTES); if (_consumerThread.isAlive()) { segmentLogger.warn("Failed to stop consumer thread within 10 minutes"); } } } // TODO Make this a factory class. protected KafkaLowLevelStreamProviderConfig createStreamProviderConfig() { return new KafkaLowLevelStreamProviderConfig(); } // Assume that this is called only on OFFLINE to CONSUMING transition. // If the transition is OFFLINE to ONLINE, the caller should have downloaded the segment and we don't reach here. 
  public LLRealtimeSegmentDataManager(RealtimeSegmentZKMetadata segmentZKMetadata, AbstractTableConfig tableConfig,
      InstanceZKMetadata instanceZKMetadata, RealtimeTableDataManager realtimeTableDataManager, String resourceDataDir,
      IndexLoadingConfig indexLoadingConfig, Schema schema, ServerMetrics serverMetrics) throws Exception {
    _segmentZKMetadata = (LLCRealtimeSegmentZKMetadata) segmentZKMetadata;
    _tableConfig = tableConfig;
    _realtimeTableDataManager = realtimeTableDataManager;
    _resourceDataDir = resourceDataDir;
    _indexLoadingConfig = indexLoadingConfig;
    _schema = schema;
    _serverMetrics = serverMetrics;
    _segmentVersion = indexLoadingConfig.getSegmentVersion();
    _instanceId = _realtimeTableDataManager.getServerInstance();
    _leaseExtender = SegmentBuildTimeLeaseExtender.getLeaseExtender(_instanceId);
    _protocolHandler = new ServerSegmentCompletionProtocolHandler(_instanceId);

    // TODO Validate configs
    IndexingConfig indexingConfig = _tableConfig.getIndexingConfig();
    _kafkaStreamMetadata = new KafkaStreamMetadata(indexingConfig.getStreamConfigs());
    KafkaLowLevelStreamProviderConfig kafkaStreamProviderConfig = createStreamProviderConfig();
    kafkaStreamProviderConfig.init(tableConfig, instanceZKMetadata, schema);
    _kafkaBootstrapNodes = indexingConfig.getStreamConfigs()
        .get(CommonConstants.Helix.DataSource.STREAM_PREFIX + "." + CommonConstants.Helix.DataSource.Realtime.Kafka.KAFKA_BROKER_LIST);
    _kafkaTopic = kafkaStreamProviderConfig.getTopicName();
    _segmentNameStr = _segmentZKMetadata.getSegmentName();
    _segmentName = new LLCSegmentName(_segmentNameStr);
    _kafkaPartitionId = _segmentName.getPartitionId();
    _tableName = _tableConfig.getTableName();
    _metricKeyName = _tableName + "-" + _kafkaTopic + "-" + _kafkaPartitionId;
    segmentLogger = LoggerFactory.getLogger(LLRealtimeSegmentDataManager.class.getName() + "_" + _segmentNameStr);
    _tableStreamName = _tableName + "_" + kafkaStreamProviderConfig.getStreamName();
    List<String> sortedColumns = indexLoadingConfig.getSortedColumns();
    if (sortedColumns.isEmpty()) {
      segmentLogger.info("RealtimeDataResourceZKMetadata contains no information about sorted column for segment {}", _segmentName);
      _sortedColumn = null;
    } else {
      String firstSortedColumn = sortedColumns.get(0);
      if (_schema.hasColumn(firstSortedColumn)) {
        segmentLogger.info("Setting sorted column name: {} from RealtimeDataResourceZKMetadata for segment {}", firstSortedColumn, _segmentName);
        _sortedColumn = firstSortedColumn;
      } else {
        segmentLogger.warn("Sorted column name: {} from RealtimeDataResourceZKMetadata does not exist in the schema for segment {}.", firstSortedColumn, _segmentName);
        _sortedColumn = null;
      }
    }

    // Inverted index columns
    Set<String> invertedIndexColumns = indexLoadingConfig.getInvertedIndexColumns();
    // We need to add the sorted column into the inverted index columns because when we convert the realtime in-memory segment into
    // an offline segment, we use the sorted column's inverted index to maintain the order of the records so that the records
    // are sorted on the sorted column.
if (_sortedColumn != null) { invertedIndexColumns.add(_sortedColumn); } _invertedIndexColumns = new ArrayList<>(invertedIndexColumns); // No dictionary Columns _noDictionaryColumns = new ArrayList<>(indexLoadingConfig.getNoDictionaryColumns()); // Read the star tree config _starTreeIndexSpec = indexingConfig.getStarTreeIndexSpec(); // Read the max number of rows int segmentMaxRowCount = kafkaStreamProviderConfig.getSizeThresholdToFlushSegment(); if (0 < segmentZKMetadata.getSizeThresholdToFlushSegment()) { segmentMaxRowCount = segmentZKMetadata.getSizeThresholdToFlushSegment(); } _segmentMaxRowCount = segmentMaxRowCount; // Start new realtime segment _realtimeSegment = new RealtimeSegmentImpl(schema, _segmentMaxRowCount, tableConfig.getTableName(), segmentZKMetadata.getSegmentName(), _kafkaTopic, _serverMetrics, _invertedIndexColumns, indexLoadingConfig.getRealtimeAvgMultiValueCount()); _realtimeSegment.setSegmentMetadata(segmentZKMetadata, schema); // Create message decoder _messageDecoder = kafkaStreamProviderConfig.getDecoder(); _clientId = _kafkaPartitionId + "-" + NetUtil.getHostnameOrAddress(); // Create field extractor _fieldExtractor = FieldExtractorFactory.getPlainFieldExtractor(schema); makeConsumerWrapper("Starting"); SegmentPartitionConfig segmentPartitionConfig = indexingConfig.getSegmentPartitionConfig(); if (segmentPartitionConfig != null) { try { int nPartitions = _consumerWrapper.getPartitionCount(_kafkaTopic, /*maxWaitTimeMs=*/5000L); segmentPartitionConfig.setNumPartitions(nPartitions); _realtimeSegment.setSegmentPartitionConfig(segmentPartitionConfig); } catch (Exception e) { segmentLogger.warn("Couldn't get number of partitions in 5s, not using partition config {}", e.getMessage()); _realtimeSegment.setSegmentPartitionConfig(null); makeConsumerWrapper("Timeout getting number of partitions"); } } _startOffset = _segmentZKMetadata.getStartOffset(); _currentOffset = _startOffset; _resourceTmpDir = new File(resourceDataDir, "_tmp"); if (!_resourceTmpDir.exists()) { _resourceTmpDir.mkdirs(); } _state = State.INITIAL_CONSUMING; long now = now(); _consumeStartTime = now; _consumeEndTime = now + kafkaStreamProviderConfig.getTimeThresholdToFlushSegment(); start(); } private void logStatistics() { int numErrors, numConversions, numNulls, numNullCols; if ((numErrors = _fieldExtractor.getTotalErrors()) > 0) { _serverMetrics.addMeteredTableValue(_tableStreamName, ServerMeter.ROWS_WITH_ERRORS, (long) numErrors); } Map<String, Integer> errorCount = _fieldExtractor.getErrorCount(); for (String column : errorCount.keySet()) { if ((numErrors = errorCount.get(column)) > 0) { segmentLogger.warn("Column {} had {} rows with errors", column, numErrors); } } if ((numConversions = _fieldExtractor.getTotalConversions()) > 0) { _serverMetrics.addMeteredTableValue(_tableStreamName, ServerMeter.ROWS_NEEDING_CONVERSIONS, (long) numConversions); segmentLogger.info("{} rows needed conversions ", numConversions); } if ((numNulls = _fieldExtractor.getTotalNulls()) > 0) { _serverMetrics.addMeteredTableValue(_tableStreamName, ServerMeter.ROWS_WITH_NULL_VALUES, (long) numNulls); segmentLogger.info("{} rows had null columns", numNulls); } if ((numNullCols = _fieldExtractor.getTotalNullCols()) > 0) { _serverMetrics.addMeteredTableValue(_tableStreamName, ServerMeter.COLUMNS_WITH_NULL_VALUES, (long) numNullCols); segmentLogger.info("{} columns had null values", numNullCols); } } private void makeConsumerWrapper(String reason) { if (_consumerWrapper != null) { try { _consumerWrapper.close(); } catch (Exception 
e) {
        segmentLogger.warn("Could not close Kafka consumer wrapper", e);
      }
    }
    segmentLogger.info("Creating new Kafka consumer wrapper, reason: {}", reason);
    _consumerWrapper = SimpleConsumerWrapper.forPartitionConsumption(new KafkaSimpleConsumerFactoryImpl(), _kafkaBootstrapNodes, _clientId, _kafkaTopic, _kafkaPartitionId, _kafkaStreamMetadata.getKafkaConnectionTimeoutMillis());
  }

  // This should be done during commit? We may not always commit when we build a segment....
  // TODO Call this method when we are loading the segment, which we do from table datamanager afaik
  private void updateCurrentDocumentCountMetrics() {
    int currentRawDocs = _realtimeSegment.getRawDocumentCount();
    _serverMetrics.addValueToTableGauge(_tableName, ServerGauge.DOCUMENT_COUNT, (currentRawDocs - _lastUpdatedRawDocuments.get()));
    _lastUpdatedRawDocuments.set(currentRawDocs);
    final long now = now();
    final int rowsConsumed = _numRowsConsumed - _lastConsumedCount;
    final long prevTime = _lastConsumedCount == 0 ? _consumeStartTime : _lastLogTime;
    // Log every minute or 100k events
    if (now - prevTime > TimeUnit.MINUTES.toMillis(TIME_THRESHOLD_FOR_LOG_MINUTES) || rowsConsumed >= MSG_COUNT_THRESHOLD_FOR_LOG) {
      segmentLogger.info("Consumed {} events from Kafka (rate: {}/s), currentOffset={}", rowsConsumed, (float) (rowsConsumed) * 1000 / (now - prevTime), _currentOffset);
      _lastConsumedCount = _numRowsConsumed;
      _lastLogTime = now;
    }
  }

  @Override
  public IndexSegment getSegment() {
    return _realtimeSegment;
  }

  @Override
  public String getSegmentName() {
    return _segmentNameStr;
  }

  public int getMaxTimeForConsumingToOnlineSec() {
    return _maxTimeForConsumingToOnlineSec;
  }
}