/**
* This file is part of Graylog.
*
* Graylog is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Graylog is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Graylog. If not, see <http://www.gnu.org/licenses/>.
*/
package org.graylog2.shared.journal;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.github.joschi.jadconfig.util.Size;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import com.google.common.io.Files;
import com.google.common.primitives.Ints;
import com.google.common.util.concurrent.AbstractIdleService;
import com.google.common.util.concurrent.Uninterruptibles;
import kafka.common.KafkaException;
import kafka.common.OffsetOutOfRangeException;
import kafka.common.TopicAndPartition;
import kafka.log.CleanerConfig;
import kafka.log.Log;
import kafka.log.LogAppendInfo;
import kafka.log.LogConfig;
import kafka.log.LogManager;
import kafka.log.LogSegment;
import kafka.message.ByteBufferMessageSet;
import kafka.message.Message;
import kafka.message.MessageAndOffset;
import kafka.message.MessageSet;
import kafka.server.BrokerState;
import kafka.server.RunningAsBroker;
import kafka.utils.KafkaScheduler;
import kafka.utils.Time;
import org.graylog2.plugin.GlobalMetricNames;
import org.graylog2.plugin.ServerStatus;
import org.graylog2.plugin.ThrottleState;
import org.graylog2.plugin.lifecycles.LoadBalancerStatus;
import org.graylog2.shared.metrics.HdrTimer;
import org.graylog2.shared.utilities.ByteBufferUtils;
import org.joda.time.DateTimeUtils;
import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Option;
import scala.collection.Iterator;
import scala.collection.JavaConversions;
import scala.collection.Map$;
import scala.runtime.AbstractFunction1;
import javax.inject.Inject;
import javax.inject.Named;
import javax.inject.Singleton;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.SyncFailedException;
import java.nio.channels.ClosedByInterruptException;
import java.nio.charset.StandardCharsets;
import java.nio.file.AccessDeniedException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import static com.codahale.metrics.MetricRegistry.name;
import static java.util.concurrent.TimeUnit.DAYS;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.concurrent.TimeUnit.MINUTES;
import static java.util.concurrent.TimeUnit.NANOSECONDS;
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.graylog2.plugin.Tools.bytesToHex;
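/**
 * A local, disk-backed message journal built on an embedded Kafka log.
 * <p>
 * Incoming messages are appended to a single Kafka {@link Log} (topic "messagejournal", partition 0) and are
 * read back by the processing pipeline. Once a batch has been fully processed, callers mark the highest
 * processed offset as committed via {@link #markJournalOffsetCommitted(long)}. Background tasks flush dirty
 * logs, write recovery checkpoints, persist the committed read offset, and delete segments that are expired,
 * exceed the retention size, or have been fully committed.
 * </p>
 */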
@Singleton
public class KafkaJournal extends AbstractIdleService implements Journal {
private static final Logger LOG = LoggerFactory.getLogger(KafkaJournal.class);
private static final int NUM_IO_THREADS = 1;
public static final long DEFAULT_COMMITTED_OFFSET = Long.MIN_VALUE;
public static final int NOTIFY_ON_UTILIZATION_PERCENTAGE = 95;
private final ServerStatus serverStatus;
    // This exists so we can use JodaTime's millis provider in tests.
    // Kafka really only cares about the milliseconds() method in here.
private static final Time JODA_TIME = new Time() {
@Override
public long milliseconds() {
return DateTimeUtils.currentTimeMillis();
}
@Override
public long nanoseconds() {
return System.nanoTime();
}
@Override
public void sleep(long ms) {
Uninterruptibles.sleepUninterruptibly(ms, MILLISECONDS);
}
};
private final LogManager logManager;
private final Log kafkaLog;
private final File committedReadOffsetFile;
private final AtomicLong committedOffset = new AtomicLong(DEFAULT_COMMITTED_OFFSET);
private final ScheduledExecutorService scheduler;
private final Timer writeTime;
private final Timer readTime;
private final KafkaScheduler kafkaScheduler;
private final Meter writtenMessages;
private final Meter readMessages;
private final Meter writeDiscardedMessages;
private final OffsetFileFlusher offsetFlusher;
private final DirtyLogFlusher dirtyLogFlusher;
private final RecoveryCheckpointFlusher recoveryCheckpointFlusher;
private final LogRetentionCleaner logRetentionCleaner;
private final long maxSegmentSize;
private final int maxMessageSize;
private long nextReadOffset = 0L;
private ScheduledFuture<?> checkpointFlusherFuture;
private ScheduledFuture<?> dirtyLogFlushFuture;
private ScheduledFuture<?> logRetentionFuture;
private ScheduledFuture<?> offsetFlusherFuture;
private volatile boolean shuttingDown;
private final AtomicReference<ThrottleState> throttleState = new AtomicReference<>();
private final AtomicInteger purgedSegmentsInLastRetention = new AtomicInteger();
private final int throttleThresholdPercentage;
@Inject
public KafkaJournal(@Named("message_journal_dir") File journalDirectory,
@Named("scheduler") ScheduledExecutorService scheduler,
@Named("message_journal_segment_size") Size segmentSize,
@Named("message_journal_segment_age") Duration segmentAge,
@Named("message_journal_max_size") Size retentionSize,
@Named("message_journal_max_age") Duration retentionAge,
@Named("message_journal_flush_interval") long flushInterval,
@Named("message_journal_flush_age") Duration flushAge,
@Named("lb_throttle_threshold_percentage") int throttleThresholdPercentage,
MetricRegistry metricRegistry,
ServerStatus serverStatus) {
this.scheduler = scheduler;
this.throttleThresholdPercentage = intRange(throttleThresholdPercentage, 0, 100);
this.serverStatus = serverStatus;
this.maxSegmentSize = segmentSize.toBytes();
// Max message size should not be bigger than max segment size.
this.maxMessageSize = Ints.saturatedCast(maxSegmentSize);
this.writtenMessages = metricRegistry.meter(name(this.getClass(), "writtenMessages"));
this.readMessages = metricRegistry.meter(name(this.getClass(), "readMessages"));
this.writeDiscardedMessages = metricRegistry.meter(name(this.getClass(), "writeDiscardedMessages"));
registerUncommittedGauge(metricRegistry, name(this.getClass(), "uncommittedMessages"));
// the registerHdrTimer helper doesn't throw on existing metrics
this.writeTime = registerHdrTimer(metricRegistry, name(this.getClass(), "writeTime"));
this.readTime = registerHdrTimer(metricRegistry, name(this.getClass(), "readTime"));
final Map<String, Object> config = ImmutableMap.<String, Object>builder()
// segmentSize: The soft maximum for the size of a segment file in the log
.put(LogConfig.SegmentBytesProp(), Ints.saturatedCast(segmentSize.toBytes()))
// segmentMs: The soft maximum on the amount of time before a new log segment is rolled
.put(LogConfig.SegmentMsProp(), segmentAge.getMillis())
// segmentJitterMs The maximum random jitter subtracted from segmentMs to avoid thundering herds of segment rolling
.put(LogConfig.SegmentJitterMsProp(), 0)
// flushInterval: The number of messages that can be written to the log before a flush is forced
.put(LogConfig.FlushMessagesProp(), flushInterval)
// flushMs: The amount of time the log can have dirty data before a flush is forced
.put(LogConfig.FlushMsProp(), flushAge.getMillis())
// retentionSize: The approximate total number of bytes this log can use
.put(LogConfig.RetentionBytesProp(), retentionSize.toBytes())
                // retentionMs: The approximate maximum age of the last segment that is retained
.put(LogConfig.RetentionMsProp(), retentionAge.getMillis())
// maxMessageSize: The maximum size of a message in the log (ensure that it's not larger than the max segment size)
.put(LogConfig.MaxMessageBytesProp(), maxMessageSize)
// maxIndexSize: The maximum size of an index file
.put(LogConfig.SegmentIndexBytesProp(), Ints.saturatedCast(Size.megabytes(1L).toBytes()))
// indexInterval: The approximate number of bytes between index entries
.put(LogConfig.IndexIntervalBytesProp(), 4096)
// fileDeleteDelayMs: The time to wait before deleting a file from the filesystem
.put(LogConfig.FileDeleteDelayMsProp(), MINUTES.toMillis(1L))
// deleteRetentionMs: The time to retain delete markers in the log. Only applicable for logs that are being compacted.
.put(LogConfig.DeleteRetentionMsProp(), DAYS.toMillis(1L))
// minCleanableRatio: The ratio of bytes that are available for cleaning to the bytes already cleaned
.put(LogConfig.MinCleanableDirtyRatioProp(), 0.5)
// compact: Should old segments in this log be deleted or de-duplicated?
.put(LogConfig.Compact(), false)
                // uncleanLeaderElectionEnable: Indicates whether unclean leader election is enabled; actually a controller-level property
                // but included here for topic-specific configuration validation purposes
.put(LogConfig.UncleanLeaderElectionEnableProp(), true)
                // minInSyncReplicas: If the number of in-sync replicas drops below this number, we stop accepting writes with -1 (or all) required acks
.put(LogConfig.MinInSyncReplicasProp(), 1)
.build();
final LogConfig defaultConfig = new LogConfig(config);
        // These are the default values as per Kafka 0.8.1.1, except we don't turn on the cleaner.
        // The cleaner really is log compaction with respect to "deletes" in the log.
        // We never insert a message twice, at least not on purpose, so we do not "clean" logs, ever.
final CleanerConfig cleanerConfig =
new CleanerConfig(
1,
Size.megabytes(4L).toBytes(),
0.9d,
Ints.saturatedCast(Size.megabytes(1L).toBytes()),
Ints.saturatedCast(Size.megabytes(32L).toBytes()),
Ints.saturatedCast(Size.megabytes(5L).toBytes()),
SECONDS.toMillis(15L),
false,
"MD5");
if (!journalDirectory.exists() && !journalDirectory.mkdirs()) {
LOG.error("Cannot create journal directory at {}, please check the permissions", journalDirectory.getAbsolutePath());
final AccessDeniedException accessDeniedException = new AccessDeniedException(journalDirectory.getAbsolutePath(), null, "Could not create journal directory.");
throw new RuntimeException(accessDeniedException);
}
// TODO add check for directory, etc
committedReadOffsetFile = new File(journalDirectory, "graylog2-committed-read-offset");
try {
if (!committedReadOffsetFile.createNewFile()) {
final String line = Files.readFirstLine(committedReadOffsetFile, StandardCharsets.UTF_8);
                // The file contains the last offset graylog2 has successfully processed;
                // thus the nextReadOffset is one beyond that number.
if (line != null) {
committedOffset.set(Long.parseLong(line.trim()));
nextReadOffset = committedOffset.get() + 1;
}
}
} catch (IOException e) {
LOG.error("Cannot access offset file: {}", e.getMessage());
final AccessDeniedException accessDeniedException = new AccessDeniedException(committedReadOffsetFile.getAbsolutePath(), null, e.getMessage());
throw new RuntimeException(accessDeniedException);
}
try {
final BrokerState brokerState = new BrokerState();
brokerState.newState(RunningAsBroker.state());
kafkaScheduler = new KafkaScheduler(2, "kafka-journal-scheduler-", false); // TODO make thread count configurable
kafkaScheduler.startup();
logManager = new LogManager(
new File[]{journalDirectory},
Map$.MODULE$.<String, LogConfig>empty(),
defaultConfig,
cleanerConfig,
NUM_IO_THREADS,
SECONDS.toMillis(60L),
SECONDS.toMillis(60L),
SECONDS.toMillis(60L),
                    kafkaScheduler,
                    brokerState,
JODA_TIME);
final TopicAndPartition topicAndPartition = new TopicAndPartition("messagejournal", 0);
final Option<Log> messageLog = logManager.getLog(topicAndPartition);
if (messageLog.isEmpty()) {
kafkaLog = logManager.createLog(topicAndPartition, logManager.defaultConfig());
} else {
kafkaLog = messageLog.get();
}
LOG.info("Initialized Kafka based journal at {}", journalDirectory);
setupKafkaLogMetrics(metricRegistry);
offsetFlusher = new OffsetFileFlusher();
dirtyLogFlusher = new DirtyLogFlusher();
recoveryCheckpointFlusher = new RecoveryCheckpointFlusher();
logRetentionCleaner = new LogRetentionCleaner();
} catch (KafkaException e) {
// most likely failed to grab lock
LOG.error("Unable to start logmanager.", e);
throw new RuntimeException(e);
}
}
/**
* Ensures that an integer is within a given range.
*
* @param i The number to check.
* @param min The minimum value to return.
* @param max The maximum value to return.
* @return {@code i} if the number is between {@code min} and {@code max},
* {@code min} if {@code i} is less than the minimum,
* {@code max} if {@code i} is greater than the maximum.
*/
private static int intRange(int i, int min, int max) {
return Integer.min(Integer.max(min, i), max);
}
private Timer registerHdrTimer(MetricRegistry metricRegistry, final String metricName) {
Timer timer;
try {
timer = metricRegistry.register(metricName, new HdrTimer(1, MINUTES, 1));
} catch (IllegalArgumentException e) {
final SortedMap<String, Timer> timers = metricRegistry.getTimers((name, metric) -> metricName.equals(name));
timer = Iterables.getOnlyElement(timers.values());
}
return timer;
}
private void registerUncommittedGauge(MetricRegistry metricRegistry, String name) {
try {
metricRegistry.register(name,
(Gauge<Long>) () -> Math.max(0, getLogEndOffset() - 1 - committedOffset.get()));
} catch (IllegalArgumentException ignored) {
// already registered, we'll ignore that.
}
}
public int getPurgedSegmentsInLastRetention() {
return purgedSegmentsInLastRetention.get();
}
private void setupKafkaLogMetrics(final MetricRegistry metricRegistry) {
metricRegistry.register(name(KafkaJournal.class, "size"), (Gauge<Long>) kafkaLog::size);
metricRegistry.register(name(KafkaJournal.class, "logEndOffset"), (Gauge<Long>) kafkaLog::logEndOffset);
metricRegistry.register(name(KafkaJournal.class, "numberOfSegments"), (Gauge<Integer>) kafkaLog::numberOfSegments);
metricRegistry.register(name(KafkaJournal.class, "unflushedMessages"), (Gauge<Long>) kafkaLog::unflushedMessages);
metricRegistry.register(name(KafkaJournal.class, "recoveryPoint"), (Gauge<Long>) kafkaLog::recoveryPoint);
metricRegistry.register(name(KafkaJournal.class, "lastFlushTime"), (Gauge<Long>) kafkaLog::lastFlushTime);
// must not be a lambda, because the serialization cannot determine the proper Metric type :(
        metricRegistry.register(GlobalMetricNames.JOURNAL_OLDEST_SEGMENT, new Gauge<Date>() {
@Override
public Date getValue() {
long oldestSegment = Long.MAX_VALUE;
for (final LogSegment segment : KafkaJournal.this.getSegments()) {
oldestSegment = Math.min(oldestSegment, segment.created());
}
return new Date(oldestSegment);
}
});
}
/**
* Creates an opaque object which can be passed to {@link #write(java.util.List)} for a bulk journal write.
*
* @param idBytes a byte array which represents the key for the entry
* @param messageBytes the journal entry's payload, i.e. the message itself
* @return a journal entry to be passed to {@link #write(java.util.List)}
*/
@Override
public Entry createEntry(byte[] idBytes, byte[] messageBytes) {
return new Entry(idBytes, messageBytes);
}
/**
* Writes the list of entries to the journal.
*
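     * <p>
     * A minimal usage sketch, given a {@code journal} instance; the id and payload bytes are made up for
     * illustration:
     * </p>
     * <pre>{@code
     * final List<Journal.Entry> entries = new ArrayList<>();
     * entries.add(journal.createEntry("id-1".getBytes(StandardCharsets.UTF_8),
     *                                 "payload-1".getBytes(StandardCharsets.UTF_8)));
     * final long lastOffset = journal.write(entries);
     * }</pre>
     *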
* @param entries journal entries to be written
* @return the last position written to in the journal
*/
@Override
public long write(List<Entry> entries) {
try (Timer.Context ignored = writeTime.time()) {
long payloadSize = 0L;
long messageSetSize = 0L;
long lastWriteOffset = 0L;
final List<Message> messages = new ArrayList<>(entries.size());
for (final Entry entry : entries) {
final byte[] messageBytes = entry.getMessageBytes();
final byte[] idBytes = entry.getIdBytes();
payloadSize += messageBytes.length;
final Message newMessage = new Message(messageBytes, idBytes);
// Calculate the size of the new message in the message set by including the overhead for the log entry.
final int newMessageSize = MessageSet.entrySize(newMessage);
if (newMessageSize > maxMessageSize) {
writeDiscardedMessages.mark();
LOG.warn("Message with ID <{}> is too large to store in journal, skipping! (size: {} bytes / max: {} bytes)",
new String(idBytes, StandardCharsets.UTF_8), newMessageSize, maxMessageSize);
                    // Don't count the discarded message's payload towards the batch's payload size.
                    payloadSize -= messageBytes.length;
continue;
}
                // If adding the new message to the message set would overflow the max segment size, flush the current
                // list of messages to avoid a MessageSetSizeTooLargeException.
if ((messageSetSize + newMessageSize) > maxSegmentSize) {
if (LOG.isDebugEnabled()) {
LOG.debug("Flushing {} bytes message set with {} messages to avoid overflowing segment with max size of {} bytes",
messageSetSize, messages.size(), maxSegmentSize);
}
lastWriteOffset = flushMessages(messages, payloadSize);
// Reset the messages list and size counters to start a new batch.
messages.clear();
messageSetSize = 0;
payloadSize = 0;
}
messages.add(newMessage);
messageSetSize += newMessageSize;
if (LOG.isTraceEnabled()) {
LOG.trace("Message {} contains bytes {}", bytesToHex(idBytes), bytesToHex(messageBytes));
}
}
// Flush the rest of the messages.
if (messages.size() > 0) {
lastWriteOffset = flushMessages(messages, payloadSize);
}
return lastWriteOffset;
}
}
private long flushMessages(List<Message> messages, long payloadSize) {
if (messages.isEmpty()) {
LOG.debug("No messages to flush, not trying to write an empty message set.");
return -1L;
}
final ByteBufferMessageSet messageSet = new ByteBufferMessageSet(JavaConversions.asScalaBuffer(messages).toSeq());
if (LOG.isDebugEnabled()) {
LOG.debug("Trying to write ByteBufferMessageSet with size of {} bytes to journal", messageSet.sizeInBytes());
}
final LogAppendInfo appendInfo = kafkaLog.append(messageSet, true);
long lastWriteOffset = appendInfo.lastOffset();
if (LOG.isDebugEnabled()) {
LOG.debug("Wrote {} messages to journal: {} bytes (payload {} bytes), log position {} to {}",
messages.size(), messageSet.sizeInBytes(), payloadSize, appendInfo.firstOffset(), lastWriteOffset);
}
writtenMessages.mark(messages.size());
return lastWriteOffset;
}
/**
* Writes a single message to the journal and returns the new write position
*
     * @param idBytes byte array containing the message id
* @param messageBytes encoded message payload
* @return the last position written to in the journal
*/
@Override
public long write(byte[] idBytes, byte[] messageBytes) {
final Entry journalEntry = createEntry(idBytes, messageBytes);
return write(Collections.singletonList(journalEntry));
}
@Override
public List<JournalReadEntry> read(long requestedMaximumCount) {
return read(nextReadOffset, requestedMaximumCount);
}
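    /**
     * Reads up to {@code requestedMaximumCount} messages from the journal, starting at the given offset.
     * <p>
     * At least one message is always requested. If the given offset lies before the start of the log, e.g.
     * because the segments containing it have already been deleted, reading restarts at the beginning of the
     * journal. While the journal is shutting down, an empty list is returned.
     * </p>
     *
     * @param readOffset            offset to start reading at
     * @param requestedMaximumCount maximum number of messages to read
     * @return the entries read from the journal, possibly empty
     */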
public List<JournalReadEntry> read(long readOffset, long requestedMaximumCount) {
// Always read at least one!
final long maximumCount = Math.max(1, requestedMaximumCount);
long maxOffset = readOffset + maximumCount;
if (shuttingDown) {
return Collections.emptyList();
}
final List<JournalReadEntry> messages = new ArrayList<>(Ints.saturatedCast(maximumCount));
try (Timer.Context ignored = readTime.time()) {
final long logStartOffset = getLogStartOffset();
if (readOffset < logStartOffset) {
LOG.info(
"Read offset {} before start of log at {}, starting to read from the beginning of the journal.",
readOffset,
logStartOffset);
readOffset = logStartOffset;
maxOffset = readOffset + maximumCount;
}
LOG.debug("Requesting to read a maximum of {} messages (or 5MB) from the journal, offset interval [{}, {})",
maximumCount, readOffset, maxOffset);
// TODO benchmark and make read-ahead strategy configurable for performance tuning
final MessageSet messageSet = kafkaLog.read(readOffset,
5 * 1024 * 1024,
Option.<Object>apply(maxOffset)).messageSet();
final Iterator<MessageAndOffset> iterator = messageSet.iterator();
long firstOffset = Long.MIN_VALUE;
long lastOffset = Long.MIN_VALUE;
long totalBytes = 0;
while (iterator.hasNext()) {
final MessageAndOffset messageAndOffset = iterator.next();
if (firstOffset == Long.MIN_VALUE) firstOffset = messageAndOffset.offset();
// always remember the last seen offset for debug purposes below
lastOffset = messageAndOffset.offset();
final byte[] payloadBytes = ByteBufferUtils.readBytes(messageAndOffset.message().payload());
if (LOG.isTraceEnabled()) {
final byte[] keyBytes = ByteBufferUtils.readBytes(messageAndOffset.message().key());
LOG.trace("Read message {} contains {}", bytesToHex(keyBytes), bytesToHex(payloadBytes));
}
totalBytes += payloadBytes.length;
messages.add(new JournalReadEntry(payloadBytes, messageAndOffset.offset()));
// remember where to read from
nextReadOffset = messageAndOffset.nextOffset();
}
if (messages.isEmpty()) {
LOG.debug("No messages available to read for offset interval [{}, {}).", readOffset, maxOffset);
} else {
LOG.debug(
"Read {} messages, total payload size {}, from journal, offset interval [{}, {}], requested read at {}",
messages.size(),
totalBytes,
firstOffset,
lastOffset,
readOffset);
}
} catch (OffsetOutOfRangeException e) {
            // This is fine, the reader tried to read faster than the writer has committed data. The next read will get the data.
LOG.debug("Offset out of range, no messages available starting at offset {}", readOffset);
} catch (Exception e) {
            // The Scala code does not declare the IOException in kafkaLog.read(), so we can't catch it here. Sigh.
if (shuttingDown) {
LOG.debug("Caught exception during shutdown, ignoring it because we might have been blocked on a read.");
return Collections.emptyList();
}
//noinspection ConstantConditions
if (e instanceof ClosedByInterruptException) {
LOG.debug("Interrupted while reading from journal, during shutdown this is harmless and ignored.", e);
} else {
throw e;
}
}
readMessages.mark(messages.size());
return messages;
}
/**
* Upon fully processing, and persistently storing, a batch of messages, the system should mark the message with the
* highest offset as committed. A background job will write the last position to disk periodically.
*
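     * <p>
     * A sketch of a typical consumer loop, assuming the usual {@code JournalReadEntry#getOffset()} accessor;
     * processing and error handling are elided:
     * </p>
     * <pre>{@code
     * final List<Journal.JournalReadEntry> batch = journal.read(1000);
     * // ... process and persistently store the batch ...
     * if (!batch.isEmpty()) {
     *     journal.markJournalOffsetCommitted(Iterables.getLast(batch).getOffset());
     * }
     * }</pre>
     *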
* @param offset the offset of the latest committed message
*/
@Override
public void markJournalOffsetCommitted(long offset) {
long prev;
        // Callers may commit offsets out of order, so make sure the committed offset never moves backwards.
int i = 0;
do {
prev = committedOffset.get();
            // At least warn if this spins often; that would be a sign of very high contention, which should not happen.
            if (++i % 10 == 0) {
                LOG.warn("Committing journal offset has spun {} times now, this might be a bug. Continuing to retry the update.",
                        i);
}
} while (!committedOffset.compareAndSet(prev, Math.max(offset, prev)));
}
    /**
     * A Java transliteration of what the Scala implementation does; unfortunately the original is declared private.
     */
protected void flushDirtyLogs() {
LOG.debug("Checking for dirty logs to flush...");
final Set<Map.Entry<TopicAndPartition, Log>> entries = JavaConversions.mapAsJavaMap(logManager.logsByTopicPartition()).entrySet();
for (final Map.Entry<TopicAndPartition, Log> topicAndPartitionLogEntry : entries) {
final TopicAndPartition topicAndPartition = topicAndPartitionLogEntry.getKey();
final Log kafkaLog = topicAndPartitionLogEntry.getValue();
final long timeSinceLastFlush = JODA_TIME.milliseconds() - kafkaLog.lastFlushTime();
try {
LOG.debug(
"Checking if flush is needed on {} flush interval {} last flushed {} time since last flush: {}",
topicAndPartition.topic(),
kafkaLog.config().flushInterval(),
kafkaLog.lastFlushTime(),
timeSinceLastFlush);
if (timeSinceLastFlush >= kafkaLog.config().flushMs()) {
kafkaLog.flush();
}
} catch (Exception e) {
LOG.error("Error flushing topic " + topicAndPartition.topic(), e);
}
}
}
public long getCommittedOffset() {
return committedOffset.get();
}
public long getNextReadOffset() {
return nextReadOffset;
}
@Override
protected void startUp() throws Exception {
        // Do NOT let Kafka's LogManager create its management threads, we will run them ourselves.
        // The problem is that we can't reliably decorate or subclass them, so we peel the methods out and call
        // them ourselves. It sucks, but I haven't found a better way yet.
// /* don't call */ logManager.startup();
// flush dirty logs regularly
dirtyLogFlushFuture = scheduler.scheduleAtFixedRate(dirtyLogFlusher,
SECONDS.toMillis(30),
logManager.flushCheckMs(),
MILLISECONDS);
// write recovery checkpoint files
checkpointFlusherFuture = scheduler.scheduleAtFixedRate(recoveryCheckpointFlusher,
SECONDS.toMillis(30),
logManager.flushCheckpointMs(),
MILLISECONDS);
// custom log retention cleaner
logRetentionFuture = scheduler.scheduleAtFixedRate(logRetentionCleaner,
SECONDS.toMillis(30),
logManager.retentionCheckMs(),
MILLISECONDS);
// regularly write the currently committed read offset to disk
offsetFlusherFuture = scheduler.scheduleAtFixedRate(offsetFlusher, 1, 1, SECONDS);
}
@Override
protected void shutDown() throws Exception {
LOG.debug("Shutting down journal!");
shuttingDown = true;
offsetFlusherFuture.cancel(false);
logRetentionFuture.cancel(false);
checkpointFlusherFuture.cancel(false);
dirtyLogFlushFuture.cancel(false);
kafkaScheduler.shutdown();
logManager.shutdown();
// final flush
offsetFlusher.run();
}
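    /**
     * Runs the log retention cleaner once, outside of its regular schedule, and returns the number of deleted
     * segments. Returns 0 if the cleanup fails.
     */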
public int cleanupLogs() {
try {
return logRetentionCleaner.call();
} catch (Exception e) {
LOG.error("Unable to delete expired segments.", e);
return 0;
}
}
    // visible for tests
public Iterable<LogSegment> getSegments() {
return JavaConversions.asJavaIterable(kafkaLog.logSegments());
}
/**
     * Returns the journal size in bytes, excluding index files.
*
* @return journal size in bytes
*/
public long size() {
return kafkaLog.size();
}
/**
* Returns the number of segments this journal consists of.
*
* @return number of segments
*/
public int numberOfSegments() {
return kafkaLog.numberOfSegments();
}
/**
     * Returns the highest journal offset that has been written to persistent storage by Graylog.
* <p>
* Every message at an offset prior to this one can be considered as processed and does not need to be held in
* the journal any longer. By default Graylog will try to aggressively flush the journal to consume a smaller
* amount of disk space.
* </p>
*
* @return the offset of the last message which has been successfully processed.
*/
public long getCommittedReadOffset() {
return committedOffset.get();
}
    /**
     * Discards all data in the journal after the given offset.
     *
     * @param offset offset to truncate to, so that no offset in the journal is larger than this.
     */
public void truncateTo(long offset) {
kafkaLog.truncateTo(offset);
}
/**
* Returns the first valid offset in the entire journal.
*
* @return first offset
*/
public long getLogStartOffset() {
final Iterable<LogSegment> logSegments = JavaConversions.asJavaIterable(kafkaLog.logSegments());
final LogSegment segment = Iterables.getFirst(logSegments, null);
if (segment == null) {
return 0;
}
return segment.baseOffset();
}
/**
     * Returns the offset of the next value to be inserted in the entire journal.
*
* @return the next offset value (last valid offset is this number - 1)
*/
public long getLogEndOffset() {
return kafkaLog.logEndOffset();
}
/**
* For informational purposes this method provides access to the current state of the journal.
*
* @return the journal state for throttling purposes
*/
public ThrottleState getThrottleState() {
return throttleState.get();
}
public void setThrottleState(ThrottleState state) {
throttleState.set(state);
}
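    /**
     * Persists the committed read offset to the offset file and syncs it to disk, so that the read position
     * survives a restart. Scheduled once per second and run a final time during shutdown.
     */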
public class OffsetFileFlusher implements Runnable {
@Override
public void run() {
// Do not write the file if committedOffset has never been updated.
if (committedOffset.get() == DEFAULT_COMMITTED_OFFSET) {
return;
}
try (final FileOutputStream fos = new FileOutputStream(committedReadOffsetFile)) {
fos.write(String.valueOf(committedOffset.get()).getBytes(StandardCharsets.UTF_8));
// flush stream
fos.flush();
// actually sync to disk
fos.getFD().sync();
} catch (SyncFailedException e) {
LOG.error("Cannot sync " + committedReadOffsetFile.getAbsolutePath() + " to disk. Continuing anyway," +
" but there is no guarantee that the file has been written.", e);
} catch (IOException e) {
LOG.error("Cannot write " + committedReadOffsetFile.getAbsolutePath() + " to disk.", e);
}
}
}
/**
* Java implementation of the Kafka log retention cleaner.
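     * <p>
     * Each run combines three strategies: deleting segments that are older than the retention age, deleting
     * segments until the log fits into the retention size, and deleting segments whose messages have all been
     * committed already.
     * </p>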
*/
public class LogRetentionCleaner implements Runnable, Callable<Integer> {
private final Logger loggerForCleaner = LoggerFactory.getLogger(LogRetentionCleaner.class);
@Override
public void run() {
try {
call();
} catch (Exception e) {
loggerForCleaner.error("Unable to delete expired segments. Will try again.", e);
}
}
@Override
public Integer call() throws Exception {
loggerForCleaner.debug("Beginning log cleanup");
int total = 0;
final Timer.Context ctx = new Timer().time();
for (final Log kafkaLog : JavaConversions.asJavaIterable(logManager.allLogs())) {
if (kafkaLog.config().compact()) continue;
loggerForCleaner.debug("Garbage collecting {}", kafkaLog.name());
total += cleanupExpiredSegments(kafkaLog) +
cleanupSegmentsToMaintainSize(kafkaLog) +
cleanupSegmentsToRemoveCommitted(kafkaLog);
}
loggerForCleaner.debug("Log cleanup completed. {} files deleted in {} seconds",
total,
NANOSECONDS.toSeconds(ctx.stop()));
return total;
}
private int cleanupExpiredSegments(final Log kafkaLog) {
// don't run if nothing will be done
if (kafkaLog.size() == 0 && kafkaLog.numberOfSegments() < 1) {
KafkaJournal.this.purgedSegmentsInLastRetention.set(0);
return 0;
}
int deletedSegments = kafkaLog.deleteOldSegments(new AbstractFunction1<LogSegment, Object>() {
@Override
public Object apply(LogSegment segment) {
final long segmentAge = JODA_TIME.milliseconds() - segment.lastModified();
final boolean shouldDelete = segmentAge > kafkaLog.config().retentionMs();
if (shouldDelete) {
loggerForCleaner.debug(
"[cleanup-time] Removing segment with age {}s, older than then maximum retention age {}s",
MILLISECONDS.toSeconds(segmentAge),
MILLISECONDS.toSeconds(kafkaLog.config().retentionMs()));
}
return shouldDelete;
}
});
KafkaJournal.this.purgedSegmentsInLastRetention.set(deletedSegments);
return deletedSegments;
}
        /**
         * Changes the load balancer status from ALIVE to THROTTLED, or vice versa, depending on the
         * journal utilization percentage. As the utilization ratio is only reliable after cleanup,
         * that's where this is called from.
         */
private void updateLoadBalancerStatus(double utilizationPercentage) {
final LoadBalancerStatus currentStatus = serverStatus.getLifecycle().getLoadbalancerStatus();
// Flip the status. The next lifecycle events may change status. This should be good enough, because
// throttling does not offer hard guarantees.
if (currentStatus == LoadBalancerStatus.THROTTLED && utilizationPercentage < throttleThresholdPercentage) {
serverStatus.running();
LOG.info(String.format(Locale.ENGLISH,
"Journal usage is %.2f%% (threshold %d%%), changing load balancer status from THROTTLED to ALIVE",
utilizationPercentage, throttleThresholdPercentage));
} else if (currentStatus == LoadBalancerStatus.ALIVE && utilizationPercentage >= throttleThresholdPercentage) {
serverStatus.throttle();
LOG.info(String.format(Locale.ENGLISH,
"Journal usage is %.2f%% (threshold %d%%), changing load balancer status from ALIVE to THROTTLED",
utilizationPercentage, throttleThresholdPercentage));
}
}
private int cleanupSegmentsToMaintainSize(Log kafkaLog) {
final long retentionSize = kafkaLog.config().retentionSize();
final long currentSize = kafkaLog.size();
            // Multiply by 100.0 to force floating-point division; plain integer division would truncate the percentage.
            final double utilizationPercentage = retentionSize > 0 ? (currentSize * 100.0) / retentionSize : 0.0;
if (utilizationPercentage > KafkaJournal.NOTIFY_ON_UTILIZATION_PERCENTAGE) {
LOG.warn("Journal utilization ({}%) has gone over {}%.", utilizationPercentage,
KafkaJournal.NOTIFY_ON_UTILIZATION_PERCENTAGE);
}
updateLoadBalancerStatus(utilizationPercentage);
if (retentionSize < 0 || currentSize < retentionSize) {
KafkaJournal.this.purgedSegmentsInLastRetention.set(0);
return 0;
}
final long[] diff = {currentSize - retentionSize};
int deletedSegments = kafkaLog.deleteOldSegments(new AbstractFunction1<LogSegment, Object>() { // sigh scala
@Override
public Object apply(LogSegment segment) {
if (diff[0] - segment.size() >= 0) {
diff[0] -= segment.size();
loggerForCleaner.debug(
"[cleanup-size] Removing segment starting at offset {}, size {} bytes, to shrink log to new size {}, target size {}",
segment.baseOffset(),
segment.size(),
diff[0],
retentionSize);
return true;
} else {
return false;
}
}
});
KafkaJournal.this.purgedSegmentsInLastRetention.set(deletedSegments);
return deletedSegments;
}
private int cleanupSegmentsToRemoveCommitted(Log kafkaLog) {
if (kafkaLog.numberOfSegments() <= 1) {
loggerForCleaner.debug(
"[cleanup-committed] The journal is already minimal at {} segment(s), not trying to remove more segments.",
kafkaLog.numberOfSegments());
return 0;
}
            // We need to iterate through all segments to find the cutoff point for the committed offset.
            // Unfortunately, finding the largest offset contained in a segment is expensive (it involves reading the entire file),
            // so we have to get a global view.
final long committedOffset = KafkaJournal.this.committedOffset.get();
final HashSet<LogSegment> logSegments = Sets.newHashSet(
JavaConversions.asJavaIterable(kafkaLog.logSegments(committedOffset, Long.MAX_VALUE))
);
loggerForCleaner.debug("[cleanup-committed] Keeping segments {}", logSegments);
return kafkaLog.deleteOldSegments(new AbstractFunction1<LogSegment, Object>() {
@Override
public Object apply(LogSegment segment) {
final boolean shouldDelete = !logSegments.contains(segment);
if (shouldDelete) {
loggerForCleaner.debug(
"[cleanup-committed] Should delete segment {} because it is prior to committed offset {}",
segment,
committedOffset);
}
return shouldDelete;
}
});
}
}
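    /**
     * Periodically writes Kafka's recovery point checkpoint file, limiting how much of the log has to be
     * re-validated after an unclean shutdown.
     */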
public class RecoveryCheckpointFlusher implements Runnable {
@Override
public void run() {
try {
logManager.checkpointRecoveryPointOffsets();
} catch (Exception e) {
LOG.error("Unable to flush checkpoint recovery point offsets. Will try again.", e);
}
}
}
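    /**
     * Periodically invokes {@link KafkaJournal#flushDirtyLogs()} to flush logs whose unflushed data is older
     * than the configured flush age.
     */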
public class DirtyLogFlusher implements Runnable {
@Override
public void run() {
try {
flushDirtyLogs();
} catch (Exception e) {
LOG.error("Unable to flush dirty logs. Will try again.", e);
}
}
}
}