/*
* Copyright (c) 2015 Spotify AB.
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.spotify.heroic.consumer.kafka;
import com.spotify.heroic.consumer.ConsumerSchema;
import com.spotify.heroic.consumer.ConsumerSchemaValidationException;
import com.spotify.heroic.statistics.ConsumerReporter;
import com.spotify.heroic.statistics.FutureReporter;
import com.spotify.heroic.time.Clock;
import eu.toolchain.async.AsyncFramework;
import eu.toolchain.async.AsyncFuture;
import eu.toolchain.async.ResolvableFuture;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.atomic.LongAdder;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
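
/**
 * A single consumer thread that pulls messages from a {@link KafkaStream} and feeds them to a
 * {@link ConsumerSchema.Consumer}. Messages that fail to consume are retried with exponential
 * back-off, and the thread can be paused, resumed, and shut down cooperatively. When periodic
 * offset commits are enabled, the thread tracks outstanding consumption requests and asks the
 * {@link ConsumerThreadCoordinator} to commit once all of them have finished.
 *
 * <p>A rough sketch of the expected lifecycle, where the collaborator instances are illustrative
 * placeholders rather than part of this class:
 *
 * <pre>{@code
 * final ConsumerThread thread = new ConsumerThread(async, clock, "topic:partition-0", reporter,
 *     stream, schema, active, errors, consumed, true, 30_000L, nextOffsetsCommitTS);
 * thread.setCoordinator(coordinator);
 * thread.start();
 * // ... later: shutdown() returns an AsyncFuture<Void> that resolves once the thread has stopped.
 * thread.shutdown();
 * }</pre>
 */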
@Slf4j
public final class ConsumerThread extends Thread {
// Retry back-off bounds, in seconds (see handleRetry).
private static final long RETRY_INITIAL_SLEEP = 5;
private static final long RETRY_MAX_SLEEP = 40;
private final AsyncFramework async;
private final Clock clock;
private final String name;
private final ConsumerReporter reporter;
private final KafkaStream<byte[]> stream;
private final ConsumerSchema.Consumer schema;
private final AtomicInteger active;
private final AtomicLong errors;
private final LongAdder consumed;
// Use a latch as a stop signal so that we can block on it instead of Thread#sleep (or similar),
// which would be a pain to interrupt.
private final CountDownLatch shouldStop = new CountDownLatch(1);
protected final ResolvableFuture<Void> hasStopped;
// Non-null while consumption should be paused; the consumer thread blocks on the contained latch,
// which is counted down whenever the pause state changes.
private final AtomicReference<CountDownLatch> shouldPause = new AtomicReference<>();
private final boolean enablePeriodicCommit;
private final AtomicLong outstandingConsumptionRequests = new AtomicLong(0);
private final long periodicCommitInterval;
// Timestamp specifying when the next consumer commit should happen
private final AtomicLong nextOffsetsCommitTSGlobal;
// Thread-local copy of the above timestamp
private long nextOffsetsCommitTSThreadLocal;
@Setter
private ConsumerThreadCoordinator coordinator;
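/**
 * Create a consumer thread. The thread is not started here and no coordinator is set; when
 * periodic commits are enabled, a {@link ConsumerThreadCoordinator} must be supplied through the
 * coordinator setter before the thread starts consuming.
 */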
public ConsumerThread(
final AsyncFramework async, final Clock clock, final String name,
final ConsumerReporter reporter, final KafkaStream<byte[]> stream,
final ConsumerSchema.Consumer schema, final AtomicInteger active, final AtomicLong errors,
final LongAdder consumed, final boolean enablePeriodicCommit,
final long periodicCommitInterval, final AtomicLong nextOffsetsCommitTSGlobal
) {
super(String.format("%s: %s", ConsumerThread.class.getCanonicalName(), name));
this.async = async;
this.clock = clock;
this.name = name;
this.reporter = reporter;
this.stream = stream;
this.schema = schema;
this.active = active;
this.errors = errors;
this.consumed = consumed;
this.enablePeriodicCommit = enablePeriodicCommit;
this.periodicCommitInterval = periodicCommitInterval;
this.nextOffsetsCommitTSGlobal = nextOffsetsCommitTSGlobal;
this.nextOffsetsCommitTSThreadLocal = this.nextOffsetsCommitTSGlobal.get();
this.coordinator = null;
this.hasStopped = async.future();
}
@Override
public void run() {
log.info("{}: Starting thread", name);
threadIsStarting();
try {
guardedRun();
} catch (final Throwable e) {
log.error("{}: Error in thread", name, e);
threadIsStopping();
hasStopped.fail(e);
return;
}
log.info("{}: Stopping thread", name);
threadIsStopping();
hasStopped.resolve(null);
}
private void threadIsStarting() {
active.incrementAndGet();
reporter.reportConsumerThreadsIncrement();
}
private void threadIsStopping() {
active.decrementAndGet();
reporter.reportConsumerThreadsDecrement();
}
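/**
 * Request that this thread pause consumption after the message currently being processed. The
 * returned future resolves immediately; it does not wait for the thread to reach the paused
 * state.
 */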
public AsyncFuture<Void> pauseConsumption() {
final CountDownLatch old = this.shouldPause.getAndSet(new CountDownLatch(1));
if (old != null) {
old.countDown();
}
return async.resolved();
}
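/**
 * Request that this thread resume consumption, waking it up if it is currently paused.
 */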
public AsyncFuture<Void> resumeConsumption() {
final CountDownLatch old = this.shouldPause.getAndSet(null);
if (old != null) {
old.countDown();
}
return async.resolved();
}
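/**
 * @return true if a pause has been requested and not yet cleared by resume or shutdown.
 */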
public boolean isPausing() {
return this.shouldPause.get() != null;
}
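/**
 * @return the number of consumption requests that have been issued but have not yet finished.
 */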
public long getNumOutstandingRequests() {
return outstandingConsumptionRequests.get();
}
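/**
 * Signal this thread to stop and release it from any pause. The returned future resolves (or
 * fails) once the thread has actually stopped.
 */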
public AsyncFuture<Void> shutdown() {
shouldStop.countDown();
final CountDownLatch old = this.shouldPause.getAndSet(null);
if (old != null) {
old.countDown();
}
return hasStopped;
}
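/* Main consumption loop: iterate over the Kafka stream until asked to stop or until the stream
 * yields null, which Kafka uses to signal that the connection is closing. */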
private void guardedRun() throws Exception {
for (final byte[] messageBody : stream.messageIterable()) {
if (shouldStop.getCount() == 0 || messageBody == null) {
// Kafka will send a null message when connection is closing
break;
}
consumeOneWithRetry(messageBody);
maybePause();
if (shouldStop.getCount() == 0) {
break;
}
}
}
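/* If a pause has been requested, block here until resumed or asked to stop. */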
private void maybePause() throws InterruptedException {
CountDownLatch p = shouldPause.get();
if (p == null) {
return;
}
log.info("Pausing");
/* Block until resumed or told to stop; re-check shouldPause since concurrent calls to
 * {@link #pauseConsumption()} may have swapped in a new latch. */
while (p != null && shouldStop.getCount() > 0) {
p.await();
p = shouldPause.get();
}
log.info("Resuming");
}
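/* Consume a single message, retrying with exponential back-off (capped at RETRY_MAX_SLEEP
 * seconds) until it succeeds or the thread is asked to stop. */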
private void consumeOneWithRetry(final byte[] body) throws InterruptedException {
long sleep = RETRY_INITIAL_SLEEP;
while (shouldStop.getCount() > 0) {
final boolean retry = consumeOne(body);
if (retry) {
handleRetry(sleep);
sleep = Math.min(sleep * 2, RETRY_MAX_SLEEP);
continue;
}
break;
}
}
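/* Attempt to consume a single message. Returns true if consumption failed and should be retried,
 * false if the message was consumed or should be dropped (e.g. on a schema validation error). */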
private boolean consumeOne(final byte[] body) {
try {
/* We have read something. This is a good time to check if we should prepare to commit.
* Why is it a good time? Because if we pause now, then send off one more consumption
* request, then we know that there will be _at least one_ write being finished sometime
* soon so we can use onFinished() to do the commit. */
maybePrepareToCommitConsumerOffsets();
final FutureReporter.Context consumptionContext = reporter.reportConsumption();
// Actually consume
final AsyncFuture<Void> future = schema.consume(body);
if (enablePeriodicCommit) {
outstandingConsumptionRequests.incrementAndGet();
future.onFinished(() -> {
long value = outstandingConsumptionRequests.decrementAndGet();
if (value == 0) {
// If applicable, commit consumer offsets
coordinator.commitConsumerOffsets();
}
});
}
future.onDone(consumptionContext);
reporter.reportMessageSize(body.length);
consumed.increment();
return false;
} catch (final ConsumerSchemaValidationException e) {
/* these messages should be ignored */
reporter.reportConsumerSchemaError();
return false;
} catch (final Exception e) {
errors.incrementAndGet();
log.error("{}: Failed to consume", name, e);
reporter.reportMessageError();
return true;
}
}
/* nextOffsetsCommitTSGlobal holds the timestamp at which consumer offsets should next be
 * committed. This method first checks a thread-local cached copy of that timestamp to keep the
 * common case fast. Only when the cached copy says it is time to commit do we consult the actual
 * nextOffsetsCommitTSGlobal, to see whether another thread has already performed the commit and
 * pushed the timestamp forward.
 */
private boolean maybePrepareToCommitConsumerOffsets() {
if (!enablePeriodicCommit) {
return false;
}
final long currTS = clock.currentTimeMillis();
if (nextOffsetsCommitTSThreadLocal > currTS) {
return false;
}
/* Thread local cached copy of nextOffsetsCommitTSGlobal says that it's time. Double check
* with the actual nextOffsetsCommitTSGlobal. */
synchronized (nextOffsetsCommitTSGlobal) {
// We now have exclusive access to nextOffsetsCommitTSGlobal
final long nextOffsetsCommitTSGlobalValue = nextOffsetsCommitTSGlobal.get();
nextOffsetsCommitTSThreadLocal = nextOffsetsCommitTSGlobalValue;
if (nextOffsetsCommitTSGlobalValue > currTS) {
return false;
}
final long nextTS = currTS + periodicCommitInterval;
nextOffsetsCommitTSGlobal.set(nextTS);
nextOffsetsCommitTSThreadLocal = nextTS;
/* This will ask all threads to pause and will set us up for committing when all writes
* are done. */
coordinator.prepareToCommitConsumerOffsets();
return true;
}
}
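/* Back off before the next consumption attempt while staying responsive to shutdown, and flag the
 * thread as inactive for the duration so the service reports the failure. */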
private void handleRetry(final long sleep) throws InterruptedException {
log.info("{}: Retrying in {} second(s)", name, sleep);
/* Decrementing the number of active consumers indicates an error to the consumer module. This
 * makes sure that the status of the service is set to 'failing'. */
active.decrementAndGet();
shouldStop.await(sleep, TimeUnit.SECONDS);
active.incrementAndGet();
}
}