/** * Copyright 2016 Yahoo Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.yahoo.pulsar.broker.service.persistent; import java.util.List; import java.util.Set; import java.util.TreeSet; import java.util.concurrent.CompletableFuture; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicIntegerFieldUpdater; import org.apache.bookkeeper.mledger.AsyncCallbacks.ReadEntriesCallback; import org.apache.bookkeeper.mledger.Entry; import org.apache.bookkeeper.mledger.ManagedCursor; import org.apache.bookkeeper.mledger.ManagedLedgerException; import org.apache.bookkeeper.mledger.ManagedLedgerException.TooManyRequestsException; import org.apache.bookkeeper.mledger.Position; import org.apache.bookkeeper.mledger.impl.PositionImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.carrotsearch.hppc.ObjectHashSet; import com.carrotsearch.hppc.ObjectSet; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Sets; import com.yahoo.pulsar.broker.service.BrokerServiceException; import com.yahoo.pulsar.broker.service.Consumer; import com.yahoo.pulsar.broker.service.Dispatcher; import com.yahoo.pulsar.client.impl.Backoff; import com.yahoo.pulsar.common.api.proto.PulsarApi.CommandSubscribe.SubType; import com.yahoo.pulsar.common.util.Codec; import com.yahoo.pulsar.utils.CopyOnWriteArrayList; /** */ public class PersistentDispatcherMultipleConsumers implements Dispatcher, ReadEntriesCallback { private static final int MaxReadBatchSize = 100; private static final int MaxRoundRobinBatchSize = 20; private final PersistentTopic topic; private final ManagedCursor cursor; private final CopyOnWriteArrayList<Consumer> consumerList = new CopyOnWriteArrayList<>(); private final ObjectSet<Consumer> consumerSet = new ObjectHashSet<>(); private CompletableFuture<Void> closeFuture = null; private TreeSet<PositionImpl> messagesToReplay; private int currentConsumerRoundRobinIndex = 0; private boolean havePendingRead = false; private boolean havePendingReplayRead = false; private boolean shouldRewindBeforeReadingOrReplaying = false; private final String name; private int totalAvailablePermits = 0; private int readBatchSize; private final Backoff readFailureBackoff = new Backoff(15, TimeUnit.SECONDS, 1, TimeUnit.MINUTES); private static final int FALSE = 0; private static final int TRUE = 1; private static final AtomicIntegerFieldUpdater<PersistentDispatcherMultipleConsumers> IS_CLOSED_UPDATER = AtomicIntegerFieldUpdater.newUpdater(PersistentDispatcherMultipleConsumers.class, "isClosed"); private volatile int isClosed = FALSE; private static final AtomicIntegerFieldUpdater<PersistentDispatcherMultipleConsumers> TOTAL_UNACKED_MESSAGES_UPDATER = AtomicIntegerFieldUpdater.newUpdater(PersistentDispatcherMultipleConsumers.class, "totalUnackedMessages"); private volatile int totalUnackedMessages = 0; private final int maxUnackedMessages; private volatile int blockedDispatcherOnUnackedMsgs = FALSE; private static final AtomicIntegerFieldUpdater<PersistentDispatcherMultipleConsumers> BLOCKED_DISPATCHER_ON_UNACKMSG_UPDATER = AtomicIntegerFieldUpdater.newUpdater(PersistentDispatcherMultipleConsumers.class, "blockedDispatcherOnUnackedMsgs"); enum ReadType { Normal, Replay } public PersistentDispatcherMultipleConsumers(PersistentTopic topic, ManagedCursor cursor) { this.cursor = cursor; this.name = topic.getName() + " / " + Codec.decode(cursor.getName()); this.topic = topic; this.messagesToReplay = Sets.newTreeSet(); this.readBatchSize = MaxReadBatchSize; this.maxUnackedMessages = topic.getBrokerService().pulsar().getConfiguration() .getMaxUnackedMessagesPerSubscription(); } @Override public synchronized void addConsumer(Consumer consumer) { if (IS_CLOSED_UPDATER.get(this) == TRUE) { log.warn("[{}] Dispatcher is already closed. Closing consumer ", name, consumer); consumer.disconnect(); return; } if (consumerList.isEmpty()) { if (havePendingRead || havePendingReplayRead) { // There is a pending read from previous run. We must wait for it to complete and then rewind shouldRewindBeforeReadingOrReplaying = true; } else { cursor.rewind(); shouldRewindBeforeReadingOrReplaying = false; } messagesToReplay.clear(); } consumerList.add(consumer); consumerList.sort((c1, c2) -> c1.getPriorityLevel() - c2.getPriorityLevel()); consumerSet.add(consumer); } @Override public synchronized void removeConsumer(Consumer consumer) throws BrokerServiceException { // decrement unack-message count for removed consumer addUnAckedMessages(-consumer.getUnackedMessages()); if (consumerSet.removeAll(consumer) == 1) { consumerList.remove(consumer); log.info("Removed consumer {} with pending {} acks", consumer, consumer.getPendingAcks().size()); if (consumerList.isEmpty()) { if (havePendingRead && cursor.cancelPendingReadRequest()) { havePendingRead = false; } messagesToReplay.clear(); if (closeFuture != null) { log.info("[{}] All consumers removed. Subscription is disconnected", name); closeFuture.complete(null); } totalAvailablePermits = 0; } else { if (log.isDebugEnabled()) { log.debug("[{}] Consumer are left, reading more entries", name); } consumer.getPendingAcks().forEach((ledgerId, entryId, batchSize, none) -> { messagesToReplay.add(new PositionImpl(ledgerId, entryId)); }); totalAvailablePermits -= consumer.getAvailablePermits(); readMoreEntries(); } } else { if (log.isDebugEnabled()) { log.debug("[{}] Trying to remove a non-connected consumer: {}", name, consumer); } } } @Override public synchronized void consumerFlow(Consumer consumer, int additionalNumberOfMessages) { if (!consumerSet.contains(consumer)) { if (log.isDebugEnabled()) { log.debug("[{}] Ignoring flow control from disconnected consumer {}", name, consumer); } return; } totalAvailablePermits += additionalNumberOfMessages; if (log.isDebugEnabled()) { log.debug("[{}] Trigger new read after receiving flow control message", consumer); } readMoreEntries(); } private void readMoreEntries() { if (totalAvailablePermits > 0 && isAtleastOneConsumerAvailable()) { int messagesToRead = Math.min(totalAvailablePermits, readBatchSize); if (!messagesToReplay.isEmpty()) { if (havePendingReplayRead) { log.debug("[{}] Skipping replay while awaiting previous read to complete", name); return; } Set<PositionImpl> messagesToReplayNow = ImmutableSet .copyOf(Iterables.limit(messagesToReplay, messagesToRead)); if (log.isDebugEnabled()) { log.debug("[{}] Schedule replay of {} messages for {} consumers", name, messagesToReplayNow.size(), consumerList.size()); } havePendingReplayRead = true; Set<? extends Position> deletedMessages = cursor.asyncReplayEntries(messagesToReplayNow, this, ReadType.Replay); // clear already acked positions from replay bucket messagesToReplay.removeAll(deletedMessages); // if all the entries are acked-entries and cleared up from messagesToReplay, try to read // next entries as readCompletedEntries-callback was never called if ((messagesToReplayNow.size() - deletedMessages.size()) == 0) { havePendingReplayRead = false; readMoreEntries(); } } else if (BLOCKED_DISPATCHER_ON_UNACKMSG_UPDATER.get(this) == TRUE) { log.warn("[{}] Dispatcher read is blocked due to unackMessages {} reached to max {}", name, TOTAL_UNACKED_MESSAGES_UPDATER.get(this), maxUnackedMessages); } else if (!havePendingRead) { if (log.isDebugEnabled()) { log.debug("[{}] Schedule read of {} messages for {} consumers", name, messagesToRead, consumerList.size()); } havePendingRead = true; cursor.asyncReadEntriesOrWait(messagesToRead, this, ReadType.Normal); } else { log.debug("[{}] Cannot schedule next read until previous one is done", name); } } else { if (log.isDebugEnabled()) { log.debug("[{}] Consumer buffer is full, pause reading", name); } } } @Override public boolean isConsumerConnected() { return !consumerList.isEmpty(); } @Override public CopyOnWriteArrayList<Consumer> getConsumers() { return consumerList; } @Override public synchronized boolean canUnsubscribe(Consumer consumer) { return consumerList.size() == 1 && consumerSet.contains(consumer); } @Override public CompletableFuture<Void> close() { IS_CLOSED_UPDATER.set(this, TRUE); return disconnectAllConsumers(); } @Override public synchronized CompletableFuture<Void> disconnectAllConsumers() { closeFuture = new CompletableFuture<>(); if (consumerList.isEmpty()) { closeFuture.complete(null); } else { consumerList.forEach(Consumer::disconnect); if (havePendingRead && cursor.cancelPendingReadRequest()) { havePendingRead = false; } } return closeFuture; } @Override public void reset() { IS_CLOSED_UPDATER.set(this, FALSE); } @Override public SubType getType() { return SubType.Shared; } @Override public synchronized void readEntriesComplete(List<Entry> entries, Object ctx) { ReadType readType = (ReadType) ctx; int start = 0; int entriesToDispatch = entries.size(); if (readType == ReadType.Normal) { havePendingRead = false; } else { havePendingReplayRead = false; } if (readBatchSize < MaxReadBatchSize) { int newReadBatchSize = Math.min(readBatchSize * 2, MaxReadBatchSize); if (log.isDebugEnabled()) { log.debug("[{}] Increasing read batch size from {} to {}", name, readBatchSize, newReadBatchSize); } readBatchSize = newReadBatchSize; } readFailureBackoff.reduceToHalf(); if (shouldRewindBeforeReadingOrReplaying && readType == ReadType.Normal) { // All consumers got disconnected before the completion of the read operation entries.forEach(Entry::release); cursor.rewind(); shouldRewindBeforeReadingOrReplaying = false; readMoreEntries(); return; } if (log.isDebugEnabled()) { log.debug("[{}] Distributing {} messages to {} consumers", name, entries.size(), consumerList.size()); } while (entriesToDispatch > 0 && totalAvailablePermits > 0 && isAtleastOneConsumerAvailable()) { Consumer c = getNextConsumer(); if (c == null) { // Do nothing, cursor will be rewind at reconnection entries.subList(start, entries.size()).forEach(Entry::release); cursor.rewind(); return; } // round-robin dispatch batch size for this consumer int messagesForC = Math.min(Math.min(entriesToDispatch, c.getAvailablePermits()), MaxRoundRobinBatchSize); if (messagesForC > 0) { // remove positions first from replay list first : sendMessages recycles entries if (readType == ReadType.Replay) { entries.subList(start, start + messagesForC).forEach(entry -> { messagesToReplay.remove((PositionImpl) entry.getPosition()); }); } int msgSent = c.sendMessages(entries.subList(start, start + messagesForC)).getRight(); start += messagesForC; entriesToDispatch -= messagesForC; totalAvailablePermits -= msgSent; } } if (entriesToDispatch > 0) { if (log.isDebugEnabled()) { log.debug("[{}] No consumers found with available permits, storing {} positions for later replay", name, entries.size() - start); } entries.subList(start, entries.size()).forEach(entry -> { messagesToReplay.add((PositionImpl) entry.getPosition()); entry.release(); }); } readMoreEntries(); } @Override public synchronized void readEntriesFailed(ManagedLedgerException exception, Object ctx) { ReadType readType = (ReadType) ctx; long waitTimeMillis = readFailureBackoff.next(); if (!(exception instanceof TooManyRequestsException)) { log.error("[{}] Error reading entries at {} : {}, Read Type {} - Retrying to read in {} seconds", name, cursor.getReadPosition(), exception.getMessage(), readType, waitTimeMillis / 1000.0); } else { if (log.isDebugEnabled()) { log.debug("[{}] Error reading entries at {} : {}, Read Type {} - Retrying to read in {} seconds", name, cursor.getReadPosition(), exception.getMessage(), readType, waitTimeMillis / 1000.0); } } if (shouldRewindBeforeReadingOrReplaying) { shouldRewindBeforeReadingOrReplaying = false; cursor.rewind(); } if (readType == ReadType.Normal) { havePendingRead = false; } else { havePendingReplayRead = false; if (exception instanceof ManagedLedgerException.InvalidReplayPositionException) { PositionImpl markDeletePosition = (PositionImpl) cursor.getMarkDeletedPosition(); messagesToReplay.removeIf(current -> current.compareTo(markDeletePosition) <= 0); } } readBatchSize = 1; topic.getBrokerService().executor().schedule(() -> { synchronized (PersistentDispatcherMultipleConsumers.this) { if (!havePendingRead) { log.info("[{}] Retrying read operation", name); readMoreEntries(); } else { log.info("[{}] Skipping read retry: havePendingRead {}", name, havePendingRead, exception); } } }, waitTimeMillis, TimeUnit.MILLISECONDS); } /** * <pre> * Broker gives more priority while dispatching messages. Here, broker follows descending priorities. (eg: * 0=max-priority, 1, 2,..) * <p> * Broker will first dispatch messages to max priority-level consumers if they * have permits, else broker will consider next priority level consumers. * Also on the same priority-level, it selects consumer in round-robin manner. * <p> * If subscription has consumer-A with priorityLevel 1 and Consumer-B with priorityLevel 2 then broker will dispatch * messages to only consumer-A until it runs out permit and then broker starts dispatching messages to Consumer-B. * <p> * Consumer PriorityLevel Permits * C1 0 2 * C2 0 1 * C3 0 1 * C4 1 2 * C5 1 1 * Result of getNextConsumer(): C1, C2, C3, C1, C4, C5, C4 * </pre> * * <pre> * <b>Algorithm:</b> * 1. consumerList: it stores consumers in sorted-list: max-priority stored first * 2. currentConsumerRoundRobinIndex: it always stores last served consumer-index * * Each time getNextConsumer() is called:<p> * 1. It always starts to traverse from the max-priority consumer (first element) from sorted-list * 2. Consumers on same priority-level will be treated equally and it tries to pick one of them in round-robin manner * 3. If consumer is not available on given priority-level then only it will go to the next lower priority-level consumers * 4. Returns null in case it doesn't find any available consumer * </pre> * * @return nextAvailableConsumer */ private Consumer getNextConsumer() { if (consumerList.isEmpty() || IS_CLOSED_UPDATER.get(this) == TRUE) { // abort read if no consumers are connected or if disconnect is initiated return null; } if (currentConsumerRoundRobinIndex >= consumerList.size()) { currentConsumerRoundRobinIndex = 0; } int currentRoundRobinConsumerPriority = consumerList.get(currentConsumerRoundRobinIndex).getPriorityLevel(); // first find available-consumer on higher level unless currentIndex is not on highest level which is 0 if (currentRoundRobinConsumerPriority != 0) { int higherPriorityConsumerIndex = getConsumerFromHigherPriority(currentRoundRobinConsumerPriority); if (higherPriorityConsumerIndex != -1) { currentConsumerRoundRobinIndex = higherPriorityConsumerIndex + 1; return consumerList.get(higherPriorityConsumerIndex); } } // currentIndex is already on highest level or couldn't find consumer on higher level so, find consumer on same or lower // level int availableConsumerIndex = getNextConsumerFromSameOrLowerLevel(currentConsumerRoundRobinIndex); if (availableConsumerIndex != -1) { currentConsumerRoundRobinIndex = availableConsumerIndex + 1; return consumerList.get(availableConsumerIndex); } // couldn't find available consumer return null; } /** * Finds index of first available consumer which has higher priority then given targetPriority * @param targetPriority * @return -1 if couldn't find any available consumer */ private int getConsumerFromHigherPriority(int targetPriority) { for (int i = 0; i < currentConsumerRoundRobinIndex; i++) { Consumer consumer = consumerList.get(i); if (consumer.getPriorityLevel() < targetPriority) { if (isConsumerAvailable(consumerList.get(i))) { return i; } } else { break; } } return -1; } /** * Finds index of round-robin available consumer that present on same level as consumer on currentRoundRobinIndex if doesn't * find consumer on same level then it finds first available consumer on lower priority level else returns index=-1 * if couldn't find any available consumer in the list * * @param currentRoundRobinIndex * @return */ private int getNextConsumerFromSameOrLowerLevel(int currentRoundRobinIndex) { int targetPriority = consumerList.get(currentRoundRobinIndex).getPriorityLevel(); // use to do round-robin if can't find consumer from currentRR to last-consumer in list int scanIndex = currentRoundRobinIndex; int endPriorityLevelIndex = currentRoundRobinIndex; do { Consumer scanConsumer = scanIndex < consumerList.size() ? consumerList.get(scanIndex) : null /* reached to last consumer of list */; // if reached to last consumer of list then check from beginning to currentRRIndex of the list if (scanConsumer == null || scanConsumer.getPriorityLevel() != targetPriority) { endPriorityLevelIndex = scanIndex; // last consumer on this level scanIndex = getFirstConsumerIndexOfPriority(targetPriority); } else { if (isConsumerAvailable(scanConsumer)) { return scanIndex; } scanIndex++; } } while (scanIndex != currentRoundRobinIndex); // it means: didn't find consumer in the same priority-level so, check available consumer lower than this level for (int i = endPriorityLevelIndex; i < consumerList.size(); i++) { if (isConsumerAvailable(consumerList.get(i))) { return i; } } return -1; } /** * Finds index of first consumer in list which has same priority as given targetPriority * @param targetPriority * @return */ private int getFirstConsumerIndexOfPriority(int targetPriority) { for (int i = 0; i < consumerList.size(); i++) { if (consumerList.get(i).getPriorityLevel() == targetPriority) { return i; } } return -1; } /** * returns true only if {@link consumerList} has atleast one unblocked consumer and have available permits * * @return */ private boolean isAtleastOneConsumerAvailable() { if (consumerList.isEmpty() || IS_CLOSED_UPDATER.get(this) == TRUE) { // abort read if no consumers are connected or if disconnect is initiated return false; } for(Consumer consumer : consumerList) { if (isConsumerAvailable(consumer)) { return true; } } return false; } private boolean isConsumerAvailable(Consumer consumer) { return consumer != null && !consumer.isBlocked() && consumer.getAvailablePermits() > 0; } @Override public synchronized void redeliverUnacknowledgedMessages(Consumer consumer) { consumer.getPendingAcks().forEach((ledgerId, entryId, batchSize, none) -> { messagesToReplay.add(new PositionImpl(ledgerId, entryId)); }); if (log.isDebugEnabled()) { log.debug("[{}] Redelivering unacknowledged messages for consumer ", consumer); } readMoreEntries(); } @Override public synchronized void redeliverUnacknowledgedMessages(Consumer consumer, List<PositionImpl> positions) { messagesToReplay.addAll(positions); if (log.isDebugEnabled()) { log.debug("[{}] Redelivering unacknowledged messages for consumer ", consumer); } readMoreEntries(); } @Override public void addUnAckedMessages(int numberOfMessages) { // don't block dispatching if maxUnackedMessages = 0 if(maxUnackedMessages <= 0) { return; } int unAckedMessages = TOTAL_UNACKED_MESSAGES_UPDATER.addAndGet(this, numberOfMessages); if (unAckedMessages >= maxUnackedMessages && BLOCKED_DISPATCHER_ON_UNACKMSG_UPDATER.compareAndSet(this, FALSE, TRUE)) { log.info("[{}] Dispatcher is blocked due to unackMessages {} reached to max {}", name, TOTAL_UNACKED_MESSAGES_UPDATER.get(this), maxUnackedMessages); } else if (BLOCKED_DISPATCHER_ON_UNACKMSG_UPDATER.get(this) == TRUE && unAckedMessages < maxUnackedMessages / 2) { if (BLOCKED_DISPATCHER_ON_UNACKMSG_UPDATER.compareAndSet(this, TRUE, FALSE)) { log.info("[{}] Dispatcher is unblocked", name); topic.getBrokerService().executor().submit(() -> readMoreEntries()); } } } public boolean isBlockedDispatcherOnUnackedMsgs() { return BLOCKED_DISPATCHER_ON_UNACKMSG_UPDATER.get(this) == TRUE; } public int getTotalUnackedMessages() { return TOTAL_UNACKED_MESSAGES_UPDATER.get(this); } private static final Logger log = LoggerFactory.getLogger(PersistentDispatcherMultipleConsumers.class); }