/*
 * Copyright © 2014-2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.data2.transaction.queue;

import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.common.conf.CConfiguration;
import co.cask.cdap.common.queue.QueueName;
import co.cask.cdap.common.utils.ImmutablePair;
import co.cask.cdap.data2.queue.ConsumerConfig;
import co.cask.cdap.data2.queue.DequeueResult;
import co.cask.cdap.data2.queue.DequeueStrategy;
import co.cask.cdap.data2.queue.QueueConsumer;
import co.cask.tephra.Transaction;
import co.cask.tephra.TransactionAware;
import co.cask.tephra.TxConstants;
import com.google.common.base.Function;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.common.collect.Maps;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nullable;

/**
 * Common queue consumer for persistent storage engines such as HBase and LevelDB.
 */
public abstract class AbstractQueueConsumer implements QueueConsumer, TransactionAware {

  private static final Logger LOG = LoggerFactory.getLogger(AbstractQueueConsumer.class);
  private static final DequeueResult<byte[]> EMPTY_RESULT = DequeueResult.Empty.result();

  // TODO: Make these configurable.
  // Minimum number of rows to fetch per scan.
  private static final int MIN_FETCH_ROWS = 100;
  // Multiple of batches to fetch per scan.
  // Number of rows to scan = max(MIN_FETCH_ROWS, dequeueBatchSize * PREFETCH_BATCHES)
  private static final int PREFETCH_BATCHES = 10;

  private static final Function<SimpleQueueEntry, byte[]> ENTRY_TO_BYTE_ARRAY =
    new Function<SimpleQueueEntry, byte[]>() {
      @Override
      public byte[] apply(SimpleQueueEntry input) {
        return input.getData();
      }
    };

  protected final byte[] stateColumnName;
  private final ConsumerConfig consumerConfig;
  private final QueueName queueName;
  private final SortedMap<byte[], SimpleQueueEntry> entryCache;
  private final NavigableMap<byte[], SimpleQueueEntry> consumingEntries;
  private final byte[] queueRowPrefix;
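
  // For reference, the entry row keys built on this prefix have the layout (derived from
  // getNextRow and populateRowCache below):
  //
  //   [ queueRowPrefix | enqueue writePointer (8 bytes) | counter (4 bytes) ]
  //
  // so entries sort first by the transaction that enqueued them, and then by their
  // position within that transaction.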

  // Maximum amount of time spent in dequeue to avoid transaction timeout.
  private final long maxDequeueMillis;

  private byte[] scanStartRow;
  private boolean committed;
  protected Transaction transaction;
  protected int commitCount;

  protected abstract boolean claimEntry(byte[] rowKey, byte[] stateContent) throws IOException;

  protected abstract void updateState(Set<byte[]> rowKeys, byte[] stateColumnName,
                                      byte[] stateContent) throws IOException;

  protected abstract void undoState(Set<byte[]> rowKeys, byte[] stateColumnName)
    throws IOException, InterruptedException;

  protected abstract QueueScanner getScanner(byte[] startRow, byte[] stopRow, int numRows) throws IOException;

  protected AbstractQueueConsumer(CConfiguration cConf, ConsumerConfig consumerConfig, QueueName queueName) {
    this(cConf, consumerConfig, queueName, null);
  }

  protected AbstractQueueConsumer(CConfiguration cConf, ConsumerConfig consumerConfig,
                                  QueueName queueName, @Nullable byte[] startRow) {
    this.consumerConfig = consumerConfig;
    this.queueName = queueName;
    this.entryCache = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    this.consumingEntries = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    this.queueRowPrefix = QueueEntryRow.getQueueRowPrefix(queueName);
    this.scanStartRow = (startRow == null || startRow.length == 0)
      ? QueueEntryRow.getQueueEntryRowKey(queueName, 0L, 0)
      : startRow;
    this.stateColumnName = Bytes.add(QueueEntryRow.STATE_COLUMN_PREFIX,
                                     Bytes.toBytes(consumerConfig.getGroupId()));

    // Maximum time to spend in dequeue.
    int dequeuePercent = cConf.getInt(QueueConstants.ConfigKeys.DEQUEUE_TX_PERCENT);
    Preconditions.checkArgument(dequeuePercent > 0 && dequeuePercent <= 100,
                                "Invalid value for %s", QueueConstants.ConfigKeys.DEQUEUE_TX_PERCENT);
    long txTimeout = TimeUnit.SECONDS.toMillis(cConf.getLong(TxConstants.Manager.CFG_TX_TIMEOUT));
    this.maxDequeueMillis = txTimeout * dequeuePercent / 100;
  }
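
  // A worked example of the computation above, with illustrative values rather than the
  // configured defaults: a transaction timeout of 30 seconds and a
  // QueueConstants.ConfigKeys.DEQUEUE_TX_PERCENT of 30 give txTimeout = 30000 ms and
  // maxDequeueMillis = 30000 * 30 / 100 = 9000 ms, leaving the remaining ~70% of the
  // transaction timeout for processing and commit.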

  @Override
  public QueueName getQueueName() {
    return queueName;
  }

  @Override
  public ConsumerConfig getConfig() {
    return consumerConfig;
  }

  @Override
  public DequeueResult<byte[]> dequeue() throws IOException {
    return dequeue(1);
  }

  @Override
  public DequeueResult<byte[]> dequeue(int maxBatchSize) throws IOException {
    DequeueResult<byte[]> result = performDequeue(maxBatchSize);
    // The start row can be updated to the largest row key in consumingEntries that is smaller
    // than or equal to scanStartRow. If no such key exists, update the start row to scanStartRow.
    byte[] floorKey = consumingEntries.floorKey(scanStartRow);
    updateStartRow(floorKey == null ? scanStartRow : floorKey);
    return result;
  }

  @Override
  public void startTx(Transaction tx) {
    consumingEntries.clear();
    this.transaction = tx;
    this.committed = false;
  }

  @Override
  public void updateTx(Transaction transaction) {
    this.transaction = transaction;
  }

  @Override
  public Collection<byte[]> getTxChanges() {
    // No conflicts guaranteed in dequeue logic.
    return ImmutableList.of();
  }

  @Override
  public boolean commitTx() throws Exception {
    if (consumingEntries.isEmpty()) {
      return true;
    }

    byte[] stateContent = encodeStateColumn(ConsumerEntryState.PROCESSED);
    updateState(consumingEntries.keySet(), stateColumnName, stateContent);
    commitCount += consumingEntries.size();
    committed = true;
    return true;
  }

  @Override
  public boolean rollbackTx() throws Exception {
    if (consumingEntries.isEmpty()) {
      return true;
    }

    // Put the consuming entries back into the cache
    entryCache.putAll(consumingEntries);

    // If not committed, no need to update HBase.
    if (!committed) {
      return true;
    }
    commitCount -= consumingEntries.size();

    // Revert changes in the HBase rows.
    // If it is FIFO, restore to the CLAIMED state. This instance will retry it on the next dequeue.
    if (getConfig().getDequeueStrategy() == DequeueStrategy.FIFO && getConfig().getGroupSize() > 1) {
      byte[] stateContent = encodeStateColumn(ConsumerEntryState.CLAIMED);
      updateState(consumingEntries.keySet(), stateColumnName, stateContent);
    } else {
      undoState(consumingEntries.keySet(), stateColumnName);
    }
    return true;
  }

  /**
   * Called when the start row is updated.
   */
  protected void updateStartRow(byte[] startRow) {
    // No-op by default.
  }

  private DequeueResult<byte[]> performDequeue(int maxBatchSize) throws IOException {
    Preconditions.checkArgument(maxBatchSize > 0, "Batch size must be > 0.");

    // Pre-compute the "claimed" state content for the FIFO case.
    byte[] claimedStateValue = null;
    if (getConfig().getDequeueStrategy() == DequeueStrategy.FIFO && getConfig().getGroupSize() > 1) {
      claimedStateValue = encodeStateColumn(ConsumerEntryState.CLAIMED);
    }

    boolean isReachedDequeueTimeLimit = false;
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.start();
    while (consumingEntries.size() < maxBatchSize && getEntries(consumingEntries, maxBatchSize, stopwatch)) {
      // ANDREAS: this while loop should stop once getEntries/populateCache reaches the end of the queue.
      // Currently, it will retry as long as it gets at least one entry in every round, even if that is
      // an entry that must be ignored because it cannot be claimed.
      // ANDREAS: It could be a problem that we always read to the end of the queue. This way one flowlet
      // instance may always consume all entries, while others are idle.

      // For FIFO, need to try claiming the entry if group size > 1
      if (getConfig().getDequeueStrategy() == DequeueStrategy.FIFO && getConfig().getGroupSize() > 1) {
        Iterator<Map.Entry<byte[], SimpleQueueEntry>> iterator = consumingEntries.entrySet().iterator();
        while (iterator.hasNext()) {
          SimpleQueueEntry entry = iterator.next().getValue();

          if (entry.getState() == null ||
              QueueEntryRow.getStateInstanceId(entry.getState()) >= getConfig().getGroupSize()) {
            // If not able to claim it, remove it and move to the next one.
            if (!claimEntry(entry.getRowKey(), claimedStateValue)) {
              iterator.remove();
            }
            if (stopwatch.elapsedMillis() >= maxDequeueMillis) {
              break;
            }
          }
        }
        // Drain the iterator in case the dequeue time limit was reached
        Iterators.advance(iterator, Integer.MAX_VALUE);
      }

      if (stopwatch.elapsedMillis() >= maxDequeueMillis) {
        // If the time limit is reached and we still don't have as many entries as requested, treat it
        // as hitting the dequeue time limit. There can be false positives (reached the end of the queue
        // and also passed the time limit), but that's fine since this boolean is used for logging only,
        // and normally it won't happen as long as the dequeue completes quickly relative to the tx timeout.
        isReachedDequeueTimeLimit = consumingEntries.size() < maxBatchSize;
        break;
      }
    }

    // If nothing was dequeued, return the empty result.
    if (consumingEntries.isEmpty()) {
      if (isReachedDequeueTimeLimit) {
        LOG.warn("Unable to dequeue any entry after {}ms.", maxDequeueMillis);
      }
      return EMPTY_RESULT;
    }

    if (isReachedDequeueTimeLimit) {
      LOG.warn("Dequeue time limit of {}ms reached. Requested batch size {}, dequeued {}",
               maxDequeueMillis, maxBatchSize, consumingEntries.size());
    }

    return new SimpleDequeueResult(consumingEntries.values());
  }
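
  // A worked example of the prefetch sizing used by populateRowCache below: with
  // MIN_FETCH_ROWS = 100 and PREFETCH_BATCHES = 10, dequeue(5) scans up to
  // max(100, 5 * 10) = 100 rows, while dequeue(50) scans up to max(100, 50 * 10) = 500 rows.
  // Rows fetched beyond the requested batch stay in entryCache and amortize the scan cost
  // across subsequent dequeues.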

  /**
   * Tries to dequeue (claim) entries up to a maximum size.
   *
   * @param entries Map to fill with the claimed entries.
   * @param maxBatchSize Maximum number of entries to claim.
   * @param stopwatch Stopwatch tracking the time spent in the current dequeue, used to
   *                  enforce {@code maxDequeueMillis}.
   * @return {@code true} if at least one entry was fetched, {@code false} otherwise.
   * @throws java.io.IOException if reading from the queue table fails.
   */
  private boolean getEntries(SortedMap<byte[], SimpleQueueEntry> entries,
                             int maxBatchSize, Stopwatch stopwatch) throws IOException {
    boolean hasEntry = fetchFromCache(entries, maxBatchSize);

    // If not enough entries from the cache, try to get more.
    if (entries.size() < maxBatchSize) {
      populateRowCache(entries.keySet(), maxBatchSize, stopwatch);
      hasEntry = fetchFromCache(entries, maxBatchSize) || hasEntry;
    }

    return hasEntry;
  }

  private boolean fetchFromCache(SortedMap<byte[], SimpleQueueEntry> entries, int maxBatchSize) {
    if (entryCache.isEmpty()) {
      return false;
    }

    Iterator<Map.Entry<byte[], SimpleQueueEntry>> iterator = entryCache.entrySet().iterator();
    while (entries.size() < maxBatchSize && iterator.hasNext()) {
      Map.Entry<byte[], SimpleQueueEntry> entry = iterator.next();
      entries.put(entry.getKey(), entry.getValue());
      iterator.remove();
    }
    return true;
  }

  private void populateRowCache(Set<byte[]> excludeRows, int maxBatchSize,
                                Stopwatch stopwatch) throws IOException {
    long readPointer = transaction.getReadPointer();

    // Scan the table for queue entries.
    int numRows = Math.max(MIN_FETCH_ROWS, maxBatchSize * PREFETCH_BATCHES);
    QueueScanner scanner = getScanner(scanStartRow,
                                      QueueEntryRow.getStopRowForTransaction(queueRowPrefix, transaction),
                                      numRows);
    try {
      // Try to fill up the cache
      boolean firstScannedRow = true;
      while (entryCache.size() < numRows) {
        ImmutablePair<byte[], Map<byte[], byte[]>> entry = scanner.next();
        if (entry == null) {
          // No more results; break out.
          break;
        }

        byte[] rowKey = entry.getFirst();
        if (excludeRows.contains(rowKey)) {
          continue;
        }

        // Row key is queue_name + writePointer + counter
        long writePointer = QueueEntryRow.getWritePointer(rowKey, queueRowPrefix.length);

        // If this is the first row returned by the scanner and it was written before the earliest
        // in-progress transaction, it's safe to advance scanStartRow to the current row, because
        // nothing can be written before this row.
        if (firstScannedRow && writePointer < transaction.getFirstInProgress()) {
          firstScannedRow = false;
          scanStartRow = Arrays.copyOf(rowKey, rowKey.length);
        }

        // If written later than the read pointer, abort the loop, as entries that come later are
        // all uncommitted. This is probably not needed since the scan is limited to the stop row,
        // but to be safe...
        if (writePointer > readPointer) {
          break;
        }
        // If the write is in the excluded list, ignore it.
        if (transaction.isExcluded(writePointer)) {
          continue;
        }

        // Based on the dequeue strategy, determine whether to include the given entry.
        byte[] dataBytes = entry.getSecond().get(QueueEntryRow.DATA_COLUMN);
        byte[] metaBytes = entry.getSecond().get(QueueEntryRow.META_COLUMN);
        if (dataBytes == null || metaBytes == null) {
          continue;
        }
        byte[] stateBytes = entry.getSecond().get(stateColumnName);

        int counter = Bytes.toInt(rowKey, rowKey.length - 4, Ints.BYTES);
        if (!shouldInclude(writePointer, counter, metaBytes, stateBytes)) {
          continue;
        }

        entryCache.put(rowKey, new SimpleQueueEntry(rowKey, dataBytes, stateBytes));

        // Check the time limit here, after an entry has been cached, so that each call makes at
        // least some progress.
        if (stopwatch.elapsedMillis() >= maxDequeueMillis) {
          break;
        }
      }
    } finally {
      scanner.close();
    }
  }
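
  // For reference, the state column value produced by encodeStateColumn below is a 13-byte
  // array (Longs.BYTES + Ints.BYTES + 1):
  //
  //   [ consumer transaction writePointer (8 bytes) | instanceId (4 bytes) | state (1 byte) ]
  //
  // The instanceId portion is what QueueEntryRow.getStateInstanceId reads back when
  // performDequeue decides whether an entry still needs to be claimed.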

  private byte[] encodeStateColumn(ConsumerEntryState state) {
    // State column content is encoded as (writePointer) + (instanceId) + (state)
    byte[] stateContent = new byte[Longs.BYTES + Ints.BYTES + 1];
    Bytes.putLong(stateContent, 0, transaction.getWritePointer());
    Bytes.putInt(stateContent, Longs.BYTES, getConfig().getInstanceId());
    Bytes.putByte(stateContent, Longs.BYTES + Ints.BYTES, state.getState());
    return stateContent;
  }

  private boolean shouldInclude(long enqueueWritePointer, int counter,
                                byte[] metaValue, byte[] stateValue) throws IOException {
    QueueEntryRow.CanConsume canConsume = QueueEntryRow.canConsume(getConfig(), transaction,
                                                                   enqueueWritePointer, counter,
                                                                   metaValue, stateValue);
    if (QueueEntryRow.CanConsume.NO_INCLUDING_ALL_OLDER == canConsume) {
      scanStartRow = getNextRow(scanStartRow, enqueueWritePointer, counter);
      return false;
    }

    return QueueEntryRow.CanConsume.YES == canConsume;
  }

  /**
   * Gets the next row key based on the given write pointer and counter. It modifies the given
   * row byte[] in place and returns it.
   */
  private byte[] getNextRow(byte[] row, long writePointer, int count) {
    Bytes.putLong(row, queueRowPrefix.length, writePointer);
    Bytes.putInt(row, queueRowPrefix.length + Longs.BYTES, count + 1);
    return row;
  }

  @Override
  public String getTransactionAwareName() {
    return getClass().getSimpleName() + "(queue = " + queueName + ")";
  }

  /**
   * Implementation of dequeue result.
   */
  private final class SimpleDequeueResult implements DequeueResult<byte[]> {

    private final List<SimpleQueueEntry> entries;

    private SimpleDequeueResult(Iterable<SimpleQueueEntry> entries) {
      this.entries = ImmutableList.copyOf(entries);
    }

    @Override
    public boolean isEmpty() {
      return entries.isEmpty();
    }

    @Override
    public void reclaim() {
      // Put all entries back into consumingEntries and remove them from the entry cache as well.
      for (SimpleQueueEntry entry : entries) {
        consumingEntries.put(entry.getRowKey(), entry);
        entryCache.remove(entry.getRowKey());
      }
    }

    @Override
    public int size() {
      return entries.size();
    }

    @Override
    public Iterator<byte[]> iterator() {
      if (isEmpty()) {
        return Iterators.emptyIterator();
      }
      return Iterators.transform(entries.iterator(), ENTRY_TO_BYTE_ARRAY);
    }

    @Override
    public String toString() {
      return Objects.toStringHelper(this)
        .add("size", entries.size())
        .add("queue", queueName)
        .add("config", getConfig())
        .toString();
    }
  }
}