/*
 * Copyright © 2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package co.cask.cdap.data2.transaction.stream;

import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.flow.flowlet.StreamEvent;
import co.cask.cdap.common.conf.CConfiguration;
import co.cask.cdap.data.file.FileReader;
import co.cask.cdap.data.file.ReadFilter;
import co.cask.cdap.data.file.ReadFilters;
import co.cask.cdap.data.stream.StreamEventOffset;
import co.cask.cdap.data.stream.StreamFileOffset;
import co.cask.cdap.data.stream.StreamUtils;
import co.cask.cdap.data2.queue.ConsumerConfig;
import co.cask.cdap.data2.queue.DequeueResult;
import co.cask.cdap.data2.queue.DequeueStrategy;
import co.cask.cdap.data2.transaction.queue.ConsumerEntryState;
import co.cask.cdap.data2.transaction.queue.QueueEntryRow;
import co.cask.cdap.proto.Id;
import co.cask.tephra.Transaction;
import co.cask.tephra.TxConstants;
import com.google.common.base.Function;
import com.google.common.base.Objects;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.common.io.ByteArrayDataOutput;
import com.google.common.io.ByteStreams;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Closeable;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nullable;
import javax.annotation.concurrent.NotThreadSafe;

/**
 * A {@link StreamConsumer} that reads events from stream files and uses a table to store consumer states.
 *
 * <p>
 * State table ROW key schema:
 *
 * <pre>{@code
 * row_key = <group_id> <stream_file_offset>
 * group_id = 8 bytes consumer group id
 * stream_file_offset = <partition_start> <partition_end> <name_prefix> <sequence_id> <offset>
 * partition_start = 8 bytes timestamp of partition start time
 * partition_end = 8 bytes timestamp of partition end time
 * name_prefix = DataOutput UTF-8 output.
 * sequence_id = 4 bytes stream file sequence id
 * offset = 8 bytes offset inside the stream file
 * }</pre>
 *
 * The state table has a single column
 * ({@link QueueEntryRow#COLUMN_FAMILY}:{@link QueueEntryRow#STATE_COLUMN_PREFIX}) to store state.
 *
 * The state value:
 *
 * <pre>{@code
 * state_value = <write_pointer> <instance_id> <state>
 * write_pointer = 8 bytes Transaction write pointer of the consumer who updated this state.
 * instance_id = 4 bytes Instance id of the consumer who updated this state.
 * state = ConsumerEntryState.getState(), either CLAIMED or PROCESSED
 * }</pre>
 *
 */
@NotThreadSafe
public abstract class AbstractStreamFileConsumer implements StreamConsumer {

  private static final Logger LOG = LoggerFactory.getLogger(AbstractStreamFileConsumer.class);
  private static final HashFunction ROUND_ROBIN_HASHER = Hashing.murmur3_32();

  protected static final int MAX_SCAN_ROWS = 1000;

  // Persist state at most once per second.
  private static final long STATE_PERSIST_MIN_INTERVAL = TimeUnit.SECONDS.toNanos(1);
  private static final DequeueResult<StreamEvent> EMPTY_RESULT = DequeueResult.Empty.result();

  private static final Function<PollStreamEvent, byte[]> EVENT_ROW_KEY = new Function<PollStreamEvent, byte[]>() {
    @Override
    public byte[] apply(PollStreamEvent input) {
      return input.getStateRow();
    }
  };

  // Special comparator to compare only the row prefix, hence different row keys can be used directly,
  // reducing the need for byte[] array creation.
  private static final Comparator<byte[]> ROW_PREFIX_COMPARATOR = new Comparator<byte[]>() {
    @Override
    public int compare(byte[] bytes1, byte[] bytes2) {
      // Compare row keys without the offset part (last 8 bytes).
      return Bytes.compareTo(bytes1, 0, bytes1.length - Longs.BYTES, bytes2, 0, bytes2.length - Longs.BYTES);
    }
  };

  private static final Function<PollStreamEvent, StreamEventOffset> CONVERT_STREAM_EVENT_OFFSET =
    new Function<PollStreamEvent, StreamEventOffset>() {
      @Override
      public StreamEventOffset apply(PollStreamEvent input) {
        return input.getStreamEventOffset();
      }
    };

  private static final Function<PollStreamEvent, StreamEvent> CONVERT_STREAM_EVENT =
    new Function<PollStreamEvent, StreamEvent>() {
      @Override
      public StreamEvent apply(PollStreamEvent input) {
        return input;
      }
    };

  protected final byte[] stateColumnName;

  private final long txTimeoutNano;
  private final Id.Stream streamName;
  private final StreamConfig streamConfig;
  private final ConsumerConfig consumerConfig;
  private final StreamConsumerStateStore consumerStateStore;
  private final FileReader<StreamEventOffset, Iterable<StreamFileOffset>> reader;
  private final ReadFilter readFilter;

  // Map from row key prefix (row key without the last eight-byte offset) to a sorted map of row key to state value.
  // The rows are only needed for entries that are already in the state table when this consumer starts.
  private final Map<byte[], SortedMap<byte[], byte[]>> entryStates;
  private final Set<byte[]> entryStatesScanCompleted;

  private final StreamConsumerState consumerState;
  private final List<StreamEventOffset> eventCache;
  private Transaction transaction;
  private List<PollStreamEvent> polledEvents;
  private long nextPersistStateTime;
  private boolean committed;
  private boolean closed;
  private StreamConsumerState lastPersistedState;

  /**
   *
   * @param cConf The CDAP configuration.
   * @param streamConfig Stream configuration.
   * @param consumerConfig Consumer configuration.
   * @param reader For reading stream events. This class is responsible for closing the reader.
   * @param consumerStateStore The state store for saving consumer state.
   * @param beginConsumerState Consumer state to begin with.
   * @param extraFilter Extra {@link ReadFilter} that is ANDed with the default read filter and applied first.
   */
  protected AbstractStreamFileConsumer(CConfiguration cConf,
                                       StreamConfig streamConfig, ConsumerConfig consumerConfig,
                                       FileReader<StreamEventOffset, Iterable<StreamFileOffset>> reader,
                                       StreamConsumerStateStore consumerStateStore,
                                       StreamConsumerState beginConsumerState,
                                       @Nullable ReadFilter extraFilter) {

    LOG.info("Create consumer {}, reader offsets: {}", consumerConfig, reader.getPosition());

    this.txTimeoutNano = TimeUnit.SECONDS.toNanos(cConf.getInt(TxConstants.Manager.CFG_TX_TIMEOUT,
                                                               TxConstants.Manager.DEFAULT_TX_TIMEOUT));
    this.streamName = streamConfig.getStreamId();
    this.streamConfig = streamConfig;
    this.consumerConfig = consumerConfig;
    this.consumerStateStore = consumerStateStore;
    this.reader = reader;
    this.readFilter = createReadFilter(consumerConfig, extraFilter);

    this.entryStates = Maps.newTreeMap(ROW_PREFIX_COMPARATOR);
    this.entryStatesScanCompleted = Sets.newTreeSet(ROW_PREFIX_COMPARATOR);

    this.eventCache = Lists.newArrayList();
    this.consumerState = beginConsumerState;
    this.lastPersistedState = new StreamConsumerState(beginConsumerState);
    this.stateColumnName = Bytes.add(QueueEntryRow.STATE_COLUMN_PREFIX, Bytes.toBytes(consumerConfig.getGroupId()));
  }

  protected void doClose() throws IOException {
    // No-op.
  }

  protected abstract boolean claimFifoEntry(byte[] row, byte[] value, byte[] oldValue) throws IOException;

  protected abstract void updateState(Iterable<byte[]> rows, int size, byte[] value) throws IOException;

  protected abstract void undoState(Iterable<byte[]> rows, int size) throws IOException;

  protected abstract StateScanner scanStates(byte[] startRow, byte[] endRow) throws IOException;

  @Override
  public final Id.Stream getStreamId() {
    return streamName;
  }

  @Override
  public final ConsumerConfig getConsumerConfig() {
    return consumerConfig;
  }

  @Override
  public final DequeueResult<StreamEvent> poll(int maxEvents, long timeout,
                                               TimeUnit timeoutUnit) throws IOException, InterruptedException {

    // Only need the CLAIMED state for FIFO with group size > 1.
    byte[] fifoStateContent = null;
    if (consumerConfig.getDequeueStrategy() == DequeueStrategy.FIFO && consumerConfig.getGroupSize() > 1) {
      fifoStateContent = encodeStateColumn(ConsumerEntryState.CLAIMED);
    }

    // Try to read from the cache, if any.
    if (!eventCache.isEmpty()) {
      getEvents(eventCache, polledEvents, maxEvents, fifoStateContent);
    }

    if (polledEvents.size() == maxEvents) {
      return new SimpleDequeueResult(polledEvents);
    }

    // Number of events to try to read, computed by multiplying maxEvents by the group size. It doesn't have to be
    // exact, just a rough estimate for better read throughput.
    // Also, this maxRead is used throughout the read loop below, hence some extra events might be read and cached
    // for the next poll call.
    int maxRead = maxEvents * consumerConfig.getGroupSize();

    long timeoutNano = timeoutUnit.toNanos(timeout);
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.start();

    // Save the reader position.
    // It's a conservative approach to save the reader position before reading so that no
    // event will be missed upon restart.
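    // As a consequence, events read by this poll may be read again after a restart; such re-reads are
    // screened out by the PROCESSED entries recorded in the state table.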
    consumerState.setState(reader.getPosition());

    // Read from the underlying file reader.
    while (polledEvents.size() < maxEvents) {
      int readCount = reader.read(eventCache, maxRead, timeoutNano, TimeUnit.NANOSECONDS, readFilter);
      long elapsedNano = stopwatch.elapsedTime(TimeUnit.NANOSECONDS);
      timeoutNano -= elapsedNano;

      if (readCount > 0) {
        int eventsClaimed = getEvents(eventCache, polledEvents, maxEvents - polledEvents.size(), fifoStateContent);

        // TODO: This is a quick fix to prevent the backoff logic in the flowlet driver from kicking in too early.
        // But it doesn't entirely prevent backoff. A proper fix would have a special state in the dequeue result
        // to let the flowlet driver know it shouldn't back off.
        // If able to read some events but nothing is claimed, don't check for the normal timeout.
        // Only do short transaction timeout checks.
        if (eventsClaimed == 0 && polledEvents.isEmpty()) {
          if (elapsedNano < (txTimeoutNano / 2)) {
            // If still less than half of the tx timeout, continue polling without checking the normal timeout.
            continue;
          }
        }
      }

      if (timeoutNano <= 0) {
        break;
      }
    }

    if (polledEvents.isEmpty()) {
      return EMPTY_RESULT;
    } else {
      return new SimpleDequeueResult(polledEvents);
    }
  }

  @Override
  public final void close() throws IOException {
    if (closed) {
      return;
    }
    closed = true;
    try {
      persistConsumerState();
      doClose();
    } finally {
      try {
        reader.close();
      } finally {
        consumerStateStore.close();
      }
    }
  }

  @Override
  public final void startTx(Transaction tx) {
    transaction = tx;

    if (polledEvents == null) {
      polledEvents = Lists.newArrayList();
    } else {
      polledEvents.clear();
    }

    committed = false;
  }

  @Override
  public final void updateTx(Transaction tx) {
    this.transaction = tx;
  }

  @Override
  public final Collection<byte[]> getTxChanges() {
    // Guaranteed no conflict in the consumer logic.
    return ImmutableList.of();
  }

  @Override
  public final boolean commitTx() throws Exception {
    if (polledEvents.isEmpty()) {
      return true;
    }

    // For each polled event, set the state column to PROCESSED.
    updateState(Iterables.transform(polledEvents, EVENT_ROW_KEY), polledEvents.size(),
                encodeStateColumn(ConsumerEntryState.PROCESSED));
    committed = true;
    return true;
  }

  @Override
  public void postTxCommit() {
    long currentNano = System.nanoTime();
    if (currentNano >= nextPersistStateTime) {
      nextPersistStateTime = currentNano + STATE_PERSIST_MIN_INTERVAL;
      persistConsumerState();
    }

    // Clean up the entryStates map to free up memory.
    for (PollStreamEvent event : polledEvents) {
      SortedMap<byte[], byte[]> states = entryStates.get(event.getStateRow());
      if (states != null) {
        states.headMap(event.getStateRow()).clear();
      }
    }
  }

  @Override
  public boolean rollbackTx() throws Exception {
    if (polledEvents.isEmpty()) {
      return true;
    }

    // Reset the consumer state to the earlier persisted state.
    // This avoids recording uncommitted file offsets if close() is called right after a rollback.
    consumerState.setState(lastPersistedState.getState());

    // Insert all polled events back to the beginning of the eventCache.
    eventCache.addAll(0, Lists.transform(polledEvents, CONVERT_STREAM_EVENT_OFFSET));

    // Special case for FIFO. On rollback, put the CLAIMED state into the entry states for claim entry to use.
    byte[] fifoState = null;
    if (consumerConfig.getDequeueStrategy() == DequeueStrategy.FIFO && consumerConfig.getGroupSize() > 1) {
      fifoState = encodeStateColumn(ConsumerEntryState.CLAIMED);
      for (PollStreamEvent event : polledEvents) {
        entryStates.get(event.getStateRow()).put(event.getStateRow(), fifoState);
      }
    }

    // If committed, also need to roll back the backing store.
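    // commitTx() has already marked these entries as PROCESSED in the state table, so that update must be
    // reverted: re-marked as CLAIMED for FIFO with multiple consumer instances, or undone via undoState() otherwise.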
    if (committed) {
      // Special case for FIFO.
      // If group size > 1, need to update the row states to the CLAIMED state with this instance id.
      // The transaction pointer used for the entry doesn't matter.
      if (consumerConfig.getDequeueStrategy() == DequeueStrategy.FIFO && consumerConfig.getGroupSize() > 1) {
        updateState(Iterables.transform(polledEvents, EVENT_ROW_KEY), polledEvents.size(), fifoState);
      } else {
        undoState(Iterables.transform(polledEvents, EVENT_ROW_KEY), polledEvents.size());
      }
    }

    return true;
  }

  @Override
  public String getTransactionAwareName() {
    return toString();
  }

  @Override
  public String toString() {
    return Objects.toStringHelper(this)
      .add("stream", streamConfig)
      .add("consumer", consumerConfig)
      .toString();
  }

  private ReadFilter createReadFilter(ConsumerConfig consumerConfig, @Nullable ReadFilter extraFilter) {
    ReadFilter baseFilter = createBaseReadFilter(consumerConfig);
    if (extraFilter != null) {
      return ReadFilters.and(extraFilter, baseFilter);
    } else {
      return baseFilter;
    }
  }

  private ReadFilter createBaseReadFilter(final ConsumerConfig consumerConfig) {
    final int groupSize = consumerConfig.getGroupSize();
    final DequeueStrategy strategy = consumerConfig.getDequeueStrategy();

    if (groupSize == 1 || strategy == DequeueStrategy.FIFO) {
      return ReadFilter.ALWAYS_ACCEPT;
    }

    // For RoundRobin and Hash partition, the claim is done by matching the hash code to the instance id.
    // For Hash, to preserve existing behavior, everything routes to instance 0.
    // For RoundRobin, the idea is to scatter the events across consumers evenly. Since there is no way to know
    // the absolute starting point to do true round robin, we employ a good-enough hash function on the
    // file offset as a way to spread events across consumers.
    final int instanceId = consumerConfig.getInstanceId();

    return new ReadFilter() {
      @Override
      public boolean acceptOffset(long offset) {
        int hashValue = Math.abs(strategy == DequeueStrategy.HASH
                                   ? 0 : ROUND_ROBIN_HASHER.hashLong(offset).hashCode());
        return instanceId == (hashValue % groupSize);
      }
    };
  }

  private int getEvents(List<? extends StreamEventOffset> source,
                        List<? super PollStreamEvent> result,
                        int maxEvents, byte[] stateContent) throws IOException {
    Iterator<? extends StreamEventOffset> iterator = Iterators.consumingIterator(source.iterator());
    int eventsClaimed = 0;
    while (result.size() < maxEvents && iterator.hasNext()) {
      StreamEventOffset event = iterator.next();
      byte[] stateRow = claimEntry(event.getOffset(), stateContent);
      if (stateRow == null) {
        continue;
      }
      result.add(new PollStreamEvent(event, stateRow));
      eventsClaimed++;
    }
    return eventsClaimed;
  }

  private void persistConsumerState() {
    try {
      if (lastPersistedState == null || !consumerState.equals(lastPersistedState)) {
        consumerStateStore.save(consumerState);
        lastPersistedState = new StreamConsumerState(consumerState);
      }
    } catch (IOException e) {
      LOG.error("Failed to persist consumer state for consumer {} of stream {}", consumerConfig, getStreamId(), e);
    }
  }

  /**
   * Encodes the value for the state column with the current transaction and consumer information.
   *
   * @param state The state to encode
   * @return The stateContent byte array
   */
  // TODO: This method is copied from AbstractQueue2Consumer. Future effort is needed to unify them.
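  // Encoded layout (13 bytes): 8-byte transaction write pointer | 4-byte consumer instance id | 1-byte state code.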
  private byte[] encodeStateColumn(ConsumerEntryState state) {
    byte[] stateContent = new byte[Longs.BYTES + Ints.BYTES + 1];

    // State column content is encoded as (writePointer) + (instanceId) + (state)
    Bytes.putLong(stateContent, 0, transaction.getWritePointer());
    Bytes.putInt(stateContent, Longs.BYTES, consumerConfig.getInstanceId());
    Bytes.putByte(stateContent, Longs.BYTES + Ints.BYTES, state.getState());

    return stateContent;
  }

  /**
   * Tries to claim a stream event offset.
   *
   * @return The row key for writing to the state table if successfully claimed or {@code null} if not claimed.
   */
  private byte[] claimEntry(StreamFileOffset offset, byte[] claimedStateContent) throws IOException {
    ByteArrayDataOutput out = ByteStreams.newDataOutput(50);
    out.writeLong(consumerConfig.getGroupId());
    StreamUtils.encodeOffset(out, offset);
    byte[] row = out.toByteArray();

    SortedMap<byte[], byte[]> rowStates = getInitRowStates(row);

    // See if the entry should be ignored. If it is in the rowStates with a null value, then it should be ignored.
    byte[] rowState = rowStates.get(row);
    if (rowStates.containsKey(row) && rowState == null) {
      return null;
    }

    // Only need to claim the entry for FIFO with group size > 1.
    if (consumerConfig.getDequeueStrategy() == DequeueStrategy.FIFO && consumerConfig.getGroupSize() > 1) {
      return claimFifoEntry(row, claimedStateContent, rowState) ? row : null;
    }

    // For Hash, RoundRobin and FIFO with group size == 1, no need to claim and check,
    // as it's already handled by the readFilter.
    return row;
  }

  /**
   * Returns the initial scanned states for the given entry key.
   *
   * Conceptually the scan is performed from the given entry key till the end of the entries represented
   * by that stream file (i.e. offset = Long.MAX_VALUE), as indicated by the row prefix (the row prefix uniquely
   * identifies the stream file).
   * However, due to memory limits, scanning is done progressively until it sees an entry whose state value was
   * written with a transaction write pointer later than the one this consumer started with.
   *
   * @param row the entry row key.
   */
  private SortedMap<byte[], byte[]> getInitRowStates(byte[] row) throws IOException {
    SortedMap<byte[], byte[]> rowStates = entryStates.get(row);

    if (rowStates != null) {
      // If the scan is completed for this row prefix, simply return the cached entries.
      // Or if the cached states extend beyond the current row, just return, as the caller only uses the cached
      // states to do point lookups.
      if (entryStatesScanCompleted.contains(row) || !rowStates.tailMap(row).isEmpty()) {
        return rowStates;
      }
    } else {
      rowStates = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
      entryStates.put(row, rowStates);
    }

    // Scan from the given row till the max file offset.
    // The last 8 bytes are the file offset; make it the max value so that it scans till the last offset.
    byte[] stopRow = Arrays.copyOf(row, row.length);
    Bytes.putLong(stopRow, stopRow.length - Longs.BYTES, Long.MAX_VALUE);

    try (StateScanner scanner = scanStates(row, stopRow)) {
      // Scan until MAX_SCAN_ROWS is reached or the scanner is exhausted.
      int rowCached = 0;
      while (scanner.nextStateRow() && rowCached < MAX_SCAN_ROWS) {
        if (storeInitState(scanner.getRow(), scanner.getState(), rowStates)) {
          rowCached++;
        }
      }

      // If no row is cached, no need to scan again, as newer rows will only be inserted after this consumer starts.
      if (rowCached == 0) {
        entryStatesScanCompleted.add(row);
      }
    }
    return rowStates;
  }

  /**
   * Determines whether the initial entry state needs to be cached.
   *
   * @param row Entry row key
   * @param stateValue Entry state value
   * @param cache The cache to fill if the row key and state value need to be cached.
   * @return {@code true} if the entry is stored into the cache, {@code false} if the entry is not stored.
   */
  private boolean storeInitState(byte[] row, byte[] stateValue, Map<byte[], byte[]> cache) {
    // Logic is adapted from QueueEntryRow.canConsume(), with modification.

    if (stateValue == null) {
      // The state value shouldn't be null, as the row is only written with a state value.
      return false;
    }

    long offset = Bytes.toLong(row, row.length - Longs.BYTES);
    long stateWritePointer = QueueEntryRow.getStateWritePointer(stateValue);

    // If the entry offset is not accepted by the read filter, this consumer won't see this entry in future reads.
    // If it was written after the current transaction, it happened with the current consumer config.
    // In both cases, there is no need to cache.
    if (!readFilter.acceptOffset(offset) || stateWritePointer >= transaction.getWritePointer()) {
      return false;
    }

    // If the state is PROCESSED and committed, need to memorize it so that it can be skipped.
    ConsumerEntryState state = QueueEntryRow.getState(stateValue);
    if (state == ConsumerEntryState.PROCESSED && transaction.isVisible(stateWritePointer)) {
      // No need to store the state value.
      cache.put(row, null);
      return true;
    }

    // Special case for FIFO.
    // For the group size > 1 case, if the state is not committed, need to memorize the current state value
    // for the claim entry logic.
    if (consumerConfig.getDequeueStrategy() == DequeueStrategy.FIFO && consumerConfig.getGroupSize() > 1) {
      int stateInstanceId = QueueEntryRow.getStateInstanceId(stateValue);

      // If the state was written by a consumer that is still live, and not by itself,
      // record the state value as null so that it'll get skipped in the claim entry logic.
      if (stateInstanceId < consumerConfig.getGroupSize() && stateInstanceId != consumerConfig.getInstanceId()) {
        cache.put(row, null);
      } else {
        // Otherwise memorize the value for the checkAndPut operation in claim entry.
        cache.put(row, stateValue);
      }
      return true;
    }

    return false;
  }

  /**
   * Scanner for scanning the state table.
   */
  protected interface StateScanner extends Closeable {

    boolean nextStateRow() throws IOException;

    byte[] getRow();

    byte[] getState();
  }

  /**
   * Represents a {@link StreamEvent} created by the {@link #poll(int, long, java.util.concurrent.TimeUnit)} call.
   */
  private static final class PollStreamEvent extends StreamEvent {

    private final byte[] stateRow;
    private final StreamEventOffset streamEventOffset;

    protected PollStreamEvent(StreamEventOffset streamEventOffset, byte[] stateRow) {
      super(streamEventOffset);
      this.streamEventOffset = streamEventOffset;
      this.stateRow = stateRow;
    }

    public StreamEventOffset getStreamEventOffset() {
      return streamEventOffset;
    }

    @Override
    public ByteBuffer getBody() {
      // Return a copy of the ByteBuffer (not a copy of the content),
      // so that the underlying stream buffer can be reused (rollback, retries).
      return streamEventOffset.getBody().slice();
    }

    private byte[] getStateRow() {
      return stateRow;
    }
  }

  /**
   * A {@link DequeueResult} returned by {@link #poll(int, long, java.util.concurrent.TimeUnit)}.
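   * Calling {@link #reclaim()} puts the events back into {@code polledEvents} and removes them from the event cache.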
   */
  private final class SimpleDequeueResult implements DequeueResult<StreamEvent> {

    private final List<PollStreamEvent> events;

    private SimpleDequeueResult(List<PollStreamEvent> events) {
      this.events = ImmutableList.copyOf(events);
    }

    @Override
    public boolean isEmpty() {
      return events.isEmpty();
    }

    @Override
    public void reclaim() {
      // Copy events back to polledEvents and remove them from eventCache.
      polledEvents.clear();
      polledEvents.addAll(events);
      eventCache.removeAll(Lists.transform(events, CONVERT_STREAM_EVENT_OFFSET));
    }

    @Override
    public int size() {
      return events.size();
    }

    @Override
    public Iterator<StreamEvent> iterator() {
      return Iterators.transform(events.iterator(), CONVERT_STREAM_EVENT);
    }
  }
}