/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.kafka.common.record;
import org.apache.kafka.common.KafkaException;
import org.apache.kafka.common.header.Header;
import org.apache.kafka.common.utils.ByteBufferInputStream;
import org.apache.kafka.common.utils.ByteUtils;
import org.apache.kafka.common.utils.CloseableIterator;
import org.apache.kafka.common.utils.Crc32C;
import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import static org.apache.kafka.common.record.Records.LOG_OVERHEAD;
/**
 * RecordBatch implementation for magic 2 and above. The schema is given below:
 *
 * <pre>
 * RecordBatch =>
 *   BaseOffset => Int64
 *   Length => Int32
 *   PartitionLeaderEpoch => Int32
 *   Magic => Int8
 *   CRC => Uint32
 *   Attributes => Int16
 *   LastOffsetDelta => Int32
 *   BaseTimestamp => Int64
 *   MaxTimestamp => Int64
 *   ProducerId => Int64
 *   ProducerEpoch => Int16
 *   BaseSequence => Int32
 *   Records => [Record]
 * </pre>
 *
 * Note that when compression is enabled (see attributes below), the compressed record data is serialized
 * directly following the count of the number of records.
 *
 * The CRC covers the data from the attributes to the end of the batch (i.e. all the bytes that follow the CRC). It is
 * located after the magic byte, which means that clients must parse the magic byte before deciding how to interpret
 * the bytes between the batch length and the magic byte. The partition leader epoch field is not included in the CRC
 * computation to avoid the need to recompute the CRC when this field is assigned for every batch that is received by
 * the broker. The CRC-32C (Castagnoli) polynomial is used for the computation.
 *
 * The current attributes are given below:
 *
 * <pre>
 *  -------------------------------------------------------------------------------------------------
 *  | Unused (6-15) | Control (5) | Transactional (4) | Timestamp Type (3) | Compression Type (0-2) |
 *  -------------------------------------------------------------------------------------------------
 * </pre>
 */
public class DefaultRecordBatch extends AbstractRecordBatch implements MutableRecordBatch {
    // Header layout. Every *_OFFSET is the previous field's offset plus the previous field's length, so the
    // constants below follow the on-wire order of the schema in the class documentation.
    static final int BASE_OFFSET_OFFSET = 0;
    static final int BASE_OFFSET_LENGTH = 8;
    static final int LENGTH_OFFSET = BASE_OFFSET_OFFSET + BASE_OFFSET_LENGTH;
    static final int LENGTH_LENGTH = 4;
    static final int PARTITION_LEADER_EPOCH_OFFSET = LENGTH_OFFSET + LENGTH_LENGTH;
    static final int PARTITION_LEADER_EPOCH_LENGTH = 4;
    static final int MAGIC_OFFSET = PARTITION_LEADER_EPOCH_OFFSET + PARTITION_LEADER_EPOCH_LENGTH;
    static final int MAGIC_LENGTH = 1;
    static final int CRC_OFFSET = MAGIC_OFFSET + MAGIC_LENGTH;
    static final int CRC_LENGTH = 4;
    static final int ATTRIBUTES_OFFSET = CRC_OFFSET + CRC_LENGTH;
    static final int ATTRIBUTE_LENGTH = 2;
    static final int LAST_OFFSET_DELTA_OFFSET = ATTRIBUTES_OFFSET + ATTRIBUTE_LENGTH;
    static final int LAST_OFFSET_DELTA_LENGTH = 4;
    static final int BASE_TIMESTAMP_OFFSET = LAST_OFFSET_DELTA_OFFSET + LAST_OFFSET_DELTA_LENGTH;
    static final int BASE_TIMESTAMP_LENGTH = 8;
    static final int MAX_TIMESTAMP_OFFSET = BASE_TIMESTAMP_OFFSET + BASE_TIMESTAMP_LENGTH;
    static final int MAX_TIMESTAMP_LENGTH = 8;
    static final int PRODUCER_ID_OFFSET = MAX_TIMESTAMP_OFFSET + MAX_TIMESTAMP_LENGTH;
    static final int PRODUCER_ID_LENGTH = 8;
    static final int PRODUCER_EPOCH_OFFSET = PRODUCER_ID_OFFSET + PRODUCER_ID_LENGTH;
    static final int PRODUCER_EPOCH_LENGTH = 2;
    static final int BASE_SEQUENCE_OFFSET = PRODUCER_EPOCH_OFFSET + PRODUCER_EPOCH_LENGTH;
    static final int BASE_SEQUENCE_LENGTH = 4;
    static final int RECORDS_COUNT_OFFSET = BASE_SEQUENCE_OFFSET + BASE_SEQUENCE_LENGTH;
    static final int RECORDS_COUNT_LENGTH = 4;
    static final int RECORDS_OFFSET = RECORDS_COUNT_OFFSET + RECORDS_COUNT_LENGTH;

    /** Size in bytes of the fixed batch header, i.e. the offset at which the record data begins. */
    public static final int RECORD_BATCH_OVERHEAD = RECORDS_OFFSET;

    // Bit masks applied to the low byte of the attributes field (see the table in the class documentation).
    private static final byte COMPRESSION_CODEC_MASK = 0x07;
    private static final byte TRANSACTIONAL_FLAG_MASK = 0x10;
    private static final int CONTROL_FLAG_MASK = 0x20;
    private static final byte TIMESTAMP_TYPE_MASK = 0x08;

    // Backing storage for the batch; every header accessor reads it at an absolute index.
    private final ByteBuffer buffer;

    /**
     * Create a batch view over the given buffer. The buffer is kept by reference, not copied.
     *
     * @param buffer the buffer holding the batch
     */
    DefaultRecordBatch(ByteBuffer buffer) {
        this.buffer = buffer;
    }

    /**
     * @return the magic byte stored in the batch header
     */
    @Override
    public byte magic() {
        return buffer.get(MAGIC_OFFSET);
    }

    /**
     * Verify that the batch is at least as large as the fixed header and that {@link #isValid()} holds.
     *
     * @throws InvalidRecordException if the batch is smaller than {@link #RECORD_BATCH_OVERHEAD} or the stored
     *         CRC does not match the computed one
     */
    @Override
    public void ensureValid() {
        int batchSize = sizeInBytes();
        if (batchSize < RECORD_BATCH_OVERHEAD)
            throw new InvalidRecordException("Record batch is corrupt (the size " + batchSize +
                    " is smaller than the minimum allowed overhead " + RECORD_BATCH_OVERHEAD + ")");

        if (!isValid())
            throw new InvalidRecordException("Record is corrupt (stored crc = " + checksum()
                    + ", computed crc = " + computeChecksum() + ")");
    }

    /**
     * @return the base timestamp stored in the batch header
     */
    private long baseTimestamp() {
        return buffer.getLong(BASE_TIMESTAMP_OFFSET);
    }

    /**
     * @return the max timestamp stored in the batch header
     */
    @Override
    public long maxTimestamp() {
        return buffer.getLong(MAX_TIMESTAMP_OFFSET);
    }

    /**
     * @return {@link TimestampType#LOG_APPEND_TIME} if the timestamp type bit of the attributes is set,
     *         otherwise {@link TimestampType#CREATE_TIME}
     */
    @Override
    public TimestampType timestampType() {
        return (attributes() & TIMESTAMP_TYPE_MASK) == 0 ? TimestampType.CREATE_TIME : TimestampType.LOG_APPEND_TIME;
    }

    /**
     * @return the base offset stored in the batch header
     */
    @Override
    public long baseOffset() {
        return buffer.getLong(BASE_OFFSET_OFFSET);
    }

    /**
     * @return the base offset plus the last offset delta
     */
    @Override
    public long lastOffset() {
        return baseOffset() + lastOffsetDelta();
    }

    /**
     * @return the producer id stored in the batch header
     */
    @Override
    public long producerId() {
        return buffer.getLong(PRODUCER_ID_OFFSET);
    }

    /**
     * @return the producer epoch stored in the batch header
     */
    @Override
    public short producerEpoch() {
        return buffer.getShort(PRODUCER_EPOCH_OFFSET);
    }

    /**
     * @return the base sequence stored in the batch header
     */
    @Override
    public int baseSequence() {
        return buffer.getInt(BASE_SEQUENCE_OFFSET);
    }

    /**
     * @return the last offset delta stored in the batch header
     */
    private int lastOffsetDelta() {
        return buffer.getInt(LAST_OFFSET_DELTA_OFFSET);
    }

    /**
     * @return {@link RecordBatch#NO_SEQUENCE} if the batch carries no base sequence, otherwise the base
     *         sequence plus the last offset delta
     */
    @Override
    public int lastSequence() {
        int baseSequence = baseSequence();
        if (baseSequence == RecordBatch.NO_SEQUENCE)
            return RecordBatch.NO_SEQUENCE;
        // NOTE(review): this sum can overflow int when baseSequence is close to Integer.MAX_VALUE -- confirm
        // whether sequence wrap-around needs to be handled here.
        return baseSequence + lastOffsetDelta();
    }

    /**
     * @return the compression type identified by the compression codec bits of the attributes
     */
    @Override
    public CompressionType compressionType() {
        return CompressionType.forId(attributes() & COMPRESSION_CODEC_MASK);
    }

    /**
     * @return {@code LOG_OVERHEAD} plus the value of the length field in the batch header
     */
    @Override
    public int sizeInBytes() {
        return LOG_OVERHEAD + buffer.getInt(LENGTH_OFFSET);
    }

    /**
     * @return the record count stored in the batch header
     */
    private int count() {
        return buffer.getInt(RECORDS_COUNT_OFFSET);
    }

    /**
     * @return the record count stored in the batch header; this implementation never returns {@code null}
     */
    @Override
    public Integer countOrNull() {
        return count();
    }

    /**
     * Copy this batch into the given buffer. A duplicate of the backing buffer is used as the source, so the
     * position and limit of this batch's own buffer are left untouched.
     *
     * @param buffer the destination buffer
     */
    @Override
    public void writeTo(ByteBuffer buffer) {
        buffer.put(this.buffer.duplicate());
    }

    /**
     * @return true if the transactional bit of the attributes is set
     */
    @Override
    public boolean isTransactional() {
        return (attributes() & TRANSACTIONAL_FLAG_MASK) > 0;
    }

    /**
     * @return true if the control bit of the attributes is set
     */
    @Override
    public boolean isControlBatch() {
        return (attributes() & CONTROL_FLAG_MASK) > 0;
    }

    /**
     * @return the partition leader epoch stored in the batch header
     */
    @Override
    public int partitionLeaderEpoch() {
        return buffer.getInt(PARTITION_LEADER_EPOCH_OFFSET);
    }

    /**
     * Build an iterator which reads records lazily from a decompression stream positioned at the start of the
     * record data. The returned iterator owns the stream and releases it in {@code close()}.
     *
     * @throws KafkaException (from the returned iterator) if reading from or closing the stream fails
     */
    private CloseableIterator<Record> compressedIterator() {
        ByteBuffer buffer = this.buffer.duplicate();
        buffer.position(RECORDS_OFFSET);
        final DataInputStream stream = new DataInputStream(compressionType().wrapForInput(
                new ByteBufferInputStream(buffer), magic()));

        try {
            return new RecordIterator() {
                @Override
                protected Record readNext(long baseOffset, long baseTimestamp, int baseSequence, Long logAppendTime) {
                    try {
                        return DefaultRecord.readFrom(stream, baseOffset, baseTimestamp, baseSequence, logAppendTime);
                    } catch (IOException e) {
                        throw new KafkaException("Failed to decompress record stream", e);
                    }
                }

                @Override
                public void close() {
                    try {
                        stream.close();
                    } catch (IOException e) {
                        throw new KafkaException("Failed to close record stream", e);
                    }
                }
            };
        } catch (RuntimeException e) {
            // The RecordIterator constructor rejects invalid headers (e.g. a negative record count). When it
            // throws, no iterator exists that could close the stream, so release it here before propagating.
            try {
                stream.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Build an iterator which reads records directly from a duplicate of the backing buffer, positioned at the
     * start of the record data. Closing the returned iterator is a no-op.
     */
    private CloseableIterator<Record> uncompressedIterator() {
        final ByteBuffer buffer = this.buffer.duplicate();
        buffer.position(RECORDS_OFFSET);
        return new RecordIterator() {
            @Override
            protected Record readNext(long baseOffset, long baseTimestamp, int baseSequence, Long logAppendTime) {
                return DefaultRecord.readFrom(buffer, baseOffset, baseTimestamp, baseSequence, logAppendTime);
            }

            @Override
            public void close() {}
        };
    }

    /**
     * @return an iterator over the records of this batch; for compressed batches all records are decompressed
     *         eagerly before the iterator is returned
     */
    @Override
    public Iterator<Record> iterator() {
        if (!isCompressed())
            return uncompressedIterator();

        // for a normal iterator, we cannot ensure that the underlying compression stream is closed,
        // so we decompress the full record set here. Use cases which call for a lower memory footprint
        // can use `streamingIterator` at the cost of additional complexity
        try (CloseableIterator<Record> iterator = compressedIterator()) {
            List<Record> records = new ArrayList<>(count());
            while (iterator.hasNext())
                records.add(iterator.next());
            return records.iterator();
        }
    }

    /**
     * @return an iterator which reads records on demand; the caller is responsible for closing it
     */
    @Override
    public CloseableIterator<Record> streamingIterator() {
        return isCompressed() ? compressedIterator() : uncompressedIterator();
    }

    /**
     * Set the last offset of the batch. Only the base offset field is rewritten: it becomes the given offset
     * minus the last offset delta already stored in the header.
     *
     * @param offset the new last offset
     */
    @Override
    public void setLastOffset(long offset) {
        buffer.putLong(BASE_OFFSET_OFFSET, offset - lastOffsetDelta());
    }

    /**
     * Update the timestamp type and max timestamp of the batch, then recompute and store the CRC. Nothing is
     * written if both values already match the header.
     *
     * @param timestampType the new timestamp type
     * @param maxTimestamp the new max timestamp
     * @throws IllegalArgumentException if {@code timestampType} is {@link TimestampType#NO_TIMESTAMP_TYPE}
     */
    @Override
    public void setMaxTimestamp(TimestampType timestampType, long maxTimestamp) {
        long currentMaxTimestamp = maxTimestamp();
        // We don't need to recompute crc if the timestamp is not updated.
        if (timestampType() == timestampType && currentMaxTimestamp == maxTimestamp)
            return;

        byte attributes = computeAttributes(compressionType(), timestampType, isTransactional(), isControlBatch());
        buffer.putShort(ATTRIBUTES_OFFSET, attributes);
        buffer.putLong(MAX_TIMESTAMP_OFFSET, maxTimestamp);
        // Both rewritten fields lie inside the CRC-covered range, so the stored checksum must be refreshed.
        long crc = computeChecksum();
        ByteUtils.writeUnsignedInt(buffer, CRC_OFFSET, crc);
    }

    /**
     * Overwrite the partition leader epoch field. The CRC is not touched by this method.
     *
     * @param epoch the new partition leader epoch
     */
    @Override
    public void setPartitionLeaderEpoch(int epoch) {
        buffer.putInt(PARTITION_LEADER_EPOCH_OFFSET, epoch);
    }

    /**
     * @return the CRC stored in the batch header, read as an unsigned 32-bit value
     */
    @Override
    public long checksum() {
        return ByteUtils.readUnsignedInt(buffer, CRC_OFFSET);
    }

    /**
     * @return true if the batch is at least {@link #RECORD_BATCH_OVERHEAD} bytes and the stored CRC equals the
     *         computed CRC
     */
    public boolean isValid() {
        return sizeInBytes() >= RECORD_BATCH_OVERHEAD && checksum() == computeChecksum();
    }

    /**
     * Compute the CRC-32C over the bytes from the attributes field up to the buffer's limit.
     */
    private long computeChecksum() {
        // NOTE(review): the range ends at buffer.limit(), not at sizeInBytes(); presumably the buffer is always
        // limited to exactly one batch -- confirm against the callers that construct this class.
        return Crc32C.compute(buffer, ATTRIBUTES_OFFSET, buffer.limit() - ATTRIBUTES_OFFSET);
    }

    /**
     * @return the low byte of the attributes field
     */
    private byte attributes() {
        // note we're not using the second byte of attributes
        return (byte) buffer.getShort(ATTRIBUTES_OFFSET);
    }

    /**
     * Two batches are equal when they are of the same class and their backing buffers are equal.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o)
            return true;
        if (o == null || getClass() != o.getClass())
            return false;

        DefaultRecordBatch that = (DefaultRecordBatch) o;
        return buffer != null ? buffer.equals(that.buffer) : that.buffer == null;
    }

    /**
     * @return the hash code of the backing buffer, or 0 if there is none
     */
    @Override
    public int hashCode() {
        return buffer != null ? buffer.hashCode() : 0;
    }

    /**
     * Pack the given batch properties into the attributes byte described in the class documentation.
     *
     * @throws IllegalArgumentException if {@code timestampType} is {@link TimestampType#NO_TIMESTAMP_TYPE}
     */
    private static byte computeAttributes(CompressionType type, TimestampType timestampType,
                                          boolean isTransactional, boolean isControl) {
        if (timestampType == TimestampType.NO_TIMESTAMP_TYPE)
            throw new IllegalArgumentException("Timestamp type must be provided to compute attributes for message " +
                    "format v2 and above");

        byte attributes = isTransactional ? TRANSACTIONAL_FLAG_MASK : 0;
        if (isControl)
            attributes |= CONTROL_FLAG_MASK;
        if (type.id > 0)
            attributes |= COMPRESSION_CODEC_MASK & type.id;
        if (timestampType == TimestampType.LOG_APPEND_TIME)
            attributes |= TIMESTAMP_TYPE_MASK;
        return attributes;
    }

    /**
     * Write the batch header into {@code buffer}, starting at the buffer's current position, and then store the
     * CRC. All fields are written with absolute puts, so the buffer's position is not advanced. The CRC is
     * computed after every other header field has been written.
     *
     * @param sizeInBytes total size of the batch; the length field is written as this value minus
     *        {@code LOG_OVERHEAD}
     * @throws IllegalArgumentException if {@code magic} is lower than {@link RecordBatch#CURRENT_MAGIC_VALUE},
     *         if {@code baseTimestamp} is negative and not {@code NO_TIMESTAMP}, or if {@code timestampType} is
     *         {@link TimestampType#NO_TIMESTAMP_TYPE}
     */
    static void writeHeader(ByteBuffer buffer,
                            long baseOffset,
                            int lastOffsetDelta,
                            int sizeInBytes,
                            byte magic,
                            CompressionType compressionType,
                            TimestampType timestampType,
                            long baseTimestamp,
                            long maxTimestamp,
                            long producerId,
                            short epoch,
                            int sequence,
                            boolean isTransactional,
                            boolean isControlBatch,
                            int partitionLeaderEpoch,
                            int numRecords) {
        if (magic < RecordBatch.CURRENT_MAGIC_VALUE)
            throw new IllegalArgumentException("Invalid magic value " + magic);
        if (baseTimestamp < 0 && baseTimestamp != NO_TIMESTAMP)
            throw new IllegalArgumentException("Invalid message timestamp " + baseTimestamp);

        short attributes = computeAttributes(compressionType, timestampType, isTransactional, isControlBatch);

        int position = buffer.position();
        buffer.putLong(position + BASE_OFFSET_OFFSET, baseOffset);
        buffer.putInt(position + LENGTH_OFFSET, sizeInBytes - LOG_OVERHEAD);
        buffer.putInt(position + PARTITION_LEADER_EPOCH_OFFSET, partitionLeaderEpoch);
        buffer.put(position + MAGIC_OFFSET, magic);
        buffer.putShort(position + ATTRIBUTES_OFFSET, attributes);
        buffer.putLong(position + BASE_TIMESTAMP_OFFSET, baseTimestamp);
        buffer.putLong(position + MAX_TIMESTAMP_OFFSET, maxTimestamp);
        buffer.putInt(position + LAST_OFFSET_DELTA_OFFSET, lastOffsetDelta);
        buffer.putLong(position + PRODUCER_ID_OFFSET, producerId);
        buffer.putShort(position + PRODUCER_EPOCH_OFFSET, epoch);
        buffer.putInt(position + BASE_SEQUENCE_OFFSET, sequence);
        buffer.putInt(position + RECORDS_COUNT_OFFSET, numRecords);
        // NOTE(review): unlike the puts above, the CRC offset is not adjusted by `position`; presumably
        // Crc32C.compute interprets the offset relative to the buffer's current position -- confirm.
        long crc = Crc32C.compute(buffer, ATTRIBUTES_OFFSET, sizeInBytes - ATTRIBUTES_OFFSET);
        buffer.putInt(position + CRC_OFFSET, (int) crc);
    }

    @Override
    public String toString() {
        return "RecordBatch(magic=" + magic() + ", offsets=[" + baseOffset() + ", " + lastOffset() + "], " +
                "compression=" + compressionType() + ", timestampType=" + timestampType() + ", crc=" + checksum() + ")";
    }

    /**
     * Compute the size of a batch holding the given records: the fixed header plus the size of each record,
     * where offset deltas are taken relative to {@code baseOffset} and timestamp deltas relative to the
     * timestamp of the first record.
     *
     * @return the batch size in bytes, or 0 if {@code records} is empty
     */
    public static int sizeInBytes(long baseOffset, Iterable<Record> records) {
        Iterator<Record> iterator = records.iterator();
        if (!iterator.hasNext())
            return 0;

        int size = RECORD_BATCH_OVERHEAD;
        Long baseTimestamp = null;
        while (iterator.hasNext()) {
            Record record = iterator.next();
            int offsetDelta = (int) (record.offset() - baseOffset);
            // the first record seen defines the base timestamp
            if (baseTimestamp == null)
                baseTimestamp = record.timestamp();
            long timestampDelta = record.timestamp() - baseTimestamp;
            size += DefaultRecord.sizeInBytes(offsetDelta, timestampDelta, record.key(), record.value(),
                    record.headers());
        }
        return size;
    }

    /**
     * Compute the size of a batch holding the given records: the fixed header plus the size of each record,
     * where offset deltas count up from 0 in iteration order and timestamp deltas are relative to the timestamp
     * of the first record.
     *
     * @return the batch size in bytes, or 0 if {@code records} is empty
     */
    public static int sizeInBytes(Iterable<SimpleRecord> records) {
        Iterator<SimpleRecord> iterator = records.iterator();
        if (!iterator.hasNext())
            return 0;

        int size = RECORD_BATCH_OVERHEAD;
        int offsetDelta = 0;
        Long baseTimestamp = null;
        while (iterator.hasNext()) {
            SimpleRecord record = iterator.next();
            // the first record seen defines the base timestamp
            if (baseTimestamp == null)
                baseTimestamp = record.timestamp();
            long timestampDelta = record.timestamp() - baseTimestamp;
            size += DefaultRecord.sizeInBytes(offsetDelta++, timestampDelta, record.key(), record.value(),
                    record.headers());
        }
        return size;
    }

    /**
     * Get an upper bound on the size of a batch with only a single record using a given key and value.
     */
    static int batchSizeUpperBound(byte[] key, byte[] value, Header[] headers) {
        return RECORD_BATCH_OVERHEAD + DefaultRecord.recordSizeUpperBound(key, value, headers);
    }

    /**
     * Base class of the record iterators of this batch. It snapshots the header values needed to decode
     * records at construction time and yields exactly as many records as the header's record count says;
     * subclasses supply the actual decoding in {@link #readNext(long, long, int, Long)}.
     */
    private abstract class RecordIterator implements CloseableIterator<Record> {
        // max timestamp of the batch when its timestamp type is LOG_APPEND_TIME, otherwise null
        private final Long logAppendTime;
        private final long baseOffset;
        private final long baseTimestamp;
        private final int baseSequence;
        private final int numRecords;
        private int readRecords = 0;

        /**
         * @throws InvalidRecordException if the record count stored in the header is negative
         */
        public RecordIterator() {
            this.logAppendTime = timestampType() == TimestampType.LOG_APPEND_TIME ? maxTimestamp() : null;
            this.baseOffset = baseOffset();
            this.baseTimestamp = baseTimestamp();
            this.baseSequence = baseSequence();
            int numRecords = count();
            if (numRecords < 0)
                throw new InvalidRecordException("Found invalid record count " + numRecords + " in magic v" +
                        magic() + " batch");
            this.numRecords = numRecords;
        }

        @Override
        public boolean hasNext() {
            return readRecords < numRecords;
        }

        @Override
        public Record next() {
            if (readRecords >= numRecords)
                throw new NoSuchElementException();

            // counted before decoding, so a record whose decoding fails is still counted as read
            readRecords++;
            return readNext(baseOffset, baseTimestamp, baseSequence, logAppendTime);
        }

        /**
         * Decode the next record from the underlying source.
         */
        protected abstract Record readNext(long baseOffset, long baseTimestamp, int baseSequence, Long logAppendTime);

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }
}