/** * Copyright (C) 2010-2013 Alibaba Group Holding Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.alibaba.rocketmq.store; import java.nio.BufferUnderflowException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.alibaba.rocketmq.common.ServiceThread; import com.alibaba.rocketmq.common.UtilAll; import com.alibaba.rocketmq.common.constant.LoggerName; import com.alibaba.rocketmq.common.message.MessageConst; import com.alibaba.rocketmq.common.message.MessageDecoder; import com.alibaba.rocketmq.common.message.MessageExt; import com.alibaba.rocketmq.common.sysflag.MessageSysFlag; import com.alibaba.rocketmq.store.config.BrokerRole; import com.alibaba.rocketmq.store.config.FlushDiskType; import com.alibaba.rocketmq.store.ha.HAService; import com.alibaba.rocketmq.store.schedule.ScheduleMessageService; /** * CommitLog实现 * * @author shijia.wxr<vintage.wang@gmail.com> * @since 2013-7-21 */ public class CommitLog { private static final Logger log = LoggerFactory.getLogger(LoggerName.StoreLoggerName); // 每个消息对应的MAGIC CODE daa320a7 /** * chen.si 标识commit log的普通消息 */ private final static int MessageMagicCode = 0xAABBCCDD ^ 1880681586 + 8; // 文件末尾空洞对应的MAGIC CODE cbd43194 /** * chen.si 标识commit log的文件末尾 */ private final static int BlankMagicCode = 0xBBCCDDEE ^ 1880681586 + 8; // 存储消息的队列 private final MapedFileQueue mapedFileQueue; // 存储顶层对象 private final DefaultMessageStore defaultMessageStore; // CommitLog刷盘服务 private final FlushCommitLogService flushCommitLogService; // 存储消息时的回调接口 private final AppendMessageCallback appendMessageCallback; // 用来保存每个ConsumeQueue的当前最大Offset信息 private HashMap<String/* topic-queueid */, Long/* offset */> topicQueueTable = new HashMap<String, Long>( 1024); /** * 构造函数 */ public CommitLog(final DefaultMessageStore defaultMessageStore) { /** * chen.si 初始化存储数据消息的抽象队列 */ this.mapedFileQueue = new MapedFileQueue(defaultMessageStore.getMessageStoreConfig().getStorePathCommitLog(), defaultMessageStore.getMessageStoreConfig().getMapedFileSizeCommitLog(), defaultMessageStore.getAllocateMapedFileService()); this.defaultMessageStore = defaultMessageStore; /** * chen.si 刷盘类型,可配置 */ if (FlushDiskType.SYNC_FLUSH == defaultMessageStore.getMessageStoreConfig().getFlushDiskType()) { this.flushCommitLogService = new GroupCommitService(); } else { this.flushCommitLogService = new FlushRealTimeService(); } this.appendMessageCallback = new DefaultAppendMessageCallback(defaultMessageStore.getMessageStoreConfig() .getMaxMessageSize()); } public boolean load() { boolean result = this.mapedFileQueue.load(); log.info("load commit log " + (result ? "OK" : "Failed")); return result; } /** * chen.si 启动刷盘服务 */ public void start() { this.flushCommitLogService.start(); } /** * chen.si 关闭刷盘服务 */ public void shutdown() { this.flushCommitLogService.shutdown(); } public long getMinOffset() { MapedFile mapedFile = this.mapedFileQueue.getFirstMapedFileOnLock(); if (mapedFile != null) { /** * chen.si TODO */ if (mapedFile.isAvailable()) { return mapedFile.getFileFromOffset(); } else { /** * chen.si 找到下一个文件的offset */ return this.rollNextFile(mapedFile.getFileFromOffset()); } } return -1; } /** * chen.si 找到当前offset对应的下一个文件,返回下一个文件的起始offset * 注意:这里的下一个文件的起始offset可能仍然是不可用的 * @param offset * @return */ public long rollNextFile(final long offset) { int mapedFileSize = this.defaultMessageStore.getMessageStoreConfig().getMapedFileSizeCommitLog(); return (offset + mapedFileSize - offset % mapedFileSize); } /** * chen.si 获取commit log的最大的offset,即:队列尾的offset,此offset不指向任何消息,指向下一个待写的位置 * @return */ public long getMaxOffset() { return this.mapedFileQueue.getMaxOffset(); } /** * chen.si 用于 commit log文件的自动清理服务, 消息最多保留N天,超过N天的,必须删除掉,判断依据为file的元数据 最近修改时间戳 * * @param expiredTime * @param deleteFilesInterval * @param intervalForcibly * @param cleanImmediately * @return */ public int deleteExpiredFile(// final long expiredTime, // final int deleteFilesInterval, // final long intervalForcibly,// final boolean cleanImmediately// ) { return this.mapedFileQueue.deleteExpiredFileByTime(expiredTime, deleteFilesInterval, intervalForcibly, cleanImmediately); } /** * 读取CommitLog数据,数据复制时使用 */ public SelectMapedBufferResult getData(final long offset) { /** * chen.si 用0 表示 queue的第1个文件 */ return this.getData(offset, (0 == offset ? true : false)); } public SelectMapedBufferResult getData(final long offset, final boolean returnFirstOnNotFound) { int mapedFileSize = this.defaultMessageStore.getMessageStoreConfig().getMapedFileSizeCommitLog(); MapedFile mapedFile = this.mapedFileQueue.findMapedFileByOffset(offset, returnFirstOnNotFound); if (mapedFile != null) { int pos = (int) (offset % mapedFileSize); SelectMapedBufferResult result = mapedFile.selectMapedBuffer(pos); return result; } return null; } /** * 正常退出时,数据恢复,所有内存数据都已经刷盘 */ public void recoverNormally() { /** * chen.si:这个方法的主要任务如下: * * 1. 设置 commit log queue中的最后一个消息位置 * * 2. 设置最后一个消息所在 log文件的commit 和 write position * * 3. 删除多余的文件 */ boolean checkCRCOnRecover = this.defaultMessageStore.getMessageStoreConfig().isCheckCRCOnRecover(); final List<MapedFile> mapedFiles = this.mapedFileQueue.getMapedFiles(); if (!mapedFiles.isEmpty()) { /** * chen.si:正常情况下,3个文件足够恢复了。也就是说,最后一个可写文件一般就在这3个文件中 */ // 从倒数第三个文件开始恢复 int index = mapedFiles.size() - 3; if (index < 0) index = 0; MapedFile mapedFile = mapedFiles.get(index); ByteBuffer byteBuffer = mapedFile.sliceByteBuffer(); /** * chen.si:这个是个global offset,指示最后一条消息 */ long processOffset = mapedFile.getFileFromOffset(); /** * chen.si:当前文件内的offset */ long mapedFileOffset = 0; while (true) { DispatchRequest dispatchRequest = this.checkMessageAndReturnSize(byteBuffer, checkCRCOnRecover); int size = dispatchRequest.getMsgSize(); // 正常数据 if (size > 0) { /** * chen.si:累加当前文件的消息offset(local offset 另外一个称为global offset) */ mapedFileOffset += size; } // 文件中间读到错误 /** * chen.si:文件未写完而已,并不是错误。此时直接break,因为这个文件就是最后一个待用文件 */ else if (size == -1) { //new DispatchRequest(-1) log.info("recover physics file end, " + mapedFile.getFileName()); break; } // 走到文件末尾,切换至下一个文件 // 由于返回0代表是遇到了最后的空洞,这个可以不计入truncate offset中 else if (size == 0) { //new DispatchRequest(0) /** * chen.si:遇到了空洞文件末尾,切换到下一个文件 */ index++; if (index >= mapedFiles.size()) { // 当前条件分支不可能发生 log.info("recover last 3 physics file over, last maped file " + mapedFile.getFileName()); break; } else { mapedFile = mapedFiles.get(index); byteBuffer = mapedFile.sliceByteBuffer(); /** * chen.si:每个map file的起始global offset, */ processOffset = mapedFile.getFileFromOffset(); /** * chen.si:localoffset */ mapedFileOffset = 0; log.info("recover next physics file, " + mapedFile.getFileName()); } } } processOffset += mapedFileOffset; // chen.si:最后一个消息的global offset this.mapedFileQueue.setCommittedWhere(processOffset); // chen.si:删除无用文件,其实就是 到 当前ready to write的文件 为止,后续的文件,都属于dirty files this.mapedFileQueue.truncateDirtyFiles(processOffset); } } public DispatchRequest checkMessageAndReturnSize(java.nio.ByteBuffer byteBuffer, final boolean checkCRC) { return this.checkMessageAndReturnSize(byteBuffer, checkCRC, true); } /** * 服务端使用 检查消息并返回消息大小 * * @return 0 表示走到文件末尾 >0 正常消息 -1 消息校验失败 */ public DispatchRequest checkMessageAndReturnSize(java.nio.ByteBuffer byteBuffer, final boolean checkCRC, final boolean readBody) { try { java.nio.ByteBuffer byteBufferMessage = ((DefaultAppendMessageCallback) this.appendMessageCallback).getMsgStoreItemMemory(); byte[] bytesContent = byteBufferMessage.array(); // 1 TOTALSIZE int totalSize = byteBuffer.getInt(); // 2 MAGICCODE int magicCode = byteBuffer.getInt(); switch (magicCode) { case MessageMagicCode: break; case BlankMagicCode: return new DispatchRequest(0); default: /** * chen.si:找到了最后一条消息,并且文件未写完,仍然可写。所以这里是一个常态,非异常 */ log.warn("found a illegal magic code 0x" + Integer.toHexString(magicCode)); return new DispatchRequest(-1); } // 3 BODYCRC int bodyCRC = byteBuffer.getInt(); // 4 QUEUEID int queueId = byteBuffer.getInt(); // 5 FLAG int flag = byteBuffer.getInt(); flag = flag + 0; // 6 QUEUEOFFSET long queueOffset = byteBuffer.getLong(); // 7 PHYSICALOFFSET long physicOffset = byteBuffer.getLong(); // 8 SYSFLAG int sysFlag = byteBuffer.getInt(); // 9 BORNTIMESTAMP long bornTimeStamp = byteBuffer.getLong(); bornTimeStamp = bornTimeStamp + 0; // 10 BORNHOST(IP+PORT) byteBuffer.get(bytesContent, 0, 8); // 11 STORETIMESTAMP long storeTimestamp = byteBuffer.getLong(); // 12 STOREHOST(IP+PORT) byteBuffer.get(bytesContent, 0, 8); // 13 RECONSUMETIMES int reconsumeTimes = byteBuffer.getInt(); // 14 Prepared Transaction Offset long preparedTransactionOffset = byteBuffer.getLong(); // 15 BODY int bodyLen = byteBuffer.getInt(); if (bodyLen > 0) { if (readBody) { byteBuffer.get(bytesContent, 0, bodyLen); // 校验CRC if (checkCRC) { int crc = UtilAll.crc32(bytesContent, 0, bodyLen); if (crc != bodyCRC) { log.warn("CRC check failed " + crc + " " + bodyCRC); return new DispatchRequest(-1); } } } else { //chen.si:不需要body,直接跳过 byteBuffer.position(byteBuffer.position() + bodyLen); } } // 16 TOPIC byte topicLen = byteBuffer.get(); byteBuffer.get(bytesContent, 0, topicLen); String topic = new String(bytesContent, 0, topicLen); long tagsCode = 0; String keys = ""; // 17 properties short propertiesLength = byteBuffer.getShort(); if (propertiesLength > 0) { byteBuffer.get(bytesContent, 0, propertiesLength); String properties = new String(bytesContent, 0, propertiesLength); Map<String, String> propertiesMap = MessageDecoder.string2messageProperties(properties); keys = propertiesMap.get(MessageConst.PROPERTY_KEYS); String tags = propertiesMap.get(MessageConst.PROPERTY_TAGS); if (tags != null && tags.length() > 0) { tagsCode = MessageExtBrokerInner.tagsString2tagsCode( MessageExt.parseTopicFilterType(sysFlag), tags); } } return new DispatchRequest(// topic,// 1 queueId,// 2 physicOffset,// 3 totalSize,// 4 tagsCode,// 5 storeTimestamp,// 6 queueOffset,// 7 keys,// 8 sysFlag,// 9 0L,// 10 preparedTransactionOffset,// 11 null// 12 ); } catch (BufferUnderflowException e) { byteBuffer.position(byteBuffer.limit()); } catch (Exception e) { byteBuffer.position(byteBuffer.limit()); } return new DispatchRequest(-1); } public void recoverAbnormally() { // 根据最小时间戳来恢复 boolean checkCRCOnRecover = this.defaultMessageStore.getMessageStoreConfig().isCheckCRCOnRecover(); final List<MapedFile> mapedFiles = this.mapedFileQueue.getMapedFiles(); if (!mapedFiles.isEmpty()) { // 寻找从哪个文件开始恢复 int index = mapedFiles.size() - 1; MapedFile mapedFile = null; for (; index >= 0; index--) { mapedFile = mapedFiles.get(index); /** * chen.si 不是恢复所有文件checkpoint。看如下的示意图 * |F1| |F2| |F3| * checkpoint是一个时间戳,实际上可能会指向任意一个文件的任意位置,一般来说是最后1个文件,因为checkpoint一直在刷盘。 * 具体指向什么,是按照commit log的消息的storeTimestamp来确定的。确定方式,就是比较checkpoint时间戳和File的第1个消息的storeTimestamp * 如果checkpoint的时间戳 晚于 消息的storeTimestamp,则说明就从这个文件开始恢复,否则继续找前一个文件 * */ if (this.isMapedFileMatchedRecover(mapedFile)) { // 考虑 store时间戳的误差,所以从上一个文件进行恢复,防止消息丢失 // TODO 最准确的方式是直接找到上一个文件的checkpoint对应的点,然后恢复剩余的消息,避免恢复整个文件。 // 但是commit log需要从头寻找才能确定消息,而且都要走一遍page cache,性能相差基本不大 log.info("recover from this maped file " + mapedFile.getFileName()); break; } /** * chen.si 走到这里,说明 需要找 再之前的文件, 进行恢复 */ } if (index < 0) { index = 0; mapedFile = mapedFiles.get(index); } ByteBuffer byteBuffer = mapedFile.sliceByteBuffer(); /** * chen.si global offset */ long processOffset = mapedFile.getFileFromOffset(); /** * chen.si local offset */ long mapedFileOffset = 0; while (true) { /** * chen.si 深入看checkMessageAndReturnSize方法,会发现最后DispatchRequest的producerGroup为null * * 这是因为:关于事务消息 的恢复,有单独的恢复流程,直接根据cq来恢复 */ DispatchRequest dispatchRequest = this.checkMessageAndReturnSize(byteBuffer, checkCRCOnRecover); int size = dispatchRequest.getMsgSize(); // 正常数据 if (size > 0) { mapedFileOffset += size; /** * chen.si 关键点,重放 commit log的消息 到 cq中, cq会选择是 因为已经建立索引消息而忽略, 或者 补偿建立索引消息 */ this.defaultMessageStore.putDispatchRequest(dispatchRequest); } // 文件中间读到错误 else if (size == -1) { log.info("recover physics file end, " + mapedFile.getFileName()); break; } // 走到文件末尾,切换至下一个文件 // 由于返回0代表是遇到了最后的空洞,这个可以不计入truncate offset中 else if (size == 0) { index++; if (index >= mapedFiles.size()) { // 当前条件分支正常情况下不应该发生 log.info("recover physics file over, last maped file " + mapedFile.getFileName()); break; } else { mapedFile = mapedFiles.get(index); byteBuffer = mapedFile.sliceByteBuffer(); processOffset = mapedFile.getFileFromOffset(); mapedFileOffset = 0; log.info("recover next physics file, " + mapedFile.getFileName()); } } } processOffset += mapedFileOffset; this.mapedFileQueue.setCommittedWhere(processOffset); this.mapedFileQueue.truncateDirtyFiles(processOffset); // 清除ConsumeQueue的多余数据 /** * chen.si TODO 不知道什么场景会产生如下情况: * * cq的索引消息中存在多余消息,其 对应的 数据消息 在 commit log中不存在。 * * 这个是以文件为级别的,发现中的第1条消息是多余消息,则删掉整个文件。 * 如果文件中 开始一部分消息合法,但是 后续消息不合法呢?cq.truncateDirtyLogicFiles方法中没看到处理逻辑,却是直接return的 * * 这里并没有覆盖文件后续的内容,而是直接通过write和commit position来截止到这里,实际上可能是不安全的。 * 因为假设此时不再有新的消息,但是正常关闭了,则下次cq被恢复时,会直接全部读完。 * * TODO 建议这里覆盖掉后面的内容 * * 2017/04/13 * 实际上这里有一个问题,就是上面的this.defaultMessageStore.putDispatchRequest(dispatchRequest);是异步的 * 这里直接truncate,是不正确的,需要等待上述恢复完成,这里才能truncate * * 好在,3.5.8甚至后面的4.0版本,已经全部采用同步的方式 * * 2017/05/08 * 在外层才有判断queue中是否有remaining的消息,这时候已经晚了,已经truncate完成了 * */ this.defaultMessageStore.truncateDirtyLogicFiles(processOffset); } // 物理文件都被删除情况下 else { /** * chen.si 直接重置cq */ this.mapedFileQueue.setCommittedWhere(0); this.defaultMessageStore.destroyLogics(); } } private boolean isMapedFileMatchedRecover(final MapedFile mapedFile) { ByteBuffer byteBuffer = mapedFile.sliceByteBuffer(); /** * chen.si 找第1个物理消息 */ int magicCode = byteBuffer.getInt(MessageDecoder.MessageMagicCodePostion); if (magicCode != MessageMagicCode) { return false; } /** * chen.si 获取第1个物理消息的存储时间 */ long storeTimestamp = byteBuffer.getLong(MessageDecoder.MessageStoreTimestampPostion); if (0 == storeTimestamp) { return false; } /** * chen.si 带上 index queue的 时间 来比较 */ if (this.defaultMessageStore.getMessageStoreConfig().isMessageIndexEnable()// && this.defaultMessageStore.getMessageStoreConfig().isMessageIndexSafe()) { if (storeTimestamp <= this.defaultMessageStore.getStoreCheckpoint().getMinTimestampIndex()) { log.info("find check timestamp, {} {}", // storeTimestamp,// UtilAll.timeMillisToHumanString(storeTimestamp)); return true; } } else { /** * chen.si 直接比较 commit queue 和 cq 的时间 */ if (storeTimestamp <= this.defaultMessageStore.getStoreCheckpoint().getMinTimestamp()) { log.info("find check timestamp, {} {}", // storeTimestamp,// UtilAll.timeMillisToHumanString(storeTimestamp)); return true; } } return false; } public PutMessageResult putMessage(final MessageExtBrokerInner msg) { /** * chen.si 真正存储消息的入口 */ // 设置存储时间 msg.setStoreTimestamp(System.currentTimeMillis()); // 设置消息体BODY CRC(考虑在客户端设置最合适) msg.setBodyCRC(UtilAll.crc32(msg.getBody())); // 返回结果 AppendMessageResult result = null; StoreStatsService storeStatsService = this.defaultMessageStore.getStoreStatsService(); String topic = msg.getTopic(); int queueId = msg.getQueueId(); long tagsCode = msg.getTagsCode(); final int tranType = MessageSysFlag.getTransactionValue(msg.getSysFlag()); if (tranType == MessageSysFlag.TransactionNotType// || tranType == MessageSysFlag.TransactionCommitType) { /** * chen.si 对于普通消息 和 commit消息,需要考虑 延迟发送 功能 */ // 延时投递 if (msg.getDelayTimeLevel() > 0) { if (msg.getDelayTimeLevel() > this.defaultMessageStore.getScheduleMessageService() .getMaxDelayLevel()) { msg.setDelayTimeLevel(this.defaultMessageStore.getScheduleMessageService() .getMaxDelayLevel()); } topic = ScheduleMessageService.SCHEDULE_TOPIC; queueId = ScheduleMessageService.delayLevel2QueueId(msg.getDelayTimeLevel()); tagsCode = this.defaultMessageStore.getScheduleMessageService().computeDeliverTimestamp( msg.getDelayTimeLevel(), msg.getStoreTimestamp()); /** * 备份真实的topic,queueId */ msg.putProperty(MessageConst.PROPERTY_REAL_TOPIC, msg.getTopic()); msg.putProperty(MessageConst.PROPERTY_REAL_QUEUE_ID, String.valueOf(msg.getQueueId())); msg.setPropertiesString(MessageDecoder.messageProperties2String(msg.getProperties())); msg.setTopic(topic); msg.setQueueId(queueId); } } // 写文件要加锁 synchronized (this) { long beginLockTimestamp = this.defaultMessageStore.getSystemClock().now(); // 这里设置存储时间戳,才能保证全局有序 /** * chen.si 这个是关键点, 才能保证后续的 恢复流程,可以依赖 存储时间戳 * * 不过这里的SystemClock是定时更新的,1ms更新一次。 * 实际上可能会出现多个消息的store时间一致的情况,会导致异常恢复,如果同一时间的消息跨越2个文件,会导致消息漏恢复 */ msg.setStoreTimestamp(beginLockTimestamp); // 尝试写入 MapedFile mapedFile = this.mapedFileQueue.getLastMapedFile(); if (null == mapedFile) { log.error("create maped file1 error, topic: " + msg.getTopic() + " clientAddr: " + msg.getBornHostString()); return new PutMessageResult(PutMessageStatus.CREATE_MAPEDFILE_FAILED, null); } result = mapedFile.appendMessage(msg, this.appendMessageCallback); switch (result.getStatus()) { // 成功追加消息 case PUT_OK: break; // 走到文件末尾 case END_OF_FILE: // 创建新文件,重新写消息 mapedFile = this.mapedFileQueue.getLastMapedFile(); if (null == mapedFile) { log.error("create maped file2 error, topic: " + msg.getTopic() + " clientAddr: " + msg.getBornHostString()); return new PutMessageResult(PutMessageStatus.CREATE_MAPEDFILE_FAILED, result); } result = mapedFile.appendMessage(msg, this.appendMessageCallback); break; // 消息大小超限 case MESSAGE_SIZE_EXCEEDED: return new PutMessageResult(PutMessageStatus.MESSAGE_ILLEGAL, result); // 未知错误 case UNKNOWN_ERROR: return new PutMessageResult(PutMessageStatus.UNKNOWN_ERROR, result); default: return new PutMessageResult(PutMessageStatus.UNKNOWN_ERROR, result); } /** * chen.si:这里的cq 和 tran消息都是异步,在commit log成功后,系统宕机,会导致消息直接丢失。 所以有异常恢复机制来确保消息不丢 */ DispatchRequest dispatchRequest = new DispatchRequest(// topic,// 1 queueId,// 2 result.getWroteOffset(),// 3 result.getWroteBytes(),// 4 tagsCode,// 5 msg.getStoreTimestamp(),// 6 result.getLogicsOffset(),// 7 msg.getKeys(),// 8 /** * 事务部分 */ msg.getSysFlag(),// 9 msg.getQueueOffset(), // 10 msg.getPreparedTransactionOffset(),// 11 msg.getProperty(MessageConst.PROPERTY_PRODUCER_GROUP)// 12 ); this.defaultMessageStore.putDispatchRequest(dispatchRequest); long eclipseTime = this.defaultMessageStore.getSystemClock().now() - beginLockTimestamp; if (eclipseTime > 1000) { log.warn("putMessage in lock eclipse time(ms) " + eclipseTime); } } // 返回结果 PutMessageResult putMessageResult = new PutMessageResult(PutMessageStatus.PUT_OK, result); // 统计消息SIZE storeStatsService.getSinglePutMessageTopicSizeTotal(topic).addAndGet(result.getWroteBytes()); GroupCommitRequest request = null; // 同步刷盘 if (FlushDiskType.SYNC_FLUSH == this.defaultMessageStore.getMessageStoreConfig().getFlushDiskType()) { /** * chen.si 同步模式, 将消息发送给 flush 线程, flush成功后,才会返回,除非超时 */ GroupCommitService service = (GroupCommitService) this.flushCommitLogService; if (msg.isWaitStoreMsgOK()) { request = new GroupCommitRequest(result.getWroteOffset() + result.getWroteBytes()); service.putRequest(request); boolean flushOK = request.waitForFlush(this.defaultMessageStore.getMessageStoreConfig() .getSyncFlushTimeout()); if (!flushOK) { log.error("do groupcommit, wait for flush failed, topic: " + msg.getTopic() + " tags: " + msg.getTags() + " client address: " + msg.getBornHostString()); putMessageResult.setPutMessageStatus(PutMessageStatus.FLUSH_DISK_TIMEOUT); } } else { service.wakeup(); } } // 异步刷盘 else { /** * chen.si 通知进行刷盘 */ this.flushCommitLogService.wakeup(); } // 同步双写 if (BrokerRole.SYNC_MASTER == this.defaultMessageStore.getMessageStoreConfig().getBrokerRole()) { HAService service = this.defaultMessageStore.getHaService(); if (msg.isWaitStoreMsgOK()) { // 判断是否要等待 if (service.isSlaveOK(result.getWroteOffset() + result.getWroteBytes())) { if (null == request) { request = new GroupCommitRequest(result.getWroteOffset() + result.getWroteBytes()); } service.putRequest(request); service.getWaitNotifyObject().wakeupAll(); boolean flushOK = // TODO 此处参数与刷盘公用是否合适 request.waitForFlush(this.defaultMessageStore.getMessageStoreConfig() .getSyncFlushTimeout()); if (!flushOK) { log.error("do sync transfer other node, wait return, but failed, topic: " + msg.getTopic() + " tags: " + msg.getTags() + " client address: " + msg.getBornHostString()); putMessageResult.setPutMessageStatus(PutMessageStatus.FLUSH_SLAVE_TIMEOUT); } } // Slave异常 else { // 告诉发送方,Slave异常 putMessageResult.setPutMessageStatus(PutMessageStatus.SLAVE_NOT_AVAILABLE); } } } // 向发送方返回结果 return putMessageResult; } /** * 根据offset获取特定消息的存储时间 如果出错,则返回-1 */ public long pickupStoretimestamp(final long offset, final int size) { /** * chen.si offset为phy offset */ SelectMapedBufferResult result = this.getMessage(offset, size); if (null != result) { try { return result.getByteBuffer().getLong(MessageDecoder.MessageStoreTimestampPostion); } finally { result.release(); } } return -1; } /** * 读取消息 */ public SelectMapedBufferResult getMessage(final long offset, final int size) { int mapedFileSize = this.defaultMessageStore.getMessageStoreConfig().getMapedFileSizeCommitLog(); /** * chen.si 获取phy offset所在的map file */ MapedFile mapedFile = this.mapedFileQueue.findMapedFileByOffset(offset, (0 == offset ? true : false)); if (mapedFile != null) { /** * chen.si 获取指定位置的消息的消息缓冲区 */ int pos = (int) (offset % mapedFileSize); SelectMapedBufferResult result = mapedFile.selectMapedBuffer(pos, size); return result; } return null; } public HashMap<String, Long> getTopicQueueTable() { return topicQueueTable; } public void setTopicQueueTable(HashMap<String, Long> topicQueueTable) { this.topicQueueTable = topicQueueTable; } public void destroy() { this.mapedFileQueue.destroy(); } public boolean appendData(long startOffset, byte[] data) { // 写文件要加锁 synchronized (this) { // 尝试写入 MapedFile mapedFile = this.mapedFileQueue.getLastMapedFile(startOffset); if (null == mapedFile) { log.error("appendData getLastMapedFile error " + startOffset); return false; } return mapedFile.appendMessage(data); } } public boolean retryDeleteFirstFile(final long intervalForcibly) { return this.mapedFileQueue.retryDeleteFirstFile(intervalForcibly); } abstract class FlushCommitLogService extends ServiceThread { } /** * 异步实时刷盘服务 */ class FlushRealTimeService extends FlushCommitLogService { private static final int RetryTimesOver = 3; private long lastFlushTimestamp = 0; private long printTimes = 0; public void run() { CommitLog.log.info(this.getServiceName() + " service started"); while (!this.isStoped()) { int interval = CommitLog.this.defaultMessageStore.getMessageStoreConfig() .getFlushIntervalCommitLog(); int flushPhysicQueueLeastPages = CommitLog.this.defaultMessageStore.getMessageStoreConfig() .getFlushCommitLogLeastPages(); int flushPhysicQueueThoroughInterval = CommitLog.this.defaultMessageStore.getMessageStoreConfig() .getFlushCommitLogThoroughInterval(); boolean printFlushProgress = false; // 定时刷盘,定时打印刷盘进度 long currentTimeMillis = System.currentTimeMillis(); if (currentTimeMillis >= (this.lastFlushTimestamp + flushPhysicQueueThoroughInterval)) { this.lastFlushTimestamp = currentTimeMillis; flushPhysicQueueLeastPages = 0; printFlushProgress = ((printTimes++ % 10) == 0); } try { this.waitForRunning(interval); if (printFlushProgress) { this.printFlushProgress(); } /** * chen.si 3种情况,会触发实际的commit: * * 1. 超时时间到了,必须刷新 * 2. 超时时间未到,但是数据已经超过X页,必须刷新(尽管waitForRunning每次都会被唤醒,并且执行commit,但是commit中会忽略不满X页的commit) * 3. 超时时间未到,数据未超过X页,但是 文件满了 * * 以上3种情况,只会commit当前文件,如果还存在下一个文件(最后一个文件,即:queue.committedWhere指向倒数第2个文件), * 则下一个文件的数据必须等到下一次commit */ CommitLog.this.mapedFileQueue.commit(flushPhysicQueueLeastPages); /** * chens.si TODO 第2和3种情况的刷新,不更新storeTimestamp,也不会更新checkpoint, 不知道为什么 * * 至少不会造成错误,checkpoint比实际操作慢一点,是没有问题的 */ long storeTimestamp = CommitLog.this.mapedFileQueue.getStoreTimestamp(); if (storeTimestamp > 0) { CommitLog.this.defaultMessageStore.getStoreCheckpoint().setPhysicMsgTimestamp( storeTimestamp); } } catch (Exception e) { CommitLog.log.warn(this.getServiceName() + " service has exception. ", e); this.printFlushProgress(); } } // 正常shutdown时,要保证全部刷盘才退出 /** * chen.si TODO 最后要再看下,是不是commit log完成了,才会触发这里。 不然的话,会有问题(文件满的情况下造成问题) */ boolean result = false; for (int i = 0; i < RetryTimesOver && !result; i++) { result = CommitLog.this.mapedFileQueue.commit(0); CommitLog.log.info(this.getServiceName() + " service shutdown, retry " + (i + 1) + " times " + (result ? "OK" : "Not OK")); } this.printFlushProgress(); CommitLog.log.info(this.getServiceName() + " service end"); } @Override public String getServiceName() { return FlushCommitLogService.class.getSimpleName(); } private void printFlushProgress() { CommitLog.log.info("how much disk fall behind memory, " + CommitLog.this.mapedFileQueue.howMuchFallBehind()); } @Override public long getJointime() { // 由于CommitLog数据量较大,所以回收时间要更长 return 1000 * 60 * 5; } } public class GroupCommitRequest { // 当前消息对应的下一个Offset private final long nextOffset; // 异步通知对象 private final CountDownLatch countDownLatch = new CountDownLatch(1); // 刷盘是否成功 private volatile boolean flushOK = false; public GroupCommitRequest(long nextOffset) { this.nextOffset = nextOffset; } public long getNextOffset() { return nextOffset; } public void wakeupCustomer(final boolean flushOK) { this.flushOK = flushOK; this.countDownLatch.countDown(); } public boolean waitForFlush(long timeout) { try { boolean result = this.countDownLatch.await(timeout, TimeUnit.MILLISECONDS); return result || this.flushOK; } catch (InterruptedException e) { e.printStackTrace(); return false; } } } /** * GroupCommit Service */ class GroupCommitService extends FlushCommitLogService { private volatile List<GroupCommitRequest> requestsWrite = new ArrayList<GroupCommitRequest>(); private volatile List<GroupCommitRequest> requestsRead = new ArrayList<GroupCommitRequest>(); public void putRequest(final GroupCommitRequest request) { synchronized (this) { this.requestsWrite.add(request); if (!this.hasNotified) { this.hasNotified = true; this.notify(); } } } private void swapRequests() { List<GroupCommitRequest> tmp = this.requestsWrite; this.requestsWrite = this.requestsRead; this.requestsRead = tmp; } private void doCommit() { if (!this.requestsRead.isEmpty()) { for (GroupCommitRequest req : this.requestsRead) { // 消息有可能在下一个文件,所以最多刷盘2次 /** * chen.si 如果上一个文件满,则先要刷上一个文件,然后再刷当前文件。所以可能要循环2次,具体参考commit方法 */ boolean flushOK = false; for (int i = 0; (i < 2) && !flushOK; i++) { /** * chen.si queue中当前消息是否已经刷新到磁盘 */ flushOK = (CommitLog.this.mapedFileQueue.getCommittedWhere() >= req.getNextOffset()); if (!flushOK) { /** * chen.si 强制刷新 */ CommitLog.this.mapedFileQueue.commit(0); } } /** * chen.si 唤醒commit log线程,数据消息存储成功 */ req.wakeupCustomer(flushOK); } long storeTimestamp = CommitLog.this.mapedFileQueue.getStoreTimestamp(); /** * chen.si 刷新成功,设置commit log的checkpoint * * TODO 单看这里的时间,是有问题的。 commit完成后, store的时间可能已经变化了,这样就标识的checkpoint 比实际的 要大 * 除非 commit那边是同步的, 这边也是单线程的,再看看异步模式 */ if (storeTimestamp > 0) { CommitLog.this.defaultMessageStore.getStoreCheckpoint().setPhysicMsgTimestamp( storeTimestamp); } this.requestsRead.clear(); } else { // 由于个别消息设置为不同步刷盘,所以会走到此流程 CommitLog.this.mapedFileQueue.commit(0); } } public void run() { CommitLog.log.info(this.getServiceName() + " service started"); while (!this.isStoped()) { try { this.waitForRunning(0); this.doCommit(); } catch (Exception e) { CommitLog.log.warn(this.getServiceName() + " service has exception. ", e); } } // 在正常shutdown情况下,等待请求到来,然后再刷盘 try { Thread.sleep(10); } catch (InterruptedException e) { CommitLog.log.warn("GroupCommitService Exception, ", e); } synchronized (this) { this.swapRequests(); } this.doCommit(); CommitLog.log.info(this.getServiceName() + " service end"); } @Override protected void onWaitEnd() { this.swapRequests(); } @Override public String getServiceName() { return GroupCommitService.class.getSimpleName(); } @Override public long getJointime() { // 由于CommitLog数据量较大,所以回收时间要更长 return 1000 * 60 * 5; } } class DefaultAppendMessageCallback implements AppendMessageCallback { // 文件末尾空洞最小定长 private static final int END_FILE_MIN_BLANK_LENGTH = 4 + 4; // 存储消息ID private final ByteBuffer msgIdMemory; // 存储消息内容 private final ByteBuffer msgStoreItemMemory; // 消息的最大长度 private final int maxMessageSize; DefaultAppendMessageCallback(final int size) { this.msgIdMemory = ByteBuffer.allocate(MessageDecoder.MSG_ID_LENGTH); this.msgStoreItemMemory = ByteBuffer.allocate(size + END_FILE_MIN_BLANK_LENGTH); this.maxMessageSize = size; } public ByteBuffer getMsgStoreItemMemory() { return msgStoreItemMemory; } public AppendMessageResult doAppend(final long fileFromOffset, final ByteBuffer byteBuffer, final int maxBlank, final Object msg) { /** * 生成消息ID STORETIMESTAMP + STOREHOSTADDRESS + OFFSET <br> */ MessageExtBrokerInner msgInner = (MessageExtBrokerInner) msg; /** * chen.si 这里单独传参fileFromOffset,理解不了用意。 * 对于commit log来说,只负责持久化消息,它面向的应该是 一个 buffer,以及 标识当前位置的global offset */ // PHY OFFSET long wroteOffset = fileFromOffset + byteBuffer.position(); String msgId = MessageDecoder.createMessageId(this.msgIdMemory, msgInner.getStoreHostBytes(), wroteOffset); /** * 记录ConsumeQueue信息 */ String key = msgInner.getTopic() + "-" + msgInner.getQueueId(); Long queueOffset = CommitLog.this.topicQueueTable.get(key); if (null == queueOffset) { queueOffset = 0L; CommitLog.this.topicQueueTable.put(key, queueOffset); } /** * 事务消息需要特殊处理 */ final int tranType = MessageSysFlag.getTransactionValue(msgInner.getSysFlag()); switch (tranType) { /** * chen.si prepare消息是新消息,所以需要offset,指向transaction table */ case MessageSysFlag.TransactionPreparedType: queueOffset = CommitLog.this.defaultMessageStore.getTransactionStateService() .getTranStateTableOffset().get(); break; case MessageSysFlag.TransactionRollbackType: queueOffset = msgInner.getQueueOffset(); break; case MessageSysFlag.TransactionNotType: case MessageSysFlag.TransactionCommitType: default: break; } /** * 序列化消息 */ final byte[] propertiesData = msgInner.getPropertiesString() == null ? null : msgInner.getPropertiesString().getBytes(); final int propertiesLength = propertiesData == null ? 0 : propertiesData.length; final byte[] topicData = msgInner.getTopic().getBytes(); final int topicLength = topicData == null ? 0 : topicData.length; final int bodyLength = msgInner.getBody() == null ? 0 : msgInner.getBody().length; final int msgLen = 4 // 1 TOTALSIZE + 4 // 2 MAGICCODE + 4 // 3 BODYCRC + 4 // 4 QUEUEID + 4 // 5 FLAG + 8 // 6 QUEUEOFFSET + 8 // 7 PHYSICALOFFSET + 4 // 8 SYSFLAG + 8 // 9 BORNTIMESTAMP + 8 // 10 BORNHOST + 8 // 11 STORETIMESTAMP + 8 // 12 STOREHOSTADDRESS + 4 // 13 RECONSUMETIMES + 8 // 14 Prepared Transaction Offset + 4 + bodyLength // 14 BODY + 1 + topicLength // 15 TOPIC + 2 + propertiesLength // 16 propertiesLength + 0; // 消息超过设定的最大值 if (msgLen > this.maxMessageSize) { CommitLog.log.warn("message size exceeded, msg total size: " + msgLen + ", msg body size: " + bodyLength + ", maxMessageSize: " + this.maxMessageSize); return new AppendMessageResult(AppendMessageStatus.MESSAGE_SIZE_EXCEEDED); } // 判断是否有足够空余空间 if ((msgLen + END_FILE_MIN_BLANK_LENGTH) > maxBlank) { this.resetMsgStoreItemMemory(maxBlank); // 1 TOTALSIZE this.msgStoreItemMemory.putInt(maxBlank); // 2 MAGICCODE this.msgStoreItemMemory.putInt(CommitLog.BlankMagicCode); // 3 剩余空间可能是任何值 // // 此处长度特意设置为maxBlank byteBuffer.put(this.msgStoreItemMemory.array(), 0, maxBlank); return new AppendMessageResult(AppendMessageStatus.END_OF_FILE, wroteOffset, maxBlank, msgId, msgInner.getStoreTimestamp(), queueOffset); } // 初始化存储空间 this.resetMsgStoreItemMemory(msgLen); // 1 TOTALSIZE this.msgStoreItemMemory.putInt(msgLen); // 2 MAGICCODE this.msgStoreItemMemory.putInt(CommitLog.MessageMagicCode); // 3 BODYCRC this.msgStoreItemMemory.putInt(msgInner.getBodyCRC()); // 4 QUEUEID this.msgStoreItemMemory.putInt(msgInner.getQueueId()); // 5 FLAG this.msgStoreItemMemory.putInt(msgInner.getFlag()); // 6 QUEUEOFFSET this.msgStoreItemMemory.putLong(queueOffset); // 7 PHYSICALOFFSET this.msgStoreItemMemory.putLong(fileFromOffset + byteBuffer.position()); // 8 SYSFLAG this.msgStoreItemMemory.putInt(msgInner.getSysFlag()); // 9 BORNTIMESTAMP this.msgStoreItemMemory.putLong(msgInner.getBornTimestamp()); // 10 BORNHOST this.msgStoreItemMemory.put(msgInner.getBornHostBytes()); // 11 STORETIMESTAMP this.msgStoreItemMemory.putLong(msgInner.getStoreTimestamp()); // 12 STOREHOSTADDRESS this.msgStoreItemMemory.put(msgInner.getStoreHostBytes()); // 13 RECONSUMETIMES this.msgStoreItemMemory.putInt(msgInner.getReconsumeTimes()); // 14 Prepared Transaction Offset this.msgStoreItemMemory.putLong(msgInner.getPreparedTransactionOffset()); // 15 BODY this.msgStoreItemMemory.putInt(bodyLength); if (bodyLength > 0) this.msgStoreItemMemory.put(msgInner.getBody()); // 16 TOPIC this.msgStoreItemMemory.put((byte) topicLength); this.msgStoreItemMemory.put(topicData); // 17 PROPERTIES this.msgStoreItemMemory.putShort((short) propertiesLength); if (propertiesLength > 0) this.msgStoreItemMemory.put(propertiesData); // 向队列缓冲区写入消息 byteBuffer.put(this.msgStoreItemMemory.array(), 0, msgLen); AppendMessageResult result = new AppendMessageResult(AppendMessageStatus.PUT_OK, wroteOffset, msgLen, msgId, msgInner.getStoreTimestamp(), queueOffset); switch (tranType) { case MessageSysFlag.TransactionPreparedType: CommitLog.this.defaultMessageStore.getTransactionStateService().getTranStateTableOffset() .incrementAndGet(); break; case MessageSysFlag.TransactionRollbackType: break; case MessageSysFlag.TransactionNotType: case MessageSysFlag.TransactionCommitType: // 更新下一次的ConsumeQueue信息 CommitLog.this.topicQueueTable.put(key, ++queueOffset); break; default: break; } // 返回结果 return result; } private void resetMsgStoreItemMemory(final int length) { this.msgStoreItemMemory.flip(); this.msgStoreItemMemory.limit(length); } } }