/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.ignite.internal.processors.cache.database.freelist;

import java.util.concurrent.atomic.AtomicReferenceArray;
import org.apache.ignite.IgniteCheckedException;
import org.apache.ignite.IgniteLogger;
import org.apache.ignite.internal.pagemem.PageIdAllocator;
import org.apache.ignite.internal.pagemem.PageIdUtils;
import org.apache.ignite.internal.pagemem.PageUtils;
import org.apache.ignite.internal.pagemem.wal.IgniteWriteAheadLogManager;
import org.apache.ignite.internal.pagemem.wal.record.delta.DataPageInsertFragmentRecord;
import org.apache.ignite.internal.pagemem.wal.record.delta.DataPageInsertRecord;
import org.apache.ignite.internal.pagemem.wal.record.delta.DataPageRemoveRecord;
import org.apache.ignite.internal.pagemem.wal.record.delta.DataPageUpdateRecord;
import org.apache.ignite.internal.processors.cache.database.CacheDataRow;
import org.apache.ignite.internal.processors.cache.database.MemoryMetricsImpl;
import org.apache.ignite.internal.processors.cache.database.MemoryPolicy;
import org.apache.ignite.internal.processors.cache.database.evict.PageEvictionTracker;
import org.apache.ignite.internal.processors.cache.database.tree.io.CacheVersionIO;
import org.apache.ignite.internal.processors.cache.database.tree.io.DataPageIO;
import org.apache.ignite.internal.processors.cache.database.tree.io.DataPagePayload;
import org.apache.ignite.internal.processors.cache.database.tree.io.PageIO;
import org.apache.ignite.internal.processors.cache.database.tree.reuse.ReuseBag;
import org.apache.ignite.internal.processors.cache.database.tree.reuse.ReuseList;
import org.apache.ignite.internal.processors.cache.database.tree.util.PageHandler;
import org.apache.ignite.internal.util.typedef.internal.U;

/**
 * Free list implementation that tracks data pages by the amount of free space left on them.
 * Pages are grouped into {@link #BUCKETS} buckets: a page's remaining free space, shifted
 * right by {@link #shift}, selects the bucket (see {@link #bucket(int, boolean)}).
 * The last bucket ({@link #REUSE_BUCKET}) is reserved for recycled pages, so this instance
 * can also act as its own {@link ReuseList} (when constructed with a {@code null} reuse list).
 * Page-level mutations are performed through {@link PageHandler}s so that WAL delta records
 * are logged alongside each physical page change.
 */
public class FreeListImpl extends PagesList implements FreeList, ReuseList {
    /** Number of free-space buckets. Must be a power of 2 so that {@link #shift} maps page size onto bucket indexes. */
    private static final int BUCKETS = 256; // Must be power of 2.

    /** Index of the bucket that holds recycled (reusable) pages rather than partially-free data pages. */
    private static final int REUSE_BUCKET = BUCKETS - 1;

    /** Sentinel returned by the write-row handler when the whole row has been written. */
    private static final Integer COMPLETE = Integer.MAX_VALUE;

    /** Sentinel "failure" value for int-result page writes; asserted never to be returned. */
    private static final Integer FAIL_I = Integer.MIN_VALUE;

    /** Sentinel "failure" value for long-result page writes; asserted never to be returned. */
    private static final Long FAIL_L = Long.MAX_VALUE;

    /** Pages with free space at or below this threshold are not returned to any bucket. */
    private static final int MIN_PAGE_FREE_SPACE = 8;

    /** Right-shift applied to a free-space value to obtain its bucket index (derived from page size in the constructor). */
    private final int shift;

    /** Per-bucket stripe arrays, updated via CAS (see {@link #casBucket(int, Stripe[], Stripe[])}). */
    private final AtomicReferenceArray<Stripe[]> buckets = new AtomicReferenceArray<>(BUCKETS);

    /** Maximum payload a single data page can hold (page size minus data page overhead). */
    private final int MIN_SIZE_FOR_DATA_PAGE;

    /** Bucket that holds completely empty data pages. */
    private final int emptyDataPagesBucket;

    /** Handler that updates a row in place on a data page. */
    private final PageHandler<CacheDataRow, Boolean> updateRow = new UpdateRowHandler();

    /** Memory metrics to account large (multi-page) entries. */
    private final MemoryMetricsImpl memMetrics;

    /** Eviction tracker notified on page touch/forget. */
    private final PageEvictionTracker evictionTracker;

    /**
     * Page handler that overwrites an existing row identified by its item ID and, on success,
     * logs a {@link DataPageUpdateRecord} WAL delta with the full updated payload.
     */
    private final class UpdateRowHandler extends PageHandler<CacheDataRow, Boolean> {
        @Override public Boolean run(
            int cacheId,
            long pageId,
            long page,
            long pageAddr,
            PageIO iox,
            Boolean walPlc,
            CacheDataRow row,
            int itemId) throws IgniteCheckedException {
            DataPageIO io = (DataPageIO)iox;

            int rowSize = getRowSize(row);

            boolean updated = io.updateRow(pageAddr, itemId, pageSize(), null, row, rowSize);

            evictionTracker.touchPage(pageId);

            if (updated && needWalDeltaRecord(pageId, page, walPlc)) {
                // TODO This record must contain only a reference to a logical WAL record with the actual data.
                byte[] payload = new byte[rowSize];

                DataPagePayload data = io.readPayload(pageAddr, itemId, pageSize());

                assert data.payloadSize() == rowSize;

                PageUtils.getBytes(pageAddr, data.offset(), payload, 0, rowSize);

                wal.log(new DataPageUpdateRecord(
                    cacheId,
                    pageId,
                    itemId,
                    payload));
            }

            return updated;
        }
    }

    /** Handler that writes a (possibly fragmented) row into a data page. */
    private final PageHandler<CacheDataRow, Integer> writeRow = new WriteRowHandler();

    /**
     * Page handler that writes either the whole row or the next fragment of it into a page,
     * re-files the page into the bucket matching its remaining free space, and returns
     * {@link #COMPLETE} once the full row size has been written.
     */
    private final class WriteRowHandler extends PageHandler<CacheDataRow, Integer> {
        @Override public Integer run(
            int cacheId,
            long pageId,
            long page,
            long pageAddr,
            PageIO iox,
            Boolean walPlc,
            CacheDataRow row,
            int written) throws IgniteCheckedException {
            DataPageIO io = (DataPageIO)iox;

            int rowSize = getRowSize(row);
            int oldFreeSpace = io.getFreeSpace(pageAddr);

            assert oldFreeSpace > 0 : oldFreeSpace;

            // If the full row does not fit into this page write only a fragment.
            written = (written == 0 && oldFreeSpace >= rowSize) ? addRow(pageId, page, pageAddr, io, row, rowSize):
                addRowFragment(pageId, page, pageAddr, io, row, written, rowSize);

            // Reread free space after update.
            int newFreeSpace = io.getFreeSpace(pageAddr);

            if (newFreeSpace > MIN_PAGE_FREE_SPACE) {
                int bucket = bucket(newFreeSpace, false);

                put(null, pageId, page, pageAddr, bucket);
            }

            if (written == rowSize)
                evictionTracker.touchPage(pageId);

            // Avoid boxing with garbage generation for usual case.
            return written == rowSize ? COMPLETE : written;
        }

        /**
         * @param pageId Page ID.
         * @param page Page pointer.
         * @param pageAddr Page address.
         * @param io IO.
         * @param row Row.
         * @param rowSize Row size.
         * @return Written size which is always equal to row size here.
         * @throws IgniteCheckedException If failed.
         */
        private int addRow(
            long pageId,
            long page,
            long pageAddr,
            DataPageIO io,
            CacheDataRow row,
            int rowSize
        ) throws IgniteCheckedException {
            io.addRow(pageAddr, row, rowSize, pageSize());

            if (needWalDeltaRecord(pageId, page, null)) {
                // TODO This record must contain only a reference to a logical WAL record with the actual data.
                byte[] payload = new byte[rowSize];

                // io.addRow updated row.link(), so the item ID is taken from the freshly assigned link.
                DataPagePayload data = io.readPayload(pageAddr,
                    PageIdUtils.itemId(row.link()),
                    pageSize());

                assert data.payloadSize() == rowSize;

                PageUtils.getBytes(pageAddr, data.offset(), payload, 0, rowSize);

                wal.log(new DataPageInsertRecord(
                    cacheId,
                    pageId,
                    payload));
            }

            return rowSize;
        }

        /**
         * @param pageId Page ID.
         * @param page Page pointer.
         * @param pageAddr Page address.
         * @param io IO.
         * @param row Row.
         * @param written Written size.
         * @param rowSize Row size.
         * @return Updated written size.
         * @throws IgniteCheckedException If failed.
         */
        private int addRowFragment(
            long pageId,
            long page,
            long pageAddr,
            DataPageIO io,
            CacheDataRow row,
            int written,
            int rowSize
        ) throws IgniteCheckedException {
            // Read last link before the fragment write, because it will be updated there.
            long lastLink = row.link();

            int payloadSize = io.addRowFragment(pageMem, pageAddr, row, written, rowSize, pageSize());

            assert payloadSize > 0 : payloadSize;

            if (needWalDeltaRecord(pageId, page, null)) {
                // TODO This record must contain only a reference to a logical WAL record with the actual data.
                byte[] payload = new byte[payloadSize];

                DataPagePayload data = io.readPayload(pageAddr,
                    PageIdUtils.itemId(row.link()),
                    pageSize());

                PageUtils.getBytes(pageAddr, data.offset(), payload, 0, payloadSize);

                wal.log(new DataPageInsertFragmentRecord(cacheId, pageId, payload, lastLink));
            }

            return written + payloadSize;
        }
    }

    /** Handler that removes a row (or one fragment of it) from a data page. */
    private final PageHandler<Void, Long> rmvRow = new RemoveRowHandler();

    /**
     * Page handler that removes the item with the given ID, logs a {@link DataPageRemoveRecord}
     * WAL delta, moves the page between free-space buckets if its bucket changed, and returns
     * the link to the next fragment of the row ({@code 0L} if there is none).
     */
    private final class RemoveRowHandler extends PageHandler<Void, Long> {
        @Override public Long run(
            int cacheId,
            long pageId,
            long page,
            long pageAddr,
            PageIO iox,
            Boolean walPlc,
            Void ignored,
            int itemId) throws IgniteCheckedException {
            DataPageIO io = (DataPageIO)iox;

            int oldFreeSpace = io.getFreeSpace(pageAddr);

            assert oldFreeSpace >= 0: oldFreeSpace;

            long nextLink = io.removeRow(pageAddr, itemId, pageSize());

            if (needWalDeltaRecord(pageId, page, walPlc))
                wal.log(new DataPageRemoveRecord(cacheId, pageId, itemId));

            int newFreeSpace = io.getFreeSpace(pageAddr);

            if (newFreeSpace > MIN_PAGE_FREE_SPACE) {
                int newBucket = bucket(newFreeSpace, false);

                if (oldFreeSpace > MIN_PAGE_FREE_SPACE) {
                    int oldBucket = bucket(oldFreeSpace, false);

                    if (oldBucket != newBucket) {
                        // It is possible that page was concurrently taken for put, in this case put will handle bucket change.
                        if (removeDataPage(pageId, page, pageAddr, io, oldBucket))
                            put(null, pageId, page, pageAddr, newBucket);
                    }
                }
                else
                    put(null, pageId, page, pageAddr, newBucket);

                if (io.isEmpty(pageAddr))
                    evictionTracker.forgetPage(pageId);
            }

            // For common case boxed 0L will be cached inside of Long, so no garbage will be produced.
            return nextLink;
        }
    }

    /**
     * @param cacheId Cache ID.
     * @param name Name (for debug purpose).
     * @param memMetrics Memory metrics.
     * @param memPlc Memory policy.
     * @param reuseList Reuse list or {@code null} if this free list will be a reuse list for itself.
     * @param wal Write ahead log manager.
     * @param metaPageId Metadata page ID.
     * @param initNew {@code True} if new metadata should be initialized.
     * @throws IgniteCheckedException If failed.
     */
    public FreeListImpl(
        int cacheId,
        String name,
        MemoryMetricsImpl memMetrics,
        MemoryPolicy memPlc,
        ReuseList reuseList,
        IgniteWriteAheadLogManager wal,
        long metaPageId,
        boolean initNew) throws IgniteCheckedException {
        super(cacheId, name, memPlc.pageMemory(), BUCKETS, wal, metaPageId);
        this.evictionTracker = memPlc.evictionTracker();
        this.reuseList = reuseList == null ? this : reuseList;
        int pageSize = pageMem.pageSize();

        assert U.isPow2(pageSize) : "Page size must be a power of 2: " + pageSize;
        assert U.isPow2(BUCKETS);
        assert BUCKETS <= pageSize : pageSize;

        // TODO this constant is used because currently we cannot reuse data pages as index pages
        // TODO and vice-versa. It should be removed when data storage format is finalized.
        MIN_SIZE_FOR_DATA_PAGE = pageSize - DataPageIO.MIN_DATA_PAGE_OVERHEAD;

        // Derive the shift that maps [0 .. pageSize] free-space values onto [0 .. BUCKETS) bucket indexes.
        int shift = 0;

        while (pageSize > BUCKETS) {
            shift++;
            pageSize >>>= 1;
        }

        this.shift = shift;

        this.memMetrics = memMetrics;

        emptyDataPagesBucket = bucket(MIN_SIZE_FOR_DATA_PAGE, false);

        init(metaPageId, initNew);
    }

    /**
     * Calculates average fill factor over FreeListImpl instance.
     *
     * @return Used fraction of tracked page space in {@code [0, 1]}, or {@code -1} if no pages are tracked.
     */
    public float fillFactor() {
        long pageSize = pageSize();

        long totalSize = 0;
        long loadSize = 0;

        // Skip bucket 0 and the reuse bucket; approximate each page's free space from its bucket index.
        for (int b = BUCKETS - 2; b > 0; b--) {
            long bsize = pageSize - ((REUSE_BUCKET - b) << shift);

            long pages = bucketsSize[b].longValue();

            loadSize += pages * (pageSize - bsize);

            totalSize += pages * pageSize;
        }

        return totalSize == 0 ? -1L : ((float) loadSize / totalSize);
    }

    /** {@inheritDoc} */
    @Override public void dumpStatistics(IgniteLogger log) {
        long dataPages = 0;

        // Flip to true locally to also dump per-bucket stripe details.
        final boolean dumpBucketsInfo = false;

        for (int b = 0; b < BUCKETS; b++) {
            long size = bucketsSize[b].longValue();

            if (!isReuseBucket(b))
                dataPages += size;

            if (dumpBucketsInfo) {
                Stripe[] stripes = getBucket(b);

                boolean empty = true;

                if (stripes != null) {
                    for (Stripe stripe : stripes) {
                        if (!stripe.empty) {
                            empty = false;

                            break;
                        }
                    }
                }

                log.info("Bucket [b=" + b +
                    ", size=" + size +
                    ", stripes=" + (stripes != null ? stripes.length : 0) +
                    ", stripesEmpty=" + empty + ']');
            }
        }

        if (dataPages > 0) {
            log.info("FreeList [name=" + name +
                ", buckets=" + BUCKETS +
                ", dataPages=" + dataPages +
                ", reusePages=" + bucketsSize[REUSE_BUCKET].longValue() + "]");
        }
    }

    /**
     * @param freeSpace Page free space.
     * @param allowReuse {@code True} if it is allowed to get reuse bucket.
     * @return Bucket.
     */
    private int bucket(int freeSpace, boolean allowReuse) {
        assert freeSpace > 0 : freeSpace;

        int bucket = freeSpace >>> shift;

        assert bucket >= 0 && bucket < BUCKETS : bucket;

        // Data-page lookups must not land in the reuse bucket; demote to the previous bucket.
        if (!allowReuse && isReuseBucket(bucket))
            bucket--;

        return bucket;
    }

    /**
     * @param part Partition.
     * @return Page ID.
     * @throws IgniteCheckedException If failed.
     */
    private long allocateDataPage(int part) throws IgniteCheckedException {
        assert part <= PageIdAllocator.MAX_PARTITION_ID;
        assert part != PageIdAllocator.INDEX_PARTITION;

        return pageMem.allocatePage(cacheId, part, PageIdAllocator.FLAG_DATA);
    }

    /** {@inheritDoc} */
    @Override public void insertDataRow(CacheDataRow row) throws IgniteCheckedException {
        int rowSize = getRowSize(row);

        int written = 0;

        // Each iteration writes the whole row or one fragment; rows larger than a page span multiple pages.
        do {
            if (written != 0)
                memMetrics.incrementLargeEntriesPages();

            int freeSpace = Math.min(MIN_SIZE_FOR_DATA_PAGE, rowSize - written);

            long pageId = 0L;

            // A row (or remainder) needing a full page is served from the empty-pages bucket first.
            if (freeSpace == MIN_SIZE_FOR_DATA_PAGE)
                pageId = takeEmptyPage(emptyDataPagesBucket, DataPageIO.VERSIONS);

            boolean reuseBucket = false; // TODO: properly handle reuse bucket.

            // Otherwise scan buckets with at least the required free space.
            if (pageId == 0L) {
                for (int b = bucket(freeSpace, false) + 1; b < BUCKETS - 1; b++) {
                    pageId = takeEmptyPage(b, DataPageIO.VERSIONS);

                    if (pageId != 0L) {
                        reuseBucket = isReuseBucket(b);

                        break;
                    }
                }
            }

            boolean allocated = pageId == 0L;

            if (allocated)
                pageId = allocateDataPage(row.partition());

            // Freshly allocated or recycled pages must be (re)initialized with the latest data page IO.
            DataPageIO init = reuseBucket || allocated ? DataPageIO.VERSIONS.latest() : null;

            written = write(pageId, writeRow, init, row, written, FAIL_I);

            assert written != FAIL_I; // We can't fail here.
        }
        while (written != COMPLETE);
    }

    /** {@inheritDoc} */
    @Override public boolean updateDataRow(long link, CacheDataRow row) throws IgniteCheckedException {
        assert link != 0;

        long pageId = PageIdUtils.pageId(link);
        int itemId = PageIdUtils.itemId(link);

        Boolean updated = write(pageId, updateRow, row, itemId, null);

        assert updated != null; // Can't fail here.

        return updated;
    }

    /** {@inheritDoc} */
    @Override public void removeDataRowByLink(long link) throws IgniteCheckedException {
        assert link != 0;

        long pageId = PageIdUtils.pageId(link);
        int itemId = PageIdUtils.itemId(link);

        long nextLink = write(pageId, rmvRow, itemId, FAIL_L);

        assert nextLink != FAIL_L; // Can't fail here.

        // Follow the fragment chain of a large (multi-page) row, removing each fragment.
        while (nextLink != 0L) {
            memMetrics.decrementLargeEntriesPages();

            itemId = PageIdUtils.itemId(nextLink);
            pageId = PageIdUtils.pageId(nextLink);

            nextLink = write(pageId, rmvRow, itemId, FAIL_L);

            assert nextLink != FAIL_L; // Can't fail here.
        }
    }

    /** {@inheritDoc} */
    @Override protected Stripe[] getBucket(int bucket) {
        return buckets.get(bucket);
    }

    /** {@inheritDoc} */
    @Override protected boolean casBucket(int bucket, Stripe[] exp, Stripe[] upd) {
        return buckets.compareAndSet(bucket, exp, upd);
    }

    /** {@inheritDoc} */
    @Override protected boolean isReuseBucket(int bucket) {
        return bucket == REUSE_BUCKET;
    }

    /**
     * @return Number of empty data pages in free list.
     */
    public int emptyDataPages() {
        return bucketsSize[emptyDataPagesBucket].intValue();
    }

    /** {@inheritDoc} */
    @Override public void addForRecycle(ReuseBag bag) throws IgniteCheckedException {
        assert reuseList == this: "not allowed to be a reuse list";

        put(bag, 0, 0, 0L, REUSE_BUCKET);
    }

    /** {@inheritDoc} */
    @Override public long takeRecycledPage() throws IgniteCheckedException {
        assert reuseList == this: "not allowed to be a reuse list";

        return takeEmptyPage(REUSE_BUCKET, null);
    }

    /** {@inheritDoc} */
    @Override public long recycledPagesCount() throws IgniteCheckedException {
        assert reuseList == this: "not allowed to be a reuse list";

        return storedPagesCount(REUSE_BUCKET);
    }

    /**
     * @param row Row.
     * @return Entry size on page.
     * @throws IgniteCheckedException If failed.
     */
    private static int getRowSize(CacheDataRow row) throws IgniteCheckedException {
        int keyLen = row.key().valueBytesLength(null);
        int valLen = row.value().valueBytesLength(null);

        // Key + value + version + 8 bytes (presumably expiry time — TODO confirm against DataPageIO)
        // + 4 bytes for cache ID when it is stored with the entry.
        return keyLen + valLen + CacheVersionIO.size(row.version(), false) + 8 + (row.cacheId() == 0 ? 0 : 4);
    }

    /** {@inheritDoc} */
    @Override public String toString() {
        return "FreeList [name=" + name + ']';
    }
}