/* * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ package com.github.geophile.erdo.map.diskmap.tree; import com.github.geophile.erdo.AbstractKey; import com.github.geophile.erdo.UsageError; import com.github.geophile.erdo.config.ConfigurationKeys; import com.github.geophile.erdo.map.Factory; import com.github.geophile.erdo.map.MapCursor; import com.github.geophile.erdo.map.MissingKeyAction; import com.github.geophile.erdo.map.diskmap.DBStructure; import com.github.geophile.erdo.map.diskmap.DiskPage; import com.github.geophile.erdo.map.diskmap.IndexRecord; import com.github.geophile.erdo.map.diskmap.Manifest; import com.github.geophile.erdo.transaction.TransactionManager; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.logging.Logger; /* * A Tree is a multi-way tree, organized into pages, much like a * btree. However, due to the append-only nature of Erdo, trees are * static once created, so pages are packed as full as possible, and the * only update permitted is appending (implemented by the Writeable* * classes). * * A Tree is organized into one or more TreeLevels, with the leaf level * considered to be at level 0. * * A TreeLevel is organized into fixed-size TreeSegments. Each * TreeSegment is stored in its own file. Segment size is configured by * disk.segmentSizeBytes, and all segments, except possibly the rightmost * segment of a level, are of this size. * * Each TreeSegment has a TreeSegmentSummary, which records the last key * of the segment, and a bloom filter, used to check whether a key is * present in the segment. * * Each TreeLevel is divided into pages of size disk.pageSizeBytes. * * Erdo maintains in memory a complete representation of all Trees, * TreeLevels, TreeSegments and TreeSegmentSummaries. * * Leaf pages contain full records. * * Non-leaf pages contain IndexRecords, which comprise a key and a child * pointer. Records on the child pages have a key >= the index record key. * I.e., the key is the first key on the child page. (Corollary: The * smallest key in the tree is the first key of the root page.) */ public class Tree { // Object interface @Override public String toString() { return String.format("T%s", treeId); } // Tree interface public MapCursor cursor(AbstractKey startKey) throws IOException, InterruptedException { return level(0).leafLevelEmpty() ? MapCursor.EMPTY : TreeLevelCursor.newCursor(this, startKey); } public MapCursor consolidationScan() throws IOException, InterruptedException { // If there aren't two levels, do a normal, slow cursor. This consolidates small files. Also, fast merge logic // is dependent on the existence of level 1, to delimit level 0 files. return levels.size() <= 1 ? cursor(null) : new LevelOneCursorToFindLevelZeroSegments(this); } public long sizeBytes() { if (sizeBytes == -1L) { sizeBytes = 0; TreeLevel leafLevel = level(0); int segments = leafLevel.segments(); for (int s = 0; s < segments; s++) { sizeBytes += leafLevel.segment(s).pages() * pageSizeBytes; } } return sizeBytes; } public void destroy() { for (TreeLevel level : levels) { level.destroy(); } } public TransactionManager transactionManager() { return transactionManager; } public long treeId() { return treeId; } public int levels() { return levels.size(); } public TreeLevel level(int levelNumber) { return levels.get(levelNumber); } public static WriteableTree create(Factory factory, DBStructure dbStructure, long treeId) { WriteableTree tree = new WriteableTree(factory, dbStructure, treeId); TreeLevel rootLevel = WriteableTreeLevel.create(tree, 0); tree.levels.add(rootLevel); return tree; } public static Tree recover(Factory factory, DBStructure dbStructure, Manifest manifest) throws IOException, InterruptedException { Tree tree = new Tree(factory, dbStructure, manifest.treeId()); tree.recover(manifest); return tree; } // For use by this package Factory factory() { return factory; } public DBStructure dbStructure() { return dbStructure; } int pageSizeBytes() { return pageSizeBytes; } long maxFileSizeBytes() { return maxFileSizeBytes; } int segmentNumber(int pageAddress) { return pageAddress >>> pageNumberBits; } int pageNumber(int pageAddress) { return pageAddress & pageNumberMask; } int pageAddress(int segmentNumber, int pageNumber) { return (segmentNumber << pageNumberBits) | pageNumber; } TreePosition newPosition() { TreePositionPool treePositionPool = factory.threadTreePositionPool(); TreePosition treePosition = (TreePosition) treePositionPool.takeResource(); treePosition.initialize(this); return treePosition; } void descendToLeaf(TreePosition position, AbstractKey startKey, MissingKeyAction missingKeyAction) throws IOException, InterruptedException { int level = position.level().levelNumber(); int recordNumber = recordNumber(position.page(), startKey, missingKeyAction); if (recordNumber == -1) { assert level == 0 : startKey; position.goToEnd(); } else { position.recordNumber(recordNumber); } if (level > 0) { IndexRecord indexRecord = (IndexRecord) position.materializeRecord(); position.level(level - 1).pageAddress(indexRecord.childPageAddress()); descendToLeaf(position, startKey, missingKeyAction); } } // Return -1 to indicate that the resulting cursor will be immediately closed. I.e., missingKeyAction == FORWARD // and startKey > last on page; or missingKeyAction == BACKWARD and startKey < first on page. int recordNumber(DiskPage page, AbstractKey startKey, MissingKeyAction missingKeyAction) throws IOException, InterruptedException { int recordNumber = page.recordNumber(startKey); if (recordNumber >= 0) { // startKey found } else if (page.level() == 0) { // recordNumber is -p-1 where p is insertion point of key. recordNumber = -recordNumber - 1; if (recordNumber == page.nRecords() && missingKeyAction == MissingKeyAction.FORWARD || recordNumber == 0 && missingKeyAction == MissingKeyAction.BACKWARD) { recordNumber = -1; } else if (recordNumber > 0 && missingKeyAction == MissingKeyAction.BACKWARD) { recordNumber--; } } else { // recordNumber is -p-1 where p is insertion point of key. We are above the leaf level so // we want the preceding record. if p = 0, then either this is the left most node (page 0), // or we made a mistake getting here from the parent. assert page.level() > 0 : startKey; if (recordNumber == -1) { int pageNumber = pageNumber(page.pageAddress()); assert pageNumber == 0 : startKey; recordNumber = 0; } else { recordNumber = -recordNumber - 2; } } return recordNumber; } // For use by this class // For use by subclasses protected Tree(Factory factory, DBStructure dbStructure, long treeId) { this.treeId = treeId; this.factory = factory; this.transactionManager = factory.transactionManager(); this.dbStructure = dbStructure; this.pageSizeBytes = factory.configuration().diskPageSizeBytes(); this.maxFileSizeBytes = factory.configuration().diskSegmentSizeBytes(); this.pageNumberBits = com.github.geophile.erdo.util.Math.ceilLog2((int) (maxFileSizeBytes / pageSizeBytes)); this.pageNumberMask = (1 << pageNumberBits) - 1; if (maxFileSizeBytes % pageSizeBytes != 0) { throw new UsageError(String.format("%s (%s) is not divisible by %s (%s)", ConfigurationKeys.DISK_SEGMENT_SIZE_BYTES, maxFileSizeBytes, ConfigurationKeys.DISK_PAGE_SIZE_BYTES, pageSizeBytes)); } } // For use by this class private void recover(Manifest manifest) throws IOException, InterruptedException { for (int level = 0; level < manifest.levels(); level++) { levels.add(TreeLevel.recover(this, level, manifest)); } } // Class state protected static final Logger LOG = Logger.getLogger(Tree.class.getName()); // Object state protected final long treeId; protected final Factory factory; protected final TransactionManager transactionManager; protected final DBStructure dbStructure; protected final int pageSizeBytes; protected final long maxFileSizeBytes; protected final int pageNumberBits; protected final int pageNumberMask; protected final List<TreeLevel> levels = new ArrayList<>(); private long sizeBytes = -1L; }