package com.bigdata.resources;

import java.lang.ref.SoftReference;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.log4j.Logger;

import com.bigdata.btree.BTree;
import com.bigdata.btree.BTreeCounters;
import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexSegment;
import com.bigdata.journal.AbstractJournal;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.resources.StoreManager.ManagedJournal;

/**
 * Class encapsulates a bunch of metadata used to make decisions about how to
 * handle an index partition during asynchronous overflow.
 * <p>
 * Note: This class uses {@link SoftReference}s to hold onto the mutable
 * {@link BTree}. The {@link SoftReference} was chosen because it is important
 * to keep these {@link BTree}s open so that we do not lose their buffers
 * until we have finished asynchronous overflow for a given {@link BTree}. Once
 * asynchronous overflow processing is complete for the {@link BTree} you SHOULD
 * use {@link #clearRef()} to release your hold on those buffers.
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
class BTreeMetadata {

    static protected final Logger log = Logger.getLogger(BTreeMetadata.class);

    /**
     * The object which may be used to (re-)open the {@link BTree} and the index
     * partition view.
     */
    protected final ResourceManager resourceManager;

    /**
     * The commit time associated with the {@link BTree} and the index partition
     * view.
     */
    public final long commitTime;

    /**
     * Name of the local index (the index partition name).
     */
    public final String name;

    /**
     * A {@link SoftReference} is used to cache the {@link BTree} reference
     * since we really want to hold onto the reference until we get around to
     * finishing overflow processing for this index partition. However, you
     * SHOULD clear the reference using {@link #clearRef()} as soon as you
     * have handled asynchronous overflow for this view.
     */
    private volatile SoftReference<BTree> ref;

    /**
     * Open the mutable {@link BTree}. The {@link BTree} reference is cached by
     * a {@link SoftReference}. If the reference has been cleared then the
     * {@link BTree} is re-opened from the backing journal.
     *
     * @return The mutable {@link BTree} (never <code>null</code>).
     *
     * @throws IllegalArgumentException
     *             if the {@link ResourceManager} does not resolve an index for
     *             the {@link #name} and {@link #commitTime}.
     */
    final public BTree getBTree() {

        // double checked locking: first check without the monitor.
        SoftReference<BTree> tmp = ref;

        BTree btree = tmp == null ? null : tmp.get();

        if (btree == null) {

            synchronized (this) {

                // second check: another thread may have resolved it meanwhile.
                tmp = ref;

                btree = tmp == null ? null : tmp.get();

                if (btree == null) {

                    /*
                     * The mutable btree on the journal associated with the
                     * commitTime, not the full view of that index.
                     */
                    final AbstractJournal store = resourceManager
                            .getJournal(commitTime);

                    btree = (BTree) resourceManager.getIndexOnStore(name,
                            commitTime, store);

                    if (btree == null)
                        throw new IllegalArgumentException(
                                "Index not found: name=" + name
                                        + ", commitTime=" + commitTime);

                    ref = new SoftReference<BTree>(btree);

                }

            }

        }

        return btree;

    }

    /**
     * Release the {@link SoftReference} for the {@link BTree}. A subsequent
     * call to {@link #getBTree()} will re-open the {@link BTree}.
     */
    public void clearRef() {

        synchronized (this) {

            if (ref != null) {

                ref.clear();

            }

        }

    }

    /**
     * The index metadata for the {@link BTree} as read when this object was
     * constructed.
     */
    public final IndexMetadata indexMetadata;

    /**
     * The index partition metadata taken from {@link #indexMetadata} and
     * <code>null</code> if the index is not an index partition.
     */
    public final LocalPartitionMetadata pmd;

    /**
     * The #of journals and index segments in the view.
     */
    public final int sourceCount;

    /**
     * The #of journals in the view.
     */
    public final int sourceJournalCount;

    /**
     * The #of index segments in the view.
     */
    public final int sourceSegmentCount;

    /**
     * The sum of the size on disk across the index segments in the view.
     */
    public final long sumSegBytes;

    /**
     * These constants are used to compute the {@link #mergePriority}.
     *
     * See src/architecture/mergePriority.xls.
     */
    final private int A = 3, B = 1;//, C = 10;

    /**
     * This is the inverse of the {@link #mergePriority}. If the merge priority
     * is ZERO (0), then this is 1.0 (which is greater than the build priority
     * which is associated with any index partition with a non-zero merge
     * priority).
     * <p>
     * NOTE(review): a view with exactly one journal and one index segment has
     * a merge priority of 1 and therefore also a build priority of 1.0, so
     * "greater than" above is really "no less than".
     */
    public final double buildPriority;

    /**
     * The computed merge priority is based on the complexity of the view. This
     * is ZERO (0) if there is no reason to perform a merge.
     */
    public final double mergePriority;

//    /**
//     * The split priority is based solely on {@link #sumSegBytes} (it is the
//     * ratio of that value to the nominal shard size) and is ZERO (0) until
//     * {@link #sumSegBytes} is GTE the {@link OverflowManager#nominalShardSize}.
//     *
//     * @deprecated This did not consider the adjusted nominal shard size.
//     */
//    public final double splitPriority;

    /**
     * <code>true</code> iff this index partition meets the criteria for a
     * mandatory compacting merge (too many journals in the view, too many index
     * segments in the view, or too many sources in the view).
     *
     * @deprecated by {@link #mergePriority}.
     *             <p>
     *             Note: This field can be dropped once we are running split,
     *             tailSplit, and scatterSplit as merge after actions. At the
     *             same time, make sure that those tasks will not accept a view
     *             unless it is a {@link #compactView}.
     */
    public final boolean mandatoryMerge;

    /**
     * <code>true</code> iff there are two sources in the view and the second
     * source is an {@link IndexSegment} (the first will be the {@link BTree} on
     * some {@link ManagedJournal}).
     */
    public final boolean compactView;

    /**
     * The entry count for the {@link BTree} itself NOT the view.
     */
    public final long entryCount;

    /**
     * The counters for the index partition view.
     */
    public final BTreeCounters btreeCounters;

    /**
     * The percentage of leaf splits which occurred at or near the head of the
     * {@link BTree}.
     */
    public final double percentHeadSplits;

    /**
     * The percentage of leaf splits which occurred at or near the tail of the
     * {@link BTree}.
     */
    public final double percentTailSplits;

    /**
     * A package private lock used to ensure that decisions concerning which
     * index partition operation (build, merge, split, etc) to execute are
     * serialized.
     *
     * @see OverflowMetadata#setAction(String, OverflowActionEnum)
     */
    final ReentrantLock lock = new ReentrantLock();

    /**
     * The action taken and <code>null</code> if no action has been taken for
     * this local index.
     * <p>
     * Note: An {@link AtomicReference} is used to permit inspection of the
     * value without holding the {@link #lock}. If you want to make an atomic
     * decision based on this value, then make sure that you are holding the
     * {@link #lock} before you look at the value.
     */
    private final AtomicReference<OverflowActionEnum> actionRef = new AtomicReference<OverflowActionEnum>();

    /**
     * The action taken and <code>null</code> if no action has been taken for
     * this local index (non-blocking).
     * <p>
     * Note: An {@link AtomicReference} is used to permit inspection of the
     * value without holding the {@link #lock}. If you want to make an atomic
     * decision based on this value, then make sure that you are holding the
     * {@link #lock} before you look at the value.
     */
    public OverflowActionEnum getAction() {

        return actionRef.get();

    }

    /**
     * Set the action to be taken.
     *
     * @param action
     *            The action.
     * @throws IllegalArgumentException
     *             if the argument is <code>null</code>.
     * @throws IllegalMonitorStateException
     *             unless the {@link #lock} is held by the caller.
     * @throws IllegalStateException
     *             if the action has already been set.
     */
    public void setAction(final OverflowActionEnum action) {

        if (action == null)
            throw new IllegalArgumentException();

        if (!lock.isHeldByCurrentThread())
            throw new IllegalMonitorStateException();

        if (actionRef.get() != null) {

            throw new IllegalStateException("Already set: " + actionRef.get()
                    + ", given=" + action);

        }

        actionRef.set(action);

    }

    /**
     * Used to force clear a {@link OverflowActionEnum#Copy} action
     * when we will force a compacting merge. This allows us to do
     * compacting merges on shard views which would otherwise simply
     * be copied onto the new journal.
     * <p>
     * This is a NOP if no action has been set or if the action is anything
     * other than {@link OverflowActionEnum#Copy}.
     */
    void clearCopyAction() {

        lock.lock();

        try {

            // Null-safe comparison: the action is null until one is assigned.
            if (OverflowActionEnum.Copy.equals(actionRef.get())) {

                actionRef.set(null/* clear */);

            }

        } finally {

            lock.unlock();

        }

    }

    /**
     * Resolves the {@link BTree} and captures the metadata describing the
     * view (source counts, segment bytes, merge/build priority, split
     * percentages).
     *
     * @param resourceManager
     *            Used to (re-)open the {@link BTree} as necessary.
     * @param commitTime
     *            The commit time corresponding to the desired commit point.
     * @param name
     *            The name of the {@link BTree}.
     * @param btreeCounters
     *            The aggregated counters for the {@link BTree} or the
     *            {@link ILocalBTreeView index partition view} as reported by
     *            the {@link IndexManager}
     *
     * @throws IllegalArgumentException
     *             if <i>resourceManager</i>, <i>name</i> or
     *             <i>btreeCounters</i> is <code>null</code>, or if the index
     *             can not be resolved (see {@link #getBTree()}).
     */
    public BTreeMetadata(final ResourceManager resourceManager,
            final long commitTime, final String name,
            final BTreeCounters btreeCounters) {

        if (resourceManager == null)
            throw new IllegalArgumentException();

        if (name == null)
            throw new IllegalArgumentException();

        if (btreeCounters == null)
            throw new IllegalArgumentException();

        this.resourceManager = resourceManager;

        this.commitTime = commitTime;

        this.name = name;

        // eager resolution to put a SoftReference in place.
        final BTree btree = getBTree();

        // index metadata for that index partition.
        indexMetadata = btree.getIndexMetadata();

        // index partition metadata
        pmd = indexMetadata.getPartitionMetadata();

        if (pmd == null)
            log.warn("Not an index partition: " + name);

        // #of sources in the view (very fast).
        int sourceCount = 0, sourceJournalCount = 0, sourceSegmentCount = 0;
        long sumSegBytes = 0L;
        boolean secondSourceIsSeg = false;

        if (pmd != null) {

            for (IResourceMetadata x : pmd.getResources()) {

                sourceCount++;

                if (x.isJournal()) {

                    sourceJournalCount++;

                } else {

                    sourceSegmentCount++;

                    /*
                     * Note: This opens the backing segment store in order to
                     * determine its size on the disk. This is a fairly light
                     * weight operation (the nodes region is not read, just the
                     * checkpoint record and the IndexMetadata record).
                     *
                     * Note: This does not use IResourceMetadata#getFile() to
                     * determine the size of the file in the backing file system
                     * because it is not an absolute file path.
                     */
                    final IRawStore store = resourceManager.openStore(x.getUUID());

                    sumSegBytes += store.size();//new File(x.getFile()).length();

                    // sourceCount was already incremented for this source.
                    if (sourceCount == 2) {

                        secondSourceIsSeg = true;

                    }

                }

            }

        }

        this.sourceCount = sourceCount;

        this.sourceJournalCount = sourceJournalCount;

        this.sourceSegmentCount = sourceSegmentCount;

        this.sumSegBytes = sumSegBytes;

        this.compactView = sourceCount == 2 && secondSourceIsSeg;

        if (sourceJournalCount + sourceSegmentCount < 2) {

            /*
             * Nothing to merge. The build priority is 1.0 for this case. The
             * maximum build priority for an index partition with a non-zero
             * mergePriority will always be less than one.
             */

            this.mergePriority = 0d;

            this.buildPriority = 1d;

        } else {

            /*
             * Compute a score that will be used to prioritize compacting merges
             * vs builds for index partitions where either option is allowable.
             * The higher the score, the more we want to make sure that we do a
             * compacting merge for that index.
             *
             * Note: The main purpose of an index partition build is to convert
             * from a write-order to a read-order and permit the release of the
             * old journal. However, applications which require frequent access
             * to historical commit points on the old journals will continue to
             * rely on the write-order journals.
             *
             * Note: I have removed the sumSegBytes term from this formula since
             * that would tend to cause the priority of merge to increase for a
             * view until a split is performed, with the likelihood that
             * repeated merges would be performed for the same view just when it
             * is nearing its largest extent. Instead I have modified the
             * formula to consider only the view complexity for merge.
             *
             * NOTE(review): if a view ever had no journal among two or more
             * sources then this score could be zero or negative (and the
             * build priority infinite or negative). Presumably the first
             * source is always a journal - confirm.
             *
             * @todo if the application requires access to modest amounts of
             * history then consider a policy where the buffers are retained for
             * old journals up to the minReleaseAge. Of course, this can run
             * into memory constraints so that needs to be traded off against
             * IOWAIT.
             */

            this.mergePriority = (sourceJournalCount - 1) * A
                    + (sourceSegmentCount * B)
                    //+ ((sumSegBytes / resourceManager.nominalShardSize) * C)
                    ;

            this.buildPriority = 1. / mergePriority;

        }

//        /*
//         * The splitPriority considers only sumSegBytes.
//         *
//         * @todo This does not consider the adjustedNominalShardSize.
//         */
//        this.splitPriority = (sumSegBytes < resourceManager.nominalShardSize) ? 0
//                : (sumSegBytes / (double) resourceManager.nominalShardSize);

        this.mandatoryMerge //
            = sourceJournalCount >= resourceManager.maximumJournalsPerView //
            || sourceSegmentCount >= resourceManager.maximumSegmentsPerView //
        ;

        // BTree's directly maintained entry count (very fast).
        this.entryCount = btree.getEntryCount();

        this.btreeCounters = btreeCounters;

        // Note: +1 in the denominator to avoid divide by zero.
        this.percentHeadSplits = btreeCounters.headSplit
                / (btreeCounters.leavesSplit + 1d);

        // Note: +1 in the denominator to avoid divide by zero.
        this.percentTailSplits = btreeCounters.tailSplit
                / (btreeCounters.leavesSplit + 1d);

    }

    /**
     * Human readable summary of the metadata captured for the index
     * partition. Subclasses may append to it via
     * {@link #toString(StringBuilder)}.
     */
    @Override
    public String toString() {

        final StringBuilder sb = new StringBuilder();

        sb.append("name=" + name);

        sb.append(", action=" + actionRef.get());

        sb.append(", entryCount=" + entryCount);

        sb.append(", sumSegBytes=" + sumSegBytes);

        sb.append(", mergePriority=" + mergePriority);

//        sb.append(", splitPriority=" + splitPriority);

        sb.append(", manditoryMerge=" + mandatoryMerge);

        sb.append(", sourceCounts=" + "{all=" + sourceCount + ",journals="
                + sourceJournalCount + ",segments=" + sourceSegmentCount + "}");

        sb.append(", #leafSplit=" + btreeCounters.leavesSplit);

        sb.append(", #headSplit=" + btreeCounters.headSplit);

        sb.append(", #tailSplit=" + btreeCounters.tailSplit);

        sb.append(", percentHeadSplits=" + percentHeadSplits);

        sb.append(", percentTailSplits=" + percentTailSplits);

        toString(sb);

        return sb.toString();

    }

    /**
     * Permits extension of {@link #toString()} in subclass.
     *
     * @param sb
     *            The buffer onto which the subclass may append its own state.
     */
    protected void toString(final StringBuilder sb) {

        // NOP

    }

}