/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Mar 24, 2008
*/
package com.bigdata.resources;
import java.io.File;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Lock;
import org.apache.log4j.Logger;
import com.bigdata.btree.AbstractBTree;
import com.bigdata.btree.BTree;
import com.bigdata.btree.BTreeCounters;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexSegment;
import com.bigdata.btree.IndexSegmentBuilder;
import com.bigdata.btree.IndexSegmentCheckpoint;
import com.bigdata.btree.IndexSegmentStore;
import com.bigdata.btree.ReadCommittedView;
import com.bigdata.btree.view.FusedView;
import com.bigdata.cache.ConcurrentWeakValueCacheWithTimeout;
import com.bigdata.cache.HardReferenceQueue;
import com.bigdata.cache.LRUCache;
import com.bigdata.concurrent.NamedLock;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.ICounterSet;
import com.bigdata.journal.AbstractJournal;
import com.bigdata.journal.AbstractTask;
import com.bigdata.journal.ConcurrencyManager;
import com.bigdata.journal.ICommitRecord;
import com.bigdata.journal.IJournal;
import com.bigdata.journal.ITx;
import com.bigdata.journal.Journal;
import com.bigdata.journal.Name2Addr;
import com.bigdata.journal.NoSuchIndexException;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.journal.Tx;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.mdi.SegmentMetadata;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.service.Event;
import com.bigdata.service.EventType;
import com.bigdata.service.IBigdataClient;
import com.bigdata.service.IDataService;
import com.bigdata.service.ndx.IClientIndex;
import com.bigdata.util.Bytes;
import com.bigdata.util.NT;
/**
* Class encapsulates logic and handshaking for tracking which indices (and
* their backing stores) are recently and currently referenced. This information
* is used to coordinate the close out of index resources (and their backing
* stores) on an LRU basis by the {@link ResourceManager}.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
*/
abstract public class IndexManager extends StoreManager {
/**
* Logger.
*/
private static final Logger log = Logger.getLogger(IndexManager.class);
/**
 * Options understood by the {@link IndexManager}.
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 */
public static interface Options extends StoreManager.Options {

    /**
     * The capacity of the LRU cache of open {@link IIndex}s. The capacity
     * of this cache indirectly controls how many {@link IIndex}s will be
     * held open. The main reason for keeping an {@link IIndex} open is to
     * reuse its buffers, including its node and leaf cache, if another
     * request arrives "soon" which would read on that {@link IIndex}.
     * <p>
     * The effect of this parameter is indirect owing to the semantics of
     * weak references and the control of the JVM over when they are
     * cleared. Once an index becomes weakly reachable, the JVM will
     * eventually GC the index object, thereby effectively closing it (or at
     * least releasing all resources associated with that index). Since
     * indices which are strongly reachable are never "closed" this provides
     * our guarantee that indices are never closed if they are in use.
     * <p>
     * Note: The {@link IIndex}s managed by this class are a
     * {@link FusedView} of {@link AbstractBTree}s. Each
     * {@link AbstractBTree} has a hard reference to the backing
     * {@link IRawStore} and will keep the {@link IRawStore} from being
     * finalized as long as a hard reference exists to the
     * {@link AbstractBTree} (the reverse is not true - an {@link IRawStore}
     * reference does NOT hold a hard reference to {@link AbstractBTree}s
     * on that {@link IRawStore}).
     * <p>
     * Note: The retention of the {@link BTree}s on the live
     * {@link ManagedJournal}s is governed by
     * {@link com.bigdata.journal.Options#LIVE_INDEX_CACHE_CAPACITY}.
     * <p>
     * Note: The retention of the {@link BTree}s on the open historical
     * {@link ManagedJournal}s is governed by
     * {@link com.bigdata.journal.Options#HISTORICAL_INDEX_CACHE_CAPACITY}.
     *
     * @see #DEFAULT_INDEX_CACHE_CAPACITY
     */
    String INDEX_CACHE_CAPACITY = IndexManager.class.getName()
            + ".indexCacheCapacity";

    /**
     * The default for the {@link #INDEX_CACHE_CAPACITY} option.
     */
    String DEFAULT_INDEX_CACHE_CAPACITY = "20";

    /**
     * The time in milliseconds before an entry in the index cache will be
     * cleared from the backing {@link HardReferenceQueue} (default
     * {@value #DEFAULT_INDEX_CACHE_TIMEOUT}). This property controls how
     * long the index cache will retain an {@link IIndex} which has not been
     * recently used. This is in contrast to the cache capacity.
     */
    String INDEX_CACHE_TIMEOUT = IndexManager.class.getName()
            + ".indexCacheTimeout";

    /**
     * The default for the {@link #INDEX_CACHE_TIMEOUT} option (one minute).
     */
    String DEFAULT_INDEX_CACHE_TIMEOUT = "" + (60 * 1000);

    /**
     * The capacity of the LRU cache of open {@link IndexSegment}s. The
     * capacity of this cache indirectly controls how many
     * {@link IndexSegment}s will be held open. The main reason for keeping
     * an {@link IndexSegment} open is to reuse its buffers, including its
     * node and leaf cache, if another request arrives "soon" which would
     * read on that {@link IndexSegment}.
     * <p>
     * The effect of this parameter is indirect owing to the semantics of
     * weak references and the control of the JVM over when they are
     * cleared. Once an index becomes weakly reachable, the JVM will
     * eventually GC the index object, thereby effectively closing it (or at
     * least releasing all resources associated with that index). Since
     * indices which are strongly reachable are never "closed" this provides
     * our guarantee that indices are never closed if they are in use.
     * <p>
     * Note: {@link IndexSegment}s have a hard reference to the backing
     * {@link IndexSegmentStore} and will keep the {@link IndexSegmentStore}
     * from being finalized as long as a hard reference exists to the
     * {@link IndexSegment} (the reverse is not true - the
     * {@link IndexSegmentStore} does NOT hold a hard reference to the
     * {@link IndexSegment}).
     *
     * @see #DEFAULT_INDEX_SEGMENT_CACHE_CAPACITY
     */
    String INDEX_SEGMENT_CACHE_CAPACITY = IndexManager.class.getName()
            + ".indexSegmentCacheCapacity";

    /**
     * The default for the {@link #INDEX_SEGMENT_CACHE_CAPACITY} option.
     */
    String DEFAULT_INDEX_SEGMENT_CACHE_CAPACITY = "60";

    /**
     * The time in milliseconds before an entry in the index segment cache
     * will be cleared from the backing {@link HardReferenceQueue} (default
     * {@value #DEFAULT_INDEX_SEGMENT_CACHE_TIMEOUT}). This property
     * controls how long the index segment cache will retain an
     * {@link IndexSegment} which has not been recently used. This is in
     * contrast to the cache capacity.
     * <p>
     * Note: This option was historically bound to the same property name
     * as {@link #INDEX_CACHE_TIMEOUT} ("indexCacheTimeout") due to a
     * copy-paste error, which made it impossible to configure the two
     * timeouts independently. It now has its own distinct property name.
     */
    String INDEX_SEGMENT_CACHE_TIMEOUT = IndexManager.class.getName()
            + ".indexSegmentCacheTimeout";

    /**
     * The default for the {@link #INDEX_SEGMENT_CACHE_TIMEOUT} option (one
     * minute).
     */
    String DEFAULT_INDEX_SEGMENT_CACHE_TIMEOUT = "" + (60 * 1000);

}
/**
 * Performance counters for the {@link IndexManager}.
 * <p>
 * These are the counter *names* under which values are reported; the
 * values themselves are assembled elsewhere (e.g., in a counter-set
 * hierarchy — see {@link CounterSet}).
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 */
public static interface IIndexManagerCounters {
/**
 * The parent under which the per-index partition performance counters
 * are listed.
 */
String Indices = "indices";
/**
 * The capacity of the cache of stale locators.
 *
 * @see StaleLocatorException
 */
String StaleLocatorCacheCapacity = "Stale Locator Cache Capacity";
/**
 * The #of stale locators in the cache.
 *
 * @see StaleLocatorException
 */
String StaleLocatorCacheSize = "Stale Locator Cache Size";
/**
 * The stale locators, including the {@link StaleLocatorReason} for each
 * one.
 */
String StaleLocators = "Stale Locators";
/**
 * The #of named indices on the live journal. Each index partition is
 * registered as a named index on the live journal, so this may also be
 * interpreted as the #of index partitions on the data service.
 */
String IndexCount = "Index Count";
/**
 * The capacity of the index cache.
 */
String IndexCacheCapacity = "Index Cache Capacity";
/**
 * The approximate #of open indices.
 */
String IndexCacheSize = "Index Cache Size";
/**
 * The capacity of the {@link IndexSegment} cache.
 */
String IndexSegmentCacheCapacity = "Index Segment Cache Capacity";
/**
 * The approximate #of open {@link IndexSegment}s.
 */
String IndexSegmentCacheSize = "Index Segment Cache Size";
/**
 * The approximate #of {@link IndexSegment} leaves that are buffered in
 * memory.
 */
String IndexSegmentOpenLeafCount = "Index Segment Open Leaf Count";
/**
 * The #of bytes on disk occupied by the {@link IndexSegment} leaves
 * which are currently loaded into memory (their in-memory profile can
 * not be directly captured by the java runtime, but you can get it from
 * a heap dump). Likewise, you can directly obtain the #of bytes on disk
 * per leaf from the {@link IndexSegmentCheckpoint} or from
 * {@link DumpFederation}.
 */
String IndexSegmentOpenLeafByteCount = "Index Segment Open Leaf Byte Count";
}
/**
 * This map is used to note index partitions which could not be split and
 * have become overextended as a result (they are at least 2x the nominal
 * size of a shard and are refusing to split). These indices are registered
 * in this map in order to disallow additional writes onto the index, which
 * pushes the problem back onto the application.
 * <p>
 * Note: The map is used as a set; the value is always {@link Boolean#TRUE}
 * since {@link ConcurrentHashMap} does not permit <code>null</code> values
 * (the previous declaration with a {@link Void} value type made it
 * impossible to ever insert an entry — <code>putIfAbsent(name, null)</code>
 * throws a {@link NullPointerException}).
 */
private final ConcurrentHashMap<String, Boolean> disabledShards = new ConcurrentHashMap<String, Boolean>();

/**
 * Declare that the named index will no longer accept writes (transient
 * effect only).
 *
 * @param name
 *            The index name.
 */
public void disableWrites(final String name) {

    // Non-null marker value (ConcurrentHashMap rejects null values).
    disabledShards.putIfAbsent(name, Boolean.TRUE);

}

/**
 * Declare that the named index will accept writes (default).
 *
 * @param name
 *            The index name.
 */
public void enableWrites(final String name) {

    disabledShards.remove(name);

}

/**
 * Return <code>true</code> if writes have been disabled for the named
 * index.
 *
 * @param name
 *            The index name.
 *
 * @return <code>true</code> if writes are disabled for that index.
 */
public boolean isDisabledWrites(final String name) {

    /*
     * Note: containsKey(), not contains() — ConcurrentHashMap#contains is
     * a legacy method which scans the *values*, so the original code could
     * never report a disabled shard.
     */
    return disabledShards.containsKey(name);

}
/**
 * Cache of added/retrieved {@link IIndex}s by name and timestamp.
 * <p>
 * Map from the name and timestamp of an index to a weak reference for the
 * corresponding {@link IIndex}. Entries will be cleared from this map
 * after they have become only weakly reachable. Entries are associated with
 * a timestamp based on their last use and entries whose timestamp exceeds
 * the {@link Options#INDEX_CACHE_TIMEOUT} will be cleared from the backing
 * {@link HardReferenceQueue}. If they become weakly reachable they will
 * then be cleared from the cache as well.
 * <p>
 * Note: The capacity of the backing {@link HardReferenceQueue} affects how
 * many _clean_ indices can be held in the cache. Dirty indices remain
 * strongly reachable owing to their existence in the
 * {@link Name2Addr#commitList}.
 * <p>
 * Note: Read-historical and read-committed tasks need to hold a read lock
 * on the local resources in order to prevent their being released if there
 * is a concurrent commit followed by a request to the StoreManager to
 * purgeResources. This problem is very similar to the problem of the
 * transaction manager which needs to manage the global release time.
 * <p>
 * Note: {@link ITx#READ_COMMITTED} indices MUST NOT be allowed into this
 * cache. Each time there is a commit for a given {@link BTree}, the
 * {@link ITx#READ_COMMITTED} view of that {@link BTree} needs to be
 * replaced by the most recently committed view, which is a different
 * {@link BTree} object and is loaded from a different checkpoint record.
 * <p>
 * Note: {@link ITx#UNISOLATED} indices have a related problem. Those views
 * are no longer valid after synchronous overflow since a new view is
 * defined by that process. Likewise, the various atomic update tasks during
 * asynchronous overflow also change the definition of the view. Therefore I
 * have modified the IndexManager to NOT permit UNISOLATED views into the
 * index cache. Note however that the Journal still retains a live index
 * cache and that we still have a separate cache for index segment stores.
 *
 * @see Options#INDEX_CACHE_CAPACITY
 * @see Options#INDEX_CACHE_TIMEOUT
 *
 * @todo alternatively, if such views are allowed in then this cache must be
 * encapsulated by logic that examines the view when the timestamp is
 * {@link ITx#READ_COMMITTED} to make sure that the BTree associated
 * with that view is current (as of the last commit point). If not,
 * then the entire view needs to be regenerated since the index view
 * definition (index segments in use) might have changed as well.
 */
// final private WeakValueCache<NT, IIndex> indexCache;
final private IndexCache<ILocalBTreeView> indexCache;
/**
 * The earliest timestamp that MUST be retained for the read-historical
 * indices in the cache and {@link Long#MAX_VALUE} if there are NO
 * read-historical indices in the cache.
 *
 * @see StoreManager#indexCacheLock
 */
@Override
protected long getIndexRetentionTime() {

    final long retentionTime = indexCache.getRetentionTime();

    // Sanity check: a valid retention time is always positive.
    assert retentionTime > 0 : "t=" + retentionTime;

    return retentionTime;

}
/**
 * A canonicalizing cache for {@link IndexSegment}s, keyed by the UUID of
 * the backing {@link IndexSegmentStore}.
 * <p>
 * Note: {@link IndexSegmentStore} already makes the {@link IndexSegment}s
 * canonical and the {@link StoreManager#storeCache} makes the
 * {@link IndexSegmentStore}s canonical so what this really does is give
 * you a cache which lets you exert some more control over the #of
 * {@link IndexSegment}s that are open.
 *
 * FIXME It might be better to break this down as a journalCache and a
 * segmentCache on the {@link StoreManager}. That is more explicit and
 * there is less interaction between the configuration choices with that
 * breakdown.
 *
 * @see Options#INDEX_SEGMENT_CACHE_CAPACITY
 * @see Options#INDEX_SEGMENT_CACHE_TIMEOUT
 */
final private ConcurrentWeakValueCacheWithTimeout<UUID, IndexSegment> indexSegmentCache;
/**
 * Provides locks on a per-{name+timestamp} basis for higher concurrency.
 * Used to serialize cache probe/populate for a given index view without
 * blocking requests for other views.
 */
private final transient NamedLock<NT> namedLock = new NamedLock<NT>();
/**
 * Provides locks on a per-{@link IndexSegment} UUID basis for higher
 * concurrency.
 * <p>
 * Note: The UUID is the unique key for the {@link #indexSegmentCache}.
 * <p>
 * Note: The index name + timestamp is NOT a good basis for locking for the
 * {@link #indexSegmentCache} because many different timestamps will be
 * mapped onto the same {@link IndexSegment}.
 */
private final transient NamedLock<UUID> segmentLock = new NamedLock<UUID>();
/**
 * The #of entries in the hard reference cache for {@link IIndex}s. There
 * MAY be more {@link IIndex}s open than are reported by this method if
 * there are hard references held by the application to those {@link IIndex}s.
 * {@link IIndex}s that are not fixed by a hard reference will be quickly
 * finalized by the JVM.
 *
 * @return The approximate #of open indices.
 */
public int getIndexCacheSize() {
return indexCache.size();
}
/**
 * The configured capacity of the index cache.
 *
 * @return The capacity of the index cache.
 *
 * @see Options#INDEX_CACHE_CAPACITY
 */
public int getIndexCacheCapacity() {
return indexCache.capacity();
}
/**
 * The #of entries in the hard reference cache for {@link IndexSegment}s.
 * There MAY be more {@link IndexSegment}s open than are reported by this
 * method if there are hard references held by the application to those
 * {@link IndexSegment}s. {@link IndexSegment}s that are not fixed by a
 * hard reference will be quickly finalized by the JVM.
 *
 * @return The approximate #of open {@link IndexSegment}s.
 */
public int getIndexSegmentCacheSize() {
return indexSegmentCache.size();
}
/**
 * The configured capacity of the index segment cache.
 *
 * @return The capacity of the index segment cache.
 *
 * @see Options#INDEX_SEGMENT_CACHE_CAPACITY
 */
public int getIndexSegmentCacheCapacity() {
return indexSegmentCache.capacity();
}
/**
 * Statistics about the {@link IndexSegment}s open in the cache.
 * <p>
 * Note: This is a simple mutable struct; instances are presumably filled
 * in by whoever iterates the open segments (see the commented-out
 * getIndexSegmentOpenLeafCount()/getIndexSegmentOpenLeafByteCount()
 * below for the intended aggregation).
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 */
public static class IndexSegmentStats {
// #of index segment leaves currently buffered in memory.
public long leafCount;
// #of bytes on disk for those buffered leaves.
public long leafByteCount;
}
// /**
// * The approximate #of {@link IndexSegment} leaves in memory.
// */
// public int getIndexSegmentOpenLeafCount() {
//
// final Iterator<WeakReference<IndexSegment>> itr = indexSegmentCache
// .iterator();
//
// int leafCount = 0;
//
// while (itr.hasNext()) {
//
// final IndexSegment seg = itr.next().get();
//
// if (seg != null) {
//
// leafCount += seg.getOpenLeafCount();
//
// }
//
// }
//
// return leafCount;
//
// }
//
// /**
// * The #of bytes on disk occupied by the {@link IndexSegment} leaves which
// * are currently loaded into memory (their in-memory profile can not be
// * directly captured by the java runtime, but you can get it from a heap
// * dump). Likewise, you can directly obtain the #of bytes on disk per leaf
// * from the {@link IndexSegmentCheckpoint} or from {@link DumpFederation}.
// */
// public long getIndexSegmentOpenLeafByteCount() {
//
// final Iterator<WeakReference<IndexSegment>> itr = indexSegmentCache
// .iterator();
//
// long leafByteCount = 0;
//
// while (itr.hasNext()) {
//
// final IndexSegment seg = itr.next().get();
//
// if (seg != null) {
//
// leafByteCount += seg.getOpenLeafByteCount();
//
// }
//
// }
//
// return leafByteCount;
//
// }
/**
 * This cache is used to provide remote clients with an unambiguous
 * indication that an index partition has been split, joined, or moved
 * rather than simply not existing or having been dropped.
 *<p>
 * The keys are the name of an index partitions that has been split, joined,
 * or moved. Such index partitions are no longer available and have been
 * replaced by one or more new index partitions (having a distinct partition
 * identifier) either on the same or on another data service. The value is a
 * reason, e.g., "split", "join", or "move".
 */
// private // @todo exposed for counters - should be private.
protected final LRUCache<String/* name */, StaleLocatorReason/* reason */> staleLocatorCache = new LRUCache<String, StaleLocatorReason>(
1000);
/**
 * Note: this information is based on an LRU cache with a large fixed
 * capacity. It is expected that the cache size is sufficient to provide
 * good information to clients having queued write tasks. If the index
 * partition split/move/join changes somehow outpace the cache size then
 * the client would see a {@link NoSuchIndexException} instead.
 *
 * @param name
 *            The name of the index partition.
 *
 * @return The reason the partition is gone, or <code>null</code> if the
 *         partition is not known to be gone (or the entry has been
 *         evicted from the LRU).
 */
@Override
public StaleLocatorReason getIndexPartitionGone(final String name) {
return staleLocatorCache.get(name);
}
/**
 * Notify the {@link ResourceManager} that the named index partition was
 * split, joined or moved. This effects only the unisolated view of that
 * index partition. Historical views will continue to exist and reside as
 * before.
 *
 * @param name
 *            The name of the index partition.
 * @param reason
 *            The reason (split, join, or move).
 *
 * FIXME Should also include "deleted" and handle case where a scale-out
 * index is deleted and then re-created so that we don't get the
 * {@link StaleLocatorException} after the recreate.
 */
protected void setIndexPartitionGone(final String name,
        final StaleLocatorReason reason) {

    if (name == null || reason == null)
        throw new IllegalArgumentException();

    if (log.isInfoEnabled())
        log.info("name=" + name + ", reason=" + reason);

    // Record the reason the partition is gone.
    staleLocatorCache.put(name, reason, true);

    // Drop the per-index performance counters for that partition.
    indexCounters.remove(name);

}
/**
 * The #of entries in the stale locator LRU.
 *
 * @return The #of stale locators currently in the cache.
 */
protected int getStaleLocatorCount() {
return staleLocatorCache.size();
}
/**
 * Constructor: reads the {@link Options} from the supplied properties and
 * initializes the index cache and the index segment cache.
 * <p>
 * Note: the parse/log/validate pattern was repeated four times in the
 * original; it is factored into {@link #parsePositiveInt} and
 * {@link #parseNonNegativeLong}, with byte-identical log and exception
 * messages.
 *
 * @param properties
 *            The configuration properties (see {@link Options}).
 */
protected IndexManager(final Properties properties) {

    super(properties);

    /*
     * indexCache
     */
    {
        final int indexCacheCapacity = parsePositiveInt(properties,
                Options.INDEX_CACHE_CAPACITY,
                Options.DEFAULT_INDEX_CACHE_CAPACITY);

        final long indexCacheTimeout = parseNonNegativeLong(properties,
                Options.INDEX_CACHE_TIMEOUT,
                Options.DEFAULT_INDEX_CACHE_TIMEOUT);

        indexCache = new IndexCache(indexCacheCapacity, indexCacheTimeout);
    }

    /*
     * indexSegmentCache
     */
    {
        final int indexSegmentCacheCapacity = parsePositiveInt(properties,
                Options.INDEX_SEGMENT_CACHE_CAPACITY,
                Options.DEFAULT_INDEX_SEGMENT_CACHE_CAPACITY);

        final long indexSegmentCacheTimeout = parseNonNegativeLong(
                properties, Options.INDEX_SEGMENT_CACHE_TIMEOUT,
                Options.DEFAULT_INDEX_SEGMENT_CACHE_TIMEOUT);

        indexSegmentCache = new ConcurrentWeakValueCacheWithTimeout<UUID, IndexSegment>(
                indexSegmentCacheCapacity, TimeUnit.MILLISECONDS
                        .toNanos(indexSegmentCacheTimeout));
    }

}

/**
 * Parse, log, and validate a strictly-positive <code>int</code> option.
 *
 * @throws RuntimeException
 *             if the configured value is not positive.
 */
private static int parsePositiveInt(final Properties properties,
        final String key, final String def) {

    final int v = Integer.parseInt(properties.getProperty(key, def));

    if (log.isInfoEnabled())
        log.info(key + "=" + v);

    if (v <= 0)
        throw new RuntimeException(key + " must be positive");

    return v;

}

/**
 * Parse, log, and validate a non-negative <code>long</code> option.
 *
 * @throws RuntimeException
 *             if the configured value is negative.
 */
private static long parseNonNegativeLong(final Properties properties,
        final String key, final String def) {

    final long v = Long.parseLong(properties.getProperty(key, def));

    if (log.isInfoEnabled())
        log.info(key + "=" + v);

    if (v < 0)
        throw new RuntimeException(key + " must be non-negative");

    return v;

}
/**
 * Return a reference to the named index as of the specified timestamp on
 * the identified resource.
 * <p>
 * Note: {@link AbstractTask} handles the load of the {@link ITx#UNISOLATED}
 * index from the live journal in such a manner as to provide ACID semantics
 * for add/drop of indices.
 * <p>
 * Note: The returned index is NOT isolated. Isolation is handled by the
 * {@link Tx}.
 *
 * @param name
 *            The index name.
 * @param timestamp
 *            A transaction identifier, {@link ITx#UNISOLATED} for the
 *            unisolated index view, {@link ITx#READ_COMMITTED}, or
 *            <code>timestamp</code> for a historical view no later than
 *            the specified timestamp.
 * @param store
 *            The store from which the index will be loaded.
 *
 * @return A reference to the index -or- <code>null</code> if the index
 *         was not registered on the resource as of the timestamp or if the
 *         store has no data for that timestamp.
 *
 * @todo this might have to be private since we assume that the store is in
 *       {@link StoreManager#openStores}.
 */
public AbstractBTree getIndexOnStore(final String name,
        final long timestamp, final IRawStore store) {

    if (name == null)
        throw new IllegalArgumentException();

    if (store == null)
        throw new IllegalArgumentException();

    // Dispatch on the store type: a BTree on a journal -vs- an
    // IndexSegmentStore containing a single IndexSegment.
    final AbstractBTree btree = (store instanceof IJournal) //
            ? getIndexOnJournal(name, timestamp, (AbstractJournal) store) //
            : getIndexOnSegment(name, timestamp, (IndexSegmentStore) store);

    if (btree != null) {

        /*
         * Make sure that it is using the canonical counters for that index.
         *
         * Note: AbstractTask also does this for UNISOLATED indices which it
         * loads by itself as part of providing ACID semantics for add/drop
         * of indices.
         */
        btree.setBTreeCounters(getIndexCounters(name));

    }

    if (log.isInfoEnabled())
        log.info("name=" + name + ", timestamp=" + timestamp + ", found="
                + (btree != null) + ", store=" + store + " : " + btree);

    return btree;

}
/**
 * Resolve the named index on a journal for the given timestamp.
 *
 * @param name
 *            The index name.
 * @param timestamp
 *            {@link ITx#UNISOLATED}, {@link ITx#READ_COMMITTED}, or a
 *            historical commit time.
 * @param journal
 *            The journal on which the index will be resolved.
 *
 * @return The index view -or- <code>null</code> if it is not found.
 */
final private AbstractBTree getIndexOnJournal(final String name,
        final long timestamp, final AbstractJournal journal) {

    final BTree btree;

    if (timestamp == ITx.UNISOLATED) {

        /*
         * Unisolated index (MAY be null).
         */
        btree = (BTree) journal.getIndex(name);

    } else if (timestamp == ITx.READ_COMMITTED) {

        /*
         * Read committed operation against the most recent commit point.
         *
         * Note: This commit record is always defined, but that does not
         * mean that any indices have been registered.
         */
        final ICommitRecord commitRecord = journal.getCommitRecord();

        if (commitRecord.getTimestamp() == 0L) {

            log.warn("Nothing committed: read-committed operation.");

            return null;

        }

        // MAY be null.
        btree = (BTree) journal.getIndexWithCommitRecord(name, commitRecord);

        if (btree != null)
            assert btree.getLastCommitTime() != 0;

    } else {

        /*
         * A specified historical index commit point.
         */

        // use absolute value in case timestamp is negative.
        final long ts = Math.abs(timestamp);

        // the corresponding commit record on the journal.
        final ICommitRecord commitRecord = journal.getCommitRecord(ts);

        if (commitRecord == null) {

            log.warn("Resource has no data for timestamp: name=" + name
                    + ", timestamp=" + timestamp + ", resource="
                    + journal.getResourceMetadata());

            return null;

        }

        // open index on that journal (MAY be null).
        btree = (BTree) journal.getIndexWithCommitRecord(name, commitRecord);

        if (btree == null)
            log.warn("Index not found: name=" + name + ", timestamp="
                    + TimestampUtility.toString(timestamp) + ", ts=" + ts
                    + ", commitRecord=" + commitRecord + ", ds="
                    + getDataServiceUUID());

        if (btree != null)
            assert btree.getLastCommitTime() != 0;

    }

    // MAY be null.
    return btree;

}
/**
 * Resolve the (single) {@link IndexSegment} on an {@link IndexSegmentStore},
 * using {@link #indexSegmentCache} to make the open segments canonical.
 *
 * @param name
 *            The index name (used for logging only).
 * @param timestamp
 *            The timestamp of the requested view.
 * @param segStore
 *            The backing index segment store.
 *
 * @return The index segment -or- <code>null</code> if the store has no
 *         data for that timestamp.
 */
final private IndexSegment getIndexOnSegment(final String name,
        final long timestamp, IndexSegmentStore segStore) {

    if (timestamp != ITx.READ_COMMITTED && timestamp != ITx.UNISOLATED) {

        // use absolute value in case timestamp is negative.
        final long ts = Math.abs(timestamp);

        if (segStore.getCheckpoint().commitTime > ts) {

            log.warn("Resource has no data for timestamp: name=" + name
                    + ", timestamp=" + timestamp + ", store=" + segStore);

            return null;

        }

    }

    final IResourceMetadata resourceMetadata = segStore
            .getResourceMetadata();

    final UUID storeUUID = resourceMetadata.getUUID();

    /*
     * Note: synchronization is required to have the semantics of an
     * atomic get/put against the WeakValueCache.
     *
     * Note: The load of the index segment from the store can have
     * significant latency. The use of a per-UUID lock allows us to load
     * index segments for different index views concurrently.
     *
     * Note: We DO NOT use a name+timestamp lock here because many
     * different timestamp values will be served by the same
     * IndexSegment.
     */
    final Lock lock = segmentLock.acquireLock(storeUUID);

    try {

        // check the cache first.
        IndexSegment seg = indexSegmentCache.get(storeUUID);

        if (seg == null) {

            if (log.isInfoEnabled())
                log.info("Loading index segment from store: name=" + name
                        + ", file=" + resourceMetadata.getFile());

            // Open an index segment and enter it into the cache.
            seg = segStore.loadIndexSegment();

            indexSegmentCache.put(storeUUID, seg);

        }

        // MAY be null.
        return seg;

    } finally {

        lock.unlock();

    }

}
/**
 * Resolve the ordered sources for the view of the named index as of the
 * specified timestamp, starting from the journal covering that timestamp.
 */
@Override
public AbstractBTree[] getIndexSources(final String name,
        final long timestamp) {

    if (log.isInfoEnabled())
        log.info("name=" + name + ", timestamp=" + timestamp);

    /*
     * Open the index on the journal for that timestamp.
     */

    // the corresponding journal (can be the live journal).
    final AbstractJournal journal = getJournal(timestamp);

    if (journal == null) {

        log.warn("No journal with data for timestamp: name=" + name
                + ", timestamp=" + timestamp);

        return null;

    }

    final BTree btree = (BTree) getIndexOnStore(name, timestamp, journal);

    if (btree == null) {

        log.warn("No such index: name=" + name + ", timestamp="
                + TimestampUtility.toString(timestamp));

        return null;

    }

    if (log.isInfoEnabled())
        log.info("name=" + name + ", timestamp=" + timestamp
                + ", counter=" + btree.getCounter().get()
                + ", journal=" + journal.getResourceMetadata());

    // Expand the mutable btree into the full view.
    return getIndexSources(name, timestamp, btree);

}
/**
 * Assemble the ordered array of {@link AbstractBTree} sources for the view
 * of the named index, given the mutable {@link BTree} at the head of the
 * view. For an unpartitioned index this is just the given btree; for an
 * index partition the remaining sources are resolved from the resources
 * declared by the {@link LocalPartitionMetadata}.
 */
@Override
public AbstractBTree[] getIndexSources(final String name,
final long timestamp, final BTree btree) {
/*
 * Get the index partition metadata (if any). If defined, then we know
 * that this is an index partition and that the view is defined by the
 * resources named in that index partition. Otherwise the index is
 * unpartitioned.
 */
final LocalPartitionMetadata pmd = btree.getIndexMetadata()
.getPartitionMetadata();
if (pmd == null) {
// An unpartitioned index (one source).
if (log.isInfoEnabled())
log.info("Unpartitioned index: name=" + name + ", ts="
+ timestamp);
return new AbstractBTree[] { btree };
}
/*
 * An index partition.
 */
final AbstractBTree[] sources;
{
// live resources for that index partition.
final IResourceMetadata[] a = pmd.getResources();
assert a != null : "No resources: name="+name+", pmd="+pmd;
sources = new AbstractBTree[a.length];
// the most recent is this btree.
sources[0/* j */] = btree;
// Resolve each remaining resource in view order (most recent first).
for (int i = 1; i < a.length; i++) {
final IResourceMetadata resource = a[i];
final IRawStore store;
try {
store = openStore(resource.getUUID());
} catch (NoSuchStoreException ex) {
/*
 * There is a dependency for that index that is on a resource
 * (a ManagedJournal or IndexSegment) that is no longer
 * available.
 */
// add some more information to the error message.
throw new NoSuchStoreException(
"Could not load index: name=" + name
+ ", timestamp=" + timestamp
+ ", storeUUID=" + resource.getUUID()
+ ", storeFile=" + resource.getFile()
+ ", pmd=" + pmd + " : " + ex, ex);
}
/*
 * Choose the timestamp at which the component index will be read:
 * UNISOLATED/READ_COMMITTED requests are mapped onto a concrete
 * commit time for each historical resource.
 */
final long ts;
if (timestamp == ITx.UNISOLATED
|| timestamp == ITx.READ_COMMITTED) {
if (store instanceof IndexSegmentStore) {
// there is only one timestamp for an index segment store.
ts = ((IndexSegmentStore) store).getCheckpoint().commitTime;
} else if (resource.getCommitTime() == 0L) {
/*
 * Interpret for a historical store as the last
 * committed data on that store.
 */
// the last commit time on the historical journal.
ts = ((AbstractJournal) store).getRootBlockView()
.getLastCommitTime();
} else {
// The specific commit time on which to read.
ts = resource.getCommitTime();
}
} else {
ts = timestamp;
}
// By construction, ts is now a concrete commit time.
assert ts != ITx.UNISOLATED;
assert ts != ITx.READ_COMMITTED;
final AbstractBTree ndx = getIndexOnStore(name, ts, store);
if (ndx == null) {
throw new RuntimeException(
"Could not load component index: name=" + name
+ ", timestamp=" + timestamp
+ ", resource=" + resource);
}
if (log.isInfoEnabled())
log.info("Added to view: " + resource);
sources[i] = ndx;
}
}
if (log.isInfoEnabled())
log.info("Opened index partition: name=" + name + ", timestamp="
+ timestamp);
return sources;
}
/**
* {@inheritDoc}
* <p>
* Note: An {@link ITx#READ_COMMITTED} view returned by this method WILL NOT
* update if there are intervening commits. This decision was made based on
* the fact that views are requested from the {@link IndexManager} by an
* {@link AbstractTask} running on the {@link ConcurrencyManager}. Such
* tasks, and hence such views, have a relatively short life. However, the
* {@link Journal} implementation of this method is different and will
* return a {@link ReadCommittedView} precisely because objects are directly
* requested from a {@link Journal} by the application and the application
* can hold onto a read-committed view for an arbitrary length of time. This
* has the pragmatic effect of allowing us to cache read-committed views in
* the application and in the {@link IBigdataClient}. For the
* {@link IBigdataClient}, the view acquires its read-committed semantics
* because an {@link IClientIndex} generates {@link AbstractTask}(s) for
* each {@link IIndex} operation and submits those task(s) to the
* appropriate {@link IDataService}(s) for evaluation. The
* {@link IDataService} will resolve the index using this method, and it
* will always see the then-current read-committed view and the
* {@link IClientIndex} will appear to have read-committed semantics.
*
* @see Journal#getIndex(String, long)
*/
@Override
public ILocalBTreeView getIndex(final String name, /*final*/ long timestamp) {

    if (name == null) {

        throw new IllegalArgumentException();

    }

    /*
     * Note: Contention is with purgeResources().
     */
    indexCacheLock.readLock().lock();

    try {

        if (timestamp == ITx.READ_COMMITTED) {

            /*
             * Resolve a READ_COMMITTED request against the most recent
             * commit point. Each request by the same operation will return
             * the then most recently committed view and the IIndex will
             * report the actual timestamp used. The upside is that the
             * view is cached since it has a normal timestamp and we need
             * do nothing more to provide a read lock for read-committed
             * requests. If we simply did this when the task began to
             * execute then it would use a consistent timestamp for all of
             * its index views.
             *
             * Note: After this point [timestamp] can never equal
             * READ_COMMITTED again, which is why no further READ_COMMITTED
             * handling appears below (the old code re-tested that constant
             * in several places; those branches were unreachable and have
             * been removed).
             */
            timestamp = getLiveJournal().getRootBlockView().getLastCommitTime();

        }

        // The (name, timestamp) key for the view.
        final NT nt = new NT(name, timestamp);

        // Serialize concurrent requests for the same view.
        final Lock lock = namedLock.acquireLock(nt);

        try {

            // test the indexCache.
            {

                final ILocalBTreeView ndx = indexCache.get(nt);

                if (ndx != null) {

                    if (log.isInfoEnabled())
                        log.info("Cache hit: " + nt);

                    return ndx;

                }

            }

            // is this a read-write transactional view?
            final boolean isReadWriteTx = TimestampUtility.isReadWriteTx(timestamp);

            // lookup transaction iff transactional view.
            final ITx tx = (isReadWriteTx ? getConcurrencyManager()
                    .getTransactionManager().getTx(timestamp) : null);

            if (isReadWriteTx) {

                /*
                 * Handle fully isolated (read-write) transactional views.
                 */

                if (tx == null) {

                    /*
                     * This will happen both if you attempt to use a
                     * transaction identifier that has not been registered
                     * and if you attempt to use a transaction after it has
                     * been either committed or aborted.
                     */
                    log.warn("Unknown transaction: name=" + name + ", tx="
                            + timestamp);

                    return null;

                }

                if (!tx.isActive()) {

                    // typically this means that the transaction has already
                    // prepared.
                    log.warn("Transaction not active: name=" + name + ", tx="
                            + timestamp + ", prepared=" + tx.isPrepared()
                            + ", complete=" + tx.isComplete() + ", aborted="
                            + tx.isAborted());

                    return null;

                }

            }

            // historical read (read-write tx handled above, UNISOLATED below).
            final boolean readOnly = TimestampUtility.isReadOnly(timestamp);

            final ILocalBTreeView tmp;

            if (isReadWriteTx) {

                /*
                 * Isolated operation.
                 *
                 * Note: The backing index is always a historical state of
                 * the named index.
                 *
                 * Note: Tx#getIndex(String name) serializes concurrent
                 * requests for the same index (thread-safe).
                 */
                final ILocalBTreeView isolatedIndex = tx.getIndex(name);

                if (isolatedIndex == null) {

                    log.warn("No such index: name=" + name + ", timestamp="
                            + TimestampUtility.toString(timestamp));

                    return null;

                }

                tmp = isolatedIndex;

            } else {

                /*
                 * Non-transactional view.
                 */

                if (readOnly) {

                    /*
                     * Historical read. (READ_COMMITTED was resolved to a
                     * commit time at the top of this method, so no
                     * stale-locator check is performed on this code path;
                     * the old READ_COMMITTED-guarded check here was dead
                     * code.)
                     */
                    final AbstractBTree[] sources = getIndexSources(name,
                            timestamp);

                    if (sources == null) {

                        log.warn("No such index: name=" + name + ", timestamp="
                                + TimestampUtility.toString(timestamp));

                        return null;

                    }

                    assert sources.length > 0;

                    assert sources[0].isReadOnly();

                    if (sources.length == 1) {

                        tmp = (BTree) sources[0];

                    } else {

                        tmp = new FusedView(sources);

                    }

                } else {

                    /*
                     * Writable unisolated index.
                     *
                     * Note: This is the "live" mutable index. This index is
                     * NOT thread-safe. A lock manager is used to ensure that
                     * at most one task has access to this index at a time.
                     */
                    assert timestamp == ITx.UNISOLATED : "timestamp="
                            + timestamp;

                    /*
                     * Check to see if an index partition was split, joined
                     * or moved.
                     */
                    final StaleLocatorReason reason = getIndexPartitionGone(name);

                    if (reason != null) {

                        // Notify client of stale locator.
                        throw new StaleLocatorException(name, reason);

                    }

                    if (isDisabledWrites(name)) {

                        /*
                         * Writes on the index have been disabled. This
                         * occurs when the index refuses to split and is at
                         * least two times larger than the nominal shard
                         * size. In this case writes are disabled to push
                         * the problem back onto the application (typically
                         * the problem is a bad split handler supplied by
                         * the application).
                         *
                         * To fix this condition, you must fix the split
                         * handler, explicitly enable writes, and then
                         * update the IndexMetadata for each shard of the
                         * index and in the MDS as well.
                         *
                         * Note: This check is only performed for the full
                         * view of the shard. It MUST NOT be performed by
                         * getIndexOnJournal(...) since that code path is
                         * used to update the definition of the shard view
                         * and we need to continue to propagate the shard
                         * view definition from overflow to overflow even
                         * after further writes on the shard have been
                         * disabled.
                         */
                        throw new RuntimeException(
                                "Index writes disabled: " + name);

                    }

                    final AbstractBTree[] sources = getIndexSources(name,
                            ITx.UNISOLATED);

                    if (sources == null) {

                        log.warn("No such index: name=" + name + ", timestamp="
                                + TimestampUtility.toString(timestamp));

                        return null;

                    }

                    assert !sources[0].isReadOnly();

                    if (sources.length == 1) {

                        tmp = (BTree) sources[0];

                    } else {

                        tmp = new FusedView(sources);

                    }

                }

            }

            if (timestamp != ITx.UNISOLATED) {

                /*
                 * Update the indexCache. The unisolated (mutable) view is
                 * never cached here; every other timestamp reaching this
                 * point identifies an immutable (tx or historical) view.
                 */
                if (log.isInfoEnabled())
                    log.info("Adding to cache: " + nt);

                indexCache.put(nt, tmp);

            }

            return tmp;

        } finally {

            lock.unlock();

        }

    } finally {

        indexCacheLock.readLock().unlock();

    }

}
/**
* Dump index metadata as of the timestamp.
*
* @param timestamp
*            The timestamp of the view. {@link ITx#UNISOLATED} and
*            {@link ITx#READ_COMMITTED} are resolved against the last
*            commit time of the live journal.
*
* @throws IllegalArgumentException
* if <i>timestamp</i> is positive (a transaction identifier).
*
* @return The dump.
*
* @throws IllegalStateException
* if the live journal is closed when this method is invoked.
* @throws RuntimeException
* if the live journal is closed asynchronously while this
* method is running.
*/
public String listIndexPartitions(long timestamp) {

    // Symbolic timestamps are resolved against the last commit point.
    if (timestamp == ITx.UNISOLATED || timestamp == ITx.READ_COMMITTED) {

        timestamp = getLiveJournal().getLastCommitTime();

    }

    final AbstractJournal journal = getJournal(timestamp);

    if (journal == null) {

        /*
         * This condition can occur if there are no shard views on the
         * previous journal and the releaseAge is zero since the previous
         * journal can be purged (deleted) before this method is invoked.
         * This situation arises in a few of the unit tests which begin
         * with an empty journal and copy everything onto the new journal
         * such that the old journal can be immediately released.
         */
        return "No journal: timestamp=" + timestamp;

    }

    final StringBuilder sb = new StringBuilder();

    // Header: the timestamp and the journal backing that commit point.
    sb.append("timestamp="+timestamp+"\njournal="+journal.getResourceMetadata());

    /*
     * Scan the index names as of that timestamp and report one line per
     * index partition.
     */
    for (final Iterator<String> names = journal.indexNameScan(
            null/* prefix */, timestamp); names.hasNext();) {

        final String name = names.next();

        /*
         * Open the mutable BTree only (not the full view since we don't
         * want to force the read of index segments from the disk).
         */
        final BTree btree = (BTree) journal.getIndexLocal(name, timestamp);

        assert btree != null : name;

        // index partition metadata (from the index metadata record).
        final LocalPartitionMetadata pmd = btree.getIndexMetadata()
                .getPartitionMetadata();

        sb.append("\nname="+name+", checkpoint="+btree.getCheckpoint()+", pmd="+pmd);

    }

    return sb.toString();

}
/**
* Build an {@link IndexSegment} from an index partition. Delete markers are
* propagated to the {@link IndexSegment} unless <i>compactingMerge</i> is
* <code>true</code>.
* <p>
* Note: {@link IndexSegment}s are registered with the {@link StoreManager}
* by this method but are also placed into a hard reference collection (the
* <i>retentionSet</i>) in order to prevent their being released before
* they are put to use by incorporating them into an index partition view.
* The caller MUST remove the {@link IndexSegment} from that hard reference
* collection once the index has been incorporated into an index partition
* view or is no longer required (e.g., has been MOVEd). However, the caller
* MUST NOT remove the {@link IndexSegment} from the hard reference
* collection until after the commit point for the task which incorporates it
* into the index partition view. In practice, this means that those tasks
* must be encapsulated with either a post-condition action or wrapped by a
* caller which provides the necessary after-action in a finally{} clause.
*
* @param indexPartitionName
* The name of the index partition (not the name of the scale-out
* index).
* @param src
* A view of the index partition as of the <i>createTime</i>.
* This may be a partial view comprised of only the first N
* sources in the view, in which case <i>compactingMerge := false</i>.
* @param compactingMerge
* When <code>true</code> the caller asserts that <i>src</i>
* is a {@link FusedView} and deleted index entries WILL NOT be
* included in the generated {@link IndexSegment}. Otherwise, it
* is assumed that the only select component(s) of the index
* partition view are being exported onto an {@link IndexSegment}
* and deleted index entries will therefore be propagated to the
* new {@link IndexSegment}.
* @param commitTime
* The commit time associated with the view from which the
* {@link IndexSegment} is being generated. This value is written
* into {@link IndexSegmentCheckpoint#commitTime}.
* @param fromKey
* The lowest key that will be included (inclusive). When <code>null</code>
* there is no lower bound.
* @param toKey
* The first key that will not be included (exclusive). When
* <code>null</code> there is no upper bound.
*
* @return A {@link BuildResult} identifying the new {@link IndexSegment}
* and the source index.
*
* @throws Exception
* if any errors are encountered then the file (if it exists)
* will be deleted as a side-effect before returning control to
* the caller.
*
* @see StoreManager#purgeOldResources(long, boolean)
*/
public BuildResult buildIndexSegment(final String indexPartitionName,
        final ILocalBTreeView src, final boolean compactingMerge,
        final long commitTime, final byte[] fromKey, final byte[] toKey,
        final Event parentEvent) throws Exception {

    if (indexPartitionName == null)
        throw new IllegalArgumentException();

    if (src == null)
        throw new IllegalArgumentException();

    if (parentEvent == null)
        throw new IllegalArgumentException();

    /*
     * Set up a sub-event of [parentEvent] describing this build/merge for
     * the event reporting interface.
     */
    final Event e;
    {

        final Map<String, Object> m = new HashMap<String, Object>();

        m.put("name", indexPartitionName);

        m.put("merge", compactingMerge);

        m.put("#sources", src.getSourceCount());

        // #of MBs of source index segment data (journal sources contribute
        // nothing to this total).
        long sumSegBytes = 0L;

        for (AbstractBTree tmp : src.getSources()) {

            if (tmp instanceof IndexSegment) {

                sumSegBytes += ((IndexSegment) tmp).getStore().size();

            }

        }

        m.put("MB(in)", fpf
                .format(((double) sumSegBytes / Bytes.megabyte32)));

        // #of concurrent index segment build tasks (including this one).
        m.put("#build", concurrentBuildTaskCount.get() + 1);

        // #of concurrent index segment merge tasks (including this one).
        m.put("#merge", concurrentMergeTaskCount.get() + 1);

        e = parentEvent.newSubEvent(EventType.IndexSegmentBuild, m).start();

    }

    File outFile = null;
    try {

        final IndexMetadata indexMetadata;
        final SegmentMetadata segmentMetadata;
        final IndexSegmentBuilder builder;
        try {

            // metadata for that index / index partition.
            indexMetadata = src.getIndexMetadata();

            // the file to be generated.
            outFile = getIndexSegmentFile(indexMetadata);

            // new builder.
            builder = IndexSegmentBuilder.newInstance(/*indexPartitionName,*/ src, outFile,
                    tmpDir, compactingMerge, commitTime, fromKey, toKey);

            try {

                // place on the active tasks lists.
                buildTasks.put(outFile, builder);

                // bump whichever concurrency counter applies; the matching
                // decrement is in the finally clause below.
                if(compactingMerge)
                    concurrentMergeTaskCount.incrementAndGet();
                else
                    concurrentBuildTaskCount.incrementAndGet();

                // build the index segment (synchronous call).
                builder.call();

            } finally {

                // remove from the active tasks list.
                buildTasks.remove(outFile);

                if(compactingMerge)
                    concurrentMergeTaskCount.decrementAndGet();
                else
                    concurrentBuildTaskCount.decrementAndGet();

            }

            /*
             * Report on a bulk merge/build of an {@link IndexSegment}.
             */
            {

                // #of bytes in the generated index segment file.
                final long nbytes = builder.getCheckpoint().length;

                // data rate in MB/sec.
                float mbPerSec = builder.mbPerSec;

                // add more event details.
                e.addDetail("filename", outFile);
                e.addDetail("expectedNodeCount", builder.plan.nnodes);
                e.addDetail("expectedLeafCount", builder.plan.nleaves);
                e.addDetail("expectedRangeCount", builder.plan.nentries);
                e.addDetail("actualNodeCount", builder.getCheckpoint().nnodes);
                e.addDetail("actualLeafCount", builder.getCheckpoint().nleaves);
                e.addDetail("actualRangeCount", builder.getCheckpoint().nentries);
                e.addDetail("commitTime", commitTime);
                // Note: the stray unary '+' is a no-op on the numeric value.
                e.addDetail("elapsed", +builder.elapsed);
                e.addDetail("MB(out)", fpf
                        .format(((double) nbytes / Bytes.megabyte32)));
                e.addDetail("MB/s", fpf.format(mbPerSec));

            }

            // Describe the index segment.
            segmentMetadata = new SegmentMetadata(//
                    outFile, //
                    builder.segmentUUID, //
                    commitTime //
            );

            /*
             * Add to the retention set so the newly built index segment
             * will not be deleted before it is put to use.
             */
            retentionSetAdd(segmentMetadata.getUUID());

            /*
             * Now that the file is protected from release, notify the
             * resource manager so that it can find this file.
             */
            addResource(segmentMetadata, outFile);

        } catch (Throwable t) {

            /*
             * Failure during (or before) registration with the
             * StoreManager: best-effort delete of the output file, then
             * rethrow the original cause (as documented by the javadoc,
             * the file does not survive an error).
             */
            if (outFile != null && outFile.exists()) {

                try {

                    outFile.delete();

                } catch (Throwable t2) {

                    log.warn(t2.getLocalizedMessage(), t2);

                }

            }

            if (t instanceof Exception)
                throw (Exception) t;

            throw new RuntimeException(t);

        }

        /*
         * Note: Now that the resource is registered with the StoreManager
         * we have to handle errors somewhat differently.
         */
        try {

            // form the result describing the new segment and its source.
            final BuildResult tmp = new BuildResult(indexPartitionName, compactingMerge,
                    src.getSources(), indexMetadata, segmentMetadata,
                    builder);

            if (log.isInfoEnabled())
                log.info("built index segment: " + tmp);

            return tmp;

        } catch (Throwable t) {

            try {

                // make it releasable.
                retentionSetRemove(segmentMetadata.getUUID());

            } catch (Throwable t2) {

                log.warn(t2.getLocalizedMessage(), t2);

            }

            try {

                // release it.
                deleteResource(segmentMetadata.getUUID(), false/* isJournal */);

            } catch (Throwable t2) {

                log.warn(t2.getLocalizedMessage(), t2);

            }

            if (t instanceof Exception)
                throw (Exception) t;

            throw new RuntimeException(t);

        }

    } finally {

        // always close out the reporting sub-event.
        e.end();

    }

}
/**
 * A map containing the concurrently executing index segment build tasks,
 * keyed by the output file being generated. This is used to report those
 * tasks out via the performance counters interface. Entries are inserted
 * before a build runs and removed in a finally clause once it completes
 * (see {@link #buildIndexSegment}).
 */
protected final ConcurrentHashMap<File,IndexSegmentBuilder> buildTasks =
new ConcurrentHashMap<File, IndexSegmentBuilder>();
/**
 * The #of (non-compacting) build tasks which are executing concurrently.
 * Incremented/decremented around the builder invocation in
 * {@link #buildIndexSegment}.
 */
protected final AtomicInteger concurrentBuildTaskCount = new AtomicInteger();
/**
 * The #of compacting merge tasks which are executing concurrently.
 * Incremented/decremented around the builder invocation in
 * {@link #buildIndexSegment}.
 */
protected final AtomicInteger concurrentMergeTaskCount = new AtomicInteger();
/*
 * Per index counters.
 */
/**
 * Canonical per-index partition {@link BTreeCounters}. These counters are
 * set on each {@link AbstractBTree} that is materialized by
 * {@link #getIndexOnStore(String, long, IRawStore)}. The same
 * {@link BTreeCounters} object is used for the unisolated, read-committed,
 * read-historical and isolated views of the index partition and for each
 * source in the view regardless of whether the source is a mutable
 * {@link BTree} on the live journal, a read-only {@link BTree} on a
 * historical journal, or an {@link IndexSegment}.
 *
 * FIXME An {@link IndexSegment} can be used by more than one view of an
 * index partition. This is not a problem and no double counting,
 * misassignment of credit, or lost counters will result. However, if an
 * {@link IndexSegment} is used by different index partitions (which might
 * well be allowed in a post-split scenario but is not possible for a
 * post-move or post-join scenario — and those are the three ways in which
 * a new index partition can be created, other than by registering a new
 * scale-out index) then the {@link BTreeCounters} will only reflect all
 * activity on an {@link IndexSegment} in the index partition which last
 * (re-)opened that {@link IndexSegment}.
 *
 * FIXME Index partitions which have been dropped should be cleared from the
 * map at overflow unless they have been re-registered since (the map could
 * also use the index UUID as the key in case the index is re-registered).
 * Use {@link #getIndexPartitionGone(String)} to figure out if each index
 * partition has been dropped during synchronous overflow. Then cross check
 * to verify that it does not still exist.
 * <p>
 * Slightly better would be to reset the index counters at the drop (except
 * that they will immediately disappear) or best yet to always reset the
 * index counters on add and to clear at overflow if split/moved/deleted or
 * otherwise gone.
 * <p>
 * When a scale-out index is deleted, clear out the entries in
 * {@link #getIndexPartitionGone(String)} so that we do not run into trouble
 * if the index is re-registered!
 */
final private ConcurrentHashMap<String/* name */, BTreeCounters> indexCounters = new ConcurrentHashMap<String, BTreeCounters>();
/**
 * The aggregated performance counters for each unisolated index partition
 * view as of the time when the old journal was closed for writes. This is
 * used to compute the delta for each unisolated index partition view at the
 * end of the life cycle for the new live journal. Replaced wholesale by
 * {@link #markAndGetDelta()} (which is synchronized; the field itself is
 * not otherwise guarded — note for review).
 */
private Map<String/*name*/, BTreeCounters> mark = new HashMap<String, BTreeCounters>();
@Override
public BTreeCounters getIndexCounters(final String name) {

    if (name == null)
        throw new IllegalArgumentException();

    // Fast path: counters already exist for this index (partition).
    final BTreeCounters existing = indexCounters.get(name);

    if (existing != null)
        return existing;

    // Not found: race to install a fresh counters object.
    final BTreeCounters created = new BTreeCounters();

    final BTreeCounters winner = indexCounters.putIfAbsent(name, created);

    if (winner != null) {

        // Another thread installed its instance first; use theirs.
        return winner;

    }

    // Our instance won the race.
    if (log.isInfoEnabled())
        log.info("New counters: indexPartitionName=" + name);

    return created;

}
/**
* Snapshots the index partition performance counters and returns a map
* containing the net change in the performance counters for each index
* partition since the last time this method was invoked (it is invoked by
* {@link #overflow()}).
* <p>
* Note: This method has a side effect of setting a new mark. It SHOULD NOT
* be used except at overflow since the "mark" is used to determine the net
* change in the per-index partition performance counters. If used other
* than at overflow the net change will be under-reported.
*
* @return A map containing the net change in the index partition
* performance counters for each index partition.
*/
synchronized protected Map<String, BTreeCounters> markAndGetDelta() {

    // The totals which will become the new mark.
    final Map<String/*name*/, BTreeCounters> nextMark = new HashMap<String, BTreeCounters>();

    /*
     * The net change in the performance counters for each unisolated index
     * partition view over the life cycle of the old journal. This is used
     * to determine the amount of activity on each index partition during
     * the life cycle of the old journal. That is used to compute the
     * {@link Score} for each index partition. Those {@link Score}s inform
     * the choice of the index partition moves.
     */
    final Map<String/* name */, BTreeCounters> delta = new HashMap<String, BTreeCounters>();

    for (Map.Entry<String, BTreeCounters> entry : indexCounters.entrySet()) {

        // name of the index partition.
        final String name = entry.getKey();

        // current totals (strictly increasing over time).
        final BTreeCounters total = entry.getValue();

        // the totals recorded at the previous mark (if any).
        final BTreeCounters lastMark = this.mark.get(name);

        if (lastMark == null) {

            // no previous mark: the delta is the entire total.
            delta.put(name, total);

            if (log.isInfoEnabled())
                log.info("First time: " + name);

        } else {

            // net change since the previous mark.
            delta.put(name, total.subtract(lastMark));

            if (log.isInfoEnabled())
                log.info("Computed delta: " + name);

        }

        // carry the current totals into the new mark.
        nextMark.put(name, total);

    }

    // replace the old mark with the new one.
    this.mark = nextMark;

    // return summary of the net change in activity for each index partition.
    return delta;

}
/**
* Return a {@link CounterSet} reflecting use of the named indices. When an
* index partition is in use, its {@link CounterSet} is reported under a
* path formed from name of the scale-out index and partition identifier.
*
* @return A new {@link CounterSet} reflecting the use of the named indices.
*/
public CounterSet getIndexCounters() {

    final CounterSet tmp = new CounterSet();

    /*
     * One counter-set path per tracked index / index partition.
     *
     * Note: two large alternative implementations that had been left here
     * as commented-out code (pulling the live view from the index caches
     * to add per-view counters) have been removed; only the aggregated
     * BTreeCounters are reported.
     */
    for (Map.Entry<String, BTreeCounters> entry : indexCounters.entrySet()) {

        final String name = entry.getKey();

        final BTreeCounters btreeCounters = entry.getValue();

        assert btreeCounters != null : "name=" + name;

        /*
         * Note: this is a hack. We parse the index name in order to
         * recognize whether or not it is an index partition since we want
         * to know that even if we get a StaleLocatorException from the
         * ResourceManager. This will work fine as long as the basename of
         * the index does not use a '#' character.
         */
        final String path;
        final int indexOf = name.lastIndexOf('#');
        if (indexOf != -1) {

            // Index partition: report under scaleOutIndexName/shardName.
            path = name.substring(0, indexOf) + ICounterSet.pathSeparator
                    + name;

        } else {

            // Plain index: report under the index name itself.
            path = name;

        }

        // create counter set for this index / index partition.
        final CounterSet t = tmp.makePath(path);

        /*
         * Attach the aggregated counters for the index / index partition.
         */
        t.attach(btreeCounters.getCounters());

    }

    return tmp;

}
}