/*

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Mar 24, 2008
 */

package com.bigdata.resources;

import java.io.File;
import java.io.IOException;
import java.lang.ref.WeakReference;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import org.apache.log4j.Logger;

import com.bigdata.bfs.BigdataFileSystem;
import com.bigdata.btree.BTree;
import com.bigdata.btree.Checkpoint;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexSegment;
import com.bigdata.btree.IndexSegmentStore;
import com.bigdata.cache.ConcurrentWeakValueCacheWithTimeout;
import com.bigdata.cache.HardReferenceQueue;
import com.bigdata.concurrent.NamedLock;
import com.bigdata.io.SerializerUtil;
import com.bigdata.journal.AbstractJournal;
import com.bigdata.journal.AbstractLocalTransactionManager;
import com.bigdata.journal.BufferMode;
import com.bigdata.journal.CommitRecordIndex;
import com.bigdata.journal.ConcurrencyManager;
import com.bigdata.journal.DiskOnlyStrategy;
import com.bigdata.journal.IBufferStrategy;
import com.bigdata.journal.ICommitRecord;
import com.bigdata.journal.IConcurrencyManager;
import com.bigdata.journal.ILocalTransactionManager;
import com.bigdata.journal.IResourceLockService;
import com.bigdata.journal.IResourceManager;
import com.bigdata.journal.IRootBlockView;
import com.bigdata.journal.ITransactionService;
import com.bigdata.journal.ITx;
import com.bigdata.journal.Name2Addr;
import com.bigdata.journal.TemporaryStore;
import com.bigdata.journal.WORMStrategy;
import com.bigdata.journal.WORMStrategy.StoreCounters;
import com.bigdata.journal.WriteExecutorService;
import com.bigdata.mdi.IPartitionMetadata;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.IndexPartitionCause;
import com.bigdata.mdi.JournalMetadata;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.mdi.SegmentMetadata;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.relation.locator.DefaultResourceLocator;
import com.bigdata.service.DataService;
import com.bigdata.service.Event;
import com.bigdata.service.EventResource;
import com.bigdata.service.EventType;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.service.ManagedResourceService;
import com.bigdata.service.MetadataService;
import com.bigdata.sparse.SparseRowStore;
import com.bigdata.util.Bytes;
import com.bigdata.util.DaemonThreadFactory;
import com.bigdata.util.config.NicUtil;

/**
 * Class encapsulates logic for managing the store files (journals and index
 * segments), including the logic to compute the effective release time for the
 * managed resources and to release those resources by deleting them from the
 * file system.
 *
 * @todo There is no "CREATE_TEMP_DIR" option, and "DELETE_ON_CLOSE" does not
 *       remove all directories created during setup. One of the consequences
 *       is that you have to explicitly clean up after a unit test using a
 *       {@link ResourceManager} or it will leave its files around.
 *
 * @todo {@link BufferMode#Temporary} is not supported (verify whether the
 *       Transient mode is supported).
 *
 * @todo If we approach the limit on free space for the {@link #dataDir} then
 *       we need to shed index partitions to other data services or potentially
 *       become more aggressive in releasing old resources. See
 *       {@link #getDataDirFreeSpace()}
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 */
abstract public class StoreManager extends ResourceEvents implements
        IResourceManager {

    /**
     * Logger.
     */
    private static final Logger log = Logger.getLogger(StoreManager.class);

    /**
     * Options for the {@link StoreManager}.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     */
    public static interface Options extends com.bigdata.journal.Options {

        /**
         * The property whose value is the name of the directory in which the
         * store files will be created (no default). This property is required
         * unless the instance is transient. If you specify
         * {@link com.bigdata.journal.Options#BUFFER_MODE} as
         * {@link BufferMode#Transient} then journals will NOT be stored in the
         * file system and {@link ResourceManager#overflow()} will be disabled.
         * <p>
         * The files are created within subdirectories as follows: The
         * "journals" subdirectory contains the journal files. The "segments"
         * directory contains subdirectories corresponding to the index UUID
         * for each scale-out index. Within those index-specific directories,
         * the index segment files are named using the temporary file
         * mechanism, with the munged index name as the file prefix and
         * {@link Options#SEG} as the file suffix. If the index is partitioned
         * then the partition identifier appears as part of the file prefix.
         * <p>
         * Note: While files are stored per the scheme described above, the
         * entire {@link #DATA_DIR} will be scanned recursively to identify all
         * journal files and index segments during startup. Files will be used
         * wherever they are found but the {@link IResourceMetadata#getFile()}
         * read from a given resource MUST correspond to its relative location
         * within the {@link #DATA_DIR}.
         * <p>
         * Note: Each {@link DataService} or {@link MetadataService} MUST have
         * its own {@link #DATA_DIR}.
         */
        String DATA_DIR = StoreManager.class.getName() + ".dataDir";

        /**
         * The capacity of the LRU cache of open {@link IRawStore}s. The
         * capacity of this cache indirectly controls how many stores will be
         * held open.
         * The main reason for keeping a store open is to reuse its buffers if
         * another request arrives "soon" which would read on that store. Note
         * that "stores" includes both {@link ManagedJournal}s and
         * {@link IndexSegmentStore}s.
         * <p>
         * The effect of this parameter is indirect owing to the semantics of
         * weak references and the control of the JVM over when they are
         * cleared. Once an index becomes weakly reachable, the JVM will
         * eventually GC the index object, thereby releasing its object graph.
         * Since stores which are strongly reachable never have their weak
         * reference cleared this provides our guarantee that stores are never
         * closed if they are in use.
         * <p>
         * Stores have non-transient resources and MUST explicitly be closed.
         * Since we are not notified before the weak reference is cleared, our
         * only remaining option is {@link AbstractJournal#finalize()} and
         * {@link IndexSegmentStore#finalize()}, both of which close the store
         * if it is still open.
         *
         * @see #DEFAULT_STORE_CACHE_CAPACITY
         */
        String STORE_CACHE_CAPACITY = StoreManager.class.getName()
                + ".storeCacheCapacity";

        /**
         * The default for the {@link #STORE_CACHE_CAPACITY} option.
         */
        String DEFAULT_STORE_CACHE_CAPACITY = "20";

        /**
         * The time in milliseconds before an entry in the store cache will be
         * cleared from the backing {@link HardReferenceQueue} (default
         * {@value #DEFAULT_STORE_CACHE_TIMEOUT}). This property controls how
         * long the store cache will retain an {@link IRawStore} which has not
         * been recently used. This is in contrast to the cache capacity.
         */
        String STORE_CACHE_TIMEOUT = StoreManager.class.getName()
                + ".storeCacheTimeout";

        String DEFAULT_STORE_CACHE_TIMEOUT = "" + (60 * 1000); // One minute.

        /**
         * A boolean property whose value determines whether or not startup
         * will complete successfully if bad files are identified during the
         * startup scan (default {@value #DEFAULT_IGNORE_BAD_FILES}). When
         * <code>false</code> the {@link StoreManager} will refuse to start if
         * it finds bad files. When <code>true</code> the {@link StoreManager}
         * will start up anyway, but some index views may not be available.
         * Regardless, bad files will be logged as they are identified and all
         * files will be scanned before the {@link StoreManager} aborts.
         */
        String IGNORE_BAD_FILES = StoreManager.class.getName()
                + ".ignoreBadFiles";

        String DEFAULT_IGNORE_BAD_FILES = "false";

        /**
         * Option may be used to disable the purge of old resources during
         * startup.
         */
        String PURGE_OLD_RESOURCES_DURING_STARTUP = StoreManager.class
                .getName() + ".purgeOldResourcesDuringStartup";

        String DEFAULT_PURGE_OLD_RESOURCES_DURING_STARTUP = "true";

        /**
         * Option specifies the #of bytes under management below which we will
         * accelerate the overflow of the live journal by reducing its maximum
         * extent below the nominal configured maximum extent. The purpose of
         * this option is to promote rapid overflow of a new data service
         * (where new is measured by the #of bytes under management). This
         * helps to increase the rate at which index partitions are split (and
         * moved if there is more than one new data service starting). When
         * ZERO (0) the feature is disabled.
         */
        String ACCELERATE_OVERFLOW_THRESHOLD = StoreManager.class.getName()
                + ".accelerateOverflowThreshold";

        String DEFAULT_ACCELERATE_OVERFLOW_THRESHOLD = "" + (Bytes.gigabyte);

    }
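
    /*
     * For illustration only: a minimal sketch (with hypothetical values and a
     * hypothetical path) of how these options might be supplied through the
     * Properties object handed to the ctor. This is an assumption about
     * typical usage, not a prescribed configuration.
     *
     *   final Properties p = new Properties();
     *   // required unless BufferMode#Transient is used.
     *   p.setProperty(StoreManager.Options.DATA_DIR, "/var/bigdata/ds1");
     *   // hold up to 50 stores open, timing entries out after 2 minutes.
     *   p.setProperty(StoreManager.Options.STORE_CACHE_CAPACITY, "50");
     *   p.setProperty(StoreManager.Options.STORE_CACHE_TIMEOUT,
     *           "" + (2 * 60 * 1000));
     *   // refuse to start if bad files are found (the default).
     *   p.setProperty(StoreManager.Options.IGNORE_BAD_FILES, "false");
     */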
    /**
     * Performance counters for the {@link StoreManager}.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     */
    public static interface IStoreManagerCounters {

        /**
         * The configured data directory.
         */
        String DataDir = "DataDir";

        /**
         * The configured tmp directory.
         */
        String TmpDir = "TmpDir";

        /**
         * <code>true</code> iff {@link StoreManager#isOpen()}
         */
        String IsOpen = "isOpen";

        /**
         * <code>true</code> iff {@link StoreManager#isStarting()}
         */
        String IsStarting = "isStarting";

        /**
         * <code>true</code> iff {@link StoreManager#isRunning()}
         */
        String IsRunning = "isRunning";

        String StoreCacheCapacity = "Store Cache Capacity";

        String StoreCacheSize = "Store Cache Size";

        /**
         * #of journals currently under management.
         */
        String ManagedJournalCount = "Managed Journal Count";

        /**
         * #of index segments currently under management.
         */
        String ManagedSegmentStoreCount = "Managed Segment Store Count";

        String JournalReopenCount = "Journal (Re-)open Count";

        String SegmentStoreReopenCount = "Segment Store (Re-)open Count";

        /**
         * #of journals which have been deleted.
         */
        String JournalDeleteCount = "Journal Delete Count";

        /**
         * #of index segments which have been deleted.
         */
        String SegmentStoreDeleteCount = "Segment Store Delete Count";

        /**
         * The #of bytes currently under management by the
         * {@link StoreManager}.
         */
        String BytesUnderManagement = "Bytes Under Management";

        /**
         * The #of bytes in journals currently under management by the
         * {@link StoreManager}.
         */
        String JournalBytesUnderManagement = "Journal Bytes Under Management";

        /**
         * The #of bytes in index segments currently under management by the
         * {@link StoreManager}.
         */
        String SegmentBytesUnderManagement = "Segment Bytes Under Management";

        /**
         * The #of bytes in resources that have been deleted by the
         * {@link StoreManager} after they became releasable.
         */
        String BytesDeleted = "Bytes Deleted";

        /**
         * The #of bytes available on the disk volume on which the data
         * directory is located.
         */
        String DataDirBytesAvailable = "Data Volume Bytes Available";

        /**
         * The #of bytes available on the disk volume on which the temporary
         * directory is located.
         */
        String TmpDirBytesAvailable = "Temp Volume Bytes Available";

        /**
         * The maximum extent of any journal managed by this service as of the
         * time when it was closed out by synchronous overflow processing.
         */
        String MaximumJournalSizeAtOverflow = "Maximum Journal Size At Overflow";

        /**
         * The elapsed milliseconds to date required to purge old resources
         * from the file system.
         *
         * @see StoreManager#purgeOldResources()
         */
        String PurgeResourcesMillis = "Purge Resources Millis";

        /**
         * The current release time for the {@link StoreManager}.
         *
         * @see StoreManager#getReleaseTime()
         */
        String ReleaseTime = "Release Time";

        /**
         * The timestamp associated with the last synchronous overflow event.
         */
        String LastOverflowTime = "Last Overflow Time";

        /**
         * The most recent commit time preserved when resources were last
         * purged from the {@link StoreManager}.
         *
         * @see StoreManager#purgeOldResources()
         */
        String LastCommitTimePreserved = "Last Commit Time Preserved";

        /**
         * The most recent commit time.
         */
        String LastCommitTime = "Last Commit Time";

    }

    /**
     * The directory in which the data files reside.
     * <p>
     * Note: It is a hard requirement that each resource is located by the
     * {@link IResourceMetadata#getFile() path} relative to the
     * {@link #dataDir}.
     *
     * @see Options#DATA_DIR
     * @see IResourceMetadata#getFile()
     */
    protected final File dataDir;

    /** Directory containing the journal resources. */
    protected final File journalsDir;

    /** Directory containing the index segment resources. */
    protected final File segmentsDir;

    /**
     * The directory in which the temporary files will reside.
     *
     * @see Options#TMP_DIR
     */
    protected final File tmpDir;

    /**
     * The performance counters for the {@link IBufferStrategy} backing the
     * live journal and any historical journals which are concurrently open
     * with the live journal. A single instance of this object is used, and a
     * hard reference to that instance is held here, so that we can track the
     * cumulative performance counters across the live cycles of all journal
     * instances used by the data service over time. The performance counters
     * are not themselves persistent and do not survive a restart of the
     * {@link StoreManager}.
     */
    private final StoreCounters storeCounters = new StoreCounters();

    /**
     * The performance counters for the {@link IBufferStrategy} backing the
     * live journal and any historical journals which are concurrently open
     * with the live journal. A single instance of this object is used, and a
     * hard reference to that instance is held here, so that we can track the
     * cumulative performance counters across the live cycles of all journal
     * instances used by the data service over time. The performance counters
     * are not themselves persistent and do not survive a restart of the
     * {@link StoreManager}.
     */
    public final StoreCounters getStoreCounters() {
        return storeCounters;
    }

    /**
     * A map over the journal histories. The map is transient and is
     * re-populated from a scan of the file system during startup.
     * <p>
     * The keys are the timestamp at which the journal was put into service.
     * The values are the journal resource descriptions. Given the timestamp
     * of some historical state of an index, this map is used to locate the
     * journal on which that historical state of the index would be found.
     */
    final private JournalIndex journalIndex;

    /**
     * A map over the index segments by ascending createTime and UUID. The map
     * is transient and is re-populated from a scan of the file system during
     * startup.
     * <p>
     * The keys are the createTime of the index segment followed by the index
     * segment UUID (to break ties). The values are the
     * {@link IResourceMetadata} object describing that index segment. This
     * map is used to provide some basic reporting but is primarily used to
     * delete index segment resources once they are no longer required.
     *
     * @todo Is this strictly necessary? Do we have all the necessary
     *       information in the journals? Review the logic and decide.
     */
    final private IndexSegmentIndex segmentIndex;

    /**
     * A non-thread-safe collection of {@link UUID}s for {@link IndexSegment}s
     * which have been newly built but not yet incorporated in a re-start safe
     * manner into an index partition view. {@link UUID}s in this collection
     * are excluded from release by {@link #purgeOldResources()}.
     *
     * @see #purgeOldResources()
     * @see IndexManager#buildIndexSegment(String,
     *      com.bigdata.btree.ILocalBTreeView, boolean, long, byte[], byte[],
     *      Event)
     */
    final private Set<UUID> retentionSet = new HashSet<UUID>();

    /**
     * Add an {@link IndexSegment} to the set of {@link IndexSegment}s which
     * have been generated but not yet incorporated into an index partition
     * view and hence we must take special cautions to prevent their release.
     *
     * @param uuid
     *            The {@link UUID} of the {@link IndexSegmentStore}.
     *
     * @see #retentionSetRemove(UUID)
     * @see #retentionSet
     */
    protected void retentionSetAdd(final UUID uuid) {
        if (uuid == null)
            throw new IllegalArgumentException();
        synchronized (retentionSet) {
            if (!retentionSet.add(uuid)) {
                // that UUID is already in this collection.
throw new IllegalStateException("Already in set: " + uuid); } } } /** * Remove an {@link IndexSegment} from the {@link #retentionSet}. DO NOT * invoke this until the {@link IndexSegment} has been incorporated in a * restart safe manner into an index partition view (that is, post-commit * rather than during the task that incorporates it into the view) or is * known to be no longer required (post MOVE, task failed, etc). * * @param uuid * The {@link UUID} of the {@link IndexSegmentStore}. * * @see #retentionSetAdd(UUID) * @see #retentionSet */ protected void retentionSetRemove(final UUID uuid) { if (uuid == null) throw new IllegalArgumentException(); synchronized (retentionSet) { if (!retentionSet.remove(uuid)) { /* * Note: Only a warning since invoked during error handling when * the resource might have not made it into the retentionSet in * the first place. */ log.warn("Not in retentionSet: " + uuid); } } } /** * A cache that is used by the to automatically close out unused * {@link IndexSegmentStore}s. An {@link IndexSegment} that is no longer * used will have its reference cleared when it is swept by the garbage * collector and will automatically release all of its buffers (node and * leaf cache, etc). However, at that point the {@link IndexSegmentStore} is * still open, and it can buffer a significant amount of data in addition to * the file handle. * <p> * When the weak reference is cleared we know that there are no longer any * hard references to the {@link IndexSegment} and hence the corresponding * {@link IndexSegmentStore} should be closed. In fact, we can immediately * remove the {@link IndexSegmentStore} from the cache of open stores and * then close the store. At this point if the store is re-opened it will be * a new object. This is easy enough to do since the {@link UUID} of the * {@link IndexSegmentStore} is the key in our map! * * @see Options#STORE_CACHE_CAPACITY * @see Options#STORE_CACHE_TIMEOUT */ // final protected WeakValueCache<UUID, IRawStore> storeCache; final protected ConcurrentWeakValueCacheWithTimeout<UUID, IRawStore> storeCache; /** * Provides locks on a per-{resourceUUID} basis for higher concurrency. */ private final transient NamedLock<UUID> namedLock = new NamedLock<UUID>(); /** * The #of entries in the hard reference cache for {@link IRawStore}s, * including both {@link ManagedJournal}s and IndexSegment}s. There MAY be * more {@link IRawStore}s open than are reported by this method if there * are hard references held by the application to those {@link IRawStore}s. * {@link IRawStore}s that are not fixed by a hard reference will be * quickly finalized by the JVM. */ public int getStoreCacheSize() { return storeCache.size(); } /** * <code>true</code> iff {@link BufferMode#Transient} was indicated. */ private final boolean isTransient; // /** // * A direct {@link ByteBuffer} that will be used as the write cache for the // * live journal and which will be handed off from live journal to live // * journal during overflow processing which is allocated iff // * {@link BufferMode#Disk} is chosen. // * <p> // * Note: This design is motivated by by JVM bug <a // * href="http://bugs.sun.com/bugdatabase/view_bug.do;jsessionid=8fab76d1d4479fffffffffa5abfb09c719a30?bug_id=6210541"> // * 6210541</a> which describes a failure by // * <code>releaseTemporaryDirectBuffer()</code> to release temporary direct // * {@link ByteBuffer}s that are allocated for channel IO. 
    /**
     * A cache that is used to automatically close out unused
     * {@link IndexSegmentStore}s. An {@link IndexSegment} that is no longer
     * used will have its reference cleared when it is swept by the garbage
     * collector and will automatically release all of its buffers (node and
     * leaf cache, etc). However, at that point the {@link IndexSegmentStore}
     * is still open, and it can buffer a significant amount of data in
     * addition to the file handle.
     * <p>
     * When the weak reference is cleared we know that there are no longer any
     * hard references to the {@link IndexSegment} and hence the corresponding
     * {@link IndexSegmentStore} should be closed. In fact, we can immediately
     * remove the {@link IndexSegmentStore} from the cache of open stores and
     * then close the store. At this point if the store is re-opened it will
     * be a new object. This is easy enough to do since the {@link UUID} of
     * the {@link IndexSegmentStore} is the key in our map!
     *
     * @see Options#STORE_CACHE_CAPACITY
     * @see Options#STORE_CACHE_TIMEOUT
     */
//    final protected WeakValueCache<UUID, IRawStore> storeCache;
    final protected ConcurrentWeakValueCacheWithTimeout<UUID, IRawStore> storeCache;

    /**
     * Provides locks on a per-{resourceUUID} basis for higher concurrency.
     */
    private final transient NamedLock<UUID> namedLock = new NamedLock<UUID>();

    /**
     * The #of entries in the hard reference cache for {@link IRawStore}s,
     * including both {@link ManagedJournal}s and {@link IndexSegment}s. There
     * MAY be more {@link IRawStore}s open than are reported by this method if
     * there are hard references held by the application to those
     * {@link IRawStore}s. {@link IRawStore}s that are not fixed by a hard
     * reference will be quickly finalized by the JVM.
     */
    public int getStoreCacheSize() {
        return storeCache.size();
    }

    /**
     * <code>true</code> iff {@link BufferMode#Transient} was indicated.
     */
    private final boolean isTransient;

//    /**
//     * A direct {@link ByteBuffer} that will be used as the write cache for
//     * the live journal and which will be handed off from live journal to
//     * live journal during overflow processing which is allocated iff
//     * {@link BufferMode#Disk} is chosen.
//     * <p>
//     * Note: This design is motivated by JVM bug <a
//     * href="http://bugs.sun.com/bugdatabase/view_bug.do;jsessionid=8fab76d1d4479fffffffffa5abfb09c719a30?bug_id=6210541">
//     * 6210541</a> which describes a failure by
//     * <code>releaseTemporaryDirectBuffer()</code> to release temporary
//     * direct {@link ByteBuffer}s that are allocated for channel IO.
//     *
//     * @see com.bigdata.journal.Options#WRITE_CACHE_CAPACITY
//     * @see DiskOnlyStrategy
//     */
//    private ByteBuffer writeCache;

    /**
     * An atomic hard reference to the live journal.
     */
    final protected AtomicReference<ManagedJournal> liveJournalRef = new AtomicReference<ManagedJournal>(
            null);

    /**
     * <code>true</code> initially and remains <code>true</code> until the
     * {@link ResourceManager} is shutdown.
     *
     * @see #isOpen()
     */
    private final AtomicBoolean open = new AtomicBoolean(true);

    /**
     * <code>true</code> initially and until {@link #start()} completes
     * successfully. This is used to disambiguate the startup transient state
     * from the shutdown state.
     *
     * @see #isStarting()
     */
    private final AtomicBoolean starting = new AtomicBoolean(true);

    /**
     * The service used to send files to other data services and to exchange
     * NIO {@link ByteBuffer}s in support of distributed query processing.
     */
    private ManagedResourceService resourceService;

    /**
     * The service used to send files to other data services and to exchange
     * NIO {@link ByteBuffer}s in support of distributed query processing.
     */
    public ManagedResourceService getResourceService() {
        assertRunning();
        return resourceService;
    }

//    /**
//     * The port at which you can connect to the {@link ResourceService}.
//     * This service provides remote access to resources hosted by the owning
//     * {@link DataService}. This is used for moving resources to other data
//     * services in the federation, including supporting service failover.
//     *
//     * @return The port used to connect to that service.
//     *
//     * @todo this could also be used for remote backup. however, note that
//     *       you can not read the live journal using this object.
//     */
//    public int getResourceServicePort() {
//
//        assertRunning();
//
//        return resourceService.port;
//
//    }

    /**
     * @see Options#IGNORE_BAD_FILES
     */
    private final boolean ignoreBadFiles;

    /**
     * @see Options#PURGE_OLD_RESOURCES_DURING_STARTUP
     */
    private final boolean purgeOldResourcesDuringStartup;

    /**
     * @see Options#ACCELERATE_OVERFLOW_THRESHOLD
     */
    protected final long accelerateOverflowThreshold;

    /**
     * Used to run the {@link Startup} task.
     *
     * @todo Defer {@link Startup} to an init() method invoked outside of the
     *       ctor.
     */
    private final ExecutorService startupService = Executors
            .newSingleThreadExecutor(new DaemonThreadFactory(getClass()
                    .getName() + ".startupService"));

    /**
     * Succeeds if the {@link StoreManager} {@link #isOpen()} and is NOT
     * {@link #isStarting()} (the test itself is NOT atomic).
     *
     * @throws IllegalStateException
     *             unless open and not starting.
     */
    protected void assertRunning() {
        if (!isOpen())
            throw new IllegalStateException("Not open");
        if (isStarting())
            throw new IllegalStateException("Starting up");
    }

    /**
     * Return <code>true</code> iff the {@link StoreManager} is open and
     * startup processing has been completed.
     */
    public boolean isRunning() {
        return isOpen() && !isStarting();
    }

    /**
     * @throws IllegalStateException
     *             unless open.
     */
    protected void assertOpen() {
        if (!isOpen())
            throw new IllegalStateException();
    }

    /**
     * @throws IllegalStateException
     *             if open.
     */
    protected void assertNotOpen() {
        if (isOpen())
            throw new IllegalStateException();
    }

    /**
     * Return <code>true</code> iff the {@link StoreManager} is running. If
     * the {@link StoreManager} is currently starting up, then this will await
     * the completion of the {@link Startup} task.
     *
     * @return <code>true</code> if the {@link StoreManager} is running and
     *         <code>false</code> if it is shutdown.
     */
    public boolean awaitRunning() {
        while (isOpen() && isStarting()) {
            try {
                if (log.isInfoEnabled())
                    log.info("Waiting on startup : " + dataDir + " ...");
                Thread.sleep(1000/* ms */);
            } catch (InterruptedException ex) {
                throw new RuntimeException("Interrupted awaiting startup: "
                        + ex);
            }
        }
        return isRunning();
    }

    /**
     * A map from the resource UUID to the absolute {@link File} for that
     * resource.
     * <p>
     * Note: The {@link IResourceMetadata} reported by an
     * {@link AbstractJournal} or {@link IndexSegmentStore} generally reflects
     * the name of the file as specified to the ctor for that class, so it may
     * be relative to some arbitrary directory or absolute within the file
     * system.
     */
    private final Map<UUID, File> resourceFiles = new HashMap<UUID, File>();

    /**
     * The properties given to the ctor.
     */
    private final Properties properties;

    /**
     * Release time is zero (0L) until notified otherwise - 0L is ignored.
     *
     * @see #setReleaseTime(long)
     */
    private long releaseTime = 0L;

    /**
     * The elapsed #of milliseconds in {@link #purgeOldResources()}.
     */
    protected long purgeResourcesMillis = 0L;

    /**
     * The last value computed by {@link #getEffectiveReleaseTime()} and ZERO
     * (0) until a value has been calculated.
     */
    protected long lastCommitTimePreserved = 0L;

    /**
     * The last commit time corresponding to the last synchronous overflow
     * event and ZERO (0L) until there has been a synchronous overflow event.
     */
    protected long lastOverflowTime = 0L;

    /**
     * The observed maximum size of a journal (its length in bytes) as
     * measured at each synchronous overflow event.
     */
    protected long maximumJournalSizeAtOverflow = 0L;

    /**
     * The #of {@link ManagedJournal}s that have been (re-)opened to date.
     */
    final protected AtomicLong journalReopenCount = new AtomicLong();

    /**
     * The #of {@link IndexSegmentStore}s that have been (re-)opened to date.
     */
    final protected AtomicLong segmentStoreReopenCount = new AtomicLong();

    /**
     * The #of {@link ManagedJournal}s that have been deleted to date.
     */
    final protected AtomicLong journalDeleteCount = new AtomicLong();

    /**
     * The #of {@link IndexSegmentStore}s that have been deleted to date.
     */
    final protected AtomicLong segmentStoreDeleteCount = new AtomicLong();

    /**
     * The #of bytes currently under management EXCEPT those on the live
     * journal. This is incremented each time a new resource is added using
     * {@link #addResource(IResourceMetadata, File)} and decremented each time
     * a resource is deleted.
     */
    final protected AtomicLong bytesUnderManagement = new AtomicLong();

    final protected AtomicLong journalBytesUnderManagement = new AtomicLong();

    final protected AtomicLong segmentBytesUnderManagement = new AtomicLong();

    /**
     * The #of bytes that have been deleted since startup.
     */
    final protected AtomicLong bytesDeleted = new AtomicLong();

    /**
     * The #of bytes currently under management, including those written on
     * the live journal.
     *
     * @throws IllegalStateException
     *             during startup or if the {@link StoreManager} is closed.
     */
    public long getBytesUnderManagement() {
        assertRunning();
        return bytesUnderManagement.get()
                + getLiveJournal().getBufferStrategy().getExtent();
    }

    /**
     * The #of bytes in {@link ManagedJournal}s, including those written on
     * the live journal.
     *
     * @throws IllegalStateException
     *             during startup or if the {@link StoreManager} is closed.
     */
    public long getJournalBytesUnderManagement() {
        assertRunning();
        return journalBytesUnderManagement.get()
                + getLiveJournal().getBufferStrategy().getExtent();
    }

    /**
     * The #of bytes in managed {@link IndexSegmentStore}s.
     *
     * @throws IllegalStateException
     *             during startup or if the {@link StoreManager} is closed.
     */
    public long getSegmentBytesUnderManagement() {
        assertRunning();
        return segmentBytesUnderManagement.get();
    }

    /**
     * The #of bytes of free space remaining on the volume hosting the
     * {@link #dataDir}.
     *
     * @return The #of bytes of free space remaining -or- <code>-1L</code> if
     *         the free space could not be determined.
     */
    public long getDataDirFreeSpace() {
        return getFreeSpace(dataDir);
    }

    /**
     * The #of bytes of free space remaining on the volume hosting the
     * {@link #tmpDir}.
     *
     * @return The #of bytes of free space remaining -or- <code>-1L</code> if
     *         the free space could not be determined.
     */
    public long getTempDirFreeSpace() {
        return getFreeSpace(tmpDir);
    }

    /**
     * Return the free space in bytes on the volume hosting some directory.
     *
     * @param dir
     *            A directory hosted on some volume.
     *
     * @return The #of bytes of free space remaining for the volume hosting
     *         the directory -or- <code>-1L</code> if the free space could not
     *         be determined.
     */
    /*
     * Note: This was written using Apache FileSystemUtil originally. That
     * would shell out "df" under un*x. Unfortunately, shelling out a child
     * process requires a commitment from the OS to support a process with as
     * much process space as the parent. For the data service, that is a lot
     * of RAM. In general, the O/S allows "overcommitment" of the available
     * swap space, but you can run out of swap and then you have a problem. If
     * the host was configured with scanty swap, then this problem could be
     * triggered very easily and would show up as "Could not allocate memory".
     *
     * See http://forums.sun.com/thread.jspa?messageID=9834041#9834041
     */
    private long getFreeSpace(final File dir) {
        try {
            if (!dir.exists()) {
                return -1;
            }
            /*
             * Note: This returns 0L if there is no free space or if the File
             * does not "name" a partition in the file system semantics. That
             * is why we check dir.exists() above.
             */
            return dir.getUsableSpace();
        } catch (Throwable t) {
            log.error("Could not get free space: dir=" + dir + " : " + t, t);
            // the error is logged and ignored.
            return -1L;
        }
    }

//    /**
//     * Return the free space in bytes on the volume hosting some directory.
//     * <p>
//     * Note: This uses the apache IO commons {@link FileSystemUtils} to
//     * report the free space on the volume hosting the directory and then
//     * converts kb to bytes.
//     *
//     * @param dir
//     *            A directory hosted on some volume.
//     *
//     * @return The #of bytes of free space remaining for the volume hosting
//     *         the directory -or- <code>-1L</code> if the free space could
//     *         not be determined.
//     *
//     * @see http://commons.apache.org/io/api-release/org/apache/commons/io/FileSystemUtils.html
//     */
//    private long getFreeSpace(final File dir) {
//
//        try {
//
//            return FileSystemUtils.freeSpaceKb(dir.toString())
//                    * Bytes.kilobyte;
//
//        } catch (Throwable t) {
//
//            log.error("Could not get free space: dir=" + dir + " : " + t, t);
//
//            // the error is logged and ignored.
//            return -1L;
//
//        }
//
//    }

    /**
     * An object wrapping the {@link Properties} given to the ctor.
     */
    public Properties getProperties() {
        return new Properties(this.properties);
    }

    /**
     * Return <code>true</code> iff data can not be made restart-safe.
     */
    public boolean isTransient() {
        return isTransient;
    }
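
    /*
     * A minimal usage sketch (hypothetical: StoreManager is abstract, so in
     * practice a concrete subclass such as ResourceManager is instantiated):
     * the ctor returns immediately while Startup runs asynchronously, so
     * callers must wait for startup to finish before using the manager.
     *
     *   final Properties p = ...; // see Options above.
     *   final StoreManager mgr = ...; // concrete subclass instance.
     *   if (!mgr.awaitRunning()) {
     *       // the manager was shutdown before startup completed.
     *       throw new IllegalStateException("StoreManager did not start");
     *   }
     *   // safe to use now, e.g.:
     *   final long bytes = mgr.getBytesUnderManagement();
     */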
    /**
     * Note: This constructor starts an asynchronous thread that scans the
     * data directory for journals and index segments and creates the initial
     * journal if no store files are found.
     * <p>
     * Note: The store files are NOT accessible until the asynchronous startup
     * is finished. Callers MUST verify {@link StoreManager#isOpen()} and MUST
     * NOT submit tasks until {@link StoreManager#isStarting()} returns
     * <code>false</code>.
     *
     * @param properties
     *            See {@link Options}.
     *
     * @see Startup
     */
    protected StoreManager(final Properties properties) {
        if (properties == null)
            throw new IllegalArgumentException();
        this.properties = properties;
        // ignoreBadFiles
        {
            ignoreBadFiles = Boolean.parseBoolean(properties.getProperty(
                    Options.IGNORE_BAD_FILES,
                    Options.DEFAULT_IGNORE_BAD_FILES));
            if (log.isInfoEnabled())
                log.info(Options.IGNORE_BAD_FILES + "=" + ignoreBadFiles);
        }
        // purgeOldResourcesDuringStartup
        {
            purgeOldResourcesDuringStartup = Boolean.parseBoolean(properties
                    .getProperty(Options.PURGE_OLD_RESOURCES_DURING_STARTUP,
                            Options.DEFAULT_PURGE_OLD_RESOURCES_DURING_STARTUP));
            if (log.isInfoEnabled())
                log.info(Options.PURGE_OLD_RESOURCES_DURING_STARTUP + "="
                        + purgeOldResourcesDuringStartup);
        }
        // accelerateOverflowThreshold
        {
            accelerateOverflowThreshold = Long.parseLong(properties
                    .getProperty(Options.ACCELERATE_OVERFLOW_THRESHOLD,
                            Options.DEFAULT_ACCELERATE_OVERFLOW_THRESHOLD));
            if (log.isInfoEnabled())
                log.info(Options.ACCELERATE_OVERFLOW_THRESHOLD + "="
                        + accelerateOverflowThreshold);
            if (accelerateOverflowThreshold < 0) {
                throw new RuntimeException(
                        Options.ACCELERATE_OVERFLOW_THRESHOLD
                                + " must be non-negative");
            }
        }
        /*
         * storeCacheCapacity
         */
        {
            final int storeCacheCapacity = Integer.parseInt(properties
                    .getProperty(Options.STORE_CACHE_CAPACITY,
                            Options.DEFAULT_STORE_CACHE_CAPACITY));
            if (log.isInfoEnabled())
                log.info(Options.STORE_CACHE_CAPACITY + "="
                        + storeCacheCapacity);
            if (storeCacheCapacity <= 0)
                throw new RuntimeException(Options.STORE_CACHE_CAPACITY
                        + " must be positive");
            final long storeCacheTimeout = Long.parseLong(properties
                    .getProperty(Options.STORE_CACHE_TIMEOUT,
                            Options.DEFAULT_STORE_CACHE_TIMEOUT));
            if (log.isInfoEnabled())
                log.info(Options.STORE_CACHE_TIMEOUT + "="
                        + storeCacheTimeout);
            if (storeCacheTimeout < 0)
                throw new RuntimeException(Options.STORE_CACHE_TIMEOUT
                        + " must be non-negative");
            storeCache = new ConcurrentWeakValueCacheWithTimeout<UUID, IRawStore>(
                    storeCacheCapacity, TimeUnit.MILLISECONDS
                            .toNanos(storeCacheTimeout));
//            storeCache = new WeakValueCache<UUID, IRawStore>(
//                    new LRUCache<UUID, IRawStore>(storeCacheCapacity));
        }
//        /*
//         * Allocate an optional write cache that will be passed from live
//         * journal to live journal during overflow.
//         */
//        {
//
//            writeCache = AbstractJournal.getWriteCache(properties);
//
//        }
        /*
         * Create the _transient_ indices in which we will store the mapping
         * from the commit times of the journals to their resource
         * descriptions.
         */
        journalIndex = JournalIndex.createTransient(); //(tmpStore);
        segmentIndex = IndexSegmentIndex.createTransient(); //(tmpStore);
        if (log.isInfoEnabled())
            log.info("Current working directory: "
                    + new File(".").getAbsolutePath());
        // true iff a transient journal was requested.
        isTransient = BufferMode.valueOf(properties.getProperty(
                Options.BUFFER_MODE, Options.DEFAULT_BUFFER_MODE.toString())) == BufferMode.Transient;
        /*
         * data directory.
         */
        if (isTransient) {
            /*
             * Transient.
             */
            dataDir = null;
            journalsDir = null;
            segmentsDir = null;
        } else {
            /*
             * Persistent.
             */
            // Note: dataDir is _canonical_
            final File dataDir;
            try {
                final String val = properties.getProperty(Options.DATA_DIR);
                if (val == null) {
                    throw new RuntimeException("Required property: "
                            + Options.DATA_DIR);
                }
                // Note: stored in canonical form.
                dataDir = new File(val).getCanonicalFile();
                if (log.isInfoEnabled())
                    log.info(Options.DATA_DIR + "=" + dataDir);
                journalsDir = new File(dataDir, "journals").getCanonicalFile();
                segmentsDir = new File(dataDir, "segments").getCanonicalFile();
            } catch (IOException ex) {
                throw new RuntimeException(ex);
            }
            if (!dataDir.exists()) {
                if (log.isInfoEnabled())
                    log.info("Creating: " + dataDir);
                if (!dataDir.mkdirs()) {
                    throw new RuntimeException("Could not create directory: "
                            + dataDir.getAbsolutePath());
                }
            }
            if (!journalsDir.exists()) {
                if (log.isInfoEnabled())
                    log.info("Creating: " + journalsDir);
                if (!journalsDir.mkdirs()) {
                    throw new RuntimeException("Could not create directory: "
                            + journalsDir.getAbsolutePath());
                }
            }
            if (!segmentsDir.exists()) {
                if (log.isInfoEnabled())
                    log.info("Creating: " + segmentsDir);
                if (!segmentsDir.mkdirs()) {
                    throw new RuntimeException("Could not create directory: "
                            + segmentsDir.getAbsolutePath());
                }
            }
            // verify all are directories vs regular files.
            if (!dataDir.isDirectory()) {
                throw new RuntimeException("Not a directory: "
                        + dataDir.getAbsolutePath());
            }
            if (!journalsDir.isDirectory()) {
                throw new RuntimeException("Not a directory: "
                        + journalsDir.getAbsolutePath());
            }
            if (!segmentsDir.isDirectory()) {
                throw new RuntimeException("Not a directory: "
                        + segmentsDir.getAbsolutePath());
            }
            this.dataDir = dataDir;
        }
        // temp directory.
        {
            // Note: tmpDir is _canonical_
            final File tmpDir;
            try {
                tmpDir = new File(properties.getProperty(Options.TMP_DIR,
                        System.getProperty("java.io.tmpdir")))
                        .getCanonicalFile();
            } catch (IOException ex) {
                throw new RuntimeException(ex);
            }
            if (log.isInfoEnabled())
                log.info(Options.TMP_DIR + "=" + tmpDir);
            if (!tmpDir.exists()) {
                if (log.isInfoEnabled())
                    log.info("Creating temp directory: " + tmpDir);
                if (!tmpDir.mkdirs()) {
                    throw new RuntimeException("Could not create directory: "
                            + tmpDir.getAbsolutePath());
                }
            }
            if (!tmpDir.isDirectory()) {
                throw new RuntimeException("Not a directory: "
                        + tmpDir.getAbsolutePath());
            }
            this.tmpDir = tmpDir;
        }
        /*
         * Asynchronous startup processing.
         */
        startupService.submit(new Startup());
    }

    /**
     * Runs a startup scan of the data directory and creates the initial
     * journal if none was found. If the {@link Startup} task fails or is
     * interrupted then the {@link StoreManager} will be
     * {@link StoreManager#shutdownNow()}.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     */
    private class Startup implements Runnable {

        @Override
        public void run() {
            try {
                try {
                    start();
                    // successful startup
                    starting.set(false);
                    // Purge any resources that we no longer require.
                    if (purgeOldResourcesDuringStartup)
                        purgeOldResources();
                } catch (Throwable ex) {
                    // avoid possibility that isRunning() could become true.
                    open.set(false);
                    log.error("Problem during startup? : " + ex, ex);
                    shutdownNow();
                    // terminate Startup task.
                    throw new RuntimeException(ex);
                }
            } finally {
                /*
                 * Whether or not startup was successful, we make sure that
                 * this flag is turned off.
                 */
                starting.set(false);
                if (log.isInfoEnabled())
                    log.info("Startup "
                            + (isOpen() ? "successful" : "failed")
                            + " : "
                            + (isTransient ? "transient" : Options.DATA_DIR
                                    + "=" + dataDir));
            }
        }

        /**
         * Starts up the {@link StoreManager}.
         * <p>
         * Note: Implementations of this method MUST be
         * <code>synchronized</code>.
         *
         * @throws IllegalStateException
         *             if the {@link IConcurrencyManager} has not been set
         *             (after a timeout).
         * @throws IllegalStateException
         *             if the {@link ResourceManager} is already running.
         * @throws InterruptedException
         *             if the startup scan is interrupted.
         * @throws RuntimeException
         *             if bad files are encountered, etc.
         */
        final private void start() throws InterruptedException {
            if (!isStarting()) {
                throw new IllegalStateException();
            }
            /*
             * Verify that the concurrency manager has been set and wait a
             * while if it is not available yet.
             */
            {
                int nwaits = 0;
                while (true) {
                    try {
                        getConcurrencyManager();
                        break;
                    } catch (IllegalStateException ex) {
                        Thread.sleep(100/* ms */);
                        if (++nwaits % 50 == 0)
                            log.warn("Waiting for concurrency manager");
                    }
                }
            }
            try {
                final IBigdataFederation<?> fed = getFederation();
                if (fed == null) {
                    /*
                     * Some of the unit tests do not start the txs until after
                     * the DataService. For those unit tests getFederation()
                     * will return null during startup() of the DataService.
                     * To have a common code path, we throw the exception here
                     * which is caught below.
                     */
                    throw new UnsupportedOperationException();
                }
                while (true) {
                    if (fed.getTransactionService() != null) {
                        break;
                    }
                    log.warn("Waiting for transaction service discovery");
                }
            } catch (UnsupportedOperationException ex) {
                log.warn("Federation not available - running in test case?");
            }
            /*
             * Look for pre-existing data files.
             */
            if (!isTransient) {
                if (log.isInfoEnabled())
                    log.info("Starting scan of data directory: " + dataDir);
                final Stats stats = new Stats();
                scanDataDirectory(dataDir, stats);
                final int nbad = stats.badFiles.size();
                if (log.isInfoEnabled())
                    log.info("Scan results: " + stats);
                if (!stats.badFiles.isEmpty()) {
                    if (ignoreBadFiles) {
                        log.warn("The following " + nbad
                                + " file(s) had problems and are being ignored: "
                                + stats.badFiles);
                    } else {
                        /*
                         * Note: This exception will be thrown if we could not
                         * get a lock on a journal file (see FileMetadata -
                         * the lock error is not reported until we try to read
                         * the magic field) or if there is a problem with the
                         * data in the file. You have to examine the stack
                         * trace to see what the root cause is.
                         */
                        final String msg = "Could not open " + nbad
                                + " files - will not start : problem files="
                                + stats.badFiles;
                        log.fatal(msg);
                        throw new RuntimeException(msg);
                    }
                }
                assert journalIndex.getEntryCount() == stats.njournals;
                assert segmentIndex.getEntryCount() == stats.nsegments;
                assert resourceFiles.size() + nbad == stats.nfiles : "#resourceFiles="
                        + resourceFiles.size()
                        + ", #nbad="
                        + nbad
                        + ", nfiles=" + stats.nfiles;
            }
            /*
             * Open the live journal.
             */
            openLiveJournal();
//            /*
//             * Purge any index partition moves which did not complete before
//             * shutdown.
//             */
//            purgeIncompleteMoves();
            /*
             * Notify the transaction service of the last commit time for the
             * live journal for this data service. This will be zero (0L) iff
             * this is a new journal on a new data service.
             *
             * Note: This notification is not required unless the commit time
             * log for the transaction service is lost. In that case it
             * provides a backup allowing new transactions to read from the
             * last global commit point (once all data services have joined).
             */
            final long lastCommitTime = liveJournalRef.get()
                    .getLastCommitTime();
            if (lastCommitTime != 0L) {
                getConcurrencyManager().getTransactionManager().notifyCommit(
                        lastCommitTime);
            }
            try {
                resourceService = new ManagedResourceService(
                        new InetSocketAddress(
                                InetAddress.getByName(NicUtil.getIpAddress(
                                        "default.nic"/* systemPropertyName */,
                                        "default"/* defaultNicName */,
                                        false/* loopbackOk */)), 0/* port */),
                        0/* requestServicePoolSize */) {
                    @Override
                    protected File getResource(final UUID uuid)
                            throws Exception {
                        if (!isRunning()) {
                            throw new Exception("Not running.");
                        }
                        return resourceFiles.get(uuid);
                    }
                };
            } catch (IOException ex) {
                throw new RuntimeException("Could not start: "
                        + resourceService, ex);
            }
        }

        /**
         * Open the "live" journal.
         */
        private void openLiveJournal() throws InterruptedException {
            if (log.isInfoEnabled())
                log.info("Creating/opening the live journal: dataDir="
                        + dataDir);
            if (Thread.interrupted())
                throw new InterruptedException();
            final Properties p = getProperties();
            final File file;
            final boolean newJournal;
            if (journalIndex.getEntryCount() == 0) {
                /*
                 * There are no existing journal files. Create a new journal
                 * using a unique filename in the appropriate subdirectory of
                 * the data directory. Since the file is empty, it will be
                 * initialized as a new Journal.
                 */
                if (log.isInfoEnabled())
                    log.info("Creating initial journal: dataDir=" + dataDir);
                // unique file name for new journal.
                if (isTransient) {
                    file = null;
                } else {
                    try {
                        file = File.createTempFile("journal", // prefix
                                Options.JNL, // suffix
                                journalsDir // directory
                                ).getCanonicalFile();
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
                /*
                 * Set the createTime on the new journal resource.
                 */
                p.setProperty(Options.CREATE_TIME,
                        Long.toString(nextTimestamp()));
                overrideJournalExtent(p);
                newJournal = true;
            } else {
                /*
                 * There is at least one pre-existing journal file, so we open
                 * the one with the largest timestamp - this will be the most
                 * current journal and the one that will receive writes until
                 * it overflows.
                 */
                // resource metadata for journal with the largest timestamp.
                final IResourceMetadata resource = journalIndex
                        .find(Long.MAX_VALUE);
                if (log.isInfoEnabled())
                    log.info("Will open as live journal: " + resource);
                assert resource != null : "No resource? : timestamp="
                        + Long.MAX_VALUE;
                // lookup absolute file for that resource.
                file = resourceFiles.get(resource.getUUID());
                if (file == null) {
                    throw new NoSuchStoreException(resource.getUUID());
                }
                if (log.isInfoEnabled())
                    log.info("Opening most recent journal: " + file
                            + ", resource=" + resource);
                newJournal = false;
            }
            if (!isTransient) {
                assert file.isAbsolute() : "Path must be absolute: " + file;
                p.setProperty(Options.FILE, file.toString());
            }
            if (log.isInfoEnabled())
                log.info("Open/create of live journal: newJournal="
                        + newJournal + ", file=" + file);
            // Create/open journal.
            {
                if (Thread.interrupted())
                    throw new InterruptedException();
                final ManagedJournal tmp = new ManagedJournal(p);
                if (newJournal) {
                    // add to the set of managed resources.
                    addResource(tmp.getResourceMetadata(), tmp.getFile());
                }
                /*
                 * Add to set of open stores.
                 *
                 * Note: single-threaded during startup.
                 */
                storeCache.put(tmp.getRootBlockView().getUUID(), tmp);
//                storeCache.put(tmp.getRootBlockView().getUUID(), tmp, false/* dirty */);
                if (Thread.interrupted())
                    throw new InterruptedException();
                liveJournalRef.set(tmp);
                /*
                 * Subtract out the #of bytes in the live journal.
                 */
                final long extent = -tmp.getBufferStrategy().getExtent();
                bytesUnderManagement.addAndGet(extent);
                journalBytesUnderManagement.addAndGet(extent);
            }
        }

//        /**
//         * Purge any index partition moves which did not complete
//         * successfully on restart. These index partitions are identified by
//         * scanning the indices registered on the live journal. If an index
//         * has <code>sourcePartitionId != -1</code> in its
//         * {@link LocalPartitionMetadata} then the index was being moved onto
//         * this {@link IDataService} when the service was shutdown. The index
//         * (together with any {@link IndexSegment} resources that are
//         * identified in its {@link LocalPartitionMetadata}) is deleted.
//         *
//         * @todo write a unit test for this feature.
//         *
//         * @todo test MDS to verify that the index partition flagged as an
//         *       incomplete move is not registered as part of scale-out
//         *       index?
//         *
//         * @deprecated This is no longer necessary. The new MOVE does not use
//         *             the {@link LocalPartitionMetadata#getSourcePartitionId()}
//         *             field. Index segments are cleaned up during a failed
//         *             receive. If the index segment for some reason is NOT
//         *             cleaned up, then it will be released eventually
//         *             (unless an immortal database is being used) since it
//         *             will not be incorporated into any index partition
//         *             view.
//         */
//        private void purgeIncompleteMoves() {
//
//            final boolean reallyDelete = true;
//
//            final ManagedJournal liveJournal = liveJournalRef.get();
//
//            // using read-committed view of Name2Addr
//            final ITupleIterator itr = liveJournal.getName2Addr()
//                    .rangeIterator();
//
//            // the list of indices that will be dropped.
//            final List<String> toDrop = new LinkedList<String>();
//
//            while (itr.hasNext()) {
//
//                final ITuple tuple = itr.next();
//
//                final Entry entry = EntrySerializer.INSTANCE
//                        .deserialize(new DataInputBuffer(tuple.getValue()));
//
//                /*
//                 * Open the mutable btree on the journal (not the full view
//                 * of that index).
//                 */
//                final BTree btree = (BTree) liveJournal
//                        .getIndex(entry.checkpointAddr);
//
//                final String name = btree.getIndexMetadata().getName();
//
//                final LocalPartitionMetadata pmd = btree.getIndexMetadata()
//                        .getPartitionMetadata();
//
//                if (pmd != null) {
//
////                    System.err.println("\nname=" + name + "\npmd=" + pmd);
//
//                    if (pmd.getSourcePartitionId() != -1) {
//
//                        log.warn("Incomplete index partition move: name="
//                                + name + ", pmd=" + pmd);
//
//                        for (IResourceMetadata resource : pmd.getResources()) {
//
//                            if (resource.isIndexSegment()) {
//
//                                final File file = resourceFiles.get(resource
//                                        .getUUID());
//
////                                final File file = new File(segmentsDir,
////                                        resource.getFile());
//
//                                log.warn("Deleting index segment: " + file);
//
//                                if (file.exists()) {
//
//                                    if (reallyDelete) {
//
//                                        deleteResource(resource.getUUID(),
//                                                false/* isJournal */);
//
//                                    }
//
//                                } else {
//
//                                    log.warn("Could not locate file: " + file);
//
//                                }
//
//                            }
//
//                        }
//
//                    }
//
//                }
//
//                if (!toDrop.isEmpty() && reallyDelete) {
//
//                    for (String s : toDrop) {
//
//                        liveJournal.dropIndex(s);
//
//                    }
//
//                    liveJournal.commit();
//
//                }
//
//            }
//
//        } // purgeIncompleteMoves()

    } // class Startup

    /**
     * <code>true</code> initially and until {@link #start()} completes
     * successfully.
     */
    public boolean isStarting() {
        return starting.get();
    }

    /**
     * <code>true</code> initially and remains <code>true</code> until either
     * {@link #shutdown()} or {@link #shutdownNow()} is invoked.
     */
    @Override
    public boolean isOpen() {
        return open.get();
    }

//    /**
//     * Clears any stale entries in the LRU backing the {@link #storeCache}
//     */
//    public void clearStaleCacheEntries() {
//
//        storeCache.clearStaleRefs();
//
//    }

    @Override
    synchronized public void shutdown() {
        if (log.isInfoEnabled())
            log.info("");
        final boolean wasOpen = this.open.get();
        /*
         * Note: clear before we clear [starting] or the
         * StoreManager#isRunning() could report true.
         */
        this.open.set(false);
        // Note: if startup is running, then cancel immediately.
        startupService.shutdownNow();
        // failsafe clear : note that [open] is already false.
        starting.set(false);
        if (!wasOpen)
            return;
        try {
            closeStores();
        } catch (Exception ex) {
            log.warn(ex.getMessage(), ex);
        }
        if (resourceService != null) {
            resourceService.shutdown();
            resourceService = null;
        }
//        try {
//            tmpStore.destroy();
//        } catch (Exception ex) {
//            log.warn(ex.getMessage(), ex);
//        }
//        // release the write cache.
//        writeCache = null;
    }

    @Override
    synchronized public void shutdownNow() {
        if (log.isInfoEnabled())
            log.info("");
        final boolean wasOpen = this.open.get();
        /*
         * Note: clear before we clear [starting] or the
         * StoreManager#isRunning() could report true.
         */
        this.open.set(false);
        startupService.shutdownNow();
        // failsafe clear : note that [open] is already false.
        starting.set(false);
        if (!wasOpen)
            return;
        try {
            closeStores();
        } catch (Exception ex) {
            log.warn(ex.getMessage(), ex);
        }
        if (resourceService != null) {
            resourceService.shutdownNow();
            resourceService = null;
        }
//        try {
//            tmpStore.destroy();
//        } catch (Exception ex) {
//            log.warn(ex.getMessage(), ex);
//        }
//        // release the write cache.
//        writeCache = null;
    }
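
    /*
     * Sketch of the teardown ordering implied above (an observation on the
     * code, not a prescribed usage): both methods flip [open] before
     * [starting] so that isRunning() can never report true during teardown,
     * cancel any in-flight Startup task, close the open stores, and then take
     * down the resource service (politely for shutdown(), immediately for
     * shutdownNow()).
     *
     *   mgr.shutdown();    // or mgr.shutdownNow(); both are idempotent.
     */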
    /**
     * Helper class gathers statistics about files during a scan.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     */
    private static class Stats {

        /** #of files scanned. */
        public int nfiles;

        /** #of journal files scanned. */
        public int njournals;

        /** #of index segment files found. */
        public int nsegments;

        /** A list of all bad files found during the scan. */
        public Collection<String> badFiles = Collections
                .synchronizedCollection(new TreeSet<String>());

        /** total #of bytes of user data found in those files. */
        public long nbytes;

        @Override
        public String toString() {
            return "Stats{nfiles=" + nfiles + ", njournals=" + njournals
                    + ", nsegments=" + nsegments + ", nbad="
                    + badFiles.size() + ", nbytes=" + nbytes + ", badFiles="
                    + badFiles + "}";
        }

    }

    /**
     * Recursively scan a directory structure identifying all journal and
     * index segment resources and populating the internal
     * {@link #resourceFiles} map. In addition, all journal files are listed
     * in the {@link #journalIndex} map so that we can find the relevant
     * journal quickly for a given timestamp.
     * <p>
     * Note: This requires that we open each resource in order to extract its
     * {@link IResourceMetadata} description. We only open the
     * {@link IRawStore} for the resource, not its indices. The stores are
     * closed again immediately.
     *
     * @param dir
     *            A directory to scan.
     *
     * @throws InterruptedException
     */
    private void scanDataDirectory(File dir, Stats stats)
            throws InterruptedException {
        if (dir == null)
            throw new IllegalArgumentException();
        if (!dir.isDirectory())
            throw new IllegalArgumentException();
        if (Thread.interrupted())
            throw new InterruptedException();
        final File[] files = dir.listFiles(newFileFilter());
        for (final File file : files) {
            if (file.isDirectory()) {
                scanDataDirectory(file, stats);
            } else {
                scanFile(file, stats);
            }
        }
    }

    private void scanFile(final File file, final Stats stats)
            throws InterruptedException {
        if (Thread.interrupted())
            throw new InterruptedException();
        if (log.isInfoEnabled())
            log.info("Scanning file: " + file + ", stats=" + stats);
        final IResourceMetadata resource;
        // name of the file.
        final String name = file.getName();
        // #of bytes in the file as reported by the OS.
        final long len = file.length();
        if (len > 0 && name.endsWith(Options.JNL)) {
            final Properties properties = getProperties();
            properties.setProperty(Options.FILE, file.getAbsolutePath());
            // Note: no writes allowed during startup.
            // Note: disables the write cache among other things.
            properties.setProperty(Options.READ_ONLY, "true");
            final AbstractJournal tmp;
            try {
                tmp = new ManagedJournal(properties);
            } catch (Exception ex) {
                log.error("Problem opening journal: file="
                        + file.getAbsolutePath(), ex);
                stats.nfiles++;
                stats.badFiles.add(file.getAbsolutePath());
                return;
            }
            try {
                resource = tmp.getResourceMetadata();
                stats.nfiles++;
                stats.njournals++;
                stats.nbytes += len;
            } finally {
                tmp.close();
            }
        } else if (len > 0 && name.endsWith(Options.SEG)) {
            /*
             * Attempt to open the index segment.
             */
            final IndexSegmentStore segStore;
            try {
                segStore = new IndexSegmentStore(file);
            } catch (Exception ex) {
                log.error("Problem opening segment: file="
                        + file.getAbsolutePath(), ex);
                stats.nfiles++;
                stats.badFiles.add(file.getAbsolutePath());
                return;
            }
            try {
                resource = segStore.getResourceMetadata();
                stats.nfiles++;
                stats.nsegments++;
                stats.nbytes += len;
            } finally {
                if (segStore.isOpen()) {
                    /*
                     * Note: opening the segment with [load == false] does not
                     * really open anything so you do not need to close the
                     * segment afterwards. I've put the conditional logic here
                     * just in case that changes.
                     */
                    segStore.close();
                }
            }
        } else {
            if (len == 0L
                    && (name.endsWith(Options.JNL) || name
                            .endsWith(Options.SEG))) {
                log.warn("Ignoring empty file: " + file);
            } else {
                /*
                 * This file is not relevant to the resource manager.
                 */
                log.warn("Ignoring file: " + file);
            }
            return;
        }
        if (log.isInfoEnabled())
            log.info("Found " + resource + " in " + file);
//        if (!file.getName().equals(new File(resource.getFile()).getName())) {
//
//            /*
//             * The base name and extension of the file does not agree with
//             * that metadata reported by the store (unlikely since the store
//             * reports its metadata based on the file that it opened).
//             */
//
//            log.error("Wrong filename: actual=" + file + ", expected="
//                    + file);
//
//        }
//        addResource(resource, file.getAbsoluteFile());
        addResource(resource, file);
    }

    @Override
    public File getTmpDir() {
        return tmpDir;
    }

    /**
     * Note: The returned {@link File} is in canonical form.
     */
    @Override
    public File getDataDir() {
        return dataDir;
    }
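
    /*
     * For orientation, the on-disk layout which this scan expects (derived
     * from the Options#DATA_DIR javadoc above; the file names shown are
     * illustrative, not actual output):
     *
     *   dataDir/
     *     journals/
     *       journal1234567890.jnl            <- Options.JNL suffix
     *     segments/
     *       <scale-out index UUID>/
     *         <mungedName>#<partId>42.seg    <- Options.SEG suffix
     *
     * Any file with a recognized suffix but zero length is ignored with a
     * warning; files with unrecognized names are logged and skipped.
     */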
    /**
     * Closes ALL open store files.
     * <p>
     * Note: This is invoked by {@link #shutdown()} and {@link #shutdownNow()}.
     */
    private void closeStores() {
//        final Iterator<IRawStore> itr = storeCache.iterator();
        final Iterator<WeakReference<IRawStore>> itr = storeCache.iterator();
        while (itr.hasNext()) {
//            final IRawStore store = itr.next();
            final IRawStore store = itr.next().get();
            if (store == null) {
                // weak reference has been cleared.
                continue;
            }
            try {
                store.close();
            } catch (Exception ex) {
                log.warn(ex.getMessage(), ex);
            }
            itr.remove();
        }
    }

    /**
     * The #of journals on hand.
     */
    synchronized public long getManagedJournalCount() {
        assertOpen();
        return journalIndex.getEntryCount();
    }

    /**
     * The #of index segments on hand.
     */
    synchronized public long getManagedSegmentCount() {
        assertOpen();
        return segmentIndex.getEntryCount();
    }

    /**
     * Notify the resource manager of a new resource. The resource is added to
     * {@link #resourceFiles} and to either {@link #journalIndex} or
     * {@link #segmentIndex} as appropriate. As a post-condition, you can use
     * {@link #openStore(UUID)} to open the resource using the {@link UUID}
     * specified by {@link IResourceMetadata#getUUID()}.
     * <p>
     * Note: This also adds the size of the store in bytes as reported by the
     * OS to {@link #bytesUnderManagement}.
     * <p>
     * Note: Adding a resource to the store manager has no persistent effect
     * other than the presumed presence of the specified file in the file
     * system. However, error handling routines SHOULD invoke
     * {@link #deleteResource(UUID, boolean)} in order to remove a resource
     * that was not built correctly or not incorporated into the view.
     * Otherwise the mapping from the {@link UUID} to the {@link File} will be
     * maintained in memory and the {@link StoreManager} will overreport the
     * #of bytes under management.
     *
     * @param resourceMetadata
     *            The metadata describing that resource.
     * @param file
     *            The file in the local file system which is the resource.
     *
     * @throws RuntimeException
     *             if the file does not exist.
     * @throws RuntimeException
     *             if there is already a resource registered with the same
     *             UUID as reported by {@link IResourceMetadata#getUUID()}
     * @throws RuntimeException
     *             if the {@link #journalIndex} or {@link #segmentIndex}
     *             already know about that resource.
     * @throws RuntimeException
     *             if {@link #openStore(UUID)} already knows about that
     *             resource.
     * @throws IllegalArgumentException
     *             if the <i>resourceMetadata</i> is <code>null</code>.
     * @throws IllegalArgumentException
     *             if the <i>file</i> is <code>null</code> and
     *             {@link #isTransient} is <code>false</code>.
     *
     * @see #deleteResource(UUID, boolean)
     * @see #retentionSetAdd(UUID)
     * @see #retentionSetRemove(UUID)
     */
    synchronized protected void addResource(
            final IResourceMetadata resourceMetadata, File file) {
        if (resourceMetadata == null)
            throw new IllegalArgumentException();
        if (file == null && !isTransient)
            throw new IllegalArgumentException();
        assertOpen();
        final UUID uuid = resourceMetadata.getUUID();
        if (log.isInfoEnabled())
            log.info("file=" + file + ", uuid=" + uuid);
        if (file != null) {
            file = file.getAbsoluteFile();
        }
//        synchronized (storeCache) {
        if (storeCache.get(uuid) != null) {
            throw new RuntimeException("Resource already open?: "
                    + resourceMetadata);
        }
//        }
        final long extent;
        if (!isTransient) {
            if (!file.exists()) {
                throw new RuntimeException("File not found: " + file);
            }
            // check for existing entry under that UUID.
            final File tmp = resourceFiles.get(uuid);
            if (tmp != null) {
                throw new RuntimeException(
                        "Resource already registered: uuid=" + uuid
                                + " as file=" + tmp + " (given file=" + file
                                + ")");
            }
            // add new entry.
            resourceFiles.put(uuid, file);
            // size of the file.
            extent = file.length();
        } else {
            // transient resource - no extent.
            extent = 0L;
        }
        if (resourceMetadata.isJournal()) {
            journalIndex.add((JournalMetadata) resourceMetadata);
            journalBytesUnderManagement.addAndGet(extent);
        } else {
            segmentIndex.add((SegmentMetadata) resourceMetadata);
            segmentBytesUnderManagement.addAndGet(extent);
        }
        /*
         * Track the #of bytes under management.
         */
        bytesUnderManagement.addAndGet(extent);
    }
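
    /*
     * A sketch of the error-handling pattern recommended by the javadoc above
     * (assumed shape; the "incorporate" step is a placeholder): if a newly
     * built resource does not make it into a view, it should be deleted so
     * the UUID -> File mapping and the byte counters do not drift.
     *
     *   addResource(segmentMetadata, segmentFile);
     *   try {
     *       // ... attempt to incorporate the segment into the view ...
     *   } catch (Throwable t) {
     *       // roll back the bookkeeping for the unused resource.
     *       deleteResource(segmentMetadata.getUUID(), false/* isJournal */);
     *       throw new RuntimeException(t);
     *   }
     */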
            resourceFiles.put(uuid, file);

            // size of the file.
            extent = file.length();

        } else {

            // transient resource - no extent.
            extent = 0L;

        }

        if (resourceMetadata.isJournal()) {

            journalIndex.add((JournalMetadata) resourceMetadata);

            journalBytesUnderManagement.addAndGet(extent);

        } else {

            segmentIndex.add((SegmentMetadata) resourceMetadata);

            segmentBytesUnderManagement.addAndGet(extent);

        }

        /*
         * Track the #of bytes under management.
         */
        bytesUnderManagement.addAndGet(extent);

    }

    /**
     * Returns a filter that is used to recognize files that are managed by
     * this class. The {@link ResourceManager} will log warnings if it sees an
     * unexpected file and will NOT {@link #deleteResources()} files that it
     * does not recognize.
     *
     * @see ResourceFileFilter
     *
     * @todo perhaps define setFileFilter and getFileFilter instead since
     *       subclassing this method is a bit difficult. The
     *       {@link ResourceFileFilter} would have to be a static class and we
     *       would have to pass in the {@link IResourceManager} so that it
     *       could get the {@link #dataDir}.
     */
    protected ResourceFileFilter newFileFilter() {

        return new ResourceFileFilter(this);

    }

    /**
     * The object used to control access to the index resources.
     *
     * @throws IllegalStateException
     *             if the object has not been set yet using
     *             {@link #setConcurrencyManager(IConcurrencyManager)}.
     */
    public abstract IConcurrencyManager getConcurrencyManager();

    public abstract void setConcurrencyManager(
            IConcurrencyManager concurrencyManager);

    /**
     * The {@link ManagedJournal} provides the backing store used to absorb
     * writes and retain history for the scale-out architecture.
     * <p>
     * Note: This implementation is designed to use a shared
     * {@link ConcurrencyManager} across all open journal instances for a
     * {@link DataService}.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     */
    public class ManagedJournal extends AbstractJournal {

//        /**
//         * Note: Each instance of the {@link ManagedJournal} reuses the SAME
//         * {@link StoreManager#writeCache}. Therefore you MUST close out writes
//         * on the old journal BEFORE you may allocate a new journal.
//         *
//         * @param properties
//         *
//         * @see AbstractJournal#closeForWrites(long)
//         */
        protected ManagedJournal(final Properties properties) {

            super(properties);//, writeCache);

            /*
             * Set the performance counters on the new store so that we have a
             * cumulative track of all activity on both the "live" journals and
             * the "historical" journals managed by this data service.
             *
             * FIXME Must also roll the counters forward for the other journal
             * buffer strategies! (The implementation class is different for the
             * WORMStrategy, which is causing complications right now.)
             */
            if (getBufferStrategy() instanceof DiskOnlyStrategy) {

                ((DiskOnlyStrategy) getBufferStrategy())
                        .setStoreCounters(getStoreCounters());

            } else if (getBufferStrategy() instanceof WORMStrategy) {

                ((WORMStrategy) getBufferStrategy())
                        .setStoreCounters(getStoreCounters());

            }

        }

        @Override
        public String toString() {

            /*
             * Note: Should not depend on any state that might be unreachable,
             * e.g., because the store is not open, etc.
             */

            final IRootBlockView rootBlock = getRootBlockView();

            return getClass().getName() + "{file=" + getFile() + ", open="
                    + ManagedJournal.this.isOpen()
                    + (rootBlock != null ? ", uuid=" + rootBlock.getUUID() : "")
                    + "}";

        }

        /**
         * Note: Exposed for the {@link DataService} which needs this for its
         * 2-phase commit protocol.
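         * <p>
         * A minimal usage sketch (the coordinator shown is hypothetical; only
         * {@link #commitNow(long)} itself belongs to this class):
         *
         * <pre>
         * // Phase 1: the coordinator prepares each participant, then
         * // Phase 2: applies the agreed commit time on each of them.
         * final long commitTime = coordinator.nextCommitTime(); // hypothetical
         * managedJournal.commitNow(commitTime);
         * </pre>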
         */
        @Override
        public long commitNow(final long commitTime) {

            return super.commitNow(commitTime);

        }

        /**
         * Exposed for {@link StoreManager#getResourcesForTimestamp(long)}
         * which requires access to the {@link CommitRecordIndex} for the
         * lastCommitTime on the historical journals.
         * <p>
         * Note: This always returns a distinct index object. The code relies on
         * this fact to avoid contention with the live {@link CommitRecordIndex}
         * for the live journal.
         */
        @Override
        public CommitRecordIndex getCommitRecordIndex(final long addr,
                final boolean readOnly) {

            return super.getCommitRecordIndex(addr, readOnly);

        }

        @Override
        public AbstractLocalTransactionManager getLocalTransactionManager() {

            return (AbstractLocalTransactionManager) getConcurrencyManager()
                    .getTransactionManager();

        }

//        public DataServiceTransactionManager getLocalTransactionManager() {
//
//            return (DataServiceTransactionManager) getConcurrencyManager()
//                    .getTransactionManager();
//
//        }

        /**
         * The data services support group commit.
         */
        @Override
        public boolean isGroupCommit() {
            return true;
        }

        @Override
        public SparseRowStore getGlobalRowStore() {

            return getFederation().getGlobalRowStore();

        }

        @Override
        public SparseRowStore getGlobalRowStore(final long timestamp) {

            return getFederation().getGlobalRowStore(timestamp);

        }

        @Override
        public BigdataFileSystem getGlobalFileSystem() {

            return getFederation().getGlobalFileSystem();

        }

        @Override
        public DefaultResourceLocator getResourceLocator() {

            return (DefaultResourceLocator) getFederation()
                    .getResourceLocator();

        }

        @Override
        public ExecutorService getExecutorService() {

            return getFederation().getExecutorService();

        }

        @Override
        public IResourceLockService getResourceLockService() {

            return getFederation().getResourceLockService();

        }

        @Override
        public TemporaryStore getTempStore() {

            return getFederation().getTempStore();

        }

        /**
         * Extended to set the {@link IResourceMetadata} to this journal if it
         * is <code>null</code> since a remote caller can not have the correct
         * metadata on hand when they formulate the request.
         */
        @Override
        protected void validateIndexMetadata(final String name,
                final IndexMetadata metadata) {

            super.validateIndexMetadata(name, metadata);

            final LocalPartitionMetadata pmd = metadata.getPartitionMetadata();

            if (pmd == null) {

                /*
                 * Note: This case permits unpartitioned indices for the MDS.
                 */
                return;

            }

            if (pmd.getResources() == null) {

                /*
                 * A [null] for the resources field is a specific indication
                 * that we need to specify the resource metadata for the live
                 * journal at the time that the index partition is registered.
                 * This indicator is used when the metadata service registers an
                 * index partition remotely on a data service since it does not
                 * (and can not) have access to the resource metadata for the
                 * live journal as of the time that the index partition actually
                 * gets registered on the data service.
                 *
                 * The index partition split and join tasks do not have this
                 * problem since they are run locally. However, an index
                 * partition move operation also needs to do this.
                 */

                final ResourceManager resourceManager = ((ResourceManager) (StoreManager.this));

                metadata.setPartitionMetadata(//
                        new LocalPartitionMetadata(//
                                pmd.getPartitionId(),//
                                pmd.getSourcePartitionId(),//
                                pmd.getLeftSeparatorKey(),//
                                pmd.getRightSeparatorKey(),//
                                new IResourceMetadata[] {//
                                // The live journal.
                                getResourceMetadata() //
                                },
                                // cause
                                IndexPartitionCause.register(resourceManager)
//                                /*
//                                 * Note: Retains whatever history given by the
//                                 * caller.
// */ // , pmd.getHistory() + "register(name=" + name // + ",partitionId=" // + pmd.getPartitionId() + ") " )); } else { if (pmd.getResources().length == 0) { throw new RuntimeException( "Missing resource description: name=" + name + ", pmd=" + pmd); } if (!pmd.getResources()[0].isJournal()) { throw new RuntimeException( "Expecting resources[0] to be journal: name=" + name + ", pmd=" + pmd); } if (!pmd.getResources()[0].getUUID().equals( getRootBlockView().getUUID())) { throw new RuntimeException( "Expecting resources[0] to be this journal but has wrong UUID: name=" + name + ", pmd=" + pmd); } } } @Override public ScheduledFuture<?> addScheduledTask(Runnable task, long initialDelay, long delay, TimeUnit unit) { return getFederation().addScheduledTask(task, initialDelay, delay, unit); } @Override public boolean getCollectPlatformStatistics() { return getFederation().getCollectPlatformStatistics(); } @Override public boolean getCollectQueueStatistics() { return getFederation().getCollectQueueStatistics(); } @Override public int getHttpdPort() { return getFederation().getHttpdPort(); } @Override public boolean isHAJournal() { return false; } } // class ManagedJournal /** * The journal on which writes are made. * * @throws IllegalStateException * if the {@link StoreManager} is not open. * @throws IllegalStateException * if the {@link StoreManager} is still starting up. */ @Override public ManagedJournal getLiveJournal() { assertRunning(); final ManagedJournal tmp = liveJournalRef.get(); assert tmp != null : "open=" + isOpen() + ", starting=" + isStarting() + ", dataDir=" + dataDir; assert tmp.isOpen(); /* * Note: There is a brief period when we close out writes on the live * journal before we cut over to the new live journal. Therefore this * assertion can not be made since it is violated during that brief * period. * * Note: Concurrent readers are always allowed, even during that brief * period. */ // assert !liveJournal.isReadOnly(); return tmp; } // /** // * This lock is used to prevent asynchronous processes such as // * {@link ConcurrencyManager#getIndexCounters()} from acquiring the live // * journal during the period between when we close out the old journal // * against future writes and when the new live journal is in place. // * <p> // * Note: {@link AbstractJournal#closeForWrites(long)} does not disturb // * concurrent readers. // */ // protected final ReentrantLock liveJournalLock = new ReentrantLock(); /** * @throws IllegalStateException * if the {@link StoreManager} is not open. * @throws IllegalStateException * if the {@link StoreManager} is still starting up. * * @todo write tests for unisolated and read-committed. make sure that there * is no fencepost for read committed immediately after an overflow * (there should not be since we do a commit when we register the * indices on the new store). */ @Override public AbstractJournal getJournal(final long timestamp) { assertRunning(); if (timestamp == ITx.UNISOLATED || timestamp == ITx.READ_COMMITTED) { /* * This is a request for the live journal. * * Note: The live journal remains open except during overflow, when * it is changed to a new journal and the old live journal is * closed. Therefore we NEVER cause the live journal to be opened * from the disk in this method. */ return getLiveJournal(); } final IResourceMetadata resource; synchronized (journalIndex) { /* * @todo add a weak reference cache in front of this by timestamp? 
* (The MDI had a hotspot for a similar pattern of use, but I have * not verified yet whether there is such a hotspot here). */ resource = journalIndex.find(Math.abs(timestamp)); } if (resource == null) { log.warn("No such journal: timestamp=" + timestamp); return null; } return (AbstractJournal) openStore(resource.getUUID()); } /** * Opens an {@link IRawStore}. * * @param uuid * The UUID identifying that store file. * * @return The open {@link IRawStore}. * * @throws IllegalStateException * if the {@link StoreManager} is not open. * @throws IllegalStateException * if the {@link StoreManager} is still starting up. * @throws IllegalArgumentException * if <i>uuid</i> is <code>null</code>. * @throws NoSuchStoreException * if the {@link UUID} is not recognized. * @throws NoSuchStoreException * if the resource for that {@link UUID} could not be found. * @throws RuntimeException * if something else goes wrong. * * @todo it seems that we always have the {@link IResourceMetadata} on hand * when we need to (re-)open a store so it might be nice to pass that * in as it would make for more informative error messages when * something goes wrong (except that I was planning to drop the file * name from that interface). */ @Override public IRawStore openStore(final UUID uuid) { assertRunning(); if (uuid == null) { throw new IllegalArgumentException(); } /* * Note: These operations can have modest latency, especially if we open * a fully buffered index segment. Therefore we use a per-store * (actually, per-resource UUID, which is the same thing) lock to avoid * imposing latency on threads requiring access to different stores. */ final Lock lock = namedLock.acquireLock(uuid); try { /* * Check to see if the given resource is already open. */ IRawStore store; // synchronized(storeCache) { store = storeCache.get(uuid); // } if (store != null) { if (!store.isOpen()) { if (store instanceof IndexSegmentStore) { /* * We can simply re-open an index segment's store file. */ // // Note: relative to the data directory! // final File file = resourceFiles.get(uuid); // // if (file == null) { // // throw new NoSuchStoreException(uuid); // // } // // if (!file.exists()) { // // throw new RuntimeException( // "Resource file missing? uuid=" + uuid // + ", file=" + file); // // } // re-open the store file. it will complain if the file is gone. ((IndexSegmentStore) store).reopen(); // re-opening the store. segmentStoreReopenCount.incrementAndGet(); // done. return store; } else { /* * Note: Journals should not be closed without also * removing them from the list of open resources. The * live journal SHOULD NOT be closed except during * shutdown or overflow (when it is replaced by a new * live journal). */ throw new AssertionError(); } } return store; } if (store == null) { /* * Attempt to open the resource. */ // Lookup filename by resource UUID. final File file = resourceFiles.get(uuid); if (file == null) { /* * Note: Non-transactional read-historical operations DO NOT * declare read locks and therefore are unable to prevent * resources from being released, which can lead to this * exception. */ throw new NoSuchStoreException(uuid); } if (!file.exists()) { throw new NoSuchStoreException("Resource file missing? uuid=" + uuid + ", file=" + file); } final UUID actualUUID; if (file.getName().endsWith(Options.JNL)) { /* * Open a historical journal. * * Note: The live journal is never opened by this code path. 
                     * It is opened when the resource manager is instantiated
                     * and it will remain open except during shutdown and
                     * overflow (when it is replaced by a new live journal).
                     */

                    final Properties properties = getProperties();

                    properties.setProperty(Options.FILE, file.toString());

                    // All historical journals are read-only!
                    // Note: disables the write cache among other things.
                    properties.setProperty(Options.READ_ONLY, "true");

                    final AbstractJournal journal = new ManagedJournal(
                            properties);

                    final long closeTime = journal.getRootBlockView()
                            .getCloseTime();

                    // verify journal was closed for writes.
                    assert closeTime != 0 : "Journal not closed for writes? : file="
                            + file + ", uuid=" + uuid + ", closeTime="
                            + closeTime;

                    assert journal.isReadOnly();

                    actualUUID = journal.getRootBlockView().getUUID();

                    store = journal;

                    // opened another journal.
                    journalReopenCount.incrementAndGet();

                } else {

                    /*
                     * FIXME Make sure that the segStore either makes it into
                     * the cache or is closed even for spurious exceptions.
                     * E.g.,
                     *
                     * try {segStore=...; store=segStore;} catch()
                     * {if(store!=null)store.close();}
                     *
                     * But not if it was already open and not after it makes
                     * it into the cache.
                     */
                    final IndexSegmentStore segStore = new IndexSegmentStore(
                            file);

                    actualUUID = segStore.getCheckpoint().segmentUUID;

                    store = segStore;

                    // opened another index segment store.
                    segmentStoreReopenCount.incrementAndGet();

                }

                /*
                 * Verify the resource UUID.
                 */
                if (!actualUUID.equals(uuid)) {

                    // close the resource.
                    store.close();

                    throw new RuntimeException("Wrong UUID: file=" + file
                            + ", expecting=" + uuid + ", actual=" + actualUUID);

                }

                assert store != null;
                assert store.isOpen();
                assert store.isStable();

            }

            // cache the reference.
//            synchronized (storeCache) {

            storeCache.put(uuid, store);//, false/* dirty */);
//            storeCache.put(uuid, store, false/* dirty */);

//            }

            // return the reference to the open store.
            return store;

        } finally {

            lock.unlock();

        }

    }

    /**
     * Report the next timestamp assigned by the {@link ITransactionService}.
     */
    protected long nextTimestamp() {

        final ILocalTransactionManager transactionManager = getConcurrencyManager()
                .getTransactionManager();

        return transactionManager.nextTimestamp();

    }

    @Override
    public void deleteResources() {

        assertNotOpen();

        // NOP if transient.
        if (isTransient())
            return;

        if (log.isInfoEnabled())
            log.info("Deleting all resources: " + dataDir);

        recursiveDelete(dataDir);

        // approx. #of bytes deleted.
        bytesDeleted.addAndGet(bytesUnderManagement.get());

        // nothing left under management.
        bytesUnderManagement.set(0L);
        journalBytesUnderManagement.set(0L);
        segmentBytesUnderManagement.set(0L);

    }

    /**
     * Recursively removes any files and subdirectories and then removes the
     * file (or directory) itself.
     * <p>
     * Note: Files that are not recognized will be logged by the
     * {@link ResourceFileFilter}.
     *
     * @param f
     *            A file or directory.
     */
    private void recursiveDelete(final File f) {

        if (f.isDirectory()) {

            final File[] children = f.listFiles(newFileFilter());

            if (children == null) {

                // No such file or directory exists.
                return;

            }

            for (int i = 0; i < children.length; i++) {

                recursiveDelete(children[i]);

            }

        }

        if (log.isInfoEnabled())
            log.info("Removing: " + f);

        if (f.exists() && !f.delete()) {

            log.warn("Could not remove: " + f);

        }

    }

    /**
     * Updates the {@link #releaseTime}.
* <p> * Data services MAY release data for views whose timestamp is less than or * equal to the specified release time IFF that action would be in keeping * with their local history retention policy (minReleaseAge) AND if the data * is not required for the most current committed state (data for the most * current committed state is not releasable regardless of the release time * or the minReleaseAge). * * @see #purgeOldResources(), which is responsible for actually deleting the * old resources. */ public void setReleaseTime(final long releaseTime) { assertOpen(); if (releaseTime < 0L) { throw new IllegalArgumentException(); } this.releaseTime = releaseTime; } /** * Return the last value set with {@link #setReleaseTime(long)}. */ public long getReleaseTime() { return releaseTime; } /** * @see IndexManager#getIndexRetentionTime() */ abstract protected long getIndexRetentionTime(); /** * In order to have atomic semantics and prevent a read-historical operation * from starting concurrently that would have access to a view that is being * purged, {@link IndexManager#getIndex(String, long)} and * {@link StoreManager#purgeOldResources()} MUST contend for a shared lock. * This is a {@link ReentrantReadWriteLock} since concurrent getIndex() * requests can proceed as long as {@link StoreManager#purgeOldResources()} * is not running. Also note that contention is not required for * {@link ITx#UNISOLATED} index views. */ protected final ReentrantReadWriteLock indexCacheLock = new ReentrantReadWriteLock(); /** * Identify and delete resources no longer required by the index views from * the current releaseTime up to the lastCommitTime. * <p> * Note: The ability to read from a historical commit point requires the * existence of the journals back until the one covering that historical * commit point. This is because the distinct historical commit points for * the indices are ONLY defined on the journals. The index segments carry * forward the committed state of a specific index as of the commitTime of * the index from which the segment was built. This means that you can * substitute the index segment for the historical index state on older * journals, but the index segment carries forward only a single commit * point for the index so it can not be used to read from arbitrary * historical commit points. * <p> * The caller MUST hold the exclusive lock on the * {@link WriteExecutorService}. * * @return A summary of the work done -or- <code>null</code> if the * preconditions for the purge operation were not satisfied. * * @see src/architecture/purgeResourceDecisionsMatrix.xls * * @see #purgeOldResources(long, boolean) */ final protected PurgeResult purgeOldResources() { final long beginPurgeTime = System.currentTimeMillis(); /* * The last commit time on record in the live journal. * * Note: This used to be invoked during synchronous overflow so the * [lastCommitTime] was in fact the last commit time on the OLD journal. * However, this is now invoked at arbitrary times (as long as there is * a lock on the write service) so we really need to use the * [lastOverflowTime] here to have the same semantics. */ final long lastCommitTime = getLiveJournal().getRootBlockView().getLastCommitTime(); if (lastCommitTime == 0L) { if (log.isInfoEnabled()) log.info("Nothing committed yet."); return null; } /* * Make sure that we have the current release time. 
It is periodically
         * pushed by the transaction manager, but we pull it here since we are
         * about to make a decision based on the releaseTime concerning which
         * resources to release.
         */
        {

            final IBigdataFederation fed;
            try {

                fed = getFederation();

            } catch (UnsupportedOperationException ex) {

                log.warn("Federation not available: Running in test harness?");

                return null;

            }

            try {

                final ITransactionService txService = fed
                        .getTransactionService();

                if (txService != null) {

                    this.releaseTime = txService.getReleaseTime();

                } else {

                    log.warn("Could not discover txService - Proceeding with current release time.");

                }

            } catch (IOException ex) {

                /*
                 * Since the releaseTime is monotonically increasing, if there
                 * is an RMI problem then we use the last release time that was
                 * pushed to us by the txService.
                 */

                log.warn("Proceeding with current release time: " + ex);

            }

        }

        if (this.releaseTime == 0L) {

            /*
             * Note: The [releaseTime] is advanced by the transaction service
             * when it decides that a commit point will no longer be reachable
             * by new transactions and no running transaction is reading from
             * that commit point.
             *
             * Note: We do not release anything until the releaseTime has been
             * set by the transaction service. This centralizes decisions
             * concerning how long to preserve history while distributing the
             * actions taken based on those decisions.
             */

            log.warn("releaseTime not set.");

            return null;

        }

        final Event e = new Event(getFederation(), new EventResource(),
                EventType.PurgeResources).start();

        /*
         * Prevent concurrent access to the index cache.
         */
        indexCacheLock.writeLock().lock();

        try {

            /*
             * The earliest timestamp that MUST be retained for the
             * read-historical indices in the cache.
             *
             * FIXME There is a cycle here which makes it impossible to release
             * an index view sooner than the timeout on the index cache when the
             * index cache capacity is larger than the current minimum
             * requirements (review store cache and index segment as well).
             *
             * The problem is that the backing hard reference queue for the
             * index cache does not distinguish between actively used indices
             * and those that are just being held open in case they might be
             * used again "soon" so we are not able to figure out which
             * indices can be closed and are therefore required to accept a
             * release time which is MUCH earlier than the release time given by
             * the transaction service.
             *
             * There are a few ways to approach this. One is to use local
             * read-historical transactions for flyweight read-only operations.
             * That will give us a real measure of the #of operations reading on
             * any given timestamp [a fair amount of work and requires
             * duplicating many of the facilities of the distributed transaction
             * manager so that we can track the earliest local tx].
             * Another is to reduce the index cache capacity and timeout and
             * then use a fully buffered journal so it does not matter as much
             * if we close out an index [a partial fix].
             */
            final long indexRetentionTime = getIndexRetentionTime();

            /*
             * Choose whichever timestamp would preserve more history (that is,
             * choose the earlier timestamp). Note that the index retention time
             * is -1 if there are no indices in the cache.
             */
            final long chosenReleaseTime = indexRetentionTime == -1L ? this.releaseTime
                    : Math.min(indexRetentionTime, this.releaseTime);

//            final long releaseTime = Math.min(indexRetentionTime, Math.min(
//                    maxReleaseTime, this.releaseTime));

            /*
             * This is the age of the selected release time as computed from the
             * last commit time on the live journal.
             */
            final long releaseAge = (lastCommitTime - chosenReleaseTime);

            if (log.isInfoEnabled())
                log.info("Chosen releaseTime=" + chosenReleaseTime
                        + ": given releaseTime=" + this.releaseTime
                        + ", indexRetentionTime=" + indexRetentionTime
                        + " (this is "
                        + TimeUnit.MILLISECONDS.toSeconds(releaseAge)
                        + " seconds before/after the lastCommitTime="
                        + lastCommitTime + ")");

            /*
             * The earliest commit time on record in any journal available to
             * the StoreManager.
             */
            final long firstCommitTime;
            {

                // the earliest journal available to the store manager.
                final IResourceMetadata resource = journalIndex.findNext(0L);

                // open that journal.
                final AbstractJournal j0 = (AbstractJournal) openStore(resource
                        .getUUID());

                // the first commit time on the earliest journal available.
                firstCommitTime = j0.getRootBlockView().getFirstCommitTime();

            }

            /*
             * Find the commitTime that we are going to preserve.
             */
            final long commitTimeToPreserve;

            if (chosenReleaseTime < firstCommitTime) {

                /*
                 * If the computed [releaseTime] is before the first commit
                 * record on the earliest available journal then there was
                 * nothing that could be deleted and we just return immediately.
                 */

                if (log.isInfoEnabled())
                    log.info("Release time is earlier than any commit time.");

                // Nothing to do.
                return null;

            } else if (chosenReleaseTime >= lastCommitTime) {

                /*
                 * If the computed [releaseTime] GTE the last commit point then
                 * we choose the [lastCommitTime] instead.
                 *
                 * Note: If there have been no writes on this data service but
                 * there have been writes on other data services then the
                 * txService will eventually advance the releaseTime beyond the
                 * lastCommitTime on this data service. Since we never release
                 * the last commit point we set the commitTimeToPreserve to the
                 * lastCommitTime on the local data service.
                 */

                commitTimeToPreserve = lastCommitTime;

                if (log.isInfoEnabled())
                    log.info("commitTimeToPreserve := " + commitTimeToPreserve
                            + " (this is the lastCommitTime)");

            } else {

                /*
                 * Find the timestamp for the commit record that is strictly
                 * greater than the release time.
                 */

                commitTimeToPreserve = getCommitTimeStrictlyGreaterThan(chosenReleaseTime);

                if (log.isInfoEnabled())
                    log.info("commitTimeToPreserve := "
                            + commitTimeToPreserve
                            + " (this is the first commitTime GT the releaseTime="
                            + chosenReleaseTime + ")");

            }

            /*
             * Make a note for reporting purposes.
             */
            this.lastCommitTimePreserved = commitTimeToPreserve;

            /*
             * Find resources that were in use as of that commitTime.
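             *
             * In set terms, the code below computes
             *
             *   inUse := getResourcesForTimestamp(commitTimeToPreserve)
             *            UNION retentionSet
             *
             * and only a resource that is NOT in [inUse] and whose createTime
             * is LT [commitTimeToPreserve] is a candidate for deletion (see
             * deleteUnusedResources()).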
             */
            final Set<UUID> resourcesInUse;
            final long elapsedScanCommitIndicesTime;
            {

                final long begin = System.currentTimeMillis();

                resourcesInUse = getResourcesForTimestamp(commitTimeToPreserve);

                synchronized (retentionSet) {

                    resourcesInUse.addAll(retentionSet);

                }

                elapsedScanCommitIndicesTime = System.currentTimeMillis()
                        - begin;

            }

            if (log.isInfoEnabled()) {

                /*
                 * Log the in use resources (resources that MUST NOT be
                 * deleted).
                 */
                for (UUID uuid : resourcesInUse) {

                    log.info("In use: file=" + resourceFiles.get(uuid)
                            + ", uuid=" + uuid);

                }

            }

            final long journalBeforeCount = getManagedJournalCount();
            final long segmentBeforeCount = getManagedSegmentCount();
            final long bytesBeforeCount = getBytesUnderManagement();

            /*
             * Delete anything that is: ( NOT in use )
             *
             * AND ( createTime < commitTimeToPreserve )
             */
            final long elapsedDeleteResourcesTime;
            {

                final long begin = System.currentTimeMillis();

                deleteUnusedResources(commitTimeToPreserve, resourcesInUse);

                elapsedDeleteResourcesTime = System.currentTimeMillis() - begin;

            }

            final long journalAfterCount = getManagedJournalCount();
            final long segmentAfterCount = getManagedSegmentCount();
            final long bytesAfterCount = getBytesUnderManagement();

            final long elapsedPurgeResourcesTime = System.currentTimeMillis()
                    - beginPurgeTime;

            purgeResourcesMillis += elapsedPurgeResourcesTime;

            final PurgeResult result = new PurgeResult(firstCommitTime,
                    lastCommitTime, this.releaseTime, indexRetentionTime,
                    chosenReleaseTime, commitTimeToPreserve, resourcesInUse
                            .size(), journalBeforeCount, journalAfterCount,
                    segmentBeforeCount, segmentAfterCount, bytesBeforeCount,
                    bytesAfterCount, elapsedScanCommitIndicesTime,
                    elapsedDeleteResourcesTime, elapsedPurgeResourcesTime);

            e.addDetails(result.getParams());

            return result;

        } finally {

            indexCacheLock.writeLock().unlock();

            e.end();

        }

    }

    /**
     * Delete unused resources given a set of resources that are still in use.
     * The unused resources are identified by scanning the {@link #journalIndex}
     * and the {@link #segmentIndex}. For each resource found in either of
     * those indices which is NOT found in <i>resourcesInUse</i> and whose
     * createTime is LT the specified timestamp, we take the following steps:
     * <ol>
     * <li>close iff open</li>
     * <li>remove from lists of known resources</li>
     * <li>clear the associated {@link ILRUCache}</li>
     * <li>delete in the file system</li>
     * </ol>
     * Note: {@link IndexSegment}s pose a special case. Their create time is
     * the timestamp associated with their source view. During asynchronous
     * overflow processing we generate {@link IndexSegment}s from the
     * lastCommitTime of the old journal. Therefore their createTime timestamp
     * is often LT the <i>commitTimeToPreserve</i>. In order to prevent these
     * {@link IndexSegment}s from being released before they are put to use (by
     * incorporating them into an index partition view) we DO NOT add them to
     * the {@link #segmentIndex} until they are part of an index partition view.
     *
     * @param commitTimeToPreserve
     *            Resources created as of or later than this timestamp WILL NOT
     *            be deleted.
     * @param resourcesInUse
     *            The set of resources required by views as of the
     *            <i>commitTimeToPreserve</i>. These resources have create
     *            times LTE to <i>commitTimeToPreserve</i> but are in use by
     *            at least one view as of that commit time and therefore MUST
     *            NOT be deleted.
     *
     * @see IndexManager#buildIndexSegment(String,
     *      com.bigdata.btree.ILocalBTreeView, boolean, long, byte[], byte[],
     *      Event)
     */
    private void deleteUnusedResources(final long commitTimeToPreserve,
            final Set<UUID> resourcesInUse) {

        /*
         * Delete old journals.
         */

        // #of journals deleted.
        int njournals = 0;
        {

            /*
             * Note: This iterator supports traversal with concurrent
             * modification (by a single thread). If we decide to delete a
             * journal resource, then deleteResource() will be tasked to delete
             * it from the [journalIndex] as well.
             */
            final ITupleIterator itr = journalIndex.rangeIterator(
                    null/* fromKey */, null/* toKey */, 0/* capacity */,
                    IRangeQuery.DEFAULT | IRangeQuery.CURSOR, null/* filter */);

            while (itr.hasNext()) {

                final ITuple tuple = itr.next();

                final IResourceMetadata resourceMetadata = (IResourceMetadata) SerializerUtil
                        .deserialize(tuple.getValue());

                // the create timestamp for that resource.
                final long createTime = resourceMetadata.getCreateTime();

                if (createTime >= commitTimeToPreserve) {

                    /*
                     * Do NOT delete any resources whose createTime is GTE the
                     * given commit time.
                     */

                    if (log.isInfoEnabled())
                        log.info("Stopping at resource GTE commitTime to preserve: createTime="
                                + createTime
                                + ", file="
                                + resourceMetadata.getFile());

                    break;

                }

                final UUID uuid = resourceMetadata.getUUID();

                if (resourcesInUse.contains(uuid)) {

                    // still required as of that timestamp.
                    continue;

                }

                try {

                    deleteUnusedResource(resourceMetadata);

                } catch (Throwable t) {

                    // log error and keep going.
                    log.error("Could not delete journal: "
                            + resourceMetadata.getFile(), t);

                }

                // remove from the [journalIndex].
                itr.remove();

                njournals++;

            }

        }

        /*
         * Delete old index segments.
         */

        // #of segments deleted.
        int nsegments = 0;
        {

            /*
             * Note: This iterator supports traversal with concurrent
             * modification (by a single thread). If we decide to delete an
             * indexSegment resource, then deleteResource() will be tasked to
             * delete it from the [segmentIndex] as well.
             */
            final ITupleIterator itr = segmentIndex.rangeIterator(
                    null/* fromKey */, null/* toKey */, 0/* capacity */,
                    IRangeQuery.DEFAULT | IRangeQuery.CURSOR, null/* filter */);

            while (itr.hasNext()) {

                final ITuple tuple = itr.next();

                final IResourceMetadata resourceMetadata = (IResourceMetadata) SerializerUtil
                        .deserialize(tuple.getValue());

                // the create timestamp for that resource.
                final long createTime = resourceMetadata.getCreateTime();

                if (createTime >= commitTimeToPreserve) {

                    /*
                     * Do NOT delete any resources whose createTime is GTE the
                     * given commit time.
                     */

                    if (log.isInfoEnabled())
                        log.info("Stopping at resource GTE commitTime to preserve: createTime="
                                + createTime
                                + ", file="
                                + resourceMetadata.getFile());

                    break;

                }

                final UUID uuid = resourceMetadata.getUUID();

                if (resourcesInUse.contains(uuid)) {

                    // still required as of that timestamp.
                    continue;

                }

                try {

                    // delete the backing file.
                    deleteUnusedResource(resourceMetadata);

                } catch (Throwable t) {

                    // log error and keep going.
                    log.error("Could not delete segment - continuing: "
                            + resourceMetadata.getFile(), t);

                }

                // remove from the [segmentIndex]
                itr.remove();

                nsegments++;

            }

        }

        if (log.isInfoEnabled())
            log.info("Given " + resourcesInUse.size()
                    + " resources that are in use as of timestamp="
                    + commitTimeToPreserve + ", deleted " + njournals
                    + " journals and " + nsegments + " segments");

    }

    /**
     * Delete the resource in the file system and remove it from the
     * {@link #storeCache} and {@link #resourceFiles} and either
     * {@link #journalIndex} or {@link #segmentIndex} as appropriate.
     * <p>
     * <strong>DO NOT delete resources that are in use!</strong>
     * <p>
     * A resource that has not yet been incorporated into a view may be deleted
     * without further concern. However, once a resource has been incorporated
     * into a view then you MUST arrange for appropriate synchronization before
     * the resource may be deleted. For example, {@link #purgeOldResources()}
     * imposes the constraint that the caller is responsible for
     * synchronization and is generally invoked during synchronous overflow
     * since we know that there are no active writers at that time.
     * <p>
     * Pre-conditions:
     * <ul>
     * <li>The resource identified by that {@link UUID} exists and is not the
     * live journal.</li>
     * <li>The resource is not in use (not checked).</li>
     * <li>The resource is found in {@link #resourceFiles}.</li>
     * </ul>
     * Post-conditions:
     * <ul>
     * <li>The resource is closed if it was open and is no longer found in the
     * {@link #storeCache}.</li>
     * <li>The resource is no longer found in {@link #resourceFiles}.</li>
     * <li>The backing file for the resource has been deleted (the backing file
     * is obtained from {@link #resourceFiles}).</li>
     * <li>Various counters maintained by the {@link StoreManager} have been
     * updated (bytes deleted, bytes under management, etc.).</li>
     * <li>The file has been removed from either the {@link #journalIndex} or
     * the {@link #segmentIndex} as appropriate.</li>
     * </ul>
     *
     * @param uuid
     *            The {@link UUID} which identifies the resource.
     * @param isJournal
     *            <code>true</code> if the resource is a journal.
     */
    protected void deleteResource(final UUID uuid, final boolean isJournal)
            throws NoSuchStoreException {

        if (log.isInfoEnabled())
            log.info("deleteResource: uuid=" + uuid + ", isJournal="
                    + isJournal);

        if (uuid == null)
            throw new IllegalArgumentException();

        // Note: compare by value - the root block returns a distinct UUID
        // instance on each call.
        if (uuid.equals(liveJournalRef.get().getRootBlockView().getUUID())) {

            /*
             * Can't close out the live journal!
             *
             * Note: using the reference directly since invoked during startup
             * to delete index segments left lying around if there is an
             * incomplete move.
             */

            throw new IllegalArgumentException();

        }

        synchronized (retentionSet) {

            if (retentionSet.contains(uuid)) {

                throw new IllegalStateException("Resource in retentionSet: "
                        + uuid);

            }

        }

        /*
         * Close out store iff open.
         */
        {

            final IRawStore store = storeCache.remove(uuid);

            if (store != null) {

                final File file = store.getFile();

                if (isJournal) {
                    assert store instanceof AbstractJournal;
                } else {
                    assert store instanceof IndexSegmentStore;
                }

                try {

                    if (store.isOpen()) {

                        // make sure the store is closed.
                        store.close();

                    }

                } catch (IllegalStateException t) {

                    /*
                     * There should not be closed journals in the cache since
                     * they are only closed by the finalizer.
                     *
                     * However, an IndexSegmentStore will be closed if the
                     * IndexSegment is closed and it can still be in the cache
                     * until its reference is cleared when it gets finalized.
                     *
                     * Note: if there is a concurrent close then that might be
                     * interesting and should at least be explored further.
                     */

                    if (isJournal)
                        // probably a problem.
                        log.error(file, t);
                    else
                        // probably NOT a problem.
                        log.warn(file, t);

                }

            }

        }

        /*
         * delete the backing file.
         */
        {

            final File file = resourceFiles.remove(uuid);

            if (log.isInfoEnabled())
                log.info("DELETE: file=" + file + ", uuid=" + uuid
                        + ", isJournal=" + isJournal);

            if (file == null) {

                /*
                 * Note: This can happen if you confuse the indexUUID and the
                 * indexSegment's UUID in the code. The former is on the
                 * IndexMetadata while the latter (the one that you want) is on
                 * the SegmentMetadata.
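                 *
                 * For reference, given an IndexMetadata [indexMetadata] and a
                 * SegmentMetadata [segmentMetadata] (hypothetical local names;
                 * both types are already used by this class):
                 *
                 *   indexMetadata.getIndexUUID() // UUID of the scale-out index
                 *   segmentMetadata.getUUID()    // UUID of the segment resource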
                 */

                throw new NoSuchStoreException(uuid);

            }

            if (!file.exists()) {

                throw new RuntimeException("Not found: " + file);

            }

            final long length = file.length();

            if (!file.delete()) {

                throw new RuntimeException("Could not delete: " + file);

            }

            // track #of bytes deleted since startup.
            bytesDeleted.addAndGet(length);

            // track #of bytes still under management.
            bytesUnderManagement.addAndGet(-length);

            if (isJournal) {
                journalBytesUnderManagement.addAndGet(-length);
                journalDeleteCount.incrementAndGet();
            } else {
                segmentBytesUnderManagement.addAndGet(-length);
                segmentStoreDeleteCount.incrementAndGet();
            }

        }

        /*
         * Remove the resource from either journalIndex or segmentIndex as
         * appropriate.
         */
        {

            boolean found = false;

            if (isJournal) {

                synchronized (journalIndex) {

                    @SuppressWarnings("unchecked")
                    final ITupleIterator<JournalMetadata> itr = journalIndex
                            .rangeIterator(null/* fromKey */, null/* toKey */,
                                    0/* capacity */, IRangeQuery.DEFAULT
                                            | IRangeQuery.CURSOR, null/* filter */);

                    while (itr.hasNext()) {

                        final IResourceMetadata md = itr.next().getObject();

                        if (md.getUUID().equals(uuid)) {

                            itr.remove();

                            found = true;

                            break;

                        }

                    }

                }

            } else {

                synchronized (segmentIndex) {

                    @SuppressWarnings("unchecked")
                    final ITupleIterator<SegmentMetadata> itr = segmentIndex
                            .rangeIterator(null/* fromKey */, null/* toKey */,
                                    0/* capacity */, IRangeQuery.DEFAULT
                                            | IRangeQuery.CURSOR, null/* filter */);

                    while (itr.hasNext()) {

                        final IResourceMetadata md = itr.next().getObject();

                        if (md.getUUID().equals(uuid)) {

                            itr.remove();

                            found = true;

                            break;

                        }

                    }

                }

            }

            if (!found)
                throw new NoSuchStoreException(uuid);

        }

    }

    /**
     * Variant used by {@link #deleteUnusedResources(long, Set)}, which is in
     * turn invoked by {@link #purgeOldResources()}. This implementation is
     * different in that we have the {@link IResourceMetadata} on hand when we
     * need to delete the resource. I judge it worth the redundancy in the code
     * to have a variant specific to this use case so that the DELETE log
     * messages report the {@link IResourceMetadata#getCreateTime() create time}
     * which can be used as a cross-check on {@link #purgeOldResources()}.
     * <p>
     * Pre-conditions:
     * <ul>
     * <li>The resource described by the {@link IResourceMetadata} exists and
     * is not the live journal.</li>
     * <li>The resource is not in use (not checked).</li>
     * <li>The resource is found in {@link #resourceFiles}.</li>
     * </ul>
     * Post-conditions:
     * <ul>
     * <li>The resource is closed if it was open and is no longer found in the
     * {@link #storeCache}.</li>
     * <li>The resource is no longer found in {@link #resourceFiles}.</li>
     * <li>The {@link ILRUCache} for that resource has been cleared.</li>
     * <li>The backing file for the resource has been deleted (the backing file
     * is obtained from {@link #resourceFiles}).</li>
     * <li>Various counters maintained by the {@link StoreManager} have been
     * updated (bytes deleted, bytes under management, etc.).</li>
     * </ul>
     * <p>
     * Note: The caller MUST remove the entry for the resource from either
     * {@link #journalIndex} or the {@link #segmentIndex} as appropriate. For
     * this use case, the caller can handle that efficiently since they are
     * already traversing an iterator on the appropriate {@link BTree} and can
     * use {@link Iterator#remove()} to delete the corresponding entry from the
     * {@link BTree}.
     *
     * @param resourceMetadata
     *            The metadata describing the resource to be deleted.
     */
    private void deleteUnusedResource(final IResourceMetadata resourceMetadata) {

        if (log.isInfoEnabled())
            log.info("deleteUnusedResource: " + resourceMetadata);

        if (resourceMetadata == null)
            throw new IllegalArgumentException();

        final UUID uuid = resourceMetadata.getUUID();

        // Note: compare by value - the root block returns a distinct UUID
        // instance on each call.
        if (uuid.equals(liveJournalRef.get().getRootBlockView().getUUID())) {

            /*
             * Can't close out the live journal!
             *
             * Note: using the reference directly since invoked during startup
             * to delete index segments left lying around if there is an
             * incomplete move.
             */

            throw new IllegalArgumentException();

        }

        synchronized (retentionSet) {

            if (retentionSet.contains(uuid)) {

                throw new IllegalStateException("Resource in retentionSet: "
                        + uuid);

            }

        }

        /*
         * Close out store iff open.
         */
        {

            final IRawStore store = storeCache.remove(uuid);

            if (store != null) {

                final File file = store.getFile();

                if (resourceMetadata.isJournal()) {
                    assert store instanceof AbstractJournal;
                } else {
                    assert store instanceof IndexSegmentStore;
                }

                try {

                    if (store.isOpen()) {

                        // make sure the store is closed.
                        store.close();

                    }

                } catch (IllegalStateException t) {

                    /*
                     * There should not be closed journals in the cache since
                     * they are only closed by the finalizer.
                     *
                     * However, an IndexSegmentStore will be closed if the
                     * IndexSegment is closed and it can still be in the cache
                     * until its reference is cleared when it gets finalized.
                     *
                     * Note: if there is a concurrent close then that might be
                     * interesting and should at least be explored further.
                     */

                    if (resourceMetadata.isJournal())
                        // probably a problem.
                        log.error(file, t);
                    else
                        // probably NOT a problem.
                        log.warn(file, t);

                }

            }

        }

        // @see BLZG-1501 (remove LRUNexus)
//        /*
//         * Clear record for that store from the LRUNexus and remove the entry
//         * for the store itself from the LRUNexus.
//         */
//        if (LRUNexus.INSTANCE != null) {
//
//            LRUNexus.INSTANCE.deleteCache(uuid);
//
//        }

        /*
         * delete the backing file.
         */
        {

            final File file = resourceFiles.remove(uuid);

            /*
             * Note: This logs the file as reported by [resourceFiles] as well
             * as the file in IResourceMetadata in case any discrepancy arises.
             */
//            if (log.isInfoEnabled())
//                log.info("DELETE: " + resourceMetadata + " : " + file);
            log.warn("DELETE: " + resourceMetadata + " : " + file);

            if (file == null) {

                /*
                 * Note: This can happen if you confuse the indexUUID and the
                 * indexSegment's UUID in the code. The former is on the
                 * IndexMetadata while the latter (the one that you want) is on
                 * the SegmentMetadata.
                 */

                throw new NoSuchStoreException(uuid);

            }

            if (!file.exists()) {

                throw new RuntimeException("Not found: " + file);

            }

            final long length = file.length();

            if (!file.delete()) {

                throw new RuntimeException("Could not delete: " + file);

            }

            // track #of bytes deleted since startup.
            bytesDeleted.addAndGet(length);

            // track #of bytes still under management.
            bytesUnderManagement.addAndGet(-length);

            if (resourceMetadata.isJournal()) {
                journalBytesUnderManagement.addAndGet(-length);
                journalDeleteCount.incrementAndGet();
            } else {
                segmentBytesUnderManagement.addAndGet(-length);
                segmentStoreDeleteCount.incrementAndGet();
            }

        }

    }

    /**
     * Finds the journal spanning the first {@link ICommitRecord} that is
     * strictly greater than the specified timestamp and returns the timestamp
     * of that {@link ICommitRecord}.
     *
     * @param releaseTime
     *            A release time as set by {@link #setReleaseTime(long)}. Any
     *            resource as of this timestamp is available for release.
     *
     * @return The timestamp of the first commit record whose timestamp is
     *         strictly greater than <i>releaseTime</i>.
     *
     * @throws IllegalArgumentException
     *             If there is no commit point that is strictly greater than
     *             the releaseTime. This implies that the release time is
     *             either in the future or, if the releaseTime is equal to the
     *             last commitTime, that you are trying to release everything
     *             in the database.
     */
    protected long getCommitTimeStrictlyGreaterThan(final long releaseTime) {

        final ManagedJournal journal = (ManagedJournal) getJournal(releaseTime);

        if (journal == null) {

            throw new IllegalArgumentException("No data for releaseTime="
                    + releaseTime);

        }

        final IRootBlockView rootBlockView = journal.getRootBlockView();

        final ICommitRecord commitRecord = journal
                .getCommitRecordStrictlyGreaterThan(releaseTime);

        if (commitRecord == null) {

            final long closeTime = rootBlockView.getCloseTime();

            if (closeTime == 0L) {

                /*
                 * Since this journal is not closed, we know that the next
                 * commit would be on this journal, but there is no commit for
                 * that release time.
                 */

                throw new IllegalArgumentException("No data for releaseTime="
                        + releaseTime);

            }

            /*
             * Otherwise this journal was closed as of this timestamp.
             * getJournal(timestamp) returns the journal having data for the
             * timestamp. However, since we are interested in the _next_ commit
             * point, we need to recursively invoke ourselves with the close
             * time of this journal.
             */

            log.warn("Examining prior journal (fence post): closeTime="
                    + closeTime + ", releaseTime=" + releaseTime);

            return getCommitTimeStrictlyGreaterThan(closeTime);

        }

        /*
         * This is the timestamp associated with the commit point that is the
         * first commit point strictly greater than the given release time.
         */
        final long commitTime = commitRecord.getTimestamp();

        log.warn("Chose commitTime=" + commitTime + " given releaseTime="
                + releaseTime);

        assert commitTime > releaseTime;

        return commitTime;

    }

    /**
     * Finds all resources used by any registered index as of the
     * <i>commitTimeToPreserve</i> up to and including the lastCommitTime for
     * the live journal.
     * <p>
     * Note: We include all dependencies for all commit points subsequent to
     * the probe in order to ensure that we do not accidentally release
     * dependencies required for more current views of the index.
     * <p>
     * Note: This method solely considers the index views as defined at each
     * commit point starting with the given commit point. It DOES NOT pay
     * attention to the release time or to any other aspect of the state of the
     * system.
     *
     * @param commitTimeToPreserve
     *            The commit time corresponding to the first commit point which
     *            must be preserved.
     *
     * @return The set of resource {@link UUID}s required by at least one index
     *         for any commit time GTE the specified commit time.
     */
    protected Set<UUID> getResourcesForTimestamp(final long commitTimeToPreserve) {

        if (log.isDebugEnabled())
            log.debug("commitTimeToPreserve=" + commitTimeToPreserve
                    + ", lastCommitTime="
                    + getLiveJournal().getRootBlockView().getLastCommitTime());

        // must be a commitTime.
        if (commitTimeToPreserve <= 0)
            throw new IllegalArgumentException();

        final Set<UUID> uuids = new LinkedHashSet<UUID>(512);

        /*
         * The live journal is always a dependency, even if there are no
         * indices declared.
         */
        uuids.add(getLiveJournal().getRootBlockView().getUUID());

        /*
         * Scan all journals having data for commit points GTE the given
         * [commitTime].
         *
         * Note: We have to scan ALL journals since they are organized by their
         * createTime in the [journalIndex] not their [lastCommitTime].
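         *
         * In outline, the scan below proceeds as follows (this is a summary
         * of the code that follows, not additional behavior):
         *
         *   for each journal (ordered by createTime):
         *     if journal.lastCommitTime GTE commitTimeToPreserve:
         *       for each commit record GTE commitTimeToPreserve:
         *         for each Name2Addr entry (named index):
         *           if its checkpointAddr has not been seen yet:
         *             Checkpoint -> IndexMetadata -> LocalPartitionMetadata
         *             and add the UUID of each resource in that view.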
         */
        synchronized (journalIndex) {

            @SuppressWarnings("unchecked")
            final ITupleIterator<JournalMetadata> itr = journalIndex
                    .rangeIterator();

            while (itr.hasNext()) {

                final ITuple<JournalMetadata> tuple = itr.next();

                final JournalMetadata journalMetadata = tuple.getObject();

                final UUID uuid = journalMetadata.getUUID();

                final ManagedJournal journal = (ManagedJournal) openStore(uuid);

                // the last commit point on that journal.
                final long lastCommitTime = journal.getRootBlockView()
                        .getLastCommitTime();

                if (lastCommitTime < commitTimeToPreserve) {

                    /*
                     * Ignore this journal since its last commit point is
                     * strictly LT our starting [commitTime].
                     *
                     * Note: Since the index partition views are re-defined on
                     * the new journal by each synchronous overflow operation,
                     * we do not need to consider older journals in order to
                     * discover the resources used by all index partition views
                     * defined as of the start of any given journal.
                     */

                    continue;

                }

                /*
                 * Scan commit points on that journal.
                 */
                {

                    if (log.isDebugEnabled())
                        log.debug("Examining journal: file="
                                + journal.getFile() + ", lastCommitTime="
                                + lastCommitTime + ", uuid="
                                + journal.getRootBlockView().getUUID());

                    /*
                     * The index of commit points for the journal, loaded from
                     * the last commit point on the journal. This is Ok since
                     * we always want to read up to the lastCommitPoint on each
                     * journal, including on the live journal.
                     *
                     * Note: This is NOT the live CommitRecordIndex. The live
                     * CommitRecordIndex is NOT protected for use by concurrent
                     * threads.
                     */
                    final CommitRecordIndex commitRecordIndex = journal
                            .getCommitRecordIndex(journal.getRootBlockView()
                                    .getCommitRecordIndexAddr(), true/* readOnly */);

                    /*
                     * A per-journal hash set of the [checkpointAddr] for the
                     * BTree's that we have examined so that we can skip over
                     * any BTree whose state has not been changed since the
                     * last commit point (if it has the same checkpointAddr in
                     * two different commit points then its state has not
                     * changed between those commit points).
                     */
                    final Set<Long/* checkpointAddr */> addrs = new HashSet<Long>(
                            512);

                    /*
                     * Scan timestamps from [commitTime] through to the end.
                     * For each tuple, fetch the corresponding [commitRecord].
                     * For each commitRecord, fetch the Name2Addr index and
                     * visit its Entries.
                     */
                    @SuppressWarnings("unchecked")
                    final ITupleIterator<ICommitRecord> itr2 = commitRecordIndex
                            .rangeIterator(commitTimeToPreserve/* fromKey */,
                                    null/* toKey */);

                    while (itr2.hasNext()) {

                        final ITuple tuple2 = itr2.next();

                        final CommitRecordIndex.Entry entry2 = (CommitRecordIndex.Entry) tuple2
                                .getObject();

                        /*
                         * For each distinct checkpoint, load the BTree and
                         * fetch its local partition metadata which specifies
                         * its resource dependencies. For each resource, add it
                         * to the set of resources that we are collecting. All
                         * of those resources MUST be retained.
                         */

                        final ICommitRecord commitRecord = commitRecordIndex
                                .fetchCommitRecord(entry2);

                        final Name2Addr name2addr = (Name2Addr) Name2Addr
                                .load(journal,
                                        commitRecord
                                                .getRootAddr(AbstractJournal.ROOT_NAME2ADDR),
                                        true/* readOnly */);

                        @SuppressWarnings("unchecked")
                        final ITupleIterator<Name2Addr.Entry> itr3 = name2addr
                                .rangeIterator();

                        while (itr3.hasNext()) {

                            final ITuple<Name2Addr.Entry> tuple3 = itr3.next();

                            final Name2Addr.Entry entry3 = tuple3.getObject();

                            final long checkpointAddr = entry3.checkpointAddr;

                            if (addrs.add(checkpointAddr)) {

                                /*
                                 * New checkpoint address.
                                 */

                                if (log.isDebugEnabled())
                                    log.debug("index: name=" + entry3.name);

                                // load checkpoint record from the store.
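                                // The chain is: Checkpoint (per-index
                                // checkpoint record) -> IndexMetadata ->
                                // LocalPartitionMetadata, which names every
                                // journal / index segment in that view.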
                                final Checkpoint checkpoint = Checkpoint.load(
                                        journal, entry3.checkpointAddr);

                                // read the index metadata object for that checkpoint.
                                final IndexMetadata indexMetadata = IndexMetadata
                                        .read(journal, checkpoint
                                                .getMetadataAddr());

                                // this is where the definition of the view is stored.
                                final LocalPartitionMetadata pmd = indexMetadata
                                        .getPartitionMetadata();

                                if (pmd == null) {

                                    /*
                                     * For scale-out, all indices should be
                                     * index partitions and should define the
                                     * resources required by their view.
                                     *
                                     * Note: However, the metadata service is
                                     * not currently partitioned so you will see
                                     * unpartitioned indices there.
                                     */

                                    continue;

                                }

                                for (IResourceMetadata t : pmd.getResources()) {

                                    if (uuids.add(t.getUUID())) {

                                        if (log.isInfoEnabled())
                                            log.info("Dependency: file="
                                                    + t.getFile() + ", uuid="
                                                    + t.getUUID() + ", view="
                                                    + pmd);

                                    }

                                } // next resource in view

                            } // end if

                        } // next Name2Addr.Entry

                    } // next CommitRecordIndex.Entry

                } // block

            } // while(journalIndex.rangeIterator.hasNext())

        } // synchronized( journalIndex )

        if (log.isInfoEnabled())
            log.info("commitTime=" + commitTimeToPreserve + ", #used="
                    + uuids.size());

        return uuids;

    }

    /**
     * Munge an index name so that it is suitable for use in a filesystem. In
     * particular, any non-word characters are converted to an underscore
     * character ("_"). This gets rid of all punctuation characters and
     * whitespace in the index name itself, but will not translate Unicode
     * characters.
     *
     * @param s
     *            The name of the scale-out index.
     *
     * @return A string suitable for inclusion in a filename.
     */
    static public String munge(final String s) {

        return s.replaceAll("[\\W]", "_");

    }

    @Override
    public File getIndexSegmentFile(final IndexMetadata indexMetadata) {

        if (indexMetadata == null)
            throw new IllegalArgumentException();

        final IPartitionMetadata pmd = indexMetadata.getPartitionMetadata();

        return getIndexSegmentFile(indexMetadata.getName(), indexMetadata
                .getIndexUUID(), pmd == null ? -1 : pmd.getPartitionId());

    }

    /**
     * Return the file on which a new {@link IndexSegment} should be written.
     * The file will exist but will have zero length. The file is created using
     * the {@link File#createTempFile(String, String, File)} mechanism within
     * the configured {@link #dataDir} in the subdirectory for the specified
     * scale-out index.
     * <p>
     * Note: The index name appears in the file path above the {@link UUID} of
     * the scale-out index. Therefore it is not possible to have collisions
     * arise in the file system when given indices whose scale-out names differ
     * only in characters that are munged onto the same character since the
     * files will always be stored in a directory specific to the scale-out
     * index.
     *
     * @param scaleOutIndexName
     *            The name of the scale-out index.
     * @param indexUUID
     *            The UUID of the scale-out index.
     * @param partitionId
     *            The index partition identifier -or- <code>-1</code> if the
     *            index is not partitioned (handles the MDS which does not use
     *            partitioned indices at this time).
     *
     * @return The {@link File} on which a {@link IndexSegmentStore} for that
     *         index partition may be written. The file will be unique and
     *         empty.
     *
     * @throws IllegalArgumentException
     *             if any argument is <code>null</code>
     * @throws IllegalArgumentException
     *             if the partitionId is negative and not <code>-1</code>
     *
     * @todo should the filename be relative or absolute?
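     */

    /*
     * For example (illustrative only - the zero-padding width of the shard
     * identifier and the random characters injected by createTempFile() will
     * vary), a scale-out index named "RDF.SPO" with partitionId 12 is munged
     * to "RDF_SPO" and yields a file like:
     *
     *   <dataDir>/segments/RDF_SPO/<indexUUID>/RDF_SPO_shardId000012_<random>.seg
     */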
    public File getIndexSegmentFile(final String scaleOutIndexName,
            final UUID indexUUID, final int partitionId) {

        assertOpen();

        if (scaleOutIndexName == null)
            throw new IllegalArgumentException();

        if (indexUUID == null)
            throw new IllegalArgumentException();

        if (partitionId < -1)
            throw new IllegalArgumentException();

        // munge index name to fit the file system.
        final String mungedName = munge(scaleOutIndexName);

        // subdirectory into which the individual index segs will be placed.
        final File indexDir = new File(segmentsDir, mungedName
                + File.separator + indexUUID.toString());

        // make sure that directory exists.
        indexDir.mkdirs();

        final String partitionStr = (partitionId == -1 ? "" : "_shardId"
                + leadingZeros.format(partitionId));

        final String prefix = mungedName + partitionStr + "_";

        final File file;
        try {

            file = File.createTempFile(prefix, Options.SEG, indexDir);

        } catch (IOException e) {

            throw new RuntimeException(e);

        }

        if (log.isInfoEnabled())
            log.info("Created file: " + file);

        return file;

    }

    /**
     * This attempts to obtain the exclusive lock for the
     * {@link WriteExecutorService}. If successful, it purges any resources
     * that are no longer required based on
     * {@link StoreManager.Options#MIN_RELEASE_AGE} and optionally truncates
     * the live journal such that no free space remains in the journal.
     * <p>
     * Note: If there is heavy write activity on the service then the timeout
     * may well expire before the exclusive write lock becomes available.
     * Further, the acquisition of the exclusive write lock will throttle
     * concurrent write activity and negatively impact write performance if the
     * system is heavily loaded by write tasks.
     *
     * @param timeout
     *            The timeout (in milliseconds) that the method will await the
     *            pause of the write service.
     * @param truncateJournal
     *            When <code>true</code>, the live journal will be truncated to
     *            its minimum extent (all writes will be preserved but there
     *            will be no free space left in the journal). This may be used
     *            to force the {@link DataService} to its minimum possible
     *            footprint for the configured history retention policy. If
     *            writes are directed to the live journal after it has been
     *            truncated then it will be transparently re-extended.
     *
     * @return <code>true</code> if successful and <code>false</code> if the
     *         write service could not be paused after the specified timeout.
     *
     * @throws InterruptedException
     * @throws IllegalStateException
     *             if the {@link StoreManager} is not running.
     */
    public boolean purgeOldResources(final long timeout,
            final boolean truncateJournal) throws InterruptedException {

        final WriteExecutorService writeService = getConcurrencyManager()
                .getWriteService();

        if (writeService.tryLock(timeout, TimeUnit.MILLISECONDS)) {

            assertRunning();

            try {

                final Event event = new Event(getFederation(),
                        new EventResource(), EventType.PurgeResources).start();

                try {

                    final PurgeResult purgeResult = purgeOldResources();

                    if (purgeResult != null) {

                        log.warn(purgeResult.toString());

                        event.addDetails(purgeResult.getParams());

                    }

                    if (truncateJournal) {

                        assertRunning();

                        getLiveJournal().truncate();

                    }

                } finally {

                    event.end();

                }

                return true;

            } finally {

                // release the lock.
                writeService.unlock();

            }

        } else {

            log.warn("Purge resources did not run: timeout=" + timeout);

            return false;

        }

    }

    /**
     * When the {@link StoreManager} is relatively new (as measured by the #of
     * bytes under management) we discount the journal extent in order to
     * trigger overflow earlier. Together with the discount applied to the
     * split handler by the {@link AsynchronousOverflowTask}, this helps to
     * break down new index partitions allocated on the new data service and
     * re-distribute those index partitions (if there are other data services
     * which have even less utilization).
     *
     * @param p
     *            The properties (modified as side-effect).
     */
    protected void overrideJournalExtent(final Properties p) {

        final long bytesUnderManagement = this.bytesUnderManagement.get();

        if (accelerateOverflowThreshold == 0
                || bytesUnderManagement >= accelerateOverflowThreshold) {

            /*
             * Crossed the threshold where we no longer accelerate overflow.
             */

            return;

        }

        final double d = (double) bytesUnderManagement
                / accelerateOverflowThreshold;

        final long initialExtent = Long.parseLong(p.getProperty(
                Options.INITIAL_EXTENT, Options.DEFAULT_INITIAL_EXTENT));

        final long maximumExtent = Long.parseLong(p.getProperty(
                Options.MAXIMUM_EXTENT, Options.DEFAULT_MAXIMUM_EXTENT));

        /*
         * Don't allow a journal w/ less than 10M or the minimum specified by
         * Options.
         */
        final long minimumExtent = Math.max(Options.minimumInitialExtent,
                Bytes.megabyte * 10);

        /*
         * Use the same value for initial and maximum extents since we plan to
         * overflow rapidly. We choose the value as a discount on the maximum
         * extent. This prevents numerous extensions until we get near to the
         * maximum extent.
         */
        final long adjustedExtent = Math.max(minimumExtent,
                (long) (maximumExtent * d));

        p.setProperty(Options.INITIAL_EXTENT, Long.toString(adjustedExtent));

        p.setProperty(Options.MAXIMUM_EXTENT, Long.toString(adjustedExtent));

        if (log.isInfoEnabled())
            log.info("discount=" + d //
                    + ", bytesUnderManagement=" + bytesUnderManagement //
                    + ", threshold=" + accelerateOverflowThreshold //
                    + ", minimumInitialExtent=" + minimumExtent //
                    + ", initialExtent=" + initialExtent //
                    + ", maximumExtent=" + maximumExtent //
                    + ", adjustedExtent=" + adjustedExtent);

    }

}