package water;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import water.fvec.Chunk;
import water.util.Log;
import water.util.PrettyPrint;
/** Store Cleaner: User-Mode Swap-To-Disk */
class Cleaner extends Thread {
// msec time at which the STORE was dirtied.
// Long.MAX_VALUE if clean.
// Lowered (never raised) by dirty_store(long); reset to Long.MAX_VALUE by the
// Cleaner constructor and by run() at the moment a cleaning pass begins.
static private volatile long _dirty; // When was store dirtied
/** Earliest recorded dirty time in msec, or Long.MAX_VALUE when the store is clean.  Exposed for testing only. */
static long dirty() {
  final long when = _dirty;
  return when;
}
/** Mark the store as dirtied at the current wall-clock time. */
static void dirty_store() {
  final long nowMs = System.currentTimeMillis();
  dirty_store(nowMs);
}
/**
 * Record that the store was dirtied at time {@code x} (msec), keeping only
 * the earliest dirty time seen so far.
 */
static void dirty_store( long x ) {
  if( x >= _dirty ) return;     // An equal-or-earlier dirty time is already recorded
  _dirty = x;
}
// GC bookkeeping.  Nothing in this class writes these after initialization;
// presumably they are updated elsewhere after each GC cycle (TODO confirm).
static volatile long HEAP_USED_AT_LAST_GC;
static volatile long KV_USED_AT_LAST_GC;
static volatile long TIME_AT_LAST_GC=System.currentTimeMillis();
// The single Cleaner instance.  Its monitor is what kick_store_cleaner(),
// block_store_cleaner() and block_for_test() synchronize, wait and notify on.
static final Cleaner THE_CLEANER = new Cleaner();
/** Wake every thread waiting on the Cleaner's monitor, including a Cleaner asleep in block_store_cleaner(). */
static void kick_store_cleaner() {
  final Cleaner cleaner = THE_CLEANER;
  synchronized( cleaner ) {
    cleaner.notifyAll();
  }
}
/** Sleep on the Cleaner's monitor for up to 5 seconds, or until notified. */
private static void block_store_cleaner() {
  final Cleaner cleaner = THE_CLEANER;
  synchronized( cleaner ) {
    try {
      cleaner.wait(5000);
    } catch( InterruptedException ignore ) {
      // Deliberately ignored: the caller loops and re-checks its conditions
    }
  }
}
// Set to true (while holding this Cleaner's lock) at the end of every sweep in
// run(); cleared by block_for_test() before it waits for the next sweep.
volatile boolean _did_sweep;
/**
 * Testing hook: clear the sweep flag, then block until the Cleaner reports
 * that it has finished a sweep.
 * @throws InterruptedException if interrupted while waiting
 */
static void block_for_test() throws InterruptedException {
  final Cleaner cleaner = THE_CLEANER;
  cleaner._did_sweep = false;
  synchronized( cleaner ) {
    for( ;; ) {
      if( cleaner._did_sweep ) break; // A sweep completed since the flag was cleared
      cleaner.wait();                 // Woken by the notifyAll() at the end of a sweep
    }
  }
}
// Desired cache level. Set by the MemoryManager asynchronously.
// The value -1 is treated by run() as a test mode: clean everything, then
// reset DESIRED to 0 after that one sweep.
static volatile long DESIRED;
/**
 * Build the cleaner as a high-priority daemon thread named "MemCleaner",
 * mark the store clean, prime the histogram and set initial memory goals.
 * The thread is not started here.
 */
Cleaner() {
  super("MemCleaner");
  setDaemon(true);
  setPriority(Thread.MAX_PRIORITY-2);
  _dirty = Long.MAX_VALUE;      // Set to clean-store
  // Two forced histogram requests: the first builds/allocates a histogram,
  // the second asks for a recompute seeded with the first pass's oldest time.
  for( int pass = 0; pass < 2; pass++ )
    Histo.current(true);
  MemoryManager.set_goals("init",false);
}
/**
 * Is the disk (nearly) full?
 * @return true when the reported available space is non-negative and below
 *         5K bytes (5 &lt;&lt; 10); a negative report is never treated as full
 */
public static boolean isDiskFull() {
  final long free = availableDiskSpace();
  if( free < 0 ) return false;
  return free < (5 << 10);
}
/** @return the usable space reported by the ice store obtained from H2O.getPM() */
public static long availableDiskSpace() {
  final long usable = H2O.getPM().getIce().getUsableSpace();
  return usable;
}
// Cleaner thread runs in a forever loop.  (This call cannot be synchronized,
// lest we hold the lock during a (very long) clean process - and various
// async callbacks attempt to "kick" the Cleaner awake - which will require
// taking the lock... blocking the kicking thread for the duration.)
//
// Each iteration either goes back to sleep (cache below DESIRED and nothing
// dirty for 5 secs) or makes one pass over the raw K/V store, writing old
// Chunks to disk and - when forced - freeing their in-memory forms.  At the
// end of every pass _did_sweep is set and waiters on this object are woken.
@Override /*synchronized*/ public void run() {
  boolean diskFull = false;     // Sticky: set on a failed write, re-probed when forced
  while( true ) {
    // Sweep the K/V store, writing out Values (cleaning) and free'ing
    // - Clean all "old" values (lazily, optimistically)
    // - Clean and free old values if above the desired cache level
    // Do not let optimistic cleaning get in the way of emergency cleaning.

    // Get a recent histogram, computing one as needed
    Histo h = Histo.current(false);
    long now = System.currentTimeMillis();
    long dirty = _dirty; // When things first got dirtied

    // Start cleaning if: "dirty" was set a "long" time ago, or we are beyond
    // the desired cache levels. Inverse: go back to sleep if the cache
    // is below desired levels & nothing has been dirty awhile.
    if( h._cached < DESIRED && // Cache is low and
        (now-dirty < 5000) ) { // not dirty a long time
      // Block asleep, waking every 5 secs to check for stuff, or when poked
      block_store_cleaner();
      continue; // Awoke; loop back and re-check histogram.
    }

    now = System.currentTimeMillis();
    _dirty = Long.MAX_VALUE; // Reset, since we are going to write stuff out
    MemoryManager.set_goals("preclean",false);

    // The age beyond which we need to toss out things to hit the desired
    // caching levels. If forced, be exact (toss out the minimal amount).
    // If lazy, store-to-disk things down to 1/2 the desired cache level
    // and anything older than 5 secs.
    boolean force = (h._cached >= DESIRED || !MemoryManager.CAN_ALLOC); // Forced to clean
    if( force && diskFull ) // Try to clear the diskFull flag
      diskFull = isDiskFull();
    long clean_to_age = h.clean_to(force ? DESIRED : (DESIRED>>1));
    // If not forced cleaning, expand the cleaning age to allow Values
    // more than 5sec old
    if( !force ) clean_to_age = Math.max(clean_to_age,now-5000);
    if( DESIRED == -1 ) clean_to_age = now; // Test mode: clean all

    // No logging if under memory pressure: can deadlock the cleaner thread
    String s = h+" DESIRED="+(DESIRED>>20)+"M dirtysince="+(now-dirty)+" force="+force+" clean2age="+(now-clean_to_age);
    if( MemoryManager.canAlloc() ) Log.debug(s);
    else System.err.println(s);

    long cleaned = 0; // Disk i/o bytes successfully written
    long freed = 0;   // memory freed bytes
    long io_ns = 0;   // i/o ns writing

    // For faster K/V store walking get the NBHM raw backing array,
    // and walk it directly.
    Object[] kvs = H2O.STORE.raw_array();

    // Start the walk at slot 2, because slots 0,1 hold meta-data
    for( int i=2; i<kvs.length; i += 2 ) {
      // In the raw backing array, Keys and Values alternate in slots
      Object ok = kvs[i], ov = kvs[i+1];
      if( !(ok instanceof Key ) ) continue; // Ignore tombstones and Primes and null's
      if( !(ov instanceof Value) ) continue; // Ignore tombstones and Primes and null's
      Value val = (Value)ov;
      byte[] m = val.rawMem();
      Object p = val.rawPOJO();
      if( m == null && p == null ) continue; // Nothing to throw out
      if( val.isLockable() ) continue; // we do not want to throw out Lockables.
      boolean isChunk = p instanceof Chunk && !((Chunk)p).isVolatile();

      // Ignore things younger than the required age. In particular, do
      // not spill-to-disk all dirty things we find.
      long touched = val._lastAccessedTime;
      if( touched > clean_to_age ) { // Too recently touched?
        // But can toss out a byte-array if already deserialized & on disk
        // (no need for both forms). Note no savings for Chunks, for which m==p._mem
        if( val.isPersisted() && m != null && p != null && !isChunk ) {
          val.freeMem(); // Toss serialized form, since can rebuild from POJO
          freed += val._max;
        }
        dirty_store(touched); // But may write it out later
        continue; // Too young
      }

      // Spiller turned off?
      if( !H2O.ARGS.cleaner ) continue;

      // CNC - Memory cleaning turned off, except for Chunks
      // Too many POJOs are written to dynamically; cannot spill & reload
      // them without losing changes.

      // Should I write this value out to disk?
      // Should I further force it from memory?
      if( isChunk && !val.isPersisted() && !diskFull && ((Key)ok).home() ) { // && (force || (lazyPersist() && lazy_clean(key)))) {
        long now_ns = System.nanoTime();
        boolean written = false;  // Only count bytes as spilled when the write succeeded
        try { val.storePersist(); written = true; } // Write to disk
        catch( FileNotFoundException fnfe ) { continue; } // Can happen due to racing key delete/remove
        catch( IOException e ) {
          Log.warn( isDiskFull()
                    ? "Disk full! Disabling swapping to disk." + (force?" Memory low! Please free some space in " + H2O.ICE_ROOT + "!":"")
                    : "Disk swapping failed! " + e.getMessage());
          // Something is wrong so mark disk as full anyways so we do not
          // attempt to write again. (will retry next run when memory is low)
          diskFull = true;
        }
        if( m == null ) m = val.rawMem();
        if( written && m != null ) cleaned += m.length; // Accumulate i/o bytes
        io_ns += System.nanoTime() - now_ns; // Accumulate i/o time (failed attempts included)
      }

      // And, under pressure, free all
      if( isChunk && force && (val.isPersisted() || !((Key)ok).home()) ) {
        val.freeMem (); m = null;
        val.freePOJO(); p = null;
        // isChunk guarantees a non-null Chunk POJO here, and a Chunk's bytes
        // are shared between val._pojo._mem & val._mem, so exactly one
        // buffer of val._max bytes is released whether or not the
        // serialized form was also present.
        freed += val._max;
      }

      // If we have both forms, toss the byte[] form - can be had by
      // serializing again.
      if( m != null && p != null && !isChunk ) {
        val.freeMem();
        freed += val._max;
      }

      // If a GC cycle happened and we can no longer alloc, start forcing
      // from RAM as we go
      force = (h._cached >= DESIRED || !MemoryManager.CAN_ALLOC); // Forced to clean
    }

    String s1 = "Cleaner pass took: "+PrettyPrint.msecs(System.currentTimeMillis()-now,true)+
      ", spilled "+PrettyPrint.bytes(cleaned)+" in "+PrettyPrint.usecs(io_ns>>10);
    h = Histo.current(true); // Force a new histogram
    MemoryManager.set_goals("postclean",false);
    // No logging if under memory pressure: can deadlock the cleaner thread
    String s2 = h+" diski_o="+PrettyPrint.bytes(cleaned)+", freed="+(freed>>20)+"M, DESIRED="+(DESIRED>>20)+"M";
    if( MemoryManager.canAlloc() ) Log.debug(s1,s2);
    else System.err.println(s1+"\n"+s2);

    // For testing thread
    synchronized(this) {
      _did_sweep = true;
      if( DESIRED == -1 ) DESIRED = 0; // Turn off test-mode after 1 sweep
      notifyAll(); // Wake up testing thread
    }
  }
}
// Histogram class
static class Histo {
// Current best histogram.  Replaced (never mutated in place) by current()
// when a new histogram is built; read without locking by cached()/swapped().
static private volatile Histo H;
// Return the current best histogram, building a new one if it is getting
// stale.  Synchronized so that only one histogram is computed at a time.
//
// force - when false, a histogram younger than 2 secs is returned as-is.
//         Even when true, an existing histogram is reused if both it and
//         the K/V store are clean, since nothing can have changed.
// Returns the (possibly freshly built) histogram; never null.
synchronized static Histo current( boolean force ) {
  final Histo h = H;            // Grab current best histogram
  if( h != null ) {             // No histogram built yet?  Fall through and build the first one
    if( !force && System.currentTimeMillis() < h._when+2000 )
      return h;                 // It is recent; use it
    if( h._clean && _dirty==Long.MAX_VALUE )
      return h;                 // No change to the K/V store, so no point
  }
  // Seed the next histogram with the oldest access time seen by the last one
  return (H = new Histo(h==null ? 0 : h._oldest)); // Record current best histogram & return it
}
// Latest best-effort figures taken from whatever histogram is current,
// without forcing a histogram to be built nor blocking for one in progress.
/** Bytes cached in RAM as of the current histogram. */
static long cached() {
  final Histo latest = H;
  return latest._cached;
}
/** Bytes persisted to disk as of the current histogram. */
static long swapped() {
  final Histo latest = H;
  return latest._swapped;
}
// Histogram state, filled in once by the constructor.
// _hs holds cached bytes bucketed by last-access time: bucket i covers
// access times starting at _eldest + i*_hStep, with out-of-range times
// clamped into the first or last bucket.
final long[] _hs = new long[128];
long _oldest; // Time of the oldest K/V discovered this pass
long _eldest; // Time of the eldest K/V found in some prior pass
long _hStep; // Histogram step: (now-eldest)/histogram.length
long _cached; // Total alive data in the histogram
long _total; // Total data in local K/V
long _when; // When was this histogram computed
long _swapped;// On-disk stuff
Value _vold; // For assertions: record the oldest Value
boolean _clean; // Was "clean" K/V when built?
// Compute a histogram by walking the raw K/V store once.
//
// eldest - access time (msec) used as the start of bucket zero; current()
//          passes the oldest access time found by the previous histogram,
//          or 0 for the very first one.
Histo( long eldest ) {
  Arrays.fill(_hs, 0);
  _when = System.currentTimeMillis();
  _eldest = eldest; // Eldest seen in some prior pass
  _hStep = Math.max(1,(_when-eldest)/_hs.length); // Never less than 1, so the bucket division is safe
  boolean clean = _dirty==Long.MAX_VALUE; // Was the store clean when we started?
  // Compute the hard way
  Object[] kvs = H2O.STORE.raw_array();
  long cached = 0; // Total K/V cached in ram
  long total = 0; // Total K/V in local node
  long swapped=0; // Total K/V persisted
  long oldest = Long.MAX_VALUE; // K/V with the longest time since being touched
  Value vold = null;
  // Start the walk at slot 2, because slots 0,1 hold meta-data
  for( int i=2; i<kvs.length; i += 2 ) {
    // In the raw backing array, Keys and Values alternate in slots
    Object ok = kvs[i], ov = kvs[i+1];
    if( !(ok instanceof Key ) ) continue; // Ignore tombstones and Primes and null's
    if( !(ov instanceof Value) ) continue; // Ignore tombstones and Primes and null's
    Value val = (Value)ov;
    if( val.isNull() ) { Value.STORE_get(val._key); continue; } // Another flavor of NULL
    total += val._max;
    if( val.isPersisted() ) swapped += val._max;
    int len = 0;
    byte[] m = val.rawMem();
    Object p = val.rawPOJO();
    if( m != null ) len += val._max;
    if( p != null ) len += val._max;
    if( m != null && p instanceof Chunk ) len -= val._max; // Do not double-count Chunks
    if( len == 0 ) continue;
    cached += len; // Accumulate total amount of cached keys
    if( val._lastAccessedTime < oldest ) { // Found an older Value?
      vold = val; // Record oldest Value seen
      oldest = val._lastAccessedTime;
    }
    // Compute histogram bucket.  Clamp while still a long: when eldest is
    // far from the access time (e.g. eldest==Long.MAX_VALUE after a pass
    // that found nothing cached) the quotient does not fit in an int, and
    // narrowing first would pick an arbitrary bucket.
    long bucket = (val._lastAccessedTime - eldest)/_hStep;
    int idx;
    if( bucket < 0 ) idx = 0;
    else if( bucket >= _hs.length ) idx = _hs.length-1;
    else idx = (int)bucket;
    _hs[idx] += len; // Bump histogram bucket
  }
  _cached = cached; // Total cached; every cached byte is also counted in some bucket
  _total = total; // Total used data
  _swapped = swapped;
  _oldest = oldest; // Oldest seen in this pass; Long.MAX_VALUE if nothing was cached
  _vold = vold;
  _clean = clean && _dirty==Long.MAX_VALUE; // Looks like a clean K/V the whole time?
}
// Compute the access-time cutoff (in msec) below which Values must be
// thrown out in order to bring cached memory under the desired level.
// Returns the start of bucket zero when already under the desired level.
long clean_to( long desired ) {
  long cutoff = _eldest;                // Age of bucket zero
  if( _cached < desired ) return cutoff; // Already there; nothing to remove
  long tossed = 0;                      // Running total of bytes tossed out
  for( int b = 0; b < _hs.length; b++ ) {
    tossed += _hs[b];                   // Toss this whole bucket
    cutoff += _hStep;                   // Cutoff moves past this bucket
    if( _cached - tossed < desired ) break; // Enough tossed to hit the goal
  }
  return cutoff;
}
// Pretty print: cached megabytes, bucket-zero time, and offsets from it of
// the oldest Value, the histogram span and the current time.
@Override public String toString() {
  final long base = _eldest;
  final long now = System.currentTimeMillis();
  final StringBuilder sb = new StringBuilder();
  sb.append("H(cached:").append(_cached>>20).append("M, eldest:").append(base);
  sb.append("L < +").append(_oldest-base).append("ms <...{").append(_hStep);
  sb.append("ms}...< +").append(_hStep*_hs.length).append("ms < +").append(now-base).append(")");
  return sb.toString();
}
}
}