package water; import java.io.*; import java.net.*; import java.nio.ByteBuffer; import java.nio.channels.DatagramChannel; import java.util.*; import jsr166y.*; import water.Job.JobCancelledException; import water.fvec.Chunk; import water.fvec.Frame; import water.ga.EventHit; import water.ga.GoogleAnalytics; import water.nbhm.NonBlockingHashMap; import water.persist.*; import water.util.*; import water.util.Log.Tag.Sys; import water.license.LicenseManager; import java.nio.channels.ServerSocketChannel; /** * Start point for creating or joining an <code>H2O</code> Cloud. * * @author <a href="mailto:cliffc@h2o.ai"></a> * @version 1.0 */ public final class H2O { public static volatile AbstractEmbeddedH2OConfig embeddedH2OConfig; public static volatile ApiIpPortWatchdogThread apiIpPortWatchdog; public static volatile LicenseManager licenseManager; public static String VERSION = "(unknown)"; public static long START_TIME_MILLIS = -1; // User name for this Cloud (either the username or the argument for the option -name) public static String NAME; // The default port for finding a Cloud public static int DEFAULT_PORT = 54321; public static int H2O_PORT; // Fast/small UDP transfers public static int API_PORT; // RequestServer and the new API HTTP port // Whether to use single precision as the upper limit for storing floating point numbers public static boolean SINGLE_PRECISION = false; // Max. number of factor levels per column (before flipping all to NAs) public static int DATA_MAX_FACTOR_LEVELS = 1000000; public static int LOG_CHK = 22; // Chunks are 1<<22, or 4Meg // The multicast discovery port static MulticastSocket CLOUD_MULTICAST_SOCKET; static NetworkInterface CLOUD_MULTICAST_IF; static InetAddress CLOUD_MULTICAST_GROUP; static int CLOUD_MULTICAST_PORT; // Default NIO Datagram channel static DatagramChannel CLOUD_DGRAM; // Myself, as a Node in the Cloud public static H2ONode SELF = null; public static InetAddress SELF_ADDRESS; public static String DEFAULT_ICE_ROOT() { String username = System.getProperty("user.name"); if (username == null) username = ""; String u2 = username.replaceAll(" ", "_"); if (u2.length() == 0) u2 = "unknown"; return "/tmp/h2o-" + u2; } public static URI ICE_ROOT; // Initial arguments public static String[] ARGS; public static final PrintStream OUT = System.out; public static final PrintStream ERR = System.err; public static final int NUMCPUS = Runtime.getRuntime().availableProcessors(); // Convenience error public static RuntimeException unimpl(String msg) { return new RuntimeException("unimplemented: " + msg); } public static RuntimeException unimpl() { return new RuntimeException("unimplemented"); } public static RuntimeException fail() { return new RuntimeException("do not call"); } public static RuntimeException fail(String msg) { return new RuntimeException("FAILURE: " + msg); } // Central /dev/null for ignored exceptions public static void ignore(Throwable e) { ignore(e,"[h2o] Problem ignored: "); } public static void ignore(Throwable e, String msg) { ignore(e, msg, true); } public static void ignore(Throwable e, String msg, boolean printException) { Log.debug(Sys.WATER, msg + (printException? e.toString() : "")); } // Google Analytics performance measurement public static GoogleAnalytics GA; public static int CLIENT_TYPE_GA_CUST_DIM = 1; // -------------------------------------------------------------------------- // Embedded configuration for a full H2O node to be implanted in another // piece of software (e.g. Hadoop mapper task).
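// A minimal usage sketch (illustrative, not part of the original source): an
// embedding driver registers its config before starting H2O. The subclass
// name EmbeddedConfig below is hypothetical.
//
//   AbstractEmbeddedH2OConfig cfg = new EmbeddedConfig(); // driver-specific subclass
//   H2O.setEmbeddedH2OConfig(cfg);  // must happen before H2O.main() starts services
//   H2O.main(args);                 // cloud-size and exit notifications now flow through cfg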
/** * Register embedded H2O configuration object with H2O instance. */ public static void setEmbeddedH2OConfig(AbstractEmbeddedH2OConfig c) { embeddedH2OConfig = c; } public static AbstractEmbeddedH2OConfig getEmbeddedH2OConfig() { return embeddedH2OConfig; } /** * Tell the embedding software that this H2O instance belongs to * a cloud of a certain size. * This call may be non-blocking. * * @param ip IP address this H2O can be reached at. * @param port Port this H2O can be reached at (for REST API and browser). * @param size Number of H2O instances in the cloud. */ public static void notifyAboutCloudSize(InetAddress ip, int port, int size) { if (embeddedH2OConfig == null) { return; } embeddedH2OConfig.notifyAboutCloudSize(ip, port, size); } /** * Notify the embedding software that this H2O instance wants to exit. * @param status H2O's requested process exit value. */ public static void exit(int status) { // embeddedH2OConfig is only valid if this H2O node is living inside // another software instance (e.g. a Hadoop mapper task). // // Expect embeddedH2OConfig to be null if H2O is run standalone. // Cleanly shut down internal H2O services. if (apiIpPortWatchdog != null) { apiIpPortWatchdog.shutdown(); } if (embeddedH2OConfig == null) { // Standalone H2O path. System.exit(status); } // Embedded H2O path (e.g. inside Hadoop mapper task). embeddedH2OConfig.exit(status); // Should never reach here. System.exit(222); } /** Shut this node down by sending a shutdown UDP packet. */ public void shutdown() { UDPRebooted.T.shutdown.send(H2O.SELF); H2O.exit(0); } // -------------------------------------------------------------------------- // The Current Cloud. A list of all the Nodes in the Cloud. Changes if we // decide to change Clouds via atomic Cloud update. static public volatile H2O CLOUD = new H2O(new H2ONode[0],0,0); // --- // A dense array indexing all Cloud members. Fast reversal from "member#" to // Node. No holes. Cloud size is _members.length. public final H2ONode[] _memary; public final int _hash; //public boolean _healthy; // A dense integer identifier that rolls over rarely. Rollover limits the // number of simultaneous nested Clouds we are operating on in-parallel. // Really capped to 1 byte, under the assumption we won't have 256 nested // Clouds. Capped at 1 byte so it can be part of an atomically-assigned // 'long' holding info specific to this Cloud. public final char _idx; // no unsigned byte, so unsigned char instead // Is nnn larger than old (accounting for wrap-around)? Gets confused if we // start seeing a mix of more than 128 unique clouds at the same time. Used // to tell the order of Clouds appearing. static public boolean larger( int nnn, int old ) { assert (0 <= nnn && nnn <= 255); assert (0 <= old && old <= 255); return ((nnn-old)&0xFF) < 64; } static public boolean isHealthy() { H2O cloud = H2O.CLOUD; for (H2ONode h2o : cloud._memary) { if(!h2o._node_healthy) return false; } return true; } // Static list of acceptable Cloud members public static HashSet<H2ONode> STATIC_H2OS = null; // Reverse cloud index to a cloud; limit of 256 old clouds. static private final H2O[] CLOUDS = new H2O[256]; // Enables debug features like more logging and multiple instances per JVM public static final String DEBUG_ARG = "h2o.debug"; public static final boolean DEBUG = System.getProperty(DEBUG_ARG) != null; // Construct a new H2O Cloud from the member list public H2O( H2ONode[] h2os, int hash, int idx ) { _memary = h2os; // Need to clone? Arrays.sort(_memary); // ... sorted!
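// (Note, added for clarity: keeping _memary sorted lets nidx() below use
// Arrays.binarySearch, and makes toString() deterministic, which matters
// because the cloud hash depends on toString().)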
_hash = hash; // And record hash for cloud rollover _idx = (char)(idx&0x0ff); // Roll-over at 256 } // One-shot atomic setting of the next Cloud, with an empty K/V store. // Called single-threaded from Paxos. Constructs the new H2O Cloud from a // member list. void set_next_Cloud( H2ONode[] h2os, int hash ) { synchronized(this) { int idx = _idx+1; // Unique 1-byte Cloud index if( idx == 256 ) idx=1; // wrap, avoiding zero CLOUDS[idx] = CLOUD = new H2O(h2os,hash,idx); } SELF._heartbeat._cloud_size=(char)CLOUD.size(); } public final int size() { return _memary.length; } public final H2ONode leader() { return _memary[0]; } public static void waitForCloudSize(int x) { waitForCloudSize(x, 10000); } public static void waitForCloudSize(int x, long ms) { long start = System.currentTimeMillis(); while( System.currentTimeMillis() - start < ms ) { if( CLOUD.size() >= x && Paxos._commonKnowledge ) break; try { Thread.sleep(100); } catch( InterruptedException ie ) { } } if( H2O.CLOUD.size() < x ) throw new RuntimeException("Cloud size under " + x); } // Find the node index for this H2ONode, or a negative number on a miss public int nidx( H2ONode h2o ) { return Arrays.binarySearch(_memary,h2o); } public boolean contains( H2ONode h2o ) { return nidx(h2o) >= 0; } // BIG WARNING: do not change this toString() method, since the cloud hash value depends on it @Override public String toString() { return Arrays.toString(_memary); } public String toPrettyString() { if (_memary==null || _memary.length==0) return "[]"; int iMax = _memary.length - 1; StringBuilder sb = new StringBuilder(); sb.append('['); for (int i = 0; ; i++) { sb.append(String.valueOf(_memary[i])); if (_memary[i]!=null) sb.append(" (").append(PrettyPrint.msecs(_memary[i].runtime(),false)).append(')'); if (i==iMax) return sb.append(']').toString(); sb.append(", "); } } /** * Return a list of interfaces sorted by importance (most important first). * This is the order we want to test for matches when selecting an interface. */ private static ArrayList<NetworkInterface> calcPrioritizedInterfaceList() { ArrayList<NetworkInterface> networkInterfaceList = null; try { Enumeration<NetworkInterface> nis = NetworkInterface.getNetworkInterfaces(); ArrayList<NetworkInterface> tmpList = Collections.list(nis); Comparator<NetworkInterface> c = new Comparator<NetworkInterface>() { @Override public int compare(NetworkInterface lhs, NetworkInterface rhs) { // Handle null inputs. if ((lhs == null) && (rhs == null)) { return 0; } if (lhs == null) { return 1; } if (rhs == null) { return -1; } // If the names are equal, then they are equal. if (lhs.getName().equals (rhs.getName())) { return 0; } // If both are bond drivers, choose a precedence. if (lhs.getName().startsWith("bond") && (rhs.getName().startsWith("bond"))) { Integer li = lhs.getName().length(); Integer ri = rhs.getName().length(); // The bond with the most characters always has the highest priority. if (li.compareTo(ri) != 0) { return li.compareTo(ri); } // Otherwise, sort lexicographically by name. return lhs.getName().compareTo(rhs.getName()); } // If only one is a bond driver, give that precedence. if (lhs.getName().startsWith("bond")) { return -1; } if (rhs.getName().startsWith("bond")) { return 1; } // Everything that isn't a bond driver is equal. return 0; } }; Collections.sort(tmpList, c); networkInterfaceList = tmpList; } catch( SocketException e ) { Log.err(e); } return networkInterfaceList; } /** * Return a list of internet addresses sorted by importance (most important first).
* This is the order we want to test for matches when selecting an internet address. */ public static ArrayList<java.net.InetAddress> calcPrioritizedInetAddressList() { ArrayList<java.net.InetAddress> ips = new ArrayList<java.net.InetAddress>(); { ArrayList<NetworkInterface> networkInterfaceList = calcPrioritizedInterfaceList(); for (int i = 0; i < networkInterfaceList.size(); i++) { NetworkInterface ni = networkInterfaceList.get(i); Enumeration<InetAddress> ias = ni.getInetAddresses(); while( ias.hasMoreElements() ) { InetAddress ia; ia = ias.nextElement(); ips.add(ia); Log.info("Possible IP Address: " + ni.getName() + " (" + ni.getDisplayName() + "), " + ia.getHostAddress()); } } } return ips; } public static InetAddress findInetAddressForSelf() throws Error { if(SELF_ADDRESS == null) { if ((OPT_ARGS.ip != null) && (OPT_ARGS.network != null)) { Log.err("ip and network options must not be used together"); H2O.exit(-1); } ArrayList<UserSpecifiedNetwork> networkList = UserSpecifiedNetwork.calcArrayList(OPT_ARGS.network); if (networkList == null) { Log.err("Exiting."); H2O.exit(-1); } // Get a list of all valid IPs on this machine. ArrayList<InetAddress> ips = calcPrioritizedInetAddressList(); InetAddress local = null; // My final choice // Check for an "-ip xxxx" option and accept a valid user choice; required // if there are multiple valid IP addresses. InetAddress arg = null; if (OPT_ARGS.ip != null) { try{ arg = InetAddress.getByName(OPT_ARGS.ip); } catch( UnknownHostException e ) { Log.err(e); H2O.exit(-1); } if( !(arg instanceof Inet4Address) ) { Log.warn("Only IP4 addresses allowed."); H2O.exit(-1); } if( !ips.contains(arg) ) { Log.warn("IP address not found on this machine"); H2O.exit(-1); } local = arg; } else if (networkList.size() > 0) { // Return the first match from the list, if any. // If there are no matches, then exit. Log.info("Network list was specified by the user. Searching for a match..."); for( InetAddress ip : ips ) { Log.info(" Considering " + ip.getHostAddress() + " ..."); for ( UserSpecifiedNetwork n : networkList ) { if (n.inetAddressOnNetwork(ip)) { Log.info(" Matched " + ip.getHostAddress()); local = ip; SELF_ADDRESS = local; return SELF_ADDRESS; } } } Log.err("No interface matches the network list from the -network option. Exiting."); H2O.exit(-1); } else { // No user-specified IP address. Attempt auto-discovery. Roll through // all the network choices, looking for a single Inet4. ArrayList<InetAddress> validIps = new ArrayList(); for( InetAddress ip : ips ) { // make sure the given IP address can be found here if( ip instanceof Inet4Address && !ip.isLoopbackAddress() && !ip.isLinkLocalAddress() ) { validIps.add(ip); } } if( validIps.size() == 1 ) { local = validIps.get(0); } else { local = guessInetAddress(validIps); } } // The above fails with no network connection; in that case, go for a truly // local host. if( local == null ) { try { Log.warn("Failed to determine IP, falling back to localhost."); // set default ip address to be 127.0.0.1 /localhost local = InetAddress.getByName("127.0.0.1"); } catch( UnknownHostException e ) { throw Log.errRTExcept(e); } } SELF_ADDRESS = local; } return SELF_ADDRESS; } private static InetAddress guessInetAddress(List<InetAddress> ips) { String m = "Multiple local IPs detected:\n"; for(InetAddress ip : ips) m+=" " + ip; m+="\nAttempting to determine correct address...\n"; Socket s = null; try { // using Google's DNS server as an external IP to reach // Add a timeout to the touch of Google.
// https://0xdata.atlassian.net/browse/HEX-743 s = new Socket(); // only 3000 milliseconds before giving up // Exceptions: IOException, SocketTimeoutException, plus two Illegal* exceptions s.connect(new InetSocketAddress("8.8.8.8", 53), 3000); m+="Using " + s.getLocalAddress() + "\n"; return s.getLocalAddress(); } catch( java.net.SocketException se ) { return null; // No network at all? (Laptop w/wifi turned off?) } catch( java.net.SocketTimeoutException se ) { return null; // could be firewall? } catch( Throwable t ) { Log.err(t); return null; } finally { Log.info(m); Utils.close(s); } } // -------------------------------------------------------------------------- // The (local) set of Key/Value mappings. static final NonBlockingHashMap<Key,Value> STORE = new NonBlockingHashMap<Key, Value>(); // Dummy shared volatile for ordering games static public volatile int VOLATILE; // PutIfMatch // - Atomically update the STORE, returning the old Value on success // - Kick the persistence engine as needed // - Return existing Value on fail, no change. // // Keys are interned here: I always keep the existing Key, if any. The // existing Key is blind jammed into the Value prior to atomically inserting // it into the STORE and interning. // // Because of the blind jam, there is a narrow unusual race where the Key // might exist but be stale (deleted, mapped to a TOMBSTONE), a fresh put() // can find it and jam it into the Value, then the Key can be deleted // completely (e.g. via an invalidate), the table can resize flushing the // stale Key, an unrelated weak-put can re-insert a matching Key (but as a // new Java object), and delete it, and then the original thread can do a // successful put_if_later over the missing Key and blow the invariant that a // stored Value always points to the physically equal Key that maps to it // from the STORE. If this happens, some of the replication management bits in // the Key will be set in the wrong Key copy... leading to extra rounds of // replication. public static Value putIfMatch( Key key, Value val, Value old ) { if( old != null ) // Have an old value? key = old._key; // Use prior key if( val != null ) val._key = key; // Insert into the K/V store Value res = STORE.putIfMatchUnlocked(key,val,old); if( res != old ) return res; // Return the failure cause // Persistence-tickle. // If the K/V mapping is going away, remove the old guy. // If the K/V mapping is changing, let the store cleaner just overwrite. // If the K/V mapping is new, let the store cleaner just create if( old != null && val == null ) old.removeIce(); // Remove the old guy if( val != null ) { dirty_store(); // Start storing the new guy Scope.track(key); } return old; // Return success } // Raw put; no marking the memory as out-of-sync with disk. Used to import // initial keys from local storage, or to intern keys. public static Value putIfAbsent_raw( Key key, Value val ) { Value res = STORE.putIfMatchUnlocked(key,val,null); assert res == null; return res; } // Get the value from the store public static Value get( Key key ) { return STORE.get(key); } public static Value raw_get( Key key ) { return STORE.get(key); } public static Key getk( Key key ) { return STORE.getk(key); } public static Set<Key> localKeySet( ) { return STORE.keySet(); } public static Collection<Value> values( ) { return STORE.values(); } public static int store_size() { return STORE.size(); } // -------------------------------------------------------------------------- // The worker pools - F/J pools with different priorities.
// These priorities are carefully ordered and asserted for... modify with // care. The real problem here is that we can get into cyclic deadlock // unless we spawn a thread of priority "X+1" in order to allow progress // on a queue which might be flooded with a large number of "<=X" tasks. // // Example of deadlock: suppose TaskPutKey and the Invalidate ran at the same // priority on a 2-node cluster. Both nodes flood their own queues with // writes to unique keys, which require invalidates to run on the other node. // Suppose the flooding depth exceeds the thread-limit (e.g. 99); then each // node might have all 99 worker threads blocked in TaskPutKey, awaiting // remote invalidates - but the other nodes' threads are also all blocked // awaiting invalidates! // // We fix this by being willing to always spawn a thread working on jobs at // priority X+1, and guaranteeing there are no jobs above MAX_PRIORITY - // i.e., jobs running at MAX_PRIORITY cannot block, and when those jobs are // done, the next lower level jobs get unblocked, etc. public static final byte MAX_PRIORITY = Byte.MAX_VALUE-1; public static final byte ACK_ACK_PRIORITY = MAX_PRIORITY-0; public static final byte FETCH_ACK_PRIORITY = MAX_PRIORITY-1; public static final byte ACK_PRIORITY = MAX_PRIORITY-2; public static final byte DESERIAL_PRIORITY = MAX_PRIORITY-3; public static final byte INVALIDATE_PRIORITY = MAX_PRIORITY-3; public static final byte GET_KEY_PRIORITY = MAX_PRIORITY-4; public static final byte PUT_KEY_PRIORITY = MAX_PRIORITY-5; public static final byte ATOMIC_PRIORITY = MAX_PRIORITY-6; public static final byte GUI_PRIORITY = MAX_PRIORITY-7; public static final byte MIN_HI_PRIORITY = MAX_PRIORITY-7; public static final byte MIN_PRIORITY = 0; // F/J threads that remember the priority of the last task they started // working on. public static class FJWThr extends ForkJoinWorkerThread { public int _priority; FJWThr(ForkJoinPool pool) { super(pool); _priority = ((ForkJoinPool2)pool)._priority; setPriority( _priority == Thread.MIN_PRIORITY ? Thread.NORM_PRIORITY-1 : Thread.MAX_PRIORITY-1 ); setName("FJ-"+_priority+"-"+getPoolIndex()); } } // Factory for F/J threads, with caps that vary with priority. static class FJWThrFact implements ForkJoinPool.ForkJoinWorkerThreadFactory { private final int _cap; FJWThrFact( int cap ) { _cap = cap; } @Override public ForkJoinWorkerThread newThread(ForkJoinPool pool) { int cap = _cap == -1 ? 4 * NUMCPUS : _cap; // honor the per-pool cap; -1 means the 4*NUMCPUS default return pool.getPoolSize() <= cap ? new FJWThr(pool) : null; } } // A standard FJ Pool, with an expected priority level. static class ForkJoinPool2 extends ForkJoinPool { final int _priority; private ForkJoinPool2(int p, int cap) { super((OPT_ARGS == null || OPT_ARGS.nthreads <= 0) ? NUMCPUS : OPT_ARGS.nthreads, new FJWThrFact(cap), null, p<MIN_HI_PRIORITY); _priority = p; } private H2OCountedCompleter poll2() { return (H2OCountedCompleter)pollSubmission(); } } // Hi-priority work, sorted into individual queues per-priority. // Capped at a small number of threads per pool. private static final ForkJoinPool2 FJPS[] = new ForkJoinPool2[MAX_PRIORITY+1]; static { // Only need 1 thread for the AckAck work, as it cannot block FJPS[ACK_ACK_PRIORITY] = new ForkJoinPool2(ACK_ACK_PRIORITY,1); for( int i=MIN_HI_PRIORITY+1; i<MAX_PRIORITY; i++ ) FJPS[i] = new ForkJoinPool2(i,NUMCPUS); // All CPUs, but no more for blocking purposes FJPS[GUI_PRIORITY] = new ForkJoinPool2(GUI_PRIORITY,2); } // Easy peeks at the FJ queues static int getWrkQueueSize (int i) { return FJPS[i]==null ?
-1 : FJPS[i].getQueuedSubmissionCount();} static int getWrkThrPoolSize(int i) { return FJPS[i]==null ? -1 : FJPS[i].getPoolSize(); } // Submit to the correct priority queue public static H2OCountedCompleter submitTask( H2OCountedCompleter task ) { int priority = task.priority(); assert MIN_PRIORITY <= priority && priority <= MAX_PRIORITY:"priority " + priority + " is out of range, expected range is <" + MIN_PRIORITY + "," + MAX_PRIORITY + ">"; if( FJPS[priority]==null ) synchronized( H2O.class ) { if( FJPS[priority] == null ) FJPS[priority] = new ForkJoinPool2(priority,-1); } FJPS[priority].submit(task); return task; } // Simple wrapper over F/J CountedCompleter to support priority queues. F/J // queues are simple unordered (and extremely light weight) queues. However, // we frequently need priorities to avoid deadlock and to promote efficient // throughput (e.g. failure to respond quickly to TaskGetKey can block an // entire node for lack of some small piece of data). So each attempt to do // lower-priority F/J work starts with an attempt to work & drain the // higher-priority queues. public static abstract class H2OCountedCompleter<T extends H2OCountedCompleter> extends CountedCompleter implements Cloneable { public H2OCountedCompleter(){} protected H2OCountedCompleter(H2OCountedCompleter completer){super(completer);} // Once per F/J task, drain the high priority queue before doing any low // priority work. @Override public final void compute() { FJWThr t = (FJWThr)Thread.currentThread(); int pp = ((ForkJoinPool2)t.getPool())._priority; // Drain the high priority queues before the normal F/J queue H2OCountedCompleter h2o = null; try { assert priority() == pp; // Job went to the correct queue? assert t._priority <= pp; // Thread attempting the job is at most this priority? final int p2 = Math.max(pp,MIN_HI_PRIORITY); for( int p = MAX_PRIORITY; p > p2; p-- ) { if( FJPS[p] == null ) continue; h2o = FJPS[p].poll2(); if( h2o != null ) { // Got a hi-priority job? t._priority = p; // Set & do it now! t.setPriority(Thread.MAX_PRIORITY-1); h2o.compute2(); // Do it ahead of normal F/J work p++; // Check again the same queue } } } catch( Throwable ex ) { // If the higher priority job popped an exception, complete it // exceptionally... but then carry on and do the lower priority job. if( h2o != null ) h2o.onExceptionalCompletion(ex, h2o.getCompleter()); else ex.printStackTrace(); } finally { t._priority = pp; if( pp == MIN_PRIORITY ) t.setPriority(Thread.NORM_PRIORITY-1); } // Now run the task as planned compute2(); } // Do the actual, intended work public abstract void compute2(); @Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) { if(!(ex instanceof JobCancelledException) && !(ex instanceof IllegalArgumentException) && this.getCompleter() == null) ex.printStackTrace(); return true; } // In order to prevent deadlock, threads that block waiting for a reply // from a remote node need the remote task to run at a higher priority // than themselves. This field tracks the required priority.
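// An illustrative subclass (not from the original source; names are
// hypothetical) showing the intended pattern: put the work in compute2(),
// and override priority() only when the task must not be starved.
//
//   class SumTask extends H2O.H2OCountedCompleter<SumTask> {
//     long _sum;
//     @Override public void compute2() {
//       for( long i=0; i<1000; i++ ) _sum += i; // the actual work
//       tryComplete();                          // signal completion up the chain
//     }
//     // e.g.: @Override public byte priority() { return H2O.GET_KEY_PRIORITY; }
//   }
//   H2O.submitTask(new SumTask()); // lands on the queue matching priority()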
public byte priority() { return MIN_PRIORITY; } @Override public T clone(){ try { return (T)super.clone(); } catch( CloneNotSupportedException e ) { throw water.util.Log.errRTExcept(e); } } } public static abstract class H2OCallback<T extends H2OCountedCompleter> extends H2OCountedCompleter{ public H2OCallback(){} public H2OCallback(H2OCountedCompleter cc){super(cc);} @Override public void compute2(){throw new UnsupportedOperationException();} @Override public void onCompletion(CountedCompleter caller){callback((T) caller);} public abstract void callback(T t); } public static class H2OEmptyCompleter extends H2OCountedCompleter{ public H2OEmptyCompleter(){} public H2OEmptyCompleter(H2OCountedCompleter cc){super(cc);} @Override public void compute2(){throw new UnsupportedOperationException();} } // -------------------------------------------------------------------------- public static OptArgs OPT_ARGS = new OptArgs(); public static class OptArgs extends Arguments.Opt { public String name; // set_cloud_name_and_mcast() public String flatfile; // set_cloud_name_and_mcast() public int baseport; // starting number to search for open ports public int port; // set_cloud_name_and_mcast() public String ip; // Named IP4/IP6 address instead of the default public String network; // Network specification for acceptable interfaces to bind to. public String ice_root; // ice root directory public String hdfs; // HDFS backend public String hdfs_version; // version of the filesystem public String hdfs_config; // configuration file of the HDFS public String hdfs_skip = null; // used by hadoop driver to not unpack and load any hdfs jar file at runtime. public String aws_credentials; // properties file for aws credentials public String keepice; // Do not delete ice on startup public String soft = null; // soft launch for demos public String random_udp_drop = null; // test only, randomly drop udp incoming public int pparse_limit = Integer.MAX_VALUE; public String no_requests_log = null; // disable logging of Web requests public boolean check_rest_params = true; // enable checking of unused/unknown REST params; e.g., -check_rest_params=false disables checking of unknown REST params public int nthreads=NUMCPUS; // desired F/J parallelism level for low priority queues.
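// For example (hypothetical values): "java -jar h2o.jar -nthreads 16 -license /path/to/file"
// is parsed in main() via new Arguments(args).extract(OPT_ARGS), filling nthreads above and license below.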
public String license; // License file public String h = null; public String help = null; public String version = null; public String single_precision = null; public int data_max_factor_levels; public String many_cols = null; public int chunk_bytes; public String beta = null; public String mem_watchdog = null; // For developer debugging public boolean md5skip = false; public boolean ga_opt_out = false; public String ga_hadoop_ver = null; public boolean no_ice = false; } public static void printHelp() { String s = "Start an H2O node.\n" + "\n" + "Usage: java [-Xmx<size>] -jar h2o.jar [options]\n" + " (Note that every option has a default and is optional.)\n" + "\n" + " -h | -help\n" + " Print this help.\n" + "\n" + " -version\n" + " Print version info and exit.\n" + "\n" + " -name <h2oCloudName>\n" + " Cloud name used for discovery of other nodes.\n" + " Nodes with the same cloud name will form an H2O cloud\n" + " (also known as an H2O cluster).\n" + "\n" + " -flatfile <flatFileName>\n" + " Configuration file explicitly listing H2O cloud node members.\n" + "\n" + " -ip <ipAddressOfNode>\n" + " IP address of this node.\n" + "\n" + " -port <port>\n" + " Port number for this node (note: port+1 is also used).\n" + " (The default port is " + DEFAULT_PORT + ".)\n" + "\n" + " -network <IPv4network1Specification>[,<IPv4network2Specification> ...]\n" + " The IP address discovery code will bind to the first interface\n" + " that matches one of the networks in the comma-separated list.\n" + " Use instead of -ip when a broad range of addresses is legal.\n" + " (Example network specification: '10.1.2.0/24' allows 256 legal\n" + " possibilities.)\n" + "\n" + " -ice_root <fileSystemPath>\n" + " The directory where H2O spills temporary data to disk.\n" + " (The default is '" + DEFAULT_ICE_ROOT() + "'.)\n" + "\n" + " -single_precision\n" + " Reduce the max. (storage) precision for floating point numbers\n" + " from double to single precision to save memory for numerical data.\n" + " (The default is double precision.)\n" + "\n" + " -many_cols\n" + " Enables improved handling of high-dimensional datasets. Same as -chunk_bytes 24.\n" + "\n" + " -chunk_bytes <integer>\n" + " Experimental option. Not for use in combination with -many_cols. 
The log (base 2) of chunk size in bytes.\n" + " (The default is " + LOG_CHK + ", which leads to a chunk size of " + PrettyPrint.bytes(1<<LOG_CHK) + ".)\n" + "\n" + " -data_max_factor_levels <integer>\n" + " The maximum number of factor levels for categorical columns.\n" + " Columns with more than the specified number of factor levels\n" + " are converted into all missing values.\n" + " (The default is " + DATA_MAX_FACTOR_LEVELS + ".)\n" + "\n" + " -nthreads <#threads>\n" + " Maximum number of threads in the low priority batch-work queue.\n" + " (The default is 4*numcpus.)\n" + "\n" + " -license <licenseFilePath>\n" + " Path to license file on local filesystem.\n" + "\n" + "Cloud formation behavior:\n" + "\n" + " New H2O nodes join together to form a cloud at startup time.\n" + " Once a cloud is given work to perform, it locks out new members\n" + " from joining.\n" + "\n" + "Examples:\n" + "\n" + " Start an H2O node with 4GB of memory and a default cloud name:\n" + " $ java -Xmx4g -jar h2o.jar\n" + "\n" + " Start an H2O node with 6GB of memory and specify the cloud name:\n" + " $ java -Xmx6g -jar h2o.jar -name MyCloud\n" + "\n" + " Start an H2O cloud with three 2GB nodes and a default cloud name:\n" + " $ java -Xmx2g -jar h2o.jar &\n" + " $ java -Xmx2g -jar h2o.jar &\n" + " $ java -Xmx2g -jar h2o.jar &\n" + "\n"; System.out.print(s); } public static boolean IS_SYSTEM_RUNNING = false; /** Load an H2O build version or return the default unknown version. * @return never returns null */ public static AbstractBuildVersion getBuildVersion() { try { Class klass = Class.forName("water.BuildVersion"); java.lang.reflect.Constructor constructor = klass.getConstructor(); AbstractBuildVersion abv = (AbstractBuildVersion) constructor.newInstance(); return abv; // it exists on the classpath } catch (Exception e) { return AbstractBuildVersion.UNKNOWN_VERSION; } } /** * If logging has not been set up yet, then Log.info will only print to stdout. * This allows for early processing of the '-version' option without unpacking * the jar file and other startup stuff. */ public static void printAndLogVersion() { // Try to load a version AbstractBuildVersion abv = getBuildVersion(); String build_branch = abv.branchName(); String build_hash = abv.lastCommitHash(); String build_describe = abv.describe(); String build_project_version = abv.projectVersion(); String build_by = abv.compiledBy(); String build_on = abv.compiledOn(); Log.info ("----- H2O started -----"); Log.info ("Build git branch: " + build_branch); Log.info ("Build git hash: " + build_hash); Log.info ("Build git describe: " + build_describe); Log.info ("Build project version: " + build_project_version); Log.info ("Built by: '" + build_by + "'"); Log.info ("Built on: '" + build_on + "'"); Runtime runtime = Runtime.getRuntime(); double ONE_GB = 1024 * 1024 * 1024; Log.info ("Java availableProcessors: " + runtime.availableProcessors()); Log.info ("Java heap totalMemory: " + String.format("%.2f gb", runtime.totalMemory() / ONE_GB)); Log.info ("Java heap maxMemory: " + String.format("%.2f gb", runtime.maxMemory() / ONE_GB)); Log.info ("Java version: " + String.format("Java %s (from %s)", System.getProperty("java.version"), System.getProperty("java.vendor"))); Log.info ("OS version: " + String.format("%s %s (%s)", System.getProperty("os.name"), System.getProperty("os.version"), System.getProperty("os.arch"))); long totalMemory = OSUtils.getTotalPhysicalMemory(); Log.info ("Machine physical memory: " + (totalMemory==-1 ? 
"NA" : String.format("%.2f gb", totalMemory / ONE_GB))); } /** * We had a report from a user that H2O didn't start properly on MacOS X in a * case where the user was part of the root group. So warn about it. */ public static void printWarningIfRootOnMac() { String os_name = System.getProperty("os.name"); if (os_name.equals("Mac OS X")) { String user_name = System.getProperty("user.name"); if (user_name.equals("root")) { Log.warn("Running as root on MacOS; check if java binary is unintentionally setuid"); } } } public static String getVersion() { String build_project_version = "(unknown)"; try { Class klass = Class.forName("water.BuildVersion"); java.lang.reflect.Constructor constructor = klass.getConstructor(); AbstractBuildVersion abv = (AbstractBuildVersion) constructor.newInstance(); build_project_version = abv.projectVersion(); // it exists on the classpath } catch (Exception e) { // it does not exist on the classpath } return build_project_version; } // Start up an H2O Node and join any local Cloud public static void main( String[] args ) { Log.POST(300,""); // To support launching from JUnit, JUnit expects to call main() repeatedly. // We need exactly 1 call to main to startup all the local services. if (IS_SYSTEM_RUNNING) return; IS_SYSTEM_RUNNING = true; VERSION = getVersion(); // Pick this up from build-specific info. START_TIME_MILLIS = System.currentTimeMillis(); // Parse args Arguments arguments = new Arguments(args); arguments.extract(OPT_ARGS); ARGS = arguments.toStringArray(); printAndLogVersion(); printWarningIfRootOnMac(); GA = new GoogleAnalytics("UA-56665317-2","H2O",H2O.getVersion()); if((new File(".h2o_no_collect")).exists() || (new File(System.getProperty("user.home")+File.separator+".h2o_no_collect")).exists() || OPT_ARGS.ga_opt_out ) { GA.setEnabled(false); Log.info("Opted out of sending usage metrics."); } if (OPT_ARGS.baseport != 0) { DEFAULT_PORT = OPT_ARGS.baseport; } SINGLE_PRECISION = OPT_ARGS.single_precision != null; if (SINGLE_PRECISION) Log.info("Using single precision for floating-point numbers."); if (OPT_ARGS.data_max_factor_levels != 0) { DATA_MAX_FACTOR_LEVELS = OPT_ARGS.data_max_factor_levels; Log.info("Max. number of factor levels per column: " + DATA_MAX_FACTOR_LEVELS); } if (OPT_ARGS.chunk_bytes != 0 || OPT_ARGS.many_cols != null) { if (OPT_ARGS.many_cols != null) { LOG_CHK = 24; if (OPT_ARGS.chunk_bytes > 0) { Log.warn("-chunk_bytes is ignored since -many_cols was set."); } } else if (OPT_ARGS.chunk_bytes > 0) { LOG_CHK = OPT_ARGS.chunk_bytes; if (OPT_ARGS.chunk_bytes < 22) { Log.warn("-chunk_bytes < 22 is not officially supported. Use at your own risk."); } if (OPT_ARGS.chunk_bytes > 24) { Log.warn("-chunk_bytes > 24 is not officially supported. Use at your own risk."); } } } Log.info("Chunk size: " + PrettyPrint.bytes(1<<LOG_CHK)); // Get ice path before loading Log or Persist class String ice = DEFAULT_ICE_ROOT(); if( OPT_ARGS.ice_root != null ) ice = OPT_ARGS.ice_root.replace("\\", "/"); try { ICE_ROOT = new URI(ice); } catch(URISyntaxException ex) { throw new RuntimeException("Invalid ice_root: " + ice + ", " + ex.getMessage()); } Log.info ("ICE root: '" + ICE_ROOT + "'"); findInetAddressForSelf(); //if (OPT_ARGS.rshell.equals("false")) Log.POST(310,""); Log.wrap(); // Logging does not wrap when the rshell is on. // Start the local node startLocalNode(); Log.POST(320,""); String logDir = (Log.getLogDir() != null) ? 
Log.getLogDir() : "(unknown)"; Log.info ("Log dir: '" + logDir + "'"); // Load up from disk and initialize the persistence layer initializePersistence(); Log.POST(340, ""); initializeLicenseManager(); Log.POST(345, ""); // Start network services, including heartbeats & Paxos startNetworkServices(); // start server services Log.POST(350,""); startApiIpPortWatchdog(); // Check if the API port becomes unreachable Log.POST(360,""); if (OPT_ARGS.mem_watchdog != null) { startMemoryWatchdog(); Log.POST(370, ""); } startupFinalize(); // finalizes the startup & tests (if any) Log.POST(380,""); startGAStartupReport(); } /** Starts the local k-v store. * Initializes the local k-v store, local node and the local cloud with itself * as the only member. */ private static void startLocalNode() { // Print this first, so if any network stuff is affected it's clear this is going on. if (OPT_ARGS.random_udp_drop != null) { Log.warn("Debugging option RANDOM UDP DROP is ENABLED, make sure you really meant it"); } // Figure self out; this is surprisingly hard initializeNetworkSockets(); // Do not forget to put SELF into the static configuration (to simulate // proper multicast behavior) if( STATIC_H2OS != null && !STATIC_H2OS.contains(SELF)) { Log.warn("Flatfile configuration does not include self: " + SELF+ " but contains " + STATIC_H2OS); STATIC_H2OS.add(SELF); } Log.info ("H2O cloud name: '" + NAME + "'"); Log.info("(v"+VERSION+") '"+NAME+"' on " + SELF+(OPT_ARGS.flatfile==null ? (", discovery address "+CLOUD_MULTICAST_GROUP+":"+CLOUD_MULTICAST_PORT) : ", static configuration based on -flatfile "+OPT_ARGS.flatfile)); Log.info("If you have trouble connecting, try SSH tunneling from your local machine (e.g., via port 55555):\n" + " 1. Open a terminal and run 'ssh -L 55555:localhost:" + API_PORT + " " + System.getProperty("user.name") + "@" + SELF_ADDRESS.getHostAddress() + "'\n" + " 2. Point your browser to http://localhost:55555"); // Create the starter Cloud with 1 member SELF._heartbeat._jar_md5 = Boot._init._jarHash; Paxos.doHeartbeat(SELF); assert SELF._heartbeat._cloud_hash != 0; } /** Initializes the network services of the local node. * * Starts the worker threads, receiver threads, heartbeats and all other * network related services. */ private static void startNetworkServices() { // We've rebooted the JVM recently. Tell other Nodes they can ignore prior // tasks from us. Do this before we receive any packets UDPRebooted.T.reboot.broadcast(); // Start the UDPReceiverThread, to listen for requests from other Cloud // Nodes. There should be only 1 of these, and it never shuts down. // Started first, so we can start parsing UDP packets new UDPReceiverThread().start(); // Start the MultiReceiverThread, to listen for multi-cast requests from // other Cloud Nodes. There should be only 1 of these, and it never shuts // down. Started soon, so we can start parsing multicast UDP packets new MultiReceiverThread().start(); // Start the Persistent meta-data cleaner thread, which updates the K/V // mappings periodically to disk. There should be only 1 of these, and it // never shuts down. Needs to start BEFORE the HeartBeatThread to build // an initial histogram state. new Cleaner().start(); // Start the heartbeat thread, to publish this Cloud's existence to other // Clouds. This will typically trigger a round of Paxos voting so we can // join an existing Cloud. new HeartBeatThread().start(); // Start a UDP timeout worker thread. 
This guy only handles requests for // which we have not received a timely response and probably need to // arrange for a re-send to cover a dropped UDP packet. new UDPTimeOutThread().start(); new H2ONode.AckAckTimeOutThread().start(); // Start the TCPReceiverThread, to listen for TCP requests from other Cloud // Nodes. There should be only 1 of these, and it never shuts down. new TCPReceiverThread().start(); // Start the Nano HTTP server thread water.api.RequestServer.start(); } /** Initializes a watchdog thread to make sure the API IP:Port is reachable. * * The IP and port are meant to be accessible from outside this * host, so at a minimum they must be reachable from inside it. The real reason behind this check is the * one-node cloud case where people move their laptop around and * DHCP assigns them a new IP address. */ private static void startApiIpPortWatchdog() { apiIpPortWatchdog = new ApiIpPortWatchdogThread(); apiIpPortWatchdog.start(); } private static void startMemoryWatchdog() { new MemoryWatchdogThread().start(); } private static void startGAStartupReport() { new GAStartupReportThread().start(); } // Used to update the Throwable detailMessage field. private static java.lang.reflect.Field DETAILMESSAGE; public static <T extends Throwable> T setDetailMessage( T t, String s ) { try { if( DETAILMESSAGE != null ) DETAILMESSAGE.set(t,s); } catch( IllegalAccessException iae) {} return t; } /** Finalizes the node startup. * * Displays the startup message and runs the tests (if applicable). */ private static void startupFinalize() { // Allow Throwable detailMessage's to be updated on the fly. Ugly, ugly, // but I want to add info without rethrowing/rebuilding whole exceptions. try { DETAILMESSAGE = Throwable.class.getDeclaredField("detailMessage"); DETAILMESSAGE.setAccessible(true); } catch( NoSuchFieldException nsfe ) { } // Sleep a bit so all my other threads can 'catch up' try { Thread.sleep(100); } catch( InterruptedException e ) { } } public static DatagramChannel _udpSocket; public static ServerSocket _apiSocket; // Parse arguments and set cloud name in any case. Strip out "-name NAME" // and "-flatfile <filename>". Ignore the rest. Set multi-cast port as a hash // function of the name. Parse node ip addresses from the filename. static void initializeNetworkSockets( ) { // Assign initial ports API_PORT = OPT_ARGS.port != 0 ? OPT_ARGS.port : DEFAULT_PORT; while (true) { H2O_PORT = API_PORT+1; if( API_PORT<0 || API_PORT>65534 ) // 65535 is max, implied for udp port Log.die("Attempting to use an illegal system port, either "+API_PORT+" or "+ H2O_PORT); try { // kbn. seems like we need to set SO_REUSEADDR before binding? // http://www.javadocexamples.com/java/net/java.net.ServerSocket.html#setReuseAddress:boolean // When a TCP connection is closed the connection may remain in a timeout state // for a period of time after the connection is closed (typically known as the // TIME_WAIT state or 2MSL wait state). For applications using a well known socket address // or port it may not be possible to bind a socket to the required SocketAddress // if there is a connection in the timeout state involving the socket address or port. // Enabling SO_REUSEADDR prior to binding the socket using bind(SocketAddress) // allows the socket to be bound even though a previous connection is in a timeout state. // cnc: this is busted on windows. Back to the old code. // If the user specified the -ip flag, honor it for the Web UI interface bind. // Otherwise bind to all interfaces. _apiSocket = OPT_ARGS.ip == null ? 
new ServerSocket(API_PORT) : new ServerSocket(API_PORT, -1/*defaultBacklog*/, SELF_ADDRESS); _apiSocket.setReuseAddress(true); // Bind to the UDP socket _udpSocket = DatagramChannel.open(); _udpSocket.socket().setReuseAddress(true); InetSocketAddress isa = new InetSocketAddress(H2O.SELF_ADDRESS, H2O_PORT); _udpSocket.socket().bind(isa); // Bind to the TCP socket also TCPReceiverThread.SOCK = ServerSocketChannel.open(); TCPReceiverThread.SOCK.socket().setReceiveBufferSize(water.AutoBuffer.TCP_BUF_SIZ); TCPReceiverThread.SOCK.socket().bind(isa); break; } catch (IOException e) { try { if( _apiSocket != null ) _apiSocket.close(); } catch( IOException ohwell ) { Log.err(ohwell); } Utils.close(_udpSocket); if( TCPReceiverThread.SOCK != null ) try { TCPReceiverThread.SOCK.close(); } catch( IOException ie ) { } _apiSocket = null; _udpSocket = null; TCPReceiverThread.SOCK = null; if( OPT_ARGS.port != 0 ) Log.die("On " + SELF_ADDRESS + " some of the required ports " + (OPT_ARGS.port+0) + ", " + (OPT_ARGS.port+1) + " are not available, change -port PORT and try again."); } API_PORT += 2; } SELF = H2ONode.self(SELF_ADDRESS); Log.info("Internal communication uses port: ", H2O_PORT,"\nListening for HTTP and REST traffic on http://",SELF_ADDRESS.getHostAddress(),":"+_apiSocket.getLocalPort()+"/"); String embeddedConfigFlatfile = null; AbstractEmbeddedH2OConfig ec = getEmbeddedH2OConfig(); if (ec != null) { ec.notifyAboutEmbeddedWebServerIpPort(SELF_ADDRESS, API_PORT); if (ec.providesFlatfile()) { try { embeddedConfigFlatfile = ec.fetchFlatfile(); } catch (Exception e) { Log.err("Failed to get embedded config flatfile"); Log.err(e); H2O.exit(1); } } } NAME = OPT_ARGS.name==null? System.getProperty("user.name") : OPT_ARGS.name; // Read a flatfile of allowed nodes if (embeddedConfigFlatfile != null) { STATIC_H2OS = parseFlatFileFromString(embeddedConfigFlatfile); } else { STATIC_H2OS = parseFlatFile(OPT_ARGS.flatfile); } // Multi-cast group addresses are in the range E1.00.00.00 to EF.FF.FF.FF int hash = NAME.hashCode()&0x7fffffff; int port = (hash % (0xF0000000-0xE1000000))+0xE1000000; byte[] ip = new byte[4]; for( int i=0; i<4; i++ ) ip[i] = (byte)(port>>>((3-i)<<3)); try { CLOUD_MULTICAST_GROUP = InetAddress.getByAddress(ip); } catch( UnknownHostException e ) { throw Log.errRTExcept(e); } CLOUD_MULTICAST_PORT = (port>>>16); } // Multicast send-and-close. Very similar to udp_send, except to the // multicast port (or all the individuals we can find, if multicast is // disabled). 
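// (Worked example of the name-derived multicast target above, illustrative
// only: a cloud name whose hash maps to port = 0xE1234567 yields the group
// address 225.35.69.103, i.e. bytes E1.23.45.67, and CLOUD_MULTICAST_PORT =
// 0xE123 = 57635, the top 16 bits; distinct cloud names therefore discover
// on distinct group/port pairs.)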
static void multicast( ByteBuffer bb ) { try { multicast2(bb); } catch (Exception xe) {} } static private void multicast2( ByteBuffer bb ) { if( H2O.STATIC_H2OS == null ) { byte[] buf = new byte[bb.remaining()]; bb.get(buf); synchronized( H2O.class ) { // Sync'd so single-thread socket create/destroy assert H2O.CLOUD_MULTICAST_IF != null; try { if( CLOUD_MULTICAST_SOCKET == null ) { CLOUD_MULTICAST_SOCKET = new MulticastSocket(); // Allow multicast traffic to go across subnets CLOUD_MULTICAST_SOCKET.setTimeToLive(2); CLOUD_MULTICAST_SOCKET.setNetworkInterface(H2O.CLOUD_MULTICAST_IF); } // Make and send a packet from the buffer CLOUD_MULTICAST_SOCKET.send(new DatagramPacket(buf, buf.length, CLOUD_MULTICAST_GROUP,CLOUD_MULTICAST_PORT)); } catch( Exception e ) { // On any error from anybody, close all sockets & re-open // (and log it, if this is not a soft launch / hibernate mode) if(H2O.OPT_ARGS.soft == null) Log.err("Multicast Error ",e); if( CLOUD_MULTICAST_SOCKET != null ) try { CLOUD_MULTICAST_SOCKET.close(); } catch( Exception e2 ) { Log.err("Got",e2); } finally { CLOUD_MULTICAST_SOCKET = null; } } } } else { // Multicast Simulation // The multicast simulation is a little bit tricky. To achieve the union of all // specified nodes' flatfiles (via option -flatfile), the simulated // multicast has to send packets not only to nodes listed in the node's // flatfile (H2O.STATIC_H2OS), but also to all cloud members (they do not // need to be specified in THIS node's flatfile but can be part of cloud // due to another node's flatfile). // // Furthermore, the packets also have to be sent to Paxos-proposed members // to achieve correct functionality of Paxos. A typical situation is when // this node receives a Paxos heartbeat packet from a node which is not // listed in the node's flatfile -- it means that this node is listed in // another node's flatfile (and wants to create a cloud). Hence, to // allow cloud creation, this node has to reply. // // Typical example is: // node A: flatfile (B) // node B: flatfile (C), i.e., A -> (B), B-> (C), C -> (A) // node C: flatfile (A) // Cloud configuration: (A, B, C) // // Hideous O(n) algorithm for broadcast - avoid the memory allocation in // this method (since it is heavily used) HashSet<H2ONode> nodes = (HashSet<H2ONode>)H2O.STATIC_H2OS.clone(); nodes.addAll(Paxos.PROPOSED.values()); bb.mark(); for( H2ONode h2o : nodes ) { bb.reset(); try { H2O.CLOUD_DGRAM.send(bb, h2o._key); } catch( IOException e ) { Log.warn("Multicast Error to "+h2o+": "+e); } } } } /** * Read a set of Nodes from a file. 
Format is: * * name/ip_address:port * - name is unused and optional * - port is optional * - leading '#' indicates a comment * * For example: * * 10.10.65.105:54322 * # disabled for testing * # 10.10.65.106 * /10.10.65.107 * # run two nodes on 108 * 10.10.65.108:54322 * 10.10.65.108:54325 */ private static HashSet<H2ONode> parseFlatFile( String fname ) { if( fname == null ) return null; File f = new File(fname); if( !f.exists() ) { Log.warn("-flatfile specified but not found: " + fname); return null; // No flat file } HashSet<H2ONode> h2os = new HashSet<H2ONode>(); List<FlatFileEntry> list = parseFlatFile(f); for(FlatFileEntry entry : list) h2os.add(H2ONode.intern(entry.inet, entry.port+1));// use the UDP port here return h2os; } public static HashSet<H2ONode> parseFlatFileFromString( String s ) { HashSet<H2ONode> h2os = new HashSet<H2ONode>(); InputStream is = new ByteArrayInputStream(s.getBytes()); List<FlatFileEntry> list = parseFlatFile(is); for(FlatFileEntry entry : list) h2os.add(H2ONode.intern(entry.inet, entry.port+1));// use the UDP port here return h2os; } public static class FlatFileEntry { public InetAddress inet; public int port; } public static List<FlatFileEntry> parseFlatFile( File f ) { InputStream is = null; try { is = new FileInputStream(f); } catch (Exception e) { Log.die(e.toString()); } return parseFlatFile(is); } public static List<FlatFileEntry> parseFlatFile( InputStream is ) { List<FlatFileEntry> list = new ArrayList<FlatFileEntry>(); BufferedReader br = null; int port = DEFAULT_PORT; try { br = new BufferedReader(new InputStreamReader(is)); String strLine = null; while( (strLine = br.readLine()) != null) { strLine = strLine.trim(); // be user friendly and skip comments and empty lines if (strLine.startsWith("#") || strLine.isEmpty()) continue; String ip = null, portStr = null; int slashIdx = strLine.indexOf('/'); int colonIdx = strLine.indexOf(':'); if( slashIdx == -1 && colonIdx == -1 ) { ip = strLine; } else if( slashIdx == -1 ) { ip = strLine.substring(0, colonIdx); portStr = strLine.substring(colonIdx+1); } else if( colonIdx == -1 ) { ip = strLine.substring(slashIdx+1); } else if( slashIdx > colonIdx ) { Log.die("Invalid format, must be name/ip[:port], not '"+strLine+"'"); } else { ip = strLine.substring(slashIdx+1, colonIdx); portStr = strLine.substring(colonIdx+1); } InetAddress inet = InetAddress.getByName(ip); if( !(inet instanceof Inet4Address) ) Log.die("Only IP4 addresses allowed: given " + ip); if( portStr!=null && !portStr.equals("") ) { try { port = Integer.decode(portStr); } catch( NumberFormatException nfe ) { Log.die("Invalid port #: "+portStr); } } FlatFileEntry entry = new FlatFileEntry(); entry.inet = inet; entry.port = port; list.add(entry); } } catch( Exception e ) { Log.die(e.toString()); } finally { Utils.close(br); } return list; } static void initializePersistence() { Log.POST(3001); HdfsLoader.loadJars(); Log.POST(3002); if( OPT_ARGS.aws_credentials != null ) { try { Log.POST(3003); PersistS3.getClient(); Log.POST(3004); } catch( IllegalArgumentException e ) { Log.POST(3005); Log.err(e); } } Log.POST(3006); Persist.initialize(); Log.POST(3007); } static void initializeLicenseManager() { licenseManager = new LicenseManager(); if (OPT_ARGS.license != null) { LicenseManager.Result r = licenseManager.readLicenseFile(OPT_ARGS.license); if (r == LicenseManager.Result.OK) { Log.info("Successfully read license file ("+ OPT_ARGS.license + ")"); licenseManager.logLicensedFeatures(); } else { Log.err("readLicenseFile failed (" + r + ")"); } } } // 
Cleaner --------------------------------------------------------------- // msec time at which the STORE was dirtied. // Long.MAX_VALUE if clean. static private volatile long _dirty; // When was store dirtied static void dirty_store() { dirty_store(System.currentTimeMillis()); } static void dirty_store( long x ) { // Keep earliest dirty time seen if( x < _dirty ) _dirty = x; } public abstract static class KVFilter { public abstract boolean filter(KeyInfo k); } public static final class KeyInfo extends Iced implements Comparable<KeyInfo>{ public final Key _key; public final int _type; public final boolean _rawData; public final int _sz; public final int _ncols; public final long _nrows; public final byte _backEnd; public KeyInfo(Key k, Value v){ assert k!=null : "Key should not be null!"; assert v!=null : "Value should not be null!"; _key = k; _type = v.type(); _rawData = v.isRawData(); if(v.isFrame()){ Frame f = v.get(); // NOTE: can't get byteSize here as it may invoke RollupStats! :( // _sz = f.byteSize(); _sz = v._max; // do at least nrows/ncols instead _ncols = f.numCols(); _nrows = f.numRows(); } else { _sz = v._max; _ncols = 0; _nrows = 0; } _backEnd = v.backend(); } @Override public int compareTo(KeyInfo ki){ return _key.compareTo(ki._key);} public boolean isFrame(){ return _type == TypeMap.onIce(Frame.class.getName()); } public boolean isLockable(){ return TypeMap.newInstance(_type) instanceof Lockable; } } public static class KeySnapshot extends Iced { private static volatile long _lastUpdate; private static final long _updateInterval = 1000; private static volatile KeySnapshot _cache; public final KeyInfo [] _keyInfos; public long lastUpdated(){return _lastUpdate;} public KeySnapshot cache(){return _cache;} public KeySnapshot filter(KVFilter kvf){ ArrayList<KeyInfo> res = new ArrayList<KeyInfo>(); for(KeyInfo kinfo: _keyInfos) if(kvf.filter(kinfo))res.add(kinfo); return new KeySnapshot(res.toArray(new KeyInfo[res.size()])); } KeySnapshot(KeyInfo [] snapshot){ _keyInfos = snapshot;} public Key [] keys(){ Key [] res = new Key[_keyInfos.length]; for(int i = 0; i < _keyInfos.length; ++i) res[i] = _keyInfos[i]._key; return res; } public <T extends Iced> Map<String, T> fetchAll(Class<T> c) { return fetchAll(c,false,0,Integer.MAX_VALUE);} public <T extends Iced> Map<String, T> fetchAll(Class<T> c, boolean exact) { return fetchAll(c,exact,0,Integer.MAX_VALUE);} public <T extends Iced> Map<String, T> fetchAll(Class<T> c, boolean exact, int offset, int limit) { TreeMap<String, T> res = new TreeMap<String, T>(); final int typeId = TypeMap.onIce(c.getName()); for (KeyInfo kinfo : _keyInfos) { if (kinfo._type == typeId || (!exact && c.isAssignableFrom(TypeMap.clazz(kinfo._type)))) { if (offset > 0) { --offset; continue; } Value v = DKV.get(kinfo._key); if (v != null) { T t = v.get(); res.put(kinfo._key.toString(), t); if (res.size() == limit) break; } } } return res; } public static KeySnapshot localSnapshot(){return localSnapshot(false);} public static KeySnapshot localSnapshot(boolean homeOnly){ Object [] kvs = STORE.raw_array(); ArrayList<KeyInfo> res = new ArrayList<KeyInfo>(); for(int i = 2; i < kvs.length; i+= 2){ Object ok = kvs[i], ov = kvs[i+1]; if( !(ok instanceof Key ) || ov==null ) continue; // Ignore tombstones or deleted values Key key = (Key) ok; if(!key.user_allowed())continue; if(homeOnly && !key.home())continue; // The raw array can contain regular values and also values wrapped in the Prime marker class: // - if we see a Value object, create an instance of KeyInfo // - if we do not see a Value 
object, try to unwrap it via calling STORE.get and then // look at the wrapped value again. if (!(ov instanceof Value)) { ov = H2O.get(key); // H2O.get already returns a Value object, or null if (ov==null) continue; } res.add(new KeyInfo(key,(Value)ov)); } final KeyInfo [] arr = res.toArray(new KeyInfo[res.size()]); Arrays.sort(arr); return new KeySnapshot(arr); } public static KeySnapshot globalSnapshot(){ return globalSnapshot(-1);} public static KeySnapshot globalSnapshot(long timeTolerance){ KeySnapshot res = _cache; final long t = System.currentTimeMillis(); if(res == null || (t - _lastUpdate) > timeTolerance) res = new KeySnapshot(new GlobalUKeySetTask().invokeOnAllNodes()._res); else if(t - _lastUpdate > _updateInterval) H2O.submitTask(new H2OCountedCompleter() { @Override public void compute2() { new GlobalUKeySetTask().invokeOnAllNodes(); } }); return res; } private static class GlobalUKeySetTask extends DRemoteTask<GlobalUKeySetTask> { KeyInfo [] _res; @Override public byte priority(){return H2O.GET_KEY_PRIORITY;} @Override public void lcompute(){ _res = localSnapshot(true)._keyInfos; tryComplete(); } @Override public void reduce(GlobalUKeySetTask gbt){ if(_res == null)_res = gbt._res; else if(gbt._res != null){ // merge sort keys together KeyInfo [] res = new KeyInfo[_res.length + gbt._res.length]; int j = 0, k = 0; for(int i = 0; i < res.length; ++i) res[i] = j < gbt._res.length && (k == _res.length || gbt._res[j].compareTo(_res[k]) < 0)?gbt._res[j++]:_res[k++]; _res = res; } } @Override public void postGlobal(){ _cache = new KeySnapshot(_res); _lastUpdate = System.currentTimeMillis(); } } } // Periodically write user keys to disk public static class Cleaner extends Thread { // Desired cache level. Set by the MemoryManager asynchronously. static public volatile long DESIRED; // Histogram used by the Cleaner private final Histo _myHisto; boolean _diskFull = false; public Cleaner() { super("MemCleaner"); setDaemon(true); setPriority(MAX_PRIORITY-2); _dirty = Long.MAX_VALUE; // Set to clean-store _myHisto = new Histo(); // Build/allocate a first histogram _myHisto.compute(0); // Compute lousy histogram; find eldest H = _myHisto; // Force to be the most recent _myHisto.histo(true); // Force a recompute with a good eldest MemoryManager.set_goals("init",false); } static boolean lazyPersist(){ // free disk > our DRAM? return !H2O.OPT_ARGS.no_ice && H2O.SELF._heartbeat.get_free_disk() > MemoryManager.MEM_MAX; } static boolean isDiskFull(){ // free disk space < 5K? long space = Persist.getIce().getUsableSpace(); return space != Persist.UNKNOWN && space < (5 << 10); } @Override public void run() { boolean diskFull = false; while( true ) { // Sweep the K/V store, writing out Values (cleaning) and free'ing // - Clean all "old" values (lazily, optimistically) // - Clean and free old values if above the desired cache level // Do not let optimistic cleaning get in the way of emergency cleaning. // Get a recent histogram, computing one as needed Histo h = _myHisto.histo(false); long now = System.currentTimeMillis(); long dirty = _dirty; // When things first got dirtied // Start cleaning if: "dirty" was set a "long" time ago, or we are beyond // the desired cache levels. Inverse: go back to sleep if the cache // is below desired levels & nothing has been dirty awhile. 
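// (Illustrative numbers, not from the source: with DESIRED = 1 GB, a forced
// clean spills until the cache fits under 1 GB, while a lazy clean targets
// DESIRED>>1 = 512 MB and only touches Values untouched for ~5 seconds.)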
  // Periodically write user keys to disk
  public static class Cleaner extends Thread {
    // Desired cache level. Set by the MemoryManager asynchronously.
    static public volatile long DESIRED;
    // Histogram used by the Cleaner
    private final Histo _myHisto;
    boolean _diskFull = false;

    public Cleaner() {
      super("MemCleaner");
      setDaemon(true);
      setPriority(MAX_PRIORITY-2);
      _dirty = Long.MAX_VALUE; // Set to clean-store
      _myHisto = new Histo();  // Build/allocate a first histogram
      _myHisto.compute(0);     // Compute lousy histogram; find eldest
      H = _myHisto;            // Force to be the most recent
      _myHisto.histo(true);    // Force a recompute with a good eldest
      MemoryManager.set_goals("init", false);
    }

    static boolean lazyPersist() { // free disk > our DRAM?
      return !H2O.OPT_ARGS.no_ice && H2O.SELF._heartbeat.get_free_disk() > MemoryManager.MEM_MAX;
    }
    static boolean isDiskFull() { // free disk space < 5K?
      long space = Persist.getIce().getUsableSpace();
      return space != Persist.UNKNOWN && space < (5 << 10);
    }

    @Override public void run() {
      boolean diskFull = false;
      while( true ) {
        // Sweep the K/V store, writing out Values (cleaning) and free'ing
        // - Clean all "old" values (lazily, optimistically)
        // - Clean and free old values if above the desired cache level
        // Do not let optimistic cleaning get in the way of emergency cleaning.

        // Get a recent histogram, computing one as needed
        Histo h = _myHisto.histo(false);
        long now = System.currentTimeMillis();
        long dirty = _dirty; // When things first got dirtied

        // Start cleaning if: "dirty" was set a "long" time ago, or we are
        // beyond the desired cache level.  Inverse: go back to sleep if the
        // cache is below desired levels & nothing has been dirty awhile.
        if( h._cached < DESIRED &&   // Cache is low and
            (now-dirty < 5000) ) {   // not dirty a long time
          // Block asleep, waking every 5 secs to check for stuff, or when poked
          Boot.block_store_cleaner();
          continue; // Awoke; loop back and re-check histogram.
        }

        now = System.currentTimeMillis();
        _dirty = Long.MAX_VALUE; // Reset, since we are going to write stuff out
        MemoryManager.set_goals("preclean", false);

        // The age beyond which we need to toss out things to hit the desired
        // caching levels. If forced, be exact (toss out the minimal amount).
        // If lazy, store-to-disk things down to 1/2 the desired cache level
        // and anything older than 5 secs.
        boolean force = (h._cached >= DESIRED); // Forced to clean
        if( force && diskFull )
          diskFull = isDiskFull(); // Re-check: disk space may have been freed
        long clean_to_age = h.clean_to(force ? DESIRED : (DESIRED>>1));
        // If not forced cleaning, expand the cleaning age to allow Values
        // more than 5 sec old
        if( !force ) clean_to_age = Math.max(clean_to_age, now-5000);

        // No logging if under memory pressure: can deadlock the cleaner thread
        if( Log.flag(Sys.CLEAN) ) {
          String s = h+" DESIRED="+(DESIRED>>20)+"M dirtysince="+(now-dirty)+" force="+force+" clean2age="+(now-clean_to_age);
          if( MemoryManager.canAlloc() ) Log.debug(Sys.CLEAN, s);
          else Log.unwrap(System.err, s);
        }
        long cleaned = 0;
        long freed = 0;

        // For faster K/V store walking get the NBHM raw backing array,
        // and walk it directly.
        Object[] kvs = STORE.raw_array();

        // Start the walk at slot 2, because slots 0,1 hold meta-data
        for( int i=2; i<kvs.length; i += 2 ) {
          // In the raw backing array, Keys and Values alternate in slots
          Object ok = kvs[i], ov = kvs[i+1];
          if( !(ok instanceof Key  ) ) continue; // Ignore tombstones and Primes and null's
          Key key = (Key )ok;
          if( !(ov instanceof Value) ) continue; // Ignore tombstones and Primes and null's
          Value val = (Value)ov;
          byte[] m = val.rawMem();
          Object p = val.rawPOJO();
          if( m == null && p == null ) continue; // Nothing to throw out
          if( val.isLockable() ) continue;       // We do not want to throw out Lockables
          boolean isChunk = p instanceof Chunk;

          // Ignore things younger than the required age. In particular, do
          // not spill-to-disk all dirty things we find.
          long touched = val._lastAccessedTime;
          if( touched > clean_to_age ) { // Too recently touched?
            // But can toss out a byte-array if already deserialized & on disk
            // (no need for both forms). Note no savings for Chunks, for which m==p._mem
            if( val.isPersisted() && m != null && p != null && !isChunk ) {
              val.freeMem(); // Toss serialized form, since can rebuild from POJO
              freed += val._max;
            }
            dirty_store(touched); // But may write it out later
            continue; // Too young
          }

          // Should I write this value out to disk?
          // Should I further force it from memory?
          if( !val.isPersisted() && !diskFull && (force || (lazyPersist() && lazy_clean(key))) ) {
            try {
              val.storePersist(); // Write to disk
              if( m == null ) m = val.rawMem();
              if( m != null ) cleaned += m.length;
            } catch( IOException e ) {
              if( isDiskFull() )
                Log.warn(Sys.CLEAN, "Disk full! Disabling swapping to disk." + (force ? " Memory low! Please free some space in " + Persist.getIce().getPath() + "!" : ""));
              else
                Log.warn(Sys.CLEAN, "Disk swapping failed! " + e.getMessage());
              // Something is wrong, so mark the disk as full anyway so we do not
              // attempt to write again (will retry next run when memory is low).
              diskFull = true;
            }
          }
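          // Note the ordering here: values are first spilled to disk via
          // storePersist above, and only freed from memory below, so a forced
          // clean never drops the only copy of a value.  A rough sketch of the
          // per-value decision, with an illustrative DESIRED of 1G:
          //   cached >= 1G        -> force: spill, then free down to clean_to_age
          //   cached <  1G (lazy) -> spill only user chunks older than ~5 sec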
          // And, under pressure, free all
          if( force && val.isPersisted() ) {
            val.freeMem ();  if( m != null ) freed += val._max;  m = null;
            val.freePOJO();  if( p != null ) freed += val._max;  p = null;
            if( isChunk ) freed -= val._max; // Don't double-count freed mem for Chunks, since val._pojo._mem & val._mem are the same
          }
          // If we have both forms, toss the byte[] form - it can be had by
          // serializing again.
          if( m != null && p != null && !isChunk ) {
            val.freeMem();
            freed += val._max;
          }
        }

        h = _myHisto.histo(true); // Force a new histogram
        MemoryManager.set_goals("postclean", false);
        // No logging if under memory pressure: can deadlock the cleaner thread
        if( Log.flag(Sys.CLEAN) ) {
          String s = h+" cleaned="+(cleaned>>20)+"M, freed="+(freed>>20)+"M, DESIRED="+(DESIRED>>20)+"M";
          if( MemoryManager.canAlloc() ) Log.debug(Sys.CLEAN, s);
          else Log.unwrap(System.err, s);
        }
      }
    }

    // Rules on when to write & free a Key, when not under memory pressure.
    boolean lazy_clean( Key key ) {
      // Only data chunks are worth tossing out even lazily.
      if( !key.isChunkKey() ) // Not arraylet?
        return false;         // Not enough savings to write it without mem-pressure to force us
      // If this is a chunk of a system-defined array, then assume it has a
      // short lifetime, and we do not want to spin the disk writing it
      // unless we're under memory pressure.
      Key veckey = key.getVecKey();
      return veckey.user_allowed(); // Write user keys but not system keys
    }

    // Current best histogram
    static private volatile Histo H;

    // Histogram class
    public static class Histo {
      final long[] _hs = new long[128];
      long _oldest;   // Time of the oldest K/V discovered this pass
      long _eldest;   // Time of the eldest K/V found in some prior pass
      long _hStep;    // Histogram step: (now-eldest)/histogram.length
      long _cached;   // Total alive data in the histogram
      long _when;     // When was this histogram computed
      Value _vold;    // For assertions: record the oldest Value
      boolean _clean; // Was "clean" K/V when built?

      // Return the current best histogram
      static Histo best_histo() { return H; }
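      // A worked example with illustrative numbers: with the eldest K/V seen
      // 10,000 ms ago and 128 buckets, _hStep = max(1, 10000/128) = 78 ms, so
      // a Value last touched 1,000 ms after the eldest lands in bucket
      // 1000/78 = 12; out-of-range times are clamped into buckets 0 and 127.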
      // Return the current best histogram, recomputing in-place if it is
      // getting stale.  Synchronized, so multiple threads can call into the
      // same histogram and it will be computed only one-at-a-time.
      synchronized Histo histo( boolean force ) {
        final Histo h = H; // Grab the current best histogram
        if( !force && System.currentTimeMillis() < h._when+100 )
          return h; // It is recent; use it
        if( h._clean && _dirty==Long.MAX_VALUE )
          return h; // No change to the K/V store, so no point
        compute(h._oldest); // Use the last oldest value for computing the next histogram in-place
        return (H = this);  // Record the current best histogram & return it
      }

      // Compute a histogram
      public void compute( long eldest ) {
        Arrays.fill(_hs, 0);
        _when = System.currentTimeMillis();
        _eldest = eldest; // Eldest seen in some prior pass
        _hStep = Math.max(1, (_when-eldest)/_hs.length);
        boolean clean = _dirty==Long.MAX_VALUE;
        // Compute the hard way
        Object[] kvs = STORE.raw_array();
        long cached = 0;              // Total K/V cached in ram
        long oldest = Long.MAX_VALUE; // K/V with the longest time since being touched
        Value vold = null;
        // Start the walk at slot 2, because slots 0,1 hold meta-data
        for( int i=2; i<kvs.length; i += 2 ) {
          // In the raw backing array, Keys and Values alternate in slots
          Object ok = kvs[i+0], ov = kvs[i+1];
          if( !(ok instanceof Key  ) ) continue; // Ignore tombstones and Primes and null's
          if( !(ov instanceof Value) ) continue; // Ignore tombstones and Primes and null's
          Value val = (Value)ov;
          int len = 0;
          byte[] m = val.rawMem();
          Object p = val.rawPOJO();
          if( m != null ) len += val._max;
          if( p != null ) len += val._max;
          if( p instanceof Chunk ) len -= val._max; // Do not double-count Chunks
          if( len == 0 ) continue;
          cached += len; // Accumulate total amount of cached keys
          if( val._lastAccessedTime < oldest ) { // Found an older Value?
            vold = val; // Record the oldest Value seen
            oldest = val._lastAccessedTime;
          }
          // Compute histogram bucket
          int idx = (int)((val._lastAccessedTime - eldest)/_hStep);
          if( idx < 0 ) idx = 0;
          else if( idx >= _hs.length ) idx = _hs.length-1;
          _hs[idx] += len; // Bump histogram bucket
        }
        _cached = cached; // Total cached; NOTE: larger than the sum of histogram buckets
        _oldest = oldest; // Oldest seen in this pass
        _vold = vold;
        _clean = clean && _dirty==Long.MAX_VALUE; // Looks like a clean K/V the whole time?
      }

      // Compute the age (in msec) beyond which we need to throw out enough
      // things to hit the desired cached-memory level.
      long clean_to( long desired ) {
        long age = _eldest; // Age of bucket zero
        if( _cached < desired ) return age; // Already there; nothing to remove
        long s = 0;           // Total amount tossed out
        for( long t : _hs ) { // For all buckets...
          s += t;             // Raise amount tossed out
          age += _hStep;      // Raise age beyond which you need to go
          if( _cached - s < desired ) break;
        }
        return age;
      }
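      // A worked example with illustrative numbers: DESIRED = 1G, _cached = 4G,
      // and buckets holding roughly 32M each.  clean_to(1G) walks buckets until
      // 4G - s < 1G, i.e. s > 3G, which takes about 96 buckets; the returned
      // age is then _eldest + 96*_hStep, and everything last touched before
      // that age becomes eligible for spill/free.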
      // Pretty print
      @Override public String toString() {
        long x = _eldest;
        long now = System.currentTimeMillis();
        return "H("+(_cached>>20)+"M, "+x+"ms < +"+(_oldest-x)+"ms <...{"+_hStep+"ms}...< +"+(_hStep*128)+"ms < +"+(now-x)+")";
      }
    }
  }

  // API IP Port Watchdog ---------------------------------------------------
  // Monitor the API IP:Port for availability.
  //
  // This thread is only a watchdog. You can comment it out so it does not
  // run, without affecting any service functionality.
  public static class ApiIpPortWatchdogThread extends Thread {
    final private String threadName = "ApiPortWatchdog";

    private volatile boolean gracefulShutdownInitiated; // Thread-safe.

    // Failure-tracking.
    private int consecutiveFailures;
    private long failureStartTimestampMillis;

    // Timing things that can be tuned if needed.
    final private int maxFailureSeconds = 180;
    final private int maxConsecutiveFailures = 20;
    final private int checkIntervalSeconds = 10;
    final private int timeoutSeconds = 30;
    final private int millisPerSecond = 1000;
    final private int timeoutMillis = timeoutSeconds * millisPerSecond;
    final private int sleepMillis = checkIntervalSeconds * millisPerSecond;

    // Constructor.
    public ApiIpPortWatchdogThread() {
      super("ApiWatch"); // Only 9 characters get printed in the log.
      setDaemon(true);
      setPriority(MAX_PRIORITY-2);
      reset();
      gracefulShutdownInitiated = false;
    }

    // Exit this watchdog thread.
    public void shutdown() { gracefulShutdownInitiated = true; }

    // Sleep method.
    private void mySleep(int millis) {
      try { Thread.sleep(millis); } catch (Exception xe) {}
    }

    // Print some help for the user if a failure occurs.
    private void printPossibleCauses() {
      Log.info(threadName + ": A possible cause is DHCP (e.g. changing WiFi networks)");
      Log.info(threadName + ": A possible cause is your laptop going to sleep (if running on a laptop)");
      Log.info(threadName + ": A possible cause is the network interface going down");
      Log.info(threadName + ": A possible cause is this host being overloaded");
    }

    // Reset the failure counting when a successful check() occurs.
    private void reset() {
      consecutiveFailures = 0;
      failureStartTimestampMillis = 0;
    }

    // Count the impact of one failure.
    private void failed() {
      printPossibleCauses();
      if (consecutiveFailures == 0) {
        failureStartTimestampMillis = System.currentTimeMillis();
      }
      consecutiveFailures++;
    }

    // Check if enough failures have occurred or enough time has passed to
    // shut down this node.
    private void testForFailureShutdown() {
      if (consecutiveFailures >= maxConsecutiveFailures) {
        Log.err(threadName + ": Too many failures (>= " + maxConsecutiveFailures + "), H2O node shutting down");
        H2O.exit(1);
      }
      if (consecutiveFailures > 0) {
        final long now = System.currentTimeMillis();
        final long deltaMillis = now - failureStartTimestampMillis;
        final long thresholdMillis = (maxFailureSeconds * millisPerSecond);
        if (deltaMillis > thresholdMillis) {
          Log.err(threadName + ": Failure time threshold exceeded (>= " + thresholdMillis + " ms), H2O node shutting down");
          H2O.exit(1);
        }
      }
    }

    // Do the watchdog check.
    private void check() {
      final Socket s = new Socket();
      final InetSocketAddress apiIpPort = new InetSocketAddress(H2O.SELF_ADDRESS, H2O.API_PORT);
      Exception e = null;
      String msg = null;
      try {
        s.connect(apiIpPort, timeoutMillis);
        reset();
      }
      catch (SocketTimeoutException se) { e = se;  msg = ": Timed out"; }
      catch (IOException ioe)           { e = ioe; msg = ": Failed"; }
      catch (Exception ee)              { e = ee;  msg = ": Failed unexpectedly"; }
      finally {
        if (gracefulShutdownInitiated) { return; }
        if (e != null) {
          Log.err(threadName + msg + " trying to connect to REST API IP and Port (" +
                  H2O.SELF_ADDRESS + ":" + H2O.API_PORT + ", " + timeoutMillis + " ms)");
          failed(); // BUGFIX: was fail(), which built an unused H2O.fail exception and never counted the failure
        }
        testForFailureShutdown();
        try { s.close(); } catch (Exception xe) {}
      }
    }

    // Class main thread.
    @Override public void run() {
      Log.debug(threadName + ": Thread run() started");
      reset();
      while (true) {
        mySleep(sleepMillis);
        if (gracefulShutdownInitiated) { break; }
        check();
        if (gracefulShutdownInitiated) { break; }
      }
    }
  }
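  // With the defaults above, the watchdog checks every 10 s with a 30 s
  // connect timeout, so a node shuts itself down after 20 consecutive
  // failures, or once the first unresolved failure is more than 180 s old,
  // whichever comes first.  A single successful connect resets both counters.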
  /**
   * Log physical (RSS) memory usage periodically.
   * Used by developers to look for memory leaks.
   * Currently this only works on Linux.
   */
  private static class MemoryWatchdogThread extends Thread {
    final private String threadName = "MemoryWatchdog";

    private volatile boolean gracefulShutdownInitiated; // Thread-safe.

    // Timing things that can be tuned if needed.
    final private int checkIntervalSeconds = 5;
    final private int millisPerSecond = 1000;
    final private int sleepMillis = checkIntervalSeconds * millisPerSecond;

    // Constructor.
    public MemoryWatchdogThread() {
      super("MemWatch"); // Only 9 characters get printed in the log.
      setDaemon(true);
      setPriority(MAX_PRIORITY - 2);
      gracefulShutdownInitiated = false;
    }

    // Exit this watchdog thread.
    public void shutdown() { gracefulShutdownInitiated = true; }

    // Sleep method.
    private void mySleep(int millis) {
      try { Thread.sleep(millis); } catch (Exception xe) {}
    }

    // Do the watchdog check.
    private void check() {
      water.util.LinuxProcFileReader r = new LinuxProcFileReader();
      r.read();
      long rss = -1;
      try { rss = r.getProcessRss(); } catch (AssertionError xe) {}
      Log.info("RSS: " + rss);
    }

    // Class main thread.
    @Override public void run() {
      Log.debug(threadName + ": Thread run() started");
      while (true) {
        mySleep(sleepMillis);
        if (gracefulShutdownInitiated) { break; }
        check();
        if (gracefulShutdownInitiated) { break; }
      }
    }
  }

  public static class GAStartupReportThread extends Thread {
    final private String threadName = "GAStartupReport";
    final private int sleepMillis = 150 * 1000; // 2.5 min

    // Constructor.
    public GAStartupReportThread() {
      super("GAStartupReport"); // Only 9 characters get printed in the log.
      setDaemon(true);
      setPriority(MAX_PRIORITY - 2);
    }

    // Class main thread.
    @Override public void run() {
      try { Thread.sleep(sleepMillis); } catch (Exception ignore) {}
      // Only the first node in the sorted cloud member list reports.
      if (H2O.SELF == H2O.CLOUD._memary[0]) {
        if (OPT_ARGS.ga_hadoop_ver != null)
          H2O.GA.postAsync(new EventHit("System startup info", "Hadoop version", OPT_ARGS.ga_hadoop_ver, 1));
        H2O.GA.postAsync(new EventHit("System startup info", "Cloud", "Cloud size", CLOUD.size()));
      }
    }
  }
}