/* * (C) Copyright IBM Corp. 2008 * * LICENSE: Eclipse Public License v1.0 * http://www.eclipse.org/legal/epl-v10.html */ package com.ibm.gaiandb; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.HashSet; import java.util.Hashtable; import java.util.Iterator; import java.util.Set; import java.util.Stack; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import com.ibm.gaiandb.diags.GDBMessages; /** * This class runs a set of threads to check the health of JDBC connections against peer GaianDB nodes and to take corrective action when * required - in particular when a connection is hanging. * * The outer class and its 2 inner classes are all Runnable. * The outer class implements a perpetual watch-dog thread (not to be confused with the GaianDB main watch-dog) which manages the other inner-class threads. * The inner class threads are used to test individual JDBC connections: * - ConnectionMaintainer inner class: Uses a short-lived SQL procedure on a JDBC connection established against a discovered peer GaianDB node - * to establish/maintain a "two-way connection" with it. * - ConnectionTester inner class: Uses a simple SQL query "values 1" to poll a peer GaianDB node against which an in-progress query is suspected * to be hanging. The poll is run on a separate JDBC connection. * * NOTE: As said above - ConnectionTester only currently tests long-running queries against peer GaianDB nodes. * Faulty leaf JDBC source are only currently semi-handled by DataSourcesManager.cleanAndPreloadDataSources()/isValidAndActiveSourceHandle(). - i.e. hanging ones are not. * * The outer class periodically wakes up - based on configurable parameter: GAIAN_CONNECTIONS_CHECKER_HEARTBEAT_MS - and then checks the status for each * inner class threads it kicked off. * If an inner class thread fails to respond successfully within the GAIAN_CONNECTIONS_CHECKER_HEARTBEAT_MS delay, then the Gaian connection is * dropped - i.e. All JDBC connections to the peer node are closed and associated GaianDB data-sources are removed. * See VTIRDBResult.lostConnection() -> GaianNodeSeeker.lostDiscoveredConnection() -> DataSourcesManager.unloadAllDataSourcesAndClearConnectionPoolForGaianConnection(). * * The outer class thread only dies when GaianNode.isRunning() is no longer true. * * ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- * Change history comments: * * 12/04/2016 - Removed use of redundant "initialConnectionTimeout" in ConnectionMaintainer inner class. * User property GAIAN_CONNECTIONS_CHECKER_HEARTBEAT_MS should be tuned to be sufficiently large to account for initial connection establishment. * * TODO: * ??/??/???? - Add use of java thread pools - rather than starting a new thread for each connection maintenance/testing attempt. * ??/??/???? - Re-factor monitoring code for outcome of ConnectionTester into one-off thread runs of the outer class - to ensure that hanging queries are rooted out * as fast as possible after GAIAN_CONNECTIONS_CHECKER_HEARTBEAT_MS has elapsed - rather than waiting for a previous iteration to complete. * * ... * * @author drvyvyan */ public class DatabaseConnectionsChecker implements Runnable { // private final static String connectionMaintainerFunction = "maintainConnection"; // public final static String INIT_SQL = // "!DROP FUNCTION " + connectionMaintainerFunction + ";CREATE FUNCTION " + connectionMaintainerFunction + // " (nodeid VARCHAR(100), usr VARCHAR(10), pwd VARCHAR(10)) RETURNS VARCHAR(500)" + // " PARAMETER STYLE JAVA LANGUAGE JAVA NO SQL EXTERNAL NAME 'com.ibm.gaiandb.GaianDBConfigProcedures." + // connectionMaintainerFunction + "'"; private final static String connectionMaintainerFunction = "maintainConnection2"; public final static String INIT_SQL = "!DROP FUNCTION " + connectionMaintainerFunction + ";CREATE FUNCTION " + connectionMaintainerFunction + " (nodeid VARCHAR(100), usr VARCHAR(10), pwd VARCHAR(10), extraInfo "+ Util.XSTR+") RETURNS "+ Util.XSTR + " PARAMETER STYLE JAVA LANGUAGE JAVA NO SQL EXTERNAL NAME 'com.ibm.gaiandb.GaianDBConfigProcedures." + connectionMaintainerFunction + "'"; public final static String SUCCESS_TAG = "SUCCESS:"; public final static String DISTANCE2SERVER_TAG = "D2S:"; public final static String SSLMODE_TAG = "SSL:"; private static int distanceToServerNode = GaianNode.isLite() ? -1 : 0; public static int getDistanceToServerNode() { return distanceToServerNode; } private static String bestPathToServer = null; public static String getBestPathToServer() { return bestPathToServer; } // Use PROPRIETARY notice if class contains a main() method, otherwise use COPYRIGHT notice. public static final String COPYRIGHT_NOTICE = "(c) Copyright IBM Corp. 2008"; private static final Logger logger = new Logger( "DatabaseConnectionsChecker", 35 ); private static final String TESTER_NAME = "ConnectionTester"; private static final String MAINTAINER_NAME = "ConnectionMaintainer"; private static final DatabaseConnectionsChecker dcc = new DatabaseConnectionsChecker(); private static Thread connectionsCheckerThread = null; // Set of all sets of executing vtis private static final Set<Set<VTIWrapper>> executingDataSourceSets = new HashSet<Set<VTIWrapper>>(); // Set of all vtis currently being checked private static final Set<VTIWrapper> dataSourcesBeingChecked = new HashSet<VTIWrapper>(); // Mapping of Set executingDataSources -> LinkedBlockingQueue nodeResults // private static ConcurrentMap<Set<VTIWrapper>, Queue<DataValueDescriptor[][]>> nodeResultsForExecutingDataSources = new ConcurrentHashMap<Set<VTIWrapper>, Queue<DataValueDescriptor[][]>>(); private static final ConcurrentMap<Set<VTIWrapper>, GaianResult> gaianResultsForExecutingDataSources = new ConcurrentHashMap<Set<VTIWrapper>, GaianResult>(); static int getNumberOfSuspectedHangingQueriesBeingChecked() { return gaianResultsForExecutingDataSources.size(); } // Set of String connection ids private static final Set<String> maintainedConnections = new HashSet<String>(); // VTIWrapper -> ConnectionTester ; String (gaian outgoing connection id) -> ConnectionMaintainer private static final Hashtable<VTIWrapper, ConnectionTester> testers = new Hashtable<VTIWrapper, ConnectionTester>(); private static final Hashtable<String, ConnectionMaintainer> maintainers = new Hashtable<String, ConnectionMaintainer>(); private static int connectionsCheckerHeartbeat = GaianDBConfig.getConnectionsCheckerHeartbeat(); public static void maintainTwoWayConnection( String connID ) { synchronized ( maintainedConnections ) { maintainedConnections.add( connID ); dcc.kickOffMaintainerThread( connID ); } } // public static void rootOutHangingDataSources( Set<VTIWrapper> executingDataSources, Queue<DataValueDescriptor[][]> resultRowsBuffer ) { public static void rootOutHangingDataSources( Set<VTIWrapper> executingDataSources, GaianResult gResult ) { boolean isDataSourceSetContainsAGaianNode = false; for ( Iterator<VTIWrapper> it = executingDataSources.iterator(); it.hasNext() && false == isDataSourceSetContainsAGaianNode; ) isDataSourceSetContainsAGaianNode = it.next().isGaianNode(); if ( isDataSourceSetContainsAGaianNode ) { synchronized ( dataSourcesBeingChecked ) { dataSourcesBeingChecked.addAll( executingDataSources ); executingDataSourceSets.add( executingDataSources ); } gaianResultsForExecutingDataSources.put( executingDataSources, gResult ); } } // Low value feature to skip maintenance for certain connections // private static ConcurrentSkipListSet<String> connectionsExcludedFromNextMaintenanceCycle = // new ConcurrentSkipListSet<String>(); // public static void excludeConnectionFromNextMaintenanceCycle( String connID ) { // System.out.println("Adding connection to maintenance exclusion set: " + connID); // connectionsExcludedFromNextMaintenanceCycle.add(connID); // } static void checkConnectionsInBackground() { if ( null == connectionsCheckerThread ) connectionsCheckerThread = new Thread(dcc, GaianNode.THREADNAME_CONNECTIONS_CHECKER); connectionsCheckerThread.start(); } static void interruptConnectionsChecker() { if ( null != connectionsCheckerThread ) connectionsCheckerThread.interrupt(); } /** * This method should really be called within a code block that waits or sleeps * after calling it before checking whether the maintenance call succeeded. * However we also call it the first time from maintainTwoWayConnection just to get the * connection set up as quickly as possible - later calls will check for success status. * * @param gaianConnectionID */ private void kickOffMaintainerThread( String gaianConnectionID ) { ConnectionMaintainer cm = (ConnectionMaintainer) maintainers.get(gaianConnectionID); if ( null == cm ) { cm = new ConnectionMaintainer( gaianConnectionID ); maintainers.put( gaianConnectionID, cm ); } else cm.reinitialise(); // if ( !cm.isInitialConnectionAttemptInProgress() ) new Thread( cm, MAINTAINER_NAME + " for " + gaianConnectionID ).start(); } public void run() { try { Set<VTIWrapper> latestDataSourcesBeingChecked = new HashSet<VTIWrapper>(); Set<Set<VTIWrapper>> latestExecutingDataSourceSets = new HashSet<Set<VTIWrapper>>(); Set<VTIWrapper> hangingDataSources = new HashSet<VTIWrapper>(); Set<String> latestMaintainedConnections = new HashSet<String>(); int numSuspectDataSourceSetsPrevious = -1; while ( GaianNode.isRunning() ) { connectionsCheckerHeartbeat = GaianDBConfig.getConnectionsCheckerHeartbeat(); // Clear temporary sets for next iteration (should already be done... just being defensive) latestDataSourcesBeingChecked.clear(); hangingDataSources.clear(); latestExecutingDataSourceSets.clear(); latestMaintainedConnections.clear(); synchronized ( maintainedConnections ) { // Use temporary set, so seeker thread can register new maintenance connection // concurrently without affecting the code that modifies the set after this synchronized block latestMaintainedConnections.addAll(maintainedConnections); // Note we don't clear the maintained connections here // Low value feature to skip maintenance for certain connections // for ( Iterator<String> it = connectionsExcludedFromNextMaintenanceCycle.iterator(); it.hasNext() ; ) { // String alreadyQueriedConnection = it.next(); // latestMaintainedConnections.remove(alreadyQueriedConnection); // it.remove(); // } } // System.out.println("Connection maintenance set: " + Arrays.asList(latestMaintainedConnections)); for (String conn : latestMaintainedConnections) kickOffMaintainerThread( (String) conn ); synchronized( dataSourcesBeingChecked ) { // Use temporary sets, so executing threads can register their executing vtis // concurrently without affecting the code that modifies these sets after this synchronized block latestExecutingDataSourceSets.addAll( executingDataSourceSets ); executingDataSourceSets.clear(); latestDataSourcesBeingChecked.addAll( dataSourcesBeingChecked ); dataSourcesBeingChecked.clear(); } for (VTIWrapper dataSource : latestDataSourcesBeingChecked) { if ( ! ( dataSource instanceof VTIRDBResult ) ) continue; // No need to check non RDBMS data sources if ( !dataSource.isGaianNode() ) { logger.logInfo("No need to check dataSource as it is not a Gaian Node: " + dataSource); continue; } ConnectionTester ct = (ConnectionTester) testers.get(dataSource); if ( null == ct ) { ct = new ConnectionTester( (VTIRDBResult) dataSource ); testers.put( dataSource, ct ); } ct.reinitialise(); new Thread( ct, TESTER_NAME + " for " + dataSource ).start(); } // logger.logDetail( "Maintained connections: " + latestMaintainedConnections.size() + // ", DataSources being checked: " + latestDataSourcesBeingChecked.size()); try { Thread.sleep( connectionsCheckerHeartbeat ); } catch (InterruptedException e) { if ( !GaianNode.isRunning() ) break; // restart the loop if interrupted while running, with no other suspected hanging connections being tested. else if ( latestDataSourcesBeingChecked.isEmpty() && latestExecutingDataSourceSets.isEmpty() ) continue; } for (VTIWrapper dataSource : latestDataSourcesBeingChecked) { ConnectionTester ct = (ConnectionTester) testers.get(dataSource); if ( null == ct ) continue; // it will be null if we don't test this type of dataSource (e.g. if dataSource is not a VTIRDBresult) if ( false == ct.pollSucceeded() ) { logger.logInfo("Poll hanging for active jdbc connection of data source: " + dataSource); // cant close jdbc connection as it will hang aswell (as it waits for the hanging exec to clear itself up) // try { // ct.closeJDBCConnection(); // logger.logInfo("Closed JDBC connection for hanging data source " + vti); // } catch (SQLException e) { // logger.logWarning("Unable to close JDBC connection for hanging data source " + vti + ": " + e); // } hangingDataSources.add(dataSource); ((VTIRDBResult) dataSource).lostConnection(); // testers.remove( dataSource ); // not actually necessary, the hanging thread can't be harmful when it gets unstuck } } latestDataSourcesBeingChecked.clear(); // vtis all checked // logger.logInfo("All hanging data sources: " + Arrays.asList( hangingDataSources ) ); // logger.logInfo("Number of data source sets being checked for executing queries: " + latestExecutingDataSourceSets.size() ); int numSuspectDataSourceSets = gaianResultsForExecutingDataSources.size(); if ( numSuspectDataSourceSetsPrevious != numSuspectDataSourceSets || numSuspectDataSourceSets > 0 ) { logger.logInfo("New number of potentially hanging queries being checked (involving a GaianNode data source): " + numSuspectDataSourceSets ); numSuspectDataSourceSetsPrevious = numSuspectDataSourceSets; } for (Set<VTIWrapper> executingDataSources : latestExecutingDataSourceSets) { logger.logInfo("Checking set of executing data sources: " + executingDataSources ); GaianResult gResult = gaianResultsForExecutingDataSources.remove( executingDataSources ); if ( null == gResult ) continue; // should not happen but possible if latestExecutingDataSourceSets was not cleared properly synchronized( executingDataSources ) { int sizeBeforeRootOut = executingDataSources.size(); if ( true == executingDataSources.removeAll( hangingDataSources ) ) { // executingDataSources.notify(); // wake up fetcher thread to tell it about its cancelled hanging executors // Some of the hanging vtis were in this set of executing vtis.. so notify the GaianResult using // a Poison Pill: An empty result with the number hanging vtis that have been rooted out of this set. logger.logInfo( "Rooted out " + (sizeBeforeRootOut - executingDataSources.size()) + " hanging data sources for query" ); if ( 0 == executingDataSources.size() ) { // try { gResult.endResults(); logger.logInfo("Put poison pill on rowResultsBuffer queue as there are no more executing threads"); // } catch (InterruptedException e) { // logger.logException("Interrupted while putting final termination row on queue: ", e); // } } } } gResult.reEnableCheckingOfHangingQueries(); } // Clear temporary sets for next iteration hangingDataSources.clear(); latestExecutingDataSourceSets.clear(); for (String gc : latestMaintainedConnections) { ConnectionMaintainer cm = maintainers.get(gc); if ( /*false == cm.isInitialConnectionAttemptInProgress() &&*/ false == cm.isTwoWayConnected() ) { logger.logInfo("Maintenance fct call failed for outbound connection " + gc + " to discovered node, dropping it"); // This synchronized block encapsulates both stmts in case an "add node" tries to come in in-between the two. // The "add node" would have to wait for the maintaining state for gc to be removed before it is added again... // Note a node cannot be found to be lost in between it being added and reloaded so that case is safe. synchronized( maintainedConnections ) { if ( GaianDBConfig.isDiscoveredConnection(gc) ) if ( false == cm.isConnectionMaintainedByTheOtherNode() || false == GaianNodeSeeker.reverseMaintenanceDirectionToIncoming( gc ) ) GaianNodeSeeker.lostDiscoveredConnection( gc ); maintainedConnections.remove(gc); // maintainers.remove(gc); // not actually necessary } } } // Clear temp set after use latestMaintainedConnections.clear(); } } catch ( Throwable e ) { GaianNode.stop( "Error/Exception in DatabaseConnectionsChecker", e ); } finally { connectionsCheckerThread = null; } } private class ConnectionTester implements Runnable { private static final String POLL_SQL = "values 1"; // "select 1 from sysibm.sysdummy1"; // private static final String MAINTAIN_CONNECTION_SQL = "values checkConnection()"; // "select 1 from sysibm.sysdummy1"; private Connection c; private final VTIRDBResult dataSource; private boolean pollSucceeded = false; public ConnectionTester( VTIRDBResult dataSource ) { this.dataSource = dataSource; } public void reinitialise() { pollSucceeded = false; // can't use a statement on same connection :( - Derby locks up when trying to exec simultaneously on 2 statements of a same connection // this.c = dataSource.getConnectionOfLongestRunningStatement(); Connection c = null; try { c = dataSource.getConnectionFromApplicablePool(); // dataSource.getConnectionOfLongestRunningStatement(); } catch (SQLException e) { logger.logInfo("Unable to get connection for ConnectionTester for data source: " + dataSource); } this.c = c; } public boolean pollSucceeded() { return pollSucceeded; } public void run() { logger.logInfo("Checking database connection "+c+" for data source: " + dataSource); try { // Note if c is null it means the statement we are checking actually completed execution just as we were about to check... if ( null == c ) logger.logInfo("Poll not executed against data source: " + dataSource + ", cause: could not obtain connection (data source query completed?)"); else { Statement s = c.createStatement(); logger.logInfo("Executing poll query against new statement of connection "+c+" for data source: " + dataSource); if ( GaianNode.isInTestMode() && GaianDBConfigProcedures.internalDiags.containsKey("hang_on_suspect_connection_poll") ) { GaianDBConfigProcedures.internalDiags.remove("hang_on_suspect_connection_poll"); // disable this straight away logger.logInfo("Executing Poll replaced with a simulated hang using jsleep(60000) for data source: " + dataSource ); s.executeQuery("values jsleep(60000)").close(); // sleep remotely for one minute to simulate hang for testing logger.logInfo("Poll simulating a hang with jsleep(60000) completed for data source: " + dataSource ); } else s.executeQuery(POLL_SQL).close(); logger.logInfo("Poll succeeded for active jdbc connection of data source: " + dataSource ); } pollSucceeded = true; dataSource.reEnableNow(); dataSource.returnConnectionToApplicablePool(c); } catch (SQLException e) { logger.logWarning(GDBMessages.ENGINE_DS_CONN_POLL_ERROR, "Unable to poll active jdbc connection of data source: " + dataSource + ", cause: " + e); } } } private class ConnectionMaintainer implements Runnable { private final String gc; private boolean isTwoWayConnected = false; // Previously used to skip verification of this connection allow more time for a first maintenance call - because the establishment of a first connection to the peer node is time-consuming. // private int connectionTimeout = 0; private boolean isFirstMaintenanceCall = true; private String errmsg = null; public ConnectionMaintainer( String gc ) { // , int initialConnectionTimeout ) { this.gc = gc; // this.connectionTimeout = connectionsCheckerHeartbeat; } public void reinitialise() { isTwoWayConnected = false; isFirstMaintenanceCall = false; errmsg = null; } public boolean isTwoWayConnected() { return isTwoWayConnected; } public boolean isConnectionMaintainedByTheOtherNode() { return null != errmsg && -1 != errmsg.indexOf( GaianNodeSeeker.VALID_CONNECTION_BUT_INVALID_MAINTENANCE_DIRECTION ); } // public boolean isInitialConnectionAttemptInProgress() { // return connectionTimeout != connectionsCheckerHeartbeat; // } public void run() { String connectionProperties = null; Stack<Object> pool = null; Connection c = null; String nodeID = null; String sql = null; // Do some validation try { connectionProperties = GaianDBConfig.getRDBConnectionDetailsAsString( gc ); // Check that gc exists since we last checked - hasnt been removed by the Seeker for being on an unwanted host if ( !GaianDBConfig.isDiscoveredOrDefinedConnection(gc) ) { errmsg = "Not a discovered or defined connection: " + gc; } else { nodeID = GaianDBConfig.getDiscoveredNodeID(gc); } } catch (Exception e) { errmsg = "Cannot lookup " + gc + ": " + e; } if ( null == errmsg ) { pool = DataSourcesManager.getSourceHandlesPool( connectionProperties ); // logger.logInfo("Checking database connection for: " + gc); try { // System.out.println("!!!!!!!!!! Getting connection for " + connectionProperties); c = DataSourcesManager.getPooledJDBCConnection( connectionProperties, pool, connectionsCheckerHeartbeat ); // connectionTimeout ); // System.out.println("!!!!!!!!!! GOTTTTTTTTTTTTT connection for " + connectionProperties); final String sslMode = GaianDBConfig.getSSLMode(); sql = "values " + connectionMaintainerFunction + "('" + GaianDBConfig.getGaianNodeID() + "', '" + GaianDBConfig.getGaianNodeUser() + "', '" + Util.escapeSingleQuotes( GaianDBConfig.getGaianNodePasswordScrambled() ) + "', '" + (isFirstMaintenanceCall?"INIT,":"") + (null==sslMode?"":SSLMODE_TAG+sslMode+',') + DISTANCE2SERVER_TAG + distanceToServerNode + "')"; // System.out.println("Maintenance check SQL: " + sql); Statement stmt = c.createStatement(); // stmt.setQueryTimeout( connectionTimeout-1000 ); ResultSet rs = stmt.executeQuery( sql ); if ( rs.next() ) { errmsg = rs.getString(1); isTwoWayConnected = null == errmsg || errmsg.startsWith(SUCCESS_TAG); if ( isTwoWayConnected ) resolveBestPathToNonLiteNodeFromMaintenanceMessage(errmsg, nodeID); } else errmsg = "No result rows returned by maintenance function!"; rs.close(); } catch (SQLException e) { errmsg = "SQLException caught: " + e.toString(); } // finally { connectionTimeout = connectionsCheckerHeartbeat; } } if ( isTwoWayConnected ) logger.logDetail("Maintenance check succeeded for gaian connection: " + gc + " to " + nodeID + ", pool size: " + pool.size()); else logger.logWarning(GDBMessages.ENGINE_CONN_MAINTENANCE_CHECK_ERROR, "Maintenance check failed for gaian connection " + gc + " to " + nodeID + ", sql = " + sql + "; cause: " + errmsg); // (null==errmsg ? "rc=" + rc : errmsg) ); if ( null != c ) pool.push(c); } } public static void invalidatePotentialPathToServerNode( String node ) { if ( 0 < distanceToServerNode && null != node && node.equals(bestPathToServer) ) { distanceToServerNode = -1; bestPathToServer = null; } } public static void resolveBestPathToNonLiteNodeFromMaintenanceMessage( String info, String candidateNode ) { if ( null != info ) { int idx = info.indexOf(DISTANCE2SERVER_TAG); int idx2 = info.indexOf(",", idx); if ( -1 == idx ) { logger.logInfo("No extra info detected from maintainConnection() call"); } else { if ( -1 == idx2 ) idx2 = info.length(); try { int dist = Integer.parseInt( info.substring(idx+DISTANCE2SERVER_TAG.length(), idx2).trim() ); if ( -1 < dist ) if ( -1 == distanceToServerNode || dist+1 < distanceToServerNode ) { distanceToServerNode = dist+1; bestPathToServer = candidateNode; logger.logInfo("Updated distance to nearest derby enabled node: " + distanceToServerNode + ", via " + bestPathToServer); } } catch ( Exception e ) { logger.logInfo("Unable to retrieve a valid distanceToServerNode from maintainConnection() info: " + e); } } } } }