/**
* <p>Title: ReplicaManager </p>
* <p>Description: </p>
* @version $Id$
*/
package diskCacheV111.replicaManager ;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.PrintWriter;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.MissingResourceException;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import diskCacheV111.pools.PoolV2Mode;
import diskCacheV111.replicaManager.ReplicaDbV1.DbIterator;
import diskCacheV111.repository.CacheRepositoryEntryInfo;
import diskCacheV111.util.Pgpass;
import diskCacheV111.util.PnfsId;
import diskCacheV111.vehicles.PnfsAddCacheLocationMessage;
import diskCacheV111.vehicles.PnfsDeleteEntryNotificationMessage;
import diskCacheV111.vehicles.PnfsModifyCacheLocationMessage;
import diskCacheV111.vehicles.PoolModifyModeMessage;
import diskCacheV111.vehicles.PoolStatusChangedMessage;
import dmg.cells.nucleus.CellEvent;
import dmg.util.command.Command;
import dmg.util.command.DelayedCommand;
import org.dcache.util.Args;
import static org.dcache.util.ByteUnit.KiB;
public class ReplicaManagerV2 extends DCacheCoreControllerV2
{
private static final Logger _log =
LoggerFactory.getLogger(ReplicaManagerV2.class);
private static final int _maxPnfsIdHashSize = KiB.toBytes(16);
private String _jdbcUrl = "jdbc:postgresql://localhost/replicas";
private String _user = "postgres";
private String _pass = "NoPassword";
private String _pwdfile;
private ReplicaDbV1 _dbrmv2;
private boolean _useDB;
private Args _args;
private Adjuster _adj;
private WatchPools _watchPools;
private Thread _watchDog;
private Thread _dbThread;
private Thread _adjThread;
private volatile boolean _stopThreads;
private volatile boolean _runAdjuster = true;
private boolean _XXcheckPoolHost;
/**
 * Switches the pool host lookup on or off.
 *
 * @param d {@code true} to make pool scans also query and record the pool's host name
 */
public void setCheckPoolHost(boolean d) {
    this._XXcheckPoolHost = d;
}
/**
 * @return the current value of the pool host lookup flag
 */
private boolean getCheckPoolHost() {
    return this._XXcheckPoolHost;
}
// private ReplicaManagerCLI _cli = new ReplicaManagerCLI();
// private ReplicaManagerCLIDebug _cliDebug = null;;
private int _repId = 1;
private int _redId = 1;
private int _cntOnlinePools;
private final Set<String> _poolsToWait = new HashSet<>(); // Contains old online pools from db
private Map<String, String> _poolMap;
private int _repMin = 2; // Min num. of replicas Adjuster will keep
private int _repMax = 3; // Max num. of replicas Adjuster will keep
// Resilient pool Group
private ResilientPools _resilientPools;
public class ResilientPools {
private List<String> _resPoolsList;
// defaults:
private String _resilientPoolGroupName = "ResilientPools";
private List<String> getResilientPools() {
return _resPoolsList;
}
public ResilientPools( Args args )
{
String group = args.getOpt("resilientGroupName");
if( group != null && (!group.isEmpty()) ) {
_resilientPoolGroupName = group;
_log.info("resilientGroupName={}", group);
}else{
_log.warn("Argument 'resilientGroupName' is not defined, use default settings:"
+ " _resilientPoolGroupName={}", _resilientPoolGroupName);
}
}
public List<String> init()
throws Exception {
_log.debug("Asking for Resilient Pools Group List, resilientPoolGroupName="
+ _resilientPoolGroupName);
try {
_resPoolsList = getPoolGroup(_resilientPoolGroupName);
} catch (Exception ex) {
_log.warn("ERROR: ##### Can not get Resilient Pools Group " + _resilientPoolGroupName + " ####");
throw ex;
}
if (_resPoolsList == null) {
_log.warn("ERROR: ##### Can not get Resilient Pools Group " + _resilientPoolGroupName + " ####");
throw new Exception("Can not get Group " + _resilientPoolGroupName);
}
_log.info("Got " + _resPoolsList.size() + " pools listed in the group "
+ _resilientPoolGroupName);
if (_resPoolsList.isEmpty()) {
_log.warn("ERROR: ##### Group " + _resilientPoolGroupName + " is empty ####");
throw new Exception("Group " + _resilientPoolGroupName + " is empty");
}
_log.info("ResilientPools pools: " + _resPoolsList);
return _resPoolsList;
}
}
/**
 * Fetches the resilient pool group, retrying until it succeeds.
 *
 * Between failed attempts the thread pauses so that a persistently failing
 * lookup does not spin, and the loop gives up as soon as shutdown has been
 * requested via {@code _stopThreads}.
 *
 * @throws IllegalStateException if shutdown is requested before the group could be fetched
 */
private void initResilientPools() {
    final long retryDelay = 10 * SECOND;

    while (true) { // try to connect Pool Manager until success or shutdown
        if (_stopThreads) {
            throw new IllegalStateException(
                    "Shutdown requested before resilient pool group could be fetched");
        }

        try {
            List<String> l = _resilientPools.init();
            if (l != null) {
                return;
            }
        } catch (Exception ex) {
            _log.warn("InitResilientPools() - got exception '" + ex + "'");
        }

        try {
            Thread.sleep(retryDelay);
        } catch (InterruptedException ex) {
            // Only keep the interrupt status when we are really stopping;
            // otherwise the next sleep would return immediately and the loop would spin.
            if (_stopThreads) {
                Thread.currentThread().interrupt();
            }
        }
    }
}
/**
 * Returns the names of the active pools that are members of the
 * resilient pool group.
 *
 * The list obtained from {@code getPoolList()} is filtered in place and returned.
 *
 * @return list of pool names (Strings)
 * @throws Exception (see exceptions in getPoolList() ).
 */
@Override
public List<String> getPoolListResilient()
        throws Exception
{
    final List<String> activePools = getPoolList();
    final List<String> groupMembers = _resilientPools.getResilientPools();
    activePools.retainAll(groupMembers);
    return activePools;
}
//
private final Object _dbLock = new Object();
private boolean _initDbActive;
private volatile boolean _runPoolWatchDog;
private boolean _hotRestart = true;
private InitDbRunnable _initDbRunnable;
private static final long SECOND = 1000L;
private static final long MINUTE = 60 * SECOND;
private static final long HOUR = 60 * MINUTE;
private long _delayDBStartTO = 20*MINUTE; // - wait for remote pools to get conncted
private long _delayAdjStartTO = 21*MINUTE; // - wait for new pools to start
private long _delayPoolScan = 2*MINUTE; // - wait for remote pools get connected
// before polling pool status
//
/**
 * Monitor object used to signal database changes to a waiting thread.
 *
 * It carries a "changed" flag and the set of pnfsIds that were touched
 * since the last {@link #reset()}. All methods synchronize on the instance,
 * which is also the object waiting threads call {@code wait()} on.
 */
private static class DBUpdateMonitor {
    // Set by wakeup(); cleared by reset().
    private boolean _bool;
    // String form of the pnfsIds modified since the last reset().
    private final Collection<String> _updatedPnfsId = new LinkedHashSet<>();

    DBUpdateMonitor() {
    }

    /**
     * Clears the flag and the recorded pnfsIds.
     *
     * @return {@code true} if the flag was set or any pnfsId had been recorded
     */
    public synchronized boolean reset() {
        // there were any changes in pool status or pnfsId added / removed
        boolean changed = _bool || !_updatedPnfsId.isEmpty();
        _bool = false;
        _updatedPnfsId.clear();
        return changed;
    }

    /** @return the current value of the flag */
    public synchronized boolean booleanValue() {
        return _bool;
    }

    /** Sets the flag and wakes up all threads waiting on this monitor. */
    public synchronized void wakeup() {
        _bool = true;
        // The monitor is held (synchronized method), so notifyAll() can not
        // throw IllegalMonitorStateException.
        notifyAll();
    }

    /** Wakes up waiting threads without setting the flag. */
    public synchronized void sendNotify() {
        notifyAll();
    }

    /**
     * Wakes up waiting threads; the flag is set as well once more than
     * {@code _maxPnfsIdHashSize} pnfsIds have been recorded.
     */
    public synchronized void wakeupByPnfsId() {
        /** @todo
         *  rename it all ...
         */
        if (_updatedPnfsId.size() > _maxPnfsIdHashSize) {
            wakeup();
        } else {
            sendNotify();
        }
    }

    /** Records the pnfsId as modified. */
    public synchronized void addPnfsId(PnfsId p) {
        _updatedPnfsId.add(p.toString());
    }

    /** @return {@code true} if the pnfsId was recorded since the last reset */
    public synchronized boolean hasPnfsId(PnfsId p) {
        return _updatedPnfsId.contains(p.toString());
    }
}
// private Boolean _dbUpdated = Boolean.FALSE;
private final DBUpdateMonitor _dbUpdated = new DBUpdateMonitor();
/**
 * Reads the database connection settings from the cell arguments.
 *
 * The options {@code dbURL}, {@code dbUser} and {@code dbPass} override the
 * built-in defaults. When {@code pgPass} names a non-empty password file, the
 * password found there for the effective URL and user replaces the password.
 *
 * @throws IllegalArgumentException if URL, user or both password sources are missing
 */
private void parseDBArgs() {
    // NOTE(review): the arguments may contain dbPass in clear text - confirm this
    // is acceptable for the info log.
    _log.info("Parse DB arguments "+_args);

    String cfURL = _args.getOpt("dbURL");
    if (cfURL != null) {
        _jdbcUrl = cfURL;
    }

    String cfUser = _args.getOpt("dbUser");
    if (cfUser != null) {
        _user = cfUser;
    }

    String cfPass = _args.getOpt("dbPass");
    if (cfPass != null) {
        _pass = cfPass;
    }

    _pwdfile = _args.getOpt("pgPass");

    // Now check if all required parameters are present
    if ((_jdbcUrl == null) || (_user == null) || (_pass == null && _pwdfile == null)) {
        throw new IllegalArgumentException("Not enough arguments to Init SQL database");
    }

    if (_pwdfile != null && !_pwdfile.isEmpty()) {
        Pgpass pgpass = new Pgpass(_pwdfile);
        // Look up with the effective URL and user: the raw option values are
        // null whenever the defaults are in use.
        String p = pgpass.getPgpass(_jdbcUrl, _user);
        if (p != null) {
            _pass = p;
        }
    }
}
/**
 * Applies the optional cell arguments to this cell, the adjuster and the
 * pool watch dog.
 *
 * Every timeout option {@code X} is accompanied by an option {@code XUnit}
 * that names a {@link TimeUnit} constant.
 *
 * @throws IllegalArgumentException if a timeout is given without its unit,
 *         or the unit is not a {@link TimeUnit} name
 * @throws NumberFormatException if a numeric option can not be parsed
 */
private void parseArgs() {
    String min = _args.getOpt("min");
    if (min != null) {
        _repMin = Integer.parseInt(min);
        _adj.setMin(_repMin);
        _log.info("Set _repMin=" + _repMin);
    }

    String max = _args.getOpt("max");
    if (max != null) {
        _repMax = Integer.parseInt(max);
        _adj.setMax(_repMax);
        _log.info("Set _repMax=" + _repMax);
    }

    Long delayDBStartTO = parseTimeoutMillis("delayDBStartTO");
    if (delayDBStartTO != null) {
        _delayDBStartTO = delayDBStartTO;
        _log.info("Set _delayDBStartTO=" + _delayDBStartTO + " ms");
    }

    Long delayAdjStartTO = parseTimeoutMillis("delayAdjStartTO");
    if (delayAdjStartTO != null) {
        _delayAdjStartTO = delayAdjStartTO;
        _log.info("Set _delayAdjStartTO=" + _delayAdjStartTO + " ms");
    }

    Long waitDBUpdateTO = parseTimeoutMillis("waitDBUpdateTO");
    if (waitDBUpdateTO != null) {
        _adj.setWaitDBUpdateTO(waitDBUpdateTO);
        _log.info("Set waitDBUpdateTO=" + waitDBUpdateTO + " ms");
    }

    Long waitReplicateTO = parseTimeoutMillis("waitReplicateTO");
    if (waitReplicateTO != null) {
        _adj.setWaitReplicateTO(waitReplicateTO);
        _log.info("Set waitReplicateTO=" + waitReplicateTO + " ms");
    }

    Long waitReduceTO = parseTimeoutMillis("waitReduceTO");
    if (waitReduceTO != null) {
        _adj.setWaitReduceTO(waitReduceTO);
        _log.info("Set waitReduceTO=" + waitReduceTO + " ms");
    }

    Long poolWatchDogPeriod = parseTimeoutMillis("poolWatchDogPeriod");
    if (poolWatchDogPeriod != null) {
        _watchPools.setPeriod(poolWatchDogPeriod);
        _log.info("Set poolWatchDogPeriod=" + poolWatchDogPeriod + " ms");
    }

    Long excludedFilesExpirationTO = parseTimeoutMillis("excludedFilesExpirationTO");
    if (excludedFilesExpirationTO != null) {
        _watchPools.setExcludedExpiration(excludedFilesExpirationTO);
        _log.info("Set excludedFilesExpirationTO=" + excludedFilesExpirationTO + " ms");
    }

    String maxWorkers = _args.getOpt("maxWorkers");
    if (maxWorkers != null) {
        int mx = Integer.parseInt(maxWorkers);
        _adj.setMaxWorkers(mx);
        _log.info("Set adjuster maxWorkers=" + mx);
    }

    // "hotRestart" is evaluated last, so it wins when both options are given.
    if (_args.hasOption("coldStart")) {
        _hotRestart = false;
    }
    if (_args.hasOption("hotRestart")) {
        _hotRestart = true;
    }

    String argSameHost = _args.getOpt("enableSameHostReplica");
    if (argSameHost != null) {
        setEnableSameHostReplica(Boolean.parseBoolean(argSameHost));
    }

    String argCheckPoolHost = _args.getOpt("XXcheckPoolHost");
    if (argCheckPoolHost != null) {
        setCheckPoolHost(Boolean.parseBoolean(argCheckPoolHost));
    }
}

/**
 * Reads the timeout option {@code option} and converts it to milliseconds
 * using the unit named by the option {@code option + "Unit"}.
 *
 * @return the timeout in milliseconds, or {@code null} if the option is not set
 */
private Long parseTimeoutMillis(String option) {
    String value = _args.getOpt(option);
    if (value == null) {
        return null;
    }
    String unit = _args.getOpt(option + "Unit");
    if (unit == null) {
        throw new IllegalArgumentException(
                "Option '" + option + "' requires option '" + option + "Unit'");
    }
    return TimeUnit.valueOf(unit).toMillis(Long.parseLong(value));
}
/**
 * Creates the cell and keeps a reference to its parsed arguments.
 *
 * @param cellName name passed on to the superclass
 * @param args     argument string passed on to the superclass
 */
public ReplicaManagerV2(String cellName, String args)
{
    super(cellName, args);
    this._args = getArgs();
}
/**
 * Prepares the cell: resolves the database settings, creates the database
 * handle, the adjuster and the pool watch dog, applies the remaining
 * arguments and creates (but does not start) the worker threads.
 *
 * @throws Exception declared by the overridden method
 */
@Override
protected void starting() throws Exception
{
_log.info("Starting cell");
parseDBArgs();
_log.debug("Setup database with: URL={} user={} passwd=********", _jdbcUrl, _user);
_dbrmv2 = new ReplicaDbV1(this, _jdbcUrl, _user, _pass);
// _adj and _watchPools must exist before parseArgs(), which calls setters on both.
_adj = new Adjuster(_repMin, _repMax, _dbrmv2);
_watchPools = new WatchPools(_dbrmv2);
_log.info("Parse arguments");
parseArgs();
_resilientPools = new ResilientPools(_args);
// Created after parseArgs() so that a configured delayDBStartTO is already in _delayDBStartTO.
_initDbRunnable = new InitDbRunnable(_delayDBStartTO);
_log.info("Create threads");
// Threads are only created here; none of them is started in this method.
_dbThread = getNucleus().newThread(_initDbRunnable, "RepMgr-initDB");
_adjThread = getNucleus().newThread(_adj, "RepMgr-Adjuster");
_watchDog = getNucleus().newThread(_watchPools, "RepMgr-PoolWatchDog");
}
/**
 * Launches the Init DB thread followed by the Adjuster thread.
 * The pool watch dog thread is not started here.
 */
@Override
protected void started()
{
    _log.info("Start Init DB thread");
    this._dbThread.start();

    _log.info("Start Adjuster thread");
    this._adjThread.start();
}
/**
 * Shuts the cell down: raises the stop flags, interrupts the worker threads,
 * waits up to one second for each of them, closes the database handle and
 * finally delegates to the superclass.
 *
 * Fields that were never initialized (for example because startup failed
 * early) are skipped. If the calling thread is interrupted while waiting,
 * the remaining joins are skipped and the interrupt status is restored
 * once the shutdown sequence has finished.
 */
@Override
public void stopped()
{
    _log.debug("=== cleanUp called ===");
    _stopThreads = true;
    _runPoolWatchDog = false;

    Thread[] workers = { _dbThread, _adjThread, _watchDog };

    // Interrupt all workers first so that they can wind down in parallel.
    for (Thread worker : workers) {
        if (worker != null) {
            worker.interrupt();
        }
    }

    boolean interrupted = false;
    try {
        for (Thread worker : workers) {
            if (worker != null) {
                worker.join(1000);
            }
        }
    } catch (InterruptedException e) {
        _log.warn("Replica manager failed to shut down", e);
        interrupted = true;
    }

    try {
        // _dbrmv2 is null when starting() failed before the database handle was created.
        if (_dbrmv2 != null) {
            _dbrmv2.close();
        }
        super.stopped();
    } finally {
        // Restore the status only now, so that it does not disturb the cleanup above.
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }
}
/**
 * Appends the replica manager state to the output of the superclass.
 *
 * @param pw writer receiving the info lines
 */
@Override
public void getInfo(PrintWriter pw) {
    super.getInfo(pw);

    // _initDbActive is read while holding _dbLock.
    final boolean initDbActive;
    synchronized (_dbLock) {
        initDbActive = _initDbActive;
    }
    pw.println(" initDb Active : " + initDbActive);

    final boolean sameHostReplica = getEnableSameHostReplica();
    pw.println(" enableSameHostReplica : " + sameHostReplica);

    final boolean checkPoolHost = getCheckPoolHost();
    pw.println(" XXcheckPoolHost : " + checkPoolHost);
}
// private ReplicaDbV1 installReplicaDb(boolean keep) throws SQLException {
// return new ReplicaDbV1(this, keep) ;
// }
//
//
//
/**
 * Replaces the database content for one pool with the pool's current repository listing.
 *
 * The listing is requested up to four times when it fails with
 * ConcurrentModificationException or MissingResourceException. When the
 * pool host check is enabled, the pool's host name is stored in _hostMap as well.
 *
 * @param poolName pool to synchronize
 * @throws Exception any failure while fetching the repository or the host name
 */
private void dbUpdatePool(String poolName) throws Exception {
    _log.info(" dbUpdatePool " + poolName);

    // Get pool list
    List<CacheRepositoryEntryInfo> repository;
    try {
        int attempt = 1;
        while (true) {
            try {
                repository = getPoolRepository(poolName);
                break;
            } catch (ConcurrentModificationException cmee) {
                _log.warn(" dbUpdatePool - Pnfs List was invalidated. retry=" + attempt + " pool=" + poolName);
                if (attempt == 4) {
                    throw cmee;
                }
            } catch (MissingResourceException mre) {
                _log.warn(" dbUpdatePool - Can not get PnfsId List. retry=" + attempt + " pool=" + poolName);
                if (attempt == 4) {
                    throw mre;
                }
            }
            attempt++;
        }
    } catch (Exception ee) {
        _log.warn(" dbUpdatePool - Problem fetching repository from " + poolName + " : " + ee);
        _log.debug(ee.toString(), ee);
        throw ee;
    }

    // Got pool list OK
    _log.info(" dbUpdatePool - Got " + repository.size() + " pnfsIds from " + poolName);

    // Drop the old records of the pool, then insert the fresh listing.
    _dbrmv2.removePool(poolName);
    _dbrmv2.addPnfsToPool(repository, poolName);

    if (!_XXcheckPoolHost) {
        return;
    }

    // get host name from pool
    try {
        String host = getPoolHost(poolName);
        if (host != null) {
            synchronized (_hostMap) {
                _hostMap.put(poolName, host);
                _log.debug("dbUpdatePool: _hostMap updated, pool=" + poolName
                        + " host=" + host );
            }
        }
    } catch (Exception ee) {
        _log.warn(" dbUpdatePool - Problem get/set host name for the pool " +
                poolName +
                " : " + ee);
        _log.debug(ee.toString(), ee);
        throw ee;
    }
}
//
// cleanupDb - cleanup db, preparation phase for initDb()
//
/**
 * Prepares the database for initDb().
 *
 * On hot restart the status of every pool known to the database is saved
 * into _poolMap; pools recorded as online are remembered in _poolsToWait
 * and their entries are cleared. Otherwise the database is cleared completely.
 * The online pool counter is reset in both cases.
 */
private void cleanupDb() {
    synchronized (_dbLock) {
        _log.info("Starting cleanupDb()");

        // Save old pools state from DB into Map for hot restart
        // "pool" table will be cleared in DB and state lost
        _poolMap = new HashMap<>();

        if (_hotRestart) {
            _log.info("Clear DB for online pools");
            _log.debug("Save old db pools state into map");

            _dbrmv2.clearTransactions();

            Iterator<String> p = _dbrmv2.getPools();
            try {
                while (p.hasNext()) {
                    String pool = p.next();
                    String poolSts = _dbrmv2.getPoolStatus(pool);
                    _poolMap.put(pool, poolSts);
                    _log.debug("Add to poolMap : [" + pool + "] " + poolSts);

                    // Constant first: a pool without recorded status must not cause an NPE.
                    if (ReplicaDb1.ONLINE.equals(poolSts)) {
                        _poolsToWait.add(pool);   // List old online pools in DB
                        _dbrmv2.clearPool(pool);  // clear all entries for online pool
                    }
                }
            } finally {
                // Release the iterator even when a database call above fails.
                ((DbIterator<?>) p).close();
            }
        } else {
            _log.info("Cleanup DB");
            _dbrmv2.clearAll(); // Clear "replica" and "pools" tables
            _dbrmv2.clearTransactions();
        }

        _cntOnlinePools = 0;
    }
}
//
// initDb - update db with files locations in pools
//
/**
 * Fills the database with the file locations of all connected resilient pools.
 *
 * Each pool is first recorded as offline and then scanned. A pool whose scan
 * fails stays offline. After a successful scan the pool gets back the status
 * saved in _poolMap, or becomes online when no status (or "UNKNOWN") was saved.
 *
 * @throws Exception if the list of resilient pools can not be obtained
 */
private void initDb() throws Exception {
    _log.info("Starting initDb()");

    synchronized (_dbLock) {
        initResilientPools();

        _log.debug("Asking for Pool List");
        List<String> pools = getPoolListResilient();
        _log.info("Got " + pools.size() + " resilient pools connected");

        for (String poolName : pools) {
            _log.debug("Got pool [" + poolName + "]");

            final String previous = _poolMap.get(poolName);
            _log.debug("Got from poolMap : " + poolName + " " + previous);

            // ... and add it - so record will be
            _dbrmv2.setPoolStatus(poolName, ReplicaDb1.OFFLINE);

            try {
                dbUpdatePool(poolName);
            } catch (Exception ee) {
                _log.info(" initDb - Problem fetching repository from " + poolName + " : " + ee);
                _log.info(" initDb - pool " + poolName + " stays '" + ReplicaDb1.OFFLINE + "'");
                continue;
            }

            // Set status online for only 'new' (unknown) pools,
            // otherwise leave pool state as it was before
            final boolean unknownBefore = previous == null || previous.equals("UNKNOWN");
            final String status = unknownBefore ? ReplicaDb1.ONLINE : previous;

            _dbrmv2.setPoolStatus(poolName, status);

            if (status.equals(ReplicaDb1.ONLINE)) {
                _poolsToWait.remove(poolName);
                _log.debug("Pool " + poolName + " set online, _poolsToWait.size()=" +
                        _poolsToWait.size());
                _cntOnlinePools++;
            }
        }

        _useDB = true; // set flag for call back routines
    } // synchronized

    _log.info("Init DB done");
}
/////////////////////////////////////////////////////////////////////////////
// Adjuster thread
/////////////////////////////////////////////////////////////////////////////
private class Adjuster implements Runnable {
private long waitDBUpdateTO = 10*MINUTE ; // 10 min - re-run Adjuster by TO if DB was NOT notified
private long waitReplicateTO = 12*HOUR ; // wait for replicattion to finish
private long waitReduceTO = 12*HOUR ; // wait for reduction to finish
private int _min = 2;
private int _max = 2;
private int _maxWorkers = 4;
private int _replicated;
private int _removed;
private String _status = "not updated yet";
// private boolean _adjIncomplete = false;
// private boolean _adjFinished;
private Semaphore workerCount;
private Semaphore workerCountRM;
private int _cntThrottleMsgs;
private boolean _throttleMsgs;
private Set<String> _poolsWritable = new HashSet<>(); // can be Destination pools
private Set<String> _poolsReadable = new HashSet<>(); // can be Source pools
private ReplicaDbV1 _db;
/**
 * @param min minimum replica count stored in this adjuster
 * @param max maximum replica count stored in this adjuster
 * @param db  database handle used by this adjuster
 */
public Adjuster(int min, int max, ReplicaDbV1 db)
{
    this._min = min;
    this._max = max;
    this._db = db;
}
/** Sets the minimum replica count used when scanning for deficient files. */
public void setMin(int min) {
    this._min = min;
}
/** Sets the maximum replica count used when scanning for redundant files. */
public void setMax(int max) {
    this._max = max;
}
/** Sets how long, in milliseconds, the adjuster waits for a DB update notification. */
public void setWaitDBUpdateTO(long delay) {
    this.waitDBUpdateTO = delay;
}
/** @return the DB update wait timeout in milliseconds */
public long getWaitDBUpdateTO() {
    return this.waitDBUpdateTO;
}
/** Sets the timeout, in milliseconds, a replication is given to finish. */
public void setWaitReplicateTO(long delay) {
    this.waitReplicateTO = delay;
}
/** Sets the value of the reduction timeout field. */
public void setWaitReduceTO(long delay) {
    this.waitReduceTO = delay;
}
/**
 * Sets the number of workers. The value is read when run() creates its
 * worker semaphores.
 */
public void setMaxWorkers(int n) {
    this._maxWorkers = n;
}
/** @return {@code true} when the adjuster is disabled or thread shutdown was requested */
private boolean stopping() {
    if (_stopThreads) {
        return true;
    }
    return !_runAdjuster;
}
/**
 * Body of the Adjuster thread.
 *
 * Phases, in order:
 * 1. wait for the Init DB thread to terminate;
 * 2. start the pool watch dog thread, unless it is flagged as running;
 * 3. loop until stopping(): wait on the _dbUpdated monitor for a change
 *    (or the waitDBUpdateTO timeout) and then run one adjustment pass.
 *
 * The heartbeat record "Adjuster" is updated along the way.
 */
@Override
public void run() {
// Semaphores are sized from _maxWorkers as it is at this moment.
// workerCount is released by Replicator; workerCountRM is presumably its
// counterpart for reduction workers - not visible in this part of the file.
workerCount = new Semaphore(_maxWorkers);
workerCountRM = new Semaphore(_maxWorkers);
_log.info("Adjuster Thread started");
// _db.setHeartBeat("Adjuster","startup");
// Phase 1: block until the Init DB thread has finished.
while (true) {
if (_dbThread != null) {
_log.info("Adjuster - wait for Init DB to finish");
try {
_dbThread.join();
// join() returned normally: Init DB thread is done, leave the wait loop
break;
}
catch (InterruptedException ex) {
_log.info(
"Adjuster - Waiting for connections to complete was interrupted, "
+ ( (_stopThreads) ? "stop thread" : "continue"));
}
catch (Exception ex) {
// NOTE(review): the message below has no space between the exception text and "waiting"
_log.info(
"Adjuster - Got exception "+ex+"waiting for Init DB thread to complete, wait");
}
}
else {
_log.warn(
"Adjuster - did not get DB thread (it can be an error), so sleep for "
+ "60 sec and retry");
try {
Thread.sleep(60 * 1000); // n
}
catch (InterruptedException ex) {
_log.info(
"Adjuster - Waiting for connections to complete was interrupted, "
+ ( (_stopThreads) ? "stop thread" : "continue"));
}
}
// Reached after an interrupt, an exception or the 60 s sleep: give up only on shutdown
if (_stopThreads) {
return;
}
}
// Adjuster can start.
// Phase 2: Start pools watch dog thread
if ( _watchDog == null ) {
_log.info("Starting pool watch dog for the first time - internal ERROR,"
+" class _watchDog not instantiated, startup aborted");
_db.setHeartBeat("Adjuster","aborted");
return;
} else {
if( _runPoolWatchDog ) {
_log.info("Trying to start pool watch dog - Watch dog already running");
}else{
_log.info("Adjuster - start pool watch dog");
_runPoolWatchDog = true;
_watchDog.start();
}
}
// Phase 3: Check DB updates when noticed, or time-to-time
//
boolean haveMore;
_log.info("=== Adjuster Ready, Start the loop ===");
haveMore = true; // preset not finished state to trigger first check without wait
boolean dbUpdatedSnapshot;
_cntThrottleMsgs = 0;
_throttleMsgs = false;
while ( ! stopping() ) { // Loop forever to update DB
try {
// wait() below requires holding the monitor of _dbUpdated
synchronized (_dbUpdated) {
// Check monitor if
// - DB changed during last adjustment
// - last adjustment was not completed (I do one replica at a time)
// reset() reports whether anything changed and clears the recorded changes
dbUpdatedSnapshot = _dbUpdated.reset();
if ( ! dbUpdatedSnapshot
&& ! haveMore) {
_db.setHeartBeat("Adjuster", "waitDbUpdate");
// Wait for update for some time
_log.debug("Adjuster : wait for DB update");
_dbUpdated.wait(waitDBUpdateTO);
}
} // synchronized
if ( stopping() ) { // check one more time after possible wait()
_log.debug("Adjuster Thread - stopping");
break;
}
// Any real work to do re-enables the "no DB updates" messages
if (dbUpdatedSnapshot || haveMore) {
_cntThrottleMsgs = 0;
_throttleMsgs = false;
}
// Check whether wait() broke due to update or timeout:
if ( dbUpdatedSnapshot ) {
_log.info("Adjuster : DB updated, scan DB and replicate or reduce");
} else if ( haveMore ) {
_log.info("Adjuster : adjustment incomplete, rescan DB");
} else {
// Neither an update seen by reset() nor pending work: log it, at most five times in a row
String msg = "Adjuster : no DB updates for "
+ waitDBUpdateTO / 1000L + " sec, rescan DB";
if (++_cntThrottleMsgs < 5) {
_log.info(msg);
} else if (_cntThrottleMsgs == 5) {
_log.info(msg + "; throttle future 'no DB updates' messages ");
_throttleMsgs = true;
}
}
// One pass over the DB; true means another pass is needed
haveMore = runAdjustment();
if( haveMore && ! dbUpdatedSnapshot && ! _throttleMsgs ) {
_log.info("Adjuster : pass finished, adjustment is NOT complete");
} else {
_log.debug("Adjuster : pass finished haveMore=" + haveMore
+ " dbUpdatedSnapshot=" + dbUpdatedSnapshot
+ " _throttleMsgs=" + _throttleMsgs);
}
}
catch (InterruptedException ee) {
// Thrown by wait(); the loop continues unless shutdown was requested
_log.info("Adjuster : thread was interrupted");
if( _stopThreads ) {
break;
}
}
} // - update DB loop
_db.setHeartBeat("Adjuster","done");
_log.debug("Adjuster : done");
}
/**
 * Runs one adjustment pass over the database.
 *
 * The sets of writable and readable pools are refreshed from the database
 * first. Then the four stages drainoff, offline-prepare, deficient and
 * redundant are processed in this order. The pass ends early, after the
 * current stage, when thread shutdown is requested or the DB update flag is set.
 *
 * Optimistic scheme: a pool may still change state during the pass; this
 * reduces most of the conflicts but not all of them.
 *
 * @return {@code true} if any processed stage reported remaining work
 */
public boolean runAdjustment()
{
    _log.debug("Adjuster - started");

    // get "online" pools from DB
    _poolsWritable = drainPoolNames(_db.getPoolsWritable());

    // get from DB pools in online, drainoff, offline-prepare state
    _poolsReadable = drainPoolNames(_db.getPoolsReadable());

    boolean haveMore = false;

    //------- Drainoff -------
    Iterator<String> itDrain = scanDrainoff();
    try {
        haveMore |= processReplication(itDrain, "drainoff");
    } finally {
        ((DbIterator<?>) itDrain).close();
    }
    if (_stopThreads || _dbUpdated.booleanValue()) {
        return haveMore;
    }

    //------ Offline -------
    Iterator<String> itOffline = scanOffline();
    try {
        haveMore |= processReplication(itOffline, "offline-prepare");
    } finally {
        ((DbIterator<?>) itOffline).close();
    }
    if (_stopThreads || _dbUpdated.booleanValue()) {
        return haveMore;
    }

    //------ Deficient -------
    int min = _min; // snapshot: scan and processing must use the same threshold
    Iterator<Object[]> itDeficient = scanDeficient(min);
    try {
        haveMore |= processReplicateDeficient(itDeficient, min);
    } finally {
        ((DbIterator<?>) itDeficient).close();
    }
    if (_stopThreads || _dbUpdated.booleanValue()) {
        return haveMore;
    }

    //------ Redundant -------
    int max = _max; // snapshot, same reason as for min
    Iterator<Object[]> itRedundant = scanRedundant(max);
    try {
        haveMore |= processReduceRedundant(itRedundant, max);
    } finally {
        ((DbIterator<?>) itRedundant).close();
    }

    // adjustment cycle complete
    return haveMore;
}

/**
 * Copies all names delivered by the database iterator into a new set and
 * closes the iterator, also when iterating fails.
 */
private Set<String> drainPoolNames(Iterator<String> it) {
    Set<String> names = new HashSet<>();
    try {
        while (it.hasNext()) {
            names.add(it.next());
        }
    } finally {
        ((DbIterator<?>) it).close();
    }
    return names;
}
/*
 * Query the DB for replicas of files which might be locked in drainoff pools.
 * Status and heartbeat are updated before the query; the query itself runs
 * under _dbLock.
 */
protected Iterator<String> scanDrainoff() {
    _log.debug("Adjuster - scan drainoff");
    _setStatus("Adjuster - scan drainoff");
    _db.setHeartBeat("Adjuster", "scan drainOff");

    synchronized (_dbLock) {
        return _db.getInDrainoffOnly();
    }
}
/*
 * Query the DB for files which can get locked in the set of OFFLINE_PREPARE pools.
 * Copying out a single replica shall be enough to keep access to the file.
 * The query runs under _dbLock.
 */
protected Iterator<String> scanOffline() {
    _log.debug("Adjuster - scan offline-prepare");
    _setStatus("Adjuster - scan offline-prepare");
    _db.setHeartBeat("Adjuster", "scan offline-prepare");

    synchronized (_dbLock) {
        return _db.getInOfflineOnly();
    }
}
/*
 * Query the DB for Deficient files
 * -- all other files with fewer replicas than 'min'.
 * The query runs under _dbLock.
 */
protected Iterator<Object[]> scanDeficient(int min) {
    _log.debug("Adjuster - scan deficient");
    _setStatus("Adjuster - scan deficient");
    _db.setHeartBeat("Adjuster", "scan deficient");

    synchronized (_dbLock) {
        return _db.getDeficient(min);
    }
}
/*
 * Query the DB for Redundant files - those with Extra replicas;
 * reducing them recovers space in pools.
 * The query runs under _dbLock.
 */
protected Iterator<Object[]> scanRedundant(int max) {
    _log.debug("Adjuster - scan redundant");
    _setStatus("Adjuster - scan redundant");
    _db.setHeartBeat("Adjuster", "scan redundant");

    synchronized (_dbLock) {
        return _db.getRedundant(max);
    }
}
/*
 * Copy a single replica out of the pool for every file delivered by the iterator
 * (pool set in drainoff or offline-prepare state).
 * It shall be enough to have access to the file.
 * Deficient files have to be scanned again afterwards to find out whether
 * more replications are needed.
 *
 * Processing stops on thread shutdown or as soon as the DB update flag is set.
 * Returns true when the iterator was not exhausted or a file was skipped
 * because it was modified meanwhile.
 */
protected boolean processReplication(Iterator<String> it, String detail) {
    boolean dbChanged = false;
    boolean skipped = false;

    while (!_stopThreads) {
        dbChanged = _dbUpdated.booleanValue();
        if (dbChanged || !it.hasNext()) {
            break;
        }

        PnfsId pnfsId = new PnfsId(it.next());
        if (!_dbUpdated.hasPnfsId(pnfsId)) {
            // can not use 'extended set of pools ("drainoff", offline-prepare)
            replicateAsync(pnfsId, false);
        } else {
            skipped = true; // skip and tag for further replication
        }
    }

    if (_stopThreads) {
        _log.debug("processReplication() - stopThreads detected, stopping");
    } else if (dbChanged) {
        _log.debug("processReplication() - DB update detected, break processing of " + detail + " pools cycle");
    }

    return it.hasNext() || skipped;
}
/**
 * Starts one replication for every deficient file delivered by the iterator.
 *
 * Each record is expected to hold the pnfsId (String) and the replica
 * count (Integer). Records that are too short, or whose count is not below
 * {@code min}, are counted and reported with a warning. Processing stops on
 * thread shutdown or as soon as the DB update flag is set.
 *
 * @return {@code true} when the iterator was not exhausted, a file needs
 *         more than one extra replica, or a file was skipped because it
 *         was modified meanwhile
 */
protected boolean processReplicateDeficient(Iterator<Object[]> it, int min ) {
    int total = 0;
    int tooShort = 0;
    int notDeficient = 0;
    boolean dbChanged = false;
    boolean again = false;

    while (!_stopThreads) {
        dbChanged = _dbUpdated.booleanValue();
        if (dbChanged || !it.hasNext()) {
            break;
        }

        total++;
        Object[] row = it.next();
        if (row.length < 2) {
            tooShort++;
            continue;
        }

        PnfsId pnfsId = new PnfsId((String) row[0]);
        int replicas = (Integer) row[1];
        int missing = min - replicas;

        if (missing <= 0) { // Must be positive for Deficient
            notDeficient++;
            continue;
        }

        // Only one replica is created per pass: when two or more are missing,
        // or the file was modified meanwhile, the DB has to be scanned again.
        if (missing > 1 || _dbUpdated.hasPnfsId(pnfsId)) {
            again = true;
        }

        if (!_dbUpdated.hasPnfsId(pnfsId)) {
            // ... create one more replica of the file;
            // can not use 'extended set of pools ("drainoff", offline-prepare)
            replicateAsync(pnfsId, false);
        }
    }

    if (tooShort > 0) {
        _log.warn("Error in processReplicateDeficient(): DB.getDeficient() record length <2 in " + tooShort + "/" + total + " records");
    }
    if (notDeficient > 0) {
        _log.warn("Error in processReplicateDeficient(): DB.getDeficient() replica count greater or equal specified min="
                + min + " in " + notDeficient + "/" + total + " records");
    }

    if (_stopThreads) {
        _log.debug("processReplicateDeficient() - stopThreads detected, stopping");
    } else if (dbChanged) {
        _log.debug("processReplicateDeficient() - DB update detected, break replication pass");
    }

    // We are not done with the iterator yet, or there were some replicas we skipped
    return it.hasNext() || again;
}
/**
 * Starts the removal of one replica for every redundant file delivered by the iterator.
 *
 * Each record is expected to hold the pnfsId (String) and the replica
 * count (Integer). Records that are too short, or whose count does not
 * exceed {@code max}, are counted and reported with a warning. Processing
 * stops on thread shutdown or as soon as the DB update flag is set.
 *
 * @param it  records delivered by the redundant scan
 * @param max maximum number of replicas to keep
 * @return {@code true} when the iterator was not exhausted, a file has two
 *         or more extra replicas, or a file was skipped because it was
 *         modified meanwhile
 */
protected boolean processReduceRedundant(Iterator<Object[]> it, int max) {
    int records = 0;
    int corrupted = 0;
    int notRedundant = 0;
    boolean updated = false;
    boolean haveMore = false;

    while (!_stopThreads
            && !(updated = _dbUpdated.booleanValue())
            && it.hasNext())
    {
        records++;
        Object[] rec = it.next();
        if (rec.length < 2) {
            corrupted++;
            continue;
        }

        PnfsId pnfsId = new PnfsId((String) rec[0]);
        int count = (Integer) rec[1];
        int delta = count - max;

        // Must be positive for Redundant
        if (delta <= 0) {
            notRedundant++;
            continue;
        }

        if (delta > 1) { // we need to remove 2 or more replicas of this file
            haveMore = true; // ... set flag to scan DB again to reduce more replicas
        }

        if (_dbUpdated.hasPnfsId(pnfsId)) {
            haveMore = true; // ... set tag and skip if there was modification
        } else {
            reduceAsync(pnfsId); // reduce ONE replica only
        }
    }

    // The warnings name this method and the real condition (count <= max).
    if (corrupted > 0) {
        _log.warn("Error in processReduceRedundant(): DB.getRedundant() record length <2 in " +
                corrupted + "/" + records + " records");
    }
    if (notRedundant > 0) {
        _log.warn("Error in processReduceRedundant(): DB.getRedundant() replica count less or equal specified max="
                + max + " in " + notRedundant + "/" + records + " records");
    }

    if (_stopThreads) {
        _log.debug("processReduceRedundant() - stopThreads detected, stopping");
    } else if (updated) {
        _log.debug("processReduceRedundant() - DB update detected, break reduction pass");
    }

    // We are not done with the iterator yet, or there were some replicas we skipped
    return it.hasNext() || haveMore;
}
//--------------
/**
 * Records the file as excluded in the database, stamped with the current
 * time, and logs the exclusion.
 *
 * @param pnfsId  file to exclude
 * @param errcode error code stored with the record
 * @param errmsg  error message stored with the record
 */
private void excludePnfsId(PnfsId pnfsId, String errcode, String errmsg) {
    synchronized (_dbLock) {
        final long now = System.currentTimeMillis();
        _db.addExcluded(pnfsId, now, errcode, errmsg);
    }

    _log.info("pnfsId=" + pnfsId + " excluded from replication. ");
}
private class Replicator implements Runnable {
private PnfsId _pnfsId;
private int _Id;
int _wCnt;
private boolean _extended; // include drainoff and offline prepare pools
// into the source pools
// HashSet brokenFiles = new HashSet();
/**
 * @param pnfsId   file to replicate
 * @param Id       identifier of this replicator, used in log messages
 * @param cnt      worker counter value, used to report the number of active workers
 * @param extended whether drainoff and offline-prepare pools may serve as source pools
 */
Replicator(PnfsId pnfsId, int Id, int cnt, boolean extended) {
    this._pnfsId = pnfsId;
    this._Id = Id;
    this._wCnt = cnt;
    this._extended = extended;
}
/**
 * Replicates the file unless thread shutdown was requested. In every case
 * the threads waiting on this replicator are notified and the worker
 * permit is released at the end.
 */
@Override
public void run() {
    try {
        if (!_stopThreads) {
            _log.info("Replicator ID=" + _Id + ", pnfsId=" + _pnfsId +
                    " starting, now "
                    + (_maxWorkers - _wCnt) + "/" + _maxWorkers +
                    " workers are active");
            replicate(_pnfsId);
        } else {
            _log.info("Replicator ID=" + _Id + ", pnfsId=" + _pnfsId +
                    " can not start - shutdown detected");
        }
    } catch (InterruptedException ex) {
        _log.info("Replicator for pnfsId=" + _pnfsId + " got exception " + ex);
    } finally {
        // release Adjuster thread waiting till locking record (exclude) is added to DB
        synchronized (this) { // the monitor must be held to invoke notifyAll()
            notifyAll();
        }

        // release() hands back the counter value that is reported below
        _wCnt = workerCount.release();
        _log.info("Replicator ID=" + _Id + ", pnfsId=" + _pnfsId
                + " finished, now " + (_maxWorkers - _wCnt) + "/" + _maxWorkers +
                " workers are active");
    }
}
/**
 * Replicates one file and records the outcome in the replica DB.
 * <p>
 * Starts the transfer through {@code replicatePnfsId()}, registers a "+1"
 * transaction for the file, wakes the threads waiting on this Replicator and
 * then waits on the returned task until it is done or {@code waitReplicateTO}
 * milliseconds have elapsed. On success the destination pool is added to the
 * DB; in every case the transaction record is removed, and for selected
 * error codes the file is added to the excluded list.
 * <p>
 * If the transfer cannot be started the problem is logged and, for
 * recognised error messages, the file is excluded via
 * {@code excludePnfsId()}; the method then returns without adding a
 * transaction record.
 *
 * @param pnfsId file to replicate
 * @throws InterruptedException if interrupted while waiting for the task
 */
private void replicate(PnfsId pnfsId) throws InterruptedException {
    MoverTask observer;
    long start, stop, currentTime;
    long timerToWait, timeToStop;
    _setStatus("Adjuster - Replicating " + pnfsId);
    start = System.currentTimeMillis();
    timeToStop = start + waitReplicateTO;
    try {
        // extended == true -- source pools include drainoff and offline-prepare pools
        // extended == false -- online pools only
        observer = (_extended)
                ? replicatePnfsId(pnfsId, _poolsReadable, _poolsWritable)
                : replicatePnfsId(pnfsId, _poolsWritable, _poolsWritable);
    } catch (MissingResourceException mrex) {
        // getMessage() may be null; normalise it so the prefix checks below
        // cannot throw NullPointerException and kill this worker thread.
        String rawMsg = mrex.getMessage();
        String exMsg = (rawMsg == null) ? "" : rawMsg;
        String exClass = mrex.getClassName();
        String exKey = mrex.getKey();
        _log.info("replicate(" + pnfsId + ") reported : " + mrex);
        if (exMsg.startsWith("Pnfs File not found :")
                || exMsg.equals("Pnfs lookup failed")
                || exMsg.startsWith("Not a valid PnfsId")) {
            // I must remove file from DB, if it has no pnfs entry;
            // mark it 'exclude' for now
            excludePnfsId(pnfsId, "", exMsg);
        } else {
            _log.debug("msg='" + exMsg + "' class='" + exClass + "' key='" +
                    exKey + "' ");
            // not excluded - will retry
        }
        return;
    } catch (IllegalArgumentException iaex) {
        boolean sigFound;
        _log.info("replicate(" + pnfsId + ") reported : " + iaex);
        // Same null-message protection as above.
        String rawMsg = iaex.getMessage();
        String exMsg = (rawMsg == null) ? "" : rawMsg;
        sigFound = // exMsg.startsWith("No pools found,") ||
                exMsg.startsWith(selectSourcePoolError) ||
                exMsg.startsWith(selectDestinationPoolError);
        // Exclude forever from replication;
        /** @todo
         * better to check again when pool arrives and situation changes
         */
        if (sigFound) {
            excludePnfsId(pnfsId, "", exMsg);
        }
        // Report, But do not exclude
        if (exMsg.startsWith("replicatePnfsId, argument")) {
            _log.info("There are not enough pools to get replica from or to put it to; try operation later");
            sigFound = true;
        } else if (exMsg.startsWith("Try again :")) {
            _log.info(exMsg);
            sigFound = true;
        }
        _log.debug("msg='" + exMsg + "' "
                + (sigFound
                        ? "signature OK"
                        : "signature not found"
                )
        );
        return;
    } catch (Exception ee) {
        _log.warn("replicate(" + pnfsId + ") reported : " + ee + ", excluding", ee);
        excludePnfsId(pnfsId, "", ee.getMessage());
        return;
    }
    String poolName = observer.getDstPool();
    _log.info(pnfsId.toString() + " Replicating");
    synchronized (_dbLock) {
        _db.addTransaction(pnfsId, start, +1);
    }
    // release Adjuster thread waiting till locking record is added to DB
    // In this case parent will be notified twice
    synchronized (this) { // synchronization is required only to invoke notify()
        this.notifyAll();
    }
    synchronized (observer) {
        timerToWait = timeToStop - start;
        currentTime = System.currentTimeMillis();
        // Loop guards against early wake-ups: keep waiting until the task
        // is done or the deadline has passed.
        while (timerToWait > 0 && !observer.isDone()) {
            observer.wait(timerToWait);
            currentTime = System.currentTimeMillis();
            timerToWait = timeToStop - currentTime;
        }
        if (!observer.isDone()) {
            observer.setErrorCode(-1, "replicate pnfsID=" + pnfsId
                    + ", Timed out after " +
                    (currentTime - start) + " ms");
        }
    }
    stop = System.currentTimeMillis();
    boolean completedOK = false;
    boolean exclude = false;
    int oErr = observer.getErrorCode();
    String oErrMsg = observer.getErrorMessage();
    String excludeReason = oErrMsg;
    long timeStamp = System.currentTimeMillis();
    if (oErr == 0) {
        completedOK = true;
        _replicated++;
        _log.info(pnfsId.toString() + " replication done after " + (stop - start) +
                " ms, result " + observer);
        _log.debug("replicate(" + pnfsId
                + ") : cleanup action record and add pnfsid to the pool="
                + poolName + "- updating DB");
    } else {
        _log.info(pnfsId.toString() + " replication ERROR=" + oErr
                + ", timer=" + (stop - start) + " ms, error " + oErrMsg);
        /** todo : formalize error codes
         */
        // Error codes :
        // oErr > 0 -- reported by dcache
        // oErr < -100 -- reported internally by DCacheCoreController
        // oErr < 0 -- reported internally
        // excludeReason is set to oErrMsg;
        if (oErr == 102) {
            // variety of pool errors
            exclude = true;
        } else if (oErr > 0) {
            // do nothing - as before (will retry this pnfsid)
        } else if (oErr < -100) {
            exclude = true;
        }
        /** do not exclude file after timeout anymore:
         * by default it would be another 12 hours after 12 hour timeout
         else if (oErr == -1) {
         excludeReason = "replication timed out";
         exclude = true;
         }
         */
    }
    synchronized (_dbLock) {
        if (completedOK) {
            _db.addPool(pnfsId, poolName);
        }
        _db.removeTransaction(pnfsId);
        if (exclude) {
            _db.addExcluded(pnfsId, timeStamp, String.valueOf(oErr), excludeReason);
        }
    }
    if (exclude) {
        _log.info("pnfsId=" + pnfsId + " is excluded from replication. (err="
                + oErr + ", " + excludeReason + ")");
    }
} // replicate()
} // Replicator
private class Reducer implements Runnable {
private PnfsId _pnfsId;
private int _Id;
int _wCnt;
// HashSet brokenFiles = new HashSet();
Reducer(PnfsId pnfsId, int Id, int cnt) {
_pnfsId = pnfsId;
_Id = Id;
_wCnt = cnt;
}
@Override
public void run() {
try {
if (_stopThreads) {
_log.info("Reducer ID=" + _Id + ", pnfsId=" + _pnfsId + " can not start - "
+ "shutdown detected");
} else {
_log.info("Reducer ID=" + _Id + ", pnfsId=" + _pnfsId + " starting"
+ ", now " + (_maxWorkers - _wCnt) + "/" + _maxWorkers +
" workers are active");
reduce(_pnfsId);
}
} catch (InterruptedException ex) {
_log.info("Reducer for pnfsId=" + _pnfsId + " got exception " + ex);
} finally {
// release Adjuster thread waiting till locking record (exclude) is added to DB
synchronized (this) { // synchronization is required only to invoke notify()
this.notifyAll();
}
_wCnt = workerCountRM.release();
_log.info("Reducer ID=" + _Id + ", pnfsId=" + _pnfsId + " finished"
+ ", now " + (_maxWorkers - _wCnt) + "/" + _maxWorkers +
" workers are active");
}
}
private void reduce(PnfsId pnfsId) throws InterruptedException {
ReductionObserver observer;
long start, stop, currentTime;
long timerToWait, timeToStop;
_setStatus("Adjuster - reducing " + pnfsId);
start = System.currentTimeMillis();
timeToStop = start + waitReduceTO;
try {
observer = (ReductionObserver) removeCopy(pnfsId, _poolsWritable);
} catch (Exception ee) {
_log.info("reduce(" + pnfsId + ") reported : " + ee);
return;
}
String poolName = observer.getPool();
_log.info(pnfsId.toString() + " Reducing");
synchronized (_dbLock) {
_db.addTransaction(pnfsId, start, -1);
}
// release Adjuster thread waiting till locking record is added to DB
// - notify could be sent twice in this case
synchronized (this) { // synchronization is required only to invoke notify()
this.notifyAll();
}
synchronized (observer) {
timerToWait = timeToStop - start;
currentTime = System.currentTimeMillis();
while (timerToWait > 0 && !observer.isDone()) {
observer.wait(timerToWait);
currentTime = System.currentTimeMillis();
timerToWait = timeToStop - currentTime;
}
if (!observer.isDone()) {
observer.setErrorCode(-1, "reduce pnfsID=" + pnfsId
+ ", Timed out after " + (currentTime - start) +
" ms");
}
}
stop = System.currentTimeMillis();
int oErr = observer.getErrorCode();
String eMsg = observer.getErrorMessage();
long timeStamp = System.currentTimeMillis();
if (oErr == 0) {
_removed++;
_log.info(pnfsId.toString() + " reduction done after " + (stop - start) +
" ms, result " + observer);
synchronized (_dbLock) {
_db.removePool(pnfsId, poolName);
_db.removeTransaction(pnfsId);
}
_log.debug("reduce("+pnfsId
+") : cleanup action record and remove pnfsid from the pool="
+poolName +"- DB updated");
} else {
// _log.info(pnfsId.toString() + " reduction ERROR, timer=" + (stop - start) + " ms, "
// + "error " + observer.getErrorMessage() );
_log.info(pnfsId.toString() + " reduction ERROR, result=[" + observer + "]");
// it is set already
// if (oErr == -1) {
// eMsg = "operation timed out";
// }
// ALWAYS exclude pnfsid if replica removal failed
synchronized (_dbLock) {
_db.removeTransaction(pnfsId);
_db.addExcluded(pnfsId, timeStamp, String.valueOf(oErr), eMsg);
}
_log.info("pnfsId=" + pnfsId + " excluded from replication. "
+ "(err=" + oErr + ", " + eMsg + ")");
}
}
}
/**
 * Starts replication of a file in a separate thread.
 * <p>
 * Acquires a worker slot from {@code workerCount} (retrying when
 * interrupted, unless shutdown was requested), starts a {@code Replicator}
 * thread and then waits on the Replicator object until it is notified.
 *
 * @param pnfsId   file to replicate
 * @param extended passed through to the Replicator
 */
private void replicateAsync(PnfsId pnfsId, boolean extended) {
    int cnt = 0;
    // Acquire the "worker available" semaphore
    for (boolean acquired = false; !acquired; ) {
        try {
            _log.debug("replicateAsync - get worker");
            cnt = workerCount.acquire();
            _log.debug("replicateAsync - got worker OK");
            acquired = true;
        } catch (InterruptedException ex) {
            if (_stopThreads) {
                _log.info("replicateAsync: waiting for awailable worker thread interrupted, stop thread");
                return;
            }
            _log.info("replicateAsync: waiting for awailable worker thread interrupted, retry");
        }
    }

    // Create the worker, start its thread and wait for its signal.
    // The monitor is taken before the thread starts, so the worker cannot
    // call notifyAll() before this thread is inside wait().
    Replicator worker = new Replicator(pnfsId, _repId, cnt, extended);
    synchronized (worker) {
        getNucleus().newThread(worker, "RepMgr-Replicator-" + _repId).start();
        _repId++;
        try {
            worker.wait();
        } catch (InterruptedException ex1) {
            String next = _stopThreads ? "stop thread" : "continue";
            _log.info("replicateAsync: Waiting for release from replicator thread to was interrupted, "
                    + next);
        }
    }
}
/**
 * Starts reduction of a file in a separate thread.
 * <p>
 * Acquires a worker slot from {@code workerCountRM} (retrying when
 * interrupted, unless shutdown was requested), starts a {@code Reducer}
 * thread and then waits on the Reducer object until it is notified.
 *
 * @param pnfsId file whose replica count is to be reduced
 */
private void reduceAsync(PnfsId pnfsId) {
    int cnt = 0;
    // Acquire the "worker available" semaphore
    for (boolean acquired = false; !acquired; ) {
        try {
            _log.debug("reduceAsync - get worker");
            cnt = workerCountRM.acquire();
            _log.debug("reduceAsync - got worker - OK");
            acquired = true;
        } catch (InterruptedException ex) {
            if (_stopThreads) {
                _log.info("reduceAsync: waiting for awailable worker thread interrupted, stop thread");
                return;
            }
            _log.info("reduceAsync: waiting for awailable worker thread interrupted, retry");
        }
    }

    // Create the worker, start its thread and wait for its signal.
    // The monitor is taken before the thread starts, so the worker cannot
    // call notifyAll() before this thread is inside wait().
    Reducer worker = new Reducer(pnfsId, _redId, cnt);
    synchronized (worker) {
        getNucleus().newThread(worker, "RepMgr-Reducer-" + _redId).start();
        _redId++;
        try {
            worker.wait();
        } catch (InterruptedException ex1) {
            String next = _stopThreads ? "stop thread" : "continue";
            _log.info("reduceAsync: Waiting for release from reducer thread was interrupted, "
                    + next);
        }
    }
}
/** @todo OBSOLETE
* Verifies if info from DB corresponds to the real life
*
protected List verifyPnfsId(PnfsId pnfsId, Iterator knownPoolList)
throws Exception {
List sourcePoolList = getCacheLocationList(pnfsId, true);
HashSet newPoolSet = new HashSet(sourcePoolList);
while (knownPoolList.hasNext()) {
Object inext = knownPoolList.next();
if (newPoolSet.contains(inext)) {
newPoolSet.remove(inext);
}
else {
newPoolSet.add(inext);
}
}
List poolList = new ArrayList(newPoolSet);
if (poolList.size() == 0) {
return null;
}
else {
return poolList;
}
}
* end OBSOLETE
*/
/** Stores the given text in {@code _status}. */
private void _setStatus(String status) {
    _status = status;
}
} //-- end class Adjuster
/**
 * Asks the pool to switch to {@code PoolV2Mode.DISABLED_STRICT}.
 *
 * @param poolName pool to address
 * @return the result of {@code setPoolMode()}
 */
private int poolDisable(String poolName) {
    return setPoolMode(poolName, PoolV2Mode.DISABLED_STRICT);
}
/**
 * Asks the pool to switch to {@code PoolV2Mode.ENABLED}.
 *
 * @param poolName pool to address
 * @return the result of {@code setPoolMode()}
 */
private int poolEnable(String poolName) {
    return setPoolMode(poolName, PoolV2Mode.ENABLED);
}
/**
 * Asks the pool to switch to {@code PoolV2Mode.DISABLED_RDONLY}.
 *
 * @param poolName pool to address
 * @return the result of {@code setPoolMode()}
 */
private int poolRdOnly(String poolName) {
    return setPoolMode(poolName, PoolV2Mode.DISABLED_RDONLY);
}
/**
 * Sends a {@code PoolModifyModeMessage} to a pool and evaluates the reply.
 *
 * @param poolName destination pool
 * @param modeBits mode bits used to build the {@code PoolV2Mode}
 * @return 0 when the pool replied with return code 0; -1 when sending
 *         failed, no reply was obtained, or the reply carries a non-zero
 *         return code (each case is logged at WARN level)
 */
private int setPoolMode( String poolName, int modeBits ) {
    int rc = 1;
    String rm = "Replica Manager Command";
    PoolV2Mode mode = new PoolV2Mode(modeBits);
    PoolModifyModeMessage msg = new PoolModifyModeMessage(poolName, mode);
    msg.setStatusInfo(rc, rm);

    PoolModifyModeMessage reply;
    try {
        reply = (PoolModifyModeMessage) sendObject(poolName, msg);
    } catch (Exception ee) {
        _log.warn("setPoolMode pool=" + poolName
                + ", mode=" + new PoolV2Mode(modeBits).toString()
                + " - got exception '" + ee.getMessage() + "'");
        return -1;
    }

    // Guard against a missing reply instead of failing with a
    // NullPointerException on getReturnCode().
    if (reply == null) {
        _log.warn("setPoolMode pool=" + poolName
                + ", mode=" + new PoolV2Mode(modeBits).toString()
                + " - no reply received");
        return -1;
    }

    if (reply.getReturnCode() != 0) {
        // String concatenation copes with a null error object; an explicit
        // toString() call would throw.
        _log.warn("setPoolMode pool=" + poolName
                + ", mode=" + new PoolV2Mode(modeBits).toString()
                + " - error '" + reply.getErrorObject() + "'");
        return -1;
    }
    return 0;
}
/////////////////////////////////////////////////////////////////////////////
// CLI
//
// public class ReplicaManagerCLI {
//--------------------------------------------------------------------------
//=== System ===
//--------------------------------------------------------------------------
// enable / disable same host replication (test environment / production)
// Help text for 'enable same host replication'
public static final String hh_enable_same_host_replication = "true | false";

/**
 * COMMAND : enable same host replication true|false
 * Passes the flag to {@code setEnableSameHostReplica()}; the argument is
 * compared case-insensitively and anything else is reported as wrong.
 *
 * @param args command arguments, argument 0 is the flag
 * @return the message that was also written to the log
 */
public String ac_enable_same_host_replication_$_1(Args args) {
    final String cmd = args.argv(0);
    final boolean enable = cmd.equalsIgnoreCase("true");
    final String m;
    if (enable || cmd.equalsIgnoreCase("false")) {
        setEnableSameHostReplica(enable);
        m = "same host replication " + (enable ? "enabled" : "disabled");
    } else {
        m = "Wrong argument '" + cmd + "'";
    }
    _log.info(m);
    return m;
}
// Help text for the experimental 'XX check pool host' switch
public static final String hh_XX_check_pool_host = "true | false # experimental, can be changed";

/**
 * COMMAND : XX check pool host true|false
 * Passes the flag to {@code setCheckPoolHost()}; the argument is compared
 * case-insensitively and anything else is reported as wrong.
 *
 * @param args command arguments, argument 0 is the flag
 * @return the message that was also written to the log
 */
public String ac_XX_check_pool_host_$_1(Args args) {
    final String cmd = args.argv(0);
    final boolean check = cmd.equalsIgnoreCase("true");
    final String m;
    if (check || cmd.equalsIgnoreCase("false")) {
        setCheckPoolHost(check);
        m = "check pool host " + (check ? "true" : "false");
    } else {
        m = "Wrong argument '" + cmd + "'";
    }
    _log.info(m);
    return m;
}
//--------------------------------------------------------------------------
public static final String hh_db_wakeup = " # wakeup DB initialization on startup when it waits pools to connect";

/**
 * COMMAND : db wakeup
 * Calls {@code wakeupWaitInit()} on the DB initialization runnable, if one
 * exists.
 *
 * @param args command arguments (unused)
 * @return a short status text
 */
public String ac_db_wakeup(Args args) {
    if (_initDbRunnable == null) {
        return "_initDbRunnable is not instantiated";
    }
    _initDbRunnable.wakeupWaitInit();
    return "woke up db";
}
//--------------------------------------------------------------------------
//=== Pool ===
//--------------------------------------------------------------------------
public static final String hh_show_pool = "<pool> # show pool status";

/**
 * COMMAND : show pool &lt;pool&gt;
 * Reads the pool status from the replica DB under {@code _dbLock}.
 *
 * @param args command arguments, argument 0 is the pool name
 * @return the status line, which is also logged
 */
public String ac_show_pool_$_1(Args args) {
    final String poolName = args.argv(0);
    final String status;
    synchronized (_dbLock) {
        status = _dbrmv2.getPoolStatus(poolName);
    }
    final String line = "Pool '" + poolName + "' status " + status;
    _log.info("INFO: {}", line);
    return line;
}
public static final String hh_set_pool = "<pool> <state>";

/**
 * COMMAND : set pool &lt;pool&gt; &lt;state&gt;
 * Changes the status of a resilient pool in the replica DB. Accepted states
 * are "down", "online", "offline", "offline-prepare" and "drainoff".
 * <p>
 * When the pool leaves "down"/"UNKNOWN" for any other state, or becomes
 * "online" from a different state, the pool inventory is rerun through
 * {@code dbUpdatePool()} before the new status is written. If that rerun
 * fails, the old status is kept and the command reports "error".
 * <p>
 * The pool itself is never enabled or disabled by this command; that has to
 * be done manually on the pool.
 *
 * @param args command arguments: pool name and new state
 * @return "ok" when the status was stored (the DB-updated condition is then
 *         signalled), "error" when the state is invalid or the DB update
 *         failed, or an explanatory text when the resilient pool list is
 *         not available or does not contain the pool
 */
public String ac_set_pool_$_2(Args args) {
    String poolName = args.argv(0);
    String poolStatus = args.argv(1);
    boolean updatedOK = false;
    boolean setOK = false;
    String sErrRet = "Resilient Pools List is not defined (yet), ignore";
    if (_resilientPools == null) {
        // _resilientPools was not set yet
        _log.debug(sErrRet);
        return sErrRet;
    }
    List<String> l = _resilientPools.getResilientPools();
    if (l == null) {
        // usePoolGroup() == true, but we got 'null' list for resilient pools
        _log.debug(sErrRet);
        return sErrRet;
    } else if (!l.contains(poolName)) { // pool is NOT resilient
        String sErrRet2 = "Pool " + poolName + " is not resilient pool, ignore command";
        _log.debug(sErrRet2);
        return sErrRet2;
    }
    synchronized (_dbLock) {
        String poolStatusOld = _dbrmv2.getPoolStatus(poolName);
        _log.info("Pool '" + poolName + "' status was " + poolStatusOld);
        // Pools in these states are counted as holding usable replicas.
        boolean countablePool = poolStatus.equals("online")
                || poolStatus.equals("offline-prepare")
                || poolStatus.equals("drainoff");
        boolean stoppedPool = poolStatus.equals("down")
                || poolStatus.equals("offline");
        // Check this is correct command - new pool state is valid
        if (countablePool || stoppedPool) {
            /* how it shall be:
            if ( poolStatus.equals("online") ) {
            setOK = ( poolEnable ( poolName ) == 0 );
            } else if ( poolStatus.equals("offline-prepare")
            || poolStatus.equals("drainoff") ) {
            setOK = ( poolRdOnly ( poolName ) == 0 );
            }
            */
            // Really, poolRdOnly disables pool, so it is not considered readable anymore.
            // Do only up/down transitions.
            if (countablePool) {
                // setOK = ( poolEnable ( poolName ) == 0 );
                // Do not do pool Enable on Timur's and Patrick's request. 9/28/07
                // They have problems with counting of space in pools and keep pool disabled.
                // User shall do "enable" pool manually by direct command to the pool
                setOK = true;
            }
            // Set when the inventory rerun failed and the status was NOT stored.
            boolean dbUpdateFailed = false;
            // Do not try to update countablePool when it already failed to change status
            // Still can try 'offline' pool
            if (!countablePool || setOK) {
                boolean wasNotCounted = "down".equals(poolStatusOld)
                        || "UNKNOWN".equals(poolStatusOld);
                boolean leavesNotCounted = wasNotCounted && !poolStatus.equals("down");
                boolean becomesOnline = poolStatus.equals("online")
                        && !poolStatus.equals(poolStatusOld);
                if (leavesNotCounted || becomesOnline) {
                    // Transition from state where we did not count files correctly
                    // to the state where we must count files:
                    // -- rerun pool inventory
                    try {
                        dbUpdatePool(poolName);
                        _dbrmv2.setPoolStatus(poolName, poolStatus);
                        _log.info("setpool, pool " + poolName +
                                " state change to '" + poolStatus + "' updated in DB");
                    } catch (Exception ex) {
                        dbUpdateFailed = true;
                        _log.info(" setpool - Problem fetching repository from " + poolName +
                                " : " + ex);
                        _log.info(" setpool - pool " + poolName + " stays '" + poolStatusOld +
                                "'");
                    } // try / catch
                } else {
                    // otherwise we simply change pool status in DB.
                    _dbrmv2.setPoolStatus(poolName, poolStatus);
                    _log.info("Pool '" + poolName + "' status set to " + poolStatus);
                }
            }
            if (stoppedPool) {
                // setOK = ( poolDisable( poolName ) == 0 );
                // Do not do pool Disable on Timur's and Patrick's request. 9/28/07
                // They have problems with counting of space in pools and want keep pool disabled.
                // User shall do the mode change manually by direct command to the pool
                setOK = true;
            }
            // Report success only when the status really reached the DB.
            updatedOK = !dbUpdateFailed;
        }
    }
    if (updatedOK && setOK) {
        _log.info("setpool, pool " + poolName + ", notify All");
        _dbUpdated.wakeup();
        return "ok";
    } else {
        _log.info("Can not set pool '" + poolName + "' state to " + poolStatus +
                ", ignored");
        // Exactly one of the two steps succeeded. The original repeated the
        // enclosing condition here, which could never be true.
        if (updatedOK != setOK) {
            _log.info("Tansaction error: pool state or DB modified, but not both");
        }
        return "error";
    }
}
//--------------------------------------------------------------------------
public static final String hh_ls_unique = "<pool> # check if pool drained off (has unique pndfsIds)";

/**
 * COMMAND : ls unique &lt;pool&gt;
 * Logs every entry returned by {@code findUniqueFiles()} for the pool.
 *
 * @param args command arguments, argument 0 is the pool name
 * @return "Found " followed by the number of entries
 * @throws SQLException propagated from {@code findUniqueFiles()}
 */
public String ac_ls_unique_$_1(Args args) throws SQLException {
    final String poolName = args.argv(0);
    _log.info("pool '" + poolName + "'");

    final List<Object> found = findUniqueFiles(poolName);
    final int uniqueFiles = found.size();
    _log.info("Found " + uniqueFiles + " unique files in pool '" + poolName + "'");

    final Iterator<Object> each = found.iterator();
    while (each.hasNext()) {
        _log.info("Unique in " + poolName + ", pnfsId=" + each.next());
    }
    return "Found " + uniqueFiles;
}
/**
 * Helper: returns the entries of {@code _dbrmv2.getMissing()} that are also
 * returned by {@code _dbrmv2.getPnfsIds(poolName)}, in the order in which
 * {@code getMissing()} delivers them.
 * <p>
 * Both DB iterators are closed in {@code finally} blocks so that they are
 * released even when reading from the DB fails.
 *
 * @param poolName pool to inspect
 * @return list of pnfsIds (as strings); never null
 * @throws SQLException declared for the DB access
 */
private List<Object> findUniqueFiles(String poolName) throws SQLException {
    List<Object> missingList = new ArrayList<>();
    Collection<Object> inPoolSet = new HashSet<>();

    Iterator<String> inPool = _dbrmv2.getPnfsIds(poolName);
    try {
        Iterator<String> missing = _dbrmv2.getMissing();
        try {
            while (missing.hasNext()) {
                missingList.add(missing.next()); // pnfsId as String
            }
        } finally {
            ((DbIterator<?>) missing).close();
        }
        while (inPool.hasNext()) {
            inPoolSet.add(inPool.next()); // pnfsId as String
        }
    } finally {
        ((DbIterator<?>) inPool).close();
    }

    List<Object> uniqueList = new ArrayList<>();
    for (Object inext : missingList) {
        if (inPoolSet.contains(inext)) {
            uniqueList.add(inext);
        }
    }
    return uniqueList;
}
//--------------------------------------------------------------------------
// === pnfsId ===
//--------------------------------------------------------------------------
public static final String hh_ls_pnfsid = "[<pnfsId>] # DEBUG: list pools for pnfsid[s], from DB";

/**
 * COMMAND : ls pnfsid [&lt;pnfsId&gt;]
 * Prints the pools recorded in the DB for the given pnfsId, or for every
 * pnfsId in the DB when no argument is given.
 * <p>
 * The DB iterator is closed in a {@code finally} block, so it is released
 * even if building a {@code PnfsId} or printing a record fails.
 *
 * @param args command arguments, optional argument 0 is the pnfsId
 * @return one line per pnfsId
 * @throws SQLException declared for the DB access
 */
public String ac_ls_pnfsid_$_0_1(Args args) throws SQLException {
    StringBuilder sb = new StringBuilder();
    if (args.argc() == 0) {
        Iterator<String> it = _dbrmv2.getPnfsIds();
        try {
            while (it.hasNext()) {
                PnfsId pnfsId = new PnfsId(it.next());
                sb.append(printCacheLocation(pnfsId)).append("\n");
            }
        } finally {
            ((DbIterator<?>) it).close();
        }
    } else {
        PnfsId pnfsId = new PnfsId(args.argv(0));
        sb.append(printCacheLocation(pnfsId)).append("\n");
    }
    return sb.toString();
}
//--------------------------------------------------------------------------
/**
 * COMMAND HELP for 'show hostmap'
 */
public static final String hh_show_hostmap = " [<pool>] # show pool to host mapping";

/**
 * COMMAND : show hostmap [&lt;pool&gt;]
 * Lists the pool to host mapping, either for the pool given as argument or
 * for every key of {@code _hostMap}. The map is read while holding its
 * monitor.
 *
 * @param args command arguments, optional argument 0 is the pool name
 * @return one "pool host" line per reported pool
 */
public String ac_show_hostmap_$_0_1(Args args) {
    final StringBuilder out = new StringBuilder();

    if (args.argc() != 0) {
        // single pool requested
        final String pool = args.argv(0);
        if (pool != null) {
            synchronized (_hostMap) {
                final String host = _hostMap.get(pool);
                out.append(pool).append(" ").append(host).append("\n");
            }
        }
        return out.toString();
    }

    // no argument: dump the whole map
    synchronized (_hostMap) {
        for (Object key : _hostMap.keySet()) {
            final String pool = key.toString();
            final String host = _hostMap.get(pool);
            out.append(pool).append(" ").append(host).append("\n");
        }
    }
    return out.toString();
}
/**
 * COMMAND HELP for 'set hostmap'
 */
public static final String hh_set_hostmap = " <pool> <host> # set TEMPORARILY pool to host mapping";

/**
 * COMMAND : set hostmap pool host
 * Maps pool "pool" to the specified "host" in {@code _hostMap}.
 * Pool to host mapping is updated automatically by the "host" tag defined in
 * .poollistfile; this command is useful for a pool that does not have the
 * tag defined.
 *
 * @param args command arguments: pool name and host name
 * @return confirmation line, or an empty string when an argument is null
 */
public String ac_set_hostmap_$_2(Args args) {
    final String pool = args.argv(0);
    final String host = args.argv(1);
    if (pool == null || host == null) {
        return "";
    }
    synchronized (_hostMap) {
        _hostMap.put(pool, host);
    }
    return "set hostmap " + pool + " " + host + "\n";
}
/**
 * COMMAND HELP for 'remove hostmap'
 */
public static final String hh_remove_hostmap =
        " <pool> # remove pool to host mapping for the pool 'pool'";

/**
 * COMMAND : remove hostmap pool
 * Removes the entry for the specified pool from {@code _hostMap}.
 *
 * @param args command arguments, argument 0 is the pool name
 * @return confirmation line, or an empty string when the argument is null
 */
public String ac_remove_hostmap_$_1(Args args) {
    final String pool = args.argv(0);
    if (pool == null) {
        return "";
    }
    synchronized (_hostMap) {
        _hostMap.remove(pool);
    }
    return "remove hostmap " + pool + "\n";
}
//--------------------------------------------------------------------------
public static final String hh_update = "<pnfsid> [-c] # DEBUG: get pools list from pnfs, '-c' confirm with pools";

/**
 * COMMAND : update &lt;pnfsid&gt; [-c]
 * Replaces the pool list stored in the DB for the file with the list
 * obtained from {@code getCacheLocationList()}.
 * <p>
 * The clear-and-refill of the DB record is done under {@code _dbLock}, as
 * the other DB-modifying commands in this class do, so no other lock holder
 * observes the record half-updated. The location list is fetched before the
 * lock is taken.
 *
 * @param args command arguments: pnfsId, option "c" is passed on to
 *             {@code getCacheLocationList()}
 * @return the old and the new location line
 * @throws Exception propagated from {@code getCacheLocationList()}
 */
public String ac_update_$_1(Args args) throws Exception {
    StringBuilder sb = new StringBuilder();
    PnfsId pnfsId = new PnfsId(args.argv(0));
    sb.append("Old : ").append(printCacheLocation(pnfsId)).append("\n");
    List<String> list = getCacheLocationList(pnfsId, args.hasOption("c"));
    synchronized (_dbLock) {
        _dbrmv2.clearPools(pnfsId);
        for (Object location : list) {
            _dbrmv2.addPool(pnfsId, location.toString());
        }
    }
    sb.append("New : ").append(printCacheLocation(pnfsId)).append("\n");
    return sb.toString();
}
//--------------------------------------------------------------------------
public static final String hh_reduce = "<pnfsId>";

/**
 * COMMAND : reduce &lt;pnfsId&gt;
 * Starts a {@code ReducePnfsIDRunnable} for the file in a new thread and
 * returns immediately.
 *
 * @param args command arguments, argument 0 is the pnfsId
 * @return a fixed acknowledgement text
 */
public String ac_reduce_$_1(Args args) {
    final ReducePnfsIDRunnable task =
            new ReducePnfsIDRunnable(new PnfsId(args.argv(0)));
    getNucleus().newThread(task).start();
    return "initiated (See pinboard for more information)";
}
//--------------------------------------------------------------------------
public static final String hh_replicate = "<pnfsId>";

/**
 * COMMAND : replicate &lt;pnfsId&gt;
 * Starts a {@code ReplicatePnfsIDRunnable} for the file in a new thread and
 * returns immediately.
 *
 * @param args command arguments, argument 0 is the pnfsId
 * @return a fixed acknowledgement text
 */
public String ac_replicate_$_1(Args args) {
    final ReplicatePnfsIDRunnable task =
            new ReplicatePnfsIDRunnable(new PnfsId(args.argv(0)));
    getNucleus().newThread(task).start();
    return "initiated (See pinboard for more information)";
}
//--------------------------------------------------------------------------
public static final String hh_copy = "<pnfsId> <sourcePool>|* <destinationPool> # does not check for free space in dest";

/**
 * COMMAND : copy &lt;pnfsId&gt; &lt;sourcePool&gt;|* &lt;destinationPool&gt;
 * Copies a file from a pool that holds it (according to the DB) to a pool
 * that does not. "*" as source selects one of the recorded pools.
 * <p>
 * The DB iterator is closed in a {@code finally} block so it is released
 * even when reading the pool list fails.
 *
 * @param args command arguments: pnfsId, source pool or "*", destination
 * @return string form of the observer returned by {@code movePnfsId()}
 * @throws IllegalArgumentException if the DB lists no pool for the file,
 *         the source is not among the recorded pools, or the destination
 *         already is
 * @throws Exception propagated from {@code movePnfsId()}
 */
public String ac_copy_$_3(Args args) throws Exception {
    PnfsId pnfsId = new PnfsId(args.argv(0));
    String source = args.argv(1);
    String destination = args.argv(2);
    Collection<String> set = new HashSet<>();
    Iterator<String> it = _dbrmv2.getPools(pnfsId);
    try {
        while (it.hasNext()) {
            set.add(it.next());
        }
    } finally {
        ((DbIterator<?>) it).close();
    }
    if (set.isEmpty()) {
        throw new
                IllegalArgumentException("No source found for p2p");
    }
    if (source.equals("*")) {
        source = set.iterator().next();
    }
    if (!set.contains(source)) {
        throw new
                IllegalArgumentException("Source " + source +
                        " not found in pools list");
    }
    if (set.contains(destination)) {
        throw new
                IllegalArgumentException("Destination " + destination +
                        " already found in pools list");
    }
    TaskObserver observer = movePnfsId(pnfsId, source, destination);
    return observer.toString();
}
//--------------------------------------------------------------------------
public static final String hh_exclude = "<pnfsId> [iErrCode [sErrorMessage] ] # exclude <pnfsId> from replication";

/**
 * COMMAND : exclude &lt;pnfsId&gt; [iErrCode [sErrorMessage]]
 * Adds the file to the excluded list in the DB, stamped with the current
 * time. The error code defaults to "-2" and the message to
 * "Operator intervention".
 *
 * @param args command arguments: pnfsId, optional code, optional message
 * @return confirmation text, which is also logged
 */
public String ac_exclude_$_1_3(Args args) {
    final long timeStamp = System.currentTimeMillis();
    final PnfsId pnfsId = new PnfsId(args.argv(0));

    // The error code is supposed to be a number, but is stored as given
    String iErr = "-2";
    if (args.argc() > 1) {
        iErr = args.argv(1);
    }
    String eMsg = "Operator intervention";
    if (args.argc() > 2) {
        eMsg = args.argv(2);
    }

    synchronized (_dbLock) {
        _dbrmv2.addExcluded(pnfsId, timeStamp, iErr, eMsg);
    }
    final String msg = "pnfsId=" + pnfsId + " excluded from replication";
    _log.info(msg);
    return msg;
}
//--------------------------------------------------------------------------
public static final String hh_release = "<pnfsId> # removes transaction/'BAD' status for pnfsId";

/**
 * COMMAND : release &lt;pnfsId&gt;
 * Removes the transaction record of the file from the DB under
 * {@code _dbLock}.
 *
 * @param args command arguments, argument 0 is the pnfsId
 * @return confirmation text
 */
public String ac_release_$_1(Args args) {
    final PnfsId pnfsId = new PnfsId(args.argv(0));
    synchronized (_dbLock) {
        _dbrmv2.removeTransaction(pnfsId);
    }
    final String msg = "pnfsId=" + pnfsId + " released";
    _log.info(msg + ", (active transaction or 'exclude' status cleared)");
    return msg;
}
// } // end class ReplicaManagerCLI definition
//---------------------------------------------------------------------------
// public class ReplicaManagerCLIDebug {
//--------------------------------------------------------------------------
// === System ===
//--------------------------------------------------------------------------
/*
//----------------------------------------------------
// Start / stop pool watch dog
// Stop sequence is NOT fool proof - watch dog will notice
// command to stop when it will wake up, that is it can be after _period
// Anf it is NOT singleton to prevent second copy from running
public static final String hh_stop = "threads | watchdog | adjuster # DEBUG:";
public String ac_stop_$_1(Args args) {
String cmd = args.argv(0);
if ( cmd.equals("threads") ) {
_stopThreads = true;
_log.info("Threads were notified to stop");
return "Threads were notified to stop";
}else if ( cmd.equals("watchdog") ) {
_runPoolWatchDog = false;
_log.info("Pool watch dog notified to stop, wait for "+ _watchPools.getPeriod()
+ " until it will wake up and notice the command");
return "watch dog notified to stop";
}
else if (cmd.equals("adjuster")) {
_runAdjuster = false;
_log.info("adjuster notified to stop, wait for " + _watchPools.getPeriod() +
" until it will wake up and notice the command");
return "adjuster notified to stop";
}
_log.info("Wrong argument '" + cmd + "'");
return "wrong argument";
}
*/
//--------------------------------------------------------------------------
// === Pool ===
//----------------------------------------------------------------------------
public static final String hh_pool_inventory = "<pool> # DEBUG - danger, DB not locked";

/**
 * COMMAND : pool inventory &lt;pool&gt;
 * Marks DB initialization as active and starts a
 * {@code dbUpdatePoolRunnable} for the pool in a new thread.
 *
 * @param args command arguments, argument 0 is the pool name
 * @return "Initiated"
 * @throws IllegalArgumentException if DB initialization is already active
 */
public String ac_pool_inventory_$_1(Args args) {
    final String poolName = args.argv(0);
    // test-and-set of the flag must be atomic with respect to _dbLock holders
    synchronized (_dbLock) {
        if (_initDbActive) {
            throw new IllegalArgumentException("InitDb still active");
        }
        _initDbActive = true;
    }
    final dbUpdatePoolRunnable task = new dbUpdatePoolRunnable(poolName);
    getNucleus().newThread(task, "RepMgr-dbUpdatePool").start();
    return "Initiated";
}
//--------------------------------------------------------------------------
// === Pool ===
//----------------------------------------------------------------------------
@Command(name = "update poolgroup", hint = "refresh list of resilient pools",
        description = "Refresh list of resilient pools and initialize newly "
                + "discovered set of pools." )
public class UpdatePoolGroup extends DelayedCommand<String>
{
    /**
     * Re-initializes the resilient pool list and runs the pool inventory
     * for every pool that was not in the previous list.
     * <p>
     * Each new pool is first recorded as OFFLINE; after a successful
     * {@code dbUpdatePool()} it is set ONLINE if it had no previous status
     * (or "UNKNOWN"), otherwise its previous status is restored. Pools whose
     * inventory fails stay OFFLINE and are not reported as added.
     * <p>
     * A missing pool list object, or a null list from either call, is
     * handled without throwing: the same situations are guarded in
     * {@code ac_set_pool_$_2}.
     *
     * @return "Added pools: " followed by the list of pools processed, or an
     *         explanatory text when the resilient pool list is not defined
     */
    @Override
    protected String execute() throws Exception {
        if (_resilientPools == null) {
            // not set yet - same answer as 'set pool' gives
            return "Resilient Pools List is not defined (yet), ignore";
        }
        List<String> currentPools = _resilientPools.getResilientPools();
        List<String> newPools = _resilientPools.init();
        List<String> addedPools = new ArrayList<>();
        if (newPools == null) {
            // nothing discovered
            return "Added pools: " + addedPools;
        }
        for (String pool : newPools) {
            // a null previous list means every discovered pool is new
            if (currentPools != null && currentPools.contains(pool)) {
                continue;
            }
            synchronized (_dbLock) {
                String oldStatus = _poolMap.get(pool);
                // record the pool as OFFLINE before the inventory is run
                _dbrmv2.setPoolStatus(pool, ReplicaDb1.OFFLINE);
                try {
                    dbUpdatePool(pool);
                } catch (Exception ee) {
                    _log.info(" initDb - Problem fetching repository from " + pool + " : " + ee);
                    _log.info(" initDb - pool " + pool + " stays '" + ReplicaDb1.OFFLINE + "'");
                    continue;
                }
                // Set status online for only 'new' (unknown) pools,
                // otherwise leave pool state as it was before
                String newStatus = (oldStatus == null || oldStatus
                        .equals("UNKNOWN"))
                        ? ReplicaDb1.ONLINE
                        : oldStatus;
                _dbrmv2.setPoolStatus(pool, newStatus);
                if (newStatus.equals(ReplicaDb1.ONLINE)) {
                    _poolsToWait.remove(pool);
                }
            }
            addedPools.add(pool);
        }
        return "Added pools: " + addedPools;
    }
}
//--------------------------------------------------------------------------
// === PnfsId ===
//--------------------------------------------------------------------------
public static final String hh_clear = "<pnfsid> # DEBUG: removes pnfsid from replicas table in DB";

/**
 * COMMAND : clear &lt;pnfsid&gt;
 * Calls {@code clearPools()} for the file on the replica DB under
 * {@code _dbLock}.
 *
 * @param args command arguments, argument 0 is the pnfsId
 * @return always an empty string
 */
public String ac_clear_$_1(Args args) {
    final PnfsId target = new PnfsId(args.argv(0));
    synchronized (_dbLock) {
        _dbrmv2.clearPools(target);
    }
    return "";
}
//---------------------------------------------------------------------------
/**
 * Formats the pools recorded in the DB for a file as
 * "&lt;pnfsId&gt; &lt;pool&gt; &lt;pool&gt; ... " (each item followed by
 * one space).
 * <p>
 * The DB iterator is closed in a {@code finally} block so it is released
 * even when iteration fails.
 *
 * @todo check DB handle
 *
 * @param pnfsId PnfsId
 * @return String
 */
private String printCacheLocation(PnfsId pnfsId) {
    StringBuilder sb = new StringBuilder();
    sb.append(pnfsId).append(" ");
    Iterator<String> it = _dbrmv2.getPools(pnfsId);
    try {
        while (it.hasNext()) {
            sb.append(it.next()).append(" ");
        }
    } finally {
        ((DbIterator<?>) it).close();
    }
    return sb.toString();
}
//---------------------------------------------------------------------------
/**
 * Runnable used by the 'reduce' command: hands one pnfsId to the adjuster's
 * {@code reduceAsync()} from a separate thread.
 */
private class ReducePnfsIDRunnable implements Runnable {
    PnfsId _pnfsId;

    /**
     * @param pnfsId file whose replica count is to be reduced
     */
    public ReducePnfsIDRunnable(PnfsId pnfsId) {
        _pnfsId = pnfsId;
    }

    /**
     * Calls {@code _adj.reduceAsync()} for the file; does nothing except
     * logging when the adjuster does not exist yet. Exceptions are logged,
     * not propagated.
     */
    @Override
    public void run() {
        if (_adj == null) {
            _log.info("adjuster class not instantiated yet");
            return;
        }
        // This is the reduction path; the message used to say "replication"
        // (copied from ReplicatePnfsIDRunnable), which misled log readers.
        _log.info(_pnfsId.toString() + " Starting reduction");
        try {
            _adj.reduceAsync(_pnfsId);
            _log.info(_pnfsId.toString() + " async reduction started");
        } catch (Exception ex) {
            _log.info(_pnfsId.toString() + " got exception " + ex);
        }
    }
}
//---------------------------------------------------------------------------
/**
 * Runnable used by the 'replicate' command: hands one pnfsId to the
 * adjuster's {@code replicateAsync()} from a separate thread.
 */
private class ReplicatePnfsIDRunnable implements Runnable {
    PnfsId _pnfsId;

    /**
     * @param pnfsId file to replicate
     */
    public ReplicatePnfsIDRunnable(PnfsId pnfsId) {
        _pnfsId = pnfsId;
    }

    /**
     * Calls {@code _adj.replicateAsync()} for the file; only logs when the
     * adjuster does not exist yet. Exceptions are logged, not propagated.
     */
    @Override
    public void run() {
        if (_adj != null) {
            _log.info(_pnfsId.toString() + " Starting replication");
            try {
                // 'true' selects the extended set of source pools
                // (includes drainoff and offline-prepare pools)
                _adj.replicateAsync(_pnfsId, true);
                _log.info(_pnfsId.toString() + " async replication started");
            } catch (Exception failure) {
                _log.info(_pnfsId.toString() + " got exception " + failure);
            }
        } else {
            _log.info("adjuster class not instantiated yet");
        }
    }
}
/////////////////////////////////////////////////////////////////////////////
// InitDb thread
/////////////////////////////////////////////////////////////////////////////
/**
 * Runnable that initializes the replica DB at startup: it cleans up the DB,
 * sleeps {@code _delayPoolScan} ms, runs {@code initDb()} and, if a start
 * delay was configured and pools are still expected, waits on
 * {@code _poolsToWait} for at most that delay. At the end it clears
 * {@code _initDbActive} and signals {@code _dbUpdated}.
 * <p>
 * {@code myThread} and {@code _waiting} are written by the DB thread and
 * read by {@link #wakeupWaitInit()} / {@link #isWaiting()} from other
 * threads, hence they are volatile.
 */
private class InitDbRunnable implements Runnable {
    private long _delayStart;
    volatile Thread myThread;
    volatile boolean _waiting;
    // private ReplicaDbV1 _db;

    /**
     * @param delay maximum time in ms to wait for pools after DB
     *              initialization; 0 disables the wait
     */
    public InitDbRunnable( long delay ) {
        _delayStart = delay;
        // _db = installReplicaDb();
    }

    /** Creates a runnable that does not wait for pools. */
    public InitDbRunnable() {
        this(0L);
    }

    /**
     * Interrupts the DB thread if it is currently in the delayed wait;
     * otherwise only logs that the thread is not sleeping.
     */
    public void wakeupWaitInit() {
        // Read the field once: run() sets it to null when it finishes, so a
        // second read after the null check could fail.
        Thread t = myThread;
        if ( t != null && _waiting ) {
            t.interrupt();
        } else {
            _log.info("DB thread does not sleep");
        }
    }

    /** @return true while the DB thread is in the delayed wait */
    public boolean isWaiting() { return _waiting; }

    @Override
    public void run() {
        _log.info("--- DB init started ---");
        myThread = Thread.currentThread();
        try {
            _log.info( _hotRestart
                    ? "=== Hot Restart ==="
                    : "=== Cold Start ===" );
            cleanupDb();
            _log.debug( "Sleep " + _delayPoolScan/1000 + " sec");
            Thread.sleep( _delayPoolScan ); // sleep x sec while communications are established
            _log.debug( "Sleep - waiting for communications to establish - is over");
            initDb();
            if (_delayStart != 0L) {
                synchronized (_poolsToWait) {
                    if (!_poolsToWait.isEmpty()) {
                        _log.info("=== Adjuster wakeup is delayed for " + _delayStart/1000L +
                                " sec. for pools to connect - sleep ... ===\n");
                        try {
                            _waiting = true;
                            _poolsToWait.wait(_delayStart);
                        }
                        catch (InterruptedException ex) {
                            if (_stopThreads) {
                                _log.info("DB init delay interrupted, stop thread");
                                _waiting = false;
                                return;
                            }
                            else {
                                _log.info("DB init delay interrupted, continue");
                            }
                        }
                        finally {
                            _waiting = false;
                        } // try / catch /finally
                    } // if
                } // synchronized
            }
        }
        catch (Exception ex) {
            _log.info("Exception in go : " + ex, ex);
        }
        finally {
            // Cleared here so it also happens on the early 'return' above.
            myThread = null;
            synchronized (_dbLock) {
                _initDbActive = false;
                _log.info("DB initialized, notify All");
            }
            // TODO IF I got exception, I shall not 'wakeup', but retry or shutdown
            _dbUpdated.wakeup();
        }
    }
}
/////////////////////////////////////////////////////////////////////////////
// WatchDog thread
/////////////////////////////////////////////////////////////////////////////
/**
 * Pool watchdog: periodically compares the list returned by
 * getPoolListResilient() with the set of pools seen in the previous cycle,
 * updates the DB status of pools that arrived or departed, releases expired
 * "excluded" entries and writes a heartbeat record.
 */
private class WatchPools implements Runnable {
    /** Pools seen in the previous cycle (or ONLINE in the DB right after a restart). */
    Set<String> _knownPoolSet = new HashSet<>();
    /** Number of consecutive cycles without changes; used to throttle the log message. */
    int cntNoChangeMsgLastTime;
    private long _period = 10 * 60L * 1000L ; // 10 min. - cycle period
    private long _expire = 12 *3600 * 1000L ; // 12 hours - expire excluded files after 12 hours
    /** Set by run(); makes the next runit() reload the known pool set from the DB. */
    private boolean _restarted;
    private ReplicaDbV1 _db;

    /** @param db DB handle used for pool status, heartbeat and excluded-file release */
    public WatchPools(ReplicaDbV1 db)
    {
        _db = db;
    }

    /** @param p cycle period in milliseconds */
    public void setPeriod( long p ) {
        _period = p;
    }

    /** @return cycle period in milliseconds */
    public long getPeriod() {
        return _period;
    }

    /** @param e age in milliseconds after which excluded entries are released */
    public void setExcludedExpiration( long e ) {
        _expire = e;
    }

    /** @return age in milliseconds after which excluded entries are released */
    public long getExcludedExpiration() {
        return _expire;
    }

    /**
     * Sleeps one period and calls {@link #runit()}, repeating until
     * {@code _runPoolWatchDog} is cleared, {@code _stopThreads} is set or the
     * thread is interrupted. Other exceptions are logged and the loop goes on.
     */
    @Override
    public void run() {
        _log.info("Starting pool watch dog thread");
        _restarted = true;
        do {
            try {
                if( ! _runPoolWatchDog ) {
                    break;
                }
                Thread.sleep( _period );
                runit();
            }
            catch (InterruptedException ex ){
                _log.info("WatchPool Thread was interrupted");
                break;
            }
            catch (Exception ex) {
                _log.info("WatchPool Thread got exception, continue", ex);
            }
        } while( _runPoolWatchDog
                 && ! _stopThreads );
        _log.info("PoolWatch watch dog thread stopped");
        _db.setHeartBeat("PoolWatchDog", "stopped" );
    }

    /**
     * Executes a single watchdog cycle.
     *
     * @throws Exception whatever the DB access or getPoolListResilient() throws
     */
    public void runit() throws Exception {
        boolean updated = false;
        String hbMsg;

        // When WatchDog is restarted after some time,
        // get pool list from DB
        // and keep real pool status synchronized with DB
        if( _restarted ) {
            _restarted = false;
            _knownPoolSet = new HashSet<>();
            Iterator<String> p = _db.getPools();
            try {
                while (p.hasNext()) {
                    String pool = p.next();
                    if( _db.getPoolStatus( pool ).equals( ReplicaDb1.ONLINE ) ) {
                        _knownPoolSet.add(pool);
                    }
                }
            }
            finally {
                // Close the iterator even when the scan fails, so the
                // underlying DB resources are not leaked.
                ((DbIterator<?>) p).close();
            }
        }

        List<String> poolList = getPoolListResilient();
        Set<String> newPoolSet = new HashSet<>(poolList);
        Set<String> oldPoolSet = new HashSet<>(_knownPoolSet);
        for (String inext : poolList) {
            // remove common part from both sets
            if (oldPoolSet.remove(inext)) {
                newPoolSet.remove(inext);
            }
        }
        List<String> arrived = new ArrayList<>( newPoolSet ) ;
        List<String> departed = new ArrayList<>( oldPoolSet ) ;

        if (arrived.isEmpty()
            && departed.isEmpty()) {
            hbMsg = "no changes";
            if ( ++cntNoChangeMsgLastTime < 5 ) {
                _log.info("WatchPool - no pools arrived or departed");
            } else if ( cntNoChangeMsgLastTime == 5 ) {
                _log.info("WatchPool - no pools arrived or departed, throttle future 'no change' messages ");
            }
        } else {
            hbMsg = "conf changed";
            cntNoChangeMsgLastTime = 0;

            if (arrived.isEmpty()) {
                _log.info("WatchPool - no new pools arrived");
            } else {
                for (String poolName : arrived) {
                    _log.info("WatchPool - pool arrived '" + poolName + "'");
                    // Bring the pool "online" only if its DB status is none of
                    // "drainoff", "offline", "offline-prepare" or "online".
                    synchronized (_dbLock) {
                        String poolStatusOld = _db.getPoolStatus(poolName);
                        // can be "down" or "UNKNOWN" or who knows what else ...
                        // ... so explicitly check the strings to be ignored
                        if (!poolStatusOld.equals("drainoff")
                            && !poolStatusOld.equals("offline")
                            && !poolStatusOld.equals("offline-prepare")
                            && !poolStatusOld.equals("online")
                            ) {
                            updatePool(poolName, ReplicaDb1.ONLINE, true);
                            updated = true;
                        }
                    }
                }
            }

            if (departed.isEmpty()) {
                _log.info("WatchPool - no pools departed");
            } else {
                // For pools which left UPDATE pool status in DB
                // because
                //   if pool crashed, there will be no message
                //   if pool went down on node shut down,
                //   there will be no message until pool restarted manually
                for (String poolName : departed) {
                    _log.info("WatchPool - pool departed '" + poolName + "'");
                    synchronized (_dbLock) {
                        updatePool(poolName, ReplicaDb1.DOWN, false);
                        updated = true;
                    }
                }
            }
        }

        // Release excluded files older than the exclude expiration time
        int releasedCount = 0;
        try {
            long now = System.currentTimeMillis();
            releasedCount = _db.releaseExcluded(now - getExcludedExpiration() );
        }
        catch (Exception ex) {
            // Best effort: report the failure instead of hiding it, then go on
            _log.warn("WatchPool - releaseExcluded failed, continue", ex);
        }

        if( updated || releasedCount > 0 ) {
            _dbUpdated.wakeup();
        }
        _knownPoolSet = new HashSet<>(poolList);
        _db.setHeartBeat("PoolWatchDog", hbMsg );
    }
}
/**
 * Runnable which calls dbUpdatePool() for one pool while holding
 * {@code _dbLock}, clears {@code _initDbActive} afterwards and finally calls
 * {@code _dbUpdated.wakeup()}.
 */
private class dbUpdatePoolRunnable implements Runnable {
    /** Name of the pool handed to dbUpdatePool(). */
    String _poolName;

    /** @param poolName name of the pool to process */
    public dbUpdatePoolRunnable(String poolName) {
        this._poolName = poolName;
    }

    @Override
    public void run() {
        synchronized (_dbLock) {
            try {
                dbUpdatePool(_poolName);
            } catch (Exception failure) {
                // Failure is only reported; the flag reset and wakeup below still happen
                final String report = " poolStatusChanged - Problem fetching repository from "
                        + _poolName + " : " + failure;
                _log.info(report);
            } finally {
                _initDbActive = false;
            }
        }
        _dbUpdated.wakeup();
    }
}
//////////////////////////////////////////////////////////////////////////////
// Callback functions
//////////////////////////////////////////////////////////////////////////////
//
// Callback: File added to pool or removed from pool
// -------------------------------------------------
/**
 * Callback for a single replica that was added to or removed from a pool.
 * Messages for pools which are not on the resilient pool list, or which
 * arrive while the DB is not in use, are logged and ignored. Otherwise the
 * replica is recorded in / removed from the DB and the pnfsId is handed to
 * {@code _dbUpdated}.
 *
 * @param msg      message carrying the pnfsId and the pool name
 * @param wasAdded {@code true} if the replica was added, {@code false} if removed
 */
@Override
public void cacheLocationModified(
        PnfsModifyCacheLocationMessage msg,
        boolean wasAdded) {
    final PnfsId pnfsId = msg.getPnfsId();
    final String poolName = msg.getPoolName();
    final String direction = wasAdded ? " added to" : " removed from";
    final String strLocModified =
            "cacheLocationModified : pnfsID " + pnfsId + direction + " pool " + poolName;

    final List<String> resilient = _resilientPools.getResilientPools();
    if (resilient == null) {
        _log.debug(strLocModified);
        _log.debug("Resilient Pools List is not defined (yet), ignore file added/removed");
        return;
    }
    if (!resilient.contains(poolName)) {
        // pool is NOT resilient
        _log.debug(strLocModified);
        _log.debug("Pool " + poolName + " is not resilient pool, ignore file added/removed");
        return;
    }

    _log.info(strLocModified);

    synchronized (_dbLock) {
        if (!_useDB) {
            _log.info("DB not ready yet, ignore ::" + strLocModified);
            return;
        }

        if (_log.isDebugEnabled()) {
            final String operation = wasAdded ? "Insertion" : "Removal";
            _log.debug("Pool list in DB before {} for pnfsId={}: {}",
                       operation, pnfsId, printCacheLocation(pnfsId));
        }

        if (!wasAdded) {
            _dbrmv2.removePool(pnfsId, poolName);
        } else {
            _dbrmv2.addPool(pnfsId, poolName);
        }
        _log.debug("cacheLocationModified() : DB updated, notify All");
    }

    _dbUpdated.addPnfsId(pnfsId);
    _dbUpdated.wakeupByPnfsId();

    // Same text as the message logged above, with a completion suffix
    _log.info(strLocModified + " - DB updated");
}
//////////////////////////////////////////////////////////////////////////////
//
// Callback: List of replicas added to pools
// -----------------------------------------
/**
 * Simple holder pairing a pnfsId with the name of the pool holding a replica.
 */
private static class Replica {
    private PnfsId _id;
    private String _pool;

    /**
     * @param pnfsId file identifier
     * @param pool   name of the pool holding the replica
     */
    public Replica(PnfsId pnfsId, String pool) {
        this._id = pnfsId;
        this._pool = pool;
    }

    /** @return the file identifier */
    public PnfsId getPnfsId() {
        return _id;
    }

    /** @return the pool name */
    public String getPool() {
        return _pool;
    }

    /** @return the pair formatted as <code>{id,pool}</code> */
    @Override
    public String toString() {
        return new StringBuilder("{")
                .append(_id)
                .append(",")
                .append(_pool)
                .append("}")
                .toString();
    }
}
/**
 * Callback for a list of replicas added to pools. Entries for pools which are
 * not on the resilient pool list are skipped; the remaining ones are written
 * to the DB under {@code _dbLock} and their pnfsIds are handed to
 * {@code _dbUpdated}. Nothing is done when the resilient pool list is not
 * available, no entry qualifies, or the DB is not in use.
 *
 * @param ml messages describing the added replicas
 */
@Override
public void cacheLocationAdded( List<PnfsAddCacheLocationMessage> ml )
{
    final List<String> resilientPools = _resilientPools.getResilientPools();
    if (resilientPools == null) {
        // _resilientPools not set yet
        _log.debug("Resilient Pools List is not defined (yet), ignore added replica list");
        return;
    }

    // Cleanup message list here:
    // @todo : move list creation to DCacheCoreController
    //         and list processing into DB handler
    final List<Replica> accepted = new ArrayList<>();
    for (PnfsAddCacheLocationMessage message : ml) {
        final PnfsId id = message.getPnfsId();
        final String pool = message.getPoolName();

        if (!resilientPools.contains(pool)) {
            _log.debug("cacheLocationAdded(List) : skip replica {" + id + "," + pool +
                       "} - pool is not on my resilient pool list");
            continue;
        }

        final Replica replica = new Replica(id, pool);
        accepted.add(replica);
        _log.debug("cacheLocationAdded(List) : add replica to update list " + replica);
    }

    if (accepted.isEmpty()) {
        return;
    }
    final int count = accepted.size();

    synchronized (_dbLock) {
        if (!_useDB) {
            _log.debug("cacheLocationAdded(): DB not ready yet, skip " + count +" replica updates" );
            return;
        }
        for (Replica replica : accepted) {
            _dbUpdated.addPnfsId(replica.getPnfsId());
            _dbrmv2.addPool(replica.getPnfsId(), replica.getPool());
        }
    }

    _log.debug("cacheLocationAdded(List) : added "+count +" pnfsid(s) to DB, notify All");
    _dbUpdated.wakeupByPnfsId();
}
//////////////////////////////////////////////////////////////////////////////
//
// Callback: Pool went down or restarted
// -------------------------------------
/**
 * Callback for a pool status change. Messages for pools which are not on the
 * resilient pool list are ignored. The reported status is mapped to a DB
 * status ("DOWN" and "UNKNOWN" to {@code ReplicaDb1.DOWN}, "UP" and "RESTART"
 * to {@code ReplicaDb1.ONLINE}); any other status is logged and ignored.
 * The DB is then updated through updatePool() unless the change was caused by
 * a "Replica Manager Command" and the new status is not "online".
 *
 * @param msg the pool status change message
 */
@Override
protected void processPoolStatusChangedMessage( PoolStatusChangedMessage msg ) {
    final String msPool = msg.getPoolName();
    final String msPoolStatus = msg.getPoolStatus();
    final String strStatusChanged = "Pool " +msPool +" status changed to " +msPoolStatus;

    final List<String> resilient = _resilientPools.getResilientPools();
    if (resilient == null) {
        // usePoolGroup() == true, but we got 'null' list for resilient pools
        _log.debug(strStatusChanged);
        _log.debug("Resilient Pools List is not defined (yet), ignore pool status change");
        return;
    }
    if (!resilient.contains(msPool)) {
        // pool is NOT resilient
        _log.debug(strStatusChanged);
        _log.debug("Pool " + msPool + " is not resilient pool, ignore pool status change");
        return;
    }

    final String poolStatus;
    switch (msPoolStatus) {
        case "UP":
        case "RESTART":
            poolStatus = ReplicaDb1.ONLINE;
            break;
        case "DOWN":
            poolStatus = ReplicaDb1.DOWN;
            break;
        case "UNKNOWN":
            poolStatus = ReplicaDb1.DOWN;
            _log.info("poolStatusChanged ERROR, pool " + msPool +
                      " state changed to '" + msPoolStatus + "'"
                      + " - set pool status to " + poolStatus);
            break;
        default:
            _log.info("poolStatusChanged ERROR, pool " + msPool +
                      " state changed to unknown state '" + msPoolStatus + "'"
                      + ", message ignored");
            return;
    }

    _log.info( "poolStatusChanged, pool " + msPool +
               " state changed to '" + poolStatus + "'" ) ;

    final String detailString = msg.getDetailMessage();

    if (_log.isDebugEnabled()) {
        final int state = msg.getPoolState();
        final PoolV2Mode mode = msg.getPoolMode();
        final int code = msg.getDetailCode();

        _log.debug("PoolStatusChangedMessage msg=" + msg );
        // Again, field by field (string concatenation renders null as "null"):
        _log.debug("pool_state=" + state );
        _log.debug("pool_mode=" + mode );
        _log.debug("detail_code=" + code );
        _log.debug("detail_string=" + detailString );
    }

    final boolean onReplicaMgrCommand = "Replica Manager Command".equals(detailString);
    if (onReplicaMgrCommand) {
        _log.debug("pool status changed on RM command");
    }

    /** @todo - do cleanup
     * string can be "offline" or "offline-prepare" - it is set above
     */
    final boolean isOnline = poolStatus.equals("online");
    final boolean doPoolInventory = isOnline
            || poolStatus.equals("offline")
            || poolStatus.equals("offline-prepare");

    // On RM command the DB is already set to drainoff | offline-prepare | offline,
    // so only an "online" transition still needs a DB update in that case.
    if (onReplicaMgrCommand && !isOnline) {
        return;
    }

    synchronized (_dbLock) {
        if (!_useDB) {
            _log.info("DB not ready yet, skip DB update" );
            return;
        }
        updatePool(msPool, poolStatus, doPoolInventory);
    }
    _dbUpdated.wakeup();
}
/**
 * Records a pool status change in the DB.
 *
 * <p>If the pool's current DB status is "drainoff" the change is only logged.
 * Otherwise the new status is written, either directly or, when
 * {@code doPoolInventory} is set, after marking the pool
 * {@code ReplicaDbV1.OFFLINE} and running dbUpdatePool(); if that fails the
 * pool keeps the offline status. A pool going {@code ReplicaDb1.ONLINE} is
 * removed from {@code _poolsToWait}, and a waiting InitDb runnable is notified
 * once that collection is empty. After the DB section, tasks of a pool going
 * {@code ReplicaDb1.DOWN} are torn down by pool name.
 *
 * @param poolName        name of the pool
 * @param poolStatus      new status to store
 * @param doPoolInventory whether to run dbUpdatePool() before storing the status
 */
private void updatePool(String poolName, String poolStatus,
                        boolean doPoolInventory) {
    synchronized (_dbLock) {
        final String previousStatus = _dbrmv2.getPoolStatus(poolName);

        if (!previousStatus.equals("drainoff")) {
            _log.info("poolStatusChanged, Pool '" + poolName + "' status was " +
                      previousStatus);

            if (poolStatus.equals(ReplicaDb1.ONLINE)) {
                _poolsToWait.remove(poolName);
            }

            final String storedMsg = "poolStatusChanged, pool " + poolName +
                    " state change to '" + poolStatus + "' updated in DB, notify All";

            if (doPoolInventory) {
                // "RESTART" || "UP"
                // update pnfsId from this pool, and update DB
                _dbrmv2.setPoolStatus(poolName, ReplicaDbV1.OFFLINE);
                try {
                    dbUpdatePool(poolName);
                    _dbrmv2.setPoolStatus(poolName, poolStatus);
                    _log.info(storedMsg);
                } catch (Exception failure) {
                    _log.info(" poolStatusChanged - Problem fetching repository from " +
                              poolName + " : " + failure);
                    _log.info(" poolStatusChanged - pool " + poolName + " stays '"
                              +ReplicaDb1.OFFLINE+"'");
                }
            } else {
                // update DB only
                _dbrmv2.setPoolStatus(poolName, poolStatus);
                _log.info(storedMsg);
            }

            synchronized (_poolsToWait) {
                final boolean initWaitsAndAllBack = _initDbRunnable != null
                        && _initDbRunnable.isWaiting()
                        && _poolsToWait.isEmpty();
                if (initWaitsAndAllBack) {
                    _poolsToWait.notifyAll();
                    _log.debug("Got all online pools back online, wakeup InitDB");
                }
            } // synchronized (_poolsToWait)
        } else {
            _log.info("poolStatusChanged, Pool '" + poolName + "' status is " +
                      previousStatus
                      + ", ignore pool status change messages");
        }
    } // synchronized (_dbLock)

    if (poolStatus.equals(ReplicaDb1.DOWN)) {
        taskTearDownByPoolName(poolName);
    }
}
//////////////////////////////////////////////////////////////////////////////
// Pool Remove Files message from Cleaner
// - wipe out all pnfsID entries from replicas table
/**
 * Callback for a deleted entry: clears all pool records of the pnfsId in the
 * DB under {@code _dbLock} and then calls {@code _dbUpdated.wakeup()}. When
 * the DB is not in use the message is logged and skipped.
 *
 * @param msg notification carrying the pnfsId of the deleted entry
 */
@Override
protected void processDeleteEntryNotification(PnfsDeleteEntryNotificationMessage msg)
{
    final PnfsId pnfsId = msg.getPnfsId();

    synchronized (_dbLock) {
        if (!_useDB) {
            _log.info("DB not ready yet, skip DB update" );
            return;
        }
        _dbrmv2.clearPools(pnfsId);
    }

    _log.debug("ReplicaManager: PoolRemoveFiles(): {} cleared in DB", pnfsId);

    // Reaching this point means exactly one entry was cleared above, so the
    // DB check is always woken up here.
    _dbUpdated.wakeup();
}
//////////////////////////////////////////////////////////////////////////////
// Task finished - Test
/**
 * Callback invoked when a task has finished; only logs the task and, for
 * "Reduction" and "Replication" tasks, the pnfsId and the pools involved.
 *
 * @param task the finished task
 */
@Override
public void taskFinished(TaskObserver task) {
    _log.info("TaskFinished callback: task " + task);

    if (task.getType().equals("Reduction")) {
        final ReductionObserver reduction = (ReductionObserver) task;
        final String text = "taskFinished() reduction " + reduction.getPnfsId()
                + " at " + reduction.getPool();
        _log.debug(text);
    } else if (task.getType().equals("Replication")) {
        final MoverTask mover = (MoverTask) task;
        final String text = "taskFinished() replication " + mover.getPnfsId()
                + " from " + mover.getSrcPool()
                + " to " + mover.getDstPool();
        _log.debug(text);
    }
}
}