package org.dcache.chimera.namespace; import com.google.common.base.Splitter; import com.google.common.base.Strings; import com.google.common.base.Throwables; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.UncheckedExecutionException; import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariDataSource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.dao.DataAccessException; import org.springframework.dao.DataIntegrityViolationException; import org.springframework.jdbc.core.BatchPreparedStatementSetter; import org.springframework.jdbc.core.JdbcTemplate; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; import java.util.function.BiFunction; import diskCacheV111.util.CacheException; import diskCacheV111.util.PnfsId; import diskCacheV111.vehicles.PnfsDeleteEntryNotificationMessage; import diskCacheV111.vehicles.PoolManagerPoolUpMessage; import diskCacheV111.vehicles.PoolRemoveFilesMessage; import dmg.cells.nucleus.CellPath; import dmg.cells.nucleus.NoRouteToCellException; import dmg.util.command.Argument; import dmg.util.command.Command; import org.dcache.cells.AbstractCell; import org.dcache.cells.CellStub; import org.dcache.db.AlarmEnabledDataSource; import org.dcache.services.hsmcleaner.PoolInformationBase; import org.dcache.services.hsmcleaner.RequestTracker; import org.dcache.util.Args; import org.dcache.util.CacheExceptionFactory; import org.dcache.util.Option; import static com.google.common.util.concurrent.Futures.allAsList; import static com.google.common.util.concurrent.Futures.immediateFailedFuture; import static java.util.concurrent.TimeUnit.SECONDS; import static java.util.stream.Collectors.toList; /** * @author Irina Kozlova * @version 22 Oct 2007 * * ChimeraCleaner: takes file names from the table public.t_locationinfo_trash, * removes them from the corresponding pools and then from the table as well. * @since 1.8 */ public class ChimeraCleaner extends AbstractCell implements Runnable { private static final Logger _log = LoggerFactory.getLogger(ChimeraCleaner.class); @Option( name="refresh", description="Refresh interval", required=true ) protected long _refreshInterval; @Option( name="refreshUnit", description="Refresh interval unit", required=true ) protected TimeUnit _refreshIntervalUnit; @Option( name="recover", description="", required=true ) protected long _recoverTimer; @Option( name="recoverUnit", description="", required=true ) protected TimeUnit _recoverTimerUnit; @Option( name="poolTimeout", description="", required=true ) protected long _replyTimeout; @Option( name="poolTimeoutUnit", description="", required=true ) protected TimeUnit _replyTimeoutUnit; @Option( name="processFilesPerRun", description="The number of files to process at once", required=true, unit="files" ) protected int _processAtOnce; @Option( name="reportRemove", description="The cells to report removes to" ) protected String _reportTo; @Option( name="hsmCleaner", description="Whether to enable the HSM cleaner", required=true ) protected boolean _hsmCleanerEnabled; @Option( name="hsmCleanerRequest", description="Maximum number of files to include in a single request", required=true, unit="files" ) protected int _hsmCleanerRequest; @Option( name="hsmCleanerTimeout", description="Timeout in milliseconds for delete requests send to HSM-pools", required=true ) protected long _hsmTimeout; @Option( name="hsmCleanerTimeoutUnit", description="Timeout in milliseconds for delete requests send to HSM-pools", required=true ) protected TimeUnit _hsmTimeoutUnit; @Option( name="threads", description="Size of thread pool", required=true ) protected int _threadPoolSize; private CellPath[] _deleteNotificationTargets; private final ConcurrentHashMap<String, Long> _poolsBlackList = new ConcurrentHashMap<>(); private RequestTracker _requests; private ScheduledExecutorService _executor; private ScheduledFuture<?> _cleanerTask; private PoolInformationBase _pools = new PoolInformationBase(); private AlarmEnabledDataSource _dataSource; private JdbcTemplate _db; private CellStub _notificationStub; private CellStub _poolStub; public ChimeraCleaner(String cellName, String args) { super(cellName, args); } @Override protected void starting() throws Exception { super.starting(); useInterpreter(true); _notificationStub = new CellStub(this); _deleteNotificationTargets = Splitter.on(",") .omitEmptyStrings() .splitToList(Strings.nullToEmpty(_reportTo)) .stream() .map(CellPath::new) .toArray(CellPath[]::new); _executor = Executors.newScheduledThreadPool(_threadPoolSize); dbInit(getArgs().getOpt("chimera.db.url"), getArgs().getOpt("chimera.db.user"), getArgs().getOpt("chimera.db.password")); if (_hsmCleanerEnabled) { _requests = new RequestTracker(); _requests.setMaxFilesPerRequest(_hsmCleanerRequest); _requests.setTimeout(_hsmTimeoutUnit.toMillis(_hsmTimeout)); _requests.setPoolStub(new CellStub(this)); _requests.setPoolInformationBase(_pools); _requests.setSuccessSink(uri -> _executor.execute(() -> onSuccess(uri))); _requests.setFailureSink(uri -> _executor.execute(() -> onFailure(uri))); addMessageListener(_requests); addCommandListener(_requests); } addMessageListener(_pools); addCommandListener(_pools); _poolStub = new CellStub(this, null, _replyTimeout, _replyTimeoutUnit); } @Override protected void started() { _cleanerTask = _executor.scheduleWithFixedDelay(this, _refreshInterval, _refreshInterval, _refreshIntervalUnit); } void dbInit(String jdbcUrl, String user, String pass ) throws ClassNotFoundException { if (jdbcUrl == null || user == null || pass == null) { throw new IllegalArgumentException("Not enough arguments to Init SQL database"); } HikariConfig config = new HikariConfig(); config.setJdbcUrl(jdbcUrl); config.setUsername(user); config.setPassword(pass); config.setMinimumIdle(1); config.setMaximumPoolSize(10); _dataSource = new AlarmEnabledDataSource(jdbcUrl, ChimeraCleaner.class.getSimpleName(), new HikariDataSource(config)); _db = new JdbcTemplate(_dataSource); _log.info("Database connection with jdbcUrl={}; user={}", jdbcUrl, user); } @Override protected void stopped() { if (_requests != null) { _requests.shutdown(); } if (_executor != null) { _executor.shutdownNow(); } if (_dataSource != null) { try { _dataSource.close(); } catch (IOException e) { _log.debug("Failed to shutdown database connection pool: {}", e.getMessage()); } } } @Override public void run() { try { _log.info("*********NEW_RUN*************"); _log.debug("INFO: Refresh Interval : {} {}", _refreshInterval, _refreshIntervalUnit); _log.debug("INFO: Number of files processed at once: {}", _processAtOnce); // get list of pool names from the trash_table List<String> poolList = getPoolList(); _log.debug("List of Pools from the trash-table : {}", poolList); // if there are some pools in the blackPoolList (i.e., //pools that are down/do not exist), extract them from the //poolList if (!_poolsBlackList.isEmpty()) { _log.debug("{} pools are currently blacklisted.", _poolsBlackList.size()); for (Map.Entry<String, Long> blackListEntry : _poolsBlackList.entrySet()) { String poolName = blackListEntry.getKey(); long valueTime = blackListEntry.getValue(); //check, if it is time to remove pool from the black list if ((valueTime != 0) && (_recoverTimer > 0) && ((System.currentTimeMillis() - valueTime) > _recoverTimerUnit.toMillis(_recoverTimer))) { _poolsBlackList.remove(poolName); _log.debug("Removed the following pool from the black list: {}", poolName);} } poolList.removeAll(_poolsBlackList.keySet()); } if (!poolList.isEmpty()) { _log.debug("The following pools are cleaned: {}", poolList); runDelete(poolList); } //HSM part if (_hsmCleanerEnabled){ runDeleteHSM(); runNotification(); } } catch (DataAccessException e) { _log.error("Database failure: " + e.getMessage()); } catch (InterruptedException e) { _log.info("Cleaner was interrupted"); } catch (RuntimeException e) { _log.error("Bug detected" , e); } } /** * Returns a list of dinstinctpool names from the trash-table. * * @return list of pool names */ List<String> getPoolList() { return _db.query("SELECT DISTINCT ilocation FROM t_locationinfo_trash WHERE itype=1", (rs, rowNum) -> rs.getString("ilocation")); } /** * Delete entries from the trash-table. * Pool name and the file names are input parameters. * * @param poolname name of the pool * @param filelist file list for this pool * */ void removeFiles(final String poolname, final List<String> filelist) { _db.batchUpdate("DELETE FROM t_locationinfo_trash WHERE ilocation=? AND ipnfsid=? AND itype=1", new BatchPreparedStatementSetter() { @Override public int getBatchSize() { return filelist.size(); } @Override public void setValues(PreparedStatement ps, int i) throws SQLException { ps.setString(1, poolname); ps.setString(2, filelist.get(i)); } }); } /** * runDelete * Delete files on each pool from the poolList. * * @param poolList list of pools * @throws InterruptedException */ private void runDelete(List<String> poolList) throws InterruptedException { for (String pool: poolList) { if (Thread.interrupted()) { throw new InterruptedException("Cleaner interrupted"); } runDelete(pool); // Notify other components that we are done deleting runNotification(); } } private void runDelete(String pool) throws InterruptedException { _log.info("runDelete(): Now processing pool {}", pool); if (!_poolsBlackList.containsKey(pool)) { try { cleanPoolComplete(pool); } catch (NoRouteToCellException | CacheException e) { _log.warn("Failed to remove files from {}: {}", pool, e.getMessage()); } } } /** * sendRemoveToPoolCleaner * removes set of files from the pool * * @param poolName name of the pool * @param removeList list of files to be removed from this pool * @throws InterruptedException */ private void sendRemoveToPoolCleaner(String poolName, List<String> removeList) throws InterruptedException, CacheException, NoRouteToCellException { _log.trace("sendRemoveToPoolCleaner: poolName={} removeList={}", poolName, removeList); try { PoolRemoveFilesMessage msg = CellStub.get(_poolStub.send(new CellPath(poolName), new PoolRemoveFilesMessage(poolName, removeList))); if (msg.getReturnCode() == 0) { removeFiles(poolName, removeList); } else if (msg.getReturnCode() == 1 && msg.getErrorObject() instanceof String[]) { Set<String> notRemoved = new HashSet<>(Arrays.asList((String[]) msg.getErrorObject())); List<String> removed = new ArrayList<>(removeList); removed.removeAll(notRemoved); removeFiles(poolName, removed); } else { throw CacheExceptionFactory.exceptionOf(msg); } } catch (NoRouteToCellException | CacheException e) { _poolsBlackList.put(poolName, System.currentTimeMillis()); throw e; } } public void messageArrived(NoRouteToCellException e) { _log.warn(e.getMessage()); } public void messageArrived(PoolManagerPoolUpMessage poolUpMessage) { String poolName = poolUpMessage.getPoolName(); if (poolUpMessage.getPoolMode().isEnabled() ) { _poolsBlackList.remove(poolName); } else { _poolsBlackList.put(poolName, System.currentTimeMillis()); } } private void runNotification() throws InterruptedException { final String QUERY = "SELECT ipnfsid FROM t_locationinfo_trash t1 " + "WHERE itype=2 AND NOT EXISTS (SELECT 1 FROM t_locationinfo_trash t2 WHERE t2.ipnfsid=t1.ipnfsid AND t2.itype <> 2)"; for (String id : _db.queryForList(QUERY, String.class)) { try { sendDeleteNotifications(new PnfsId(id)).get(); _db.update("DELETE FROM t_locationinfo_trash WHERE ipnfsid=? AND itype=2", id); } catch (ExecutionException e) { _log.warn(e.getCause().getMessage()); } } } private ListenableFuture<List<PnfsDeleteEntryNotificationMessage>> sendDeleteNotifications(PnfsId pnfsId) { BiFunction<CellPath, Exception, CacheException> failureFor = (path, e) -> new CacheException("Failed to notify " + path + " about deletion of " + pnfsId + ": " + e.getMessage(), e); return allAsList( Arrays.stream(_deleteNotificationTargets) .map(a -> Futures.catchingAsync(_notificationStub.send(a, new PnfsDeleteEntryNotificationMessage(pnfsId)), Exception.class, e -> immediateFailedFuture(failureFor.apply(a, e)))) .collect(toList())); } /** * cleanPoolComplete * delete all files from the pool 'poolName' found in the trash-table for this pool * * @param poolName name of the pool */ void cleanPoolComplete(final String poolName) throws InterruptedException, CacheException, NoRouteToCellException { _log.trace("CleanPoolComplete(): poolname={}", poolName); try { List<String> files = new ArrayList<>(_processAtOnce); _db.query("SELECT ipnfsid FROM t_locationinfo_trash WHERE ilocation=? AND itype=1 ORDER BY iatime", rs -> { try { files.add(rs.getString("ipnfsid")); if (files.size() >= _processAtOnce || rs.isLast()) { sendRemoveToPoolCleaner(poolName, files); files.clear(); } } catch (InterruptedException | CacheException | NoRouteToCellException e) { throw new UncheckedExecutionException(e); } }, poolName); } catch (UncheckedExecutionException e) { Throwables.propagateIfInstanceOf(e.getCause(), InterruptedException.class); Throwables.propagateIfInstanceOf(e.getCause(), CacheException.class); Throwables.propagateIfInstanceOf(e.getCause(), NoRouteToCellException.class); throw Throwables.propagate(e.getCause()); } } /** * Delete files stored on tape (HSM). */ private void runDeleteHSM() { _db.query("SELECT ilocation FROM t_locationinfo_trash WHERE itype=0", rs -> { try { URI uri = new URI(rs.getString("ilocation")); _log.debug("Submitting a request to delete a file: {}", uri); _requests.submit(uri); } catch (URISyntaxException e) { throw new DataIntegrityViolationException("Invalid URI in database: " + e.getMessage(), e); } }); } //////////////////////////////////////////////////////////////////////////// @Command(name = "rundelete", hint = "run cleaner", description = "Delete all files found in the trash-table irrespective of the pool.") public class RundeleteCommand implements Callable<String> { @Override public String call() throws InterruptedException { runDelete(getPoolList()); return ""; } } @Command(name = "show info", hint = "get cleaner service information") public class ShowInfoCommand implements Callable<String> { @Override public String call() { StringBuilder sb = new StringBuilder(); sb.append("Refresh Interval: ").append(_refreshInterval).append(" ").append(_refreshIntervalUnit).append("\n"); sb.append("Reply Timeout: ").append(_replyTimeout).append(" ").append(_replyTimeoutUnit).append("\n"); sb.append("Recover Timer: ").append(_recoverTimer).append(" ").append(_recoverTimerUnit).append("\n"); sb.append("Number of files processed at once: ").append(_processAtOnce); if ( _hsmCleanerEnabled ) { sb.append("\n HSM Cleaner enabled. Info : \n"); sb.append("Timeout for cleaning requests to HSM-pools: ").append(_hsmTimeout).append(" ").append(_hsmTimeoutUnit).append("\n"); sb.append("Maximal number of concurrent requests to a single HSM : ").append(_hsmCleanerRequest); } else { sb.append("\n HSM Cleaner disabled."); } return sb.toString(); } } @Command(name = "ls blacklist", hint = "list blacklisted pools", description = "Show a list of blacklisted pools. Blacklisted pool is a " + "pool that is down or do not exist.") public class LsBlacklistCommand implements Callable<String> { @Override public String call() { StringBuilder sb = new StringBuilder(); for (String pool : _poolsBlackList.keySet()) { sb.append(pool).append("\n"); } return sb.toString(); } } @Command(name = "remove from blacklist", hint = "remove a pool from the blacklist") public class RemoveFromBlacklistCommand implements Callable<String> { @Argument(usage = "The name of the pool to be removed from the blacklist.") String poolName; @Override public String call() { if (_poolsBlackList.remove(poolName) != null) { return "Pool " + poolName + " is removed from the Black List "; } return "Pool " + poolName + " was not found in the Black List "; } } public static final String hh_clean_file = "<pnfsID> # clean this file (file will be deleted from DISK)"; public String ac_clean_file_$_1(Args args) throws InterruptedException, CacheException, NoRouteToCellException { try { String pnfsid = args.argv(0); List<String> removeFile = Collections.singletonList(pnfsid); _db.query("SELECT ilocation FROM t_locationinfo_trash WHERE ipnfsid=? AND itype=1 ORDER BY iatime", rs -> { String pool = rs.getString("ilocation"); try { sendRemoveToPoolCleaner(pool, removeFile); } catch (CacheException | InterruptedException | NoRouteToCellException e) { throw new UncheckedExecutionException(e); } }, pnfsid); } catch (UncheckedExecutionException e) { Throwables.propagateIfInstanceOf(e.getCause(), InterruptedException.class); Throwables.propagateIfInstanceOf(e.getCause(), CacheException.class); Throwables.propagateIfInstanceOf(e.getCause(), NoRouteToCellException.class); throw Throwables.propagate(e.getCause()); } return ""; } public static final String hh_clean_pool = "<poolName> # clean this pool "; public String ac_clean_pool_$_1(Args args) throws CacheException, InterruptedException, NoRouteToCellException { String poolName = args.argv(0); if (_poolsBlackList.containsKey(poolName)) { return "This pool is not available for the moment and therefore will not be cleaned."; } cleanPoolComplete(poolName); return ""; } public static final String hh_set_refresh = "[<refreshTimeInSeconds>]"; public static final String fh_set_refresh = "Alters refresh rate and triggers a new run. Maximum rate is every 5 seconds."; public String ac_set_refresh_$_0_1(Args args) { if (args.argc() > 0) { long newRefresh = Long.parseLong(args.argv(0)); if (newRefresh < 5) { throw new IllegalArgumentException("Time must be greater than 5 seconds"); } _refreshInterval = newRefresh; _refreshIntervalUnit = SECONDS; if (_cleanerTask != null) { _cleanerTask.cancel(true); } _cleanerTask = _executor.scheduleWithFixedDelay(this, 0, _refreshInterval, _refreshIntervalUnit); } return "Refresh set to " + _refreshInterval + " " + _refreshIntervalUnit; } public static final String hh_set_processedAtOnce = "<processedAtOnce> # max number of files sent to pool for processing at once "; public String ac_set_processedAtOnce_$_1(Args args) { if (args.argc() > 0) { int processAtOnce = Integer.parseInt(args.argv(0)); if (processAtOnce <= 0) { throw new IllegalArgumentException("Number of files must be greater than 0 "); } _processAtOnce = processAtOnce; } return "Number of files processed at once set to " + _processAtOnce; } ///// HSM admin commands ///// public static final String hh_rundelete_hsm = " # run HSM Cleaner"; public String ac_rundelete_hsm(Args args) { if (!_hsmCleanerEnabled) { return "HSM Cleaner is disabled."; } runDeleteHSM(); return ""; } //explicitly clean HSM-file public static final String hh_clean_file_hsm = "<pnfsID> # clean this file on HSM (file will be deleted from HSM)"; public String ac_clean_file_hsm_$_1(Args args) { if (!_hsmCleanerEnabled) { return "HSM Cleaner is disabled."; } _db.query("SELECT ilocation FROM t_locationinfo_trash WHERE ipnfsid=? AND itype=0 ORDER BY iatime", rs -> { try { _requests.submit(new URI(rs.getString("ilocation"))); } catch (URISyntaxException e) { throw new DataIntegrityViolationException("Invalid URI in database: " + e.getMessage(), e); } }, args.argv(0)); return ""; } public static final String hh_hsm_set_MaxFilesPerRequest = "<number> # maximal number of concurrent requests to a single HSM"; public String ac_hsm_set_MaxFilesPerRequest_$_1(Args args) throws NumberFormatException { if (!_hsmCleanerEnabled) { return "HSM Cleaner is disabled."; } if (args.argc() > 0) { int maxFilesPerRequest = Integer.parseInt(args.argv(0)); if (maxFilesPerRequest == 0) { throw new IllegalArgumentException("The number must be greater than 0 "); } _hsmCleanerRequest = maxFilesPerRequest; } return "Maximal number of concurrent requests to a single HSM is set to " + _hsmCleanerRequest; } public static final String hh_hsm_set_TimeOut = "<seconds> # cleaning request timeout in seconds (for HSM-pools)"; public String ac_hsm_set_TimeOut_$_1(Args args) throws NumberFormatException { if (!_hsmCleanerEnabled) { return "HSM Cleaner is disabled."; } if (args.argc() > 0) { _hsmTimeout = Long.parseLong(args.argv(0)); _hsmTimeoutUnit = SECONDS; } return "Timeout for cleaning requests to HSM-pools is set to " + _hsmTimeout + " " + _hsmTimeoutUnit; } /** * Called when a file was successfully deleted from the HSM. */ protected void onSuccess(URI uri) { try { _log.debug("HSM-ChimeraCleaner: remove entries from the trash-table. ilocation={}", uri); _db.update("DELETE FROM t_locationinfo_trash WHERE ilocation=? AND itype=0", uri.toString()); } catch (DataAccessException e) { _log.error("Error when deleting from the trash-table: " + e.getMessage()); } } /** * Called when a file could not be deleted from the HSM. */ protected void onFailure(URI uri) { _log.info("Failed to delete a file {} from HSM. Will try again later.", uri); } }