package diskCacheV111.poolManager ;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Multimap;
import com.google.common.util.concurrent.ListenableFuture;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Required;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.Thread.UncaughtExceptionHandler;
import java.net.InetSocketAddress;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Deque;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executor;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import diskCacheV111.util.CacheException;
import diskCacheV111.util.CheckStagePermission;
import diskCacheV111.util.CostException;
import diskCacheV111.util.DestinationCostException;
import diskCacheV111.util.ExtendedRunnable;
import diskCacheV111.util.FileNotInCacheException;
import diskCacheV111.util.PermissionDeniedCacheException;
import diskCacheV111.util.PnfsHandler;
import diskCacheV111.util.PnfsId;
import diskCacheV111.util.SourceCostException;
import diskCacheV111.vehicles.DCapProtocolInfo;
import diskCacheV111.vehicles.IpProtocolInfo;
import diskCacheV111.vehicles.Message;
import diskCacheV111.vehicles.Pool2PoolTransferMsg;
import diskCacheV111.vehicles.PoolFetchFileMessage;
import diskCacheV111.vehicles.PoolHitInfoMessage;
import diskCacheV111.vehicles.PoolMgrReplicateFileMsg;
import diskCacheV111.vehicles.PoolMgrSelectReadPoolMsg;
import diskCacheV111.vehicles.PoolStatusChangedMessage;
import diskCacheV111.vehicles.ProtocolInfo;
import diskCacheV111.vehicles.RestoreHandlerInfo;
import diskCacheV111.vehicles.StorageInfo;
import diskCacheV111.vehicles.WarningPnfsFileInfoMessage;
import dmg.cells.nucleus.AbstractCellComponent;
import dmg.cells.nucleus.CDC;
import dmg.cells.nucleus.CellAddressCore;
import dmg.cells.nucleus.CellCommandListener;
import dmg.cells.nucleus.CellInfoProvider;
import dmg.cells.nucleus.CellMessage;
import dmg.cells.nucleus.CellMessageReceiver;
import dmg.cells.nucleus.CellPath;
import dmg.cells.nucleus.CellSetupProvider;
import dmg.cells.nucleus.NoRouteToCellException;
import dmg.cells.nucleus.UOID;
import org.dcache.cells.CellStub;
import org.dcache.poolmanager.Partition;
import org.dcache.poolmanager.PartitionManager;
import org.dcache.poolmanager.PoolInfo;
import org.dcache.poolmanager.PoolSelector;
import org.dcache.poolmanager.SelectedPool;
import org.dcache.util.Args;
import org.dcache.util.FireAndForgetTask;
import org.dcache.vehicles.FileAttributes;
public class RequestContainerV5
extends AbstractCellComponent
implements Runnable, CellCommandListener, CellMessageReceiver, CellSetupProvider, CellInfoProvider
{
private static final Logger _log =
LoggerFactory.getLogger(RequestContainerV5.class);
/** States a read request can be in; also used by callers to restrict what a request may do. */
public enum RequestState { ST_INIT, ST_DONE, ST_POOL_2_POOL,
ST_STAGE, ST_WAITING, ST_WAITING_FOR_STAGING,
ST_WAITING_FOR_POOL_2_POOL, ST_SUSPENDED }
// Placeholder reported when a handler has no pool candidate yet.
private static final String POOL_UNKNOWN_STRING = "<unknown>" ;
// Accepted values of the sameHostCopy / sameHostRetry settings (see the help strings below).
private static final String STRING_NEVER = "never" ;
private static final String STRING_BESTEFFORT = "besteffort" ;
private static final String STRING_NOTCHECKED = "notchecked" ;
/** value in milliseconds */
private static final int DEFAULT_TICKER_INTERVAL = 60000;
// Timestamp layout used in the handler status strings.
private static final DateTimeFormatter DATE_TIME_FORMAT = DateTimeFormatter.ofPattern("MM.dd HH:mm:ss");
// UOID of an outstanding pool message -> handler waiting for the reply.
// Always accessed while synchronized on the map itself.
private final Map<UOID, PoolRequestHandler> _messageHash = new HashMap<>() ;
// Canonical request name -> handler. Always accessed while synchronized on the map itself.
private final Map<String, PoolRequestHandler> _handlerHash = new HashMap<>() ;
private CellStub _billing;
private CellStub _poolStub;
// Milliseconds; configured in seconds through 'rc set retry'.
private long _retryTimer = 15 * 60 * 1000 ;
private static final int MAX_REQUEST_CLUMPING = 20;
// Either "suspend" or "fail"; configured through 'rc onerror'.
private String _onError = "suspend" ;
private int _maxRetries = 3 ;
// Negative means unlimited; configured through 'rc set max restore'.
private int _maxRestore = -1 ;
private CheckStagePermission _stagePolicyDecisionPoint;
private boolean _sendHitInfo;
private int _restoreExceeded;
// Both suspend flags are toggled by 'rc suspend' and are not written to the setup.
private boolean _suspendIncoming;
private boolean _suspendStaging;
private PoolSelectionUnit _selectionUnit;
private PoolMonitorV5 _poolMonitor;
private PnfsHandler _pnfsHandler;
private Executor _executor;
// Entries are maintained by 'rc select'; access is synchronized on the map.
private final Map<PnfsId, CacheException> _selections = new HashMap<>() ;
private PartitionManager _partitionManager ;
// Milliseconds; configured in seconds through 'rc set poolpingtimer'.
private volatile long _checkFilePingTimer = 10 * 60 * 1000 ;
/** value in milliseconds */
private final long _ticketInterval;
private Thread _tickerThread;
private PoolPingThread _poolPingThread;
/**
* Tape Protection.
* allStates defines that all states are allowed.
* allStatesExceptStage defines that all states except STAGE are allowed.
*/
public static final EnumSet<RequestState> allStates =
EnumSet.allOf(RequestState.class);
public static final EnumSet<RequestState> allStatesExceptStage =
EnumSet.complementOf(EnumSet.of(RequestState.ST_STAGE));
/**
 * Creates a container whose ticker thread pauses for the given time
 * between two rounds.
 *
 * @param tickerInterval pause between ticker rounds, in milliseconds
 */
public RequestContainerV5(long tickerInterval)
{
    this._ticketInterval = tickerInterval;
}
/**
 * Creates a container that uses {@code DEFAULT_TICKER_INTERVAL} as the
 * ticker interval.
 */
public RequestContainerV5() {
    this((long) DEFAULT_TICKER_INTERVAL);
}
/**
 * Launches the two background threads of this component: the ticker
 * (this object's {@link #run()}) and the pool ping thread.
 */
public void start()
{
    Thread ticker = new Thread(this, "Container-ticker");
    _tickerThread = ticker;
    ticker.start();

    PoolPingThread pinger = new PoolPingThread();
    _poolPingThread = pinger;
    pinger.start();
}
/**
 * Interrupts the background threads created by {@link #start()}. Threads
 * that were never started are skipped.
 */
public void shutdown()
{
    Thread ticker = _tickerThread;
    if (ticker != null) {
        ticker.interrupt();
    }

    PoolPingThread pinger = _poolPingThread;
    if (pinger != null) {
        pinger.interrupt();
    }
}
/**
 * Injects the selection unit used to resolve net and protocol units.
 *
 * @param selectionUnit the pool selection unit
 */
@Required
public void setPoolSelectionUnit(PoolSelectionUnit selectionUnit) {
    this._selectionUnit = selectionUnit;
}
/**
 * Injects the pool monitor from which pool selectors are obtained.
 *
 * @param poolMonitor the pool monitor
 */
@Required
public void setPoolMonitor(PoolMonitorV5 poolMonitor) {
    this._poolMonitor = poolMonitor;
}
/**
 * Injects the handler used to look up file attributes.
 *
 * @param pnfsHandler the name space handler
 */
@Required
public void setPnfsHandler(PnfsHandler pnfsHandler) {
    this._pnfsHandler = pnfsHandler;
}
/**
 * Injects the partition manager that provides the default partition.
 *
 * @param partitionManager the partition manager
 */
@Required
public void setPartitionManager(PartitionManager partitionManager) {
    this._partitionManager = partitionManager;
}
/**
 * Injects the executor on which the per-request state engines run.
 *
 * @param executor the executor
 */
@Required
public void setExecutor(Executor executor) {
    this._executor = executor;
}
/**
 * Stores the hit-info switch.
 *
 * @param sendHitInfo new value of the switch
 */
public void setHitInfoMessages(boolean sendHitInfo) {
    this._sendHitInfo = sendHitInfo;
}
/**
 * Injects the stub for the billing cell.
 *
 * @param billing the billing stub
 */
@Required
public void setBilling(CellStub billing) {
    this._billing = billing;
}
/**
 * Injects the stub used to talk to pools.
 *
 * @param poolStub the pool stub
 */
@Required
public void setPoolStub(CellStub poolStub) {
    this._poolStub = poolStub;
}
/**
 * Routes a reply to the handler that is waiting for it. The handler is
 * looked up (and unregistered) by the UOID of the message the reply
 * belongs to; replies nobody is waiting for are dropped silently.
 *
 * @param envelope the envelope of the reply
 * @param message the payload handed to the waiting handler
 */
public void messageArrived(CellMessage envelope, Object message)
{
    UOID replyTo = envelope.getLastUOID();
    PoolRequestHandler waiting;
    synchronized (_messageHash) {
        waiting = _messageHash.remove(replyTo);
    }
    if (waiting != null) {
        waiting.mailForYou(message);
    }
}
/**
 * Ticker loop: sleeps for the ticker interval, then sends an "alive"
 * tick to every registered handler. The loop ends when the thread is
 * interrupted; any other throwable is passed to the thread's uncaught
 * exception handler and the loop carries on.
 */
@Override
public void run()
{
    while (!Thread.interrupted()) {
        try {
            Thread.sleep(_ticketInterval);

            // Work on a snapshot so that alive() is never called while
            // the _handlerHash lock is held.
            List<PoolRequestHandler> snapshot;
            synchronized (_handlerHash) {
                snapshot = new ArrayList<>(_handlerHash.values());
            }
            for (PoolRequestHandler current : snapshot) {
                if (current == null) {
                    continue;
                }
                current.alive();
            }
        } catch (InterruptedException e) {
            break;
        } catch (Throwable t) {
            Thread self = Thread.currentThread();
            self.getUncaughtExceptionHandler().uncaughtException(self, t);
        }
    }
    _log.debug("Container-ticker done");
}
/**
 * Reacts to a pool changing its status by retrying affected requests.
 *
 * <ul>
 * <li>Pool came up: requests without any pool candidate are retried and,
 *     additionally, requests scheduled to that pool.</li>
 * <li>Pool went down: requests scheduled to that pool are retried.</li>
 * </ul>
 * Other status values trigger nothing. Runtime exceptions are logged and
 * end the processing of the remaining handlers.
 *
 * @param poolName name of the pool whose status changed
 * @param poolStatus one of the {@code PoolStatusChangedMessage} status codes
 */
public void poolStatusChanged(String poolName, int poolStatus) {
    _log.info("Restore Manager : got 'poolRestarted' for " + poolName);
    try {
        List<PoolRequestHandler> snapshot;
        synchronized (_handlerHash) {
            snapshot = new ArrayList<>(_handlerHash.values());
        }

        boolean cameUp = poolStatus == PoolStatusChangedMessage.UP;
        boolean wentDown = poolStatus == PoolStatusChangedMessage.DOWN;
        if (!cameUp && !wentDown) {
            return;
        }

        for (PoolRequestHandler handler : snapshot) {
            if (handler == null) {
                continue;
            }
            // Only relevant when the pool came up: requests that so far
            // had no candidate at all get another chance.
            if (cameUp && handler.getPoolCandidate().equals(POOL_UNKNOWN_STRING)) {
                _log.info("Restore Manager : retrying : " + handler);
                handler.retry();
            }
            // For both directions: requests scheduled to this very pool.
            if (handler.getPoolCandidate().equals(poolName)) {
                _log.info("Restore Manager : retrying : " + handler);
                handler.retry();
            }
        }
    } catch (RuntimeException e) {
        _log.error("Problem retrying pool " + poolName, e);
    }
}
/**
 * Prints the current settings of the request container together with the
 * p2p and staging switches of the default partition.
 *
 * @param pw writer receiving the report
 */
@Override
public void getInfo(PrintWriter pw)
{
    Partition defaults = _partitionManager.getDefaultPartition();

    String p2pAllowed = defaults._p2pAllowed ? "on" : "off";
    String p2pOnCost = defaults._p2pOnCost ? "on" : "off";
    String p2pForTransfer = defaults._p2pForTransfer ? "on" : "off";
    String staging = defaults._hasHsmBackend ? "on" : "off";
    String stageOnCost = defaults._stageOnCost ? "on" : "off";
    String restoreLimit = _maxRestore < 0 ? "unlimited" : String.valueOf(_maxRestore);

    pw.println(" Retry Timeout : " + (_retryTimer / 1000) + " seconds");
    pw.println(" Thread Controller : " + _executor);
    pw.println(" Maximum Retries : " + _maxRetries);
    pw.println(" Pool Ping Timer : " + (_checkFilePingTimer / 1000) + " seconds");
    pw.println(" On Error : " + _onError);
    pw.println(" Allow p2p : " + p2pAllowed
               + " oncost=" + p2pOnCost
               + " fortransfer=" + p2pForTransfer);
    pw.println(" Allow staging : " + staging);
    pw.println("Allow stage on cost : " + stageOnCost);
    pw.println(" Restore Limit : " + restoreLimit);
    pw.println(" Restore Exceeded : " + _restoreExceeded);
    if (_suspendIncoming) {
        pw.println(" Suspend Incoming : on (not persistent)");
    }
    if (_suspendStaging) {
        pw.println(" Suspend Staging : on (not persistent)");
    }
}
/**
 * Writes the 'rc' commands that reproduce the persistent settings of this
 * component.
 *
 * @param pw writer receiving the setup commands
 */
@Override
public void printSetup(PrintWriter pw)
{
    String restoreLimit = _maxRestore < 0 ? "unlimited" : String.valueOf(_maxRestore);

    pw.print("rc onerror ");
    pw.println(_onError);
    pw.print("rc set max retries ");
    pw.println(_maxRetries);
    pw.print("rc set retry ");
    pw.println(_retryTimer / 1000);
    pw.print("rc set poolpingtimer ");
    pw.println(_checkFilePingTimer / 1000);
    pw.print("rc set max restore ");
    pw.println(restoreLimit);
}
public static final String hh_rc_set_sameHostCopy =
    STRING_NEVER+"|"+STRING_BESTEFFORT+"|"+STRING_NOTCHECKED;
/**
 * Command 'rc set sameHostCopy': stores the given value as property
 * "sameHostCopy" of the partition named "default".
 *
 * @param args command arguments; the first one is the new value
 * @return always the empty string
 */
public String ac_rc_set_sameHostCopy_$_1(Args args)
{
    String mode = args.argv(0);
    _partitionManager.setProperties("default", ImmutableMap.of("sameHostCopy", mode));
    return "";
}
public static final String hh_rc_set_sameHostRetry =
    STRING_NEVER+"|"+STRING_BESTEFFORT+"|"+STRING_NOTCHECKED;
/**
 * Command 'rc set sameHostRetry': stores the given value as property
 * "sameHostRetry" of the partition named "default".
 *
 * @param args command arguments; the first one is the new value
 * @return always the empty string
 */
public String ac_rc_set_sameHostRetry_$_1(Args args)
{
    String mode = args.argv(0);
    _partitionManager.setProperties("default", ImmutableMap.of("sameHostRetry", mode));
    return "";
}
public static final String fh_rc_set_max_restore = "Limit total number of concurrent restores. If the total number of\n" +
"restores reaches this limit then any additional restores will fail;\n" +
"when the total number of restores drops below limit then additional\n" +
"restores will be accepted. Setting the limit to \"0\" will result in\n" +
"all restores failing; setting the limit to \"unlimited\" will remove\n" +
"the limit.";
public static final String hh_rc_set_max_restore = "<maxNumberOfRestores>" ;
/**
 * Command 'rc set max restore': sets the restore limit.
 *
 * @param args command arguments; the first one is either "unlimited" or
 *             a non-negative integer
 * @return always the empty string
 * @throws IllegalArgumentException if the number is negative or not a number
 */
@AffectsSetup
public String ac_rc_set_max_restore_$_1( Args args ){
    String limit = args.argv(0);
    if (limit.equals("unlimited")) {
        // Internally a negative value stands for "no limit".
        _maxRestore = -1;
    } else {
        int parsed = Integer.parseInt(limit);
        if (parsed < 0) {
            throw new IllegalArgumentException("must be >=0");
        }
        _maxRestore = parsed;
    }
    return "";
}
public static final String hh_rc_select = "[<pnfsId> [<errorNumber> [<errorMessage>]] [-remove]]" ;
/**
 * Command 'rc select': maintains the table of per-file errors.
 *
 * Without arguments the table is listed, one "pnfsId error" line per
 * entry. With a pnfsId the entry is either removed (option -remove) or
 * (re)defined; error number and message default to 1 and
 * "Failed-&lt;errorNumber&gt;".
 *
 * @param args command arguments
 * @return the listing, or the empty string after a modification
 */
public String ac_rc_select_$_0_3( Args args ){
    synchronized (_selections) {
        if (args.argc() == 0) {
            StringBuilder listing = new StringBuilder();
            _selections.forEach((id, error) ->
                    listing.append(id.toString())
                           .append(" ")
                           .append(error.toString())
                           .append("\n"));
            return listing.toString();
        }

        boolean remove = args.hasOption("remove");
        PnfsId pnfsId = new PnfsId(args.argv(0));
        if (remove) {
            _selections.remove(pnfsId);
            return "";
        }

        int errorNumber = 1;
        if (args.argc() > 1) {
            errorNumber = Integer.parseInt(args.argv(1));
        }
        String errorMessage;
        if (args.argc() > 2) {
            errorMessage = args.argv(2);
        } else {
            errorMessage = "Failed-" + errorNumber;
        }
        _selections.put(pnfsId, new CacheException(errorNumber, errorMessage));
    }
    return "";
}
public static final String hh_rc_set_warning_path = " # obsolete";
/**
 * Obsolete command; accepted but does nothing.
 *
 * @param args ignored
 * @return always the empty string
 */
public String ac_rc_set_warning_path_$_0_1(Args args)
{
    return "";
}
public static final String fh_rc_set_poolpingtimer =
    " rc set poolpingtimer <timer/seconds> "+
    ""+
    " If set to a nonzero value, the restore handler will frequently"+
    " check the pool whether the request is still pending, failed"+
    " or has been successful" +
    "";
public static final String hh_rc_set_poolpingtimer = "<checkPoolFileTimer/seconds>" ;
/**
 * Command 'rc set poolpingtimer': sets the pool ping timer and notifies
 * the pool ping thread, if it exists.
 *
 * @param args command arguments; the first one is the timer in seconds
 * @return always the empty string
 */
@AffectsSetup
public String ac_rc_set_poolpingtimer_$_1(Args args ){
    long seconds = Long.parseLong(args.argv(0));
    _checkFilePingTimer = 1000L * seconds;

    PoolPingThread pinger = _poolPingThread;
    if (pinger == null) {
        return "";
    }
    synchronized (pinger) {
        pinger.notify();
    }
    return "";
}
public static final String hh_rc_set_retry = "<retryTimer/seconds>" ;
/**
 * Command 'rc set retry': sets the retry timer.
 *
 * @param args command arguments; the first one is the timer in seconds
 * @return always the empty string
 */
@AffectsSetup
public String ac_rc_set_retry_$_1(Args args ){
    long seconds = Long.parseLong(args.argv(0));
    _retryTimer = 1000L * seconds;
    return "";
}
public static final String hh_rc_set_max_retries = "<maxNumberOfRetries>" ;
/**
 * Command 'rc set max retries': sets the maximum number of retries.
 *
 * @param args command arguments; the first one is the new maximum
 * @return always the empty string
 */
@AffectsSetup
public String ac_rc_set_max_retries_$_1(Args args ){
    String maximum = args.argv(0);
    _maxRetries = Integer.parseInt(maximum);
    return "";
}
public static final String hh_rc_suspend = "[on|off] -all" ;
/**
 * Command 'rc suspend': switches the staging suspension on or off; with
 * option -all the suspension of incoming requests is switched as well.
 * Leaving out the mode is the same as "on".
 *
 * @param args command arguments; optional mode "on" or "off"
 * @return always the empty string
 * @throws IllegalArgumentException for any other mode
 */
public String ac_rc_suspend_$_0_1( Args args ){
    boolean all = args.hasOption("all");
    String mode = args.argc() == 0 ? "on" : args.argv(0);

    boolean suspend;
    if (mode.equals("on")) {
        suspend = true;
    } else if (mode.equals("off")) {
        suspend = false;
    } else {
        throw new IllegalArgumentException("Usage : rc suspend [on|off]");
    }

    if (all) {
        _suspendIncoming = suspend;
    }
    _suspendStaging = suspend;
    return "";
}
public static final String hh_rc_onerror = "suspend|fail" ;
/**
 * Command 'rc onerror': selects the error policy.
 *
 * @param args command arguments; the first one is "suspend" or "fail"
 * @return "onerror " followed by the policy now in effect
 * @throws IllegalArgumentException for any other value
 */
@AffectsSetup
public String ac_rc_onerror_$_1(Args args ){
    String policy = args.argv(0);
    boolean known = policy.equals("suspend") || policy.equals("fail");
    if (!known) {
        throw new IllegalArgumentException("Usage : rc onerror fail|suspend");
    }
    _onError = policy;
    return "onerror " + _onError;
}
public static final String fh_rc_retry =
    "NAME\n"+
    " rc retry\n\n"+
    "SYNOPSIS\n"+
    " I) rc retry <pnfsId> [OPTIONS]\n"+
    " II) rc retry * -force-all [OPTIONS]\n\n"+
    "DESCRIPTION\n"+
    " Forces a 'restore request' to be retried.\n"+
    " While using syntax I, a single request is retried,\n"+
    " syntax II retries all requests which reported an error.\n"+
    " If the '-force-all' options is given, all requests are\n"+
    " retried, regardless of their current status.\n";
public static final String hh_rc_retry = "<pnfsId>|* -force-all";
/**
 * Command 'rc retry': retries a single request (looked up by the name it
 * is registered under) or, for "*", every request that reported an error
 * respectively every request when -force-all is given.
 *
 * @param args command arguments
 * @return always the empty string
 * @throws IllegalArgumentException if the named request does not exist
 */
public String ac_rc_retry_$_1( Args args )
{
    boolean forceAll = args.hasOption("force-all");
    String target = args.argv(0);

    if (!target.equals("*")) {
        PoolRequestHandler single;
        synchronized (_handlerHash) {
            single = _handlerHash.get(args.argv(0));
            if (single == null) {
                throw new IllegalArgumentException("Not found : " + args.argv(0));
            }
        }
        single.retry();
        return "";
    }

    // retry() must not be called while the _handlerHash lock is held,
    // hence the snapshot.
    List<PoolRequestHandler> snapshot;
    synchronized (_handlerHash) {
        snapshot = new ArrayList<>(_handlerHash.values());
    }
    for (PoolRequestHandler handler : snapshot) {
        boolean hasError = handler._currentRc != 0;
        if (forceAll || hasError) {
            handler.retry();
        }
    }
    return "";
}
public static final String hh_rc_failed = "<pnfsId> [<errorNumber> [<errorMessage>]]" ;
/**
 * Command 'rc failed': fails the named request. Error number and message
 * default to 1 and "Operator Intervention".
 *
 * @param args command arguments
 * @return always the empty string
 * @throws IllegalArgumentException if the named request does not exist
 */
public String ac_rc_failed_$_1_3( Args args )
{
    int errorNumber = 1;
    if (args.argc() > 1) {
        errorNumber = Integer.parseInt(args.argv(1));
    }
    String errorString = "Operator Intervention";
    if (args.argc() > 2) {
        errorString = args.argv(2);
    }

    PoolRequestHandler handler;
    synchronized (_handlerHash) {
        handler = _handlerHash.get(args.argv(0));
        if (handler == null) {
            throw new IllegalArgumentException("Not found : " + args.argv(0));
        }
    }
    handler.failed(errorNumber, errorString);
    return "";
}
public static final String hh_rc_ls = " [<regularExpression>] [-w] [-l] # lists pending requests" ;
/**
 * Command 'rc ls': lists request handlers.
 *
 * By default every registered handler is printed; with -l the protocol
 * info of each of its queued requests follows. With -w the handlers that
 * wait for a reply are printed instead, each prefixed by the UOID waited
 * for. An optional regular expression must match the whole line for it to
 * be printed.
 *
 * @param args command arguments
 * @return the listing
 */
public String ac_rc_ls_$_0_1( Args args ){
    StringBuilder out = new StringBuilder();
    Pattern filter = null;
    if (args.argc() > 0) {
        filter = Pattern.compile(args.argv(0));
    }
    boolean verbose = args.hasOption("l");

    if (args.hasOption("w")) {
        Map<UOID, PoolRequestHandler> waiting = new HashMap<>();
        synchronized (_messageHash) {
            waiting.putAll(_messageHash);
        }
        for (Map.Entry<UOID, PoolRequestHandler> entry : waiting.entrySet()) {
            UOID uoid = entry.getKey();
            PoolRequestHandler handler = entry.getValue();
            if (handler != null) {
                String line = uoid.toString() + " " + handler.toString();
                if (filter == null || filter.matcher(line).matches()) {
                    out.append(line).append("\n");
                }
            }
        }
        return out.toString();
    }

    List<PoolRequestHandler> handlers;
    synchronized (_handlerHash) {
        handlers = new ArrayList<>(_handlerHash.values());
    }
    for (PoolRequestHandler handler : handlers) {
        if (handler == null) {
            continue;
        }
        String line = handler.toString();
        if (filter != null && !filter.matcher(line).matches()) {
            continue;
        }
        out.append(line).append("\n");
        if (!verbose) {
            continue;
        }
        for (CellMessage envelope : handler.getMessages()) {
            PoolMgrSelectReadPoolMsg request =
                    (PoolMgrSelectReadPoolMsg) envelope.getMessageObject();
            out.append(" ").append(request.getProtocolInfo()).append('\n');
        }
    }
    return out.toString();
}
public static final String hh_xrc_ls = " # lists pending requests (binary)" ;
/**
 * Command 'xrc ls': returns the state of all registered handlers as an
 * array of {@code RestoreHandlerInfo}.
 *
 * @param args ignored
 * @return a {@code RestoreHandlerInfo[]} with one element per handler
 */
public Object ac_xrc_ls( Args args ){
    List<PoolRequestHandler> snapshot;
    synchronized (_handlerHash) {
        snapshot = new ArrayList<>(_handlerHash.values());
    }

    List<RestoreHandlerInfo> infos = new ArrayList<>();
    for (PoolRequestHandler handler : snapshot) {
        if (handler != null) {
            infos.add(handler.getRestoreHandlerInfo());
        }
    }
    return infos.toArray(new RestoreHandlerInfo[0]);
}
/**
 * Accepts a read pool selection request and attaches it to the handler
 * responsible for it, creating the handler if necessary.
 *
 * Requests are grouped under a canonical name built from the pnfs id, the
 * net unit of the client, the protocol unit and, for replication
 * requests, the suffix "-p2p". Replies to replication requests are
 * unexpected here and are dropped with a warning.
 *
 * @param envelope the envelope the request arrived in
 * @param request the request itself
 * @throws IllegalArgumentException if the protocol is unknown to the
 *         selection unit
 */
public void messageArrived(CellMessage envelope,
                           PoolMgrSelectReadPoolMsg request)
    throws PatternSyntaxException, IOException
{
    PnfsId pnfsId = request.getPnfsId();
    ProtocolInfo protocolInfo = request.getProtocolInfo();
    EnumSet<RequestState> allowedStates = request.getAllowedStates();

    String hostName;
    if (protocolInfo instanceof IpProtocolInfo) {
        IpProtocolInfo ipInfo = (IpProtocolInfo) protocolInfo;
        hostName = ipInfo.getSocketAddress().getAddress().getHostAddress();
    } else {
        hostName = "NoSuchHost";
    }
    String netName = _selectionUnit.getNetIdentifier(hostName);

    String protocolNameFromInfo = protocolInfo.getProtocol()+"/"+protocolInfo.getMajorVersion();
    String protocolName = _selectionUnit.getProtocolUnit(protocolNameFromInfo);
    if (protocolName == null) {
        throw new IllegalArgumentException("Protocol not found : "+protocolNameFromInfo);
    }

    boolean enforceP2P = false;
    if (request instanceof PoolMgrReplicateFileMsg) {
        if (request.isReply()) {
            _log.warn("Unexpected PoolMgrReplicateFileMsg arrived (is a reply)");
            return;
        }
        enforceP2P = true;
    }

    String suffix = enforceP2P ? "-p2p" : "";
    String canonicalName = pnfsId +"@"+netName+"-"+protocolName+suffix;

    _log.info( "Adding request for : "+canonicalName );
    synchronized (_handlerHash) {
        PoolRequestHandler handler = _handlerHash.computeIfAbsent(
                canonicalName,
                name -> new PoolRequestHandler(pnfsId, name, allowedStates));
        handler.addRequest(envelope);
    }
}
// replicate a file
public static final String hh_replicate = " <pnfsid> <client IP>";
/**
 * Command 'replicate': looks up the attributes of the file and sends a
 * replication request for it to the cell "PoolManager". The client is
 * described as a DCap client at the given address, port 2222.
 *
 * @param args command arguments: pnfs id and client address
 * @return a confirmation, or the failure message of the attribute lookup
 */
public String ac_replicate_$_2(Args args) {
    try {
        PnfsId pnfsId = new PnfsId(args.argv(0));
        FileAttributes attributes =
                _pnfsHandler.getFileAttributes(pnfsId,
                        PoolMgrReplicateFileMsg.getRequiredAttributes());

        // TODO: call p2p direct
        // For now the request is sent as a regular message to ourselves.
        InetSocketAddress client = new InetSocketAddress(args.argv(1), 2222);
        DCapProtocolInfo protocolInfo = new DCapProtocolInfo("DCap", 3, 0, client);
        PoolMgrReplicateFileMsg replicate =
                new PoolMgrReplicateFileMsg(attributes, protocolInfo);
        sendMessage(new CellMessage(new CellAddressCore("PoolManager"), replicate));
        return "Replication initiated...";
    } catch (CacheException e) {
        return "P2P failed : " + e.getMessage();
    }
}
///////////////////////////////////////////////////////////////
//
// the read io request handler
//
private class PoolRequestHandler {
protected final PnfsId _pnfsId;
protected final List<CellMessage> _messages = new ArrayList<>() ;
protected int _retryCounter;
private final CDC _cdc = new CDC();
private UOID _waitingFor;
private String _status = "[<idle>]";
private volatile RequestState _state = RequestState.ST_INIT;
private final Collection<RequestState> _allowedStates;
private boolean _stagingDenied;
private int _currentRc;
private String _currentRm = "" ;
/**
* The best pool found by askIfAvailable(). In contrast to
* _poolCandidateInfo, _bestPool may be set even when
* askIfAvailable() returns with an error. Eg when the best
* pool is too expensive.
*/
private SelectedPool _bestPool;
/**
* The pool from which to read the file or the pool to which
* to stage the file. Set by askIfAvailable() when it returns
* RT_FOUND, by exercisePool2PoolReply() when it returns
* RT_OK, and by askForStaging(). Also set in the
* stateEngine() at various points.
*/
private volatile SelectedPool _poolCandidate;
/**
* The host name of the pool used for staging.
*
* Serves a critical role when retrying staging to avoid that
* the same stage host is chosen twice in a row.
*/
private String _stageCandidateHost;
/**
* The name of the pool used for staging.
*
* Serves a critical role when retrying staging to avoid that
* the same pool is chosen twice in a row.
*/
private String _stageCandidatePool;
/**
* The destination of a pool to pool transfer. Set by
* askForPoolToPool() when it returns RT_FOUND.
*/
private volatile SelectedPool _p2pDestinationPool;
/**
* The source of a pool to pool transfer. Set by
* askForPoolToPool() when it return RT_FOUND.
*/
private SelectedPool _p2pSourcePool;
private final long _started = System.currentTimeMillis() ;
private final String _name;
private FileAttributes _fileAttributes;
private StorageInfo _storageInfo;
private ProtocolInfo _protocolInfo;
private String _linkGroup;
private String _billingPath;
private String _transferPath;
private boolean _enforceP2P;
private int _destinationFileStatus = Pool2PoolTransferMsg.UNDETERMINED ;
private PoolSelector _poolSelector;
private Partition _parameter = _partitionManager.getDefaultPartition();
/**
* Indicates the next time a TTL of a request message will be
* exceeded.
*/
private long _nextTtlTimeout = Long.MAX_VALUE;
/**
 * Creates a handler without any request attached yet.
 *
 * @param pnfsId the file the handler is responsible for
 * @param canonicalName the name the handler is registered under
 * @param allowedStates the states this handler may enter
 */
public PoolRequestHandler(PnfsId pnfsId, String canonicalName,
                          Collection<RequestState> allowedStates)
{
    this._pnfsId = pnfsId;
    this._name = canonicalName;
    this._allowedStates = allowedStates;
}
//...........................................................
//
// the following methods can be called from outside
// at any time.
//...........................................................
//
// add request is assumed to be synchronized by a higher level.
//
/**
 * Queues another request on this handler. The first request additionally
 * initialises the handler (file attributes, protocol info, retry context,
 * pool selector) and kicks off the state engine.
 *
 * The caller is expected to provide the synchronization.
 *
 * @param message envelope carrying a {@code PoolMgrSelectReadPoolMsg}
 */
public void addRequest( CellMessage message ){
    _messages.add(message);
    // A new request may carry different credentials, so a cached
    // "staging denied" verdict no longer holds.
    _stagingDenied = false;

    long ttl = message.getTtl();
    if (ttl < Long.MAX_VALUE) {
        long expiresAt = System.currentTimeMillis() + ttl;
        if (expiresAt < _nextTtlTimeout) {
            _nextTtlTimeout = expiresAt;
        }
    }

    if (_poolSelector == null) {
        PoolMgrSelectReadPoolMsg first =
                (PoolMgrSelectReadPoolMsg) message.getMessageObject();

        _linkGroup = first.getLinkGroup();
        _protocolInfo = first.getProtocolInfo();
        _fileAttributes = first.getFileAttributes();
        _storageInfo = _fileAttributes.getStorageInfo();
        _billingPath = first.getBillingPath();
        _transferPath = first.getTransferPath();

        _retryCounter = first.getContext().getRetryCounter();
        _stageCandidateHost = first.getContext().getPreviousStageHost();
        _stageCandidatePool = first.getContext().getPreviousStagePool();

        if (first instanceof PoolMgrReplicateFileMsg) {
            _enforceP2P = true;
            _destinationFileStatus =
                    ((PoolMgrReplicateFileMsg) first).getDestinationFileStatus();
        }

        _poolSelector = _poolMonitor.getPoolSelector(_fileAttributes, _protocolInfo, _linkGroup);

        // A null event starts the state engine.
        add(null);
    }
}
/**
 * Returns a snapshot of the queued requests, taken under the
 * {@code _handlerHash} lock.
 *
 * @return a new list holding the currently queued envelopes
 */
public List<CellMessage> getMessages() {
    List<CellMessage> snapshot;
    synchronized (_handlerHash) {
        snapshot = new ArrayList<>(_messages);
    }
    return snapshot;
}
/**
 * Returns the name of the pool this request is currently bound to: the
 * pool candidate if there is one, otherwise the destination of a pool to
 * pool transfer, otherwise {@code POOL_UNKNOWN_STRING}.
 *
 * This method is called from threads other than the state engine (for
 * example on pool status changes). Each volatile field is therefore read
 * exactly once into a local, so that the null check and the dereference
 * see the same value.
 *
 * @return the pool name or {@code POOL_UNKNOWN_STRING}
 */
public String getPoolCandidate()
{
    SelectedPool candidate = _poolCandidate;
    if (candidate != null) {
        return candidate.name();
    }
    SelectedPool destination = _p2pDestinationPool;
    if (destination != null) {
        return destination.name();
    }
    return POOL_UNKNOWN_STRING;
}
/**
 * Returns a printable description of the pool(s) this request is bound
 * to: the name of the pool candidate, or "source->destination" for a
 * pool to pool transfer, or {@code POOL_UNKNOWN_STRING}.
 *
 * Each field is read once into a local so that the null check and the
 * dereference see the same value even if the state engine changes the
 * field concurrently. Both ends of a transfer are rendered through
 * {@code name()}, so the source is shown the same way as the destination.
 *
 * @return the description
 */
private String getPoolCandidateState()
{
    SelectedPool candidate = _poolCandidate;
    if (candidate != null) {
        return candidate.name();
    }
    SelectedPool destination = _p2pDestinationPool;
    if (destination != null) {
        SelectedPool source = _p2pSourcePool;
        String sourceName = (source == null) ? POOL_UNKNOWN_STRING : source.name();
        return sourceName + "->" + destination.name();
    }
    return POOL_UNKNOWN_STRING;
}
/**
 * Packs the current state of this handler into a
 * {@code RestoreHandlerInfo}.
 *
 * @return name, number of queued requests, retry counter, start time,
 *         pool description, status and current error of this handler
 */
public RestoreHandlerInfo getRestoreHandlerInfo(){
    int queued = _messages.size();
    String pools = getPoolCandidateState();
    return new RestoreHandlerInfo(_name, queued, _retryCounter, _started,
                                  pools, _status, _currentRc, _currentRm);
}
/**
 * One-line summary: name, number of queued requests (m=), retry counter
 * (r=), pool description, status and current error code and message.
 */
@Override
public String toString(){
    StringBuilder line = new StringBuilder();
    line.append(_name).append(" m=").append(_messages.size());
    line.append(" r=").append(_retryCounter);
    line.append(" [").append(getPoolCandidateState()).append("] [");
    line.append(_status).append("] ");
    line.append("{").append(_currentRc).append(",").append(_currentRm).append("}");
    return line.toString();
}
//
//
/**
 * Delivers a reply to this handler by queueing it as an event for the
 * state engine.
 *
 * @param message the reply
 */
private void mailForYou(Object message)
{
    add(message);
}
/** Queues an "alive" command for the state engine. */
private void alive()
{
    add(new Object[] { "alive" });
}
/** Queues a "retry" command for the state engine. */
private void retry() {
    add(new Object[] { "retry" });
}
/**
 * Queues a "failed" command for the state engine.
 *
 * @param errorNumber error code, must be positive
 * @param errorMessage error text; when {@code null}, "Error-" followed by
 *        the handler's current return code is used
 * @throws IllegalArgumentException if {@code errorNumber} is not positive
 */
private void failed( int errorNumber , String errorMessage )
{
    if (errorNumber <= 0) {
        throw new IllegalArgumentException("Error number must be > 0");
    }
    String text = (errorMessage != null) ? errorMessage : ("Error-" + _currentRc);
    add(new Object[] { "failed", errorNumber, text });
}
//...................................................................
//
// From here on, methods may only be called from within the state
// mechanism, which is thread safe because only a single thread is
// allowed to run it at a time.
//
/**
 * Stops waiting for an outstanding reply: the registration in
 * {@code _messageHash} is removed and {@code _waitingFor} is reset.
 * Does nothing when no reply is outstanding.
 */
private void clearSteering()
{
    if (_waitingFor == null) {
        return;
    }
    synchronized (_messageHash) {
        _messageHash.remove(_waitingFor);
    }
    _waitingFor = null;
}
/**
 * Records the current error of this handler.
 *
 * @param errorCode the return code
 * @param errorMessage the accompanying message
 */
private void setError(int errorCode, String errorMessage)
{
    this._currentRc = errorCode;
    this._currentRm = errorMessage;
}
/**
 * Asks the given pool to fetch the file and registers this handler as the
 * receiver of the reply. Nothing is sent when a restore limit is set and
 * the number of outstanding messages has reached it.
 *
 * @param pool the pool that shall fetch the file
 * @return {@code true} if the request was sent, {@code false} if the
 *         restore limit prevented it
 */
private boolean sendFetchRequest(SelectedPool pool)
{
    // TODO: Include assumption in request
    CellPath destination = new CellPath(pool.address());
    PoolFetchFileMessage fetch = new PoolFetchFileMessage(pool.name(), _fileAttributes);
    CellMessage envelope = new CellMessage(destination, fetch);

    synchronized (_messageHash) {
        boolean limitReached = _maxRestore >= 0 && _messageHash.size() >= _maxRestore;
        if (limitReached) {
            return false;
        }
        // Replace any registration left over from an earlier message.
        if (_waitingFor != null) {
            _messageHash.remove(_waitingFor);
        }
        _waitingFor = envelope.getUOID();
        _messageHash.put(_waitingFor, this);
        sendMessage(envelope);
        _status = "Staging " + LocalDateTime.now().format(DATE_TIME_FORMAT);
        return true;
    }
}
/**
 * Asks the destination pool to copy the file from the source pool and
 * registers this handler as the receiver of the reply.
 *
 * @param sourcePool the pool to copy from
 * @param destPool the pool to copy to; the request is sent to this pool
 */
private void sendPool2PoolRequest(SelectedPool sourcePool, SelectedPool destPool)
{
    // TODO: Include assumptions in request
    Pool2PoolTransferMsg transfer =
            new Pool2PoolTransferMsg(sourcePool.name(), destPool.name(), _fileAttributes);
    transfer.setDestinationFileStatus(_destinationFileStatus);
    _log.info("[p2p] Sending transfer request: " + transfer);

    CellPath destination = new CellPath(destPool.address());
    CellMessage envelope = new CellMessage(destination, transfer);
    synchronized (_messageHash) {
        // Replace any registration left over from an earlier message.
        if (_waitingFor != null) {
            _messageHash.remove(_waitingFor);
        }
        _waitingFor = envelope.getUOID();
        _messageHash.put(_waitingFor, this);
        sendMessage(envelope);
        _status = "[P2P " + LocalDateTime.now().format(DATE_TIME_FORMAT) + "]";
    }
}
/**
 * Removes request messages whose time to live has been
 * exceeded. Messages are dropped; no reply is sent to the
 * requestor, as we assume it is no longer waiting for the
 * reply.
 *
 * As a side effect {@code _nextTtlTimeout} is recomputed from the
 * messages that remain: it becomes the earliest point in time at which
 * one of them will expire, or {@code Long.MAX_VALUE} if none of them has
 * a finite time to live.
 */
private void expireRequests()
{
    /* Access to _messages is controlled by a lock on
     * _handlerHash.
     */
    synchronized (_handlerHash) {
        long now = System.currentTimeMillis();
        _nextTtlTimeout = Long.MAX_VALUE;
        Iterator<CellMessage> i = _messages.iterator();
        while (i.hasNext()) {
            CellMessage message = i.next();
            long ttl = message.getTtl();
            // Read the age once so that the expiry test and the
            // remaining-time computation agree.
            long age = message.getLocalAge();
            if (age >= ttl) {
                _log.info("Discarding request from "
                          + message.getSourcePath().getCellName()
                          + " because its time to live has been exceeded.");
                i.remove();
            } else if (ttl < Long.MAX_VALUE) {
                /* The message expires once its age reaches its ttl,
                 * i.e. in (ttl - age) from now. Using now + ttl would
                 * ignore the time the message has already been waiting
                 * and schedule the next check too late.
                 */
                _nextTtlTimeout = Math.min(_nextTtlTimeout, now + (ttl - age));
            }
        }
    }
}
/**
 * Replies to queued requests and removes them from the queue. On success
 * at most {@code count} requests are answered with the selected pool; if
 * an error is set, up to 100000 requests are answered with that error
 * regardless of {@code count}.
 *
 * @param count maximum number of requests to answer in the success case
 * @return {@code true} if unanswered requests remain in the queue
 */
private boolean answerRequest(int count) {
    boolean failure = _currentRc != 0;
    // In case of an error there is no point in clumping: everybody
    // gets the bad news right away.
    int limit = failure ? 100000 : count;

    Iterator<CellMessage> pending = _messages.iterator();
    int answered = 0;
    while (answered < limit && pending.hasNext()) {
        CellMessage envelope = pending.next();
        PoolMgrSelectReadPoolMsg reply =
                (PoolMgrSelectReadPoolMsg) envelope.getMessageObject();
        reply.setContext(_retryCounter + 1, _stageCandidateHost, _stageCandidatePool);
        if (_currentRc != 0) {
            reply.setFailed(_currentRc, _currentRm);
        } else {
            reply.setPoolName(_poolCandidate.name());
            reply.setPoolAddress(_poolCandidate.info().getAddress());
            reply.setAssumption(_poolCandidate.assumption());
            reply.setSucceeded();
        }
        envelope.revertDirection();
        sendMessage(envelope);
        pending.remove();
        answered++;
    }
    return pending.hasNext();
}
//
// and the heart ...
//
// Result codes of the helper steps of the state engine (for instance
// askIfAvailable, see the comment block further down).
private static final int RT_OK = 1 ;
private static final int RT_FOUND = 2 ;
private static final int RT_NOT_FOUND = 3 ;
private static final int RT_ERROR = 4 ;
private static final int RT_OUT_OF_RESOURCES = 5 ;
private static final int RT_COST_EXCEEDED = 7 ;
private static final int RT_NOT_PERMITTED = 8 ;
private static final int RT_S_COST_EXCEEDED = 9 ;
private static final int RT_DELAY = 10 ;
// Values for the 'shouldContinue' argument of nextStep(): CONTINUE makes
// the engine run the next step right away, anything else makes it wait
// for the next event.
private static final int CONTINUE = 0 ;
private static final int WAIT = 1 ;
// Event queue of the state engine: add() inserts at the head, stateLoop()
// takes from the tail. Null elements are used, which LinkedList permits.
private final Deque<Object> _fifo = new LinkedList<>() ;
// True while a state engine run is scheduled or active; changed only
// while synchronized on _fifo.
private boolean _stateEngineActive;
// Set by nextStep(); makes stateLoop() call the engine again without
// taking an event from the queue.
private boolean _forceContinue;
private boolean _overwriteCost;
/**
 * Task that runs the state loop of the enclosing handler with the
 * handler's diagnostic context restored. Whatever way the task ends, the
 * engine is marked inactive afterwards so that a later event can start
 * it again.
 */
public class RunEngine implements ExtendedRunnable {

    /** Marks the engine inactive under the event queue lock. */
    private void markInactive() {
        synchronized (_fifo) {
            _stateEngineActive = false;
        }
    }

    @Override
    public void run() {
        try (CDC ignored = _cdc.restore()) {
            stateLoop();
        } finally {
            markInactive();
        }
    }

    @Override
    public void runFailed() {
        markInactive();
    }

    @Override
    public String toString() {
        return PoolRequestHandler.this.toString();
    }
}
/**
 * Queues an event for the state engine and starts the engine unless it is
 * already active. If the executor refuses the task, the engine is marked
 * inactive again and the exception is passed on; the event stays queued.
 *
 * @param obj the event; may be {@code null}
 */
private void add( Object obj ){
    synchronized (_fifo) {
        _log.info( "Adding Object : "+obj );
        _fifo.addFirst(obj);
        if (!_stateEngineActive) {
            _log.info( "Starting Engine" );
            _stateEngineActive = true;
            try {
                RunEngine engine = new RunEngine();
                _executor.execute(new FireAndForgetTask(engine));
            } catch (RuntimeException e) {
                _stateEngineActive = false;
                throw e;
            }
        }
    }
}
/**
 * Feeds the state engine until there is nothing left to do or the thread
 * is interrupted. Each round either takes the oldest queued event or,
 * when the previous step asked to continue, runs the engine with a
 * {@code null} event. Runtime exceptions thrown by a step are logged and
 * the loop goes on.
 */
private void stateLoop(){
    _log.info( "ACTIVATING STATE ENGINE "+_pnfsId+" "+(System.currentTimeMillis()-_started));

    while (!Thread.interrupted()) {
        Object event = null;
        if (!_forceContinue) {
            synchronized (_fifo) {
                if (_fifo.isEmpty()) {
                    _stateEngineActive = false;
                    return;
                }
                event = _fifo.removeLast();
            }
        }
        _forceContinue = false;

        try {
            RequestState mode = _state;
            String what;
            if (event == null) {
                what = "(NULL)";
            } else if (event instanceof Object[]) {
                // Commands are arrays whose first element is the command name.
                what = ((Object[]) event)[0].toString();
            } else {
                what = event.getClass().getName();
            }
            _log.info("StageEngine called in mode " + mode + " with object " + what);

            stateEngine(event);

            _log.info("StageEngine left with: {} ({})",
                      _state, (_forceContinue ? "Continue" : "Wait"));
        } catch (RuntimeException e) {
            _log.error("Unexpected Exception in state loop for " + _pnfsId, e);
        }
    }
}
/**
 * Tells whether this handler may stage the file. Staging is allowed if
 * the allowed states include {@code ST_STAGE} and the stage policy
 * decision point grants it to at least one of the queued requests. A
 * negative verdict is cached in {@code _stagingDenied}.
 *
 * @return {@code true} if staging is permitted
 */
private boolean canStage()
{
    // A cached denial, or a door that disabled staging, settles the
    // matter without consulting the permissions.
    if (_stagingDenied) {
        return false;
    }
    if (!_allowedStates.contains(RequestState.ST_STAGE)) {
        return false;
    }

    // One request with sufficient permissions is enough.
    for (CellMessage envelope : _messages) {
        boolean permitted = false;
        try {
            PoolMgrSelectReadPoolMsg request =
                    (PoolMgrSelectReadPoolMsg) envelope.getMessageObject();
            permitted = _stagePolicyDecisionPoint.canPerformStaging(
                    request.getSubject(),
                    request.getFileAttributes(),
                    request.getProtocolInfo());
        } catch (IOException | PatternSyntaxException e) {
            _log.error("Failed to verify stage permissions: " + e.getMessage());
        }
        if (permitted) {
            return true;
        }
    }

    // Nobody may stage; remember that.
    _stagingDenied = true;
    return false;
}
/**
 * Moves the state machine to {@code state}, unless the current error
 * code or the permission checks force the request to finish instead.
 *
 * The request is sent to {@code ST_DONE} (with forced continuation and
 * status "Failed") when the current return code is NOT_IN_TRASH or
 * FILE_NOT_FOUND, when staging is requested but not permitted, or when
 * the requested state is not among the allowed states. In each of those
 * cases an info message is sent.
 *
 * @param state the state to enter
 * @param shouldContinue {@code CONTINUE} to run the next step
 *        immediately, anything else to wait for the next event
 */
private void nextStep(RequestState state, int shouldContinue)
{
    boolean fileIsGone = _currentRc == CacheException.NOT_IN_TRASH
                         || _currentRc == CacheException.FILE_NOT_FOUND;
    if (fileIsGone) {
        _state = RequestState.ST_DONE;
        _forceContinue = true;
        _status = "Failed";
        sendInfoMessage(_currentRc, "Failed " + _currentRm);
        return;
    }

    if (state == RequestState.ST_STAGE && !canStage()) {
        _state = RequestState.ST_DONE;
        _forceContinue = true;
        _status = "Failed";
        _log.debug("Subject is not authorized to stage");
        _currentRc = CacheException.PERMISSION_DENIED;
        _currentRm = "File not online. Staging not allowed.";
        sendInfoMessage(_currentRc, "Permission denied." + _currentRm);
        return;
    }

    if (!_allowedStates.contains(state)) {
        _state = RequestState.ST_DONE;
        _forceContinue = true;
        _status = "Failed";
        _log.debug("No permission to perform {}", state);
        _currentRc = CacheException.PERMISSION_DENIED;
        _currentRm = "Permission denied.";
        sendInfoMessage(_currentRc, "Permission denied for " + state);
        return;
    }

    // Regular transition.
    _state = state;
    _forceContinue = (shouldContinue == CONTINUE);
    if (_state != RequestState.ST_DONE) {
        // Entering a working state starts with a clean error slate.
        _currentRc = 0;
        _currentRm = "";
    }
}
//
// askIfAvailable :
//
// default : (bestPool=set,overwriteCost=false) unless otherwise mentioned
//
// RT_FOUND :
//
// Because : file is on pool which is allowed and has reasonable cost.
//
// -> DONE
//
// RT_NOT_FOUND :
//
// Because : file is not in cache at all
//
// (bestPool=0)
//
// -> _hasHsmBackend : STAGE
// else : Suspended (1010, pool unavailable)
//
// RT_NOT_PERMITTED :
//
// Because : file not in a permitted pool but somewhere else
//
// (bestPool=0,overwriteCost=true)
//
// -> _p2pAllowed ||
// ! _hasHsmBackend : P2P
// else : STAGE
//
// RT_COST_EXCEEDED :
//
// Because : file is in permitted pools but cost is too high.
//
// -> _p2pOnCost : P2P
// _hasHsmBackend &&
// _stageOnCost : STAGE
// else : 127 , "Cost exceeded (st,p2p not allowed)"
//
// RT_ERROR :
//
// Because : - No entry in configuration Permission Matrix
// - Code Exception
//
// (bestPool=0)
//
// -> STAGE
//
//
//
// askForPoolToPool( overwriteCost ) :
//
// RT_FOUND :
//
// Because : source and destination pool found and cost ok.
//
// -> DONE
//
// RT_NOT_PERMITTED :
//
// Because : - already too many copies (_maxPnfsFileCopies)
// - file already everywhere (no destination found)
// - SAME_HOST_NEVER : but no valid combination found
//
// -> DONE 'using bestPool'
//
// RT_S_COST_EXCEEDED (only if ! overwriteCost ) :
//
// Because : best source pool exceeds 'alert' cost.
//
// -> _hasHsmBackend &&
// _stageOnCost : STAGE
// bestPool == 0 : 194,"File not present in any reasonable pool"
// else : DONE 'using bestPool'
//
// RT_COST_EXCEEDED (only if ! overwriteCost ) :
//
// Because : file is in permitted pools but cost of
// best destination pool exceeds cost of best
// source pool (resp. slope * source).
//
// -> _bestPool == 0 : 192,"File not present in any reasonable pool"
// else : DONE 'using bestPool'
//
// RT_ERROR :
//
// Because : - no source pool (code problem)
// - Code Exception
//
// -> 132,"PANIC : Tried to do p2p, but source was empty"
// or exception text.
//
// askForStaging :
//
// RT_FOUND :
//
// Because : destination pool found and cost ok.
//
// -> DONE
//
// RT_NOT_FOUND :
//
// -> 149 , "No pool candidates available or configured for 'staging'"
// -> 150 , "No cheap candidates available for 'staging'"
//
// RT_ERROR :
//
// Because : - Code Exception
//
/**
 * Executes one step of the request state machine for the current
 * {@code _state}.
 *
 * A {@code null} input means the engine was asked to continue directly
 * after a state change; the step then performs the pool selection work
 * of the current state. An {@code Object[]} input is an operator
 * command and is passed to {@code handleCommandObject}. In the waiting
 * states, a {@code Message} input is the pool's reply and a
 * {@code PingFailure} input signals that the pool no longer knows the
 * request.
 *
 * @param inputObject the event to process, or {@code null} for a
 *        forced continuation
 */
private void stateEngine(Object inputObject)
{
    int rc;
    switch (_state) {
    case ST_INIT:
        _log.debug( "stateEngine: case ST_INIT");
        synchronized (_selections) {
            CacheException ce = _selections.get(_pnfsId);
            if (ce != null) {
                setError(ce.getRc(), ce.getMessage());
                nextStep(RequestState.ST_DONE, CONTINUE);
                return;
            }
        }

        if (inputObject == null) {
            if (_suspendIncoming) {
                setError(1005, "Suspend enforced");
                suspend("Suspended (forced)");
                return;
            }

            if (_enforceP2P) {
                setError(0, "");
                nextStep(RequestState.ST_POOL_2_POOL, CONTINUE);
                return;
            }

            if ((rc = askIfAvailable()) == RT_FOUND) {
                setError(0, "");
                nextStep(RequestState.ST_DONE, CONTINUE);
                _log.info("AskIfAvailable found the object");
                if (_sendHitInfo) {
                    sendHitMsg(_bestPool.info(), true);
                }
            } else if (rc == RT_NOT_FOUND) {
                _log.debug(" stateEngine: RT_NOT_FOUND ");
                if (_parameter._hasHsmBackend && _storageInfo.isStored()) {
                    _log.debug(" stateEngine: parameter has HSM backend and the file is stored on tape ");
                    nextStep(RequestState.ST_STAGE, CONTINUE);
                } else {
                    _log.debug(" stateEngine: case 1: parameter has NO HSM backend or case 2: the HSM backend exists but the file isn't stored on it.");
                    _poolCandidate = null;
                    setError(1010, "Pool unavailable");
                    suspendIfEnabled("Suspended (pool unavailable)");
                }
                if (_sendHitInfo && _poolCandidate == null) {
                    // askIfAvailable() returns RT_NOT_FOUND before it
                    // assigns _bestPool, so _bestPool may be null here.
                    // sendHitMsg accepts a null pool.
                    sendHitMsg(_bestPool == null ? null : _bestPool.info(), false); //VP
                }
            } else if (rc == RT_NOT_PERMITTED) {
                //
                //  if we can't read the file because 'read is prohibited'
                //  we at least must give dCache the chance to copy it
                //  to another pool (not regarding the cost).
                //
                _overwriteCost = true;
                //
                //  if we don't have an hsm we overwrite the p2pAllowed
                //
                nextStep(_parameter._p2pAllowed || !_parameter._hasHsmBackend
                                 ? RequestState.ST_POOL_2_POOL : RequestState.ST_STAGE, CONTINUE);
            } else if (rc == RT_COST_EXCEEDED) {
                if (_parameter._p2pOnCost) {
                    nextStep(RequestState.ST_POOL_2_POOL, CONTINUE);
                } else if (_parameter._hasHsmBackend && _parameter._stageOnCost) {
                    nextStep(RequestState.ST_STAGE, CONTINUE);
                } else {
                    setError(127, "Cost exceeded (st,p2p not allowed)");
                    nextStep(RequestState.ST_DONE, CONTINUE);
                }
            } else if (rc == RT_ERROR) {
                _log.debug( " stateEngine: RT_ERROR");
                nextStep(RequestState.ST_STAGE, CONTINUE);
                _log.info("AskIfAvailable returned an error, will continue with Staging");
            }
        } else if (inputObject instanceof Object[]) {
            handleCommandObject((Object[]) inputObject);
        }
        break;

    case ST_POOL_2_POOL:
    {
        _log.debug( "stateEngine: case ST_POOL_2_POOL");
        if (inputObject == null) {
            if ((rc = askForPoolToPool(_overwriteCost)) == RT_FOUND) {
                nextStep(RequestState.ST_WAITING_FOR_POOL_2_POOL, WAIT);
                _status = "Pool2Pool "+ LocalDateTime.now().format(DATE_TIME_FORMAT);
                setError(0, "");
                if (_sendHitInfo) {
                    sendHitMsg(_p2pSourcePool.info(), true); //VP
                }
            } else if (rc == RT_NOT_PERMITTED) {
                if (_bestPool == null) {
                    if (_enforceP2P) {
                        nextStep(RequestState.ST_DONE, CONTINUE);
                    } else if (_parameter._hasHsmBackend && _storageInfo.isStored()) {
                        _log.info("ST_POOL_2_POOL : Pool to pool not permitted, trying to stage the file");
                        nextStep(RequestState.ST_STAGE, CONTINUE);
                    } else {
                        setError(265, "Pool to pool not permitted");
                        suspendIfEnabled("Suspended");
                    }
                } else {
                    // No copy possible, but a readable (expensive) pool exists.
                    _poolCandidate = _bestPool;
                    _log.info("ST_POOL_2_POOL : Choosing high cost pool "+_poolCandidate.info());
                    setError(0, "");
                    nextStep(RequestState.ST_DONE, CONTINUE);
                }
            } else if (rc == RT_S_COST_EXCEEDED) {
                _log.info("ST_POOL_2_POOL : RT_S_COST_EXCEEDED");
                if (_parameter._hasHsmBackend && _parameter._stageOnCost && _storageInfo.isStored()) {
                    if (_enforceP2P) {
                        nextStep(RequestState.ST_DONE, CONTINUE);
                    } else {
                        _log.info("ST_POOL_2_POOL : staging");
                        nextStep(RequestState.ST_STAGE, CONTINUE);
                    }
                } else {
                    if (_bestPool != null) {
                        _poolCandidate = _bestPool;
                        _log.info("ST_POOL_2_POOL : Choosing high cost pool "+_poolCandidate.info());
                        setError(0, "");
                        nextStep(RequestState.ST_DONE, CONTINUE);
                    } else {
                        //
                        // this can't possibly happen
                        //
                        setError(194, "PANIC : File not present in any reasonable pool");
                        nextStep(RequestState.ST_DONE, CONTINUE);
                    }
                }
            } else if (rc == RT_COST_EXCEEDED) {
                if (_bestPool == null) {
                    //
                    // this can't possibly happen
                    //
                    if (_enforceP2P) {
                        nextStep(RequestState.ST_DONE, CONTINUE);
                    } else {
                        setError(192, "PANIC : File not present in any reasonable pool");
                        nextStep(RequestState.ST_DONE, CONTINUE);
                    }
                } else {
                    _poolCandidate = _bestPool;
                    _log.info(" found high cost object");
                    setError(0, "");
                    nextStep(RequestState.ST_DONE, CONTINUE);
                }
            } else {
                if (_enforceP2P) {
                    nextStep(RequestState.ST_DONE, CONTINUE);
                } else if (_parameter._hasHsmBackend && _storageInfo.isStored()) {
                    nextStep(RequestState.ST_STAGE, CONTINUE);
                } else {
                    suspendIfEnabled("Suspended");
                }
            }
        }
    }
        break;

    case ST_STAGE:
        _log.debug("stateEngine: case ST_STAGE");
        if (inputObject == null) {
            if (_suspendStaging) {
                setError(1005, "Suspend enforced");
                suspend("Suspended Stage (forced)");
                return;
            }

            if ((rc = askForStaging()) == RT_FOUND) {
                nextStep(RequestState.ST_WAITING_FOR_STAGING, WAIT);
                _status = "Staging "+ LocalDateTime.now().format(DATE_TIME_FORMAT);
                setError(0, "");
            } else if (rc == RT_OUT_OF_RESOURCES) {
                _restoreExceeded++;
                outOfResources("Restore");
            } else {
                //
                // we couldn't find a pool for staging
                //
                errorHandler();
            }
        }
        break;

    case ST_WAITING_FOR_POOL_2_POOL:
        _log.debug( "stateEngine: case ST_WAITING_FOR_POOL_2_POOL");
        if (inputObject instanceof Message) {
            if ((rc = exercisePool2PoolReply((Message) inputObject)) == RT_OK) {
                if (_parameter._p2pForTransfer && !_enforceP2P) {
                    setError(CacheException.OUT_OF_DATE,
                             "Pool locations changed due to p2p transfer");
                }
                nextStep(RequestState.ST_DONE, CONTINUE);
            } else {
                _log.info("ST_POOL_2_POOL : Pool to pool reported a problem");
                if (_parameter._hasHsmBackend && _storageInfo.isStored()) {
                    _log.info("ST_POOL_2_POOL : trying to stage the file");
                    nextStep(RequestState.ST_STAGE, CONTINUE);
                } else {
                    errorHandler();
                }
            }
        } else if (inputObject instanceof Object[]) {
            handleCommandObject((Object[]) inputObject);
        } else if (inputObject instanceof PingFailure &&
                   _p2pDestinationPool.address().equals(((PingFailure) inputObject).getPool())) {
            // Only react if the failure concerns the pool we are waiting for.
            _log.info("Ping reported that request died.");
            setError(CacheException.TIMEOUT, "Replication timed out");
            errorHandler();
        }
        break;

    case ST_WAITING_FOR_STAGING:
        _log.debug( "stateEngine: case ST_WAITING_FOR_STAGING" );
        if (inputObject instanceof Message) {
            if ((rc = exerciseStageReply((Message) inputObject)) == RT_OK) {
                if (_parameter._p2pForTransfer) {
                    setError(CacheException.OUT_OF_DATE,
                             "Pool locations changed due to stage");
                }
                nextStep(RequestState.ST_DONE, CONTINUE);
            } else if (rc == RT_DELAY) {
                suspend("Suspended By HSM request");
            } else {
                errorHandler();
            }
        } else if (inputObject instanceof Object[]) {
            handleCommandObject((Object[]) inputObject);
        } else if (inputObject instanceof PingFailure &&
                   _poolCandidate.address().equals(((PingFailure) inputObject).getPool())) {
            // Only react if the failure concerns the pool we are waiting for.
            _log.info("Ping reported that request died.");
            setError(CacheException.TIMEOUT, "Staging timed out");
            errorHandler();
        }
        break;

    case ST_SUSPENDED:
        _log.debug("stateEngine: case ST_SUSPENDED");
        if (inputObject instanceof Object[]) {
            handleCommandObject((Object[]) inputObject);
        }
        return;

    case ST_DONE:
        _log.debug( "stateEngine: case ST_DONE" );
        if (inputObject == null) {
            clearSteering();
            //
            // it is essential that we are not within any other
            // lock when trying to get the handlerHash lock.
            //
            synchronized (_handlerHash) {
                _handlerHash.remove(_name);
            }
            // Requests beyond the first batch are answered with
            // OUT_OF_DATE for as long as answerRequest reports more.
            while (answerRequest(MAX_REQUEST_CLUMPING)) {
                setError(CacheException.OUT_OF_DATE,
                         "Request clumping limit reached");
            }
        }
    }
}
/**
 * Handles an operator command delivered through the event queue.
 *
 * The first array element names the command. "failed" finishes the
 * request with the return code and message found in elements 1 and 2;
 * "retry" resets the retry counter and finishes the request with
 * OUT_OF_DATE; "alive" expires requests once the TTL deadline has
 * passed. Any other command is ignored.
 *
 * @param c the command array; element 0 is the command name
 */
private void handleCommandObject(Object[] c)
{
    String command = c[0].toString();

    if (command.equals("failed")) {
        clearSteering();
        setError((Integer) c[1], c[2].toString());
        nextStep(RequestState.ST_DONE, CONTINUE);
    } else if (command.equals("retry")) {
        _status = "Retry enforced";
        _retryCounter = -1;
        clearSteering();
        setError(CacheException.OUT_OF_DATE, "Operator asked for retry");
        nextStep(RequestState.ST_DONE, CONTINUE);
    } else if (command.equals("alive")) {
        long now = System.currentTimeMillis();
        if (now > _nextTtlTimeout) {
            expireRequests();
        }
    }
}
/**
 * Finishes the request with error code 5 ("Resource temporarily
 * unavailable") and reports the failure via an info message.
 *
 * @param detail name of the exhausted resource, appended to the error
 *        message
 */
private void outOfResources(String detail)
{
    clearSteering();

    String reason = "Resource temporarily unavailable : " + detail;
    setError(5, reason);
    nextStep(RequestState.ST_DONE, CONTINUE);

    _status = "Failed";
    sendInfoMessage(_currentRc, "Failed " + _currentRm);
}
/**
 * Finishes the request with the currently recorded error.
 *
 * If no error has been recorded (return code 0), this is logged and a
 * default error is set so the request never completes as a success
 * through this path.
 */
private void fail()
{
    boolean noErrorRecorded = (_currentRc == 0);
    if (noErrorRecorded) {
        _log.error("Error handler called without an error");
        setError(CacheException.DEFAULT_ERROR_CODE, "Pool selection failed");
    }
    nextStep(RequestState.ST_DONE, CONTINUE);
}
/**
 * Puts the request into {@code ST_SUSPENDED} and reports the
 * suspension via an info message.
 *
 * @param status status text; the current date and time is appended
 */
private void suspend(String status)
{
    _log.debug(" stateEngine: SUSPENDED/WAIT ");

    String timestamp = LocalDateTime.now().format(DATE_TIME_FORMAT);
    _status = status + " " + timestamp;

    nextStep(RequestState.ST_SUSPENDED, WAIT);
    sendInfoMessage(_currentRc, "Suspended (" + _currentRm + ")");
}
/**
 * Suspends the request when the on-error policy is "suspend";
 * otherwise fails it.
 *
 * @param status status text used if the request is suspended
 */
private void suspendIfEnabled(String status)
{
    if (!_onError.equals("suspend")) {
        fail();
        return;
    }
    suspend(status);
}
/**
 * Reacts to a failed step: while the retry counter is below the
 * configured maximum the request is failed, otherwise it is suspended
 * if suspension is enabled.
 */
private void errorHandler()
{
    if (_retryCounter < _maxRetries) {
        fail();
    } else {
        suspendIfEnabled("Suspended");
    }
}
/**
 * Evaluates the pool's reply to a stage (fetch) request and records
 * the resulting return code and message.
 *
 * @param messageArrived the reply; expected to be a
 *        {@code PoolFetchFileMessage}
 * @return {@code RT_OK} if the reply carries return code 0,
 *         {@code RT_DELAY} if the HSM asked for a delay, and
 *         {@code RT_ERROR} for any other return code, an unexpected
 *         message type or a runtime failure
 */
private int exerciseStageReply(Message messageArrived)
{
    try {
        if (!(messageArrived instanceof PoolFetchFileMessage)) {
            throw new CacheException(204, "Invalid message arrived : " +
                                          messageArrived.getClass().getName());
        }

        PoolFetchFileMessage reply = (PoolFetchFileMessage) messageArrived;
        Object error = reply.getErrorObject();
        int rc;

        _currentRc = reply.getReturnCode();
        switch (_currentRc) {
        case 0:
            // best candidate is the right one
            rc = RT_OK;
            break;
        case CacheException.HSM_DELAY_ERROR:
            // The parentheses matter: without them '+' binds tighter
            // than '==', the null check is always false and a missing
            // error object causes a NullPointerException.
            _currentRm = "Suspend by HSM request : " +
                         (error == null ? "No info" : error.toString());
            rc = RT_DELAY;
            break;
        default:
            _currentRm = error == null ? ("Error=" + _currentRc) : error.toString();
            rc = RT_ERROR;
        }
        return rc;
    } catch (CacheException e) {
        _currentRc = e.getRc();
        _currentRm = e.getMessage();
        _log.warn("exerciseStageReply: {} ", e.toString());
        return RT_ERROR;
    } catch (RuntimeException e) {
        _currentRc = 102;
        _currentRm = e.getMessage();
        _log.error("exerciseStageReply", e);
        return RT_ERROR;
    }
}
/**
 * Evaluates the reply to a pool-to-pool transfer request and records
 * the resulting return code and message.
 *
 * On success the p2p destination pool becomes the pool candidate.
 *
 * @param messageArrived the reply; expected to be a
 *        {@code Pool2PoolTransferMsg}
 * @return {@code RT_OK} if the reply carries return code 0,
 *         {@code RT_ERROR} otherwise (including an unexpected message
 *         type or a runtime failure)
 */
private int exercisePool2PoolReply(Message messageArrived)
{
    try {
        if (!(messageArrived instanceof Pool2PoolTransferMsg)) {
            throw new CacheException(205, "Invalid message arrived : " +
                                          messageArrived.getClass().getName());
        }

        Pool2PoolTransferMsg reply = (Pool2PoolTransferMsg) messageArrived;
        _log.info("Pool2PoolTransferMsg replied with : " + reply);

        _currentRc = reply.getReturnCode();
        if (_currentRc != 0) {
            Object error = reply.getErrorObject();
            _currentRm = (error == null) ? ("Error=" + _currentRc) : error.toString();
            return RT_ERROR;
        }

        _poolCandidate = _p2pDestinationPool;
        return RT_OK;
    } catch (CacheException e) {
        _currentRc = e.getRc();
        _currentRm = e.getMessage();
        _log.warn("exercisePool2PoolReply: {}", e.toString());
        return RT_ERROR;
    } catch (RuntimeException e) {
        _currentRc = 102;
        _currentRm = e.getMessage();
        _log.error("exercisePool2PoolReply", e);
        return RT_ERROR;
    }
}
//
// calculate :
// matrix = list of list of active
// pools with file available (sorted)
//
// if empty :
// bestPool = 0 , return NOT_FOUND
//
// else
// determine best pool by
//
// if allowFallback :
// first row for which cost < costCut or
// if not found, pool with lowest cost.
// else
// leftmost pool of first nonzero row
//
// if bestPool > costCut :
// return COST_EXCEEDED
//
// choose best pool from row selected above by :
// if ( minCostCut > 0 ) :
// take all pools of the selected row
// with cost < minCostCut and make hash selection.
// else
// take leftmost pool.
//
// return FOUND
//
// RESULT :
// RT_FOUND :
// file is on pool which is allowed and has reasonable cost.
// RT_NOT_FOUND :
// file is not in cache at all
// RT_NOT_PERMITTED :
// file not in a permitted pool but somewhere else
// RT_COST_EXCEEDED :
// file is in permitted pools but cost is too high.
// RT_ERROR :
// - No entry in configuration Permission Matrix
// - Code Exception
//
/**
 * Asks the pool selector for a read pool and translates the outcome
 * into one of the RT_* result codes.
 *
 * On success (and on a cost exception that carries a pool but does not
 * ask for alternatives) the selected pool becomes both the best pool
 * and the pool candidate, and the error state is cleared.
 *
 * @return RT_FOUND, RT_NOT_FOUND, RT_NOT_PERMITTED, RT_COST_EXCEEDED
 *         or RT_ERROR
 */
private int askIfAvailable()
{
    try {
        _bestPool = _poolSelector.selectReadPool();
        _parameter = _poolSelector.getCurrentPartition();
    } catch (FileNotInCacheException notCached) {
        _log.info("[read] {}", notCached.getMessage());
        return RT_NOT_FOUND;
    } catch (PermissionDeniedCacheException denied) {
        _log.info("[read] {}", denied.getMessage());
        return RT_NOT_PERMITTED;
    } catch (CostException costly) {
        if (costly.getPool() == null) {
            _log.info("[read] {}", costly.getMessage());
            setError(125, costly.getMessage());
            return RT_ERROR;
        }

        // The exception still names a usable pool; remember it.
        _bestPool = costly.getPool();
        _parameter = _poolSelector.getCurrentPartition();
        if (costly.shouldTryAlternatives()) {
            _log.info("[read] {} ({})", costly.getMessage(), _bestPool.name());
            return RT_COST_EXCEEDED;
        }
        // Otherwise fall through and use the pool as found.
    } catch (CacheException failure) {
        String err = "Read pool selection failed: " + failure.getMessage();
        _log.warn(err);
        setError(130, err);
        return RT_ERROR;
    } catch (IllegalArgumentException badArgument) {
        String err = "Read pool selection failed:" + badArgument.getMessage();
        _log.error(err);
        setError(130, err);
        return RT_ERROR;
    } catch (RuntimeException bug) {
        _log.error("Read pool selection failed", bug);
        setError(130, "Read pool selection failed: " + bug.toString());
        return RT_ERROR;
    } finally {
        long elapsed = System.currentTimeMillis() - _started;
        _log.info("[read] Took {} ms", elapsed);
    }

    _poolCandidate = _bestPool;
    setError(0, "");
    return RT_FOUND;
}
//
// Result :
// FOUND :
// valid source/destination pair found fitting all constraints.
// NOT_PERMITTED :
// - already too many copies (_maxPnfsFileCopies)
// - file already everywhere (no destination found)
// - SAME_HOST_NEVER : but no valid combination found
// COST_EXCEEDED :
// - slope == 0 : all destination pools > costCut (p2p)
// else : (best destination) > ( slope * source )
// S_COST_EXCEEDED :
// - all source pools > alert
// ERROR
// - no source pool (code problem)
//
/**
 * Asks the pool selector for a source/destination pair and, if one is
 * found, sends the pool-to-pool request.
 *
 * Every failure records the error via {@code setError} before the
 * corresponding result code is returned.
 *
 * @param overwriteCost passed through to the pool selector
 * @return RT_FOUND, RT_NOT_PERMITTED, RT_S_COST_EXCEEDED,
 *         RT_COST_EXCEEDED or RT_ERROR
 */
private int askForPoolToPool(boolean overwriteCost)
{
    try {
        Partition.P2pPair pair = _poolSelector.selectPool2Pool(overwriteCost);

        _p2pSourcePool = pair.source;
        _p2pDestinationPool = pair.destination;
        _log.info("[p2p] source={};dest={}", _p2pSourcePool, _p2pDestinationPool);

        sendPool2PoolRequest(_p2pSourcePool, _p2pDestinationPool);
        return RT_FOUND;
    } catch (PermissionDeniedCacheException denied) {
        setError(denied.getRc(), denied.getMessage());
        _log.info("[p2p] {}", denied.toString());
        return RT_NOT_PERMITTED;
    } catch (SourceCostException sourceTooCostly) {
        setError(sourceTooCostly.getRc(), sourceTooCostly.getMessage());
        _log.info("[p2p] {}", sourceTooCostly.getMessage());
        return RT_S_COST_EXCEEDED;
    } catch (DestinationCostException destinationTooCostly) {
        setError(destinationTooCostly.getRc(), destinationTooCostly.getMessage());
        _log.info("[p2p] {}", destinationTooCostly.getMessage());
        return RT_COST_EXCEEDED;
    } catch (CacheException failure) {
        setError(failure.getRc(), failure.getMessage());
        _log.warn("[p2p] {}", failure.getMessage());
        return RT_ERROR;
    } catch (IllegalArgumentException badArgument) {
        setError(128, badArgument.getMessage());
        _log.error("[p2p] {}", badArgument.getMessage());
        return RT_ERROR;
    } catch (RuntimeException bug) {
        setError(128, bug.getMessage());
        _log.error("[p2p] contact support@dcache.org", bug);
        return RT_ERROR;
    } finally {
        long elapsed = System.currentTimeMillis() - _started;
        _log.info("[p2p] Selection took {} ms", elapsed);
    }
}
//
// FOUND :
// - pool candidate found
// NOT_FOUND :
// - no pools configured
// - pools configured but not active
// - no pools left after subtracting primary candidate.
// OUT_OF_RESOURCES :
// - too many requests queued
//
/**
 * Asks the pool selector for a stage pool and sends the fetch request
 * to it.
 *
 * The selected pool is remembered as the pool candidate, and its name
 * and host are passed to the selector on the next attempt.
 *
 * @return RT_FOUND, RT_OUT_OF_RESOURCES (fetch request not accepted),
 *         RT_NOT_FOUND (cache exception from the selector) or RT_ERROR
 */
private int askForStaging()
{
    try {
        SelectedPool selected =
                _poolSelector.selectStagePool(_stageCandidatePool, _stageCandidateHost);

        _poolCandidate = selected;
        _stageCandidatePool = selected.name();
        _stageCandidateHost = selected.hostName();
        _log.info("[staging] poolCandidate -> {}", _poolCandidate.info());

        if (sendFetchRequest(_poolCandidate)) {
            setError(0, "");
            return RT_FOUND;
        }
        return RT_OUT_OF_RESOURCES;
    } catch (CostException costly) {
        if (costly.getPool() == null) {
            _log.info("[stage] {}", costly.getMessage());
            setError(125, costly.getMessage());
            return RT_ERROR;
        }

        // NOTE(review): this path reports RT_FOUND without calling
        // sendFetchRequest - confirm that this is intended.
        _poolCandidate = costly.getPool();
        _stageCandidatePool = costly.getPool().name();
        _stageCandidateHost = costly.getPool().hostName();
        return RT_FOUND;
    } catch (CacheException failure) {
        setError(failure.getRc(), failure.getMessage());
        _log.warn("[stage] {}", failure.getMessage());
        return RT_NOT_FOUND;
    } catch (IllegalArgumentException badArgument) {
        setError(128, badArgument.getMessage());
        _log.error("[stage] {}", badArgument.getMessage());
        return RT_ERROR;
    } catch (RuntimeException bug) {
        setError(128, bug.getMessage());
        _log.error("[stage] contact support@dcache.org", bug);
        return RT_ERROR;
    } finally {
        long elapsed = System.currentTimeMillis() - _started;
        _log.info("[stage] Selection took {} ms", elapsed);
    }
}
/**
 * Sends a warning info message about this request's file to billing.
 *
 * @param rc the return code to report
 * @param infoMessage the message text to report
 */
private void sendInfoMessage(int rc, String infoMessage)
{
    WarningPnfsFileInfoMessage warning =
            new WarningPnfsFileInfoMessage("PoolManager", getCellAddress(),
                                           _pnfsId, rc, infoMessage);

    // File details.
    warning.setStorageInfo(_fileAttributes.getStorageInfo());
    warning.setFileSize(_fileAttributes.getSize());

    // Paths used for accounting.
    warning.setBillingPath(_billingPath);
    warning.setTransferPath(_transferPath);

    _billing.notify(warning);
}
/**
 * Sends a pool hit info message for this request's file to billing.
 *
 * @param pool the pool the message refers to; may be {@code null}, in
 *        which case the message carries no pool address
 * @param cached whether the file was found cached
 */
private void sendHitMsg(PoolInfo pool, boolean cached)
{
    PoolHitInfoMessage hit =
            new PoolHitInfoMessage(pool == null ? null : pool.getAddress(), _pnfsId);

    hit.setBillingPath(_billingPath);
    hit.setTransferPath(_transferPath);
    hit.setFileCached(cached);

    hit.setStorageInfo(_fileAttributes.getStorageInfo());
    hit.setFileSize(_fileAttributes.getSize());
    hit.setProtocolInfo(_protocolInfo);

    _billing.notify(hit);
}
}
/**
 * Installs a stage policy decision point created from the given
 * configuration path.
 *
 * @param path passed to the {@code CheckStagePermission} constructor
 */
public void setStageConfigurationFile(String path)
{
    CheckStagePermission decisionPoint = new CheckStagePermission(path);
    _stagePolicyDecisionPoint = decisionPoint;
}
/**
 * Background thread that periodically checks whether the pools still
 * know about the p2p and stage requests this container is waiting for.
 *
 * Each round, handlers started more than one ping interval ago are
 * grouped by the pool they wait for. The p2p destination pools are
 * sent "pp ls" and the stage pools "rh ls"; a handler whose PNFS id
 * does not appear in the reply of its pool (or whose pool could not be
 * queried) receives a {@code PingFailure} event.
 *
 * The thread terminates when interrupted.
 */
private class PoolPingThread extends Thread
{
    private PoolPingThread()
    {
        super("Container-ping");
    }

    @Override
    public void run()
    {
        try {
            while (!Thread.interrupted()) {
                try {
                    synchronized (this) {
                        wait(_checkFilePingTimer);
                    }

                    long now = System.currentTimeMillis();

                    // Determine which pools to query
                    List<PoolRequestHandler> list;
                    synchronized (_handlerHash) {
                        list = new ArrayList<>(_handlerHash.values());
                    }

                    Multimap<CellAddressCore, PoolRequestHandler> p2pRequests = ArrayListMultimap.create();
                    Multimap<CellAddressCore, PoolRequestHandler> stageRequests = ArrayListMultimap.create();
                    for (PoolRequestHandler handler : list) {
                        if (handler._started < now - _checkFilePingTimer) {
                            SelectedPool pool;
                            switch (handler._state) {
                            case ST_WAITING_FOR_POOL_2_POOL:
                                pool = handler._p2pDestinationPool;
                                if (pool != null) {
                                    p2pRequests.put(pool.address(), handler);
                                }
                                break;
                            case ST_WAITING_FOR_STAGING:
                                pool = handler._poolCandidate;
                                if (pool != null) {
                                    stageRequests.put(pool.address(), handler);
                                }
                                break;
                            }
                        }
                    }

                    // Send query to all pools. The two kinds of query are
                    // kept in separate maps: a pool may be a p2p
                    // destination and a stage pool at the same time, and
                    // with a single map keyed by pool address the "rh ls"
                    // future would replace the "pp ls" future.
                    Map<CellAddressCore, ListenableFuture<String>> p2pFutures =
                            sendListCommand(p2pRequests, "pp ls");
                    Map<CellAddressCore, ListenableFuture<String>> stageFutures =
                            sendListCommand(stageRequests, "rh ls");

                    // Collect replies
                    failMissingRequests(p2pFutures, p2pRequests);
                    failMissingRequests(stageFutures, stageRequests);
                } catch (RuntimeException e) {
                    _log.error("Pool ping failed", e);
                }
            }
        } catch (InterruptedException ignored) {
            // Interruption is the signal to stop pinging.
        }
    }

    /**
     * Sends {@code command} to every pool that has at least one request
     * in {@code requests} and returns the pending replies by pool.
     */
    private Map<CellAddressCore, ListenableFuture<String>> sendListCommand(
            Multimap<CellAddressCore, PoolRequestHandler> requests, String command)
    {
        Map<CellAddressCore, ListenableFuture<String>> futures = new HashMap<>();
        for (CellAddressCore pool : requests.keySet()) {
            futures.put(pool, _poolStub.send(new CellPath(pool), command, String.class));
        }
        return futures;
    }

    /**
     * Waits for each pool's reply and queues a {@code PingFailure} for
     * every handler whose PNFS id is not contained in the reply of the
     * pool it waits for. A pool that cannot be reached is treated as
     * having sent an empty reply.
     */
    private void failMissingRequests(Map<CellAddressCore, ListenableFuture<String>> futures,
                                     Multimap<CellAddressCore, PoolRequestHandler> requests)
            throws InterruptedException
    {
        for (Map.Entry<CellAddressCore, ListenableFuture<String>> entry : futures.entrySet()) {
            String reply;
            try {
                reply = CellStub.get(entry.getValue());
            } catch (NoRouteToCellException | CacheException ignored) {
                reply = "";
            }

            CellAddressCore address = entry.getKey();
            for (PoolRequestHandler handler : requests.get(address)) {
                if (!reply.contains(handler._pnfsId.toString())) {
                    handler.add(new PingFailure(address));
                }
            }
        }
    }
}
/**
 * Event queued for a request handler when a pool's request listing
 * does not contain the request the handler is waiting for.
 */
private static class PingFailure
{
    /** Address of the pool that was queried. */
    private final CellAddressCore pool;

    private PingFailure(CellAddressCore pool)
    {
        this.pool = pool;
    }

    /**
     * @return the address of the pool that was queried
     */
    public CellAddressCore getPool()
    {
        return this.pool;
    }
}
}