/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License, Version 1.0 only
* (the "License"). You may not use this file except in compliance
* with the License.
*
* You can obtain a copy of the license at
* trunk/opends/resource/legal-notices/OpenDS.LICENSE
* or https://OpenDS.dev.java.net/OpenDS.LICENSE.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at
* trunk/opends/resource/legal-notices/OpenDS.LICENSE. If applicable,
* add the following below this CDDL HEADER, with the fields enclosed
* by brackets "[]" replaced with your own identifying information:
* Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
*
* Copyright 2006-2010 Sun Microsystems, Inc.
* Portions copyright 2011-2013 ForgeRock AS
*/
package org.opends.server.replication.server;
import static org.opends.messages.ReplicationMessages.*;
import static org.opends.server.loggers.ErrorLogger.logError;
import static org.opends.server.loggers.debug.DebugLogger.debugEnabled;
import static org.opends.server.loggers.debug.DebugLogger.getTracer;
import static org.opends.server.util.StaticUtils.stackTraceToSingleLineString;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;
import org.opends.messages.Category;
import org.opends.messages.Message;
import org.opends.messages.MessageBuilder;
import org.opends.messages.Severity;
import org.opends.server.admin.std.server.MonitorProviderCfg;
import org.opends.server.api.MonitorProvider;
import org.opends.server.core.DirectoryServer;
import org.opends.server.loggers.debug.DebugTracer;
import org.opends.server.replication.common.*;
import org.opends.server.replication.protocol.*;
import org.opends.server.types.*;
import org.opends.server.util.TimeThread;
import com.sleepycat.je.DatabaseException;
/**
* This class defines an in-memory cache that is used to store
* the messages that have been received from an LDAP server or
* from another replication server and that should be forwarded to
* other servers.
* <p>
* The size of the cache is set by configuration.
* If the cache becomes bigger than the configured size, the older messages
* are removed and, should they be needed again, must be read from the backing
* file.
* <p>
* It runs a thread that is responsible for saving the received messages
* to disk and for trimming them. The decision to trim can be based on disk
* space or on the age of the message.
*/
public class ReplicationServerDomain extends MonitorProvider<MonitorProviderCfg>
{
private final String baseDn;
// The Status analyzer that periodically verifies if the connected DSs are
// late or not
private StatusAnalyzer statusAnalyzer = null;
// The monitoring publisher that periodically sends monitoring messages to the
// topology
private MonitoringPublisher monitoringPublisher = null;
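/*
* The following map contains one DataServerHandler for each replica
* (directory server) to which we are currently publishing, keyed by the
* server ID of that directory server.
*
* Note: the values are handler objects; the balanced tree (TreeSet) of
* pending updates that earlier wording of this comment referred to, whose
* first element was the next change to push to that particular server, is
* not what this map stores.
*
* An entry is added to the map when a new server registers with this
* replication server.
*/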
private final Map<Integer, DataServerHandler> directoryServers =
new ConcurrentHashMap<Integer, DataServerHandler>();
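/*
* This map contains one ReplicationServerHandler for each replication server
* with which we are connected (so normally all the replication servers),
* keyed by the server ID of that replication server.
*
* Note: the values are handler objects; the balanced tree (TreeSet) of
* pending updates that earlier wording of this comment referred to, whose
* first element was the next change to push to that particular server, is
* not what this map stores.
*
* An entry is added to the map when a new replication server registers
* with this replication server.
*/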
private final Map<Integer, ReplicationServerHandler> replicationServers =
new ConcurrentHashMap<Integer, ReplicationServerHandler>();
private final ConcurrentLinkedQueue<MessageHandler> otherHandlers =
new ConcurrentLinkedQueue<MessageHandler>();
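/*
* This map contains one DbHandler for each server ID that originated
* changes: the updates received from each LDAP server are added to the
* DbHandler registered under the server ID of their change number.
*/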
private final Map<Integer, DbHandler> sourceDbHandlers =
new ConcurrentHashMap<Integer, DbHandler>();
private ReplicationServer replicationServer;
// GenerationId management
private volatile long generationId = -1;
private boolean generationIdSavedStatus = false;
// The tracer object for the debug logger.
private static final DebugTracer TRACER = getTracer();
// Monitor data management
/**
* The monitor data consolidated over the topology.
*/
private volatile MonitorData monitorData = new MonitorData();
// This lock guards against multiple concurrent monitor data recalculation.
private final Object pendingMonitorLock = new Object();
// Guarded by pendingMonitorLock.
private long monitorDataLastBuildDate = 0;
// The set of replication servers which are already known to be slow to send
// monitor data.
//
// Guarded by pendingMonitorLock.
private final Set<Integer> monitorDataLateServers = new HashSet<Integer>();
// This lock serializes updates to the pending monitor data.
private final Object pendingMonitorDataLock = new Object();
// Monitor data which is currently being calculated.
//
// Guarded by pendingMonitorDataLock.
private MonitorData pendingMonitorData;
// A set containing the IDs of servers from which we are currently expecting
// monitor responses. When a response is received from a server we remove the
// ID from this table, and count down the latch if the ID was in the table.
//
// Guarded by pendingMonitorDataLock.
private final Set<Integer> pendingMonitorDataServerIDs =
new HashSet<Integer>();
// This latch is non-null and is used in order to count incoming responses as
// they arrive. Since incoming response may arrive at any time, even when
// there is no pending monitor request, access to the latch must be guarded.
//
// Guarded by pendingMonitorDataLock.
private CountDownLatch pendingMonitorDataLatch = null;
// TODO: Remote monitor data cache lifetime is 500ms/should be configurable
private final long monitorDataLifeTime = 500;
/**
* The needed info for each received assured update message we are waiting
* acks for.
* Key: a change number matching a received update message which requested
* assured mode usage (either safe read or safe data mode)
* Value: The object holding every info needed about the already received acks
* as well as the acks to be received.
* For more details, see ExpectedAcksInfo and its sub classes javadoc.
*/
private final ConcurrentHashMap<ChangeNumber, ExpectedAcksInfo> waitingAcks =
new ConcurrentHashMap<ChangeNumber, ExpectedAcksInfo>();
// The timer used to run the timeout code (timer tasks) for the assured update
// messages we are waiting acks for.
private Timer assuredTimeoutTimer = null;
// Counter used to purge the timer tasks references in assuredTimeoutTimer,
// every n number of treated assured messages
private int assuredTimeoutTimerPurgeCounter = 0;
private ServerState ctHeartbeatState = null;
/**
* Creates a new ReplicationServerDomain associated to the DN baseDn.
*
* @param baseDn The baseDn associated to the ReplicationServerDomain.
* @param replicationServer the ReplicationServer that created this
* replicationServer cache.
*/
public ReplicationServerDomain(
String baseDn, ReplicationServer replicationServer)
{
this.baseDn = baseDn;
this.replicationServer = replicationServer;
this.assuredTimeoutTimer = new Timer("Replication server RS("
+ replicationServer.getServerId()
+ ") assured timer for domain \"" + baseDn + "\"", true);
DirectoryServer.registerMonitorProvider(this);
}
/**
* Adds an update that has been received to the list of
* updates that must be forwarded to all other servers.
* <p>
* The update is first recorded against the source handler (server state and
* incoming-message counter), then added to the DbHandler registered for the
* server ID found in the update's change number, and finally posted to the
* replication servers (only when the source is a data server), to the
* directory servers other than the source, and to the other subscribing
* handlers.
* <p>
* If the DbHandler for the originating server has to be created and that
* creation fails with a DatabaseException, the error is logged, the
* replication server is shut down and the method returns without forwarding
* the update.
*
* @param update The update that has been received.
* @param sourceHandler The ServerHandler for the server from which the
* update was received
* @throws IOException When an IO exception happens during the update
* processing.
*/
public void put(UpdateMsg update, ServerHandler sourceHandler)
throws IOException
{
ChangeNumber cn = update.getChangeNumber();
int id = cn.getServerId();
sourceHandler.updateServerState(update);
sourceHandler.incrementInCount();
// No generation id known yet for this domain (negative value): adopt the
// one of the server that sent the update.
if (generationId < 0)
{
generationId = sourceHandler.getGenerationId();
}
/*
* If this is an assured message (a message requesting an ack), we must
* construct the ExpectedAcksInfo object with the right number of expected
* acks before posting the message to the writers. Otherwise some writers may
* have time to post, receive the ack and increment the received ack counter
* (kept in the ExpectedAcksInfo object) and we could think the
* acknowledgment is fully processed although it may not be (some other acks
* from other servers have not yet arrived). So for that purpose we do a
* pre-loop to determine to whom we will post an assured message.
* Whether the assured mode is safe read or safe data, we do not support the
* assured replication feature across topologies with different group ids
* anyway. The assured feature ensures assured replication based on the
* same locality (group id). For instance in a double data center deployment
* (2 group ids in use) with assured replication enabled, an assured message
* sent from data center 1 (group id = 1) will be sent to servers of both
* data centers, but one will request and wait for acks only from servers of
* data center 1.
*/
boolean assuredMessage = update.isAssured();
PreparedAssuredInfo preparedAssuredInfo = null;
if (assuredMessage)
{
// Assured feature is supported starting from replication protocol V2
if (sourceHandler.getProtocolVersion() >=
ProtocolVersion.REPLICATION_PROTOCOL_V2)
{
// According to assured sub-mode, prepare structures to keep track of
// the acks we are interested in.
AssuredMode assuredMode = update.getAssuredMode();
if (assuredMode == AssuredMode.SAFE_DATA_MODE)
{
sourceHandler.incrementAssuredSdReceivedUpdates();
preparedAssuredInfo = processSafeDataUpdateMsg(update, sourceHandler);
} else if (assuredMode == AssuredMode.SAFE_READ_MODE)
{
sourceHandler.incrementAssuredSrReceivedUpdates();
preparedAssuredInfo = processSafeReadUpdateMsg(update, sourceHandler);
} else
{
// Unknown assured mode: should never happen. Log it and fall back to
// handling the update as a non-assured one.
Message errorMsg = ERR_RS_UNKNOWN_ASSURED_MODE.get(
Integer.toString(replicationServer.getServerId()),
assuredMode.toString(), baseDn, update.toString());
logError(errorMsg);
assuredMessage = false;
}
} else
{
// Source speaks a protocol older than V2: handle the update as a
// non-assured one.
assuredMessage = false;
}
}
// Look for the dbHandler that is responsible for the LDAP server which
// generated the change, creating it if this is the first change seen from
// that server.
DbHandler dbHandler;
synchronized (sourceDbHandlers)
{
dbHandler = sourceDbHandlers.get(id);
if (dbHandler == null)
{
try
{
dbHandler = replicationServer.newDbHandler(id, baseDn);
generationIdSavedStatus = true;
} catch (DatabaseException e)
{
/*
* Because of a database problem we can't save any more changes
* from at least one LDAP server.
* This replicationServer therefore can't do its job properly anymore
* and needs to close all its connections and shut itself down.
*/
MessageBuilder mb = new MessageBuilder();
mb.append(ERR_CHANGELOG_SHUTDOWN_DATABASE_ERROR.get());
mb.append(stackTraceToSingleLineString(e));
logError(mb.toMessage());
replicationServer.shutdown();
return;
}
sourceDbHandlers.put(id, dbHandler);
}
}
// Add the update to the DbHandler of the server that originated it
dbHandler.add(update);
List<Integer> expectedServers = null;
if (assuredMessage)
{
expectedServers = preparedAssuredInfo.expectedServers;
if (expectedServers != null)
{
// Store the expected acks info into the global map.
// The code for processing reception of acks for this update will update
// the info kept in this object and, if enough acks are received, it will
// send back the final ack to the requester and remove the object from
// this map
// OR
// The following timer will time out and send a timeout ack to the
// requester if the acks are not received in time. The timer will also
// remove the object from this map.
waitingAcks.put(cn, preparedAssuredInfo.expectedAcksInfo);
// Arm timer for this assured update message (wait for acks until it
// times out)
AssuredTimeoutTask assuredTimeoutTask = new AssuredTimeoutTask(cn);
assuredTimeoutTimer.schedule(assuredTimeoutTask,
replicationServer.getAssuredTimeout());
// Purge timer every 100 treated messages
// NOTE(review): the counter is a plain int incremented without any
// synchronization; if put() can run concurrently, increments may be lost,
// which would only change how often purge() is called -- confirm.
assuredTimeoutTimerPurgeCounter++;
if ((assuredTimeoutTimerPurgeCounter % 100) == 0)
assuredTimeoutTimer.purge();
}
}
/*
* The update message equivalent to the originally received update message,
* but with the assured flag disabled. This message is the one that should be
* sent to servers not eligible for assured mode.
* We need a clone-like copy of the original message with the assured flag
* off, to be posted to servers we don't want to wait for an ack from (not
* normal status servers or servers with a different group id). This must be
* done because the posted message is a reference, so each writer queue gets
* the same reference; thus, changing the assured flag of an object changes
* it for every reference posted on every writer queue. That is why we need
* a message version with the assured flag on and another one with the
* assured flag off. The copy is created lazily, the first time it is needed.
*/
NotAssuredUpdateMsg notAssuredUpdate = null;
/*
* Push the message to the replication servers (only when the update comes
* from a data server)
*/
if (sourceHandler.isDataServer())
{
for (ReplicationServerHandler handler : replicationServers.values())
{
/*
* Ignore updates to RS with bad gen id
* (no system managed status for a RS)
*/
if ( (generationId>0) && (generationId != handler.getGenerationId()) )
{
if (debugEnabled())
TRACER.debugInfo("In " + "Replication Server " +
replicationServer.getReplicationPort() + " " +
baseDn + " " + replicationServer.getServerId() +
" for dn " + baseDn + ", update " +
update.getChangeNumber().toString() +
" will not be sent to replication server " +
Integer.toString(handler.getServerId()) + " with generation id " +
Long.toString(handler.getGenerationId()) +
" different from local " +
"generation id " + Long.toString(generationId));
continue;
}
if (assuredMessage)
{
// Assured mode: post an assured or not assured matching update
// message according to what has been computed for the destination
// server
if ((expectedServers != null) && expectedServers.contains(handler.
getServerId()))
{
handler.add(update, sourceHandler);
} else
{
if (notAssuredUpdate == null)
{
notAssuredUpdate = new NotAssuredUpdateMsg(update);
}
handler.add(notAssuredUpdate, sourceHandler);
}
} else
{
handler.add(update, sourceHandler);
}
}
}
/*
* Push the message to the LDAP servers
*/
for (DataServerHandler handler : directoryServers.values())
{
// Don't forward the change to the server that just sent it
if (handler == sourceHandler)
{
continue;
}
/*
* Ignore updates to DS in BAD_GEN_ID_STATUS or FULL_UPDATE_STATUS
*
* The RSD lock should not be taken here as it is acceptable to have a
* delay between the time the server has a wrong status and the time we
* detect it: the updates that manage to pass during this time will have
* no impact on the remote server. But it is worthwhile not to saturate
* the network uselessly if the updates are not necessary, so this check to
* stop sending updates is useful anyway. Not taking the RSD lock
* allows better performance in normal mode (most of the time).
*/
ServerStatus dsStatus = handler.getStatus();
if ( (dsStatus == ServerStatus.BAD_GEN_ID_STATUS) ||
(dsStatus == ServerStatus.FULL_UPDATE_STATUS) )
{
if (debugEnabled())
{
if (dsStatus == ServerStatus.BAD_GEN_ID_STATUS)
TRACER.debugInfo("In " + this +
" for dn " + baseDn + ", update " +
update.getChangeNumber().toString() +
" will not be sent to directory server " +
Integer.toString(handler.getServerId()) + " with generation id " +
Long.toString(handler.getGenerationId()) +
" different from local " +
"generation id " + Long.toString(generationId));
if (dsStatus == ServerStatus.FULL_UPDATE_STATUS)
TRACER.debugInfo("In RS " +
replicationServer.getServerId() +
" for dn " + baseDn + ", update " +
update.getChangeNumber().toString() +
" will not be sent to directory server " +
Integer.toString(handler.getServerId()) +
" as it is in full update");
}
continue;
}
if (assuredMessage)
{
// Assured mode: post an assured or not assured matching update
// message according to what has been computed for the destination
// server
if ((expectedServers != null) && expectedServers.contains(handler.
getServerId()))
{
handler.add(update, sourceHandler);
} else
{
if (notAssuredUpdate == null)
{
notAssuredUpdate = new NotAssuredUpdateMsg(update);
}
handler.add(notAssuredUpdate, sourceHandler);
}
} else
{
handler.add(update, sourceHandler);
}
}
// Push the message to the other subscribing handlers; they always receive
// the original update object, whatever the assured processing above decided
for (MessageHandler handler : otherHandlers) {
handler.add(update, sourceHandler);
}
}
/**
* Helper class to be the return type of a method that processes a just
* received assured update message:
* - processSafeReadUpdateMsg
* - processSafeDataUpdateMsg
* This is a facility to pack many interesting returned object.
*/
private class PreparedAssuredInfo
{
/**
* The list of servers identified as servers we are interested in
* receiving acks from. If this list is not null, then expectedAcksInfo
* should be not null.
* Servers that are not in this list are servers not eligible for an ack
* request.
*
*/
public List<Integer> expectedServers = null;
/**
* The constructed ExpectedAcksInfo object to be used when acks will be
* received. Null if expectedServers is null.
*/
public ExpectedAcksInfo expectedAcksInfo = null;
}
/**
 * Prepares the ack bookkeeping for an assured update received in Safe Read
 * mode. The method works out which connected servers an ack must be awaited
 * from; when there is none, the ack is sent back to the sender right away.
 * This is a helper for the put method: see put for the overall picture.
 *
 * @param update The just received assured update to process.
 * @param sourceHandler The ServerHandler for the server from which the
 *                      update was received
 * @return A PreparedAssuredInfo whose two fields are set when at least one
 *         server is expected to ack, and left null otherwise.
 * @throws IOException When an IO exception happens while sending the
 *                     immediate ack.
 */
private PreparedAssuredInfo processSafeReadUpdateMsg(
    UpdateMsg update, ServerHandler sourceHandler) throws IOException
{
  final ChangeNumber changeNumber = update.getChangeNumber();
  final byte localGroupId = replicationServer.getGroupId();
  final byte originGroupId = sourceHandler.getGroupId();
  final List<Integer> ackCandidates = new ArrayList<Integer>();
  final List<Integer> degradedServers = new ArrayList<Integer>();

  // Assured feature does not cross different group ids
  final boolean sameGroup = (originGroupId == localGroupId);

  // RSs are only candidates when the update comes straight from a DS
  if (sameGroup && sourceHandler.isDataServer())
  {
    for (ReplicationServerHandler rs : replicationServers.values())
    {
      if (rs.getGroupId() != localGroupId)
      {
        // No ack expected from a RS with different group id
        continue;
      }
      // No ack expected from a RS with bad gen id
      if ((generationId > 0) && (generationId == rs.getGenerationId()))
      {
        ackCandidates.add(rs.getServerId());
      }
    }
  }

  if (sameGroup)
  {
    for (DataServerHandler ds : directoryServers.values())
    {
      if (ds == sourceHandler)
      {
        // Don't forward the change to the server that just sent it
        continue;
      }
      if (ds.getGroupId() != localGroupId)
      {
        // No ack expected from a DS with different group id
        continue;
      }

      final ServerStatus status = ds.getStatus();
      if (status == ServerStatus.NORMAL_STATUS)
      {
        ackCandidates.add(ds.getServerId());
      }
      else if (status == ServerStatus.DEGRADED_STATUS)
      {
        // No ack expected from a DS with wrong status, but a degraded one
        // is remembered so that it can be reported
        degradedServers.add(ds.getServerId());
      }
      // Any other status (BAD_GEN_ID_STATUS or FULL_UPDATE_STATUS) is
      // deliberately not reported as an error to the update maker: it is
      // plain administration (for instance a new server being configured in
      // the topology goes through bad gen id then full update) and would
      // only pollute logs and monitoring.
    }
  }

  final PreparedAssuredInfo result = new PreparedAssuredInfo();
  if (ackCandidates.isEmpty())
  {
    // No eligible servers found, send the ack immediately
    sourceHandler.send(new AckMsg(changeNumber));
  }
  else
  {
    // Some other acks to wait for
    result.expectedAcksInfo = new SafeReadExpectedAcksInfo(changeNumber,
        sourceHandler, ackCandidates, degradedServers);
    result.expectedServers = ackCandidates;
  }
  return result;
}
/**
 * Prepares the ack bookkeeping for an assured update received in Safe Data
 * mode. Depending on the safe data level and on where the update comes from,
 * the ack is either sent back immediately or awaited from the eligible
 * replication servers.
 * This is a helper for the put method: see put for the overall picture.
 *
 * @param update The just received assured update to process.
 * @param sourceHandler The ServerHandler for the server from which the
 *                      update was received
 * @return A PreparedAssuredInfo whose two fields are set when at least one
 *         server is expected to ack, and left null otherwise.
 * @throws IOException When an IO exception happens while sending an
 *                     immediate ack.
 */
private PreparedAssuredInfo processSafeDataUpdateMsg(
    UpdateMsg update, ServerHandler sourceHandler) throws IOException
{
  final ChangeNumber changeNumber = update.getChangeNumber();
  final byte safeDataLevel = update.getSafeDataLevel();
  final byte localGroupId = replicationServer.getGroupId();
  final byte originGroupId = sourceHandler.getGroupId();

  boolean needsFurtherAcks = false;
  if (safeDataLevel < (byte) 1)
  {
    // Should never happen
    logError(ERR_UNKNOWN_ASSURED_SAFE_DATA_LEVEL.get(
        Integer.toString(replicationServer.getServerId()),
        Byte.toString(safeDataLevel), baseDn, update.toString()));
  }
  else if ((originGroupId == localGroupId)
      // Assured feature does not cross different group IDS
      && (generationId > 0)
      // Ignore assured updates from wrong generationId servers
      && (generationId == sourceHandler.getGenerationId()))
  {
    if (sourceHandler.isDataServer())
    {
      if (safeDataLevel == (byte) 1)
      {
        // Level 1 coming from a DS: reaching this RS is enough, so the ack
        // goes back immediately and no more acks are awaited.
        sourceHandler.send(new AckMsg(changeNumber));
      }
      else
      {
        // Level > 1: further acks are needed. The message will be posted in
        // assured mode to eligible servers with its safe data level
        // unchanged; a remote RS uses that value to decide whether it must
        // send an ack (level > 1) or not (level = 1).
        needsFurtherAcks = true;
      }
    }
    else if (safeDataLevel > (byte) 1)
    {
      // A RS sent us the safe data message: level 1 has already been
      // reached there, so there is no further ack to wait for and only
      // level > 1 has to be answered.
      sourceHandler.send(new AckMsg(changeNumber));
    }
  }

  final PreparedAssuredInfo result = new PreparedAssuredInfo();
  if (!needsFurtherAcks)
  {
    return result;
  }

  // From here on: level > 1. Look for RS eligible for assured.
  final List<Integer> eligibleServers = new ArrayList<Integer>();
  if (sourceHandler.isDataServer())
  {
    for (ReplicationServerHandler rs : replicationServers.values())
    {
      if (rs.getGroupId() != localGroupId)
      {
        // No ack expected from a RS with different group id
        continue;
      }
      // No ack expected from a RS with bad gen id
      if ((generationId > 0) && (generationId == rs.getGenerationId()))
      {
        eligibleServers.add(rs.getServerId());
      }
    }
  }

  final int eligibleCount = eligibleServers.size();
  if (eligibleCount <= 0)
  {
    // level > 1 and source is a DS but no eligible servers found, send the
    // ack immediately
    sourceHandler.send(new AckMsg(changeNumber));
    return result;
  }

  // Some other acks to wait for.
  // The level is a best effort thing: when fewer eligible servers are
  // available than requested, lower the number of expected acks so that we
  // do not time out at every assured SD update (for instance if a RS has had
  // its gen id reset).
  final int requestedLevel = update.getSafeDataLevel();
  final int additionalServersNeeded = requestedLevel - 1;
  final byte effectiveLevel;
  if (eligibleCount >= additionalServersNeeded)
  {
    // Keep level as it was
    effectiveLevel = (byte) requestedLevel;
  }
  else
  {
    // Change level to match what's available
    effectiveLevel = (byte) (eligibleCount + 1);
  }
  result.expectedAcksInfo = new SafeDataExpectedAcksInfo(changeNumber,
      sourceHandler, effectiveLevel, eligibleServers);
  result.expectedServers = eligibleServers;
  return result;
}
/**
 * Handles an ack sent by a server for an assured update this domain is
 * tracking. When the ack completes the set of expected acks, the final ack is
 * built and sent back to the server that requested the assured update.
 *
 * @param ack The ack message received.
 * @param ackingServer The server handler of the server that sent the ack.
 */
public void processAck(AckMsg ack, ServerHandler ackingServer)
{
  // Retrieve the expected acks info for the update matching the original
  // sent update.
  final ChangeNumber changeNumber = ack.getChangeNumber();
  final ExpectedAcksInfo pendingInfo = waitingAcks.get(changeNumber);
  if (pendingInfo == null)
  {
    // The timeout occurred for the update matching this change number and
    // the ack with timeout error has probably already been sent.
    return;
  }

  // Prevent concurrent access from processAck() or AssuredTimeoutTask.run()
  synchronized (pendingInfo)
  {
    if (pendingInfo.isCompleted())
    {
      // Timeout code is sending a timeout ack, do nothing and let it remove
      // the object from the map
      return;
    }

    final boolean lastExpectedAck =
        pendingInfo.processReceivedAck(ackingServer, ack);
    if (!lastExpectedAck)
    {
      // Still waiting for acks from other servers
      return;
    }

    // This was the last ack we were waiting for: the object is no longer
    // needed in the map, and the final ack goes to the original server now.
    waitingAcks.remove(changeNumber);
    final AckMsg finalAck = pendingInfo.createAck(false);
    final ServerHandler requester = pendingInfo.getRequesterServer();
    try
    {
      requester.send(finalAck);
    }
    catch (IOException e)
    {
      // An error happened trying to send back an ack to the server.
      // Log an error and close the connection to this server.
      MessageBuilder mb = new MessageBuilder();
      mb.append(ERR_RS_ERROR_SENDING_ACK.get(
          Integer.toString(replicationServer.getServerId()),
          Integer.toString(requester.getServerId()),
          changeNumber.toString(), baseDn));
      mb.append(stackTraceToSingleLineString(e));
      logError(mb.toMessage());
      stopServer(requester, false);
    }

    // Mark the ack info object as completed to prevent the timeout code
    // from running in parallel
    pendingInfo.completed();
  }
}
/**
 * The code run when the timeout occurs while waiting for acks of the
 * eligible servers. This basically sends a timeout ack (with any additional
 * error info) to the original server that sent an assured update message,
 * and increments the timeout counters of the servers involved.
 * <p>
 * This is an inner (non static) class on purpose: it works on the waiting
 * acks map and the handler maps of the enclosing domain.
 */
private class AssuredTimeoutTask extends TimerTask
{
  /** Change number of the assured update we are waiting acks for. */
  private final ChangeNumber cn;

  /**
   * Constructor for the timer task.
   * @param cn The changenumber of the assured update we are waiting acks for
   */
  public AssuredTimeoutTask(ChangeNumber cn)
  {
    this.cn = cn;
  }

  /**
   * Run when the assured timeout for an assured update message we are waiting
   * acks for occurs. Does nothing if the update is no longer in the waiting
   * acks map or if its ack info has already been marked completed.
   */
  @Override
  public void run()
  {
    ExpectedAcksInfo expectedAcksInfo = waitingAcks.get(cn);
    if (expectedAcksInfo == null)
    {
      // Nothing is waiting for this change number any more
      return;
    }

    // Same monitor as processAck(): only one of the two may finish the job
    synchronized (expectedAcksInfo)
    {
      if (expectedAcksInfo.isCompleted())
      {
        // processAck() code is sending the ack, do nothing and let it
        // remove the object from the map
        return;
      }
      // Remove the object from the map as it is no longer needed
      waitingAcks.remove(cn);
      // Create the timeout ack and send it to the server the assured
      // update message came from
      AckMsg finalAck = expectedAcksInfo.createAck(true);
      ServerHandler origServer = expectedAcksInfo.getRequesterServer();
      if (debugEnabled())
        TRACER.debugInfo(
          "In RS " + Integer.toString(replicationServer.getServerId()) +
          " for " + baseDn +
          ", sending timeout for assured update with change " + " number " +
          cn.toString() + " to server id " +
          Integer.toString(origServer.getServerId()));
      try
      {
        origServer.send(finalAck);
      } catch (IOException e)
      {
        // An error happened trying to send back an ack to the server.
        // Log an error and close the connection to this server.
        MessageBuilder mb = new MessageBuilder();
        mb.append(ERR_RS_ERROR_SENDING_ACK.get(
            Integer.toString(replicationServer.getServerId()),
            Integer.toString(origServer.getServerId()),
            cn.toString(), baseDn));
        mb.append(stackTraceToSingleLineString(e));
        logError(mb.toMessage());
        stopServer(origServer, false);
      }

      // Increment assured counters of the requester
      boolean safeRead =
        (expectedAcksInfo instanceof SafeReadExpectedAcksInfo);
      if (safeRead)
      {
        origServer.incrementAssuredSrReceivedUpdatesTimeout();
      } else
      {
        if (origServer.isDataServer())
        {
          origServer.incrementAssuredSdReceivedUpdatesTimeout();
        }
      }

      // Retrieve expected servers in timeout to increment their counter
      List<Integer> serversInTimeout = expectedAcksInfo.getTimeoutServers();
      for (Integer serverId : serversInTimeout)
      {
        ServerHandler expectedServerInTimeout =
          directoryServers.get(serverId);
        if (expectedServerInTimeout != null)
        {
          // Was a DS
          if (safeRead)
          {
            expectedServerInTimeout.incrementAssuredSrSentUpdatesTimeout();
          }
          // else: no SD update sent to a DS (meaningless)
          continue;
        }

        expectedServerInTimeout = replicationServers.get(serverId);
        if (expectedServerInTimeout == null)
        {
          // Server disappeared? Let's forget about it.
          continue;
        }
        // Was a RS
        if (safeRead)
        {
          expectedServerInTimeout.incrementAssuredSrSentUpdatesTimeout();
        } else
        {
          expectedServerInTimeout.incrementAssuredSdSentUpdatesTimeout();
        }
      }

      // Mark the ack info object as completed to prevent processAck() code
      // from running in parallel
      expectedAcksInfo.completed();
    }
  }
}
/**
 * Stops operations with every connected replication server whose address URL
 * is part of the given collection.
 *
 * @param replServers the replication servers for which
 *                    we want to stop operations
 */
public void stopReplicationServers(Collection<String> replServers)
{
  for (ReplicationServerHandler rsHandler : replicationServers.values())
  {
    final String serverUrl = rsHandler.getServerAddressURL();
    if (!replServers.contains(serverUrl))
    {
      // Not one of the servers we were asked to stop
      continue;
    }
    stopServer(rsHandler, false);
  }
}
/**
 * Stops operations with every server this domain is connected with: the
 * replication servers first, then the directory (LDAP) servers.
 *
 * @param shutdown A boolean indicating if the stop is due to a
 *                 shutdown condition.
 */
public void stopAllServers(boolean shutdown)
{
  // First the sessions with the other replication servers...
  for (ReplicationServerHandler rsHandler : replicationServers.values())
  {
    stopServer(rsHandler, shutdown);
  }

  // ...then the sessions with the LDAP servers
  for (DataServerHandler dsHandler : directoryServers.values())
  {
    stopServer(dsHandler, shutdown);
  }
}
/**
 * Checks that no DS is already connected with the same server id as the
 * given handler. When there is one, an error naming both handlers is logged.
 * <p>
 * The map of directory servers is read only once: looking the entry up a
 * second time after a containsKey() check could return null if the entry was
 * removed in between, since the map is a concurrent one.
 *
 * @param handler the DS we want to check
 * @return true if this is not a duplicate server
 */
public boolean checkForDuplicateDS(DataServerHandler handler)
{
  final int serverId = handler.getServerId();
  final DataServerHandler alreadyConnected = directoryServers.get(serverId);
  if (alreadyConnected == null)
  {
    return true;
  }

  // looks like two connected LDAP servers have the same serverId
  Message message = ERR_DUPLICATE_SERVER_ID.get(
      replicationServer.getMonitorInstanceName(),
      alreadyConnected.toString(),
      handler.toString(), serverId);
  logError(message);
  return false;
}
/**
* Stop operations with a given server.
* <p>
* The work is only done when {@code handler.engageShutdown()} returns false
* (presumably only for the first caller - confirm in ServerHandler). Unless
* {@code shutdown} is true, the domain lock is acquired for the duration of
* the processing and released afterwards. Depending on which collection the
* handler is registered in (replication servers, directory servers or other
* handlers), the handler is unregistered and shut down; for RS and DS
* handlers the generation id is then possibly reset and, when not shutting
* down, the topology information is re-published to the peers.
* <p>
* Any exception raised during the processing is logged and not propagated.
*
* @param handler the server for which we want to stop operations.
* @param shutdown A boolean indicating if the stop is due to a
* shutdown condition. When true, the domain lock is not taken and no
* topology message is sent.
*/
public void stopServer(ServerHandler handler, boolean shutdown)
{
if (debugEnabled())
TRACER.debugInfo(
"In " + this.replicationServer.getMonitorInstanceName() +
" domain=" + this + " stopServer() on the server handler " +
handler.getMonitorInstanceName());
/*
* We must prevent deadlock on replication server domain lock, when for
* instance this code is called from dying ServerReader but also dying
* ServerWriter at the same time, or from a thread that wants to shut down
* the handler. So use a thread safe flag to know if the job must be done
* or not (is already being processed or not).
*/
if (!handler.engageShutdown())
// Only do this once (prevent other thread to enter here again)
{
if (!shutdown)
{
try
{
// Acquire lock on domain (see more details in comment of start()
// method of ServerHandler)
lock();
}
catch (InterruptedException ex)
{
// We can't deal with this here, so re-interrupt thread so that it is
// caught during subsequent IO.
// NOTE(review): the handler is left registered on this path although
// engageShutdown() has already been called - confirm this is intended.
Thread.currentThread().interrupt();
return;
}
}
try
{
// Stop useless monitoring publisher if no more RS or DS in domain
// (the count is taken before the handler is unregistered, so 1 means
// only one RS/DS handler is still registered at this point)
if ( (directoryServers.size() + replicationServers.size() )== 1)
{
if (debugEnabled())
TRACER.debugInfo("In " +
replicationServer.getMonitorInstanceName() +
" remote server " + handler.getMonitorInstanceName() + " is " +
"the last RS/DS to be stopped: stopping monitoring publisher");
stopMonitoringPublisher();
}
if (handler.isReplicationServer())
{
// Peer replication server: only processed if still registered
if (replicationServers.containsKey(handler.getServerId()))
{
unregisterServerHandler(handler);
handler.shutdown();
// Check if generation id has to be reset
mayResetGenerationId();
if (!shutdown)
{
// Warn our DSs that a RS or DS has quit (does not use this
// handler as already removed from list)
buildAndSendTopoInfoToDSs(null);
}
}
} else if (directoryServers.containsKey(handler.getServerId()))
{
// If this is the last DS for the domain,
// shutdown the status analyzer
if (directoryServers.size() == 1)
{
if (debugEnabled())
TRACER.debugInfo("In " +
replicationServer.getMonitorInstanceName() +
" remote server " + handler.getMonitorInstanceName() +
" is the last DS to be stopped: stopping status analyzer");
stopStatusAnalyzer();
}
unregisterServerHandler(handler);
handler.shutdown();
// Check if generation id has to be reset
mayResetGenerationId();
if (!shutdown)
{
// Update the remote replication servers with our list
// of connected LDAP servers
buildAndSendTopoInfoToRSs();
// Warn our DSs that a RS or DS has quit (does not use this
// handler as already removed from list)
buildAndSendTopoInfoToDSs(null);
}
} else if (otherHandlers.contains(handler))
{
// Neither a registered RS nor a registered DS: plain message handler
unRegisterHandler(handler);
handler.shutdown();
}
}
catch(Exception e)
{
// Log and continue: the finally block below must still release the lock
logError(Message.raw(Category.SYNC, Severity.NOTICE,
stackTraceToSingleLineString(e)));
}
finally
{
// The lock was only taken when not shutting down (see above)
if (!shutdown)
{
release();
}
}
}
}
/**
 * Stops the provided message handler: under the domain lock, the handler is
 * unregistered and shut down if it is currently part of the other handlers
 * of this domain. Nothing is done when
 * {@code handler.engageShutdown()} returns true.
 *
 * @param handler The handler to stop.
 */
public void stopServer(MessageHandler handler)
{
  if (debugEnabled())
  {
    TRACER.debugInfo(
        "In " + this.replicationServer.getMonitorInstanceName()
        + " domain=" + this + " stopServer() on the message handler "
        + handler.getMonitorInstanceName());
  }

  /*
   * We must prevent deadlock on replication server domain lock, when for
   * instance this code is called from dying ServerReader but also dying
   * ServerWriter at the same time, or from a thread that wants to shut down
   * the handler. So use a thread safe flag to know if the job must be done
   * or not (is already being processed or not).
   */
  if (handler.engageShutdown())
  {
    // Only do this once (prevent other thread to enter here again)
    return;
  }

  try
  {
    // Acquire lock on domain (see more details in comment of start() method
    // of ServerHandler)
    lock();
  }
  catch (InterruptedException ex)
  {
    // We can't deal with this here, so re-interrupt thread so that it is
    // caught during subsequent IO.
    Thread.currentThread().interrupt();
    return;
  }

  try
  {
    final boolean registered = otherHandlers.contains(handler);
    if (registered)
    {
      unRegisterHandler(handler);
      handler.shutdown();
    }
  }
  catch (Exception e)
  {
    logError(Message.raw(Category.SYNC, Severity.NOTICE,
        stackTraceToSingleLineString(e)));
  }
  finally
  {
    release();
  }
}
/**
 * Removes the provided handler from the collection it belongs to: the
 * replication servers when the handler is a replication server, the
 * directory servers otherwise. The entry is removed by server id.
 *
 * @param handler the provided handler to unregister.
 */
private void unregisterServerHandler(ServerHandler handler)
{
  final boolean isRS = handler.isReplicationServer();
  final int serverId = handler.getServerId();
  if (isRS)
  {
    replicationServers.remove(serverId);
    return;
  }
  directoryServers.remove(serverId);
}
/**
 * Resets the generationId of this domain to -1 when all of the following
 * hold:
 * <ul>
 * <li>no directory server is directly connected to this domain,</li>
 * <li>no connected replication server having our generationId reports
 * remote LDAP servers,</li>
 * <li>the generationId has never been saved,</li>
 * <li>the generationId is not already -1.</li>
 * </ul>
 * It therefore strongly relies on the directoryServers and
 * replicationServers collections being up to date.
 */
private void mayResetGenerationId()
{
  if (debugEnabled())
  {
    TRACER.debugInfo(
        "In RS " + this.replicationServer.getMonitorInstanceName() +
        " for " + baseDn + " " +
        " mayResetGenerationId generationIdSavedStatus=" +
        generationIdSavedStatus);
  }

  // If there is no more any LDAP server connected to this domain in the
  // topology and the generationId has never been saved, then we can reset
  // it and the next LDAP server to connect will become the new reference.
  boolean dsFoundInTopology = !directoryServers.isEmpty();
  if (dsFoundInTopology)
  {
    if (debugEnabled())
    {
      TRACER.debugInfo(
          "In RS " + this.replicationServer.getMonitorInstanceName() +
          " for " + baseDn + " " +
          " has servers connected to it - will not reset generationId");
    }
  }
  else
  {
    for (ReplicationServerHandler peer : replicationServers.values())
    {
      if (generationId != peer.getGenerationId())
      {
        // Peers with another generationId are not taken into account
        if (debugEnabled())
        {
          TRACER.debugInfo(
              "In RS " + this.replicationServer.getMonitorInstanceName() +
              " for " + baseDn + " " +
              " mayResetGenerationId skip RS" + peer.getMonitorInstanceName() +
              " that has different genId");
        }
        continue;
      }

      if (peer.hasRemoteLDAPServers())
      {
        dsFoundInTopology = true;
        if (debugEnabled())
        {
          TRACER.debugInfo(
              "In RS " + this.replicationServer.getMonitorInstanceName() +
              " for " + baseDn + " " +
              " mayResetGenerationId RS" + peer.getMonitorInstanceName() +
              " has servers connected to it - will not reset generationId");
        }
        break;
      }
    }
  }

  if (dsFoundInTopology
      || this.generationIdSavedStatus
      || generationId == -1)
  {
    return;
  }
  changeGenerationId(-1, false);
}
/**
 * Checks that a remote RS is not already connected to this hosting RS.
 *
 * @param handler The handler for the remote RS.
 * @return true when no RS with the same server id is registered; false
 *         when a RS with the same server id and the same address URL is
 *         already registered (the new connection should be dropped).
 * @throws DirectoryException when a RS with the same server id but a
 *         different address URL is already registered.
 */
public boolean checkForDuplicateRS(ReplicationServerHandler handler)
    throws DirectoryException
{
  final ReplicationServerHandler connected =
      replicationServers.get(handler.getServerId());
  if (connected == null)
  {
    return true;
  }

  final boolean sameAddress =
      connected.getServerAddressURL().equals(handler.getServerAddressURL());
  if (sameAddress)
  {
    // this is the same server, this means that our ServerStart messages
    // have been sent at about the same time and 2 connections
    // have been established.
    // Silently drop this connection.
    return false;
  }

  // looks like two replication servers have the same serverId
  // log an error message and drop this connection.
  Message message = ERR_DUPLICATE_REPLICATION_SERVER_ID.get(
      replicationServer.getMonitorInstanceName(),
      connected.getServerAddressURL(),
      handler.getServerAddressURL(),
      handler.getServerId());
  throw new DirectoryException(ResultCode.OTHER, message);
}
/**
 * Get the next update that need to be sent to a given LDAP server.
 * This simply delegates to {@code handler.take()}; according to the
 * original documentation the call blocks when no update is available or
 * when dependencies do not allow to send the next available change.
 *
 * @param handler The server handler for the target directory server.
 *
 * @return the update that must be forwarded
 */
public UpdateMsg take(ServerHandler handler)
{
  /*
   * Get the balanced tree that we use to sort the changes to be
   * sent to the replica from the cookie
   *
   * The next change to send is always the first one in the tree
   * So this methods simply need to check that dependencies are OK
   * and update this replicaId RUV
   */
  return handler.take();
}
/**
 * Returns the address URLs of the replication servers connected to this
 * server, as a newly created set in the iteration order of the
 * replication server handlers.
 *
 * @return the set of connected servers
 */
public Set<String> getChangelogs()
{
  final LinkedHashSet<String> urls = new LinkedHashSet<String>();
  final Iterator<ReplicationServerHandler> handlers =
      replicationServers.values().iterator();
  while (handlers.hasNext())
  {
    urls.add(handlers.next().getServerAddressURL());
  }
  return urls;
}
/**
 * Return a set containing the ids of the servers that produced updates and
 * are known by this replicationServer from all over the topology, whether
 * directly connected or connected to another RS.
 * <p>
 * The returned set is the key set of the DB handlers map itself, not a
 * copy.
 *
 * @return a set containing the servers known by this replicationServer.
 */
public Set<Integer> getServers()
{
  final Set<Integer> knownServerIds = sourceDbHandlers.keySet();
  return knownServerIds;
}
/**
 * Returns as a list of String the LDAP servers connected to us.
 * Each string is the serverID of a connected LDAP server.
 *
 * @return The list of connected LDAP servers
 */
public List<String> getConnectedLDAPservers()
{
  final List<String> serverIds = new ArrayList<String>(0);
  final Iterator<DataServerHandler> handlers =
      directoryServers.values().iterator();
  while (handlers.hasNext())
  {
    final DataServerHandler ds = handlers.next();
    serverIds.add(String.valueOf(ds.getServerId()));
  }
  return serverIds;
}
/**
 * Creates and returns an iterator.
 * When the iterator is not used anymore, the caller MUST call the
 * ReplicationIterator.releaseCursor() method to free the resources
 * and locks used by the ReplicationIterator.
 *
 * @param serverId Identifier of the server for which the iterator is created.
 * @param changeNumber Starting point for the iterator.
 * @return the created ReplicationIterator. Null when no DB is available
 *         for the provided server Id, when the iterator could not be
 *         generated, or when the first call to next() on the new iterator
 *         returns false (its cursor is then released here).
 */
public ReplicationIterator getChangelogIterator(int serverId,
    ChangeNumber changeNumber)
{
  final DbHandler dbHandler = sourceDbHandlers.get(serverId);
  if (dbHandler == null)
  {
    return null;
  }

  final ReplicationIterator iterator;
  try
  {
    iterator = dbHandler.generateIterator(changeNumber);
  }
  catch (Exception e)
  {
    return null;
  }

  if (iterator.next())
  {
    return iterator;
  }
  // Nothing to iterate on: free the cursor before giving up
  iterator.releaseCursor();
  return null;
}
/**
 * Count the number of changes in the replication changelog for the provided
 * serverID, between 2 provided changenumbers.
 *
 * @param serverId Identifier of the server whose changes are counted.
 * @param from lower limit changenumber.
 * @param to upper limit changenumber.
 * @return the number of changes, 0 when no DB handler is registered for
 *         the provided server id.
 */
public int getCount(int serverId,
    ChangeNumber from, ChangeNumber to)
{
  final DbHandler dbHandler = sourceDbHandlers.get(serverId);
  return (dbHandler != null) ? dbHandler.getCount(from, to) : 0;
}
/**
 * Returns the change count for that ReplicationServerDomain, computed as
 * the sum of the change counts of all its DB handlers.
 *
 * @return the change count.
 */
public long getChangesCount()
{
  long total = 0;
  final Iterator<DbHandler> handlers = sourceDbHandlers.values().iterator();
  while (handlers.hasNext())
  {
    total += handlers.next().getChangesCount();
  }
  return total;
}
/**
 * Returns the base DN of this domain.
 *
 * @return the baseDn.
 */
public String getBaseDn()
{
  return this.baseDn;
}
/**
 * Associates the provided DbHandler with the provided serverId, while
 * holding the monitor of the DB handlers map.
 *
 * @param serverId  the serverId for the server to which is
 *                  associated the DbHandler.
 * @param dbHandler the dbHandler associated to the serverId.
 *
 * @throws DatabaseException If a database error happened.
 */
public void setDbHandler(int serverId, DbHandler dbHandler)
    throws DatabaseException
{
  final Integer key = serverId;
  synchronized (sourceDbHandlers)
  {
    sourceDbHandlers.put(key, dbHandler);
  }
}
/**
 * Retrieves the destination handlers for a routable message.
 * <ul>
 * <li>Destination "the closest server": not implemented, the returned list
 * is empty.</li>
 * <li>Destination "all servers": every connected directory server except
 * the sender, plus, when the sender is not a replication server, every
 * connected replication server that has remote LDAP servers.</li>
 * <li>Single destination: the matching connected directory server if any;
 * otherwise, when the sender is a directory server, the replication
 * servers that report the destination as one of their remote LDAP
 * servers.</li>
 * </ul>
 *
 * @param msg The message to route.
 * @param senderHandler The handler of the server that published this message.
 * @return The list of destination handlers, possibly empty.
 */
private List<ServerHandler> getDestinationServers(RoutableMsg msg,
    ServerHandler senderHandler)
{
  final List<ServerHandler> targets = new ArrayList<ServerHandler>();
  final int destination = msg.getDestination();

  if (destination == RoutableMsg.THE_CLOSEST_SERVER)
  {
    // TODO Import from the "closest server" to be implemented
    return targets;
  }

  if (destination == RoutableMsg.ALL_SERVERS)
  {
    if (!senderHandler.isReplicationServer())
    {
      // Send to all replication servers with a least one remote
      // server connected
      for (ReplicationServerHandler rs : replicationServers.values())
      {
        if (rs.hasRemoteLDAPServers())
        {
          targets.add(rs);
        }
      }
    }

    // Sends to all connected LDAP servers, but don't loop on the sender
    for (DataServerHandler ds : directoryServers.values())
    {
      if (ds != senderHandler)
      {
        targets.add(ds);
      }
    }
    return targets;
  }

  // Destination is one server
  final DataServerHandler directTarget = directoryServers.get(destination);
  if (directTarget != null)
  {
    targets.add(directTarget);
    return targets;
  }

  // the targeted server is NOT connected
  // Let's search for the replication server that MAY
  // have the targeted server connected.
  if (senderHandler.isDataServer())
  {
    for (ReplicationServerHandler rs : replicationServers.values())
    {
      if (rs.isRemoteLDAPServer(msg.getDestination()))
      {
        targets.add(rs);
      }
    }
  }
  return targets;
}
/**
* Processes a message coming from one server in the topology
* and potentially forwards it to one or all other servers.
* <p>
* Messages addressed to this replication server itself (other than the
* initialization related messages, which are always routed) are handled
* locally: error messages are logged, monitor requests are answered,
* monitor responses are recorded, and any other message type is rejected
* with an ErrorMsg sent back to the sender.
* <p>
* All other messages are forwarded to the handlers computed by
* getDestinationServers(). When no destination is found, or when
* forwarding fails, an ErrorMsg is sent back to the sender; if that also
* fails the involved handlers are stopped.
*
* @param msg The message received and to be processed.
* @param senderHandler The server handler of the server that emitted
* the message.
*/
public void process(RoutableMsg msg, ServerHandler senderHandler)
{
// Test the message for which a ReplicationServer is expected
// to be the destination
if (!(msg instanceof InitializeRequestMsg) &&
!(msg instanceof InitializeTargetMsg) &&
!(msg instanceof InitializeRcvAckMsg) &&
!(msg instanceof EntryMsg) &&
!(msg instanceof DoneMsg) &&
(msg.getDestination() == this.replicationServer.getServerId()))
{
if (msg instanceof ErrorMsg)
{
// Error reported to this RS: just log its details
ErrorMsg errorMsg = (ErrorMsg) msg;
logError(ERR_ERROR_MSG_RECEIVED.get(
errorMsg.getDetails()));
} else if (msg instanceof MonitorRequestMsg)
{
// If the request comes from a Directory Server we need to
// build the full list of all servers in the topology
// and send back a MonitorMsg with the full list of all the servers
// in the topology.
if (senderHandler.isDataServer())
{
// Monitoring information requested by a DS
// (the answer is sent from the request's destination, i.e. this RS,
// to the request's sender)
MonitorMsg monitorMsg = createGlobalTopologyMonitorMsg(
msg.getDestination(), msg.getSenderID(), monitorData);
if (monitorMsg != null)
{
try
{
senderHandler.send(monitorMsg);
}
catch (IOException e)
{
// the connection was closed.
}
}
return;
} else
{
// Monitoring information requested by a RS
// (monitorMsg is null when the domain lock could not be acquired)
MonitorMsg monitorMsg =
createLocalTopologyMonitorMsg(msg.getDestination(),
msg.getSenderID());
if (monitorMsg != null)
{
try
{
senderHandler.send(monitorMsg);
} catch (Exception e)
{
// We log the error. The requestor will detect a timeout or
// any other failure on the connection.
logError(ERR_CHANGELOG_ERROR_SENDING_MSG.get(
Integer.toString((msg.getDestination()))));
}
}
}
} else if (msg instanceof MonitorMsg)
{
// Answer to a monitor request previously sent by this RS
MonitorMsg monitorMsg = (MonitorMsg) msg;
receivesMonitorDataResponse(monitorMsg, senderHandler.getServerId());
} else
{
// Unexpected message type for this RS: log it and tell the sender
logError(NOTE_ERR_ROUTING_TO_SERVER.get(
msg.getClass().getCanonicalName()));
MessageBuilder mb1 = new MessageBuilder();
mb1.append(
NOTE_ERR_ROUTING_TO_SERVER.get(msg.getClass().getCanonicalName()));
mb1.append("serverID:").append(msg.getDestination());
ErrorMsg errMsg = new ErrorMsg(msg.getSenderID(), mb1.toMessage());
try
{
senderHandler.send(errMsg);
} catch (IOException ioe1)
{
// an error happened on the sender session trying to recover
// from an error on the receiver session.
// Not much more we can do at this point.
}
}
// Locally handled messages are never forwarded
return;
}
// From here on the message has to be routed to other servers
List<ServerHandler> servers = getDestinationServers(msg, senderHandler);
if (servers.isEmpty())
{
// Nobody to route the message to: log and notify the sender
MessageBuilder mb = new MessageBuilder();
mb.append(ERR_NO_REACHABLE_PEER_IN_THE_DOMAIN.get(
this.baseDn, Integer.toString(msg.getDestination())));
mb.append(" In Replication Server=").append(
this.replicationServer.getMonitorInstanceName());
mb.append(" unroutable message =").append(msg.getClass().getSimpleName());
mb.append(" Details:routing table is empty");
ErrorMsg errMsg = new ErrorMsg(
this.replicationServer.getServerId(),
msg.getSenderID(),
mb.toMessage());
logError(mb.toMessage());
try
{
senderHandler.send(errMsg);
} catch (IOException ioe)
{
// TODO Handle error properly (sender timeout in addition)
/*
* An error happened trying to send an error msg to this server.
* Log an error and close the connection to this server.
*/
MessageBuilder mb2 = new MessageBuilder();
mb2.append(ERR_CHANGELOG_ERROR_SENDING_ERROR.get(this.toString()));
mb2.append(stackTraceToSingleLineString(ioe));
logError(mb2.toMessage());
stopServer(senderHandler, false);
}
} else
{
// Forward to every destination; a failure on one destination does not
// prevent the attempts on the following ones
for (ServerHandler targetHandler : servers)
{
try
{
targetHandler.send(msg);
} catch (IOException ioe)
{
/*
* An error happened trying the send a routable message
* to its destination server.
* Send back an error to the originator of the message.
*/
MessageBuilder mb1 = new MessageBuilder();
mb1.append(ERR_NO_REACHABLE_PEER_IN_THE_DOMAIN.get(
this.baseDn, Integer.toString(msg.getDestination())));
mb1.append(" unroutable message =" + msg.getClass().getSimpleName());
mb1.append(" Details: " + ioe.getLocalizedMessage());
ErrorMsg errMsg = new ErrorMsg(
msg.getSenderID(), mb1.toMessage());
logError(mb1.toMessage());
try
{
senderHandler.send(errMsg);
} catch (IOException ioe1)
{
// an error happened on the sender session trying to recover
// from an error on the receiver session.
// We don't have much solution left beside closing the sessions.
stopServer(senderHandler, false);
stopServer(targetHandler, false);
}
// TODO Handle error properly (sender timeout in addition)
}
}
}
}
/**
 * Creates a new monitor message including monitoring information for the
 * whole topology: the state of the local changelog DBs, then one server
 * state per replica and per replication server found in the provided
 * monitor data.
 *
 * @param sender
 *          The sender of this message.
 * @param destination
 *          The destination of this message.
 * @param monitorData
 *          The domain monitor data which should be used for the message.
 * @return The newly created and filled MonitorMsg.
 */
public MonitorMsg createGlobalTopologyMonitorMsg(
    int sender, int destination, MonitorData monitorData)
{
  final MonitorMsg globalMsg = new MonitorMsg(sender, destination);
  globalMsg.setReplServerDbState(getDbServerState());

  // Add the informations about the Replicas currently in
  // the topology.
  for (Iterator<Integer> dsIds = monitorData.ldapIterator();
       dsIds.hasNext();)
  {
    final int dsId = dsIds.next();
    globalMsg.setServerState(dsId,
        monitorData.getLDAPServerState(dsId),
        monitorData.getApproxFirstMissingDate(dsId), true);
  }

  // Add the information about the Replication Servers
  // currently in the topology.
  for (Iterator<Integer> rsIds = monitorData.rsIterator();
       rsIds.hasNext();)
  {
    final int rsId = rsIds.next();
    globalMsg.setServerState(rsId,
        monitorData.getRSStates(rsId),
        monitorData.getRSApproxFirstMissingDate(rsId), false);
  }
  return globalMsg;
}
/**
 * Creates a new monitor message including monitoring information for the
 * topology directly connected to this RS. This includes information for:
 * <ul>
 * <li>the local RS (state of its changelog DBs),</li>
 * <li>all directly connected DSs,</li>
 * <li>all directly connected RSs.</li>
 * </ul>
 * The domain lock is held while the connected servers are traversed.
 *
 * @param sender
 *          The sender of this message.
 * @param destination
 *          The destination of this message.
 * @return The newly created and filled MonitorMsg. Null if the current thread
 *         was interrupted while attempting to get the domain lock; in that
 *         case the interrupt status of the thread is restored.
 */
public MonitorMsg createLocalTopologyMonitorMsg(int sender, int destination)
{
  try
  {
    // Lock domain as we need to go through connected servers list
    lock();
  }
  catch (InterruptedException e)
  {
    // Do not swallow the interruption: restore the interrupt status so
    // that the caller (or subsequent blocking calls) can notice it, as the
    // other users of the domain lock in this class do.
    Thread.currentThread().interrupt();
    return null;
  }

  try
  {
    MonitorMsg monitorMsg = new MonitorMsg(sender, destination);

    // Populate for each connected LDAP Server
    // from the states stored in the serverHandler.
    // - the server state
    // - the older missing change
    for (DataServerHandler lsh : this.directoryServers.values())
    {
      monitorMsg.setServerState(lsh.getServerId(),
          lsh.getServerState(), lsh.getApproxFirstMissingDate(),
          true);
    }

    // Same for the connected RS
    for (ReplicationServerHandler rsh : this.replicationServers.values())
    {
      monitorMsg.setServerState(rsh.getServerId(),
          rsh.getServerState(), rsh.getApproxFirstMissingDate(),
          false);
    }

    // Populate the RS state in the msg from the DbState
    monitorMsg.setReplServerDbState(this.getDbServerState());
    return monitorMsg;
  }
  finally
  {
    release();
  }
}
/**
* Shutdown this ReplicationServerDomain: the domain is deregistered from
* the monitor providers, the assured timer is cancelled, every connected
* server is stopped (in shutdown mode) and finally the DB handlers are
* stopped.
*/
public void shutdown()
{
DirectoryServer.deregisterMonitorProvider(this);
// Terminate the assured timer
assuredTimeoutTimer.cancel();
// true: stop due to shutdown (stopServer then skips the domain lock and
// the topology notifications)
stopAllServers(true);
stopDbHandlers();
}
/**
 * Shuts down every DB handler of this domain and then empties the DB
 * handlers map, all while holding the monitor of that map.
 */
private void stopDbHandlers()
{
  synchronized (sourceDbHandlers)
  {
    final Iterator<DbHandler> handlers = sourceDbHandlers.values().iterator();
    while (handlers.hasNext())
    {
      handlers.next().shutdown();
    }
    sourceDbHandlers.clear();
  }
}
/**
 * Builds a new ServerState updated with the last change of each DB handler
 * of this domain.
 *
 * @return The ServerState describing the last change from this replica.
 */
public ServerState getDbServerState()
{
  final ServerState dbState = new ServerState();
  final Iterator<DbHandler> handlers = sourceDbHandlers.values().iterator();
  while (handlers.hasNext())
  {
    dbState.update(handlers.next().getLastChange());
  }
  return dbState;
}
/**
 * {@inheritDoc}
 */
@Override
public String toString()
{
  final StringBuilder description =
      new StringBuilder("ReplicationServerDomain ");
  description.append(baseDn);
  return description.toString();
}
/**
 * Send a TopologyMsg to all the connected directory servers in order to
 * let them know the topology (every known DSs and RSs).
 * <p>
 * For each target, up to two attempts are made, with a 100 ms pause after
 * each unsuccessful attempt. A handler that is shutting down or not
 * connected is not sent anything. Only the failure of the second attempt
 * is logged. If the calling thread is interrupted during a pause, its
 * interrupt status is restored so that the interruption is not lost.
 *
 * @param notThisOne If not null, the topology message will not be sent to
 *                   this passed server.
 */
public void buildAndSendTopoInfoToDSs(ServerHandler notThisOne)
{
  for (DataServerHandler handler : directoryServers.values())
  {
    if ((notThisOne == null) || (handler != notThisOne))
    // All except passed one
    {
      for (int i = 1; i <= 2; i++)
      {
        if (!handler.shuttingDown()
            && handler.getStatus() != ServerStatus.NOT_CONNECTED_STATUS)
        {
          TopologyMsg topoMsg = createTopologyMsgForDS(handler.getServerId());
          try
          {
            handler.sendTopoInfo(topoMsg);
            break;
          }
          catch (IOException e)
          {
            if (i == 2)
            {
              Message message = ERR_EXCEPTION_SENDING_TOPO_INFO.get(
                  baseDn,
                  "directory",
                  Integer.toString(handler.getServerId()),
                  e.getMessage());
              logError(message);
            }
          }
        }
        try
        {
          Thread.sleep(100);
        }
        catch (InterruptedException e)
        {
          // Do not swallow the interruption: keep the interrupt status set
          // so that the caller can react to it. Later pauses in this
          // method then return immediately.
          Thread.currentThread().interrupt();
        }
      }
    }
  }
}
/**
 * Send a TopologyMsg to all the connected replication servers
 * in order to let them know our connected LDAP servers.
 * <p>
 * The same message is sent to every peer. For each peer, up to two
 * attempts are made, with a 100 ms pause after each unsuccessful attempt.
 * A handler that is shutting down or not connected is not sent anything.
 * Only the failure of the second attempt is logged. If the calling thread
 * is interrupted during a pause, its interrupt status is restored so that
 * the interruption is not lost.
 */
public void buildAndSendTopoInfoToRSs()
{
  TopologyMsg topoMsg = createTopologyMsgForRS();
  for (ReplicationServerHandler handler : replicationServers.values())
  {
    for (int i = 1; i <= 2; i++)
    {
      if (!handler.shuttingDown()
          && handler.getStatus() != ServerStatus.NOT_CONNECTED_STATUS)
      {
        try
        {
          handler.sendTopoInfo(topoMsg);
          break;
        }
        catch (IOException e)
        {
          if (i == 2)
          {
            Message message = ERR_EXCEPTION_SENDING_TOPO_INFO.get(
                baseDn,
                "replication",
                Integer.toString(handler.getServerId()),
                e.getMessage());
            logError(message);
          }
        }
      }
      try
      {
        Thread.sleep(100);
      }
      catch (InterruptedException e)
      {
        // Do not swallow the interruption: keep the interrupt status set
        // so that the caller can react to it. Later pauses in this method
        // then return immediately.
        Thread.currentThread().interrupt();
      }
    }
  }
}
/**
 * Creates a TopologyMsg filled with information to be sent to a remote RS.
 * The message carries the info of every DS directly connected to us, plus
 * a single RS entry describing the local RS.
 *
 * @return A suitable TopologyMsg PDU to be sent to a peer RS
 */
public TopologyMsg createTopologyMsgForRS()
{
  // One DSInfo per directly connected DS
  final List<DSInfo> connectedDSs = new ArrayList<DSInfo>();
  final Iterator<DataServerHandler> dsHandlers =
      directoryServers.values().iterator();
  while (dsHandlers.hasNext())
  {
    connectedDSs.add(dsHandlers.next().toDSInfo());
  }

  // The only RS advertised is the local one
  final List<RSInfo> localRSOnly = new ArrayList<RSInfo>();
  localRSOnly.add(new RSInfo(replicationServer.getServerId(),
      replicationServer.getServerURL(), generationId,
      replicationServer.getGroupId(), replicationServer.getWeight()));

  return new TopologyMsg(connectedDSs, localRSOnly);
}
/**
 * Creates a TopologyMsg filled with information to be sent to a DS.
 * The message carries the info of every known DS in the topology (our
 * directly connected DSs except the recipient, plus the DSs reported by the
 * peer RSs) and of every RS (the local RS first, then the peer RSs).
 *
 * @param destDsId The id of the DS the TopologyMsg PDU is to be sent to and
 *                 that we must not include in the DS list.
 * @return A suitable TopologyMsg PDU to be sent to a peer DS
 */
public TopologyMsg createTopologyMsgForDS(int destDsId)
{
  final List<DSInfo> knownDSs = new ArrayList<DSInfo>();
  final List<RSInfo> knownRSs = new ArrayList<RSInfo>();

  // Directly connected DSs, the recipient of the message excepted
  for (DataServerHandler ds : directoryServers.values())
  {
    if (ds.getServerId() != destDsId)
    {
      knownDSs.add(ds.toDSInfo());
    }
  }

  // Our own info (local RS) comes first in the RS list
  knownRSs.add(new RSInfo(replicationServer.getServerId(),
      replicationServer.getServerURL(), generationId,
      replicationServer.getGroupId(), replicationServer.getWeight()));

  // Then every peer RS, each one also contributing its connected DSs
  for (ReplicationServerHandler rs : replicationServers.values())
  {
    knownRSs.add(rs.toRSInfo());
    rs.addDSInfos(knownDSs);
  }

  return new TopologyMsg(knownDSs, knownRSs);
}
/**
 * Returns the generationId currently associated to this domain.
 *
 * @return The generationId
 */
public long getGenerationId()
{
  return this.generationId;
}
/**
 * Returns the saved status of the generationId of this domain.
 *
 * @return The generationId saved status.
 */
public boolean getGenerationIdSavedStatus()
{
  return this.generationIdSavedStatus;
}
/**
* Initialize the value of the generationID for this ReplicationServerDomain.
* This method is intended to be used for initialization at startup and
* simply stores the new value without any additional processing.
* For example it does not clear the change-log DBs
* <p>
* The generationId saved status is set to true as part of the same
* update, performed while holding the generation id lock.
*
* @param generationId The new value of generationId.
*/
public void initGenerationID(long generationId)
{
synchronized (generationIDLock)
{
this.generationId = generationId;
// A generationId set through this method is flagged as saved
this.generationIdSavedStatus = true;
}
}
/**
 * Sets the provided value as the new in memory generationId.
 * When the value differs from the current one, the changelog databases are
 * cleared first and the saved status is updated as well; when it is the
 * same, nothing is changed (the saved status included).
 *
 * @param generationId The new value of generationId.
 * @param savedStatus  The saved status of the generationId.
 * @return The old generation id
 */
public long changeGenerationId(long generationId, boolean savedStatus)
{
  synchronized (generationIDLock)
  {
    final long previousGenId = this.generationId;
    if (previousGenId == generationId)
    {
      // Same generationId: nothing to do
      return previousGenId;
    }

    // we are changing of genId
    clearDbs();
    this.generationId = generationId;
    this.generationIdSavedStatus = savedStatus;
    return previousGenId;
  }
}
/**
* Resets the generationID.
* <p>
* Under the domain lock: the local generationId is changed when the
* requested one differs from it, every connected replication server handler
* is assigned the new generationId (and is forwarded the reset message when
* the request came from a directory server), every connected directory
* server has its status recomputed, and the topology information is then
* re-published to all DSs and RSs. Exceptions are logged and not
* propagated. If the thread is interrupted while waiting for the lock,
* nothing is done and the interrupt status is restored.
*
* @param senderHandler The handler associated to the server
* that requested to reset the generationId.
* @param genIdMsg The reset generation ID msg received.
*/
public void resetGenerationId(ServerHandler senderHandler,
ResetGenerationIdMsg genIdMsg)
{
if (debugEnabled())
TRACER.debugInfo(
"In " + this +
" Receiving ResetGenerationIdMsg from " + senderHandler.getServerId()+
" for baseDn " + baseDn + ":\n" + genIdMsg);
try
{
// Acquire lock on domain (see more details in comment of start() method
// of ServerHandler)
lock();
}
catch (InterruptedException ex)
{
// We can't deal with this here, so re-interrupt thread so that it is
// caught during subsequent IO.
Thread.currentThread().interrupt();
return;
}
try
{
long newGenId = genIdMsg.getGenerationId();
if (newGenId != this.generationId)
{
// false: the new generationId is not flagged as saved
changeGenerationId(newGenId, false);
}
else
{
// Order to take a gen id we already have, just ignore
if (debugEnabled())
TRACER.debugInfo(
"In " + this
+ " Reset generation id requested for baseDn " + baseDn
+ " but generation id was already " + this.generationId
+ ":\n" + genIdMsg);
}
// If we are the first replication server warned,
// then forwards the reset message to the remote replication servers
// (i.e. the message is only forwarded when it was received from a DS;
// the handlers are updated with the new generationId in all cases)
for (ServerHandler rsHandler : replicationServers.values())
{
try
{
// After we'll have sent the message , the remote RS will adopt
// the new genId
rsHandler.setGenerationId(newGenId);
if (senderHandler.isDataServer())
{
rsHandler.send(genIdMsg);
}
} catch (IOException e)
{
// A failure on one peer is logged and does not stop the loop
logError(ERR_EXCEPTION_FORWARDING_RESET_GEN_ID.get(baseDn,
e.getMessage()));
}
}
// Change status of the connected DSs according to the requested new
// reference generation id
for (DataServerHandler dsHandler : directoryServers.values())
{
try
{
dsHandler.changeStatusForResetGenId(newGenId);
} catch (IOException e)
{
logError(ERR_EXCEPTION_CHANGING_STATUS_AFTER_RESET_GEN_ID.get(baseDn,
Integer.toString(dsHandler.getServerId()),
e.getMessage()));
}
}
// Update every peers (RS/DS) with potential topology changes (status
// change). Rather than doing that each time a DS has a status change
// (consecutive to reset gen id message), we prefer advertising once for
// all after changes (less packet sent), here at the end of the reset msg
// treatment.
buildAndSendTopoInfoToDSs(null);
buildAndSendTopoInfoToRSs();
Message message = NOTE_RESET_GENERATION_ID.get(baseDn, newGenId);
logError(message);
}
catch(Exception e)
{
logError(Message.raw(Category.SYNC, Severity.NOTICE,
stackTraceToSingleLineString(e)));
}
finally
{
// Always give the domain lock back
release();
}
}
/**
 * Process message of a remote server changing his status.
 * Under the domain lock, the new status is computed by the sender handler;
 * unless it is the invalid status, the topology information is then
 * re-published (to every DS but the sender, and to every RS) and the change
 * is logged. Exceptions are logged and not propagated.
 *
 * @param senderHandler The handler associated to the server
 *                      that changed his status.
 * @param csMsg         The message containing the new status
 */
public void processNewStatus(DataServerHandler senderHandler,
    ChangeStatusMsg csMsg)
{
  if (debugEnabled())
  {
    TRACER.debugInfo(
        "In RS " + getReplicationServer().getServerId() +
        " Receiving ChangeStatusMsg from " + senderHandler.getServerId() +
        " for baseDn " + baseDn + ":\n" + csMsg);
  }

  try
  {
    // Acquire lock on domain (see more details in comment of start() method
    // of ServerHandler)
    lock();
  }
  catch (InterruptedException ex)
  {
    // We can't deal with this here, so re-interrupt thread so that it is
    // caught during subsequent IO.
    Thread.currentThread().interrupt();
    return;
  }

  try
  {
    final ServerStatus updatedStatus = senderHandler.processNewStatus(csMsg);
    // When the status is invalid, an error has already been logged in
    // processNewStatus(): do not forward a bad status to the topology
    if (updatedStatus != ServerStatus.INVALID_STATUS)
    {
      // Update every peers (RS/DS) with topology changes
      buildAndSendTopoInfoToDSs(senderHandler);
      buildAndSendTopoInfoToRSs();

      logError(NOTE_DIRECTORY_SERVER_CHANGED_STATUS.get(
          senderHandler.getServerId(), baseDn, updatedStatus.toString()));
    }
  }
  catch (Exception e)
  {
    logError(Message.raw(Category.SYNC, Severity.NOTICE,
        stackTraceToSingleLineString(e)));
  }
  finally
  {
    release();
  }
}
/**
* Change the status of a directory server according to the event generated
* from the status analyzer.
* <p>
* The change is performed under the domain lock. When the status actually
* changed, the topology information is re-published to every DS but the
* updated one and to every RS. Exceptions raised while doing so are logged
* and not propagated.
*
* @param serverHandler The handler of the directory server to update
* @param event The event to be used for new status computation
* @return True if we have been interrupted (must stop), false otherwise
*/
public boolean changeStatusFromStatusAnalyzer(
DataServerHandler serverHandler, StatusMachineEvent event)
{
try
{
// Acquire lock on domain (see more details in comment of start() method
// of ServerHandler)
lock();
}
catch (InterruptedException ex)
{
// We have been interrupted for dying, from stopStatusAnalyzer
// to prevent deadlock in this situation:
// RS is being shutdown, and stopServer will call stopStatusAnalyzer.
// Domain lock is taken by shutdown thread while status analyzer thread
// is willing to change the status of a server at the same time so is
// waiting for the domain lock at the same time. As shutdown thread is
// waiting for analyzer thread death, a deadlock occurs. So we force
// interruption of the status analyzer thread death after 2 seconds if
// it has not finished (see StatusAnalyzer.waitForShutdown). This allows
// to have the analyzer thread taking the domain lock only when the
// status of a DS has to be changed. See more comments in run method of
// StatusAnalyzer.
if (debugEnabled())
TRACER
.debugInfo("Status analyzer for domain "
+ baseDn
+ " has been interrupted when"
+ " trying to acquire domain lock for changing the status"
+ " of DS "
+ serverHandler.getServerId());
// The interruption is reported through the return value rather than by
// restoring the interrupt status
return true;
}
try
{
// INVALID_STATUS is kept if the status change below fails with an
// IOException
ServerStatus newStatus = ServerStatus.INVALID_STATUS;
ServerStatus oldStatus = serverHandler.getStatus();
try
{
newStatus = serverHandler
.changeStatusFromStatusAnalyzer(event);
}
catch (IOException e)
{
logError(ERR_EXCEPTION_CHANGING_STATUS_FROM_STATUS_ANALYZER
.get(baseDn,
Integer.toString(serverHandler.getServerId()),
e.getMessage()));
}
if ((newStatus == ServerStatus.INVALID_STATUS)
|| (newStatus == oldStatus))
{
// Change was impossible or already occurred (see StatusAnalyzer
// comments)
// (the finally block below still releases the domain lock)
return false;
}
// Update every peers (RS/DS) with topology changes
buildAndSendTopoInfoToDSs(serverHandler);
buildAndSendTopoInfoToRSs();
}
catch (Exception e)
{
logError(Message.raw(Category.SYNC, Severity.NOTICE,
stackTraceToSingleLineString(e)));
}
finally
{
release();
}
return false;
}
/**
 * Clears the databases associated with this domain.
 * <p>
 * While holding the monitor of {@code sourceDbHandlers}, every db handler is
 * cleared (a failure on one handler is logged and does not prevent the
 * others from being processed) and the handlers are then stopped. Finally
 * the replication server is asked to clear the generation ID stored for this
 * base DN; a failure there is logged as well.
 */
public void clearDbs()
{
  // Reset the local change and state db for the current domain.
  synchronized (sourceDbHandlers)
  {
    for (DbHandler db : sourceDbHandlers.values())
    {
      try
      {
        db.clear();
      }
      catch (Exception clearFailure)
      {
        // TODO: i18n
        MessageBuilder report = new MessageBuilder();
        report.append(ERR_ERROR_CLEARING_DB.get(
            db.toString(),
            clearFailure.getMessage() + " "
                + stackTraceToSingleLineString(clearFailure)));
        logError(report.toMessage());
      }
    }
    stopDbHandlers();
  }

  try
  {
    replicationServer.clearGenerationId(baseDn);
  }
  catch (Exception genIdFailure)
  {
    // TODO: i18n
    logError(Message.raw("Exception caught while clearing generationId:"
        + genIdFailure.getLocalizedMessage()));
  }
}
/**
 * Tells whether the provided server is in degraded state because the peer
 * server has a generation ID for this domain that differs from the local
 * one.
 * <p>
 * The server is looked up among the connected replication servers first and
 * among the connected directory servers second.
 *
 * @param serverId The ID of the server whose state is requested.
 * @return {@code true} when a handler is known for that server and its
 *         generation ID differs from the generation ID of this domain,
 *         {@code false} otherwise (including when the server is unknown).
 */
public boolean isDegradedDueToGenerationId(int serverId)
{
  if (debugEnabled())
  {
    TRACER.debugInfo("In " + this.replicationServer.getMonitorInstanceName()
        + " baseDN=" + baseDn
        + " isDegraded serverId=" + serverId
        + " given local generation Id=" + this.generationId);
  }

  // RS handlers take precedence over DS handlers.
  ServerHandler peer = replicationServers.get(serverId);
  if (peer == null)
  {
    peer = directoryServers.get(serverId);
  }
  if (peer == null)
  {
    // Unknown server: it cannot be reported as degraded.
    return false;
  }

  if (debugEnabled())
  {
    TRACER.debugInfo("In " + this.replicationServer.getMonitorInstanceName()
        + " baseDN=" + baseDn
        + " Compute degradation of serverId=" + serverId
        + " LS server generation Id=" + peer.getGenerationId());
  }
  return peer.getGenerationId() != this.generationId;
}
/**
 * Gives access to the replication server this domain is associated with.
 *
 * @return The associated replication server.
 */
public ReplicationServer getReplicationServer()
{
  return this.replicationServer;
}
/**
 * Processes the topology information received from a peer RS.
 * <p>
 * The work is done under the domain lock. If the calling thread is
 * interrupted while waiting for that lock, its interrupt flag is restored
 * and the message is not processed. Any exception raised while processing
 * the message under the lock is logged rather than propagated.
 *
 * @param topoMsg The topology message just received from the remote RS.
 * @param handler The handler that received the message.
 * @param allowResetGenId {@code true} to allow the generation ID to be
 *          reset (when called after the initial handshake).
 * @throws IOException If an error occurred.
 * @throws DirectoryException If an error occurred.
 */
public void receiveTopoInfoFromRS(TopologyMsg topoMsg,
    ReplicationServerHandler handler,
    boolean allowResetGenId)
    throws IOException, DirectoryException
{
  if (debugEnabled())
  {
    TRACER.debugInfo("In RS " + getReplicationServer().getServerId()
        + " Receiving TopologyMsg from " + handler.getServerId()
        + " for baseDn " + baseDn + ":\n" + topoMsg);
  }

  // Acquire the domain lock (see more details in the comment of the start()
  // method of ServerHandler).
  try
  {
    lock();
  }
  catch (InterruptedException interrupted)
  {
    // Nothing sensible can be done here: restore the interrupt flag so that
    // the interruption is noticed during subsequent IO.
    Thread.currentThread().interrupt();
    return;
  }

  try
  {
    // Store the DSs connected to the remote RS and refresh what is known
    // about that peer RS.
    handler.processTopoInfoFromRS(topoMsg);

    // Generation ID handling.
    if (allowResetGenId)
    {
      // Check whether the generation ID has to be reset, then adopt the
      // peer's one if the local one is still negative.
      mayResetGenerationId();
      if (generationId < 0)
      {
        generationId = handler.getGenerationId();
      }
    }

    final boolean generationIdMismatch =
        (generationId > 0) && (generationId != handler.getGenerationId());
    if (generationIdMismatch)
    {
      // Report the mismatch both locally and to the peer.
      Message warning = WARN_BAD_GENERATION_ID_FROM_RS.get(
          handler.getServerId(),
          handler.session.getReadableRemoteAddress(),
          handler.getGenerationId(),
          baseDn,
          getReplicationServer().getServerId(),
          generationId);
      logError(warning);
      handler.send(new ErrorMsg(
          getReplicationServer().getServerId(),
          handler.getServerId(),
          warning));
    }

    // Send the currently known topology information to every connected DS.
    buildAndSendTopoInfoToDSs(null);
  }
  catch (Exception e)
  {
    logError(Message.raw(Category.SYNC, Severity.NOTICE,
        stackTraceToSingleLineString(e)));
  }
  finally
  {
    release();
  }
}
/* =======================
* Monitor Data generation
* =======================
*/
/**
 * Gives the most recently published monitor data of this replication server
 * domain, without triggering any recomputation.
 *
 * @return The latest monitor data available for this replication server
 *         domain, which is never {@code null}.
 */
MonitorData getDomainMonitorData()
{
  return this.monitorData;
}
/**
* Recomputes the monitor data for this replication server domain.
* <p>
* The recomputation only happens when the previously built data is older
* than {@code monitorDataLifeTime}; otherwise the current data is returned
* unchanged. When recomputing, a monitor request is sent to every connected
* replication server and this thread waits at most 5 seconds for the
* responses before publishing whatever has been collected.
*
* @return The recomputed monitor data for this replication server domain, or
* the current data when it has not expired yet.
* @throws InterruptedException
* If this thread is interrupted while waiting for a response.
*/
MonitorData computeDomainMonitorData() throws InterruptedException
{
// Only allow one monitor recalculation at a time.
synchronized (pendingMonitorLock)
{
// Skip the whole recomputation while the last build is still fresh.
if ((monitorDataLastBuildDate + monitorDataLifeTime) < TimeThread
.getTime())
{
try
{
// Prevent out of band monitor responses from updating our pending
// table until we are ready.
synchronized (pendingMonitorDataLock)
{
// Clear the pending monitor data.
pendingMonitorDataServerIDs.clear();
pendingMonitorData = new MonitorData();
// Initialize the monitor data.
initializePendingMonitorData();
// Send the monitor requests to the connected replication servers.
for (ReplicationServerHandler rs : replicationServers.values())
{
// Add server ID to pending table.
int serverId = rs.getServerId();
MonitorRequestMsg msg = new MonitorRequestMsg(
this.replicationServer.getServerId(), serverId);
try
{
rs.send(msg);
// Only register this server ID if we were able to send the
// message.
pendingMonitorDataServerIDs.add(serverId);
}
catch (IOException e)
{
// Log a message and do a best effort from here.
Message message = ERR_SENDING_REMOTE_MONITOR_DATA_REQUEST
.get(baseDn, serverId, e.getMessage());
logError(message);
}
}
// Create the pending response latch based on the number of expected
// monitor responses.
pendingMonitorDataLatch = new CountDownLatch(
pendingMonitorDataServerIDs.size());
}
// Wait for the responses to come back. The result of await() is not
// checked: after the timeout we carry on with the responses received so
// far. The wait happens outside pendingMonitorDataLock so that responses
// can be recorded in the meantime.
pendingMonitorDataLatch.await(5, TimeUnit.SECONDS);
// Log messages for replication servers that have gone or come back.
synchronized (pendingMonitorDataLock)
{
// Log servers that have come back, i.e. servers that were late last
// time and are no longer pending now.
for (int serverId : monitorDataLateServers)
{
// Ensure that we only log once per server: don't fill the
// error log with repeated messages.
if (!pendingMonitorDataServerIDs.contains(serverId))
{
logError(NOTE_MONITOR_DATA_RECEIVED.get(baseDn,
serverId));
}
}
// Log servers that have gone away, i.e. servers still pending now that
// were not already recorded as late.
for (int serverId : pendingMonitorDataServerIDs)
{
// Ensure that we only log once per server: don't fill the
// error log with repeated messages.
if (!monitorDataLateServers.contains(serverId))
{
logError(WARN_MISSING_REMOTE_MONITOR_DATA.get(baseDn,
serverId));
}
}
// Remember which servers were late this time.
monitorDataLateServers.clear();
monitorDataLateServers.addAll(pendingMonitorDataServerIDs);
}
// Store the new computed data as the reference
synchronized (pendingMonitorDataLock)
{
// Now we have the expected answers or an error occurred
pendingMonitorData.completeComputing();
monitorData = pendingMonitorData;
monitorDataLastBuildDate = TimeThread.getTime();
}
}
finally
{
// Runs on success, timeout and interruption alike, so the pending state
// never outlives this call.
synchronized (pendingMonitorDataLock)
{
// Clear pending state.
pendingMonitorData = null;
pendingMonitorDataLatch = null;
pendingMonitorDataServerIDs.clear();
}
}
}
}
return monitorData;
}
/**
 * Seeds the pending monitor data with the information available locally,
 * i.e. the directly connected DSs and the local changelog db state. This is
 * the first step of collecting global monitoring information for this
 * ReplicationServerDomain.
 */
private void initializePendingMonitorData()
{
  // Directly connected DSs first.
  // - the state stored in the ServerHandler of a given DS1 contains:
  //   - the max CN produced by DS1
  //   - the last CN consumed by DS1 from DS2..n
  // - the state built from the RS domain / db handlers contains:
  //   - the max CN produced by each server
  // Hence, for a connected DS, both the state and the max CN are taken from
  // the state held by its handler.
  for (ServerHandler connectedDs : directoryServers.values())
  {
    final int dsId = connectedDs.getServerId();

    // Work on a copy of the state stored in the server handler.
    final ServerState handlerState = connectedDs.getServerState().duplicate();

    // A directly connected DS that never produced any change has no max CN
    // in its state: use a zero change number for it.
    final ChangeNumber knownMaxCN = handlerState.getMaxChangeNumber(dsId);
    final ChangeNumber maxCN = (knownMaxCN != null)
        ? knownMaxCN
        : new ChangeNumber(0, 0, dsId);

    pendingMonitorData.setMaxCN(dsId, maxCN);
    pendingMonitorData.setLDAPServerState(dsId, handlerState);
    pendingMonitorData.setFirstMissingDate(dsId,
        connectedDs.getApproxFirstMissingDate());
  }

  // Then the max CN of every server that produced something, taken from our
  // own local db state, whether that server is directly or indirectly
  // connected.
  final ServerState localDbState = getDbServerState();
  pendingMonitorData.setRSState(replicationServer.getServerId(),
      localDbState);
  for (int producerId : localDbState)
  {
    pendingMonitorData.setMaxCN(producerId,
        localDbState.getMaxChangeNumber(producerId));
  }
}
/**
 * Processes a Monitor message received from a remote Replication Server and
 * merges the data it carries into the pending monitor data.
 * <p>
 * The message is ignored (and an informational message logged) when no
 * monitor data computation is pending. Otherwise, whatever the outcome of
 * the processing, the server is removed from the set of expected responders
 * and, when it was still expected, the response latch is counted down.
 *
 * @param msg
 *          The message to be processed.
 * @param serverId
 *          server handler that is receiving the message.
 */
private void receivesMonitorDataResponse(MonitorMsg msg,
    int serverId)
{
  synchronized (pendingMonitorDataLock)
  {
    if (pendingMonitorData == null)
    {
      // Response to an earlier request whose computation is already over.
      logError(INFO_IGNORING_REMOTE_MONITOR_DATA.get(baseDn,
          msg.getSenderID()));
      return;
    }

    try
    {
      // RS state: list of <serverID, lastChangeNumber>. For each LDAP
      // server the max CN across the RSes is kept.
      final ServerState remoteRsState = msg.getReplServerDbState();
      pendingMonitorData.setMaxCNs(remoteRsState);

      // Remote RS state.
      pendingMonitorData.setRSState(msg.getSenderID(), remoteRsState);

      // States of the remote LDAP servers.
      for (Iterator<Integer> dsIds = msg.ldapIterator(); dsIds.hasNext();)
      {
        final int dsId = dsIds.next();
        final ServerState remoteDsState = msg.getLDAPServerState(dsId);
        pendingMonitorData.setMaxCNs(remoteDsState);
        pendingMonitorData.setLDAPServerState(dsId, remoteDsState);
        pendingMonitorData.setFirstMissingDate(dsId,
            msg.getLDAPApproxFirstMissingDate(dsId));
      }

      // Latency reported by the remote RSi on its connections to the other
      // RSes.
      for (Iterator<Integer> rsIds = msg.rsIterator(); rsIds.hasNext();)
      {
        final int rsId = rsIds.next();
        if (rsId != replicationServer.getServerId())
        {
          // Latency of the remote RSi regarding another RSj: update the
          // latency of the LSes connected to RSj, if RSj is known here.
          final ReplicationServerHandler otherRs =
              replicationServers.get(rsId);
          if (otherRs != null)
          {
            for (int remoteDsId : otherRs.getConnectedDirectoryServerIds())
            {
              Long firstMissingDate = msg.getRSApproxFirstMissingDate(rsId);
              pendingMonitorData.setFirstMissingDate(remoteDsId,
                  firstMissingDate);
            }
          }
          continue;
        }

        // Latency of the remote RSi regarding the current RS: update the
        // first missing date of the LSes connected to this RS.
        for (ServerHandler localDs : directoryServers.values())
        {
          int localDsId = localDs.getServerId();
          Long firstMissingDate = msg.getRSApproxFirstMissingDate(rsId);
          pendingMonitorData.setFirstMissingDate(localDsId,
              firstMissingDate);
        }
      }
    }
    catch (RuntimeException e)
    {
      // FIXME: do we really expect these???
      logError(ERR_PROCESSING_REMOTE_MONITOR_DATA.get(e.getMessage()
          + stackTraceToSingleLineString(e)));
    }
    finally
    {
      // One response less to wait for: potentially wakes up the requester
      // thread waiting on the latch.
      if (pendingMonitorDataServerIDs.remove(serverId))
      {
        pendingMonitorDataLatch.countDown();
      }
    }
  }
}
/**
 * Applies the given purge delay to every db handler of this replication
 * domain.
 *
 * @param delay The new purge delay to use.
 */
public void setPurgeDelay(long delay)
{
  for (DbHandler db : sourceDbHandlers.values())
  {
    db.setPurgeDelay(delay);
  }
}
/**
 * Gives the map of the directory servers connected to this domain. The
 * returned object is the map held by the domain itself, not a copy.
 *
 * @return The map of connected DSs.
 */
public Map<Integer, DataServerHandler> getConnectedDSs()
{
  return this.directoryServers;
}
/**
 * Gives the map of the replication servers connected to this domain. The
 * returned object is the map held by the domain itself, not a copy.
 *
 * @return The map of connected RSs.
 */
public Map<Integer, ReplicationServerHandler> getConnectedRSs()
{
  return this.replicationServers;
}
/**
* A synchronization mechanism is created to ensure exclusive access to the
* domain. The goal is to have a consistent view of the topology by locking
* the structures holding the topology view of the domain: directoryServers
* and replicationServers. When a connection is established with a peer DS or
* RS, the lock should be taken before updating these structures, then
* released. The same mechanism should be used when updating any data related
* to the view of the topology: for instance if the status of a DS is changed,
* the lock should be taken before updating the matching server handler and
* sending the topology messages to peers, and released afterwards. This allows
* every member of the topology to have a consistent view of the topology and
* to be sure it will not miss some information.
* So the locking system must be called (not exhaustive list):
* - when connection established with a DS or RS
* - when connection ended with a DS or RS
* - when receiving a TopologyMsg and updating structures
* - when creating and sending a TopologyMsg
* - when a DS status is changing (ChangeStatusMsg received or sent)...
*/
private final ReentrantLock lock = new ReentrantLock();
/**
* This lock is used to protect the generationId variable.
*/
private final Object generationIDLock = new Object();
/**
 * Tells whether the calling thread currently owns the lock of this domain.
 *
 * @return True if the current thread has the lock.
 */
public boolean hasLock()
{
  return lock.isHeldByCurrentThread();
}
/**
 * Acquires the lock on this domain, blocking until it is available or until
 * the calling thread is interrupted.
 *
 * @throws java.lang.InterruptedException If interrupted.
 */
public void lock() throws InterruptedException
{
  this.lock.lockInterruptibly();
}
/**
 * Gives back the lock on this domain.
 */
public void release()
{
  this.lock.unlock();
}
/**
 * Attempts to acquire the lock on the domain, waiting at most the given
 * amount of time.
 *
 * @param timeout The amount of milliseconds to wait for acquiring the lock.
 * @return True if the lock was acquired, false if timeout occurred.
 * @throws java.lang.InterruptedException When call was interrupted.
 */
public boolean tryLock(long timeout) throws InterruptedException
{
  final boolean acquired = lock.tryLock(timeout, TimeUnit.MILLISECONDS);
  return acquired;
}
/**
 * Starts the status analyzer for the domain, unless one already exists or
 * the degraded status threshold configured on the replication server is not
 * strictly positive.
 */
public void startStatusAnalyzer()
{
  if (statusAnalyzer != null)
  {
    // An analyzer already exists for this domain.
    return;
  }

  final int threshold = replicationServer.getDegradedStatusThreshold();
  if (threshold <= 0)
  {
    // 0 means no status analyzer.
    return;
  }

  statusAnalyzer = new StatusAnalyzer(this, threshold);
  statusAnalyzer.start();
}
/**
 * Stops the status analyzer for the domain, if any: the analyzer is asked to
 * shut down, its termination is awaited and the reference is dropped.
 */
public void stopStatusAnalyzer()
{
  if (statusAnalyzer == null)
  {
    return;
  }
  statusAnalyzer.shutdown();
  statusAnalyzer.waitForShutdown();
  statusAnalyzer = null;
}
/**
 * Tells whether a status analyzer currently exists for this domain.
 *
 * @return True if the status analyzer is running, false otherwise.
 */
public boolean isRunningStatusAnalyzer()
{
  return statusAnalyzer != null;
}
/**
 * Pushes a new threshold value to the status analyzer. Nothing happens when
 * no status analyzer exists.
 *
 * @param degradedStatusThreshold The new threshold value.
 */
public void updateStatusAnalyzer(int degradedStatusThreshold)
{
  if (statusAnalyzer == null)
  {
    return;
  }
  statusAnalyzer.setDegradedStatusThreshold(degradedStatusThreshold);
}
/**
 * Starts the monitoring publisher for the domain, unless one already exists
 * or the publisher period configured on the replication server is not
 * strictly positive.
 */
public void startMonitoringPublisher()
{
  if (monitoringPublisher != null)
  {
    // A publisher already exists for this domain.
    return;
  }

  final long publisherPeriod =
      replicationServer.getMonitoringPublisherPeriod();
  if (publisherPeriod <= 0)
  {
    // 0 means no monitoring publisher.
    return;
  }

  monitoringPublisher = new MonitoringPublisher(this, publisherPeriod);
  monitoringPublisher.start();
}
/**
 * Stops the monitoring publisher for the domain, if any: the publisher is
 * asked to shut down, its termination is awaited and the reference is
 * dropped.
 */
public void stopMonitoringPublisher()
{
  if (monitoringPublisher == null)
  {
    return;
  }
  monitoringPublisher.shutdown();
  monitoringPublisher.waitForShutdown();
  monitoringPublisher = null;
}
/**
 * Tells whether a monitoring publisher currently exists for this domain.
 *
 * @return True if the monitoring publisher is running, false otherwise.
 */
public boolean isRunningMonitoringPublisher()
{
  return monitoringPublisher != null;
}
/**
 * Pushes a new period value to the monitoring publisher. Nothing happens
 * when no monitoring publisher exists.
 *
 * @param period The new period value.
 */
public void updateMonitoringPublisher(long period)
{
  if (monitoringPublisher == null)
  {
    return;
  }
  monitoringPublisher.setPeriod(period);
}
/**
 * {@inheritDoc}
 */
@Override
public void initializeMonitorProvider(MonitorProviderCfg configuraiton)
{
  // Intentionally empty: the provided configuration is not used and there
  // is nothing to initialize for now.
}
/**
 * {@inheritDoc}
 */
@Override
public String getMonitorInstanceName()
{
  // The base DN is embedded in a cn value: its ',' and '=' characters are
  // replaced by '_'.
  return new StringBuilder("Replication server RS(")
      .append(replicationServer.getServerId())
      .append(") ")
      .append(replicationServer.getServerURL())
      .append(",cn=")
      .append(baseDn.replace(',', '_').replace('=', '_'))
      .append(",cn=Replication")
      .toString();
}
/**
 * {@inheritDoc}
 */
@Override
public ArrayList<Attribute> getMonitorData()
{
  final ArrayList<Attribute> monitorAttrs = new ArrayList<Attribute>();

  // Identity of the replication server: its server ID and port number.
  monitorAttrs.add(Attributes.create("replication-server-id",
      String.valueOf(replicationServer.getServerId())));
  monitorAttrs.add(Attributes.create("replication-server-port",
      String.valueOf(replicationServer.getReplicationPort())));

  // The base DN handled by this replication server domain.
  final AttributeBuilder domainName = new AttributeBuilder("domain-name");
  domainName.add(baseDn);
  monitorAttrs.add(domainName.toAttribute());

  // The generation ID of this replication server domain.
  final AttributeBuilder genId = new AttributeBuilder("generation-id");
  genId.add(baseDn + " " + generationId);
  monitorAttrs.add(genId.toAttribute());

  // The number of changes this RS is missing, as found in the latest
  // published domain monitor data.
  final MonitorData latestData = getDomainMonitorData();
  final long missing =
      latestData.getMissingChangesRS(replicationServer.getServerId());
  monitorAttrs.add(Attributes.create("missing-changes",
      String.valueOf(missing)));

  return monitorAttrs;
}
/**
 * Registers in the domain a handler that subscribes to changes, by adding it
 * to the collection of other handlers.
 *
 * @param handler the provided subscribing handler.
 */
public void registerHandler(MessageHandler handler)
{
  otherHandlers.add(handler);
}
/**
 * Unregisters a handler from the domain, by removing it from the collection
 * of other handlers.
 *
 * @param handler the provided unsubscribing handler.
 * @return Whether this handler has been unregistered with success.
 */
public boolean unRegisterHandler(MessageHandler handler)
{
  final boolean removed = otherHandlers.remove(handler);
  return removed;
}
/**
 * Gives the state that contains, for each server, the time of eligibility.
 * On first use the state is created as a copy of the current db server
 * state.
 *
 * @return the state.
 */
public ServerState getChangeTimeHeartbeatState()
{
  if (ctHeartbeatState != null)
  {
    return ctHeartbeatState;
  }
  // Lazily created from a duplicate of the db server state.
  ctHeartbeatState = this.getDbServerState().duplicate();
  return ctHeartbeatState;
}
/**
 * Computes the eligible server state for the domain.
 *
 *     s1               s2          s3
 *     --               --          --
 *                                 cn31
 *     cn15
 *
 *  ----------------------------------------- eligibleCN
 *     cn14
 *                     cn26
 *     cn13
 *
 * The eligibleState is : s1;cn14 / s2;cn26 / s3;cn31
 *
 * @param eligibleCN The provided eligibleCN. When {@code null}, the result
 *                   is a plain copy of the db server state.
 * @return The computed eligible server state.
 */
public ServerState getEligibleState(ChangeNumber eligibleCN)
{
  final ServerState dbState = this.getDbServerState();

  // Start from a copy of the db state; changes newer than eligibleCN are
  // not wanted in it.
  final ServerState eligibleState = dbState.duplicate();

  if (eligibleCN != null)
  {
    for (int serverId : dbState)
    {
      DbHandler dbHandler = sourceDbHandlers.get(serverId);
      ChangeNumber newestDbCN = dbState.getMaxChangeNumber(serverId);
      try
      {
        if (!eligibleCN.olderOrEqual(newestDbCN))
        {
          // For this serverId all the changes in the changelog db are older
          // than eligibleCN: the most recent one in the db is the answer.
          eligibleState.update(newestDbCN);
          continue;
        }

        // The most recent change in the db is newer than eligibleCN (like
        // cn15 in the example above): go back to the db and look for the
        // first change <= eligibleCN (cn14).
        ReplicationIterator cursor = null;
        try
        {
          cursor = dbHandler.generateIterator(eligibleCN);
          if ((cursor != null) && (cursor.getChange() != null))
          {
            eligibleState.update(cursor.getChange().getChangeNumber());
          }
        }
        catch (Exception noOlderChange)
        {
          // There is no change older than eligibleCN (case of s3/cn31).
          eligibleState.update(new ChangeNumber(0, 0, serverId));
        }
        finally
        {
          if (cursor != null)
          {
            cursor.releaseCursor();
          }
        }
      }
      catch (Exception e)
      {
        Message errMessage = ERR_WRITER_UNEXPECTED_EXCEPTION.get(
            " " + stackTraceToSingleLineString(e));
        logError(errMessage);
        TRACER.debugCaught(DebugLogLevel.ERROR, e);
      }
    }
  }

  if (debugEnabled())
  {
    TRACER.debugInfo("In " + this
        + " getEligibleState() result is " + eligibleState);
  }
  return eligibleState;
}
/**
 * Builds the start state of the domain, made of the first (oldest) change
 * stored for each serverId.
 * Note: Because the replication changelogdb trimming always keeps one change
 * whatever its date, the change contained in the returned state can be very
 * old.
 *
 * @return the start state of the domain.
 */
public ServerState getStartState()
{
  final ServerState startState = new ServerState();
  for (DbHandler db : sourceDbHandlers.values())
  {
    startState.update(db.getFirstChange());
  }
  return startState;
}
/**
 * Computes the eligibleCN for that domain - relies on the ChangeTimeHeartbeat
 * state.
 * <p>
 * Only the servers that are connected, either directly to this RS or through
 * a peer RS, are taken into account. For each of them two candidates are
 * examined: the last change of its changelog db and its entry in the change
 * time heartbeat state. A candidate replaces the current result when there
 * is no result yet or when {@code candidate.newer(result)} holds.
 * NOTE(review): this presumably selects the most recent candidate - confirm
 * against ChangeNumber.newer.
 *
 * @return the eligible CN, which is {@code null} when no server contributed
 *         a candidate.
 */
public ChangeNumber getEligibleCN()
{
  ChangeNumber eligibleCN = null;

  for (DbHandler db : sourceDbHandlers.values())
  {
    // Consider this producer (DS/db).
    final int sid = db.getServerId();
    final ChangeNumber heartbeatLastCN =
        getChangeTimeHeartbeatState().getMaxChangeNumber(sid);

    // Note: a former time based check (producer ignored when its most
    // recent heartbeat was more than 5000 ms old) is disabled; only the
    // connection state below decides whether the producer is considered.

    // Is the producer connected, directly or through a peer RS?
    boolean connected = directoryServers.containsKey(sid);
    if (!connected)
    {
      Iterator<ReplicationServerHandler> peers =
          replicationServers.values().iterator();
      while (!connected && peers.hasNext())
      {
        connected = peers.next().isRemoteLDAPServer(sid);
      }
    }

    if (!connected)
    {
      if (debugEnabled())
      {
        TRACER.debugInfo("In " + "Replication Server " +
            replicationServer.getReplicationPort() + " " +
            baseDn + " " + replicationServer.getServerId() +
            " Server " + sid
            + " is not considered for eligibility ... potentially down");
      }
      continue;
    }

    // First candidate: the last change stored in the changelog db.
    final ChangeNumber changelogLastCN = db.getLastChange();
    if ((changelogLastCN != null)
        && ((eligibleCN == null) || changelogLastCN.newer(eligibleCN)))
    {
      eligibleCN = changelogLastCN;
    }

    // Second candidate: the last heartbeat received for this producer.
    if ((heartbeatLastCN != null)
        && ((eligibleCN == null) || heartbeatLastCN.newer(eligibleCN)))
    {
      eligibleCN = heartbeatLastCN;
    }
  }

  if (debugEnabled())
  {
    TRACER.debugInfo(
        "In " + "Replication Server " + replicationServer.getReplicationPort() +
        " " + baseDn + " " + replicationServer.getServerId() +
        " getEligibleCN() returns result =" + eligibleCN);
  }
  return eligibleCN;
}
/**
 * Processes a received ChangeTimeHeartbeatMsg: the CN (timestamp) it carries
 * is stored and, when the sender is a data server, the message is forwarded
 * to the connected RSes whose protocol version is at least V3.
 * <p>
 * The work is done under the domain lock. If the calling thread is
 * interrupted while waiting for that lock, its interrupt flag is restored
 * and the message is not processed.
 *
 * @param senderHandler The handler for the server that sent the heartbeat.
 * @param msg The message to process.
 */
public void processChangeTimeHeartbeatMsg(ServerHandler senderHandler,
    ChangeTimeHeartbeatMsg msg )
{
  // Acquire the domain lock (see more details in the comment of the start()
  // method of ServerHandler).
  try
  {
    lock();
  }
  catch (InterruptedException interrupted)
  {
    // Nothing sensible can be done here: restore the interrupt flag so that
    // the interruption is noticed during subsequent IO.
    Thread.currentThread().interrupt();
    return;
  }

  try
  {
    storeReceivedCTHeartbeat(msg.getChangeNumber());

    if (!senderHandler.isDataServer())
    {
      // The message comes from another RS: no forwarding.
      return;
    }

    // This RS is the first one warned: forward the message to the remote
    // replication servers.
    for (ReplicationServerHandler peer : replicationServers.values())
    {
      try
      {
        final boolean atLeastV3 = peer.getProtocolVersion()
            >= ProtocolVersion.REPLICATION_PROTOCOL_V3;
        if (atLeastV3)
        {
          peer.send(msg);
        }
      }
      catch (IOException sendFailure)
      {
        // The peer could not be reached: report it and stop its handler.
        TRACER.debugCaught(DebugLogLevel.ERROR, sendFailure);
        logError(ERR_CHANGELOG_ERROR_SENDING_MSG
            .get("Replication Server "
                + replicationServer.getReplicationPort() + " "
                + baseDn + " " + replicationServer.getServerId()));
        stopServer(peer, false);
      }
    }
  }
  finally
  {
    release();
  }
}
/**
 * Records a change time value received from a data server into the change
 * time heartbeat state of this domain.
 *
 * @param cn The provided change time.
 */
public void storeReceivedCTHeartbeat(ChangeNumber cn)
{
  // TODO: processing could maybe be spared by only storing the CN
  // (timestamp) instead of a whole server state.
  final ServerState heartbeatState = getChangeTimeHeartbeatState();
  heartbeatState.update(cn);
}
/**
 * Counts the changes, server by server:
 * - from a serverState start point
 * - to (inclusive) an end point (the provided endCN).
 * The servers considered are those present in the db state of the domain.
 *
 * @param startState The provided start server state.
 * @param endCN The provided end change number.
 * @return The number of changes between startState and endCN.
 */
public long getEligibleCount(ServerState startState, ChangeNumber endCN)
{
  long total = 0;

  // Walk through the db state of the domain, server by server.
  final ServerState dbState = this.getDbServerState();
  for (int sid : dbState)
  {
    // The start point of this server, or null when startState has none.
    final ChangeNumber startCN = startState.getMaxChangeNumber(sid);

    long countForSid = getCount(sid, startCN, endCN);

    // The start point itself is excluded when counting the ECL eligible
    // changes.
    if ((startCN != null) && (countForSid > 0))
    {
      countForSid--;
    }
    total += countForSid;
  }
  return total;
}
/**
 * Counts the changes, server by server:
 * - from a start CN
 * - to (inclusive) an end point (the provided endCN).
 * The servers considered are those present in the db state of the domain;
 * for each of them the start point is built from the time and sequence
 * number of startCN combined with the ID of that server.
 *
 * @param startCN The provided start changeNumber.
 * @param endCN The provided end change number.
 * @return The number of changes between startCN and endCN.
 */
public long getEligibleCount(ChangeNumber startCN, ChangeNumber endCN)
{
  long total = 0;

  // Walk through the db state of the domain, server by server.
  final ServerState dbState = this.getDbServerState();
  for (int sid : dbState)
  {
    final ChangeNumber startForSid =
        new ChangeNumber(startCN.getTime(), startCN.getSeqnum(), sid);
    total += getCount(sid, startForSid, endCN);
  }
  return total;
}
/**
 * Gives the latest (most recent) trim date among the changelog dbs
 * associated to this domain.
 *
 * @return The latest trim date, or 0 when the domain has no changelog db.
 */
public long getLatestDomainTrimDate()
{
  long latest = 0;
  for (DbHandler db : sourceDbHandlers.values())
  {
    final long trimDate = db.getLatestTrimDate();
    // A value of 0 means that no trim date has been recorded so far.
    if ((latest == 0) || (latest < trimDate))
    {
      latest = trimDate;
    }
  }
  return latest;
}
}