package com.linkedin.databus.client; /* * * Copyright 2013 LinkedIn Corp. All rights reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * */ import java.io.ByteArrayInputStream; import java.nio.channels.Channels; import java.nio.channels.ReadableByteChannel; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Random; import java.util.Set; import javax.management.MBeanServer; import org.apache.log4j.Logger; import com.linkedin.databus.client.ConnectionState.StateId; import com.linkedin.databus.client.netty.RemoteExceptionHandler; import com.linkedin.databus.client.pub.ServerInfo; import com.linkedin.databus.client.pub.mbean.UnifiedClientStats; import com.linkedin.databus.core.Checkpoint; import com.linkedin.databus.core.CheckpointMult; import com.linkedin.databus.core.DatabusComponentStatus; import com.linkedin.databus.core.DbusClientMode; import com.linkedin.databus.core.DbusEvent; import com.linkedin.databus.core.DbusEventBuffer; import com.linkedin.databus.core.DbusEventFactory; import com.linkedin.databus.core.InvalidEventException; import com.linkedin.databus.core.PendingEventTooLargeException; import com.linkedin.databus.core.PullerRetriesExhaustedException; import com.linkedin.databus.core.SCNRegressMessage; import com.linkedin.databus.core.ScnNotFoundException; import com.linkedin.databus.core.async.LifecycleMessage; import com.linkedin.databus.core.data_model.DatabusSubscription; import com.linkedin.databus.core.data_model.LogicalSource; import com.linkedin.databus.core.data_model.LogicalSourceId; import com.linkedin.databus.core.data_model.PhysicalPartition; import com.linkedin.databus.core.monitoring.mbean.DbusEventsStatisticsCollector; import com.linkedin.databus.core.util.IdNamePair; import com.linkedin.databus2.core.BackoffTimer; import com.linkedin.databus2.core.BackoffTimerStaticConfig; import com.linkedin.databus2.core.DatabusException; import com.linkedin.databus2.core.container.monitoring.mbean.HttpStatisticsCollector; import com.linkedin.databus2.core.container.request.BootstrapDatabaseTooOldException; import com.linkedin.databus2.core.container.request.RegisterResponseEntry; import com.linkedin.databus2.core.filter.DbusKeyCompositeFilter; import com.linkedin.databus2.core.filter.DbusKeyCompositeFilterConfig; import com.linkedin.databus2.core.filter.KeyFilterConfigHolder; public class RelayPullThread extends BasePullThread { private final static int RELAY_CALLS_MERGE_FREQ = 10; private static final ArrayList<RegisterResponseEntry> EMPTY_REGISTER_LIST = new ArrayList<RegisterResponseEntry>(); private static final ArrayList<Integer> EMPTY_STREAM_LIST = new ArrayList<Integer>(); private final boolean _isConsumeCurrent ; private final HttpStatisticsCollector _relayCallsStats; //Keep track of the last open relay connection so that we can close it on shutdown private DatabusRelayConnection _lastOpenConnection; private DbusKeyCompositeFilter _relayFilter; private final List<DbusKeyCompositeFilterConfig> _relayFilterConfigs; private int _unmergedHttpCallsStats = 0; private long _streamCallStartMs = 0; // if that much time passes with no events - reset the connection, and pick another server private long _noEventsConnectionResetTimeSec; private long _timeSinceEventsSec = System.currentTimeMillis(); /** * When RelayPullThread falls-off the relay, it goes to the Fell-Off mode and the below retryConfig (_retriesOnFallOff) * control the retries. The fell-off mode is also set when the client starts with a bootstrap-ckpt. * * <pre> * The fell-off mode will only be reset after * (a) Bootstrap got completed * (b) streamFromLastSCN gets triggered. * (c) Server-Set Change which results in tearing the current connection * * * Behavior: * * (a) When bootstrap enabled, * On first SCNNotFoundException, * 1 Fell-Off mode is set. * 2. In Pick_SERVER state, RelayPuller will re-establish the connection (hopefully to new relay in case of VIPS) and retry depending upon the retry-config. * 3. On subsequent retry, any failure could happen (SCNNotFound or ConnClose or any other). All these will still happen in fell-off mode and the _retriesOnFallOff config will control retries. * 4. If the _retriesOnFallOff gets exhausted, will go to bootstrap mode and reset the _retriesOnFallOff counter. * 5. When bootstrap gets completed, Fell_Off mode is reset * * (b) When bootstrap disabled and streamFromLastSCNonFellOff set * 1 Fell-Off mode is set. * 2. In Pick_SERVER state, RelayPuller will re-establish the connection (hopefully to new relay in case of VIPS) and retry depending upon the retry-config. * 3. On subsequent retry, any failure could happen (SCNNotFound or ConnClose or any other). All these will still happen in fell-off mode and the _retriesOnFallOff config will control retries. * 4. If the _retriesOnFallOff gets exhausted, will set the flag to make current relay stream from the latest SCN and resets fell-Off mode * * (c) When bootstrap disabled and streamFromLastSCNonFellOff not set * 1. Fell-Off mode is set. * 2. In Pick_SERVER state, RelayPuller will re-establish the connection (hopefully to new relay in case of VIPS) and retry depending upon the retry-config. * 3. On subsequent retry, any failure could happen (SCNNotFound or ConnClose or any other). All these will still happen in fell-off mode and the _retriesOnFallOff config will control retries. * 4. If the _retriesOnFallOff gets exhausted, the relay-pull thread gets suspended. * </pre> */ private final BackoffTimer _retriesOnFallOff; final private RemoteExceptionHandler _remoteExceptionHandler; private final boolean _isReadLatestScnOnErrorEnabled; private final double _pullerBufferUtilizationPct ; public RelayPullThread(String name, DatabusSourcesConnection sourcesConn, DbusEventBuffer dbusEventBuffer, ConnectionStateFactory connStateFactory, Set<ServerInfo> relays, List<DbusKeyCompositeFilterConfig> relayFilterConfigs, boolean isConsumeCurrent, boolean isReadLatestScnOnErrorEnabled, double pullerBufferUtilPct, int noEventsConnectionResetTimeSec, MBeanServer mbeanServer, DbusEventFactory eventFactory, Logger log) { super(name, sourcesConn.getConnectionConfig().getPullerRetries(), sourcesConn, dbusEventBuffer, connStateFactory, relays, mbeanServer, eventFactory, log); _relayFilterConfigs = relayFilterConfigs; _isConsumeCurrent = isConsumeCurrent; _remoteExceptionHandler = new RemoteExceptionHandler(sourcesConn, dbusEventBuffer, eventFactory); _relayCallsStats = _sourcesConn.getLocalRelayCallsStatsCollector(); _isReadLatestScnOnErrorEnabled = isReadLatestScnOnErrorEnabled; _pullerBufferUtilizationPct = pullerBufferUtilPct; _retriesOnFallOff = new BackoffTimer("RetriesOnFallOff", new BackoffTimerStaticConfig(0, 0, 1, 0, sourcesConn.getConnectionConfig().getNumRetriesOnFallOff())); _noEventsConnectionResetTimeSec = noEventsConnectionResetTimeSec; } @Override protected void onResume() { _currentState.switchToPickServer(); enqueueMessage(_currentState); } @Override protected void onShutdown() { if (null != _lastOpenConnection) { _log.info("closing open connection during onShutdown()"); _lastOpenConnection.close(); _lastOpenConnection = null; _log.info("Closed open connection during onShutdown()"); } } /** * Invoked when a LifeCycle message of type "SUSPEND_ON_ERROR" is received by relay puller thread * as defined in AbstractActorMessageQueue#executeAndChangeState(Object) * * 1. Invoke same method in super class * 2. Send an "I'm dead" heartbeat value as a failsafe */ @Override protected void doSuspendOnError(LifecycleMessage lcMessage) { super.doSuspendOnError(lcMessage); sendHeartbeat(_sourcesConn.getUnifiedClientStats(), -1); } @Override protected boolean shouldDelayTearConnection(StateId stateId) { boolean delayTear = false; switch(stateId) { case SOURCES_REQUEST_SENT: case SOURCES_RESPONSE_SUCCESS: case SOURCES_RESPONSE_ERROR: case SOURCES_REQUEST_ERROR: case REGISTER_REQUEST_SENT : case REGISTER_RESPONSE_SUCCESS: case REGISTER_RESPONSE_ERROR: case REGISTER_REQUEST_ERROR: case STREAM_REQUEST_SENT: case STREAM_REQUEST_SUCCESS: case STREAM_RESPONSE_ERROR: case STREAM_REQUEST_ERROR: case BOOTSTRAP_REQUESTED: delayTear = true; break; default: delayTear = false; break; } return delayTear; } @Override protected boolean executeAndChangeState(Object message) { boolean success = true; if (message instanceof ConnectionStateMessage) { if (_componentStatus.getStatus() != DatabusComponentStatus.Status.RUNNING) { _log.warn(" not running: " + message.toString()); } else { ConnectionStateMessage stateMsg = (ConnectionStateMessage)message; ConnectionState currentState = stateMsg.getConnState(); switch (stateMsg.getStateId()) { case INITIAL: break; case CLOSED: shutdown(); break; case PICK_SERVER: doPickRelay(currentState); break; case REQUEST_SOURCES: doRequestSources(currentState); break; case SOURCES_RESPONSE_SUCCESS: doSourcesResponseSuccess(currentState); break; case REQUEST_REGISTER: doRequestRegister(currentState); break; case REGISTER_RESPONSE_SUCCESS: doRegisterResponseSuccess(currentState); break; case BOOTSTRAP: doBootstrap(currentState); break; case REQUEST_STREAM: doRequestStream(currentState); break; case STREAM_REQUEST_SUCCESS: doReadDataEvents(currentState); break; case STREAM_RESPONSE_DONE: doStreamResponseDone(currentState); break; case SOURCES_REQUEST_ERROR: processSourcesRequestError(currentState); break; case SOURCES_RESPONSE_ERROR: processSourcesResponseError(currentState); break; case REGISTER_REQUEST_ERROR: processRegisterRequestError(currentState); break; case REGISTER_RESPONSE_ERROR: processRegisterResponseError(currentState); break; case STREAM_REQUEST_ERROR: processStreamRequestError(currentState); break; case STREAM_RESPONSE_ERROR: processStreamResponseError(currentState); break; default: { _log.error("Unknown state in RelayPullThread: " + currentState.getStateId()); success = false; break; } } } } else if (message instanceof BootstrapResultMessage) { BootstrapResultMessage bootstrapResultMessage = (BootstrapResultMessage)message; switch (bootstrapResultMessage.getTypeId()) { case BOOTSTRAP_COMPLETE: doBootstrapComplete(bootstrapResultMessage); break; case BOOTSTRAP_FAILED: doBootstrapFailed(bootstrapResultMessage); break; default: { _log.error("Unknown BootstrapResultChangeMessage in RelayPullThread: " + bootstrapResultMessage.getTypeId()); success = false; break; } } } else { success = super.executeAndChangeState(message); } return success; } private void doBootstrapFailed(BootstrapResultMessage bootstrapResultMessage) { _log.error("bootstrap failed", bootstrapResultMessage.getFailureReason()); if (toTearConnAfterHandlingResponse()) { tearConnectionAndEnqueuePickServer(); } else { _currentState.switchToBootstrap(_currentState.getCheckpoint()); enqueueMessage(_currentState); } } private void doBootstrapComplete(BootstrapResultMessage bootstrapResultMessage) { Checkpoint cp = bootstrapResultMessage.getBootstrapCheckpoint(); // we must have persisted some checkpoint at the end of the bootstrap phase if (null == cp) { bootstrapResultMessage.switchToBootstrapFailed( new RuntimeException("No persistent checkpoint found at the end of bootstrap!")); doBootstrapFailed(bootstrapResultMessage); } else { _currentState.setRelayFellOff(false); DbusClientMode consumptionMode = cp.getConsumptionMode(); _log.info("Bootstrap completed: " + "Consumption Mode=" + consumptionMode + " startScn=" + cp.getBootstrapStartScn() + " targetScn=" + cp.getBootstrapTargetScn() + " sinceScn=" + cp.getBootstrapSinceScn() + " windowScn=" + cp.getWindowScn()); UnifiedClientStats unifiedClientStats = _sourcesConn.getUnifiedClientStats(); if (unifiedClientStats != null) { // should always be false, but just in case of weird edge cases: boolean isBootstrapping = (consumptionMode == DbusClientMode.BOOTSTRAP_SNAPSHOT || consumptionMode == DbusClientMode.BOOTSTRAP_CATCHUP); unifiedClientStats.setBootstrappingState(isBootstrapping); } cp.resetBootstrap(); // clear Bootstrap scns for future bootstraps if ( toTearConnAfterHandlingResponse()) { tearConnectionAndEnqueuePickServer(); } else { _currentState.switchToRequestStream(cp); enqueueMessage(_currentState); } } } @Override protected void doStart(LifecycleMessage lcMessage) { _log.info("Relay Puller doStart "); if (_currentState.getStateId() != ConnectionState.StateId.INITIAL) { return; } super.doStart(lcMessage); _currentState.switchToPickServer(); enqueueMessage(_currentState); } protected void doPickRelay(ConnectionState curState) { int serversNum = _servers.size(); if (0 == serversNum) { enqueueMessage(LifecycleMessage.createSuspendOnErroMessage( new DatabusException("No relays specified"))); return; } Random rng = new Random(); DatabusRelayConnection relayConn = null; ServerInfo serverInfo = null; int retriesLeft; BackoffTimer originalCounter =_status.getRetriesCounter(); if( curState.isRelayFellOff()) _status.setRetriesCounter(_retriesOnFallOff); while (null == relayConn && (retriesLeft = _status.getRetriesLeft()) >= 0 && !checkForShutdownRequest()) { _log.info("picking a relay; retries left:" + retriesLeft + ", Backoff Timer :" + _status.getRetriesCounter() + ", Are we retrying because of SCNNotFoundException : " + curState.isRelayFellOff()); backoffOnPullError(); _curServerIdx = (_curServerIdx < 0) ? rng.nextInt(serversNum) : (_curServerIdx + 1) % serversNum; Iterator<ServerInfo> setIter = _servers.iterator(); for (int i = 0; i <= _curServerIdx; ++i) serverInfo = setIter.next(); try { relayConn = _sourcesConn.getRelayConnFactory().createRelayConnection( serverInfo, this, _remoteExceptionHandler); _log.info("picked a relay:" + serverInfo.toSimpleString()); } catch (Exception e) { _log.error("Unable to get connection to relay:" + serverInfo.toSimpleString(), e); } } _status.setRetriesCounter(originalCounter); if (!checkForShutdownRequest()) { _curServer = serverInfo; if (null == relayConn) { if (_currentState.isRelayFellOff()) { boolean enqueueMessage = false; try { enqueueMessage = onRelayFellOff(curState, curState.getCheckpoint(), new ScnNotFoundException("Retries on SCNNotFoundException exhausted !!")); } catch (InterruptedException ie) { _log.error("interrupted while processing onRelayFellOff", ie); curState.switchToPickServer(); enqueueMessage(curState); } catch (InvalidEventException e) { _log.error("error trying to notify dispatcher of bootstrapping :" + e.getMessage(), e); curState.switchToPickServer(); enqueueMessage(curState); } if ( enqueueMessage ) enqueueMessage(curState); } else { // There are no retries left. Invoke an onError callback try { _log.info("Puller retries exhausted. Injecting an error event on dispatcher queue to invoke onError callback"); _remoteExceptionHandler.handleException(new PullerRetriesExhaustedException()); } catch (InterruptedException ie){ _log.error("Interrupted while processing retries exhausted", ie); } catch (InvalidEventException e) { _log.error("Error trying to notify dispatcher of puller retries getting exhausted", e); } _log.error("Cannot find a relay"); } } else { DatabusRelayConnection oldRelayConn = curState.getRelayConnection(); if ( null != oldRelayConn) { resetConnectionAndSetFlag(); } sendHeartbeat(_sourcesConn.getUnifiedClientStats()); _log.info("Relay Puller switching to request sources"); curState.switchToRequestSources(serverInfo, serverInfo.getAddress(), relayConn); _lastOpenConnection = relayConn; enqueueMessage(curState); } } } protected void doRequestSources(final ConnectionState curState) { if (null != _relayCallsStats) _relayCallsStats.registerSourcesCall(); _log.debug("Sending /sources request"); curState.switchToSourcesRequestSent(); sendHeartbeat(_sourcesConn.getUnifiedClientStats()); curState.getRelayConnection().requestSources(curState); } /** * Build the string of subs to be sent in the register call. * */ private String buildSubsList(List<DatabusSubscription> subs, Map<String, IdNamePair> sourceNameMap) { StringBuilder sb = new StringBuilder(128); sb.append("["); boolean first = true; for(DatabusSubscription sub :subs) { if(!first) sb.append(','); DatabusSubscription realSub = sub; LogicalSource ls = sub.getLogicalSource(); if (!ls.idKnown() && !ls.isWildcard()) { IdNamePair sourceEntry = sourceNameMap.get(ls.getName()); if (null == sourceEntry) { //this should never happen throw new RuntimeException("FATAL! unable to find logical source " + ls.getName() + " in " + sourceNameMap); } realSub = new DatabusSubscription( sub.getPhysicalSource(), sub.getPhysicalPartition(), new LogicalSourceId(new LogicalSource(sourceEntry.getId().intValue(), ls.getName()), sub.getLogicalPartition().getId())); } sb.append(realSub.toJsonString()); first = false; } sb.append("]"); return sb.toString(); } protected void doSourcesResponseSuccess(ConnectionState curState) { mergeRelayCallsStats(); Map<String, IdNamePair> sourceNameMap = curState.getSourcesNameMap(); StringBuilder sb = new StringBuilder(); boolean firstSource = true; boolean error = false; List<IdNamePair> sourcesList = new ArrayList<IdNamePair>(_sourcesConn.getSourcesNames().size()); if (curState.getRelayConnection().getProtocolVersion() < 3) { // Generate the list of sources for the sources= parameter to the /stream call. // This is needed only for version 2 of the protocol. For version 3 and higher, // we pass a list of subscriptions. for (String sourceName: _sourcesConn.getSourcesNames()) { IdNamePair source = sourceNameMap.get(sourceName); if (null == source) { _log.error("Source not found on server: " + sourceName); error = true; break; } else { if (! firstSource) sb.append(','); sb.append(source.getId().toString()); firstSource = false; sourcesList.add(source); } } } String sourcesIdList = sb.toString(); String subsString = error ? "ERROR" : buildSubsList(curState.getSubscriptions(), sourceNameMap); if (_log.isDebugEnabled()) { _log.debug("Source ids: " + sourcesIdList); _log.debug("Subs : " + subsString); } _sourcesConn.getRelayDispatcher().enqueueMessage( SourcesMessage.createSetSourcesIdsMessage(curState.getSources(), sourcesIdList)); if (toTearConnAfterHandlingResponse()) { tearConnectionAndEnqueuePickServer(); } else { if (error) { curState.switchToPickServer(); } else { curState.switchToRequestSourcesSchemas(sourcesIdList, subsString); if (_sourcesConn.isBootstrapEnabled()) { _sourcesConn.getBootstrapPuller().enqueueMessage( SourcesMessage.createSetSourcesIdsMessage(curState.getSources(), curState.getSourcesIdListString())); } String hostHdr = curState.getHostName(), svcHdr = curState.getSvcName(); _log.info("Connected to relay " + hostHdr + " with a service identifier " + svcHdr); } enqueueMessage(curState); } } protected void doRequestRegister(ConnectionState curState) { if (null != _relayCallsStats) _relayCallsStats.registerRegisterCall(EMPTY_REGISTER_LIST); _log.debug("Sending /sources request"); curState.swichToRegisterRequestSent(); sendHeartbeat(_sourcesConn.getUnifiedClientStats()); curState.getRelayConnection().requestRegister(curState.getSourcesIdListString(), curState); } protected void doRegisterResponseSuccess(ConnectionState curState) { boolean enqueueMessage = true; mergeRelayCallsStats(); if(curState.getSourcesSchemas().size() < _sourcesConn.getSourcesNames().size()) { _log.error("Expected " + _sourcesConn.getSourcesNames().size() + " schemas, got: " + curState.getSourcesSchemas().size()); curState.switchToPickServer(); } else { _sourcesConn.getRelayDispatcher().enqueueMessage( SourcesMessage.createSetSourcesSchemasMessage( curState.getSourcesSchemas(), curState.getMetadataSchemas() ) ); // Determine the checkpoint for read events in the following order // 1. Existing checkpoint in the current state // 2. Checkpoint persisted on disk // 3. New checkpoint Checkpoint cp = _currentState.getCheckpoint(); if (null == cp) { _log.info("no existing checkpoint found; attempting to load persistent checkpoint"); cp = _sourcesConn.loadPersistentCheckpoint(); } if (null == cp) { _log.info(getName() + ": no checkpoint found"); cp = new Checkpoint(); // set the mode to streaming first so relay will inspect the scn cp.setConsumptionMode(DbusClientMode.ONLINE_CONSUMPTION); // setting windowScn makes server to return scn not found error cp.setWindowScn(0L); cp.clearBootstrapStartScn(); if (_isConsumeCurrent) { cp.setFlexible(); _log.info("Setting flexible checkpoint: consumeCurrent is true"); } } else { _log.info("persisted checkpoint loaded: " + cp.toString()); } if ( cp.getFlexible()) curState.setFlexibleCheckpointRequest(true); if (toTearConnAfterHandlingResponse()) { tearConnectionAndEnqueuePickServer(); enqueueMessage = false; } else { if (_sourcesConn.isBootstrapEnabled()) { _sourcesConn.getBootstrapPuller().enqueueMessage( SourcesMessage.createSetSourcesSchemasMessage( curState.getSourcesSchemas(), curState.getMetadataSchemas())); } if (DbusClientMode.BOOTSTRAP_SNAPSHOT == cp.getConsumptionMode() || DbusClientMode.BOOTSTRAP_CATCHUP == cp.getConsumptionMode()) { curState.setRelayFellOff(true); if (_sourcesConn.isBootstrapEnabled()) { curState.switchToBootstrap(cp); } else { String message = "bootstrap checkpoint found but bootstrapping is disabled:" + cp; _log.error(message); _status.suspendOnError(new DatabusException(message)); enqueueMessage = false; } } else { if (cp.getWindowOffset() > 0) { // switched when in middle of Window _log.info("RelayPuller reconnecting when in middle of event window. Will regress. Current Checkpoint :" + cp); if (cp.getPrevScn() > 0) { cp.setWindowScn(cp.getPrevScn()); cp.setWindowOffset(-1); curState.setSCNRegress(true); } else if (curState.isFlexibleCheckpointRequest()) { _log.info("Switched relays after reading partial window with flexible checkpoint !!"); cp.setFlexible(); curState.setSCNRegress(true); } else { _log.fatal("Checkpoint does not have prevSCN !!. Suspending !! Checkpoint :" + cp); enqueueMessage(LifecycleMessage.createSuspendOnErroMessage( new Exception("Checkpoint does not have prevSCN !!. Suspending !! Checkpoint :" + cp))); enqueueMessage = false; } } if ( enqueueMessage ) curState.switchToRequestStream(cp); } } } if ( enqueueMessage) enqueueMessage(curState); } protected void doRequestStream(ConnectionState curState) { boolean debugEnabled = _log.isDebugEnabled(); if (debugEnabled) _log.debug("Checking for free space in buffer"); int freeBufferThreshold=(int)(_sourcesConn.getConnectionConfig().getFreeBufferThreshold() * 100.0 / _pullerBufferUtilizationPct); try { curState.getDataEventsBuffer().waitForFreeSpace(freeBufferThreshold); } catch (InterruptedException ie) { //loop enqueueMessage(curState); return; } Checkpoint cp = curState.getCheckpoint(); if (debugEnabled) _log.debug("Checkpoint at RequestDataEvents: " + cp.toString()); if ( null == _relayFilter) { if (debugEnabled) _log.debug("Initializing relay filter config"); _relayFilter = new DbusKeyCompositeFilter(); Map<String, IdNamePair> srcNameIdMap = curState.getSourcesNameMap(); for (DbusKeyCompositeFilterConfig conf : _relayFilterConfigs) { Map<String, KeyFilterConfigHolder> cMap = conf.getConfigMap(); Map<Long, KeyFilterConfigHolder> fConfMap = new HashMap<Long, KeyFilterConfigHolder>(); for ( Entry<String, KeyFilterConfigHolder> e : cMap.entrySet()) { IdNamePair idName = srcNameIdMap.get(e.getKey()); if ( null != idName) { fConfMap.put(idName.getId(),e.getValue()); } } if (debugEnabled) _log.debug("FilterConfMap is :" + fConfMap); _relayFilter.merge(new DbusKeyCompositeFilter(fConfMap)); } if (debugEnabled) _log.debug("Merged Filter (before deduping) is :" + _relayFilter); _relayFilter.dedupe(); if (debugEnabled) _log.debug("Merged Filter (after deduping) is :" + _relayFilter); } _streamCallStartMs = System.currentTimeMillis(); if (null != _relayCallsStats) _relayCallsStats.registerStreamRequest(cp, EMPTY_STREAM_LIST); int fetchSize = (int)((curState.getDataEventsBuffer().getBufferFreeReadSpace() / 100.0) * _pullerBufferUtilizationPct); fetchSize = Math.max(freeBufferThreshold, fetchSize); CheckpointMult cpMult = new CheckpointMult(); String args; if (curState.getRelayConnection().getProtocolVersion() >= 3) { // for version 3 and higher we pass subscriptions args = curState.getSubsListString(); for (DatabusSubscription sub : curState.getSubscriptions()) { PhysicalPartition p = sub.getPhysicalPartition(); cpMult.addCheckpoint(p, cp); } } else { args = curState.getSourcesIdListString(); cpMult.addCheckpoint(PhysicalPartition.ANY_PHYSICAL_PARTITION, cp); } curState.switchToStreamRequestSent(); sendHeartbeat(_sourcesConn.getUnifiedClientStats()); curState.getRelayConnection().requestStream( args, _relayFilter, fetchSize, cpMult, _sourcesConn.getConnectionConfig().getKeyRange(), curState); } protected void doReadDataEvents(ConnectionState curState) { boolean debugEnabled = _log.isDebugEnabled(); boolean enqueueMessage = true; try { ChunkedBodyReadableByteChannel readChannel = curState.getReadChannel(); Checkpoint cp = curState.getCheckpoint(); curState.setRelayFellOff(false); String remoteErrorName = RemoteExceptionHandler.getExceptionName(readChannel); Throwable knownRemoteError = _remoteExceptionHandler.getException(readChannel); if (null != knownRemoteError && knownRemoteError instanceof ScnNotFoundException) { if (toTearConnAfterHandlingResponse()) { tearConnectionAndEnqueuePickServer(); enqueueMessage = false; } else { curState.setRelayFellOff(true); if ( _retriesOnFallOff.getRemainingRetriesNum() > 0 ) { _log.error("Got SCNNotFoundException. Retry (" + _retriesOnFallOff.getRetriesNum() + ") out of " + _retriesOnFallOff.getConfig().getMaxRetryNum()); curState.switchToPickServer(); } else { enqueueMessage = onRelayFellOff(curState, cp, knownRemoteError); } } } else if (null != remoteErrorName) { if (toTearConnAfterHandlingResponse()) { tearConnectionAndEnqueuePickServer(); enqueueMessage = false; } else { //remote processing error _log.error("read events error: " + RemoteExceptionHandler.getExceptionMessage(readChannel)); curState.switchToStreamResponseError(); } } else { /*DispatcherState dispatchState = curState.getDispatcherState(); dispatchState.switchToDispatchEvents(); _dispatcherThread.addNewStateBlocking(dispatchState);*/ if (debugEnabled) _log.debug("Sending events to buffer"); DbusEventsStatisticsCollector connCollector = _sourcesConn.getInboundEventsStatsCollector(); if (curState.isSCNRegress()) { _log.info("SCN Regress requested !! Sending a SCN Regress Message to dispatcher. Curr Ckpt :" + curState.getCheckpoint()); DbusEvent regressEvent = getEventFactory().createSCNRegressEvent(new SCNRegressMessage(curState.getCheckpoint())); writeEventToRelayDispatcher(curState, regressEvent, "SCN Regress Event from ckpt :" + curState.getCheckpoint()); curState.setSCNRegress(false); } UnifiedClientStats unifiedClientStats = _sourcesConn.getUnifiedClientStats(); if (unifiedClientStats != null) { unifiedClientStats.setBootstrappingState(false); // failsafe: we're definitely not bootstrapping here sendHeartbeat(unifiedClientStats); } int eventsNum = curState.getDataEventsBuffer().readEvents(readChannel, curState.getListeners(), connCollector); boolean resetConnection = false; if (eventsNum > 0) { _timeSinceEventsSec = System.currentTimeMillis(); cp.checkPoint(); } else { // no need to checkpoint if nothing returned from relay // check how long it has been since we got some events if (_remoteExceptionHandler.getPendingEventSize(readChannel) > curState.getDataEventsBuffer().getMaxReadBufferCapacity()) { // The relay had a pending event that we can never accommodate. This is fatal error. String err = "ReadBuffer max capacity(" + curState.getDataEventsBuffer().getMaxReadBufferCapacity() + ") is less than event size(" + _remoteExceptionHandler.getPendingEventSize(readChannel) + "). Increase databus.client.connectionDefaults.eventBuffer.maxEventSize and restart."; _log.fatal(err); enqueueMessage(LifecycleMessage.createSuspendOnErroMessage(new PendingEventTooLargeException(err))); return; } else { if(_noEventsConnectionResetTimeSec > 0) { // unless the feature is disabled (<=0) resetConnection = (System.currentTimeMillis() - _timeSinceEventsSec)/1000 > _noEventsConnectionResetTimeSec; if(resetConnection) { _timeSinceEventsSec = System.currentTimeMillis(); _log.warn("about to reset connection to relay " + curState.getServerInetAddress() + ", because there were no events for " + _noEventsConnectionResetTimeSec + "secs"); } } } } if (debugEnabled) _log.debug("Events read: " + eventsNum); // if it has been too long since we got non-empty responses - the relay may be stuck, try to reconnect if (toTearConnAfterHandlingResponse() || resetConnection) { tearConnectionAndEnqueuePickServer(); enqueueMessage = false; } else { curState.switchToStreamResponseDone(); resetServerRetries(); } } if ( enqueueMessage) enqueueMessage(curState); } catch (InterruptedException ie) { _log.warn("interrupted", ie); curState.switchToStreamResponseError(); enqueueMessage(curState); } catch (InvalidEventException e) { _log.error("error reading events from server:" + e, e); curState.switchToStreamResponseError(); enqueueMessage(curState); } catch (RuntimeException e) { _log.error("runtime error reading events from server: " + e, e); curState.switchToStreamResponseError(); enqueueMessage(curState); } } protected void doStreamResponseDone(ConnectionState curState) { Checkpoint cp = curState.getCheckpoint(); if (_log.isDebugEnabled()) { _log.debug("Data events done"); _log.debug("Checkpoint at EventsDone: " + cp.toString()); } if (curState.isFlexibleCheckpointRequest() && (curState.getDataEventsBuffer().getSeenEndOfPeriodScn() > 0 )) curState.setFlexibleCheckpointRequest(false); if (null != _sourcesConn.getLocalRelayCallsStatsCollector()) { long streamDuration = System.currentTimeMillis() - _streamCallStartMs; _sourcesConn.getLocalRelayCallsStatsCollector().registerStreamResponse(streamDuration); mergeRelayCallsStats(); } _status.getRetriesCounter().sleep(); curState.switchToRequestStream(cp); enqueueMessage(curState); } protected void doBootstrap(ConnectionState curState) { if (null != _lastOpenConnection) { _lastOpenConnection.close(); _lastOpenConnection = null; } Checkpoint bootstrapCkpt = null; if (_sourcesConn.getBootstrapPuller() == null) { _log.warn("doBootstrap got called, but BootstrapPullThread is null. Is bootstrap disabled?"); return; } try { bootstrapCkpt = curState.getCheckpoint().clone(); } catch (Exception e) { String msg = "Error copying checkpoint:" + curState.getCheckpoint(); _log.error(msg, e); BootstrapResultMessage bootstrapResultMessage = BootstrapResultMessage.createBootstrapFailedMessage(e); doBootstrapFailed(bootstrapResultMessage); return; } if (!bootstrapCkpt.isBootstrapStartScnSet()) { bootstrapCkpt = curState.getBstCheckpointHandler().createInitialBootstrapCheckpoint(bootstrapCkpt, bootstrapCkpt.getWindowScn()); //bootstrapCkpt.setBootstrapSinceScn(Long.valueOf(bootstrapCkpt.getWindowScn())); } _log.info("Bootstrap begin: sinceScn=" + bootstrapCkpt.getWindowScn()); CheckpointMessage bootstrapCpMessage = CheckpointMessage.createSetCheckpointMessage(bootstrapCkpt); _sourcesConn.getBootstrapPuller().enqueueMessage(bootstrapCpMessage); try { Checkpoint cpForDispatcher = new Checkpoint(bootstrapCkpt.toString()); cpForDispatcher.setConsumptionMode(DbusClientMode.BOOTSTRAP_SNAPSHOT); DbusEvent cpEvent = getEventFactory().createCheckpointEvent(cpForDispatcher); writeEventToRelayDispatcher(curState, cpEvent, "Control Event to start bootstrap"); curState.switchToBootstrapRequested(); } catch (InterruptedException ie) { _log.error("Got interrupted while writing control message to bootstrap !!", ie); enqueueMessage(LifecycleMessage.createSuspendOnErroMessage(ie)); } catch (Exception e) { enqueueMessage(LifecycleMessage.createSuspendOnErroMessage(e)); _log.error("Exception occured while switching to bootstrap: ", e); } //wait for bootstrap to finish /*boolean bootstrapDone = false; while (! bootstrapDone) { try { bootstrapPullerThread.join(); bootstrapDone = true; } catch (InterruptedException ie) {} }*/ } private void mergeRelayCallsStats() { if (null != _sourcesConn.getLocalRelayCallsStatsCollector() && null != _sourcesConn.getRelayCallsStatsCollector() && _unmergedHttpCallsStats >= RELAY_CALLS_MERGE_FREQ) { _sourcesConn.getRelayCallsStatsCollector().merge(_sourcesConn.getLocalRelayCallsStatsCollector()); _sourcesConn.getLocalRelayCallsStatsCollector().reset(); _unmergedHttpCallsStats = 0; } else { ++_unmergedHttpCallsStats; } } private void processSourcesRequestError(ConnectionState state) { _log.info("Sources Request Error"); if (null != _relayCallsStats) _relayCallsStats.registerInvalidSourceRequest(); if (toTearConnAfterHandlingResponse()) { tearConnectionAndEnqueuePickServer(); } else { state.switchToPickServer(); enqueueMessage(state); } } private void processSourcesResponseError(ConnectionState state) { _log.info("Sources Response Error"); if (null != _relayCallsStats) _relayCallsStats.registerInvalidSourceRequest(); if (toTearConnAfterHandlingResponse()) { tearConnectionAndEnqueuePickServer(); } else { state.switchToPickServer(); enqueueMessage(state); } } private void processRegisterRequestError(ConnectionState state) { _log.info("Register Request Error"); if (null != _relayCallsStats) _relayCallsStats.registerInvalidRegisterCall(); if (toTearConnAfterHandlingResponse()) { tearConnectionAndEnqueuePickServer(); } else { state.switchToPickServer(); enqueueMessage(state); } } private void processRegisterResponseError(ConnectionState state) { _log.info("Register Response Error"); if (null != _relayCallsStats) _relayCallsStats.registerInvalidRegisterCall(); if (toTearConnAfterHandlingResponse()) { tearConnectionAndEnqueuePickServer(); } else { state.switchToPickServer(); enqueueMessage(state); } } private void processStreamRequestError(ConnectionState state) { _log.info("Stream Request Error"); if (null != _relayCallsStats) _relayCallsStats.registerInvalidStreamRequest(); if (toTearConnAfterHandlingResponse()) { tearConnectionAndEnqueuePickServer(); } else { state.switchToPickServer(); enqueueMessage(state); } } private void processStreamResponseError(ConnectionState state) { _log.info("Stream Response Error"); if (null != _relayCallsStats) _relayCallsStats.registerInvalidStreamRequest(); if (toTearConnAfterHandlingResponse()) { tearConnectionAndEnqueuePickServer(); } else { state.switchToPickServer(); enqueueMessage(state); } } @Override protected void resetConnection() { DatabusServerConnection conn = _currentState.getRelayConnection(); if ( null != conn) { conn.close(); _currentState.setRelayConnection(null); } } private boolean onRelayFellOff(ConnectionState curState, Checkpoint cp, Throwable knownRemoteError) throws InvalidEventException, InterruptedException { boolean enqueueMessage = true; _log.error("Retries on SCNNotFoundException exhausted !!"); _retriesOnFallOff.reset(); //need to bootstrap, suspend or read From Latest SCN if (!_sourcesConn.isBootstrapEnabled()) { _log.error("No scn found on relay while no bootstrap services provided:"); _log.error(" bootstrapServices=" + _sourcesConn.getBootstrapServices() + "; bootstrapRegistrations=" + _sourcesConn.getBootstrapRegistrations()); if (_isReadLatestScnOnErrorEnabled) { // Read From Latest SCN on Error _log.error("Read Latest SCN Window on SCNNotFoundException is enabled. Will start reading from the lastest SCN Window !!"); curState.getRelayConnection().enableReadFromLatestScn(true); _currentState.setRelayFellOff(false); curState.switchToStreamResponseDone(); } else { _log.fatal("Got SCNNotFoundException but Read Latest SCN Window and bootstrap are disabled !!"); _remoteExceptionHandler.handleException(new BootstrapDatabaseTooOldException(knownRemoteError)); enqueueMessage = false; } } else { _log.info("Requested scn " + cp.getWindowScn() + " not found on relay; switching to bootstrap service"); curState.switchToBootstrap(cp); UnifiedClientStats unifiedClientStats = _sourcesConn.getUnifiedClientStats(); if (unifiedClientStats != null) { unifiedClientStats.setBootstrappingState(true); } } return enqueueMessage; } @Override protected void tearConnection() { _currentState.setRelayFellOff(false); super.tearConnection(); } /** * allows adjusting noEvents threshold dynamically * @param noEventsConnectionResetTimeSec - new threshold */ public void setNoEventsConnectionResetTimeSec(long noEventsConnectionResetTimeSec) { _noEventsConnectionResetTimeSec = noEventsConnectionResetTimeSec; } protected BackoffTimer getRetryonFallOff() { return _retriesOnFallOff; } private void writeEventToRelayDispatcher(ConnectionState curState, DbusEvent event, String message) throws InterruptedException, InvalidEventException { boolean success = false; // Create a infinite backoff timer that waits for maximum of 1 sec // for writing the control message to evb BackoffTimerStaticConfig timerConfig = new BackoffTimerStaticConfig( 1, 1000, 1, 1, -1); BackoffTimer timer = new BackoffTimer("EVB More Space Timer", timerConfig); timer.reset(); byte[] eventBytes = new byte[event.size()]; _log.info("Event size: " + eventBytes.length); _log.info("Event:" + event.toString()); event.getRawBytes().get(eventBytes); UnifiedClientStats unifiedClientStats = _sourcesConn.getUnifiedClientStats(); while ((!success) && (timer.getRemainingRetriesNum() > 0)) { ByteArrayInputStream cpIs = new ByteArrayInputStream( eventBytes); ReadableByteChannel cpRbc = Channels.newChannel(cpIs); sendHeartbeat(unifiedClientStats); int ecnt = curState.getDataEventsBuffer().readEvents(cpRbc); if (ecnt <= 0) { _log.error("Not enough spece in the event buffer to add a control message :" + message); boolean interrupted = !timer.backoffAndSleep(); if (interrupted) throw new InterruptedException( "Got interrupted while waiting to write control Message to EVB : " + message); } else { _log.info("Sent a control message :" + message); success = true; } } } public DatabusRelayConnection getLastOpenConnection() { return _lastOpenConnection; } }