/** * Licensed to Cloudera, Inc. under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. Cloudera, Inc. licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.cloudera.flume.agent; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.concurrent.CountDownLatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.cloudera.flume.agent.durability.WALCompletionNotifier; import com.cloudera.flume.conf.FlumeConfiguration; import com.cloudera.flume.conf.FlumeSpecException; import com.cloudera.flume.conf.FlumeConfigData; import com.cloudera.flume.handlers.endtoend.AckListener.Empty; import com.cloudera.util.Clock; import com.google.common.base.Preconditions; /** * This manages heartbeating the node with the master, notifications of * configuration updates, and spawning/decommissioning of logical nodes. * * TODO (jon) rename to HeartbeatManager */ public class LivenessManager { static final Logger LOG = LoggerFactory.getLogger(LivenessManager.class); final long BACKOFF_MILLIS; MasterRPC master; LogicalNodeManager nodesman; HeartbeatThread t; final WALAckManager ackcheck; final WALCompletionNotifier walman; class RetryAckListener extends Empty { @Override public void end(String group) throws IOException { walman.toAcked(group); } @Override public void expired(String group) throws IOException { walman.retry(group); } }; /** * Create a liveness manager with the specified managers. * * LogicalNodeManager is necessary for tracking physical/logical node * mappings. MasterRPC is the connection to the master, WALCompletionNotifier * is necessary for check on acks */ public LivenessManager(LogicalNodeManager nodesman, MasterRPC master, WALCompletionNotifier walman) { Preconditions.checkNotNull(nodesman); Preconditions.checkNotNull(master); BACKOFF_MILLIS = FlumeConfiguration.get().getHeartbeatBackoff(); this.walman = walman; this.nodesman = nodesman; this.master = master; this.t = new HeartbeatThread(); this.ackcheck = new WALAckManager(master, new RetryAckListener(), FlumeConfiguration.get().getAgentAckedRetransmit()); } /** * Checks against the master to get new physical nodes or to learn about * decommissioned logical nodes * * Invariant: There is always at least logical per physical node. When there * is one, it has the same name as the physical node. */ public void checkLogicalNodes() throws IOException { // TODO (jon) Make this a single batched rpc call instead of // multiple calls String physNode = nodesman.getPhysicalNodeName(); // get logical nodes list for this node. List<String> lns = master.getLogicalNodes(physNode); if (!lns.contains(physNode)) { // physical node node present? make sure it stays around. lns = new ArrayList<String>(lns); // copy the unmodifiable list lns.add(physNode); } for (String ln : lns) { // a logical node is not present? spawn it. if (nodesman.get(ln) == null) { try { nodesman.spawn(ln, "null", "null"); } catch (FlumeSpecException e) { LOG.error("This should never happen", e); } } } // Update the Chokeinformation for the ChokeManager FlumeNode.getInstance().getChokeManager().updateChokeLimitMap( master.getChokeMap(physNode)); nodesman.decommissionAllBut(lns); } /** * Checks registered nodes to see if they need a new configuraiton. */ public void checkLogicalNodeConfigs() throws IOException { // TODO (jon) batch all these rpc requests into one multi-part rpc // request. for (LogicalNode nd : nodesman.getNodes()) { boolean needsCfg = master.heartbeat(nd); if (needsCfg) { final FlumeConfigData data = master.getConfig(nd); if (data == null) { LOG.debug("Logical Node '" + nd.getName() + "' not configured on master"); } final LogicalNode node = nd; // TODO This is quite gross, but prevents heartbeat from blocking new Thread("SpawningLogicalNode " + nd.getName()) { public void run() { node.checkConfig(data); } }.start(); } } } /** * All the core functionality of a heartbeat accessible without having to be * in the heartbeat thread. */ public void heartbeatChecks() throws IOException { // these will call ensure open on the master checkLogicalNodes(); checkLogicalNodeConfigs(); // check for end to end acks. ackcheck.checkAcks(); } /** * This thread periodically contacts the master with a heartbeat. */ class HeartbeatThread extends Thread { volatile boolean done = false; long backoff = BACKOFF_MILLIS; long backoffLimit = FlumeConfiguration.get().getNodeHeartbeatBackoffLimit(); long heartbeatPeriod = FlumeConfiguration.get().getConfigHeartbeatPeriod(); CountDownLatch stopped = new CountDownLatch(1); HeartbeatThread() { super("Heartbeat"); } public void run() { try { while (!done) { try { heartbeatChecks(); backoff = BACKOFF_MILLIS; // was successful, reset backoff Clock.sleep(heartbeatPeriod); } catch (Exception e) { backoff *= 2; // sleep twice as long backoff = backoff > backoffLimit ? backoffLimit : backoff; LOG.warn("Connection to master(s) failed, " + e.getMessage() + ". Backing off for " + backoff + " ms "); LOG.debug("Current master is " + master.toString(), e); try { master.close(); } catch (IOException e1) { LOG.error("Failed when attempting to close master", e1); } Clock.sleep(backoff); } } } catch (InterruptedException e) { LOG.error("Heartbeat interrupted, this is not expected!", e); } stopped.countDown(); } }; /** * Starts the heartbeat thread and then returns. */ public void start() { t.start(); } public void stop() { CountDownLatch stopped = t.stopped; t.done = true; try { stopped.await(); } catch (InterruptedException e) { LOG.error("Problem waiting for livenessManager to stop", e); } } public WALAckManager getAckChecker() { return ackcheck; } }