/** * Licensed to Cloudera, Inc. under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. Cloudera, Inc. licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.cloudera.flume.watchdog; import java.io.IOException; import java.io.PrintStream; import java.nio.ByteBuffer; import java.nio.channels.ReadableByteChannel; import java.nio.channels.SelectableChannel; import java.nio.channels.SelectionKey; import java.nio.channels.Selector; import java.util.ArrayList; import java.util.Calendar; import java.util.Date; import java.util.Iterator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.cloudera.flume.conf.FlumeConfiguration; import com.cloudera.util.InputStreamPipe; /** * This is a is a watchdog program that starts its arguments as a sub process * and restarts it if the program exists with any error code other than 0. * stderr and stdout are redirected from the child program to output to the * watchdog. Sub-processes are not restricted to java programs. * * Example usage (Starts the watchdog on the LocalFlumeAgent). * * java c.c.f.watchdog.Watchdog java c.c.f.agent.FlumeLocalAgent * * Caveats: This program is not helpful for programs that hang -- it assumes * that program fail-exit. It is also assumed that the watchdog, being a small * and simple program, is robust and will not fail. * * Future versions may determine if it is hung by using the SIGAR library and * some heuristics to see if the watched process is dead (possibility: no cpu * usage, no io, no syscalls). These will be more of an art than a science. * * */ public class Watchdog { static final Logger LOG = LoggerFactory.getLogger(Watchdog.class); String[] args; // intilized by laucnh Selector selector; Runtime rt; Process proc; InputStreamPipe outPipe, errPipe, watchdogPipe; PrintStream procIn; SelectableChannel stdout, stderr, stdin; boolean interactive; public Watchdog(String[] args, boolean interactive) { this.args = args.clone(); this.interactive = interactive; } public Watchdog(String[] args) { this(args, false); } /** * This starts the different pumper threads, and opens all the piping * connections. * * @throws IOException */ void startup() throws IOException { selector = Selector.open(); rt = Runtime.getRuntime(); proc = rt.exec(args); outPipe = new InputStreamPipe(proc.getInputStream()); errPipe = new InputStreamPipe(proc.getErrorStream()); if (interactive) { watchdogPipe = new InputStreamPipe(System.in); procIn = new PrintStream(proc.getOutputStream()); } stdout = outPipe.getChannel(); stderr = errPipe.getChannel(); if (interactive) { stdin = watchdogPipe.getChannel(); } stdout.register(selector, SelectionKey.OP_READ); stderr.register(selector, SelectionKey.OP_READ); if (interactive) { stdin.register(selector, SelectionKey.OP_READ); } outPipe.start(); errPipe.start(); if (interactive) { watchdogPipe.start(); } // if watchdog gets shutdown, make sure the subprocess is closed as well rt.addShutdownHook(new Thread() { public void run() { synchronized (Watchdog.this) { LOG.info("Watchdog shutdown hook"); if (proc != null) { proc.destroy(); } } } }); } // This prevents eventual exhaustion by watch dog due to file handle // exhaustion and thread frame exhaustion. // # of open handles can be checked by doing an "lsof | grep <pid> | wc" void shutdown() throws IOException { if (proc == null) return; if (interactive) { watchdogPipe.shutdown(); watchdogPipe = null; } outPipe.shutdown(); // +1 thread outPipe = null; errPipe.shutdown(); // +1 thread errPipe = null; if (interactive) { procIn.close(); procIn = null; } proc.getOutputStream().close(); // +1 file handle proc.getInputStream().close(); // +1 file handle proc.getErrorStream().close(); // +1 file handle proc.destroy(); proc = null; if (interactive) { stdin.close(); stdin = null; } stdout.close(); // +1 file handle stdout = null; stderr.close(); // +0 but never used in test case stderr = null; selector.close(); // +5 handles! selector = null; } /** * This version uses NIO to do unix style select input output redirection of * the child process. If the i/o streams are closed this likely means our * process is exiting, so exit and wait for process death. * * This all happens in a single thread but each pipe has its own thread for * moving data between the inputstream and ouptutstream * */ public int launchAgent() throws IOException, InterruptedException { startup(); ByteBuffer buffer = ByteBuffer.allocate(32); while (true) { selector.select(2000); Iterator<SelectionKey> it = selector.selectedKeys().iterator(); if (!it.hasNext()) { // nothing to do, loop again. continue; } SelectionKey key = it.next(); it.remove(); buffer.clear(); ReadableByteChannel channel = (ReadableByteChannel) key.channel(); int count = channel.read(buffer); if (count < 0) { // EOF channel.close(); break; } buffer.flip(); while (buffer.hasRemaining()) { if (key.channel() == stdout) { System.out.print((char) buffer.get()); } else if (key.channel() == stderr) { System.err.print((char) buffer.get()); } else if (key.channel() == stdin) { procIn.print((char) buffer.get()); procIn.flush(); } } } int retval = proc.waitFor(); shutdown(); LOG.info("Subprocess exited with value " + retval); return retval; } public void run(int maxTriesPerMin) { ArrayList<Date> times = new ArrayList<Date>(); while (true) { Date now = new Date(); if (times.size() > maxTriesPerMin) { // drop entries that are too old to count against us ArrayList<Date> newTimes = new ArrayList<Date>(); for (Date t : times) { Calendar c = Calendar.getInstance(); c.setTime(t); c.add(Calendar.MINUTE, 1); Date t_plus1 = c.getTime(); if (t_plus1.getTime() - now.getTime() > 0) { newTimes.add(t); } } // if it still to many recent tries, get the oldest and wait until it // should expire. times = newTimes; if (newTimes.size() > maxTriesPerMin) { try { Calendar c = Calendar.getInstance(); c.setTime(times.get(0)); c.add(Calendar.MINUTE, 1); Date old_plus1 = c.getTime(); long delta = old_plus1.getTime() - now.getTime(); LOG.warn("too many attempts failed per minute -- waiting for " + (delta / 1000) + "s"); Thread.sleep(delta); } catch (InterruptedException e) { e.printStackTrace(); } } } Date d = now; LOG.info("Restarting process @ " + d); times.add(d); try { int ret = launchAgent(); if (ret == 0) { LOG.info("Subprocess exited cleanly, closing watchdog"); break; } } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } } } @Deprecated // TODO (jon) move to com.cloudera.util.Watchdog. Except for one config var, // there are no dependencies on Flume. public static void main(String[] argv) { if (argv.length == 0) { System.out.println("need to specify watched command as arguments"); System.exit(-1); } String[] args = argv; FlumeConfiguration conf = FlumeConfiguration.hardExitLoadConfig(); int maxTriesPerMin = conf.getMaxRestartsPerMin(); Watchdog watchdog = new Watchdog(args); watchdog.run(maxTriesPerMin); } }