/**
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.flume.agent;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.Method;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cloudera.flume.VersionInfo;
import com.cloudera.flume.agent.diskfailover.DiskFailoverManager;
import com.cloudera.flume.agent.diskfailover.NaiveFileFailoverManager;
import com.cloudera.flume.agent.durability.NaiveFileWALManager;
import com.cloudera.flume.agent.durability.WALManager;
import com.cloudera.flume.conf.Context;
import com.cloudera.flume.conf.FlumeBuilder;
import com.cloudera.flume.conf.FlumeConfiguration;
import com.cloudera.flume.conf.FlumeSpecException;
import com.cloudera.flume.conf.LogicalNodeContext;
import com.cloudera.flume.handlers.debug.ChokeManager;
import com.cloudera.flume.handlers.endtoend.AckListener;
import com.cloudera.flume.handlers.endtoend.CollectorAckListener;
import com.cloudera.flume.handlers.text.FormatFactory;
import com.cloudera.flume.handlers.text.FormatFactory.OutputFormatBuilder;
import com.cloudera.flume.reporter.MasterReportPusher;
import com.cloudera.flume.reporter.ReportEvent;
import com.cloudera.flume.reporter.ReportManager;
import com.cloudera.flume.reporter.Reportable;
import com.cloudera.flume.util.FlumeVMInfo;
import com.cloudera.flume.util.SystemInfo;
import com.cloudera.util.CheckJavaVersion;
import com.cloudera.util.FileUtil;
import com.cloudera.util.NetUtils;
import com.cloudera.util.Pair;
import com.cloudera.util.StatusHttpServer;
import com.google.common.base.Preconditions;

/**
 * This is a configurable Flume node.
 *
 * It has four main parts:
 * 1) an HTTP server that shows the node's status,
 * 2) a LivenessManager, which encapsulates heartbeat communications with the
 * master; heartbeats indicate the node is alive and can trigger config
 * updates,
 * 3) a MasterRPC layer, which encapsulates the RPC comms with the master and
 * can be replaced later with a different master communication mechanism
 * (Avro or ZK), and
 * 4) a LogicalNodeManager. There can now be more than one logical node on a
 * single physical node; this class manages them.
 *
 * Threads in the status server do not prevent exit. Threads in the liveness
 * manager will prevent exit, as will any running threads in the node thread
 * manager.
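 *
 * <p>
 * A minimal startup sketch, for illustration only (it mirrors what
 * {@code main}/{@code setup} do, using only constructors defined below):
 *
 * <pre>
 * FlumeConfiguration conf = FlumeConfiguration.get();
 * // local host name, no status server, not a one-shot run
 * FlumeNode node = new FlumeNode(conf);
 * node.start();
 * // ... later, on shutdown:
 * node.stop();
 * </pre>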
 */
public class FlumeNode implements Reportable {
  static final Logger LOG = LoggerFactory.getLogger(FlumeNode.class);

  final static String PHYSICAL_NODE_REPORT_PREFIX = "pn-";
  static final String R_NUM_LOGICAL_NODES = "Logical nodes";

  // hook for jsp/web display
  private static FlumeNode instance;

  // run mode for this node.
  final boolean startHttp; // should this node start its own http status server
  private StatusHttpServer http = null;
  private FlumeVMInfo vmInfo;
  private SystemInfo sysInfo;

  private final LivenessManager liveMan;

  private MasterRPC rpcMan;

  private LogicalNodeManager nodesMan;

  private final MasterReportPusher reportPusher;

  /**
   * (String) logical node name -> WALManager
   */
  private Map<String, WALManager> walMans = new HashMap<String, WALManager>();

  /**
   * (String) logical node name -> DiskFailoverManager
   */
  private Map<String, DiskFailoverManager> failoverMans =
      new HashMap<String, DiskFailoverManager>();

  final private CollectorAckListener collectorAck;
  final String physicalNodeName;
  private final ChokeManager chokeMan;

  /**
   * A FlumeNode constructor with pluggable xxxManagers. This is used for
   * debugging and test cases. The http server is assumed not to be started,
   * and we are not doing a oneshot.
   */
  public FlumeNode(String name, MasterRPC rpc, LogicalNodeManager nodesMan,
      WALManager walMan, DiskFailoverManager dfMan,
      CollectorAckListener colAck, LivenessManager liveman) {
    this.physicalNodeName = name;
    rpcMan = rpc;
    instance = this;
    this.startHttp = false;
    this.nodesMan = nodesMan;
    this.walMans.put(getPhysicalNodeName(), walMan);
    this.failoverMans.put(getPhysicalNodeName(), dfMan);
    this.collectorAck = colAck;
    this.liveMan = liveman;
    // As this is only for testing purposes, just initialize the physical
    // node limit to Max Int.
    this.chokeMan = new ChokeManager();
    this.vmInfo = new FlumeVMInfo(PHYSICAL_NODE_REPORT_PREFIX
        + this.physicalNodeName + ".");
    this.reportPusher = new MasterReportPusher(FlumeConfiguration.get(),
        ReportManager.get(), rpcMan);
    this.sysInfo = new SystemInfo(PHYSICAL_NODE_REPORT_PREFIX
        + this.physicalNodeName + ".");
  }

  public FlumeNode(FlumeConfiguration conf, String nodeName, MasterRPC rpc,
      boolean startHttp, boolean oneshot) {
    this.physicalNodeName = nodeName;
    rpcMan = rpc;
    instance = this;
    this.startHttp = startHttp;
    this.nodesMan = new LogicalNodeManager(nodeName);

    File defaultDir = new File(conf.getAgentLogsDir(), getPhysicalNodeName());
    WALManager walMan = new NaiveFileWALManager(defaultDir);
    this.walMans.put(getPhysicalNodeName(), walMan);
    this.failoverMans.put(getPhysicalNodeName(), new NaiveFileFailoverManager(
        defaultDir));

    // no need for a liveness tracker if this is a one-shot execution.
    this.collectorAck = new CollectorAckListener(rpcMan);
    if (!oneshot) {
      this.liveMan = new LivenessManager(nodesMan, rpcMan,
          new FlumeNodeWALNotifier(this.walMans));
      this.reportPusher = new MasterReportPusher(conf, ReportManager.get(),
          rpcMan);
    } else {
      this.liveMan = null;
      this.reportPusher = null;
    }

    // initializing ChokeController
    this.chokeMan = new ChokeManager();

    this.vmInfo = new FlumeVMInfo(PHYSICAL_NODE_REPORT_PREFIX
        + this.getPhysicalNodeName() + ".");

    this.sysInfo = new SystemInfo(PHYSICAL_NODE_REPORT_PREFIX
        + this.getPhysicalNodeName() + ".");
  }

  public FlumeNode(MasterRPC rpc, boolean startHttp, boolean oneshot) {
    this(FlumeConfiguration.get(), NetUtils.localhost(), rpc, startHttp,
        oneshot);
  }

  public FlumeNode(String nodename, FlumeConfiguration conf,
      boolean startHttp, boolean oneshot) {
    // Use a failover-enabled master RPC, which randomizes the failover order
    this(conf, nodename, new MultiMasterRPC(conf, true), startHttp, oneshot);
  }

  public FlumeNode(FlumeConfiguration conf) {
    this(NetUtils.localhost(), conf, false /* http server */,
        false /* oneshot */);
  }

  /**
   * This hook makes it easy for web apps and jsps to get the current
   * FlumeNode instance. This is used to test the FlumeNode related jsps.
   */
  public static FlumeNode getInstance() {
    if (instance == null) {
      instance = new FlumeNode(FlumeConfiguration.get());
    }
    return instance;
  }

  public static String getWebPath(FlumeConfiguration conf) {
    String webPath = conf.getWebAppsPath();
    File f = new File(webPath);
    // absolute paths win; if the path is not absolute, prefix it with the
    // flume home directory.
    if (!f.isAbsolute()) {
      String basepath = FlumeConfiguration.getFlumeHome();
      if (basepath == null) {
        LOG.warn("FLUME_HOME not set, potential for odd behavior!");
      }
      File base = new File(basepath, webPath);
      webPath = base.getAbsolutePath();
    }
    return webPath;
  }

  /**
   * This also implements the Apache Commons Daemon interface's start
   */
  synchronized public void start() {
    FlumeConfiguration conf = FlumeConfiguration.get();

    ReportManager.get().add(vmInfo);
    ReportManager.get().add(sysInfo);
    ReportManager.get().add(this);

    if (startHttp) {
      try {
        String webPath = getWebPath(conf);
        boolean findport = FlumeConfiguration.get().getNodeAutofindHttpPort();
        this.http = new StatusHttpServer("flumeagent", webPath, "0.0.0.0",
            conf.getNodeStatusPort(), findport);
        http.start();
      } catch (IOException e) {
        LOG.error("Flume node failed: " + e.getMessage(), e);
      } catch (Throwable t) {
        LOG.error("Unexpected exception/error thrown! " + t.getMessage(), t);
      }
    }

    if (reportPusher != null) {
      reportPusher.start();
    }

    if (liveMan != null) {
      liveMan.start();
    }

    if (chokeMan != null) {
      // the JVM exits if only daemon threads remain.
      chokeMan.setDaemon(true);
      chokeMan.start();
    }
  }

  /**
   * This also implements the Apache Commons Daemon interface's stop
   */
  synchronized public void stop() {
    if (this.http != null) {
      try {
        http.stop();
      } catch (Exception e) {
        LOG.error("Exception stopping FlumeNode", e);
      }
    }

    if (reportPusher != null) {
      reportPusher.stop();
    }

    if (liveMan != null) {
      liveMan.stop();
    }

    if (chokeMan != null) {
      chokeMan.halt();
    }
  }

  /**
   * Load output format plugins specified by
   * {@link FlumeConfiguration#OUTPUT_FORMAT_PLUGIN_CLASSES}. Invalid plugins
   * are discarded from the list, with errors logged.
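   *
   * <p>
   * For illustration (hypothetical class name): a plugin is a public subclass
   * of {@link OutputFormatBuilder} with a no-arg constructor, listed
   * comma-separated under the configuration key named by
   * {@link FlumeConfiguration#OUTPUT_FORMAT_PLUGIN_CLASSES}. The checks below
   * enforce exactly these requirements.
   *
   * <pre>
   * public class CsvOutputFormatBuilder extends OutputFormatBuilder {
   *   // implement the OutputFormatBuilder contract; getName() supplies the
   *   // format name that is registered with FormatFactory.
   * }
   * </pre>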
   */
  public static void loadOutputFormatPlugins() {
    String outputFormatPluginClasses = FlumeConfiguration.get().get(
        FlumeConfiguration.OUTPUT_FORMAT_PLUGIN_CLASSES, "");

    String[] classes = outputFormatPluginClasses.split(",\\s*");

    for (String className : classes) {
      try {
        Class<?> cls = Class.forName(className);

        if (OutputFormatBuilder.class.isAssignableFrom(cls)) {
          OutputFormatBuilder builder = (OutputFormatBuilder) cls
              .newInstance();
          FormatFactory.get().registerFormat(builder.getName(), builder);

          LOG.info("Registered output format plugin " + className);
        } else {
          LOG.warn("Ignoring output format plugin class " + className
              + " - does not subclass OutputFormatBuilder");
        }
      } catch (ClassNotFoundException e) {
        LOG.warn("Unable to load output format plugin class " + className
            + " - class not found");
      } catch (FlumeSpecException e) {
        LOG.warn("Unable to load output format plugin class " + className
            + " - Flume spec exception follows.", e);
      } catch (InstantiationException e) {
        LOG.warn("Unable to load output format plugin class " + className
            + " - unable to instantiate class.", e);
      } catch (IllegalAccessException e) {
        LOG.warn("Unable to load output format plugin class " + className
            + " - access violation.", e);
      }
    }
  }

  /**
   * This also implements the Apache Commons Daemon interface's init
   */
  public void init(String[] args) {
    try {
      setup(args);
    } catch (IOException ioe) {
      LOG.error("Failed to init Flume Node", ioe);
    }
  }

  /**
   * This also implements the Apache Commons Daemon interface's destroy
   */
  public void destroy() {
    stop(); // I think this is ok.
  }

  /**
   * This method is currently called by the JSP to display node information.
   */
  public String report() {
    return getReport().toHtml();
  }

  public WALAckManager getAckChecker() {
    if (liveMan == null)
      return null;
    return liveMan.getAckChecker();
  }

  public AckListener getCollectorAckListener() {
    return collectorAck;
  }

  public static void logVersion(Logger log) {
    log.info("Flume " + VersionInfo.getVersion());
    log.info(" rev " + VersionInfo.getRevision());
    log.info("Compiled on " + VersionInfo.getDate());
  }

  public static void logEnvironment(Logger log) {
    Properties props = System.getProperties();
    for (Entry<Object, Object> p : props.entrySet()) {
      log.info("System property " + p.getKey() + "=" + p.getValue());
    }
  }

  /**
   * This function checks the agent logs dir to make sure that the process can
   * create the directory if necessary, that the path, if it does exist, is a
   * directory, and that the process can in fact create files inside of the
   * directory. If any of these checks fails, it throws an exception.
   *
   * Finally, it checks to see if the path is in /tmp and warns the user that
   * this may not be the best idea.
   */
  public static void nodeConfigChecksOk() throws IOException {
    // TODO (jon) if we add more checks in here, make the different managers
    // responsible for throwing an Exception on construction instead.

    FlumeConfiguration conf = FlumeConfiguration.get();

    String s = conf.getAgentLogsDir();
    File f = new File(s);
    if (!FileUtil.makeDirs(f)) {
      throw new IOException("Path to Log dir cannot be created: '" + s
          + "'. Check permissions?");
    }

    if (!f.isDirectory()) {
      throw new IOException("Log dir '" + s
          + "' already exists as a file. Check log dir path.");
    }

    File f2 = null;
    try {
      f2 = File.createTempFile("initcheck", ".test", f);
    } catch (IOException e) {
      throw new IOException("Failure to write in log directory: '" + s
Check permissions?"); } if (!f2.delete()) { throw new IOException("Unable to delete " + f2 + " from log directory " + "(but writing succeeded) - something is strange here"); } File tmp = new File("/tmp"); File cur = f; while (cur != null) { if (cur.equals(tmp)) { LOG.warn("Log directory is writing inside of /tmp. This data may not survive reboot!"); break; } cur = cur.getParentFile(); } } /** * Returns a Flume Node with settings from specified command line parameters. * (See usage for instructions) * * @param argv * @return * @throws IOException */ public static FlumeNode setup(String[] argv) throws IOException { logVersion(LOG); logEnvironment(LOG); // Make sure the Java version is not older than 1.6 if (!CheckJavaVersion.isVersionOk()) { LOG.error("Exiting because of an old Java version or Java version in bad format"); System.exit(-1); } LOG.info("Starting flume agent on: " + NetUtils.localhost()); LOG.info(" Working directory is: " + new File(".").getAbsolutePath()); FlumeConfiguration.hardExitLoadConfig(); // will exit if conf file is bad. CommandLine cmd = null; Options options = new Options(); options.addOption("c", true, "Load initial config from cmdline arg"); options.addOption("n", true, "Set node name"); options.addOption("s", false, "Do not start local flume status server on node"); options.addOption("1", false, "Make flume node one shot (if closes or errors, exits)"); options.addOption("m", false, "Have flume hard exit if in likey gc thrash situation"); options.addOption("h", false, "Print help information"); options.addOption("v", false, "Print version information"); try { CommandLineParser parser = new PosixParser(); cmd = parser.parse(options, argv); } catch (ParseException e) { HelpFormatter fmt = new HelpFormatter(); fmt.printHelp("FlumeNode", options, true); return null; } // dump version info only if (cmd != null && cmd.hasOption("v")) { return null; } // dump help info. if (cmd != null && cmd.hasOption("h")) { HelpFormatter fmt = new HelpFormatter(); fmt.printHelp("FlumeNode", options, true); return null; } // Check FlumeConfiguration file for settings that may cause node to fail. nodeConfigChecksOk(); String nodename = NetUtils.localhost(); // default to local host name. if (cmd != null && cmd.hasOption("n")) { // select a different name, allow for multiple processes configured // differently on same node. nodename = cmd.getOptionValue("n"); } boolean startHttp = false; if (cmd != null && !cmd.hasOption("s")) { // no -s option, start the local status server startHttp = true; } boolean oneshot = false; if (cmd != null && cmd.hasOption("1")) { oneshot = true; } loadOutputFormatPlugins(); // Instantiate the flume node. FlumeConfiguration conf = FlumeConfiguration.get(); FlumeNode flume = new FlumeNode(nodename, conf, startHttp, oneshot); flume.start(); // load an initial configuration from command line if (cmd != null && cmd.hasOption("c")) { String spec = cmd.getOptionValue("c"); LOG.info("Loading spec from command line: '" + spec + "'"); try { Context ctx = new LogicalNodeContext(nodename, nodename); Map<String, Pair<String, String>> cfgs = FlumeBuilder.parseConf(ctx, spec); Pair<String, String> node = cfgs.get(nodename); flume.nodesMan.spawn(nodename, node.getLeft(), node.getRight()); } catch (Exception e) { LOG.warn("Caught exception loading node:" + e.getMessage()); LOG.debug("Exception: ", e); if (oneshot) { System.exit(0); // exit cleanly } } } else { // default to null configurations. 
      try {
        flume.nodesMan.spawn(nodename, "null", "null");
      } catch (FlumeSpecException e) {
        LOG.error("This should never happen", e);
      } catch (IOException e) {
        LOG.error("Caught exception loading node", e);
      }
    }

    if (cmd != null && cmd.hasOption("m")) {
      // setup memory use monitor
      LOG.info("Setup hard exit on memory exhaustion");
      MemoryMonitor.setupHardExitMemMonitor(FlumeConfiguration.get()
          .getAgentMemoryThreshold());
    }

    try {
      tryKerberosLogin();
    } catch (IOException ioe) {
      LOG.error("Kerberos login failed.", ioe);
    }

    // hang out, waiting for other agent threads to exit.
    return flume;
  }

  /**
   * This attempts a Kerberos login via a keytab if security is enabled in
   * Hadoop.
   *
   * This should be able to support multiple Hadoop clusters as long as the
   * particular principal is allowed on multiple clusters.
   *
   * To preserve compatibility with non-security-enhanced HDFS, we use
   * reflection on various UserGroupInformation and SecurityUtil related
   * method calls.
   */
  @SuppressWarnings("unchecked")
  static void tryKerberosLogin() throws IOException {

    /*
     * UserGroupInformation is in hadoop 0.18;
     * UserGroupInformation.isSecurityEnabled() is not in the pre-security
     * API.
     *
     * boolean useSec = UserGroupInformation.isSecurityEnabled();
     */
    boolean useSec = false;

    try {
      Class<UserGroupInformation> c = UserGroupInformation.class;
      // static call, so null obj
      useSec = (Boolean) c.getMethod("isSecurityEnabled").invoke(null);
    } catch (Exception e) {
      LOG.warn("Flume is using Hadoop core "
          + org.apache.hadoop.util.VersionInfo.getVersion()
          + " which does not support Security / Authentication: "
          + e.getMessage());
      return;
    }

    LOG.info("Hadoop Security enabled: " + useSec);
    if (!useSec) {
      return;
    }

    // At this point we know we are using a hadoop library that is kerberos
    // enabled.

    // attempt to load kerberos information for authenticated hdfs comms.
    String principal = FlumeConfiguration.get().getKerberosPrincipal();
    String keytab = FlumeConfiguration.get().getKerberosKeytab();
    LOG.info("Kerberos login as " + principal + " from " + keytab);

    try {
      /*
       * SecurityUtil is not present pre hadoop 20.2;
       * SecurityUtil.login is not in the pre-security Hadoop API.
       *
       * // Keytab login does not need to auto refresh
       *
       * SecurityUtil.login(FlumeConfiguration.get(),
       * FlumeConfiguration.SECURITY_KERBEROS_KEYTAB,
       * FlumeConfiguration.SECURITY_KERBEROS_PRINCIPAL);
       */
      Class<?> c = Class.forName("org.apache.hadoop.security.SecurityUtil");
      // get method login(Configuration, String, String);
      Method m = c.getMethod("login", Configuration.class, String.class,
          String.class);
      m.invoke(null, FlumeConfiguration.get(),
          FlumeConfiguration.SECURITY_KERBEROS_KEYTAB,
          FlumeConfiguration.SECURITY_KERBEROS_PRINCIPAL);
    } catch (Exception e) {
      LOG.error("Flume failed when attempting to authenticate with keytab "
          + FlumeConfiguration.get().getKerberosKeytab() + " and principal '"
          + FlumeConfiguration.get().getKerberosPrincipal() + "'", e);
      // e.getMessage() from hadoop is not informative
      return;
    }

    try {
      /*
       * getLoginUser, getAuthenticationMethod, and isLoginKeytabBased are not
       * in Hadoop 20.2, only in the kerberos-enhanced version.
       *
       * getUserName is in all versions 0.18.3+
       *
       * UserGroupInformation ugi = UserGroupInformation.getLoginUser();
       * LOG.info("Auth method: " + ugi.getAuthenticationMethod());
       * LOG.info(" User name: " + ugi.getUserName());
       * LOG.info(" Using keytab: " +
       * UserGroupInformation.isLoginKeytabBased());
       */
      Class<UserGroupInformation> c2 = UserGroupInformation.class;
      // static call, so null obj
      UserGroupInformation ugi = (UserGroupInformation) c2.getMethod(
          "getLoginUser").invoke(null);
      String authMethod = c2.getMethod("getAuthenticationMethod").invoke(ugi)
          .toString();
      boolean keytabBased = (Boolean) c2.getMethod("isLoginKeytabBased")
          .invoke(ugi);

      LOG.info("Auth method: " + authMethod);
      LOG.info(" User name: " + ugi.getUserName());
      LOG.info(" Using keytab: " + keytabBased);
    } catch (Exception e) {
      LOG.error("Flume was unable to dump kerberos login user"
          + " and authentication method", e);
      return;
    }
  }

  public static void main(String[] argv) {
    try {
      setup(argv);
    } catch (Exception e) {
      LOG.error("Aborting: Unexpected problem with environment. "
          + e.getMessage(), e);
      System.exit(-1);
    }
  }

  public WALManager getWalManager() {
    // default to the physical node's wal manager
    synchronized (walMans) {
      return walMans.get(getPhysicalNodeName());
    }
  }

  public WALManager getWalManager(String walnode) {
    synchronized (walMans) {
      if (walnode == null) {
        return getWalManager();
      }
      return walMans.get(walnode);
    }
  }

  public WALManager addWalManager(String walnode) {
    Preconditions.checkArgument(walnode != null);
    FlumeConfiguration conf = FlumeConfiguration.get();
    WALManager wm = new NaiveFileWALManager(new File(new File(
        conf.getAgentLogsDir()), walnode));
    synchronized (walMans) {
      walMans.put(walnode, wm);
      return wm;
    }
  }

  /**
   * Atomically gets an existing WAL manager for the given node or creates a
   * new one.
   */
  public WALManager getAddWALManager(String walnode) {
    // synchronize on walMans (not failoverMans) so the get-or-create is
    // atomic with respect to the map it actually touches.
    synchronized (walMans) {
      WALManager walman = getWalManager(walnode);

      if (walman == null) {
        walman = addWalManager(walnode);
      }
      return walman;
    }
  }

  public DiskFailoverManager getDFOManager() {
    synchronized (failoverMans) {
      return failoverMans.get(getPhysicalNodeName());
    }
  }

  public DiskFailoverManager getDFOManager(String dfonode) {
    synchronized (failoverMans) {
      if (dfonode == null) {
        return getDFOManager();
      }
      return failoverMans.get(dfonode);
    }
  }

  public DiskFailoverManager addDFOManager(String dfonode) {
    Preconditions.checkArgument(dfonode != null);
    FlumeConfiguration conf = FlumeConfiguration.get();
    DiskFailoverManager wm = new NaiveFileFailoverManager(new File(new File(
        conf.getAgentLogsDir()), dfonode));
    synchronized (failoverMans) {
      failoverMans.put(dfonode, wm);
      return wm;
    }
  }

  /**
   * Atomically gets an existing DFO manager for the given node or creates a
   * new one.
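   *
   * <p>
   * Typical call site (hypothetical node name, for illustration): a logical
   * node asks {@code getAddDFOManager("node1")} for its failover manager and
   * receives the same instance on every subsequent call.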
   */
  public DiskFailoverManager getAddDFOManager(String dfonode) {
    synchronized (failoverMans) {
      DiskFailoverManager dfoman = getDFOManager(dfonode);

      if (dfoman == null) {
        dfoman = addDFOManager(dfonode);
      }
      return dfoman;
    }
  }

  public LogicalNodeManager getLogicalNodeManager() {
    return nodesMan;
  }

  // TODO (jon) rename when liveness manager renamed
  public LivenessManager getLivenessManager() {
    return liveMan;
  }

  public ChokeManager getChokeManager() {
    return chokeMan;
  }

  @Override
  public String getName() {
    return PHYSICAL_NODE_REPORT_PREFIX + this.getPhysicalNodeName();
  }

  @Override
  public ReportEvent getReport() {
    ReportEvent node = new ReportEvent(getName());
    node.setLongMetric(R_NUM_LOGICAL_NODES, this.getLogicalNodeManager()
        .getNodes().size());
    node.hierarchicalMerge(nodesMan.getName(), nodesMan.getReport());

    if (getAckChecker() != null) {
      node.hierarchicalMerge(getAckChecker().getName(), getAckChecker()
          .getReport());
    }

    // TODO (jon) LivenessMan
    // TODO (jon) rpcMan

    return node;
  }

  public String getPhysicalNodeName() {
    return physicalNodeName;
  }
}