/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tajo.worker; import com.codahale.metrics.Gauge; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocalDirAllocator; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.shell.PathData; import org.apache.hadoop.service.CompositeService; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.yarn.util.RackResolver; import org.apache.tajo.QueryId; import org.apache.tajo.TajoConstants; import org.apache.tajo.TajoProtos; import org.apache.tajo.catalog.CatalogClient; import org.apache.tajo.catalog.CatalogService; import org.apache.tajo.conf.TajoConf; import org.apache.tajo.ipc.TajoMasterProtocol; import org.apache.tajo.master.querymaster.QueryMaster; import org.apache.tajo.master.querymaster.QueryMasterManagerService; import org.apache.tajo.master.rm.TajoWorkerResourceManager; import org.apache.tajo.pullserver.TajoPullServerService; import org.apache.tajo.rpc.RpcChannelFactory; import org.apache.tajo.rpc.RpcConnectionPool; import org.apache.tajo.rpc.protocolrecords.PrimitiveProtos; import org.apache.tajo.util.CommonTestingUtil; import org.apache.tajo.util.NetUtils; import org.apache.tajo.util.TajoIdUtils; import org.apache.tajo.util.metrics.TajoSystemMetrics; import org.apache.tajo.webapp.StaticHttpServer; import java.io.*; import java.lang.management.ManagementFactory; import java.lang.management.ThreadInfo; import java.lang.management.ThreadMXBean; import java.net.InetSocketAddress; import java.util.ArrayList; import java.util.List; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import static org.apache.tajo.conf.TajoConf.ConfVars; public class TajoWorker extends CompositeService { public static final PrimitiveProtos.BoolProto TRUE_PROTO = PrimitiveProtos.BoolProto.newBuilder().setValue(true).build(); public static final PrimitiveProtos.BoolProto FALSE_PROTO = PrimitiveProtos.BoolProto.newBuilder().setValue(false).build(); public static final String WORKER_MODE_YARN_TASKRUNNER = "tr"; public static final String WORKER_MODE_YARN_QUERYMASTER = "qm"; public static final String WORKER_MODE_STANDBY = "standby"; public static final String WORKER_MODE_QUERY_MASTER = "standby-qm"; public static final String WORKER_MODE_TASKRUNNER = "standby-tr"; private static final Log LOG = LogFactory.getLog(TajoWorker.class); private TajoConf systemConf; private StaticHttpServer webServer; private TajoWorkerClientService tajoWorkerClientService; private QueryMasterManagerService queryMasterManagerService; private TajoWorkerManagerService tajoWorkerManagerService; private InetSocketAddress tajoMasterAddress; private InetSocketAddress workerResourceTrackerAddr; private CatalogClient catalogClient; private WorkerContext workerContext; private TaskRunnerManager taskRunnerManager; private TajoPullServerService pullService; private boolean yarnContainerMode; private boolean queryMasterMode; private boolean taskRunnerMode; private WorkerHeartbeatService workerHeartbeatThread; private AtomicBoolean stopped = new AtomicBoolean(false); private AtomicInteger numClusterNodes = new AtomicInteger(); private TajoMasterProtocol.ClusterResourceSummary clusterResource; private int httpPort; private ThreadMXBean threadBean = ManagementFactory.getThreadMXBean(); private RpcConnectionPool connPool; private String[] cmdArgs; private DeletionService deletionService; private TajoSystemMetrics workerSystemMetrics; public TajoWorker() throws Exception { super(TajoWorker.class.getName()); } public void startWorker(TajoConf systemConf, String[] args) { this.systemConf = systemConf; this.cmdArgs = args; setWorkerMode(args); init(systemConf); start(); } private void setWorkerMode(String[] args) { if(args.length < 1) { queryMasterMode = systemConf.getBoolean("tajo.worker.mode.querymaster", true); taskRunnerMode = systemConf.getBoolean("tajo.worker.mode.taskrunner", true); } else { if(WORKER_MODE_STANDBY.equals(args[0])) { queryMasterMode = true; taskRunnerMode = true; } else if(WORKER_MODE_YARN_TASKRUNNER.equals(args[0])) { yarnContainerMode = true; queryMasterMode = true; } else if(WORKER_MODE_YARN_QUERYMASTER.equals(args[0])) { yarnContainerMode = true; taskRunnerMode = true; } else if(WORKER_MODE_QUERY_MASTER.equals(args[0])) { yarnContainerMode = false; queryMasterMode = true; } else { yarnContainerMode = false; taskRunnerMode = true; } } if(!queryMasterMode && !taskRunnerMode) { LOG.fatal("Worker daemon exit cause no worker mode(querymaster/taskrunner) property"); System.exit(0); } } @Override public void init(Configuration conf) { Runtime.getRuntime().addShutdownHook(new Thread(new ShutdownHook())); this.systemConf = (TajoConf)conf; RackResolver.init(systemConf); this.connPool = RpcConnectionPool.getPool(systemConf); this.workerContext = new WorkerContext(); String resourceManagerClassName = systemConf.getVar(ConfVars.RESOURCE_MANAGER_CLASS); boolean randomPort = true; if(resourceManagerClassName.indexOf(TajoWorkerResourceManager.class.getName()) >= 0) { randomPort = false; } int clientPort = systemConf.getSocketAddrVar(ConfVars.WORKER_CLIENT_RPC_ADDRESS).getPort(); int peerRpcPort = systemConf.getSocketAddrVar(ConfVars.WORKER_PEER_RPC_ADDRESS).getPort(); int qmManagerPort = systemConf.getSocketAddrVar(ConfVars.WORKER_QM_RPC_ADDRESS).getPort(); if(randomPort) { clientPort = 0; peerRpcPort = 0; qmManagerPort = 0; systemConf.setIntVar(ConfVars.PULLSERVER_PORT, 0); } // querymaster worker tajoWorkerClientService = new TajoWorkerClientService(workerContext, clientPort); addService(tajoWorkerClientService); queryMasterManagerService = new QueryMasterManagerService(workerContext, qmManagerPort); addService(queryMasterManagerService); // taskrunner worker taskRunnerManager = new TaskRunnerManager(workerContext); addService(taskRunnerManager); tajoWorkerManagerService = new TajoWorkerManagerService(workerContext, peerRpcPort); addService(tajoWorkerManagerService); if(!yarnContainerMode) { if(taskRunnerMode) { pullService = new TajoPullServerService(); addService(pullService); } if (!systemConf.get(CommonTestingUtil.TAJO_TEST, "FALSE").equalsIgnoreCase("TRUE")) { try { httpPort = systemConf.getSocketAddrVar(ConfVars.WORKER_INFO_ADDRESS).getPort(); if(queryMasterMode && !taskRunnerMode) { //If QueryMaster and TaskRunner run on single host, http port conflicts httpPort = systemConf.getSocketAddrVar(ConfVars.WORKER_QM_INFO_ADDRESS).getPort(); } webServer = StaticHttpServer.getInstance(this ,"worker", null, httpPort , true, null, systemConf, null); webServer.start(); httpPort = webServer.getPort(); LOG.info("Worker info server started:" + httpPort); deletionService = new DeletionService(getMountPath().size(), 0); if(systemConf.getBoolVar(ConfVars.WORKER_TEMPORAL_DIR_CLEANUP)){ getWorkerContext().cleanupTemporalDirectories(); } } catch (IOException e) { LOG.error(e.getMessage(), e); } } } LOG.info("Tajo Worker started: queryMaster=" + queryMasterMode + " taskRunner=" + taskRunnerMode + ", qmRpcPort=" + qmManagerPort + ",yarnContainer=" + yarnContainerMode + ", clientPort=" + clientPort + ", peerRpcPort=" + peerRpcPort + ":" + qmManagerPort + ",httpPort" + httpPort); super.init(conf); if(yarnContainerMode && queryMasterMode) { tajoMasterAddress = NetUtils.createSocketAddr(cmdArgs[2]); connectToCatalog(); QueryId queryId = TajoIdUtils.parseQueryId(cmdArgs[1]); queryMasterManagerService.getQueryMaster().reportQueryStatusToQueryMaster( queryId, TajoProtos.QueryState.QUERY_MASTER_LAUNCHED); } else if(yarnContainerMode && taskRunnerMode) { //TaskRunner mode taskRunnerManager.startTask(cmdArgs); } else { tajoMasterAddress = NetUtils.createSocketAddr(systemConf.getVar(ConfVars.TAJO_MASTER_UMBILICAL_RPC_ADDRESS)); workerResourceTrackerAddr = NetUtils.createSocketAddr(systemConf.getVar(ConfVars.RESOURCE_TRACKER_RPC_ADDRESS)); connectToCatalog(); } workerHeartbeatThread = new WorkerHeartbeatService(workerContext); workerHeartbeatThread.init(conf); addIfService(workerHeartbeatThread); } private void initWorkerMetrics() { workerSystemMetrics = new TajoSystemMetrics(systemConf, "worker", workerContext.getWorkerName()); workerSystemMetrics.start(); workerSystemMetrics.register("querymaster", "runningQueries", new Gauge<Integer>() { @Override public Integer getValue() { if(queryMasterManagerService != null) { return queryMasterManagerService.getQueryMaster().getQueryMasterTasks().size(); } else { return 0; } } }); workerSystemMetrics.register("task", "runningTasks", new Gauge<Integer>() { @Override public Integer getValue() { if(taskRunnerManager != null) { return taskRunnerManager.getNumTasks(); } else { return 0; } } }); } public WorkerContext getWorkerContext() { return workerContext; } @Override public void start() { super.start(); initWorkerMetrics(); } @Override public void stop() { if(stopped.getAndSet(true)) { return; } if(webServer != null) { try { webServer.stop(); } catch (Exception e) { LOG.error(e.getMessage(), e); } } if (catalogClient != null) { catalogClient.close(); } if(connPool != null) { connPool.shutdown(); RpcChannelFactory.shutdown(); } if(webServer != null && webServer.isAlive()) { try { webServer.stop(); } catch (Exception e) { } } if(workerSystemMetrics != null) { workerSystemMetrics.stop(); } if(deletionService != null) deletionService.stop(); super.stop(); LOG.info("TajoWorker main thread exiting"); } public class WorkerContext { public QueryMaster getQueryMaster() { if(queryMasterManagerService == null) { return null; } return queryMasterManagerService.getQueryMaster(); } public TajoWorkerManagerService getTajoWorkerManagerService() { return tajoWorkerManagerService; } public QueryMasterManagerService getQueryMasterManagerService() { return queryMasterManagerService; } public TajoWorkerClientService getTajoWorkerClientService() { return tajoWorkerClientService; } public TaskRunnerManager getTaskRunnerManager() { return taskRunnerManager; } public CatalogService getCatalog() { return catalogClient; } public TajoPullServerService getPullService() { return pullService; } public int getHttpPort() { return httpPort; } public String getWorkerName() { if(queryMasterMode) { return getQueryMasterManagerService().getHostAndPort(); } else { return getTajoWorkerManagerService().getHostAndPort(); } } public void stopWorker(boolean force) { stop(); if(force) { System.exit(0); } } protected void cleanup(String strPath) { if(deletionService == null) return; LocalDirAllocator lDirAllocator = new LocalDirAllocator(ConfVars.WORKER_TEMPORAL_DIR.varname); try { Iterable<Path> iter = lDirAllocator.getAllLocalPathsToRead(strPath, systemConf); FileSystem localFS = FileSystem.getLocal(systemConf); for (Path path : iter){ deletionService.delete(localFS.makeQualified(path)); } } catch (IOException e) { LOG.error(e.getMessage(), e); } } protected void cleanupTemporalDirectories() { if(deletionService == null) return; LocalDirAllocator lDirAllocator = new LocalDirAllocator(ConfVars.WORKER_TEMPORAL_DIR.varname); try { Iterable<Path> iter = lDirAllocator.getAllLocalPathsToRead(".", systemConf); FileSystem localFS = FileSystem.getLocal(systemConf); for (Path path : iter){ PathData[] items = PathData.expandAsGlob(localFS.makeQualified(new Path(path, "*")).toString(), systemConf); ArrayList<Path> paths = new ArrayList<Path>(); for (PathData pd : items){ paths.add(pd.path); } if(paths.size() == 0) continue; deletionService.delete(null, paths.toArray(new Path[paths.size()])); } } catch (IOException e) { LOG.error(e.getMessage(), e); } } public boolean isYarnContainerMode() { return yarnContainerMode; } public void setNumClusterNodes(int numClusterNodes) { TajoWorker.this.numClusterNodes.set(numClusterNodes); } public int getNumClusterNodes() { return TajoWorker.this.numClusterNodes.get(); } public void setClusterResource(TajoMasterProtocol.ClusterResourceSummary clusterResource) { synchronized(numClusterNodes) { TajoWorker.this.clusterResource = clusterResource; } } public TajoMasterProtocol.ClusterResourceSummary getClusterResource() { synchronized(numClusterNodes) { return TajoWorker.this.clusterResource; } } public InetSocketAddress getTajoMasterAddress() { return tajoMasterAddress; } public InetSocketAddress getResourceTrackerAddress() { return workerResourceTrackerAddr; } public int getPeerRpcPort() { return getTajoWorkerManagerService() == null ? 0 : getTajoWorkerManagerService().getBindAddr().getPort(); } public boolean isQueryMasterMode() { return queryMasterMode; } public boolean isTaskRunnerMode() { return taskRunnerMode; } public TajoSystemMetrics getWorkerSystemMetrics() { return workerSystemMetrics; } } public void stopWorkerForce() { stop(); } private void connectToCatalog() { try { catalogClient = new CatalogClient(systemConf); } catch (IOException e) { LOG.error(e.getMessage(), e); } } private class ShutdownHook implements Runnable { @Override public void run() { if(!stopped.get()) { LOG.info("============================================"); LOG.info("TajoWorker received SIGINT Signal"); LOG.info("============================================"); stop(); } } } String getThreadTaskName(long id, String name) { if (name == null) { return Long.toString(id); } return id + " (" + name + ")"; } public void dumpThread(Writer writer) { PrintWriter stream = new PrintWriter(writer); int STACK_DEPTH = 20; boolean contention = threadBean.isThreadContentionMonitoringEnabled(); long[] threadIds = threadBean.getAllThreadIds(); stream.println("Process Thread Dump: Tajo Worker"); stream.println(threadIds.length + " active threads"); for (long tid : threadIds) { ThreadInfo info = threadBean.getThreadInfo(tid, STACK_DEPTH); if (info == null) { stream.println(" Inactive"); continue; } stream.println("Thread " + getThreadTaskName(info.getThreadId(), info.getThreadName()) + ":"); Thread.State state = info.getThreadState(); stream.println(" State: " + state + ", Blocked count: " + info.getBlockedCount() + ", Waited count: " + info.getWaitedCount()); if (contention) { stream.println(" Blocked time: " + info.getBlockedTime() + ", Waited time: " + info.getWaitedTime()); } if (state == Thread.State.WAITING) { stream.println(" Waiting on " + info.getLockName()); } else if (state == Thread.State.BLOCKED) { stream.println(" Blocked on " + info.getLockName() + ", Blocked by " + getThreadTaskName(info.getLockOwnerId(), info.getLockOwnerName())); } stream.println(" Stack:"); for (StackTraceElement frame : info.getStackTrace()) { stream.println(" " + frame.toString()); } stream.println(""); } } public static List<File> getMountPath() throws IOException { BufferedReader mountOutput = null; try { Process mountProcess = Runtime.getRuntime ().exec("mount"); mountOutput = new BufferedReader(new InputStreamReader(mountProcess.getInputStream())); List<File> mountPaths = new ArrayList<File>(); while (true) { String line = mountOutput.readLine(); if (line == null) { break; } int indexStart = line.indexOf(" on /"); int indexEnd = line.indexOf(" ", indexStart + 4); mountPaths.add(new File(line.substring (indexStart + 4, indexEnd))); } return mountPaths; } catch (IOException e) { e.printStackTrace(); throw e; } finally { if(mountOutput != null) { mountOutput.close(); } } } public static void main(String[] args) throws Exception { StringUtils.startupShutdownMessage(TajoWorker.class, args, LOG); TajoConf tajoConf = new TajoConf(); tajoConf.addResource(new Path(TajoConstants.SYSTEM_CONF_FILENAME)); try { TajoWorker tajoWorker = new TajoWorker(); tajoWorker.startWorker(tajoConf, args); } catch (Throwable t) { LOG.fatal("Error starting TajoWorker", t); System.exit(-1); } } }