/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.twill.internal.appmaster; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.util.concurrent.AbstractIdleService; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.Service; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.twill.api.RunId; import org.apache.twill.internal.Constants; import org.apache.twill.internal.EnvKeys; import org.apache.twill.internal.RunIds; import org.apache.twill.internal.ServiceMain; import org.apache.twill.internal.kafka.EmbeddedKafkaServer; import org.apache.twill.internal.logging.Loggings; import org.apache.twill.internal.utils.Networks; import org.apache.twill.internal.yarn.VersionDetectYarnAMClientFactory; import org.apache.twill.internal.yarn.YarnAMClient; import org.apache.twill.zookeeper.OperationFuture; import org.apache.twill.zookeeper.ZKClient; import org.apache.twill.zookeeper.ZKClientService; import org.apache.twill.zookeeper.ZKOperations; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.net.InetAddress; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.List; import java.util.Properties; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; /** * Main class for launching {@link ApplicationMasterService}. * TODO: This is copied from Twill as a temporary workaround for Kafka issue in MapR. * Will be removed when TWILL-147 is fixed. */ public final class ApplicationMasterMain extends ServiceMain { private static final Logger LOG = LoggerFactory.getLogger(ApplicationMasterMain.class); private final String kafkaZKConnect; private ApplicationMasterMain(String kafkaZKConnect) { this.kafkaZKConnect = kafkaZKConnect; } /** * Starts the application master. */ public static void main(String[] args) throws Exception { String zkConnect = System.getenv(EnvKeys.TWILL_ZK_CONNECT); File twillSpec = new File(Constants.Files.TWILL_SPEC); RunId runId = RunIds.fromString(System.getenv(EnvKeys.TWILL_RUN_ID)); ZKClientService zkClientService = createZKClient(zkConnect, System.getenv(EnvKeys.TWILL_APP_NAME)); Configuration conf = new YarnConfiguration(new HdfsConfiguration(new Configuration())); setRMSchedulerAddress(conf); final YarnAMClient amClient = new VersionDetectYarnAMClientFactory(conf).create(); ApplicationMasterService service = new ApplicationMasterService(runId, zkClientService, twillSpec, amClient, createAppLocation(conf)); TrackerService trackerService = new TrackerService(service); List<Service> prerequisites = Lists.newArrayList( new YarnAMClientService(amClient, trackerService), zkClientService, new AppMasterTwillZKPathService(zkClientService, runId) ); // TODO: Temp fix for Kakfa issue in MapR. Will be removed when fixing TWILL-147 if (Boolean.parseBoolean(System.getProperty("twill.disable.kafka"))) { LOG.info("Log collection through kafka disabled"); } else { prerequisites.add(new ApplicationKafkaService(zkClientService, runId)); } new ApplicationMasterMain(String.format("%s/%s/kafka", zkConnect, runId.getId())) .doMain( service, prerequisites.toArray(new Service[prerequisites.size()]) ); } /** * Optionally sets the RM scheduler address based on the environment variable if it is not set in the cluster config. */ private static void setRMSchedulerAddress(Configuration conf) { String schedulerAddress = System.getenv(EnvKeys.YARN_RM_SCHEDULER_ADDRESS); if (schedulerAddress == null) { return; } // If the RM scheduler address is not in the config or it's from yarn-default.xml, // replace it with the one from the env, which is the same as the one client connected to. String[] sources = conf.getPropertySources(YarnConfiguration.RM_SCHEDULER_ADDRESS); if (sources == null || sources.length == 0 || "yarn-default.xml".equals(sources[sources.length - 1])) { conf.set(YarnConfiguration.RM_SCHEDULER_ADDRESS, schedulerAddress); } } @Override protected String getHostname() { try { return InetAddress.getLocalHost().getCanonicalHostName(); } catch (UnknownHostException e) { return "unknown"; } } @Override protected String getKafkaZKConnect() { return kafkaZKConnect; } @Override protected String getRunnableName() { return System.getenv(EnvKeys.TWILL_RUNNABLE_NAME); } /** * A service wrapper for starting/stopping {@link EmbeddedKafkaServer} and make sure the ZK path for * Kafka exists before starting the Kafka server. */ private static final class ApplicationKafkaService extends AbstractIdleService { private static final Logger LOG = LoggerFactory.getLogger(ApplicationKafkaService.class); private final ZKClient zkClient; private final String kafkaZKPath; private final EmbeddedKafkaServer kafkaServer; private ApplicationKafkaService(ZKClient zkClient, RunId runId) { this.zkClient = zkClient; this.kafkaZKPath = "/" + runId.getId() + "/kafka"; this.kafkaServer = new EmbeddedKafkaServer(generateKafkaConfig(zkClient.getConnectString() + kafkaZKPath)); } @Override protected void startUp() throws Exception { ZKOperations.ignoreError( zkClient.create(kafkaZKPath, null, CreateMode.PERSISTENT), KeeperException.NodeExistsException.class, kafkaZKPath).get(); kafkaServer.startAndWait(); } @Override protected void shutDown() throws Exception { // Flush all logs before shutting down Kafka server Loggings.forceFlush(); // Delay for 2 seconds to give clients chance to poll the last batch of log messages. try { TimeUnit.SECONDS.sleep(2); } catch (InterruptedException e) { // Ignore LOG.info("Kafka shutdown delay interrupted", e); } finally { kafkaServer.stopAndWait(); } } private Properties generateKafkaConfig(String kafkaZKConnect) { int port = Networks.getRandomPort(); Preconditions.checkState(port > 0, "Failed to get random port."); Properties prop = new Properties(); prop.setProperty("log.dir", new File("kafka-logs").getAbsolutePath()); prop.setProperty("port", Integer.toString(port)); prop.setProperty("broker.id", "1"); prop.setProperty("socket.send.buffer.bytes", "1048576"); prop.setProperty("socket.receive.buffer.bytes", "1048576"); prop.setProperty("socket.request.max.bytes", "104857600"); prop.setProperty("num.partitions", "1"); prop.setProperty("log.retention.hours", "24"); prop.setProperty("log.flush.interval.messages", "10000"); prop.setProperty("log.flush.interval.ms", "1000"); prop.setProperty("log.segment.bytes", "536870912"); prop.setProperty("zookeeper.connect", kafkaZKConnect); // Set the connection timeout to relatively short time (3 seconds). // It is only used by the org.I0Itec.zkclient.ZKClient inside KafkaServer // to block and wait for ZK connection goes into SyncConnected state. // However, due to race condition described in TWILL-139 in the ZK client library used by Kafka, // when ZK authentication is enabled, the ZK client may hang until connection timeout. // Setting it to lower value allow the AM to retry multiple times if race happens. prop.setProperty("zookeeper.connection.timeout.ms", "3000"); prop.setProperty("default.replication.factor", "1"); return prop; } } /** * A Service wrapper that starts {@link TrackerService} and {@link YarnAMClient}. It is needed because * the tracker host and url needs to be provided to {@link YarnAMClient} before it starts {@link YarnAMClient}. */ private static final class YarnAMClientService extends AbstractIdleService { private final YarnAMClient yarnAMClient; private final TrackerService trackerService; private YarnAMClientService(YarnAMClient yarnAMClient, TrackerService trackerService) { this.yarnAMClient = yarnAMClient; this.trackerService = trackerService; } @Override protected void startUp() throws Exception { trackerService.setHost(yarnAMClient.getHost()); trackerService.startAndWait(); yarnAMClient.setTracker(trackerService.getBindAddress(), trackerService.getUrl()); try { yarnAMClient.startAndWait(); } catch (Exception e) { trackerService.stopAndWait(); throw e; } } @Override protected void shutDown() throws Exception { try { yarnAMClient.stopAndWait(); } finally { trackerService.stopAndWait(); } } } private static final class AppMasterTwillZKPathService extends TwillZKPathService { private static final Logger LOG = LoggerFactory.getLogger(AppMasterTwillZKPathService.class); private final ZKClient zkClient; public AppMasterTwillZKPathService(ZKClient zkClient, RunId runId) { super(zkClient, runId); this.zkClient = zkClient; } @Override protected void shutDown() throws Exception { super.shutDown(); // Deletes ZK nodes created for the application execution. // We don't have to worry about a race condition if another instance of the same app starts at the same time // as when removal is performed. This is because we always create nodes with "createParent == true", // which takes care of the parent node recreation if it is removed from here. // Try to delete the /instances path. It may throws NotEmptyException if there are other instances of the // same app running, which we can safely ignore and return. if (!delete(Constants.INSTANCES_PATH_PREFIX)) { return; } // Try to delete children under /discovery. It may fail with NotEmptyException if there are other instances // of the same app running that has discovery services running. List<String> children = zkClient.getChildren(Constants.DISCOVERY_PATH_PREFIX) .get(TIMEOUT_SECONDS, TimeUnit.SECONDS).getChildren(); List<OperationFuture<?>> deleteFutures = new ArrayList<>(); for (String child : children) { String path = Constants.DISCOVERY_PATH_PREFIX + "/" + child; LOG.info("Removing ZK path: {}{}", zkClient.getConnectString(), path); deleteFutures.add(zkClient.delete(path)); } Futures.successfulAsList(deleteFutures).get(TIMEOUT_SECONDS, TimeUnit.SECONDS); for (OperationFuture<?> future : deleteFutures) { try { future.get(); } catch (ExecutionException e) { if (e.getCause() instanceof KeeperException.NotEmptyException) { return; } throw e; } } // Delete the /discovery. It may fail with NotEmptyException (due to race between apps), // which can safely ignore and return. if (!delete(Constants.DISCOVERY_PATH_PREFIX)) { return; } // Delete the ZK path for the app namespace. delete("/"); } /** * Deletes the given ZK path. * * @param path path to delete * @return true if the path was deleted, false if failed to delete due to {@link KeeperException.NotEmptyException}. * @throws Exception if failed to delete the path */ private boolean delete(String path) throws Exception { try { LOG.info("Removing ZK path: {}{}", zkClient.getConnectString(), path); zkClient.delete(path).get(TIMEOUT_SECONDS, TimeUnit.SECONDS); return true; } catch (ExecutionException e) { if (e.getCause() instanceof KeeperException.NotEmptyException) { return false; } throw e; } } } }