/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.llap.registry.impl; import java.io.IOException; import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.MalformedURLException; import java.net.URISyntaxException; import java.net.URL; import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import javax.security.auth.login.AppConfigurationEntry; import com.google.common.collect.Sets; import org.apache.curator.framework.CuratorFramework; import org.apache.curator.framework.CuratorFrameworkFactory; import org.apache.curator.framework.api.ACLProvider; import org.apache.curator.framework.imps.CuratorFrameworkState; import org.apache.curator.framework.recipes.cache.ChildData; import org.apache.curator.framework.recipes.cache.PathChildrenCache; import org.apache.curator.framework.recipes.cache.PathChildrenCacheEvent; import org.apache.curator.framework.recipes.cache.PathChildrenCacheListener; import 
org.apache.curator.framework.recipes.nodes.PersistentEphemeralNode; import org.apache.curator.framework.recipes.nodes.PersistentEphemeralNode.Mode; import org.apache.curator.retry.ExponentialBackoffRetry; import org.apache.curator.utils.CloseableUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.llap.LlapUtil; import org.apache.hadoop.hive.llap.io.api.LlapProxy; import org.apache.hadoop.hive.llap.registry.ServiceInstance; import org.apache.hadoop.hive.llap.registry.ServiceInstanceSet; import org.apache.hadoop.hive.llap.registry.ServiceInstanceStateChangeListener; import org.apache.hadoop.hive.llap.registry.ServiceRegistry; import org.apache.hadoop.registry.client.binding.RegistryTypeUtils; import org.apache.hadoop.registry.client.binding.RegistryUtils; import org.apache.hadoop.registry.client.binding.RegistryUtils.ServiceRecordMarshal; import org.apache.hadoop.registry.client.types.AddressTypes; import org.apache.hadoop.registry.client.types.Endpoint; import org.apache.hadoop.registry.client.types.ProtocolTypes; import org.apache.hadoop.registry.client.types.ServiceRecord; import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.authentication.util.KerberosUtil; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.zookeeper.ZooDefs; import org.apache.zookeeper.KeeperException.InvalidACLException; import org.apache.zookeeper.client.ZooKeeperSaslClient; import org.apache.zookeeper.data.ACL; import org.apache.zookeeper.data.Id; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import 
com.google.common.util.concurrent.ThreadFactoryBuilder;

/**
 * ZooKeeper-backed implementation of the LLAP {@link ServiceRegistry}.
 *
 * <p>Each LLAP daemon registers itself by creating an ephemeral-sequential "worker-" znode (plus a
 * companion "slot-" znode via the project-local {@code SlotZnode} helper) under a per-user,
 * per-cluster path. Clients discover daemons by watching that directory with a Curator
 * {@link PathChildrenCache} and are notified of membership changes through registered
 * {@link ServiceInstanceStateChangeListener}s. When security is enabled, the user-scoped subtree
 * is protected with SASL ACLs that are validated (and repaired) at registration time.
 */
public class LlapZookeeperRegistryImpl implements ServiceRegistry {

  private static final Logger LOG = LoggerFactory.getLogger(LlapZookeeperRegistryImpl.class);

  /**
   * IPC endpoint names.
   */
  private static final String IPC_SERVICES = "services";
  private static final String IPC_MNG = "llapmng";
  private static final String IPC_SHUFFLE = "shuffle";
  private static final String IPC_LLAP = "llap";
  private static final String IPC_OUTPUTFORMAT = "llapoutputformat";

  // ZK root namespaces used when ConfVars.LLAP_ZK_REGISTRY_NAMESPACE is not set explicitly.
  private final static String SASL_NAMESPACE = "llap-sasl";
  private final static String UNSECURE_NAMESPACE = "llap-unsecure";
  private final static String USER_SCOPE_PATH_PREFIX = "user-";
  private static final String DISABLE_MESSAGE =
      "Set " + ConfVars.LLAP_VALIDATE_ACLS.varname + " to false to disable ACL validation";
  // Prefixes for the two kinds of child znodes created under workersPath.
  private static final String WORKER_PREFIX = "worker-";
  private static final String SLOT_PREFIX = "slot-";

  private final Configuration conf;
  private final CuratorFramework zooKeeperClient;

  // userPathPrefix is the path specific to the user for which ACLs should be restrictive.
  // workersPath is the directory path where all the worker znodes are located.
  private final String userPathPrefix, workersPath;
  private String userNameFromPrincipal; // Only set when setting up the secure config for ZK.

  private PersistentEphemeralNode znode;
  private SlotZnode slotZnode;
  private String znodePath; // unique identity for this instance

  private final ServiceRecordMarshal encoder; // to marshal/unmarshal znode data

  // to be used by clients of ServiceRegistry
  private DynamicServiceInstanceSet instances;
  private PathChildrenCache instancesCache;

  // Per-process unique id; stored in each ServiceRecord under UNIQUE_IDENTIFIER so a restarted
  // daemon (new process, new UUID) is distinguishable from a reconnected one.
  private static final UUID uniq = UUID.randomUUID();
  private static final String UNIQUE_IDENTIFIER = "llap.unique.id";

  private Set<ServiceInstanceStateChangeListener> stateChangeListeners;
  // Caches of discovered instances, keyed by znode path and by host name respectively.
  // Both maps are concurrent, but compound updates are serialized via instanceCacheLock.
  private final Map<String, Set<ServiceInstance>> pathToInstanceCache;
  private final Map<String, Set<ServiceInstance>> nodeToInstanceCache;
  private final Lock instanceCacheLock = new ReentrantLock();

  // get local hostname
  private static final String hostname;

  static {
    String localhost = "localhost";
    try {
      localhost = InetAddress.getLocalHost().getCanonicalHostName();
    } catch (UnknownHostException uhe) {
      // ignore - fall back to "localhost" rather than failing class initialization.
    }
    hostname = localhost;
  }

  /**
   * Builds the registry: resolves the ZK quorum and per-user paths, and constructs (but does not
   * start) the Curator client. Call {@link #start()} to connect and {@link #register()} to
   * publish this daemon.
   *
   * @param instanceName cluster/registry id; becomes a path component under the user prefix.
   * @param conf source configuration; copied, with yarn-site.xml added as a resource.
   */
  public LlapZookeeperRegistryImpl(String instanceName, Configuration conf) {
    this.conf = new Configuration(conf);
    this.conf.addResource(YarnConfiguration.YARN_SITE_CONFIGURATION_FILE);
    String zkEnsemble = getQuorumServers(this.conf);
    this.encoder = new RegistryUtils.ServiceRecordMarshal();
    int sessionTimeout = (int) HiveConf.getTimeVar(conf,
        ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT, TimeUnit.MILLISECONDS);
    int baseSleepTime = (int) HiveConf.getTimeVar(conf,
        ConfVars.HIVE_ZOOKEEPER_CONNECTION_BASESLEEPTIME, TimeUnit.MILLISECONDS);
    int maxRetries = HiveConf.getIntVar(conf, ConfVars.HIVE_ZOOKEEPER_CONNECTION_MAX_RETRIES);

    // sample path: /llap-sasl/hiveuser/hostname/workers/worker-0000000
    // worker-0000000 is the sequence number which will be retained until session timeout. If a
    // worker does not respond due to communication interruptions it will retain the same sequence
    // number when it returns back. If session timeout expires, the node will be deleted and new
    // addition of the same node (restart) will get next sequence number
    this.userPathPrefix = USER_SCOPE_PATH_PREFIX + getZkPathUser(this.conf);
    this.workersPath = "/" + userPathPrefix + "/" + instanceName + "/workers";
    this.instancesCache = null;
    this.instances = null;
    this.stateChangeListeners = new HashSet<>();
    this.pathToInstanceCache = new ConcurrentHashMap<>();
    this.nodeToInstanceCache = new ConcurrentHashMap<>();

    final boolean isSecure = UserGroupInformation.isSecurityEnabled();
    ACLProvider zooKeeperAclProvider = new ACLProvider() {
      @Override
      public List<ACL> getDefaultAcl() {
        // We always return something from getAclForPath so this should not happen.
        LOG.warn("getDefaultAcl was called");
        return Lists.newArrayList(ZooDefs.Ids.OPEN_ACL_UNSAFE);
      }

      @Override
      public List<ACL> getAclForPath(String path) {
        if (!isSecure || path == null || !path.contains(userPathPrefix)) {
          // No security or the path is below the user path - full access.
          return Lists.newArrayList(ZooDefs.Ids.OPEN_ACL_UNSAFE);
        }
        return createSecureAcls();
      }
    };

    String rootNs = HiveConf.getVar(conf, ConfVars.LLAP_ZK_REGISTRY_NAMESPACE);
    if (rootNs == null) {
      rootNs = isSecure ? SASL_NAMESPACE : UNSECURE_NAMESPACE; // The normal path.
    }

    // Create a CuratorFramework instance to be used as the ZooKeeper client
    // Use the zooKeeperAclProvider to create appropriate ACLs
    this.zooKeeperClient = CuratorFrameworkFactory.builder()
        .connectString(zkEnsemble)
        .sessionTimeoutMs(sessionTimeout)
        .aclProvider(zooKeeperAclProvider)
        .namespace(rootNs)
        .retryPolicy(new ExponentialBackoffRetry(baseSleepTime, maxRetries))
        .build();

    LOG.info("Llap Zookeeper Registry is enabled with registryid: " + instanceName);
  }

  /**
   * ACLs for the secure (user-scoped) subtree: world-readable, but create/delete/write/admin only
   * for the creator (the SASL-authenticated daemon principal).
   */
  private static List<ACL> createSecureAcls() {
    // Read all to the world
    List<ACL> nodeAcls = new ArrayList<ACL>(ZooDefs.Ids.READ_ACL_UNSAFE);
    // Create/Delete/Write/Admin to creator
    nodeAcls.addAll(ZooDefs.Ids.CREATOR_ALL_ACL);
    return nodeAcls;
  }

  /**
   * Get the ensemble server addresses from the configuration. The format is: host1:port,
   * host2:port..
   *
   * @param conf configuration holding HIVE_ZOOKEEPER_QUORUM / HIVE_ZOOKEEPER_CLIENT_PORT.
   **/
  private String getQuorumServers(Configuration conf) {
    String[] hosts = conf.getTrimmedStrings(ConfVars.HIVE_ZOOKEEPER_QUORUM.varname);
    String port = conf.get(ConfVars.HIVE_ZOOKEEPER_CLIENT_PORT.varname,
        ConfVars.HIVE_ZOOKEEPER_CLIENT_PORT.getDefaultValue());
    StringBuilder quorum = new StringBuilder();
    for (int i = 0; i < hosts.length; i++) {
      quorum.append(hosts[i].trim());
      if (!hosts[i].contains(":")) {
        // if the hostname doesn't contain a port, add the configured port to hostname
        quorum.append(":");
        quorum.append(port);
      }
      if (i != hosts.length - 1) {
        quorum.append(",");
      }
    }
    return quorum.toString();
  }

  /** Resolves the user name used as the ZK path component (and ACL scope). */
  private String getZkPathUser(Configuration conf) {
    // External LLAP clients would need to set LLAP_ZK_REGISTRY_USER to the LLAP daemon user
    // (hive), rather than relying on RegistryUtils.currentUser().
    String user = HiveConf.getVar(conf, ConfVars.LLAP_ZK_REGISTRY_USER, RegistryUtils.currentUser());
    return user;
  }

  /** Endpoint for the daemon's main task RPC port on this host. */
  public Endpoint getRpcEndpoint() {
    final int rpcPort = HiveConf.getIntVar(conf, ConfVars.LLAP_DAEMON_RPC_PORT);
    return RegistryTypeUtils.ipcEndpoint(IPC_LLAP, new InetSocketAddress(hostname, rpcPort));
  }

  /** Endpoint for the YARN shuffle port on this host. */
  public Endpoint getShuffleEndpoint() {
    final int shufflePort = HiveConf.getIntVar(conf, ConfVars.LLAP_DAEMON_YARN_SHUFFLE_PORT);
    // HTTP today, but might not be
    return RegistryTypeUtils.inetAddrEndpoint(IPC_SHUFFLE, ProtocolTypes.PROTOCOL_TCP,
        hostname, shufflePort);
  }

  /**
   * External web UI endpoint (http or https depending on LLAP_DAEMON_WEB_SSL).
   *
   * @throws RuntimeException if the constructed URL/URI is malformed.
   */
  public Endpoint getServicesEndpoint() {
    final int servicePort = HiveConf.getIntVar(conf, ConfVars.LLAP_DAEMON_WEB_PORT);
    final boolean isSSL = HiveConf.getBoolVar(conf, ConfVars.LLAP_DAEMON_WEB_SSL);
    final String scheme = isSSL ? "https" : "http";
    final URL serviceURL;
    try {
      serviceURL = new URL(scheme, hostname, servicePort, "");
      return RegistryTypeUtils.webEndpoint(IPC_SERVICES, serviceURL.toURI());
    } catch (MalformedURLException e) {
      throw new RuntimeException(e);
    } catch (URISyntaxException e) {
      throw new RuntimeException("llap service URI for " + hostname + " is invalid", e);
    }
  }

  /** Endpoint for the management RPC port on this host. */
  public Endpoint getMngEndpoint() {
    return RegistryTypeUtils.ipcEndpoint(IPC_MNG, new InetSocketAddress(hostname,
        HiveConf.getIntVar(conf, ConfVars.LLAP_MANAGEMENT_RPC_PORT)));
  }

  /** Endpoint for the LLAP output-format (external client data) service port on this host. */
  public Endpoint getOutputFormatEndpoint() {
    return RegistryTypeUtils.ipcEndpoint(IPC_OUTPUTFORMAT, new InetSocketAddress(hostname,
        HiveConf.getIntVar(conf, ConfVars.LLAP_DAEMON_OUTPUT_SERVICE_PORT)));
  }

  /**
   * Publishes this daemon in the registry: builds a ServiceRecord with all endpoints and
   * llap-prefixed config entries, then creates the ephemeral-sequential worker znode and the
   * slot znode, and (optionally) validates/repairs ACLs on the workers directory.
   *
   * @return this process's unique identity string (also stored in the record).
   * @throws IOException if znode creation times out or any step fails; partially-created znodes
   *         are closed before rethrowing.
   */
  @Override
  public String register() throws IOException {
    ServiceRecord srv = new ServiceRecord();
    Endpoint rpcEndpoint = getRpcEndpoint();
    srv.addInternalEndpoint(rpcEndpoint);
    srv.addInternalEndpoint(getMngEndpoint());
    srv.addInternalEndpoint(getShuffleEndpoint());
    srv.addExternalEndpoint(getServicesEndpoint());
    srv.addInternalEndpoint(getOutputFormatEndpoint());

    for (Map.Entry<String, String> kv : this.conf) {
      if (kv.getKey().startsWith(HiveConf.PREFIX_LLAP)
          || kv.getKey().startsWith(HiveConf.PREFIX_HIVE_LLAP)) {
        // TODO: read this somewhere useful, like the task scheduler
        srv.set(kv.getKey(), kv.getValue());
      }
    }

    // restart sensitive instance id
    srv.set(UNIQUE_IDENTIFIER, uniq.toString());

    // Create a znode under the rootNamespace parent for this instance of the server
    try {
      // PersistentEphemeralNode will make sure the ephemeral node created on server will be
      // present even under connection or session interruption (will automatically handle retries)
      znode = new PersistentEphemeralNode(zooKeeperClient, Mode.EPHEMERAL_SEQUENTIAL,
          workersPath + "/" + WORKER_PREFIX, encoder.toBytes(srv));

      // start the creation of znodes
      znode.start();

      // We'll wait for 120s for node creation
      long znodeCreationTimeout = 120;
      if (!znode.waitForInitialCreate(znodeCreationTimeout, TimeUnit.SECONDS)) {
        throw new Exception(
            "Max znode creation wait time: " + znodeCreationTimeout + "s exhausted");
      }
      znodePath = znode.getActualPath();

      // NOTE(review): SlotZnode is a project-local helper; presumably it claims the lowest free
      // "slot-" number for this worker - confirm against its implementation.
      slotZnode = new SlotZnode(
          zooKeeperClient, workersPath, SLOT_PREFIX, WORKER_PREFIX, uniq.toString());
      if (!slotZnode.start(znodeCreationTimeout, TimeUnit.SECONDS)) {
        throw new Exception(
            "Max znode creation wait time: " + znodeCreationTimeout + "s exhausted");
      }

      if (HiveConf.getBoolVar(conf, ConfVars.LLAP_VALIDATE_ACLS)) {
        try {
          checkAndSetAcls();
        } catch (Exception ex) {
          throw new IOException("Error validating or setting ACLs. " + DISABLE_MESSAGE, ex);
        }
      }

      if (zooKeeperClient.checkExists().forPath(znodePath) == null) {
        // No node exists, throw exception
        throw new Exception("Unable to create znode for this LLAP instance on ZooKeeper.");
      }
      LOG.info(
          "Registered node. Created a znode on ZooKeeper for LLAP instance: rpc: {}, shuffle: {},"
              + " webui: {}, mgmt: {}, znodePath: {} ", rpcEndpoint, getShuffleEndpoint(),
          getServicesEndpoint(), getMngEndpoint(), znodePath);
    } catch (Exception e) {
      LOG.error("Unable to create a znode for this server instance", e);
      CloseableUtils.closeQuietly(znode);
      CloseableUtils.closeQuietly(slotZnode);
      throw (e instanceof IOException) ? (IOException) e : new IOException(e);
    }

    if (LOG.isDebugEnabled()) {
      LOG.debug("Created zknode with path: {} service record: {}", znodePath, srv);
    }
    return uniq.toString();
  }

  /**
   * In secure mode, verifies the ACLs on the workers directory and rewrites them (via
   * {@link #setUpAcls}) if anyone other than the daemon principal has more than read access.
   * No-op when security is disabled.
   */
  private void checkAndSetAcls() throws Exception {
    if (!UserGroupInformation.isSecurityEnabled()) return;
    // We are trying to check ACLs on the "workers" directory, which no one except us should be
    // able to write to. Higher-level directories shouldn't matter - we don't read them.
    String pathToCheck = workersPath;
    List<ACL> acls = zooKeeperClient.getACL().forPath(pathToCheck);
    if (acls == null || acls.isEmpty()) {
      // Can there be no ACLs? There's some access (to get ACLs), so assume it means free for all.
      LOG.warn("No ACLs on " + pathToCheck + "; setting up ACLs. " + DISABLE_MESSAGE);
      setUpAcls(pathToCheck);
      return;
    }
    // This could be brittle.
    // userNameFromPrincipal is set in setZookeeperClientKerberosJaasConfig during start().
    assert userNameFromPrincipal != null;
    Id currentUser = new Id("sasl", userNameFromPrincipal);
    for (ACL acl : acls) {
      if ((acl.getPerms() & ~ZooDefs.Perms.READ) == 0 || currentUser.equals(acl.getId())) {
        continue; // Read permission/no permissions, or the expected user.
      }
      LOG.warn("The ACL " + acl + " is unnacceptable for " + pathToCheck
          + "; setting up ACLs. " + DISABLE_MESSAGE);
      setUpAcls(pathToCheck);
      return;
    }
  }

  /**
   * Applies the secure ACLs to {@code path} and, breadth-first, to every znode beneath it.
   */
  private void setUpAcls(String path) throws Exception {
    List<ACL> acls = createSecureAcls();
    LinkedList<String> paths = new LinkedList<>();
    paths.add(path);
    while (!paths.isEmpty()) {
      String currentPath = paths.poll();
      List<String> children = zooKeeperClient.getChildren().forPath(currentPath);
      if (children != null) {
        for (String child : children) {
          paths.add(currentPath + "/" + child);
        }
      }
      zooKeeperClient.setACL().withACL(acls).forPath(currentPath);
    }
  }

  /**
   * No-op: the ephemeral znodes disappear when the session ends / client closes (see stop()).
   */
  @Override
  public void unregister() throws IOException {
    // Nothing for the zkCreate models
  }

  /**
   * A live LLAP daemon as decoded from a worker znode's ServiceRecord. Identity (equals/hashCode)
   * is based solely on the record's unique id, so records re-read from ZK compare equal.
   */
  private class DynamicServiceInstance implements ServiceInstance {
    private final ServiceRecord srv;
    private final String host;
    private final int rpcPort;
    private final int mngPort;
    private final int shufflePort;
    private final int outputFormatPort;
    private final String serviceAddress;
    private final Resource resource;

    /**
     * Extracts host, ports, web address and resource info from the record.
     *
     * @throws IOException if the memory/executor config values are not valid integers.
     */
    public DynamicServiceInstance(ServiceRecord srv) throws IOException {
      this.srv = srv;

      if (LOG.isTraceEnabled()) {
        LOG.trace("Working with ServiceRecord: {}", srv);
      }

      final Endpoint shuffle = srv.getInternalEndpoint(IPC_SHUFFLE);
      final Endpoint rpc = srv.getInternalEndpoint(IPC_LLAP);
      final Endpoint mng = srv.getInternalEndpoint(IPC_MNG);
      final Endpoint outputFormat = srv.getInternalEndpoint(IPC_OUTPUTFORMAT);
      final Endpoint services = srv.getExternalEndpoint(IPC_SERVICES);

      this.host =
          RegistryTypeUtils.getAddressField(rpc.addresses.get(0),
              AddressTypes.ADDRESS_HOSTNAME_FIELD);
      this.rpcPort =
          Integer.parseInt(RegistryTypeUtils.getAddressField(rpc.addresses.get(0),
              AddressTypes.ADDRESS_PORT_FIELD));
      this.mngPort =
          Integer.parseInt(RegistryTypeUtils.getAddressField(mng.addresses.get(0),
              AddressTypes.ADDRESS_PORT_FIELD));
      this.shufflePort =
          Integer.parseInt(RegistryTypeUtils.getAddressField(shuffle.addresses.get(0),
              AddressTypes.ADDRESS_PORT_FIELD));
      this.outputFormatPort =
          Integer.valueOf(RegistryTypeUtils.getAddressField(outputFormat.addresses.get(0),
              AddressTypes.ADDRESS_PORT_FIELD));
      this.serviceAddress =
          RegistryTypeUtils.getAddressField(services.addresses.get(0), AddressTypes.ADDRESS_URI);

      String memStr = srv.get(ConfVars.LLAP_DAEMON_MEMORY_PER_INSTANCE_MB.varname, "");
      String coreStr = srv.get(ConfVars.LLAP_DAEMON_NUM_EXECUTORS.varname, "");
      try {
        this.resource = Resource.newInstance(Integer.parseInt(memStr), Integer.parseInt(coreStr));
      } catch (NumberFormatException ex) {
        throw new IOException("Invalid resource configuration for a LLAP node: memory "
            + memStr + ", vcores " + coreStr);
      }
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (o == null || getClass() != o.getClass()) {
        return false;
      }
      DynamicServiceInstance other = (DynamicServiceInstance) o;
      return this.getWorkerIdentity().equals(other.getWorkerIdentity());
    }

    @Override
    public int hashCode() {
      return getWorkerIdentity().hashCode();
    }

    @Override
    public String getWorkerIdentity() {
      return srv.get(UNIQUE_IDENTIFIER);
    }

    @Override
    public String getHost() {
      return host;
    }

    @Override
    public int getRpcPort() {
      return rpcPort;
    }

    @Override
    public int getShufflePort() {
      return shufflePort;
    }

    @Override
    public String getServicesAddress() {
      return serviceAddress;
    }

    @Override
    public Map<String, String> getProperties() {
      return srv.attributes();
    }

    @Override
    public Resource getResource() {
      return resource;
    }

    @Override
    public String toString() {
      return "DynamicServiceInstance [id=" + getWorkerIdentity() + ", host=" + host
          + ":" + rpcPort + " with resources=" + getResource()
          + ", shufflePort=" + getShufflePort()
          + ", servicesAddress=" + getServicesAddress()
          + ", mgmtPort=" + getManagementPort() + "]";
    }

    @Override
    public int getManagementPort() {
      return mngPort;
    }

    @Override
    public int getOutputFormatPort() {
      return outputFormatPort;
    }

    // TODO: This needs a hashCode/equality implementation if used as a key in various structures.
    // A new ServiceInstance is created each time.
  }

  /** Adds an instance to both lookup caches (by znode path and by host) under the cache lock. */
  private void addToCache(String path, String host, ServiceInstance instance) {
    instanceCacheLock.lock();
    try {
      putInCache(path, pathToInstanceCache, instance);
      putInCache(host, nodeToInstanceCache, instance);
    } finally {
      instanceCacheLock.unlock();
    }
    LOG.debug("Added path={}, host={} instance={} to cache."
            + " pathToInstanceCache:size={}, nodeToInstanceCache:size={}",
        path, host, instance, pathToInstanceCache.size(), nodeToInstanceCache.size());
  }

  /** Removes the cache entries for a znode path and its host under the cache lock. */
  private void removeFromCache(String path, String host) {
    instanceCacheLock.lock();
    try {
      pathToInstanceCache.remove(path);
      nodeToInstanceCache.remove(host);
    } finally {
      instanceCacheLock.unlock();
    }
    LOG.debug("Removed path={}, host={} from cache."
            + " pathToInstanceCache:size={}, nodeToInstanceCache:size={}",
        path, host, pathToInstanceCache.size(), nodeToInstanceCache.size());
  }

  /** Inserts into a multi-map style cache, creating the value set on first use. */
  private void putInCache(String key, Map<String, Set<ServiceInstance>> cache,
      ServiceInstance instance) {
    Set<ServiceInstance> instanceSet = cache.get(key);
    if (instanceSet == null) {
      instanceSet = Sets.newHashSet();
      cache.put(key, instanceSet);
    }
    instanceSet.add(instance);
  }

  /**
   * Client-side view of the live daemons, backed by the PathChildrenCache of the workers
   * directory plus the path/host caches maintained by the change listener.
   */
  private class DynamicServiceInstanceSet implements ServiceInstanceSet {
    private final PathChildrenCache instancesCache;

    public DynamicServiceInstanceSet(final PathChildrenCache cache) {
      this.instancesCache = cache;
      populateCache();
    }

    /** Seeds the path/host caches from the cache's initial snapshot of worker znodes. */
    private void populateCache() {
      for (ChildData childData : instancesCache.getCurrentData()) {
        byte[] data = getWorkerData(childData);
        if (data == null) continue;
        try {
          ServiceRecord srv = encoder.fromBytes(childData.getPath(), data);
          ServiceInstance instance = new DynamicServiceInstance(srv);
          addToCache(childData.getPath(), instance.getHost(), instance);
        } catch (IOException e) {
          LOG.error("Unable to decode data for zkpath: {}."
              + " Ignoring from current instances list..", childData.getPath());
        }
      }
    }

    @Override
    public Collection<ServiceInstance> getAll() {
      Set<ServiceInstance> instances = new HashSet<>();
      for (Set<ServiceInstance> instanceSet : pathToInstanceCache.values()) {
        instances.addAll(instanceSet);
      }
      return instances;
    }

    /**
     * Derives the YARN ApplicationId from the first worker record that carries a container id;
     * returns null if none do.
     */
    public ApplicationId getApplicationId() {
      for (ChildData childData : instancesCache.getCurrentData()) {
        byte[] data = getWorkerData(childData);
        if (data == null) continue;
        ServiceRecord sr = null;
        try {
          sr = encoder.fromBytes(childData.getPath(), data);
        } catch (IOException e) {
          LOG.error("Unable to decode data for zkpath: {}."
              + " Ignoring from current instances list..", childData.getPath());
          continue;
        }
        String containerStr = sr.get(HiveConf.ConfVars.LLAP_DAEMON_CONTAINER_ID.varname);
        if (containerStr == null || containerStr.isEmpty()) continue;
        return ContainerId.fromString(containerStr).getApplicationAttemptId().getApplicationId();
      }
      return null;
    }

    /** Returns the znode payload only for worker- nodes (filters out slot- and unknown nodes). */
    private byte[] getWorkerData(ChildData childData) {
      if (childData == null) return null;
      byte[] data = childData.getData();
      if (data == null) return null;
      if (!extractNodeName(childData).startsWith(WORKER_PREFIX)) return null;
      return data;
    }

    /**
     * Returns instances ordered by their slot number. With consistentIndexes, gaps left by
     * missing daemons are filled with placeholder InactiveServiceInstance entries so that slot
     * positions remain stable across calls.
     */
    @Override
    public Collection<ServiceInstance> getAllInstancesOrdered(boolean consistentIndexes) {
      Map<String, Long> slotByWorker = new HashMap<String, Long>();
      Set<ServiceInstance> unsorted = Sets.newHashSet();
      for (ChildData childData : instancesCache.getCurrentData()) {
        if (childData == null) continue;
        byte[] data = childData.getData();
        if (data == null) continue;
        String nodeName = extractNodeName(childData);
        if (nodeName.startsWith(WORKER_PREFIX)) {
          Set<ServiceInstance> instances = pathToInstanceCache.get(childData.getPath());
          if (instances != null) {
            unsorted.addAll(instances);
          }
        } else if (nodeName.startsWith(SLOT_PREFIX)) {
          // Slot znode's payload is the worker id; its name suffix is the slot number.
          slotByWorker.put(extractWorkerIdFromSlot(childData),
              Long.parseLong(nodeName.substring(SLOT_PREFIX.length())));
        } else {
          LOG.info("Ignoring unknown node {}", childData.getPath());
        }
      }

      TreeMap<Long, ServiceInstance> sorted = new TreeMap<>();
      long maxSlot = Long.MIN_VALUE;
      for (ServiceInstance worker : unsorted) {
        Long slot = slotByWorker.get(worker.getWorkerIdentity());
        if (slot == null) {
          LOG.info("Unknown slot for {}", worker.getWorkerIdentity());
          continue;
        }
        maxSlot = Math.max(maxSlot, slot);
        sorted.put(slot, worker);
      }

      if (consistentIndexes) {
        // Add dummy instances to all slots where LLAPs are MIA... I can haz insert_iterator?
        TreeMap<Long, ServiceInstance> dummies = new TreeMap<>();
        Iterator<Long> keyIter = sorted.keySet().iterator();
        long expected = 0;
        Long ts = null;
        while (keyIter.hasNext()) {
          Long slot = keyIter.next();
          assert slot >= expected;
          while (slot > expected) {
            if (ts == null) {
              ts = System.nanoTime(); // Inactive nodes restart every call!
            }
            dummies.put(expected, new InactiveServiceInstance("inactive-" + expected + "-" + ts));
            ++expected;
          }
          ++expected;
        }
        sorted.putAll(dummies);
      }
      return sorted.values();
    }

    @Override
    public ServiceInstance getInstance(String name) {
      Collection<ServiceInstance> instances = getAll();
      for (ServiceInstance instance : instances) {
        if (instance.getWorkerIdentity().equals(name)) {
          return instance;
        }
      }
      return null;
    }

    @Override
    public Set<ServiceInstance> getByHost(String host) {
      Set<ServiceInstance> byHost = nodeToInstanceCache.get(host);
      byHost = (byHost == null) ? Sets.<ServiceInstance>newHashSet() : byHost;
      if (LOG.isDebugEnabled()) {
        LOG.debug("Returning " + byHost.size() + " hosts for locality allocation on " + host);
      }
      return byHost;
    }

    @Override
    public int size() {
      // not using the path child cache here as there could be more than 1 path per host
      // (worker and slot znodes)
      return nodeToInstanceCache.size();
    }
  }

  // TODO: make class static? fields leak
  /**
   * Reacts to worker znode add/update/remove events: keeps the path/host caches in sync and
   * fans the event out to registered state-change listeners. Slot znode events are ignored.
   */
  private class InstanceStateChangeListener implements PathChildrenCacheListener {
    private final Logger LOG = LoggerFactory.getLogger(InstanceStateChangeListener.class);

    @Override
    public void childEvent(final CuratorFramework client, final PathChildrenCacheEvent event)
        throws Exception {
      Preconditions.checkArgument(client != null
          && client.getState() == CuratorFrameworkState.STARTED, "client is not started");

      // Serialize event handling so cache updates and listener callbacks don't interleave.
      synchronized (this) {
        ChildData childData = event.getData();
        if (childData == null) return;
        String nodeName = extractNodeName(childData);
        if (!nodeName.startsWith(WORKER_PREFIX)) return; // No need to propagate slot updates.
        LOG.info("{} for zknode {} in llap namespace", event.getType(), childData.getPath());
        ServiceInstance instance = extractServiceInstance(event, childData);
        switch (event.getType()) {
        case CHILD_ADDED:
          addToCache(childData.getPath(), instance.getHost(), instance);
          for (ServiceInstanceStateChangeListener listener : stateChangeListeners) {
            listener.onCreate(instance);
          }
          break;
        case CHILD_UPDATED:
          addToCache(childData.getPath(), instance.getHost(), instance);
          for (ServiceInstanceStateChangeListener listener : stateChangeListeners) {
            listener.onUpdate(instance);
          }
          break;
        case CHILD_REMOVED:
          removeFromCache(childData.getPath(), instance.getHost());
          for (ServiceInstanceStateChangeListener listener : stateChangeListeners) {
            listener.onRemove(instance);
          }
          break;
        default:
          // Ignore all the other events; logged above.
        }
      }
    }
  }

  /** Slot znodes store the owning worker's id as their payload. */
  private static String extractWorkerIdFromSlot(ChildData childData) {
    return new String(childData.getData(), SlotZnode.CHARSET);
  }

  /** Returns the last path component of the znode (e.g. "worker-0000001"). */
  private static String extractNodeName(ChildData childData) {
    String nodeName = childData.getPath();
    int ix = nodeName.lastIndexOf("/");
    if (ix >= 0) {
      nodeName = nodeName.substring(ix + 1);
    }
    return nodeName;
  }

  /**
   * Decodes a ServiceInstance from an event's znode payload; returns null (and logs) when the
   * payload is absent or undecodable.
   */
  private ServiceInstance extractServiceInstance(
      PathChildrenCacheEvent event, ChildData childData) {
    byte[] data = childData.getData();
    if (data == null) return null;
    try {
      ServiceRecord srv = encoder.fromBytes(event.getData().getPath(), data);
      return new DynamicServiceInstance(srv);
    } catch (IOException e) {
      LOG.error("Unable to decode data for zknode: {}."
          + " Dropping notification of type: {}", childData.getPath(), event.getType());
      return null;
    }
  }

  /**
   * Returns the (lazily created) instance set, first ensuring the PathChildrenCache is started,
   * waiting up to clusterReadyTimeoutMs for the cluster paths to become readable.
   */
  @Override
  public ServiceInstanceSet getInstances(
      String component, long clusterReadyTimeoutMs) throws IOException {
    checkPathChildrenCache(clusterReadyTimeoutMs);

    // lazily create instances
    if (instances == null) {
      this.instances = new DynamicServiceInstanceSet(instancesCache);
    }
    return instances;
  }

  @Override
  public ApplicationId getApplicationId() throws IOException {
    getInstances("LLAP", 0);
    return instances.getApplicationId();
  }

  @Override
  public synchronized void registerStateChangeListener(
      final ServiceInstanceStateChangeListener listener) throws IOException {
    checkPathChildrenCache(0);
    this.stateChangeListeners.add(listener);
  }

  /**
   * Lazily starts the PathChildrenCache on the workers path. An InvalidACLException means the
   * secure cluster paths don't exist yet, so retry with exponential backoff until
   * clusterReadyTimeoutMs elapses (0 means fail immediately); any other failure is fatal.
   */
  private synchronized void checkPathChildrenCache(long clusterReadyTimeoutMs) throws IOException {
    Preconditions.checkArgument(zooKeeperClient != null
        && zooKeeperClient.getState() == CuratorFrameworkState.STARTED, "client is not started");

    // lazily create PathChildrenCache
    if (instancesCache != null) return;
    ExecutorService tp = Executors.newFixedThreadPool(1, new ThreadFactoryBuilder()
        .setDaemon(true).setNameFormat("StateChangeNotificationHandler").build());
    long startTimeNs = System.nanoTime(), deltaNs = clusterReadyTimeoutMs * 1000000L;
    long sleepTimeMs = Math.min(16, clusterReadyTimeoutMs);
    while (true) {
      PathChildrenCache instancesCache = new PathChildrenCache(zooKeeperClient, workersPath, true);
      instancesCache.getListenable().addListener(new InstanceStateChangeListener(), tp);
      try {
        instancesCache.start(PathChildrenCache.StartMode.BUILD_INITIAL_CACHE);
        this.instancesCache = instancesCache;
        break;
      } catch (InvalidACLException e) {
        // PathChildrenCache tried to mkdir when the znode wasn't there, and failed.
        CloseableUtils.closeQuietly(instancesCache);
        long elapsedNs = System.nanoTime() - startTimeNs;
        if (deltaNs == 0 || deltaNs <= elapsedNs) {
          LOG.error("Unable to start curator PathChildrenCache", e);
          throw new IOException(e);
        }
        LOG.warn("The cluster is not started yet (InvalidACL); will retry");
        try {
          Thread.sleep(Math.min(sleepTimeMs, (deltaNs - elapsedNs) / 1000000L));
        } catch (InterruptedException e1) {
          LOG.error("Interrupted while retrying the PathChildrenCache startup");
          throw new IOException(e1);
        }
        sleepTimeMs = sleepTimeMs << 1; // exponential backoff
      } catch (Exception e) {
        CloseableUtils.closeQuietly(instancesCache);
        LOG.error("Unable to start curator PathChildrenCache", e);
        throw new IOException(e);
      }
    }
  }

  /** Sets up ZK SASL auth (if applicable) and connects the Curator client. */
  @Override
  public void start() throws IOException {
    if (zooKeeperClient != null) {
      setupZookeeperAuth(this.conf);
      zooKeeperClient.start();
    }
    // Init closeable utils in case register is not called (see HIVE-13322)
    CloseableUtils.class.getName();
  }

  /** Closes znodes, the children cache and the client; safe to call on partially-started state. */
  @Override
  public void stop() throws IOException {
    CloseableUtils.closeQuietly(znode);
    CloseableUtils.closeQuietly(slotZnode);
    CloseableUtils.closeQuietly(instancesCache);
    CloseableUtils.closeQuietly(zooKeeperClient);
  }

  /**
   * Installs the kerberos JAAS configuration for the ZK client, but only for secure daemon
   * processes; clients and unsecure setups skip it.
   *
   * @throws IOException if the LLAP principal or keytab config is missing/empty.
   */
  private void setupZookeeperAuth(final Configuration conf) throws IOException {
    if (UserGroupInformation.isSecurityEnabled() && LlapProxy.isDaemon()) {
      LOG.info("UGI security is enabled. Setting up ZK auth.");

      String llapPrincipal = HiveConf.getVar(conf, ConfVars.LLAP_KERBEROS_PRINCIPAL);
      if (llapPrincipal == null || llapPrincipal.isEmpty()) {
        throw new IOException("Llap Kerberos principal is empty");
      }

      String llapKeytab = HiveConf.getVar(conf, ConfVars.LLAP_KERBEROS_KEYTAB_FILE);
      if (llapKeytab == null || llapKeytab.isEmpty()) {
        throw new IOException("Llap Kerberos keytab is empty");
      }

      // Install the JAAS Configuration for the runtime
      setZookeeperClientKerberosJaasConfig(llapPrincipal, llapKeytab);
    } else {
      LOG.info("UGI security is not enabled, or non-daemon environment. Skipping setting up ZK auth.");
    }
  }

  /**
   * Dynamically sets up the JAAS configuration that uses kerberos
   *
   * @param principal kerberos principal (may contain _HOST, resolved against 0.0.0.0)
   * @param keyTabFile keytab file path
   * @throws IOException
   */
  private void setZookeeperClientKerberosJaasConfig(String principal, String keyTabFile)
      throws IOException {
    // ZooKeeper property name to pick the correct JAAS conf section
    final String SASL_LOGIN_CONTEXT_NAME = "LlapZooKeeperClient";
    System.setProperty(ZooKeeperSaslClient.LOGIN_CONTEXT_NAME_KEY, SASL_LOGIN_CONTEXT_NAME);

    principal = SecurityUtil.getServerPrincipal(principal, "0.0.0.0");
    // Remembered for later ACL-ownership checks in checkAndSetAcls.
    userNameFromPrincipal = LlapUtil.getUserNameFromPrincipal(principal);
    JaasConfiguration jaasConf = new JaasConfiguration(SASL_LOGIN_CONTEXT_NAME, principal,
        keyTabFile);

    // Install the Configuration in the runtime.
    javax.security.auth.login.Configuration.setConfiguration(jaasConf);
  }

  /**
   * A JAAS configuration for ZooKeeper clients intended to use for SASL
   * Kerberos.
   */
  private static class JaasConfiguration extends javax.security.auth.login.Configuration {
    // Current installed Configuration
    private final javax.security.auth.login.Configuration baseConfig =
        javax.security.auth.login.Configuration.getConfiguration();
    private final String loginContextName;
    private final String principal;
    private final String keyTabFile;

    public JaasConfiguration(String llapLoginContextName, String principal, String keyTabFile) {
      this.loginContextName = llapLoginContextName;
      this.principal = principal;
      this.keyTabFile = keyTabFile;
    }

    @Override
    public AppConfigurationEntry[] getAppConfigurationEntry(String appName) {
      if (loginContextName.equals(appName)) {
        // Keytab-based, non-interactive login entry for the LLAP daemon principal.
        Map<String, String> krbOptions = new HashMap<String, String>();
        krbOptions.put("doNotPrompt", "true");
        krbOptions.put("storeKey", "true");
        krbOptions.put("useKeyTab", "true");
        krbOptions.put("principal", principal);
        krbOptions.put("keyTab", keyTabFile);
        krbOptions.put("refreshKrb5Config", "true");
        AppConfigurationEntry llapZooKeeperClientEntry = new AppConfigurationEntry(
            KerberosUtil.getKrb5LoginModuleName(),
            AppConfigurationEntry.LoginModuleControlFlag.REQUIRED, krbOptions);
        return new AppConfigurationEntry[]{llapZooKeeperClientEntry};
      }
      // Try the base config
      if (baseConfig != null) {
        return baseConfig.getAppConfigurationEntry(appName);
      }
      return null;
    }
  }
}