/* * Copyright (C) 2015 hops.io. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package io.hops.util; import io.hops.leaderElection.LeaderElection; import io.hops.leaderElection.YarnLeDescriptorFactory; import io.hops.leader_election.node.ActiveNode; import io.hops.leader_election.node.SortedActiveNodeList; import io.hops.metadata.yarn.entity.Load; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.ha.HAServiceProtocol; import org.apache.hadoop.ha.HAServiceStatus; import org.apache.hadoop.ha.HealthCheckFailedException; import org.apache.hadoop.ha.ServiceFailedException; import org.apache.hadoop.ipc.Server; import org.apache.hadoop.ipc.StandbyException; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.authorize.AccessControlList; import org.apache.hadoop.service.CompositeService; import org.apache.hadoop.yarn.conf.HAUtil; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.ipc.RPCUtil; import org.apache.hadoop.yarn.ipc.YarnRPC; import org.apache.hadoop.yarn.server.api.protocolrecords.RefreshAdminAclsResponse; import org.apache.hadoop.yarn.server.utils.YarnServerBuilderUtils; import io.hops.util.impl.ActiveRMPBImpl; import java.io.IOException; import java.io.InputStream; import java.net.InetSocketAddress; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.hadoop.yarn.security.YarnAuthorizationProvider; import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.RMServerUtils; import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager; public class GroupMembershipService extends CompositeService implements GroupMembership, HAServiceProtocol { private static final Log LOG = LogFactory.getLog(GroupMembershipService.class); private final RMContext rmContext; private final ResourceManager rm; private Server server; private AccessControlList adminAcl; private final RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null); private LeaderElection groupMembership; private boolean autoFailoverEnabled; private InetSocketAddress groupMembershipServiceAddress; boolean running = true; private String rmId = ""; private Thread lEnGmMonitor; private YarnAuthorizationProvider authorizer; private UserGroupInformation daemonUser; private Configuration conf; public GroupMembershipService(ResourceManager rm, RMContext rmContext) { super(GroupMembershipService.class.getName()); this.rm = rm; this.rmContext = rmContext; } @Override public synchronized void serviceInit(Configuration conf) throws Exception { this.conf = conf; groupMembershipServiceAddress = conf.getSocketAddr( YarnConfiguration.RM_BIND_HOST, YarnConfiguration.RM_GROUP_MEMBERSHIP_ADDRESS, YarnConfiguration.DEFAULT_RM_GROUP_MEMBERSHIP_ADDRESS, YarnConfiguration.DEFAULT_RM_GROUP_MEMBERSHIP_PORT); adminAcl = new AccessControlList(conf.get(YarnConfiguration.YARN_ADMIN_ACL, YarnConfiguration.DEFAULT_YARN_ADMIN_ACL)); if (HAUtil.isHAEnabled(conf)) { this.rmId = HAUtil.getRMHAId(conf); } daemonUser = UserGroupInformation.getCurrentUser(); authorizer = YarnAuthorizationProvider.getInstance(conf); authorizer.setAdmins(getAdminAclList(conf), UserGroupInformation .getCurrentUser()); LOG.info("init groupMembershipService " + this.rmId); } private AccessControlList getAdminAclList(Configuration conf) { AccessControlList aclList = new AccessControlList(conf.get(YarnConfiguration.YARN_ADMIN_ACL, YarnConfiguration.DEFAULT_YARN_ADMIN_ACL)); aclList.addUser(daemonUser.getShortUserName()); return aclList; } @Override protected synchronized void serviceStart() throws Exception { startServer(); groupMembershipServiceAddress = getConfig().updateConnectAddr(YarnConfiguration.RM_BIND_HOST, YarnConfiguration.RM_GROUP_MEMBERSHIP_ADDRESS, YarnConfiguration.DEFAULT_RM_GROUP_MEMBERSHIP_ADDRESS, server.getListenerAddress()); startGroupMembership(); LOG.info("Started GMS: " + rmId + " on " + groupMembershipServiceAddress.getAddress().getHostAddress() + ":" + groupMembershipServiceAddress.getPort()); super.serviceStart(); } protected synchronized void startGroupMembership() throws IOException { if (rmContext.isHAEnabled() || rmContext.isDistributed()) { initLEandGM(conf); } if (groupMembership != null) { groupMembership.start(); try { groupMembership.waitActive(); } catch (InterruptedException e) { LOG.warn("Group membership service was interrupted"); } lEnGmMonitor = new Thread(new LEnGmMonitor()); lEnGmMonitor.setName("group membership monitor"); lEnGmMonitor.start(); } } @Override protected synchronized void serviceStop() throws Exception { stopServer(); LOG.info("stopping group membership service service"); stopGroupMembership(); LOG.info("stopped group membership service"); super.serviceStop(); LOG.info("stopped GMS on " + rmId); } protected synchronized void stopGroupMembership() throws Exception { if (groupMembership != null && groupMembership.isRunning()) { groupMembership.stopElectionThread(); } } protected void startServer() throws Exception { Configuration conf = getConfig(); YarnRPC rpc = YarnRPC.create(conf); this.server = rpc.getServer(GroupMembership.class, this, groupMembershipServiceAddress, conf, null, conf.getInt(YarnConfiguration.RM_GROUP_MEMBERSHIP_CLIENT_THREAD_COUNT, YarnConfiguration.DEFAULT_RM_GROUP_MEMBERSHIP_CLIENT_THREAD_COUNT)); this.server.start(); } protected void stopServer() throws Exception { if (this.server != null) { LOG.info("stopping group membership service server on " + server.getListenerAddress().getHostName() + ":" + server.getPort()); this.server.stop(); } } private synchronized Configuration getConfiguration(Configuration conf, String confFileName) throws YarnException, IOException { InputStream confFileInputStream = this.rmContext.getConfigurationProvider() .getConfigurationInputStream(conf, confFileName); if (confFileInputStream != null) { conf.addResource(confFileInputStream); } return conf; } public String getRMId() { return rmId; } public boolean isLeader() { if (groupMembership != null && groupMembership.isRunning()) { return groupMembership.isLeader(); } else { return false; } } public boolean isLeadingRT(){ if(groupMembership!=null && groupMembership.isRunning()){ return groupMembership.isSecond(); }else{ return false; } } public boolean isAlone(){ if(groupMembership.getActiveNamenodes().size()==1){ return true; }else{ return false; } } @Override public synchronized void monitorHealth() throws IOException { checkAccess("monitorHealth"); if (isRMActive() && !rm.areSchedulerServicesRunning()) { throw new HealthCheckFailedException( "Active ResourceManager services are not running!"); } } private UserGroupInformation checkAccess(String method) throws IOException { return RMServerUtils.verifyAdminAccess(authorizer, method, LOG); } private UserGroupInformation checkAcls(String method) throws YarnException { try { return checkAccess(method); } catch (IOException ioe) { throw RPCUtil.getRemoteException(ioe); } } private synchronized boolean isRMActive() { return HAServiceState.ACTIVE == rmContext.getHAServiceState(); } private void throwStandbyException() throws StandbyException { throw new StandbyException( "ResourceManager " + rmId + " is not Active!"); } @Override public synchronized void transitionToActive( HAServiceProtocol.StateChangeRequestInfo reqInfo) throws IOException { // call refreshAdminAcls before HA state transition // for the case that adminAcls have been updated in previous active RM try { refreshAdminAcls(false); } catch (YarnException ex) { throw new ServiceFailedException("Can not execute refreshAdminAcls", ex); } throw new UnsupportedOperationException("not implemented yet"); } @Override public synchronized void transitionToStandby( HAServiceProtocol.StateChangeRequestInfo reqInfo) throws IOException { // call refreshAdminAcls before HA state transition // for the case that adminAcls have been updated in previous active RM try { refreshAdminAcls(false); } catch (YarnException ex) { throw new ServiceFailedException("Can not execute refreshAdminAcls", ex); } throw new UnsupportedOperationException("not implemented yet"); } private RefreshAdminAclsResponse refreshAdminAcls(boolean checkRMHAState) throws YarnException, IOException { String argName = "refreshAdminAcls"; UserGroupInformation user = checkAcls(argName); if (checkRMHAState) { checkRMStatus(user.getShortUserName(), argName, "refresh Admin ACLs."); } Configuration conf = getConfiguration(new Configuration(false), YarnConfiguration.YARN_SITE_CONFIGURATION_FILE); authorizer.setAdmins(getAdminAclList(conf), UserGroupInformation .getCurrentUser()); RMAuditLogger.logSuccess(user.getShortUserName(), argName, "AdminService"); return recordFactory.newRecordInstance(RefreshAdminAclsResponse.class); } private void checkRMStatus(String user, String argName, String msg) throws StandbyException { if (!isRMActive()) { RMAuditLogger.logFailure(user, argName, "", "AdminService", "ResourceManager is not active. Can not " + msg); throwStandbyException(); } } @Override public synchronized HAServiceStatus getServiceStatus() throws IOException { checkAccess("getServiceState"); HAServiceState haState = rmContext.getHAServiceState(); HAServiceStatus ret = new HAServiceStatus(haState); if (isRMActive() || haState == HAServiceProtocol.HAServiceState.STANDBY) { ret.setReadyToBecomeActive(); } else { ret.setNotReadyToBecomeActive("State is " + haState); } return ret; } @Override public LiveRMsResponse getLiveRMList() { List<ActiveNode> rmList = new ArrayList<ActiveNode>(); Map<String, Load> loads; try { loads = DBUtility.getAllLoads(); } catch (IOException ex) { LOG.error(ex); loads = new HashMap<String, Load>(); } SortedActiveNodeList nnList = groupMembership.getActiveNamenodes(); for (ActiveNode node : nnList.getSortedActiveNodes()) { if (loads.get(node.getHostname()) == null) { rmList.add(new ActiveRMPBImpl(node.getId(), node.getHostname(), node. getIpAddress(), node.getPort(), node.getHttpAddress(), 0)); } else { rmList.add(new ActiveRMPBImpl(node.getId(), node.getHostname(), node. getIpAddress(), node.getPort(), node.getHttpAddress(), loads. get(node.getHostname()).getLoad())); } } SortedActiveRMList sortedRmList = new SortedActiveRMList(rmList); return YarnServerBuilderUtils.newLiveRMsResponse(sortedRmList); } private void initLEandGM(Configuration conf) throws IOException { // Initialize the leader election algorithm (only once rpc server is // created and httpserver is started) long leadercheckInterval = conf.getInt(CommonConfigurationKeys.DFS_LEADER_CHECK_INTERVAL_IN_MS_KEY, CommonConfigurationKeys.DFS_LEADER_CHECK_INTERVAL_IN_MS_DEFAULT); int missedHeartBeatThreshold = conf.getInt(CommonConfigurationKeys.DFS_LEADER_MISSED_HB_THRESHOLD_KEY, CommonConfigurationKeys.DFS_LEADER_MISSED_HB_THRESHOLD_DEFAULT); int leIncrement = conf.getInt(CommonConfigurationKeys.DFS_LEADER_TP_INCREMENT_KEY, CommonConfigurationKeys.DFS_LEADER_TP_INCREMENT_DEFAULT); groupMembership = new LeaderElection(new YarnLeDescriptorFactory(), leadercheckInterval, missedHeartBeatThreshold, leIncrement, rmId, groupMembershipServiceAddress.getAddress().getHostAddress() + ":" + groupMembershipServiceAddress.getPort()); } private class LEnGmMonitor implements Runnable { Boolean previousLeaderRole = null; Boolean previousLeadingRTRole = null; @Override public void run() { try { while (groupMembership.isRunning()) { boolean currentLeaderRole = isLeader(); if (previousLeaderRole == null || currentLeaderRole != previousLeaderRole) { previousLeaderRole = currentLeaderRole; switchLeaderRole(previousLeaderRole); } boolean currentLeadingRTRole = isLeadingRT(); Thread.sleep(100L); } } catch (Exception ex) { LOG.error(ex, ex); } } private void switchLeaderRole(boolean role) throws Exception { conf.set(YarnConfiguration.RM_HA_ID, rmId); if (role) { LOG.info(groupMembership.getCurrentId() + " switching to active "); rm.transitionToActive(); } else { LOG.info(groupMembership.getCurrentId() + " switching to standby "); rm.transitionToStandby(true); } } } public void relinquishId() throws InterruptedException { if(groupMembership!=null){ groupMembership.relinquishCurrentIdInNextRound(); } } }