/* * Copyright 2015 Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.yarn.server.resourcemanager.quota; import io.hops.exception.StorageException; import io.hops.metadata.common.entity.LongVariable; import io.hops.metadata.common.entity.Variable; import io.hops.metadata.hdfs.dal.VariableDataAccess; import io.hops.metadata.yarn.dal.ContainerStatusDataAccess; import io.hops.metadata.yarn.dal.quota.ContainersLogsDataAccess; import io.hops.metadata.yarn.dal.quota.PriceMultiplicatorDataAccess; import io.hops.metadata.yarn.dal.util.YARNOperationType; import io.hops.metadata.yarn.entity.ContainerStatus; import io.hops.metadata.yarn.entity.quota.ContainerLog; import io.hops.metadata.yarn.entity.quota.PriceMultiplicator; import io.hops.transaction.handler.LightWeightRequestHandler; import io.hops.util.RMStorageFactory; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.LinkedBlockingQueue; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.service.CompositeService; import org.apache.hadoop.yarn.api.records.ContainerExitStatus; import org.apache.hadoop.yarn.api.records.ContainerState; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.api.records.impl.pb.ResourcePBImpl; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.util.ConverterUtils; public class ContainersLogsService extends CompositeService { private static final Log LOG = LogFactory.getLog(ContainersLogsService.class); Configuration conf; private Thread tickThread; private volatile boolean stopped; //Flag for Thread force stop private long monitorInterval; //Time in ms till next ContainerStatus read private boolean checkpointEnabled; private int checkpointInterval; //Time in ticks between checkpoints private double alertThreshold; private double threshold; private final RMContext rMContext; private float currentMultiplicator; // This variable will be set/updated by the streaming service. private long multiplicatorPeirod; ContainerStatusDataAccess containerStatusDA; ContainersLogsDataAccess containersLogsDA; VariableDataAccess variableDA; Map<String, ContainerLog> activeContainers = new HashMap<>(); Map<String, ContainerLog> updateContainers = new HashMap<>(); LinkedBlockingQueue<ContainerStatus> eventContainers = new LinkedBlockingQueue<>(); LongVariable tickCounter = new LongVariable(Variable.Finder.QuotaTicksCounter, 0); // True when service is up to speed with existing statuses and // with events triggered while initializing boolean recovered = true; public ContainersLogsService(RMContext rMContext) { super(ContainersLogsService.class.getName()); this.rMContext = rMContext; } @Override public void serviceInit(Configuration conf) throws Exception { LOG.info("Initializing containers logs service"); this.conf = conf; // Initialize config parameters this.monitorInterval = this.conf.getLong( YarnConfiguration.QUOTA_CONTAINERS_LOGS_MONITOR_INTERVAL, YarnConfiguration.DEFAULT_QUOTA_CONTAINERS_LOGS_MONITOR_INTERVAL); this.checkpointEnabled = this.conf.getBoolean( YarnConfiguration.QUOTA_CONTAINERS_LOGS_CHECKPOINTS_ENABLED, YarnConfiguration.DEFAULT_QUOTA_CONTAINERS_LOGS_CHECKPOINTS_ENABLED); this.checkpointInterval = this.conf.getInt( YarnConfiguration.QUOTA_CONTAINERS_LOGS_CHECKPOINTS_MINTICKS, YarnConfiguration.DEFAULT_QUOTA_CONTAINERS_LOGS_CHECKPOINTS_MINTICKS) * this.conf.getInt(YarnConfiguration.QUOTA_MIN_TICKS_CHARGE, YarnConfiguration.DEFAULT_QUOTA_MIN_TICKS_CHARGE); this.alertThreshold = this.conf.getDouble( YarnConfiguration.QUOTA_CONTAINERS_LOGS_ALERT_THRESHOLD, YarnConfiguration.DEFAULT_QUOTA_CONTAINERS_LOGS_ALERT_THRESHOLD); // Calculate execution time warning threshold this.threshold = this.monitorInterval * alertThreshold; this.multiplicatorPeirod = this.conf.getLong( YarnConfiguration.QUOTA_FIXED_MULTIPLICATOR_PERIOD, YarnConfiguration.DEFAULT_QUOTA_FIXED_MULTIPLICATOR_PERIOD) * checkpointInterval; currentMultiplicator = 1; // Initialize DataAccesses containerStatusDA = (ContainerStatusDataAccess) RMStorageFactory. getDataAccess(ContainerStatusDataAccess.class); containersLogsDA = (ContainersLogsDataAccess) RMStorageFactory. getDataAccess(ContainersLogsDataAccess.class); variableDA = (VariableDataAccess) RMStorageFactory.getDataAccess( VariableDataAccess.class); // Creates separate thread for retrieving container statuses tickThread = new Thread(new TickThread()); tickThread.setName("ContainersLogs Tick Thread"); tickThread.setDaemon(true); super.serviceInit(conf); } @Override protected void serviceStart() throws Exception { LOG.info("Starting containers logs service"); recover(); tickThread.start(); super.serviceStart(); } @Override protected void serviceStop() throws Exception { LOG.info("Stopping containers logs service"); stopped = true; if (tickThread != null) { tickThread.interrupt(); } super.serviceStop(); } /** * Appends container statuses obtained from events into event queue * * @param changedContainerStatuses */ public void insertEvent(List<ContainerStatus> changedContainerStatuses) { LOG.debug("CL :: New event, size: " + changedContainerStatuses.size()); for (ContainerStatus cs : changedContainerStatuses) { try { eventContainers.put(cs); } catch (InterruptedException ex) { LOG.warn("Unable to insert container status: " + cs.toString() + " inside event queue", ex); } } } public synchronized void setCurrentPrice(float currentPrice) { LOG.debug("set new price: " + currentPrice); this.currentMultiplicator = currentPrice; } /** * Returns list of latest entries in eventContainers list and removes them * * @return */ private List<ContainerStatus> getLatestEvents() { List<ContainerStatus> oldEvents = new ArrayList<>(); while (!eventContainers.isEmpty()) { oldEvents.add(eventContainers.poll()); } return oldEvents; } /** * Retrieve tick counter, unfinished containers logs entries and container * statuses entries. Merge them and then retrieve events that have arrived * since launching service. Merge events and mark recovery as completed. */ public void recover() { LOG.info("Starting containers logs recovery"); try { tickCounter = getTickCounter(); activeContainers = getContainersLogs(); //recover current multiplicator Map<PriceMultiplicator.MultiplicatorType, PriceMultiplicator> currentPrices = getCurrentMultiplicator(); if (currentPrices.get(PriceMultiplicator.MultiplicatorType.VARIABLE) != null) { this.currentMultiplicator = currentPrices.get( PriceMultiplicator.MultiplicatorType.VARIABLE).getValue(); } //Finish to log all the containers for which we currently have logs //they will restart once they send a new heartbeat finishLogging(); updateContainersLogs(false); LOG.info("Finished containers logs recovery"); } catch (Exception ex) { LOG.warn("Unable to finish containers logs recovery", ex); } } private void finishLogging(){ for(ContainerLog log: activeContainers.values()){ log.setStop(tickCounter.getValue()); log.setExitstatus(ContainerExitStatus.ABORTED); updateContainers.put(log.getContainerId(), log); } activeContainers.clear(); } private synchronized void checkEventContainerStatuses( List<ContainerStatus> latestEvents ) { for (ContainerStatus cs : latestEvents) { ContainerLog cl; boolean updatable = false; if (cs.getState().equals(ContainerState.NEW)) { continue; } cl = activeContainers.get(cs.getContainerid()); if (cl == null) { RMContainer container = rMContext.getScheduler().getRMContainer( ConverterUtils.toContainerId(cs.getContainerid())); Resource containerResources = null; if(container==null){ containerResources = Resource.newInstance(0, 0); }else{ containerResources = container.getContainer().getResource(); } cl = new ContainerLog(cs.getContainerid(), tickCounter.getValue(), ContainerExitStatus.CONTAINER_RUNNING_STATE, currentMultiplicator, containerResources.getVirtualCores(), containerResources.getMemory()); // Unable to capture start use case if (cs.getState().equals(ContainerState.COMPLETE.toString())) { //TODO: this is overwriten by the next cl.setExitstatus, need to be verified cl.setExitstatus(ContainerExitStatus.UNKNOWN_CONTAINER_EXIT); } activeContainers.put(cl.getContainerid(), cl); updatable = true; } if (cs.getState().equals(ContainerState.COMPLETE.toString())) { cl.setStop(tickCounter.getValue()); cl.setExitstatus(cs.getExitstatus()); activeContainers.remove(cl.getContainerid()); updatable = true; } if (updatable) { updateContainers.put(cl.getContainerid(), cl); } } } /** * Updates containers logs table with container status information in update * list Also update tick counter in YARN variables table */ private void updateContainersLogs(final boolean updatetTick) { try { LightWeightRequestHandler containersLogsHandler = new LightWeightRequestHandler(YARNOperationType.TEST) { @Override public Object performTask() throws StorageException { connector.beginTransaction(); connector.writeLock(); // Update containers logs table if necessary if (updateContainers.size() > 0) { LOG.debug("CL :: Update containers logs size: " + updateContainers. size()); try { containersLogsDA.addAll(updateContainers.values()); } catch (StorageException ex) { LOG.warn("Unable to update containers logs table", ex); } } // Update tick counter if (updatetTick) { variableDA.setVariable(tickCounter); } connector.commit(); return null; } }; containersLogsHandler.handle(); QuotaService quotaService = rMContext.getQuotaService(); if (quotaService != null) { quotaService.insertEvents(updateContainers.values()); } updateContainers.clear(); } catch (IOException ex) { LOG.warn("Unable to update containers logs and tick counter", ex); } } /** * Retrieves unfinished containers logs entries Used when initializing * active list * * @return */ private Map<String, ContainerLog> getContainersLogs() { Map<String, ContainerLog> allContainersLogs = new HashMap<>(); try { // Retrieve unfinished containers logs entries LightWeightRequestHandler allContainersHandler = new LightWeightRequestHandler(YARNOperationType.TEST) { @Override public Object performTask() throws StorageException { connector.beginTransaction(); connector.readCommitted(); Map<String, ContainerLog> allContainersLogs = containersLogsDA.getAll(); connector.commit(); return allContainersLogs; } }; allContainersLogs = (Map<String, ContainerLog>) allContainersHandler. handle(); } catch (IOException ex) { LOG.warn("Unable to retrieve containers logs table data", ex); } return allContainersLogs; } /** * Retrieves containers logs tick counter from YARN variables * * @return */ private LongVariable getTickCounter() { LongVariable tc = new LongVariable(Variable.Finder.QuotaTicksCounter, 0); try { LongVariable found; LightWeightRequestHandler tickCounterHandler = new LightWeightRequestHandler(YARNOperationType.TEST) { @Override public Object performTask() throws StorageException { connector.beginTransaction(); connector.readCommitted(); Variable tickCounterVariable = (Variable) variableDA.getVariable( Variable.Finder.QuotaTicksCounter); connector.commit(); return tickCounterVariable; } }; found = (LongVariable) tickCounterHandler.handle(); if (found != null && found.getValue() != null) { tc = found; } } catch (IOException ex) { LOG.warn("Unable to retrieve tick counter from YARN variables", ex); } return tc; } private Map<PriceMultiplicator.MultiplicatorType, PriceMultiplicator> getCurrentMultiplicator() throws IOException { LightWeightRequestHandler currentPriceHandler = new LightWeightRequestHandler(YARNOperationType.TEST) { @Override public Object performTask() throws StorageException { connector.beginTransaction(); connector.readCommitted(); PriceMultiplicatorDataAccess da = (PriceMultiplicatorDataAccess) RMStorageFactory.getDataAccess( PriceMultiplicatorDataAccess.class); Map<PriceMultiplicator.MultiplicatorType, PriceMultiplicator> currentPrices = da.getAll(); connector.commit(); return currentPrices; } }; return (Map<PriceMultiplicator.MultiplicatorType, PriceMultiplicator>) currentPriceHandler. handle(); } //TODO optimisation /** * Loop active list and add all found & not completed container statuses to * update list. This ensures that whole running time is not lost. */ private synchronized void createCheckpoint() { long tick = tickCounter.getValue(); for (ContainerLog log : activeContainers.values()) { if ((tick - log.getStart()) % checkpointInterval == 0) { log.setStop(tickCounter.getValue()); if ((tick - log.getStart()) % multiplicatorPeirod == 0) { log.setPrice(currentMultiplicator); } updateContainers.put(log.getContainerid(), log); } } } /** * Retrieve latest events from the queue; * Update active and update lists with latest events * Perform checkpoint if necessary * Update containers logs table */ public void processTick() { List<ContainerStatus> latestEvents = getLatestEvents(); LOG.debug("CL :: Event count: " + latestEvents.size()); // Go through all events and update active and update lists checkEventContainerStatuses(latestEvents); // Checkpoint if (checkpointEnabled) { createCheckpoint(); } LOG.debug("CL :: Update list size: " + updateContainers.size()); LOG.debug("CL :: Active list size: " + activeContainers.size()); // Update Containers logs table and tick counter updateContainersLogs(true); } /** * Thread that retrieves container statuses, updates active and update * lists, and updates containers logs table and tick counter */ private class TickThread implements Runnable { @Override public void run() { try { while (!stopped && !Thread.currentThread().isInterrupted()) { long executionTime = 0; long startTime = System.currentTimeMillis(); if (recovered) { LOG.debug("CL :: Current tick: " + tickCounter.getValue()); // Process everything for single tick processTick(); // Increment tick counter tickCounter = new LongVariable(Variable.Finder.QuotaTicksCounter, tickCounter.getValue() + 1); } else { LOG.debug("CL :: Not yet recovered"); } //Check alert threshold executionTime = System.currentTimeMillis() - startTime; if (threshold < executionTime) { LOG.warn("Monitor interval threshold exceeded!" + " Execution time: " + Long.toString(executionTime) + "ms." + " Threshold: " + Double.toString(threshold) + "ms." + " Consider increasing monitor interval!"); //To avoid negative values executionTime = (executionTime > monitorInterval) ? monitorInterval : executionTime; } Thread.sleep(Math.max(0,monitorInterval - executionTime)); } } catch (InterruptedException ex) { LOG.error(ex, ex); } } } public long getCurrentTick() { return tickCounter.getValue(); } }