/* * Copyright © 2015-2016 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.internal.app.services; import co.cask.cdap.api.ProgramSpecification; import co.cask.cdap.api.app.ApplicationSpecification; import co.cask.cdap.api.flow.FlowSpecification; import co.cask.cdap.app.program.Program; import co.cask.cdap.app.program.Programs; import co.cask.cdap.app.runtime.ProgramController; import co.cask.cdap.app.runtime.ProgramRuntimeService; import co.cask.cdap.app.runtime.ProgramRuntimeService.RuntimeInfo; import co.cask.cdap.app.store.Store; import co.cask.cdap.common.ApplicationNotFoundException; import co.cask.cdap.common.BadRequestException; import co.cask.cdap.common.ConflictException; import co.cask.cdap.common.NotFoundException; import co.cask.cdap.common.ProgramNotFoundException; import co.cask.cdap.common.app.RunIds; import co.cask.cdap.common.conf.CConfiguration; import co.cask.cdap.common.conf.Constants; import co.cask.cdap.common.io.CaseInsensitiveEnumTypeAdapterFactory; import co.cask.cdap.common.namespace.NamespacedLocationFactory; import co.cask.cdap.config.PreferencesStore; import co.cask.cdap.internal.app.ApplicationSpecificationAdapter; import co.cask.cdap.internal.app.runtime.AbstractListener; import co.cask.cdap.internal.app.runtime.BasicArguments; import co.cask.cdap.internal.app.runtime.ProgramOptionConstants; import co.cask.cdap.internal.app.runtime.SimpleProgramOptions; import co.cask.cdap.internal.app.store.RunRecordMeta; import co.cask.cdap.proto.NamespaceMeta; import co.cask.cdap.proto.ProgramRunStatus; import co.cask.cdap.proto.ProgramStatus; import co.cask.cdap.proto.ProgramType; import co.cask.cdap.proto.RunRecord; import co.cask.cdap.proto.id.Ids; import co.cask.cdap.proto.id.NamespaceId; import co.cask.cdap.proto.id.ProgramId; import co.cask.cdap.proto.id.ProgramRunId; import co.cask.cdap.proto.security.Action; import co.cask.cdap.security.authorization.AuthorizerInstantiatorService; import co.cask.cdap.security.spi.authentication.SecurityRequestContext; import co.cask.cdap.security.spi.authorization.UnauthorizedException; import co.cask.cdap.store.NamespaceStore; import com.google.common.base.Predicate; import com.google.common.base.Throwables; import com.google.common.collect.Collections2; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Sets; import com.google.common.util.concurrent.AbstractIdleService; import com.google.common.util.concurrent.ListenableFuture; import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.google.inject.Inject; import org.apache.twill.api.RunId; import org.apache.twill.common.Threads; import org.apache.twill.filesystem.Location; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.Collection; import java.util.EnumSet; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import javax.annotation.Nullable; /** * Service that manages lifecycle of Programs. */ public class ProgramLifecycleService extends AbstractIdleService { private static final Logger LOG = LoggerFactory.getLogger(ProgramLifecycleService.class); private static final Gson GSON = ApplicationSpecificationAdapter .addTypeAdapters(new GsonBuilder()) .registerTypeAdapterFactory(new CaseInsensitiveEnumTypeAdapterFactory()) .create(); private final ScheduledExecutorService scheduledExecutorService; private final Store store; private final ProgramRuntimeService runtimeService; private final CConfiguration cConf; private final NamespaceStore nsStore; private final PropertiesResolver propertiesResolver; private final NamespacedLocationFactory namespacedLocationFactory; private final String appFabricDir; private final PreferencesStore preferencesStore; private final AuthorizerInstantiatorService authorizerInstantiatorService; @Inject ProgramLifecycleService(Store store, NamespaceStore nsStore, ProgramRuntimeService runtimeService, CConfiguration cConf, PropertiesResolver propertiesResolver, NamespacedLocationFactory namespacedLocationFactory, PreferencesStore preferencesStore, AuthorizerInstantiatorService authorizerInstantiatorService) { this.store = store; this.nsStore = nsStore; this.runtimeService = runtimeService; this.propertiesResolver = propertiesResolver; this.namespacedLocationFactory = namespacedLocationFactory; this.appFabricDir = cConf.get(Constants.AppFabric.OUTPUT_DIR); this.scheduledExecutorService = Executors.newScheduledThreadPool(1); this.cConf = cConf; this.preferencesStore = preferencesStore; this.authorizerInstantiatorService = authorizerInstantiatorService; } @Override protected void startUp() throws Exception { LOG.info("Starting ProgramLifecycleService"); long interval = cConf.getLong(Constants.AppFabric.PROGRAM_RUNID_CORRECTOR_INTERVAL_SECONDS); if (interval <= 0) { LOG.debug("Invalid run id corrector interval {}. Setting it to 180 seconds.", interval); interval = 180L; } scheduledExecutorService.scheduleWithFixedDelay(new RunRecordsCorrectorRunnable(this), 2L, interval, TimeUnit.SECONDS); } @Override protected void shutDown() throws Exception { LOG.info("Shutting down ProgramLifecycleService"); scheduledExecutorService.shutdown(); try { if (!scheduledExecutorService.awaitTermination(5, TimeUnit.SECONDS)) { scheduledExecutorService.shutdownNow(); } } catch (InterruptedException ie) { Thread.currentThread().interrupt(); } } /** * Returns the program status. * @param programId the id of the program for which the status call is made * @return the status of the program * @throws NotFoundException if the application to which this program belongs was not found */ public ProgramStatus getProgramStatus(ProgramId programId) throws NotFoundException { // check that app exists ApplicationSpecification appSpec = store.getApplication(programId.toId().getApplication()); if (appSpec == null) { throw new NotFoundException(Ids.namespace(programId.getNamespace()).app(programId.getApplication()).toId()); } ProgramRuntimeService.RuntimeInfo runtimeInfo = findRuntimeInfo(programId); if (runtimeInfo == null) { if (programId.getType() != ProgramType.WEBAPP) { //Runtime info not found. Check to see if the program exists. ProgramSpecification spec = getProgramSpecification(programId); if (spec == null) { // program doesn't exist throw new NotFoundException(programId); } if ((programId.getType() == ProgramType.MAPREDUCE || programId.getType() == ProgramType.SPARK) && !store.getRuns(programId.toId(), ProgramRunStatus.RUNNING, 0, Long.MAX_VALUE, 1).isEmpty()) { // MapReduce program exists and running as a part of Workflow return ProgramStatus.RUNNING; } return ProgramStatus.STOPPED; } // TODO: Fetching webapp status is a hack. This will be fixed when webapp spec is added. try { Location webappLoc = Programs.programLocation(namespacedLocationFactory, appFabricDir, programId.toId()); if (webappLoc != null && webappLoc.exists()) { // webapp exists and not running. so return stopped. return ProgramStatus.STOPPED; } // the webappLoc does not exists throw new NotFoundException(programId); } catch (IOException ioe) { throw new NotFoundException(programId.toId(), ioe); } } return runtimeInfo.getController().getState().getProgramStatus(); } /** * Returns the {@link ProgramSpecification} for the specified {@link ProgramId program}. * * @param programId the {@link ProgramId program} for which the {@link ProgramSpecification} is requested * @return the {@link ProgramSpecification} for the specified {@link ProgramId program} */ @Nullable public ProgramSpecification getProgramSpecification(ProgramId programId) { ApplicationSpecification appSpec; appSpec = store.getApplication(Ids.namespace(programId.getNamespace()).app(programId.getApplication()).toId()); if (appSpec == null) { return null; } String programName = programId.getProgram(); ProgramType type = programId.getType(); ProgramSpecification programSpec; if (type == ProgramType.FLOW && appSpec.getFlows().containsKey(programName)) { programSpec = appSpec.getFlows().get(programName); } else if (type == ProgramType.MAPREDUCE && appSpec.getMapReduce().containsKey(programName)) { programSpec = appSpec.getMapReduce().get(programName); } else if (type == ProgramType.SPARK && appSpec.getSpark().containsKey(programName)) { programSpec = appSpec.getSpark().get(programName); } else if (type == ProgramType.WORKFLOW && appSpec.getWorkflows().containsKey(programName)) { programSpec = appSpec.getWorkflows().get(programName); } else if (type == ProgramType.SERVICE && appSpec.getServices().containsKey(programName)) { programSpec = appSpec.getServices().get(programName); } else if (type == ProgramType.WORKER && appSpec.getWorkers().containsKey(programName)) { programSpec = appSpec.getWorkers().get(programName); } else { programSpec = null; } return programSpec; } /** * Starts a Program with the specified argument overrides. * * @param programId the {@link ProgramId} to start/stop * @param overrides the arguments to override in the program's configured user arguments before starting * @param debug {@code true} if the program is to be started in debug mode, {@code false} otherwise * @throws ConflictException if the specified program is already running, and if concurrent runs are not allowed * @throws NotFoundException if the specified program or the app it belongs to is not found in the specified namespace * @throws IOException if there is an error starting the program * @throws UnauthorizedException if the logged in user is not authorized to start the program. To start a program, * a user requires {@link Action#EXECUTE} on the program * @throws Exception if there were other exceptions checking if the current user is authorized to start the program */ public synchronized void start(ProgramId programId, Map<String, String> overrides, boolean debug) throws Exception { Map<String, String> sysArgs = propertiesResolver.getSystemProperties(programId.toId()); Map<String, String> userArgs = propertiesResolver.getUserProperties(programId.toId()); if (overrides != null) { userArgs.putAll(overrides); } if (isRunning(programId) && !isConcurrentRunsAllowed(programId.getType())) { throw new ConflictException(String.format("Program %s is already running", programId)); } ProgramRuntimeService.RuntimeInfo runtimeInfo = start(programId, sysArgs, userArgs, debug); if (runtimeInfo == null) { throw new IOException(String.format("Failed to start program %s", programId)); } } /** * Start a Program. * * @param programId the {@link ProgramId program} to start * @param systemArgs system arguments * @param userArgs user arguments * @param debug enable debug mode * @return {@link ProgramRuntimeService.RuntimeInfo} * @throws IOException if there is an error starting the program * @throws ProgramNotFoundException if program is not found * @throws UnauthorizedException if the logged in user is not authorized to start the program. To start a program, * a user requires {@link Action#EXECUTE} on the program * @throws Exception if there were other exceptions checking if the current user is authorized to start the program */ public ProgramRuntimeService.RuntimeInfo start(final ProgramId programId, final Map<String, String> systemArgs, final Map<String, String> userArgs, boolean debug) throws Exception { authorizerInstantiatorService.get().enforce(programId, SecurityRequestContext.toPrincipal(), Action.EXECUTE); Program program = store.loadProgram(programId.toId()); BasicArguments systemArguments = new BasicArguments(systemArgs); BasicArguments userArguments = new BasicArguments(userArgs); ProgramRuntimeService.RuntimeInfo runtimeInfo = runtimeService.run(program, new SimpleProgramOptions( programId.getProgram(), systemArguments, userArguments, debug)); final ProgramController controller = runtimeInfo.getController(); final String runId = controller.getRunId().getId(); final String twillRunId = runtimeInfo.getTwillRunId() == null ? null : runtimeInfo.getTwillRunId().getId(); if (programId.getType() != ProgramType.MAPREDUCE && programId.getType() != ProgramType.SPARK) { // MapReduce state recording is done by the MapReduceProgramRunner // TODO [JIRA: CDAP-2013] Same needs to be done for other programs as well controller.addListener(new AbstractListener() { @Override public void init(ProgramController.State state, @Nullable Throwable cause) { // Get start time from RunId long startTimeInSeconds = RunIds.getTime(controller.getRunId(), TimeUnit.SECONDS); if (startTimeInSeconds == -1) { // If RunId is not time-based, use current time as start time startTimeInSeconds = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()); } store.setStart(programId.toId(), runId, startTimeInSeconds, twillRunId, userArgs, systemArgs); if (state == ProgramController.State.COMPLETED) { completed(); } if (state == ProgramController.State.ERROR) { error(controller.getFailureCause()); } } @Override public void completed() { LOG.debug("Program {} completed successfully.", programId); store.setStop(programId.toId(), runId, TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()), ProgramController.State.COMPLETED.getRunStatus()); } @Override public void killed() { LOG.debug("Program {} killed.", programId.getNamespaceId()); store.setStop(programId.toId(), runId, TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()), ProgramController.State.KILLED.getRunStatus()); } @Override public void suspended() { LOG.debug("Suspending Program {} {}.", programId, runId); store.setSuspend(programId.toId(), runId); } @Override public void resuming() { LOG.debug("Resuming Program {} {}.", programId, runId); store.setResume(programId.toId(), runId); } @Override public void error(Throwable cause) { LOG.info("Program stopped with error {}, {}", programId, runId, cause); store.setStop(programId.toId(), runId, TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()), ProgramController.State.ERROR.getRunStatus(), cause); } }, Threads.SAME_THREAD_EXECUTOR); } return runtimeInfo; } /** * Stops the specified program. The first run of the program as found by {@link ProgramRuntimeService} is stopped. * * @param programId the {@link ProgramId program} to stop * @throws NotFoundException if the app, program or run was not found * @throws BadRequestException if an attempt is made to stop a program that is either not running or * was started by a workflow * @throws InterruptedException if there was a problem while waiting for the stop call to complete * @throws ExecutionException if there was a problem while waiting for the stop call to complete */ public synchronized void stop(ProgramId programId) throws Exception { stop(programId, null); } /** * Stops the specified run of the specified program. * * @param programId the {@link ProgramId program} to stop * @param runId the runId of the program run to stop. If null, the first run of the program as returned by * {@link ProgramRuntimeService} is stopped. * @throws NotFoundException if the app, program or run was not found * @throws BadRequestException if an attempt is made to stop a program that is either not running or * was started by a workflow * @throws InterruptedException if there was a problem while waiting for the stop call to complete * @throws ExecutionException if there was a problem while waiting for the stop call to complete */ public void stop(ProgramId programId, @Nullable String runId) throws Exception { issueStop(programId, runId).get(); } /** * Issues a command to stop the specified {@link RunId} of the specified {@link ProgramId} and returns a * {@link ListenableFuture} with the {@link ProgramController} for it. * Clients can wait for completion of the {@link ListenableFuture}. * * @param programId the {@link ProgramId program} to issue a stop for * @param runId the runId of the program run to stop. If null, the first run of the program as returned by * {@link ProgramRuntimeService} is stopped. * @return a {@link ListenableFuture} with a {@link ProgramController} that clients can wait on for stop to complete. * @throws NotFoundException if the app, program or run was not found * @throws BadRequestException if an attempt is made to stop a program that is either not running or * was started by a workflow * @throws UnauthorizedException if the user issuing the command is not authorized to stop the program. To stop a * program, a user requires {@link Action#EXECUTE} permission on the program. */ public ListenableFuture<ProgramController> issueStop(ProgramId programId, @Nullable String runId) throws Exception { authorizerInstantiatorService.get().enforce(programId, SecurityRequestContext.toPrincipal(), Action.EXECUTE); ProgramRuntimeService.RuntimeInfo runtimeInfo = findRuntimeInfo(programId, runId); if (runtimeInfo == null) { if (!store.applicationExists(programId.toId().getApplication())) { throw new ApplicationNotFoundException(programId.toId().getApplication()); } else if (!store.programExists(programId.toId())) { throw new ProgramNotFoundException(programId.toId()); } else if (runId != null) { ProgramRunId programRunId = programId.run(runId); // Check if the program is running and is started by the Workflow RunRecordMeta runRecord = store.getRun(programId.toId(), runId); if (runRecord != null && runRecord.getProperties().containsKey("workflowrunid") && runRecord.getStatus().equals(ProgramRunStatus.RUNNING)) { String workflowRunId = runRecord.getProperties().get("workflowrunid"); throw new BadRequestException(String.format("Cannot stop the program '%s' started by the Workflow " + "run '%s'. Please stop the Workflow.", programRunId, workflowRunId)); } throw new NotFoundException(programRunId); } throw new BadRequestException(String.format("Program '%s' is not running.", programId)); } return runtimeInfo.getController().stop(); } /** * Save runtime arguments for all future runs of this program. The runtime arguments are saved in the * {@link PreferencesStore}. * * @param programId the {@link ProgramId program} for which runtime arguments are to be saved * @param runtimeArgs the runtime arguments to save * @throws NotFoundException if the specified program was not found * @throws UnauthorizedException if the current user does not have sufficient privileges to save runtime arguments for * the specified program. To save runtime arguments for a program, a user requires * {@link Action#ADMIN} privileges on the program. */ public void saveRuntimeArgs(ProgramId programId, Map<String, String> runtimeArgs) throws Exception { authorizerInstantiatorService.get().enforce(programId, SecurityRequestContext.toPrincipal(), Action.ADMIN); if (!store.programExists(programId.toId())) { throw new NotFoundException(programId.toId()); } preferencesStore.setProperties(programId.getNamespace(), programId.getApplication(), programId.getType().getCategoryName(), programId.getProgram(), runtimeArgs); } private boolean isRunning(ProgramId programId) throws BadRequestException, NotFoundException { return ProgramStatus.STOPPED != getProgramStatus(programId); } private boolean isConcurrentRunsAllowed(ProgramType type) { // Concurrent runs are only allowed for the Workflow and MapReduce return EnumSet.of(ProgramType.WORKFLOW, ProgramType.MAPREDUCE).contains(type); } @Nullable private ProgramRuntimeService.RuntimeInfo findRuntimeInfo(ProgramId programId, @Nullable String runId) throws BadRequestException { Map<RunId, ProgramRuntimeService.RuntimeInfo> runtimeInfos = runtimeService.list(programId.getType()); if (runId != null) { RunId run; try { run = RunIds.fromString(runId); } catch (IllegalArgumentException e) { throw new BadRequestException("Error parsing run-id.", e); } return runtimeInfos.get(run); } return findRuntimeInfo(programId); } @Nullable private ProgramRuntimeService.RuntimeInfo findRuntimeInfo(ProgramId programId) { Map<RunId, ProgramRuntimeService.RuntimeInfo> runtimeInfos = runtimeService.list(programId.getType()); for (ProgramRuntimeService.RuntimeInfo info : runtimeInfos.values()) { if (programId.equals(info.getProgramId().toEntityId())) { return info; } } return null; } /** * @see #setInstances(ProgramId, int, String) */ public void setInstances(ProgramId programId, int instances) throws Exception { setInstances(programId, instances, null); } /** * Set instances for the given program. Only supported program types for this action are {@link ProgramType#FLOW}, * {@link ProgramType#SERVICE} and {@link ProgramType#WORKER}. * * @param programId the {@link ProgramId} of the program for which instances are to be updated * @param instances the number of instances to be updated. * @param component the flowlet name. Only used when the program is a {@link ProgramType#FLOW flow}. * @throws InterruptedException if there is an error while asynchronously updating instances * @throws ExecutionException if there is an error while asynchronously updating instances * @throws BadRequestException if the number of instances specified is less than 0 * @throws UnauthorizedException if the user does not have privileges to set instances for the specified program. * To set instances for a program, a user needs {@link Action#ADMIN} on the program. */ public void setInstances(ProgramId programId, int instances, @Nullable String component) throws Exception { authorizerInstantiatorService.get().enforce(programId, SecurityRequestContext.toPrincipal(), Action.ADMIN); if (instances < 1) { throw new BadRequestException(String.format("Instance count should be greater than 0. Got %s.", instances)); } switch (programId.getType()) { case SERVICE: setServiceInstances(programId, instances); break; case WORKER: setWorkerInstances(programId, instances); break; case FLOW: setFlowletInstances(programId, component, instances); break; default: throw new BadRequestException(String.format("Setting instances for program type %s is not supported", programId.getType().getPrettyName())); } } private void setWorkerInstances(ProgramId programId, int instances) throws ExecutionException, InterruptedException { int oldInstances = store.getWorkerInstances(programId.toId()); if (oldInstances != instances) { store.setWorkerInstances(programId.toId(), instances); ProgramRuntimeService.RuntimeInfo runtimeInfo = findRuntimeInfo(programId); if (runtimeInfo != null) { runtimeInfo.getController().command(ProgramOptionConstants.INSTANCES, ImmutableMap.of("runnable", programId.getProgram(), "newInstances", String.valueOf(instances), "oldInstances", String.valueOf(oldInstances))).get(); } } } private void setFlowletInstances(ProgramId programId, String flowletId, int instances) throws ExecutionException, InterruptedException { int oldInstances = store.getFlowletInstances(programId.toId(), flowletId); if (oldInstances != instances) { FlowSpecification flowSpec = store.setFlowletInstances(programId.toId(), flowletId, instances); ProgramRuntimeService.RuntimeInfo runtimeInfo = findRuntimeInfo(programId); if (runtimeInfo != null) { runtimeInfo.getController() .command(ProgramOptionConstants.INSTANCES, ImmutableMap.of("flowlet", flowletId, "newInstances", String.valueOf(instances), "oldFlowSpec", GSON.toJson(flowSpec, FlowSpecification.class))).get(); } } } private void setServiceInstances(ProgramId programId, int instances) throws ExecutionException, InterruptedException { int oldInstances = store.getServiceInstances(programId.toId()); if (oldInstances != instances) { store.setServiceInstances(programId.toId(), instances); ProgramRuntimeService.RuntimeInfo runtimeInfo = findRuntimeInfo(programId); if (runtimeInfo != null) { runtimeInfo.getController().command(ProgramOptionConstants.INSTANCES, ImmutableMap.of("runnable", programId.getProgram(), "newInstances", String.valueOf(instances), "oldInstances", String.valueOf(oldInstances))).get(); } } } /** * Fix all the possible inconsistent states for RunRecords that shows it is in RUNNING state but actually not * via check to {@link ProgramRuntimeService}. */ private void validateAndCorrectRunningRunRecords() { Set<String> processedInvalidRunRecordIds = Sets.newHashSet(); // Lets update the running programs run records for (ProgramType programType : ProgramType.values()) { validateAndCorrectRunningRunRecords(programType, processedInvalidRunRecordIds); } if (!processedInvalidRunRecordIds.isEmpty()) { LOG.info("Corrected {} of run records with RUNNING status but no actual program running.", processedInvalidRunRecordIds.size()); } } /** * Fix all the possible inconsistent states for RunRecords that shows it is in RUNNING state but actually not * via check to {@link ProgramRuntimeService} for a type of CDAP program. * * @param programType The type of program the run records need to validate and update. * @param processedInvalidRunRecordIds the {@link Set} of processed invalid run record ids. */ void validateAndCorrectRunningRunRecords(final ProgramType programType, Set<String> processedInvalidRunRecordIds) { final Map<RunId, RuntimeInfo> runIdToRuntimeInfo = runtimeService.list(programType); LOG.trace("Start getting run records not actually running ..."); List<RunRecordMeta> notActuallyRunning = store.getRuns(ProgramRunStatus.RUNNING, new Predicate<RunRecordMeta>() { @Override public boolean apply(RunRecordMeta input) { String runId = input.getPid(); // Check if it is not actually running. return !runIdToRuntimeInfo.containsKey(RunIds.fromString(runId)); } }); LOG.trace("End getting {} run records not actually running.", notActuallyRunning.size()); final Map<String, ProgramId> runIdToProgramId = new HashMap<>(); LOG.trace("Start getting invalid run records ..."); Collection<RunRecordMeta> invalidRunRecords = Collections2.filter(notActuallyRunning, new Predicate<RunRecordMeta>() { @Override public boolean apply(RunRecordMeta input) { String runId = input.getPid(); // check for program Id for the run record, if null then it is invalid program type. ProgramId targetProgramId = retrieveProgramIdForRunRecord(programType, runId); // Check if run id is for the right program type if (targetProgramId != null) { runIdToProgramId.put(runId, targetProgramId); return true; } else { return false; } } }); LOG.trace("End getting invalid run records."); if (!invalidRunRecords.isEmpty()) { LOG.warn("Found {} RunRecords with RUNNING status but the program is not actually running for program type {}", invalidRunRecords.size(), programType.getPrettyName()); } else { LOG.trace("No RunRecords found with RUNNING status but the program is not actually running for program type {}", programType.getPrettyName()); } // Now lets correct the invalid RunRecords for (RunRecordMeta invalidRunRecordMeta : invalidRunRecords) { boolean shouldCorrect = shouldCorrectForWorkflowChildren(invalidRunRecordMeta, processedInvalidRunRecordIds); if (!shouldCorrect) { LOG.trace("Will not correct invalid run record {} since it's parent workflow still running.", invalidRunRecordMeta); continue; } String runId = invalidRunRecordMeta.getPid(); ProgramId targetProgramId = runIdToProgramId.get(runId); LOG.warn("Fixing RunRecord {} in program {} of type {} with RUNNING status but the program is not running", runId, targetProgramId, programType.getPrettyName()); store.compareAndSetStatus(targetProgramId.toId(), runId, ProgramController.State.ALIVE.getRunStatus(), ProgramController.State.ERROR.getRunStatus()); processedInvalidRunRecordIds.add(runId); } } /** * Helper method to check if the run record is a child program of a Workflow * * @param runRecordMeta The target {@link RunRecordMeta} to check * @param processedInvalidRunRecordIds the {@link Set} of processed invalid run record ids. * @return {@code true} of we should check and {@code false} otherwise */ private boolean shouldCorrectForWorkflowChildren(RunRecordMeta runRecordMeta, Set<String> processedInvalidRunRecordIds) { // check if it is part of workflow because it may not have actual runtime info if (runRecordMeta.getProperties() != null && runRecordMeta.getProperties().get("workflowrunid") != null) { // Get the parent Workflow info String workflowRunId = runRecordMeta.getProperties().get("workflowrunid"); if (!processedInvalidRunRecordIds.contains(workflowRunId)) { // If the parent workflow has not been processed, then check if it still valid ProgramId workflowProgramId = retrieveProgramIdForRunRecord(ProgramType.WORKFLOW, workflowRunId); if (workflowProgramId != null) { // lets see if the parent workflow run records state is still running RunRecordMeta wfRunRecord = store.getRun(workflowProgramId.toId(), workflowRunId); RuntimeInfo wfRuntimeInfo = runtimeService.lookup(workflowProgramId.toId(), RunIds.fromString(workflowRunId)); // Check of the parent workflow run record exists and it is running and runtime info said it is still there // then do not update it if (wfRunRecord != null && wfRunRecord.getStatus() == ProgramRunStatus.RUNNING && wfRuntimeInfo != null) { return false; } } } } return true; } /** * Helper method to get {@link ProgramId} for a RunRecord for type of program * * @param programType Type of program to search * @param runId The target id of the {@link RunRecord} to find * @return the program id of the run record or {@code null} if does not exist. */ @Nullable private ProgramId retrieveProgramIdForRunRecord(ProgramType programType, String runId) { // Get list of namespaces (borrow logic from AbstractAppFabricHttpHandler#listPrograms) List<NamespaceMeta> namespaceMetas = nsStore.list(); // For each, get all programs under it ProgramId targetProgramId = null; for (NamespaceMeta nm : namespaceMetas) { NamespaceId namespace = Ids.namespace(nm.getName()); Collection<ApplicationSpecification> appSpecs = store.getAllApplications(namespace.toId()); // For each application get the programs checked against run records for (ApplicationSpecification appSpec : appSpecs) { switch (programType) { case FLOW: for (String programName : appSpec.getFlows().keySet()) { ProgramId programId = validateProgramForRunRecord(nm.getName(), appSpec.getName(), programType, programName, runId); if (programId != null) { targetProgramId = programId; break; } } break; case MAPREDUCE: for (String programName : appSpec.getMapReduce().keySet()) { ProgramId programId = validateProgramForRunRecord(nm.getName(), appSpec.getName(), programType, programName, runId); if (programId != null) { targetProgramId = programId; break; } } break; case SPARK: for (String programName : appSpec.getSpark().keySet()) { ProgramId programId = validateProgramForRunRecord(nm.getName(), appSpec.getName(), programType, programName, runId); if (programId != null) { targetProgramId = programId; break; } } break; case SERVICE: for (String programName : appSpec.getServices().keySet()) { ProgramId programId = validateProgramForRunRecord(nm.getName(), appSpec.getName(), programType, programName, runId); if (programId != null) { targetProgramId = programId; break; } } break; case WORKER: for (String programName : appSpec.getWorkers().keySet()) { ProgramId programId = validateProgramForRunRecord(nm.getName(), appSpec.getName(), programType, programName, runId); if (programId != null) { targetProgramId = programId; break; } } break; case WORKFLOW: for (String programName : appSpec.getWorkflows().keySet()) { ProgramId programId = validateProgramForRunRecord(nm.getName(), appSpec.getName(), programType, programName, runId); if (programId != null) { targetProgramId = programId; break; } } break; default: LOG.debug("Unknown program type: " + programType.name()); break; } if (targetProgramId != null) { break; } } if (targetProgramId != null) { break; } } return targetProgramId; } /** * Helper method to get program id for a run record if it exists in the store. * * @return instance of {@link ProgramId} if exist for the runId or null if does not */ @Nullable private ProgramId validateProgramForRunRecord(String namespaceName, String appName, ProgramType programType, String programName, String runId) { ProgramId programId = Ids.namespace(namespaceName).app(appName).program(programType, programName); RunRecordMeta runRecord = store.getRun(programId.toId(), runId); if (runRecord == null) { return null; } return programId; } /** * Helper class to run in separate thread to validate the invalid running run records */ private static class RunRecordsCorrectorRunnable implements Runnable { private static final Logger LOG = LoggerFactory.getLogger(RunRecordsCorrectorRunnable.class); private final ProgramLifecycleService programLifecycleService; public RunRecordsCorrectorRunnable(ProgramLifecycleService programLifecycleService) { this.programLifecycleService = programLifecycleService; } @Override public void run() { try { RunRecordsCorrectorRunnable.LOG.debug("Start correcting invalid run records ..."); // Lets update the running programs run records programLifecycleService.validateAndCorrectRunningRunRecords(); RunRecordsCorrectorRunnable.LOG.debug("End correcting invalid run records."); } catch (Throwable t) { // Ignore any exception thrown since this behaves like daemon thread. //noinspection ThrowableResultOfMethodCallIgnored LOG.warn("Unable to complete correcting run records: {}", Throwables.getRootCause(t).getMessage()); LOG.debug("Exception thrown when running run id cleaner.", t); } } } }