/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.falcon.execution;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import java.util.Collection;
import java.util.concurrent.ExecutionException;
import org.apache.falcon.FalconException;
import org.apache.falcon.entity.EntityUtil;
import org.apache.falcon.entity.ProcessHelper;
import org.apache.falcon.entity.v0.Entity;
import org.apache.falcon.entity.v0.SchemaHelper;
import org.apache.falcon.entity.v0.process.Cluster;
import org.apache.falcon.entity.v0.process.Process;
import org.apache.falcon.exception.InvalidStateTransitionException;
import org.apache.falcon.exception.StateStoreException;
import org.apache.falcon.notification.service.NotificationServicesRegistry;
import org.apache.falcon.notification.service.event.DataEvent;
import org.apache.falcon.notification.service.event.Event;
import org.apache.falcon.notification.service.event.EventType;
import org.apache.falcon.notification.service.event.JobCompletedEvent;
import org.apache.falcon.notification.service.event.RerunEvent;
import org.apache.falcon.notification.service.event.JobScheduledEvent;
import org.apache.falcon.notification.service.event.TimeElapsedEvent;
import org.apache.falcon.notification.service.impl.AlarmService;
import org.apache.falcon.notification.service.impl.JobCompletionService;
import org.apache.falcon.notification.service.impl.SchedulerService;
import org.apache.falcon.predicate.Predicate;
import org.apache.falcon.state.EntityClusterID;
import org.apache.falcon.state.EntityState;
import org.apache.falcon.state.InstanceID;
import org.apache.falcon.state.InstanceState;
import org.apache.falcon.state.StateService;
import org.apache.falcon.util.StartupProperties;
import org.apache.falcon.workflow.engine.DAGEngineFactory;
import org.apache.falcon.workflow.engine.FalconWorkflowEngine;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Date;
import java.util.Properties;

/**
 * This class is responsible for managing execution instances of a process.
 * It caches the active process instances in memory and handles notification events.
 * It intercepts all the notification events intended for its instances and passes them along to the instance after
 * acting on it, where applicable.
 */
public class ProcessExecutor extends EntityExecutor {
    private static final Logger LOG = LoggerFactory.getLogger(ProcessExecutor.class);

    // Cache of execution instances, keyed by instance id. Created lazily, see ensureInstancesInitialized().
    protected LoadingCache<InstanceID, ProcessExecutionInstance> instances;
    // Predicate used to decide whether an incoming event should trigger a new instance.
    // Assigned in registerForNotifications().
    private Predicate triggerPredicate;
    private Process process;
    private final StateService stateService = StateService.get();
    private final FalconExecutionService executionService = FalconExecutionService.get();

    /**
     * Constructor per entity, per cluster.
     *
     * @param proc the process entity whose instances this executor manages
     * @param clusterName name of the cluster this executor is bound to
     * @throws FalconException declared for callers; not thrown by the statements visible in this constructor
     */
    public ProcessExecutor(Process proc, String clusterName) throws FalconException {
        process = proc;
        cluster = clusterName;
        id = new EntityClusterID(proc, clusterName);
    }

    /**
     * Schedules the process on the cluster.
     * If the entity is not yet in SCHEDULED state in the state store, the process is submitted to the DAG engine
     * for the cluster. Otherwise the active instances are re-loaded from the state store.
     * In both cases the executor then registers for the notifications that trigger new instances.
     *
     * @throws FalconException if the state store, the DAG engine or the registration fails
     */
    @Override
    public void schedule() throws FalconException {
        // Lazy instantiation
        ensureInstancesInitialized();
        // Check to handle restart and restoration from state store.
        EntityState entityState = STATE_STORE.getEntity(id.getEntityID());
        if (entityState.getCurrentState() != EntityState.STATE.SCHEDULED) {
            dryRun(entityState.getProperties());
        } else {
            LOG.info("Process, {} was already scheduled on cluster, {}.", process.getName(), cluster);
            LOG.info("Loading instances for process {} from state store.", process.getName());
            reloadInstances();
        }
        registerForNotifications(getLastInstanceTime());
    }

    // Submits the process, with the given properties, to the DAG engine of this executor's cluster.
    private void dryRun(Properties properties) throws FalconException {
        DAGEngineFactory.getDAGEngine(cluster).submit(process, properties);
    }

    // Creates the instance cache if it does not exist yet.
    // Every public operation that touches the cache (directly or through the on* callbacks) calls this first,
    // so that an executor that was never scheduled or resumed does not fail with a NullPointerException.
    private void ensureInstancesInitialized() throws FalconException {
        if (instances == null) {
            initInstances();
        }
    }

    // Initializes the cache of execution instances. Cache is backed by the state store.
    private void initInstances() throws FalconException {
        int cacheSize = Integer.parseInt(StartupProperties.get().getProperty("scheduler.instance.cache.size",
                DEFAULT_CACHE_SIZE));

        instances = CacheBuilder.newBuilder()
                .maximumSize(cacheSize)
                .build(new CacheLoader<InstanceID, ProcessExecutionInstance>() {
                    @Override
                    public ProcessExecutionInstance load(InstanceID instanceId) throws Exception {
                        // On a cache miss, read the instance from the state store.
                        return (ProcessExecutionInstance) STATE_STORE.getExecutionInstance(instanceId).getInstance();
                    }
                });
    }

    // Re-load any active instances from state
    private void reloadInstances() throws FalconException {
        for (InstanceState instanceState : STATE_STORE.getExecutionInstances(process, cluster,
                InstanceState.getActiveStates())) {
            ExecutionInstance instance = instanceState.getInstance();
            LOG.debug("Loading instance {} from state.", instance.getId());
            switch (instanceState.getCurrentState()) {
            case RUNNING:
                onSchedule(instance);
                break;
            case READY:
                onConditionsMet(instance);
                break;
            case WAITING:
                instance.resume();
                break;
            default: // skip
            }
            instances.put(instance.getId(), (ProcessExecutionInstance) instance);
        }
    }

    /**
     * Suspends all active instances of the process on this cluster.
     * The executor is first unregistered from the notification services. Instances held in the cache are then
     * suspended, followed by the instances the state store reports as active. A failure on one instance does not
     * stop the others from being processed.
     *
     * @throws FalconException if one or more instances failed to suspend; the message lists the failures
     */
    @Override
    public void suspendAll() throws FalconException {
        ensureInstancesInitialized();
        NotificationServicesRegistry.unregister(executionService, getId());
        StringBuilder errMsg = new StringBuilder();
        // Only active instances are in memory. Suspend them first.
        for (ExecutionInstance instance : instances.asMap().values()) {
            try {
                suspend(instance);
            } catch (FalconException e) {
                // Proceed with next
                errMsg.append(handleError(instance, e, EntityState.EVENT.SUSPEND));
            }
        }
        for (InstanceState instanceState : STATE_STORE.getExecutionInstances(process, cluster,
                InstanceState.getActiveStates())) {
            ExecutionInstance instance = instanceState.getInstance();
            try {
                suspend(instance);
            } catch (FalconException e) {
                errMsg.append(handleError(instance, e, EntityState.EVENT.SUSPEND));
            }
        }
        // Some errors
        if (errMsg.length() != 0) {
            throw new FalconException("Some instances failed to suspend : " + errMsg.toString());
        }
    }

    // Error handling for an operation.
    // Returns the error message to report, or an empty string if the failure can be ignored.
    private String handleError(ExecutionInstance instance, FalconException e, EntityState.EVENT action)
        throws StateStoreException {
        // If the instance terminated while a kill/suspend operation was in progress, ignore the exception.
        InstanceState.STATE currentState = STATE_STORE.getExecutionInstance(instance.getId()).getCurrentState();
        if (InstanceState.getTerminalStates().contains(currentState)) {
            return "";
        }

        String errMsg = "Instance " + action.name() + " failed for: " + instance.getId() + " due to " + e.getMessage();
        LOG.error(errMsg, e);
        return errMsg;
    }

    // Returns the instance time that follows the last materialized instance,
    // or null if the state store has no instance for this process and cluster.
    private Date getLastInstanceTime() throws StateStoreException {
        InstanceState instanceState = STATE_STORE.getLastExecutionInstance(process, cluster);
        if (instanceState == null) {
            return null;
        }
        return EntityUtil.getNextInstanceTime(instanceState.getInstance().getInstanceTime().toDate(),
                EntityUtil.getFrequency(process), EntityUtil.getTimeZone(process), 1);
    }

    /**
     * Resumes all instances that the state store reports as SUSPENDED and re-registers for notifications.
     * A failure on one instance does not stop the others from being processed, and the registration happens
     * before any failure is reported.
     *
     * @throws FalconException if one or more instances failed to resume; the message lists the failures
     */
    @Override
    public void resumeAll() throws FalconException {
        ensureInstancesInitialized();
        StringBuilder errMsg = new StringBuilder();
        ArrayList<InstanceState.STATE> states = new ArrayList<InstanceState.STATE>();
        // TODO : Distinguish between individually suspended instance versus suspended entity?
        states.add(InstanceState.STATE.SUSPENDED);
        // Load cache with suspended instances
        for (InstanceState instanceState : STATE_STORE.getExecutionInstances(process, cluster, states)) {
            ExecutionInstance instance = instanceState.getInstance();
            try {
                resume(instance);
            } catch (FalconException e) {
                errMsg.append("Instance resume failed for : " + instance.getId() + " due to " + e.getMessage());
                LOG.error("Instance resume failed for : " + instance.getId(), e);
            }
        }
        registerForNotifications(getLastInstanceTime());
        // Some errors
        if (errMsg.length() != 0) {
            throw new FalconException("Some instances failed to resume : " + errMsg.toString());
        }
    }

    /**
     * Kills all active instances of the process on this cluster.
     * Instances the state store reports as active are killed first, followed by the instances held in the cache.
     * A failure on one instance does not stop the others from being processed.
     * The executor is unregistered from the notification services only when no failure was recorded, because the
     * exception reporting the failures is thrown before the unregister call.
     *
     * @throws FalconException if one or more instances failed to be killed; the message lists the failures
     */
    @Override
    public void killAll() throws FalconException {
        ensureInstancesInitialized();
        StringBuilder errMsg = new StringBuilder();
        // Kill workflows in oozie.
        for (InstanceState instanceState : STATE_STORE.getExecutionInstances(process, cluster,
                InstanceState.getActiveStates())) {
            ExecutionInstance instance = instanceState.getInstance();
            try {
                kill(instance);
            } catch (FalconException e) {
                errMsg.append(handleError(instance, e, EntityState.EVENT.KILL));
            }
        }
        // Kill active instances in memory.
        Collection<ProcessExecutionInstance> execInstances = instances.asMap().values();
        for (ExecutionInstance instance : execInstances) {
            try {
                kill(instance);
            } catch (FalconException e) {
                // Proceed with next
                errMsg.append(handleError(instance, e, EntityState.EVENT.KILL));
            }
        }
        // Some errors
        if (errMsg.length() != 0) {
            throw new FalconException("Some instances failed to kill : " + errMsg.toString());
        }
        NotificationServicesRegistry.unregister(executionService, getId());
    }

    /**
     * Suspends a single instance and records the SUSPEND state change.
     *
     * @param instance the instance to suspend
     * @throws FalconException if the suspend or the state change fails; the original exception is the cause
     */
    @Override
    public void suspend(ExecutionInstance instance) throws FalconException {
        ensureInstancesInitialized();
        try {
            instance.suspend();
            stateService.handleStateChange(instance, InstanceState.EVENT.SUSPEND, this);
        } catch (Exception e) {
            LOG.error("Suspend failed for instance id : " + instance.getId(), e);
            throw new FalconException("Suspend failed for instance : " + instance.getId(), e);
        }
    }

    /**
     * Resumes a single instance.
     * If the instance is ready after resuming, the RESUME_READY state change is recorded and the instance is
     * handed to the scheduler; otherwise the RESUME_WAITING state change is recorded.
     *
     * @param instance the instance to resume
     * @throws FalconException if the resume or the state change fails; the original exception is the cause
     */
    @Override
    public void resume(ExecutionInstance instance) throws FalconException {
        ensureInstancesInitialized();
        try {
            instance.resume();
            if (((ProcessExecutionInstance) instance).isReady()) {
                stateService.handleStateChange(instance, InstanceState.EVENT.RESUME_READY, this);
                onConditionsMet(instance);
            } else {
                stateService.handleStateChange(instance, InstanceState.EVENT.RESUME_WAITING, this);
            }
        } catch (Exception e) {
            LOG.error("Resume failed for instance id : " + instance.getId(), e);
            throw new FalconException("Resume failed for instance : " + instance.getId(), e);
        }
    }

    /**
     * Kills a single instance and records the KILL state change.
     *
     * @param instance the instance to kill
     * @throws FalconException if the kill or the state change fails; the original exception is the cause
     */
    @Override
    public void kill(ExecutionInstance instance) throws FalconException {
        ensureInstancesInitialized();
        try {
            // Kill will de-register from notification services
            instance.kill();
            stateService.handleStateChange(instance, InstanceState.EVENT.KILL, this);
        } catch (Exception e) {
            LOG.error("Kill failed for instance id : " + instance.getId(), e);
            throw new FalconException("Kill failed for instance : " + instance.getId(), e);
        }
    }

    /**
     * Re-runs an instance by placing it in the cache and dispatching a {@link RerunEvent} for it.
     * The re-run flags are added to {@code props} (the caller's object is modified when it is non-null) and the
     * properties are set on the instance.
     *
     * @param instance the instance to re-run
     * @param props properties for the re-run; may be null, in which case a new, empty set is used
     * @param isForced whether to also set the force re-run flag
     * @throws FalconException if the re-run event cannot be handled
     */
    @Override
    public void rerun(ExecutionInstance instance, Properties props, boolean isForced) throws FalconException {
        ensureInstancesInitialized();
        if (props == null) {
            props = new Properties();
        }
        if (isForced) {
            props.put(FalconWorkflowEngine.FALCON_FORCE_RERUN, "true");
        }
        props.put(FalconWorkflowEngine.FALCON_RERUN, "true");
        instance.setProperties(props);
        instances.put(new InstanceID(instance), (ProcessExecutionInstance) instance);
        RerunEvent rerunEvent = new RerunEvent(instance.getId(), instance.getInstanceTime());
        onEvent(rerunEvent);
    }

    /**
     * Switches this executor to a new definition of the process.
     * The executor is unregistered from the time notification service, the process reference is replaced and the
     * executor is registered again using the new definition.
     *
     * @param newEntity the updated process definition
     * @throws FalconException if the new end time is before the current time, if the trigger type is not
     *                         time based, or if the re-registration fails
     */
    @Override
    public void update(Entity newEntity) throws FalconException {
        Date newEndTime = EntityUtil.getEndTime(newEntity, cluster);
        if (newEndTime.before(new Date())) {
            throw new FalconException("Entity's end time " + SchemaHelper.formatDateUTC(newEndTime)
                    + " is before current time. Entity can't be updated. Use remove and add");
        }
        LOG.debug("Updating for cluster: {}, entity: {}", cluster, newEntity.toShortString());
        // Unregister from the service that causes an instance to trigger,
        // so the new instances are triggered with the new definition.
        switch (triggerPredicate.getType()) {
        case TIME:
            NotificationServicesRegistry.getService(NotificationServicesRegistry.SERVICE.TIME)
                    .unregister(executionService, getId());
            break;
        default:
            throw new FalconException("Internal Error : Wrong instance trigger type.");
        }
        // Update process
        process = (Process) newEntity;
        // Re-register with new start, end, frequency etc.
        registerForNotifications(getLastInstanceTime());
    }

    /**
     * @return the process definition this executor currently works with
     */
    @Override
    public Entity getEntity() {
        return process;
    }

    // Builds a new execution instance for a trigger event.
    private ProcessExecutionInstance buildInstance(Event event) throws FalconException {
        // If a time triggered instance, use instance time from event
        if (event.getType() == EventType.TIME_ELAPSED) {
            TimeElapsedEvent timeEvent = (TimeElapsedEvent) event;
            LOG.debug("Creating a new process instance for instance time {}.", timeEvent.getInstanceTime());
            return new ProcessExecutionInstance(process, timeEvent.getInstanceTime(), cluster);
        } else {
            return new ProcessExecutionInstance(process, DateTime.now(), cluster);
        }
    }

    /**
     * Entry point for notification events.
     * Events this executor must act on (see {@code shouldHandleEvent}) are handled here. Any other event whose
     * target is an {@link InstanceID} is passed to that instance, after which the CONDITIONS_MET or TIME_OUT
     * state change is recorded if the instance has become ready or has timed out.
     *
     * @param event the notification event
     * @throws FalconException if handling fails for any reason; the original exception is the cause
     */
    @Override
    public void onEvent(Event event) throws FalconException {
        try {
            // Handle event if applicable
            if (shouldHandleEvent(event)) {
                handleEvent(event);
            } else {
                // Else, pass it along to the execution instance
                if (event.getTarget() instanceof InstanceID) {
                    InstanceID instanceID = (InstanceID) event.getTarget();
                    ProcessExecutionInstance instance = instances.get(instanceID);
                    if (instance != null) {
                        instance.onEvent(event);
                        if (instance.isReady()) {
                            stateService.handleStateChange(instance, InstanceState.EVENT.CONDITIONS_MET, this);
                        } else if (instance.hasTimedout()) {
                            stateService.handleStateChange(instance, InstanceState.EVENT.TIME_OUT, this);
                        }
                    }
                }
            }
        } catch (Exception e) {
            throw new FalconException("Unable to handle event of type : " + event.getType() + " with target:"
                    + event.getTarget(), e);
        }
    }

    // Handles the events this executor acts on itself: job scheduled/completed, re-run and data events are
    // translated into state changes of the target instance; any other event is checked against the trigger
    // predicate and, if it matches, a new instance is created.
    private void handleEvent(Event event) throws FalconException {
        ProcessExecutionInstance instance;
        try {
            switch (event.getType()) {
            case JOB_SCHEDULED:
                instance = instances.get((InstanceID) event.getTarget());
                instance.onEvent(event);
                switch (((JobScheduledEvent) event).getStatus()) {
                case SUCCESSFUL:
                    stateService.handleStateChange(instance, InstanceState.EVENT.SCHEDULE, this);
                    break;
                case FAILED:
                    stateService.handleStateChange(instance, InstanceState.EVENT.FAIL, this);
                    break;
                default:
                    throw new InvalidStateTransitionException("Invalid job scheduler status.");
                }
                break;
            case JOB_COMPLETED:
                instance = instances.get((InstanceID) event.getTarget());
                instance.onEvent(event);
                switch (((JobCompletedEvent) event).getStatus()) {
                case SUCCEEDED:
                    stateService.handleStateChange(instance, InstanceState.EVENT.SUCCEED, this);
                    break;
                case FAILED:
                    stateService.handleStateChange(instance, InstanceState.EVENT.FAIL, this);
                    break;
                case KILLED:
                    stateService.handleStateChange(instance, InstanceState.EVENT.KILL, this);
                    break;
                case SUSPENDED:
                    stateService.handleStateChange(instance, InstanceState.EVENT.SUSPEND, this);
                    break;
                default:
                    throw new InvalidStateTransitionException(
                            "Job seems to be have been managed outside Falcon.");
                }
                break;
            case RE_RUN:
                instance = instances.get((InstanceID) event.getTarget());
                stateService.handleStateChange(instance, InstanceState.EVENT.EXTERNAL_TRIGGER, this);
                if (instance.isReady()) {
                    stateService.handleStateChange(instance, InstanceState.EVENT.CONDITIONS_MET, this);
                }
                break;
            case DATA_AVAILABLE:
                instance = instances.get((InstanceID) event.getTarget());
                instance.onEvent(event);
                switch (((DataEvent) event).getStatus()) {
                case AVAILABLE:
                    if (instance.areDataAwaitingPredicatesEmpty() && !instance.hasTimedOut) {
                        LOG.info("Data conditions met for instance {} and scheduled for running ", instance.getId());
                        stateService.handleStateChange(instance, InstanceState.EVENT.CONDITIONS_MET, this);
                    } else if (instance.areDataAwaitingPredicatesEmpty()) {
                        LOG.info("Instance {} timedout since input data not available", instance.getId());
                        stateService.handleStateChange(instance, InstanceState.EVENT.TIME_OUT, this);
                    } else {
                        // Still waiting on other inputs; persist the instance's current state.
                        STATE_STORE.updateExecutionInstance(new InstanceState(instance));
                    }
                    break;
                case UNAVAILABLE:
                    if (instance.areDataAwaitingPredicatesEmpty()) {
                        stateService.handleStateChange(instance, InstanceState.EVENT.TIME_OUT, this);
                    }
                    break;
                default:
                    throw new InvalidStateTransitionException("Invalid Data event status.");
                }
                break;
            default:
                if (isTriggerEvent(event)) {
                    instance = buildInstance(event);
                    stateService.handleStateChange(instance, InstanceState.EVENT.TRIGGER, this);
                    // This happens when there are no conditions the instance is waiting on
                    // (for example, no data inputs).
                    if (!instance.isScheduled() && instance.isReady()) {
                        stateService.handleStateChange(instance, InstanceState.EVENT.CONDITIONS_MET, this);
                    }
                }
            }
        } catch (ExecutionException ee) {
            throw new FalconException("Unable to handle event for execution instance", ee);
        }
    }

    // Evaluates the trigger predicate against the current event, to determine if a new instance needs to be triggered.
    // An event whose predicate cannot be built or evaluated is treated as a non-trigger event.
    private boolean isTriggerEvent(Event event) {
        try {
            return triggerPredicate.evaluate(Predicate.getPredicate(event));
        } catch (FalconException e) {
            // Deliberately best-effort: keep returning false, but leave a trace of why.
            if (LOG.isDebugEnabled()) {
                LOG.debug("Could not evaluate trigger predicate for event of type " + event.getType()
                        + "; treating it as a non-trigger event.", e);
            }
            return false;
        }
    }

    // Registers for all notifications that should trigger an instance.
    // Currently, only time based triggers are handled.
    protected void registerForNotifications(Date instanceTime) throws FalconException {
        AlarmService.AlarmRequestBuilder requestBuilder =
                (AlarmService.AlarmRequestBuilder)
                NotificationServicesRegistry.getService(NotificationServicesRegistry.SERVICE.TIME)
                        .createRequestBuilder(executionService, getId());
        Cluster processCluster = ProcessHelper.getCluster(process, cluster);

        // If there are no instances, use process's start, else, use last materialized instance's time
        Date startTime = (instanceTime == null) ? processCluster.getValidity().getStart() : instanceTime;
        Date endTime = processCluster.getValidity().getEnd();
        // TODO : Handle cron based and calendar based time triggers
        // TODO : Set execution order details.
        requestBuilder.setFrequency(process.getFrequency())
                .setStartTime(new DateTime(startTime))
                .setEndTime(new DateTime(endTime))
                .setTimeZone(EntityUtil.getTimeZone(process));
        NotificationServicesRegistry.register(requestBuilder.build());
        LOG.info("Registered for a time based notification for process {} with frequency: {}, "
                + "start time: {}, end time: {}", process.getName(), process.getFrequency(), startTime, endTime);
        triggerPredicate = Predicate.createTimePredicate(startTime.getTime(), endTime.getTime(), -1);
    }

    // This executor must handle any events intended for itself.
    // Or, if it is job run or job complete notifications, so it can handle the instance's state transition.
    private boolean shouldHandleEvent(Event event) {
        return event.getTarget().equals(id)
                || event.getType() == EventType.JOB_COMPLETED
                || event.getType() == EventType.JOB_SCHEDULED
                || event.getType() == EventType.RE_RUN
                || event.getType() == EventType.DATA_AVAILABLE;
    }

    /**
     * Callback for a newly triggered instance: places the instance in the cache.
     *
     * @param instance the triggered instance
     * @throws FalconException declared by the overridden method
     */
    @Override
    public void onTrigger(ExecutionInstance instance) throws FalconException {
        instances.put(new InstanceID(instance), (ProcessExecutionInstance) instance);
    }

    /**
     * Callback for an externally triggered instance: places the instance in the cache and re-runs it.
     *
     * @param instance the externally triggered instance
     * @throws FalconException if the re-run fails
     */
    @Override
    public void onExternalTrigger(ExecutionInstance instance) throws FalconException {
        instances.put(new InstanceID(instance), (ProcessExecutionInstance) instance);
        ((ProcessExecutionInstance) instance).rerun();
    }

    /**
     * Callback for an instance whose conditions are met: registers a job schedule request for it.
     *
     * @param instance the instance that is ready to run
     * @throws FalconException if the registration fails
     */
    @Override
    public void onConditionsMet(ExecutionInstance instance) throws FalconException {
        // Put process in run queue and register for notification
        SchedulerService.JobScheduleRequestBuilder requestBuilder = (SchedulerService.JobScheduleRequestBuilder)
                NotificationServicesRegistry.getService(NotificationServicesRegistry.SERVICE.JOB_SCHEDULE)
                        .createRequestBuilder(executionService, getId());
        requestBuilder.setInstance(instance);
        NotificationServicesRegistry.register(requestBuilder.build());
    }

    /**
     * Callback for a scheduled instance: registers a job completion request for the instance's external id
     * and cluster.
     *
     * @param instance the instance that has been scheduled
     * @throws FalconException if the registration fails
     */
    @Override
    public void onSchedule(ExecutionInstance instance) throws FalconException {
        JobCompletionService.JobCompletionRequestBuilder completionRequestBuilder =
                (JobCompletionService.JobCompletionRequestBuilder)
                NotificationServicesRegistry.getService(NotificationServicesRegistry.SERVICE.JOB_COMPLETION)
                        .createRequestBuilder(executionService, getId());
        completionRequestBuilder.setExternalId(instance.getExternalID());
        completionRequestBuilder.setCluster(instance.getCluster());
        NotificationServicesRegistry.register(completionRequestBuilder.build());
    }

    /**
     * Callback for a suspended instance: unregisters the instance from the job schedule service and removes it
     * from the cache.
     *
     * @param instance the suspended instance
     * @throws FalconException if the unregistration fails
     */
    @Override
    public void onSuspend(ExecutionInstance instance) throws FalconException {
        NotificationServicesRegistry.getService(NotificationServicesRegistry.SERVICE.JOB_SCHEDULE)
                .unregister(executionService, instance.getId());
        instances.invalidate(instance.getId());
    }

    /**
     * Callback for a resumed instance: places the instance back in the cache.
     *
     * @param instance the resumed instance
     * @throws FalconException declared by the overridden method
     */
    @Override
    public void onResume(ExecutionInstance instance) throws FalconException {
        instances.put(instance.getId(), (ProcessExecutionInstance) instance);
    }

    /**
     * Callback for a killed instance: unregisters the instance from the job schedule service and removes it
     * from the cache.
     *
     * @param instance the killed instance
     * @throws FalconException if the unregistration fails
     */
    @Override
    public void onKill(ExecutionInstance instance) throws FalconException {
        NotificationServicesRegistry.getService(NotificationServicesRegistry.SERVICE.JOB_SCHEDULE)
                .unregister(executionService, instance.getId());
        instances.invalidate(instance.getId());
    }

    /**
     * Callback for a successfully completed instance: destroys the instance and removes it from the cache.
     *
     * @param instance the completed instance
     * @throws FalconException if destroying the instance fails
     */
    @Override
    public void onSuccess(ExecutionInstance instance) throws FalconException {
        instance.destroy();
        instances.invalidate(instance.getId());
    }

    /**
     * Callback for a failed instance: destroys the instance and removes it from the cache.
     *
     * @param instance the failed instance
     * @throws FalconException if destroying the instance fails
     */
    @Override
    public void onFailure(ExecutionInstance instance) throws FalconException {
        instance.destroy();
        instances.invalidate(instance.getId());
    }

    /**
     * Callback for a timed out instance: destroys the instance and removes it from the cache.
     *
     * @param instance the timed out instance
     * @throws FalconException if destroying the instance fails
     */
    @Override
    public void onTimeOut(ExecutionInstance instance) throws FalconException {
        instance.destroy();
        instances.invalidate(instance.getId());
    }
}