/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.falcon.workflow.engine;

import org.apache.commons.lang3.StringUtils;
import org.apache.falcon.FalconException;
import org.apache.falcon.Tag;
import org.apache.falcon.entity.ClusterHelper;
import org.apache.falcon.entity.EntityUtil;
import org.apache.falcon.entity.store.ConfigurationStore;
import org.apache.falcon.entity.v0.Entity;
import org.apache.falcon.entity.v0.EntityType;
import org.apache.falcon.entity.v0.cluster.Cluster;
import org.apache.falcon.entity.v0.process.Process;
import org.apache.falcon.exception.DAGEngineException;
import org.apache.falcon.execution.ExecutionInstance;
import org.apache.falcon.hadoop.HadoopClientFactory;
import org.apache.falcon.oozie.OozieOrchestrationWorkflowBuilder;
import org.apache.falcon.resource.InstancesResult;
import org.apache.falcon.security.CurrentUser;
import org.apache.falcon.util.OozieUtils;
import org.apache.falcon.util.RuntimeProperties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.oozie.client.Job;
import org.apache.oozie.client.OozieClient;
import org.apache.oozie.client.OozieClientException;
import org.apache.oozie.client.WorkflowAction;
import org.apache.oozie.client.WorkflowJob;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Properties;

/**
 * A DAG Engine that uses Oozie to execute the DAG.
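 * <p>
 * A minimal usage sketch (illustrative only; assumes a registered cluster entity named
 * "primary-cluster" and an already-built {@link ExecutionInstance}):
 * <pre>{@code
 * DAGEngine engine = new OozieDAGEngine("primary-cluster");
 * if (engine.isAlive()) {
 *     String oozieJobId = engine.run(instance, new Properties());
 * }
 * }</pre>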
 */
public class OozieDAGEngine implements DAGEngine {
    private static final Logger LOG = LoggerFactory.getLogger(OozieDAGEngine.class);
    private final OozieClient client;
    private static final int WORKFLOW_STATUS_RETRY_DELAY_MS = 100;
    private static final String WORKFLOW_STATUS_RETRY_COUNT = "workflow.status.retry.count";
    private static final List<String> PARENT_WF_ACTION_NAMES = Arrays.asList(
            "pre-processing",
            "recordsize",
            "succeeded-post-processing",
            "failed-post-processing"
    );

    public static final String INSTANCE_FORMAT = "yyyy-MM-dd-HH-mm";

    private final Cluster cluster;

    public OozieDAGEngine(Cluster cluster) throws DAGEngineException {
        try {
            client = OozieClientFactory.get(cluster);
            this.cluster = cluster;
        } catch (Exception e) {
            throw new DAGEngineException(e);
        }
    }

    public OozieDAGEngine(String clusterName) throws DAGEngineException {
        try {
            this.cluster = ConfigurationStore.get().get(EntityType.CLUSTER, clusterName);
            client = OozieClientFactory.get(cluster);
        } catch (Exception e) {
            throw new DAGEngineException(e);
        }
    }

    @Override
    public String run(ExecutionInstance instance, Properties props) throws DAGEngineException {
        try {
            OozieOrchestrationWorkflowBuilder builder = OozieOrchestrationWorkflowBuilder.get(instance.getEntity(),
                    cluster, Tag.DEFAULT, OozieOrchestrationWorkflowBuilder.Scheduler.NATIVE);
            prepareEntityBuildPath(instance.getEntity());
            Path buildPath = EntityUtil.getLatestStagingPath(cluster, instance.getEntity());
            builder.setNominalTime(instance.getInstanceTime());
            Properties properties = builder.build(cluster, buildPath, props);
            switchUserTo(instance.getEntity().getACL().getOwner());
            properties.setProperty(OozieClient.USER_NAME, instance.getEntity().getACL().getOwner());
            properties.setProperty(OozieClient.APP_PATH, buildPath.toString());
            return client.run(properties);
        } catch (OozieClientException e) {
            LOG.error("Oozie client exception:", e);
            throw new DAGEngineException(e);
        } catch (FalconException e1) {
            LOG.error("Falcon Exception : ", e1);
            throw new DAGEngineException(e1);
        }
    }

    private void switchUserTo(String user) {
        CurrentUser.authenticate(user);
    }

    private void prepareEntityBuildPath(Entity entity) throws FalconException {
        Path stagingPath = EntityUtil.getBaseStagingPath(cluster, entity);
        Path logPath = EntityUtil.getLogPath(cluster, entity);

        try {
            FileSystem fs = HadoopClientFactory.get().createProxiedFileSystem(
                    ClusterHelper.getConfiguration(cluster));
            HadoopClientFactory.mkdirsWithDefaultPerms(fs, stagingPath);
            HadoopClientFactory.mkdirsWithDefaultPerms(fs, logPath);
        } catch (IOException e) {
            throw new FalconException("Error preparing base staging dirs: " + stagingPath, e);
        }
    }

    private void dryRunInternal(Properties properties, Path buildPath, Entity entity)
        throws OozieClientException, DAGEngineException {
        if (properties == null) {
            LOG.info("Entity {} is not scheduled on cluster {} with user {}", entity.getName(), cluster,
                    entity.getACL().getOwner());
            throw new DAGEngineException("Properties for entity " + entity.getName() + " are empty");
        }

        switchUserTo(entity.getACL().getOwner());
        properties.setProperty(OozieClient.USER_NAME, entity.getACL().getOwner());
        properties.setProperty(OozieClient.APP_PATH, buildPath.toString());
        // Do a dryrun before run, since run is asynchronous.
        LOG.info("Dry run with properties {}", properties);
        client.dryrun(properties);
    }

    private void switchUser() {
        switchUserTo(System.getProperty("user.name"));
    }
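    // An instance is considered scheduled if its Oozie workflow is currently in PREP or RUNNING state.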
    @Override
    public boolean isScheduled(ExecutionInstance instance) throws DAGEngineException {
        try {
            return statusEquals(client.getJobInfo(instance.getExternalID()).getStatus().name(),
                    Job.Status.PREP, Job.Status.RUNNING);
        } catch (OozieClientException e) {
            throw new DAGEngineException(e);
        }
    }

    @Override
    public void suspend(ExecutionInstance instance) throws DAGEngineException {
        try {
            client.suspend(instance.getExternalID());
            assertStatus(instance.getExternalID(), Job.Status.PREPSUSPENDED, Job.Status.SUSPENDED,
                    Job.Status.SUCCEEDED, Job.Status.FAILED, Job.Status.KILLED);
            LOG.info("Suspended job {} of entity {} of time {} on cluster {}", instance.getExternalID(),
                    instance.getEntity().getName(), instance.getInstanceTime(), instance.getCluster());
        } catch (OozieClientException e) {
            throw new DAGEngineException(e);
        }
    }

    @Override
    public void resume(ExecutionInstance instance) throws DAGEngineException {
        switchUserTo(instance.getEntity().getACL().getOwner());
        try {
            client.resume(instance.getExternalID());
            assertStatus(instance.getExternalID(), Job.Status.PREP, Job.Status.RUNNING,
                    Job.Status.SUCCEEDED, Job.Status.FAILED, Job.Status.KILLED);
            LOG.info("Resumed job {} of entity {} of time {} on cluster {}", instance.getExternalID(),
                    instance.getEntity().getName(), instance.getInstanceTime(), instance.getCluster());
        } catch (OozieClientException e) {
            throw new DAGEngineException(e);
        }
    }

    @Override
    public void kill(ExecutionInstance instance) throws DAGEngineException {
        try {
            client.kill(instance.getExternalID());
            assertStatus(instance.getExternalID(), Job.Status.KILLED, Job.Status.SUCCEEDED, Job.Status.FAILED);
            LOG.info("Killed job {} of entity {} of time {} on cluster {}", instance.getExternalID(),
                    instance.getEntity().getName(), instance.getInstanceTime(), instance.getCluster());
        } catch (OozieClientException e) {
            throw new DAGEngineException(e);
        }
    }

    @Override
    public void reRun(ExecutionInstance instance, Properties props, boolean isForced) throws DAGEngineException {
        switchUserTo(instance.getEntity().getACL().getOwner());
        String jobId = instance.getExternalID();
        try {
            WorkflowJob jobInfo = client.getJobInfo(jobId);
            if (props == null) {
                props = new Properties();
            }
            // If the user has set any of these Oozie rerun properties, the force rerun flag is ignored.
            if (!props.containsKey(OozieClient.RERUN_FAIL_NODES)
                    && !props.containsKey(OozieClient.RERUN_SKIP_NODES)) {
                props.put(OozieClient.RERUN_FAIL_NODES, String.valueOf(!isForced));
            }
            Properties jobprops = OozieUtils.toProperties(jobInfo.getConf());
            jobprops.putAll(props);
            jobprops.remove(OozieClient.COORDINATOR_APP_PATH);
            jobprops.remove(OozieClient.BUNDLE_APP_PATH);
            // If both properties are present, one must be removed, otherwise the rerun will fail.
            // This happens when the user originally ran the workflow with the skip-nodes property and
            // then attempts a forced rerun or a rerun with the fail-nodes property.
            if (jobprops.containsKey(OozieClient.RERUN_FAIL_NODES)
                    && jobprops.containsKey(OozieClient.RERUN_SKIP_NODES)) {
                LOG.warn("Both " + OozieClient.RERUN_SKIP_NODES + " and " + OozieClient.RERUN_FAIL_NODES
                        + " are present in workflow params; removing " + OozieClient.RERUN_SKIP_NODES);
                jobprops.remove(OozieClient.RERUN_SKIP_NODES);
            }
            client.reRun(jobId, jobprops);
            assertStatus(instance.getExternalID(), Job.Status.PREP, Job.Status.RUNNING, Job.Status.SUCCEEDED);
            LOG.info("Rerun job {} of entity {} of time {} on cluster {}", jobId,
                    instance.getEntity().getName(), instance.getInstanceTime(), instance.getCluster());
        } catch (Exception e) {
            LOG.error("Unable to rerun workflows", e);
            throw new DAGEngineException(e);
        }
    }

    @Override
    public void submit(Entity entity, Properties props) throws DAGEngineException {
        try {
            // TODO : remove hardcoded Tag value when feed support is added.
            OozieOrchestrationWorkflowBuilder builder = OozieOrchestrationWorkflowBuilder.get(entity, cluster,
                    Tag.DEFAULT, OozieOrchestrationWorkflowBuilder.Scheduler.NATIVE);
            prepareEntityBuildPath(entity);
            Path buildPath = EntityUtil.getNewStagingPath(cluster, entity);
            org.apache.falcon.entity.v0.process.Process process = (Process) entity;
            builder.setNominalTime(new DateTime(process.getClusters().getClusters().get(0).getValidity().getStart()));
            Properties properties = builder.build(cluster, buildPath, props);

            boolean skipDryRun = false;
            if (props != null && props.containsKey(FalconWorkflowEngine.FALCON_SKIP_DRYRUN)) {
                skipDryRun = Boolean.parseBoolean(props.getProperty(FalconWorkflowEngine.FALCON_SKIP_DRYRUN));
            }
            if (!skipDryRun) {
                dryRunInternal(properties, buildPath, entity);
            }
        } catch (OozieClientException e) {
            LOG.error("Oozie client exception:", e);
            throw new DAGEngineException(e);
        } catch (FalconException e1) {
            LOG.error("Falcon Exception : ", e1);
            throw new DAGEngineException(e1);
        }
    }

    @Override
    public InstancesResult.Instance info(String externalID) throws DAGEngineException {
        InstancesResult.Instance instance = new InstancesResult.Instance();
        try {
            LOG.debug("Retrieving details for job {} ", externalID);
            WorkflowJob jobInfo = client.getJobInfo(externalID);
            instance.startTime = jobInfo.getStartTime();
            if (jobInfo.getStatus().name().equals(Job.Status.RUNNING.name())) {
                instance.endTime = new Date();
            } else {
                instance.endTime = jobInfo.getEndTime();
            }
            instance.cluster = cluster.getName();
            instance.runId = jobInfo.getRun();
            instance.status = InstancesResult.WorkflowStatus.valueOf(jobInfo.getStatus().name());
            instance.logFile = jobInfo.getConsoleUrl();
            instance.wfParams = getWFParams(jobInfo);
            return instance;
        } catch (Exception e) {
            LOG.error("Error when attempting to get info for " + externalID, e);
            throw new DAGEngineException(e);
        }
    }

    private InstancesResult.KeyValuePair[] getWFParams(WorkflowJob jobInfo) {
        Configuration conf = new Configuration(false);
        conf.addResource(new ByteArrayInputStream(jobInfo.getConf().getBytes()));
        InstancesResult.KeyValuePair[] wfParams = new InstancesResult.KeyValuePair[conf.size()];
        int i = 0;
        for (Map.Entry<String, String> entry : conf) {
            wfParams[i++] = new InstancesResult.KeyValuePair(entry.getKey(), entry.getValue());
        }
        return wfParams;
    }
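    // Collects the status and console URL of each user action of the main workflow
    // (and of any sub-workflow it launches) so they can be reported as instance actions.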
    @Override
    public List<InstancesResult.InstanceAction> getJobDetails(String externalID) throws DAGEngineException {
        List<InstancesResult.InstanceAction> instanceActions = new ArrayList<>();
        try {
            WorkflowJob wfJob = client.getJobInfo(externalID);
            List<WorkflowAction> wfActions = wfJob.getActions();
            // We want to capture job URLs for all user actions and non-succeeded actions of the main workflow.
            for (WorkflowAction action : wfActions) {
                if (action.getType().equalsIgnoreCase("sub-workflow")
                        && StringUtils.isNotEmpty(action.getExternalId())) {
                    // If the action is a sub-workflow, get job URLs of all actions within the sub-workflow.
                    List<WorkflowAction> subWorkFlowActions = client
                            .getJobInfo(action.getExternalId()).getActions();
                    for (WorkflowAction subWfAction : subWorkFlowActions) {
                        if (!subWfAction.getType().startsWith(":")) {
                            InstancesResult.InstanceAction instanceAction =
                                    new InstancesResult.InstanceAction(subWfAction.getName(),
                                            subWfAction.getExternalStatus(), subWfAction.getConsoleUrl());
                            instanceActions.add(instanceAction);
                        }
                    }
                } else if (!action.getType().startsWith(":")) {
                    // Transition nodes have types starting with ':'; their statuses are not needed.
                    if (PARENT_WF_ACTION_NAMES.contains(action.getName())
                            && !Job.Status.SUCCEEDED.toString().equals(action.getExternalStatus())) {
                        // Falcon actions in the main workflow are listed in PARENT_WF_ACTION_NAMES:
                        // get job URLs for all non-succeeded actions of the main workflow.
                        InstancesResult.InstanceAction instanceAction =
                                new InstancesResult.InstanceAction(action.getName(),
                                        action.getExternalStatus(), action.getConsoleUrl());
                        instanceActions.add(instanceAction);
                    } else if (!PARENT_WF_ACTION_NAMES.contains(action.getName())
                            && !StringUtils.equals(action.getExternalId(), "-")) {
                        // If the user action is pig/hive there is no sub-workflow; capture its URL as well.
                        InstancesResult.InstanceAction instanceAction =
                                new InstancesResult.InstanceAction(action.getName(),
                                        action.getExternalStatus(), action.getConsoleUrl());
                        instanceActions.add(instanceAction);
                    }
                }
            }
            return instanceActions;
        } catch (OozieClientException oce) {
            throw new DAGEngineException(oce);
        }
    }

    @Override
    public boolean isAlive() throws DAGEngineException {
        try {
            return client.getSystemMode() == OozieClient.SYSTEM_MODE.NORMAL;
        } catch (OozieClientException e) {
            throw new DAGEngineException("Unable to reach Oozie server.", e);
        }
    }

    @Override
    public Properties getConfiguration(String externalID) throws DAGEngineException {
        Properties props = new Properties();
        try {
            switchUser();
            WorkflowJob jobInfo = client.getJobInfo(externalID);
            Configuration conf = new Configuration(false);
            conf.addResource(new ByteArrayInputStream(jobInfo.getConf().getBytes()));
            for (Map.Entry<String, String> entry : conf) {
                props.put(entry.getKey(), entry.getValue());
            }
        } catch (OozieClientException e) {
            throw new DAGEngineException(e);
        }
        return props;
    }

    @Override
    public void touch(Entity entity, Boolean skipDryRun) throws DAGEngineException {
        // TODO : remove hardcoded Tag value when feed support is added.
        try {
            OozieOrchestrationWorkflowBuilder builder = OozieOrchestrationWorkflowBuilder.get(entity, cluster,
                    Tag.DEFAULT, OozieOrchestrationWorkflowBuilder.Scheduler.NATIVE);
            if (!skipDryRun) {
                Path buildPath = new Path("/tmp", "falcon" + entity.getName() + System.currentTimeMillis());
                Properties props = builder.build(cluster, buildPath);
                dryRunInternal(props, buildPath, entity);
            }
            Path buildPath = EntityUtil.getNewStagingPath(cluster, entity);
            // Build it and forget it. The next run will always pick up from the latest staging path.
            builder.build(cluster, buildPath);
        } catch (FalconException fe) {
            LOG.error("Falcon Exception : ", fe);
            throw new DAGEngineException(fe);
        } catch (OozieClientException e) {
            LOG.error("Oozie client exception:", e);
            throw new DAGEngineException(e);
        }
    }

    // Get the status of a workflow (with retry) and ensure it is one of the statuses requested.
    private void assertStatus(String jobID, Job.Status... statuses) throws DAGEngineException {
        String actualStatus = null;
        int retryCount;
        String retry = RuntimeProperties.get().getProperty(WORKFLOW_STATUS_RETRY_COUNT, "30");
        try {
            retryCount = Integer.parseInt(retry);
        } catch (NumberFormatException nfe) {
            throw new DAGEngineException("Invalid value provided for runtime property \""
                    + WORKFLOW_STATUS_RETRY_COUNT + "\". Please provide an integer value.");
        }
        for (int counter = 0; counter < retryCount; counter++) {
            try {
                actualStatus = client.getJobInfo(jobID).getStatus().name();
            } catch (OozieClientException e) {
                LOG.error("Unable to get status of workflow: " + jobID, e);
                throw new DAGEngineException(e);
            }
            if (!statusEquals(actualStatus, statuses)) {
                try {
                    Thread.sleep(WORKFLOW_STATUS_RETRY_DELAY_MS);
                } catch (InterruptedException ignore) {
                    // ignore
                }
            } else {
                return;
            }
        }
        throw new DAGEngineException("For job " + jobID + ", actual statuses: " + actualStatus
                + ", expected statuses: " + Arrays.toString(statuses));
    }

    private boolean statusEquals(String left, Job.Status... right) {
        for (Job.Status rightElement : right) {
            if (left.equals(rightElement.name())) {
                return true;
            }
        }
        return false;
    }
}