/*
* The MIT License
*
* Copyright 2016 CloudBees, Inc
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package it.dockins.dockerslaves.pipeline;
import hudson.AbortException;
import hudson.Extension;
import hudson.model.queue.QueueListener;
import it.dockins.dockerslaves.DockerSlave;
import it.dockins.dockerslaves.DockerSlaves;
import it.dockins.dockerslaves.spec.ContainerSetDefinition;
import it.dockins.dockerslaves.spec.ImageIdContainerDefinition;
import it.dockins.dockerslaves.spec.SideContainerDefinition;
import com.google.inject.Inject;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import hudson.EnvVars;
import hudson.FilePath;
import hudson.Launcher;
import hudson.model.Computer;
import hudson.model.Executor;
import hudson.model.Item;
import hudson.model.Job;
import hudson.model.Label;
import hudson.model.Node;
import hudson.model.Queue;
import hudson.model.ResourceList;
import hudson.model.Run;
import hudson.model.TaskListener;
import hudson.model.TopLevelItem;
import hudson.model.queue.CauseOfBlockage;
import hudson.model.queue.SubTask;
import hudson.remoting.ChannelClosedException;
import hudson.remoting.RequestAbortedException;
import hudson.security.ACL;
import hudson.security.AccessControlled;
import hudson.security.Permission;
import hudson.slaves.WorkspaceList;
import jenkins.model.Jenkins;
import jenkins.model.queue.AsynchronousExecution;
import jenkins.util.Timer;
import org.acegisecurity.AccessDeniedException;
import org.acegisecurity.Authentication;
import org.jenkinsci.plugins.durabletask.executors.ContinuableExecutable;
import org.jenkinsci.plugins.durabletask.executors.ContinuedTask;
import org.jenkinsci.plugins.workflow.flow.FlowExecution;
import org.jenkinsci.plugins.workflow.flow.FlowExecutionOwner;
import org.jenkinsci.plugins.workflow.graph.FlowNode;
import org.jenkinsci.plugins.workflow.steps.AbstractStepExecutionImpl;
import org.jenkinsci.plugins.workflow.steps.BodyExecutionCallback;
import org.jenkinsci.plugins.workflow.steps.StepContext;
import org.jenkinsci.plugins.workflow.steps.StepContextParameter;
import org.jenkinsci.plugins.workflow.support.actions.WorkspaceActionImpl;
import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.DoNotUse;
import org.kohsuke.accmod.restrictions.NoExternalUse;
import org.kohsuke.stapler.export.ExportedBean;
import javax.annotation.CheckForNull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.io.PrintStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import static java.util.logging.Level.FINE;
import static java.util.logging.Level.WARNING;
/**
* This class is heavily based (so to speak...) on org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution.
* The original code cannot be easily extended; the PlaceholderTask constructor is private, for example.
*
* As much as we hate copy / pasting, as an experimental implementation, we feel that it is still interesting
* to test the approach.
*
* See https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/15 for a discussion about that.
*/
public class DockerNodeStepExecution extends AbstractStepExecutionImpl {
/** Step definition; {@link #start()} reads the main image and the optional side container images from it. */
@Inject(optional = true)
private transient DockerNodeStep step;
/** Build listener; its logger receives the "still waiting" and rescheduling messages. */
@StepContextParameter
private transient TaskListener listener;
/** Build this step belongs to; used to name the node, to create the provisioner and in log messages. */
@StepContextParameter
private transient Run<?, ?> run;
// Here just for requiredContext; could perhaps be passed to the PlaceholderTask constructor:
@StepContextParameter
private transient FlowExecution flowExecution;
/** Flow node of this step; its id is appended to the name of the node created in {@link #start()}. */
@StepContextParameter
private transient FlowNode flowNode;
/**
 * Schedules the placeholder task and registers the dedicated Docker node that is meant to run it.
 * <p>
 * General strategy of this step:
 * <ol>
 * <li>schedule a {@link PlaceholderTask} into the {@link Queue} (what this method does)</li>
 * <li>when the {@link PlaceholderTask} starts running, invoke the closure</li>
 * <li>when the closure is done, let the {@link PlaceholderTask} complete</li>
 * </ol>
 *
 * @return always {@code false}
 * @throws IllegalStateException if the queue did not create an item for the task
 */
@Override
public boolean start() throws Exception {
    // A unique label ties the queued task to the node created further down.
    final String label = "docker_" + Long.toHexString(System.nanoTime());
    final PlaceholderTask task = new PlaceholderTask(getContext(), label, run);
    final DockerSlaves cloud = DockerSlaves.get();
    final String slaveName = "Container for " + run.toString() + "." + flowNode.getId();
    final String description = "Container building " + run.getParent().getFullName();

    // The queue item has to exist first: the node constructor below takes it as an argument.
    final Queue.Item queued = Queue.getInstance().schedule2(task, 0).getCreateItem();
    if (queued == null) {
        // There can be no duplicates. But could be refused if a QueueDecisionHandler rejects it for some odd reason.
        throw new IllegalStateException("failed to schedule task");
    }

    // One side container per configured image; the image id doubles as the container name.
    final List<SideContainerDefinition> sidecars = new ArrayList<>();
    if (step.getSideContainers() != null) {
        for (String image : step.getSideContainers()) {
            final ImageIdContainerDefinition definition = new ImageIdContainerDefinition(image, false);
            sidecars.add(new SideContainerDefinition(image, definition));
        }
    }
    final ImageIdContainerDefinition buildContainer = new ImageIdContainerDefinition(step.getImage(), false);
    final ContainerSetDefinition spec = new ContainerSetDefinition(buildContainer, sidecars);

    final Node node = new DockerSlave(slaveName, description, label,
            cloud.createProvisionerForPipeline(run.getParent(), spec), queued);
    Jenkins.getActiveInstance().addNode(node);

    // After 15 seconds, tell the build log if the task is still sitting in the queue.
    Timer.get().schedule(new Runnable() {
        @Override
        public void run() {
            final Queue.Item pending = Queue.getInstance().getItem(task);
            if (pending == null) {
                return;
            }
            final PrintStream out;
            try {
                out = listener.getLogger();
            } catch (Exception x) { // IOException, InterruptedException
                LOGGER.log(WARNING, null, x);
                return;
            }
            out.println("Still waiting to schedule task");
            final String reason = pending.getWhy();
            if (reason != null) {
                out.println(reason);
            }
        }
    }, 15, TimeUnit.SECONDS);
    return false;
}
/**
 * Stops this step: retracts its queue item if it is still waiting, finishes its placeholder
 * executable if it is already running, and finally fails the step context with {@code cause}.
 *
 * @param cause the failure reported to the step context
 */
@Override
public void stop(Throwable cause) {
    final StepContext self = getContext();
    final Queue queue = Queue.getInstance();
    // if we are still in the queue waiting to be scheduled, just retract that
    for (Queue.Item queued : queue.getItems()) {
        if (queued.task instanceof PlaceholderTask && ((PlaceholderTask) queued.task).context.equals(self)) {
            queue.cancel(queued);
            break;
        }
    }
    final Jenkins jenkins = Jenkins.getInstance();
    if (jenkins != null) {
        // if we are already running, kill the ongoing activities, which releases PlaceholderExecutable from its sleep loop
        final PlaceholderTask running = findRunningTask(jenkins, self);
        if (running != null) {
            PlaceholderTask.finish(running.cookie);
        }
    }
    // Whether or not either of the above worked (and they would not if for example our item were canceled), make sure we die.
    getContext().onFailure(cause);
}

/**
 * Looks through all executors for a placeholder executable whose task belongs to {@code wanted}.
 * Similar to Executor.of, but distinct since we do not have the Executable yet.
 *
 * @return the owning task of the first match, or {@code null} when none is running
 */
private static PlaceholderTask findRunningTask(Jenkins jenkins, StepContext wanted) {
    for (Computer computer : jenkins.getComputers()) {
        for (Executor executor : computer.getExecutors()) {
            final Queue.Executable current = executor.getCurrentExecutable();
            if (!(current instanceof PlaceholderTask.PlaceholderExecutable)) {
                continue;
            }
            final PlaceholderTask owner = ((PlaceholderTask.PlaceholderExecutable) current).getParent();
            if (owner.context.equals(wanted)) {
                return owner;
            }
        }
    }
    return null;
}
/**
 * After a reload, checks whether this step's task is still queued or running, and
 * reschedules it through {@link #start()} when neither is the case. Cf. the stop logic.
 */
@Override
public void onResume() {
    super.onResume();
    final StepContext self = getContext();

    // Still queued?
    for (Queue.Item queued : Queue.getInstance().getItems()) {
        if (!(queued.task instanceof PlaceholderTask)) {
            continue;
        }
        if (((PlaceholderTask) queued.task).context.equals(self)) {
            LOGGER.log(FINE, "Queue item for node block in {0} is still waiting after reload", run);
            return;
        }
    }

    // Already running on some executor?
    final Jenkins jenkins = Jenkins.getInstance();
    if (jenkins != null) {
        for (Computer computer : jenkins.getComputers()) {
            for (Executor executor : computer.getExecutors()) {
                final Queue.Executable current = executor.getCurrentExecutable();
                if (!(current instanceof PlaceholderTask.PlaceholderExecutable)) {
                    continue;
                }
                if (((PlaceholderTask.PlaceholderExecutable) current).getParent().context.equals(self)) {
                    LOGGER.log(FINE, "Node block in {0} is running on {1} after reload", new Object[] {run, computer.getName()});
                    return;
                }
            }
        }
    }

    // Neither queued nor running: the task got lost.
    if (step == null) { // compatibility: used to be transient
        listener.getLogger().println("Queue item for node block in " + run.getFullDisplayName() + " is missing (perhaps JENKINS-34281), but cannot reschedule");
        return;
    }
    listener.getLogger().println("Queue item for node block in " + run.getFullDisplayName() + " is missing (perhaps JENKINS-34281); rescheduling");
    try {
        start();
    } catch (Exception x) {
        getContext().onFailure(x);
    }
}
/**
 * Describes where this step's task currently is: waiting in the queue, running on a computer,
 * or neither.
 */
@Override
public String getStatus() {
    // Yet another copy of the same logic; perhaps this should be factored into some method returning a union of Queue.Item and PlaceholderExecutable?
    final StepContext self = getContext();
    for (Queue.Item queued : Queue.getInstance().getItems()) {
        if (!(queued.task instanceof PlaceholderTask)) {
            continue;
        }
        if (((PlaceholderTask) queued.task).context.equals(self)) {
            return "waiting for " + queued.task.getFullDisplayName() + " to be scheduled; blocked: " + queued.getWhy();
        }
    }
    final Jenkins jenkins = Jenkins.getInstance();
    if (jenkins != null) {
        for (Computer computer : jenkins.getComputers()) {
            for (Executor executor : computer.getExecutors()) {
                final Queue.Executable current = executor.getCurrentExecutable();
                if (!(current instanceof PlaceholderTask.PlaceholderExecutable)) {
                    continue;
                }
                if (((PlaceholderTask.PlaceholderExecutable) current).getParent().context.equals(self)) {
                    return "running on " + computer.getName();
                }
            }
        }
    }
    return "node block appears to be neither running nor scheduled";
}
/**
 * Queue listener that fails the owning step with an {@link AbortException} when its
 * {@link PlaceholderTask} leaves the queue because it was cancelled.
 */
@Extension
public static class CancelledItemListener extends QueueListener {
    @Override
    public void onLeft(Queue.LeftItem li) {
        if (!li.isCancelled() || !(li.task instanceof PlaceholderTask)) {
            return;
        }
        final PlaceholderTask cancelled = (PlaceholderTask) li.task;
        cancelled.context.onFailure(new AbortException(Messages.DockerNodeStepExecution_queue_task_cancelled()));
    }
}
/**
* Transient handle of a running executor task.
* Instances are stored in {@code PlaceholderTask.runningTasks}, keyed by the task cookie.
*/
private static final class RunningTask {
/** null until placeholder executable runs; completed by {@code PlaceholderTask.finish} */
@Nullable AsynchronousExecution execution;
/** null until placeholder executable runs; {@code PlaceholderTask.finish} calls its {@code kill} with the cookie variable */
@Nullable Launcher launcher;
}
/** Name of the environment variable that holds the task cookie; put into the body's environment and later passed to {@link Launcher#kill}. */
private static final String COOKIE_VAR = "JENKINS_SERVER_COOKIE";
@ExportedBean
public static final class PlaceholderTask implements ContinuedTask, Serializable, AccessControlled {
/** keys are {@link #cookie}s; accesses in this class synchronize on the map itself */
private static final Map<String,RunningTask> runningTasks = new HashMap<String,RunningTask>();
/** Context of the owning step; used to match queue items and executables back to the step and to report failures. */
private final StepContext context;
/** Label the task is assigned to; replaced by the computer name the first time the executable runs. */
private String label;
/** Shortcut for {@link #run}. */
private String runId;
/**
* Unique cookie set once the task starts.
* Serves multiple purposes:
* identifies whether we have already invoked the body (since this can be rerun after restart);
* serves as a key for {@link #runningTasks} and {@link Callback} (cannot just have a doneness flag in {@link PlaceholderTask} because multiple copies might be deserialized);
* and allows {@link Launcher#kill} to work.
*/
private String cookie;
/**
* Creates a task for the given step.
*
* @param context context of the owning step
* @param label initial label the task is assigned to
* @param run build the step belongs to; only its externalizable id is stored
*/
PlaceholderTask(StepContext context, String label, Run<?,?> run) {
this.context = context;
this.label = label;
runId = run.getExternalizableId();
}
/**
 * Deserialization hook: a task that already carries a cookie gets a fresh
 * {@link RunningTask} entry registered under that cookie.
 */
private Object readResolve() {
    LOGGER.log(FINE, "deserialized {0}", cookie);
    if (cookie == null) {
        return this;
    }
    synchronized (runningTasks) {
        runningTasks.put(cookie, new RunningTask());
    }
    return this;
}
/** Creates a new {@link PlaceholderExecutable} bound to this task. */
@Override public Queue.Executable createExecutable() throws IOException {
return new PlaceholderExecutable();
}
/**
 * Resolves {@link #label} to a {@link Label}.
 *
 * @return {@code null} when no label is set (or Jenkins is unavailable for an empty label),
 *         the Jenkins self label for an empty label, otherwise the label with that name
 */
@Override
public Label getAssignedLabel() {
    if (label == null) {
        return null;
    }
    if (!label.isEmpty()) {
        return Label.get(label);
    }
    // An empty label means the Jenkins self label.
    final Jenkins jenkins = Jenkins.getInstance();
    return jenkins == null ? null : jenkins.getSelfLabel();
}
/**
 * Looks up the node whose name equals {@link #label}.
 *
 * @return the node, or {@code null} when no label is set or Jenkins is unavailable
 */
@Override
public Node getLastBuiltOn() {
    // Jenkins is only consulted when there is a label to look up.
    final Jenkins jenkins = label == null ? null : Jenkins.getInstance();
    return jenkins == null ? null : jenkins.getNode(label);
}
/** Always {@code false}. */
@Override public boolean isBuildBlocked() {
return false;
}
/** Always {@code null}. */
@Deprecated
@Override public String getWhyBlocked() {
return null;
}
/** Always {@code null}. */
@Override public CauseOfBlockage getCauseOfBlockage() {
return null;
}
/** Always {@code false}. */
@Override public boolean isConcurrentBuild() {
return false;
}
/** Returns a singleton collection containing only this task. */
@Override public Collection<? extends SubTask> getSubTasks() {
return Collections.singleton(this);
}
/**
 * Returns the job of the owning build when that job is itself a {@link Queue.Task};
 * otherwise this task is its own owner.
 */
@Override
public Queue.Task getOwnerTask() {
    final Run<?,?> build = run();
    if (build == null) {
        return this;
    }
    final Job<?,?> job = build.getParent();
    return job instanceof Queue.Task ? (Queue.Task) job : this;
}
/** Always {@code null}. */
@Override public Object getSameNodeConstraint() {
return null;
}
/**
 * Something we can use to check abort and read permissions.
 * Normally this is the ACL of the flow owner's executable (typically a {@link Run}).
 * If the context is not ready, no {@link FlowExecution} is available, the executable is not
 * {@link AccessControlled}, or the lookup fails, the Jenkins root ACL is used as a fallback.
 * This allows an administrator to clean up dead queue items and executor cells.
 * TODO make {@link FlowExecutionOwner} implement {@link AccessControlled}
 * so that an implementation could fall back to checking {@link Job} permission.
 */
@Override
public ACL getACL() {
    try {
        final FlowExecution exec = context.isReady() ? context.get(FlowExecution.class) : null;
        final Queue.Executable executable = exec == null ? null : exec.getOwner().getExecutable();
        if (executable instanceof AccessControlled) {
            return ((AccessControlled) executable).getACL();
        }
        return Jenkins.getActiveInstance().getACL();
    } catch (Exception x) {
        LOGGER.log(FINE, null, x);
        return Jenkins.getActiveInstance().getACL();
    }
}
/** Delegates to {@link #getACL()}. */
@Override public void checkPermission(Permission p) throws AccessDeniedException {
getACL().checkPermission(p);
}
/** Delegates to {@link #getACL()}. */
@Override public boolean hasPermission(Permission p) {
return getACL().hasPermission(p);
}
/** Checks {@link Item#CANCEL} through {@link #checkPermission(Permission)}. */
@Override public void checkAbortPermission() {
checkPermission(Item.CANCEL);
}
/** Tests {@link Item#CANCEL} through {@link #hasPermission(Permission)}. */
@Override public boolean hasAbortPermission() {
return hasPermission(Item.CANCEL);
}
/**
 * Fetches the owning build from the step context.
 *
 * @return the build, or {@code null} when the context is not ready or the lookup failed;
 *         a failed lookup also finishes the running task for this cookie
 */
public @CheckForNull Run<?,?> run() {
    try {
        return context.isReady() ? context.get(Run.class) : null;
    } catch (Exception x) {
        LOGGER.log(FINE, "broken " + cookie, x);
        finish(cookie); // probably broken, so just shut it down
        return null;
    }
}
/**
 * Like {@link #run()}, but falls back to resolving the stored {@link #runId} when the
 * context cannot provide the build.
 */
public @CheckForNull Run<?,?> runForDisplay() {
    final Run<?,?> live = run();
    if (live != null || runId == null) { // runId was not stored prior to 1.13
        return live;
    }
    return Run.fromExternalizableId(runId);
}
/** Returns the URL of the owning build, or an empty string when the build cannot be resolved. */
@Override
public String getUrl() {
    // TODO ideally this would be found via FlowExecution.owner.executable, but how do we check for something with a URL? There is no marker interface for it: JENKINS-26091
    final Run<?,?> build = runForDisplay();
    if (build == null) {
        return "";
    }
    return build.getUrl();
}
/** Returns a localized display name built from the owning build, or the "unknown" variant when the build cannot be resolved. */
@Override
public String getDisplayName() {
    // TODO more generic to check whether FlowExecution.owner.executable is a ModelObject
    final Run<?,?> build = runForDisplay();
    if (build == null) {
        return Messages.DockerNodeStepExecution_PlaceholderTask_displayName_unknown();
    }
    return Messages.DockerNodeStepExecution_PlaceholderTask_displayName(build.getFullDisplayName());
}
/** Same as {@link #getDisplayName()}. */
@Override public String getName() {
return getDisplayName();
}
/** Same as {@link #getDisplayName()}. */
@Override public String getFullDisplayName() {
return getDisplayName();
}
/** Returns the estimated duration of the owning build, or {@code -1} when the build is unavailable. */
@Override
public long getEstimatedDuration() {
    final Run<?,?> build = run();
    if (build == null) {
        return -1;
    }
    // Not accurate if there are multiple slaves in one build, but better than nothing:
    return build.getEstimatedDuration();
}
/** Returns a new, empty {@link ResourceList} on every call. */
@Override public ResourceList getResourceList() {
return new ResourceList();
}
/** Always {@link ACL#SYSTEM}. */
@Override public Authentication getDefaultAuthentication() {
return ACL.SYSTEM; // TODO should pick up credentials from configuring user or something
}
/** Ignores {@code item} and delegates to {@link #getDefaultAuthentication()}. */
@Override public Authentication getDefaultAuthentication(Queue.Item item) {
return getDefaultAuthentication();
}
/** Returns {@code true} once {@link #cookie} has been set. */
@Override public boolean isContinued() {
return cookie != null; // in which case this is after a restart and we still claim the executor
}
/**
 * Removes the running task registered under {@code cookie} and, if its asynchronous execution
 * already exists, submits a job that completes the execution and then asks the launcher to kill
 * processes carrying the cookie variable.
 *
 * @param cookie key into {@link #runningTasks}; {@code null} is ignored
 */
private static void finish(@CheckForNull final String cookie) {
    if (cookie == null) {
        return;
    }
    synchronized (runningTasks) {
        final RunningTask task = runningTasks.remove(cookie);
        if (task == null) {
            LOGGER.log(FINE, "no running task corresponds to {0}", cookie);
            return;
        }
        final AsynchronousExecution execution = task.execution;
        if (execution == null) {
            // JENKINS-30759: finished before asynch execution was even scheduled
            return;
        }
        final Launcher launcher = task.launcher;
        assert launcher != null;
        Timer.get().submit(new Runnable() { // JENKINS-31614
            @Override
            public void run() {
                execution.completed(null);
                try {
                    launcher.kill(Collections.singletonMap(COOKIE_VAR, cookie));
                } catch (ChannelClosedException | RequestAbortedException x) {
                    // fine: Jenkins was shutting down, or the slave was exiting and it is too late to kill subprocesses
                } catch (Exception x) {
                    LOGGER.log(Level.WARNING, "failed to shut down " + cookie, x);
                }
            }
        });
    }
}
/**
 * Called when the body closure is complete.
 * Releases the workspace lease handed over at construction time and finishes the running task
 * registered under the cookie.
 */
@SuppressFBWarnings(value="SE_BAD_FIELD", justification="lease is pickled")
private static final class Callback extends BodyExecutionCallback.TailCall {
    /** Key of the running task this callback finishes. */
    private final String cookie;
    /** Workspace lease of the body; set to {@code null} once released. */
    private WorkspaceList.Lease lease;

    Callback(String cookie, WorkspaceList.Lease lease) {
        this.cookie = cookie;
        this.lease = lease;
    }

    /**
     * Releases the workspace lease and then finishes the running task.
     * The task is finished even when releasing the lease fails; otherwise the running-task
     * entry would stay registered and its asynchronous execution would never be completed.
     */
    @Override protected void finished(StepContext context) throws Exception {
        LOGGER.log(FINE, "finished {0}", cookie);
        try {
            if (lease != null) {
                lease.release();
                lease = null;
            }
        } finally {
            finish(cookie);
        }
    }
}
/**
 * Occupies {@link Executor} while workflow uses this slave.
 */
@ExportedBean
private final class PlaceholderExecutable implements ContinuableExecutable {

    /**
     * Runs on the executor thread that picked up the task.
     * <p>
     * On the first run (no cookie yet) it creates the cookie, switches the task label to the
     * computer name, builds the environment, allocates a workspace and starts the body with a
     * {@link Callback}. On a later run (cookie already set) it only waits for the task to complete.
     * In both cases it then hands the executor over by throwing an {@link AsynchronousExecution},
     * unless the running task has already been finished.
     * <p>
     * If setup fails, the step context is failed; any workspace lease already allocated is
     * released and the running-task entry for the cookie is removed, so nothing is left behind.
     */
    @Override public void run() {
        final TaskListener listener;
        Launcher launcher;
        final Run<?, ?> r;
        // Declared outside the try block so that the failure path can give the workspace back.
        WorkspaceList.Lease lease = null;
        try {
            Executor exec = Executor.currentExecutor();
            if (exec == null) {
                throw new IllegalStateException("running task without associated executor thread");
            }
            Computer computer = exec.getOwner();
            // Set up context for other steps inside this one.
            Node node = computer.getNode();
            if (node == null) {
                throw new IllegalStateException("running computer lacks a node");
            }
            listener = context.get(TaskListener.class);
            launcher = node.createLauncher(listener);
            r = context.get(Run.class);
            if (cookie == null) {
                // First time around.
                cookie = UUID.randomUUID().toString();
                // Switches the label to a self-label, so if the executable is killed and restarted via ExecutorPickle, it will run on the same node:
                label = computer.getName();
                EnvVars env = computer.getEnvironment();
                env.overrideAll(computer.buildEnvironment(listener));
                env.put(COOKIE_VAR, cookie);
                if (exec.getOwner() instanceof Jenkins.MasterComputer) {
                    env.put("NODE_NAME", "master");
                } else {
                    env.put("NODE_NAME", label);
                }
                env.put("EXECUTOR_NUMBER", String.valueOf(exec.getNumber()));
                synchronized (runningTasks) {
                    runningTasks.put(cookie, new RunningTask());
                }
                // For convenience, automatically allocate a workspace, like WorkspaceStep would:
                Job<?,?> j = r.getParent();
                if (!(j instanceof TopLevelItem)) {
                    throw new Exception(j + " must be a top-level job");
                }
                FilePath p = node.getWorkspaceFor((TopLevelItem) j);
                if (p == null) {
                    throw new IllegalStateException(node + " is offline");
                }
                lease = computer.getWorkspaceList().allocate(p);
                FilePath workspace = lease.path;
                FlowNode flowNode = context.get(FlowNode.class);
                flowNode.addAction(new WorkspaceActionImpl(workspace, flowNode));
                listener.getLogger().println("Running on " + computer.getDisplayName() + " in " + workspace); // TODO hyperlink
                context.newBodyInvoker()
                        .withContexts(exec, computer, env, workspace)
                        .withCallback(new Callback(cookie, lease))
                        .start();
                LOGGER.log(FINE, "started {0}", cookie);
            } else {
                // just rescheduled after a restart; wait for task to complete
                LOGGER.log(FINE, "resuming {0}", cookie);
            }
        } catch (Exception x) {
            // Undo partial setup: without this a failed start keeps the workspace locked and
            // leaves a running-task entry behind, which makes willContinue() report true forever.
            try {
                if (lease != null) {
                    lease.release();
                }
            } finally {
                finish(cookie);
                context.onFailure(x);
            }
            return;
        }
        // wait until the invokeBodyLater call above completes and notifies our Callback object
        synchronized (runningTasks) {
            LOGGER.log(FINE, "waiting on {0}", cookie);
            RunningTask runningTask = runningTasks.get(cookie);
            if (runningTask == null) {
                LOGGER.log(FINE, "running task apparently finished quickly for {0}", cookie);
                return;
            }
            assert runningTask.execution == null;
            assert runningTask.launcher == null;
            // The launcher is stored before the execution: finish() only uses it once execution is set.
            runningTask.launcher = launcher;
            runningTask.execution = new AsynchronousExecution() {
                @Override public void interrupt(boolean forShutdown) {
                    if (forShutdown) {
                        return;
                    }
                    LOGGER.log(FINE, "interrupted {0}", cookie);
                    // TODO save the BodyExecution somehow and call .cancel() here; currently we just interrupt the build as a whole:
                    Executor masterExecutor = r.getExecutor();
                    if (masterExecutor != null) {
                        masterExecutor.interrupt();
                    } else { // ?
                        super.getExecutor().recordCauseOfInterruption(r, listener);
                    }
                }
                @Override public boolean blocksRestart() {
                    return false;
                }
                @Override public boolean displayCell() {
                    return true;
                }
            };
            throw runningTask.execution;
        }
    }

    /** Returns the enclosing {@link PlaceholderTask}. */
    @Override public PlaceholderTask getParent() {
        return PlaceholderTask.this;
    }

    /** Delegates to the enclosing task. */
    @Override public long getEstimatedDuration() {
        return getParent().getEstimatedDuration();
    }

    /** Returns {@code true} while a running task is still registered under the cookie. */
    @Override public boolean willContinue() {
        synchronized (runningTasks) {
            return runningTasks.containsKey(cookie);
        }
    }

    @Restricted(DoNotUse.class) // for Jelly
    public @CheckForNull Executor getExecutor() {
        return Executor.of(this);
    }

    @Restricted(NoExternalUse.class) // for Jelly and toString
    public String getUrl() {
        return PlaceholderTask.this.getUrl(); // we hope this has a console.jelly
    }

    @Override public String toString() {
        return "PlaceholderExecutable:" + getUrl() + ":" + cookie;
    }

    private static final long serialVersionUID = 1L;
}
}
/** Serialization version of this step execution. */
private static final long serialVersionUID = 1L;
/** Logger shared by this class and its nested classes. */
private static final Logger LOGGER = Logger.getLogger(DockerNodeStepExecution.class.getName());
}