diff --git a/pom.xml b/pom.xml
index e881bd08..c1ee87a0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -61,6 +61,33 @@
ssh-slaves
test
+
+ org.jenkins-ci.plugins.workflow
+ workflow-job
+ test
+
+
+ org.jenkins-ci.plugins.workflow
+ workflow-durable-task-step
+ test
+
+
+ org.jenkins-ci.plugins.workflow
+ workflow-cps
+ test
+
+
+ org.hamcrest
+ hamcrest
+ 2.2
+ test
+
+
+ org.awaitility
+ awaitility
+ 4.2.0
+ test
+
diff --git a/src/main/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategy.java b/src/main/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategy.java
index 29630c43..72c0f391 100644
--- a/src/main/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategy.java
+++ b/src/main/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategy.java
@@ -73,7 +73,19 @@ public long check(final AbstractCloudComputer c) {
done(c);
}
}
-
+ // if the agent is not done and it is offline we should trigger a relaunch
+ // do not do this whilst we hold the lock, as connect is documented only as "This method may return immediately"
+ // so lets not presume any asynchronous nature and instead that this could be a long blocking op.
+ boolean shouldRestart = false;
+ synchronized (this) {
+ if (!terminating && c.isOffline() && c.isLaunchSupported() && !disabled) {
+ shouldRestart = true;
+ }
+ }
+ if (shouldRestart) {
+ LOGGER.log(Level.FINE, "Attempting relaunch of {0}", c.getName());
+ c.connect(false);
+ }
// Return one because we want to check every minute if idle.
return 1;
}
diff --git a/src/test/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategyTest.java b/src/test/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategyTest.java
new file mode 100644
index 00000000..318bdad0
--- /dev/null
+++ b/src/test/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategyTest.java
@@ -0,0 +1,227 @@
+package org.jenkinsci.plugins.durabletask.executors;
+
+import java.io.File;
+import java.io.IOException;
+import java.time.Duration;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.jenkinsci.plugins.durabletask.BourneShellScript;
+import org.jenkinsci.plugins.durabletask.WindowsBatchScript;
+import org.jenkinsci.plugins.workflow.cps.CpsFlowDefinition;
+import org.jenkinsci.plugins.workflow.job.WorkflowJob;
+import org.jenkinsci.plugins.workflow.job.WorkflowRun;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.jvnet.hudson.test.BuildWatcher;
+import org.jvnet.hudson.test.Issue;
+import org.jvnet.hudson.test.JenkinsRule;
+import org.jvnet.hudson.test.LoggerRule;
+import org.jvnet.hudson.test.SimpleCommandLauncher;
+import hudson.Functions;
+import hudson.model.Computer;
+import hudson.model.LoadStatistics;
+import hudson.model.TaskListener;
+import hudson.model.Descriptor.FormException;
+import hudson.model.labels.LabelAtom;
+import hudson.remoting.RequestAbortedException;
+import hudson.remoting.Which;
+import hudson.slaves.AbstractCloudComputer;
+import hudson.slaves.AbstractCloudSlave;
+import hudson.slaves.Cloud;
+import hudson.slaves.NodeProvisioner.PlannedNode;
+import hudson.util.RemotingDiagnostics;
+import jenkins.model.Jenkins;
+
+import static org.awaitility.Awaitility.await;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.arrayWithSize;
+import static org.hamcrest.Matchers.empty;
+import static org.hamcrest.Matchers.is;
+
+public class OnceRetentionStrategyTest {
+
+ private final static Logger LOGGER = Logger.getLogger(OnceRetentionStrategyTest.class.getName());
+
+ @Rule
+ public LoggerRule lr = new LoggerRule().record(LOGGER, Level.ALL);
+
+ @Rule
+ public JenkinsRule jr = new JenkinsRule();
+
+ @Rule
+ public TemporaryFolder temp = new TemporaryFolder();
+
+ @Rule
+ public BuildWatcher bw = new BuildWatcher();
+
+ @BeforeClass
+ public static void setupStatics() {
+ LoadStatistics.CLOCK = 100; // 100ms for the LoadStatistics so we have quick provisioning
+ // deadConnectionsShouldReLaunched seems to fail always when run as part of the suite - but passes always when run individually without the binary wrapper
+ // (because the ping command and its parent cmd.exe get killed :-o )
+ WindowsBatchScript.USE_BINARY_WRAPPER=true;
+ BourneShellScript.USE_BINARY_WRAPPER=true;
+ }
+
+ @Test
+ public void withoutRestartNodesAreCleaned() throws Exception {
+ MySimpleCloud sc = new MySimpleCloud("simples", temp.newFolder());
+
+ jr.jenkins.clouds.add(sc);
+ jr.jenkins.setNumExecutors(0);
+ //jr.jenkins.getLabel("simples").nodeProvisioner. = new ImmediateNodeProvisioner();
+
+ WorkflowJob foo = jr.jenkins.createProject(WorkflowJob.class, "foo");
+ foo.setDefinition(new CpsFlowDefinition(String.join(System.lineSeparator(),
+ "node('simples') {",
+ Functions.isWindows() ? " bat 'echo hello'" : " sh 'echo hello'",
+ "}"),
+ true));
+ WorkflowRun run = foo.scheduleBuild2(0).waitForStart();
+
+ jr.waitForCompletion(run);
+ jr.assertBuildStatusSuccess(run);
+
+ // agent should have been removed and we should have no agents but this happens on a different Thread..
+ await().atMost(Duration.ofSeconds(5)).until(() -> jr.jenkins.getNodes().isEmpty());
+ assertThat(jr.jenkins.getNodes(), is(empty()));
+ assertThat(jr.jenkins.getComputers(), arrayWithSize(1)); // the Jenkins computer itself
+ }
+
+
+ @Test
+ @Issue("JENKINS-69277")
+ public void deadConnectionsShouldReLaunched() throws Exception {
+ MySimpleCloud sc = new MySimpleCloud("simples", temp.newFolder());
+
+ jr.jenkins.clouds.add(sc);
+ jr.jenkins.setNumExecutors(0);
+ //jr.jenkins.getLabel("simples").nodeProvisioner. = new ImmediateNodeProvisioner();
+
+ WorkflowJob foo = jr.jenkins.createProject(WorkflowJob.class, "foo");
+ foo.setDefinition(new CpsFlowDefinition(String.join(System.lineSeparator(),
+ "node('simples') {",
+ Functions.isWindows() ? " bat 'echo hello & ping -w 1000 -n 30 localhost'"
+ : " sh 'echo hello & sleep 30'",
+ "}"),
+ true));
+ WorkflowRun run = foo.scheduleBuild2(0).waitForStart();
+ LOGGER.log(Level.FINE, "waiting for hello");
+
+ jr.waitForMessage("hello", run);
+ LOGGER.log(Level.FINE, "killing da agent");
+
+ killAgent();
+ // retention strategy will kick in but may take up to 60 seconds.
+ jr.waitForCompletion(run);
+ jr.assertBuildStatusSuccess(run);
+
+ // agent should have been removed and we should have no agents but this happens on a different Thread..
+ await().atMost(Duration.ofSeconds(5)).until(() -> jr.jenkins.getNodes().isEmpty());
+ assertThat(jr.jenkins.getNodes(), is(empty()));
+ assertThat(jr.jenkins.getComputers(), arrayWithSize(1)); // the Jenkins computer itself
+ }
+
+ private void killAgent() throws IOException, InterruptedException {
+ Computer[] computers = Jenkins.get().getComputers();
+ for (Computer c : computers) {
+ if (c instanceof MySimpleCloudComputer) {
+ LOGGER.log(Level.FINE, "Asking {0} to commit suicide", c);
+ try {
+ RemotingDiagnostics.executeGroovy("Runtime.getRuntime().halt(1)", c.getChannel());
+ } catch (RequestAbortedException ignored) {
+ // we have just asked the Computer to commit suicide so this is expected.
+ }
+ }
+ }
+ }
+
+ public static class MySimpleCloud extends Cloud {
+
+ private final LabelAtom label;
+ private final File agentRootDir;
+ private int count = 0;
+
+ public MySimpleCloud(String name, File agentRootDir) {
+ super(name);
+ this.agentRootDir = agentRootDir;
+ label = Jenkins.get().getLabelAtom(name);
+ }
+
+ @Override
+ public boolean canProvision(CloudState state) {
+ boolean retVal = state.getLabel().matches(Collections.singleton(label));
+ LOGGER.log(Level.FINE, "SimpleCloud.canProvision({0},{1}) -> {2}", new Object[] {state.getLabel(), state.getAdditionalPlannedCapacity(), retVal});
+ return retVal;
+ }
+
+ @Override
+ public Collection provision(CloudState state, int excessWorkload) {
+ LOGGER.log(Level.FINE, "SimpleCloud.provision(({0}, {1}), {2}", new Object[] {state.getLabel(), state.getAdditionalPlannedCapacity(), excessWorkload});
+ Collection retVal = new HashSet();
+ for (int i =0; i < excessWorkload - state.getAdditionalPlannedCapacity(); ++i) {
+ String agentName;
+ synchronized (this) {
+ if (count != 0) {
+ // sometimes we end up with 2 agents due to the lovely cloud API.
+ LOGGER.log(Level.FINE, "not provisioning another agent as we have already provisioned.");
+ return Collections.emptyList();
+ }
+ agentName = "cloud-"+name+"-"+(++count);
+ }
+
+ PlannedNode n = new PlannedNode(agentName,
+ Computer.threadPoolForRemoting.submit( () -> new MySimpleCloudSlave(agentName, new File(agentRootDir, agentName), label)),
+ 1);
+ LOGGER.log(Level.FINE, "SimpleCloud.provision() -> Added planned node for {0}", name);
+ retVal.add(n);
+ }
+ return retVal;
+ }
+ }
+
+ public static class MySimpleCloudSlave extends AbstractCloudSlave {
+
+ private final LabelAtom label;
+
+ public MySimpleCloudSlave(String name, File remoteFS, LabelAtom label) throws FormException, IOException {
+ super(name,
+ remoteFS.getAbsolutePath(),
+ new SimpleCommandLauncher("java -Xmx512m -Djava.awt.headless=true -jar " + Which.jarFile(hudson.remoting.Launcher.class) + " -jar-cache " + remoteFS + " -workDir " + remoteFS));
+ this.setRetentionStrategy(new OnceRetentionStrategy(1));
+ LOGGER.log(Level.FINE, "SimpleCloudSlave()");
+ this.label = label;
+ }
+
+ @Override
+ public AbstractCloudComputer createComputer() {
+ LOGGER.log(Level.FINE, "SimpleCloudSlave.createComputer()");
+ return new MySimpleCloudComputer(this);
+ }
+
+ @Override
+ protected void _terminate(TaskListener listener) throws IOException, InterruptedException {
+ // nothing to do here
+ }
+
+ @Override
+ public Set getAssignedLabels() {
+ return new HashSet(Arrays.asList(label, getSelfLabel()));
+ }
+ }
+
+ public static class MySimpleCloudComputer extends AbstractCloudComputer {
+
+ public MySimpleCloudComputer(MySimpleCloudSlave slave) {
+ super(slave);
+ }
+ }
+
+}