From 6802fe0f0b82f188d8e2865118548afb6788b668 Mon Sep 17 00:00:00 2001 From: James Nord Date: Tue, 16 Apr 2024 16:35:28 +0100 Subject: [PATCH] [JENKINS-69277] relaunch an agent if the connection is broken (#152) * [JENKINS-69277] relaunch an agent if the connection is broken Even though this is a OnceRetentionStrategy we expect a durable task to be able to run to completion even if the connection is broken. So we relaunch the agent if the agent is offline and it is not done. We are not forcing a reconnect, as this allows any existing connect operation (which could be the first one, to continue running). * [JENKINS-69277] avoid spotbugs warning Whilst spotbugs complained about the lack of synchronization on terminating we do not want to put the connect call inside the synchronized block as it may be long running, or cause some form of re-entry on a different thread. whilst the locking is not perfect, we may try to relaunch and agent just as it was done, we have set the computer to not accept any tasks so the agent should become idle, and be terminated (either in the current termination block) or the next time the check runs. * check if launch is supported early on * [JENKINS-69277] Add a unit test to cover OnceRetentionStrategy iAdds a unit test to cover both the happy case of a cloud agent using OnceRetentionStrategy and the unhappy case where the agent dies and needs restarting. --------- Co-authored-by: Carroll Chiou --- pom.xml | 27 +++ .../executors/OnceRetentionStrategy.java | 14 +- .../executors/OnceRetentionStrategyTest.java | 227 ++++++++++++++++++ 3 files changed, 267 insertions(+), 1 deletion(-) create mode 100644 src/test/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategyTest.java diff --git a/pom.xml b/pom.xml index e881bd08..c1ee87a0 100644 --- a/pom.xml +++ b/pom.xml @@ -61,6 +61,33 @@ ssh-slaves test + + org.jenkins-ci.plugins.workflow + workflow-job + test + + + org.jenkins-ci.plugins.workflow + workflow-durable-task-step + test + + + org.jenkins-ci.plugins.workflow + workflow-cps + test + + + org.hamcrest + hamcrest + 2.2 + test + + + org.awaitility + awaitility + 4.2.0 + test + diff --git a/src/main/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategy.java b/src/main/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategy.java index 29630c43..72c0f391 100644 --- a/src/main/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategy.java +++ b/src/main/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategy.java @@ -73,7 +73,19 @@ public long check(final AbstractCloudComputer c) { done(c); } } - + // if the agent is not done and it is offline we should trigger a relaunch + // do not do this whilst we hold the lock, as connect is documented only as "This method may return immediately" + // so lets not presume any asynchronous nature and instead that this could be a long blocking op. + boolean shouldRestart = false; + synchronized (this) { + if (!terminating && c.isOffline() && c.isLaunchSupported() && !disabled) { + shouldRestart = true; + } + } + if (shouldRestart) { + LOGGER.log(Level.FINE, "Attempting relaunch of {0}", c.getName()); + c.connect(false); + } // Return one because we want to check every minute if idle. return 1; } diff --git a/src/test/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategyTest.java b/src/test/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategyTest.java new file mode 100644 index 00000000..318bdad0 --- /dev/null +++ b/src/test/java/org/jenkinsci/plugins/durabletask/executors/OnceRetentionStrategyTest.java @@ -0,0 +1,227 @@ +package org.jenkinsci.plugins.durabletask.executors; + +import java.io.File; +import java.io.IOException; +import java.time.Duration; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.jenkinsci.plugins.durabletask.BourneShellScript; +import org.jenkinsci.plugins.durabletask.WindowsBatchScript; +import org.jenkinsci.plugins.workflow.cps.CpsFlowDefinition; +import org.jenkinsci.plugins.workflow.job.WorkflowJob; +import org.jenkinsci.plugins.workflow.job.WorkflowRun; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.jvnet.hudson.test.BuildWatcher; +import org.jvnet.hudson.test.Issue; +import org.jvnet.hudson.test.JenkinsRule; +import org.jvnet.hudson.test.LoggerRule; +import org.jvnet.hudson.test.SimpleCommandLauncher; +import hudson.Functions; +import hudson.model.Computer; +import hudson.model.LoadStatistics; +import hudson.model.TaskListener; +import hudson.model.Descriptor.FormException; +import hudson.model.labels.LabelAtom; +import hudson.remoting.RequestAbortedException; +import hudson.remoting.Which; +import hudson.slaves.AbstractCloudComputer; +import hudson.slaves.AbstractCloudSlave; +import hudson.slaves.Cloud; +import hudson.slaves.NodeProvisioner.PlannedNode; +import hudson.util.RemotingDiagnostics; +import jenkins.model.Jenkins; + +import static org.awaitility.Awaitility.await; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.arrayWithSize; +import static org.hamcrest.Matchers.empty; +import static org.hamcrest.Matchers.is; + +public class OnceRetentionStrategyTest { + + private final static Logger LOGGER = Logger.getLogger(OnceRetentionStrategyTest.class.getName()); + + @Rule + public LoggerRule lr = new LoggerRule().record(LOGGER, Level.ALL); + + @Rule + public JenkinsRule jr = new JenkinsRule(); + + @Rule + public TemporaryFolder temp = new TemporaryFolder(); + + @Rule + public BuildWatcher bw = new BuildWatcher(); + + @BeforeClass + public static void setupStatics() { + LoadStatistics.CLOCK = 100; // 100ms for the LoadStatistics so we have quick provisioning + // deadConnectionsShouldReLaunched seems to fail always when run as part of the suite - but passes always when run individually without the binary wrapper + // (because the ping command and its parent cmd.exe get killed :-o ) + WindowsBatchScript.USE_BINARY_WRAPPER=true; + BourneShellScript.USE_BINARY_WRAPPER=true; + } + + @Test + public void withoutRestartNodesAreCleaned() throws Exception { + MySimpleCloud sc = new MySimpleCloud("simples", temp.newFolder()); + + jr.jenkins.clouds.add(sc); + jr.jenkins.setNumExecutors(0); + //jr.jenkins.getLabel("simples").nodeProvisioner. = new ImmediateNodeProvisioner(); + + WorkflowJob foo = jr.jenkins.createProject(WorkflowJob.class, "foo"); + foo.setDefinition(new CpsFlowDefinition(String.join(System.lineSeparator(), + "node('simples') {", + Functions.isWindows() ? " bat 'echo hello'" : " sh 'echo hello'", + "}"), + true)); + WorkflowRun run = foo.scheduleBuild2(0).waitForStart(); + + jr.waitForCompletion(run); + jr.assertBuildStatusSuccess(run); + + // agent should have been removed and we should have no agents but this happens on a different Thread.. + await().atMost(Duration.ofSeconds(5)).until(() -> jr.jenkins.getNodes().isEmpty()); + assertThat(jr.jenkins.getNodes(), is(empty())); + assertThat(jr.jenkins.getComputers(), arrayWithSize(1)); // the Jenkins computer itself + } + + + @Test + @Issue("JENKINS-69277") + public void deadConnectionsShouldReLaunched() throws Exception { + MySimpleCloud sc = new MySimpleCloud("simples", temp.newFolder()); + + jr.jenkins.clouds.add(sc); + jr.jenkins.setNumExecutors(0); + //jr.jenkins.getLabel("simples").nodeProvisioner. = new ImmediateNodeProvisioner(); + + WorkflowJob foo = jr.jenkins.createProject(WorkflowJob.class, "foo"); + foo.setDefinition(new CpsFlowDefinition(String.join(System.lineSeparator(), + "node('simples') {", + Functions.isWindows() ? " bat 'echo hello & ping -w 1000 -n 30 localhost'" + : " sh 'echo hello & sleep 30'", + "}"), + true)); + WorkflowRun run = foo.scheduleBuild2(0).waitForStart(); + LOGGER.log(Level.FINE, "waiting for hello"); + + jr.waitForMessage("hello", run); + LOGGER.log(Level.FINE, "killing da agent"); + + killAgent(); + // retention strategy will kick in but may take up to 60 seconds. + jr.waitForCompletion(run); + jr.assertBuildStatusSuccess(run); + + // agent should have been removed and we should have no agents but this happens on a different Thread.. + await().atMost(Duration.ofSeconds(5)).until(() -> jr.jenkins.getNodes().isEmpty()); + assertThat(jr.jenkins.getNodes(), is(empty())); + assertThat(jr.jenkins.getComputers(), arrayWithSize(1)); // the Jenkins computer itself + } + + private void killAgent() throws IOException, InterruptedException { + Computer[] computers = Jenkins.get().getComputers(); + for (Computer c : computers) { + if (c instanceof MySimpleCloudComputer) { + LOGGER.log(Level.FINE, "Asking {0} to commit suicide", c); + try { + RemotingDiagnostics.executeGroovy("Runtime.getRuntime().halt(1)", c.getChannel()); + } catch (RequestAbortedException ignored) { + // we have just asked the Computer to commit suicide so this is expected. + } + } + } + } + + public static class MySimpleCloud extends Cloud { + + private final LabelAtom label; + private final File agentRootDir; + private int count = 0; + + public MySimpleCloud(String name, File agentRootDir) { + super(name); + this.agentRootDir = agentRootDir; + label = Jenkins.get().getLabelAtom(name); + } + + @Override + public boolean canProvision(CloudState state) { + boolean retVal = state.getLabel().matches(Collections.singleton(label)); + LOGGER.log(Level.FINE, "SimpleCloud.canProvision({0},{1}) -> {2}", new Object[] {state.getLabel(), state.getAdditionalPlannedCapacity(), retVal}); + return retVal; + } + + @Override + public Collection provision(CloudState state, int excessWorkload) { + LOGGER.log(Level.FINE, "SimpleCloud.provision(({0}, {1}), {2}", new Object[] {state.getLabel(), state.getAdditionalPlannedCapacity(), excessWorkload}); + Collection retVal = new HashSet(); + for (int i =0; i < excessWorkload - state.getAdditionalPlannedCapacity(); ++i) { + String agentName; + synchronized (this) { + if (count != 0) { + // sometimes we end up with 2 agents due to the lovely cloud API. + LOGGER.log(Level.FINE, "not provisioning another agent as we have already provisioned."); + return Collections.emptyList(); + } + agentName = "cloud-"+name+"-"+(++count); + } + + PlannedNode n = new PlannedNode(agentName, + Computer.threadPoolForRemoting.submit( () -> new MySimpleCloudSlave(agentName, new File(agentRootDir, agentName), label)), + 1); + LOGGER.log(Level.FINE, "SimpleCloud.provision() -> Added planned node for {0}", name); + retVal.add(n); + } + return retVal; + } + } + + public static class MySimpleCloudSlave extends AbstractCloudSlave { + + private final LabelAtom label; + + public MySimpleCloudSlave(String name, File remoteFS, LabelAtom label) throws FormException, IOException { + super(name, + remoteFS.getAbsolutePath(), + new SimpleCommandLauncher("java -Xmx512m -Djava.awt.headless=true -jar " + Which.jarFile(hudson.remoting.Launcher.class) + " -jar-cache " + remoteFS + " -workDir " + remoteFS)); + this.setRetentionStrategy(new OnceRetentionStrategy(1)); + LOGGER.log(Level.FINE, "SimpleCloudSlave()"); + this.label = label; + } + + @Override + public AbstractCloudComputer createComputer() { + LOGGER.log(Level.FINE, "SimpleCloudSlave.createComputer()"); + return new MySimpleCloudComputer(this); + } + + @Override + protected void _terminate(TaskListener listener) throws IOException, InterruptedException { + // nothing to do here + } + + @Override + public Set getAssignedLabels() { + return new HashSet(Arrays.asList(label, getSelfLabel())); + } + } + + public static class MySimpleCloudComputer extends AbstractCloudComputer { + + public MySimpleCloudComputer(MySimpleCloudSlave slave) { + super(slave); + } + } + +}