From f18b5bf95fe69e0bc24441065ebb214473bde8f0 Mon Sep 17 00:00:00 2001 From: Sandra Parsick Date: Mon, 14 Feb 2022 17:40:08 +0100 Subject: [PATCH] Many jobs based on the same agent template produce many failed deployment (#102) Co-authored-by: Tim Jacomb <21194782+timja@users.noreply.github.com> --- .../jenkins/containeragents/aci/AciCloud.java | 52 +++++++++++++------ 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/src/main/java/com/microsoft/jenkins/containeragents/aci/AciCloud.java b/src/main/java/com/microsoft/jenkins/containeragents/aci/AciCloud.java index f46cde7..4d6f24b 100644 --- a/src/main/java/com/microsoft/jenkins/containeragents/aci/AciCloud.java +++ b/src/main/java/com/microsoft/jenkins/containeragents/aci/AciCloud.java @@ -131,7 +131,7 @@ public Collection provision(CloudState cloudState, return agent; } catch (Exception e) { LOGGER.log(Level.WARNING, "AciCloud: Provision agent {0} failed: {1}", - new Object[] {agent.getNodeName(), e.getMessage()}); + new Object[] {agent.getNodeName(), e}); agent.terminate(); @@ -177,8 +177,22 @@ public AciContainerTemplate getFirstTemplate(Label label) { public void addIpEnv(AciAgent agent) throws Exception { AzureResourceManager azureResourceManager = getAzureClient(); - String ip = azureResourceManager.containerGroups() - .getByResourceGroup(resourceGroup, agent.getNodeName()).ipAddress(); + // Workaround for https://github.com/Azure/azure-sdk-for-java/issues/27083 + String ip = null; + boolean nullIsThrown; + do { + try { + ip = azureResourceManager.containerGroups() + .getByResourceGroup(resourceGroup, agent.getNodeName()).ipAddress(); + nullIsThrown = false; + } catch (NullPointerException e) { + LOGGER.log(Level.WARNING, "During asking for IP address of Agent {0} NullPointerException is thrown," + + "but it is ignored.", agent.getNodeName()); + nullIsThrown = true; + final int retryInterval = 5 * 1000; + Thread.sleep(retryInterval); + } + } while (nullIsThrown); EnvironmentVariablesNodeProperty ipEnv = new EnvironmentVariablesNodeProperty( new EnvironmentVariablesNodeProperty.Entry("IP", ip) @@ -215,19 +229,25 @@ private void waitToOnline(AciAgent agent, int startupTimeout, StopWatch stopWatc if (computer == null) { throw new IllegalStateException("Agent node has been deleted"); } - ContainerGroup containerGroup = - azureResourceManager.containerGroups().getByResourceGroup(resourceGroup, agent.getNodeName()); - - if (containerGroup.containers().containsKey(agent.getNodeName()) - && containerGroup.containers().get(agent.getNodeName()).instanceView().currentState().state() - .equals("Terminated")) { - - // there doesn't seem to be anyway to get debug information with the current API version in the SDK - // logs and events just return nothing - // while debugging with the CLI the best way I could find was 'attaching' to the container - // see https://github.com/Azure/azure-libraries-for-java/issues/1379 - throw new IllegalStateException("ACI container terminated, see the Azure portal / " - + "CLI for more information"); + try { + ContainerGroup containerGroup = + azureResourceManager.containerGroups().getByResourceGroup(resourceGroup, agent.getNodeName()); + + if (containerGroup.containers().containsKey(agent.getNodeName()) + && containerGroup.containers().get(agent.getNodeName()).instanceView().currentState().state() + .equals("Terminated")) { + + // there doesn't seem to be anyway to get debug information with the current API version in the SDK + // logs and events just return nothing + // while debugging with the CLI the best way I could find was 'attaching' to the container + // see https://github.com/Azure/azure-libraries-for-java/issues/1379 + throw new IllegalStateException("ACI container terminated, see the Azure portal / " + + "CLI for more information"); + } + } catch (NullPointerException e) { + // workaround for https://github.com/Azure/azure-sdk-for-java/issues/27083 + LOGGER.log(Level.WARNING, "Waiting for Agent {0} produces a NullPointerException, " + + "but it is ignored.", agent.getNodeName()); } if (computer.isOnline()) {