Skip to content

Commit 11f09de

Browse files
authored
Merge branch 'master' into upstream-webhook-mitigations
2 parents e590488 + 9428996 commit 11f09de

File tree

14 files changed

+1280
-1046
lines changed

14 files changed

+1280
-1046
lines changed

gradle.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
fiatVersion=1.50.0
2-
korkVersion=7.239.0
2+
korkVersion=7.240.0
33
kotlinVersion=1.6.21
44
org.gradle.parallel=true
55
org.gradle.jvmargs=-Xmx4g

orca-clouddriver/src/main/groovy/com/netflix/spinnaker/orca/clouddriver/tasks/job/WaitOnJobCompletion.groovy

Lines changed: 208 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,11 @@ public class WaitOnJobCompletion implements CloudProviderAware, OverridableTimeo
147147
InputStream jobStream
148148
retrySupport.retry({
149149
jobStream = katoRestService.collectJob(appName, account, location, name).body.in()
150-
}, 6, 5000, false) // retry for 30 seconds
150+
},
151+
configProperties.getJobStatusRetry().maxAttempts,
152+
Duration.ofMillis(configProperties.getJobStatusRetry().getBackOffInMs()),
153+
configProperties.getJobStatusRetry().exponentialBackoffEnabled
154+
)
151155
Map job = objectMapper.readValue(jobStream, new TypeReference<Map>() {})
152156
outputs.jobStatus = job
153157

@@ -165,24 +169,16 @@ public class WaitOnJobCompletion implements CloudProviderAware, OverridableTimeo
165169

166170
if ((status == ExecutionStatus.SUCCEEDED) || (status == ExecutionStatus.TERMINAL)) {
167171
if (stage.context.propertyFile) {
168-
Map<String, Object> properties = [:]
169-
try {
170-
retrySupport.retry({
171-
properties = katoRestService.getFileContents(appName, account, location, name, stage.context.propertyFile)
172-
}, 6, 5000, false) // retry for 30 seconds
173-
} catch (Exception e) {
174-
if (status == ExecutionStatus.SUCCEEDED) {
175-
throw new ConfigurationException("Property File: ${stage.context.propertyFile} contents could not be retrieved. Error: " + e)
176-
}
177-
log.warn("failed to get file contents for ${appName}, account: ${account}, namespace: ${location}, " +
178-
"manifest: ${name} from propertyFile: ${stage.context.propertyFile}. Error: ", e)
179-
}
180-
181-
if (properties.size() == 0) {
182-
if (status == ExecutionStatus.SUCCEEDED) {
183-
throw new ConfigurationException("Expected properties file ${stage.context.propertyFile} but it was either missing, empty or contained invalid syntax")
184-
}
185-
} else if (properties.size() > 0) {
172+
Map<String, Object> properties = getPropertyFileContents(
173+
job,
174+
appName,
175+
status,
176+
account,
177+
location,
178+
name,
179+
stage.context.propertyFile as String)
180+
181+
if (properties.size() > 0) {
186182
outputs << properties
187183
outputs.propertyFileContents = properties
188184
}
@@ -251,4 +247,197 @@ public class WaitOnJobCompletion implements CloudProviderAware, OverridableTimeo
251247
}
252248
throw new JobFailedException(errorMessage)
253249
}
250+
251+
/**
252+
* <p>this method attempts to get property file from clouddriver and then parses its contents. Depending
253+
* on the job itself, it could be handled by any job provider in clouddriver. This method should only be
254+
* called for jobs with ExecutionStatus as either SUCCEEDED or TERMINAL.
255+
*
256+
* <p>If property file contents could not be retrieved from clouddriver, then the error handling depends
257+
* on the job's ExecutionStatus. If it is SUCCEEDED, then an exception is thrown. Otherwise, no exception
258+
* is thrown since we don't want to mask the real reason behind the job failure.
259+
*
260+
* <p>If ExecutionStatus == SUCCEEDED, and especially for kubernetes run jobs, it can so happen that a user
261+
* has configured the job spec to run 1 pod, have completions and parallelism == 1, and
262+
* restartPolicy == Never. Despite that, kubernetes may end up running another pod as stated here:
263+
* https://kubernetes.io/docs/concepts/workloads/controllers/job/#handling-pod-and-container-failures
264+
* In such a scenario, it may so happen that two pods are created for that job. The first pod may still be
265+
* around, such as in a PodInitializing state and the second pod could complete before the first one is
266+
* terminated. This leads to the getFileContents() call failing, since under the covers, kubernetes job
267+
* provider runs kubectl logs job/<jobName> command, which picks one out of the two pods to obtain the
268+
* logs as seen here:
269+
*
270+
* <p>kubectl -n test logs job/test-run-job-5j2vl -c parser
271+
* Found 2 pods, using pod/test-run-job-5j2vl-fj8hd
272+
* Error from server (BadRequest): container "parser" in pod "test-run-job-5j2vl-fj8hd" is PodInitializing
273+
*
274+
* <p>That means, even if kubernetes and clouddriver marked the job as successful, since number of
275+
* succeeded pods >= number of completions, the kubectl command shown above could still end using
276+
* the failed pod for obtaining the logs.
277+
*
278+
* <p>To handle this case, if we get an error while making the getFileContents() call or if we don't receive
279+
* any properties, then for kubernetes jobs, we figure out if the job status has any pod with phase
280+
* SUCCEEDED. If we find such a pod, then we directly get the logs from this succeeded pod. Otherwise,
281+
* we throw an exception as before.
282+
*
283+
* <p> we aren't handling the above case for ExecutionStatus == TERMINAL, because at that point, we wouldn't
284+
* know which pod to query for properties file contents. It could so happen that all the pods in such a job
285+
* have failed, then we would have to loop over each pod and see what it generated. Then if say, two pods
286+
* generated different property values for the same key, which one do we choose? Bearing this complexity
287+
* in mind, and knowing that for succeeded jobs, this solution actually helps prevent a pipeline failure,
288+
* we are limiting this logic to succeeded jobs only for now.
289+
*
290+
* @param job - job status returned by clouddriver
291+
* @param appName - application name where the job is run
292+
* @param status - Execution status of the job. Should either be SUCCEEDED or TERMINAL
293+
* @param account - account under which this job is run
294+
* @param location - where this job is run
295+
* @param name - name of the job
296+
* @param propertyFile - file name to query from the job
297+
* @return map of property file contents
298+
*/
299+
private Map<String, Object> getPropertyFileContents(
300+
Map job,
301+
String appName,
302+
ExecutionStatus status,
303+
String account,
304+
String location,
305+
String name,
306+
String propertyFile
307+
) {
308+
Map<String, Object> properties = [:]
309+
try {
310+
retrySupport.retry({
311+
properties = katoRestService.getFileContents(appName, account, location, name, propertyFile)
312+
},
313+
configProperties.getFileContentRetry().maxAttempts,
314+
Duration.ofMillis(configProperties.getFileContentRetry().getBackOffInMs()),
315+
configProperties.getFileContentRetry().exponentialBackoffEnabled
316+
)
317+
} catch (Exception e) {
318+
log.warn("Error occurred while retrieving property file contents from job: ${name}" +
319+
" in application: ${appName}, in account: ${account}, location: ${location}," +
320+
" using propertyFile: ${propertyFile}. Error: ", e
321+
)
322+
323+
// For succeeded kubernetes jobs, let's try one more time to get property file contents.
324+
if (status == ExecutionStatus.SUCCEEDED) {
325+
properties = getPropertyFileContentsForSucceededKubernetesJob(
326+
job,
327+
appName,
328+
account,
329+
location,
330+
propertyFile
331+
)
332+
if (properties.size() == 0) {
333+
// since we didn't get any properties, we fail with this exception
334+
throw new ConfigurationException("Expected properties file: ${propertyFile} in " +
335+
"job: ${name}, application: ${appName}, location: ${location}, account: ${account} " +
336+
"but it was either missing, empty or contained invalid syntax. Error: ${e}")
337+
}
338+
}
339+
}
340+
341+
if (properties.size() == 0) {
342+
log.warn("Could not parse propertyFile: ${propertyFile} in job: ${name}" +
343+
" in application: ${appName}, in account: ${account}, location: ${location}." +
344+
" It is either missing, empty or contains invalid syntax"
345+
)
346+
347+
// For succeeded kubernetes jobs, let's try one more time to get property file contents.
348+
if (status == ExecutionStatus.SUCCEEDED) {
349+
// let's try one more time to get properties from a kubernetes pod
350+
properties = getPropertyFileContentsForSucceededKubernetesJob(
351+
job,
352+
appName,
353+
account,
354+
location,
355+
propertyFile
356+
)
357+
if (properties.size() == 0) {
358+
// since we didn't get any properties, we fail with this exception
359+
throw new ConfigurationException("Expected properties file: ${propertyFile} in " +
360+
"job: ${name}, application: ${appName}, location: ${location}, account: ${account} " +
361+
"but it was either missing, empty or contained invalid syntax")
362+
}
363+
}
364+
}
365+
return properties
366+
}
367+
368+
/**
369+
* This method is supposed to be called from getPropertyFileContents(). This is only applicable for
370+
* Kubernetes jobs. It finds a successful pod in the job and directly queries it for property file
371+
* contents.
372+
*
373+
* <p>It is meant to handle the following case:
374+
*
375+
* <p> if ExecutionStatus == SUCCEEDED, and especially for kubernetes run jobs, it can so happen that a
376+
* user has configured the job spec to run 1 pod, have completions and parallelism == 1, and
377+
* restartPolicy == Never. Despite that, kubernetes may end up running another pod as stated here:
378+
* https://kubernetes.io/docs/concepts/workloads/controllers/job/#handling-pod-and-container-failures
379+
* In such a scenario, it may so happen that two pods are created for that job. The first pod may still be
380+
* around, such as in a PodInitializing state and the second pod could complete before the first one is
381+
* terminated. This leads to the getFileContents() call failing, since under the covers, kubernetes job
382+
* provider runs kubectl logs job/<jobName> command, which picks one out of the two pods to obtain the
383+
* logs as seen here:
384+
*
385+
* <p>kubectl -n test logs job/test-run-job-5j2vl -c parser
386+
* Found 2 pods, using pod/test-run-job-5j2vl-fj8hd
387+
* Error from server (BadRequest): container "parser" in pod "test-run-job-5j2vl-fj8hd" is PodInitializing
388+
*
389+
* <p>That means, even if kubernetes and clouddriver marked the job as successful, since number of
390+
* succeeded pods >= number of completions, the kubectl command shown above could still end using
391+
* the failed pod for obtaining the logs.
392+
*
393+
* <p>To handle this case, if we get an error while making the getFileContents() call or if we don't receive
394+
* any properties, then for kubernetes jobs, we figure out if the job status has any pod with phase
395+
* SUCCEEDED. If we find such a pod, then we directly get the logs from this succeeded pod. Otherwise,
396+
* we throw an exception as before.
397+
*
398+
* <p>To keep it simple, and not worry about how to deal with property file
399+
* contents obtained from various successful pods in a job, if that may happen, we simply query the first
400+
* successful pod in that job.
401+
*
402+
* @param job - job status returned by clouddriver
403+
* @param appName - application in which this job is run
404+
* @param account - account under which this job is run
405+
* @param namespace - where this job is run
406+
* @param propertyFile - file name to query from the job
407+
* @return map of property file contents
408+
*/
409+
private Map<String, Object> getPropertyFileContentsForSucceededKubernetesJob(
410+
Map job,
411+
String appName,
412+
String account,
413+
String namespace,
414+
String propertyFile
415+
) {
416+
Map<String, Object> properties = [:]
417+
if (job.get("provider", "unknown") == "kubernetes") {
418+
Optional<Map> succeededPod = job.get("pods", [])
419+
.stream()
420+
.filter({ Map pod -> pod.get("status", [:]).get("phase", "Running") == "Succeeded"
421+
})
422+
.findFirst()
423+
424+
if (succeededPod.isPresent()) {
425+
String podName = (succeededPod.get() as Map).get("name")
426+
retrySupport.retry({
427+
properties = katoRestService.getFileContentsFromKubernetesPod(
428+
appName,
429+
account,
430+
namespace,
431+
podName,
432+
propertyFile
433+
)
434+
},
435+
configProperties.getFileContentRetry().maxAttempts,
436+
Duration.ofMillis(configProperties.getFileContentRetry().getBackOffInMs()),
437+
configProperties.getFileContentRetry().exponentialBackoffEnabled
438+
)
439+
}
440+
}
441+
return properties
442+
}
254443
}

orca-clouddriver/src/main/java/com/netflix/spinnaker/orca/clouddriver/DelegatingKatoRestService.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,13 @@ public Map<String, Object> getFileContents(
7171
return getService().getFileContents(app, account, region, id, fileName);
7272
}
7373

74+
@Override
75+
public Map<String, Object> getFileContentsFromKubernetesPod(
76+
String app, String account, String namespace, String podName, String fileName) {
77+
return getService()
78+
.getFileContentsFromKubernetesPod(app, account, namespace, podName, fileName);
79+
}
80+
7481
@Override
7582
public Task lookupTask(String id) {
7683
return getService().lookupTask(id);

orca-clouddriver/src/main/java/com/netflix/spinnaker/orca/clouddriver/KatoRestService.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,14 @@ Map<String, Object> getFileContents(
7373
@Path("id") String id,
7474
@Path("fileName") String fileName);
7575

76+
@GET("/applications/{app}/kubernetes/pods/{account}/{namespace}/{podName}/{fileName}")
77+
Map<String, Object> getFileContentsFromKubernetesPod(
78+
@Path("app") String app,
79+
@Path("account") String account,
80+
@Path("namespace") String namespace,
81+
@Path("podName") String podName,
82+
@Path("fileName") String fileName);
83+
7684
/**
7785
* This should _only_ be called if there is a problem retrieving the Task from
7886
* CloudDriverTaskStatusService (ie. a clouddriver replica).

orca-clouddriver/src/main/java/com/netflix/spinnaker/orca/clouddriver/config/TaskConfigurationProperties.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,22 @@ public static class WaitOnJobCompletionTaskConfig {
4747
* Default or empty set means that no keys will be excluded.
4848
*/
4949
private Set<String> excludeKeysFromOutputs = Set.of();
50+
51+
private Retries jobStatusRetry = new Retries();
52+
53+
private Retries fileContentRetry = new Retries();
54+
55+
@Data
56+
public static class Retries {
57+
// total number of attempts
58+
int maxAttempts = 6;
59+
60+
// time in ms to wait before subsequent retry attempts
61+
long backOffInMs = 5000;
62+
63+
// flag to enable exponential backoff
64+
boolean exponentialBackoffEnabled = false;
65+
}
5066
}
5167

5268
@Data

0 commit comments

Comments
 (0)