@@ -147,7 +147,11 @@ public class WaitOnJobCompletion implements CloudProviderAware, OverridableTimeo
147
147
InputStream jobStream
148
148
retrySupport. retry({
149
149
jobStream = katoRestService. collectJob(appName, account, location, name). body. in()
150
- }, 6 , 5000 , false ) // retry for 30 seconds
150
+ },
151
+ configProperties. getJobStatusRetry(). maxAttempts,
152
+ Duration . ofMillis(configProperties. getJobStatusRetry(). getBackOffInMs()),
153
+ configProperties. getJobStatusRetry(). exponentialBackoffEnabled
154
+ )
151
155
Map job = objectMapper. readValue(jobStream, new TypeReference<Map > () {})
152
156
outputs. jobStatus = job
153
157
@@ -165,24 +169,16 @@ public class WaitOnJobCompletion implements CloudProviderAware, OverridableTimeo
165
169
166
170
if ((status == ExecutionStatus . SUCCEEDED ) || (status == ExecutionStatus . TERMINAL )) {
167
171
if (stage. context. propertyFile) {
168
- Map<String , Object > properties = [:]
169
- try {
170
- retrySupport. retry({
171
- properties = katoRestService. getFileContents(appName, account, location, name, stage. context. propertyFile)
172
- }, 6 , 5000 , false ) // retry for 30 seconds
173
- } catch (Exception e) {
174
- if (status == ExecutionStatus . SUCCEEDED ) {
175
- throw new ConfigurationException (" Property File: ${ stage.context.propertyFile} contents could not be retrieved. Error: " + e)
176
- }
177
- log. warn(" failed to get file contents for ${ appName} , account: ${ account} , namespace: ${ location} , " +
178
- " manifest: ${ name} from propertyFile: ${ stage.context.propertyFile} . Error: " , e)
179
- }
180
-
181
- if (properties. size() == 0 ) {
182
- if (status == ExecutionStatus . SUCCEEDED ) {
183
- throw new ConfigurationException (" Expected properties file ${ stage.context.propertyFile} but it was either missing, empty or contained invalid syntax" )
184
- }
185
- } else if (properties. size() > 0 ) {
172
+ Map<String , Object > properties = getPropertyFileContents(
173
+ job,
174
+ appName,
175
+ status,
176
+ account,
177
+ location,
178
+ name,
179
+ stage. context. propertyFile as String )
180
+
181
+ if (properties. size() > 0 ) {
186
182
outputs << properties
187
183
outputs. propertyFileContents = properties
188
184
}
@@ -251,4 +247,197 @@ public class WaitOnJobCompletion implements CloudProviderAware, OverridableTimeo
251
247
}
252
248
throw new JobFailedException (errorMessage)
253
249
}
250
+
251
+ /**
252
+ * <p >this method attempts to get property file from clouddriver and then parses its contents. Depending
253
+ * on the job itself, it could be handled by any job provider in clouddriver. This method should only be
254
+ * called for jobs with ExecutionStatus as either SUCCEEDED or TERMINAL.
255
+ *
256
+ * <p >If property file contents could not be retrieved from clouddriver, then the error handling depends
257
+ * on the job's ExecutionStatus. If it is SUCCEEDED, then an exception is thrown. Otherwise, no exception
258
+ * is thrown since we don't want to mask the real reason behind the job failure.
259
+ *
260
+ * <p >If ExecutionStatus == SUCCEEDED, and especially for kubernetes run jobs, it can so happen that a user
261
+ * has configured the job spec to run 1 pod, have completions and parallelism == 1, and
262
+ * restartPolicy == Never. Despite that, kubernetes may end up running another pod as stated here:
263
+ * https://kubernetes.io/docs/concepts/workloads/controllers/job/#handling-pod-and-container-failures
264
+ * In such a scenario, it may so happen that two pods are created for that job. The first pod may still be
265
+ * around, such as in a PodInitializing state and the second pod could complete before the first one is
266
+ * terminated. This leads to the getFileContents() call failing, since under the covers, kubernetes job
267
+ * provider runs kubectl logs job/<jobName> command, which picks one out of the two pods to obtain the
268
+ * logs as seen here:
269
+ *
270
+ * <p >kubectl -n test logs job/test-run-job-5j2vl -c parser
271
+ * Found 2 pods, using pod/test-run-job-5j2vl-fj8hd
272
+ * Error from server (BadRequest): container "parser" in pod "test-run-job-5j2vl-fj8hd" is PodInitializing
273
+ *
274
+ * <p >That means, even if kubernetes and clouddriver marked the job as successful, since number of
275
+ * succeeded pods >= number of completions, the kubectl command shown above could still end using
276
+ * the failed pod for obtaining the logs.
277
+ *
278
+ * <p >To handle this case, if we get an error while making the getFileContents() call or if we don't receive
279
+ * any properties, then for kubernetes jobs, we figure out if the job status has any pod with phase
280
+ * SUCCEEDED. If we find such a pod, then we directly get the logs from this succeeded pod. Otherwise,
281
+ * we throw an exception as before.
282
+ *
283
+ * <p > we aren't handling the above case for ExecutionStatus == TERMINAL, because at that point, we wouldn't
284
+ * know which pod to query for properties file contents. It could so happen that all the pods in such a job
285
+ * have failed, then we would have to loop over each pod and see what it generated. Then if say, two pods
286
+ * generated different property values for the same key, which one do we choose? Bearing this complexity
287
+ * in mind, and knowing that for succeeded jobs, this solution actually helps prevent a pipeline failure,
288
+ * we are limiting this logic to succeeded jobs only for now.
289
+ *
290
+ * @param job - job status returned by clouddriver
291
+ * @param appName - application name where the job is run
292
+ * @param status - Execution status of the job. Should either be SUCCEEDED or TERMINAL
293
+ * @param account - account under which this job is run
294
+ * @param location - where this job is run
295
+ * @param name - name of the job
296
+ * @param propertyFile - file name to query from the job
297
+ * @return map of property file contents
298
+ */
299
+ private Map<String , Object > getPropertyFileContents (
300
+ Map job ,
301
+ String appName ,
302
+ ExecutionStatus status ,
303
+ String account ,
304
+ String location ,
305
+ String name ,
306
+ String propertyFile
307
+ ) {
308
+ Map<String , Object > properties = [:]
309
+ try {
310
+ retrySupport. retry({
311
+ properties = katoRestService. getFileContents(appName, account, location, name, propertyFile)
312
+ },
313
+ configProperties. getFileContentRetry(). maxAttempts,
314
+ Duration . ofMillis(configProperties. getFileContentRetry(). getBackOffInMs()),
315
+ configProperties. getFileContentRetry(). exponentialBackoffEnabled
316
+ )
317
+ } catch (Exception e) {
318
+ log. warn(" Error occurred while retrieving property file contents from job: ${ name} " +
319
+ " in application: ${ appName} , in account: ${ account} , location: ${ location} ," +
320
+ " using propertyFile: ${ propertyFile} . Error: " , e
321
+ )
322
+
323
+ // For succeeded kubernetes jobs, let's try one more time to get property file contents.
324
+ if (status == ExecutionStatus . SUCCEEDED ) {
325
+ properties = getPropertyFileContentsForSucceededKubernetesJob(
326
+ job,
327
+ appName,
328
+ account,
329
+ location,
330
+ propertyFile
331
+ )
332
+ if (properties. size() == 0 ) {
333
+ // since we didn't get any properties, we fail with this exception
334
+ throw new ConfigurationException (" Expected properties file: ${ propertyFile} in " +
335
+ " job: ${ name} , application: ${ appName} , location: ${ location} , account: ${ account} " +
336
+ " but it was either missing, empty or contained invalid syntax. Error: ${ e} " )
337
+ }
338
+ }
339
+ }
340
+
341
+ if (properties. size() == 0 ) {
342
+ log. warn(" Could not parse propertyFile: ${ propertyFile} in job: ${ name} " +
343
+ " in application: ${ appName} , in account: ${ account} , location: ${ location} ." +
344
+ " It is either missing, empty or contains invalid syntax"
345
+ )
346
+
347
+ // For succeeded kubernetes jobs, let's try one more time to get property file contents.
348
+ if (status == ExecutionStatus . SUCCEEDED ) {
349
+ // let's try one more time to get properties from a kubernetes pod
350
+ properties = getPropertyFileContentsForSucceededKubernetesJob(
351
+ job,
352
+ appName,
353
+ account,
354
+ location,
355
+ propertyFile
356
+ )
357
+ if (properties. size() == 0 ) {
358
+ // since we didn't get any properties, we fail with this exception
359
+ throw new ConfigurationException (" Expected properties file: ${ propertyFile} in " +
360
+ " job: ${ name} , application: ${ appName} , location: ${ location} , account: ${ account} " +
361
+ " but it was either missing, empty or contained invalid syntax" )
362
+ }
363
+ }
364
+ }
365
+ return properties
366
+ }
367
+
368
+ /**
369
+ * This method is supposed to be called from getPropertyFileContents(). This is only applicable for
370
+ * Kubernetes jobs. It finds a successful pod in the job and directly queries it for property file
371
+ * contents.
372
+ *
373
+ * <p >It is meant to handle the following case:
374
+ *
375
+ * <p > if ExecutionStatus == SUCCEEDED, and especially for kubernetes run jobs, it can so happen that a
376
+ * user has configured the job spec to run 1 pod, have completions and parallelism == 1, and
377
+ * restartPolicy == Never. Despite that, kubernetes may end up running another pod as stated here:
378
+ * https://kubernetes.io/docs/concepts/workloads/controllers/job/#handling-pod-and-container-failures
379
+ * In such a scenario, it may so happen that two pods are created for that job. The first pod may still be
380
+ * around, such as in a PodInitializing state and the second pod could complete before the first one is
381
+ * terminated. This leads to the getFileContents() call failing, since under the covers, kubernetes job
382
+ * provider runs kubectl logs job/<jobName> command, which picks one out of the two pods to obtain the
383
+ * logs as seen here:
384
+ *
385
+ * <p >kubectl -n test logs job/test-run-job-5j2vl -c parser
386
+ * Found 2 pods, using pod/test-run-job-5j2vl-fj8hd
387
+ * Error from server (BadRequest): container "parser" in pod "test-run-job-5j2vl-fj8hd" is PodInitializing
388
+ *
389
+ * <p >That means, even if kubernetes and clouddriver marked the job as successful, since number of
390
+ * succeeded pods >= number of completions, the kubectl command shown above could still end using
391
+ * the failed pod for obtaining the logs.
392
+ *
393
+ * <p >To handle this case, if we get an error while making the getFileContents() call or if we don't receive
394
+ * any properties, then for kubernetes jobs, we figure out if the job status has any pod with phase
395
+ * SUCCEEDED. If we find such a pod, then we directly get the logs from this succeeded pod. Otherwise,
396
+ * we throw an exception as before.
397
+ *
398
+ * <p >To keep it simple, and not worry about how to deal with property file
399
+ * contents obtained from various successful pods in a job, if that may happen, we simply query the first
400
+ * successful pod in that job.
401
+ *
402
+ * @param job - job status returned by clouddriver
403
+ * @param appName - application in which this job is run
404
+ * @param account - account under which this job is run
405
+ * @param namespace - where this job is run
406
+ * @param propertyFile - file name to query from the job
407
+ * @return map of property file contents
408
+ */
409
+ private Map<String , Object > getPropertyFileContentsForSucceededKubernetesJob (
410
+ Map job ,
411
+ String appName ,
412
+ String account ,
413
+ String namespace ,
414
+ String propertyFile
415
+ ) {
416
+ Map<String , Object > properties = [:]
417
+ if (job. get(" provider" , " unknown" ) == " kubernetes" ) {
418
+ Optional<Map > succeededPod = job. get(" pods" , [])
419
+ .stream()
420
+ .filter({ Map pod -> pod. get(" status" , [:]). get(" phase" , " Running" ) == " Succeeded"
421
+ })
422
+ .findFirst()
423
+
424
+ if (succeededPod. isPresent()) {
425
+ String podName = (succeededPod. get() as Map ). get(" name" )
426
+ retrySupport. retry({
427
+ properties = katoRestService. getFileContentsFromKubernetesPod(
428
+ appName,
429
+ account,
430
+ namespace,
431
+ podName,
432
+ propertyFile
433
+ )
434
+ },
435
+ configProperties. getFileContentRetry(). maxAttempts,
436
+ Duration . ofMillis(configProperties. getFileContentRetry(). getBackOffInMs()),
437
+ configProperties. getFileContentRetry(). exponentialBackoffEnabled
438
+ )
439
+ }
440
+ }
441
+ return properties
442
+ }
254
443
}
0 commit comments