Skip to content
This repository was archived by the owner on Oct 10, 2019. It is now read-only.

Commit 5fa75e4

Browse files
authored
Merge pull request #64 from brianhlin/sw2929_fix_mem_parsing
Fix Slurm/PBS memory parsing (SOFTWARE-2929)
2 parents af091f9 + e4dbcd4 commit 5fa75e4

File tree

2 files changed: 66 additions and 22 deletions.

src/scripts/pbs_status.py

Lines changed: 34 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -330,25 +330,48 @@ def get_finished_job_stats(jobid):
330330
except Exception, e:
331331
log("Unable to read in CSV output from sacct: %s" % str(e))
332332
return return_dict
333-
334-
sacct_parser = {'RemoteUserCpu': lambda orig, results: orig + \
335-
convert_cpu_to_seconds(results["AveCPU"]) * int(results["AllocCPUS"]),
336-
'ImageSize': lambda orig, results: orig + int(results["MaxRSS"].replace('K', '')),
337-
'ExitCode': lambda orig, results: int(results["ExitCode"].split(":")[0])}
333+
338334
# Slurm can return more than 1 row, for some odd reason.
339335
# so sum up relevant values
340336
for row in reader:
341-
for attr, func in sacct_parser.items():
337+
if row["AveCPU"] is not "":
342338
try:
343-
return_dict[attr] = func(return_dict[attr], row)
344-
except (ValueError, KeyError), exc:
345-
log("Could not parse %s for Jobid %s: %s" % (attr, jobid, exc))
346-
347-
# PBS completion
339+
return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"])
340+
except:
341+
log("Failed to parse CPU usage for job id %s: %s, %s" % (jobid, row["AveCPU"], row["AllocCPUS"]))
342+
raise
343+
if row["MaxRSS"] is not "":
344+
# Remove the trailing [KMGTP] and scale the value appropriately
345+
# Note: We assume that all values will have a suffix, and we
346+
# want the value in kilos.
347+
try:
348+
value = row["MaxRSS"]
349+
factor = 1
350+
if value[-1] == 'M':
351+
factor = 1024
352+
elif value[-1] == 'G':
353+
factor = 1024 * 1024
354+
elif value[-1] == 'T':
355+
factor = 1024 * 1024 * 1024
356+
elif value[-1] == 'P':
357+
factor = 1024 * 1024 * 1024 * 1024
358+
return_dict["ImageSize"] += int(value.strip('KMGTP')) * factor
359+
except:
360+
log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"]))
361+
raise
362+
if row["ExitCode"] is not "":
363+
try:
364+
return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0])
365+
except:
366+
log("Failed to parse ExitCode for job id %s: %s" % (jobid, row["ExitCode"]))
367+
raise
368+
369+
# PBS completion
348370
elif _cluster_type_cache == "pbs":
349371
pass
350372

351373
return return_dict
374+
352375

353376
_qstat_location_cache = None
354377
def get_qstat_location():

src/scripts/slurm_status.py

Lines changed: 32 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -319,22 +319,43 @@ def get_finished_job_stats(jobid):
319319
except Exception, e:
320320
log("Unable to read in CSV output from sacct: %s" % str(e))
321321
return return_dict
322-
323-
sacct_parser = {'RemoteUserCpu': lambda orig, results: orig + \
324-
convert_cpu_to_seconds(results["AveCPU"]) * int(results["AllocCPUS"]),
325-
'ImageSize': lambda orig, results: orig + int(results["MaxRSS"].replace('K', '')),
326-
'ExitCode': lambda orig, results: int(results["ExitCode"].split(":")[0])}
322+
327323
# Slurm can return more than 1 row, for some odd reason.
328324
# so sum up relevant values
329325
for row in reader:
330-
for attr, func in sacct_parser.items():
326+
if row["AveCPU"] is not "":
331327
try:
332-
return_dict[attr] = func(return_dict[attr], row)
333-
except (ValueError, KeyError), exc:
334-
log("Could not parse %s for Jobid %s: %s" % (attr, jobid, exc))
335-
328+
return_dict['RemoteUserCpu'] += convert_cpu_to_seconds(row["AveCPU"]) * int(row["AllocCPUS"])
329+
except:
330+
log("Failed to parse CPU usage for job id %s: %s, %s" % (jobid, row["AveCPU"], row["AllocCPUS"]))
331+
raise
332+
if row["MaxRSS"] is not "":
333+
# Remove the trailing [KMGTP] and scale the value appropriately
334+
# Note: We assume that all values will have a suffix, and we
335+
# want the value in kilos.
336+
try:
337+
value = row["MaxRSS"]
338+
factor = 1
339+
if value[-1] == 'M':
340+
factor = 1024
341+
elif value[-1] == 'G':
342+
factor = 1024 * 1024
343+
elif value[-1] == 'T':
344+
factor = 1024 * 1024 * 1024
345+
elif value[-1] == 'P':
346+
factor = 1024 * 1024 * 1024 * 1024
347+
return_dict["ImageSize"] += int(value.strip('KMGTP')) * factor
348+
except:
349+
log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"]))
350+
raise
351+
if row["ExitCode"] is not "":
352+
try:
353+
return_dict["ExitCode"] = int(row["ExitCode"].split(":")[0])
354+
except:
355+
log("Failed to parse memory usage for job id %s: %s" % (jobid, row["MaxRSS"]))
356+
raise
336357
return return_dict
337-
358+
338359

339360
_slurm_location_cache = None
340361
def get_slurm_location(program):

0 commit comments

Comments
 (0)