adjust --statePollingWait and --retryCount

jfennick · Aug 15, 2023 · cdafd85 · cdafd85
1 parent e89544f
commit cdafd85
Showing 1 changed file with 2 additions and 9 deletions.
diff --git a/src/wic/run_local.py b/src/wic/run_local.py
@@ -163,15 +163,8 @@ def run_local(args: argparse.Namespace, rose_tree: RoseTree, cachedir: Optional[
                 # '--defaultMemory', '536870912',  # 500MB
                 # The default amount of memory to request for a job (in bytes), by default is 2^31 = 2 gigabytes
                 # '--logDebug',
-                '--coalesceStatusCalls',
-                # Coalesce status calls to prevent the batch system from
-                # being overloaded. Currently only supported for LSF.
-                # NOTE: If you do NOT use this option, the loop which polls the batch
-                # system for status updates becomes O(n^2) time complexity!
-                # NOTE: It appears --coalesceStatusCalls is now supported for lsf and slurm.
-                '--statePollingWait', '5',  # For slurm, defaults to SchedulerTimeSlice*1.2 (default 30 seconds*1.2)
-                # Time, in seconds, to wait before doing a scheduler query for job state. Return cached results if within the waiting period. Only works for grid engine batch systems such as gridengine, htcondor, torque, slurm, and lsf.
-                '--retryCount', '1',  # There appear to be random errors due to the Panasas network file system.
+                '--statePollingWait', '0',  # See https://github.com/DataBiosphere/toil/pull/4471
+                '--retryCount', '30',  # There appear to be random errors due to the Panasas network file system.
                 # Number of times to retry a failing job before giving
                 # up and labeling job failed. default=0
                 '--maxCores', '128',