From e89544f8be9f7c989428d42f35bbf416478100a9 Mon Sep 17 00:00:00 2001 From: Jake Fennick Date: Tue, 18 Apr 2023 10:32:43 -0600 Subject: [PATCH] added toil slurm cli flags --- src/wic/run_local.py | 48 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/src/wic/run_local.py b/src/wic/run_local.py index 1d064a45..188bd3c9 100644 --- a/src/wic/run_local.py +++ b/src/wic/run_local.py @@ -106,8 +106,9 @@ def run_local(args: argparse.Namespace, rose_tree: RoseTree, cachedir: Optional[ # NOTE: Using --leave-outputs to disable --outdir # See https://github.com/dnanexus/dx-cwl/issues/20 # --outdir has one or more bugs which will cause workflows to fail!!! - cmd = ['cwltool'] + parallel + quiet + cachedir_ + net + provenance + docker_cmd_ + write_summary + skip_schemas + cmd = ['cwltool'] + parallel + quiet + cachedir_ + net + write_summary + skip_schemas # + provenance + docker_cmd_ cmd += ['--leave-outputs', + '--singularity', # '--js-console', # "Running with support for javascript console in expressions (DO NOT USE IN PRODUCTION)" f'autogenerated/{yaml_stem}.cwl', f'autogenerated/{yaml_stem}_inputs.yml'] # TODO: Consider using the undocumented flag --fast-parser for known-good workflows, @@ -152,11 +153,52 @@ def run_local(args: argparse.Namespace, rose_tree: RoseTree, cachedir: Optional[ net = ['--custom-net', args.custom_net] if args.custom_net else [] provenance = ['--provenance', 'provenance'] docker_cmd_ = [] if docker_cmd == 'docker' else ['--user-space-docker-cmd', docker_cmd] - cmd = ['toil-cwl-runner'] + net + provenance + docker_cmd_ + cmd = ['toil-cwl-runner'] + net # + provenance + docker_cmd_ cmd += ['--outdir', 'outdir_toil', + # NOTE: "... the job store must be accessible by all worker nodes." 
'--jobStore', f'file:./jobStore_{yaml_stem}', # NOTE: This is the equivalent of --cachedir + '--batchSystem', 'slurm', + # See https://github.com/DataBiosphere/toil/blob/d439924b4110cc572b5b996f0efa623ebb48155f/src/toil/batchSystems/slurm.py#L376 + '--dont_allocate_mem', + # '--defaultMemory', '536870912', # 500MB + # The default amount of memory to request for a job (in bytes), by default is 2^31 = 2 gigabytes + # '--logDebug', + '--coalesceStatusCalls', + # Coalesce status calls to prevent the batch system from + # being overloaded. Currently only supported for LSF. + # NOTE: If you do NOT use this option, the loop which polls the batch + # system for status updates becomes O(n^2) time complexity! + # NOTE: It appears --coalesceStatusCalls is now supported for lsf and slurm. + '--statePollingWait', '5', # For slurm, defaults to SchedulerTimeSlice*1.2 (default 30 seconds*1.2) + # Time, in seconds, to wait before doing a scheduler query for job state. Return cached results if within the waiting period. Only works for grid engine batch systems such as gridengine, htcondor, torque, slurm, and lsf. + '--retryCount', '1', # There appear to be random errors due to the Panasas network file system. + # Number of times to retry a failing job before giving + # up and labeling job failed. default=0 + '--maxCores', '128', + '--maxLocalJobs', '128', # i.e. infinity + # For batch systems that support a local queue for housekeeping jobs (Mesos, GridEngine, htcondor, lsf, slurm, torque). Specifies the maximum number of these housekeeping jobs to run on the local system. The default (equal to the + # number of cores) is a maximum of 24 concurrent local housekeeping jobs. + # '--runCwlInternalJobsOnWorkers', 'true', + # Whether to run CWL internal jobs (e.g. CWLScatter) on the worker nodes instead of the primary node. If false (default), then all such jobs are run on the primary node. 
Setting this to true can speed up the pipeline for very + # large workflows with many sub-workflows and/or scatters, provided that the worker pool is large enough. + '--disableAutoDeployment', # Using conda/docker/Panasas NFS, so shouldn't need to deploy scripts. + # Should auto-deployment of the user script be deactivated? If True, the user script/package should be present at the same location on all workers. Default = False. + '--stats', + # Records statistics about the toil workflow to be used by 'toil stats'. + '--clusterStats', 'clusterStats.json', + # If enabled, writes out JSON resource usage statistics to a file. The default location for this file is the current working directory, but an absolute path can also be passed to specify where this file should be written. This + # option only applies when using scalable batch systems. + '--singularity', + '--workDir', 'workdir', # "This directory needs to exist on all machines running jobs." + # i.e. /run/user/$UID/coorddir This is a local /tmpfs (in-memory) NOT NFS + '--coordinationDir', 'coorddir', # "Absolute path to directory where Toil will keep state and lock files." + '--log-dir', 'logdir', + '--logFile', 'logfile', + # '--restart', + '--disableCaching', + '--disableProgress', # disable the progress bar in the terminal # TODO: Check --clean, --cleanWorkDir, --restart - '--clean', 'always', # This effectively disables caching, but is reproducible + # '--clean', 'always', # This effectively disables caching, but is reproducible f'autogenerated/{yaml_stem}.cwl', f'autogenerated/{yaml_stem}_inputs.yml'] print('Running ' + ' '.join(cmd))