# settings.yml

# Log file behavior for launcher and site services
logging:
  level: INFO # Increasing verbosity: ERROR, WARNING, INFO, DEBUG
  format: '%(asctime)s.%(msecs)03d | %(process)d | %(levelname)s | %(name)s:%(lineno)s] %(message)s'
  datefmt: '%Y-%m-%d %H:%M:%S'
  buffer_num_records: 1024 # Flush logs after this many records emitted
  flush_period: 30 # Flush logs after this many seconds since last flush
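  # With the format and datefmt above, an emitted record looks roughly like this
  # (logger name, process ID, and message below are illustrative):
  #   2024-05-01 13:45:12.345 | 12345 | INFO | balsam.site.launcher:87] Launcher started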

# Use filter_tags to limit what jobs the Site will process.
# Launchers submitted from the command line will ignore this setting
# and use the --filter-tags specified as command line arguments.
# For example:
# filter_tags:
#   system: H2O
#   workflow: test
filter_tags: {}
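# When starting a launcher by hand, the same restriction is passed with the
# --filter-tags option mentioned above, e.g. (the KEY=VALUE syntax and flag
# repetition shown here are assumptions, not taken from this file):
#   --filter-tags system=H2O --filter-tags workflow=test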

# Launcher settings
launcher:
  idle_ttl_sec: 60 # Shutdown after this many seconds of nothing running
  delay_sec: 1 # Process polling interval
  error_tail_num_lines: 10 # Number of lines from tail-end of job output to store
  max_concurrent_mpiruns: 990 # Maximum number of runs in a single MPI-mode launcher
  # Launcher platform interfaces:
  compute_node: balsam.platform.compute_node.PolarisNode
  mpi_app_launcher: balsam.platform.app_run.PolarisRun
  local_app_launcher: balsam.platform.app_run.LocalAppRun
  mpirun_allows_node_packing: True # mpi_app_launcher supports multiple concurrent runs per node
  serial_mode_prefetch_per_rank: 64 # How many jobs to prefetch from API in serial mode
  # Pass-through parameters to mpirun when starting the serial mode launcher:
  serial_mode_startup_params:
    cpu_affinity: none

# ###################################################################
# BALSAM Site modules
# Any of the modules below can be disabled by setting the key: null
#
# For instance, `transfers: null` will disable the transfers module
# if you are not performing any remote data transfers
# ###################################################################

# Scheduler service syncs the Balsam API with local queues
scheduler:
  scheduler_class: balsam.platform.scheduler.PBSScheduler
  sync_period: 10 # Scheduler polling / API update interval in seconds
  # Configure the queues to which BatchJobs may be submitted at this Site
  # (max_walltime is given in minutes)
  allowed_queues:
    debug:
      max_nodes: 2
      max_queued_jobs: 100
      max_walltime: 60
    prod:
      max_nodes: 500
      max_queued_jobs: 1000
      max_walltime: 1440
    preemptable:
      max_nodes: 10
      max_queued_jobs: 20
      max_walltime: 4320
  # Projects to which BatchJobs may be submitted:
  allowed_projects:
    - datascience
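  # A matching manual submission might look like the sketch below (flag names
  # assumed from memory of the balsam CLI; verify with `balsam queue submit --help`):
  #   balsam queue submit -q debug -A datascience -n 2 -t 30 --job-mode mpi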
# Supported "pass-through" parameters to the BatchJob template
# Provided as key:value pairs, where key is the parameter name
# and value is the *default* value of the parameter
optional_batch_job_params:
mig_count: 0
# The path to the job template currently in use at this Site
job_template_path: job-template.sh
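  # A BatchJob submitted with, say, mig_count=2 has that value substituted into
  # job-template.sh wherever the template references it, e.g. through a Jinja2
  # expression such as {{ mig_count }} (the template contents are assumed, not shown here).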

# Pre/post-processing service runs job state transitions in parallel
processing:
  num_workers: 1 # Number of concurrent worker processes
  prefetch_depth: 1000 # Number of Jobs to have prefetched from API

# Transfer service dispatches stage-in/stage-out tasks
transfers:
  # Trusted transfer locations
  # The site will only transfer data to/from remote locations given here
  # Given as a set of {site_alias}: {protocol}://{address} pairs
  transfer_locations:
    olcf_dtn: globus://ef1a9560-7ca1-11e5-992c-22000b96db58
    cori_dtn: globus://9d6d99eb-6d04-11e5-ba46-22000b92c6ec
    perlmutter_dtn: globus://6bdc7956-fc0f-4ad2-989c-7aa5ee643a79
    theta_dtn: globus://08925f04-569f-11e7-bef8-22000b9a448b
    aps_dtn: globus://1c85525e-8948-11e9-8e6a-029d279f7e24
    als_dtn: globus://c7c11b22-e127-11e8-8c91-0a1d4c5c824a
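  # Stage-in/out items then refer to these aliases rather than raw endpoint IDs,
  # e.g. a Job might pull an input file from OLCF via a remote location string like
  # the one below (the path is hypothetical; the alias:path form is an assumption):
  #   olcf_dtn:/remote/project/data/input.dat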
  globus_endpoint_id: 08925f04-569f-11e7-bef8-22000b9a448b # The local Globus endpoint ID
  max_concurrent_transfers: 5 # How many TransferTasks can run at a time
  transfer_batch_size: 100 # Maximum number of items per TransferTask
  num_items_query_limit: 2000 # Maximum number of transfer items to fetch from API at a time
  service_period: 5 # API polling interval in seconds

# Elastic Queue flexibly sizes and requests BatchJobs based on the net
# job requirements
# elastic_queue: null
elastic_queue:
  service_period: 60 # Calculate and submit 1 BatchJob per `service_period` seconds
  submit_project: "datascience"
  submit_queue: "prod"
  job_mode: "mpi"
  use_backfill: False # Constrains BatchJobs to fit into idle backfill slots
  min_wall_time_min: 35 # If `use_backfill`, ignore windows shorter than this duration
  max_wall_time_min: 120 # If not `use_backfill`, submit BatchJobs for this walltime limit
  wall_time_pad_min: 5 # Subtract this walltime duration from all submitted BatchJobs
  min_num_nodes: 1 # Do not request fewer than this many nodes
  max_num_nodes: 10 # Do not request more than this many nodes
  max_queue_wait_time_min: 600 # Delete BatchJobs waiting in queue for this long
  max_queued_jobs: 1000 # Maximum number of simultaneously queued jobs
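  # Illustrative reading of the sizing logic, not an exact trace: with the settings
  # above and, say, 7 runnable single-node jobs, the next cycle would request one
  # 7-node, 120-minute BatchJob in `prod` under `datascience`, with the node count
  # always clamped to the [min_num_nodes, max_num_nodes] range.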

# Queue maintainer keeps a fixed number of BatchJobs queued at all times
# Useful for extended campaigns. Inherits filter_tags from above.
queue_maintainer: null
# queue_maintainer:
#   submit_period: 60
#   submit_project: datascience
#   submit_queue: default
#   job_mode: mpi
#   num_queued_jobs: 5
#   num_nodes: 128
#   wall_time_min: 60

# File Cleaner periodically scans the workdirs of newly finished jobs
# and deletes any files matching the `cleanup_files` glob-pattern list
# defined on the ApplicationDefinition. This is typically done to avoid
# hitting storage quotas when results are staged out to a remote location.
file_cleaner: null
# file_cleaner:
#   cleanup_batch_size: 180 # Clean up to this many Job workdirs at a time
#   service_period: 30 # Clean up files every `service_period` seconds
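# Example of the glob patterns an ApplicationDefinition might declare for this
# service to act on (the file names here are illustrative):
#   cleanup_files = ["*.tmp", "checkpoint_*.h5"]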