Glossary
- `num_accs_per_task = int( 4 / int( os.environ.get('SLURM_TASKS_PER_NODE', '1')[0] ))` = number of GPUs / accelerators per task
- `device = init_torch( num_accs_per_task)` = CUDA devices
- `with_ddp = True` = activate/deactivate the use of torch DDP
- `par_rank` = rank parameter; taken from `setup_ddp` (see the sketch below)
- `par_size` = size parameter; taken from `setup_ddp`
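
A minimal sketch of what a `setup_ddp`-style helper could return for `par_rank` and `par_size`, using `torch.distributed`; the function name and environment variables here are illustrative and AtmoRep's actual implementation may differ:

```python
# Hypothetical setup_ddp sketch: obtain rank and world size for DDP.
# RANK / WORLD_SIZE are typically set by torchrun or the SLURM launcher.
import os
import torch.distributed as dist

def setup_ddp_sketch():
    rank = int(os.environ.get('RANK', '0'))
    world_size = int(os.environ.get('WORLD_SIZE', '1'))
    if world_size > 1 and not dist.is_initialized():
        dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)
    return rank, world_size  # -> cf.par_rank, cf.par_size
```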
- `back_passes_per_step = 4` = number of backward passes per step (see the sketch below)
- `comment = ''` = additional label
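
The name suggests gradient accumulation: several backward passes are accumulated before one optimizer step. A generic, self-contained sketch of that pattern (illustrative, not AtmoRep's Trainer code):

```python
# Generic gradient-accumulation sketch consistent with the name
# back_passes_per_step; illustrative, not AtmoRep's Trainer code.
import torch

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
back_passes_per_step = 4

optimizer.zero_grad()
for i in range(16):
    x, y = torch.randn(2, 8), torch.randn(2, 1)
    loss = torch.nn.functional.mse_loss(model(x), y) / back_passes_per_step
    loss.backward()  # gradients accumulate over several backward passes
    if (i + 1) % back_passes_per_step == 0:
        optimizer.step()      # one optimizer step per back_passes_per_step
        optimizer.zero_grad()
```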
- `file_format = 'grib'` = format of the input files. Supported: `grib`, `netcdf`, `bin`
- `data_dir = str(config.path_data)` = input data directory. Defined in `atmorep/config/config.py`
- `level_type = 'ml'` = pressure levels or model levels (support for `pl` levels works only if the data are stored with the same structure as for `ml` levels)
Format: a list of fields where, for each field, the entry is

```
[ name,
  [ dynamic or static field { 1, 0 }, embedding dimension, device id ],
  [ vertical levels ],
  [ num_tokens ],
  [ token size ],
  [ total masking rate, rate masking, rate noising, rate for multi-res distortion ] ]
```
- `name` = name of the field (as in the data)
- `dynamic or static field` = 1: dynamic field, 0: static field (e.g. orography)
- `embedding dimension` = dimension of the sin/cos embedding, e.g. 1024, 512, etc.
- `device id` = CUDA device ID
- `vertical levels` = list of vertical levels, e.g. 123, 105, etc.
- `num_tokens` = number of tokens in [time, lat, lon] loaded by the model to form the hypercube. It constitutes the total size of `source` (aka the data loaded before masking)
- `token_size` = size of each token in [time, lat, lon], given in time steps (time) x grid points (lat) x grid points (lon), e.g. [3, 9, 9] = each token covers 3 h in time, 9 grid points in latitude and 9 grid points in longitude
- `total masking rate` = total fraction of `source` that will be masked
- `rate masking` = fraction of tokens masked completely, substituting the entire token with 0
- `rate noising` = fraction of tokens masked by substituting them with random noise
- `rate for multi-res distortion` = fraction of tokens masked with multi-resolution distortion
Example:

```python
cf.fields = [ [ 'velocity_u', [ 1, 1024, [ ], 0 ],
                [ 114, 123, 137 ],
                [ 12, 6, 12 ], [ 3, 9, 9 ], [ 0.5, 0.9, 0.1, 0.05 ] ],
              [ 'velocity_v', [ 1, 1024, [ ], 0 ],
                [ 114, 123, 137 ],
                [ 12, 6, 12 ], [ 3, 9, 9 ], [ 0.25, 0.9, 0.1, 0.05 ] ],
              [ 'total_precip', [ 1, 1536, [ ], 3 ],
                [ 0 ],
                [ 12, 6, 12 ], [ 3, 9, 9 ], [ 0.25, 0.9, 0.2, 0.05 ] ] ]
cf.fields_prediction = [ [ cf.fields[0][0], 0.5 ], [ cf.fields[1][0], 0.5 ] ]
```
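
To make the nested layout concrete, a hypothetical snippet that unpacks one entry into named variables (the variable names are ours, not AtmoRep API; note that `field[1][2]`, the empty list in the example, is not documented in this glossary):

```python
# Illustrative unpacking of one cf.fields entry; names are ours.
field = cf.fields[0]
name            = field[0]      # 'velocity_u'
is_dynamic      = field[1][0]   # 1 = dynamic field, 0 = static
embed_dim       = field[1][1]   # 1024
device_id       = field[1][3]   # CUDA device ID (field[1][2] is undocumented)
vertical_levels = field[2]      # [114, 123, 137]
num_tokens      = field[3]      # [time, lat, lon] = [12, 6, 12]
token_size      = field[4]      # [3, 9, 9]
total_rate, rate_mask, rate_noise, rate_multires = field[5]
```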
- `fields` = loaded fields
- `fields_prediction` = predicted fields
- `fields_targets = []` = fields with alternative datasets used for pre-training (not checked yet in this version; it might not work out of the box)
- `years_train = list( range( 1980, 2018))` = years used to train the model
- `years_test = [2021]` = years used to test the model
- `month = None` = in case you want to use a specific month, e.g. June, due to storage constraints on your device
- `geo_range_sampling = [[ -90., 90.], [ 0., 360.]]` = coverage in latitude x longitude coordinates, assuming a lat-lon linear projection of the data
- `time_sampling = 1` = sampling rate for time steps
- `data_smoothing = 0` = smoothing parameter, in case you want to smooth your data for testing purposes
- `file_shape = (-1, 721, 1440)` = TODO: get rid of this parameter. It contains the lat-lon resolution of the data.
- `num_t_samples = 31*24` = number of samples loaded in time
- `num_files_train = 5` = number of files loaded for training
- `num_files_test = 2` = number of files loaded for testing
- `num_patches_per_t_train = 8` = number of patches per time step in training. It controls the length of an epoch.
- `num_patches_per_t_test = 4` = number of patches per time step in testing
- `torch_seed = torch.initial_seed()` = seed for random masking
- `batch_size_test = 64` = batch size used for testing
- `batch_size_start = 16` = batch size used at the beginning of training
- `batch_size_max = 32` = maximum batch size
- `batch_size_delta = 8` = step by which the batch size is increased during training (see the sketch below)
- `num_epochs = 128` = number of epochs
- `num_loader_workers = 8` = number of data loader workers (memory consuming because each worker creates a full copy of your data; reduce it in case you run out of memory)
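
A plausible reading of how the three batch-size parameters interact, growing the batch size by `batch_size_delta` per epoch from `batch_size_start` up to `batch_size_max` (a sketch of the semantics, not AtmoRep's scheduling code):

```python
# Hedged sketch: ramp the batch size from batch_size_start to
# batch_size_max in steps of batch_size_delta; not AtmoRep's code.
batch_size_start, batch_size_max, batch_size_delta = 16, 32, 8

def batch_size_for_epoch(epoch: int) -> int:
    return min(batch_size_start + epoch * batch_size_delta, batch_size_max)

print([batch_size_for_epoch(e) for e in range(4)])  # [16, 24, 32, 32]
```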
- `size_token_info = 8` = internal parameter, do not change it
- `size_token_info_net = 16` = dimension of the tail networks
- `grad_checkpointing = True` = use gradient checkpointing
- `with_cls = False` = bool; run with or without a CLS token
- `with_layernorm = True` = apply layer normalization
- `coupling_num_heads_per_field = 1` = number of coupled heads per field
- `dropout_rate = 0.05` = dropout rate
- `learnable_mask = False` = use learnable masking
- `with_qk_lnorm = True` = apply layer normalization to queries and keys (see the sketch below)
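
For context, a minimal, generic sketch of query/key layer normalization inside an attention block; this illustrates the technique, not AtmoRep's actual attention implementation:

```python
# Generic query/key layer normalization in attention; illustrative only.
import torch
import torch.nn as nn

class QKNormAttention(nn.Module):
    def __init__(self, dim: int, num_heads: int, with_qk_lnorm: bool = True):
        super().__init__()
        self.num_heads, self.head_dim = num_heads, dim // num_heads
        self.qkv = nn.Linear(dim, 3 * dim)
        self.proj = nn.Linear(dim, dim)
        # LayerNorm applied per head to queries and keys before the dot product
        self.lnorm_q = nn.LayerNorm(self.head_dim) if with_qk_lnorm else nn.Identity()
        self.lnorm_k = nn.LayerNorm(self.head_dim) if with_qk_lnorm else nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, n, d = x.shape
        q, k, v = self.qkv(x).reshape(b, n, 3, self.num_heads, self.head_dim).unbind(2)
        q, k = self.lnorm_q(q), self.lnorm_k(k)           # normalize queries and keys
        q, k, v = (t.transpose(1, 2) for t in (q, k, v))  # -> (b, heads, n, head_dim)
        attn = (q @ k.transpose(-2, -1)) * self.head_dim ** -0.5
        out = (attn.softmax(dim=-1) @ v).transpose(1, 2).reshape(b, n, d)
        return self.proj(out)
```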
- `encoder_num_layers = 10` = number of layers in the encoder
- `encoder_num_heads = 16` = number of attention heads in the encoder
- `encoder_num_mlp_layers = 2` = number of MLP layers in the encoder
- `encoder_att_type = 'dense'` = attention type for the encoder
- `decoder_num_layers = 10` = number of layers in the decoder
- `decoder_num_heads = 16` = number of attention heads in the decoder
- `decoder_num_mlp_layers = 2` = number of MLP layers in the decoder
- `decoder_self_att = False` = use self attention in the decoder
- `decoder_cross_att_ratio = 0.5` = cross attention ratio in the decoder
- `decoder_cross_att_rate = 1.0` = cross attention rate in the decoder
- `decoder_att_type = 'dense'` = attention type for the decoder
- `net_tail_num_nets = 16` = number of tail networks (aka number of ensemble members)
- `net_tail_num_layers = 0` = number of layers in the tail networks
- `losses = ['mse_ensemble', 'stats']` = list with a combination of losses (the total is the sum of the individual terms); see Trainer for supported losses
- `optimizer_zero = False` = use AdamW (False) or ZeroRedundancyOptimizer (True)
- `lr_start = 5. * 10e-7` = initial learning rate
- `lr_max = 0.00005` = maximum learning rate
- `lr_min = 0.00004` = minimum learning rate
- `weight_decay = 0.05` = weight decay parameter
- `lr_decay_rate = 1.025` = decay rate of the learning rate
- `lr_start_epochs = 3` = first epoch at which the learning rate is changed (see the sketch below for one reading of the schedule)
- `lat_sampling_weighted = True` = apply weighted latitude sampling to sample less at the poles than at the equator (reduces pole distortions)
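
One plausible reading of how the learning-rate parameters combine: a warm-up from `lr_start` towards `lr_max` over the first `lr_start_epochs` epochs, then a multiplicative decay by `lr_decay_rate` per epoch, floored at `lr_min`. This is a sketch of the apparent semantics, not AtmoRep's Trainer code:

```python
# Hedged sketch of the learning-rate schedule suggested by the
# parameter names; the actual Trainer implementation may differ.
lr_start, lr_max, lr_min = 5. * 10e-7, 0.00005, 0.00004
lr_decay_rate, lr_start_epochs = 1.025, 3

def learning_rate_for_epoch(epoch: int) -> float:
    if epoch < lr_start_epochs:
        # linear warm-up from lr_start to lr_max
        return lr_start + (lr_max - lr_start) * epoch / lr_start_epochs
    # multiplicative decay towards lr_min
    return max(lr_max / lr_decay_rate ** (epoch - lr_start_epochs), lr_min)
```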
- `BERT_strategy = 'BERT'` = training strategy (see Trainer for details). Strategies: `BERT`, `forecast`, `temporal_interpolation`, `identity` (see the sketch below)
- `BERT_fields_synced = False` = apply synchronized / identical masking to all fields (fields need to have the same BERT parameters for this to have an effect)
- `BERT_mr_max = 2` = maximum reduction rate for resolution
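
To make the masking rates from the field specification concrete under the `BERT` strategy, a hypothetical per-token assignment; note that the example sub-rates (0.9 + 0.1 + 0.05 = 1.05) sum to slightly more than one, so this sketch treats them as relative weights, which is our assumption, not documented behaviour:

```python
# Hypothetical per-token mask-type assignment for the BERT strategy.
# Assumption: the three sub-rates act as relative weights among the
# masked tokens (the example values sum to 1.05, not exactly 1).
import random

def assign_mask_types(n_tokens: int, rates):
    total_rate, r_mask, r_noise, r_multires = rates
    modes = []
    for _ in range(n_tokens):
        if random.random() < total_rate:   # is this token masked at all?
            modes.append(random.choices(['zero', 'noise', 'multires'],
                                        weights=[r_mask, r_noise, r_multires])[0])
        else:
            modes.append('keep')
    return modes

print(assign_mask_types(10, [0.5, 0.9, 0.1, 0.05]))
```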
- `log_test_num_ranks = 0` = threshold for logging; logging happens only if `cf.par_rank < cf.log_test_num_ranks`
- `save_grads = False` = save the gradients
- `profile = False` = do profiling (TODO: check if it still works)
- `test_initial = True` = compute the test loss before training (sanity check)
- `attention = False` = store the attention maps in a file `XXXX_attention.zarr`
- `rng_seed = None` = random seed initialisation
- `with_wandb = True` = use Weights & Biases as logger (highly recommended); use `wandb offline` to disable syncing with the server
- `with_mixed_precision = True` = use mixed precision
- `n_size = [36, 0.25*9*6, 0.25*9*12]` = (new!) controls the source size: [time steps, latitude steps, longitude steps]; redundant with respect to `num_tokens` and `token_size` above (see the consistency check below)
- `num_samples_per_epoch = 1024` = (new!) number of samples per epoch
- `num_samples_validate = 128` = (new!) number of samples used in the validation step
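
The example value of `n_size` appears to decompose as grid resolution x token size x number of tokens in each spatial dimension, and token size x number of tokens in time; a quick consistency check under that assumption (the decomposition is our reading, not documented):

```python
# Hedged consistency check of n_size against num_tokens and token_size,
# assuming a 0.25-degree grid; the decomposition is our assumption.
res_deg = 0.25
num_tokens = [12, 6, 12]   # [time, lat, lon]
token_size = [3, 9, 9]     # [time, lat, lon]

n_size = [token_size[0] * num_tokens[0],
          res_deg * token_size[1] * num_tokens[1],
          res_deg * token_size[2] * num_tokens[2]]
print(n_size)  # [36, 13.5, 27.0] = [36, 0.25*9*6, 0.25*9*12]
```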
The AtmoRep Collaboration - last update: April 2024