From 6c5bb72dace8fc364b658fdc851ef768974b3dd6 Mon Sep 17 00:00:00 2001
From: Sam Adam-Day
Date: Wed, 25 Jun 2025 18:45:40 +0100
Subject: [PATCH] Enabled optimiser and param offloading

---
 CHANGELOG.md                                  |  1 +
 nip/language_model_server/config.py           | 12 +++++++++
 .../templates/accelerate_config.yaml.jinja2   | 10 ++++----
 nip/language_model_server/trainer_handler.py  | 25 ++++++++++++++++++-
 4 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d89c834..8708f35 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,6 +38,7 @@ to communicate reliably.
 - Ability to set vLLM max LoRA rank automatically.
 - Agent-level hyper-parameter to enable quantisation for self-hosted models.
 - Using Liger kernel in DPO training for increased speed and lower memory usage.
+- Enabled optimiser and parameter offloading for DPO training.
 
 ### Changed
 
diff --git a/nip/language_model_server/config.py b/nip/language_model_server/config.py
index ced0fd1..9da219d 100644
--- a/nip/language_model_server/config.py
+++ b/nip/language_model_server/config.py
@@ -75,6 +75,18 @@ class Settings(
     against the template directory: ``nip/language_model_server/templates/``.
     """
 
+    offload_optimizer: bool = False
+    """Whether to offload the optimizer state to CPU memory when training.
+
+    This can reduce GPU memory usage, but may slow down training.
+    """
+
+    offload_parameters: bool = False
+    """Whether to offload the model parameters to CPU memory when training.
+
+    This can reduce GPU memory usage, but may slow down training.
+    """
+
     parent_script_cwd: CliSuppress[str | None] = None
     """Path to the working directory of the script which called this process.
 
diff --git a/nip/language_model_server/templates/accelerate_config.yaml.jinja2 b/nip/language_model_server/templates/accelerate_config.yaml.jinja2
index 0c38362..c6c36d5 100644
--- a/nip/language_model_server/templates/accelerate_config.yaml.jinja2
+++ b/nip/language_model_server/templates/accelerate_config.yaml.jinja2
@@ -2,17 +2,17 @@ compute_environment: LOCAL_MACHINE
 debug: false
 deepspeed_config:
   gradient_accumulation_steps: 1
-  offload_optimizer_device: none
-  offload_param_device: none
+  offload_optimizer_device: '{{ offload_optimizer_device }}'
+  offload_param_device: '{{ offload_param_device }}'
   zero3_init_flag: false
   zero_stage: 2
-  mixed_precision: {{ mixed_precision }}
-distributed_type: DEEPSPEED
+  mixed_precision: '{{ mixed_precision }}'
+distributed_type: '{{ distributed_type }}'
 downcast_bf16: 'no'
 enable_cpu_affinity: false
 machine_rank: 0
 main_training_function: main
-mixed_precision: {{ mixed_precision }}
+mixed_precision: '{{ mixed_precision }}'
 num_machines: 1
 num_processes: {{ num_gpus }}
 rdzv_backend: static
diff --git a/nip/language_model_server/trainer_handler.py b/nip/language_model_server/trainer_handler.py
index ff32bfd..2c4b591 100644
--- a/nip/language_model_server/trainer_handler.py
+++ b/nip/language_model_server/trainer_handler.py
@@ -448,9 +448,32 @@ def _get_accelerate_config_path(self) -> Optional[Path]:
         else:
             mixed_precision = "fp16"
 
+        # For multi-GPU training, or when offloading the optimizer state or model
+        # parameters, use DeepSpeed as the distributed type. Otherwise we skip
+        # distributed training, because it slows down training.
+        if (
+            num_gpus > 1
+            or self.settings.offload_optimizer
+            or self.settings.offload_parameters
+        ):
+            distributed_type = "DEEPSPEED"
+        else:
+            distributed_type = "NO"
+
+        offload_optimizer_device = "cpu" if self.settings.offload_optimizer else "none"
+        offload_param_device = "cpu" if self.settings.offload_parameters else "none"
+
         rendered_path = self.temporary_directory_path.joinpath("accelerate_config.yaml")
 
         with open(rendered_path, "w") as f:
-            f.write(template.render(num_gpus=num_gpus, mixed_precision=mixed_precision))
+            f.write(
+                template.render(
+                    num_gpus=num_gpus,
+                    mixed_precision=mixed_precision,
+                    distributed_type=distributed_type,
+                    offload_optimizer_device=offload_optimizer_device,
+                    offload_param_device=offload_param_device,
+                )
+            )
 
         return rendered_path
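
Note, not part of the patch: a minimal sketch of how the new template variables could be exercised directly, assuming jinja2 is installed, the snippet is run from the repository root, and the template uses only the variables passed by the trainer handler. The values mirror what trainer_handler.py would compute when only offload_optimizer is enabled on a single GPU; "bf16" is just an example mixed-precision value.

from jinja2 import Environment, FileSystemLoader

# Load the template from the directory referenced in config.py (assumes the
# snippet is run from the repository root).
env = Environment(loader=FileSystemLoader("nip/language_model_server/templates"))
template = env.get_template("accelerate_config.yaml.jinja2")

# Example values matching _get_accelerate_config_path with
# offload_optimizer=True and offload_parameters=False on one GPU.
rendered = template.render(
    num_gpus=1,
    mixed_precision="bf16",
    distributed_type="DEEPSPEED",
    offload_optimizer_device="cpu",
    offload_param_device="none",
)

# The deepspeed_config section should now send optimizer state to CPU memory
# while leaving the parameters on the GPU.
assert "offload_optimizer_device: 'cpu'" in rendered
assert "offload_param_device: 'none'" in rendered
print(rendered)

With these inputs the rendered deepspeed_config section reads offload_optimizer_device: 'cpu' and offload_param_device: 'none', and distributed_type is 'DEEPSPEED' because enabling either offload option forces DeepSpeed in _get_accelerate_config_path.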