1717from contextlib import contextmanager , nullcontext
1818from dataclasses import dataclass
1919from enum import Enum
20- from typing import Dict , List , Optional , Set , Tuple , Union
20+ from typing import Callable , Dict , List , Optional , Set , Tuple , Union
2121
2222import safetensors .torch
2323import torch
@@ -60,6 +60,7 @@ class GroupOffloadingConfig:
6060 offload_to_disk_path : Optional [str ] = None
6161 stream : Optional [Union [torch .cuda .Stream , torch .Stream ]] = None
6262 block_modules : Optional [List [str ]] = None
63+ pin_groups : Optional [Union [str , Callable ]] = None
6364
6465
6566class ModuleGroup :
@@ -92,6 +93,7 @@ def __init__(
9293 self .record_stream = record_stream
9394 self .onload_self = onload_self
9495 self .low_cpu_mem_usage = low_cpu_mem_usage
96+ self .pinned = False
9597
9698 self .offload_to_disk_path = offload_to_disk_path
9799 self ._is_offloaded_to_disk = False
@@ -297,6 +299,24 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
297299 if self .group .onload_leader is None :
298300 self .group .onload_leader = module
299301
302+ if self .group .pinned :
303+ if self .group .onload_leader == module and not self ._is_group_on_device ():
304+ self .group .onload_ ()
305+
306+ should_onload_next_group = self .next_group is not None and not self .next_group .onload_self
307+ if should_onload_next_group :
308+ self .next_group .onload_ ()
309+
310+ should_synchronize = (
311+ not self .group .onload_self and self .group .stream is not None and not should_onload_next_group
312+ )
313+ if should_synchronize :
314+ self .group .stream .synchronize ()
315+
316+ args = send_to_device (args , self .group .onload_device , non_blocking = self .group .non_blocking )
317+ kwargs = send_to_device (kwargs , self .group .onload_device , non_blocking = self .group .non_blocking )
318+ return args , kwargs
319+
300320 # If the current module is the onload_leader of the group, we onload the group if it is supposed
301321 # to onload itself. In the case of using prefetching with streams, we onload the next group if
302322 # it is not supposed to onload itself.
@@ -325,10 +345,26 @@ def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
325345 return args , kwargs
326346
def post_forward(self, module: torch.nn.Module, output):
    """Offload the group after the forward pass, unless it is pinned.

    Pinned groups are kept resident on the onload device permanently, so
    they are never offloaded here. Otherwise, offloading is triggered only
    by the group's designated offload leader module.
    """
    group = self.group
    # A pinned group stays on-device for the lifetime of the model run.
    if group.pinned:
        return output
    # Only the offload leader is responsible for moving the group off-device.
    if module is group.offload_leader or module == group.offload_leader:
        group.offload_()
    return output
331354
355+ def _is_group_on_device (self ) -> bool :
356+ tensors = []
357+ for group_module in self .group .modules :
358+ tensors .extend (list (group_module .parameters ()))
359+ tensors .extend (list (group_module .buffers ()))
360+ tensors .extend (self .group .parameters )
361+ tensors .extend (self .group .buffers )
362+
363+ if len (tensors ) == 0 :
364+ return True
365+
366+ return all (t .device == self .group .onload_device for t in tensors )
367+
332368
333369class LazyPrefetchGroupOffloadingHook (ModelHook ):
334370 r"""
@@ -424,6 +460,51 @@ def post_forward(self, module, output):
424460 group_offloading_hooks [i ].next_group = group_offloading_hooks [i + 1 ].group
425461 group_offloading_hooks [i ].next_group .onload_self = False
426462
463+ pin_groups = getattr (base_module_registry , "_group_offload_pin_groups" , None )
464+ if pin_groups is not None and num_executed > 0 :
465+ param_exec_info = []
466+ for idx , ((name , submodule ), hook ) in enumerate (zip (self .execution_order , group_offloading_hooks )):
467+ if hook is None :
468+ continue
469+ if next (submodule .parameters (), None ) is None and next (submodule .buffers (), None ) is None :
470+ continue
471+ param_exec_info .append ((name , submodule , hook ))
472+
473+ num_param_modules = len (param_exec_info )
474+ if num_param_modules > 0 :
475+ pinned_indices = set ()
476+ if isinstance (pin_groups , str ):
477+ if pin_groups == "all" :
478+ pinned_indices = set (range (num_param_modules ))
479+ elif pin_groups == "first_last" :
480+ pinned_indices .add (0 )
481+ pinned_indices .add (num_param_modules - 1 )
482+ elif callable (pin_groups ):
483+ for idx , (name , submodule , _ ) in enumerate (param_exec_info ):
484+ should_pin = False
485+ try :
486+ should_pin = bool (pin_groups (submodule ))
487+ except TypeError :
488+ try :
489+ should_pin = bool (pin_groups (name , submodule ))
490+ except TypeError :
491+ should_pin = bool (pin_groups (name , submodule , idx ))
492+ if should_pin :
493+ pinned_indices .add (idx )
494+
495+ pinned_groups = set ()
496+ for idx in pinned_indices :
497+ if idx >= num_param_modules :
498+ continue
499+ group = param_exec_info [idx ][2 ].group
500+ if group not in pinned_groups :
501+ group .pinned = True
502+ pinned_groups .add (group )
503+
504+ for group in pinned_groups :
505+ if group .offload_device != group .onload_device :
506+ group .onload_ ()
507+
427508 return output
428509
429510
@@ -455,6 +536,8 @@ def apply_group_offloading(
455536 low_cpu_mem_usage : bool = False ,
456537 offload_to_disk_path : Optional [str ] = None ,
457538 block_modules : Optional [List [str ]] = None ,
539+ pin_groups : Optional [Union [str , Callable ]] = None ,
540+ pin_first_last : bool = False ,
458541) -> None :
459542 r"""
460543 Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is, and
@@ -515,6 +598,12 @@ def apply_group_offloading(
515598 block_modules (`List[str]`, *optional*):
516599 List of module names that should be treated as blocks for offloading. If provided, only these modules
517600 will be considered for block-level offloading. If not provided, the default block detection logic will be used.
601+ pin_groups (`"first_last"` or `"all"` or `Callable`, *optional*, defaults to `None`):
602+ Optionally keeps selected groups on the onload device permanently. Use `"first_last"` to pin the first
603+ and last parameter-bearing groups, `"all"` to pin every parameter-bearing group, or pass a callable that
604+ receives a module (and optionally the module name and index) and returns `True` to pin that group.
605+ pin_first_last (`bool`, *optional*, defaults to `False`):
606+ Deprecated alias for `pin_groups="first_last"`.
518607
519608 Example:
520609 ```python
@@ -554,7 +643,24 @@ def apply_group_offloading(
554643 if offload_type == GroupOffloadingType .BLOCK_LEVEL and num_blocks_per_group is None :
555644 raise ValueError ("`num_blocks_per_group` must be provided when using `offload_type='block_level'." )
556645
646+ if pin_first_last :
647+ if pin_groups is not None and pin_groups != "first_last" :
648+ raise ValueError ("`pin_first_last` cannot be combined with a different `pin_groups` setting." )
649+ pin_groups = "first_last"
650+
651+ normalized_pin_groups = pin_groups
652+ if isinstance (pin_groups , str ):
653+ normalized_pin_groups = pin_groups .lower ()
654+ if normalized_pin_groups not in {"first_last" , "all" }:
655+ raise ValueError ("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable." )
656+ elif pin_groups is not None and not callable (pin_groups ):
657+ raise ValueError ("`pin_groups` must be one of `None`, 'first_last', 'all', or a callable." )
658+
659+ pin_groups = normalized_pin_groups
660+
557661 _raise_error_if_accelerate_model_or_sequential_hook_present (module )
662+ registry = HookRegistry .check_if_exists_or_initialize (module )
663+ registry ._group_offload_pin_groups = pin_groups
558664
559665 config = GroupOffloadingConfig (
560666 onload_device = onload_device ,
@@ -567,11 +673,15 @@ def apply_group_offloading(
567673 low_cpu_mem_usage = low_cpu_mem_usage ,
568674 offload_to_disk_path = offload_to_disk_path ,
569675 block_modules = block_modules ,
676+ pin_groups = pin_groups ,
570677 )
571678 _apply_group_offloading (module , config )
572679
573680
574681def _apply_group_offloading (module : torch .nn .Module , config : GroupOffloadingConfig ) -> None :
682+ registry = HookRegistry .check_if_exists_or_initialize (module )
683+ registry ._group_offload_pin_groups = config .pin_groups
684+
575685 if config .offload_type == GroupOffloadingType .BLOCK_LEVEL :
576686 _apply_group_offloading_block_level (module , config )
577687 elif config .offload_type == GroupOffloadingType .LEAF_LEVEL :
0 commit comments