From eeea82ad04c3e9d8118e792019f6461021c1513c Mon Sep 17 00:00:00 2001 From: CrossProduct Date: Tue, 7 Jan 2025 20:27:19 +0000 Subject: [PATCH 01/15] Full Finetuning for LTX possibily extended to other models. --- finetrainers/args.py | 2 +- finetrainers/ltx_video/ltx_video_lora.py | 15 +++++ finetrainers/models.py | 3 +- finetrainers/trainer.py | 71 +++++++++++++++--------- 4 files changed, 62 insertions(+), 29 deletions(-) diff --git a/finetrainers/args.py b/finetrainers/args.py index d1c0715..5ca21b5 100644 --- a/finetrainers/args.py +++ b/finetrainers/args.py @@ -455,7 +455,7 @@ def _add_training_arguments(parser: argparse.ArgumentParser) -> None: "--training_type", type=str, default=None, - help="Type of training to perform. Choose between ['lora']", + help="Type of training to perform. Choose between ['lora','finetune']", ) parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( diff --git a/finetrainers/ltx_video/ltx_video_lora.py b/finetrainers/ltx_video/ltx_video_lora.py index 0e1af9b..4b14533 100644 --- a/finetrainers/ltx_video/ltx_video_lora.py +++ b/finetrainers/ltx_video/ltx_video_lora.py @@ -321,3 +321,18 @@ def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int "forward_pass": forward_pass, "validation": validation, } + +LTX_VIDEO_T2V_FT_CONFIG = { + "pipeline_cls": LTXPipeline, + "load_condition_models": load_condition_models, + "load_latent_models": load_latent_models, + "load_diffusion_models": load_diffusion_models, + "initialize_pipeline": initialize_pipeline, + "prepare_conditions": prepare_conditions, + "prepare_latents": prepare_latents, + "post_latent_preparation": post_latent_preparation, + "collate_fn": collate_fn_t2v, + "forward_pass": forward_pass, + "validation": validation, +} + diff --git a/finetrainers/models.py b/finetrainers/models.py index c7d95ae..dab4de3 100644 --- a/finetrainers/models.py +++ b/finetrainers/models.py @@ -2,7 +2,7 @@ from .cogvideox import COGVIDEOX_T2V_LORA_CONFIG from .hunyuan_video import HUNYUAN_VIDEO_T2V_LORA_CONFIG -from .ltx_video import LTX_VIDEO_T2V_LORA_CONFIG +from .ltx_video import LTX_VIDEO_T2V_LORA_CONFIG, LTX_VIDEO_T2V_FT_CONFIG SUPPORTED_MODEL_CONFIGS = { @@ -11,6 +11,7 @@ }, "ltx_video": { "lora": LTX_VIDEO_T2V_LORA_CONFIG, + "finetune": LTX_VIDEO_T2V_FT_CONFIG, }, "cogvideox": { "lora": COGVIDEOX_T2V_LORA_CONFIG, diff --git a/finetrainers/trainer.py b/finetrainers/trainer.py index 8a9cc01..6716f78 100644 --- a/finetrainers/trainer.py +++ b/finetrainers/trainer.py @@ -99,6 +99,8 @@ def __init__(self, args: Args) -> None: self.state.model_name = self.args.model_name self.model_config = get_config_from_model_name(self.args.model_name, self.args.training_type) + # Components list + self.components = [] def prepare_dataset(self) -> None: # TODO(aryan): Make a background process for fetching logger.info("Initializing dataset and dataloader") @@ -153,6 +155,17 @@ def _set_components(self, components: Dict[str, Any]) -> None: self.transformer_config = self.transformer.config if self.transformer is not None else self.transformer_config self.vae_config = self.vae.config if self.vae is not None else self.vae_config + self.components = [self.tokenizer, + self.tokenizer_2, + self.tokenizer_3, + self.text_encoder, + self.text_encoder_2, + self.text_encoder_3, + self.transformer, + self.unet, + self.vae] + + def _delete_components(self) -> None: self.tokenizer = None self.tokenizer_2 = None @@ -167,6 +180,8 @@ def _delete_components(self) -> None: 
free_memory() torch.cuda.synchronize(self.state.accelerator.device) + self.components = None + def prepare_models(self) -> None: logger.info("Initializing models") @@ -189,6 +204,16 @@ def prepare_models(self) -> None: if self.args.enable_tiling: self.vae.enable_tiling() + def _disable_grad_for_components(self, components:list): + for component in components: + if component is not None: + component.requires_grad_(False) + + def _enable_grad_for_components(self, components:list): + for component in components: + if component is not None: + component.requires_grad_(True) + def prepare_precomputations(self) -> None: if not self.args.precompute_conditions: return @@ -237,16 +262,11 @@ def collate_fn(batch): self._set_components(condition_components) self._move_components_to_device() - # TODO(aryan): refactor later. for now only lora is supported - components_to_disable_grads = [ + self._disable_grad_for_components(components=[ self.text_encoder, self.text_encoder_2, self.text_encoder_3, - ] - for component in components_to_disable_grads: - if component is not None: - component.requires_grad_(False) - + ]) if self.args.caption_dropout_p > 0 and self.args.caption_dropout_technique == "empty": logger.warning( "Caption dropout is not supported with precomputation yet. This will be supported in the future." @@ -300,12 +320,7 @@ def collate_fn(batch): self._set_components(latent_components) self._move_components_to_device() - # TODO(aryan): refactor later - components_to_disable_grads = [self.vae] - for component in components_to_disable_grads: - if component is not None: - component.requires_grad_(False) - + self._disable_grad_for_components(components=[self.vae]) if self.vae is not None: if self.args.enable_slicing: self.vae.enable_slicing() @@ -363,17 +378,18 @@ def prepare_trainable_parameters(self) -> None: diffusion_components = self.model_config["load_diffusion_models"](**self._get_load_components_kwargs()) self._set_components(diffusion_components) - # TODO(aryan): refactor later. for now only lora is supported - components_to_disable_grads = [ + self._disable_grad_for_components(components=[ self.text_encoder, self.text_encoder_2, self.text_encoder_3, - self.transformer, self.vae, - ] - for component in components_to_disable_grads: - if component is not None: - component.requires_grad_(False) + ]) + + if self.args.training_type == "full_finetune": + logger.info("Full Fine Tuning Enabled") + self._enable_grad_for_components(components=[self.transformer]) + else: + logger.info("Lora Fine Tuning Enabled") # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision # as these weights are only used for inference, keeping weights in full precision is not required. 
@@ -398,13 +414,14 @@ def prepare_trainable_parameters(self) -> None: if self.args.gradient_checkpointing: self.transformer.enable_gradient_checkpointing() - transformer_lora_config = LoraConfig( - r=self.args.rank, - lora_alpha=self.args.lora_alpha, - init_lora_weights=True, - target_modules=self.args.target_modules, - ) - self.transformer.add_adapter(transformer_lora_config) + if self.args.training_type == "lora": + transformer_lora_config = LoraConfig( + r=self.args.rank, + lora_alpha=self.args.lora_alpha, + init_lora_weights=True, + target_modules=self.args.target_modules, + ) + self.transformer.add_adapter(transformer_lora_config) # Enable TF32 for faster training on Ampere GPUs: https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices if self.args.allow_tf32 and torch.cuda.is_available(): From f0db0ccb009741c82d24547ee4bcdc1391f93b3f Mon Sep 17 00:00:00 2001 From: CrossProduct Date: Tue, 7 Jan 2025 20:36:03 +0000 Subject: [PATCH 02/15] Change name of the flag --- finetrainers/args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finetrainers/args.py b/finetrainers/args.py index 5ca21b5..224b9f1 100644 --- a/finetrainers/args.py +++ b/finetrainers/args.py @@ -455,7 +455,7 @@ def _add_training_arguments(parser: argparse.ArgumentParser) -> None: "--training_type", type=str, default=None, - help="Type of training to perform. Choose between ['lora','finetune']", + help="Type of training to perform. Choose between ['lora','full_finetune']", ) parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( From 4cd5a8e53b76523544dafc221ae58bf79f7f0fa0 Mon Sep 17 00:00:00 2001 From: CrossProduct Date: Tue, 7 Jan 2025 20:41:12 +0000 Subject: [PATCH 03/15] Used disable grad for component on lora fine tuning enabled --- finetrainers/trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/finetrainers/trainer.py b/finetrainers/trainer.py index 6716f78..e948411 100644 --- a/finetrainers/trainer.py +++ b/finetrainers/trainer.py @@ -390,7 +390,8 @@ def prepare_trainable_parameters(self) -> None: self._enable_grad_for_components(components=[self.transformer]) else: logger.info("Lora Fine Tuning Enabled") - + self._disable_grad_for_components(components=[self.transformer]) + # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = self._get_training_dtype(accelerator=self.state.accelerator) From cb9381b40efacb8ce3678a9a90bc48fe0b261cf8 Mon Sep 17 00:00:00 2001 From: CrossProduct Date: Tue, 7 Jan 2025 22:59:09 +0000 Subject: [PATCH 04/15] Suggestions Addressed Renamed to SFT Added 2 other models. Testing required. --- finetrainers/args.py | 2 +- finetrainers/cogvideox/cogvideox_lora.py | 15 +++++ .../hunyuan_video/hunyuan_video_lora.py | 14 +++++ finetrainers/ltx_video/ltx_video_lora.py | 3 +- finetrainers/models.py | 10 ++-- finetrainers/trainer.py | 58 ++++++++++--------- train.py | 2 - 7 files changed, 69 insertions(+), 35 deletions(-) diff --git a/finetrainers/args.py b/finetrainers/args.py index 224b9f1..bcd0076 100644 --- a/finetrainers/args.py +++ b/finetrainers/args.py @@ -455,7 +455,7 @@ def _add_training_arguments(parser: argparse.ArgumentParser) -> None: "--training_type", type=str, default=None, - help="Type of training to perform. 
Choose between ['lora','full_finetune']", + help="Type of training to perform. Choose between ['lora','sft']", ) parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( diff --git a/finetrainers/cogvideox/cogvideox_lora.py b/finetrainers/cogvideox/cogvideox_lora.py index c3b754a..a536cab 100644 --- a/finetrainers/cogvideox/cogvideox_lora.py +++ b/finetrainers/cogvideox/cogvideox_lora.py @@ -325,3 +325,18 @@ def _pad_frames(latents: torch.Tensor, patch_size_t: int): "forward_pass": forward_pass, "validation": validation, } + +COGVIDEOX_T2V_SFT_CONFIG = { + "pipeline_cls": CogVideoXPipeline, + "load_condition_models": load_condition_models, + "load_latent_models": load_latent_models, + "load_diffusion_models": load_diffusion_models, + "initialize_pipeline": initialize_pipeline, + "prepare_conditions": prepare_conditions, + "prepare_latents": prepare_latents, + "post_latent_preparation": post_latent_preparation, + "collate_fn": collate_fn_t2v, + "calculate_noisy_latents": calculate_noisy_latents, + "forward_pass": forward_pass, + "validation": validation, +} diff --git a/finetrainers/hunyuan_video/hunyuan_video_lora.py b/finetrainers/hunyuan_video/hunyuan_video_lora.py index 9bfea53..34ce576 100644 --- a/finetrainers/hunyuan_video/hunyuan_video_lora.py +++ b/finetrainers/hunyuan_video/hunyuan_video_lora.py @@ -358,3 +358,17 @@ def _get_clip_prompt_embeds( "forward_pass": forward_pass, "validation": validation, } + +HUNYUAN_VIDEO_T2V_SFT_CONFIG = { + "pipeline_cls": HunyuanVideoPipeline, + "load_condition_models": load_condition_models, + "load_latent_models": load_latent_models, + "load_diffusion_models": load_diffusion_models, + "initialize_pipeline": initialize_pipeline, + "prepare_conditions": prepare_conditions, + "prepare_latents": prepare_latents, + "post_latent_preparation": post_latent_preparation, + "collate_fn": collate_fn_t2v, + "forward_pass": forward_pass, + "validation": validation, +} diff --git a/finetrainers/ltx_video/ltx_video_lora.py b/finetrainers/ltx_video/ltx_video_lora.py index 4b14533..77ec50e 100644 --- a/finetrainers/ltx_video/ltx_video_lora.py +++ b/finetrainers/ltx_video/ltx_video_lora.py @@ -322,7 +322,7 @@ def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int "validation": validation, } -LTX_VIDEO_T2V_FT_CONFIG = { +LTX_VIDEO_T2V_SFT_CONFIG = { "pipeline_cls": LTXPipeline, "load_condition_models": load_condition_models, "load_latent_models": load_latent_models, @@ -335,4 +335,3 @@ def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int "forward_pass": forward_pass, "validation": validation, } - diff --git a/finetrainers/models.py b/finetrainers/models.py index dab4de3..e753c4a 100644 --- a/finetrainers/models.py +++ b/finetrainers/models.py @@ -1,20 +1,22 @@ from typing import Any, Dict -from .cogvideox import COGVIDEOX_T2V_LORA_CONFIG -from .hunyuan_video import HUNYUAN_VIDEO_T2V_LORA_CONFIG -from .ltx_video import LTX_VIDEO_T2V_LORA_CONFIG, LTX_VIDEO_T2V_FT_CONFIG +from .cogvideox import COGVIDEOX_T2V_LORA_CONFIG, COGVIDEOX_T2V_SFT_CONFIG +from .hunyuan_video import HUNYUAN_VIDEO_T2V_LORA_CONFIG, HUNYUAN_VIDEO_T2V_SFT_CONFIG +from .ltx_video import LTX_VIDEO_T2V_LORA_CONFIG, LTX_VIDEO_T2V_SFT_CONFIG SUPPORTED_MODEL_CONFIGS = { "hunyuan_video": { "lora": HUNYUAN_VIDEO_T2V_LORA_CONFIG, + "sft": HUNYUAN_VIDEO_T2V_SFT_CONFIG, }, "ltx_video": { "lora": LTX_VIDEO_T2V_LORA_CONFIG, - "finetune": LTX_VIDEO_T2V_FT_CONFIG, + "sft": LTX_VIDEO_T2V_SFT_CONFIG, }, 
"cogvideox": { "lora": COGVIDEOX_T2V_LORA_CONFIG, + "sft": COGVIDEOX_T2V_SFT_CONFIG, }, } diff --git a/finetrainers/trainer.py b/finetrainers/trainer.py index e948411..bfcbaeb 100644 --- a/finetrainers/trainer.py +++ b/finetrainers/trainer.py @@ -101,6 +101,7 @@ def __init__(self, args: Args) -> None: # Components list self.components = [] + def prepare_dataset(self) -> None: # TODO(aryan): Make a background process for fetching logger.info("Initializing dataset and dataloader") @@ -155,16 +156,17 @@ def _set_components(self, components: Dict[str, Any]) -> None: self.transformer_config = self.transformer.config if self.transformer is not None else self.transformer_config self.vae_config = self.vae.config if self.vae is not None else self.vae_config - self.components = [self.tokenizer, - self.tokenizer_2, - self.tokenizer_3, - self.text_encoder, - self.text_encoder_2, - self.text_encoder_3, - self.transformer, - self.unet, - self.vae] - + self.components = [ + self.tokenizer, + self.tokenizer_2, + self.tokenizer_3, + self.text_encoder, + self.text_encoder_2, + self.text_encoder_3, + self.transformer, + self.unet, + self.vae, + ] def _delete_components(self) -> None: self.tokenizer = None @@ -204,12 +206,12 @@ def prepare_models(self) -> None: if self.args.enable_tiling: self.vae.enable_tiling() - def _disable_grad_for_components(self, components:list): + def _disable_grad_for_components(self, components: list): for component in components: if component is not None: component.requires_grad_(False) - def _enable_grad_for_components(self, components:list): + def _enable_grad_for_components(self, components: list): for component in components: if component is not None: component.requires_grad_(True) @@ -262,11 +264,13 @@ def collate_fn(batch): self._set_components(condition_components) self._move_components_to_device() - self._disable_grad_for_components(components=[ - self.text_encoder, - self.text_encoder_2, - self.text_encoder_3, - ]) + self._disable_grad_for_components( + components=[ + self.text_encoder, + self.text_encoder_2, + self.text_encoder_3, + ] + ) if self.args.caption_dropout_p > 0 and self.args.caption_dropout_technique == "empty": logger.warning( "Caption dropout is not supported with precomputation yet. This will be supported in the future." @@ -378,20 +382,22 @@ def prepare_trainable_parameters(self) -> None: diffusion_components = self.model_config["load_diffusion_models"](**self._get_load_components_kwargs()) self._set_components(diffusion_components) - self._disable_grad_for_components(components=[ - self.text_encoder, - self.text_encoder_2, - self.text_encoder_3, - self.vae, - ]) - - if self.args.training_type == "full_finetune": + self._disable_grad_for_components( + components=[ + self.text_encoder, + self.text_encoder_2, + self.text_encoder_3, + self.vae, + ] + ) + + if self.args.training_type == "sft": logger.info("Full Fine Tuning Enabled") self._enable_grad_for_components(components=[self.transformer]) else: logger.info("Lora Fine Tuning Enabled") self._disable_grad_for_components(components=[self.transformer]) - + # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision # as these weights are only used for inference, keeping weights in full precision is not required. 
weight_dtype = self._get_training_dtype(accelerator=self.state.accelerator) diff --git a/train.py b/train.py index 088c061..32fe903 100644 --- a/train.py +++ b/train.py @@ -4,11 +4,9 @@ from finetrainers import Trainer, parse_arguments from finetrainers.constants import FINETRAINERS_LOG_LEVEL - logger = logging.getLogger("finetrainers") logger.setLevel(FINETRAINERS_LOG_LEVEL) - def main(): try: import multiprocessing From d0ee9c310161e12d8a89601b24d124dced0e4d81 Mon Sep 17 00:00:00 2001 From: CrossProduct Date: Thu, 9 Jan 2025 00:27:15 +0000 Subject: [PATCH 05/15] Switching to Full FineTuning --- finetrainers/cogvideox/__init__.py | 1 + finetrainers/cogvideox/cogvideox_lora.py | 2 +- finetrainers/hunyuan_video/__init__.py | 1 + finetrainers/hunyuan_video/hunyuan_video_lora.py | 2 +- finetrainers/ltx_video/__init__.py | 1 + finetrainers/ltx_video/ltx_video_lora.py | 2 +- finetrainers/models.py | 13 ++++++------- finetrainers/trainer.py | 2 +- 8 files changed, 13 insertions(+), 11 deletions(-) diff --git a/finetrainers/cogvideox/__init__.py b/finetrainers/cogvideox/__init__.py index 6a3f826..9a707b8 100644 --- a/finetrainers/cogvideox/__init__.py +++ b/finetrainers/cogvideox/__init__.py @@ -1 +1,2 @@ from .cogvideox_lora import COGVIDEOX_T2V_LORA_CONFIG +from .cogvideox_lora import COGVIDEOX_T2V_FULL_FINETUNE_CONFIG \ No newline at end of file diff --git a/finetrainers/cogvideox/cogvideox_lora.py b/finetrainers/cogvideox/cogvideox_lora.py index a536cab..63050f2 100644 --- a/finetrainers/cogvideox/cogvideox_lora.py +++ b/finetrainers/cogvideox/cogvideox_lora.py @@ -326,7 +326,7 @@ def _pad_frames(latents: torch.Tensor, patch_size_t: int): "validation": validation, } -COGVIDEOX_T2V_SFT_CONFIG = { +COGVIDEOX_T2V_FULL_FINETUNE_CONFIG = { "pipeline_cls": CogVideoXPipeline, "load_condition_models": load_condition_models, "load_latent_models": load_latent_models, diff --git a/finetrainers/hunyuan_video/__init__.py b/finetrainers/hunyuan_video/__init__.py index f4e780d..1ee2837 100644 --- a/finetrainers/hunyuan_video/__init__.py +++ b/finetrainers/hunyuan_video/__init__.py @@ -1 +1,2 @@ from .hunyuan_video_lora import HUNYUAN_VIDEO_T2V_LORA_CONFIG +from .hunyuan_video_lora import HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG \ No newline at end of file diff --git a/finetrainers/hunyuan_video/hunyuan_video_lora.py b/finetrainers/hunyuan_video/hunyuan_video_lora.py index 34ce576..d0d8bce 100644 --- a/finetrainers/hunyuan_video/hunyuan_video_lora.py +++ b/finetrainers/hunyuan_video/hunyuan_video_lora.py @@ -359,7 +359,7 @@ def _get_clip_prompt_embeds( "validation": validation, } -HUNYUAN_VIDEO_T2V_SFT_CONFIG = { +HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG = { "pipeline_cls": HunyuanVideoPipeline, "load_condition_models": load_condition_models, "load_latent_models": load_latent_models, diff --git a/finetrainers/ltx_video/__init__.py b/finetrainers/ltx_video/__init__.py index b583686..0476fc6 100644 --- a/finetrainers/ltx_video/__init__.py +++ b/finetrainers/ltx_video/__init__.py @@ -1 +1,2 @@ from .ltx_video_lora import LTX_VIDEO_T2V_LORA_CONFIG +from .ltx_video_lora import LTX_VIDEO_T2V_FULL_FINETUNE_CONFIG \ No newline at end of file diff --git a/finetrainers/ltx_video/ltx_video_lora.py b/finetrainers/ltx_video/ltx_video_lora.py index 77ec50e..86b60cf 100644 --- a/finetrainers/ltx_video/ltx_video_lora.py +++ b/finetrainers/ltx_video/ltx_video_lora.py @@ -322,7 +322,7 @@ def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int "validation": validation, } -LTX_VIDEO_T2V_SFT_CONFIG = { 
+LTX_VIDEO_T2V_FULL_FINETUNE_CONFIG = { "pipeline_cls": LTXPipeline, "load_condition_models": load_condition_models, "load_latent_models": load_latent_models, diff --git a/finetrainers/models.py b/finetrainers/models.py index e753c4a..d3ece75 100644 --- a/finetrainers/models.py +++ b/finetrainers/models.py @@ -1,22 +1,21 @@ from typing import Any, Dict -from .cogvideox import COGVIDEOX_T2V_LORA_CONFIG, COGVIDEOX_T2V_SFT_CONFIG -from .hunyuan_video import HUNYUAN_VIDEO_T2V_LORA_CONFIG, HUNYUAN_VIDEO_T2V_SFT_CONFIG -from .ltx_video import LTX_VIDEO_T2V_LORA_CONFIG, LTX_VIDEO_T2V_SFT_CONFIG - +from .cogvideox import COGVIDEOX_T2V_LORA_CONFIG, COGVIDEOX_T2V_FULL_FINETUNE_CONFIG +from .hunyuan_video import HUNYUAN_VIDEO_T2V_LORA_CONFIG, HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG +from .ltx_video import LTX_VIDEO_T2V_LORA_CONFIG, LTX_VIDEO_T2V_FULL_FINETUNE_CONFIG SUPPORTED_MODEL_CONFIGS = { "hunyuan_video": { "lora": HUNYUAN_VIDEO_T2V_LORA_CONFIG, - "sft": HUNYUAN_VIDEO_T2V_SFT_CONFIG, + "full-finetune": HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG, }, "ltx_video": { "lora": LTX_VIDEO_T2V_LORA_CONFIG, - "sft": LTX_VIDEO_T2V_SFT_CONFIG, + "full-finetune": LTX_VIDEO_T2V_FULL_FINETUNE_CONFIG, }, "cogvideox": { "lora": COGVIDEOX_T2V_LORA_CONFIG, - "sft": COGVIDEOX_T2V_SFT_CONFIG, + "full-finetune": COGVIDEOX_T2V_FULL_FINETUNE_CONFIG, }, } diff --git a/finetrainers/trainer.py b/finetrainers/trainer.py index d843387..b068c5a 100644 --- a/finetrainers/trainer.py +++ b/finetrainers/trainer.py @@ -395,7 +395,7 @@ def prepare_trainable_parameters(self) -> None: ] ) - if self.args.training_type == "sft": + if self.args.training_type == "full-finetune": logger.info("Full Fine Tuning Enabled") self._enable_grad_for_components(components=[self.transformer]) else: From 19bba0aed38d78b65571f775988834b4afc1a49c Mon Sep 17 00:00:00 2001 From: CrossProduct Date: Thu, 9 Jan 2025 00:39:20 +0000 Subject: [PATCH 06/15] Run linter. 
--- finetrainers/cogvideox/__init__.py | 3 +-- finetrainers/hunyuan_video/__init__.py | 3 +-- finetrainers/ltx_video/__init__.py | 3 +-- finetrainers/models.py | 7 ++++--- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/finetrainers/cogvideox/__init__.py b/finetrainers/cogvideox/__init__.py index 9a707b8..8ca65b7 100644 --- a/finetrainers/cogvideox/__init__.py +++ b/finetrainers/cogvideox/__init__.py @@ -1,2 +1 @@ -from .cogvideox_lora import COGVIDEOX_T2V_LORA_CONFIG -from .cogvideox_lora import COGVIDEOX_T2V_FULL_FINETUNE_CONFIG \ No newline at end of file +from .cogvideox_lora import COGVIDEOX_T2V_FULL_FINETUNE_CONFIG, COGVIDEOX_T2V_LORA_CONFIG diff --git a/finetrainers/hunyuan_video/__init__.py b/finetrainers/hunyuan_video/__init__.py index 1ee2837..1800b2e 100644 --- a/finetrainers/hunyuan_video/__init__.py +++ b/finetrainers/hunyuan_video/__init__.py @@ -1,2 +1 @@ -from .hunyuan_video_lora import HUNYUAN_VIDEO_T2V_LORA_CONFIG -from .hunyuan_video_lora import HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG \ No newline at end of file +from .hunyuan_video_lora import HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG, HUNYUAN_VIDEO_T2V_LORA_CONFIG diff --git a/finetrainers/ltx_video/__init__.py b/finetrainers/ltx_video/__init__.py index 0476fc6..0b33512 100644 --- a/finetrainers/ltx_video/__init__.py +++ b/finetrainers/ltx_video/__init__.py @@ -1,2 +1 @@ -from .ltx_video_lora import LTX_VIDEO_T2V_LORA_CONFIG -from .ltx_video_lora import LTX_VIDEO_T2V_FULL_FINETUNE_CONFIG \ No newline at end of file +from .ltx_video_lora import LTX_VIDEO_T2V_FULL_FINETUNE_CONFIG, LTX_VIDEO_T2V_LORA_CONFIG diff --git a/finetrainers/models.py b/finetrainers/models.py index d3ece75..c24ab95 100644 --- a/finetrainers/models.py +++ b/finetrainers/models.py @@ -1,8 +1,9 @@ from typing import Any, Dict -from .cogvideox import COGVIDEOX_T2V_LORA_CONFIG, COGVIDEOX_T2V_FULL_FINETUNE_CONFIG -from .hunyuan_video import HUNYUAN_VIDEO_T2V_LORA_CONFIG, HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG -from .ltx_video import LTX_VIDEO_T2V_LORA_CONFIG, LTX_VIDEO_T2V_FULL_FINETUNE_CONFIG +from .cogvideox import COGVIDEOX_T2V_FULL_FINETUNE_CONFIG, COGVIDEOX_T2V_LORA_CONFIG +from .hunyuan_video import HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG, HUNYUAN_VIDEO_T2V_LORA_CONFIG +from .ltx_video import LTX_VIDEO_T2V_FULL_FINETUNE_CONFIG, LTX_VIDEO_T2V_LORA_CONFIG + SUPPORTED_MODEL_CONFIGS = { "hunyuan_video": { From acffc2d9fcefd23244cd6e897b9c7f8db04c093d Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Jan 2025 12:54:57 +0530 Subject: [PATCH 07/15] parse subfolder when needed. 
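
Allow the transformer subfolder to be overridden through kwargs instead of hard-coding
"transformer". This matters for full finetuning, where the trained transformer weights may
live at a different location than the base repository layout. A minimal sketch of the intended
call, assuming a hypothetical checkpoint path (the path is illustrative, not part of this change):

    from finetrainers.ltx_video.ltx_video_lora import load_diffusion_models

    # Hypothetical directory whose transformer weights sit at its root rather
    # than under a "transformer/" subfolder.
    components = load_diffusion_models(model_id="/path/to/checkpoint", subfolder=None)
    transformer = components["transformer"]
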
--- finetrainers/cogvideox/cogvideox_lora.py | 3 ++- finetrainers/hunyuan_video/hunyuan_video_lora.py | 3 ++- finetrainers/ltx_video/ltx_video_lora.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/finetrainers/cogvideox/cogvideox_lora.py b/finetrainers/cogvideox/cogvideox_lora.py index c3b754a..9d3a77c 100644 --- a/finetrainers/cogvideox/cogvideox_lora.py +++ b/finetrainers/cogvideox/cogvideox_lora.py @@ -42,8 +42,9 @@ def load_diffusion_models( cache_dir: Optional[str] = None, **kwargs, ): + subfolder = kwargs.get("subfolder", None) transformer = CogVideoXTransformer3DModel.from_pretrained( - model_id, subfolder="transformer", torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir + model_id, subfolder=subfolder, torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir ) scheduler = CogVideoXDDIMScheduler.from_pretrained(model_id, subfolder="scheduler") return {"transformer": transformer, "scheduler": scheduler} diff --git a/finetrainers/hunyuan_video/hunyuan_video_lora.py b/finetrainers/hunyuan_video/hunyuan_video_lora.py index 9bfea53..450101d 100644 --- a/finetrainers/hunyuan_video/hunyuan_video_lora.py +++ b/finetrainers/hunyuan_video/hunyuan_video_lora.py @@ -63,8 +63,9 @@ def load_diffusion_models( cache_dir: Optional[str] = None, **kwargs, ) -> Dict[str, Union[nn.Module, FlowMatchEulerDiscreteScheduler]]: + subfolder = kwargs.get("subfolder", None) transformer = HunyuanVideoTransformer3DModel.from_pretrained( - model_id, subfolder="transformer", torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir + model_id, subfolder=subfolder, torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir ) scheduler = FlowMatchEulerDiscreteScheduler(shift=shift) return {"transformer": transformer, "scheduler": scheduler} diff --git a/finetrainers/ltx_video/ltx_video_lora.py b/finetrainers/ltx_video/ltx_video_lora.py index 0e1af9b..7a80543 100644 --- a/finetrainers/ltx_video/ltx_video_lora.py +++ b/finetrainers/ltx_video/ltx_video_lora.py @@ -45,8 +45,9 @@ def load_diffusion_models( cache_dir: Optional[str] = None, **kwargs, ) -> Dict[str, nn.Module]: + subfolder = kwargs.get("subfolder", None) transformer = LTXVideoTransformer3DModel.from_pretrained( - model_id, subfolder="transformer", torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir + model_id, subfolder=subfolder, torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir ) scheduler = FlowMatchEulerDiscreteScheduler() return {"transformer": transformer, "scheduler": scheduler} From 8188f8a11e631ddf59335a76d714df76bf0b982a Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Jan 2025 13:03:14 +0530 Subject: [PATCH 08/15] tackle saving and loading hooks. 
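
Branch the accelerate save/load hooks on the training type: LoRA runs keep going through the
pipeline's save_lora_weights / lora_state_dict utilities, while full-finetune runs serialize the
whole transformer under a "transformer/" subfolder of the checkpoint directory and reload it
with from_pretrained. A rough sketch of reloading such an intermediate checkpoint outside the
trainer (the checkpoint path and the concrete model class are illustrative, not part of this change):

    from diffusers import LTXVideoTransformer3DModel

    # Illustrative checkpoint path written by the save hook during a full finetune.
    transformer = LTXVideoTransformer3DModel.from_pretrained(
        "/path/to/output_dir/checkpoint-1000/transformer"
    )
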
--- finetrainers/trainer.py | 94 +++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 40 deletions(-) diff --git a/finetrainers/trainer.py b/finetrainers/trainer.py index 81c8264..125e70d 100644 --- a/finetrainers/trainer.py +++ b/finetrainers/trainer.py @@ -406,13 +406,16 @@ def prepare_trainable_parameters(self) -> None: if self.args.gradient_checkpointing: self.transformer.enable_gradient_checkpointing() - transformer_lora_config = LoraConfig( - r=self.args.rank, - lora_alpha=self.args.lora_alpha, - init_lora_weights=True, - target_modules=self.args.target_modules, - ) - self.transformer.add_adapter(transformer_lora_config) + if self.args.training_type == "lora": + transformer_lora_config = LoraConfig( + r=self.args.rank, + lora_alpha=self.args.lora_alpha, + init_lora_weights=True, + target_modules=self.args.target_modules, + ) + self.transformer.add_adapter(transformer_lora_config) + else: + transformer_lora_config = None # Enable TF32 for faster training on Ampere GPUs: https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices if self.args.allow_tf32 and torch.cuda.is_available(): @@ -432,7 +435,8 @@ def save_model_hook(models, weights, output_dir): type(unwrap_model(self.state.accelerator, self.transformer)), ): model = unwrap_model(self.state.accelerator, model) - transformer_lora_layers_to_save = get_peft_model_state_dict(model) + if self.args.training_type == "lora": + transformer_lora_layers_to_save = get_peft_model_state_dict(model) else: raise ValueError(f"Unexpected save model: {model.__class__}") @@ -440,10 +444,14 @@ def save_model_hook(models, weights, output_dir): if weights: weights.pop() - self.model_config["pipeline_cls"].save_lora_weights( - output_dir, - transformer_lora_layers=transformer_lora_layers_to_save, - ) + # TODO: refactor later if needed. But for now, this is just a few LoC. + if self.args.training_type == "lora": + self.model_config["pipeline_cls"].save_lora_weights( + output_dir, + transformer_lora_layers=transformer_lora_layers_to_save, + ) + else: + model.save_pretrained(os.path.join(output_dir, "transformer")) def load_model_hook(models, input_dir): if not self.state.accelerator.distributed_type == DistributedType.DEEPSPEED: @@ -459,33 +467,39 @@ def load_model_hook(models, input_dir): f"Unexpected save model: {unwrap_model(self.state.accelerator, model).__class__}" ) else: - transformer_ = unwrap_model(self.state.accelerator, self.transformer).__class__.from_pretrained( - self.args.pretrained_model_name_or_path, subfolder="transformer" - ) - transformer_.add_adapter(transformer_lora_config) + transformer_cls_ = unwrap_model(self.state.accelerator, self.transformer).__class__ - lora_state_dict = self.model_config["pipeline_cls"].lora_state_dict(input_dir) - transformer_state_dict = { - f'{k.replace("transformer.", "")}': v - for k, v in lora_state_dict.items() - if k.startswith("transformer.") - } - incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default") - if incompatible_keys is not None: - # check only for unexpected keys - unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) - if unexpected_keys: - logger.warning( - f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " - f" {unexpected_keys}. 
" + if self.args.training_type == "lora": + transformer_ = transformer_cls_.__class__.from_pretrained( + self.args.pretrained_model_name_or_path, subfolder="transformer" + ) + transformer_.add_adapter(transformer_lora_config) + lora_state_dict = self.model_config["pipeline_cls"].lora_state_dict(input_dir) + transformer_state_dict = { + f'{k.replace("transformer.", "")}': v + for k, v in lora_state_dict.items() + if k.startswith("transformer.") + } + incompatible_keys = set_peft_model_state_dict( + transformer_, transformer_state_dict, adapter_name="default" ) + if incompatible_keys is not None: + # check only for unexpected keys + unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) + if unexpected_keys: + logger.warning( + f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " + f" {unexpected_keys}. " + ) - # Make sure the trainable params are in float32. This is again needed since the base models - # are in `weight_dtype`. More details: - # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804 - if self.args.mixed_precision == "fp16": - # only upcast trainable parameters (LoRA) into fp32 - cast_training_params([transformer_]) + # Make sure the trainable params are in float32. This is again needed since the base models + # are in `weight_dtype`. More details: + # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804 + if self.args.mixed_precision == "fp16": + # only upcast trainable parameters (LoRA) into fp32 + cast_training_params([transformer_]) + else: + transformer_ = transformer_cls_.from_pretrained(os.path.join(input_dir, "transformer")) self.state.accelerator.register_save_state_pre_hook(save_model_hook) self.state.accelerator.register_load_state_pre_hook(load_model_hook) @@ -497,7 +511,7 @@ def prepare_optimizer(self) -> None: self.state.train_steps = self.args.train_steps # Make sure the trainable params are in float32 - if self.args.mixed_precision == "fp16": + if self.args.mixed_precision == "fp16" and self.args.training_type == "lora": # only upcast trainable parameters (LoRA) into fp32 cast_training_params([self.transformer], dtype=torch.float32) @@ -510,13 +524,13 @@ def prepare_optimizer(self) -> None: * self.state.accelerator.num_processes ) - transformer_lora_parameters = list(filter(lambda p: p.requires_grad, self.transformer.parameters())) + transformer_trainable_parameters = list(filter(lambda p: p.requires_grad, self.transformer.parameters())) transformer_parameters_with_lr = { - "params": transformer_lora_parameters, + "params": transformer_trainable_parameters, "lr": self.state.learning_rate, } params_to_optimize = [transformer_parameters_with_lr] - self.state.num_trainable_parameters = sum(p.numel() for p in transformer_lora_parameters) + self.state.num_trainable_parameters = sum(p.numel() for p in transformer_trainable_parameters) use_deepspeed_opt = ( self.state.accelerator.state.deepspeed_plugin is not None From 5183405e61dfa2cdf1d56abbd46cd1d588b672f7 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Jan 2025 13:17:15 +0530 Subject: [PATCH 09/15] tackle validation. 
--- finetrainers/trainer.py | 73 +++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/finetrainers/trainer.py b/finetrainers/trainer.py index 125e70d..502d2b4 100644 --- a/finetrainers/trainer.py +++ b/finetrainers/trainer.py @@ -21,6 +21,7 @@ gather_object, set_seed, ) +from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution from diffusers.optimization import get_scheduler @@ -916,35 +917,7 @@ def validate(self, step: int, final_validation: bool = False) -> None: memory_statistics = get_memory_statistics() logger.info(f"Memory before validation start: {json.dumps(memory_statistics, indent=4)}") - if not final_validation: - pipeline = self.model_config["initialize_pipeline"]( - model_id=self.args.pretrained_model_name_or_path, - tokenizer=self.tokenizer, - text_encoder=self.text_encoder, - tokenizer_2=self.tokenizer_2, - text_encoder_2=self.text_encoder_2, - transformer=unwrap_model(accelerator, self.transformer), - vae=self.vae, - device=accelerator.device, - revision=self.args.revision, - cache_dir=self.args.cache_dir, - enable_slicing=self.args.enable_slicing, - enable_tiling=self.args.enable_tiling, - enable_model_cpu_offload=self.args.enable_model_cpu_offload, - ) - else: - # `torch_dtype` is manually set within `initialize_pipeline()`. - self._delete_components() - pipeline = self.model_config["initialize_pipeline"]( - model_id=self.args.pretrained_model_name_or_path, - device=accelerator.device, - revision=self.args.revision, - cache_dir=self.args.cache_dir, - enable_slicing=self.args.enable_slicing, - enable_tiling=self.args.enable_tiling, - enable_model_cpu_offload=self.args.enable_model_cpu_offload, - ) - pipeline.load_lora_weights(self.args.output_dir) + pipeline = self._get_and_prepare_pipeline_for_validation(final_validation=final_validation) all_processes_artifacts = [] prompts_to_filenames = {} @@ -1150,3 +1123,45 @@ def _get_training_dtype(self, accelerator) -> torch.dtype: elif self.state.accelerator.mixed_precision == "bf16": weight_dtype = torch.bfloat16 return weight_dtype + + def _get_and_prepare_pipeline_for_validation(self, final_validation: bool = False) -> DiffusionPipeline: + accelerator = self.state.accelerator + if not final_validation: + pipeline = self.model_config["initialize_pipeline"]( + model_id=self.args.pretrained_model_name_or_path, + tokenizer=self.tokenizer, + text_encoder=self.text_encoder, + tokenizer_2=self.tokenizer_2, + text_encoder_2=self.text_encoder_2, + transformer=unwrap_model(accelerator, self.transformer), + vae=self.vae, + device=accelerator.device, + revision=self.args.revision, + cache_dir=self.args.cache_dir, + enable_slicing=self.args.enable_slicing, + enable_tiling=self.args.enable_tiling, + enable_model_cpu_offload=self.args.enable_model_cpu_offload, + ) + else: + # `torch_dtype` is manually set within `initialize_pipeline()`. 
+ self._delete_components() + if self.args.training_type == "lora": + transformer = None + else: + transformer = self.model_config["load_diffusion_models"]( + model_id=self.args.output_dir, subfolder=None + )["transformer"] + pipeline = self.model_config["initialize_pipeline"]( + model_id=self.args.pretrained_model_name_or_path, + transformer=transformer, + device=accelerator.device, + revision=self.args.revision, + cache_dir=self.args.cache_dir, + enable_slicing=self.args.enable_slicing, + enable_tiling=self.args.enable_tiling, + enable_model_cpu_offload=self.args.enable_model_cpu_offload, + ) + if self.args.training_type == "lora": + pipeline.load_lora_weights(self.args.output_dir) + + return pipeline From 162e6cd50dc745656d4685b3159c7446f4fa7dc5 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 9 Jan 2025 13:25:01 +0530 Subject: [PATCH 10/15] fix subfolder bug. --- finetrainers/cogvideox/cogvideox_lora.py | 2 +- finetrainers/hunyuan_video/hunyuan_video_lora.py | 2 +- finetrainers/ltx_video/ltx_video_lora.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/finetrainers/cogvideox/cogvideox_lora.py b/finetrainers/cogvideox/cogvideox_lora.py index 9d3a77c..c1413ae 100644 --- a/finetrainers/cogvideox/cogvideox_lora.py +++ b/finetrainers/cogvideox/cogvideox_lora.py @@ -42,7 +42,7 @@ def load_diffusion_models( cache_dir: Optional[str] = None, **kwargs, ): - subfolder = kwargs.get("subfolder", None) + subfolder = kwargs.get("subfolder", "transformer") transformer = CogVideoXTransformer3DModel.from_pretrained( model_id, subfolder=subfolder, torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir ) diff --git a/finetrainers/hunyuan_video/hunyuan_video_lora.py b/finetrainers/hunyuan_video/hunyuan_video_lora.py index 450101d..c23feb8 100644 --- a/finetrainers/hunyuan_video/hunyuan_video_lora.py +++ b/finetrainers/hunyuan_video/hunyuan_video_lora.py @@ -63,7 +63,7 @@ def load_diffusion_models( cache_dir: Optional[str] = None, **kwargs, ) -> Dict[str, Union[nn.Module, FlowMatchEulerDiscreteScheduler]]: - subfolder = kwargs.get("subfolder", None) + subfolder = kwargs.get("subfolder", "transformer") transformer = HunyuanVideoTransformer3DModel.from_pretrained( model_id, subfolder=subfolder, torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir ) diff --git a/finetrainers/ltx_video/ltx_video_lora.py b/finetrainers/ltx_video/ltx_video_lora.py index 7a80543..a6e8222 100644 --- a/finetrainers/ltx_video/ltx_video_lora.py +++ b/finetrainers/ltx_video/ltx_video_lora.py @@ -45,7 +45,7 @@ def load_diffusion_models( cache_dir: Optional[str] = None, **kwargs, ) -> Dict[str, nn.Module]: - subfolder = kwargs.get("subfolder", None) + subfolder = kwargs.get("subfolder", "transformer") transformer = LTXVideoTransformer3DModel.from_pretrained( model_id, subfolder=subfolder, torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir ) From c0f38892f88158d77fdeb2fc32d5eb3742a71473 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Fri, 10 Jan 2025 07:44:36 +0530 Subject: [PATCH 11/15] remove __class__. 
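
`transformer_cls_` already holds the model class, so `transformer_cls_.__class__` resolves to the
metaclass (`type`), which has no `from_pretrained`. A minimal illustration of the Python semantics
(names are made up):

    class Dummy:
        @classmethod
        def from_pretrained(cls, path):
            return cls()

    dummy_cls = Dummy                     # analogous to `transformer_cls_`
    assert dummy_cls.__class__ is type    # the metaclass, not the model class
    assert not hasattr(dummy_cls.__class__, "from_pretrained")
    assert hasattr(dummy_cls, "from_pretrained")
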
--- finetrainers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finetrainers/trainer.py b/finetrainers/trainer.py index 922186d..e413fe8 100644 --- a/finetrainers/trainer.py +++ b/finetrainers/trainer.py @@ -471,7 +471,7 @@ def load_model_hook(models, input_dir): transformer_cls_ = unwrap_model(self.state.accelerator, self.transformer).__class__ if self.args.training_type == "lora": - transformer_ = transformer_cls_.__class__.from_pretrained( + transformer_ = transformer_cls_.from_pretrained( self.args.pretrained_model_name_or_path, subfolder="transformer" ) transformer_.add_adapter(transformer_lora_config) From d6821c3da8c39bb36690098e9d742c5bc8410e77 Mon Sep 17 00:00:00 2001 From: Aryan Date: Sat, 11 Jan 2025 00:54:03 +0100 Subject: [PATCH 12/15] refactor --- finetrainers/args.py | 13 +++++++- finetrainers/cogvideox/__init__.py | 3 +- finetrainers/cogvideox/cogvideox_lora.py | 16 +--------- finetrainers/cogvideox/full_finetune.py | 32 +++++++++++++++++++ finetrainers/hunyuan_video/__init__.py | 3 +- finetrainers/hunyuan_video/full_finetune.py | 30 +++++++++++++++++ .../hunyuan_video/hunyuan_video_lora.py | 15 +-------- finetrainers/ltx_video/__init__.py | 3 +- finetrainers/ltx_video/full_finetune.py | 30 +++++++++++++++++ finetrainers/ltx_video/ltx_video_lora.py | 14 -------- 10 files changed, 112 insertions(+), 47 deletions(-) create mode 100644 finetrainers/cogvideox/full_finetune.py create mode 100644 finetrainers/hunyuan_video/full_finetune.py create mode 100644 finetrainers/ltx_video/full_finetune.py diff --git a/finetrainers/args.py b/finetrainers/args.py index bcd0076..137eab6 100644 --- a/finetrainers/args.py +++ b/finetrainers/args.py @@ -236,6 +236,7 @@ def parse_arguments() -> Args: def validate_args(args: Args): + _validate_training_args(args) _validate_validation_args(args) @@ -455,7 +456,8 @@ def _add_training_arguments(parser: argparse.ArgumentParser) -> None: "--training_type", type=str, default=None, - help="Type of training to perform. Choose between ['lora','sft']", + choices=["lora", "full-finetune"], + help="Type of training to perform. 
Choose between ['lora', 'full-finetune']", ) parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") parser.add_argument( @@ -874,6 +876,15 @@ def _map_to_args_type(args: Dict[str, Any]) -> Args: return result_args +def _validate_training_args(args: Args): + if args.training_type == "lora": + assert args.rank is not None, "Rank is required for LoRA training" + assert args.lora_alpha is not None, "LoRA alpha is required for LoRA training" + assert ( + args.target_modules is not None and len(args.target_modules) > 0 + ), "Target modules are required for LoRA training" + + def _validate_validation_args(args: Args): assert args.validation_prompts is not None, "Validation prompts are required for validation" if args.validation_images is not None: diff --git a/finetrainers/cogvideox/__init__.py b/finetrainers/cogvideox/__init__.py index 8ca65b7..390479b 100644 --- a/finetrainers/cogvideox/__init__.py +++ b/finetrainers/cogvideox/__init__.py @@ -1 +1,2 @@ -from .cogvideox_lora import COGVIDEOX_T2V_FULL_FINETUNE_CONFIG, COGVIDEOX_T2V_LORA_CONFIG +from .cogvideox_lora import COGVIDEOX_T2V_LORA_CONFIG +from .full_finetune import COGVIDEOX_T2V_FULL_FINETUNE_CONFIG diff --git a/finetrainers/cogvideox/cogvideox_lora.py b/finetrainers/cogvideox/cogvideox_lora.py index cc213f8..36776d0 100644 --- a/finetrainers/cogvideox/cogvideox_lora.py +++ b/finetrainers/cogvideox/cogvideox_lora.py @@ -312,6 +312,7 @@ def _pad_frames(latents: torch.Tensor, patch_size_t: int): return latents +# TODO(aryan): refactor into model specs for better re-use COGVIDEOX_T2V_LORA_CONFIG = { "pipeline_cls": CogVideoXPipeline, "load_condition_models": load_condition_models, @@ -326,18 +327,3 @@ def _pad_frames(latents: torch.Tensor, patch_size_t: int): "forward_pass": forward_pass, "validation": validation, } - -COGVIDEOX_T2V_FULL_FINETUNE_CONFIG = { - "pipeline_cls": CogVideoXPipeline, - "load_condition_models": load_condition_models, - "load_latent_models": load_latent_models, - "load_diffusion_models": load_diffusion_models, - "initialize_pipeline": initialize_pipeline, - "prepare_conditions": prepare_conditions, - "prepare_latents": prepare_latents, - "post_latent_preparation": post_latent_preparation, - "collate_fn": collate_fn_t2v, - "calculate_noisy_latents": calculate_noisy_latents, - "forward_pass": forward_pass, - "validation": validation, -} diff --git a/finetrainers/cogvideox/full_finetune.py b/finetrainers/cogvideox/full_finetune.py new file mode 100644 index 0000000..f755981 --- /dev/null +++ b/finetrainers/cogvideox/full_finetune.py @@ -0,0 +1,32 @@ +from diffusers import CogVideoXPipeline + +from .cogvideox_lora import ( + calculate_noisy_latents, + collate_fn_t2v, + forward_pass, + initialize_pipeline, + load_condition_models, + load_diffusion_models, + load_latent_models, + post_latent_preparation, + prepare_conditions, + prepare_latents, + validation, +) + + +# TODO(aryan): refactor into model specs for better re-use +COGVIDEOX_T2V_FULL_FINETUNE_CONFIG = { + "pipeline_cls": CogVideoXPipeline, + "load_condition_models": load_condition_models, + "load_latent_models": load_latent_models, + "load_diffusion_models": load_diffusion_models, + "initialize_pipeline": initialize_pipeline, + "prepare_conditions": prepare_conditions, + "prepare_latents": prepare_latents, + "post_latent_preparation": post_latent_preparation, + "collate_fn": collate_fn_t2v, + "calculate_noisy_latents": calculate_noisy_latents, + "forward_pass": forward_pass, + "validation": validation, +} diff --git 
a/finetrainers/hunyuan_video/__init__.py b/finetrainers/hunyuan_video/__init__.py index 1800b2e..e1fdafa 100644 --- a/finetrainers/hunyuan_video/__init__.py +++ b/finetrainers/hunyuan_video/__init__.py @@ -1 +1,2 @@ -from .hunyuan_video_lora import HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG, HUNYUAN_VIDEO_T2V_LORA_CONFIG +from .full_finetune import HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG +from .hunyuan_video_lora import HUNYUAN_VIDEO_T2V_LORA_CONFIG diff --git a/finetrainers/hunyuan_video/full_finetune.py b/finetrainers/hunyuan_video/full_finetune.py new file mode 100644 index 0000000..36dd5cb --- /dev/null +++ b/finetrainers/hunyuan_video/full_finetune.py @@ -0,0 +1,30 @@ +from diffusers import HunyuanVideoPipeline + +from .hunyuan_video_lora import ( + collate_fn_t2v, + forward_pass, + initialize_pipeline, + load_condition_models, + load_diffusion_models, + load_latent_models, + post_latent_preparation, + prepare_conditions, + prepare_latents, + validation, +) + + +# TODO(aryan): refactor into model specs for better re-use +HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG = { + "pipeline_cls": HunyuanVideoPipeline, + "load_condition_models": load_condition_models, + "load_latent_models": load_latent_models, + "load_diffusion_models": load_diffusion_models, + "initialize_pipeline": initialize_pipeline, + "prepare_conditions": prepare_conditions, + "prepare_latents": prepare_latents, + "post_latent_preparation": post_latent_preparation, + "collate_fn": collate_fn_t2v, + "forward_pass": forward_pass, + "validation": validation, +} diff --git a/finetrainers/hunyuan_video/hunyuan_video_lora.py b/finetrainers/hunyuan_video/hunyuan_video_lora.py index 7d4fb5e..071451c 100644 --- a/finetrainers/hunyuan_video/hunyuan_video_lora.py +++ b/finetrainers/hunyuan_video/hunyuan_video_lora.py @@ -346,6 +346,7 @@ def _get_clip_prompt_embeds( return {"pooled_prompt_embeds": prompt_embeds} +# TODO(aryan): refactor into model specs for better re-use HUNYUAN_VIDEO_T2V_LORA_CONFIG = { "pipeline_cls": HunyuanVideoPipeline, "load_condition_models": load_condition_models, @@ -359,17 +360,3 @@ def _get_clip_prompt_embeds( "forward_pass": forward_pass, "validation": validation, } - -HUNYUAN_VIDEO_T2V_FULL_FINETUNE_CONFIG = { - "pipeline_cls": HunyuanVideoPipeline, - "load_condition_models": load_condition_models, - "load_latent_models": load_latent_models, - "load_diffusion_models": load_diffusion_models, - "initialize_pipeline": initialize_pipeline, - "prepare_conditions": prepare_conditions, - "prepare_latents": prepare_latents, - "post_latent_preparation": post_latent_preparation, - "collate_fn": collate_fn_t2v, - "forward_pass": forward_pass, - "validation": validation, -} diff --git a/finetrainers/ltx_video/__init__.py b/finetrainers/ltx_video/__init__.py index 0b33512..6d5d0f9 100644 --- a/finetrainers/ltx_video/__init__.py +++ b/finetrainers/ltx_video/__init__.py @@ -1 +1,2 @@ -from .ltx_video_lora import LTX_VIDEO_T2V_FULL_FINETUNE_CONFIG, LTX_VIDEO_T2V_LORA_CONFIG +from .full_finetune import LTX_VIDEO_T2V_FULL_FINETUNE_CONFIG +from .ltx_video_lora import LTX_VIDEO_T2V_LORA_CONFIG diff --git a/finetrainers/ltx_video/full_finetune.py b/finetrainers/ltx_video/full_finetune.py new file mode 100644 index 0000000..9aa30ef --- /dev/null +++ b/finetrainers/ltx_video/full_finetune.py @@ -0,0 +1,30 @@ +from diffusers import LTXPipeline + +from .ltx_video_lora import ( + collate_fn_t2v, + forward_pass, + initialize_pipeline, + load_condition_models, + load_diffusion_models, + load_latent_models, + post_latent_preparation, + 
prepare_conditions, + prepare_latents, + validation, +) + + +# TODO(aryan): refactor into model specs for better re-use +LTX_VIDEO_T2V_FULL_FINETUNE_CONFIG = { + "pipeline_cls": LTXPipeline, + "load_condition_models": load_condition_models, + "load_latent_models": load_latent_models, + "load_diffusion_models": load_diffusion_models, + "initialize_pipeline": initialize_pipeline, + "prepare_conditions": prepare_conditions, + "prepare_latents": prepare_latents, + "post_latent_preparation": post_latent_preparation, + "collate_fn": collate_fn_t2v, + "forward_pass": forward_pass, + "validation": validation, +} diff --git a/finetrainers/ltx_video/ltx_video_lora.py b/finetrainers/ltx_video/ltx_video_lora.py index 64bb625..a6e8222 100644 --- a/finetrainers/ltx_video/ltx_video_lora.py +++ b/finetrainers/ltx_video/ltx_video_lora.py @@ -322,17 +322,3 @@ def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int "forward_pass": forward_pass, "validation": validation, } - -LTX_VIDEO_T2V_FULL_FINETUNE_CONFIG = { - "pipeline_cls": LTXPipeline, - "load_condition_models": load_condition_models, - "load_latent_models": load_latent_models, - "load_diffusion_models": load_diffusion_models, - "initialize_pipeline": initialize_pipeline, - "prepare_conditions": prepare_conditions, - "prepare_latents": prepare_latents, - "post_latent_preparation": post_latent_preparation, - "collate_fn": collate_fn_t2v, - "forward_pass": forward_pass, - "validation": validation, -} From 06dd96c0a3d0ed6972fde40e2321699635763e40 Mon Sep 17 00:00:00 2001 From: Aryan Date: Sat, 11 Jan 2025 01:11:27 +0100 Subject: [PATCH 13/15] remove unnecessary changes --- finetrainers/cogvideox/cogvideox_lora.py | 3 +- .../hunyuan_video/hunyuan_video_lora.py | 3 +- finetrainers/ltx_video/ltx_video_lora.py | 3 +- finetrainers/trainer.py | 53 +++++++++---------- 4 files changed, 27 insertions(+), 35 deletions(-) diff --git a/finetrainers/cogvideox/cogvideox_lora.py b/finetrainers/cogvideox/cogvideox_lora.py index 36776d0..7dca3d0 100644 --- a/finetrainers/cogvideox/cogvideox_lora.py +++ b/finetrainers/cogvideox/cogvideox_lora.py @@ -42,9 +42,8 @@ def load_diffusion_models( cache_dir: Optional[str] = None, **kwargs, ): - subfolder = kwargs.get("subfolder", "transformer") transformer = CogVideoXTransformer3DModel.from_pretrained( - model_id, subfolder=subfolder, torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir + model_id, subfolder="transformer", torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir ) scheduler = CogVideoXDDIMScheduler.from_pretrained(model_id, subfolder="scheduler") return {"transformer": transformer, "scheduler": scheduler} diff --git a/finetrainers/hunyuan_video/hunyuan_video_lora.py b/finetrainers/hunyuan_video/hunyuan_video_lora.py index 071451c..ed9013c 100644 --- a/finetrainers/hunyuan_video/hunyuan_video_lora.py +++ b/finetrainers/hunyuan_video/hunyuan_video_lora.py @@ -63,9 +63,8 @@ def load_diffusion_models( cache_dir: Optional[str] = None, **kwargs, ) -> Dict[str, Union[nn.Module, FlowMatchEulerDiscreteScheduler]]: - subfolder = kwargs.get("subfolder", "transformer") transformer = HunyuanVideoTransformer3DModel.from_pretrained( - model_id, subfolder=subfolder, torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir + model_id, subfolder="transformer", torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir ) scheduler = FlowMatchEulerDiscreteScheduler(shift=shift) return {"transformer": transformer, "scheduler": scheduler} diff --git 
a/finetrainers/ltx_video/ltx_video_lora.py b/finetrainers/ltx_video/ltx_video_lora.py index a6e8222..0e1af9b 100644 --- a/finetrainers/ltx_video/ltx_video_lora.py +++ b/finetrainers/ltx_video/ltx_video_lora.py @@ -45,9 +45,8 @@ def load_diffusion_models( cache_dir: Optional[str] = None, **kwargs, ) -> Dict[str, nn.Module]: - subfolder = kwargs.get("subfolder", "transformer") transformer = LTXVideoTransformer3DModel.from_pretrained( - model_id, subfolder=subfolder, torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir + model_id, subfolder="transformer", torch_dtype=transformer_dtype, revision=revision, cache_dir=cache_dir ) scheduler = FlowMatchEulerDiscreteScheduler() return {"transformer": transformer, "scheduler": scheduler} diff --git a/finetrainers/trainer.py b/finetrainers/trainer.py index c5e6508..6af4c2c 100644 --- a/finetrainers/trainer.py +++ b/finetrainers/trainer.py @@ -5,7 +5,7 @@ import random from datetime import datetime, timedelta from pathlib import Path -from typing import Any, Dict +from typing import Any, Dict, List import diffusers import torch @@ -207,16 +207,6 @@ def prepare_models(self) -> None: if self.args.enable_tiling: self.vae.enable_tiling() - def _disable_grad_for_components(self, components: list): - for component in components: - if component is not None: - component.requires_grad_(False) - - def _enable_grad_for_components(self, components: list): - for component in components: - if component is not None: - component.requires_grad_(True) - def prepare_precomputations(self) -> None: if not self.args.precompute_conditions: return @@ -391,20 +381,14 @@ def prepare_trainable_parameters(self) -> None: diffusion_components = self.model_config["load_diffusion_models"](**self._get_load_components_kwargs()) self._set_components(diffusion_components) - self._disable_grad_for_components( - components=[ - self.text_encoder, - self.text_encoder_2, - self.text_encoder_3, - self.vae, - ] - ) + components = [self.text_encoder, self.text_encoder_2, self.text_encoder_3, self.vae] + self._disable_grad_for_components(components) if self.args.training_type == "full-finetune": - logger.info("Full Fine Tuning Enabled") + logger.info("Finetuning transformer with no additional parameters.") self._enable_grad_for_components(components=[self.transformer]) else: - logger.info("Lora Fine Tuning Enabled") + logger.info("Finetuning transformer with low-rank peft parameters.") self._disable_grad_for_components(components=[self.transformer]) # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision @@ -412,7 +396,7 @@ def prepare_trainable_parameters(self) -> None: weight_dtype = self._get_training_dtype(accelerator=self.state.accelerator) if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16: - # due to pytorch#99272, MPS does not yet support bfloat16. + # Due to pytorch#99272, MPS does not yet support bfloat16. raise ValueError( "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead." ) @@ -1174,14 +1158,13 @@ def _get_and_prepare_pipeline_for_validation(self, final_validation: bool = Fals enable_model_cpu_offload=self.args.enable_model_cpu_offload, ) else: - # `torch_dtype` is manually set within `initialize_pipeline()`. 
self._delete_components() - if self.args.training_type == "lora": - transformer = None - else: - transformer = self.model_config["load_diffusion_models"]( - model_id=self.args.output_dir, subfolder=None - )["transformer"] + + # Load the transformer weights from the final checkpoint if performing full-finetune + transformer = None + if self.args.training_type == "full-finetune": + transformer = self.model_config["load_diffusion_models"](model_id=self.args.output_dir)["transformer"] + pipeline = self.model_config["initialize_pipeline"]( model_id=self.args.pretrained_model_name_or_path, transformer=transformer, @@ -1192,7 +1175,19 @@ def _get_and_prepare_pipeline_for_validation(self, final_validation: bool = Fals enable_tiling=self.args.enable_tiling, enable_model_cpu_offload=self.args.enable_model_cpu_offload, ) + + # Load the LoRA weights if performing LoRA finetuning if self.args.training_type == "lora": pipeline.load_lora_weights(self.args.output_dir) return pipeline + + def _disable_grad_for_components(self, components: List[torch.nn.Module]): + for component in components: + if component is not None: + component.requires_grad_(False) + + def _enable_grad_for_components(self, components: List[torch.nn.Module]): + for component in components: + if component is not None: + component.requires_grad_(True) From 1f304b36d4c4ccca785b77414ffe5aee0e615600 Mon Sep 17 00:00:00 2001 From: Aryan Date: Sat, 11 Jan 2025 01:28:47 +0100 Subject: [PATCH 14/15] handle saving of final model weights correctly --- finetrainers/trainer.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/finetrainers/trainer.py b/finetrainers/trainer.py index 6af4c2c..43d0dae 100644 --- a/finetrainers/trainer.py +++ b/finetrainers/trainer.py @@ -385,10 +385,10 @@ def prepare_trainable_parameters(self) -> None: self._disable_grad_for_components(components) if self.args.training_type == "full-finetune": - logger.info("Finetuning transformer with no additional parameters.") + logger.info("Finetuning transformer with no additional parameters") self._enable_grad_for_components(components=[self.transformer]) else: - logger.info("Finetuning transformer with low-rank peft parameters.") + logger.info("Finetuning transformer with PEFT parameters") self._disable_grad_for_components(components=[self.transformer]) # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision @@ -452,7 +452,6 @@ def save_model_hook(models, weights, output_dir): if weights: weights.pop() - # TODO: refactor later if needed. But for now, this is just a few LoC. if self.args.training_type == "lora": self.model_config["pipeline_cls"].save_lora_weights( output_dir, @@ -894,14 +893,17 @@ def train(self) -> None: accelerator.wait_for_everyone() if accelerator.is_main_process: - # TODO: consider factoring this out when supporting other types of training algos. 
- self.transformer = unwrap_model(accelerator, self.transformer) - transformer_lora_layers = get_peft_model_state_dict(self.transformer) + transformer = unwrap_model(accelerator, self.transformer) - self.model_config["pipeline_cls"].save_lora_weights( - save_directory=self.args.output_dir, - transformer_lora_layers=transformer_lora_layers, - ) + if self.args.training_type == "lora": + transformer_lora_layers = get_peft_model_state_dict(transformer) + + self.model_config["pipeline_cls"].save_lora_weights( + save_directory=self.args.output_dir, + transformer_lora_layers=transformer_lora_layers, + ) + else: + transformer.save_pretrained(os.path.join(self.args.output_dir, "transformer")) self.validate(step=global_step, final_validation=True) From ca957e543c16613d4ccd34c4f2792477ca9f313c Mon Sep 17 00:00:00 2001 From: Aryan Date: Sat, 11 Jan 2025 01:36:52 +0100 Subject: [PATCH 15/15] remove unnecessary changes --- finetrainers/trainer.py | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/finetrainers/trainer.py b/finetrainers/trainer.py index 43d0dae..d37eb81 100644 --- a/finetrainers/trainer.py +++ b/finetrainers/trainer.py @@ -100,9 +100,6 @@ def __init__(self, args: Args) -> None: self.state.model_name = self.args.model_name self.model_config = get_config_from_model_name(self.args.model_name, self.args.training_type) - # Components list - self.components = [] - def prepare_dataset(self) -> None: # TODO(aryan): Make a background process for fetching logger.info("Initializing dataset and dataloader") @@ -157,18 +154,6 @@ def _set_components(self, components: Dict[str, Any]) -> None: self.transformer_config = self.transformer.config if self.transformer is not None else self.transformer_config self.vae_config = self.vae.config if self.vae is not None else self.vae_config - self.components = [ - self.tokenizer, - self.tokenizer_2, - self.tokenizer_3, - self.text_encoder, - self.text_encoder_2, - self.text_encoder_3, - self.transformer, - self.unet, - self.vae, - ] - def _delete_components(self) -> None: self.tokenizer = None self.tokenizer_2 = None @@ -183,8 +168,6 @@ def _delete_components(self) -> None: free_memory() torch.cuda.synchronize(self.state.accelerator.device) - self.components = None - def prepare_models(self) -> None: logger.info("Initializing models") @@ -260,14 +243,8 @@ def collate_fn(batch): condition_components = self.model_config["load_condition_models"](**self._get_load_components_kwargs()) self._set_components(condition_components) self._move_components_to_device() + self._disable_grad_for_components([self.text_encoder, self.text_encoder_2, self.text_encoder_3]) - self._disable_grad_for_components( - components=[ - self.text_encoder, - self.text_encoder_2, - self.text_encoder_3, - ] - ) if self.args.caption_dropout_p > 0 and self.args.caption_dropout_technique == "empty": logger.warning( "Caption dropout is not supported with precomputation yet. This will be supported in the future." @@ -320,8 +297,8 @@ def collate_fn(batch): latent_components = self.model_config["load_latent_models"](**self._get_load_components_kwargs()) self._set_components(latent_components) self._move_components_to_device() + self._disable_grad_for_components([self.vae]) - self._disable_grad_for_components(components=[self.vae]) if self.vae is not None: if self.args.enable_slicing: self.vae.enable_slicing()