From c52e1f62c843c46e0c36e05359aa27777db3d9e2 Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Tue, 17 Sep 2024 19:01:53 +0000 Subject: [PATCH 01/12] Update versions --- setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index f909592a..014373dc 100644 --- a/setup.py +++ b/setup.py @@ -6,11 +6,11 @@ from setuptools import find_packages, setup install_requires = [ - 'mosaicml==0.20.1', 'mosaicml-streaming==0.7.4', 'hydra-core>=1.2', 'hydra-colorlog>=1.1.0', - 'diffusers[torch]==0.26.3', 'transformers[torch]==4.38.2', 'huggingface_hub==0.21.2', 'wandb==0.16.3', - 'xformers==0.0.23.post1', 'triton==2.1.0', 'torchmetrics[image]==1.3.1', 'lpips==0.1.4', 'clean-fid==0.1.35', - 'clip@git+https://github.com/openai/CLIP.git@a1d071733d7111c9c014f024669f959182114e33', 'gradio==4.19.2', - 'datasets==2.19.2', 'peft==0.12.0' + 'mosaicml==0.24.1', 'mosaicml-streaming==0.8.1', 'hydra-core>=1.2', 'hydra-colorlog>=1.1.0', + 'diffusers[torch]==0.30.3', 'transformers[torch]==4.44.2', 'huggingface_hub==0.25.0', 'wandb==0.18.1', + 'xformers==0.0.27post2', 'triton==2.1.0', 'torchmetrics[image]==1.4.2', 'lpips==0.1.4', 'clean-fid==0.1.35', + 'clip@git+https://github.com/openai/CLIP.git@a1d071733d7111c9c014f024669f959182114e33', 'gradio==4.44.0', + 'datasets==2.19.2', 'peft==0.12.0', 'sentencepeice', ] extras_require = {} From 15e4a4d2cff5ebaa8c7c7f0fcf634f100195c16f Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Wed, 18 Sep 2024 17:17:36 +0000 Subject: [PATCH 02/12] Include mlflow --- setup.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 014373dc..ffed638a 100644 --- a/setup.py +++ b/setup.py @@ -6,11 +6,26 @@ from setuptools import find_packages, setup install_requires = [ - 'mosaicml==0.24.1', 'mosaicml-streaming==0.8.1', 'hydra-core>=1.2', 'hydra-colorlog>=1.1.0', - 'diffusers[torch]==0.30.3', 'transformers[torch]==4.44.2', 'huggingface_hub==0.25.0', 'wandb==0.18.1', - 'xformers==0.0.27post2', 'triton==2.1.0', 'torchmetrics[image]==1.4.2', 'lpips==0.1.4', 'clean-fid==0.1.35', - 'clip@git+https://github.com/openai/CLIP.git@a1d071733d7111c9c014f024669f959182114e33', 'gradio==4.44.0', - 'datasets==2.19.2', 'peft==0.12.0', 'sentencepeice', + 'mosaicml==0.24.1', + 'mosaicml-streaming==0.8.1', + 'hydra-core>=1.2', + 'hydra-colorlog>=1.1.0', + 'diffusers[torch]==0.30.3', + 'transformers[torch]==4.44.2', + 'huggingface-hub[hf_transfer]>=0.23.2', + 'wandb>=0.18.1', + 'xformers==0.0.27post2', + 'triton==2.1.0', + 'torchmetrics[image]==1.4.2', + 'lpips==0.1.4', + 'clean-fid==0.1.35', + 'clip@git+https://github.com/openai/CLIP.git@a1d071733d7111c9c014f024669f959182114e33', + 'gradio==4.44.0', + 'datasets==2.19.2', + 'peft==0.12.0', + 'sentencepeice', + 'mlflow', + 'pynvml', ] extras_require = {} From a6ced94ff2656a3f0d864ff32e2a2e5fa32da8b0 Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Wed, 18 Sep 2024 17:23:13 +0000 Subject: [PATCH 03/12] Spelling is important --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ffed638a..8ba95ce9 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ 'gradio==4.44.0', 'datasets==2.19.2', 'peft==0.12.0', - 'sentencepeice', + 'sentencepiece', 'mlflow', 'pynvml', ] From fd7cffda94dd3f7e4d49b2380c0f7c6c3f38115b Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Wed, 18 Sep 2024 17:30:31 +0000 Subject: [PATCH 04/12] Torchmetrics :( --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8ba95ce9..d0b12d51 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ 'wandb>=0.18.1', 'xformers==0.0.27post2', 'triton==2.1.0', - 'torchmetrics[image]==1.4.2', + 'torchmetrics[image]>=1.4.0.post0', 'lpips==0.1.4', 'clean-fid==0.1.35', 'clip@git+https://github.com/openai/CLIP.git@a1d071733d7111c9c014f024669f959182114e33', From 75bfd308ae491f5695a5e61dcfe0ea516ee51270 Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Wed, 18 Sep 2024 17:36:02 +0000 Subject: [PATCH 05/12] Triton --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d0b12d51..4c85f8ad 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ 'huggingface-hub[hf_transfer]>=0.23.2', 'wandb>=0.18.1', 'xformers==0.0.27post2', - 'triton==2.1.0', + 'triton>=2.1.0', 'torchmetrics[image]>=1.4.0.post0', 'lpips==0.1.4', 'clean-fid==0.1.35', From cc90fff6e22e6c0846ca2540efd64075bc8629db Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Thu, 3 Oct 2024 22:17:20 +0000 Subject: [PATCH 06/12] New xformers --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4c85f8ad..18be958d 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ 'transformers[torch]==4.44.2', 'huggingface-hub[hf_transfer]>=0.23.2', 'wandb>=0.18.1', - 'xformers==0.0.27post2', + 'xformers==0.0.28.post1', 'triton>=2.1.0', 'torchmetrics[image]>=1.4.0.post0', 'lpips==0.1.4', From 6cb12eed5eee9f3dd6055e70e237460c78aaccb6 Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Thu, 3 Oct 2024 22:26:25 +0000 Subject: [PATCH 07/12] Automatically add per-device batch size as a streaming kwarg --- diffusion/train.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/diffusion/train.py b/diffusion/train.py index becff0f1..c8674cbe 100644 --- a/diffusion/train.py +++ b/diffusion/train.py @@ -103,19 +103,22 @@ def train(config: DictConfig) -> None: else: optimizer = hydra.utils.instantiate(config.optimizer, params=model.parameters()) - # Load train dataset. Currently this expects to load according to the datasetHparam method. - # This means adding external datasets is currently not super easy. Will refactor or check for - # upstream composer changes that could make this easier. + # Load train dataset. Need to ensure that the per-device batch size is added as a streaming kwarg + per_device_train_batch_size = config.dataset.train_batch_size // dist.get_world_size() + if hasattr(config.dataset.train_dataset, 'streaming_kwargs'): + config.dataset.train_dataset.streaming_kwargs['batch_size'] = per_device_train_batch_size + else: + config.dataset.train_dataset.streaming_kwargs = {'batch_size': per_device_train_batch_size} if tokenizer: train_dataloader: Union[Iterable, DataSpec, Dict[str, Any]] = hydra.utils.instantiate( config.dataset.train_dataset, tokenizer=tokenizer, - batch_size=config.dataset.train_batch_size // dist.get_world_size(), + batch_size=per_device_train_batch_size, ) else: train_dataloader: Union[Iterable, DataSpec, Dict[str, Any]] = hydra.utils.instantiate( config.dataset.train_dataset, - batch_size=config.dataset.train_batch_size // dist.get_world_size(), + batch_size=per_device_train_batch_size, ) # Need to sleep for a bit to avoid dataloader crash time.sleep(10) @@ -148,13 +151,18 @@ def train(config: DictConfig) -> None: eval_set = evaluators else: + # Need to ensure that the per-device batch size is added as a streaming kwarg + per_device_eval_batch_size = config.dataset.eval_batch_size // dist.get_world_size() + if hasattr(config.dataset.eval_dataset, 'streaming_kwargs'): + config.dataset.eval_dataset.streaming_kwargs['batch_size'] = per_device_eval_batch_size + else: + config.dataset.eval_dataset.streaming_kwargs = {'batch_size': per_device_eval_batch_size} if tokenizer: eval_set = hydra.utils.instantiate(config.dataset.eval_dataset, tokenizer=model.tokenizer, - batch_size=config.dataset.eval_batch_size // dist.get_world_size()) + batch_size=per_device_eval_batch_size) else: - eval_set = hydra.utils.instantiate(config.dataset.eval_dataset, - batch_size=config.dataset.eval_batch_size // dist.get_world_size()) + eval_set = hydra.utils.instantiate(config.dataset.eval_dataset, batch_size=per_device_eval_batch_size) # Need to sleep for a bit to avoid dataloader crash time.sleep(10) From 755e5cccdb59a77e6ce05c0bb6f7d6a05d0c8006 Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Fri, 4 Oct 2024 00:23:12 +0000 Subject: [PATCH 08/12] Bump composer --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 18be958d..13a17da9 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup install_requires = [ - 'mosaicml==0.24.1', + 'mosaicml==0.25.0', 'mosaicml-streaming==0.8.1', 'hydra-core>=1.2', 'hydra-colorlog>=1.1.0', From 192dc176f76e45a95acb2ef3c38330d264437879 Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Fri, 4 Oct 2024 04:38:42 +0000 Subject: [PATCH 09/12] Update streaming --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 13a17da9..3f9f42a5 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ install_requires = [ 'mosaicml==0.25.0', - 'mosaicml-streaming==0.8.1', + 'mosaicml-streaming==0.9.0', 'hydra-core>=1.2', 'hydra-colorlog>=1.1.0', 'diffusers[torch]==0.30.3', From a004a9d133492c27fb4671b823a49987d8479e91 Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Fri, 4 Oct 2024 05:51:11 +0000 Subject: [PATCH 10/12] Fix huggingface warning --- diffusion/models/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diffusion/models/models.py b/diffusion/models/models.py index 33b02f8d..82c1480f 100644 --- a/diffusion/models/models.py +++ b/diffusion/models/models.py @@ -125,7 +125,7 @@ def stable_diffusion_2( precision = torch.float16 if encode_latents_in_fp16 else None # Make the text encoder text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder='text_encoder', torch_dtype=precision) - tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder='tokenizer') + tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder='tokenizer', clean_up_tokenization_spaces=True) # Make the autoencoder if autoencoder_path is None: From 1e2c20ca3e6601ee2a64d8498e8ed957dfff493e Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Fri, 4 Oct 2024 06:05:09 +0000 Subject: [PATCH 11/12] Update to new torch autocast --- diffusion/models/precomputed_text_latent_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diffusion/models/precomputed_text_latent_diffusion.py b/diffusion/models/precomputed_text_latent_diffusion.py index d1ee9136..31acdb2f 100644 --- a/diffusion/models/precomputed_text_latent_diffusion.py +++ b/diffusion/models/precomputed_text_latent_diffusion.py @@ -189,7 +189,7 @@ def set_rng_generator(self, rng_generator: torch.Generator): self.rng_generator = rng_generator def encode_images(self, inputs, dtype=torch.bfloat16): - with torch.amp.autocast('cuda', enabled=False): + with torch.autocast(device_type='cuda', enabled=False): latents = self.vae.encode(inputs.to(dtype))['latent_dist'].sample().data latents = (latents - self.latent_mean) / self.latent_std # scale latents return latents From 37bbd4c61b151cfa204ae6c21ac794bfc6cbd814 Mon Sep 17 00:00:00 2001 From: Cory Stephenson Date: Fri, 4 Oct 2024 16:38:30 +0000 Subject: [PATCH 12/12] Update workflows --- .github/workflows/code-quality.yaml | 2 +- .github/workflows/docker.yaml | 6 ++++++ .github/workflows/pr-cpu.yaml | 8 ++++---- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index 261edd5e..16e2a125 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -24,8 +24,8 @@ jobs: strategy: matrix: python_version: - - "3.9" - "3.10" + - "3.11" pip_deps: - "[dev]" steps: diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 164b5ccb..20dd7c6e 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -29,6 +29,12 @@ jobs: - name: "2.4.0_cu124_aws" base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws dep_groups: "[all]" + - name: "2.4.1_cu124" + base_image: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04 + dep_groups: "[all]" + - name: "2.4.1_cu124_aws" + base_image: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws + dep_groups: "[all]" steps: - name: Checkout diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index b43ce0ae..b5f8cc2e 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -19,12 +19,12 @@ jobs: strategy: matrix: include: - - name: 'cpu-3.9-1.12' - container: mosaicml/pytorch:1.12.1_cpu-python3.9-ubuntu20.04 + - name: 'cpu-3.10-2.1' + container: mosaicml/pytorch:2.1.2_cpu-python3.10-ubuntu20.04 markers: 'not gpu' pytest_command: 'coverage run -m pytest' - - name: 'cpu-3.10-1.13' - container: mosaicml/pytorch:1.13.1_cpu-python3.10-ubuntu20.04 + - name: 'cpu-3.11-2.4' + container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04 markers: 'not gpu' pytest_command: 'coverage run -m pytest' name: ${{ matrix.name }}