From ce76f7e059956b6ceafcba511d407b4cecbb2589 Mon Sep 17 00:00:00 2001 From: Yifei Teng Date: Thu, 19 Dec 2024 01:01:56 -0800 Subject: [PATCH] Remove HuggingFace tests from PyTorch/XLA CI These tests frequently break and a better place for them is in HuggingFace upstream. --- .../nightly/accelerate-smoke.libsonnet | 31 ------- .../tests/pytorch/nightly/common.libsonnet | 39 --------- .../tests/pytorch/nightly/hf-bert.libsonnet | 62 -------------- .../pytorch/nightly/hf-diffusers.libsonnet | 82 ------------------- .../tests/pytorch/nightly/targets.jsonnet | 8 +- .../pytorch/r2.6/accelerate-smoke.libsonnet | 31 ------- .../tests/pytorch/r2.6/common.libsonnet | 38 --------- .../tests/pytorch/r2.6/hf-bert.libsonnet | 62 -------------- .../tests/pytorch/r2.6/hf-diffusers.libsonnet | 81 ------------------ .../tests/pytorch/r2.6/targets.jsonnet | 8 +- dags/pytorch_xla/nightly.py | 32 -------- dags/pytorch_xla/r2_6.py | 33 -------- 12 files changed, 2 insertions(+), 505 deletions(-) delete mode 100644 dags/legacy_test/tests/pytorch/nightly/accelerate-smoke.libsonnet delete mode 100644 dags/legacy_test/tests/pytorch/nightly/hf-bert.libsonnet delete mode 100644 dags/legacy_test/tests/pytorch/nightly/hf-diffusers.libsonnet delete mode 100644 dags/legacy_test/tests/pytorch/r2.6/accelerate-smoke.libsonnet delete mode 100644 dags/legacy_test/tests/pytorch/r2.6/hf-bert.libsonnet delete mode 100644 dags/legacy_test/tests/pytorch/r2.6/hf-diffusers.libsonnet diff --git a/dags/legacy_test/tests/pytorch/nightly/accelerate-smoke.libsonnet b/dags/legacy_test/tests/pytorch/nightly/accelerate-smoke.libsonnet deleted file mode 100644 index 4e8040fd..00000000 --- a/dags/legacy_test/tests/pytorch/nightly/accelerate-smoke.libsonnet +++ /dev/null @@ -1,31 +0,0 @@ -local experimental = import '../experimental.libsonnet'; -local common = import 'common.libsonnet'; -local tpus = import 'templates/tpus.libsonnet'; - -{ - local accelerate = self.accelerate, - accelerate:: common.PyTorchTest + common.Functional { - modelName: 'accelerate', - mode: 'smoke', - command: [ - 'accelerate', - 'test', - ], - }, - local pjrt = self.pjrt, - pjrt:: common.PyTorchTpuVmMixin + common.Accelerate, - - local v2_8 = self.v2_8, - v2_8:: { - accelerator: tpus.v2_8, - }, - local v4_8 = self.v4_8, - v4_8:: { - accelerator: tpus.v4_8, - }, - - configs: [ - accelerate + v2_8 + pjrt, - accelerate + v4_8 + pjrt, - ], -} diff --git a/dags/legacy_test/tests/pytorch/nightly/common.libsonnet b/dags/legacy_test/tests/pytorch/nightly/common.libsonnet index dd770ec6..fe972b2f 100644 --- a/dags/legacy_test/tests/pytorch/nightly/common.libsonnet +++ b/dags/legacy_test/tests/pytorch/nightly/common.libsonnet @@ -181,45 +181,6 @@ local volumes = import 'templates/volumes.libsonnet'; }, }, - - Accelerate:: { - local config = self, - tpuSettings+: { - tpuVmExports+: ||| - export PATH=~/.local/bin:$PATH - |||, - tpuVmExtraSetup: ||| - if [ -d "$HOME/.local/bin" ] ; then - export PATH="$HOME/.local/bin:$PATH" - fi - # Dependency of accelerate, unfortunately there is no requirements.txt in accelerate. - pip install pytest - git clone https://github.com/huggingface/accelerate.git - pip install ./accelerate - - mkdir -p ~/.cache/huggingface/accelerate/ - cat > ~/.cache/huggingface/accelerate/default_config.yaml << 'HF_CONFIG_EOF' - compute_environment: LOCAL_MACHINE - distributed_type: XLA - downcast_bf16: 'no' - machine_rank: 0 - main_training_function: main - mixed_precision: 'no' - num_machines: 1 - num_processes: %d - rdzv_backend: static - same_network: true - tpu_env: [] - tpu_use_cluster: false - tpu_use_sudo: false - use_cpu: false - HF_CONFIG_EOF - - accelerate env - ||| % [config.accelerator.numCores], - }, - }, - // DEPRECATED: Use PyTorchTpuVmMixin instead tpu_vm_nightly_install: self.PyTorchTpuVmMixin.tpuSettings.tpuVmPytorchSetup, } diff --git a/dags/legacy_test/tests/pytorch/nightly/hf-bert.libsonnet b/dags/legacy_test/tests/pytorch/nightly/hf-bert.libsonnet deleted file mode 100644 index 2f6aff0b..00000000 --- a/dags/legacy_test/tests/pytorch/nightly/hf-bert.libsonnet +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2023 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -local experimental = import '../experimental.libsonnet'; -local common = import 'common.libsonnet'; -local timeouts = import 'templates/timeouts.libsonnet'; -local tpus = import 'templates/tpus.libsonnet'; - -{ - local bert = self.bert, - bert:: common.PyTorchTest { - modelName: 'hf-bert', - volumeMap+: { - datasets: common.datasetsVolume, - }, - command: [ - 'python3', - 'pytorch/xla/test/pjrt/test_train_hf_transformer.py', - '--logdir=$(MODEL_DIR)', - ], - }, - - local functional = self.functional, - functional:: common.Functional { - command+: [ - '--short_data', - ], - }, - local convergence = self.convergence, - convergence:: common.Convergence, - - local v4_8 = self.v4_8, - v4_8:: { - accelerator: tpus.v4_8, - }, - - local pjrt = self.pjrt, - pjrt:: common.PyTorchTpuVmMixin { - modelName+: '-pjrt', - tpuSettings+: { - tpuVmExtraSetup: ||| - pip install tensorboardX google-cloud-storage transformers evaluate scikit-learn - |||, - }, - }, - - configs: [ - bert + functional + v4_8 + pjrt + timeouts.Hours(2), - bert + convergence + v4_8 + pjrt + timeouts.Hours(12), - ], -} diff --git a/dags/legacy_test/tests/pytorch/nightly/hf-diffusers.libsonnet b/dags/legacy_test/tests/pytorch/nightly/hf-diffusers.libsonnet deleted file mode 100644 index 83347982..00000000 --- a/dags/legacy_test/tests/pytorch/nightly/hf-diffusers.libsonnet +++ /dev/null @@ -1,82 +0,0 @@ -local experimental = import '../experimental.libsonnet'; -local common = import 'common.libsonnet'; -local tpus = import 'templates/tpus.libsonnet'; - -{ - local diffusers = self.diffusers, - diffusers:: common.PyTorchTest + common.Functional { - modelName: 'hf-diffusers', - command: [ - 'accelerate', - 'launch', - 'train_text_to_image.py', - '--pretrained_model_name_or_path=CompVis/stable-diffusion-v1-4', - '--dataset_name=huggan/smithsonian_butterflies_subset', - "--caption_column=image_url", - '--use_ema', - '--resolution=512', - '--center_crop', - '--random_flip', - '--train_batch_size=1', - '--learning_rate=1e-05', - '--max_grad_norm=1', - '--lr_scheduler=constant', - '--lr_warmup_steps=0', - '--output_dir=/tmp/sd-pokemon-model', - '--checkpoints_total_limit=1', - '--checkpointing_steps=6000', - ], - }, - - local functional = self.functional, - functional:: common.Functional { - command+: [ - '--max_train_steps=100', - ], - }, - local convergence = self.convergence, - convergence:: common.Convergence { - command+: [ - '--max_train_steps=5000', - ], - }, - - local pjrt = self.pjrt, - pjrt:: common.PyTorchTpuVmMixin + common.Accelerate { - tpuSettings+: { - tpuVmExports+: ||| - export XLA_USE_BF16=1 - cd diffusers/examples/text_to_image/ - |||, - tpuVmExtraSetup+: ||| - git clone https://github.com/huggingface/diffusers - cd diffusers - pip install . - - cd examples/text_to_image - sed '/accelerate>=0.28.0/d' requirements.txt > clean_requirements.txt - sed '/torchvision/d' requirements.txt > clean_requirements.txt - sed -i 's/transformers>=.*/transformers>=4.36.2/g' clean_requirements.txt - echo "Pillow>=9.4.0" >> clean_requirements.txt - pip install -r clean_requirements.txt - - # Skip saving the pretrained model, which contains invalid tensor storage - sed -i 's/pipeline.save_pretrained(args.output_dir)//g' train_text_to_image.py - |||, - }, - }, - - local v2_8 = self.v2_8, - v2_8:: { - accelerator: tpus.v2_8, - }, - local v4_8 = self.v4_8, - v4_8:: { - accelerator: tpus.v4_8, - }, - - configs: [ - diffusers + functional + v4_8 + pjrt, - diffusers + convergence + v4_8 + pjrt, - ], -} diff --git a/dags/legacy_test/tests/pytorch/nightly/targets.jsonnet b/dags/legacy_test/tests/pytorch/nightly/targets.jsonnet index d28f1049..9f083707 100644 --- a/dags/legacy_test/tests/pytorch/nightly/targets.jsonnet +++ b/dags/legacy_test/tests/pytorch/nightly/targets.jsonnet @@ -12,21 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -local accelerate = import 'accelerate-smoke.libsonnet'; local ci = import 'ci.libsonnet'; -local hfBert = import 'hf-bert.libsonnet'; -local huggingfaceDiffusers = import 'hf-diffusers.libsonnet'; local llama2 = import 'llama2-model.libsonnet'; local mnist = import 'mnist.libsonnet'; local resnet50_mp = import 'resnet50-mp.libsonnet'; // Add new models here std.flattenArrays([ - accelerate.configs, ci.configs, - hfBert.configs, - huggingfaceDiffusers.configs, + llama2.configs, mnist.configs, resnet50_mp.configs, - llama2.configs, ]) diff --git a/dags/legacy_test/tests/pytorch/r2.6/accelerate-smoke.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/accelerate-smoke.libsonnet deleted file mode 100644 index 4e8040fd..00000000 --- a/dags/legacy_test/tests/pytorch/r2.6/accelerate-smoke.libsonnet +++ /dev/null @@ -1,31 +0,0 @@ -local experimental = import '../experimental.libsonnet'; -local common = import 'common.libsonnet'; -local tpus = import 'templates/tpus.libsonnet'; - -{ - local accelerate = self.accelerate, - accelerate:: common.PyTorchTest + common.Functional { - modelName: 'accelerate', - mode: 'smoke', - command: [ - 'accelerate', - 'test', - ], - }, - local pjrt = self.pjrt, - pjrt:: common.PyTorchTpuVmMixin + common.Accelerate, - - local v2_8 = self.v2_8, - v2_8:: { - accelerator: tpus.v2_8, - }, - local v4_8 = self.v4_8, - v4_8:: { - accelerator: tpus.v4_8, - }, - - configs: [ - accelerate + v2_8 + pjrt, - accelerate + v4_8 + pjrt, - ], -} diff --git a/dags/legacy_test/tests/pytorch/r2.6/common.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/common.libsonnet index 84791bff..1d3f9ae9 100644 --- a/dags/legacy_test/tests/pytorch/r2.6/common.libsonnet +++ b/dags/legacy_test/tests/pytorch/r2.6/common.libsonnet @@ -185,44 +185,6 @@ local volumes = import 'templates/volumes.libsonnet'; }, - Accelerate:: { - local config = self, - tpuSettings+: { - tpuVmExports+: ||| - export PATH=~/.local/bin:$PATH - |||, - tpuVmExtraSetup: ||| - if [ -d "$HOME/.local/bin" ] ; then - export PATH="$HOME/.local/bin:$PATH" - fi - # Dependency of accelerate, unfortunately there is no requirements.txt in accelerate. - pip install pytest - git clone https://github.com/huggingface/accelerate.git - pip install ./accelerate - - mkdir -p ~/.cache/huggingface/accelerate/ - cat > ~/.cache/huggingface/accelerate/default_config.yaml << 'HF_CONFIG_EOF' - compute_environment: LOCAL_MACHINE - distributed_type: XLA - downcast_bf16: 'no' - machine_rank: 0 - main_training_function: main - mixed_precision: 'no' - num_machines: 1 - num_processes: %d - rdzv_backend: static - same_network: true - tpu_env: [] - tpu_use_cluster: false - tpu_use_sudo: false - use_cpu: false - HF_CONFIG_EOF - - accelerate env - ||| % [config.accelerator.numCores], - }, - }, - // DEPRECATED: Use PyTorchTpuVmMixin instead tpu_vm_r2_6_install: self.PyTorchTpuVmMixin.tpuSettings.tpuVmPytorchSetup, } diff --git a/dags/legacy_test/tests/pytorch/r2.6/hf-bert.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/hf-bert.libsonnet deleted file mode 100644 index 2f6aff0b..00000000 --- a/dags/legacy_test/tests/pytorch/r2.6/hf-bert.libsonnet +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2023 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -local experimental = import '../experimental.libsonnet'; -local common = import 'common.libsonnet'; -local timeouts = import 'templates/timeouts.libsonnet'; -local tpus = import 'templates/tpus.libsonnet'; - -{ - local bert = self.bert, - bert:: common.PyTorchTest { - modelName: 'hf-bert', - volumeMap+: { - datasets: common.datasetsVolume, - }, - command: [ - 'python3', - 'pytorch/xla/test/pjrt/test_train_hf_transformer.py', - '--logdir=$(MODEL_DIR)', - ], - }, - - local functional = self.functional, - functional:: common.Functional { - command+: [ - '--short_data', - ], - }, - local convergence = self.convergence, - convergence:: common.Convergence, - - local v4_8 = self.v4_8, - v4_8:: { - accelerator: tpus.v4_8, - }, - - local pjrt = self.pjrt, - pjrt:: common.PyTorchTpuVmMixin { - modelName+: '-pjrt', - tpuSettings+: { - tpuVmExtraSetup: ||| - pip install tensorboardX google-cloud-storage transformers evaluate scikit-learn - |||, - }, - }, - - configs: [ - bert + functional + v4_8 + pjrt + timeouts.Hours(2), - bert + convergence + v4_8 + pjrt + timeouts.Hours(12), - ], -} diff --git a/dags/legacy_test/tests/pytorch/r2.6/hf-diffusers.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/hf-diffusers.libsonnet deleted file mode 100644 index 53a7a811..00000000 --- a/dags/legacy_test/tests/pytorch/r2.6/hf-diffusers.libsonnet +++ /dev/null @@ -1,81 +0,0 @@ -local experimental = import '../experimental.libsonnet'; -local common = import 'common.libsonnet'; -local tpus = import 'templates/tpus.libsonnet'; - -{ - local diffusers = self.diffusers, - diffusers:: common.PyTorchTest + common.Functional { - modelName: 'hf-diffusers', - command: [ - 'accelerate', - 'launch', - 'train_text_to_image.py', - '--pretrained_model_name_or_path=CompVis/stable-diffusion-v1-4', - '--dataset_name=huggan/smithsonian_butterflies_subset', - '--caption_column=image_url', - '--use_ema', - '--resolution=512', - '--center_crop', - '--random_flip', - '--train_batch_size=1', - '--learning_rate=1e-05', - '--max_grad_norm=1', - '--lr_scheduler=constant', - '--lr_warmup_steps=0', - '--output_dir=/tmp/sd-pokemon-model', - '--checkpoints_total_limit=1', - '--checkpointing_steps=6000', - ], - }, - - local functional = self.functional, - functional:: common.Functional { - command+: [ - '--max_train_steps=100', - ], - }, - local convergence = self.convergence, - convergence:: common.Convergence { - command+: [ - '--max_train_steps=5000', - ], - }, - - local pjrt = self.pjrt, - pjrt:: common.PyTorchTpuVmMixin + common.Accelerate { - tpuSettings+: { - tpuVmExports+: ||| - export XLA_USE_BF16=1 - cd diffusers/examples/text_to_image/ - |||, - tpuVmExtraSetup+: ||| - git clone https://github.com/huggingface/diffusers - cd diffusers - pip install . - - cd examples/text_to_image - sed '/accelerate>=0.28.0/d' requirements.txt > clean_requirements.txt - sed '/torchvision/d' requirements.txt > clean_requirements.txt - sed -i 's/transformers>=.*/transformers>=4.36.2/g' clean_requirements.txt - pip install -r clean_requirements.txt - - # Skip saving the pretrained model, which contains invalid tensor storage - sed -i 's/pipeline.save_pretrained(args.output_dir)//g' train_text_to_image.py - |||, - }, - }, - - local v2_8 = self.v2_8, - v2_8:: { - accelerator: tpus.v2_8, - }, - local v4_8 = self.v4_8, - v4_8:: { - accelerator: tpus.v4_8, - }, - - configs: [ - diffusers + functional + v4_8 + pjrt, - diffusers + convergence + v4_8 + pjrt, - ], -} diff --git a/dags/legacy_test/tests/pytorch/r2.6/targets.jsonnet b/dags/legacy_test/tests/pytorch/r2.6/targets.jsonnet index d28f1049..9f083707 100644 --- a/dags/legacy_test/tests/pytorch/r2.6/targets.jsonnet +++ b/dags/legacy_test/tests/pytorch/r2.6/targets.jsonnet @@ -12,21 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -local accelerate = import 'accelerate-smoke.libsonnet'; local ci = import 'ci.libsonnet'; -local hfBert = import 'hf-bert.libsonnet'; -local huggingfaceDiffusers = import 'hf-diffusers.libsonnet'; local llama2 = import 'llama2-model.libsonnet'; local mnist = import 'mnist.libsonnet'; local resnet50_mp = import 'resnet50-mp.libsonnet'; // Add new models here std.flattenArrays([ - accelerate.configs, ci.configs, - hfBert.configs, - huggingfaceDiffusers.configs, + llama2.configs, mnist.configs, resnet50_mp.configs, - llama2.configs, ]) diff --git a/dags/pytorch_xla/nightly.py b/dags/pytorch_xla/nightly.py index 7df8b36c..4b3b329d 100644 --- a/dags/pytorch_xla/nightly.py +++ b/dags/pytorch_xla/nightly.py @@ -142,37 +142,6 @@ def torchvision(): resnet_v100_2x2 >> resnet_v100_2x2_spmd -@task_group(prefix_group_id=False) -def huggingface(): - task.run_queued_resource_test( - test_config.JSonnetTpuVmTest.from_pytorch( - "pt-nightly-accelerate-smoke-v2-8-1vm", reserved=True - ), - US_CENTRAL1_C, - ) - accelerate_v4_8 = task.run_queued_resource_test( - test_config.JSonnetTpuVmTest.from_pytorch( - "pt-nightly-accelerate-smoke-v4-8-1vm" - ), - US_CENTRAL2_B, - ) - diffusers_v4_8 = task.run_queued_resource_test( - test_config.JSonnetTpuVmTest.from_pytorch( - "pt-nightly-hf-diffusers-func-v4-8-1vm" - ), - US_CENTRAL2_B, - ) - - accelerate_v4_8 >> diffusers_v4_8 - - task.run_queued_resource_test( - test_config.JSonnetTpuVmTest.from_pytorch( - "pt-nightly-hf-bert-pjrt-func-v4-8-1vm" - ), - US_CENTRAL2_B, - ) - - @task_group(prefix_group_id=False) def llama(): llama_inference_v4_8 = task.run_queued_resource_test( @@ -197,7 +166,6 @@ def llama(): catchup=False, ): torchvision() - huggingface() llama() ci_v5lp_4 = task.run_queued_resource_test( diff --git a/dags/pytorch_xla/r2_6.py b/dags/pytorch_xla/r2_6.py index 14f68192..a9f8b442 100644 --- a/dags/pytorch_xla/r2_6.py +++ b/dags/pytorch_xla/r2_6.py @@ -138,38 +138,6 @@ def torchvision(): resnet_v100_2x2 >> resnet_v100_2x2_spmd -@task_group(prefix_group_id=False) -def huggingface(): - accelerate_v2_8 = task.run_queued_resource_test( - test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-6-accelerate-smoke-v2-8-1vm", reserved=True - ), - US_CENTRAL1_C, - ) - accelerate_v4_8 = task.run_queued_resource_test( - test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-6-accelerate-smoke-v4-8-1vm" - ), - US_CENTRAL2_B, - ) - diffusers_v4_8 = task.run_queued_resource_test( - test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-6-hf-diffusers-func-v4-8-1vm" - ), - US_CENTRAL2_B, - ) - - accelerate_v4_8 >> accelerate_v2_8 - accelerate_v4_8 >> diffusers_v4_8 - - task.run_queued_resource_test( - test_config.JSonnetTpuVmTest.from_pytorch( - "pt-2-6-hf-bert-pjrt-func-v4-8-1vm" - ), - US_CENTRAL2_B, - ) - - @task_group(prefix_group_id=False) def llama(): llama_inference_v4_8 = task.run_queued_resource_test( @@ -229,7 +197,6 @@ def llama(): catchup=False, ): torchvision() - huggingface() llama() resnet_v5lp_4 = task.run_queued_resource_test(