From d6289361f6a8243a91b30d25e34469c3c9e583e9 Mon Sep 17 00:00:00 2001 From: Yifei Teng Date: Thu, 9 Jan 2025 14:04:18 -0800 Subject: [PATCH] Add back most Huggingface tests (#531) * Revert "Remove HuggingFace tests from PyTorch/XLA CI (#513)" This reverts commit 313e33af1c6011b071c773749d08ddb3524353af. * Add back most Huggingface tests As proposed in https://github.com/pytorch/xla/issues/8542, this change adds the accelerate smoke test and the BERT training example back to the 2.6 and nightly CI. Additionally, the llama2 training, accelerate smoke test, and BERT training setups are modified to install their Hugging Face dependencies against a shared pip constraints file. --- .../tests/pytorch/common.libsonnet | 11 +++ .../nightly/accelerate-smoke.libsonnet | 31 +++++++++ .../tests/pytorch/nightly/common.libsonnet | 42 ++++++++++++ .../tests/pytorch/nightly/hf-bert.libsonnet | 67 +++++++++++++++++++ .../pytorch/nightly/llama2-model.libsonnet | 13 ++-- .../tests/pytorch/nightly/targets.jsonnet | 6 +- .../pytorch/r2.6/accelerate-smoke.libsonnet | 31 +++++++++ .../tests/pytorch/r2.6/common.libsonnet | 41 ++++++++++++ .../tests/pytorch/r2.6/hf-bert.libsonnet | 67 +++++++++++++++++++ .../tests/pytorch/r2.6/llama2-model.libsonnet | 31 +++++---- .../tests/pytorch/r2.6/targets.jsonnet | 6 +- dags/pytorch_xla/nightly.py | 24 +++++++ dags/pytorch_xla/r2_6.py | 26 +++++++ 13 files changed, 372 insertions(+), 24 deletions(-) create mode 100644 dags/legacy_test/tests/pytorch/nightly/accelerate-smoke.libsonnet create mode 100644 dags/legacy_test/tests/pytorch/nightly/hf-bert.libsonnet create mode 100644 dags/legacy_test/tests/pytorch/r2.6/accelerate-smoke.libsonnet create mode 100644 dags/legacy_test/tests/pytorch/r2.6/hf-bert.libsonnet diff --git a/dags/legacy_test/tests/pytorch/common.libsonnet b/dags/legacy_test/tests/pytorch/common.libsonnet index ed42b81b..0a0745b2 100644 --- a/dags/legacy_test/tests/pytorch/common.libsonnet +++ b/dags/legacy_test/tests/pytorch/common.libsonnet @@ -94,4 +94,15 @@ local volumes = 
import 'templates/volumes.libsonnet'; }, }, }, + + // A list of Python library versions that are known to work with + // the Huggingface models and workflows exercised in PyTorch/XLA CI. + HuggingfacePipVersionConstraints:: ||| + accelerate==1.2.1 + datasets==3.2.0 + evaluate==0.4.3 + huggingface-hub==0.27.1 + safetensors==0.5.0 + tokenizers==0.19.1 + |||, } diff --git a/dags/legacy_test/tests/pytorch/nightly/accelerate-smoke.libsonnet b/dags/legacy_test/tests/pytorch/nightly/accelerate-smoke.libsonnet new file mode 100644 index 00000000..4e8040fd --- /dev/null +++ b/dags/legacy_test/tests/pytorch/nightly/accelerate-smoke.libsonnet @@ -0,0 +1,31 @@ +local experimental = import '../experimental.libsonnet'; +local common = import 'common.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; + +{ + local accelerate = self.accelerate, + accelerate:: common.PyTorchTest + common.Functional { + modelName: 'accelerate', + mode: 'smoke', + command: [ + 'accelerate', + 'test', + ], + }, + local pjrt = self.pjrt, + pjrt:: common.PyTorchTpuVmMixin + common.Accelerate, + + local v2_8 = self.v2_8, + v2_8:: { + accelerator: tpus.v2_8, + }, + local v4_8 = self.v4_8, + v4_8:: { + accelerator: tpus.v4_8, + }, + + configs: [ + accelerate + v2_8 + pjrt, + accelerate + v4_8 + pjrt, + ], +} diff --git a/dags/legacy_test/tests/pytorch/nightly/common.libsonnet b/dags/legacy_test/tests/pytorch/nightly/common.libsonnet index fe972b2f..f96f92ea 100644 --- a/dags/legacy_test/tests/pytorch/nightly/common.libsonnet +++ b/dags/legacy_test/tests/pytorch/nightly/common.libsonnet @@ -181,6 +181,48 @@ local volumes = import 'templates/volumes.libsonnet'; }, }, + + Accelerate:: { + local config = self, + tpuSettings+: { + tpuVmExports+: ||| + export PATH=~/.local/bin:$PATH + |||, + tpuVmExtraSetup: ||| + if [ -d "$HOME/.local/bin" ] ; then + export PATH="$HOME/.local/bin:$PATH" + fi + + cat > ~/hf-constraints.txt << 'HF_CONSTRAINTS_EOF' + %s + HF_CONSTRAINTS_EOF + pip install pytest accelerate 
-c ~/hf-constraints.txt + + mkdir -p ~/.cache/huggingface/accelerate/ + cat > ~/.cache/huggingface/accelerate/default_config.yaml << 'HF_CONFIG_EOF' + compute_environment: LOCAL_MACHINE + distributed_type: XLA + downcast_bf16: 'no' + machine_rank: 0 + main_training_function: main + mixed_precision: 'no' + num_machines: 1 + num_processes: null + rdzv_backend: static + same_network: true + tpu_env: [] + tpu_use_cluster: false + tpu_use_sudo: false + use_cpu: false + HF_CONFIG_EOF + + accelerate env + ||| % common.HuggingfacePipVersionConstraints, + }, + }, + + HuggingfacePipVersionConstraints:: common.HuggingfacePipVersionConstraints, + // DEPRECATED: Use PyTorchTpuVmMixin instead tpu_vm_nightly_install: self.PyTorchTpuVmMixin.tpuSettings.tpuVmPytorchSetup, } diff --git a/dags/legacy_test/tests/pytorch/nightly/hf-bert.libsonnet b/dags/legacy_test/tests/pytorch/nightly/hf-bert.libsonnet new file mode 100644 index 00000000..bc82305c --- /dev/null +++ b/dags/legacy_test/tests/pytorch/nightly/hf-bert.libsonnet @@ -0,0 +1,67 @@ +// Copyright 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +local experimental = import '../experimental.libsonnet'; +local common = import 'common.libsonnet'; +local timeouts = import 'templates/timeouts.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; + +{ + local bert = self.bert, + bert:: common.PyTorchTest { + modelName: 'hf-bert', + volumeMap+: { + datasets: common.datasetsVolume, + }, + command: [ + 'python3', + 'pytorch/xla/test/pjrt/test_train_hf_transformer.py', + '--logdir=$(MODEL_DIR)', + ], + }, + + local functional = self.functional, + functional:: common.Functional { + command+: [ + '--short_data', + ], + }, + local convergence = self.convergence, + convergence:: common.Convergence, + + local v4_8 = self.v4_8, + v4_8:: { + accelerator: tpus.v4_8, + }, + + local pjrt = self.pjrt, + pjrt:: common.PyTorchTpuVmMixin { + modelName+: '-pjrt', + tpuSettings+: { + tpuVmExtraSetup: ||| + cat > ~/hf-constraints.txt << 'HF_CONSTRAINTS_EOF' + %s + HF_CONSTRAINTS_EOF + pip install pytest accelerate -c ~/hf-constraints.txt + + pip install tensorboardX google-cloud-storage transformers evaluate scikit-learn -c ~/hf-constraints.txt + ||| % common.HuggingfacePipVersionConstraints, + }, + }, + + configs: [ + bert + functional + v4_8 + pjrt + timeouts.Hours(2), + bert + convergence + v4_8 + pjrt + timeouts.Hours(12), + ], +} diff --git a/dags/legacy_test/tests/pytorch/nightly/llama2-model.libsonnet b/dags/legacy_test/tests/pytorch/nightly/llama2-model.libsonnet index 7eb14bde..c0db5669 100644 --- a/dags/legacy_test/tests/pytorch/nightly/llama2-model.libsonnet +++ b/dags/legacy_test/tests/pytorch/nightly/llama2-model.libsonnet @@ -108,6 +108,10 @@ local utils = import 'templates/utils.libsonnet'; export TPU_MEGACORE=megacore_dense |||, tpuVmExtraSetup: ||| + cat > ~/hf-constraints.txt << 'HF_CONSTRAINTS_EOF' + %s + HF_CONSTRAINTS_EOF + # install tokenizer model gsutil cp gs://tpu-pytorch/lsiyuan-experiment/llama/spiece.model . 
@@ -115,18 +119,15 @@ local utils = import 'templates/utils.libsonnet'; git clone -b llama2-google-next-training https://github.com/pytorch-tpu/transformers.git cd transformers sudo pip3 uninstall transformers - sudo pip3 install -e . - pip3 install datasets - pip3 install evaluate - pip3 install scikit-learn - pip3 install accelerate + sudo pip3 install -e . -c ~/hf-constraints.txt + pip3 install datasets evaluate scikit-learn accelerate -c ~/hf-constraints.txt cd # 7B config mkdir 7B cd 7B/ gsutil cp gs://manfei_public_experimental/2B.json . - |||, + ||| % common.HuggingfacePipVersionConstraints, }, }, diff --git a/dags/legacy_test/tests/pytorch/nightly/targets.jsonnet b/dags/legacy_test/tests/pytorch/nightly/targets.jsonnet index 9f083707..4afd5d05 100644 --- a/dags/legacy_test/tests/pytorch/nightly/targets.jsonnet +++ b/dags/legacy_test/tests/pytorch/nightly/targets.jsonnet @@ -12,15 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+local accelerate = import 'accelerate-smoke.libsonnet'; local ci = import 'ci.libsonnet'; +local hfBert = import 'hf-bert.libsonnet'; local llama2 = import 'llama2-model.libsonnet'; local mnist = import 'mnist.libsonnet'; local resnet50_mp = import 'resnet50-mp.libsonnet'; // Add new models here std.flattenArrays([ + accelerate.configs, ci.configs, - llama2.configs, + hfBert.configs, mnist.configs, resnet50_mp.configs, + llama2.configs, ]) diff --git a/dags/legacy_test/tests/pytorch/r2.6/accelerate-smoke.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/accelerate-smoke.libsonnet new file mode 100644 index 00000000..4e8040fd --- /dev/null +++ b/dags/legacy_test/tests/pytorch/r2.6/accelerate-smoke.libsonnet @@ -0,0 +1,31 @@ +local experimental = import '../experimental.libsonnet'; +local common = import 'common.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; + +{ + local accelerate = self.accelerate, + accelerate:: common.PyTorchTest + common.Functional { + modelName: 'accelerate', + mode: 'smoke', + command: [ + 'accelerate', + 'test', + ], + }, + local pjrt = self.pjrt, + pjrt:: common.PyTorchTpuVmMixin + common.Accelerate, + + local v2_8 = self.v2_8, + v2_8:: { + accelerator: tpus.v2_8, + }, + local v4_8 = self.v4_8, + v4_8:: { + accelerator: tpus.v4_8, + }, + + configs: [ + accelerate + v2_8 + pjrt, + accelerate + v4_8 + pjrt, + ], +} diff --git a/dags/legacy_test/tests/pytorch/r2.6/common.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/common.libsonnet index e63a99e1..dcbe29d7 100644 --- a/dags/legacy_test/tests/pytorch/r2.6/common.libsonnet +++ b/dags/legacy_test/tests/pytorch/r2.6/common.libsonnet @@ -187,6 +187,47 @@ local rcVersion = 'rc3'; }, + Accelerate:: { + local config = self, + tpuSettings+: { + tpuVmExports+: ||| + export PATH=~/.local/bin:$PATH + |||, + tpuVmExtraSetup: ||| + if [ -d "$HOME/.local/bin" ] ; then + export PATH="$HOME/.local/bin:$PATH" + fi + + cat > ~/hf-constraints.txt << 'HF_CONSTRAINTS_EOF' + %s + HF_CONSTRAINTS_EOF + 
pip install pytest accelerate -c ~/hf-constraints.txt + + mkdir -p ~/.cache/huggingface/accelerate/ + cat > ~/.cache/huggingface/accelerate/default_config.yaml << 'HF_CONFIG_EOF' + compute_environment: LOCAL_MACHINE + distributed_type: XLA + downcast_bf16: 'no' + machine_rank: 0 + main_training_function: main + mixed_precision: 'no' + num_machines: 1 + num_processes: null + rdzv_backend: static + same_network: true + tpu_env: [] + tpu_use_cluster: false + tpu_use_sudo: false + use_cpu: false + HF_CONFIG_EOF + + accelerate env + ||| % common.HuggingfacePipVersionConstraints, + }, + }, + + HuggingfacePipVersionConstraints:: common.HuggingfacePipVersionConstraints, + // DEPRECATED: Use PyTorchTpuVmMixin instead tpu_vm_r2_6_install: self.PyTorchTpuVmMixin.tpuSettings.tpuVmPytorchSetup, } diff --git a/dags/legacy_test/tests/pytorch/r2.6/hf-bert.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/hf-bert.libsonnet new file mode 100644 index 00000000..bc82305c --- /dev/null +++ b/dags/legacy_test/tests/pytorch/r2.6/hf-bert.libsonnet @@ -0,0 +1,67 @@ +// Copyright 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +local experimental = import '../experimental.libsonnet'; +local common = import 'common.libsonnet'; +local timeouts = import 'templates/timeouts.libsonnet'; +local tpus = import 'templates/tpus.libsonnet'; + +{ + local bert = self.bert, + bert:: common.PyTorchTest { + modelName: 'hf-bert', + volumeMap+: { + datasets: common.datasetsVolume, + }, + command: [ + 'python3', + 'pytorch/xla/test/pjrt/test_train_hf_transformer.py', + '--logdir=$(MODEL_DIR)', + ], + }, + + local functional = self.functional, + functional:: common.Functional { + command+: [ + '--short_data', + ], + }, + local convergence = self.convergence, + convergence:: common.Convergence, + + local v4_8 = self.v4_8, + v4_8:: { + accelerator: tpus.v4_8, + }, + + local pjrt = self.pjrt, + pjrt:: common.PyTorchTpuVmMixin { + modelName+: '-pjrt', + tpuSettings+: { + tpuVmExtraSetup: ||| + cat > ~/hf-constraints.txt << 'HF_CONSTRAINTS_EOF' + %s + HF_CONSTRAINTS_EOF + pip install pytest accelerate -c ~/hf-constraints.txt + + pip install tensorboardX google-cloud-storage transformers evaluate scikit-learn -c ~/hf-constraints.txt + ||| % common.HuggingfacePipVersionConstraints, + }, + }, + + configs: [ + bert + functional + v4_8 + pjrt + timeouts.Hours(2), + bert + convergence + v4_8 + pjrt + timeouts.Hours(12), + ], +} diff --git a/dags/legacy_test/tests/pytorch/r2.6/llama2-model.libsonnet b/dags/legacy_test/tests/pytorch/r2.6/llama2-model.libsonnet index 6e9f8ea3..3051b7ba 100644 --- a/dags/legacy_test/tests/pytorch/r2.6/llama2-model.libsonnet +++ b/dags/legacy_test/tests/pytorch/r2.6/llama2-model.libsonnet @@ -108,6 +108,10 @@ local utils = import 'templates/utils.libsonnet'; export TPU_MEGACORE=megacore_dense |||, tpuVmExtraSetup: ||| + cat > ~/hf-constraints.txt << 'HF_CONSTRAINTS_EOF' + %s + HF_CONSTRAINTS_EOF + # install tokenizer model gsutil cp gs://tpu-pytorch/lsiyuan-experiment/llama/spiece.model . 
@@ -115,18 +119,15 @@ local utils = import 'templates/utils.libsonnet'; git clone -b llama2-google-next-training https://github.com/pytorch-tpu/transformers.git cd transformers sudo pip3 uninstall transformers - sudo pip3 install -e . - pip3 install datasets - pip3 install evaluate - pip3 install scikit-learn - pip3 install accelerate + sudo pip3 install -e . -c ~/hf-constraints.txt + pip3 install datasets evaluate scikit-learn accelerate -c ~/hf-constraints.txt cd # 7B config mkdir 7B cd 7B/ gsutil cp gs://manfei_public_experimental/2B.json . - |||, + ||| % common.HuggingfacePipVersionConstraints, }, }, local llama3_train = self.llama3_train, @@ -161,6 +162,10 @@ local utils = import 'templates/utils.libsonnet'; export XLA_USE_SPMD=1 |||, tpuVmExtraSetup: ||| + cat > ~/hf-constraints.txt << 'HF_CONSTRAINTS_EOF' + %s + HF_CONSTRAINTS_EOF + git clone -b flash_attention https://github.com/pytorch-tpu/transformers.git # install tokenizer model @@ -170,16 +175,10 @@ local utils = import 'templates/utils.libsonnet'; google-cloud-sdk/bin/gsutil cp -r gs://pytorch-airflow/llama_3/ . cd transformers - sudo pip3 install -e . - pip3 install datasets - pip3 install evaluate - pip3 install scikit-learn - pip3 install accelerate - pip3 install transformers - - pip install jax==0.4.33 -f https://storage.googleapis.com/jax-releases/jax_releases.html - pip install jaxlib==0.4.33 -f https://storage.googleapis.com/jax-releases/jaxlib_releases.html - |||, + sudo pip3 install -e . 
-c ~/hf-constraints.txt + pip3 install 'torch_xla[pallas]' -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html + pip3 install datasets evaluate scikit-learn accelerate -c ~/hf-constraints.txt + ||| % common.HuggingfacePipVersionConstraints, }, }, diff --git a/dags/legacy_test/tests/pytorch/r2.6/targets.jsonnet b/dags/legacy_test/tests/pytorch/r2.6/targets.jsonnet index 9f083707..4afd5d05 100644 --- a/dags/legacy_test/tests/pytorch/r2.6/targets.jsonnet +++ b/dags/legacy_test/tests/pytorch/r2.6/targets.jsonnet @@ -12,15 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. +local accelerate = import 'accelerate-smoke.libsonnet'; local ci = import 'ci.libsonnet'; +local hfBert = import 'hf-bert.libsonnet'; local llama2 = import 'llama2-model.libsonnet'; local mnist = import 'mnist.libsonnet'; local resnet50_mp = import 'resnet50-mp.libsonnet'; // Add new models here std.flattenArrays([ + accelerate.configs, ci.configs, - llama2.configs, + hfBert.configs, mnist.configs, resnet50_mp.configs, + llama2.configs, ]) diff --git a/dags/pytorch_xla/nightly.py b/dags/pytorch_xla/nightly.py index 4b3b329d..7db00332 100644 --- a/dags/pytorch_xla/nightly.py +++ b/dags/pytorch_xla/nightly.py @@ -142,6 +142,29 @@ def torchvision(): resnet_v100_2x2 >> resnet_v100_2x2_spmd +@task_group(prefix_group_id=False) +def huggingface(): + task.run_queued_resource_test( + test_config.JSonnetTpuVmTest.from_pytorch( + "pt-nightly-accelerate-smoke-v2-8-1vm", reserved=True + ), + US_CENTRAL1_C, + ) + task.run_queued_resource_test( + test_config.JSonnetTpuVmTest.from_pytorch( + "pt-nightly-accelerate-smoke-v4-8-1vm" + ), + US_CENTRAL2_B, + ) + + task.run_queued_resource_test( + test_config.JSonnetTpuVmTest.from_pytorch( + "pt-nightly-hf-bert-pjrt-func-v4-8-1vm" + ), + US_CENTRAL2_B, + ) + + @task_group(prefix_group_id=False) def llama(): 
llama_inference_v4_8 = task.run_queued_resource_test( @@ -166,6 +189,7 @@ def llama(): catchup=False, ): torchvision() + huggingface() llama() ci_v5lp_4 = task.run_queued_resource_test( diff --git a/dags/pytorch_xla/r2_6.py b/dags/pytorch_xla/r2_6.py index a9f8b442..ee3dac9a 100644 --- a/dags/pytorch_xla/r2_6.py +++ b/dags/pytorch_xla/r2_6.py @@ -138,6 +138,31 @@ def torchvision(): resnet_v100_2x2 >> resnet_v100_2x2_spmd +@task_group(prefix_group_id=False) +def huggingface(): + accelerate_v2_8 = task.run_queued_resource_test( + test_config.JSonnetTpuVmTest.from_pytorch( + "pt-2-6-accelerate-smoke-v2-8-1vm", reserved=True + ), + US_CENTRAL1_C, + ) + accelerate_v4_8 = task.run_queued_resource_test( + test_config.JSonnetTpuVmTest.from_pytorch( + "pt-2-6-accelerate-smoke-v4-8-1vm" + ), + US_CENTRAL2_B, + ) + + accelerate_v4_8 >> accelerate_v2_8 + + task.run_queued_resource_test( + test_config.JSonnetTpuVmTest.from_pytorch( + "pt-2-6-hf-bert-pjrt-func-v4-8-1vm" + ), + US_CENTRAL2_B, + ) + + @task_group(prefix_group_id=False) def llama(): llama_inference_v4_8 = task.run_queued_resource_test( @@ -197,6 +222,7 @@ def llama(): catchup=False, ): torchvision() + huggingface() llama() resnet_v5lp_4 = task.run_queued_resource_test(