Add an HNS-bucket variant of the PyTorch v2 DINO continuous test.

BUCKET_TYPE ("hns" or "non-hns") is passed from each Kokoro build.sh through
run_and_manage_test.sh, setup_host_and_run_container.sh and run_container.sh
into the Docker image. run_model.sh uses it to select the test bucket, append
"enable-hns: true" to the generated config file and pick the artifacts
directory. The patch also renames the misspelled PYTORCH_VESRION variable.

diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh
index 078fa09f59..49eb29cc1d 100755
--- a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh
+++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v1_12/dino/build.sh
@@ -21,7 +21,8 @@ ZONE_NAME="us-west1-b"
 ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v1_12/dino"
 TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh"
 PYTORCH_VERSION="v1_12"
+BUCKET_TYPE="non-hns"
 
 cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"
 
-source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION
+source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION $BUCKET_TYPE
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh
index b4d44bf9fc..183a588094 100755
--- a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh
+++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2/dino/build.sh
@@ -21,7 +21,8 @@ ZONE_NAME="asia-northeast1-a"
 ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v2/dino"
 TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh"
 PYTORCH_VERSION="v2"
+BUCKET_TYPE="non-hns"
 
 cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"
 
-source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION
+source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION $BUCKET_TYPE
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2_hns/dino/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2_hns/dino/build.sh
new file mode 100755
index 0000000000..5c2ca5ff1a
--- /dev/null
+++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2_hns/dino/build.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Stop execution as soon as any command exits with a non-zero status.
+set -e
+
+VM_NAME="pytorch2-dino-7d-a100-gpu-hns-bucket"
+ZONE_NAME="us-central1-f"
+ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v2_hns/dino"
+TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh"
+PYTORCH_VERSION="v2"
+BUCKET_TYPE="hns"
+
+cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"
+
+source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION $BUCKET_TYPE
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2_hns/dino/continuous.cfg b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2_hns/dino/continuous.cfg
new file mode 100644
index 0000000000..6fcfa505cd
--- /dev/null
+++ b/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2_hns/dino/continuous.cfg
@@ -0,0 +1,18 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2_hns/dino/build.sh"
+
+# 1 hour timeout.
+timeout_mins: 60
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh b/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh
index 52b2f47e81..ae7af3b5c6 100755
--- a/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh
+++ b/perfmetrics/scripts/continuous_test/ml_tests/run_and_manage_test.sh
@@ -29,6 +29,7 @@ ARTIFACTS_BUCKET_PATH=$3
 TEST_SCRIPT_PATH=$4
 # pytorch version
 PYTORCH_VERSION=$5
+BUCKET_TYPE=$6 # bucket type: "hns" or "non-hns" (empty when the caller omits it)
 RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus"
 
 function initialize_ssh_key () {
@@ -65,6 +66,11 @@ function delete_existing_vm_and_create_new () {
     RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-pytorch2-2gpu"
   fi
 
+  if [ "$BUCKET_TYPE" == "hns" ];
+  then
+    RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-hns-bucket"
+  fi
+
   echo "Creating VM $VM_NAME in zone $ZONE_NAME"
   # The below command creates VM using the reservation 'ai-ml-tests'
   sudo gcloud compute instances create $VM_NAME \
@@ -155,7 +161,7 @@ then
   echo "Clone the gcsfuse repo on test VM"
   sudo gcloud compute ssh $VM_NAME --zone $ZONE_NAME --internal-ip --command "mkdir github; cd github; git clone https://github.com/GoogleCloudPlatform/gcsfuse.git; cd gcsfuse; git checkout master;"
   echo "Trigger the build script on test VM"
-  sudo gcloud compute ssh $VM_NAME --zone $ZONE_NAME --internal-ip --command "bash \$HOME/$TEST_SCRIPT_PATH 1> \$HOME/build.out 2> \$HOME/build.err &"
+  sudo gcloud compute ssh $VM_NAME --zone $ZONE_NAME --internal-ip --command "bash \$HOME/$TEST_SCRIPT_PATH $BUCKET_TYPE 1> \$HOME/build.out 2> \$HOME/build.err &"
   echo "Wait for 15 minutes for test VM to setup for test and to change the status from START to RUNNING."
   sleep 900s
 
diff --git a/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet/build.sh
index ef44d8eaba..b98a908d7e 100755
--- a/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet/build.sh
+++ b/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet/build.sh
@@ -24,4 +24,3 @@ TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/tf/resnet/setup_ho
 cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"
 
 source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH
-
diff --git a/perfmetrics/scripts/ml_tests/pytorch/run_container.sh b/perfmetrics/scripts/ml_tests/pytorch/run_container.sh
index 99e5383519..cbe793ed15 100644
--- a/perfmetrics/scripts/ml_tests/pytorch/run_container.sh
+++ b/perfmetrics/scripts/ml_tests/pytorch/run_container.sh
@@ -14,10 +14,12 @@
 # limitations under the License.
 set -e
 # pytorch version (e.g. v1_12, v2)
-PYTORCH_VESRION=$1
+PYTORCH_VERSION=$1
+BUCKET_TYPE=$2 # bucket type ("hns" or "non-hns"), forwarded to the image as a build arg
+
 cd "$HOME/github/gcsfuse"
 echo "Building docker image containing all pytorch libraries..."
-sudo docker build . -f perfmetrics/scripts/ml_tests/pytorch/${PYTORCH_VESRION}/dino/Dockerfile --tag pytorch-gcsfuse
+sudo docker build . -f perfmetrics/scripts/ml_tests/pytorch/${PYTORCH_VERSION}/dino/Dockerfile --tag pytorch-gcsfuse --build-arg PYTORCH_VERSION="${PYTORCH_VERSION}" --build-arg BUCKET_TYPE="${BUCKET_TYPE}"
 
 mkdir -p container_artifacts
 
diff --git a/perfmetrics/scripts/ml_tests/pytorch/run_model.sh b/perfmetrics/scripts/ml_tests/pytorch/run_model.sh
index a0ede84a25..556e7d4d41 100755
--- a/perfmetrics/scripts/ml_tests/pytorch/run_model.sh
+++ b/perfmetrics/scripts/ml_tests/pytorch/run_model.sh
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-PYTORCH_VESRION=$1
+PYTORCH_VERSION=$1
+BUCKET_TYPE=$2 # bucket type: "hns" selects the HNS bucket and config below
+
 NUM_EPOCHS=80
 TEST_BUCKET="gcsfuse-ml-data"
 
@@ -29,10 +31,10 @@ CGO_ENABLED=0 go build .
 cd -
 
 # Create a directory for gcsfuse logs
-mkdir run_artifacts/gcsfuse_logs 
+mkdir run_artifacts/gcsfuse_logs
 
 # We have created a bucket in the asia-northeast1 region to align with the location of our PyTorch 2.0 VM, which is also in asia-northeast1.
-if [ ${PYTORCH_VESRION} == "v2" ];
+if [ "${PYTORCH_VERSION}" == "v2" ];
 then
   TEST_BUCKET="gcsfuse-ml-data-asia-northeast1"
 fi
@@ -51,6 +53,16 @@ metadata-cache:
   ttl-secs: 1728000
   stat-cache-max-size-mb: 3200
 EOF
+
+DIR=${PYTORCH_VERSION}
+# Enable the enable-hns flag to run tests on the folder APIs with an HNS bucket.
+if [ "${BUCKET_TYPE}" == "hns" ];
+then
+  TEST_BUCKET="gcsfuse-ml-data-hns-central1"
+  echo "enable-hns: true" >> "$config_filename"
+  DIR=${DIR}_${BUCKET_TYPE}
+fi
+
 echo "Created config-file at "$config_filename
 
 echo "Mounting GCSFuse..."
@@ -87,7 +99,7 @@ python -c 'import torch;torch.hub.list("facebookresearch/xcit:main")'
 # (TulsiShah) TODO: Pytorch 2.0 compile mode has issues (https://github.com/pytorch/pytorch/issues/94599),
 # which is fixed in pytorch version 2.1.0 (https://github.com/pytorch/pytorch/pull/100071).
 # We'll remove this workaround once we update our Docker image to use Pytorch 2.1.0 or greater version.
-if [ ${PYTORCH_VESRION} == "v2" ];
+if [ "${PYTORCH_VERSION}" == "v2" ];
 then
   allowed_functions_file="/opt/conda/lib/python3.10/site-packages/torch/_dynamo/allowed_functions.py"
   # Update the pytorch library code to bypass the kernel-cache
@@ -184,7 +196,7 @@ dynamo_unsupported_distributed_c10d_ops = [
 ]" >> $distributed_c10d_file
 fi
 
-ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/${PYTORCH_VESRION}/dino"
+ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/${DIR}/dino"
 echo "Update status file"
 echo "RUNNING" > status.txt
 gsutil cp status.txt $ARTIFACTS_BUCKET_PATH/
diff --git a/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/Dockerfile b/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/Dockerfile
index 9e911cb100..9653d2b3bc 100644
--- a/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/Dockerfile
+++ b/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/Dockerfile
@@ -29,6 +29,10 @@ COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./
 RUN mkdir -p "run_artifacts"
 RUN mkdir -p "gcsfuse_data"
 
-ENV PYTORCH_VERSION="v1_12"
+# Build args set by run_container.sh via --build-arg; the defaults keep a plain "docker build" usable.
+ARG PYTORCH_VERSION="v1_12"
+ARG BUCKET_TYPE="non-hns"
+ENV PYTORCH_VERSION=${PYTORCH_VERSION}
+ENV BUCKET_TYPE=${BUCKET_TYPE}
 
-ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"]
+ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION} ${BUCKET_TYPE}"]
diff --git a/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh b/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh
index 6ad24239c4..98c874b25c 100755
--- a/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh
+++ b/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh
@@ -24,4 +24,5 @@ DRIVER_VERSION="450.172.01"
 source ml_tests/setup_host.sh $DRIVER_VERSION
 
 PYTORCH_VERSION="v1_12"
-source ml_tests/pytorch/run_container.sh $PYTORCH_VERSION
+BUCKET_TYPE=$1 # bucket type passed on the command line by run_and_manage_test.sh
+source ml_tests/pytorch/run_container.sh $PYTORCH_VERSION $BUCKET_TYPE
diff --git a/perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile
index ed167041f0..b92732b215 100644
--- a/perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile
+++ b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile
@@ -39,6 +39,10 @@ COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./
 RUN mkdir -p "run_artifacts"
 RUN mkdir -p "gcsfuse_data"
 
-ENV PYTORCH_VERSION="v2"
+# Build args set by run_container.sh via --build-arg; the defaults keep a plain "docker build" usable.
+ARG PYTORCH_VERSION="v2"
+ARG BUCKET_TYPE="non-hns"
+ENV PYTORCH_VERSION=${PYTORCH_VERSION}
+ENV BUCKET_TYPE=${BUCKET_TYPE}
 
-ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"]
+ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION} ${BUCKET_TYPE}"]
diff --git a/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh
index 18d926addf..6cfb5a7297 100755
--- a/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh
+++ b/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh
@@ -22,5 +22,8 @@ echo "Setting up the machine with Docker and Nvidia Driver"
 DRIVER_VERSION="520.61.05"
 source ml_tests/setup_host.sh $DRIVER_VERSION
 
+
 PYTORCH_VERSION="v2"
-source ml_tests/pytorch/run_container.sh $PYTORCH_VERSION
+BUCKET_TYPE=$1 # bucket type passed on the command line by run_and_manage_test.sh
+
+source ml_tests/pytorch/run_container.sh $PYTORCH_VERSION $BUCKET_TYPE