Skip to content

Commit

Permalink
Pytorch 2.0 AI-ML tests on HNS bucket (#2375)
Browse files Browse the repository at this point in the history
* automate hns bucket tests

* small fix in pytorch setup

* small fix in pytorch setup

* testing

* test on VM

* test on VM

* test VM

* test VM

* test VM

* remove sleep

* test on VM

* test on VM

* small fix

* small fix

* remove checkout

* update comment

* remove duplicate Docker file

* remove local dependency

* small fix

* remove print

* small fix
  • Loading branch information
Tulsishah authored Aug 27, 2024
1 parent 8a89e1f commit 7aba348
Show file tree
Hide file tree
Showing 12 changed files with 95 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ ZONE_NAME="us-west1-b"
ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v1_12/dino"
TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh"
PYTORCH_VERSION="v1_12"
BUCKET_TYPE="non-hns"

cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"

source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION
source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION $BUCKET_TYPE
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ ZONE_NAME="asia-northeast1-a"
ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v2/dino"
TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh"
PYTORCH_VERSION="v2"
BUCKET_TYPE="non-hns"

cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"

source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION
source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION $BUCKET_TYPE
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Abort the script as soon as any command exits with a non-zero status.
set -e

# Parameters for the PyTorch v2 DINO test run against an HNS bucket. They are
# handed positionally to run_and_manage_test.sh below.
# NOTE(review): these are intentionally not marked readonly; the sourced script
# appears to assign the same variable names from its positional arguments.
VM_NAME="pytorch2-dino-7d-a100-gpu-hns-bucket"
ZONE_NAME="us-central1-f"
ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v2_hns/dino"
TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh"
PYTORCH_VERSION="v2"
BUCKET_TYPE="hns"

cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"

# Quote every argument so each value reaches the sourced script as exactly one
# positional parameter, even if it is later changed to contain spaces or globs.
source run_and_manage_test.sh \
  "${VM_NAME}" \
  "${ZONE_NAME}" \
  "${ARTIFACTS_BUCKET_PATH}" \
  "${TEST_SCRIPT_PATH}" \
  "${PYTORCH_VERSION}" \
  "${BUCKET_TYPE}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Build script for this job: the PyTorch v2 DINO test on an HNS bucket.
build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2_hns/dino/build.sh"

# 1 hour timeout (value is expressed in minutes).
timeout_mins: 60
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ ARTIFACTS_BUCKET_PATH=$3
TEST_SCRIPT_PATH=$4
# pytorch version
PYTORCH_VERSION=$5
BUCKET_TYPE=$6
RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus"

function initialize_ssh_key () {
Expand Down Expand Up @@ -65,6 +66,11 @@ function delete_existing_vm_and_create_new () {
RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-pytorch2-2gpu"
fi

# HNS-bucket runs use a dedicated reservation. The operand is quoted because
# callers that pass no bucket type leave BUCKET_TYPE empty, and an unquoted
# empty operand makes `[` fail with "unary operator expected".
if [ "${BUCKET_TYPE}" = "hns" ]; then
  RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-hns-bucket"
fi

echo "Creating VM $VM_NAME in zone $ZONE_NAME"
# The below command creates VM using the reservation 'ai-ml-tests'
sudo gcloud compute instances create $VM_NAME \
Expand Down Expand Up @@ -155,7 +161,7 @@ then
echo "Clone the gcsfuse repo on test VM"
sudo gcloud compute ssh $VM_NAME --zone $ZONE_NAME --internal-ip --command "mkdir github; cd github; git clone https://github.com/GoogleCloudPlatform/gcsfuse.git; cd gcsfuse; git checkout master;"
echo "Trigger the build script on test VM"
sudo gcloud compute ssh $VM_NAME --zone $ZONE_NAME --internal-ip --command "bash \$HOME/$TEST_SCRIPT_PATH 1> \$HOME/build.out 2> \$HOME/build.err &"
sudo gcloud compute ssh $VM_NAME --zone $ZONE_NAME --internal-ip --command "bash \$HOME/$TEST_SCRIPT_PATH $BUCKET_TYPE 1> \$HOME/build.out 2> \$HOME/build.err &"
echo "Wait for 15 minutes for test VM to setup for test and to change the status from START to RUNNING."
sleep 900s

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,3 @@ TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/tf/resnet/setup_ho
cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"

source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH

6 changes: 4 additions & 2 deletions perfmetrics/scripts/ml_tests/pytorch/run_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@
# limitations under the License.
set -e
# pytorch version (e.g. v1_12, v2)
PYTORCH_VESRION=$1
PYTORCH_VERSION=$1
BUCKET_TYPE=$2

cd "$HOME/github/gcsfuse"
echo "Building docker image containing all pytorch libraries..."
sudo docker build . -f perfmetrics/scripts/ml_tests/pytorch/${PYTORCH_VESRION}/dino/Dockerfile --tag pytorch-gcsfuse
sudo docker build . -f perfmetrics/scripts/ml_tests/pytorch/${PYTORCH_VERSION}/dino/Dockerfile --tag pytorch-gcsfuse --build-arg PYTORCH_VERSION="${PYTORCH_VERSION}" --build-arg BUCKET_TYPE="${BUCKET_TYPE}"

mkdir -p container_artifacts

Expand Down
22 changes: 17 additions & 5 deletions perfmetrics/scripts/ml_tests/pytorch/run_model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

PYTORCH_VESRION=$1
PYTORCH_VERSION=$1
BUCKET_TYPE=$2

NUM_EPOCHS=80
TEST_BUCKET="gcsfuse-ml-data"

Expand All @@ -29,10 +31,10 @@ CGO_ENABLED=0 go build .
cd -

# Create a directory for gcsfuse logs
mkdir run_artifacts/gcsfuse_logs
mkdir run_artifacts/gcsfuse_logs

# We have created a bucket in the asia-northeast1 region to align with the location of our PyTorch 2.0 VM, which is also in asia-northeast1.
if [ ${PYTORCH_VESRION} == "v2" ];
if [ ${PYTORCH_VERSION} == "v2" ];
then
TEST_BUCKET="gcsfuse-ml-data-asia-northeast1"
fi
Expand All @@ -51,6 +53,16 @@ metadata-cache:
ttl-secs: 1728000
stat-cache-max-size-mb: 3200
EOF

# Name of the artifacts sub-directory: the PyTorch version, with a bucket-type
# suffix appended for HNS runs (e.g. "v2_hns").
DIR="${PYTORCH_VERSION}"
# Enable the enable-hns flag to run tests on the folder APIs with an HNS bucket.
# Expansions are quoted so that an empty BUCKET_TYPE does not make `[` fail with
# "unary operator expected", and so the config path is used as a single word.
if [ "${BUCKET_TYPE}" = "hns" ]; then
  TEST_BUCKET="gcsfuse-ml-data-hns-central1"
  echo "enable-hns: true" >> "${config_filename}"
  DIR="${DIR}_${BUCKET_TYPE}"
fi

echo "Created config-file at "$config_filename

echo "Mounting GCSFuse..."
Expand Down Expand Up @@ -87,7 +99,7 @@ python -c 'import torch;torch.hub.list("facebookresearch/xcit:main")'
# (TulsiShah) TODO: Pytorch 2.0 compile mode has issues (https://github.com/pytorch/pytorch/issues/94599),
# which is fixed in pytorch version 2.1.0 (https://github.com/pytorch/pytorch/pull/100071).
# We'll remove this workaround once we update our Docker image to use Pytorch 2.1.0 or greater version.
if [ ${PYTORCH_VESRION} == "v2" ];
if [ ${PYTORCH_VERSION} == "v2" ];
then
allowed_functions_file="/opt/conda/lib/python3.10/site-packages/torch/_dynamo/allowed_functions.py"
# Update the pytorch library code to bypass the kernel-cache
Expand Down Expand Up @@ -184,7 +196,7 @@ dynamo_unsupported_distributed_c10d_ops = [
]" >> $distributed_c10d_file
fi

ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/${PYTORCH_VESRION}/dino"
ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/${DIR}/dino"
echo "Update status file"
echo "RUNNING" > status.txt
gsutil cp status.txt $ARTIFACTS_BUCKET_PATH/
Expand Down
7 changes: 5 additions & 2 deletions perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./
RUN mkdir -p "run_artifacts"
RUN mkdir -p "gcsfuse_data"

ENV PYTORCH_VERSION="v1_12"
ARG PYTORCH_VERSION
ARG BUCKET_TYPE
ENV PYTORCH_VERSION=${PYTORCH_VERSION}
ENV BUCKET_TYPE=${BUCKET_TYPE}

ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"]
ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION} ${BUCKET_TYPE}"]
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@ DRIVER_VERSION="450.172.01"
source ml_tests/setup_host.sh $DRIVER_VERSION

PYTORCH_VERSION="v1_12"
source ml_tests/pytorch/run_container.sh $PYTORCH_VERSION
BUCKET_TYPE=$1
source ml_tests/pytorch/run_container.sh $PYTORCH_VERSION $BUCKET_TYPE
8 changes: 6 additions & 2 deletions perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./
RUN mkdir -p "run_artifacts"
RUN mkdir -p "gcsfuse_data"

ENV PYTORCH_VERSION="v2"
ARG PYTORCH_VERSION
ARG BUCKET_TYPE
ENV PYTORCH_VERSION=${PYTORCH_VERSION}
ENV BUCKET_TYPE=${BUCKET_TYPE}

ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"]
RUN echo ${BUCKET_TYPE}
ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION} ${BUCKET_TYPE}"]
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,8 @@ echo "Setting up the machine with Docker and Nvidia Driver"
DRIVER_VERSION="520.61.05"
source ml_tests/setup_host.sh $DRIVER_VERSION


PYTORCH_VERSION="v2"
source ml_tests/pytorch/run_container.sh $PYTORCH_VERSION
BUCKET_TYPE=$1

source ml_tests/pytorch/run_container.sh $PYTORCH_VERSION $BUCKET_TYPE

0 comments on commit 7aba348

Please sign in to comment.