Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pytorch 2.0 AI-ML tests on HNS bucket #2375

Merged
merged 21 commits into from
Aug 27, 2024
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ ZONE_NAME="us-west1-b"
ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v1_12/dino"
TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v1_12/dino/setup_host_and_run_container.sh"
PYTORCH_VERSION="v1_12"
BUCKET_TYPE="non-hns"

cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"

source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION
source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION $BUCKET_TYPE
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ ZONE_NAME="asia-northeast1-a"
ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v2/dino"
TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v2/dino/setup_host_and_run_container.sh"
PYTORCH_VERSION="v2"
BUCKET_TYPE="non-hns"

cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"

source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION
source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH $PYTORCH_VERSION $BUCKET_TYPE
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Abort as soon as any command exits with a non-zero status.
set -e

# Parameters for the PyTorch 2.0 DINO run against an HNS bucket. They are
# handed to run_and_manage_test.sh below as positional arguments, in the
# order listed here.
# They are intentionally not marked readonly: run_and_manage_test.sh is
# sourced into this same shell and assigns names such as PYTORCH_VERSION and
# BUCKET_TYPE from its own positional parameters.
VM_NAME="pytorch2-dino-7d-a100-gpu-hns-bucket"
ZONE_NAME="us-central1-f"
ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/v2_hns/dino"
TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/pytorch/v2_hns/dino/setup_host_and_run_container.sh"
PYTORCH_VERSION="v2"
BUCKET_TYPE="hns"

# Stop with a clear message when KOKORO_ARTIFACTS_DIR is unset or empty,
# rather than attempting to cd into "/github/gcsfuse/...".
cd "${KOKORO_ARTIFACTS_DIR:?KOKORO_ARTIFACTS_DIR must be set}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"

# Every argument is quoted so that each value reaches the sourced script as
# exactly one positional parameter, with no word splitting or globbing.
source run_and_manage_test.sh \
  "$VM_NAME" \
  "$ZONE_NAME" \
  "$ARTIFACTS_BUCKET_PATH" \
  "$TEST_SCRIPT_PATH" \
  "$PYTORCH_VERSION" \
  "$BUCKET_TYPE"
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Path to the build.sh of the PyTorch v2 HNS-bucket DINO test.
# NOTE(review): presumably this is the script Kokoro executes for the job and
# the path is resolved relative to the checked-out sources — confirm.
build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/pytorch/v2_hns/dino/build.sh"

# 1 hour timeout (the value is given in minutes).
timeout_mins: 60
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ ARTIFACTS_BUCKET_PATH=$3
TEST_SCRIPT_PATH=$4
# pytorch version
PYTORCH_VERSION=$5
BUCKET_TYPE=$6
RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-2gpus"

function initialize_ssh_key () {
Expand Down Expand Up @@ -65,6 +66,11 @@ function delete_existing_vm_and_create_new () {
RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-pytorch2-2gpu"
fi

# HNS-bucket runs use their own reservation.
# BUCKET_TYPE is taken from the sixth positional argument, so it is empty for
# callers that pass fewer arguments. The expansion is quoted (with an empty
# default) so the test stays well-formed in that case instead of failing with
# "[: ==: unary operator expected". POSIX "=" is used in place of "==".
if [ "${BUCKET_TYPE:-}" = "hns" ]; then
  RESERVATION="projects/$GCP_PROJECT/reservations/ai-ml-tests-hns-bucket"
fi

echo "Creating VM $VM_NAME in zone $ZONE_NAME"
# The below command creates VM using the reservation 'ai-ml-tests'
sudo gcloud compute instances create $VM_NAME \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,3 @@ TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/tf/resnet/setup_ho
cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/"

source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH

5 changes: 3 additions & 2 deletions perfmetrics/scripts/ml_tests/pytorch/run_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@
# limitations under the License.
set -e
# pytorch version (e.g. v1_12, v2)
PYTORCH_VESRION=$1
DIR=$1

cd "$HOME/github/gcsfuse"
echo "Building docker image containing all pytorch libraries..."
sudo docker build . -f perfmetrics/scripts/ml_tests/pytorch/${PYTORCH_VESRION}/dino/Dockerfile --tag pytorch-gcsfuse
sudo docker build . -f perfmetrics/scripts/ml_tests/pytorch/${DIR}/dino/Dockerfile --tag pytorch-gcsfuse

mkdir -p container_artifacts

Expand Down
21 changes: 16 additions & 5 deletions perfmetrics/scripts/ml_tests/pytorch/run_model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

PYTORCH_VESRION=$1
PYTORCH_VERSION=$1
BUCKET_TYPE=$2
NUM_EPOCHS=80
TEST_BUCKET="gcsfuse-ml-data"

Expand All @@ -29,10 +30,10 @@ CGO_ENABLED=0 go build .
cd -

# Create a directory for gcsfuse logs
mkdir run_artifacts/gcsfuse_logs
mkdir run_artifacts/gcsfuse_logs

# We have created a bucket in the asia-northeast1 region to align with the location of our PyTorch 2.0 VM, which is also in asia-northeast1.
if [ ${PYTORCH_VESRION} == "v2" ];
if [ ${PYTORCH_VERSION} == "v2" ];
then
TEST_BUCKET="gcsfuse-ml-data-asia-northeast1"
fi
Expand All @@ -51,6 +52,16 @@ metadata-cache:
ttl-secs: 1728000
stat-cache-max-size-mb: 3200
EOF

# DIR is the sub-directory used later in the artifacts bucket path. It
# defaults to the PyTorch version and gains a "_<bucket type>" suffix for HNS
# runs (for example "v2_hns").
DIR="${PYTORCH_VERSION}"
# Enable the enable-hns flag to run tests on the folder APIs with an HNS bucket.
# BUCKET_TYPE is the second positional argument; it is quoted with an empty
# default so that a missing argument compares as "not hns" instead of making
# the test malformed ("[: ==: unary operator expected").
if [ "${BUCKET_TYPE:-}" = "hns" ]; then
  TEST_BUCKET="gcsfuse-ml-data-hns-central1"
  # Append the flag to the gcsfuse config file written just above.
  echo "enable-hns: true" >> "$config_filename"
  DIR="${DIR}_${BUCKET_TYPE}"
fi

echo "Created config-file at "$config_filename

echo "Mounting GCSFuse..."
Expand Down Expand Up @@ -87,7 +98,7 @@ python -c 'import torch;torch.hub.list("facebookresearch/xcit:main")'
# (TulsiShah) TODO: Pytorch 2.0 compile mode has issues (https://github.com/pytorch/pytorch/issues/94599),
# which is fixed in pytorch version 2.1.0 (https://github.com/pytorch/pytorch/pull/100071).
# We'll remove this workaround once we update our Docker image to use Pytorch 2.1.0 or greater version.
if [ ${PYTORCH_VESRION} == "v2" ];
if [ ${PYTORCH_VERSION} == "v2" ];
then
allowed_functions_file="/opt/conda/lib/python3.10/site-packages/torch/_dynamo/allowed_functions.py"
# Update the pytorch library code to bypass the kernel-cache
Expand Down Expand Up @@ -184,7 +195,7 @@ dynamo_unsupported_distributed_c10d_ops = [
]" >> $distributed_c10d_file
fi

ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/${PYTORCH_VESRION}/dino"
ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/pytorch/${DIR}/dino"
echo "Update status file"
echo "RUNNING" > status.txt
gsutil cp status.txt $ARTIFACTS_BUCKET_PATH/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,6 @@ RUN mkdir -p "run_artifacts"
RUN mkdir -p "gcsfuse_data"

ENV PYTORCH_VERSION="v1_12"
ENV BUCKET_TYPE="non-hns"

ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"]
ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION} ${BUCKET_TYPE}"]
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@ echo "Setting up the machine with Docker and Nvidia Driver"
DRIVER_VERSION="450.172.01"
source ml_tests/setup_host.sh $DRIVER_VERSION

PYTORCH_VERSION="v1_12"
source ml_tests/pytorch/run_container.sh $PYTORCH_VERSION
DIR="v1_12"
source ml_tests/pytorch/run_container.sh $DIR
3 changes: 2 additions & 1 deletion perfmetrics/scripts/ml_tests/pytorch/v2/dino/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,6 @@ RUN mkdir -p "run_artifacts"
RUN mkdir -p "gcsfuse_data"

ENV PYTORCH_VERSION="v2"
ENV BUCKET_TYPE="non-hns"

ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION}"]
ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION} ${BUCKET_TYPE}"]
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@ echo "Setting up the machine with Docker and Nvidia Driver"
DRIVER_VERSION="520.61.05"
source ml_tests/setup_host.sh $DRIVER_VERSION

PYTORCH_VERSION="v2"
source ml_tests/pytorch/run_container.sh $PYTORCH_VERSION
DIR="v2"
source ml_tests/pytorch/run_container.sh $DIR
45 changes: 45 additions & 0 deletions perfmetrics/scripts/ml_tests/pytorch/v2_hns/dino/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Base image: the Deep Learning Platform PyTorch GPU image named below
# (the tag suggests PyTorch 2.0 on Python 3.10).
# NOTE(review): this comment previously read "Image with gcsfuse installed and
# its package (.deb)", which does not match the image on the FROM line —
# confirm which one is intended.
FROM gcr.io/deeplearning-platform-release/pytorch-gpu.2-0.py310

# Allow non-root users to specify the allow_other or allow_root mount options
RUN echo "user_allow_other" > /etc/fuse.conf

# Install timm and pin setuptools to 69.5.1.
# NOTE(review): the reason for the setuptools pin is not visible here — confirm.
RUN pip3 install timm setuptools==69.5.1

WORKDIR "/pytorch_dino/"

# Fetch the DINO sources into /pytorch_dino/dino.
RUN git clone "https://github.com/facebookresearch/dino"

WORKDIR "/pytorch_dino/dino"
# Append an extra fetch refspec for remote "origin" to the clone's git config
# so that the following `git fetch origin` also retrieves the head of pull
# request 262 as origin/pr/262.
RUN echo '[remote "origin"]' >> .git/config
RUN echo ' fetch = +refs/pull/262/head:refs/remotes/origin/pr/262' >> .git/config

# Fetch, write the difference between origin/main and the pull-request head
# to diff.patch, and apply that patch to the working tree.
RUN git fetch origin
RUN git diff origin/main origin/pr/262 > diff.patch
RUN git apply diff.patch

WORKDIR "/pytorch_dino/"

# Copy the model-run script into the current WORKDIR (/pytorch_dino/); the
# source path is relative to the Docker build context.
COPY perfmetrics/scripts/ml_tests/pytorch/run_model.sh ./

# Pre-create the working directories. run_model.sh creates its gcsfuse log
# directory under run_artifacts.
# NOTE(review): gcsfuse_data is presumably the mount point — confirm.
RUN mkdir -p "run_artifacts"
RUN mkdir -p "gcsfuse_data"

# Values passed to run_model.sh as its first and second arguments.
ENV PYTORCH_VERSION="v2"
ENV BUCKET_TYPE="hns"

# The exec-form ENTRYPOINT performs no variable substitution itself; the
# command is wrapped in `bash -c` so the shell expands the two variables when
# the container starts.
ENTRYPOINT ["/bin/bash", "-c", "./run_model.sh ${PYTORCH_VERSION} ${BUCKET_TYPE}"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Abort as soon as any command exits with a non-zero status.
set -e

# The relative ml_tests/... paths below are resolved from this directory.
cd "$HOME/github/gcsfuse/perfmetrics/scripts"

echo "Setting up the machine with Docker and Nvidia Driver"
DRIVER_VERSION="520.61.05"
# Quoted so the version reaches setup_host.sh as a single positional parameter.
source ml_tests/setup_host.sh "$DRIVER_VERSION"

# Name of the directory under ml_tests/pytorch/ whose dino/Dockerfile
# run_container.sh builds. It is not marked readonly because run_container.sh
# is sourced into this shell and assigns DIR from its first argument.
DIR="v2_hns"
source ml_tests/pytorch/run_container.sh "$DIR"
Loading