diff --git a/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet/build.sh index b98a908d7e..ec68b28a85 100755 --- a/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet/build.sh +++ b/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet/build.sh @@ -20,7 +20,8 @@ VM_NAME="tf-resnet-7d" ZONE_NAME="us-west1-b" ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/tf/resnet" TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/tf/resnet/setup_host_and_run_model.sh" +BUCKET_TYPE="non-hns" cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/" -source run_and_manage_test.sh $VM_NAME $ZONE_NAME $ARTIFACTS_BUCKET_PATH $TEST_SCRIPT_PATH +source run_and_manage_test.sh "$VM_NAME" "$ZONE_NAME" "$ARTIFACTS_BUCKET_PATH" "$TEST_SCRIPT_PATH" "" "$BUCKET_TYPE" diff --git a/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet_hns/build.sh b/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet_hns/build.sh new file mode 100755 index 0000000000..9871e70529 --- /dev/null +++ b/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet_hns/build.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This will stop execution when any command will have non-zero status. 
+set -e + +VM_NAME="tf-resnet-7d-a100-gpu-hns-bucket" +ZONE_NAME="us-central1-f" +ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/tf/resnet_hns" +TEST_SCRIPT_PATH="github/gcsfuse/perfmetrics/scripts/ml_tests/tf/resnet/setup_host_and_run_model.sh" +BUCKET_TYPE="hns" + +cd "${KOKORO_ARTIFACTS_DIR}/github/gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/" + +source run_and_manage_test.sh "$VM_NAME" "$ZONE_NAME" "$ARTIFACTS_BUCKET_PATH" "$TEST_SCRIPT_PATH" "" "$BUCKET_TYPE" diff --git a/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet_hns/continuous.cfg b/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet_hns/continuous.cfg new file mode 100644 index 0000000000..39ba104f27 --- /dev/null +++ b/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet_hns/continuous.cfg @@ -0,0 +1,19 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Config file for kokoro test +build_file: "gcsfuse/perfmetrics/scripts/continuous_test/ml_tests/tf/resnet_hns/build.sh" + +# 1 hour timeout. +timeout_mins: 60 diff --git a/perfmetrics/scripts/ml_tests/tf/resnet/Dockerfile b/perfmetrics/scripts/ml_tests/tf/resnet/Dockerfile index d7f4b9edcd..f479bc56cb 100644 --- a/perfmetrics/scripts/ml_tests/tf/resnet/Dockerfile +++ b/perfmetrics/scripts/ml_tests/tf/resnet/Dockerfile @@ -31,5 +31,8 @@ WORKDIR "/tf_test/" COPY ./perfmetrics/scripts/ml_tests/tf/resnet/setup_scripts/setup_container.sh . 
COPY ./perfmetrics/scripts/ml_tests/tf/resnet/setup_scripts/resnet_runner.py . +ARG BUCKET_TYPE +ENV BUCKET_TYPE=${BUCKET_TYPE} + RUN mkdir -p "myBucket" -ENTRYPOINT ["/bin/bash", "-c", "./setup_container.sh"] +ENTRYPOINT ["/bin/bash", "-c", "./setup_container.sh \"${BUCKET_TYPE}\""] diff --git a/perfmetrics/scripts/ml_tests/tf/resnet/setup_host_and_run_model.sh b/perfmetrics/scripts/ml_tests/tf/resnet/setup_host_and_run_model.sh index 3bfaf8634a..35431b5495 100755 --- a/perfmetrics/scripts/ml_tests/tf/resnet/setup_host_and_run_model.sh +++ b/perfmetrics/scripts/ml_tests/tf/resnet/setup_host_and_run_model.sh @@ -16,6 +16,7 @@ # This will stop execution when any command will have non-zero status. set -e +BUCKET_TYPE=$1 cd "$HOME/github/gcsfuse/perfmetrics/scripts" echo "Setting up the machine with Docker and Nvidia Driver..." @@ -26,7 +27,7 @@ cd "$HOME/github/gcsfuse/" mkdir container_artifacts && mkdir container_artifacts/logs && mkdir container_artifacts/output echo "Building tf DLC docker image containing all tensorflow libraries..." -sudo docker build . -f perfmetrics/scripts/ml_tests/tf/resnet/Dockerfile -t tf-dlc-gcsfuse --build-arg DLC_IMAGE_NAME=tf-gpu.2-13 +sudo docker build . -f perfmetrics/scripts/ml_tests/tf/resnet/Dockerfile -t tf-dlc-gcsfuse --build-arg DLC_IMAGE_NAME=tf-gpu.2-13 --build-arg BUCKET_TYPE="${BUCKET_TYPE}" echo "Running the docker image build in the previous step..." 
sudo docker run --gpus all --name tf_model_container --privileged -d \ diff --git a/perfmetrics/scripts/ml_tests/tf/resnet/setup_scripts/setup_container.sh b/perfmetrics/scripts/ml_tests/tf/resnet/setup_scripts/setup_container.sh index 5c3d52d327..089ebe6316 100755 --- a/perfmetrics/scripts/ml_tests/tf/resnet/setup_scripts/setup_container.sh +++ b/perfmetrics/scripts/ml_tests/tf/resnet/setup_scripts/setup_container.sh @@ -18,6 +18,7 @@ # and epochs functionality, and runs the model # Install go lang +BUCKET_TYPE=$1 wget -O go_tar.tar.gz https://go.dev/dl/go1.23.0.linux-amd64.tar.gz -q sudo rm -rf /usr/local/go && tar -xzf go_tar.tar.gz && sudo mv go /usr/local export PATH=$PATH:/usr/local/go/bin @@ -39,11 +40,22 @@ echo "logging: backup-file-count: 3 compress: true " > /tmp/gcsfuse_config.yaml + +TEST_BUCKET="gcsfuse-ml-tf-data" +DIR="resnet" +# For an "hns" run: mount the HNS bucket, set enable-hns in the gcsfuse config to exercise the folder APIs, and upload artifacts under resnet_hns; any other (or empty) BUCKET_TYPE keeps the defaults above. +if [[ "${BUCKET_TYPE}" == "hns" ]]; +then + TEST_BUCKET="gcsfuse-ml-data-hns-central1" + echo "enable-hns: true" >> /tmp/gcsfuse_config.yaml + DIR=${DIR}_${BUCKET_TYPE} +fi + nohup gcsfuse/gcsfuse --foreground \ --implicit-dirs \ --stackdriver-export-interval 60s \ --config-file /tmp/gcsfuse_config.yaml \ - gcsfuse-ml-tf-data myBucket > /home/output/gcsfuse.out 2> /home/output/gcsfuse.err & + "${TEST_BUCKET}" myBucket > /home/output/gcsfuse.out 2> /home/output/gcsfuse.err & # Install tensorflow model garden library pip3 install --user tf-models-official==2.13.2 @@ -190,7 +202,7 @@ sed -i "$lines"'d' $train_lib_file x=$((x-1)) sed -i "$x"'r bypassed_code.py' $train_lib_file -ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/tf/resnet" +ARTIFACTS_BUCKET_PATH="gs://gcsfuse-ml-tests-logs/ci_artifacts/tf/${DIR}" echo "Update status file" echo "RUNNING" > status.txt gsutil cp status.txt $ARTIFACTS_BUCKET_PATH/