# Patch summary (preamble text placed before the first "diff --git" header).
#
# Touches four files under perfmetrics/scripts/testing_on_gke/examples/:
#   * dlio/unet3d-loading-test/templates/dlio-tester.yaml
#       - container resource limits/requests are no longer hard-coded
#         (cpu "100"/memory 400Gi and cpu "50"/memory 300Gi); they are read from
#         .Values.resourceLimits.* and .Values.resourceRequests.*.
#       - when .Values.scenario is "gcsfuse-generic", the script echoes
#         .Values.gcsfuse.mountOptions into a "gcsfuse_mount_options" file inside
#         the workload output folder, before the "gsutil -m cp -R /logs ..." upload.
#       - the data-vol volume gains a "gcsfuse-generic" branch: a csi volume using
#         driver gcsfuse.csi.storage.gke.io with volumeAttributes bucketName and
#         mountOptions taken from .Values.
#   * dlio/unet3d-loading-test/values.yaml
#       - documents the scenario values; adds resourceLimits/resourceRequests
#         (cpu 0, memory 0) and gcsfuse.mountOptions ("implicit-dirs").
#   * fio/loading-test/templates/fio-tester.yaml
#       - resource requests (previously cpu "30"/memory 300Gi) come from
#         .Values.resourceRequests.*; num_of_threads (previously 50) comes from
#         .Values.fio.numThreads.
#       - same "gcsfuse-generic" additions as the dlio template; the mount options
#         are written under /data/fio-output/<scenario>/$read_type/.
#   * fio/loading-test/values.yaml
#       - resourceLimits change from cpu 100/memory 400Gi to 0/0; adds
#         resourceRequests (0/0), fio.numThreads ("50") and gcsfuse.mountOptions.
#
# NOTE(review): in this copy the hunk headers ("@@ ... @@") appear mid-line, i.e. the
#   original line breaks have been collapsed; the hunks must be re-split into their
#   original lines before the patch can be applied.
# NOTE(review): the 0 defaults for cpu/memory are presumably placeholders that the
#   caller overrides at install time - confirm against the calling scripts.
# NOTE(review): mountOptions is interpolated inside double quotes in the shell "echo";
#   values containing ", $ or backticks would presumably be interpreted by the
#   shell - confirm this is acceptable for the expected inputs.
diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml index 8b510627f5..7a117da56b 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml @@ -32,11 +32,11 @@ spec: image: {{ .Values.image }} resources: limits: - cpu: "100" - memory: 400Gi + cpu: {{ .Values.resourceLimits.cpu }} + memory: {{ .Values.resourceLimits.memory }} requests: - cpu: "50" - memory: 300Gi + cpu: {{ .Values.resourceRequests.cpu }} + memory: {{ .Values.resourceRequests.memory }} env: - name: RDMAV_FORK_SAFE value: "1" @@ -86,6 +86,11 @@ spec: ++workload.reader.read_threads={{ .Values.dlio.readThreads }} \ ++workload.output.folder=/logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }} + # dump the gcsfuse-mount-configuration to a file in output-directory. 
+ {{ if eq .Values.scenario "gcsfuse-generic"}} + echo "{{ .Values.gcsfuse.mountOptions }}" > /logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}/gcsfuse_mount_options + {{ end }} + gsutil -m cp -R /logs gs://{{ .Values.bucketName }}/logs/$(date +"%Y-%m-%d-%H-%M") volumeMounts: - name: dshm @@ -103,6 +108,12 @@ spec: - name: data-vol {{- if eq .Values.scenario "local-ssd" }} emptyDir: {} + {{- else if eq .Values.scenario "gcsfuse-generic" }} + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.bucketName }} + mountOptions: "{{ .Values.gcsfuse.mountOptions }}" {{- else if eq .Values.scenario "gcsfuse-file-cache" }} csi: driver: gcsfuse.csi.storage.gke.io diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml index e11d433755..cbaa9d0137 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml @@ -19,9 +19,17 @@ image: jiaxun/dlio:v1.2.0 bucketName: gke-dlio-test-data +# scenario controls the kind of storage that is used for the load testing. local-ssd means directly on LSSD; gcsfuse-generic means on a gcsfuse mount with gcsfuse.mountOptions sent from the caller; gcsfuse-no-file-cache and gcsfuse-file-cache mean as their name suggests. 
scenario: local-ssd nodeType: n2-standard-96 +resourceLimits: + cpu: 0 + memory: 0 +resourceRequests: + cpu: 0 + memory: 0 + dlio: numFilesTrain: 500000 recordLength: 102400 @@ -34,3 +42,4 @@ gcsfuse: metadataTypeCacheCapacity: "-1" fileCacheCapacity: "-1" fileCacheForRangeRead: "true" + mountOptions: "implicit-dirs" diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml index b244cfc9ef..368fba13af 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml @@ -36,8 +36,8 @@ spec: cpu: {{ .Values.resourceLimits.cpu }} memory: {{ .Values.resourceLimits.memory }} requests: - cpu: "30" - memory: 300Gi + cpu: {{ .Values.resourceRequests.cpu }} + memory: {{ .Values.resourceRequests.memory }} command: - "/bin/sh" - "-c" @@ -110,7 +110,7 @@ spec: pause_in_seconds=20 block_size={{ .Values.fio.blockSize }} file_size={{ .Values.fio.fileSize }} - num_of_threads=50 + num_of_threads={{ .Values.fio.numThreads }} workload_dir=/data # Cleaning the pagecache, dentries and inode cache before the starting the workload. @@ -126,6 +126,12 @@ spec: echo "Run fio tests..." mkdir -p /data/fio-output/{{ .Values.scenario }}/$read_type + + # dump the gcsfuse-mount-configuration to a file in output-directory. 
+ {{ if eq .Values.scenario "gcsfuse-generic" }} + echo "{{ .Values.gcsfuse.mountOptions }}" > /data/fio-output/{{ .Values.scenario }}/$read_type/gcsfuse_mount_options + {{ end }} + for i in $(seq $epoch); do echo "[Epoch ${i}] start time:" `date +%s` @@ -164,6 +170,12 @@ spec: - name: data-vol {{- if eq .Values.scenario "local-ssd" }} emptyDir: {} + {{- else if eq .Values.scenario "gcsfuse-generic" }} + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.bucketName }} + mountOptions: "{{ .Values.gcsfuse.mountOptions }}" {{- else if eq .Values.scenario "gcsfuse-file-cache" }} csi: driver: gcsfuse.csi.storage.gke.io diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml index bf4678ce79..15111a740d 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml @@ -19,18 +19,23 @@ image: ubuntu:24.04 bucketName: gke-dlio-test-data +# scenario controls the kind of storage that is used for the load testing. local-ssd means directly on LSSD; gcsfuse-generic means on a gcsfuse mount with gcsfuse.mountOptions sent from the caller; gcsfuse-no-file-cache and gcsfuse-file-cache mean as their name suggests. scenario: local-ssd nodeType: n2-standard-96 resourceLimits: - cpu: 100 - memory: 400Gi + cpu: 0 + memory: 0 +resourceRequests: + cpu: 0 + memory: 0 fio: readType: read fileSize: 64K blockSize: 64K filesPerThread: "20000" + numThreads: "50" gcsfuse: metadataCacheTTLSeconds: "6048000" @@ -38,4 +43,5 @@ gcsfuse: metadataTypeCacheCapacity: "-1" fileCacheCapacity: "-1" fileCacheForRangeRead: "true" + mountOptions: "implicit-dirs"