From 78bb93116c3c306838d937187067b794e4fb7aba Mon Sep 17 00:00:00 2001 From: Nitin Garg Date: Fri, 2 Aug 2024 12:29:38 +0000 Subject: [PATCH 1/4] Support gcsfuse-generic and any machine-type in pods --- .../templates/dlio-tester.yaml | 19 +++++++++++++++---- .../dlio/unet3d-loading-test/values.yaml | 8 ++++++++ .../loading-test/templates/fio-tester.yaml | 18 +++++++++++++++--- .../examples/fio/loading-test/values.yaml | 9 +++++++-- 4 files changed, 45 insertions(+), 9 deletions(-) diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml index 8b510627f5..5d5db9f132 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml @@ -32,11 +32,11 @@ spec: image: {{ .Values.image }} resources: limits: - cpu: "100" - memory: 400Gi + cpu: {{ .Values.resourceLimits.cpu }} + memory: {{ .Values.resourceLimits.memory }} requests: - cpu: "50" - memory: 300Gi + cpu: {{ .Values.resourceRequests.cpu }} + memory: {{ .Values.resourceRequests.memory }} env: - name: RDMAV_FORK_SAFE value: "1" @@ -86,6 +86,10 @@ spec: ++workload.reader.read_threads={{ .Values.dlio.readThreads }} \ ++workload.output.folder=/logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }} + {{ if eq .Values.scenario "gcsfuse-generic"}} + echo "{{ .Values.gcsfuse.mountOptions }}" > /logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}/gcsfuse_mount_options + {{ end }} + gsutil -m cp -R /logs gs://{{ .Values.bucketName }}/logs/$(date +"%Y-%m-%d-%H-%M") volumeMounts: - name: dshm @@ -103,6 +107,13 @@ spec: - name: data-vol {{- if eq .Values.scenario "local-ssd" }} emptyDir: {} + {{- else 
if eq .Values.scenario "gcsfuse-generic" }} + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.bucketName }} + gcsfuseLoggingSeverity: "info" + mountOptions: "{{ .Values.gcsfuse.mountOptions }}" {{- else if eq .Values.scenario "gcsfuse-file-cache" }} csi: driver: gcsfuse.csi.storage.gke.io diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml index e11d433755..c96e4ba9f0 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml @@ -22,6 +22,13 @@ bucketName: gke-dlio-test-data scenario: local-ssd nodeType: n2-standard-96 +resourceLimits: + cpu: 0 + memory: 0 +resourceRequests: + cpu: 0 + memory: 0 + dlio: numFilesTrain: 500000 recordLength: 102400 @@ -34,3 +41,4 @@ gcsfuse: metadataTypeCacheCapacity: "-1" fileCacheCapacity: "-1" fileCacheForRangeRead: "true" + mountOptions: "implicit-dirs" diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml index b244cfc9ef..951f3da885 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml @@ -36,8 +36,8 @@ spec: cpu: {{ .Values.resourceLimits.cpu }} memory: {{ .Values.resourceLimits.memory }} requests: - cpu: "30" - memory: 300Gi + cpu: {{ .Values.resourceRequests.cpu }} + memory: {{ .Values.resourceRequests.memory }} command: - "/bin/sh" - "-c" @@ -110,7 +110,7 @@ spec: pause_in_seconds=20 block_size={{ .Values.fio.blockSize }} file_size={{ .Values.fio.fileSize }} - num_of_threads=50 + num_of_threads={{ .Values.fio.numThreads }} workload_dir=/data # Cleaning the pagecache, dentries 
and inode cache before the starting the workload. @@ -126,6 +126,11 @@ spec: echo "Run fio tests..." mkdir -p /data/fio-output/{{ .Values.scenario }}/$read_type + + {{ if eq .Values.scenario "gcsfuse-generic" }} + echo "{{ .Values.gcsfuse.mountOptions }}" > /data/fio-output/{{ .Values.scenario }}/$read_type/gcsfuse_mount_options + {{ end }} + for i in $(seq $epoch); do echo "[Epoch ${i}] start time:" `date +%s` @@ -164,6 +169,13 @@ spec: - name: data-vol {{- if eq .Values.scenario "local-ssd" }} emptyDir: {} + {{- else if eq .Values.scenario "gcsfuse-generic" }} + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.bucketName }} + # gcsfuseLoggingSeverity: "info" + mountOptions: "{{ .Values.gcsfuse.mountOptions }}" {{- else if eq .Values.scenario "gcsfuse-file-cache" }} csi: driver: gcsfuse.csi.storage.gke.io diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml index bf4678ce79..ad8a969cfa 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml @@ -23,14 +23,18 @@ scenario: local-ssd nodeType: n2-standard-96 resourceLimits: - cpu: 100 - memory: 400Gi + cpu: 0 + memory: 0 +resourceRequests: + cpu: 0 + memory: 0 fio: readType: read fileSize: 64K blockSize: 64K filesPerThread: "20000" + numThreads: "50" gcsfuse: metadataCacheTTLSeconds: "6048000" @@ -38,4 +42,5 @@ gcsfuse: metadataTypeCacheCapacity: "-1" fileCacheCapacity: "-1" fileCacheForRangeRead: "true" + mountOptions: "implicit-dirs" From 5c374911cf1132b0ce578e46bb9503d516ba6605 Mon Sep 17 00:00:00 2001 From: Nitin Garg Date: Wed, 7 Aug 2024 15:51:25 +0000 Subject: [PATCH 2/4] address self-review comments --- .../dlio/unet3d-loading-test/templates/dlio-tester.yaml | 1 + .../examples/fio/loading-test/templates/fio-tester.yaml | 3 ++- 2 files changed, 3 insertions(+), 
1 deletion(-) diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml index 5d5db9f132..834fa99f01 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml @@ -86,6 +86,7 @@ spec: ++workload.reader.read_threads={{ .Values.dlio.readThreads }} \ ++workload.output.folder=/logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }} + # dump the gcsfuse-mount-configuration to a file in output-directory. {{ if eq .Values.scenario "gcsfuse-generic"}} echo "{{ .Values.gcsfuse.mountOptions }}" > /logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}/gcsfuse_mount_options {{ end }} diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml index 951f3da885..90a1b6c486 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml @@ -127,6 +127,7 @@ spec: echo "Run fio tests..." mkdir -p /data/fio-output/{{ .Values.scenario }}/$read_type + # dump the gcsfuse-mount-configuration to a file in output-directory. 
{{ if eq .Values.scenario "gcsfuse-generic" }} echo "{{ .Values.gcsfuse.mountOptions }}" > /data/fio-output/{{ .Values.scenario }}/$read_type/gcsfuse_mount_options {{ end }} @@ -174,7 +175,7 @@ spec: driver: gcsfuse.csi.storage.gke.io volumeAttributes: bucketName: {{ .Values.bucketName }} - # gcsfuseLoggingSeverity: "info" + gcsfuseLoggingSeverity: "info" mountOptions: "{{ .Values.gcsfuse.mountOptions }}" {{- else if eq .Values.scenario "gcsfuse-file-cache" }} csi: From a70bbb7bf2c5380274f2dec0d2d803beda7935c1 Mon Sep 17 00:00:00 2001 From: Nitin Garg Date: Fri, 9 Aug 2024 15:19:04 +0000 Subject: [PATCH 3/4] remove explicit gcsfuse-log-severity from pod yaml --- .../examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml | 1 - .../examples/fio/loading-test/templates/fio-tester.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml index 834fa99f01..7a117da56b 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml @@ -113,7 +113,6 @@ spec: driver: gcsfuse.csi.storage.gke.io volumeAttributes: bucketName: {{ .Values.bucketName }} - gcsfuseLoggingSeverity: "info" mountOptions: "{{ .Values.gcsfuse.mountOptions }}" {{- else if eq .Values.scenario "gcsfuse-file-cache" }} csi: diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml index 90a1b6c486..368fba13af 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml @@ -175,7 +175,6 @@ spec: driver: 
gcsfuse.csi.storage.gke.io volumeAttributes: bucketName: {{ .Values.bucketName }} - gcsfuseLoggingSeverity: "info" mountOptions: "{{ .Values.gcsfuse.mountOptions }}" {{- else if eq .Values.scenario "gcsfuse-file-cache" }} csi: From c29a1a00b6d0e3428fff5cebafaa18a97bdc7d4b Mon Sep 17 00:00:00 2001 From: Nitin Garg Date: Wed, 14 Aug 2024 11:38:23 +0000 Subject: [PATCH 4/4] add comments for scenario in helm values for pod configs --- .../testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml | 1 + .../scripts/testing_on_gke/examples/fio/loading-test/values.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml index c96e4ba9f0..cbaa9d0137 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml @@ -19,6 +19,7 @@ image: jiaxun/dlio:v1.2.0 bucketName: gke-dlio-test-data +# scenario controls the kind of storage that is used for the load testing. local-ssd means directly on a local SSD (LSSD); gcsfuse-generic means on a gcsfuse mount with gcsfuse.mountOptions supplied by the caller; gcsfuse-no-file-cache and gcsfuse-file-cache mean what their names suggest. scenario: local-ssd nodeType: n2-standard-96 diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml index ad8a969cfa..15111a740d 100644 --- a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml +++ b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml @@ -19,6 +19,7 @@ image: ubuntu:24.04 bucketName: gke-dlio-test-data +# scenario controls the kind of storage that is used for the load testing.
local-ssd means directly on a local SSD (LSSD); gcsfuse-generic means on a gcsfuse mount with gcsfuse.mountOptions supplied by the caller; gcsfuse-no-file-cache and gcsfuse-file-cache mean what their names suggest. scenario: local-ssd nodeType: n2-standard-96