Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[testing-on-gke part 1.3] Allow any generic machine-type, and passing GCSFuse configuration, workload configuration in the pod configuration #2367

Merged
merged 4 commits into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@ spec:
image: {{ .Values.image }}
resources:
limits:
cpu: "100"
memory: 400Gi
cpu: {{ .Values.resourceLimits.cpu }}
memory: {{ .Values.resourceLimits.memory }}
requests:
cpu: "50"
memory: 300Gi
cpu: {{ .Values.resourceRequests.cpu }}
memory: {{ .Values.resourceRequests.memory }}
env:
- name: RDMAV_FORK_SAFE
value: "1"
Expand Down Expand Up @@ -86,6 +86,11 @@ spec:
++workload.reader.read_threads={{ .Values.dlio.readThreads }} \
++workload.output.folder=/logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}

# Dump the gcsfuse mount options to a file in the output directory.
{{ if eq .Values.scenario "gcsfuse-generic"}}
echo "{{ .Values.gcsfuse.mountOptions }}" > /logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}/gcsfuse_mount_options
{{ end }}

gsutil -m cp -R /logs gs://{{ .Values.bucketName }}/logs/$(date +"%Y-%m-%d-%H-%M")
volumeMounts:
- name: dshm
Expand All @@ -103,6 +108,12 @@ spec:
- name: data-vol
{{- if eq .Values.scenario "local-ssd" }}
emptyDir: {}
{{- else if eq .Values.scenario "gcsfuse-generic" }}
csi:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: {{ .Values.bucketName }}
mountOptions: "{{ .Values.gcsfuse.mountOptions }}"
{{- else if eq .Values.scenario "gcsfuse-file-cache" }}
csi:
driver: gcsfuse.csi.storage.gke.io
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,17 @@

image: jiaxun/dlio:v1.2.0
bucketName: gke-dlio-test-data
# scenario selects the kind of storage used for the load test: local-ssd runs directly on LSSD; gcsfuse-generic runs on a gcsfuse mount configured with the gcsfuse.mountOptions passed in by the caller; gcsfuse-no-file-cache and gcsfuse-file-cache behave as their names suggest.
scenario: local-ssd
nodeType: n2-standard-96

resourceLimits:
cpu: 0
memory: 0
resourceRequests:
cpu: 0
memory: 0

dlio:
numFilesTrain: 500000
recordLength: 102400
Expand All @@ -34,3 +42,4 @@ gcsfuse:
metadataTypeCacheCapacity: "-1"
fileCacheCapacity: "-1"
fileCacheForRangeRead: "true"
mountOptions: "implicit-dirs"
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ spec:
cpu: {{ .Values.resourceLimits.cpu }}
memory: {{ .Values.resourceLimits.memory }}
requests:
cpu: "30"
memory: 300Gi
cpu: {{ .Values.resourceRequests.cpu }}
memory: {{ .Values.resourceRequests.memory }}
command:
- "/bin/sh"
- "-c"
Expand Down Expand Up @@ -110,7 +110,7 @@ spec:
pause_in_seconds=20
block_size={{ .Values.fio.blockSize }}
file_size={{ .Values.fio.fileSize }}
num_of_threads=50
num_of_threads={{ .Values.fio.numThreads }}
workload_dir=/data

# Cleaning the pagecache, dentries and inode cache before starting the workload.
Expand All @@ -126,6 +126,12 @@ spec:

echo "Run fio tests..."
mkdir -p /data/fio-output/{{ .Values.scenario }}/$read_type

# Dump the gcsfuse mount options to a file in the output directory.
{{ if eq .Values.scenario "gcsfuse-generic" }}
echo "{{ .Values.gcsfuse.mountOptions }}" > /data/fio-output/{{ .Values.scenario }}/$read_type/gcsfuse_mount_options
{{ end }}

for i in $(seq $epoch); do

echo "[Epoch ${i}] start time:" `date +%s`
Expand Down Expand Up @@ -164,6 +170,12 @@ spec:
- name: data-vol
{{- if eq .Values.scenario "local-ssd" }}
emptyDir: {}
{{- else if eq .Values.scenario "gcsfuse-generic" }}
csi:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: {{ .Values.bucketName }}
mountOptions: "{{ .Values.gcsfuse.mountOptions }}"
{{- else if eq .Values.scenario "gcsfuse-file-cache" }}
csi:
driver: gcsfuse.csi.storage.gke.io
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,29 @@

image: ubuntu:24.04
bucketName: gke-dlio-test-data
# scenario selects the kind of storage used for the load test: local-ssd runs directly on LSSD; gcsfuse-generic runs on a gcsfuse mount configured with the gcsfuse.mountOptions passed in by the caller; gcsfuse-no-file-cache and gcsfuse-file-cache behave as their names suggest.
scenario: local-ssd
nodeType: n2-standard-96

resourceLimits:
cpu: 100
memory: 400Gi
cpu: 0
memory: 0
resourceRequests:
cpu: 0
memory: 0

fio:
readType: read
fileSize: 64K
blockSize: 64K
filesPerThread: "20000"
numThreads: "50"

gcsfuse:
metadataCacheTTLSeconds: "6048000"
metadataStatCacheCapacity: "-1"
metadataTypeCacheCapacity: "-1"
fileCacheCapacity: "-1"
fileCacheForRangeRead: "true"
mountOptions: "implicit-dirs"

Loading