Model training guide #1406

Open · wants to merge 33 commits into base: main

Commits (33)
c6c2a73
initial commit + tf code
BRV158 Jul 8, 2024
636c0e8
document folder renamed
BRV158 Jul 9, 2024
1dd678a
GPU node pool added
BRV158 Jul 11, 2024
9b73def
Merge branch 'GoogleCloudPlatform:main' into data-load-strategy
ganochenkodg Aug 6, 2024
6b6d726
some updates
ganochenkodg Aug 8, 2024
aa79450
quickfix
ganochenkodg Aug 8, 2024
54c1f31
updates
ganochenkodg Aug 8, 2024
b8a0279
add sa
ganochenkodg Aug 8, 2024
5cddc1f
update
ganochenkodg Aug 11, 2024
7a45966
update notebook
ganochenkodg Aug 14, 2024
33d6d89
updates
ganochenkodg Aug 15, 2024
bbac023
update the code
ganochenkodg Aug 20, 2024
6a637e6
updates
ganochenkodg Aug 20, 2024
7c81a9e
quickfix
ganochenkodg Aug 20, 2024
be00cb6
quickfix
ganochenkodg Aug 20, 2024
6d4e556
updates
ganochenkodg Aug 21, 2024
bc0c262
regional tags added
BRV158 Aug 21, 2024
1166228
notebook cells explanation
BRV158 Aug 21, 2024
f9fff75
update the notebook
ganochenkodg Aug 22, 2024
c151919
fix
BRV158 Aug 22, 2024
33b3c2d
update the job
ganochenkodg Aug 22, 2024
595a01a
update the notebook
ganochenkodg Aug 22, 2024
2e0d880
new volume
ganochenkodg Aug 23, 2024
191a46c
dawnloading logic added
BRV158 Aug 27, 2024
a52a22a
separate download jobs
BRV158 Aug 29, 2024
c9dc544
ram-job fix
BRV158 Sep 3, 2024
59206d8
update the notebook
ganochenkodg Sep 12, 2024
0053b0b
Merge branch 'main' into data-load-strategy
ganochenkodg Sep 12, 2024
e9e1caf
model training squence edited
BRV158 Sep 13, 2024
d0043d2
dataset jobs renaming
BRV158 Sep 13, 2024
92b93fd
update headers
ganochenkodg Sep 16, 2024
f9520d1
Merge branch 'main' into data-load-strategy
ganochenkodg Sep 16, 2024
b9ecfdc
updates
ganochenkodg Sep 17, 2024
8 changes: 8 additions & 0 deletions ai-ml/model-train/README.md
# Data backend options for model training jobs on GKE

These examples show the performance of different storage options for model training jobs on
[Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine).

Visit the [Google Cloud documentation](will be known after publishing)
to follow the tutorials.
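
The manifests added in this PR appear to be meant to be applied in their numbered order. A minimal sketch, not part of the README itself, assuming kubectl already targets the tutorial cluster and the <PROJECT_ID>/<CLUSTER_PREFIX> placeholders have been substituted:

# Sketch only: apply the volume definitions first, then the dataset download job.
# (cloudbuild.yaml in 01-volumes is a Cloud Build config, not a Kubernetes resource.)
kubectl apply -f ai-ml/model-train/manifests/01-volumes/bucket.yaml
kubectl apply -f ai-ml/model-train/manifests/01-volumes/volumes.yaml
kubectl apply -f ai-ml/model-train/manifests/02-dataset/bucket-job.yaml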

47 changes: 47 additions & 0 deletions ai-ml/model-train/manifests/01-volumes/bucket.yaml
Collaborator:
Suggest: Judging from the requirements listed here, we'll need a README.md file for the ai-ml/model-train folder. Could you please add a README.md file with a link to the cloud.google.com tutorial where these samples will be used?

Contributor Author:
Added, but the link is empty; we won't know it until the guide is published.

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_ai_ml_model_train_01_bucket]
apiVersion: v1
kind: PersistentVolume
metadata:
  name: gcs-fuse-pv
spec:
  accessModes:
    - ReadWriteMany
  capacity:
    storage: 16Gi
  storageClassName: example-storage-class
  mountOptions:
    - implicit-dirs
  csi:
    driver: gcsfuse.csi.storage.gke.io
    volumeHandle: <PROJECT_ID>-<CLUSTER_PREFIX>-model-train
    volumeAttributes:
      fileCacheCapacity: 5Gi
      fileCacheForRangeRead: "true"
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: gcs-fuse-claim
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 16Gi
  volumeName: gcs-fuse-pv
  storageClassName: example-storage-class
# [END gke_ai_ml_model_train_01_bucket]
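
A quick way to exercise this manifest, not shown in the PR: a sketch of applying the Cloud Storage FUSE volume and checking the static binding, assuming the placeholders have been substituted and the Cloud Storage FUSE CSI driver is enabled on the cluster.

# Sketch only: apply the PV/PVC pair and confirm the claim binds to gcs-fuse-pv.
kubectl apply -f ai-ml/model-train/manifests/01-volumes/bucket.yaml
kubectl get pv gcs-fuse-pv
kubectl get pvc gcs-fuse-claim   # STATUS should become Bound (static binding via volumeName)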
33 changes: 33 additions & 0 deletions ai-ml/model-train/manifests/01-volumes/cloudbuild.yaml
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_ai_ml_model_train_01_cloudbuild]
steps:
  - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
    entrypoint: /bin/bash
    args:
      - '-c'
      - |
        gcloud compute ssh --tunnel-through-iap --quiet cloudbuild@${_INSTANCE_NAME} --zone=${_ZONE} --command="\
          sudo mkdir -p /mnt/disks/ram-disk && \
          sudo mount -t tmpfs -o size=16g tmpfs /mnt/disks/ram-disk && \
          sudo mkfs.ext4 -F /dev/disk/by-id/google-local-ssd-block0 && \
          sudo mkdir -p /mnt/disks/ssd0 && \
          sudo mount /dev/disk/by-id/google-local-ssd-block0 /mnt/disks/ssd0 && \
          sudo mkdir -p /mnt/disks/ssd0/outputs && \
          sudo chmod -R 777 /mnt/disks/ssd0/outputs"
substitutions:
  _ZONE: us-central1-a
  _INSTANCE_NAME: model-train-vm
# [END gke_ai_ml_model_train_01_cloudbuild]
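
The PR doesn't show how this build config is invoked; one plausible invocation, assuming the model-train-vm instance already exists and the Cloud Build service account is allowed to SSH over IAP, would be:

# Sketch only: run the disk-preparation step via Cloud Build with explicit substitutions.
gcloud builds submit --no-source \
    --config=ai-ml/model-train/manifests/01-volumes/cloudbuild.yaml \
    --substitutions=_ZONE=us-central1-a,_INSTANCE_NAME=model-train-vm

Since the config already declares the same substitution defaults, the --substitutions flag here mainly documents intent.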
95 changes: 95 additions & 0 deletions ai-ml/model-train/manifests/01-volumes/volumes.yaml
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_ai_ml_model_train_01_volumes]
apiVersion: v1
kind: PersistentVolume
metadata:
  name: local-ssd-pv
spec:
  capacity:
    storage: 16Gi
  accessModes: ["ReadWriteOnce"]
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-storage
  local:
    path: /mnt/disks/ssd0
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: "node_pool"
              operator: "In"
              values:
                - "model-train-pool"
---
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: local-ssd-claim
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: local-storage
  volumeName: local-ssd-pv
  resources:
    requests:
      storage: 16Gi
---
apiVersion: v1
kind: PersistentVolume
metadata:
  name: ram-disk-pv
spec:
  capacity:
    storage: 16Gi
  accessModes: ["ReadWriteOnce"]
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-storage
  local:
    path: /mnt/disks/ram-disk
  nodeAffinity:
    required:
      nodeSelectorTerms:
        - matchExpressions:
            - key: "node_pool"
              operator: "In"
              values:
                - "model-train-pool"
---
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: ram-disk-claim
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: local-storage
  volumeName: ram-disk-pv
  resources:
    requests:
      storage: 16Gi
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: pd-ssd-claim
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 16Gi
  storageClassName: premium-rwo
# [END gke_ai_ml_model_train_01_volumes]
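
As a sanity check that isn't part of the diff: a sketch of applying these volumes and watching them bind, assuming the Cloud Build step above has already prepared /mnt/disks/ssd0 and /mnt/disks/ram-disk on a node labeled node_pool=model-train-pool.

# Sketch only: create the static local PVs plus the dynamic premium-rwo claim.
kubectl apply -f ai-ml/model-train/manifests/01-volumes/volumes.yaml
kubectl get pv local-ssd-pv ram-disk-pv                       # both should report Bound
kubectl get pvc local-ssd-claim ram-disk-claim pd-ssd-claim   # pd-ssd-claim may stay Pending until a pod consumes it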
109 changes: 109 additions & 0 deletions ai-ml/model-train/manifests/02-dataset/bucket-job.yaml
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_ai_ml_model_train_02_data_load_job]
apiVersion: v1
kind: ServiceAccount
metadata:
  name: bucket-access
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: download-script
data:
  download.sh: |-
    #!/usr/bin/bash -x
    apt-get update -y && \
    apt-get install -y --no-install-recommends \
      git git-lfs rsync
    git lfs install
    cd /tmp
    echo "Saving dataset into tmp..."
    time git clone --depth=1 "$DATASET_REPO"; echo "cloned"
    if [ "$UPLOAD_SSD" == "1" ]; then
      echo "Saving dataset into Local SSD..."
      time rsync --info=progress2 -a /tmp/dataset/dataset/ /local-ssd/dataset/
    fi
    if [ "$UPLOAD_RAM" == "1" ]; then
      echo "Saving dataset into Ram disk..."
      time rsync --info=progress2 -a /tmp/dataset/dataset/ /ram-disk/dataset/
    fi
    if [ "$UPLOAD_PD" == "1" ]; then
      echo "Saving dataset into Persistent disk..."
      time rsync --info=progress2 -a /tmp/dataset/dataset/ /pd-ssd/dataset/
    fi
    if [ "$UPLOAD_BUCKET" == "1" ]; then
      echo "Saving dataset into Bucket..."
      time gsutil -q -m cp -r /tmp/dataset/dataset/ gs://$BUCKET_NAME/
      echo "Dataset was successfully saved in all storages!"
    fi
---
apiVersion: batch/v1
kind: Job
metadata:
  name: bucket-dataset-downloader
  labels:
    app: bucket-dataset-downloader
spec:
  ttlSecondsAfterFinished: 120
  template:
    metadata:
      labels:
        app: bucket-dataset-downloader
    spec:
      restartPolicy: OnFailure
      serviceAccountName: bucket-access
      containers:
        - name: gcloud
          image: gcr.io/google.com/cloudsdktool/google-cloud-cli:slim
          resources:
            requests:
              cpu: "1"
              memory: "3Gi"
            limits:
              cpu: "2"
              memory: "3Gi"
          command:
            - /scripts/download.sh
          env:
            - name: UPLOAD_BUCKET
              value: "1"
            - name: BUCKET_NAME
              value: <PROJECT_ID>-<CLUSTER_PREFIX>-model-train
            - name: DATASET_REPO
              value: "https://huggingface.co/datasets/dganochenko/dataset"
            - name: TIMEFORMAT
              value: "%0lR"
          volumeMounts:
            - name: scripts-volume
              mountPath: "/scripts/"
              readOnly: true
      volumes:
        - name: scripts-volume
          configMap:
            defaultMode: 0700
            name: download-script
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Equal"
          value: "present"
          effect: NoSchedule
        - key: "app.stateful/component"
          operator: "Equal"
          value: "model-train"
          effect: NoSchedule
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
# [END gke_ai_ml_model_train_02_data_load_job]
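
To close the loop, again not part of the PR: a hedged sketch of running the downloader and following it, after substituting the <PROJECT_ID>/<CLUSTER_PREFIX> placeholders in BUCKET_NAME.

# Sketch only: launch the job, stream the download script's output, and wait for completion.
kubectl apply -f ai-ml/model-train/manifests/02-dataset/bucket-job.yaml
kubectl logs -f job/bucket-dataset-downloader
kubectl wait --for=condition=complete --timeout=30m job/bucket-dataset-downloader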