diff --git a/ai-ml/model-train/README.md b/ai-ml/model-train/README.md new file mode 100644 index 0000000000..822759d457 --- /dev/null +++ b/ai-ml/model-train/README.md @@ -0,0 +1,8 @@ +# Data backend options for model training jobs on GKE + +These examples show the performance of different storage backends for model training on +[Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine). + +Visit [Google Cloud documentation](will be known after publishing) +to follow the tutorials. + diff --git a/ai-ml/model-train/manifests/01-volumes/bucket.yaml b/ai-ml/model-train/manifests/01-volumes/bucket.yaml new file mode 100644 index 0000000000..9eca18d4eb --- /dev/null +++ b/ai-ml/model-train/manifests/01-volumes/bucket.yaml @@ -0,0 +1,47 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_model_train_01_bucket] +apiVersion: v1 +kind: PersistentVolume +metadata: + name: gcs-fuse-pv +spec: + accessModes: + - ReadWriteMany + capacity: + storage: 16Gi + storageClassName: example-storage-class + mountOptions: + - implicit-dirs + csi: + driver: gcsfuse.csi.storage.gke.io + volumeHandle: --model-train + volumeAttributes: + fileCacheCapacity: 5Gi + fileCacheForRangeRead: "true" +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: gcs-fuse-claim +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 16Gi + volumeName: gcs-fuse-pv + storageClassName: example-storage-class +# [END gke_ai_ml_model_train_01_bucket] \ No newline at end of file diff --git a/ai-ml/model-train/manifests/01-volumes/cloudbuild.yaml b/ai-ml/model-train/manifests/01-volumes/cloudbuild.yaml new file mode 100644 index 0000000000..f166f6ca68 --- /dev/null +++ b/ai-ml/model-train/manifests/01-volumes/cloudbuild.yaml @@ -0,0 +1,33 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_model_train_01_cloudbuild] +steps: +- name: 'gcr.io/google.com/cloudsdktool/cloud-sdk' + entrypoint: /bin/bash + args: + - '-c' + - | + gcloud compute ssh --tunnel-through-iap --quiet cloudbuild@${_INSTANCE_NAME} --zone=${_ZONE} --command="\ + sudo mkdir -p /mnt/disks/ram-disk && \ + sudo mount -t tmpfs -o size=16g tmpfs /mnt/disks/ram-disk && \ + sudo mkfs.ext4 -F /dev/disk/by-id/google-local-ssd-block0 && \ + sudo mkdir -p /mnt/disks/ssd0 && \ + sudo mount /dev/disk/by-id/google-local-ssd-block0 /mnt/disks/ssd0 && \ + sudo mkdir -p /mnt/disks/ssd0/outputs && \ + sudo chmod -R 777 /mnt/disks/ssd0/outputs" +substitutions: + _ZONE: us-central1-a + _INSTANCE_NAME: model-train-vm +# [END gke_ai_ml_model_train_01_cloudbuild] \ No newline at end of file diff --git a/ai-ml/model-train/manifests/01-volumes/volumes.yaml b/ai-ml/model-train/manifests/01-volumes/volumes.yaml new file mode 100644 index 0000000000..626e8f1f66 --- /dev/null +++ b/ai-ml/model-train/manifests/01-volumes/volumes.yaml @@ -0,0 +1,95 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_model_train_01_volumes] +apiVersion: v1 +kind: PersistentVolume +metadata: + name: local-ssd-pv +spec: + capacity: + storage: 16Gi + accessModes: ["ReadWriteOnce"] + persistentVolumeReclaimPolicy: Retain + storageClassName: local-storage + local: + path: /mnt/disks/ssd0 + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: "node_pool" + operator: "In" + values: + - "model-train-pool" +--- + kind: PersistentVolumeClaim + apiVersion: v1 + metadata: + name: local-ssd-claim + spec: + accessModes: + - ReadWriteOnce + storageClassName: local-storage + volumeName: local-ssd-pv + resources: + requests: + storage: 16Gi +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: ram-disk-pv +spec: + capacity: + storage: 16Gi + accessModes: ["ReadWriteOnce"] + persistentVolumeReclaimPolicy: Retain + storageClassName: local-storage + local: + path: /mnt/disks/ram-disk + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: "node_pool" + operator: "In" + values: + - "model-train-pool" +--- + kind: PersistentVolumeClaim + apiVersion: v1 + metadata: + name: ram-disk-claim + spec: + accessModes: + - ReadWriteOnce + storageClassName: local-storage + volumeName: ram-disk-pv + resources: + requests: + storage: 16Gi +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: pd-ssd-claim +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 16Gi + storageClassName: premium-rwo +# [END gke_ai_ml_model_train_01_volumes] diff --git a/ai-ml/model-train/manifests/02-dataset/bucket-job.yaml b/ai-ml/model-train/manifests/02-dataset/bucket-job.yaml new file mode 100644 index 0000000000..ebb71ec88f --- /dev/null +++ b/ai-ml/model-train/manifests/02-dataset/bucket-job.yaml @@ -0,0 +1,109 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START gke_ai_ml_model_train_02_data_load_job] +apiVersion: v1 +kind: ServiceAccount +metadata: + name: bucket-access +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: download-script +data: + download.sh: |- + #!/usr/bin/bash -ex + apt-get update -y && \ + apt-get install -y --no-install-recommends \ + git git-lfs rsync + git lfs install + cd /tmp + echo "Saving dataset into tmp..." + time git clone --depth=1 "$DATASET_REPO"; echo "cloned" + if [ "$UPLOAD_SSD" == "1" ]; then + echo "Saving dataset into Local SSD..." + time rsync --info=progress2 -a /tmp/dataset/dataset/ /local-ssd/dataset/ + fi + if [ "$UPLOAD_RAM" == "1" ]; then + echo "Saving dataset into Ram disk..." + time rsync --info=progress2 -a /tmp/dataset/dataset/ /ram-disk/dataset/ + fi + if [ "$UPLOAD_PD" == "1" ]; then + echo "Saving dataset into Persistent disk..." + time rsync --info=progress2 -a /tmp/dataset/dataset/ /pd-ssd/dataset/ + fi + if [ "$UPLOAD_BUCKET" == "1" ]; then + echo "Saving dataset into Bucket..." + time gsutil -q -m cp -r /tmp/dataset/dataset/ gs://$BUCKET_NAME/ + echo "Dataset was successfully saved in all storages!" 
+ fi +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: bucket-dataset-downloader + labels: + app: bucket-dataset-downloader +spec: + ttlSecondsAfterFinished: 120 + template: + metadata: + labels: + app: bucket-dataset-downloader + spec: + restartPolicy: OnFailure + serviceAccountName: bucket-access + containers: + - name: gcloud + image: gcr.io/google.com/cloudsdktool/google-cloud-cli:slim + resources: + requests: + cpu: "1" + memory: "3Gi" + limits: + cpu: "2" + memory: "3Gi" + command: + - /scripts/download.sh + env: + - name: UPLOAD_BUCKET + value: "1" + - name: BUCKET_NAME + value: --model-train + - name: DATASET_REPO + value: "https://huggingface.co/datasets/dganochenko/dataset" + - name: TIMEFORMAT + value: "%0lR" + volumeMounts: + - name: scripts-volume + mountPath: "/scripts/" + readOnly: true + volumes: + - name: scripts-volume + configMap: + defaultMode: 0700 + name: download-script + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "present" + effect: NoSchedule + - key: "app.stateful/component" + operator: "Equal" + value: "model-train" + effect: NoSchedule + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-l4 +# [END gke_ai_ml_model_train_02_data_load_job] diff --git a/ai-ml/model-train/manifests/02-dataset/pd-job.yaml b/ai-ml/model-train/manifests/02-dataset/pd-job.yaml new file mode 100644 index 0000000000..d2f340748f --- /dev/null +++ b/ai-ml/model-train/manifests/02-dataset/pd-job.yaml @@ -0,0 +1,112 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# [START gke_ai_ml_model_train_02_data_load_job] +apiVersion: v1 +kind: ServiceAccount +metadata: + name: bucket-access +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: download-script +data: + download.sh: |- + #!/usr/bin/bash -ex + apt-get update -y && \ + apt-get install -y --no-install-recommends \ + git git-lfs rsync + git lfs install + cd /tmp + echo "Saving dataset into tmp..." + time git clone --depth=1 "$DATASET_REPO"; echo "cloned" + if [ "$UPLOAD_SSD" == "1" ]; then + echo "Saving dataset into Local SSD..." + time rsync --info=progress2 -a /tmp/dataset/dataset/ /local-ssd/dataset/ + fi + if [ "$UPLOAD_RAM" == "1" ]; then + echo "Saving dataset into Ram disk..." + time rsync --info=progress2 -a /tmp/dataset/dataset/ /ram-disk/dataset/ + fi + if [ "$UPLOAD_PD" == "1" ]; then + echo "Saving dataset into Persistent disk..." + time rsync --info=progress2 -a /tmp/dataset/dataset/ /pd-ssd/dataset/ + fi + if [ "$UPLOAD_BUCKET" == "1" ]; then + echo "Saving dataset into Bucket..." + time gsutil -q -m cp -r /tmp/dataset/dataset/ gs://$BUCKET_NAME/ + echo "Dataset was successfully saved in all storages!" 
+ fi +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: pd-dataset-downloader + labels: + app: pd-dataset-downloader +spec: + ttlSecondsAfterFinished: 120 + template: + metadata: + labels: + app: pd-dataset-downloader + spec: + restartPolicy: OnFailure + serviceAccountName: bucket-access + containers: + - name: gcloud + image: gcr.io/google.com/cloudsdktool/google-cloud-cli:slim + resources: + requests: + cpu: "1" + memory: "3Gi" + limits: + cpu: "2" + memory: "3Gi" + command: + - /scripts/download.sh + env: + - name: UPLOAD_PD + value: "1" + - name: DATASET_REPO + value: "https://huggingface.co/datasets/dganochenko/dataset" + - name: TIMEFORMAT + value: "%0lR" + volumeMounts: + - name: pd-ssd-storage + mountPath: /pd-ssd + - name: scripts-volume + mountPath: "/scripts/" + readOnly: true + volumes: + - name: scripts-volume + configMap: + defaultMode: 0700 + name: download-script + - name: pd-ssd-storage + persistentVolumeClaim: + claimName: pd-ssd-claim + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "present" + effect: NoSchedule + - key: "app.stateful/component" + operator: "Equal" + value: "model-train" + effect: NoSchedule + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-l4 +# [END gke_ai_ml_model_train_02_data_load_job] diff --git a/ai-ml/model-train/manifests/02-dataset/ram-job.yaml b/ai-ml/model-train/manifests/02-dataset/ram-job.yaml new file mode 100644 index 0000000000..b53d5609cd --- /dev/null +++ b/ai-ml/model-train/manifests/02-dataset/ram-job.yaml @@ -0,0 +1,112 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START gke_ai_ml_model_train_02_data_load_job] +apiVersion: v1 +kind: ServiceAccount +metadata: + name: bucket-access +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: download-script +data: + download.sh: |- + #!/usr/bin/bash -ex + apt-get update -y && \ + apt-get install -y --no-install-recommends \ + git git-lfs rsync + git lfs install + cd /tmp + echo "Saving dataset into tmp..." + time git clone --depth=1 "$DATASET_REPO"; echo "cloned" + if [ "$UPLOAD_SSD" == "1" ]; then + echo "Saving dataset into Local SSD..." + time rsync --info=progress2 -a /tmp/dataset/dataset/ /local-ssd/dataset/ + fi + if [ "$UPLOAD_RAM" == "1" ]; then + echo "Saving dataset into Ram disk..." + time rsync --info=progress2 -a /tmp/dataset/dataset/ /ram-disk/dataset/ + fi + if [ "$UPLOAD_PD" == "1" ]; then + echo "Saving dataset into Persistent disk..." + time rsync --info=progress2 -a /tmp/dataset/dataset/ /pd-ssd/dataset/ + fi + if [ "$UPLOAD_BUCKET" == "1" ]; then + echo "Saving dataset into Bucket..." + time gsutil -q -m cp -r /tmp/dataset/dataset/ gs://$BUCKET_NAME/ + echo "Dataset was successfully saved in all storages!" 
+ fi +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ram-dataset-downloader + labels: + app: ram-dataset-downloader +spec: + ttlSecondsAfterFinished: 120 + template: + metadata: + labels: + app: ram-dataset-downloader + spec: + restartPolicy: OnFailure + serviceAccountName: bucket-access + containers: + - name: gcloud + image: gcr.io/google.com/cloudsdktool/google-cloud-cli:slim + resources: + requests: + cpu: "1" + memory: "12Gi" + limits: + cpu: "2" + memory: "12Gi" + command: + - /scripts/download.sh + env: + - name: UPLOAD_RAM + value: "1" + - name: DATASET_REPO + value: "https://huggingface.co/datasets/dganochenko/dataset" + - name: TIMEFORMAT + value: "%0lR" + volumeMounts: + - name: ram-disk-storage + mountPath: /ram-disk + - name: scripts-volume + mountPath: "/scripts/" + readOnly: true + volumes: + - name: scripts-volume + configMap: + defaultMode: 0700 + name: download-script + - name: ram-disk-storage + persistentVolumeClaim: + claimName: ram-disk-claim + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "present" + effect: NoSchedule + - key: "app.stateful/component" + operator: "Equal" + value: "model-train" + effect: NoSchedule + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-l4 +# [END gke_ai_ml_model_train_02_data_load_job] diff --git a/ai-ml/model-train/manifests/02-dataset/ssd-job.yaml b/ai-ml/model-train/manifests/02-dataset/ssd-job.yaml new file mode 100644 index 0000000000..2db87e9fed --- /dev/null +++ b/ai-ml/model-train/manifests/02-dataset/ssd-job.yaml @@ -0,0 +1,112 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START gke_ai_ml_model_train_02_data_load_job] +apiVersion: v1 +kind: ServiceAccount +metadata: + name: bucket-access +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: download-script +data: + download.sh: |- + #!/usr/bin/bash -ex + apt-get update -y && \ + apt-get install -y --no-install-recommends \ + git git-lfs rsync + git lfs install + cd /tmp + echo "Saving dataset into tmp..." + time git clone --depth=1 "$DATASET_REPO"; echo "cloned" + if [ "$UPLOAD_SSD" == "1" ]; then + echo "Saving dataset into Local SSD..." + time rsync --info=progress2 -a /tmp/dataset/dataset/ /local-ssd/dataset/ + fi + if [ "$UPLOAD_RAM" == "1" ]; then + echo "Saving dataset into Ram disk..." + time rsync --info=progress2 -a /tmp/dataset/dataset/ /ram-disk/dataset/ + fi + if [ "$UPLOAD_PD" == "1" ]; then + echo "Saving dataset into Persistent disk..." + time rsync --info=progress2 -a /tmp/dataset/dataset/ /pd-ssd/dataset/ + fi + if [ "$UPLOAD_BUCKET" == "1" ]; then + echo "Saving dataset into Bucket..." + time gsutil -q -m cp -r /tmp/dataset/dataset/ gs://$BUCKET_NAME/ + echo "Dataset was successfully saved in all storages!" 
+ fi +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: ssd-dataset-downloader + labels: + app: ssd-dataset-downloader +spec: + ttlSecondsAfterFinished: 120 + template: + metadata: + labels: + app: ssd-dataset-downloader + spec: + restartPolicy: OnFailure + serviceAccountName: bucket-access + containers: + - name: gcloud + image: gcr.io/google.com/cloudsdktool/google-cloud-cli:slim + resources: + requests: + cpu: "1" + memory: "3Gi" + limits: + cpu: "2" + memory: "3Gi" + command: + - /scripts/download.sh + env: + - name: UPLOAD_SSD + value: "1" + - name: DATASET_REPO + value: "https://huggingface.co/datasets/dganochenko/dataset" + - name: TIMEFORMAT + value: "%0lR" + volumeMounts: + - name: local-ssd-storage + mountPath: /local-ssd + - name: scripts-volume + mountPath: "/scripts/" + readOnly: true + volumes: + - name: scripts-volume + configMap: + defaultMode: 0700 + name: download-script + - name: local-ssd-storage + persistentVolumeClaim: + claimName: local-ssd-claim + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "present" + effect: NoSchedule + - key: "app.stateful/component" + operator: "Equal" + value: "model-train" + effect: NoSchedule + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-l4 +# [END gke_ai_ml_model_train_02_data_load_job] diff --git a/ai-ml/model-train/manifests/03-notebook/model-train.ipynb b/ai-ml/model-train/manifests/03-notebook/model-train.ipynb new file mode 100644 index 0000000000..136b097f5c --- /dev/null +++ b/ai-ml/model-train/manifests/03-notebook/model-train.ipynb @@ -0,0 +1,384 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fb8bf6b1", + "metadata": {}, + "source": [ + " Copyright 2024 Google LLC\n", + "\n", + " Licensed under the Apache License, Version 2.0 (the \"License\");\n", + " you may not use this file except in compliance with the License.\n", + " You may obtain a copy of the License at\n", + "\n", + " https://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + " Unless required 
by applicable law or agreed to in writing, software\n", + " distributed under the License is distributed on an \"AS IS\" BASIS,\n", + " WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + " See the License for the specific language governing permissions and\n", + " limitations under the License." + ] + }, + { + "cell_type": "markdown", + "id": "73ad9294", + "metadata": {}, + "source": [ + "This section shows how to run model training using PyTorch with data read from a specific storage backend." + ] + }, + { + "cell_type": "markdown", + "id": "12fce6c7", + "metadata": {}, + "source": [ + "Import required Python libraries:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1a3c49c7-f0e3-4d8b-b631-bf3f0a92828e", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import time\n", + "import torchvision.transforms as transforms\n", + "import torchvision.datasets as datasets\n", + "from torch.utils.data import DataLoader" + ] + }, + { + "cell_type": "markdown", + "id": "71898099", + "metadata": {}, + "source": [ + "Declare batch size for dataset reading. Define dataset [transformation parameters](https://pytorch.org/vision/stable/transforms.html) - resize image, apply random image augmentations, convert to tensor and normalize it to fit all vector dimensions into [-1, 1] range." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "67fd906d-1d87-44d0-a64a-484dfab4d320", + "metadata": {}, + "outputs": [], + "source": [ + "BATCH_SIZE = 64\n", + "\n", + "train_transform = transforms.Compose([\n", + " transforms.Resize((224, 224)),\n", + " transforms.RandomHorizontalFlip(p=0.5),\n", + " transforms.RandomVerticalFlip(p=0.5),\n", + " transforms.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5)),\n", + " transforms.RandomRotation(degrees=(30, 70)),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(\n", + " mean=[0.5, 0.5, 0.5],\n", + " std=[0.5, 0.5, 0.5]\n", + " )\n", + "])\n" + ] + }, + { + "cell_type": "markdown", + "id": "aeac6430", + "metadata": {}, + "source": [ + "Declare custom neural network. Worth to mention nn.Linear parameter set limits to input features (vector dimensions of input image) and output (corresponding image class)." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d0d15daf-eeab-4840-a3a5-f68f365754fa", + "metadata": {}, + "outputs": [], + "source": [ + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "class CNNModel(nn.Module):\n", + " def __init__(self):\n", + " super(CNNModel, self).__init__()\n", + " self.conv1 = nn.Conv2d(3, 32, 5)\n", + " self.conv2 = nn.Conv2d(32, 64, 5)\n", + " self.conv3 = nn.Conv2d(64, 128, 3)\n", + " self.conv4 = nn.Conv2d(128, 256, 5)\n", + " \n", + " self.fc1 = nn.Linear(256, 1000)\n", + " \n", + " self.pool = nn.MaxPool2d(2, 2)\n", + " \n", + " def forward(self, x):\n", + " x = self.pool(F.relu(self.conv1(x)))\n", + " x = self.pool(F.relu(self.conv2(x)))\n", + " x = self.pool(F.relu(self.conv3(x)))\n", + " x = self.pool(F.relu(self.conv4(x)))\n", + " bs, _, _, _ = x.shape\n", + " x = F.adaptive_avg_pool2d(x, 1).reshape(bs, -1)\n", + " x = self.fc1(x)\n", + " return x" + ] + }, + { + "cell_type": "markdown", + "id": "ebd39481", + "metadata": {}, + "source": [ + "In this cell you check if CUDA is available and declare two 
[optimization functions](https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html) - optimizer (the Adam\n", + "optimizer with a 0.001 learning rate) and criterion (the Cross-Entropy loss function). " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6a0a5b27-b699-4c14-9fe2-a36d992079b3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computation device: cuda\n", + "\n" + ] + } + ], + "source": [ + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "from tqdm.auto import tqdm\n", + "\n", + "device = ('cuda' if torch.cuda.is_available() else 'cpu')\n", + "print(f\"Computation device: {device}\\n\")\n", + "model = CNNModel().to(device)\n", + "\n", + "optimizer = optim.Adam(model.parameters(), lr=0.001)\n", + "criterion = nn.CrossEntropyLoss()" + ] + }, + { + "cell_type": "markdown", + "id": "744c187a", + "metadata": {}, + "source": [ + "Define helper save_model function to save model states." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "178a81c6-dbc4-427a-8f82-7cf503d423b9", + "metadata": {}, + "outputs": [], + "source": [ + "def save_model(epochs, model, optimizer, criterion):\n", + " model_path = \"/local-ssd/outputs/model-\" + time.strftime(\"%H-%M-%S\", time.localtime()) + \".pth\"\n", + " torch.save({\n", + " 'epoch': epochs,\n", + " 'model_state_dict': model.state_dict(),\n", + " 'optimizer_state_dict': optimizer.state_dict(),\n", + " 'loss': criterion,\n", + " }, model_path)\n", + " print(f\"Model was saved in {model_path}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b401f2ad", + "metadata": {}, + "source": [ + "Define model training function that runs very [common training loop](https://pytorch.org/tutorials/beginner/introyt/trainingyt.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e0b54e29-fe75-4a98-b636-c85ee7945e83", + "metadata": {}, + "outputs": [], + "source": [ + "def train(model, trainloader, optimizer, criterion):\n", + " model.train()\n", + " print('Training')\n", + " train_running_loss = 0.0\n", + " train_running_correct = 0\n", + " counter = 0\n", + " for i, data in tqdm(enumerate(trainloader), total=len(trainloader)):\n", + " counter += 1\n", + " image, labels = data\n", + " image = image.to(device)\n", + " labels = labels.to(device)\n", + " optimizer.zero_grad()\n", + " outputs = model(image)\n", + " loss = criterion(outputs, labels)\n", + " train_running_loss += loss.item()\n", + " _, preds = torch.max(outputs.data, 1)\n", + " train_running_correct += (preds == labels).sum().item()\n", + " loss.backward()\n", + " optimizer.step()\n", + " epoch_loss = train_running_loss / counter\n", + " epoch_acc = 100. * (train_running_correct / len(trainloader.dataset))\n", + " return epoch_loss, epoch_acc" + ] + }, + { + "cell_type": "markdown", + "id": "3e42c785", + "metadata": {}, + "source": [ + "Declare benchmark function, that calculates training time using different storages and epochs amount. The function erases cache in GPU memory and uploads an untrained model to GPU memory before each training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9dd69f8-4018-4fad-ae91-faa30b13f0fc", + "metadata": {}, + "outputs": [], + "source": [ + "def train_benchmark(dataset_path, epochs):\n", + " start = time.time()\n", + " train_dataset = datasets.ImageFolder(\n", + " root=dataset_path,\n", + " transform=train_transform\n", + " )\n", + " train_loader = DataLoader(\n", + " train_dataset, batch_size=BATCH_SIZE, shuffle=True,\n", + " num_workers=2\n", + " )\n", + " #recreate the model before training\n", + " torch.cuda.empty_cache()\n", + " model = CNNModel().to(device)\n", + " optimizer = optim.Adam(model.parameters(), lr=0.001)  # bind a fresh optimizer to the new model; the global one tracks another model's parameters\n", + " train_loss, train_acc = [], []\n", + " for epoch in range(epochs):\n", + " print(f\"[INFO]: Epoch {epoch+1} of {epochs}\")\n", + " train_epoch_loss, train_epoch_acc = train(model, train_loader, optimizer, criterion)\n", + " train_loss.append(train_epoch_loss)\n", + " train_acc.append(train_epoch_acc)\n", + " print(f\"Training loss: {train_epoch_loss:.3f}, training acc: {train_epoch_acc:.3f}\")\n", + " print('-'*50)\n", + " save_model(epochs, model, optimizer, criterion)\n", + " print('Training complete')\n", + "\n", + " end = time.time()\n", + " print(\"Total training time: \", time.strftime(\"%H:%M:%S\", time.gmtime(end-start)))" + ] + }, + { + "cell_type": "markdown", + "id": "0a23f203-908f-4e41-82e4-803683ce20f8", + "metadata": {}, + "source": [ + "Run the benchmark using Ram disk and 2/5/10 training cycles." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4762d0ec-8c0c-40c2-8930-8feb8e81e899", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Ram disk - 2 epochs\")\n", + "train_benchmark(\"/ram-disk/dataset\", 2)\n", + "print(\"Ram disk - 5 epochs\")\n", + "train_benchmark(\"/ram-disk/dataset\", 5)\n", + "print(\"Ram disk - 10 epochs\")\n", + "train_benchmark(\"/ram-disk/dataset\", 10)" + ] + }, + { + "cell_type": "markdown", + "id": "9a1b7e9a-14d4-4d3c-b20a-9987c46e78b0", + "metadata": {}, + "source": [ + "Run the benchmark using Local SSD and 2/5/10 training cycles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3908c503-4b86-451e-bd04-ff00e65a2bb7", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Local ssd - 2 epochs\")\n", + "train_benchmark(\"/local-ssd/dataset\", 2)\n", + "print(\"Local ssd - 5 epochs\")\n", + "train_benchmark(\"/local-ssd/dataset\", 5)\n", + "print(\"Local ssd - 10 epochs\")\n", + "train_benchmark(\"/local-ssd/dataset\", 10)" + ] + }, + { + "cell_type": "markdown", + "id": "38e1d7bc-942e-477e-8c81-43eb70ccc981", + "metadata": {}, + "source": [ + "Run the benchmark using Persistent disk and 2/5/10 training cycles." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "322de896-0b78-48e1-829e-363b1577dea9", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Persistent disk - 2 epochs\")\n", + "train_benchmark(\"/pd-ssd/dataset\", 2)\n", + "print(\"Persistent disk - 5 epochs\")\n", + "train_benchmark(\"/pd-ssd/dataset\", 5)\n", + "print(\"Persistent disk - 10 epochs\")\n", + "train_benchmark(\"/pd-ssd/dataset\", 10)" + ] + }, + { + "cell_type": "markdown", + "id": "402ef4a7-3c5e-4dbf-bed7-ffb3a9d04f65", + "metadata": {}, + "source": [ + "Run the benchmark using GCS bucket and 2/5/10 training cycles." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38d258f2-41b4-4e81-9120-e0e1e8ba1d68", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Bucket - 2 epochs\")\n", + "train_benchmark(\"/bucket/dataset\", 2)\n", + "print(\"Bucket - 5 epochs\")\n", + "train_benchmark(\"/bucket/dataset\", 5)\n", + "print(\"Bucket - 10 epochs\")\n", + "train_benchmark(\"/bucket/dataset\", 10)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ai-ml/model-train/manifests/03-notebook/notebook.yaml b/ai-ml/model-train/manifests/03-notebook/notebook.yaml new file mode 100644 index 0000000000..bc4d89cb89 --- /dev/null +++ b/ai-ml/model-train/manifests/03-notebook/notebook.yaml @@ -0,0 +1,103 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_ai_ml_model_train_03_jupiter_notebook] +apiVersion: v1 +kind: Service +metadata: + labels: &labels + app: jupyter-notebook + name: notebook +spec: + ports: + - port: 8888 + selector: *labels + type: LoadBalancer + #type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: notebook + labels: &labels + app: jupyter-notebook +spec: + selector: + matchLabels: *labels + template: + metadata: + annotations: + gke-gcsfuse/volumes: "true" + labels: *labels + spec: + serviceAccountName: bucket-access + containers: + - name: jupyter + image: quay.io/jupyter/pytorch-notebook:cuda12-pytorch-2.4.0 + resources: + requests: + cpu: "2" + memory: "16Gi" + nvidia.com/gpu: 1 + limits: + cpu: "2" + memory: "16Gi" + nvidia.com/gpu: 1 + ports: + - containerPort: 8888 + volumeMounts: + - name: dshm + mountPath: /dev/shm + - name: notebook + mountPath: "/home/jovyan/model-train.ipynb" + subPath: "model-train.ipynb" + - name: local-ssd-storage + mountPath: /local-ssd + - name: ram-disk-storage + mountPath: /ram-disk + - name: gcs-fuse-storage + mountPath: /bucket + - name: pd-ssd-storage + mountPath: /pd-ssd + volumes: + - name: dshm + emptyDir: + medium: Memory + - name: notebook + configMap: + name: notebook + - name: local-ssd-storage + persistentVolumeClaim: + claimName: local-ssd-claim + - name: ram-disk-storage + persistentVolumeClaim: + claimName: ram-disk-claim + - name: gcs-fuse-storage + persistentVolumeClaim: + claimName: gcs-fuse-claim + - name: pd-ssd-storage + persistentVolumeClaim: + claimName: pd-ssd-claim + tolerations: + - key: "nvidia.com/gpu" + operator: "Equal" + value: "present" + effect: NoSchedule + - key: "app.stateful/component" + operator: "Equal" + value: "model-train" + effect: NoSchedule + nodeSelector: + cloud.google.com/gke-accelerator: nvidia-l4 +# [END gke_ai_ml_model_train_03_jupiter_notebook] diff --git a/ai-ml/model-train/terraform/gke-standard/README.md b/ai-ml/model-train/terraform/gke-standard/README.md new file 
mode 100644 index 0000000000..099a397275 --- /dev/null +++ b/ai-ml/model-train/terraform/gke-standard/README.md @@ -0,0 +1,28 @@ +# Terraform to provision GKE Standard + +## Prerequisites and Assumptions +* The project and gcloud CLI have been initialized following the instructions in `{ROOT}/README.md` +* VPC network, refer to `gke` folder for the details + +## Usage +``` +export GOOGLE_OAUTH_ACCESS_TOKEN=$(gcloud auth print-access-token) +export PROJECT_ID="your project" +export REGION="us-central1" +export CLUSTER_PREFIX="model-train" +export GPU_ZONE=$(gcloud compute accelerator-types list --filter="zone ~ $REGION AND name=nvidia-l4" --limit=1 --format="value(zone)") + +terraform init +terraform plan -var project_id=$PROJECT_ID -var region=${REGION} -var cluster_prefix=${CLUSTER_PREFIX} -var node_location=${GPU_ZONE} +terraform apply -var project_id=$PROJECT_ID -var region=${REGION} -var cluster_prefix=${CLUSTER_PREFIX} -var node_location=${GPU_ZONE} +``` +## Clean up +**NOTE:** Be very careful when destroying any resource; this is not recommended for production! +``` +# Destroy everything +terraform destroy \ +-var project_id=$PROJECT_ID \ +-var region=${REGION} \ +-var cluster_prefix=${CLUSTER_PREFIX} \ +-var node_location=${GPU_ZONE} +``` diff --git a/ai-ml/model-train/terraform/gke-standard/main.tf b/ai-ml/model-train/terraform/gke-standard/main.tf new file mode 100644 index 0000000000..aff752929e --- /dev/null +++ b/ai-ml/model-train/terraform/gke-standard/main.tf @@ -0,0 +1,74 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# create private subnet +module "network" { + source = "../modules/network" + project_id = var.project_id + region = var.region + cluster_prefix = var.cluster_prefix +} + +# [START gke_model_train_standard_private_regional_cluster] +module "training_cluster" { + source = "../modules/cluster" + project_id = var.project_id + region = var.region + cluster_prefix = var.cluster_prefix + network = module.network.network_name + subnetwork = module.network.subnet_name + + node_pools = [ + { + name = "model-train-pool" + disk_size_gb = var.node_disk_size + disk_type = "pd-balanced" + node_locations = var.node_location + autoscaling = true + min_count = 1 + max_count = var.autoscaling_max_count + max_surge = 1 + max_unavailable = 0 + machine_type = "g2-standard-8" + local_nvme_ssd_count = 1 + auto_repair = true + accelerator_count = 1 + accelerator_type = "nvidia-l4" + gpu_driver_version = "LATEST" + } + ] + + node_pools_labels = { + all = {} + model-train-pool = { + "app.stateful/component" = "model-train" + } + } + node_pools_taints = { + all = [] + model-train-pool = [ + { + key = "app.stateful/component" + value = "model-train" + effect = "NO_SCHEDULE" + } + ] + } +} + +output "kubectl_connection_command" { + value = "gcloud container clusters get-credentials ${var.cluster_prefix}-cluster --region ${var.region}" + description = "Connection command" +} +# [END gke_model_train_standard_private_regional_cluster] diff --git a/ai-ml/model-train/terraform/gke-standard/variables.tf b/ai-ml/model-train/terraform/gke-standard/variables.tf new file mode 100644 index 0000000000..d60038d573 --- /dev/null +++ b/ai-ml/model-train/terraform/gke-standard/variables.tf @@ -0,0 +1,57 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "project_id" { + description = "The project ID to host the cluster in" + default = "" +} + +variable "region" { + description = "The region to host the cluster in" +} + +variable "cluster_prefix" { + description = "The prefix for all cluster resources" + default = "model-train" +} + +variable "node_location" { + description = "Node location for GPU node pool - please check GPUs node availability in official documentation: https://cloud.google.com/compute/docs/regions-zones" + type = string + default = "" + +} + +variable "node_machine_type" { + description = "The machine type for node instances" + default = "e2-standard-2" + type = string +} + +variable "node_disk_type" { + description = "The persistent disk type for node instances" + default = "pd-standard" + type = string +} +variable "node_disk_size" { + description = "The persistent disk size for node instances" + default = 100 + type = number +} + +variable "autoscaling_max_count" { + description = "Maximum node counts per zone" + default = 2 + type = number +} diff --git a/ai-ml/model-train/terraform/gke-standard/versions.tf b/ai-ml/model-train/terraform/gke-standard/versions.tf new file mode 100644 index 0000000000..f8bded9171 --- /dev/null +++ b/ai-ml/model-train/terraform/gke-standard/versions.tf @@ -0,0 +1,23 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = "~> 5.0" + } + } + required_version = ">= 1.3" +} diff --git a/ai-ml/model-train/terraform/modules/cluster/bucket.tf b/ai-ml/model-train/terraform/modules/cluster/bucket.tf new file mode 100644 index 0000000000..05810ee485 --- /dev/null +++ b/ai-ml/model-train/terraform/modules/cluster/bucket.tf @@ -0,0 +1,42 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_model_train_cloud_storage_bucket] +module "cloud-storage" { + source = "terraform-google-modules/cloud-storage/google//modules/simple_bucket" + version = "~> 5.0" + + name = "${var.project_id}-${var.cluster_prefix}-model-train" + project_id = var.project_id + location = var.region + force_destroy = true +} + +locals { + workload_principal = "principal://iam.googleapis.com/projects/${data.google_project.project.number}/locations/global/workloadIdentityPools/${var.project_id}.svc.id.goog/subject/ns/default/sa/bucket-access" +} + + +module "cloud-storage-iam-bindings" { + source = "terraform-google-modules/iam/google//modules/storage_buckets_iam" + version = "~> 7.0" + + storage_buckets = [module.cloud-storage.name] + mode = "authoritative" + bindings = { + "roles/storage.objectUser" = [local.workload_principal] + } + depends_on = [module.cloud-storage] +} +# [END gke_model_train_cloud_storage_bucket] diff --git a/ai-ml/model-train/terraform/modules/cluster/cloudbuild.yaml b/ai-ml/model-train/terraform/modules/cluster/cloudbuild.yaml new file mode 100644 index 0000000000..92121b330f --- /dev/null +++ b/ai-ml/model-train/terraform/modules/cluster/cloudbuild.yaml @@ -0,0 +1,34 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_model_train_cloudbuild] +locals { + cloudbuild_email = "${data.google_project.project.number}@cloudbuild.gserviceaccount.com" +} + +module "cloudbuild-bindings" { + source = "terraform-google-modules/iam/google//modules/projects_iam" + version = "~> 7.0" + + projects = [var.project_id] + mode = "additive" + + bindings = { + "roles/compute.instanceAdmin.v1" = ["serviceAccount:${local.cloudbuild_email}"] + "roles/iam.serviceAccountUser" = ["serviceAccount:${local.cloudbuild_email}"] + "roles/iap.tunnelResourceAccessor" = ["serviceAccount:${local.cloudbuild_email}"] + } +} +# [END gke_model_train_cloudbuild] + diff --git a/ai-ml/model-train/terraform/modules/cluster/main.tf b/ai-ml/model-train/terraform/modules/cluster/main.tf new file mode 100644 index 0000000000..b4393be7cc --- /dev/null +++ b/ai-ml/model-train/terraform/modules/cluster/main.tf @@ -0,0 +1,60 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_model_train_standard_private_regional_cluster] +module "training_cluster" { + source = "terraform-google-modules/kubernetes-engine/google//modules/beta-private-cluster" + version = "~> 31.0" + project_id = var.project_id + name = "${var.cluster_prefix}-cluster" + regional = true + region = var.region + network = var.network + subnetwork = var.subnetwork + ip_range_pods = "k8s-pod-range" + ip_range_services = "k8s-service-range" + create_service_account = true + enable_private_endpoint = false + enable_private_nodes = true + master_ipv4_cidr_block = "172.16.0.0/28" + network_policy = true + logging_enabled_components = ["SYSTEM_COMPONENTS","WORKLOADS"] + monitoring_enabled_components = ["SYSTEM_COMPONENTS"] + enable_cost_allocation = true + deletion_protection = false + initial_node_count = 1 + stateful_ha = true + grant_registry_access = true + kubernetes_version = "latest" + release_channel = "RAPID" + + cluster_resource_labels = { + name = "${var.cluster_prefix}-cluster" + component = "${var.cluster_prefix}-training" + } + + monitoring_enable_managed_prometheus = true + gke_backup_agent_config = true + + node_pools = var.node_pools + node_pools_labels = var.node_pools_labels + node_pools_taints = var.node_pools_taints + gce_pd_csi_driver = true + gcs_fuse_csi_driver = true +} + +data "google_project" "project" { + project_id = var.project_id +} +# [END gke_model_train_standard_private_regional_cluster] diff --git a/ai-ml/model-train/terraform/modules/cluster/variables.tf b/ai-ml/model-train/terraform/modules/cluster/variables.tf new file mode 100644 index 0000000000..f619fdb7eb --- /dev/null +++ b/ai-ml/model-train/terraform/modules/cluster/variables.tf @@ -0,0 +1,49 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "project_id" { + description = "The project ID to host the cluster in" +} + +variable "region" { + description = "The region to host the cluster in" +} + +variable "network" { + description = "The VPC network to host the cluster in" +} + +variable "subnetwork" { + description = "The subnetwork to host the cluster in" +} + +variable "cluster_prefix" { + description = "The prefix for all cluster resources" +} + +variable "node_pools" { + type = list(map(any)) + description = "List of maps containing node pools" +} + +variable "node_pools_labels" { + type = map(map(string)) + description = "Map of maps containing node labels by node-pool name" +} + +variable "node_pools_taints" { + type = map(list(object({ key = string, value = string, effect = string }))) + description = "Map of lists containing node taints by node-pool name" +} + diff --git a/ai-ml/model-train/terraform/modules/cluster/versions.tf b/ai-ml/model-train/terraform/modules/cluster/versions.tf new file mode 100644 index 0000000000..f8bded9171 --- /dev/null +++ b/ai-ml/model-train/terraform/modules/cluster/versions.tf @@ -0,0 +1,23 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = "~> 5.0" + } + } + required_version = ">= 1.3" +} diff --git a/ai-ml/model-train/terraform/modules/network/main.tf b/ai-ml/model-train/terraform/modules/network/main.tf new file mode 100644 index 0000000000..d78c39b875 --- /dev/null +++ b/ai-ml/model-train/terraform/modules/network/main.tf @@ -0,0 +1,86 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START gke_model_train_vpc_multi_region_network] +module "gcp-network" { + source = "terraform-google-modules/network/google" + version = "~> 8.1.0" + + project_id = var.project_id + network_name = "${var.cluster_prefix}-vpc" + + subnets = [ + { + subnet_name = "${var.cluster_prefix}-private-subnet" + subnet_ip = "10.10.0.0/24" + subnet_region = var.region + subnet_private_access = true + subnet_flow_logs = "true" + } + ] + + secondary_ranges = { + ("${var.cluster_prefix}-private-subnet") = [ + { + range_name = "k8s-pod-range" + ip_cidr_range = "10.48.0.0/20" + }, + { + range_name = "k8s-service-range" + ip_cidr_range = "10.52.0.0/20" + }, + ] + } +} + +module "firewall_rules" { + source = "terraform-google-modules/network/google//modules/firewall-rules" + version = "~> 8.0" + project_id = var.project_id + network_name = module.gcp-network.network_name + + ingress_rules = [{ + name = "allow-ssh-for-cloudbuild" + description = "Allow SSH (tcp/22) from the IAP TCP forwarding range" + source_ranges = ["35.235.240.0/20"] + allow = [{ + protocol = "tcp" + ports = ["22"] + }] + deny = [] + }] +} + +output "network_name" { + value = module.gcp-network.network_name +} + +output "subnet_name" { + value = module.gcp-network.subnets_names[0] +} +# [END gke_model_train_vpc_multi_region_network] + +# [START gke_model_train_cloudnat_simple_create] +module "cloud_router" { + source = "terraform-google-modules/cloud-router/google" + version = "~> 6.0" + project = var.project_id + name = "${var.cluster_prefix}-nat-router" + network = module.gcp-network.network_name + region = var.region + nats = [{ + name = "${var.cluster_prefix}-nat" + }] +} +# [END gke_model_train_cloudnat_simple_create] diff --git a/ai-ml/model-train/terraform/modules/network/variables.tf b/ai-ml/model-train/terraform/modules/network/variables.tf new file mode 100644 index 0000000000..9126a4c71f --- /dev/null +++ b/ai-ml/model-train/terraform/modules/network/variables.tf @@ -0,0 +1,26 @@ +# Copyright 2024 Google LLC +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "project_id" { + description = "The project ID to host the cluster in" +} + +variable "region" { + description = "The region to host the cluster in" +} + +variable "cluster_prefix" { + description = "The prefix for all cluster resources" +} + diff --git a/ai-ml/model-train/terraform/modules/network/versions.tf b/ai-ml/model-train/terraform/modules/network/versions.tf new file mode 100644 index 0000000000..f8bded9171 --- /dev/null +++ b/ai-ml/model-train/terraform/modules/network/versions.tf @@ -0,0 +1,23 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = "~> 5.0" + } + } + required_version = ">= 1.3" +}