From 94ed5ae4fcc9014e42b7abcb1e77bb8ca8a4a914 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Mon, 14 Oct 2024 23:40:36 +0000 Subject: [PATCH 001/129] adding warning for pre-existing vpc subnets with private google access disabled --- modules/network/firewall-rules/main.tf | 8 ++++++++ modules/network/pre-existing-subnetwork/main.tf | 8 ++++++++ modules/network/pre-existing-vpc/main.tf | 8 ++++++++ 3 files changed, 24 insertions(+) diff --git a/modules/network/firewall-rules/main.tf b/modules/network/firewall-rules/main.tf index 322d4c2e7f..f3f24e78d3 100644 --- a/modules/network/firewall-rules/main.tf +++ b/modules/network/firewall-rules/main.tf @@ -20,6 +20,14 @@ data "google_compute_subnetwork" "subnetwork" { self_link = var.subnetwork_self_link } +# Module-level check for Private Google Access on the subnetwork +check "private_google_access_enabled_subnetwork" { + assert { + condition = data.google_compute_subnetwork.subnetwork.private_ip_google_access + error_message = "Private Google Access is disabled for subnetwork '${data.google_compute_subnetwork.subnetwork.name}'. This may cause connectivity issues for instances without external IPs trying to access Google APIs and services." + } +} + module "firewall_rule" { source = "terraform-google-modules/network/google//modules/firewall-rules" version = "~> 9.0" diff --git a/modules/network/pre-existing-subnetwork/main.tf b/modules/network/pre-existing-subnetwork/main.tf index 8042f6472a..9fb206f969 100644 --- a/modules/network/pre-existing-subnetwork/main.tf +++ b/modules/network/pre-existing-subnetwork/main.tf @@ -28,3 +28,11 @@ data "google_compute_subnetwork" "primary_subnetwork" { } } } + +# Module-level check for Private Google Access on the subnetwork +check "private_google_access_enabled_subnetwork" { + assert { + condition = data.google_compute_subnetwork.primary_subnetwork.private_ip_google_access + error_message = "Private Google Access is disabled for subnetwork '${data.google_compute_subnetwork.primary_subnetwork.name}'. This may cause connectivity issues for instances without external IPs trying to access Google APIs and services." + } +} diff --git a/modules/network/pre-existing-vpc/main.tf b/modules/network/pre-existing-vpc/main.tf index 88ccd5f93d..ed332bab72 100644 --- a/modules/network/pre-existing-vpc/main.tf +++ b/modules/network/pre-existing-vpc/main.tf @@ -43,3 +43,11 @@ data "google_compute_subnetwork" "primary_subnetwork" { } } } + +# Module-level check for Private Google Access on the subnetwork +check "private_google_access_enabled_subnetwork" { + assert { + condition = data.google_compute_subnetwork.primary_subnetwork.private_ip_google_access + error_message = "Private Google Access is disabled for subnetwork '${data.google_compute_subnetwork.primary_subnetwork.name}'. This may cause connectivity issues for instances without external IPs trying to access Google APIs and services." + } +} From 8818a46053a0ac849a1919bb62694969ac980112 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Wed, 16 Oct 2024 11:00:08 +0000 Subject: [PATCH 002/129] test commit signing --- CONTRIBUTING.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 03bfefa2d0..6bf1e0db89 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,6 +22,7 @@ use GitHub pull requests for this purpose. Consult [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more information on pull requests. + ### Standard PR Response Times Community submissions can take up to 2 weeks to be reviewed. 
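Note on the checks added in the network modules above: Terraform module-level
check blocks (available in Terraform v1.5 and later) report a failed assert as
a plan/apply-time warning rather than a hard error, so a pre-existing
subnetwork with Private Google Access disabled still deploys but surfaces the
warning text. A minimal standalone sketch of the same pattern, assuming an
existing subnetwork self link is supplied, looks roughly like this:

    data "google_compute_subnetwork" "subnetwork" {
      self_link = var.subnetwork_self_link
    }

    # Warns (does not fail the plan) when Private Google Access is disabled.
    check "private_google_access_enabled_subnetwork" {
      assert {
        condition     = data.google_compute_subnetwork.subnetwork.private_ip_google_access
        error_message = "Private Google Access is disabled for subnetwork '${data.google_compute_subnetwork.subnetwork.name}'."
      }
    }
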
From f8ae65a08b8587055ac45a7a6a75cc97d5077ab4 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Wed, 16 Oct 2024 11:06:50 +0000 Subject: [PATCH 003/129] revert test change --- CONTRIBUTING.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6bf1e0db89..03bfefa2d0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,7 +22,6 @@ use GitHub pull requests for this purpose. Consult [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more information on pull requests. - ### Standard PR Response Times Community submissions can take up to 2 weeks to be reviewed. From eccebe5a4c114aecd6386c7115b4d60dacedaecb Mon Sep 17 00:00:00 2001 From: jrossthomson Date: Wed, 16 Oct 2024 10:13:45 -0400 Subject: [PATCH 004/129] Updated Notebook to remove deprecated version. --- community/modules/compute/notebook/main.tf | 23 +++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/community/modules/compute/notebook/main.tf b/community/modules/compute/notebook/main.tf index 3652667ffb..c1ecb2a023 100644 --- a/community/modules/compute/notebook/main.tf +++ b/community/modules/compute/notebook/main.tf @@ -21,7 +21,8 @@ locals { locals { suffix = random_id.resource_name_suffix.hex - name = "${var.deployment_name}-notebook-${local.suffix}" + #name = "thenotebook" + name = "notebook-${local.suffix}" bucket = replace(var.gcs_bucket_path, "gs://", "") post_script_filename = "mount-${local.suffix}.sh" @@ -54,15 +55,23 @@ resource "google_storage_bucket_object" "mount_script" { bucket = local.bucket } -resource "google_notebooks_instance" "instance" { +output resource_name_suffix { + value = local.name +} + +resource "google_workbench_instance" "instance" { name = local.name location = var.zone - machine_type = var.machine_type project = var.project_id - post_startup_script = "${var.gcs_bucket_path}/${google_storage_bucket_object.mount_script.name}" labels = local.labels - vm_image { - project = var.instance_image.project - image_family = var.instance_image.family + gce_setup{ + machine_type = var.machine_type + metadata = { + post-startup-script = "${var.gcs_bucket_path}/${google_storage_bucket_object.mount_script.name}" + } + vm_image { + project = var.instance_image.project + family = var.instance_image.family + } } } From aaa2d8e553e5579ccada7847a494574f239d4320 Mon Sep 17 00:00:00 2001 From: jrossthomson Date: Wed, 16 Oct 2024 16:05:49 -0400 Subject: [PATCH 005/129] Update from notebook_instance to workbench_instance --- community/modules/compute/notebook/README.md | 4 ++-- community/modules/compute/notebook/main.tf | 24 ++++++++----------- .../modules/compute/notebook/variables.tf | 5 ++++ 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/community/modules/compute/notebook/README.md b/community/modules/compute/notebook/README.md index 26b726418f..ed60b9d2a8 100644 --- a/community/modules/compute/notebook/README.md +++ b/community/modules/compute/notebook/README.md @@ -66,8 +66,8 @@ No modules. 
| Name | Type | |------|------| -| [google_notebooks_instance.instance](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/notebooks_instance) | resource | | [google_storage_bucket_object.mount_script](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_workbench_instance.instance](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/workbench_instance) | resource | | [random_id.resource_name_suffix](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id) | resource | ## Inputs @@ -76,7 +76,7 @@ No modules. |------|-------------|------|---------|:--------:| | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment; used as part of name of the notebook. | `string` | n/a | yes | | [gcs\_bucket\_path](#input\_gcs\_bucket\_path) | Bucket name, can be provided from the google-cloud-storage module | `string` | `null` | no | -| [instance\_image](#input\_instance\_image) | Instance Image | `map(string)` |
{
"family": "tf-latest-cpu",
"name": null,
"project": "deeplearning-platform-release"
}
| no | +| [instance\_image](#input\_instance\_image) | Instance Image | `map(string)` |
{
"family": "tf-latest-cpu",
"name": null,
"project": "deeplearning-platform-release"
}
| no | | [labels](#input\_labels) | Labels to add to the resource Key-value pairs. | `map(string)` | n/a | yes | | [machine\_type](#input\_machine\_type) | The machine type to employ | `string` | n/a | yes | | [mount\_runner](#input\_mount\_runner) | mount content from the google-cloud-storage module | `map(string)` | n/a | yes | diff --git a/community/modules/compute/notebook/main.tf b/community/modules/compute/notebook/main.tf index c1ecb2a023..43b29c8474 100644 --- a/community/modules/compute/notebook/main.tf +++ b/community/modules/compute/notebook/main.tf @@ -20,9 +20,9 @@ locals { } locals { - suffix = random_id.resource_name_suffix.hex + suffix = random_id.resource_name_suffix.hex #name = "thenotebook" - name = "notebook-${local.suffix}" + name = "notebook-${var.deployment_name}-${local.suffix}" bucket = replace(var.gcs_bucket_path, "gs://", "") post_script_filename = "mount-${local.suffix}.sh" @@ -55,23 +55,19 @@ resource "google_storage_bucket_object" "mount_script" { bucket = local.bucket } -output resource_name_suffix { - value = local.name -} - resource "google_workbench_instance" "instance" { - name = local.name - location = var.zone - project = var.project_id - labels = local.labels - gce_setup{ - machine_type = var.machine_type + name = local.name + location = var.zone + project = var.project_id + labels = local.labels + gce_setup { + machine_type = var.machine_type metadata = { post-startup-script = "${var.gcs_bucket_path}/${google_storage_bucket_object.mount_script.name}" } vm_image { - project = var.instance_image.project - family = var.instance_image.family + project = var.instance_image.project + family = var.instance_image.family } } } diff --git a/community/modules/compute/notebook/variables.tf b/community/modules/compute/notebook/variables.tf index 5a2c803f30..21eb9518bd 100644 --- a/community/modules/compute/notebook/variables.tf +++ b/community/modules/compute/notebook/variables.tf @@ -22,6 +22,11 @@ variable "project_id" { variable "deployment_name" { description = "Name of the HPC deployment; used as part of name of the notebook." 
type = string + # notebook name can have: lowercase letters, numbers, or hyphens (-) and cannot end with a hyphen + validation { + error_message = "The notebook name uses 'deployment_name' -- can only have: lowercase letters, numbers, or hyphens" + condition = can(regex("^[a-z0-9]+(?:-[a-z0-9]+)*$", var.deployment_name)) + } } variable "zone" { From 4045c8c02888eca86da26614c8f883523e7e034d Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Tue, 22 Oct 2024 13:40:15 +0000 Subject: [PATCH 006/129] bump go version to 1.23 --- go.mod | 2 +- tools/cloud-build/provision/pr-go-build-test.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index 56808e3f4e..c6dda3d8f6 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module hpc-toolkit -go 1.21 +go 1.22 require ( cloud.google.com/go/storage v1.41.0 // indirect diff --git a/tools/cloud-build/provision/pr-go-build-test.tf b/tools/cloud-build/provision/pr-go-build-test.tf index 32da3ff0ed..b2103ecf0f 100644 --- a/tools/cloud-build/provision/pr-go-build-test.tf +++ b/tools/cloud-build/provision/pr-go-build-test.tf @@ -14,7 +14,7 @@ resource "google_cloudbuild_trigger" "pr_go_build_test" { - for_each = toset(["1.21", "1.22"]) + for_each = toset(["1.22", "1.23"]) name = "PR-Go-${replace(each.key, ".", "-")}-build-test" description = "Test that the PR builds with Go ${each.key}" From 7674060d5db7290b3826941cf0f7319f3c9ff510 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Mon, 21 Oct 2024 16:16:45 +0000 Subject: [PATCH 007/129] Initial commit for new logging output --- modules/packer/custom-image/image.pkr.hcl | 25 +++++++++++++++++++ .../igc_pkr/one/image/image.pkr.hcl | 25 +++++++++++++++++++ .../text_escape/zero/lime/image.pkr.hcl | 25 +++++++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/modules/packer/custom-image/image.pkr.hcl b/modules/packer/custom-image/image.pkr.hcl index e4f30dfb58..ed40663bdb 100644 --- a/modules/packer/custom-image/image.pkr.hcl +++ b/modules/packer/custom-image/image.pkr.hcl @@ -21,6 +21,9 @@ locals { image_name_default = "${local.image_family}-${formatdate("YYYYMMDD't'hhmmss'z'", timestamp())}" image_name = var.image_name != null ? var.image_name : local.image_name_default + # construct vm image name for use when getting logs + instance_name = "packer-${substr(uuidv4(), 0, 6)}" + # default to explicit var.communicator, otherwise in-order: ssh/winrm/none shell_script_communicator = length(var.shell_scripts) > 0 ? "ssh" : "" ansible_playbook_communicator = length(var.ansible_playbooks) > 0 ? 
"ssh" : "" @@ -96,6 +99,7 @@ source "googlecompute" "toolkit_image" { image_name = local.image_name image_family = local.image_family image_labels = local.labels + instance_name = local.instance_name machine_type = var.machine_type accelerator_type = local.accelerator_type accelerator_count = var.accelerator_count @@ -197,4 +201,25 @@ build { "echo \"Image built: $(jq -r '.builds[-1].artifact_id' ${var.manifest_file} | cut -d ':' -f2)\"", ] } + + # If there is an error during image creation, print out command for getting packer VM logs + error-cleanup-provisioner "shell-local" { + environment_vars = [ + "PRJ_ID=${var.project_id}", + "INST_NAME=${local.instance_name}", + "ZONE=${var.zone}", + ] + inline_shebang = "/bin/bash -e" + inline = [ + "type -P gcloud > /dev/null || exit 0", + "INST_ID=$(gcloud compute instances describe $INST_NAME --project $PRJ_ID --format=\"value(id)\" --zone=$ZONE)", + "echo 'Error building image try checking logs:'", + join(" ", ["echo \"gcloud logging --project $PRJ_ID read", + "'logName=(\\\"projects/$PRJ_ID/logs/GCEMetadataScripts\\\" OR \\\"projects/$PRJ_ID/logs/google_metadata_script_runner\\\") AND resource.labels.instance_id=$INST_ID'", + "--format=\\\"table(timestamp, resource.labels.instance_id, jsonPayload.message)\\\"", + "--order=asc\"" + ] + ) + ] + } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl index e4f30dfb58..ed40663bdb 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl @@ -21,6 +21,9 @@ locals { image_name_default = "${local.image_family}-${formatdate("YYYYMMDD't'hhmmss'z'", timestamp())}" image_name = var.image_name != null ? var.image_name : local.image_name_default + # construct vm image name for use when getting logs + instance_name = "packer-${substr(uuidv4(), 0, 6)}" + # default to explicit var.communicator, otherwise in-order: ssh/winrm/none shell_script_communicator = length(var.shell_scripts) > 0 ? "ssh" : "" ansible_playbook_communicator = length(var.ansible_playbooks) > 0 ? 
"ssh" : "" @@ -96,6 +99,7 @@ source "googlecompute" "toolkit_image" { image_name = local.image_name image_family = local.image_family image_labels = local.labels + instance_name = local.instance_name machine_type = var.machine_type accelerator_type = local.accelerator_type accelerator_count = var.accelerator_count @@ -197,4 +201,25 @@ build { "echo \"Image built: $(jq -r '.builds[-1].artifact_id' ${var.manifest_file} | cut -d ':' -f2)\"", ] } + + # If there is an error during image creation, print out command for getting packer VM logs + error-cleanup-provisioner "shell-local" { + environment_vars = [ + "PRJ_ID=${var.project_id}", + "INST_NAME=${local.instance_name}", + "ZONE=${var.zone}", + ] + inline_shebang = "/bin/bash -e" + inline = [ + "type -P gcloud > /dev/null || exit 0", + "INST_ID=$(gcloud compute instances describe $INST_NAME --project $PRJ_ID --format=\"value(id)\" --zone=$ZONE)", + "echo 'Error building image try checking logs:'", + join(" ", ["echo \"gcloud logging --project $PRJ_ID read", + "'logName=(\\\"projects/$PRJ_ID/logs/GCEMetadataScripts\\\" OR \\\"projects/$PRJ_ID/logs/google_metadata_script_runner\\\") AND resource.labels.instance_id=$INST_ID'", + "--format=\\\"table(timestamp, resource.labels.instance_id, jsonPayload.message)\\\"", + "--order=asc\"" + ] + ) + ] + } } diff --git a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl index e4f30dfb58..ed40663bdb 100644 --- a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl @@ -21,6 +21,9 @@ locals { image_name_default = "${local.image_family}-${formatdate("YYYYMMDD't'hhmmss'z'", timestamp())}" image_name = var.image_name != null ? var.image_name : local.image_name_default + # construct vm image name for use when getting logs + instance_name = "packer-${substr(uuidv4(), 0, 6)}" + # default to explicit var.communicator, otherwise in-order: ssh/winrm/none shell_script_communicator = length(var.shell_scripts) > 0 ? "ssh" : "" ansible_playbook_communicator = length(var.ansible_playbooks) > 0 ? 
"ssh" : "" @@ -96,6 +99,7 @@ source "googlecompute" "toolkit_image" { image_name = local.image_name image_family = local.image_family image_labels = local.labels + instance_name = local.instance_name machine_type = var.machine_type accelerator_type = local.accelerator_type accelerator_count = var.accelerator_count @@ -197,4 +201,25 @@ build { "echo \"Image built: $(jq -r '.builds[-1].artifact_id' ${var.manifest_file} | cut -d ':' -f2)\"", ] } + + # If there is an error during image creation, print out command for getting packer VM logs + error-cleanup-provisioner "shell-local" { + environment_vars = [ + "PRJ_ID=${var.project_id}", + "INST_NAME=${local.instance_name}", + "ZONE=${var.zone}", + ] + inline_shebang = "/bin/bash -e" + inline = [ + "type -P gcloud > /dev/null || exit 0", + "INST_ID=$(gcloud compute instances describe $INST_NAME --project $PRJ_ID --format=\"value(id)\" --zone=$ZONE)", + "echo 'Error building image try checking logs:'", + join(" ", ["echo \"gcloud logging --project $PRJ_ID read", + "'logName=(\\\"projects/$PRJ_ID/logs/GCEMetadataScripts\\\" OR \\\"projects/$PRJ_ID/logs/google_metadata_script_runner\\\") AND resource.labels.instance_id=$INST_ID'", + "--format=\\\"table(timestamp, resource.labels.instance_id, jsonPayload.message)\\\"", + "--order=asc\"" + ] + ) + ] + } } From d73a0458b4611254326a3f815f7cd309c19871f2 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Tue, 22 Oct 2024 14:37:09 +0000 Subject: [PATCH 008/129] reverse changes in go.mod --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index c6dda3d8f6..56808e3f4e 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module hpc-toolkit -go 1.22 +go 1.21 require ( cloud.google.com/go/storage v1.41.0 // indirect From ce62680660268476f5abb9fcd0bacf3181bbab07 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:48 +0000 Subject: [PATCH 009/129] Modify htcondor-access-point to use local embedded module path --- community/modules/scheduler/htcondor-access-point/README.md | 2 +- community/modules/scheduler/htcondor-access-point/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index a50aba0f39..d93bc24b1d 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -122,7 +122,7 @@ limitations under the License. 
|------|--------|---------| | [access\_point\_instance\_template](#module\_access\_point\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 73dc845 | | [htcondor\_ap](#module\_htcondor\_ap) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 89de24a67c..ad6f02eebf 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -183,7 +183,7 @@ resource "google_storage_bucket_object" "ap_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" project_id = var.project_id region = var.region From e7192237ecf101e9a1cbaa209d9d895c8be70dd7 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:48 +0000 Subject: [PATCH 010/129] Modify htcondor-central-manager to use local embedded module path --- community/modules/scheduler/htcondor-central-manager/README.md | 2 +- community/modules/scheduler/htcondor-central-manager/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index 3e6a33dfeb..fa98e93d05 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -106,7 +106,7 @@ limitations under the License. 
|------|--------|---------| | [central\_manager\_instance\_template](#module\_central\_manager\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | 10.1.1 | | [htcondor\_cm](#module\_htcondor\_cm) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index eec76139c2..52c0d71c4f 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -122,7 +122,7 @@ resource "google_storage_bucket_object" "cm_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" project_id = var.project_id region = var.region From 549ce44e8fa9e4f3d7373c266dc03aca581dcb0d Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:48 +0000 Subject: [PATCH 011/129] Modify htcondor-execute-point to use local embedded module path --- community/modules/compute/htcondor-execute-point/README.md | 2 +- community/modules/compute/htcondor-execute-point/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 5767904901..cb801236f1 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -212,7 +212,7 @@ limitations under the License. 
|------|--------|---------| | [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | 10.1.1 | | [mig](#module\_mig) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index c4cb0589c7..f6292430d9 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -120,7 +120,7 @@ resource "google_storage_bucket_object" "execute_config" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" project_id = var.project_id region = var.region From e1e47029b8deb364f449c32f4281c4ea2daa5176 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:48 +0000 Subject: [PATCH 012/129] Modify htcondor-service-accounts to use local embedded module path --- .../modules/scheduler/htcondor-service-accounts/README.md | 6 +++--- .../modules/scheduler/htcondor-service-accounts/main.tf | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/community/modules/scheduler/htcondor-service-accounts/README.md b/community/modules/scheduler/htcondor-service-accounts/README.md index 1246d39994..5a403c0a38 100644 --- a/community/modules/scheduler/htcondor-service-accounts/README.md +++ b/community/modules/scheduler/htcondor-service-accounts/README.md @@ -100,9 +100,9 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [access\_point\_service\_account](#module\_access\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.39.0&depth=1 | -| [central\_manager\_service\_account](#module\_central\_manager\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.39.0&depth=1 | -| [execute\_point\_service\_account](#module\_execute\_point\_service\_account) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account | v1.39.0&depth=1 | +| [access\_point\_service\_account](#module\_access\_point\_service\_account) | ../../../../community/modules/project/service-account | n/a | +| [central\_manager\_service\_account](#module\_central\_manager\_service\_account) | ../../../../community/modules/project/service-account | n/a | +| [execute\_point\_service\_account](#module\_execute\_point\_service\_account) | ../../../../community/modules/project/service-account | n/a | ## Resources diff --git a/community/modules/scheduler/htcondor-service-accounts/main.tf b/community/modules/scheduler/htcondor-service-accounts/main.tf index 9d72da114d..9d97b18642 100644 --- a/community/modules/scheduler/htcondor-service-accounts/main.tf +++ b/community/modules/scheduler/htcondor-service-accounts/main.tf @@ -21,7 +21,7 @@ # require them module "access_point_service_account" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.39.0&depth=1" + source = "../../../../community/modules/project/service-account" project_id = var.project_id display_name = "HTCondor Access Point" @@ -31,7 +31,7 @@ module "access_point_service_account" { } module "execute_point_service_account" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.39.0&depth=1" + source = "../../../../community/modules/project/service-account" project_id = var.project_id display_name = "HTCondor Execute Point" @@ -41,7 +41,7 @@ module "execute_point_service_account" { } module "central_manager_service_account" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/project/service-account?ref=v1.39.0&depth=1" + source = "../../../../community/modules/project/service-account" project_id = var.project_id display_name = "HTCondor Central Manager" From 1c630f6daf5ec5edd610a34a7e752c8b289eaf71 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:48 +0000 Subject: [PATCH 013/129] Modify htcondor-setup to use local embedded module path --- community/modules/scheduler/htcondor-setup/README.md | 4 ++-- community/modules/scheduler/htcondor-setup/main.tf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/community/modules/scheduler/htcondor-setup/README.md b/community/modules/scheduler/htcondor-setup/README.md index 254bb08241..9c46f0a3c8 100644 --- a/community/modules/scheduler/htcondor-setup/README.md +++ b/community/modules/scheduler/htcondor-setup/README.md @@ -90,8 +90,8 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [health\_check\_firewall\_rule](#module\_health\_check\_firewall\_rule) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/firewall-rules | 9e695aab | -| [htcondor\_bucket](#module\_htcondor\_bucket) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/file-system/cloud-storage-bucket/ | 9e695aab | +| [health\_check\_firewall\_rule](#module\_health\_check\_firewall\_rule) | ../../../../modules/network/firewall-rules | n/a | +| [htcondor\_bucket](#module\_htcondor\_bucket) | ../../../../community/modules/file-system/cloud-storage-bucket | n/a | ## Resources diff --git a/community/modules/scheduler/htcondor-setup/main.tf b/community/modules/scheduler/htcondor-setup/main.tf index ae4dca1b73..ad471fddad 100644 --- a/community/modules/scheduler/htcondor-setup/main.tf +++ b/community/modules/scheduler/htcondor-setup/main.tf @@ -33,7 +33,7 @@ locals { } module "health_check_firewall_rule" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/firewall-rules?ref=9e695aab" + source = "../../../../modules/network/firewall-rules" subnetwork_self_link = var.subnetwork_self_link @@ -54,7 +54,7 @@ module "health_check_firewall_rule" { } module "htcondor_bucket" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/file-system/cloud-storage-bucket/?ref=9e695aab" + source = "../../../../community/modules/file-system/cloud-storage-bucket" project_id = var.project_id deployment_name = var.deployment_name From d5a30fb158da980dbe072656203c00e291f05b87 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:48 +0000 Subject: [PATCH 014/129] Modify chrome-remote-desktop to use local embedded module path --- .../modules/remote-desktop/chrome-remote-desktop/README.md | 4 ++-- .../modules/remote-desktop/chrome-remote-desktop/main.tf | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/community/modules/remote-desktop/chrome-remote-desktop/README.md b/community/modules/remote-desktop/chrome-remote-desktop/README.md index 685204cb3e..85a942a045 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/README.md +++ b/community/modules/remote-desktop/chrome-remote-desktop/README.md @@ -63,8 +63,8 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | -| [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | +| [client\_startup\_script](#module\_client\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | +| [instances](#module\_instances) | ../../../../modules/compute/vm-instance | n/a | ## Resources diff --git a/community/modules/remote-desktop/chrome-remote-desktop/main.tf b/community/modules/remote-desktop/chrome-remote-desktop/main.tf index 5d7165a2c2..1091c2cff5 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/main.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/main.tf @@ -55,7 +55,7 @@ locals { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" deployment_name = var.deployment_name project_id = var.project_id @@ -71,7 +71,7 @@ module "client_startup_script" { } module "instances" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" + source = "../../../../modules/compute/vm-instance" instance_count = var.instance_count name_prefix = var.name_prefix From b8196ec6f0feda814f6c0d2913311ce68aba9216 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:48 +0000 Subject: [PATCH 015/129] Modify pbspro-client to use local embedded module path --- community/modules/scheduler/pbspro-client/README.md | 6 +++--- community/modules/scheduler/pbspro-client/main.tf | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/community/modules/scheduler/pbspro-client/README.md b/community/modules/scheduler/pbspro-client/README.md index cc18153ccd..edcfa9b591 100644 --- a/community/modules/scheduler/pbspro-client/README.md +++ b/community/modules/scheduler/pbspro-client/README.md @@ -74,9 +74,9 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | -| [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.39.0&depth=1 | +| [client\_startup\_script](#module\_client\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | +| [pbs\_client](#module\_pbs\_client) | ../../../../modules/compute/vm-instance | n/a | +| [pbs\_install](#module\_pbs\_install) | ../../../../community/modules/scripts/pbspro-install | n/a | ## Resources diff --git a/community/modules/scheduler/pbspro-client/main.tf b/community/modules/scheduler/pbspro-client/main.tf index fb51718b86..4da020dac1 100644 --- a/community/modules/scheduler/pbspro-client/main.tf +++ b/community/modules/scheduler/pbspro-client/main.tf @@ -32,7 +32,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.39.0&depth=1" + source = "../../../../community/modules/scripts/pbspro-install" pbs_exec = var.pbs_exec pbs_home = var.pbs_home @@ -43,7 +43,7 @@ module "pbs_install" { } module "client_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" deployment_name = var.deployment_name project_id = var.project_id @@ -57,7 +57,7 @@ module "client_startup_script" { } module "pbs_client" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" + source = "../../../../modules/compute/vm-instance" instance_count = var.instance_count spot = var.spot From d7929454fe9910cc78deb7a8bea948848f4b972e Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:48 +0000 Subject: [PATCH 016/129] Modify pbspro-execution to use local embedded module path --- community/modules/compute/pbspro-execution/README.md | 6 +++--- community/modules/compute/pbspro-execution/main.tf | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/community/modules/compute/pbspro-execution/README.md b/community/modules/compute/pbspro-execution/README.md index 757ccd1f48..4fa927cbee 100644 --- a/community/modules/compute/pbspro-execution/README.md +++ b/community/modules/compute/pbspro-execution/README.md @@ -74,9 +74,9 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | -| [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.39.0&depth=1 | +| [execution\_startup\_script](#module\_execution\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | +| [pbs\_execution](#module\_pbs\_execution) | ../../../../modules/compute/vm-instance | n/a | +| [pbs\_install](#module\_pbs\_install) | ../../scripts/pbspro-install | n/a | ## Resources diff --git a/community/modules/compute/pbspro-execution/main.tf b/community/modules/compute/pbspro-execution/main.tf index f66148e829..a87bf0863e 100644 --- a/community/modules/compute/pbspro-execution/main.tf +++ b/community/modules/compute/pbspro-execution/main.tf @@ -42,7 +42,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.39.0&depth=1" + source = "../../scripts/pbspro-install" pbs_exec = var.pbs_exec pbs_home = var.pbs_home @@ -53,7 +53,7 @@ module "pbs_install" { } module "execution_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" deployment_name = var.deployment_name project_id = var.project_id @@ -68,7 +68,7 @@ module "execution_startup_script" { } module "pbs_execution" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" + source = "../../../../modules/compute/vm-instance" instance_count = var.instance_count spot = var.spot From 1957a862ce8fcf10f3a23d8b4aa2357c5ddb2e4f Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:49 +0000 Subject: [PATCH 017/129] Modify pbspro-server to use local embedded module path --- community/modules/scheduler/pbspro-server/README.md | 8 ++++---- community/modules/scheduler/pbspro-server/main.tf | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/community/modules/scheduler/pbspro-server/README.md b/community/modules/scheduler/pbspro-server/README.md index 95bd3c74a7..53b0c51f95 100644 --- a/community/modules/scheduler/pbspro-server/README.md +++ b/community/modules/scheduler/pbspro-server/README.md @@ -69,10 +69,10 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.39.0&depth=1 | -| [pbs\_qmgr](#module\_pbs\_qmgr) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr | v1.39.0&depth=1 | -| [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | -| [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [pbs\_install](#module\_pbs\_install) | ../../../../community/modules/scripts/pbspro-install | n/a | +| [pbs\_qmgr](#module\_pbs\_qmgr) | ../../../../community/modules/scripts/pbspro-qmgr | n/a | +| [pbs\_server](#module\_pbs\_server) | ../../../../modules/compute/vm-instance | n/a | +| [server\_startup\_script](#module\_server\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scheduler/pbspro-server/main.tf b/community/modules/scheduler/pbspro-server/main.tf index 5671829ade..657cb6e86b 100644 --- a/community/modules/scheduler/pbspro-server/main.tf +++ b/community/modules/scheduler/pbspro-server/main.tf @@ -32,7 +32,7 @@ locals { } module "pbs_install" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install?ref=v1.39.0&depth=1" + source = "../../../../community/modules/scripts/pbspro-install" pbs_data_service_user = var.pbs_data_service_user pbs_exec = var.pbs_exec @@ -45,7 +45,7 @@ module "pbs_install" { } module "pbs_qmgr" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr?ref=v1.39.0&depth=1" + source = "../../../../community/modules/scripts/pbspro-qmgr" client_host_count = var.client_host_count client_hostname_prefix = var.client_hostname_prefix @@ -55,7 +55,7 @@ module "pbs_qmgr" { } module "server_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" deployment_name = var.deployment_name project_id = var.project_id @@ -70,7 +70,7 @@ module "server_startup_script" { } module "pbs_server" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" + source = "../../../../modules/compute/vm-instance" instance_count = var.instance_count spot = var.spot From d16fa3145e3129c207eab5c6bc34b89ba6288ca4 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:49 +0000 Subject: [PATCH 018/129] Modify schedmd-slurm-gcp-v6-controller to use local embedded module path --- .../modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 9f4933a1fa..a4a9f4406b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -235,7 +235,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | -| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | ../../../../modules/scripts/startup-script | n/a | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | | [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.2 | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index a5e30ea64d..e743508181 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -122,7 +122,7 @@ locals { module "daos_network_storage_scripts" { count = length(local.daos_ns) > 0 ? 1 : 0 - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" labels = local.labels project_id = var.project_id deployment_name = var.deployment_name From 4d11b7e05b2374728ed138e97023f845b75d7e2a Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:49 +0000 Subject: [PATCH 019/129] Modify ramble-execute to use local embedded module path --- community/modules/scripts/ramble-execute/README.md | 2 +- community/modules/scripts/ramble-execute/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scripts/ramble-execute/README.md b/community/modules/scripts/ramble-execute/README.md index 8b9fa844c6..55c2fc7e4e 100644 --- a/community/modules/scripts/ramble-execute/README.md +++ b/community/modules/scripts/ramble-execute/README.md @@ -77,7 +77,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scripts/ramble-execute/main.tf b/community/modules/scripts/ramble-execute/main.tf index d2470e3821..7ef0b029e3 100644 --- a/community/modules/scripts/ramble-execute/main.tf +++ b/community/modules/scripts/ramble-execute/main.tf @@ -55,7 +55,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" labels = local.labels project_id = var.project_id From 96af3689c1ff62e0fb287bdfa7ad20ab8c06e95e Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:49 +0000 Subject: [PATCH 020/129] Modify ramble-setup to use local embedded module path --- community/modules/scripts/ramble-setup/README.md | 2 +- community/modules/scripts/ramble-setup/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scripts/ramble-setup/README.md b/community/modules/scripts/ramble-setup/README.md index 09f8c5511d..9891088105 100644 --- a/community/modules/scripts/ramble-setup/README.md +++ b/community/modules/scripts/ramble-setup/README.md @@ -86,7 +86,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index 205c980c03..4389af7d33 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -97,7 +97,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" labels = local.labels project_id = var.project_id From 8d2e15391c694c41ed5464663122b6fa022e10ce Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:49 +0000 Subject: [PATCH 021/129] Modify spack-execute to use local embedded module path --- community/modules/scripts/spack-execute/README.md | 2 +- community/modules/scripts/spack-execute/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scripts/spack-execute/README.md b/community/modules/scripts/spack-execute/README.md index 1e1a8a78ee..8cbb75fb42 100644 --- a/community/modules/scripts/spack-execute/README.md +++ b/community/modules/scripts/spack-execute/README.md @@ -104,7 +104,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scripts/spack-execute/main.tf b/community/modules/scripts/spack-execute/main.tf index 6f8055cd59..04ebcf7d49 100644 --- a/community/modules/scripts/spack-execute/main.tf +++ b/community/modules/scripts/spack-execute/main.tf @@ -54,7 +54,7 @@ locals { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" labels = local.labels project_id = var.project_id From 8ad18aaf22f9760393fb248ba5db1a1ac6e1c228 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:49 +0000 Subject: [PATCH 022/129] Modify spack-setup to use local embedded module path --- community/modules/scripts/spack-setup/README.md | 2 +- community/modules/scripts/spack-setup/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md index 0b1ca7810b..68fc68ee80 100644 --- a/community/modules/scripts/spack-setup/README.md +++ b/community/modules/scripts/spack-setup/README.md @@ -340,7 +340,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf index b705ccc06c..d45f5d1be3 100644 --- a/community/modules/scripts/spack-setup/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -104,7 +104,7 @@ resource "google_storage_bucket" "bucket" { } module "startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../../../modules/scripts/startup-script" labels = local.labels project_id = var.project_id From 1821e6e95eb62b142a845df73b9e78cf621d4eef Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:49 +0000 Subject: [PATCH 023/129] Modify multivpc to use local embedded module path --- modules/network/multivpc/README.md | 2 +- modules/network/multivpc/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/network/multivpc/README.md b/modules/network/multivpc/README.md index 0c0a1811d1..605a6a3603 100644 --- a/modules/network/multivpc/README.md +++ b/modules/network/multivpc/README.md @@ -88,7 +88,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [vpcs](#module\_vpcs) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc | v1.39.0&depth=1 | +| [vpcs](#module\_vpcs) | ../vpc | n/a | ## Resources diff --git a/modules/network/multivpc/main.tf b/modules/network/multivpc/main.tf index 603ab057d4..3b04195f8a 100644 --- a/modules/network/multivpc/main.tf +++ b/modules/network/multivpc/main.tf @@ -48,7 +48,7 @@ resource "terraform_data" "global_ip_cidr_suffix" { } module "vpcs" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc?ref=v1.39.0&depth=1" + source = "../vpc" count = var.network_count From a35e7ef8e3d1cbf6d736694111690a9141427210 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:49 +0000 Subject: [PATCH 024/129] Modify batch-login-node to use local embedded module path --- modules/scheduler/batch-login-node/README.md | 2 +- modules/scheduler/batch-login-node/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/scheduler/batch-login-node/README.md b/modules/scheduler/batch-login-node/README.md index ef76446250..c20ca7dbeb 100644 --- a/modules/scheduler/batch-login-node/README.md +++ b/modules/scheduler/batch-login-node/README.md @@ -89,7 +89,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [login\_startup\_script](#module\_login\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0&depth=1 | +| [login\_startup\_script](#module\_login\_startup\_script) | ../../scripts/startup-script | n/a | ## Resources diff --git a/modules/scheduler/batch-login-node/main.tf b/modules/scheduler/batch-login-node/main.tf index b5eb8bc7dd..6f539af122 100644 --- a/modules/scheduler/batch-login-node/main.tf +++ b/modules/scheduler/batch-login-node/main.tf @@ -94,7 +94,7 @@ locals { } module "login_startup_script" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.39.0&depth=1" + source = "../../scripts/startup-script" labels = local.labels project_id = var.project_id deployment_name = var.deployment_name From 894520281b60c0b93e856d0df5d2e1d9f20bdd16 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 22 Oct 2024 20:58:49 +0000 Subject: [PATCH 025/129] Disallow local references to embedded modules The use of source = "../.." notation within the Toolkit modules makes the Toolkit a complete "package" of Terraform modules. This means the whole set of Toolkit modules must be copied starting at the root directory. However, any use of a leading ".." or "./" in the source line in our YAML blueprint, will only stage the final directory into the deployment folder. This changes causes failure whenever a locally staged copy of a module known to be embedded in the Toolkit is referenced. We anticipate retiring embedding modules in favor of fully remote modules. 
--- pkg/modulereader/resreader.go | 3 +++ pkg/sourcereader/embedded.go | 26 ++++++++++++++++++++++++++ pkg/sourcereader/embedded_test.go | 22 ++++++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/pkg/modulereader/resreader.go b/pkg/modulereader/resreader.go index 220525c4f8..ba6aed0e16 100644 --- a/pkg/modulereader/resreader.go +++ b/pkg/modulereader/resreader.go @@ -135,6 +135,9 @@ func GetModuleInfo(source string, kind string) (ModuleInfo, error) { switch { case sourcereader.IsEmbeddedPath(source) || sourcereader.IsLocalPath(source): modPath = source + if sourcereader.IsLocalPath(source) && sourcereader.LocalModuleIsEmbedded(source) { + return ModuleInfo{}, fmt.Errorf("using embedded modules with local paths is no longer supported; use embedded path and rebuild gcluster binary") + } default: pkgAddr, subDir := getter.SourceDirSubdir(source) if cachedModPath, ok := modDownloadCache[pkgAddr]; ok { diff --git a/pkg/sourcereader/embedded.go b/pkg/sourcereader/embedded.go index 5410f4ca76..3e9afd8a61 100644 --- a/pkg/sourcereader/embedded.go +++ b/pkg/sourcereader/embedded.go @@ -20,6 +20,7 @@ import ( "os" "path" "path/filepath" + "strings" ) // ModuleFS contains embedded modules (./modules) for use in building @@ -53,6 +54,31 @@ func copyFileOut(bfs BaseFS, src string, dst string) error { return nil } +func LocalModuleIsEmbedded(source string) bool { + if ModuleFS == nil { + return false + } + + if !IsLocalPath(source) { + return false + } + + pathBits := strings.SplitN(filepath.Clean(source), string(os.PathSeparator), 5) + lengthPath := len(pathBits) + if lengthPath < 3 { + return false + } + + for i := 3; i <= lengthPath; i++ { + lastBits := filepath.Join(pathBits[lengthPath-i:]...) + _, err := ModuleFS.ReadDir(lastBits) + if err == nil { + return true + } + } + return false +} + // copyDir copies an FS directory to a local path func copyDir(bfs BaseFS, source string, dest string) error { dirEntries, err := bfs.ReadDir(source) diff --git a/pkg/sourcereader/embedded_test.go b/pkg/sourcereader/embedded_test.go index 8ee22ce125..a930d65127 100644 --- a/pkg/sourcereader/embedded_test.go +++ b/pkg/sourcereader/embedded_test.go @@ -98,6 +98,28 @@ func (s *embeddedSuite) TestGetModule_Embedded(c *C) { c.Assert(err, ErrorMatches, "source is not valid: .*") } +func (s *embeddedSuite) TestLocalModuleIsEmbedded(c *C) { + { // Invalid: Cannot use embedded modules locally + found := LocalModuleIsEmbedded("./modules/network/vpc") + c.Check(found, Equals, true) + } + + { // Invalid: Cannot use embedded modules locally + found := LocalModuleIsEmbedded("../hpc-toolkit/modules/compute/../network/vpc") + c.Check(found, Equals, true) + } + + { // Valid: use non-embedded modules locally + found := LocalModuleIsEmbedded("../hpc-toolkit/modules/compute/../foo/bar") + c.Check(found, Equals, false) + } + + { // Invalid: must be a local path + found := LocalModuleIsEmbedded("modules/network/vpc") + c.Check(found, Equals, false) + } +} + func (s *embeddedSuite) TestGetModule_NilFs(c *C) { ModuleFS = nil c.Assert(s.r.GetModule("here", "there"), NotNil) From 5520c4954018c9ff9e1af4e9a992493df7ab794b Mon Sep 17 00:00:00 2001 From: J Ross Thomson <39315853+jrossthomson@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:23:27 -0400 Subject: [PATCH 026/129] Update community/modules/compute/notebook/README.md Co-authored-by: Tom Downes --- community/modules/compute/notebook/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/compute/notebook/README.md 
b/community/modules/compute/notebook/README.md index ed60b9d2a8..c173b155bb 100644 --- a/community/modules/compute/notebook/README.md +++ b/community/modules/compute/notebook/README.md @@ -76,7 +76,7 @@ No modules. |------|-------------|------|---------|:--------:| | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment; used as part of name of the notebook. | `string` | n/a | yes | | [gcs\_bucket\_path](#input\_gcs\_bucket\_path) | Bucket name, can be provided from the google-cloud-storage module | `string` | `null` | no | -| [instance\_image](#input\_instance\_image) | Instance Image | `map(string)` |
{
"family": "tf-latest-cpu",
"name": null,
"project": "deeplearning-platform-release"
}
| no | +| [instance\_image](#input\_instance\_image) | Instance Image | `map(string)` |
{
"family": "tf-latest-cpu",
"name": null,
"project": "deeplearning-platform-release"
}
| no | | [labels](#input\_labels) | Labels to add to the resource Key-value pairs. | `map(string)` | n/a | yes | | [machine\_type](#input\_machine\_type) | The machine type to employ | `string` | n/a | yes | | [mount\_runner](#input\_mount\_runner) | mount content from the google-cloud-storage module | `map(string)` | n/a | yes | From 5796cbcbee291c29488054093a808f506e5f5361 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 22 Oct 2024 22:49:28 +0000 Subject: [PATCH 027/129] Update to Slurm images 6-8 --- community/examples/AMD/hpc-amd-slurm.yaml | 2 +- community/examples/hpc-slurm-ubuntu2004.yaml | 2 +- community/examples/hpc-slurm6-apptainer.yaml | 2 +- .../schedmd-slurm-gcp-v6-nodeset-dynamic/README.md | 2 +- .../source_image_logic.tf | 8 ++++---- .../schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf | 8 ++++---- .../compute/schedmd-slurm-gcp-v6-nodeset/variables.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../schedmd-slurm-gcp-v6-controller/source_image_logic.tf | 8 ++++---- .../variables_controller_instance.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-login/README.md | 2 +- .../schedmd-slurm-gcp-v6-login/source_image_logic.tf | 8 ++++---- .../scheduler/schedmd-slurm-gcp-v6-login/variables.tf | 2 +- examples/cae/cae-slurm.yaml | 2 +- examples/hpc-enterprise-slurm.yaml | 2 +- examples/hpc-slurm-static.yaml | 2 +- examples/image-builder.yaml | 2 +- examples/ml-slurm.yaml | 2 +- .../cloud-build/daily-tests/blueprints/lustre-slurm.yaml | 4 ++-- tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml | 2 +- .../golden_copies/configs/versioned_blueprint.yaml | 2 +- .../.ghpc/artifacts/expanded_blueprint.yaml | 2 +- .../versioned_blueprint/primary/terraform.tfvars | 2 +- tools/validate_configs/test_configs/node-groups.yaml | 6 +++--- 28 files changed, 43 insertions(+), 43 deletions(-) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index 282d5b7816..211f41e9cc 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -168,7 +168,7 @@ deployment_groups: # these images must match the images used by Slurm modules below because # we are building OpenMPI with PMI support in libraries contained in # Slurm installation - family: slurm-gcp-6-7-hpc-rocky-linux-8 + family: slurm-gcp-6-8-hpc-rocky-linux-8 project: schedmd-slurm-public - id: low_cost_nodeset diff --git a/community/examples/hpc-slurm-ubuntu2004.yaml b/community/examples/hpc-slurm-ubuntu2004.yaml index 34037a1052..475f65a317 100644 --- a/community/examples/hpc-slurm-ubuntu2004.yaml +++ b/community/examples/hpc-slurm-ubuntu2004.yaml @@ -24,7 +24,7 @@ vars: slurm_image: # Please refer to the following link for the latest images: # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems - family: slurm-gcp-6-7-ubuntu-2004-lts + family: slurm-gcp-6-8-ubuntu-2004-lts project: schedmd-slurm-public instance_image_custom: true diff --git a/community/examples/hpc-slurm6-apptainer.yaml b/community/examples/hpc-slurm6-apptainer.yaml index 47e9c267aa..fba9e18f87 100644 --- a/community/examples/hpc-slurm6-apptainer.yaml +++ b/community/examples/hpc-slurm6-apptainer.yaml @@ -60,7 +60,7 @@ deployment_groups: settings: 
source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-7-hpc-rocky-linux-8 + source_image_family: slurm-gcp-6-8-hpc-rocky-linux-8 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index d251dff2af..d0a15a8234 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -104,7 +104,7 @@ modules. For support with the underlying modules, see the instructions in the | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [feature](#input\_feature) | The node feature, used to bind nodes to the nodeset. If not set, the nodeset name will be used. | `string` | `null` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-8-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf index a86c28ffc2..a4a2579989 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-7-debian-11", - "slurm-gcp-6-7-hpc-rocky-linux-8", - "slurm-gcp-6-7-ubuntu-2004-lts", - "slurm-gcp-6-7-ubuntu-2204-lts-arm64" + "slurm-gcp-6-8-debian-11", + "slurm-gcp-6-8-hpc-rocky-linux-8", + "slurm-gcp-6-8-ubuntu-2004-lts", + "slurm-gcp-6-8-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf index 5d5f71c9c0..b3e323bc68 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf @@ -68,7 +68,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-7-hpc-rocky-linux-8" + family = "slurm-gcp-6-8-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md index f0fb08ee1d..024931b2d8 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/README.md @@ -59,7 +59,7 @@ No resources. | [accelerator\_config](#input\_accelerator\_config) | Nodeset accelerator config, see https://cloud.google.com/tpu/docs/supported-tpu-configurations for details. |
object({
topology = string
version = string
})
|
{
"topology": "",
"version": ""
}
| no | | [data\_disks](#input\_data\_disks) | The data disks to include in the TPU node | `list(string)` | `[]` | no | | [disable\_public\_ips](#input\_disable\_public\_ips) | DEPRECATED: Use `enable_public_ips` instead. | `bool` | `null` | no | -| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-7-tf- | `string` | `null` | no | +| [docker\_image](#input\_docker\_image) | The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-8-tf- | `string` | `null` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set.
If setting manually, ensure a unique value across all nodesets. | `string` | n/a | yes | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf index 3302e0ea4c..0831588b83 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/variables.tf @@ -112,7 +112,7 @@ variable "data_disks" { } variable "docker_image" { - description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-7-tf-" + description = "The gcp container registry id docker image to use in the TPU vms, it defaults to gcr.io/schedmd-slurm-public/tpu:slurm-gcp-6-8-tf-" type = string default = null } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 115ac451e7..892c659dcb 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -179,7 +179,7 @@ No modules. | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-8-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_properties](#input\_instance\_properties) | Override the instance properties. Used to test features not supported by Slurm GCP,
recommended for advanced usage only.
See https://cloud.google.com/compute/docs/reference/rest/v1/regionInstances/bulkInsert
If any sub-field (e.g. scheduling) is set, it will override the values computed by
SlurmGCP and ignoring values of provided vars. | `any` | `null` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for compute nodes. | `string` | `null` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf index a86c28ffc2..a4a2579989 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-7-debian-11", - "slurm-gcp-6-7-hpc-rocky-linux-8", - "slurm-gcp-6-7-ubuntu-2004-lts", - "slurm-gcp-6-7-ubuntu-2204-lts-arm64" + "slurm-gcp-6-8-debian-11", + "slurm-gcp-6-8-hpc-rocky-linux-8", + "slurm-gcp-6-8-ubuntu-2004-lts", + "slurm-gcp-6-8-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 536659f136..282dceca50 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -88,7 +88,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-7-hpc-rocky-linux-8" + family = "slurm-gcp-6-8-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index a4a9f4406b..c09bc30350 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -301,7 +301,7 @@ limitations under the License. | [extra\_logging\_flags](#input\_extra\_logging\_flags) | The only available flag is `trace_api` | `map(bool)` | `{}` | no | | [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | `""` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-8-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for controller. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf index a86c28ffc2..a4a2579989 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-7-debian-11", - "slurm-gcp-6-7-hpc-rocky-linux-8", - "slurm-gcp-6-7-ubuntu-2004-lts", - "slurm-gcp-6-7-ubuntu-2204-lts-arm64" + "slurm-gcp-6-8-debian-11", + "slurm-gcp-6-8-hpc-rocky-linux-8", + "slurm-gcp-6-8-ubuntu-2004-lts", + "slurm-gcp-6-8-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf index 0df835e322..2d684b0e62 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf @@ -267,7 +267,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-7-hpc-rocky-linux-8" + family = "slurm-gcp-6-8-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 4ad20a6352..72a2180a12 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -100,7 +100,7 @@ No modules. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-7-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-8-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for login nodes. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf index a86c28ffc2..a4a2579989 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/source_image_logic.tf @@ -18,10 +18,10 @@ locals { # Currently supported images and projects known_project_families = { schedmd-slurm-public = [ - "slurm-gcp-6-7-debian-11", - "slurm-gcp-6-7-hpc-rocky-linux-8", - "slurm-gcp-6-7-ubuntu-2004-lts", - "slurm-gcp-6-7-ubuntu-2204-lts-arm64" + "slurm-gcp-6-8-debian-11", + "slurm-gcp-6-8-hpc-rocky-linux-8", + "slurm-gcp-6-8-ubuntu-2004-lts", + "slurm-gcp-6-8-ubuntu-2204-lts-arm64" ] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf index 2b53c8f9e5..104b9f4a33 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf @@ -325,7 +325,7 @@ variable "instance_image" { EOD type = map(string) default = { - family = "slurm-gcp-6-7-hpc-rocky-linux-8" + family = "slurm-gcp-6-8-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml index 34096a7080..b42a4e401c 100644 --- a/examples/cae/cae-slurm.yaml +++ b/examples/cae/cae-slurm.yaml @@ -40,7 +40,7 @@ vars: # for a list of valid family options with Slurm; note: the image types for the compute nodes # and the Chrome Remote Desktop (CRD) need to have the same Slurm base. 
instance_image: - family: slurm-gcp-6-7-hpc-rocky-linux-8 + family: slurm-gcp-6-8-hpc-rocky-linux-8 project: schedmd-slurm-public # Documentation for each of the modules used below can be found at diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index 69aeab57dc..ac08d1d06e 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -25,7 +25,7 @@ vars: slurm_image: # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm - family: slurm-gcp-6-7-hpc-rocky-linux-8 + family: slurm-gcp-6-8-hpc-rocky-linux-8 project: schedmd-slurm-public # If image above is changed to use custom image, then setting below must be set to true instance_image_custom: false diff --git a/examples/hpc-slurm-static.yaml b/examples/hpc-slurm-static.yaml index 07ed2a4690..847e02e713 100644 --- a/examples/hpc-slurm-static.yaml +++ b/examples/hpc-slurm-static.yaml @@ -29,7 +29,7 @@ vars: static_node_count: 2 ## Must be <= number of reserved machines ## slurm_instance_image: - family: slurm-gcp-6-7-hpc-rocky-linux-8 + family: slurm-gcp-6-8-hpc-rocky-linux-8 project: schedmd-slurm-public instance_image_custom: false # true if using custom image in lines above bandwidth_tier: gvnic_enabled diff --git a/examples/image-builder.yaml b/examples/image-builder.yaml index 715948b0dd..8c05670388 100644 --- a/examples/image-builder.yaml +++ b/examples/image-builder.yaml @@ -59,7 +59,7 @@ deployment_groups: settings: source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-7-hpc-rocky-linux-8 + source_image_family: slurm-gcp-6-8-hpc-rocky-linux-8 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size) diff --git a/examples/ml-slurm.yaml b/examples/ml-slurm.yaml index 4baaaf07ce..7860eb2daf 100644 --- a/examples/ml-slurm.yaml +++ b/examples/ml-slurm.yaml @@ -139,7 +139,7 @@ deployment_groups: omit_external_ip: false source_image_project_id: [schedmd-slurm-public] # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family - source_image_family: slurm-gcp-6-7-debian-11 + source_image_family: slurm-gcp-6-8-debian-11 # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size_gb) diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml index 44900430a7..d6ddeeadcc 100644 --- a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml +++ b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml @@ -27,7 +27,7 @@ vars: # on_host_maintenance: MIGRATE num_nodes: 1 rocky_image: - family: slurm-gcp-6-7-hpc-rocky-linux-8 + family: slurm-gcp-6-8-hpc-rocky-linux-8 project: schedmd-slurm-public deployment_groups: @@ -79,7 +79,7 @@ deployment_groups: # settings: # node_count_dynamic_max: $(vars.num_nodes) # instance_image: - # family: slurm-gcp-6-7-ubuntu-2004-lts + # family: slurm-gcp-6-8-ubuntu-2004-lts # project: schedmd-slurm-public # - id: ubuntu_partition diff --git a/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml b/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml index 
8d5e724b0b..90338aabd9 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml @@ -22,7 +22,7 @@ slurm_cluster_name: "debiv6{{ build[0:4] }}" cli_deployment_vars: network_name: "{{ network }}" - slurm_image: "{family: slurm-gcp-6-7-debian-11, project: schedmd-slurm-public}" + slurm_image: "{family: slurm-gcp-6-8-debian-11, project: schedmd-slurm-public}" region: us-west4 zone: us-west4-c diff --git a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml index 5240404a3c..711c9f72e0 100644 --- a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml +++ b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml @@ -27,7 +27,7 @@ vars: slurm_image: # Visit https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm - family: slurm-gcp-6-7-hpc-rocky-linux-8 + family: slurm-gcp-6-8-hpc-rocky-linux-8 project: schedmd-slurm-public # If image above is changed to use custom image, then setting below must be set to true instance_image_custom: false diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index 2c5e9ca64a..79bd57ecc4 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,7 +39,7 @@ vars: project_id: invalid-project region: us-central1 slurm_image: - family: slurm-gcp-6-7-hpc-rocky-linux-8 + family: slurm-gcp-6-8-hpc-rocky-linux-8 project: schedmd-slurm-public zone: us-central1-a deployment_groups: diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars index 39fad882b4..1a3c91cac2 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/terraform.tfvars @@ -30,7 +30,7 @@ project_id = "invalid-project" region = "us-central1" slurm_image = { - family = "slurm-gcp-6-7-hpc-rocky-linux-8" + family = "slurm-gcp-6-8-hpc-rocky-linux-8" project = "schedmd-slurm-public" } diff --git a/tools/validate_configs/test_configs/node-groups.yaml b/tools/validate_configs/test_configs/node-groups.yaml index cfb166cbb5..962d1e3130 100644 --- a/tools/validate_configs/test_configs/node-groups.yaml +++ b/tools/validate_configs/test_configs/node-groups.yaml @@ -64,7 +64,7 @@ deployment_groups: name: c30 machine_type: c2-standard-30 instance_image: - family: slurm-gcp-6-7-debian-11 + family: slurm-gcp-6-8-debian-11 project: schedmd-slurm-public instance_image_custom: true @@ -75,7 +75,7 @@ deployment_groups: name: c60 machine_type: c2-standard-60 instance_image: - family: slurm-gcp-6-7-hpc-rocky-linux-8 + family: slurm-gcp-6-8-hpc-rocky-linux-8 project: schedmd-slurm-public - id: nodeset_3 @@ -85,7 +85,7 @@ deployment_groups: name: cd112 machine_type: c2d-standard-112 instance_image: - family: slurm-gcp-6-7-hpc-rocky-linux-8 + family: slurm-gcp-6-8-hpc-rocky-linux-8 project: schedmd-slurm-public 
instance_image_custom: true enable_smt: true From 8f6d9b1a6b144a9e7a320641f236008506378886 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Wed, 23 Oct 2024 15:15:07 +0000 Subject: [PATCH 028/129] Bump go version 1.21 -> 1.22 --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 56808e3f4e..c6dda3d8f6 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module hpc-toolkit -go 1.21 +go 1.22 require ( cloud.google.com/go/storage v1.41.0 // indirect From 7a1e18736bdca9490477bb08b54e0f0b171091c4 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Mon, 21 Oct 2024 17:30:09 +0000 Subject: [PATCH 029/129] Remove (now) redundant print out of new image name --- modules/packer/custom-image/image.pkr.hcl | 9 --------- .../expectations/igc_pkr/one/image/image.pkr.hcl | 9 --------- .../expectations/text_escape/zero/lime/image.pkr.hcl | 9 --------- 3 files changed, 27 deletions(-) diff --git a/modules/packer/custom-image/image.pkr.hcl b/modules/packer/custom-image/image.pkr.hcl index ed40663bdb..9282cf7433 100644 --- a/modules/packer/custom-image/image.pkr.hcl +++ b/modules/packer/custom-image/image.pkr.hcl @@ -193,15 +193,6 @@ build { } } - # if the jq command is present, this will print the image name to stdout - # if jq is not present, this exits silently with code 0 - post-processor "shell-local" { - inline = [ - "command -v jq > /dev/null || exit 0", - "echo \"Image built: $(jq -r '.builds[-1].artifact_id' ${var.manifest_file} | cut -d ':' -f2)\"", - ] - } - # If there is an error during image creation, print out command for getting packer VM logs error-cleanup-provisioner "shell-local" { environment_vars = [ diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl index ed40663bdb..9282cf7433 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl @@ -193,15 +193,6 @@ build { } } - # if the jq command is present, this will print the image name to stdout - # if jq is not present, this exits silently with code 0 - post-processor "shell-local" { - inline = [ - "command -v jq > /dev/null || exit 0", - "echo \"Image built: $(jq -r '.builds[-1].artifact_id' ${var.manifest_file} | cut -d ':' -f2)\"", - ] - } - # If there is an error during image creation, print out command for getting packer VM logs error-cleanup-provisioner "shell-local" { environment_vars = [ diff --git a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl index ed40663bdb..9282cf7433 100644 --- a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl @@ -193,15 +193,6 @@ build { } } - # if the jq command is present, this will print the image name to stdout - # if jq is not present, this exits silently with code 0 - post-processor "shell-local" { - inline = [ - "command -v jq > /dev/null || exit 0", - "echo \"Image built: $(jq -r '.builds[-1].artifact_id' ${var.manifest_file} | cut -d ':' -f2)\"", - ] - } - # If there is an error during image creation, print out command for getting packer VM logs error-cleanup-provisioner "shell-local" { environment_vars = [ From 4b8af75efc0f1221539b5fab496a887881b24215 Mon 
Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 23 Oct 2024 18:15:21 +0000 Subject: [PATCH 030/129] SlurmGCP. "All or nothing" bulk insert on requests with placements --- .../modules/slurm_files/scripts/resume.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 1bc1150c58..04e8d05574 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -120,11 +120,7 @@ def per_instance_properties(node): def create_instances_request(nodes, partition_name, placement_group, job_id=None): """Call regionInstances.bulkInsert to create instances""" - assert len(nodes) > 0 - if placement_group: - assert len(nodes) <= min(PLACEMENT_MAX_CNT, BULK_INSERT_LIMIT) - else: - assert len(nodes) <= BULK_INSERT_LIMIT + assert 0 < len(nodes) <= BULK_INSERT_LIMIT # model here indicates any node that can be used to describe the rest model = next(iter(nodes)) @@ -134,8 +130,14 @@ def create_instances_request(nodes, partition_name, placement_group, job_id=None log.debug(f"create_instances_request: {model} placement: {placement_group}") body = NSDict() + body.count = len(nodes) - body.minCount = 1 + + if placement_group: + assert len(nodes) <= PLACEMENT_MAX_CNT + pass # do not set minCount to force "all or nothing" behavior + else: + body.minCount = 1 # source of instance properties body.sourceInstanceTemplate = template From 4bfc56648d897c22c81a08f0bc01277ee06afbb4 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 23 Oct 2024 22:05:25 +0000 Subject: [PATCH 031/129] Update bucket module within Slurm controller module - Support terraform-provider-google v6 --- .../modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index c09bc30350..0b17191293 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -234,7 +234,7 @@ limitations under the License. 
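The "all or nothing" behavior introduced in the resume.py change above relies on the bulkInsert API defaulting an omitted `minCount` to `count`. The sketch below illustrates only that request-body decision; the constants and the plain dict are illustrative stand-ins, not the real limits or the `NSDict` type used by resume.py.

```python
BULK_INSERT_LIMIT = 5000   # illustrative only; not the value used by slurm-gcp
PLACEMENT_MAX_CNT = 150    # illustrative only; not the value used by slurm-gcp

def bulk_insert_body(nodes, placement_group=None):
    """Build a simplified regionInstances.bulkInsert body for a set of nodes."""
    assert 0 < len(nodes) <= BULK_INSERT_LIMIT
    body = {"count": len(nodes)}
    if placement_group:
        # Leave minCount unset: bulkInsert defaults minCount to count, so the
        # request creates every node in the placement group or none of them.
        assert len(nodes) <= PLACEMENT_MAX_CNT
    else:
        # Without a placement group, partial success is acceptable.
        body["minCount"] = 1
    return body

print(bulk_insert_body(["node-0", "node-1"], placement_group="pg-0"))  # {'count': 2}
print(bulk_insert_body(["node-0", "node-1"]))                          # {'count': 2, 'minCount': 1}
```

The apparent intent is to avoid partially populated placement groups, which Slurm would otherwise have to detect and clean up after a resume attempt.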
| Name | Source | Version | |------|--------|---------| -| [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | +| [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 6.1 | | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | ../../../../modules/scripts/startup-script | n/a | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index e743508181..8fcbe78ddc 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -24,7 +24,7 @@ locals { module "bucket" { source = "terraform-google-modules/cloud-storage/google" - version = "~> 5.0" + version = "~> 6.1" count = var.create_bucket ? 1 : 0 From cb02627d1c9676552ae23ff1cad21b9003770d4c Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 24 Oct 2024 20:18:32 +0000 Subject: [PATCH 032/129] Add support for SchedulerParameters to cloud_parameters in Slurm controller module --- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/README.md | 2 +- .../modules/slurm_files/scripts/conf.py | 12 +++++------ .../slurm_files/scripts/tests/test_conf.py | 15 +++++++++++--- .../modules/slurm_files/variables.tf | 20 ++++++++++--------- .../variables.tf | 20 ++++++++++--------- 6 files changed, 42 insertions(+), 29 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index c09bc30350..d7c1a388e3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -269,7 +269,7 @@ limitations under the License. | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket.
Ignored when 'create\_bucket' is true. | `string` | `null` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool, false)
scheduler_parameters = optional(list(string))
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | | [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access.
user\_managed\_replication : The list of location and (optional) kms\_key\_name for secret |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
user_managed_replication = optional(list(object({
location = string
kms_key_name = optional(string)
})), [])
})
| `null` | no | | [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `"# no-op"` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index 3033d59f43..a20f60b730 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -66,7 +66,7 @@ No modules. | [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. | `string` | `null` | no | | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket to use. | `string` | n/a | yes | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool, false)
scheduler_parameters = optional(list(string))
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | | [cloudsql\_secret](#input\_cloudsql\_secret) | Secret URI to cloudsql secret. | `string` | `null` | no | | [compute\_startup\_scripts](#input\_compute\_startup\_scripts) | List of scripts to be run on compute VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 120ae7f1e8..35618574b9 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -95,11 +95,6 @@ def get(key, default): "enable_configless", "idle_on_node_suspend", ], - "SchedulerParameters": [ - "bf_continue", - "salloc_wait_nodes", - "ignore_prefer_validation", - ], "GresTypes": [ "gpu" if any_gpus else None, ], @@ -114,11 +109,16 @@ def get(key, default): **(comma_params if not no_comma_params else {}), "Prolog": f"{prolog_path}/*" if lkp.cfg.prolog_scripts else None, "Epilog": f"{epilog_path}/*" if lkp.cfg.epilog_scripts else None, - "SuspendProgram": f"{scripts_dir}/suspend.py", + "SchedulerParameters": get("scheduler_parameters", [ + "bf_continue", + "salloc_wait_nodes", + "ignore_prefer_validation", + ]), "ResumeProgram": f"{scripts_dir}/resume.py", "ResumeFailProgram": f"{scripts_dir}/suspend.py", "ResumeRate": get("resume_rate", 0), "ResumeTimeout": get("resume_timeout", 300), + "SuspendProgram": f"{scripts_dir}/suspend.py", "SuspendRate": get("suspend_rate", 0), "SuspendTimeout": get("suspend_timeout", 300), "TreeWidth": get("tree_width", default_tree_width), diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py index 0b25b0df58..4d9c869f2f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py @@ -92,11 +92,11 @@ def test_dict_to_conf(value: dict, want: str): LaunchParameters=enable_nss_slurm,use_interactive_step SlurmctldParameters=cloud_dns,enable_configless,idle_on_node_suspend SchedulerParameters=bf_continue,salloc_wait_nodes,ignore_prefer_validation -SuspendProgram=ukulele/suspend.py ResumeProgram=ukulele/resume.py ResumeFailProgram=ukulele/suspend.py ResumeRate=0 ResumeTimeout=300 +SuspendProgram=ukulele/suspend.py SuspendRate=0 SuspendTimeout=300 TreeWidth=128 @@ -106,6 +106,7 @@ def test_dict_to_conf(value: dict, want: str): install_dir="ukulele", cloud_parameters={ "no_comma_params": True, + "scheduler_parameters": None, "resume_rate": None, "resume_timeout": None, "suspend_rate": None, @@ -115,11 +116,12 @@ def test_dict_to_conf(value: dict, want: str): "tree_width": None, }, ), - """SuspendProgram=ukulele/suspend.py + """SchedulerParameters=bf_continue,salloc_wait_nodes,ignore_prefer_validation ResumeProgram=ukulele/resume.py ResumeFailProgram=ukulele/suspend.py ResumeRate=0 ResumeTimeout=300 +SuspendProgram=ukulele/suspend.py SuspendRate=0 SuspendTimeout=300 TreeWidth=128 @@ -129,6 +131,12 @@ def test_dict_to_conf(value: dict, want: str): install_dir="ukulele", cloud_parameters={ "no_comma_params": True, + "scheduler_parameters": [ + "bf_busy_nodes", + "bf_continue", + "ignore_prefer_validation", + "nohold_on_prolog_fail", + ], "resume_rate": 1, "resume_timeout": 2, "suspend_rate": 3, @@ -138,11 +146,12 @@ def test_dict_to_conf(value: dict, want: str): 
"tree_width": 5, }, ), - """SuspendProgram=ukulele/suspend.py + """SchedulerParameters=bf_busy_nodes,bf_continue,ignore_prefer_validation,nohold_on_prolog_fail ResumeProgram=ukulele/resume.py ResumeFailProgram=ukulele/suspend.py ResumeRate=1 ResumeTimeout=2 +SuspendProgram=ukulele/suspend.py SuspendRate=3 SuspendTimeout=4 TreeWidth=5 diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index 91026fc267..2c3ed4c63d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -316,16 +316,18 @@ variable "partitions" { variable "cloud_parameters" { description = "cloud.conf options. Default behavior defined in scripts/conf.py" type = object({ - no_comma_params = optional(bool) - resume_rate = optional(number) - resume_timeout = optional(number) - suspend_rate = optional(number) - suspend_timeout = optional(number) - topology_plugin = optional(string) - topology_param = optional(string) - tree_width = optional(number) + no_comma_params = optional(bool, false) + scheduler_parameters = optional(list(string)) + resume_rate = optional(number) + resume_timeout = optional(number) + suspend_rate = optional(number) + suspend_timeout = optional(number) + topology_plugin = optional(string) + topology_param = optional(string) + tree_width = optional(number) }) - default = {} + default = {} + nullable = false } ########## diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 95e5c20d0a..b2e98d7b2f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -406,16 +406,18 @@ EOD variable "cloud_parameters" { description = "cloud.conf options. 
Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters)" type = object({ - no_comma_params = optional(bool) - resume_rate = optional(number) - resume_timeout = optional(number) - suspend_rate = optional(number) - suspend_timeout = optional(number) - topology_plugin = optional(string) - topology_param = optional(string) - tree_width = optional(number) + no_comma_params = optional(bool, false) + scheduler_parameters = optional(list(string)) + resume_rate = optional(number) + resume_timeout = optional(number) + suspend_rate = optional(number) + suspend_timeout = optional(number) + topology_plugin = optional(string) + topology_param = optional(string) + tree_width = optional(number) }) - default = {} + default = {} + nullable = false } variable "enable_default_mounts" { From 6ea03e442942d275c93f16eb110fdfea4e3f56b9 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 24 Oct 2024 20:18:32 +0000 Subject: [PATCH 033/129] Add support for PrivateData to cloud_parameters in Slurm controller module Drop default value of "cloud" as it was made the default behavior: https://github.com/SchedMD/slurm/commit/d3ccb90 --- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/README.md | 2 +- .../modules/slurm_files/scripts/conf.py | 4 +--- .../modules/slurm_files/scripts/tests/test_conf.py | 11 ++++++++--- .../modules/slurm_files/variables.tf | 1 + .../schedmd-slurm-gcp-v6-controller/variables.tf | 1 + 6 files changed, 13 insertions(+), 8 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index d7c1a388e3..48d94836dd 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -269,7 +269,7 @@ limitations under the License. | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket.
Ignored when 'create\_bucket' is true. | `string` | `null` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool, false)
scheduler_parameters = optional(list(string))
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool, false)
private_data = optional(list(string))
scheduler_parameters = optional(list(string))
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | | [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access.
user\_managed\_replication : The list of location and (optional) kms\_key\_name for secret |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
user_managed_replication = optional(list(object({
location = string
kms_key_name = optional(string)
})), [])
})
| `null` | no | | [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `"# no-op"` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md index a20f60b730..1b1db61cc4 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/README.md @@ -66,7 +66,7 @@ No modules. | [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. | `string` | `null` | no | | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket to use. | `string` | n/a | yes | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool, false)
scheduler_parameters = optional(list(string))
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool, false)
private_data = optional(list(string))
scheduler_parameters = optional(list(string))
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | | [cloudsql\_secret](#input\_cloudsql\_secret) | Secret URI to cloudsql secret. | `string` | `null` | no | | [compute\_startup\_scripts](#input\_compute\_startup\_scripts) | List of scripts to be run on compute VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index 35618574b9..7ee06332f1 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -83,9 +83,6 @@ def get(key, default): any_dynamic = any(bool(p.partition_feature) for p in lkp.cfg.partitions.values()) comma_params = { - "PrivateData": [ - "cloud", - ], "LaunchParameters": [ "enable_nss_slurm", "use_interactive_step", @@ -109,6 +106,7 @@ def get(key, default): **(comma_params if not no_comma_params else {}), "Prolog": f"{prolog_path}/*" if lkp.cfg.prolog_scripts else None, "Epilog": f"{epilog_path}/*" if lkp.cfg.epilog_scripts else None, + "PrivateData": get("private_data", []), "SchedulerParameters": get("scheduler_parameters", [ "bf_continue", "salloc_wait_nodes", diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py index 4d9c869f2f..6585b2fcd1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py @@ -88,8 +88,7 @@ def test_dict_to_conf(value: dict, want: str): (TstCfg( install_dir="ukulele", ), - """PrivateData=cloud -LaunchParameters=enable_nss_slurm,use_interactive_step + """LaunchParameters=enable_nss_slurm,use_interactive_step SlurmctldParameters=cloud_dns,enable_configless,idle_on_node_suspend SchedulerParameters=bf_continue,salloc_wait_nodes,ignore_prefer_validation ResumeProgram=ukulele/resume.py @@ -106,6 +105,7 @@ def test_dict_to_conf(value: dict, want: str): install_dir="ukulele", cloud_parameters={ "no_comma_params": True, + "private_data": None, "scheduler_parameters": None, "resume_rate": None, "resume_timeout": None, @@ -131,6 +131,10 @@ def test_dict_to_conf(value: dict, want: str): install_dir="ukulele", cloud_parameters={ "no_comma_params": True, + "private_data": [ + "events", + "jobs", + ], "scheduler_parameters": [ "bf_busy_nodes", "bf_continue", @@ -146,7 +150,8 @@ def test_dict_to_conf(value: dict, want: str): "tree_width": 5, }, ), - """SchedulerParameters=bf_busy_nodes,bf_continue,ignore_prefer_validation,nohold_on_prolog_fail + """PrivateData=events,jobs +SchedulerParameters=bf_busy_nodes,bf_continue,ignore_prefer_validation,nohold_on_prolog_fail ResumeProgram=ukulele/resume.py ResumeFailProgram=ukulele/suspend.py ResumeRate=1 diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index 2c3ed4c63d..308a42e639 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -317,6 +317,7 @@ variable "cloud_parameters" { description = "cloud.conf options. 
Default behavior defined in scripts/conf.py" type = object({ no_comma_params = optional(bool, false) + private_data = optional(list(string)) scheduler_parameters = optional(list(string)) resume_rate = optional(number) resume_timeout = optional(number) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index b2e98d7b2f..7ecb8696cf 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -407,6 +407,7 @@ variable "cloud_parameters" { description = "cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters)" type = object({ no_comma_params = optional(bool, false) + private_data = optional(list(string)) scheduler_parameters = optional(list(string)) resume_rate = optional(number) resume_timeout = optional(number) From 237521ce024b01873a26f064191e2bb6e19c7ea1 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Thu, 24 Oct 2024 22:42:44 +0000 Subject: [PATCH 034/129] removing upper version constraint on vm-instance module, and defining guest accelerator as a block --- modules/compute/vm-instance/README.md | 8 ++++---- modules/compute/vm-instance/main.tf | 9 ++++++++- modules/compute/vm-instance/versions.tf | 4 ++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 3332be5e6a..149f472d68 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -169,16 +169,16 @@ limitations under the License. 
| Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.3.0 | -| [google](#requirement\_google) | >= 4.73.0, <6.0 | -| [google-beta](#requirement\_google-beta) | >= 4.73.0, <6.0 | +| [google](#requirement\_google) | >= 4.73.0 | +| [google-beta](#requirement\_google-beta) | >= 4.73.0 | | [null](#requirement\_null) | >= 3.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | >= 4.73.0, <6.0 | -| [google-beta](#provider\_google-beta) | >= 4.73.0, <6.0 | +| [google](#provider\_google) | >= 4.73.0 | +| [google-beta](#provider\_google-beta) | >= 4.73.0 | | [null](#provider\_null) | >= 3.0 | ## Modules diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index 01207d701f..013449acb0 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -239,7 +239,14 @@ resource "google_compute_instance" "compute_vm" { scopes = var.service_account_scopes } - guest_accelerator = local.guest_accelerator + dynamic "guest_accelerator" { + for_each = local.guest_accelerator + content { + count = guest_accelerator.value.count + type = guest_accelerator.value.type + } + } + scheduling { on_host_maintenance = local.on_host_maintenance automatic_restart = local.automatic_restart diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 228e58fe84..25592509fc 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -18,12 +18,12 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.73.0, <6.0" + version = ">= 4.73.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.73.0, <6.0" + version = ">= 4.73.0" } null = { source = "hashicorp/null" From 22fddbb3840b901d83cbad79437eee53fc6b3f03 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Fri, 25 Oct 2024 13:12:43 +0000 Subject: [PATCH 035/129] Adding tip about new logging output --- modules/packer/custom-image/README.md | 45 ++++++++++++++++++--------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/modules/packer/custom-image/README.md b/modules/packer/custom-image/README.md index 873d3b993b..40326a5dd5 100644 --- a/modules/packer/custom-image/README.md +++ b/modules/packer/custom-image/README.md @@ -210,24 +210,41 @@ to the console. For example: ==> example.googlecompute.toolkit_image: Startup script, if any, has finished running. ``` -Using the default value for \[var.scopes\]\[#input_scopes\], the output of -startup script execution will be stored in Cloud Logging. It can be examined -using the [Cloud Logging Console][logging-console] or with a -[gcloud logging read][logging-read-docs] command (substituting `<>` -with your project ID): +To monitor the progress of the startup script, first gather the name and zone +of the instance using Cloud Console or the following command: ```shell -$ gcloud logging --project <> read \ - 'logName="projects/<>/logs/GCEMetadataScripts" AND jsonPayload.message=~"^startup-script: "' \ - --format="table[box](timestamp, resource.labels.instance_id, jsonPayload.message)" --freshness 2h +gcloud compute instances list --project --filter="name~^packer" ``` -Note that this command will print **all** startup script entries within the -project within the "freshness" window **in reverse order**. You may need to -identify the instance ID of the Packer VM and filter further by that value using -`gcloud` or `grep`. 
To print the entries in the order they would have appeared -on your console, we recommend piping the output of this command to the standard -Linux utility `tac`. +This will produce a list of VMs starting with `packer` along with other +information including which zone the VMs are in. If there is more than one +Packer VM you will need to determine which is the one that you wish to monitor +using criteria such as zone or machine-type. + +Once the VM name is determined, you can either check the serial port output in +Cloud Console or by running the command: + +```shell +gcloud compute instances get-serial-port-output --port 1 --zone --project +``` + +The serial port output of the Packer VM will contain the startup script logs. +This output will only be available while Packer is running. + +### Gathering Startup Script Logs After Failure + +If the Packer image build fails, the module will produce a `gcloud` command +that prints the failed startup script output from the Packer VM. The produced +command will have the variables specified and can be used in a terminal without +modification. + +The output will look similar to: + +```shell +Error building image try checking logs: +gcloud logging --project read 'logName=("projects//logs/GCEMetadataScripts" OR "projects//logs/google_metadata_script_runner") AND resource.labels.instance_id=' --format="table(timestamp, resource.labels.instance_id, jsonPayload.message) --order=asc +``` ## License From 7ea094c366493d011c8fda0d209570aae54098b3 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Fri, 25 Oct 2024 16:08:29 +0000 Subject: [PATCH 036/129] Revert "guest_accelerator modifications" This reverts commit ca66b8c6e3e55eeb26f90e928cc9c6a43ca46acd. --- .../modules/compute/htcondor-execute-point/gpu_definition.tf | 4 ++-- .../compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf | 4 ++-- .../schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf | 4 ++-- .../compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf | 4 ++-- .../schedmd-slurm-gcp-v5-controller/gpu_definition.tf | 4 ++-- .../scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/gpu_definition.tf | 4 ++-- .../scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf | 4 ++-- modules/compute/gke-node-pool/gpu_definition.tf | 4 ++-- modules/compute/gke-node-pool/main.tf | 4 ++-- modules/compute/gke-node-pool/reservation_definitions.tf | 2 +- modules/compute/vm-instance/gpu_definition.tf | 4 ++-- modules/compute/vm-instance/main.tf | 2 +- 13 files changed, 24 insertions(+), 24 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/gpu_definition.tf b/community/modules/compute/htcondor-execute-point/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/compute/htcondor-execute-point/gpu_definition.tf +++ b/community/modules/compute/htcondor-execute-point/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + 
guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf 
b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/modules/compute/gke-node-pool/gpu_definition.tf b/modules/compute/gke-node-pool/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/modules/compute/gke-node-pool/gpu_definition.tf +++ b/modules/compute/gke-node-pool/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 02518f705a..49bee3d0fa 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -21,7 +21,7 @@ locals { locals { preattached_gpu_machine_family = contains(["a2", "a3", "g2"], local.machine_family) - has_gpu = (local.guest_accelerator != null && (length([for ga in local.guest_accelerator : ga if ga.count > 0]) > 0)) || local.preattached_gpu_machine_family + has_gpu = (local.guest_accelerator != null && length(local.guest_accelerator) > 0) || local.preattached_gpu_machine_family gpu_taint = local.has_gpu ? [{ key = "nvidia.com/gpu" value = "present" @@ -85,7 +85,7 @@ resource "google_container_node_pool" "node_pool" { image_type = var.image_type dynamic "guest_accelerator" { - for_each = { for idx, ga in local.guest_accelerator : idx => ga if ga.count > 0 } + for_each = local.guest_accelerator content { type = coalesce(guest_accelerator.value.type, try(local.generated_guest_accelerator[0].type, "")) count = coalesce(try(guest_accelerator.value.count, 0) > 0 ? guest_accelerator.value.count : try(local.generated_guest_accelerator[0].count, "0")) diff --git a/modules/compute/gke-node-pool/reservation_definitions.tf b/modules/compute/gke-node-pool/reservation_definitions.tf index a75246b185..d40cc5b01f 100644 --- a/modules/compute/gke-node-pool/reservation_definitions.tf +++ b/modules/compute/gke-node-pool/reservation_definitions.tf @@ -55,7 +55,7 @@ locals { }] nodepool_vm_properties = { "machine_type" : var.machine_type - "guest_accelerators" : { for acc in try(local.guest_accelerator, []) : (acc.count > 0 ? 
coalesce(acc.type, try(local.generated_guest_accelerator[0].type, "")) : "") => acc.count if acc.count > 0 }, + "guest_accelerators" : { for acc in try(local.guest_accelerator, []) : coalesce(acc.type, try(local.generated_guest_accelerator[0].type, "")) => coalesce(acc.count, try(local.generated_guest_accelerator[0].count, 0)) }, "local_ssds" : { "NVME" : coalesce(local.local_ssd_config.local_ssd_count_nvme_block, 0), "SCSI" : coalesce(local.local_ssd_config.local_ssd_count_ephemeral_storage, 0) diff --git a/modules/compute/vm-instance/gpu_definition.tf b/modules/compute/vm-instance/gpu_definition.tf index c6c3944332..6c5d96d286 100644 --- a/modules/compute/vm-instance/gpu_definition.tf +++ b/modules/compute/vm-instance/gpu_definition.tf @@ -47,11 +47,11 @@ locals { "g2-standard-48" = { type = "nvidia-l4", count = 4 }, "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [{ count = 0, type = "" }]) + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) # Select in priority order: # (1) var.guest_accelerator if not empty # (2) local.generated_guest_accelerator if not empty # (3) default to empty list if both are empty - guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [{ count = 0, type = "" }]) + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index 013449acb0..c639f075d6 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -39,7 +39,7 @@ locals { # compact_placement : true when placement policy is provided and collocation set; false if unset compact_placement = try(var.placement_policy.collocation, null) != null - gpu_attached = contains(["a2", "g2"], local.machine_family) || (length([for ga in local.guest_accelerator : ga if ga.count > 0]) > 0) + gpu_attached = contains(["a2", "g2"], local.machine_family) || length(local.guest_accelerator) > 0 # both of these must be false if either compact placement or preemptible/spot instances are used # automatic restart is tolerant of GPUs while on host maintenance is not From 0c294aa281b67fc1a73c2c1f137b04ac81011196 Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Wed, 16 Oct 2024 00:38:57 +0000 Subject: [PATCH 037/129] Add job duration as option for dws_flex --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset/outputs.tf | 1 + .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 3 +++ .../schedmd-slurm-gcp-v6-partition/main.tf | 1 + .../schedmd-slurm-gcp-v6-partition/outputs.tf | 5 ++++ .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/scripts/resume.py | 23 +++++++++++----- .../modules/slurm_files/scripts/util.py | 27 +++++++++++++++++++ .../variables.tf | 1 + docs/slurm-dws-flex.md | 2 +- 10 files changed, 58 insertions(+), 9 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 892c659dcb..8dcffd8fda 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -169,7 +169,7 @@ No modules. | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. 
| `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-standard"` | no | -| [dws\_flex](#input\_dws\_flex) | If set and `enabled = true`, will utilize the DWS Flex Start to provision nodes.
See: https://cloud.google.com/blog/products/compute/introducing-dynamic-workload-scheduler
Options:
- enable: Enable DWS Flex Start
- max\_run\_duration: Maximum duration in seconds for the job to run, should not exceed 1,209,600 (2 weeks).

Limitations:
- CAN NOT be used with reservations;
- CAN NOT be used with placement groups; |
object({
enabled = optional(bool, true)
max_run_duration = optional(number, 1209600) # 2 weeks
})
|
{
"enabled": false
}
| no | +| [dws\_flex](#input\_dws\_flex) | If set and `enabled = true`, will utilize the DWS Flex Start to provision nodes.
See: https://cloud.google.com/blog/products/compute/introducing-dynamic-workload-scheduler
Options:
- enable: Enable DWS Flex Start
- max\_run\_duration: Maximum duration in seconds for the job to run, should not exceed 1,209,600 (2 weeks).
- use\_job\_duration: Use the job duration to determine the max\_run\_duration; if the job duration is not set, max\_run\_duration will be used.

Limitations:
- CAN NOT be used with reservations;
- CAN NOT be used with placement groups;
- If `use_job_duration` is enabled, the nodeset can be used in "exclusive" partitions only |
object({
enabled = optional(bool, true)
max_run_duration = optional(number, 1209600) # 2 weeks
use_job_duration = optional(bool, false)
})
|
{
"enabled": false
}
| no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_maintenance\_reservation](#input\_enable\_maintenance\_reservation) | Enables slurm reservation for scheduled maintenance. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf index 671d542584..b957db13c1 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf @@ -44,6 +44,7 @@ output "nodeset" { condition = !var.enable_placement || var.node_count_static == 0 || var.node_count_dynamic_max == 0 error_message = "Cannot use placement with static and auto-scaling nodes in the same node set." } + precondition { condition = var.reservation_name == "" || !var.dws_flex.enabled error_message = "Cannot use reservations with DWS Flex." diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 282dceca50..ad0409ad11 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -520,16 +520,19 @@ variable "dws_flex" { Options: - enable: Enable DWS Flex Start - max_run_duration: Maximum duration in seconds for the job to run, should not exceed 1,209,600 (2 weeks). + - use_job_duration: Use the job duration to determine the max_run_duration, if job duration is not set, max_run_duration will be used. Limitations: - CAN NOT be used with reservations; - CAN NOT be used with placement groups; + - If `use_job_duration` is enabled nodeset can be used in "exclusive" partitions only EOD type = object({ enabled = optional(bool, true) max_run_duration = optional(number, 1209600) # 2 weeks + use_job_duration = optional(bool, false) }) default = { enabled = false diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf index e877d0865b..4d2e0eead4 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf @@ -15,6 +15,7 @@ locals { non_static_ns_with_placement = [for ns in var.nodeset : ns.nodeset_name if ns.enable_placement && ns.node_count_static == 0] use_static = [for ns in concat(var.nodeset, var.nodeset_tpu) : ns.nodeset_name if ns.node_count_static > 0] + uses_job_duration = length([for ns in var.nodeset : ns.dws_flex.use_job_duration if ns.dws_flex.use_job_duration]) > 0 ? true : false has_node = length(var.nodeset) > 0 has_dyn = length(var.nodeset_dyn) > 0 diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf index e75c6293f1..4a06593b32 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf @@ -37,6 +37,11 @@ output "partitions" { condition = sum([for b in [local.has_node, local.has_dyn, local.has_tpu] : b ? 1 : 0]) == 1 error_message = "Partition must contain exactly one type of nodeset." 
} + + precondition { + condition = !local.uses_job_duration || var.exclusive + error_message = "`use_job_duration` can only be used in exclusive partitions" + } } output "nodeset" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 23c395b34d..89352dd48e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -313,7 +313,7 @@ limitations under the License. | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, true)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, true)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
use_job_duration = bool
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 04e8d05574..31658cd96a 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -15,9 +15,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List +from typing import List, Optional import argparse import collections +from datetime import timedelta import json import logging import os @@ -57,7 +58,7 @@ BULK_INSERT_LIMIT = 5000 -def instance_properties(nodeset, model, placement_group, labels=None): +def instance_properties(nodeset:object, model:str, placement_group:Optional[str], labels:Optional[dict], job_id:Optional[int]): props = NSDict() if labels: # merge in extra labels on instance and disks @@ -99,18 +100,28 @@ def instance_properties(nodeset, model, placement_group, labels=None): props.scheduling.maintenanceInterval = nodeset.maintenance_interval if nodeset.dws_flex.enabled: - update_props_dws(props,nodeset.dws_flex) + update_props_dws(props, nodeset.dws_flex, job_id) # Override with properties explicit specified in the nodeset props.update(nodeset.get("instance_properties") or {}) return props -def update_props_dws(props:dict,dws_flex:dict) -> None: +def update_props_dws(props:object, dws_flex:object, job_id: Optional[int]) -> None: props.scheduling.onHostMaintenance = "TERMINATE" props.scheduling.instanceTerminationAction = "DELETE" - props.scheduling.maxRunDuration['seconds'] = dws_flex.max_run_duration props.reservationAffinity['consumeReservationType'] = "NO_RESERVATION" + props.scheduling.maxRunDuration['seconds'] = dws_flex_duration(dws_flex, job_id) + +def dws_flex_duration(dws_flex:object, job_id: Optional[int]) -> int: + max_duration = dws_flex.max_run_duration + if dws_flex.use_job_duration and job_id is not None and (job := lookup().job(job_id)) and job.duration: + if timedelta(seconds=30) <= job.duration <= timedelta(weeks=2): + max_duration = int(job.duration.total_seconds()) + else: + log.info("Job TimeLimit cannot be less than 30 seconds or exceed 2 weeks") + return max_duration + def per_instance_properties(node): props = NSDict() @@ -149,7 +160,7 @@ def create_instances_request(nodes, partition_name, placement_group, job_id=None ) # overwrites properties across all instances body.instanceProperties = instance_properties( - nodeset, model, placement_group, labels + nodeset, model, placement_group, labels, job_id ) # key is instance name, value overwrites properties diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 8467e300e2..68716e51bf 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -19,6 +19,7 @@ import base64 import collections from dataclasses import dataclass +from datetime import timedelta import hashlib import inspect import json @@ -1451,6 +1452,11 @@ class 
ReservationDetails: policies: List[str] # names (not URLs) of resource policies bulk_insert_name: str # name in format suitable for bulk insert (currently identical to user supplied name in long format) +@dataclass +class Job: + id: int + duration: Optional[timedelta] = None + class Lookup: """Wrapper class for cached data access""" @@ -1917,6 +1923,27 @@ def nodeset_map(self, hostnames: list): nodeset_map[self.node_nodeset_name(node)].append(node) return nodeset_map + @lru_cache + def job(self, job_id: int) -> Optional[Job]: + jobInfo = run(f"{self.scontrol} show jobid {job_id}", check=False).stdout.rstrip() + if not jobInfo: + return None + + timePattern = r"TimeLimit=(?:(\d+)-)?(\d{2}):(\d{2}):(\d{2})" + match = re.search(timePattern, jobInfo) + + if not match: + return Job(id=job_id) + + days, hours, minutes, seconds = match.groups() + job_duration = timedelta( + days=int(days) if days else 0, + hours=int(hours), + minutes=int(minutes), + seconds=int(seconds) + ) + return Job(id=job_id, duration=job_duration) + @property def etc_dir(self) -> Path: return Path(self.cfg.output_dir or slurmdirs.etc) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 7ecb8696cf..569d0d65ff 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -215,6 +215,7 @@ variable "nodeset" { dws_flex = object({ enabled = bool max_run_duration = number + use_job_duration = bool }) labels = optional(map(string), {}) machine_type = optional(string) diff --git a/docs/slurm-dws-flex.md b/docs/slurm-dws-flex.md index 8b1c38bb01..6f2d38cd2b 100644 --- a/docs/slurm-dws-flex.md +++ b/docs/slurm-dws-flex.md @@ -13,7 +13,7 @@ With Dynamic Workload Scheduler in Flex Start mode, you submit a GPU capacity re > The project needs to be allowlisted for private preview access. > Fill out the [form](https://docs.google.com/forms/d/1etaaXMW9jJUTTxfUC7TIIMttLWT5H-3Q8_3-sG6vwKk/edit). -In order to make use of DWS Flex Start mode with SlurmGCP, you must use the `dws_flex` variable in the `schedmd-slurm-gcp-v6-nodeset` module. From there you can specify the desired maximum duration (in seconds) with `max_run_duration`. See the example below: +In order to make use of DWS Flex Start mode with SlurmGCP, you must use the `dws_flex` variable in the `schedmd-slurm-gcp-v6-nodeset` module. From there you can specify the desired maximum duration (in seconds) with `max_run_duration`. You can also use `use_job_duration` which will utilize the job's `TimeLimit` within Slurm as the duration. If `use_job_duration` is enabled but `TimeLimit` is not set, it will default to `max_run_duration`. See the example below: ```yaml - id: flex_nodeset From 78f4c7ac21f4bc27bb62ad1dfb74d876a111ffce Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 25 Oct 2024 21:22:57 +0000 Subject: [PATCH 038/129] SlurmGCP. 
Relax `reservation_name` match, allow for suffix --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset/main.tf | 24 ++++++++----------- .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 8 +++---- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 8dcffd8fda..bd339c262e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -197,7 +197,7 @@ No modules. | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | -| [reservation\_name](#input\_reservation\_name) | Name of the reservation to use for VM resources, should be in one of the following formats:
- projects/PROJECT\_ID/reservations/RESERVATION\_NAME
- RESERVATION\_NAME

Must be a "SPECIFIC" reservation
Set to empty string if using no reservation or automatically-consumed reservations | `string` | `""` | no | +| [reservation\_name](#input\_reservation\_name) | Name of the reservation to use for VM resources, should be in one of the following formats:
- projects/PROJECT\_ID/reservations/RESERVATION\_NAME[/SUFF/IX]
- RESERVATION\_NAME[/SUFF/IX]

Must be a "SPECIFIC" reservation
Set to empty string if using no reservation or automatically-consumed reservations | `string` | `""` | no | | [service\_account](#input\_service\_account) | DEPRECATED: Use `service_account_email` and `service_account_scopes` instead. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to attach to the compute instances. | `string` | `null` | no | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to attach to the compute instances. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 217328277b..6e680f002b 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -130,26 +130,22 @@ data "google_compute_zones" "available" { } locals { - res_name_split = split("/", var.reservation_name) - reservation = var.reservation_name == "" ? null : ( - length(local.res_name_split) == 4 ? { - project : local.res_name_split[1], - name : local.res_name_split[3] - } : { - project : var.project_id, - name : var.reservation_name - } - ) + res_match = regex("^(?P(?Pprojects/(?P[a-z0-9-]+)/reservations/)?(?[a-z0-9-]+)(?P/[a-z0-9-]+/[a-z0-9-]+)?)?$", var.reservation_name) + + res_short_name = local.res_match.name + res_project = coalesce(local.res_match.project, var.project_id) + res_prefix = coalesce(local.res_match.prefix, "projects/${local.res_project}/reservations/") + res_suffix = local.res_match.suffix == null ? "" : local.res_match.suffix - reservation_name = local.reservation == null ? "" : "projects/${local.reservation.project}/reservations/${local.reservation.name}" + reservation_name = local.res_match.whole == null ? "" : "${local.res_prefix}${local.res_short_name}${local.res_suffix}" } # tflint-ignore: terraform_unused_declarations data "google_compute_reservation" "reservation" { - count = local.reservation != null ? 1 : 0 + count = length(local.reservation_name) > 0 ? 1 : 0 - name = local.reservation.name - project = local.reservation.project + name = local.res_short_name + project = local.res_project zone = var.zone lifecycle { diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index ad0409ad11..3bd8fc74fb 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -447,8 +447,8 @@ variable "access_config" { variable "reservation_name" { description = <<-EOD Name of the reservation to use for VM resources, should be in one of the following formats: - - projects/PROJECT_ID/reservations/RESERVATION_NAME - - RESERVATION_NAME + - projects/PROJECT_ID/reservations/RESERVATION_NAME[/SUFF/IX] + - RESERVATION_NAME[/SUFF/IX] Must be a "SPECIFIC" reservation Set to empty string if using no reservation or automatically-consumed reservations @@ -458,8 +458,8 @@ variable "reservation_name" { nullable = false validation { - condition = var.reservation_name == "" || length(regexall("^projects/[a-z0-9-]+/reservations/[a-z0-9-]+$", var.reservation_name)) > 0 || length(regexall("^[a-z0-9-]+$", var.reservation_name)) > 0 - error_message = "Reservation name must be in the format 'projects/PROJECT_ID/reservations/RESERVATION_NAME' or 'RESERVATION_NAME'." + condition = length(regexall("^((projects/([a-z0-9-]+)/reservations/)?([a-z0-9-]+)(/[a-z0-9-]+/[a-z0-9-]+)?)?$", var.reservation_name)) > 0 + error_message = "Reservation name must be either empty or in the format '[projects/PROJECT_ID/reservations/]RESERVATION_NAME[/SUFF/IX]', [...] are optional parts." 
} } From b0d5d371f94a502828c75e1148d8b56a46ad9cde Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Fri, 25 Oct 2024 23:01:19 +0000 Subject: [PATCH 039/129] Delete the new-project module --- .../modules/project/new-project/README.md | 9 +- community/modules/project/new-project/main.tf | 74 ----- .../modules/project/new-project/metadata.yaml | 22 -- .../modules/project/new-project/outputs.tf | 100 ------ .../modules/project/new-project/variables.tf | 288 ------------------ .../modules/project/new-project/versions.tf | 19 -- modules/README.md | 3 - pkg/inspect/modules_test.go | 2 +- pkg/modulereader/metadata_legacy.go | 6 - .../test_configs/new_project.yaml | 31 -- .../test_configs/test_outputs.yaml | 25 -- 11 files changed, 4 insertions(+), 575 deletions(-) delete mode 100644 community/modules/project/new-project/main.tf delete mode 100644 community/modules/project/new-project/metadata.yaml delete mode 100644 community/modules/project/new-project/outputs.tf delete mode 100644 community/modules/project/new-project/variables.tf delete mode 100644 community/modules/project/new-project/versions.tf delete mode 100644 tools/validate_configs/test_configs/new_project.yaml diff --git a/community/modules/project/new-project/README.md b/community/modules/project/new-project/README.md index 850b06b602..5e5cabe9d5 100644 --- a/community/modules/project/new-project/README.md +++ b/community/modules/project/new-project/README.md @@ -6,16 +6,13 @@ access, Service Accounts, and API enablement to follow best practices. This module is meant for use with Terraform 0.13. +**Note:** This module has been removed from the Cluster Toolkit. The upstream module (`terraform-google-project-factory`) is now the recommended way to create and manage GCP projects. + ### Example ```yaml - id: project - source: community/modules/project/new-project - settings: - project_id: test_project - folder_id: 334688113020 # random number - billing_account: "111110-M2N704-854685" # random billing number - org_id: 123456789 # random org id + source: github.com/terraform-google-modules/terraform-google-project-factory?rev=v17.0.0&depth=1 ``` This creates a new project with pre-defined project ID, a designated folder and diff --git a/community/modules/project/new-project/main.tf b/community/modules/project/new-project/main.tf deleted file mode 100644 index 5a9a611a27..0000000000 --- a/community/modules/project/new-project/main.tf +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -locals { - # This label allows for billing report tracking based on module. - labels = merge(var.labels, { ghpc_module = "new-project", ghpc_role = "project" }) -} - -locals { - name = var.name != null ? 
var.name : var.project_id -} - -module "project_factory" { - source = "terraform-google-modules/project-factory/google" - version = "~> 11.3" - - random_project_id = var.random_project_id - org_id = var.org_id - domain = var.domain - name = local.name - project_id = var.project_id - svpc_host_project_id = var.svpc_host_project_id - enable_shared_vpc_host_project = var.enable_shared_vpc_host_project - billing_account = var.billing_account - folder_id = var.folder_id - group_name = var.group_name - group_role = var.group_role - create_project_sa = var.create_project_sa - project_sa_name = var.project_sa_name - sa_role = var.sa_role - activate_apis = var.activate_apis - activate_api_identities = var.activate_api_identities - usage_bucket_name = var.usage_bucket_name - usage_bucket_prefix = var.usage_bucket_prefix - shared_vpc_subnets = var.shared_vpc_subnets - labels = local.labels - bucket_project = var.bucket_project - bucket_name = var.bucket_name - bucket_location = var.bucket_location - bucket_versioning = var.bucket_versioning - bucket_labels = var.bucket_labels - bucket_force_destroy = var.bucket_force_destroy - bucket_ula = var.bucket_ula - auto_create_network = var.auto_create_network - lien = var.lien - disable_services_on_destroy = var.disable_services_on_destroy - default_service_account = var.default_service_account - disable_dependent_services = var.disable_dependent_services - budget_amount = var.budget_amount - budget_display_name = var.budget_display_name - budget_alert_pubsub_topic = var.budget_alert_pubsub_topic - budget_monitoring_notification_channels = var.budget_monitoring_notification_channels - budget_alert_spent_percents = var.budget_alert_spent_percents - vpc_service_control_attach_enabled = var.vpc_service_control_attach_enabled - vpc_service_control_perimeter_name = var.vpc_service_control_perimeter_name - grant_services_security_admin_role = var.grant_services_security_admin_role - grant_services_network_role = var.grant_services_network_role - consumer_quotas = var.consumer_quotas - default_network_tier = var.default_network_tier - -} diff --git a/community/modules/project/new-project/metadata.yaml b/community/modules/project/new-project/metadata.yaml deleted file mode 100644 index 806241c118..0000000000 --- a/community/modules/project/new-project/metadata.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2023 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- - -spec: - requirements: - services: - - admin.googleapis.com - - cloudresourcemanager.googleapis.com - - cloudbilling.googleapis.com - - iam.googleapis.com diff --git a/community/modules/project/new-project/outputs.tf b/community/modules/project/new-project/outputs.tf deleted file mode 100644 index 07d2e038eb..0000000000 --- a/community/modules/project/new-project/outputs.tf +++ /dev/null @@ -1,100 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -output "project_name" { - value = module.project_factory.project_name - description = "Name of the project that was created" -} - -output "project_id" { - value = module.project_factory.project_id - description = "ID of the project that was created" -} - -output "project_number" { - value = module.project_factory.project_number - description = "Number of the project that was created" -} - -output "domain" { - value = module.project_factory.domain - description = "The organization's domain" -} - -output "group_email" { - value = module.project_factory.group_email - description = "The email of the G Suite group with group_name" -} - -output "service_account_id" { - value = module.project_factory.service_account_id - description = "The id of the default service account" -} - -output "service_account_display_name" { - value = module.project_factory.service_account_display_name - description = "The display name of the default service account" -} - -output "service_account_email" { - value = module.project_factory.service_account_email - description = "The email of the default service account" -} - -output "service_account_name" { - value = module.project_factory.service_account_name - description = "The fully-qualified name of the default service account" -} - -output "service_account_unique_id" { - value = module.project_factory.service_account_unique_id - description = "The unique id of the default service account" -} - -output "project_bucket_self_link" { - value = module.project_factory.project_bucket_self_link - description = "Project's bucket selfLink" -} - -output "project_bucket_url" { - value = module.project_factory.project_bucket_url - description = "Project's bucket url" -} - -output "api_s_account" { - value = module.project_factory.api_s_account - description = "API service account email" -} - -output "api_s_account_fmt" { - value = module.project_factory.api_s_account_fmt - description = "API service account email formatted for terraform use" -} - -output "enabled_apis" { - value = module.project_factory.enabled_apis - description = "Enabled APIs in the project" -} - -output "enabled_api_identities" { - value = module.project_factory.enabled_api_identities - description = "Enabled API identities in the project" -} - -output "budget_name" { - value = module.project_factory.budget_name - description = "The name of the budget if created" -} diff --git a/community/modules/project/new-project/variables.tf b/community/modules/project/new-project/variables.tf deleted file mode 100644 index 8c776feebf..0000000000 --- a/community/modules/project/new-project/variables.tf +++ /dev/null @@ -1,288 +0,0 @@ -/** - * Copyright 2022 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -variable "random_project_id" { - description = "Adds a suffix of 4 random characters to the `project_id`" - type = bool - default = false -} - -variable "org_id" { - description = "The organization ID." - type = string -} - -variable "domain" { - description = "The domain name (optional)." - type = string - default = "" -} - -variable "name" { - description = "The name for the project" - type = string - default = null -} - -variable "project_id" { - description = "The ID to give the project. If not provided, the `name` will be used." - type = string - default = "" -} - -variable "svpc_host_project_id" { - description = "The ID of the host project which hosts the shared VPC" - type = string - default = "" -} - -variable "enable_shared_vpc_host_project" { - description = "If this project is a shared VPC host project. If true, you must *not* set svpc_host_project_id variable. Default is false." - type = bool - default = false -} - -variable "billing_account" { - description = "The ID of the billing account to associate this project with" - type = string -} - -variable "folder_id" { - description = "The ID of a folder to host this project" - type = string - default = "" -} - -variable "group_name" { - description = "A group to control the project by being assigned group_role (defaults to project editor)" - type = string - default = "" -} - -variable "group_role" { - description = "The role to give the controlling group (group_name) over the project (defaults to project editor)" - type = string - default = "roles/editor" -} - -variable "create_project_sa" { - description = "Whether the default service account for the project shall be created" - type = bool - default = true -} - -variable "project_sa_name" { - description = "Default service account name for the project." - type = string - default = "project-service-account" -} - -variable "sa_role" { - description = "A role to give the default Service Account for the project (defaults to none)" - type = string - default = "" -} - -variable "activate_apis" { - description = "The list of apis to activate within the project" - type = list(string) - default = [ - "compute.googleapis.com", - "serviceusage.googleapis.com", - "storage.googleapis.com", - ] -} - -variable "activate_api_identities" { - description = < Date: Mon, 28 Oct 2024 18:14:01 +0000 Subject: [PATCH 040/129] Addressing PR comments --- modules/packer/custom-image/README.md | 40 ++++++++------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/modules/packer/custom-image/README.md b/modules/packer/custom-image/README.md index 40326a5dd5..d31c0eef1c 100644 --- a/modules/packer/custom-image/README.md +++ b/modules/packer/custom-image/README.md @@ -210,42 +210,26 @@ to the console. For example: ==> example.googlecompute.toolkit_image: Startup script, if any, has finished running. 
 ```
 
-To monitor the progress of the startup script, first gather the name and zone
-of the instance using Cloud Console or the following command:
+### Viewing Startup Script Logs
 
-```shell
-gcloud compute instances list --project  --filter="name~^packer"
-```
-
-This will produce a list of VMs starting with `packer` along with other
-information including which zone the VMs are in. If there is more than one
-Packer VM you will need to determine which is the one that you wish to monitor
-using criteria such as zone or machine-type.
-
-Once the VM name is determined, you can either check the serial port output in
-Cloud Console or by running the command:
+The recommended method for debugging the image build process is to use Cloud
+Logging. This can be done by either searching for the VM instance in the Cloud
+Console or using the following template command with the variables `PROJECT_ID`
+(e.g. `test_project_1`) and `INSTANCE_ID` (note: unique numerical id, not
+instance name) specified:
 
 ```shell
-gcloud compute instances get-serial-port-output --port 1 --zone  --project 
+gcloud logging --project  read 'logName=("projects//logs/GCEMetadataScripts" OR "projects//logs/google_metadata_script_runner") AND resource.labels.instance_id=' --format="table(timestamp, resource.labels.instance_id, jsonPayload.message)" --order=asc
 ```
 
-The serial port output of the Packer VM will contain the startup script logs.
-This output will only be available while Packer is running.
-
-### Gathering Startup Script Logs After Failure
+> [!NOTE]
+> There can be a delay in the propagation of the logs from the instance to
+> Cloud Logging, so it may require waiting a few minutes to see the full logs.
 
-If the Packer image build fails, the module will produce a `gcloud` command
-that prints the failed startup script output from the Packer VM. The produced
-command will have the variables specified and can be used in a terminal without
+If the Packer image build fails, the module will output the command above
+with the variables specified, and it can be used in a terminal without
 modification.
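+
+For example, with the illustrative placeholder values `my-project` for
+`PROJECT_ID` and `1234567890123456789` for `INSTANCE_ID` (substitute your own
+project and instance ID), a filled-in command might look like the following;
+note that the `--format` expression must stay quoted as a single argument:
+
+```shell
+# The project and instance ID below are placeholders; replace them with your own.
+gcloud logging read \
+  'logName=("projects/my-project/logs/GCEMetadataScripts" OR "projects/my-project/logs/google_metadata_script_runner") AND resource.labels.instance_id="1234567890123456789"' \
+  --project=my-project \
+  --format="table(timestamp, resource.labels.instance_id, jsonPayload.message)" \
+  --order=asc
+```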
-The output will look similar to: - -```shell -Error building image try checking logs: -gcloud logging --project read 'logName=("projects//logs/GCEMetadataScripts" OR "projects//logs/google_metadata_script_runner") AND resource.labels.instance_id=' --format="table(timestamp, resource.labels.instance_id, jsonPayload.message) --order=asc -``` - ## License Copyright 2022 Google LLC From 49de14e80ad21596205a67f67448bda51797e31a Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Mon, 28 Oct 2024 19:56:04 +0000 Subject: [PATCH 041/129] Update MTU for a3 mega in GKE --- examples/gke-a3-megagpu.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu.yaml index 30edb3974c..1198b520c0 100644 --- a/examples/gke-a3-megagpu.yaml +++ b/examples/gke-a3-megagpu.yaml @@ -33,6 +33,7 @@ deployment_groups: source: modules/network/vpc settings: subnetwork_name: gke-subnet-a3-mega + mtu: 8244 secondary_ranges: gke-subnet-a3-mega: - range_name: pods @@ -59,6 +60,7 @@ deployment_groups: global_ip_address_range: 192.169.0.0/16 network_count: 8 subnetwork_cidr_suffix: 24 + mtu: 8244 - id: gke_cluster source: modules/scheduler/gke-cluster From f44a1d2f8166f23c3b1884a94386c270610a0ab5 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Mon, 28 Oct 2024 20:26:13 +0000 Subject: [PATCH 042/129] Delete Daos Example Blueprints and updating readme --- community/examples/intel/README.md | 454 ------------------ community/examples/intel/hpc-slurm-daos.yaml | 188 -------- community/examples/intel/pfs-daos.yaml | 109 ----- .../modules/file-system/Intel-DAOS/README.md | 66 +-- 4 files changed, 1 insertion(+), 816 deletions(-) delete mode 100644 community/examples/intel/README.md delete mode 100644 community/examples/intel/hpc-slurm-daos.yaml delete mode 100644 community/examples/intel/pfs-daos.yaml diff --git a/community/examples/intel/README.md b/community/examples/intel/README.md deleted file mode 100644 index e83bd27391..0000000000 --- a/community/examples/intel/README.md +++ /dev/null @@ -1,454 +0,0 @@ -# Intel Solutions for the Cluster Toolkit (formerly HPC Toolkit) - -> **_NOTE:_** The [hpc-slurm-daos.yaml](hpc-slurm-daos.yaml) will not be compatible -> for newer version of slurm-gcp v6. 
- - - - -- [Intel Solutions for the Cluster Toolkit](#intel-solutions-for-the-cluster-toolkit) - - [DAOS Cluster](#daos-cluster) - - [Initial Setup for DAOS Cluster](#initial-setup-for-daos-cluster) - - [Deploy the DAOS Cluster](#deploy-the-daos-cluster) - - [Connect to a client node](#connect-to-a-client-node) - - [Verify the DAOS storage system](#verify-the-daos-storage-system) - - [Create a DAOS Pool and Container](#create-a-daos-pool-and-container) - - [About the DAOS Command Line Tools](#about-the-daos-command-line-tools) - - [View Free Space](#view-free-space) - - [Create a Pool](#create-a-pool) - - [Create a Container](#create-a-container) - - [Mount the DAOS Container](#mount-the-daos-container) - - [Use DAOS Storage](#use-daos-storage) - - [Unmount the DAOS Container](#unmount-the-daos-container) - - [Delete the DAOS infrastructure when not in use](#delete-the-daos-infrastructure-when-not-in-use) - - [DAOS Server with Slurm cluster](#daos-server-with-slurm-cluster) - - [Initial Setup for the DAOS/Slurm cluster](#initial-setup-for-the-daosslurm-cluster) - - [Deploy the DAOS/Slurm Cluster](#deploy-the-daosslurm-cluster) - - [Connect to the DAOS/Slurm Cluster login node](#connect-to-the-daosslurm-cluster-login-node) - - [Create and Mount a DAOS Container](#create-and-mount-a-daos-container) - - [Run a Job that uses the DAOS Container](#run-a-job-that-uses-the-daos-container) - - [Unmount the Container](#unmount-the-container) - - [Delete the DAOS/Slurm Cluster infrastructure when not in use](#delete-the-daosslurm-cluster-infrastructure-when-not-in-use) - -## DAOS Cluster - -The [pfs-daos.yaml](pfs-daos.yaml) blueprint describes an environment with -- Two DAOS server instances -- Two DAOS client instances - -The [pfs-daos.yaml](pfs-daos.yaml) blueprint uses a Packer template and -Terraform modules from the [Google Cloud DAOS][google-cloud-daos] repository. -Please review the [introduction to image building](../../../docs/image-building.md) -for general information on building custom images using the Toolkit. - -Identify a project to work in and substitute its unique id wherever you see -`<>` in the instructions below. - -[google-cloud-daos]: https://github.com/daos-stack/google-cloud-daos -[pre-deployment_guide]: https://github.com/daos-stack/google-cloud-daos/blob/main/docs/pre-deployment_guide.md -[DAOS Yum Repository]: https://packages.daos.io - -### Initial Setup for DAOS Cluster - -Before provisioning the DAOS cluster you must follow the steps listed in the [Google Cloud DAOS Pre-deployment Guide][pre-deployment_guide]. - -Skip the "Build DAOS Images" step at the end of the [Pre-deployment Guide][pre-deployment_guide]. The [pfs-daos.yaml](pfs-daos.yaml) blueprint will build the images as part of the deployment. - -The Pre-deployment Guide provides instructions for: -- installing the Google Cloud CLI -- enabling service accounts -- enabling APIs -- establishing minimum resource quotas -- creating a Cloud NAT to allow instances without public IPs to access the [DAOS Yum Repository] repository. - -### Deploy the DAOS Cluster - -After completing the steps in the [Pre-deployment Guide][pre-deployment_guide] use `gcluster` to provision the blueprint - -```text -gcluster create community/examples/intel/pfs-daos.yaml \ - --vars project_id=<> \ - [--backend-config bucket=] -``` - -This will create the deployment directory containing Terraform modules and -Packer templates. The `--backend-config` option is not required but recommended. 
-It will save the terraform state in a pre-existing [Google Cloud Storage -bucket][bucket]. For more information see [Setting up a remote terraform -state][backend]. Use `gcluster deploy` to provision your DAOS storage cluster: - -```text -gcluster deploy pfs-daos --auto-approve -``` - -[backend]: ../../../examples/README.md#optional-setting-up-a-remote-terraform-state -[bucket]: https://cloud.google.com/storage/docs/creating-buckets - -### Connect to a client node - -1. Open the following URL in a new tab. - - https://console.cloud.google.com/compute - - This will take you to **Compute Engine > VM instances** in the Google Cloud Console. - - Select the project in which the DAOS cluster will be provisioned. - -2. Click on the **SSH** button associated with the **daos-client-0001** - instance to open a window with a terminal into the first DAOS client instance. - -### Verify the DAOS storage system - -The `community/examples/intel/pfs-daos.yaml` blueprint does not contain configuration for DAOS pools and containers. Therefore, pools and containers will need to be created manually. - -Before pools and containers can be created the storage system must be formatted. Formatting the storage is done automatically by the startup script that runs on the *daos-server-0001* instance. The startup script will run the [dmg storage format](https://docs.daos.io/v2.4/admin/deployment/?h=dmg+storage#storage-formatting) command. It may take a few minutes for all daos server instances to join. - -Verify that the storage system has been formatted and that the daos-server instances have joined. - -```bash -sudo dmg system query -v -``` - -The command will not return output until the system is ready. - -The output will look similar to - -```text -Rank UUID Control Address Fault Domain State Reason ----- ---- --------------- ------------ ----- ------ -0 225a0a51-d4ed-4ac3-b1a5-04b31c08b559 10.128.0.51:10001 /daos-server-0001 Joined -1 553ab1dc-99af-460e-a57c-3350611d1d09 10.128.0.43:10001 /daos-server-0002 Joined -``` - -Both daos-server instances should show a state of *Joined*. - -### Create a DAOS Pool and Container - -#### About the DAOS Command Line Tools - -The DAOS Management tool `dmg` is used by System Administrators to manage the DAOS storage [system](https://docs.daos.io/v2.4/overview/architecture/#daos-system) and DAOS [pools](https://docs.daos.io/v2.4/overview/storage/#daos-pool). Therefore, `sudo` must be used when running `dmg`. - -The DAOS CLI `daos` is used by both users and System Administrators to create and manage [containers](https://docs.daos.io/v2.4/overview/storage/#daos-container). It is not necessary to use `sudo` with the `daos` command. - -#### View Free Space - -View how much free space is available. - -```bash -sudo dmg storage query usage -``` - -#### Create a Pool - -Create a single pool owned by root which uses 100% of the available free space. - -```bash -sudo dmg pool create --size=100% --user=root pool1 -``` - -Set ACLs to allow any user to create a container in *pool1*. - -```bash -sudo dmg pool update-acl -e A::EVERYONE@:rcta pool1 -``` - -See the [Pool Operations](https://docs.daos.io/v2.4/admin/pool_operations) section of the DAOS Administration Guide for more information about creating pools. - -#### Create a Container - -At this point it is necessary to determine who will need to access the container -and how it will be used. The ACLs will need to be set properly to allow users and/or groups to access the container. 
- -For the purpose of this demo create the container without specifying ACLs. The container will be owned by your user account and you will have full access to the container. - -```bash -daos container create --type=POSIX --properties=rf:0 pool1 cont1 -``` - -See the [Container Management](https://docs.daos.io/v2.4/user/container) section of the DAOS User Guide for more information about creating containers. - -#### Mount the DAOS Container - -Mount the container with dfuse (DAOS Fuse) - -```bash -mkdir -p "${HOME}/daos/cont1" -dfuse --singlethread --pool=pool1 --container=cont1 --mountpoint="${HOME}/daos/cont1" -``` - -Verify that the container is mounted - -```bash -df -h -t fuse.daos -``` - -### Use DAOS Storage - -The `cont1` container is now mounted on `${HOME}/daos/cont1` - -Create a 20GiB file which will be stored in the DAOS filesystem. - -```bash -time LD_PRELOAD=/usr/lib64/libioil.so \ -dd if=/dev/zero of="${HOME}/daos/cont1/test20GiB.img" iflag=fullblock bs=1G count=20 -``` - -**Known Issue:** - -When you run `ls -lh "${HOME}/daos/cont1"` you may see that the `test20GiB.img` file shows a size of 0 bytes. - -If you unmount the container and mount it again, the file size will show as 20G. - -```bash -fusermount3 -u "${HOME}/daos/cont1" -dfuse --singlethread --pool=pool1 --container=cont1 --mountpoint="${HOME}/daos/cont1" -ls -lh "${HOME}/daos/cont1" -``` - -A work-around for this issue to disable caching when mounting the container. - -```bash -dfuse --singlethread --disable-caching --pool=pool1 --container=cont1 --mountpoint="${HOME}/daos/cont1" -``` - -See the [File System](https://docs.daos.io/v2.4/user/filesystem/) section of the DAOS User Guide for more information about DFuse. - -### Unmount the DAOS Container - -The container will need to be unmounted before you log out. If this is not done it can leave open file handles and prevent the container from being mounted when you log in again. - -Verify that the container is unmounted - -```bash -df -h -t fuse.daos -``` - -Logout of the DAOS client instance. - -```bash -logout -``` - -See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#dfuse-daos-fuse) section of the DAOS User Guide for more information about mounting POSIX containers. - -### Delete the DAOS infrastructure when not in use - -> **_NOTE:_** Data stored in the DAOS container will be permanently lost after cluster deletion. - -Delete the remaining infrastructure - -```bash -gcluster destroy pfs-daos --auto-approve -``` - -## DAOS Server with Slurm cluster - -The [hpc-slurm-daos.yaml](hpc-slurm-daos.yaml) blueprint can be used to deploy a Slurm cluster and four DAOS server instances. The Slurm compute instances are configured as DAOS clients. - -The blueprint uses modules from -- [google-cloud-daos][google-cloud-daos] -- [community/modules/compute/schedmd-slurm-gcp-v6-nodeset][schedmd-slurm-gcp-v6-nodeset] -- [community/modules/compute/schedmd-slurm-gcp-v6-partition][schedmd-slurm-gcp-v6-partition] -- [community/modules/scheduler/schedmd-slurm-gcp-v6-login][schedmd-slurm-gcp-v6-login] -- [community/modules/scheduler/schedmd-slurm-gcp-v6-controller][schedmd-slurm-gcp-v6-controller] - -The blueprint also uses a Packer template from the [Google Cloud -DAOS][google-cloud-daos] repository. Please review the [introduction to image -building](../../../docs/image-building.md) for general information on building -custom images using the Toolkit. - -Substitute your project ID wherever you see `<>` in the instructions below. 
- -### Initial Setup for the DAOS/Slurm cluster - -Before provisioning the DAOS cluster you must follow the steps listed in the [Google Cloud DAOS Pre-deployment Guide][pre-deployment_guide]. - -Skip the "Build DAOS Images" step at the end of the [Pre-deployment Guide][pre-deployment_guide]. The [hpc-slurm-daos.yaml](hpc-slurm-daos.yaml) blueprint will build the DAOS server image as part of the deployment. - -The [Pre-deployment Guide][pre-deployment_guide] provides instructions for enabling service accounts, APIs, establishing minimum resource quotas and other necessary steps to prepare your project for DAOS server deployment. - -[google-cloud-daos]: https://github.com/daos-stack/google-cloud-daos -[pre-deployment_guide]: https://github.com/daos-stack/google-cloud-daos/blob/main/docs/pre-deployment_guide.md -[packer-template]: https://github.com/daos-stack/google-cloud-daos/blob/main/images/daos.pkr.hcl -[apis]: ../../../README.md#enable-gcp-apis -[schedmd-slurm-gcp-v6-nodeset]: ../../modules/compute/schedmd-slurm-gcp-v6-nodeset -[schedmd-slurm-gcp-v6-partition]: ../../modules/compute/schedmd-slurm-gcp-v6-partition -[schedmd-slurm-gcp-v6-controller]: ../../modules/scheduler/schedmd-slurm-gcp-v6-controller -[schedmd-slurm-gcp-v6-login]: ../../modules/scheduler/schedmd-slurm-gcp-v6-login - -Follow the Toolkit guidance to enable [APIs][apis] and establish minimum resource [quotas][quotas] for Slurm. - -[apis]: ../../../README.md#enable-gcp-apis -[quotas]: ../../../README.md#gcp-quotas - -The following available quota is required in the region used by Slurm: - -- Filestore: 2560GB -- C2 CPUs: 6000 (fully-scaled "compute" partition) - - This quota is not necessary at initial deployment, but will be required to - successfully scale the partition to its maximum size -- C2 CPUs: 4 (login node) - -### Deploy the DAOS/Slurm Cluster - -Use `gcluster` to provision the blueprint, supplying your project ID - -```text -gcluster create community/examples/intel/hpc-slurm-daos.yaml \ - --vars project_id=<> \ - [--backend-config bucket=] -``` - -This will create a set of directories containing Terraform modules and Packer -templates. - -The `--backend-config` option is not required but recommended. It will save the terraform state in a pre-existing [Google Cloud Storage bucket][bucket]. For more information see [Setting up a remote terraform state][backend]. - -Follow `gcluster` instructions to deploy the environment - -```text -gcluster deploy hpc-slurm-daos --auto-approve -``` - -[backend]: ../../../examples/README.md#optional-setting-up-a-remote-terraform-state -[bucket]: https://cloud.google.com/storage/docs/creating-buckets - -### Connect to the DAOS/Slurm Cluster login node - -Once the startup script has completed and Slurm reports readiness, connect to the login node. - -1. Open the following URL in a new tab. - - https://console.cloud.google.com/compute - - This will take you to **Compute Engine > VM instances** in the Google Cloud Console - - Select the project in which the cluster will be provisionsd. - -2. Click on the `SSH` button associated with the `hpcslurmda-login-login-001` - instance. - - This will open a separate pop up window with a terminal into our newly created - Slurm login VM. - -### Create and Mount a DAOS Container - -The [community/examples/intel/hpc-slurm-daos.yaml](hpc-slurm-daos.yaml) blueprint defines a single DAOS pool named `pool1`. The pool will be created when the *daos-server* instances are provisioned. 
- -You will need to create your own DAOS container in the pool that can be used by your Slurm jobs. - -While logged into the login node create a container named `cont1` in the `pool1` pool: - -```bash -daos cont create --type=POSIX --properties=rf:0 pool1 cont1 -``` - -NOTE: If you encounter an error `daos: command not found`, it's likely that the startup scripts have not finished running yet. Wait a few minutes and try again. - -Since the `cont1` container is owned by your account, your Slurm jobs will need to run as your user account to access the container. - -Create a mount point for the container and mount it with dfuse (DAOS Fuse) - -```bash -mkdir -p ${HOME}/daos/cont1 - -dfuse --singlethread \ ---pool=pool1 \ ---container=cont1 \ ---mountpoint=${HOME}/daos/cont1 -``` - -Verify that the container is mounted - -```bash -df -h -t fuse.daos -``` - -### Run a Job that uses the DAOS Container - -On the login node create a `daos_job.sh` file with the following content - -```bash -#!/bin/bash -JOB_HOSTNAME="$(hostname)" -TIMESTAMP="$(date '+%Y%m%d%H%M%S')" - -echo "Timestamp = ${TIMESTAMP}" -echo "Date = $(date)" -echo "Hostname = $(hostname)" -echo "User = $(whoami)" -echo "Working Directory = $(pwd)" -echo "" -echo "Number of Nodes Allocated = $SLURM_JOB_NUM_NODES" -echo "Number of Tasks Allocated = $SLURM_NTASKS" - -MOUNT_DIR="${HOME}/daos/cont1" -LOG_FILE="${MOUNT_DIR}/${JOB_HOSTNAME}.log" - -echo "${JOB_HOSTNAME} : Creating directory: ${MOUNT_DIR}" -mkdir -p "${MOUNT_DIR}" - -echo "${JOB_HOSTNAME} : Mounting with dfuse" -dfuse --singlethread --pool=pool1 --container=cont1 --mountpoint="${MOUNT_DIR}" -sleep 5 - -echo "${JOB_HOSTNAME} : Creating log file" -echo "Job ${SLURM_JOB_ID} running on ${JOB_HOSTNAME}" | tee "${MOUNT_DIR}/${TIMESTAMP}_${JOB_HOSTNAME}.log" - -echo "${JOB_HOSTNAME} : Unmounting dfuse" -fusermount3 -u "${MOUNT_DIR}" - -``` - -Run the `daos_job.sh` script in an interactive Slurm job on 4 nodes - -```bash -srun --nodes=4 \ - --ntasks-per-node=1 \ - --time=00:10:00 \ - --job-name=daos \ - --output=srunjob_%j.log \ - --partition=compute \ - daos_job.sh & -``` - -Run `squeue` to see the status of the job. The `daos_job.sh` script will run once on each of the 4 nodes. Each time it runs it creates a log file which is stored in the `cont1` DAOS container. - -Wait for the job to complete and then view the files that were created in the `cont1` DAOS container mounted on `${HOME}/daos/cont1`. - -```bash -ls -l ${HOME}/daos/cont1/*.log -cat ${HOME}/daos/cont1/*.log -``` - -### Unmount the Container - -The container will need to by unmounted before you log out. If this is not done it can leave open file handles and prevent the container from being mounted when you log in again. - -```bash -fusermount3 -u ${HOME}/daos/cont1 -``` - -Verify that the container is unmounted - -```bash -df -h -t fuse.daos -``` - -See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#dfuse-daos-fuse) section of the DAOS User Guide for more information about mounting POSIX containers. - -### Delete the DAOS/Slurm Cluster infrastructure when not in use - -> **_NOTE:_** -> -> - Data on the DAOS file system will be permanently lost after cluster deletion. -> - If the Slurm controller is shut down before the auto-scale instances are destroyed, those compute instances will be left running. - -Open your browser to the VM instances page and ensure that instances named "compute" -have been shutdown and deleted by the Slurm autoscaler. 
- -Delete the remaining infrastructure: - -```bash -gcluster destroy hpc-slurm-daos --auto-approve -``` diff --git a/community/examples/intel/hpc-slurm-daos.yaml b/community/examples/intel/hpc-slurm-daos.yaml deleted file mode 100644 index b3c217474c..0000000000 --- a/community/examples/intel/hpc-slurm-daos.yaml +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: hpc-slurm-daos - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: hpc-slurm-daos - region: us-central1 - zone: us-central1-c - daos_server_image_family: daos-server-hpc-rocky-8 - daos_version: "2.4" - tags: [] - -# Note: this blueprint assumes the existence of a default global network and -# subnetwork in the region chosen above - -validators: -- validator: test_module_not_used - inputs: {} - skip: true - -deployment_groups: -- group: primary - modules: - - id: network1 - source: github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc?ref=v1.33.0&depth=1 - - - id: homefs - source: github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/filestore?ref=v1.33.0&depth=1 - use: [network1] - settings: - local_mount: /home - -- group: daos-server-image - modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/images - - id: daos-server-image - source: "github.com/daos-stack/google-cloud-daos//images?ref=v0.5.0&depth=1" - kind: packer - settings: - daos_version: $(vars.daos_version) - daos_repo_base_url: https://packages.daos.io/ - daos_packages_repo_file: EL8/packages/x86_64/daos_packages.repo - use_iap: true - enable_oslogin: false - machine_type: n2-standard-32 - source_image_family: hpc-rocky-linux-8 - source_image_project_id: cloud-hpc-image-public - image_guest_os_features: ["GVNIC"] - disk_size: "20" - state_timeout: "10m" - scopes: ["https://www.googleapis.com/auth/cloud-platform"] - use_internal_ip: true - omit_external_ip: false - daos_install_type: server - image_family: $(vars.daos_server_image_family) - -- group: cluster - modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_server - - id: daos - source: "github.com/daos-stack/google-cloud-daos//terraform/modules/daos_server?ref=v0.5.0&depth=1" - use: [network1] - settings: - labels: {ghpc_role: file-system} - machine_type: "n2-standard-16" - os_family: $(vars.daos_server_image_family) - daos_disk_count: 4 - tags: $(vars.tags) - pools: - - name: "pool1" - size: "100%" - # Do not set value for scm_size when size=100% - daos_scm_size: - user: "root@" - group: "root@" - acls: - - "A::OWNER@:rwdtTaAo" - - "A:G:GROUP@:rwtT" - - "A::EVERYONE@:rcta" - properties: - reclaim: "lazy" - containers: [] - - - id: daos-client-script - source: github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.33.0&depth=1 - settings: - runners: - - type: data - content: $(daos.daos_agent_yml) - destination: /etc/daos/daos_agent.yml - - type: data - content: $(daos.daos_control_yml) - 
destination: /etc/daos/daos_control.yml - - type: shell - content: $(daos.daos_client_install_script) - destination: /tmp/daos_client_install.sh - - type: shell - content: $(daos.daos_client_config_script) - destination: /tmp/daos_client_config.sh - - - id: debug_nodeset - source: github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/compute/schedmd-slurm-gcp-v6-nodeset?ref=v1.33.0&depth=1 - use: [network1] - settings: - name: ns1 - node_count_dynamic_max: 4 - machine_type: n2-standard-2 - enable_placement: false # the default is: true - service_account_scopes: - - "https://www.googleapis.com/auth/monitoring.write" - - "https://www.googleapis.com/auth/logging.write" - - "https://www.googleapis.com/auth/devstorage.read_only" - - "https://www.googleapis.com/auth/cloud-platform" - - - id: debug_partition - source: github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/compute/schedmd-slurm-gcp-v6-partition?ref=v1.33.0&depth=1 - use: [debug_nodeset] - settings: - partition_name: debug - exclusive: false # allows nodes to stay up after jobs are done - is_default: true - - - id: compute_nodeset - source: github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/compute/schedmd-slurm-gcp-v6-nodeset?ref=v1.33.0&depth=1 - use: [network1] - settings: - name: ns2 - node_count_dynamic_max: 20 - bandwidth_tier: gvnic_enabled - service_account_scopes: - - "https://www.googleapis.com/auth/monitoring.write" - - "https://www.googleapis.com/auth/logging.write" - - "https://www.googleapis.com/auth/devstorage.read_only" - - "https://www.googleapis.com/auth/cloud-platform" - - - id: compute_partition - source: github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/compute/schedmd-slurm-gcp-v6-partition?ref=v1.33.0&depth=1 - use: [compute_nodeset] - settings: - partition_name: compute - - - id: slurm_login - source: github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scheduler/schedmd-slurm-gcp-v6-login?ref=v1.33.0&depth=1 - use: [network1] - settings: - name_prefix: login - machine_type: n2-standard-4 - enable_login_public_ips: true - tags: $(vars.tags) - service_account_scopes: - - "https://www.googleapis.com/auth/monitoring.write" - - "https://www.googleapis.com/auth/logging.write" - - "https://www.googleapis.com/auth/devstorage.read_only" - - "https://www.googleapis.com/auth/cloud-platform" - - - id: slurm_controller - source: github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scheduler/schedmd-slurm-gcp-v6-controller?ref=v1.33.0&depth=1 - use: - - network1 - - debug_partition - - compute_partition - - slurm_login - - homefs - - daos-client-script - settings: - enable_controller_public_ips: true - compute_startup_script: $(daos-client-script.startup_script) - controller_startup_script: $(daos-client-script.startup_script) - login_startup_script: $(daos-client-script.startup_script) - compute_startup_scripts_timeout: 1000 - controller_startup_scripts_timeout: 1000 - login_startup_scripts_timeout: 1000 - tags: $(vars.tags) diff --git a/community/examples/intel/pfs-daos.yaml b/community/examples/intel/pfs-daos.yaml deleted file mode 100644 index 3abf5c9778..0000000000 --- a/community/examples/intel/pfs-daos.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- - -blueprint_name: pfs-daos - -vars: - project_id: ## Set GCP Project ID Here ## - deployment_name: pfs-daos - region: us-central1 - zone: us-central1-c - daos_server_image_family: daos-server-hpc-rocky-8 - daos_client_image_family: daos-client-hpc-rocky-8 - daos_version: "2.4" - tags: [] - -# Note: this blueprint assumes the existence of a default global network and -# subnetwork in the region chosen above - -deployment_groups: -- group: primary - modules: - - id: network1 - source: modules/network/pre-existing-vpc - -- group: daos-server-image - modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/images - - id: daos-server-image - source: "github.com/daos-stack/google-cloud-daos//images?ref=v0.5.0&depth=1" - kind: packer - settings: - daos_version: $(vars.daos_version) - daos_repo_base_url: https://packages.daos.io - daos_packages_repo_file: EL8/packages/x86_64/daos_packages.repo - use_iap: true - enable_oslogin: false - machine_type: n2-standard-32 - source_image_family: hpc-rocky-linux-8 - source_image_project_id: cloud-hpc-image-public - image_guest_os_features: ["GVNIC"] - disk_size: "20" - state_timeout: "10m" - scopes: ["https://www.googleapis.com/auth/cloud-platform"] - use_internal_ip: true - omit_external_ip: false - daos_install_type: server - image_family: $(vars.daos_server_image_family) - -- group: daos-client-image - modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/v0.5.0/images - - id: daos-client-image - source: "github.com/daos-stack/google-cloud-daos//images?ref=v0.5.0&depth=1" - kind: packer - settings: - daos_version: $(vars.daos_version) - daos_repo_base_url: https://packages.daos.io - daos_packages_repo_file: EL8/packages/x86_64/daos_packages.repo - use_iap: true - enable_oslogin: false - machine_type: n2-standard-32 - source_image_family: hpc-rocky-linux-8 - source_image_project_id: cloud-hpc-image-public - image_guest_os_features: ["GVNIC"] - disk_size: "20" - state_timeout: "10m" - scopes: ["https://www.googleapis.com/auth/cloud-platform"] - use_internal_ip: true - omit_external_ip: false - daos_install_type: client - image_family: $(vars.daos_client_image_family) - -- group: daos-cluster - modules: - # more info: https://github.com/daos-stack/google-cloud-daos/tree/develop/terraform/modules/daos_server - - id: daos-server - # source: $(vars.daos_server_module_source_url) - source: "github.com/daos-stack/google-cloud-daos//terraform/modules/daos_server?ref=v0.5.0&depth=1" - use: [network1] - settings: - number_of_instances: 2 - labels: {ghpc_role: file-system} - os_family: $(vars.daos_server_image_family) - daos_scm_size: "172" - tags: $(vars.tags) - - # more info: https://github.com/daos-stack/google-cloud-daos/tree/develop/terraform/modules/daos_client - - id: daos-client - # source: $(vars.daos_client_module_source_url) - source: "github.com/daos-stack/google-cloud-daos//terraform/modules/daos_client?ref=v0.5.0&depth=1" - use: [network1, daos-server] - settings: - number_of_instances: 2 - labels: {ghpc_role: compute} - os_family: $(vars.daos_client_image_family) - tags: $(vars.tags) diff 
--git a/community/modules/file-system/Intel-DAOS/README.md b/community/modules/file-system/Intel-DAOS/README.md index 410189eceb..04db0acb8c 100644 --- a/community/modules/file-system/Intel-DAOS/README.md +++ b/community/modules/file-system/Intel-DAOS/README.md @@ -1,65 +1 @@ -## Description - -This module allows creating an instance of Distributed Asynchronous Object Storage ([DAOS](https://docs.daos.io/)) on Google Cloud Platform ([GCP](https://cloud.google.com/)). - -> **_NOTE:_** -> DAOS on GCP does not require an Cluster Toolkit wrapper. -> Terraform modules are sourced directly from GitHub. -> It will not work as a [local or embedded module](../../../../modules/README.md#embedded-modules). - -Terraform modules for DAOS servers and clients are located in the [Google Cloud DAOS repo on GitHub](https://github.com/daos-stack/google-cloud-daos). - -DAOS Terraform module parameters can be found in the README.md files in each module directory. - -- [DAOS Server module](https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_server#readme) -- [DAOS Client module](https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_client#readme) - -For more information on this and other network storage options in the Cluster Toolkit, see the extended [Network Storage documentation](../../../../docs/network_storage.md). -## Examples - -The [community examples folder](../../../examples/intel/) contains two example blueprints for deploying DAOS. - -- [community/examples/intel/pfs-daos.yml](../../../examples/intel/pfs-daos.yml) - Blueprint for deploying a DAOS cluster consisting of servers and clients. - After deploying this example the DAOS storage system will be formatted but no pools or containers will exist. - The instructions in the [community/examples/intel/README.md](../../../examples/intel/README.md#create-a-daos-pool-and-container) describe how to - - - Deploy a DAOS cluster - - Manage storage (create a [pool](https://docs.daos.io/v2.2/overview/storage/?h=container#daos-pool) and a [container](https://docs.daos.io/v2.2/overview/storage/?h=container#daos-container)) - - Mount a container on a client - - Store a large file in a DAOS container - -- [community/examples/intel/hpc-slurm-daos.yaml](../../../examples/intel/hpc-slurm-daos.yaml) - Blueprint for deploying a Slurm cluster and DAOS storage with 4 servers. - The Slurm compute nodes are configured as DAOS clients and have the ability to use the DAOS filesystem. - The instructions in the [community/examples/intel/README.md](../../../examples/intel/README.md#deploy-the-daosslurm-cluster) describe how to deploy the Slurm cluster and run a job which uses the DAOS file system. - -## Support - -Content in the [google-cloud-daos](https://github.com/daos-stack/google-cloud-daos) repository is licensed under the [Apache License Version 2.0](https://github.com/daos-stack/google-cloud-daos/blob/main/LICENSE) open-source license. - -[DAOS](https://github.com/daos-stack/daos) is distributed under the BSD-2-Clause-Patent open-source license. - -Intel Corporation provides two options for technical support: - -1. Community Support - - Community support is available to anyone through Jira and via the DAOS channel for Google Cloud users on Slack. - - JIRA: https://daosio.atlassian.net/jira/software/c/projects/DAOS/issues/ - - - An Atlassian account is not needed for read only access to Jira. - - An Atlassian account is required to create and update tickets. 
- To create an account follow the steps at https://support.atlassian.com/atlassian-account/docs/create-an-atlassian-account. - - Slack: https://daos-stack.slack.com/archives/C03GLTLHA59 - - Community support is provided on a best-effort basis. - -2. Commercial L3 Support - - Commercial L3 support is available on an on-demand basis. - - Contact Intel Corporation to obtain more information about Commercial L3 support. - - You may inquire about L3 support via the [Slack channel](https://daos-stack.slack.com/archives/C03GLTLHA59). +> **_NOTE:_** Cluster Toolkit is dropping support for the external [Google Cloud DAOS](https://github.com/daos-stack/google-cloud-daos/tree/main) repository. The DAOS example blueprints (`hpc-slurm-daos.yaml` and `pfs-daos.yaml`) have been removed from the Cluster Toolkit. We recommend migrating to the first-party [Parallelstore](../../../../modules/file-system/parallelstore/) module for similar functionality. To help with this transition, see the Parallelstore example blueprints ([pfs-parallelstore.yaml](../../../../examples/pfs-parallelstore.yaml) and [ps-slurm.yaml](../../../../examples/ps-slurm.yaml)). If the external [Google Cloud DAOS](https://github.com/daos-stack/google-cloud-daos/tree/main) repository is necessary, we recommend using the last Cluster Toolkit [v1.41.0](https://github.com/GoogleCloudPlatform/cluster-toolkit/releases/tag/v1.41.0). From 3377b6770c1a3e1ce7c376fab684153708885e81 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Mon, 28 Oct 2024 20:46:35 +0000 Subject: [PATCH 043/129] removing daos references --- examples/cae/cae-slurm-v5-legacy.yaml | 2 +- examples/cae/cae-slurm.yaml | 2 +- modules/README.md | 3 --- pkg/modulereader/metadata_legacy.go | 5 ----- 4 files changed, 2 insertions(+), 10 deletions(-) diff --git a/examples/cae/cae-slurm-v5-legacy.yaml b/examples/cae/cae-slurm-v5-legacy.yaml index e1a5411252..dc5d8b80cf 100644 --- a/examples/cae/cae-slurm-v5-legacy.yaml +++ b/examples/cae/cae-slurm-v5-legacy.yaml @@ -140,7 +140,7 @@ deployment_groups: # Please visit here for more information # - DDN Exascaler Lustre: https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/community/modules/file-system/DDN-EXAScaler/README.md # - Sycomp IBM Spectrum Scale: https://console.developers.google.com/marketplace/product/sycomp/sycomp-storage-fueled-by-ibm-spectrum-scale - # - Intel DAOS: https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/community/modules/file-system/Intel-DAOS/README.md + # - Parallelstore: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/modules/file-system/parallelstore/README.md ######## Remote Desktop(s) ####### # This block enables a partition for nodes that support Chrome Remote Desktop diff --git a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml index b42a4e401c..fda4daa9ea 100644 --- a/examples/cae/cae-slurm.yaml +++ b/examples/cae/cae-slurm.yaml @@ -137,7 +137,7 @@ deployment_groups: # Please visit here for more information # - DDN Exascaler Lustre: https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/community/modules/file-system/DDN-EXAScaler/README.md # - Sycomp IBM Spectrum Scale: https://console.developers.google.com/marketplace/product/sycomp/sycomp-storage-fueled-by-ibm-spectrum-scale - # - Intel DAOS: https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/community/modules/file-system/Intel-DAOS/README.md + # - Parallelstore: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/modules/file-system/parallelstore/README.md ######## Remote 
Desktop(s) ####### # This block creates chrome remote desktop. diff --git a/modules/README.md b/modules/README.md index 722449e6e6..1b109a82ec 100644 --- a/modules/README.md +++ b/modules/README.md @@ -103,8 +103,6 @@ Modules that are still in development and less stable are labeled with the a [DDN EXAscaler lustre](https://www.ddn.com/partners/google-cloud-platform/) file system. This module has [license costs](https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud). -* **[Intel-DAOS]** ![community-badge] : Creates - a [DAOS](https://docs.daos.io/) file system. * **[cloud-storage-bucket]** ![community-badge] ![experimental-badge] : Creates a Google Cloud Storage (GCS) bucket. * **[gke-persistent-volume]** ![core-badge] ![experimental-badge] : Creates persistent volumes and persistent volume claims for shared storage. * **[nfs-server]** ![community-badge] ![experimental-badge] : Creates a VM and @@ -114,7 +112,6 @@ Modules that are still in development and less stable are labeled with the [parallelstore]: file-system/parallelstore/README.md [pre-existing-network-storage]: file-system/pre-existing-network-storage/README.md [ddn-exascaler]: ../community/modules/file-system/DDN-EXAScaler/README.md -[intel-daos]: ../community/modules/file-system/Intel-DAOS/README.md [nfs-server]: ../community/modules/file-system/nfs-server/README.md [cloud-storage-bucket]: ../community/modules/file-system/cloud-storage-bucket/README.md [gke-persistent-volume]: ../modules/file-system/gke-persistent-volume/README.md diff --git a/pkg/modulereader/metadata_legacy.go b/pkg/modulereader/metadata_legacy.go index 1c55ddf3fc..a4ebb943b1 100644 --- a/pkg/modulereader/metadata_legacy.go +++ b/pkg/modulereader/metadata_legacy.go @@ -62,11 +62,6 @@ func defaultAPIList(source string) []string { "iam.googleapis.com", "runtimeconfig.googleapis.com", }, - "community/modules/file-system/Intel-DAOS": { - "compute.googleapis.com", - "iam.googleapis.com", - "secretmanager.googleapis.com", - }, "community/modules/file-system/nfs-server": { "compute.googleapis.com", }, From 37787610a360fabe1af1850ccb05408e2367cd28 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Mon, 28 Oct 2024 20:53:59 +0000 Subject: [PATCH 044/129] removing mentions of daos --- docs/image-building.md | 1 - examples/README.md | 18 ------------------ 2 files changed, 19 deletions(-) diff --git a/docs/image-building.md b/docs/image-building.md index 488c1b19ef..707fc407c4 100644 --- a/docs/image-building.md +++ b/docs/image-building.md @@ -27,7 +27,6 @@ below demonstrate each approach: - [Customizing a Slurm cluster (Hello, World)](../examples/README.md#image-builderyaml-) - [Customizing a Slurm cluster (AI/ML applications)](../examples/README.md#ml-slurmyaml-) - [Provisioning an HTCondor pool (installing scheduler)](../examples/README.md#htc-htcondoryaml--) -- [Provisioning a DAOS storage cluster](../community/examples/intel/README.md#daos-cluster) ## Why build an image? 
diff --git a/examples/README.md b/examples/README.md index 9257a98d0a..5fe023512d 100644 --- a/examples/README.md +++ b/examples/README.md @@ -39,8 +39,6 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [hpc-build-slurm-image.yaml](#hpc-build-slurm-imageyaml--) ![community-badge] ![experimental-badge] * [hpc-slurm-ubuntu2004-v5-legacy.yaml](#hpc-slurm-ubuntu2004-v5-legacyyaml--) ![community-badge] ![deprecated-badge] * [hpc-slurm-ubuntu2004.yaml](#hpc-slurm-ubuntu2004yaml--) ![community-badge] - * [pfs-daos.yaml](#pfs-daosyaml-) ![community-badge] - * [hpc-slurm-daos.yaml](#hpc-slurm-daosyaml-) ![community-badge] * [hpc-amd-slurm-v5-legacy.yaml](#hpc-amd-slurm-v5-legacyyaml--) ![community-badge] ![deprecated-badge] * [hpc-amd-slurm.yaml](#hpc-amd-slurmyaml-) ![community-badge] * [hpc-slurm-sharedvpc.yaml](#hpc-slurm-sharedvpcyaml--) ![community-badge] ![experimental-badge] @@ -1214,22 +1212,6 @@ For this example the following is needed in the selected region: * Compute Engine API: Resource policies: **one for each job in parallel** - _only needed for `compute` partition_ -### [pfs-daos.yaml] ![community-badge] - -This example provisions a DAOS cluster with [managed instance groups][migs] for the servers and for clients. It is more extensively discussed in a dedicated [README for Intel -examples][intel-examples-readme]. - -[pfs-daos.yaml]: ../community/examples/intel/pfs-daos.yaml -[migs]: https://cloud.google.com/compute/docs/instance-groups - -### [hpc-slurm-daos.yaml] ![community-badge] - -This example provisions DAOS servers and a Slurm cluster. It is -more extensively discussed in a dedicated [README for Intel -examples][intel-examples-readme]. - -[hpc-slurm-daos.yaml]: ../community/examples/intel/hpc-slurm-daos.yaml - ### [hpc-amd-slurm-v5-legacy.yaml] ![community-badge] ![deprecated-badge] This example provisions a Slurm cluster using AMD VM machine types. It From 78ab53a7decabf0c7dffbd8d728619d6a8865fe7 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Mon, 28 Oct 2024 20:59:47 +0000 Subject: [PATCH 045/129] removing more mentions of daos --- examples/cae/cae-slurm-v5-legacy.yaml | 2 +- examples/cae/cae-slurm.yaml | 2 +- modules/README.md | 11 ----------- 3 files changed, 2 insertions(+), 13 deletions(-) diff --git a/examples/cae/cae-slurm-v5-legacy.yaml b/examples/cae/cae-slurm-v5-legacy.yaml index dc5d8b80cf..01dddbecdb 100644 --- a/examples/cae/cae-slurm-v5-legacy.yaml +++ b/examples/cae/cae-slurm-v5-legacy.yaml @@ -134,7 +134,7 @@ deployment_groups: local_mount: /scratch # If you require maximum IO performance, you can consider to bring up a dedicated parallel - # file system, e.g. DDN Exascaler Lustre, Sycomp GPFS, or Intel DAOS. + # file system, e.g. DDN Exascaler Lustre, Sycomp GPFS, or Parallelstore. # Note: Those solutions may have associated license cost. # # Please visit here for more information diff --git a/examples/cae/cae-slurm.yaml b/examples/cae/cae-slurm.yaml index fda4daa9ea..ab6b129219 100644 --- a/examples/cae/cae-slurm.yaml +++ b/examples/cae/cae-slurm.yaml @@ -131,7 +131,7 @@ deployment_groups: local_mount: /scratch # If you require maximum IO performance, you can consider to bring up a dedicated parallel - # file system, e.g. DDN Exascaler Lustre, Sycomp GPFS, or Intel DAOS. + # file system, e.g. DDN Exascaler Lustre, Sycomp GPFS, or Parallelstore. # Note: Those solutions may have associated license cost. 
# # Please visit here for more information diff --git a/modules/README.md b/modules/README.md index 1b109a82ec..7546ae3e2b 100644 --- a/modules/README.md +++ b/modules/README.md @@ -357,10 +357,6 @@ following module definition refers the local pre-existing-vpc modules. #### GitHub-hosted Modules and Packages -The [Intel DAOS blueprint][pfs-daos.yaml] makes extensive use of GitHub-hosted -Terraform and Packer modules. You may wish to use it as an example reference for -this documentation. - To use a Terraform module available on GitHub, set the source to a path starting with `github.com` (HTTPS) or `git@github.com` (SSH). For instance, the following module definition sources the Toolkit vpc module: @@ -398,7 +394,6 @@ into a hidden folder when you run `terraform init`. [tfrev]: https://www.terraform.io/language/modules/sources#selecting-a-revision [gitref]: https://git-scm.com/book/en/v2/Git-Tools-Revision-Selection#_single_revisions [tfsubdir]: https://www.terraform.io/language/modules/sources#modules-in-package-sub-directories -[pfs-daos.yaml]: ../community/examples/intel/pfs-daos.yaml ##### GitHub-hosted Packer modules @@ -410,12 +405,6 @@ repository to the module path: `deployment_name/group_name/module_id`. However, when `gcluster deploy` is invoked, it will run Packer from the subdirectory `deployment_name/group_name/module_id/subdirectory/after/double_slash`. -Referring back to the [Intel DAOS blueprint][pfs-daos.yaml], we see that it will -create 2 deployment groups at `pfs-daos/daos-client-image` and -`pfs-daos/daos-server-image`. However, Packer will actually be invoked from -a subdirectories ending in `daos-client-image/images` and -`daos-server-image/images`. - If the module does not use `//` package notation, `gcluster create` will copy only the final directory in the path to `deployment_name/group_name/module_id`. From c5d49cca16f9c729b15b86d93bc1d6c33c2d695c Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 28 Oct 2024 22:31:52 +0000 Subject: [PATCH 046/129] Refactor docker support in startup-script module Anticipate future additions to Docker configuration by creating a single var.docker input variable. This will facilitate validation by including all related settings in a single object. 
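
For example, a blueprint that previously set the two separate booleans would
now pass the consolidated object instead (the module id below is illustrative,
not taken from any particular blueprint):

```yaml
  # Sketch only: enable Docker through the new consolidated input.
  - id: image_startup_script
    source: modules/scripts/startup-script
    settings:
      install_ansible: true
      docker:
        enabled: true          # replaces install_docker: true
        world_writable: true   # replaces enable_docker_world_writable: true
```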
--- .../a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 5 ++- .../ml-slurm-a3-1-image-v5-legacy.yaml | 5 ++- .../a3-megagpu-8g/slurm-a3mega-image.yaml | 5 ++- modules/scripts/startup-script/README.md | 5 ++- modules/scripts/startup-script/main.tf | 10 ++--- modules/scripts/startup-script/variables.tf | 38 ++++++++++++++++--- 6 files changed, 47 insertions(+), 21 deletions(-) diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index 705e1299eb..3d4db65c21 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -46,8 +46,9 @@ deployment_groups: source: modules/scripts/startup-script settings: install_ansible: true - install_docker: true - enable_docker_world_writable: true + docker: + enabled: true + world_writable: true configure_ssh_host_patterns: - 10.0.0.* - 10.1.0.* diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml index 42a823bf8e..979702db30 100644 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml +++ b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml @@ -46,8 +46,9 @@ deployment_groups: source: modules/scripts/startup-script settings: install_ansible: true - install_docker: true - enable_docker_world_writable: true + docker: + enabled: true + world_writable: true configure_ssh_host_patterns: - 10.0.0.* - 10.1.0.* diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index dfc4d4ab4c..07e06ac8ac 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -49,8 +49,9 @@ deployment_groups: - 10.6.0.* - 10.7.0.* - $(vars.slurm_cluster_name)* - enable_docker_world_writable: true - install_docker: true + docker: + enabled: true + world_writable: true runners: # it is important that kernel upgrades do not occur before running the # solution for building Slurm (which doesn't handle them well on the fly) diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index 189e627732..abed00d3b3 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -318,13 +318,14 @@ No modules. | [configure\_ssh\_host\_patterns](#input\_configure\_ssh\_host\_patterns) | If specified, it will automate ssh configuration by:
- Defining a Host block for every element of this variable and setting StrictHostKeyChecking to 'No'.
Ex: "hpc*", "hpc01*", "ml*"
- The first time users log-in, it will create ssh keys that are added to the authorized keys list
This requires a shared /home filesystem and relies on specifying the right prefix. | `list(string)` | `[]` | no | | [debug\_file](#input\_debug\_file) | Path to an optional local to be written with 'startup\_script'. | `string` | `null` | no | | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment, used to name GCS bucket for startup scripts. | `string` | n/a | yes | -| [enable\_docker\_world\_writable](#input\_enable\_docker\_world\_writable) | Configure Docker daemon to be writable by all users (if var.install\_docker is set to true). | `bool` | `false` | no | +| [docker](#input\_docker) | Install and configure Docker |
object({
enabled = optional(bool, false)
world_writable = optional(bool, false)
})
|
{
"enabled": false
}
| no | +| [enable\_docker\_world\_writable](#input\_enable\_docker\_world\_writable) | DEPRECATED: use var.docker | `bool` | `null` | no | | [gcs\_bucket\_path](#input\_gcs\_bucket\_path) | The GCS path for storage bucket and the object, starting with `gs://`. | `string` | `null` | no | | [http\_no\_proxy](#input\_http\_no\_proxy) | Domains for which to disable http\_proxy behavior. Honored only if var.http\_proxy is set | `string` | `".google.com,.googleapis.com,metadata.google.internal,localhost,127.0.0.1"` | no | | [http\_proxy](#input\_http\_proxy) | Web (http and https) proxy configuration for pip, apt, and yum/dnf and interactive shells | `string` | `""` | no | | [install\_ansible](#input\_install\_ansible) | Run Ansible installation script if either set to true or unset and runner of type 'ansible-local' are used. | `bool` | `null` | no | | [install\_cloud\_ops\_agent](#input\_install\_cloud\_ops\_agent) | Warning: Consider using `install_stackdriver_agent` for better performance. Run Google Ops Agent installation script if set to true. | `bool` | `false` | no | -| [install\_docker](#input\_install\_docker) | Install Docker command line tool and daemon. | `bool` | `false` | no | +| [install\_docker](#input\_install\_docker) | DEPRECATED: use var.docker. | `bool` | `null` | no | | [install\_stackdriver\_agent](#input\_install\_stackdriver\_agent) | Run Google Stackdriver Agent installation script if set to true. Preferred over ops agent for performance. | `bool` | `false` | no | | [labels](#input\_labels) | Labels for the created GCS bucket. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_filesystem](#input\_local\_ssd\_filesystem) | Create and mount a filesystem from local SSD disks (data will be lost if VMs are powered down without enabling migration); enable by setting mountpoint field to a valid directory path. |
object({
fs_type = optional(string, "ext4")
mountpoint = optional(string, "")
permissions = optional(string, "0755")
})
|
{
"fs_type": "ext4",
"mountpoint": "",
"permissions": "0755"
}
| no | diff --git a/modules/scripts/startup-script/main.tf b/modules/scripts/startup-script/main.tf index 8a6c1dd6cb..4a8ccc1643 100644 --- a/modules/scripts/startup-script/main.tf +++ b/modules/scripts/startup-script/main.tf @@ -89,12 +89,12 @@ locals { } ] - docker_runner = !var.install_docker ? [] : [ + docker_runner = !var.docker.enabled ? [] : [ { type = "ansible-local" destination = "install_docker.yml" content = file("${path.module}/files/install_docker.yml") - args = "-e enable_docker_world_writable=${var.enable_docker_world_writable}" + args = "-e enable_docker_world_writable=${var.docker.world_writable}" }, ] @@ -113,7 +113,7 @@ locals { ] supplied_ansible_runners = anytrue([for r in var.runners : r.type == "ansible-local"]) - has_ansible_runners = anytrue([local.supplied_ansible_runners, local.configure_ssh, var.install_docker, local.local_ssd_filesystem_enabled]) + has_ansible_runners = anytrue([local.supplied_ansible_runners, local.configure_ssh, var.docker.enabled, local.local_ssd_filesystem_enabled]) install_ansible = coalesce(var.install_ansible, local.has_ansible_runners) ansible_installer = local.install_ansible ? [{ type = "shell" @@ -225,10 +225,6 @@ resource "google_storage_bucket_object" "scripts" { condition = !(var.install_cloud_ops_agent && var.install_stackdriver_agent) error_message = "Only one of var.install_stackdriver_agent or var.install_cloud_ops_agent can be set. Stackdriver is recommended for best performance." } - precondition { - condition = !var.enable_docker_world_writable || var.install_docker - error_message = "If var.enable_docker_world_writable is set to true, var.install_docker must also be set to true." - } } } diff --git a/modules/scripts/startup-script/variables.tf b/modules/scripts/startup-script/variables.tf index 3975a69614..3b99f3f0e6 100644 --- a/modules/scripts/startup-script/variables.tf +++ b/modules/scripts/startup-script/variables.tf @@ -112,18 +112,44 @@ EOT default = [] } +variable "docker" { + description = "Install and configure Docker" + type = object({ + enabled = optional(bool, false) + world_writable = optional(bool, false) + }) + default = { + enabled = false + } + + validation { + condition = !coalesce(var.docker.world_writable) || var.docker.enabled + error_message = "var.docker.world_writable should only be set if var.docker.enabled is set to true" + } +} + +# tflint-ignore: terraform_unused_declarations variable "enable_docker_world_writable" { - description = "Configure Docker daemon to be writable by all users (if var.install_docker is set to true)." + description = "DEPRECATED: use var.docker" type = bool - default = false - nullable = false + default = null + + validation { + condition = var.enable_docker_world_writable == null + error_message = "The variable enable_docker_world_writable has been removed. Use var.docker instead" + } } +# tflint-ignore: terraform_unused_declarations variable "install_docker" { - description = "Install Docker command line tool and daemon." + description = "DEPRECATED: use var.docker." type = bool - default = false - nullable = false + default = null + + validation { + condition = var.install_docker == null + error_message = "The variable install_docker has been removed. Use var.docker instead" + } } variable "local_ssd_filesystem" { From 6b0c26acb3e6f0d5936fbb32104367896408cc09 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 28 Oct 2024 15:43:03 -0700 Subject: [PATCH 047/129] Revert "SlurmGCP. 
Relax `reservation_name` match, allow for suffix" --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset/main.tf | 24 +++++++++++-------- .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 8 +++---- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index bd339c262e..8dcffd8fda 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -197,7 +197,7 @@ No modules. | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | -| [reservation\_name](#input\_reservation\_name) | Name of the reservation to use for VM resources, should be in one of the following formats:
- projects/PROJECT\_ID/reservations/RESERVATION\_NAME[/SUFF/IX]
- RESERVATION\_NAME[/SUFF/IX]

Must be a "SPECIFIC" reservation
Set to empty string if using no reservation or automatically-consumed reservations | `string` | `""` | no | +| [reservation\_name](#input\_reservation\_name) | Name of the reservation to use for VM resources, should be in one of the following formats:
- projects/PROJECT\_ID/reservations/RESERVATION\_NAME
- RESERVATION\_NAME

Must be a "SPECIFIC" reservation
Set to empty string if using no reservation or automatically-consumed reservations | `string` | `""` | no | | [service\_account](#input\_service\_account) | DEPRECATED: Use `service_account_email` and `service_account_scopes` instead. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to attach to the compute instances. | `string` | `null` | no | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to attach to the compute instances. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 6e680f002b..217328277b 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -130,22 +130,26 @@ data "google_compute_zones" "available" { } locals { - res_match = regex("^(?P(?Pprojects/(?P[a-z0-9-]+)/reservations/)?(?[a-z0-9-]+)(?P/[a-z0-9-]+/[a-z0-9-]+)?)?$", var.reservation_name) - - res_short_name = local.res_match.name - res_project = coalesce(local.res_match.project, var.project_id) - res_prefix = coalesce(local.res_match.prefix, "projects/${local.res_project}/reservations/") - res_suffix = local.res_match.suffix == null ? "" : local.res_match.suffix + res_name_split = split("/", var.reservation_name) + reservation = var.reservation_name == "" ? null : ( + length(local.res_name_split) == 4 ? { + project : local.res_name_split[1], + name : local.res_name_split[3] + } : { + project : var.project_id, + name : var.reservation_name + } + ) - reservation_name = local.res_match.whole == null ? "" : "${local.res_prefix}${local.res_short_name}${local.res_suffix}" + reservation_name = local.reservation == null ? "" : "projects/${local.reservation.project}/reservations/${local.reservation.name}" } # tflint-ignore: terraform_unused_declarations data "google_compute_reservation" "reservation" { - count = length(local.reservation_name) > 0 ? 1 : 0 + count = local.reservation != null ? 1 : 0 - name = local.res_short_name - project = local.res_project + name = local.reservation.name + project = local.reservation.project zone = var.zone lifecycle { diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 3bd8fc74fb..ad0409ad11 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -447,8 +447,8 @@ variable "access_config" { variable "reservation_name" { description = <<-EOD Name of the reservation to use for VM resources, should be in one of the following formats: - - projects/PROJECT_ID/reservations/RESERVATION_NAME[/SUFF/IX] - - RESERVATION_NAME[/SUFF/IX] + - projects/PROJECT_ID/reservations/RESERVATION_NAME + - RESERVATION_NAME Must be a "SPECIFIC" reservation Set to empty string if using no reservation or automatically-consumed reservations @@ -458,8 +458,8 @@ variable "reservation_name" { nullable = false validation { - condition = length(regexall("^((projects/([a-z0-9-]+)/reservations/)?([a-z0-9-]+)(/[a-z0-9-]+/[a-z0-9-]+)?)?$", var.reservation_name)) > 0 - error_message = "Reservation name must be either empty or in the format '[projects/PROJECT_ID/reservations/]RESERVATION_NAME[/SUFF/IX]', [...] are optional parts." + condition = var.reservation_name == "" || length(regexall("^projects/[a-z0-9-]+/reservations/[a-z0-9-]+$", var.reservation_name)) > 0 || length(regexall("^[a-z0-9-]+$", var.reservation_name)) > 0 + error_message = "Reservation name must be in the format 'projects/PROJECT_ID/reservations/RESERVATION_NAME' or 'RESERVATION_NAME'." } } From 9c316dfea122464cd7a158bcc4a0b7d1d9085b63 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 28 Oct 2024 16:15:19 -0700 Subject: [PATCH 048/129] Revert "Revert "SlurmGCP. 
Relax `reservation_name` match, allow for suffix"" --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../schedmd-slurm-gcp-v6-nodeset/main.tf | 24 ++++++++----------- .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 8 +++---- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index 8dcffd8fda..bd339c262e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -197,7 +197,7 @@ No modules. | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | -| [reservation\_name](#input\_reservation\_name) | Name of the reservation to use for VM resources, should be in one of the following formats:
- projects/PROJECT\_ID/reservations/RESERVATION\_NAME
- RESERVATION\_NAME

Must be a "SPECIFIC" reservation
Set to empty string if using no reservation or automatically-consumed reservations | `string` | `""` | no | +| [reservation\_name](#input\_reservation\_name) | Name of the reservation to use for VM resources, should be in one of the following formats:
- projects/PROJECT\_ID/reservations/RESERVATION\_NAME[/SUFF/IX]
- RESERVATION\_NAME[/SUFF/IX]

Must be a "SPECIFIC" reservation
Set to empty string if using no reservation or automatically-consumed reservations | `string` | `""` | no | | [service\_account](#input\_service\_account) | DEPRECATED: Use `service_account_email` and `service_account_scopes` instead. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to attach to the compute instances. | `string` | `null` | no | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to attach to the compute instances. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 217328277b..6e680f002b 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -130,26 +130,22 @@ data "google_compute_zones" "available" { } locals { - res_name_split = split("/", var.reservation_name) - reservation = var.reservation_name == "" ? null : ( - length(local.res_name_split) == 4 ? { - project : local.res_name_split[1], - name : local.res_name_split[3] - } : { - project : var.project_id, - name : var.reservation_name - } - ) + res_match = regex("^(?P(?Pprojects/(?P[a-z0-9-]+)/reservations/)?(?[a-z0-9-]+)(?P/[a-z0-9-]+/[a-z0-9-]+)?)?$", var.reservation_name) + + res_short_name = local.res_match.name + res_project = coalesce(local.res_match.project, var.project_id) + res_prefix = coalesce(local.res_match.prefix, "projects/${local.res_project}/reservations/") + res_suffix = local.res_match.suffix == null ? "" : local.res_match.suffix - reservation_name = local.reservation == null ? "" : "projects/${local.reservation.project}/reservations/${local.reservation.name}" + reservation_name = local.res_match.whole == null ? "" : "${local.res_prefix}${local.res_short_name}${local.res_suffix}" } # tflint-ignore: terraform_unused_declarations data "google_compute_reservation" "reservation" { - count = local.reservation != null ? 1 : 0 + count = length(local.reservation_name) > 0 ? 1 : 0 - name = local.reservation.name - project = local.reservation.project + name = local.res_short_name + project = local.res_project zone = var.zone lifecycle { diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index ad0409ad11..3bd8fc74fb 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -447,8 +447,8 @@ variable "access_config" { variable "reservation_name" { description = <<-EOD Name of the reservation to use for VM resources, should be in one of the following formats: - - projects/PROJECT_ID/reservations/RESERVATION_NAME - - RESERVATION_NAME + - projects/PROJECT_ID/reservations/RESERVATION_NAME[/SUFF/IX] + - RESERVATION_NAME[/SUFF/IX] Must be a "SPECIFIC" reservation Set to empty string if using no reservation or automatically-consumed reservations @@ -458,8 +458,8 @@ variable "reservation_name" { nullable = false validation { - condition = var.reservation_name == "" || length(regexall("^projects/[a-z0-9-]+/reservations/[a-z0-9-]+$", var.reservation_name)) > 0 || length(regexall("^[a-z0-9-]+$", var.reservation_name)) > 0 - error_message = "Reservation name must be in the format 'projects/PROJECT_ID/reservations/RESERVATION_NAME' or 'RESERVATION_NAME'." + condition = length(regexall("^((projects/([a-z0-9-]+)/reservations/)?([a-z0-9-]+)(/[a-z0-9-]+/[a-z0-9-]+)?)?$", var.reservation_name)) > 0 + error_message = "Reservation name must be either empty or in the format '[projects/PROJECT_ID/reservations/]RESERVATION_NAME[/SUFF/IX]', [...] are optional parts." } } From 5676c617a573bcdabd85a728c5b59db6c3d556d7 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 28 Oct 2024 23:21:05 +0000 Subject: [PATCH 049/129] SlurmGCP. Fix old Terraform regex compatibility issue. 
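Terraform's `regex()` is backed by Go's regexp engine, and binaries built with older Go releases accept only the Python-style named-group spelling `(?P<name>...)`, rejecting the shorter `(?<name>...)` form as an invalid regular expression. The change below switches the one capture group that used the short form to the `P` spelling. A minimal sketch of the difference follows; the group name and sample value are illustrative only, and the real group names follow the `local.res_match.*` references in main.tf:

```hcl
locals {
  # Portable across Terraform versions: Python-style named capture group.
  res_ok = regex("^(?P<name>[a-z0-9-]+)$", "my-reservation")

  # Short-form named group; older Terraform binaries reject this pattern.
  # res_bad = regex("^(?<name>[a-z0-9-]+)$", "my-reservation")
}

output "reservation_short_name" {
  value = local.res_ok.name # => "my-reservation"
}
```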
--- community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 6e680f002b..ffaa9d4302 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -130,7 +130,7 @@ data "google_compute_zones" "available" { } locals { - res_match = regex("^(?P(?Pprojects/(?P[a-z0-9-]+)/reservations/)?(?[a-z0-9-]+)(?P/[a-z0-9-]+/[a-z0-9-]+)?)?$", var.reservation_name) + res_match = regex("^(?P(?Pprojects/(?P[a-z0-9-]+)/reservations/)?(?P[a-z0-9-]+)(?P/[a-z0-9-]+/[a-z0-9-]+)?)?$", var.reservation_name) res_short_name = local.res_match.name res_project = coalesce(local.res_match.project, var.project_id) From 7885a22d758847205416902096dfa1c578f3552b Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Mon, 28 Oct 2024 23:54:42 +0000 Subject: [PATCH 050/129] add training example for gke parallelstore blueprint --- examples/gke-storage-parallelstore.yaml | 54 ++++++++++++------- modules/compute/gke-job-template/README.md | 1 + modules/compute/gke-job-template/main.tf | 1 + .../templates/gke-job-base.yaml.tftpl | 6 +++ modules/compute/gke-job-template/variables.tf | 9 ++++ .../test-gke-storage-parallelstore.yml | 6 +-- 6 files changed, 54 insertions(+), 23 deletions(-) diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-parallelstore.yaml index 413e523da7..45b51d6c74 100644 --- a/examples/gke-storage-parallelstore.yaml +++ b/examples/gke-storage-parallelstore.yaml @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- -blueprint_name: gke-storage-parallelstore +blueprint_name: gke-storage-parallelstore-dev vars: project_id: ## Set GCP Project ID Here ## - deployment_name: gke-storage-parallelstore + deployment_name: gke-storage-parallelstore-dev region: us-central1 zone: us-central1-c @@ -67,7 +67,7 @@ deployment_groups: sc_volume_binding_mode: Immediate sc_reclaim_policy: Delete # Use Retain if you want to volume and parallelstore resource will remain after sc_topology_zones: [$(vars.zone)] - pvc_count: 2 + pvc_count: 1 capacity_gb: 12000 # from 12,000 GiB to 100,000 GiB, in multiples of 4,000 GiB - id: sample-pool @@ -76,9 +76,10 @@ deployment_groups: settings: name: sample-pool zones: [$(vars.zone)] - machine_type: n2-standard-4 + machine_type: n2-standard-16 - ### Parallelstore enabled Job ### + # Train a TensorFlow model with Keras and Parallelstore on GKE + # Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample - id: parallelstore-job source: modules/compute/gke-job-template @@ -86,22 +87,35 @@ deployment_groups: - gke_cluster - parallelstore-setup settings: - image: busybox + name: tensorflow + image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d + security_context: + - key: runAsUser + value: 1000 + - key: runAsGroup + value: 100 + - key: fsGroup + value: 100 command: - - bin/sh + - bash - -c - | - echo "Set up job folders" - shopt -s extglob; JOB=${HOSTNAME%%-+([[:digit:]])} - mkdir /data/parallelstore-pvc-0/${JOB}/ -p; - mkdir /data/parallelstore-pvc-1/${JOB}/ -p; - - echo "Writing seed data to Parallelstore volumes" - dd if=/dev/urandom of=/data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000 - dd if=/dev/urandom of=/data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat bs=1K count=1000 - - # echo "Hash file and write between the 2 hyerpdisk balanced volumes" - # md5sum /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.md5 - # md5sum /data/parallelstore-pvc-1/${JOB}/${JOB_COMPLETION_INDEX}.dat > /data/parallelstore-pvc-0/${JOB}/${JOB_COMPLETION_INDEX}.md5 - node_count: 5 + pip install transformers datasets + python - < [random\_name\_sufix](#input\_random\_name\_sufix) | Appends a random suffix to the job name to avoid clashes. | `bool` | `true` | no | | [requested\_cpu\_per\_pod](#input\_requested\_cpu\_per\_pod) | The requested cpu per pod. If null, allocatable\_cpu\_per\_node will be used to claim whole nodes. If provided will override allocatable\_cpu\_per\_node. | `number` | `-1` | no | | [restart\_policy](#input\_restart\_policy) | Job restart policy. Only a RestartPolicy equal to `Never` or `OnFailure` is allowed. | `string` | `"Never"` | no | +| [security\_context](#input\_security\_context) | The security options the container should be run with. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ |
list(object({
key = string
value = string
}))
| `[]` | no | | [tolerations](#input\_tolerations) | Tolerations allow the scheduler to schedule pods with matching taints. Generally populated from gke-node-pool via `use` field. |
list(object({
key = string
operator = string
value = string
effect = string
}))
|
[
{
"effect": "NoSchedule",
"key": "user-workload",
"operator": "Equal",
"value": "true"
}
]
| no | ## Outputs diff --git a/modules/compute/gke-job-template/main.tf b/modules/compute/gke-job-template/main.tf index cded3fbb1d..2e21c7c394 100644 --- a/modules/compute/gke-job-template/main.tf +++ b/modules/compute/gke-job-template/main.tf @@ -129,6 +129,7 @@ locals { restart_policy = var.restart_policy backoff_limit = var.backoff_limit tolerations = distinct(var.tolerations) + security_context = var.security_context labels = local.labels empty_dir_volumes = local.empty_dir_volumes diff --git a/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl b/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl index 61c34f8b25..431a519b9c 100644 --- a/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl +++ b/modules/compute/gke-job-template/templates/gke-job-base.yaml.tftpl @@ -18,6 +18,12 @@ spec: gke-gcsfuse/volumes: "true" %{~ endif ~} spec: + %{~ if length(security_context) > 0 ~} + securityContext: + %{~ for context in security_context ~} + ${context.key}: ${context.value} + %{~ endfor ~} + %{~ endif ~} %{~ if k8s_service_account_name != null ~} serviceAccountName: ${k8s_service_account_name} %{~ endif ~} diff --git a/modules/compute/gke-job-template/variables.tf b/modules/compute/gke-job-template/variables.tf index 279293cf26..6a37c344c1 100644 --- a/modules/compute/gke-job-template/variables.tf +++ b/modules/compute/gke-job-template/variables.tf @@ -92,6 +92,15 @@ variable "tolerations" { ] } +variable "security_context" { + description = "The security options the container should be run with. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/" + type = list(object({ + key = string + value = string + })) + default = [] +} + variable "machine_family" { description = "The machine family to use in the node selector (example: `n2`). If null then machine family will not be used as selector criteria." 
type = string diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml index 424908f436..adceaa1087 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml @@ -19,7 +19,7 @@ - name: Execute the job delegate_to: localhost ansible.builtin.shell: | - jobs=({{ workspace }}/{{ deployment_name }}/primary/my-job*) + jobs=({{ workspace }}/{{ deployment_name }}/primary/tensorflow*) for job in "${jobs[@]}"; do kubectl create -f "$job" done @@ -30,10 +30,10 @@ - name: Wait for job to complete delegate_to: localhost ansible.builtin.command: | - kubectl get job --field-selector status.successful=5 + kubectl get job --field-selector status.successful=1 register: job_completion until: job_completion.stdout_lines | length > 1 - retries: 40 + retries: 80 delay: 15 - name: Print job_completion debug output From 0b18f4b02164320d8de3555a6c0b9c9e239acaf9 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Mon, 28 Oct 2024 23:57:03 +0000 Subject: [PATCH 051/129] fix blueprint name --- examples/gke-storage-parallelstore.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-parallelstore.yaml index 45b51d6c74..6b88fd7913 100644 --- a/examples/gke-storage-parallelstore.yaml +++ b/examples/gke-storage-parallelstore.yaml @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -blueprint_name: gke-storage-parallelstore-dev +blueprint_name: gke-storage-parallelstore vars: project_id: ## Set GCP Project ID Here ## - deployment_name: gke-storage-parallelstore-dev + deployment_name: gke-storage-parallelstore region: us-central1 zone: us-central1-c From c531a1563a88849557aa869c5e88edeef42c4b50 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 23 Oct 2024 22:15:33 +0000 Subject: [PATCH 052/129] Update IP address module within VPC module - Support terraform-provider-google v6 - Adds support for labels on regional IP addresses --- modules/network/vpc/README.md | 3 ++- modules/network/vpc/main.tf | 8 +++++++- modules/network/vpc/variables.tf | 7 +++++++ .../igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml | 1 + .../golden_copies/expectations/igc_pkr/zero/main.tf | 1 + .../igc_tf/.ghpc/artifacts/expanded_blueprint.yaml | 1 + .../golden_copies/expectations/igc_tf/zero/main.tf | 1 + .../merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml | 1 + .../golden_copies/expectations/merge_flatten/zero/main.tf | 1 + 9 files changed, 22 insertions(+), 2 deletions(-) diff --git a/modules/network/vpc/README.md b/modules/network/vpc/README.md index 215c13a4f2..97e28d548a 100644 --- a/modules/network/vpc/README.md +++ b/modules/network/vpc/README.md @@ -172,7 +172,7 @@ No providers. | Name | Source | Version | |------|--------|---------| | [cloud\_router](#module\_cloud\_router) | terraform-google-modules/cloud-router/google | ~> 6.0 | -| [nat\_ip\_addresses](#module\_nat\_ip\_addresses) | terraform-google-modules/address/google | ~> 3.1 | +| [nat\_ip\_addresses](#module\_nat\_ip\_addresses) | terraform-google-modules/address/google | ~> 4.1 | | [vpc](#module\_vpc) | terraform-google-modules/network/google | ~> 9.0 | ## Resources @@ -196,6 +196,7 @@ No resources. 
| [firewall\_log\_config](#input\_firewall\_log\_config) | Firewall log configuration for Toolkit firewall rules (var.enable\_iap\_ssh\_ingress and others) | `string` | `"DISABLE_LOGGING"` | no | | [firewall\_rules](#input\_firewall\_rules) | List of firewall rules | `any` | `[]` | no | | [ips\_per\_nat](#input\_ips\_per\_nat) | The number of IP addresses to allocate for each regional Cloud NAT (set to 0 to disable NAT) | `number` | `2` | no | +| [labels](#input\_labels) | Labels to add to network resources that support labels. Key-value pairs of strings. | `map(string)` | `{}` | no | | [mtu](#input\_mtu) | The network MTU (default: 8896). Recommended values: 0 (use Compute Engine default), 1460 (default outside HPC environments), 1500 (Internet default), or 8896 (for Jumbo packets). Allowed are all values in the range 1300 to 8896, inclusively. | `number` | `8896` | no | | [network\_address\_range](#input\_network\_address\_range) | IP address range (CIDR) for global network | `string` | `"10.0.0.0/9"` | no | | [network\_description](#input\_network\_description) | An optional description of this resource (changes will trigger resource destroy/create) | `string` | `""` | no | diff --git a/modules/network/vpc/main.tf b/modules/network/vpc/main.tf index 76de7a3f57..3c1ceff0d2 100644 --- a/modules/network/vpc/main.tf +++ b/modules/network/vpc/main.tf @@ -14,6 +14,11 @@ * limitations under the License. */ +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "vpc", ghpc_role = "network" }) +} + locals { autoname = replace(var.deployment_name, "_", "-") network_name = var.network_name == null ? "${local.autoname}-net" : var.network_name @@ -175,7 +180,7 @@ module "vpc" { # https://github.com/terraform-google-modules/terraform-google-address/blob/v3.1.1/outputs.tf module "nat_ip_addresses" { source = "terraform-google-modules/address/google" - version = "~> 3.1" + version = "~> 4.1" for_each = toset(local.regions) @@ -184,6 +189,7 @@ module "nat_ip_addresses" { # an external, regional (not global) IP address is suited for a regional NAT address_type = "EXTERNAL" global = false + labels = local.labels names = [for idx in range(var.ips_per_nat) : "${local.network_name}-nat-ips-${each.value}-${idx}"] } diff --git a/modules/network/vpc/variables.tf b/modules/network/vpc/variables.tf index 996c5b7273..12495b6770 100644 --- a/modules/network/vpc/variables.tf +++ b/modules/network/vpc/variables.tf @@ -19,6 +19,13 @@ variable "project_id" { type = string } +variable "labels" { + description = "Labels to add to network resources that support labels. Key-value pairs of strings." 
+ type = map(string) + default = {} + nullable = false +} + variable "network_name" { description = "The name of the network to be created (if unsupplied, will default to \"{deployment_name}-net\")" type = string diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index c3f9926b11..247b05451b 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -62,6 +62,7 @@ deployment_groups: deployment_name: ((var.deployment_name)) enable_iap_rdp_ingress: true enable_iap_winrm_ingress: true + labels: ((var.labels)) project_id: ((var.project_id)) region: ((var.region)) - source: modules/file-system/filestore diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/main.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/main.tf index 516a553e29..83763abfd4 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/main.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/main.tf @@ -19,6 +19,7 @@ module "network0" { deployment_name = var.deployment_name enable_iap_rdp_ingress = true enable_iap_winrm_ingress = true + labels = var.labels project_id = var.project_id region = var.region } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index d9c215a457..5c4ee035b9 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -68,6 +68,7 @@ deployment_groups: sensitive: true settings: deployment_name: ((var.deployment_name)) + labels: ((var.labels)) project_id: ((var.project_id)) region: ((var.region)) - group: one diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/main.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/main.tf index 5904dcf49c..b76daf0303 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/main.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/main.tf @@ -24,6 +24,7 @@ terraform { module "network0" { source = "./modules/embedded/modules/network/vpc" deployment_name = var.deployment_name + labels = var.labels project_id = var.project_id region = var.region } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 46614b02e6..019e71a4e5 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -57,6 +57,7 @@ deployment_groups: id: network settings: deployment_name: ((var.deployment_name)) + labels: ((var.labels)) project_id: ((var.project_id)) region: ((var.region)) - source: modules/file-system/filestore diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/main.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/main.tf index 
214ee3b73c..7e0b0290af 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/main.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/main.tf @@ -17,6 +17,7 @@ module "network" { source = "./modules/embedded/modules/network/vpc" deployment_name = var.deployment_name + labels = var.labels project_id = var.project_id region = var.region } From bc23059217802b8ce5412e8eee36d9251ce6db91 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Tue, 29 Oct 2024 12:12:03 +0000 Subject: [PATCH 053/129] Improve Reservation Validation Error Message --- modules/compute/gke-node-pool/README.md | 34 +++++++++++++++++++ modules/compute/gke-node-pool/main.tf | 9 +++-- .../gke-node-pool/reservation_definitions.tf | 12 +++++++ 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 880e1834e4..258462f731 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -223,6 +223,40 @@ Finally, the following is adding multivpc to a node pool: ... ``` +## Using GCE Reservations +You can reserve Google Compute Engine instances in a specific zone to ensure resources are available for their workloads when needed. For more details on how to manage reservations, see [Reserving Compute Engine zonal resources](https://cloud.google.com/compute/docs/instances/reserving-zonal-resources). + +After creating a reservation, you can consume the reserved GCE VM instances in GKE. GKE clusters deployed using Cluster Toolkit support the same consumption modes as Compute Engine: NO_RESERVATION(default), ANY_RESERVATION, SPECIFIC_RESERVATION. + +This can be accomplished using [`reservation_affinity`](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/modules/compute/gke-node-pool/README.md#input_reservation_affinity). + +```yaml +# Target any reservation +reservation_affinity: + consume_reservation_type: ANY_RESERVATION + +# Target a specific reservation +reservation_affinity: + consume_reservation_type: SPECIFIC_RESERVATION + specific_reservations: + - name: specific-reservation-1 +``` + +The following requirements need to be satisfied for the node pool nodes to be able to use a specific reservation: +1. A reservation with the name must exist in the specified project(`var.project_id`) and one of the specified zones(`var.zones`). +2. Its consumption type must be `specific`. +3. Its GCE VM Properties must match with those of the Node Pool; Machine type, Accelerators (GPU Type and count), Local SSD disk type and count. + +If you want to utilise a shared reservation, the owner project of the shared reservation needs to be explicitly specified like the following. Note that a shared reservation can be used by the project that hosts the reservation (owner project) and by the projects the reservation is shared with (consumer projects). See how to [create and use a shared reservation](https://cloud.google.com/compute/docs/instances/reservations-shared). 
+ +```yaml +reservation_affinity: + consume_reservation_type: SPECIFIC_RESERVATION + specific_reservations: + - name: specific-reservation-shared + project: shared_reservation_owner_project_id +``` + ## License diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 49bee3d0fa..b556773559 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -225,9 +225,12 @@ resource "google_container_node_pool" "node_pool" { ) error_message = <<-EOT Check if your reservation is configured correctly: - 1. A reservation with the name must exist in the specified project and one of the specified zones - 2. Its consumption type must be "specific" - 3. Its VM Properties must match with those of the Node Pool; Machine type, Accelerators (GPU Type and count), Local SSD disk type and count + - A reservation with the name must exist in the specified project and one of the specified zones + + - Its consumption type must be "specific" + %{for property in local.specific_reservation_requirement_violations} + - ${local.specific_reservation_requirement_violation_messages[property]} + %{endfor} EOT } } diff --git a/modules/compute/gke-node-pool/reservation_definitions.tf b/modules/compute/gke-node-pool/reservation_definitions.tf index d40cc5b01f..26ab22808f 100644 --- a/modules/compute/gke-node-pool/reservation_definitions.tf +++ b/modules/compute/gke-node-pool/reservation_definitions.tf @@ -66,4 +66,16 @@ locals { # Know that in map comparison the order of keys does not matter. That is {NVME: x, SCSI: y} and {SCSI: y, NVME: x} are equal # As of this writing, there is only one reservation supported by the Node Pool API. So, directly accessing it from the list specific_reservation_requirement_violations = length(local.reservation_vm_properties) == 0 ? [] : [for k, v in local.nodepool_vm_properties : k if v != local.reservation_vm_properties[0][k]] + + specific_reservation_requirement_violation_messages = { + "machine_type" : <<-EOT + The reservation has "${try(local.reservation_vm_properties[0].machine_type, "")}" machine type and the node pool has "${local.nodepool_vm_properties.machine_type}". Check the relevant node pool setting: "machine_type" + EOT + "guest_accelerators" : <<-EOT + The reservation has ${jsonencode(try(local.reservation_vm_properties[0].guest_accelerators, {}))} accelerators and the node pool has ${jsonencode(try(local.nodepool_vm_properties.guest_accelerators, {}))}. Check the relevant node pool setting: "guest_accelerator". When unspecified, for the machine_type=${var.machine_type}, the default is guest_accelerator=${jsonencode(try(local.generated_guest_accelerator, [{}]))}. + EOT + "local_ssds" : <<-EOT + The reservation has ${jsonencode(try(local.reservation_vm_properties[0].local_ssds, {}))} local SSDs and the node pool has ${jsonencode(try(local.nodepool_vm_properties.local_ssds, {}))}. Check the relevant node pool settings: {local_ssd_count_ephemeral_storage, local_ssd_count_nvme_block}. When unspecified, for the machine_type=${var.machine_type} the defaults are: {local_ssd_count_ephemeral_storage=${coalesce(local.generated_local_ssd_config.local_ssd_count_ephemeral_storage, 0)}, local_ssd_count_nvme_block=${coalesce(local.generated_local_ssd_config.local_ssd_count_nvme_block, 0)}}. 
+ EOT + } } From 0d93ff9a68720bb8a13fe549b217de65a1af8728 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Tue, 29 Oct 2024 18:33:35 +0000 Subject: [PATCH 054/129] updating tpg version constraints --- pkg/config/expand.go | 4 ++-- pkg/config/expand_test.go | 4 ++-- .../igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../golden_copies/expectations/igc_pkr/zero/versions.tf | 4 ++-- .../igc_tf/.ghpc/artifacts/expanded_blueprint.yaml | 8 ++++---- .../golden_copies/expectations/igc_tf/one/versions.tf | 4 ++-- .../golden_copies/expectations/igc_tf/zero/versions.tf | 4 ++-- .../merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/merge_flatten/zero/versions.tf | 4 ++-- .../.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/versioned_blueprint/primary/versions.tf | 4 ++-- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index b79babfbe5..1ec2323f12 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -199,11 +199,11 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { return map[string]TerraformProvider{ "google": { Source: "hashicorp/google", - Version: ">= 4.84.0, < 6.8.0", + Version: ">= 5.44.2, < 6.2.0", Configuration: gglConf}, "google-beta": { Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 6.8.0", + Version: ">= 5.44.2, < 6.2.0", Configuration: gglConf}} } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 5abdd6620d..12ad651081 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -93,10 +93,10 @@ func (s *zeroSuite) TestExpandProviders(c *C) { c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ "google": TerraformProvider{ Source: "hashicorp/google", - Version: ">= 4.84.0, < 6.8.0"}, + Version: ">= 5.44.2, < 6.2.0"}, "google-beta": TerraformProvider{ Source: "hashicorp/google-beta", - Version: ">= 4.84.0, < 6.8.0"}}) + Version: ">= 5.44.2, < 6.2.0"}}) } { // no def PR, group PR diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index 247b05451b..d4209686db 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -38,14 +38,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.8.0' + version: '>= 5.44.2, < 6.2.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.8.0' + version: '>= 5.44.2, < 6.2.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index 3dd3e12681..6cad938861 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.8.0" + version = ">= 5.44.2, < 6.2.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.8.0" + version = ">= 5.44.2, < 6.2.0" } } } diff --git 
a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index 5c4ee035b9..7dba9d7a74 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -44,14 +44,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.8.0' + version: '>= 5.44.2, < 6.2.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.8.0' + version: '>= 5.44.2, < 6.2.0' configuration: project: ((var.project_id)) region: ((var.region)) @@ -80,14 +80,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.8.0' + version: '>= 5.44.2, < 6.2.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.8.0' + version: '>= 5.44.2, < 6.2.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index 3dd3e12681..6cad938861 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.8.0" + version = ">= 5.44.2, < 6.2.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.8.0" + version = ">= 5.44.2, < 6.2.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index 3dd3e12681..6cad938861 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.8.0" + version = ">= 5.44.2, < 6.2.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.8.0" + version = ">= 5.44.2, < 6.2.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 019e71a4e5..1bbb09a4f0 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,14 +39,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.8.0' + version: '>= 5.44.2, < 6.2.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.8.0' + version: '>= 5.44.2, < 6.2.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf 
b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index 3dd3e12681..6cad938861 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.8.0" + version = ">= 5.44.2, < 6.2.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.8.0" + version = ">= 5.44.2, < 6.2.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index 79bd57ecc4..9fe6a8753f 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -47,14 +47,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 4.84.0, < 6.8.0' + version: '>= 5.44.2, < 6.2.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 4.84.0, < 6.8.0' + version: '>= 5.44.2, < 6.2.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf index 3dd3e12681..6cad938861 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 6.8.0" + version = ">= 5.44.2, < 6.2.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 6.8.0" + version = ">= 5.44.2, < 6.2.0" } } } From bb77086d5e73f1ce15dc58fc76e4dddd5d2e744f Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 29 Oct 2024 18:54:40 +0000 Subject: [PATCH 055/129] Update instance_template usage in test blueprint to support TPG 5.x and 6.x --- .../test_configs/cloud-batch-cft-instance-template.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml b/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml index 2df9ca1276..4938bf1f76 100644 --- a/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml +++ b/tools/validate_configs/test_configs/cloud-batch-cft-instance-template.yaml @@ -44,7 +44,7 @@ deployment_groups: echo "Hello World" > /sw/hello.txt - id: batch-compute-template - source: github.com/terraform-google-modules/terraform-google-vm//modules/instance_template?ref=v7.8.0 + source: github.com/terraform-google-modules/terraform-google-vm//modules/instance_template?ref=v12.1.0 use: [batch-startup-script] settings: # Boiler plate to work with Cloud Foundation Toolkit From 957931448e675e864b00915f15c40560637592d2 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 29 Oct 2024 20:03:38 +0000 Subject: [PATCH 056/129] Update Slurm-GCP v5 to 5.12.1 - Enable compatibility with terraform-provider-google 6.x - Update NVIDIA 
driver to 550.90.12 --- .../README.md | 2 +- .../main.tf | 2 +- .../schedmd-slurm-gcp-v5-partition/README.md | 2 +- .../schedmd-slurm-gcp-v5-partition/main.tf | 2 +- .../schedmd-slurm-gcp-v5-controller/README.md | 18 +++++++++--------- .../schedmd-slurm-gcp-v5-controller/main.tf | 4 ++-- .../schedmd-slurm-gcp-v5-hybrid/README.md | 14 +++++++------- .../schedmd-slurm-gcp-v5-hybrid/main.tf | 2 +- .../schedmd-slurm-gcp-v5-login/README.md | 14 +++++++------- .../schedmd-slurm-gcp-v5-login/main.tf | 4 ++-- .../demo-with-cloud-controller-instructions.md | 2 +- .../deploy-instructions.md | 4 ++-- .../on-prem-instructions.md | 16 ++++++++-------- docs/image-building.md | 2 +- examples/README.md | 4 ++-- .../a3-highgpu-8g/v5-legacy/README.md | 2 +- .../ml-slurm-a3-1-image-v5-legacy.yaml | 2 +- modules/README.md | 2 +- 18 files changed, 49 insertions(+), 49 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md index cecea973e1..b064a8721d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md @@ -74,7 +74,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.0 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.1 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf index ed9365c4e6..96e3fa9de5 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf @@ -29,7 +29,7 @@ locals { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.1" slurm_cluster_name = local.slurm_cluster_name enable_job_exclusive = var.exclusive diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index f9fcc59bed..6049020468 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -151,7 +151,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.0 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.1 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf index 52e9d0a7d2..d710206935 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf @@ -40,7 +40,7 @@ data "google_compute_zones" "available" { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.1" slurm_cluster_name = local.slurm_cluster_name partition_nodes = var.node_groups diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index b9ae2ce50c..bb9ad2a65e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -22,14 +22,14 @@ controller for optimal performance at different scales. > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.1/scripts/requirements.txt > ``` -[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 -[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. -[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/scripts/requirements.txt [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions [enable\_reconfigure]: #input\_enable\_reconfigure @@ -99,12 +99,12 @@ This option has some additional requirements: development environment deploying the cluster. One can use following commands: ```bash - pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt + pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.1/scripts/requirements.txt ``` For more information, see the [description][optdeps] of this module. 
-[optdeps]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster#optional +[optdeps]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/terraform/slurm_cluster#optional ## Custom Images @@ -220,8 +220,8 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.12.0 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.0 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.12.1 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.1 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index 92d7a9d840..12ab3aaa22 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -61,7 +61,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.12.1" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name @@ -99,7 +99,7 @@ module "slurm_controller_instance" { } module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.1" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index 56cbc33b07..70ce7c80f7 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -44,7 +44,7 @@ manually. This will require addition configuration and verification of permissions. For more information see the [hybrid.md] documentation on [slurm-gcp]. -[slurm-controller-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_controller_hybrid +[slurm-controller-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/terraform/slurm_cluster/modules/slurm_controller_hybrid > **_NOTE:_** The hybrid module requires the following dependencies to be > installed on the system deploying the module: @@ -64,15 +64,15 @@ permissions. 
For more information see the [hybrid.md] documentation on [pyyaml]: https://pypi.org/project/PyYAML/ [google-api-python-client]: https://pypi.org/project/google-api-python-client/ [google-cloud-pubsub]: https://pypi.org/project/google-cloud-pubsub/ -[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/scripts/requirements.txt ### Manual Configuration This module *does not* complete the installation of hybrid partitions on your slurm cluster. After deploying, you must follow the steps listed out in the [hybrid.md] documentation under [manual steps]. -[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/docs/hybrid.md -[manual steps]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/docs/hybrid.md#manual-configurations +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/docs/hybrid.md +[manual steps]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/docs/hybrid.md#manual-configurations ### Example Usage The hybrid module can be added to a blueprint as follows: @@ -152,10 +152,10 @@ strongly advise only using versions 21 or 22 when using this module. Attempting to use this module with any version older than 21 may lead to unexpected results. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 [pre-existing-network-storage]: ../../../../modules/file-system/pre-existing-network-storage/ [schedmd-slurm-gcp-v5-partition]: ../../compute/schedmd-slurm-gcp-v5-partition/ -[packer templates]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/packer +[packer templates]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/packer ## License @@ -187,7 +187,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.12.0 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.12.1 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf index 7b8eb1171d..02c5956676 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf @@ -28,7 +28,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.12.1" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 44b337ec78..72357718b0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -10,9 +10,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. 
The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). -[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 -[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -51,8 +51,8 @@ The Cluster Toolkit team maintains the wrapper around the [slurm-on-gcp] terrafo modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1#slurm-on-google-cloud-platform ## License @@ -87,8 +87,8 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.12.0 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.0 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.12.1 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.1 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf index af9254ae74..c7c8f8b753 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf @@ -57,7 +57,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.1" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward @@ -95,7 +95,7 @@ module "slurm_login_template" { } module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.12.0" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.12.1" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name diff --git a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md 
b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md index fbc851e1db..892af3a71c 100644 --- a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md +++ b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md @@ -22,7 +22,7 @@ for use with an on-premise slurm-cluster. > further testing is done, documentation on applying the hybrid module to > on-premise slurm clusters will be added and expanded. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 ## Definitions diff --git a/docs/hybrid-slurm-cluster/deploy-instructions.md b/docs/hybrid-slurm-cluster/deploy-instructions.md index b03f5403a1..f848d2ef55 100644 --- a/docs/hybrid-slurm-cluster/deploy-instructions.md +++ b/docs/hybrid-slurm-cluster/deploy-instructions.md @@ -264,8 +264,8 @@ sudo systemctl restart slurmctld If the restart did not succeed, the logs at `/var/log/slurm/slurmctld.log` should point you in the right direction. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 -[slurm-gcp-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/docs/hybrid.md +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 +[slurm-gcp-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/docs/hybrid.md [demo-with-cloud-controller-instructions.md]: ./demo-with-cloud-controller-instructions.md ## Validate the Hybrid Cluster diff --git a/docs/hybrid-slurm-cluster/on-prem-instructions.md b/docs/hybrid-slurm-cluster/on-prem-instructions.md index 30e721dad0..4f75504a27 100644 --- a/docs/hybrid-slurm-cluster/on-prem-instructions.md +++ b/docs/hybrid-slurm-cluster/on-prem-instructions.md @@ -39,9 +39,9 @@ detail, as well as how to customize many of these assumptions to fit your needs. deployments in their [hybrid.md] documentation. 
[hybridmodule]: ../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 [slurm\_controller\_hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/master/terraform/slurm_cluster/modules/slurm_controller_hybrid -[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/docs/hybrid.md +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/docs/hybrid.md ### NFS Mounts @@ -235,12 +235,12 @@ image created with slurm 21.08.8: partition_name: compute ``` -[slurmgcppacker]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/packer -[example.pkrvars.hcl]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0/packer/example.pkrvars.hcl -[slurmversion]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/packer/variables.pkr.hcl#L97 -[`service_account_scopes`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/packer/variables.pkr.hcl#L166 -[`munge_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/ansible/roles/munge/defaults/main.yml#L17 -[`slurm_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.0/ansible/roles/slurm/defaults/main.yml#L31 +[slurmgcppacker]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/packer +[example.pkrvars.hcl]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/packer/example.pkrvars.hcl +[slurmversion]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/packer/variables.pkr.hcl#L97 +[`service_account_scopes`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/packer/variables.pkr.hcl#L166 +[`munge_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/ansible/roles/munge/defaults/main.yml#L17 +[`slurm_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/ansible/roles/slurm/defaults/main.yml#L31 ## On Premise Setup diff --git a/docs/image-building.md b/docs/image-building.md index 707fc407c4..24c1eadc54 100644 --- a/docs/image-building.md +++ b/docs/image-building.md @@ -167,7 +167,7 @@ deployment_groups: - group: packer modules: - id: custom-image - source: github.com/GoogleCloudPlatform/slurm-gcp//packer?ref=5.12.0&depth=1 + source: github.com/GoogleCloudPlatform/slurm-gcp//packer?ref=5.12.1&depth=1 kind: packer settings: use_iap: true diff --git a/examples/README.md b/examples/README.md index 5fe023512d..dcc640ffd2 100644 --- a/examples/README.md +++ b/examples/README.md @@ -216,7 +216,7 @@ the experimental badge (![experimental-badge]). > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.1/scripts/requirements.txt > ``` Creates a basic auto-scaling Slurm cluster with mostly default settings. The @@ -1149,7 +1149,7 @@ The blueprint contains 3 groups: > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.1/scripts/requirements.txt > ``` Similar to the [hpc-slurm-v5-legacy.yaml] example, but using Ubuntu 20.04 instead of CentOS 7. 
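Release bumps like this one touch pinned `?ref=` module sources, documentation links, and pip URLs together, so a leftover 5.12.0 reference is easy to miss. A hedged sketch of a repository scan for stale pins is shown below; the version strings and file extensions are illustrative assumptions, not project tooling.

```python
# Sketch: report lines that still reference the old slurm-gcp pin after a bump.
# OLD/NEW values and the scanned file types are assumptions for illustration.
import pathlib

OLD, NEW = "5.12.0", "5.12.1"
exts = {".md", ".tf", ".yaml", ".sh"}

for path in pathlib.Path(".").rglob("*"):
    if path.is_file() and path.suffix in exts:
        for lineno, line in enumerate(path.read_text(errors="ignore").splitlines(), 1):
            if OLD in line:
                print(f"{path}:{lineno}: still references {OLD}; expected {NEW}")
```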
diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md b/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md index 9800ad7128..690a968d15 100644 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md +++ b/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md @@ -40,7 +40,7 @@ installing them in a Python virtual environment: python3 -m venv toolkit-a3 source toolkit-a3/bin/activate pip3 install -r \ - https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.0/scripts/requirements.txt + https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.1/scripts/requirements.txt ``` **Always** activate the environment before running any gcluster commands such as diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml index 42a823bf8e..aa054e8c3f 100644 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml +++ b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml @@ -92,7 +92,7 @@ deployment_groups: set -e -o pipefail ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 5.12.0 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 5.12.1 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/modules/README.md b/modules/README.md index 338d6b5213..cc1bb513ef 100644 --- a/modules/README.md +++ b/modules/README.md @@ -223,7 +223,7 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [schedmd-slurm-gcp-v5-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.0 +[slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 [slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 [pbspro-client]: ../community/modules/scheduler/pbspro-client/README.md [pbspro-server]: ../community/modules/scheduler/pbspro-server/README.md From 957b719a098073db1732f7e62d27065f91132b59 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Mon, 28 Oct 2024 20:32:54 +0000 Subject: [PATCH 057/129] Update documentation on how to use shared vpc network using toolkit --- examples/README.md | 24 +++++++++++++++++++++- modules/network/pre-existing-vpc/README.md | 11 ++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index 5fe023512d..703cc8ecc2 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1676,6 +1676,28 @@ the controller and login nodes. Also since this blueprint doesn't use external IPs for compute nodes, one must needs to [set up cloud nat][cloudnat] and [set up iap][iap]. +Now, one needs to update the blueprint to include shared vpc details. In the +network configuration, update the details for shared vpc as mentioned below, + +```yaml +vars: + project_id: # update /w the service project id in which shared network will be used. + host_project_id: # update /w the host project id in which shared network is created. 
+ deployment_name: hpc-small-shared-vpc + region: us-central1 + zone: us-central1-c + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/pre-existing-vpc + settings: + project_id: $(vars.host_project_id) + network_name: # update /w shared network name + subnetwork_name: # update /w shared sub-net name +``` + [hpc-slurm-sharedvpc.yaml]: ../community/examples/hpc-slurm-sharedvpc.yaml [fs-shared-vpc]: https://cloud.google.com/filestore/docs/shared-vpc @@ -1733,7 +1755,7 @@ deployment_groups: # GitHub module over HTTPS, prefixed with github.com - source: github.com/org/repo//path/to/module - # Local absolute source, prefixed with / + # Local absolute source, prefixed with / - source: /path/to/module # Local relative (to current working directory) source, prefixed with ./ or ../ diff --git a/modules/network/pre-existing-vpc/README.md b/modules/network/pre-existing-vpc/README.md index b1257a4097..ecd4584ef5 100644 --- a/modules/network/pre-existing-vpc/README.md +++ b/modules/network/pre-existing-vpc/README.md @@ -35,6 +35,17 @@ VM will be created. > **_NOTE:_** The `project_id` and `region` settings would be inferred from the > deployment variables of the same name, but they are included here for clarity. +### Use shared-vpc + +If a network is created in different project, this module can be used to +reference the network. To use a network from a different project first make sure +you have a [cloud nat][cloudnat] and [IAP][iap] forwarding. For more details, +refer [shared-vpc][shared-vpc-doc] + +[cloudnat]: https://cloud.google.com/nat/docs/overview +[iap]: https://cloud.google.com/iap/docs/using-tcp-forwarding +[shared-vpc-doc]: ../../../examples/README.md#hpc-slurm-sharedvpcyaml-community-badge-experimental-badge + ## License From 28830c9914896bf0ed68fe4daea95479d619f265 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Tue, 29 Oct 2024 20:43:57 +0000 Subject: [PATCH 058/129] updating htcondor version constraints for mig and instance-template modules --- community/modules/compute/htcondor-execute-point/README.md | 4 ++-- community/modules/compute/htcondor-execute-point/main.tf | 4 ++-- community/modules/scheduler/htcondor-access-point/README.md | 4 ++-- community/modules/scheduler/htcondor-access-point/main.tf | 5 +++-- .../modules/scheduler/htcondor-central-manager/README.md | 4 ++-- community/modules/scheduler/htcondor-central-manager/main.tf | 4 ++-- 6 files changed, 13 insertions(+), 12 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index cb801236f1..c7068a4522 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -210,8 +210,8 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | 10.1.1 | -| [mig](#module\_mig) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | +| [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 12.1 | +| [mig](#module\_mig) | terraform-google-modules/vm/google//modules/mig | ~> 12.1 | | [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index f6292430d9..0d8171092a 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -132,7 +132,7 @@ module "startup_script" { module "execute_point_instance_template" { source = "terraform-google-modules/vm/google//modules/instance_template" - version = "10.1.1" + version = "~> 12.1" name_prefix = local.name_prefix project_id = var.project_id @@ -160,7 +160,7 @@ module "execute_point_instance_template" { module "mig" { source = "terraform-google-modules/vm/google//modules/mig" - version = "10.1.1" + version = "~> 12.1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index d93bc24b1d..05c24cb6e8 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -120,8 +120,8 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [access\_point\_instance\_template](#module\_access\_point\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 73dc845 | -| [htcondor\_ap](#module\_htcondor\_ap) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | +| [access\_point\_instance\_template](#module\_access\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 12.1 | +| [htcondor\_ap](#module\_htcondor\_ap) | terraform-google-modules/vm/google//modules/mig | ~> 12.1 | | [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index ad6f02eebf..30a71679f7 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -226,7 +226,8 @@ resource "google_compute_disk" "spool" { } module "access_point_instance_template" { - source = "github.com/terraform-google-modules/terraform-google-vm//modules/instance_template?ref=73dc845" + source = "terraform-google-modules/vm/google//modules/instance_template" + version = "~> 12.1" name_prefix = local.name_prefix project_id = var.project_id @@ -261,7 +262,7 @@ module "access_point_instance_template" { module "htcondor_ap" { source = "terraform-google-modules/vm/google//modules/mig" - version = "10.1.1" + version = "~> 12.1" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index fa98e93d05..5a94c7bc5f 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -104,8 +104,8 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [central\_manager\_instance\_template](#module\_central\_manager\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | 10.1.1 | -| [htcondor\_cm](#module\_htcondor\_cm) | terraform-google-modules/vm/google//modules/mig | 10.1.1 | +| [central\_manager\_instance\_template](#module\_central\_manager\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 12.1 | +| [htcondor\_cm](#module\_htcondor\_cm) | terraform-google-modules/vm/google//modules/mig | ~> 12.1 | | [startup\_script](#module\_startup\_script) | ../../../../modules/scripts/startup-script | n/a | ## Resources diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index 52c0d71c4f..35da433f5f 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -134,7 +134,7 @@ module "startup_script" { module "central_manager_instance_template" { source = "terraform-google-modules/vm/google//modules/instance_template" - version = "10.1.1" + version = "~> 12.1" name_prefix = local.name_prefix project_id = var.project_id @@ -160,7 +160,7 @@ module "central_manager_instance_template" { module "htcondor_cm" { source = "terraform-google-modules/vm/google//modules/mig" - version = "10.1.1" + version = "~> 12.1" project_id = var.project_id region = var.region From 109d215d8af1acee77f9ccbe4ec08d7d104ad600 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Tue, 29 Oct 2024 21:41:27 +0000 Subject: [PATCH 059/129] updating batch version constraint for instance-template module --- modules/scheduler/batch-job-template/README.md | 2 +- modules/scheduler/batch-job-template/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index a3a0b176b6..d4068a93c3 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -140,7 +140,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [instance\_template](#module\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 10.1.1 | +| [instance\_template](#module\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 12.1 | | [netstorage\_startup\_script](#module\_netstorage\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.39.0 | ## Resources diff --git a/modules/scheduler/batch-job-template/main.tf b/modules/scheduler/batch-job-template/main.tf index bb378ea7b1..0d681536c9 100644 --- a/modules/scheduler/batch-job-template/main.tf +++ b/modules/scheduler/batch-job-template/main.tf @@ -90,7 +90,7 @@ locals { module "instance_template" { source = "terraform-google-modules/vm/google//modules/instance_template" - version = "~> 10.1.1" + version = "~> 12.1" name_prefix = var.instance_template == null ? 
"${var.job_id}-instance-template" : "unused-template" project_id = var.project_id From b4b3a59aa12a93aa3c9cd1db45809273cbe34360 Mon Sep 17 00:00:00 2001 From: Akiki Liang Date: Wed, 30 Oct 2024 16:37:50 +0000 Subject: [PATCH 060/129] add example of using topology with pytorch --- .../topological-pytorch/README.md | 162 ++++++++++++++++++ .../topological-pytorch/install.sh | 26 +++ .../topological_pytorch.py | 69 ++++++++ .../topological_pytorch.sh | 54 ++++++ 4 files changed, 311 insertions(+) create mode 100644 examples/machine-learning/a3-megagpu-8g/topological-pytorch/README.md create mode 100644 examples/machine-learning/a3-megagpu-8g/topological-pytorch/install.sh create mode 100644 examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.py create mode 100644 examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.sh diff --git a/examples/machine-learning/a3-megagpu-8g/topological-pytorch/README.md b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/README.md new file mode 100644 index 0000000000..bc6317c031 --- /dev/null +++ b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/README.md @@ -0,0 +1,162 @@ + +# Topologically-aware Pytorch Distributed + +This example demonstrates how to incorporate topology information into a +pytorch distributed workload. + +Note: This requires that your nodes were created using a compact placement +policy. + +The main concept is that pytorch needs to incorporate the information from topologically-aware Slurm into its +`dist.init_process_group` function. [Slurm topology plugin is automatically configured for ClusterToolkit](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/docs/slurm-topology.md). + +Note: If you use torchrun, you may need to alter how this information is +incorporated. Using `torchrun ... --node-rank=${SLURM_NODEID}` does not seem to +properly initialize ranks based on the correct node sorting. For that reason, +we suggest using the +[env://](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization) +initialization process, which is slightly more manual but enables the fine grain +control that we want. + +## Quickstart + +Run the following commands to demonstrate topologically aware pytorch: + + # Creates a local python3 env and installs pytorch + jobid=$(sbatch --parsable install.sh) + + # Run an example of setting SLURM_HOSTFILE based on topology + sbatch --dependency=afterok:$jobid topological_pytorch.sh + +Once submitted, you should be able to view the state of the jobs with `sinfo`: + + JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) + 124 a3mega topologi username PD 0:00 8 (Dependency) + 123 a3mega install. username R 2:14 1 a3mega-a3meganodeset-0 + +Wait until job 124 is complete, then review the output in `slurm-124.out`. 
It
will look something like this (illustrative values used, your physical host will
have random characters):

    Standard
    rank hostname physical_host
    0 a3mega-a3meganodeset-0.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/00000000000000000000000000000000
    8 a3mega-a3meganodeset-1.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/dddddddddddddddddddddddddddddddd/11111111111111111111111111111111
    16 a3mega-a3meganodeset-2.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/22222222222222222222222222222222
    24 a3mega-a3meganodeset-3.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/cccccccccccccccccccccccccccccccc/33333333333333333333333333333333
    32 a3mega-a3meganodeset-4.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee/44444444444444444444444444444444
    40 a3mega-a3meganodeset-5.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/ffffffffffffffffffffffffffffffff/55555555555555555555555555555555
    48 a3mega-a3meganodeset-6.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/66666666666666666666666666666666
    56 a3mega-a3meganodeset-7.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/ffffffffffffffffffffffffffffffff/77777777777777777777777777777777
    Sorted by topology
    rank hostname physical_host
    0 a3mega-a3meganodeset-2.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa/22222222222222222222222222222222
    8 a3mega-a3meganodeset-0.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/00000000000000000000000000000000
    16 a3mega-a3meganodeset-6.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb/66666666666666666666666666666666
    24 a3mega-a3meganodeset-3.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/cccccccccccccccccccccccccccccccc/33333333333333333333333333333333
    32 a3mega-a3meganodeset-1.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/dddddddddddddddddddddddddddddddd/11111111111111111111111111111111
    40 a3mega-a3meganodeset-4.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee/44444444444444444444444444444444
    48 a3mega-a3meganodeset-5.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/ffffffffffffffffffffffffffffffff/55555555555555555555555555555555
    56 a3mega-a3meganodeset-7.c..internal /CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC/ffffffffffffffffffffffffffffffff/77777777777777777777777777777777

Which shows that the ranks are ordered by the "rack" component of the `physical_host`.
See [here](https://cloud.google.com/compute/docs/instances/use-compact-placement-policies#verify-vm-location)
for more information on compact placement policies.

## Detailed Explanation

### Setup

First we need to install pytorch. While these same concepts transfer to using
enroot/pyxis to launch containerized workloads, in this example we will just
use a local python environment:

    # Creates a local python3 env and installs pytorch
    sbatch install.sh

### Job Submission Script
Now let's review the `topological_pytorch.sh` batch job submission script. 
+ +First we set the requisite GPUDirect-TCPXO environment variables: + + NCCL_LIB_DIR="/var/lib/tcpxo/lib64" source /var/lib/tcpxo/lib64/nccl-env-profile.sh + export NCCL_FASTRAK_CTRL_DEV=enp0s12 + export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 + export NCCL_SOCKET_IFNAME=enp0s12 + export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY=/dev/aperture_devices + +and activate our python environment: + + source env/bin/activate + +Next we demonstrate the standard behavior that torchrun would use, which does +not incorporate topology into how it orders ranks among the nodes. + + # Demonstrate standard behavior + echo "Standard" + # Set the MASTER_ADDR to the first node in the Slurm Job Nodelist + export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) + # For torchrun, we only launch 1 task per node, and instruct torchrun to create + # 8 (SLURM_GPUS_PER_NODE) processes per node. + srun --ntasks-per-node=1 --nodes $SLURM_NNODES \ + python -m torch.distributed.run \ + --nproc_per_node ${SLURM_GPUS_PER_NODE} \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --nnodes $SLURM_NNODES topological_pytorch.py + +torchrun will launch 8 tasks per node, and assign ranks lexiconographically +across nodes according to the hostnames. + +For topologically-aware behavior, we launch all the tasks using Slurm's `srun`, +and will use the Slurm environment variables to initialize the torch distributed +process group, as we'll describe in the next section. + +Note: [Topology aware Slurm is enabled by default in ClusterToolkit](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/docs/slurm-topology.md) + +Slurm sets the `SLURM_PROCID` according to topology, which we will later use to +order NCCL ranks in Pytorch. The last thing we need to do is launch the job, +adding `--topology` to the script arguments to trigger the topology logic. + + srun python topological_pytorch.py --topology + +### Test Script +Next review the `topological_pytorch.py` script. There is a top level flag of +`--topology`, which controls whether pytorch is initialized using torchrun (when +`False`) or using Slurm (when `True`). The Slurm environment variables ensure +that the node ordering that Slurm uses gets translated to the Pytorch ranks. + + if args.topology: + # These are populated by Slurm + local_rank = int(os.environ["SLURM_LOCALID"]) + global_rank = int(os.environ["SLURM_PROCID"]) + world_size = int(os.environ["SLURM_NPROCS"]) + procs_per_node = int(os.environ["SLURM_NTASKS_PER_NODE"]) + + # Must set rank and world_size based on SLURM_PROCID and SLURM_NPROCS + dist.init_process_group("nccl", rank=global_rank, world_size=world_size) + else: + # These are populated by torchrun + local_rank = int(os.environ["LOCAL_RANK"]) + global_rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + procs_per_node = int(os.environ["LOCAL_WORLD_SIZE"]) + + # Torchrun handles rank allocation + dist.init_process_group("nccl") + +The remainder of the script is meant to demonstrate functionality. We use +`dist.all_gather_object` to collect the rank, hostname, and `physical_host` from +each pytorch worker, and then print the order out from global rank 0. What you +should see is that depending on the topology that Slurm uses to launch the jobs, +the ordering of this output will vary. 
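To make the grouping concrete, a small sketch (not part of the example files) can post-process the gathered `[global_rank, hostname, physical_host]` entries on rank 0 and confirm that consecutive node ranks share the middle "rack" segment of `physical_host`; the `/cluster/rack/host` layout assumed here is the same one shown in the Quickstart output.

```python
# Sketch: check that node-level ranks are grouped by rack, using the gathered
# [global_rank, hostname, physical_host] entries from dist.all_gather_object.
def racks_in_rank_order(gathered, procs_per_node=8):
    """Return the rack id for one entry per node, in ascending rank order."""
    per_node = gathered[::procs_per_node]        # one entry per node
    per_node.sort(key=lambda entry: entry[0])    # order by global rank
    # physical_host looks like /<cluster>/<rack>/<host>; take the rack segment.
    return [entry[2].split("/")[2] for entry in per_node]

def is_topologically_grouped(racks):
    """True if equal rack ids are contiguous in the rank ordering."""
    seen = []
    for rack in racks:
        if seen and rack != seen[-1] and rack in seen:
            return False
        if not seen or rack != seen[-1]:
            seen.append(rack)
    return True

# Example with the illustrative racks from the sorted table above:
print(is_topologically_grouped(["a", "b", "b", "c", "d", "e", "f", "f"]))  # True
```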
+ +### Running the Test + +Run the following commands to demonstrate topologically aware pytorch: + + # Run an example of setting SLURM_HOSTFILE based on topology + sbatch topological_pytorch.sh + +The output shows the standard vs topologically-aware behavior. See +the Quickstart section above for an example. diff --git a/examples/machine-learning/a3-megagpu-8g/topological-pytorch/install.sh b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/install.sh new file mode 100644 index 0000000000..ba44f14bdc --- /dev/null +++ b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/install.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#filename: install.sh +#submit with `sbatch install.sh` + +#SBATCH --partition=a3mega +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=1 +#SBATCH --nodes 1 + +python3 -m venv env +source env/bin/activate +pip3 install --pre torch torchvision torchaudio diff --git a/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.py b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.py new file mode 100644 index 0000000000..ed71e4ea49 --- /dev/null +++ b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.py @@ -0,0 +1,69 @@ + +#!/usr/bin/env python +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#filename: topological_pytorch.py +import os +import torch +import torch.distributed as dist +import socket +import subprocess +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--topology", action=argparse.BooleanOptionalAction) +args = parser.parse_args() + +hostname = socket.getfqdn() +if args.topology: + # These are populated by Slurm + local_rank = int(os.environ["SLURM_LOCALID"]) + global_rank = int(os.environ["SLURM_PROCID"]) + world_size = int(os.environ["SLURM_NPROCS"]) + procs_per_node = int(os.environ["SLURM_NTASKS_PER_NODE"]) + + # Must set rank and world_size based on SLURM_PROCID and SLURM_NPROCS + dist.init_process_group("nccl", rank=global_rank, world_size=world_size) +else: + # These are populated by torchrun + local_rank = int(os.environ["LOCAL_RANK"]) + global_rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + procs_per_node = int(os.environ["LOCAL_WORLD_SIZE"]) + + # Torchrun handles rank allocation + dist.init_process_group("nccl") + +# Must attach device based on the local rank. 
+torch.cuda.set_device(local_rank) + +# Get the physical host for the current task to print later +physical_host = subprocess.check_output([ + "curl", "-s", + "http://metadata.google.internal/computeMetadata/v1/instance/attributes/physical_host", + "-H", "Metadata-Flavor: Google" +]).decode('utf-8') + +# Create an output to collect from the all-gather +output = [None for _ in range(world_size)] +dist.all_gather_object(output, [global_rank, hostname, physical_host]) +if global_rank == 0: + # Print out ordered set of hostnames from all-gather + print("rank\thostname\tphysical_host") + # Skip to print every procs_per_node to keep output compact + for result in output[::procs_per_node]: + print("\t".join(map(str,result))) + +dist.destroy_process_group() diff --git a/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.sh b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.sh new file mode 100644 index 0000000000..6a62fc9151 --- /dev/null +++ b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# shellcheck disable=SC2016 +# shellcheck disable=SC2155 + +#filename: topological_pytorch.sh +#submit with `sbatch topological_pytorch.sh` +#SBATCH --partition=a3mega +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --nodes 8 + +NCCL_LIB_DIR="/var/lib/tcpxo/lib64" source /var/lib/tcpxo/lib64/nccl-env-profile.sh +export NCCL_FASTRAK_CTRL_DEV=enp0s12 +export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 +export NCCL_SOCKET_IFNAME=enp0s12 +export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY=/dev/aperture_devices + +source env/bin/activate + +export MASTER_PORT=12345 +export OMP_NUM_THREADS=12 + +# Demonstrate standard behavior +echo "Standard" +# Set the MASTER_ADDR to the first node in the Slurm Job Nodelist +export MASTER_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1) +# For torchrun, we only launch 1 task per node, and instruct torchrun to create +# 8 (SLURM_GPUS_PER_NODE) processes per node. +srun --ntasks-per-node=1 --nodes "${SLURM_NNODES}" \ + python -m torch.distributed.run \ + --nproc_per_node "${SLURM_GPUS_PER_NODE}" \ + --rdzv_endpoint "${MASTER_ADDR}":"${MASTER_PORT}" \ + --rdzv_backend c10d \ + --nnodes "${SLURM_NNODES}" topological_pytorch.py + +# Demonstrate how to incorporate topology +echo "Topologically aware" +# Run 8 tasks per node (inherited from the job script)l, since we aren't using +# torchrun in this case. 
Supply the --topology flag to the script to s +srun python topological_pytorch.py --topology From bbfadc5bc2728973e47430ae3a51af0ca5ac6587 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Wed, 30 Oct 2024 20:06:18 +0000 Subject: [PATCH 061/129] Update MTU for high gpu --- examples/gke-a3-highgpu.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml index f7f4018b0d..1c19dcd2e6 100644 --- a/examples/gke-a3-highgpu.yaml +++ b/examples/gke-a3-highgpu.yaml @@ -33,6 +33,7 @@ deployment_groups: source: modules/network/vpc settings: subnetwork_name: gke-subnet-a3-highgpu + mtu: 8244 secondary_ranges: gke-subnet-a3-highgpu: - range_name: pods @@ -59,6 +60,7 @@ deployment_groups: global_ip_address_range: 192.169.0.0/16 network_count: 4 subnetwork_cidr_suffix: 24 + mtu: 8244 - id: gke_cluster source: modules/scheduler/gke-cluster From 867fdd2ccdc924418012c06a23c53296fad96157 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Wed, 30 Oct 2024 20:30:41 +0000 Subject: [PATCH 062/129] Changing default for subnetwork_project to null --- .../compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md | 2 +- .../schedmd-slurm-gcp-v5-partition-dynamic/variables.tf | 2 +- .../modules/compute/schedmd-slurm-gcp-v5-partition/README.md | 2 +- .../compute/schedmd-slurm-gcp-v5-partition/variables.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf | 4 ++-- .../modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md index b064a8721d..d95f3167d2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md @@ -93,7 +93,7 @@ No resources. | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | | [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | | [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters). | `string` | `null` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project the subnetwork belongs to. | `string` | `""` | no | +| [subnetwork\_project](#input\_subnetwork\_project) | The project the subnetwork belongs to. | `string` | `null` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | `null` | no | ## Outputs diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf index d0c1ba162d..653862e030 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf @@ -75,7 +75,7 @@ variable "subnetwork_self_link" { variable "subnetwork_project" { description = "The project the subnetwork belongs to." 
type = string - default = "" + default = null } variable "exclusive" { diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index 6049020468..211cb24e30 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -178,7 +178,7 @@ limitations under the License. | [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | | [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters). | `string` | `null` | no | | [startup\_script](#input\_startup\_script) | Startup script that will be used by the partition VMs. | `string` | `""` | no | -| [subnetwork\_project](#input\_subnetwork\_project) | The project the subnetwork belongs to. | `string` | `""` | no | +| [subnetwork\_project](#input\_subnetwork\_project) | The project the subnetwork belongs to. | `string` | `null` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | `null` | no | | [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes | | [zone\_target\_shape](#input\_zone\_target\_shape) | Strategy for distributing VMs across zones in a region.
ANY
GCE picks zones for creating VM instances to fulfill the requested number of VMs
within present resource constraints and to maximize utilization of unused zonal
reservations.
ANY\_SINGLE\_ZONE (default)
GCE always selects a single zone for all the VMs, optimizing for resource quotas,
available reservations and general capacity.
BALANCED
GCE prioritizes acquisition of resources, scheduling VMs in zones where resources
are available while distributing VMs as evenly as possible across allowed zones
to minimize the impact of zonal failure. | `string` | `"ANY_SINGLE_ZONE"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf index 45e87da037..1ca9b96eaa 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf @@ -137,7 +137,7 @@ variable "subnetwork_self_link" { variable "subnetwork_project" { description = "The project the subnetwork belongs to." type = string - default = "" + default = null } variable "exclusive" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index 12ab3aaa22..4eb8223392 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -70,7 +70,7 @@ module "slurm_controller_instance" { region = var.region network = var.network_self_link == null ? "" : var.network_self_link subnetwork = var.subnetwork_self_link == null ? "" : var.subnetwork_self_link - subnetwork_project = var.subnetwork_project == null ? "" : var.subnetwork_project + subnetwork_project = var.subnetwork_project zone = var.zone static_ips = var.static_ips cgroup_conf_tpl = var.cgroup_conf_tpl @@ -127,7 +127,7 @@ module "slurm_controller_template" { source_image_project = local.source_image_project_normalized # requires source_image_logic.tf source_image = local.source_image # requires source_image_logic.tf network = var.network_self_link == null ? "" : var.network_self_link - subnetwork_project = var.subnetwork_project == null ? "" : var.subnetwork_project + subnetwork_project = var.subnetwork_project subnetwork = var.subnetwork_self_link == null ? "" : var.subnetwork_self_link tags = concat([local.slurm_cluster_name], var.tags) service_account = var.service_account != null ? var.service_account : { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf index c7c8f8b753..a9c937a483 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf @@ -85,7 +85,7 @@ module "slurm_login_template" { source_image_project = local.source_image_project_normalized # requires source_image_logic.tf source_image = local.source_image # requires source_image_logic.tf network = var.network_self_link == null ? "" : var.network_self_link - subnetwork_project = var.subnetwork_project == null ? "" : var.subnetwork_project + subnetwork_project = var.subnetwork_project subnetwork = var.subnetwork_self_link == null ? "" : var.subnetwork_self_link tags = concat([local.slurm_cluster_name], var.tags) service_account = var.service_account != null ? var.service_account : { From 8a1cb69a458dced02b82c75fe95c513f35194a31 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Thu, 31 Oct 2024 13:29:22 +0000 Subject: [PATCH 063/129] Small change to README --- modules/packer/custom-image/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/packer/custom-image/README.md b/modules/packer/custom-image/README.md index 873d3b993b..94169ef3c8 100644 --- a/modules/packer/custom-image/README.md +++ b/modules/packer/custom-image/README.md @@ -36,7 +36,7 @@ This can be achieved by one of the following 2 approaches: 1. 
Using a public IP address on the VM -- Set [var.omit_external_ip](#input_omit_external_ip) to `true` +- Set [var.omit_external_ip](#input_omit_external_ip) to `false` 1. Configuring a VPC with a Cloud NAT in the region of the VM From f87b572c6106f290602e481287586c977e6f57c4 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Thu, 31 Oct 2024 16:21:31 +0000 Subject: [PATCH 064/129] Remove the default taints from system pool and user nodepool --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/variables.tf | 6 +----- modules/scheduler/gke-cluster/README.md | 2 +- modules/scheduler/gke-cluster/variables.tf | 6 +----- 4 files changed, 4 insertions(+), 12 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 880e1834e4..c22d4e9396 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -307,7 +307,7 @@ limitations under the License. | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [static\_node\_count](#input\_static\_node\_count) | The static number of nodes in the node pool. If set, autoscaling will be disabled. | `number` | `null` | no | -| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "user-workload",
"value": true
}
]
| no | +| [taints](#input\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
| `[]` | no | | [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index f5f31abde0..11442ceb4a 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -230,11 +230,7 @@ variable "taints" { value = any effect = string })) - default = [{ - key = "user-workload" - value = true - effect = "NO_SCHEDULE" - }] + default = [] } variable "labels" { diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 56b6236066..16bd3c8c49 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -182,7 +182,7 @@ limitations under the License. | [system\_node\_pool\_machine\_type](#input\_system\_node\_pool\_machine\_type) | Machine type for the system node pool. | `string` | `"e2-standard-4"` | no | | [system\_node\_pool\_name](#input\_system\_node\_pool\_name) | Name of the system node pool. | `string` | `"system"` | no | | [system\_node\_pool\_node\_count](#input\_system\_node\_pool\_node\_count) | The total min and max nodes to be maintained in the system node pool. |
object({
total_min_nodes = number
total_max_nodes = number
})
|
{
"total_max_nodes": 10,
"total_min_nodes": 2
}
| no | -| [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "components.gke.io/gke-managed-components",
"value": true
}
]
| no | +| [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
| `[]` | no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index a291d58a1a..22734cf512 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -170,11 +170,7 @@ variable "system_node_pool_taints" { value = any effect = string })) - default = [{ - key = "components.gke.io/gke-managed-components" - value = true - effect = "NO_SCHEDULE" - }] + default = [] } variable "system_node_pool_kubernetes_labels" { From 7cc8c41bf48def4c6f1ef2c315de63878fa554c9 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Thu, 31 Oct 2024 18:07:02 +0000 Subject: [PATCH 065/129] updating workload-identity module from v29 to v34 --- modules/scheduler/gke-cluster/README.md | 2 +- modules/scheduler/gke-cluster/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 56b6236066..637a73ab1e 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -125,7 +125,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| | [kubectl\_apply](#module\_kubectl\_apply) | ../../management/kubectl-apply | n/a | -| [workload\_identity](#module\_workload\_identity) | terraform-google-modules/kubernetes-engine/google//modules/workload-identity | 29.0.0 | +| [workload\_identity](#module\_workload\_identity) | terraform-google-modules/kubernetes-engine/google//modules/workload-identity | ~> 34.0 | ## Resources diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index b57fec89cd..c7aaf52eef 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -290,7 +290,7 @@ provider "kubernetes" { module "workload_identity" { count = var.configure_workload_identity_sa ? 
1 : 0 source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity" - version = "29.0.0" + version = "~> 34.0" use_existing_gcp_sa = true name = "workload-identity-k8-sa" From 998bbc127b1b1a6757af94b328283281a899f23f Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Thu, 31 Oct 2024 15:57:28 +0000 Subject: [PATCH 066/129] Initial commit to fix SSH issues with integration tests --- .../base-integration-test.yml | 9 ++-- .../htcondor-integration-test.yml | 9 ++-- .../slurm-integration-test.yml | 15 +++---- .../ansible_playbooks/tasks/wait-for-host.yml | 41 +++++++++++++++++++ 4 files changed, 60 insertions(+), 14 deletions(-) create mode 100644 tools/cloud-build/daily-tests/ansible_playbooks/tasks/wait-for-host.yml diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml index 8fd04acac8..66a2908869 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml @@ -93,7 +93,7 @@ ansible.builtin.debug: var: remote_ip - ## Setup firewall for cloud build + # Setup firewall for cloud build - name: Create firewall rule register: fw_result changed_when: fw_result.rc == 0 @@ -132,8 +132,11 @@ groups: [remote_host] when: remote_ip | ansible.utils.ipaddr - - name: Wait for cluster - ansible.builtin.wait_for_connection: + - name: Wait for host tasks + ansible.builtin.include_tasks: tasks/wait-for-host.yml + vars: + host_ip: "{{ remote_ip }}" + ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" ## Cleanup and fail gracefully rescue: diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml index 74c7c2ea46..480042a810 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml @@ -102,6 +102,11 @@ - --ttl - 2h - "--key-file=/builder/home/.ssh/id_rsa.pub" + - name: Wait for host tasks + ansible.builtin.include_tasks: tasks/wait-for-host.yml + vars: + host_ip: "{{ access_ip.stdout }}" + ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" rescue: - name: Delete Firewall Rule register: fw_deleted @@ -146,10 +151,6 @@ vars: ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" block: - - name: Wait until host is reachable - ansible.builtin.wait_for_connection: - delay: 60 - timeout: 300 - name: Gather facts ansible.builtin.setup: - name: Wait until HTCondor daemon is up diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml index 4afc25457a..8c11fc2848 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml @@ -133,7 +133,7 @@ - --rules=tcp:22 - --source-ranges={{ build_ip.stdout }} - - name: 'Add SSH Keys to OS-Login' + - name: Add SSH Keys to OS-Login register: key_created changed_when: key_created.rc == 0 ansible.builtin.command: @@ -153,6 +153,12 @@ groups: [remote_host] when: login_ip | ansible.utils.ipaddr + - name: Wait for host tasks + ansible.builtin.include_tasks: tasks/wait-for-host.yml + vars: + host_ip: "{{ login_ip }}" + ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" + ## Cleanup and fail 
gracefully rescue: - name: Capture gcluster stderr @@ -186,14 +192,9 @@ tasks: - name: Slurm Test Block vars: - ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" ansible_remote_tmp: "/tmp/gcluster/" + ansible_ssh_private_key_file: "/builder/home/.ssh/id_rsa" block: - - name: Wait until host is reachable - ansible.builtin.wait_for_connection: - delay: 60 - timeout: 300 - - name: Gather facts ansible.builtin.setup: diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/wait-for-host.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/wait-for-host.yml new file mode 100644 index 0000000000..27dbf53109 --- /dev/null +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/wait-for-host.yml @@ -0,0 +1,41 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +- name: Assert variables are defined + ansible.builtin.assert: + that: + - host_ip is defined + +- name: Wait for firewall to allow port 22 connection + ansible.builtin.wait_for: + host: "{{ host_ip }}" + port: 22 + delay: 60 + timeout: 300 + delegate_to: localhost + ignore_errors: true + register: port_out + +- name: Check connection to remote host + ansible.builtin.wait_for_connection: + delay: 10 + delegate_to: "{{ host_ip }}" + ignore_unreachable: true + register: connect_out + +- name: Fail on bad connections + ansible.builtin.fail: + msg: "Failed to connect to remote host {{ host_ip }}" + when: port_out is failed or connect_out is failed From cea4673360807518c8b51c226922e5d300a9c234 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Thu, 31 Oct 2024 19:24:41 +0000 Subject: [PATCH 067/129] Addressing PR review --- modules/packer/custom-image/README.md | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/modules/packer/custom-image/README.md b/modules/packer/custom-image/README.md index d31c0eef1c..c743b161e3 100644 --- a/modules/packer/custom-image/README.md +++ b/modules/packer/custom-image/README.md @@ -210,25 +210,14 @@ to the console. For example: ==> example.googlecompute.toolkit_image: Startup script, if any, has finished running. ``` -### Viewing Startup Script Logs - -The recommended method for debugging the image build process is to use Cloud -Logging. This can be done by either searching for the VM instance in the Cloud -Console or using the following template command with the variables `PROJECT_ID` -(e.g. `test_project_1`) and `INSTANCE_ID` (note: unique numerical id, not -instance name) specified: - -```shell -gcloud logging --project read 'logName=("projects//logs/GCEMetadataScripts" OR "projects//logs/google_metadata_script_runner") AND resource.labels.instance_id=' --format="table(timestamp, resource.labels.instance_id, jsonPayload.message) --order=asc -``` +### Debugging startup-script failures > [!NOTE] > There can be a delay in the propagation of the logs from the instance to > Cloud Logging, so it may require waiting a few minutes to see the full logs. 
-If the Packer image build fails, the module will output the command above -with the variables specified and can be used in a terminal without -modification. +If the Packer image build fails, the module will output a `gcloud` command +that can be used directly to review startup-script execution. ## License From 29fc6f7bf1b992c8b4e7ac0c6ebb5229a48c3f98 Mon Sep 17 00:00:00 2001 From: Akiki Liang Date: Thu, 31 Oct 2024 22:17:25 +0000 Subject: [PATCH 068/129] add details and minor corrections --- .../topological-pytorch/README.md | 26 +++++++------------ .../topological_pytorch.sh | 5 ++-- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/examples/machine-learning/a3-megagpu-8g/topological-pytorch/README.md b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/README.md index bc6317c031..2a46614f0e 100644 --- a/examples/machine-learning/a3-megagpu-8g/topological-pytorch/README.md +++ b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/README.md @@ -7,16 +7,7 @@ pytorch distributed workload. Note: This requires that your nodes were created using a compact placement policy. -The main concept is that pytorch needs to incorporate the information from topologically-aware Slurm into its -`dist.init_process_group` function. [Slurm topology plugin is automatically configured for ClusterToolkit](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/docs/slurm-topology.md). - -Note: If you use torchrun, you may need to alter how this information is -incorporated. Using `torchrun ... --node-rank=${SLURM_NODEID}` does not seem to -properly initialize ranks based on the correct node sorting. For that reason, -we suggest using the -[env://](https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization) -initialization process, which is slightly more manual but enables the fine grain -control that we want. +The main concept is that pytorch should incorporate the information from topologically-aware Slurm into its `dist.init_process_group` function. [Slurm topology plugin is automatically configured for ClusterToolkit](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/docs/slurm-topology.md). ## Quickstart @@ -28,7 +19,7 @@ Run the following commands to demonstrate topologically aware pytorch: # Run an example of setting SLURM_HOSTFILE based on topology sbatch --dependency=afterok:$jobid topological_pytorch.sh -Once submitted, you should be able to view the state of the jobs with `sinfo`: +Once submitted, you should be able to view the state of the jobs with `squeue`: JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 124 a3mega topologi username PD 0:00 8 (Dependency) @@ -98,12 +89,12 @@ not incorporate topology into how it orders ranks among the nodes. export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) # For torchrun, we only launch 1 task per node, and instruct torchrun to create # 8 (SLURM_GPUS_PER_NODE) processes per node. - srun --ntasks-per-node=1 --nodes $SLURM_NNODES \ + srun --ntasks-per-node=1 --nodes "${SLURM_NNODES}" \ python -m torch.distributed.run \ - --nproc_per_node ${SLURM_GPUS_PER_NODE} \ - --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --nproc_per_node "${SLURM_GPUS_PER_NODE}" \ + --rdzv_endpoint "${MASTER_ADDR}":"${MASTER_PORT}" \ --rdzv_backend c10d \ - --nnodes $SLURM_NNODES topological_pytorch.py + --nnodes "${SLURM_NNODES}" topological_pytorch.py torchrun will launch 8 tasks per node, and assign ranks lexiconographically across nodes according to the hostnames. 
@@ -120,6 +111,10 @@ adding `--topology` to the script arguments to trigger the topology logic. srun python topological_pytorch.py --topology +Note: Alternatively you can set the required environment variables to be populated by Slurm in the srun command. + + srun sh -c "WORLD_SIZE=\${SLURM_NPROCS} RANK=\${SLURM_PROCID} LOCAL_RANK=\${SLURM_LOCALID} LOCAL_WORLD_SIZE=\${SLURM_NTASKS_PER_NODE} python topological_pytorch.py" + ### Test Script Next review the `topological_pytorch.py` script. There is a top level flag of `--topology`, which controls whether pytorch is initialized using torchrun (when @@ -155,7 +150,6 @@ the ordering of this output will vary. Run the following commands to demonstrate topologically aware pytorch: - # Run an example of setting SLURM_HOSTFILE based on topology sbatch topological_pytorch.sh The output shows the standard vs topologically-aware behavior. See diff --git a/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.sh b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.sh index 6a62fc9151..477c92b0b5 100644 --- a/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.sh +++ b/examples/machine-learning/a3-megagpu-8g/topological-pytorch/topological_pytorch.sh @@ -49,6 +49,7 @@ srun --ntasks-per-node=1 --nodes "${SLURM_NNODES}" \ # Demonstrate how to incorporate topology echo "Topologically aware" -# Run 8 tasks per node (inherited from the job script)l, since we aren't using -# torchrun in this case. Supply the --topology flag to the script to s +# Run 8 tasks per node (inherited from the job script), since we aren't using +# torchrun in this case. Supply the --topology flag to the script to set +# global rank and world size of variables based on Slurm srun python topological_pytorch.py --topology From ba7a967a8ec08cfdc8b470e8def723ef2f4c2b3a Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Thu, 31 Oct 2024 22:30:52 +0000 Subject: [PATCH 069/129] Undo change to remove default from system pool, will handle that separately when required --- modules/scheduler/gke-cluster/README.md | 2 +- modules/scheduler/gke-cluster/variables.tf | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 16bd3c8c49..56b6236066 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -182,7 +182,7 @@ limitations under the License. | [system\_node\_pool\_machine\_type](#input\_system\_node\_pool\_machine\_type) | Machine type for the system node pool. | `string` | `"e2-standard-4"` | no | | [system\_node\_pool\_name](#input\_system\_node\_pool\_name) | Name of the system node pool. | `string` | `"system"` | no | | [system\_node\_pool\_node\_count](#input\_system\_node\_pool\_node\_count) | The total min and max nodes to be maintained in the system node pool. |
object({
total_min_nodes = number
total_max_nodes = number
})
|
{
"total_max_nodes": 10,
"total_min_nodes": 2
}
| no | -| [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
| `[]` | no | +| [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "components.gke.io/gke-managed-components",
"value": true
}
]
| no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index 22734cf512..a291d58a1a 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -170,7 +170,11 @@ variable "system_node_pool_taints" { value = any effect = string })) - default = [] + default = [{ + key = "components.gke.io/gke-managed-components" + value = true + effect = "NO_SCHEDULE" + }] } variable "system_node_pool_kubernetes_labels" { From 2e07fe9a78e7060529e662e65fbbaced471b37a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Fri, 18 Oct 2024 12:24:30 +0000 Subject: [PATCH 070/129] Imporvements for CloudSQL * add possibility to use Private Service Connect * add 8.4 version of MySQL * add posisbility to choose CloudSQL edition * add posibility to enable data caching --- .../slurm-cloudsql-federation/README.md | 10 ++++- .../slurm-cloudsql-federation/main.tf | 40 ++++++++++++++++++- .../slurm-cloudsql-federation/outputs.tf | 2 +- .../slurm-cloudsql-federation/variables.tf | 31 +++++++++++++- 4 files changed, 76 insertions(+), 7 deletions(-) diff --git a/community/modules/database/slurm-cloudsql-federation/README.md b/community/modules/database/slurm-cloudsql-federation/README.md index d15ddea672..9586a7f503 100644 --- a/community/modules/database/slurm-cloudsql-federation/README.md +++ b/community/modules/database/slurm-cloudsql-federation/README.md @@ -63,6 +63,8 @@ No modules. | Name | Type | |------|------| | [google_bigquery_connection.connection](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/bigquery_connection) | resource | +| [google_compute_address.psc](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_address) | resource | +| [google_compute_forwarding_rule.psc_consumer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_forwarding_rule) | resource | | [google_sql_database.database](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/sql_database) | resource | | [google_sql_database_instance.instance](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/sql_database_instance) | resource | | [google_sql_user.users](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/sql_user) | resource | @@ -74,19 +76,23 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [authorized\_networks](#input\_authorized\_networks) | IP address ranges as authorized networks of the Cloud SQL for MySQL instances | `list(string)` | `[]` | no | +| [data\_cache\_enabled](#input\_data\_cache\_enabled) | Whether data cache is enabled for the instance. Can be used with ENTERPRISE\_PLUS edition. | `bool` | `false` | no | | [database\_version](#input\_database\_version) | The version of the database to be created. | `string` | `"MYSQL_5_7"` | no | | [deletion\_protection](#input\_deletion\_protection) | Whether or not to allow Terraform to destroy the instance. 
| `string` | `false` | no | | [deployment\_name](#input\_deployment\_name) | The name of the current deployment | `string` | n/a | yes | +| [edition](#input\_edition) | value | `string` | `"ENTERPRISE"` | no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | n/a | yes | -| [network\_id](#input\_network\_id) | The ID of the GCE VPC network to which the instance is going to be created in.:
`projects//global/networks/`" | `string` | n/a | yes | +| [network\_id](#input\_network\_id) | The ID of the GCE VPC network to which the instance is going to be created in.:
`projects//global/networks/`" | `string` | n/a | yes | | [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection, used only as dependency for Cloud SQL creation. | `string` | `null` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes | | [region](#input\_region) | The region where SQL instance will be configured | `string` | n/a | yes | | [sql\_instance\_name](#input\_sql\_instance\_name) | name given to the sql instance for ease of identificaion | `string` | n/a | yes | | [sql\_password](#input\_sql\_password) | Password for the SQL database. | `any` | `null` | no | | [sql\_username](#input\_sql\_username) | Username for the SQL database | `string` | `"slurm"` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Self link of the network where Cloud SQL instance PSC endpoint will be created | `string` | n/a | yes | | [tier](#input\_tier) | The machine type to use for the SQL instance | `string` | n/a | yes | -| [user\_managed\_replication](#input\_user\_managed\_replication) | Replication parameters that will be used for defined secrets |
list(object({
location = string
kms_key_name = optional(string)
}))
| `[]` | no | +| [use\_psc\_connection](#input\_use\_psc\_connection) | Create Private Service Connection instead of using Private Service Access peering | `bool` | `false` | no | +| [user\_managed\_replication](#input\_user\_managed\_replication) | Replication parameters that will be used for defined secrets |
list(object({
location = string
kms_key_name = optional(string)
}))
| `[]` | no | ## Outputs diff --git a/community/modules/database/slurm-cloudsql-federation/main.tf b/community/modules/database/slurm-cloudsql-federation/main.tf index 6e2bfaceeb..3ff367eac8 100644 --- a/community/modules/database/slurm-cloudsql-federation/main.tf +++ b/community/modules/database/slurm-cloudsql-federation/main.tf @@ -48,24 +48,60 @@ resource "google_sql_database_instance" "instance" { settings { user_labels = local.labels + edition = var.edition tier = var.tier + dynamic "data_cache_config" { + for_each = var.edition == "ENTERPRISE_PLUS" ? [""] : [] + content { + data_cache_enabled = var.data_cache_enabled + } + } ip_configuration { ipv4_enabled = false - private_network = var.network_id + private_network = var.use_psc_connection ? null : var.network_id enable_private_path_for_google_cloud_services = true dynamic "authorized_networks" { - for_each = var.authorized_networks + for_each = var.use_psc_connection ? [] : var.authorized_networks iterator = ip_range content { value = ip_range.value } } + dynamic "psc_config" { + for_each = var.use_psc_connection ? [""] : [] + content { + psc_enabled = true + allowed_consumer_projects = [var.project_id] + } + } } } } +resource "google_compute_address" "psc" { + count = var.use_psc_connection ? 1 : 0 + project = var.project_id + name = local.sql_instance_name + address_type = "INTERNAL" + region = var.region + subnetwork = var.subnetwork_self_link + labels = local.labels +} + +resource "google_compute_forwarding_rule" "psc_consumer" { + count = var.use_psc_connection ? 1 : 0 + name = local.sql_instance_name + project = var.project_id + region = var.region + subnetwork = var.subnetwork_self_link + ip_address = google_compute_address.psc[0].self_link + load_balancing_scheme = "" + recreate_closed_psc = true + target = google_sql_database_instance.instance.psc_service_attachment_link +} + resource "google_sql_database" "database" { project = var.project_id name = "slurm_accounting" diff --git a/community/modules/database/slurm-cloudsql-federation/outputs.tf b/community/modules/database/slurm-cloudsql-federation/outputs.tf index 21d8bbfcc9..0d05221cd8 100644 --- a/community/modules/database/slurm-cloudsql-federation/outputs.tf +++ b/community/modules/database/slurm-cloudsql-federation/outputs.tf @@ -18,7 +18,7 @@ output "cloudsql" { description = "Describes the cloudsql instance." sensitive = true value = { - server_ip = google_sql_database_instance.instance.ip_address[0].ip_address + server_ip = var.use_psc_connection ? google_compute_address.psc[0].address : google_sql_database_instance.instance.ip_address[0].ip_address user = google_sql_user.users.name password = google_sql_user.users.password db_name = google_sql_database.database.name diff --git a/community/modules/database/slurm-cloudsql-federation/variables.tf b/community/modules/database/slurm-cloudsql-federation/variables.tf index ec41c70e9d..636fd939f8 100644 --- a/community/modules/database/slurm-cloudsql-federation/variables.tf +++ b/community/modules/database/slurm-cloudsql-federation/variables.tf @@ -26,16 +26,32 @@ variable "database_version" { type = string default = "MYSQL_5_7" validation { - condition = var.database_version == "MYSQL_5_7" || var.database_version == "MYSQL_8_0" - error_message = "The database version must be either MYSQL_5_7 or MYSQL_8_0." + condition = contains(["MYSQL_5_7", "MYSQL_8_0", "MYSQL_8_4"], var.database_version) + error_message = "The database version must be either MYSQL_5_7, MYSQL_8_0 or MYSQL_8_4." 
} } +variable "data_cache_enabled" { + description = "Whether data cache is enabled for the instance. Can be used with ENTERPRISE_PLUS edition." + type = bool + default = false +} + variable "deployment_name" { description = "The name of the current deployment" type = string } +variable "edition" { + description = "value" + type = string + validation { + condition = contains(["ENTERPRISE", "ENTERPRISE_PLUS"], var.edition) + error_message = "The database edition must be either ENTERPRISE or ENTERPRISE_PLUS" + } + default = "ENTERPRISE" +} + variable "project_id" { description = "Project in which the HPC deployment will be created" type = string @@ -97,6 +113,11 @@ variable "private_vpc_connection_peering" { default = null } +variable "subnetwork_self_link" { + description = "Self link of the network where Cloud SQL instance PSC endpoint will be created" + type = string +} + variable "user_managed_replication" { type = list(object({ location = string @@ -105,3 +126,9 @@ variable "user_managed_replication" { description = "Replication parameters that will be used for defined secrets" default = [] } + +variable "use_psc_connection" { + description = "Create Private Service Connection instead of using Private Service Access peering" + type = bool + default = false +} From adcd7073f475de82e100ecc7f8e9f8ff95235c0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Fri, 18 Oct 2024 15:28:48 +0000 Subject: [PATCH 071/129] Allow easy switching between ENTERPRISE_PLUS and ENTERPRISE --- .../modules/database/slurm-cloudsql-federation/README.md | 1 + .../modules/database/slurm-cloudsql-federation/main.tf | 6 ++++++ .../modules/database/slurm-cloudsql-federation/variables.tf | 6 ++++++ 3 files changed, 13 insertions(+) diff --git a/community/modules/database/slurm-cloudsql-federation/README.md b/community/modules/database/slurm-cloudsql-federation/README.md index 9586a7f503..c167150ae1 100644 --- a/community/modules/database/slurm-cloudsql-federation/README.md +++ b/community/modules/database/slurm-cloudsql-federation/README.md @@ -81,6 +81,7 @@ No modules. | [deletion\_protection](#input\_deletion\_protection) | Whether or not to allow Terraform to destroy the instance. | `string` | `false` | no | | [deployment\_name](#input\_deployment\_name) | The name of the current deployment | `string` | n/a | yes | | [edition](#input\_edition) | value | `string` | `"ENTERPRISE"` | no | +| [enable\_backups](#input\_enable\_backups) | Set true to enable backups | `bool` | `false` | no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | n/a | yes | | [network\_id](#input\_network\_id) | The ID of the GCE VPC network to which the instance is going to be created in.:
`projects//global/networks/`" | `string` | n/a | yes | | [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection, used only as dependency for Cloud SQL creation. | `string` | `null` | no | diff --git a/community/modules/database/slurm-cloudsql-federation/main.tf b/community/modules/database/slurm-cloudsql-federation/main.tf index 3ff367eac8..dccc68a921 100644 --- a/community/modules/database/slurm-cloudsql-federation/main.tf +++ b/community/modules/database/slurm-cloudsql-federation/main.tf @@ -77,6 +77,12 @@ resource "google_sql_database_instance" "instance" { } } } + + backup_configuration { + enabled = var.enable_backups + # to allow easy switching between ENTERPRISE and ENTERPRISE_PLUS + transaction_log_retention_days = 7 + } } } diff --git a/community/modules/database/slurm-cloudsql-federation/variables.tf b/community/modules/database/slurm-cloudsql-federation/variables.tf index 636fd939f8..f09635acea 100644 --- a/community/modules/database/slurm-cloudsql-federation/variables.tf +++ b/community/modules/database/slurm-cloudsql-federation/variables.tf @@ -52,6 +52,12 @@ variable "edition" { default = "ENTERPRISE" } +variable "enable_backups" { + description = "Set true to enable backups" + type = bool + default = false +} + variable "project_id" { description = "Project in which the HPC deployment will be created" type = string From 33d8bb18b41311f80d6c77782846cb0f05bf0798 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Mon, 21 Oct 2024 16:03:29 +0000 Subject: [PATCH 072/129] Allow providing information about disk size to scale IOPS --- .../slurm-cloudsql-federation/README.md | 8 +++++--- .../database/slurm-cloudsql-federation/main.tf | 17 ++++++++++++++--- .../slurm-cloudsql-federation/variables.tf | 13 +++++++++++++ 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/community/modules/database/slurm-cloudsql-federation/README.md b/community/modules/database/slurm-cloudsql-federation/README.md index c167150ae1..178d05d476 100644 --- a/community/modules/database/slurm-cloudsql-federation/README.md +++ b/community/modules/database/slurm-cloudsql-federation/README.md @@ -80,20 +80,22 @@ No modules. | [database\_version](#input\_database\_version) | The version of the database to be created. | `string` | `"MYSQL_5_7"` | no | | [deletion\_protection](#input\_deletion\_protection) | Whether or not to allow Terraform to destroy the instance. | `string` | `false` | no | | [deployment\_name](#input\_deployment\_name) | The name of the current deployment | `string` | n/a | yes | +| [disk\_autoresize](#input\_disk\_autoresize) | Set to false to disable automatic disk grow. | `bool` | `true` | no | +| [disk\_size\_gb](#input\_disk\_size\_gb) | Size of the database disk in GiB. | `number` | `null` | no | | [edition](#input\_edition) | value | `string` | `"ENTERPRISE"` | no | | [enable\_backups](#input\_enable\_backups) | Set true to enable backups | `bool` | `false` | no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | n/a | yes | -| [network\_id](#input\_network\_id) | The ID of the GCE VPC network to which the instance is going to be created in.:
`projects//global/networks/`" | `string` | n/a | yes | +| [network\_id](#input\_network\_id) | The ID of the GCE VPC network to which the instance is going to be created in.:
`projects//global/networks/`" | `string` | n/a | yes | | [private\_vpc\_connection\_peering](#input\_private\_vpc\_connection\_peering) | The name of the VPC Network peering connection, used only as dependency for Cloud SQL creation. | `string` | `null` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes | | [region](#input\_region) | The region where SQL instance will be configured | `string` | n/a | yes | | [sql\_instance\_name](#input\_sql\_instance\_name) | name given to the sql instance for ease of identificaion | `string` | n/a | yes | | [sql\_password](#input\_sql\_password) | Password for the SQL database. | `any` | `null` | no | | [sql\_username](#input\_sql\_username) | Username for the SQL database | `string` | `"slurm"` | no | -| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Self link of the network where Cloud SQL instance PSC endpoint will be created | `string` | n/a | yes | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Self link of the network where Cloud SQL instance PSC endpoint will be created | `string` | `null` | no | | [tier](#input\_tier) | The machine type to use for the SQL instance | `string` | n/a | yes | | [use\_psc\_connection](#input\_use\_psc\_connection) | Create Private Service Connection instead of using Private Service Access peering | `bool` | `false` | no | -| [user\_managed\_replication](#input\_user\_managed\_replication) | Replication parameters that will be used for defined secrets |
list(object({
location = string
kms_key_name = optional(string)
}))
| `[]` | no | +| [user\_managed\_replication](#input\_user\_managed\_replication) | Replication parameters that will be used for defined secrets |
list(object({
location = string
kms_key_name = optional(string)
}))
| `[]` | no | ## Outputs diff --git a/community/modules/database/slurm-cloudsql-federation/main.tf b/community/modules/database/slurm-cloudsql-federation/main.tf index dccc68a921..638e592f74 100644 --- a/community/modules/database/slurm-cloudsql-federation/main.tf +++ b/community/modules/database/slurm-cloudsql-federation/main.tf @@ -47,9 +47,12 @@ resource "google_sql_database_instance" "instance" { database_version = var.database_version settings { - user_labels = local.labels - edition = var.edition - tier = var.tier + disk_size = var.disk_size_gb + disk_autoresize = var.disk_autoresize + edition = var.edition + tier = var.tier + user_labels = local.labels + dynamic "data_cache_config" { for_each = var.edition == "ENTERPRISE_PLUS" ? [""] : [] content { @@ -84,8 +87,16 @@ resource "google_sql_database_instance" "instance" { transaction_log_retention_days = 7 } } + lifecycle { + precondition { + condition = var.disk_autoresize && var.disk_size_gb == null || !var.disk_autoresize + error_message = "If setting disk_size_gb set disk_autorize to false to prevent re-provisioning of the instance after disk auto-expansion." + } + } } + + resource "google_compute_address" "psc" { count = var.use_psc_connection ? 1 : 0 project = var.project_id diff --git a/community/modules/database/slurm-cloudsql-federation/variables.tf b/community/modules/database/slurm-cloudsql-federation/variables.tf index f09635acea..a921d60d65 100644 --- a/community/modules/database/slurm-cloudsql-federation/variables.tf +++ b/community/modules/database/slurm-cloudsql-federation/variables.tf @@ -42,6 +42,18 @@ variable "deployment_name" { type = string } +variable "disk_autoresize" { + description = "Set to false to disable automatic disk grow." + type = bool + default = true +} + +variable "disk_size_gb" { + description = "Size of the database disk in GiB." + type = number + default = null +} + variable "edition" { description = "value" type = string @@ -122,6 +134,7 @@ variable "private_vpc_connection_peering" { variable "subnetwork_self_link" { description = "Self link of the network where Cloud SQL instance PSC endpoint will be created" type = string + default = null } variable "user_managed_replication" { From 138b8205da6fb98caa4880663ef59e47f19951b3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 1 Nov 2024 10:41:19 +0000 Subject: [PATCH 073/129] Bump golang.org/x/sys from 0.25.0 to 0.26.0 Bumps [golang.org/x/sys](https://github.com/golang/sys) from 0.25.0 to 0.26.0. - [Commits](https://github.com/golang/sys/compare/v0.25.0...v0.26.0) --- updated-dependencies: - dependency-name: golang.org/x/sys dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index c6dda3d8f6..30af212272 100644 --- a/go.mod +++ b/go.mod @@ -98,7 +98,7 @@ require ( golang.org/x/crypto v0.24.0 // indirect golang.org/x/net v0.26.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/sys v0.25.0 + golang.org/x/sys v0.26.0 golang.org/x/text v0.16.0 // indirect google.golang.org/grpc v1.64.1 // indirect google.golang.org/protobuf v1.34.2 // indirect diff --git a/go.sum b/go.sum index ca8c11adb3..0ee72197bb 100644 --- a/go.sum +++ b/go.sum @@ -732,8 +732,8 @@ golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= -golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= From 24a6202255697865ab9e4e4ad1a4e574dbb52a57 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 1 Nov 2024 10:41:23 +0000 Subject: [PATCH 074/129] Bump github.com/fatih/color from 1.17.0 to 1.18.0 Bumps [github.com/fatih/color](https://github.com/fatih/color) from 1.17.0 to 1.18.0. - [Release notes](https://github.com/fatih/color/releases) - [Commits](https://github.com/fatih/color/compare/v1.17.0...v1.18.0) --- updated-dependencies: - dependency-name: github.com/fatih/color dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index c6dda3d8f6..73660bb566 100644 --- a/go.mod +++ b/go.mod @@ -21,7 +21,7 @@ require ( ) require ( - github.com/fatih/color v1.17.0 + github.com/fatih/color v1.18.0 github.com/go-git/go-billy/v5 v5.5.0 github.com/google/go-cmp v0.6.0 github.com/hashicorp/terraform-exec v0.21.0 diff --git a/go.sum b/go.sum index ca8c11adb3..9aef3d9495 100644 --- a/go.sum +++ b/go.sum @@ -252,8 +252,8 @@ github.com/envoyproxy/go-control-plane v0.9.10-0.20210907150352-cf90f659a021/go. 
github.com/envoyproxy/go-control-plane v0.10.2-0.20220325020618-49ff273808a1/go.mod h1:KJwIaB5Mv44NWtYuAOFCVOjcI94vtpEz2JU/D2v6IjE= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= -github.com/fatih/color v1.17.0 h1:GlRw1BRJxkpqUCBKzKOw098ed57fEsKeNjpTe3cSjK4= -github.com/fatih/color v1.17.0/go.mod h1:YZ7TlrGPkiz6ku9fK3TLD/pl3CpsiFyu8N92HLgmosI= +github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= +github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= From 3864a42d04b04849ef2cd11e99f03fe6d9ef60f2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 1 Nov 2024 13:13:18 +0000 Subject: [PATCH 075/129] Bump github.com/go-git/go-billy/v5 from 5.5.0 to 5.6.0 Bumps [github.com/go-git/go-billy/v5](https://github.com/go-git/go-billy) from 5.5.0 to 5.6.0. - [Release notes](https://github.com/go-git/go-billy/releases) - [Commits](https://github.com/go-git/go-billy/compare/v5.5.0...v5.6.0) --- updated-dependencies: - dependency-name: github.com/go-git/go-billy/v5 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 14 +++++++------- go.sum | 36 ++++++++++++++++++------------------ 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/go.mod b/go.mod index 49f4b2b23f..012b792751 100644 --- a/go.mod +++ b/go.mod @@ -14,7 +14,7 @@ require ( github.com/spf13/afero v1.11.0 github.com/spf13/cobra v1.8.1 github.com/zclconf/go-cty v1.15.0 - golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa + golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 google.golang.org/genproto v0.0.0-20240617180043-68d350f18fd4 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 @@ -22,7 +22,7 @@ require ( require ( github.com/fatih/color v1.18.0 - github.com/go-git/go-billy/v5 v5.5.0 + github.com/go-git/go-billy/v5 v5.6.0 github.com/google/go-cmp v0.6.0 github.com/hashicorp/terraform-exec v0.21.0 github.com/mattn/go-isatty v0.0.20 @@ -35,7 +35,7 @@ require ( cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect dario.cat/mergo v1.0.0 // indirect github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect - github.com/cyphar/filepath-securejoin v0.2.4 // indirect + github.com/cyphar/filepath-securejoin v0.2.5 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/go-logr/logr v1.4.1 // indirect @@ -50,10 +50,10 @@ require ( go.opentelemetry.io/otel v1.24.0 // indirect go.opentelemetry.io/otel/metric v1.24.0 // indirect go.opentelemetry.io/otel/trace v1.24.0 // indirect - golang.org/x/mod v0.17.0 // indirect + golang.org/x/mod v0.19.0 // indirect golang.org/x/sync v0.7.0 // indirect golang.org/x/time v0.5.0 // indirect - golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect + golang.org/x/tools v0.23.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240610135401-a8a62080eff3 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240617180043-68d350f18fd4 // indirect ) @@ -95,8 
+95,8 @@ require ( github.com/ulikunitz/xz v0.5.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.24.0 // indirect - golang.org/x/net v0.26.0 // indirect + golang.org/x/crypto v0.25.0 // indirect + golang.org/x/net v0.27.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect golang.org/x/sys v0.26.0 golang.org/x/text v0.16.0 // indirect diff --git a/go.sum b/go.sum index 20825f36fe..3d4849db05 100644 --- a/go.sum +++ b/go.sum @@ -231,8 +231,8 @@ github.com/cncf/xds/go v0.0.0-20211001041855-01bcc9b48dfe/go.mod h1:eXthEFrGJvWH github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= -github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg= -github.com/cyphar/filepath-securejoin v0.2.4/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= +github.com/cyphar/filepath-securejoin v0.2.5 h1:6iR5tXJ/e6tJZzzdMc1km3Sa7RRIVBKAK32O2s7AYfo= +github.com/cyphar/filepath-securejoin v0.2.5/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= @@ -261,8 +261,8 @@ github.com/gliderlabs/ssh v0.3.7 h1:iV3Bqi942d9huXnzEF2Mt+CY9gLu8DNM4Obd+8bODRE= github.com/gliderlabs/ssh v0.3.7/go.mod h1:zpHEXBstFnQYtGnB8k8kQLol82umzn/2/snG7alWVD8= github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 h1:+zs/tPmkDkHx3U66DAb0lQFJrpS6731Oaa12ikc+DiI= github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376/go.mod h1:an3vInlBmSxCcxctByoQdvwPiA7DTK7jaaFDBTtu0ic= -github.com/go-git/go-billy/v5 v5.5.0 h1:yEY4yhzCDuMGSv83oGxiBotRzhwhNr8VZyphhiu+mTU= -github.com/go-git/go-billy/v5 v5.5.0/go.mod h1:hmexnoNsr2SJU1Ju67OaNz5ASJY3+sHgFRpCtpDCKow= +github.com/go-git/go-billy/v5 v5.6.0 h1:w2hPNtoehvJIxR00Vb4xX94qHQi/ApZfX+nBE2Cjio8= +github.com/go-git/go-billy/v5 v5.6.0/go.mod h1:sFDq7xD3fn3E0GOwUSZqHo9lrkmx8xJhA0ZrfvjBRGM= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399 h1:eMje31YglSBqCdIqdhKBW8lokaMrL3uTkpGYlE2OOT4= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399/go.mod h1:1OCfN199q1Jm3HZlxleg+Dw/mwps2Wbk9frAWm+4FII= github.com/go-git/go-git/v5 v5.12.0 h1:7Md+ndsjrzZxbddRDZjF14qK+NN56sy6wkqaVrjZtys= @@ -440,8 +440,8 @@ github.com/mitchellh/go-testing-interface v1.14.2-0.20210821155943-2d9075ca8770 github.com/mitchellh/go-testing-interface v1.14.2-0.20210821155943-2d9075ca8770/go.mod h1:SO/iHr6q2EzbqRApt+8/E9wqebTwQn5y+UlB04bxzo0= github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= -github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI= -github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M= +github.com/onsi/gomega v1.34.1 h1:EUMJIKUjM8sKjYbtxQI9A4z2o+rruxnzNvpknOXie6k= +github.com/onsi/gomega v1.34.1/go.mod h1:kU1QgUvBDLXBJq618Xvm2LUX6rSAfRaFRTcdOeDLwwY= github.com/otiai10/copy v1.14.0 h1:dCI/t1iTdYGtkvCuBG2BgR6KZa83PTclw4U5n2wAllU= github.com/otiai10/copy v1.14.0/go.mod 
h1:ECfuL02W+/FkTWZWgQqXPWZgW9oeKCSQ5qVfSc4qc4w= github.com/otiai10/mint v1.5.1 h1:XaPLeE+9vGbuyEHem1JNk3bYc7KKqyI/na0/mLd/Kks= @@ -529,8 +529,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.24.0 h1:mnl8DM0o513X8fdIkmyFE/5hTYxbwYOjDS/+rK6qpRI= -golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= +golang.org/x/crypto v0.25.0 h1:ypSNr+bnYL2YhwoMt2zPxHFmbAN1KZs/njMG3hxUp30= +golang.org/x/crypto v0.25.0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -541,8 +541,8 @@ golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u0 golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= -golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ= -golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa/go.mod h1:zk2irFbV9DP96SEBUUAy67IdHUaZuSnrz1n472HUCLE= +golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= +golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -569,8 +569,8 @@ golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= -golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.19.0 h1:fEdghXQSo20giMthA7cd28ZC+jts4amQ3YMXiP5oMQ8= +golang.org/x/mod v0.19.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -619,8 +619,8 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.0.0-20220909164309-bea034e7d591/go.mod 
h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/net v0.0.0-20221014081412-f15817d10f9b/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= -golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= -golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys= +golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -737,8 +737,8 @@ golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= -golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= +golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -808,8 +808,8 @@ golang.org/x/tools v0.1.3/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/tools v0.23.0 h1:SGsXPZ+2l4JsgaCKkx+FQ9YZ5XEtA1GZYuoDjenLjvg= +golang.org/x/tools v0.23.0/go.mod h1:pnu6ufv6vQkll6szChhK3C3L/ruaIv5eBeztNG8wtsI= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= From 934064af9e01db7a4247261099d1d600941fff59 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 1 Nov 2024 14:42:38 +0000 Subject: [PATCH 076/129] Add support for custom Docker daemon configuration Docker daemons can be configured with a JSON file. Support for custom data-roots is particularly valuable due to the performance impact of storing Docker layers on performant storage. This commit adds support custom JSON configuration along with the proper SystemD ordering for data-root when it is specified. 
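As an illustration only (not part of this change), a blueprint could pass such a configuration through the new `docker.daemon_config` setting; the module id and the data-root path below are hypothetical values chosen to line up with a Local SSD mount:

```yaml
  # Hypothetical blueprint fragment; the id and data-root are examples, not defaults.
  - id: docker-startup
    source: modules/scripts/startup-script
    settings:
      docker:
        enabled: true
        daemon_config: |
          {
            "data-root": "/mnt/localssd/docker"
          }
```

With a value like this, the systemd override described above orders docker.service
after the mount that provides the data-root.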
The assumption that data-root may be provided by the local_ssd_filesystem option is made, but this dependency is soft (After) and does not impose a strict requirement. --- modules/scripts/startup-script/README.md | 2 +- .../startup-script/files/install_docker.yml | 34 ++++++++++++++++++- modules/scripts/startup-script/main.tf | 32 +++++++++++++++-- modules/scripts/startup-script/variables.tf | 12 +++++++ 4 files changed, 75 insertions(+), 5 deletions(-) diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index abed00d3b3..3cbaedb363 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -318,7 +318,7 @@ No modules. | [configure\_ssh\_host\_patterns](#input\_configure\_ssh\_host\_patterns) | If specified, it will automate ssh configuration by:
- Defining a Host block for every element of this variable and setting StrictHostKeyChecking to 'No'.
Ex: "hpc*", "hpc01*", "ml*"
- The first time users log-in, it will create ssh keys that are added to the authorized keys list
This requires a shared /home filesystem and relies on specifying the right prefix. | `list(string)` | `[]` | no | | [debug\_file](#input\_debug\_file) | Path to an optional local to be written with 'startup\_script'. | `string` | `null` | no | | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment, used to name GCS bucket for startup scripts. | `string` | n/a | yes | -| [docker](#input\_docker) | Install and configure Docker |
object({
enabled = optional(bool, false)
world_writable = optional(bool, false)
})
|
{
"enabled": false
}
| no | +| [docker](#input\_docker) | Install and configure Docker |
object({
enabled = optional(bool, false)
world_writable = optional(bool, false)
daemon_config = optional(string, "")
})
|
{
"enabled": false
}
| no | | [enable\_docker\_world\_writable](#input\_enable\_docker\_world\_writable) | DEPRECATED: use var.docker | `bool` | `null` | no | | [gcs\_bucket\_path](#input\_gcs\_bucket\_path) | The GCS path for storage bucket and the object, starting with `gs://`. | `string` | `null` | no | | [http\_no\_proxy](#input\_http\_no\_proxy) | Domains for which to disable http\_proxy behavior. Honored only if var.http\_proxy is set | `string` | `".google.com,.googleapis.com,metadata.google.internal,localhost,127.0.0.1"` | no | diff --git a/modules/scripts/startup-script/files/install_docker.yml b/modules/scripts/startup-script/files/install_docker.yml index 61a74ea0df..ce500d57ec 100644 --- a/modules/scripts/startup-script/files/install_docker.yml +++ b/modules/scripts/startup-script/files/install_docker.yml @@ -17,6 +17,8 @@ hosts: all become: true vars: + docker_data_root: '' + docker_daemon_config: '' enable_docker_world_writable: false tasks: - name: Check if docker is installed @@ -36,6 +38,32 @@ register: docker_installed changed_when: docker_installed.rc != 0 when: not docker_binary.stat.exists + - name: Create Docker daemon configuration + ansible.builtin.copy: + dest: /etc/docker/daemon.json + mode: 0644 + content: '{{ docker_daemon_config }}' + validate: /usr/bin/dockerd --validate --config-file %s + when: docker_daemon_config + notify: + - Restart Docker + - name: Create Docker service override directory + ansible.builtin.file: + path: /etc/systemd/system/docker.service.d + state: directory + owner: root + group: root + mode: 0755 + - name: Create Docker service override configuration + ansible.builtin.copy: + dest: /etc/systemd/system/docker.service.d/data-root.conf + mode: 0644 + content: | + [Unit] + {% if docker_data_root %} + RequiresMountsFor={{ docker_data_root }} + {% endif %} + After=create-localssd-raid.service - name: Create Docker socket override directory ansible.builtin.file: path: /etc/systemd/system/docker.socket.d @@ -72,10 +100,14 @@ ansible.builtin.service: name: docker.socket state: restarted + - name: Restart Docker + ansible.builtin.service: + name: docker.service + state: restarted post_tasks: - name: Start Docker ansible.builtin.service: - name: docker + name: docker.service state: started enabled: true diff --git a/modules/scripts/startup-script/main.tf b/modules/scripts/startup-script/main.tf index 4a8ccc1643..b694275ed0 100644 --- a/modules/scripts/startup-script/main.tf +++ b/modules/scripts/startup-script/main.tf @@ -44,7 +44,11 @@ locals { host_name_prefix = var.configure_ssh_host_patterns } - prefix_file = "/tmp/prefix_file.json" + prefix_file = "/tmp/prefix_file.json" + ansible_docker_settings_file = "/tmp/ansible_docker_settings.json" + + docker_config = try(jsondecode(var.docker.daemon_config), {}) + docker_data_root = try(local.docker_config.data-root, null) configure_ssh_runners = local.configure_ssh ? [ { @@ -90,11 +94,20 @@ locals { ] docker_runner = !var.docker.enabled ? 
[] : [ + { + type = "data" + destination = local.ansible_docker_settings_file + content = jsonencode({ + enable_docker_world_writable = var.docker.world_writable + docker_daemon_config = var.docker.daemon_config + docker_data_root = local.docker_data_root + }) + }, { type = "ansible-local" destination = "install_docker.yml" content = file("${path.module}/files/install_docker.yml") - args = "-e enable_docker_world_writable=${var.docker.world_writable}" + args = "-e \"@${local.ansible_docker_settings_file}\"" }, ] @@ -134,9 +147,9 @@ locals { local.proxy_runner, local.monitoring_agent_installer, local.ansible_installer, + local.raid_setup, # order RAID early to ensure filesystem is ready for subsequent runners local.configure_ssh_runners, local.docker_runner, - local.raid_setup, var.runners ) @@ -188,6 +201,19 @@ locals { } } +check "health_check" { + assert { + condition = local.docker_config == {} + error_message = <<-EOT + This message is only a warning. The Toolkit performs no validation of the Docker + daemon configuration. VM startup scripts will fail if the configuration file is + not a valid Docker JSON configuration. Please review the Docker documentation: + + https://docs.docker.com/engine/daemon/ + EOT + } +} + resource "random_id" "resource_name_suffix" { byte_length = 4 } diff --git a/modules/scripts/startup-script/variables.tf b/modules/scripts/startup-script/variables.tf index 3b99f3f0e6..026f0d624d 100644 --- a/modules/scripts/startup-script/variables.tf +++ b/modules/scripts/startup-script/variables.tf @@ -117,6 +117,7 @@ variable "docker" { type = object({ enabled = optional(bool, false) world_writable = optional(bool, false) + daemon_config = optional(string, "") }) default = { enabled = false @@ -126,6 +127,17 @@ variable "docker" { condition = !coalesce(var.docker.world_writable) || var.docker.enabled error_message = "var.docker.world_writable should only be set if var.docker.enabled is set to true" } + + validation { + condition = !can(coalesce(var.docker.daemon_config)) || var.docker.enabled + error_message = "var.docker.daemon_config should only be set if var.docker.enabled is set to true" + } + + validation { + condition = !can(coalesce(var.docker.daemon_config)) || can(jsondecode(var.docker.daemon_config)) + error_message = "var.docker.daemon_config should be set to a valid Docker daemon JSON configuration" + } + } # tflint-ignore: terraform_unused_declarations From 9692e20362a5e57d6ed3b4f0e8d9383fbcace2d2 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 1 Nov 2024 14:42:38 +0000 Subject: [PATCH 077/129] Quote all file modes to ensure they are interpreted as strings --- .../scripts/startup-script/files/install_docker.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/scripts/startup-script/files/install_docker.yml b/modules/scripts/startup-script/files/install_docker.yml index ce500d57ec..9e3d3bc523 100644 --- a/modules/scripts/startup-script/files/install_docker.yml +++ b/modules/scripts/startup-script/files/install_docker.yml @@ -31,7 +31,7 @@ dest: /tmp/get-docker.sh owner: root group: root - mode: 0644 + mode: '0644' when: not docker_binary.stat.exists - name: Install Docker ansible.builtin.command: sh /tmp/get-docker.sh @@ -41,7 +41,7 @@ - name: Create Docker daemon configuration ansible.builtin.copy: dest: /etc/docker/daemon.json - mode: 0644 + mode: '0644' content: '{{ docker_daemon_config }}' validate: /usr/bin/dockerd --validate --config-file %s when: docker_daemon_config @@ -53,11 +53,11 @@ state: directory 
owner: root group: root - mode: 0755 + mode: '0755' - name: Create Docker service override configuration ansible.builtin.copy: dest: /etc/systemd/system/docker.service.d/data-root.conf - mode: 0644 + mode: '0644' content: | [Unit] {% if docker_data_root %} @@ -70,12 +70,12 @@ state: directory owner: root group: root - mode: 0755 + mode: '0755' when: enable_docker_world_writable - name: Create Docker socket override configuration ansible.builtin.copy: dest: /etc/systemd/system/docker.socket.d/world-writable.conf - mode: 0644 + mode: '0644' content: | [Socket] SocketMode=0666 From 03b264f1ad4d1def512af970969700a88338a87f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Fri, 1 Nov 2024 17:10:39 +0000 Subject: [PATCH 078/129] Fix docs Source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py --- .../modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/variables.tf | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 89352dd48e..2045355fe0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -289,7 +289,7 @@ limitations under the License. | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_controller\_public\_ips](#input\_enable\_controller\_public\_ips) | If set to true. The controller will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. | `bool` | `false` | no | -| [enable\_default\_mounts](#input\_enable\_default\_mounts) | Enable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
Warning: If these are disabled, the slurm etc and munge dirs must be added
manually, or some other mechanism must be used to synchronize the slurm conf
files and the munge key across the cluster. | `bool` | `true` | no | +| [enable\_default\_mounts](#input\_enable\_default\_mounts) | Enable default global network storage from the controller
- /home
- /apps
Warning: If these are disabled, the slurm etc and munge dirs must be added
manually, or some other mechanism must be used to synchronize the slurm conf
files and the munge key across the cluster. | `bool` | `true` | no | | [enable\_devel](#input\_enable\_devel) | DEPRECATED: `enable_devel` is always on. | `bool` | `null` | no | | [enable\_external\_prolog\_epilog](#input\_enable\_external\_prolog\_epilog) | Automatically enable a script that will execute prolog and epilog scripts
shared by NFS from the controller to compute nodes. Find more details at:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/tools/prologs-epilogs/README.md | `bool` | `null` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 569d0d65ff..1fc4fb1e0f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -425,8 +425,6 @@ variable "cloud_parameters" { variable "enable_default_mounts" { description = <<-EOD Enable default global network storage from the controller - - /usr/local/etc/slurm - - /etc/munge - /home - /apps Warning: If these are disabled, the slurm etc and munge dirs must be added From 431f05ab9e67fa8e0e7127345d98c36634fce494 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Fri, 1 Nov 2024 17:33:57 +0000 Subject: [PATCH 079/129] fix comment --- examples/gke-storage-parallelstore.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-parallelstore.yaml index 6b88fd7913..9ffe737e83 100644 --- a/examples/gke-storage-parallelstore.yaml +++ b/examples/gke-storage-parallelstore.yaml @@ -89,7 +89,7 @@ deployment_groups: settings: name: tensorflow image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d - security_context: + security_context: # to make sure the job have enough access to execute the jobs and r/w from parallelstore - key: runAsUser value: 1000 - key: runAsGroup From 6a88a0da46aca2964f7627bd7fae2a9b82d297d2 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 1 Nov 2024 20:51:54 +0000 Subject: [PATCH 080/129] Drop validate flag to Docker daemon configuration The current a3-highgpu-8g image does not support the --validate flag to the dockerd daemon. In practice, this flag is really only useful in a live system when a new configuration is overwriting an active configuration (to prevent a bad update from propagating). In the Cloud, most changes are being made at first boot so it always ends in failure for the user when a bad configuration is provided. 
--- modules/scripts/startup-script/files/install_docker.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/scripts/startup-script/files/install_docker.yml b/modules/scripts/startup-script/files/install_docker.yml index 9e3d3bc523..c169b62cd9 100644 --- a/modules/scripts/startup-script/files/install_docker.yml +++ b/modules/scripts/startup-script/files/install_docker.yml @@ -43,7 +43,10 @@ dest: /etc/docker/daemon.json mode: '0644' content: '{{ docker_daemon_config }}' - validate: /usr/bin/dockerd --validate --config-file %s + # validate flag requires Docker server version 23.0 and above + # can add this back after private A3 DLVM image is deprecated + # this image comes with Docker version 20.10.17 + # validate: /usr/bin/dockerd --validate --config-file %s when: docker_daemon_config notify: - Restart Docker From d35a85cac2a0fb4590ef592e429742ab989ae3e9 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 1 Nov 2024 20:49:39 +0000 Subject: [PATCH 081/129] Adopt local SSD storage for A3 docker images --- .../a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml | 12 +++++++++++- .../a3-megagpu-8g/slurm-a3mega-cluster.yaml | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml index c51114d9bf..57dafd1f48 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml @@ -52,6 +52,7 @@ vars: enable_ops_agent: true # enable the NVIDIA DCGM daemon and integration into Cloud Ops Agent enable_nvidia_dcgm: true + localssd_mountpoint: /mnt/localssd deployment_groups: - group: cluster @@ -114,8 +115,17 @@ deployment_groups: # Failure to do will result in VMs that lose data and do not automatically # mount local SSD filesystems local_ssd_filesystem: - mountpoint: /mnt/localssd + mountpoint: $(vars.localssd_mountpoint) permissions: "1777" # must quote numeric filesystem permissions! + # Docker was successfully installed in the image, this configures it + # to use the A3-specific local SSD volumes to store container images + docker: + enabled: true + world_writable: true + daemon_config: | + { + "data-root": "$(vars.localssd_mountpoint)/docker" + } runners: - type: ansible-local destination: enable_nvidia_dcgm.yml diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml index 8d46b10c40..8b8a3eda19 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml @@ -30,6 +30,7 @@ vars: project: $(vars.project_id) enable_login_public_ips: true enable_controller_public_ips: true + localssd_mountpoint: /mnt/localssd deployment_groups: - group: cluster @@ -89,8 +90,17 @@ deployment_groups: # Failure to do will result in VMs that lose data and do not automatically # mount local SSD filesystems local_ssd_filesystem: - mountpoint: /mnt/localssd + mountpoint: $(vars.localssd_mountpoint) permissions: "1777" # must quote numeric filesystem permissions! 
+ # Docker was successfully installed in the image, this configures it + # to use the A3-specific local SSD volumes to store container images + docker: + enabled: true + world_writable: true + daemon_config: | + { + "data-root": "$(vars.localssd_mountpoint)/docker" + } runners: - type: ansible-local destination: slurm_aperture.yml From 291db2cd0b238dbae382663b3b1647ca18d444f9 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Wed, 30 Oct 2024 21:47:34 +0000 Subject: [PATCH 082/129] Update slurm references to test changing default for subnetwork_project --- community/examples/hpc-build-slurm-image.yaml | 2 +- .../README.md | 2 +- .../main.tf | 2 +- .../schedmd-slurm-gcp-v5-partition/README.md | 2 +- .../schedmd-slurm-gcp-v5-partition/main.tf | 2 +- .../README.md | 2 +- .../main.tf | 2 +- .../schedmd-slurm-gcp-v5-controller/README.md | 18 +++++++++--------- .../schedmd-slurm-gcp-v5-controller/main.tf | 4 ++-- .../schedmd-slurm-gcp-v5-hybrid/README.md | 14 +++++++------- .../schedmd-slurm-gcp-v5-hybrid/main.tf | 2 +- .../schedmd-slurm-gcp-v5-login/README.md | 14 +++++++------- .../schedmd-slurm-gcp-v5-login/main.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/README.md | 18 +++++++++--------- .../controller.tf | 4 ++-- .../schedmd-slurm-gcp-v6-controller/login.tf | 4 ++-- .../partition.tf | 4 ++-- .../schedmd-slurm-gcp-v6-login/README.md | 8 ++++---- .../demo-with-cloud-controller-instructions.md | 2 +- .../deploy-instructions.md | 4 ++-- .../on-prem-instructions.md | 16 ++++++++-------- docs/image-building.md | 2 +- examples/README.md | 4 ++-- .../a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 2 +- .../a3-highgpu-8g/v5-legacy/README.md | 2 +- .../ml-slurm-a3-1-image-v5-legacy.yaml | 2 +- .../a3-megagpu-8g/slurm-a3mega-image.yaml | 2 +- modules/README.md | 4 ++-- 28 files changed, 74 insertions(+), 74 deletions(-) diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml index a1fa81767e..67994a4be5 100644 --- a/community/examples/hpc-build-slurm-image.yaml +++ b/community/examples/hpc-build-slurm-image.yaml @@ -23,7 +23,7 @@ vars: image_build_machine_type: n2d-standard-16 build_from_image_family: hpc-rocky-linux-8 build_from_image_project: cloud-hpc-image-public - build_from_git_ref: 6.8.2 + build_from_git_ref: 6.8.5 built_image_family: my-custom-slurm built_instance_image: family: $(vars.built_image_family) diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md index b064a8721d..edd9b62673 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md @@ -74,7 +74,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.1 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.2 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf index 96e3fa9de5..38fd95b761 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf @@ -29,7 +29,7 @@ locals { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.2" slurm_cluster_name = local.slurm_cluster_name enable_job_exclusive = var.exclusive diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index 6049020468..5aba53c18d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -151,7 +151,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.1 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.12.2 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf index d710206935..2bf5bb7b30 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf @@ -40,7 +40,7 @@ data "google_compute_zones" "available" { } module "slurm_partition" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.12.2" slurm_cluster_name = local.slurm_cluster_name partition_nodes = var.node_groups diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index d0a15a8234..54ed5cf1d3 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. 
For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.5 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index 7ca868a049..bab7de7eaa 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -56,7 +56,7 @@ locals { } module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.5" project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index bb9ad2a65e..fb020afb9f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -22,14 +22,14 @@ controller for optimal performance at different scales. > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt > ``` -[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 -[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. -[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/scripts/requirements.txt [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions [enable\_reconfigure]: #input\_enable\_reconfigure @@ -99,12 +99,12 @@ This option has some additional requirements: development environment deploying the cluster. 
One can use following commands: ```bash - pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.1/scripts/requirements.txt + pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt ``` For more information, see the [description][optdeps] of this module. -[optdeps]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/terraform/slurm_cluster#optional +[optdeps]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster#optional ## Custom Images @@ -220,8 +220,8 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.12.1 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.12.2 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index 12ab3aaa22..2ad6c622e1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -61,7 +61,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.12.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.12.2" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name @@ -99,7 +99,7 @@ module "slurm_controller_instance" { } module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.2" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index 70ce7c80f7..73b6c5fcb0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -44,7 +44,7 @@ manually. This will require addition configuration and verification of permissions. For more information see the [hybrid.md] documentation on [slurm-gcp]. 
-[slurm-controller-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/terraform/slurm_cluster/modules/slurm_controller_hybrid +[slurm-controller-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_controller_hybrid > **_NOTE:_** The hybrid module requires the following dependencies to be > installed on the system deploying the module: @@ -64,15 +64,15 @@ permissions. For more information see the [hybrid.md] documentation on [pyyaml]: https://pypi.org/project/PyYAML/ [google-api-python-client]: https://pypi.org/project/google-api-python-client/ [google-cloud-pubsub]: https://pypi.org/project/google-cloud-pubsub/ -[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/scripts/requirements.txt +[requirements.txt]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/scripts/requirements.txt ### Manual Configuration This module *does not* complete the installation of hybrid partitions on your slurm cluster. After deploying, you must follow the steps listed out in the [hybrid.md] documentation under [manual steps]. -[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/docs/hybrid.md -[manual steps]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/docs/hybrid.md#manual-configurations +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/docs/hybrid.md +[manual steps]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/docs/hybrid.md#manual-configurations ### Example Usage The hybrid module can be added to a blueprint as follows: @@ -152,10 +152,10 @@ strongly advise only using versions 21 or 22 when using this module. Attempting to use this module with any version older than 21 may lead to unexpected results. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 [pre-existing-network-storage]: ../../../../modules/file-system/pre-existing-network-storage/ [schedmd-slurm-gcp-v5-partition]: ../../compute/schedmd-slurm-gcp-v5-partition/ -[packer templates]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/packer +[packer templates]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/packer ## License @@ -187,7 +187,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.12.1 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.12.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf index 02c5956676..c721a13bb3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf @@ -28,7 +28,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.12.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.12.2" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 72357718b0..787ece124c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -10,9 +10,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). -[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 -[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -51,8 +51,8 @@ The Cluster Toolkit team maintains the wrapper around the [slurm-on-gcp] terrafo modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2#slurm-on-google-cloud-platform ## License @@ -87,8 +87,8 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.12.1 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.1 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.12.2 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.12.2 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf index c7c8f8b753..40cb4728bc 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf @@ -57,7 +57,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.12.2" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward @@ -95,7 +95,7 @@ module "slurm_login_template" { } module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.12.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.12.2" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 89352dd48e..c3bc3aa12b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -11,9 +11,9 @@ The [user guide][slurm-ug] provides detailed instructions on customizing and enhancing the Slurm on GCP cluster as well as recommendations on configuring the controller for optimal performance at different scales. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 -[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_instance_template +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions @@ -238,13 +238,13 @@ limitations under the License. 
| [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | ../../../../modules/scripts/startup-script | n/a | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.2 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.5 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.5 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.2 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.2 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.2 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.5 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.5 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.5 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.8.5 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 1ce6ed158f..8803b76fce 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -43,7 +43,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.5" project_id = var.project_id region = var.region @@ -99,7 +99,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.5" access_config = 
var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index 998a8e0867..e320dbb893 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.5" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -56,7 +56,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.5" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 0d05c71f91..6287399993 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local template module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.8.5" for_each = local.nodeset_map project_id = var.project_id @@ -102,7 +102,7 @@ locals { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.2" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.8.5" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index 72a2180a12..eff81b42c1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). 
-[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 -[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2/terraform/slurm_cluster/modules/slurm_instance_template +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -53,7 +53,7 @@ modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. [slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/7 -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2#slurm-on-google-cloud-platform +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5#slurm-on-google-cloud-platform ## Requirements diff --git a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md index 892af3a71c..928caff5a4 100644 --- a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md +++ b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md @@ -22,7 +22,7 @@ for use with an on-premise slurm-cluster. > further testing is done, documentation on applying the hybrid module to > on-premise slurm clusters will be added and expanded. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 ## Definitions diff --git a/docs/hybrid-slurm-cluster/deploy-instructions.md b/docs/hybrid-slurm-cluster/deploy-instructions.md index f848d2ef55..95fb2a067f 100644 --- a/docs/hybrid-slurm-cluster/deploy-instructions.md +++ b/docs/hybrid-slurm-cluster/deploy-instructions.md @@ -264,8 +264,8 @@ sudo systemctl restart slurmctld If the restart did not succeed, the logs at `/var/log/slurm/slurmctld.log` should point you in the right direction. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 -[slurm-gcp-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/docs/hybrid.md +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 +[slurm-gcp-hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/docs/hybrid.md [demo-with-cloud-controller-instructions.md]: ./demo-with-cloud-controller-instructions.md ## Validate the Hybrid Cluster diff --git a/docs/hybrid-slurm-cluster/on-prem-instructions.md b/docs/hybrid-slurm-cluster/on-prem-instructions.md index 4f75504a27..fbfb8750c6 100644 --- a/docs/hybrid-slurm-cluster/on-prem-instructions.md +++ b/docs/hybrid-slurm-cluster/on-prem-instructions.md @@ -39,9 +39,9 @@ detail, as well as how to customize many of these assumptions to fit your needs. deployments in their [hybrid.md] documentation. 
[hybridmodule]: ../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 [slurm\_controller\_hybrid]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/master/terraform/slurm_cluster/modules/slurm_controller_hybrid -[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/docs/hybrid.md +[hybrid.md]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/docs/hybrid.md ### NFS Mounts @@ -235,12 +235,12 @@ image created with slurm 21.08.8: partition_name: compute ``` -[slurmgcppacker]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/packer -[example.pkrvars.hcl]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1/packer/example.pkrvars.hcl -[slurmversion]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/packer/variables.pkr.hcl#L97 -[`service_account_scopes`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/packer/variables.pkr.hcl#L166 -[`munge_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/ansible/roles/munge/defaults/main.yml#L17 -[`slurm_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.1/ansible/roles/slurm/defaults/main.yml#L31 +[slurmgcppacker]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/packer +[example.pkrvars.hcl]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2/packer/example.pkrvars.hcl +[slurmversion]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/packer/variables.pkr.hcl#L97 +[`service_account_scopes`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/packer/variables.pkr.hcl#L166 +[`munge_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/ansible/roles/munge/defaults/main.yml#L17 +[`slurm_user`]: https://github.com/GoogleCloudPlatform/slurm-gcp/blob/5.12.2/ansible/roles/slurm/defaults/main.yml#L31 ## On Premise Setup diff --git a/docs/image-building.md b/docs/image-building.md index 24c1eadc54..98ae56e203 100644 --- a/docs/image-building.md +++ b/docs/image-building.md @@ -167,7 +167,7 @@ deployment_groups: - group: packer modules: - id: custom-image - source: github.com/GoogleCloudPlatform/slurm-gcp//packer?ref=5.12.1&depth=1 + source: github.com/GoogleCloudPlatform/slurm-gcp//packer?ref=5.12.2&depth=1 kind: packer settings: use_iap: true diff --git a/examples/README.md b/examples/README.md index 20533db95f..b1cafa8c6b 100644 --- a/examples/README.md +++ b/examples/README.md @@ -216,7 +216,7 @@ the experimental badge (![experimental-badge]). > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt > ``` Creates a basic auto-scaling Slurm cluster with mostly default settings. The @@ -1149,7 +1149,7 @@ The blueprint contains 3 groups: > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.1/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt > ``` Similar to the [hpc-slurm-v5-legacy.yaml] example, but using Ubuntu 20.04 instead of CentOS 7. 
diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index 3d4db65c21..62abc006fe 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -95,7 +95,7 @@ deployment_groups: set -e -o pipefail ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.2 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.5 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md b/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md index 690a968d15..96087ef64c 100644 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md +++ b/examples/machine-learning/a3-highgpu-8g/v5-legacy/README.md @@ -40,7 +40,7 @@ installing them in a Python virtual environment: python3 -m venv toolkit-a3 source toolkit-a3/bin/activate pip3 install -r \ - https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.1/scripts/requirements.txt + https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/5.12.2/scripts/requirements.txt ``` **Always** activate the environment before running any gcluster commands such as diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml index 1275a788d8..88d5cefda8 100644 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml +++ b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml @@ -93,7 +93,7 @@ deployment_groups: set -e -o pipefail ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 5.12.1 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 5.12.2 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index 07e06ac8ac..462e0752c3 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -109,7 +109,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.2 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.8.5 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/modules/README.md b/modules/README.md index cc1bb513ef..c5f1df282a 100644 --- a/modules/README.md +++ b/modules/README.md @@ -223,8 +223,8 @@ Pub/Sub subscription. 
Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [schedmd-slurm-gcp-v5-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.1 -[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.2 +[slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.12.2 +[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.8.5 [pbspro-client]: ../community/modules/scheduler/pbspro-client/README.md [pbspro-server]: ../community/modules/scheduler/pbspro-server/README.md From aa48f496f54e9517e97805508954fa502a524f10 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Fri, 1 Nov 2024 22:40:49 +0000 Subject: [PATCH 083/129] Updates maintenance.py to support new format --- tools/maintenance/maintenance.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/maintenance/maintenance.py b/tools/maintenance/maintenance.py index 7a22623a8f..c65edd6368 100755 --- a/tools/maintenance/maintenance.py +++ b/tools/maintenance/maintenance.py @@ -35,6 +35,13 @@ "upcomingMaintenance.startTimeWindow.earliest," \ "upcomingMaintenance.startTimeWindow.latest," \ "upcomingMaintenance.canReschedule,upcomingMaintenance.type)'" + +UPDATED_UPC_MAINT_CMD = "gcloud alpha compute instances list --project={}" \ + " --filter='upcomingMaintenance:*' --format='value(name," \ + "upcomingMaintenance.latestWindowStartTime," \ + "upcomingMaintenance.windowEndTime," \ + "upcomingMaintenance.canReschedule,upcomingMaintenance.type)'" + PER_MAINT_CMD = "gcloud alpha compute instances list --project={}" \ " --filter=scheduling.maintenanceInterval:PERIODIC " \ " --format='value(name)'" @@ -72,6 +79,9 @@ def get_upcoming_maintenance(project: str) -> List[str]: err_msg = "Error getting upcoming maintenance list" res = run_command(UPC_MAINT_CMD.format(project), err_msg) + # Check if all output was received. If length is 3, two of the filters failed since the maintenance output is using new format. + if len(res.stdout.split()) == 3: + res = run_command(UPDATED_UPC_MAINT_CMD.format(project), err_msg) upc_maint = [x.split() for x in res.stdout.split("\n")[:-1]] return upc_maint From 05a79fc8a769f0e0bdbcf4e7a5b6191296da418a Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 1 Nov 2024 23:32:19 +0000 Subject: [PATCH 084/129] SlurmGCP. 
Escape GCP error reasons that may cause malformed CLI args --- .../modules/slurm_files/scripts/resume.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 31658cd96a..e113cf23e4 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -19,6 +19,7 @@ import argparse import collections from datetime import timedelta +import shlex import json import logging import os @@ -480,7 +481,9 @@ def down_nodes(nodelist, reason): if isinstance(nodelist, list): nodelist = util.to_hostlist(nodelist) update_job_comment(nodelist, reason) - run(f"{lookup().scontrol} update nodename={nodelist} state=down reason='{reason}'") + reason_quoted = shlex.quote(reason) + log.error(f"Marking nodes {nodelist} as DOWN, reason: {reason}") + run(f"{lookup().scontrol} update nodename={nodelist} state=down reason={reason_quoted}") def hold_job(job_id, reason): From 95646b2232676f30b81869f2502a102a15b584b7 Mon Sep 17 00:00:00 2001 From: Lin Guo Date: Sun, 3 Nov 2024 00:14:56 -0700 Subject: [PATCH 085/129] Add c4a to the nosmt list This fixes the issue where Slurm wrongly considers c4a to have smt enabled (threads_per_core=2). In truth, smt is not applicable for Arm machines. Test with the change: ```sh # As indicated from lscpu, c4a always has 1 thread per core linsword_google_com@c4atest-c4ahighmemnode-0:~$ lscpu | grep "per " Thread(s) per core: 1 Core(s) per socket: 72 # Verify the number looks correct with the change linsword_google_com@c4atest-login-001:~$ scontrol show nodes | grep 72 | head -n2 NodeName=c4atest-c4ahighmemnode-0 Arch=aarch64 CoresPerSocket=72 CPUAlloc=0 CPUEfctv=72 CPUTot=72 CPULoad=0.06 ``` --- .../modules/slurm_files/scripts/util.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 68716e51bf..d046c27357 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -454,7 +454,7 @@ def _list_config_blobs() -> Tuple[Any, str]: if res["core"] is None: raise DeffetiveStoredConfigError("config.yaml not found in bucket") return res, hash.hexdigest() - + def _fetch_config(old_hash: Optional[str]) -> Optional[Tuple[NSDict, str]]: """Fetch config from bucket, returns None if no changes are detected.""" @@ -1156,7 +1156,12 @@ def machine_type_sockets(template) -> int: def isSmt(template) -> bool: # https://cloud.google.com/compute/docs/cpu-platforms - noSmtFamily = ("t2a", "t2d", "h3",) + noSmtFamily = ( + "t2a", + "t2d", + "h3", + "c4a", + ) if machine_type_family(template.machineType) in noSmtFamily: return False if template.machine_info.guestCpus == 1: From 991fea3a0bb884a0f3301227a8839be5814f2f79 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 4 Nov 2024 15:48:34 +0000 Subject: [PATCH 086/129] Allow latest Terraform google plugin PR #3195 brings in a fix to address a Terraform behavioral change introduced in plugin v6.2.0 and above by 
https://github.com/hashicorp/terraform-provider-google/commit/bd42980115a2f598c129b9a81fe0a603b0f384d6 --- pkg/config/expand.go | 4 ++-- pkg/config/expand_test.go | 4 ++-- .../igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../golden_copies/expectations/igc_pkr/zero/versions.tf | 4 ++-- .../igc_tf/.ghpc/artifacts/expanded_blueprint.yaml | 8 ++++---- .../golden_copies/expectations/igc_tf/one/versions.tf | 4 ++-- .../golden_copies/expectations/igc_tf/zero/versions.tf | 4 ++-- .../merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/merge_flatten/zero/versions.tf | 4 ++-- .../.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/versioned_blueprint/primary/versions.tf | 4 ++-- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 1ec2323f12..5e73706f15 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -199,11 +199,11 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { return map[string]TerraformProvider{ "google": { Source: "hashicorp/google", - Version: ">= 5.44.2, < 6.2.0", + Version: ">= 5.44.2, < 6.10.0", Configuration: gglConf}, "google-beta": { Source: "hashicorp/google-beta", - Version: ">= 5.44.2, < 6.2.0", + Version: ">= 5.44.2, < 6.10.0", Configuration: gglConf}} } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 12ad651081..17b8304900 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -93,10 +93,10 @@ func (s *zeroSuite) TestExpandProviders(c *C) { c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ "google": TerraformProvider{ Source: "hashicorp/google", - Version: ">= 5.44.2, < 6.2.0"}, + Version: ">= 5.44.2, < 6.10.0"}, "google-beta": TerraformProvider{ Source: "hashicorp/google-beta", - Version: ">= 5.44.2, < 6.2.0"}}) + Version: ">= 5.44.2, < 6.10.0"}}) } { // no def PR, group PR diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index d4209686db..f8df351469 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -38,14 +38,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 5.44.2, < 6.2.0' + version: '>= 5.44.2, < 6.10.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 5.44.2, < 6.2.0' + version: '>= 5.44.2, < 6.10.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index 6cad938861..c14eee115f 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 5.44.2, < 6.2.0" + version = ">= 5.44.2, < 6.10.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 5.44.2, < 6.2.0" + version = ">= 5.44.2, < 6.10.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml 
b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index 7dba9d7a74..d6d186dd07 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -44,14 +44,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 5.44.2, < 6.2.0' + version: '>= 5.44.2, < 6.10.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 5.44.2, < 6.2.0' + version: '>= 5.44.2, < 6.10.0' configuration: project: ((var.project_id)) region: ((var.region)) @@ -80,14 +80,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 5.44.2, < 6.2.0' + version: '>= 5.44.2, < 6.10.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 5.44.2, < 6.2.0' + version: '>= 5.44.2, < 6.10.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index 6cad938861..c14eee115f 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 5.44.2, < 6.2.0" + version = ">= 5.44.2, < 6.10.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 5.44.2, < 6.2.0" + version = ">= 5.44.2, < 6.10.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index 6cad938861..c14eee115f 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 5.44.2, < 6.2.0" + version = ">= 5.44.2, < 6.10.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 5.44.2, < 6.2.0" + version = ">= 5.44.2, < 6.10.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 1bbb09a4f0..fbd8d55a64 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,14 +39,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 5.44.2, < 6.2.0' + version: '>= 5.44.2, < 6.10.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 5.44.2, < 6.2.0' + version: '>= 5.44.2, < 6.10.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index 
6cad938861..c14eee115f 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 5.44.2, < 6.2.0" + version = ">= 5.44.2, < 6.10.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 5.44.2, < 6.2.0" + version = ">= 5.44.2, < 6.10.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index 9fe6a8753f..537dbef727 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -47,14 +47,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 5.44.2, < 6.2.0' + version: '>= 5.44.2, < 6.10.0' configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 5.44.2, < 6.2.0' + version: '>= 5.44.2, < 6.10.0' configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf index 6cad938861..c14eee115f 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 5.44.2, < 6.2.0" + version = ">= 5.44.2, < 6.10.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 5.44.2, < 6.2.0" + version = ">= 5.44.2, < 6.10.0" } } } From 5531e8bf8dd547b46b582313eb7b30e666f10009 Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Mon, 4 Nov 2024 15:02:38 +0000 Subject: [PATCH 087/129] Increasing ml-slurm test timeout to 5hr --- tools/cloud-build/daily-tests/builds/ml-slurm-v5-legacy.yaml | 2 +- tools/cloud-build/daily-tests/builds/ml-slurm.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cloud-build/daily-tests/builds/ml-slurm-v5-legacy.yaml b/tools/cloud-build/daily-tests/builds/ml-slurm-v5-legacy.yaml index 7169240059..3382f342b6 100644 --- a/tools/cloud-build/daily-tests/builds/ml-slurm-v5-legacy.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-slurm-v5-legacy.yaml @@ -25,7 +25,7 @@ tags: - m.startup-script - slurm5 -timeout: 14400s # 4hr +timeout: 18000s # 5hr steps: # test image creation by provisioning a new VPC and using Packer to build an # image in it diff --git a/tools/cloud-build/daily-tests/builds/ml-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-slurm.yaml index 79c8502590..c06c110a54 100644 --- a/tools/cloud-build/daily-tests/builds/ml-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-slurm.yaml @@ -24,7 +24,7 @@ tags: - m.startup-script - slurm6 -timeout: 14400s # 4hr +timeout: 18000s # 5hr steps: # While using static network names we are gaurding against more than 1 instance running at a time (for multi-group tests) - id: check_for_running_build From 
d3bd763bfdd68e36c232c4e44d9c7d86f712d6f4 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 4 Nov 2024 18:35:13 +0000 Subject: [PATCH 088/129] SlurmGCP. Add `set -e` to prolog mux --- .../modules/slurm_files/files/setup_external.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/setup_external.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/setup_external.sh index c21f7cbdbd..21454bd52c 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/setup_external.sh +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/files/setup_external.sh @@ -41,6 +41,8 @@ if [ ! -f "${SLURM_EXTERNAL_ROOT}/${SLURM_MUX_FILE}" ]; then # See the License for the specific language governing permissions and # limitations under the License. +set -e + CMD="${0##*/}" # Locate script BASE=$(readlink -f $0) From 487a7d45a038777cf2e679494cc4e17b64441eb3 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 29 Oct 2024 00:15:55 +0000 Subject: [PATCH 089/129] SlurmGCP. topology readme --- docs/slurm-topology.md | 73 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 docs/slurm-topology.md diff --git a/docs/slurm-topology.md b/docs/slurm-topology.md new file mode 100644 index 0000000000..e83c11047e --- /dev/null +++ b/docs/slurm-topology.md @@ -0,0 +1,73 @@ +# Network topology aware scheduling + +Slurm can be [configured](https://slurm.schedmd.com/topology.html) to support topology-aware +resource allocation to optimize job performance. + +If you are using Slurm via ClusterToolkit, the Slurm Topology Plugin is automatically configured with: + +```ini +TopologyPlugin=topology/tree +TopologyParam=SwitchAsNodeRank +``` + +This does two things: + +* **Minimizes inter-rack communication:** For jobs smaller than the full cluster size, Slurm will assign the job to as few racks as possible. +* **Optimizes rank placement:** Within a job, the Slurm node rank (used to assign global Slurm / MPI ranks) is ordered by the Switch that the node is on, such that ranks are ordered by rack. + +SlurmGCP automatically updates topology information for all nodes in the cluster, according to their [physical location](https://cloud.google.com/compute/docs/instances/use-compact-placement-policies#verify-vm-location). + +> [!NOTE] +> The physical location information is available for VMs configured with a placement policy. +> VMs without a defined placement policy will be assigned a less efficient 'fake' topology. + +Applications that incorporate either the `SLURM_PROCID`/`NODE_RANK`/etc or the MPI Rank into their task assignment may see performance benefits. +In other cases, such as with PyTorch's `distributed`, you may need to modify the rank assignment to incorporate this information, see [example](../examples/machine-learning/a3-megagpu-8g/topological-pytorch/README.md). 
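+For example, a minimal sketch (illustration only) of wiring Slurm's topology-ordered ranks into a `torch.distributed` job; it assumes the launching `sbatch` script exports `MASTER_ADDR` and `MASTER_PORT`, and the linked example above remains the authoritative reference:
+
+```python
+# Sketch: derive torch.distributed ranks from Slurm-provided variables so the
+# switch-ordered node rank (SwitchAsNodeRank) carries through to process ranks.
+# Assumes MASTER_ADDR/MASTER_PORT are exported by the job script.
+import os
+import torch.distributed as dist
+
+rank = int(os.environ["SLURM_PROCID"])         # global rank, follows Slurm node rank order
+world_size = int(os.environ["SLURM_NTASKS"])
+local_rank = int(os.environ["SLURM_LOCALID"])  # rank within the node, e.g. for GPU selection
+
+dist.init_process_group(backend="nccl", init_method="env://",
+                        rank=rank, world_size=world_size)
+```
+
+Launched with `srun`, each task can then pick its device from `local_rank` while the global ranks remain ordered by rack.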
+ +## Inspect topology + +You can inspect topology used by Slurm by running: + +```sh +scontrol show topology + +# Or by listing the configuration file: +cat /etc/slurm/topology.conf +``` + +To inspect the "real" topology and verify the physical host placement, you can list the `physical_host` property of nodes: + +```sh +#!/bin/bash + +# /home/where.sh - echo machines hostname and its physicalHost +echo "$(hostname) $(curl 'http://metadata.google.internal/computeMetadata/v1/instance/attributes/physical_host' -H 'Metadata-Flavor: Google' -s)" +``` + +```sh +srun --nodelist={nodes_to_inspect} -l /home/where.sh | sort -V +``` + +## Disabling SlurmGCP topology integration + +Updates to `topology.conf` require reconfiguration of Slurm controller. This can be a costly operation that affects the responsiveness of the controller. + +You have the option to disable the Slurm Topology Plugin (along with automatic updates) by providing the following settings to controller module in your blueprint: + +```yaml +settings: + cloud_parameters: + topology_plugin: "" +``` + +Even with the Topology Plugin disabled, you can still optimize rank placement by using the `sort_nodes` +util in your [sbatch](https://slurm.schedmd.com/sbatch.html) scripts. For example: + +```sh +#SBATCH --ntasks-per-node=8 +#SBATCH --nodes=64 + +export SLURM_HOSTFILE=$(sort_nodes.py) + +srun -l hostname | sort +``` From e1455af9e4a2bb20a2736e7d24c6161697ffc972 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 4 Nov 2024 22:34:41 +0000 Subject: [PATCH 090/129] Refactor mount/mode setting for local SSD RAID The local SSD RAID solution is written in Ansible which will successfully handle re-creating the RAID array and mounting it in scenarios where the VM has been re-created and the contents of local SSD have been discared. The Slurm solutions do not re-run startup scripts after the first boot using a given persistent disk. During maintenance events, the persistent disk is retained while the local SSD disks are discarded. PR #3129 addressed re-creating, formatting and mounting the RAID array but left a gap in setting the mode of the mounted directory after power off/on cycles. This PR refactors mounting and mode-setting to resolve this gap. 
--- .../startup-script/files/setup-raid.yml | 44 ++++++++++++------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/modules/scripts/startup-script/files/setup-raid.yml b/modules/scripts/startup-script/files/setup-raid.yml index d7590069a8..5ebf35e522 100644 --- a/modules/scripts/startup-script/files/setup-raid.yml +++ b/modules/scripts/startup-script/files/setup-raid.yml @@ -53,10 +53,11 @@ [Unit] After=local-fs.target Before=slurmd.service - ConditionPathIsMountPoint=!{{ mountpoint }} + ConditionPathExists=!{{ array_dev }} [Service] Type=oneshot + RemainAfterExit=yes ExecStart=/usr/bin/bash -c "/usr/sbin/mdadm --create {{ array_dev }} --name={{ raid_name }} --homehost=any --level=0 --raid-devices={{ local_ssd_devices.files | length }} /dev/disk/by-id/google-local-nvme-ssd-*{{ " --force" if local_ssd_devices.files | length == 1 else "" }}" ExecStartPost=/usr/sbin/mkfs -t {{ fstype }}{{ " -m 0" if fstype == "ext4" else "" }} {{ array_dev }} @@ -70,19 +71,30 @@ enabled: true daemon_reload: true - - name: Mount RAID array - ansible.posix.mount: - src: "{{ array_dev }}" - path: "{{ mountpoint }}" - fstype: "{{ fstype }}" - # the nofail option is critical as it enables the boot process to complete on machines - # that were powered off and had local SSD contents discarded; without this option - # VMs may fail to join the network - opts: discard,defaults,nofail - state: mounted + - name: Install service to mount local SSD array + ansible.builtin.copy: + dest: /etc/systemd/system/mount-localssd-raid.service + mode: 0644 + content: | + [Unit] + After=local-fs.target create-localssd-raid.service + Before=slurmd.service + Wants=create-localssd-raid.service + ConditionPathIsMountPoint=!{{ mountpoint }} - - name: Set mount permissions - ansible.builtin.file: - path: "{{ mountpoint }}" - state: directory - mode: "{{ mode }}" + [Service] + Type=oneshot + RemainAfterExit=yes + ExecStart=/usr/bin/systemd-mount -t {{ fstype }} -o discard,defaults,nofail {{ array_dev }} {{ mountpoint }} + ExecStartPost=/usr/bin/chmod {{ mode }} {{ mountpoint }} + ExecStop=/usr/bin/systemd-umount {{ mountpoint }} + + [Install] + WantedBy=slurmd.service + + - name: Mount RAID array and set permissions + ansible.builtin.systemd: + name: mount-localssd-raid.service + state: started + enabled: true + daemon_reload: true From 645428f1c65308aa6a543c5ea6d0678c947b91c3 Mon Sep 17 00:00:00 2001 From: Lin Guo Date: Mon, 4 Nov 2024 16:34:21 -0800 Subject: [PATCH 091/129] Add in socket count info for c-series VMs The socket info is used by Slurm for scheduling purpose. Ideally it'd be better if there's a SOT that can be queried for these topology information. 
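For reference, a minimal illustrative sketch (not part of the change itself; the helper name `sockets_for` is hypothetical) of the per-family heuristic that `machine_type_sockets()` in util.py applies, using the thresholds added here:

```python
# Sketch of the guestCpus -> socket-count heuristic; thresholds for c2/c3/c3d/c4
# mirror the values added in this patch, and unknown families fall back to 1 socket.
def sockets_for(family: str, guest_cpus: int) -> int:
    return {
        "h3": 2,
        "a3": 2,
        "c2d": 2 if guest_cpus > 56 else 1,
        "c2": 2 if guest_cpus > 30 else 1,
        "c3": 2 if guest_cpus > 88 else 1,
        "c3d": 2 if guest_cpus > 180 else 1,
        "c4": 2 if guest_cpus > 96 else 1,
    }.get(family, 1)

# e.g. sockets_for("c3", 176) == 2, while sockets_for("c3", 88) == 1
```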
--- .../modules/slurm_files/scripts/util.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index d046c27357..26a09ca195 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -1148,6 +1148,10 @@ def machine_type_sockets(template) -> int: "h3": 2, "c2d": 2 if guestCpus > 56 else 1, "a3": 2, + "c2": 2 if guestCpus > 30 else 1, + "c3": 2 if guestCpus > 88 else 1, + "c3d": 2 if guestCpus > 180 else 1, + "c4": 2 if guestCpus > 96 else 1, }.get( machine_type_family(template.machineType), 1, # assume 1 socket for all other families From 679ea45f838a72be29b9166078b6c65d016ea6d2 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 4 Nov 2024 19:58:56 +0000 Subject: [PATCH 092/129] SlurmGCP. Do not create placementPoliciy for a single VM --- .../modules/slurm_files/scripts/resume.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 31658cd96a..48210137a8 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -508,21 +508,25 @@ def create_placement_request(pg_name, region): return request -def create_placement_groups(node_list: list, job_id=0): +def create_placement_groups(node_list: list[str], job_id:int=0) -> dict[str, list[str]]: pgs = {} node_map = lookup().nodeset_map(node_list) for _, nodes in node_map.items(): - pgs.update(create_nodeset_placement_groups(nodes, job_id=job_id)) + pgs.update(create_nodeset_placement_groups(nodes, job_id)) return pgs -def create_nodeset_placement_groups(node_list: list, job_id=0): +def create_nodeset_placement_groups(node_list: list[str], job_id:int) -> dict[str, list[str]]: + no_pg = {None: node_list} # canned result for no placement policies created + + if len(node_list) < 2: + return no_pg # don't create placement_policy for just one node + model = next(iter(node_list)) nodeset = lookup().node_nodeset(model) - if not nodeset.enable_placement: - return {None: node_list} - if not valid_placement_nodes(node_list): - return {None: node_list} + if not (nodeset.enable_placement and valid_placement_nodes(node_list)): + return no_pg + region = lookup().node_region(model) groups = { @@ -538,8 +542,7 @@ def create_nodeset_placement_groups(node_list: list, job_id=0): f"creating {len(groups)} placement groups: \n{yaml.safe_dump(debug_groups).rstrip()}" ) requests = { - group: create_placement_request(group, region) - for group, incl_nodes in groups.items() + group: create_placement_request(group, region) for group in groups.keys() } ops = dict( zip(requests.keys(), map_with_futures(ensure_execute, requests.values())) From 47aa3d7b2068bc9ae376ed76cfc491672db9485b Mon Sep 17 00:00:00 2001 From: Ward K Harold Date: Tue, 5 Nov 2024 17:34:28 +0000 Subject: [PATCH 093/129] Update commit hash; fixes b/377488571 --- community/examples/flux-framework/flux-cluster.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/community/examples/flux-framework/flux-cluster.yaml b/community/examples/flux-framework/flux-cluster.yaml index e38393fe01..7004b3af88 100644 --- a/community/examples/flux-framework/flux-cluster.yaml +++ b/community/examples/flux-framework/flux-cluster.yaml @@ -34,7 +34,7 @@ deployment_groups: settings: local_mount: /home - id: fluxfw-gcp - source: github.com/GoogleCloudPlatform/scientific-computing-examples//fluxfw-gcp/tf?ref=867e558 + source: github.com/GoogleCloudPlatform/scientific-computing-examples//fluxfw-gcp/tf?ref=cb36377 settings: compute_node_specs: - name_prefix: gfluxfw-compute From 5b91e185a66171822a85f97c0b3a7162df150ee4 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 5 Nov 2024 18:23:50 +0000 Subject: [PATCH 094/129] Update minimum Terraform release for gke-node-pool The use of check blocks requires Terraform 1.5 and above. --- modules/compute/gke-node-pool/README.md | 2 +- modules/compute/gke-node-pool/versions.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index ed05936163..9fde80fab6 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -278,7 +278,7 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.2 | +| [terraform](#requirement\_terraform) | >= 1.5 | | [google](#requirement\_google) | ~> 5.0 | | [google-beta](#requirement\_google-beta) | ~> 5.0 | | [null](#requirement\_null) | ~> 3.0 | diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index 0f4cb13c2f..a20854b245 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ b/modules/compute/gke-node-pool/versions.tf @@ -13,7 +13,7 @@ # limitations under the License. 
terraform { - required_version = ">= 1.2" + required_version = ">= 1.5" required_providers { google = { From 8603b6b2d00079472988139a9bf4b0cec9c4fd2f Mon Sep 17 00:00:00 2001 From: Carson Dunbar Date: Tue, 5 Nov 2024 14:57:20 +0000 Subject: [PATCH 095/129] Adding retries to a3mega image blueprint --- .../machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index dfc4d4ab4c..79a2f3e940 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -83,6 +83,11 @@ deployment_groups: ansible.builtin.get_url: url: "{{ package_url }}" dest: "{{ package_filename }}" + retries: 3 + delay: 60 + register: result + until: result is success + failed_when: result is failure - name: Install kernel headers ansible.builtin.apt: deb: "{{ package_filename }}" From 007fd8330dc4a10a603cbaba413ceee6d30d6ea2 Mon Sep 17 00:00:00 2001 From: Rachael Tamakloe Date: Tue, 5 Nov 2024 21:24:31 +0000 Subject: [PATCH 096/129] updating network_storage.md with Parallelstore info --- docs/network_storage.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/network_storage.md b/docs/network_storage.md index 28a39594d6..40065edba3 100644 --- a/docs/network_storage.md +++ b/docs/network_storage.md @@ -7,7 +7,7 @@ The Toolkit contains modules that will **provision**: - [Filestore (GCP managed NFS)][filestore] - [DDN EXAScaler lustre][ddn-exascaler] -- [Intel DAOS][intel-daos] +- [Parallelstore][parallelstore] - [NFS server (non-GCP managed)][nfs-server] The Toolkit also provides a **[pre-existing-network-storage]** module to work @@ -104,12 +104,12 @@ filestore | via USE | via USE | via USE | via USE | via STARTUP | via USE | via nfs-server | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE cloud-storage-bucket (GCS)| via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE DDN EXAScaler lustre | via USE | via USE | via USE | via USE | Needs Testing | via USE | via USE -Intel DAOS** | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing | Needs Testing +Parallelstore | via USE | Needs Testing | Needs Testing | via USE | Needs Testing | Needs Testing | Needs Testing |  |   |   |   |   |   |   filestore (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE nfs-server (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | via USE | via USE DDN EXAScaler lustre (pre-existing) | via USE | via USE | via USE | via USE | Needs Testing | via USE | via USE -Intel DAOS (pre-existing) | Not Supported | Not Supported | Not Supported | Not Supported | Not Supported | Not Supported | Not Supported +Parallelstore (pre-existing) | via USE | Needs Testing | Needs Testing | via USE | Needs Testing | Needs Testing | Needs Testing GCS FUSE (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | via USE | Needs Testing - **via USE:** Client installation and mounting occur automatically when @@ -122,10 +122,9 @@ GCS FUSE (pre-existing) | via USE | via USE | via USE | via USE | via STARTUP | - **Not Supported:** This feature is not supported right now. 
\* only supported on CentOS 7\ -** DAOS has additional pre-req steps and does not yet support automatic mounting [filestore]: ../modules/file-system/filestore/README.md [pre-existing-network-storage]: ../modules/file-system/pre-existing-network-storage/README.md [ddn-exascaler]: ../community/modules/file-system/DDN-EXAScaler/README.md -[intel-daos]: ../community/modules/file-system/Intel-DAOS/README.md +[parallelstore]: ../modules/file-system/parallelstore/README.md [nfs-server]: ../community/modules/file-system/nfs-server/README.md From 68750d59c92e8d5c61787d1a150aa56ac2d6b261 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Wed, 6 Nov 2024 10:49:00 +0000 Subject: [PATCH 097/129] Cleanup Always when SSH Fails --- .../daily-tests/ansible_playbooks/base-integration-test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml index 66a2908869..229ce35261 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml @@ -169,6 +169,7 @@ vars: startup_timeout_seconds: 600 # 10 minutes gather_facts: false + ignore_unreachable: true # ensure always block will run even if SSH fails tasks: - name: Remote Test Block vars: @@ -182,6 +183,7 @@ - name: Run Integration tests for Cluster Toolkit ansible.builtin.include_tasks: "{{ test }}" + ignore_unreachable: false # end the play when the host is unreachable vars: remote_node: "{{ remote_node }}" deployment_name: "{{ deployment_name }}" From 4796b43be85023f8a58b6c03e35e1149e21e84c3 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 6 Nov 2024 18:43:11 +0000 Subject: [PATCH 098/129] Modernize ml-slurm v5 legacy example Adopt recent versions of pytorch and tensorflow from pip which have improved predictability of CUDA adoption. --- examples/ml-slurm-v5-legacy.yaml | 41 ++++++-------------------------- 1 file changed, 7 insertions(+), 34 deletions(-) diff --git a/examples/ml-slurm-v5-legacy.yaml b/examples/ml-slurm-v5-legacy.yaml index 6c0fb8aa30..bf0e107bb0 100644 --- a/examples/ml-slurm-v5-legacy.yaml +++ b/examples/ml-slurm-v5-legacy.yaml @@ -94,9 +94,8 @@ deployment_groups: content: | #!/bin/bash # this script is designed to execute on Slurm images published by SchedMD that: - # - are based on Debian 11 distribution of Linux - # - have NVIDIA Drivers v530 pre-installed - # - have CUDA Toolkit 12.1 pre-installed. + # - are based on Debian distribution of Linux + # - have NVIDIA drivers pre-installed set -e -o pipefail @@ -112,8 +111,8 @@ deployment_groups: DL_DIR=\$(mktemp -d) cd $DL_DIR - curl -O https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Linux-x86_64.sh - HOME=$DL_DIR bash Miniconda3-py310_23.3.1-0-Linux-x86_64.sh -b -p $CONDA_BASE + curl -L -O https://github.com/conda-forge/miniforge/releases/download/24.7.1-2/Miniforge3-24.7.1-2-Linux-x86_64.sh + HOME=$DL_DIR bash Miniforge3-24.7.1-2-Linux-x86_64.sh -b -p $CONDA_BASE cd - rm -rf $DL_DIR unset DL_DIR @@ -123,39 +122,12 @@ deployment_groups: conda config --system --set auto_activate_base False # following channel ordering is important! use strict_priority! 
conda config --system --set channel_priority strict - conda config --system --remove channels defaults - conda config --system --add channels conda-forge - conda config --system --add channels nvidia - conda config --system --add channels nvidia/label/cuda-11.8.0 - conda update -n base conda --yes ### create a virtual environment for tensorflow - conda create -n tf python=3.10 --yes + conda create -n tf python=3.11 --yes conda activate tf - conda install -n tf cuda-toolkit --yes - pip install nvidia-cudnn-cu11 nvidia-nccl-cu11 - - cd $CONDA_PREFIX/lib/python3.10/site-packages/nvidia/nccl/lib/ - ln -s libnccl.so.2 libnccl.so - cd - - - mkdir -p $CONDA_PREFIX/etc/conda/activate.d - echo 'export OLD_LD_LIBRARY_PATH=$LD_LIBRARY_PATH' > $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh - echo 'NVIDIA_PYTHON_PATH=$CONDA_PREFIX/lib/python3.10/site-packages/nvidia' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh - echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/:$NVIDIA_PYTHON_PATH/cudnn/lib/:$NVIDIA_PYTHON_PATH/nccl/lib/' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh - mkdir -p $CONDA_PREFIX/etc/conda/deactivate.d - echo 'export LD_LIBRARY_PATH=${OLD_LD_LIBRARY_PATH}' > $CONDA_PREFIX/etc/conda/deactivate.d/env_vars.sh - echo 'unset OLD_LD_LIBRARY_PATH' >> $CONDA_PREFIX/etc/conda/deactivate.d/env_vars.sh - - pip install tensorflow==2.12.* - pip install tensorrt==8.6.* - - ### create a virtual environment for pytorch - conda create -n pytorch python=3.10 --yes - conda activate pytorch - conda config --env --add channels pytorch - conda install -n pytorch pytorch torchvision torchaudio pytorch-cuda=11.8 --yes + pip install tensorflow[and-cuda]==2.18.* - group: packer modules: @@ -175,6 +147,7 @@ deployment_groups: # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size_gb) + disk_type: pd-ssd image_family: $(vars.new_image.family) # building this image does not require a GPU-enabled VM machine_type: c2-standard-4 From 4da2d6235c3955ad5cf3682972652e1125c909c1 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 6 Nov 2024 18:44:10 +0000 Subject: [PATCH 099/129] Modernize ml-slurm example Adopt recent versions of pytorch and tensorflow from pip which have improved predictability of CUDA adoption. --- examples/ml-slurm.yaml | 41 ++++++++++------------------------------- 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/examples/ml-slurm.yaml b/examples/ml-slurm.yaml index 7860eb2daf..59967cd3e3 100644 --- a/examples/ml-slurm.yaml +++ b/examples/ml-slurm.yaml @@ -62,9 +62,8 @@ deployment_groups: content: | #!/bin/bash # this script is designed to execute on Slurm images published by SchedMD that: - # - are based on Debian 11 distribution of Linux - # - have NVIDIA Drivers v530 pre-installed - # - have CUDA Toolkit 12.1 pre-installed. 
+ # - are based on Debian distribution of Linux + # - have NVIDIA drivers pre-installed set -e -o pipefail @@ -80,8 +79,8 @@ deployment_groups: DL_DIR=\$(mktemp -d) cd $DL_DIR - curl -O https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Linux-x86_64.sh - HOME=$DL_DIR bash Miniconda3-py310_23.3.1-0-Linux-x86_64.sh -b -p $CONDA_BASE + curl -L -O https://github.com/conda-forge/miniforge/releases/download/24.7.1-2/Miniforge3-24.7.1-2-Linux-x86_64.sh + HOME=$DL_DIR bash Miniforge3-24.7.1-2-Linux-x86_64.sh -b -p $CONDA_BASE cd - rm -rf $DL_DIR unset DL_DIR @@ -91,39 +90,18 @@ deployment_groups: conda config --system --set auto_activate_base False # following channel ordering is important! use strict_priority! conda config --system --set channel_priority strict - conda config --system --remove channels defaults - conda config --system --add channels conda-forge - conda config --system --add channels nvidia - conda config --system --add channels nvidia/label/cuda-11.8.0 - conda update -n base conda --yes ### create a virtual environment for tensorflow - conda create -n tf python=3.10 --yes + conda create -n tf python=3.11 --yes conda activate tf - conda install -n tf cuda-toolkit --yes - pip install nvidia-cudnn-cu11 nvidia-nccl-cu11 - - cd $CONDA_PREFIX/lib/python3.10/site-packages/nvidia/nccl/lib/ - ln -s libnccl.so.2 libnccl.so - cd - - - mkdir -p $CONDA_PREFIX/etc/conda/activate.d - echo 'export OLD_LD_LIBRARY_PATH=$LD_LIBRARY_PATH' > $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh - echo 'NVIDIA_PYTHON_PATH=$CONDA_PREFIX/lib/python3.10/site-packages/nvidia' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh - echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/:$NVIDIA_PYTHON_PATH/cudnn/lib/:$NVIDIA_PYTHON_PATH/nccl/lib/' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh - mkdir -p $CONDA_PREFIX/etc/conda/deactivate.d - echo 'export LD_LIBRARY_PATH=${OLD_LD_LIBRARY_PATH}' > $CONDA_PREFIX/etc/conda/deactivate.d/env_vars.sh - echo 'unset OLD_LD_LIBRARY_PATH' >> $CONDA_PREFIX/etc/conda/deactivate.d/env_vars.sh - - pip install tensorflow==2.12.* - pip install tensorrt==8.6.* + pip install tensorflow[and-cuda]==2.18.* + pip install tensorrt==10.6.* ### create a virtual environment for pytorch - conda create -n pytorch python=3.10 --yes + conda create -n pytorch python=3.11 --yes conda activate pytorch - conda config --env --add channels pytorch - conda install -n pytorch pytorch torchvision torchaudio pytorch-cuda=11.8 --yes + pip install torch torchvision torchaudio - group: packer modules: @@ -143,6 +121,7 @@ deployment_groups: # You can find size of source image by using following command # gcloud compute images describe-from-family --project schedmd-slurm-public disk_size: $(vars.disk_size_gb) + disk_type: pd-ssd image_family: $(vars.new_image.family) # building this image does not require a GPU-enabled VM machine_type: c2-standard-4 From 7e9ced2e6eb8dd9143d9dfb1c31bcce903fbc82a Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 6 Nov 2024 18:44:44 +0000 Subject: [PATCH 100/129] Update instructions for ml-slurm examples to explicitly request GPUs --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index b1cafa8c6b..7933f73640 100644 --- a/examples/README.md +++ b/examples/README.md @@ -583,7 +583,7 @@ An example benchmarking job for PyTorch can be run under Slurm: ```shell cp /var/tmp/torch_test.* . 
-sbatch -N 1 torch_test.sh +sbatch -N 1 --gpus-per-node=1 torch_test.sh ``` When you are done, clean up the resources in reverse order of creation: @@ -632,7 +632,7 @@ An example benchmarking job for PyTorch can be run under Slurm: ```shell cp /var/tmp/torch_test.* . -sbatch -N 1 torch_test.sh +sbatch -N 1 --gpus-per-node=1 torch_test.sh ``` When you are done, clean up the resources in reverse order of creation: From 7a290415dc91f965083e4dbeda9d2a9999fd4bed Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 7 Nov 2024 07:29:53 +0000 Subject: [PATCH 101/129] Fix "too modern" type annotations --- .../modules/slurm_files/scripts/resume.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 48210137a8..bb5139aea8 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional +from typing import List, Optional, Dict import argparse import collections from datetime import timedelta @@ -508,7 +508,7 @@ def create_placement_request(pg_name, region): return request -def create_placement_groups(node_list: list[str], job_id:int=0) -> dict[str, list[str]]: +def create_placement_groups(node_list: List[str], job_id:int=0) -> Dict[str, List[str]]: pgs = {} node_map = lookup().nodeset_map(node_list) for _, nodes in node_map.items(): @@ -516,7 +516,7 @@ def create_placement_groups(node_list: list[str], job_id:int=0) -> dict[str, lis return pgs -def create_nodeset_placement_groups(node_list: list[str], job_id:int) -> dict[str, list[str]]: +def create_nodeset_placement_groups(node_list: List[str], job_id:int) -> Dict[str, List[str]]: no_pg = {None: node_list} # canned result for no placement policies created if len(node_list) < 2: From 1d08120680a9ea7a3bf96d009975c4f03cd75260 Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Thu, 7 Nov 2024 07:55:38 +0000 Subject: [PATCH 102/129] TaskInclude does not have ignore_unreachable --- .../daily-tests/ansible_playbooks/base-integration-test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml index 229ce35261..04baa266e5 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml @@ -183,7 +183,6 @@ - name: Run Integration tests for Cluster Toolkit ansible.builtin.include_tasks: "{{ test }}" - ignore_unreachable: false # end the play when the host is unreachable vars: remote_node: "{{ remote_node }}" deployment_name: "{{ deployment_name }}" From 7d7c35e1d1d1598fbc96928371c0e9e058b1507d Mon Sep 17 00:00:00 2001 From: abbas1902 Date: Thu, 7 Nov 2024 17:21:23 +0000 Subject: [PATCH 103/129] remove local_ssd warning --- .../compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf | 2 +- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md | 2 +- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf | 2 +- 4 files 
changed, 4 insertions(+), 4 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 54ed5cf1d3..1ee007d0a1 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -87,7 +87,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | -| [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. (do not use "disk\_type: local-ssd"; known issue being addressed) |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf index b3e323bc68..9be7e48dbb 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/variables.tf @@ -142,7 +142,7 @@ variable "disk_labels" { } variable "additional_disks" { - description = "Configurations of additional disks to be included on the partition nodes. (do not use \"disk_type: local-ssd\"; known issue being addressed)" + description = "Configurations of additional disks to be included on the partition nodes." type = list(object({ disk_name = string device_name = string diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index bd339c262e..d3a002f372 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -159,7 +159,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | -| [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. (do not use "disk\_type: local-ssd"; known issue being addressed) |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 3bd8fc74fb..003a76d779 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -161,7 +161,7 @@ variable "disk_labels" { } variable "additional_disks" { - description = "Configurations of additional disks to be included on the partition nodes. (do not use \"disk_type: local-ssd\"; known issue being addressed)" + description = "Configurations of additional disks to be included on the partition nodes." type = list(object({ disk_name = string device_name = string From d88cd50a44951bc350b956c63b3db3b73fdab7ec Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Wed, 30 Oct 2024 21:23:53 +0000 Subject: [PATCH 104/129] Improve caching job details and fetch more job details --- .../modules/slurm_files/scripts/slurmsync.py | 7 +- .../slurm_files/scripts/tests/test_util.py | 77 +++++++++++++++-- .../modules/slurm_files/scripts/util.py | 82 +++++++++++++------ 3 files changed, 130 insertions(+), 36 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 112e2d5748..cea4ae2f35 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -334,13 +334,14 @@ def sync_placement_groups(): "STOPPED", "SUSPENDED", "COMPLETING", + "PENDING", ] ) keep_jobs = { - str(job["job_id"]) - for job in json.loads(run(f"{lookup().scontrol} show jobs --json").stdout)["jobs"] - if "job_state" in job and set(job["job_state"]) & keep_states + str(job.id) + for job in lookup().get_jobs() + if job.job_state in keep_states } keep_jobs.add("0") # Job 0 is a placeholder for static node placement diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index 14b7a7bf62..7a3c40946d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -16,6 +16,7 @@ from mock import Mock from common import TstNodeset, TstCfg # needed to import util import util +from datetime import timedelta from google.api_core.client_options import ClientOptions # noqa: E402 # Note: need to install pytest-mock @@ -158,14 +159,14 @@ def test_nodeset_reservation_err(nodeset, err): with pytest.raises(err): lkp.nodeset_reservation(nodeset) lkp._get_reservation.assert_not_called() - + @pytest.mark.parametrize( "nodeset,policies,expected", [ (TstNodeset(), [], None), # no reservation (TstNodeset( reservation_name="projects/bobin/reservations/robin", - zone_policy_allow=["eine"]), + zone_policy_allow=["eine"]), [], util.ReservationDetails( project="bobin", @@ -175,7 +176,7 @@ def test_nodeset_reservation_err(nodeset, err): 
bulk_insert_name="projects/bobin/reservations/robin")), (TstNodeset( reservation_name="projects/bobin/reservations/robin", - zone_policy_allow=["eine"]), + zone_policy_allow=["eine"]), ["seven/wanders", "five/red/apples", "yum"], util.ReservationDetails( project="bobin", @@ -185,7 +186,7 @@ def test_nodeset_reservation_err(nodeset, err): bulk_insert_name="projects/bobin/reservations/robin")), (TstNodeset( reservation_name="projects/bobin/reservations/robin/snek/cheese-brie-6", - zone_policy_allow=["eine"]), + zone_policy_allow=["eine"]), [], util.ReservationDetails( project="bobin", @@ -199,16 +200,76 @@ def test_nodeset_reservation_err(nodeset, err): def test_nodeset_reservation_ok(nodeset, policies, expected): lkp = util.Lookup(TstCfg()) lkp._get_reservation = Mock() - + if not expected: assert lkp.nodeset_reservation(nodeset) is None lkp._get_reservation.assert_not_called() return - + lkp._get_reservation.return_value = { "resourcePolicies": {i: p for i, p in enumerate(policies)}, } assert lkp.nodeset_reservation(nodeset) == expected lkp._get_reservation.assert_called_once_with(expected.project, expected.zone, expected.name) - - + + +@pytest.mark.parametrize( + "job_info,expected_job", + [ + ( + """JobId=123 + TimeLimit=02:00:00 + JobName=myjob + JobState=PENDING + ReqNodeList=node-[1-10]""", + util.Job( + id=123, + duration=timedelta(days=0, hours=2, minutes=0, seconds=0), + name="myjob", + job_state="PENDING", + required_nodes="node-[1-10]" + ), + ), + ( + """JobId=456 + JobName=anotherjob + JobState=PENDING + ReqNodeList=node-group1""", + util.Job( + id=456, + duration=None, + name="anotherjob", + job_state="PENDING", + required_nodes="node-group1" + ), + ), + ( + """JobId=789 + TimeLimit=00:30:00 + JobState=COMPLETED""", + util.Job( + id=789, + duration=timedelta(minutes=30), + name=None, + job_state="COMPLETED", + required_nodes=None + ), + ), + ( + """JobId=101112 + TimeLimit=1-00:30:00 + JobState=COMPLETED, + ReqNodeList=node-[1-10],grob-pop-[2,1,44-77]""", + util.Job( + id=101112, + duration=timedelta(days=1, hours=0, minutes=30, seconds=0), + name=None, + job_state="COMPLETED", + required_nodes="node-[1-10],grob-pop-[2,1,44-77]" + ), + ), + ], +) +def test_parse_job_info(job_info, expected_job): + lkp = util.Lookup(TstCfg()) + assert lkp._parse_job_info(job_info) == expected_job diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index d046c27357..5690fea0a2 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -411,7 +411,7 @@ def _fill_cfg_defaults(cfg: NSDict) -> NSDict: "mount_options": "defaults,hard,intr,_netdev", } ) - + network_storage_iter = filter( None, ( @@ -474,8 +474,8 @@ def _download(bs) -> List[Any]: ), hash def _assemble_config( - core: Any, - partitions: List[Any], + core: Any, + partitions: List[Any], nodesets: List[Any], nodesets_dyn: List[Any], nodesets_tpu: List[Any], @@ -510,17 +510,17 @@ def _add_nodesets(yamls: List[Any], target: dict): for ns_name in chain(p.partition_nodeset, p.partition_nodeset_dyn, p.partition_nodeset_tpu): if ns_name not in ns_names: raise DeffetiveStoredConfigError(f"nodeset {ns_name} not defined in config") - + return _fill_cfg_defaults(cfg) def fetch_config() -> Tuple[bool, NSDict]: """ - Fetches config from bucket and 
saves it locally + Fetches config from bucket and saves it locally Returns True if new (updated) config was fetched """ hash_file = Path("/slurm/scripts/.config.hash") old_hash = hash_file.read_text() if hash_file.exists() else None - + cfg_and_hash = _fetch_config(old_hash=old_hash) if not cfg_and_hash: return False, _load_config() @@ -1460,8 +1460,12 @@ class ReservationDetails: @dataclass class Job: id: int + name: Optional[str] = None + required_nodes: Optional[str] = None + job_state: Optional[str] = None duration: Optional[timedelta] = None + class Lookup: """Wrapper class for cached data access""" @@ -1757,11 +1761,11 @@ def _get_reservation(self, project: str, zone: str, name: str) -> object: """See https://cloud.google.com/compute/docs/reference/rest/v1/reservations""" return self.compute.reservations().get( project=project, zone=zone, reservation=name).execute() - + def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: if not nodeset.reservation_name: return None - + zones = list(nodeset.zone_policy_allow or []) assert len(zones) == 1, "Only single zone is supported if using a reservation" zone = zones[0] @@ -1771,7 +1775,7 @@ def nodeset_reservation(self, nodeset: object) -> Optional[ReservationDetails]: raise ValueError( f"Invalid reservation name: '{nodeset.reservation_name}', expected format is 'projects/PROJECT/reservations/NAME'" ) - + project, name = match.group("project", "reservation") reservation = self._get_reservation(project, zone, name) @@ -1928,26 +1932,54 @@ def nodeset_map(self, hostnames: list): nodeset_map[self.node_nodeset_name(node)].append(node) return nodeset_map + def _parse_job_info(self, job_info: str) -> Job: + """Extract job details""" + if match:= re.search(r"JobId=(\d+)", job_info): + job_id = int(match.group(1)) + else: + raise ValueError(f"Job ID not found in the job info: {job_info}") + + if match:= re.search(r"TimeLimit=(?:(\d+)-)?(\d{2}):(\d{2}):(\d{2})", job_info): + days, hours, minutes, seconds = match.groups() + duration = timedelta( + days=int(days) if days else 0, + hours=int(hours), + minutes=int(minutes), + seconds=int(seconds) + ) + else: + duration = None + + if match := re.search(r"JobName=(\w+)", job_info): + name = match.group(1) + else: + name = None + + if match := re.search(r"JobState=(\w+)", job_info): + job_state = match.group(1) + else: + job_state = None + + if match := re.search(r"ReqNodeList=([^ ]+)", job_info): + required_nodes = match.group(1) + else: + required_nodes = None + + return Job(id=job_id, duration=duration, name=name, job_state=job_state, required_nodes=required_nodes) + @lru_cache - def job(self, job_id: int) -> Optional[Job]: - jobInfo = run(f"{self.scontrol} show jobid {job_id}", check=False).stdout.rstrip() - if not jobInfo: - return None + def get_jobs(self) -> List[Job]: + res = run(f"{self.scontrol} show jobs", timeout=30) - timePattern = r"TimeLimit=(?:(\d+)-)?(\d{2}):(\d{2}):(\d{2})" - match = re.search(timePattern, jobInfo) + return [self._parse_job_info(job) for job in res.stdout.split("\n\n")[:-1]] - if not match: - return Job(id=job_id) + @lru_cache + def job(self, job_id: int) -> Optional[Job]: + job_info = run(f"{self.scontrol} show jobid {job_id}", check=False).stdout.rstrip() + if not job_info: + return None - days, hours, minutes, seconds = match.groups() - job_duration = timedelta( - days=int(days) if days else 0, - hours=int(hours), - minutes=int(minutes), - seconds=int(seconds) - ) - return Job(id=job_id, duration=job_duration) + return 
self._parse_job_info(job_info=job_info) @property def etc_dir(self) -> Path: From 6876b7de4ccb506e06f99fd399e2e23a22d4e2e4 Mon Sep 17 00:00:00 2001 From: chengcongdu Date: Thu, 7 Nov 2024 19:13:09 +0000 Subject: [PATCH 105/129] update a3 machines local ssd to use nvme instead of scsi for better performance --- modules/compute/gke-node-pool/disk_definitions.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/compute/gke-node-pool/disk_definitions.tf b/modules/compute/gke-node-pool/disk_definitions.tf index f7dbebea0a..b5933bf316 100644 --- a/modules/compute/gke-node-pool/disk_definitions.tf +++ b/modules/compute/gke-node-pool/disk_definitions.tf @@ -22,8 +22,8 @@ locals { local_ssd_machines = { - "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, - "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, + "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = 16 }, + "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = 16 }, } generated_local_ssd_config = lookup(local.local_ssd_machines, var.machine_type, { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = null }) From 1176e25cadf293548517891975a864ec35dfc8d0 Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Thu, 7 Nov 2024 22:10:50 +0000 Subject: [PATCH 106/129] Fix regex for Job name in fetching job details --- .../modules/slurm_files/scripts/tests/test_util.py | 14 ++++++++++++++ .../modules/slurm_files/scripts/util.py | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index 7a3c40946d..4104e948c5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -268,6 +268,20 @@ def test_nodeset_reservation_ok(nodeset, policies, expected): required_nodes="node-[1-10],grob-pop-[2,1,44-77]" ), ), + ( + """JobId=131415 + TimeLimit=1-00:30:00 + JobName=mynode-1_maintenance + JobState=COMPLETED, + ReqNodeList=node-[1-10],grob-pop-[2,1,44-77]""", + util.Job( + id=131415, + duration=timedelta(days=1, hours=0, minutes=30, seconds=0), + name="mynode-1_maintenance", + job_state="COMPLETED", + required_nodes="node-[1-10],grob-pop-[2,1,44-77]" + ), + ), ], ) def test_parse_job_info(job_info, expected_job): diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index 5690fea0a2..e841b58ca1 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -1950,7 +1950,7 @@ def _parse_job_info(self, job_info: str) -> Job: else: duration = None - if match := re.search(r"JobName=(\w+)", job_info): + if match := re.search(r"JobName=([^\n]+)", job_info): name = match.group(1) else: name = None From 7f37c7634b88eb224f2025719ee146591703f67b Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 7 Nov 2024 23:40:23 +0000 Subject: [PATCH 107/129] Fix version 
constraint for resource-policy module Remove upper bound for best practices and impose correct lower bound https://github.com/hashicorp/terraform-provider-google-beta/releases/tag/v4.56.0 --- modules/compute/resource-policy/README.md | 4 ++-- modules/compute/resource-policy/versions.tf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/compute/resource-policy/README.md b/modules/compute/resource-policy/README.md index f3f00e3437..85d8baee1c 100644 --- a/modules/compute/resource-policy/README.md +++ b/modules/compute/resource-policy/README.md @@ -43,13 +43,13 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.3 | -| [google-beta](#requirement\_google-beta) | ~> 5.0 | +| [google-beta](#requirement\_google-beta) | > 4.56.0 | ## Providers | Name | Version | |------|---------| -| [google-beta](#provider\_google-beta) | ~> 5.0 | +| [google-beta](#provider\_google-beta) | > 4.56.0 | ## Modules diff --git a/modules/compute/resource-policy/versions.tf b/modules/compute/resource-policy/versions.tf index 4b7b6158c9..89aea79811 100644 --- a/modules/compute/resource-policy/versions.tf +++ b/modules/compute/resource-policy/versions.tf @@ -18,7 +18,7 @@ terraform { required_providers { google-beta = { source = "hashicorp/google-beta" - version = "~> 5.0" + version = "> 4.56.0" } } From aa046cc489cd391a44d792b03f90bc704e5e9772 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 8 Nov 2024 14:41:58 +0000 Subject: [PATCH 108/129] Revert "Free slurm-gcp v5 hybrid blueprints with the latest cluster toolkit" This reverts commit 5cb64acebcfb136ddbeba2b6919e2677f1aab806. --- community/examples/tutorial-starccm-slurm.yaml | 2 -- docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml | 2 -- 2 files changed, 4 deletions(-) diff --git a/community/examples/tutorial-starccm-slurm.yaml b/community/examples/tutorial-starccm-slurm.yaml index 9e64014ea7..ebf52861ff 100644 --- a/community/examples/tutorial-starccm-slurm.yaml +++ b/community/examples/tutorial-starccm-slurm.yaml @@ -15,8 +15,6 @@ --- blueprint_name: starccm-on-slurm -toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit -toolkit_modules_version: v1.41.0 vars: project_id: ## Set GCP Project ID Here ## diff --git a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml index 813a90f0b6..45312348ed 100644 --- a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml +++ b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml @@ -15,8 +15,6 @@ --- blueprint_name: hpc-cluster-hybrid-v5 -toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit -toolkit_modules_version: v1.41.0 vars: project_id: ## <> From e509e6da6f72d9f56a540eaeb54c9bcaa20791a9 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 8 Nov 2024 16:28:53 +0000 Subject: [PATCH 109/129] Make gke-node-pool compatible with TPG 6.x The gke-node-pool module uses older "attribute" syntax for the GPU-related arguments that has been removed in the google Terraform plugin 6.x. This commit replaces attribute syntax with block syntax. The key to understanding this change is that a dynamic block iterating over a list is equivalent to null when the list is empty (no dynamic blocks are inserted). The gpu_sharing_config and gpu_driver_installation_config settings are not (and never were) list(object) in the Terraform plugin. They could only ever taken on length 0 or 1. 
These are therefore being converted to object format as they are in the API. https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#nested_guest_accelerator https://developer.hashicorp.com/terraform/language/attr-as-blocks --- modules/compute/gke-node-pool/README.md | 10 ++++---- modules/compute/gke-node-pool/main.tf | 30 +++++++++++++++++----- modules/compute/gke-node-pool/variables.tf | 30 +++++++++++++++++----- modules/compute/gke-node-pool/versions.tf | 4 +-- 4 files changed, 53 insertions(+), 21 deletions(-) diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 9fde80fab6..6aab9facb6 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -279,16 +279,16 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.5 | -| [google](#requirement\_google) | ~> 5.0 | -| [google-beta](#requirement\_google-beta) | ~> 5.0 | +| [google](#requirement\_google) | > 5 | +| [google-beta](#requirement\_google-beta) | > 5 | | [null](#requirement\_null) | ~> 3.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | ~> 5.0 | -| [google-beta](#provider\_google-beta) | ~> 5.0 | +| [google](#provider\_google) | > 5 | +| [google-beta](#provider\_google-beta) | > 5 | | [null](#provider\_null) | ~> 3.0 | ## Modules @@ -322,7 +322,7 @@ limitations under the License. | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(object({
gpu_driver_version = string
}), { gpu_driver_version = "DEFAULT" })
gpu_partition_size = optional(string)
gpu_sharing_config = optional(object({
gpu_sharing_strategy = string
max_shared_clients_per_gpu = number
}))
}))
| `[]` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [initial\_node\_count](#input\_initial\_node\_count) | The initial number of nodes for the pool. In regional clusters, this is the number of nodes per zone. Changing this setting after node pool creation will not make any effect. It cannot be set with static\_node\_count and must be set to a value between autoscaling\_total\_min\_nodes and autoscaling\_total\_max\_nodes. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index b556773559..829383b5ba 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -20,8 +20,7 @@ locals { } locals { - preattached_gpu_machine_family = contains(["a2", "a3", "g2"], local.machine_family) - has_gpu = (local.guest_accelerator != null && length(local.guest_accelerator) > 0) || local.preattached_gpu_machine_family + has_gpu = length(local.guest_accelerator) > 0 gpu_taint = local.has_gpu ? [{ key = "nvidia.com/gpu" value = "present" @@ -86,12 +85,29 @@ resource "google_container_node_pool" "node_pool" { dynamic "guest_accelerator" { for_each = local.guest_accelerator + iterator = ga content { - type = coalesce(guest_accelerator.value.type, try(local.generated_guest_accelerator[0].type, "")) - count = coalesce(try(guest_accelerator.value.count, 0) > 0 ? guest_accelerator.value.count : try(local.generated_guest_accelerator[0].count, "0")) - gpu_driver_installation_config = coalescelist(try(guest_accelerator.value.gpu_driver_installation_config, []), [{ gpu_driver_version = "DEFAULT" }]) - gpu_partition_size = try(guest_accelerator.value.gpu_partition_size, "") - gpu_sharing_config = try(guest_accelerator.value.gpu_sharing_config, null) + type = coalesce(ga.value.type, try(local.generated_guest_accelerator[0].type, "")) + count = coalesce(try(ga.value.count, 0) > 0 ? ga.value.count : try(local.generated_guest_accelerator[0].count, "0")) + + gpu_partition_size = try(ga.value.gpu_partition_size, null) + + dynamic "gpu_driver_installation_config" { + for_each = try([ga.gpu_driver_installation_config], [{ gpu_driver_version = "DEFAULT" }]) + iterator = gdic + content { + gpu_driver_version = gdic.value.gpu_driver_version + } + } + + dynamic "gpu_sharing_config" { + for_each = try(ga.value.gpu_sharing_config == null, true) ? 
[] : [ga.value.gpu_sharing_config] + iterator = gsc + content { + gpu_sharing_strategy = gsc.value.gpu_sharing_strategy + max_shared_clients_per_gpu = gsc.value.max_shared_clients_per_gpu + } + } } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 11442ceb4a..eecc4634c1 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -79,16 +79,32 @@ variable "guest_accelerator" { type = list(object({ type = optional(string) count = optional(number, 0) - gpu_driver_installation_config = optional(list(object({ + gpu_driver_installation_config = optional(object({ gpu_driver_version = string - }))) + }), { gpu_driver_version = "DEFAULT" }) gpu_partition_size = optional(string) - gpu_sharing_config = optional(list(object({ - gpu_sharing_strategy = optional(string) - max_shared_clients_per_gpu = optional(number) - }))) + gpu_sharing_config = optional(object({ + gpu_sharing_strategy = string + max_shared_clients_per_gpu = number + })) })) - default = null + default = [] + nullable = false + + validation { + condition = alltrue([for ga in var.guest_accelerator : ga.count != null]) + error_message = "var.guest_accelerator[*].count cannot be null" + } + + validation { + condition = alltrue([for ga in var.guest_accelerator : ga.count >= 0]) + error_message = "var.guest_accelerator[*].count must never be negative" + } + + validation { + condition = alltrue([for ga in var.guest_accelerator : ga.gpu_driver_installation_config != null]) + error_message = "var.guest_accelerator[*].gpu_driver_installation_config must not be null; leave unset to enable GKE to select default GPU driver installation" + } } variable "image_type" { diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index a20854b245..f0ef7ccb2c 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ b/modules/compute/gke-node-pool/versions.tf @@ -18,11 +18,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 5.0" + version = "> 5" } google-beta = { source = "hashicorp/google-beta" - version = "~> 5.0" + version = "> 5" } null = { source = "hashicorp/null" From 378bf2a344a18ca3c42d9119f566253ddfceb242 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 8 Nov 2024 16:28:53 +0000 Subject: [PATCH 110/129] Align GKE documentation and examples with TPG 6.x This commit fixes the documentation and examples to align with changes introduced in a9c2a69f to make gke-node-pool module compatible with TPG 6.x. --- examples/README.md | 4 ++-- modules/compute/gke-node-pool/README.md | 6 +++--- tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/README.md b/examples/README.md index 7933f73640..53a84d1a08 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1463,10 +1463,10 @@ guest_accelerator: - type: nvidia-l4 count: 1 gpu_sharing_config: - - max_shared_clients_per_gpu: 2 + max_shared_clients_per_gpu: 2 gpu_sharing_strategy: "TIME_SHARING" gpu_driver_installation_config: - - gpu_driver_version: "LATEST" + gpu_driver_version: "LATEST" ``` * Configuration of the cluster using default drivers provided by GKE. 
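The PATCH 109 message above hinges on one Terraform behavior: a `dynamic` block whose `for_each` collection is empty emits no block at all, which is equivalent to leaving the attribute null under the older attribute-as-blocks syntax. The sketch below isolates that pattern; it is only an illustration, and `examplecloud_node_pool` and its argument names are hypothetical placeholders rather than the real `google_container_node_pool` schema used by the module.

```hcl
# Illustrative pattern only; "examplecloud_node_pool" and its attributes are
# placeholders, not the real google_container_node_pool schema.
variable "gpu_sharing_config" {
  # A single optional object rather than a list(object); null means "unset".
  type = object({
    gpu_sharing_strategy        = string
    max_shared_clients_per_gpu  = number
  })
  default = null
}

resource "examplecloud_node_pool" "example" {
  # When var.gpu_sharing_config is null the for_each collection is empty, so
  # no gpu_sharing_config block is rendered and the provider default applies,
  # mirroring the behavior of the removed attribute syntax.
  dynamic "gpu_sharing_config" {
    for_each = var.gpu_sharing_config == null ? [] : [var.gpu_sharing_config]
    content {
      gpu_sharing_strategy        = gpu_sharing_config.value.gpu_sharing_strategy
      max_shared_clients_per_gpu  = gpu_sharing_config.value.max_shared_clients_per_gpu
    }
  }
}
```

Accepting a single optional object that defaults to `null`, instead of a `list(object)` of length 0 or 1, is what lets callers omit the setting entirely while still getting the provider's default behavior.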
diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 6aab9facb6..7359e934ca 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -151,7 +151,7 @@ The following is an example of guest_accelerator: - gpu_partition_size: 1g.5gb gpu_sharing_config: - - gpu_sharing_strategy: TIME_SHARING + gpu_sharing_strategy: TIME_SHARING max_shared_clients_per_gpu: 3 ``` @@ -181,9 +181,9 @@ The following is an example of using a GPU (with sharing config) attached to an - type: nvidia-tesla-t4 count: 2 gpu_driver_installation_config: - - gpu_driver_version: "LATEST" + gpu_driver_version: "LATEST" gpu_sharing_config: - - max_shared_clients_per_gpu: 2 + max_shared_clients_per_gpu: 2 gpu_sharing_strategy: "TIME_SHARING" ``` diff --git a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml index d7be384115..83ceb58c65 100644 --- a/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/blueprints/ml-gke-e2e.yaml @@ -70,9 +70,9 @@ deployment_groups: machine_type: g2-standard-4 guest_accelerator: - gpu_driver_installation_config: - - gpu_driver_version: "LATEST" + gpu_driver_version: "LATEST" gpu_sharing_config: - - max_shared_clients_per_gpu: 2 + max_shared_clients_per_gpu: 2 gpu_sharing_strategy: "MPS" - id: job_template_g2_latest_driver @@ -131,9 +131,9 @@ deployment_groups: - type: nvidia-tesla-t4 count: 2 gpu_driver_installation_config: - - gpu_driver_version: "LATEST" + gpu_driver_version: "LATEST" gpu_sharing_config: - - max_shared_clients_per_gpu: 2 + max_shared_clients_per_gpu: 2 gpu_sharing_strategy: "TIME_SHARING" - id: job_template_n1_pool_full_spec From bfe2d47feece1fab2543b7de34d14145931becba Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 8 Nov 2024 22:06:10 +0000 Subject: [PATCH 111/129] Drop support for TPG v5 - TPG 5.x is no longer supported or maintained for anything except critical backports - The filestore module will soon support only TPG 6.4 and above, effectively making most of our blueprints require TPG 6.4+ --- pkg/config/expand.go | 4 ++-- pkg/config/expand_test.go | 4 ++-- .../igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../golden_copies/expectations/igc_pkr/zero/versions.tf | 4 ++-- .../igc_tf/.ghpc/artifacts/expanded_blueprint.yaml | 8 ++++---- .../golden_copies/expectations/igc_tf/one/versions.tf | 4 ++-- .../golden_copies/expectations/igc_tf/zero/versions.tf | 4 ++-- .../merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/merge_flatten/zero/versions.tf | 4 ++-- .../.ghpc/artifacts/expanded_blueprint.yaml | 4 ++-- .../expectations/versioned_blueprint/primary/versions.tf | 4 ++-- 11 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 5e73706f15..9bad4dd2d1 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -199,11 +199,11 @@ func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { return map[string]TerraformProvider{ "google": { Source: "hashicorp/google", - Version: ">= 5.44.2, < 6.10.0", + Version: "~> 6.10.0", Configuration: gglConf}, "google-beta": { Source: "hashicorp/google-beta", - Version: ">= 5.44.2, < 6.10.0", + Version: "~> 6.10.0", Configuration: gglConf}} } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 17b8304900..ad00218133 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -93,10 +93,10 @@ 
func (s *zeroSuite) TestExpandProviders(c *C) { c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ "google": TerraformProvider{ Source: "hashicorp/google", - Version: ">= 5.44.2, < 6.10.0"}, + Version: "~> 6.10.0"}, "google-beta": TerraformProvider{ Source: "hashicorp/google-beta", - Version: ">= 5.44.2, < 6.10.0"}}) + Version: "~> 6.10.0"}}) } { // no def PR, group PR diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index f8df351469..1db9c66495 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -38,14 +38,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 5.44.2, < 6.10.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 5.44.2, < 6.10.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index c14eee115f..ed7b1bb3ba 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 5.44.2, < 6.10.0" + version = "~> 6.10.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 5.44.2, < 6.10.0" + version = "~> 6.10.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index d6d186dd07..fd6bd3e490 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -44,14 +44,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 5.44.2, < 6.10.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 5.44.2, < 6.10.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) @@ -80,14 +80,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 5.44.2, < 6.10.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 5.44.2, < 6.10.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index c14eee115f..ed7b1bb3ba 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version 
= ">= 5.44.2, < 6.10.0" + version = "~> 6.10.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 5.44.2, < 6.10.0" + version = "~> 6.10.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index c14eee115f..ed7b1bb3ba 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 5.44.2, < 6.10.0" + version = "~> 6.10.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 5.44.2, < 6.10.0" + version = "~> 6.10.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index fbd8d55a64..208cdde2ac 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -39,14 +39,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 5.44.2, < 6.10.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 5.44.2, < 6.10.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index c14eee115f..ed7b1bb3ba 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 5.44.2, < 6.10.0" + version = "~> 6.10.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 5.44.2, < 6.10.0" + version = "~> 6.10.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml index 537dbef727..d8414f6db3 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml @@ -47,14 +47,14 @@ deployment_groups: terraform_providers: google: source: hashicorp/google - version: '>= 5.44.2, < 6.10.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) zone: ((var.zone)) google-beta: source: hashicorp/google-beta - version: '>= 5.44.2, < 6.10.0' + version: ~> 6.10.0 configuration: project: ((var.project_id)) region: ((var.region)) diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf index c14eee115f..ed7b1bb3ba 100644 --- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf +++ 
b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 5.44.2, < 6.10.0" + version = "~> 6.10.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 5.44.2, < 6.10.0" + version = "~> 6.10.0" } } } From 05b0f888c25ca0230664e15a70021888c85c106b Mon Sep 17 00:00:00 2001 From: Harsh Thakkar Date: Tue, 22 Oct 2024 00:06:14 +0000 Subject: [PATCH 112/129] Add support to perform GCP maintenance as slurm job --- .../schedmd-slurm-gcp-v6-nodeset/README.md | 1 + .../schedmd-slurm-gcp-v6-nodeset/main.tf | 3 +- .../schedmd-slurm-gcp-v6-nodeset/variables.tf | 10 ++- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../modules/slurm_files/scripts/setup.py | 33 +++++++-- .../modules/slurm_files/scripts/slurmsync.py | 71 ++++++++++++++++++- .../partition.tf | 33 ++++----- .../variables.tf | 25 +++---- 8 files changed, 140 insertions(+), 38 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md index d3a002f372..8ad8c304f0 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -172,6 +172,7 @@ No modules. | [dws\_flex](#input\_dws\_flex) | If set and `enabled = true`, will utilize the DWS Flex Start to provision nodes.
See: https://cloud.google.com/blog/products/compute/introducing-dynamic-workload-scheduler
Options:
- enable: Enable DWS Flex Start
- max\_run\_duration: Maximum duration in seconds for the job to run, should not exceed 1,209,600 (2 weeks).
- use\_job\_duration: Use the job duration to determine the max\_run\_duration, if job duration is not set, max\_run\_duration will be used.

Limitations:
- CAN NOT be used with reservations;
- CAN NOT be used with placement groups;
- If `use_job_duration` is enabled nodeset can be used in "exclusive" partitions only |
object({
enabled = optional(bool, true)
max_run_duration = optional(number, 1209600) # 2 weeks
use_job_duration = optional(bool, false)
})
|
{
"enabled": false
}
| no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_maintenance\_reservation](#input\_enable\_maintenance\_reservation) | Enables slurm reservation for scheduled maintenance. | `bool` | `false` | no | +| [enable\_opportunistic\_maintenance](#input\_enable\_opportunistic\_maintenance) | On receiving maintenance notification, maintenance will be performed as soon as nodes becomes idle. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_placement](#input\_enable\_placement) | Enable placement groups. | `bool` | `true` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index ffaa9d4302..3f283ffade 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -105,7 +105,8 @@ locals { startup_script = local.ghpc_startup_script network_storage = var.network_storage - enable_maintenance_reservation = var.enable_maintenance_reservation + enable_maintenance_reservation = var.enable_maintenance_reservation + enable_opportunistic_maintenance = var.enable_opportunistic_maintenance } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 003a76d779..c35faad4e9 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -513,6 +513,14 @@ variable "enable_maintenance_reservation" { default = false } + +variable "enable_opportunistic_maintenance" { + type = bool + description = "On receiving maintenance notification, maintenance will be performed as soon as nodes becomes idle." + default = false +} + + variable "dws_flex" { description = <<-EOD If set and `enabled = true`, will utilize the DWS Flex Start to provision nodes. @@ -521,7 +529,7 @@ variable "dws_flex" { - enable: Enable DWS Flex Start - max_run_duration: Maximum duration in seconds for the job to run, should not exceed 1,209,600 (2 weeks). - use_job_duration: Use the job duration to determine the max_run_duration, if job duration is not set, max_run_duration will be used. - + Limitations: - CAN NOT be used with reservations; - CAN NOT be used with placement groups; diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index eb2e5b2f86..6f79a4a39e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -313,7 +313,7 @@ limitations under the License. | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, true)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
use_job_duration = bool
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, false)
enable_opportunistic_maintenance = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
dws_flex = object({
enabled = bool
max_run_duration = number
use_job_duration = bool
})
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index 37532f6285..46b86e77a6 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -83,7 +83,7 @@ SSSSSSSSSSSS SSS SSSSSSSSSSSSS SSSS SSSS SSSS SSSS """ - +_MAINTENANCE_SBATCH_SCRIPT_PATH = dirs.custom_scripts / "perform_maintenance.sh" def start_motd(): """advise in motd that slurm is currently configuring""" @@ -224,6 +224,26 @@ def setup_sudoers(): sudoers_file.chmod(0o0440) +def setup_maintenance_script(): + perform_maintenance = """#!/bin/bash + +#SBATCH --priority=low +#SBATCH --time=180 + +VM_NAME=$(curl -s "http://metadata.google.internal/computeMetadata/v1/instance/name" -H "Metadata-Flavor: Google") +ZONE=$(curl -s "http://metadata.google.internal/computeMetadata/v1/instance/zone" -H "Metadata-Flavor: Google" | cut -d '/' -f 4) + +gcloud compute instances perform-maintenance $VM_NAME \ + --zone=$ZONE +""" + + + with open(_MAINTENANCE_SBATCH_SCRIPT_PATH, "w") as f: + f.write(perform_maintenance) + + util.chown_slurm(_MAINTENANCE_SBATCH_SCRIPT_PATH, mode=0o755) + + def update_system_config(file, content): """Add system defaults options for service files""" sysconfig = Path("/etc/sysconfig") @@ -279,10 +299,10 @@ def configure_mysql(): def configure_dirs(): for p in dirs.values(): util.mkdirp(p) - + for p in (dirs.slurm, dirs.scripts, dirs.custom_scripts): util.chown_slurm(p) - + for p in slurmdirs.values(): util.mkdirp(p) util.chown_slurm(p) @@ -357,6 +377,9 @@ def setup_controller(): run("systemctl start slurm_load_bq.timer", timeout=30) run("systemctl status slurm_load_bq.timer", timeout=30) + # Add script to perform maintenance + setup_maintenance_script() + log.info("Done setting up controller") pass @@ -400,7 +423,7 @@ def setup_compute(): slurmd_options = [ f'--conf-server="{slurmctld_host}:{lookup().control_host_port}"', ] - + try: slurmd_feature = util.instance_metadata("attributes/slurmd_feature") except Exception: @@ -439,7 +462,7 @@ def setup_compute(): def main(): start_motd() - + log.info("Starting setup, fetching config") sleep_seconds = 5 while True: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index cea4ae2f35..5c8d05dbee 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -28,6 +28,7 @@ import datetime as dt from datetime import datetime from typing import Dict, Tuple +from functools import lru_cache import util from util import ( @@ -41,6 +42,7 @@ NSDict, TPU, chunked, + dirs, ) from util import lookup from suspend import delete_instances @@ -50,7 +52,7 @@ log = logging.getLogger() TOT_REQ_CNT = 1000 - +_MAINTENANCE_SBATCH_SCRIPT_PATH = dirs.custom_scripts / "perform_maintenance.sh" NodeStatus = Enum( "NodeStatus", @@ -483,7 +485,7 @@ def get_slurm_reservation_maintenance(lkp: util.Lookup) -> Dict[str, datetime]: return 
reservation_map - +@lru_cache def get_upcoming_maintenance(lkp: util.Lookup) -> Dict[str, Tuple[str, datetime]]: upc_maint_map = {} @@ -535,6 +537,65 @@ def sync_maintenance_reservation(lkp: util.Lookup) -> None: create_reservation(lkp, res_name, node, start_time) +def delete_maintenance_job(job_name: str) -> None: + util.run(f"scancel --name={job_name}") + + +def create_maintenance_job(job_name: str, node: str) -> None: + util.run(f"sbatch --job-name={job_name} --nodelist={node} {_MAINTENANCE_SBATCH_SCRIPT_PATH}") + + +def get_slurm_maintenance_job(lkp: util.Lookup) -> Dict[str, str]: + jobs = {} + + for job in lkp.get_jobs(): + if job.name is None or job.required_nodes is None or job.job_state is None: + continue + + if job.name != f"{job.required_nodes}_maintenance": + continue + + if job.job_state != "PENDING": + continue + + jobs[job.name] = job.required_nodes + + return jobs + + +def sync_opportunistic_maintenance(lkp: util.Lookup) -> None: + upc_maint_map = get_upcoming_maintenance(lkp) # map job_name -> (node_name, time) + log.debug(f"upcoming-maintenance-vms: {upc_maint_map}") + + curr_jobs = get_slurm_maintenance_job(lkp) # map job_name -> node. + log.debug(f"curr-maintenance-job-map: {curr_jobs}") + + del_jobs = set(curr_jobs.keys() - upc_maint_map.keys()) + create_jobs = {} + + for job_name, (node, _) in upc_maint_map.items(): + try: + enabled = lkp.node_nodeset(node).enable_opportunistic_maintenance + except Exception: + enabled = False + + if not enabled: + if job_name in curr_jobs: + del_jobs.add(job_name) + continue + + if job_name not in curr_jobs: + create_jobs[job_name] = node + + log.debug(f"del-maintenance-job: {del_jobs}") + for job_name in del_jobs: + delete_maintenance_job(job_name) + + log.debug(f"create-maintenance-job: {create_jobs}") + for job_name, node in create_jobs.items(): + create_maintenance_job(job_name, node) + + def main(): try: reconfigure_slurm() @@ -562,6 +623,12 @@ def main(): except Exception: log.exception("failed to sync slurm reservation for scheduled maintenance") + try: + sync_opportunistic_maintenance(lookup()) + except Exception: + log.exception("failed to sync opportunistic reservation for scheduled maintenance") + + try: # TODO: it performs 1 to 4 GCS list requests, # use cached version, combine with `_list_config_blobs` diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 6287399993..71a44a7236 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -81,22 +81,23 @@ module "nodeset_cleanup" { locals { nodesets = [for name, ns in local.nodeset_map : { - nodeset_name = ns.nodeset_name - node_conf = ns.node_conf - dws_flex = ns.dws_flex - instance_template = module.slurm_nodeset_template[ns.nodeset_name].self_link - node_count_dynamic_max = ns.node_count_dynamic_max - node_count_static = ns.node_count_static - subnetwork = ns.subnetwork_self_link - reservation_name = ns.reservation_name - maintenance_interval = ns.maintenance_interval - instance_properties_json = ns.instance_properties_json - enable_placement = ns.enable_placement - network_storage = ns.network_storage - zone_target_shape = ns.zone_target_shape - zone_policy_allow = ns.zone_policy_allow - zone_policy_deny = ns.zone_policy_deny - enable_maintenance_reservation = ns.enable_maintenance_reservation + nodeset_name = ns.nodeset_name + node_conf = 
ns.node_conf + dws_flex = ns.dws_flex + instance_template = module.slurm_nodeset_template[ns.nodeset_name].self_link + node_count_dynamic_max = ns.node_count_dynamic_max + node_count_static = ns.node_count_static + subnetwork = ns.subnetwork_self_link + reservation_name = ns.reservation_name + maintenance_interval = ns.maintenance_interval + instance_properties_json = ns.instance_properties_json + enable_placement = ns.enable_placement + network_storage = ns.network_storage + zone_target_shape = ns.zone_target_shape + zone_policy_allow = ns.zone_policy_allow + zone_policy_deny = ns.zone_policy_deny + enable_maintenance_reservation = ns.enable_maintenance_reservation + enable_opportunistic_maintenance = ns.enable_opportunistic_maintenance }] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index 1fc4fb1e0f..b06d62b39f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -196,18 +196,19 @@ variable "nodeset" { auto_delete = optional(bool, true) boot = optional(bool, false) })), []) - bandwidth_tier = optional(string, "platform_default") - can_ip_forward = optional(bool, false) - disable_smt = optional(bool, false) - disk_auto_delete = optional(bool, true) - disk_labels = optional(map(string), {}) - disk_size_gb = optional(number) - disk_type = optional(string) - enable_confidential_vm = optional(bool, false) - enable_placement = optional(bool, false) - enable_oslogin = optional(bool, true) - enable_shielded_vm = optional(bool, false) - enable_maintenance_reservation = optional(bool, true) + bandwidth_tier = optional(string, "platform_default") + can_ip_forward = optional(bool, false) + disable_smt = optional(bool, false) + disk_auto_delete = optional(bool, true) + disk_labels = optional(map(string), {}) + disk_size_gb = optional(number) + disk_type = optional(string) + enable_confidential_vm = optional(bool, false) + enable_placement = optional(bool, false) + enable_oslogin = optional(bool, true) + enable_shielded_vm = optional(bool, false) + enable_maintenance_reservation = optional(bool, false) + enable_opportunistic_maintenance = optional(bool, false) gpu = optional(object({ count = number type = string From f581c2333b384b979ac7bab195a43d60aa078644 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 12 Nov 2024 00:02:06 +0000 Subject: [PATCH 113/129] SlurmGCP. Add workaround for #3242 to examples --- examples/hpc-enterprise-slurm-v5-legacy.yaml | 2 ++ examples/hpc-enterprise-slurm.yaml | 2 ++ examples/ml-slurm-v5-legacy.yaml | 2 ++ examples/ml-slurm.yaml | 2 ++ 4 files changed, 8 insertions(+) diff --git a/examples/hpc-enterprise-slurm-v5-legacy.yaml b/examples/hpc-enterprise-slurm-v5-legacy.yaml index e482a10d15..99e831ca60 100644 --- a/examples/hpc-enterprise-slurm-v5-legacy.yaml +++ b/examples/hpc-enterprise-slurm-v5-legacy.yaml @@ -36,6 +36,8 @@ vars: # When set, active compute nodes will be cleaned up on destroy. # Note that setting this option requires additional dependencies to be installed locally. 
enable_cleanup_compute: true + metadata: # Workaround for https://github.com/GoogleCloudPlatform/cluster-toolkit/discussions/3243 + VmDnsSetting: GlobalOnly # Recommended to use GCS backend for Terraform state # See https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/examples#optional-setting-up-a-remote-terraform-state diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index ac08d1d06e..e03daa3d27 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -29,6 +29,8 @@ vars: project: schedmd-slurm-public # If image above is changed to use custom image, then setting below must be set to true instance_image_custom: false + metadata: # Workaround for https://github.com/GoogleCloudPlatform/cluster-toolkit/discussions/3243 + VmDnsSetting: GlobalOnly # Recommended to use GCS backend for Terraform state # See https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/examples#optional-setting-up-a-remote-terraform-state diff --git a/examples/ml-slurm-v5-legacy.yaml b/examples/ml-slurm-v5-legacy.yaml index bf0e107bb0..113c052405 100644 --- a/examples/ml-slurm-v5-legacy.yaml +++ b/examples/ml-slurm-v5-legacy.yaml @@ -28,6 +28,8 @@ vars: family: ml-slurm project: $(vars.project_id) disk_size_gb: 200 + metadata: # Workaround for https://github.com/GoogleCloudPlatform/cluster-toolkit/discussions/3243 + VmDnsSetting: GlobalOnly # Recommended to use GCS backend for Terraform state # See https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/examples#optional-setting-up-a-remote-terraform-state diff --git a/examples/ml-slurm.yaml b/examples/ml-slurm.yaml index 59967cd3e3..1eea66d407 100644 --- a/examples/ml-slurm.yaml +++ b/examples/ml-slurm.yaml @@ -29,6 +29,8 @@ vars: family: ml-slurm project: $(vars.project_id) disk_size_gb: 200 + metadata: # Workaround for https://github.com/GoogleCloudPlatform/cluster-toolkit/discussions/3243 + VmDnsSetting: GlobalOnly # Recommended to use GCS backend for Terraform state # See https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/examples#optional-setting-up-a-remote-terraform-state From 1aaef554521299285a1eda61eefe264a2f09e43e Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 12 Nov 2024 18:59:07 +0000 Subject: [PATCH 114/129] Fix formatting of Docker config warning --- modules/scripts/startup-script/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/scripts/startup-script/main.tf b/modules/scripts/startup-script/main.tf index b694275ed0..3d41bf262f 100644 --- a/modules/scripts/startup-script/main.tf +++ b/modules/scripts/startup-script/main.tf @@ -205,9 +205,9 @@ check "health_check" { assert { condition = local.docker_config == {} error_message = <<-EOT - This message is only a warning. The Toolkit performs no validation of the Docker - daemon configuration. VM startup scripts will fail if the configuration file is - not a valid Docker JSON configuration. Please review the Docker documentation: + This message is only a warning. The Toolkit performs no validation of the + Docker daemon configuration. VM startup scripts will fail if the file is not + a valid Docker JSON configuration. 
Please review the Docker documentation: https://docs.docker.com/engine/daemon/ EOT From ec3ac969e69ec67368ba8a97fc2d38c0e23a7239 Mon Sep 17 00:00:00 2001 From: NinaCai Date: Tue, 12 Nov 2024 20:45:50 +0000 Subject: [PATCH 115/129] show my changes only --- .../slurm_files/scripts/tools/gpu-test | 76 +++++++++++++++++++ .../a3-megagpu-8g/slurm-a3mega-cluster.yaml | 2 + 2 files changed, 78 insertions(+) create mode 100644 community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tools/gpu-test diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tools/gpu-test b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tools/gpu-test new file mode 100644 index 0000000000..7bf5e22d2b --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tools/gpu-test @@ -0,0 +1,76 @@ +# Fail gracefully if nvidia-smi or dcgmi doesn't exist +if ! type -P nvidia-smi 1>/dev/null; then + echo "nvidia-smi not found - this script requires nvidia-smi to function" >&2 + exit 0 +fi + +if ! type -P dcgmi 1>/dev/null; then + echo "dcgmi not found - this script requires dcgmi to function" >&2 + exit 0 +fi + +if ! type -P nv-hostengine 1>/dev/null; then + echo "nv-hostengine not found - this script requires nv-hostengine to function" >&2 + exit 0 +fi + +# Exit if GPU isn't H100 +GPU_MODEL=$(nvidia-smi --query-gpu=name --format=csv,noheader) +if [[ "$GPU_MODEL" != *"H100"* ]]; then + echo "Non-H100 GPU detected" >&2 + exit 0 +fi + +NUMGPUS=$(nvidia-smi -L | wc -l) + +# Check that all GPUs are healthy via DCGM and check for ECC errors +if [ $NUMGPUS -gt 0 ]; then + echo "Execute DCGM health check and ECC error check for GPUs" + GPULIST=$(nvidia-smi --query-gpu=index --format=csv,noheader | tr '\n' ',' | sed 's/,$//') + rm -f /tmp/dcgm.out + rm -f /tmp/ecc_errors.out + + # Run DCGM checks + START_HOSTENGINE=false + if ! pidof nv-hostengine > /dev/null; then + echo "Starting nv-hostengine..." + nv-hostengine + sleep 1 # Give it a moment to start up + START_HOSTENGINE=true + fi + GROUPID=$(dcgmi group -c gpuinfo | awk '{print $NF}' | tr -d ' ') + dcgmi group -g $GROUPID -a $GPULIST + dcgmi diag -g $GROUPID -r 1 1> /tmp/dcgm.out + dcgmi group -d $GROUPID + + # Terminate the host engine if it was manually started + if [ "$START_HOSTENGINE" = true ]; then + echo "Terminating nv-hostengine..." + nv-hostengine -t + fi + + # Check for DCGM failures + DCGM_FAILED=0 + grep -i fail /tmp/dcgm.out > /dev/null || DCGM_FAILED=$? 
+ + # Check for ECC errors + nvidia-smi --query-gpu=ecc.errors.uncorrected.volatile.total --format=csv,noheader > /tmp/ecc_errors.out + ECC_ERRORS=$(awk -F', ' '{sum += $2} END {print sum}' /tmp/ecc_errors.out) + + # Check for NVLink errors + NVLINK_ERRORS=$(nvidia-smi nvlink -sc 0bz -i 0 2>/dev/null | grep -i "Error Count" | awk '{sum += $3} END {print sum}') + # Set to 0 if empty/null + NVLINK_ERRORS=${NVLINK_ERRORS:-0} + + if [ $DCGM_FAILED -eq 0 ] || \ + [ $ECC_ERRORS -gt 0 ] || \ + [ $NVLINK_ERRORS -gt 0 ]; then + REASON="H100 GPU issues detected: " + [ $DCGM_FAILED -eq 0 ] && REASON+="DCGM test failed, " + [ $ECC_ERRORS -gt 0 ] && REASON+="ECC errors found ($ECC_ERRORS double-bit errors), " + [ $NVLINK_ERRORS -gt 0 ] && REASON+="NVLink errors detected ($NVLINK_ERRORS errors), " + REASON+="see /tmp/dcgm.out and /tmp/ecc_errors.out" + echo "$REASON" + exit 1 + fi +fi diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml index 8b8a3eda19..2dbed95c27 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml @@ -226,6 +226,8 @@ deployment_groups: chmod 0755 "${SLURM_ROOT}/scripts/rxdm" ln -s "${SLURM_ROOT}/scripts/rxdm" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-prolog_slurmd.d/rxdm.prolog_slurmd" ln -s "${SLURM_ROOT}/scripts/rxdm" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/rxdm.epilog_slurmd" + ln -s "/slurm/scripts/tools/gpu-test" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-prolog_slurmd.d/gpu-test.prolog_slurmd" + ln -s "/slurm/scripts/tools/gpu-test" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/gpu-test.epilog_slurmd" - type: shell destination: reset_enroot.sh content: | From a6111c639ef1db7d76bb438ab6889a8724dbddbe Mon Sep 17 00:00:00 2001 From: NinaCai Date: Tue, 12 Nov 2024 21:52:03 +0000 Subject: [PATCH 116/129] only keep epilog to check gpu health --- .../machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml index 2dbed95c27..55cb453157 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml @@ -226,7 +226,6 @@ deployment_groups: chmod 0755 "${SLURM_ROOT}/scripts/rxdm" ln -s "${SLURM_ROOT}/scripts/rxdm" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-prolog_slurmd.d/rxdm.prolog_slurmd" ln -s "${SLURM_ROOT}/scripts/rxdm" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/rxdm.epilog_slurmd" - ln -s "/slurm/scripts/tools/gpu-test" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-prolog_slurmd.d/gpu-test.prolog_slurmd" ln -s "/slurm/scripts/tools/gpu-test" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/gpu-test.epilog_slurmd" - type: shell destination: reset_enroot.sh From 7e65a3e196940dc1586e387e085286295742df50 Mon Sep 17 00:00:00 2001 From: Alyssa Date: Thu, 7 Nov 2024 08:36:48 +0000 Subject: [PATCH 117/129] Script for Python Tests --- tools/python-integration-tests/test.py | 344 +++++++++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 tools/python-integration-tests/test.py diff --git a/tools/python-integration-tests/test.py b/tools/python-integration-tests/test.py new 
file mode 100644 index 0000000000..00412f7d7b --- /dev/null +++ b/tools/python-integration-tests/test.py @@ -0,0 +1,344 @@ +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +import shutil +import os +import re +import signal +import socket +import subprocess +import sys +import time +import paramiko +from collections import defaultdict +import argparse +import yaml + +def run_command(cmd: str, err_msg: str = None) -> subprocess.CompletedProcess: + res = subprocess.run(cmd, shell=True, universal_newlines=True, check=True, + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if res.returncode != 0: + raise subprocess.SubprocessError(f"{err_msg}:\n{res.stderr}") + + return res + +def parse_blueprint(file_path: str): + with open(file_path, 'r') as file: + content = yaml.safe_load(file) + return content["vars"]["deployment_name"], content["vars"]["zone"] + +def get_account_info(): + # Extract the username from posixAccounts + result = run_command(f"gcloud compute os-login describe-profile --format=json").stdout + posixAccounts = json.loads(result) + + for account in posixAccounts.get('posixAccounts', []): + if 'accountId' in account: + project_id = account['accountId'] + username = account['username'] + return project_id, username + +def create_deployment(blueprint: str): + project_id, username = get_account_info() + deployment_name, zone = parse_blueprint(blueprint) + return Deployment(blueprint, project_id, username, deployment_name, zone) + +def test_simple_job_completion(blueprint: str): + deployment = create_deployment(blueprint) + deployment.deploy() + try: + # Waiting to let the login node finish set up or ssh will fail. + print("Wait 60 seconds") + time.sleep(60) + + ssh = deployment.ssh() + test = Test(ssh, deployment) + test.check_simple_job_completion() + finally: + deployment.close_tunnel() + deployment.destroy() + +def test_topology(blueprint: str): + deployment = create_deployment(blueprint) + deployment.deploy() + try: + # Waiting to let the login node finish set up or ssh will fail. 
+ print("Wait 60 seconds") + time.sleep(60) + ssh = deployment.ssh() + test = Test(ssh, deployment) + test.check_topology() + finally: + deployment.close_tunnel() + deployment.destroy() + +class Deployment: + def __init__(self, blueprint: str, project_id: str, username: str, deployment_name: str, zone: str): + self.blueprint_yaml = blueprint + self.project_id = project_id + self.state_bucket = "daily-tests-tf-state" + self.workspace = "" + self.username = username + self.deployment_name = deployment_name + self.zone = zone + self.test_name = deployment_name + self.tunnel = None + + def get_workspace(self): + return os.path.abspath(os.getcwd().strip()) + + def create_blueprint(self): + self.workspace = self.get_workspace() + + cmd = [ + "./gcluster", + "create", + "-l", + "ERROR", + self.blueprint_yaml, + "--backend-config", + f"bucket={self.state_bucket}", + "--vars", + f"project_id={self.project_id}", + "--vars", + f"deployment_name={self.deployment_name}" + ] + + subprocess.run(cmd, check=True, cwd=self.workspace) + + def compress_blueprint(self): + cmd = [ + "tar", + "-czf", + "%s.tgz" % (self.deployment_name), + "%s" % (self.deployment_name), + ] + + subprocess.run(cmd, check=True, cwd=self.workspace) + + def upload_deployment(self): + cmd = [ + "gsutil", + "cp", + "%s.tgz" % (self.deployment_name), + "gs://%s/%s/" % (self.state_bucket, self.test_name) + ] + + subprocess.run(cmd, check=True, cwd=self.workspace) + + def print_download_command(self): + print("gcloud storage cp gs://%s/%s/%s.tgz ." % (self.state_bucket, self.test_name, self.deployment_name)) + + def create_deployment_directory(self): + self.create_blueprint() + self.compress_blueprint() + self.upload_deployment() + self.print_download_command() + + def deploy(self): + # Create deployment directory + self.create_deployment_directory() + cmd = [ + "./gcluster", + "deploy", + self.deployment_name, + "--auto-approve" + ] + + subprocess.run(cmd, check=True, cwd=self.workspace) + + def ssh(self) -> paramiko.SSHClient: + instance_name = self.deployment_name.replace("-", "")[:10] + "-slurm-login-001" + + # Use existing SSH key pair (assuming it's already in ~/.ssh/google_compute_engine) + key_path = os.path.expanduser("~/.ssh/google_compute_engine") + + # Add the public key to OS Login + public_key_path = key_path + ".pub" + subprocess.run( + [ + "gcloud", "compute", "os-login", "ssh-keys", "describe", + "--key-file", public_key_path + ], + check=True, capture_output=True + ) + + # Construct the gcloud command to create the IAP tunnel + iap_tunnel_cmd = [ + "gcloud", "compute", "start-iap-tunnel", instance_name, + "22", "--project", self.project_id, "--zone", self.zone, + "--local-host-port=localhost:10022" + ] + + # Create the IAP tunnel process + self.tunnel = subprocess.Popen(iap_tunnel_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + # Sleep to give the tunnel a few seconds to set up + time.sleep(3) + + # Create an SSH client + ssh = paramiko.SSHClient() + ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + + # Load the private key + key = paramiko.RSAKey.from_private_key_file(key_path) + + # Connect to the VM + ssh.connect("localhost", port=10022, username=self.username, pkey=key) + + return ssh + + def close_tunnel(self): + if self.tunnel: + self.tunnel.terminate() + self.tunnel.wait() + self.tunnel = None + + def destroy(self): + cmd = [ + "./gcluster", + "destroy", + self.deployment_name, + "--auto-approve" + ] + + subprocess.run(cmd, check=True, cwd=self.workspace) + 
os.remove(f"{self.deployment_name}.tgz") + shutil.rmtree(self.deployment_name) + + +class Test: + def __init__(self, ssh, deployment): + self.ssh = ssh + self.deployment = deployment + self.job_list = {} + + def get_slurm_topology(self): + stdin, stdout, stderr = self.ssh.exec_command("scontrol show topo") + return stdout.read().decode() + + def monitor_squeue(self): + # Monitors squeue and updates self.job_list until all running jobs are complete. + lines = [] + + while True: + stdin, stdout, stderr = self.ssh.exec_command('squeue') + + lines = stdout.read().decode().splitlines()[1:] # Skip header + + if not lines: + break + for line in lines: + parts = line.split() + job_id, partition, _, _, state, times, nodes, nodelist = line.split() + + if job_id not in self.job_list: + print(f"Job id {job_id} is not recognized.") + else: + self.job_list[job_id].update({ + "partition": partition, + "state": state, + "time": times, + "nodes": nodes, + "nodelist": nodelist, + }) + time.sleep(5) + + def is_job_complete(self, job_id: str): + # Checks if a job successfully completed. + stdin, stdout, stderr = self.ssh.exec_command(f'scontrol show job {job_id} --json') + content = json.load(stdout) + return content["jobs"][0]["job_state"][0] == "COMPLETED" + + def submit_job(self, cmd: str): + stdin, stdout, stderr = self.ssh.exec_command(cmd) + jobID = stdout.read().decode().split()[-1] + self.job_list[jobID] = {} + + def get_node_depth(self, switch_name: str): + return switch_name.count("_") + + def get_real_rack(self, node: str): + result = run_command(f"gcloud compute instances describe {node} --zone={self.deployment.zone} --project={self.deployment.project_id} --format='value(resourceStatus.physicalHost)'") + return result.stdout.split("/")[1] + + def get_slurm_rack(self, node: str): + stdin, stdout, stderr = self.ssh.exec_command(f"scontrol show topology {node} | tail -1 | cut -d' ' -f1") + switch_name = stdout.read().decode() + self.assert_equal(self.get_node_depth(switch_name), 2, f"{node} does not have the expected topology depth of 2."), + return switch_name + + def get_nodes(self): + nodes = [] + stdin, stdout, stderr = self.ssh.exec_command("scontrol show node| grep NodeName") + for line in stdout.read().decode().splitlines(): + nodes.append(line.split()[0].split("=")[1]) + return nodes + + def assert_equal(self, value1, value2, message=None): + if value1 != value2: + if message is None: + message = f"Assertion failed: {value1} != {value2}" + raise AssertionError(message) + + def check_simple_job_completion(self): + # Submits 5 jobs and checks if they are successful. + for i in range(5): + self.submit_job('sbatch -N 1 --wrap "sleep 20"') + self.monitor_squeue() + + for job_id in self.job_list.keys(): + result = self.is_job_complete(job_id) + self.assert_equal(True, result, f"Something went wrong with JobID:{job_id}.") + print(f"JobID {job_id} finished successfully.") + + def check_topology(self): + # Checks isomorphism of last layer of nodes to determine topology. 
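+        # r_rack groups nodes by the physical rack reported by the Compute API
+        # (resourceStatus.physicalHost), while s_rack groups the same nodes by the
+        # lowest-level switch Slurm assigns them to; the two groupings should
+        # partition the node set identically if Slurm's topology matches reality.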
+ r_rack, s_rack = defaultdict(set), defaultdict(set) + nodes = self.get_nodes() + + for node in nodes: + r_rack[self.get_real_rack(node)].add(node) + s_rack[self.get_slurm_rack(node)].add(node) + + r_rack_set = [set(v) for v in r_rack.values()] + s_rack_set = [set(v) for v in s_rack.values()] + + self.assert_equal(r_rack_set, s_rack_set, "The two sets did not match.") + +def main(simple_test_blueprints, topo_test_blueprints) -> None: + if simple_test_blueprints: + for blueprint in simple_test_blueprints: + test_simple_job_completion(blueprint) + print(f'{blueprint} passed simple slurm test.') + + if topo_test_blueprints: + for blueprint in topo_test_blueprints: + test_topology(blueprint) + print(f'{blueprint} passed topology test.') + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog='test.py', + description="", + formatter_class=argparse.RawTextHelpFormatter + ) + parser.add_argument("--simple", nargs="+", help="File path(s) to blueprint(s) to do the simple slurm test on.") + parser.add_argument("--topo", nargs="+", help="File path(s) to blueprint(s) to do the topology test on.") + + args = parser.parse_args() + + main(args.simple, args.topo) From bc1d86b78f0106e011baf8d3b58cc12724ebe8cc Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 12 Nov 2024 22:06:24 +0000 Subject: [PATCH 118/129] Add support for Filestore deletion protection Filestore deletion protection ensures that instances are not unintentionally deleted. A typical lifecycle for a user will look like: 1. Deploy a blueprint with deletion protection enabled 2. Disable deletion protection in blueprint 3. Redeploy blueprint 4. Destroy deployment In particular, enabling Filestore deletion protection does not prevent Terraform from destroying other resources. So a `gcluster destroy` command will destroy all resources except the Filestore and its dependencies. --- modules/file-system/filestore/README.md | 31 ++++++++++++++++++++-- modules/file-system/filestore/main.tf | 3 +++ modules/file-system/filestore/variables.tf | 17 ++++++++++++ modules/file-system/filestore/versions.tf | 2 +- 4 files changed, 50 insertions(+), 3 deletions(-) diff --git a/modules/file-system/filestore/README.md b/modules/file-system/filestore/README.md index e6de03f9bd..7a78672c40 100644 --- a/modules/file-system/filestore/README.md +++ b/modules/file-system/filestore/README.md @@ -7,6 +7,32 @@ mounted to one or more compute VMs. For more information on this and other network storage options in the Cluster Toolkit, see the extended [Network Storage documentation](../../../docs/network_storage.md). +### Deletion protection + +We recommend considering enabling [Filestore deletion protection][fdp]. Deletion +protection will prevent unintentional deletion of an entire Filestore instance. +It does not prevent deletion of files within the Filestore instance when mounted +by a VM. It ss not available on some [tiers](#filestore-tiers), including the +default BASIC\_HDD tier or BASIC\_SSD tier. Follow the documentation link for +up to date details. 
+ +Usage can be enabled in a blueprint with, for example: + +```yaml + - id: homefs + source: modules/file-system/filestore + use: [network] + settings: + deletion_protection: + enabled: true + reason: Avoid data loss + filestore_tier: ZONAL + local_mount: /home + size_gb: 1024 +``` + +[fdp]: https://cloud.google.com/filestore/docs/deletion-protection + ### Filestore tiers At the time of writing, Filestore supports 5 [tiers of service][tiers] that are @@ -149,14 +175,14 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 0.14.0 | -| [google](#requirement\_google) | >= 4.19 | +| [google](#requirement\_google) | >= 6.4 | | [random](#requirement\_random) | ~> 3.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | >= 4.19 | +| [google](#provider\_google) | >= 6.4 | | [random](#provider\_random) | ~> 3.0 | ## Modules @@ -175,6 +201,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [connect\_mode](#input\_connect\_mode) | Used to select mode - supported values DIRECT\_PEERING and PRIVATE\_SERVICE\_ACCESS. | `string` | `"DIRECT_PEERING"` | no | +| [deletion\_protection](#input\_deletion\_protection) | Configure Filestore instance deletion protection |
<pre>object({<br/>    enabled = optional(bool, false)<br/>    reason  = optional(string)<br/>  })</pre> | <pre>{<br/>  "enabled": false<br/>}</pre>
| no | | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment, used as name of the filestore instance if no name is specified. | `string` | n/a | yes | | [filestore\_share\_name](#input\_filestore\_share\_name) | Name of the file system share on the instance. | `string` | `"nfsshare"` | no | | [filestore\_tier](#input\_filestore\_tier) | The service tier of the instance. | `string` | `"BASIC_HDD"` | no | diff --git a/modules/file-system/filestore/main.tf b/modules/file-system/filestore/main.tf index 53d24db8a0..8075a7848b 100644 --- a/modules/file-system/filestore/main.tf +++ b/modules/file-system/filestore/main.tf @@ -56,6 +56,9 @@ resource "google_filestore_instance" "filestore_instance" { location = var.filestore_tier == "ENTERPRISE" ? var.region : var.zone tier = var.filestore_tier + deletion_protection_enabled = var.deletion_protection.enabled + deletion_protection_reason = var.deletion_protection.reason + file_shares { capacity_gb = var.size_gb name = var.filestore_share_name diff --git a/modules/file-system/filestore/variables.tf b/modules/file-system/filestore/variables.tf index d48619c741..e3dbf1c9ff 100644 --- a/modules/file-system/filestore/variables.tf +++ b/modules/file-system/filestore/variables.tf @@ -147,3 +147,20 @@ variable "mount_options" { type = string default = "defaults,_netdev" } + +variable "deletion_protection" { + description = "Configure Filestore instance deletion protection" + type = object({ + enabled = optional(bool, false) + reason = optional(string) + }) + default = { + enabled = false + } + nullable = false + + validation { + condition = !can(coalesce(var.deletion_protection.reason)) || var.deletion_protection.enabled + error_message = "Cannot set Filestore var.deletion_protection.reason unless var.deletion_protection.enabled is true" + } +} diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index 3454ca00c6..1648dcb660 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -18,7 +18,7 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.19" + version = ">= 6.4" } random = { source = "hashicorp/random" From 12f0ccbec5f703f5c0ab756fb6eee36aca66fcbc Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 12 Nov 2024 15:40:30 -0800 Subject: [PATCH 119/129] Don't install Lustre in A3 tests This is a temporary workaround to prevent failures while we debug the issue. 
--- .../machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml | 2 +- .../a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index 62abc006fe..64bb96f0a1 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -82,7 +82,7 @@ deployment_groups: "reboot": false, "install_cuda": false, "install_gcsfuse": true, - "install_lustre": true, + "install_lustre": false, "install_ompi": true, "monitoring_agent": "cloud-ops", "nvidia_version": "latest", diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml index 88d5cefda8..08060286b6 100644 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml +++ b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml @@ -82,7 +82,7 @@ deployment_groups: "install_cuda": false, "nvidia_version": "latest", "install_ompi": true, - "install_lustre": true, + "install_lustre": false, "install_gcsfuse": true, "monitoring_agent": "cloud-ops" } From ff4192b2278cf8827da45c68fa18771d0ba8b588 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 12 Nov 2024 23:25:13 +0000 Subject: [PATCH 120/129] SlurmGCP. Don't use remote module to create controller instance --- .../schedmd-slurm-gcp-v6-controller/README.md | 2 +- .../controller.tf | 47 +++++++++---------- .../schedmd-slurm-gcp-v6-controller/login.tf | 2 +- .../outputs.tf | 4 +- 4 files changed, 26 insertions(+), 29 deletions(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index 6f79a4a39e..65f4b437f5 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -238,7 +238,6 @@ limitations under the License. | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | ../../../../modules/scripts/startup-script | n/a | | [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | | [nodeset\_cleanup\_tpu](#module\_nodeset\_cleanup\_tpu) | ./modules/cleanup_tpu | n/a | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.5 | | [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.8.5 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | | [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.8.5 | @@ -250,6 +249,7 @@ limitations under the License. 
| Name | Type | |------|------| +| [google_compute_instance_from_template.controller](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_from_template) | resource | | [google_secret_manager_secret.cloudsql](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret) | resource | | [google_secret_manager_secret_iam_member.cloudsql_secret_accessor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | | [google_secret_manager_secret_version.cloudsql_version](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_version) | resource | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 8803b76fce..4b455ed5bd 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -73,7 +73,6 @@ module "slurm_controller_template" { metadata = local.metadata min_cpu_platform = var.min_cpu_platform - # network_ip = TODO: add support for network_ip on_host_maintenance = var.on_host_maintenance preemptible = var.preemptible service_account = local.service_account @@ -82,7 +81,6 @@ module "slurm_controller_template" { source_image_project = local.source_image_project_normalized # requires source_image_logic.tf source_image = local.source_image # requires source_image_logic.tf - # spot = TODO: add support for spot (?) subnetwork = var.subnetwork_self_link tags = concat([local.slurm_cluster_name], var.tags) @@ -90,32 +88,31 @@ module "slurm_controller_template" { } # INSTANCE -locals { - # TODO: add support for proper access_config - access_config = { - nat_ip = null - network_tier = null +resource "google_compute_instance_from_template" "controller" { + name = "${local.slurm_cluster_name}-controller" + project = var.project_id + zone = var.zone + source_instance_template = module.slurm_controller_template.self_link + + allow_stopping_for_update = true + + # Can't rely on template to specify nics due to usage of static_ip + network_interface { + dynamic "access_config" { + for_each = var.enable_controller_public_ips ? ["unit"] : [] + content { + nat_ip = null + network_tier = null + } + } + network_ip = length(var.static_ips) == 0 ? "" : var.static_ips[0] + subnetwork = var.subnetwork_self_link } } -module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.8.5" - - access_config = var.enable_controller_public_ips ? 
[local.access_config] : [] - add_hostname_suffix = false - hostname = "${local.slurm_cluster_name}-controller" - instance_template = module.slurm_controller_template.self_link - - project_id = var.project_id - region = var.region - slurm_cluster_name = local.slurm_cluster_name - slurm_instance_role = "controller" - static_ips = var.static_ips - subnetwork = var.subnetwork_self_link - zone = var.zone - metadata = var.metadata - - labels = local.labels +moved { + from = module.slurm_controller_instance.google_compute_instance_from_template.slurm_instance[0] + to = google_compute_instance_from_template.controller } # SECRETS: CLOUDSQL diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index e320dbb893..dd8e4699ec 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -78,5 +78,5 @@ module "slurm_login_instance" { zone = each.value.zone # trigger replacement of login nodes when the controller instance is replaced - replace_trigger = module.slurm_controller_instance.instances_self_links[0] + replace_trigger = google_compute_instance_from_template.controller.self_link } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf index 06ffb93594..400a58b437 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/outputs.tf @@ -19,7 +19,7 @@ output "slurm_cluster_name" { output "slurm_controller_instance" { description = "Compute instance of controller node" - value = module.slurm_controller_instance.slurm_instances[0] + value = google_compute_instance_from_template.controller } output "slurm_login_instances" { @@ -36,6 +36,6 @@ output "instructions" { description = "Post deployment instructions." value = <<-EOT To SSH to the controller (may need to add '--tunnel-through-iap'): - gcloud compute ssh ${module.slurm_controller_instance.instances_self_links[0]} + gcloud compute ssh ${google_compute_instance_from_template.controller.self_link} EOT } From 6befe7c5d7e0f6899490a62f3873a28671d69a02 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 12 Nov 2024 22:29:06 -0600 Subject: [PATCH 121/129] Update modules/file-system/filestore/README.md Co-authored-by: Rachael Tamakloe <63983467+RachaelSTamakloe@users.noreply.github.com> --- modules/file-system/filestore/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/file-system/filestore/README.md b/modules/file-system/filestore/README.md index 7a78672c40..193c5f4dc9 100644 --- a/modules/file-system/filestore/README.md +++ b/modules/file-system/filestore/README.md @@ -12,7 +12,7 @@ Toolkit, see the extended [Network Storage documentation](../../../docs/network_ We recommend considering enabling [Filestore deletion protection][fdp]. Deletion protection will prevent unintentional deletion of an entire Filestore instance. It does not prevent deletion of files within the Filestore instance when mounted -by a VM. It ss not available on some [tiers](#filestore-tiers), including the +by a VM. It is not available on some [tiers](#filestore-tiers), including the default BASIC\_HDD tier or BASIC\_SSD tier. Follow the documentation link for up to date details. 
From 3a75ce6d43c9e18458ce7c1db45f274fe48a3d63 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 12 Nov 2024 16:13:39 -0800 Subject: [PATCH 122/129] Remove CentOS 7 from list of supported images --- docs/vm-images.md | 36 ++++++------------------------------ 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/docs/vm-images.md b/docs/vm-images.md index 4b968d68b9..76cc1249be 100644 --- a/docs/vm-images.md +++ b/docs/vm-images.md @@ -105,20 +105,9 @@ project and the new image name in the `instance_image` field discussed in ## Cluster Toolkit Supported Images -### HPC CentOS 7 - -The Cluster Toolkit has officially supported the [HPC CentOS 7 VM Image][hpcimage] -as the primary VM image for HPC workloads on Google Cloud since it's release. -Since the [HPC CentOS 7 VM Image][hpcimage] comes pre-tuned for optimal -performance on typical HPC workloads, it is the default VM image in our modules, -unless there is specific requirement for a different OS distribution. - -[hpcimage]: https://cloud.google.com/blog/topics/hpc/introducing-hpc-vm-images - ### HPC Rocky Linux 8 -HPC Rocky Linux 8 is planned to become the primary supported VM image for HPC -workloads on Google Cloud from 2024. +HPC Rocky Linux 8 is the primary supported VM image for HPC workloads on Google Cloud. ### Debian 11 @@ -142,20 +131,19 @@ description of our support for Windows images. Deployment Type/Scheduler Feature - CentOS 7Debian 11Rocky Linux 8Ubuntu 20.04 + Debian 11Rocky Linux 8Ubuntu 20.04 - + Cloud Batch Lustre - - + Shared filestore @@ -163,7 +151,6 @@ description of our support for Windows images. - Startup script @@ -171,14 +158,12 @@ description of our support for Windows images. - Slurm Chrome Remote Desktop - @@ -186,25 +171,22 @@ description of our support for Windows images. Lustre - - ✓ + Shared filestore ✓ - ✓ Startup script - - ✓ ✓ + ✓ @@ -212,7 +194,6 @@ description of our support for Windows images. VM Instance Chrome Remote Desktop - * @@ -220,7 +201,6 @@ description of our support for Windows images. Lustre - @@ -231,7 +211,6 @@ description of our support for Windows images. - Startup script @@ -239,13 +218,11 @@ description of our support for Windows images. - HTCondor - ✓ ✓ @@ -255,7 +232,6 @@ description of our support for Windows images. - ✓ From 7795d02ff66ae0472316a102dad3267974652f85 Mon Sep 17 00:00:00 2001 From: NinaCai Date: Wed, 13 Nov 2024 18:20:35 +0000 Subject: [PATCH 123/129] add shebang and license --- .../modules/slurm_files/scripts/tools/gpu-test | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tools/gpu-test b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tools/gpu-test index 7bf5e22d2b..0aaaeb2fc0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tools/gpu-test +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tools/gpu-test @@ -1,3 +1,18 @@ +#!/bin/bash +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + # Fail gracefully if nvidia-smi or dcgmi doesn't exist if ! type -P nvidia-smi 1>/dev/null; then echo "nvidia-smi not found - this script requires nvidia-smi to function" >&2 From 0036d66aa5a092a6a2f4e043bca0d6377fa95225 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 13 Nov 2024 21:14:44 +0000 Subject: [PATCH 124/129] Comment out pluging of `gpu-test` --- .../machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml index 55cb453157..4117b2c7f8 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml @@ -226,7 +226,8 @@ deployment_groups: chmod 0755 "${SLURM_ROOT}/scripts/rxdm" ln -s "${SLURM_ROOT}/scripts/rxdm" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-prolog_slurmd.d/rxdm.prolog_slurmd" ln -s "${SLURM_ROOT}/scripts/rxdm" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/rxdm.epilog_slurmd" - ln -s "/slurm/scripts/tools/gpu-test" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/gpu-test.epilog_slurmd" + # Uncomment the line below to enable epilog that will check health of GPUs and drain node if problem is detected. + # ln -s "/slurm/scripts/tools/gpu-test" "${SLURM_ROOT}/partition-$(vars.a3mega_partition_name)-epilog_slurmd.d/gpu-test.epilog_slurmd" - type: shell destination: reset_enroot.sh content: | From efe4236a647fd584ca0215946bda8348e31ae215 Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Thu, 14 Nov 2024 10:43:30 +0000 Subject: [PATCH 125/129] Fix a bug where try was hiding extraction of gpu driver version from customer --- modules/compute/gke-node-pool/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 829383b5ba..e4bc94cbcc 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -93,7 +93,7 @@ resource "google_container_node_pool" "node_pool" { gpu_partition_size = try(ga.value.gpu_partition_size, null) dynamic "gpu_driver_installation_config" { - for_each = try([ga.gpu_driver_installation_config], [{ gpu_driver_version = "DEFAULT" }]) + for_each = coalescelist([ga.value.gpu_driver_installation_config], [{ gpu_driver_version = "DEFAULT" }]) iterator = gdic content { gpu_driver_version = gdic.value.gpu_driver_version From b31c74051cc1262f52616aed7816ac85c4a6a7ba Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Thu, 14 Nov 2024 17:42:45 +0000 Subject: [PATCH 126/129] Fix the gpu_installation_config default for case where no customer input --- modules/compute/gke-node-pool/main.tf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index e4bc94cbcc..a76775c7fe 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -93,7 +93,8 @@ resource "google_container_node_pool" "node_pool" { gpu_partition_size = try(ga.value.gpu_partition_size, null) dynamic "gpu_driver_installation_config" { - for_each = coalescelist([ga.value.gpu_driver_installation_config], [{ 
gpu_driver_version = "DEFAULT" }]) + # in case user did not specify guest_accelerator settings, we need a try to default to [] + for_each = coalescelist(try([ga.value.gpu_driver_installation_config], []), [{ gpu_driver_version = "DEFAULT" }]) iterator = gdic content { gpu_driver_version = gdic.value.gpu_driver_version From b2cdd88d4188e7536ba2166bf33792ad1b69eb2a Mon Sep 17 00:00:00 2001 From: Ankit Kinra <1037624+ankitkinra@users.noreply.github.com> Date: Thu, 14 Nov 2024 18:49:30 +0000 Subject: [PATCH 127/129] replace coalesce with more succinct code snipped --- modules/compute/gke-node-pool/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index a76775c7fe..3e38564988 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -94,7 +94,7 @@ resource "google_container_node_pool" "node_pool" { dynamic "gpu_driver_installation_config" { # in case user did not specify guest_accelerator settings, we need a try to default to [] - for_each = coalescelist(try([ga.value.gpu_driver_installation_config], []), [{ gpu_driver_version = "DEFAULT" }]) + for_each = try([ga.value.gpu_driver_installation_config], [{ gpu_driver_version = "DEFAULT" }]) iterator = gdic content { gpu_driver_version = gdic.value.gpu_driver_version From ce9f466ec38f9d5e77a0911fa7177d6c3b1996e9 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Fri, 15 Nov 2024 18:34:00 +0000 Subject: [PATCH 128/129] SlurmGCP. Fix bug that prevents resourcePolicies clean up. --- .../modules/slurm_files/scripts/slurmsync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index 5c8d05dbee..1bd876a56f 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -353,7 +353,7 @@ def sync_placement_groups(): op = act.aggregatedList(project=lookup().project, fields=fields, filter=flt) placement_groups = {} pg_regex = re.compile( - rf"{lookup().cfg.slurm_cluster_name}-(?P[^\s\-]+)-(?P\d+)-(?P\d+)" + rf"{lookup().cfg.slurm_cluster_name}-slurmgcp-managed-(?P[^\s\-]+)-(?P\d+)-(?P\d+)" ) while op is not None: result = ensure_execute(op) From e6d5c135b88221f3166b3e8db889fb513e3fe6cb Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Thu, 14 Nov 2024 14:20:11 -0800 Subject: [PATCH 129/129] Increase version to 1.42.0 --- cmd/root.go | 2 +- community/modules/compute/htcondor-execute-point/versions.tf | 2 +- community/modules/compute/mig/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-node-group/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-partition/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf | 2 +- .../modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v6-partition/versions.tf | 2 +- .../modules/database/slurm-cloudsql-federation/versions.tf | 4 ++-- .../modules/file-system/cloud-storage-bucket/versions.tf | 2 +- community/modules/file-system/nfs-server/versions.tf | 2 +- community/modules/files/fsi-montecarlo-on-batch/versions.tf | 4 ++-- community/modules/network/private-service-access/versions.tf 
| 4 ++-- community/modules/project/service-enablement/versions.tf | 2 +- community/modules/pubsub/bigquery-sub/versions.tf | 4 ++-- community/modules/pubsub/topic/versions.tf | 2 +- community/modules/scheduler/htcondor-access-point/versions.tf | 2 +- .../modules/scheduler/htcondor-central-manager/versions.tf | 2 +- community/modules/scheduler/htcondor-pool-secrets/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v5-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v6-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf | 2 +- community/modules/scripts/wait-for-startup/versions.tf | 2 +- community/modules/scripts/windows-startup-script/versions.tf | 2 +- modules/compute/gke-node-pool/versions.tf | 2 +- modules/compute/vm-instance/versions.tf | 4 ++-- modules/file-system/filestore/versions.tf | 4 ++-- modules/file-system/gke-persistent-volume/versions.tf | 2 +- modules/file-system/gke-storage/versions.tf | 2 +- modules/monitoring/dashboard/versions.tf | 2 +- modules/network/firewall-rules/versions.tf | 2 +- modules/network/pre-existing-subnetwork/versions.tf | 2 +- modules/network/pre-existing-vpc/versions.tf | 2 +- modules/scheduler/batch-login-node/versions.tf | 2 +- modules/scheduler/gke-cluster/versions.tf | 2 +- modules/scheduler/pre-existing-gke-cluster/versions.tf | 2 +- modules/scripts/startup-script/versions.tf | 2 +- 39 files changed, 45 insertions(+), 45 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index e58b8a743d..f46cc18e47 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -53,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.41.0", + Version: "v1.42.0", Annotations: annotation, } ) diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index 3f320827a1..db3d320aad 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.42.0" } } diff --git a/community/modules/compute/mig/versions.tf b/community/modules/compute/mig/versions.tf index 8e5b3caa45..4c0dd383c6 100644 --- a/community/modules/compute/mig/versions.tf +++ b/community/modules/compute/mig/versions.tf @@ -22,6 +22,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:mig/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:mig/v1.42.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index 51f49882a1..f1c7fedf63 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.42.0" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index 
4f00828f19..e26e07735f 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.42.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf index 9e7273093a..ed469721ae 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.42.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index f519a18161..52d7873c81 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.42.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 242244c5f7..05edfba039 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.42.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index 17489d3f93..d5e2163add 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.42.0" } } diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 1e92271e3a..f40eb68805 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.42.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.41.0" + module_name = 
"blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.42.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index 0a6664171a..f36bbb2e2b 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.42.0" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index 5251b527b0..85db83d596 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.42.0" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index 469e310bc0..1d1848a59b 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.42.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.42.0" } } diff --git a/community/modules/network/private-service-access/versions.tf b/community/modules/network/private-service-access/versions.tf index efb0f8f2d1..26477fbb8f 100644 --- a/community/modules/network/private-service-access/versions.tf +++ b/community/modules/network/private-service-access/versions.tf @@ -30,11 +30,11 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.42.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.42.0" } required_version = ">= 1.2" diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index 974520409d..b762a7b066 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.42.0" } required_version = ">= 0.14.0" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index 5597272dca..bd0edb216f 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = 
"blueprints/terraform/hpc-toolkit:bigquery-sub/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.42.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.42.0" } required_version = ">= 1.0" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index 2a3e2fb59b..5870c64e94 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.42.0" } } diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index 3d452c24bb..260bf47cc0 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.42.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index 432b506666..8b1837acae 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.42.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 103fe43a30..766a26ced7 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.42.0" } required_version = ">= 1.3.0" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index d9e1f9b600..b6b84844fe 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.42.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index c52321d462..af509f4827 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = 
"blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.42.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index c1fc007bf0..1a4982e158 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.42.0" } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf index dbcebd21c1..74c4e35664 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.42.0" } } diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index e60ec22c3c..e7e0af9bd1 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.42.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index 1a2aa18a3b..5e6507714d 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.42.0" } required_version = ">= 0.14.0" diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index f0ef7ccb2c..72fe6c75ba 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ b/modules/compute/gke-node-pool/versions.tf @@ -30,6 +30,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.42.0" } } diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index c6577b1edb..95be3897cd 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.42.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.42.0" } required_version = ">= 1.3.0" diff --git 
a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index 1648dcb660..3722efa557 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.42.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.42.0" } required_version = ">= 0.14.0" diff --git a/modules/file-system/gke-persistent-volume/versions.tf b/modules/file-system/gke-persistent-volume/versions.tf index b87efd8a16..e717d4c42c 100644 --- a/modules/file-system/gke-persistent-volume/versions.tf +++ b/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.42.0" } } diff --git a/modules/file-system/gke-storage/versions.tf b/modules/file-system/gke-storage/versions.tf index 27f82792ab..ba5b8164f8 100644 --- a/modules/file-system/gke-storage/versions.tf +++ b/modules/file-system/gke-storage/versions.tf @@ -16,6 +16,6 @@ terraform { required_version = ">= 1.0" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-storage/v1.42.0" } } diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index dbf59fa86f..3615d37090 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.42.0" } required_version = ">= 0.14.0" diff --git a/modules/network/firewall-rules/versions.tf b/modules/network/firewall-rules/versions.tf index 5312b04355..dfa3e8f332 100644 --- a/modules/network/firewall-rules/versions.tf +++ b/modules/network/firewall-rules/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.42.0" } required_version = ">= 1.3" diff --git a/modules/network/pre-existing-subnetwork/versions.tf b/modules/network/pre-existing-subnetwork/versions.tf index 7a38f30404..e8a3464fa4 100644 --- a/modules/network/pre-existing-subnetwork/versions.tf +++ b/modules/network/pre-existing-subnetwork/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.42.0" } required_version = ">= 0.14.0" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index c9f1ec5992..0585447957 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.42.0" } required_version = ">= 0.14.0" diff --git 
a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index 599294a84e..9600f1ad02 100644 --- a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.42.0" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/gke-cluster/versions.tf b/modules/scheduler/gke-cluster/versions.tf index 67c30a9e84..a8d1ecfd89 100644 --- a/modules/scheduler/gke-cluster/versions.tf +++ b/modules/scheduler/gke-cluster/versions.tf @@ -34,6 +34,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.42.0" } } diff --git a/modules/scheduler/pre-existing-gke-cluster/versions.tf b/modules/scheduler/pre-existing-gke-cluster/versions.tf index 328bdda8e1..3b15bab237 100644 --- a/modules/scheduler/pre-existing-gke-cluster/versions.tf +++ b/modules/scheduler/pre-existing-gke-cluster/versions.tf @@ -23,7 +23,7 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-gke-cluster/v1.42.0" } required_version = ">= 1.3" diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index c954c7e6fa..826cf6f810 100644 --- a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.41.0" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.42.0" } required_version = ">= 1.3"