diff --git a/.github/workflows/pr-precommit.yml b/.github/workflows/pr-precommit.yml index 12a32c5900..37234d2a0e 100644 --- a/.github/workflows/pr-precommit.yml +++ b/.github/workflows/pr-precommit.yml @@ -24,7 +24,9 @@ on: - labeled - synchronize branches: + - main - develop + - release-candidate jobs: pre-commit: diff --git a/cmd/deploy.go b/cmd/deploy.go index 0a13915935..74e6210f2a 100644 --- a/cmd/deploy.go +++ b/cmd/deploy.go @@ -28,9 +28,10 @@ import ( ) func addDeployFlags(c *cobra.Command) *cobra.Command { - return addAutoApproveFlag( - addArtifactsDirFlag( - addCreateFlags(c))) + return addGroupSelectionFlags( + addAutoApproveFlag( + addArtifactsDirFlag( + addCreateFlags(c)))) } func init() { @@ -71,10 +72,16 @@ func doDeploy(deplRoot string) { checkErr(shell.CheckWritableDir(artDir), nil) bp, ctx := artifactBlueprintOrDie(artDir) groups := bp.Groups + checkErr(validateGroupSelectionFlags(bp), ctx) checkErr(validateRuntimeDependencies(deplRoot, groups), ctx) checkErr(shell.ValidateDeploymentDirectory(groups, deplRoot), ctx) for ig, group := range groups { + if !isGroupSelected(group.Name) { + logging.Info("skipping group %q", group.Name) + continue + } + groupDir := filepath.Join(deplRoot, string(group.Name)) checkErr(shell.ImportInputs(groupDir, artDir, bp), ctx) diff --git a/cmd/destroy.go b/cmd/destroy.go index 05f289669d..bad0a14a16 100644 --- a/cmd/destroy.go +++ b/cmd/destroy.go @@ -31,8 +31,9 @@ import ( func init() { rootCmd.AddCommand( - addAutoApproveFlag( - addArtifactsDirFlag(destroyCmd))) + addGroupSelectionFlags( + addAutoApproveFlag( + addArtifactsDirFlag(destroyCmd)))) } var ( @@ -56,13 +57,17 @@ func runDestroyCmd(cmd *cobra.Command, args []string) { } bp, ctx := artifactBlueprintOrDie(artifactsDir) - + checkErr(validateGroupSelectionFlags(bp), ctx) checkErr(shell.ValidateDeploymentDirectory(bp.Groups, deplRoot), ctx) // destroy in reverse order of creation! 
packerManifests := []string{} for i := len(bp.Groups) - 1; i >= 0; i-- { group := bp.Groups[i] + if !isGroupSelected(group.Name) { + logging.Info("skipping group %q", group.Name) + continue + } groupDir := filepath.Join(deplRoot, string(group.Name)) if err := shell.ImportInputs(groupDir, artifactsDir, bp); err != nil { diff --git a/cmd/root.go b/cmd/root.go index 10513b9f9a..ee6f2e23d7 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -52,7 +52,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.34.1", + Version: "v1.35.0", Annotations: annotation, } ) diff --git a/cmd/utils.go b/cmd/utils.go index b23bcd55b6..65bb2959e9 100644 --- a/cmd/utils.go +++ b/cmd/utils.go @@ -15,10 +15,13 @@ package cmd import ( + "errors" "fmt" + "hpc-toolkit/pkg/config" "hpc-toolkit/pkg/modulewriter" "hpc-toolkit/pkg/shell" "os" + "slices" "github.com/spf13/cobra" ) @@ -78,3 +81,41 @@ func filterYaml(cmd *cobra.Command, args []string, toComplete string) ([]string, } return []string{"yaml", "yml"}, cobra.ShellCompDirectiveFilterFileExt } + +var flagSkipGroups []string +var flagOnlyGroups []string + +func addGroupSelectionFlags(c *cobra.Command) *cobra.Command { + c.Flags().StringSliceVar(&flagSkipGroups, "skip", nil, "Skip groups with the given names") + c.Flags().StringSliceVar(&flagOnlyGroups, "only", nil, "Only apply to groups with the given names") + return c +} + +func validateGroupSelectionFlags(bp config.Blueprint) error { + if flagOnlyGroups != nil && flagSkipGroups != nil { + return errors.New("cannot specify both --only and --skip") + } + + dict := []string{} + for _, group := range bp.Groups { + dict = append(dict, string(group.Name)) + } + + for _, g := range append(flagOnlyGroups, flagSkipGroups...) 
{ + if !slices.Contains(dict, g) { + return config.HintSpelling(g, dict, fmt.Errorf("group %q not found", g)) + } + } + + return nil +} + +func isGroupSelected(g config.GroupName) bool { + if flagOnlyGroups != nil { + return slices.Contains(flagOnlyGroups, string(g)) + } + if flagSkipGroups != nil { + return !slices.Contains(flagSkipGroups, string(g)) + } + return true +} diff --git a/cmd/utils_test.go b/cmd/utils_test.go new file mode 100644 index 0000000000..49ec9d3548 --- /dev/null +++ b/cmd/utils_test.go @@ -0,0 +1,83 @@ +// Copyright 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cmd + +import ( + "fmt" + "hpc-toolkit/pkg/config" + "testing" +) + +func TestIsGroupSelected(t *testing.T) { + type test struct { + only []string + skip []string + group config.GroupName + want bool + } + tests := []test{ + {nil, nil, "green", true}, + {[]string{"green"}, nil, "green", true}, + {[]string{"green"}, nil, "blue", false}, + {nil, []string{"green"}, "green", false}, + {nil, []string{"green"}, "blue", true}, + } + + for _, tc := range tests { + t.Run(fmt.Sprintf("%v;%v;%q", tc.only, tc.skip, tc.group), func(t *testing.T) { + flagOnlyGroups, flagSkipGroups = tc.only, tc.skip + got := isGroupSelected(tc.group) + if got != tc.want { + t.Errorf("isGroupSelected(%v) = %v; want %v", tc.group, got, tc.want) + } + }) + } +} + +func TestValidateGroupSelectionFlags(t *testing.T) { + type test struct { + only []string + skip []string + groups []string + err bool + } + tests := []test{ + {nil, nil, []string{"green"}, false}, + {[]string{"green"}, []string{"blue"}, []string{"green", "blue"}, true}, + {[]string{"green"}, nil, []string{"green"}, false}, + {[]string{"green"}, nil, []string{"blue"}, true}, + {nil, []string{"green"}, []string{"green"}, false}, + {nil, []string{"green"}, []string{"blue"}, true}, + } + + for _, tc := range tests { + t.Run(fmt.Sprintf("%v;%v;%v", tc.only, tc.skip, tc.groups), func(t *testing.T) { + flagOnlyGroups, flagSkipGroups = tc.only, tc.skip + bp := config.Blueprint{} + for _, g := range tc.groups { + bp.Groups = append(bp.Groups, config.Group{Name: config.GroupName(g)}) + } + + err := validateGroupSelectionFlags(bp) + if tc.err && err == nil { + t.Errorf("validateGroupSelectionFlags(%v) = nil; want error", tc.groups) + } + if !tc.err && err != nil { + t.Errorf("validateGroupSelectionFlags(%v) = %v; want nil", tc.groups, err) + } + }) + } + +} diff --git a/community/examples/hpc-build-slurm-image.yaml b/community/examples/hpc-build-slurm-image.yaml index 374474dc9e..0edade5ef0 100644 --- 
a/community/examples/hpc-build-slurm-image.yaml +++ b/community/examples/hpc-build-slurm-image.yaml @@ -23,7 +23,7 @@ vars: image_build_machine_type: n2d-standard-16 build_from_image_family: hpc-rocky-linux-8 build_from_image_project: cloud-hpc-image-public - build_from_git_ref: 6.5.6 + build_from_git_ref: 6.5.8 built_image_family: my-custom-slurm built_instance_image: family: $(vars.built_image_family) diff --git a/community/examples/ml-gke.yaml b/community/examples/ml-gke.yaml index 60bccfb031..d6ae26b173 100644 --- a/community/examples/ml-gke.yaml +++ b/community/examples/ml-gke.yaml @@ -18,7 +18,9 @@ blueprint_name: ml-gke vars: project_id: ## Set GCP Project ID Here ## deployment_name: ml-01 - region: us-central1 + region: asia-southeast1 + zones: + - asia-southeast1-b # g2 machine has better availability in this zone # Cidr block containing the IP of the machine calling terraform. # The following line must be updated for this example to work. @@ -48,22 +50,23 @@ deployment_groups: cidr_block: $(vars.authorized_cidr) outputs: [instructions] - # Docs at https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scripts/kubernetes-operations - - id: install-nvidia-drivers - source: github.com/GoogleCloudPlatform/ai-infra-cluster-provisioning//aiinfra-cluster/modules/kubernetes-operations?ref=v0.6.0 - use: [gke_cluster] - settings: - install_nvidia_driver: true - - - id: a2-pool + - id: g2-pool source: community/modules/compute/gke-node-pool use: [gke_cluster] settings: - machine_type: a2-highgpu-8g + disk_type: pd-balanced + machine_type: g2-standard-4 + guest_accelerator: + - type: nvidia-l4 + count: 1 + gpu_partition_size: null + gpu_sharing_config: null + gpu_driver_installation_config: + - gpu_driver_version: "DEFAULT" - id: job-template source: community/modules/compute/gke-job-template - use: [a2-pool] + use: [g2-pool] settings: image: nvidia/cuda:11.0.3-runtime-ubuntu20.04 command: diff --git a/community/front-end/ofe/deploy.sh 
b/community/front-end/ofe/deploy.sh index 0360591d9c..c1414753d0 100755 --- a/community/front-end/ofe/deploy.sh +++ b/community/front-end/ofe/deploy.sh @@ -57,6 +57,8 @@ PRJ_API['bigqueryconnection.googleapis.com']='BigQuery Connection API' PRJ_API['sqladmin.googleapis.com']='Cloud SQL Admin API' PRJ_API['servicenetworking.googleapis.com']='Service Networking API' PRJ_API['secretmanager.googleapis.com']='Secret Manager API' +PRJ_API['serviceusage.googleapis.com']='Service Usage API' +PRJ_API['storage.googleapis.com']='Cloud Storage API' # Location for output credential file = pwd/credential.json # diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh index 8396e99dd8..ecce1e3a32 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh @@ -249,6 +249,7 @@ autostart=true autorestart=true user=gcluster redirect_stderr=true +environment=HOME=/opt/gcluster stdout_logfile=/opt/gcluster/run/supvisor.log" >/etc/supervisord.d/gcluster.ini printf "Creating systemd service..." diff --git a/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/README.md b/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/README.md index 1062c08dde..118343799a 100644 --- a/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/README.md +++ b/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/README.md @@ -19,6 +19,7 @@ limitations under the License. |------|---------| | [terraform](#requirement\_terraform) | >= 0.12.31 | | [google](#requirement\_google) | >= 3.54 | +| [google-beta](#requirement\_google-beta) | >= 3.83 | | [random](#requirement\_random) | >= 3.0 | ## Providers @@ -26,6 +27,7 @@ limitations under the License. 
| Name | Version | |------|---------| | [google](#provider\_google) | >= 3.54 | +| [google-beta](#provider\_google-beta) | >= 3.83 | | [random](#provider\_random) | >= 3.0 | ## Modules @@ -36,11 +38,14 @@ No modules. | Name | Type | |------|------| +| [google-beta_google_compute_global_address.private_ip_alloc](https://registry.terraform.io/providers/hashicorp/google-beta/latest/docs/resources/google_compute_global_address) | resource | | [google_compute_firewall.firewall_allow_ssh](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_firewall) | resource | | [google_compute_firewall.firewall_internal](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_firewall) | resource | | [google_compute_network.network](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_network) | resource | | [google_compute_router.network_router](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_router) | resource | | [google_compute_router_nat.network_nat](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_router_nat) | resource | +| [google_service_networking_connection.private_vpc_connection](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/service_networking_connection) | resource | +| [random_id.resource_name_suffix](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id) | resource | | [random_pet.vpc_name](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/pet) | resource | ## Inputs diff --git a/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/main.tf b/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/main.tf index b843d944b7..55b1603d55 100644 --- a/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/main.tf +++ b/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/main.tf @@ -85,6 +85,34 @@ resource 
"google_compute_firewall" "firewall_internal" { allow { protocol = "icmp" } } +locals { + # This label allows for billing report tracking based on module. + labels = { + created_by = "ofe" + } +} + +resource "random_id" "resource_name_suffix" { + byte_length = 4 +} + +resource "google_compute_global_address" "private_ip_alloc" { + provider = google-beta + project = var.project + name = "global-psconnect-ip-${random_id.resource_name_suffix.hex}" + purpose = "VPC_PEERING" + address_type = "INTERNAL" + network = google_compute_network.network.self_link + prefix_length = 16 + labels = local.labels +} + +resource "google_service_networking_connection" "private_vpc_connection" { + network = google_compute_network.network.self_link + service = "servicenetworking.googleapis.com" + reserved_peering_ranges = [google_compute_global_address.private_ip_alloc.name] +} + output "vpc_id" { value = google_compute_network.network.name description = "Name of the created VPC" diff --git a/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/versions.tf b/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/versions.tf index c5d78daa00..302315613e 100644 --- a/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/versions.tf +++ b/community/front-end/ofe/infrastructure_files/vpc_tf/GCP/versions.tf @@ -20,6 +20,10 @@ terraform { source = "hashicorp/google" version = ">= 3.54" } + google-beta = { + source = "hashicorp/google-beta" + version = ">= 3.83" + } random = { source = "hashicorp/random" version = ">= 3.0" diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py index bf563c54d7..b5708ef1f4 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py @@ -39,6 +39,7 @@ "n1": defaultdict(lambda: "x86_64"), "c3": defaultdict(lambda: "sapphirerapids"), "c3d": defaultdict(lambda: "zen2"), 
+ "c4": defaultdict(lambda: "emeraldrapids"), # Compute Optimized "c2": defaultdict(lambda: "cascadelake"), "c2d": defaultdict( @@ -359,6 +360,7 @@ def get_cpu_price(num_cores, instance_type, skus): "n2d": "N2D AMD Instance Core", "h3": "Compute optimized Core", "c3": "Compute optimized Core", + "c4": "Compute optimized Core", "c2": "Compute optimized Core", "c2d": "C2D AMD Instance Core", "c3d": "C3D AMD Instance Core", @@ -411,6 +413,7 @@ def get_mem_price(num_gb, instance_type, skus): "h3": "Compute optimized Ram", "c2d": "C2D AMD Instance Ram", "c3d": "C3D AMD Instance Ram", + "c4": "C4 Instance RAM", "t2d": "T2D AMD Instance Ram", "a2": "A2 Instance Ram", "m1": "Memory-optimized Instance Ram", diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/filesystem.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/filesystem.py index b77c454b12..f735107123 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/filesystem.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/filesystem.py @@ -89,7 +89,7 @@ def create_filesystem(fs: Filesystem) -> None: raise NotImplementedError("No support yet for this filesystem") -def _run_ghpc(target_dir: Path) -> None: +def _run_ghpc(target_dir: Path, cred_env: dict) -> None: ghpc_path = "/opt/gcluster/hpc-toolkit/ghpc" try: @@ -104,6 +104,7 @@ def _run_ghpc(target_dir: Path) -> None: stdout=log_out, stderr=log_err, check=True, + env=cred_env ) except subprocess.CalledProcessError as cpe: logger.error("ghpc exec failed", exc_info=cpe) @@ -116,10 +117,11 @@ def start_filesystem(fs: Filesystem) -> None: fs.cloud_state = "cm" fs.save() try: - _run_ghpc(_base_dir_for_fs(fs)) extra_env = { "GOOGLE_APPLICATION_CREDENTIALS": _get_credentials_file(fs) } + _run_ghpc(_base_dir_for_fs(fs), extra_env) + target_dir = _tf_dir_for_fs(fs) utils.run_terraform(target_dir, "init") utils.run_terraform(target_dir, "plan", extra_env=extra_env) diff --git a/community/front-end/ofe/website/ghpcfe/forms.py 
b/community/front-end/ofe/website/ghpcfe/forms.py index 5eedf75e2e..d6db9c4618 100644 --- a/community/front-end/ofe/website/ghpcfe/forms.py +++ b/community/front-end/ofe/website/ghpcfe/forms.py @@ -247,8 +247,10 @@ class Meta: "image", "dynamic_node_count", "static_node_count", + "reservation_name", "enable_placement", "enable_hyperthreads", + "enable_tier1_networking", "enable_node_reuse", "GPU_type", "GPU_per_node", @@ -303,6 +305,9 @@ def prep_dynamic_select(field, value): self.instance.GPU_type ) + # Mark 'reservation_name' as optional + self.fields["reservation_name"].widget.attrs.update({"placeholder": "Optional"}) + def clean(self): cleaned_data = super().clean() if cleaned_data["enable_placement"] and cleaned_data[ diff --git a/community/front-end/ofe/website/ghpcfe/models.py b/community/front-end/ofe/website/ghpcfe/models.py index 9e88b2e800..12ea18ff62 100644 --- a/community/front-end/ofe/website/ghpcfe/models.py +++ b/community/front-end/ofe/website/ghpcfe/models.py @@ -739,9 +739,9 @@ class Cluster(CloudResource): default="pd-standard", ) controller_disk_size = models.PositiveIntegerField( - validators=[MinValueValidator(10)], + validators=[MinValueValidator(120)], help_text="Boot disk size (in GB)", - default=50, + default=120, blank=True, ) num_login_nodes = models.PositiveIntegerField( @@ -762,9 +762,9 @@ class Cluster(CloudResource): login_node_disk_size = models.PositiveIntegerField( # login node disk must be large enough to hold the SlurmGCP # image: >=50GB - validators=[MinValueValidator(50)], + validators=[MinValueValidator(120)], help_text="Boot disk size (in GB)", - default=50, + default=120, blank=True, ) grafana_dashboard_url = models.CharField( @@ -919,6 +919,13 @@ class ClusterPartition(models.Model): enable_hyperthreads = models.BooleanField( default=False, help_text="Enable Hyperthreads (SMT)" ) + enable_tier1_networking = models.BooleanField( + default=False, + help_text=( + "Select Tier 1 networking (currently only valid for N2, N2D, 
C2," + "C2D, C3, C3d, M3 and Z3 VMs that have at least 30 vCPUs.)" + ), + ) enable_node_reuse = models.BooleanField( default=True, help_text=( @@ -937,7 +944,7 @@ class ClusterPartition(models.Model): default="pd-standard", ) boot_disk_size = models.PositiveIntegerField( - validators=[MinValueValidator(49)], + validators=[MinValueValidator(50)], help_text="Boot disk size (in GB)", default=50, blank=True, @@ -972,7 +979,11 @@ class ClusterPartition(models.Model): "Automatically delete additional disk when node is deleted?" ), ) - + reservation_name = models.CharField( + blank=True, + max_length=30, + help_text="Name of the reservation to use for VM resources" + ) def __str__(self): return self.name @@ -1572,9 +1583,9 @@ class Workbench(CloudResource): help_text="Type of storage to be required for notebook boot disk", ) boot_disk_capacity = models.PositiveIntegerField( - validators=[MinValueValidator(100)], + validators=[MinValueValidator(120)], help_text="Capacity (in GB) of the filesystem (min of 1024)", - default=100, + default=120, ) proxy_uri = models.CharField(max_length=150, blank=True, null=True) trusted_user = models.ForeignKey( diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cloudsql_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cloudsql_config.yaml.j2 index ddb1bfc147..78a3a249b6 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cloudsql_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cloudsql_config.yaml.j2 @@ -1,7 +1,7 @@ - source: community/modules/database/slurm-cloudsql-federation kind: terraform id: slurm-sql - use: [hpc_network, ps-connect] + use: [hpc_network] settings: sql_instance_name: sql-{{ cluster_id }} tier: "db-g1-small" diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 index c72cc31022..a22569d024 100644 
--- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 @@ -23,10 +23,6 @@ deployment_groups: subnetwork_name: {{ cluster.subnet.cloud_id }} id: hpc_network - - source: community/modules/network/private-service-access - id: ps-connect - use: [ hpc_network ] - {{ filesystems_yaml | safe }} - source: community/modules/project/service-account diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 index 41845aaddc..86ade8151c 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 @@ -14,8 +14,12 @@ id: {{ part_id }}-group use: settings: + bandwidth_tier: {% if part.enable_tier1_networking %}tier_1_enabled{% else %}platform_default{% endif %} enable_smt: {{ part.enable_hyperthreads }} machine_type: {{ part.machine_type }} + {% if part.reservation_name %} + reservation_name: {{ part.reservation_name }} + {% endif %} node_count_dynamic_max: {{ part.dynamic_node_count }} node_count_static: {{ part.static_node_count }} disk_size_gb: {{ part.boot_disk_size }} diff --git a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html index 559656f348..8423fbc3ee 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html +++ b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html @@ -295,23 +295,36 @@

{{ title }}

{% endblock %} {% block tailscript %} + + } - - @@ -615,9 +640,8 @@

{{ title }}

elements[i].style.display = 'none'; } - updateDiskAvailability() - updateMachineAvailability() - + updateDiskAvailability(); + updateMachineAvailability(); }); {% endif %} diff --git a/community/front-end/ofe/website/ghpcfe/views/clusters.py b/community/front-end/ofe/website/ghpcfe/views/clusters.py index ff9355d2fb..733fade339 100644 --- a/community/front-end/ofe/website/ghpcfe/views/clusters.py +++ b/community/front-end/ofe/website/ghpcfe/views/clusters.py @@ -470,6 +470,18 @@ def form_valid(self, form): try: for part in parts: part.vCPU_per_node = machine_info[part.machine_type]["vCPU"] // (1 if part.enable_hyperthreads else 2) + cpu_count = machine_info[part.machine_type]["vCPU"] + logger.info(f"{part.machine_type} CPU Count: {cpu_count}") + + # Tier1 networking validation + if part.enable_tier1_networking == True: + logger.info("User selected Tier1 networking, checking if nodes in partition are compatible.") + tier_1_supported_prefixes = ["n2-", "n2d-", "c2-", "c2d-", "c3-", "c3d-", "m3-", "z3-"] + is_tier_1_compatible = any(part.machine_type.startswith(prefix) for prefix in tier_1_supported_prefixes) + + if not(cpu_count >= 30 and is_tier_1_compatible): + raise ValidationError(f"VM type {part.machine_type} is not compatible with Tier 1 networking.") + # Validate GPU choice if part.GPU_type: try: diff --git a/community/modules/compute/gke-node-pool/versions.tf b/community/modules/compute/gke-node-pool/versions.tf index 5545a2aaf5..74b9c53a6f 100644 --- a/community/modules/compute/gke-node-pool/versions.tf +++ b/community/modules/compute/gke-node-pool/versions.tf @@ -26,6 +26,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.35.0" } } diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index a8a7eedab9..0ef940cea3 100644 --- 
a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.35.0" } } diff --git a/community/modules/compute/mig/README.md b/community/modules/compute/mig/README.md new file mode 100644 index 0000000000..f44279a49b --- /dev/null +++ b/community/modules/compute/mig/README.md @@ -0,0 +1,45 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [google](#requirement\_google) | > 5.0 | + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | > 5.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [google_compute_instance_group_manager.mig](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance_group_manager) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [base\_instance\_name](#input\_base\_instance\_name) | Base name for the instances in the MIG | `string` | `null` | no | +| [deployment\_name](#input\_deployment\_name) | Name of the deployment, will be used to name MIG if `var.name` is not provided | `string` | n/a | yes | +| [ghpc\_module\_id](#input\_ghpc\_module\_id) | Internal GHPC field, do not set this value | `string` | `null` | no | +| [labels](#input\_labels) | Labels to add to the MIG | `map(string)` | n/a | yes | +| [name](#input\_name) | Name of the MIG. 
If not provided, will be generated from `var.deployment_name` | `string` | `null` | no | +| [project\_id](#input\_project\_id) | Project in which the MIG will be created | `string` | n/a | yes | +| [target\_size](#input\_target\_size) | Target number of instances in the MIG | `number` | `0` | no | +| [versions](#input\_versions) | Application versions managed by this instance group. Each version deals with a specific instance template |
list(object({
name = string
instance_template = string
target_size = optional(object({
fixed = optional(number)
percent = optional(number)
}))
}))
| n/a | yes | +| [wait\_for\_instances](#input\_wait\_for\_instances) | Whether to wait for all instances to be created/updated before returning | `bool` | `false` | no | +| [zone](#input\_zone) | Compute Platform zone. Required, currently only zonal MIGs are supported | `string` | n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [self\_link](#output\_self\_link) | The URL of the created MIG | + diff --git a/community/modules/compute/mig/main.tf b/community/modules/compute/mig/main.tf new file mode 100644 index 0000000000..0e7cf186c2 --- /dev/null +++ b/community/modules/compute/mig/main.tf @@ -0,0 +1,85 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "mig", ghpc_role = "compute" }) +} + +locals { + sanitized_deploy_name = try(replace(lower(var.deployment_name), "/[^a-z0-9]/", ""), null) + sanitized_module_id = try(replace(lower(var.ghpc_module_id), "/[^a-z0-9]/", ""), null) + synth_mig_name = try("${local.sanitized_deploy_name}-${local.sanitized_module_id}", null) + + mig_name = var.name == null ? local.synth_mig_name : var.name + base_instance_name = var.base_instance_name == null ? 
local.mig_name : var.base_instance_name +} + +resource "google_compute_instance_group_manager" "mig" { + # REQUIRED + name = local.mig_name + base_instance_name = local.base_instance_name + zone = var.zone + + dynamic "version" { + for_each = var.versions + content { + name = version.value.name + instance_template = version.value.instance_template + dynamic "target_size" { + for_each = version.value.target_size != null ? [version.value.target_size] : [] + content { + fixed = target_size.value.fixed + percent = target_size.value.percent + } + } + } + } + + # OPTIONAL + project = var.project_id + target_size = var.target_size + wait_for_instances = var.wait_for_instances + + all_instances_config { + # TODO: validate that template metadata not getting wiped out + # TODO: validate that template labels not getting wiped out + labels = local.labels + } + + # OMITTED: + # * description + # * named_port + # * list_managed_instances_results + # * target_pools - specific for Load Balancers usage + # * wait_for_instances_status + # * auto_healing_policies + # * stateful_disk + # * stateful_internal_ip + # * update_policy + # * params + + + lifecycle { + precondition { + condition = local.mig_name != null + error_message = "Could not come up with a name for the MIG, specify `var.name`" + } + + precondition { + condition = local.base_instance_name != null + error_message = "Could not come up with a base_instance_name, specify `var.base_instance_name`" + } + } +} diff --git a/community/modules/compute/mig/metadata.yaml b/community/modules/compute/mig/metadata.yaml new file mode 100644 index 0000000000..97a4fa9a89 --- /dev/null +++ b/community/modules/compute/mig/metadata.yaml @@ -0,0 +1,21 @@ +# Copyright 2023 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +spec: + requirements: + services: + - compute.googleapis.com +ghpc: + inject_module_id: ghpc_module_id diff --git a/community/modules/compute/mig/outputs.tf b/community/modules/compute/mig/outputs.tf new file mode 100644 index 0000000000..23c66a3535 --- /dev/null +++ b/community/modules/compute/mig/outputs.tf @@ -0,0 +1,18 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +output "self_link" { + description = "The URL of the created MIG" + value = google_compute_instance_group_manager.mig.self_link +} diff --git a/community/modules/compute/mig/variables.tf b/community/modules/compute/mig/variables.tf new file mode 100644 index 0000000000..b6c3c0e78a --- /dev/null +++ b/community/modules/compute/mig/variables.tf @@ -0,0 +1,86 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +variable "project_id" { + description = "Project in which the MIG will be created" + type = string +} + +variable "deployment_name" { + description = "Name of the deployment, will be used to name MIG if `var.name` is not provided" + type = string +} + +variable "labels" { + description = "Labels to add to the MIG" + type = map(string) +} + +variable "zone" { + description = "Compute Platform zone. Required, currently only zonal MIGs are supported" + type = string +} + + +variable "versions" { + description = <<-EOD + Application versions managed by this instance group. Each version deals with a specific instance template + EOD + type = list(object({ + name = string + instance_template = string + target_size = optional(object({ + fixed = optional(number) + percent = optional(number) + })) + })) + + validation { + condition = length(var.versions) > 0 + error_message = "At least one version must be provided" + } + +} + + +variable "ghpc_module_id" { + description = "Internal GHPC field, do not set this value" + type = string + default = null +} + +variable "name" { + description = "Name of the MIG. 
If not provided, will be generated from `var.deployment_name`" + type = string + default = null +} + +variable "base_instance_name" { + description = "Base name for the instances in the MIG" + type = string + default = null +} + + +variable "target_size" { + description = "Target number of instances in the MIG" + type = number + default = 0 +} + +variable "wait_for_instances" { + description = "Whether to wait for all instances to be created/updated before returning" + type = bool + default = false +} diff --git a/community/modules/compute/mig/versions.tf b/community/modules/compute/mig/versions.tf new file mode 100644 index 0000000000..43edb9d0f6 --- /dev/null +++ b/community/modules/compute/mig/versions.tf @@ -0,0 +1,27 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +terraform { + required_version = ">= 1.3" + + required_providers { + google = { + source = "hashicorp/google" + version = "> 5.0" + } + } + provider_meta "google" { + module_name = "blueprints/terraform/hpc-toolkit:mig/v1.35.0" + } +} diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md index 3c69f0f328..865b464993 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md @@ -128,7 +128,7 @@ No modules. 
| [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no | -| [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-standard"` | no | +| [disk\_type](#input\_disk\_type) | Boot disk type. | `string` | `"pd-standard"` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf index 9725b2eda0..08eb237433 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf @@ -166,14 +166,9 @@ variable "tags" { } variable "disk_type" { - description = "Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme." + description = "Boot disk type." type = string default = "pd-standard" - - validation { - condition = contains(["pd-ssd", "pd-standard", "pd-balanced", "pd-extreme"], var.disk_type) - error_message = "Variable disk_type must be one of pd-ssd, pd-standard, pd-balanced, or pd-extreme." 
- } } variable "disk_size_gb" { diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index de050c3c2a..6af4d4d824 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.35.0" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index 23870b8b20..26ff8a5065 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.35.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index 08d6edd5db..019509df88 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.35.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md 
index ef612cf1b2..cdfc2f4e88 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md @@ -175,7 +175,7 @@ No modules. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-5-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | -| [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | +| [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for compute nodes. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | | [maintenance\_interval](#input\_maintenance\_interval) | Sets the maintenance interval for instances in this nodeset.
See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#maintenance_interval. | `string` | `null` | no | @@ -190,7 +190,7 @@ No modules. | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | -| [reservation\_name](#input\_reservation\_name) | Name of the reservation to use for VM resources
- Must be a "SPECIFIC" reservation
- Set to empty string if using no reservation or automatically-consumed reservations | `string` | `""` | no | +| [reservation\_name](#input\_reservation\_name) | Name of the reservation to use for VM resources, should be in one of the following formats:
- projects/PROJECT\_ID/reservations/RESERVATION\_NAME
- RESERVATION\_NAME

Must be a "SPECIFIC" reservation
Set to empty string if using no reservation or automatically-consumed reservations | `string` | `""` | no | | [service\_account](#input\_service\_account) | DEPRECATED: Use `service_account_email` and `service_account_scopes` instead. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to attach to the compute instances. | `string` | `null` | no | | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to attach to the compute instances. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | @@ -201,7 +201,7 @@ No modules. | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | | [zone](#input\_zone) | Zone in which to create compute VMs. Additional zones in the same region can be specified in var.zones. | `string` | n/a | yes | | [zone\_target\_shape](#input\_zone\_target\_shape) | Strategy for distributing VMs across zones in a region.
ANY
GCE picks zones for creating VM instances to fulfill the requested number of VMs
within present resource constraints and to maximize utilization of unused zonal
reservations.
ANY\_SINGLE\_ZONE (default)
GCE always selects a single zone for all the VMs, optimizing for resource quotas,
available reservations and general capacity.
BALANCED
GCE prioritizes acquisition of resources, scheduling VMs in zones where resources
are available while distributing VMs as evenly as possible across allowed zones
to minimize the impact of zonal failure. | `string` | `"ANY_SINGLE_ZONE"` | no | -| [zones](#input\_zones) | Additional nodes in which to allow creation of partition nodes. Google Cloud
will find zone based on availability, quota and reservations. | `set(string)` | `[]` | no | +| [zones](#input\_zones) | Additional zones in which to allow creation of partition nodes. Google Cloud
will find zone based on availability, quota and reservations.
Should not be set if SPECIFIC reservation is used. | `set(string)` | `[]` | no | ## Outputs diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf index 57724e0ed9..be62e6da1f 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/main.tf @@ -69,11 +69,10 @@ locals { enable_shielded_vm = var.enable_shielded_vm gpu = one(local.guest_accelerator) - instance_template = var.instance_template - labels = local.labels - machine_type = var.machine_type - metadata = var.metadata - min_cpu_platform = var.min_cpu_platform + labels = local.labels + machine_type = var.machine_type + metadata = var.metadata + min_cpu_platform = var.min_cpu_platform on_host_maintenance = var.on_host_maintenance preemptible = var.preemptible @@ -89,7 +88,7 @@ locals { tags = var.tags spot = var.enable_spot_vm termination_action = try(var.spot_instance_config.termination_action, null) - reservation_name = var.reservation_name + reservation_name = local.reservation_name maintenance_interval = var.maintenance_interval zones = toset(concat([var.zone], tolist(var.zones))) @@ -103,17 +102,33 @@ data "google_compute_default_service_account" "default" { project = var.project_id } +locals { + res_name_split = split("/", var.reservation_name) + reservation = var.reservation_name == "" ? null : ( + length(local.res_name_split) == 4 ? { + project : local.res_name_split[1], + name : local.res_name_split[3] + } : { + project : var.project_id, + name : var.reservation_name + } + ) + + reservation_name = local.reservation == null ? "" : "projects/${local.reservation.project}/reservations/${local.reservation.name}" +} + # tflint-ignore: terraform_unused_declarations data "google_compute_reservation" "reservation" { - count = var.reservation_name != "" ? 
1 : 0 - name = var.reservation_name - project = var.project_id + count = local.reservation != null ? 1 : 0 + + name = local.reservation.name + project = local.reservation.project zone = var.zone lifecycle { postcondition { condition = self.self_link != null - error_message = "couldn't find the reservation ${var.reservation_name}}" + error_message = "Couldn't find the reservation ${var.reservation_name}" } postcondition { @@ -125,5 +140,8 @@ data "google_compute_reservation" "reservation" { it in the blueprint. EOT } + + # TODO: wait for https://github.com/hashicorp/terraform-provider-google/issues/18248 + # Add a validation that if reservation.project != var.project_id it should be a shared reservation } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf index 64ac5c6ff3..dc2f3b0c40 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf @@ -33,6 +33,13 @@ output "nodeset" { EOD } + precondition { + condition = var.reservation_name == "" || length(var.zones) == 0 + error_message = <<-EOD + If a reservation is specified, `var.zones` should be empty. + EOD + } + precondition { condition = !var.enable_placement || var.node_count_static == 0 || var.node_count_dynamic_max == 0 error_message = "Cannot use placement with static and auto-scaling nodes in the same node set." diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 11649a8057..15b601727c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -44,18 +44,14 @@ variable "node_count_dynamic_max" { } ## VM Definition -variable "instance_template" { - description = <<-EOD - Self link to a custom instance template. 
If set, other VM definition - variables such as machine_type and instance_image will be ignored in favor - of the provided instance template. - - For more information on creating custom images for the instance template - that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section - in docs/vm-images.md. - EOD +variable "instance_template" { # tflint-ignore: terraform_unused_declarations + description = "DEPRECATED: Instance template can not be specified for compute nodes." type = string default = null + validation { + condition = var.instance_template == null + error_message = "DEPRECATED: Instance template can not be specified for compute nodes." + } } variable "machine_type" { @@ -359,8 +355,9 @@ variable "zone" { variable "zones" { description = <<-EOD - Additional nodes in which to allow creation of partition nodes. Google Cloud + Additional zones in which to allow creation of partition nodes. Google Cloud will find zone based on availability, quota and reservations. + Should not be set if SPECIFIC reservation is used. 
EOD type = set(string) default = [] @@ -437,13 +434,21 @@ variable "access_config" { variable "reservation_name" { description = <<-EOD - Name of the reservation to use for VM resources - - Must be a "SPECIFIC" reservation - - Set to empty string if using no reservation or automatically-consumed reservations + Name of the reservation to use for VM resources, should be in one of the following formats: + - projects/PROJECT_ID/reservations/RESERVATION_NAME + - RESERVATION_NAME + + Must be a "SPECIFIC" reservation + Set to empty string if using no reservation or automatically-consumed reservations EOD type = string default = "" nullable = false + + validation { + condition = var.reservation_name == "" || length(regexall("^projects/[a-z0-9-]+/reservations/[a-z0-9-]+$", var.reservation_name)) > 0 || length(regexall("^[a-z0-9-]+$", var.reservation_name)) > 0 + error_message = "Reservation name must be in the format 'projects/PROJECT_ID/reservations/RESERVATION_NAME' or 'RESERVATION_NAME'." + } } variable "maintenance_interval" { diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 00b12c4812..b12cea4361 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.35.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index 52206d006e..1e766d9226 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 
1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.35.0" } } diff --git a/community/modules/database/slurm-cloudsql-federation/README.md b/community/modules/database/slurm-cloudsql-federation/README.md index 5ab40cf0ee..e89f6764b6 100644 --- a/community/modules/database/slurm-cloudsql-federation/README.md +++ b/community/modules/database/slurm-cloudsql-federation/README.md @@ -73,6 +73,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [authorized\_networks](#input\_authorized\_networks) | IP address ranges as authorized networks of the Cloud SQL for MySQL instances | `list(string)` | `[]` | no | | [database\_version](#input\_database\_version) | The version of the database to be created. | `string` | `"MYSQL_5_7"` | no | | [deletion\_protection](#input\_deletion\_protection) | Whether or not to allow Terraform to destroy the instance. 
| `string` | `false` | no | | [deployment\_name](#input\_deployment\_name) | The name of the current deployment | `string` | n/a | yes | diff --git a/community/modules/database/slurm-cloudsql-federation/main.tf b/community/modules/database/slurm-cloudsql-federation/main.tf index 6b02dff910..09de939b72 100644 --- a/community/modules/database/slurm-cloudsql-federation/main.tf +++ b/community/modules/database/slurm-cloudsql-federation/main.tf @@ -49,6 +49,15 @@ resource "google_sql_database_instance" "instance" { ipv4_enabled = false private_network = var.network_id enable_private_path_for_google_cloud_services = true + + dynamic "authorized_networks" { + for_each = var.authorized_networks + iterator = ip_range + + content { + value = ip_range.value + } + } } } } diff --git a/community/modules/database/slurm-cloudsql-federation/variables.tf b/community/modules/database/slurm-cloudsql-federation/variables.tf index 5f1cbc7588..701f15d1ea 100644 --- a/community/modules/database/slurm-cloudsql-federation/variables.tf +++ b/community/modules/database/slurm-cloudsql-federation/variables.tf @@ -14,6 +14,13 @@ * limitations under the License. */ +variable "authorized_networks" { + description = "IP address ranges as authorized networks of the Cloud SQL for MySQL instances" + type = list(string) + default = [] + nullable = false +} + variable "database_version" { description = "The version of the database to be created." 
type = string diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index cba2503103..967e242be8 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.35.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.35.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index 88bc353e05..5e1f3b153f 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.35.0" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/gke-persistent-volume/versions.tf b/community/modules/file-system/gke-persistent-volume/versions.tf index 3b906c9b1c..a3cb767106 100644 --- a/community/modules/file-system/gke-persistent-volume/versions.tf +++ b/community/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.35.0" } } diff --git a/community/modules/file-system/nfs-server/versions.tf 
b/community/modules/file-system/nfs-server/versions.tf index 45b5bd1b7e..4cbc4f660e 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.35.0" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index f6aeca848a..e923928d65 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.35.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.35.0" } } diff --git a/community/modules/network/private-service-access/README.md b/community/modules/network/private-service-access/README.md index ada4fc3609..50fbd42235 100644 --- a/community/modules/network/private-service-access/README.md +++ b/community/modules/network/private-service-access/README.md @@ -78,6 +78,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [address](#input\_address) | The IP address or beginning of the address range allocated for the private service access. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to supporting resources. Key-value pairs. | `map(string)` | n/a | yes | | [network\_id](#input\_network\_id) | The ID of the GCE VPC network to configure private service Access.:
`projects//global/networks/`" | `string` | n/a | yes | | [prefix\_length](#input\_prefix\_length) | The prefix length of the IP range allocated for the private service access. | `number` | `16` | no | diff --git a/community/modules/network/private-service-access/main.tf b/community/modules/network/private-service-access/main.tf index b114a28a8d..706fe1cdf7 100644 --- a/community/modules/network/private-service-access/main.tf +++ b/community/modules/network/private-service-access/main.tf @@ -31,6 +31,7 @@ resource "google_compute_global_address" "private_ip_alloc" { network = var.network_id prefix_length = var.prefix_length labels = local.labels + address = var.address } resource "google_service_networking_connection" "private_vpc_connection" { diff --git a/community/modules/network/private-service-access/variables.tf b/community/modules/network/private-service-access/variables.tf index adc00cd3f9..e600463e3e 100644 --- a/community/modules/network/private-service-access/variables.tf +++ b/community/modules/network/private-service-access/variables.tf @@ -12,6 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +variable "address" { + description = "The IP address or beginning of the address range allocated for the private service access." 
+ type = string + default = null +} + variable "network_id" { description = <<-EOT The ID of the GCE VPC network to configure private service Access.: diff --git a/community/modules/network/private-service-access/versions.tf b/community/modules/network/private-service-access/versions.tf index ce2adbb1f0..823bbed319 100644 --- a/community/modules/network/private-service-access/versions.tf +++ b/community/modules/network/private-service-access/versions.tf @@ -30,11 +30,11 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.35.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.35.0" } required_version = ">= 1.2" diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index af60b0f69e..2e8b42f18a 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.35.0" } required_version = ">= 0.14.0" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index bcb56c691d..9a56ed6fff 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.35.0" } provider_meta "google-beta" { - module_name = 
"blueprints/terraform/hpc-toolkit:bigquery-sub/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.35.0" } required_version = ">= 1.0" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index e1e2a80c17..f76c883966 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.35.0" } } diff --git a/community/modules/scheduler/gke-cluster/versions.tf b/community/modules/scheduler/gke-cluster/versions.tf index ea9576d063..b1ddfa60a1 100644 --- a/community/modules/scheduler/gke-cluster/versions.tf +++ b/community/modules/scheduler/gke-cluster/versions.tf @@ -30,6 +30,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.35.0" } } diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index 33c970785b..b23481a1d9 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.35.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index c17968c778..3b75f74bf9 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta 
"google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.35.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 13f6601d5d..e2f2645a36 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.35.0" } required_version = ">= 1.3.0" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 57bf05e715..d23146ac2e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -246,7 +246,7 @@ limitations under the License. | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | -| [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-ssd"` | no | +| [disk\_type](#input\_disk\_type) | Boot disk type. | `string` | `"pd-ssd"` | no | | [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enable loading of cluster job usage into big query. 
| `bool` | `false` | no | | [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | | [enable\_cleanup\_subscriptions](#input\_enable\_cleanup\_subscriptions) | Enables automatic cleanup of pub/sub subscriptions managed by this module, when
cluster is destroyed.

NOTE: Requires Python and pip packages listed at the following link:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/3979e81fc5e4f021b5533a23baa474490f4f3614/scripts/requirements.txt

*WARNING*: Toggling this may temporarily impact var.enable\_reconfigure behavior. | `bool` | `false` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index e2b9bad03a..86e0e69aab 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -175,13 +175,8 @@ variable "disable_smt" { variable "disk_type" { type = string - description = "Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme." + description = "Boot disk type." default = "pd-ssd" - - validation { - condition = contains(["pd-ssd", "pd-standard", "pd-balanced", "pd-extreme"], var.disk_type) - error_message = "Variable disk_type must be one of pd-ssd, pd-standard, pd-balanced, or pd-extreme." - } } variable "disk_size_gb" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index 0b1fdd915b..3e1345f635 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.35.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index d59b3c33f4..38d2fea7c9 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -106,7 +106,7 @@ limitations under the License. 
| [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | -| [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-standard"` | no | +| [disk\_type](#input\_disk\_type) | Boot disk type. | `string` | `"pd-standard"` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_reconfigure](#input\_enable\_reconfigure) | Enables automatic Slurm reconfigure on when Slurm configuration changes (e.g.
slurm.conf.tpl, partition details).

NOTE: Requires Google Pub/Sub API. | `bool` | `false` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf index ab5c5b15e1..90be6b467d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf @@ -361,13 +361,8 @@ variable "source_image" { variable "disk_type" { type = string - description = "Boot disk type, can be either pd-ssd, pd-standard, pd-balanced, or pd-extreme." + description = "Boot disk type." default = "pd-standard" - - validation { - condition = contains(["pd-ssd", "pd-standard", "pd-balanced", "pd-extreme"], var.disk_type) - error_message = "Variable disk_type must be one of pd-ssd, pd-standard, pd-balanced, or pd-extreme." - } } variable "disk_size_gb" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index 8316191f78..f22f4c6e85 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.35.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index a41f4c732e..e1e2073931 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -11,9 +11,9 @@ The [user guide][slurm-ug] provides detailed instructions on customizing and enhancing the Slurm on GCP cluster as well as recommendations on configuring the 
controller for optimal performance at different scales. -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.6 -[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.6/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.6/terraform/slurm_cluster/modules/slurm_instance_template +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.8 +[slurm\_controller\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.8/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.8/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions @@ -117,6 +117,32 @@ gcloud beta compute resource-policies list \ > If a zone lacks capacity, using a lower `max-distance` value (such as 1) is > more likely to cause VMs creation to fail. +## TreeWidth and Node Communication + +Slurm uses a fan out mechanism to communicate with large groups of nodes. The shape +of this fan out tree is determined by the +[TreeWidth](https://slurm.schedmd.com/slurm.conf.html#OPT_TreeWidth) +configuration variable. + +In the cloud, this fan out mechanism can become unstable when nodes restart with +new IP addresses. You can enforce that all nodes communicate directly with the +controller by setting TreeWidth to a value >= the node count of the largest partition. + +If the largest partition was 200 nodes, configure the blueprint as follows: + +```yaml + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller + ... + settings: + cloud_parameters: + tree_width: 200 +``` + +The default has been set to 128.
Values above this have not been fully tested +and may cause congestion on the controller. A more scalable solution is under +way. + ## Hybrid Slurm Clusters For more information on how to configure an on premise slurm cluster with hybrid cloud partitions, see the [schedmd-slurm-gcp-v5-hybrid] module and our @@ -169,14 +195,14 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.6 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.6 | -| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.5.6 | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.6 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.6 | -| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.5.6 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.6 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.5.6 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | 
github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.8 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.8 | +| [slurm\_files](#module\_slurm\_files) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files | 6.5.8 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.5.8 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.8 | +| [slurm\_nodeset](#module\_slurm\_nodeset) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset | 6.5.8 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.5.8 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.5.8 | ## Resources @@ -202,7 +228,7 @@ limitations under the License. | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket.
Ignored when 'create\_bucket' is true. | `string` | `null` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. |
object({
no_comma_params = optional(bool, false)
resume_rate = optional(number, 0)
resume_timeout = optional(number, 300)
suspend_rate = optional(number, 0)
suspend_timeout = optional(number, 300)
})
| `{}` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number, 128)
})
| `{}` | no | | [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access. |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
})
| `null` | no | | [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `"# no-op"` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | @@ -234,17 +260,17 @@ limitations under the License. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-5-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | -| [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | +| [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for controller. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | | [login\_network\_storage](#input\_login\_network\_storage) | An array of network attached storage mounts to be configured on all login nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | -| [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. |
list(object({
name_prefix = string
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string, "n1-standard-1")
enable_confidential_vm = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
num_instances = optional(number, 1)
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
static_ips = optional(list(string), [])
subnetwork = string
spot = optional(bool, false)
tags = optional(list(string), [])
zone = optional(string)
termination_action = optional(string)
}))
| `[]` | no | +| [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. |
list(object({
name_prefix = string
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
additional_networks = optional(list(object({
access_config = optional(list(object({
nat_ip = string
network_tier = string
})), [])
alias_ip_range = optional(list(object({
ip_cidr_range = string
subnetwork_range_name = string
})), [])
ipv6_access_config = optional(list(object({
network_tier = string
})), [])
network = optional(string)
network_ip = optional(string, "")
nic_type = optional(string)
queue_count = optional(number)
stack_type = optional(string)
subnetwork = optional(string)
subnetwork_project = optional(string)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string, "n1-standard-1")
enable_confidential_vm = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
num_instances = optional(number, 1)
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
static_ips = optional(list(string), [])
subnetwork = string
spot = optional(bool, false)
tags = optional(list(string), [])
zone = optional(string)
termination_action = optional(string)
}))
| `[]` | no | | [login\_startup\_script](#input\_login\_startup\_script) | Startup script used by the login VMs. | `string` | `"# no-op"` | no | | [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"c2-standard-4"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
instance_template = optional(string)
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
zones = optional(list(string), [])
zone_target_shape = optional(string, "ANY_SINGLE_ZONE")
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])
}))
| `[]` | no | | [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | | [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index 28b31944d6..4d8896078a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -25,8 +25,6 @@ locals { } ] - have_template = var.instance_template != null && var.instance_template != "" - service_account_email = coalesce(var.service_account_email, data.google_compute_default_service_account.default.email) # can't rely on `email=null` as it's used to instantiate `cloudsql_secret_accessor` @@ -38,8 +36,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.6" - count = local.have_template ? 0 : 1 + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.8" project_id = var.project_id region = var.region @@ -95,12 +92,12 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.6" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.8" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false hostname = "${local.slurm_cluster_name}-controller" - instance_template = local.have_template ? 
var.instance_template : module.slurm_controller_template[0].self_link + instance_template = module.slurm_controller_template.self_link project_id = var.project_id region = var.region diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index ab18afeb2c..2f52d3617c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,12 +14,9 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.6" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.8" - for_each = { - for x in var.login_nodes : x.name_prefix => x - if(x.instance_template == null || x.instance_template == "") - } + for_each = { for x in var.login_nodes : x.name_prefix => x } project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name @@ -28,6 +25,7 @@ module "slurm_login_template" { name_prefix = each.value.name_prefix additional_disks = each.value.additional_disks + additional_networks = each.value.additional_networks bandwidth_tier = each.value.bandwidth_tier can_ip_forward = each.value.can_ip_forward disable_smt = each.value.disable_smt @@ -59,7 +57,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.6" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.5.8" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config @@ -70,13 +68,9 @@ module "slurm_login_instance" { project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name - instance_template = ( - 
each.value.instance_template != null && each.value.instance_template != "" - ? each.value.instance_template - : module.slurm_login_template[each.key].self_link - ) - labels = merge(each.value.labels, local.files_cs_labels) - num_instances = each.value.num_instances + instance_template = module.slurm_login_template[each.key].self_link + labels = merge(each.value.labels, local.files_cs_labels) + num_instances = each.value.num_instances region = each.value.region static_ips = each.value.static_ips diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf index 02c7d30a67..3e5164ec2c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/partition.tf @@ -26,7 +26,7 @@ locals { # NODESET # TODO: remove dependency on slurm-gcp repo, move to local nodeset module module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.6" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.5.8" for_each = local.nodeset_map project_id = var.project_id @@ -65,7 +65,7 @@ module "slurm_nodeset_template" { } module "slurm_nodeset" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.5.6" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset?ref=6.5.8" for_each = local.nodeset_map instance_template_self_link = module.slurm_nodeset_template[each.key].self_link @@ -85,7 +85,7 @@ module "slurm_nodeset" { # NODESET TPU module "slurm_nodeset_tpu" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.5.6" + source = 
"github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu?ref=6.5.8" for_each = local.nodeset_tpu_map project_id = var.project_id diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf index 6e0ad65410..25399942ae 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/slurm_files.tf @@ -44,13 +44,12 @@ module "bucket" { # BUCKET IAMs locals { - controller_sa = toset(flatten([for x in module.slurm_controller_template : x.service_account])) compute_sa = toset(flatten([for x in module.slurm_nodeset_template : x.service_account])) compute_tpu_sa = toset(flatten([for x in module.slurm_nodeset_tpu : x.service_account])) login_sa = toset(flatten([for x in module.slurm_login_template : x.service_account])) viewers = toset(flatten([ - formatlist("serviceAccount:%s", [for x in local.controller_sa : x.email]), + "serviceAccount:${module.slurm_controller_template.service_account.email}", formatlist("serviceAccount:%s", [for x in local.compute_sa : x.email]), formatlist("serviceAccount:%s", [for x in local.compute_tpu_sa : x.email]), formatlist("serviceAccount:%s", [for x in local.login_sa : x.email]), @@ -88,7 +87,7 @@ locals { } module "slurm_files" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.5.6" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_files?ref=6.5.8" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index a91a80f387..0e7ce71e7e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -108,6 +108,26 @@ variable "login_nodes" { auto_delete = optional(bool, true) boot = optional(bool, false) })), []) + additional_networks = optional(list(object({ + access_config = optional(list(object({ + nat_ip = string + network_tier = string + })), []) + alias_ip_range = optional(list(object({ + ip_cidr_range = string + subnetwork_range_name = string + })), []) + ipv6_access_config = optional(list(object({ + network_tier = string + })), []) + network = optional(string) + network_ip = optional(string, "") + nic_type = optional(string) + queue_count = optional(number) + stack_type = optional(string) + subnetwork = optional(string) + subnetwork_project = optional(string) + })), []) bandwidth_tier = optional(string, "platform_default") can_ip_forward = optional(bool, false) disable_smt = optional(bool, false) @@ -122,7 +142,6 @@ variable "login_nodes" { count = number type = string })) - instance_template = optional(string) labels = optional(map(string), {}) machine_type = optional(string) metadata = optional(map(string), {}) @@ -191,7 +210,6 @@ variable "nodeset" { count = number type = string })) - instance_template = optional(string) labels = optional(map(string), {}) machine_type = optional(string) maintenance_interval = optional(string) @@ -379,13 +397,15 @@ EOD } variable "cloud_parameters" { - description = "cloud.conf options." + description = "cloud.conf options. 
Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters)" type = object({ - no_comma_params = optional(bool, false) - resume_rate = optional(number, 0) - resume_timeout = optional(number, 300) - suspend_rate = optional(number, 0) - suspend_timeout = optional(number, 300) + no_comma_params = optional(bool) + resume_rate = optional(number) + resume_timeout = optional(number) + suspend_rate = optional(number) + suspend_timeout = optional(number) + topology_plugin = optional(string) + tree_width = optional(number, 128) }) default = {} } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf index 9826378e13..a0a477113d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables_controller_instance.tf @@ -243,18 +243,14 @@ variable "service_account" { # tflint-ignore: terraform_unused_declarations } } -variable "instance_template" { - description = <<-EOD - Self link to a custom instance template. If set, other VM definition - variables such as machine_type and instance_image will be ignored in favor - of the provided instance template. - - For more information on creating custom images for the instance template - that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section - in docs/vm-images.md. - EOD +variable "instance_template" { # tflint-ignore: terraform_unused_declarations + description = "DEPRECATED: Instance template can not be specified for controller." type = string default = null + validation { + condition = var.instance_template == null + error_message = "DEPRECATED: Instance template can not be specified for controller." 
+ } } variable "instance_image" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index a7c24f24f1..f719049f64 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -30,6 +30,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.35.0" } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md index db3a0c6e56..1ffeadcb70 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). -[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.6 -[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.6/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.6/terraform/slurm_cluster/modules/slurm_instance_template +[slurm-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.8 +[slurm\_login\_instance]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.8/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.8/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -53,7 +53,7 @@ modules. 
For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. [slurm-on-gcp]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/7 -[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.6#slurm-on-google-cloud-platform +[slurm-gcp-readme]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.8#slurm-on-google-cloud-platform ## Requirements @@ -85,6 +85,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
access_config = optional(list(object({
nat_ip = string
network_tier = string
})), [])
alias_ip_range = optional(list(object({
ip_cidr_range = string
subnetwork_range_name = string
})), [])
ipv6_access_config = optional(list(object({
network_tier = string
})), [])
network = optional(string)
network_ip = optional(string, "")
nic_type = optional(string)
queue_count = optional(number)
stack_type = optional(string)
subnetwork = optional(string)
subnetwork_project = optional(string)
}))
| `[]` | no | | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [disable\_login\_public\_ips](#input\_disable\_login\_public\_ips) | DEPRECATED: Use `enable_login_public_ips` instead. | `bool` | `null` | no | @@ -101,7 +102,7 @@ No modules. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-5-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | | [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | -| [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | +| [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for login nodes. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"c2-standard-4"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf index 5b80ae7924..ab3e45fe9c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/main.tf @@ -48,12 +48,13 @@ locals { login_node = { - name_prefix = local.name_prefix - disk_auto_delete = var.disk_auto_delete - disk_labels = merge(var.disk_labels, local.labels) - disk_size_gb = var.disk_size_gb - disk_type = var.disk_type - additional_disks = local.additional_disks + name_prefix = local.name_prefix + disk_auto_delete = var.disk_auto_delete + disk_labels = merge(var.disk_labels, local.labels) + disk_size_gb = var.disk_size_gb + disk_type = var.disk_type + additional_disks = local.additional_disks + additional_networks = var.additional_networks can_ip_forward = var.can_ip_forward disable_smt = !var.enable_smt @@ -65,7 +66,6 @@ locals { shielded_instance_config = var.shielded_instance_config gpu = one(local.guest_accelerator) - instance_template = var.instance_template labels = local.labels machine_type = var.machine_type metadata = var.metadata diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf index 28c34ebf48..2bb051d5a8 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/variables.tf 
@@ -85,6 +85,32 @@ variable "additional_disks" { default = [] } +variable "additional_networks" { + description = "Additional network interface details for GCE, if any." + default = [] + type = list(object({ + access_config = optional(list(object({ + nat_ip = string + network_tier = string + })), []) + alias_ip_range = optional(list(object({ + ip_cidr_range = string + subnetwork_range_name = string + })), []) + ipv6_access_config = optional(list(object({ + network_tier = string + })), []) + network = optional(string) + network_ip = optional(string, "") + nic_type = optional(string) + queue_count = optional(number) + stack_type = optional(string) + subnetwork = optional(string) + subnetwork_project = optional(string) + })) + nullable = false +} + variable "enable_smt" { type = bool description = "Enables Simultaneous Multi-Threading (SMT) on instance." @@ -275,18 +301,14 @@ variable "service_account" { # tflint-ignore: terraform_unused_declarations } } -variable "instance_template" { - description = <<-EOD - Self link to a custom instance template. If set, other VM definition - variables such as machine_type and instance_image will be ignored in favor - of the provided instance template. - - For more information on creating custom images for the instance template - that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section - in docs/vm-images.md. - EOD +variable "instance_template" { # tflint-ignore: terraform_unused_declarations + description = "DEPRECATED: Instance template can not be specified for login nodes." type = string default = null + validation { + condition = var.instance_template == null + error_message = "DEPRECATED: Instance template can not be specified for login nodes." 
+ } } variable "instance_image" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf index 02df81651b..80d34d7b87 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.35.0" } } diff --git a/community/modules/scripts/kubernetes-operations/README.md b/community/modules/scripts/kubernetes-operations/README.md deleted file mode 100644 index 4b8644625e..0000000000 --- a/community/modules/scripts/kubernetes-operations/README.md +++ /dev/null @@ -1,43 +0,0 @@ -## Description - -This module performs pre-defined operations on Kubernetes resources that would -otherwise be executed using `kubectl`. - -The `kubernetes-operations` module is owned and maintained by the -[ai-infra-cluster-provisioning] Github project. Full documentation of the module -interface can be found in that project on the [`kubernetes-operations`] page. - -### Examples - -The following example will use the [`kubernetes-operations`] module to create a -DaemonSet that will install Nvidia drivers on GPU nodes. 
- -```yaml - - id: gke_cluster - source: community/modules/scheduler/gke-cluster - use: [network1] - settings: - enable_private_endpoint: false # Allows for access from authorized public IPs - master_authorized_networks: - - display_name: deployment-machine - cidr_block: /32 - outputs: [instructions] - - - id: install-nvidia-drivers - source: github.com/GoogleCloudPlatform/ai-infra-cluster-provisioning//aiinfra-cluster/modules/kubernetes-operations?ref=v0.6.0 - use: [gke_cluster] - settings: - install_nvidia_driver: true -``` - -> **Note**: The IP address of the machine calling Terraform must be listed as a -> `master_authorized_network` otherwise the [`kubernetes-operations`] module -> will not be able to communicate with the cluster. - -### Version Compatibility - -Only version [v0.6.0] of this module has been tested for compatibility with the HPC Toolkit. Older versions will not work and newer versions are untested. - -[v0.6.0]: https://github.com/GoogleCloudPlatform/ai-infra-cluster-provisioning/releases/tag/v0.6.0 -[`kubernetes-operations`]: https://github.com/GoogleCloudPlatform/ai-infra-cluster-provisioning/tree/v0.6.0/aiinfra-cluster/modules/kubernetes-operations -[ai-infra-cluster-provisioning]: https://github.com/GoogleCloudPlatform/ai-infra-cluster-provisioning/tree/v0.6.0 diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index 2f65e194ab..6571b5e8b6 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.35.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index e5d127385c..3c38758c42 
100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.35.0" } required_version = ">= 0.14.0" diff --git a/examples/README.md b/examples/README.md index e138c5f13b..e1aba7e807 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1382,10 +1382,29 @@ Toolkit. It includes: > work. See note below. * Creation of a regional GKE cluster. -* Creation of an autoscaling GKE node pool with `a2` machines each with 8 - attached A100 GPUs. -* Configuration of the cluster using the [`kubernetes-operations`] module to - install nvidia drivers. +* Creation of an autoscaling GKE node pool with `g2` machines each with 1 + attached L4 GPU. Note: This blueprint has also been tested with `a2` machines, + but because `a2` capacity is hard to find, the example uses `g2` machines, which are easier to obtain. + If using `a2` machines, it is recommended to first obtain an automatic reservation. + + Example settings for a2 look like: + + ```yaml + source: community/modules/compute/gke-node-pool + use: [gke_cluster] + settings: + disk_type: pd-balanced + machine_type: a2-highgpu-2g + guest_accelerator: + - type: nvidia-tesla-a100 + count: 2 + gpu_partition_size: null + gpu_sharing_config: null + gpu_driver_installation_config: + - gpu_driver_version: "DEFAULT" + ``` + +* Configuration of the cluster using default drivers provided by GKE. * Creation of a job template yaml file that can be used to submit jobs to the GPU node pool. 
diff --git a/examples/machine-learning/a3-highgpu-8g/README.md b/examples/machine-learning/a3-highgpu-8g/README.md index 0b37bd39d6..1dfecb189d 100644 --- a/examples/machine-learning/a3-highgpu-8g/README.md +++ b/examples/machine-learning/a3-highgpu-8g/README.md @@ -308,7 +308,7 @@ using an alternative image. ```shell git clone https://github.com/GoogleCloudPlatform/hpc-toolkit -cd hpc-toolkit/examples/machine-learning/nccl-tests +cd hpc-toolkit/examples/machine-learning/a3-highgpu-8g/nccl-tests ``` ### Import the PyTorch image from the NVIDIA Container Registry diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml index 7008b929a5..61e7861b5e 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml @@ -184,6 +184,7 @@ deployment_groups: suspend_rate: 0 suspend_timeout: 600 no_comma_params: false + tree_width: $(vars.a3_static_cluster_size) instance_image_custom: true instance_image: family: $(vars.final_image_family) diff --git a/examples/machine-learning/a3-megagpu-8g/nccl-tests/run-topological-nccl-tests.sh b/examples/machine-learning/a3-megagpu-8g/nccl-tests/run-topological-nccl-tests.sh new file mode 100644 index 0000000000..2cff2699ef --- /dev/null +++ b/examples/machine-learning/a3-megagpu-8g/nccl-tests/run-topological-nccl-tests.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# shellcheck disable=SC2016 + +#SBATCH --exclusive +#SBATCH --partition=a3mega +#SBATCH --mem=0 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --nodes 2 + +# Usage: sbatch run-topological-nccl-tests.sh + +set -x +# This should be set to the squashfs file that you created for your application +CONTAINER_IMAGE=./nvidia+pytorch+24.04-py3.sqsh + +# Set up NCCL Environment variables +# The following two can be useful for debugging +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=INIT,NET + +# These parameters should not be modified +# shellcheck source=/dev/null +NCCL_LIB_DIR="/var/lib/tcpxo/lib64" source /var/lib/tcpxo/lib64/nccl-env-profile.sh +export NCCL_FASTRAK_CTRL_DEV=enp0s12 +export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 +export NCCL_SOCKET_IFNAME=enp0s12 +export NCCL_FASTRAK_USE_SNAP=1 +export NCCL_FASTRAK_USE_LLCM=1 +export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY=/dev/aperture_devices + +# Here we grab all the environment variables that need to be +# passed down into the container. Slurm would otherwise only pass these env vars +# to the job environment on the host. +# shellcheck disable=SC2001 +HOST_VARS=$(sed 's/ \{1,\}/,/g' <<<"${!NCCL*}") + +# Mount /var/tmp to allow the rest of the enroot container to be read-only, and +# mount current $PWD to /nccl for accessing the nccl-tests binary +CONTAINER_MOUNTS="/var/tmp:/var/tmp" + +# Mount PWD to /nccl in the enroot container +CONTAINER_MOUNTS=${CONTAINER_MOUNTS},"$PWD:/nccl" + +# Mount required directories for GPUDirect-TCPXO functionality +CONTAINER_MOUNTS=${CONTAINER_MOUNTS},"/var/lib/tcpxo/lib64/" + +# Construct topology ordered hostfile +# The -n, -N, --ntasks-per-node, etc, must match the way the workload is +# launched in order to ensure proper placement. 
+srun --mpi=pmi2 \ + -n $((SLURM_NNODES * 8)) \ + --ntasks-per-node=8 \ + bash -c 'curl -s "http://metadata.google.internal/computeMetadata/v1/instance/attributes/physical_host" -H "Metadata-Flavor: Google"; echo /$SLURMD_NODENAME' | + sort -t / -s -k 1,4 | + awk -F "/" '{print $NF}' >/var/tmp/topo_sorted_hostfile +export SLURM_HOSTFILE=/var/tmp/topo_sorted_hostfile + +# Run the workload +srun -l \ + --mpi=pmi2 \ + --ntasks-per-node=8 \ + --container-image="${CONTAINER_IMAGE}" \ + --container-env="${HOST_VARS}" \ + --container-mounts="${CONTAINER_MOUNTS}" \ + sh -c " + export LD_LIBRARY_PATH=/var/lib/tcpxo/lib64:/usr/lib/x86_64-linux-gnu:\$LD_LIBRARY_PATH; + /nccl/nccl-tests/build/all_gather_perf -b 8M -e 8G -f 2 -g 1 -w 5 --iters 200 -c 0; + " diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index 0d345efffc..af68b4faf5 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -104,7 +104,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.5.6 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.5.8 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml diff --git a/go.mod b/go.mod index 727469dff9..b284491f4a 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,7 @@ require ( github.com/spf13/cobra v1.8.0 github.com/zclconf/go-cty v1.14.4 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa - google.golang.org/genproto v0.0.0-20240401170217-c3f982113cda // indirect + google.golang.org/genproto v0.0.0-20240528184218-531527333157 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) @@ -27,11 +27,11 @@ require ( github.com/hashicorp/terraform-exec v0.21.0 
github.com/mattn/go-isatty v0.0.20 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b - google.golang.org/api v0.181.0 + google.golang.org/api v0.183.0 ) require ( - cloud.google.com/go/auth v0.4.1 // indirect + cloud.google.com/go/auth v0.5.1 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect dario.cat/mergo v1.0.0 // indirect github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect @@ -54,14 +54,14 @@ require ( golang.org/x/sync v0.7.0 // indirect golang.org/x/time v0.5.0 // indirect golang.org/x/tools v0.15.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20240506185236-b8a5c65736ae // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240521202816-d264139d666e // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157 // indirect ) require ( - cloud.google.com/go v0.113.0 // indirect + cloud.google.com/go v0.114.0 // indirect cloud.google.com/go/compute/metadata v0.3.0 // indirect - cloud.google.com/go/iam v1.1.7 // indirect + cloud.google.com/go/iam v1.1.8 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect github.com/ProtonMail/go-crypto v1.1.0-alpha.2 // indirect github.com/agext/levenshtein v1.2.3 @@ -97,10 +97,10 @@ require ( go.opencensus.io v0.24.0 // indirect golang.org/x/crypto v0.23.0 // indirect golang.org/x/net v0.25.0 // indirect - golang.org/x/oauth2 v0.20.0 // indirect - golang.org/x/sys v0.20.0 + golang.org/x/oauth2 v0.21.0 // indirect + golang.org/x/sys v0.21.0 golang.org/x/text v0.15.0 // indirect - google.golang.org/grpc v1.63.2 // indirect + google.golang.org/grpc v1.64.0 // indirect google.golang.org/protobuf v1.34.1 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect ) diff --git a/go.sum b/go.sum index bb78dccbcb..039c3a4062 100644 --- a/go.sum +++ b/go.sum @@ -30,8 +30,8 @@ cloud.google.com/go v0.100.2/go.mod 
h1:4Xra9TjzAeYHrl5+oeLlzbM2k3mjVhZh4UqTZ//w9 cloud.google.com/go v0.102.0/go.mod h1:oWcCzKlqJ5zgHQt9YsaeTY9KzIvjyy0ArmiBUgpQ+nc= cloud.google.com/go v0.102.1/go.mod h1:XZ77E9qnTEnrgEOvr4xzfdX5TRo7fB4T2F4O6+34hIU= cloud.google.com/go v0.104.0/go.mod h1:OO6xxXdJyvuJPcEPBLN9BJPD+jep5G1+2U5B5gkRYtA= -cloud.google.com/go v0.113.0 h1:g3C70mn3lWfckKBiCVsAshabrDg01pQ0pnX1MNtnMkA= -cloud.google.com/go v0.113.0/go.mod h1:glEqlogERKYeePz6ZdkcLJ28Q2I6aERgDDErBg9GzO8= +cloud.google.com/go v0.114.0 h1:OIPFAdfrFDFO2ve2U7r/H5SwSbBzEdrBdE7xkgwc+kY= +cloud.google.com/go v0.114.0/go.mod h1:ZV9La5YYxctro1HTPug5lXH/GefROyW8PPD4T8n9J8E= cloud.google.com/go/aiplatform v1.22.0/go.mod h1:ig5Nct50bZlzV6NvKaTwmplLLddFx0YReh9WfTO5jKw= cloud.google.com/go/aiplatform v1.24.0/go.mod h1:67UUvRBKG6GTayHKV8DBv2RtR1t93YRu5B1P3x99mYY= cloud.google.com/go/analytics v0.11.0/go.mod h1:DjEWCu41bVbYcKyvlws9Er60YE4a//bK6mnhWvQeFNI= @@ -46,8 +46,8 @@ cloud.google.com/go/asset v1.8.0/go.mod h1:mUNGKhiqIdbr8X7KNayoYvyc4HbbFO9URsjby cloud.google.com/go/assuredworkloads v1.5.0/go.mod h1:n8HOZ6pff6re5KYfBXcFvSViQjDwxFkAkmUFffJRbbY= cloud.google.com/go/assuredworkloads v1.6.0/go.mod h1:yo2YOk37Yc89Rsd5QMVECvjaMKymF9OP+QXWlKXUkXw= cloud.google.com/go/assuredworkloads v1.7.0/go.mod h1:z/736/oNmtGAyU47reJgGN+KVoYoxeLBoj4XkKYscNI= -cloud.google.com/go/auth v0.4.1 h1:Z7YNIhlWRtrnKlZke7z3GMqzvuYzdc2z98F9D1NV5Hg= -cloud.google.com/go/auth v0.4.1/go.mod h1:QVBuVEKpCn4Zp58hzRGvL0tjRGU0YqdRTdCHM1IHnro= +cloud.google.com/go/auth v0.5.1 h1:0QNO7VThG54LUzKiQxv8C6x1YX7lUrzlAa1nVLF8CIw= +cloud.google.com/go/auth v0.5.1/go.mod h1:vbZT8GjzDf3AVqCcQmqeeM32U9HBFc32vVVAbwDsa6s= cloud.google.com/go/auth/oauth2adapt v0.2.2 h1:+TTV8aXpjeChS9M+aTtN/TjdQnzJvmzKFt//oWu7HX4= cloud.google.com/go/auth/oauth2adapt v0.2.2/go.mod h1:wcYjgpZI9+Yu7LyYBg4pqSiaRkfEK3GQcpb7C/uyF1Q= cloud.google.com/go/automl v1.5.0/go.mod h1:34EjfoFGMZ5sgJ9EoLsRtdPSNZLcfflJR39VbVNS2M0= @@ -111,8 +111,8 @@ cloud.google.com/go/gkehub v0.10.0/go.mod 
h1:UIPwxI0DsrpsVoWpLB0stwKCP+WFVG9+y97 cloud.google.com/go/grafeas v0.2.0/go.mod h1:KhxgtF2hb0P191HlY5besjYm6MqTSTj3LSI+M+ByZHc= cloud.google.com/go/iam v0.3.0/go.mod h1:XzJPvDayI+9zsASAFO68Hk07u3z+f+JrT2xXNdp4bnY= cloud.google.com/go/iam v0.5.0/go.mod h1:wPU9Vt0P4UmCux7mqtRu6jcpPAb74cP1fh50J3QpkUc= -cloud.google.com/go/iam v1.1.7 h1:z4VHOhwKLF/+UYXAJDFwGtNF0b6gjsW1Pk9Ml0U/IoM= -cloud.google.com/go/iam v1.1.7/go.mod h1:J4PMPg8TtyurAUvSmPj8FF3EDgY1SPRZxcUGrn7WXGA= +cloud.google.com/go/iam v1.1.8 h1:r7umDwhj+BQyz0ScZMp4QrGXjSTI3ZINnpgU2nlB/K0= +cloud.google.com/go/iam v1.1.8/go.mod h1:GvE6lyMmfxXauzNq8NbgJbeVQNspG+tcdL/W8QO1+zE= cloud.google.com/go/language v1.4.0/go.mod h1:F9dRpNFQmJbkaop6g0JhSBXCNlO90e1KWx5iDdxbWic= cloud.google.com/go/language v1.6.0/go.mod h1:6dJ8t3B+lUYfStgls25GusK04NLh3eDLQnWM3mdEbhI= cloud.google.com/go/lifesciences v0.5.0/go.mod h1:3oIKy8ycWGPUyZDR/8RNnTOYevhaMLqh5vLUXs9zvT8= @@ -652,8 +652,8 @@ golang.org/x/oauth2 v0.0.0-20220822191816-0ebed06d0094/go.mod h1:h4gKUeWbJ4rQPri golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.1.0/go.mod h1:G9FE4dLTsbXUu90h/Pf85g4w1D+SSAgR+q46nJZ8M4A= -golang.org/x/oauth2 v0.20.0 h1:4mQdhULixXKP1rwYBW0vAijoXnkTG0BLCDRzfe1idMo= -golang.org/x/oauth2 v0.20.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= +golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -738,8 +738,8 @@ golang.org/x/sys 
v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= -golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= +golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -874,8 +874,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.181.0 h1:rPdjwnWgiPPOJx3IcSAQ2III5aX5tCer6wMpa/xmZi4= -google.golang.org/api v0.181.0/go.mod h1:MnQ+M0CFsfUwA5beZ+g/vCBCPXvtmZwRz2qzZk8ih1k= +google.golang.org/api v0.183.0 h1:PNMeRDwo1pJdgNcFQ9GstuLe/noWKIc89pRWRLMvLwE= +google.golang.org/api v0.183.0/go.mod h1:q43adC5/pHoSZTx5h2mSmdF7NcyfW9JuDyIOJAgS9ZQ= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -984,12 +984,12 @@ google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw 
google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= -google.golang.org/genproto v0.0.0-20240401170217-c3f982113cda h1:wu/KJm9KJwpfHWhkkZGohVC6KRrc1oJNr4jwtQMOQXw= -google.golang.org/genproto v0.0.0-20240401170217-c3f982113cda/go.mod h1:g2LLCvCeCSir/JJSWosk19BR4NVxGqHUC6rxIRsd7Aw= -google.golang.org/genproto/googleapis/api v0.0.0-20240506185236-b8a5c65736ae h1:AH34z6WAGVNkllnKs5raNq3yRq93VnjBG6rpfub/jYk= -google.golang.org/genproto/googleapis/api v0.0.0-20240506185236-b8a5c65736ae/go.mod h1:FfiGhwUm6CJviekPrc0oJ+7h29e+DmWU6UtjX0ZvI7Y= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8 h1:mxSlqyb8ZAHsYDCfiXN1EDdNTdvjUJSLY+OnAUtYNYA= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8/go.mod h1:I7Y+G38R2bu5j1aLzfFmQfTcU/WnFuqDwLZAbvKTKpM= +google.golang.org/genproto v0.0.0-20240528184218-531527333157 h1:u7WMYrIrVvs0TF5yaKwKNbcJyySYf+HAIFXxWltJOXE= +google.golang.org/genproto v0.0.0-20240528184218-531527333157/go.mod h1:ubQlAQnzejB8uZzszhrTCU2Fyp6Vi7ZE5nn0c3W8+qQ= +google.golang.org/genproto/googleapis/api v0.0.0-20240521202816-d264139d666e h1:SkdGTrROJl2jRGT/Fxv5QUf9jtdKCQh4KQJXbXVLAi0= +google.golang.org/genproto/googleapis/api v0.0.0-20240521202816-d264139d666e/go.mod h1:LweJcLbyVij6rCex8YunD8DYR5VDonap/jYl3ZRxcIU= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157 h1:Zy9XzmMEflZ/MAaA7vNcoebnRAld7FsPW1EeBB7V0m8= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240528184218-531527333157/go.mod h1:EfXuqaE1J41VCDicxHzUDm+8rk+7ZdXzHV0IhO/I6s0= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod 
h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1025,8 +1025,8 @@ google.golang.org/grpc v1.48.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACu google.golang.org/grpc v1.49.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.1/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= -google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= -google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= +google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY= +google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= diff --git a/modules/README.md b/modules/README.md index b2f9bc0398..d369c65c97 100644 --- a/modules/README.md +++ b/modules/README.md @@ -219,7 +219,7 @@ Pub/Sub subscription. 
Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md [slurm-gcp-version-5]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/5.11.1 -[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.6 +[slurm-gcp-version-6]: https://github.com/GoogleCloudPlatform/slurm-gcp/tree/6.5.8 [pbspro-client]: ../community/modules/scheduler/pbspro-client/README.md [pbspro-server]: ../community/modules/scheduler/pbspro-server/README.md @@ -232,8 +232,6 @@ Pub/Sub subscription. Primarily used for [FSI - MonteCarlo Tutorial][fsi-monteca and VM images. * **[htcondor-install]** ![community-badge] ![experimental-badge] : Creates a startup script to install HTCondor and exports a list of required APIs -* **[kubernetes-operations]** ![community-badge] ![experimental-badge] : - Performs pre-defined operations on Kubernetes resources. * **[omnia-install]** ![community-badge] ![experimental-badge] ![deprecated-badge] : Installs Slurm via [Dell Omnia](https://github.com/dellhpc/omnia) onto a cluster of VM instances. 
_This module has been deprecated and will be removed diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 10a5059118..4ae20e4ba4 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.35.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.35.0" } required_version = ">= 1.3.0" diff --git a/modules/file-system/filestore/README.md b/modules/file-system/filestore/README.md index 30b2cca285..dd8268f48b 100644 --- a/modules/file-system/filestore/README.md +++ b/modules/file-system/filestore/README.md @@ -183,6 +183,7 @@ No modules. | [mount\_options](#input\_mount\_options) | NFS mount options to mount file system. | `string` | `"defaults,_netdev"` | no | | [name](#input\_name) | The resource name of the instance. | `string` | `null` | no | | [network\_id](#input\_network\_id) | The ID of the GCE VPC network to which the instance is connected given in the format:
`projects//global/networks/`" | `string` | n/a | yes | +| [nfs\_export\_options](#input\_nfs\_export\_options) | Define NFS export options. |
list(object({
access_mode = optional(string)
ip_ranges = optional(list(string))
squash_mode = optional(string)
}))
| `[]` | no | | [project\_id](#input\_project\_id) | ID of project in which Filestore instance will be created. | `string` | n/a | yes | | [region](#input\_region) | Location for Filestore instances at Enterprise tier. | `string` | n/a | yes | | [reserved\_ip\_range](#input\_reserved\_ip\_range) | Reserved IP range for Filestore instance. Users are encouraged to set to null
for automatic selection. If supplied, it must be:

CIDR format when var.connect\_mode == "DIRECT\_PEERING"
Named IP Range when var.connect\_mode == "PRIVATE\_SERVICE\_ACCESS"

See Cloud documentation for more details:

https://cloud.google.com/filestore/docs/creating-instances#configure_a_reserved_ip_address_range | `string` | `null` | no | diff --git a/modules/file-system/filestore/main.tf b/modules/file-system/filestore/main.tf index f231bc495f..53d24db8a0 100644 --- a/modules/file-system/filestore/main.tf +++ b/modules/file-system/filestore/main.tf @@ -59,6 +59,14 @@ resource "google_filestore_instance" "filestore_instance" { file_shares { capacity_gb = var.size_gb name = var.filestore_share_name + dynamic "nfs_export_options" { + for_each = var.nfs_export_options + content { + access_mode = nfs_export_options.value.access_mode + ip_ranges = nfs_export_options.value.ip_ranges + squash_mode = nfs_export_options.value.squash_mode + } + } } labels = local.labels diff --git a/modules/file-system/filestore/variables.tf b/modules/file-system/filestore/variables.tf index 1c8c64c28f..d48619c741 100644 --- a/modules/file-system/filestore/variables.tf +++ b/modules/file-system/filestore/variables.tf @@ -114,6 +114,17 @@ variable "connect_mode" { } } +variable "nfs_export_options" { + description = "Define NFS export options." + type = list(object({ + access_mode = optional(string) + ip_ranges = optional(list(string)) + squash_mode = optional(string) + })) + default = [] + nullable = false +} + variable "reserved_ip_range" { description = <<-EOT Reserved IP range for Filestore instance. 
Users are encouraged to set to null diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index b67b3b6506..fa1e5d42d5 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.35.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.35.0" } required_version = ">= 0.14.0" diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index bb5e05f848..28c996c64f 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.35.0" } required_version = ">= 0.14.0" diff --git a/modules/network/firewall-rules/versions.tf b/modules/network/firewall-rules/versions.tf index c33654833d..da86101857 100644 --- a/modules/network/firewall-rules/versions.tf +++ b/modules/network/firewall-rules/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.35.0" } required_version = ">= 1.3" diff --git a/modules/network/pre-existing-subnetwork/README.md b/modules/network/pre-existing-subnetwork/README.md new file mode 100644 index 0000000000..15cc92db9e --- /dev/null +++ b/modules/network/pre-existing-subnetwork/README.md @@ -0,0 +1,94 @@ +## Description + +This module discovers a subnetwork that already exists in Google Cloud and +outputs subnetwork attributes that uniquely identify 
it for use by other modules. + +For example, the blueprint below discovers the referenced subnetwork. +With the `use` keyword, the [vm-instance] module accepts the `subnetwork_self_link` +input variable that uniquely identifies the subnetwork in which the VM will be created. + +[vpc]: ../vpc/README.md +[vm-instance]: ../../compute/vm-instance/README.md + +> **_NOTE:_** Additional IAM work is needed for this to work correctly. + +### Example + +```yaml +- id: network + source: modules/network/pre-existing-subnetwork + settings: + subnetwork_self_link: https://www.googleapis.com/compute/v1/projects/name-of-host-project/regions/REGION/subnetworks/SUBNETNAME + +- id: example_vm + source: modules/compute/vm-instance + use: + - network + settings: + name_prefix: example + machine_type: c2-standard-4 +``` + +As described in documentation: +[https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_subnetwork] + +If `subnetwork_self_link` is provided, then `subnetwork_name`, `region`, and `project` are ignored. + +## License + + +Copyright 2024 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 0.14.0 | +| [google](#requirement\_google) | >= 3.83 | + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | >= 3.83 | + +## Modules + +No modules. 
+ +## Resources + +| Name | Type | +|------|------| +| [google_compute_subnetwork.primary_subnetwork](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_subnetwork) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [project](#input\_project) | Name of the project that owns the subnetwork | `string` | `null` | no | +| [region](#input\_region) | Region in which to search for primary subnetwork | `string` | `null` | no | +| [subnetwork\_name](#input\_subnetwork\_name) | Name of the pre-existing VPC subnetwork | `string` | `null` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Self-link of the subnet in the VPC | `string` | `null` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [subnetwork](#output\_subnetwork) | Full subnetwork object in the primary region | +| [subnetwork\_address](#output\_subnetwork\_address) | Subnetwork IP range in the primary region | +| [subnetwork\_name](#output\_subnetwork\_name) | Name of the subnetwork in the primary region | +| [subnetwork\_self\_link](#output\_subnetwork\_self\_link) | Subnetwork self-link in the primary region | + diff --git a/modules/network/pre-existing-subnetwork/main.tf b/modules/network/pre-existing-subnetwork/main.tf new file mode 100644 index 0000000000..8042f6472a --- /dev/null +++ b/modules/network/pre-existing-subnetwork/main.tf @@ -0,0 +1,30 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. +*/ + + +data "google_compute_subnetwork" "primary_subnetwork" { + name = var.subnetwork_name + region = var.region + project = var.project + self_link = var.subnetwork_self_link + + lifecycle { + postcondition { + condition = self.self_link != null + error_message = "The subnetwork: ${coalesce(var.subnetwork_name, var.subnetwork_self_link)} could not be found." + } + } +} diff --git a/modules/network/pre-existing-subnetwork/metadata.yaml b/modules/network/pre-existing-subnetwork/metadata.yaml new file mode 100644 index 0000000000..6a6f1e5757 --- /dev/null +++ b/modules/network/pre-existing-subnetwork/metadata.yaml @@ -0,0 +1,21 @@ +# Copyright 2023 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- + +spec: + requirements: + services: + - compute.googleapis.com +ghpc: + has_to_be_used: true diff --git a/modules/network/pre-existing-subnetwork/outputs.tf b/modules/network/pre-existing-subnetwork/outputs.tf new file mode 100644 index 0000000000..868708dc6b --- /dev/null +++ b/modules/network/pre-existing-subnetwork/outputs.tf @@ -0,0 +1,35 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +output "subnetwork" { + description = "Full subnetwork object in the primary region" + value = data.google_compute_subnetwork.primary_subnetwork +} + +output "subnetwork_name" { + description = "Name of the subnetwork in the primary region" + value = data.google_compute_subnetwork.primary_subnetwork.name +} + +output "subnetwork_self_link" { + description = "Subnetwork self-link in the primary region" + value = data.google_compute_subnetwork.primary_subnetwork.self_link +} + +output "subnetwork_address" { + description = "Subnetwork IP range in the primary region" + value = data.google_compute_subnetwork.primary_subnetwork.ip_cidr_range +} diff --git a/modules/network/pre-existing-subnetwork/variables.tf b/modules/network/pre-existing-subnetwork/variables.tf new file mode 100644 index 0000000000..d5191843e8 --- /dev/null +++ b/modules/network/pre-existing-subnetwork/variables.tf @@ -0,0 +1,39 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +variable "subnetwork_self_link" { + description = "Self-link of the subnet in the VPC" + type = string + default = null +} + +variable "project" { + description = "Name of the project that owns the subnetwork" + type = string + default = null +} + +variable "subnetwork_name" { + description = "Name of the pre-existing VPC subnetwork" + type = string + default = null +} + +variable "region" { + description = "Region in which to search for primary subnetwork" + type = string + default = null +} diff --git a/modules/network/pre-existing-subnetwork/versions.tf b/modules/network/pre-existing-subnetwork/versions.tf new file mode 100644 index 0000000000..6bac79625a --- /dev/null +++ b/modules/network/pre-existing-subnetwork/versions.tf @@ -0,0 +1,29 @@ +/** + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = ">= 3.83" + } + } + provider_meta "google" { + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.35.0" + } + + required_version = ">= 0.14.0" +} diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index bc8294ef73..e9fb5f72b0 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.35.0" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index a27294c71b..3a764d3376 100644 --- a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.35.0" } required_version = ">= 0.14.0" diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index 44c6682b3e..305b52aa66 100644 --- a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.34.1" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.35.0" } required_version = ">= 0.14.0" diff --git a/pkg/config/config.go b/pkg/config/config.go index f64da3929d..df30bd9616 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -68,9 +68,10 @@ func (n GroupName) Validate() error { // Group defines a group of Modules that are all executed together type 
Group struct { - Name GroupName `yaml:"group"` - TerraformBackend TerraformBackend `yaml:"terraform_backend,omitempty"` - Modules []Module `yaml:"modules"` + Name GroupName `yaml:"group"` + TerraformBackend TerraformBackend `yaml:"terraform_backend,omitempty"` + TerraformProviders map[string]TerraformProvider `yaml:"terraform_providers,omitempty"` + Modules []Module `yaml:"modules"` // DEPRECATED fields deprecatedKind interface{} `yaml:"kind,omitempty"` //lint:ignore U1000 keep in the struct for backwards compatibility } @@ -114,7 +115,7 @@ func (bp *Blueprint) Module(id ModuleID) (*Module, error) { return mod, nil } -func hintSpelling(s string, dict []string, err error) error { +func HintSpelling(s string, dict []string, err error) error { best, minDist := "", maxHintDist+1 for _, w := range dict { d := levenshtein.Distance(s, w, nil) @@ -176,6 +177,13 @@ type TerraformBackend struct { Configuration Dict } +// TerraformProvider defines the configuration for the terraform providers +type TerraformProvider struct { + Source string + Version string + Configuration Dict +} + // ModuleKind abstracts Toolkit module kinds (presently: packer/terraform) type ModuleKind struct { kind string @@ -261,8 +269,9 @@ type Blueprint struct { Validators []Validator `yaml:"validators,omitempty"` ValidationLevel int `yaml:"validation_level,omitempty"` Vars Dict - Groups []Group `yaml:"deployment_groups"` - TerraformBackendDefaults TerraformBackend `yaml:"terraform_backend_defaults,omitempty"` + Groups []Group `yaml:"deployment_groups"` + TerraformBackendDefaults TerraformBackend `yaml:"terraform_backend_defaults,omitempty"` + TerraformProviders map[string]TerraformProvider `yaml:"terraform_providers,omitempty"` // internal & non-serializable fields @@ -285,6 +294,46 @@ func (bp *Blueprint) Clone() Blueprint { return c } +func (bp *Blueprint) mutateDicts(cb func(dictPath, *Dict) Dict) { + bp.Vars = cb(Root.Vars, &bp.Vars) + + bp.TerraformBackendDefaults.Configuration = 
cb(Root.Backend.Configuration, &bp.TerraformBackendDefaults.Configuration) + + for k, p := range bp.TerraformProviders { + p.Configuration = cb(Root.Provider.Dot(k).Configuration, &p.Configuration) + bp.TerraformProviders[k] = p + } + + for ig := range bp.Groups { + g := &bp.Groups[ig] + gp := Root.Groups.At(ig) + + g.TerraformBackend.Configuration = cb(gp.Backend.Configuration, &g.TerraformBackend.Configuration) + + for k, p := range g.TerraformProviders { + p.Configuration = cb(gp.Provider.Dot(k).Configuration, &p.Configuration) + g.TerraformProviders[k] = p + } + + for im := range g.Modules { + m := &g.Modules[im] + m.Settings = cb(gp.Modules.At(im).Settings, &m.Settings) + } + } + + for i := range bp.Validators { + v := &bp.Validators[i] + v.Inputs = cb(Root.Validators.At(i).Inputs, &v.Inputs) + } +} + +func (bp *Blueprint) visitDicts(cb func(dictPath, *Dict)) { + bp.mutateDicts(func(p dictPath, d *Dict) Dict { + cb(p, d) + return *d + }) +} + // DeploymentSettings are deployment-specific override settings type DeploymentSettings struct { TerraformBackendDefaults TerraformBackend `yaml:"terraform_backend_defaults,omitempty"` @@ -295,13 +344,18 @@ type DeploymentSettings struct { func (bp *Blueprint) Expand() error { // expand the blueprint in dependency order: // BlueprintName -> DefaultBackend -> Vars -> Groups - if err := bp.checkBlueprintName(); err != nil { - return err + errs := (&Errors{}). + Add(checkStringLiterals(bp)). + Add(bp.checkBlueprintName()). 
+ Add(checkProviders(Root.Provider, bp.TerraformProviders)) + if errs.Any() { + return *errs } - if err := checkBackend(Root.Backend, bp.TerraformBackendDefaults); err != nil { + + if err := bp.expandVars(); err != nil { return err } - if err := bp.expandVars(); err != nil { + if err := bp.checkReferences(); err != nil { return err } return bp.expandGroups() @@ -351,6 +405,14 @@ func (bp Blueprint) ListUnusedVariables() []string { for _, v := range bp.Validators { ns["validator_"+v.Validator] = v.Inputs.AsObject() } + for k, v := range bp.TerraformProviders { + ns["bp_provider_"+k] = v.Configuration.AsObject() + } + for _, grp := range bp.Groups { + for k, v := range grp.TerraformProviders { + ns["grp_"+string(grp.Name)+"_provider_"+k] = v.Configuration.AsObject() + } + } var used = map[string]bool{ "labels": true, // automatically added @@ -463,7 +525,7 @@ func checkModulesAndGroups(bp Blueprint) error { errs.Add(validateModule(pm, mod, bp)) } - errs.Add(checkBackend(pg.Backend, grp.TerraformBackend)) + errs.Add(checkProviders(pg.Provider, grp.TerraformProviders)) } return errs.OrNil() } @@ -478,10 +540,30 @@ func validateModuleUseReferences(p ModulePath, mod Module, bp Blueprint) error { return errs.OrNil() } -func checkBackend(bep backendPath, be TerraformBackend) error { - val, perr := parseYamlString(be.Type) +func checkStringLiterals(bp *Blueprint) error { + errs := Errors{} + bp.visitStringLiterals(func(p Path, s string) { + errs.Add(checkStringLiteral(p, s)) + }) + return errs.OrNil() +} + +func checkStringLiteral(p Path, s string) error { + val, perr := parseYamlString(s) if _, is := IsExpressionValue(val); is || perr != nil { - return BpError{bep.Type, errors.New("can not use expression as a terraform_backend type")} + return BpError{p, errors.New("can not use expression here")} + } + return nil +} + +func checkProviders(pp mapPath[providerPath], tp map[string]TerraformProvider) error { + for k, v := range tp { + if v.Source == "" { + return 
BpError{pp.Dot(k).Source, errors.New(fmt.Sprintf("provider %q is missing source", k))} + } + if v.Version == "" { + return BpError{pp.Dot(k).Version, errors.New(fmt.Sprintf("provider %q is missing version", k))} + } } return nil } @@ -595,6 +677,30 @@ func (bp *Blueprint) checkBlueprintName() error { return nil } +// Check that all references in expressions are valid +func (bp *Blueprint) checkReferences() error { + errs := Errors{} + bp.visitDicts(func(dp dictPath, d *Dict) { + isModSettings := IsModuleSettingsPath(dp) + for k, v := range d.Items() { + for ref, rp := range valueReferences(v) { + path := dp.Dot(k).Cty(rp) + if !ref.GlobalVar { + if !isModSettings { + errs.At(path, fmt.Errorf("module output %q can only be referenced in other module settings", ref)) + } + // module to module references are checked by validateModuleSettingReferences later; keep checking the remaining references + continue + } + if !bp.Vars.Has(ref.Name) { + errs.At(path, fmt.Errorf("variable %q not found", ref.Name)) + } + } + } + }) + return errs.OrNil() +} + // productOfModuleUseMark is a "mark" applied to values that are result of `use`. // Should not be used directly, use AsProductOfModuleUse and IsProductOfModuleUse instead. 
type productOfModuleUseMark struct { @@ -648,6 +754,55 @@ func (bp *Blueprint) WalkModulesSafe(walker func(ModulePath, *Module)) { }) } +func (bp *Blueprint) visitStringLiterals(cb func(Path, string)) { + cb(Root.BlueprintName, bp.BlueprintName) + cb(Root.GhpcVersion, bp.GhpcVersion) + for iv, v := range bp.Validators { + cb(Root.Validators.At(iv).Validator, v.Validator) + } + + vBackend := func(pbe backendPath, be *TerraformBackend) { + cb(pbe.Type, be.Type) + } + vProviders := func(pps mapPath[providerPath], ps map[string]TerraformProvider) { + for k, p := range ps { + pp := pps.Dot(k) + cb(pp, k) + cb(pp.Source, p.Source) + cb(pp.Version, p.Version) + } + } + + vBackend(Root.Backend, &bp.TerraformBackendDefaults) + vProviders(Root.Provider, bp.TerraformProviders) + + for ig, g := range bp.Groups { + pg := Root.Groups.At(ig) + cb(pg.Name, string(g.Name)) + vBackend(pg.Backend, &g.TerraformBackend) + vProviders(pg.Provider, g.TerraformProviders) + for im, m := range g.Modules { + pm := pg.Modules.At(im) + cb(pm.Source, m.Source) + cb(pm.Kind, m.Kind.String()) + cb(pm.ID, string(m.ID)) + for iu, u := range m.Use { + cb(pm.Use.At(iu), string(u)) + } + for io, o := range m.Outputs { + po := pm.Outputs.At(io) + cb(po.Name, o.Name) + cb(po.Description, o.Description) + } + } + } + bp.visitDicts(func(dp dictPath, d *Dict) { + for _, k := range d.Keys() { + cb(dp.Dot(k), k) + } + }) +} + // validate every module setting in the blueprint containing a reference func validateModuleSettingReferences(p ModulePath, m Module, bp Blueprint) error { errs := Errors{} @@ -675,7 +830,7 @@ func varsTopologicalOrder(vars Dict) ([]string, error) { p := Root.Vars.Dot(n).Cty(rp) if !ref.GlobalVar { - return BpError{p, fmt.Errorf("non-global variable %q referenced in expression", ref)} + continue } if used[ref.Name] == 1 { diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 9ad7ce039f..8d25f8752f 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go 
@@ -517,35 +517,71 @@ func (s *zeroSuite) TestCheckMovedModules(c *C) { c.Assert(checkMovedModule("./community/modules/scheduler/cloud-batch-job"), NotNil) } -func (s *zeroSuite) TestCheckBackend(c *C) { - p := Root.Groups.At(173).Backend +func (s *zeroSuite) TestCheckStringLiteral(c *C) { + p := Root.BlueprintName // some path { // OK. Absent - c.Check(checkBackend(p, TerraformBackend{}), IsNil) + c.Check(checkStringLiteral(p, ""), IsNil) } - - { // OK. No variables used - b := TerraformBackend{ - Type: "gcs", - Configuration: Dict{}. - With("bucket", cty.StringVal("trenta")). - With("impersonate_service_account", cty.StringVal("who"))} - c.Check(checkBackend(p, b), IsNil) + { // OK. No expressions + c.Check(checkStringLiteral(p, "who"), IsNil) } { // FAIL. Expression in type - b := TerraformBackend{Type: "$(vartype)"} - c.Check(checkBackend(p, b), NotNil) + c.Check(checkStringLiteral(p, "$(vartype)"), NotNil) } { // FAIL. HCL literal - b := TerraformBackend{Type: "((var.zen))"} - c.Check(checkBackend(p, b), NotNil) + c.Check(checkStringLiteral(p, "((var.zen))"), NotNil) + } + + { // OK. Not an expression + c.Check(checkStringLiteral(p, "\\$(vartype)"), IsNil) } +} + +func (s *zeroSuite) TestCheckProviders(c *C) { + p := Root.Groups.At(173).Provider - { // OK. Not a variable - b := TerraformBackend{Type: "\\$(vartype)"} - c.Check(checkBackend(p, b), IsNil) + { // OK. Absent + c.Check(checkProviders(p, map[string]TerraformProvider{}), IsNil) + } + + { // OK. All required values used + tp := map[string]TerraformProvider{ + "test-provider": { + Source: "test-src", + Version: "test-ver", + Configuration: Dict{}. + With("project", cty.StringVal("test-prj")). + With("region", cty.StringVal("reg1")). + With("zone", cty.StringVal("zone1")). + With("universe_domain", cty.StringVal("test-universe.com"))}} + c.Check(checkProviders(p, tp), IsNil) + } + + { // FAIL. 
Missing Source + tp := map[string]TerraformProvider{ + "test-provider": { + Version: "test-ver", + Configuration: Dict{}. + With("project", cty.StringVal("test-prj")). + With("region", cty.StringVal("reg1")). + With("zone", cty.StringVal("zone1")). + With("universe_domain", cty.StringVal("test-universe.com"))}} + c.Check(checkProviders(p, tp), NotNil) + } + + { // FAIL. Missing Version + tp := map[string]TerraformProvider{ + "test-provider": { + Source: "test-src", + Configuration: Dict{}. + With("project", cty.StringVal("test-prj")). + With("region", cty.StringVal("reg1")). + With("zone", cty.StringVal("zone1")). + With("universe_domain", cty.StringVal("test-universe.com"))}} + c.Check(checkProviders(p, tp), NotNil) } } diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 6ee7d6becd..7a22523945 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -131,6 +131,9 @@ func (bp *Blueprint) expandGroups() error { func (bp Blueprint) expandGroup(gp groupPath, g *Group) error { var errs Errors bp.expandBackend(g) + if g.Kind() == TerraformKind { + bp.expandProviders(g) + } for im := range g.Modules { errs.Add(bp.expandModule(gp.Modules.At(im), &g.Modules[im])) } @@ -166,6 +169,42 @@ func (bp Blueprint) expandBackend(grp *Group) { } } +func getDefaultGoogleProviders(bp Blueprint) map[string]TerraformProvider { + gglConf := Dict{} + for s, v := range map[string]string{ + "project": "project_id", + "region": "region", + "zone": "zone"} { + if bp.Vars.Has(v) { + gglConf = gglConf.With(s, GlobalRef(v).AsValue()) + } + } + return map[string]TerraformProvider{ + "google": { + Source: "hashicorp/google", + Version: ">= 4.84.0, < 5.32.0", + Configuration: gglConf}, + "google-beta": { + Source: "hashicorp/google-beta", + Version: ">= 4.84.0, < 5.32.0", + Configuration: gglConf}} +} + +func (bp Blueprint) expandProviders(grp *Group) { + // 1. DEFAULT: use TerraformProviders provider dictionary (if supplied) + // 2. 
If top-level TerraformProviders is defined, insert that + // provider dictionary into resource groups which have no explicit + // TerraformProviders + defaults := bp.TerraformProviders + pv := &grp.TerraformProviders + if defaults == nil { + defaults = getDefaultGoogleProviders(bp) + } + if (*pv) == nil { + (*pv) = maps.Clone(defaults) + } +} + func getModuleInputMap(inputs []modulereader.VarInfo) map[string]cty.Type { modInputs := make(map[string]cty.Type) for _, input := range inputs { @@ -320,7 +359,7 @@ func validateModuleReference(bp Blueprint, from Module, toID ModuleID) error { bp.WalkModulesSafe(func(_ ModulePath, m *Module) { mods = append(mods, string(m.ID)) }) - return hintSpelling(string(toID), mods, err) + return HintSpelling(string(toID), mods, err) } if to.Kind == PackerKind { @@ -346,7 +385,7 @@ func validateModuleSettingReference(bp Blueprint, mod Module, r Reference) error if r.GlobalVar { if !bp.Vars.Has(r.Name) { err := fmt.Errorf("module %q references unknown global variable %q", mod.ID, r.Name) - return hintSpelling(r.Name, bp.Vars.Keys(), err) + return HintSpelling(r.Name, bp.Vars.Keys(), err) } return nil } @@ -358,7 +397,7 @@ func validateModuleSettingReference(bp Blueprint, mod Module, r Reference) error bp.WalkModulesSafe(func(_ ModulePath, m *Module) { hints = append(hints, string(m.ID)) }) - return hintSpelling(string(unkModErr.ID), hints, unkModErr) + return HintSpelling(string(unkModErr.ID), hints, unkModErr) } return err } @@ -375,7 +414,7 @@ func validateModuleSettingReference(bp Blueprint, mod Module, r Reference) error if !slices.Contains(outputs, r.Name) { err := fmt.Errorf("module %q does not have output %q", tm.ID, r.Name) - return hintSpelling(r.Name, outputs, err) + return HintSpelling(r.Name, outputs, err) } return nil } diff --git a/pkg/config/expand_test.go b/pkg/config/expand_test.go index 26059c4c53..7b0dae6c95 100644 --- a/pkg/config/expand_test.go +++ b/pkg/config/expand_test.go @@ -73,6 +73,81 @@ func (s *zeroSuite) 
TestExpandBackend(c *C) { } } +func (s *zeroSuite) TestExpandProviders(c *C) { + type PR = TerraformProvider // alias for brevity + noDefPr := Blueprint{BlueprintName: "tree"} + + testProvider := map[string]PR{ + "test-provider": TerraformProvider{ + Source: "test-src", + Version: "test-vers", + Configuration: Dict{}. + With("project", cty.StringVal("test-prj")). + With("region", cty.StringVal("reg1")). + With("zone", cty.StringVal("zone1")). + With("universe_domain", cty.StringVal("test-universe.com"))}} + + { // no def PR, no group PR - match default values + g := Group{Name: "clown"} + noDefPr.expandProviders(&g) + c.Check(g.TerraformProviders, DeepEquals, map[string]PR{ + "google": TerraformProvider{ + Source: "hashicorp/google", + Version: ">= 4.84.0, < 5.32.0"}, + "google-beta": TerraformProvider{ + Source: "hashicorp/google-beta", + Version: ">= 4.84.0, < 5.32.0"}}) + } + + { // no def PR, group PR + g := Group{ + Name: "clown", + TerraformProviders: testProvider} + noDefPr.expandProviders(&g) + c.Check(g.TerraformProviders, DeepEquals, testProvider) + } + + defBe := noDefPr + defBe.TerraformProviders = testProvider + + { // def PR, no group PR + g := Group{Name: "clown"} + defBe.expandProviders(&g) + + c.Check(g.TerraformProviders, DeepEquals, testProvider) + } + + group_provider := map[string]PR{ + "test-provider": TerraformProvider{ + Source: "test-source", + Version: "test-versions", + Configuration: Dict{}. + With("project", cty.StringVal("test-prj")). + With("region", cty.StringVal("reg2")). + With("zone", cty.StringVal("zone2s")). 
+ With("universe_domain", cty.StringVal("fake-universe.com"))}} + + { // def PR, group PR set + g := Group{ + Name: "clown", + TerraformProviders: group_provider} + defBe.expandProviders(&g) + + c.Check(g.TerraformProviders, DeepEquals, group_provider) + } + + empty_provider := map[string]PR{} + + { // No def PR, group (nil PR != PR w/ len == 0) (nil PR results in default PR values, empty PR remains empty) + g := Group{Name: "clown"} + g2 := Group{Name: "bear", + TerraformProviders: empty_provider} + noDefPr.expandProviders(&g) + noDefPr.expandProviders(&g2) + c.Check(g.TerraformProviders, Not(DeepEquals), g2.TerraformProviders) + } +} + func (s *zeroSuite) TestAddListValue(c *C) { mod := Module{ID: "TestModule"} diff --git a/pkg/config/materialize.go b/pkg/config/materialize.go index 71e864c530..39c71f3b86 100644 --- a/pkg/config/materialize.go +++ b/pkg/config/materialize.go @@ -27,7 +27,7 @@ func (bp *Blueprint) Materialize() error { return err } - if err := bp.evalGhpcStageInModuleSettings(); err != nil { + if err := bp.evalGhpcStage(); err != nil { return err } diff --git a/pkg/config/path.go b/pkg/config/path.go index c22de0aeb7..1c5926adc0 100644 --- a/pkg/config/path.go +++ b/pkg/config/path.go @@ -135,6 +135,7 @@ type rootPath struct { Vars dictPath `path:"vars"` Groups arrayPath[groupPath] `path:"deployment_groups"` Backend backendPath `path:"terraform_backend_defaults"` + Provider mapPath[providerPath] `path:"terraform_providers"` } type validatorCfgPath struct { @@ -152,11 +153,19 @@ type backendPath struct { Configuration dictPath `path:".configuration"` } +type providerPath struct { + basePath + Source basePath `path:".source"` + Version basePath `path:".version"` + Configuration dictPath `path:".configuration"` +} + type groupPath struct { basePath - Name basePath `path:".group"` - Backend backendPath `path:".terraform_backend"` - Modules arrayPath[ModulePath] `path:".modules"` + Name basePath `path:".group"` + Backend backendPath 
`path:".terraform_backend"` + Provider mapPath[providerPath] `path:".terraform_providers"` + Modules arrayPath[ModulePath] `path:".modules"` } type ModulePath struct { @@ -182,3 +191,15 @@ var Root rootPath func init() { initPath(&Root, nil, "") } + +func IsModuleSettingsPath(p Path) bool { + parent := p.Parent() + if parent == nil { + return false + } + mp, ok := parent.(*ModulePath) + if !ok { + return false + } + return p == mp.Settings +} diff --git a/pkg/config/path_test.go b/pkg/config/path_test.go index b17a0dc219..4e1964854b 100644 --- a/pkg/config/path_test.go +++ b/pkg/config/path_test.go @@ -36,6 +36,7 @@ func TestPath(t *testing.T) { {r.Vars, "vars"}, {r.Groups, "deployment_groups"}, {r.Backend, "terraform_backend_defaults"}, + {r.Provider, "terraform_providers"}, {r.Validators.At(2), "validators[2]"}, {r.Validators.At(2).Validator, "validators[2].validator"}, @@ -72,6 +73,12 @@ func TestPath(t *testing.T) { {r.Backend.Type, "terraform_backend_defaults.type"}, {r.Backend.Configuration, "terraform_backend_defaults.configuration"}, {r.Backend.Configuration.Dot("goo"), "terraform_backend_defaults.configuration.goo"}, + + {r.Provider.Dot("goo"), "terraform_providers.goo"}, + {r.Provider.Dot("goo").Source, "terraform_providers.goo.source"}, + {r.Provider.Dot("goo").Version, "terraform_providers.goo.version"}, + {r.Provider.Dot("goo").Configuration, "terraform_providers.goo.configuration"}, + {r.Provider.Dot("goo").Configuration.Dot("googoo"), "terraform_providers.goo.configuration.googoo"}, } for _, tc := range tests { t.Run(tc.want, func(t *testing.T) { diff --git a/pkg/config/staging.go b/pkg/config/staging.go index aabb7cad23..df95946511 100644 --- a/pkg/config/staging.go +++ b/pkg/config/staging.go @@ -90,26 +90,23 @@ func (bp *Blueprint) makeGhpcStageFunc() function.Function { }) } -// Update module settings in place, evaluating `ghpc_stage` expressions -func (bp *Blueprint) evalGhpcStageInModuleSettings() error { +// Partially evaluate all `ghpc_stage` 
expressions in the blueprint +func (bp *Blueprint) evalGhpcStage() error { errs := Errors{} ctx, err := bp.evalCtx() if err != nil { return err } - bp.WalkModulesSafe(func(mp ModulePath, m *Module) { + + bp.mutateDicts(func(dp dictPath, d *Dict) Dict { us := map[string]cty.Value{} - for k, v := range m.Settings.Items() { - uv, err := evalGhpcStageInValue(mp.Settings.Dot(k), v, ctx) - if err != nil { - errs.Add(err) - break - } + for k, v := range d.Items() { + uv, err := evalGhpcStageInValue(dp.Dot(k), v, ctx) + errs.Add(err) us[k] = uv } - m.Settings = NewDict(us) + return NewDict(us) }) - return errs.OrNil() } diff --git a/pkg/config/staging_test.go b/pkg/config/staging_test.go index f34c61e36b..071ae7f6d2 100644 --- a/pkg/config/staging_test.go +++ b/pkg/config/staging_test.go @@ -130,7 +130,7 @@ func TestPartialEval(t *testing.T) { } } -func TestEvalModuleSettings(t *testing.T) { +func TestEvalGhpcStage(t *testing.T) { mod := Module{ Settings: Dict{}. With("war", MustParseExpression(`never("changes")`).AsValue()). 
@@ -142,7 +142,7 @@ func TestEvalModuleSettings(t *testing.T) { Groups: []Group{{Modules: []Module{mod}}}, } - if err := bp.evalGhpcStageInModuleSettings(); err != nil { + if err := bp.evalGhpcStage(); err != nil { t.Errorf("got unexpected error: %v", err) } diff --git a/pkg/config/validate.go b/pkg/config/validate.go index 5fa290186f..23322719a4 100644 --- a/pkg/config/validate.go +++ b/pkg/config/validate.go @@ -181,7 +181,7 @@ func validateSettings( } // Setting not found if _, ok := cVars.Inputs[k]; !ok { - err := hintSpelling(k, maps.Keys(cVars.Inputs), UnknownModuleSetting) + err := HintSpelling(k, maps.Keys(cVars.Inputs), UnknownModuleSetting) errs.At(sp, err) continue // do not perform other validations } diff --git a/pkg/modulewriter/modulewriter_test.go b/pkg/modulewriter/modulewriter_test.go index 4f2b94a340..ee25280ca8 100644 --- a/pkg/modulewriter/modulewriter_test.go +++ b/pkg/modulewriter/modulewriter_test.go @@ -355,60 +355,78 @@ func (s *zeroSuite) TestWriteVariables(c *C) { c.Assert(err, IsNil) } -func (s *zeroSuite) TestGetProviders(c *C) { - // no vars - c.Check( - getProviders(config.Blueprint{}), DeepEquals, []provider{ - {alias: "google", source: "hashicorp/google", version: ">= 4.84.0, < 5.30.0", config: config.Dict{}}, - {alias: "google-beta", source: "hashicorp/google-beta", version: ">= 4.84.0, < 5.30.0", config: config.Dict{}}}) - - { // all vars - allSet := config.NewDict(map[string]cty.Value{ - "project": config.GlobalRef("project_id").AsValue(), - "region": config.GlobalRef("region").AsValue(), - "zone": config.GlobalRef("zone").AsValue(), - }) - c.Check( - getProviders(config.Blueprint{ - Vars: config.NewDict(map[string]cty.Value{ - "project_id": cty.StringVal("some"), - "region": cty.StringVal("some"), - "zone": cty.StringVal("some"), - }), - }), DeepEquals, []provider{ - {alias: "google", source: "hashicorp/google", version: ">= 4.84.0, < 5.30.0", config: allSet}, - {alias: "google-beta", source: "hashicorp/google-beta", version: 
">= 4.84.0, < 5.30.0", config: allSet}}) - } -} - func (s *zeroSuite) TestWriteProviders(c *C) { // Setup dir := c.MkDir() - zebra := provider{alias: "zebra", source: "hashicorp/zebra", version: "~> 2", config: config.Dict{}} - elephant := provider{ - alias: "elephant", - source: "savannah/elephant", - version: "~> 8", - config: config.NewDict(map[string]cty.Value{ - "smeller": config.GlobalRef("long").AsValue(), - "listeners": config.GlobalRef("spacious").AsValue()})} - + providers := map[string]config.TerraformProvider{ + "elephant": config.TerraformProvider{ + Source: "savannah/elephant", + Version: "~> 8", + Configuration: config.NewDict(map[string]cty.Value{ + "smeller": config.GlobalRef("long").AsValue(), + "listeners": config.GlobalRef("spacious").AsValue()})}, + "zebra": config.TerraformProvider{ + Source: "hashicorp/zebra", + Version: "~> 2", + Configuration: config.Dict{}}} { // FAIL, non existing path - c.Check(writeProviders([]provider{zebra}, "not/a/real/path"), NotNil) + c.Check(writeProviders(providers, "not/a/real/path"), NotNil) } { // OK - c.Check(writeProviders([]provider{zebra, elephant}, dir), IsNil) + c.Check(writeProviders(providers, dir), IsNil) b, err := os.ReadFile(filepath.Join(dir, "providers.tf")) c.Assert(err, IsNil) c.Check(string(b), Equals, license+` -provider "zebra" { -} - provider "elephant" { listeners = var.spacious smeller = var.long } + +provider "zebra" { +} +`) + } +} + +func (s *zeroSuite) TestWriteVersions(c *C) { + // Setup + dir := c.MkDir() + providers := map[string]config.TerraformProvider{ + "elephant": config.TerraformProvider{ + Source: "savannah/elephant", + Version: "~> 8", + Configuration: config.NewDict(map[string]cty.Value{ + "smeller": config.GlobalRef("long").AsValue(), + "listeners": config.GlobalRef("spacious").AsValue()})}, + "zebra": config.TerraformProvider{ + Source: "hashicorp/zebra", + Version: "~> 2", + Configuration: config.Dict{}}} + + { // FAIL, non existing path + 
c.Check(writeVersions(providers, "not/a/real/path"), NotNil) + } + + { // OK + c.Check(writeVersions(providers, dir), IsNil) + b, err := os.ReadFile(filepath.Join(dir, "versions.tf")) + c.Assert(err, IsNil) + c.Check(string(b), Equals, license+` +terraform { + required_version = ">= 1.2" + + required_providers { + elephant = { + source = "savannah/elephant" + version = "~> 8" + } + zebra = { + source = "hashicorp/zebra" + version = "~> 2" + } + } +} `) } } diff --git a/pkg/modulewriter/tfwriter.go b/pkg/modulewriter/tfwriter.go index c6fa91030d..361dbe037f 100644 --- a/pkg/modulewriter/tfwriter.go +++ b/pkg/modulewriter/tfwriter.go @@ -186,46 +186,23 @@ func writeMain( return writeHclFile(filepath.Join(dst, "main.tf"), hclFile) } -type provider struct { - alias string - source string - version string - config config.Dict -} - -func getProviders(bp config.Blueprint) []provider { - gglConf := config.Dict{} - for s, v := range map[string]string{ - "project": "project_id", - "region": "region", - "zone": "zone"} { - if bp.Vars.Has(v) { - gglConf = gglConf.With(s, config.GlobalRef(v).AsValue()) - } - } - - return []provider{ - {"google", "hashicorp/google", ">= 4.84.0, < 5.30.0", gglConf}, - {"google-beta", "hashicorp/google-beta", ">= 4.84.0, < 5.30.0", gglConf}, - } -} - -func writeProviders(providers []provider, dst string) error { +func writeProviders(providers map[string]config.TerraformProvider, dst string) error { hclFile := hclwrite.NewEmptyFile() hclBody := hclFile.Body() - for _, prov := range providers { + for _, k := range orderKeys(providers) { hclBody.AppendNewline() - pb := hclBody.AppendNewBlock("provider", []string{prov.alias}).Body() + v := providers[k] + pb := hclBody.AppendNewBlock("provider", []string{k}).Body() - for _, s := range orderKeys(prov.config.Items()) { - pb.SetAttributeRaw(s, config.TokensForValue(prov.config.Get(s))) + for _, s := range orderKeys(v.Configuration.Items()) { + pb.SetAttributeRaw(s, 
config.TokensForValue(v.Configuration.Get(s))) } } return writeHclFile(filepath.Join(dst, "providers.tf"), hclFile) } -func writeVersions(providers []provider, dst string) error { +func writeVersions(providers map[string]config.TerraformProvider, dst string) error { f := hclwrite.NewEmptyFile() body := f.Body() body.AppendNewline() @@ -235,10 +212,11 @@ func writeVersions(providers []provider, dst string) error { pb := tfb.AppendNewBlock("required_providers", []string{}).Body() - for _, p := range providers { - pb.SetAttributeValue(p.alias, cty.ObjectVal(map[string]cty.Value{ - "source": cty.StringVal(p.source), - "version": cty.StringVal(p.version), + for _, k := range orderKeys(providers) { + v := providers[k] + pb.SetAttributeValue(k, cty.ObjectVal(map[string]cty.Value{ + "source": cty.StringVal(v.Source), + "version": cty.StringVal(v.Version), })) } return writeHclFile(filepath.Join(dst, "versions.tf"), f) @@ -276,6 +254,8 @@ func (w TFWriter) writeGroup( intergroupInputs[igVar.Name] = true } + tp := g.TerraformProviders + // Write main.tf file doctoredModules, err := substituteIgcReferences(g.Modules, intergroupVars) if err != nil { @@ -300,14 +280,13 @@ func (w TFWriter) writeGroup( return fmt.Errorf("error writing terraform.tfvars file for deployment group %s: %w", g.Name, err) } - providers := getProviders(bp) // Write providers.tf file - if err := writeProviders(providers, groupPath); err != nil { + if err := writeProviders(tp, groupPath); err != nil { return fmt.Errorf("error writing providers.tf file for deployment group %s: %w", g.Name, err) } // Write versions.tf file - if err := writeVersions(providers, groupPath); err != nil { + if err := writeVersions(tp, groupPath); err != nil { return fmt.Errorf("error writing versions.tf file for deployment group %s: %v", g.Name, err) } @@ -365,8 +344,8 @@ func getUsedDeploymentVars(group config.Group, bp config.Blueprint) map[string]c for _, m := range group.Modules { used = append(used, 
config.GetUsedDeploymentVars(m.Settings.AsObject())...) } - for _, p := range getProviders(bp) { - used = append(used, config.GetUsedDeploymentVars(p.config.AsObject())...) + for _, v := range group.TerraformProviders { + used = append(used, config.GetUsedDeploymentVars(v.Configuration.AsObject())...) } for _, v := range used { res[v] = bp.Vars.Get(v) diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml index 6492697daf..c1e1801c1a 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm.yaml @@ -17,7 +17,7 @@ tags: - m.custom-image - m.pre-existing-vpc - m.startup-script -- slurm5 +- slurm6 timeout: 14400s # 4hr steps: @@ -41,3 +41,24 @@ steps: IMAGE_NAME=$(gcloud compute images list --project "${PROJECT_ID}" \ --no-standard-images --filter="labels.ghpc_deployment~$${BUILD_ID_SHORT}" \ --format='get(name)' --limit=1) + + echo $${IMAGE_NAME} > /persistent_volume/image_name + volumes: + - name: 'persistent_volume' + path: '/persistent_volume' +- id: ml-a3-megagpu-slurm-cluster + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + + cat /persistent_volume/image_name | xargs -L1 gcloud compute images delete --project "${PROJECT_ID}" --quiet + volumes: + - name: 'persistent_volume' + path: '/persistent_volume' diff --git a/tools/cloud-build/daily-tests/tests/hcls.yml b/tools/cloud-build/daily-tests/tests/hcls.yml index b193d8fc4c..6b48c4bba5 100644 --- a/tools/cloud-build/daily-tests/tests/hcls.yml +++ b/tools/cloud-build/daily-tests/tests/hcls.yml @@ -19,7 +19,7 @@ deployment_name: "hcls-v6-{{ build }}" # No non-alphanumerical characters in the slurm cluster name - they will be # removed by HPC 
Toolkit slurm wrappers, which will break the playbook slurm_cluster_name: "hclsv6{{ build[0:4] }}" -zone: us-central1-c +zone: europe-west1-d workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/hcls-blueprint.yaml" network: "{{ test_name }}-net" @@ -27,7 +27,7 @@ login_node: "{{ slurm_cluster_name }}-slurm-login-*" controller_node: "{{ slurm_cluster_name }}-controller" cli_deployment_vars: network_name: "{{ network }}" - region: us-central1 + region: europe-west1 zone: "{{ zone }}" enable_login_public_ips: "true" enable_controller_public_ips: "true" diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-cluster.yml b/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-cluster.yml index 62283c4e05..66415efa9c 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-cluster.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-cluster.yml @@ -20,7 +20,7 @@ test_name: a3h-cluster deployment_name: a3hc-{{ build }} slurm_cluster_name: "a3hc{{ build[0:4] }}" workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/machine-learning/ml-slurm-a3-2-cluster.yaml" +blueprint_yaml: "{{ workspace }}/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml" login_node: "{{ slurm_cluster_name }}-login-*" controller_node: "{{ slurm_cluster_name }}-controller" network: default diff --git a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-image.yml b/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-image.yml index 7568aa6b38..d9dc8b9587 100644 --- a/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-image.yml +++ b/tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm-image.yml @@ -17,7 +17,7 @@ test_name: a3h-image deployment_name: a3himg{{ build }} workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/machine-learning/ml-slurm-a3-1-image.yaml" +blueprint_yaml: "{{ workspace }}/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml" packer_group_name: slurm-build 
packer_module_id: slurm-image delete_image: false diff --git a/tools/cloud-workstations/Dockerfile b/tools/cloud-workstations/Dockerfile index 11e41f3a75..f6284678e6 100644 --- a/tools/cloud-workstations/Dockerfile +++ b/tools/cloud-workstations/Dockerfile @@ -25,7 +25,7 @@ RUN curl -fsSL https://apt.releases.hashicorp.com/gpg | apt-key add - && \ keychain \ dnsutils && \ apt-add-repository "deb [arch=$(dpkg --print-architecture)] https://apt.releases.hashicorp.com bullseye main" && \ - apt-get -y update && apt-get install -y unzip python3-pip python3-venv terraform packer jq && \ + apt-get -y update && apt-get install -y unzip python3-pip python3-venv terraform packer jq tmux && \ echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \ | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \ diff --git a/tools/create-release-candidate.sh b/tools/create-release-candidate.sh index 15c88ce3b4..2b7a5bfb22 100644 --- a/tools/create-release-candidate.sh +++ b/tools/create-release-candidate.sh @@ -103,6 +103,7 @@ git commit -m "Increase version to ${NEW_VERSION}" git push -u "${REMOTE_NAME}" "${V_BRANCH}" echo "Opening pull request to update release-candidate to version ${NEW_VERSION}" gh pr create --base "${RC_BRANCH}" --head "${V_BRANCH}" \ + --label "release-chore" \ --title "Update Toolkit release to ${NEW_TAG}" \ --body "Set release-candidate to version ${NEW_VERSION}" echo diff --git a/tools/enforce_coverage.pl b/tools/enforce_coverage.pl index d1ea19783a..b258d07869 100755 --- a/tools/enforce_coverage.pl +++ b/tools/enforce_coverage.pl @@ -26,6 +26,7 @@ pkg/logging 0 pkg/validators 13 pkg/inspect 60 + pkg/modulewriter 79 pkg 80 ); diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml index 
59b94e525c..6e19532122 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/.ghpc/artifacts/expanded_blueprint.yaml @@ -35,6 +35,21 @@ vars: zone: us-east4-c deployment_groups: - group: zero + terraform_providers: + google: + source: hashicorp/google + version: '>= 4.84.0, < 5.32.0' + configuration: + project: ((var.project_id)) + region: ((var.region)) + zone: ((var.zone)) + google-beta: + source: hashicorp/google-beta + version: '>= 4.84.0, < 5.32.0' + configuration: + project: ((var.project_id)) + region: ((var.region)) + zone: ((var.zone)) modules: - source: modules/network/vpc kind: terraform diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index 15d18e1957..c54de68b69 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.30.0" + version = ">= 4.84.0, < 5.32.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.30.0" + version = ">= 4.84.0, < 5.32.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml index 77242b6c75..d01535a834 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/.ghpc/artifacts/expanded_blueprint.yaml @@ -41,6 +41,21 @@ deployment_groups: configuration: bucket: ((var.zone)) prefix: (("igc/${var.deployment_name}/zero")) + terraform_providers: + google: + source: 
hashicorp/google + version: '>= 4.84.0, < 5.32.0' + configuration: + project: ((var.project_id)) + region: ((var.region)) + zone: ((var.zone)) + google-beta: + source: hashicorp/google-beta + version: '>= 4.84.0, < 5.32.0' + configuration: + project: ((var.project_id)) + region: ((var.region)) + zone: ((var.zone)) modules: - source: modules/network/vpc kind: terraform @@ -61,6 +76,21 @@ deployment_groups: configuration: bucket: ((var.zone)) prefix: (("igc/${var.deployment_name}/one")) + terraform_providers: + google: + source: hashicorp/google + version: '>= 4.84.0, < 5.32.0' + configuration: + project: ((var.project_id)) + region: ((var.region)) + zone: ((var.zone)) + google-beta: + source: hashicorp/google-beta + version: '>= 4.84.0, < 5.32.0' + configuration: + project: ((var.project_id)) + region: ((var.region)) + zone: ((var.zone)) modules: - source: modules/file-system/filestore kind: terraform diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index 15d18e1957..c54de68b69 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.30.0" + version = ">= 4.84.0, < 5.32.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.30.0" + version = ">= 4.84.0, < 5.32.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index 15d18e1957..c54de68b69 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = 
"hashicorp/google" - version = ">= 4.84.0, < 5.30.0" + version = ">= 4.84.0, < 5.32.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.30.0" + version = ">= 4.84.0, < 5.32.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml index 641833597f..6d562428fc 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/.ghpc/artifacts/expanded_blueprint.yaml @@ -36,6 +36,21 @@ vars: zone: (("${var.region}-c")) deployment_groups: - group: zero + terraform_providers: + google: + source: hashicorp/google + version: '>= 4.84.0, < 5.32.0' + configuration: + project: ((var.project_id)) + region: ((var.region)) + zone: ((var.zone)) + google-beta: + source: hashicorp/google-beta + version: '>= 4.84.0, < 5.32.0' + configuration: + project: ((var.project_id)) + region: ((var.region)) + zone: ((var.zone)) modules: - source: modules/network/vpc kind: terraform diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index 15d18e1957..c54de68b69 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.84.0, < 5.30.0" + version = ">= 4.84.0, < 5.32.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.84.0, < 5.30.0" + version = ">= 4.84.0, < 5.32.0" } } }