Update for sharding

ComputeCanada · Apr 12, 2024 · c747de3 · c747de3
1 parent 25a9cad
commit c747de3
Show file tree

Hide file tree

Showing 6 changed files with 7 additions and 2 deletions.
diff --git a/aws/infrastructure.tf b/aws/infrastructure.tf
@@ -208,6 +208,7 @@ locals {
         ram  = data.aws_ec2_instance_type.instance_type[values.prefix].memory_size
         gpus = try(one(data.aws_ec2_instance_type.instance_type[values.prefix].gpus).count, 0)
         mig  = lookup(values, "mig", null)
+        shard  = lookup(values, "shard", null)
       }
     }
   }

diff --git a/azure/infrastructure.tf b/azure/infrastructure.tf
@@ -171,6 +171,7 @@ locals {
         ram  = local.vmsizes[values.type].ram
         gpus = local.vmsizes[values.type].gpus
         mig  = lookup(values, "mig", null)
+        shard  = lookup(values, "shard", null)
       }
     }
   }

diff --git a/common/configuration/main.tf b/common/configuration/main.tf
@@ -167,4 +167,4 @@ output "bastions" {
     for host, values in var.inventory: host => values
     if contains(values.tags, var.bastion_tag) && contains(values.tags, "public") &&  (!contains(values.tags, "pool"))
   }
-}
+}
diff --git a/docs/README.md b/docs/README.md
@@ -560,6 +560,7 @@ the operating system and service software
     ```
     This is only functional with [MIG supported GPUs](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#supported-gpus),
     and with x86-64 processors (see [NVIDIA/mig-parted issue #30](https://github.com/NVIDIA/mig-parted/issues/30)).
+6. `shard`: total number of [shards](https://slurm.schedmd.com/gres.html#Sharding) on the node. Sharding allows multiple jobs to share the same GPU. The total number of shards is evenly distributed across all GPUs on the node.
 
 For some cloud providers, it is possible to define additional attributes.
 The following sections present the available attributes per provider.

diff --git a/gcp/infrastructure.tf b/gcp/infrastructure.tf
@@ -181,6 +181,7 @@ locals {
         ram  = data.external.machine_type[values["prefix"]].result["ram"]
         gpus = try(data.external.machine_type[values["prefix"]].result["gpus"], lookup(values, "gpu_count", 0))
         mig  = lookup(values, "mig", null)
+        shard  = lookup(values, "shard", null)
       }
     }
   }

diff --git a/openstack/infrastructure.tf b/openstack/infrastructure.tf
@@ -138,6 +138,7 @@ locals {
           parseint(split(":", lookup(data.openstack_compute_flavor_v2.flavors[values.prefix].extra_specs, "pci_passthrough:alias", "gpu:0"))[1], 10)
         ])
         mig  = lookup(values, "mig", null)
+        shard  = lookup(values, "shard", null)
       }
     }
   }
@@ -146,4 +147,4 @@ locals {
     host => merge(module.configuration.inventory[host], {id=openstack_compute_instance_v2.instances[host].id})
     if contains(module.configuration.inventory[host].tags, "public")
   }
-}
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -167,4 +167,4 @@ output "bastions" { @@
         for host, values in var.inventory: host => values
         if contains(values.tags, var.bastion_tag) && contains(values.tags, "public") &&  (!contains(values.tags, "pool"))
       }
-    }
+    }