-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added single node GPU instance (2xlarge) (#95)
* Added single node GPU instance (2xlarge) * Updated ami of gpu instance
- Loading branch information
1 parent
6d0b094
commit b867252
Showing
6 changed files
with
309 additions
and
0 deletions.
There are no files selected for viewing
11 changes: 11 additions & 0 deletions
11
terraform/re-1node-1shards-1threads-gpu-g6-2xlarge/common.tf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
|
||
################################################################################ | ||
# This is the bucket holding this specific setup tfstate | ||
################################################################################ | ||
terraform {
  # State for this setup is stored remotely in S3.
  # NOTE(review): no `key` is set here, so the state object path is presumably
  # supplied at `terraform init` time (e.g. -backend-config) — confirm.
  backend "s3" {
    region = "us-east-1"
    bucket = "performance-cto-group"
  }
}
|
61 changes: 61 additions & 0 deletions
61
terraform/re-1node-1shards-1threads-gpu-g6-2xlarge/db-resources_2a.tf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
|
||
# DB server instance(s) for this setup, placed in us-east-2a.
# Networking (subnet + security group) comes from the shared_resources remote
# state; sizing, AMI and SSH settings come from variables.
resource "aws_instance" "server_2a" {
  count                  = var.server_instance_count
  ami                    = var.instance_ami
  instance_type          = var.server_instance_type
  subnet_id              = data.terraform_remote_state.shared_resources.outputs.subnet_public_id
  vpc_security_group_ids = [data.terraform_remote_state.shared_resources.outputs.performance_cto_sg_id]
  key_name               = var.key_name
  # NOTE(review): the AZ is hard-coded while the provider region is a variable;
  # overriding var.region requires changing this value as well.
  availability_zone = "us-east-2a"

  root_block_device {
    volume_size           = var.instance_volume_size
    volume_type           = var.instance_volume_type
    encrypted             = var.instance_volume_encrypted
    delete_on_termination = true
  }

  # Tags applied to the volumes attached to the instance.
  # NOTE(review): Project reuses var.environment — confirm this is intended.
  volume_tags = {
    Environment    = var.environment
    Project        = var.environment
    Name           = "ebs_block_device-${var.setup_name}-DB-us-east-2a-${count.index + 1}"
    setup          = var.setup_name
    triggering_env = var.triggering_env
    github_actor   = var.github_actor
    github_org     = var.github_org
    github_repo    = var.github_repo
    github_sha     = var.github_sha
  }

  # Tags applied to the instance itself (same scheme as volume_tags).
  tags = {
    Environment    = var.environment
    Project        = var.environment
    Name           = "${var.setup_name}-DB-us-east-2a-${count.index + 1}"
    setup          = var.setup_name
    triggering_env = var.triggering_env
    github_actor   = var.github_actor
    github_org     = var.github_org
    github_repo    = var.github_repo
    github_sha     = var.github_sha
  }

  ################################################################################
  # This will ensure we wait here until the instance is ready to receive the ssh connection
  ################################################################################
  provisioner "remote-exec" {
    script = "./../scripts/wait_for_instance.sh"
    connection {
      host        = self.public_ip # The `self` variable is like `this` in many programming languages
      type        = "ssh"          # in this case, `self` is the resource (the server).
      user        = var.ssh_user
      private_key = file(var.private_key)
      # Need to increase the timeout to larger than 5m for metal instances.
      timeout = "5m"
      # Boolean literal instead of the string "false"; the value is unchanged.
      agent = false
    }
  }

  ################################################################################
  # Deployment related
  ################################################################################
}
23 changes: 23 additions & 0 deletions
23
terraform/re-1node-1shards-1threads-gpu-g6-2xlarge/output.tf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Values exported by this setup; prepare_ips.py reads them via
# `terraform output -json`.

# Public IPs of all server_2a instances (one entry per instance).
output "server_public_ip" {
  value = aws_instance.server_2a[*].public_ip
}

# Private IPs of all server_2a instances (one entry per instance).
output "server_private_ip" {
  value = aws_instance.server_2a[*].private_ip
}

# Pass-through of the input variables describing the setup.
output "server_instance_type" {
  value = var.server_instance_type
}

# Note: the output name is plural while the variable is `search_thread`.
output "search_threads" {
  value = var.search_thread
}

output "setup_name" {
  value = var.setup_name
}

output "server_instance_count" {
  value = var.server_instance_count
}
26 changes: 26 additions & 0 deletions
26
terraform/re-1node-1shards-1threads-gpu-g6-2xlarge/prepare_ips.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
"""Emit a bash variable file describing the cluster built by Terraform.

Runs ``terraform output -json`` in the current directory and prints shell
assignments (node count, search threads, cluster name, internal and external
IP addresses) to stdout.
"""
import json
import subprocess

# Prefix stripped from the Terraform ``setup_name`` to obtain the cluster name.
SETUP_NAME_PREFIX = "perf-cto-RE-"


def normalize_cluster_name(setup_name):
    """Return *setup_name* with '.' and '-' replaced by '_' and the prefix removed.

    The prefix is removed only when it is actually present. The original code
    sliced off ``len(prefix)`` characters unconditionally, which silently
    truncated names that did not start with the prefix.
    """
    sanitized = setup_name.replace(".", "_").replace("-", "_")
    # The prefix has to be compared in its sanitized form, because the dashes
    # in the name have already been replaced by underscores at this point.
    sanitized_prefix = SETUP_NAME_PREFIX.replace(".", "_").replace("-", "_")
    if sanitized.startswith(sanitized_prefix):
        return sanitized[len(sanitized_prefix):]
    return sanitized


def render_env_script(terraform_outputs):
    """Build the bash variable file text from parsed ``terraform output -json``.

    *terraform_outputs* must contain the ``server_private_ip``,
    ``server_public_ip``, ``search_threads`` and ``setup_name`` outputs, each
    as a mapping with a ``value`` entry. A missing output raises ``KeyError``.
    The returned text has no trailing newline; ``print`` adds it.
    """
    private_ips = terraform_outputs["server_private_ip"]["value"]
    public_ips = terraform_outputs["server_public_ip"]["value"]
    cluster_name = normalize_cluster_name(terraform_outputs["setup_name"]["value"])

    # Entries ending in "\n" produce a blank line after them once joined,
    # matching the one-print-per-line layout of the original script.
    lines = [
        "#!/bin/bash\n",
        "TOTAL_NODES={}\n".format(len(private_ips)),
        "SEARCH_THREADS={}\n".format(terraform_outputs["search_threads"]["value"]),
        'CLUSTER_NAME="{}"\n'.format(cluster_name),
        "\n#internal IP addresses",
    ]
    # Machines are numbered from 1: B_M1_I, B_M2_I, ...
    lines.extend(
        "B_M{}_I={}".format(node, ip) for node, ip in enumerate(private_ips, start=1)
    )
    lines.append("\n#external IP addresses")
    lines.extend(
        "B_M{}_E={}".format(node, ip) for node, ip in enumerate(public_ips, start=1)
    )
    return "\n".join(lines)


def main():
    """Run ``terraform output -json`` and print the generated script."""
    # check=True makes a failing terraform call raise CalledProcessError here,
    # instead of a confusing JSON decode error on empty stdout further down.
    completed = subprocess.run(
        ["terraform", "output", "-json"],
        stdout=subprocess.PIPE,
        check=True,
    )
    print(render_env_script(json.loads(completed.stdout.decode())))


if __name__ == "__main__":
    main()
17 changes: 17 additions & 0 deletions
17
terraform/re-1node-1shards-1threads-gpu-g6-2xlarge/shared_resources.tf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# provider | ||
# AWS provider; the region is taken from var.region.
provider "aws" {
  region = var.region
}
|
||
################################################################################ | ||
# This is the shared resources bucket key -- you will need it across environments like security rules,etc... | ||
# !! do not change this !! | ||
################################################################################ | ||
# Read-only view of the shared infrastructure state. This setup consumes its
# `subnet_public_id` and `performance_cto_sg_id` outputs.
data "terraform_remote_state" "shared_resources" {
  backend = "s3"

  config = {
    region = "us-east-1"
    bucket = "performance-cto-group"
    key    = "benchmarks/infrastructure/shared_resources.tfstate"
  }
}
171 changes: 171 additions & 0 deletions
171
terraform/re-1node-1shards-1threads-gpu-g6-2xlarge/variables.tf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
################################################################################ | ||
# Variables used for deployment tag | ||
################################################################################ | ||
|
||
# Exposed as the `search_threads` output.
# NOTE(review): the setup name says "1threads" while the default here is "6" —
# confirm which value is intended.
variable "search_thread" {
  description = "number of search threads"
  default     = "6"
}
|
||
# Identifies this setup; embedded in the instance and volume Name tags.
variable "setup_name" {
  description = "setup name"
  default     = "perf-cto-RE-g6.2xlarge-1shards-1threads-gpu"
}

# --- Provenance tags (default to "N/A" when not supplied) ---------------------

variable "github_actor" {
  description = "The name of the person or app that initiated the deployment."
  default     = "N/A"
}

variable "github_repo" {
  description = " The owner and repository name. For example, testing-infrastructure."
  default     = "N/A"
}

variable "triggering_env" {
  description = " The triggering environment. For example circleci."
  default     = "N/A"
}

# Used for both the Environment and Project tags of the instance.
variable "environment" {
  description = " The cost tag."
  default     = "VecSim-Nvidia"
}

variable "github_org" {
  description = " The owner name. For example, RedisModules."
  default     = "N/A"
}

variable "github_sha" {
  description = "The commit SHA that triggered the deployment."
  default     = "N/A"
}

# --- Watchdog -----------------------------------------------------------------

variable "timeout_secs" {
  description = "The maximum time to wait prior destroying the VM via the watchdog."
  default     = "3600"
}
|
||
|
||
|
||
################################################################################ | ||
# Access keys | ||
################################################################################ | ||
# Local path of the SSH private key; read with file() by the remote-exec
# connection block.
variable "private_key" {
  description = "private key"
  default     = "/tmp/benchmarks.redislabs.pem"
}

# Name of the EC2 key pair attached to the instance.
variable "key_name" {
  description = "key name"
  default     = "perf-cto-us-east-2"
}

# Region used by the AWS provider.
variable "region" {
  default = "us-east-2"
}
# Server AMI (Ubuntu 20.04):
# us-east-2 Focal Fossa 20.04 Deep Learning AMI GPU CUDA 11.4.3 (Ubuntu 20.04) 20221115
variable "instance_ami" {
  description = "AMI for aws EC2 instance - us-east-2 Ubuntu 20.04 - amd64"
  default     = "ami-01cc20b11fcde36cb"
}

# Client AMI (Ubuntu 22.04 per its description).
variable "client_instance_ami" {
  description = "AMI for aws EC2 instance - us-east-2 Ubuntu 22.04 - amd64"
  default     = "ami-024e6efaf93d85776"
}
|
||
variable "instance_device_name" {
  description = "EC2 instance device name"
  default     = "/dev/sda1"
}

variable "redis_module" {
  description = "redis_module"
  default     = "N/A"
}

# --- Server root volume -------------------------------------------------------

variable "instance_volume_size" {
  description = "EC2 instance volume_size"
  default     = "256"
}

variable "instance_volume_type" {
  description = "EC2 instance volume_type"
  default     = "gp3"
}

variable "instance_volume_iops" {
  description = "EC2 instance volume_iops"
  default     = "3000"
}

# --- Client volume ------------------------------------------------------------

variable "client_instance_volume_size" {
  description = "EC2 instance volume_size"
  default     = "256"
}

variable "client_instance_volume_type" {
  description = "EC2 instance volume_type"
  default     = "gp3"
}

# --- Encryption flags ---------------------------------------------------------

variable "instance_volume_encrypted" {
  description = "EC2 instance instance_volume_encrypted"
  default     = "false"
}

variable "instance_root_block_device_encrypted" {
  description = "EC2 instance instance_root_block_device_encrypted"
  default     = "false"
}
|
||
# --- CPU topology -------------------------------------------------------------

variable "instance_cpu_threads_per_core" {
  description = "CPU threads per core for aws EC2 instance"
  default     = 1
}

variable "instance_cpu_threads_per_core_hyperthreading" {
  description = "CPU threads per core when hyperthreading is enabled for aws EC2 instance"
  default     = 2
}

# --- Networking ---------------------------------------------------------------

variable "instance_network_interface_plus_count" {
  description = "number of additional network interfaces to add to aws EC2 instance"
  default     = 0
}

# --- OS / login ---------------------------------------------------------------

variable "os" {
  description = "os"
  default     = "ubuntu20.04"
}

# Login user for the SSH connection of the remote-exec provisioner.
variable "ssh_user" {
  description = "ssh_user"
  default     = "ubuntu"
}
|
||
################################################################################ | ||
# Specific DB machine variables | ||
################################################################################ | ||
# EC2 instance type of the DB server.
variable "server_instance_type" {
  description = "type for aws EC2 instance"
  default     = "g6.2xlarge"
}

# Number of DB server instances to create (the `count` of the server resource).
variable "server_instance_count" {
  description = "count of aws EC2 instances"
  default     = 1
}

variable "server_instance_cpu_core_count" {
  description = "CPU core count for aws EC2 instance"
  default     = 4
}