Skip to content

Commit

Permalink
Added single node GPU instance (2xlarge) (#95)
Browse files Browse the repository at this point in the history
* Added single node GPU instance (2xlarge)

* Updated ami of gpu instance
  • Loading branch information
filipecosta90 authored Apr 23, 2024
1 parent 6d0b094 commit b867252
Show file tree
Hide file tree
Showing 6 changed files with 309 additions and 0 deletions.
11 changes: 11 additions & 0 deletions terraform/re-1node-1shards-1threads-gpu-g6-2xlarge/common.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

################################################################################
# This is the bucket holding this specific setup tfstate
################################################################################
terraform {
  # Remote state for this setup is stored in S3.
  # NOTE(review): no `key` is set in this block, so the state object path is
  # presumably supplied at `terraform init` time (e.g. -backend-config) --
  # confirm against the calling automation.
  backend "s3" {
    bucket = "performance-cto-group"
    region = "us-east-1"
  }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@

# DB server instance(s) for this setup. `count` instances are created in the
# public subnet and security group exported by the shared-resources state,
# and each one is tagged with the deployment metadata variables.
resource "aws_instance" "server_2a" {
  count                  = var.server_instance_count
  ami                    = var.instance_ami
  instance_type          = var.server_instance_type
  subnet_id              = data.terraform_remote_state.shared_resources.outputs.subnet_public_id
  vpc_security_group_ids = [data.terraform_remote_state.shared_resources.outputs.performance_cto_sg_id]
  key_name               = var.key_name
  # NOTE(review): the availability zone is hard-coded here (and in the Name
  # tags below); it presumably has to match the shared subnet -- confirm
  # before changing the region.
  availability_zone = "us-east-2a"

  root_block_device {
    volume_size           = var.instance_volume_size
    volume_type           = var.instance_volume_type
    encrypted             = var.instance_volume_encrypted
    delete_on_termination = true
  }

  # Tags applied to the instance's volumes.
  # NOTE(review): `Project` is populated from var.environment, same as
  # `Environment` -- confirm this is intentional.
  volume_tags = {
    Environment    = var.environment
    Project        = var.environment
    Name           = "ebs_block_device-${var.setup_name}-DB-us-east-2a-${count.index + 1}"
    setup          = var.setup_name
    triggering_env = var.triggering_env
    github_actor   = var.github_actor
    github_org     = var.github_org
    github_repo    = var.github_repo
    github_sha     = var.github_sha
  }

  # Tags applied to the instance itself (same metadata as the volume tags).
  tags = {
    Environment    = var.environment
    Project        = var.environment
    Name           = "${var.setup_name}-DB-us-east-2a-${count.index + 1}"
    setup          = var.setup_name
    triggering_env = var.triggering_env
    github_actor   = var.github_actor
    github_org     = var.github_org
    github_repo    = var.github_repo
    github_sha     = var.github_sha
  }

  ################################################################################
  # This will ensure we wait here until the instance is ready to receive the ssh connection
  ################################################################################
  provisioner "remote-exec" {
    script = "./../scripts/wait_for_instance.sh"
    connection {
      host        = self.public_ip # `self` is the resource being provisioned (this server)
      type        = "ssh"
      user        = var.ssh_user
      private_key = file(var.private_key)
      # The timeout needs to be larger than 5m for metal instances.
      timeout = "5m"
      # Authenticate with the private key above rather than an ssh-agent.
      agent = false
    }
  }

  ################################################################################
  # Deployment related
  ################################################################################
}
23 changes: 23 additions & 0 deletions terraform/re-1node-1shards-1threads-gpu-g6-2xlarge/output.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Public IPs of every server_2a instance (one list entry per instance).
output "server_public_ip" {
  value = aws_instance.server_2a[*].public_ip
}

# Private IPs of every server_2a instance (one list entry per instance).
output "server_private_ip" {
  value = aws_instance.server_2a[*].private_ip
}

# The remaining outputs echo input variables back to the caller.
output "server_instance_type" {
  value = var.server_instance_type
}

# Note the naming: the output is plural, the variable singular.
output "search_threads" {
  value = var.search_thread
}

output "setup_name" {
  value = var.setup_name
}

output "server_instance_count" {
  value = var.server_instance_count
}
26 changes: 26 additions & 0 deletions terraform/re-1node-1shards-1threads-gpu-g6-2xlarge/prepare_ips.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import subprocess
import json

"""Print a bash variable file derived from ``terraform output -json``.

The script writes to stdout: node count, search threads, cluster name and
one ``B_M<n>_I`` / ``B_M<n>_E`` assignment per server for the internal and
external IP addresses.
"""

# Prefix removed from the setup name to form the cluster name.
PREFIX = "perf-cto-RE-"


def sanitize(name):
    """Return *name* with every "." and "-" replaced by "_"."""
    return name.replace(".", "_").replace("-", "_")


def cluster_name(setup_name, prefix=PREFIX):
    """Return the sanitized setup name without its leading *prefix*.

    The prefix is only removed when it is actually present. Slicing
    unconditionally would silently drop the first characters of any setup
    name that does not follow the prefix convention.
    """
    cleaned = sanitize(setup_name)
    cleaned_prefix = sanitize(prefix)
    if cleaned.startswith(cleaned_prefix):
        return cleaned[len(cleaned_prefix):]
    return cleaned


def render_lines(output_json):
    """Build the strings to print, in order, from the parsed terraform output.

    Args:
        output_json: dict parsed from ``terraform output -json``; must hold
            the ``server_private_ip``, ``server_public_ip``,
            ``search_threads`` and ``setup_name`` entries, each with a
            ``value`` key.

    Returns:
        A list of strings; each one is meant to be passed to ``print`` as-is
        (embedded newlines produce the blank separator lines).

    Raises:
        KeyError: if one of the expected outputs is missing.
    """
    private_ips = output_json["server_private_ip"]["value"]
    public_ips = output_json["server_public_ip"]["value"]
    search_threads = output_json["search_threads"]["value"]
    name = cluster_name(output_json["setup_name"]["value"])

    lines = [
        "#!/bin/bash\n",
        "TOTAL_NODES={}\n".format(len(private_ips)),
        "SEARCH_THREADS={}\n".format(search_threads),
        'CLUSTER_NAME="{}"\n'.format(name),
        "\n#internal IP addresses",
    ]
    lines.extend(
        "B_M{}_I={}".format(index, ip)
        for index, ip in enumerate(private_ips, start=1)
    )
    lines.append("\n#external IP addresses")
    lines.extend(
        "B_M{}_E={}".format(index, ip)
        for index, ip in enumerate(public_ips, start=1)
    )
    return lines


def main():
    """Run ``terraform output -json`` and print the derived variable file."""
    # check=True surfaces a terraform failure directly instead of a confusing
    # JSON decode error on empty stdout.
    result = subprocess.run(
        ["terraform", "output", "-json"],
        stdout=subprocess.PIPE,
        check=True,
    )
    output_json = json.loads(result.stdout.decode())
    for line in render_lines(output_json):
        print(line)


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# AWS provider; the target region is taken from var.region.
provider "aws" {
  region = var.region
}

################################################################################
# This is the shared resources bucket key -- you will need it across environments like security rules,etc...
# !! do not change this !!
################################################################################
# Read-only view of the shared-resources state stored in S3; its outputs are
# referenced as data.terraform_remote_state.shared_resources.outputs.<name>.
data "terraform_remote_state" "shared_resources" {
  backend = "s3"

  config = {
    bucket = "performance-cto-group"
    key    = "benchmarks/infrastructure/shared_resources.tfstate"
    region = "us-east-1"
  }
}
171 changes: 171 additions & 0 deletions terraform/re-1node-1shards-1threads-gpu-g6-2xlarge/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
################################################################################
# Variables used for deployment tag
################################################################################

# NOTE(review): the default is "6" while the default setup name mentions
# "1threads" -- confirm which one is intended.
variable "search_thread" {
  description = "number of search threads"
  default     = "6"
}

# Name of this setup.
variable "setup_name" {
  description = "setup name"
  default     = "perf-cto-RE-g6.2xlarge-1shards-1threads-gpu"
}

# --- Deployment provenance; all default to "N/A" when not supplied ---

variable "github_actor" {
  description = "The name of the person or app that initiated the deployment."
  default     = "N/A"
}

variable "github_repo" {
  description = " The owner and repository name. For example, testing-infrastructure."
  default     = "N/A"
}

variable "triggering_env" {
  description = " The triggering environment. For example circleci."
  default     = "N/A"
}

# Cost-allocation tag value.
variable "environment" {
  description = " The cost tag."
  default     = "VecSim-Nvidia"
}

variable "github_org" {
  description = " The owner name. For example, RedisModules."
  default     = "N/A"
}

variable "github_sha" {
  description = "The commit SHA that triggered the deployment."
  default     = "N/A"
}

# Watchdog timeout, in seconds, expressed as a string.
variable "timeout_secs" {
  description = "The maximum time to wait prior destroying the VM via the watchdog."
  default     = "3600"
}



################################################################################
# Access keys
################################################################################
# Local filesystem path of the SSH private key.
variable "private_key" {
  description = "private key"
  default     = "/tmp/benchmarks.redislabs.pem"
}

# Name of the EC2 key pair.
variable "key_name" {
  description = "key name"
  default     = "perf-cto-us-east-2"
}

# AWS region.
variable "region" {
  default = "us-east-2"
}
# Server image (Ubuntu 20.04):
# us-east-2 Focal Fossa 20.04 Deep Learning AMI GPU CUDA 11.4.3 (Ubuntu 20.04) 20221115
variable "instance_ami" {
  description = "AMI for aws EC2 instance - us-east-2 Ubuntu 20.04 - amd64"
  default     = "ami-01cc20b11fcde36cb"
}

# Client image (Ubuntu 22.04 per its description).
variable "client_instance_ami" {
  description = "AMI for aws EC2 instance - us-east-2 Ubuntu 22.04 - amd64"
  default     = "ami-024e6efaf93d85776"
}

variable "instance_device_name" {
  description = "EC2 instance device name"
  default     = "/dev/sda1"
}

variable "redis_module" {
  description = "redis_module"
  default     = "N/A"
}

# --- Server volume settings ---

variable "instance_volume_size" {
  description = "EC2 instance volume_size"
  default     = "256"
}

variable "instance_volume_type" {
  description = "EC2 instance volume_type"
  default     = "gp3"
}

variable "instance_volume_iops" {
  description = "EC2 instance volume_iops"
  default     = "3000"
}

# --- Client volume settings ---

variable "client_instance_volume_size" {
  description = "EC2 instance volume_size"
  default     = "256"
}

variable "client_instance_volume_type" {
  description = "EC2 instance volume_type"
  default     = "gp3"
}

# --- Encryption flags (given as the strings "true"/"false") ---

variable "instance_volume_encrypted" {
  description = "EC2 instance instance_volume_encrypted"
  default     = "false"
}

variable "instance_root_block_device_encrypted" {
  description = "EC2 instance instance_root_block_device_encrypted"
  default     = "false"
}

# --- CPU / network options ---

variable "instance_cpu_threads_per_core" {
  description = "CPU threads per core for aws EC2 instance"
  default     = 1
}

variable "instance_cpu_threads_per_core_hyperthreading" {
  description = "CPU threads per core when hyperthreading is enabled for aws EC2 instance"
  default     = 2
}

variable "instance_network_interface_plus_count" {
  description = "number of additional network interfaces to add to aws EC2 instance"
  default     = 0
}

# --- OS / login ---

variable "os" {
  description = "os"
  default     = "ubuntu20.04"
}

# User name for SSH logins.
variable "ssh_user" {
  description = "ssh_user"
  default     = "ubuntu"
}

################################################################################
# Specific DB machine variables
################################################################################
# EC2 instance type of the DB server.
variable "server_instance_type" {
  description = "type for aws EC2 instance"
  default     = "g6.2xlarge"
}

# Number of DB server instances.
variable "server_instance_count" {
  description = "count of aws EC2 instances"
  default     = 1
}

variable "server_instance_cpu_core_count" {
  description = "CPU core count for aws EC2 instance"
  default     = 4
}

0 comments on commit b867252

Please sign in to comment.