Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions deployment/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ crash.*.log
*.tfvars
*.tfvars.json
!azure-tf/setups/*
!aws-tf/setups/*

# Ignore override files as they are usually used to override resources locally and so
# are not checked in
Expand Down
15 changes: 15 additions & 0 deletions deployment/aws-tf/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
This Terraform configuration is specific to PSL; it encodes PSL's experiment topology and instance choices.

VM categories:
- `sevpool`: AMD SEV-SNP nodes m6a.4xlarge. These will not have too much storage in them, per PSL spec.
- `storagepool`: Nodes with big disks. No TEEs in them.
- `clientpool`: Similar to the storagepool but compute-oriented (c6a family) and without big disks. Used for clients.


Cross-region deployment is currently not supported: everything is deployed into a single region, expected to be US East (Ohio, us-east-2).

1 VPC with the following private IP configs:
- `sevpool` gets 10.0.1.0/24
- `storagepool` gets 10.0.2.0/24
- `clientpool` gets 10.0.3.0/24
73 changes: 73 additions & 0 deletions deployment/aws-tf/init.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/bin/bash

# Copyright (c) Shubham Mishra. All rights reserved.
# Licensed under the MIT License.

# Cloud-init user-data for PSL nodes. Runs once, as root, at first boot.
# apt must never prompt: there is no tty during user-data execution.
export DEBIAN_FRONTEND=noninteractive

# Change the default username from ubuntu to psladmin
usermod -l psladmin ubuntu
usermod -m -d /home/psladmin psladmin

# Passwordless sudo for psladmin. A drop-in under /etc/sudoers.d is safer
# than appending to /etc/sudoers: a malformed entry there can lock out sudo
# entirely, while drop-ins are validated independently.
echo "psladmin ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/psladmin
chmod 0440 /etc/sudoers.d/psladmin


# Docker keys and repos
apt-get update
# BUGFIX: -y was missing here; without it apt-get waits for confirmation and
# the script stalls in the non-interactive user-data environment.
apt-get install -y ca-certificates curl
install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
chmod a+r /etc/apt/keyrings/docker.asc

# Add the repository to Apt sources:
echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
tee /etc/apt/sources.list.d/docker.list > /dev/null
apt-get update

# Install Docker
apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin

# Non-root run: let psladmin use docker without sudo.
usermod -aG docker psladmin

# Restart on reboot
systemctl enable docker.service
systemctl enable containerd.service

# PSL dependencies: build toolchain, profiling tools, protobuf, rocksdb, python.
apt-get install -y screen
apt-get install -y build-essential cmake clang llvm pkg-config
apt-get install -y jq
apt-get install -y protobuf-compiler
apt-get install -y linux-tools-common linux-tools-generic linux-tools-"$(uname -r)"
apt-get install -y net-tools
apt-get install -y ca-certificates curl libssl-dev
apt-get install -y librocksdb-dev libprotobuf-dev
apt-get install -y python3-pip python3-virtualenv


# Increase open file limits

echo "* soft nofile 50000" >> /etc/security/limits.conf
echo "* hard nofile 50000" >> /etc/security/limits.conf


# Mount the EBS SSD.
# AWS + Ubuntu 24.04 => The name for disk is /dev/nvme1n1
# It may not be present (for sevpool and clientpool)

if [ -b /dev/nvme1n1 ]; then
    mkfs.ext4 /dev/nvme1n1
    mkdir -p /data
    mount /dev/nvme1n1 /data
    # BUGFIX: persist the mount; without an fstab entry a reboot (common with
    # spot instances) silently drops /data. UUID survives NVMe renumbering,
    # and nofail keeps the instance bootable if the device is absent.
    echo "UUID=$(blkid -s UUID -o value /dev/nvme1n1) /data ext4 defaults,nofail 0 2" >> /etc/fstab
    chmod -R 777 /data
fi


# Turns out AWS lets you login to the instance before this script ends executing.
# We will have a flag file to check for finishing.

echo "VM Ready" > /home/psladmin/ready.txt
# Written by root; hand it to the login user so it can be removed/rewritten.
chown psladmin /home/psladmin/ready.txt

271 changes: 271 additions & 0 deletions deployment/aws-tf/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
# First create a VPC with Internet Gateway
# Allow SSH access and allow all traffic within the VPC

# Single VPC (10.0.0.0/16) hosting all three pools; per-pool /24 subnets
# are carved out of it below.
resource "aws_vpc" "main" {
  cidr_block           = "10.0.0.0/16"
  enable_dns_hostnames = true
  enable_dns_support   = true
  tags = {
    Name    = local.vpc_name
    Project = local.project_name
  }
}

# Internet gateway so instances with public IPs can reach the internet
# (init.sh needs outbound access for apt and Docker downloads).
resource "aws_internet_gateway" "gateway" {
  vpc_id = aws_vpc.main.id
  tags = {
    Name    = "${local.vpc_name}-internet-gateway"
    Project = local.project_name
  }
}

# Default route (0.0.0.0/0) on the VPC's main route table through the
# internet gateway. Subnets below define no explicit route table, so they
# implicitly use this main route table.
resource "aws_route" "route_to_internet" {
  route_table_id         = aws_vpc.main.main_route_table_id
  destination_cidr_block = "0.0.0.0/0"
  gateway_id             = aws_internet_gateway.gateway.id
}

# Ports

# One security group shared by every pool; the individual rules are attached
# as separate rule resources below.
resource "aws_security_group" "psl_sg" {
  name        = "${local.project_name}-psl-sg"
  description = "Allow SSH and PSL ports"
  vpc_id      = aws_vpc.main.id

  tags = {
    Name    = "${local.project_name}-psl-sg"
    Project = local.project_name
  }
}

# NOTE(review): SSH is open to the entire internet (0.0.0.0/0). Acceptable
# for short-lived experiment clusters; consider restricting to a trusted CIDR.
resource "aws_vpc_security_group_ingress_rule" "allow_ssh" {
  security_group_id = aws_security_group.psl_sg.id
  cidr_ipv4         = "0.0.0.0/0"
  from_port         = 22
  to_port           = 22
  ip_protocol       = "tcp"
}

# Unrestricted egress (made explicit rather than relying on defaults).
resource "aws_vpc_security_group_egress_rule" "allow_default_egress" {
  security_group_id = aws_security_group.psl_sg.id
  cidr_ipv4         = "0.0.0.0/0"
  ip_protocol       = "all"
}

# Any port/protocol is allowed between instances inside the VPC CIDR, so
# PSL's own ports need no individual ingress rules.
resource "aws_vpc_security_group_ingress_rule" "allow_all_within_vpc" {
  security_group_id = aws_security_group.psl_sg.id
  cidr_ipv4         = aws_vpc.main.cidr_block
  ip_protocol       = "all"
}


# Then create a subnet for each pool

resource "aws_subnet" "sevpool" {
vpc_id = aws_vpc.main.id
cidr_block = "10.0.1.0/24"
tags = {
Name = local.sevpool_subnet_name
Project = local.project_name
}
}

resource "aws_subnet" "storagepool" {
vpc_id = aws_vpc.main.id
cidr_block = "10.0.2.0/24"
tags = {
Name = local.storagepool_subnet_name
Project = local.project_name
}
}

resource "aws_subnet" "clientpool" {
vpc_id = aws_vpc.main.id
cidr_block = "10.0.3.0/24"
tags = {
Name = local.clientpool_subnet_name
Project = local.project_name
}
}

# Generate a key pair for SSH

# NOTE(review): the generated private key is stored UNENCRYPTED in the
# Terraform state file — protect the state (or retrieve the key via an
# output and restrict state access).
resource "tls_private_key" "ssh_key" {
  algorithm = "RSA"
  rsa_bits  = 4096
}

# Register the public half with EC2 so instances can reference it by key_name.
resource "aws_key_pair" "ssh_key" {
  key_name   = local.key_name
  public_key = tls_private_key.ssh_key.public_key_openssh
}


# Now the EC2 instances

# Latest Canonical-published (owner 099720109477) Ubuntu 24.04 "noble"
# amd64 gp3 server AMI. most_recent = true tracks new point releases, so
# the resolved AMI ID can change between plans.
data "aws_ami" "ubuntu_24_04" {
  most_recent = true
  owners      = ["099720109477"]

  # Ubuntu AMI ID search
  filter {
    name   = "name"
    values = ["ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-amd64-server-*"]
  }

  filter {
    name   = "virtualization-type"
    values = ["hvm"]
  }
}

# AMD SEV-SNP worker pool (spot instances, optional 1 TiB gp3 data disk).
resource "aws_instance" "sevpool" {
  ami = data.aws_ami.ubuntu_24_04.id

  # 4xlarge and lower do not have fixed network performance.
  instance_type = "m6a.4xlarge"
  subnet_id     = aws_subnet.sevpool.id
  count         = var.sevpool_count

  associate_public_ip_address = true

  key_name = aws_key_pair.ssh_key.key_name

  # BUGFIX: instances launched into a VPC subnet must use
  # vpc_security_group_ids. The legacy `security_groups` argument expects
  # group *names* (EC2-Classic / default VPC); passing an ID through it makes
  # Terraform plan a destroy/recreate of the instance on every apply.
  vpc_security_group_ids = [aws_security_group.psl_sg.id]

  tags = {
    Name          = "${local.sevpool_instance_name}-${count.index}"
    InstanceGroup = local.sevpool_instance_name
    Project       = local.project_name
  }

  # Enable AMD SEV-SNP memory encryption (supported on the m6a family).
  cpu_options {
    amd_sev_snp = "enabled"
  }

  instance_market_options {
    market_type = "spot"
    spot_options {
      # max_price is a string in the provider schema.
      max_price = "1.6" # Set it to the on-demand price. For SEV SNP, they charge 10% higher.
      # This way the chance of getting evicted is lower.
    }
  }

  root_block_device {
    volume_size = 64 # GiB. Disk is not important for worker nodes. Still needed to save logs.
    # This is enough for sensible experiment schedules.
  }

  # Only deploy ebs block device if the flag is set.
  dynamic "ebs_block_device" {
    for_each = var.deploy_ebs_on_sevpool ? [1] : []
    content {
      device_name           = "/dev/sdb" # AWS + Ubuntu 24.04 => The name for disk is /dev/nvme1n1; this name doesn't matter.
      volume_type           = "gp3"
      volume_size           = 1024 # GiB
      delete_on_termination = true
      throughput            = 1000  # MiB/s
      iops                  = 16000 # This is the max for gp3.
    }
  }

  # path.module makes the lookup independent of the directory terraform is
  # invoked from; "./init.sh" only resolves when cwd is the module directory.
  user_data_base64 = filebase64("${path.module}/init.sh")

  # Leave the rest of the config as default.
}

# Storage pool: same instance family as sevpool but no SEV-SNP, and always
# carries a 1 TiB gp3 data disk (mounted at /data by init.sh).
resource "aws_instance" "storagepool" {
  ami = data.aws_ami.ubuntu_24_04.id

  # 4xlarge and lower do not have fixed network performance.
  instance_type = "m6a.4xlarge"
  subnet_id     = aws_subnet.storagepool.id
  count         = var.storagepool_count

  associate_public_ip_address = true

  key_name = aws_key_pair.ssh_key.key_name

  # BUGFIX: use vpc_security_group_ids for instances in a VPC subnet; the
  # legacy `security_groups` argument expects names and, given an ID, forces
  # instance replacement on every apply.
  vpc_security_group_ids = [aws_security_group.psl_sg.id]

  tags = {
    Name          = "${local.storagepool_instance_name}-${count.index}"
    InstanceGroup = local.storagepool_instance_name
    Project       = local.project_name
  }

  instance_market_options {
    market_type = "spot"
    spot_options {
      # max_price is a string in the provider schema.
      max_price = "1.6" # Set it to the on-demand price.
      # This way the chance of getting evicted is lower.
    }
  }

  root_block_device {
    volume_size = 64 # GiB. This will only be used for logs.
  }

  ebs_block_device {
    device_name           = "/dev/sdb" # AWS + Ubuntu 24.04 => The name for disk is /dev/nvme1n1; this name doesn't matter.
    volume_type           = "gp3"
    volume_size           = 1024 # GiB
    delete_on_termination = true
    throughput            = 1000  # MiB/s
    iops                  = 16000 # This is the max for gp3.
  }

  # path.module keeps the file lookup independent of the invocation cwd.
  user_data_base64 = filebase64("${path.module}/init.sh")

  # Leave the rest of the config as default.
}

# Client pool: compute-oriented nodes for request generation; no data disk.
resource "aws_instance" "clientpool" {
  ami = data.aws_ami.ubuntu_24_04.id

  # 4xlarge and lower do not have fixed network performance.
  # Client devices don't need much memory, but they need a compute. (For generating requests and compiling)
  # Because of this, we use the c6a family. Saves cost.
  instance_type = "c6a.4xlarge"
  subnet_id     = aws_subnet.clientpool.id
  count         = var.clientpool_count

  associate_public_ip_address = true

  key_name = aws_key_pair.ssh_key.key_name

  # BUGFIX: use vpc_security_group_ids for instances in a VPC subnet; the
  # legacy `security_groups` argument expects names and, given an ID, forces
  # instance replacement on every apply.
  vpc_security_group_ids = [aws_security_group.psl_sg.id]

  tags = {
    Name          = "${local.clientpool_instance_name}-${count.index}"
    InstanceGroup = local.clientpool_instance_name
    Project       = local.project_name
  }

  instance_market_options {
    market_type = "spot"
    spot_options {
      # max_price is a string in the provider schema.
      max_price = "1.6" # Set it to the on-demand price.
      # This way the chance of getting evicted is lower.
    }
  }

  root_block_device {
    volume_size = 64 # GiB. Disk is not important for client nodes. Still needed to save logs.
    # This is enough for sensible experiment schedules.
  }

  # path.module keeps the file lookup independent of the invocation cwd.
  user_data_base64 = filebase64("${path.module}/init.sh")

  # Leave the rest of the config as default.
}
19 changes: 19 additions & 0 deletions deployment/aws-tf/providers.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Provider requirements for the PSL AWS deployment.
terraform {
  required_providers {
    aws = {
      source = "hashicorp/aws"
      # NOTE(review): exact pin, unlike random/tls below which use "~>".
      # Consider "~> 6.4" for consistency if patch upgrades are acceptable.
      version = "6.4.0"
    }
    random = {
      source  = "hashicorp/random"
      version = "~>3.0"
    }
    tls = {
      source  = "hashicorp/tls"
      version = "~>4.0"
    }
  }
}

# Region and credentials are intentionally left unset here: they come from
# the standard AWS environment (AWS_REGION / AWS_PROFILE / shared config).
provider "aws" {
}
4 changes: 4 additions & 0 deletions deployment/aws-tf/setups/max.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# "max" setup: the full-size PSL experiment cluster.
sevpool_count         = 16    # AMD SEV-SNP workers (m6a.4xlarge)
storagepool_count     = 5     # storage nodes with 1 TiB gp3 data disks
clientpool_count      = 3     # client / load-generator nodes (c6a.4xlarge)
deploy_ebs_on_sevpool = false # no extra 1 TiB EBS volume on sevpool nodes
Loading