diff --git a/deployment/.gitignore b/deployment/.gitignore index ff58ad30..c78cc881 100644 --- a/deployment/.gitignore +++ b/deployment/.gitignore @@ -29,6 +29,7 @@ crash.*.log *.tfvars *.tfvars.json !azure-tf/setups/* +!aws-tf/setups/* # Ignore override files as they are usually used to override resources locally and so # are not checked in diff --git a/deployment/aws-tf/README.md b/deployment/aws-tf/README.md new file mode 100644 index 00000000..d8a3a38d --- /dev/null +++ b/deployment/aws-tf/README.md @@ -0,0 +1,15 @@ +This is specific to PSL. + +VM categories: +- `sevpool`: AMD SEV-SNP nodes m6a.4xlarge. These will not have too much storage in them, per PSL spec. +- `storagepool`: Nodes with big disks. No TEEs in them. +- `clientpool`: Same config (probably) as the storagepool, but no big disks. Used for clients. + + +Currently not supporting cross-region deployment. +Everything likely to be in US East (Ohio). + +1 VPC with the following private IP configs: +- `sevpool` gets 10.0.1.0/24 +- `storagepool` gets 10.0.2.0/24 +- `clientpool` gets 10.0.3.0/24 diff --git a/deployment/aws-tf/init.sh b/deployment/aws-tf/init.sh new file mode 100644 index 00000000..cc5bcb11 --- /dev/null +++ b/deployment/aws-tf/init.sh @@ -0,0 +1,73 @@ +#!/bin/bash + +# Copyright (c) Shubham Mishra. All rights reserved. +# Licensed under the MIT License. + +# Runs as root. 
+# Change the default username from ubuntu to psladmin +usermod -l psladmin ubuntu +usermod -m -d /home/psladmin psladmin + +# Add the psladmin user to the sudoers file +echo "psladmin ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers + + +# Docker keys and repos (-y everywhere: user-data runs without a TTY, so any apt prompt would abort the install) +apt-get update +apt-get install -y ca-certificates curl +install -m 0755 -d /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +chmod a+r /etc/apt/keyrings/docker.asc + +# Add the repository to Apt sources: +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null +apt-get update + +# Install Docker +apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin + +# Non-root run +usermod -aG docker psladmin + +# Restart on reboot +systemctl enable docker.service +systemctl enable containerd.service + +# PSL dependencies +apt-get install -y screen +apt-get install -y build-essential cmake clang llvm pkg-config +apt-get install -y jq +apt-get install -y protobuf-compiler +apt-get install -y linux-tools-common linux-tools-generic linux-tools-`uname -r` +apt-get install -y net-tools +apt-get install -y ca-certificates curl libssl-dev +apt-get install -y librocksdb-dev libprotobuf-dev +apt-get install -y python3-pip python3-virtualenv + + +# Increase open file limits + +echo "* soft nofile 50000" >> /etc/security/limits.conf +echo "* hard nofile 50000" >> /etc/security/limits.conf + + +# Mount the EBS SSD. +# AWS + Ubuntu 24.04 => The name for disk is /dev/nvme1n1 +# It may not be present (for sevpool and clientpool) + +if [ -b /dev/nvme1n1 ]; then + mkfs.ext4 /dev/nvme1n1 + mkdir -p /data + mount /dev/nvme1n1 /data + chmod -R 777 /data +fi + + +# Turns out AWS lets you login to the instance before this script ends executing. 
+# We will have a flag file to check for finishing. + +echo "VM Ready" > /home/psladmin/ready.txt + diff --git a/deployment/aws-tf/main.tf b/deployment/aws-tf/main.tf new file mode 100644 index 00000000..90e655bf --- /dev/null +++ b/deployment/aws-tf/main.tf @@ -0,0 +1,271 @@ +# First create a VPC with Internet Gateway +# Allow SSH access and allow all traffic within the VPC + +resource "aws_vpc" "main" { + cidr_block = "10.0.0.0/16" + enable_dns_hostnames = true + enable_dns_support = true + tags = { + Name = local.vpc_name + Project = local.project_name + } +} + +resource "aws_internet_gateway" "gateway" { + vpc_id = aws_vpc.main.id + tags = { + Name = "${local.vpc_name}-internet-gateway" + Project = local.project_name + } +} + +resource "aws_route" "route_to_internet" { + route_table_id = aws_vpc.main.main_route_table_id + destination_cidr_block = "0.0.0.0/0" + gateway_id = aws_internet_gateway.gateway.id +} + +# Ports + +resource "aws_security_group" "psl_sg" { + name = "${local.project_name}-psl-sg" + description = "Allow SSH and PSL ports" + vpc_id = aws_vpc.main.id + + tags = { + Name = "${local.project_name}-psl-sg" + Project = local.project_name + } +} + +resource "aws_vpc_security_group_ingress_rule" "allow_ssh" { + security_group_id = aws_security_group.psl_sg.id + cidr_ipv4 = "0.0.0.0/0" + from_port = 22 + to_port = 22 + ip_protocol = "tcp" +} + +resource "aws_vpc_security_group_egress_rule" "allow_default_egress" { + security_group_id = aws_security_group.psl_sg.id + cidr_ipv4 = "0.0.0.0/0" + ip_protocol = "all" +} + +resource "aws_vpc_security_group_ingress_rule" "allow_all_within_vpc" { + security_group_id = aws_security_group.psl_sg.id + cidr_ipv4 = aws_vpc.main.cidr_block + ip_protocol = "all" +} + + +# Then create a subnet for each pool + +resource "aws_subnet" "sevpool" { + vpc_id = aws_vpc.main.id + cidr_block = "10.0.1.0/24" + tags = { + Name = local.sevpool_subnet_name + Project = local.project_name + } +} + +resource "aws_subnet" 
"storagepool" { + vpc_id = aws_vpc.main.id + cidr_block = "10.0.2.0/24" + tags = { + Name = local.storagepool_subnet_name + Project = local.project_name + } +} + +resource "aws_subnet" "clientpool" { + vpc_id = aws_vpc.main.id + cidr_block = "10.0.3.0/24" + tags = { + Name = local.clientpool_subnet_name + Project = local.project_name + } +} + +# Generate a key pair for SSH + +resource "tls_private_key" "ssh_key" { + algorithm = "RSA" + rsa_bits = 4096 +} + +resource "aws_key_pair" "ssh_key" { + key_name = local.key_name + public_key = tls_private_key.ssh_key.public_key_openssh +} + + +# Now the EC2 instances + +data "aws_ami" "ubuntu_24_04" { + most_recent = true + owners = ["099720109477"] + + # Ubuntu AMI ID search + filter { + name = "name" + values = ["ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-amd64-server-*"] + } + + filter { + name = "virtualization-type" + values = ["hvm"] + } +} + +resource "aws_instance" "sevpool" { + ami = data.aws_ami.ubuntu_24_04.id + + # 4xlarge and lower do not have fixed network performance. + instance_type = "m6a.4xlarge" + subnet_id = aws_subnet.sevpool.id + count = var.sevpool_count + + + + associate_public_ip_address = true + + key_name = aws_key_pair.ssh_key.key_name + + security_groups = [aws_security_group.psl_sg.id] + + tags = { + Name = "${local.sevpool_instance_name}-${count.index}" + InstanceGroup = local.sevpool_instance_name + Project = local.project_name + } + + cpu_options { + amd_sev_snp = "enabled" + } + + instance_market_options { + market_type = "spot" + spot_options { + max_price = 1.6 # Set it to the on-demand price. For SEV SNP, they charge 10% higher. + # This way the chance of getting evicted is lower. + } + } + + root_block_device { + volume_size = 64 # GiB. Disk is not important for worker nodes. Still needed to save logs. + # This is enough for sensible experiment schedules. + } + + # Only deploy ebs block device if the flag is set. + dynamic "ebs_block_device" { + for_each = var.deploy_ebs_on_sevpool ? 
[1] : [] + content { + device_name = "/dev/sdb" # AWS + Ubuntu 24.04 => The name for disk is /dev/nvme1n1; this name doesn't matter. + volume_type = "gp3" + volume_size = 1024 # GiB + delete_on_termination = true + throughput = 1000 # MiB/s + iops = 16000 # This is the max for gp3. + } +} + + user_data_base64 = filebase64("./init.sh") + + # Leave the rest of the config as default. +} + +resource "aws_instance" "storagepool" { + ami = data.aws_ami.ubuntu_24_04.id + + # 4xlarge and lower do not have fixed network performance. + instance_type = "m6a.4xlarge" + subnet_id = aws_subnet.storagepool.id + count = var.storagepool_count + + + + associate_public_ip_address = true + + key_name = aws_key_pair.ssh_key.key_name + + security_groups = [aws_security_group.psl_sg.id] + + tags = { + Name = "${local.storagepool_instance_name}-${count.index}" + InstanceGroup = local.storagepool_instance_name + Project = local.project_name + } + + + instance_market_options { + market_type = "spot" + spot_options { + max_price = 1.6 # Set it to the on-demand price. + # This way the chance of getting evicted is lower. + } + } + + root_block_device { + volume_size = 64 # GiB. This will only be used for logs. + } + + + ebs_block_device { + device_name = "/dev/sdb" # AWS + Ubuntu 24.04 => The name for disk is /dev/nvme1n1; this name doesn't matter. + volume_type = "gp3" + volume_size = 1024 # GiB + delete_on_termination = true + throughput = 1000 # MiB/s + iops = 16000 # This is the max for gp3. + } + user_data_base64 = filebase64("./init.sh") + + # Leave the rest of the config as default. +} + +resource "aws_instance" "clientpool" { + ami = data.aws_ami.ubuntu_24_04.id + + # 4xlarge and lower do not have fixed network performance. + # Client devices don't need much memory, but they need a compute. (For generating requests and compiling) + # Because of this, we use the c6a family. Saves cost. 
+ instance_type = "c6a.4xlarge" + subnet_id = aws_subnet.clientpool.id + count = var.clientpool_count + + + + associate_public_ip_address = true + + key_name = aws_key_pair.ssh_key.key_name + + security_groups = [aws_security_group.psl_sg.id] + + tags = { + Name = "${local.clientpool_instance_name}-${count.index}" + InstanceGroup = local.clientpool_instance_name + Project = local.project_name + } + + + instance_market_options { + market_type = "spot" + spot_options { + max_price = 1.6 # Set it to the on-demand price. + # This way the chance of getting evicted is lower. + } + } + + root_block_device { + volume_size = 64 # GiB. Disk is not important for client nodes. Still needed to save logs. + # This is enough for sensible experiment schedules. + } + + + + user_data_base64 = filebase64("./init.sh") + + # Leave the rest of the config as default. +} \ No newline at end of file diff --git a/deployment/aws-tf/providers.tf b/deployment/aws-tf/providers.tf new file mode 100644 index 00000000..da6933b2 --- /dev/null +++ b/deployment/aws-tf/providers.tf @@ -0,0 +1,19 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "6.4.0" + } + random = { + source = "hashicorp/random" + version = "~>3.0" + } + tls = { + source = "hashicorp/tls" + version = "~>4.0" + } + } +} + +provider "aws" { +} diff --git a/deployment/aws-tf/setups/max.tfvars b/deployment/aws-tf/setups/max.tfvars new file mode 100644 index 00000000..d02af59a --- /dev/null +++ b/deployment/aws-tf/setups/max.tfvars @@ -0,0 +1,4 @@ +sevpool_count = 16 +storagepool_count = 5 +clientpool_count = 3 +deploy_ebs_on_sevpool = false \ No newline at end of file diff --git a/deployment/aws-tf/setups/real-tiny.tfvars b/deployment/aws-tf/setups/real-tiny.tfvars new file mode 100644 index 00000000..4bcc504c --- /dev/null +++ b/deployment/aws-tf/setups/real-tiny.tfvars @@ -0,0 +1,4 @@ +sevpool_count = 0 +storagepool_count = 0 +clientpool_count = 1 +deploy_ebs_on_sevpool = false \ No newline at end 
of file diff --git a/deployment/aws-tf/setups/tiny-consensus-cmp.tfvars b/deployment/aws-tf/setups/tiny-consensus-cmp.tfvars new file mode 100644 index 00000000..9aa32b12 --- /dev/null +++ b/deployment/aws-tf/setups/tiny-consensus-cmp.tfvars @@ -0,0 +1,4 @@ +sevpool_count = 1 +storagepool_count = 1 +clientpool_count = 1 +deploy_ebs_on_sevpool = true \ No newline at end of file diff --git a/deployment/aws-tf/setups/tiny.tfvars b/deployment/aws-tf/setups/tiny.tfvars new file mode 100644 index 00000000..1defd008 --- /dev/null +++ b/deployment/aws-tf/setups/tiny.tfvars @@ -0,0 +1,4 @@ +sevpool_count = 1 +storagepool_count = 1 +clientpool_count = 1 +deploy_ebs_on_sevpool = false \ No newline at end of file diff --git a/deployment/aws-tf/ssh.tf b/deployment/aws-tf/ssh.tf new file mode 100644 index 00000000..e7a34793 --- /dev/null +++ b/deployment/aws-tf/ssh.tf @@ -0,0 +1,14 @@ +output "cluster_private_key" { + value = tls_private_key.ssh_key.private_key_pem + sensitive = true +} + +output "cluster_public_key" { + value = tls_private_key.ssh_key.public_key_pem +} + +output "machine_list" { + # Export every pool (not just clients) as [name, private_ip, public_ip]; the deployment scripts classify VMs by "sev"/"storage"/"client" in the name. + value = [for i in concat(aws_instance.sevpool, aws_instance.storagepool, aws_instance.clientpool) : [i.tags["Name"], i.private_ip, i.public_ip]] +} + diff --git a/deployment/aws-tf/variables.tf b/deployment/aws-tf/variables.tf new file mode 100644 index 00000000..42984e3c --- /dev/null +++ b/deployment/aws-tf/variables.tf @@ -0,0 +1,49 @@ +# Resource group in AWS seems to mean something different from Azure +# Hence using the name `project` here. 
+variable "project_prefix" { + type = string + default = "psl" +} + +resource "random_pet" "rg_name" { + prefix = var.project_prefix +} + +variable "deploy_ebs_on_sevpool" { + type = bool + default = false +} + +variable "sevpool_count" { + type = number + default = 1 +} + +variable "storagepool_count" { + type = number + default = 1 +} + +variable "clientpool_count" { + type = number + default = 1 +} + +variable "username" { + type = string + default = "pftadmin" # This is kept for backward compatibility. AWS doesn't let you change the username from terraform. + # Username will be changed in the init.sh script to psladmin. +} + +locals { + project_name = random_pet.rg_name.id + vpc_name = "${local.project_name}-vpc" + key_name = "${local.project_name}-key" + sevpool_subnet_name = "${local.project_name}-sevpool-subnet" + storagepool_subnet_name = "${local.project_name}-storagepool-subnet" + clientpool_subnet_name = "${local.project_name}-clientpool-subnet" + sevpool_instance_name = "${local.project_name}-sevpool" + storagepool_instance_name = "${local.project_name}-storagepool" + clientpool_instance_name = "${local.project_name}-clientpool" + +} \ No newline at end of file diff --git a/scripts/__main__.py b/scripts/__main__.py index bbe988fc..2c84b7b8 100644 --- a/scripts/__main__.py +++ b/scripts/__main__.py @@ -22,7 +22,7 @@ from crypto import * from app_experiments import AppExperiment from ssh_utils import * -from deployment import Deployment +from deployment import Deployment, AWSDeployment from experiments import Experiment from autobahn_experiments import AutobahnExperiment from results import * @@ -116,7 +116,16 @@ def parse_config(path, workdir=None, existing_experiments=None): curr_time = datetime.datetime.now(datetime.timezone.utc).isoformat() workdir = os.path.join(toml_dict["workdir"], curr_time) - deployment = Deployment(toml_dict["deployment_config"], workdir) + deployment_klass = Deployment + if "provider" in toml_dict["deployment_config"]: + if 
toml_dict["deployment_config"]["provider"] == "aws": + deployment_klass = AWSDeployment + elif toml_dict["deployment_config"]["provider"] == "azure": + deployment_klass = Deployment + else: + raise ValueError(f"Unknown provider: {toml_dict['deployment_config']['provider']}. Valid providers are: aws, azure") + + deployment = deployment_klass(toml_dict["deployment_config"], workdir) base_node_config = toml_dict["node_config"] base_client_config = toml_dict["client_config"] diff --git a/scripts/aws_test.toml b/scripts/aws_test.toml new file mode 100644 index 00000000..d55859b2 --- /dev/null +++ b/scripts/aws_test.toml @@ -0,0 +1,130 @@ +workdir = "deployment_artifacts" +project_home = "https://github.com/data-capsule/psl-cvm" + +[deployment_config] +provider = "aws" +mode = "real-tiny" +ssh_key = "cluster_key.pem" +ssh_user = "psladmin" # DO NOT CHANGE THIS for AWS. +node_port_base = 3000 + + +[node_config] + +[node_config.net_config] +client_max_retry = 10 + +[node_config.rpc_config] +recv_buffer_size = 32768 +channel_depth = 1000 + +[node_config.consensus_config] +commit_index_gap_soft = 250 +commit_index_gap_hard = 500 +liveness_u = 2 +max_backlog_batch_size = 1000 +signature_max_delay_blocks = 50 +signature_max_delay_ms = 102 # roughly batch_max_delay_ms * signature_max_delay_blocks +num_crypto_workers = 5 +view_timeout_ms = 4000 +batch_max_delay_ms = 2 + +[node_config.consensus_config.log_storage_config.RocksDB] +write_buffer_size = 2147483648 +max_write_buffer_number = 1 +max_write_buffers_to_merge = 1 + +[node_config.app_config] +logger_stats_report_ms = 100 +checkpoint_interval_ms = 60000 + +[node_config.evil_config] +simulate_byzantine_behavior = false +byzantine_start_block = 0 + + +[client_config] +full_duplex = true +client_sub_id = 0 # This is filled up later by the caller code. 
+ +[client_config.net_config] +client_max_retry = 10 + +[client_config.workload_config] +request_config = "Blanks" +max_concurrent_requests = 16 + +[[experiments]] +name = "pirateship" +repeats = 1 +num_nodes = 7 +node_distribution = "uniform" +build_command = "make" +duration = 60 + +[experiments.sweeping_parameters] +# num_clients = [10, 20, 50, 100, 200, 250, 300, 350, 400] +# num_clients = [100, 200, 400, 500, 700, 800] +num_clients = [400] + +[[experiments]] +name = "pirateship_sig_1" +repeats = 1 +num_nodes = 7 +node_distribution = "uniform" +build_command = "make" +duration = 60 + +[experiments.node_config.consensus_config] +signature_max_delay_blocks = 1 + +[experiments.sweeping_parameters] +# num_clients = [10, 20, 50, 100, 200, 250, 300, 350, 400] +# num_clients = [100, 200, 400, 500, 700, 800] +num_clients = [400] + + +[[experiments]] +name = "pbft" +repeats = 1 +num_nodes = 7 +node_distribution = "uniform" +build_command = "make chained_pbft_logger" +duration = 60 + +[experiments.sweeping_parameters] +num_clients = [400] +# num_clients = [100, 200, 300, 500, 700, 1000, 2000] + +[[experiments]] +name = "signed_raft" +repeats = 1 +num_nodes = 5 +node_distribution = "uniform" +build_command = "make signed_raft_logger" +duration = 60 + +[experiments.sweeping_parameters] +num_clients = [400] + + + + +[[results]] +name = "tput_latency_client_sweep" +plotter = "tput_latency_sweep" +ramp_up = 15 +ramp_down = 15 +output = "lan_experiment.pdf" +force_parse = true +# skip_indices = [0, 1, 2, 3] + +[results.legends] +# Experiment group to legend mapping +pirateship = "pirateship+byz" +pirateship_sig_1 = "pirateship(sig=1)+byz" +pbft = "pbft+onlybyz" +signed_raft = "signed_raft" + +[results.font] +size = 55 diff --git a/scripts/deployment.py b/scripts/deployment.py index ad8bfa72..45a6a131 100644 --- a/scripts/deployment.py +++ b/scripts/deployment.py @@ -147,7 +147,7 @@ def get_ssh_key(self): ]) - def deploy(self): + def deploy(self, tf_dir=None): run_local([ 
f"mkdir -p {self.workdir}", f"mkdir -p {self.workdir}/deployment", @@ -163,12 +163,15 @@ def deploy(self): # Terraform deploy # Find the azure-tf directory relative to where the script is being called from - found_path = self.find_azure_tf_dir() + if tf_dir is None: + found_path = self.find_azure_tf_dir() + else: + found_path = tf_dir if found_path is None: - raise FileNotFoundError("Azure Terraform directory not found") + raise FileNotFoundError("Terraform directory not found") else: - print(f"Found Azure Terraform directory at {found_path}") + print(f"Found Terraform directory at {found_path}") # There must exist a var-file in azure-tf/setups for the deployment mode var_file = os.path.join(found_path, "setups", f"{self.mode}.tfvars") @@ -194,12 +197,14 @@ def deploy(self): f"terraform -chdir={found_path} apply -no-color -auto-approve -state={tfstate_path} {plan_path} > {tf_output_dir}/apply.log 2>&1", ]) - # Populate nodelist - self.populate_nodelist() - # Store the SSH key self.get_ssh_key() + # It is important to get the SSH key first. 
+ + # Populate nodelist + self.populate_nodelist() + # Install dev dependencies on dev VM self.prepare_dev_vm() @@ -314,7 +319,8 @@ def populate_nodelist(self): for name, info in self.raw_config["node_list"].items(): public_ip = info["public_ip"] private_ip = info["private_ip"] - is_client = name.startswith("client") + # is_client = name.startswith("client") + is_client = "client" in name dev_vm = False if is_client and not(first_client): is_coordinator = True @@ -486,4 +492,169 @@ def run_job_in_dev_vm(self, cmds, wait_till_end=True): if wait_till_end: self.wait_till_end(len(cmds)) + + +class AWSDeployment(Deployment): + def __init__(self, config, workdir): + super().__init__(config, workdir) + + def find_aws_tf_dir(self): + search_paths = [ + os.path.join("deployment", "aws-tf"), + os.path.join("scripts_v2", "deployment", "aws-tf"), + ] + + found_path = None + for path in search_paths: + if os.path.exists(path): + found_path = os.path.abspath(path) + break + + if found_path is None: + raise FileNotFoundError("AWS Terraform directory not found") + else: + print(f"Found AWS Terraform directory at {found_path}") + + return found_path + + def populate_raw_node_list_from_terraform(self): + found_path = self.find_aws_tf_dir() + if found_path is None: + raise FileNotFoundError("AWS Terraform directory not found") + else: + print(f"Found AWS Terraform directory at {found_path}") + + tf_output_dir = os.path.abspath(os.path.join(self.workdir, "deployment")) + + tfstate_path = os.path.join(tf_output_dir, "terraform.tfstate") + + machine_list = run_local([ + f"terraform output -state={tfstate_path} --json machine_list" + ])[0] + + machine_list = json.loads(machine_list) + + # The format is: [[name, private_ip, public_ip]] + + node_list = {} + for name, private_ip, public_ip in machine_list: + + if "tdx" in name: + tee_type = "tdx" + elif "sev" in name: + tee_type = "sev" + else: + tee_type = "nontee" + + if "loc" in name: + # Find the _locX_ part + region_id = 
int(re.findall(r"loc(\d+)", name)[0]) + else: + region_id = 0 + + node_list[name] = { + "private_ip": private_ip, + "public_ip": public_ip, + "tee_type": tee_type, + "region_id": region_id + } + + self.raw_config["node_list"] = node_list + + pprint(node_list) + + # Terraform returns early while the init script is still running in the VM. + # Finishing is demarcated by the presence of the file /home/psladmin/ready.txt in the VMs. + # Wait for the file to be present in the VMs. + self.wait_till_all_ready() + # In the deploy order, this needs to be called before prepare_dev_vm(), so this is the only place to inject this call. + + + + def wait_till_all_ready(self, init_timeout=60): + print("Waiting for all VMs to be ready...") + print("Initial sleep for", init_timeout, "seconds") + sleep(init_timeout) + + total_count = 0 + + while total_count < len(self.raw_config["node_list"]): + total_count = 0 + for node, info in self.raw_config["node_list"].items(): + public_ip = info["public_ip"] + try: + res = run_remote_public_ip([ + f"cat /home/{self.ssh_user}/ready.txt 2>/dev/null" + ], self.ssh_user, self.ssh_key, public_ip, hide=False, timeout=5) + print("Output from", node, ":", res) + if res[0].strip() == "VM Ready": + total_count += 1 + except Exception as e: + print(f"Error while waiting for {node} to be ready: {e}") + continue + print("Total VMs ready:", total_count) + + print("All VMs ready") + + + + def deploy(self, _tf_dir=None): + found_path = self.find_aws_tf_dir() + if found_path is None: + raise FileNotFoundError("AWS Terraform directory not found") + else: + print(f"Found AWS Terraform directory at {found_path}") + + super().deploy(tf_dir=found_path) + + + + def teardown(self): + if self.mode == "manual": + return + + found_path = self.find_aws_tf_dir() + if found_path is None: + raise FileNotFoundError("AWS Terraform directory not found") + else: + print(f"Found AWS Terraform directory at {found_path}") + + tf_output_dir = 
os.path.abspath(os.path.join(self.workdir, "deployment")) + tfstate_path = os.path.join(tf_output_dir, "terraform.tfstate") + + + while True: + try: + run_local([ + f"terraform -chdir={found_path} apply -destroy -no-color -auto-approve -state={tfstate_path}", + ], hide=False) + break + except Exception as e: + print("Error while destroying VMs. Retrying...") + print(e) + + def get_all_node_vms(self): + # Node VMs are named sevpool, tdxpool etc. + # Logic: Find nodes that are not client or storage + return list(set(self.nodelist) - set(self.get_all_client_vms()) - set(self.get_all_storage_vms())) + + def get_all_client_vms(self): + return [ + vm for vm in self.nodelist if "client" in vm.name + ] + + def get_all_storage_vms(self): + return [ + vm for vm in self.nodelist if "storage" in vm.name + ] + + def get_all_client_vms_in_region(self, loc: int): + return [ + vm for vm in self.get_all_client_vms() if vm.region_id == loc + ] + + def get_nodes_with_tee(self, tee): + return [ + vm for vm in self.get_all_node_vms() if tee in vm.tee_type + ] diff --git a/scripts/ssh_utils.py b/scripts/ssh_utils.py index d0456b9a..fc888d8f 100644 --- a/scripts/ssh_utils.py +++ b/scripts/ssh_utils.py @@ -29,18 +29,23 @@ def run_local(cmds: list, hide=True, asynchronous=False): return [res.join().stdout.strip() for res in results] -def run_remote_public_ip(cmds: list, ssh_user, ssh_key, host: Node, hide=True): +def run_remote_public_ip(cmds: list, ssh_user, ssh_key, host: Node|str, hide=True, timeout=None): results = [] + + if isinstance(host, Node): + host = host.public_ip + conn = Connection( - host=host.public_ip, + host=host, user=ssh_user, + connect_timeout=timeout, connect_kwargs={ "key_filename": ssh_key } ) for cmd in cmds: try: - res = conn.run(cmd, hide=hide, pty=True) + res = conn.run(cmd, hide=hide, pty=True, timeout=timeout) results.append(res.stdout.strip()) except Exception as e: results.append(str(e))