From 046c45cb4a611c8a4ba21992f1affb6b664dc53c Mon Sep 17 00:00:00 2001
From: Murali Krishnasamy
Date: Fri, 3 Nov 2023 13:16:45 -0400
Subject: [PATCH] rebased and squashed

---
 libs/platforms/platform.py                    |   5 +-
 libs/platforms/rosa/hypershift/hypershift.py  |   4 +-
 libs/platforms/rosa/rosa.py                   |  41 +-
 libs/platforms/rosa/terraform/files/main.tf   |  20 +-
 libs/platforms/rosa/terraform/files/output.tf |  15 -
 .../rosa/terraform/files/variables.tf         |  15 +
 libs/platforms/rosa/terraform/terraform.py    | 401 +++++++++++++-----
 libs/utils.py                                 |  11 +-
 rosa-burner.py                                |   2 +-
 9 files changed, 354 insertions(+), 160 deletions(-)
 delete mode 100644 libs/platforms/rosa/terraform/files/output.tf

diff --git a/libs/platforms/platform.py b/libs/platforms/platform.py
index c33786c..0a71ba0 100644
--- a/libs/platforms/platform.py
+++ b/libs/platforms/platform.py
@@ -21,6 +21,9 @@ def __init__(self, arguments, logging, utils, es):
 
         self.environment["platform"] = arguments["platform"]
 
+        if arguments["subplatform"]:
+            self.environment["subplatform"] = arguments["subplatform"]
+
         self.environment["ocm_url"] = arguments["ocm_url"]
 
         self.environment["ocm_token"] = arguments["ocm_token"]
@@ -184,7 +187,7 @@ def create_cluster(self, platform, cluster_name):
     def delete_cluster(self, platform, cluster_name):
         pass
 
-    def platform_cleanup(self):
+    def platform_cleanup(self, platform=""):
         pass
 
     def watcher(self):
diff --git a/libs/platforms/rosa/hypershift/hypershift.py b/libs/platforms/rosa/hypershift/hypershift.py
index b21b26f..0b910ee 100644
--- a/libs/platforms/rosa/hypershift/hypershift.py
+++ b/libs/platforms/rosa/hypershift/hypershift.py
@@ -114,8 +114,8 @@ def _get_mc(self, cluster_id):
         )
         return json.loads(resp_out).get("management_cluster", None) if resp_code == 0 else None
 
-    def platform_cleanup(self):
-        super().platform_cleanup()
+    def platform_cleanup(self, platform=""):
+        super().platform_cleanup(platform)
         self.logging.info("Cleaning resources")
         # Delete Operator Roles
         self._delete_operator_roles() if self.environment[
diff --git a/libs/platforms/rosa/rosa.py b/libs/platforms/rosa/rosa.py
index d534a9f..0526cd1 100644
--- a/libs/platforms/rosa/rosa.py
+++ b/libs/platforms/rosa/rosa.py
@@ -199,8 +199,8 @@ def _delete_operator_roles(self):
         )
         return True
 
-    def platform_cleanup(self):
-        super().platform_cleanup()
+    def platform_cleanup(self, platform=""):
+        super().platform_cleanup(platform)
 
     def create_cluster(self, platform, cluster_name):
         super().create_cluster(platform, cluster_name)
@@ -240,25 +240,28 @@ def _preflight_wait(self, cluster_id, cluster_name):
                 self.logging.error(f"Exiting preflight times capturing on {cluster_name} cluster after capturing Ctrl-C")
                 return 0
             self.logging.info(f"Getting status for cluster {cluster_name}")
-            status_code, status_out, status_err = self.utils.subprocess_exec("rosa describe cluster -c " + cluster_id + " -o json", extra_params={"universal_newlines": True})
+            status_code, status_out, status_err = self.utils.subprocess_exec("rosa describe cluster -c " + cluster_id + " -o json", extra_params={"universal_newlines": True}, log_output=False)
             current_time = int(datetime.datetime.utcnow().timestamp())
-            try:
-                current_status = json.loads(status_out)["state"]
-            except Exception as err:
-                self.logging.error(f"Cannot load metadata for cluster {cluster_name}")
-                self.logging.error(err)
-                continue
-            if current_status != previous_status and previous_status != "":
-                return_data[previous_status] = current_time - start_time
-                start_time = current_time
-                self.logging.info(f"Cluster {cluster_name} moved from {previous_status} status to {current_status} status after {return_data[previous_status]} seconds")
-            if current_status == "installing":
-                self.logging.info(f"Cluster {cluster_name} is on installing status. Exiting preflights waiting...")
-                return return_data
+            if status_code != 0:
+                self.logging.debug("Cluster data not available yet, retrying..")
             else:
-                self.logging.debug(f"Cluster {cluster_name} on {current_status} status. Waiting 2 seconds until {datetime.datetime.fromtimestamp(start_time + 60 * 60)} for next check")
-                time.sleep(1)
-            previous_status = current_status
+                try:
+                    current_status = json.loads(status_out)["state"]
+                except Exception as err:
+                    self.logging.error(f"Cannot load metadata for cluster {cluster_name}")
+                    self.logging.error(err)
+                    continue
+                if current_status != previous_status and previous_status != "":
+                    return_data[previous_status] = current_time - start_time
+                    start_time = current_time
+                    self.logging.info(f"Cluster {cluster_name} moved from {previous_status} status to {current_status} status after {return_data[previous_status]} seconds")
+                if current_status == "installing":
+                    self.logging.info(f"Cluster {cluster_name} is on installing status. Exiting preflights waiting...")
+                    return return_data
+                else:
+                    self.logging.debug(f"Cluster {cluster_name} on {current_status} status. Waiting 2 seconds until {datetime.datetime.fromtimestamp(start_time + 60 * 60)} for next check")
+                    time.sleep(1)
+                previous_status = current_status
         self.logging.error(f"Cluster {cluster_name} on {current_status} status (not installing) after 60 minutes. Exiting preflight waiting...")
         return return_data
 
diff --git a/libs/platforms/rosa/terraform/files/main.tf b/libs/platforms/rosa/terraform/files/main.tf
index 6f865ce..9e99fa7 100644
--- a/libs/platforms/rosa/terraform/files/main.tf
+++ b/libs/platforms/rosa/terraform/files/main.tf
@@ -31,18 +31,6 @@ provider "rhcs" {
   url   = var.url
 }
 
-# Create managed OIDC config
-module "oidc_config" {
-  token                = var.token
-  url                  = var.url
-  source               = "./oidc_provider"
-  managed              = true
-  operator_role_prefix = var.operator_role_prefix
-  account_role_prefix  = var.account_role_prefix
-  tags                 = var.tags
-  path                 = var.path
-}
-
 locals {
   path = coalesce(var.path, "/")
   sts_roles = {
@@ -53,7 +41,7 @@ locals {
       worker_role_arn = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role${local.path}${var.account_role_prefix}-Worker-Role"
     },
     operator_role_prefix = var.operator_role_prefix,
-    oidc_config_id = module.oidc_config.id
+    oidc_config_id = var.oidc_config_id
   }
 }
 
@@ -65,7 +53,8 @@ locals {
 }
 
 resource "rhcs_cluster_rosa_classic" "rosa_sts_cluster" {
-  name               = var.cluster_name
+  count              = var.clusters_per_apply
+  name               = "${var.cluster_name}-${format("%04d", var.loop_factor + count.index + 1)}"
   cloud_region       = var.cloud_region
   aws_account_id     = data.aws_caller_identity.current.account_id
   availability_zones = var.availability_zones
@@ -79,5 +68,6 @@ resource "rhcs_cluster_rosa_classic" "rosa_sts_cluster" {
     rosa_creator_arn = data.aws_caller_identity.current.arn
   }
   sts = local.sts_roles
-  wait_for_create_complete = true
+  wait_for_create_complete = false
+  disable_waiting_in_destroy = true
 }
diff --git a/libs/platforms/rosa/terraform/files/output.tf b/libs/platforms/rosa/terraform/files/output.tf
deleted file mode 100644
index 49ede51..0000000
--- a/libs/platforms/rosa/terraform/files/output.tf
+++ /dev/null
@@ -1,15 +0,0 @@
-output "oidc_config_id" {
-  value = module.oidc_config.id
-}
-
-output "oidc_endpoint_url" {
-  value = module.oidc_config.oidc_endpoint_url
-}
-
-output "thumbprint" {
-  value = module.oidc_config.thumbprint
-}
-
-output "cluster_id" {
-  value = rhcs_cluster_rosa_classic.rosa_sts_cluster.id
-}
diff --git a/libs/platforms/rosa/terraform/files/variables.tf b/libs/platforms/rosa/terraform/files/variables.tf
index 8116c1c..ee448b3 100644
--- a/libs/platforms/rosa/terraform/files/variables.tf
+++ b/libs/platforms/rosa/terraform/files/variables.tf
@@ -23,6 +23,21 @@ variable "cluster_name" {
   default = "rbur-000-0001"
 }
 
+variable "clusters_per_apply" {
+  type    = number
+  default = null
+}
+
+variable "loop_factor" {
+  type    = number
+  default = null
+}
+
+variable "oidc_config_id" {
+  type    = string
+  default = null
+}
+
 variable "cloud_region" {
   type    = string
   default = "us-east-2"
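
Side note on the naming scheme above (an illustrative sketch, not part of the patch; the seed value is made up): main.tf now creates var.clusters_per_apply cluster resources per apply and names them "${var.cluster_name}-${format("%04d", var.loop_factor + count.index + 1)}", so the Python side only needs to pass a loop_factor equal to the number of clusters already requested by earlier applies.

# Hypothetical illustration of the names one apply batch is expected to produce
def expected_cluster_names(seed, clusters_per_apply, loop_counter):
    loop_factor = loop_counter * clusters_per_apply  # same math as TF_VAR_loop_factor
    return [f"{seed}-{loop_factor + index + 1:04d}" for index in range(clusters_per_apply)]

# expected_cluster_names("rbur-abc", 2, 1) -> ["rbur-abc-0003", "rbur-abc-0004"]
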
"/terraform/terraform-init.log", {"cwd": self.environment["path"] + "/terraform"}) if terraform_code != 0: - self.logging.error(f"Failed to initialize terraform. Check {self.environment['path']}/terraform/init.log for more information") + self.logging.error(f"Failed to initialize terraform. Check {self.environment['path']}/terraform/terraform-init.log for more information") sys.exit("Exiting...") - def platform_cleanup(self): - super().platform_cleanup() + terraform_code, terraform_out, terraform_err = self.utils.subprocess_exec("terraform init", self.environment["path"] + "/terraform/oidc_provider/terraform-init.log", {"cwd": self.environment["path"] + "/terraform/oidc_provider"}) + if terraform_code != 0: + self.logging.error(f"Failed to initialize terraform. Check {self.environment['path']}/terraform/oidc_provider/terraform-init.log for more information") + sys.exit("Exiting...") - def delete_cluster(self, platform, cluster_name): - super().delete_cluster(platform, cluster_name) + def platform_cleanup(self, platform=""): + super().platform_cleanup(platform) + self.destroy_tf_template(platform, tf_module="oidc") - myenv = os.environ.copy() - myenv["TF_VAR_token"] = self.environment["ocm_token"] - myenv["TF_VAR_cloud_region"] = self.environment['aws']['region'] - myenv["TF_VAR_url"] = self.environment["ocm_url"] - myenv["TF_VAR_account_role_prefix"] = 'ManagedOpenShift' - myenv["TF_VAR_cluster_name"] = cluster_name - myenv["TF_VAR_operator_role_prefix"] = cluster_name -# myenv["TF_VAR_clusters_per_apply"] = str(self.environment['clusters_per_apply']) + def _oidc_tf_template(self, action, tf_path, myenv): + code, out, err = self.utils.subprocess_exec("terraform " + action + " --auto-approve -state=" + tf_path + "/terraform_oidc.tfstate ", tf_path + "/terraform_oidc_" + action.replace(" ", "") + ".log", {"cwd": self.environment['path'] + "/terraform/oidc_provider", 'preexec_fn': self.utils.disable_signals, "env": myenv}) + return code, out, err + + # creates templates based on the clusters_per_apply + # and apply them at given interval without wait for it to complete + def apply_tf_template(self, platform): + loop_counter = 0 + while loop_counter < platform.environment["clusters_per_apply_count"]: + tf_counter = 0 + self.logging.debug(platform.environment["clusters"]) + if self.utils.force_terminate: + loop_counter += 1 + else: + cluster_workers = int(platform.environment["workers"]) + + tf_name = platform.environment["cluster_name_seed"] + + try: + tf_path = platform.environment["path"] + "/" + "TF_" + tf_name + "-" + str(loop_counter * self.environment['clusters_per_apply']).zfill(4) + os.mkdir(tf_path) + + myenv = os.environ.copy() + myenv["TF_VAR_token"] = self.environment["ocm_token"] + myenv["TF_VAR_cloud_region"] = self.environment['aws']['region'] + myenv["TF_VAR_url"] = self.environment["ocm_url"] + myenv["TF_VAR_account_role_prefix"] = 'ManagedOpenShift' + myenv["TF_VAR_cluster_name"] = tf_name + myenv["TF_VAR_replicas"] = str(cluster_workers) + myenv["TF_VAR_operator_role_prefix"] = tf_name + "-" + str(loop_counter) + myenv["TF_VAR_clusters_per_apply"] = str(self.environment['clusters_per_apply']) + myenv["TF_VAR_loop_factor"] = str((loop_counter * self.environment['clusters_per_apply'])) + + # additional env for oidc_provider template + myenv["TF_VAR_managed"] = "true" + + self.logging.info(f"Applying OIDC template to create oidc_provider for cluster seed {tf_name} looping {loop_counter + 1}") + terraform_oidc_apply_code, terraform_oidc_apply_out, terraform_oidc_apply_err = 
self._oidc_tf_template("apply", tf_path, myenv) + if terraform_oidc_apply_code != 0: + self.logging.error(f"OIDC with seed {tf_name} looping {loop_counter + 1} terraform apply failed") + self.logging.debug(terraform_oidc_apply_out) + return 1 + else: + self.logging.info(f"Applied OIDC template successfully for cluster seed {tf_name} looping {loop_counter + 1}") + with open(tf_path + "/terraform_oidc.tfstate", "r") as terraform_state: + json_output = json.load(terraform_state) + oidc_id = json_output["outputs"]["id"]["value"] + + # Passing new OIDC ID to the cluster template + myenv["TF_VAR_oidc_config_id"] = oidc_id + + self.logging.info(f"Applying template to create {platform.environment['clusters_per_apply']} with cluster seed {tf_name} looping {loop_counter + 1}") + terraform_plan_code, terraform_plan_out, terraform_plan_err = self.utils.subprocess_exec("terraform plan -out " + tf_path + "/" + tf_name + ".tfplan", tf_path + "/terraform_plan.log", {"cwd": self.environment['path'] + "/terraform", "env": myenv}) + if terraform_plan_code != 0: + self.logging.error(f"Clusters with seed {tf_name} looping {loop_counter + 1} terraform plan failed") + self.logging.debug(terraform_plan_out) + return 1 + else: + self.logging.info(f"Trying to install clusters with TF template {tf_name} looping {loop_counter + 1} with {cluster_workers} workers up to 5 times using terraform provider") + trying = 0 + while trying <= 5: + if self.utils.force_terminate: + self.logging.error(f"Exiting clusters creation for {tf_name} looping {loop_counter + 1} after capturing Ctrl-C") + return 0 + trying += 1 + cluster_apply_time = int(datetime.datetime.utcnow().timestamp()) + terraform_apply_code, terraform_apply_out, terraform_apply_err = self.utils.subprocess_exec("terraform apply -state=" + tf_path + "/terraform.tfstate " + tf_path + "/" + tf_name + ".tfplan", tf_path + "/terraform_apply-" + str(trying) + ".log", {"cwd": self.environment['path'] + "/terraform", 'preexec_fn': self.utils.disable_signals, "env": myenv}) + if terraform_apply_code != 0: + self.logging.debug(terraform_apply_out) + self.logging.debug(terraform_apply_err) + if trying <= 5: + self.logging.warning(f"Try: {trying}/5. 
+                                    self.logging.warning(f"Try: {trying}/5. Clusters with seed {tf_name} looping {loop_counter + 1} installation failed, retrying in 15 seconds")
+                                    time.sleep(15)
+                                else:
+                                    self.logging.error(f"Clusters with seed {tf_name} looping {loop_counter + 1} installation failed after 5 retries")
+                                    self.logging.debug(terraform_apply_out)
+                                    self.logging.debug(terraform_apply_err)
+                                    return 1
+                            else:
+                                break
+
+                except Exception as err:
+                    self.logging.error(f"Failed to apply with cluster seed {tf_name} looping {loop_counter + 1}")
+                    self.logging.error(err)
+                    return 1
+                while tf_counter < platform.environment["clusters_per_apply"]:
+                    cluster_name = platform.environment["cluster_name_seed"] + "-" + str((loop_counter * self.environment['clusters_per_apply']) + (tf_counter + 1)).zfill(4)
+                    platform.environment["clusters"][cluster_name]["cluster_apply_time"] = cluster_apply_time
+                    tf_counter += 1
+                if platform.environment["delay_between_batch"] is None:
+                    time.sleep(1)
+                else:
+                    time.sleep(platform.environment["delay_between_batch"])
+                loop_counter += 1
+        return 0
+
+    # Uses the created templates based on clusters_per_apply
+    # and destroys them at a given interval without waiting for completion
+    def destroy_tf_template(self, platform, tf_module="cluster"):
+        loop_counter = 0
+        while loop_counter < platform.environment["clusters_per_apply_count"]:
+            self.logging.debug(platform.environment["clusters"])
+            if self.utils.force_terminate:
+                loop_counter += 1
+            else:
+                cluster_workers = int(platform.environment["workers"])
+
+                tf_name = platform.environment["cluster_name_seed"]
+
+                try:
+
+                    tf_path = platform.environment["path"] + "/" + "TF_" + tf_name + "-" + str(loop_counter * self.environment['clusters_per_apply']).zfill(4)
+                    if not os.path.exists(tf_path):
+                        os.mkdir(tf_path)
+                    myenv = os.environ.copy()
+                    myenv["TF_VAR_token"] = self.environment["ocm_token"]
+                    myenv["TF_VAR_cloud_region"] = self.environment['aws']['region']
+                    myenv["TF_VAR_url"] = self.environment["ocm_url"]
+                    myenv["TF_VAR_account_role_prefix"] = 'ManagedOpenShift'
+                    myenv["TF_VAR_cluster_name"] = tf_name
+                    myenv["TF_VAR_replicas"] = str(cluster_workers)
+                    myenv["TF_VAR_operator_role_prefix"] = tf_name + "-" + str(loop_counter)
+                    myenv["TF_VAR_clusters_per_apply"] = str(self.environment['clusters_per_apply'])
+                    myenv["TF_VAR_loop_factor"] = str((loop_counter * self.environment['clusters_per_apply']))
+
+                    if tf_module == "oidc":
+                        # additional env for oidc_provider template
+                        myenv["TF_VAR_managed"] = "true"
+
+                        self.logging.info(f"Destroying OIDC template to delete oidc_provider for cluster seed {tf_name} looping {loop_counter + 1}")
+                        terraform_oidc_destroy_code, terraform_oidc_destroy_out, terraform_oidc_destroy_err = self._oidc_tf_template("apply -destroy", tf_path, myenv)
+                        if terraform_oidc_destroy_code != 0:
+                            self.logging.error(f"OIDC with seed {tf_name} looping {loop_counter + 1} terraform destroy failed")
+                            self.logging.debug(terraform_oidc_destroy_out)
+                            return 1
+
+                    else:
+                        self.logging.info(f"Deleting Clusters with seed {tf_name} looping {loop_counter + 1} on Rosa Platform using terraform")
+                        trying = 0
+                        while trying <= 5:
+                            if self.utils.force_terminate:
+                                self.logging.error(f"Exiting clusters deletion for {tf_name} looping {loop_counter + 1} after capturing Ctrl-C")
+                                return 0
+                            trying += 1
self.utils.disable_signals, "env": myenv}) + if cleanup_code != 0: + self.logging.debug(f"Clusters Cleanup with seed {tf_name} looping {loop_counter + 1} is failed") + self.logging.debug(cleanup_out) + self.logging.debug(cleanup_err) + if trying <= 5: + self.logging.warning(f"Try: {trying}/5. Clusters with seed {tf_name} looping {loop_counter + 1} deletion failed, retrying in 15 seconds") + time.sleep(15) + else: + self.logging.error(f"Clusters with seed {tf_name} looping {loop_counter + 1} deletion failed after 5 retries") + self.logging.debug(cleanup_out) + self.logging.debug(cleanup_err) + return 1 + else: + break + + except Exception as err: + self.logging.error(f"Failed to apply with cluster seed {tf_name} looping {loop_counter + 1}") + self.logging.error(err) + return 1 + if platform.environment["delay_between_cleanup"] is None: + time.sleep(1) + else: + time.sleep(platform.environment["delay_between_cleanup"]) + loop_counter += 1 + return 0 + + # Cluster deletion will be initiated by destroy_tf_template + # this function waits and verifies the deletions + # No actual delete logic in this function unlike other subplatform + def delete_cluster(self, platform, cluster_name): + super().delete_cluster(platform, cluster_name) + retry_loop = 0 cluster_info = platform.environment["clusters"][cluster_name] - cluster_start_time = int(datetime.datetime.utcnow().timestamp()) cluster_info["uuid"] = self.environment["uuid"] cluster_info["install_method"] = "terraform" - self.logging.info(f"Deleting cluster {cluster_name} on Rosa Platform using terraform") - cleanup_code, cleanup_out, cleanup_err = self.utils.subprocess_exec("terraform apply -destroy -state=" + cluster_info['path'] + "/terraform.tfstate --auto-approve", cluster_info["path"] + "/cleanup.log", {"cwd": self.environment['path'] + "/terraform", 'preexec_fn': self.utils.disable_signals, "env": myenv}) - cluster_delete_end_time = int(datetime.datetime.utcnow().timestamp()) - if cleanup_code == 0: - self.logging.debug( - f"Confirm cluster {cluster_name} deleted by attempting to describe the cluster. This should fail if the cluster is removed." - ) - check_code, check_out, check_err = self.utils.subprocess_exec( - "rosa describe cluster -c " + cluster_name, log_output=False - ) - if check_code != 0: - cluster_info["status"] = "deleted" + cluster_info["per_template_count"] = platform.environment['clusters_per_apply'] + cluster_info["tf_count"] = platform.environment['clusters_per_apply_count'] + cluster_info["total_count"] = platform.environment['cluster_count'] + self.logging.info(f"Checking uninstall log for cluster {cluster_name}") + + while retry_loop <= 600: # 1hr timeout + retry_loop += 1 + cluster_delete_start_time = int(datetime.datetime.utcnow().timestamp()) + watch_code, watch_out, watch_err = self.utils.subprocess_exec("rosa logs uninstall -c " + cluster_name + " --watch", cluster_info["path"] + "/cleanup.log", {'preexec_fn': self.utils.disable_signals}) + if watch_code != 0: + if retry_loop <= 600: + self.logging.debug(f"ROSA cluster uninstall log for {cluster_name} is not available yet, retrying..") + self.logging.debug(watch_out) + time.sleep(6) + else: + cluster_info['status'] = "not deleted" + self.logging.debug(watch_out) + self.logging.error(watch_err) + return 1 else: - cluster_info["status"] = "not deleted" + break + + cluster_delete_end_time = int(datetime.datetime.utcnow().timestamp()) + self.logging.debug( + f"Confirm cluster {cluster_name} deleted by attempting to describe the cluster. 
+        self.logging.debug(
+            f"Confirm cluster {cluster_name} deleted by attempting to describe the cluster. This should fail if the cluster is removed."
+        )
+        check_code, check_out, check_err = self.utils.subprocess_exec(
+            "rosa describe cluster -c " + cluster_name, log_output=False
+        )
+        if check_code != 0:
+            cluster_info["status"] = "deleted"
         else:
             cluster_info["status"] = "not deleted"
+
         cluster_end_time = int(datetime.datetime.utcnow().timestamp())
-        cluster_info["destroy_duration"] = cluster_delete_end_time - cluster_start_time
-        cluster_info["destroy_all_duration"] = cluster_end_time - cluster_start_time
+        cluster_info["destroy_duration"] = cluster_delete_end_time - cluster_delete_start_time
+        cluster_info["destroy_all_duration"] = cluster_end_time - cluster_delete_start_time
         try:
             with open(cluster_info['path'] + "/metadata_destroy.json", "w") as metadata_file:
                 json.dump(cluster_info, metadata_file)
@@ -116,85 +308,82 @@ def get_workers_ready(self, kubeconfig, cluster_name):
         ready_nodes = status_list["True"] if "True" in status_list else 0
         return ready_nodes
 
+    # Cluster creation will be initiated by apply_tf_template
+    # this function waits and verifies the creation
+    # No actual create logic in this function unlike other subplatforms
     def create_cluster(self, platform, cluster_name):
         super().create_cluster(platform, cluster_name)
+        retry_loop = 0
         cluster_info = platform.environment["clusters"][cluster_name]
         cluster_info["uuid"] = self.environment["uuid"]
         cluster_info["install_method"] = "terraform"
+        cluster_info["per_template_count"] = platform.environment['clusters_per_apply']
+        cluster_info["tf_count"] = platform.environment['clusters_per_apply_count']
+        cluster_info["total_count"] = platform.environment['cluster_count']
         self.logging.info(f"Creating cluster {cluster_info['index']} on ROSA with name {cluster_name} and {cluster_info['workers']} workers")
         cluster_info["path"] = platform.environment["path"] + "/" + cluster_name
         os.mkdir(cluster_info["path"])
-        self.logging.debug("Attempting cluster installation")
         self.logging.debug("Output directory set to %s" % cluster_info["path"])
 
-        myenv = os.environ.copy()
-        myenv["TF_VAR_token"] = self.environment["ocm_token"]
-        myenv["TF_VAR_cloud_region"] = self.environment['aws']['region']
-        myenv["TF_VAR_url"] = self.environment["ocm_url"]
-        myenv["TF_VAR_account_role_prefix"] = 'ManagedOpenShift'
-        myenv["TF_VAR_cluster_name"] = cluster_name
-        myenv["TF_VAR_operator_role_prefix"] = cluster_name
-#        myenv["TF_VAR_clusters_per_apply"] = str(self.environment['clusters_per_apply'])
-
-        terraform_plan_code, terraform_plan_out, terraform_plan_err = self.utils.subprocess_exec("terraform plan -out " + cluster_info['path'] + "/" + cluster_name + ".tfplan", cluster_info["path"] + "/terraform_plan.log", {"cwd": self.environment['path'] + "/terraform", "env": myenv})
-        if terraform_plan_code != 0:
-            cluster_end_time = int(datetime.datetime.utcnow().timestamp())
-            cluster_info["status"] = "Not Installed"
-            self.logging.error(f"Cluster {cluster_name} terraform plan failed")
-            self.logging.debug(terraform_plan_out)
-            return 1
-        else:
-            self.logging.info(f"Trying to install cluster {cluster_name} with {cluster_info['workers']} workers up to 5 times using terraform provider")
-            trying = 0
-            while trying <= 5:
-                cluster_start_time = int(datetime.datetime.utcnow().timestamp())
-                if self.utils.force_terminate:
-                    self.logging.error(f"Exiting cluster creation for {cluster_name} after capturing Ctrl-C")
-                    return 0
-                trying += 1
"/terraform.tfstate " + cluster_info['path'] + "/" + cluster_name + ".tfplan", cluster_info["path"] + "/terraform_apply.log", {"cwd": self.environment['path'] + "/terraform", 'preexec_fn': self.utils.disable_signals, "env": myenv}) - if terraform_apply_code != 0: - cluster_info["install_try"] = trying - self.logging.debug(terraform_apply_out) - self.logging.debug(terraform_apply_err) - if trying <= 5: - self.logging.warning(f"Try: {trying}/5. Cluster {cluster_name} installation failed, retrying in 15 seconds") - time.sleep(15) - else: - cluster_end_time = int(datetime.datetime.utcnow().timestamp()) - cluster_info["status"] = "Not Installed" - self.logging.error(f"Cluster {cluster_name} installation failed after 5 retries") - self.logging.debug(terraform_apply_out) - self.logging.debug(terraform_apply_err) - return 1 + while retry_loop <= 60: # 10 min timeout + retry_loop += 1 + cluster_start_time = int(datetime.datetime.utcnow().timestamp()) + status_code, status_out, status_err = self.utils.subprocess_exec("rosa describe cluster -c " + cluster_name + " -o json", extra_params={"universal_newlines": True}, log_output=False) + if status_code != 0: + if retry_loop <= 60: + self.logging.debug(f"ROSA cluster {cluster_name} is not available yet, retrying..") + self.logging.debug(status_out) + time.sleep(5) else: - cluster_end_time = int(datetime.datetime.utcnow().timestamp()) - break - - cluster_info['status'] = "installed" - self.logging.info(f"Cluster {cluster_name} installation finished on the {trying} try") - cluster_info["metadata"] = self.get_metadata(cluster_name) - cluster_info["install_try"] = trying - cluster_info["install_duration"] = cluster_end_time - cluster_start_time - access_timers = self.get_cluster_admin_access(cluster_name, cluster_info["path"]) - cluster_info["kubeconfig"] = access_timers.get("kubeconfig", None) - cluster_info["cluster_admin_create"] = access_timers.get("cluster_admin_create", None) - cluster_info["cluster_admin_login"] = access_timers.get("cluster_admin_login", None) - cluster_info["cluster_oc_adm"] = access_timers.get("cluster_oc_adm", None) - if not cluster_info["kubeconfig"]: - self.logging.error(f"Failed to download kubeconfig file for cluster {cluster_name}. Disabling wait for workers and workload execution") - cluster_info["workers_wait_time"] = None - cluster_info["status"] = "Ready. 
Not Access" - return 1 - if cluster_info["workers_wait_time"]: - workers_ready = self._wait_for_workers(cluster_info["kubeconfig"], cluster_info["workers"], cluster_info["workers_wait_time"], cluster_name, "workers") - if workers_ready[1] == cluster_info["workers"]: - cluster_info["workers_ready"] = workers_ready[2] - cluster_start_time + cluster_info['status'] = "not ready" + self.logging.debug(status_out) + self.logging.error(status_err) + return 1 else: - cluster_info['workers_ready'] = None - cluster_info['status'] = "Ready, missing workers" + cluster_listed_time = int(datetime.datetime.utcnow().timestamp()) + preflight_ch = self._preflight_wait(cluster_name, cluster_name) + cluster_info["preflight_checks"] = preflight_ch + break + + watch_code, watch_out, watch_err = self.utils.subprocess_exec("rosa logs install -c " + cluster_name + " --watch", cluster_info["path"] + "/installation.log", {'preexec_fn': self.utils.disable_signals}) + status_code, status_out, status_err = self.utils.subprocess_exec("rosa describe cluster -c " + cluster_name + " -o json", extra_params={"universal_newlines": True}) + current_status = json.loads(status_out)["state"] + if watch_code != 0 or current_status != "ready": + cluster_info['status'] = "not installed" + return 1 + else: + cluster_info['status'] = "installed" + cluster_end_time = int(datetime.datetime.utcnow().timestamp()) + # Getting againg metadata to update the cluster status + cluster_info["metadata"] = self.get_metadata(cluster_name) + cluster_info["install_duration"] = cluster_end_time - cluster_start_time + access_timers = self.get_cluster_admin_access(cluster_name, cluster_info["path"]) + cluster_info["kubeconfig"] = access_timers.get("kubeconfig", None) + cluster_info["cluster_admin_create"] = access_timers.get("cluster_admin_create", None) + cluster_info["cluster_admin_login"] = access_timers.get("cluster_admin_login", None) + cluster_info["cluster_oc_adm"] = access_timers.get("cluster_oc_adm", None) + if not cluster_info["kubeconfig"]: + self.logging.error(f"Failed to download kubeconfig file for cluster {cluster_name}. Disabling wait for workers and workload execution") + cluster_info["workers_wait_time"] = None + cluster_info["status"] = "Ready. 
Not Access" return 1 - cluster_info['status'] = "ready" + if cluster_info["workers_wait_time"]: + with concurrent.futures.ThreadPoolExecutor() as wait_executor: + futures = [wait_executor.submit(self._wait_for_workers, cluster_info["kubeconfig"], cluster_info["workers"], cluster_info["workers_wait_time"], cluster_name, "workers")] + futures.append(wait_executor.submit(self._wait_for_workers, cluster_info["kubeconfig"], platform.environment["extra_machinepool"]["replicas"], cluster_info["workers_wait_time"], cluster_name, platform.environment["extra_machinepool"]["name"])) if "extra_machinepool" in platform.environment else None + for future in concurrent.futures.as_completed(futures): + result = future.result() + if result[0] == "workers": + default_pool_workers = int(result[1]) + if default_pool_workers == cluster_info["workers"]: + cluster_info["workers_ready"] = result[2] - cluster_start_time + else: + cluster_info['workers_ready'] = None + cluster_info['status'] = "Ready, missing workers" + return 1 + cluster_info['status'] = "ready" + cluster_apply_time = int(platform.environment["clusters"][cluster_name]["cluster_apply_time"]) + cluster_info["apply_duration"] = cluster_listed_time - cluster_apply_time try: with open(cluster_info['path'] + "/metadata_install.json", "w") as metadata_file: json.dump(cluster_info, metadata_file) @@ -263,7 +452,7 @@ def __init__(self, parser, config_file, environment): # EnvDefault = self.EnvDefault parser.add_argument("--terraform-retry", type=int, default=5, help="Number of retries when executing terraform commands") -# parser.add_argument("--clusters-per-apply", type=int, default=1, help="Number of clusters to install on each terraform apply") + parser.add_argument("--clusters-per-apply", type=int, default=1, help="Number of clusters to install on each terraform apply") # parser.add_argument("--service-cluster", action=EnvDefault, env=environment, envvar="ROSA_BURNER_HYPERSHIFT_SERVICE_CLUSTER", help="Service Cluster Used to create the Hosted Clusters") if config_file: diff --git a/libs/utils.py b/libs/utils.py index 542b200..8049b2b 100644 --- a/libs/utils.py +++ b/libs/utils.py @@ -113,6 +113,9 @@ def cleanup_scheduler(self, platform): f"Waiting {platform.environment['delay_between_cleanup']} minutes before deleting the next cluster" ) time.sleep(platform.environment["delay_between_cleanup"]) + if platform.environment["subplatform"] and platform.environment["subplatform"] == "terraform": + if platform.destroy_tf_template(platform) != 0: + return 1 return delete_cluster_thread_list # To form the cluster_info dict for cleanup funtions @@ -182,7 +185,9 @@ def install_scheduler(self, platform): else: cluster_workers = int(platform.environment["workers"].split(",")[(loop_counter - 1) % len(platform.environment["workers"].split(","))]) cluster_name = platform.environment["cluster_name_seed"] + "-" + str(loop_counter).zfill(4) - platform.environment["clusters"][cluster_name] = {} + + if cluster_name not in platform.environment["clusters"]: + platform.environment["clusters"][cluster_name] = {} try: platform.environment["clusters"][cluster_name]["workers"] = cluster_workers platform.environment["clusters"][cluster_name]["workers_wait_time"] = platform.environment["workers_wait_time"] @@ -196,6 +201,10 @@ def install_scheduler(self, platform): cluster_thread_list.append(thread) thread.start() self.logging.debug("Number of alive threads %d" % threading.active_count()) + time.sleep(1) + if platform.environment["subplatform"] and 
platform.environment["subplatform"] == "terraform": + if platform.apply_tf_template(platform) != 0: + return 1 except Exception as err: self.logging.error(err) self.logging.error("Thread creation failed") diff --git a/rosa-burner.py b/rosa-burner.py index b13f094..4402581 100755 --- a/rosa-burner.py +++ b/rosa-burner.py @@ -97,6 +97,6 @@ else: raise - platform.platform_cleanup() + platform.platform_cleanup(platform) # utils.test_recap(platform)