diff --git a/hcp-burner.py b/hcp-burner.py
index b13f094..21763d7 100755
--- a/hcp-burner.py
+++ b/hcp-burner.py
@@ -69,6 +69,7 @@
 if 'enabled' in platform.environment['load'] and str(platform.environment['load']['enabled']).lower() == "true":
     # Prometheus takes a lot of time to start after all nodes are ready. we maybe needs to increase this sleep in the future
+    platform = utils.get_cluster_info(platform)
     logging.info("Waiting 5 minutes to allow all clusters to create all pods")
     time.sleep(300)
     load_threads = utils.load_scheduler(platform)
diff --git a/libs/platforms/rosa/hypershift/hypershift.py b/libs/platforms/rosa/hypershift/hypershift.py
index 0032b8e..a778558 100644
--- a/libs/platforms/rosa/hypershift/hypershift.py
+++ b/libs/platforms/rosa/hypershift/hypershift.py
@@ -96,7 +96,7 @@ def _verify_provision_shard(self):
             self.logging.error(f"No Provision Shard found for Service Cluster {self.environment['service_cluster']} on {self.environment['aws']['region']}")
             return None
 
-    def _get_mc(self, cluster_id):
+    def get_mc(self, cluster_id):
         self.logging.debug(f"Get the mgmt cluster of cluster {cluster_id}")
         resp_code, resp_out, resp_err = self.utils.subprocess_exec(
             "ocm get /api/clusters_mgmt/v1/clusters/" + cluster_id + "/hypershift",
@@ -268,7 +268,7 @@ def delete_cluster(self, platform, cluster_name):
         cluster_start_time = int(datetime.datetime.utcnow().timestamp())
         cluster_info["uuid"] = self.environment["uuid"]
         cluster_info["install_method"] = "rosa"
-        cluster_info["mgmt_cluster_name"] = self._get_mc(cluster_info["metadata"]["cluster_id"])
+        cluster_info["mgmt_cluster_name"] = self.get_mc(cluster_info["metadata"]["cluster_id"])
         self.logging.info(f"Deleting cluster {cluster_name} on Hypershift Platform")
         cleanup_code, cleanup_out, cleanup_err = self.utils.subprocess_exec("rosa delete cluster -c " + cluster_name + " -y --watch", cluster_info["path"] + "/cleanup.log", {'preexec_fn': self.utils.disable_signals})
         cluster_delete_end_time = int(datetime.datetime.utcnow().timestamp())
@@ -281,10 +281,11 @@ def delete_cluster(self, platform, cluster_name):
             )
             if check_code != 0:
                 cluster_info["status"] = "deleted"
+                operator_role_prefix = cluster_info["metadata"]["operator_role_prefix"]
                 self.logging.debug(
                     f"Destroying STS associated resources of cluster name: {cluster_name}"
                 )
-                (operators_code, operators_out, operators_err) = self.utils.subprocess_exec("rosa delete operator-roles --prefix " + cluster_name + " -m auto -y", cluster_info["path"] + "/cleanup.log", {'preexec_fn': self.utils.disable_signals})
+                (operators_code, operators_out, operators_err) = self.utils.subprocess_exec("rosa delete operator-roles --prefix " + operator_role_prefix + " -m auto -y", cluster_info["path"] + "/operator-role-cleanup.log", {'preexec_fn': self.utils.disable_signals})
                 if operators_code != 0:
                     self.logging.error(
                         f"Failed to delete operator roles on cluster {cluster_name}"
@@ -431,7 +432,7 @@ def create_cluster(self, platform, cluster_name):
                 return 0
             self.logging.info("Cluster Create Command:")
             self.logging.info(cluster_cmd)
-            (create_cluster_code, create_cluster_out, create_cluster_err) = self.utils.subprocess_exec(" ".join(str(x) for x in cluster_cmd), cluster_info["path"] + "/installation.log", {'preexec_fn': self.utils.disable_signals})
+            (create_cluster_code, create_cluster_out, create_cluster_err) = self.utils.subprocess_exec(" ".join(str(x) for x in cluster_cmd), cluster_info["path"] + "/rosa-create.log", {'preexec_fn': self.utils.disable_signals})
             trying += 1
             if create_cluster_code != 0:
cluster_info["install_try"] = trying @@ -470,7 +471,7 @@ def create_cluster(self, platform, cluster_name): cluster_info["preflight_checks"] = preflight_ch.result() cluster_info["sc_namespace_timing"] = sc_namespace.result() - cluster_start_time if platform.environment["sc_kubeconfig"] != "" else None - mgmt_cluster_name = self._get_mc(cluster_info["metadata"]["cluster_id"]) + mgmt_cluster_name = self.get_mc(cluster_info["metadata"]["cluster_id"]) self.environment["mc_kubeconfig"] = self.download_kubeconfig(mgmt_cluster_name, self.environment["path"]) mc_namespace = executor.submit(self._namespace_wait, platform.environment["mc_kubeconfig"], cluster_info["metadata"]["cluster_id"], cluster_name, "Management") if platform.environment["mc_kubeconfig"] != "" else 0 cluster_info["mc_namespace_timing"] = mc_namespace.result() - cluster_start_time if platform.environment["mc_kubeconfig"] != "" else None diff --git a/libs/platforms/rosa/rosa.py b/libs/platforms/rosa/rosa.py index d48cd58..b214e72 100644 --- a/libs/platforms/rosa/rosa.py +++ b/libs/platforms/rosa/rosa.py @@ -215,6 +215,7 @@ def get_metadata(self, cluster_name): metadata["status"] = result.get("state", None) metadata["version"] = result.get("version", {}).get("raw_id", None) metadata["zones"] = result.get("nodes", {}).get("availability_zones", None) + metadata["operator_role_prefix"] = result.get("aws", {}).get("sts", {}).get("operator_role_prefix", None) return metadata def _preflight_wait(self, cluster_id, cluster_name): diff --git a/libs/utils.py b/libs/utils.py index 0e57910..3df94ad 100644 --- a/libs/utils.py +++ b/libs/utils.py @@ -116,7 +116,7 @@ def cleanup_scheduler(self, platform): self.logging.info( f"Waiting {platform.environment['delay_between_cleanup']} minutes before deleting the next cluster" ) - time.sleep(platform.environment["delay_between_cleanup"]) + time.sleep(platform.environment["delay_between_cleanup"] * 60) return delete_cluster_thread_list # To form the cluster_info dict for cleanup funtions @@ -130,6 +130,10 @@ def get_cluster_info(self, platform): platform.environment["clusters"][cluster_name]["metadata"] = platform.get_metadata(cluster_name) platform.environment["clusters"][cluster_name]["status"] = platform.environment["clusters"][cluster_name]["metadata"]["status"] platform.environment["clusters"][cluster_name]["path"] = platform.environment["path"] + "/" + cluster_name + platform.environment["clusters"][cluster_name]["kubeconfig"] = platform.environment["clusters"][cluster_name]["path"] + "/kubeconfig" + platform.environment['clusters'][cluster_name]['workers'] = int(platform.environment["workers"].split(",")[(loop_counter - 1) % len(platform.environment["workers"].split(","))]) + cluster_mc = platform.get_mc(platform.get_cluster_id(cluster_name)) + platform.environment["mc_kubeconfig"] = platform.environment["path"] + "/kubeconfig_" + cluster_mc return platform def load_scheduler(self, platform): diff --git a/utils/loop_script.sh b/utils/loop_script.sh new file mode 100644 index 0000000..41aae1e --- /dev/null +++ b/utils/loop_script.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +trap '_delete_clusters ${EXECUTION_SEED}' SIGINT SIGTERM + +### EDIT VARIABLES +LOG_FILE="${LOG_FILE:-/tmp/install_loop.log}" +NUMBER_OF_CLUSTERS="${NUMBER_OF_CLUSTERS:-1}" +CLUSTER_NAME_SEED="${CLUSTER_NAME_SEED:-loop}" +AWS_SUBNETS="${AWS_SUBNETS:-subnet-xxxxxxx,subnet-xxxxxx,subnet-xxxxxx,subnet-xxxxxx,subnet-xxxxxx,subnet-xxxxxx}" +PROVISION_SHARD="${PROVISION_SHARD:-provision_shard_id:xxxxxx}" 
+OIDC_CONFIG_ID="${OIDC_CONFIG_ID:-xxxxxxxxxxxx}"
+OPERATOR_ROLES_PREFIX="${OPERATOR_ROLES_PREFIX:-loop-role}"
+export AWS_REGION="${AWS_REGION:-us-east-2}"
+###
+
+exec 3>&1 1>>"${LOG_FILE}" 2>&1
+
+log() {
+  message=$1
+  timestamp=$(date +%Y-%m-%d\ %H:%M:%S)
+  echo -e "$timestamp\t$message"
+}
+
+_delete_clusters() {
+  log "INFO: Caught Ctrl-C, deleting all clusters with the ${1} seed" | tee /dev/fd/3
+  for CLUSTER in $(rosa list clusters | grep "$1" | awk '{print $1}'); do
+    log "INFO: Deleting cluster ${CLUSTER}" | tee /dev/fd/3
+    _delete_cluster "${CLUSTER}"
+    log "INFO: Cluster ${CLUSTER} deleted" | tee /dev/fd/3
+  done
+  exit 0
+}
+
+_create_cluster() {
+  log "INFO: Creating cluster $1" | tee /dev/fd/3
+  # Timeout of 15 minutes for creating the cluster (3 times the normal execution time)
+  timeout --foreground -k 900 900 rosa create cluster -c "$1" --replicas 3 --hosted-cp --sts --mode auto -y --watch --multi-az --subnet-ids "$2" --properties "$3" --oidc-config-id "${OIDC_CONFIG_ID}" --operator-roles-prefix "${OPERATOR_ROLES_PREFIX}" --compute-machine-type m5.xlarge --version 4.14.2 || (log "ERROR: Failed to create cluster $1 after 15 minutes" && return 1)
+  log "INFO: Cluster $1 created" | tee /dev/fd/3
+}
+
+_delete_cluster() {
+  # Timeout of 60 minutes for cleaning the cluster (3 times the normal execution time)
+  timeout --foreground -k 3600 3600 rosa delete cluster -c "$1" -y --watch || (log "ERROR: Failed to delete cluster $1 after 60 minutes" && return 1)
+#  # Timeout of 5 minutes for Roles and OIDC
+#  timeout --foreground -k 300 300 rosa delete operator-roles -c "$1" -m auto -y || (log "ERROR: Failed to delete operator roles of cluster $1 after 5 minutes" && return 1)
+#  timeout --foreground -k 300 300 rosa delete oidc-provider -c "$1" -m auto -y || (log "ERROR: Failed to delete OIDC providers of cluster $1 after 5 minutes" && return 1)
+}
+
+CLUSTER_INDEX=0
+EXECUTION_SEED="${CLUSTER_NAME_SEED}"-$(tr -dc '[:lower:]' < /dev/urandom | fold -w 3 | head -n 1)
+while true; do
+  CLUSTERS_CREATED_LIST=()
+  while IFS='' read -r line; do CLUSTERS_CREATED_LIST+=("$line"); done < <(rosa list clusters | grep "${EXECUTION_SEED}" | awk '{print $2}')
+  CLUSTERS_CREATED_TOTAL="$(rosa list clusters | grep -c "${EXECUTION_SEED}")"
+  if [ "${CLUSTERS_CREATED_TOTAL}" -lt "${NUMBER_OF_CLUSTERS}" ]; then
+    log "INFO: Clusters created (${CLUSTERS_CREATED_TOTAL}) below the threshold (${NUMBER_OF_CLUSTERS}), creating a new one" | tee /dev/fd/3
+    ((CLUSTER_INDEX+=1))
+    _create_cluster "${EXECUTION_SEED}-$(printf "%0*d\n" 4 ${CLUSTER_INDEX})" "${AWS_SUBNETS}" "${PROVISION_SHARD}"
+    log "INFO: Waiting 60 seconds for the next check" | tee /dev/fd/3
+    sleep 60
+  else
+    log "INFO: Clusters created (${CLUSTERS_CREATED_TOTAL}) match the threshold (${NUMBER_OF_CLUSTERS}), waiting 60 seconds before deleting one of them" | tee /dev/fd/3
+    sleep 60
+    RANDOM_CLUSTER_TO_DELETE="${CLUSTERS_CREATED_LIST[$((RANDOM % ${#CLUSTERS_CREATED_LIST[@]}))]}"
+    log "INFO: Selected cluster ${RANDOM_CLUSTER_TO_DELETE} for deletion" | tee /dev/fd/3
+    _delete_cluster "${RANDOM_CLUSTER_TO_DELETE}"
+    log "INFO: Waiting 60 seconds for the next check" | tee /dev/fd/3
+    sleep 60
+  fi
+done
\ No newline at end of file