Skip to content

Commit

Permalink
Included loop script and optimization (#20)
Browse files Browse the repository at this point in the history
* let user run workload alone independently
set platform dict during workload

* set operator roles prefix to delete

* include loop script

* fixed shell checks

* renaming log file to avoid overwriting it

* missing minute conversion for delay_between_cleanup
  • Loading branch information
mukrishn authored Mar 4, 2024
1 parent 40d4eb8 commit ac90c07
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 6 deletions.
1 change: 1 addition & 0 deletions hcp-burner.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@

if 'enabled' in platform.environment['load'] and str(platform.environment['load']['enabled']).lower() == "true":
# Prometheus takes a lot of time to start after all nodes are ready. We may need to increase this sleep in the future
platform = utils.get_cluster_info(platform)
logging.info("Waiting 5 minutes to allow all clusters to create all pods")
time.sleep(300)
load_threads = utils.load_scheduler(platform)
Expand Down
11 changes: 6 additions & 5 deletions libs/platforms/rosa/hypershift/hypershift.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def _verify_provision_shard(self):
self.logging.error(f"No Provision Shard found for Service Cluster {self.environment['service_cluster']} on {self.environment['aws']['region']}")
return None

def _get_mc(self, cluster_id):
def get_mc(self, cluster_id):
self.logging.debug(f"Get the mgmt cluster of cluster {cluster_id}")
resp_code, resp_out, resp_err = self.utils.subprocess_exec(
"ocm get /api/clusters_mgmt/v1/clusters/" + cluster_id + "/hypershift",
Expand Down Expand Up @@ -268,7 +268,7 @@ def delete_cluster(self, platform, cluster_name):
cluster_start_time = int(datetime.datetime.utcnow().timestamp())
cluster_info["uuid"] = self.environment["uuid"]
cluster_info["install_method"] = "rosa"
cluster_info["mgmt_cluster_name"] = self._get_mc(cluster_info["metadata"]["cluster_id"])
cluster_info["mgmt_cluster_name"] = self.get_mc(cluster_info["metadata"]["cluster_id"])
self.logging.info(f"Deleting cluster {cluster_name} on Hypershift Platform")
cleanup_code, cleanup_out, cleanup_err = self.utils.subprocess_exec("rosa delete cluster -c " + cluster_name + " -y --watch", cluster_info["path"] + "/cleanup.log", {'preexec_fn': self.utils.disable_signals})
cluster_delete_end_time = int(datetime.datetime.utcnow().timestamp())
Expand All @@ -281,10 +281,11 @@ def delete_cluster(self, platform, cluster_name):
)
if check_code != 0:
cluster_info["status"] = "deleted"
operator_role_prefix = cluster_info["metadata"]["operator_role_prefix"]
self.logging.debug(
f"Destroying STS associated resources of cluster name: {cluster_name}"
)
(operators_code, operators_out, operators_err) = self.utils.subprocess_exec("rosa delete operator-roles --prefix " + cluster_name + " -m auto -y", cluster_info["path"] + "/cleanup.log", {'preexec_fn': self.utils.disable_signals})
(operators_code, operators_out, operators_err) = self.utils.subprocess_exec("rosa delete operator-roles --prefix " + operator_role_prefix + " -m auto -y", cluster_info["path"] + "/operator-role-cleanup.log", {'preexec_fn': self.utils.disable_signals})
if operators_code != 0:
self.logging.error(
f"Failed to delete operator roles on cluster {cluster_name}"
Expand Down Expand Up @@ -431,7 +432,7 @@ def create_cluster(self, platform, cluster_name):
return 0
self.logging.info("Cluster Create Command:")
self.logging.info(cluster_cmd)
(create_cluster_code, create_cluster_out, create_cluster_err) = self.utils.subprocess_exec(" ".join(str(x) for x in cluster_cmd), cluster_info["path"] + "/installation.log", {'preexec_fn': self.utils.disable_signals})
(create_cluster_code, create_cluster_out, create_cluster_err) = self.utils.subprocess_exec(" ".join(str(x) for x in cluster_cmd), cluster_info["path"] + "/rosa-create.log", {'preexec_fn': self.utils.disable_signals})
trying += 1
if create_cluster_code != 0:
cluster_info["install_try"] = trying
Expand Down Expand Up @@ -470,7 +471,7 @@ def create_cluster(self, platform, cluster_name):
cluster_info["preflight_checks"] = preflight_ch.result()
cluster_info["sc_namespace_timing"] = sc_namespace.result() - cluster_start_time if platform.environment["sc_kubeconfig"] != "" else None

mgmt_cluster_name = self._get_mc(cluster_info["metadata"]["cluster_id"])
mgmt_cluster_name = self.get_mc(cluster_info["metadata"]["cluster_id"])
self.environment["mc_kubeconfig"] = self.download_kubeconfig(mgmt_cluster_name, self.environment["path"])
mc_namespace = executor.submit(self._namespace_wait, platform.environment["mc_kubeconfig"], cluster_info["metadata"]["cluster_id"], cluster_name, "Management") if platform.environment["mc_kubeconfig"] != "" else 0
cluster_info["mc_namespace_timing"] = mc_namespace.result() - cluster_start_time if platform.environment["mc_kubeconfig"] != "" else None
Expand Down
1 change: 1 addition & 0 deletions libs/platforms/rosa/rosa.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ def get_metadata(self, cluster_name):
metadata["status"] = result.get("state", None)
metadata["version"] = result.get("version", {}).get("raw_id", None)
metadata["zones"] = result.get("nodes", {}).get("availability_zones", None)
metadata["operator_role_prefix"] = result.get("aws", {}).get("sts", {}).get("operator_role_prefix", None)
return metadata

def _preflight_wait(self, cluster_id, cluster_name):
Expand Down
6 changes: 5 additions & 1 deletion libs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def cleanup_scheduler(self, platform):
self.logging.info(
f"Waiting {platform.environment['delay_between_cleanup']} minutes before deleting the next cluster"
)
time.sleep(platform.environment["delay_between_cleanup"])
time.sleep(platform.environment["delay_between_cleanup"] * 60)
return delete_cluster_thread_list

# To form the cluster_info dict for cleanup functions
Expand All @@ -130,6 +130,10 @@ def get_cluster_info(self, platform):
platform.environment["clusters"][cluster_name]["metadata"] = platform.get_metadata(cluster_name)
platform.environment["clusters"][cluster_name]["status"] = platform.environment["clusters"][cluster_name]["metadata"]["status"]
platform.environment["clusters"][cluster_name]["path"] = platform.environment["path"] + "/" + cluster_name
platform.environment["clusters"][cluster_name]["kubeconfig"] = platform.environment["clusters"][cluster_name]["path"] + "/kubeconfig"
platform.environment['clusters'][cluster_name]['workers'] = int(platform.environment["workers"].split(",")[(loop_counter - 1) % len(platform.environment["workers"].split(","))])
cluster_mc = platform.get_mc(platform.get_cluster_id(cluster_name))
platform.environment["mc_kubeconfig"] = platform.environment["path"] + "/kubeconfig_" + cluster_mc
return platform

def load_scheduler(self, platform):
Expand Down
70 changes: 70 additions & 0 deletions utils/loop_script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/bin/bash

# On Ctrl-C / kill, tear down every cluster created by this run (the trap body
# is single-quoted so ${EXECUTION_SEED} is expanded at signal time, after it is set below).
trap '_delete_clusters ${EXECUTION_SEED}' SIGINT SIGTERM

### EDIT VARIABLES
LOG_FILE="${LOG_FILE:-/tmp/install_loop.log}"
NUMBER_OF_CLUSTERS="${NUMBER_OF_CLUSTERS:-1}"
CLUSTER_NAME_SEED="${CLUSTER_NAME_SEED:-loop}"
AWS_SUBNETS="${AWS_SUBNETS:-subnet-xxxxxxx,subnet-xxxxxx,subnet-xxxxxx,subnet-xxxxxx,subnet-xxxxxx,subnet-xxxxxx}"
PROVISION_SHARD="${PROVISION_SHARD:-provision_shard_id:xxxxxx}"
OIDC_CONFIG_ID="${OIDC_CONFIG_ID:-xxxxxxxxxxxx}"
# FIX: was "${OPERATOR_ROLES_PREFIX:loop-role}" — without the '-' that is a
# substring expansion, so the intended 'loop-role' default was never applied.
OPERATOR_ROLES_PREFIX="${OPERATOR_ROLES_PREFIX:-loop-role}"
export AWS_REGION="${AWS_REGION:-us-east-2}"
###

# Everything written to stdout/stderr goes to the log file from here on;
# fd 3 keeps pointing at the original console so `tee /dev/fd/3` can surface messages.
exec 3>&1 1>>"${LOG_FILE}" 2>&1

# Emit a message prefixed with a "YYYY-mm-dd HH:MM:SS" timestamp, tab-separated.
# %b keeps the escape-interpreting behaviour of the original `echo -e`.
log() {
    local msg="$1"
    printf '%s\t%b\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${msg}"
}

# Signal handler: delete every cluster whose `rosa list clusters` row matches
# the seed passed as $1 (column 1 = cluster id), then terminate the script.
_delete_clusters() {
    local seed="$1"
    local cluster_id
    log "INFO: Captured Control-C key, deleting all clusters with ${seed} seed" | tee /dev/fd/3
    for cluster_id in $(rosa list clusters | grep "${seed}" | awk '{print $1}'); do
        log "INFO: Deleting cluster ${cluster_id}" | tee /dev/fd/3
        _delete_cluster "${cluster_id}"
        log "INFO: Cluster ${cluster_id} deleted" | tee /dev/fd/3
    done
    exit 0
}

# Create a hosted-control-plane ROSA cluster.
#   $1 = cluster name, $2 = comma-separated subnet ids, $3 = --properties value
# Returns 1 (and logs an error) if creation does not finish within 15 minutes.
_create_cluster() {
    log "INFO: Creating cluster $1" | tee /dev/fd/3
    # Timeout of 15 minutes for creating the cluster (3 times of normal execution)
    # FIX: the failure branch is a { } group, not a ( ) subshell — `return` in a
    # subshell only exits the subshell, so the function used to fall through and
    # log a false "created" message after a failed create.
    timeout --foreground -k 900 900 rosa create cluster -c "$1" --replicas 3 --hosted-cp --sts --mode auto -y --watch --multi-az --subnet-ids "$2" --properties "$3" --oidc-config-id "${OIDC_CONFIG_ID}" --operator-roles-prefix "${OPERATOR_ROLES_PREFIX}" --compute-machine-type m5.xlarge --version 4.14.2 || { log "ERROR: Failed to create cluster $1 after 15 minutes"; return 1; }
    # FIX: dropped stray '3>&1 1>>"${LOG_FILE}"' redirections that re-pointed fd 3
    # at the log file and starved the tee, hiding this message from the console.
    log "INFO: Cluster $1 created" | tee /dev/fd/3
}

# Delete cluster $1; returns 1 (and logs an error) if it takes over 60 minutes.
_delete_cluster(){
    # Timeout of 60 minutes for cleaning the cluster (3 times of normal execution)
    # FIX: error message said "30 minutes" but the timeout is 3600 s (60 min);
    # also use a { } group so `return 1` actually returns from the function
    # instead of only exiting a subshell.
    timeout --foreground -k 3600 3600 rosa delete cluster -c "$1" -y --watch || { log "ERROR: Failed to delete cluster $1 after 60 minutes"; return 1; }
    # # Timeout of 5 minutes for Roles and OIDC
    # timeout --foreground -k 300 300 rosa delete operator-roles -c "$1" -m auto -y || (log "ERROR: Failed to delete operator roles of cluster $1 after 5 minutes" && return 1)
    # timeout --foreground -k 300 300 rosa delete oidc-provider -c "$1" -m auto -y || (log "ERROR: Failed to delete OIDC providers of cluster $1 after 5 minutes" && return 1)
}

CLUSTER_INDEX=0
# Random 3-lowercase-letter suffix so this run's clusters can be told apart
# from clusters of other concurrent/previous runs with the same name seed.
EXECUTION_SEED="${CLUSTER_NAME_SEED}"-$(tr -dc '[:lower:]' < /dev/urandom | fold -w 3 | head -n 1)
# Churn loop: keep NUMBER_OF_CLUSTERS clusters alive — create one when below
# the threshold, otherwise delete a random one. Runs until interrupted
# (SIGINT/SIGTERM fires the _delete_clusters trap set at the top of the file).
while true; do
CLUSTERS_CREATED_LIST=()
# Names (column 2 of `rosa list clusters`) of this run's clusters.
while IFS='' read -r line; do CLUSTERS_CREATED_LIST+=("$line"); done < <(rosa list clusters | grep "${EXECUTION_SEED}" | awk '{print $2}')
CLUSTERS_CREATED_TOTAL="$(rosa list clusters | grep -c "${EXECUTION_SEED}")"
if [ "${CLUSTERS_CREATED_TOTAL}" -lt "${NUMBER_OF_CLUSTERS}" ] ; then
log "INFO: Clusters created (${CLUSTERS_CREATED_TOTAL}) under threshold (${NUMBER_OF_CLUSTERS}), creating a new one" | tee /dev/fd/3
((CLUSTER_INDEX+=1))
# New clusters are named <seed>-NNNN with a zero-padded, monotonically increasing index.
_create_cluster "${EXECUTION_SEED}-$(printf "%0*d\n" 4 ${CLUSTER_INDEX})" "${AWS_SUBNETS}" "${PROVISION_SHARD}"
log "INFO: Waiting 60 seconds for the next check" | tee /dev/fd/3
sleep 60
else
log "INFO: Clusters created (${CLUSTERS_CREATED_TOTAL}) matching the threshold (${NUMBER_OF_CLUSTERS}), waiting 60 seconds to delete one of them" | tee /dev/fd/3
sleep 60
# Pick a random victim among this run's clusters.
RANDOM_CLUSTER_TO_DELETE="${CLUSTERS_CREATED_LIST[$((RANDOM % ${#CLUSTERS_CREATED_LIST[@]}))]}"
log "INFO: Selected cluster ${RANDOM_CLUSTER_TO_DELETE} for deletion" | tee /dev/fd/3
_delete_cluster "${RANDOM_CLUSTER_TO_DELETE}"
log "INFO: Waiting 60 seconds for the next check" | tee /dev/fd/3
sleep 60
fi
done

0 comments on commit ac90c07

Please sign in to comment.