Skip to content

Commit

Permalink
Included loop script and optimization (#20)
Browse files Browse the repository at this point in the history
* let user run workload alone independently
set platform dict during workload

* set operator roles prefix to delete

* include loop script

* fixed shell checks

* renaming log file to avoid overwriting it

* missing minute conversion for delay_between_cleanup
  • Loading branch information
mukrishn authored Mar 4, 2024
1 parent 40d4eb8 commit ac90c07
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 6 deletions.
1 change: 1 addition & 0 deletions hcp-burner.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@

if 'enabled' in platform.environment['load'] and str(platform.environment['load']['enabled']).lower() == "true":
# Prometheus takes a lot of time to start after all nodes are ready. We may need to increase this sleep in the future
platform = utils.get_cluster_info(platform)
logging.info("Waiting 5 minutes to allow all clusters to create all pods")
time.sleep(300)
load_threads = utils.load_scheduler(platform)
Expand Down
11 changes: 6 additions & 5 deletions libs/platforms/rosa/hypershift/hypershift.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def _verify_provision_shard(self):
self.logging.error(f"No Provision Shard found for Service Cluster {self.environment['service_cluster']} on {self.environment['aws']['region']}")
return None

def _get_mc(self, cluster_id):
def get_mc(self, cluster_id):
self.logging.debug(f"Get the mgmt cluster of cluster {cluster_id}")
resp_code, resp_out, resp_err = self.utils.subprocess_exec(
"ocm get /api/clusters_mgmt/v1/clusters/" + cluster_id + "/hypershift",
Expand Down Expand Up @@ -268,7 +268,7 @@ def delete_cluster(self, platform, cluster_name):
cluster_start_time = int(datetime.datetime.utcnow().timestamp())
cluster_info["uuid"] = self.environment["uuid"]
cluster_info["install_method"] = "rosa"
cluster_info["mgmt_cluster_name"] = self._get_mc(cluster_info["metadata"]["cluster_id"])
cluster_info["mgmt_cluster_name"] = self.get_mc(cluster_info["metadata"]["cluster_id"])
self.logging.info(f"Deleting cluster {cluster_name} on Hypershift Platform")
cleanup_code, cleanup_out, cleanup_err = self.utils.subprocess_exec("rosa delete cluster -c " + cluster_name + " -y --watch", cluster_info["path"] + "/cleanup.log", {'preexec_fn': self.utils.disable_signals})
cluster_delete_end_time = int(datetime.datetime.utcnow().timestamp())
Expand All @@ -281,10 +281,11 @@ def delete_cluster(self, platform, cluster_name):
)
if check_code != 0:
cluster_info["status"] = "deleted"
operator_role_prefix = cluster_info["metadata"]["operator_role_prefix"]
self.logging.debug(
f"Destroying STS associated resources of cluster name: {cluster_name}"
)
(operators_code, operators_out, operators_err) = self.utils.subprocess_exec("rosa delete operator-roles --prefix " + cluster_name + " -m auto -y", cluster_info["path"] + "/cleanup.log", {'preexec_fn': self.utils.disable_signals})
(operators_code, operators_out, operators_err) = self.utils.subprocess_exec("rosa delete operator-roles --prefix " + operator_role_prefix + " -m auto -y", cluster_info["path"] + "/operator-role-cleanup.log", {'preexec_fn': self.utils.disable_signals})
if operators_code != 0:
self.logging.error(
f"Failed to delete operator roles on cluster {cluster_name}"
Expand Down Expand Up @@ -431,7 +432,7 @@ def create_cluster(self, platform, cluster_name):
return 0
self.logging.info("Cluster Create Command:")
self.logging.info(cluster_cmd)
(create_cluster_code, create_cluster_out, create_cluster_err) = self.utils.subprocess_exec(" ".join(str(x) for x in cluster_cmd), cluster_info["path"] + "/installation.log", {'preexec_fn': self.utils.disable_signals})
(create_cluster_code, create_cluster_out, create_cluster_err) = self.utils.subprocess_exec(" ".join(str(x) for x in cluster_cmd), cluster_info["path"] + "/rosa-create.log", {'preexec_fn': self.utils.disable_signals})
trying += 1
if create_cluster_code != 0:
cluster_info["install_try"] = trying
Expand Down Expand Up @@ -470,7 +471,7 @@ def create_cluster(self, platform, cluster_name):
cluster_info["preflight_checks"] = preflight_ch.result()
cluster_info["sc_namespace_timing"] = sc_namespace.result() - cluster_start_time if platform.environment["sc_kubeconfig"] != "" else None

mgmt_cluster_name = self._get_mc(cluster_info["metadata"]["cluster_id"])
mgmt_cluster_name = self.get_mc(cluster_info["metadata"]["cluster_id"])
self.environment["mc_kubeconfig"] = self.download_kubeconfig(mgmt_cluster_name, self.environment["path"])
mc_namespace = executor.submit(self._namespace_wait, platform.environment["mc_kubeconfig"], cluster_info["metadata"]["cluster_id"], cluster_name, "Management") if platform.environment["mc_kubeconfig"] != "" else 0
cluster_info["mc_namespace_timing"] = mc_namespace.result() - cluster_start_time if platform.environment["mc_kubeconfig"] != "" else None
Expand Down
1 change: 1 addition & 0 deletions libs/platforms/rosa/rosa.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ def get_metadata(self, cluster_name):
metadata["status"] = result.get("state", None)
metadata["version"] = result.get("version", {}).get("raw_id", None)
metadata["zones"] = result.get("nodes", {}).get("availability_zones", None)
metadata["operator_role_prefix"] = result.get("aws", {}).get("sts", {}).get("operator_role_prefix", None)
return metadata

def _preflight_wait(self, cluster_id, cluster_name):
Expand Down
6 changes: 5 additions & 1 deletion libs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def cleanup_scheduler(self, platform):
self.logging.info(
f"Waiting {platform.environment['delay_between_cleanup']} minutes before deleting the next cluster"
)
time.sleep(platform.environment["delay_between_cleanup"])
time.sleep(platform.environment["delay_between_cleanup"] * 60)
return delete_cluster_thread_list

# To form the cluster_info dict for cleanup functions
Expand All @@ -130,6 +130,10 @@ def get_cluster_info(self, platform):
platform.environment["clusters"][cluster_name]["metadata"] = platform.get_metadata(cluster_name)
platform.environment["clusters"][cluster_name]["status"] = platform.environment["clusters"][cluster_name]["metadata"]["status"]
platform.environment["clusters"][cluster_name]["path"] = platform.environment["path"] + "/" + cluster_name
platform.environment["clusters"][cluster_name]["kubeconfig"] = platform.environment["clusters"][cluster_name]["path"] + "/kubeconfig"
platform.environment['clusters'][cluster_name]['workers'] = int(platform.environment["workers"].split(",")[(loop_counter - 1) % len(platform.environment["workers"].split(","))])
cluster_mc = platform.get_mc(platform.get_cluster_id(cluster_name))
platform.environment["mc_kubeconfig"] = platform.environment["path"] + "/kubeconfig_" + cluster_mc
return platform

def load_scheduler(self, platform):
Expand Down
70 changes: 70 additions & 0 deletions utils/loop_script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/bin/bash

# On Ctrl-C / kill, tear down every cluster created by this run (the trap body
# is single-quoted so ${EXECUTION_SEED} is expanded at signal time, after it is set below).
trap '_delete_clusters ${EXECUTION_SEED}' SIGINT SIGTERM

### EDIT VARIABLES
LOG_FILE="${LOG_FILE:-/tmp/install_loop.log}"
NUMBER_OF_CLUSTERS="${NUMBER_OF_CLUSTERS:-1}"
CLUSTER_NAME_SEED="${CLUSTER_NAME_SEED:-loop}"
AWS_SUBNETS="${AWS_SUBNETS:-subnet-xxxxxxx,subnet-xxxxxx,subnet-xxxxxx,subnet-xxxxxx,subnet-xxxxxx,subnet-xxxxxx}"
PROVISION_SHARD="${PROVISION_SHARD:-provision_shard_id:xxxxxx}"
OIDC_CONFIG_ID="${OIDC_CONFIG_ID:-xxxxxxxxxxxx}"
# FIX: was "${OPERATOR_ROLES_PREFIX:loop-role}" — without the '-' that is a
# substring expansion, so the intended 'loop-role' default was never applied.
OPERATOR_ROLES_PREFIX="${OPERATOR_ROLES_PREFIX:-loop-role}"
export AWS_REGION="${AWS_REGION:-us-east-2}"
###

# Everything written to stdout/stderr goes to the log file from here on;
# fd 3 keeps pointing at the original console so `tee /dev/fd/3` can surface messages.
exec 3>&1 1>>"${LOG_FILE}" 2>&1

# Emit a message prefixed with a "YYYY-mm-dd HH:MM:SS" timestamp, tab-separated.
# %b keeps the escape-interpreting behaviour of the original `echo -e`.
log() {
    local msg="$1"
    printf '%s\t%b\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${msg}"
}

# Signal handler: delete every cluster whose `rosa list clusters` row matches
# the seed passed as $1 (column 1 = cluster id), then terminate the script.
_delete_clusters() {
    local seed="$1"
    local cluster_id
    log "INFO: Captured Control-C key, deleting all clusters with ${seed} seed" | tee /dev/fd/3
    for cluster_id in $(rosa list clusters | grep "${seed}" | awk '{print $1}'); do
        log "INFO: Deleting cluster ${cluster_id}" | tee /dev/fd/3
        _delete_cluster "${cluster_id}"
        log "INFO: Cluster ${cluster_id} deleted" | tee /dev/fd/3
    done
    exit 0
}

# Create a hosted-control-plane ROSA cluster.
#   $1 = cluster name, $2 = comma-separated subnet ids, $3 = --properties value
# Returns 1 (and logs an error) if creation does not finish within 15 minutes.
_create_cluster() {
    log "INFO: Creating cluster $1" | tee /dev/fd/3
    # Timeout of 15 minutes for creating the cluster (3 times of normal execution)
    # FIX: the failure branch is a { } group, not a ( ) subshell — `return` in a
    # subshell only exits the subshell, so the function used to fall through and
    # log a false "created" message after a failed create.
    timeout --foreground -k 900 900 rosa create cluster -c "$1" --replicas 3 --hosted-cp --sts --mode auto -y --watch --multi-az --subnet-ids "$2" --properties "$3" --oidc-config-id "${OIDC_CONFIG_ID}" --operator-roles-prefix "${OPERATOR_ROLES_PREFIX}" --compute-machine-type m5.xlarge --version 4.14.2 || { log "ERROR: Failed to create cluster $1 after 15 minutes"; return 1; }
    # FIX: dropped stray '3>&1 1>>"${LOG_FILE}"' redirections that re-pointed fd 3
    # at the log file and starved the tee, hiding this message from the console.
    log "INFO: Cluster $1 created" | tee /dev/fd/3
}

# Delete cluster $1; returns 1 (and logs an error) if it takes over 60 minutes.
_delete_cluster(){
    # Timeout of 60 minutes for cleaning the cluster (3 times of normal execution)
    # FIX: error message said "30 minutes" but the timeout is 3600 s (60 min);
    # also use a { } group so `return 1` actually returns from the function
    # instead of only exiting a subshell.
    timeout --foreground -k 3600 3600 rosa delete cluster -c "$1" -y --watch || { log "ERROR: Failed to delete cluster $1 after 60 minutes"; return 1; }
    # # Timeout of 5 minutes for Roles and OIDC
    # timeout --foreground -k 300 300 rosa delete operator-roles -c "$1" -m auto -y || (log "ERROR: Failed to delete operator roles of cluster $1 after 5 minutes" && return 1)
    # timeout --foreground -k 300 300 rosa delete oidc-provider -c "$1" -m auto -y || (log "ERROR: Failed to delete OIDC providers of cluster $1 after 5 minutes" && return 1)
}

CLUSTER_INDEX=0
# Random 3-lowercase-letter suffix so this run's clusters can be told apart
# from clusters of other concurrent/previous runs with the same name seed.
EXECUTION_SEED="${CLUSTER_NAME_SEED}"-$(tr -dc '[:lower:]' < /dev/urandom | fold -w 3 | head -n 1)
# Churn loop: keep NUMBER_OF_CLUSTERS clusters alive — create one when below
# the threshold, otherwise delete a random one. Runs until interrupted
# (SIGINT/SIGTERM fires the _delete_clusters trap set at the top of the file).
while true; do
CLUSTERS_CREATED_LIST=()
# Names (column 2 of `rosa list clusters`) of this run's clusters.
while IFS='' read -r line; do CLUSTERS_CREATED_LIST+=("$line"); done < <(rosa list clusters | grep "${EXECUTION_SEED}" | awk '{print $2}')
CLUSTERS_CREATED_TOTAL="$(rosa list clusters | grep -c "${EXECUTION_SEED}")"
if [ "${CLUSTERS_CREATED_TOTAL}" -lt "${NUMBER_OF_CLUSTERS}" ] ; then
log "INFO: Clusters created (${CLUSTERS_CREATED_TOTAL}) under threshold (${NUMBER_OF_CLUSTERS}), creating a new one" | tee /dev/fd/3
((CLUSTER_INDEX+=1))
# New clusters are named <seed>-NNNN with a zero-padded, monotonically increasing index.
_create_cluster "${EXECUTION_SEED}-$(printf "%0*d\n" 4 ${CLUSTER_INDEX})" "${AWS_SUBNETS}" "${PROVISION_SHARD}"
log "INFO: Waiting 60 seconds for the next check" | tee /dev/fd/3
sleep 60
else
log "INFO: Clusters created (${CLUSTERS_CREATED_TOTAL}) matching the threshold (${NUMBER_OF_CLUSTERS}), waiting 60 seconds to delete one of them" | tee /dev/fd/3
sleep 60
# Pick a random victim among this run's clusters.
RANDOM_CLUSTER_TO_DELETE="${CLUSTERS_CREATED_LIST[$((RANDOM % ${#CLUSTERS_CREATED_LIST[@]}))]}"
log "INFO: Selected cluster ${RANDOM_CLUSTER_TO_DELETE} for deletion" | tee /dev/fd/3
_delete_cluster "${RANDOM_CLUSTER_TO_DELETE}"
log "INFO: Waiting 60 seconds for the next check" | tee /dev/fd/3
sleep 60
fi
done

0 comments on commit ac90c07

Please sign in to comment.