Merge branch 'bugfix/fix-nightly-issues' into tmp/octopus/w/128.0/bug…

…fix/fix-nightly-issues
scality · Apr 10, 2024 · e372cd1 · e372cd1
2 parents a66c155 + b58f98b
commit e372cd1
Show file tree

Hide file tree

Showing 14 changed files with 421 additions and 26 deletions.
diff --git a/.github/actions/generate-snapshots/action.yaml b/.github/actions/generate-snapshots/action.yaml
@@ -22,6 +22,11 @@ inputs:
 runs:
   using: "composite"
   steps:
+    - name: Disable kubelet on every node
+      uses: ./.github/actions/run-command-ssh-all-nodes
+      with:
+        NODES_COUNT: "${{ inputs.nodes-count }}"
+        COMMAND: "sudo systemctl disable kubelet"
     - name: Install dependencies for Openstack
       if: contains('ovh', inputs.cloud)
       run: |

diff --git a/.github/actions/run-bootstrap/action.yaml b/.github/actions/run-bootstrap/action.yaml
@@ -14,3 +14,4 @@ runs:
       uses: ./.github/actions/run-command-ssh
       with:
         COMMAND: sudo ${{ inputs.MOUNTPOINT }}/bootstrap.sh --verbose
+        SSH_OPTIONS: "-o ServerAliveInterval=15"
diff --git a/.github/actions/run-command-ssh-all-nodes/action.yaml b/.github/actions/run-command-ssh-all-nodes/action.yaml
@@ -0,0 +1,34 @@
+name: "run command ssh on all nodes"
+description: "Run a command over ssh on all nodes"
+
+inputs:
+  COMMAND:
+    description: "the command to run"
+    required: true
+  NODES_COUNT:
+    description: number of nodes
+    required: false
+    default: "0"
+  SSH_OPTIONS:
+    description: "Additional args passed to ssh"
+    required: false
+    default: ""
+
+runs:
+  using: composite
+  steps:
+    ## Here we cannot use the ssh-command action as we need to run this in a loop
+    ## that is generated at the runtime.
+    - shell: bash
+      run: test -f ssh_config
+    - name: build hosts list
+      shell: bash
+      run: |
+        export HOSTS_LIST=""
+        echo "HOSTS_LIST=bootstrap$(seq -s '' --format ' node-%g' 1 ${{ inputs.NODES_COUNT }})" >> $GITHUB_ENV
+    - name: run the command on all nodes
+      shell: bash
+      run: |
+        for host in ${HOSTS_LIST}; do
+          ssh -F ssh_config ${{ inputs.SSH_OPTIONS }} ${host} "${{ inputs.COMMAND }}"
+        done
diff --git a/.github/actions/run-downgrade/action.yaml b/.github/actions/run-downgrade/action.yaml
@@ -18,3 +18,4 @@ runs:
         COMMAND: >
           sudo ${{ steps.metalk8s_mountpoint.outputs.mountpoint }}/downgrade.sh
           --destination-version ${{ inputs.version }} --verbose
+        SSH_OPTIONS: "-o ServerAliveInterval=15"
diff --git a/.github/actions/run-script-ssh/action.yaml b/.github/actions/run-script-ssh/action.yaml
@@ -0,0 +1,34 @@
+name: "run script over ssh"
+description: "run a script on the remote node over ssh"
+
+inputs:
+  SCRIPT:
+    description: "the script to run"
+    required: true
+  RUN_WITH:
+    description: "the interpreter to run the script"
+    required: false
+    default: "bash"
+  ENV:
+    description: "the environment variables for that run"
+    required: false
+    default: ""
+  NODE:
+    description: "the remote node name"
+    required: false
+    default: "bootstrap"
+
+runs:
+  using: composite
+  steps:
+    - name: check ssh_config file
+      shell: bash
+      run: test -f ssh_config
+    - name: check script to run
+      shell: bash
+      run: test -f ${{ inputs.SCRIPT }}
+    - name: run the script on the remote node
+      shell: bash
+      run: >
+        ssh -F ssh_config "${{ inputs.NODE }}"
+        "${{ inputs.ENV }} ${{ inputs.RUN_WITH }}" < ${{ inputs.SCRIPT }}
diff --git a/.github/actions/run-upgrade/action.yaml b/.github/actions/run-upgrade/action.yaml
@@ -13,3 +13,4 @@ runs:
       uses: ./.github/actions/run-command-ssh
       with:
         COMMAND: sudo /srv/scality/metalk8s-${{ inputs.version }}/upgrade.sh --verbose
+        SSH_OPTIONS: "-o ServerAliveInterval=15"
diff --git a/.github/scripts/stabilize_snapshot.py b/.github/scripts/stabilize_snapshot.py
@@ -0,0 +1,276 @@
+"""Stabilize a MetalK8s cluster built from snapshots.
+
+This script is meant to run after a fresh spawn of MetalK8s snapshot images, to ensure
+it is ready for running tests (such as an upgrade).
+
+Since it is designed for use in CI, it retrieves its configuration options from
+environment variables.
+
+Assumption is made that the script is executed from the bootstrap node, with sufficient
+permissions to execute `kubectl` (with `/etc/kubernetes/admin.conf`) and `crictl`
+commands.
+"""
+
+import json
+import os
+import subprocess
+import sys
+import time
+
+
+# Utils {{{
+
+TRUTHY_VALUES = {"y", "yes", "t", "true", "on", "1"}
+FALSY_VALUES = {"n", "no", "f", "false", "off", "0"}
+
+
+def strtobool(value):
+    """Re-implementation of the now deprecated `distutils.util.strtobool` function."""
+    if not isinstance(value, str):
+        raise ValueError("Not a string")
+    if value.lower() in TRUTHY_VALUES:
+        return True
+    if value.lower() in FALSY_VALUES:
+        return False
+    raise ValueError(f"Unrecognized value: '{value}'")
+
+
+def get_env(key, cast=None, default=None):
+    value = os.environ.get(key, default=default)
+    return cast(value) if cast is not None else value
+
+
+def env_switch(key, default=False):
+    return get_env(key, cast=strtobool, default="y" if default else "n")
+
+
+def die(message):
+    print(message, file=sys.stderr)
+    sys.exit(1)
+
+
+def run(*args, capture_output=False, **kwargs):
+    return subprocess.run(
+        args,
+        stdout=subprocess.PIPE if capture_output else None,
+        stderr=subprocess.PIPE if capture_output else None,
+        **kwargs,
+    )
+
+
+def get_kubeconfig_arg():
+    return f"--kubeconfig={os.environ.get('KUBECONFIG', '/etc/kubernetes/admin.conf')}"
+
+
+def kubectl(*args, parse_json=False, **kwargs):
+    if parse_json:
+        result = run(
+            "kubectl",
+            get_kubeconfig_arg(),
+            *args,
+            "-o=json",
+            capture_output=True,
+            **kwargs,
+        )
+        return json.loads(result.stdout)
+    return run("kubectl", get_kubeconfig_arg(), *args, **kwargs)
+
+
+def get_salt_master():
+    result = run(
+        "crictl",
+        "ps",
+        "--quiet",
+        "--label=io.kubernetes.container.name=salt-master",
+        "--state=Running",
+        capture_output=True,
+    )
+    container_id = result.stdout.decode().strip()
+    if not container_id:
+        print("Failed to find salt-master container", file=sys.stderr)
+        return None
+    return container_id
+
+
+def is_crashlooping(pod):
+    """Tell whether a container of `pod` is not ready and in CrashLoopBackOff."""
+    for status in pod["status"].get("containerStatuses", []):  # may be absent
+        if status["ready"]:
+            continue
+        if status["state"].get("waiting", {}).get("reason") == "CrashLoopBackOff":
+            return True
+    return False
+
+
+def are_pods_stabilized(duration):
+    try:
+        kubectl(
+            "get",
+            "pods",
+            "--all-namespaces",
+            "--selector=!job-name",
+            "--no-headers",
+            "--watch-only",
+            capture_output=True,
+            timeout=duration,
+        )
+    except subprocess.TimeoutExpired as exc:
+        if exc.stdout:
+            print(f"Pods are still unstable:\n{exc.stdout.decode()}", file=sys.stderr)
+            return False
+
+    try:
+        pod_list = kubectl(
+            "get", "pods", "--all-namespaces", "--selector=!job-name", parse_json=True
+        )
+    except json.decoder.JSONDecodeError as exc:
+        print(f"Error parsing JSON output when getting pods: {exc}", file=sys.stderr)
+        return False
+
+    return not any(map(is_crashlooping, pod_list["items"]))
+
+
+# }}}
+# Main logic {{{
+
+
+def wait_for_salt(get_master_attempts=60, ping_minions_attempts=10, sleep_duration=5):
+    """Wait for Salt master and minions to become ready using crictl."""
+    print("Waiting for Salt master container...")
+    salt_master_container_id = None
+    for _ in range(get_master_attempts):
+        salt_master_container_id = get_salt_master()
+        if salt_master_container_id is not None:
+            print(f"Found Salt master! ({salt_master_container_id})")
+            break
+        time.sleep(sleep_duration)
+    else:
+        die(
+            "Failed to find a running Salt master container "
+            f"after {get_master_attempts} attempts."
+        )
+
+    print("Waiting for Salt minions to respond...")
+    for _ in range(ping_minions_attempts):
+        try:
+            run(
+                "crictl",
+                "exec",
+                salt_master_container_id,
+                "salt",
+                "*",
+                "test.ping",
+                check=True,
+            )
+        except subprocess.CalledProcessError:
+            time.sleep(sleep_duration)  # back off before the next attempt
+        else:
+            print("Minions responded!")
+            break
+    else:
+        die(f"Failed to reach all Salt minions after {ping_minions_attempts} attempts.")
+
+
+def wait_pods_stable(attempts=30, sleep_duration=5, stabilization_duration=30):
+    """Wait for pods to stabilize in a given state."""
+    print("Waiting for pods to stabilize...")
+    start = time.time()
+    for attempt in range(attempts):
+        print(f"Attempt {attempt + 1}/{attempts}")
+        if are_pods_stabilized(stabilization_duration):
+            break
+        time.sleep(sleep_duration)
+    else:
+        res = kubectl(
+            "get",
+            "pods",
+            "--all-namespaces",
+            "--selector=!job-name",
+            capture_output=True,
+        )
+        die(
+            f"Pods did not stabilize after {(time.time() - start):.1f} seconds."
+            f"\n\n{res.stdout.decode()}"  # stdout is captured as bytes
+        )
+    print(f"Pods are stable [{(time.time() - start):.1f}]")
+
+
+def check_pods_running():
+    """Check that all pods are in Running state."""
+    print("Checking that all pods are running...")
+    try:
+        kubectl(
+            "wait",
+            "pods",
+            "--all",
+            "--all-namespaces",
+            "--for=condition=Ready",
+            "--timeout=10s",
+            "--selector=!job-name",  # We filter out Jobs (they can't be Ready)
+            capture_output=True,
+            check=True,
+        )
+    except subprocess.CalledProcessError as exc:
+        die(
+            f"Not all pods are running:\nstdout:\n{exc.stdout.decode()}\n"
+            f"stderr:\n{exc.stderr.decode()}"
+        )
+    print("All pods are running!")
+
+
+def check_no_disk_pressure(check_count=12, sleep_duration=10, wait_timeout=600):
+    """Check that nodes do not suffer from disk pressure."""
+    print("Checking that nodes are not suffering from disk pressure...")
+    for _ in range(check_count):
+        try:
+            kubectl(
+                "wait",
+                "nodes",
+                "--all",
+                "--for=condition=DiskPressure=False",
+                f"--timeout={wait_timeout}s",
+                capture_output=True,
+                check=True,
+            )
+        except subprocess.CalledProcessError:
+            run("crictl", "exec", get_salt_master(), "salt", "*", "disk.percent")
+            die(f"Some nodes still have disk pressure after {wait_timeout} seconds.")
+        else:
+            time.sleep(sleep_duration)
+    print("Nodes are OK!")
+
+
+def main():
+    """Main routine for the stabilize_snapshot script."""
+
+    common_sleep_duration = get_env("SLEEP_TIME", cast=int, default=5)
+    wait_for_salt(
+        get_master_attempts=get_env("WAIT_SALT_MASTER_ATTEMPTS", cast=int, default=60),
+        ping_minions_attempts=get_env(
+            "PING_SALT_MINIONS_ATTEMPTS", cast=int, default=10
+        ),
+        sleep_duration=common_sleep_duration,
+    )
+
+    wait_pods_stable(
+        attempts=get_env("STABILIZATION_ATTEMPTS", cast=int, default=30),
+        sleep_duration=common_sleep_duration,
+        stabilization_duration=get_env("STABILIZATION_TIME", cast=int, default=120),
+    )
+
+    check_pods_running()
+
+    if env_switch("CHECK_DISK_PRESSURE", True):
+        check_no_disk_pressure(
+            check_count=get_env("CHECK_DISK_PRESSURE_ATTEMPTS", cast=int, default=6),
+            sleep_duration=common_sleep_duration,
+            wait_timeout=get_env("CHECK_DISK_PRESSURE_TIMEOUT", cast=int, default=600),
+        )
+
+    print("Cluster is ready!")
+
+
+# }}}
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/spawn/tfvars/common.tfvars b/.github/spawn/tfvars/common.tfvars
@@ -1,3 +1,2 @@
 component                   = "metalk8s"
 offline                     = false
-use_proxy                   = false