Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(helm): make autoscaler deploys more robust #156

Merged
merged 3 commits into from
Jul 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions magnum_cluster_api/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,9 @@ def resize_cluster(
nodegroup.node_count = node_count
nodegroup.save()

resources.apply_cluster_from_magnum_cluster(context, self.k8s_api, cluster)
resources.apply_cluster_from_magnum_cluster(
context, self.k8s_api, cluster, skip_auto_scaling_release=True
)

def upgrade_cluster(
self,
Expand Down Expand Up @@ -270,7 +272,9 @@ def delete_cluster(self, context, cluster):
resources.ClusterAutoscalerHelmRelease(self.k8s_api, cluster).delete()

def create_nodegroup(self, context, cluster, nodegroup):
resources.apply_cluster_from_magnum_cluster(context, self.k8s_api, cluster)
resources.apply_cluster_from_magnum_cluster(
context, self.k8s_api, cluster, skip_auto_scaling_release=True
)

def update_nodegroup_status(self, context, cluster, nodegroup):
action = nodegroup.status.split("_")[0]
Expand Down Expand Up @@ -323,7 +327,9 @@ def update_nodegroup(self, context, cluster, nodegroup):
# NOTE(okozachenko1203): First we save the nodegroup status because update_cluster_status()
# could be finished before update_nodegroup().
nodegroup.save()
resources.apply_cluster_from_magnum_cluster(context, self.k8s_api, cluster)
resources.apply_cluster_from_magnum_cluster(
context, self.k8s_api, cluster, skip_auto_scaling_release=True
)
# NOTE(okozachenko1203): We set the cluster status as UPDATE_IN_PROGRESS again at the end because
# update_cluster_status() could be finished and cluster status has been set as
# UPDATE_COMPLETE before nodegroup_conductor.Handler.nodegroup_update finished.
Expand All @@ -338,6 +344,7 @@ def delete_nodegroup(self, context, cluster, nodegroup):
context,
self.k8s_api,
cluster,
skip_auto_scaling_release=True,
)

def get_monitor(self, context, cluster):
Expand Down
48 changes: 17 additions & 31 deletions magnum_cluster_api/helm.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@

import yaml
from oslo_concurrency import processutils
from oslo_log import log as logging

from magnum_cluster_api import exceptions

LOG = logging.getLogger(__name__)


class Command:
def __call__(self, *args, **kwargs):
Expand Down Expand Up @@ -58,19 +61,6 @@ def __call__(self):
raise


class GetStatusReleaseCommand(ReleaseCommand):
COMMAND = ["status"]

def __call__(self):
try:
return super().__call__()
except processutils.ProcessExecutionError as e:
if "release: not found" in e.stderr:
raise exceptions.HelmReleaseNotFound(self.release_name)
else:
raise


class UpgradeReleaseCommand(ReleaseCommand):
COMMAND = ["upgrade"]

Expand All @@ -81,24 +71,20 @@ def __init__(self, namespace, release_name, chart_ref, values={}):

def __call__(self):
try:
stdout, _ = GetStatusReleaseCommand(
namespace=self.namespace,
release_name=self.release_name,
)()

if "STATUS: pending" in stdout:
return "Other task is in progress"
except exceptions.HelmReleaseNotFound:
pass

return super().__call__(
self.chart_ref,
"--install",
"--wait",
"--values",
"-",
process_input=yaml.dump(self.values),
)
return super().__call__(
self.chart_ref,
"--install",
"--values",
"-",
process_input=yaml.dump(self.values),
)
except processutils.ProcessExecutionError as e:
if "UPGRADE FAILED: another operation" in e.stderr:
LOG.info("Helm release %s is already in progress", self.release_name)
elif "UPGRADE FAILED: release: already exists" in e.stderr:
LOG.info("Helm release %s already exists", self.release_name)
else:
raise


class DeleteReleaseCommand(ReleaseCommand):
Expand Down
4 changes: 3 additions & 1 deletion magnum_cluster_api/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -1788,6 +1788,7 @@ def apply_cluster_from_magnum_cluster(
api: pykube.HTTPClient,
cluster: magnum_objects.Cluster,
cluster_template: magnum_objects.ClusterTemplate = None,
skip_auto_scaling_release: bool = False,
) -> objects.Cluster:
"""
Create a ClusterAPI cluster given a Magnum Cluster object.
Expand All @@ -1814,7 +1815,8 @@ def apply_cluster_from_magnum_cluster(
ClusterResourcesConfigMap(context, api, cluster).apply()
ClusterResourceSet(api, cluster).apply()
Cluster(context, api, cluster).apply()
if utils.get_auto_scaling_enabled(cluster):

if not skip_auto_scaling_release and utils.get_auto_scaling_enabled(cluster):
ClusterAutoscalerHelmRelease(api, cluster).apply()


Expand Down
106 changes: 104 additions & 2 deletions magnum_cluster_api/tests/unit/test_helm.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,114 @@ def test_helm_upgrade(mocker):
[
mocker.call(
"helm",
"status",
"upgrade",
"--namespace",
namespace,
release_name,
chart_ref,
"--install",
"--values",
"-",
process_input=yaml.dump(values),
),
]
)


def test_helm_upgrade_with_in_progress_operation(mocker):
    """An 'another operation in progress' helm failure is tolerated.

    ``UpgradeReleaseCommand`` must swallow this specific
    ``ProcessExecutionError`` instead of propagating it, while still having
    attempted the expected ``helm upgrade --install`` invocation.
    """
    ns, release, chart = "test-namespace", "test-release", "test-chart"
    overrides = {"test": "value"}

    execute_mock = mocker.patch("oslo_concurrency.processutils.execute")
    execute_mock.side_effect = processutils.ProcessExecutionError(
        stderr="Error: UPGRADE FAILED: another operation (install/upgrade/rollback) is in progress"
    )

    # Must not raise despite the simulated helm failure.
    helm.UpgradeReleaseCommand(ns, release, chart, overrides)()

    expected_call = mocker.call(
        "helm",
        "upgrade",
        "--namespace",
        ns,
        release,
        chart,
        "--install",
        "--values",
        "-",
        process_input=yaml.dump(overrides),
    )
    execute_mock.assert_has_calls([expected_call])


def test_helm_upgrade_with_existing_release(mocker):
    """A 'release: already exists' helm failure is tolerated.

    ``UpgradeReleaseCommand`` must swallow this specific
    ``ProcessExecutionError`` instead of propagating it, while still having
    attempted the expected ``helm upgrade --install`` invocation.
    """
    ns, release, chart = "test-namespace", "test-release", "test-chart"
    overrides = {"test": "value"}

    execute_mock = mocker.patch("oslo_concurrency.processutils.execute")
    execute_mock.side_effect = processutils.ProcessExecutionError(
        stderr="Error: UPGRADE FAILED: release: already exists"
    )

    # Must not raise despite the simulated helm failure.
    helm.UpgradeReleaseCommand(ns, release, chart, overrides)()

    expected_call = mocker.call(
        "helm",
        "upgrade",
        "--namespace",
        ns,
        release,
        chart,
        "--install",
        "--values",
        "-",
        process_input=yaml.dump(overrides),
    )
    execute_mock.assert_has_calls([expected_call])


def test_helm_upgrade_with_unknown_error(mocker):
namespace = "test-namespace"
release_name = "test-release"
chart_ref = "test-chart"
values = {"test": "value"}

mock_execute = mocker.patch("oslo_concurrency.processutils.execute")
mock_execute.side_effect = processutils.ProcessExecutionError(
stderr="Error: UPGRADE FAILED: test-release has no deployed releases"
)
upgrade = helm.UpgradeReleaseCommand(
namespace,
release_name,
chart_ref,
values,
)

with pytest.raises(processutils.ProcessExecutionError):
upgrade()

mock_execute.assert_has_calls(
[
mocker.call(
"helm",
"upgrade",
Expand All @@ -52,7 +155,6 @@ def test_helm_upgrade(mocker):
release_name,
chart_ref,
"--install",
"--wait",
"--values",
"-",
process_input=yaml.dump(values),
Expand Down