Kf 1105 gh75 fix crd upgrade (#87)
Summary of changes:
- Added remove and upgrade integration tests that reproduce the issue.
- Added upgrade event handler.
- Modified K8S resources installation to use forced conflict resolution
  in the upgrade path.
- Added error handling and logging to indicate conflict scenario.
- Modified code to apply and then re-apply with forced conflict resolution,
  as suggested in the K8S docs (see the sketch after this list).
- Added remove_without_resources test case. This should complete the
  remove/upgrade test suite.
- Manually merged GenericCharmRuntimeError.
- Updated unit tests.
- Updated logging.
- Updated doc string.
- Changed the log message to indicate what happened; the error message is
  still displayed for clarity.
- Moved MaintenanceStatus to apply() to make it clearer what is going on.
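
A minimal sketch of the apply-then-force pattern from the K8S server-side apply
docs, using lightkube directly rather than the charm's resource handlers; the
field manager name and the ConfigMap object below are illustrative assumptions:

```python
from lightkube import ApiError, Client
from lightkube.models.meta_v1 import ObjectMeta
from lightkube.resources.core_v1 import ConfigMap

# field manager name and resource are assumptions for illustration only
client = Client(field_manager="training-operator-charm")
cm = ConfigMap(
    metadata=ObjectMeta(name="example-config", namespace="kubeflow"),
    data={"key": "value"},
)

try:
    # regular server-side apply first
    client.apply(cm)
except ApiError as error:
    if error.status.code == 409:
        # conflict: another field manager owns one of the fields;
        # re-apply with force=True to take ownership of those fields
        client.apply(cm, force=True)
    else:
        raise
```

In the charm this pattern is wrapped in `_apply_k8s_resources()` via the Chisme
resource handlers, with `force_conflicts` only set on the upgrade path.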

NOTES:
- The `install` event handler needs to be looked at. Applying K8S
resources in the install event handler might create issues, as described
in other repos/PRs. It was introduced some time ago to speed up
deployment; it is now proposed to review/remove it.
- Use of the Chisme update layer should be added as well (see the sketch
below).
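
A hedged sketch of what delegating to the Chisme update layer could look like,
assuming `charmed_kubeflow_chisme.pebble.update_layer(container_name, container,
layer, logger)` is available as used in other Kubeflow charms; the container
name, command, and layer contents below are illustrative only:

```python
from charmed_kubeflow_chisme.exceptions import ErrorWithStatus
from charmed_kubeflow_chisme.pebble import update_layer  # assumed helper location
from ops.pebble import Layer


def _update_layer(self) -> None:
    """Sketch: replace the hand-rolled add_layer/replan with Chisme's update_layer."""
    # illustrative layer; the real entrypoint/command may differ
    new_layer = Layer(
        {
            "services": {
                "training-operator": {
                    "override": "replace",
                    "summary": "training-operator service",
                    "command": "/manager",  # assumed entrypoint
                    "startup": "enabled",
                }
            }
        }
    )
    try:
        # self.container is assumed to be the workload Container object
        update_layer("training-operator", self.container, new_layer, self.logger)
    except ErrorWithStatus as error:
        self.model.unit.status = error.status
        self.logger.error(f"Failed to update layer: {error}")
```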

---------
Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
Co-authored-by: Maksim Beliaev <beliaev.m.s@gmail.com>
Co-authored-by: Daniela Plascencia <daniela.plascencia@canonical.com>
4 people committed Mar 28, 2023
1 parent a80b922 commit 2149ca3
Showing 3 changed files with 195 additions and 20 deletions.
77 changes: 60 additions & 17 deletions src/charm.py
@@ -60,9 +60,9 @@ def __init__(self, *args):

self.dashboard_provider = GrafanaDashboardProvider(self)

self.framework.observe(self.on.upgrade_charm, self.main)
self.framework.observe(self.on.config_changed, self.main)
self.framework.observe(self.on.leader_elected, self.main)
self.framework.observe(self.on.upgrade_charm, self._on_upgrade)
self.framework.observe(self.on.config_changed, self._on_event)
self.framework.observe(self.on.leader_elected, self._on_event)
self.framework.observe(self.on.training_operator_pebble_ready, self._on_pebble_ready)
self.framework.observe(self.on.install, self._on_install)
self.framework.observe(self.on.remove, self._on_remove)
@@ -140,14 +140,45 @@ def _check_leader(self):
self.logger.info("Not a leader, skipping setup")
raise ErrorWithStatus("Waiting for leadership", WaitingStatus)

def _deploy_k8s_resources(self) -> None:
"""Deploys K8S resources."""
def _check_and_report_k8s_conflict(self, error):
"""Returns True if error status code is 409 (conflict), False otherwise."""
if error.status.code == 409:
self.logger.warning(f"Encountered a conflict: {str(error)}")
return True
return False

def _apply_k8s_resources(self, force_conflicts: bool = False) -> None:
"""Applies K8S resources.
Args:
force_conflicts (bool): *(optional)* Will "force" apply requests, causing any conflicting
        fields to change ownership to the field manager used in this
        charm.
NOTE: This is only used if the initial regular apply() fails.
"""
self.unit.status = MaintenanceStatus("Creating K8S resources")
try:
self.unit.status = MaintenanceStatus("Creating K8S resources")
self.k8s_resource_handler.apply()
except ApiError as error:
if self._check_and_report_k8s_conflict(error) and force_conflicts:
# conflict detected when applying K8S resources
# re-apply K8S resources with forced conflict resolution
self.unit.status = MaintenanceStatus("Force applying K8S resources")
self.logger.warning("Applying K8S resources with conflict resolution")
self.k8s_resource_handler.apply(force=force_conflicts)
else:
raise GenericCharmRuntimeError("K8S resources creation failed") from error
try:
self.crd_resource_handler.apply()
except ApiError as e:
raise GenericCharmRuntimeError("Failed to create K8S resources") from e
except ApiError as error:
if self._check_and_report_k8s_conflict(error) and force_conflicts:
# conflict detected when applying CRD resources
# re-apply CRD resources with forced conflict resolution
self.unit.status = MaintenanceStatus("Force applying CRD resources")
self.logger.warning("Applying CRD resources with conflict resolution")
self.crd_resource_handler.apply(force=force_conflicts)
else:
raise GenericCharmRuntimeError("CRD resources creation failed") from error
self.model.unit.status = MaintenanceStatus("K8S resources created")

def _update_layer(self) -> None:
@@ -163,12 +194,17 @@ def _update_layer(self) -> None:
except ChangeError as e:
raise GenericCharmRuntimeError("Failed to replan") from e

def main(self, _) -> None:
"""Perform all required actions the Charm."""
def _on_event(self, _, force_conflicts: bool = False) -> None:
"""Perform all required actions for the Charm.
Args:
    force_conflicts (bool): Should only be used when conflicts on K8S resources
        need to be resolved.
"""
try:
self._check_container_connection()
self._check_leader()
self._deploy_k8s_resources()
self._apply_k8s_resources(force_conflicts=force_conflicts)
self._update_layer()
except ErrorWithStatus as error:
self.model.unit.status = error.status
@@ -178,12 +214,17 @@ def main(self, _) -> None:

def _on_pebble_ready(self, _):
"""Configure started container."""
self.main(_)
self._on_event(_)

def _on_install(self, _):
"""Perform installation only actions."""
self._check_container_connection()
self._on_pebble_ready(_)
# apply K8S resources to speed up deployment
self._apply_k8s_resources()

def _on_upgrade(self, _):
"""Perform upgrade steps."""
# force conflict resolution in K8S resources update
self._on_event(_, force_conflicts=True)

def _on_remove(self, _):
"""Remove all resources."""
@@ -193,9 +234,11 @@ def _on_remove(self, _):
try:
delete_many(self.crd_resource_handler.lightkube_client, crd_resources_manifests)
delete_many(self.k8s_resource_handler.lightkube_client, k8s_resources_manifests)
except ApiError as e:
self.logger.warning(f"Failed to delete K8S resources, with error: {e}")
raise e
except ApiError as error:
# do not log/report when resources were not found
if error.status.code != 404:
self.logger.error(f"Failed to delete K8S resources, with error: {error}")
raise error
self.unit.status = MaintenanceStatus("K8S resources removed")


125 changes: 125 additions & 0 deletions tests/integration/test_charm.py
@@ -13,12 +13,15 @@
import requests
import tenacity
import yaml
from lightkube.resources.apiextensions_v1 import CustomResourceDefinition
from lightkube.resources.rbac_authorization_v1 import ClusterRole
from pytest_operator.plugin import OpsTest

logger = logging.getLogger(__name__)

METADATA = yaml.safe_load(Path("./metadata.yaml").read_text())
APP_NAME = "training-operator"
CHARM_LOCATION = None


@pytest.mark.abort_on_fail
Expand All @@ -39,6 +42,10 @@ async def test_build_and_deploy(ops_test: OpsTest):
)
assert ops_test.model.applications[APP_NAME].units[0].workload_status == "active"

# store charm location in global to be used in other tests
global CHARM_LOCATION
CHARM_LOCATION = charm_under_test


def lightkube_create_global_resources() -> dict:
"""Returns a dict with GenericNamespacedResource as value for each CRD key."""
@@ -175,3 +182,121 @@ async def test_prometheus_grafana_integration(ops_test: OpsTest):
wait=tenacity.wait_exponential(multiplier=1, min=1, max=10),
reraise=True,
)


@pytest.mark.abort_on_fail
async def test_remove_with_resources_present(ops_test: OpsTest):
"""Test remove with all resources deployed.
Verify that all deployed resources that need to be removed are removed.
This test should run right before test_upgrade(), because it removes the deployed charm.
"""
# remove deployed charm and verify that it is removed
await ops_test.model.remove_application(app_name=APP_NAME, block_until_done=True)
assert APP_NAME not in ops_test.model.applications

# verify that all resources that were deployed are removed
lightkube_client = lightkube.Client()
crd_list = lightkube_client.list(
CustomResourceDefinition,
labels=[("app.juju.is/created-by", "training-operator")],
namespace=ops_test.model.name,
)
# testing for empty list (iterator)
_last = object()
assert next(crd_list, _last) is _last


@pytest.mark.abort_on_fail
async def test_upgrade(ops_test: OpsTest):
"""Test upgrade.
Verify that the whole upgrade process succeeds.
There should be no charm with APP_NAME deployed (after test_remove_with_resources_present()),
because this test deploys the stable version of this charm and performs an upgrade.
"""

# deploy stable version of the charm
await ops_test.model.deploy(entity_url=APP_NAME, channel="1.5/stable", trust=True)
await ops_test.model.wait_for_idle(
apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=60 * 10
)

# refresh (upgrade) using charm built in test_build_and_deploy()
# NOTE: using ops_test.juju() because there is no functionality to refresh in ops_test
image_path = METADATA["resources"]["training-operator-image"]["upstream-source"]
await ops_test.juju(
"refresh",
APP_NAME,
f"--path={CHARM_LOCATION}",
f'--resource="training-operator-image={image_path}"',
)
await ops_test.model.wait_for_idle(
apps=[APP_NAME], status="active", raise_on_blocked=True, timeout=60 * 10
)

# verify that all CRDs are installed
lightkube_client = lightkube.Client()
crd_list = lightkube_client.list(
CustomResourceDefinition,
labels=[("app.juju.is/created-by", "training-operator")],
namespace=ops_test.model.name,
)
# testing for non-empty list (iterator)
_last = object()
assert next(crd_list, _last) is not _last

# check that all CRDs are installed and versions are correct
test_crd_list = []
for crd in yaml.safe_load_all(Path("./src/templates/crds_manifests.yaml.j2").read_text()):
test_crd_list.append(
(
crd["metadata"]["name"],
crd["metadata"]["annotations"]["controller-gen.kubebuilder.io/version"],
)
)
for crd in crd_list:
assert (
(crd.metadata.name, crd.metadata.annotations["controller-gen.kubebuilder.io/version"])
) in test_crd_list

# verify that the ClusterRole is installed and its parameters are correct
cluster_role = lightkube_client.get(
ClusterRole,
name=f"{ops_test.model.name}-{APP_NAME}-charm",
namespace=ops_test.model.name,
)
for rule in cluster_role.rules:
if rule.apiGroups == "kubeflow.org":
assert "paddlejobs" in rule.resources


@pytest.mark.abort_on_fail
async def test_remove_without_resources(ops_test: OpsTest):
"""Test remove when no resources are present.
Verify that the application is removed and not stuck in an error state.
This test should be last in the test suite, after test_upgrade(), because it removes the
deployed charm.
"""

# remove all CRDs
lightkube_client = lightkube.Client()
crd_list = lightkube_client.list(
CustomResourceDefinition,
labels=[("app.juju.is/created-by", "training-operator")],
namespace=ops_test.model.name,
)
for crd in crd_list:
lightkube_client.delete(
CustomResourceDefinition,
name=crd.metadata.name,
namespace=ops_test.model.name,
)

# remove deployed charm and verify that it is removed successfully
await ops_test.model.remove_application(app_name=APP_NAME, block_until_done=True)
assert APP_NAME not in ops_test.model.applications
13 changes: 10 additions & 3 deletions tests/unit/test_charm.py
@@ -51,7 +51,14 @@ def harness() -> Harness:
class TestCharm:
"""Test class for TrainingOperatorCharm."""

def test_not_leader(self, harness: Harness):
@patch("charm.TrainingOperatorCharm.k8s_resource_handler")
@patch("charm.TrainingOperatorCharm.crd_resource_handler")
def test_not_leader(
self,
_: MagicMock, # k8s_resource_handler
___: MagicMock, # crd_resource_handler
harness: Harness,
):
"""Test not a leader scenario."""
harness.begin_with_initial_hooks()
harness.container_pebble_ready("training-operator")
@@ -104,15 +111,15 @@ def test_pebble_layer(

@patch("charm.TrainingOperatorCharm.k8s_resource_handler")
@patch("charm.TrainingOperatorCharm.crd_resource_handler")
def test_deploy_k8s_resources_success(
def test_apply_k8s_resources_success(
self,
k8s_resource_handler: MagicMock,
crd_resource_handler: MagicMock,
harness: Harness,
):
"""Test if K8S resource handler is executed as expected."""
harness.begin()
harness.charm._deploy_k8s_resources()
harness.charm._apply_k8s_resources()
crd_resource_handler.apply.assert_called()
k8s_resource_handler.apply.assert_called()
assert isinstance(harness.charm.model.unit.status, MaintenanceStatus)
