From ee176197b421866b12b14efa14c0490a085d1d19 Mon Sep 17 00:00:00 2001 From: Joe Shimkus Date: Thu, 21 Mar 2024 14:33:22 -0400 Subject: [PATCH] wip: add per-entity restart parameters [AAP-20053] --- src/aap_eda/api/serializers/activation.py | 22 ++++ src/aap_eda/api/serializers/event_stream.py | 3 + ...on_restart_completion_interval_and_more.py | 121 ++++++++++++++++++ src/aap_eda/core/models/activation.py | 94 ++++++++++++++ src/aap_eda/core/models/event_stream.py | 94 ++++++++++++++ src/aap_eda/services/activation/manager.py | 35 +++-- 6 files changed, 355 insertions(+), 14 deletions(-) create mode 100644 src/aap_eda/core/migrations/0028_activation_restart_completion_interval_and_more.py diff --git a/src/aap_eda/api/serializers/activation.py b/src/aap_eda/api/serializers/activation.py index cf9669425..59b14ef13 100644 --- a/src/aap_eda/api/serializers/activation.py +++ b/src/aap_eda/api/serializers/activation.py @@ -194,6 +194,9 @@ class Meta: "project_id", "rulebook_id", "extra_var_id", + "restart_completion_interval", + "restart_failure_interval", + "restart_failure_limit", "restart_policy", "restart_count", "rulebook_name", @@ -250,6 +253,9 @@ class Meta: "project_id", "rulebook_id", "extra_var_id", + "restart_completion_interval", + "restart_failure_interval", + "restart_failure_limit", "restart_policy", "restart_count", "rulebook_name", @@ -294,6 +300,11 @@ def to_representation(self, activation): "project_id": activation.project_id, "rulebook_id": activation.rulebook_id, "extra_var_id": activation.extra_var_id, + "restart_completion_interval": ( + activation.restart_completion_interval + ), + "restart_failure_interval": activation.restart_failure_interval, + "restart_failure_limit": activation.restart_failure_limit, "restart_policy": activation.restart_policy, "restart_count": activation.restart_count, "rulebook_name": activation.rulebook_name, @@ -324,6 +335,9 @@ class Meta: "rulebook_id", "extra_var_id", "user", + "restart_completion_interval", + 
"restart_failure_interval", + "restart_failure_limit", "restart_policy", "awx_token_id", "credentials", @@ -489,6 +503,9 @@ class Meta: "rulebook", "extra_var", "instances", + "restart_completion_interval", + "restart_failure_interval", + "restart_failure_limit", "restart_policy", "restart_count", "rulebook_name", @@ -573,6 +590,11 @@ def to_representation(self, activation): "instances": ActivationInstanceSerializer( activation_instances, many=True ).data, + "restart_completion_interval": ( + activation.restart_completion_interval + ), + "restart_failure_interval": activation.restart_failure_interval, + "restart_failure_limit": activation.restart_failure_limit, "restart_policy": activation.restart_policy, "restart_count": activation.restart_count, "rulebook_name": activation.rulebook_name, diff --git a/src/aap_eda/api/serializers/event_stream.py b/src/aap_eda/api/serializers/event_stream.py index 908c3012c..7aa550b4f 100644 --- a/src/aap_eda/api/serializers/event_stream.py +++ b/src/aap_eda/api/serializers/event_stream.py @@ -191,6 +191,9 @@ class Meta: "rulebook_id", "extra_var_id", "user", + "restart_completion_interval", + "restart_failure_interval", + "restart_failure_limit", "restart_policy", "credentials", "log_level", diff --git a/src/aap_eda/core/migrations/0028_activation_restart_completion_interval_and_more.py b/src/aap_eda/core/migrations/0028_activation_restart_completion_interval_and_more.py new file mode 100644 index 000000000..b77fd09a6 --- /dev/null +++ b/src/aap_eda/core/migrations/0028_activation_restart_completion_interval_and_more.py @@ -0,0 +1,121 @@ +# Generated by Django 4.2.7 on 2024-03-21 18:43 + +import django.core.validators +from django.db import migrations, models + +import aap_eda.core.models.activation +import aap_eda.core.models.event_stream + + +class Migration(migrations.Migration): + dependencies = [ + ( + "core", + "0027_credentialtype_alter_permission_resource_type_and_more", + ), + ] + + operations = [ + migrations.AddField( + 
model_name="activation", + name="restart_completion_interval", + field=models.IntegerField( + default=aap_eda.core.models.activation.RestartCompletionInterval[ + "MINIMUM" + ], + validators=[ + django.core.validators.MinValueValidator( + limit_value=aap_eda.core.models.activation.RestartCompletionInterval[ + "MINIMUM" + ], + message="The restart interval for completions specifies the delay, in seconds, between restarts; it must be an integer greater than or equal to 0 indicating the delay, in seconds, between restarts; system settings = 0, default = 0", + ) + ], + ), + ), + migrations.AddField( + model_name="activation", + name="restart_failure_interval", + field=models.IntegerField( + default=aap_eda.core.models.activation.RestartFailureInterval[ + "MINIMUM" + ], + validators=[ + django.core.validators.MinValueValidator( + limit_value=aap_eda.core.models.activation.RestartFailureInterval[ + "MINIMUM" + ], + message="The restart interval for failures specifies the delay, in seconds, between restarts; it must be an integer greater than or equal to 0 indicating the delay, in seconds, between restarts; system settings = 0, default = 0", + ) + ], + ), + ), + migrations.AddField( + model_name="activation", + name="restart_failure_limit", + field=models.IntegerField( + default=aap_eda.core.models.activation.RestartFailureLimit[ + "SETTINGS" + ], + validators=[ + django.core.validators.MinValueValidator( + limit_value=aap_eda.core.models.activation.RestartFailureLimit[ + "MINIMUM" + ], + message="The restart limit for failures specifies the limit on repeated attempts to start an activation in the face of failures to do so; it must be an integer greater than or equal to -1; system settings = 0, unlimited restarts = -1, default = 0", + ) + ], + ), + ), + migrations.AddField( + model_name="eventstream", + name="restart_completion_interval", + field=models.IntegerField( + default=aap_eda.core.models.event_stream.RestartCompletionInterval[ + "MINIMUM" + ], + validators=[ + 
django.core.validators.MinValueValidator( + limit_value=aap_eda.core.models.event_stream.RestartCompletionInterval[ + "MINIMUM" + ], + message="The restart interval for completions specifies the delay, in seconds, between restarts; it must be an integer greater than or equal to 0 indicating the delay, in seconds, between restarts; system settings = 0, default = 0", + ) + ], + ), + ), + migrations.AddField( + model_name="eventstream", + name="restart_failure_interval", + field=models.IntegerField( + default=aap_eda.core.models.event_stream.RestartFailureInterval[ + "MINIMUM" + ], + validators=[ + django.core.validators.MinValueValidator( + limit_value=aap_eda.core.models.event_stream.RestartFailureInterval[ + "MINIMUM" + ], + message="The restart interval for failures specifies the delay, in seconds, between restarts; it must be an integer greater than or equal to 0 indicating the delay, in seconds, between restarts; system settings = 0, default = 0", + ) + ], + ), + ), + migrations.AddField( + model_name="eventstream", + name="restart_failure_limit", + field=models.IntegerField( + default=aap_eda.core.models.event_stream.RestartFailureLimit[ + "SETTINGS" + ], + validators=[ + django.core.validators.MinValueValidator( + limit_value=aap_eda.core.models.event_stream.RestartFailureLimit[ + "MINIMUM" + ], + message="The restart limit for failures specifies the limit on repeated attempts to start an activation in the face of failures to do so; it must be an integer greater than or equal to -1; system settings = 0, unlimited restarts = -1, default = 0", + ) + ], + ), + ), + ] diff --git a/src/aap_eda/core/models/activation.py b/src/aap_eda/core/models/activation.py index 03ce13c2f..c25c91ba5 100644 --- a/src/aap_eda/core/models/activation.py +++ b/src/aap_eda/core/models/activation.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import enum + +from django.conf import settings +from django.core import validators from django.db import models from aap_eda.core.enums import ( @@ -28,6 +32,25 @@ __all__ = ("Activation",) +class RestartCompletionInterval(enum.IntEnum): + MINIMUM = 0 + SETTINGS = MINIMUM + DEFAULT = SETTINGS + + +class RestartFailureInterval(enum.IntEnum): + MINIMUM = 0 + SETTINGS = MINIMUM + DEFAULT = SETTINGS + + +class RestartFailureLimit(enum.IntEnum): + MINIMUM = -1 + SETTINGS = 0 + DEFAULT = SETTINGS + UNLIMITED = MINIMUM + + class Activation(StatusHandlerModelMixin, ContainerableMixin, models.Model): class Meta: db_table = "core_activation" @@ -52,6 +75,52 @@ class Meta: extra_var = models.ForeignKey( "ExtraVar", on_delete=models.CASCADE, null=True ) + restart_completion_interval = models.IntegerField( + validators=[ + validators.MinValueValidator( + limit_value=RestartCompletionInterval.MINIMUM, + message="The restart interval for completions specifies" + " the delay, in seconds, between restarts" + "; it must be an integer greater than or equal to" + f" {RestartCompletionInterval.MINIMUM}" + " indicating the delay, in seconds, between restarts" + f"; system settings = {RestartCompletionInterval.SETTINGS}" + f", default = {RestartCompletionInterval.DEFAULT}", + ), + ], + default=RestartCompletionInterval.DEFAULT, + ) + restart_failure_interval = models.IntegerField( + validators=[ + validators.MinValueValidator( + limit_value=RestartFailureInterval.MINIMUM, + message="The restart interval for failures specifies" + " the delay, in seconds, between restarts" + "; it must be an integer greater than or equal to" + f" {RestartFailureInterval.MINIMUM}" + " indicating the delay, in seconds, between restarts" + f"; system settings = {RestartFailureInterval.SETTINGS}" + f", default = {RestartFailureInterval.DEFAULT}", + ), + ], + default=RestartFailureInterval.DEFAULT, + ) + restart_failure_limit = models.IntegerField( + validators=[ + validators.MinValueValidator( + 
limit_value=RestartFailureLimit.MINIMUM, + message="The restart limit for failures specifies" + " the limit on repeated attempts to start an activation" + " in the face of failures to do so" + "; it must be an integer greater than or equal to" + f" {RestartFailureLimit.MINIMUM}" + f"; system settings = {RestartFailureLimit.SETTINGS}" + f", unlimited restarts = {RestartFailureLimit.UNLIMITED}" + f", default = {RestartFailureLimit.DEFAULT}", + ), + ], + default=RestartFailureLimit.DEFAULT, + ) restart_policy = models.TextField( choices=RestartPolicy.choices(), default=RestartPolicy.ON_FAILURE, @@ -122,3 +191,28 @@ class Meta: on_delete=models.CASCADE, related_name="+", ) + + @property + def effective_restart_completion_interval(self): + effective = self.restart_completion_interval + if effective == RestartCompletionInterval.SETTINGS: + effective = settings.ACTIVATION_RESTART_SECONDS_ON_COMPLETE + return effective + + @property + def effective_restart_failure_interval(self): + effective = self.restart_failure_interval + if effective == RestartFailureInterval.SETTINGS: + effective = settings.ACTIVATION_RESTART_SECONDS_ON_FAILURE + return effective + + @property + def effective_restart_failure_limit(self): + effective = self.restart_failure_limit + if effective == RestartFailureLimit.SETTINGS: + effective = settings.ACTIVATION_MAX_RESTARTS_ON_FAILURE + return effective + + @property + def unlimited_restart_failures(self): + return self.restart_failure_limit == RestartFailureLimit.UNLIMITED diff --git a/src/aap_eda/core/models/event_stream.py b/src/aap_eda/core/models/event_stream.py index 3db3f1147..da70891ce 100644 --- a/src/aap_eda/core/models/event_stream.py +++ b/src/aap_eda/core/models/event_stream.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import enum + +from django.conf import settings +from django.core import validators from django.db import models from aap_eda.core.enums import ( @@ -25,6 +29,25 @@ from .mixins import StatusHandlerModelMixin +class RestartCompletionInterval(enum.IntEnum): + MINIMUM = 0 + SETTINGS = MINIMUM + DEFAULT = SETTINGS + + +class RestartFailureInterval(enum.IntEnum): + MINIMUM = 0 + SETTINGS = MINIMUM + DEFAULT = SETTINGS + + +class RestartFailureLimit(enum.IntEnum): + MINIMUM = -1 + SETTINGS = 0 + DEFAULT = SETTINGS + UNLIMITED = MINIMUM + + class EventStream(StatusHandlerModelMixin, ContainerableMixin, models.Model): """Model representing an event stream.""" @@ -46,6 +69,52 @@ class EventStream(StatusHandlerModelMixin, ContainerableMixin, models.Model): on_delete=models.CASCADE, null=True, ) + restart_completion_interval = models.IntegerField( + validators=[ + validators.MinValueValidator( + limit_value=RestartCompletionInterval.MINIMUM, + message="The restart interval for completions specifies" + " the delay, in seconds, between restarts" + "; it must be an integer greater than or equal to" + f" {RestartCompletionInterval.MINIMUM}" + " indicating the delay, in seconds, between restarts" + f"; system settings = {RestartCompletionInterval.SETTINGS}" + f", default = {RestartCompletionInterval.DEFAULT}", + ), + ], + default=RestartCompletionInterval.DEFAULT, + ) + restart_failure_interval = models.IntegerField( + validators=[ + validators.MinValueValidator( + limit_value=RestartFailureInterval.MINIMUM, + message="The restart interval for failures specifies" + " the delay, in seconds, between restarts" + "; it must be an integer greater than or equal to" + f" {RestartFailureInterval.MINIMUM}" + " indicating the delay, in seconds, between restarts" + f"; system settings = {RestartFailureInterval.SETTINGS}" + f", default = {RestartFailureInterval.DEFAULT}", + ), + ], + default=RestartFailureInterval.DEFAULT, + ) + restart_failure_limit = models.IntegerField( + validators=[ + 
validators.MinValueValidator( + limit_value=RestartFailureLimit.MINIMUM, + message="The restart limit for failures specifies" + " the limit on repeated attempts to start an activation" + " in the face of failures to do so" + "; it must be an integer greater than or equal to" + f" {RestartFailureLimit.MINIMUM}" + f"; system settings = {RestartFailureLimit.SETTINGS}" + f", unlimited restarts = {RestartFailureLimit.UNLIMITED}" + f", default = {RestartFailureLimit.DEFAULT}", + ), + ], + default=RestartFailureLimit.DEFAULT, + ) restart_policy = models.TextField( choices=RestartPolicy.choices(), default=RestartPolicy.ON_FAILURE, @@ -113,3 +182,28 @@ def __str__(self) -> str: def _get_skip_audit_events(self) -> bool: """Event stream skips audit events.""" return True + + @property + def effective_restart_completion_interval(self): + effective = self.restart_completion_interval + if effective == RestartCompletionInterval.SETTINGS: + effective = settings.ACTIVATION_RESTART_SECONDS_ON_COMPLETE + return effective + + @property + def effective_restart_failure_interval(self): + effective = self.restart_failure_interval + if effective == RestartFailureInterval.SETTINGS: + effective = settings.ACTIVATION_RESTART_SECONDS_ON_FAILURE + return effective + + @property + def effective_restart_failure_limit(self): + effective = self.restart_failure_limit + if effective == RestartFailureLimit.SETTINGS: + effective = settings.ACTIVATION_MAX_RESTARTS_ON_FAILURE + return effective + + @property + def unlimited_restart_failures(self): + return self.restart_failure_limit == RestartFailureLimit.UNLIMITED diff --git a/src/aap_eda/services/activation/manager.py b/src/aap_eda/services/activation/manager.py index bba4acc7b..b72ae94b4 100644 --- a/src/aap_eda/services/activation/manager.py +++ b/src/aap_eda/services/activation/manager.py @@ -522,9 +522,10 @@ def _completed_policy(self, container_msg: str): ) user_msg = ( f"Activation completed. 
It will attempt to restart in " - f"{settings.ACTIVATION_RESTART_SECONDS_ON_COMPLETE} seconds " - f"according to the restart policy {RestartPolicy.ALWAYS}." - "It may take longer if there is no capacity available." + f"{self.db_instance.effective_restart_completion_interval}" + " seconds according to the restart policy" + f" {RestartPolicy.ALWAYS}." + " It may take longer if there is no capacity available." ) if container_msg: user_msg = f"{container_msg} {user_msg}" @@ -540,7 +541,9 @@ def _completed_policy(self, container_msg: str): system_restart_activation( self.db_instance_type, self.db_instance.id, - delay_seconds=settings.ACTIVATION_RESTART_SECONDS_ON_COMPLETE, + delay_seconds=( + self.db_instance.effective_restart_completion_interval + ), ) else: LOGGER.info( @@ -599,9 +602,9 @@ def _failed_policy(self, container_msg: str): raise exceptions.ActivationMonitorError(msg) from exc # No restart if it has reached the maximum number of restarts - elif ( + elif (not self.db_instance.unlimited_restart_failures) and ( self.db_instance.failure_count - >= settings.ACTIVATION_MAX_RESTARTS_ON_FAILURE + >= self.db_instance.effective_restart_failure_limit ): LOGGER.info( f"Activation id: {self.db_instance.id} " @@ -634,16 +637,17 @@ def _failed_policy(self, container_msg: str): raise exceptions.ActivationMonitorError(msg) from exc # Restart else: - count_msg = ( - f"({self.db_instance.failure_count + 1}/" - f"{settings.ACTIVATION_MAX_RESTARTS_ON_FAILURE})" + count_msg = f"({self.db_instance.failure_count + 1}" "/{0})".format( + self.db_instance.effective_restart_failure_limit + if not self.db_instance.unlimited_restart_failures + else "unlimited" ) user_msg = ( f"Activation failed. It will attempt to restart {count_msg} in" - f" {settings.ACTIVATION_RESTART_SECONDS_ON_FAILURE} seconds " - "according to the restart policy " + f" {self.db_instance.effective_restart_failure_interval}" + " seconds according to the restart policy " f"{self.db_instance.restart_policy}." 
- "It may take longer if there is no capacity available." + " It may take longer if there is no capacity available." ) if container_msg: user_msg = f"{container_msg} {user_msg}" @@ -671,12 +675,15 @@ def _failed_policy(self, container_msg: str): f"Activation {self.db_instance.id} failed. " f"Restart policy is set to {self.db_instance.restart_policy}. " f"Scheduling restart in " - f"{settings.ACTIVATION_RESTART_SECONDS_ON_FAILURE} seconds.", + f"{self.db_instance.effective_restart_failure_interval} " + "seconds.", ) system_restart_activation( self.db_instance_type, self.db_instance.id, - delay_seconds=settings.ACTIVATION_RESTART_SECONDS_ON_FAILURE, + delay_seconds=( + self.db_instance.effective_restart_failure_interval + ), ) def _fail_instance(self, msg: tp.Optional[str] = None):