Report status for single-config ext

Azure · Jan 13, 2025 · c3aac0f · c3aac0f
1 parent fc2de23
commit c3aac0f
Show file tree

Hide file tree

Showing 9 changed files with 210 additions and 123 deletions.
diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py
@@ -297,6 +297,8 @@ class ExtHandlersHandler(object):
     def __init__(self, protocol):
         self.protocol = protocol
         self.ext_handlers = None
+        # Maintain a list of extensions that are disallowed, and always report extension status for disallowed extensions.
+        self.disallowed_ext_handlers = []
         # The GoalState Aggregate status needs to report the last status of the GoalState. Since we only process
         # extensions on goal state change, we need to maintain its state.
         # Setting the status to None here. This would be overridden as soon as the first GoalState is processed
@@ -519,6 +521,7 @@ def handle_ext_handlers(self, goal_state_id):
             # back for the skipped extensions. In order to propagate the status back to CRP, we will report status back
             # here with an error message.
             if not extensions_enabled:
+                self.disallowed_ext_handlers.append(ext_handler)
                 agent_conf_file_path = get_osutil().agent_conf_file_path
                 msg = "Extension will not be processed since extension processing is disabled. To enable extension " \
                       "processing, set Extensions.Enabled=y in '{0}'".format(agent_conf_file_path)
@@ -745,28 +748,25 @@ def __handle_and_report_ext_handler_errors(ext_handler_i, error, report_op, mess
             add_event(name=name, version=handler_version, op=report_op, is_success=False, log_event=True,
                       message=message)
 
-    @staticmethod
-    def __report_policy_error(ext_handler_i, error_code, report_op, message, extension):
-        # TODO: Consider merging this function with __handle_and_report_ext_handler_errors() above, after investigating
-        # the impact of this change.
-        #
-        # If extension status is present, CRP will ignore handler status and report extension status. In the case of policy errors,
-        # extensions are not processed, so collect_ext_status() reports transitioning status on behalf of the extension.
-        # However, extensions blocked by policy should fail fast, so agent should write a .status file for policy failures.
-        # Note that __handle_and_report_ext_handler_errors() does not create the file for single-config extensions, but changing
-        # it will require additional testing/investigation. As a temporary workaround, this separate function was created
-        # to write a status file for single-config extensions.
-
-        # Set handler status for all extensions (with and without settings). We report the same error at both the
-        # handler and extension status level.
+    def __report_policy_error(self, ext_handler_i, error_code, report_op, message, extension):
+        # TODO: __handle_and_report_ext_handler_errors() does not create a status file for single-config extensions, this
+        # function was created as a temporary workaround. Consider merging the two functions function after assessing its impact.
+
+        # If extension status exists, CRP ignores handler status and reports extension status. In the case of policy errors,
+        # we write a .status file to force CRP to fail fast - the agent will otherwise report a transitioning status.
+        # - For extensions without settings or uninstall errors: report at the handler level.
+        # - For extensions with settings (install/enable errors): report at both handler and extension levels.
+
+        # Keep a list of disallowed extensions so that report_ext_handler_status() can report status for them.
+        self.disallowed_ext_handlers.append(ext_handler_i.ext_handler)
+
+        # Set handler status for all extensions (with and without settings).
         ext_handler_i.set_handler_status(message=message, code=error_code)
 
-        # Create status file for extensions with settings (single and multi config).
-        # If status file already exists, overwrite it. If an extension was previously reporting status and is now
-        # blocked by a policy error, we should report the policy error.
-        if extension is not None:
-            # TODO: if extension is reporting a heartbeat, it overwrites status. Consider overwriting heartbeat, if
-            # it exists.
+        # For extensions with settings (install/enable errors), also update extension-level status.
+        # Overwrite any existing status file to reflect policy failures accurately.
+        if extension is not None and ext_handler_i.ext_handler.state == ExtensionRequestedState.Enabled:
+            # TODO: if extension is reporting heartbeat, it overwrites status. Consider overwriting heartbeat here
             ext_handler_i.create_status_file(extension, status=ExtensionStatusValue.error, code=error_code,
                                              operation=report_op, message=message, overwrite=True)
 
@@ -1068,12 +1068,13 @@ def report_ext_handler_status(self, vm_status, ext_handler, goal_state_changed):
 
         handler_state = ext_handler_i.get_handler_state()
         ext_handler_statuses = []
+        ext_disallowed = ext_handler in self.disallowed_ext_handlers
         # For MultiConfig, we need to report status per extension even for Handler level failures.
         # If we have HandlerStatus for a MultiConfig handler and GS is requesting for it, we would report status per
         # extension even if HandlerState == NotInstalled (Sample scenario: ExtensionsGoalStateError, DecideVersionError, etc)
-        # We also need to report extension status for an uninstalled handler if extensions are disabled, because CRP
-        # waits for extension runtime status before failing the extension operation.
-        if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or not conf.get_extensions_enabled():
+        # We also need to report extension status for an uninstalled handler if the extension is disallowed (due to
+        # policy failure, extensions disabled, etc.) because CRP waits for extension runtime status before failing the operation.
+        if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or ext_disallowed:
 
             # Since we require reading the Manifest for reading the heartbeat, this would fail if HandlerManifest not found.
             # Only try to read heartbeat if HandlerState != NotInstalled.

diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py
@@ -3572,7 +3572,7 @@ def test_should_fail_enable_if_extension_disallowed(self):
             }
         expected_msg = "failed to run extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified as an allowed extension."
         self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed,
-                          expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg)
+                          expected_handler_status='NotReady', expected_ext_count=1, expected_status_msg=expected_msg)
 
     def test_should_fail_enable_for_invalid_policy(self):
         policy = \
@@ -3584,7 +3584,7 @@ def test_should_fail_enable_for_invalid_policy(self):
             }
         expected_msg = "attribute 'extensionPolicies.allowListedExtensionsOnly'; must be 'boolean'"
         self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled, expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed,
-                          expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg)
+                          expected_handler_status='NotReady', expected_ext_count=1, expected_status_msg=expected_msg)
 
     def test_should_fail_extension_if_error_thrown_during_policy_engine_init(self):
         policy = \
@@ -3596,7 +3596,7 @@ def test_should_fail_extension_if_error_thrown_during_policy_engine_init(self):
             expected_msg = "Extension will not be processed: mock exception"
             self._test_policy_case(policy=policy, op=ExtensionRequestedState.Enabled,
                                       expected_status_code=ExtensionErrorCodes.PluginEnableProcessingFailed,
-                                      expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg)
+                                      expected_handler_status='NotReady', expected_ext_count=1, expected_status_msg=expected_msg)
 
     def test_should_fail_uninstall_if_extension_disallowed(self):
         policy = \
@@ -3610,7 +3610,7 @@ def test_should_fail_uninstall_if_extension_disallowed(self):
             }
         expected_msg = "failed to uninstall extension 'OSTCExtensions.ExampleHandlerLinux' because it is not specified as an allowed extension."
         self._test_policy_case(policy=policy, op=ExtensionRequestedState.Uninstall, expected_status_code=ExtensionErrorCodes.PluginDisableProcessingFailed,
-                                  expected_handler_status='NotReady', expected_ext_count=0, expected_status_msg=expected_msg)
+                                  expected_handler_status='NotReady', expected_ext_count=1, expected_status_msg=expected_msg)
 
     def test_should_fail_enable_if_dependent_extension_disallowed(self):
         self._create_policy_file({
@@ -3634,7 +3634,7 @@ def test_should_fail_enable_if_dependent_extension_disallowed(self):
 
             # OtherExampleHandlerLinux should be disallowed by policy, ExampleHandlerLinux should be skipped because
             # dependent extension failed
-            self._assert_handler_status(protocol.report_vm_status, expected_status="NotReady", expected_ext_count=0,
+            self._assert_handler_status(protocol.report_vm_status, expected_status="NotReady", expected_ext_count=1,
                                         version="1.0.0", expected_handler_name="OSTCExtensions.OtherExampleHandlerLinux",
                                         expected_msg=("failed to run extension 'OSTCExtensions.OtherExampleHandlerLinux' "
                                                       "because it is not specified as an allowed extension."))

diff --git a/tests_e2e/orchestrator/scripts/collect-logs b/tests_e2e/orchestrator/scripts/collect-logs
@@ -19,7 +19,8 @@ tar --exclude='journal/*' --exclude='omsbundle' --exclude='omsagent' --exclude='
     -czf "$logs_file_name" \
     /var/log \
     /var/lib/waagent/ \
-    $waagent_conf
+    $waagent_conf \
+    /etc/waagent_policy.json
 
 set -euxo pipefail
 

diff --git a/tests_e2e/test_suites/ext_policy_with_dependencies.yml b/tests_e2e/test_suites/ext_policy_with_dependencies.yml
@@ -6,4 +6,6 @@ tests:
   - "ext_policy/ext_policy_with_dependencies.py"
 images: "endorsed"
 executes_on_scale_set: true
-owns_vm: false
+owns_vm: false
+skip_on_images:
+  - "alma_9"  # TODO: Currently AlmaLinux is not available for scale sets; enable this image when it is available.
diff --git a/tests_e2e/tests/ext_policy/ext_policy.py b/tests_e2e/tests/ext_policy/ext_policy.py
@@ -18,6 +18,7 @@
 #
 import json
 import uuid
+import os
 from typing import List, Dict, Any
 from assertpy import assert_that, fail
 
@@ -44,16 +45,20 @@ def _create_policy_file(self, policy):
         """
         Create policy json file and copy to /etc/waagent_policy.json on test machine.
         """
-        with open("waagent_policy.json", mode='w') as policy_file:
+        unique_id = uuid.uuid4()
+        file_path = "/tmp/waagent_policy_{0}.json".format(unique_id)
+        with open(file_path, mode='w') as policy_file:
             json.dump(policy, policy_file, indent=4)
             policy_file.flush()
+            log.info("Policy file contents: {0}".format(json.dumps(policy, indent=4)))
 
             remote_path = "/tmp/waagent_policy.json"
             local_path = policy_file.name
             self._ssh_client.copy_to_node(local_path=local_path, remote_path=remote_path)
             policy_file_final_dest = "/etc/waagent_policy.json"
             log.info("Copying policy file to test VM [%s]", self._context.vm.name)
             self._ssh_client.run_command(f"mv {remote_path} {policy_file_final_dest}", use_sudo=True)
+        os.remove(file_path)
 
     def _operation_should_succeed(self, operation, extension_case):
         log.info("")
@@ -234,6 +239,7 @@ def run(self):
                 }
             }
         self._create_policy_file(policy)
+        self._operation_should_succeed("enable", custom_script)
         # Update settings to force an update to the seq no
         run_command.settings = {'source': f"echo '{str(uuid.uuid4())}'"}
         run_command_2.settings = {'source': f"echo '{str(uuid.uuid4())}'"}
@@ -316,4 +322,4 @@ def get_ignore_error_rules(self) -> List[Dict[str, Any]]:
 
 
 if __name__ == "__main__":
-    ExtPolicy.run_from_command_line()
+    ExtPolicy.run_from_command_line()