-
Notifications
You must be signed in to change notification settings - Fork 223
fix verify_cpu_offline_channel_add #4092
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,7 +4,7 @@ | |
|
|
||
| import random | ||
| import time | ||
| from typing import cast | ||
| from typing import List, cast | ||
|
|
||
| from assertpy import assert_that | ||
| from microsoft.testsuites.cpu.common import ( | ||
|
|
@@ -28,17 +28,8 @@ | |
| ) | ||
| from lisa.environment import Environment | ||
| from lisa.node import RemoteNode | ||
| from lisa.tools import ( | ||
| Ethtool, | ||
| Fio, | ||
| Iperf3, | ||
| KernelConfig, | ||
| Kill, | ||
| Lscpu, | ||
| Lsvmbus, | ||
| Modprobe, | ||
| Reboot, | ||
| ) | ||
| from lisa.tools import Ethtool, Fio, Iperf3, Kill, Lscpu, Lsvmbus, Reboot | ||
| from lisa.tools.lsvmbus import HV_NETVSC_CLASS_ID | ||
| from lisa.util import SkippedException | ||
|
|
||
|
|
||
|
|
@@ -172,150 +163,234 @@ def verify_cpu_offline_network_workload( | |
| server.tools[Kill].by_name("iperf3", ignore_not_exist=True) | ||
| client.tools[Kill].by_name("iperf3", ignore_not_exist=True) | ||
|
|
||
| # ---- CPUSuite helpers ---- | ||
| def _clamp_channels(self, val: int) -> int: | ||
| return max(1, min(int(val), 64)) | ||
|
|
||
| def _read_max_supported(self, node: Node) -> int: | ||
| """ | ||
| Return conservative device max 'combined' channels for eth0. | ||
| Fallback strategy: Try to collect all possible candidates from ethtool fields | ||
| (max_combined, max_channels, max_current, current_channels); if none are found, | ||
| fall back to lsvmbus queue count; if that fails, fall back to thread count. | ||
| Always clamp to [1, 64]. | ||
| """ | ||
| try: | ||
| info = node.tools[Ethtool].get_device_channels_info("eth0", True) | ||
| candidates = [] | ||
| for name in ("max_combined", "max_channels", "max_current"): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. put them into ethtool tool
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please refer get_device_channels_info in ethtool, wrap these into ethtool |
||
| v = getattr(info, name, None) | ||
| if v is not None: | ||
| try: | ||
| candidates.append(int(v)) | ||
| except Exception: | ||
| # Ignore values that cannot be converted to int | ||
| # (may be missing or malformed) | ||
| pass | ||
| cur = getattr(info, "current_channels", None) | ||
| if cur is not None: | ||
| try: | ||
| candidates.append(int(cur)) | ||
| except Exception: | ||
LiliDeng marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| # Ignore values that cannot be converted to int | ||
| # (may be missing or malformed) | ||
| pass | ||
| if candidates: | ||
| return max(1, min(max(candidates), 64)) | ||
| except Exception: | ||
LiliDeng marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| # Ignore ethtool exceptions to allow fallback to lsvmbus method | ||
| pass | ||
|
|
||
| try: | ||
| chans = node.tools[Lsvmbus].get_device_channels(force_run=True) | ||
| for ch in chans: | ||
| if ch.class_id == HV_NETVSC_CLASS_ID: | ||
| return max(1, min(len(ch.channel_vp_map), 64)) | ||
| except Exception: | ||
LiliDeng marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| # Ignore lsvmbus exceptions to allow fallback to threads method | ||
| # (lsvmbus may not be available) | ||
| pass | ||
|
|
||
| threads = node.tools[Lscpu].get_thread_count() | ||
| return max(1, min(int(threads), 64)) | ||
|
|
||
| def _read_current(self, node: Node) -> int: | ||
| """ | ||
| Read current combined channels. | ||
| """ | ||
| info = node.tools[Ethtool].get_device_channels_info("eth0", True) | ||
| cur = getattr(info, "current_channels", 1) | ||
| return max(1, int(cur)) | ||
|
|
||
| def _set_channels_with_retry( | ||
| self, log: Logger, node: Node, tgt: int, cur: int, soft_upper: int | ||
| ) -> int: | ||
| """ | ||
| Set channels to tgt with a single safe retry if it exceeds device max. | ||
| We clamp to min(device_max, soft_upper) first; if still failing with | ||
| 'exceeds maximum', we shrink to device_max and retry once. | ||
| """ | ||
| dev_max = self._read_max_supported(node) | ||
| final_tgt = max(1, min(int(tgt), int(soft_upper), int(dev_max))) | ||
| if final_tgt == int(cur): | ||
| return cur | ||
|
|
||
| try: | ||
| node.tools[Ethtool].change_device_channels_info("eth0", final_tgt) | ||
| return final_tgt | ||
| except Exception as e: | ||
| msg = str(e) | ||
| if "exceeds maximum" in msg or "Invalid argument" in msg: | ||
| if final_tgt != dev_max: | ||
| log.debug( | ||
| f"Retrying with device max due to '{msg}': " | ||
| f"tgt={final_tgt} -> {dev_max}" | ||
| ) | ||
| node.tools[Ethtool].change_device_channels_info("eth0", dev_max) | ||
| return dev_max | ||
| raise | ||
|
|
||
| def _pick_target_not_eq_current( | ||
| self, current: int, upper: int, lower: int = 1 | ||
| ) -> int: | ||
| """ | ||
| Pick a safe random target in [lower, upper] different from current. | ||
| Always clamp within the allowed range. | ||
| """ | ||
|
|
||
| lower = max(1, int(lower)) | ||
| upper = max(lower, int(upper)) | ||
|
|
||
| # If current already above limit, bring it back first | ||
| current = min(max(current, lower), upper) | ||
|
|
||
| # Candidates within range but != current | ||
| candidates = [x for x in range(lower, upper + 1) if x != current] | ||
| if not candidates: | ||
| return current | ||
|
|
||
| tgt = random.choice(candidates) | ||
| return min(max(tgt, lower), upper) | ||
|
|
||
| def _verify_no_irq_on_offline( | ||
| self, node: Node, offline: List[str], expect_len: int | ||
| ) -> None: | ||
| """ | ||
| Assert NIC channel count and that no IRQ is routed to offline CPUs. | ||
| """ | ||
| chans = node.tools[Lsvmbus].get_device_channels(force_run=True) | ||
| for ch in chans: | ||
| if ch.class_id == HV_NETVSC_CLASS_ID: | ||
| assert_that(ch.channel_vp_map).is_length(expect_len) | ||
| for vp in ch.channel_vp_map: | ||
| assert_that(vp.target_cpu).is_not_in(offline) | ||
|
|
||
| @TestCaseMetadata( | ||
| description=""" | ||
| This test will check that the added channels to synthetic network | ||
| adapter do not handle interrupts on offline cpu. | ||
| Steps: | ||
| 1. Get list of offline CPUs. | ||
| 2. Add channels to synthetic network adapter. | ||
| 3. Verify that the channels were added to synthetic network adapter. | ||
| 4. Verify that the added channels do not handle interrupts on offline cpu. | ||
| """, | ||
| Validate that changing netvsc combined channels works while some CPUs | ||
| are offline, and that no IRQ is routed to offline CPUs. Capture the | ||
| baseline NIC capability before any CPU is taken offline to avoid | ||
| misjudging capability from a transient state. | ||
| """, | ||
| priority=4, | ||
| requirement=simple_requirement( | ||
| min_core_count=16, | ||
| ), | ||
| requirement=simple_requirement(min_core_count=16), | ||
| ) | ||
| def verify_cpu_offline_channel_add(self, log: Logger, node: Node) -> None: | ||
| # skip test if kernel doesn't support cpu hotplug | ||
| check_runnable(node) | ||
| """ | ||
| Validate that changing netvsc combined channels works when some CPUs are | ||
| offline, and that no IRQ is routed to offline CPUs. The target channel | ||
| count is always clamped to the device capability and current CPU limits. | ||
| """ | ||
|
|
||
| # set vmbus channels target cpu into 0 if kernel supports this feature. | ||
| # ---------- Pre-checks ---------- | ||
| check_runnable(node) | ||
| set_interrupts_assigned_cpu(log, node) | ||
|
|
||
| # when kernel doesn't support above feature, we have to rely on current vm's | ||
| # cpu usage. then collect the cpu not in used exclude cpu0. | ||
| idle_cpus = get_idle_cpus(node) | ||
| log.debug(f"idle cpus: {idle_cpus}") | ||
|
|
||
| # save origin current channel | ||
| origin_device_channel = ( | ||
| node.tools[Ethtool].get_device_channels_info("eth0", True) | ||
| ).current_channels | ||
| log.debug(f"origin current channels count: {origin_device_channel}") | ||
| # Baseline capability with CPUs online | ||
| origin_channels = self._read_current(node) | ||
| dev_max0 = self._read_max_supported(node) | ||
| log.debug( | ||
| f"Baseline channels: current={origin_channels}, device_max={dev_max0}" | ||
| ) | ||
| if dev_max0 <= 1: | ||
| raise SkippedException( | ||
| "Device Combined max <= 1 at baseline; cannot add channels." | ||
| ) | ||
|
|
||
| # set channel count into 1 to get idle cpus | ||
| if len(idle_cpus) == 0: | ||
| # Find idle CPUs; if none, shrink once to 1 and retry | ||
| idle = get_idle_cpus(node) | ||
| log.debug(f"Idle CPUs (initial): {idle}") | ||
| if len(idle) == 0: | ||
| node.tools[Ethtool].change_device_channels_info("eth0", 1) | ||
| idle_cpus = get_idle_cpus(node) | ||
| log.debug(f"idle cpus: {idle_cpus}") | ||
| if len(idle_cpus) == 0: | ||
| idle = get_idle_cpus(node) | ||
| log.debug(f"Idle CPUs (after shrink to 1): {idle}") | ||
| if len(idle) == 0: | ||
| raise SkippedException( | ||
| "all of the cpu are associated vmbus channels, " | ||
| "no idle cpu can be used to test hotplug." | ||
| "All CPUs are associated with vmbus channels; no idle CPU available." | ||
| ) | ||
|
|
||
| # set idle cpu state offline and change channels | ||
| # current max channel will be cpu_count - len(idle_cpus) | ||
| # check channels of synthetic network adapter align with current setting channel | ||
| try: | ||
| # take idle cpu to offline | ||
| set_cpu_state_serial(log, node, idle_cpus, CPUState.OFFLINE) | ||
| # ---------- Phase 1: CPUs taken offline ---------- | ||
| set_cpu_state_serial(log, node, idle, CPUState.OFFLINE) | ||
|
|
||
| threads1 = node.tools[Lscpu].get_thread_count() | ||
| dev_max1 = self._read_max_supported(node) | ||
| upper1 = max(1, min(threads1 - len(idle), dev_max1, 64)) | ||
|
|
||
| # get vmbus channels of synthetic network adapter. the synthetic network | ||
| # drivers have class id "f8615163-df3e-46c5-913f-f2d2f965ed0e" | ||
| node.tools[Lsvmbus].get_device_channels(force_run=True) | ||
| thread_count = node.tools[Lscpu].get_thread_count() | ||
| cur1 = self._read_current(node) | ||
|
|
||
| # current max channel count need minus count of idle cpus | ||
| max_channel_count = thread_count - len(idle_cpus) | ||
| # If current exceeds the new upper bound, reduce first | ||
| if cur1 > upper1: | ||
| node.tools[Ethtool].change_device_channels_info("eth0", upper1) | ||
| cur1 = self._read_current(node) | ||
| log.debug(f"Reduced current channels at phase1: {cur1}") | ||
|
|
||
| first_current_device_channel = ( | ||
| node.tools[Ethtool].get_device_channels_info("eth0", True) | ||
| ).current_channels | ||
| tgt1 = self._pick_target_not_eq_current(cur1, upper1) | ||
| new1 = self._set_channels_with_retry(log, node, tgt1, cur1, upper1) | ||
| log.debug( | ||
| f"current channels count: {first_current_device_channel} " | ||
| "after taking idle cpu to offline" | ||
| f"Phase1 set: cur={cur1} -> {new1} " | ||
| f"(upper={upper1}, dev_max1={dev_max1})" | ||
| ) | ||
|
|
||
| # if all cpus besides cpu 0 are changed into offline | ||
| # skip change the channel, since current channel is 1 | ||
| first_channel_count = random.randint(1, min(max_channel_count, 64)) | ||
| if first_current_device_channel > 1: | ||
| while True: | ||
| if first_channel_count != first_current_device_channel: | ||
| break | ||
| first_channel_count = random.randint(1, min(thread_count, 64)) | ||
| node.tools[Ethtool].change_device_channels_info( | ||
| "eth0", first_channel_count | ||
| ) | ||
| first_current_device_channel = ( | ||
| node.tools[Ethtool].get_device_channels_info("eth0", True) | ||
| ).current_channels | ||
| log.debug( | ||
| f"current channels count: {first_current_device_channel} " | ||
| f"after changing channel into {first_channel_count}" | ||
| ) | ||
|
|
||
| # verify that the added channels do not handle interrupts on offline cpu | ||
| lsvmbus_channels = node.tools[Lsvmbus].get_device_channels(force_run=True) | ||
| for channel in lsvmbus_channels: | ||
| # verify synthetic network adapter channels align with expected value | ||
| if channel.class_id == "f8615163-df3e-46c5-913f-f2d2f965ed0e": | ||
| log.debug(f"Network synthetic channel: {channel}") | ||
| assert_that(channel.channel_vp_map).is_length( | ||
| first_current_device_channel | ||
| ) | ||
| self._verify_no_irq_on_offline(node, idle, new1) | ||
|
|
||
| # verify that devices do not handle interrupts on offline cpu | ||
| for channel_vp in channel.channel_vp_map: | ||
| assert_that(channel_vp.target_cpu).is_not_in(idle_cpus) | ||
|
|
||
| # reset idle cpu to online | ||
| set_cpu_state_serial(log, node, idle_cpus, CPUState.ONLINE) | ||
|
|
||
| # reset max and current channel count into original ones | ||
| # by reloading hv_netvsc driver if hv_netvsc can be reload | ||
| # otherwise reboot vm | ||
| if node.tools[KernelConfig].is_built_as_module("CONFIG_HYPERV_NET"): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why remove this logic, is it unnecessary?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The conditional logic was removed to improve test reliability. Module reload could cause network stack issues after CPU hotplug, so we switched to a simpler "always reboot" approach that's slower but guaranteed to work in all environments. |
||
| node.tools[Modprobe].reload("hv_netvsc") | ||
| else: | ||
| node.tools[Reboot].reboot() | ||
|
|
||
| # change the combined channels count after all cpus online | ||
| second_channel_count = random.randint(1, min(thread_count, 64)) | ||
| while True: | ||
| if first_current_device_channel != second_channel_count: | ||
| break | ||
| second_channel_count = random.randint(1, min(thread_count, 64)) | ||
| node.tools[Ethtool].change_device_channels_info( | ||
| "eth0", second_channel_count | ||
| ) | ||
| second_current_device_channel = ( | ||
| node.tools[Ethtool].get_device_channels_info("eth0", True) | ||
| ).current_channels | ||
| # ---------- Phase 2: CPUs back online ---------- | ||
| set_cpu_state_serial(log, node, idle, CPUState.ONLINE) | ||
| # Always reboot to ensure network stack is properly reinitialized | ||
| # after CPU hotplug operations to avoid SSH connection issues | ||
| node.tools[Reboot].reboot() | ||
|
|
||
| threads2 = node.tools[Lscpu].get_thread_count() | ||
| dev_max2 = self._read_max_supported(node) | ||
| upper2 = max(1, min(threads2, dev_max2, 64)) | ||
|
|
||
| cur2 = self._read_current(node) | ||
| if cur2 > upper2: | ||
| node.tools[Ethtool].change_device_channels_info("eth0", upper2) | ||
| cur2 = self._read_current(node) | ||
| log.debug(f"Reduced current channels at phase2: {cur2}") | ||
|
|
||
| tgt2 = self._pick_target_not_eq_current(cur2, upper2) | ||
| new2 = self._set_channels_with_retry(log, node, tgt2, cur2, upper2) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Avoid using numeric suffixes; please use meaningful names instead.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same comment regarding usage of "tgt", does it mean target? |
||
| log.debug( | ||
| f"current channels count: {second_current_device_channel} " | ||
| f"after changing channel into {second_channel_count}" | ||
| f"Phase2 set: cur={cur2} -> {new2} " | ||
| f"(upper={upper2}, dev_max2={dev_max2})" | ||
| ) | ||
|
|
||
| # verify that the network adapter channels count changed | ||
| # into new channel count | ||
| lsvmbus_channels = node.tools[Lsvmbus].get_device_channels(force_run=True) | ||
| for channel in lsvmbus_channels: | ||
| # verify that channels were added to synthetic network adapter | ||
| if channel.class_id == "f8615163-df3e-46c5-913f-f2d2f965ed0e": | ||
| log.debug(f"Network synthetic channel: {channel}") | ||
| assert_that(channel.channel_vp_map).is_length(second_channel_count) | ||
| finally: | ||
| # reset idle cpu to online | ||
| set_cpu_state_serial(log, node, idle_cpus, CPUState.ONLINE) | ||
| # restore channel count into origin value | ||
| current_device_channel = ( | ||
| node.tools[Ethtool].get_device_channels_info("eth0", True) | ||
| ).current_channels | ||
| if current_device_channel != origin_device_channel: | ||
| node.tools[Ethtool].change_device_channels_info( | ||
| "eth0", origin_device_channel | ||
| ) | ||
| # ---------- Cleanup: always restore ---------- | ||
| try: | ||
| set_cpu_state_serial(log, node, idle, CPUState.ONLINE) | ||
| except Exception as e: | ||
| log.error(f"Failed to bring CPUs online during cleanup: {e}") | ||
|
|
||
| try: | ||
| # Re-read device cap for a safe restore | ||
| dev_max_final = self._read_max_supported(node) | ||
| safe_origin = max(1, min(int(origin_channels), int(dev_max_final))) | ||
| cur_now = self._read_current(node) | ||
| if cur_now != safe_origin: | ||
| node.tools[Ethtool].change_device_channels_info("eth0", safe_origin) | ||
| log.debug(f"Restored channels to origin value: {safe_origin}") | ||
| except Exception as e: | ||
| log.error(f"Restore channels failed (target={origin_channels}): {e}") | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
where is it used?