From 3fedee95049b24895e5323a74ac676536420fb7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Kr=C3=BCger?= <philipp.krueger1@gmail.com>
Date: Thu, 9 Jan 2025 09:57:38 +0100
Subject: [PATCH] test(iroh): Make `test_relay_datagram_queue` less timing
 dependent (#3106)

## Description

<!-- A summary of what this pull request achieves and a rough list of
changes. -->
I've seen `test_relay_datagram_queue` fail on windows recently:
https://github.com/n0-computer/iroh/actions/runs/12669553316/job/35307313406

Given that
- when the test failed, it took 720ms,
- when the test succeeded, it took 150ms and 320ms,
- we've got a 100ms timeout on one of the futures, and
- the tokio scheduler (I think) doesn't optimize for average job time,

I'm not convinced there's a race condition.

So, instead I'm rewriting the test to be less timing-dependent. And less
confusing as well *and* have better logging.

## Change checklist

- [x] Self-review.
- [x] Documentation updates following the [style
guide](https://rust-lang.github.io/rfcs/1574-more-api-documentation-conventions.html#appendix-a-full-conventions-text),
if relevant.
- [x] Tests if relevant.
- [x] All breaking changes documented.
---
 iroh/src/magicsock.rs | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)
diff --git a/iroh/src/magicsock.rs b/iroh/src/magicsock.rs
index a46ca25f4d..e32d16ad65 100644
--- a/iroh/src/magicsock.rs
+++ b/iroh/src/magicsock.rs
@@ -4096,26 +4096,20 @@ mod tests {
         tasks.spawn({
             let queue = queue.clone();
             async move {
-                let mut expected_msgs = vec![false; capacity];
-
-                while let Ok(datagram) = tokio::time::timeout(
-                    Duration::from_millis(100),
-                    futures_lite::future::poll_fn(|cx| {
+                let mut expected_msgs: BTreeSet<usize> = (0..capacity).collect();
+                while !expected_msgs.is_empty() {
+                    let datagram = futures_lite::future::poll_fn(|cx| {
                         queue.poll_recv(cx).map(|result| result.unwrap())
-                    }),
-                )
-                .await
-                {
+                    })
+                    .await;
+
                     let msg_num = usize::from_le_bytes(datagram.buf.as_ref().try_into().unwrap());
+                    debug!("Received {msg_num}");
 
-                    if expected_msgs[msg_num] {
-                        panic!("Received message number {msg_num} more than once (duplicated)");
+                    if !expected_msgs.remove(&msg_num) {
+                        panic!("Received message number {msg_num} twice or more, but expected it only exactly once.");
                     }
-
-                    expected_msgs[msg_num] = true;
                 }
-
-                assert!(expected_msgs.into_iter().all(|is_set| is_set));
             }
         });
 
@@ -4124,6 +4118,7 @@ mod tests {
                 let queue = queue.clone();
                 let url = url.clone();
                 async move {
+                    debug!("Sending {i}");
                     queue
                         .try_send(RelayRecvDatagram {
                             url,
@@ -4135,7 +4130,13 @@ mod tests {
             });
         }
 
-        tasks.join_all().await;
+        // We expect all of this work to be done in 10 seconds max.
+        if tokio::time::timeout(Duration::from_secs(10), tasks.join_all())
+            .await
+            .is_err()
+        {
+            panic!("Timeout - not all messages between 0 and {capacity} received.");
+        }
     }
 
     #[tokio::test]