From fa4884cdadcbd15189db02ffe350ef39e6aa4b3a Mon Sep 17 00:00:00 2001 From: mrmr1993 Date: Wed, 26 Jul 2023 19:26:44 +0100 Subject: [PATCH 1/3] Make medium bootstrap sections hard fail --- src/app/test_executive/medium_bootstrap.ml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/app/test_executive/medium_bootstrap.ml b/src/app/test_executive/medium_bootstrap.ml index 0fd3434f873..92fce3515b3 100644 --- a/src/app/test_executive/medium_bootstrap.ml +++ b/src/app/test_executive/medium_bootstrap.ml @@ -49,8 +49,9 @@ module Make (Inputs : Intf.Test.Inputs_intf) = struct let logger = Logger.create () in let all_nodes = Network.all_nodes network in let%bind () = - wait_for t - (Wait_condition.nodes_to_initialize (Core.String.Map.data all_nodes)) + section_hard "Wait for nodes to initialize" + (wait_for t + (Wait_condition.nodes_to_initialize (Core.String.Map.data all_nodes)) ) in let node_a = Core.String.Map.find_exn (Network.block_producers network) "node-a" @@ -62,11 +63,11 @@ module Make (Inputs : Intf.Test.Inputs_intf) = struct Core.String.Map.find_exn (Network.block_producers network) "node-c" in let%bind () = - section "blocks are produced" + section_hard "blocks are produced" (wait_for t (Wait_condition.blocks_to_be_produced 1)) in let%bind () = - section "restart node after 2k+1, ie 5, blocks" + section_hard "restart node after 2k+1, ie 5, blocks" (let%bind () = Node.stop node_c in [%log info] "%s stopped, will now wait for blocks to be produced" (Node.id node_c) ; @@ -80,7 +81,7 @@ module Make (Inputs : Intf.Test.Inputs_intf) = struct (Wait_condition.nodes_to_synchronize [ node_a; node_b; node_c ]) ) in let%bind () = - section "network is fully connected after one node was restarted" + section_hard "network is fully connected after one node was restarted" (let%bind () = Malleable_error.lift (after (Time.Span.of_sec 240.0)) in let%bind final_connectivity_data = fetch_connectivity_data ~logger (Core.String.Map.data 
all_nodes) @@ -88,11 +89,11 @@ module Make (Inputs : Intf.Test.Inputs_intf) = struct assert_peers_completely_connected final_connectivity_data ) in let%bind () = - section "blocks are produced" + section_hard "blocks are produced" (wait_for t (Wait_condition.blocks_to_be_produced 1)) in let%bind () = - section "restart node with the same state after 1 block" + section_hard "restart node with the same state after 1 block" (let%bind () = Node.stop node_c in [%log info] "%s stopped, will now wait for blocks to be produced" (Node.id node_c) ; @@ -112,7 +113,7 @@ module Make (Inputs : Intf.Test.Inputs_intf) = struct (Wait_condition.nodes_to_synchronize [ node_a; node_b; node_c ]) ) in - section "network is fully connected after one node was restarted" + section_hard "network is fully connected after one node was restarted" (let%bind () = Malleable_error.lift (after (Time.Span.of_sec 240.0)) in let%bind final_connectivity_data = fetch_connectivity_data ~logger (Core.String.Map.data all_nodes) From a613fe2b1e92d748bac6f8980da9e409d4e79648 Mon Sep 17 00:00:00 2001 From: mrmr1993 Date: Wed, 26 Jul 2023 19:26:44 +0100 Subject: [PATCH 2/3] Make medium bootstrap sections hard fail --- src/app/test_executive/medium_bootstrap.ml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/app/test_executive/medium_bootstrap.ml b/src/app/test_executive/medium_bootstrap.ml index 74a6b66185d..8bb34b4fda2 100644 --- a/src/app/test_executive/medium_bootstrap.ml +++ b/src/app/test_executive/medium_bootstrap.ml @@ -49,8 +49,9 @@ module Make (Inputs : Intf.Test.Inputs_intf) = struct let logger = Logger.create () in let all_nodes = Network.all_nodes network in let%bind () = - wait_for t - (Wait_condition.nodes_to_initialize (Core.String.Map.data all_nodes)) + section_hard "Wait for nodes to initialize" + (wait_for t + (Wait_condition.nodes_to_initialize (Core.String.Map.data all_nodes)) ) in let node_a = Core.String.Map.find_exn (Network.block_producers network) 
"node-a" @@ -62,11 +63,11 @@ module Make (Inputs : Intf.Test.Inputs_intf) = struct Core.String.Map.find_exn (Network.block_producers network) "node-c" in let%bind () = - section "blocks are produced" + section_hard "blocks are produced" (wait_for t (Wait_condition.blocks_to_be_produced 1)) in let%bind () = - section "restart node after 2k+1, ie 5, blocks" + section_hard "restart node after 2k+1, ie 5, blocks" (let%bind () = Node.stop node_c in [%log info] "%s stopped, will now wait for blocks to be produced" (Node.id node_c) ; @@ -79,7 +80,7 @@ module Make (Inputs : Intf.Test.Inputs_intf) = struct wait_for t (Wait_condition.nodes_to_synchronize [ node_a; node_b; node_c ]) ) in - section "network is fully connected after one node was restarted" + section_hard "network is fully connected after one node was restarted" (let%bind () = Malleable_error.lift (after (Time.Span.of_sec 240.0)) in let%bind final_connectivity_data = fetch_connectivity_data ~logger (Core.String.Map.data all_nodes) From e2b557b0b879216a578713f9841cd9b5eb62c367 Mon Sep 17 00:00:00 2001 From: mrmr1993 Date: Wed, 26 Jul 2023 19:42:36 +0100 Subject: [PATCH 3/3] Reduce the timeout for the Nodes_to_synchronize wait condition This wait condition usually runs very quickly, and we only see the timeout being hit when the test infrastructure has failed. With the previous hard timeout of 48 slots (twice the 24-slot soft timeout), a failed wait takes 1.6 hours to time out. Reducing this significantly should help with long-running failed tests. The numbers here (a soft timeout of 4 slots and a hard timeout of 6 slots) are chosen arbitrarily. Really, the time it takes should depend on the known state of the nodes entering the wait condition, accounting for the number of blocks to be bootstrapped / caught up to, but I don't want to do that in this first pass. 
--- src/lib/integration_test_lib/wait_condition.ml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/lib/integration_test_lib/wait_condition.ml b/src/lib/integration_test_lib/wait_condition.ml index 97476d2ac76..5f1a78f7175 100644 --- a/src/lib/integration_test_lib/wait_condition.ml +++ b/src/lib/integration_test_lib/wait_condition.ml @@ -137,7 +137,8 @@ struct then Predicate_passed else Predicate_continuation () in - let soft_timeout_in_slots = 8 * 3 in + let soft_timeout_in_slots = 4 in + let hard_timeout_in_slots = 6 in let formatted_nodes = nodes |> List.map ~f:(fun node -> "\"" ^ Node.id node ^ "\"") @@ -147,7 +148,7 @@ struct ; description = Printf.sprintf "%s to synchronize" formatted_nodes ; predicate = Network_state_predicate (check (), check) ; soft_timeout = Slots soft_timeout_in_slots - ; hard_timeout = Slots (soft_timeout_in_slots * 2) + ; hard_timeout = Slots hard_timeout_in_slots } let signed_command_to_be_included_in_frontier ~txn_hash