Skip to content

Commit daa4a87

Browse files
gabriele-0201pepyakin
authored andcommitted
refactor(torture): address various follow-ups
+ update docs + `scheduled_rollback` can either be forced to crash or not, stop treating rollback and rollback crash as two different things + handle delays with `Duration`s
1 parent 8318629 commit daa4a87

File tree

3 files changed

+36
-104
lines changed

3 files changed

+36
-104
lines changed

torture/src/agent.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ pub async fn run(input: UnixStream) -> Result<()> {
8383
stream
8484
.send(Envelope {
8585
reqno,
86-
message: ToSupervisor::CommitSuccessful(elapsed.as_nanos() as u64),
86+
message: ToSupervisor::CommitSuccessful(elapsed),
8787
})
8888
.await?;
8989
}
@@ -157,10 +157,9 @@ pub async fn run(input: UnixStream) -> Result<()> {
157157
}
158158

159159
/// Execute the provided `task` and make it crash after the specified `crash_delay`.
160-
/// The delay must be specified in nanoseconds.
161160
async fn crash_task(
162161
task: impl Future<Output = ()> + Send + 'static,
163-
crash_delay: u64,
162+
crash_delay: Duration,
164163
op: &'static str,
165164
) {
166165
// TODO: implement this in the future.
@@ -177,7 +176,6 @@ async fn crash_task(
177176
let barrier_2 = barrier_1.clone();
178177
let task_1 = tokio::spawn(async move {
179178
barrier_1.wait().await;
180-
let crash_delay = Duration::from_nanos(crash_delay);
181179
sleep(crash_delay).await;
182180
tracing::info!("aborting {op} after {}ms", crash_delay.as_millis());
183181
std::process::abort();

torture/src/message.rs

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
//! This is used only for inter-process communication and thus doesn't need to care about versioning
44
//! or compatibility.
55
6+
use std::time::Duration;
7+
68
use serde::{Deserialize, Serialize};
79

810
pub type Key = [u8; 32];
@@ -47,7 +49,7 @@ pub struct InitPayload {
4749
/// Only used upon creation a new NOMT db.
4850
pub bitbox_seed: [u8; 16],
4951
/// Whether the agent is supposed to handle rollbacks.
50-
/// If `Some`, the maximum amount of supported blocks in a single rollback is specified.
52+
/// If `Some`, the maximum number of supported blocks in a single rollback is specified.
5153
pub rollback: Option<u32>,
5254
}
5355

@@ -60,8 +62,7 @@ pub struct CommitPayload {
6062
pub changeset: Vec<KeyValueChange>,
6163
/// If Some the supervisor expects the commit to crash,
6264
/// the crash should happen after the specified amount of time.
63-
/// Time is specified in nanoseconds.
64-
pub should_crash: Option<u64>,
65+
pub should_crash: Option<Duration>,
6566
}
6667

6768
/// The parameters for the [`ToAgent::Rollback`] message.
@@ -71,8 +72,7 @@ pub struct RollbackPayload {
7172
pub n_commits: usize,
7273
/// If Some the supervisor expects the rollback to crash,
7374
/// the crash should happen after the specified amount of time.
74-
/// Time is specified in nanoseconds.
75-
pub should_crash: Option<u64>,
75+
pub should_crash: Option<Duration>,
7676
}
7777

7878
/// The maximum size of an envelope, in the serialized form.
@@ -118,8 +118,7 @@ pub enum ToSupervisor {
118118
/// A generic acknowledgment message.
119119
Ack,
120120
/// The response to a successful commit, it contains the elapsed time to perform the commit.
121-
/// Time is measured in nanoseconds.
122-
CommitSuccessful(u64),
121+
CommitSuccessful(Duration),
123122
/// The response to a query for a key-value pair.
124123
QueryValue(Option<Value>),
125124
/// The response to a query for the current sequence number of the database.

torture/src/supervisor/workload.rs

Lines changed: 28 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ impl WorkloadState {
216216
}
217217

218218
fn rollback(&mut self, snapshot: Snapshot) {
219+
// The application of a rollback counts as increased sync_seq.
219220
self.committed.sync_seqn += 1;
220221
self.committed.state = snapshot.state;
221222
}
@@ -246,8 +247,8 @@ pub struct Workload {
246247
workload_id: u64,
247248
/// The seed for bitbox generated for this workload.
248249
bitbox_seed: [u8; 16],
249-
/// Data collected to evaluate the average commit time in nanoseconds.
250-
tot_commit_time: u64,
250+
/// Data collected to evaluate the average commit time.
251+
tot_commit_time: Duration,
251252
n_successfull_commit: u64,
252253
/// Whether to ensure the correct application of the changeset after every commit.
253254
ensure_changeset: bool,
@@ -257,18 +258,9 @@ pub struct Workload {
257258
sample_snapshot: bool,
258259
/// The max number of commits involved in a rollback.
259260
max_rollback_commits: u32,
260-
/// If `Some` there are rollbacks waiting to be applied.
261-
scheduled_rollbacks: ScheduledRollbacks,
262-
}
263-
264-
/// Tracker of rollbacks waiting for being applied.
265-
struct ScheduledRollbacks {
266-
/// If `Some` there is rollback waiting to be applied.
267-
rollback: Option<ScheduledRollback>,
268261
/// If `Some` there is rollback waiting to be applied,
269-
/// alongside the delay after which the rollback process should panic,
270-
/// measured in nanoseconds.
271-
rollback_crash: Option<(ScheduledRollback, u64)>,
262+
/// possibly alongside the delay after which the rollback process should panic.
263+
scheduled_rollback: Option<(ScheduledRollback, Option<Duration>)>,
272264
}
273265

274266
/// Contains the information required to apply a rollback.
@@ -317,16 +309,13 @@ impl Workload {
317309
state,
318310
workload_id,
319311
bitbox_seed,
320-
tot_commit_time: 0,
312+
tot_commit_time: Duration::ZERO,
321313
n_successfull_commit: 0,
322314
ensure_changeset: workload_params.ensure_changeset,
323315
ensure_snapshot: workload_params.ensure_snapshot,
324316
sample_snapshot: workload_params.sample_snapshot,
325317
max_rollback_commits: workload_params.max_rollback_commits,
326-
scheduled_rollbacks: ScheduledRollbacks {
327-
rollback: None,
328-
rollback_crash: None,
329-
},
318+
scheduled_rollback: None,
330319
}
331320
}
332321

@@ -360,28 +349,20 @@ impl Workload {
360349
let rr = agent.rr().clone();
361350
trace!("run_iteration");
362351

363-
if let Some((scheduled_rollback, should_crash)) = self
364-
.scheduled_rollbacks
365-
.is_time(self.state.committed.sync_seqn)
366-
{
352+
if self.scheduled_rollback.as_ref().map_or(false, |(r, _)| {
353+
r.sync_seqn == self.state.committed.sync_seqn
354+
}) {
355+
// UNWRAP: scheduled_rollback has just be checked to be `Some`
356+
let (scheduled_rollback, should_crash) = self.scheduled_rollback.take().unwrap();
367357
self.perform_scheduled_rollback(&rr, scheduled_rollback, should_crash)
368358
.await?;
369359
return Ok(());
370360
}
371361

372362
// Do not schedule new rollbacks if they are already scheduled.
373-
let mut rollback = self.state.biases.rollback;
374-
if self.scheduled_rollbacks.is_rollback_scheduled() {
375-
rollback = 0.0;
376-
}
377-
378-
let mut rollback_crash = self.state.biases.rollback_crash;
379-
if self.scheduled_rollbacks.is_rollback_crash_scheduled() {
380-
rollback_crash = 0.0;
381-
}
382-
383-
if self.state.rng.gen_bool(rollback) {
384-
if self.state.rng.gen_bool(rollback_crash) {
363+
let is_rollback_scheduled = self.scheduled_rollback.is_some();
364+
if !is_rollback_scheduled && self.state.rng.gen_bool(self.state.biases.rollback) {
365+
if self.state.rng.gen_bool(self.state.biases.rollback_crash) {
385366
self.schedule_rollback(true /*should_crash*/).await?
386367
} else {
387368
self.schedule_rollback(false /*should_crash*/).await?
@@ -437,19 +418,20 @@ impl Workload {
437418

438419
// The agent should crash after `crash_delay`ns.
439420
// If no data avaible crash after 300ms.
440-
let mut crash_delay = self
421+
let mut crash_delay_millis = self
441422
.tot_commit_time
442-
.checked_div(self.n_successfull_commit)
443-
.unwrap_or(Duration::from_millis(300).as_nanos() as u64);
423+
.as_millis()
424+
.checked_div(self.n_successfull_commit as u128)
425+
.unwrap_or(300) as u64;
444426
// Crash a little bit earlier than the average commit time to increase the
445427
// possibilities of crashing during sync.
446-
crash_delay = (crash_delay as f64 * 0.98) as u64;
428+
crash_delay_millis = (crash_delay_millis as f64 * 0.98) as u64;
447429

448430
trace!("exercising commit crash");
449431
rr.send_request(crate::message::ToAgent::Commit(
450432
crate::message::CommitPayload {
451433
changeset: changeset.clone(),
452-
should_crash: Some(crash_delay),
434+
should_crash: Some(Duration::from_millis(crash_delay_millis)),
453435
},
454436
))
455437
.await?;
@@ -478,10 +460,6 @@ impl Workload {
478460

479461
async fn schedule_rollback(&mut self, should_crash: bool) -> anyhow::Result<()> {
480462
let n_commits_to_rollback = self.state.rng.gen_range(1..self.max_rollback_commits) as usize;
481-
if n_commits_to_rollback == 0 {
482-
trace!("No available commits to perform rollback with");
483-
return Ok(());
484-
}
485463

486464
let last_snapshot = &self.state.committed;
487465
let rollback_sync_seqn = last_snapshot.sync_seqn + n_commits_to_rollback as u32;
@@ -491,14 +469,15 @@ impl Workload {
491469
snapshot: last_snapshot.clone(),
492470
};
493471

494-
if should_crash {
472+
let maybe_crash_delay = if should_crash {
495473
// TODO: more complex crash delay evaluation for rollbacks.
496-
let crash_delay = Duration::from_millis(10).as_nanos() as u64;
497-
self.scheduled_rollbacks.rollback_crash = Some((scheduled_rollback, crash_delay));
474+
Some(Duration::from_millis(10))
498475
} else {
499-
self.scheduled_rollbacks.rollback = Some(scheduled_rollback);
476+
None
500477
};
501478

479+
self.scheduled_rollback = Some((scheduled_rollback, maybe_crash_delay));
480+
502481
trace!(
503482
"scheduled rollback {}for sync_seqn: {} of {} commits",
504483
if should_crash { "crash " } else { "" },
@@ -513,7 +492,7 @@ impl Workload {
513492
&mut self,
514493
rr: &comms::RequestResponse,
515494
scheduled_rollback: ScheduledRollback,
516-
should_crash: Option<u64>,
495+
should_crash: Option<Duration>,
517496
) -> anyhow::Result<()> {
518497
let ScheduledRollback {
519498
n_commits,
@@ -560,7 +539,7 @@ impl Workload {
560539
rr: &comms::RequestResponse,
561540
n_commits_to_rollback: usize,
562541
snapshot: Snapshot,
563-
crash_delay: u64,
542+
crash_delay: Duration,
564543
) -> anyhow::Result<()> {
565544
trace!(
566545
"exercising rollback crash of {} commits",
@@ -778,47 +757,3 @@ impl Workload {
778757
self.workdir
779758
}
780759
}
781-
782-
impl ScheduledRollbacks {
783-
fn is_time(&mut self, curr_sync_seqn: u32) -> Option<(ScheduledRollback, Option<u64>)> {
784-
if self
785-
.rollback
786-
.as_ref()
787-
.map_or(false, |ScheduledRollback { sync_seqn, .. }| {
788-
curr_sync_seqn == *sync_seqn
789-
})
790-
{
791-
// UNWRAP: self.rollback has just been checked to be Some.
792-
let scheduled_rollback = self.rollback.take().unwrap();
793-
794-
// The probability of having scheduled at the same time both a rollback and
795-
// a rollback crash is very low, but if it happens, only the rollback will be applied.
796-
// Discard the rollback crash data.
797-
let _ = self.is_time(curr_sync_seqn);
798-
799-
return Some((scheduled_rollback, None));
800-
}
801-
802-
if self
803-
.rollback_crash
804-
.as_ref()
805-
.map_or(false, |(ScheduledRollback { sync_seqn, .. }, _)| {
806-
curr_sync_seqn == *sync_seqn
807-
})
808-
{
809-
// UNWRAP: self.rollback has just been checked to be Some.
810-
let (scheduled_rollback, crash_delay) = self.rollback_crash.take().unwrap();
811-
return Some((scheduled_rollback, Some(crash_delay)));
812-
}
813-
814-
return None;
815-
}
816-
817-
fn is_rollback_scheduled(&self) -> bool {
818-
self.rollback.is_some()
819-
}
820-
821-
fn is_rollback_crash_scheduled(&self) -> bool {
822-
self.rollback_crash.is_some()
823-
}
824-
}

0 commit comments

Comments
 (0)