Skip to content

Commit

Permalink
Add support for wrapping the work without timing wrap itself
Browse files Browse the repository at this point in the history
The `wrap` function can be e.g. used to install effect handlers.

This also tweaks the internal implementation of `Times.record` to try to
minimize memory use, allocations, and false sharing as much as possible.
  • Loading branch information
polytypic committed Feb 28, 2024
1 parent aeffdbe commit 646d437
Show file tree
Hide file tree
Showing 2 changed files with 156 additions and 86 deletions.
12 changes: 9 additions & 3 deletions lib/multicore_bench.mli
Original file line number Diff line number Diff line change
Expand Up @@ -67,16 +67,19 @@ module Times : sig
?domain_local_await:[< `Busy_wait | `Neglect > `Busy_wait ] ->
?n_warmups:int ->
?n_runs_min:int ->
?n_runs_max:int ->
?before:(unit -> unit) ->
init:(int -> 's) ->
?wrap:(int -> 's -> (unit -> unit) -> unit) ->
work:(int -> 's -> unit) ->
?after:(unit -> unit) ->
unit ->
t
(** [record ~budgetf ~n_domains ~init ~work ()] essentially repeatedly runs
[work i (init i)] on specified number of domains, [i ∊ [0, n_domains-1]],
and records the times that calls of [work] take. The calls of [work] are
synchronized to start as simultaneously as possible.
[let x = init i in wrap i x (fun () -> .. work i x ..)] on specified
number of domains, [i ∊ [0, n_domains-1]], and records the times that
calls of [work] take. The calls of [work] are synchronized to start as
simultaneously as possible.
Optional arguments:
Expand All @@ -96,6 +99,9 @@ module Times : sig
- [~n_runs_min]: Specifies the minimum number of timed runs. The upper
bound is determined dynamically based on [budgetf]. Defaults to [7].
- [~n_runs_max]: Specifies the maximum number of timed runs. Defaults to
[1023].
- [~before]: Specifies an action to run on one domain before [init].
- [~after]: Specifies an action to run on one domain after [work]. *)
Expand Down
230 changes: 147 additions & 83 deletions lib/times.ml
Original file line number Diff line number Diff line change
@@ -1,105 +1,169 @@
type t = { inverted : bool; times_per_domain : float array array; runs : int }

let record ~budgetf ~n_domains ?(ensure_multi_domain = true)
let with_busy_wait () =
let open struct
type state = Init | Released | Awaiting of { mutable released : bool }
end in
let state = Atomic.make Init in
let release () =
if Multicore_magic.fenceless_get state != Released then
match Atomic.exchange state Released with
| Awaiting r -> r.released <- true
| _ -> ()
in
let await () =
if Multicore_magic.fenceless_get state != Released then
let awaiting = Awaiting { released = false } in
if Atomic.compare_and_set state Init awaiting then
match awaiting with
| Awaiting r ->
(* Avoid sleeping *)
while not r.released do
Domain.cpu_relax ()
done
| _ -> ()
in
Domain_local_await.{ release; await }

let wrap _ _ action = action ()

(** ⚠️ This function is written in a very low level manner to avoid memore use,
allocations, and false sharing as much as possible during a run of [work] as
those can cause undesirable noise. *)
let record (type a) ~budgetf ~n_domains ?(ensure_multi_domain = true)
?(domain_local_await = `Busy_wait) ?(n_warmups = 3) ?(n_runs_min = 7)
?(before = Fun.id) ~init ~work ?(after = Fun.id) () =
let barrier = Barrier.make n_domains in
let results =
Array.init n_domains @@ fun _ ->
Stack.create () |> Multicore_magic.copy_as_padded
?(n_runs_max = 1023) ?(before = Fun.id) ~init ?(wrap = wrap) ~work
?(after = Fun.id) () =
Gc.full_major ();
let open struct
type shared = {
barrier : Barrier.t;
start_earliest : Mtime.Span.t Atomic.t;
work : int -> a -> unit;
wrap : int -> a -> (unit -> unit) -> unit;
results : Float.Array.t Array.t;
budget_start : Mtime.Span.t;
before : unit -> unit;
init : int -> a;
after : unit -> unit;
n_warmups : int;
n_runs_min : int;
budgetf : float;
mutable budget_used : bool;
mutable exit : bool;
mutable runs : int;
}
end in
let s =
{
barrier = Barrier.make n_domains;
start_earliest =
Atomic.make Mtime.Span.zero |> Multicore_magic.copy_as_padded;
work = Multicore_magic.copy_as_padded work;
wrap;
results =
Array.init n_domains (fun _ ->
Float.Array.create (Int.max n_runs_min n_runs_max));
budget_start = Mtime_clock.elapsed ();
before;
init;
after;
n_warmups;
n_runs_min;
budgetf;
budget_used = false;
exit = false;
runs = 0;
}
in
let budget_used = ref false |> Multicore_magic.copy_as_padded in
let runs = ref 0 |> Multicore_magic.copy_as_padded in
let exit = ref false in
let extra_domain =
if n_domains = 1 && ensure_multi_domain then
Some
( Domain.spawn @@ fun () ->
while not !exit do
while not s.exit do
Domain.cpu_relax ()
done )
else None
in
Gc.full_major ();
let budget_start = Mtime_clock.elapsed () in
let with_busy_wait () =
let open struct
type state = Init | Released | Awaiting of { mutable released : bool }
end in
let state = Atomic.make Init in
let release () =
if Multicore_magic.fenceless_get state != Released then
match Atomic.exchange state Released with
| Awaiting r -> r.released <- true
| _ -> ()
in
let await () =
if Multicore_magic.fenceless_get state != Released then
let awaiting = Awaiting { released = false } in
if Atomic.compare_and_set state Init awaiting then
match awaiting with
| Awaiting r ->
(* Avoid sleeping *)
while not r.released do
Domain.cpu_relax ()
done
| _ -> ()
in
Domain_local_await.{ release; await }
in
let start_earliest = Atomic.make Mtime.Span.zero in
let main domain_i =
let main i =
let benchmark () =
for _ = 1 to n_warmups do
Barrier.await barrier;
if domain_i = 0 then begin
before ();
let open struct
type local = {
domain_i : int;
mutable stop_current : Mtime.Span.t;
mutable state : a;
}
end in
let l =
Multicore_magic.copy_as_padded
{ domain_i = i; stop_current = Mtime.Span.zero; state = Obj.magic () }
in
let doit =
Multicore_magic.copy_as_padded @@ fun () ->
Barrier.await s.barrier;
if Multicore_magic.fenceless_get s.start_earliest == Mtime.Span.zero
then begin
let start_current = Mtime_clock.elapsed () in
if Multicore_magic.fenceless_get s.start_earliest == Mtime.Span.zero
then
Atomic.compare_and_set s.start_earliest Mtime.Span.zero
start_current
|> ignore
end;
s.work l.domain_i l.state;
l.stop_current <- Mtime_clock.elapsed ()
in
(* warmup runs *)
for _ = 1 to s.n_warmups do
if l.domain_i = 0 then begin
Multicore_magic.fenceless_set s.start_earliest Mtime.Span.zero;
s.before ();
Gc.major ()
end;
Barrier.await barrier;
let state = init domain_i in
Barrier.await barrier;
work domain_i state;
Barrier.await barrier;
if domain_i = 0 then after ()
Barrier.await s.barrier;
l.state <- s.init l.domain_i;
s.wrap l.domain_i l.state doit;
Barrier.await s.barrier;
l.state <- Obj.magic ();
if l.domain_i = 0 then s.after ();
Barrier.await s.barrier
done;
while !runs < n_runs_min || not !budget_used do
Barrier.await barrier;
if domain_i = 0 then begin
Multicore_magic.fenceless_set start_earliest Mtime.Span.zero;
before ();
(* timed runs *)
while
(s.runs < s.n_runs_min && s.runs < Float.Array.length s.results.(0))
|| not s.budget_used
do
if l.domain_i = 0 then begin
Multicore_magic.fenceless_set s.start_earliest Mtime.Span.zero;
s.before ();
Gc.major ()
end;
Barrier.await s.barrier;
l.state <- s.init l.domain_i;
s.wrap l.domain_i l.state doit;
Barrier.await s.barrier;
l.state <- Obj.magic ();
Float.Array.set s.results.(l.domain_i) s.runs
(Mtime.Span.to_float_ns
(Mtime.Span.abs_diff l.stop_current
(Multicore_magic.fenceless_get s.start_earliest))
*. (1. /. 1_000_000_000.0));
l.stop_current <- Mtime.Span.zero;
Barrier.await s.barrier;
if l.domain_i = 0 then begin
s.after ();
s.runs <- s.runs + 1;
if
let budget_stop = Mtime_clock.elapsed () in
let elapsedf =
Mtime.Span.to_float_ns
(Mtime.Span.abs_diff budget_stop budget_start)
(Mtime.Span.abs_diff budget_stop s.budget_start)
*. (1. /. 1_000_000_000.0)
in
budgetf < elapsedf
then budget_used := true;
incr runs;
Gc.major ()
end;
Barrier.await barrier;
let state = init domain_i in
Barrier.await barrier;
if Multicore_magic.fenceless_get start_earliest == Mtime.Span.zero then begin
let start_current = Mtime_clock.elapsed () in
if Multicore_magic.fenceless_get start_earliest == Mtime.Span.zero
then
Atomic.compare_and_set start_earliest Mtime.Span.zero start_current
|> ignore
s.budgetf < elapsedf
then s.budget_used <- true
end;
work domain_i state;
let stop_current = Mtime_clock.elapsed () in
Barrier.await barrier;
if domain_i = 0 then after ();
Stack.push
(Mtime.Span.to_float_ns
(Mtime.Span.abs_diff stop_current
(Multicore_magic.fenceless_get start_earliest))
*. (1. /. 1_000_000_000.0))
results.(domain_i)
Barrier.await s.barrier
done
in
match domain_local_await with
Expand All @@ -113,14 +177,14 @@ let record ~budgetf ~n_domains ?(ensure_multi_domain = true)
Domain.spawn @@ fun () -> main (domain_i + 1)
in
main 0;
s.exit <- true;
Array.iter Domain.join domains;
exit := true;
Option.iter Domain.join extra_domain;
let times_per_domain =
Array.init (Array.length results) @@ fun i ->
Stack.to_seq results.(i) |> Array.of_seq
Array.init (Array.length s.results) @@ fun i ->
s.results.(i) |> Float.Array.to_seq |> Seq.take s.runs |> Array.of_seq
in
{ inverted = false; times_per_domain; runs = !runs }
{ inverted = false; times_per_domain; runs = s.runs }

let average { inverted; times_per_domain; runs } =
let domains = Array.length times_per_domain in
Expand Down

0 comments on commit 646d437

Please sign in to comment.