Skip to content

Commit fb78541

Browse files
committed
Add support for wrapping the work without timing wrap itself
The `wrap` function can be e.g. used to install effect handlers. This also tweaks the internal implementation of `Times.record` to try to minimize memory use, allocations, and false sharing as much as possible.
1 parent aeffdbe commit fb78541

File tree

2 files changed

+180
-105
lines changed

2 files changed

+180
-105
lines changed

lib/multicore_bench.mli

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,16 +67,19 @@ module Times : sig
6767
?domain_local_await:[< `Busy_wait | `Neglect > `Busy_wait ] ->
6868
?n_warmups:int ->
6969
?n_runs_min:int ->
70+
?n_runs_max:int ->
7071
?before:(unit -> unit) ->
7172
init:(int -> 's) ->
73+
?wrap:(int -> 's -> (unit -> unit) -> unit) ->
7274
work:(int -> 's -> unit) ->
7375
?after:(unit -> unit) ->
7476
unit ->
7577
t
7678
(** [record ~budgetf ~n_domains ~init ~work ()] essentially repeatedly runs
77-
[work i (init i)] on specified number of domains, [i ∊ [0, n_domains-1]],
78-
and records the times that calls of [work] take. The calls of [work] are
79-
synchronized to start as simultaneously as possible.
79+
[let x = init i in wrap i x (fun () -> .. work i x ..)] on specified
80+
number of domains, [i ∊ [0, n_domains-1]], and records the times that
81+
calls of [work] take. The calls of [work] are synchronized to start as
82+
simultaneously as possible.
8083
8184
Optional arguments:
8285
@@ -96,6 +99,9 @@ module Times : sig
9699
- [~n_runs_min]: Specifies the minimum number of timed runs. The upper
97100
bound is determined dynamically based on [budgetf]. Defaults to [7].
98101
102+
- [~n_runs_max]: Specifies the maximum number of timed runs. Defaults to
103+
[1023].
104+
99105
- [~before]: Specifies an action to run on one domain before [init].
100106
101107
- [~after]: Specifies an action to run on one domain after [work]. *)

lib/times.ml

Lines changed: 171 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -1,105 +1,167 @@
1-
type t = { inverted : bool; times_per_domain : float array array; runs : int }
1+
type t = { inverted : bool; times_per_domain : Float.Array.t array; runs : int }
22

3-
let record ~budgetf ~n_domains ?(ensure_multi_domain = true)
3+
let with_busy_wait () =
4+
let open struct
5+
type state = Init | Released | Awaiting of { mutable released : bool }
6+
end in
7+
let state = Atomic.make Init in
8+
let release () =
9+
if Multicore_magic.fenceless_get state != Released then
10+
match Atomic.exchange state Released with
11+
| Awaiting r -> r.released <- true
12+
| _ -> ()
13+
in
14+
let await () =
15+
if Multicore_magic.fenceless_get state != Released then
16+
let awaiting = Awaiting { released = false } in
17+
if Atomic.compare_and_set state Init awaiting then
18+
match awaiting with
19+
| Awaiting r ->
20+
(* Avoid sleeping *)
21+
while not r.released do
22+
Domain.cpu_relax ()
23+
done
24+
| _ -> ()
25+
in
26+
Domain_local_await.{ release; await }
27+
28+
let wrap _ _ action = action ()
29+
30+
(** ⚠️ This function is written in a very low level manner to avoid memore use,
31+
allocations, and false sharing as much as possible during a run of [work] as
32+
those can cause undesirable noise. *)
33+
let record (type a) ~budgetf ~n_domains ?(ensure_multi_domain = true)
434
?(domain_local_await = `Busy_wait) ?(n_warmups = 3) ?(n_runs_min = 7)
5-
?(before = Fun.id) ~init ~work ?(after = Fun.id) () =
6-
let barrier = Barrier.make n_domains in
7-
let results =
8-
Array.init n_domains @@ fun _ ->
9-
Stack.create () |> Multicore_magic.copy_as_padded
35+
?(n_runs_max = 1023) ?(before = Fun.id) ~init ?(wrap = wrap) ~work
36+
?(after = Fun.id) () =
37+
Gc.full_major ();
38+
let open struct
39+
type shared = {
40+
barrier : Barrier.t;
41+
start_earliest : Mtime.Span.t Atomic.t;
42+
work : int -> a -> unit;
43+
wrap : int -> a -> (unit -> unit) -> unit;
44+
results : Float.Array.t Array.t;
45+
budget_start : Mtime.Span.t;
46+
before : unit -> unit;
47+
init : int -> a;
48+
after : unit -> unit;
49+
n_warmups : int;
50+
n_runs_min : int;
51+
budgetf : float;
52+
mutable budget_used : bool;
53+
mutable exit : bool;
54+
mutable runs : int;
55+
}
56+
end in
57+
let s =
58+
{
59+
barrier = Barrier.make n_domains;
60+
start_earliest =
61+
Atomic.make Mtime.Span.zero |> Multicore_magic.copy_as_padded;
62+
work = Multicore_magic.copy_as_padded work;
63+
wrap;
64+
results =
65+
Array.init n_domains (fun _ ->
66+
Float.Array.create (Int.max n_runs_min n_runs_max));
67+
budget_start = Mtime_clock.elapsed ();
68+
before;
69+
init;
70+
after;
71+
n_warmups;
72+
n_runs_min;
73+
budgetf;
74+
budget_used = false;
75+
exit = false;
76+
runs = 0;
77+
}
1078
in
11-
let budget_used = ref false |> Multicore_magic.copy_as_padded in
12-
let runs = ref 0 |> Multicore_magic.copy_as_padded in
13-
let exit = ref false in
1479
let extra_domain =
1580
if n_domains = 1 && ensure_multi_domain then
1681
Some
1782
( Domain.spawn @@ fun () ->
18-
while not !exit do
83+
while not s.exit do
1984
Domain.cpu_relax ()
2085
done )
2186
else None
2287
in
23-
Gc.full_major ();
24-
let budget_start = Mtime_clock.elapsed () in
25-
let with_busy_wait () =
26-
let open struct
27-
type state = Init | Released | Awaiting of { mutable released : bool }
28-
end in
29-
let state = Atomic.make Init in
30-
let release () =
31-
if Multicore_magic.fenceless_get state != Released then
32-
match Atomic.exchange state Released with
33-
| Awaiting r -> r.released <- true
34-
| _ -> ()
35-
in
36-
let await () =
37-
if Multicore_magic.fenceless_get state != Released then
38-
let awaiting = Awaiting { released = false } in
39-
if Atomic.compare_and_set state Init awaiting then
40-
match awaiting with
41-
| Awaiting r ->
42-
(* Avoid sleeping *)
43-
while not r.released do
44-
Domain.cpu_relax ()
45-
done
46-
| _ -> ()
47-
in
48-
Domain_local_await.{ release; await }
49-
in
50-
let start_earliest = Atomic.make Mtime.Span.zero in
51-
let main domain_i =
88+
let main i =
5289
let benchmark () =
53-
for _ = 1 to n_warmups do
54-
Barrier.await barrier;
55-
if domain_i = 0 then begin
56-
before ();
90+
let open struct
91+
type local = {
92+
domain_i : int;
93+
mutable stop_current : Mtime.Span.t;
94+
mutable state : a;
95+
}
96+
end in
97+
let l =
98+
Multicore_magic.copy_as_padded
99+
{ domain_i = i; stop_current = Mtime.Span.zero; state = Obj.magic () }
100+
in
101+
let doit =
102+
Multicore_magic.copy_as_padded @@ fun () ->
103+
Barrier.await s.barrier;
104+
if Multicore_magic.fenceless_get s.start_earliest == Mtime.Span.zero
105+
then begin
106+
let start_current = Mtime_clock.elapsed () in
107+
if Multicore_magic.fenceless_get s.start_earliest == Mtime.Span.zero
108+
then
109+
Atomic.compare_and_set s.start_earliest Mtime.Span.zero
110+
start_current
111+
|> ignore
112+
end;
113+
s.work l.domain_i l.state;
114+
l.stop_current <- Mtime_clock.elapsed ()
115+
in
116+
(* warmup runs *)
117+
for _ = 1 to s.n_warmups do
118+
if l.domain_i = 0 then begin
119+
Multicore_magic.fenceless_set s.start_earliest Mtime.Span.zero;
120+
s.before ();
57121
Gc.major ()
58122
end;
59-
Barrier.await barrier;
60-
let state = init domain_i in
61-
Barrier.await barrier;
62-
work domain_i state;
63-
Barrier.await barrier;
64-
if domain_i = 0 then after ()
123+
Barrier.await s.barrier;
124+
l.state <- s.init l.domain_i;
125+
s.wrap l.domain_i l.state doit;
126+
Barrier.await s.barrier;
127+
l.state <- Obj.magic ();
128+
if l.domain_i = 0 then s.after ();
129+
Barrier.await s.barrier
65130
done;
66-
while !runs < n_runs_min || not !budget_used do
67-
Barrier.await barrier;
68-
if domain_i = 0 then begin
69-
Multicore_magic.fenceless_set start_earliest Mtime.Span.zero;
70-
before ();
131+
(* timed runs *)
132+
while s.runs < s.n_runs_min || not s.budget_used do
133+
if l.domain_i = 0 then begin
134+
Multicore_magic.fenceless_set s.start_earliest Mtime.Span.zero;
135+
s.before ();
136+
Gc.major ()
137+
end;
138+
Barrier.await s.barrier;
139+
l.state <- s.init l.domain_i;
140+
s.wrap l.domain_i l.state doit;
141+
Barrier.await s.barrier;
142+
l.state <- Obj.magic ();
143+
Float.Array.set s.results.(l.domain_i) s.runs
144+
(Mtime.Span.to_float_ns
145+
(Mtime.Span.abs_diff l.stop_current
146+
(Multicore_magic.fenceless_get s.start_earliest))
147+
*. (1. /. 1_000_000_000.0));
148+
l.stop_current <- Mtime.Span.zero;
149+
Barrier.await s.barrier;
150+
if l.domain_i = 0 then begin
151+
s.after ();
152+
s.runs <- s.runs + 1;
71153
if
72154
let budget_stop = Mtime_clock.elapsed () in
73155
let elapsedf =
74156
Mtime.Span.to_float_ns
75-
(Mtime.Span.abs_diff budget_stop budget_start)
157+
(Mtime.Span.abs_diff budget_stop s.budget_start)
76158
*. (1. /. 1_000_000_000.0)
77159
in
78-
budgetf < elapsedf
79-
then budget_used := true;
80-
incr runs;
81-
Gc.major ()
82-
end;
83-
Barrier.await barrier;
84-
let state = init domain_i in
85-
Barrier.await barrier;
86-
if Multicore_magic.fenceless_get start_earliest == Mtime.Span.zero then begin
87-
let start_current = Mtime_clock.elapsed () in
88-
if Multicore_magic.fenceless_get start_earliest == Mtime.Span.zero
89-
then
90-
Atomic.compare_and_set start_earliest Mtime.Span.zero start_current
91-
|> ignore
160+
s.budgetf < elapsedf
161+
|| Float.Array.length s.results.(l.domain_i) <= s.runs
162+
then s.budget_used <- true
92163
end;
93-
work domain_i state;
94-
let stop_current = Mtime_clock.elapsed () in
95-
Barrier.await barrier;
96-
if domain_i = 0 then after ();
97-
Stack.push
98-
(Mtime.Span.to_float_ns
99-
(Mtime.Span.abs_diff stop_current
100-
(Multicore_magic.fenceless_get start_earliest))
101-
*. (1. /. 1_000_000_000.0))
102-
results.(domain_i)
164+
Barrier.await s.barrier
103165
done
104166
in
105167
match domain_local_await with
@@ -113,33 +175,35 @@ let record ~budgetf ~n_domains ?(ensure_multi_domain = true)
113175
Domain.spawn @@ fun () -> main (domain_i + 1)
114176
in
115177
main 0;
178+
s.exit <- true;
116179
Array.iter Domain.join domains;
117-
exit := true;
118180
Option.iter Domain.join extra_domain;
119181
let times_per_domain =
120-
Array.init (Array.length results) @@ fun i ->
121-
Stack.to_seq results.(i) |> Array.of_seq
182+
s.results |> Array.map @@ fun times -> Float.Array.sub times 0 s.runs
122183
in
123-
{ inverted = false; times_per_domain; runs = !runs }
184+
{ inverted = false; times_per_domain; runs = s.runs }
124185

125186
let average { inverted; times_per_domain; runs } =
126187
let domains = Array.length times_per_domain in
127-
let n = Array.length times_per_domain.(0) in
128-
let times = Array.create_float n in
188+
let n = Float.Array.length times_per_domain.(0) in
189+
let times = Float.Array.create n in
129190
for run_i = 0 to n - 1 do
130-
times.(run_i) <- 0.0;
191+
Float.Array.set times run_i 0.0;
131192
for domain_i = 0 to domains - 1 do
132-
times.(run_i) <- times.(run_i) +. times_per_domain.(domain_i).(run_i)
193+
Float.Array.set times run_i
194+
(Float.Array.get times run_i
195+
+. Float.Array.get times_per_domain.(domain_i) run_i)
133196
done;
134-
times.(run_i) <- times.(run_i) /. Float.of_int domains
197+
Float.Array.set times run_i
198+
(Float.Array.get times run_i /. Float.of_int domains)
135199
done;
136200
{ inverted; times_per_domain = [| times |]; runs }
137201

138202
let invert { inverted; times_per_domain; runs } =
139203
{
140204
inverted = not inverted;
141205
times_per_domain =
142-
Array.map (Array.map (fun v -> 1.0 /. v)) times_per_domain;
206+
Array.map (Float.Array.map (fun v -> 1.0 /. v)) times_per_domain;
143207
runs;
144208
}
145209

@@ -163,39 +227,44 @@ module Stats = struct
163227
}
164228

165229
let mean_of times =
166-
Array.fold_left ( +. ) 0.0 times /. Float.of_int (Array.length times)
230+
Float.Array.fold_left ( +. ) 0.0 times
231+
/. Float.of_int (Float.Array.length times)
167232

168233
let sd_of times mean =
169234
Float.sqrt
170235
(mean_of
171-
(Array.map
236+
(Float.Array.map
172237
(fun v ->
173238
let d = v -. mean in
174239
d *. d)
175240
times))
176241

177242
let median_of times =
178-
Array.sort Float.compare times;
179-
let n = Array.length times in
180-
if n land 1 = 0 then (times.((n asr 1) - 1) +. times.(n asr 1)) /. 2.0
181-
else times.(n asr 1)
243+
Float.Array.sort Float.compare times;
244+
let n = Float.Array.length times in
245+
if n land 1 = 0 then
246+
(Float.Array.get times ((n asr 1) - 1) +. Float.Array.get times (n asr 1))
247+
/. 2.0
248+
else Float.Array.get times (n asr 1)
182249

183250
let of_times { inverted; times_per_domain; runs } =
184251
let domains = Array.length times_per_domain in
185-
let n = Array.length times_per_domain.(0) in
186-
let times = Array.create_float n in
252+
let n = Float.Array.length times_per_domain.(0) in
253+
let times = Float.Array.create n in
187254
for run_i = 0 to n - 1 do
188-
times.(run_i) <- 0.0;
255+
Float.Array.set times run_i 0.0;
189256
for domain_i = 0 to domains - 1 do
190-
times.(run_i) <- times.(run_i) +. times_per_domain.(domain_i).(run_i)
257+
Float.Array.set times run_i
258+
(Float.Array.get times run_i
259+
+. Float.Array.get times_per_domain.(domain_i) run_i)
191260
done
192261
done;
193262
let mean = mean_of times in
194263
let sd = sd_of times mean in
195264
let median = median_of times in
196265
let best =
197-
if inverted then Array.fold_left Float.max Float.min_float times
198-
else Array.fold_left Float.min Float.max_float times
266+
if inverted then Float.Array.fold_left Float.max Float.min_float times
267+
else Float.Array.fold_left Float.min Float.max_float times
199268
in
200269
{ mean; sd; median; inverted; best; runs }
201270

0 commit comments

Comments
 (0)