1
- type t = { inverted : bool ; times_per_domain : float array array ; runs : int }
1
+ type t = { inverted : bool ; times_per_domain : Float.Array .t array ; runs : int }
2
2
3
- let record ~budgetf ~n_domains ?(ensure_multi_domain = true )
3
+ let with_busy_wait () =
4
+ let open struct
5
+ type state = Init | Released | Awaiting of { mutable released : bool }
6
+ end in
7
+ let state = Atomic. make Init in
8
+ let release () =
9
+ if Multicore_magic. fenceless_get state != Released then
10
+ match Atomic. exchange state Released with
11
+ | Awaiting r -> r.released < - true
12
+ | _ -> ()
13
+ in
14
+ let await () =
15
+ if Multicore_magic. fenceless_get state != Released then
16
+ let awaiting = Awaiting { released = false } in
17
+ if Atomic. compare_and_set state Init awaiting then
18
+ match awaiting with
19
+ | Awaiting r ->
20
+ (* Avoid sleeping *)
21
+ while not r.released do
22
+ Domain. cpu_relax ()
23
+ done
24
+ | _ -> ()
25
+ in
26
+ Domain_local_await. { release; await }
27
+
28
+ let wrap _ _ action = action ()
29
+
30
+ (* * ⚠️ This function is written in a very low level manner to avoid memore use,
31
+ allocations, and false sharing as much as possible during a run of [work] as
32
+ those can cause undesirable noise. *)
33
+ let record (type a ) ~budgetf ~n_domains ?(ensure_multi_domain = true )
4
34
?(domain_local_await = `Busy_wait ) ?(n_warmups = 3 ) ?(n_runs_min = 7 )
5
- ?(before = Fun. id) ~init ~work ?(after = Fun. id) () =
6
- let barrier = Barrier. make n_domains in
7
- let results =
8
- Array. init n_domains @@ fun _ ->
9
- Stack. create () |> Multicore_magic. copy_as_padded
35
+ ?(n_runs_max = 1023 ) ?(before = Fun. id) ~init ?(wrap = wrap) ~work
36
+ ?(after = Fun. id) () =
37
+ Gc. full_major () ;
38
+ let open struct
39
+ type shared = {
40
+ barrier : Barrier .t ;
41
+ start_earliest : Mtime.Span .t Atomic .t ;
42
+ work : int -> a -> unit ;
43
+ wrap : int -> a -> (unit -> unit ) -> unit ;
44
+ results : Float.Array .t Array .t ;
45
+ budget_start : Mtime.Span .t ;
46
+ before : unit -> unit ;
47
+ init : int -> a ;
48
+ after : unit -> unit ;
49
+ n_warmups : int ;
50
+ n_runs_min : int ;
51
+ budgetf : float ;
52
+ mutable budget_used : bool ;
53
+ mutable exit : bool ;
54
+ mutable runs : int ;
55
+ }
56
+ end in
57
+ let s =
58
+ {
59
+ barrier = Barrier. make n_domains;
60
+ start_earliest =
61
+ Atomic. make Mtime.Span. zero |> Multicore_magic. copy_as_padded;
62
+ work = Multicore_magic. copy_as_padded work;
63
+ wrap;
64
+ results =
65
+ Array. init n_domains (fun _ ->
66
+ Float.Array. create (Int. max n_runs_min n_runs_max));
67
+ budget_start = Mtime_clock. elapsed () ;
68
+ before;
69
+ init;
70
+ after;
71
+ n_warmups;
72
+ n_runs_min;
73
+ budgetf;
74
+ budget_used = false ;
75
+ exit = false ;
76
+ runs = 0 ;
77
+ }
10
78
in
11
- let budget_used = ref false |> Multicore_magic. copy_as_padded in
12
- let runs = ref 0 |> Multicore_magic. copy_as_padded in
13
- let exit = ref false in
14
79
let extra_domain =
15
80
if n_domains = 1 && ensure_multi_domain then
16
81
Some
17
82
( Domain. spawn @@ fun () ->
18
- while not ! exit do
83
+ while not s. exit do
19
84
Domain. cpu_relax ()
20
85
done )
21
86
else None
22
87
in
23
- Gc. full_major () ;
24
- let budget_start = Mtime_clock. elapsed () in
25
- let with_busy_wait () =
26
- let open struct
27
- type state = Init | Released | Awaiting of { mutable released : bool }
28
- end in
29
- let state = Atomic. make Init in
30
- let release () =
31
- if Multicore_magic. fenceless_get state != Released then
32
- match Atomic. exchange state Released with
33
- | Awaiting r -> r.released < - true
34
- | _ -> ()
35
- in
36
- let await () =
37
- if Multicore_magic. fenceless_get state != Released then
38
- let awaiting = Awaiting { released = false } in
39
- if Atomic. compare_and_set state Init awaiting then
40
- match awaiting with
41
- | Awaiting r ->
42
- (* Avoid sleeping *)
43
- while not r.released do
44
- Domain. cpu_relax ()
45
- done
46
- | _ -> ()
47
- in
48
- Domain_local_await. { release; await }
49
- in
50
- let start_earliest = Atomic. make Mtime.Span. zero in
51
- let main domain_i =
88
+ let main i =
52
89
let benchmark () =
53
- for _ = 1 to n_warmups do
54
- Barrier. await barrier;
55
- if domain_i = 0 then begin
56
- before () ;
90
+ let open struct
91
+ type local = {
92
+ domain_i : int ;
93
+ mutable stop_current : Mtime.Span .t ;
94
+ mutable state : a ;
95
+ }
96
+ end in
97
+ let l =
98
+ Multicore_magic. copy_as_padded
99
+ { domain_i = i; stop_current = Mtime.Span. zero; state = Obj. magic () }
100
+ in
101
+ let doit =
102
+ Multicore_magic. copy_as_padded @@ fun () ->
103
+ Barrier. await s.barrier;
104
+ if Multicore_magic. fenceless_get s.start_earliest == Mtime.Span. zero
105
+ then begin
106
+ let start_current = Mtime_clock. elapsed () in
107
+ if Multicore_magic. fenceless_get s.start_earliest == Mtime.Span. zero
108
+ then
109
+ Atomic. compare_and_set s.start_earliest Mtime.Span. zero
110
+ start_current
111
+ |> ignore
112
+ end ;
113
+ s.work l.domain_i l.state;
114
+ l.stop_current < - Mtime_clock. elapsed ()
115
+ in
116
+ (* warmup runs *)
117
+ for _ = 1 to s.n_warmups do
118
+ if l.domain_i = 0 then begin
119
+ Multicore_magic. fenceless_set s.start_earliest Mtime.Span. zero;
120
+ s.before () ;
57
121
Gc. major ()
58
122
end ;
59
- Barrier. await barrier;
60
- let state = init domain_i in
61
- Barrier. await barrier;
62
- work domain_i state;
63
- Barrier. await barrier;
64
- if domain_i = 0 then after ()
123
+ Barrier. await s.barrier;
124
+ l.state < - s.init l.domain_i;
125
+ s.wrap l.domain_i l.state doit;
126
+ Barrier. await s.barrier;
127
+ l.state < - Obj. magic () ;
128
+ if l.domain_i = 0 then s.after () ;
129
+ Barrier. await s.barrier
65
130
done ;
66
- while ! runs < n_runs_min || not ! budget_used do
67
- Barrier. await barrier;
68
- if domain_i = 0 then begin
69
- Multicore_magic. fenceless_set start_earliest Mtime.Span. zero;
70
- before () ;
131
+ (* timed runs *)
132
+ while s.runs < s.n_runs_min || not s.budget_used do
133
+ if l.domain_i = 0 then begin
134
+ Multicore_magic. fenceless_set s.start_earliest Mtime.Span. zero;
135
+ s.before () ;
136
+ Gc. major ()
137
+ end ;
138
+ Barrier. await s.barrier;
139
+ l.state < - s.init l.domain_i;
140
+ s.wrap l.domain_i l.state doit;
141
+ Barrier. await s.barrier;
142
+ l.state < - Obj. magic () ;
143
+ Float.Array. set s.results.(l.domain_i) s.runs
144
+ (Mtime.Span. to_float_ns
145
+ (Mtime.Span. abs_diff l.stop_current
146
+ (Multicore_magic. fenceless_get s.start_earliest))
147
+ *. (1. /. 1_000_000_000.0 ));
148
+ l.stop_current < - Mtime.Span. zero;
149
+ Barrier. await s.barrier;
150
+ if l.domain_i = 0 then begin
151
+ s.after () ;
152
+ s.runs < - s.runs + 1 ;
71
153
if
72
154
let budget_stop = Mtime_clock. elapsed () in
73
155
let elapsedf =
74
156
Mtime.Span. to_float_ns
75
- (Mtime.Span. abs_diff budget_stop budget_start)
157
+ (Mtime.Span. abs_diff budget_stop s. budget_start)
76
158
*. (1. /. 1_000_000_000.0 )
77
159
in
78
- budgetf < elapsedf
79
- then budget_used := true ;
80
- incr runs;
81
- Gc. major ()
82
- end ;
83
- Barrier. await barrier;
84
- let state = init domain_i in
85
- Barrier. await barrier;
86
- if Multicore_magic. fenceless_get start_earliest == Mtime.Span. zero then begin
87
- let start_current = Mtime_clock. elapsed () in
88
- if Multicore_magic. fenceless_get start_earliest == Mtime.Span. zero
89
- then
90
- Atomic. compare_and_set start_earliest Mtime.Span. zero start_current
91
- |> ignore
160
+ s.budgetf < elapsedf
161
+ || Float.Array. length s.results.(l.domain_i) < = s.runs
162
+ then s.budget_used < - true
92
163
end ;
93
- work domain_i state;
94
- let stop_current = Mtime_clock. elapsed () in
95
- Barrier. await barrier;
96
- if domain_i = 0 then after () ;
97
- Stack. push
98
- (Mtime.Span. to_float_ns
99
- (Mtime.Span. abs_diff stop_current
100
- (Multicore_magic. fenceless_get start_earliest))
101
- *. (1. /. 1_000_000_000.0 ))
102
- results.(domain_i)
164
+ Barrier. await s.barrier
103
165
done
104
166
in
105
167
match domain_local_await with
@@ -113,33 +175,35 @@ let record ~budgetf ~n_domains ?(ensure_multi_domain = true)
113
175
Domain. spawn @@ fun () -> main (domain_i + 1 )
114
176
in
115
177
main 0 ;
178
+ s.exit < - true ;
116
179
Array. iter Domain. join domains;
117
- exit := true ;
118
180
Option. iter Domain. join extra_domain;
119
181
let times_per_domain =
120
- Array. init (Array. length results) @@ fun i ->
121
- Stack. to_seq results.(i) |> Array. of_seq
182
+ s.results |> Array. map @@ fun times -> Float.Array. sub times 0 s.runs
122
183
in
123
- { inverted = false ; times_per_domain; runs = ! runs }
184
+ { inverted = false ; times_per_domain; runs = s. runs }
124
185
125
186
let average { inverted; times_per_domain; runs } =
126
187
let domains = Array. length times_per_domain in
127
- let n = Array. length times_per_domain.(0 ) in
128
- let times = Array. create_float n in
188
+ let n = Float. Array. length times_per_domain.(0 ) in
189
+ let times = Float. Array.create n in
129
190
for run_i = 0 to n - 1 do
130
- times.(run_i) < - 0.0 ;
191
+ Float.Array. set times run_i 0.0 ;
131
192
for domain_i = 0 to domains - 1 do
132
- times.(run_i) < - times.(run_i) +. times_per_domain.(domain_i).(run_i)
193
+ Float.Array. set times run_i
194
+ (Float.Array. get times run_i
195
+ +. Float.Array. get times_per_domain.(domain_i) run_i)
133
196
done ;
134
- times.(run_i) < - times.(run_i) /. Float. of_int domains
197
+ Float.Array. set times run_i
198
+ (Float.Array. get times run_i /. Float. of_int domains)
135
199
done ;
136
200
{ inverted; times_per_domain = [| times |]; runs }
137
201
138
202
let invert { inverted; times_per_domain; runs } =
139
203
{
140
204
inverted = not inverted;
141
205
times_per_domain =
142
- Array. map (Array. map (fun v -> 1.0 /. v)) times_per_domain;
206
+ Array. map (Float. Array. map (fun v -> 1.0 /. v)) times_per_domain;
143
207
runs;
144
208
}
145
209
@@ -163,39 +227,44 @@ module Stats = struct
163
227
}
164
228
165
229
let mean_of times =
166
- Array. fold_left ( +. ) 0.0 times /. Float. of_int (Array. length times)
230
+ Float.Array. fold_left ( +. ) 0.0 times
231
+ /. Float. of_int (Float.Array. length times)
167
232
168
233
let sd_of times mean =
169
234
Float. sqrt
170
235
(mean_of
171
- (Array. map
236
+ (Float. Array. map
172
237
(fun v ->
173
238
let d = v -. mean in
174
239
d *. d)
175
240
times))
176
241
177
242
let median_of times =
178
- Array. sort Float. compare times;
179
- let n = Array. length times in
180
- if n land 1 = 0 then (times.((n asr 1 ) - 1 ) +. times.(n asr 1 )) /. 2.0
181
- else times.(n asr 1 )
243
+ Float.Array. sort Float. compare times;
244
+ let n = Float.Array. length times in
245
+ if n land 1 = 0 then
246
+ (Float.Array. get times ((n asr 1 ) - 1 ) +. Float.Array. get times (n asr 1 ))
247
+ /. 2.0
248
+ else Float.Array. get times (n asr 1 )
182
249
183
250
let of_times { inverted; times_per_domain; runs } =
184
251
let domains = Array. length times_per_domain in
185
- let n = Array. length times_per_domain.(0 ) in
186
- let times = Array. create_float n in
252
+ let n = Float. Array. length times_per_domain.(0 ) in
253
+ let times = Float. Array.create n in
187
254
for run_i = 0 to n - 1 do
188
- times.(run_i) < - 0.0 ;
255
+ Float.Array. set times run_i 0.0 ;
189
256
for domain_i = 0 to domains - 1 do
190
- times.(run_i) < - times.(run_i) +. times_per_domain.(domain_i).(run_i)
257
+ Float.Array. set times run_i
258
+ (Float.Array. get times run_i
259
+ +. Float.Array. get times_per_domain.(domain_i) run_i)
191
260
done
192
261
done ;
193
262
let mean = mean_of times in
194
263
let sd = sd_of times mean in
195
264
let median = median_of times in
196
265
let best =
197
- if inverted then Array. fold_left Float. max Float. min_float times
198
- else Array. fold_left Float. min Float. max_float times
266
+ if inverted then Float. Array. fold_left Float. max Float. min_float times
267
+ else Float. Array. fold_left Float. min Float. max_float times
199
268
in
200
269
{ mean; sd; median; inverted; best; runs }
201
270
0 commit comments