scheduler.h
/* **********************************************************
* Copyright (c) 2023-2025 Google, Inc. All rights reserved.
* **********************************************************/
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of Google, Inc. nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
/* Scheduler of traced software threads onto simulated cpus. */
#ifndef _DRMEMTRACE_SCHEDULER_H_
#define _DRMEMTRACE_SCHEDULER_H_ 1
/**
* @file drmemtrace/scheduler.h
* @brief DrMemtrace top-level trace scheduler.
*/
#define NOMINMAX // Avoid windows.h messing up std::max.
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <atomic>
#include <deque>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <queue>
#include <set>
#include <stack>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "archive_istream.h"
#include "archive_ostream.h"
#include "flexible_queue.h"
#include "memref.h"
#include "memtrace_stream.h"
#include "mutex_dbg_owned.h"
#include "reader.h"
#include "record_file_reader.h"
#include "speculator.h"
#include "trace_entry.h"
#include "utils.h"
namespace dynamorio { /**< General DynamoRIO namespace. */
namespace drmemtrace { /**< DrMemtrace tracing + simulation infrastructure namespace. */
template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t;
/**
* Schedules traced software threads onto simulated cpus.
* Takes in a set of recorded traces and maps them onto a new set of output
* streams, typically representing simulated cpus.
*
* This is a templated class to support not just operating over
* #dynamorio::drmemtrace::memref_t inputs read by #dynamorio::drmemtrace::reader_t, but
* also over #dynamorio::drmemtrace::trace_entry_t records read by
* #dynamorio::drmemtrace::record_reader_t.
*/
template <typename RecordType, typename ReaderType> class scheduler_tmpl_t {
public:
/**
* Status codes used by non-stream member functions.
* get_error_string() provides additional information such as which input file
* path failed.
*/
enum scheduler_status_t {
STATUS_SUCCESS, /**< Success. */
STATUS_ERROR_INVALID_PARAMETER, /**< Error: invalid parameter. */
STATUS_ERROR_FILE_OPEN_FAILED, /**< Error: file open failed. */
STATUS_ERROR_FILE_READ_FAILED, /**< Error: file read failed. */
STATUS_ERROR_NOT_IMPLEMENTED, /**< Error: not implemented. */
STATUS_ERROR_FILE_WRITE_FAILED, /**< Error: file write failed. */
STATUS_ERROR_RANGE_INVALID, /**< Error: region of interest invalid. */
};
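// Illustrative sketch (not part of this header): checking these status codes when
// setting up a scheduler. It assumes the init(), get_stream(), and
// get_error_string() members declared later in this header and the
// memref_t/reader_t instantiation; the trace path is made up.
//
//   using sched_t = scheduler_tmpl_t<memref_t, reader_t>;
//   sched_t scheduler;
//   std::vector<sched_t::input_workload_t> workloads;
//   workloads.emplace_back("/path/to/trace/dir");
//   sched_t::scheduler_status_t res =
//       scheduler.init(workloads, 4, sched_t::make_scheduler_parallel_options());
//   if (res != sched_t::STATUS_SUCCESS) {
//       // get_error_string() adds detail such as which input file failed.
//       std::cerr << scheduler.get_error_string() << "\n";
//   }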
/**
* Status codes used by stream member functions.
* get_error_string() provides additional information such as which input file
* path failed.
*/
enum stream_status_t {
STATUS_OK, /**< Stream is healthy and can continue to advance. */
STATUS_EOF, /**< Stream is at its end. */
/**
* For dynamic scheduling with cross-stream dependencies, the scheduler may pause
* a stream if it gets ahead of another stream it should have a dependence on.
* This value is also used for schedules following the recorded timestamps
* (#DEPENDENCY_TIMESTAMPS) to avoid one stream getting ahead of another.
* #STATUS_WAIT should be treated as artificial, an artifact of enforcing a
* recorded schedule on concurrent differently-timed output streams.
* Simulators are suggested to not advance simulated time for #STATUS_WAIT while
* they should advance time for #STATUS_IDLE as the latter indicates a true
* lack of work.
*/
STATUS_WAIT,
STATUS_INVALID, /**< Error condition. */
STATUS_REGION_INVALID, /**< Input region is out of bounds. */
STATUS_NOT_IMPLEMENTED, /**< Feature not implemented. */
STATUS_SKIPPED, /**< Used for internal scheduler purposes. */
STATUS_RECORD_FAILED, /**< Failed to record schedule for future replay. */
/**
* This code indicates that all inputs are blocked waiting for kernel resources
* (such as i/o). This is similar to #STATUS_WAIT, but #STATUS_WAIT indicates an
* artificial pause due to imposing the original ordering while #STATUS_IDLE
* indicates actual idle time in the application. Simulators are suggested
* to not advance simulated time for #STATUS_WAIT while they should advance
* time for #STATUS_IDLE.
*/
STATUS_IDLE,
/**
* Indicates an input has a binding whose outputs are all marked inactive.
*/
STATUS_IMPOSSIBLE_BINDING,
STATUS_STOLE, /**< Used for internal scheduler purposes. */
};
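// Illustrative sketch (not part of this header): a per-output drain loop
// distinguishing these codes. It assumes an already-initialized scheduler, the
// get_stream() accessor declared later in this header, and hypothetical
// advance_simulated_time()/process() simulator helpers.
//
//   using sched_t = scheduler_tmpl_t<memref_t, reader_t>;
//   sched_t::stream_t *stream = scheduler.get_stream(core_ordinal);
//   memref_t record;
//   for (;;) {
//       sched_t::stream_status_t status = stream->next_record(record);
//       if (status == sched_t::STATUS_EOF)
//           break;
//       if (status == sched_t::STATUS_WAIT)
//           continue; // Artificial pause: do not advance simulated time.
//       if (status == sched_t::STATUS_IDLE) {
//           advance_simulated_time(); // True lack of work: time does advance.
//           continue;
//       }
//       if (status != sched_t::STATUS_OK)
//           break; // Treat any other code as an error in this sketch.
//       process(record);
//   }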
/** Identifies an input stream by its index. */
typedef int input_ordinal_t;
/** Identifies an output stream by its index. */
typedef int output_ordinal_t;
/** Sentinel value indicating that no input stream is specified. */
static constexpr input_ordinal_t INVALID_INPUT_ORDINAL = -1;
/** Sentinel value indicating that no input stream is specified. */
static constexpr output_ordinal_t INVALID_OUTPUT_ORDINAL = -1;
/** A bounded sequence of instructions. */
struct range_t {
/** Convenience constructor. */
range_t(uint64_t start, uint64_t stop)
: start_instruction(start)
, stop_instruction(stop)
{
}
/**
* The starting point as a trace instruction ordinal. These ordinals
* begin at 1, so a 'start_instruction' value of 0 is invalid.
*/
uint64_t start_instruction;
/** The ending point, inclusive. A stop value of 0 means the end of the trace. */
uint64_t stop_instruction;
};
/**
* A time range in units of the microsecond timestamps in the traces.
*/
struct timestamp_range_t {
/** Convenience constructor. */
timestamp_range_t(uint64_t start, uint64_t stop)
: start_timestamp(start)
, stop_timestamp(stop)
{
}
/**
* The starting time in the microsecond timestamp units in the trace.
*/
uint64_t start_timestamp;
/**
* The ending time in the microsecond timestamp units in the trace.
* The ending time is inclusive. 0 means the end of the trace.
*/
uint64_t stop_timestamp;
};
/**
* Specifies details about one set of input trace files from one workload,
* each input file representing a single software thread.
* It is assumed that there is no thread id duplication within one workload.
*/
struct input_thread_info_t {
/** Convenience constructor for common usage. */
explicit input_thread_info_t(std::vector<range_t> regions)
: regions_of_interest(regions)
{
}
/** Convenience constructor for common usage. */
input_thread_info_t(memref_tid_t tid, std::vector<range_t> regions)
: tids(1, tid)
, regions_of_interest(regions)
{
}
/** Convenience constructor for common usage. */
input_thread_info_t(memref_tid_t tid, int priority)
: tids(1, tid)
, priority(priority)
{
}
/**
* Convenience constructor for placing all threads for one workload on a set of
* cores for a static partitioning.
*/
input_thread_info_t(std::set<output_ordinal_t> output_binding)
: output_binding(output_binding)
{
}
/** Convenience constructor for placing one thread on a set of cores. */
input_thread_info_t(memref_tid_t tid, std::set<output_ordinal_t> output_binding)
: tids(1, tid)
, output_binding(output_binding)
{
}
/** Size of the struct for binary-compatible additions. */
size_t struct_size = sizeof(input_thread_info_t);
/**
* Which threads the details in this structure apply to.
* If empty, the details apply to all not-yet-mentioned (by other 'tids'
* vectors in prior entries for this workload) threads in the
* #dynamorio::drmemtrace::scheduler_tmpl_t::input_workload_t.
*/
std::vector<memref_tid_t> tids;
/**
* Limits these threads to this set of output streams. They will not
* be scheduled on any other output streams.
*/
std::set<output_ordinal_t> output_binding;
/**
* Relative priority for scheduling. The default is 0. Higher values have
* higher priorities and will starve lower-priority inputs.
* Higher priorities outweigh dependencies such as #DEPENDENCY_TIMESTAMPS.
*/
int priority = 0;
/**
* If non-empty, all input records outside of these ranges are skipped: it is as
* though the input were constructed by concatenating these ranges together. A
* #TRACE_MARKER_TYPE_WINDOW_ID marker is inserted between
* ranges (with a value equal to the range ordinal) to notify the client of the
* discontinuity. This marker is inserted between back-to-back regions with no
* separation, but it is not inserted prior to the first range. A
* #dynamorio::drmemtrace::TRACE_TYPE_THREAD_EXIT record is inserted after the
* final range. These ranges must be non-overlapping and in increasing order.
*
* Be aware that selecting a subset of code can remove inter-input
* communication steps that could be required for forward progress.
* For example, if selected subsets include #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE
* with no timeout but do not include a corresponding
* #TRACE_MARKER_TYPE_SYSCALL_SCHEDULE for wakeup, an input could remain
* unscheduled.
*
* Also beware that this can skip over trace header entries (like
* #TRACE_MARKER_TYPE_FILETYPE), which should ideally be obtained from the
* #dynamorio::drmemtrace::memtrace_stream_t API instead.
*/
std::vector<range_t> regions_of_interest;
};
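// Illustrative sketch (not part of this header): common thread modifiers built
// with the convenience constructors above. The thread ids, instruction range,
// and core numbers are made up.
//
//   using sched_t = scheduler_tmpl_t<memref_t, reader_t>;
//   sched_t::input_workload_t workload("/path/to/trace/dir");
//   // Limit thread 1234 to instructions 1000-2000 (ordinals start at 1).
//   workload.thread_modifiers.emplace_back(
//       1234, std::vector<sched_t::range_t>(1, sched_t::range_t(1000, 2000)));
//   // Give thread 5678 a higher priority than the default of 0.
//   workload.thread_modifiers.emplace_back(5678, 2);
//   // Bind all remaining threads to output streams (cores) 0 and 1.
//   workload.thread_modifiers.emplace_back(
//       std::set<sched_t::output_ordinal_t>{ 0, 1 });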
/** Specifies an input that is already opened by a reader. */
struct input_reader_t {
/** Creates an empty reader. */
input_reader_t() = default;
/** Creates a reader entry. */
input_reader_t(std::unique_ptr<ReaderType> reader,
std::unique_ptr<ReaderType> end, memref_tid_t tid)
: reader(std::move(reader))
, end(std::move(end))
, tid(tid)
{
}
/** The reader for the input stream. */
std::unique_ptr<ReaderType> reader;
/** The end reader for 'reader'. */
std::unique_ptr<ReaderType> end;
/**
* A unique identifier to distinguish from other readers for this workload.
* Typically this will be the thread id but it does not need to be, so long
* as it is not 0 (DynamoRIO's INVALID_THREAD_ID sentinel).
* This allows the 'thread_modifiers' field of 'input_workload_t'
* to refer to this input.
*/
memref_tid_t tid = INVALID_THREAD_ID;
};
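// Illustrative sketch (not part of this header): wrapping pre-opened readers as
// inputs. "my_reader_t" stands in for a concrete ReaderType subclass (e.g., a
// file or IPC reader) and is purely hypothetical; the tid only needs to be
// unique within the workload and non-zero.
//
//   using sched_t = scheduler_tmpl_t<memref_t, reader_t>;
//   std::vector<sched_t::input_reader_t> readers;
//   readers.emplace_back(std::unique_ptr<reader_t>(new my_reader_t("input.trace")),
//                        std::unique_ptr<reader_t>(new my_reader_t()), 42);
//   sched_t::input_workload_t workload(std::move(readers));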
/** Specifies the input workloads to be scheduled. */
struct input_workload_t {
/** Create an empty workload. This is not a valid final input. */
input_workload_t()
{
}
/**
* Create a workload coming from a directory of many trace files or from
* a single trace file where each trace file uses the given regions of interest.
*/
input_workload_t(const std::string &trace_path,
std::vector<range_t> regions_of_interest = {})
: path(trace_path)
{
if (!regions_of_interest.empty())
thread_modifiers.emplace_back(regions_of_interest);
}
/**
* Create a workload with a set of pre-initialized readers which use the given
* regions of interest.
*/
input_workload_t(std::vector<input_reader_t> readers,
std::vector<range_t> regions_of_interest = {})
: readers(std::move(readers))
{
if (!regions_of_interest.empty())
thread_modifiers.emplace_back(regions_of_interest);
}
/** Size of the struct for binary-compatible additions. */
size_t struct_size = sizeof(input_workload_t);
/**
* A directory of trace files or a single trace file.
* A reader will be opened for each input file and its init() function
* will be called; that function will not block (variants where it
* does block, such as for IPC, should leave 'path' empty and use 'readers'
* below instead).
*/
std::string path;
/**
* An alternative to passing in a path and having the scheduler open the file(s)
* there is to pass in readers directly. This field is only considered if 'path' is
* empty. The scheduler will call the init() function for each reader at the time
* of the first access to it through an output stream (supporting IPC readers
* whose init() blocks).
*/
std::vector<input_reader_t> readers;
/**
* If empty, every trace file in 'path' or every reader in 'readers' becomes
* an enabled input. If non-empty, only those inputs whose thread ids are
* in this set are enabled and the rest are ignored. It is an error to
* have both this and 'only_shards' be non-empty.
*/
std::set<memref_tid_t> only_threads;
/** Scheduling modifiers for the threads in this workload. */
std::vector<input_thread_info_t> thread_modifiers;
/**
* If non-empty, all input records outside of these ranges are skipped. These
* times cut across all inputs of this workload. The times are converted into a
* separate sequence of instruction
* #dynamorio::drmemtrace::scheduler_tmpl_t::range_t for each input. The end
* result is as though the #dynamorio::drmemtrace::scheduler_tmpl_t::
* input_thread_info_t.regions_of_interest
* field were set for each input. See the comments by that field for further
* details such as the marker inserted between ranges. Although the times cut
* across all inputs for determining the per-input instruction-ordinal ranges, no
* barrier is applied to align the resulting regions of interest: i.e., one input
* can finish its initial region and move to its next region before all other
* inputs finish their initial regions.
*
* If non-empty, the
* #dynamorio::drmemtrace::scheduler_tmpl_t::
* input_thread_info_t.regions_of_interest
* field must be empty for each modifier for this workload.
*
* If non-empty, the #dynamorio::drmemtrace::scheduler_tmpl_t::
* scheduler_options_t.replay_as_traced_istream field must also be specified to
* provide the timestamp-to-instruction-ordinal mappings. These mappings are not
* precise due to the coarse-grained timestamps and the elision of adjacent
* timestamps in the istream. Interpolation is used to estimate instruction
* ordinals when timestamps fall in between recorded points.
*/
std::vector<timestamp_range_t> times_of_interest;
/**
* If empty, every trace file in 'path' or every reader in 'readers' becomes
* an enabled input. If non-empty, only those inputs whose indices are
* in this set are enabled and the rest are ignored. An index is the
* 0-based ordinal in the 'readers' vector or in the files opened at
* 'path' (which are sorted lexicographically by path). It is an error to
* have both this and 'only_threads' be non-empty.
*/
std::set<input_ordinal_t> only_shards;
/**
* If greater than zero, imposes a maximum number of outputs that the inputs
* comprising this workload can execute upon simultaneously. If an input would
* be executed next but would exceed this cap, a different input is selected
* instead or the output goes idle if none are found.
*/
int output_limit = 0;
// Work around a known Visual Studio issue where it complains about deleted copy
// constructors for unique_ptr by deleting our copies and defaulting our moves.
input_workload_t(const input_workload_t &) = delete;
input_workload_t &
operator=(const input_workload_t &) = delete;
input_workload_t(input_workload_t &&) = default;
input_workload_t &
operator=(input_workload_t &&) = default;
};
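// Illustrative sketch (not part of this header): a path-based workload limited
// to two of its threads and capped at two simultaneous cores. The path and
// thread ids are made up.
//
//   using sched_t = scheduler_tmpl_t<memref_t, reader_t>;
//   std::vector<sched_t::input_workload_t> workloads;
//   workloads.emplace_back("/path/to/trace/dir");
//   workloads.back().only_threads = { 1234, 5678 }; // Ignore all other inputs.
//   workloads.back().output_limit = 2; // At most 2 outputs run it at once.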
/** Controls how inputs are mapped to outputs. */
enum mapping_t {
// TODO i#5843: Can we get doxygen to link references without the fully-qualified
// name?
/**
* Each input stream is mapped to a single output stream (i.e., no input will
* appear on more than one output), supporting lock-free parallel analysis when
* combined with #DEPENDENCY_IGNORE.
*/
MAP_TO_CONSISTENT_OUTPUT,
// TODO i#5843: Currently it is up to the user to figure out the original core
// count and pass that number in. It would be convenient to have the scheduler
// figure out the output count. We'd need some way to return that count; I
// imagine we could add a new query routine to get it and so not change the
// existing interfaces.
/**
* Each input stream is assumed to contain markers indicating how it was
* originally scheduled. Those markers are honored with respect to which core
* number (considered to correspond to output stream ordinal) an input is
* scheduled into. This requires an output stream count equal to the number of
* cores occupied by the input stream set. When combined with
* #DEPENDENCY_TIMESTAMPS, this will
* precisely replay the recorded schedule; for this mode,
* #dynamorio::drmemtrace::scheduler_tmpl_t::
* scheduler_options_t.replay_as_traced_istream
* must be specified.
* The original as-traced cpuid that is mapped to each output stream can be
* obtained by calling the get_output_cpuid() function on each stream.
*
* An alternative use of this mapping is with a single output to interleave
* inputs in a strict timestamp order, as with make_scheduler_serial_options(),
* without specifying a schedule file and without recreating core mappings:
* only timestamps are honored.
*/
MAP_TO_RECORDED_OUTPUT,
/**
* The input streams are scheduled using a new dynamic sequence onto the
* output streams. Any input markers indicating how the software threads were
* originally mapped to cores during tracing are ignored. Instead, inputs
* run until either a quantum expires (see
* #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t::quantum_unit)
* or a (potentially) blocking system call is identified. At this point,
* a new input is selected, taking into consideration other options such
* as priorities, core bindings, and inter-input dependencies.
*/
MAP_TO_ANY_OUTPUT,
/**
* A schedule recorded previously by this scheduler is to be replayed.
* The input schedule data is specified in
* #dynamorio::drmemtrace::scheduler_tmpl_t::
* scheduler_options_t.schedule_replay_istream.
* The same output count and input stream order and count must be re-specified;
* scheduling details such as regions of interest and core bindings do not
* need to be re-specified and are in fact ignored.
*/
MAP_AS_PREVIOUSLY,
};
/**
* Flags specifying how inter-input-stream dependencies are handled. The _BITFIELD
* values can be combined. Typical combinations are provided so the enum type can be
* used directly.
*/
enum inter_input_dependency_t {
/** Ignores all inter-input dependencies. */
DEPENDENCY_IGNORE = 0x00,
/**
* Ensures timestamps in the inputs arrive at the outputs in timestamp order.
* For #MAP_TO_ANY_OUTPUT, enforcing asked-for context switch rates is more
* important than honoring precise trace-buffer-based timestamp inter-input
* dependencies: thus, timestamp ordering will be followed at context switch
* points for picking the next input, but timestamps will not preempt an input.
* To precisely follow the recorded timestamps, use #MAP_TO_RECORDED_OUTPUT.
* If this flag is on, #dynamorio::drmemtrace::scheduler_tmpl_t::
* scheduler_options_t.read_inputs_in_init must be set to true.
*/
DEPENDENCY_TIMESTAMPS_BITFIELD = 0x01,
/**
* Ensures that direct thread switches recorded in the traces, marked by
* #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH system call metadata markers, are
* honored when choosing the next input at a context switch point.
*/
DEPENDENCY_DIRECT_SWITCH_BITFIELD = 0x02,
/**
* Combines #DEPENDENCY_TIMESTAMPS_BITFIELD and
* #DEPENDENCY_DIRECT_SWITCH_BITFIELD. This is the recommended setting for most
* schedules.
*/
DEPENDENCY_TIMESTAMPS =
(DEPENDENCY_TIMESTAMPS_BITFIELD | DEPENDENCY_DIRECT_SWITCH_BITFIELD),
// TODO i#5843: Add inferred data dependencies.
};
/**
* Quantum units used for replacing one input with another pre-emptively.
*/
enum quantum_unit_t {
/** Uses the instruction count as the quantum. */
QUANTUM_INSTRUCTIONS,
/**
* Uses the user's notion of time as the quantum. This must be supplied by the
* user by calling the next_record() variant that takes in the current time.
*/
QUANTUM_TIME,
};
/** Flags controlling aspects of the scheduler beyond the core scheduling. */
enum scheduler_flags_t {
/** Default behavior. */
SCHEDULER_DEFAULTS = 0,
// TODO i#5843: Decide and then describe whether the physaddr is in every
// record or obtained via an API call.
/**
* Whether physical addresses should be provided in addition to virtual
* addresses.
*/
SCHEDULER_PROVIDE_PHYSICAL_ADDRESSES = 0x1,
/**
* Specifies that speculation should supply just NOP instructions.
*/
SCHEDULER_SPECULATE_NOPS = 0x2,
/**
* Causes the get_record_ordinal() and get_instruction_ordinal() results
* for an output stream to equal those values for the current input stream
* for that output, rather than accumulating across inputs.
* This also changes the behavior of get_shard_index() as documented under that
* function.
*/
SCHEDULER_USE_INPUT_ORDINALS = 0x4,
// This was added for the analyzer view tool on a single trace specified via
// a directory where the analyzer isn't listing the dir so it doesn't know
// whether to request SCHEDULER_USE_INPUT_ORDINALS.
/**
* If there is just one input and just one output stream, this sets
* #SCHEDULER_USE_INPUT_ORDINALS. In all cases, this changes the behavior
* of get_shard_index() as documented under that function.
*/
SCHEDULER_USE_SINGLE_INPUT_ORDINALS = 0x8,
// TODO i#5843: Add more speculation flags for other strategies.
};
/**
* Types of context switches for
* #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t::
* kernel_switch_trace_path and kernel_switch_reader.
* The enum value is the subfile component name in the archive_istream_t.
*/
enum switch_type_t {
/** Invalid value. */
SWITCH_INVALID = 0,
/** Generic thread context switch. */
SWITCH_THREAD,
/**
* Generic process context switch. A workload is considered a process.
*/
SWITCH_PROCESS,
};
/**
* Collects the parameters specifying how the scheduler should behave, outside
* of the workload inputs and the output count.
*/
struct scheduler_options_t {
/** Constructs a default set of options. */
scheduler_options_t()
{
}
/** Constructs a set of options with the given type and strategy. */
scheduler_options_t(mapping_t mapping, inter_input_dependency_t deps,
scheduler_flags_t flags = SCHEDULER_DEFAULTS,
int verbosity = 0)
: mapping(mapping)
, deps(deps)
, flags(flags)
, verbosity(verbosity)
{
}
/** Size of the struct for binary-compatible additions. */
size_t struct_size = sizeof(scheduler_options_t);
/** The mapping of inputs to outputs. */
mapping_t mapping = MAP_TO_ANY_OUTPUT;
/** How inter-input dependencies are handled. */
inter_input_dependency_t deps = DEPENDENCY_IGNORE;
/** Optional flags affecting scheduler behavior. */
scheduler_flags_t flags = SCHEDULER_DEFAULTS;
/** The unit of the schedule time quantum. */
quantum_unit_t quantum_unit = QUANTUM_INSTRUCTIONS;
/**
* Deprecated: use #quantum_duration_us and #time_units_per_us for #QUANTUM_TIME,
* or #quantum_duration_instrs for #QUANTUM_INSTRUCTIONS, instead. It
* is an error to set this to a non-zero value when #struct_size includes
* #quantum_duration_us. When #struct_size does not include
* #quantum_duration_us and this value is non-zero, the value in
* #quantum_duration_us is replaced with this value divided by the default
* value of #time_units_per_us.
*/
uint64_t quantum_duration = 0;
/**
* If > 0, diagnostic messages are printed to stderr. Higher values produce
* more frequent diagnostics.
*/
int verbosity = 0;
/**
* Output stream for recording the schedule for later replay.
* write_recorded_schedule() must be called when finished to write the
* in-memory data out to this stream.
*/
archive_ostream_t *schedule_record_ostream = nullptr;
/**
* Input stream for replaying a previously recorded schedule when
* #MAP_AS_PREVIOUSLY is specified. If this is non-nullptr and
* #MAP_AS_PREVIOUSLY is specified, schedule_record_ostream must be nullptr, and
* most other fields in this struct controlling scheduling are ignored.
*/
archive_istream_t *schedule_replay_istream = nullptr;
/**
* Input stream for replaying the traced schedule when #MAP_TO_RECORDED_OUTPUT is
* specified for more than one output stream (whose count must match the number
* of traced cores). Alternatively, if
* #dynamorio::drmemtrace::scheduler_tmpl_t::input_workload_t.times_of_interest
* is non-empty, this stream is required for obtaining the mappings between
* timestamps and instruction ordinals.
*/
archive_istream_t *replay_as_traced_istream = nullptr;
/**
* Determines the minimum latency in the unit of the trace's timestamps
* (microseconds) for which a non-maybe-blocking system call (one without
* a #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL marker) will be treated as
* blocking and trigger a context switch.
*/
uint64_t syscall_switch_threshold = 30000000;
/**
* Determines the minimum latency in the unit of the trace's timestamps
* (microseconds) for which a maybe-blocking system call (one with
* a #TRACE_MARKER_TYPE_MAYBE_BLOCKING_SYSCALL marker) will be treated as
* blocking and trigger a context switch.
*/
uint64_t blocking_switch_threshold = 500;
/**
* Deprecated: use #block_time_multiplier instead. It is an error to set
* this to a non-zero value when #struct_size includes #block_time_multiplier.
* When #struct_size does not include #block_time_multiplier and this value is
* non-zero, the value in #block_time_multiplier is replaced with this value
* divided by the default value of #time_units_per_us.
*/
double block_time_scale = 0.;
/**
* Deprecated: use #block_time_max_us and #time_units_per_us instead. It is
* an error to set this to a non-zero value when #struct_size includes
* #block_time_max_us. When #struct_size does not include #block_time_max_us
* and this value is non-zero, the value in #block_time_max_us is replaced
* with this value divided by the default value of #time_units_per_us.
*/
uint64_t block_time_max = 0;
// XXX: Should we share the file-to-reader code currently in the scheduler
// with the analyzer and only then need reader interfaces and not pass paths
// to the scheduler?
/**
* Input file containing template sequences of kernel context switch code.
* Each sequence must start with a #TRACE_MARKER_TYPE_CONTEXT_SWITCH_START
* marker and end with #TRACE_MARKER_TYPE_CONTEXT_SWITCH_END.
* The values of each marker must hold a #switch_type_t enum value
* indicating which type of switch it corresponds to.
* Each sequence can be stored as a separate subfile of an archive file,
* or concatenated into a single file.
* Each sequence should be in the regular offline drmemtrace format.
* The sequence is inserted into the output stream on each context switch
* of the indicated type.
* The same file (or reader) must be passed when replaying as this kernel
* code is not stored when recording.
* An alternative to passing the file path is to pass #kernel_switch_reader
* and #kernel_switch_reader_end.
*/
std::string kernel_switch_trace_path;
/**
* An alternative to #kernel_switch_trace_path is to pass a reader and
* #kernel_switch_reader_end. See the description of #kernel_switch_trace_path.
* This field is only examined if #kernel_switch_trace_path is empty.
* The scheduler will call the init() function for the reader.
*/
std::unique_ptr<ReaderType> kernel_switch_reader;
/** The end reader for #kernel_switch_reader. */
std::unique_ptr<ReaderType> kernel_switch_reader_end;
/**
* If true, enables a mode where all outputs are serialized into one global outer
* layer output. The single global output stream alternates in round-robin
* lockstep among each core output. The core outputs operate just like they
* would with no serialization, other than timing differences relative to other
* core outputs.
*/
bool single_lockstep_output = false;
/**
* If true, enables a mode where the normal methods of choosing the next input
* based on priority, timestamps (if -sched_order_time is set), and FIFO order
* are disabled. Instead, the scheduler selects the next input randomly. Output
* bindings are still honored. This is intended for experimental use in
* sensitivity studies.
*/
bool randomize_next_input = false;
/**
* If true, the scheduler will read from each input to determine its filetype
* during initialization. If false, the filetype will not be available prior
* to explicit record retrieval by the user, but this may be required for
* inputs whose sources are not yet set up at scheduler init time (e.g.,
* inputs over blocking pipes with data only becoming available after
* initializing the scheduler, as happens with online trace analyzers).
* This must be true for #DEPENDENCY_TIMESTAMPS as it also requires reading
* ahead.
*/
bool read_inputs_in_init = true;
/**
* If true, the scheduler will attempt to switch to the recorded targets of
* #TRACE_MARKER_TYPE_DIRECT_THREAD_SWITCH system call metadata markers
* regardless of system call latency. Furthermore, the scheduler will model
* "unscheduled" semantics and honor the #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE
* and #TRACE_MARKER_TYPE_SYSCALL_SCHEDULE markers. If false, these markers are
* ignored and only system call latency thresholds are used to determine switches
* (these markers remain: they are not removed from the trace).
*/
bool honor_direct_switches = true;
/**
* How many time units for the "cur_time" value passed to next_record() are
* equivalent to one simulated microsecond. E.g., if the time units are in
* picoseconds, pass one million here. This is used to scale all of the
* other parameters that are in microseconds (they all end in "_us": e.g.,
* #quantum_duration_us) so that they operate on the right time scale for the
* passed-in simulator time (or wall-clock microseconds if no time is passed).
* The default value is a rough estimate when no accurate simulated time is
* available: the instruction count is used in that case, and we use the
* instructions per microsecond for a 2GHz clock at 0.5 IPC as our default.
*/
double time_units_per_us = 1000.;
/**
* The scheduling quantum duration for preemption, in simulated microseconds,
* for #QUANTUM_TIME. This value is multiplied by #time_units_per_us to
* produce a value that is compared to the "cur_time" parameter to
* next_record() to determine when to force a quantum switch.
*/
uint64_t quantum_duration_us = 5000;
/**
* The scheduling quantum duration for preemption, in instruction count,
* for #QUANTUM_INSTRUCTIONS. The time passed to next_record() is ignored
* for purposes of quantum preempts.
*
* Instructions executed in a quantum may end up higher than the specified
* value to avoid interruption of the kernel system call sequence.
*/
// We pick 10 million to match 2 instructions per nanosecond with a 5ms quantum.
uint64_t quantum_duration_instrs = 10 * 1000 * 1000;
/**
* Controls the amount of time inputs are considered blocked at a syscall
* whose as-traced latency (recorded in timestamp records in the trace)
* exceeds #syscall_switch_threshold or #blocking_switch_threshold. The
* as-traced syscall latency (which is in traced microseconds) is multiplied
* by this field to produce the blocked time in simulated microseconds. Once
* that many simulated microseconds have passed according to the "cur_time"
* value passed to next_record() (multiplied by #time_units_per_us), the
* input will be no longer considered blocked. The blocked time is clamped
* to a maximum value controlled by #block_time_max.
*
* While there is no direct overhead during tracing, indirect overhead
* does result in some inflation of recorded system call latencies.
* Thus, a value below 1 is typically used here. This value, in combination
* with #block_time_max_us, can be tuned to achieve a desired idle rate.
* The default value errs on the side of less idle time.
*/
double block_time_multiplier = 0.1;
/**
* The maximum time in microseconds for an input to be considered blocked for
* any one system call. This value is multiplied by #time_units_per_us to
* produce a value that is compared to the "cur_time" parameter to
* next_record(). If any block time (see #block_time_multiplier) exceeds
* this value, it is capped to this value. This value is also used as a
* fallback to avoid hangs when there are no scheduled inputs: if the only
* inputs left are "unscheduled" (see #TRACE_MARKER_TYPE_SYSCALL_UNSCHEDULE),
* after this amount of time those inputs are all re-scheduled.
*/
// TODO i#6959: Once we have -exit_if_all_unscheduled raise this.
uint64_t block_time_max_us = 2500;
/**
* The minimum time in microseconds that must have elapsed after an input last
* ran on an output before that input is allowed to be migrated to a different
* output. This value is multiplied by #time_units_per_us to produce a value
* that is compared to the "cur_time" parameter to next_record().
*/
uint64_t migration_threshold_us = 500;
/**
* The period in microseconds at which rebalancing is performed to keep output
* run queues from becoming uneven. This value is multiplied by
* #time_units_per_us to produce a value that is compared to the "cur_time"
* parameter to next_record().
*/
uint64_t rebalance_period_us = 50000;
/**
* Determines whether an unscheduled-indefinitely input really is unscheduled for
* an infinite time, or instead is treated as blocked for the maximum time
* (#block_time_max_us) scaled by #block_time_multiplier.
*/
bool honor_infinite_timeouts = false;
/**
* For #MAP_TO_ANY_OUTPUT, when an input reaches EOF, if the number of non-EOF
* inputs left as a fraction of the original inputs is equal to or less than
* this value then the scheduler exits (sets all outputs to EOF) rather than
* finishing off the final inputs. This helps avoid long sequences of idles
* during staggered endings with fewer inputs left than cores and only a small
* fraction of the total instructions left in those inputs. Since the remaining
* instruction count is not considered (as it is not available), use discretion
* when raising this value on uneven inputs.
*/
double exit_if_fraction_inputs_left = 0.1;
/**
* Enables the noise generator to create synthetic trace records that will be
* scheduled alongside records of one or more real traces.
*/
bool enable_noise_generator = false;
/**
* Number of synthetic trace records produced by the noise generator.
*/
uint64_t noise_generator_num_records = 0;
// When adding new options, also add to print_configuration().
};
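// Illustrative sketch (not part of this header): options for a dynamic schedule
// driven by simulated time. The numeric values are arbitrary tuning choices for
// illustration, not recommendations.
//
//   using sched_t = scheduler_tmpl_t<memref_t, reader_t>;
//   sched_t::scheduler_options_t ops(sched_t::MAP_TO_ANY_OUTPUT,
//                                    sched_t::DEPENDENCY_TIMESTAMPS);
//   ops.quantum_unit = sched_t::QUANTUM_TIME;
//   ops.quantum_duration_us = 3000;   // Preempt after 3ms of simulated time.
//   ops.time_units_per_us = 2000.;    // cur_time is passed in 0.5ns ticks here.
//   ops.block_time_multiplier = 0.05; // Deflate recorded syscall latencies.
//   ops.block_time_max_us = 1000;     // Cap any single blocked period at 1ms.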
/**
* Constructs options for a parallel no-inter-input-dependencies schedule where
* the output stream's get_record_ordinal() and get_instruction_ordinal() reflect
* just the current input rather than all past inputs on that output.
*/
static scheduler_options_t
make_scheduler_parallel_options(int verbosity = 0)
{
return scheduler_options_t(sched_type_t::MAP_TO_CONSISTENT_OUTPUT,
sched_type_t::DEPENDENCY_IGNORE,
sched_type_t::SCHEDULER_USE_INPUT_ORDINALS, verbosity);
}
/**
* Constructs options for a serial as-recorded schedule where
* the output stream's get_record_ordinal() and get_instruction_ordinal() honor
* skipped records in the input if, and only if, there is a single input and
* a single output.
*/
static scheduler_options_t
make_scheduler_serial_options(int verbosity = 0)
{
return scheduler_options_t(
sched_type_t::MAP_TO_RECORDED_OUTPUT, sched_type_t::DEPENDENCY_TIMESTAMPS,
sched_type_t::SCHEDULER_USE_SINGLE_INPUT_ORDINALS, verbosity);
}
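// Illustrative sketch (not part of this header): iterating a multi-threaded
// trace serially in timestamp order on one output, as a serial analysis tool
// would. It assumes the init() and get_stream() members declared later in this
// header; record consumption then follows the drain-loop sketch shown earlier.
//
//   using sched_t = scheduler_tmpl_t<memref_t, reader_t>;
//   sched_t scheduler;
//   std::vector<sched_t::input_workload_t> workloads;
//   workloads.emplace_back("/path/to/trace/dir");
//   if (scheduler.init(workloads, 1, sched_t::make_scheduler_serial_options()) !=
//       sched_t::STATUS_SUCCESS)
//       return; // See get_error_string() for details.
//   sched_t::stream_t *serial_stream = scheduler.get_stream(0);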
/**
* Represents a stream of RecordType trace records derived from a
* subset of a set of input recorded traces. Provides more
* information about the record stream using the
* #dynamorio::drmemtrace::memtrace_stream_t API.
*/
class stream_t : public memtrace_stream_t {
public:
stream_t(scheduler_impl_tmpl_t<RecordType, ReaderType> *scheduler, int ordinal,
int verbosity = 0, int max_ordinal = -1)
: scheduler_(scheduler)
, ordinal_(ordinal)
, max_ordinal_(max_ordinal)
, verbosity_(verbosity)
{
}
virtual ~stream_t()
{
}
// We deliberately use a regular function which can return a status for things
// like STATUS_WAIT, and abandon attempting to follow std::iterator here, as the
// ++-then-dereference pattern makes it harder to return multiple different
// statuses as first-class events.
// We don't plan to use range-based for loops or other language features for
// iterators and our iteration is only forward, so std::iterator's value is
// diminished.
/**
* Advances to the next record in the stream. Returns a status code on whether
* and how to continue. Uses the instruction count plus idle count to this point
* as the time; use the variant that takes "cur_time" to instead provide a time.
*/
virtual stream_status_t
next_record(RecordType &record);
/**
* Advances to the next record in the stream. Returns a status code on whether
* and how to continue. Supplies the current time for #QUANTUM_TIME. The time
* should be considered to be the simulated time prior to processing the returned
* record. The time's units can be chosen by the caller, with
* #dynamorio::drmemtrace::scheduler_tmpl_t::scheduler_options_t.time_units_per_us
* providing the conversion to simulated microseconds. #STATUS_INVALID is
* returned if 0 or a value smaller than the start time of the current input's
* quantum is passed in when #QUANTUM_TIME and #MAP_TO_ANY_OUTPUT are specified.
*/
virtual stream_status_t
next_record(RecordType &record, uint64_t cur_time);
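// Illustrative sketch (not part of this header): driving next_record() with
// simulated time for #QUANTUM_TIME. "sim_time()" is a hypothetical simulator
// clock whose units must match scheduler_options_t.time_units_per_us and which
// must not move backwards past the current quantum's start.
//
//   memref_t record;
//   stream_status_t status = stream->next_record(record, sim_time());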
/**
* Queues the last-read record returned by next_record() such that it will be
* returned on the subsequent call to next_record() when this same input is
* active. Causes ordinal queries on the current input to be off by one until the
* record is re-read. Furthermore, the get_last_timestamp() query may still
* include this record, whether called on the input or output stream, immediately
* after this call. Fails if called multiple times in a row without an
* intervening next_record() call. Fails if called during speculation (between
* start_speculation() and stop_speculation() calls).
*/
virtual stream_status_t
unread_last_record();
/**
* Begins a diversion from the regular inputs to a side stream of records
* representing speculative execution starting at 'start_address'.
*
* Because the instruction record after a branch typically needs to be read before
* knowing whether a simulator is on the wrong path or not, this routine supports
* putting back the current record so that it will be re-provided as the first
* record after stop_speculation(), if "queue_current_record" is true. The same
* caveats on the input stream ordinals and last timestamp described under
* unread_last_record() apply to this record queueing. Calling
* start_speculation() immediately after unread_last_record() and requesting
* queueing will return a failure code.
*
* This call can be nested; each call needs to be paired with a corresponding
* stop_speculation() call.
*/
virtual stream_status_t
start_speculation(addr_t start_address, bool queue_current_record);
/**
* Stops speculative execution, resuming execution at the
* stream of records from the point at which the prior matching
* start_speculation() call was made, either repeating the current record at that
* time (if "true" was passed for "queue_current_record" to start_speculation())
* or continuing on the subsequent record (if "false" was passed). Returns
* #STATUS_INVALID if there was no prior start_speculation() call.
*/
virtual stream_status_t
stop_speculation();
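// Illustrative sketch (not part of this header): simulating a few wrong-path
// records after a mispredicted branch. "predicted_target" and "speculate_depth"
// are hypothetical values computed by the simulator.
//
//   // Re-queue the just-read post-branch record so it is replayed on resume.
//   stream->start_speculation(predicted_target, true);
//   memref_t wrong_path;
//   for (int i = 0; i < speculate_depth; ++i)
//       stream->next_record(wrong_path); // Speculative (e.g., NOP) records.
//   stream->stop_speculation(); // Resume, repeating the queued record.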
/**
* Disables or re-enables this output stream. If "active" is false, this
* stream becomes inactive and its currently assigned input is moved to the
* ready queue to be scheduled on other outputs. The #STATUS_IDLE code is
* returned to next_record() for inactive streams. If "active" is true,
* this stream becomes active again.
* This is only supported for #MAP_TO_ANY_OUTPUT.
*/
virtual stream_status_t
set_active(bool active);
// memtrace_stream_t interface:
/**
* Returns the count of #memref_t records from the start of
* the trace to this point. It does not include synthetic records (see
* is_record_synthetic()).
*
* If #SCHEDULER_USE_INPUT_ORDINALS is set, then this value matches the record
* ordinal for the current input stream (and thus might decrease or not change
* across records if the input changed). Otherwise, if multiple input streams
* fed into this output stream, this includes the records from all those streams
* that were presented here: thus, this may be larger than what the current input
* stream reports (see get_input_stream_interface() and
* get_input_stream_ordinal()). This does not advance across skipped records in
* an input stream from a region of interest (see
* #dynamorio::drmemtrace::scheduler_tmpl_t::range_t), but it does advance if the
* output stream skipped ahead.
*/
uint64_t
get_record_ordinal() const override;
/**
* Returns the count of instructions from the start of the trace to this point.
* For record_scheduler_t, if any encoding records or the internal record
* TRACE_MARKER_TYPE_BRANCH_TARGET records are present prior to an instruction
* marker, the count will increase at the first of those records as they are
* considered part of the instruction.
* If #SCHEDULER_USE_INPUT_ORDINALS is set, then this value matches the
* instruction ordinal for the current input stream (and thus might decrease or
* not change across records if the input changed). Otherwise, if multiple input