-
Notifications
You must be signed in to change notification settings - Fork 375
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[PROF-7440] Add approximate thread state categorization for timeline #3162
Changes from all commits
9356692
4f2ebc7
393b67d
86e8f5d
c728fa4
21aa188
f508e71
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,7 +40,7 @@ static void record_placeholder_stack_in_native_code( | |
sampling_buffer* buffer, | ||
VALUE recorder_instance, | ||
sample_values values, | ||
ddog_prof_Slice_Label labels, | ||
sample_labels labels, | ||
sampling_buffer *record_buffer, | ||
int extra_frames_in_record_buffer | ||
); | ||
|
@@ -49,7 +49,7 @@ static void sample_thread_internal( | |
sampling_buffer* buffer, | ||
VALUE recorder_instance, | ||
sample_values values, | ||
ddog_prof_Slice_Label labels, | ||
sample_labels labels, | ||
sampling_buffer *record_buffer, | ||
int extra_frames_in_record_buffer | ||
); | ||
|
@@ -85,13 +85,14 @@ static VALUE _native_sample( | |
VALUE zero = INT2NUM(0); | ||
sample_values values = { | ||
.cpu_time_ns = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("cpu-time"), zero)), | ||
.cpu_samples = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("cpu-samples"), zero)), | ||
.cpu_or_wall_samples = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("cpu-samples"), zero)), | ||
.wall_time_ns = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("wall-time"), zero)), | ||
.alloc_samples = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("alloc-samples"), zero)), | ||
}; | ||
|
||
long labels_count = RARRAY_LEN(labels_array) + RARRAY_LEN(numeric_labels_array); | ||
ddog_prof_Label labels[labels_count]; | ||
ddog_prof_Label *state_label = NULL; | ||
|
||
for (int i = 0; i < RARRAY_LEN(labels_array); i++) { | ||
VALUE key_str_pair = rb_ary_entry(labels_array, i); | ||
|
@@ -100,6 +101,10 @@ static VALUE _native_sample( | |
.key = char_slice_from_ruby_string(rb_ary_entry(key_str_pair, 0)), | ||
.str = char_slice_from_ruby_string(rb_ary_entry(key_str_pair, 1)) | ||
}; | ||
|
||
if (rb_str_equal(rb_ary_entry(key_str_pair, 0), rb_str_new_cstr("state"))) { | ||
state_label = &labels[i]; | ||
} | ||
} | ||
for (int i = 0; i < RARRAY_LEN(numeric_labels_array); i++) { | ||
VALUE key_str_pair = rb_ary_entry(numeric_labels_array, i); | ||
|
@@ -115,12 +120,14 @@ static VALUE _native_sample( | |
|
||
sampling_buffer *buffer = sampling_buffer_new(max_frames_requested); | ||
|
||
ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = labels_count}; | ||
|
||
sample_thread( | ||
thread, | ||
buffer, | ||
recorder_instance, | ||
values, | ||
(ddog_prof_Slice_Label) {.ptr = labels, .len = labels_count}, | ||
(sample_labels) {.labels = slice_labels, .state_label = state_label}, | ||
RTEST(in_gc) ? SAMPLE_IN_GC : SAMPLE_REGULAR | ||
); | ||
|
||
|
@@ -134,7 +141,7 @@ void sample_thread( | |
sampling_buffer* buffer, | ||
VALUE recorder_instance, | ||
sample_values values, | ||
ddog_prof_Slice_Label labels, | ||
sample_labels labels, | ||
sample_type type | ||
) { | ||
// Samples thread into recorder | ||
|
@@ -171,6 +178,8 @@ void sample_thread( | |
rb_raise(rb_eArgError, "Unexpected value for sample_type: %d", type); | ||
} | ||
|
||
#define CHARSLICE_EQUALS(must_be_a_literal, charslice) (strlen("" must_be_a_literal) == charslice.len && strncmp(must_be_a_literal, charslice.ptr, charslice.len) == 0) | ||
|
||
// Idea: Should we release the global vm lock (GVL) after we get the data from `rb_profile_frames`? That way other Ruby threads | ||
// could continue making progress while the sample was ingested into the profile. | ||
// | ||
|
@@ -195,7 +204,7 @@ static void sample_thread_internal( | |
sampling_buffer* buffer, | ||
VALUE recorder_instance, | ||
sample_values values, | ||
ddog_prof_Slice_Label labels, | ||
sample_labels labels, | ||
sampling_buffer *record_buffer, | ||
int extra_frames_in_record_buffer | ||
) { | ||
|
@@ -228,6 +237,15 @@ static void sample_thread_internal( | |
VALUE last_ruby_frame = Qnil; | ||
int last_ruby_line = 0; | ||
|
||
ddog_prof_Label *state_label = labels.state_label; | ||
bool cpu_or_wall_sample = values.cpu_or_wall_samples > 0; | ||
bool has_cpu_time = cpu_or_wall_sample && values.cpu_time_ns > 0; | ||
bool only_wall_time = cpu_or_wall_sample && values.cpu_time_ns == 0 && values.wall_time_ns > 0; | ||
|
||
if (cpu_or_wall_sample && state_label == NULL) rb_raise(rb_eRuntimeError, "BUG: Unexpected missing state_label"); | ||
|
||
if (has_cpu_time) state_label->str = DDOG_CHARSLICE_C("had cpu"); | ||
|
||
for (int i = captured_frames - 1; i >= 0; i--) { | ||
VALUE name, filename; | ||
int line; | ||
|
@@ -248,10 +266,55 @@ static void sample_thread_internal( | |
name = NIL_P(name) ? missing_string : name; | ||
filename = NIL_P(filename) ? missing_string : filename; | ||
|
||
ddog_CharSlice name_slice = char_slice_from_ruby_string(name); | ||
ddog_CharSlice filename_slice = char_slice_from_ruby_string(filename); | ||
|
||
bool top_of_the_stack = i == 0; | ||
|
||
// When there's only wall-time in a sample, this means that the thread was not active in the sampled period. | ||
// | ||
// We try to categorize what it was doing based on what we observe at the top of the stack. This is a very rough | ||
// approximation, and in the future we hope to replace this with a more accurate approach (such as using the | ||
// GVL instrumentation API.) | ||
if (top_of_the_stack && only_wall_time) { | ||
if (!buffer->is_ruby_frame[i]) { | ||
// We know that known versions of Ruby implement these using native code; thus if we find a method with the | ||
// same name that is not native code, we ignore it, as it's probably a user method that coincidentally | ||
// has the same name. Thus, even though "matching just by method name" is kinda weak, | ||
// "matching by method name" + is native code seems actually to be good enough for a lot of cases. | ||
|
||
if (CHARSLICE_EQUALS("sleep", name_slice)) { // Expected to be Kernel.sleep | ||
state_label->str = DDOG_CHARSLICE_C("sleeping"); | ||
} else if (CHARSLICE_EQUALS("select", name_slice)) { // Expected to be Kernel.select | ||
state_label->str = DDOG_CHARSLICE_C("waiting"); | ||
} else if ( | ||
CHARSLICE_EQUALS("synchronize", name_slice) || // Expected to be Monitor/Mutex#synchronize | ||
CHARSLICE_EQUALS("lock", name_slice) || // Expected to be Mutex#lock | ||
CHARSLICE_EQUALS("join", name_slice) // Expected to be Thread#join | ||
) { | ||
state_label->str = DDOG_CHARSLICE_C("blocked"); | ||
} else if (CHARSLICE_EQUALS("wait_readable", name_slice)) { // Expected to be IO#wait_readable | ||
state_label->str = DDOG_CHARSLICE_C("network"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I assume There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Exactly. I'm pretty sure it's possible to do better, by looking at parts of the stack we aren't today to identify the class of instance that |
||
} | ||
#ifdef NO_PRIMITIVE_POP // Ruby < 3.2 | ||
else if (CHARSLICE_EQUALS("pop", name_slice)) { // Expected to be Queue/SizedQueue#pop | ||
state_label->str = DDOG_CHARSLICE_C("waiting"); | ||
} | ||
#endif | ||
} else { | ||
#ifndef NO_PRIMITIVE_POP // Ruby >= 3.2 | ||
// Unlike the above, Ruby actually treats this one specially and gives it a nice file name we can match on! | ||
if (CHARSLICE_EQUALS("pop", name_slice) && CHARSLICE_EQUALS("<internal:thread_sync>", filename_slice)) { // Expected to be Queue/SizedQueue#pop | ||
state_label->str = DDOG_CHARSLICE_C("waiting"); | ||
} | ||
#endif | ||
} | ||
} | ||
|
||
buffer->locations[i] = (ddog_prof_Location) { | ||
.function = (ddog_prof_Function) { | ||
.name = char_slice_from_ruby_string(name), | ||
.filename = char_slice_from_ruby_string(filename) | ||
.name = name_slice, | ||
.filename = filename_slice, | ||
}, | ||
.line = line, | ||
}; | ||
|
@@ -320,7 +383,7 @@ static void record_placeholder_stack_in_native_code( | |
sampling_buffer* buffer, | ||
VALUE recorder_instance, | ||
sample_values values, | ||
ddog_prof_Slice_Label labels, | ||
sample_labels labels, | ||
sampling_buffer *record_buffer, | ||
int extra_frames_in_record_buffer | ||
) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,13 +2,21 @@ | |
|
||
#include <datadog/profiling.h> | ||
|
||
typedef struct sample_values { | ||
typedef struct { | ||
int64_t cpu_time_ns; | ||
int64_t wall_time_ns; | ||
uint32_t cpu_samples; | ||
uint32_t cpu_or_wall_samples; | ||
uint32_t alloc_samples; | ||
} sample_values; | ||
|
||
void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, sample_values values, ddog_prof_Slice_Label labels); | ||
typedef struct sample_labels { | ||
ddog_prof_Slice_Label labels; | ||
|
||
// This is used to allow the `Collectors::Stack` to modify the existing label, if any. This MUST be NULL or point | ||
// somewhere inside the labels slice above. | ||
Comment on lines
+15
to
+16
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: Feels a bit weird to have this "pollute" the model/prototypes when it seems to be an internal detail of Addendum: After progressing in the review, noticed you do have the sanity check of checking state label exists if it's a CPU/wall sample so I guess we'd be forced to look the label up for almost every sample which may justify the "caching" done here so I have less of an issue with it now and leaving this here just to make sure I understood intentions 😄 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You got it exactly right. This label was a bit of a new thing, because up until now all labels were set by the thread context collector, and they were opaque to everyone else. But now suddenly we have this need to have a label that shows up based on the stack, which with the current structure of the profiler happens later than all the other labels. And yeah, this is something that will happen on every cpu/wall sample, for every thread, with the Ruby VM stopped for sampling, so it's on the hot path and thus we'd need to be careful if we wanted to abstract this to avoid overhead. So that's why it ended up like this. As a follow-up note, it doesn't 100% need to be like this. Currently, the array for the labels is stack allocated in the thread context collector, and it's sized to fit exactly the number of labels we need. If we were to stop stack-allocating it, and instead had a pre-sized buffer that was a bit bigger, we could then keep a pointer to the next free element and the size of the buffer or something like that, and the stack collector could then just add the label when it wants. (Technically we could do that with the stack-allocated array, and my first prototype of this did that, but I didn't quite like how it looked) The stack collector already has a number of such fixed buffers to store the raw stack sample it gets from the VM, and indeed in a past prototype they started as stack-allocated and then moved to reusable heap-allocated buffers. Furthermore, if we moved more stuff away from stack allocation, we could have a bit more freedom in sequencing the operations in a sample, e.g. right now we have collectors calling each other as shown in https://github.com/DataDog/dd-trace-rb/blob/master/docs/ProfilingDevelopment.md#how-sampling-happens , but we could move to a more flat approach: sample the_sample = ...
thread_context.add_labels(the_sample)
thread_context.add_values(the_sample)
stack.add_stack(the_sample)
...
recorder.record(the_sample) So yeah... Happy to revisit these things as you find them :) |
||
ddog_prof_Label *state_label; | ||
} sample_labels; | ||
|
||
void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, sample_values values, sample_labels labels); | ||
void record_endpoint(VALUE recorder_instance, uint64_t local_root_span_id, ddog_CharSlice endpoint); | ||
VALUE enforce_recorder_instance(VALUE object); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We have an explicit concat with
""
in the strlen part but no such concat in thestrncmp
. I'd expect to need one in both places or neither, what's the deal here? 😄There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So this is a bit of C dark magic cool trick. It relies on the fact that you can do this in C:
And the result is concatenated. Note that you can only do this for literals, that is, it's not a generic way of joining strings in C.
In this case, I wanted to force the string being compared to the charslice to be a literal, so by by doing
This causes the compiler to barf if you do this:
Output:
The error could be a bit better but I hope the "must_be_a_literal" hint would lead people in the right direction ;)
The concat doesn't need to happen on both because
"" must_be_a_literal
doesn't do anything else other than cause the error to happen, and otherwise is a no-op. (Doing it on both causes more errors on the compiler side, so from that POV is even slightly worse)As a final note, why I did this: more out of a tiny cheap micro-optimization; by forcing it to be a literal, I know the compiler can optimize away the
strlen
; otherwise I would probably have wanted to cache it somewhere, which would be more boilerplate.