Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PROF-9263] Add experimental support for profiling code hotspots when used with opentelemetry ruby gem #3510

Merged
10 commits merged into from
Oct 10, 2024
Merged
5 changes: 3 additions & 2 deletions Matrixfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
'' => '✅ 2.5 / ✅ 2.6 / ✅ 2.7 / ✅ 3.0 / ✅ 3.1 / ✅ 3.2 / ✅ 3.3 / ✅ 3.4 / ✅ jruby'
},
'profiling:main' => {
'opentelemetry' => '❌ 2.5 / ✅ 2.6 / ✅ 2.7 / ✅ 3.0 / ✅ 3.1 / ✅ 3.2 / ✅ 3.3 / ✅ 3.4 / ❌ jruby',
'' => '✅ 2.5 / ✅ 2.6 / ✅ 2.7 / ✅ 3.0 / ✅ 3.1 / ✅ 3.2 / ✅ 3.3 / ✅ 3.4 / ✅ jruby'
'opentelemetry' => '❌ 2.5 / ✅ 2.6 / ✅ 2.7 / ✅ 3.0 / ✅ 3.1 / ✅ 3.2 / ✅ 3.3 / ✅ 3.4 / ❌ jruby',
'opentelemetry_otlp' => '❌ 2.5 / ✅ 2.6 / ✅ 2.7 / ✅ 3.0 / ✅ 3.1 / ✅ 3.2 / ✅ 3.3 / ✅ 3.4 / ❌ jruby',
'' => '✅ 2.5 / ✅ 2.6 / ✅ 2.7 / ✅ 3.0 / ✅ 3.1 / ✅ 3.2 / ✅ 3.3 / ✅ 3.4 / ✅ jruby',
},
'profiling:ractors' => {
'' => '❌ 2.5 / ❌ 2.6 / ❌ 2.7 / ✅ 3.0 / ✅ 3.1 / ✅ 3.2 / ✅ 3.3 / ✅ 3.4 / ✅ jruby'
Expand Down
5 changes: 5 additions & 0 deletions appraisal/ruby-2.6.rb
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,11 @@
gem 'opentelemetry-sdk', '~> 1.1'
end

appraise 'opentelemetry_otlp' do
gem 'opentelemetry-sdk', '~> 1.1'
gem 'opentelemetry-exporter-otlp'
end

[3, 4, 5].each do |n|
appraise "redis-#{n}" do
gem 'redis', "~> #{n}"
Expand Down
5 changes: 5 additions & 0 deletions appraisal/ruby-2.7.rb
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,11 @@
gem 'opentelemetry-sdk', '~> 1.1'
end

appraise 'opentelemetry_otlp' do
gem 'opentelemetry-sdk', '~> 1.1'
gem 'opentelemetry-exporter-otlp'
end

[3, 4, 5].each do |n|
appraise "redis-#{n}" do
gem 'redis', "~> #{n}"
Expand Down
5 changes: 5 additions & 0 deletions appraisal/ruby-3.0.rb
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,11 @@
gem 'opentelemetry-sdk', '~> 1.1'
end

appraise 'opentelemetry_otlp' do
gem 'opentelemetry-sdk', '~> 1.1'
gem 'opentelemetry-exporter-otlp'
end

appraise 'contrib-old' do
gem 'dalli', '< 3.0.0'
gem 'presto-client', '>= 0.5.14' # Renamed to trino-client in >= 1.0
Expand Down
5 changes: 5 additions & 0 deletions appraisal/ruby-3.1.rb
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,11 @@
gem 'opentelemetry-sdk', '~> 1.1'
end

appraise 'opentelemetry_otlp' do
gem 'opentelemetry-sdk', '~> 1.1'
gem 'opentelemetry-exporter-otlp'
end

appraise 'contrib-old' do
gem 'dalli', '< 3.0.0'
gem 'presto-client', '>= 0.5.14' # Renamed to trino-client in >= 1.0
Expand Down
5 changes: 5 additions & 0 deletions appraisal/ruby-3.2.rb
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,11 @@
gem 'opentelemetry-sdk', '~> 1.1'
end

appraise 'opentelemetry_otlp' do
gem 'opentelemetry-sdk', '~> 1.1'
gem 'opentelemetry-exporter-otlp'
end

appraise 'contrib-old' do
gem 'dalli', '< 3.0.0'
gem 'presto-client', '>= 0.5.14' # Renamed to trino-client in >= 1.0
Expand Down
5 changes: 5 additions & 0 deletions appraisal/ruby-3.3.rb
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,11 @@
gem 'opentelemetry-sdk', '~> 1.1'
end

appraise 'opentelemetry_otlp' do
gem 'opentelemetry-sdk', '~> 1.1'
gem 'opentelemetry-exporter-otlp'
end

appraise 'contrib-old' do
gem 'dalli', '< 3.0.0'
gem 'presto-client', '>= 0.5.14' # Renamed to trino-client in >= 1.0
Expand Down
5 changes: 5 additions & 0 deletions appraisal/ruby-3.4.rb
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,11 @@
gem 'opentelemetry-sdk', '~> 1.1'
end

appraise 'opentelemetry_otlp' do
gem 'opentelemetry-sdk', '~> 1.1'
gem 'opentelemetry-exporter-otlp'
end

appraise 'contrib-old' do
gem 'dalli', '< 3.0.0'
gem 'presto-client', '>= 0.5.14' # Renamed to trino-client in >= 1.0
Expand Down
183 changes: 172 additions & 11 deletions ext/datadog_profiling_native_extension/collectors_thread_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,17 @@ static ID at_otel_values_id; // id of :@otel_values in Ruby
static ID at_parent_span_id_id; // id of :@parent_span_id in Ruby
static ID at_datadog_trace_id; // id of :@datadog_trace in Ruby

// Used to support reading trace identifiers from the opentelemetry Ruby library when the ddtrace gem tracing
// integration is NOT in use.
static ID at_span_id_id; // id of :@span_id in Ruby
static ID at_trace_id_id; // id of :@trace_id in Ruby
static ID at_entries_id; // id of :@entries in Ruby
static ID at_context_id; // id of :@context in Ruby
static ID at_kind_id; // id of :@kind in Ruby
static ID at_name_id; // id of :@name in Ruby
static ID server_id; // id of :server in Ruby
static ID otel_context_storage_id; // id of :__opentelemetry_context_storage__ in Ruby

// Contains state for a single ThreadContext instance
struct thread_context_collector_state {
// Note: Places in this file that usually need to be changed when this struct is changed are tagged with
Expand Down Expand Up @@ -164,6 +175,11 @@ struct trace_identifiers {
VALUE trace_endpoint;
};

struct otel_span {
VALUE span;
VALUE span_context;
};

static void thread_context_collector_typed_data_mark(void *state_ptr);
static void thread_context_collector_typed_data_free(void *state_ptr);
static int hash_map_per_thread_context_mark(st_data_t key_thread, st_data_t _value, st_data_t _argument);
Expand Down Expand Up @@ -237,6 +253,13 @@ static void ddtrace_otel_trace_identifiers_for(
VALUE otel_values
);
static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE skipped_samples);
static void otel_without_ddtrace_trace_identifiers_for(
struct thread_context_collector_state *state,
VALUE thread,
struct trace_identifiers *trace_identifiers_result
);
static struct otel_span otel_span_and_context_from(VALUE otel_context, VALUE otel_current_span_key);
static uint64_t otel_span_id_to_uint(VALUE otel_span_id);

void collectors_thread_context_init(VALUE profiling_module) {
VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
Expand Down Expand Up @@ -278,6 +301,14 @@ void collectors_thread_context_init(VALUE profiling_module) {
at_otel_values_id = rb_intern_const("@otel_values");
at_parent_span_id_id = rb_intern_const("@parent_span_id");
at_datadog_trace_id = rb_intern_const("@datadog_trace");
at_span_id_id = rb_intern_const("@span_id");
at_trace_id_id = rb_intern_const("@trace_id");
at_entries_id = rb_intern_const("@entries");
at_context_id = rb_intern_const("@context");
at_kind_id = rb_intern_const("@kind");
at_name_id = rb_intern_const("@name");
server_id = rb_intern_const("server");
otel_context_storage_id = rb_intern_const("__opentelemetry_context_storage__");

gc_profiling_init();
}
Expand Down Expand Up @@ -759,6 +790,11 @@ static void trigger_sample_for_thread(
struct trace_identifiers trace_identifiers_result = {.valid = false, .trace_endpoint = Qnil};
trace_identifiers_for(state, thread, &trace_identifiers_result);

if (!trace_identifiers_result.valid) {
Copy link
Contributor

@AlexJF AlexJF Mar 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Worth/possible doing a bit of extra work at the start to arrive at a sticky decision here and short-circuit constantly failing trace_identifiers_for with all its rb_var_get if ddtrace is not used for tracing at all?

Or thinking is that we want to support situations where there's a mix of ddtrace and pure-ot traces and/or the ability to change between one and the other dynamically (e.g. via a feature flag)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think your suggestion makes sense.

My intent here in checking both is that the profiler may start quite early in the app lifecycle, so we may not know which one is going to be used yet.

Or thinking is that we want to support situations where there's a mix of ddtrace and pure-ot traces and/or the ability to change between one and the other dynamically (e.g. via a feature flag)?

I'm not sure mixing is even possible at this point, since the ddtrace otel support monkey patches itself pretty deep into opentelemetry (which is why I needed to contort a bit to be able to test both).

For that reason, and after our last discussion, I think it makes sense to stop checking opentelemetry once we see data coming from ddtrace traces.

The reverse is harder to figure out, actually. It would be weird, but not impossible, for an app that started with opentelemetry to then switch over to ddtrace.


TL;DR: I'll wait for feedback from our customer on how this is working before acting on this comment, just in case we end up going in a completely different direction BUT I'll definitely come back to it before marking the PR as non-draft.

// If we couldn't get something with ddtrace, let's see if we can get some trace identifiers from opentelemetry directly
otel_without_ddtrace_trace_identifiers_for(state, thread, &trace_identifiers_result);
}

if (trace_identifiers_result.valid) {
labels[label_pos++] = (ddog_prof_Label) {.key = DDOG_CHARSLICE_C("local root span id"), .num = trace_identifiers_result.local_root_span_id};
labels[label_pos++] = (ddog_prof_Label) {.key = DDOG_CHARSLICE_C("span id"), .num = trace_identifiers_result.span_id};
Expand Down Expand Up @@ -1372,25 +1408,31 @@ static ddog_CharSlice ruby_value_type_to_class_name(enum ruby_value_type type) {
}
}

// Used to access OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY). Will raise exceptions if it fails.
static VALUE read_otel_current_span_key_const(DDTRACE_UNUSED VALUE _unused) {
VALUE opentelemetry_module = rb_const_get(rb_cObject, rb_intern("OpenTelemetry"));
ENFORCE_TYPE(opentelemetry_module, T_MODULE);
VALUE trace_module = rb_const_get(opentelemetry_module, rb_intern("Trace"));
ENFORCE_TYPE(trace_module, T_MODULE);
return rb_const_get(trace_module, rb_intern("CURRENT_SPAN_KEY"));
}

static VALUE get_otel_current_span_key(struct thread_context_collector_state *state) {
if (state->otel_current_span_key == Qnil) {
VALUE datadog_module = rb_const_get(rb_cObject, rb_intern("Datadog"));
VALUE opentelemetry_module = rb_const_get(datadog_module, rb_intern("OpenTelemetry"));
VALUE api_module = rb_const_get(opentelemetry_module, rb_intern("API"));
VALUE context_module = rb_const_get(api_module, rb_intern_const("Context"));
VALUE current_span_key = rb_const_get(context_module, rb_intern_const("CURRENT_SPAN_KEY"));

if (current_span_key == Qnil) {
rb_raise(rb_eRuntimeError, "Unexpected: Missing Datadog::OpenTelemetry::API::Context::CURRENT_SPAN_KEY");
}
// If this fails, we want to fail gracefully, rather than raise an exception (e.g. if the opentelemetry gem
// gets refactored, we should not fall on our face)
VALUE span_key = rb_protect(read_otel_current_span_key_const, Qnil, NULL);

// Marks when we failed to get the value, so we don't wasting resources trying
VALUE not_found_marker = Qfalse;

state->otel_current_span_key = current_span_key;
state->otel_current_span_key = span_key != Qnil ? span_key : not_found_marker;
}

return state->otel_current_span_key;
}

// This method gets used when ddtrace is being used indirectly via the otel APIs. Information gets stored slightly
// This method gets used when ddtrace is being used indirectly via the opentelemetry APIs. Information gets stored slightly
// differently, and this codepath handles it.
static void ddtrace_otel_trace_identifiers_for(
struct thread_context_collector_state *state,
Expand All @@ -1410,6 +1452,7 @@ static void ddtrace_otel_trace_identifiers_for(
if (resolved_numeric_span_id == Qnil) return;

VALUE otel_current_span_key = get_otel_current_span_key(state);
if (otel_current_span_key == Qfalse) return;
VALUE current_trace = *active_trace;

// ddtrace uses a different structure when spans are created from otel, where each otel span will have a unique ddtrace
Expand Down Expand Up @@ -1462,3 +1505,121 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
thread_context_collector_sample_skipped_allocation_samples(collector_instance, NUM2UINT(skipped_samples));
return Qtrue;
}

// This method differs from trace_identifiers_for/ddtrace_otel_trace_identifiers_for to support the situation where
// the opentelemetry ruby library is being used for tracing AND the ddtrace tracing bits are not involved at all.
//
// Thus, in this case, we're directly reading from the opentelemetry stuff, which is different to how ddtrace tracing
// does it.
//
// This is somewhat brittle: we're coupling on internal details of the opentelemetry gem to get what we need. In the
// future maybe the otel ruby folks would be open to having a nice public way of getting this data that suits the
// usecase of profilers.
// Until then, the strategy below is to be extremely defensive, and if anything is out of place, we immediately return
// and give up on getting trace data from opentelemetry. (Thus, worst case would be -- you upgrade opentelemetry and
// profiling features relying on reading this data stop working, but you'll still get profiles and the app will be
// otherwise undisturbed).
//
// Specifically, the way this works is:
// 1. The latest entry in the opentelemetry context storage represents the current span (if any). We take the span id
// and trace id from this span.
// 2. To find the local root span id, we walk the context storage backwards from the current span, and find the earliest
// entry in the context storage that has the same trace id as the current span; we use the found span as the local
// root span id.
// This matches the semantics of how ddtrace tracing creates a TraceOperation and assigns a local root span to it.
static void otel_without_ddtrace_trace_identifiers_for(
struct thread_context_collector_state *state,
VALUE thread,
struct trace_identifiers *trace_identifiers_result
) {
VALUE context_storage = rb_thread_local_aref(thread, otel_context_storage_id /* __opentelemetry_context_storage__ */);

// If it exists, context_storage is expected to be an Array[OpenTelemetry::Context]
if (context_storage == Qnil || !RB_TYPE_P(context_storage, T_ARRAY)) return;

VALUE otel_current_span_key = get_otel_current_span_key(state);
if (otel_current_span_key == Qfalse) return;

int active_context_index = RARRAY_LEN(context_storage) - 1;
if (active_context_index < 0) return;

struct otel_span active_span_and_context = otel_span_and_context_from(rb_ary_entry(context_storage, active_context_index), otel_current_span_key);
// If it exists, active_span_context is expected to be a OpenTelemetry::Trace::SpanContext (don't confuse it with OpenTelemetry::Context)
VALUE active_span_context = active_span_and_context.span_context;
if (active_span_context == Qnil) return;

// Get the span id and trace id from the active span...
VALUE active_span_id = rb_ivar_get(active_span_context, at_span_id_id /* @span_id */);
VALUE active_span_trace_id = rb_ivar_get(active_span_context, at_trace_id_id /* @trace_id */);
if (active_span_id == Qnil || active_span_trace_id == Qnil || !RB_TYPE_P(active_span_id, T_STRING) || !RB_TYPE_P(active_span_trace_id, T_STRING)) return;

VALUE local_root_span_id = active_span_id;
VALUE local_root_span = active_span_and_context.span;

// Now find the oldest span starting from the active span that still has the same trace id as the active span
for (int i = active_context_index - 1; i >= 0; i--) {
struct otel_span span_and_context = otel_span_and_context_from(rb_ary_entry(context_storage, i), otel_current_span_key);
VALUE span_context = span_and_context.span_context;
if (span_context == Qnil) return;

VALUE span_id = rb_ivar_get(span_context, at_span_id_id /* @span_id */);
VALUE span_trace_id = rb_ivar_get(span_context, at_trace_id_id /* @trace_id */);
if (span_id == Qnil || span_trace_id == Qnil || !RB_TYPE_P(span_id, T_STRING) || !RB_TYPE_P(span_trace_id, T_STRING)) return;

if (rb_str_equal(active_span_trace_id, span_trace_id) == Qfalse) break;

local_root_span_id = span_id;
local_root_span = span_and_context.span;
}

// Convert the span ids into uint64_t to match what the Datadog tracer does
trace_identifiers_result->span_id = otel_span_id_to_uint(active_span_id);
trace_identifiers_result->local_root_span_id = otel_span_id_to_uint(local_root_span_id);

if (trace_identifiers_result->span_id == 0 || trace_identifiers_result->local_root_span_id == 0) return;

trace_identifiers_result->valid = true;

if (!state->endpoint_collection_enabled) return;

VALUE root_span_type = rb_ivar_get(local_root_span, at_kind_id /* @kind */);
// We filter out spans that don't have `kind: :server`
if (root_span_type == Qnil || !RB_TYPE_P(root_span_type, T_SYMBOL) || SYM2ID(root_span_type) != server_id) return;

VALUE trace_resource = rb_ivar_get(local_root_span, at_name_id /* @name */);
if (!RB_TYPE_P(trace_resource, T_STRING)) return;

trace_identifiers_result->trace_endpoint = trace_resource;
}

static struct otel_span otel_span_and_context_from(VALUE otel_context, VALUE otel_current_span_key) {
struct otel_span failed = {.span = Qnil, .span_context = Qnil};

if (otel_context == Qnil) return failed;

VALUE context_entries = rb_ivar_get(otel_context, at_entries_id /* @entries */);
if (context_entries == Qnil || !RB_TYPE_P(context_entries, T_HASH)) return failed;

// If it exists, context_entries is expected to be a Hash[OpenTelemetry::Context::Key, OpenTelemetry::Trace::Span]
VALUE span = rb_hash_lookup(context_entries, otel_current_span_key);
if (span == Qnil) return failed;

return (struct otel_span) {.span = span, .span_context = rb_ivar_get(span, at_context_id /* @context */)};
}

// Otel span ids are represented as a big-endian 8-byte string
static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
if (!RB_TYPE_P(otel_span_id, T_STRING) || RSTRING_LEN(otel_span_id) != 8) { return 0; }

unsigned char *span_bytes = (unsigned char*) StringValuePtr(otel_span_id);

return \
((uint64_t)span_bytes[0] << 56) |
((uint64_t)span_bytes[1] << 48) |
((uint64_t)span_bytes[2] << 40) |
((uint64_t)span_bytes[3] << 32) |
((uint64_t)span_bytes[4] << 24) |
((uint64_t)span_bytes[5] << 16) |
((uint64_t)span_bytes[6] << 8) |
((uint64_t)span_bytes[7]);
}
45 changes: 45 additions & 0 deletions gemfiles/ruby_2.6_opentelemetry_otlp.gemfile

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading