diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index e27f8b53894..dff70397c16 100644 --- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -1140,15 +1140,9 @@ static VALUE rescued_sample_allocation(VALUE tracepoint_data) { discrete_dynamic_sampler_events_since_last_sample(&state->allocation_sampler) : // if we aren't, then we're sampling every event 1; - - // To control bias from sampling, we clamp the maximum weight attributed to a single allocation sample. This avoids - // assigning a very large number to a sample, if for instance the dynamic sampling mechanism chose a really big interval. + // TODO: Signal in the profile that clamping happened? unsigned int weight = allocations_since_last_sample > MAX_ALLOC_WEIGHT ? MAX_ALLOC_WEIGHT : (unsigned int) allocations_since_last_sample; thread_context_collector_sample_allocation(state->thread_context_collector_instance, weight, new_object); - // ...but we still represent the skipped samples in the profile, thus the data will account for all allocations. - if (weight < allocations_since_last_sample) { - thread_context_collector_sample_skipped_allocation_samples(state->thread_context_collector_instance, allocations_since_last_sample - weight); - } // Return a dummy VALUE because we're called from rb_rescue2 which requires it return Qnil; diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index fd50f6a591b..733aeba4104 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -268,8 +268,7 @@ void sample_thread( recorder_instance, (ddog_prof_Slice_Location) {.ptr = buffer->locations, .len = captured_frames}, values, - labels, - /* placeholder: */ false + labels ); } @@ -381,8 +380,7 @@ void record_placeholder_stack( recorder_instance, (ddog_prof_Slice_Location) {.ptr = buffer->locations, .len = 1}, values, - labels, - /* placeholder: */ true + labels ); } diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index 6b675ee4d58..6378cde7fe1 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -231,7 +231,6 @@ static void ddtrace_otel_trace_identifiers_for( VALUE active_span, VALUE otel_values ); -static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE skipped_samples); void collectors_thread_context_init(VALUE profiling_module) { VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors"); @@ -262,7 +261,6 @@ void collectors_thread_context_init(VALUE profiling_module) { rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1); rb_define_singleton_method(testing_module, "_native_gc_tracking", _native_gc_tracking, 1); rb_define_singleton_method(testing_module, "_native_new_empty_thread", _native_new_empty_thread, 0); - rb_define_singleton_method(testing_module, "_native_sample_skipped_allocation_samples", _native_sample_skipped_allocation_samples, 2); at_active_span_id = rb_intern_const("@active_span"); at_active_trace_id = rb_intern_const("@active_trace"); @@ -1402,33 +1400,3 @@ static void ddtrace_otel_trace_identifiers_for( *active_trace = current_trace; *numeric_span_id = resolved_numeric_span_id; } - -void thread_context_collector_sample_skipped_allocation_samples(VALUE self_instance, unsigned int skipped_samples) { - struct thread_context_collector_state *state; - TypedData_Get_Struct(self_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state); - - ddog_prof_Label labels[] = { - // Providing .num = 0 should not be needed but the tracer-2.7 docker image ships a buggy gcc that complains about this - {.key = DDOG_CHARSLICE_C("thread id"), .str = DDOG_CHARSLICE_C("SA"), .num = 0}, - {.key = DDOG_CHARSLICE_C("thread name"), .str = DDOG_CHARSLICE_C("Skipped Samples"), .num = 0}, - {.key = DDOG_CHARSLICE_C("allocation class"), .str = DDOG_CHARSLICE_C("(Skipped Samples)"), .num = 0}, - }; - ddog_prof_Slice_Label slice_labels = {.ptr = labels, .len = sizeof(labels) / sizeof(labels[0])}; - - record_placeholder_stack( - state->sampling_buffer, - state->recorder_instance, - (sample_values) {.alloc_samples = skipped_samples}, - (sample_labels) { - .labels = slice_labels, - .state_label = NULL, - .end_timestamp_ns = 0, // For now we're not collecting timestamps for allocation events - }, - DDOG_CHARSLICE_C("Skipped Samples") - ); -} - -static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE skipped_samples) { - thread_context_collector_sample_skipped_allocation_samples(collector_instance, NUM2UINT(skipped_samples)); - return Qtrue; -} diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.h b/ext/datadog_profiling_native_extension/collectors_thread_context.h index 073c1caa911..6299d96b43e 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.h +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.h @@ -9,7 +9,6 @@ void thread_context_collector_sample( VALUE profiler_overhead_stack_thread ); void thread_context_collector_sample_allocation(VALUE self_instance, unsigned int sample_weight, VALUE new_object); -void thread_context_collector_sample_skipped_allocation_samples(VALUE self_instance, unsigned int skipped_samples); VALUE thread_context_collector_sample_after_gc(VALUE self_instance); void thread_context_collector_on_gc_start(VALUE self_instance); bool thread_context_collector_on_gc_finish(VALUE self_instance); diff --git a/ext/datadog_profiling_native_extension/heap_recorder.c b/ext/datadog_profiling_native_extension/heap_recorder.c index d96ba5b544c..9702a32bbb8 100644 --- a/ext/datadog_profiling_native_extension/heap_recorder.c +++ b/ext/datadog_profiling_native_extension/heap_recorder.c @@ -166,12 +166,6 @@ struct heap_recorder { size_t objects_frozen; } stats_last_update; }; - -struct end_heap_allocation_args { - struct heap_recorder *heap_recorder; - ddog_prof_Slice_Location locations; -}; - static heap_record* get_or_create_heap_record(heap_recorder*, ddog_prof_Slice_Location); static void cleanup_heap_record_if_unused(heap_recorder*, heap_record*); static void on_committed_object_record_cleanup(heap_recorder *heap_recorder, object_record *record); @@ -182,7 +176,6 @@ static int st_object_records_iterate(st_data_t, st_data_t, st_data_t); static int st_object_records_debug(st_data_t key, st_data_t value, st_data_t extra); static int update_object_record_entry(st_data_t*, st_data_t*, st_data_t, int); static void commit_recording(heap_recorder*, heap_record*, recording); -static VALUE end_heap_allocation_recording(VALUE end_heap_allocation_args); // ========================== // Heap Recorder External API @@ -347,28 +340,9 @@ void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj }; } -// end_heap_allocation_recording_with_rb_protect gets called while the stack_recorder is holding one of the profile -// locks. To enable us to correctly unlock the profile on exception, we wrap the call to end_heap_allocation_recording -// with an rb_protect. -__attribute__((warn_unused_result)) -int end_heap_allocation_recording_with_rb_protect(struct heap_recorder *heap_recorder, ddog_prof_Slice_Location locations) { - int exception_state; - struct end_heap_allocation_args end_heap_allocation_args = { - .heap_recorder = heap_recorder, - .locations = locations, - }; - rb_protect(end_heap_allocation_recording, (VALUE) &end_heap_allocation_args, &exception_state); - return exception_state; -} - -static VALUE end_heap_allocation_recording(VALUE end_heap_allocation_args) { - struct end_heap_allocation_args *args = (struct end_heap_allocation_args *) end_heap_allocation_args; - - struct heap_recorder *heap_recorder = args->heap_recorder; - ddog_prof_Slice_Location locations = args->locations; - +void end_heap_allocation_recording(struct heap_recorder *heap_recorder, ddog_prof_Slice_Location locations) { if (heap_recorder == NULL) { - return Qnil; + return; } recording active_recording = heap_recorder->active_recording; @@ -382,16 +356,15 @@ static VALUE end_heap_allocation_recording(VALUE end_heap_allocation_args) { // data required for committing though. heap_recorder->active_recording = (recording) {0}; - if (active_recording.object_record == &SKIPPED_RECORD) { // special marker when we decided to skip due to sampling - return Qnil; + if (active_recording.object_record == &SKIPPED_RECORD) { + // special marker when we decided to skip due to sampling + return; } heap_record *heap_record = get_or_create_heap_record(heap_recorder, locations); // And then commit the new allocation. commit_recording(heap_recorder, heap_record, active_recording); - - return Qnil; } void heap_recorder_prepare_iteration(heap_recorder *heap_recorder) { diff --git a/ext/datadog_profiling_native_extension/heap_recorder.h b/ext/datadog_profiling_native_extension/heap_recorder.h index 6647d2e7432..4d8a7aa1e89 100644 --- a/ext/datadog_profiling_native_extension/heap_recorder.h +++ b/ext/datadog_profiling_native_extension/heap_recorder.h @@ -114,9 +114,7 @@ void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj // @param locations The stacktrace representing the location of the allocation. // // WARN: It is illegal to call this without previously having called ::start_heap_allocation_recording. -// WARN: This method rescues exceptions with `rb_protect`, returning the exception state integer for the caller to handle. -__attribute__((warn_unused_result)) -int end_heap_allocation_recording_with_rb_protect(heap_recorder *heap_recorder, ddog_prof_Slice_Location locations); +void end_heap_allocation_recording(heap_recorder *heap_recorder, ddog_prof_Slice_Location locations); // Update the heap recorder to reflect the latest state of the VM and prepare internal structures // for efficient iteration. diff --git a/ext/datadog_profiling_native_extension/stack_recorder.c b/ext/datadog_profiling_native_extension/stack_recorder.c index d16471ce496..b7d930b8835 100644 --- a/ext/datadog_profiling_native_extension/stack_recorder.c +++ b/ext/datadog_profiling_native_extension/stack_recorder.c @@ -592,7 +592,7 @@ static VALUE ruby_time_from(ddog_Timespec ddprof_time) { return rb_time_timespec_new(&time, utc); } -void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, sample_values values, sample_labels labels, bool placeholder) { +void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, sample_values values, sample_labels labels) { struct stack_recorder_state *state; TypedData_Get_Struct(recorder_instance, struct stack_recorder_state, &stack_recorder_typed_data, state); @@ -612,20 +612,12 @@ void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, metric_values[position_for[ALLOC_SAMPLES_UNSCALED_VALUE_ID]] = values.alloc_samples_unscaled; metric_values[position_for[TIMELINE_VALUE_ID]] = values.timeline_wall_time_ns; - if (!placeholder && values.alloc_samples > 0) { + if (values.alloc_samples != 0) { // If we got an allocation sample end the heap allocation recording to commit the heap sample. // FIXME: Heap sampling currently has to be done in 2 parts because the construction of locations is happening // very late in the allocation-sampling path (which is shared with the cpu sampling path). This can // be fixed with some refactoring but for now this leads to a less impactful change. - // - // NOTE: The heap recorder is allowed to raise exceptions if something's wrong. But we also need to handle it - // on this side to make sure we properly unlock the active slot mutex on our way out. Otherwise, this would - // later lead to deadlocks (since the active slot mutex is not expected to be locked forever). - int exception_state = end_heap_allocation_recording_with_rb_protect(state->heap_recorder, locations); - if (exception_state) { - sampler_unlock_active_profile(active_slot); - rb_jump_tag(exception_state); - } + end_heap_allocation_recording(state->heap_recorder, locations); } ddog_prof_Profile_Result result = ddog_prof_Profile_add( diff --git a/ext/datadog_profiling_native_extension/stack_recorder.h b/ext/datadog_profiling_native_extension/stack_recorder.h index 3908bdb0e7b..30b56746364 100644 --- a/ext/datadog_profiling_native_extension/stack_recorder.h +++ b/ext/datadog_profiling_native_extension/stack_recorder.h @@ -22,7 +22,7 @@ typedef struct sample_labels { int64_t end_timestamp_ns; } sample_labels; -void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, sample_values values, sample_labels labels, bool placeholder); +void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, sample_values values, sample_labels labels); void record_endpoint(VALUE recorder_instance, uint64_t local_root_span_id, ddog_CharSlice endpoint); void track_object(VALUE recorder_instance, VALUE new_object, unsigned int sample_weight, ddog_CharSlice *alloc_class); VALUE enforce_recorder_instance(VALUE object); diff --git a/lib/datadog/profiling/profiler.rb b/lib/datadog/profiling/profiler.rb index 8d6578a8fc8..3231eabc330 100644 --- a/lib/datadog/profiling/profiler.rb +++ b/lib/datadog/profiling/profiler.rb @@ -59,7 +59,6 @@ def component_failed(failed_component) # we're operating in a degraded state and crash tracking may still be helpful. if failed_component == :worker - scheduler.mark_profiler_failed stop_scheduler elsif failed_component == :scheduler stop_worker diff --git a/lib/datadog/profiling/scheduler.rb b/lib/datadog/profiling/scheduler.rb index 73cc94fc27a..96da490b4e2 100644 --- a/lib/datadog/profiling/scheduler.rb +++ b/lib/datadog/profiling/scheduler.rb @@ -22,8 +22,7 @@ class Scheduler < Core::Worker attr_reader \ :exporter, - :transport, - :profiler_failed + :transport public @@ -35,7 +34,6 @@ def initialize( ) @exporter = exporter @transport = transport - @profiler_failed = false # Workers::Async::Thread settings self.fork_policy = fork_policy @@ -82,14 +80,8 @@ def loop_wait_before_first_iteration? true end - # This is called by the Profiler class whenever an issue happened in the profiler. This makes sure that even - # if there is data to be flushed, we don't try to flush it. - def mark_profiler_failed - @profiler_failed = true - end - def work_pending? - !profiler_failed && exporter.can_flush? + exporter.can_flush? end def reset_after_fork diff --git a/sig/datadog/profiling/scheduler.rbs b/sig/datadog/profiling/scheduler.rbs index 8a7e9765ed6..fabd51ecabb 100644 --- a/sig/datadog/profiling/scheduler.rbs +++ b/sig/datadog/profiling/scheduler.rbs @@ -14,7 +14,6 @@ module Datadog def start: (?on_failure_proc: ::Proc?) -> void def reset_after_fork: () -> void - def mark_profiler_failed: () -> true end end end diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 55a80406c92..633e1271ac6 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -507,7 +507,6 @@ context 'with dynamic_sampling_rate_enabled' do let(:options) { { dynamic_sampling_rate_enabled: true } } - it 'keeps statistics on how allocation sampling is doing' do stub_const('CpuAndWallTimeWorkerSpec::TestStruct', Struct.new(:foo)) @@ -537,34 +536,6 @@ one_second_in_ns = 1_000_000_000 expect(sampling_time_ns_max).to be < one_second_in_ns, "A single sample should not take longer than 1s, #{stats}" end - - # When large numbers of objects are allocated, the dynamic sampling rate kicks in, and we don't sample every - # object. - # We then assign a weight to every sample to compensate for this; to avoid bias, we have a limit on this weight, - # and we clamp it if it goes over the limit. - # But the total amount of allocations recorded should match the number we observed, and thus we record the - # remainder above the clamped value as a separate "Skipped Samples" step. - it 'records skipped allocation samples when weights are clamped' do - start - - thread_that_allocates_as_fast_as_possible = Thread.new { loop { BasicObject.new } } - - Thread.pass - - allocation_samples = try_wait_until do - samples = samples_from_pprof(recorder.serialize!).select { |it| it.values[:'alloc-samples'] > 0 } - samples if samples.any? { |it| it.labels[:'thread name'] == 'Skipped Samples' } - end - - thread_that_allocates_as_fast_as_possible.kill - thread_that_allocates_as_fast_as_possible.join - - GC.start - - cpu_and_wall_time_worker.stop - - expect(allocation_samples).to_not be_empty - end end context 'when sampling optimized Ruby strings' do @@ -658,7 +629,6 @@ let(:options) { { dynamic_sampling_rate_enabled: false } } before do - skip 'Heap profiling is only supported on Ruby >= 2.7' if RUBY_VERSION < '2.7' allow(Datadog.logger).to receive(:warn) expect(Datadog.logger).to receive(:warn).with(/dynamic sampling rate disabled/) end diff --git a/spec/datadog/profiling/collectors/thread_context_spec.rb b/spec/datadog/profiling/collectors/thread_context_spec.rb index a6e3ccaa385..024a127abfb 100644 --- a/spec/datadog/profiling/collectors/thread_context_spec.rb +++ b/spec/datadog/profiling/collectors/thread_context_spec.rb @@ -79,10 +79,6 @@ def sample_allocation(weight:, new_object: Object.new) described_class::Testing._native_sample_allocation(cpu_and_wall_time_collector, weight, new_object) end - def sample_skipped_allocation_samples(skipped_samples) - described_class::Testing._native_sample_skipped_allocation_samples(cpu_and_wall_time_collector, skipped_samples) - end - def thread_list described_class::Testing._native_thread_list end @@ -1220,41 +1216,6 @@ def self.otel_sdk_available? end end - describe '#sample_skipped_allocation_samples' do - let(:single_sample) do - expect(samples.size).to be 1 - samples.first - end - before { sample_skipped_allocation_samples(123) } - - it 'records the number of skipped allocations' do - expect(single_sample.values).to include('alloc-samples': 123) - end - - it 'attributes the skipped samples to a "Skipped Samples" thread' do - expect(single_sample.labels).to include('thread id': 'SA', 'thread name': 'Skipped Samples') - end - - it 'attributes the skipped samples to a "(Skipped Samples)" allocation class' do - expect(single_sample.labels).to include('allocation class': '(Skipped Samples)') - end - - it 'includes a placeholder stack attributed to "Skipped Samples"' do - expect(single_sample.locations.size).to be 1 - expect(single_sample.locations.first.path).to eq 'Skipped Samples' - end - - context 'when heap sampling is enabled' do - let(:recorder) { build_stack_recorder(heap_samples_enabled: true) } - - it 'records only the number of skipped allocations, and does not record any heap samples' do - GC.start # Force any incorrect heap samples to have age > 1 - - expect(single_sample.values).to include('alloc-samples': 123, 'heap-live-samples': 0) - end - end - end - describe '#thread_list' do it "returns the same as Ruby's Thread.list" do expect(thread_list).to eq Thread.list diff --git a/spec/datadog/profiling/profiler_spec.rb b/spec/datadog/profiling/profiler_spec.rb index ccdbf7b69ff..98ee1722754 100644 --- a/spec/datadog/profiling/profiler_spec.rb +++ b/spec/datadog/profiling/profiler_spec.rb @@ -123,7 +123,6 @@ before do allow(scheduler).to receive(:enabled=) allow(scheduler).to receive(:stop) - allow(scheduler).to receive(:mark_profiler_failed) end it 'logs the issue' do @@ -132,12 +131,6 @@ worker_on_failure end - it 'marks the profiler as having failed in the scheduler' do - expect(scheduler).to receive(:mark_profiler_failed) - - worker_on_failure - end - it 'stops the scheduler' do expect(scheduler).to receive(:enabled=).with(false) expect(scheduler).to receive(:stop).with(true) diff --git a/spec/datadog/profiling/scheduler_spec.rb b/spec/datadog/profiling/scheduler_spec.rb index fa3ea06f38f..3a33b9037ff 100644 --- a/spec/datadog/profiling/scheduler_spec.rb +++ b/spec/datadog/profiling/scheduler_spec.rb @@ -249,15 +249,6 @@ it { is_expected.to be false } end - - context 'when the profiler was marked as failed' do - before do - scheduler.mark_profiler_failed - expect(exporter).to_not receive(:can_flush?) - end - - it { is_expected.to be false } - end end describe '#reset_after_fork' do diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index 1731e0cce6d..7320decf400 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -412,7 +412,6 @@ def sample_allocation(obj) end before do - GC.start allocations = [a_string, an_array, "a fearsome interpolated string: #{sample_rate}", (-10..-1).to_a, a_hash, { 'z' => -1, 'y' => '-2', 'x' => false }, Object.new] @num_allocations = 0 @@ -781,33 +780,6 @@ def create_obj_in_recycled_slot(should_sample_original: false) expect(relevant_sample).to be nil end end - - # NOTE: This is a regression test that exceptions in end_heap_allocation_recording_with_rb_protect are safely - # handled by the stack_recorder. - context 'when the heap sampler raises an exception during _native_sample' do - it 'propagates the exception' do - expect do - Datadog::Profiling::Collectors::Stack::Testing - ._native_sample(Thread.current, stack_recorder, metric_values, labels, numeric_labels, 400, false) - end.to raise_error(RuntimeError, /Ended a heap recording/) - end - - it 'does not keep the active slot mutex locked' do - expect(active_slot).to be 1 - expect(slot_one_mutex_locked?).to be false - expect(slot_two_mutex_locked?).to be true - - begin - Datadog::Profiling::Collectors::Stack::Testing - ._native_sample(Thread.current, stack_recorder, metric_values, labels, numeric_labels, 400, false) - rescue # rubocop:disable Lint/SuppressedException - end - - expect(active_slot).to be 1 - expect(slot_one_mutex_locked?).to be false - expect(slot_two_mutex_locked?).to be true - end - end end end