Merge pull request #3797 from DataDog/ivoanjo/prof-10201-coarse-timestamps

[PROF-10201] Reduce allocation profiling overhead by using coarse timestamps
ivoanjo authored Jul 24, 2024
2 parents 5d1295c + 446f2ad commit 2cdc371
Showing 8 changed files with 227 additions and 58 deletions.
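
A note on the idea behind this change before the diffs: CLOCK_MONOTONIC_COARSE is a Linux-specific clock that is substantially cheaper to read than CLOCK_MONOTONIC, at the cost of precision (its resolution is the kernel tick, typically 1-4 ms, rather than nanoseconds). The diffs below use it on the allocation hot path, where a timestamp is only needed to decide whether a readjustment window has elapsed. A minimal standalone C sketch (not part of the diff) that prints both clocks' resolutions:

#include <stdio.h>
#include <time.h>

// Prints the resolution of a clock, as reported by clock_getres().
static void print_resolution(const char *name, clockid_t clock_id) {
  struct timespec res;
  if (clock_getres(clock_id, &res) != 0) {
    printf("%s: unavailable\n", name);
    return;
  }
  printf("%s: %ld ns resolution\n", name, (long) res.tv_sec * 1000000000L + res.tv_nsec);
}

int main(void) {
  print_resolution("CLOCK_MONOTONIC", CLOCK_MONOTONIC); // typically reports 1 ns
#ifdef CLOCK_MONOTONIC_COARSE
  print_resolution("CLOCK_MONOTONIC_COARSE", CLOCK_MONOTONIC_COARSE); // typically the kernel tick, ~1-4 ms
#endif
  return 0;
}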
66 changes: 66 additions & 0 deletions benchmarks/profiler_allocation.rb
@@ -0,0 +1,66 @@
# Used to quickly run this benchmark under RSpec as part of the usual test suite, to validate it hasn't bitrotted
VALIDATE_BENCHMARK_MODE = ENV['VALIDATE_BENCHMARK'] == 'true'

return unless __FILE__ == $PROGRAM_NAME || VALIDATE_BENCHMARK_MODE

require 'benchmark/ips'
require 'datadog'
require 'pry'
require_relative 'dogstatsd_reporter'

# This benchmark measures the performance of allocation profiling

class ExportToFile
PPROF_PREFIX = ENV.fetch('DD_PROFILING_PPROF_PREFIX', 'profiler-allocation')

def export(flush)
File.write("#{PPROF_PREFIX}#{flush.start.strftime('%Y%m%dT%H%M%SZ')}.pprof", flush.pprof_data)
true
end
end

class ProfilerAllocationBenchmark
def run_benchmark
Benchmark.ips do |x|
benchmark_time = VALIDATE_BENCHMARK_MODE ? { time: 0.01, warmup: 0 } : { time: 10, warmup: 2 }
x.config(
**benchmark_time,
suite: report_to_dogstatsd_if_enabled_via_environment_variable(benchmark_name: 'profiler_allocation')
)

x.report('Allocations (baseline)', 'BasicObject.new')

x.save! 'profiler-allocation-results.json' unless VALIDATE_BENCHMARK_MODE
x.compare!
end

Datadog.configure do |c|
c.profiling.enabled = true
c.profiling.allocation_enabled = true
c.profiling.advanced.gc_enabled = false
c.profiling.exporter.transport = ExportToFile.new unless VALIDATE_BENCHMARK_MODE
end
Datadog::Profiling.wait_until_running

3.times { GC.start }

Benchmark.ips do |x|
benchmark_time = VALIDATE_BENCHMARK_MODE ? { time: 0.01, warmup: 0 } : { time: 10, warmup: 2 }
x.config(
**benchmark_time,
suite: report_to_dogstatsd_if_enabled_via_environment_variable(benchmark_name: 'profiler_allocation')
)

x.report("Allocations (#{ENV['CONFIG']})", 'BasicObject.new')

x.save! 'profiler-allocation-results.json' unless VALIDATE_BENCHMARK_MODE
x.compare!
end
end
end

puts "Current pid is #{Process.pid}"

ProfilerAllocationBenchmark.new.instance_exec do
run_benchmark
end
@@ -1045,8 +1045,27 @@ static VALUE _native_allocation_count(DDTRACE_UNUSED VALUE self) {
return are_allocations_being_tracked ? ULL2NUM(allocation_count) : Qnil;
}

+#define HANDLE_CLOCK_FAILURE(call) ({ \
+  long _result = (call); \
+  if (_result == 0) { \
+    delayed_error(state, ERR_CLOCK_FAIL); \
+    return; \
+  } \
+  _result; \
+})

+// Implements memory-related profiling events. This function is called by Ruby via the `object_allocation_tracepoint`
+// when the RUBY_INTERNAL_EVENT_NEWOBJ event is triggered.
+//
+// When allocation sampling is enabled, this function gets called for almost all* objects allocated by the Ruby VM.
+// (*In some weird cases the VM may skip this tracepoint.)
+//
+// At a high level, there are two paths through this function:
+// 1. should_sample == false -> return
+// 2. should_sample == true -> sample
+//
+// On big applications, path 1. is the hottest, since we don't sample every object. So it's quite important for it to
+// be as fast as possible.
static void on_newobj_event(VALUE tracepoint_data, DDTRACE_UNUSED void *unused) {
// Update thread-local allocation count
if (RB_UNLIKELY(allocation_count == UINT64_MAX)) {
@@ -1061,25 +1080,39 @@ static void on_newobj_event(DDTRACE_UNUSED VALUE tracepoint_data, DDTRACE_UNUSED void *unused)
// and disabled before it is cleared, but just in case...
if (state == NULL) return;

-  // In a few cases, we may actually be allocating an object as part of profiler sampling. We don't want to recursively
+  // In rare cases, we may actually be allocating an object as part of profiler sampling. We don't want to recursively
// sample, so we just return early
if (state->during_sample) {
state->stats.allocations_during_sample++;
return;
}

-  if (state->dynamic_sampling_rate_enabled) {
-    long now = monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE);
-    if (now == 0) {
-      delayed_error(state, ERR_CLOCK_FAIL);
-      return;
-    }
-    if (!discrete_dynamic_sampler_should_sample(&state->allocation_sampler, now)) {
-      state->stats.allocation_skipped++;
-      return;
-    }
+  // Hot path: Dynamic sampling rate is usually enabled and the sampling decision is usually false
+  if (RB_LIKELY(state->dynamic_sampling_rate_enabled && !discrete_dynamic_sampler_should_sample(&state->allocation_sampler))) {
+    state->stats.allocation_skipped++;
+
+    coarse_instant now = monotonic_coarse_wall_time_now_ns();
+    HANDLE_CLOCK_FAILURE(now.timestamp_ns);
+
+    bool needs_readjust = discrete_dynamic_sampler_skipped_sample(&state->allocation_sampler, now);
+    if (RB_UNLIKELY(needs_readjust)) {
+      // We rarely readjust, so this is a cold path
+      // Also, while above we used the cheaper monotonic_coarse, for this call we want the regular monotonic call,
+      // which is why we end up getting time "again".
+      discrete_dynamic_sampler_readjust(
+        &state->allocation_sampler, HANDLE_CLOCK_FAILURE(monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE))
+      );
+    }
+
+    return;
  }

+  // From here on, we've decided to go ahead with the sample, which is way less common than skipping it
+
+  discrete_dynamic_sampler_before_sample(
+    &state->allocation_sampler, HANDLE_CLOCK_FAILURE(monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE))
+  );

// @ivoanjo: Strictly speaking, this is not needed because Ruby should not call the same tracepoint while a previous
// invocation is still pending, (e.g. it wouldn't call `on_newobj_event` while it's already running), but I decided
// to keep this here for consistency -- every call to the thread context (other than the special gc calls which are
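
An aside on the HANDLE_CLOCK_FAILURE macro introduced above: it relies on the GCC/Clang statement-expression extension (`({ ... })`), where the block evaluates to its last expression. That lets a single macro both early-return from the enclosing function when the clock read failed (returned 0) and yield the timestamp to the surrounding expression when it succeeded. A standalone sketch of the pattern, with hypothetical read_clock_maybe_failing/record_failure helpers that are not from the diff:

#include <stdio.h>

// Statement expression: the ({ ... }) block is an expression whose value is its
// last statement; the `return` inside exits the function the macro is used in.
#define CHECKED_CLOCK(call) ({ \
  long _value = (call); \
  if (_value == 0) { \
    record_failure(); \
    return; \
  } \
  _value; \
})

static void record_failure(void) { fprintf(stderr, "clock read failed\n"); }
static long read_clock_maybe_failing(void) { return 123456789; } // hypothetical; 0 would signal failure

static void do_work(void) {
  long now = CHECKED_CLOCK(read_clock_maybe_failing()); // early-returns from do_work on failure
  printf("timestamp: %ld\n", now);
}

int main(void) { do_work(); return 0; }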
@@ -20,6 +20,9 @@

#define EMA_SMOOTHING_FACTOR 0.6

+static void maybe_readjust(discrete_dynamic_sampler *sampler, long now_ns);
+inline bool should_readjust(discrete_dynamic_sampler *sampler, coarse_instant now);

void discrete_dynamic_sampler_init(discrete_dynamic_sampler *sampler, const char *debug_name, long now_ns) {
sampler->debug_name = debug_name;
discrete_dynamic_sampler_set_overhead_target_percentage(sampler, BASE_OVERHEAD_PCT, now_ns);
@@ -54,27 +57,25 @@ void discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sa
return discrete_dynamic_sampler_reset(sampler, now_ns);
}

-static void maybe_readjust(discrete_dynamic_sampler *sampler, long now);
-
-bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler, long now_ns) {
+// NOTE: See header for an explanation of when this should get used
+__attribute__((warn_unused_result))
+bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler) {
// For efficiency reasons we don't do true random sampling but rather systematic
// sampling following a sample interval/skip. This can be biased and hide patterns
-  // but the dynamic interval and rather indeterministic pattern of allocations in
+  // but the dynamic interval and rather nondeterministic pattern of allocations in
// most real applications should help reduce the bias impact.
sampler->events_since_last_sample++;
sampler->events_since_last_readjustment++;
-  bool should_sample = sampler->sampling_interval > 0 && sampler->events_since_last_sample >= sampler->sampling_interval;
-
-  if (should_sample) {
-    sampler->sample_start_time_ns = now_ns;
-  } else {
-    // check if we should readjust our sampler after this event, even if we didn't sample it
-    maybe_readjust(sampler, now_ns);
-  }
-
-  return should_sample;
+  return sampler->sampling_interval > 0 && sampler->events_since_last_sample >= sampler->sampling_interval;
 }

+// NOTE: See header for an explanation of when this should get used
+void discrete_dynamic_sampler_before_sample(discrete_dynamic_sampler *sampler, long now_ns) {
+  sampler->sample_start_time_ns = now_ns;
+}

// NOTE: See header for an explanation of when this should get used
long discrete_dynamic_sampler_after_sample(discrete_dynamic_sampler *sampler, long now_ns) {
long last_sampling_time_ns = sampler->sample_start_time_ns == 0 ? 0 : long_max_of(0, now_ns - sampler->sample_start_time_ns);
sampler->samples_since_last_readjustment++;
@@ -95,6 +96,11 @@ size_t discrete_dynamic_sampler_events_since_last_sample(discrete_dynamic_sample
return sampler->events_since_last_sample;
}

+// NOTE: See header for an explanation of when this should get used
+bool discrete_dynamic_sampler_skipped_sample(discrete_dynamic_sampler *sampler, coarse_instant now) {
+  return should_readjust(sampler, now);
+}

static double ewma_adj_window(double latest_value, double avg, long current_window_time_ns, bool is_first) {
if (is_first) {
return latest_value;
@@ -109,19 +115,27 @@ static double ewma_adj_window(double latest_value, double avg, long current_wind
return (1-alpha) * avg + alpha * latest_value;
}

-static void maybe_readjust(discrete_dynamic_sampler *sampler, long now) {
-  long this_window_time_ns = sampler->last_readjust_time_ns == 0 ? ADJUSTMENT_WINDOW_NS : now - sampler->last_readjust_time_ns;
+static void maybe_readjust(discrete_dynamic_sampler *sampler, long now_ns) {
+  if (should_readjust(sampler, to_coarse_instant(now_ns))) discrete_dynamic_sampler_readjust(sampler, now_ns);
+}
+
+inline bool should_readjust(discrete_dynamic_sampler *sampler, coarse_instant now) {
+  long this_window_time_ns =
+    sampler->last_readjust_time_ns == 0 ? ADJUSTMENT_WINDOW_NS : now.timestamp_ns - sampler->last_readjust_time_ns;

   bool should_readjust_based_on_time = this_window_time_ns >= ADJUSTMENT_WINDOW_NS;
   bool should_readjust_based_on_samples = sampler->samples_since_last_readjustment >= ADJUSTMENT_WINDOW_SAMPLES;

-  if (!should_readjust_based_on_time && !should_readjust_based_on_samples) {
-    // not enough time or samples have passed to perform a readjustment
-    return;
-  }
+  return should_readjust_based_on_time || should_readjust_based_on_samples;
+}

-  if (this_window_time_ns == 0) {
-    // should not be possible given previous condition but lets protect against div by 0 below.
+// NOTE: This method ASSUMES should_readjust == true
+// NOTE: See header for an explanation of when this should get used
+void discrete_dynamic_sampler_readjust(discrete_dynamic_sampler *sampler, long now_ns) {
+  long this_window_time_ns = sampler->last_readjust_time_ns == 0 ? ADJUSTMENT_WINDOW_NS : now_ns - sampler->last_readjust_time_ns;
+
+  if (this_window_time_ns <= 0) {
+    // should not be possible given should_readjust, but let's protect against div by 0 below.
     return;
   }

@@ -143,7 +157,7 @@ static void maybe_readjust(discrete_dynamic_sampler *sampler, long now) {
// Lets update our average sampling time per event
long avg_sampling_time_in_window_ns = sampler->samples_since_last_readjustment == 0 ? 0 : sampler->sampling_time_since_last_readjustment_ns / sampler->samples_since_last_readjustment;
if (avg_sampling_time_in_window_ns > sampler->max_sampling_time_ns) {
-    // If the average sampling time in the previous window was deemed unnacceptable, clamp it to the
+    // If the average sampling time in the previous window was deemed unacceptable, clamp it to the
// maximum acceptable value and register this operation in our counter.
// NOTE: This is important so that events like suspensions or system overloads do not lead us to
// learn arbitrarily big sampling times which may then result in us not sampling anything
@@ -286,7 +300,7 @@ static void maybe_readjust(discrete_dynamic_sampler *sampler, long now) {
sampler->events_since_last_readjustment = 0;
sampler->samples_since_last_readjustment = 0;
sampler->sampling_time_since_last_readjustment_ns = 0;
-  sampler->last_readjust_time_ns = now;
+  sampler->last_readjust_time_ns = now_ns;
sampler->has_completed_full_adjustment_window = true;
}

@@ -406,7 +420,14 @@ VALUE _native_should_sample(VALUE self, VALUE now_ns) {
sampler_state *state;
TypedData_Get_Struct(self, sampler_state, &sampler_typed_data, state);

-  return discrete_dynamic_sampler_should_sample(&state->sampler, NUM2LONG(now_ns)) ? Qtrue : Qfalse;
+  if (discrete_dynamic_sampler_should_sample(&state->sampler)) {
+    discrete_dynamic_sampler_before_sample(&state->sampler, NUM2LONG(now_ns));
+    return Qtrue;
+  } else {
+    bool needs_readjust = discrete_dynamic_sampler_skipped_sample(&state->sampler, to_coarse_instant(NUM2LONG(now_ns)));
+    if (needs_readjust) discrete_dynamic_sampler_readjust(&state->sampler, NUM2LONG(now_ns));
+    return Qfalse;
+  }
}

VALUE _native_after_sample(VALUE self, VALUE now_ns) {
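
For readers tracing the math: ewma_adj_window (partially visible above) smooths per-window measurements with an exponentially weighted moving average whose weight is scaled by how much of a full adjustment window the measurement covers, ending in the (1-alpha) * avg + alpha * latest_value line shown. A simplified standalone sketch of that idea; the exact alpha computation in the real function may differ:

#include <stdio.h>

#define EMA_SMOOTHING_FACTOR 0.6
#define ADJUSTMENT_WINDOW_NS 1000000000L // assumption for this sketch: 1-second window

// EWMA where a shorter-than-full window contributes proportionally less.
static double ewma_adj_window_sketch(double latest_value, double avg, long current_window_time_ns, int is_first) {
  if (is_first) return latest_value;
  double window_fraction = (double) current_window_time_ns / ADJUSTMENT_WINDOW_NS;
  if (window_fraction > 1.0) window_fraction = 1.0;
  double alpha = EMA_SMOOTHING_FACTOR * window_fraction;
  return (1 - alpha) * avg + alpha * latest_value;
}

int main(void) {
  double avg = ewma_adj_window_sketch(100.0, 0.0, 0, 1); // first value is taken as-is
  avg = ewma_adj_window_sketch(50.0, avg, 500000000L, 0); // half window -> half-strength alpha
  printf("smoothed: %f\n", avg); // 0.7 * 100 + 0.3 * 50 = 85
  return 0;
}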
@@ -5,6 +5,8 @@

#include <ruby.h>

#include "time_helpers.h"

// A sampler that will sample discrete events based on the overhead of their
// sampling.
//
@@ -62,7 +64,6 @@ typedef struct discrete_dynamic_sampler {
unsigned long sampling_time_clamps;
} discrete_dynamic_sampler;


// Init a new sampler with sane defaults.
void discrete_dynamic_sampler_init(discrete_dynamic_sampler *sampler, const char *debug_name, long now_ns);

@@ -80,9 +81,38 @@ void discrete_dynamic_sampler_set_overhead_target_percentage(discrete_dynamic_sa
// @return True if the event associated with this decision should be sampled, false
// otherwise.
//
-// NOTE: If true is returned we implicitly assume the start of a sampling operation
-// and it is expected that a follow-up after_sample call is issued.
-bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler, long now_ns);
+// IMPORTANT: A call to this method MUST be followed by a call to either
+// `discrete_dynamic_sampler_before_sample` if return is `true` or
+// `discrete_dynamic_sampler_skipped_sample` if return is `false`.
+//
+// In the past, this method took care of before_sample/skipped_sample/readjust as well.
+// We've had to split it up because we wanted to both use coarse time and regular monotonic time depending on the
+// situation, but also wanted time to come as an argument from the outside.
+//
+// TL;DR here's how they should be used as Ruby code:
+//   if discrete_dynamic_sampler_should_sample
+//     discrete_dynamic_sampler_before_sample(monotonic_wall_time_now_ns())
+//   else
+//     needs_readjust = discrete_dynamic_sampler_skipped_sample(monotonic_coarse_wall_time_now_ns())
+//     discrete_dynamic_sampler_readjust(monotonic_wall_time_now_ns()) if needs_readjust
+//   end
+__attribute__((warn_unused_result))
+bool discrete_dynamic_sampler_should_sample(discrete_dynamic_sampler *sampler);

+// Signal the start of a sampling operation.
+// MUST be called after `discrete_dynamic_sampler_should_sample` returns `true`.
+void discrete_dynamic_sampler_before_sample(discrete_dynamic_sampler *sampler, long now_ns);

+// Signals that sampling did not happen.
+// MUST be called after `discrete_dynamic_sampler_should_sample` returns `false`.
+//
+// @return True if the sampler needs to be readjusted.
+//
+// IMPORTANT: A call to this method MUST be followed by a call to `discrete_dynamic_sampler_readjust` if return is `true`.
+__attribute__((warn_unused_result))
+bool discrete_dynamic_sampler_skipped_sample(discrete_dynamic_sampler *sampler, coarse_instant now);

+void discrete_dynamic_sampler_readjust(discrete_dynamic_sampler *sampler, long now_ns);

// Signal the end of a sampling operation.
//
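
Restating the header's Ruby TL;DR as a C caller sketch, using the function names from this diff (error handling for failed clock reads, which on_newobj_event does via HANDLE_CLOCK_FAILURE, is omitted here for brevity):

#include <stdbool.h>
#include "discrete_dynamic_sampler.h"
#include "time_helpers.h"

// The call protocol required by the split-up sampler API.
static void on_event(discrete_dynamic_sampler *sampler) {
  if (discrete_dynamic_sampler_should_sample(sampler)) {
    // Sampling path (rare): pay for the precise monotonic clock.
    discrete_dynamic_sampler_before_sample(sampler, monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE));
    // ... take the sample, then report its duration via discrete_dynamic_sampler_after_sample ...
  } else {
    // Skipping path (hot): the cheap coarse clock is enough to check whether a
    // readjustment window has elapsed.
    bool needs_readjust = discrete_dynamic_sampler_skipped_sample(sampler, monotonic_coarse_wall_time_now_ns());
    if (needs_readjust) {
      // Readjustment is rare, so the precise clock is affordable here.
      discrete_dynamic_sampler_readjust(sampler, monotonic_wall_time_now_ns(DO_NOT_RAISE_ON_FAILURE));
    }
  }
}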
3 changes: 3 additions & 0 deletions ext/datadog_profiling_native_extension/extconf.rb
@@ -129,6 +129,9 @@ def add_compiler_flag(flag)
# but it's slower to build
# so instead we just assume that we have the function we need on Linux, and nowhere else
$defs << '-DHAVE_PTHREAD_GETCPUCLOCKID'

+  # Not available on macOS
+  $defs << '-DHAVE_CLOCK_MONOTONIC_COARSE'
end

have_func 'malloc_stats'
15 changes: 0 additions & 15 deletions ext/datadog_profiling_native_extension/time_helpers.c
@@ -4,21 +4,6 @@
#include "ruby_helpers.h"
#include "time_helpers.h"

-// Safety: This function is assumed never to raise exceptions by callers when raise_on_failure == false
-long retrieve_clock_as_ns(clockid_t clock_id, bool raise_on_failure) {
-  struct timespec clock_value;
-
-  if (clock_gettime(clock_id, &clock_value) != 0) {
-    if (raise_on_failure) ENFORCE_SUCCESS_GVL(errno);
-    return 0;
-  }
-
-  return clock_value.tv_nsec + SECONDS_AS_NS(clock_value.tv_sec);
-}
-
-long monotonic_wall_time_now_ns(bool raise_on_failure) { return retrieve_clock_as_ns(CLOCK_MONOTONIC, raise_on_failure); }
-long system_epoch_time_now_ns(bool raise_on_failure) { return retrieve_clock_as_ns(CLOCK_REALTIME, raise_on_failure); }

// Design: The monotonic_to_system_epoch_state struct is kept somewhere by the caller, and MUST be initialized to
// MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER.
//
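
These helpers leave the .c file, but the matching time_helpers.h diff is not shown on this page, so their destination isn't visible here. Given that monotonic_wall_time_now_ns is still called in the collector diff above, they were presumably moved into the header (plausibly as static inline, so the hot path avoids function-call overhead), alongside the new coarse_instant/monotonic_coarse_wall_time_now_ns helpers whose names do appear in the diffs. A hedged sketch of what the coarse variant could look like; the bodies and the clock_as_ns_or_zero helper name below are assumptions, and the real helpers also handle the raise-on-failure mode:

#include <time.h>

// Assumed shape after the move; only the names coarse_instant, to_coarse_instant,
// and monotonic_coarse_wall_time_now_ns appear in the diffs above.
typedef struct { long timestamp_ns; } coarse_instant;

static inline coarse_instant to_coarse_instant(long timestamp_ns) {
  return (coarse_instant) {.timestamp_ns = timestamp_ns};
}

static inline long clock_as_ns_or_zero(clockid_t clock_id) {
  struct timespec clock_value;
  // Keep the "0 means failure" convention that HANDLE_CLOCK_FAILURE checks for.
  if (clock_gettime(clock_id, &clock_value) != 0) return 0;
  return (long) clock_value.tv_sec * 1000000000L + clock_value.tv_nsec;
}

static inline coarse_instant monotonic_coarse_wall_time_now_ns(void) {
#ifdef HAVE_CLOCK_MONOTONIC_COARSE
  return to_coarse_instant(clock_as_ns_or_zero(CLOCK_MONOTONIC_COARSE));
#else
  // Fallback for platforms without the coarse clock (e.g. macOS).
  return to_coarse_instant(clock_as_ns_or_zero(CLOCK_MONOTONIC));
#endif
}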
