Skip to content

Commit

Permalink
Merge pull request #3956 from DataDog/zachg/lazy_sampling
Browse files Browse the repository at this point in the history
  • Loading branch information
marcotc authored Oct 7, 2024
2 parents 94cfe27 + bc81964 commit 9e26192
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 20 deletions.
1 change: 1 addition & 0 deletions lib/datadog/tracing/distributed/propagation.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def initialize(
# DEV-2.0: if needed.
# DEV-2.0: Ideally, we'd have a separate stream to report tracer errors and never
# DEV-2.0: touch the active span.
# DEV-3.0: Sample trace here instead of when generating digest.
#
# @param digest [TraceDigest]
# @param data [Hash]
Expand Down
7 changes: 6 additions & 1 deletion lib/datadog/tracing/sampling/matcher.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def match?(trace)
# @return [#match?(String)]
def self.glob_to_regex(glob)
# Optimization for match-all case
return MATCH_ALL if glob == MATCH_ALL_PATTERN
return MATCH_ALL if /\A\*+\z/.match?(glob)

# Ensure no undesired characters are treated as regex.
glob = Regexp.quote(glob)
Expand Down Expand Up @@ -100,6 +100,11 @@ def tags_match?(trace)
@tags.all? do |name, matcher|
tag = trace.get_tag(name)

# Floats: Matching floating point values with a non-zero decimal part is not supported.
# For floating point values with a non-zero decimal part, any all * pattern always returns true.
# Other patterns always return false.
return false if tag.is_a?(Float) && tag.truncate != tag && matcher != MATCH_ALL

# Format metrics as strings, to allow for partial number matching (/4.*/ matching '400', '404', etc.).
# Because metrics are floats, we use the '%g' format specifier to avoid trailing zeros, which
# can affect exact string matching (e.g. '400' matching '400.0').
Expand Down
28 changes: 26 additions & 2 deletions lib/datadog/tracing/trace_operation.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

require_relative '../core/environment/identity'
require_relative '../core/utils'

require_relative 'tracer'
require_relative 'event'
require_relative 'metadata/tagging'
require_relative 'sampling/ext'
Expand Down Expand Up @@ -75,7 +75,9 @@ def initialize(
metrics: nil,
trace_state: nil,
trace_state_unknown_fields: nil,
remote_parent: false
remote_parent: false,
tracer: nil

)
# Attributes
@id = id || Tracing::Utils::TraceId.next_id
Expand All @@ -98,6 +100,7 @@ def initialize(
@profiling_enabled = profiling_enabled
@trace_state = trace_state
@trace_state_unknown_fields = trace_state_unknown_fields
@tracer = tracer

# Generic tags
set_tags(tags) if tags
Expand Down Expand Up @@ -161,6 +164,23 @@ def resource
@resource || (root_span && root_span.resource)
end

# When retrieving tags or metrics we need to include root span tags for sampling purposes
def get_tag(key)
super || (root_span && root_span.get_tag(key))
end

def get_metric(key)
super || (root_span && root_span.get_metric(key))
end

def tags
all_tags = {}
all_tags.merge!(root_span&.tags || {}) if root_span
all_tags.merge!(super)

all_tags
end

# Returns true if the resource has been explicitly set
#
# @return [Boolean]
Expand Down Expand Up @@ -284,10 +304,14 @@ def flush!
# Returns a set of trace headers used for continuing traces.
# Used for propagation across execution contexts.
# Data should reflect the active state of the trace.
# DEV-3.0: Sampling is a side effect of generating the digest.
# We should move the sample call to inject and right before moving to new contexts(threads, forking etc.)
def to_digest
# Resolve current span ID
span_id = @active_span && @active_span.id
span_id ||= @parent_span_id unless finished?
# sample the trace_operation with the tracer
@tracer&.sample_trace(self) unless sampling_priority

TraceDigest.new(
span_id: span_id,
Expand Down
26 changes: 14 additions & 12 deletions lib/datadog/tracing/tracer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,17 @@ def continue_trace!(digest, key = nil, &block)
context.activate!(trace, &block)
end

# Sample a span, tagging the trace as appropriate.
def sample_trace(trace_op)
begin
@sampler.sample!(trace_op)
rescue StandardError => e
SAMPLE_TRACE_LOG_ONLY_ONCE.run do
Datadog.logger.warn { "Failed to sample trace: #{e.class.name} #{e} at #{Array(e.backtrace).first}" }
end
end
end

# @!visibility private
# TODO: make this private
def trace_completed
Expand Down Expand Up @@ -331,12 +342,14 @@ def build_trace(digest = nil)
trace_state: digest.trace_state,
trace_state_unknown_fields: digest.trace_state_unknown_fields,
remote_parent: digest.span_remote,
tracer: self
)
else
TraceOperation.new(
hostname: hostname,
profiling_enabled: profiling_enabled,
remote_parent: false,
tracer: self
)
end
end
Expand All @@ -347,7 +360,6 @@ def bind_trace_events!(trace_op)
events.span_before_start.subscribe do |event_span_op, event_trace_op|
event_trace_op.service ||= @default_service
event_span_op.service ||= @default_service
sample_trace(event_trace_op) if event_span_op && event_span_op.parent_id == 0
end

events.span_finished.subscribe do |event_span, event_trace_op|
Expand Down Expand Up @@ -463,17 +475,6 @@ def subscribe_trace_deactivation!(context, trace, original_trace)
end
end

# Sample a span, tagging the trace as appropriate.
def sample_trace(trace_op)
begin
@sampler.sample!(trace_op)
rescue StandardError => e
SAMPLE_TRACE_LOG_ONLY_ONCE.run do
Datadog.logger.warn { "Failed to sample trace: #{e.class.name} #{e} at #{Array(e.backtrace).first}" }
end
end
end

SAMPLE_TRACE_LOG_ONLY_ONCE = Core::Utils::OnlyOnce.new
private_constant :SAMPLE_TRACE_LOG_ONLY_ONCE

Expand All @@ -492,6 +493,7 @@ def sample_span(trace_op, span)

# Flush finished spans from the trace buffer, send them to writer.
def flush_trace(trace_op)
sample_trace(trace_op) unless trace_op.sampling_priority
begin
trace = @trace_flush.consume!(trace_op)
write(trace) if trace && !trace.empty?
Expand Down
87 changes: 86 additions & 1 deletion spec/datadog/tracing/integration_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,11 @@ def agent_receives_span_step3(previous_success)
@span = trace.spans[0]
end

tracer.trace('my.op').finish
tracer.trace('my.op', service: 'my.service') do |span|
span.set_tag('tag', 'tag_value')
span.set_tag('tag2', 'tag_value2')
span.resource = 'my.resource'
end

try_wait_until { tracer.writer.stats[:traces_flushed] >= 1 }

Expand Down Expand Up @@ -319,6 +323,87 @@ def agent_receives_span_step3(previous_success)
it_behaves_like 'sampling decision', '-3'
end

context 'with a matching resource name' do
include_context 'DD_TRACE_SAMPLING_RULES configuration' do
let(:rule) { { resource: 'my.resource', sample_rate: 1.0 } }
end

it_behaves_like 'flushed trace'
it_behaves_like 'priority sampled', Datadog::Tracing::Sampling::Ext::Priority::USER_KEEP
it_behaves_like 'rule sampling rate metric', 1.0
it_behaves_like 'rate limit metric', 1.0
it_behaves_like 'sampling decision', '-3'
end

context 'with a matching service name' do
include_context 'DD_TRACE_SAMPLING_RULES configuration' do
let(:rule) { { service: 'my.service', sample_rate: 1.0 } }
end

it_behaves_like 'flushed trace'
it_behaves_like 'priority sampled', Datadog::Tracing::Sampling::Ext::Priority::USER_KEEP
it_behaves_like 'rule sampling rate metric', 1.0
it_behaves_like 'rate limit metric', 1.0
it_behaves_like 'sampling decision', '-3'
end

context 'with matching tags' do
include_context 'DD_TRACE_SAMPLING_RULES configuration' do
let(:rule) { { tags: { tag: 'tag_value', tag2: 'tag_value2' }, sample_rate: 1.0 } }
end

it_behaves_like 'flushed trace'
it_behaves_like 'priority sampled', Datadog::Tracing::Sampling::Ext::Priority::USER_KEEP
it_behaves_like 'rule sampling rate metric', 1.0
it_behaves_like 'rate limit metric', 1.0
it_behaves_like 'sampling decision', '-3'
end

context 'with matching tags and matching service and matching resource' do
include_context 'DD_TRACE_SAMPLING_RULES configuration' do
let(:rule) do
{ resource: 'my.resource', service: 'my.service', tags: { tag: 'tag_value', tag2: 'tag_value2' },
sample_rate: 1.0 }
end
end

it_behaves_like 'flushed trace'
it_behaves_like 'priority sampled', Datadog::Tracing::Sampling::Ext::Priority::USER_KEEP
it_behaves_like 'rule sampling rate metric', 1.0
it_behaves_like 'rate limit metric', 1.0
it_behaves_like 'sampling decision', '-3'
end

context 'with not matching tags and matching service and matching resource' do
include_context 'DD_TRACE_SAMPLING_RULES configuration' do
let(:rule) do
{ resource: 'my.resource', service: 'my.service', tags: { tag: 'wrong_tag_value' },
sample_rate: 1.0 }
end
end

it_behaves_like 'flushed trace'
it_behaves_like 'priority sampled', Datadog::Tracing::Sampling::Ext::Priority::AUTO_KEEP
it_behaves_like 'rule sampling rate metric', nil # Rule is not applied
it_behaves_like 'rate limit metric', nil # Rate limiter is never reached, thus has no value to provide
it_behaves_like 'sampling decision', '-0'
end

context 'drop with matching tags and matching service and matching resource' do
include_context 'DD_TRACE_SAMPLING_RULES configuration' do
let(:rule) do
{ resource: 'my.resource', service: 'my.service', tags: { tag: 'tag_value' },
sample_rate: 0 }
end
end

it_behaves_like 'flushed trace'
it_behaves_like 'priority sampled', Datadog::Tracing::Sampling::Ext::Priority::USER_REJECT
it_behaves_like 'rule sampling rate metric', 0.0
it_behaves_like 'rate limit metric', nil # Rate limiter is never reached, thus has no value to provide
it_behaves_like 'sampling decision', nil
end

context 'with low sample rate' do
let(:rule) { Datadog::Tracing::Sampling::SimpleRule.new(sample_rate: Float::MIN) }

Expand Down
21 changes: 21 additions & 0 deletions spec/datadog/tracing/sampling/matcher_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,20 @@
let(:tags) { { 'metric1' => '1', 'metric2' => '3' } }

it { is_expected.to eq(false) }

context 'with a float that has a non-zero decimal' do
let(:tags) { { 'metric1' => '2*' } }
let(:trace_tags) { { 'metric1' => 20.1 } }

it { is_expected.to eq(false) }
end

context 'with a float that has a zero decimal' do
let(:tags) { { 'metric1' => '2*' } }
let(:trace_tags) { { 'metric1' => 20.0 } }

it { is_expected.to eq(true) }
end
end
end

Expand Down Expand Up @@ -151,6 +165,13 @@

it { is_expected.to eq(true) }
end

context 'when multiple *s are used' do
let(:trace_service) { 'hello_service' }
let(:service) { '***' }

it { is_expected.to eq(true) }
end
end

context 'with a resource matcher' do
Expand Down
67 changes: 67 additions & 0 deletions spec/datadog/tracing/trace_operation_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,22 @@
end

context 'given' do
context ':trace_operation_samples' do
let(:tracer) { instance_double(Datadog::Tracing::Tracer) }
let(:trace_op) { described_class.new(tracer: tracer) }

describe '#to_digest' do
before do
allow(tracer).to receive(:sample_trace)
end

it 'calls tracer.sample_trace' do
expect(tracer).to receive(:sample_trace).with(trace_op)
trace_op.to_digest
end
end
end

context ':agent_sample_rate' do
subject(:options) { { agent_sample_rate: agent_sample_rate } }
let(:agent_sample_rate) { 0.5 }
Expand Down Expand Up @@ -301,6 +317,57 @@
end
end

context 'when trace operation returns root span values as well' do
let(:options) { { tags: { ok: 'test' } } }
context 'for tags' do
it do
# When tags are added to the root span they should be accessible through the trace operation
span = trace_op.build_span('test', tags: { 'foo' => 'bar' })
span.start
expect(trace_op.get_tag('foo')).to eq('bar')
expect(trace_op.get_tag('ok')).to eq('test')
expect(trace_op.tags).to eq('foo' => 'bar', 'ok' => 'test')
span.finish
end

context 'trace operation tags take precedent over root span tags' do
it do
# When tags are added to the root span they should be accessible through the trace operation
span = trace_op.build_span('test', tags: { 'ok' => 'should_not_be' })
span.start
expect(trace_op.tags).to eq('ok' => 'test')
span.finish
end

context 'for metrics' do
let(:options) { { metrics: { metric1: 123 } } }
it do
# When tags are added to the root span they should be accessible through the trace operation
span = trace_op.build_span('test', tags: { 'metric2' => 456 })
span.start
expect(trace_op.get_metric('metric1')).to eq(123)
expect(trace_op.get_metric('metric2')).to eq(456)

span.finish
end
end

context 'for metrics override' do
let(:options) { { metrics: { metric1: 123 } } }

it do
# When tags are added to the root span they should be accessible through the trace operation
span = trace_op.build_span('test', tags: { 'metric1' => 456 })
span.start
expect(trace_op.get_metric('metric1')).to eq(123)
expect(trace_op.tags).to eq({ 'metric1' => 123 })
span.finish
end
end
end
end
end

context 'when :max_length is non-zero' do
let(:options) { { max_length: 3 } }

Expand Down
Loading

0 comments on commit 9e26192

Please sign in to comment.