Skip to content

Commit

Permalink
Merge pull request #3865 from DataDog/tonycthsu/telemetry_log_tracing
Browse files Browse the repository at this point in the history
Add telemetry logs for Tracing
  • Loading branch information
TonyCTHsu committed Sep 16, 2024
2 parents 9b198f9 + 257be68 commit 31a4c84
Show file tree
Hide file tree
Showing 20 changed files with 96 additions and 50 deletions.
7 changes: 7 additions & 0 deletions lib/datadog/core/metrics/client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

require_relative '../utils/time'
require_relative '../utils/only_once'
require_relative '../telemetry/logger'
require_relative '../configuration/ext'

require_relative 'ext'
Expand Down Expand Up @@ -100,6 +101,7 @@ def count(stat, value = nil, options = nil, &block)
Datadog.logger.error(
"Failed to send count stat. Cause: #{e.class.name} #{e.message} Source: #{Array(e.backtrace).first}"
)
Telemetry::Logger.report(e, description: 'Failed to send count stat')
end

def distribution(stat, value = nil, options = nil, &block)
Expand All @@ -113,6 +115,7 @@ def distribution(stat, value = nil, options = nil, &block)
Datadog.logger.error(
"Failed to send distribution stat. Cause: #{e.class.name} #{e.message} Source: #{Array(e.backtrace).first}"
)
Telemetry::Logger.report(e, description: 'Failed to send distribution stat')
end

def increment(stat, options = nil)
Expand All @@ -125,6 +128,7 @@ def increment(stat, options = nil)
Datadog.logger.error(
"Failed to send increment stat. Cause: #{e.class.name} #{e.message} Source: #{Array(e.backtrace).first}"
)
Telemetry::Logger.report(e, description: 'Failed to send increment stat')
end

def gauge(stat, value = nil, options = nil, &block)
Expand All @@ -138,6 +142,7 @@ def gauge(stat, value = nil, options = nil, &block)
Datadog.logger.error(
"Failed to send gauge stat. Cause: #{e.class.name} #{e.message} Source: #{Array(e.backtrace).first}"
)
Telemetry::Logger.report(e, description: 'Failed to send gauge stat')
end

def time(stat, options = nil)
Expand All @@ -153,9 +158,11 @@ def time(stat, options = nil)
distribution(stat, ((finished - start) * 1000), options)
end
rescue StandardError => e
# TODO: Likely to be redundant, since `distribution` handles its own errors.
Datadog.logger.error(
"Failed to send time stat. Cause: #{e.class.name} #{e.message} Source: #{Array(e.backtrace).first}"
)
Telemetry::Logger.report(e, description: 'Failed to send time stat')
end
end

Expand Down
2 changes: 2 additions & 0 deletions lib/datadog/core/remote/component.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def initialize(settings, capabilities, agent_settings)
@client.sync
@healthy ||= true
rescue Client::SyncError => e
# Transient errors due to network or agent. Logged the error but not via telemetry
Datadog.logger.error do
"remote worker client sync error: #{e.message} location: #{Array(e.backtrace).first}. skipping sync"
end
Expand All @@ -48,6 +49,7 @@ def initialize(settings, capabilities, agent_settings)
# negotiation object stores error logging state that should be reset.
negotiation = Negotiation.new(settings, agent_settings)

# Transient errors due to network or agent. Logged the error but not via telemetry
Datadog.logger.error do
"remote worker error: #{e.class.name} #{e.message} location: #{Array(e.backtrace).first}. "\
'reseting client state'
Expand Down
8 changes: 4 additions & 4 deletions lib/datadog/core/remote/negotiation.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def endpoint?(path)

if res.internal_error? && network_error?(res.error)
unless @logged[:agent_unreachable]
Datadog.logger.error { "agent unreachable: cannot negotiate #{path}" }
Datadog.logger.warn { "agent unreachable: cannot negotiate #{path}" }
@logged[:agent_unreachable] = true
end

Expand All @@ -29,7 +29,7 @@ def endpoint?(path)

if res.not_found?
unless @logged[:no_info_endpoint]
Datadog.logger.error { "agent reachable but has no /info endpoint: cannot negotiate #{path}" }
Datadog.logger.warn { "agent reachable but has no /info endpoint: cannot negotiate #{path}" }
@logged[:no_info_endpoint] = true
end

Expand All @@ -38,7 +38,7 @@ def endpoint?(path)

unless res.ok?
unless @logged[:unexpected_response]
Datadog.logger.error { "agent reachable but unexpected response: cannot negotiate #{path}" }
Datadog.logger.warn { "agent reachable but unexpected response: cannot negotiate #{path}" }
@logged[:unexpected_response] = true
end

Expand All @@ -47,7 +47,7 @@ def endpoint?(path)

unless res.endpoints.include?(path)
unless @logged[:no_config_endpoint]
Datadog.logger.error { "agent reachable but does not report #{path}" }
Datadog.logger.warn { "agent reachable but does not report #{path}" }
@logged[:no_config_endpoint] = true
end

Expand Down
2 changes: 1 addition & 1 deletion lib/datadog/core/runtime/metrics.rb
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def gc_metrics
def try_flush
yield
rescue StandardError => e
Datadog.logger.error("Error while sending runtime metric. Cause: #{e.class.name} #{e.message}")
Datadog.logger.warn("Error while sending runtime metric. Cause: #{e.class.name} #{e.message}")
end

def default_metric_options
Expand Down
3 changes: 3 additions & 0 deletions lib/datadog/core/telemetry/event.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# frozen_string_literal: true

require_relative '../utils/forking'
require_relative '../utils/sequence'

module Datadog
module Core
module Telemetry
Expand Down
2 changes: 2 additions & 0 deletions lib/datadog/opentelemetry/sdk/propagator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def inject(
setter: ::OpenTelemetry::Context::Propagation.text_map_setter
)
unless setter == ::OpenTelemetry::Context::Propagation.text_map_setter
# PENDING: Not to report telemetry logs for now
Datadog.logger.error(
'Custom setter is not supported. Please inform the `datadog` team at ' \
' https://github.com/DataDog/dd-trace-rb of your use case so we can best support you. Using the default ' \
Expand All @@ -31,6 +32,7 @@ def extract(
)
if getter != ::OpenTelemetry::Context::Propagation.text_map_getter &&
getter != ::OpenTelemetry::Common::Propagation.rack_env_getter
# PENDING: Not to report telemetry logs for now
Datadog.logger.error(
"Custom getter #{getter} is not supported. Please inform the `datadog` team at " \
' https://github.com/DataDog/dd-trace-rb of your use case so we can best support you. Using the default ' \
Expand Down
6 changes: 6 additions & 0 deletions lib/datadog/tracing/distributed/propagation.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
require_relative '../configuration/ext'
require_relative '../trace_digest'
require_relative '../trace_operation'
require_relative '../../core/telemetry/logger'

module Datadog
module Tracing
Expand Down Expand Up @@ -72,6 +73,10 @@ def inject!(digest, data)
::Datadog.logger.error(
"Error injecting distributed trace data. Cause: #{e} Location: #{Array(e.backtrace).first}"
)
::Datadog::Core::Telemetry::Logger.report(
e,
description: "Error injecting distributed trace data with #{propagator.class.name}"
)
end

result
Expand Down Expand Up @@ -127,6 +132,7 @@ def extract(data)
)
end
rescue => e
# TODO: Not to report Telemetry logs for now
::Datadog.logger.error(
"Error extracting distributed trace data. Cause: #{e} Location: #{Array(e.backtrace).first}"
)
Expand Down
2 changes: 1 addition & 1 deletion lib/datadog/tracing/sampling/rate_sampler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def initialize(sample_rate = 1.0, decision: nil)
super()

unless sample_rate >= 0.0 && sample_rate <= 1.0
Datadog.logger.error('sample rate is not between 0 and 1, falling back to 1')
Datadog.logger.warn('sample rate is not between 0 and 1, falling back to 1')
sample_rate = 1.0
end

Expand Down
2 changes: 2 additions & 0 deletions lib/datadog/tracing/sampling/rule.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

require_relative 'matcher'
require_relative 'rate_sampler'
require_relative '../../core/telemetry/logger'

module Datadog
module Tracing
Expand Down Expand Up @@ -35,6 +36,7 @@ def match?(trace)
Datadog.logger.error(
"Matcher failed. Cause: #{e.class.name} #{e.message} Source: #{Array(e.backtrace).first}"
)
Datadog::Core::Telemetry::Logger.report(e, description: 'Matcher failed')
nil
end

Expand Down
6 changes: 5 additions & 1 deletion lib/datadog/tracing/sampling/rule_sampler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
require_relative 'ext'
require_relative '../../core/rate_limiter'
require_relative 'rule'
require_relative '../../core/telemetry/logger'

module Datadog
module Tracing
Expand Down Expand Up @@ -80,9 +81,10 @@ def self.parse(rules, rate_limit, default_sample_rate)

new(parsed_rules, rate_limit: rate_limit, default_sample_rate: default_sample_rate)
rescue => e
Datadog.logger.error do
Datadog.logger.warn do
"Could not parse trace sampling rules '#{rules}': #{e.class.name} #{e.message} at #{Array(e.backtrace).first}"
end

nil
end

Expand Down Expand Up @@ -140,6 +142,8 @@ def sample_trace(trace)
Datadog.logger.error(
"Rule sampling failed. Cause: #{e.class.name} #{e.message} Source: #{Array(e.backtrace).first}"
)
Datadog::Core::Telemetry::Logger.report(e, description: 'Rule sampling failed')

yield(trace)
end

Expand Down
1 change: 1 addition & 0 deletions lib/datadog/tracing/transport/http/client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def send_request(request, &block)
if stats.consecutive_errors > 0
Datadog.logger.debug(message)
else
# Not to report telemetry logs
Datadog.logger.error(message)
end

Expand Down
1 change: 1 addition & 0 deletions lib/datadog/tracing/transport/io/client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def send_request(request)
if stats.consecutive_errors > 0
Datadog.logger.debug(message)
else
# Not to report telemetry logs
Datadog.logger.error(message)
end

Expand Down
2 changes: 1 addition & 1 deletion lib/datadog/tracing/workers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def callback_traces
# ensures that the thread will not die because of an exception.
# TODO[manu]: findout the reason and reschedule the send if it's not
# a fatal exception
Datadog.logger.error(
Datadog.logger.warn(
"Error during traces flush: dropped #{traces.length} items. Cause: #{e} Location: #{Array(e.backtrace).first}"
)
end
Expand Down
2 changes: 1 addition & 1 deletion lib/datadog/tracing/workers/trace_writer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def write_traces(traces)
traces = process_traces(traces)
flush_traces(traces)
rescue StandardError => e
Datadog.logger.error(
Datadog.logger.warn(
"Error while writing traces: dropped #{traces.length} items. Cause: #{e} Location: #{Array(e.backtrace).first}"
)
end
Expand Down
58 changes: 31 additions & 27 deletions spec/datadog/core/metrics/client_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,16 @@

it { is_expected.to have_attributes(statsd: statsd) }

shared_examples_for 'missing value arg' do
it 'logs an error without raising' do
expect(Datadog.logger).to receive(:error)
shared_examples_for 'logs an error without raising' do |action|
it do
expect(Datadog.logger).to receive(:error).with(
/Failed to send #{action} stat/
)
expect(Datadog::Core::Telemetry::Logger).to receive(:report).with(
a_kind_of(StandardError),
description: "Failed to send #{action} stat"
)

expect { subject }.to_not raise_error
end
end
Expand Down Expand Up @@ -383,7 +390,7 @@
context 'that does not yield args' do
subject(:count) { metrics.count(stat) {} }

it_behaves_like 'missing value arg'
it_behaves_like 'logs an error without raising', 'count'
end

context 'that yields args' do
Expand Down Expand Up @@ -433,12 +440,9 @@
end

context 'which raises an error' do
before do
expect(statsd).to receive(:count).and_raise(StandardError)
expect(Datadog.logger).to receive(:error)
end
before { allow(statsd).to receive(:count).and_raise(StandardError) }

it { expect { count }.to_not raise_error }
it_behaves_like 'logs an error without raising', 'count'
end
end
end
Expand All @@ -464,7 +468,7 @@
context 'that does not yield args' do
subject(:distribution) { metrics.distribution(stat) {} }

it_behaves_like 'missing value arg'
it_behaves_like 'logs an error without raising', 'distribution'
end

context 'that yields args' do
Expand Down Expand Up @@ -514,12 +518,9 @@
end

context 'which raises an error' do
before do
expect(statsd).to receive(:distribution).and_raise(StandardError)
expect(Datadog.logger).to receive(:error)
end
before { allow(statsd).to receive(:distribution).and_raise(StandardError) }

it { expect { distribution }.to_not raise_error }
it_behaves_like 'logs an error without raising', 'distribution'
end
end
end
Expand All @@ -545,7 +546,7 @@
context 'that does not yield args' do
subject(:gauge) { metrics.gauge(stat) {} }

it_behaves_like 'missing value arg'
it_behaves_like 'logs an error without raising', 'gauge'
end

context 'that yields args' do
Expand Down Expand Up @@ -595,12 +596,9 @@
end

context 'which raises an error' do
before do
expect(statsd).to receive(:gauge).and_raise(StandardError)
expect(Datadog.logger).to receive(:error)
end
before { allow(statsd).to receive(:gauge).and_raise(StandardError) }

it { expect { gauge }.to_not raise_error }
it_behaves_like 'logs an error without raising', 'gauge'
end
end
end
Expand All @@ -622,6 +620,12 @@

context 'when #statsd is a Datadog::Statsd' do
context 'and given a block' do
context 'when raise exception' do
subject(:increment) { metrics.increment(stat) { raise } }

it_behaves_like 'logs an error without raising', 'increment'
end

context 'that yields args' do
subject(:increment) { metrics.increment(stat) { stat_options } }

Expand Down Expand Up @@ -676,12 +680,9 @@
end

context 'which raises an error' do
before do
expect(statsd).to receive(:increment).and_raise(StandardError)
expect(Datadog.logger).to receive(:error)
end
before { allow(statsd).to receive(:increment).and_raise(StandardError) }

it { expect { increment }.to_not raise_error }
it_behaves_like 'logs an error without raising', 'increment'
end
end
end
Expand Down Expand Up @@ -711,7 +712,10 @@
let(:error) { RuntimeError.new }
# Expect the given block to raise its errors through

it { expect { time }.to raise_error(error) }
it do
expect(Datadog.logger).not_to receive(:error)
expect { time }.to raise_error(error)
end
end
end

Expand Down
Loading

0 comments on commit 31a4c84

Please sign in to comment.