Skip to content

Commit 8e2868a

Browse files
authored
feat: Close anomalous spans in Datadog middleware (#798)
Also, add an additional assertion in `run_middleware` to ensure that the middleware hasn't silently entered an error state. (Helped me debug a case where I had forgotten to fake a Waffle flag.)
1 parent 25aed4b commit 8e2868a

File tree

4 files changed

+166
-5
lines changed

4 files changed

+166
-5
lines changed

CHANGELOG.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@ Change Log
1414
Unreleased
1515
~~~~~~~~~~
1616

17+
[4.5.0] - 2024-09-19
18+
~~~~~~~~~~~~~~~~~~~~
19+
Added
20+
-----
21+
* Datadog diagnostics middleware can now attempt to close anomalous spans. Can be enabled via Waffle flag ``datadog.diagnostics.close_anomalous_spans`` (controlled separately from logging feature).
22+
1723
[4.4.0] - 2024-09-10
1824
~~~~~~~~~~~~~~~~~~~~
1925
Changed

edx_arch_experiments/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22
A plugin to include applications under development by the architecture team at 2U.
33
"""
44

5-
__version__ = '4.4.0'
5+
__version__ = '4.5.0'

edx_arch_experiments/datadog_diagnostics/middleware.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,16 @@
2727
# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692
2828
DETECT_ANOMALOUS_TRACE = WaffleFlag('datadog.diagnostics.detect_anomalous_trace', module_name=__name__)
2929

30+
# .. toggle_name: datadog.diagnostics.close_anomalous_spans
31+
# .. toggle_implementation: WaffleFlag
32+
# .. toggle_default: False
33+
# .. toggle_description: Close anomalous spans that are ancestors of the django.request span.
34+
# .. toggle_use_cases: temporary
35+
# .. toggle_creation_date: 2024-09-19
36+
# .. toggle_target_removal_date: 2024-12-01
37+
# .. toggle_tickets: https://github.com/edx/edx-arch-experiments/issues/692
38+
CLOSE_ANOMALOUS_SPANS = WaffleFlag('datadog.diagnostics.close_anomalous_spans', module_name=__name__)
39+
3040
# .. toggle_name: datadog.diagnostics.log_root_span
3141
# .. toggle_implementation: WaffleFlag
3242
# .. toggle_default: False
@@ -76,6 +86,8 @@ def __call__(self, request):
7686
def process_view(self, request, _view_func, _view_args, _view_kwargs):
7787
try:
7888
self.log_diagnostics(request)
89+
if CLOSE_ANOMALOUS_SPANS.is_enabled():
90+
self.close_anomalous_spans(request)
7991
except BaseException as e:
8092
# If there's an error, it will probably hit every request,
8193
# so let's just log it once.
@@ -86,6 +98,57 @@ def process_view(self, request, _view_func, _view_args, _view_kwargs):
8698
f"(suppressing further errors): {e!r}"
8799
)
88100

101+
# pylint: disable=protected-access
102+
def close_anomalous_spans(self, request):
    """
    Detect anomalous spans and close them.

    This closes any open spans that are ancestors of the current
    request's ``django.request`` span. The trace will still have two
    requests concatenated together, but the problematic spans should
    not affect future requests.

    Only activates if the root span is itself closed, which is a
    cheap thing to check. Does nothing when there is no active trace.

    Arguments:
        request: The current Django request (not used by this method).
    """
    root_span = self.dd_tracer.current_root_span()
    # No active trace at all (no root span), or the root span is still
    # open: probably not an anomalous trace.
    if root_span is None or root_span.duration is None:
        return  # nothing to do!

    # Walk upwards until we find the django.request span, stopping if
    # we run off the top of the trace (or there is no current span).
    walk_span = self.dd_tracer.current_span()
    while walk_span is not None and walk_span.name != 'django.request':
        walk_span = walk_span._parent

    if walk_span is None:
        # If we can't find the django.request root, there's
        # something bad about our assumptions and we should
        # not attempt a fix.
        log.error(
            "Did not find django.request span when walking anomalous trace "
            "to root. Not attempting a fix."
        )
        return

    # Go "above" the request span
    walk_span = walk_span._parent

    # Now close everything above the current request span that's
    # still open, logging as we go.
    while walk_span is not None:
        # We call finish() individually rather than
        # finish_with_ancestors() because this gives us a chance
        # to log each one.
        if walk_span.duration is None:
            walk_span.finish()
            log.info(
                f"Closed span in anomalous trace: name={walk_span.name} "
                f"id={walk_span.span_id:x} trace={walk_span.trace_id:x}"
            )
        # Keep walking up even if we discover closed spans; we've
        # previously seen multiple contiguous segments of open
        # spans separated by closed ones.
        walk_span = walk_span._parent
151+
89152
def log_diagnostics(self, request):
90153
"""
91154
Contains all the actual logging logic.

edx_arch_experiments/datadog_diagnostics/tests/test_middleware.py

Lines changed: 96 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
"""
44

55
import re
6+
from contextlib import ExitStack
67
from unittest.mock import Mock, patch
78

89
import ddt
910
import ddtrace
1011
from django.test import TestCase, override_settings
1112

12-
from ..middleware import DETECT_ANOMALOUS_TRACE, LOG_ROOT_SPAN, DatadogDiagnosticMiddleware
13+
from ..middleware import CLOSE_ANOMALOUS_SPANS, DETECT_ANOMALOUS_TRACE, LOG_ROOT_SPAN, DatadogDiagnosticMiddleware
1314

1415

1516
def fake_view(_request):
@@ -24,7 +25,7 @@ def make_middleware(self):
2425
"""Make an instance of the middleware with current settings."""
2526
return DatadogDiagnosticMiddleware(fake_view)
2627

27-
def run_middleware(self, middleware=None):
28+
def run_middleware(self, middleware=None, check_error_state=True):
2829
"""Run the middleware using a fake request."""
2930
if middleware is None:
3031
middleware = self.make_middleware()
@@ -36,6 +37,9 @@ def run_middleware(self, middleware=None):
3637

3738
middleware.process_view(request, None, None, None)
3839

40+
if check_error_state:
41+
assert middleware.error is False
42+
3943
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error')
4044
def test_log_diagnostics_error_only_once(self, mock_log_error):
4145
"""
@@ -48,8 +52,9 @@ def test_log_diagnostics_error_only_once(self, mock_log_error):
4852
bad_method = Mock(side_effect=lambda request: 1/0)
4953
middleware.log_diagnostics = bad_method
5054

51-
self.run_middleware(middleware)
52-
self.run_middleware(middleware)
55+
self.run_middleware(middleware, check_error_state=False)
56+
self.run_middleware(middleware, check_error_state=False)
57+
assert middleware.error is True
5358

5459
# Called twice
5560
assert len(bad_method.call_args_list) == 2
@@ -74,6 +79,7 @@ def test_log_diagnostics_error_only_once(self, mock_log_error):
7479
def test_anomalous_trace(self, enabled, cause_anomaly, mock_log_warning):
7580
with (
7681
patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=enabled),
82+
patch.object(CLOSE_ANOMALOUS_SPANS, 'is_enabled', return_value=False),
7783
patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False),
7884
# Need at least two levels of spans in order to fake
7985
# an anomaly. (Otherwise current_root_span returns None.)
@@ -108,6 +114,7 @@ def test_anomalous_trace_truncation(self, mock_log_warning):
108114
"""
109115
with (
110116
patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=True),
117+
patch.object(CLOSE_ANOMALOUS_SPANS, 'is_enabled', return_value=False),
111118
patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=False),
112119
# Need at least two levels of spans in order to fake
113120
# an anomaly. (Otherwise current_root_span returns None.)
@@ -134,6 +141,7 @@ def test_anomalous_trace_truncation(self, mock_log_warning):
134141
def test_log_root_span(self, mock_log_info):
135142
with (
136143
patch.object(DETECT_ANOMALOUS_TRACE, 'is_enabled', return_value=False),
144+
patch.object(CLOSE_ANOMALOUS_SPANS, 'is_enabled', return_value=False),
137145
patch.object(LOG_ROOT_SPAN, 'is_enabled', return_value=True),
138146
# Need at least two levels of spans for interesting logging
139147
ddtrace.tracer.trace("local_root"),
@@ -149,3 +157,87 @@ def test_log_root_span(self, mock_log_info):
149157
r"current span = name='inner_span' .*",
150158
log_msg
151159
)
160+
161+
def run_close_with(self, *, enabled, anomalous, ancestors=None):
    """
    Drive the middleware through a "close anomalous spans" scenario.

    Arguments:
        enabled: Value the CLOSE_ANOMALOUS_SPANS flag should report.
        anomalous: When true, finish the root span before running the
            middleware so the trace looks anomalous.
        ancestors: Span operation names to open, outermost first. A
            default chain (request/view, celery, request/view) is used
            when not supplied.
    """
    span_names = ancestors
    if span_names is None:
        span_names = [
            'django.request', 'django.view',
            'celery.apply',
            # ^ will need to close some of these
            'django.request', 'django.view',
        ]

    # Only the close-spans flag varies; the two logging flags stay off.
    flag_states = (
        (DETECT_ANOMALOUS_TRACE, False),
        (CLOSE_ANOMALOUS_SPANS, enabled),
        (LOG_ROOT_SPAN, False),
    )

    # A single stack holds the flag patches first and then the nested
    # spans, so everything unwinds in reverse order on exit.
    with ExitStack() as stack:
        for flag, state in flag_states:
            stack.enter_context(patch.object(flag, 'is_enabled', return_value=state))
        for span_name in span_names:
            stack.enter_context(ddtrace.tracer.trace(span_name))

        # make anomaly readily detectable
        if anomalous:
            ddtrace.tracer.current_root_span().finish()

        self.run_middleware()
188+
189+
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info')
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error')
def test_close_disabled(self, mock_log_error, mock_log_info):
    """
    With the close-spans flag off, an anomalous trace produces no error or info logs.
    """
    self.run_close_with(enabled=False, anomalous=True)

    for log_mock in (mock_log_error, mock_log_info):
        log_mock.assert_not_called()
199+
200+
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info')
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error')
def test_close_applied(self, mock_log_error, mock_log_info):
    """
    With the flag on and an anomalous trace, open spans above the request get closed.
    """
    self.run_close_with(enabled=True, anomalous=True)

    mock_log_error.assert_not_called()

    # Expect to close celery.apply and the one above it (but we've
    # already closed the root, above).
    info_calls = mock_log_info.call_args_list
    assert len(info_calls) == 2

    # Compare only the part of each message before the span id, since
    # the ids vary from run to run.
    message_prefixes = []
    for info_call in info_calls:
        message = info_call.args[0]
        message_prefixes.append(message.split(' id=')[0])

    assert message_prefixes == [
        "Closed span in anomalous trace: name=celery.apply",
        "Closed span in anomalous trace: name=django.view",
    ]
217+
218+
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info')
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error')
def test_close_not_needed(self, mock_log_error, mock_log_info):
    """
    With the flag on but no anomalous trace, nothing is logged.
    """
    self.run_close_with(enabled=True, anomalous=False)

    for log_mock in (mock_log_error, mock_log_info):
        log_mock.assert_not_called()
228+
229+
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.info')
@patch('edx_arch_experiments.datadog_diagnostics.middleware.log.error')
def test_close_missing_request(self, mock_log_error, mock_log_info):
    """
    Without a django.request ancestor, an error is logged and no span is closed.
    """
    # Artificial scenario standing in for something unexpected.
    unexpected_ancestors = ['django.view', 'celery.apply', 'django.view']

    self.run_close_with(enabled=True, anomalous=True, ancestors=unexpected_ancestors)

    expected_error = "Did not find django.request span when walking anomalous trace to root. Not attempting a fix."
    mock_log_error.assert_called_once_with(expected_error)
    mock_log_info.assert_not_called()

0 commit comments

Comments
 (0)