Skip to content

Commit 976eef3

Browse files
authored
fix(langchain): detach existing SpanHolder token before overwrite in _create_llm_span (#3958)
1 parent b80f109 commit 976eef3

2 files changed

Lines changed: 192 additions & 0 deletions

File tree

packages/opentelemetry-instrumentation-langchain/opentelemetry/instrumentation/langchain/callback_handler.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,16 @@ def _create_llm_span(
438438
workflow_name = self.get_workflow_name(parent_run_id)
439439
entity_path = self.get_entity_path(parent_run_id)
440440

441+
# Capture and detach the old holder BEFORE _create_span runs.
442+
# _create_span unconditionally overwrites self.spans[run_id], which
443+
# would make the old SpanHolder (and its suppression token) unreachable.
444+
# Detaching here, before the overwrite, ensures we clean up the original
445+
# suppression token from a duplicate run_id and that the new suppression
446+
# is layered on top of the clean baseline context.
447+
old_holder = self.spans.get(run_id)
448+
if old_holder is not None and old_holder.token is not None:
449+
self._safe_detach_context(old_holder.token)
450+
441451
span = self._create_span(
442452
run_id,
443453
parent_run_id,
@@ -462,6 +472,15 @@ def _create_llm_span(
462472
span, GenAIAttributes.GEN_AI_OPERATION_NAME, operation_name
463473
)
464474

475+
# _create_span has already attached the span context and stored a new
476+
# SpanHolder(span, span_token, ...). Detach that span_token before
477+
# attaching suppression so the suppression layers on top of the clean
478+
# baseline context. If we detached after attaching suppression,
479+
# ContextVar.reset() would wipe the suppression flag.
480+
current_holder = self.spans.get(run_id)
481+
if current_holder is not None and current_holder.token is not None:
482+
self._safe_detach_context(current_holder.token)
483+
465484
# we already have an LLM span by this point,
466485
# so skip any downstream instrumentation from here
467486
try:
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
"""
2+
Unit tests for TraceloopCallbackHandler OTel context-token lifecycle.
3+
4+
These tests do NOT use VCR cassettes; they exercise context-management
5+
logic in isolation by calling internal methods directly.
6+
7+
No real HTTP is performed — the Tracer is backed by an InMemorySpanExporter.
8+
"""
9+
from unittest.mock import MagicMock
10+
from uuid import uuid4
11+
12+
import pytest
13+
from opentelemetry import context as context_api
14+
from opentelemetry.sdk.trace import TracerProvider
15+
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
16+
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
17+
from opentelemetry.semconv_ai import (
18+
SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY,
19+
LLMRequestTypeValues,
20+
)
21+
22+
from opentelemetry.instrumentation.langchain.callback_handler import (
23+
TraceloopCallbackHandler,
24+
)
25+
26+
27+
@pytest.fixture
def handler():
    """Build a TraceloopCallbackHandler backed by an in-memory exporter.

    No real HTTP is performed; spans are captured by InMemorySpanExporter.
    The duration/token histograms are mocked because these tests exercise
    only the OTel context-token lifecycle, not metrics emission.
    """
    exporter = InMemorySpanExporter()
    provider = TracerProvider()
    provider.add_span_processor(SimpleSpanProcessor(exporter))
    tracer = provider.get_tracer("test")
    return TraceloopCallbackHandler(
        tracer=tracer,
        duration_histogram=MagicMock(),
        token_histogram=MagicMock(),
    )
38+
39+
40+
@pytest.fixture(autouse=True)
def restore_otel_context():
    """Snapshot the OTel context before each test and restore it afterwards.

    context_api.attach(ctx) stores the *current* context in the returned
    token.  context_api.detach(token) calls ContextVar.reset(token), which
    unconditionally resets the ContextVar to the snapshot saved in the
    token — regardless of any intermediate attaches that happened while the
    test ran.  This ensures leaked tokens from a failing test cannot
    pollute subsequent tests.
    """
    restore_token = context_api.attach(context_api.get_current())
    yield
    context_api.detach(restore_token)
54+
55+
56+
def _suppression_active() -> bool:
    """Return True iff the LLM-suppression flag is set in the current OTel context."""
    return context_api.get_value(SUPPRESS_LANGUAGE_MODEL_INSTRUMENTATION_KEY) is True
58+
59+
60+
# ---------------------------------------------------------------------------
61+
# Ordering fix (commit 2) — normal single-call lifecycle
62+
# ---------------------------------------------------------------------------
63+
64+
def test_suppression_active_after_create_llm_span(handler):
    """After a normal _create_llm_span call the suppression flag must be set."""
    run_id = uuid4()
    assert not _suppression_active(), "precondition: no suppression before call"

    handler._create_llm_span(run_id, None, "gpt-4", LLMRequestTypeValues.CHAT)

    assert _suppression_active(), (
        "Suppression must be active so downstream OpenAI/Bedrock instrumentation "
        "is skipped for this LLM call."
    )

    # Clean up: end the span so its suppression token is detached and the
    # autouse context-restore fixture starts from a clean stack.
    span = handler.spans[run_id].span
    handler._end_span(span, run_id)
78+
79+
80+
def test_suppression_cleared_after_end_span(handler):
    """After _end_span the suppression flag must be gone (normal single-call lifecycle)."""
    run_id = uuid4()
    handler._create_llm_span(run_id, None, "gpt-4", LLMRequestTypeValues.CHAT)

    span = handler.spans[run_id].span
    handler._end_span(span, run_id)

    assert not _suppression_active(), (
        "Suppression must be cleared when the LLM span ends; otherwise every "
        "subsequent call in this thread/task is permanently suppressed."
    )
92+
93+
94+
# ---------------------------------------------------------------------------
95+
# P2 — duplicate run_id leaks supp_token_1 (issue #3957)
96+
# ---------------------------------------------------------------------------
97+
98+
def test_duplicate_run_id_leaks_suppression_token(handler):
    """
    Regression test for issue #3957.

    When _create_llm_span is called a second time with the same run_id,
    _create_span (called first inside _create_llm_span) unconditionally
    overwrites self.spans[run_id] before _create_llm_span can read and
    detach the old SpanHolder's suppression token.

    Context-stack trace (each arrow = context_api.attach / detach)
    ---------------------------------------------------------------
    ctx_0        baseline (no suppression)
      ↓ _create_span (1st call): attach span_ctx_1
    span_ctx_1   {SPAN_KEY: span_1}        span_token_1 remembers ctx_0
      ↓ detach span_token_1 (commit-2 ordering fix)
    ctx_0
      ↓ attach suppression
    supp_ctx_1   {SUPPRESS: True}          supp_token_1 remembers ctx_0
      ← stored in SpanHolder; 1st _create_llm_span returns

      ↓ _create_span (2nd call, same run_id!): attach span_ctx_2
    span_ctx_2   {SUPPRESS: True, SPAN_KEY: span_2}  span_token_2 remembers supp_ctx_1
      _create_span writes self.spans[run_id] = SpanHolder(span_2, span_token_2)
      supp_token_1 is now GONE from self.spans — unreachable forever
      ↓ _create_llm_span reads existing_holder → finds SpanHolder(span_2, span_token_2)
      ↓ detach span_token_2
    supp_ctx_1   {SUPPRESS: True}          NOT ctx_0 — 1st suppression lives on
      ↓ attach new suppression
    supp_ctx_2   {SUPPRESS: True}          supp_token_2 remembers supp_ctx_1
      stored in SpanHolder; 2nd _create_llm_span returns

      ↓ _end_span: detach supp_token_2
    supp_ctx_1   {SUPPRESS: True}          ← BUG: supp_token_1 never detached

    Expected after _end_span: ctx_0 (no suppression)
    Actual after _end_span:   supp_ctx_1 ({SUPPRESS: True})

    Root cause
    ----------
    The fix must capture `existing_holder = self.spans.get(run_id)` BEFORE
    calling `_create_span`, because _create_span unconditionally overwrites
    self.spans[run_id] at line 334, destroying the reference to supp_token_1.

    Correct fix in _create_llm_span (before the _create_span call):

        old_holder = self.spans.get(run_id)
        if old_holder is not None and old_holder.token is not None:
            self._safe_detach_context(old_holder.token)

        span = self._create_span(run_id, ...)  # now safe to overwrite
    """
    run_id = uuid4()
    assert not _suppression_active(), "precondition: no suppression before test"

    # First call — normal; establishes supp_token_1
    handler._create_llm_span(run_id, None, "gpt-4", LLMRequestTypeValues.CHAT)
    assert _suppression_active(), "suppression active after 1st call (sanity)"

    # Second call with the SAME run_id — triggers issue #3957
    handler._create_llm_span(run_id, None, "gpt-4", LLMRequestTypeValues.CHAT)
    assert _suppression_active(), "suppression active after 2nd call (sanity)"

    # End the one surviving span (from the 2nd call)
    span = handler.spans[run_id].span
    handler._end_span(span, run_id)

    # After all spans are done the suppression MUST be cleared.
    #   P2 bug present → still True (supp_token_1 leaked, never detached)
    #   P2 bug fixed   → None (both tokens properly cleaned up)
    assert not _suppression_active(), (
        "Issue #3957 not fixed: suppression flag is still set after _end_span. "
        "supp_token_1 from the first _create_llm_span call was never detached because "
        "_create_span overwrites self.spans[run_id] before the old holder can be read. "
        "Fix: move `existing_holder = self.spans.get(run_id)` (and its detach) to "
        "BEFORE the `_create_span(...)` call in _create_llm_span."
    )

0 commit comments

Comments
 (0)