Skip to content

Commit 4e4c93f

Browse files
authored
fix(tornado): match http routes in correct order and missing routes certain regex patterns (#17515)
## Description Two fixes to `_find_route`, the helper that determines the `http.route` tag for Tornado requests. **1. Route matching order in nested applications** `_find_route` pushes sub-rules of a nested `Application` onto the front of the processing deque with `extendleft`. Because `extendleft` reverses the input sequence as it inserts, the routes were processed in reverse declaration order — a catch-all defined after a specific route would incorrectly win. Fix: `extendleft(reversed(rule.target.rules))`. **2. `http.route` missing for complex regex patterns** Tornado's `PathMatches._path` is `None` whenever the route regex cannot be reversed (e.g. patterns containing non-capturing groups `(?:...)`, lookaheads, or lookbehinds). Previously `http.route` was absent for those routes. This adds `_regex_to_route`, a `lru_cache`-backed helper that converts any regex pattern to a route string: capturing groups become `%s` (consistent with Tornado's own format), non-capturing constructs are kept verbatim. Both issues were surfaced by Streamlit, which mounts a nested Tornado application with non-trivial route patterns. **3. Endpoint discovery at startup used to send the wrong path format `_collect_endpoints` previously sent the regex pattern as the route which does not match the `http.route` behavior. The discrepancy leads to duplicated endpoints in the Datadog UI. ## Testing - `test_nested_application_route_order`: nested app with a specific route before a catch-all; asserts the specific route wins. Fails without fix 1. - `test_complex_pattern_route`: covers a pure non-capturing group route and a mixed non-capturing + capturing group route; asserts `http.route` is populated and capturing groups are rendered as `%s`. Fails without fix 2. - `test_regex_to_route`: parametrised unit tests for `_regex_to_route` in isolation, covering anchors, positional/named capturing groups, non-capturing groups, lookaheads, nested groups, and escaped parens. ## Risks None. Both changes only affect route tag computation; request dispatch is handled by Tornado independently. - This changes only adds `http.route` values for cases that were previously not supported. ## Additional Notes - Slightly refactored the appsec telemetry tests to allow for the fact that tornado does not support typed path parameters in the route. Co-authored-by: florentin.labelle <florentin.labelle@datadoghq.com>
1 parent 8bcbdba commit 4e4c93f

File tree

11 files changed

+395
-44
lines changed

11 files changed

+395
-44
lines changed

ddtrace/contrib/internal/tornado/application.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
from urllib.parse import urlparse
22

33
from tornado import template
4+
from tornado.routing import PathMatches
45
import tornado.web
56

67
from ddtrace import config
78
from ddtrace._trace.pin import Pin
89
from ddtrace.contrib.internal.tornado import decorators
910
from ddtrace.contrib.internal.tornado.constants import CONFIG_KEY
11+
from ddtrace.contrib.internal.tornado.handlers import _path_for_path_match
1012
from ddtrace.contrib.internal.tornado.stack_context import context_provider
1113
from ddtrace.internal.endpoints import endpoint_collection
1214
from ddtrace.internal.schema import schematize_service_name
@@ -33,8 +35,7 @@ def _collect_endpoints(app):
3335
while rules:
3436
rule = rules.pop()
3537
matcher = getattr(rule, "matcher", None)
36-
regex = getattr(matcher, "regex", None) if matcher is not None else None
37-
path = regex.pattern if regex is not None else None
38+
path = _path_for_path_match(matcher) if isinstance(matcher, PathMatches) else None
3839
target = getattr(rule, "target", None)
3940

4041
if path is not None and isinstance(target, type) and issubclass(target, tornado.web.RequestHandler):

ddtrace/contrib/internal/tornado/handlers.py

Lines changed: 120 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from collections import deque
2+
from functools import lru_cache
23

4+
from tornado.routing import PathMatches
35
from tornado.web import HTTPError
46

57
from ddtrace import config
@@ -104,6 +106,117 @@ async def execute(func, handler, args, kwargs):
104106
raise
105107

106108

109+
# Regex group prefixes that mark a group as non-capturing (kept verbatim in the route).
110+
# Full Python re syntax for groups starting with '(':
111+
# (?:...) non-capturing group
112+
# (?=...) positive lookahead
113+
# (?!...) negative lookahead
114+
# (?<=...) positive lookbehind
115+
# (?<!...) negative lookbehind
116+
# (?>...) atomic group
117+
# (?(...) conditional group
118+
# (?P=name) named backreference — NOT a capturing group despite starting with ?P
119+
#
120+
# Capturing groups — intentionally absent, they become %s in the route:
121+
# (...) positional capturing group
122+
# (?P<name>...) named capturing group
123+
#
124+
# Comments — handled separately in _regex_to_route, removed entirely from the route:
125+
# (?#...) inline comment — matches nothing, contributes nothing to the path
126+
_NON_CAPTURING_PREFIXES = ("?:", "?=", "?!", "?<=", "?<!", "?>", "?(", "?P=")
127+
128+
129+
@lru_cache(maxsize=512)
130+
def _regex_to_route(pattern: str) -> str:
131+
"""
132+
Convert a compiled regex pattern to an ``http.route``-style string.
133+
134+
This mirrors what Tornado's ``PathMatches._find_groups`` produces: capturing
135+
groups (positional ``(...)`` and named ``(?P<name>...)``) are replaced with
136+
``%s``, while non-capturing constructs (``(?:...)``, lookaheads, lookbehinds)
137+
are kept verbatim so the route remains readable.
138+
139+
Unlike ``_find_groups`` this never returns ``None`` — it always produces a
140+
usable route string regardless of pattern complexity.
141+
"""
142+
# Strip the anchors that Tornado's PathMatches.__init__ appends.
143+
if pattern.startswith("^"):
144+
pattern = pattern[1:]
145+
if pattern.endswith("$"):
146+
pattern = pattern[:-1]
147+
148+
result = []
149+
i = 0
150+
n = len(pattern)
151+
152+
while i < n:
153+
if pattern[i] == "\\":
154+
# Escaped character — copy both chars and move on.
155+
result.append(pattern[i : i + 2])
156+
i += 2
157+
continue
158+
159+
if pattern[i] == "[":
160+
# Character class — copy verbatim until the closing ']'.
161+
# Inside '[…]', '(' is literal, not a group boundary.
162+
j = i + 1
163+
if j < n and pattern[j] == "^":
164+
j += 1
165+
# A ']' immediately after '[' or '[^' is a literal ']', not the end.
166+
if j < n and pattern[j] == "]":
167+
j += 1
168+
while j < n and pattern[j] != "]":
169+
if pattern[j] == "\\":
170+
j += 2
171+
else:
172+
j += 1
173+
if j < n:
174+
j += 1 # skip the closing ']'
175+
result.append(pattern[i:j])
176+
i = j
177+
continue
178+
179+
if pattern[i] != "(":
180+
result.append(pattern[i])
181+
i += 1
182+
continue
183+
184+
# Peek past the opening parenthesis to decide whether this is capturing.
185+
suffix = pattern[i + 1 :]
186+
capturing = not any(suffix.startswith(p) for p in _NON_CAPTURING_PREFIXES)
187+
188+
# Find the matching closing paren, respecting nesting and escapes.
189+
depth = 1
190+
j = i + 1
191+
while j < n and depth > 0:
192+
if pattern[j] == "\\":
193+
j += 2
194+
elif pattern[j] == "(":
195+
depth += 1
196+
j += 1
197+
elif pattern[j] == ")":
198+
depth -= 1
199+
j += 1
200+
else:
201+
j += 1
202+
203+
if suffix.startswith("?#"):
204+
# Inline comment (?#...) — matches nothing, omit from route entirely.
205+
pass
206+
else:
207+
result.append("%s" if capturing else pattern[i:j])
208+
i = j
209+
210+
return "".join(result)
211+
212+
213+
def _path_for_path_match(matcher: PathMatches) -> str:
214+
if matcher._path is not None:
215+
return matcher._path
216+
217+
return _regex_to_route(matcher.regex.pattern)
218+
219+
107220
def _find_route(initial_rule_set, request):
108221
"""
109222
We have to walk through the same chain of rules that tornado does to find a matching rule.
@@ -115,11 +228,14 @@ def _find_route(initial_rule_set, request):
115228

116229
while len(rules) > 0:
117230
rule = rules.popleft()
118-
if (m := rule.matcher.match(request)) is not None:
119-
if hasattr(rule.matcher, "_path"):
120-
return rule.matcher._path, m.get("path_args", []) or m.get("path_kwargs", {})
231+
matcher = rule.matcher
232+
if (m := matcher.match(request)) is not None:
233+
if isinstance(matcher, PathMatches):
234+
path_args = m.get("path_args", []) or m.get("path_kwargs", {})
235+
return _path_for_path_match(matcher), path_args
236+
121237
elif hasattr(rule.target, "rules"):
122-
rules.extendleft(rule.target.rules)
238+
rules.extendleft(reversed(rule.target.rules))
123239

124240
return "^$", {}
125241

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
---
2+
fixes:
3+
- |
4+
tornado: Fixes an issue where routes inside a nested Tornado application were matched
5+
in reverse declaration order, causing a catch-all pattern to win over a more-specific
6+
route defined before it. This resulted in incorrect ``http.route`` tags on spans.
7+
- |
8+
tornado: The ``http.route`` tag is now populated for routes whose regex cannot be
9+
reversed by Tornado (e.g. patterns containing non-capturing groups such as
10+
``(?:a|b)``). Capturing groups are still rendered as ``%s``, consistent with
11+
Tornado's own route format, while non-capturing constructs are kept verbatim.

tests/appsec/contrib_appsec/test_django.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,28 @@ def location(self, response):
8585

8686

8787
class Test_Django(_Test_Django_Base, utils.Contrib_TestClass_For_Threats):
88-
pass
88+
ENDPOINT_DISCOVERY_EXPECTED_PATHS = {
89+
"^$",
90+
"asm/<int:param_int>/<str:param_str>",
91+
"^asm/?$",
92+
"new_service/<str:service_name>",
93+
"login",
94+
"login_sdk",
95+
"rasp/<str:endpoint>",
96+
}
97+
98+
@staticmethod
99+
def endpoint_path_to_uri(path: str) -> str:
100+
import re
101+
102+
# Django regex-style routes: ^asm/?$ → /asm
103+
if re.match(r"^\^.*\$$", path):
104+
path = path[1:-1]
105+
if path.endswith("/?"):
106+
path = path[:-2]
107+
path = re.sub(r"<int:[a-z_]+>", "123", path)
108+
path = re.sub(r"<str:[a-z_]+>", "abczx", path)
109+
return path if path.startswith("/") else ("/" + path)
89110

90111

91112
class Test_Django_RC(_Test_Django_Base, utils.Contrib_TestClass_For_Threats_RC):

tests/appsec/contrib_appsec/test_fastapi.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,23 @@ def location(self, response):
120120

121121

122122
class Test_FastAPI(_Test_FastAPI_Base, utils.Contrib_TestClass_For_Threats):
123-
pass
123+
ENDPOINT_DISCOVERY_EXPECTED_PATHS = {
124+
"/",
125+
"/asm/{param_int:int}/{param_str:str}",
126+
"/asm/",
127+
"/new_service/{service_name:str}",
128+
"/login/",
129+
"/login_sdk/",
130+
"/rasp/{endpoint:str}/",
131+
}
132+
133+
@staticmethod
134+
def endpoint_path_to_uri(path: str) -> str:
135+
import re
136+
137+
path = re.sub(r"\{[a-z_]+:int\}", "123", path)
138+
path = re.sub(r"\{[a-z_]+:str\}", "abczx", path)
139+
return path
124140

125141

126142
class Test_FastAPI_RC(_Test_FastAPI_Base, utils.Contrib_TestClass_For_Threats_RC):

tests/appsec/contrib_appsec/test_flask.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,23 @@ def location(self, response):
104104

105105

106106
class Test_Flask(_Test_Flask_Base, utils.Contrib_TestClass_For_Threats):
107-
pass
107+
ENDPOINT_DISCOVERY_EXPECTED_PATHS = {
108+
"/",
109+
"/asm/<int:param_int>/<string:param_str>",
110+
"/asm/",
111+
"/new_service/<string:service_name>",
112+
"/login",
113+
"/login_sdk",
114+
"/rasp/<string:endpoint>/",
115+
}
116+
117+
@staticmethod
118+
def endpoint_path_to_uri(path: str) -> str:
119+
import re
120+
121+
path = re.sub(r"<int:[a-z_]+>", "123", path)
122+
path = re.sub(r"<(str|string):[a-z_]+>", "abczx", path)
123+
return path
108124

109125

110126
class Test_Flask_RC(_Test_Flask_Base, utils.Contrib_TestClass_For_Threats_RC):

tests/appsec/contrib_appsec/test_tornado.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,24 @@ def location(self, response):
8989

9090

9191
class Test_Tornado(_Test_Tornado_Base, utils.Contrib_TestClass_For_Threats):
92-
pass
92+
ENDPOINT_DISCOVERY_EXPECTED_PATHS = {
93+
"/",
94+
"/asm/%s/%s/?",
95+
"/asm/?",
96+
"/new_service/%s/?",
97+
"/login/?",
98+
"/login_sdk/?",
99+
"/rasp/%s/?",
100+
}
101+
102+
@staticmethod
103+
def endpoint_path_to_uri(path: str) -> str:
104+
# Tornado uses %s for all capturing groups and does not have a notion of typed path parameters.
105+
# We substitute with "123" which satisfies both \d+ and [^/]+ patterns.
106+
path = path.replace("%s", "123")
107+
if path.endswith("/?"):
108+
path = path[:-2]
109+
return path if path.startswith("/") else ("/" + path)
93110

94111

95112
class Test_Tornado_RC(_Test_Tornado_Base, utils.Contrib_TestClass_For_Threats_RC):

tests/appsec/contrib_appsec/utils.py

Lines changed: 22 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import json
33
import sys
44
from typing import Any
5+
from typing import ClassVar
56
from typing import Generator
67
from urllib.parse import quote
78
from urllib.parse import urlencode
@@ -187,6 +188,18 @@ class Contrib_TestClass_For_Threats(_Contrib_TestClass_Base):
187188
Factorized test class for threats tests on all supported frameworks
188189
"""
189190

191+
# Expected ep.path values (as stored in the endpoint collection) that must
192+
# be discovered. Each child class must override with framework-specific routes.
193+
ENDPOINT_DISCOVERY_EXPECTED_PATHS: ClassVar[set[str]] = set()
194+
195+
@staticmethod
196+
def endpoint_path_to_uri(path: str) -> str:
197+
"""Convert an ep.path from the endpoint collection to a requestable URI.
198+
199+
Each framework test class must implement this with its own route format logic.
200+
"""
201+
raise NotImplementedError("Subclasses must implement endpoint_path_to_uri")
202+
190203
@pytest.mark.parametrize("asm_enabled", [True, False])
191204
def test_healthcheck(self, interface: Interface, get_entry_span_tag, asm_enabled: bool):
192205
# you can disable any test in a framework like that:
@@ -252,34 +265,9 @@ def test_api_endpoint_discovery(self, interface: Interface, find_resource):
252265
"""
253266
from ddtrace.internal.endpoints import endpoint_collection
254267

255-
def parse(path: str) -> str:
256-
import re
257-
258-
# substitutions to make a url path from route
259-
if re.match(r"^\^.*\$$", path):
260-
path = path[1:-1]
261-
path = re.sub(r"<int:[a-z_]+>|\{[a-z_]+:int\}", "123", path)
262-
path = re.sub(r"<(str|string):[a-z_]+>|\{[a-z_]+:str\}", "abczx", path)
263-
# Tornado regex patterns (from regex.pattern, ending with $)
264-
if path.endswith("$"):
265-
path = path[:-1]
266-
path = re.sub(r"\(\?P<[a-z_]+>\\d\+\)", "123", path)
267-
path = re.sub(r"\(\?P<[a-z_]+>\[[^\]]+\]\+?\)", "abczx", path)
268-
path = re.sub(r"\(\\d\+\)", "123", path)
269-
path = re.sub(r"\(\[[^\]]+\]\+?\)", "abczx", path)
270-
if path.endswith("/?"):
271-
path = path[:-2]
272-
return path if path.startswith("/") else ("/" + path)
273-
274-
must_found: set[str] = {
275-
"",
276-
"/asm/123/abczx",
277-
"/asm",
278-
"/new_service/abczx",
279-
"/login",
280-
"/login_sdk",
281-
"/rasp/abczx",
282-
}
268+
expected = self.ENDPOINT_DISCOVERY_EXPECTED_PATHS
269+
assert expected, "ENDPOINT_DISCOVERY_EXPECTED_PATHS must be set on the test class"
270+
283271
found: set[str] = set()
284272
with override_global_config(dict(_asm_enabled=True)):
285273
self.update_tracer(interface)
@@ -295,20 +283,20 @@ def parse(path: str) -> str:
295283
assert ep.operation_name
296284
if ep.method not in ("GET", "*", "POST") or ep.path.startswith("/static"):
297285
continue
298-
path = parse(ep.path)
299-
found.add(path.rstrip("/"))
286+
found.add(ep.path)
287+
uri = self.endpoint_path_to_uri(ep.path)
300288
response = (
301-
interface.client.post(path, data=json.dumps({"data": "content"}), content_type="application/json")
289+
interface.client.post(uri, data=json.dumps({"data": "content"}), content_type="application/json")
302290
if ep.method == "POST"
303-
else interface.client.get(path)
291+
else interface.client.get(uri)
304292
)
305293
assert self.status(response) in (
306294
200,
307295
401,
308-
), f"ep.path failed: [{self.status(response)}] {ep.path} -> {path}"
296+
), f"ep.path failed: [{self.status(response)}] {ep.path} -> {uri}"
309297
resource = "GET" + ep.resource_name[1:] if ep.resource_name.startswith("* ") else ep.resource_name
310298
assert find_resource(resource)
311-
assert must_found <= found
299+
assert expected <= found, f"missing paths: {expected - found}"
312300

313301
@pytest.mark.parametrize("asm_enabled", [True, False])
314302
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)