Switch to OpenTracing's ContextVarsScopeManager (#18849)

Switch to OpenTracing's `ContextVarsScopeManager` instead of our own
custom `LogContextScopeManager`.

This is now possible because the linked Twisted issue from the comment
in our custom `LogContextScopeManager` is resolved:
https://twistedmatrix.com/trac/ticket/10301

This PR is spawning from exploring different possibilities to solve the
`scope` loss problem I was encountering in
https://github.com/element-hq/synapse/pull/18804#discussion_r2268254424.
This appears to solve the problem and I've added the additional test
from there to this PR 
This commit is contained in:
Eric Eastwood 2025-08-27 11:41:00 -05:00 committed by GitHub
parent df2cfb3932
commit 27fc3389f3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 144 additions and 180 deletions

1
changelog.d/18849.misc Normal file
View File

@ -0,0 +1 @@
Switch to OpenTracing's `ContextVarsScopeManager` instead of our own custom `LogContextScopeManager`.

View File

@ -56,7 +56,6 @@ from twisted.internet import defer, threads
from twisted.python.threadpool import ThreadPool
if TYPE_CHECKING:
from synapse.logging.scopecontextmanager import _LogContextScope
from synapse.types import ISynapseReactor
logger = logging.getLogger(__name__)
@ -230,14 +229,13 @@ LoggingContextOrSentinel = Union["LoggingContext", "_Sentinel"]
class _Sentinel:
"""Sentinel to represent the root context"""
__slots__ = ["previous_context", "finished", "request", "scope", "tag"]
__slots__ = ["previous_context", "finished", "request", "tag"]
def __init__(self) -> None:
# Minimal set for compatibility with LoggingContext
self.previous_context = None
self.finished = False
self.request = None
self.scope = None
self.tag = None
def __str__(self) -> str:
@ -290,7 +288,6 @@ class LoggingContext:
"finished",
"request",
"tag",
"scope",
]
def __init__(
@ -311,7 +308,6 @@ class LoggingContext:
self.main_thread = get_thread_id()
self.request = None
self.tag = ""
self.scope: Optional["_LogContextScope"] = None
# keep track of whether we have hit the __exit__ block for this context
# (suggesting that the the thing that created the context thinks it should
@ -324,9 +320,6 @@ class LoggingContext:
# we track the current request_id
self.request = self.parent_context.request
# we also track the current scope:
self.scope = self.parent_context.scope
if request is not None:
# the request param overrides the request from the parent context
self.request = request

View File

@ -251,18 +251,17 @@ class _DummyTagNames:
try:
import opentracing
import opentracing.tags
from opentracing.scope_managers.contextvars import ContextVarsScopeManager
tags = opentracing.tags
except ImportError:
opentracing = None # type: ignore[assignment]
tags = _DummyTagNames # type: ignore[assignment]
ContextVarsScopeManager = None # type: ignore
try:
from jaeger_client import Config as JaegerConfig
from synapse.logging.scopecontextmanager import LogContextScopeManager
except ImportError:
JaegerConfig = None # type: ignore
LogContextScopeManager = None # type: ignore
try:
@ -484,7 +483,7 @@ def init_tracer(hs: "HomeServer") -> None:
config = JaegerConfig(
config=jaeger_config,
service_name=f"{hs.config.server.server_name} {instance_name_by_type}",
scope_manager=LogContextScopeManager(),
scope_manager=ContextVarsScopeManager(),
metrics_factory=PrometheusMetricsFactory(),
)

View File

@ -1,161 +0,0 @@
#
# This file is licensed under the Affero General Public License (AGPL) version 3.
#
# Copyright 2019 The Matrix.org Foundation C.I.C.
# Copyright (C) 2023 New Vector, Ltd
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# See the GNU Affero General Public License for more details:
# <https://www.gnu.org/licenses/agpl-3.0.html>.
#
# Originally licensed under the Apache License, Version 2.0:
# <http://www.apache.org/licenses/LICENSE-2.0>.
#
# [This file includes modifications made by New Vector Limited]
#
#
import logging
from typing import Optional
from opentracing import Scope, ScopeManager, Span
from synapse.logging.context import (
LoggingContext,
current_context,
nested_logging_context,
)
logger = logging.getLogger(__name__)
class LogContextScopeManager(ScopeManager):
"""
The LogContextScopeManager tracks the active scope in opentracing
by using the log contexts which are native to synapse. This is so
that the basic opentracing api can be used across twisted defereds.
It would be nice just to use opentracing's ContextVarsScopeManager,
but currently that doesn't work due to https://twistedmatrix.com/trac/ticket/10301.
"""
def __init__(self) -> None:
pass
@property
def active(self) -> Optional[Scope]:
"""
Returns the currently active Scope which can be used to access the
currently active Scope.span.
If there is a non-null Scope, its wrapped Span
becomes an implicit parent of any newly-created Span at
Tracer.start_active_span() time.
Return:
The Scope that is active, or None if not available.
"""
ctx = current_context()
return ctx.scope
def activate(self, span: Span, finish_on_close: bool) -> Scope:
"""
Makes a Span active.
Args
span: the span that should become active.
finish_on_close: whether Span should be automatically finished when
Scope.close() is called.
Returns:
Scope to control the end of the active period for
*span*. It is a programming error to neglect to call
Scope.close() on the returned instance.
"""
ctx = current_context()
if not ctx:
logger.error("Tried to activate scope outside of loggingcontext")
return Scope(None, span) # type: ignore[arg-type]
if ctx.scope is not None:
# start a new logging context as a child of the existing one.
# Doing so -- rather than updating the existing logcontext -- means that
# creating several concurrent spans under the same logcontext works
# correctly.
ctx = nested_logging_context("")
enter_logcontext = True
else:
# if there is no span currently associated with the current logcontext, we
# just store the scope in it.
#
# This feels a bit dubious, but it does hack around a problem where a
# span outlasts its parent logcontext (which would otherwise lead to
# "Re-starting finished log context" errors).
enter_logcontext = False
scope = _LogContextScope(self, span, ctx, enter_logcontext, finish_on_close)
ctx.scope = scope
if enter_logcontext:
ctx.__enter__()
return scope
class _LogContextScope(Scope):
"""
A custom opentracing scope, associated with a LogContext
* When the scope is closed, the logcontext's active scope is reset to None.
and - if enter_logcontext was set - the logcontext is finished too.
"""
def __init__(
self,
manager: LogContextScopeManager,
span: Span,
logcontext: LoggingContext,
enter_logcontext: bool,
finish_on_close: bool,
):
"""
Args:
manager:
the manager that is responsible for this scope.
span:
the opentracing span which this scope represents the local
lifetime for.
logcontext:
the log context to which this scope is attached.
enter_logcontext:
if True the log context will be exited when the scope is finished
finish_on_close:
if True finish the span when the scope is closed
"""
super().__init__(manager, span)
self.logcontext = logcontext
self._finish_on_close = finish_on_close
self._enter_logcontext = enter_logcontext
def __str__(self) -> str:
return f"Scope<{self.span}>"
def close(self) -> None:
active_scope = self.manager.active
if active_scope is not self:
logger.error(
"Closing scope %s which is not the currently-active one %s",
self,
active_scope,
)
if self._finish_on_close:
self.span.finish()
self.logcontext.scope = None
if self._enter_logcontext:
self.logcontext.__exit__(None, None, None)

View File

@ -19,7 +19,7 @@
#
#
from typing import Awaitable, cast
from typing import Awaitable, Dict, cast
from twisted.internet import defer
from twisted.internet.testing import MemoryReactorClock
@ -38,9 +38,11 @@ from synapse.logging.opentracing import (
from synapse.util import Clock
try:
from synapse.logging.scopecontextmanager import LogContextScopeManager
import opentracing
from opentracing.scope_managers.contextvars import ContextVarsScopeManager
except ImportError:
LogContextScopeManager = None # type: ignore
opentracing = None # type: ignore
ContextVarsScopeManager = None # type: ignore
try:
import jaeger_client
@ -54,9 +56,10 @@ from tests.unittest import TestCase
logger = logging.getLogger(__name__)
class LogContextScopeManagerTestCase(TestCase):
class TracingScopeTestCase(TestCase):
"""
Test logging contexts and active opentracing spans.
Test that our tracing machinery works well in a variety of situations (especially
with Twisted's runtime and deferreds).
There's casts throughout this from generic opentracing objects (e.g.
opentracing.Span) to the ones specific to Jaeger since they have additional
@ -64,7 +67,7 @@ class LogContextScopeManagerTestCase(TestCase):
opentracing backend is Jaeger.
"""
if LogContextScopeManager is None:
if opentracing is None:
skip = "Requires opentracing" # type: ignore[unreachable]
if jaeger_client is None:
skip = "Requires jaeger_client" # type: ignore[unreachable]
@ -74,7 +77,7 @@ class LogContextScopeManagerTestCase(TestCase):
# global variables that power opentracing. We create our own tracer instance
# and test with it.
scope_manager = LogContextScopeManager()
scope_manager = ContextVarsScopeManager()
config = jaeger_client.config.Config(
config={}, service_name="test", scope_manager=scope_manager
)
@ -208,6 +211,135 @@ class LogContextScopeManagerTestCase(TestCase):
[scopes[1].span, scopes[2].span, scopes[0].span],
)
def test_run_in_background_active_scope_still_available(self) -> None:
"""
Test that tasks running via `run_in_background` still have access to the
active tracing scope.
This is a regression test for a previous Synapse issue where the tracing scope
would `__exit__` and close before the `run_in_background` task completed and our
own previous custom `_LogContextScope.close(...)` would clear
`LoggingContext.scope` preventing further tracing spans from having the correct
parent.
"""
reactor = MemoryReactorClock()
clock = Clock(reactor)
scope_map: Dict[str, opentracing.Scope] = {}
async def async_task() -> None:
root_scope = scope_map["root"]
root_context = cast(jaeger_client.SpanContext, root_scope.span.context)
self.assertEqual(
self._tracer.active_span,
root_scope.span,
"expected to inherit the root tracing scope from where this was run",
)
# Return control back to the reactor thread and wait an arbitrary amount
await clock.sleep(4)
# This is a key part of what we're testing! In a previous version of
# Synapse, we would lose the active span at this point.
self.assertEqual(
self._tracer.active_span,
root_scope.span,
"expected to still have a root tracing scope/span active",
)
# For complete-ness sake, let's also trace more sub-tasks here and assert
# they have the correct span parents as well (root)
# Start tracing some other sub-task.
#
# This is a key part of what we're testing! In a previous version of
# Synapse, it would have the incorrect span parents.
scope = start_active_span(
"task1",
tracer=self._tracer,
)
scope_map["task1"] = scope
# Ensure the span parent is pointing to the root scope
context = cast(jaeger_client.SpanContext, scope.span.context)
self.assertEqual(
context.parent_id,
root_context.span_id,
"expected task1 parent to be the root span",
)
# Ensure that the active span is our new sub-task now
self.assertEqual(self._tracer.active_span, scope.span)
# Return control back to the reactor thread and wait an arbitrary amount
await clock.sleep(4)
# We should still see the active span as the scope wasn't closed yet
self.assertEqual(self._tracer.active_span, scope.span)
scope.close()
async def root() -> None:
with start_active_span(
"root span",
tracer=self._tracer,
# We will close this off later. We're basically just mimicking the same
# pattern for how we handle requests. We pass the span off to the
# request for it to finish.
finish_on_close=False,
) as root_scope:
scope_map["root"] = root_scope
self.assertEqual(self._tracer.active_span, root_scope.span)
# Fire-and-forget a task
#
# XXX: The root scope context manager will `__exit__` before this task
# completes.
run_in_background(async_task)
# Because we used `run_in_background`, the active span should still be
# the root.
self.assertEqual(self._tracer.active_span, root_scope.span)
# We shouldn't see any active spans outside of the scope
self.assertIsNone(self._tracer.active_span)
with LoggingContext("root context"):
# Start the test off
d_root = defer.ensureDeferred(root())
# Let the tasks complete
reactor.pump((2,) * 8)
self.successResultOf(d_root)
# After we see all of the tasks are done (like a request when it
# `_finished_processing`), let's finish our root span
scope_map["root"].span.finish()
# Sanity check again: We shouldn't see any active spans leftover in this
# this context.
self.assertIsNone(self._tracer.active_span)
# The spans should be reported in order of their finishing: task 1, task 2,
# root.
#
# We use `assertIncludes` just as an easier way to see if items are missing or
# added. We assert the order just below
self.assertIncludes(
set(self._reporter.get_spans()),
{
scope_map["task1"].span,
scope_map["root"].span,
},
exact=True,
)
# This is where we actually assert the correct order
self.assertEqual(
self._reporter.get_spans(),
[
scope_map["task1"].span,
scope_map["root"].span,
],
)
def test_trace_decorator_sync(self) -> None:
"""
Test whether we can use `@trace_with_opname` (`@trace`) and `@tag_args`