Switch to OpenTracing's ContextVarsScopeManager (#18849)
Switch to OpenTracing's `ContextVarsScopeManager` instead of our own custom `LogContextScopeManager`. This is now possible because the linked Twisted issue from the comment in our custom `LogContextScopeManager` is resolved: https://twistedmatrix.com/trac/ticket/10301 This PR is spawning from exploring different possibilities to solve the `scope` loss problem I was encountering in https://github.com/element-hq/synapse/pull/18804#discussion_r2268254424. This appears to solve the problem and I've added the additional test from there to this PR ✅
This commit is contained in:
parent
df2cfb3932
commit
27fc3389f3
1
changelog.d/18849.misc
Normal file
1
changelog.d/18849.misc
Normal file
@ -0,0 +1 @@
|
||||
Switch to OpenTracing's `ContextVarsScopeManager` instead of our own custom `LogContextScopeManager`.
|
||||
@ -56,7 +56,6 @@ from twisted.internet import defer, threads
|
||||
from twisted.python.threadpool import ThreadPool
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from synapse.logging.scopecontextmanager import _LogContextScope
|
||||
from synapse.types import ISynapseReactor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -230,14 +229,13 @@ LoggingContextOrSentinel = Union["LoggingContext", "_Sentinel"]
|
||||
class _Sentinel:
|
||||
"""Sentinel to represent the root context"""
|
||||
|
||||
__slots__ = ["previous_context", "finished", "request", "scope", "tag"]
|
||||
__slots__ = ["previous_context", "finished", "request", "tag"]
|
||||
|
||||
def __init__(self) -> None:
|
||||
# Minimal set for compatibility with LoggingContext
|
||||
self.previous_context = None
|
||||
self.finished = False
|
||||
self.request = None
|
||||
self.scope = None
|
||||
self.tag = None
|
||||
|
||||
def __str__(self) -> str:
|
||||
@ -290,7 +288,6 @@ class LoggingContext:
|
||||
"finished",
|
||||
"request",
|
||||
"tag",
|
||||
"scope",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
@ -311,7 +308,6 @@ class LoggingContext:
|
||||
self.main_thread = get_thread_id()
|
||||
self.request = None
|
||||
self.tag = ""
|
||||
self.scope: Optional["_LogContextScope"] = None
|
||||
|
||||
# keep track of whether we have hit the __exit__ block for this context
|
||||
# (suggesting that the the thing that created the context thinks it should
|
||||
@ -324,9 +320,6 @@ class LoggingContext:
|
||||
# we track the current request_id
|
||||
self.request = self.parent_context.request
|
||||
|
||||
# we also track the current scope:
|
||||
self.scope = self.parent_context.scope
|
||||
|
||||
if request is not None:
|
||||
# the request param overrides the request from the parent context
|
||||
self.request = request
|
||||
|
||||
@ -251,18 +251,17 @@ class _DummyTagNames:
|
||||
try:
|
||||
import opentracing
|
||||
import opentracing.tags
|
||||
from opentracing.scope_managers.contextvars import ContextVarsScopeManager
|
||||
|
||||
tags = opentracing.tags
|
||||
except ImportError:
|
||||
opentracing = None # type: ignore[assignment]
|
||||
tags = _DummyTagNames # type: ignore[assignment]
|
||||
ContextVarsScopeManager = None # type: ignore
|
||||
try:
|
||||
from jaeger_client import Config as JaegerConfig
|
||||
|
||||
from synapse.logging.scopecontextmanager import LogContextScopeManager
|
||||
except ImportError:
|
||||
JaegerConfig = None # type: ignore
|
||||
LogContextScopeManager = None # type: ignore
|
||||
|
||||
|
||||
try:
|
||||
@ -484,7 +483,7 @@ def init_tracer(hs: "HomeServer") -> None:
|
||||
config = JaegerConfig(
|
||||
config=jaeger_config,
|
||||
service_name=f"{hs.config.server.server_name} {instance_name_by_type}",
|
||||
scope_manager=LogContextScopeManager(),
|
||||
scope_manager=ContextVarsScopeManager(),
|
||||
metrics_factory=PrometheusMetricsFactory(),
|
||||
)
|
||||
|
||||
|
||||
@ -1,161 +0,0 @@
|
||||
#
|
||||
# This file is licensed under the Affero General Public License (AGPL) version 3.
|
||||
#
|
||||
# Copyright 2019 The Matrix.org Foundation C.I.C.
|
||||
# Copyright (C) 2023 New Vector, Ltd
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as
|
||||
# published by the Free Software Foundation, either version 3 of the
|
||||
# License, or (at your option) any later version.
|
||||
#
|
||||
# See the GNU Affero General Public License for more details:
|
||||
# <https://www.gnu.org/licenses/agpl-3.0.html>.
|
||||
#
|
||||
# Originally licensed under the Apache License, Version 2.0:
|
||||
# <http://www.apache.org/licenses/LICENSE-2.0>.
|
||||
#
|
||||
# [This file includes modifications made by New Vector Limited]
|
||||
#
|
||||
#
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from opentracing import Scope, ScopeManager, Span
|
||||
|
||||
from synapse.logging.context import (
|
||||
LoggingContext,
|
||||
current_context,
|
||||
nested_logging_context,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LogContextScopeManager(ScopeManager):
|
||||
"""
|
||||
The LogContextScopeManager tracks the active scope in opentracing
|
||||
by using the log contexts which are native to synapse. This is so
|
||||
that the basic opentracing api can be used across twisted defereds.
|
||||
|
||||
It would be nice just to use opentracing's ContextVarsScopeManager,
|
||||
but currently that doesn't work due to https://twistedmatrix.com/trac/ticket/10301.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
pass
|
||||
|
||||
@property
|
||||
def active(self) -> Optional[Scope]:
|
||||
"""
|
||||
Returns the currently active Scope which can be used to access the
|
||||
currently active Scope.span.
|
||||
If there is a non-null Scope, its wrapped Span
|
||||
becomes an implicit parent of any newly-created Span at
|
||||
Tracer.start_active_span() time.
|
||||
|
||||
Return:
|
||||
The Scope that is active, or None if not available.
|
||||
"""
|
||||
ctx = current_context()
|
||||
return ctx.scope
|
||||
|
||||
def activate(self, span: Span, finish_on_close: bool) -> Scope:
|
||||
"""
|
||||
Makes a Span active.
|
||||
Args
|
||||
span: the span that should become active.
|
||||
finish_on_close: whether Span should be automatically finished when
|
||||
Scope.close() is called.
|
||||
|
||||
Returns:
|
||||
Scope to control the end of the active period for
|
||||
*span*. It is a programming error to neglect to call
|
||||
Scope.close() on the returned instance.
|
||||
"""
|
||||
|
||||
ctx = current_context()
|
||||
|
||||
if not ctx:
|
||||
logger.error("Tried to activate scope outside of loggingcontext")
|
||||
return Scope(None, span) # type: ignore[arg-type]
|
||||
|
||||
if ctx.scope is not None:
|
||||
# start a new logging context as a child of the existing one.
|
||||
# Doing so -- rather than updating the existing logcontext -- means that
|
||||
# creating several concurrent spans under the same logcontext works
|
||||
# correctly.
|
||||
ctx = nested_logging_context("")
|
||||
enter_logcontext = True
|
||||
else:
|
||||
# if there is no span currently associated with the current logcontext, we
|
||||
# just store the scope in it.
|
||||
#
|
||||
# This feels a bit dubious, but it does hack around a problem where a
|
||||
# span outlasts its parent logcontext (which would otherwise lead to
|
||||
# "Re-starting finished log context" errors).
|
||||
enter_logcontext = False
|
||||
|
||||
scope = _LogContextScope(self, span, ctx, enter_logcontext, finish_on_close)
|
||||
ctx.scope = scope
|
||||
if enter_logcontext:
|
||||
ctx.__enter__()
|
||||
|
||||
return scope
|
||||
|
||||
|
||||
class _LogContextScope(Scope):
|
||||
"""
|
||||
A custom opentracing scope, associated with a LogContext
|
||||
|
||||
* When the scope is closed, the logcontext's active scope is reset to None.
|
||||
and - if enter_logcontext was set - the logcontext is finished too.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
manager: LogContextScopeManager,
|
||||
span: Span,
|
||||
logcontext: LoggingContext,
|
||||
enter_logcontext: bool,
|
||||
finish_on_close: bool,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
manager:
|
||||
the manager that is responsible for this scope.
|
||||
span:
|
||||
the opentracing span which this scope represents the local
|
||||
lifetime for.
|
||||
logcontext:
|
||||
the log context to which this scope is attached.
|
||||
enter_logcontext:
|
||||
if True the log context will be exited when the scope is finished
|
||||
finish_on_close:
|
||||
if True finish the span when the scope is closed
|
||||
"""
|
||||
super().__init__(manager, span)
|
||||
self.logcontext = logcontext
|
||||
self._finish_on_close = finish_on_close
|
||||
self._enter_logcontext = enter_logcontext
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"Scope<{self.span}>"
|
||||
|
||||
def close(self) -> None:
|
||||
active_scope = self.manager.active
|
||||
if active_scope is not self:
|
||||
logger.error(
|
||||
"Closing scope %s which is not the currently-active one %s",
|
||||
self,
|
||||
active_scope,
|
||||
)
|
||||
|
||||
if self._finish_on_close:
|
||||
self.span.finish()
|
||||
|
||||
self.logcontext.scope = None
|
||||
|
||||
if self._enter_logcontext:
|
||||
self.logcontext.__exit__(None, None, None)
|
||||
@ -19,7 +19,7 @@
|
||||
#
|
||||
#
|
||||
|
||||
from typing import Awaitable, cast
|
||||
from typing import Awaitable, Dict, cast
|
||||
|
||||
from twisted.internet import defer
|
||||
from twisted.internet.testing import MemoryReactorClock
|
||||
@ -38,9 +38,11 @@ from synapse.logging.opentracing import (
|
||||
from synapse.util import Clock
|
||||
|
||||
try:
|
||||
from synapse.logging.scopecontextmanager import LogContextScopeManager
|
||||
import opentracing
|
||||
from opentracing.scope_managers.contextvars import ContextVarsScopeManager
|
||||
except ImportError:
|
||||
LogContextScopeManager = None # type: ignore
|
||||
opentracing = None # type: ignore
|
||||
ContextVarsScopeManager = None # type: ignore
|
||||
|
||||
try:
|
||||
import jaeger_client
|
||||
@ -54,9 +56,10 @@ from tests.unittest import TestCase
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LogContextScopeManagerTestCase(TestCase):
|
||||
class TracingScopeTestCase(TestCase):
|
||||
"""
|
||||
Test logging contexts and active opentracing spans.
|
||||
Test that our tracing machinery works well in a variety of situations (especially
|
||||
with Twisted's runtime and deferreds).
|
||||
|
||||
There's casts throughout this from generic opentracing objects (e.g.
|
||||
opentracing.Span) to the ones specific to Jaeger since they have additional
|
||||
@ -64,7 +67,7 @@ class LogContextScopeManagerTestCase(TestCase):
|
||||
opentracing backend is Jaeger.
|
||||
"""
|
||||
|
||||
if LogContextScopeManager is None:
|
||||
if opentracing is None:
|
||||
skip = "Requires opentracing" # type: ignore[unreachable]
|
||||
if jaeger_client is None:
|
||||
skip = "Requires jaeger_client" # type: ignore[unreachable]
|
||||
@ -74,7 +77,7 @@ class LogContextScopeManagerTestCase(TestCase):
|
||||
# global variables that power opentracing. We create our own tracer instance
|
||||
# and test with it.
|
||||
|
||||
scope_manager = LogContextScopeManager()
|
||||
scope_manager = ContextVarsScopeManager()
|
||||
config = jaeger_client.config.Config(
|
||||
config={}, service_name="test", scope_manager=scope_manager
|
||||
)
|
||||
@ -208,6 +211,135 @@ class LogContextScopeManagerTestCase(TestCase):
|
||||
[scopes[1].span, scopes[2].span, scopes[0].span],
|
||||
)
|
||||
|
||||
def test_run_in_background_active_scope_still_available(self) -> None:
|
||||
"""
|
||||
Test that tasks running via `run_in_background` still have access to the
|
||||
active tracing scope.
|
||||
|
||||
This is a regression test for a previous Synapse issue where the tracing scope
|
||||
would `__exit__` and close before the `run_in_background` task completed and our
|
||||
own previous custom `_LogContextScope.close(...)` would clear
|
||||
`LoggingContext.scope` preventing further tracing spans from having the correct
|
||||
parent.
|
||||
"""
|
||||
reactor = MemoryReactorClock()
|
||||
clock = Clock(reactor)
|
||||
|
||||
scope_map: Dict[str, opentracing.Scope] = {}
|
||||
|
||||
async def async_task() -> None:
|
||||
root_scope = scope_map["root"]
|
||||
root_context = cast(jaeger_client.SpanContext, root_scope.span.context)
|
||||
|
||||
self.assertEqual(
|
||||
self._tracer.active_span,
|
||||
root_scope.span,
|
||||
"expected to inherit the root tracing scope from where this was run",
|
||||
)
|
||||
|
||||
# Return control back to the reactor thread and wait an arbitrary amount
|
||||
await clock.sleep(4)
|
||||
|
||||
# This is a key part of what we're testing! In a previous version of
|
||||
# Synapse, we would lose the active span at this point.
|
||||
self.assertEqual(
|
||||
self._tracer.active_span,
|
||||
root_scope.span,
|
||||
"expected to still have a root tracing scope/span active",
|
||||
)
|
||||
|
||||
# For complete-ness sake, let's also trace more sub-tasks here and assert
|
||||
# they have the correct span parents as well (root)
|
||||
|
||||
# Start tracing some other sub-task.
|
||||
#
|
||||
# This is a key part of what we're testing! In a previous version of
|
||||
# Synapse, it would have the incorrect span parents.
|
||||
scope = start_active_span(
|
||||
"task1",
|
||||
tracer=self._tracer,
|
||||
)
|
||||
scope_map["task1"] = scope
|
||||
|
||||
# Ensure the span parent is pointing to the root scope
|
||||
context = cast(jaeger_client.SpanContext, scope.span.context)
|
||||
self.assertEqual(
|
||||
context.parent_id,
|
||||
root_context.span_id,
|
||||
"expected task1 parent to be the root span",
|
||||
)
|
||||
|
||||
# Ensure that the active span is our new sub-task now
|
||||
self.assertEqual(self._tracer.active_span, scope.span)
|
||||
# Return control back to the reactor thread and wait an arbitrary amount
|
||||
await clock.sleep(4)
|
||||
# We should still see the active span as the scope wasn't closed yet
|
||||
self.assertEqual(self._tracer.active_span, scope.span)
|
||||
scope.close()
|
||||
|
||||
async def root() -> None:
|
||||
with start_active_span(
|
||||
"root span",
|
||||
tracer=self._tracer,
|
||||
# We will close this off later. We're basically just mimicking the same
|
||||
# pattern for how we handle requests. We pass the span off to the
|
||||
# request for it to finish.
|
||||
finish_on_close=False,
|
||||
) as root_scope:
|
||||
scope_map["root"] = root_scope
|
||||
self.assertEqual(self._tracer.active_span, root_scope.span)
|
||||
|
||||
# Fire-and-forget a task
|
||||
#
|
||||
# XXX: The root scope context manager will `__exit__` before this task
|
||||
# completes.
|
||||
run_in_background(async_task)
|
||||
|
||||
# Because we used `run_in_background`, the active span should still be
|
||||
# the root.
|
||||
self.assertEqual(self._tracer.active_span, root_scope.span)
|
||||
|
||||
# We shouldn't see any active spans outside of the scope
|
||||
self.assertIsNone(self._tracer.active_span)
|
||||
|
||||
with LoggingContext("root context"):
|
||||
# Start the test off
|
||||
d_root = defer.ensureDeferred(root())
|
||||
|
||||
# Let the tasks complete
|
||||
reactor.pump((2,) * 8)
|
||||
self.successResultOf(d_root)
|
||||
|
||||
# After we see all of the tasks are done (like a request when it
|
||||
# `_finished_processing`), let's finish our root span
|
||||
scope_map["root"].span.finish()
|
||||
|
||||
# Sanity check again: We shouldn't see any active spans leftover in this
|
||||
# this context.
|
||||
self.assertIsNone(self._tracer.active_span)
|
||||
|
||||
# The spans should be reported in order of their finishing: task 1, task 2,
|
||||
# root.
|
||||
#
|
||||
# We use `assertIncludes` just as an easier way to see if items are missing or
|
||||
# added. We assert the order just below
|
||||
self.assertIncludes(
|
||||
set(self._reporter.get_spans()),
|
||||
{
|
||||
scope_map["task1"].span,
|
||||
scope_map["root"].span,
|
||||
},
|
||||
exact=True,
|
||||
)
|
||||
# This is where we actually assert the correct order
|
||||
self.assertEqual(
|
||||
self._reporter.get_spans(),
|
||||
[
|
||||
scope_map["task1"].span,
|
||||
scope_map["root"].span,
|
||||
],
|
||||
)
|
||||
|
||||
def test_trace_decorator_sync(self) -> None:
|
||||
"""
|
||||
Test whether we can use `@trace_with_opname` (`@trace`) and `@tag_args`
|
||||
|
||||
Loading…
Reference in New Issue
Block a user