diff --git a/changelog.d/18932.misc b/changelog.d/18932.misc new file mode 100644 index 000000000..675412ddb --- /dev/null +++ b/changelog.d/18932.misc @@ -0,0 +1 @@ +Disconnect background process work from request trace. diff --git a/synapse/events/auto_accept_invites.py b/synapse/events/auto_accept_invites.py index 6873ee9d3..9e17edd22 100644 --- a/synapse/events/auto_accept_invites.py +++ b/synapse/events/auto_accept_invites.py @@ -119,7 +119,6 @@ class InviteAutoAccepter: event.state_key, event.room_id, "join", - bg_start_span=False, ) if is_direct_message: diff --git a/synapse/logging/opentracing.py b/synapse/logging/opentracing.py index 405939d51..8d350016c 100644 --- a/synapse/logging/opentracing.py +++ b/synapse/logging/opentracing.py @@ -576,7 +576,9 @@ def start_active_span_follows_from( operation_name: str, contexts: Collection, child_of: Optional[Union["opentracing.Span", "opentracing.SpanContext"]] = None, + tags: Optional[Dict[str, str]] = None, start_time: Optional[float] = None, + ignore_active_span: bool = False, *, inherit_force_tracing: bool = False, tracer: Optional["opentracing.Tracer"] = None, @@ -591,9 +593,16 @@ def start_active_span_follows_from( span will be the parent. (If there is no currently active span, the first span in `contexts` will be the parent.) + tags: an optional dictionary of span tags. The caller gives up ownership of that + dictionary, because the :class:`Tracer` may use it as-is to avoid extra data + copying. + start_time: optional override for the start time of the created span. Seconds since the epoch. + ignore_active_span: an explicit flag that ignores the current active + scope and creates a root span. + inherit_force_tracing: if set, and any of the previous contexts have had tracing forced, the new span will also have tracing forced. tracer: override the opentracing tracer. By default the global tracer is used. @@ -606,7 +615,9 @@ def start_active_span_follows_from( operation_name, child_of=child_of, references=references, + tags=tags, start_time=start_time, + ignore_active_span=ignore_active_span, tracer=tracer, ) diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py index 633705b02..08de02f03 100644 --- a/synapse/metrics/background_process_metrics.py +++ b/synapse/metrics/background_process_metrics.py @@ -20,7 +20,7 @@ import logging import threading -from contextlib import nullcontext +from contextlib import contextmanager, nullcontext from functools import wraps from types import TracebackType from typing import ( @@ -28,7 +28,9 @@ from typing import ( Any, Awaitable, Callable, + ContextManager, Dict, + Generator, Iterable, Optional, Protocol, @@ -49,7 +51,12 @@ from synapse.logging.context import ( LoggingContext, PreserveLoggingContext, ) -from synapse.logging.opentracing import SynapseTags, start_active_span +from synapse.logging.opentracing import ( + SynapseTags, + active_span, + start_active_span, + start_active_span_follows_from, +) from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics._types import Collector @@ -264,15 +271,97 @@ def run_as_background_process( with BackgroundProcessLoggingContext( name=desc, server_name=server_name, instance_id=count - ) as context: + ) as logging_context: try: if bg_start_span: - ctx = start_active_span( - f"bgproc.{desc}", tags={SynapseTags.REQUEST_ID: str(context)} - ) + original_active_tracing_span = active_span() + + # If there is already an active span (e.g. because this background + # process was started as part of handling a request for example), + # because this is a long-running background task that may serve a + # broader purpose than the request that kicked it off, we don't want + # it to be a direct child of the currently active trace connected to + # the request. We only want a loose reference to jump between the + # traces. + # + # For example, when making a `/messages` request, when approaching a + # gap, we may kick off a background process to fetch missing events + # from federation. The `/messages` request trace should't include + # the entire time taken and details around fetching the missing + # events since the request doesn't rely on the result, it was just + # part of the heuristic to initiate things. + # + # We don't care about the value from the context manager as it's not + # used (so we just use `Any` for the type). Ideally, we'd be able to + # mark this as unused like an `assert_never` of sorts. + tracing_scope: ContextManager[Any] + if original_active_tracing_span is not None: + # With the OpenTracing client that we're using, it's impossible to + # create a disconnected root span while also providing `references` + # so we first create a bare root span, then create a child span that + # includes the references that we want. + root_tracing_scope = start_active_span( + f"bgproc.{desc}", + tags={SynapseTags.REQUEST_ID: str(logging_context)}, + # Create a root span for the background process (disconnected + # from other spans) + ignore_active_span=True, + ) + + # Also add a span in the original request trace that cross-links + # to background process trace. We immediately finish the span as + # this is just a marker to follow where the real work is being + # done. + # + # In OpenTracing, `FOLLOWS_FROM` indicates parent-child + # relationship whereas we just want a cross-link to the + # downstream trace. This is a bit hacky, but the closest we + # can get to in OpenTracing land. If we ever migrate to + # OpenTelemetry, we should use a normal `Link` for this. + with start_active_span_follows_from( + f"start_bgproc.{desc}", + child_of=original_active_tracing_span, + ignore_active_span=True, + # Points to the background process span. + contexts=[root_tracing_scope.span.context], + ): + pass + + # Then start the tracing scope that we're going to use for + # the duration of the background process within the root + # span we just created. + child_tracing_scope = start_active_span_follows_from( + f"bgproc_child.{desc}", + child_of=root_tracing_scope.span, + ignore_active_span=True, + tags={SynapseTags.REQUEST_ID: str(logging_context)}, + # Create the `FOLLOWS_FROM` reference to the request's + # span so there is a loose coupling between the two + # traces and it's easy to jump between. + contexts=[original_active_tracing_span.context], + ) + + # For easy usage down below, we create a context manager that + # combines both scopes. + @contextmanager + def combined_context_manager() -> Generator[None, None, None]: + with root_tracing_scope, child_tracing_scope: + yield + + tracing_scope = combined_context_manager() + + else: + # Otherwise, when there is no active span, we will be creating + # a disconnected root span already and we don't have to + # worry about cross-linking to anything. + tracing_scope = start_active_span( + f"bgproc.{desc}", + tags={SynapseTags.REQUEST_ID: str(logging_context)}, + ) else: - ctx = nullcontext() # type: ignore[assignment] - with ctx: + tracing_scope = nullcontext() + + with tracing_scope: return await func(*args, **kwargs) except Exception: logger.exception( diff --git a/synapse/replication/tcp/redis.py b/synapse/replication/tcp/redis.py index aba79b237..0b1be033b 100644 --- a/synapse/replication/tcp/redis.py +++ b/synapse/replication/tcp/redis.py @@ -250,6 +250,13 @@ class RedisSubscriber(SubscriberProtocol): self.server_name, self._async_send_command, cmd, + # We originally started tracing background processes to avoid `There was no + # active span` errors but this change meant we started generating 15x the + # number of spans than before (this is one of the most heavily called + # instances of `run_as_background_process`). + # + # Since we don't log or tag a tracing span in the downstream + # code, we can safely disable this. bg_start_span=False, )