From c0b9437ab6511cad351dc79cfabec3b8a1c767fa Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Thu, 30 Oct 2025 11:49:15 -0500 Subject: [PATCH] Fix lost logcontext when using `timeout_deferred(...)` (#19090) Fix lost logcontext when using `timeout_deferred(...)` and things actually timeout. Fix https://github.com/element-hq/synapse/issues/19087 (our HTTP client times out requests using `timeout_deferred(...)` Fix https://github.com/element-hq/synapse/issues/19066 (`/sync` uses `notifier.wait_for_events()` which uses `timeout_deferred(...)` under the hood) ### When/why did these lost logcontext warnings start happening? ``` synapse.logging.context - 107 - WARNING - sentinel - Expected logging context call_later but found POST-2453 synapse.logging.context - 107 - WARNING - sentinel - Expected logging context call_later was lost ``` In https://github.com/element-hq/synapse/pull/18828, we switched `timeout_deferred(...)` from using `reactor.callLater(...)` to [`clock.call_later(...)`](https://github.com/element-hq/synapse/blob/3b59ac3b69f6a2f73a504699b30313d8dcfe4709/synapse/util/clock.py#L224-L313) under the hood. This meant it started dealing with logcontexts but our `time_it_out()` callback didn't follow our [Synapse logcontext rules](https://github.com/element-hq/synapse/blob/3b59ac3b69f6a2f73a504699b30313d8dcfe4709/docs/log_contexts.md). --- changelog.d/19090.bugfix | 1 + synapse/logging/context.py | 22 ++++++++--- synapse/util/async_helpers.py | 3 +- synapse/util/clock.py | 2 +- tests/unittest.py | 3 ++ tests/util/test_async_helpers.py | 68 +++++++++++++++++++++++++++++++- 6 files changed, 89 insertions(+), 10 deletions(-) create mode 100644 changelog.d/19090.bugfix diff --git a/changelog.d/19090.bugfix b/changelog.d/19090.bugfix new file mode 100644 index 000000000..077dafcbf --- /dev/null +++ b/changelog.d/19090.bugfix @@ -0,0 +1 @@ +Fix lost logcontext warnings from timeouts in sync and requests made by Synapse itself. diff --git a/synapse/logging/context.py b/synapse/logging/context.py index 5b87de6eb..86e994cbb 100644 --- a/synapse/logging/context.py +++ b/synapse/logging/context.py @@ -896,7 +896,7 @@ def run_in_background( # If the function messes with logcontexts, we can assume it follows the Synapse # logcontext rules (Rules for functions returning awaitables: "If the awaitable # is already complete, the function returns with the same logcontext it started - # with."). If it function doesn't touch logcontexts at all, we can also assume + # with."). If the function doesn't touch logcontexts at all, we can also assume # the logcontext is unchanged. # # Either way, the function should have maintained the calling logcontext, so we @@ -905,11 +905,21 @@ def run_in_background( # to reset the logcontext to the sentinel logcontext as that would run # immediately (remember our goal is to maintain the calling logcontext when we # return). - logcontext_debug_logger.debug( - "run_in_background(%s): deferred already completed and the function should have maintained the logcontext %s", - instance_id, - calling_context, - ) + if current_context() != calling_context: + logcontext_error( + "run_in_background(%s): deferred already completed but the function did not maintain the calling logcontext %s (found %s)" + % ( + instance_id, + calling_context, + current_context(), + ) + ) + else: + logcontext_debug_logger.debug( + "run_in_background(%s): deferred already completed (maintained the calling logcontext %s)", + instance_id, + calling_context, + ) return d # Since the function we called may follow the Synapse logcontext rules (Rules for diff --git a/synapse/util/async_helpers.py b/synapse/util/async_helpers.py index c568b377d..99e899d1e 100644 --- a/synapse/util/async_helpers.py +++ b/synapse/util/async_helpers.py @@ -808,7 +808,8 @@ def timeout_deferred( timed_out[0] = True try: - deferred.cancel() + with PreserveLoggingContext(): + deferred.cancel() except Exception: # if we throw any exception it'll break time outs logger.exception("Canceller failed during timeout") diff --git a/synapse/util/clock.py b/synapse/util/clock.py index 655758262..5b59cef60 100644 --- a/synapse/util/clock.py +++ b/synapse/util/clock.py @@ -266,7 +266,7 @@ class Clock: # We use `PreserveLoggingContext` to prevent our new `call_later` # logcontext from finishing as soon as we exit this function, in case `f` # returns an awaitable/deferred which would continue running and may try to - # restore the `loop_call` context when it's done (because it's trying to + # restore the `call_later` context when it's done (because it's trying to # adhere to the Synapse logcontext rules.) # # This also ensures that we return to the `sentinel` context when we exit diff --git a/tests/unittest.py b/tests/unittest.py index 1007f4045..049a92caa 100644 --- a/tests/unittest.py +++ b/tests/unittest.py @@ -341,6 +341,9 @@ def logcontext_clean(target: TV) -> TV: """ def logcontext_error(msg: str) -> NoReturn: + # Log so we can still see it in the logs like normal + logger.warning(msg) + # But also fail the test raise AssertionError("logcontext error: %s" % (msg)) patcher = patch("synapse.logging.context.logcontext_error", new=logcontext_error) diff --git a/tests/util/test_async_helpers.py b/tests/util/test_async_helpers.py index a02a2f0ce..8fbee12fb 100644 --- a/tests/util/test_async_helpers.py +++ b/tests/util/test_async_helpers.py @@ -45,7 +45,7 @@ from synapse.util.async_helpers import ( ) from tests.server import get_clock -from tests.unittest import TestCase +from tests.unittest import TestCase, logcontext_clean logger = logging.getLogger(__name__) @@ -198,7 +198,12 @@ class TimeoutDeferredTest(TestCase): self.failureResultOf(timing_out_d, defer.TimeoutError) - async def test_logcontext_is_preserved_on_cancellation(self) -> None: + @logcontext_clean + async def test_logcontext_is_preserved_on_timeout_cancellation(self) -> None: + """ + Test that the logcontext is preserved when we timeout and the deferred is + cancelled. + """ # Sanity check that we start in the sentinel context self.assertEqual(current_context(), SENTINEL_CONTEXT) @@ -270,6 +275,65 @@ class TimeoutDeferredTest(TestCase): # Back to the sentinel context self.assertEqual(current_context(), SENTINEL_CONTEXT) + @logcontext_clean + async def test_logcontext_is_not_lost_when_awaiting_on_timeout_cancellation( + self, + ) -> None: + """ + Test that the logcontext isn't lost when we `await make_deferred_yieldable(...)` + the deferred to complete/timeout and it times out. + """ + + # Sanity check that we start in the sentinel context + self.assertEqual(current_context(), SENTINEL_CONTEXT) + + # Create a deferred which we will never complete + incomplete_d: Deferred = Deferred() + + async def competing_task() -> None: + with LoggingContext( + name="competing", server_name="test_server" + ) as context_competing: + timing_out_d = timeout_deferred( + deferred=incomplete_d, + timeout=1.0, + clock=self.clock, + ) + self.assertNoResult(timing_out_d) + # We should still be in the logcontext we started in + self.assertIs(current_context(), context_competing) + + # Mimic the normal use case to wait for the work to complete or timeout. + # + # In this specific test, we expect the deferred to timeout and raise an + # exception at this point. + await make_deferred_yieldable(timing_out_d) + + self.fail( + "We should not make it to this point as the `timing_out_d` should have been cancelled" + ) + + d = defer.ensureDeferred(competing_task()) + + # Still in the sentinel context + self.assertEqual(current_context(), SENTINEL_CONTEXT) + + # Pump until we trigger the timeout + self.reactor.pump( + # We only need to pump `1.0` (seconds) as we set + # `timeout_deferred(timeout=1.0)` above + (1.0,) + ) + + # Still in the sentinel context + self.assertEqual(current_context(), SENTINEL_CONTEXT) + + # We expect a failure due to the timeout + self.failureResultOf(d, defer.TimeoutError) + + # Back to the sentinel context at the end of the day + self.assertEqual(current_context(), SENTINEL_CONTEXT) + class _TestException(Exception): # pass