Fix cache metrics to collect from all servers (#18748)

Follow-up to https://github.com/element-hq/synapse/pull/18604

Previously, our cache metrics did include the `server_name` label as
expected, but we were only seeing the last server being reported. This
was because we would
`CACHE_METRIC_REGISTRY.register_hook(metric_name, metric.collect)` where
the `metric_name` only took into account the cache name, so it would be
overwritten every time we spawned a new server.

This PR updates the register logic to include the `server_name` so we
have a hook for every cache on every server as expected.

I noticed this problem thanks to some [tests in the Synapse Pro for
Small Hosts](https://github.com/element-hq/synapse-small-hosts/pull/173)
repo that sanity-check all metrics to ensure that each metric includes
data from multiple servers.
This commit is contained in:
Eric Eastwood 2025-08-01 12:29:58 -05:00 committed by GitHub
parent 510924a2f6
commit e43a1cec84
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 143 additions and 22 deletions

1
changelog.d/18748.misc Normal file
View File

@ -0,0 +1 @@
Refactor cache metrics to be homeserver-scoped.

View File

@ -44,8 +44,6 @@ TRACK_MEMORY_USAGE = False
# just before they are returned from the scrape endpoint.
CACHE_METRIC_REGISTRY = DynamicCollectorRegistry()
caches_by_name: Dict[str, Sized] = {}
cache_size = Gauge(
"synapse_util_caches_cache_size",
"",
@ -242,8 +240,7 @@ def register_cache(
server_name=server_name,
collect_callback=collect_callback,
)
metric_name = "cache_%s_%s" % (cache_type, cache_name)
caches_by_name[cache_name] = cache
metric_name = "cache_%s_%s_%s" % (cache_type, cache_name, server_name)
CACHE_METRIC_REGISTRY.register_hook(metric_name, metric.collect)
return metric

View File

@ -159,26 +159,149 @@ class CacheMetricsTests(unittest.HomeserverTestCase):
name=CACHE_NAME, server_name=self.hs.hostname, max_entries=777
)
items = {
x.split(b"{")[0].decode("ascii"): x.split(b" ")[1].decode("ascii")
for x in filter(
lambda x: b"cache_metrics_test_fgjkbdfg" in x,
generate_latest(REGISTRY).split(b"\n"),
)
}
metrics_map = get_latest_metrics()
self.assertEqual(items["synapse_util_caches_cache_size"], "0.0")
self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
cache_size_metric = f'synapse_util_caches_cache_size{{name="{CACHE_NAME}",server_name="{self.hs.hostname}"}}'
cache_max_size_metric = f'synapse_util_caches_cache_max_size{{name="{CACHE_NAME}",server_name="{self.hs.hostname}"}}'
cache_size_metric_value = metrics_map.get(cache_size_metric)
self.assertIsNotNone(
cache_size_metric_value,
f"Missing metric {cache_size_metric} in cache metrics {metrics_map}",
)
cache_max_size_metric_value = metrics_map.get(cache_max_size_metric)
self.assertIsNotNone(
cache_max_size_metric_value,
f"Missing metric {cache_max_size_metric} in cache metrics {metrics_map}",
)
self.assertEqual(cache_size_metric_value, "0.0")
self.assertEqual(cache_max_size_metric_value, "777.0")
cache.prefill("1", "hi")
items = {
x.split(b"{")[0].decode("ascii"): x.split(b" ")[1].decode("ascii")
for x in filter(
lambda x: b"cache_metrics_test_fgjkbdfg" in x,
generate_latest(REGISTRY).split(b"\n"),
)
}
metrics_map = get_latest_metrics()
self.assertEqual(items["synapse_util_caches_cache_size"], "1.0")
self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
cache_size_metric_value = metrics_map.get(cache_size_metric)
self.assertIsNotNone(
cache_size_metric_value,
f"Missing metric {cache_size_metric} in cache metrics {metrics_map}",
)
cache_max_size_metric_value = metrics_map.get(cache_max_size_metric)
self.assertIsNotNone(
cache_max_size_metric_value,
f"Missing metric {cache_max_size_metric} in cache metrics {metrics_map}",
)
self.assertEqual(cache_size_metric_value, "1.0")
self.assertEqual(cache_max_size_metric_value, "777.0")
def test_cache_metric_multiple_servers(self) -> None:
    """
    Test that cache metrics are reported correctly across multiple servers. We will
    have a metrics entry for each homeserver, labeled with the `server_name` label.
    """
    CACHE_NAME = "cache_metric_multiple_servers_test"

    # Spin up an identically-named cache on two different homeservers.
    cache1: DeferredCache[str, str] = DeferredCache(
        name=CACHE_NAME, server_name="hs1", max_entries=777
    )
    cache2: DeferredCache[str, str] = DeferredCache(
        name=CACHE_NAME, server_name="hs2", max_entries=777
    )

    # The exposition keys we expect to see for each homeserver.
    size_metric = {
        hs: f'synapse_util_caches_cache_size{{name="{CACHE_NAME}",server_name="{hs}"}}'
        for hs in ("hs1", "hs2")
    }
    max_size_metric = {
        hs: f'synapse_util_caches_cache_max_size{{name="{CACHE_NAME}",server_name="{hs}"}}'
        for hs in ("hs1", "hs2")
    }

    def lookup(metrics_map: Dict[str, str], metric: str) -> str:
        """Assert that `metric` is present in `metrics_map` and return its value."""
        value = metrics_map.get(metric)
        self.assertIsNotNone(
            value,
            f"Missing metric {metric} in cache metrics {metrics_map}",
        )
        assert value is not None
        return value

    # Find the metrics for the caches from both homeservers
    metrics_map = get_latest_metrics()
    hs1_size = lookup(metrics_map, size_metric["hs1"])
    hs2_size = lookup(metrics_map, size_metric["hs2"])
    hs1_max_size = lookup(metrics_map, max_size_metric["hs1"])
    hs2_max_size = lookup(metrics_map, max_size_metric["hs2"])

    # Sanity check the metric values
    self.assertEqual(hs1_size, "0.0")
    self.assertEqual(hs2_size, "0.0")
    self.assertEqual(hs1_max_size, "777.0")
    self.assertEqual(hs2_max_size, "777.0")

    # Add something to both caches to change the numbers
    cache1.prefill("1", "hi")
    cache2.prefill("2", "ho")

    # Find the metrics for the caches from both homeservers
    metrics_map = get_latest_metrics()
    hs1_size = lookup(metrics_map, size_metric["hs1"])
    hs2_size = lookup(metrics_map, size_metric["hs2"])
    hs1_max_size = lookup(metrics_map, max_size_metric["hs1"])
    hs2_max_size = lookup(metrics_map, max_size_metric["hs2"])

    # Sanity check the metric values
    self.assertEqual(hs1_size, "1.0")
    self.assertEqual(hs2_size, "1.0")
    self.assertEqual(hs1_max_size, "777.0")
    self.assertEqual(hs2_max_size, "777.0")
def get_latest_metrics() -> Dict[str, str]:
    """
    Collect the latest metrics from the registry and parse them into an easy to use map.

    The key includes the metric name and labels.

    Example output:
    {
        "synapse_util_caches_cache_size": "0.0",
        'synapse_util_caches_cache_max_size{name="some_cache",server_name="hs1"}': "777.0",
        ...
    }

    Returns:
        A map from `metric_name{labels}` to the sample value rendered as a string.
    """
    metric_map: Dict[str, str] = {}
    # Decode the exposition output once instead of decoding each token separately.
    for line in generate_latest(REGISTRY).decode("ascii").split("\n"):
        # Skip blank lines and `# HELP`/`# TYPE` comment lines.
        if not line or line.startswith("#"):
            continue
        # Split on the *last* space: label values may themselves contain spaces
        # (e.g. `name{foo="a b"} 1.0`), which would corrupt both the key and the
        # value if we naively split on the first space.
        key, _, value = line.rpartition(" ")
        metric_map[key] = value
    return metric_map