Fix cache metrics to collect from all servers (#18748)
Follow-up to https://github.com/element-hq/synapse/pull/18604

Previously, our cache metrics included the `server_name` label as expected, but only the last server was being reported. This happened because we called `CACHE_METRIC_REGISTRY.register_hook(metric_name, metric.collect)` with a `metric_name` that only took the cache name into account, so the hook was overwritten every time we spawned a new server. This PR updates the registration logic to include the `server_name`, so we have a hook for every cache on every server, as expected.

I noticed this problem thanks to some [tests in the Synapse Pro for Small Hosts](https://github.com/element-hq/synapse-small-hosts/pull/173) repo that sanity-check all metrics to ensure each metric includes data from multiple servers.
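To make the failure mode concrete, here is a minimal, self-contained sketch of the overwrite: a plain dict stands in for `DynamicCollectorRegistry` (this is not Synapse's actual implementation), and the `cache_deferredcache_foo` key is made up for illustration:

```python
from typing import Callable, Dict

# Toy stand-in for the registry's hook table.
hooks: Dict[str, Callable[[], str]] = {}

def register_hook(metric_name: str, hook: Callable[[], str]) -> None:
    hooks[metric_name] = hook  # same key -> silent overwrite

# Before the fix: the key only encodes the cache name, so "hs2" clobbers "hs1".
for server_name in ("hs1", "hs2"):
    register_hook("cache_deferredcache_foo", lambda s=server_name: f"collect {s}")
assert len(hooks) == 1  # only the last server's hook survives

# After the fix: the key also encodes the server, so both hooks survive.
hooks.clear()
for server_name in ("hs1", "hs2"):
    register_hook(f"cache_deferredcache_foo_{server_name}", lambda s=server_name: f"collect {s}")
assert len(hooks) == 2  # one hook per cache per server
```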
Commit e43a1cec84 (parent 510924a2f6)
--- /dev/null
+++ b/changelog.d/18748.misc
@@ -0,0 +1 @@
+Refactor cache metrics to be homeserver-scoped.
--- a/synapse/util/caches/__init__.py
+++ b/synapse/util/caches/__init__.py
@@ -44,8 +44,6 @@ TRACK_MEMORY_USAGE = False
 # just before they are returned from the scrape endpoint.
 CACHE_METRIC_REGISTRY = DynamicCollectorRegistry()
-
 caches_by_name: Dict[str, Sized] = {}
-
 cache_size = Gauge(
     "synapse_util_caches_cache_size",
     "",
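For context, a labeled `Gauge` like the one above exposes a separate time series per `(name, server_name)` pair. The sketch below uses a throwaway `prometheus_client` registry; the `["name", "server_name"]` label list is inferred from what the tests assert on, not copied from this diff:

```python
from prometheus_client import CollectorRegistry, Gauge, generate_latest

registry = CollectorRegistry()
cache_size = Gauge(
    "synapse_util_caches_cache_size",
    "",
    ["name", "server_name"],  # assumed labels, matching the test expectations
    registry=registry,
)
cache_size.labels(name="some_cache", server_name="hs1").set(0)
cache_size.labels(name="some_cache", server_name="hs2").set(1)

# Each (name, server_name) combination is its own series in the scrape output.
print(generate_latest(registry).decode("ascii"))
```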
@@ -242,8 +240,7 @@ def register_cache(
         server_name=server_name,
         collect_callback=collect_callback,
     )
-    metric_name = "cache_%s_%s" % (cache_type, cache_name)
     caches_by_name[cache_name] = cache
+    metric_name = "cache_%s_%s_%s" % (cache_type, cache_name, server_name)
     CACHE_METRIC_REGISTRY.register_hook(metric_name, metric.collect)
     return metric
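With hypothetical `cache_type`/`cache_name` values, the new key shape means spawning a second homeserver registers a second hook instead of replacing the first:

```python
cache_type, cache_name = "deferredcache", "get_users_in_room"  # hypothetical values

for server_name in ("hs1", "hs2"):
    metric_name = "cache_%s_%s_%s" % (cache_type, cache_name, server_name)
    print(metric_name)
# cache_deferredcache_get_users_in_room_hs1
# cache_deferredcache_get_users_in_room_hs2
```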
@@ -159,26 +159,149 @@ class CacheMetricsTests(unittest.HomeserverTestCase):
             name=CACHE_NAME, server_name=self.hs.hostname, max_entries=777
         )
 
-        items = {
-            x.split(b"{")[0].decode("ascii"): x.split(b" ")[1].decode("ascii")
-            for x in filter(
-                lambda x: b"cache_metrics_test_fgjkbdfg" in x,
-                generate_latest(REGISTRY).split(b"\n"),
-            )
-        }
+        metrics_map = get_latest_metrics()
 
-        self.assertEqual(items["synapse_util_caches_cache_size"], "0.0")
-        self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
+        cache_size_metric = f'synapse_util_caches_cache_size{{name="{CACHE_NAME}",server_name="{self.hs.hostname}"}}'
+        cache_max_size_metric = f'synapse_util_caches_cache_max_size{{name="{CACHE_NAME}",server_name="{self.hs.hostname}"}}'
+
+        cache_size_metric_value = metrics_map.get(cache_size_metric)
+        self.assertIsNotNone(
+            cache_size_metric_value,
+            f"Missing metric {cache_size_metric} in cache metrics {metrics_map}",
+        )
+        cache_max_size_metric_value = metrics_map.get(cache_max_size_metric)
+        self.assertIsNotNone(
+            cache_max_size_metric_value,
+            f"Missing metric {cache_max_size_metric} in cache metrics {metrics_map}",
+        )
+
+        self.assertEqual(cache_size_metric_value, "0.0")
+        self.assertEqual(cache_max_size_metric_value, "777.0")
 
         cache.prefill("1", "hi")
 
-        items = {
-            x.split(b"{")[0].decode("ascii"): x.split(b" ")[1].decode("ascii")
-            for x in filter(
-                lambda x: b"cache_metrics_test_fgjkbdfg" in x,
-                generate_latest(REGISTRY).split(b"\n"),
-            )
-        }
+        metrics_map = get_latest_metrics()
 
-        self.assertEqual(items["synapse_util_caches_cache_size"], "1.0")
-        self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
+        cache_size_metric_value = metrics_map.get(cache_size_metric)
+        self.assertIsNotNone(
+            cache_size_metric_value,
+            f"Missing metric {cache_size_metric} in cache metrics {metrics_map}",
+        )
+        cache_max_size_metric_value = metrics_map.get(cache_max_size_metric)
+        self.assertIsNotNone(
+            cache_max_size_metric_value,
+            f"Missing metric {cache_max_size_metric} in cache metrics {metrics_map}",
+        )
+
+        self.assertEqual(cache_size_metric_value, "1.0")
+        self.assertEqual(cache_max_size_metric_value, "777.0")
+
+    def test_cache_metric_multiple_servers(self) -> None:
+        """
+        Test that cache metrics are reported correctly across multiple servers. We
+        will have a metrics entry for each homeserver that is labeled with the
+        `server_name` label.
+        """
+        CACHE_NAME = "cache_metric_multiple_servers_test"
+        cache1: DeferredCache[str, str] = DeferredCache(
+            name=CACHE_NAME, server_name="hs1", max_entries=777
+        )
+        cache2: DeferredCache[str, str] = DeferredCache(
+            name=CACHE_NAME, server_name="hs2", max_entries=777
+        )
+
+        metrics_map = get_latest_metrics()
+
+        hs1_cache_size_metric = (
+            f'synapse_util_caches_cache_size{{name="{CACHE_NAME}",server_name="hs1"}}'
+        )
+        hs2_cache_size_metric = (
+            f'synapse_util_caches_cache_size{{name="{CACHE_NAME}",server_name="hs2"}}'
+        )
+        hs1_cache_max_size_metric = f'synapse_util_caches_cache_max_size{{name="{CACHE_NAME}",server_name="hs1"}}'
+        hs2_cache_max_size_metric = f'synapse_util_caches_cache_max_size{{name="{CACHE_NAME}",server_name="hs2"}}'
+
+        # Find the metrics for the caches from both homeservers
+        hs1_cache_size_metric_value = metrics_map.get(hs1_cache_size_metric)
+        self.assertIsNotNone(
+            hs1_cache_size_metric_value,
+            f"Missing metric {hs1_cache_size_metric} in cache metrics {metrics_map}",
+        )
+        hs2_cache_size_metric_value = metrics_map.get(hs2_cache_size_metric)
+        self.assertIsNotNone(
+            hs2_cache_size_metric_value,
+            f"Missing metric {hs2_cache_size_metric} in cache metrics {metrics_map}",
+        )
+        hs1_cache_max_size_metric_value = metrics_map.get(hs1_cache_max_size_metric)
+        self.assertIsNotNone(
+            hs1_cache_max_size_metric_value,
+            f"Missing metric {hs1_cache_max_size_metric} in cache metrics {metrics_map}",
+        )
+        hs2_cache_max_size_metric_value = metrics_map.get(hs2_cache_max_size_metric)
+        self.assertIsNotNone(
+            hs2_cache_max_size_metric_value,
+            f"Missing metric {hs2_cache_max_size_metric} in cache metrics {metrics_map}",
+        )
+
+        # Sanity check the metric values
+        self.assertEqual(hs1_cache_size_metric_value, "0.0")
+        self.assertEqual(hs2_cache_size_metric_value, "0.0")
+        self.assertEqual(hs1_cache_max_size_metric_value, "777.0")
+        self.assertEqual(hs2_cache_max_size_metric_value, "777.0")
+
+        # Add something to both caches to change the numbers
+        cache1.prefill("1", "hi")
+        cache2.prefill("2", "ho")
+
+        metrics_map = get_latest_metrics()
+
+        # Find the metrics for the caches from both homeservers
+        hs1_cache_size_metric_value = metrics_map.get(hs1_cache_size_metric)
+        self.assertIsNotNone(
+            hs1_cache_size_metric_value,
+            f"Missing metric {hs1_cache_size_metric} in cache metrics {metrics_map}",
+        )
+        hs2_cache_size_metric_value = metrics_map.get(hs2_cache_size_metric)
+        self.assertIsNotNone(
+            hs2_cache_size_metric_value,
+            f"Missing metric {hs2_cache_size_metric} in cache metrics {metrics_map}",
+        )
+        hs1_cache_max_size_metric_value = metrics_map.get(hs1_cache_max_size_metric)
+        self.assertIsNotNone(
+            hs1_cache_max_size_metric_value,
+            f"Missing metric {hs1_cache_max_size_metric} in cache metrics {metrics_map}",
+        )
+        hs2_cache_max_size_metric_value = metrics_map.get(hs2_cache_max_size_metric)
+        self.assertIsNotNone(
+            hs2_cache_max_size_metric_value,
+            f"Missing metric {hs2_cache_max_size_metric} in cache metrics {metrics_map}",
+        )
+
+        # Sanity check the metric values
+        self.assertEqual(hs1_cache_size_metric_value, "1.0")
+        self.assertEqual(hs2_cache_size_metric_value, "1.0")
+        self.assertEqual(hs1_cache_max_size_metric_value, "777.0")
+        self.assertEqual(hs2_cache_max_size_metric_value, "777.0")
+
+
+def get_latest_metrics() -> Dict[str, str]:
+    """
+    Collect the latest metrics from the registry and parse them into an easy to use map.
+    The key includes the metric name and labels.
+
+    Example output:
+    {
+        "synapse_util_caches_cache_size": "0.0",
+        "synapse_util_caches_cache_max_size{name="some_cache",server_name="hs1"}": "777.0",
+        ...
+    }
+    """
+    metric_map = {
+        x.split(b" ")[0].decode("ascii"): x.split(b" ")[1].decode("ascii")
+        for x in filter(
+            lambda x: len(x) > 0 and not x.startswith(b"#"),
+            generate_latest(REGISTRY).split(b"\n"),
+        )
+    }
+
+    return metric_map
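As a sanity check on the parsing above, here is the same dict comprehension run against a hand-written exposition snippet instead of the live `REGISTRY` (the metric names and values below are illustrative only):

```python
sample = b"\n".join(
    [
        b"# HELP synapse_util_caches_cache_size ",
        b"# TYPE synapse_util_caches_cache_size gauge",
        b'synapse_util_caches_cache_size{name="some_cache",server_name="hs1"} 0.0',
        b'synapse_util_caches_cache_max_size{name="some_cache",server_name="hs1"} 777.0',
        b"",
    ]
)

# Keep only sample lines (skip comments/blanks), keyed by "name{labels}".
metric_map = {
    line.split(b" ")[0].decode("ascii"): line.split(b" ")[1].decode("ascii")
    for line in sample.split(b"\n")
    if len(line) > 0 and not line.startswith(b"#")
}

assert metric_map == {
    'synapse_util_caches_cache_size{name="some_cache",server_name="hs1"}': "0.0",
    'synapse_util_caches_cache_max_size{name="some_cache",server_name="hs1"}': "777.0",
}
```

Note the simple space-split assumes label values contain no spaces, which holds for these cache metrics.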