diff --git a/changelog.d/18748.misc b/changelog.d/18748.misc new file mode 100644 index 000000000..c06fb23af --- /dev/null +++ b/changelog.d/18748.misc @@ -0,0 +1 @@ +Refactor cache metrics to be homeserver-scoped. diff --git a/synapse/util/caches/__init__.py b/synapse/util/caches/__init__.py index 3087ad6ad..340da5bd4 100644 --- a/synapse/util/caches/__init__.py +++ b/synapse/util/caches/__init__.py @@ -44,8 +44,6 @@ TRACK_MEMORY_USAGE = False # just before they are returned from the scrape endpoint. CACHE_METRIC_REGISTRY = DynamicCollectorRegistry() -caches_by_name: Dict[str, Sized] = {} - cache_size = Gauge( "synapse_util_caches_cache_size", "", @@ -242,8 +240,7 @@ def register_cache( server_name=server_name, collect_callback=collect_callback, ) - metric_name = "cache_%s_%s" % (cache_type, cache_name) - caches_by_name[cache_name] = cache + metric_name = "cache_%s_%s_%s" % (cache_type, cache_name, server_name) CACHE_METRIC_REGISTRY.register_hook(metric_name, metric.collect) return metric diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index e92d5f6df..d87f62133 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -159,26 +159,149 @@ class CacheMetricsTests(unittest.HomeserverTestCase): name=CACHE_NAME, server_name=self.hs.hostname, max_entries=777 ) - items = { - x.split(b"{")[0].decode("ascii"): x.split(b" ")[1].decode("ascii") - for x in filter( - lambda x: b"cache_metrics_test_fgjkbdfg" in x, - generate_latest(REGISTRY).split(b"\n"), - ) - } + metrics_map = get_latest_metrics() - self.assertEqual(items["synapse_util_caches_cache_size"], "0.0") - self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0") + cache_size_metric = f'synapse_util_caches_cache_size{{name="{CACHE_NAME}",server_name="{self.hs.hostname}"}}' + cache_max_size_metric = f'synapse_util_caches_cache_max_size{{name="{CACHE_NAME}",server_name="{self.hs.hostname}"}}' + + cache_size_metric_value = 
metrics_map.get(cache_size_metric) + self.assertIsNotNone( + cache_size_metric_value, + f"Missing metric {cache_size_metric} in cache metrics {metrics_map}", + ) + cache_max_size_metric_value = metrics_map.get(cache_max_size_metric) + self.assertIsNotNone( + cache_max_size_metric_value, + f"Missing metric {cache_max_size_metric} in cache metrics {metrics_map}", + ) + + self.assertEqual(cache_size_metric_value, "0.0") + self.assertEqual(cache_max_size_metric_value, "777.0") cache.prefill("1", "hi") - items = { - x.split(b"{")[0].decode("ascii"): x.split(b" ")[1].decode("ascii") - for x in filter( - lambda x: b"cache_metrics_test_fgjkbdfg" in x, - generate_latest(REGISTRY).split(b"\n"), - ) - } + metrics_map = get_latest_metrics() - self.assertEqual(items["synapse_util_caches_cache_size"], "1.0") - self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0") + cache_size_metric_value = metrics_map.get(cache_size_metric) + self.assertIsNotNone( + cache_size_metric_value, + f"Missing metric {cache_size_metric} in cache metrics {metrics_map}", + ) + cache_max_size_metric_value = metrics_map.get(cache_max_size_metric) + self.assertIsNotNone( + cache_max_size_metric_value, + f"Missing metric {cache_max_size_metric} in cache metrics {metrics_map}", + ) + + self.assertEqual(cache_size_metric_value, "1.0") + self.assertEqual(cache_max_size_metric_value, "777.0") + + def test_cache_metric_multiple_servers(self) -> None: + """ + Test that cache metrics are reported correctly across multiple servers. We will + have a metrics entry for each homeserver that is labeled with the `server_name` + label. 
+ """ + CACHE_NAME = "cache_metric_multiple_servers_test" + cache1: DeferredCache[str, str] = DeferredCache( + name=CACHE_NAME, server_name="hs1", max_entries=777 + ) + cache2: DeferredCache[str, str] = DeferredCache( + name=CACHE_NAME, server_name="hs2", max_entries=777 + ) + + metrics_map = get_latest_metrics() + + hs1_cache_size_metric = ( + f'synapse_util_caches_cache_size{{name="{CACHE_NAME}",server_name="hs1"}}' + ) + hs2_cache_size_metric = ( + f'synapse_util_caches_cache_size{{name="{CACHE_NAME}",server_name="hs2"}}' + ) + hs1_cache_max_size_metric = f'synapse_util_caches_cache_max_size{{name="{CACHE_NAME}",server_name="hs1"}}' + hs2_cache_max_size_metric = f'synapse_util_caches_cache_max_size{{name="{CACHE_NAME}",server_name="hs2"}}' + + # Find the metrics for the caches from both homeservers + hs1_cache_size_metric_value = metrics_map.get(hs1_cache_size_metric) + self.assertIsNotNone( + hs1_cache_size_metric_value, + f"Missing metric {hs1_cache_size_metric} in cache metrics {metrics_map}", + ) + hs2_cache_size_metric_value = metrics_map.get(hs2_cache_size_metric) + self.assertIsNotNone( + hs2_cache_size_metric_value, + f"Missing metric {hs2_cache_size_metric} in cache metrics {metrics_map}", + ) + hs1_cache_max_size_metric_value = metrics_map.get(hs1_cache_max_size_metric) + self.assertIsNotNone( + hs1_cache_max_size_metric_value, + f"Missing metric {hs1_cache_max_size_metric} in cache metrics {metrics_map}", + ) + hs2_cache_max_size_metric_value = metrics_map.get(hs2_cache_max_size_metric) + self.assertIsNotNone( + hs2_cache_max_size_metric_value, + f"Missing metric {hs2_cache_max_size_metric} in cache metrics {metrics_map}", + ) + + # Sanity check the metric values + self.assertEqual(hs1_cache_size_metric_value, "0.0") + self.assertEqual(hs2_cache_size_metric_value, "0.0") + self.assertEqual(hs1_cache_max_size_metric_value, "777.0") + self.assertEqual(hs2_cache_max_size_metric_value, "777.0") + + # Add something to both caches to change the numbers + 
cache1.prefill("1", "hi") + cache2.prefill("2", "ho") + + metrics_map = get_latest_metrics() + + # Find the metrics for the caches from both homeservers + hs1_cache_size_metric_value = metrics_map.get(hs1_cache_size_metric) + self.assertIsNotNone( + hs1_cache_size_metric_value, + f"Missing metric {hs1_cache_size_metric} in cache metrics {metrics_map}", + ) + hs2_cache_size_metric_value = metrics_map.get(hs2_cache_size_metric) + self.assertIsNotNone( + hs2_cache_size_metric_value, + f"Missing metric {hs2_cache_size_metric} in cache metrics {metrics_map}", + ) + hs1_cache_max_size_metric_value = metrics_map.get(hs1_cache_max_size_metric) + self.assertIsNotNone( + hs1_cache_max_size_metric_value, + f"Missing metric {hs1_cache_max_size_metric} in cache metrics {metrics_map}", + ) + hs2_cache_max_size_metric_value = metrics_map.get(hs2_cache_max_size_metric) + self.assertIsNotNone( + hs2_cache_max_size_metric_value, + f"Missing metric {hs2_cache_max_size_metric} in cache metrics {metrics_map}", + ) + + # Sanity check the metric values + self.assertEqual(hs1_cache_size_metric_value, "1.0") + self.assertEqual(hs2_cache_size_metric_value, "1.0") + self.assertEqual(hs1_cache_max_size_metric_value, "777.0") + self.assertEqual(hs2_cache_max_size_metric_value, "777.0") + + +def get_latest_metrics() -> Dict[str, str]: + """ + Collect the latest metrics from the registry and parse them into an easy to use map. + The key includes the metric name and labels. + + Example output: + { + "synapse_util_caches_cache_size": "0.0", + "synapse_util_caches_cache_max_size{name="some_cache",server_name="hs1"}": "777.0", + ... + } + """ + metric_map = { + x.split(b" ")[0].decode("ascii"): x.split(b" ")[1].decode("ascii") + for x in filter( + lambda x: len(x) > 0 and not x.startswith(b"#"), + generate_latest(REGISTRY).split(b"\n"), + ) + } + + return metric_map