diff --git a/crud.lua b/crud.lua
index 37ec7f7d..2a5dcf95 100644
--- a/crud.lua
+++ b/crud.lua
@@ -174,7 +174,8 @@ crud.rebalance.router_cache_length = rebalance.router.cache_length
 crud.rebalance.router_cache_last_clear_ts = rebalance.router.cache_last_clear_ts
 
 function crud.init_router()
-   rawset(_G, 'crud', crud)
+    rawset(_G, 'crud', crud)
+    rebalance.metrics.enable_router_metrics()
 end
 
 function crud.stop_router()
diff --git a/crud/common/rebalance.lua b/crud/common/rebalance.lua
index e17c6d08..69c2506f 100644
--- a/crud/common/rebalance.lua
+++ b/crud/common/rebalance.lua
@@ -3,10 +3,11 @@ local log = require('log')
 local vshard_consts = require('vshard.consts')
 local utils = require('crud.common.utils')
 
+local has_metrics_module, metrics = pcall(require, 'metrics')
+
 local SETTINGS_SPACE_NAME = '_crud_settings'
 local SAFE_MOD_ENABLE_EVENT = '_crud.safe_mode_enable'
 
-
 local M = {
     safe_mode = false,
     safe_mode_enable_hooks = {},
@@ -85,6 +86,8 @@ local function safe_mode_disable()
 end
 
 local function rebalance_init()
+    M.metrics.enable_storage_metrics()
+
     -- box.watch was introduced in tarantool 2.10.0
     if not utils.tarantool_supports_box_watch() then
         log.warn('This version of tarantool does not support autoswitch to safe mode during rebalance. '
@@ -131,20 +134,77 @@ local function rebalance_stop()
 end
 
 local function router_cache_clear()
-    local r = utils.get_vshard_router_instance()
     M._router_cache_last_clear_ts = fiber.time()
-    return r:_route_map_clear()
+    return utils.get_vshard_router_instance():_route_map_clear()
 end
 
 local function router_cache_length()
-    local r = utils.get_vshard_router_instance()
-    return r.known_bucket_count
+    return utils.get_vshard_router_instance().known_bucket_count
 end
 
 local function router_cache_last_clear_ts()
     return M._router_cache_last_clear_ts
 end
 
+-- Rebalance related metrics.
+--
+-- The callbacks below are kept at module level so that a repeated init
+-- (this module also exposes stop, so init can run again) swaps the previous
+-- callback for a fresh one instead of leaving both registered.
+local storage_metrics_callback
+local router_metrics_callback
+
+-- Register `callback` with the metrics module, first unregistering
+-- `previous` (when not nil). Returns `callback` for the caller to store.
+local function replace_metrics_callback(previous, callback)
+    if previous ~= nil then
+        metrics.unregister_callback(previous)
+    end
+    metrics.register_callback(callback)
+    return callback
+end
+
+-- Publish the storage safe mode flag as the 0/1 gauge
+-- `tnt_crud_storage_safe_mode_enabled`, updated by a metrics callback.
+-- Does nothing when the metrics module could not be loaded.
+local function enable_storage_metrics()
+    if not has_metrics_module then
+        return
+    end
+
+    local safe_mode_enabled_gauge = metrics.gauge(
+        'tnt_crud_storage_safe_mode_enabled',
+        "is safe mode enabled on this storage instance"
+    )
+
+    storage_metrics_callback = replace_metrics_callback(storage_metrics_callback, function()
+        safe_mode_enabled_gauge:set(safe_mode_status() and 1 or 0)
+    end)
+end
+
+-- Publish the vshard router cache size and the time of its last clear as
+-- gauges, updated by a metrics callback.
+-- Does nothing when the metrics module could not be loaded.
+local function enable_router_metrics()
+    if not has_metrics_module then
+        return
+    end
+
+    local router_cache_length_gauge = metrics.gauge(
+        'tnt_crud_router_cache_length',
+        "number of bucket routes in vshard router cache"
+    )
+    local router_cache_last_clear_ts_gauge = metrics.gauge(
+        'tnt_crud_router_cache_last_clear_ts',
+        "when vshard router cache was cleared last time"
+    )
+
+    router_metrics_callback = replace_metrics_callback(router_metrics_callback, function()
+        router_cache_length_gauge:set(router_cache_length())
+        router_cache_last_clear_ts_gauge:set(router_cache_last_clear_ts())
+    end)
+end
+
 M.init = rebalance_init
 M.stop = rebalance_stop
 M.safe_mode_status = safe_mode_status
@@ -167,4 +227,9 @@ M.storage_api = {
     rebalance_safe_mode_disable = safe_mode_disable,
 }
 
+M.metrics = {
+    enable_storage_metrics = enable_storage_metrics,
+    enable_router_metrics = enable_router_metrics,
+}
+
 return M
diff --git a/test/integration/metrics_test.lua b/test/integration/metrics_test.lua
new file mode 100644
index 00000000..e1dd3778
--- /dev/null
+++ b/test/integration/metrics_test.lua
@@ -0,0 +1,94 @@
+-- Integration test for the metrics exported by crud's rebalance module:
+-- the safe mode gauge on storages and the cache gauges on the router.
+local helpers = require('test.helper')
+local t = require('luatest')
+
+local pgroup = t.group('metrics_integration', helpers.backend_matrix({
+    {engine = 'memtx'},
+}))
+
+-- Evaluated on a cluster instance: runs the registered metrics callbacks and
+-- returns every observation.
+local COLLECT_METRICS = "return require('metrics').collect({ invoke_callbacks = true })"
+local SAFE_MODE_METRIC = 'tnt_crud_storage_safe_mode_enabled'
+local CACHE_LENGTH_METRIC = 'tnt_crud_router_cache_length'
+local LAST_CLEAR_TS_METRIC = 'tnt_crud_router_cache_last_clear_ts'
+
+-- Return the value of the observation called `name` collected on `server`.
+-- Fails the test when no such observation exists.
+local function get_metric(server, name)
+    for _, m in pairs(server:eval(COLLECT_METRICS)) do
+        if m.metric_name == name then
+            return m.value
+        end
+    end
+    t.fail(('No %s metric found'):format(name))
+end
+
+-- Assert that the safe mode gauge equals `expected` on every storage.
+local function assert_safe_mode_metric(g, expected, message)
+    helpers.call_on_storages(g.cluster, function(server)
+        t.assert_equals(get_metric(server, SAFE_MODE_METRIC), expected, message)
+    end)
+end
+
+-- Invoke the stored function `func_name` on every storage of the cluster.
+local function call_on_all_storages(g, func_name)
+    helpers.call_on_storages(g.cluster, function(server)
+        server:call(func_name)
+    end)
+end
+
+-- Start the cluster once for the whole group.
+local function before_all(g)
+    helpers.start_default_cluster(g, 'srv_stats')
+end
+
+-- Stop the cluster after the last test of the group.
+local function after_all(g)
+    helpers.stop_cluster(g.cluster, g.params.backend)
+end
+
+-- Each test starts with the `crud` global set on the router and safe mode
+-- disabled on all storages.
+local function before_each(g)
+    g.router:eval("crud = require('crud')")
+    call_on_all_storages(g, '_crud.rebalance_safe_mode_disable')
+end
+
+pgroup.before_all(before_all)
+
+pgroup.after_all(after_all)
+
+pgroup.before_each(before_each)
+
+-- The safe mode gauge must follow enable/disable on the storages; the router
+-- cache gauges must be exported and the last-clear timestamp must advance.
+pgroup.test_safe_mode_metrics = function(g)
+    -- require() raises when the module is missing, so probe it with pcall;
+    -- the first pcall result is the boolean skip_if needs.
+    local has_metrics_module = pcall(require, 'metrics')
+    t.skip_if(not has_metrics_module, 'No metrics module in current version')
+
+    -- Check safe mode metric on storage
+    assert_safe_mode_metric(g, 0, 'Metric shows safe mode disabled')
+
+    -- Enable safe mode
+    call_on_all_storages(g, '_crud.rebalance_safe_mode_enable')
+
+    -- Check that metric value has changed
+    assert_safe_mode_metric(g, 1, 'Metric shows safe mode enabled')
+
+    -- Check router cache metrics
+    local cache_length = get_metric(g.router, CACHE_LENGTH_METRIC)
+    t.assert_ge(cache_length, 0, 'Router cache length is not negative')
+    local first_ts = get_metric(g.router, LAST_CLEAR_TS_METRIC)
+    t.assert_gt(first_ts, 0, 'Last cache clear TS is greater than zero')
+
+    -- Clear router cache
+    g.router:eval("crud.rebalance.router_cache_clear()")
+
+    -- Check that last_clear_ts has changed
+    local new_ts = get_metric(g.router, LAST_CLEAR_TS_METRIC)
+    t.assert_gt(new_ts, first_ts, 'Last cache clear TS is greater than the first one')
+end