Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion crud.lua
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,8 @@ crud.rebalance.router_cache_length = rebalance.router.cache_length
crud.rebalance.router_cache_last_clear_ts = rebalance.router.cache_last_clear_ts

--- Initialize the router side of crud.
-- Publishes the module table as the global `crud` and turns on the
-- rebalance-related router metrics.
function crud.init_router()
    -- A single rawset is enough: repeating it with the same key and value
    -- has no additional effect.
    rawset(_G, 'crud', crud)
    rebalance.metrics.enable_router_metrics()
end

function crud.stop_router()
Expand Down
52 changes: 47 additions & 5 deletions crud/common/rebalance.lua
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@ local log = require('log')
local vshard_consts = require('vshard.consts')
local utils = require('crud.common.utils')

local has_metrics_module, metrics = pcall(require, 'metrics')

local SETTINGS_SPACE_NAME = '_crud_settings'
local SAFE_MOD_ENABLE_EVENT = '_crud.safe_mode_enable'


local M = {
safe_mode = false,
safe_mode_enable_hooks = {},
Expand Down Expand Up @@ -85,6 +86,8 @@ local function safe_mode_disable()
end

local function rebalance_init()
M.metrics.enable_storage_metrics()

-- box.watch was introduced in tarantool 2.10.0
if not utils.tarantool_supports_box_watch() then
log.warn('This version of tarantool does not support autoswitch to safe mode during rebalance. '
Expand Down Expand Up @@ -131,20 +134,54 @@ local function rebalance_stop()
end

--- Clear the vshard router route map and remember when it happened.
-- The router instance is resolved first, so the timestamp is only recorded
-- when the lookup itself did not raise.
-- @return whatever the router's `_route_map_clear` method returns
local function router_cache_clear()
    local router = utils.get_vshard_router_instance()
    M._router_cache_last_clear_ts = fiber.time()
    -- Exactly one return: Lua requires `return` to be the last statement
    -- of a block, so a second one after it is a syntax error.
    return router:_route_map_clear()
end

--- Report how many bucket routes the vshard router currently knows.
-- @return the `known_bucket_count` field of the vshard router instance
local function router_cache_length()
    -- Exactly one return: Lua requires `return` to be the last statement
    -- of a block, so a second one after it is a syntax error.
    return utils.get_vshard_router_instance().known_bucket_count
end

--- Timestamp of the most recent router cache clear.
-- @return the value stored in `M._router_cache_last_clear_ts`
local function router_cache_last_clear_ts()
    local last_clear_ts = M._router_cache_last_clear_ts
    return last_clear_ts
end

-- Rebalance related metrics
-- Gauge updated by update_storage_metrics(); assigned in enable_storage_metrics().
local safe_mode_enabled_gauge

-- Metrics callback: publishes the safe mode flag as 1 (enabled) or 0 (disabled).
-- Kept as one module-level function so that every enable_storage_metrics()
-- call hands the very same function object to the metrics registry.
local function update_storage_metrics()
    safe_mode_enabled_gauge:set(safe_mode_status() and 1 or 0)
end

--- Enable storage-side rebalance metrics.
-- Does nothing when the optional `metrics` module could not be loaded.
-- Safe to call repeatedly: a fresh closure is no longer created per call.
-- NOTE(review): assumes the registry de-duplicates callbacks by function
-- identity - confirm against the `metrics` version in use.
local function enable_storage_metrics()
    if not has_metrics_module then
        return
    end

    safe_mode_enabled_gauge = metrics.gauge(
        'tnt_crud_storage_safe_mode_enabled',
        "is safe mode enabled on this storage instance"
    )

    metrics.register_callback(update_storage_metrics)
end

-- Gauges updated by update_router_metrics(); assigned in enable_router_metrics().
local router_cache_length_gauge
local router_cache_last_clear_ts_gauge

-- Metrics callback: refreshes both router cache gauges.
-- Kept as one module-level function so that every enable_router_metrics()
-- call hands the very same function object to the metrics registry.
local function update_router_metrics()
    router_cache_length_gauge:set(router_cache_length())
    router_cache_last_clear_ts_gauge:set(router_cache_last_clear_ts())
end

--- Enable router-side rebalance metrics.
-- Does nothing when the optional `metrics` module could not be loaded.
-- Safe to call repeatedly: a fresh closure is no longer created per call.
-- NOTE(review): assumes the registry de-duplicates callbacks by function
-- identity - confirm against the `metrics` version in use.
local function enable_router_metrics()
    if not has_metrics_module then
        return
    end

    router_cache_length_gauge = metrics.gauge(
        'tnt_crud_router_cache_length',
        "number of bucket routes in vshard router cache"
    )
    router_cache_last_clear_ts_gauge = metrics.gauge(
        'tnt_crud_router_cache_last_clear_ts',
        "when vshard router cache was cleared last time"
    )

    metrics.register_callback(update_router_metrics)
end

M.init = rebalance_init
M.stop = rebalance_stop
M.safe_mode_status = safe_mode_status
Expand All @@ -167,4 +204,9 @@ M.storage_api = {
rebalance_safe_mode_disable = safe_mode_disable,
}

-- Entry points that switch on rebalance-related metrics for each instance role.
M.metrics = {
    enable_router_metrics = enable_router_metrics,
    enable_storage_metrics = enable_storage_metrics,
}

return M
94 changes: 94 additions & 0 deletions test/integration/metrics_test.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
local helpers = require('test.helper')
local t = require('luatest')

local pgroup = t.group('metrics_integration', helpers.backend_matrix({
{engine = 'memtx'},
}))

-- Start the default cluster once for the whole group.
pgroup.before_all(function(g)
    helpers.start_default_cluster(g, 'srv_stats')
end)

-- Stop the cluster after the last test of the group.
pgroup.after_all(function(g)
    helpers.stop_cluster(g.cluster, g.params.backend)
end)

-- Before every test: load crud on the router and disable safe mode on
-- every storage.
pgroup.before_each(function(g)
    g.router:eval("crud = require('crud')")

    local function disable_safe_mode(server)
        server:call('_crud.rebalance_safe_mode_disable')
    end
    helpers.call_on_storages(g.cluster, disable_safe_mode)
end)

-- Expression evaluated on an instance to collect fresh metric observations.
local METRICS_COLLECT_EXPR = "return require('metrics').collect({ invoke_callbacks = true })"

-- Returns the value of the first observation named `metric_name`,
-- or nil when no such observation is present.
local function find_metric_value(observed, metric_name)
    for _, observation in pairs(observed) do
        if observation.metric_name == metric_name then
            return observation.value
        end
    end
    return nil
end

-- Asserts that the safe mode gauge collected on `server` equals `expected`;
-- fails the test when the gauge is not reported at all.
local function assert_safe_mode_metric(server, expected, message)
    local observed = server:eval(METRICS_COLLECT_EXPR)
    local value = find_metric_value(observed, 'tnt_crud_storage_safe_mode_enabled')
    if value == nil then
        t.fail('No tnt_crud_storage_safe_mode_enabled metric found')
    end
    t.assert_equals(value, expected, message)
end

-- Reads the router's last-cache-clear timestamp gauge; 0 when it is missing,
-- so that the "greater than" assertions below fail in that case.
local function router_last_clear_ts(router)
    local observed = router:eval(METRICS_COLLECT_EXPR)
    return find_metric_value(observed, 'tnt_crud_router_cache_last_clear_ts') or 0
end

--- Checks that the storage safe mode gauge follows safe mode switches and
-- that the router's last-clear timestamp grows after a cache clear.
pgroup.test_safe_mode_metrics = function(g)
    -- A bare require() raises when the module is missing, which would turn
    -- the intended skip into an error; pcall yields a boolean instead.
    local has_metrics_module = pcall(require, 'metrics')
    t.skip_if(not has_metrics_module, 'No metrics module in current version')

    -- Check safe mode metric on storage
    helpers.call_on_storages(g.cluster, function(server)
        assert_safe_mode_metric(server, 0, 'Metric shows safe mode disabled')
    end)

    -- Enable safe mode
    helpers.call_on_storages(g.cluster, function(server)
        server:call('_crud.rebalance_safe_mode_enable')
    end)

    -- Check that metric value has changed
    helpers.call_on_storages(g.cluster, function(server)
        assert_safe_mode_metric(server, 1, 'Metric shows safe mode enabled')
    end)

    -- Check router cache metric
    local first_ts = router_last_clear_ts(g.router)
    t.assert_gt(first_ts, 0, 'Last cache clear TS is greater than zero')

    -- Clear router cache
    g.router:eval("crud.rebalance.router_cache_clear()")

    -- Check that last_clear_ts has changed
    local new_ts = router_last_clear_ts(g.router)
    t.assert_gt(new_ts, first_ts, 'Last cache clear TS is greater than the first one')
end
Loading