2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Failover coordinator section with status and visibility panels (#247)

### Changed

### Fixed
3 changes: 2 additions & 1 deletion config.yml
@@ -86,9 +86,10 @@ metrics_prefix: ''
# - tdg_iproto
# - tdg_rest_api
# - tdg_tasks
# default is [cluster_tarantool3, replication_tarantool3, http, net, slab, mvcc, space, vinyl, cpu, runtime, luajit, operations, crud, expirationd]
# default is [cluster_tarantool3, failover_coordinator, replication_tarantool3, http, net, slab, mvcc, space, vinyl, cpu, runtime, luajit, operations, crud, expirationd]
sections:
- cluster_tarantool3
- failover_coordinator
- replication_tarantool3
- http
- net
1 change: 1 addition & 0 deletions dashboard/build/config.libsonnet
@@ -28,6 +28,7 @@ local variable = import 'dashboard/variable.libsonnet';
  metrics_prefix: '',
  sections: [
    'cluster_tarantool3',
    'failover_coordinator',
    'replication_tarantool3',
    'http',
    'net',
186 changes: 186 additions & 0 deletions dashboard/panels/cluster.libsonnet
@@ -11,6 +11,192 @@ local prometheus = grafana.prometheus;

{
  row:: common.row('Cluster overview'),
  failover_coordinator_row:: common.row('Failover coordinator'),

  coordinators_status(
    cfg,
    title='Coordinators status',
  ):: tablePanel.new(
    title=title,
    datasource=cfg.datasource,
    transform='table',
  ).addTarget(
    if cfg.type == variable.datasource_type.prometheus then
      local filters_obj = common.remove_field(cfg.filters, 'alias');
      local filters = common.prometheus_query_filters(filters_obj);
      local metric = std.format('%starantool_coordinator_active', [cfg.metrics_prefix]);
      prometheus.target(
        expr=if filters == '' then metric else std.format('%s{%s}', [metric, filters]),
        format='table',
        instant=true,
      )
    else
      error 'InfluxDB target is not supported yet'
  ) {
    options: { cellHeight: 'sm', showHeader: true },
    fieldConfig: {
      defaults: {
        custom: {
          align: 'auto',
          cellOptions: { type: 'auto' },
          footer: { reducers: [] },
          inspect: false,
        },
        mappings: [],
        thresholds: {
          mode: 'absolute',
          steps: [
            { color: 'green', value: 0 },
            { color: 'red', value: 80 },
          ],
        },
      },
      overrides: [
        {
          matcher: { id: 'byName', options: 'status' },
          properties: [
            { id: 'custom.cellOptions', value: { type: 'color-text' } },
            {
              id: 'mappings',
              value: [
                {
                  type: 'value',
                  options: {
                    '0': { color: 'yellow', text: 'passive' },
                    '1': { color: 'green', text: 'active' },
                  },
                },
                {
                  type: 'special',
                  options: {
                    match: 'null',
                    result: { color: 'red', text: 'disconnected' },
                  },
                },
              ],
            },
          ],
        },
      ],
    },
    transformations: [
      {
        id: 'organize',
        options: {
          excludeByName: {
            Time: true,
            __name__: true,
            instance: true,
            job: true,
          },
          indexByName: { Value: 1, alias: 0 },
          renameByName: { Value: 'status', alias: 'uuid' },
        },
      },
    ],
  },

  instances_seen_by_coordinators(
    cfg,
    title='Instances seen by coordinators',
  ):: tablePanel.new(
    title=title,
    datasource=cfg.datasource,
    transform='table',
  ).addTarget(
    if cfg.type == variable.datasource_type.prometheus then
      local filters_obj = common.remove_field(cfg.filters, 'alias');
      local filters = common.prometheus_query_filters(filters_obj);
      local metric = std.format('%starantool_instance_status', [cfg.metrics_prefix]);
      prometheus.target(
        expr=if filters == '' then metric else std.format('%s{%s}', [metric, filters]),
        format='table',
        instant=true,
      )
    else
      error 'InfluxDB target is not supported yet'
  ) {
    options: { cellHeight: 'sm', showHeader: true },
    fieldConfig: {
      defaults: {
        custom: {
          align: 'auto',
          cellOptions: { type: 'auto' },
          footer: { reducers: [] },
          inspect: false,
        },
        mappings: [],
        thresholds: {
          mode: 'absolute',
          steps: [
            { color: 'red', value: null },
            { color: 'green', value: 1 },
          ],
        },
      },
      overrides: [
        {
          matcher: { id: 'byName', options: 'status' },
          properties: [
            { id: 'custom.cellOptions', value: { type: 'color-text' } },
            {
              id: 'mappings',
              value: [
                {
                  type: 'value',
                  options: {
                    '0': { color: 'red', text: 'down' },
                    '1': { color: 'green', text: 'alive' },
                  },
                },
                {
                  type: 'special',
                  options: {
                    match: 'nan',
                    result: { color: 'red', text: 'unknown' },
                  },
                },
              ],
            },
          ],
        },
      ],
    },
    transformations: [
      {
        id: 'organize',
        options: {
          excludeByName: {
            Time: true,
            __name__: true,
            job: true,
            exported_job: true,
            endpoint: true,
            namespace: true,
            pod: true,
            service: true,
            instance: true,
          },
          renameByName: {
            alias: 'coordinator uuid',
            exported_instance: 'instance',
            Value: 'status',
          },
        },
      },
      {
        id: 'organize',
        options: {
          indexByName: {
            'coordinator uuid': 0,
            replicaset: 1,
            instance: 2,
            status: 3,
          },
        },
      },
    ],
  },

  health_overview_table(
    cfg,
6 changes: 6 additions & 0 deletions dashboard/section.libsonnet
@@ -59,6 +59,12 @@ local vinyl = import 'dashboard/panels/vinyl.libsonnet';
    cluster.election_term(cfg),
  ],

  failover_coordinator(cfg):: if cfg.type == variable.datasource_type.prometheus then [
    cluster.failover_coordinator_row,
    cluster.coordinators_status(cfg) { gridPos: { w: 10, h: 14, x: 0, y: 3 } },
    cluster.instances_seen_by_coordinators(cfg) { gridPos: { w: 14, h: 14, x: 10, y: 3 } },
  ] else [],

  cluster_cartridge(cfg):: if cfg.type == variable.datasource_type.prometheus then [
    // Must be used only at the top of a dashboard; overall stat panels use a complicated layout.
    cluster.row,
21 changes: 21 additions & 0 deletions doc/monitoring/grafana_dashboard.rst
@@ -63,6 +63,21 @@ to enable statistics export.
.. image:: images/Prometheus_dashboard_3.png
    :width: 30%

-------------------------------------------------------------------------------
Monitor failover coordinators
-------------------------------------------------------------------------------

The Tarantool 3 dashboard now includes a dedicated **Failover coordinator**
section (Prometheus only). It shows:

- **Coordinators status**: whether each failover coordinator is currently
  active or passive.
- **Instances seen by coordinators**: the instances visible to each
  coordinator, together with their status in the cluster.
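
Both tables are built from two gauge metrics, ``tarantool_coordinator_active``
and ``tarantool_instance_status``. As a rough illustration (the label values
below mirror the mock exporter from ``example_cluster`` and are not
prescriptive), the underlying series look like this:

.. code-block:: text

    tarantool_coordinator_active{alias="<coordinator uuid>"} 1
    tarantool_coordinator_active{alias="<another coordinator uuid>"} 0
    tarantool_instance_status{alias="<coordinator uuid>",replicaset="storages_1",instance="storage_1_master"} 1
    tarantool_instance_status{alias="<coordinator uuid>",replicaset="storages_2",instance="storage_2_replica"} 0

A value of ``1`` marks an active coordinator or an alive instance, ``0`` a
passive coordinator or an instance that is down; missing or NaN values are
rendered as disconnected or unknown.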

.. image:: images/failover_coordinator_section.png
    :width: 75%
    :alt: Failover coordinator section
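
The section is part of the default Tarantool 3 dashboard. If you assemble a
custom dashboard from a configuration file, list it in ``sections`` explicitly;
a minimal sketch based on the defaults from ``config.yml``:

.. code-block:: yaml

    sections:
      - cluster_tarantool3
      - failover_coordinator
      - replication_tarantool3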

.. _monitoring-grafana_dashboard-monitoring_stack:

-------------------------------------------------------------------------------
@@ -139,6 +154,9 @@ to Telegraf configuration including each Tarantool instance metrics URL:
"label_pairs_name",
"label_pairs_index_name",
"label_pairs_delta",
"label_pairs_replicaset",
"label_pairs_instance",
"label_pairs_exported_instance",
"label_pairs_stream",
"label_pairs_thread",
"label_pairs_kind"
@@ -180,6 +198,9 @@ For TDG dashboard, please use
"label_pairs_delta",
"label_pairs_stream",
"label_pairs_thread",
"label_pairs_replicaset",
"label_pairs_instance",
"label_pairs_exported_instance",
"label_pairs_type",
"label_pairs_connector_name",
"label_pairs_broker_name",
[Binary file (new dashboard screenshot): not rendered in the diff view]
54 changes: 54 additions & 0 deletions example_cluster/tarantool3_project/roles/pseudoapp.lua
@@ -75,6 +75,60 @@ local function apply(cfg)

    httpd:start()

    local coordinator_status = metrics.gauge(
        'tarantool_coordinator_active',
        'Mock failover coordinator status'
    )
    local instance_status = metrics.gauge(
        'tarantool_instance_status',
        'Mock failover coordinator visibility'
    )

    fiber.create(function()
        local uuid = require('uuid')

        local active = 1

        local uuid_a = tostring(uuid.new())
        local uuid_b = tostring(uuid.new())

        while true do
            coordinator_status:set(active, {
                alias = uuid_a,
            })
            coordinator_status:set((active + 1) % 2, {
                alias = uuid_b,
            })

            instance_status:set(1, {
                alias = uuid_a,
                replicaset = 'storages_1',
                instance = 'storage_1_master',
            })

            instance_status:set(1, {
                alias = uuid_a,
                replicaset = 'storages_2',
                instance = 'storage_2_master',
            })

            instance_status:set(1, {
                alias = uuid_b,
                replicaset = 'storages_1',
                instance = 'storage_1_replica',
            })

            instance_status:set(0, {
                alias = uuid_b,
                replicaset = 'storages_2',
                instance = 'storage_2_replica',
            })

            active = (active + 1) % 2
            fiber.sleep(5)
        end
    end)

    box.watch('box.status', function()
        if box.info.ro then
            return
2 changes: 2 additions & 0 deletions supported_metrics.md
@@ -126,6 +126,8 @@ Based on [tarantool/metrics 1.6.1](https://github.com/tarantool/metrics/releases
- **tnt_cartridge_cluster_issues**: unsupported (decided not to support: superseded by **tnt_cartridge_issues**)
- [x] **tnt_cartridge_failover_trigger_total**: see *Cluster overview/Failovers triggered* panel ([#178](https://github.com/tarantool/grafana-dashboard/issues/178))
- [x] **tnt_cartridge_config_checksum**: see *Cluster overview/Configuration checksum* panel ([#242](https://github.com/tarantool/grafana-dashboard/pull/242))
- [x] **tarantool_coordinator_active**: see *Failover coordinator/Coordinators status* panel ([#247](https://github.com/tarantool/grafana-dashboard/issues/247))
- [x] **tarantool_instance_status**: see *Failover coordinator/Instances seen by coordinators* panel ([#247](https://github.com/tarantool/grafana-dashboard/issues/247))
- [x] **tnt_synchro_queue_owner**: see *Replication overview/Synchronous queue owner* panel ([#178](https://github.com/tarantool/grafana-dashboard/issues/178))
- [x] **tnt_synchro_queue_term**: see *Replication overview/Synchronous queue term* panel ([#178](https://github.com/tarantool/grafana-dashboard/issues/178))
- [x] **tnt_synchro_queue_len**: see *Replication overview/Synchronous queue transactions* panel ([#178](https://github.com/tarantool/grafana-dashboard/issues/178))
1 change: 1 addition & 0 deletions tests/Prometheus/dashboard_tarantool3.yml
@@ -8,6 +8,7 @@ filters:
alias: ['=~', '$alias']
sections:
- cluster_tarantool3
- failover_coordinator
- replication_tarantool3
- http
- net