Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions tidb/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# CHANGELOG - TiDB

## 2.2.0 / 2026-04-19

***Added***:

* Add `tiflash_syncing_data_freshness` histogram metric to track TiFlash replication lag from TiKV ([#XXXX](https://github.com/DataDog/integrations-extras/pull/XXXX))
* Add PD client metrics: `pd_client_cmd_handle_cmds_duration_seconds` and `pd_client_request_handle_requests_duration_seconds`
* Add TiDB session phase duration metrics: `tidb_session_parse_duration_seconds`, `tidb_session_compile_duration_seconds`, `tidb_session_execute_duration_seconds`, `tidb_session_transaction_duration_seconds`
* Add TiDB connection metrics: `tidb_server_get_token_duration_seconds`, `tidb_server_conn_idle_duration_seconds`
* Add TiDB server metrics: `tidb_server_query_total`, `tidb_server_disconnection_total`, `tidb_server_plan_cache_total`, `tidb_server_plan_cache_miss_total`
* Add TiDB TiKV client metric: `tidb_tikvclient_request_seconds`
* Add TiKV raftstore metrics: `tikv_raftstore_append_log_duration_seconds`, `tikv_raftstore_apply_log_duration_seconds`, `tikv_raftstore_commit_log_duration_seconds`, `tikv_raftstore_store_duration_secs`, `tikv_raftstore_apply_duration_secs`
* Add TiKV storage and gRPC metrics: `tikv_storage_engine_async_request_duration_seconds`, `tikv_grpc_msg_duration_seconds`, `tikv_engine_flow_bytes`, `tikv_thread_cpu_seconds_total`
* Add unit tests and fixture data for all new metrics

## 2.1.1 / 2025-10-17

***Added***
Expand Down
502 changes: 502 additions & 0 deletions tidb/assets/dashboards/overview.json

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions tidb/datadog_checks/tidb/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from datadog_checks.base import OpenMetricsBaseCheck

from .metrics import TIDB_METRICS, TIFLASH_METRICS, TIKV_METRICS
from .metrics import PD_METRICS, TIDB_METRICS, TIFLASH_METRICS, TIKV_METRICS
from .utils import build_check


Expand All @@ -29,14 +29,14 @@ def __init__(self, name, init_config, instances=None):
"pd",
{
'pd_metric_url': 'http://localhost:2379/metrics',
'metrics': TIDB_METRICS + TIFLASH_METRICS + TIKV_METRICS,
'metrics': TIDB_METRICS + TIFLASH_METRICS + TIKV_METRICS + PD_METRICS,
},
),
'tidb_cloud': build_check(
"pd",
{
'pd_metric_url': 'http://localhost:2379/metrics',
'metrics': TIDB_METRICS + TIFLASH_METRICS + TIKV_METRICS,
'metrics': TIDB_METRICS + TIFLASH_METRICS + TIKV_METRICS + PD_METRICS,
},
),
}
Expand Down
33 changes: 32 additions & 1 deletion tidb/datadog_checks/tidb/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,22 @@
'tidb_server_execute_error_total',
'tidb_server_handle_query_duration_seconds',
'tidb_server_connections',
'tidb_server_query_total',
'tidb_server_disconnection_total',
'tidb_server_plan_cache_total',
'tidb_server_plan_cache_miss_total',
# session phase duration metrics
'tidb_session_parse_duration_seconds',
'tidb_session_compile_duration_seconds',
'tidb_session_execute_duration_seconds',
'tidb_session_transaction_duration_seconds',
'tidb_server_get_token_duration_seconds',
'tidb_server_conn_idle_duration_seconds',
# tikv client metrics from TiDB side
'tidb_tikvclient_request_seconds',
# cpu metrics
'process_cpu_seconds_total',
'process_start_time_seconds',
# memory metrics
'process_resident_memory_bytes',
# no disk metrics for TiDB
Expand All @@ -14,13 +28,24 @@
# Metric names collected for the TiKV component. check.py concatenates this
# list with the other component lists to build the 'metrics' option.
TIKV_METRICS = [
# cpu metrics
'process_cpu_seconds_total',
'tikv_thread_cpu_seconds_total',
# memory metrics
'process_resident_memory_bytes',
# disk metrics
'tikv_engine_size_bytes',
'tikv_store_size_bytes',
# disk traffic metrics
'tikv_io_bytes',
'tikv_engine_flow_bytes',
# gRPC metrics
'tikv_grpc_msg_duration_seconds',
# raftstore metrics
'tikv_raftstore_append_log_duration_seconds',
'tikv_raftstore_apply_log_duration_seconds',
'tikv_raftstore_commit_log_duration_seconds',
# NOTE: the next two names end in `_secs`, not `_seconds`; metadata.csv and
# tests/expected.py use the same spelling, so keep them in sync.
'tikv_raftstore_store_duration_secs',
'tikv_raftstore_apply_duration_secs',
# storage engine metrics
'tikv_storage_engine_async_request_duration_seconds',
]
TIFLASH_METRICS = [
# cpu metrics
Expand All @@ -30,5 +55,11 @@
# disk metrics
{'tiflash_system_current_metric_StoreSizeUsed': 'tiflash_store_size_used_bytes'},
{'tiflash_system_current_metric_StoreSizeCapacity': 'tiflash_store_size_capacity_bytes'},
# no disk traffic metrics for TiFlash
# replication lag metrics
{'tiflash_syncing_data_freshness': 'tiflash_syncing_data_freshness'},
]
# Metric names with the `pd_client_` prefix. check.py appends this list to the
# TiDB/TiFlash/TiKV lists when building the "pd" checks.
PD_METRICS = [
# client command and request handling duration metrics (histograms)
'pd_client_cmd_handle_cmds_duration_seconds',
'pd_client_request_handle_requests_duration_seconds',
]
41 changes: 41 additions & 0 deletions tidb/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,51 @@ tidb_cluster.tidb_server_execute_error_total,count,,error,,The total number of e
tidb_cluster.tidb_server_connections,gauge,,connection,,Current number of connections in TiDB server,1,tidb,,
tidb_cluster.tidb_server_handle_query_duration_seconds.count,count,,query,,The total number of handled queries in server,1,tidb,,
tidb_cluster.tidb_server_handle_query_duration_seconds.sum,count,,second,,The sum of handled query duration in server,1,tidb,,
tidb_cluster.tidb_server_query_total,count,,query,,The total number of queries processed by TiDB,1,tidb,,
tidb_cluster.tidb_server_disconnection_total,count,,connection,,The total number of disconnections from TiDB,1,tidb,,
tidb_cluster.tidb_server_plan_cache_total,count,,hit,,The total number of plan cache hits in TiDB,1,tidb,,
tidb_cluster.tidb_server_plan_cache_miss_total,count,,miss,,The total number of plan cache misses in TiDB,1,tidb,,
tidb_cluster.tidb_session_parse_duration_seconds.count,count,,query,,The total number of parse operations in TiDB session,1,tidb,,
tidb_cluster.tidb_session_parse_duration_seconds.sum,count,,second,,The sum of parse duration in TiDB session,1,tidb,,
tidb_cluster.tidb_session_compile_duration_seconds.count,count,,query,,The total number of compile operations in TiDB session,1,tidb,,
tidb_cluster.tidb_session_compile_duration_seconds.sum,count,,second,,The sum of compile duration in TiDB session,1,tidb,,
tidb_cluster.tidb_session_execute_duration_seconds.count,count,,query,,The total number of execute operations in TiDB session,1,tidb,,
tidb_cluster.tidb_session_execute_duration_seconds.sum,count,,second,,The sum of execute duration in TiDB session,1,tidb,,
tidb_cluster.tidb_session_transaction_duration_seconds.count,count,,transaction,,The total number of transactions in TiDB session,1,tidb,,
tidb_cluster.tidb_session_transaction_duration_seconds.sum,count,,second,,The sum of transaction duration in TiDB session,1,tidb,,
tidb_cluster.tidb_server_get_token_duration_seconds.count,count,,query,,The total number of token acquisitions in TiDB,1,tidb,,
tidb_cluster.tidb_server_get_token_duration_seconds.sum,count,,second,,The sum of token acquisition duration in TiDB,1,tidb,,
tidb_cluster.tidb_server_conn_idle_duration_seconds.count,count,,connection,,The total number of idle connection samples in TiDB,1,tidb,,
tidb_cluster.tidb_server_conn_idle_duration_seconds.sum,count,,second,,The sum of idle connection duration in TiDB,1,tidb,,
tidb_cluster.tidb_tikvclient_request_seconds.count,count,,request,,The total number of TiKV client requests from TiDB,1,tidb,,
tidb_cluster.tidb_tikvclient_request_seconds.sum,count,,second,,The sum of TiKV client request duration from TiDB,1,tidb,,
tidb_cluster.tikv_engine_size_bytes,gauge,,byte,,The disk usage bytes of TiKV instances,1,tidb,,
tidb_cluster.tikv_store_size_bytes,gauge,,byte,,The disk capacity bytes of TiKV instances,1,tidb,,
tidb_cluster.tikv_io_bytes,count,,byte,,The io read/write bytes of TiKV instances,1,tidb,,
tidb_cluster.tikv_engine_flow_bytes,count,,byte,,The flow bytes through TiKV engine,1,tidb,,
tidb_cluster.tikv_thread_cpu_seconds_total,count,,second,,The total CPU time spent by TiKV threads,1,tidb,,
tidb_cluster.tikv_grpc_msg_duration_seconds.count,count,,request,,The total number of gRPC messages processed by TiKV,1,tidb,,
tidb_cluster.tikv_grpc_msg_duration_seconds.sum,count,,second,,The sum of gRPC message processing duration in TiKV,1,tidb,,
tidb_cluster.tikv_raftstore_append_log_duration_seconds.count,count,,operation,,The total number of raft log append operations in TiKV,1,tidb,,
tidb_cluster.tikv_raftstore_append_log_duration_seconds.sum,count,,second,,The sum of raft log append duration in TiKV,1,tidb,,
tidb_cluster.tikv_raftstore_apply_log_duration_seconds.count,count,,operation,,The total number of raft log apply operations in TiKV,1,tidb,,
tidb_cluster.tikv_raftstore_apply_log_duration_seconds.sum,count,,second,,The sum of raft log apply duration in TiKV,1,tidb,,
tidb_cluster.tikv_raftstore_commit_log_duration_seconds.count,count,,operation,,The total number of raft log commit operations in TiKV,1,tidb,,
tidb_cluster.tikv_raftstore_commit_log_duration_seconds.sum,count,,second,,The sum of raft log commit duration in TiKV,1,tidb,,
tidb_cluster.tikv_raftstore_store_duration_secs.count,count,,operation,,The total number of raft store operations in TiKV,1,tidb,,
tidb_cluster.tikv_raftstore_store_duration_secs.sum,count,,second,,The sum of raft store operation duration in TiKV,1,tidb,,
tidb_cluster.tikv_raftstore_apply_duration_secs.count,count,,operation,,The total number of raft apply operations in TiKV,1,tidb,,
tidb_cluster.tikv_raftstore_apply_duration_secs.sum,count,,second,,The sum of raft apply operation duration in TiKV,1,tidb,,
tidb_cluster.tikv_storage_engine_async_request_duration_seconds.count,count,,request,,The total number of async storage engine requests in TiKV,1,tidb,,
tidb_cluster.tikv_storage_engine_async_request_duration_seconds.sum,count,,second,,The sum of async storage engine request duration in TiKV,1,tidb,,
tidb_cluster.tiflash_store_size_used_bytes,gauge,,byte,,The disk usage bytes of TiFlash instances,1,tidb,,
tidb_cluster.tiflash_store_size_capacity_bytes,gauge,,byte,,The disk capacity bytes of TiFlash instances,1,tidb,,
tidb_cluster.tiflash_syncing_data_freshness.sum,count,,second,,The sum of observed replication lag in seconds from TiKV to TiFlash,1,tidb,,
tidb_cluster.tiflash_syncing_data_freshness.count,count,,query,,The total number of TiFlash replication lag observations,1,tidb,,
tidb_cluster.tiflash_syncing_data_freshness.bucket,count,,query,,The histogram buckets for TiFlash replication lag,1,tidb,,
tidb_cluster.pd_client_cmd_handle_cmds_duration_seconds.count,count,,command,,The total number of PD client commands handled,1,tidb,,
tidb_cluster.pd_client_cmd_handle_cmds_duration_seconds.sum,count,,second,,The sum of PD client command handling duration,1,tidb,,
tidb_cluster.pd_client_request_handle_requests_duration_seconds.count,count,,request,,The total number of PD client requests handled,1,tidb,,
tidb_cluster.pd_client_request_handle_requests_duration_seconds.sum,count,,second,,The sum of PD client request handling duration,1,tidb,,
tidb_cluster.process_cpu_seconds_total,count,,second,,The cpu usage seconds of TiDB/TiKV/TiFlash instances,1,tidb,,
tidb_cluster.process_resident_memory_bytes,gauge,,byte,,The resident memory bytes of TiDB/TiKV/TiFlash instances,1,tidb,,
22 changes: 22 additions & 0 deletions tidb/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,19 @@ def mock_tikv_metrics():
yield


@pytest.fixture()
def mock_pd_metrics():
    """Patch ``requests.Session.get`` to serve the canned PD metrics fixture.

    While the fixture is active, every ``Session.get`` call returns a mock
    response with status 200 and a ``text/plain`` content type. Its
    ``iter_lines`` yields the content that ``_get_mock_metrics`` loads for
    ``mock_pd_metrics.txt``, split on newlines.
    """

    def _fixture_lines(**kwargs):
        # Keyword arguments passed by the caller are accepted and ignored.
        payload = _get_mock_metrics("mock_pd_metrics.txt")
        return payload.split("\n")

    fake_response = mock.MagicMock(
        status_code=200,
        iter_lines=_fixture_lines,
        headers={'Content-Type': "text/plain"},
    )
    with mock.patch('requests.Session.get', return_value=fake_response):
        yield


def _get_mock_metrics(filename):
f_name = os.path.join(os.path.dirname(__file__), 'fixtures', filename)
with open(f_name, 'r') as f:
Expand Down Expand Up @@ -119,6 +132,15 @@ def tikv_instance():
}


@pytest.fixture(scope="session")
def pd_instance():
    """Return the instance configuration used to run the check against PD.

    The metrics URL is built from the module-level ``HOST`` and ``PD_PORT``.
    """
    pd_endpoint = "http://{}:{}/metrics".format(HOST, PD_PORT)
    instance = {
        'pd_metric_url': pd_endpoint,
        'max_returned_metrics': "10000",
        'tags': ['tidb_cluster_name:test'],
    }
    return instance


# Integration test docker-compose environment


Expand Down
133 changes: 133 additions & 0 deletions tidb/tests/expected.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,62 @@
'tidb_cluster_component:tidb',
'tidb_cluster_name:test',
],
'tidb_cluster.tidb_server_query_total': [
'type:OK',
'tidb_cluster_component:tidb',
'tidb_cluster_name:test',
],
'tidb_cluster.tidb_server_disconnection_total': [
'result:ok',
'tidb_cluster_component:tidb',
'tidb_cluster_name:test',
],
'tidb_cluster.tidb_server_plan_cache_total': [
'type:hit',
'tidb_cluster_component:tidb',
'tidb_cluster_name:test',
],
'tidb_cluster.tidb_server_plan_cache_miss_total': [
'type:miss',
'tidb_cluster_component:tidb',
'tidb_cluster_name:test',
],
'tidb_cluster.tidb_session_parse_duration_seconds.sum': [
'sql_type:general',
'tidb_cluster_component:tidb',
'tidb_cluster_name:test',
],
'tidb_cluster.tidb_session_compile_duration_seconds.sum': [
'sql_type:general',
'tidb_cluster_component:tidb',
'tidb_cluster_name:test',
],
'tidb_cluster.tidb_session_execute_duration_seconds.sum': [
'type:general',
'tidb_cluster_component:tidb',
'tidb_cluster_name:test',
],
'tidb_cluster.tidb_session_transaction_duration_seconds.sum': [
'sql_type:general',
'type:commit',
'tidb_cluster_component:tidb',
'tidb_cluster_name:test',
],
'tidb_cluster.tidb_server_get_token_duration_seconds.sum': [
'tidb_cluster_component:tidb',
'tidb_cluster_name:test',
],
'tidb_cluster.tidb_server_conn_idle_duration_seconds.sum': [
'in_txn:0',
'tidb_cluster_component:tidb',
'tidb_cluster_name:test',
],
'tidb_cluster.tidb_tikvclient_request_seconds.sum': [
'store:1',
'type:Prewrite',
'tidb_cluster_component:tidb',
'tidb_cluster_name:test',
],
'tidb_cluster.process_cpu_seconds_total': [
'tidb_cluster_component:tidb',
'tidb_cluster_name:test',
Expand Down Expand Up @@ -47,6 +103,15 @@
'tidb_cluster_component:tiflash',
'tidb_cluster_name:test',
],
'tidb_cluster.tiflash_syncing_data_freshness.sum': [
'tidb_cluster_component:tiflash',
'tidb_cluster_name:test',
],
'tidb_cluster.tiflash_syncing_data_freshness.count': [
'upper_bound:none',
'tidb_cluster_component:tiflash',
'tidb_cluster_name:test',
],
},
'service_check': {
'tidb_cluster.prometheus.health': [
Expand Down Expand Up @@ -96,6 +161,52 @@
'tidb_cluster_component:tikv',
'tidb_cluster_name:test',
],
'tidb_cluster.tikv_engine_flow_bytes': [
'db:kv',
'type:keys_read',
'tidb_cluster_component:tikv',
'tidb_cluster_name:test',
],
'tidb_cluster.tikv_thread_cpu_seconds_total': [
'name:raftstore',
'tidb_cluster_component:tikv',
'tidb_cluster_name:test',
],
'tidb_cluster.tikv_grpc_msg_duration_seconds.sum': [
'type:kv_get',
'tidb_cluster_component:tikv',
'tidb_cluster_name:test',
],
'tidb_cluster.tikv_raftstore_append_log_duration_seconds.sum': [
'type:normal',
'tidb_cluster_component:tikv',
'tidb_cluster_name:test',
],
'tidb_cluster.tikv_raftstore_apply_log_duration_seconds.sum': [
'type:normal',
'tidb_cluster_component:tikv',
'tidb_cluster_name:test',
],
'tidb_cluster.tikv_raftstore_commit_log_duration_seconds.sum': [
'type:normal',
'tidb_cluster_component:tikv',
'tidb_cluster_name:test',
],
'tidb_cluster.tikv_raftstore_store_duration_secs.sum': [
'type:normal',
'tidb_cluster_component:tikv',
'tidb_cluster_name:test',
],
'tidb_cluster.tikv_raftstore_apply_duration_secs.sum': [
'type:normal',
'tidb_cluster_component:tikv',
'tidb_cluster_name:test',
],
'tidb_cluster.tikv_storage_engine_async_request_duration_seconds.sum': [
'type:write',
'tidb_cluster_component:tikv',
'tidb_cluster_name:test',
],
'tidb_cluster.process_cpu_seconds_total': [
'tidb_cluster_component:tikv',
'tidb_cluster_name:test',
Expand All @@ -113,3 +224,25 @@
],
},
}

# Expected PD submissions: each metric / service-check name maps to a list of
# tags (presumably the tags the tests assert on -- confirm against the tests).
EXPECTED_PD = {
    'metrics': {
        # Both PD client histograms share the same tag set, so the entries
        # are generated from the metric names; each gets its own list.
        metric_name: [
            'type:tso',
            'tidb_cluster_component:pd',
            'tidb_cluster_name:test',
        ]
        for metric_name in (
            'tidb_cluster.pd_client_cmd_handle_cmds_duration_seconds.sum',
            'tidb_cluster.pd_client_request_handle_requests_duration_seconds.sum',
        )
    },
    'service_check': {
        'tidb_cluster.prometheus.health': [
            'endpoint:http://localhost:2379/metrics',
            'tidb_cluster_component:pd',
            'tidb_cluster_name:test',
        ],
    },
}
14 changes: 14 additions & 0 deletions tidb/tests/fixtures/mock_pd_metrics.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# HELP pd_client_cmd_handle_cmds_duration_seconds Bucketed histogram of processing time (s) of handled cmds.
# TYPE pd_client_cmd_handle_cmds_duration_seconds histogram
pd_client_cmd_handle_cmds_duration_seconds_bucket{type="tso",le="0.001"} 500
pd_client_cmd_handle_cmds_duration_seconds_bucket{type="tso",le="0.005"} 520
pd_client_cmd_handle_cmds_duration_seconds_bucket{type="tso",le="+Inf"} 520
pd_client_cmd_handle_cmds_duration_seconds_sum{type="tso"} 0.21
pd_client_cmd_handle_cmds_duration_seconds_count{type="tso"} 520
# HELP pd_client_request_handle_requests_duration_seconds Bucketed histogram of processing time (s) of handled requests.
# TYPE pd_client_request_handle_requests_duration_seconds histogram
pd_client_request_handle_requests_duration_seconds_bucket{type="tso",le="0.001"} 500
pd_client_request_handle_requests_duration_seconds_bucket{type="tso",le="0.005"} 520
pd_client_request_handle_requests_duration_seconds_bucket{type="tso",le="+Inf"} 520
pd_client_request_handle_requests_duration_seconds_sum{type="tso"} 0.19
pd_client_request_handle_requests_duration_seconds_count{type="tso"} 520
Loading
Loading