diff --git a/client/clients/router/client.go b/client/clients/router/client.go index 188758853c5..5724a05261b 100644 --- a/client/clients/router/client.go +++ b/client/clients/router/client.go @@ -33,6 +33,7 @@ import ( "github.com/pingcap/log" "github.com/tikv/pd/client/errs" + "github.com/tikv/pd/client/metrics" "github.com/tikv/pd/client/opt" "github.com/tikv/pd/client/pkg/batch" cctx "github.com/tikv/pd/client/pkg/connectionctx" @@ -209,8 +210,12 @@ func NewClient( } }, }, - requestCh: make(chan *Request, defaultMaxRouterRequestBatchSize*2), - batchController: batch.NewController(defaultMaxRouterRequestBatchSize, requestFinisher(nil), nil), + requestCh: make(chan *Request, defaultMaxRouterRequestBatchSize*2), + batchController: batch.NewController( + defaultMaxRouterRequestBatchSize, + requestFinisher(nil), + metrics.QueryRegionBestBatchSize, + ), } c.leaderURL.Store(svcDiscovery.GetServingURL()) c.svcDiscovery.ExecAndAddLeaderSwitchedCallback(c.updateLeaderURL) @@ -234,6 +239,7 @@ func (c *Cli) newRequest(ctx context.Context) *Request { req.needBuckets = false req.region = nil // Initialize the runtime fields. + req.start = time.Now() req.pool = c.reqPool return req @@ -523,14 +529,26 @@ func (c *Cli) processRequests(stream pdpb.PD_QueryRegionClient) error { panic("invalid region query request received") } } + start := time.Now() err := stream.Send(queryReq) if err != nil { return err } + metrics.QueryRegionBatchSendLatency.Observe( + time.Since( + c.batchController.GetExtraBatchingStartTime(), + ).Seconds(), + ) resp, err := stream.Recv() if err != nil { + metrics.RequestFailedDurationQueryRegion.Observe(time.Since(start).Seconds()) return err } + metrics.RequestDurationQueryRegion.Observe(time.Since(start).Seconds()) + metrics.QueryRegionBatchSizeTotal.Observe(float64(len(requests))) + metrics.QueryRegionBatchSizeByKeys.Observe(float64(len(queryReq.Keys))) + metrics.QueryRegionBatchSizeByPrevKeys.Observe(float64(len(queryReq.PrevKeys))) + metrics.QueryRegionBatchSizeByIDs.Observe(float64(len(queryReq.Ids))) c.doneCollectedRequests(resp) return nil } diff --git a/client/clients/router/request.go b/client/clients/router/request.go index 4578514597d..350240ac914 100644 --- a/client/clients/router/request.go +++ b/client/clients/router/request.go @@ -18,9 +18,11 @@ import ( "context" "runtime/trace" "sync" + "time" "github.com/pingcap/errors" + "github.com/tikv/pd/client/metrics" "github.com/tikv/pd/client/opt" ) @@ -44,7 +46,8 @@ type Request struct { region *Region // Runtime fields. - pool *sync.Pool + start time.Time + pool *sync.Pool } func (req *Request) tryDone(err error) { @@ -55,14 +58,20 @@ func (req *Request) tryDone(err error) { } func (req *Request) wait() (*Region, error) { - // TODO: introduce the metrics. 
+ start := time.Now() + metrics.CmdDurationQueryRegionAsyncWait.Observe(start.Sub(req.start).Seconds()) select { case err := <-req.done: defer req.pool.Put(req) defer trace.StartRegion(req.requestCtx, "pdclient.regionReqDone").End() + now := time.Now() if err != nil { + metrics.CmdFailedDurationQueryRegionWait.Observe(now.Sub(start).Seconds()) + metrics.CmdFailedDurationQueryRegion.Observe(now.Sub(req.start).Seconds()) return nil, errors.WithStack(err) } + metrics.CmdDurationQueryRegionWait.Observe(now.Sub(start).Seconds()) + metrics.CmdDurationQueryRegion.Observe(now.Sub(req.start).Seconds()) return req.region, nil case <-req.requestCtx.Done(): return nil, errors.WithStack(req.requestCtx.Err()) diff --git a/client/inner_client.go b/client/inner_client.go index 181ee2c9d52..8ad0d3b6b75 100644 --- a/client/inner_client.go +++ b/client/inner_client.go @@ -214,6 +214,10 @@ func (c *innerClient) setup() error { // Create dispatchers c.createTokenDispatcher() + + // Enable the router client + c.initRouterClient() + return nil } diff --git a/client/metrics/metrics.go b/client/metrics/metrics.go index d168f9b46f3..9fbbbbbcb84 100644 --- a/client/metrics/metrics.go +++ b/client/metrics/metrics.go @@ -26,7 +26,7 @@ var initialized int32 func init() { initMetrics(prometheus.Labels{}) - initCmdDurations() + initLabelValues() initRegisteredConsumers() } @@ -56,7 +56,7 @@ func InitAndRegisterMetrics(constLabels prometheus.Labels) { if atomic.CompareAndSwapInt32(&initialized, 0, 1) { // init metrics with constLabels initMetrics(constLabels) - initCmdDurations() + initLabelValues() initRegisteredConsumers() // register metrics registerMetrics() @@ -84,6 +84,12 @@ var ( EstimateTSOLatencyGauge *prometheus.GaugeVec // CircuitBreakerCounters is a vector for different circuit breaker counters CircuitBreakerCounters *prometheus.CounterVec + // QueryRegionBestBatchSize is the histogram of the best batch size of query region requests. + QueryRegionBestBatchSize prometheus.Histogram + // QueryRegionBatchSize is the histogram of the batch size of query region requests. + QueryRegionBatchSize *prometheus.HistogramVec + // QueryRegionBatchSendLatency is the histogram of the latency of sending query region requests. 
+ QueryRegionBatchSendLatency prometheus.Histogram ) func initMetrics(constLabels prometheus.Labels) { @@ -201,6 +207,36 @@ func initMetrics(constLabels prometheus.Labels) { Help: "Circuit breaker counters", ConstLabels: constLabels, }, []string{"name", "event"}) + + QueryRegionBestBatchSize = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: "pd_client", + Subsystem: "request", + Name: "handle_query_region_best_batch_size", + Help: "Bucketed histogram of the best batch size of handled query region requests.", + ConstLabels: constLabels, + Buckets: prometheus.ExponentialBuckets(1, 2, 13), + }) + + QueryRegionBatchSize = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "pd_client", + Subsystem: "request", + Name: "handle_query_region_batch_size", + Help: "Bucketed histogram of the batch size of handled query region requests.", + ConstLabels: constLabels, + Buckets: []float64{1, 2, 4, 8, 10, 14, 18, 22, 26, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 110, 120, 140, 160, 180, 200, 500, 1000}, + }, []string{"type"}) + + QueryRegionBatchSendLatency = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: "pd_client", + Subsystem: "request", + Name: "query_region_batch_send_latency", + ConstLabels: constLabels, + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), + Help: "query region batch send latency", + }) } // CmdDurationXXX and CmdFailedDurationXXX are the durations of the client commands. @@ -230,6 +266,9 @@ var ( CmdDurationPut prometheus.Observer CmdDurationUpdateGCSafePointV2 prometheus.Observer CmdDurationUpdateServiceSafePointV2 prometheus.Observer + CmdDurationQueryRegionAsyncWait prometheus.Observer + CmdDurationQueryRegionWait prometheus.Observer + CmdDurationQueryRegion prometheus.Observer CmdFailedDurationGetRegion prometheus.Observer CmdFailedDurationTSOWait prometheus.Observer @@ -249,6 +288,9 @@ var ( CmdFailedDurationPut prometheus.Observer CmdFailedDurationUpdateGCSafePointV2 prometheus.Observer CmdFailedDurationUpdateServiceSafePointV2 prometheus.Observer + CmdFailedDurationQueryRegionAsyncWait prometheus.Observer + CmdFailedDurationQueryRegionWait prometheus.Observer + CmdFailedDurationQueryRegion prometheus.Observer InternalCmdDurationGetClusterInfo prometheus.Observer InternalCmdDurationGetMembers prometheus.Observer @@ -260,9 +302,18 @@ var ( RequestDurationTSO prometheus.Observer // RequestFailedDurationTSO records the durations of the failed TSO requests. RequestFailedDurationTSO prometheus.Observer + // RequestDurationQueryRegion records the durations of the successful query region requests. + RequestDurationQueryRegion prometheus.Observer + // RequestFailedDurationQueryRegion records the durations of the failed query region requests. + RequestFailedDurationQueryRegion prometheus.Observer + + QueryRegionBatchSizeTotal prometheus.Observer + QueryRegionBatchSizeByKeys prometheus.Observer + QueryRegionBatchSizeByPrevKeys prometheus.Observer + QueryRegionBatchSizeByIDs prometheus.Observer ) -func initCmdDurations() { +func initLabelValues() { // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
CmdDurationTSOWait = cmdDuration.WithLabelValues("wait") CmdDurationTSO = cmdDuration.WithLabelValues("tso") @@ -289,6 +340,9 @@ func initCmdDurations() { CmdDurationPut = cmdDuration.WithLabelValues("put") CmdDurationUpdateGCSafePointV2 = cmdDuration.WithLabelValues("update_gc_safe_point_v2") CmdDurationUpdateServiceSafePointV2 = cmdDuration.WithLabelValues("update_service_safe_point_v2") + CmdDurationQueryRegionAsyncWait = cmdDuration.WithLabelValues("query_region_async_wait") + CmdDurationQueryRegionWait = cmdDuration.WithLabelValues("query_region_wait") + CmdDurationQueryRegion = cmdDuration.WithLabelValues("query_region") CmdFailedDurationGetRegion = cmdFailedDuration.WithLabelValues("get_region") CmdFailedDurationTSOWait = cmdFailedDuration.WithLabelValues("wait") @@ -308,6 +362,9 @@ func initCmdDurations() { CmdFailedDurationPut = cmdFailedDuration.WithLabelValues("put") CmdFailedDurationUpdateGCSafePointV2 = cmdFailedDuration.WithLabelValues("update_gc_safe_point_v2") CmdFailedDurationUpdateServiceSafePointV2 = cmdFailedDuration.WithLabelValues("update_service_safe_point_v2") + CmdFailedDurationQueryRegionAsyncWait = cmdFailedDuration.WithLabelValues("query_region_async_wait") + CmdFailedDurationQueryRegionWait = cmdFailedDuration.WithLabelValues("query_region_wait") + CmdFailedDurationQueryRegion = cmdFailedDuration.WithLabelValues("query_region") InternalCmdDurationGetClusterInfo = internalCmdDuration.WithLabelValues("get_cluster_info") InternalCmdDurationGetMembers = internalCmdDuration.WithLabelValues("get_members") @@ -317,6 +374,13 @@ func initCmdDurations() { RequestDurationTSO = requestDuration.WithLabelValues("tso") RequestFailedDurationTSO = requestDuration.WithLabelValues("tso-failed") + RequestDurationQueryRegion = requestDuration.WithLabelValues("query_region") + RequestFailedDurationQueryRegion = requestDuration.WithLabelValues("query_region-failed") + + QueryRegionBatchSizeTotal = QueryRegionBatchSize.WithLabelValues("total") + QueryRegionBatchSizeByKeys = QueryRegionBatchSize.WithLabelValues("by_keys") + QueryRegionBatchSizeByPrevKeys = QueryRegionBatchSize.WithLabelValues("by_prev_keys") + QueryRegionBatchSizeByIDs = QueryRegionBatchSize.WithLabelValues("by_ids") } func registerMetrics() { @@ -331,4 +395,7 @@ func registerMetrics() { prometheus.MustRegister(RequestForwarded) prometheus.MustRegister(EstimateTSOLatencyGauge) prometheus.MustRegister(CircuitBreakerCounters) + prometheus.MustRegister(QueryRegionBestBatchSize) + prometheus.MustRegister(QueryRegionBatchSize) + prometheus.MustRegister(QueryRegionBatchSendLatency) } diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index 7a83b5d5448..4fc58afe772 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -2,7 +2,7 @@ "__inputs": [ { "name": "DS_TEST-CLUSTER", - "label": "test-cluster", + "label": "${DS_TEST-CLUSTER}", "description": "", "type": "datasource", "pluginId": "prometheus", @@ -10773,7 +10773,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed of handling TiDB requests", + "description": "The server-side time consumed of handling the TSO requests", "editable": true, "error": false, "fill": 1, @@ -10918,7 +10918,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The time consumed of handling TiDB requests", + "description": "The client-side time consumed of handling the TSO requests", "editable": true, "error": false, "fill": 1, @@ -10961,14 +10961,14 @@ "steppedLine": 
false, "targets": [ { - "expr": "avg(rate(pd_client_request_handle_requests_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type) / avg(rate(pd_client_request_handle_requests_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type)", + "expr": "avg(rate(pd_client_request_handle_requests_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"tso.*\"}[30s])) by (type) / avg(rate(pd_client_request_handle_requests_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"tso.*\"}[30s])) by (type)", "intervalFactor": 2, "legendFormat": "avg {{type}}", "refId": "A", "step": 2 }, { - "expr": "histogram_quantile(0.90, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))", + "expr": "histogram_quantile(0.90, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"tso.*\"}[30s])) by (type, le))", "hide": false, "intervalFactor": 2, "legendFormat": "90% {{type}}", @@ -10976,7 +10976,7 @@ "step": 2 }, { - "expr": "histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))", + "expr": "histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"tso.*\"}[30s])) by (type, le))", "hide": false, "intervalFactor": 2, "legendFormat": "99% {{type}}", @@ -10984,7 +10984,7 @@ "step": 2 }, { - "expr": "histogram_quantile(0.999, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (type, le))", + "expr": "histogram_quantile(0.999, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"tso.*\"}[30s])) by (type, le))", "hide": false, "intervalFactor": 2, "legendFormat": "99.9% {{type}}", @@ -10996,7 +10996,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "PD client requests handle duration", + "title": "PD client TSO handle duration", "tooltip": { "msResolution": false, "shared": true, @@ -11040,7 +11040,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The count of TSO grpc requests", + "description": "The count of TSO gRPC requests", "editable": true, "error": false, "fill": 1, @@ -11111,7 +11111,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Handle requests count", + "title": "Handle TSO requests count", "tooltip": { "msResolution": false, "shared": true, @@ -11155,7 +11155,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The batch size of TiDB tso requests", + "description": "The batch size of the TSO requests", "editable": true, "error": false, "fill": 1, @@ -11225,7 +11225,556 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Tso Request batch size", + "title": "TSO request batch size", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": 
"short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The server-side time consumed by handling the QueryRegion requests", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 132 + }, + "hiddenSeries": false, + "id": 1626, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.90, sum(rate(pd_server_query_region_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "90% query", + "refId": "A", + "step": 2 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(pd_server_query_region_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "99% query", + "refId": "B", + "step": 2 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(pd_server_query_region_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "99.9% query", + "refId": "C", + "step": 2 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.90, sum(rate(pd_core_query_region_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (le, type))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "90% {{type}}", + "refId": "D", + "step": 2 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(pd_core_query_region_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (le, type))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "99% {{type}}", + "refId": "E", + "step": 2 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(pd_core_query_region_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[30s])) by (le, type))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "99.9% {{type}}", + "refId": "F", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + 
"title": "PD server QueryRegion handle duration", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The client-side time consumed by handling the QueryRegion requests", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 132 + }, + "hiddenSeries": false, + "id": 1627, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "avg(rate(pd_client_request_handle_requests_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"query_region.*\"}[30s])) by (type) / avg(rate(pd_client_request_handle_requests_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"query_region.*\"}[30s])) by (type)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "avg {{type}}", + "refId": "A", + "step": 2 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.90, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"query_region.*\"}[30s])) by (type, le))", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "90% {{type}}", + "refId": "B", + "step": 2 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"query_region.*\"}[30s])) by (type, le))", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "99% {{type}}", + "refId": "C", + "step": 2 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"query_region.*\"}[30s])) by (type, le))", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "99.9% {{type}}", + "refId": "D", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "PD client QueryRegion handle duration", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": 
"time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The count of the QueryRegion gRPC requests", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 141 + }, + "hiddenSeries": false, + "id": 1628, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(pd_server_query_region_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (type)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "QueryRegion request/secs", + "refId": "A", + "step": 2 + }, + { + "exemplar": true, + "expr": "sum(rate(pd_core_query_region_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (type)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "refId": "B", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Handle QueryRegion requests count", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The batch size of the QueryRegion requests", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 141 + }, + "hiddenSeries": false, + "id": 1629, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "7.5.17", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": 
false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(pd_client_request_handle_query_region_batch_size_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le, type))", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "0.99 {{type}}", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.90, sum(rate(pd_client_request_handle_query_region_batch_size_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le, type))", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "0.90 {{type}}", + "refId": "C" + }, + { + "exemplar": true, + "expr": "sum(rate(pd_client_request_handle_query_region_batch_size_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (type) / sum(rate(pd_client_request_handle_query_region_batch_size_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (type)", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "avg {{type}}", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "QueryRegion request batch size", "tooltip": { "msResolution": false, "shared": true, diff --git a/pkg/core/metrics.go b/pkg/core/metrics.go index 65cc8be861e..709d40b7501 100644 --- a/pkg/core/metrics.go +++ b/pkg/core/metrics.go @@ -82,6 +82,32 @@ var ( regionCollectCount = HeartbeatBreakdownHandleCount.WithLabelValues("CollectRegionStats") otherDurationSum = HeartbeatBreakdownHandleDurationSum.WithLabelValues("Other") otherCount = HeartbeatBreakdownHandleCount.WithLabelValues("Other") + + // QueryRegion statistics + queryRegionDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "pd", + Subsystem: "core", + Name: "query_region_duration_seconds", + Help: "Bucketed histogram of processing time (s) of region query.", + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), + }, []string{"type"}) + + queryRegionCount = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "pd", + Subsystem: "core", + Name: "query_region_count", + Help: "The number of region query types.", + }, []string{"type"}) + + queryRegionByKeysDuration = queryRegionDuration.WithLabelValues("by-keys") + queryRegionByPrevKeysDuration = queryRegionDuration.WithLabelValues("by-prev-keys") + queryRegionByIDsDuration = queryRegionDuration.WithLabelValues("by-ids") + + queryRegionKeysCount = queryRegionCount.WithLabelValues("keys") + queryRegionPrevKeysCount = queryRegionCount.WithLabelValues("prev-keys") + queryRegionIDsCount = queryRegionCount.WithLabelValues("ids") ) func init() { @@ -89,6 +115,8 @@ func init() { prometheus.MustRegister(HeartbeatBreakdownHandleCount) prometheus.MustRegister(AcquireRegionsLockWaitDurationSum) prometheus.MustRegister(AcquireRegionsLockWaitCount) + prometheus.MustRegister(queryRegionDuration) + prometheus.MustRegister(queryRegionCount) } var tracerPool = &sync.Pool{ diff --git a/pkg/core/region.go b/pkg/core/region.go index 92b5b1cccbc..50a981c55e4 100644 --- a/pkg/core/region.go +++ b/pkg/core/region.go @@ -1471,22 +1471,33 @@ func (r *RegionsInfo) QueryRegions( keys, prevKeys [][]byte, ids []uint64, needBuckets bool, ) ([]uint64, []uint64, map[uint64]*pdpb.RegionResponse) { // Iterate the region keys to find the regions. 
+ queryRegionKeysCount.Add(float64(len(keys))) + start := time.Now() regions := r.getRegionsByKeys(keys) + queryRegionByKeysDuration.Observe(time.Since(start).Seconds()) // Assert the returned regions count matches the input keys. if len(regions) != len(keys) { panic("returned regions count mismatch with the input keys") } + // Iterate the prevKeys to find the regions. + queryRegionPrevKeysCount.Add(float64(len(prevKeys))) + start = time.Now() prevRegions := r.getRegionsByPrevKeys(prevKeys) + queryRegionByPrevKeysDuration.Observe(time.Since(start).Seconds()) // Assert the returned regions count matches the input keys. if len(prevRegions) != len(prevKeys) { panic("returned prev regions count mismatch with the input keys") } + // Build the key -> ID map for the final results. regionsByID := make(map[uint64]*pdpb.RegionResponse, len(regions)+len(prevRegions)+len(ids)) keyIDMap := sortOutKeyIDMap(regionsByID, regions, needBuckets) prevKeyIDMap := sortOutKeyIDMap(regionsByID, prevRegions, needBuckets) + // Iterate the region IDs to find the regions. + queryRegionIDsCount.Add(float64(len(ids))) + start = time.Now() for _, id := range ids { // Check if the region has been found. if regionFound, ok := regionsByID[id]; (ok && regionFound != nil) || id == 0 { @@ -1508,6 +1519,8 @@ func (r *RegionsInfo) QueryRegions( regionsByID[id] = regionResp } } + queryRegionByIDsDuration.Observe(time.Since(start).Seconds()) + return keyIDMap, prevKeyIDMap, regionsByID } diff --git a/server/grpc_service.go b/server/grpc_service.go index b985e870a03..c3698763fa2 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -1573,7 +1573,7 @@ func (s *GrpcServer) QueryRegion(stream pdpb.PD_QueryRegionServer) error { request.GetIds(), needBuckets, ) - regionQueryDuration.Observe(time.Since(start).Seconds()) + queryRegionDuration.Observe(time.Since(start).Seconds()) // Build the response and send it to the client. response := &pdpb.QueryRegionResponse{ Header: wrapHeader(), diff --git a/server/metrics.go b/server/metrics.go index dd07447140b..f53385f8728 100644 --- a/server/metrics.go +++ b/server/metrics.go @@ -99,11 +99,11 @@ var ( Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), }) - regionQueryDuration = prometheus.NewHistogram( + queryRegionDuration = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: "pd", Subsystem: "server", - Name: "region_query_duration_seconds", + Name: "query_region_duration_seconds", Help: "Bucketed histogram of processing time (s) of region query requests.", Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), }) @@ -181,7 +181,7 @@ func init() { prometheus.MustRegister(tsoProxyBatchSize) prometheus.MustRegister(tsoProxyForwardTimeoutCounter) prometheus.MustRegister(tsoHandleDuration) - prometheus.MustRegister(regionQueryDuration) + prometheus.MustRegister(queryRegionDuration) prometheus.MustRegister(regionHeartbeatHandleDuration) prometheus.MustRegister(storeHeartbeatHandleDuration) prometheus.MustRegister(bucketReportCounter)
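
Note (illustrative sketch, not part of the patch): on the client side, the new QueryRegion command metrics split each request's latency into an async-wait phase (from request creation in newRequest() until the caller enters wait(), observed by CmdDurationQueryRegionAsyncWait) and a wait phase (blocking in wait() until the batched RPC completes, observed by CmdDurationQueryRegionWait), with CmdDurationQueryRegion covering the full span. The self-contained Go sketch below mirrors that timing breakdown only; the request type, its fields, and main() are hypothetical stand-ins, not the client's actual API.

package main

import (
	"fmt"
	"time"
)

// request mirrors the runtime timing fields added to router.Request:
// start is recorded when the request is created and enqueued for batching.
type request struct {
	start time.Time  // set when the request is created (newRequest in the patch)
	done  chan error // completed by the batch loop once the RPC returns
}

// wait reproduces the phase breakdown used in Request.wait():
// asyncWait = creation -> wait() entry, waitDur = wait() entry -> completion,
// total = creation -> completion. In the patch these feed the
// query_region_async_wait, query_region_wait, and query_region cmd histograms.
func (r *request) wait() (asyncWait, waitDur, total time.Duration, err error) {
	waitStart := time.Now()
	asyncWait = waitStart.Sub(r.start)
	err = <-r.done
	now := time.Now()
	waitDur = now.Sub(waitStart)
	total = now.Sub(r.start)
	return
}

func main() {
	req := &request{start: time.Now(), done: make(chan error, 1)}
	// Simulate the batch loop finishing the RPC after 20ms.
	go func() {
		time.Sleep(20 * time.Millisecond)
		req.done <- nil
	}()
	time.Sleep(5 * time.Millisecond) // caller does other work before blocking
	asyncWait, waitDur, total, err := req.wait()
	fmt.Println(asyncWait, waitDur, total, err)
}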