
Commit 77da938

Merge branch 'm/tso-client-optimization' into m/tso-opt-with-hard-limit

2 parents c109bd3 + d3aabb3

File tree: 4 files changed, +216 -56 lines

client/histogram.go

Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
package pd

import (
    "fmt"
    "strings"
    "sync/atomic"
    "time"

    "github.com/pingcap/log"
    "go.uber.org/zap"
)

type Histogram struct {
    buckets   []int
    sum       float64
    sumSquare float64
    count     int
    interval  float64
    cutoff    float64
}

func NewHistogram(interval float64, bucketsCount int, cutoff float64) *Histogram {
    return &Histogram{
        buckets:  make([]int, bucketsCount),
        interval: interval,
        count:    0,
        cutoff:   cutoff,
    }
}

func (h *Histogram) Observe(value float64) {
    if value >= h.cutoff {
        return
    }

    index := int(value / h.interval)
    for index >= len(h.buckets) {
        h.buckets = append(h.buckets, 0)
    }

    h.buckets[index]++
    h.count++
    h.sum += value
    h.sumSquare += value * value
}

func (h *Histogram) GetPercentile(p float64) float64 {
    if h.count == 0 {
        return 0
    }
    limit := float64(h.count) * p
    result := 0.
    for i := 0; i < len(h.buckets); i += 1 {
        samplesInBucket := float64(h.buckets[i])
        if samplesInBucket >= limit {
            result += limit / samplesInBucket * h.interval
            break
        }
        result += h.interval
        limit -= samplesInBucket
    }
    return result
}

func (h *Histogram) GetAvg() float64 {
    return h.sum / float64(h.count)
}

func (h *Histogram) String() string {
    sb := &strings.Builder{}
    _, err := fmt.Fprintf(sb, "{ count: %v, sum: %v, sum_square: %v, interval: %v, buckets.len: %v, buckets: [", h.count, h.sum, h.sumSquare, h.interval, len(h.buckets))
    if err != nil {
        panic("unreachable")
    }

    if len(h.buckets) > 0 {
        put := func(value, count int) {
            if count == 1 {
                _, err = fmt.Fprintf(sb, "%v;", value)
            } else {
                _, err = fmt.Fprintf(sb, "%v,%v;", value, count)
            }
            if err != nil {
                panic("unreachable")
            }
        }

        lastValue := h.buckets[0]
        lastValueCount := 1

        for i := 1; i < len(h.buckets); i++ {
            if h.buckets[i] == lastValue {
                lastValueCount++
                continue
            }

            put(lastValue, lastValueCount)
            lastValue = h.buckets[i]
            lastValueCount = 1
        }

        put(lastValue, lastValueCount)
    }

    _, err = sb.WriteString("] }")
    if err != nil {
        panic("unreachable")
    }

    return sb.String()
}

func (h *Histogram) Clear() {
    h.sum = 0
    h.sumSquare = 0
    h.count = 0
    for i := 0; i < len(h.buckets); i++ {
        h.buckets[i] = 0
    }
}

type AutoDumpHistogram struct {
    name                  string
    mainHistogram         *Histogram
    backHistogram         *Histogram
    accumulated           *Histogram
    isDumping             atomic.Bool
    lastDumpHistogramTime time.Time
    dumpInterval          time.Duration
}

func NewAutoDumpingHistogram(name string, interval float64, bucketsCount int, cutoff float64, dumpInterval time.Duration) *AutoDumpHistogram {
    return &AutoDumpHistogram{
        name:                  name,
        mainHistogram:         NewHistogram(interval, bucketsCount, cutoff),
        backHistogram:         NewHistogram(interval, bucketsCount, cutoff),
        accumulated:           NewHistogram(interval, bucketsCount, cutoff),
        lastDumpHistogramTime: time.Now(),
        dumpInterval:          dumpInterval,
    }
}

func (h *AutoDumpHistogram) Observe(value float64, now time.Time) {
    // Not thread-safe.
    h.mainHistogram.Observe(value)
    if now.Sub(h.lastDumpHistogramTime) >= h.dumpInterval && !h.isDumping.Load() {
        h.isDumping.Store(true)
        h.mainHistogram, h.backHistogram = h.backHistogram, h.mainHistogram
        h.lastDumpHistogramTime = now
        go h.dump(now)
    }
}

func (h *AutoDumpHistogram) dump(now time.Time) {
    defer h.isDumping.Store(false)

    h.accumulated.sum += h.backHistogram.sum
    h.accumulated.sumSquare += h.backHistogram.sumSquare
    h.accumulated.count += h.backHistogram.count
    for i := 0; i < len(h.accumulated.buckets) && i < len(h.backHistogram.buckets); i++ {
        h.accumulated.buckets[i] += h.backHistogram.buckets[i]
    }
    if len(h.backHistogram.buckets) > len(h.accumulated.buckets) {
        h.accumulated.buckets = append(h.accumulated.buckets, h.backHistogram.buckets[len(h.accumulated.buckets):]...)
    }

    log.Info("dumping histogram", zap.String("name", h.name), zap.Time("time", now), zap.Stringer("histogram", h.accumulated))

    h.backHistogram.Clear()
}
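As a quick orientation to the new type, here is a minimal, standalone sketch of how the plain Histogram could be exercised on its own. The import path and the sample values are assumptions for illustration and are not part of this commit:

package main

import (
    "fmt"

    pd "github.com/tikv/pd/client" // assumed import path for the client package on this branch
)

func main() {
    // 1ms-wide buckets, 100 initial buckets; samples at or above 1s are dropped.
    h := pd.NewHistogram(0.001, 100, 1)
    for _, seconds := range []float64{0.0004, 0.0012, 0.0018, 0.0031} {
        h.Observe(seconds)
    }
    fmt.Println(h.GetAvg())            // mean of the recorded samples
    fmt.Println(h.GetPercentile(0.99)) // interpolates linearly inside the target bucket
    fmt.Println(h)                     // run-length-encoded bucket dump via String()
}

AutoDumpHistogram wraps two of these in a double buffer: Observe fills the main histogram, and once dumpInterval has elapsed it swaps main and back and logs the accumulated totals from a background goroutine.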

client/tso_batch_controller.go

Lines changed: 7 additions & 4 deletions
@@ -15,6 +15,7 @@
 package pd
 
 import (
+    "math"
     "runtime/trace"
     "time"
 
@@ -113,9 +114,10 @@ func newTSOBatchController(maxBatchSize int) *tsoBatchController {
 //    return nil
 //}
 
-func (tbc *tsoBatchController) pushRequest(tsoReq *tsoRequest) {
+func (tbc *tsoBatchController) pushRequest(tsoReq *tsoRequest, beforeReceiveDurationHist *AutoDumpHistogram, now time.Time) {
     tbc.collectedRequests[tbc.collectedRequestCount] = tsoReq
     tbc.collectedRequestCount++
+    beforeReceiveDurationHist.Observe(math.Max(now.Sub(tsoReq.start).Seconds(), 0), now)
 }
 
 func (tbc *tsoBatchController) getCollectedRequests() []*tsoRequest {
@@ -142,16 +144,17 @@ func (tbc *tsoBatchController) finishCollectedRequests(physical, firstLogical in
 
 }
 
-func finishCollectedRequests(requests []*tsoRequest, physical, firstLogical int64, suffixBits uint32, err error, statFunc func(latency time.Duration)) {
+func finishCollectedRequests(requests []*tsoRequest, physical, firstLogical int64, suffixBits uint32, err error, statFunc func(latency time.Duration, now time.Time)) {
+    now := time.Now()
     for i := 0; i < len(requests); i++ {
         tsoReq := requests[i]
         // Retrieve the request context before the request is done to trace without race.
         requestCtx := tsoReq.requestCtx
         tsoReq.physical, tsoReq.logical = physical, tsoutil.AddLogical(firstLogical, int64(i), suffixBits)
-        tsoReq.tryDone(err)
         if statFunc != nil {
-            statFunc(time.Since(tsoReq.start))
+            statFunc(now.Sub(tsoReq.start), now)
         }
+        tsoReq.tryDone(err)
         trace.StartRegion(requestCtx, "pdclient.tsoReqDequeue").End()
     }
 }

client/tso_dispatcher.go

Lines changed: 22 additions & 7 deletions
@@ -90,6 +90,10 @@ type tsoDispatcher struct {
     tsoReqTokenCh chan struct{}
 
     dispatcherID string
+
+    beforeHandleDurationHist   *AutoDumpHistogram
+    batchWaitTimerDuration     *AutoDumpHistogram
+    batchNoWaitCollectDuration *AutoDumpHistogram
 }
 
 func newTSODispatcher(
@@ -109,6 +113,7 @@ func newTSODispatcher(
     //        maxBatchSize,
     //    )
     //})
+    id := fmt.Sprintf("%d", dispatcherIDAlloc.Add(1))
     td := &tsoDispatcher{
         ctx:    dispatcherCtx,
         cancel: dispatcherCancel,
@@ -126,7 +131,11 @@ func newTSODispatcher(
         updateConnectionCtxsCh: make(chan struct{}, 1),
         tsoReqTokenCh:          make(chan struct{}, 256),
 
-        dispatcherID: fmt.Sprintf("%d", dispatcherIDAlloc.Add(1)),
+        dispatcherID: id,
+
+        beforeHandleDurationHist:   NewAutoDumpingHistogram("beforeHandleDurationHist-"+id, 2e-5, 2000, 1, time.Minute),
+        batchWaitTimerDuration:     NewAutoDumpingHistogram("batchWaitTimerDurationHist-"+id, 2e-5, 2000, 1, time.Minute),
+        batchNoWaitCollectDuration: NewAutoDumpingHistogram("batchNoWaitCollectDurationHist-"+id, 2e-5, 2000, 1, time.Minute),
     }
     go td.watchTSDeadline()
     return td
@@ -302,7 +311,7 @@ tsoBatchLoop:
             return
         case firstRequest := <-td.reqChan:
             batchController.batchStartTime = time.Now()
-            batchController.pushRequest(firstRequest)
+            batchController.pushRequest(firstRequest, td.beforeHandleDurationHist, batchController.batchStartTime)
             // Token is not ready. Continue the loop to wait for the token or another request.
             continue
         case <-td.tsoReqTokenCh:
@@ -317,7 +326,7 @@ tsoBatchLoop:
                 return
             case firstRequest := <-td.reqChan:
                 batchController.batchStartTime = time.Now()
-                batchController.pushRequest(firstRequest)
+                batchController.pushRequest(firstRequest, td.beforeHandleDurationHist, batchController.batchStartTime)
                 // Token is not ready. Continue the loop to wait for the token or another request.
             }
         }
@@ -388,7 +397,8 @@ tsoBatchLoop:
         latency := stream.EstimatedRoundTripLatency()
         estimateTSOLatencyGauge.WithLabelValues(td.dispatcherID, streamURL).Set(latency.Seconds())
         totalBatchTime := latency / time.Duration(concurrencyFactor)
-        remainingBatchTime := totalBatchTime - time.Since(currentBatchStartTime)
+        waitTimerStart := time.Now()
+        remainingBatchTime := totalBatchTime - waitTimerStart.Sub(currentBatchStartTime)
         if remainingBatchTime > 0 && !batchNoWait {
             if !batchingTimer.Stop() {
                 select {
@@ -405,13 +415,16 @@ tsoBatchLoop:
                         zap.String("dc-location", dc))
                     return
                 case req := <-td.reqChan:
-                    batchController.pushRequest(req)
+                    batchController.pushRequest(req, td.beforeHandleDurationHist, time.Now())
                 case <-batchingTimer.C:
                     break batchingLoop
                 }
             }
         }
+        waitTimerEnd := time.Now()
+        td.batchWaitTimerDuration.Observe(waitTimerEnd.Sub(waitTimerStart).Seconds(), waitTimerEnd)
 
+        nowaitCollectStart := time.Now()
         // Continue collecting as many as possible without blocking
     nonWaitingBatchLoop:
         for {
@@ -421,11 +434,13 @@ tsoBatchLoop:
                     zap.String("dc-location", dc))
                 return
             case req := <-td.reqChan:
-                batchController.pushRequest(req)
+                batchController.pushRequest(req, td.beforeHandleDurationHist, nowaitCollectStart)
             default:
                 break nonWaitingBatchLoop
             }
         }
+        nowaitCollectEnd := time.Now()
+        td.batchWaitTimerDuration.Observe(nowaitCollectEnd.Sub(nowaitCollectStart).Seconds(), nowaitCollectEnd)
 
         //done := make(chan struct{})
         //dl := newTSDeadline(option.timeout, done, cancel)
@@ -598,7 +613,7 @@ func (td *tsoDispatcher) cancelCollectedRequests(tbc *tsoBatchController, err er
     td.tsoReqTokenCh <- struct{}{}
 }
 
-func (td *tsoDispatcher) onBatchedRespReceived(reqID uint64, result tsoRequestResult, err error, statFunc func(latency time.Duration)) {
+func (td *tsoDispatcher) onBatchedRespReceived(reqID uint64, result tsoRequestResult, err error, statFunc func(latency time.Duration, now time.Time)) {
     tbc, loaded := td.pendingBatches.LoadAndDelete(reqID)
     if !loaded {
         log.Info("received response for already abandoned requests")
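The three dispatcher histograms are all created with 20µs-wide buckets (2000 of them, roughly 40ms of range before the bucket slice grows), a 1-second cutoff, and a one-minute dump interval. Below is a minimal sketch of the same measure-and-observe pattern in isolation; the import path, the histogram name, and the sleep used as a stand-in workload are assumptions, not part of the commit:

package main

import (
    "time"

    pd "github.com/tikv/pd/client" // assumed import path for the client package on this branch
)

func main() {
    // Same parameters as the dispatcher's histograms: 20µs-wide buckets,
    // 2000 initial buckets, samples >= 1s dropped, accumulated totals
    // logged once a minute by a background goroutine.
    waitHist := pd.NewAutoDumpingHistogram("exampleWaitHist", 2e-5, 2000, 1, time.Minute)

    for i := 0; i < 10; i++ {
        waitStart := time.Now()
        time.Sleep(200 * time.Microsecond) // stands in for the batching wait
        waitEnd := time.Now()
        // Observe is not thread-safe; call it from the single goroutine that
        // owns the histogram, as the dispatcher loop does.
        waitHist.Observe(waitEnd.Sub(waitStart).Seconds(), waitEnd)
    }
}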
