Skip to content

Commit

Permalink
added atomic transaction metrics to vttablet
Browse files Browse the repository at this point in the history
Signed-off-by: Harshit Gangal <harshit@planetscale.com>
  • Loading branch information
harshit-gangal committed Oct 15, 2024
1 parent 3bb138f commit 4d877f9
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 8 deletions.
7 changes: 6 additions & 1 deletion go/vt/vttablet/tabletserver/dt_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,12 @@ func (dte *DTExecutor) CommitPrepared(dtid string) (err error) {
defer func() {
if err != nil {
log.Warningf("failed to commit the prepared transaction '%s' with error: %v", dtid, err)
dte.te.checkErrorAndMarkFailed(ctx, dtid, err, "TwopcCommit")
fail := dte.te.checkErrorAndMarkFailed(ctx, dtid, err, "TwopcCommit")
if fail {
dte.te.env.Stats().CommitPreparedFail.Add("NonRetryable", 1)
} else {
dte.te.env.Stats().CommitPreparedFail.Add("Retryable", 1)
}
}
dte.te.txPool.RollbackAndRelease(ctx, conn)
}()
Expand Down
11 changes: 9 additions & 2 deletions go/vt/vttablet/tabletserver/tabletenv/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ type Stats struct {
ErrorCounters *stats.CountersWithSingleLabel
InternalErrors *stats.CountersWithSingleLabel
Warnings *stats.CountersWithSingleLabel
Unresolved *stats.GaugesWithSingleLabel // For now, only Prepares are tracked
UserTableQueryCount *stats.CountersWithMultiLabels // Per CallerID/table counts
UserTableQueryTimesNs *stats.CountersWithMultiLabels // Per CallerID/table latencies
UserTransactionCount *stats.CountersWithMultiLabels // Per CallerID transaction counts
Expand All @@ -49,6 +48,11 @@ type Stats struct {
UserReservedTimesNs *stats.CountersWithSingleLabel // Per CallerID reserved connection duration

QueryTimingsByTabletType *servenv.TimingsWrapper // Query timings split by current tablet type

// Atomic Transactions
Unresolved *stats.GaugesWithSingleLabel
CommitPreparedFail *stats.CountersWithSingleLabel
RedoPreparedFail *stats.CountersWithSingleLabel
}

// NewStats instantiates a new set of stats scoped by exporter.
Expand Down Expand Up @@ -83,7 +87,6 @@ func NewStats(exporter *servenv.Exporter) *Stats {
),
InternalErrors: exporter.NewCountersWithSingleLabel("InternalErrors", "Internal component errors", "type", "Task", "StrayTransactions", "Panic", "HungQuery", "Schema", "TwopcCommit", "TwopcResurrection", "WatchdogFail", "Messages"),
Warnings: exporter.NewCountersWithSingleLabel("Warnings", "Warnings", "type", "ResultsExceeded"),
Unresolved: exporter.NewGaugesWithSingleLabel("Unresolved", "Unresolved items", "item_type", "Prepares"),
UserTableQueryCount: exporter.NewCountersWithMultiLabels("UserTableQueryCount", "Queries received for each CallerID/table combination", []string{"TableName", "CallerID", "Type"}),
UserTableQueryTimesNs: exporter.NewCountersWithMultiLabels("UserTableQueryTimesNs", "Total latency for each CallerID/table combination", []string{"TableName", "CallerID", "Type"}),
UserTransactionCount: exporter.NewCountersWithMultiLabels("UserTransactionCount", "transactions received for each CallerID", []string{"CallerID", "Conclusion"}),
Expand All @@ -98,6 +101,10 @@ func NewStats(exporter *servenv.Exporter) *Stats {
UserReservedTimesNs: exporter.NewCountersWithSingleLabel("UserReservedTimesNs", "Total reserved connection latency for each CallerID", "CallerID"),

QueryTimingsByTabletType: exporter.NewTimings("QueryTimingsByTabletType", "Query timings broken down by active tablet type", "TabletType"),

Unresolved: exporter.NewGaugesWithSingleLabel("UnresolvedTransaction", "Unresolved items", "ManagerType"),
CommitPreparedFail: exporter.NewCountersWithSingleLabel("CommitPreparedFail", "failed prepared transactions commit", "FailureType"),
RedoPreparedFail: exporter.NewCountersWithSingleLabel("RedoPreparedFail", "failed prepared transactions on redo", "FailureType"),
}
stats.QPSRates = exporter.NewRates("QPS", stats.QueryTimings, 15*60/5, 5*time.Second)
return stats
Expand Down
12 changes: 7 additions & 5 deletions go/vt/vttablet/tabletserver/tx_engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ type TxEngine struct {
// 2. TabletControls have been set in the tablet record, and Query service is going to be disabled.
twopcAllowed []bool
shutdownGracePeriod time.Duration
coordinatorAddress string
abandonAge time.Duration
ticks *timer.Timer

Expand Down Expand Up @@ -454,6 +453,9 @@ func (te *TxEngine) prepareFromRedo() error {
allErrs = append(allErrs, vterrors.Wrapf(err, "dtid - %v", preparedTx.Dtid))
if prepFailed {
failedCounter++
te.env.Stats().RedoPreparedFail.Add("NonRetryable", 1)
} else {
te.env.Stats().RedoPreparedFail.Add("Retryable", 1)
}
} else {
preparedCounter++
Expand Down Expand Up @@ -580,14 +582,13 @@ func (te *TxEngine) startTransactionWatcher() {
ctx, cancel := context.WithTimeout(tabletenv.LocalContext(), te.abandonAge/4)
defer cancel()

// Raise alerts on prepares that have been unresolved for too long.
// Use 5x abandonAge to give opportunity for transaction coordinator to resolve these redo logs.
count, err := te.twoPC.CountUnresolvedRedo(ctx, time.Now().Add(-te.abandonAge*5))
// Track redo logs that are still unresolved as of the abandonAge cutoff (now - abandonAge).
count, err := te.twoPC.CountUnresolvedRedo(ctx, time.Now().Add(-te.abandonAge))
if err != nil {
te.env.Stats().InternalErrors.Add("RedoWatcherFail", 1)
log.Errorf("Error reading prepared transactions: %v", err)
}
te.env.Stats().Unresolved.Set("Prepares", count)
te.env.Stats().Unresolved.Set("ResourceManager", count)

// Notify lingering distributed transactions.
count, err = te.twoPC.CountUnresolvedTransaction(ctx, time.Now().Add(-te.abandonAge))
Expand All @@ -596,6 +597,7 @@ func (te *TxEngine) startTransactionWatcher() {
log.Errorf("Error reading unresolved transactions: %v", err)
return
}
te.env.Stats().Unresolved.Set("MetadataManager", count)
if count > 0 {
te.dxNotify()
}
Expand Down

0 comments on commit 4d877f9

Please sign in to comment.