Skip to content

Commit e91a691

Browse files
authored
Merge pull request #11077 from vegaprotocol/add-snapshot-debugging
feat: add snapshot debug config
2 parents 074531c + efcc42a commit e91a691

File tree

4 files changed

+116
-0
lines changed

4 files changed

+116
-0
lines changed

core/processor/abci.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,9 @@ type SnapshotEngine interface {
131131
ReceiveSnapshotChunk(context.Context, *types.RawChunk, string) tmtypes.ResponseApplySnapshotChunk
132132
RetrieveSnapshotChunk(uint64, uint32, uint32) (*types.RawChunk, error)
133133
HasRestoredStateAlready() bool
134+
135+
// debug snapshot issues/hash mismatch problems
136+
SnapshotDump(ctx context.Context, path string) ([]byte, error)
134137
}
135138

136139
type StateVarEngine interface {
@@ -1115,6 +1118,15 @@ func (app *App) Finalize() []byte {
11151118
if err == nil {
11161119
app.protocolUpgradeService.SetCoreReadyForUpgrade()
11171120
}
1121+
} else if app.cfg.SnapshotDebug.DevEnabled {
1122+
if height, _ := vgcontext.BlockHeightFromContext(app.blockCtx); height == app.cfg.SnapshotDebug.CrashAtHeight {
1123+
hash, err := app.snapshotEngine.SnapshotDump(app.blockCtx, app.cfg.SnapshotDebug.DebugCrashFile)
1124+
if err != nil {
1125+
app.log.Panic("Failed to dump snapshot file", logging.Error(err), logging.String("snapshot-hash", string(hash)))
1126+
} else {
1127+
app.log.Panic("Dumped snapshot file successfully", logging.String("snapshot-hash", string(hash)), logging.String("dump-file", app.cfg.SnapshotDebug.DebugCrashFile))
1128+
}
1129+
}
11181130
} else {
11191131
snapHash, _, err = app.snapshotEngine.Snapshot(app.blockCtx)
11201132
}

core/processor/config.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@ const (
2525
namedLogger = "processor"
2626
)
2727

28+
type Snapshot struct {
29+
DevEnabled encoding.Bool
30+
CrashAtHeight uint64 `long:"crash-with-snapshot-at"`
31+
DebugCrashFile string `long:"snapshot-dump-path"`
32+
}
33+
2834
// Config represent the configuration of the processor package.
2935
type Config struct {
3036
Level encoding.LogLevel `long:"log-level"`
@@ -33,6 +39,7 @@ type Config struct {
3339
LogOrderCancelDebug encoding.Bool `long:"log-order-cancel-debug"`
3440
Ratelimit ratelimit.Config `group:"Ratelimit" namespace:"ratelimit"`
3541
KeepCheckpointsMax uint `long:"keep-checkpoints-max"`
42+
SnapshotDebug Snapshot `group:"SnapshotDebug" namespace:"snapshotdebug"`
3643
}
3744

3845
// NewDefaultConfig creates an instance of the package specific configuration, given a
@@ -43,5 +50,10 @@ func NewDefaultConfig() Config {
4350
LogOrderSubmitDebug: true,
4451
Ratelimit: ratelimit.NewDefaultConfig(),
4552
KeepCheckpointsMax: 20,
53+
SnapshotDebug: Snapshot{
54+
DevEnabled: false,
55+
CrashAtHeight: 0,
56+
DebugCrashFile: "/tmp/snapshot.json",
57+
},
4658
}
4759
}

core/snapshot/engine.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@
1616
package snapshot
1717

1818
import (
19+
"bufio"
1920
"context"
2021
"errors"
2122
"fmt"
23+
"os"
2224
"reflect"
2325
"sort"
2426
"sync"
@@ -34,9 +36,12 @@ import (
3436
"code.vegaprotocol.io/vega/libs/proto"
3537
"code.vegaprotocol.io/vega/logging"
3638
"code.vegaprotocol.io/vega/paths"
39+
snapshotpb "code.vegaprotocol.io/vega/protos/vega/snapshot/v1"
3740
"code.vegaprotocol.io/vega/version"
3841

3942
tmtypes "github.com/cometbft/cometbft/abci/types"
43+
"github.com/gogo/protobuf/jsonpb"
44+
"github.com/libp2p/go-libp2p/p2p/host/autonat/pb"
4045
"go.uber.org/zap"
4146
"golang.org/x/exp/slices"
4247
)
@@ -512,6 +517,52 @@ func (e *Engine) Snapshot(ctx context.Context) ([]byte, DoneCh, error) {
512517
return e.snapshotNow(ctx, true)
513518
}
514519

520+
// SnapshotDump takes a snapshot on demand, without persisting it to the underlying DB
521+
// it's meant to just dump to a file for debugging.
522+
func (e *Engine) SnapshotDump(ctx context.Context, path string) ([]byte, error) {
523+
e.ensureEngineIsStarted()
524+
// dump file
525+
f, err := os.Create(path)
526+
if err != nil {
527+
return nil, err
528+
}
529+
defer func() { _ = f.Close() }()
530+
hash, ch, err := e.snapshotNow(ctx, true)
531+
if err != nil {
532+
return nil, err
533+
}
534+
<-ch
535+
payloads, err := e.snapshotTree.AsProtoPayloads()
536+
if err != nil {
537+
return hash, err
538+
}
539+
540+
w := bufio.NewWriter(f)
541+
m := jsonpb.Marshaler{Indent: " "}
542+
543+
payloadData := struct {
544+
Data []*snapshotpb.Payload `json:"data,omitempty" protobuf:"bytes,1,rep,name=data"`
545+
pb.Message
546+
}{
547+
Data: payloads,
548+
}
549+
550+
s, err := m.MarshalToString(&payloadData)
551+
if err != nil {
552+
return hash, err
553+
}
554+
555+
if _, err = w.WriteString(s); err != nil {
556+
return hash, err
557+
}
558+
559+
if err = w.Flush(); err != nil {
560+
return hash, err
561+
}
562+
563+
return hash, nil
564+
}
565+
515566
// SnapshotNow triggers the snapshot process right now, ignoring the defined
516567
// interval.
517568
func (e *Engine) SnapshotNow(ctx context.Context) ([]byte, error) {

core/snapshot/tree/tree.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,47 @@ func (t *Tree) AsPayloads() ([]*types.Payload, error) {
134134
return payloads, nil
135135
}
136136

137+
func (t *Tree) AsProtoPayloads() ([]*snapshotpb.Payload, error) {
138+
lastSnapshotTree, err := t.innerTree.GetImmutable(t.innerTree.Version())
139+
if err != nil {
140+
return nil, fmt.Errorf("could not generate the immutable AVL tree: %w", err)
141+
}
142+
143+
exporter, err := lastSnapshotTree.Export()
144+
if err != nil {
145+
return nil, fmt.Errorf("could not export the AVL tree: %w", err)
146+
}
147+
defer exporter.Close()
148+
149+
payloads := []*snapshotpb.Payload{}
150+
151+
exportedNode, err := exporter.Next()
152+
for err == nil {
153+
// If there is no value, it means the node is an intermediary node and
154+
// not a leaf. Only leaves hold the data we are looking for.
155+
if exportedNode.Value == nil {
156+
exportedNode, err = exporter.Next()
157+
continue
158+
}
159+
160+
// sort out the payload for this node
161+
payloadProto := &snapshotpb.Payload{}
162+
if perr := proto.Unmarshal(exportedNode.Value, payloadProto); perr != nil {
163+
return nil, perr
164+
}
165+
166+
payloads = append(payloads, payloadProto)
167+
168+
exportedNode, err = exporter.Next()
169+
}
170+
171+
if !errors.Is(err, iavl.ErrorExportDone) {
172+
return nil, fmt.Errorf("failed to export AVL tree: %w", err)
173+
}
174+
175+
return payloads, nil
176+
}
177+
137178
func (t *Tree) FindImmutableTreeByHeight(blockHeight uint64) (*iavl.ImmutableTree, error) {
138179
version, err := t.metadataDB.FindVersionByBlockHeight(blockHeight)
139180
if err != nil {

0 commit comments

Comments
 (0)