Skip to content

Commit bf6f9c5

Browse files
authored
feat: do not recover from panics with fatal errors (#3534)
* feat: do not recover from panics with fatal errors * Do not panic on logical compaction issues (bugs)
1 parent c46bfc4 commit bf6f9c5

File tree

4 files changed

+134
-6
lines changed

4 files changed

+134
-6
lines changed

pkg/experiment/metastore/metastore_fsm.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package metastore
22

33
import (
4+
"errors"
45
"fmt"
56
"io"
67
"reflect"
@@ -57,6 +58,16 @@ type fsmError struct {
5758
err error
5859
}
5960

61+
type fatalCommandError struct {
62+
err error
63+
}
64+
65+
func (e fatalCommandError) Error() string {
66+
return fmt.Sprintf("fatal FSM command error: %v", e.err)
67+
}
68+
69+
func (e fatalCommandError) Unwrap() error { return e }
70+
6071
func errResponse(l *raft.Log, err error) fsmResponse {
6172
return fsmResponse{err: &fsmError{log: l, err: err}}
6273
}
@@ -129,6 +140,10 @@ func handleCommand[Req, Resp proto.Message](raw []byte, cmd *raft.Log, call comm
129140
var resp fsmResponse
130141
defer func() {
131142
if r := recover(); r != nil {
143+
var fCommandError fatalCommandError
144+
if errors.As(r.(error), &fCommandError) {
145+
panic(fCommandError)
146+
}
132147
resp.err = util.PanicError(r)
133148
}
134149
}()
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
package metastore
2+
3+
import (
4+
"errors"
5+
"testing"
6+
7+
"github.com/hashicorp/raft"
8+
"github.com/stretchr/testify/assert"
9+
"google.golang.org/protobuf/proto"
10+
11+
metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1"
12+
)
13+
14+
var testError = errors.New("test error")
15+
var typedNil *metastorev1.AddBlockResponse = nil
16+
17+
func Test_handleCommandErrorHandling(t *testing.T) {
18+
type args[Req proto.Message, Resp proto.Message] struct {
19+
raw []byte
20+
cmd *raft.Log
21+
call commandCall[Req, Resp]
22+
}
23+
type testCase[Req proto.Message, Resp proto.Message] struct {
24+
name string
25+
args args[Req, Resp]
26+
want fsmResponse
27+
wantPanic bool
28+
}
29+
tests := []testCase[*metastorev1.AddBlockRequest, *metastorev1.AddBlockResponse]{
30+
{
31+
name: "no error",
32+
args: args[*metastorev1.AddBlockRequest, *metastorev1.AddBlockResponse]{
33+
raw: make([]byte, 0),
34+
cmd: &raft.Log{},
35+
call: func(log *raft.Log, request *metastorev1.AddBlockRequest) (*metastorev1.AddBlockResponse, error) {
36+
return &metastorev1.AddBlockResponse{}, nil
37+
},
38+
},
39+
want: fsmResponse{
40+
msg: &metastorev1.AddBlockResponse{},
41+
err: nil,
42+
},
43+
},
44+
{
45+
name: "a simple error is returned",
46+
args: args[*metastorev1.AddBlockRequest, *metastorev1.AddBlockResponse]{
47+
raw: make([]byte, 0),
48+
cmd: &raft.Log{},
49+
call: func(log *raft.Log, request *metastorev1.AddBlockRequest) (*metastorev1.AddBlockResponse, error) {
50+
return nil, testError
51+
},
52+
},
53+
want: fsmResponse{
54+
msg: typedNil,
55+
err: testError,
56+
},
57+
},
58+
{
59+
name: "a panic with a fatal error results in a real panic",
60+
args: args[*metastorev1.AddBlockRequest, *metastorev1.AddBlockResponse]{
61+
raw: make([]byte, 0),
62+
cmd: &raft.Log{},
63+
call: func(log *raft.Log, request *metastorev1.AddBlockRequest) (*metastorev1.AddBlockResponse, error) {
64+
panic(fatalCommandError{testError})
65+
},
66+
},
67+
wantPanic: true,
68+
},
69+
}
70+
for _, tt := range tests {
71+
t.Run(tt.name, func(t *testing.T) {
72+
defer func() {
73+
if r := recover(); r != nil {
74+
assert.True(t, tt.wantPanic)
75+
}
76+
}()
77+
assert.Equal(t, tt.want, handleCommand(tt.args.raw, tt.args.cmd, tt.args.call))
78+
})
79+
}
80+
}

pkg/experiment/metastore/metastore_state_poll_compaction_jobs.go

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -65,12 +65,12 @@ func (m *metastoreState) pollCompactionJobs(request *compactorv1.PollCompactionJ
6565
level.Warn(m.logger).Log("msg", "job is not assigned to the worker", "job", jobUpdate.JobName, "raft_log_index", jobUpdate.RaftLogIndex)
6666
continue
6767
}
68-
jobKey := tenantShard{tenant: job.TenantId, shard: job.Shard}
6968
level.Debug(m.logger).Log("msg", "processing status update for compaction job", "job", jobUpdate.JobName, "status", jobUpdate.Status)
7069
switch jobUpdate.Status {
7170
case compactorv1.CompactionStatus_COMPACTION_STATUS_SUCCESS:
7271
// clean up the job, we don't keep completed jobs around
7372
m.compactionJobQueue.evict(job.Name, job.RaftLogIndex)
73+
jobKey := tenantShard{tenant: job.TenantId, shard: job.Shard}
7474
stateUpdate.deletedJobs[jobKey] = append(stateUpdate.deletedJobs[jobKey], job.Name)
7575
m.compactionMetrics.completedJobs.WithLabelValues(
7676
fmt.Sprint(job.Shard), job.TenantId, fmt.Sprint(job.CompactionLevel)).Inc()
@@ -179,7 +179,7 @@ func (m *metastoreState) pollCompactionJobs(request *compactorv1.PollCompactionJ
179179

180180
err = m.writeToDb(stateUpdate)
181181
if err != nil {
182-
panic(fmt.Errorf("error writing metastore compaction state to db: %w", err))
182+
panic(fatalCommandError{fmt.Errorf("error persisting metadata state to db, %w", err)})
183183
}
184184

185185
return resp, nil
@@ -268,7 +268,12 @@ func (m *metastoreState) writeToDb(sTable *pollStateUpdate) error {
268268
for _, b := range blocks {
269269
block := m.findBlock(shard, b)
270270
if block == nil {
271-
return fmt.Errorf("block %s not found in shard %d", b, shard)
271+
level.Error(m.logger).Log(
272+
"msg", "a newly compacted block could not be found",
273+
"block", b,
274+
"shard", shard,
275+
)
276+
continue
272277
}
273278
name, key := keyForBlockMeta(shard, "", b)
274279
err := updateBlockMetadataBucket(tx, name, func(bucket *bbolt.Bucket) error {
@@ -294,7 +299,11 @@ func (m *metastoreState) writeToDb(sTable *pollStateUpdate) error {
294299
for _, jobName := range sTable.newJobs {
295300
job := m.findJob(jobName)
296301
if job == nil {
297-
return fmt.Errorf("job %s not found", jobName)
302+
level.Error(m.logger).Log(
303+
"msg", "a newly added job could not be found",
304+
"job", jobName,
305+
)
306+
continue
298307
}
299308
err := m.persistCompactionJob(job.Shard, job.TenantId, job, tx)
300309
if err != nil {
@@ -305,7 +314,13 @@ func (m *metastoreState) writeToDb(sTable *pollStateUpdate) error {
305314
for _, l := range levels {
306315
queue := m.getOrCreateCompactionBlockQueue(key).blocksByLevel[l]
307316
if queue == nil {
308-
return fmt.Errorf("block queue for %v and level %d not found", key, l)
317+
level.Error(m.logger).Log(
318+
"msg", "block queue not found",
319+
"shard", key.shard,
320+
"tenant", key.tenant,
321+
"level", l,
322+
)
323+
continue
309324
}
310325
err := m.persistCompactionJobBlockQueue(key.shard, key.tenant, l, queue, tx)
311326
if err != nil {
@@ -327,7 +342,11 @@ func (m *metastoreState) writeToDb(sTable *pollStateUpdate) error {
327342
for _, jobName := range sTable.updatedJobs {
328343
job := m.findJob(jobName)
329344
if job == nil {
330-
return fmt.Errorf("job %s not found", jobName)
345+
level.Error(m.logger).Log(
346+
"msg", "an updated job could not be found",
347+
"job", jobName,
348+
)
349+
continue
331350
}
332351
err := m.persistCompactionJob(job.Shard, job.TenantId, job, tx)
333352
if err != nil {

pkg/experiment/metastore/metastore_state_poll_compaction_jobs_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,20 @@ func Test_FailedCompaction(t *testing.T) {
271271
verifyCompactionState(t, m)
272272
}
273273

274+
func Test_PanicWithDbErrors(t *testing.T) {
275+
m := initState(t)
276+
addLevel0Blocks(m, 20)
277+
278+
// set up panic recovery
279+
defer func() {
280+
r := recover()
281+
require.NotNilf(t, r, "we should panic when a DB error is returned")
282+
}()
283+
// close the db, this should cause errors when persisting the state
284+
_ = m.db.boltdb.Close()
285+
_, _ = m.pollCompactionJobs(&compactorv1.PollCompactionJobsRequest{JobCapacity: 2}, 20, 20)
286+
}
287+
274288
func addLevel0Blocks(m *metastoreState, count int) {
275289
for i := 0; i < count; i++ {
276290
b := createBlock(i, 0, "", 0)

0 commit comments

Comments
 (0)