Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(zbchaos): query current cluster topology #459

Merged
merged 7 commits into from
Dec 12, 2023
48 changes: 37 additions & 11 deletions go-chaos/cmd/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,29 +99,55 @@ func waitForChange(flags *Flags) error {
if err != nil {
return err
}
if topology.LastChange != nil && topology.LastChange.Id == int64(flags.changeId) {
if topology.LastChange.Status == "COMPLETED" {
internal.LogInfo("Change %d completed successfully", flags.changeId)
return nil
} else {
return fmt.Errorf("change %d failed with status %s", flags.changeId, topology.LastChange.Status)
}
} else if topology.LastChange != nil && topology.LastChange.Id > int64(flags.changeId) {
changeStatus := describeChangeStatus(topology, int64(flags.changeId))
switch changeStatus {
case ChangeStatusCompleted:
internal.LogInfo("Change %d completed successfully", flags.changeId)
return nil
case ChangeStatusFailed:
internal.LogInfo("Change %d failed with status %s", flags.changeId, topology.LastChange.Status)
return fmt.Errorf("change %d failed with status %s", flags.changeId, topology.LastChange.Status)
case ChangeStatusOutdated:
internal.LogInfo("Change %d is outdated but was most likely completed successfully, latest change is %d", flags.changeId, topology.LastChange.Id)
return nil
} else if topology.PendingChange != nil && topology.PendingChange.Id == int64(flags.changeId) {
case ChangeStatusPending:
competed := len(topology.PendingChange.Completed)
pending := len(topology.PendingChange.Pending)
total := competed + pending
internal.LogInfo("Change %d is %s with %d/%d operations complete", flags.changeId, topology.PendingChange.Status, competed, total)
} else {
case ChangeStatusUnknown:
internal.LogInfo("Change %d not yet started", flags.changeId)
}

time.Sleep(5 * time.Second)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔧 I would add some maximum time out or something, this looks to me like it could potentially loop forever? I know we assume the change will eventually complete, but still... 😅

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes we should definitely fail at some point.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, good point 👍 Picking a good value is tricky though, I'd expect scaling to take anywhere from 5 minutes to >30 minutes..

Using that in an automated chaos test is problematic anyway because we'll run into job timeouts... 🤔

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

W00t 30 minutes 😆 😅

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've set a timeout of 25 minutes now

}
}

type ChangeStatus string

const (
ChangeStatusOutdated ChangeStatus = "OUTDATED"
ChangeStatusCompleted ChangeStatus = "COMPLETED"
ChangeStatusFailed ChangeStatus = "FAILED"
ChangeStatusPending ChangeStatus = "PENDING"
ChangeStatusUnknown ChangeStatus = "UNKNOWN"
)

func describeChangeStatus(topology *CurrentTopology, changeId int64) ChangeStatus {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤔 Feels like it makes sense to have this type of status enum inside the returned topology? 🤔

Copy link
Member Author

@lenaschoenburg lenaschoenburg Dec 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Our API doesn't offer an endpoint to query "what's the status of change X". If we would offer that, the endpoint would return a similar enum as we have defined here, yeah.

if topology.LastChange != nil && topology.LastChange.Id == changeId {
if topology.LastChange.Status == "COMPLETED" {
return ChangeStatusCompleted
} else {
return ChangeStatusFailed
}
} else if topology.LastChange != nil && topology.LastChange.Id > changeId {
return ChangeStatusOutdated
} else if topology.PendingChange != nil && topology.PendingChange.Id == changeId {
return ChangeStatusPending
} else {
return ChangeStatusUnknown
}
}

func queryTopology(port int) (*CurrentTopology, error) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔧 Would be great if you could add a IT for it

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added a minimal one now. None of the other commands have integration tests so I hope I did it in a way that is acceptable: ffd0a20

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No integration tests for other commands you say?

shock

url := fmt.Sprintf("http://localhost:%d/actuator/cluster", port)
resp, err := http.Get(url)
Expand Down
107 changes: 107 additions & 0 deletions go-chaos/cmd/cluster_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// Copyright 2023 Camunda Services GmbH
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
"testing"

"github.com/stretchr/testify/assert"
)

func Test_DescribeChangeStatusWithPending(t *testing.T) {
// given
topology := CurrentTopology{
Version: 1,
Brokers: []BrokerState{},
LastChange: &LastChange{
Id: 2,
Status: "COMPLETED",
StartedAt: "2021-09-01T12:00:00.000Z",
CompletedAt: "2021-09-01T12:00:00.000Z",
},
PendingChange: &TopologyChange{
Id: 3,
Status: "IN_PROGRESS",
StartedAt: "2021-09-01T12:00:00.000Z",
CompletedAt: "2021-09-01T12:00:00.000Z",
InternalVersion: 1,
Completed: []Operation{
{
Operation: "ADD",
BrokerId: 1,
},
},
Pending: []Operation{
{
Operation: "ADD_BROKER",
BrokerId: 2,
},
},
},
}

// then
assert.Equal(t, ChangeStatusPending, describeChangeStatus(&topology, 3))
assert.Equal(t, ChangeStatusCompleted, describeChangeStatus(&topology, 2))
assert.Equal(t, ChangeStatusOutdated, describeChangeStatus(&topology, 1))
assert.Equal(t, ChangeStatusUnknown, describeChangeStatus(&topology, 4))
}

func Test_DescribeChangeStatusWithoutChanges(t *testing.T) {
// given
topology := CurrentTopology{
Version: 1,
Brokers: []BrokerState{},
LastChange: nil,
PendingChange: nil,
}

// then
assert.Equal(t, ChangeStatusUnknown, describeChangeStatus(&topology, 1))
assert.Equal(t, ChangeStatusUnknown, describeChangeStatus(&topology, 2))
}

func Test_DescribeChangeStatusWithoutCompleted(t *testing.T) {
// given
topology := CurrentTopology{
Version: 1,
Brokers: []BrokerState{},
LastChange: nil,
PendingChange: &TopologyChange{
Id: 3,
Status: "IN_PROGRESS",
StartedAt: "2021-09-01T12:00:00.000Z",
CompletedAt: "2021-09-01T12:00:00.000Z",
InternalVersion: 1,
Completed: []Operation{
{
Operation: "ADD",
BrokerId: 1,
},
},
Pending: []Operation{
{
Operation: "ADD_BROKER",
BrokerId: 2,
},
},
},
}

// then
assert.Equal(t, ChangeStatusUnknown, describeChangeStatus(&topology, 1))
assert.Equal(t, ChangeStatusPending, describeChangeStatus(&topology, 3))
assert.Equal(t, ChangeStatusUnknown, describeChangeStatus(&topology, 4))
}
Loading