Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OSD removal #186

Merged
merged 1 commit into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 54 additions & 24 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ jobs:
with:
fetch-depth: 0

- name: Free disk
run: tests/scripts/actionutils.sh free_runner_disk

- name: Install and setup
run: |
tests/scripts/actionutils.sh install_microceph
Expand Down Expand Up @@ -91,23 +94,34 @@ jobs:
set -eux
# Show ceph's status
sudo microceph.ceph status

# Ceph status expectations for a single node cluster
test_single() {
local status="$1"
( echo "$status" | grep -qF "mon: 1 daemons" ) || { echo fail ; return ; }
( echo "$status" | grep -qE "mgr: .*active, " )|| { echo fail ; return ; }
( echo "$status" | grep -qF "osd: 3 osds" ) || { echo fail ; return ; }
( echo "$status" | grep -qF "rgw: 1 daemon" ) || { echo fail ; return ; }
echo ok
}
# Confirm ceph is healthy and services started
sudo microceph.ceph status | grep -F "mon: 1 daemons"
sudo microceph.ceph status | grep -E "mgr: .*active, "
sudo microceph.ceph status | grep -F "osd: 3 osds"
sudo microceph.ceph status | grep -F "rgw: 1 daemon"

res=$( test_single "$( sudo microceph.ceph status )" )
[ $res = "ok" ] || { echo "single node status fails" ; exit 1 ; }
# Check health after restart
sudo snap stop microceph
sudo snap start microceph
sleep 2m

sudo microceph.ceph status
sudo microceph.ceph status | grep -F "mon: 1 daemons"
sudo microceph.ceph status | grep -E "mgr: .*active, "
sudo microceph.ceph status | grep -F "osd: 3 osds"
sudo microceph.ceph status | grep -F "rgw: 1 daemon"
for i in $(seq 1 16); do
status=$( sudo microceph.ceph status )
echo "$status"
res=$( test_single "$status" )
if [ $res = "ok" ] ; then
echo "Single tests pass"
break
else
echo "Single tests fail, retry $i/16"
sleep 15
fi
done
sleep 1
pgrep ceph-osd || { echo "No ceph-osd process found" ; exit 1; }

- name: Exercise RGW
Expand Down Expand Up @@ -201,7 +215,7 @@ jobs:
run: |
set -uex
# We still have failure domain OSD
lxc exec node-head -- sh -c "sudo microceph.ceph osd crush rule ls" | grep -F microceph_auto_osd
lxc exec node-head -- sh -c "sudo microceph.ceph config get mon osd_pool_default_crush_rule" | fgrep -x 1
# Add a 3rd OSD, should switch to host failure domain
tests/scripts/actionutils.sh add_osd_to_node node-head
for i in $(seq 1 8); do
Expand All @@ -214,23 +228,40 @@ jobs:
sleep 2
fi
done
# Expect exactly one rule with host failure domain
rules=$( lxc exec node-head -- sh -c "sudo microceph.ceph osd crush rule ls" )
echo $rules
echo $rules | grep -F microceph_auto_host
num=$( echo $rules | wc -l)
if [ $num != '1' ] ; then echo "Expect exactly one rule" ; exit 1 ; fi

sleep 1
lxc exec node-head -- sh -c "sudo microceph.ceph -s"
# Now default to host rule
lxc exec node-head -- sh -c "sudo microceph.ceph config get mon osd_pool_default_crush_rule" | fgrep -x 2
- name: Test 3 osds present
run: |
set -uex
lxc exec node-head -- sh -c "microceph.ceph -s" | egrep "osd: 3 osds: 3 up.*3 in"

- name: Test osd host rule
- name: Test crush rules
run: |
set -uex
lxc exec node-head -- sh -c "microceph.ceph osd crush rule ls" | grep -F microceph_auto_host
lxc exec node-head -- sh -c "microceph.ceph osd pool ls detail" | grep -F "crush_rule 1"
lxc exec node-head -- sh -c "microceph.ceph osd pool ls detail" | grep -F "crush_rule 2"

- name: Add another OSD
run: |
tests/scripts/actionutils.sh add_osd_to_node node-wrk3
for i in $(seq 1 8); do
res=$( ( lxc exec node-head -- sh -c 'sudo microceph.ceph -s | grep -F osd: | sed -E "s/.* ([[:digit:]]*) in .*/\1/"' ) || true )
if [[ $res -gt 3 ]] ; then
echo "Found >3 OSDs"
break
else
echo -n '.'
sleep 2
fi
done

- name: Remove OSD again
run: |
set -uex
lxc exec node-wrk3 -- sh -c "microceph disk remove 3"
lxc exec node-head -- sh -c "microceph.ceph -s" | egrep "osd: 3 osds: 3 up.*3 in"

- name: Test migrate services
run: |
Expand All @@ -239,4 +270,3 @@ jobs:
sleep 2
lxc exec node-head -- sh -c "microceph status" | grep -F -A 1 node-wrk1 | grep -E "^ Services: osd$"
lxc exec node-head -- sh -c "microceph status" | grep -F -A 1 node-wrk3 | grep -E "^ Services: mds, mgr, mon$"

54 changes: 54 additions & 0 deletions microceph/api/disks.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@ package api

import (
"encoding/json"
"fmt"
"github.com/canonical/microceph/microceph/common"
"github.com/gorilla/mux"
"net/http"
"net/url"
"strconv"

"github.com/canonical/lxd/lxd/response"
"github.com/canonical/microcluster/rest"
Expand All @@ -20,6 +25,13 @@ var disksCmd = rest.Endpoint{
Post: rest.EndpointAction{Handler: cmdDisksPost, ProxyTarget: true},
}

// disksDelCmd wires up the /1.0/disks/{osdid} endpoint.
// DELETE removes the OSD identified by the {osdid} path parameter; the
// request is handled by cmdDisksDelete and may be proxied to the target
// cluster member (ProxyTarget: true).
var disksDelCmd = rest.Endpoint{
	Path: "disks/{osdid}",

	Delete: rest.EndpointAction{Handler: cmdDisksDelete, ProxyTarget: true},
}

func cmdDisksGet(s *state.State, r *http.Request) response.Response {
disks, err := ceph.ListOSD(s)
if err != nil {
Expand All @@ -44,3 +56,45 @@ func cmdDisksPost(s *state.State, r *http.Request) response.Response {

return response.EmptySyncResponse
}

// cmdDisksDelete is the handler for DELETE /1.0/disks/{osdid}.
// It parses the OSD id from the URL path, decodes a types.DisksDelete
// request from the body, refuses the removal when it would implicitly
// downgrade the automatic crush failure domain (unless the client set
// ConfirmDowngrade), and finally removes the OSD.
func cmdDisksDelete(s *state.State, r *http.Request) response.Response {
	// Redundant `var osd string` before `osd, err := ...` removed: the
	// short-variable-declaration already introduces osd in this scope.
	osd, err := url.PathUnescape(mux.Vars(r)["osdid"])
	if err != nil {
		return response.BadRequest(err)
	}

	osdid, err := strconv.ParseInt(osd, 10, 64)
	if err != nil {
		return response.BadRequest(err)
	}

	var req types.DisksDelete
	err = json.NewDecoder(r.Body).Decode(&req)
	if err != nil {
		return response.BadRequest(err)
	}

	cs := common.CephState{State: s}
	needDowngrade, err := ceph.IsDowngradeNeeded(cs, osdid)
	if err != nil {
		return response.InternalError(err)
	}
	if needDowngrade && !req.ConfirmDowngrade {
		// Surface the consequence to the caller instead of silently
		// changing the crush rule level.
		errorMsg := fmt.Errorf(
			"Removing osd.%s would require a downgrade of the automatic crush rule from 'host' to 'osd' level. "+
				"Likely this will result in additional data movement. Please confirm by setting the "+
				"'--confirm-failure-domain-downgrade' flag to true",
			osd,
		)
		return response.BadRequest(errorMsg)
	}

	err = ceph.RemoveOSD(cs, osdid, req.BypassSafety, req.Timeout)
	if err != nil {
		return response.SmartError(err)
	}

	return response.EmptySyncResponse
}
1 change: 1 addition & 0 deletions microceph/api/endpoints.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
// Endpoints is a global list of all API endpoints on the /1.0 endpoint of microceph.
var Endpoints = []rest.Endpoint{
disksCmd,
disksDelCmd,
resourcesCmd,
servicesCmd,
rgwServiceCmd,
Expand Down
8 changes: 8 additions & 0 deletions microceph/api/types/disks.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@ type DisksPost struct {
Encrypt bool `json:"encrypt" yaml:"encrypt"`
}

// DisksDelete is the request body for removing an OSD. It holds the OSD
// number plus flags controlling safety checks and failure-domain handling.
type DisksDelete struct {
	OSD              int64 `json:"osdid" yaml:"osdid"`                       // numeric id of the OSD to remove
	BypassSafety     bool  `json:"bypass_safety" yaml:"bypass_safety"`       // skip removal safety checks
	ConfirmDowngrade bool  `json:"confirm_downgrade" yaml:"confirm_downgrade"` // accept a crush failure-domain downgrade
	Timeout          int64 `json:"timeout" yaml:"timeout"`                   // removal timeout; unit not shown here — see ceph.RemoveOSD
}

// Disks is a slice of disks
type Disks []Disk

Expand Down
27 changes: 6 additions & 21 deletions microceph/ceph/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,16 @@ func Bootstrap(s common.StateInterface) error {
return err
}

// ensure crush rules
// set up crush rules
err = ensureCrushRules()
if err != nil {
return err
}
// configure the default crush rule for new pools
err = setDefaultCrushRule("microceph_auto_osd")
if err != nil {
return err
}

// Re-generate the configuration from the database.
err = updateConfig(s)
Expand Down Expand Up @@ -264,23 +269,3 @@ func initMds(s common.StateInterface, dataPath string) error {
return nil

}

// ensureCrushRules removes the default replicated rule and adds a microceph
// default rule with failure domain OSD.
func ensureCrushRules() error {
	// Remove the default replicated rule if it exists.
	if haveCrushRule("replicated_rule") {
		err := removeCrushRule("replicated_rule")
		if err != nil {
			return fmt.Errorf("Failed to remove default replicated rule: %w", err)
		}
	}
	// Add a microceph default rule with failure domain OSD if it does not exist.
	// BUG FIX: the guard previously checked for "microceph_auto_rule" while the
	// rule actually created below is "microceph_auto_osd", so the existence
	// check never matched and the rule was re-created on every call. Check the
	// name that is actually added.
	if haveCrushRule("microceph_auto_osd") {
		return nil
	}
	err := addCrushRule("microceph_auto_osd", "osd")
	if err != nil {
		return fmt.Errorf("Failed to add microceph default rule: %w", err)
	}
	return nil
}
5 changes: 3 additions & 2 deletions microceph/ceph/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@ func (c ConfigTable) Keys() (keys []string) {
// so that each request for the map guarantees consistent definition.
// GetConstConfigTable returns the fixed table of supported config keys,
// each mapped to its config section and the daemons to restart on change.
// A fresh map is returned on every call so each request gets a consistent,
// unshared definition.
func GetConstConfigTable() ConfigTable {
	// BUG FIX: the literal previously listed "public_network" and
	// "cluster_network" twice (duplicate keys are a compile error in a Go
	// map literal with constant keys); keep a single entry for each.
	return ConfigTable{
		"public_network":              {"global", []string{"mon", "osd"}},
		"cluster_network":             {"global", []string{"osd"}},
		"osd_pool_default_crush_rule": {"global", []string{}},
	}
}

Expand Down
60 changes: 50 additions & 10 deletions microceph/ceph/crush.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,12 @@ package ceph
import (
"encoding/json"
"fmt"
"github.com/canonical/microceph/microceph/api/types"
"strings"

"github.com/tidwall/gjson"
)

// removeCrushRule deletes the crush rule with the given name via
// `ceph osd crush rule rm`.
func removeCrushRule(name string) error {
	if _, err := processExec.RunCommand("ceph", "osd", "crush", "rule", "rm", name); err != nil {
		return err
	}

	return nil
}

// addCrushRule creates a new default crush rule with a given name and failure domain
func addCrushRule(name string, failureDomain string) error {
_, err := processExec.RunCommand("ceph", "osd", "crush", "rule", "create-replicated", name, "default", failureDomain)
Expand Down Expand Up @@ -60,6 +51,9 @@ func getCrushRuleID(name string) (string, error) {
}
var jsond map[string]any
err = json.Unmarshal([]byte(output), &jsond)
if err != nil {
return "", err
}
val, ok := jsond["rule_id"]
if !ok {
return "", fmt.Errorf("rule_id not found in crush rule dump")
Expand Down Expand Up @@ -101,3 +95,49 @@ func setPoolCrushRule(pool string, rule string) error {
}
return nil
}

// setDefaultCrushRule points the osd_pool_default_crush_rule config key at
// the named crush rule, so newly created pools use it by default.
func setDefaultCrushRule(rule string) error {
	ruleID, err := getCrushRuleID(rule)
	if err != nil {
		return err
	}

	item := types.Config{
		Key:   "osd_pool_default_crush_rule",
		Value: ruleID,
	}
	return SetConfigItem(item)
}

// getDefaultCrushRule returns the current default crush rule for new pools,
// read from the osd_pool_default_crush_rule config key. The returned value
// is the rule id as a trimmed string.
func getDefaultCrushRule() (string, error) {
	configs, err := GetConfigItem(types.Config{
		Key: "osd_pool_default_crush_rule",
	})
	if err != nil {
		return "", err
	}
	// Guard against an empty result set: indexing configs[0] unchecked
	// would panic if the key is absent from the config dump.
	if len(configs) == 0 {
		return "", fmt.Errorf("no value found for osd_pool_default_crush_rule")
	}
	return strings.TrimSpace(configs[0].Value), nil
}

// ensureCrushRules sets up the crush rules used for automatic failure
// domain handling, creating each rule only when it does not already exist:
// one with failure domain "osd" and one with failure domain "host".
func ensureCrushRules() error {
	// Table of (rule name, failure domain) pairs; slice keeps creation order.
	autoRules := []struct {
		name   string
		domain string
	}{
		{"microceph_auto_osd", "osd"},
		{"microceph_auto_host", "host"},
	}

	for _, rule := range autoRules {
		if haveCrushRule(rule.name) {
			continue
		}
		if err := addCrushRule(rule.name, rule.domain); err != nil {
			return fmt.Errorf("Failed to add microceph default crush rule: %w", err)
		}
	}
	return nil
}
Loading
Loading