Skip to content

Commit b1cc0af

Browse files
committed
Do not process gRPC calls when gateway is going down.
Fixes #992
Signed-off-by: Gil Bregman <gbregman@il.ibm.com>
1 parent 719a6ee commit b1cc0af

File tree

10 files changed

+36
-10
lines changed

10 files changed

+36
-10
lines changed

.env

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -101,3 +101,5 @@ DHCHAP_KEY6="DHHC-1:01:Bu4tZd7X2oW7XxmVH5tGCdoS30pDX6bZvexHYoudeVlJW9yz:"
101101
DHCHAP_KEY7="DHHC-1:01:JPJkDQ2po2FfLmKYlTF/sJ2HzVO/FKWxgXKE/H6XfL8ogQ1T:"
102102
DHCHAP_KEY8="DHHC-1:01:e0B0vDxKleDzYVtG42xqFvoWZfiufkoywmfRKrETzayRdf1j:"
103103
DHCHAP_KEY9="DHHC-1:01:KD+sfH3/o2bRQoV0ESjBUywQlMnSaYpZISUbVa0k0nsWpNST:"
104+
DHCHAP_KEY10="DHHC-1:00:rWf0ZFYO7IgWGttM8w6jUrAY4cTQyqyXPdmxHeOSve3w5QU9:"
105+
DHCHAP_KEY11="DHHC-1:02:j3uUz05r5aQy42vX4tDXqVf9HgUPPdEp3kXTgUWl9EphsG7jwpr9KSIt3bmRLXBijPTIDQ==:"

ceph-nvmeof.conf

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ max_ns_to_change_lb_grp = 8
3232
#verify_nqns = True
3333
#allowed_consecutive_spdk_ping_failures = 1
3434
#spdk_ping_interval_in_seconds = 2.0
35-
#max_hosts_per_namespace = 1
35+
#max_hosts_per_namespace = 8
3636
#max_namespaces_with_netmask = 1000
3737
#max_subsystems = 128
3838
#max_namespaces = 1024

control/grpc.py

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -362,7 +362,7 @@ def __init__(self, config: GatewayConfig, gateway_state: GatewayStateHandler, rp
362362
self.host_name = socket.gethostname()
363363
self.verify_nqns = self.config.getboolean_with_default("gateway", "verify_nqns", True)
364364
self.gateway_group = self.config.get_with_default("gateway", "group", "")
365-
self.max_hosts_per_namespace = self.config.getint_with_default("gateway", "max_hosts_per_namespace", 1)
365+
self.max_hosts_per_namespace = self.config.getint_with_default("gateway", "max_hosts_per_namespace", 8)
366366
self.max_namespaces_with_netmask = self.config.getint_with_default("gateway", "max_namespaces_with_netmask", 1000)
367367
self.max_subsystems = self.config.getint_with_default("gateway", "max_subsystems", GatewayService.MAX_SUBSYSTEMS_DEFAULT)
368368
self.max_namespaces = self.config.getint_with_default("gateway", "max_namespaces", GatewayService.MAX_NAMESPACES_DEFAULT)
@@ -386,6 +386,7 @@ def __init__(self, config: GatewayConfig, gateway_state: GatewayStateHandler, rp
386386
self._init_cluster_context()
387387
self.subsys_max_ns = {}
388388
self.host_info = SubsystemHostAuth()
389+
self.up_and_running = True
389390
self.rebalance = Rebalance(self)
390391

391392
def get_directories_for_key_file(self, key_type : str, subsysnqn : str, create_dir : bool = False) -> []:
@@ -668,6 +669,12 @@ def execute_grpc_function(self, func, request, context):
668669
called might take OMAP lock internally, however does NOT ensure
669670
taking OMAP lock in any way.
670671
"""
672+
673+
if not self.up_and_running:
674+
errmsg = "Gateway is going down"
675+
self.logger.error(errmsg)
676+
return pb2.req_status(status=errno.ESHUTDOWN, error_message=errmsg)
677+
671678
return self.omap_lock.execute_omap_locking_function(self._grpc_function_with_lock, func, request, context)
672679

673680
def create_bdev(self, anagrp: int, name, uuid, rbd_pool_name, rbd_image_name, block_size, create_image, rbd_image_size, context, peer_msg = ""):
@@ -988,7 +995,7 @@ def create_subsystem_safe(self, request, context):
988995
else:
989996
subsys_using_serial = self.serial_number_already_used(context, request.serial_number)
990997
if subsys_using_serial:
991-
errmsg = f"Serial number {request.serial_number} already used by subsystem {subsys_using_serial}"
998+
errmsg = f"Serial number {request.serial_number} is already used by subsystem {subsys_using_serial}"
992999
if subsys_already_exists or subsys_using_serial:
9931000
errmsg = f"{create_subsystem_error_prefix}: {errmsg}"
9941001
self.logger.error(f"{errmsg}")
@@ -1527,7 +1534,8 @@ def namespace_change_load_balancing_group_safe(self, request, context):
15271534
grps_list = []
15281535
peer_msg = self.get_peer_message(context)
15291536
change_lb_group_failure_prefix = f"Failure changing load balancing group for namespace with NSID {request.nsid} in {request.subsystem_nqn}"
1530-
self.logger.info(f"Received auto {request.auto_lb_logic} request to change load balancing group for namespace with NSID {request.nsid} in {request.subsystem_nqn} to {request.anagrpid}, context: {context}{peer_msg}")
1537+
auto_lb_msg = "auto" if request.auto_lb_logic else "manual"
1538+
self.logger.info(f"Received {auto_lb_msg} request to change load balancing group for namespace with NSID {request.nsid} in {request.subsystem_nqn} to {request.anagrpid}, context: {context}{peer_msg}")
15311539

15321540
if not request.subsystem_nqn:
15331541
errmsg = f"Failure changing load balancing group for namespace, missing subsystem NQN"

control/prometheus.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -109,7 +109,7 @@ def __init__(self, spdk_rpc_client, config, gateway_rpc):
109109
self.gw_config = config
110110
_bdev_pools = config.get_with_default('gateway', 'prometheus_bdev_pools', '')
111111
self.bdev_pools = _bdev_pools.split(',') if _bdev_pools else []
112-
self.interval = config.getint_with_default('gateway', 'prometheus_stats_inteval', 10)
112+
self.interval = config.getint_with_default('gateway', 'prometheus_stats_interval', 10)
113113
self.lock = threading.Lock()
114114
self.hostname = os.getenv('NODE_NAME') or os.getenv('HOSTNAME')
115115

control/server.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -125,6 +125,8 @@ def __enter__(self):
125125

126126
def __exit__(self, exc_type, exc_value, traceback):
127127
"""Cleans up SPDK and server instances."""
128+
if self.gateway_rpc:
129+
self.gateway_rpc.up_and_running = False
128130
if exc_type is not None:
129131
self.logger.exception("GatewayServer exception occurred:")
130132
else:

control/state.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -602,7 +602,7 @@ def cleanup_omap(self, omap_lock = None):
602602
if omap_lock and omap_lock.omap_file_lock_duration > 0:
603603
try:
604604
omap_lock.unlock_omap()
605-
except Exceprion:
605+
except Exception:
606606
pass
607607
if self.ioctx:
608608
try:

tests/ceph-nvmeof.no-huge.conf

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,9 @@ state_update_notify = True
1717
state_update_timeout_in_msec = 2000
1818
state_update_interval_sec = 5
1919
enable_spdk_discovery_controller = False
20+
rebalance_period_sec = 7
21+
max_gws_in_grp = 16
22+
max_ns_to_change_lb_grp = 8
2023
#omap_file_lock_duration = 20
2124
#omap_file_lock_retries = 30
2225
#omap_file_lock_retry_sleep_interval = 1.0
@@ -29,7 +32,7 @@ enable_spdk_discovery_controller = False
2932
#verify_nqns = True
3033
#allowed_consecutive_spdk_ping_failures = 1
3134
#spdk_ping_interval_in_seconds = 2.0
32-
#max_hosts_per_namespace = 1
35+
#max_hosts_per_namespace = 8
3336
#max_namespaces_with_netmask = 1000
3437
#max_subsystems = 128
3538
#max_namespaces = 1024

tests/ceph-nvmeof.tls.conf

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,9 @@ enable_auth = True
1616
state_update_notify = True
1717
state_update_interval_sec = 5
1818
enable_spdk_discovery_controller = False
19+
rebalance_period_sec = 7
20+
max_gws_in_grp = 16
21+
max_ns_to_change_lb_grp = 8
1922
#omap_file_lock_duration = 20
2023
#omap_file_lock_retries = 30
2124
#omap_file_lock_retry_sleep_interval = 1.0
@@ -28,7 +31,7 @@ enable_spdk_discovery_controller = False
2831
#verify_nqns = True
2932
#allowed_consecutive_spdk_ping_failures = 1
3033
#spdk_ping_interval_in_seconds = 2.0
31-
#max_hosts_per_namespace = 1
34+
#max_hosts_per_namespace = 8
3235
#max_namespaces_with_netmask = 1000
3336
#max_subsystems = 128
3437
#max_namespaces = 1024

tests/test_cli.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
subsystem5 = "nqn.2016-06.io.spdk:cnode5"
3333
subsystem6 = "nqn.2016-06.io.spdk:cnode6"
3434
subsystem7 = "nqn.2016-06.io.spdk:cnode7"
35+
subsystem8 = "nqn.2016-06.io.spdk:cnode8"
3536
discovery_nqn = "nqn.2014-08.org.nvmexpress.discovery"
3637
serial = "Ceph00000000000001"
3738
uuid = "948878ee-c3b2-4d58-a29b-2cff713fc02d"
@@ -70,6 +71,7 @@ def gateway(config):
7071
port = config.getint("gateway", "port")
7172
config.config["gateway"]["group"] = group_name
7273
config.config["gateway"]["max_namespaces_with_netmask"] = "3"
74+
config.config["gateway"]["max_hosts_per_namespace"] = "1"
7375
config.config["gateway"]["max_subsystems"] = "3"
7476
config.config["gateway"]["max_namespaces"] = "12"
7577
config.config["gateway"]["max_namespaces_per_subsystem"] = "11"
@@ -212,6 +214,9 @@ def test_create_subsystem(self, caplog, gateway):
212214
assert f'"nqn": "{subsystem}"' in caplog.text
213215
assert f'"max_namespaces": 2049' in caplog.text
214216
caplog.clear()
217+
cli(["subsystem", "add", "--subsystem", subsystem, "--max-namespaces", "2049", "--no-group-append"])
218+
assert f"Failure creating subsystem {subsystem}: Subsystem already exists" in caplog.text
219+
caplog.clear()
215220
cli(["subsystem", "add", "--subsystem", subsystem2, "--serial-number", serial, "--no-group-append"])
216221
assert f"Adding subsystem {subsystem2}: Successful" in caplog.text
217222
caplog.clear()
@@ -251,6 +256,9 @@ def test_create_subsystem(self, caplog, gateway):
251256
assert subs_list.subsystems[0].nqn == subsystem
252257
assert subs_list.subsystems[1].nqn == subsystem2
253258
caplog.clear()
259+
cli(["subsystem", "add", "--subsystem", subsystem8, "--serial-number", serial, "--no-group-append"])
260+
assert f"Failure creating subsystem {subsystem8}: Serial number {serial} is already used by subsystem {subsystem2}" in caplog.text
261+
caplog.clear()
254262
subs_list = cli_test(["subsystem", "list"])
255263
assert subs_list != None
256264
assert subs_list.status == 0

tests/test_cli_change_lb.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -124,10 +124,10 @@ def change_one_namespace_lb_group(caplog, subsys, nsid_to_change, new_group):
124124
time.sleep(8)
125125

126126
assert f"Changing load balancing group of namespace {nsid_to_change} in {subsys} to {new_group}: Successful" in caplog.text
127-
assert f"Received auto False request to change load balancing group for namespace with NSID {nsid_to_change} in {subsys} to {new_group}, context: <grpc._server" in caplog.text
127+
assert f"Received manual request to change load balancing group for namespace with NSID {nsid_to_change} in {subsys} to {new_group}, context: <grpc._server" in caplog.text
128128
assert f"Received request to delete namespace" not in caplog.text
129129
assert f"Received request to add a namespace" not in caplog.text
130-
assert f"Received auto False request to change load balancing group for namespace with NSID {nsid_to_change} in {subsys} to {new_group}, context: None" in caplog.text
130+
assert f"Received manual request to change load balancing group for namespace with NSID {nsid_to_change} in {subsys} to {new_group}, context: None" in caplog.text
131131

132132
def switch_namespaces_lb_group(caplog, ns_count, subsys):
133133
for i in range(1, 1 + (ns_count // 2)):

0 commit comments

Comments
 (0)