From a7c9008ba06fc7d2d11729ba9a02f5991394560c Mon Sep 17 00:00:00 2001 From: Gil Bregman Date: Sun, 22 Dec 2024 11:17:58 +0200 Subject: [PATCH] Do not process gRPC calls when gateway is going down. Fixes #992 Signed-off-by: Gil Bregman --- .env | 2 ++ ceph-nvmeof.conf | 2 +- control/grpc.py | 12 ++++++++++-- control/prometheus.py | 2 +- control/server.py | 2 ++ control/state.py | 2 +- tests/ceph-nvmeof.no-huge.conf | 5 ++++- tests/ceph-nvmeof.tls.conf | 5 ++++- tests/test_cli.py | 7 +++++++ tests/test_cli_change_lb.py | 4 ++-- 10 files changed, 34 insertions(+), 9 deletions(-) diff --git a/.env b/.env index 968097cd3..57b848dbf 100644 --- a/.env +++ b/.env @@ -101,3 +101,5 @@ DHCHAP_KEY6="DHHC-1:01:Bu4tZd7X2oW7XxmVH5tGCdoS30pDX6bZvexHYoudeVlJW9yz:" DHCHAP_KEY7="DHHC-1:01:JPJkDQ2po2FfLmKYlTF/sJ2HzVO/FKWxgXKE/H6XfL8ogQ1T:" DHCHAP_KEY8="DHHC-1:01:e0B0vDxKleDzYVtG42xqFvoWZfiufkoywmfRKrETzayRdf1j:" DHCHAP_KEY9="DHHC-1:01:KD+sfH3/o2bRQoV0ESjBUywQlMnSaYpZISUbVa0k0nsWpNST:" +DHCHAP_KEY10="DHHC-1:00:rWf0ZFYO7IgWGttM8w6jUrAY4cTQyqyXPdmxHeOSve3w5QU9:" +DHCHAP_KEY11="DHHC-1:02:j3uUz05r5aQy42vX4tDXqVf9HgUPPdEp3kXTgUWl9EphsG7jwpr9KSIt3bmRLXBijPTIDQ==:" diff --git a/ceph-nvmeof.conf b/ceph-nvmeof.conf index ea3c9c083..86875fe3c 100644 --- a/ceph-nvmeof.conf +++ b/ceph-nvmeof.conf @@ -32,7 +32,7 @@ max_ns_to_change_lb_grp = 8 #verify_nqns = True #allowed_consecutive_spdk_ping_failures = 1 #spdk_ping_interval_in_seconds = 2.0 -#max_hosts_per_namespace = 1 +#max_hosts_per_namespace = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_namespaces = 1024 diff --git a/control/grpc.py b/control/grpc.py index 4261d2bc0..95d9bd59a 100644 --- a/control/grpc.py +++ b/control/grpc.py @@ -362,7 +362,7 @@ def __init__(self, config: GatewayConfig, gateway_state: GatewayStateHandler, rp self.host_name = socket.gethostname() self.verify_nqns = self.config.getboolean_with_default("gateway", "verify_nqns", True) self.gateway_group = self.config.get_with_default("gateway", "group", "") - self.max_hosts_per_namespace = self.config.getint_with_default("gateway", "max_hosts_per_namespace", 1) + self.max_hosts_per_namespace = self.config.getint_with_default("gateway", "max_hosts_per_namespace", 8) self.max_namespaces_with_netmask = self.config.getint_with_default("gateway", "max_namespaces_with_netmask", 1000) self.max_subsystems = self.config.getint_with_default("gateway", "max_subsystems", GatewayService.MAX_SUBSYSTEMS_DEFAULT) self.max_namespaces = self.config.getint_with_default("gateway", "max_namespaces", GatewayService.MAX_NAMESPACES_DEFAULT) @@ -386,6 +386,7 @@ def __init__(self, config: GatewayConfig, gateway_state: GatewayStateHandler, rp self._init_cluster_context() self.subsys_max_ns = {} self.host_info = SubsystemHostAuth() + self.up_and_running = True self.rebalance = Rebalance(self) def get_directories_for_key_file(self, key_type : str, subsysnqn : str, create_dir : bool = False) -> []: @@ -668,6 +669,12 @@ def execute_grpc_function(self, func, request, context): called might take OMAP lock internally, however does NOT ensure taking OMAP lock in any way. """ + + if not self.up_and_running: + errmsg = "Gateway is going down" + self.logger.error(errmsg) + return pb2.req_status(status=errno.ESHUTDOWN, error_message=errmsg) + return self.omap_lock.execute_omap_locking_function(self._grpc_function_with_lock, func, request, context) def create_bdev(self, anagrp: int, name, uuid, rbd_pool_name, rbd_image_name, block_size, create_image, rbd_image_size, context, peer_msg = ""): @@ -1527,7 +1534,8 @@ def namespace_change_load_balancing_group_safe(self, request, context): grps_list = [] peer_msg = self.get_peer_message(context) change_lb_group_failure_prefix = f"Failure changing load balancing group for namespace with NSID {request.nsid} in {request.subsystem_nqn}" - self.logger.info(f"Received auto {request.auto_lb_logic} request to change load balancing group for namespace with NSID {request.nsid} in {request.subsystem_nqn} to {request.anagrpid}, context: {context}{peer_msg}") + auto_lb_msg = "auto" if request.auto_lb_logic else "manual" + self.logger.info(f"Received {auto_lb_msg} request to change load balancing group for namespace with NSID {request.nsid} in {request.subsystem_nqn} to {request.anagrpid}, context: {context}{peer_msg}") if not request.subsystem_nqn: errmsg = f"Failure changing load balancing group for namespace, missing subsystem NQN" diff --git a/control/prometheus.py b/control/prometheus.py index ec2fa1154..411b6057c 100644 --- a/control/prometheus.py +++ b/control/prometheus.py @@ -109,7 +109,7 @@ def __init__(self, spdk_rpc_client, config, gateway_rpc): self.gw_config = config _bdev_pools = config.get_with_default('gateway', 'prometheus_bdev_pools', '') self.bdev_pools = _bdev_pools.split(',') if _bdev_pools else [] - self.interval = config.getint_with_default('gateway', 'prometheus_stats_inteval', 10) + self.interval = config.getint_with_default('gateway', 'prometheus_stats_interval', 10) self.lock = threading.Lock() self.hostname = os.getenv('NODE_NAME') or os.getenv('HOSTNAME') diff --git a/control/server.py b/control/server.py index 39c9279e1..f1e889a11 100644 --- a/control/server.py +++ b/control/server.py @@ -125,6 +125,8 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): """Cleans up SPDK and server instances.""" + if self.gateway_rpc: + self.gateway_rpc.up_and_running = False if exc_type is not None: self.logger.exception("GatewayServer exception occurred:") else: diff --git a/control/state.py b/control/state.py index 27e5d5232..908371652 100644 --- a/control/state.py +++ b/control/state.py @@ -602,7 +602,7 @@ def cleanup_omap(self, omap_lock = None): if omap_lock and omap_lock.omap_file_lock_duration > 0: try: omap_lock.unlock_omap() - except Exceprion: + except Exception: pass if self.ioctx: try: diff --git a/tests/ceph-nvmeof.no-huge.conf b/tests/ceph-nvmeof.no-huge.conf index 556651054..d5b62aaf4 100644 --- a/tests/ceph-nvmeof.no-huge.conf +++ b/tests/ceph-nvmeof.no-huge.conf @@ -17,6 +17,9 @@ state_update_notify = True state_update_timeout_in_msec = 2000 state_update_interval_sec = 5 enable_spdk_discovery_controller = False +rebalance_period_sec = 7 +max_gws_in_grp = 16 +max_ns_to_change_lb_grp = 8 #omap_file_lock_duration = 20 #omap_file_lock_retries = 30 #omap_file_lock_retry_sleep_interval = 1.0 @@ -29,7 +32,7 @@ enable_spdk_discovery_controller = False #verify_nqns = True #allowed_consecutive_spdk_ping_failures = 1 #spdk_ping_interval_in_seconds = 2.0 -#max_hosts_per_namespace = 1 +#max_hosts_per_namespace = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_namespaces = 1024 diff --git a/tests/ceph-nvmeof.tls.conf b/tests/ceph-nvmeof.tls.conf index 14be0cc37..42334e0b6 100644 --- a/tests/ceph-nvmeof.tls.conf +++ b/tests/ceph-nvmeof.tls.conf @@ -16,6 +16,9 @@ enable_auth = True state_update_notify = True state_update_interval_sec = 5 enable_spdk_discovery_controller = False +rebalance_period_sec = 7 +max_gws_in_grp = 16 +max_ns_to_change_lb_grp = 8 #omap_file_lock_duration = 20 #omap_file_lock_retries = 30 #omap_file_lock_retry_sleep_interval = 1.0 @@ -28,7 +31,7 @@ enable_spdk_discovery_controller = False #verify_nqns = True #allowed_consecutive_spdk_ping_failures = 1 #spdk_ping_interval_in_seconds = 2.0 -#max_hosts_per_namespace = 1 +#max_hosts_per_namespace = 8 #max_namespaces_with_netmask = 1000 #max_subsystems = 128 #max_namespaces = 1024 diff --git a/tests/test_cli.py b/tests/test_cli.py index 8559b8d42..e3e4d089c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -70,6 +70,7 @@ def gateway(config): port = config.getint("gateway", "port") config.config["gateway"]["group"] = group_name config.config["gateway"]["max_namespaces_with_netmask"] = "3" + config.config["gateway"]["max_hosts_per_namespace"] = "1" config.config["gateway"]["max_subsystems"] = "3" config.config["gateway"]["max_namespaces"] = "12" config.config["gateway"]["max_namespaces_per_subsystem"] = "11" @@ -212,6 +213,9 @@ def test_create_subsystem(self, caplog, gateway): assert f'"nqn": "{subsystem}"' in caplog.text assert f'"max_namespaces": 2049' in caplog.text caplog.clear() + cli(["subsystem", "add", "--subsystem", subsystem, "--max-namespaces", "2049", "--no-group-append"]) + assert f"Failure creating subsystem {subsystem}: Subsystem already exists" in caplog.text + caplog.clear() cli(["subsystem", "add", "--subsystem", subsystem2, "--serial-number", serial, "--no-group-append"]) assert f"Adding subsystem {subsystem2}: Successful" in caplog.text caplog.clear() @@ -251,6 +255,9 @@ def test_create_subsystem(self, caplog, gateway): assert subs_list.subsystems[0].nqn == subsystem assert subs_list.subsystems[1].nqn == subsystem2 caplog.clear() + cli(["subsystem", "add", "--subsystem", subsystem8, "--serial-number", serial, "--no-group-append"]) + assert f"Failure creating subsystem {subsystem8}: Serial number {serial} is already used by subsystem {subsystem2}" in caplog.text + caplog.clear() subs_list = cli_test(["subsystem", "list"]) assert subs_list != None assert subs_list.status == 0 diff --git a/tests/test_cli_change_lb.py b/tests/test_cli_change_lb.py index 07cebb43c..6e83d179a 100755 --- a/tests/test_cli_change_lb.py +++ b/tests/test_cli_change_lb.py @@ -124,10 +124,10 @@ def change_one_namespace_lb_group(caplog, subsys, nsid_to_change, new_group): time.sleep(8) assert f"Changing load balancing group of namespace {nsid_to_change} in {subsys} to {new_group}: Successful" in caplog.text - assert f"Received auto False request to change load balancing group for namespace with NSID {nsid_to_change} in {subsys} to {new_group}, context: