From 66399b3829ed77a62f0a8c937b91aac2d31295ec Mon Sep 17 00:00:00 2001 From: Gil Bregman Date: Thu, 4 Jan 2024 13:37:49 +0200 Subject: [PATCH] Save NVMeOF log in files under /var/log/ceph. Fixes #317 Signed-off-by: Gil Bregman --- Makefile | 1 + ceph-nvmeof.conf | 8 ++- control/__main__.py | 24 +++++-- control/config.py | 161 +++++++++++++++++++++++++++++++++++++++++++ control/discovery.py | 10 +-- control/grpc.py | 5 +- control/server.py | 13 ++-- control/state.py | 6 +- docker-compose.yaml | 1 + tests/test_cli.py | 14 ++++ tests/test_grpc.py | 2 - 11 files changed, 217 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index dcba0c297..6163fc48a 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,7 @@ setup: ## Configure huge-pages (requires sudo/root password) @echo Setup core dump pattern as /tmp/coredump/core.* mkdir -p /tmp/coredump + sudo mkdir -p /var/log/ceph sudo bash -c 'echo "|/usr/bin/env tee /tmp/coredump/core.%e.%p.%h.%t" > /proc/sys/kernel/core_pattern' sudo bash -c 'echo $(HUGEPAGES) > $(HUGEPAGES_DIR)' @echo Actual Hugepages allocation: $$(cat $(HUGEPAGES_DIR)) diff --git a/ceph-nvmeof.conf b/ceph-nvmeof.conf index 07666a587..6319136d5 100644 --- a/ceph-nvmeof.conf +++ b/ceph-nvmeof.conf @@ -22,11 +22,17 @@ enable_spdk_discovery_controller = False #omap_file_lock_retries = 15 #omap_file_lock_retry_sleep_interval = 5 #omap_file_update_reloads = 10 +log_level=debug +#max_log_file_size_in_mb=10 +#max_log_files_count=20 +# +# Notice that if you change the log directory the log files will only be visible inside the container +# +#log_directory = /var/log/ceph/ [discovery] addr = 0.0.0.0 port = 8009 -debug = 20 [ceph] pool = rbd diff --git a/control/__main__.py b/control/__main__.py index 2544fd719..a66182c82 100644 --- a/control/__main__.py +++ b/control/__main__.py @@ -9,15 +9,23 @@ import logging import argparse +import signal from .server import GatewayServer from .config import GatewayConfig +from .config import GatewayLogger -if __name__ == '__main__': - # Set up root logger - logging.basicConfig() - logger = logging.getLogger() - logger.setLevel(logging.DEBUG) +gw_logger = None +gw_name = None + +def sigterm_handler(signum, frame): + if gw_logger: + logger = gw_logger.logger + else: + logger = GatewayLogger().logger + if gw_logger and gw_name: + gw_logger.compress_log_files(gw_name) +if __name__ == '__main__': parser = argparse.ArgumentParser(prog="python3 -m control", description="Manage NVMe gateways", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -30,8 +38,12 @@ ) args = parser.parse_args() + signal.signal(signal.SIGTERM, sigterm_handler) + config = GatewayConfig(args.config) - config.dump_config_file(logger) + gw_logger = GatewayLogger(config) + config.dump_config_file(gw_logger.logger) with GatewayServer(config) as gateway: + gw_name = gateway.name gateway.serve() gateway.keep_alive() diff --git a/control/config.py b/control/config.py index a0150a552..ea57bc1ea 100644 --- a/control/config.py +++ b/control/config.py @@ -8,6 +8,12 @@ # import configparser +import os +import socket +import logging +import logging.handlers +import gzip +import shutil class GatewayEnumUtils: def get_value_from_key(e_type, keyval, ignore_case = False): @@ -46,6 +52,7 @@ class GatewayConfig: """ DISCOVERY_NQN = "nqn.2014-08.org.nvmexpress.discovery" + CEPH_RUN_DIRECTORY = "/var/run/ceph/" def __init__(self, conffile): self.filepath = conffile @@ -102,3 +109,157 @@ def escape_address_if_ipv6(addr) -> str: if ":" in addr and not addr.strip().startswith("["): ret_addr = f"[{addr}]" return ret_addr + +class GatewayLogger: + CEPH_LOG_DIRECTORY = "/var/log/ceph/" + MAX_LOG_FILE_SIZE_DEFAULT = 10 + MAX_LOG_FILES_COUNT_DEFAULT = 20 + NVME_LOG_DIR_PREFIX = "nvmeof-" + NVME_LOG_FILE_NAME = "nvmeof-log" + logger = None + + def __init__(self, config=None): + self.logger = None + if config: + self.log_directory = config.get_with_default("gateway", "log_directory", GatewayLogger.CEPH_LOG_DIRECTORY) + else: + self.log_directory = GatewayLogger.CEPH_LOG_DIRECTORY + if not self.log_directory.endswith("/"): + self.log_directory += "/" + + if not GatewayLogger.logger: + frmtr = logging.Formatter(fmt='[%(asctime)s] %(levelname)s %(filename)s:%(lineno)d: %(message)s') + frmtr.default_msec_format = None + + if config: + max_log_file_size = config.getint_with_default("gateway", "max_log_file_size_in_mb", GatewayLogger.MAX_LOG_FILE_SIZE_DEFAULT) + max_log_files_count = config.getint_with_default("gateway", "max_log_files_count", GatewayLogger.MAX_LOG_FILES_COUNT_DEFAULT) + log_level = config.get_with_default("gateway", "log_level", "info") + gateway_name = config.get("gateway", "name") + else: + max_log_file_size = GatewayLogger.MAX_LOG_FILE_SIZE_DEFAULT + max_log_files_count = GatewayLogger.MAX_LOG_FILES_COUNT_DEFAULT + log_level = "info" + gateway_name = None + if not gateway_name: + gateway_name = socket.gethostname() + gw_log_dir = self.log_directory + GatewayLogger.NVME_LOG_DIR_PREFIX + gateway_name + + handler = None + try: + shutil.rmtree(gw_log_dir + ".bak2", ignore_errors = True) + except Exception: + pass + try: + os.rename(gw_log_dir + ".bak", gw_log_dir + ".bak2") + except Exception: + pass + try: + os.rename(gw_log_dir, gw_log_dir + ".bak") + except Exception: + pass + # Just to be on the safe side, in case the rename failed + try: + shutil.rmtree(gw_log_dir, ignore_errors = True) + except Exception: + pass + try: + os.makedirs(gw_log_dir, 0o777, True) + handler = logging.handlers.RotatingFileHandler(gw_log_dir + "/" + GatewayLogger.NVME_LOG_FILE_NAME, + maxBytes = max_log_file_size * 1024 * 1024, + backupCount = max_log_files_count) + handler.setFormatter(frmtr) + except Exception: + pass + + logging.basicConfig(level=GatewayLogger.get_log_level(log_level)) + GatewayLogger.logger = logging.getLogger("nvmeof") + if handler: + GatewayLogger.logger.addHandler(handler) + self.logger = GatewayLogger.logger + self.compress_log_files() + + def get_log_level(log_level): + if type(log_level) == int: + return log_level + assert type(log_level) == str + if log_level.upper() == "DEBUG": + return logging.DEBUG + elif log_level.upper() == "INFO": + return logging.INFO + elif log_level.upper() == "WARNING": + return logging.WARNING + elif log_level.upper() == "ERROR": + return logging.ERROR + elif log_level.upper() == "CRITICAL": + return logging.CRITICAL + elif log_level.upper() == "NOTSET": + return logging.NOTSET + else: + assert False + + def set_log_level(self, log_level): + log_level = GatewayLogger.get_log_level(log_level) + self.logger.setLevel(log_level) + + def compress_file(self, src, dest): + self.logger.debug(f"Will compress {src} to {dest}") + try: + os.remove(dest) + except Exception: + pass + need_to_remove_dest = False + try: + with open(src, 'rb') as f_in: + with gzip.open(dest, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + except FileNotFoundError: + return + except Exception: + self.logger.exception(f"Failure compressing file {src}") + need_to_remove_dest = True + + if need_to_remove_dest: + # We ran into a problem trying to compress so need to remove destination file in case one was created + try: + os.remove(dest) + except Exception: + self.logger.exception(f"Failure deleting file {dest}, ignore") + return + + # If we got here the compression was successful so we can delete the source file + try: + os.remove(src) + except Exception: + self.logger.exception(f"Failure deleting file {src}, ignore") + + def compress_log_files(self, gw_name = None): + try: + tree_list = os.walk(self.log_directory) + except Exception: + self.logger.exception(f"Failure listing files in directory {self.log_directory}") + tree_list = [] + + compress_also = None + if gw_name: + compress_also = self.log_directory + GatewayLogger.NVME_LOG_DIR_PREFIX + gw_name + + for rootdir, subdirs, files in tree_list: + if compress_also and rootdir == compress_also: + self.compress_file(rootdir + "/" + GatewayLogger.NVME_LOG_FILE_NAME, + rootdir + "/" + GatewayLogger.NVME_LOG_FILE_NAME + ".gz") + for file in files: + if not file.startswith(GatewayLogger.NVME_LOG_FILE_NAME): + continue + if not "." in file: + # Only work on backup files which have a dot and a numeric extension + continue + if file.endswith(".gz"): + continue + filename_extenstion = file.split(".")[-1:][0] + try: + ext_num = int(filename_extenstion) + except Exception: + self.logger.exception(f"Invalid extension for log file {rootdir}/{file}") + continue + self.compress_file(rootdir + "/" + file, rootdir + "/" + file + ".gz") diff --git a/control/discovery.py b/control/discovery.py index 147893074..c0f566a73 100644 --- a/control/discovery.py +++ b/control/discovery.py @@ -14,6 +14,7 @@ from .config import GatewayConfig from .state import GatewayState, LocalGatewayState, OmapGatewayState, GatewayStateHandler from .config import GatewayEnumUtils +from .config import GatewayLogger from .proto import gateway_pb2 as pb2 import rados @@ -297,9 +298,7 @@ def __init__(self, config): self.lock = threading.Lock() self.omap_state = OmapGatewayState(self.config) - self.logger = logging.getLogger(__name__) - log_level = self.config.getint_with_default("discovery", "debug", 20) - self.logger.setLevel(level=log_level) + self.logger = GatewayLogger(config).logger gateway_group = self.config.get_with_default("gateway", "group", "") self.omap_name = f"nvmeof.{gateway_group}.state" \ @@ -1032,11 +1031,6 @@ def start_service(self): self.logger.debug("received a ctrl+C interrupt. exiting...") def main(args=None): - # Set up root logger - logging.basicConfig() - logger = logging.getLogger() - logger.setLevel(logging.DEBUG) - parser = argparse.ArgumentParser(prog="python3 -m control", description="Discover NVMe gateways") parser.add_argument( diff --git a/control/grpc.py b/control/grpc.py index b0f085fdd..bb8f644a3 100644 --- a/control/grpc.py +++ b/control/grpc.py @@ -28,6 +28,7 @@ from .proto import gateway_pb2_grpc as pb2_grpc from .config import GatewayConfig from .config import GatewayEnumUtils +from .config import GatewayLogger from .state import GatewayState MAX_ANA_GROUPS = 4 @@ -47,7 +48,7 @@ class GatewayService(pb2_grpc.GatewayServicer): def __init__(self, config, gateway_state, omap_lock, spdk_rpc_client) -> None: """Constructor""" - self.logger = logging.getLogger(__name__) + self.logger = GatewayLogger(config).logger ver = os.getenv("NVMEOF_VERSION") if ver: self.logger.info(f"Using NVMeoF gateway version {ver}") @@ -818,7 +819,7 @@ def list_namespaces(self, request, context=None): nsid_msg = f"namespace with NSID {request.nsid} and UUID {request.uuid}" else: nsid_msg = f"namespace with NSID {request.nsid}" - self.logger.info(f"Received request to list {nsid_msg}for {request.subsystem}, context: {context}") + self.logger.info(f"Received request to list {nsid_msg} for {request.subsystem}, context: {context}") with self.rpc_lock: try: diff --git a/control/server.py b/control/server.py index c09175f2c..6cb61ad68 100644 --- a/control/server.py +++ b/control/server.py @@ -15,7 +15,6 @@ import grpc import json import logging -import signal from concurrent import futures from google.protobuf import json_format @@ -29,11 +28,12 @@ from .grpc import GatewayService from .discovery import DiscoveryService from .config import GatewayConfig +from .config import GatewayLogger def sigchld_handler(signum, frame): """Handle SIGCHLD, runs when a spdk process terminates.""" - logger = logging.getLogger(__name__) - logger.error(f"GatewayServer: GSIGCHLD received {signum=}") + logger = GatewayLogger().logger + logger.error(f"GatewayServer: SIGCHLD received {signum=}") try: pid, wait_status = os.waitpid(-1, os.WNOHANG) @@ -61,8 +61,9 @@ class GatewayServer: """ def __init__(self, config): - self.logger = logging.getLogger(__name__) self.config = config + self.gw_logger_object = GatewayLogger(self.config) + self.logger = self.gw_logger_object.logger self.spdk_process = None self.gateway_rpc = None self.server = None @@ -74,7 +75,6 @@ def __init__(self, config): self.name = socket.gethostname() self.logger.info(f"Starting gateway {self.name}") - def __enter__(self): return self @@ -95,6 +95,7 @@ def __exit__(self, exc_type, exc_value, traceback): self._stop_discovery() self.logger.info("Exiting the gateway process.") + self.gw_logger_object.compress_log_files(self.name) def serve(self): """Starts gateway server.""" @@ -196,7 +197,7 @@ def _get_spdk_rpc_socket_path(self, omap_state) -> str: spdk_rpc_socket_dir = self.config.get_with_default("spdk", "rpc_socket_dir", "") if not spdk_rpc_socket_dir: - spdk_rpc_socket_dir = "/var/run/ceph/" + spdk_rpc_socket_dir = GatewayConfig.CEPH_RUN_DIRECTORY if omap_state.ceph_fsid: spdk_rpc_socket_dir += omap_state.ceph_fsid + "/" if not spdk_rpc_socket_dir.endswith("/"): diff --git a/control/state.py b/control/state.py index cd2cf1063..95de59668 100644 --- a/control/state.py +++ b/control/state.py @@ -15,7 +15,7 @@ from typing import Dict from collections import defaultdict from abc import ABC, abstractmethod - +from .config import GatewayLogger class GatewayState(ABC): """Persists gateway NVMeoF target state. @@ -316,7 +316,7 @@ class OmapGatewayState(GatewayState): def __init__(self, config): self.config = config self.version = 1 - self.logger = logging.getLogger(__name__) + self.logger = GatewayLogger(self.config).logger self.watch = None gateway_group = self.config.get("gateway", "group") self.omap_name = f"nvmeof.{gateway_group}.state" if gateway_group else "nvmeof.state" @@ -516,7 +516,7 @@ def __init__(self, config, local, omap, gateway_rpc_caller): self.omap = omap self.gateway_rpc_caller = gateway_rpc_caller self.update_timer = None - self.logger = logging.getLogger(__name__) + self.logger = GatewayLogger(self.config).logger self.update_interval = self.config.getint("gateway", "state_update_interval_sec") if self.update_interval < 1: diff --git a/docker-compose.yaml b/docker-compose.yaml index 50da19d90..d21b45107 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -112,6 +112,7 @@ services: - /dev/vfio/vfio:/dev/vfio/vfio - $NVMEOF_CONFIG:/src/ceph-nvmeof.conf - /tmp/coredump:/tmp/coredump # core dump + - /var/log/ceph:/var/log/ceph - ceph-conf:/etc/ceph:ro cap_add: - SYS_ADMIN # huge-pages diff --git a/tests/test_cli.py b/tests/test_cli.py index 3aaafe9db..49e9dc133 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -716,3 +716,17 @@ def test_log_flags(self, caplog, gateway): pass assert "error: argument --level/-l: invalid choice: 'JUNK'" in caplog.text assert rc == 2 + +class TestLogFiles: + def test_log_files(self, caplog, gateway): + look_for = f"/var/log/ceph/nvmeof-{gateway_name}" + tree_list = os.walk("/var/log/ceph") + for rootdir, subdirs, files in tree_list: + if rootdir == "/var/log/ceph": + pass + elif rootdir == look_for: + assert "nvmeof-log" in files + else: + assert "nvmeof-log.gz" in files + with open(f"/var/log/ceph/nvmeof-{gateway_name}/nvmeof-log", "r") as f: + assert f"Starting gateway {gateway_name}" in f.read() diff --git a/tests/test_grpc.py b/tests/test_grpc.py index 36d2a891d..5171b99b9 100644 --- a/tests/test_grpc.py +++ b/tests/test_grpc.py @@ -5,8 +5,6 @@ import logging import warnings -# Set up a logger -logger = logging.getLogger(__name__) image = "mytestdevimage" pool = "rbd" subsystem_prefix = "nqn.2016-06.io.spdk:cnode"