Skip to content

Commit

Permalink
Save NVMeOF log in files under /var/log/ceph.
Browse files Browse the repository at this point in the history
Fixes #317

Signed-off-by: Gil Bregman <gbregman@il.ibm.com>
  • Loading branch information
gbregman committed Jan 7, 2024
1 parent 4300218 commit 66399b3
Show file tree
Hide file tree
Showing 11 changed files with 217 additions and 28 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ setup: ## Configure huge-pages (requires sudo/root password)

@echo Setup core dump pattern as /tmp/coredump/core.*
mkdir -p /tmp/coredump
sudo mkdir -p /var/log/ceph
sudo bash -c 'echo "|/usr/bin/env tee /tmp/coredump/core.%e.%p.%h.%t" > /proc/sys/kernel/core_pattern'
sudo bash -c 'echo $(HUGEPAGES) > $(HUGEPAGES_DIR)'
@echo Actual Hugepages allocation: $$(cat $(HUGEPAGES_DIR))
Expand Down
8 changes: 7 additions & 1 deletion ceph-nvmeof.conf
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,17 @@ enable_spdk_discovery_controller = False
#omap_file_lock_retries = 15
#omap_file_lock_retry_sleep_interval = 5
#omap_file_update_reloads = 10
log_level=debug
#max_log_file_size_in_mb=10
#max_log_files_count=20
#
# Notice that if you change the log directory the log files will only be visible inside the container
#
#log_directory = /var/log/ceph/

[discovery]
addr = 0.0.0.0
port = 8009
debug = 20

[ceph]
pool = rbd
Expand Down
24 changes: 18 additions & 6 deletions control/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,23 @@

import logging
import argparse
import signal
from .server import GatewayServer
from .config import GatewayConfig
from .config import GatewayLogger

if __name__ == '__main__':
# Set up root logger
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
gw_logger = None
gw_name = None

def sigterm_handler(signum, frame):
if gw_logger:
logger = gw_logger.logger
else:
logger = GatewayLogger().logger
if gw_logger and gw_name:
gw_logger.compress_log_files(gw_name)

if __name__ == '__main__':
parser = argparse.ArgumentParser(prog="python3 -m control",
description="Manage NVMe gateways",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
Expand All @@ -30,8 +38,12 @@
)
args = parser.parse_args()

signal.signal(signal.SIGTERM, sigterm_handler)

config = GatewayConfig(args.config)
config.dump_config_file(logger)
gw_logger = GatewayLogger(config)
config.dump_config_file(gw_logger.logger)
with GatewayServer(config) as gateway:
gw_name = gateway.name
gateway.serve()
gateway.keep_alive()
161 changes: 161 additions & 0 deletions control/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
#

import configparser
import os
import socket
import logging
import logging.handlers
import gzip
import shutil

class GatewayEnumUtils:
def get_value_from_key(e_type, keyval, ignore_case = False):
Expand Down Expand Up @@ -46,6 +52,7 @@ class GatewayConfig:
"""

DISCOVERY_NQN = "nqn.2014-08.org.nvmexpress.discovery"
CEPH_RUN_DIRECTORY = "/var/run/ceph/"

def __init__(self, conffile):
self.filepath = conffile
Expand Down Expand Up @@ -102,3 +109,157 @@ def escape_address_if_ipv6(addr) -> str:
if ":" in addr and not addr.strip().startswith("["):
ret_addr = f"[{addr}]"
return ret_addr

class GatewayLogger:
CEPH_LOG_DIRECTORY = "/var/log/ceph/"
MAX_LOG_FILE_SIZE_DEFAULT = 10
MAX_LOG_FILES_COUNT_DEFAULT = 20
NVME_LOG_DIR_PREFIX = "nvmeof-"
NVME_LOG_FILE_NAME = "nvmeof-log"
logger = None

def __init__(self, config=None):
self.logger = None
if config:
self.log_directory = config.get_with_default("gateway", "log_directory", GatewayLogger.CEPH_LOG_DIRECTORY)
else:
self.log_directory = GatewayLogger.CEPH_LOG_DIRECTORY
if not self.log_directory.endswith("/"):
self.log_directory += "/"

if not GatewayLogger.logger:
frmtr = logging.Formatter(fmt='[%(asctime)s] %(levelname)s %(filename)s:%(lineno)d: %(message)s')
frmtr.default_msec_format = None

if config:
max_log_file_size = config.getint_with_default("gateway", "max_log_file_size_in_mb", GatewayLogger.MAX_LOG_FILE_SIZE_DEFAULT)
max_log_files_count = config.getint_with_default("gateway", "max_log_files_count", GatewayLogger.MAX_LOG_FILES_COUNT_DEFAULT)
log_level = config.get_with_default("gateway", "log_level", "info")
gateway_name = config.get("gateway", "name")
else:
max_log_file_size = GatewayLogger.MAX_LOG_FILE_SIZE_DEFAULT
max_log_files_count = GatewayLogger.MAX_LOG_FILES_COUNT_DEFAULT
log_level = "info"
gateway_name = None
if not gateway_name:
gateway_name = socket.gethostname()
gw_log_dir = self.log_directory + GatewayLogger.NVME_LOG_DIR_PREFIX + gateway_name

handler = None
try:
shutil.rmtree(gw_log_dir + ".bak2", ignore_errors = True)
except Exception:
pass
try:
os.rename(gw_log_dir + ".bak", gw_log_dir + ".bak2")
except Exception:
pass
try:
os.rename(gw_log_dir, gw_log_dir + ".bak")
except Exception:
pass
# Just to be on the safe side, in case the rename failed
try:
shutil.rmtree(gw_log_dir, ignore_errors = True)
except Exception:
pass
try:
os.makedirs(gw_log_dir, 0o777, True)
handler = logging.handlers.RotatingFileHandler(gw_log_dir + "/" + GatewayLogger.NVME_LOG_FILE_NAME,
maxBytes = max_log_file_size * 1024 * 1024,
backupCount = max_log_files_count)
handler.setFormatter(frmtr)
except Exception:
pass

logging.basicConfig(level=GatewayLogger.get_log_level(log_level))
GatewayLogger.logger = logging.getLogger("nvmeof")
if handler:
GatewayLogger.logger.addHandler(handler)
self.logger = GatewayLogger.logger
self.compress_log_files()

def get_log_level(log_level):
if type(log_level) == int:
return log_level
assert type(log_level) == str
if log_level.upper() == "DEBUG":
return logging.DEBUG
elif log_level.upper() == "INFO":
return logging.INFO
elif log_level.upper() == "WARNING":
return logging.WARNING
elif log_level.upper() == "ERROR":
return logging.ERROR
elif log_level.upper() == "CRITICAL":
return logging.CRITICAL
elif log_level.upper() == "NOTSET":
return logging.NOTSET
else:
assert False

def set_log_level(self, log_level):
log_level = GatewayLogger.get_log_level(log_level)
self.logger.setLevel(log_level)

def compress_file(self, src, dest):
self.logger.debug(f"Will compress {src} to {dest}")
try:
os.remove(dest)
except Exception:
pass
need_to_remove_dest = False
try:
with open(src, 'rb') as f_in:
with gzip.open(dest, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
except FileNotFoundError:
return
except Exception:
self.logger.exception(f"Failure compressing file {src}")
need_to_remove_dest = True

if need_to_remove_dest:
# We ran into a problem trying to compress so need to remove destination file in case one was created
try:
os.remove(dest)
except Exception:
self.logger.exception(f"Failure deleting file {dest}, ignore")
return

# If we got here the compression was successful so we can delete the source file
try:
os.remove(src)
except Exception:
self.logger.exception(f"Failure deleting file {src}, ignore")

def compress_log_files(self, gw_name = None):
try:
tree_list = os.walk(self.log_directory)
except Exception:
self.logger.exception(f"Failure listing files in directory {self.log_directory}")
tree_list = []

compress_also = None
if gw_name:
compress_also = self.log_directory + GatewayLogger.NVME_LOG_DIR_PREFIX + gw_name

for rootdir, subdirs, files in tree_list:
if compress_also and rootdir == compress_also:
self.compress_file(rootdir + "/" + GatewayLogger.NVME_LOG_FILE_NAME,
rootdir + "/" + GatewayLogger.NVME_LOG_FILE_NAME + ".gz")
for file in files:
if not file.startswith(GatewayLogger.NVME_LOG_FILE_NAME):
continue
if not "." in file:
# Only work on backup files which have a dot and a numeric extension
continue
if file.endswith(".gz"):
continue
filename_extenstion = file.split(".")[-1:][0]
try:
ext_num = int(filename_extenstion)
except Exception:
self.logger.exception(f"Invalid extension for log file {rootdir}/{file}")
continue
self.compress_file(rootdir + "/" + file, rootdir + "/" + file + ".gz")
10 changes: 2 additions & 8 deletions control/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .config import GatewayConfig
from .state import GatewayState, LocalGatewayState, OmapGatewayState, GatewayStateHandler
from .config import GatewayEnumUtils
from .config import GatewayLogger
from .proto import gateway_pb2 as pb2

import rados
Expand Down Expand Up @@ -297,9 +298,7 @@ def __init__(self, config):
self.lock = threading.Lock()
self.omap_state = OmapGatewayState(self.config)

self.logger = logging.getLogger(__name__)
log_level = self.config.getint_with_default("discovery", "debug", 20)
self.logger.setLevel(level=log_level)
self.logger = GatewayLogger(config).logger

gateway_group = self.config.get_with_default("gateway", "group", "")
self.omap_name = f"nvmeof.{gateway_group}.state" \
Expand Down Expand Up @@ -1032,11 +1031,6 @@ def start_service(self):
self.logger.debug("received a ctrl+C interrupt. exiting...")

def main(args=None):
# Set up root logger
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

parser = argparse.ArgumentParser(prog="python3 -m control",
description="Discover NVMe gateways")
parser.add_argument(
Expand Down
5 changes: 3 additions & 2 deletions control/grpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from .proto import gateway_pb2_grpc as pb2_grpc
from .config import GatewayConfig
from .config import GatewayEnumUtils
from .config import GatewayLogger
from .state import GatewayState

MAX_ANA_GROUPS = 4
Expand All @@ -47,7 +48,7 @@ class GatewayService(pb2_grpc.GatewayServicer):

def __init__(self, config, gateway_state, omap_lock, spdk_rpc_client) -> None:
"""Constructor"""
self.logger = logging.getLogger(__name__)
self.logger = GatewayLogger(config).logger
ver = os.getenv("NVMEOF_VERSION")
if ver:
self.logger.info(f"Using NVMeoF gateway version {ver}")
Expand Down Expand Up @@ -818,7 +819,7 @@ def list_namespaces(self, request, context=None):
nsid_msg = f"namespace with NSID {request.nsid} and UUID {request.uuid}"
else:
nsid_msg = f"namespace with NSID {request.nsid}"
self.logger.info(f"Received request to list {nsid_msg}for {request.subsystem}, context: {context}")
self.logger.info(f"Received request to list {nsid_msg} for {request.subsystem}, context: {context}")

with self.rpc_lock:
try:
Expand Down
13 changes: 7 additions & 6 deletions control/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import grpc
import json
import logging
import signal
from concurrent import futures
from google.protobuf import json_format

Expand All @@ -29,11 +28,12 @@
from .grpc import GatewayService
from .discovery import DiscoveryService
from .config import GatewayConfig
from .config import GatewayLogger

def sigchld_handler(signum, frame):
"""Handle SIGCHLD, runs when a spdk process terminates."""
logger = logging.getLogger(__name__)
logger.error(f"GatewayServer: GSIGCHLD received {signum=}")
logger = GatewayLogger().logger
logger.error(f"GatewayServer: SIGCHLD received {signum=}")

try:
pid, wait_status = os.waitpid(-1, os.WNOHANG)
Expand Down Expand Up @@ -61,8 +61,9 @@ class GatewayServer:
"""

def __init__(self, config):
self.logger = logging.getLogger(__name__)
self.config = config
self.gw_logger_object = GatewayLogger(self.config)
self.logger = self.gw_logger_object.logger
self.spdk_process = None
self.gateway_rpc = None
self.server = None
Expand All @@ -74,7 +75,6 @@ def __init__(self, config):
self.name = socket.gethostname()
self.logger.info(f"Starting gateway {self.name}")


def __enter__(self):
return self

Expand All @@ -95,6 +95,7 @@ def __exit__(self, exc_type, exc_value, traceback):
self._stop_discovery()

self.logger.info("Exiting the gateway process.")
self.gw_logger_object.compress_log_files(self.name)

def serve(self):
"""Starts gateway server."""
Expand Down Expand Up @@ -196,7 +197,7 @@ def _get_spdk_rpc_socket_path(self, omap_state) -> str:

spdk_rpc_socket_dir = self.config.get_with_default("spdk", "rpc_socket_dir", "")
if not spdk_rpc_socket_dir:
spdk_rpc_socket_dir = "/var/run/ceph/"
spdk_rpc_socket_dir = GatewayConfig.CEPH_RUN_DIRECTORY
if omap_state.ceph_fsid:
spdk_rpc_socket_dir += omap_state.ceph_fsid + "/"
if not spdk_rpc_socket_dir.endswith("/"):
Expand Down
6 changes: 3 additions & 3 deletions control/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from typing import Dict
from collections import defaultdict
from abc import ABC, abstractmethod

from .config import GatewayLogger

class GatewayState(ABC):
"""Persists gateway NVMeoF target state.
Expand Down Expand Up @@ -316,7 +316,7 @@ class OmapGatewayState(GatewayState):
def __init__(self, config):
self.config = config
self.version = 1
self.logger = logging.getLogger(__name__)
self.logger = GatewayLogger(self.config).logger
self.watch = None
gateway_group = self.config.get("gateway", "group")
self.omap_name = f"nvmeof.{gateway_group}.state" if gateway_group else "nvmeof.state"
Expand Down Expand Up @@ -516,7 +516,7 @@ def __init__(self, config, local, omap, gateway_rpc_caller):
self.omap = omap
self.gateway_rpc_caller = gateway_rpc_caller
self.update_timer = None
self.logger = logging.getLogger(__name__)
self.logger = GatewayLogger(self.config).logger
self.update_interval = self.config.getint("gateway",
"state_update_interval_sec")
if self.update_interval < 1:
Expand Down
1 change: 1 addition & 0 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ services:
- /dev/vfio/vfio:/dev/vfio/vfio
- $NVMEOF_CONFIG:/src/ceph-nvmeof.conf
- /tmp/coredump:/tmp/coredump # core dump
- /var/log/ceph:/var/log/ceph
- ceph-conf:/etc/ceph:ro
cap_add:
- SYS_ADMIN # huge-pages
Expand Down
Loading

0 comments on commit 66399b3

Please sign in to comment.