Skip to content

Commit

Permalink
coredump: generate stacktraces also for processes running in containe…
Browse files Browse the repository at this point in the history
…rs w/o coredump forwarding

Note that entering container namespace has to be explicitly enabled by
setting SYSTEMD_COREDUMP_ALLOW_NAMESPACE_CHANGE environment variable.

RHEL-only

Resolves: RHEL-29430
  • Loading branch information
msekletar committed Apr 24, 2024
1 parent e100e38 commit a4769ff
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 60 deletions.
2 changes: 1 addition & 1 deletion src/analyze/analyze-inspect-elf.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ static int analyze_elf(char **filenames, JsonFormatFlags json_flags) {
if (fd < 0)
return log_error_errno(fd, "Could not open \"%s\": %m", abspath);

r = parse_elf_object(fd, abspath, /* fork_disable_dump= */false, NULL, &package_metadata);
r = parse_elf_object(fd, -EBADF, UID_NOBODY, GID_NOBODY, abspath, /* fork_disable_dump= */false, NULL, &package_metadata);
if (r < 0)
return log_error_errno(r, "Parsing \"%s\" as ELF object failed: %m", abspath);

Expand Down
52 changes: 52 additions & 0 deletions src/basic/socket-util.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@
# define IDN_FLAGS 0
#endif

/* From the kernel's include/net/scm.h */
#ifndef SCM_MAX_FD
# define SCM_MAX_FD 253
#endif

static const char* const socket_address_type_table[] = {
[SOCK_STREAM] = "Stream",
[SOCK_DGRAM] = "Datagram",
Expand Down Expand Up @@ -951,6 +956,53 @@ int getpeergroups(int fd, gid_t **ret) {
return (int) n;
}

ssize_t send_many_fds_iov_sa(
int transport_fd,
int *fds_array, size_t n_fds_array,
const struct iovec *iov, size_t iovlen,
const struct sockaddr *sa, socklen_t len,
int flags) {

_cleanup_free_ struct cmsghdr *cmsg = NULL;
struct msghdr mh = {
.msg_name = (struct sockaddr*) sa,
.msg_namelen = len,
.msg_iov = (struct iovec *)iov,
.msg_iovlen = iovlen,
};
ssize_t k;

assert(transport_fd >= 0);
assert(fds_array || n_fds_array == 0);

/* The kernel will reject sending more than SCM_MAX_FD FDs at once */
if (n_fds_array > SCM_MAX_FD)
return -E2BIG;

/* We need either an FD array or data to send. If there's nothing, return an error. */
if (n_fds_array == 0 && !iov)
return -EINVAL;

if (n_fds_array > 0) {
mh.msg_controllen = CMSG_SPACE(sizeof(int) * n_fds_array);
mh.msg_control = cmsg = malloc(mh.msg_controllen);
if (!cmsg)
return -ENOMEM;

*cmsg = (struct cmsghdr) {
.cmsg_len = CMSG_LEN(sizeof(int) * n_fds_array),
.cmsg_level = SOL_SOCKET,
.cmsg_type = SCM_RIGHTS,
};
memcpy(CMSG_DATA(cmsg), fds_array, sizeof(int) * n_fds_array);
}
k = sendmsg(transport_fd, &mh, MSG_NOSIGNAL | flags);
if (k < 0)
return (ssize_t) -errno;

return k;
}

ssize_t send_one_fd_iov_sa(
int transport_fd,
int fd,
Expand Down
15 changes: 15 additions & 0 deletions src/basic/socket-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,13 @@ int getpeercred(int fd, struct ucred *ucred);
int getpeersec(int fd, char **ret);
int getpeergroups(int fd, gid_t **ret);

ssize_t send_many_fds_iov_sa(
int transport_fd,
int *fds_array, size_t n_fds_array,
const struct iovec *iov, size_t iovlen,
const struct sockaddr *sa, socklen_t len,
int flags);

ssize_t send_one_fd_iov_sa(
int transport_fd,
int fd,
Expand All @@ -163,6 +170,14 @@ int send_one_fd_sa(int transport_fd,
int fd,
const struct sockaddr *sa, socklen_t len,
int flags);
static inline int send_many_fds(
int transport_fd,
int *fds_array,
size_t n_fds_array,
int flags) {

return send_many_fds_iov_sa(transport_fd, fds_array, n_fds_array, NULL, 0, NULL, 0, flags);
}
#define send_one_fd_iov(transport_fd, fd, iov, iovlen, flags) send_one_fd_iov_sa(transport_fd, fd, iov, iovlen, NULL, 0, flags)
#define send_one_fd(transport_fd, fd, flags) send_one_fd_iov_sa(transport_fd, fd, NULL, 0, NULL, 0, flags)
ssize_t receive_one_fd_iov(int transport_fd, struct iovec *iov, size_t iovlen, int flags, int *ret_fd);
Expand Down
151 changes: 97 additions & 54 deletions src/coredump/coredump.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "coredump-vacuum.h"
#include "dirent-util.h"
#include "elf-util.h"
#include "env-util.h"
#include "escape.h"
#include "fd-util.h"
#include "fileio.h"
Expand All @@ -36,6 +37,7 @@
#include "main-func.h"
#include "memory-util.h"
#include "mkdir-label.h"
#include "namespace-util.h"
#include "parse-util.h"
#include "process-util.h"
#include "signal-util.h"
Expand Down Expand Up @@ -130,6 +132,8 @@ typedef struct Context {
const char *meta[_META_MAX];
size_t meta_size[_META_MAX];
pid_t pid;
uid_t uid;
gid_t gid;
bool is_pid1;
bool is_journald;
} Context;
Expand Down Expand Up @@ -866,36 +870,11 @@ static int get_process_container_parent_cmdline(pid_t pid, char** cmdline) {
return 1;
}

static int change_uid_gid(const Context *context) {
uid_t uid;
gid_t gid;
int r;

r = parse_uid(context->meta[META_ARGV_UID], &uid);
if (r < 0)
return r;

if (uid_is_system(uid)) {
const char *user = "systemd-coredump";

r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
if (r < 0) {
log_warning_errno(r, "Cannot resolve %s user. Proceeding to dump core as root: %m", user);
uid = gid = 0;
}
} else {
r = parse_gid(context->meta[META_ARGV_GID], &gid);
if (r < 0)
return r;
}

return drop_privileges(uid, gid, 0);
}

static int submit_coredump(
const Context *context,
struct iovec_wrapper *iovw,
int input_fd) {
int input_fd,
int mntns_fd) {

_cleanup_(json_variant_unrefp) JsonVariant *json_metadata = NULL;
_cleanup_close_ int coredump_fd = -1, coredump_node_fd = -1;
Expand Down Expand Up @@ -938,15 +917,6 @@ static int submit_coredump(
/* Vacuum again, but exclude the coredump we just created */
(void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);

/* Now, let's drop privileges to become the user who owns the segfaulted process
* and allocate the coredump memory under the user's uid. This also ensures that
* the credentials journald will see are the ones of the coredumping user, thus
* making sure the user gets access to the core dump. Let's also get rid of all
* capabilities, if we run as root, we won't need them anymore. */
r = change_uid_gid(context);
if (r < 0)
return log_error_errno(r, "Failed to drop privileges: %m");

/* Try to get a stack trace if we can */
if (coredump_size > arg_process_size_max)
log_debug("Not generating stack trace: core size %"PRIu64" is greater "
Expand All @@ -956,12 +926,23 @@ static int submit_coredump(
bool skip = startswith(context->meta[META_COMM], "systemd-coredum"); /* COMM is 16 bytes usually */

(void) parse_elf_object(coredump_fd,
mntns_fd,
context->uid,
context->gid,
context->meta[META_EXE],
/* fork_disable_dump= */ skip, /* avoid loops */
&stacktrace,
&json_metadata);
}

/* Now, let's drop privileges to become the user who owns the segfaulted process. This also ensures
* that the credentials journald will see are the ones of the coredumping user, thus making sure
* the user gets access to the core dump. Let's also get rid of all capabilities, if we run as root,
* we won't need them anymore. */
r = drop_privileges(context->uid, context->gid, 0);
if (r < 0)
return log_error_errno(r, "Failed to drop privileges: %m");

log:
core_message = strjoina("Process ", context->meta[META_ARGV_PID],
" (", context->meta[META_COMM], ") of user ",
Expand Down Expand Up @@ -1094,6 +1075,15 @@ static int save_context(Context *context, const struct iovec_wrapper *iovw) {
if (r < 0)
return log_error_errno(r, "Failed to parse PID \"%s\": %m", context->meta[META_ARGV_PID]);

r = parse_uid(context->meta[META_ARGV_UID], &context->uid);
if (r < 0)
return log_error_errno(r, "Failed to parse UID \"%s\": %m", context->meta[META_ARGV_UID]);

r = parse_gid(context->meta[META_ARGV_GID], &context->gid);
if (r < 0)
return log_error_errno(r, "Failed to parse GID \"%s\": %m", context->meta[META_ARGV_GID]);


unit = context->meta[META_UNIT];
context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE);
context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE);
Expand All @@ -1102,11 +1092,11 @@ static int save_context(Context *context, const struct iovec_wrapper *iovw) {
}

static int process_socket(int fd) {
_cleanup_close_ int input_fd = -1;
_cleanup_close_ int input_fd = -EBADF, mntns_fd = -EBADF;
Context context = {};
struct iovec_wrapper iovw = {};
struct iovec iovec;
int r;
int iterations = 0, r;

assert(fd >= 0);

Expand Down Expand Up @@ -1146,23 +1136,39 @@ static int process_socket(int fd) {
goto finish;
}

/* The final zero-length datagram carries the file descriptor and tells us
/* The final zero-length datagram carries the file descriptors and tells us
* that we're done. */
if (n == 0) {
struct cmsghdr *found;

free(iovec.iov_base);

found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
if (!found) {
cmsg_close_all(&mh);
r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
"Coredump file descriptor missing.");
goto finish;
found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int) * 2));
if (found) {
int fds[2] = { -EBADF, -EBADF };

memcpy(fds, CMSG_DATA(found), sizeof(int) * 2);

assert(mntns_fd < 0);

/* Maybe we already got coredump FD in previous iteration? */
safe_close(input_fd);

input_fd = fds[0];
mntns_fd = fds[1];

/* We have all FDs we need let's take a shortcut here. */
break;
} else {
found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
if (found)
input_fd = *CMSG_DATA(found);
}

assert(input_fd < 0);
input_fd = *(int*) CMSG_DATA(found);
/* This is the first message that carries file descriptors, maybe there will be one more that actually contains array of descriptors. */
if (iterations++ == 0)
continue;

break;
} else
cmsg_close_all(&mh);
Expand All @@ -1177,7 +1183,11 @@ static int process_socket(int fd) {
}

/* Make sure we got all data we really need */
assert(input_fd >= 0);
if (input_fd < 0) {
r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
"Coredump file descriptor missing.");
goto finish;
}

r = save_context(&context, &iovw);
if (r < 0)
Expand All @@ -1192,15 +1202,15 @@ static int process_socket(int fd) {
goto finish;
}

r = submit_coredump(&context, &iovw, input_fd);
r = submit_coredump(&context, &iovw, input_fd, mntns_fd);

finish:
iovw_free_contents(&iovw, true);
return r;
}

static int send_iovec(const struct iovec_wrapper *iovw, int input_fd) {
_cleanup_close_ int fd = -1;
static int send_iovec(const struct iovec_wrapper *iovw, int input_fd, int mntns_fd) {
_cleanup_close_ int fd = -EBADF;
int r;

assert(iovw);
Expand Down Expand Up @@ -1256,6 +1266,12 @@ static int send_iovec(const struct iovec_wrapper *iovw, int input_fd) {
if (r < 0)
return log_error_errno(r, "Failed to send coredump fd: %m");

if (mntns_fd >= 0) {
r = send_many_fds(fd, (int[]) { input_fd, mntns_fd }, 2, 0);
if (r < 0)
return log_error_errno(r, "Failed to send coredump fds: %m");
}

return 0;
}

Expand Down Expand Up @@ -1428,7 +1444,7 @@ static int gather_pid_metadata(struct iovec_wrapper *iovw, Context *context) {
static int process_kernel(int argc, char* argv[]) {
Context context = {};
struct iovec_wrapper *iovw;
int r;
int r, mntns_fd = -EBADF;

/* When we're invoked by the kernel, stdout/stderr are closed which is dangerous because the fds
* could get reallocated. To avoid hard to debug issues, let's instead bind stdout/stderr to
Expand Down Expand Up @@ -1462,6 +1478,33 @@ static int process_kernel(int argc, char* argv[]) {
log_open();
}

r = in_same_namespace(getpid_cached(), context.pid, NAMESPACE_PID);
if (r < 0)
log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m");

if (r == 0 && getenv_bool("SYSTEMD_COREDUMP_ALLOW_NAMESPACE_CHANGE") > 0) {
int fd;

r = namespace_open(context.pid, NULL, &fd, NULL, NULL, NULL);
if (r < 0)
return log_error_errno(r, "Failed to open mntns of crashing process: %m");

mntns_fd = fd;
} else {
/* Crashing process is not running in the container or changing namespace is disabled, but we
still need to send mount namespace fd along side coredump fd so let's just open our own
mount namespace. Entering it will be NOP but that is OK. */
int fd;

r = namespace_open(getpid_cached(), NULL, &fd, NULL, NULL, NULL);
if (r < 0)
return log_error_errno(r, "Failed to open our mntns: %m");

mntns_fd = fd;
}

assert(mntns_fd >= 0 && fd_is_ns(mntns_fd, CLONE_NEWNS) > 0);

/* If this is PID 1 disable coredump collection, we'll unlikely be able to process
* it later on.
*
Expand All @@ -1474,9 +1517,9 @@ static int process_kernel(int argc, char* argv[]) {
}

if (context.is_journald || context.is_pid1)
r = submit_coredump(&context, iovw, STDIN_FILENO);
r = submit_coredump(&context, iovw, STDIN_FILENO, mntns_fd);
else
r = send_iovec(iovw, STDIN_FILENO);
r = send_iovec(iovw, STDIN_FILENO, mntns_fd);

finish:
iovw = iovw_free_free(iovw);
Expand Down
Loading

0 comments on commit a4769ff

Please sign in to comment.