diff --git a/src/analyze/analyze-inspect-elf.c b/src/analyze/analyze-inspect-elf.c index 155c611c71..b8074100a5 100644 --- a/src/analyze/analyze-inspect-elf.c +++ b/src/analyze/analyze-inspect-elf.c @@ -30,7 +30,7 @@ static int analyze_elf(char **filenames, JsonFormatFlags json_flags) { if (fd < 0) return log_error_errno(fd, "Could not open \"%s\": %m", abspath); - r = parse_elf_object(fd, abspath, /* fork_disable_dump= */false, NULL, &package_metadata); + r = parse_elf_object(fd, -EBADF, UID_NOBODY, GID_NOBODY, abspath, /* fork_disable_dump= */false, NULL, &package_metadata); if (r < 0) return log_error_errno(r, "Parsing \"%s\" as ELF object failed: %m", abspath); diff --git a/src/basic/socket-util.c b/src/basic/socket-util.c index f39be19a59..1d86ca3f55 100644 --- a/src/basic/socket-util.c +++ b/src/basic/socket-util.c @@ -41,6 +41,11 @@ # define IDN_FLAGS 0 #endif +/* From the kernel's include/net/scm.h */ +#ifndef SCM_MAX_FD +# define SCM_MAX_FD 253 +#endif + static const char* const socket_address_type_table[] = { [SOCK_STREAM] = "Stream", [SOCK_DGRAM] = "Datagram", @@ -951,6 +956,53 @@ int getpeergroups(int fd, gid_t **ret) { return (int) n; } +ssize_t send_many_fds_iov_sa( + int transport_fd, + int *fds_array, size_t n_fds_array, + const struct iovec *iov, size_t iovlen, + const struct sockaddr *sa, socklen_t len, + int flags) { + + _cleanup_free_ struct cmsghdr *cmsg = NULL; + struct msghdr mh = { + .msg_name = (struct sockaddr*) sa, + .msg_namelen = len, + .msg_iov = (struct iovec *)iov, + .msg_iovlen = iovlen, + }; + ssize_t k; + + assert(transport_fd >= 0); + assert(fds_array || n_fds_array == 0); + + /* The kernel will reject sending more than SCM_MAX_FD FDs at once */ + if (n_fds_array > SCM_MAX_FD) + return -E2BIG; + + /* We need either an FD array or data to send. If there's nothing, return an error. */ + if (n_fds_array == 0 && !iov) + return -EINVAL; + + if (n_fds_array > 0) { + mh.msg_controllen = CMSG_SPACE(sizeof(int) * n_fds_array); + mh.msg_control = cmsg = malloc(mh.msg_controllen); + if (!cmsg) + return -ENOMEM; + + *cmsg = (struct cmsghdr) { + .cmsg_len = CMSG_LEN(sizeof(int) * n_fds_array), + .cmsg_level = SOL_SOCKET, + .cmsg_type = SCM_RIGHTS, + }; + memcpy(CMSG_DATA(cmsg), fds_array, sizeof(int) * n_fds_array); + } + k = sendmsg(transport_fd, &mh, MSG_NOSIGNAL | flags); + if (k < 0) + return (ssize_t) -errno; + + return k; +} + ssize_t send_one_fd_iov_sa( int transport_fd, int fd, diff --git a/src/basic/socket-util.h b/src/basic/socket-util.h index 2e36e1a56b..61bf8ff32b 100644 --- a/src/basic/socket-util.h +++ b/src/basic/socket-util.h @@ -153,6 +153,13 @@ int getpeercred(int fd, struct ucred *ucred); int getpeersec(int fd, char **ret); int getpeergroups(int fd, gid_t **ret); +ssize_t send_many_fds_iov_sa( + int transport_fd, + int *fds_array, size_t n_fds_array, + const struct iovec *iov, size_t iovlen, + const struct sockaddr *sa, socklen_t len, + int flags); + ssize_t send_one_fd_iov_sa( int transport_fd, int fd, @@ -163,6 +170,14 @@ int send_one_fd_sa(int transport_fd, int fd, const struct sockaddr *sa, socklen_t len, int flags); +static inline int send_many_fds( + int transport_fd, + int *fds_array, + size_t n_fds_array, + int flags) { + + return send_many_fds_iov_sa(transport_fd, fds_array, n_fds_array, NULL, 0, NULL, 0, flags); +} #define send_one_fd_iov(transport_fd, fd, iov, iovlen, flags) send_one_fd_iov_sa(transport_fd, fd, iov, iovlen, NULL, 0, flags) #define send_one_fd(transport_fd, fd, flags) send_one_fd_iov_sa(transport_fd, fd, NULL, 0, NULL, 0, flags) ssize_t receive_one_fd_iov(int transport_fd, struct iovec *iov, size_t iovlen, int flags, int *ret_fd); diff --git a/src/coredump/coredump.c b/src/coredump/coredump.c index b9c5f3ad04..2acb16f63f 100644 --- a/src/coredump/coredump.c +++ b/src/coredump/coredump.c @@ -24,6 +24,7 @@ #include "coredump-vacuum.h" #include "dirent-util.h" #include "elf-util.h" +#include "env-util.h" #include "escape.h" #include "fd-util.h" #include "fileio.h" @@ -36,6 +37,7 @@ #include "main-func.h" #include "memory-util.h" #include "mkdir-label.h" +#include "namespace-util.h" #include "parse-util.h" #include "process-util.h" #include "signal-util.h" @@ -130,6 +132,8 @@ typedef struct Context { const char *meta[_META_MAX]; size_t meta_size[_META_MAX]; pid_t pid; + uid_t uid; + gid_t gid; bool is_pid1; bool is_journald; } Context; @@ -866,36 +870,11 @@ static int get_process_container_parent_cmdline(pid_t pid, char** cmdline) { return 1; } -static int change_uid_gid(const Context *context) { - uid_t uid; - gid_t gid; - int r; - - r = parse_uid(context->meta[META_ARGV_UID], &uid); - if (r < 0) - return r; - - if (uid_is_system(uid)) { - const char *user = "systemd-coredump"; - - r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0); - if (r < 0) { - log_warning_errno(r, "Cannot resolve %s user. Proceeding to dump core as root: %m", user); - uid = gid = 0; - } - } else { - r = parse_gid(context->meta[META_ARGV_GID], &gid); - if (r < 0) - return r; - } - - return drop_privileges(uid, gid, 0); -} - static int submit_coredump( const Context *context, struct iovec_wrapper *iovw, - int input_fd) { + int input_fd, + int mntns_fd) { _cleanup_(json_variant_unrefp) JsonVariant *json_metadata = NULL; _cleanup_close_ int coredump_fd = -1, coredump_node_fd = -1; @@ -938,15 +917,6 @@ static int submit_coredump( /* Vacuum again, but exclude the coredump we just created */ (void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use); - /* Now, let's drop privileges to become the user who owns the segfaulted process - * and allocate the coredump memory under the user's uid. This also ensures that - * the credentials journald will see are the ones of the coredumping user, thus - * making sure the user gets access to the core dump. Let's also get rid of all - * capabilities, if we run as root, we won't need them anymore. */ - r = change_uid_gid(context); - if (r < 0) - return log_error_errno(r, "Failed to drop privileges: %m"); - /* Try to get a stack trace if we can */ if (coredump_size > arg_process_size_max) log_debug("Not generating stack trace: core size %"PRIu64" is greater " @@ -956,12 +926,23 @@ static int submit_coredump( bool skip = startswith(context->meta[META_COMM], "systemd-coredum"); /* COMM is 16 bytes usually */ (void) parse_elf_object(coredump_fd, + mntns_fd, + context->uid, + context->gid, context->meta[META_EXE], /* fork_disable_dump= */ skip, /* avoid loops */ &stacktrace, &json_metadata); } + /* Now, let's drop privileges to become the user who owns the segfaulted process. This also ensures + * that the credentials journald will see are the ones of the coredumping user, thus making sure + * the user gets access to the core dump. Let's also get rid of all capabilities, if we run as root, + * we won't need them anymore. */ + r = drop_privileges(context->uid, context->gid, 0); + if (r < 0) + return log_error_errno(r, "Failed to drop privileges: %m"); + log: core_message = strjoina("Process ", context->meta[META_ARGV_PID], " (", context->meta[META_COMM], ") of user ", @@ -1094,6 +1075,15 @@ static int save_context(Context *context, const struct iovec_wrapper *iovw) { if (r < 0) return log_error_errno(r, "Failed to parse PID \"%s\": %m", context->meta[META_ARGV_PID]); + r = parse_uid(context->meta[META_ARGV_UID], &context->uid); + if (r < 0) + return log_error_errno(r, "Failed to parse UID \"%s\": %m", context->meta[META_ARGV_UID]); + + r = parse_gid(context->meta[META_ARGV_GID], &context->gid); + if (r < 0) + return log_error_errno(r, "Failed to parse GID \"%s\": %m", context->meta[META_ARGV_GID]); + + unit = context->meta[META_UNIT]; context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE); context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE); @@ -1102,11 +1092,11 @@ static int save_context(Context *context, const struct iovec_wrapper *iovw) { } static int process_socket(int fd) { - _cleanup_close_ int input_fd = -1; + _cleanup_close_ int input_fd = -EBADF, mntns_fd = -EBADF; Context context = {}; struct iovec_wrapper iovw = {}; struct iovec iovec; - int r; + int iterations = 0, r; assert(fd >= 0); @@ -1146,23 +1136,39 @@ static int process_socket(int fd) { goto finish; } - /* The final zero-length datagram carries the file descriptor and tells us + /* The final zero-length datagram carries the file descriptors and tells us * that we're done. */ if (n == 0) { struct cmsghdr *found; free(iovec.iov_base); - found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int))); - if (!found) { - cmsg_close_all(&mh); - r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG), - "Coredump file descriptor missing."); - goto finish; + found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int) * 2)); + if (found) { + int fds[2] = { -EBADF, -EBADF }; + + memcpy(fds, CMSG_DATA(found), sizeof(int) * 2); + + assert(mntns_fd < 0); + + /* Maybe we already got coredump FD in previous iteration? */ + safe_close(input_fd); + + input_fd = fds[0]; + mntns_fd = fds[1]; + + /* We have all FDs we need let's take a shortcut here. */ + break; + } else { + found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int))); + if (found) + input_fd = *CMSG_DATA(found); } - assert(input_fd < 0); - input_fd = *(int*) CMSG_DATA(found); + /* This is the first message that carries file descriptors, maybe there will be one more that actually contains array of descriptors. */ + if (iterations++ == 0) + continue; + break; } else cmsg_close_all(&mh); @@ -1177,7 +1183,11 @@ static int process_socket(int fd) { } /* Make sure we got all data we really need */ - assert(input_fd >= 0); + if (input_fd < 0) { + r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG), + "Coredump file descriptor missing."); + goto finish; + } r = save_context(&context, &iovw); if (r < 0) @@ -1192,16 +1202,16 @@ static int process_socket(int fd) { goto finish; } - r = submit_coredump(&context, &iovw, input_fd); + r = submit_coredump(&context, &iovw, input_fd, mntns_fd); finish: iovw_free_contents(&iovw, true); return r; } -static int send_iovec(const struct iovec_wrapper *iovw, int input_fd) { - _cleanup_close_ int fd = -1; - int r; +static int send_iovec(const struct iovec_wrapper *iovw, int input_fd, int mntns_fd) { + _cleanup_close_ int fd = -EBADF; + int r; assert(iovw); assert(input_fd >= 0); @@ -1256,6 +1266,12 @@ static int send_iovec(const struct iovec_wrapper *iovw, int input_fd) { if (r < 0) return log_error_errno(r, "Failed to send coredump fd: %m"); + if (mntns_fd >= 0) { + r = send_many_fds(fd, (int[]) { input_fd, mntns_fd }, 2, 0); + if (r < 0) + return log_error_errno(r, "Failed to send coredump fds: %m"); + } + return 0; } @@ -1428,7 +1444,7 @@ static int gather_pid_metadata(struct iovec_wrapper *iovw, Context *context) { static int process_kernel(int argc, char* argv[]) { Context context = {}; struct iovec_wrapper *iovw; - int r; + int r, mntns_fd = -EBADF; /* When we're invoked by the kernel, stdout/stderr are closed which is dangerous because the fds * could get reallocated. To avoid hard to debug issues, let's instead bind stdout/stderr to @@ -1462,6 +1478,33 @@ static int process_kernel(int argc, char* argv[]) { log_open(); } + r = in_same_namespace(getpid_cached(), context.pid, NAMESPACE_PID); + if (r < 0) + log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m"); + + if (r == 0 && getenv_bool("SYSTEMD_COREDUMP_ALLOW_NAMESPACE_CHANGE") > 0) { + int fd; + + r = namespace_open(context.pid, NULL, &fd, NULL, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to open mntns of crashing process: %m"); + + mntns_fd = fd; + } else { + /* Crashing process is not running in the container or changing namespace is disabled, but we + still need to send mount namespace fd along side coredump fd so let's just open our own + mount namespace. Entering it will be NOP but that is OK. */ + int fd; + + r = namespace_open(getpid_cached(), NULL, &fd, NULL, NULL, NULL); + if (r < 0) + return log_error_errno(r, "Failed to open our mntns: %m"); + + mntns_fd = fd; + } + + assert(mntns_fd >= 0 && fd_is_ns(mntns_fd, CLONE_NEWNS) > 0); + /* If this is PID 1 disable coredump collection, we'll unlikely be able to process * it later on. * @@ -1474,9 +1517,9 @@ static int process_kernel(int argc, char* argv[]) { } if (context.is_journald || context.is_pid1) - r = submit_coredump(&context, iovw, STDIN_FILENO); + r = submit_coredump(&context, iovw, STDIN_FILENO, mntns_fd); else - r = send_iovec(iovw, STDIN_FILENO); + r = send_iovec(iovw, STDIN_FILENO, mntns_fd); finish: iovw = iovw_free_free(iovw); diff --git a/src/shared/elf-util.c b/src/shared/elf-util.c index bde5013b92..68f48ddd98 100644 --- a/src/shared/elf-util.c +++ b/src/shared/elf-util.c @@ -12,6 +12,7 @@ #include #include "alloc-util.h" +#include "capability-util.h" #include "dlfcn-util.h" #include "elf-util.h" #include "errno-util.h" @@ -21,9 +22,12 @@ #include "hexdecoct.h" #include "io-util.h" #include "macro.h" +#include "namespace-util.h" #include "process-util.h" #include "rlimit-util.h" #include "string-util.h" +#include "uid-alloc-range.h" +#include "user-util.h" #include "util.h" #define FRAMES_MAX 64 @@ -752,11 +756,27 @@ static int parse_elf(int fd, const char *executable, char **ret, JsonVariant **r return 0; } -int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata) { +static int core_change_uid_gid(uid_t uid, gid_t gid) { + uid_t u = uid; + gid_t g = gid; + int r; + + if (uid_is_system(u)) { + const char *user = "systemd-coredump"; + + r = get_user_creds(&user, &u, &g, NULL, NULL, 0); + if (r < 0) + log_warning_errno(r, "Cannot resolve %s user, ignoring: %m", user); + } + + return drop_privileges(u, g, 0); +} + +int parse_elf_object(int fd, int mntns_fd, uid_t uid, gid_t gid, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata) { _cleanup_close_pair_ int error_pipe[2] = { -1, -1 }, return_pipe[2] = { -1, -1 }, json_pipe[2] = { -1, -1 }; _cleanup_(json_variant_unrefp) JsonVariant *package_metadata = NULL; _cleanup_free_ char *buf = NULL; - int r; + int flags, r; assert(fd >= 0); @@ -784,6 +804,10 @@ int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, cha return r; } + flags = FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_USERNS|FORK_WAIT|FORK_REOPEN_LOG; + if (mntns_fd >= 0) + flags &= ~(FORK_CLOSE_ALL_FDS|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_USERNS); + /* Parsing possibly malformed data is crash-happy, so fork. In case we crash, * the core file will not be lost, and the messages will still be attached to * the journal. Reading the elf object might be slow, but it still has an upper @@ -793,7 +817,7 @@ int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, cha r = safe_fork_full("(sd-parse-elf)", (int[]){ fd, error_pipe[1], return_pipe[1], json_pipe[1] }, 4, - FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_USERNS|FORK_WAIT|FORK_REOPEN_LOG, + flags, NULL); if (r < 0) { if (r == -EPROTO) { /* We should have the errno from the child, but don't clobber original error */ @@ -811,6 +835,23 @@ int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, cha return r; } if (r == 0) { + if (mntns_fd >= 0) { + r = namespace_enter(/* pidns_fd = */ -EBADF, + mntns_fd, + /* netns_fd = */ -EBADF, + /* userns_fd = */ -EBADF, + /* root_fd = */ -EBADF); + if (r < 0) + log_notice_errno(r, "Failed to enter mount namespace of crashing process, ignoring: %m"); + } + + if (uid != UID_NOBODY && gid != GID_NOBODY) { + r = core_change_uid_gid(uid, gid); + if (r < 0) { + log_notice_errno(r, "Failed to drop privileges, ignoring: %m"); + } + } + /* We want to avoid loops, given this can be called from systemd-coredump */ if (fork_disable_dump) { r = RET_NERRNO(prctl(PR_SET_DUMPABLE, 0)); diff --git a/src/shared/elf-util.h b/src/shared/elf-util.h index b28e64cea6..350464941f 100644 --- a/src/shared/elf-util.h +++ b/src/shared/elf-util.h @@ -10,9 +10,9 @@ int dlopen_elf(void); /* Parse an ELF object in a forked process, so that errors while iterating over * untrusted and potentially malicious data do not propagate to the main caller's process. * If fork_disable_dump, the child process will not dump core if it crashes. */ -int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata); +int parse_elf_object(int fd, int mntns_fd, uid_t uid, gid_t gid, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata); #else -static inline int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata) { +static inline int parse_elf_object(int fd, int mntns_fd, uid_t uid, gid_t gid, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata) { return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "elfutils disabled, parsing ELF objects not supported"); } #endif