Skip to content

Commit

Permalink
cgroup,native: ensure we start the cleaner before creating cgroup
Browse files Browse the repository at this point in the history
Some callers of bst would send it a SIGKILL as soon as the underlying
operation would get canceled. The problem is that it was possible to
race the cgroup initialization code of the native driver, such that the
SIGKILL would be received after mkdirat of the cgroup directory, but
before the cgroup cleaner has any chance at starting.

This commit fixes this issue by reordering the operations. The cgroup
cleaner is now started before mkdirat, and waits on a blocked pipe read
until the parent process (which is the outer helper) dies. This ensures
that the cleaner is started first and foremost, and that it waits until
the cgroup has been initialized by the helper.
  • Loading branch information
Snaipe committed Feb 6, 2024
1 parent 8667710 commit 7469152
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 22 deletions.
87 changes: 69 additions & 18 deletions cgroup.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <ftw.h>
#include <libgen.h>
Expand Down Expand Up @@ -123,19 +124,43 @@ static int rm_cgroup(const char *fpath, const struct stat *sb, int tflag, struct
/* If bst has entered a cgroup this function will epoll the cgroup.events file
to detect when all pids have exited the cgroup ("populated 0"). The cgroup is
destroyed when this condition is met. */
static void run_cleaner_child(int cgroupfd, int parentfd, const char *name)
static void run_cleaner_child(int lock, int parentfd, const char *name)
{
/* Wait for the parent to die before proceeding */
int ok;
switch (read(lock, &ok, sizeof (ok))) {
case -1:
warn("run_cgroup_child: read on lock");
goto lastDitchEffort;
case 0:
break;
}

int cgroupfd = openat(parentfd, name, O_RDONLY | O_DIRECTORY, 0);
if (cgroupfd == -1) {
switch (errno) {
case ENOENT:
/* Parent died before it made the cgroup; nothing to do */
return;
default:
warn("run_cgroup_child: open %s", name);
goto lastDitchEffort;
}
}

char fdpath[PATH_MAX];
makepath_r(fdpath, "/proc/self/fd/%d", cgroupfd);

char cgroup_path[PATH_MAX];
if (readlink(fdpath, cgroup_path, sizeof (cgroup_path)) == -1) {
err(1, "cgroup_run_cleaner: readlink");
warn("run_cgroup_child: readlink");
goto lastDitchEffort;
}

int eventfd = openat(cgroupfd, "cgroup.events", 0);
if (eventfd == -1) {
err(1, "unable to open cgroup.events");
warn("run_cgroup_child: open cgroup.events");
goto recursiveClean;
}

struct epoll_event event = {
Expand All @@ -144,11 +169,13 @@ static void run_cleaner_child(int cgroupfd, int parentfd, const char *name)

int epollfd = epoll_create1(0);
if (epollfd == -1) {
err(1, "epoll_create1");
warn("run_cgroup_child: epoll_create1");
goto recursiveClean;
}

if (epoll_ctl(epollfd, EPOLL_CTL_ADD, eventfd, &event) == -1) {
err(1, "epoll_ctl_add cgroupfd");
warn("run_cgroup_child: epoll_ctl_add cgroupfd");
goto recursiveClean;
}

/* The first event is the initial state of the file; skip it, because
Expand All @@ -157,52 +184,70 @@ static void run_cleaner_child(int cgroupfd, int parentfd, const char *name)

FILE *eventsfp = fdopen(eventfd, "r");
if (eventsfp == NULL) {
err(1, "unable to open file pointer to cgroup.events");
warn("run_cgroup_child: fdopen cgroup.events");
goto recursiveClean;
}

char populated[BUFSIZ];
for (;;) {
int ready = epoll_wait(epollfd, &event, 1, -1);
if (ready == -1) {
err(1, "epoll_wait cgroup.events");
warn("run_cgroup_child: epoll_wait cgroup.events");
goto recursiveClean;
}

rewind(eventsfp);

/* The order of elements in cgroup.events is not necessarily specified. */
while (fgets(populated, BUFSIZ, eventsfp) != NULL) {
if (strnlen(populated, sizeof(populated)) == sizeof(populated)) {
err(1, "exceeded cgroup.events line read buffer");
warn("run_cgroup_child: exceeded cgroup.events line read buffer");
goto recursiveClean;
}
if (strncmp(populated, "populated 0", 11) == 0) {
nftw(cgroup_path, rm_cgroup, 128, 0);

/* Let the process exit; no need to clean up fds */
return;
goto recursiveClean;
}
}
}

/* Let the process exit; no need to clean up fds. We don't need to
set any exit code since no parent process cares about them. */

recursiveClean:
nftw(cgroup_path, rm_cgroup, 128, 0);
return;

lastDitchEffort:
if (unlinkat(parentfd, name, AT_REMOVEDIR) == -1) {
warn("run_cgroup_child: unlinkat");
}
return;
}

void cgroup_run_cleaner(int cgroupfd, int parentfd, const char *name)
void cgroup_start_cleaner(int parentfd, const char *name)
{
int fds[2];
if (pipe2(fds, O_CLOEXEC) == -1) {
err(1, "cgroup_start_cleaner: pipe2");
}

pid_t pid = fork();
if (pid == -1) {
err(1, "cgroup_run_cleaner: fork");
err(1, "cgroup_start_cleaner: fork");
}

/* This process is intentionally left to leak as the bst root process must have exited
and thus been removed from bst's cgroup.procs for the cgroup hierarchy to be removed */
and thus been removed from bst's cgroup.procs for the cgroup hierarchy to be removed */
if (pid == 0) {
/* Create a new session in case current group leader is killed */
if (setsid() == -1) {
err(1, "unable to create new session leader for cgroup cleanup process");
err(1, "cgroup_start_cleaner: setsid");
}

/* Make sure all file descriptors except for the ones we're actually using
get closed. This avoids keeping around file descriptors on which
the parent process might be waiting on. */
rebind_fds_and_close_rest(3, &cgroupfd, &parentfd, NULL);
rebind_fds_and_close_rest(3, &fds[0], &parentfd, NULL);

/* From now on, use syslog to report error messages. This is necessary
since the parent bst process might be gone by the time there's an
Expand All @@ -211,9 +256,15 @@ void cgroup_run_cleaner(int cgroupfd, int parentfd, const char *name)
openlog("bst", LOG_CONS | LOG_PID, LOG_USER);
err_flags |= ERR_USE_SYSLOG;

run_cleaner_child(cgroupfd, parentfd, name);
run_cleaner_child(fds[0], parentfd, name);
_exit(0);
}

close(fds[0]);

/* Deliberately leak fds[1]. This _is_ important. It will get closed
once this process dies, releasing the read(2) lock of the cgroup
cleaner. */
}

void cgroup_enable_controllers(int cgroupfd)
Expand Down
2 changes: 1 addition & 1 deletion cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@ bool cgroup_current_path(char *path);
int cgroup_join(const char *parent, const char *name);
bool cgroup_read_current(char *path);
void cgroup_enable_controllers(int cgroupfd);
void cgroup_run_cleaner(int cgroupfd, int parentfd, const char *name);
void cgroup_start_cleaner(int parentfd, const char *name);

#endif /* !CGROUP_H_ */
6 changes: 3 additions & 3 deletions cgroup_native.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ static int cgroup_native_join_cgroup(const char *parent, const char *name)
err(1, "cgroup_native_join_cgroup: open %s", parent);
}

/* Start cleaner daemon; it will remove the cgroup once this process dies. */
cgroup_start_cleaner(parentfd, name);

if (mkdirat(parentfd, name, 0777) == -1) {
err(1, "cgroup_native_join_cgroup: mkdir %s under %s", name, parent);
}
Expand All @@ -71,9 +74,6 @@ static int cgroup_native_join_cgroup(const char *parent, const char *name)
goto unlink;
}

/* Start cleaner daemon; it will remove the cgroup once this process dies. */
cgroup_run_cleaner(cgroupfd, parentfd, name);

if (write(procs, "0", 1) == (ssize_t)-1) {
warn("cgroup_native_join_cgroup: write cgroup.procs");
goto unlink;
Expand Down

0 comments on commit 7469152

Please sign in to comment.