-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathenter.c
370 lines (316 loc) · 10.3 KB
/
enter.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
/* Copyright © 2020 Arista Networks, Inc. All rights reserved.
*
* Use of this source code is governed by the MIT license that can be found
* in the LICENSE file.
*/
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <inttypes.h>
#include <limits.h>
#include <sched.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/capability.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include "enter.h"
#include "mount.h"
#include "setarch.h"
#include "userns.h"
#define lengthof(Arr) (sizeof (Arr) / sizeof (*Arr))
struct cloneflag {
const char *name;
int flag;
};
enum {
/* To ensure backward and forward compatibility for the ability of
unsharing the maximum of namespaces, we re-define these constants. */
BST_CLONE_NEWNET = 0x40000000,
BST_CLONE_NEWUTS = 0x04000000,
BST_CLONE_NEWCGROUP = 0x02000000,
BST_CLONE_NEWNS = 0x00020000,
BST_CLONE_NEWPID = 0x20000000,
BST_CLONE_NEWUSER = 0x10000000,
BST_CLONE_NEWIPC = 0x08000000,
BST_CLONE_NEWTIME = 0x00000080,
ALL_NAMESPACES = 0
| BST_CLONE_NEWCGROUP
| BST_CLONE_NEWIPC
| BST_CLONE_NEWNS
| BST_CLONE_NEWNET
| BST_CLONE_NEWPID
| BST_CLONE_NEWUSER
| BST_CLONE_NEWUTS
| BST_CLONE_NEWTIME
,
};
static int opts_to_unshareflags(const struct entry_settings *opts)
{
static struct cloneflag flags[] = {
{ "cgroup", BST_CLONE_NEWCGROUP },
{ "ipc", BST_CLONE_NEWIPC },
{ "mount", BST_CLONE_NEWNS },
{ "network", BST_CLONE_NEWNET },
{ "pid", BST_CLONE_NEWPID },
{ "time", BST_CLONE_NEWTIME },
{ "user", BST_CLONE_NEWUSER },
{ "uts", BST_CLONE_NEWUTS },
{ "all", ALL_NAMESPACES },
};
int unshareflags = ALL_NAMESPACES;
for (const char *const *s = opts->shares; s < opts->shares + opts->nshares; ++s) {
struct cloneflag *f;
for (f = flags; f < flags + lengthof(flags); ++f) {
if (strcmp(f->name, *s) == 0) {
goto found;
}
}
fprintf(stderr, "namespace `%s` does not exist.\n", *s);
fprintf(stderr, "valid namespaces are: ");
for (f = flags; f < flags + lengthof(flags) - 1; ++f) {
fprintf(stderr, "%s, ", f->name);
}
fprintf(stderr, "%s.\n", f->name);
exit(1);
found:
unshareflags &= ~f->flag;
}
return unshareflags;
}
int enter(const struct entry_settings *opts)
{
const char *root = opts->root ? opts->root : "/";
char resolved_root[PATH_MAX];
if (realpath(root, resolved_root) == NULL) {
err(1, "realpath(\"%s\")", root);
}
root = resolved_root;
int unshareflags = opts_to_unshareflags(opts);
struct userns_helper userns_helper;
if (unshareflags & BST_CLONE_NEWUSER) {
userns_helper = userns_helper_spawn();
}
int timens_offsets = -1;
if (unshareflags & BST_CLONE_NEWTIME) {
timens_offsets = open("/proc/self/timens_offsets", O_WRONLY);
if (timens_offsets == -1) {
if (errno != ENOENT) {
err(1, "open(\"/proc/self/timens_offsets\")");
}
/* The kernel evidently don't support time namespaces yet. No need
to try below. */
unshareflags &= ~BST_CLONE_NEWTIME;
}
}
/* Drop all privileges, or none if we're real uid 0. */
if (setuid(getuid()) == -1) {
err(1, "setuid");
}
/* Unshare all relevant namespaces. It's hard to check in advance which
namespaces are supported, so we unshare them one by one in order. */
int unshareables[] = {
/* User namespace must be unshared first and foremost. */
BST_CLONE_NEWUSER,
BST_CLONE_NEWNS,
BST_CLONE_NEWUTS,
BST_CLONE_NEWPID,
BST_CLONE_NEWNET,
BST_CLONE_NEWIPC,
BST_CLONE_NEWCGROUP,
BST_CLONE_NEWTIME,
0,
};
for (int *flag = &unshareables[0]; *flag != 0; ++flag) {
if (!(unshareflags & *flag)) {
continue;
}
int rc = unshare(*flag);
if (rc == -1 && errno == EINVAL) {
/* We realized that the namespace isn't supported -- remove it
from the unshare set. */
unshareflags &= ~*flag;
continue;
}
if (rc == -1) {
err(1, "unshare");
}
}
/* Just unsharing the mount namespace is not sufficient -- if we don't make
every mount entry private, any change we make will be applied to the
parent mount namespace if it happens to have MS_SHARED propagation. We
don't like coin flips. */
if (unshareflags & BST_CLONE_NEWNS && mount("none", "/", "", MS_REC | MS_PRIVATE, "") == -1) {
err(1, "could not make / private: mount");
}
if (opts->arch && opts->arch[0] != 0) {
setarch(opts->arch);
}
if (unshareflags & BST_CLONE_NEWTIME) {
init_clocks(timens_offsets, opts->clockspecs, lengthof(opts->clockspecs));
}
if (timens_offsets != -1 && close(timens_offsets) == -1) {
err(1, "close(timens_offsets)");
}
/* You can't "really" unshare the PID namespace of a running process
without forking, since for process hierarchy reasons only the next
child process enters the namespace as init and subsequent calls to
clone(2) fails.
Since we set-up some things like the parent death signal, it's just
cleaner and easier to always fork, regardless of unsharing the PID
namespace. */
pid_t pid = fork();
if (pid == -1) {
err(1, "fork");
}
if (pid) {
int status;
if (unshareflags & BST_CLONE_NEWUSER) {
userns_helper_sendpid(&userns_helper, pid);
userns_helper_close(&userns_helper);
}
if (waitpid(pid, &status, 0) == -1) {
err(1, "waitpid");
}
if (WIFEXITED(status)) {
return WEXITSTATUS(status);
}
return WTERMSIG(status) | 1 << 7;
}
/* We can't afford to leave the child alive in the background if bst
dies from uncatcheable signals. Or at least, we could, but this makes us
leaky by default which isn't great, and the obvious workaround to
daemonize the process tree is to just nohup bst. */
if (prctl(PR_SET_PDEATHSIG, SIGKILL) == -1) {
err(1, "prctl(PR_SET_PDEATHSIG)");
}
if (unshareflags & BST_CLONE_NEWUSER) {
userns_helper_wait(&userns_helper);
userns_helper_close(&userns_helper);
}
/* Set the host and domain names only when in an UTS namespace. */
if ((opts->hostname || opts->domainname) && !(unshareflags & BST_CLONE_NEWUTS)) {
errx(1, "attempted to set host or domain names on the host UTS namespace.");
}
const char *hostname = opts->hostname;
if (!hostname && (unshareflags & BST_CLONE_NEWUTS)) {
hostname = "localhost";
}
if (hostname && sethostname(hostname, strlen(hostname)) == -1) {
err(1, "sethostname");
}
const char *domainname = opts->domainname;
if (!domainname && (unshareflags & BST_CLONE_NEWUTS)) {
domainname = "localdomain";
}
if (domainname && setdomainname(domainname, strlen(domainname)) == -1) {
err(1, "setdomainname");
}
/* The setuid will drop privileges. We ask to keep permitted capabilities
in order to restore them for the rest of the program. */
prctl(PR_SET_KEEPCAPS, 1);
if (opts->ngroups != 0 && setgroups(opts->ngroups, opts->groups) == -1) {
err(1, "setgroups");
}
if (setregid(opts->gid, opts->gid) == -1) {
err(1, "setregid");
}
if (setreuid(opts->uid, opts->uid) == -1) {
err(1, "setreuid");
}
/* give ourselves back CAP_SYS_CHROOT if we need to chroot, and
CAP_SYS_ADMIN if we want to mount. */
cap_value_t cap_list[2];
size_t ncaps = 0;
if (strcmp(root, "/") != 0) {
cap_list[ncaps++] = CAP_SYS_CHROOT;
}
if (opts->nmounts > 0 || opts->nmutables > 0) {
cap_list[ncaps++] = CAP_SYS_ADMIN;
}
cap_t caps = cap_get_proc();
if (caps == NULL) {
err(1, "cap_get_proc");
}
if (ncaps > 0 && cap_set_flag(caps, CAP_EFFECTIVE, ncaps, cap_list, CAP_SET) == -1) {
err(1, "cap_set_flag");
}
if (cap_set_proc(caps) == -1) {
err(1, "caps_set_proc");
}
if (cap_free(caps) == -1) {
err(1, "cap_free");
}
/* We have a special case for pivot_root: the syscall wants the
new root to be a mount point, so we indulge. */
if (unshareflags & BST_CLONE_NEWNS && strcmp(root, "/") != 0) {
if (mount(root, root, "none", MS_BIND|MS_REC, "") == -1) {
err(1, "mount(\"/\", \"/\", MS_BIND|MS_REC)");
}
}
/* We have to do this after setuid/setgid/setgroups since mounting
tmpfses in user namespaces forces the options uid=<real-uid> and
gid=<real-gid>. */
if (opts->nmounts > 0 || opts->nmutables > 0) {
/* Don't shoot ourselves in the foot. It's technically possible to
let users mount things in the host mount namespace but in practice
it's a terrible idea due to the sheer amount of things that can go
wrong, like "what do I do if one of the mounts failed but the previous
ones didn't?", or "how do I clean up things that I've (re)mounted?". */
if (!(unshareflags & BST_CLONE_NEWNS)) {
errx(1, "attempted to mount things on the host mount namespace.");
}
mount_entries(root, opts->mounts, opts->nmounts);
mount_mutables(root, opts->mutables, opts->nmutables);
}
/* Don't chroot if root is "/". This is a better default since it
allows us to run commands that unshare nothing unprivileged. */
if (strcmp(root, "/") != 0) {
/* The chroot-ing logic is a bit delicate. If we don't have a mount
namespace, we just use chroot. This has its limitations though,
namely, in that situation, you won't be able to nest user
namespaces, and you'll instead get baffling EPERMs when calling
unshare(CLONE_NEWUSER).
In order to remediate that, we pivot the root. Since this actively
changes the world for every running process, we *insist* that this
must be done in a mount namespace, because otherwise pivot_root
will burn your house, invoke dragons, and eat your children. */
if (!(unshareflags & BST_CLONE_NEWNS)) {
if (chroot(root) == -1) {
err(1, "chroot");
}
} else {
if (chdir(root) == -1) {
err(1, "pivot_root: pre chdir");
}
/* Pivot the root to `root` (new_root) and mount the old root
(old_dir) on top of it. Then, unmount "." to get rid of the
old root.
The pivot_root manpage documents this approach: old_dir is
always layered on top of new_root, which means that we can
use this technique to avoid creating a mount point for the
old root under new_root, or assuming anything about the layout
of the new root. */
if (syscall(SYS_pivot_root, ".", ".") == -1) {
err(1, "pivot_root");
}
if (umount2(".", MNT_DETACH)) {
err(1, "pivot_root: umount2");
}
if (chdir("/") == -1) {
err(1, "pivot_root: post chdir");
}
}
}
if (chdir(opts->workdir) == -1) {
err(1, "chdir");
}
execvpe(opts->pathname, opts->argv, opts->envp);
err(1, "execvpe");
}