]>
Commit | Line | Data |
---|---|---|
e0732705 CS |
1 | /* |
2 | * lxc: linux Container library | |
3 | * | |
4 | * (C) Copyright IBM Corp. 2007, 2008 | |
5 | * | |
6 | * Authors: | |
9afe19d6 | 7 | * Daniel Lezcano <daniel.lezcano at free.fr> |
e0732705 CS |
8 | * |
9 | * This library is free software; you can redistribute it and/or | |
10 | * modify it under the terms of the GNU Lesser General Public | |
11 | * License as published by the Free Software Foundation; either | |
12 | * version 2.1 of the License, or (at your option) any later version. | |
13 | * | |
14 | * This library is distributed in the hope that it will be useful, | |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
17 | * Lesser General Public License for more details. | |
18 | * | |
19 | * You should have received a copy of the GNU Lesser General Public | |
20 | * License along with this library; if not, write to the Free Software | |
21 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
22 | */ | |
23 | ||
24 | #define _GNU_SOURCE | |
25 | #include <unistd.h> | |
26 | #include <stdio.h> | |
27 | #include <string.h> | |
28 | #include <stdlib.h> | |
29 | #include <errno.h> | |
30 | #include <fcntl.h> | |
31 | #include <sys/param.h> | |
32 | #include <sys/prctl.h> | |
7a0b0b56 | 33 | #include <sys/mount.h> |
5ec27989 | 34 | #include <sys/socket.h> |
1ba0013f | 35 | #include <sys/syscall.h> |
905022f7 | 36 | #include <sys/wait.h> |
910bb4fa | 37 | #include <linux/unistd.h> |
905022f7 | 38 | #include <pwd.h> |
e0732705 CS |
39 | |
40 | #if !HAVE_DECL_PR_CAPBSET_DROP | |
41 | #define PR_CAPBSET_DROP 24 | |
42 | #endif | |
43 | ||
44 | #include "namespace.h" | |
45 | #include "log.h" | |
46 | #include "attach.h" | |
47 | #include "caps.h" | |
e0732705 | 48 | #include "config.h" |
9958532b | 49 | #include "apparmor.h" |
6a44839f | 50 | #include "utils.h" |
9c4693b8 CS |
51 | #include "commands.h" |
52 | #include "cgroup.h" | |
53 | ||
54 | #if HAVE_SYS_PERSONALITY_H | |
55 | #include <sys/personality.h> | |
56 | #endif | |
e0732705 | 57 | |
a3da2f3b SG |
58 | #ifndef SOCK_CLOEXEC |
59 | # define SOCK_CLOEXEC 02000000 | |
60 | #endif | |
61 | ||
e0732705 CS |
62 | lxc_log_define(lxc_attach, lxc); |
63 | ||
e0732705 CS |
64 | struct lxc_proc_context_info *lxc_proc_get_context_info(pid_t pid) |
65 | { | |
66 | struct lxc_proc_context_info *info = calloc(1, sizeof(*info)); | |
67 | FILE *proc_file; | |
68 | char proc_fn[MAXPATHLEN]; | |
460a1cf0 | 69 | char *line = NULL; |
e0732705 | 70 | size_t line_bufsz = 0; |
460a1cf0 | 71 | int ret, found; |
e0732705 CS |
72 | |
73 | if (!info) { | |
74 | SYSERROR("Could not allocate memory."); | |
75 | return NULL; | |
76 | } | |
77 | ||
78 | /* read capabilities */ | |
79 | snprintf(proc_fn, MAXPATHLEN, "/proc/%d/status", pid); | |
80 | ||
81 | proc_file = fopen(proc_fn, "r"); | |
82 | if (!proc_file) { | |
83 | SYSERROR("Could not open %s", proc_fn); | |
84 | goto out_error; | |
85 | } | |
86 | ||
87 | found = 0; | |
88 | while (getline(&line, &line_bufsz, proc_file) != -1) { | |
89 | ret = sscanf(line, "CapBnd: %llx", &info->capability_mask); | |
90 | if (ret != EOF && ret > 0) { | |
91 | found = 1; | |
92 | break; | |
93 | } | |
94 | } | |
95 | ||
fa9ac567 SH |
96 | if (line) |
97 | free(line); | |
e0732705 CS |
98 | fclose(proc_file); |
99 | ||
100 | if (!found) { | |
101 | SYSERROR("Could not read capability bounding set from %s", proc_fn); | |
102 | errno = ENOENT; | |
103 | goto out_error; | |
104 | } | |
105 | ||
106 | /* read personality */ | |
107 | snprintf(proc_fn, MAXPATHLEN, "/proc/%d/personality", pid); | |
108 | ||
109 | proc_file = fopen(proc_fn, "r"); | |
110 | if (!proc_file) { | |
111 | SYSERROR("Could not open %s", proc_fn); | |
112 | goto out_error; | |
113 | } | |
114 | ||
115 | ret = fscanf(proc_file, "%lx", &info->personality); | |
116 | fclose(proc_file); | |
117 | ||
118 | if (ret == EOF || ret == 0) { | |
119 | SYSERROR("Could not read personality from %s", proc_fn); | |
120 | errno = ENOENT; | |
121 | goto out_error; | |
122 | } | |
9958532b | 123 | info->aa_profile = aa_get_profile(pid); |
e0732705 | 124 | |
e0732705 CS |
125 | return info; |
126 | ||
127 | out_error: | |
460a1cf0 | 128 | free(info); |
e0732705 CS |
129 | return NULL; |
130 | } | |
131 | ||
fc763ab7 | 132 | int lxc_attach_to_ns(pid_t pid, int which) |
99d50954 CS |
133 | { |
134 | char path[MAXPATHLEN]; | |
fc763ab7 CS |
135 | /* according to <http://article.gmane.org/gmane.linux.kernel.containers.lxc.devel/1429>, |
136 | * the file for user namepsaces in /proc/$pid/ns will be called | |
137 | * 'user' once the kernel supports it | |
138 | */ | |
139 | static char *ns[] = { "mnt", "pid", "uts", "ipc", "user", "net" }; | |
140 | static int flags[] = { | |
141 | CLONE_NEWNS, CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, | |
142 | CLONE_NEWUSER, CLONE_NEWNET | |
143 | }; | |
144 | static const int size = sizeof(ns) / sizeof(char *); | |
99d50954 | 145 | int fd[size]; |
fc763ab7 CS |
146 | int i, j, saved_errno; |
147 | ||
99d50954 CS |
148 | |
149 | snprintf(path, MAXPATHLEN, "/proc/%d/ns", pid); | |
150 | if (access(path, X_OK)) { | |
151 | ERROR("Does this kernel version support 'attach' ?"); | |
152 | return -1; | |
153 | } | |
154 | ||
155 | for (i = 0; i < size; i++) { | |
fc763ab7 CS |
156 | /* ignore if we are not supposed to attach to that |
157 | * namespace | |
158 | */ | |
159 | if (which != -1 && !(which & flags[i])) { | |
160 | fd[i] = -1; | |
161 | continue; | |
162 | } | |
163 | ||
99d50954 | 164 | snprintf(path, MAXPATHLEN, "/proc/%d/ns/%s", pid, ns[i]); |
9c4693b8 | 165 | fd[i] = open(path, O_RDONLY | O_CLOEXEC); |
99d50954 | 166 | if (fd[i] < 0) { |
fc763ab7 CS |
167 | saved_errno = errno; |
168 | ||
169 | /* close all already opened file descriptors before | |
170 | * we return an error, so we don't leak them | |
171 | */ | |
172 | for (j = 0; j < i; j++) | |
173 | close(fd[j]); | |
174 | ||
175 | errno = saved_errno; | |
99d50954 CS |
176 | SYSERROR("failed to open '%s'", path); |
177 | return -1; | |
178 | } | |
179 | } | |
180 | ||
181 | for (i = 0; i < size; i++) { | |
fc763ab7 CS |
182 | if (fd[i] >= 0 && setns(fd[i], 0) != 0) { |
183 | saved_errno = errno; | |
184 | ||
185 | for (j = i; j < size; j++) | |
186 | close(fd[j]); | |
187 | ||
188 | errno = saved_errno; | |
99d50954 CS |
189 | SYSERROR("failed to set namespace '%s'", ns[i]); |
190 | return -1; | |
191 | } | |
192 | ||
193 | close(fd[i]); | |
194 | } | |
195 | ||
196 | return 0; | |
197 | } | |
198 | ||
7a0b0b56 CS |
199 | int lxc_attach_remount_sys_proc() |
200 | { | |
201 | int ret; | |
202 | ||
203 | ret = unshare(CLONE_NEWNS); | |
204 | if (ret < 0) { | |
205 | SYSERROR("failed to unshare mount namespace"); | |
206 | return -1; | |
207 | } | |
208 | ||
209 | /* assume /proc is always mounted, so remount it */ | |
210 | ret = umount2("/proc", MNT_DETACH); | |
211 | if (ret < 0) { | |
212 | SYSERROR("failed to unmount /proc"); | |
213 | return -1; | |
214 | } | |
215 | ||
216 | ret = mount("none", "/proc", "proc", 0, NULL); | |
217 | if (ret < 0) { | |
218 | SYSERROR("failed to remount /proc"); | |
219 | return -1; | |
220 | } | |
221 | ||
222 | /* try to umount /sys - if it's not a mount point, | |
223 | * we'll get EINVAL, then we ignore it because it | |
224 | * may not have been mounted in the first place | |
225 | */ | |
226 | ret = umount2("/sys", MNT_DETACH); | |
227 | if (ret < 0 && errno != EINVAL) { | |
228 | SYSERROR("failed to unmount /sys"); | |
229 | return -1; | |
230 | } else if (ret == 0) { | |
231 | /* remount it */ | |
232 | ret = mount("none", "/sys", "sysfs", 0, NULL); | |
233 | if (ret < 0) { | |
234 | SYSERROR("failed to remount /sys"); | |
235 | return -1; | |
236 | } | |
237 | } | |
238 | ||
239 | return 0; | |
240 | } | |
241 | ||
e0732705 CS |
242 | int lxc_attach_drop_privs(struct lxc_proc_context_info *ctx) |
243 | { | |
244 | int last_cap = lxc_caps_last_cap(); | |
245 | int cap; | |
246 | ||
247 | for (cap = 0; cap <= last_cap; cap++) { | |
248 | if (ctx->capability_mask & (1LL << cap)) | |
249 | continue; | |
250 | ||
251 | if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0)) { | |
252 | SYSERROR("failed to remove capability id %d", cap); | |
253 | return -1; | |
254 | } | |
255 | } | |
256 | ||
257 | return 0; | |
258 | } | |
905022f7 | 259 | |
799f96fd | 260 | int lxc_attach_set_environment(enum lxc_attach_env_policy_t policy, char** extra_env, char** extra_keep) |
b3a39ba6 | 261 | { |
799f96fd | 262 | if (policy == LXC_ATTACH_CLEAR_ENV) { |
3d5e9f48 CS |
263 | char **extra_keep_store = NULL; |
264 | char *path_env; | |
265 | size_t n; | |
266 | int path_kept = 0; | |
267 | ||
268 | if (extra_keep) { | |
269 | size_t count, i; | |
270 | ||
271 | for (count = 0; extra_keep[count]; count++); | |
272 | ||
273 | extra_keep_store = calloc(count, sizeof(char *)); | |
274 | if (!extra_keep_store) { | |
275 | SYSERROR("failed to allocate memory for storing current " | |
276 | "environment variable values that will be kept"); | |
277 | return -1; | |
278 | } | |
279 | for (i = 0; i < count; i++) { | |
280 | char *v = getenv(extra_keep[i]); | |
281 | if (v) { | |
282 | extra_keep_store[i] = strdup(v); | |
283 | if (!extra_keep_store[i]) { | |
284 | SYSERROR("failed to allocate memory for storing current " | |
285 | "environment variable values that will be kept"); | |
286 | while (i > 0) | |
287 | free(extra_keep_store[--i]); | |
288 | free(extra_keep_store); | |
289 | return -1; | |
290 | } | |
291 | if (strcmp(extra_keep[i], "PATH") == 0) | |
292 | path_kept = 1; | |
293 | } | |
294 | /* calloc sets entire array to zero, so we don't | |
295 | * need an else */ | |
296 | } | |
297 | } | |
298 | ||
799f96fd CS |
299 | if (clearenv()) { |
300 | SYSERROR("failed to clear environment"); | |
3d5e9f48 CS |
301 | return -1; |
302 | } | |
303 | ||
304 | if (extra_keep_store) { | |
305 | size_t i; | |
306 | for (i = 0; extra_keep[i]; i++) { | |
307 | if (extra_keep_store[i]) | |
308 | setenv(extra_keep[i], extra_keep_store[i], 1); | |
309 | free(extra_keep_store[i]); | |
310 | } | |
311 | free(extra_keep_store); | |
312 | } | |
313 | ||
314 | /* always set a default path; shells and execlp tend | |
315 | * to be fine without it, but there is a disturbing | |
316 | * number of C programs out there that just assume | |
317 | * that getenv("PATH") is never NULL and then die a | |
318 | * painful segfault death. */ | |
319 | if (!path_kept) { | |
320 | n = confstr(_CS_PATH, NULL, 0); | |
321 | path_env = malloc(n); | |
322 | if (path_env) { | |
323 | confstr(_CS_PATH, path_env, n); | |
324 | setenv("PATH", path_env, 1); | |
325 | free(path_env); | |
326 | } | |
327 | /* don't error out, this is just an extra service */ | |
799f96fd | 328 | } |
b3a39ba6 DW |
329 | } |
330 | ||
331 | if (putenv("container=lxc")) { | |
332 | SYSERROR("failed to set environment variable"); | |
333 | return -1; | |
334 | } | |
335 | ||
3d5e9f48 CS |
336 | /* set extra environment variables */ |
337 | if (extra_env) { | |
338 | for (; *extra_env; extra_env++) { | |
339 | /* duplicate the string, just to be on | |
340 | * the safe side, because putenv does not | |
341 | * do it for us */ | |
342 | char *p = strdup(*extra_env); | |
343 | /* we just assume the user knows what they | |
344 | * are doing, so we don't do any checks */ | |
345 | if (!p) { | |
346 | SYSERROR("failed to allocate memory for additional environment " | |
347 | "variables"); | |
348 | return -1; | |
349 | } | |
350 | putenv(p); | |
351 | } | |
352 | } | |
353 | ||
b3a39ba6 DW |
354 | return 0; |
355 | } | |
356 | ||
905022f7 CS |
/*
 * Parse one passwd(5) line (name:password:uid:gid:gecos:dir:shell) in place
 * and return a strdup()ed copy of the shell field if the line describes
 * `uid`. Returns NULL if the line is malformed, belongs to another user, has
 * an empty shell field, or memory is exhausted. `line` is modified.
 */
static char *attach_passwd_line_shell(char *line, uid_t uid)
{
	enum { PASSWD_FIELDS = 7, FIELD_UID = 2, FIELD_SHELL = 6 };
	char *fields[PASSWD_FIELDS];
	char *cursor = line;
	char *endptr = NULL;
	size_t len;
	long value;
	int n;

	/* trim line on the right hand side */
	len = strlen(line);
	while (len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r'))
		line[--len] = '\0';

	/* split on every ':' individually; unlike strtok_r() this keeps
	 * empty fields (e.g. an empty gecos) in their proper position
	 */
	for (n = 0; n < PASSWD_FIELDS; n++) {
		fields[n] = cursor;
		cursor = strchr(cursor, ':');
		if (!cursor) {
			n++;
			break;
		}
		*cursor++ = '\0';
	}

	/* too few fields, or a separator after the seventh field */
	if (n != PASSWD_FIELDS || cursor)
		return NULL;

	value = strtol(fields[FIELD_UID], &endptr, 10);
	if (endptr == fields[FIELD_UID] || *endptr || value == LONG_MIN || value == LONG_MAX)
		return NULL;

	/* dummy sanity check: user id matches */
	if ((uid_t) value != uid)
		return NULL;

	if (fields[FIELD_SHELL][0] == '\0')
		return NULL;

	return strdup(fields[FIELD_SHELL]);
}

/*
 * Look up the login shell of `uid` by running "getent passwd <uid>" in a
 * child process and parsing its output.
 *
 * Returns a malloc()ed string that the caller must free(), or NULL if the
 * lookup failed for any reason (pipe/fork failure, getent missing or exiting
 * non-zero, no matching entry, out of memory).
 */
char *lxc_attach_getpwshell(uid_t uid)
{
	pid_t pid;
	int pipes[2];
	int ret;
	char *result = NULL;

	/* we need to fork off a process that runs the
	 * getent program, and we need to capture its
	 * output, so we use a pipe for that purpose
	 */
	ret = pipe(pipes);
	if (ret < 0)
		return NULL;

	pid = fork();
	if (pid < 0) {
		close(pipes[0]);
		close(pipes[1]);
		return NULL;
	}

	if (pid) {
		/* parent process */
		FILE *pipe_f;
		char *line = NULL;
		size_t line_bufsz = 0;
		int status;

		close(pipes[1]);

		pipe_f = fdopen(pipes[0], "r");
		if (pipe_f) {
			/* keep reading until the pipe doesn't deliver any
			 * more data, but only the first matching line is
			 * used
			 */
			while (getline(&line, &line_bufsz, pipe_f) != -1) {
				if (!result)
					result = attach_passwd_line_shell(line, uid);
			}
			free(line);
			fclose(pipe_f);
		} else {
			/* no stream: drop the read end ourselves; the child
			 * still has to be reaped below
			 */
			close(pipes[0]);
		}

		while (waitpid(pid, &status, 0) < 0) {
			if (errno == EINTR)
				continue;
			free(result);
			return NULL;
		}

		/* some sanity checks: if anything even hinted at going
		 * wrong: we can't be sure we have a valid result, so
		 * we assume we don't
		 */
		if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
			free(result);
			return NULL;
		}

		return result;
	} else {
		/* child process */
		int fd;
		char uid_buf[32];
		char *arguments[] = {
			"getent",
			"passwd",
			uid_buf,
			NULL
		};

		close(pipes[0]);

		/* we want to capture stdout */
		dup2(pipes[1], 1);
		close(pipes[1]);

		/* get rid of stdin/stderr, so we try to associate it
		 * with /dev/null
		 */
		fd = open("/dev/null", O_RDWR);
		if (fd < 0) {
			close(0);
			close(2);
		} else {
			dup2(fd, 0);
			dup2(fd, 2);
			close(fd);
		}

		/* finish argument list */
		ret = snprintf(uid_buf, sizeof(uid_buf), "%ld", (long) uid);
		if (ret <= 0)
			_exit(-1);

		/* try to run getent program */
		(void) execvp("getent", arguments);

		/* _exit rather than exit: this is a forked copy of the
		 * caller, so atexit handlers must not run and inherited stdio
		 * buffers must not be flushed into the pipe
		 */
		_exit(-1);
	}
}
cb3e61fa CS |
510 | |
/*
 * Read the real uid and gid of pid 1 from /proc/1/status.
 *
 * *init_uid and *init_gid are only overwritten for the values that could
 * actually be parsed; if the file cannot be opened or a line is missing,
 * the corresponding argument is left untouched.
 */
void lxc_attach_get_init_uidgid(uid_t* init_uid, gid_t* init_gid)
{
	char status_path[MAXPATHLEN];
	FILE *status_fp;
	char *buf = NULL;
	size_t buf_cap = 0;
	long parsed = -1;
	uid_t found_uid = (uid_t)-1;
	gid_t found_gid = (gid_t)-1;

	/* status file of the init process */
	snprintf(status_path, MAXPATHLEN, "/proc/%d/status", 1);

	status_fp = fopen(status_path, "r");
	if (!status_fp)
		return;

	for (;;) {
		if (getline(&buf, &buf_cap, status_fp) == -1)
			break;

		/* both lines list: real, effective, saved set, fs
		 * and only the first (real) id is of interest
		 */
		if (sscanf(buf, "Uid: %ld", &parsed) > 0)
			found_uid = (uid_t) parsed;
		else if (sscanf(buf, "Gid: %ld", &parsed) > 0)
			found_gid = (gid_t) parsed;

		/* nothing more to learn once both ids are known */
		if (found_uid != (uid_t)-1 && found_gid != (gid_t)-1)
			break;
	}

	fclose(status_fp);
	free(buf);

	/* only override arguments if we found something */
	if (found_uid != (uid_t)-1)
		*init_uid = found_uid;
	if (found_gid != (gid_t)-1)
		*init_gid = found_gid;

	/* TODO: we should also parse supplementary groups and use
	 * setgroups() to set them */
}
9c4693b8 CS |
557 | |
558 | struct attach_clone_payload { | |
559 | int ipc_socket; | |
560 | lxc_attach_options_t* options; | |
561 | struct lxc_proc_context_info* init_ctx; | |
562 | lxc_attach_exec_t exec_function; | |
563 | void* exec_payload; | |
564 | }; | |
565 | ||
566 | static int attach_child_main(void* data); | |
567 | ||
568 | /* help the optimizer along if it doesn't know that exit always exits */ | |
569 | #define rexit(c) do { int __c = (c); exit(__c); return __c; } while(0) | |
570 | ||
571 | /* define default options if no options are supplied by the user */ | |
572 | static lxc_attach_options_t attach_static_default_options = LXC_ATTACH_OPTIONS_DEFAULT; | |
573 | ||
574 | int lxc_attach(const char* name, const char* lxcpath, lxc_attach_exec_t exec_function, void* exec_payload, lxc_attach_options_t* options, pid_t* attached_process) | |
575 | { | |
576 | int ret, status; | |
577 | pid_t init_pid, pid, attached_pid; | |
578 | struct lxc_proc_context_info *init_ctx; | |
579 | char* cwd; | |
580 | char* new_cwd; | |
581 | int ipc_sockets[2]; | |
582 | ||
583 | if (!options) | |
584 | options = &attach_static_default_options; | |
585 | ||
586 | init_pid = lxc_cmd_get_init_pid(name, lxcpath); | |
587 | if (init_pid < 0) { | |
588 | ERROR("failed to get the init pid"); | |
589 | return -1; | |
590 | } | |
591 | ||
592 | init_ctx = lxc_proc_get_context_info(init_pid); | |
593 | if (!init_ctx) { | |
594 | ERROR("failed to get context of the init process, pid = %ld", (long)init_pid); | |
595 | return -1; | |
596 | } | |
597 | ||
598 | cwd = getcwd(NULL, 0); | |
599 | ||
600 | /* determine which namespaces the container was created with | |
601 | * by asking lxc-start, if necessary | |
602 | */ | |
603 | if (options->namespaces == -1) { | |
604 | options->namespaces = lxc_cmd_get_clone_flags(name, lxcpath); | |
605 | /* call failed */ | |
606 | if (options->namespaces == -1) { | |
607 | ERROR("failed to automatically determine the " | |
608 | "namespaces which the container unshared"); | |
609 | free(cwd); | |
610 | free(init_ctx->aa_profile); | |
611 | free(init_ctx); | |
612 | return -1; | |
613 | } | |
614 | } | |
615 | ||
616 | /* create a socket pair for IPC communication; set SOCK_CLOEXEC in order | |
617 | * to make sure we don't irritate other threads that want to fork+exec away | |
618 | * | |
619 | * IMPORTANT: if the initial process is multithreaded and another call | |
620 | * just fork()s away without exec'ing directly after, the socket fd will | |
621 | * exist in the forked process from the other thread and any close() in | |
622 | * our own child process will not really cause the socket to close properly, | |
623 | * potentiall causing the parent to hang. | |
624 | * | |
625 | * For this reason, while IPC is still active, we have to use shutdown() | |
626 | * if the child exits prematurely in order to signal that the socket | |
627 | * is closed and cannot assume that the child exiting will automatically | |
628 | * do that. | |
629 | * | |
630 | * IPC mechanism: (X is receiver) | |
631 | * initial process intermediate attached | |
632 | * X <--- send pid of | |
633 | * attached proc, | |
634 | * then exit | |
635 | * send 0 ------------------------------------> X | |
636 | * [do initialization] | |
637 | * X <------------------------------------ send 1 | |
638 | * [add to cgroup, ...] | |
639 | * send 2 ------------------------------------> X | |
640 | * close socket close socket | |
641 | * run program | |
642 | */ | |
643 | ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); | |
644 | if (ret < 0) { | |
645 | SYSERROR("could not set up required IPC mechanism for attaching"); | |
646 | free(cwd); | |
647 | free(init_ctx->aa_profile); | |
648 | free(init_ctx); | |
649 | return -1; | |
650 | } | |
651 | ||
652 | /* create intermediate subprocess, three reasons: | |
653 | * 1. runs all pthread_atfork handlers and the | |
654 | * child will no longer be threaded | |
655 | * (we can't properly setns() in a threaded process) | |
656 | * 2. we can't setns() in the child itself, since | |
657 | * we want to make sure we are properly attached to | |
658 | * the pidns | |
659 | * 3. also, the initial thread has to put the attached | |
660 | * process into the cgroup, which we can only do if | |
661 | * we didn't already setns() (otherwise, user | |
662 | * namespaces will hate us) | |
663 | */ | |
664 | pid = fork(); | |
665 | ||
666 | if (pid < 0) { | |
667 | SYSERROR("failed to create first subprocess"); | |
668 | free(cwd); | |
669 | free(init_ctx->aa_profile); | |
670 | free(init_ctx); | |
671 | return -1; | |
672 | } | |
673 | ||
674 | if (pid) { | |
675 | pid_t to_cleanup_pid = pid; | |
676 | int expected = 0; | |
677 | ||
678 | /* inital thread, we close the socket that is for the | |
679 | * subprocesses | |
680 | */ | |
681 | close(ipc_sockets[1]); | |
682 | free(cwd); | |
683 | ||
684 | /* get pid from intermediate process */ | |
685 | ret = lxc_read_nointr_expect(ipc_sockets[0], &attached_pid, sizeof(attached_pid), NULL); | |
686 | if (ret <= 0) { | |
687 | if (ret != 0) | |
688 | ERROR("error using IPC to receive pid of attached process"); | |
689 | goto cleanup_error; | |
690 | } | |
691 | ||
692 | /* reap intermediate process */ | |
693 | ret = wait_for_pid(pid); | |
694 | if (ret < 0) | |
695 | goto cleanup_error; | |
696 | ||
697 | /* we will always have to reap the grandchild now */ | |
698 | to_cleanup_pid = attached_pid; | |
699 | ||
700 | /* tell attached process it may start initializing */ | |
701 | status = 0; | |
702 | ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status)); | |
703 | if (ret <= 0) { | |
704 | ERROR("error using IPC to notify attached process for initialization (0)"); | |
705 | goto cleanup_error; | |
706 | } | |
707 | ||
708 | /* wait for the attached process to finish initializing */ | |
709 | expected = 1; | |
710 | ret = lxc_read_nointr_expect(ipc_sockets[0], &status, sizeof(status), &expected); | |
711 | if (ret <= 0) { | |
712 | if (ret != 0) | |
713 | ERROR("error using IPC to receive notification from attached process (1)"); | |
714 | goto cleanup_error; | |
715 | } | |
716 | ||
717 | /* attach to cgroup, if requested */ | |
718 | if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP) { | |
719 | ret = lxc_cgroup_attach(attached_pid, name, lxcpath); | |
720 | if (ret < 0) { | |
721 | ERROR("could not move attached process %ld to cgroup of container", (long)attached_pid); | |
722 | goto cleanup_error; | |
723 | } | |
724 | } | |
725 | ||
726 | /* tell attached process we're done */ | |
727 | status = 2; | |
728 | ret = lxc_write_nointr(ipc_sockets[0], &status, sizeof(status)); | |
729 | if (ret <= 0) { | |
730 | ERROR("error using IPC to notify attached process for initialization (2)"); | |
731 | goto cleanup_error; | |
732 | } | |
733 | ||
734 | /* now shut down communication with child, we're done */ | |
735 | shutdown(ipc_sockets[0], SHUT_RDWR); | |
736 | close(ipc_sockets[0]); | |
737 | free(init_ctx->aa_profile); | |
738 | free(init_ctx); | |
739 | ||
740 | /* we're done, the child process should now execute whatever | |
741 | * it is that the user requested. The parent can now track it | |
742 | * with waitpid() or similar. | |
743 | */ | |
744 | ||
745 | *attached_process = attached_pid; | |
746 | return 0; | |
747 | ||
748 | cleanup_error: | |
749 | /* first shut down the socket, then wait for the pid, | |
750 | * otherwise the pid we're waiting for may never exit | |
751 | */ | |
752 | shutdown(ipc_sockets[0], SHUT_RDWR); | |
753 | close(ipc_sockets[0]); | |
754 | if (to_cleanup_pid) | |
755 | (void) wait_for_pid(to_cleanup_pid); | |
756 | free(init_ctx->aa_profile); | |
757 | free(init_ctx); | |
758 | return -1; | |
759 | } | |
760 | ||
761 | /* first subprocess begins here, we close the socket that is for the | |
762 | * initial thread | |
763 | */ | |
764 | close(ipc_sockets[0]); | |
765 | ||
766 | /* attach now, create another subprocess later, since pid namespaces | |
767 | * only really affect the children of the current process | |
768 | */ | |
769 | ret = lxc_attach_to_ns(init_pid, options->namespaces); | |
770 | if (ret < 0) { | |
771 | ERROR("failed to enter the namespace"); | |
772 | shutdown(ipc_sockets[1], SHUT_RDWR); | |
773 | rexit(-1); | |
774 | } | |
775 | ||
776 | /* attach succeeded, try to cwd */ | |
777 | if (options->initial_cwd) | |
778 | new_cwd = options->initial_cwd; | |
779 | else | |
780 | new_cwd = cwd; | |
781 | ret = chdir(new_cwd); | |
782 | if (ret < 0) | |
783 | WARN("could not change directory to '%s'", new_cwd); | |
784 | free(cwd); | |
785 | ||
786 | /* now create the real child process */ | |
787 | { | |
788 | struct attach_clone_payload payload = { | |
789 | .ipc_socket = ipc_sockets[1], | |
790 | .options = options, | |
791 | .init_ctx = init_ctx, | |
792 | .exec_function = exec_function, | |
793 | .exec_payload = exec_payload | |
794 | }; | |
795 | /* We use clone_parent here to make this subprocess a direct child of | |
796 | * the initial process. Then this intermediate process can exit and | |
797 | * the parent can directly track the attached process. | |
798 | */ | |
799 | pid = lxc_clone(attach_child_main, &payload, CLONE_PARENT); | |
800 | } | |
801 | ||
802 | /* shouldn't happen, clone() should always return positive pid */ | |
803 | if (pid <= 0) { | |
804 | SYSERROR("failed to create subprocess"); | |
805 | shutdown(ipc_sockets[1], SHUT_RDWR); | |
806 | rexit(-1); | |
807 | } | |
808 | ||
809 | /* tell grandparent the pid of the pid of the newly created child */ | |
810 | ret = lxc_write_nointr(ipc_sockets[1], &pid, sizeof(pid)); | |
811 | if (ret != sizeof(pid)) { | |
812 | /* if this really happens here, this is very unfortunate, since the | |
813 | * parent will not know the pid of the attached process and will | |
814 | * not be able to wait for it (and we won't either due to CLONE_PARENT) | |
815 | * so the parent won't be able to reap it and the attached process | |
816 | * will remain a zombie | |
817 | */ | |
818 | ERROR("error using IPC to notify main process of pid of the attached process"); | |
819 | shutdown(ipc_sockets[1], SHUT_RDWR); | |
820 | rexit(-1); | |
821 | } | |
822 | ||
823 | /* the rest is in the hands of the initial and the attached process */ | |
824 | rexit(0); | |
825 | } | |
826 | ||
827 | int attach_child_main(void* data) | |
828 | { | |
829 | struct attach_clone_payload* payload = (struct attach_clone_payload*)data; | |
830 | int ipc_socket = payload->ipc_socket; | |
831 | lxc_attach_options_t* options = payload->options; | |
832 | struct lxc_proc_context_info* init_ctx = payload->init_ctx; | |
1a2e58cf | 833 | #if HAVE_SYS_PERSONALITY_H |
9c4693b8 | 834 | long new_personality; |
1a2e58cf | 835 | #endif |
9c4693b8 CS |
836 | int ret; |
837 | int status; | |
838 | int expected; | |
839 | long flags; | |
840 | int fd; | |
841 | uid_t new_uid; | |
842 | gid_t new_gid; | |
843 | ||
844 | /* wait for the initial thread to signal us that it's ready | |
845 | * for us to start initializing | |
846 | */ | |
847 | expected = 0; | |
848 | status = -1; | |
849 | ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected); | |
850 | if (ret <= 0) { | |
851 | ERROR("error using IPC to receive notification from initial process (0)"); | |
852 | shutdown(ipc_socket, SHUT_RDWR); | |
853 | rexit(-1); | |
854 | } | |
855 | ||
856 | /* load apparmor profile */ | |
857 | if ((options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_APPARMOR)) { | |
858 | ret = attach_apparmor(init_ctx->aa_profile); | |
859 | if (ret < 0) { | |
860 | shutdown(ipc_socket, SHUT_RDWR); | |
861 | rexit(-1); | |
862 | } | |
863 | } | |
864 | ||
865 | /* A description of the purpose of this functionality is | |
866 | * provided in the lxc-attach(1) manual page. We have to | |
867 | * remount here and not in the parent process, otherwise | |
868 | * /proc may not properly reflect the new pid namespace. | |
869 | */ | |
870 | if (!(options->namespaces & CLONE_NEWNS) && (options->attach_flags & LXC_ATTACH_REMOUNT_PROC_SYS)) { | |
871 | ret = lxc_attach_remount_sys_proc(); | |
872 | if (ret < 0) { | |
873 | shutdown(ipc_socket, SHUT_RDWR); | |
874 | rexit(-1); | |
875 | } | |
876 | } | |
877 | ||
878 | /* now perform additional attachments*/ | |
879 | #if HAVE_SYS_PERSONALITY_H | |
880 | if (options->personality < 0) | |
881 | new_personality = init_ctx->personality; | |
882 | else | |
883 | new_personality = options->personality; | |
884 | ||
885 | if (options->attach_flags & LXC_ATTACH_SET_PERSONALITY) { | |
886 | ret = personality(new_personality); | |
887 | if (ret < 0) { | |
888 | SYSERROR("could not ensure correct architecture"); | |
889 | shutdown(ipc_socket, SHUT_RDWR); | |
890 | rexit(-1); | |
891 | } | |
892 | } | |
893 | #endif | |
894 | ||
895 | if (options->attach_flags & LXC_ATTACH_DROP_CAPABILITIES) { | |
896 | ret = lxc_attach_drop_privs(init_ctx); | |
897 | if (ret < 0) { | |
898 | ERROR("could not drop privileges"); | |
899 | shutdown(ipc_socket, SHUT_RDWR); | |
900 | rexit(-1); | |
901 | } | |
902 | } | |
903 | ||
904 | /* always set the environment (specify (LXC_ATTACH_KEEP_ENV, NULL, NULL) if you want this to be a no-op) */ | |
905 | ret = lxc_attach_set_environment(options->env_policy, options->extra_env_vars, options->extra_keep_env); | |
906 | if (ret < 0) { | |
907 | ERROR("could not set initial environment for attached process"); | |
908 | shutdown(ipc_socket, SHUT_RDWR); | |
909 | rexit(-1); | |
910 | } | |
911 | ||
912 | /* set user / group id */ | |
913 | new_uid = 0; | |
914 | new_gid = 0; | |
915 | /* ignore errors, we will fall back to root in that case | |
916 | * (/proc was not mounted etc.) | |
917 | */ | |
918 | if (options->namespaces & CLONE_NEWUSER) | |
919 | lxc_attach_get_init_uidgid(&new_uid, &new_gid); | |
920 | ||
921 | if (options->uid != (uid_t)-1) | |
922 | new_uid = options->uid; | |
923 | if (options->gid != (gid_t)-1) | |
924 | new_gid = options->gid; | |
925 | ||
926 | /* try to set the uid/gid combination */ | |
927 | if ((new_gid != 0 || options->namespaces & CLONE_NEWUSER) && setgid(new_gid)) { | |
928 | SYSERROR("switching to container gid"); | |
929 | shutdown(ipc_socket, SHUT_RDWR); | |
930 | rexit(-1); | |
931 | } | |
932 | if ((new_uid != 0 || options->namespaces & CLONE_NEWUSER) && setuid(new_uid)) { | |
933 | SYSERROR("switching to container uid"); | |
934 | shutdown(ipc_socket, SHUT_RDWR); | |
935 | rexit(-1); | |
936 | } | |
937 | ||
938 | /* tell initial process it may now put us into the cgroups */ | |
939 | status = 1; | |
940 | ret = lxc_write_nointr(ipc_socket, &status, sizeof(status)); | |
941 | if (ret != sizeof(status)) { | |
942 | ERROR("error using IPC to notify initial process for initialization (1)"); | |
943 | shutdown(ipc_socket, SHUT_RDWR); | |
944 | rexit(-1); | |
945 | } | |
946 | ||
947 | /* wait for the initial thread to signal us that it has done | |
948 | * everything for us when it comes to cgroups etc. | |
949 | */ | |
950 | expected = 2; | |
951 | status = -1; | |
952 | ret = lxc_read_nointr_expect(ipc_socket, &status, sizeof(status), &expected); | |
953 | if (ret <= 0) { | |
954 | ERROR("error using IPC to receive final notification from initial process (2)"); | |
955 | shutdown(ipc_socket, SHUT_RDWR); | |
956 | rexit(-1); | |
957 | } | |
958 | ||
959 | shutdown(ipc_socket, SHUT_RDWR); | |
960 | close(ipc_socket); | |
961 | free(init_ctx->aa_profile); | |
962 | free(init_ctx); | |
963 | ||
964 | /* The following is done after the communication socket is | |
965 | * shut down. That way, all errors that might (though | |
966 | * unlikely) occur up until this point will have their messages | |
967 | * printed to the original stderr (if logging is so configured) | |
968 | * and not the fd the user supplied, if any. | |
969 | */ | |
970 | ||
971 | /* fd handling for stdin, stdout and stderr; | |
972 | * ignore errors here, user may want to make sure | |
973 | * the fds are closed, for example */ | |
974 | if (options->stdin_fd >= 0 && options->stdin_fd != 0) | |
975 | dup2(options->stdin_fd, 0); | |
976 | if (options->stdout_fd >= 0 && options->stdout_fd != 1) | |
977 | dup2(options->stdout_fd, 1); | |
978 | if (options->stderr_fd >= 0 && options->stderr_fd != 2) | |
979 | dup2(options->stderr_fd, 2); | |
980 | ||
981 | /* close the old fds */ | |
982 | if (options->stdin_fd > 2) | |
983 | close(options->stdin_fd); | |
984 | if (options->stdout_fd > 2) | |
985 | close(options->stdout_fd); | |
986 | if (options->stderr_fd > 2) | |
987 | close(options->stderr_fd); | |
988 | ||
989 | /* try to remove CLOEXEC flag from stdin/stdout/stderr, | |
990 | * but also here, ignore errors */ | |
991 | for (fd = 0; fd <= 2; fd++) { | |
992 | flags = fcntl(fd, F_GETFL); | |
993 | if (flags < 0) | |
994 | continue; | |
995 | if (flags & FD_CLOEXEC) | |
996 | fcntl(fd, F_SETFL, flags & ~FD_CLOEXEC); | |
997 | } | |
998 | ||
999 | /* we're done, so we can now do whatever the user intended us to do */ | |
1000 | rexit(payload->exec_function(payload->exec_payload)); | |
1001 | } | |
1002 | ||
1003 | int lxc_attach_run_command(void* payload) | |
1004 | { | |
1005 | lxc_attach_command_t* cmd = (lxc_attach_command_t*)payload; | |
1006 | ||
1007 | execvp(cmd->program, cmd->argv); | |
1008 | SYSERROR("failed to exec '%s'", cmd->program); | |
1009 | return -1; | |
1010 | } | |
1011 | ||
/*
 * lxc_attach_run_shell: exec function that starts a shell for the user the
 * process is currently running as.
 *
 * @payload: ignored.
 *
 * The shell is taken from the passwd entry of getuid(). If getpwuid() finds
 * no entry, lxc_attach_getpwshell() is consulted instead. If no shell could
 * be determined, or executing it fails, /bin/sh is tried as a last resort.
 *
 * Does not return when a shell was executed. Returns -1 (after logging the
 * error) if even /bin/sh could not be executed.
 */
int lxc_attach_run_shell(void* payload)
{
	uid_t uid;
	struct passwd *passwd;
	char *user_shell;

	/* ignore payload parameter */
	(void)payload;

	uid = getuid();
	passwd = getpwuid(uid);

	/* this probably happens because of incompatible nss
	 * implementations in host and container (remember, this
	 * code is still using the host's glibc but our mount
	 * namespace is in the container)
	 * we may try to get the information by spawning a
	 * [getent passwd uid] process and parsing the result
	 */
	if (!passwd)
		user_shell = lxc_attach_getpwshell(uid);
	else
		user_shell = passwd->pw_shell;

	/* execlp() takes a variadic argument list, so the terminating null
	 * pointer has to be passed as (char *)NULL: a bare NULL may expand
	 * to an integer constant and would then be passed with the wrong
	 * type.
	 */
	if (user_shell)
		execlp(user_shell, user_shell, (char *)NULL);

	/* executed if either no passwd entry or execlp fails,
	 * we will fall back on /bin/sh as a default shell
	 */
	execlp("/bin/sh", "/bin/sh", (char *)NULL);
	SYSERROR("failed to exec shell");
	return -1;
}