2 * lxc: linux Container library
4 * (C) Copyright IBM Corp. 2007, 2008
7 * Daniel Lezcano <daniel.lezcano at free.fr>
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 /* Properly support loop devices on 32bit systems. */
27 #define _FILE_OFFSET_BITS 64
36 #include <linux/loop.h>
37 #include <linux/magic.h>
38 #include <linux/types.h>
39 #include <sys/syscall.h>
40 #include <sys/types.h>
43 #ifdef HAVE_LINUX_MEMFD_H
44 #include <linux/memfd.h>
47 #include "initutils.h"
49 /* Define __S_ISTYPE if missing from the C library. */
51 #define __S_ISTYPE(mode, mask) (((mode)&S_IFMT) == (mask))
56 #define CAP_SETFCAP 31
59 #ifndef CAP_MAC_OVERRIDE
60 #define CAP_MAC_OVERRIDE 32
64 #define CAP_MAC_ADMIN 33
68 #ifndef PR_CAPBSET_DROP
69 #define PR_CAPBSET_DROP 24
72 #ifndef LO_FLAGS_AUTOCLEAR
73 #define LO_FLAGS_AUTOCLEAR 4
84 /* needed for cgroup automount checks, regardless of whether we
85 * have included linux/capability.h or not */
87 #define CAP_SYS_ADMIN 21
90 #ifndef CGROUP_SUPER_MAGIC
91 #define CGROUP_SUPER_MAGIC 0x27e0eb
94 #ifndef CGROUP2_SUPER_MAGIC
95 #define CGROUP2_SUPER_MAGIC 0x63677270
/* The largest 64-bit unsigned integer, 2^64 - 1 = 18446744073709551615, has 20 decimal digits; 21 leaves one extra byte (presumably for the terminating NUL). */
100 #define LXC_NUMSTRLEN64 21
101 #define LXC_LINELEN 4096
102 #define LXC_IDMAPLEN 4096
103 #define LXC_MAX_BUFFER 4096
105 /* returns 1 on success, 0 if there were any failures */
extern int lxc_rmdir_onedev(const char *path, const char *exclude);
extern int get_u16(unsigned short *val, const char *arg, int base);
extern int mkdir_p(const char *dir, mode_t mode);
extern char *get_rundir(void);
111 /* Define getline() if missing from the C library */
114 #include <../include/getline.h>
118 #if !defined(__NR_setns) && !defined(__NR_set_ns)
119 #if defined(__x86_64__)
120 #define __NR_setns 308
121 #elif defined(__i386__)
122 #define __NR_setns 346
123 #elif defined(__arm__)
124 #define __NR_setns 375
125 #elif defined(__aarch64__)
126 #define __NR_setns 375
127 #elif defined(__powerpc__)
128 #define __NR_setns 350
129 #elif defined(__s390__)
130 #define __NR_setns 339
134 /* Define setns() if missing from the C library */
136 static inline int setns(int fd
, int nstype
)
139 return syscall(__NR_setns
, fd
, nstype
);
140 #elif defined(__NR_set_ns)
141 return syscall(__NR_set_ns
, fd
, nstype
);
149 /* Define sethostname() if missing from the C library */
150 #ifndef HAVE_SETHOSTNAME
151 static inline int sethostname(const char * name
, size_t len
)
153 #ifdef __NR_sethostname
154 return syscall(__NR_sethostname
, name
, len
);
162 /* Define unshare() if missing from the C library */
164 static inline int unshare(int flags
)
167 return syscall(__NR_unshare
, flags
);
174 extern int unshare(int);
177 /* Define signalfd() if missing from the C library */
178 #ifdef HAVE_SYS_SIGNALFD_H
179 # include <sys/signalfd.h>
181 /* assume kernel headers are too old */
183 struct signalfd_siginfo
193 uint32_t ssi_overrun
;
204 # ifndef __NR_signalfd4
205 /* assume kernel headers are too old */
207 # define __NR_signalfd4 327
209 # define __NR_signalfd4 289
211 # define __NR_signalfd4 313
213 # define __NR_signalfd4 322
215 # define __NR_signalfd4 355
216 # elif __mips__ && _MIPS_SIM == _ABIO32
217 # define __NR_signalfd4 4324
218 # elif __mips__ && _MIPS_SIM == _ABI64
219 # define __NR_signalfd4 5283
220 # elif __mips__ && _MIPS_SIM == _ABIN32
221 # define __NR_signalfd4 6287
225 # ifndef __NR_signalfd
226 /* assume kernel headers are too old */
228 # define __NR_signalfd 321
230 # define __NR_signalfd 282
232 # define __NR_signalfd 305
234 # define __NR_signalfd 316
236 # define __NR_signalfd 349
237 # elif __mips__ && _MIPS_SIM == _ABIO32
238 # define __NR_signalfd 4317
239 # elif __mips__ && _MIPS_SIM == _ABI64
240 # define __NR_signalfd 5276
241 # elif __mips__ && _MIPS_SIM == _ABIN32
242 # define __NR_signalfd 6280
/* Fallback signalfd() for C libraries that do not provide one.
 *
 * Issues the signalfd4 syscall first. If that call fails with ENOSYS and no
 * flags were requested, retries with the older signalfd syscall, which takes
 * no flags argument.
 *
 * @param fd    passed through to the syscall unchanged
 * @param mask  signal set passed through to the syscall
 * @param flags passed to signalfd4; the fallback is only tried when this is 0
 *
 * Returns the value of the last syscall issued.
 */
static inline int signalfd(int fd, const sigset_t *mask, int flags)
{
	int retval;

	retval = syscall(__NR_signalfd4, fd, mask, _NSIG / 8, flags);

	/* errno is only meaningful after a failed call. Without the retval
	 * check a stale ENOSYS left over from earlier code would trigger a
	 * second syscall and leak the descriptor signalfd4 just returned.
	 */
	if (retval < 0 && errno == ENOSYS && flags == 0)
		retval = syscall(__NR_signalfd, fd, mask, _NSIG / 8);

	return retval;
}
258 #ifndef LO_FLAGS_AUTOCLEAR
259 #define LO_FLAGS_AUTOCLEAR 4
262 #ifndef LOOP_CTL_GET_FREE
263 #define LOOP_CTL_GET_FREE 0x4C82
268 #define MFD_CLOEXEC 0x0001U
271 #ifndef MFD_ALLOW_SEALING
272 #define MFD_ALLOW_SEALING 0x0002U
275 #ifndef HAVE_MEMFD_CREATE
276 static inline int memfd_create(const char *name
, unsigned int flags
) {
277 #ifndef __NR_memfd_create
279 #define __NR_memfd_create 356
280 #elif defined __x86_64__
281 #define __NR_memfd_create 319
282 #elif defined __arm__
283 #define __NR_memfd_create 385
284 #elif defined __aarch64__
285 #define __NR_memfd_create 279
286 #elif defined __s390__
287 #define __NR_memfd_create 350
288 #elif defined __powerpc__
289 #define __NR_memfd_create 360
290 #elif defined __sparc__
291 #define __NR_memfd_create 348
292 #elif defined __blackfin__
293 #define __NR_memfd_create 390
294 #elif defined __ia64__
295 #define __NR_memfd_create 1340
296 #elif defined _MIPS_SIM
297 #if _MIPS_SIM == _MIPS_SIM_ABI32
298 #define __NR_memfd_create 4354
300 #if _MIPS_SIM == _MIPS_SIM_NABI32
301 #define __NR_memfd_create 6318
303 #if _MIPS_SIM == _MIPS_SIM_ABI64
304 #define __NR_memfd_create 5314
308 #ifdef __NR_memfd_create
309 return syscall(__NR_memfd_create
, name
, flags
);
316 extern int memfd_create(const char *name
, unsigned int flags
);
/* Set the close-on-exec flag on @fd; returns the result of fcntl(F_SETFD). */
static inline int lxc_set_cloexec(int fd)
{
	const int fd_flags = FD_CLOEXEC;

	return fcntl(fd, F_SETFD, fd_flags);
}
324 /* Struct to carry child pid from lxc_popen() to lxc_pclose().
325 * Not an opaque struct to allow direct access to the underlying FILE *
326 * (i.e., struct lxc_popen_FILE *file; fgets(buf, sizeof(buf), file->f))
327 * without additional wrappers.
329 struct lxc_popen_FILE
{
335 /* popen(command, "re") replacement that restores default signal mask
336 * via sigprocmask(2) (unblocks all signals) after fork(2) but prior to calling exec(3).
337 * In short, popen(command, "re") does pipe() + fork() + exec()
338 * while lxc_popen(command) does pipe() + fork() + sigprocmask() + exec().
339 * Returns pointer to struct lxc_popen_FILE, that should be freed with lxc_pclose().
340 * On error returns NULL.
extern struct lxc_popen_FILE *lxc_popen(const char *command);
344 /* pclose() replacement to be used on struct lxc_popen_FILE *,
345 * returned by lxc_popen().
346 * Waits for associated process to terminate, returns its exit status and
347 * frees resources, pointed to by struct lxc_popen_FILE *.
extern int lxc_pclose(struct lxc_popen_FILE *fp);
352 * BUILD_BUG_ON - break compile if a condition is true.
353 * @condition: the condition which the compiler should know is false.
355 * If you have some code which relies on certain constants being equal, or
356 * other compile-time-evaluated condition, you should use BUILD_BUG_ON to
357 * detect if someone changes it.
359 * The implementation uses gcc's reluctance to create a negative array, but
360 * gcc (as of 4.4) only emits that error for obvious cases (eg. not arguments
361 * to inline functions). So as a fallback we use the optimizer; if it can't
362 * prove the condition is false, it will cause a link error on the undefined
363 * "__build_bug_on_failed". This error message can be harder to track down
364 * though, hence the two different methods.
367 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
369 extern int __build_bug_on_failed
;
370 #define BUILD_BUG_ON(condition) \
372 ((void)sizeof(char[1 - 2*!!(condition)])); \
373 if (condition) __build_bug_on_failed = 1; \
378 * wait on a child we forked
extern int wait_for_pid(pid_t pid);
extern int lxc_wait_for_pid_status(pid_t pid);
383 /* send and receive buffers completely */
extern ssize_t lxc_write_nointr(int fd, const void *buf, size_t count);
extern ssize_t lxc_read_nointr(int fd, void *buf, size_t count);
extern ssize_t lxc_read_nointr_expect(int fd, void *buf, size_t count,
				      const void *expected_buf);
#define SHA_DIGEST_LENGTH 20
extern int sha1sum_file(char *fnam, unsigned char *md_value);
393 /* read and write whole files */
extern int lxc_write_to_file(const char *filename, const void *buf,
			     size_t count, bool add_newline);
extern int lxc_read_from_file(const char *filename, void *buf, size_t count);
398 /* convert variadic argument lists to arrays (for execl type argument lists) */
extern char **lxc_va_arg_list_to_argv(va_list ap, size_t skip, int do_strdup);
extern const char **lxc_va_arg_list_to_argv_const(va_list ap, size_t skip);
402 /* Some simple string functions; if they return pointers, they are allocated
extern char *lxc_string_replace(const char *needle, const char *replacement,
				const char *haystack);
extern bool lxc_string_in_array(const char *needle, const char **haystack);
408 extern char *lxc_string_join(const char *sep
, const char **parts
,
410 /* Normalize and split path: Leading and trailing / are removed, multiple
411 * / are compactified, .. and . are resolved (.. on the top level is considered
415 * foo/../bar -> { bar, NULL }
417 * ./bar/baz/.. -> { bar, NULL }
418 * foo//bar -> { foo, bar, NULL }
extern char **lxc_normalize_path(const char *path);
/* Collapse runs of slashes in the path, e.g. ///foo//bar -> /foo/bar. */
extern char *lxc_deslashify(const char *path);
extern char *lxc_append_paths(const char *first, const char *second);
424 /* Note: the following two functions use strtok(), so they will never
425 * consider an empty element, even if two delimiters are next to
428 extern bool lxc_string_in_list(const char *needle
, const char *haystack
,
extern char **lxc_string_split(const char *string, char sep);
extern char **lxc_string_split_and_trim(const char *string, char sep);
extern char **lxc_string_split_quoted(char *string);
/* Append a string to a NULL-terminated string array. */
extern int lxc_append_string(char ***list, char *entry);
436 /* some simple array manipulation utilities */
typedef void (*lxc_free_fn)(void *);
typedef void *(*lxc_dup_fn)(void *);
extern int lxc_grow_array(void ***array, size_t *capacity, size_t new_size,
			  size_t capacity_increment);
extern void lxc_free_array(void **array, lxc_free_fn element_free_fn);
extern size_t lxc_array_len(void **array);
extern void **lxc_append_null_to_array(void **array, size_t count);
446 /* initialize rand with urandom */
447 extern int randseed(bool);
449 /* are we unprivileged with respect to our namespaces */
/* True whenever the effective uid of the caller is non-zero. */
static inline bool am_guest_unpriv(void)
{
	uid_t euid = geteuid();

	return euid != 0;
}
454 /* are we unprivileged with respect to init_user_ns */
455 inline static bool am_host_unpriv(void)
458 uid_t user
, host
, count
;
464 /* Now: are we in a user namespace? Because then we're also
467 f
= fopen("/proc/self/uid_map", "r");
472 ret
= fscanf(f
, "%u %u %u", &user
, &host
, &count
);
478 if (user
!= 0 || host
!= 0 || count
!= UINT32_MAX
)
484 * parse /proc/self/uid_map to find what @orig maps to
extern uid_t get_ns_uid(uid_t orig);
extern bool dir_exists(const char *path);
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
extern uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval);

extern int detect_shared_rootfs(void);
extern bool detect_ramfs_rootfs(void);
extern char *on_path(const char *cmd, const char *rootfs);
extern bool file_exists(const char *f);
extern bool cgns_supported(void);
extern char *choose_init(const char *rootfs);
extern int print_to_file(const char *file, const char *content);
extern bool switch_to_ns(pid_t pid, const char *ns);
extern int is_dir(const char *path);
extern char *get_template_path(const char *t);
503 extern int safe_mount(const char *src
, const char *dest
, const char *fstype
,
504 unsigned long flags
, const void *data
,
extern int lxc_mount_proc_if_needed(const char *rootfs);
extern int open_devnull(void);
extern int set_stdfds(int fd);
extern int null_stdfds(void);
extern int lxc_count_file_lines(const char *fn);
extern int lxc_preserve_ns(const int pid, const char *ns);
/* Check whether a process is blocking a given signal. */
extern bool task_blocking_signal(pid_t pid, int signal);
/* Number-parsing helpers. */
extern int lxc_safe_uint(const char *numstr, unsigned int *converted);
extern int lxc_safe_int(const char *numstr, int *converted);
extern int lxc_safe_long(const char *numstr, long int *converted);
extern int lxc_safe_long_long(const char *numstr, long long int *converted);
extern int lxc_safe_ulong(const char *numstr, unsigned long *converted);
/* Handles the suffixes B, kb, MB and GB. Overflows are detected and reported
 * as -ERANGE.
 */
extern int parse_byte_size_string(const char *s, int64_t *converted);
/* Switch to a new uid and gid. */
extern int lxc_switch_uid_gid(uid_t uid, gid_t gid);
extern int lxc_setgroups(int size, gid_t list[]);
/* Find an unused loop device and associate it with @source. */
extern int lxc_prepare_loop_dev(const char *source, char *loop_dev, int flags);
532 /* Clear all mounts on a given node.
533 * >= 0 successfully cleared. The number returned is the number of umounts
535 * < 0 error umounting. Return -errno.
extern int lxc_unstack_mountpoint(const char *path, bool lazy);
 * run_command runs a command and collects its std{err,out} output in buf.
 * @param[out] buf The buffer where the command's std{err,out} output will be
543 * read into. If no output was produced, buf will be memset
545 * @param[in] buf_size The size of buf. This function will reserve one byte for
547 * @param[in] child_fn The function to be run in the child process. This
548 * function must exec.
549 * @param[in] args Arguments to be passed to child_fn.
551 extern int run_command(char *buf
, size_t buf_size
, int (*child_fn
)(void *),
554 /* Concatenate all passed-in strings into one path. Do not fail. If any piece
555 * is not prefixed with '/', add a '/'.
__attribute__((sentinel)) extern char *must_make_path(const char *first, ...);
__attribute__((sentinel)) extern char *must_append_path(char *first, ...);
/* Return a copy of string @entry; do not fail. */
extern char *must_copy_string(const char *entry);

/* Reallocate a pointer; do not fail. */
extern void *must_realloc(void *orig, size_t sz);
/* __typeof__ should be safe to use with all compilers. */
typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic;
extern bool has_fs_type(const char *path, fs_type_magic magic_val);
extern bool is_fs_type(const struct statfs *fs, fs_type_magic magic_val);
extern bool lxc_nic_exists(char *nic);
extern int lxc_make_tmpfile(char *template, bool rm);
573 static inline uint64_t lxc_getpagesize(void)
577 pgsz
= sysconf(_SC_PAGESIZE
);
584 /* If n is not a power of 2 this function will return the next power of 2
585 * greater than that number. Note that this function always returns the *next*
 * power of 2 *greater* than that number, not the *nearest*. For example, passing 1025
587 * as argument this function will return 2048 although the closest power of 2
589 * If the caller passes in 0 they will receive 0 in return since this is invalid
590 * input and 0 is not a power of 2.
extern uint64_t lxc_find_next_power2(uint64_t n);
594 static inline pid_t
lxc_raw_gettid(void)
597 return syscall(SYS_gettid
);
599 return lxc_raw_getpid();
/* Set the signal the child process receives once its parent has died. */
extern int lxc_set_death_signal(int signal);
606 #endif /* __LXC_UTILS_H */