]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/blame_incremental - kernel/user_namespace.c
UBUNTU: Ubuntu-5.15.0-39.42
[mirror_ubuntu-jammy-kernel.git] / kernel / user_namespace.c
... / ...
CommitLineData
1// SPDX-License-Identifier: GPL-2.0-only
2
3#include <linux/export.h>
4#include <linux/nsproxy.h>
5#include <linux/slab.h>
6#include <linux/sched/signal.h>
7#include <linux/user_namespace.h>
8#include <linux/proc_ns.h>
9#include <linux/highuid.h>
10#include <linux/cred.h>
11#include <linux/securebits.h>
12#include <linux/keyctl.h>
13#include <linux/key-type.h>
14#include <keys/user-type.h>
15#include <linux/seq_file.h>
16#include <linux/fs.h>
17#include <linux/uaccess.h>
18#include <linux/ctype.h>
19#include <linux/projid.h>
20#include <linux/fs_struct.h>
21#include <linux/bsearch.h>
22#include <linux/sort.h>
23
24/*
25 * sysctl determining whether unprivileged users may unshare a new
26 * userns. Allowed by default
27 */
28int unprivileged_userns_clone = 1;
29
30static struct kmem_cache *user_ns_cachep __read_mostly;
31static DEFINE_MUTEX(userns_state_mutex);
32
33static bool new_idmap_permitted(const struct file *file,
34 struct user_namespace *ns, int cap_setid,
35 struct uid_gid_map *map);
36static void free_user_ns(struct work_struct *work);
37
/* Charge one user namespace against @uid's UCOUNT_USER_NAMESPACES limit in
 * @ns. Returns the ucounts on success, NULL if the limit would be exceeded.
 */
static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
{
	return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);
}
42
43static void dec_user_namespaces(struct ucounts *ucounts)
44{
45 return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
46}
47
/* Reset @cred's capability state for entry into @user_ns: the task gets the
 * full capability set, but those capabilities are only meaningful inside the
 * new (child) user namespace.
 */
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
{
	/* Start with the same capabilities as init but useless for doing
	 * anything as the capabilities are bound to the new user namespace.
	 */
	cred->securebits = SECUREBITS_DEFAULT;
	cred->cap_inheritable = CAP_EMPTY_SET;
	cred->cap_permitted = CAP_FULL_SET;
	cred->cap_effective = CAP_FULL_SET;
	cred->cap_ambient = CAP_EMPTY_SET;
	cred->cap_bset = CAP_FULL_SET;
#ifdef CONFIG_KEYS
	/* Drop any request_key() authorisation key from the old namespace. */
	key_put(cred->request_key_auth);
	cred->request_key_auth = NULL;
#endif
	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
	cred->user_ns = user_ns;
}
66
67static unsigned long enforced_nproc_rlimit(void)
68{
69 unsigned long limit = RLIM_INFINITY;
70
71 /* Is RLIMIT_NPROC currently enforced? */
72 if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) ||
73 (current_user_ns() != &init_user_ns))
74 limit = rlimit(RLIMIT_NPROC);
75
76 return limit;
77}
78
79/*
80 * Create a new user namespace, deriving the creator from the user in the
81 * passed credentials, and replacing that user with the new root user for the
82 * new namespace.
83 *
84 * This is called by copy_creds(), which will finish setting the target task's
85 * credentials.
86 */
int create_user_ns(struct cred *new)
{
	struct user_namespace *ns, *parent_ns = new->user_ns;
	kuid_t owner = new->euid;
	kgid_t group = new->egid;
	struct ucounts *ucounts;
	int ret, i;

	/* Limit namespace nesting depth. */
	ret = -ENOSPC;
	if (parent_ns->level > 32)
		goto fail;

	/* Charge the new namespace against the creator's ucount limit. */
	ucounts = inc_user_namespaces(parent_ns, owner);
	if (!ucounts)
		goto fail;

	/*
	 * Verify that we can not violate the policy of which files
	 * may be accessed that is specified by the root directory,
	 * by verifying that the root directory is at the root of the
	 * mount namespace which allows all files to be accessed.
	 */
	ret = -EPERM;
	if (current_chrooted())
		goto fail_dec;

	/* The creator needs a mapping in the parent user namespace
	 * or else we won't be able to reasonably tell userspace who
	 * created a user_namespace.
	 */
	ret = -EPERM;
	if (!kuid_has_mapping(parent_ns, owner) ||
	    !kgid_has_mapping(parent_ns, group))
		goto fail_dec;

	ret = -ENOMEM;
	ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
	if (!ns)
		goto fail_dec;

	/* Remember whether the creator's cred could write fscaps; checked
	 * later by verify_root_map() when uid 0 is mapped.
	 */
	ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
	ret = ns_alloc_inum(&ns->ns);
	if (ret)
		goto fail_free;
	ns->ns.ops = &userns_operations;

	refcount_set(&ns->ns.count, 1);
	/* Leave the new->user_ns reference with the new user namespace. */
	ns->parent = parent_ns;
	ns->level = parent_ns->level + 1;
	ns->owner = owner;
	ns->group = group;
	INIT_WORK(&ns->work, free_user_ns);
	for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
		ns->ucount_max[i] = INT_MAX;
	}
	/* Seed the per-namespace rlimit-style ucount ceilings from the
	 * creator's current rlimits.
	 */
	set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
	set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
	set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
	set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
	ns->ucounts = ucounts;

	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
	mutex_lock(&userns_state_mutex);
	ns->flags = parent_ns->flags;
	mutex_unlock(&userns_state_mutex);

#ifdef CONFIG_KEYS
	INIT_LIST_HEAD(&ns->keyring_name_list);
	init_rwsem(&ns->keyring_sem);
#endif
	ret = -ENOMEM;
	if (!setup_userns_sysctls(ns))
		goto fail_keyring;

	set_cred_user_ns(new, ns);
	return 0;
fail_keyring:
#ifdef CONFIG_PERSISTENT_KEYRINGS
	key_put(ns->persistent_keyring_register);
#endif
	ns_free_inum(&ns->ns);
fail_free:
	kmem_cache_free(user_ns_cachep, ns);
fail_dec:
	dec_user_namespaces(ucounts);
fail:
	return ret;
}
176
177int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
178{
179 struct cred *cred;
180 int err = -ENOMEM;
181
182 if (!(unshare_flags & CLONE_NEWUSER))
183 return 0;
184
185 cred = prepare_creds();
186 if (cred) {
187 err = create_user_ns(cred);
188 if (err)
189 put_cred(cred);
190 else
191 *new_cred = cred;
192 }
193
194 return err;
195}
196
/*
 * Workqueue callback tearing down a dead user namespace. Walks up the
 * parent chain, freeing each ancestor whose refcount also drops to zero,
 * so long namespace chains are released iteratively rather than
 * recursively.
 */
static void free_user_ns(struct work_struct *work)
{
	struct user_namespace *parent, *ns =
		container_of(work, struct user_namespace, work);

	do {
		struct ucounts *ucounts = ns->ucounts;
		parent = ns->parent;
		/* Maps larger than the inline array were kmalloc'ed by
		 * insert_extent()/sort_idmaps() and must be freed here.
		 */
		if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
			kfree(ns->gid_map.forward);
			kfree(ns->gid_map.reverse);
		}
		if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
			kfree(ns->uid_map.forward);
			kfree(ns->uid_map.reverse);
		}
		if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
			kfree(ns->projid_map.forward);
			kfree(ns->projid_map.reverse);
		}
		retire_userns_sysctls(ns);
		key_free_user_ns(ns);
		ns_free_inum(&ns->ns);
		kmem_cache_free(user_ns_cachep, ns);
		dec_user_namespaces(ucounts);
		ns = parent;
	} while (refcount_dec_and_test(&parent->ns.count));
}
225
/* Last reference dropped: queue destruction. The heavy lifting happens in
 * free_user_ns() via the work item initialized in create_user_ns().
 */
void __put_user_ns(struct user_namespace *ns)
{
	schedule_work(&ns->work);
}
EXPORT_SYMBOL(__put_user_ns);
231
232/**
233 * idmap_key struct holds the information necessary to find an idmapping in a
234 * sorted idmap array. It is passed to cmp_map_id() as first argument.
235 */
236struct idmap_key {
237 bool map_up; /* true -> id from kid; false -> kid from id */
238 u32 id; /* id to find */
239 u32 count; /* == 0 unless used with map_id_range_down() */
240};
241
242/**
243 * cmp_map_id - Function to be passed to bsearch() to find the requested
244 * idmapping. Expects struct idmap_key to be passed via @k.
245 */
246static int cmp_map_id(const void *k, const void *e)
247{
248 u32 first, last, id2;
249 const struct idmap_key *key = k;
250 const struct uid_gid_extent *el = e;
251
252 id2 = key->id + key->count - 1;
253
254 /* handle map_id_{down,up}() */
255 if (key->map_up)
256 first = el->lower_first;
257 else
258 first = el->first;
259
260 last = first + el->count - 1;
261
262 if (key->id >= first && key->id <= last &&
263 (id2 >= first && id2 <= last))
264 return 0;
265
266 if (key->id < first || id2 < first)
267 return -1;
268
269 return 1;
270}
271
272/**
273 * map_id_range_down_max - Find idmap via binary search in ordered idmap array.
274 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
275 */
276static struct uid_gid_extent *
277map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
278{
279 struct idmap_key key;
280
281 key.map_up = false;
282 key.count = count;
283 key.id = id;
284
285 return bsearch(&key, map->forward, extents,
286 sizeof(struct uid_gid_extent), cmp_map_id);
287}
288
289/**
290 * map_id_range_down_base - Find idmap via binary search in static extent array.
291 * Can only be called if number of mappings is equal or less than
292 * UID_GID_MAP_MAX_BASE_EXTENTS.
293 */
294static struct uid_gid_extent *
295map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
296{
297 unsigned idx;
298 u32 first, last, id2;
299
300 id2 = id + count - 1;
301
302 /* Find the matching extent */
303 for (idx = 0; idx < extents; idx++) {
304 first = map->extent[idx].first;
305 last = first + map->extent[idx].count - 1;
306 if (id >= first && id <= last &&
307 (id2 >= first && id2 <= last))
308 return &map->extent[idx];
309 }
310 return NULL;
311}
312
/*
 * Map the contiguous range [@id, @id + @count - 1] down into the
 * kernel-global id space. Returns the mapped first id, or (u32)-1 when no
 * single extent covers the whole range.
 */
static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
	struct uid_gid_extent *extent;
	unsigned extents = map->nr_extents;
	/* Pairs with the smp_wmb() in map_write(): nr_extents is read
	 * before the extent data it guards.
	 */
	smp_rmb();

	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
		extent = map_id_range_down_base(extents, map, id, count);
	else
		extent = map_id_range_down_max(extents, map, id, count);

	/* Map the id or note failure */
	if (extent)
		id = (id - extent->first) + extent->lower_first;
	else
		id = (u32) -1;

	return id;
}
332
/* Map a single id down into the kernel-global id space. */
static u32 map_id_down(struct uid_gid_map *map, u32 id)
{
	return map_id_range_down(map, id, 1);
}
337
338/**
339 * map_id_up_base - Find idmap via binary search in static extent array.
340 * Can only be called if number of mappings is equal or less than
341 * UID_GID_MAP_MAX_BASE_EXTENTS.
342 */
343static struct uid_gid_extent *
344map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
345{
346 unsigned idx;
347 u32 first, last;
348
349 /* Find the matching extent */
350 for (idx = 0; idx < extents; idx++) {
351 first = map->extent[idx].lower_first;
352 last = first + map->extent[idx].count - 1;
353 if (id >= first && id <= last)
354 return &map->extent[idx];
355 }
356 return NULL;
357}
358
359/**
360 * map_id_up_max - Find idmap via binary search in ordered idmap array.
361 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
362 */
363static struct uid_gid_extent *
364map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
365{
366 struct idmap_key key;
367
368 key.map_up = true;
369 key.count = 1;
370 key.id = id;
371
372 return bsearch(&key, map->reverse, extents,
373 sizeof(struct uid_gid_extent), cmp_map_id);
374}
375
/*
 * Map a kernel-global id up into the namespace-local id space.
 * Returns (u32)-1 when the id is not covered by any extent.
 */
static u32 map_id_up(struct uid_gid_map *map, u32 id)
{
	struct uid_gid_extent *extent;
	unsigned extents = map->nr_extents;
	/* Pairs with the smp_wmb() in map_write(). */
	smp_rmb();

	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
		extent = map_id_up_base(extents, map, id);
	else
		extent = map_id_up_max(extents, map, id);

	/* Map the id or note failure */
	if (extent)
		id = (id - extent->lower_first) + extent->first;
	else
		id = (u32) -1;

	return id;
}
395
396/**
397 * make_kuid - Map a user-namespace uid pair into a kuid.
398 * @ns: User namespace that the uid is in
399 * @uid: User identifier
400 *
401 * Maps a user-namespace uid pair into a kernel internal kuid,
402 * and returns that kuid.
403 *
404 * When there is no mapping defined for the user-namespace uid
405 * pair INVALID_UID is returned. Callers are expected to test
406 * for and handle INVALID_UID being returned. INVALID_UID
407 * may be tested for using uid_valid().
408 */
kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
{
	/* Map the uid to a global kernel uid; unmapped ids map to (u32)-1,
	 * i.e. INVALID_UID.
	 */
	return KUIDT_INIT(map_id_down(&ns->uid_map, uid));
}
EXPORT_SYMBOL(make_kuid);
415
416/**
417 * from_kuid - Create a uid from a kuid user-namespace pair.
418 * @targ: The user namespace we want a uid in.
419 * @kuid: The kernel internal uid to start with.
420 *
421 * Map @kuid into the user-namespace specified by @targ and
422 * return the resulting uid.
423 *
424 * There is always a mapping into the initial user_namespace.
425 *
426 * If @kuid has no mapping in @targ (uid_t)-1 is returned.
427 */
uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
{
	/* Map the uid from a global kernel uid; (uid_t)-1 if unmapped. */
	return map_id_up(&targ->uid_map, __kuid_val(kuid));
}
EXPORT_SYMBOL(from_kuid);
434
435/**
436 * from_kuid_munged - Create a uid from a kuid user-namespace pair.
437 * @targ: The user namespace we want a uid in.
438 * @kuid: The kernel internal uid to start with.
439 *
440 * Map @kuid into the user-namespace specified by @targ and
441 * return the resulting uid.
442 *
443 * There is always a mapping into the initial user_namespace.
444 *
445 * Unlike from_kuid from_kuid_munged never fails and always
446 * returns a valid uid. This makes from_kuid_munged appropriate
 * for use in syscalls like stat and getuid where failing the
 * system call and failing to provide a valid uid are not
 * options.
450 *
451 * If @kuid has no mapping in @targ overflowuid is returned.
452 */
453uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
454{
455 uid_t uid;
456 uid = from_kuid(targ, kuid);
457
458 if (uid == (uid_t) -1)
459 uid = overflowuid;
460 return uid;
461}
462EXPORT_SYMBOL(from_kuid_munged);
463
464/**
465 * make_kgid - Map a user-namespace gid pair into a kgid.
466 * @ns: User namespace that the gid is in
467 * @gid: group identifier
468 *
469 * Maps a user-namespace gid pair into a kernel internal kgid,
470 * and returns that kgid.
471 *
472 * When there is no mapping defined for the user-namespace gid
473 * pair INVALID_GID is returned. Callers are expected to test
474 * for and handle INVALID_GID being returned. INVALID_GID may be
475 * tested for using gid_valid().
476 */
kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
{
	/* Map the gid to a global kernel gid; unmapped ids map to (u32)-1,
	 * i.e. INVALID_GID.
	 */
	return KGIDT_INIT(map_id_down(&ns->gid_map, gid));
}
EXPORT_SYMBOL(make_kgid);
483
484/**
485 * from_kgid - Create a gid from a kgid user-namespace pair.
486 * @targ: The user namespace we want a gid in.
487 * @kgid: The kernel internal gid to start with.
488 *
489 * Map @kgid into the user-namespace specified by @targ and
490 * return the resulting gid.
491 *
492 * There is always a mapping into the initial user_namespace.
493 *
494 * If @kgid has no mapping in @targ (gid_t)-1 is returned.
495 */
gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
{
	/* Map the gid from a global kernel gid; (gid_t)-1 if unmapped. */
	return map_id_up(&targ->gid_map, __kgid_val(kgid));
}
EXPORT_SYMBOL(from_kgid);
502
503/**
504 * from_kgid_munged - Create a gid from a kgid user-namespace pair.
505 * @targ: The user namespace we want a gid in.
506 * @kgid: The kernel internal gid to start with.
507 *
508 * Map @kgid into the user-namespace specified by @targ and
509 * return the resulting gid.
510 *
511 * There is always a mapping into the initial user_namespace.
512 *
513 * Unlike from_kgid from_kgid_munged never fails and always
514 * returns a valid gid. This makes from_kgid_munged appropriate
515 * for use in syscalls like stat and getgid where failing the
516 * system call and failing to provide a valid gid are not options.
517 *
518 * If @kgid has no mapping in @targ overflowgid is returned.
519 */
520gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
521{
522 gid_t gid;
523 gid = from_kgid(targ, kgid);
524
525 if (gid == (gid_t) -1)
526 gid = overflowgid;
527 return gid;
528}
529EXPORT_SYMBOL(from_kgid_munged);
530
531/**
532 * make_kprojid - Map a user-namespace projid pair into a kprojid.
533 * @ns: User namespace that the projid is in
534 * @projid: Project identifier
535 *
 * Maps a user-namespace projid pair into a kernel internal kprojid,
 * and returns that kprojid.
538 *
539 * When there is no mapping defined for the user-namespace projid
540 * pair INVALID_PROJID is returned. Callers are expected to test
541 * for and handle INVALID_PROJID being returned. INVALID_PROJID
542 * may be tested for using projid_valid().
543 */
kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
{
	/* Map the projid to a global kernel projid; unmapped ids map to
	 * (u32)-1, i.e. INVALID_PROJID.
	 */
	return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid));
}
EXPORT_SYMBOL(make_kprojid);
550
551/**
552 * from_kprojid - Create a projid from a kprojid user-namespace pair.
553 * @targ: The user namespace we want a projid in.
554 * @kprojid: The kernel internal project identifier to start with.
555 *
556 * Map @kprojid into the user-namespace specified by @targ and
557 * return the resulting projid.
558 *
559 * There is always a mapping into the initial user_namespace.
560 *
561 * If @kprojid has no mapping in @targ (projid_t)-1 is returned.
562 */
projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
{
	/* Map the projid from a global kernel projid; (projid_t)-1 if
	 * unmapped.
	 */
	return map_id_up(&targ->projid_map, __kprojid_val(kprojid));
}
EXPORT_SYMBOL(from_kprojid);
569
570/**
 * from_kprojid_munged - Create a projid from a kprojid user-namespace pair.
572 * @targ: The user namespace we want a projid in.
573 * @kprojid: The kernel internal projid to start with.
574 *
575 * Map @kprojid into the user-namespace specified by @targ and
576 * return the resulting projid.
577 *
578 * There is always a mapping into the initial user_namespace.
579 *
 * Unlike from_kprojid, from_kprojid_munged never fails and always
 * returns a valid projid. This makes from_kprojid_munged
 * appropriate for use in syscalls like stat where
 * failing the system call and failing to provide a valid projid
 * are not options.
585 *
586 * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
587 */
588projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
589{
590 projid_t projid;
591 projid = from_kprojid(targ, kprojid);
592
593 if (projid == (projid_t) -1)
594 projid = OVERFLOW_PROJID;
595 return projid;
596}
597EXPORT_SYMBOL(from_kprojid_munged);
598
599
600static int uid_m_show(struct seq_file *seq, void *v)
601{
602 struct user_namespace *ns = seq->private;
603 struct uid_gid_extent *extent = v;
604 struct user_namespace *lower_ns;
605 uid_t lower;
606
607 lower_ns = seq_user_ns(seq);
608 if ((lower_ns == ns) && lower_ns->parent)
609 lower_ns = lower_ns->parent;
610
611 lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));
612
613 seq_printf(seq, "%10u %10u %10u\n",
614 extent->first,
615 lower,
616 extent->count);
617
618 return 0;
619}
620
621static int gid_m_show(struct seq_file *seq, void *v)
622{
623 struct user_namespace *ns = seq->private;
624 struct uid_gid_extent *extent = v;
625 struct user_namespace *lower_ns;
626 gid_t lower;
627
628 lower_ns = seq_user_ns(seq);
629 if ((lower_ns == ns) && lower_ns->parent)
630 lower_ns = lower_ns->parent;
631
632 lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));
633
634 seq_printf(seq, "%10u %10u %10u\n",
635 extent->first,
636 lower,
637 extent->count);
638
639 return 0;
640}
641
642static int projid_m_show(struct seq_file *seq, void *v)
643{
644 struct user_namespace *ns = seq->private;
645 struct uid_gid_extent *extent = v;
646 struct user_namespace *lower_ns;
647 projid_t lower;
648
649 lower_ns = seq_user_ns(seq);
650 if ((lower_ns == ns) && lower_ns->parent)
651 lower_ns = lower_ns->parent;
652
653 lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));
654
655 seq_printf(seq, "%10u %10u %10u\n",
656 extent->first,
657 lower,
658 extent->count);
659
660 return 0;
661}
662
/*
 * Common seq_file ->start helper: return the extent at *ppos, or NULL when
 * the position is past the end of the map.
 */
static void *m_start(struct seq_file *seq, loff_t *ppos,
		     struct uid_gid_map *map)
{
	loff_t pos = *ppos;
	unsigned extents = map->nr_extents;
	/* Pairs with the smp_wmb() in map_write(). */
	smp_rmb();

	if (pos >= extents)
		return NULL;

	/* Small maps live in the inline array, big ones in ->forward. */
	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
		return &map->extent[pos];

	return &map->forward[pos];
}
678
/* seq_file ->start for /proc/<pid>/uid_map. */
static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
{
	struct user_namespace *ns = seq->private;

	return m_start(seq, ppos, &ns->uid_map);
}
685
/* seq_file ->start for /proc/<pid>/gid_map. */
static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
{
	struct user_namespace *ns = seq->private;

	return m_start(seq, ppos, &ns->gid_map);
}
692
/* seq_file ->start for /proc/<pid>/projid_map. */
static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
{
	struct user_namespace *ns = seq->private;

	return m_start(seq, ppos, &ns->projid_map);
}
699
/* seq_file ->next: advance the position and redo the ->start lookup. */
static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
{
	(*pos)++;
	return seq->op->start(seq, pos);
}
705
/* seq_file ->stop: nothing to tear down for idmap iteration. */
static void m_stop(struct seq_file *seq, void *v)
{
}
710
/* seq_file operation tables backing /proc/<pid>/{uid,gid,projid}_map. */
const struct seq_operations proc_uid_seq_operations = {
	.start = uid_m_start,
	.stop = m_stop,
	.next = m_next,
	.show = uid_m_show,
};

const struct seq_operations proc_gid_seq_operations = {
	.start = gid_m_start,
	.stop = m_stop,
	.next = m_next,
	.show = gid_m_show,
};

const struct seq_operations proc_projid_seq_operations = {
	.start = projid_m_start,
	.stop = m_stop,
	.next = m_next,
	.show = projid_m_show,
};
731
732static bool mappings_overlap(struct uid_gid_map *new_map,
733 struct uid_gid_extent *extent)
734{
735 u32 upper_first, lower_first, upper_last, lower_last;
736 unsigned idx;
737
738 upper_first = extent->first;
739 lower_first = extent->lower_first;
740 upper_last = upper_first + extent->count - 1;
741 lower_last = lower_first + extent->count - 1;
742
743 for (idx = 0; idx < new_map->nr_extents; idx++) {
744 u32 prev_upper_first, prev_lower_first;
745 u32 prev_upper_last, prev_lower_last;
746 struct uid_gid_extent *prev;
747
748 if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
749 prev = &new_map->extent[idx];
750 else
751 prev = &new_map->forward[idx];
752
753 prev_upper_first = prev->first;
754 prev_lower_first = prev->lower_first;
755 prev_upper_last = prev_upper_first + prev->count - 1;
756 prev_lower_last = prev_lower_first + prev->count - 1;
757
758 /* Does the upper range intersect a previous extent? */
759 if ((prev_upper_first <= upper_last) &&
760 (prev_upper_last >= upper_first))
761 return true;
762
763 /* Does the lower range intersect a previous extent? */
764 if ((prev_lower_first <= lower_last) &&
765 (prev_lower_last >= lower_first))
766 return true;
767 }
768 return false;
769}
770
771/**
772 * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
773 * Takes care to allocate a 4K block of memory if the number of mappings exceeds
774 * UID_GID_MAP_MAX_BASE_EXTENTS.
775 */
776static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
777{
778 struct uid_gid_extent *dest;
779
780 if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
781 struct uid_gid_extent *forward;
782
783 /* Allocate memory for 340 mappings. */
784 forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS,
785 sizeof(struct uid_gid_extent),
786 GFP_KERNEL);
787 if (!forward)
788 return -ENOMEM;
789
790 /* Copy over memory. Only set up memory for the forward pointer.
791 * Defer the memory setup for the reverse pointer.
792 */
793 memcpy(forward, map->extent,
794 map->nr_extents * sizeof(map->extent[0]));
795
796 map->forward = forward;
797 map->reverse = NULL;
798 }
799
800 if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS)
801 dest = &map->extent[map->nr_extents];
802 else
803 dest = &map->forward[map->nr_extents];
804
805 *dest = *extent;
806 map->nr_extents++;
807 return 0;
808}
809
810/* cmp function to sort() forward mappings */
811static int cmp_extents_forward(const void *a, const void *b)
812{
813 const struct uid_gid_extent *e1 = a;
814 const struct uid_gid_extent *e2 = b;
815
816 if (e1->first < e2->first)
817 return -1;
818
819 if (e1->first > e2->first)
820 return 1;
821
822 return 0;
823}
824
825/* cmp function to sort() reverse mappings */
826static int cmp_extents_reverse(const void *a, const void *b)
827{
828 const struct uid_gid_extent *e1 = a;
829 const struct uid_gid_extent *e2 = b;
830
831 if (e1->lower_first < e2->lower_first)
832 return -1;
833
834 if (e1->lower_first > e2->lower_first)
835 return 1;
836
837 return 0;
838}
839
840/**
841 * sort_idmaps - Sorts an array of idmap entries.
842 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
843 */
844static int sort_idmaps(struct uid_gid_map *map)
845{
846 if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
847 return 0;
848
849 /* Sort forward array. */
850 sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent),
851 cmp_extents_forward, NULL);
852
853 /* Only copy the memory from forward we actually need. */
854 map->reverse = kmemdup(map->forward,
855 map->nr_extents * sizeof(struct uid_gid_extent),
856 GFP_KERNEL);
857 if (!map->reverse)
858 return -ENOMEM;
859
860 /* Sort reverse array. */
861 sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent),
862 cmp_extents_reverse, NULL);
863
864 return 0;
865}
866
867/**
868 * verify_root_map() - check the uid 0 mapping
869 * @file: idmapping file
870 * @map_ns: user namespace of the target process
871 * @new_map: requested idmap
872 *
873 * If a process requests mapping parent uid 0 into the new ns, verify that the
874 * process writing the map had the CAP_SETFCAP capability as the target process
875 * will be able to write fscaps that are valid in ancestor user namespaces.
876 *
877 * Return: true if the mapping is allowed, false if not.
878 */
879static bool verify_root_map(const struct file *file,
880 struct user_namespace *map_ns,
881 struct uid_gid_map *new_map)
882{
883 int idx;
884 const struct user_namespace *file_ns = file->f_cred->user_ns;
885 struct uid_gid_extent *extent0 = NULL;
886
887 for (idx = 0; idx < new_map->nr_extents; idx++) {
888 if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
889 extent0 = &new_map->extent[idx];
890 else
891 extent0 = &new_map->forward[idx];
892 if (extent0->lower_first == 0)
893 break;
894
895 extent0 = NULL;
896 }
897
898 if (!extent0)
899 return true;
900
901 if (map_ns == file_ns) {
902 /* The process unshared its ns and is writing to its own
903 * /proc/self/uid_map. User already has full capabilites in
904 * the new namespace. Verify that the parent had CAP_SETFCAP
905 * when it unshared.
906 * */
907 if (!file_ns->parent_could_setfcap)
908 return false;
909 } else {
910 /* Process p1 is writing to uid_map of p2, who is in a child
911 * user namespace to p1's. Verify that the opener of the map
912 * file has CAP_SETFCAP against the parent of the new map
913 * namespace */
914 if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP))
915 return false;
916 }
917
918 return true;
919}
920
/*
 * map_write - parse and install a uid/gid/projid map written to a
 * /proc/<pid>/{uid,gid,projid}_map file.
 *
 * Each input line is "<first> <lower_first> <count>". The map may only ever
 * be written once, serialized by userns_state_mutex. @cap_setid is the
 * capability required on @map_ns (or -1 for projids), @parent_map is used to
 * translate the lower ids into the kernel-global id space.
 */
static ssize_t map_write(struct file *file, const char __user *buf,
			 size_t count, loff_t *ppos,
			 int cap_setid,
			 struct uid_gid_map *map,
			 struct uid_gid_map *parent_map)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *map_ns = seq->private;
	struct uid_gid_map new_map;
	unsigned idx;
	struct uid_gid_extent extent;
	char *kbuf = NULL, *pos, *next_line;
	ssize_t ret;

	/* Only allow < page size writes at the beginning of the file */
	if ((*ppos != 0) || (count >= PAGE_SIZE))
		return -EINVAL;

	/* Slurp in the user data */
	kbuf = memdup_user_nul(buf, count);
	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	/*
	 * The userns_state_mutex serializes all writes to any given map.
	 *
	 * Any map is only ever written once.
	 *
	 * An id map fits within 1 cache line on most architectures.
	 *
	 * On read nothing needs to be done unless you are on an
	 * architecture with a crazy cache coherency model like alpha.
	 *
	 * There is a one time data dependency between reading the
	 * count of the extents and the values of the extents. The
	 * desired behavior is to see the values of the extents that
	 * were written before the count of the extents.
	 *
	 * To achieve this smp_wmb() is used to guarantee the write
	 * order and smp_rmb() guarantees that we don't have crazy
	 * architectures returning stale data.
	 */
	mutex_lock(&userns_state_mutex);

	memset(&new_map, 0, sizeof(struct uid_gid_map));

	ret = -EPERM;
	/* Only allow one successful write to the map */
	if (map->nr_extents != 0)
		goto out;

	/*
	 * Adjusting namespace settings requires capabilities on the target.
	 */
	if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
		goto out;

	/* Parse the user data */
	ret = -EINVAL;
	pos = kbuf;
	for (; pos; pos = next_line) {

		/* Find the end of line and ensure I don't look past it */
		next_line = strchr(pos, '\n');
		if (next_line) {
			*next_line = '\0';
			next_line++;
			if (*next_line == '\0')
				next_line = NULL;
		}

		pos = skip_spaces(pos);
		extent.first = simple_strtoul(pos, &pos, 10);
		if (!isspace(*pos))
			goto out;

		pos = skip_spaces(pos);
		extent.lower_first = simple_strtoul(pos, &pos, 10);
		if (!isspace(*pos))
			goto out;

		pos = skip_spaces(pos);
		extent.count = simple_strtoul(pos, &pos, 10);
		if (*pos && !isspace(*pos))
			goto out;

		/* Verify there is not trailing junk on the line */
		pos = skip_spaces(pos);
		if (*pos != '\0')
			goto out;

		/* Verify we have been given valid starting values */
		if ((extent.first == (u32) -1) ||
		    (extent.lower_first == (u32) -1))
			goto out;

		/* Verify count is not zero and does not cause the
		 * extent to wrap
		 */
		if ((extent.first + extent.count) <= extent.first)
			goto out;
		if ((extent.lower_first + extent.count) <=
		    extent.lower_first)
			goto out;

		/* Do the ranges in extent overlap any previous extents? */
		if (mappings_overlap(&new_map, &extent))
			goto out;

		/* Reject input with more lines than UID_GID_MAP_MAX_EXTENTS. */
		if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
		    (next_line != NULL))
			goto out;

		ret = insert_extent(&new_map, &extent);
		if (ret < 0)
			goto out;
		ret = -EINVAL;
	}
	/* Be very certain the new map actually exists */
	if (new_map.nr_extents == 0)
		goto out;

	ret = -EPERM;
	/* Validate the user is allowed to use user id's mapped to. */
	if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
		goto out;

	ret = -EPERM;
	/* Map the lower ids from the parent user namespace to the
	 * kernel global id space.
	 */
	for (idx = 0; idx < new_map.nr_extents; idx++) {
		struct uid_gid_extent *e;
		u32 lower_first;

		if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
			e = &new_map.extent[idx];
		else
			e = &new_map.forward[idx];

		lower_first = map_id_range_down(parent_map,
						e->lower_first,
						e->count);

		/* Fail if we can not map the specified extent to
		 * the kernel global id space.
		 */
		if (lower_first == (u32) -1)
			goto out;

		e->lower_first = lower_first;
	}

	/*
	 * If we want to use binary search for lookup, this clones the extent
	 * array and sorts both copies.
	 */
	ret = sort_idmaps(&new_map);
	if (ret < 0)
		goto out;

	/* Install the map */
	if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
		memcpy(map->extent, new_map.extent,
		       new_map.nr_extents * sizeof(new_map.extent[0]));
	} else {
		map->forward = new_map.forward;
		map->reverse = NULL;
		map->forward = new_map.forward;
		map->reverse = new_map.reverse;
	}
	/* Pairs with the smp_rmb() in the lockless readers: publish the
	 * extent data before nr_extents makes it visible.
	 */
	smp_wmb();
	map->nr_extents = new_map.nr_extents;

	*ppos = count;
	ret = count;
out:
	/* On failure, undo any heap allocation made for a large map. */
	if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
		kfree(new_map.forward);
		kfree(new_map.reverse);
		map->forward = NULL;
		map->reverse = NULL;
		map->nr_extents = 0;
	}

	mutex_unlock(&userns_state_mutex);
	kfree(kbuf);
	return ret;
}
1108
1109ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
1110 size_t size, loff_t *ppos)
1111{
1112 struct seq_file *seq = file->private_data;
1113 struct user_namespace *ns = seq->private;
1114 struct user_namespace *seq_ns = seq_user_ns(seq);
1115
1116 if (!ns->parent)
1117 return -EPERM;
1118
1119 if ((seq_ns != ns) && (seq_ns != ns->parent))
1120 return -EPERM;
1121
1122 return map_write(file, buf, size, ppos, CAP_SETUID,
1123 &ns->uid_map, &ns->parent->uid_map);
1124}
1125
1126ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
1127 size_t size, loff_t *ppos)
1128{
1129 struct seq_file *seq = file->private_data;
1130 struct user_namespace *ns = seq->private;
1131 struct user_namespace *seq_ns = seq_user_ns(seq);
1132
1133 if (!ns->parent)
1134 return -EPERM;
1135
1136 if ((seq_ns != ns) && (seq_ns != ns->parent))
1137 return -EPERM;
1138
1139 return map_write(file, buf, size, ppos, CAP_SETGID,
1140 &ns->gid_map, &ns->parent->gid_map);
1141}
1142
1143ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
1144 size_t size, loff_t *ppos)
1145{
1146 struct seq_file *seq = file->private_data;
1147 struct user_namespace *ns = seq->private;
1148 struct user_namespace *seq_ns = seq_user_ns(seq);
1149
1150 if (!ns->parent)
1151 return -EPERM;
1152
1153 if ((seq_ns != ns) && (seq_ns != ns->parent))
1154 return -EPERM;
1155
1156 /* Anyone can set any valid project id no capability needed */
1157 return map_write(file, buf, size, ppos, -1,
1158 &ns->projid_map, &ns->parent->projid_map);
1159}
1160
/*
 * Decide whether the opener of the map file may install @new_map into
 * user namespace @ns.  @cap_setid is CAP_SETUID for uid maps,
 * CAP_SETGID for gid maps, and -1 (not a valid capability) for projid
 * maps.  Returns true if the mapping is permitted.
 *
 * Called from map_write() with userns_state_mutex held.
 */
static bool new_idmap_permitted(const struct file *file,
				struct user_namespace *ns, int cap_setid,
				struct uid_gid_map *new_map)
{
	/* Use the opener's credentials, not the current task's, so an
	 * fd passed to a more-privileged task cannot be abused. */
	const struct cred *cred = file->f_cred;

	/* NOTE(review): verify_root_map() is defined elsewhere in this
	 * file; presumably it constrains which uid may map to id 0 —
	 * confirm against its definition. */
	if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
		return false;

	/* Don't allow mappings that would allow anything that wouldn't
	 * be allowed without the establishment of unprivileged mappings.
	 */
	if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
	    uid_eq(ns->owner, cred->euid)) {
		/* Single-id self-map by the namespace owner. */
		u32 id = new_map->extent[0].lower_first;
		if (cap_setid == CAP_SETUID) {
			kuid_t uid = make_kuid(ns->parent, id);
			/* Mapping one's own euid needs no privilege. */
			if (uid_eq(uid, cred->euid))
				return true;
		} else if (cap_setid == CAP_SETGID) {
			kgid_t gid = make_kgid(ns->parent, id);
			/* An unprivileged gid self-map is only allowed
			 * after setgroups has been disabled for this
			 * namespace (see proc_setgroups_write). */
			if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
			    gid_eq(gid, cred->egid))
				return true;
		}
	}

	/* Allow anyone to set a mapping that doesn't require privilege */
	if (!cap_valid(cap_setid))
		return true;

	/* Allow the specified ids if we have the appropriate capability
	 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
	 * And the opener of the id file also has the appropriate capability.
	 */
	if (ns_capable(ns->parent, cap_setid) &&
	    file_ns_capable(file, ns->parent, cap_setid))
		return true;

	return false;
}
1202
1203int proc_setgroups_show(struct seq_file *seq, void *v)
1204{
1205 struct user_namespace *ns = seq->private;
1206 unsigned long userns_flags = READ_ONCE(ns->flags);
1207
1208 seq_printf(seq, "%s\n",
1209 (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
1210 "allow" : "deny");
1211 return 0;
1212}
1213
/*
 * Write handler for /proc/<pid>/setgroups: accepts exactly "allow" or
 * "deny" (optionally followed by whitespace) as the whole write.  The
 * transition is one-way: setgroups cannot be re-allowed once denied,
 * and cannot be denied once a gid mapping has been written.  The state
 * change is serialized against gid_map writes by userns_state_mutex.
 */
ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	char kbuf[8], *pos;
	bool setgroups_allowed;
	ssize_t ret;

	/* Only allow a very narrow range of strings to be written */
	ret = -EINVAL;
	if ((*ppos != 0) || (count >= sizeof(kbuf)))
		goto out;

	/* What was written? */
	ret = -EFAULT;
	if (copy_from_user(kbuf, buf, count))
		goto out;
	/* count < sizeof(kbuf), so the terminator always fits. */
	kbuf[count] = '\0';
	pos = kbuf;

	/* What is being requested? */
	ret = -EINVAL;
	if (strncmp(pos, "allow", 5) == 0) {
		pos += 5;
		setgroups_allowed = true;
	}
	else if (strncmp(pos, "deny", 4) == 0) {
		pos += 4;
		setgroups_allowed = false;
	}
	else
		goto out;

	/* Verify there is not trailing junk on the line */
	pos = skip_spaces(pos);
	if (*pos != '\0')
		goto out;

	ret = -EPERM;
	mutex_lock(&userns_state_mutex);
	if (setgroups_allowed) {
		/* Enabling setgroups after setgroups has been disabled
		 * is not allowed.
		 */
		if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
			goto out_unlock;
		/* Otherwise the flag is still set - nothing to change. */
	} else {
		/* Permanently disabling setgroups after setgroups has
		 * been enabled by writing the gid_map is not allowed.
		 */
		if (ns->gid_map.nr_extents != 0)
			goto out_unlock;
		ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
	}
	mutex_unlock(&userns_state_mutex);

	/* Report a successful write */
	*ppos = count;
	ret = count;
out:
	return ret;
out_unlock:
	mutex_unlock(&userns_state_mutex);
	goto out;
}
1280
1281bool userns_may_setgroups(const struct user_namespace *ns)
1282{
1283 bool allowed;
1284
1285 mutex_lock(&userns_state_mutex);
1286 /* It is not safe to use setgroups until a gid mapping in
1287 * the user namespace has been established.
1288 */
1289 allowed = ns->gid_map.nr_extents != 0;
1290 /* Is setgroups allowed? */
1291 allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
1292 mutex_unlock(&userns_state_mutex);
1293
1294 return allowed;
1295}
1296
1297/*
1298 * Returns true if @child is the same namespace or a descendant of
1299 * @ancestor.
1300 */
1301bool in_userns(const struct user_namespace *ancestor,
1302 const struct user_namespace *child)
1303{
1304 const struct user_namespace *ns;
1305 for (ns = child; ns->level > ancestor->level; ns = ns->parent)
1306 ;
1307 return (ns == ancestor);
1308}
1309
1310bool current_in_userns(const struct user_namespace *target_ns)
1311{
1312 return in_userns(target_ns, current_user_ns());
1313}
1314EXPORT_SYMBOL(current_in_userns);
1315
/* Map a generic ns_common back to its containing user_namespace. */
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
	return container_of(ns, struct user_namespace, ns);
}
1320
1321static struct ns_common *userns_get(struct task_struct *task)
1322{
1323 struct user_namespace *user_ns;
1324
1325 rcu_read_lock();
1326 user_ns = get_user_ns(__task_cred(task)->user_ns);
1327 rcu_read_unlock();
1328
1329 return user_ns ? &user_ns->ns : NULL;
1330}
1331
/* proc_ns_operations .put hook: drop the reference taken by userns_get(). */
static void userns_put(struct ns_common *ns)
{
	put_user_ns(to_user_ns(ns));
}
1336
/*
 * proc_ns_operations .install hook for setns(CLONE_NEWUSER): move the
 * prepared credentials in @nsset into user namespace @ns.  Returns 0 on
 * success or a negative errno.
 */
static int userns_install(struct nsset *nsset, struct ns_common *ns)
{
	struct user_namespace *user_ns = to_user_ns(ns);
	struct cred *cred;

	/* Don't allow gaining capabilities by reentering
	 * the same user namespace.
	 */
	if (user_ns == current_user_ns())
		return -EINVAL;

	/* Tasks that share a thread group must share a user namespace */
	if (!thread_group_empty(current))
		return -EINVAL;

	/* Refuse while the fs_struct is shared with another task. */
	if (current->fs->users != 1)
		return -EINVAL;

	/* The caller must already be privileged over the target namespace. */
	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	cred = nsset_cred(nsset);
	if (!cred)
		return -EINVAL;

	/* Swap the cred's namespace reference: drop the old, take the new. */
	put_user_ns(cred->user_ns);
	set_cred_user_ns(cred, get_user_ns(user_ns));

	/* Re-home the cred's ucounts in the new namespace. */
	if (set_cred_ucounts(cred) < 0)
		return -EINVAL;

	return 0;
}
1370
1371struct ns_common *ns_get_owner(struct ns_common *ns)
1372{
1373 struct user_namespace *my_user_ns = current_user_ns();
1374 struct user_namespace *owner, *p;
1375
1376 /* See if the owner is in the current user namespace */
1377 owner = p = ns->ops->owner(ns);
1378 for (;;) {
1379 if (!p)
1380 return ERR_PTR(-EPERM);
1381 if (p == my_user_ns)
1382 break;
1383 p = p->parent;
1384 }
1385
1386 return &get_user_ns(owner)->ns;
1387}
1388
/* A user namespace is owned by its parent namespace. */
static struct user_namespace *userns_owner(struct ns_common *ns)
{
	return to_user_ns(ns)->parent;
}
1393
/* proc namespace operations for /proc/<pid>/ns/user. */
const struct proc_ns_operations userns_operations = {
	.name		= "user",
	.type		= CLONE_NEWUSER,
	.get		= userns_get,
	.put		= userns_put,
	.install	= userns_install,
	.owner		= userns_owner,
	.get_parent	= ns_get_owner,
};
1403
/* Create the slab cache for user_namespace objects at boot. */
static __init int user_namespaces_init(void)
{
	/* SLAB_PANIC: boot cannot proceed without this cache;
	 * SLAB_ACCOUNT: charge allocations to the creating cgroup. */
	user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT);
	return 0;
}
subsys_initcall(user_namespaces_init);