kernel/pid.c

   1 /*
   2  * Generic pidhash and scalable, time-bounded PID allocator
   3  *
   4  * (C) 2002-2003 William Irwin, IBM
   5  * (C) 2004 William Irwin, Oracle
   6  * (C) 2002-2004 Ingo Molnar, Red Hat
   7  *
   8  * pid-structures are backing objects for tasks sharing a given ID to chain
   9  * against. There is very little to them aside from hashing them and
  10  * parking tasks using given ID's on a list.
  11  *
  12  * The hash is always changed with the tasklist_lock write-acquired,
  13  * and the hash is only accessed with the tasklist_lock at least
  14  * read-acquired, so there's no additional SMP locking needed here.
  15  *
  16  * We have a list of bitmap pages, which bitmaps represent the PID space.
  17  * Allocating and freeing PIDs is completely lockless. The worst-case
  18  * allocation scenario when all but one out of 1 million PIDs possible are
  19  * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
  20  * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
  21  *
  22  * Pid namespaces:
  23  *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
  24  *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
  25  *     Many thanks to Oleg Nesterov for comments and help
  26  *
  27  */
  28
  29 #include <linux/mm.h>
  30 #include <linux/module.h>
  31 #include <linux/slab.h>
  32 #include <linux/init.h>
  33 #include <linux/bootmem.h>
  34 #include <linux/hash.h>
  35 #include <linux/pid_namespace.h>
  36 #include <linux/init_task.h>
  37
  38 #define pid_hashfn(nr, ns)      \
  39         hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
  40 static struct hlist_head *pid_hash;
  41 static int pidhash_shift;
  42 struct pid init_struct_pid = INIT_STRUCT_PID;
  43
  44 int pid_max = PID_MAX_DEFAULT;
  45
  46 #define RESERVED_PIDS           300
  47
  48 int pid_max_min = RESERVED_PIDS + 1;
  49 int pid_max_max = PID_MAX_LIMIT;
  50
  51 #define BITS_PER_PAGE           (PAGE_SIZE*8)
  52 #define BITS_PER_PAGE_MASK      (BITS_PER_PAGE-1)
  53
  54 static inline int mk_pid(struct pid_namespace *pid_ns,
  55                 struct pidmap *map, int off)
  56 {
  57         return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
  58 }
  59
  60 #define find_next_offset(map, off)                                      \
  61                 find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
  62
  63 /*
  64  * PID-map pages start out as NULL, they get allocated upon
  65  * first use and are never deallocated. This way a low pid_max
  66  * value does not cause lots of bitmaps to be allocated, but
  67  * the scheme scales to up to 4 million PIDs, runtime.
  68  */
  69 struct pid_namespace init_pid_ns = {
  70         .kref = {
  71                 .refcount       = ATOMIC_INIT(2),
  72         },
  73         .pidmap = {
  74                 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
  75         },
  76         .last_pid = 0,
  77         .level = 0,
  78         .child_reaper = &init_task,
  79 };
  80 EXPORT_SYMBOL_GPL(init_pid_ns);
  81
  82 int is_container_init(struct task_struct *tsk)
  83 {
  84         int ret = 0;
  85         struct pid *pid;
  86
  87         rcu_read_lock();
  88         pid = task_pid(tsk);
  89         if (pid != NULL && pid->numbers[pid->level].nr == 1)
  90                 ret = 1;
  91         rcu_read_unlock();
  92
  93         return ret;
  94 }
  95 EXPORT_SYMBOL(is_container_init);
  96
  97 /*
  98  * Note: disable interrupts while the pidmap_lock is held as an
  99  * interrupt might come in and do read_lock(&tasklist_lock).
 100  *
 101  * If we don't disable interrupts there is a nasty deadlock between
 102  * detach_pid()->free_pid() and another cpu that does
 103  * spin_lock(&pidmap_lock) followed by an interrupt routine that does
 104  * read_lock(&tasklist_lock);
 105  *
 106  * After we clean up the tasklist_lock and know there are no
 107  * irq handlers that take it we can leave the interrupts enabled.
 108  * For now it is easier to be safe than to prove it can't happen.
 109  */
 110
 111 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 112
 113 static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid)
 114 {
 115         struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
 116         int offset = pid & BITS_PER_PAGE_MASK;
 117
 118         clear_bit(offset, map->page);
 119         atomic_inc(&map->nr_free);
 120 }
 121
 122 static int alloc_pidmap(struct pid_namespace *pid_ns)
 123 {
 124         int i, offset, max_scan, pid, last = pid_ns->last_pid;
 125         struct pidmap *map;
 126
 127         pid = last + 1;
 128         if (pid >= pid_max)
 129                 pid = RESERVED_PIDS;
 130         offset = pid & BITS_PER_PAGE_MASK;
 131         map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
 132         max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
 133         for (i = 0; i <= max_scan; ++i) {
 134                 if (unlikely(!map->page)) {
 135                         void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
 136                         /*
 137                          * Free the page if someone raced with us
 138                          * installing it:
 139                          */
 140                         spin_lock_irq(&pidmap_lock);
 141                         if (map->page)
 142                                 kfree(page);
 143                         else
 144                                 map->page = page;
 145                         spin_unlock_irq(&pidmap_lock);
 146                         if (unlikely(!map->page))
 147                                 break;
 148                 }
 149                 if (likely(atomic_read(&map->nr_free))) {
 150                         do {
 151                                 if (!test_and_set_bit(offset, map->page)) {
 152                                         atomic_dec(&map->nr_free);
 153                                         pid_ns->last_pid = pid;
 154                                         return pid;
 155                                 }
 156                                 offset = find_next_offset(map, offset);
 157                                 pid = mk_pid(pid_ns, map, offset);
 158                         /*
 159                          * find_next_offset() found a bit, the pid from it
 160                          * is in-bounds, and if we fell back to the last
 161                          * bitmap block and the final block was the same
 162                          * as the starting point, pid is before last_pid.
 163                          */
 164                         } while (offset < BITS_PER_PAGE && pid < pid_max &&
 165                                         (i != max_scan || pid < last ||
 166                                             !((last+1) & BITS_PER_PAGE_MASK)));
 167                 }
 168                 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
 169                         ++map;
 170                         offset = 0;
 171                 } else {
 172                         map = &pid_ns->pidmap[0];
 173                         offset = RESERVED_PIDS;
 174                         if (unlikely(last == offset))
 175                                 break;
 176                 }
 177                 pid = mk_pid(pid_ns, map, offset);
 178         }
 179         return -1;
 180 }
 181
 182 static int next_pidmap(struct pid_namespace *pid_ns, int last)
 183 {
 184         int offset;
 185         struct pidmap *map, *end;
 186
 187         offset = (last + 1) & BITS_PER_PAGE_MASK;
 188         map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
 189         end = &pid_ns->pidmap[PIDMAP_ENTRIES];
 190         for (; map < end; map++, offset = 0) {
 191                 if (unlikely(!map->page))
 192                         continue;
 193                 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
 194                 if (offset < BITS_PER_PAGE)
 195                         return mk_pid(pid_ns, map, offset);
 196         }
 197         return -1;
 198 }
 199
 200 fastcall void put_pid(struct pid *pid)
 201 {
 202         struct pid_namespace *ns;
 203
 204         if (!pid)
 205                 return;
 206
 207         ns = pid->numbers[pid->level].ns;
 208         if ((atomic_read(&pid->count) == 1) ||
 209              atomic_dec_and_test(&pid->count)) {
 210                 kmem_cache_free(ns->pid_cachep, pid);
 211                 put_pid_ns(ns);
 212         }
 213 }
 214 EXPORT_SYMBOL_GPL(put_pid);
 215
 216 static void delayed_put_pid(struct rcu_head *rhp)
 217 {
 218         struct pid *pid = container_of(rhp, struct pid, rcu);
 219         put_pid(pid);
 220 }
 221
 222 fastcall void free_pid(struct pid *pid)
 223 {
 224         /* We can be called with write_lock_irq(&tasklist_lock) held */
 225         int i;
 226         unsigned long flags;
 227
 228         spin_lock_irqsave(&pidmap_lock, flags);
 229         for (i = 0; i <= pid->level; i++)
 230                 hlist_del_rcu(&pid->numbers[i].pid_chain);
 231         spin_unlock_irqrestore(&pidmap_lock, flags);
 232
 233         for (i = 0; i <= pid->level; i++)
 234                 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr);
 235
 236         call_rcu(&pid->rcu, delayed_put_pid);
 237 }
 238
 239 struct pid *alloc_pid(struct pid_namespace *ns)
 240 {
 241         struct pid *pid;
 242         enum pid_type type;
 243         int i, nr;
 244         struct pid_namespace *tmp;
 245         struct upid *upid;
 246
 247         pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
 248         if (!pid)
 249                 goto out;
 250
 251         tmp = ns;
 252         for (i = ns->level; i >= 0; i--) {
 253                 nr = alloc_pidmap(tmp);
 254                 if (nr < 0)
 255                         goto out_free;
 256
 257                 pid->numbers[i].nr = nr;
 258                 pid->numbers[i].ns = tmp;
 259                 tmp = tmp->parent;
 260         }
 261
 262         get_pid_ns(ns);
 263         pid->level = ns->level;
 264         pid->nr = pid->numbers[0].nr;
 265         atomic_set(&pid->count, 1);
 266         for (type = 0; type < PIDTYPE_MAX; ++type)
 267                 INIT_HLIST_HEAD(&pid->tasks[type]);
 268
 269         spin_lock_irq(&pidmap_lock);
 270         for (i = ns->level; i >= 0; i--) {
 271                 upid = &pid->numbers[i];
 272                 hlist_add_head_rcu(&upid->pid_chain,
 273                                 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
 274         }
 275         spin_unlock_irq(&pidmap_lock);
 276
 277 out:
 278         return pid;
 279
 280 out_free:
 281         for (i++; i <= ns->level; i++)
 282                 free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr);
 283
 284         kmem_cache_free(ns->pid_cachep, pid);
 285         pid = NULL;
 286         goto out;
 287 }
 288
 289 struct pid * fastcall find_pid_ns(int nr, struct pid_namespace *ns)
 290 {
 291         struct hlist_node *elem;
 292         struct upid *pnr;
 293
 294         hlist_for_each_entry_rcu(pnr, elem,
 295                         &pid_hash[pid_hashfn(nr, ns)], pid_chain)
 296                 if (pnr->nr == nr && pnr->ns == ns)
 297                         return container_of(pnr, struct pid,
 298                                         numbers[ns->level]);
 299
 300         return NULL;
 301 }
 302 EXPORT_SYMBOL_GPL(find_pid_ns);
 303
 304 /*
 305  * attach_pid() must be called with the tasklist_lock write-held.
 306  */
 307 int fastcall attach_pid(struct task_struct *task, enum pid_type type,
 308                 struct pid *pid)
 309 {
 310         struct pid_link *link;
 311
 312         link = &task->pids[type];
 313         link->pid = pid;
 314         hlist_add_head_rcu(&link->node, &pid->tasks[type]);
 315
 316         return 0;
 317 }
 318
 319 void fastcall detach_pid(struct task_struct *task, enum pid_type type)
 320 {
 321         struct pid_link *link;
 322         struct pid *pid;
 323         int tmp;
 324
 325         link = &task->pids[type];
 326         pid = link->pid;
 327
 328         hlist_del_rcu(&link->node);
 329         link->pid = NULL;
 330
 331         for (tmp = PIDTYPE_MAX; --tmp >= 0; )
 332                 if (!hlist_empty(&pid->tasks[tmp]))
 333                         return;
 334
 335         free_pid(pid);
 336 }
 337
 338 /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
 339 void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
 340                            enum pid_type type)
 341 {
 342         new->pids[type].pid = old->pids[type].pid;
 343         hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
 344         old->pids[type].pid = NULL;
 345 }
 346
 347 struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
 348 {
 349         struct task_struct *result = NULL;
 350         if (pid) {
 351                 struct hlist_node *first;
 352                 first = rcu_dereference(pid->tasks[type].first);
 353                 if (first)
 354                         result = hlist_entry(first, struct task_struct, pids[(type)].node);
 355         }
 356         return result;
 357 }
 358
 359 /*
 360  * Must be called under rcu_read_lock() or with tasklist_lock read-held.
 361  */
 362 struct task_struct *find_task_by_pid_type_ns(int type, int nr,
 363                 struct pid_namespace *ns)
 364 {
 365         return pid_task(find_pid_ns(nr, ns), type);
 366 }
 367
 368 EXPORT_SYMBOL(find_task_by_pid_type_ns);
 369
 370 struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 371 {
 372         struct pid *pid;
 373         rcu_read_lock();
 374         pid = get_pid(task->pids[type].pid);
 375         rcu_read_unlock();
 376         return pid;
 377 }
 378
 379 struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type)
 380 {
 381         struct task_struct *result;
 382         rcu_read_lock();
 383         result = pid_task(pid, type);
 384         if (result)
 385                 get_task_struct(result);
 386         rcu_read_unlock();
 387         return result;
 388 }
 389
 390 struct pid *find_get_pid(pid_t nr)
 391 {
 392         struct pid *pid;
 393
 394         rcu_read_lock();
 395         pid = get_pid(find_vpid(nr));
 396         rcu_read_unlock();
 397
 398         return pid;
 399 }
 400
 401 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
 402 {
 403         struct upid *upid;
 404         pid_t nr = 0;
 405
 406         if (pid && ns->level <= pid->level) {
 407                 upid = &pid->numbers[ns->level];
 408                 if (upid->ns == ns)
 409                         nr = upid->nr;
 410         }
 411         return nr;
 412 }
 413
 414 /*
 415  * Used by proc to find the first pid that is greater then or equal to nr.
 416  *
 417  * If there is a pid at nr this function is exactly the same as find_pid.
 418  */
 419 struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 420 {
 421         struct pid *pid;
 422
 423         do {
 424                 pid = find_pid_ns(nr, ns);
 425                 if (pid)
 426                         break;
 427                 nr = next_pidmap(ns, nr);
 428         } while (nr > 0);
 429
 430         return pid;
 431 }
 432 EXPORT_SYMBOL_GPL(find_get_pid);
 433
 434 struct pid_cache {
 435         int nr_ids;
 436         char name[16];
 437         struct kmem_cache *cachep;
 438         struct list_head list;
 439 };
 440
 441 static LIST_HEAD(pid_caches_lh);
 442 static DEFINE_MUTEX(pid_caches_mutex);
 443
 444 /*
 445  * creates the kmem cache to allocate pids from.
 446  * @nr_ids: the number of numerical ids this pid will have to carry
 447  */
 448
 449 static struct kmem_cache *create_pid_cachep(int nr_ids)
 450 {
 451         struct pid_cache *pcache;
 452         struct kmem_cache *cachep;
 453
 454         mutex_lock(&pid_caches_mutex);
 455         list_for_each_entry (pcache, &pid_caches_lh, list)
 456                 if (pcache->nr_ids == nr_ids)
 457                         goto out;
 458
 459         pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
 460         if (pcache == NULL)
 461                 goto err_alloc;
 462
 463         snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
 464         cachep = kmem_cache_create(pcache->name,
 465                         sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
 466                         0, SLAB_HWCACHE_ALIGN, NULL);
 467         if (cachep == NULL)
 468                 goto err_cachep;
 469
 470         pcache->nr_ids = nr_ids;
 471         pcache->cachep = cachep;
 472         list_add(&pcache->list, &pid_caches_lh);
 473 out:
 474         mutex_unlock(&pid_caches_mutex);
 475         return pcache->cachep;
 476
 477 err_cachep:
 478         kfree(pcache);
 479 err_alloc:
 480         mutex_unlock(&pid_caches_mutex);
 481         return NULL;
 482 }
 483
 484 static struct pid_namespace *create_pid_namespace(int level)
 485 {
 486         struct pid_namespace *ns;
 487         int i;
 488
 489         ns = kmalloc(sizeof(struct pid_namespace), GFP_KERNEL);
 490         if (ns == NULL)
 491                 goto out;
 492
 493         ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
 494         if (!ns->pidmap[0].page)
 495                 goto out_free;
 496
 497         ns->pid_cachep = create_pid_cachep(level + 1);
 498         if (ns->pid_cachep == NULL)
 499                 goto out_free_map;
 500
 501         kref_init(&ns->kref);
 502         ns->last_pid = 0;
 503         ns->child_reaper = NULL;
 504         ns->level = level;
 505
 506         set_bit(0, ns->pidmap[0].page);
 507         atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
 508
 509         for (i = 1; i < PIDMAP_ENTRIES; i++) {
 510                 ns->pidmap[i].page = 0;
 511                 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
 512         }
 513
 514         return ns;
 515
 516 out_free_map:
 517         kfree(ns->pidmap[0].page);
 518 out_free:
 519         kfree(ns);
 520 out:
 521         return ERR_PTR(-ENOMEM);
 522 }
 523
 524 static void destroy_pid_namespace(struct pid_namespace *ns)
 525 {
 526         int i;
 527
 528         for (i = 0; i < PIDMAP_ENTRIES; i++)
 529                 kfree(ns->pidmap[i].page);
 530         kfree(ns);
 531 }
 532
 533 struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
 534 {
 535         struct pid_namespace *new_ns;
 536
 537         BUG_ON(!old_ns);
 538         new_ns = get_pid_ns(old_ns);
 539         if (!(flags & CLONE_NEWPID))
 540                 goto out;
 541
 542         new_ns = ERR_PTR(-EINVAL);
 543         if (flags & CLONE_THREAD)
 544                 goto out_put;
 545
 546         new_ns = create_pid_namespace(old_ns->level + 1);
 547         if (!IS_ERR(new_ns))
 548                 new_ns->parent = get_pid_ns(old_ns);
 549
 550 out_put:
 551         put_pid_ns(old_ns);
 552 out:
 553         return new_ns;
 554 }
 555
 556 void free_pid_ns(struct kref *kref)
 557 {
 558         struct pid_namespace *ns, *parent;
 559
 560         ns = container_of(kref, struct pid_namespace, kref);
 561
 562         parent = ns->parent;
 563         destroy_pid_namespace(ns);
 564
 565         if (parent != NULL)
 566                 put_pid_ns(parent);
 567 }
 568
 569 /*
 570  * The pid hash table is scaled according to the amount of memory in the
 571  * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
 572  * more.
 573  */
 574 void __init pidhash_init(void)
 575 {
 576         int i, pidhash_size;
 577         unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
 578
 579         pidhash_shift = max(4, fls(megabytes * 4));
 580         pidhash_shift = min(12, pidhash_shift);
 581         pidhash_size = 1 << pidhash_shift;
 582
 583         printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
 584                 pidhash_size, pidhash_shift,
 585                 pidhash_size * sizeof(struct hlist_head));
 586
 587         pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
 588         if (!pid_hash)
 589                 panic("Could not alloc pidhash!\n");
 590         for (i = 0; i < pidhash_size; i++)
 591                 INIT_HLIST_HEAD(&pid_hash[i]);
 592 }
 593
 594 void __init pidmap_init(void)
 595 {
 596         init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
 597         /* Reserve PID 0. We never call free_pidmap(0) */
 598         set_bit(0, init_pid_ns.pidmap[0].page);
 599         atomic_dec(&init_pid_ns.pidmap[0].nr_free);
 600
 601         init_pid_ns.pid_cachep = create_pid_cachep(1);
 602         if (init_pid_ns.pid_cachep == NULL)
 603                 panic("Can't create pid_1 cachep\n");
 604 }