/*
 * fs/proc/base.c — from mirror_ubuntu-artful-kernel.git
 * (snapshot at commit "prctl: Add force disable speculation")
 */
1 /*
2 * linux/fs/proc/base.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 *
6 * proc base directory handling functions
7 *
8 * 1999, Al Viro. Rewritten. Now it covers the whole per-process part.
9 * Instead of using magical inumbers to determine the kind of object
10 * we allocate and fill in-core inodes upon lookup. They don't even
11 * go into icache. We cache the reference to task_struct upon lookup too.
12 * Eventually it should become a filesystem in its own. We don't use the
13 * rest of procfs anymore.
14 *
15 *
16 * Changelog:
17 * 17-Jan-2005
18 * Allan Bezerra
19 * Bruna Moreira <bruna.moreira@indt.org.br>
20 * Edjard Mota <edjard.mota@indt.org.br>
21 * Ilias Biris <ilias.biris@indt.org.br>
22 * Mauricio Lin <mauricio.lin@indt.org.br>
23 *
24 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
25 *
26 * A new process specific entry (smaps) included in /proc. It shows the
27 * size of rss for each memory area. The maps entry lacks information
28 * about physical memory size (rss) for each mapped file, i.e.,
29 * rss information for executables and library files.
30 * This additional information is useful for any tools that need to know
31 * about physical memory consumption for a process specific library.
32 *
33 * Changelog:
34 * 21-Feb-2005
35 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
36 * Pud inclusion in the page table walking.
37 *
38 * ChangeLog:
39 * 10-Mar-2005
40 * 10LE Instituto Nokia de Tecnologia - INdT:
41 * A better way to walks through the page table as suggested by Hugh Dickins.
42 *
43 * Simo Piiroinen <simo.piiroinen@nokia.com>:
44 * Smaps information related to shared, private, clean and dirty pages.
45 *
46 * Paul Mundt <paul.mundt@nokia.com>:
47 * Overall revision about smaps.
48 */
49
50 #include <linux/uaccess.h>
51
52 #include <linux/errno.h>
53 #include <linux/time.h>
54 #include <linux/proc_fs.h>
55 #include <linux/stat.h>
56 #include <linux/task_io_accounting_ops.h>
57 #include <linux/init.h>
58 #include <linux/capability.h>
59 #include <linux/file.h>
60 #include <linux/fdtable.h>
61 #include <linux/string.h>
62 #include <linux/seq_file.h>
63 #include <linux/namei.h>
64 #include <linux/mnt_namespace.h>
65 #include <linux/mm.h>
66 #include <linux/swap.h>
67 #include <linux/rcupdate.h>
68 #include <linux/kallsyms.h>
69 #include <linux/stacktrace.h>
70 #include <linux/resource.h>
71 #include <linux/module.h>
72 #include <linux/mount.h>
73 #include <linux/security.h>
74 #include <linux/ptrace.h>
75 #include <linux/tracehook.h>
76 #include <linux/printk.h>
77 #include <linux/cgroup.h>
78 #include <linux/cpuset.h>
79 #include <linux/audit.h>
80 #include <linux/poll.h>
81 #include <linux/nsproxy.h>
82 #include <linux/oom.h>
83 #include <linux/elf.h>
84 #include <linux/pid_namespace.h>
85 #include <linux/user_namespace.h>
86 #include <linux/fs_struct.h>
87 #include <linux/slab.h>
88 #include <linux/sched/autogroup.h>
89 #include <linux/sched/mm.h>
90 #include <linux/sched/coredump.h>
91 #include <linux/sched/debug.h>
92 #include <linux/sched/stat.h>
93 #include <linux/flex_array.h>
94 #include <linux/posix-timers.h>
95 #ifdef CONFIG_HARDWALL
96 #include <asm/hardwall.h>
97 #endif
98 #include <trace/events/oom.h>
99 #include "internal.h"
100 #include "fd.h"
101
102 /* NOTE:
103 * Implementing inode permission operations in /proc is almost
104 * certainly an error. Permission checks need to happen during
105 * each system call not at open time. The reason is that most of
106 * what we wish to check for permissions in /proc varies at runtime.
107 *
108 * The classic example of a problem is opening file descriptors
109 * in /proc for a task before it execs a suid executable.
110 */
111
/*
 * Cached link counts for the /proc/<pid>/task/<tid> and /proc/<pid>
 * directories, computed once at boot by pid_entry_nlink().
 */
static u8 nlink_tid;
static u8 nlink_tgid;

/*
 * One entry in a /proc/<pid> (or per-thread) directory table: the entry
 * name, its mode, and the inode/file operations plus type-specific
 * payload used when the entry is instantiated.
 */
struct pid_entry {
	const char *name;	/* entry name, not NUL-counted (see len) */
	unsigned int len;	/* strlen(name), precomputed by NOD() */
	umode_t mode;		/* file type and permission bits */
	const struct inode_operations *iop;
	const struct file_operations *fop;
	union proc_op op;	/* show callback, link callback, ... */
};
123
/*
 * NOD() is the generic pid_entry initializer; the macros below are
 * shorthands for the common entry kinds.
 */
#define NOD(NAME, MODE, IOP, FOP, OP) { \
	.name = (NAME), \
	.len = sizeof(NAME) - 1, \
	.mode = MODE, \
	.iop = IOP, \
	.fop = FOP, \
	.op = OP, \
}

/* Subdirectory with its own inode and file operations. */
#define DIR(NAME, MODE, iops, fops) \
	NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
/* Symlink whose target is computed by get_link() at traversal time. */
#define LNK(NAME, get_link) \
	NOD(NAME, (S_IFLNK|S_IRWXUGO), \
		&proc_pid_link_inode_operations, NULL, \
		{ .proc_get_link = get_link } )
/* Regular file backed by a dedicated file_operations. */
#define REG(NAME, MODE, fops) \
	NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
/* One-value file rendered entirely by a single show() callback. */
#define ONE(NAME, MODE, show) \
	NOD(NAME, (S_IFREG|(MODE)), \
		NULL, &proc_single_file_operations, \
		{ .proc_show = show } )
/* LSM attribute file under /proc/<pid>/attr/. */
#define ATTR(LSM, NAME, MODE) \
	NOD(NAME, (S_IFREG|(MODE)), \
		NULL, &proc_pid_attr_operations, \
		{ .lsm = LSM })
149
150 /*
151 * Count the number of hardlinks for the pid_entry table, excluding the .
152 * and .. links.
153 */
154 static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
155 unsigned int n)
156 {
157 unsigned int i;
158 unsigned int count;
159
160 count = 2;
161 for (i = 0; i < n; ++i) {
162 if (S_ISDIR(entries[i].mode))
163 ++count;
164 }
165
166 return count;
167 }
168
169 static int get_task_root(struct task_struct *task, struct path *root)
170 {
171 int result = -ENOENT;
172
173 task_lock(task);
174 if (task->fs) {
175 get_fs_root(task->fs, root);
176 result = 0;
177 }
178 task_unlock(task);
179 return result;
180 }
181
182 static int proc_cwd_link(struct dentry *dentry, struct path *path)
183 {
184 struct task_struct *task = get_proc_task(d_inode(dentry));
185 int result = -ENOENT;
186
187 if (task) {
188 task_lock(task);
189 if (task->fs) {
190 get_fs_pwd(task->fs, path);
191 result = 0;
192 }
193 task_unlock(task);
194 put_task_struct(task);
195 }
196 return result;
197 }
198
199 static int proc_root_link(struct dentry *dentry, struct path *path)
200 {
201 struct task_struct *task = get_proc_task(d_inode(dentry));
202 int result = -ENOENT;
203
204 if (task) {
205 result = get_task_root(task, path);
206 put_task_struct(task);
207 }
208 return result;
209 }
210
/*
 * Read handler for /proc/<pid>/cmdline.
 *
 * Two layouts of the command line are handled, distinguished by the last
 * byte of the ARGV area:
 *   - it is '\0': the argument strings occupy exactly [arg_start, arg_end)
 *     and are copied out verbatim;
 *   - otherwise (e.g. after the task rewrote its own argv): a single
 *     string is assumed to start at arg_start and extend into the
 *     environment area [env_start, env_end), stopping at the first '\0'.
 * Reads are inherently racy against the target modifying its own address
 * space.
 */
static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
				     size_t _count, loff_t *pos)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	char *page;
	unsigned long count = _count;
	unsigned long arg_start, arg_end, env_start, env_end;
	unsigned long len1, len2, len;
	unsigned long p;
	char c;
	ssize_t rv;

	BUG_ON(*pos < 0);

	tsk = get_proc_task(file_inode(file));
	if (!tsk)
		return -ESRCH;
	mm = get_task_mm(tsk);
	put_task_struct(tsk);
	if (!mm)
		return 0;
	/* Check if process spawned far enough to have cmdline. */
	if (!mm->env_end) {
		rv = 0;
		goto out_mmput;
	}

	/* Bounce buffer between the target mm and userspace. */
	page = (char *)__get_free_page(GFP_TEMPORARY);
	if (!page) {
		rv = -ENOMEM;
		goto out_mmput;
	}

	/* Snapshot the argv/env boundaries under mmap_sem. */
	down_read(&mm->mmap_sem);
	arg_start = mm->arg_start;
	arg_end = mm->arg_end;
	env_start = mm->env_start;
	env_end = mm->env_end;
	up_read(&mm->mmap_sem);

	BUG_ON(arg_start > arg_end);
	BUG_ON(env_start > env_end);

	len1 = arg_end - arg_start;
	len2 = env_end - env_start;

	/* Empty ARGV. */
	if (len1 == 0) {
		rv = 0;
		goto out_free_page;
	}
	/*
	 * Inherently racy -- command line shares address space
	 * with code and data.
	 */
	rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0);
	if (rv <= 0)
		goto out_free_page;

	rv = 0;

	if (c == '\0') {
		/* Command line (set of strings) occupies whole ARGV. */
		if (len1 <= *pos)
			goto out_free_page;

		p = arg_start + *pos;
		len = len1 - *pos;
		while (count > 0 && len > 0) {
			unsigned int _count;
			int nr_read;

			/* Copy at most one page per iteration. */
			_count = min3(count, len, PAGE_SIZE);
			nr_read = access_remote_vm(mm, p, page, _count, 0);
			if (nr_read < 0)
				rv = nr_read;
			if (nr_read <= 0)
				goto out_free_page;

			if (copy_to_user(buf, page, nr_read)) {
				rv = -EFAULT;
				goto out_free_page;
			}

			p += nr_read;
			len -= nr_read;
			buf += nr_read;
			count -= nr_read;
			rv += nr_read;
		}
	} else {
		/*
		 * Command line (1 string) occupies ARGV and
		 * extends into ENVP.
		 */
		struct {
			unsigned long p;
			unsigned long len;
		} cmdline[2] = {
			{ .p = arg_start, .len = len1 },
			{ .p = env_start, .len = len2 },
		};
		loff_t pos1 = *pos;
		unsigned int i;

		/* Skip whole regions lying entirely before *pos. */
		i = 0;
		while (i < 2 && pos1 >= cmdline[i].len) {
			pos1 -= cmdline[i].len;
			i++;
		}
		while (i < 2) {
			p = cmdline[i].p + pos1;
			len = cmdline[i].len - pos1;
			while (count > 0 && len > 0) {
				unsigned int _count, l;
				int nr_read;
				bool final;

				_count = min3(count, len, PAGE_SIZE);
				nr_read = access_remote_vm(mm, p, page, _count, 0);
				if (nr_read < 0)
					rv = nr_read;
				if (nr_read <= 0)
					goto out_free_page;

				/*
				 * Command line can be shorter than whole ARGV
				 * even if last "marker" byte says it is not.
				 */
				final = false;
				l = strnlen(page, nr_read);
				if (l < nr_read) {
					nr_read = l;
					final = true;
				}

				if (copy_to_user(buf, page, nr_read)) {
					rv = -EFAULT;
					goto out_free_page;
				}

				p += nr_read;
				len -= nr_read;
				buf += nr_read;
				count -= nr_read;
				rv += nr_read;

				/* Hit the terminating NUL: stop here. */
				if (final)
					goto out_free_page;
			}

			/* Only first chunk can be read partially. */
			pos1 = 0;
			i++;
		}
	}

out_free_page:
	free_page((unsigned long)page);
out_mmput:
	mmput(mm);
	if (rv > 0)
		*pos += rv;
	return rv;
}
377
/* File operations for /proc/<pid>/cmdline. */
static const struct file_operations proc_pid_cmdline_ops = {
	.read	= proc_pid_cmdline_read,
	.llseek	= generic_file_llseek,
};
382
#ifdef CONFIG_KALLSYMS
/*
 * Provides a wchan file via kallsyms in a proper one-value-per-file format.
 * Returns the resolved symbol. If that fails, simply return the address.
 */
static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
			  struct pid *pid, struct task_struct *task)
{
	unsigned long wchan;
	char symname[KSYM_NAME_LEN];

	wchan = get_wchan(task);

	/*
	 * Only reveal the symbol name to callers allowed PTRACE_MODE_READ
	 * on the task; everyone else just sees "0".
	 */
	if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)
			&& !lookup_symbol_name(wchan, symname))
		seq_printf(m, "%s", symname);
	else
		seq_putc(m, '0');

	return 0;
}
#endif /* CONFIG_KALLSYMS */
405
/*
 * Take the task's ->cred_guard_mutex (serializing against exec) and verify
 * the caller has ptrace-attach rights over @task. Returns 0 with the mutex
 * held (pair with unlock_trace()), -EPERM, or -EINTR if killed while
 * waiting.
 */
static int lock_trace(struct task_struct *task)
{
	int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
	if (err)
		return err;
	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
		mutex_unlock(&task->signal->cred_guard_mutex);
		return -EPERM;
	}
	return 0;
}

/* Drop the mutex taken by a successful lock_trace(). */
static void unlock_trace(struct task_struct *task)
{
	mutex_unlock(&task->signal->cred_guard_mutex);
}
422
#ifdef CONFIG_STACKTRACE

/* Maximum number of stack frames reported by /proc/<pid>/stack. */
#define MAX_STACK_TRACE_DEPTH	64

/*
 * /proc/<pid>/stack: dump the task's kernel stack trace, one frame per
 * line as "[<address>] symbol". Requires ptrace-attach rights via
 * lock_trace(); %pK additionally applies kernel pointer restrictions.
 */
static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
			  struct pid *pid, struct task_struct *task)
{
	struct stack_trace trace;
	unsigned long *entries;
	int err;
	int i;

	entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
	if (!entries)
		return -ENOMEM;

	trace.nr_entries	= 0;
	trace.max_entries	= MAX_STACK_TRACE_DEPTH;
	trace.entries		= entries;
	trace.skip		= 0;

	err = lock_trace(task);
	if (!err) {
		save_stack_trace_tsk(task, &trace);

		for (i = 0; i < trace.nr_entries; i++) {
			seq_printf(m, "[<%pK>] %pB\n",
				   (void *)entries[i], (void *)entries[i]);
		}
		unlock_trace(task);
	}
	kfree(entries);

	return err;
}
#endif
459
#ifdef CONFIG_SCHED_INFO
/*
 * Provides /proc/PID/schedstat
 */
static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
			      struct pid *pid, struct task_struct *task)
{
	/*
	 * Three fields: se.sum_exec_runtime, sched_info.run_delay and
	 * sched_info.pcount; all zeros when schedstats are disabled.
	 */
	if (unlikely(!sched_info_on()))
		seq_printf(m, "0 0 0\n");
	else
		seq_printf(m, "%llu %llu %lu\n",
		   (unsigned long long)task->se.sum_exec_runtime,
		   (unsigned long long)task->sched_info.run_delay,
		   task->sched_info.pcount);

	return 0;
}
#endif
478
#ifdef CONFIG_LATENCYTOP
/*
 * seq_file show for /proc/<pid>/latency: dump the task's latency-top
 * records, one line per used slot: count, total time, max time, then the
 * recorded backtrace symbols.
 */
static int lstats_show_proc(struct seq_file *m, void *v)
{
	int i;
	struct inode *inode = m->private;
	struct task_struct *task = get_proc_task(inode);

	if (!task)
		return -ESRCH;
	seq_puts(m, "Latency Top version : v0.1\n");
	/* latency_record[] is a fixed array of 32 slots on the task. */
	for (i = 0; i < 32; i++) {
		struct latency_record *lr = &task->latency_record[i];
		/* An empty backtrace marks an unused slot. */
		if (lr->backtrace[0]) {
			int q;
			seq_printf(m, "%i %li %li",
				   lr->count, lr->time, lr->max);
			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
				unsigned long bt = lr->backtrace[q];
				/* 0 and ULONG_MAX both terminate the trace. */
				if (!bt)
					break;
				if (bt == ULONG_MAX)
					break;
				seq_printf(m, " %ps", (void *)bt);
			}
			seq_putc(m, '\n');
		}

	}
	put_task_struct(task);
	return 0;
}

static int lstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, lstats_show_proc, inode);
}

/* Any write to /proc/<pid>/latency clears the task's latency records. */
static ssize_t lstats_write(struct file *file, const char __user *buf,
			    size_t count, loff_t *offs)
{
	struct task_struct *task = get_proc_task(file_inode(file));

	if (!task)
		return -ESRCH;
	clear_all_latency_tracing(task);
	put_task_struct(task);

	return count;
}

static const struct file_operations proc_lstats_operations = {
	.open		= lstats_open,
	.read		= seq_read,
	.write		= lstats_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};

#endif
538
/*
 * /proc/<pid>/oom_score: the task's OOM badness heuristic, normalized to
 * 0..1000 against total RAM plus swap pages.
 */
static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
			  struct pid *pid, struct task_struct *task)
{
	unsigned long totalpages = totalram_pages + total_swap_pages;
	unsigned long points = 0;

	points = oom_badness(task, NULL, NULL, totalpages) *
					1000 / totalpages;
	seq_printf(m, "%lu\n", points);

	return 0;
}
551
/*
 * Human-readable label and unit for each RLIMIT_* index, used by
 * proc_pid_limits(); a NULL unit means the limit is a bare number.
 */
struct limit_names {
	const char *name;
	const char *unit;
};

static const struct limit_names lnames[RLIM_NLIMITS] = {
	[RLIMIT_CPU] = {"Max cpu time", "seconds"},
	[RLIMIT_FSIZE] = {"Max file size", "bytes"},
	[RLIMIT_DATA] = {"Max data size", "bytes"},
	[RLIMIT_STACK] = {"Max stack size", "bytes"},
	[RLIMIT_CORE] = {"Max core file size", "bytes"},
	[RLIMIT_RSS] = {"Max resident set", "bytes"},
	[RLIMIT_NPROC] = {"Max processes", "processes"},
	[RLIMIT_NOFILE] = {"Max open files", "files"},
	[RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
	[RLIMIT_AS] = {"Max address space", "bytes"},
	[RLIMIT_LOCKS] = {"Max file locks", "locks"},
	[RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
	[RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
	[RLIMIT_NICE] = {"Max nice priority", NULL},
	[RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
	[RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
};
575
/* Display limits for a process */
static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
			   struct pid *pid, struct task_struct *task)
{
	unsigned int i;
	unsigned long flags;

	struct rlimit rlim[RLIM_NLIMITS];

	/* Snapshot all rlimits under the sighand lock for consistency. */
	if (!lock_task_sighand(task, &flags))
		return 0;
	memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
	unlock_task_sighand(task, &flags);

	/*
	 * print the file header
	 */
	seq_printf(m, "%-25s %-20s %-20s %-10s\n",
		   "Limit", "Soft Limit", "Hard Limit", "Units");

	/* One row per limit: label, soft, hard, unit (from lnames[]). */
	for (i = 0; i < RLIM_NLIMITS; i++) {
		if (rlim[i].rlim_cur == RLIM_INFINITY)
			seq_printf(m, "%-25s %-20s ",
				   lnames[i].name, "unlimited");
		else
			seq_printf(m, "%-25s %-20lu ",
				   lnames[i].name, rlim[i].rlim_cur);

		if (rlim[i].rlim_max == RLIM_INFINITY)
			seq_printf(m, "%-20s ", "unlimited");
		else
			seq_printf(m, "%-20lu ", rlim[i].rlim_max);

		if (lnames[i].unit)
			seq_printf(m, "%-10s\n", lnames[i].unit);
		else
			seq_putc(m, '\n');
	}

	return 0;
}
617
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
/*
 * /proc/<pid>/syscall: report the syscall the task is blocked in, its six
 * arguments, and the stack pointer / program counter. Prints "running"
 * when the task is not inside a syscall, and only "<nr> sp pc" when the
 * reported number is negative. Requires ptrace-attach rights via
 * lock_trace().
 */
static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
			    struct pid *pid, struct task_struct *task)
{
	long nr;
	unsigned long args[6], sp, pc;
	int res;

	res = lock_trace(task);
	if (res)
		return res;

	if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
		seq_puts(m, "running\n");
	else if (nr < 0)
		seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
	else
		seq_printf(m,
		       "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
		       nr,
		       args[0], args[1], args[2], args[3], args[4], args[5],
		       sp, pc);
	unlock_trace(task);

	return 0;
}
#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
645
646 /************************************************************************/
647 /* Here the fs part begins */
648 /************************************************************************/
649
650 /* permission checks */
651 static int proc_fd_access_allowed(struct inode *inode)
652 {
653 struct task_struct *task;
654 int allowed = 0;
655 /* Allow access to a task's file descriptors if it is us or we
656 * may use ptrace attach to the process and find out that
657 * information.
658 */
659 task = get_proc_task(inode);
660 if (task) {
661 allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
662 put_task_struct(task);
663 }
664 return allowed;
665 }
666
/*
 * setattr for /proc/<pid> inodes. chmod is always refused (ATTR_MODE),
 * and inodes whose uid/gid have no mapping in the superblock's user
 * namespace cannot be modified at all.
 */
int proc_setattr(struct dentry *dentry, struct iattr *attr)
{
	int error;
	struct inode *inode = d_inode(dentry);
	struct user_namespace *s_user_ns;

	if (attr->ia_valid & ATTR_MODE)
		return -EPERM;

	/* Don't let anyone mess with weird proc files */
	s_user_ns = inode->i_sb->s_user_ns;
	if (!kuid_has_mapping(s_user_ns, inode->i_uid) ||
	    !kgid_has_mapping(s_user_ns, inode->i_gid))
		return -EPERM;

	error = setattr_prepare(dentry, attr);
	if (error)
		return error;

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}
690
/*
 * May current process learn task's sched/cmdline info (for hide_pid_min=1)
 * or euid/egid (for hide_pid_min=2)?
 */
static bool has_pid_permissions(struct pid_namespace *pid,
				 struct task_struct *task,
				 int hide_pid_min)
{
	/* hide_pid below the threshold: no restriction applies. */
	if (pid->hide_pid < hide_pid_min)
		return true;
	/* Members of the configured gid= mount group are exempt. */
	if (in_group_p(pid->pid_gid))
		return true;
	/* Otherwise fall back to the ptrace-read access check. */
	return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
}
705
706
/*
 * permission() for /proc/<pid> directories: enforce the hidepid= mount
 * option before falling back to normal mode-bit checking.
 */
static int proc_pid_permission(struct inode *inode, int mask)
{
	struct pid_namespace *pid = inode->i_sb->s_fs_info;
	struct task_struct *task;
	bool has_perms;

	task = get_proc_task(inode);
	if (!task)
		return -ESRCH;
	has_perms = has_pid_permissions(pid, task, HIDEPID_NO_ACCESS);
	put_task_struct(task);

	if (!has_perms) {
		if (pid->hide_pid == HIDEPID_INVISIBLE) {
			/*
			 * Let's make getdents(), stat(), and open()
			 * consistent with each other.  If a process
			 * may not stat() a file, it shouldn't be seen
			 * in procfs at all.
			 */
			return -ENOENT;
		}

		return -EPERM;
	}
	return generic_permission(inode, mask);
}
734
735
736
/* Default inode operations for proc entries: only setattr is special. */
static const struct inode_operations proc_def_inode_operations = {
	.setattr	= proc_setattr,
};

/*
 * seq_file show for ONE() entries: resolve the pid namespace, pid and
 * task for this inode and delegate to the entry's proc_show() callback.
 */
static int proc_single_show(struct seq_file *m, void *v)
{
	struct inode *inode = m->private;
	struct pid_namespace *ns;
	struct pid *pid;
	struct task_struct *task;
	int ret;

	ns = inode->i_sb->s_fs_info;
	pid = proc_pid(inode);
	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task)
		return -ESRCH;

	ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);

	put_task_struct(task);
	return ret;
}

static int proc_single_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, proc_single_show, inode);
}

/* Shared file operations for all one-value ONE() proc files. */
static const struct file_operations proc_single_file_operations = {
	.open		= proc_single_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
772
773
/*
 * Resolve and pin the target task's mm for a /proc/<pid> file, after an
 * access check in @mode (PTRACE_MODE_*). Returns an ERR_PTR on failure or
 * NULL if the task has no mm. On success the mm_struct itself is pinned
 * with mmgrab() so it cannot be freed, but its address space is NOT kept
 * alive -- users must take a temporary mmget_not_zero() reference before
 * touching memory.
 */
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
	struct task_struct *task = get_proc_task(inode);
	struct mm_struct *mm = ERR_PTR(-ESRCH);

	if (task) {
		mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
		put_task_struct(task);

		if (!IS_ERR_OR_NULL(mm)) {
			/* ensure this mm_struct can't be freed */
			mmgrab(mm);
			/* but do not pin its memory */
			mmput(mm);
		}
	}

	return mm;
}
793
/*
 * Common open helper: stash the access-checked mm in file->private_data
 * (may be NULL if the task has no mm; readers treat that as EOF).
 */
static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
{
	struct mm_struct *mm = proc_mem_open(inode, mode);

	if (IS_ERR(mm))
		return PTR_ERR(mm);

	file->private_data = mm;
	return 0;
}

/* open() for /proc/<pid>/mem: requires full ptrace-attach rights. */
static int mem_open(struct inode *inode, struct file *file)
{
	int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);

	/* OK to pass negative loff_t, we can catch out-of-range */
	file->f_mode |= FMODE_UNSIGNED_OFFSET;

	return ret;
}
814
/*
 * Common read/write engine for /proc/<pid>/mem: shuttle up to @count
 * bytes between userspace and the target mm at offset *ppos, one page at
 * a time through a bounce buffer. @write selects direction. FOLL_FORCE
 * allows access regardless of the target's VMA protections. Returns bytes
 * transferred, or a negative error if nothing was transferred.
 */
static ssize_t mem_rw(struct file *file, char __user *buf,
			size_t count, loff_t *ppos, int write)
{
	struct mm_struct *mm = file->private_data;
	unsigned long addr = *ppos;
	ssize_t copied;
	char *page;
	unsigned int flags;

	if (!mm)
		return 0;

	page = (char *)__get_free_page(GFP_TEMPORARY);
	if (!page)
		return -ENOMEM;

	copied = 0;
	/* The open-time mmgrab() does not pin the address space; do so now. */
	if (!mmget_not_zero(mm))
		goto free;

	flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);

	while (count > 0) {
		int this_len = min_t(int, count, PAGE_SIZE);

		if (write && copy_from_user(page, buf, this_len)) {
			copied = -EFAULT;
			break;
		}

		this_len = access_remote_vm(mm, addr, page, this_len, flags);
		if (!this_len) {
			/* Only report -EIO if no bytes were moved at all. */
			if (!copied)
				copied = -EIO;
			break;
		}

		if (!write && copy_to_user(buf, page, this_len)) {
			copied = -EFAULT;
			break;
		}

		buf += this_len;
		addr += this_len;
		copied += this_len;
		count -= this_len;
	}
	*ppos = addr;

	mmput(mm);
free:
	free_page((unsigned long) page);
	return copied;
}
869
/* read() for /proc/<pid>/mem: thin wrapper over mem_rw(). */
static ssize_t mem_read(struct file *file, char __user *buf,
			size_t count, loff_t *ppos)
{
	return mem_rw(file, buf, count, ppos, 0);
}

/* write() for /proc/<pid>/mem; the cast only drops const for mem_rw(). */
static ssize_t mem_write(struct file *file, const char __user *buf,
			 size_t count, loff_t *ppos)
{
	return mem_rw(file, (char __user*)buf, count, ppos, 1);
}
881
882 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
883 {
884 switch (orig) {
885 case 0:
886 file->f_pos = offset;
887 break;
888 case 1:
889 file->f_pos += offset;
890 break;
891 default:
892 return -EINVAL;
893 }
894 force_successful_syscall_return();
895 return file->f_pos;
896 }
897
/* release(): drop the mmgrab() reference taken at open time. */
static int mem_release(struct inode *inode, struct file *file)
{
	struct mm_struct *mm = file->private_data;
	if (mm)
		mmdrop(mm);
	return 0;
}

/* File operations for /proc/<pid>/mem. */
static const struct file_operations proc_mem_operations = {
	.llseek		= mem_lseek,
	.read		= mem_read,
	.write		= mem_write,
	.open		= mem_open,
	.release	= mem_release,
};
913
/* open() for /proc/<pid>/environ: read-level ptrace access suffices. */
static int environ_open(struct inode *inode, struct file *file)
{
	return __mem_open(inode, file, PTRACE_MODE_READ);
}

/*
 * Read handler for /proc/<pid>/environ: copy the target's environment
 * block [env_start, env_end) out through a bounce page. *ppos is a byte
 * offset into that block.
 */
static ssize_t environ_read(struct file *file, char __user *buf,
			size_t count, loff_t *ppos)
{
	char *page;
	unsigned long src = *ppos;
	int ret = 0;
	struct mm_struct *mm = file->private_data;
	unsigned long env_start, env_end;

	/* Ensure the process spawned far enough to have an environment. */
	if (!mm || !mm->env_end)
		return 0;

	page = (char *)__get_free_page(GFP_TEMPORARY);
	if (!page)
		return -ENOMEM;

	ret = 0;
	/* Pin the address space before touching it (open only grabbed it). */
	if (!mmget_not_zero(mm))
		goto free;

	/* Snapshot the environment boundaries under mmap_sem. */
	down_read(&mm->mmap_sem);
	env_start = mm->env_start;
	env_end = mm->env_end;
	up_read(&mm->mmap_sem);

	while (count > 0) {
		size_t this_len, max_len;
		int retval;

		/* Stop at the end of the environment block. */
		if (src >= (env_end - env_start))
			break;

		this_len = env_end - (env_start + src);

		/* Copy at most one page per iteration. */
		max_len = min_t(size_t, PAGE_SIZE, count);
		this_len = min(max_len, this_len);

		retval = access_remote_vm(mm, (env_start + src), page, this_len, 0);

		if (retval <= 0) {
			ret = retval;
			break;
		}

		if (copy_to_user(buf, page, retval)) {
			ret = -EFAULT;
			break;
		}

		ret += retval;
		src += retval;
		buf += retval;
		count -= retval;
	}
	*ppos = src;
	mmput(mm);

free:
	free_page((unsigned long) page);
	return ret;
}

/* File operations for /proc/<pid>/environ. */
static const struct file_operations proc_environ_operations = {
	.open		= environ_open,
	.read		= environ_read,
	.llseek		= generic_file_llseek,
	.release	= mem_release,
};
988
/* open() for /proc/<pid>/auxv: read-level ptrace access suffices. */
static int auxv_open(struct inode *inode, struct file *file)
{
	return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
}

/*
 * Read handler for /proc/<pid>/auxv: dump the saved ELF auxiliary vector.
 * saved_auxv is a flat array of (type, value) pairs terminated by an
 * AT_NULL entry; scan for the terminator to find how much to expose.
 */
static ssize_t auxv_read(struct file *file, char __user *buf,
			size_t count, loff_t *ppos)
{
	struct mm_struct *mm = file->private_data;
	unsigned int nwords = 0;

	if (!mm)
		return 0;
	do {
		nwords += 2;
	} while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
	return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
				       nwords * sizeof(mm->saved_auxv[0]));
}

/* File operations for /proc/<pid>/auxv. */
static const struct file_operations proc_auxv_operations = {
	.open		= auxv_open,
	.read		= auxv_read,
	.llseek		= generic_file_llseek,
	.release	= mem_release,
};
1015
/*
 * Read handler for the legacy /proc/<pid>/oom_adj: map the effective
 * oom_score_adj back onto the old oom_adj scale (OOM_ADJUST_MIN..MAX,
 * with OOM_SCORE_ADJ_MAX reported as OOM_ADJUST_MAX).
 */
static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
			    loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	char buffer[PROC_NUMBUF];
	int oom_adj = OOM_ADJUST_MIN;
	size_t len;

	if (!task)
		return -ESRCH;
	if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
		oom_adj = OOM_ADJUST_MAX;
	else
		oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
			  OOM_SCORE_ADJ_MAX;
	put_task_struct(task);
	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
	return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
1035
/*
 * Common setter for oom_adj (@legacy=true) and oom_score_adj. Lowering
 * the score below the task's oom_score_adj_min requires CAP_SYS_RESOURCE.
 * After updating the target, the new value is propagated to all other
 * processes (not threads) that share the same mm, so the whole sharing
 * group keeps a consistent OOM score. Serialized by a local mutex.
 */
static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
{
	static DEFINE_MUTEX(oom_adj_mutex);
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	int err = 0;

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;

	mutex_lock(&oom_adj_mutex);
	if (legacy) {
		if (oom_adj < task->signal->oom_score_adj &&
				!capable(CAP_SYS_RESOURCE)) {
			err = -EACCES;
			goto err_unlock;
		}
		/*
		 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
		 * /proc/pid/oom_score_adj instead.
		 */
		pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
			  current->comm, task_pid_nr(current), task_pid_nr(task),
			  task_pid_nr(task));
	} else {
		if ((short)oom_adj < task->signal->oom_score_adj_min &&
				!capable(CAP_SYS_RESOURCE)) {
			err = -EACCES;
			goto err_unlock;
		}
	}

	/*
	 * Make sure we will check other processes sharing the mm if this is
	 * not vfork which wants its own oom_score_adj.
	 * pin the mm so it doesn't go away and get reused after task_unlock
	 */
	if (!task->vfork_done) {
		struct task_struct *p = find_lock_task_mm(task);

		if (p) {
			if (atomic_read(&p->mm->mm_users) > 1) {
				mm = p->mm;
				mmgrab(mm);
			}
			task_unlock(p);
		}
	}

	task->signal->oom_score_adj = oom_adj;
	if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
		task->signal->oom_score_adj_min = (short)oom_adj;
	trace_oom_score_adj_update(task);

	if (mm) {
		struct task_struct *p;

		/* Propagate the new score to every other sharer of the mm. */
		rcu_read_lock();
		for_each_process(p) {
			if (same_thread_group(task, p))
				continue;

			/* do not touch kernel threads or the global init */
			if (p->flags & PF_KTHREAD || is_global_init(p))
				continue;

			task_lock(p);
			if (!p->vfork_done && process_shares_mm(p, mm)) {
				pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
						task_pid_nr(p), p->comm,
						p->signal->oom_score_adj, oom_adj,
						task_pid_nr(task), task->comm);
				p->signal->oom_score_adj = oom_adj;
				if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
					p->signal->oom_score_adj_min = (short)oom_adj;
			}
			task_unlock(p);
		}
		rcu_read_unlock();
		mmdrop(mm);
	}
err_unlock:
	mutex_unlock(&oom_adj_mutex);
	put_task_struct(task);
	return err;
}
1123
/*
 * /proc/pid/oom_adj exists solely for backwards compatibility with previous
 * kernels.  The effective policy is defined by oom_score_adj, which has a
 * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
 * Values written to oom_adj are simply mapped linearly to oom_score_adj.
 * Processes that become oom disabled via oom_adj will still be oom disabled
 * with this implementation.
 *
 * oom_adj cannot be removed since existing userspace binaries use it.
 */
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	char buffer[PROC_NUMBUF];
	int oom_adj;
	int err;

	memset(buffer, 0, sizeof(buffer));
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count)) {
		err = -EFAULT;
		goto out;
	}

	err = kstrtoint(strstrip(buffer), 0, &oom_adj);
	if (err)
		goto out;
	/* Valid range is OOM_ADJUST_MIN..OOM_ADJUST_MAX plus OOM_DISABLE. */
	if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
	     oom_adj != OOM_DISABLE) {
		err = -EINVAL;
		goto out;
	}

	/*
	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
	 * value is always attainable.
	 */
	if (oom_adj == OOM_ADJUST_MAX)
		oom_adj = OOM_SCORE_ADJ_MAX;
	else
		oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;

	err = __set_oom_adj(file, oom_adj, true);
out:
	return err < 0 ? err : count;
}

/* File operations for the legacy /proc/<pid>/oom_adj. */
static const struct file_operations proc_oom_adj_operations = {
	.read		= oom_adj_read,
	.write		= oom_adj_write,
	.llseek		= generic_file_llseek,
};
1177
/* Read handler for /proc/<pid>/oom_score_adj: print the current value. */
static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
					size_t count, loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	char buffer[PROC_NUMBUF];
	short oom_score_adj = OOM_SCORE_ADJ_MIN;
	size_t len;

	if (!task)
		return -ESRCH;
	oom_score_adj = task->signal->oom_score_adj;
	put_task_struct(task);
	len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
	return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
1193
1194 static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1195 size_t count, loff_t *ppos)
1196 {
1197 char buffer[PROC_NUMBUF];
1198 int oom_score_adj;
1199 int err;
1200
1201 memset(buffer, 0, sizeof(buffer));
1202 if (count > sizeof(buffer) - 1)
1203 count = sizeof(buffer) - 1;
1204 if (copy_from_user(buffer, buf, count)) {
1205 err = -EFAULT;
1206 goto out;
1207 }
1208
1209 err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
1210 if (err)
1211 goto out;
1212 if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
1213 oom_score_adj > OOM_SCORE_ADJ_MAX) {
1214 err = -EINVAL;
1215 goto out;
1216 }
1217
1218 err = __set_oom_adj(file, oom_score_adj, false);
1219 out:
1220 return err < 0 ? err : count;
1221 }
1222
/* File operations for /proc/<pid>/oom_score_adj. */
static const struct file_operations proc_oom_score_adj_operations = {
	.read = oom_score_adj_read,
	.write = oom_score_adj_write,
	.llseek = default_llseek,
};
1228
1229 #ifdef CONFIG_AUDITSYSCALL
1230 #define TMPBUFLEN 11
1231 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
1232 size_t count, loff_t *ppos)
1233 {
1234 struct inode * inode = file_inode(file);
1235 struct task_struct *task = get_proc_task(inode);
1236 ssize_t length;
1237 char tmpbuf[TMPBUFLEN];
1238
1239 if (!task)
1240 return -ESRCH;
1241 length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1242 from_kuid(file->f_cred->user_ns,
1243 audit_get_loginuid(task)));
1244 put_task_struct(task);
1245 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1246 }
1247
/*
 * Write /proc/<pid>/loginuid: set the audit loginuid of the calling
 * task.  Only the task itself may write it, partial writes are
 * rejected, and the value must map to a valid kuid in the opener's
 * user namespace (or be AUDIT_UID_UNSET to clear it).
 */
static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
				   size_t count, loff_t *ppos)
{
	struct inode * inode = file_inode(file);
	uid_t loginuid;
	kuid_t kloginuid;
	int rv;

	/* A task may only set its own loginuid. */
	rcu_read_lock();
	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
		rcu_read_unlock();
		return -EPERM;
	}
	rcu_read_unlock();

	if (*ppos != 0) {
		/* No partial writes. */
		return -EINVAL;
	}

	rv = kstrtou32_from_user(buf, count, 10, &loginuid);
	if (rv < 0)
		return rv;

	/* Is userspace trying to explicitly UNSET the loginuid? */
	if (loginuid == AUDIT_UID_UNSET) {
		kloginuid = INVALID_UID;
	} else {
		kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
		if (!uid_valid(kloginuid))
			return -EINVAL;
	}

	rv = audit_set_loginuid(kloginuid);
	if (rv < 0)
		return rv;
	return count;
}
1286
/* File operations for /proc/<pid>/loginuid (audit login uid). */
static const struct file_operations proc_loginuid_operations = {
	.read = proc_loginuid_read,
	.write = proc_loginuid_write,
	.llseek = generic_file_llseek,
};
1292
1293 static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
1294 size_t count, loff_t *ppos)
1295 {
1296 struct inode * inode = file_inode(file);
1297 struct task_struct *task = get_proc_task(inode);
1298 ssize_t length;
1299 char tmpbuf[TMPBUFLEN];
1300
1301 if (!task)
1302 return -ESRCH;
1303 length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1304 audit_get_sessionid(task));
1305 put_task_struct(task);
1306 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1307 }
1308
/* File operations for the read-only /proc/<pid>/sessionid file. */
static const struct file_operations proc_sessionid_operations = {
	.read = proc_sessionid_read,
	.llseek = generic_file_llseek,
};
1313 #endif
1314
1315 #ifdef CONFIG_FAULT_INJECTION
1316 static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
1317 size_t count, loff_t *ppos)
1318 {
1319 struct task_struct *task = get_proc_task(file_inode(file));
1320 char buffer[PROC_NUMBUF];
1321 size_t len;
1322 int make_it_fail;
1323
1324 if (!task)
1325 return -ESRCH;
1326 make_it_fail = task->make_it_fail;
1327 put_task_struct(task);
1328
1329 len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
1330
1331 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1332 }
1333
1334 static ssize_t proc_fault_inject_write(struct file * file,
1335 const char __user * buf, size_t count, loff_t *ppos)
1336 {
1337 struct task_struct *task;
1338 char buffer[PROC_NUMBUF];
1339 int make_it_fail;
1340 int rv;
1341
1342 if (!capable(CAP_SYS_RESOURCE))
1343 return -EPERM;
1344 memset(buffer, 0, sizeof(buffer));
1345 if (count > sizeof(buffer) - 1)
1346 count = sizeof(buffer) - 1;
1347 if (copy_from_user(buffer, buf, count))
1348 return -EFAULT;
1349 rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
1350 if (rv < 0)
1351 return rv;
1352 if (make_it_fail < 0 || make_it_fail > 1)
1353 return -EINVAL;
1354
1355 task = get_proc_task(file_inode(file));
1356 if (!task)
1357 return -ESRCH;
1358 task->make_it_fail = make_it_fail;
1359 put_task_struct(task);
1360
1361 return count;
1362 }
1363
/* File operations for /proc/<pid>/make-it-fail. */
static const struct file_operations proc_fault_inject_operations = {
	.read = proc_fault_inject_read,
	.write = proc_fault_inject_write,
	.llseek = generic_file_llseek,
};
1369
1370 static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
1371 size_t count, loff_t *ppos)
1372 {
1373 struct task_struct *task;
1374 int err;
1375 unsigned int n;
1376
1377 err = kstrtouint_from_user(buf, count, 0, &n);
1378 if (err)
1379 return err;
1380
1381 task = get_proc_task(file_inode(file));
1382 if (!task)
1383 return -ESRCH;
1384 WRITE_ONCE(task->fail_nth, n);
1385 put_task_struct(task);
1386
1387 return count;
1388 }
1389
1390 static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
1391 size_t count, loff_t *ppos)
1392 {
1393 struct task_struct *task;
1394 char numbuf[PROC_NUMBUF];
1395 ssize_t len;
1396
1397 task = get_proc_task(file_inode(file));
1398 if (!task)
1399 return -ESRCH;
1400 len = snprintf(numbuf, sizeof(numbuf), "%u\n",
1401 READ_ONCE(task->fail_nth));
1402 len = simple_read_from_buffer(buf, count, ppos, numbuf, len);
1403 put_task_struct(task);
1404
1405 return len;
1406 }
1407
/*
 * File operations for /proc/<pid>/fail-nth.
 * NOTE(review): no .llseek is set, unlike the other fault-injection
 * files above — presumably intentional, but worth confirming.
 */
static const struct file_operations proc_fail_nth_operations = {
	.read = proc_fail_nth_read,
	.write = proc_fail_nth_write,
};
1412 #endif
1413
1414
1415 #ifdef CONFIG_SCHED_DEBUG
1416 /*
1417 * Print out various scheduling related per-task fields:
1418 */
1419 static int sched_show(struct seq_file *m, void *v)
1420 {
1421 struct inode *inode = m->private;
1422 struct task_struct *p;
1423
1424 p = get_proc_task(inode);
1425 if (!p)
1426 return -ESRCH;
1427 proc_sched_show_task(p, m);
1428
1429 put_task_struct(p);
1430
1431 return 0;
1432 }
1433
1434 static ssize_t
1435 sched_write(struct file *file, const char __user *buf,
1436 size_t count, loff_t *offset)
1437 {
1438 struct inode *inode = file_inode(file);
1439 struct task_struct *p;
1440
1441 p = get_proc_task(inode);
1442 if (!p)
1443 return -ESRCH;
1444 proc_sched_set_task(p);
1445
1446 put_task_struct(p);
1447
1448 return count;
1449 }
1450
/* Open /proc/<pid>/sched: bind sched_show() to this task's inode. */
static int sched_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_show, inode);
}
1455
/* File operations for /proc/<pid>/sched (read stats, write resets them). */
static const struct file_operations proc_pid_sched_operations = {
	.open = sched_open,
	.read = seq_read,
	.write = sched_write,
	.llseek = seq_lseek,
	.release = single_release,
};
1463
1464 #endif
1465
1466 #ifdef CONFIG_SCHED_AUTOGROUP
1467 /*
1468 * Print out autogroup related information:
1469 */
1470 static int sched_autogroup_show(struct seq_file *m, void *v)
1471 {
1472 struct inode *inode = m->private;
1473 struct task_struct *p;
1474
1475 p = get_proc_task(inode);
1476 if (!p)
1477 return -ESRCH;
1478 proc_sched_autogroup_show_task(p, m);
1479
1480 put_task_struct(p);
1481
1482 return 0;
1483 }
1484
1485 static ssize_t
1486 sched_autogroup_write(struct file *file, const char __user *buf,
1487 size_t count, loff_t *offset)
1488 {
1489 struct inode *inode = file_inode(file);
1490 struct task_struct *p;
1491 char buffer[PROC_NUMBUF];
1492 int nice;
1493 int err;
1494
1495 memset(buffer, 0, sizeof(buffer));
1496 if (count > sizeof(buffer) - 1)
1497 count = sizeof(buffer) - 1;
1498 if (copy_from_user(buffer, buf, count))
1499 return -EFAULT;
1500
1501 err = kstrtoint(strstrip(buffer), 0, &nice);
1502 if (err < 0)
1503 return err;
1504
1505 p = get_proc_task(inode);
1506 if (!p)
1507 return -ESRCH;
1508
1509 err = proc_sched_autogroup_set_nice(p, nice);
1510 if (err)
1511 count = err;
1512
1513 put_task_struct(p);
1514
1515 return count;
1516 }
1517
/*
 * Open /proc/<pid>/autogroup.  single_open() is called with a NULL
 * private pointer and the inode is attached afterwards, via the
 * seq_file reachable through filp->private_data once single_open()
 * has succeeded.
 */
static int sched_autogroup_open(struct inode *inode, struct file *filp)
{
	int ret;

	ret = single_open(filp, sched_autogroup_show, NULL);
	if (!ret) {
		struct seq_file *m = filp->private_data;

		m->private = inode;
	}
	return ret;
}
1530
/* File operations for /proc/<pid>/autogroup. */
static const struct file_operations proc_pid_sched_autogroup_operations = {
	.open = sched_autogroup_open,
	.read = seq_read,
	.write = sched_autogroup_write,
	.llseek = seq_lseek,
	.release = single_release,
};
1538
1539 #endif /* CONFIG_SCHED_AUTOGROUP */
1540
1541 static ssize_t comm_write(struct file *file, const char __user *buf,
1542 size_t count, loff_t *offset)
1543 {
1544 struct inode *inode = file_inode(file);
1545 struct task_struct *p;
1546 char buffer[TASK_COMM_LEN];
1547 const size_t maxlen = sizeof(buffer) - 1;
1548
1549 memset(buffer, 0, sizeof(buffer));
1550 if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
1551 return -EFAULT;
1552
1553 p = get_proc_task(inode);
1554 if (!p)
1555 return -ESRCH;
1556
1557 if (same_thread_group(current, p))
1558 set_task_comm(p, buffer);
1559 else
1560 count = -EINVAL;
1561
1562 put_task_struct(p);
1563
1564 return count;
1565 }
1566
1567 static int comm_show(struct seq_file *m, void *v)
1568 {
1569 struct inode *inode = m->private;
1570 struct task_struct *p;
1571
1572 p = get_proc_task(inode);
1573 if (!p)
1574 return -ESRCH;
1575
1576 task_lock(p);
1577 seq_printf(m, "%s\n", p->comm);
1578 task_unlock(p);
1579
1580 put_task_struct(p);
1581
1582 return 0;
1583 }
1584
/* Open /proc/<pid>/comm: bind comm_show() to this task's inode. */
static int comm_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, comm_show, inode);
}
1589
/* File operations for /proc/<pid>/comm. */
static const struct file_operations proc_pid_set_comm_operations = {
	.open = comm_open,
	.read = seq_read,
	.write = comm_write,
	.llseek = seq_lseek,
	.release = single_release,
};
1597
1598 static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
1599 {
1600 struct task_struct *task;
1601 struct file *exe_file;
1602
1603 task = get_proc_task(d_inode(dentry));
1604 if (!task)
1605 return -ENOENT;
1606 exe_file = get_task_exe_file(task);
1607 put_task_struct(task);
1608 if (exe_file) {
1609 *exe_path = exe_file->f_path;
1610 path_get(&exe_file->f_path);
1611 fput(exe_file);
1612 return 0;
1613 } else
1614 return -ENOENT;
1615 }
1616
/*
 * ->get_link() for /proc/<pid> symlinks (exe, cwd, root, fd/*).
 * Permission-checks the caller, resolves the target via the per-inode
 * proc_get_link() callback and jumps the walk to that path.
 */
static const char *proc_pid_get_link(struct dentry *dentry,
				     struct inode *inode,
				     struct delayed_call *done)
{
	struct path path;
	int error = -EACCES;

	/* A NULL dentry means RCU-walk; we need to block, so retry in ref-walk. */
	if (!dentry)
		return ERR_PTR(-ECHILD);

	/* Are we allowed to snoop on the tasks file descriptors? */
	if (!proc_fd_access_allowed(inode))
		goto out;

	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
	if (error)
		goto out;

	nd_jump_link(&path);
	return NULL;
out:
	return ERR_PTR(error);
}
1640
/*
 * Copy the textual path of @path into the user buffer for readlink().
 * Returns the number of bytes copied or a negative errno.  As with
 * readlink(2), the result is not NUL-terminated and is silently
 * truncated to @buflen.
 */
static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
{
	char *tmp = (char*)__get_free_page(GFP_TEMPORARY);
	char *pathname;
	int len;

	if (!tmp)
		return -ENOMEM;

	pathname = d_path(path, tmp, PAGE_SIZE);
	len = PTR_ERR(pathname);
	if (IS_ERR(pathname))
		goto out;
	/* d_path() builds the string at the tail of the page. */
	len = tmp + PAGE_SIZE - 1 - pathname;

	if (len > buflen)
		len = buflen;
	if (copy_to_user(buffer, pathname, len))
		len = -EFAULT;
out:
	free_page((unsigned long)tmp);
	return len;
}
1664
1665 static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
1666 {
1667 int error = -EACCES;
1668 struct inode *inode = d_inode(dentry);
1669 struct path path;
1670
1671 /* Are we allowed to snoop on the tasks file descriptors? */
1672 if (!proc_fd_access_allowed(inode))
1673 goto out;
1674
1675 error = PROC_I(inode)->op.proc_get_link(dentry, &path);
1676 if (error)
1677 goto out;
1678
1679 error = do_proc_readlink(&path, buffer, buflen);
1680 path_put(&path);
1681 out:
1682 return error;
1683 }
1684
/* Shared inode operations for all /proc/<pid> symlink inodes. */
const struct inode_operations proc_pid_link_inode_operations = {
	.readlink = proc_pid_readlink,
	.get_link = proc_pid_get_link,
	.setattr = proc_setattr,
};
1690
1691
1692 /* building an inode */
1693
/*
 * Compute the uid/gid that should own a /proc file for @task, based on
 * the task's credentials and the dumpability of its mm.  Results are
 * returned through @ruid/@rgid.
 */
void task_dump_owner(struct task_struct *task, mode_t mode,
		     kuid_t *ruid, kgid_t *rgid)
{
	/* Depending on the state of dumpable compute who should own a
	 * proc file for a task.
	 */
	const struct cred *cred;
	kuid_t uid;
	kgid_t gid;

	/* Default to the tasks effective ownership */
	rcu_read_lock();
	cred = __task_cred(task);
	uid = cred->euid;
	gid = cred->egid;
	rcu_read_unlock();

	/*
	 * Before the /proc/pid/status file was created the only way to read
	 * the effective uid of a /process was to stat /proc/pid.  Reading
	 * /proc/pid/status is slow enough that procps and other packages
	 * kept stating /proc/pid.  To keep the rules in /proc simple I have
	 * made this apply to all per process world readable and executable
	 * directories.
	 */
	if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
		struct mm_struct *mm;
		/* task_lock() stabilizes task->mm. */
		task_lock(task);
		mm = task->mm;
		/* Make non-dumpable tasks owned by some root */
		if (mm) {
			if (get_dumpable(mm) != SUID_DUMP_USER) {
				struct user_namespace *user_ns = mm->user_ns;

				/* Root of the mm's user namespace, falling
				 * back to global root if unmapped. */
				uid = make_kuid(user_ns, 0);
				if (!uid_valid(uid))
					uid = GLOBAL_ROOT_UID;

				gid = make_kgid(user_ns, 0);
				if (!gid_valid(gid))
					gid = GLOBAL_ROOT_GID;
			}
		} else {
			/* No mm (kernel thread or exiting task). */
			uid = GLOBAL_ROOT_UID;
			gid = GLOBAL_ROOT_GID;
		}
		task_unlock(task);
	}
	*ruid = uid;
	*rgid = gid;
}
1745
/*
 * Allocate and initialize a /proc inode bound to @task.  Returns the
 * new inode (holding a struct pid reference to the task) or NULL on
 * allocation failure or if the task has no pid.
 */
struct inode *proc_pid_make_inode(struct super_block * sb,
				  struct task_struct *task, umode_t mode)
{
	struct inode * inode;
	struct proc_inode *ei;

	/* We need a new inode */

	inode = new_inode(sb);
	if (!inode)
		goto out;

	/* Common stuff */
	ei = PROC_I(inode);
	inode->i_mode = mode;
	inode->i_ino = get_next_ino();
	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
	inode->i_op = &proc_def_inode_operations;

	/*
	 * grab the reference to task.
	 */
	ei->pid = get_task_pid(task, PIDTYPE_PID);
	if (!ei->pid)
		goto out_unlock;

	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
	security_task_to_inode(task, inode);

out:
	return inode;

out_unlock:
	/* Drop the half-initialized inode; evicting it releases ei->pid. */
	iput(inode);
	return NULL;
}
1782
/*
 * ->getattr() for /proc/<pid> inodes.  Ownership defaults to global
 * root and is refreshed from the live task (if still present and
 * visible under the mount's hidepid setting).
 */
int pid_getattr(const struct path *path, struct kstat *stat,
		u32 request_mask, unsigned int query_flags)
{
	struct inode *inode = d_inode(path->dentry);
	struct task_struct *task;
	struct pid_namespace *pid = path->dentry->d_sb->s_fs_info;

	generic_fillattr(inode, stat);

	rcu_read_lock();
	stat->uid = GLOBAL_ROOT_UID;
	stat->gid = GLOBAL_ROOT_GID;
	task = pid_task(proc_pid(inode), PIDTYPE_PID);
	if (task) {
		if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) {
			rcu_read_unlock();
			/*
			 * This doesn't prevent learning whether PID exists,
			 * it only makes getattr() consistent with readdir().
			 */
			return -ENOENT;
		}
		task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
	}
	rcu_read_unlock();
	return 0;
}
1810
1811 /* dentry stuff */
1812
1813 /*
1814 * Exceptional case: normally we are not allowed to unhash a busy
1815 * directory. In this case, however, we can do it - no aliasing problems
1816 * due to the way we treat inodes.
1817 *
1818 * Rewrite the inode's ownerships here because the owning task may have
1819 * performed a setuid(), etc.
1820 *
1821 */
/*
 * d_revalidate for /proc/<pid> dentries.  The name cannot go stale,
 * but the inode's ownership can (setuid() etc.): refresh uid/gid and
 * strip setuid/setgid bits.  Returns 0 once the task is gone so the
 * dentry gets dropped.
 */
int pid_revalidate(struct dentry *dentry, unsigned int flags)
{
	struct inode *inode;
	struct task_struct *task;

	/* get_proc_task() may block; bail out of RCU-walk. */
	if (flags & LOOKUP_RCU)
		return -ECHILD;

	inode = d_inode(dentry);
	task = get_proc_task(inode);

	if (task) {
		task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);

		inode->i_mode &= ~(S_ISUID | S_ISGID);
		security_task_to_inode(task, inode);
		put_task_struct(task);
		return 1;
	}
	return 0;
}
1843
1844 static inline bool proc_inode_is_dead(struct inode *inode)
1845 {
1846 return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
1847 }
1848
/* d_delete callback: report whether the dentry should be killed now. */
int pid_delete_dentry(const struct dentry *dentry)
{
	/* Is the task we represent dead?
	 * If so, then don't put the dentry on the lru list,
	 * kill it immediately.
	 */
	return proc_inode_is_dead(d_inode(dentry));
}
1857
/* Default dentry operations for /proc/<pid> entries. */
const struct dentry_operations pid_dentry_operations =
{
	.d_revalidate = pid_revalidate,
	.d_delete = pid_delete_dentry,
};
1863
1864 /* Lookups */
1865
1866 /*
1867 * Fill a directory entry.
1868 *
1869 * If possible create the dcache entry and derive our inode number and
1870 * file type from dcache entry.
1871 *
1872 * Since all of the proc inode numbers are dynamically generated, the inode
1873 * numbers do not exist until the inode is cache. This means creating the
1874 * the dcache entry in readdir is necessary to keep the inode numbers
1875 * reported by readdir in sync with the inode numbers reported
1876 * by stat.
1877 */
bool proc_fill_cache(struct file *file, struct dir_context *ctx,
	const char *name, int len,
	instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
	struct dentry *child, *dir = file->f_path.dentry;
	struct qstr qname = QSTR_INIT(name, len);
	struct inode *inode;
	unsigned type;
	ino_t ino;

	child = d_hash_and_lookup(dir, &qname);
	if (!child) {
		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
		child = d_alloc_parallel(dir, &qname, &wq);
		if (IS_ERR(child))
			goto end_instantiate;
		if (d_in_lookup(child)) {
			/* We won the in-lookup race: populate the dentry. */
			int err = instantiate(d_inode(dir), child, task, ptr);
			d_lookup_done(child);
			if (err < 0) {
				dput(child);
				goto end_instantiate;
			}
		}
	}
	inode = d_inode(child);
	ino = inode->i_ino;
	/* Top nibble of i_mode is the DT_* file type. */
	type = inode->i_mode >> 12;
	dput(child);
	return dir_emit(ctx, name, len, ino, type);

end_instantiate:
	/* Could not build a dentry: emit with a dummy inode number. */
	return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}
1912
1913 /*
1914 * dname_to_vma_addr - maps a dentry name into two unsigned longs
1915 * which represent vma start and end addresses.
1916 */
1917 static int dname_to_vma_addr(struct dentry *dentry,
1918 unsigned long *start, unsigned long *end)
1919 {
1920 if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
1921 return -EINVAL;
1922
1923 return 0;
1924 }
1925
/*
 * d_revalidate for /proc/<pid>/map_files entries: the dentry stays
 * valid only while a VMA with exactly the named start-end range still
 * exists in a task mm we are allowed to access.
 */
static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
{
	unsigned long vm_start, vm_end;
	bool exact_vma_exists = false;
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	struct inode *inode;
	int status = 0;

	/* We take locks below; bail out of RCU-walk. */
	if (flags & LOOKUP_RCU)
		return -ECHILD;

	inode = d_inode(dentry);
	task = get_proc_task(inode);
	if (!task)
		goto out_notask;

	/* Re-checks ptrace permission on every revalidation. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR_OR_NULL(mm))
		goto out;

	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
		down_read(&mm->mmap_sem);
		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
		up_read(&mm->mmap_sem);
	}

	mmput(mm);

	if (exact_vma_exists) {
		/* Refresh ownership/security as the task may have changed creds. */
		task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);

		security_task_to_inode(task, inode);
		status = 1;
	}

out:
	put_task_struct(task);

out_notask:
	return status;
}
1968
/* Dentry operations for /proc/<pid>/map_files entries. */
static const struct dentry_operations tid_map_files_dentry_operations = {
	.d_revalidate = map_files_d_revalidate,
	.d_delete = pid_delete_dentry,
};
1973
/*
 * Resolve a /proc/<pid>/map_files/<start>-<end> link to the path of
 * the file mapped by that exact VMA.  Returns -ENOENT if the task, mm
 * or VMA is gone.
 */
static int map_files_get_link(struct dentry *dentry, struct path *path)
{
	unsigned long vm_start, vm_end;
	struct vm_area_struct *vma;
	struct task_struct *task;
	struct mm_struct *mm;
	int rc;

	rc = -ENOENT;
	task = get_proc_task(d_inode(dentry));
	if (!task)
		goto out;

	mm = get_task_mm(task);
	put_task_struct(task);
	if (!mm)
		goto out;

	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
	if (rc)
		goto out_mmput;

	rc = -ENOENT;
	down_read(&mm->mmap_sem);
	vma = find_exact_vma(mm, vm_start, vm_end);
	if (vma && vma->vm_file) {
		/*
		 * NOTE(review): vma_pr_or_file() is a non-mainline (aufs)
		 * helper; presumably it picks the "real" backing file for
		 * an overlaid mapping — confirm against the aufs patchset.
		 */
		*path = vma_pr_or_file(vma)->f_path;
		path_get(path);
		rc = 0;
	}
	up_read(&mm->mmap_sem);

out_mmput:
	mmput(mm);
out:
	return rc;
}
2011
/*
 * Snapshot of one file-backed VMA, collected under mmap_sem and
 * consumed after the lock is dropped (see proc_map_files_readdir()).
 */
struct map_files_info {
	fmode_t mode;
	unsigned int len;
	unsigned char name[4*sizeof(long)+2];	/* max: %lx-%lx\0 */
};
2017
2018 /*
2019 * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
2020 * symlinks may be used to bypass permissions on ancestor directories in the
2021 * path to the file in question.
2022 */
/*
 * ->get_link() for map_files symlinks: CAP_SYS_ADMIN only (see the
 * comment above), then defer to the common proc_pid_get_link().
 */
static const char *
proc_map_files_get_link(struct dentry *dentry,
			struct inode *inode,
			struct delayed_call *done)
{
	if (!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	return proc_pid_get_link(dentry, inode, done);
}
2033
2034 /*
2035 * Identical to proc_pid_link_inode_operations except for get_link()
2036 */
/*
 * Identical to proc_pid_link_inode_operations except for get_link(),
 * which adds the CAP_SYS_ADMIN restriction.
 */
static const struct inode_operations proc_map_files_link_inode_operations = {
	.readlink = proc_pid_readlink,
	.get_link = proc_map_files_get_link,
	.setattr = proc_setattr,
};
2042
/*
 * Build the inode + dentry for one map_files symlink.  @ptr smuggles
 * the mapped file's fmode_t, which determines the link's r/w bits.
 */
static int
proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
			   struct task_struct *task, const void *ptr)
{
	fmode_t mode = (fmode_t)(unsigned long)ptr;
	struct proc_inode *ei;
	struct inode *inode;

	inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK |
				    ((mode & FMODE_READ ) ? S_IRUSR : 0) |
				    ((mode & FMODE_WRITE) ? S_IWUSR : 0));
	if (!inode)
		return -ENOENT;

	ei = PROC_I(inode);
	ei->op.proc_get_link = map_files_get_link;

	inode->i_op = &proc_map_files_link_inode_operations;
	inode->i_size = 64;

	d_set_d_op(dentry, &tid_map_files_dentry_operations);
	d_add(dentry, inode);

	return 0;
}
2068
/*
 * ->lookup() for /proc/<pid>/map_files: parse the "<start>-<end>" name
 * and instantiate a symlink only when a file-backed VMA with exactly
 * that range exists.  Requires ptrace read access to the task.
 */
static struct dentry *proc_map_files_lookup(struct inode *dir,
		struct dentry *dentry, unsigned int flags)
{
	unsigned long vm_start, vm_end;
	struct vm_area_struct *vma;
	struct task_struct *task;
	int result;
	struct mm_struct *mm;

	result = -ENOENT;
	task = get_proc_task(dir);
	if (!task)
		goto out;

	result = -EACCES;
	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
		goto out_put_task;

	result = -ENOENT;
	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
		goto out_put_task;

	mm = get_task_mm(task);
	if (!mm)
		goto out_put_task;

	down_read(&mm->mmap_sem);
	vma = find_exact_vma(mm, vm_start, vm_end);
	if (!vma)
		goto out_no_vma;

	if (vma->vm_file)
		result = proc_map_files_instantiate(dir, dentry, task,
				(void *)(unsigned long)vma->vm_file->f_mode);

out_no_vma:
	up_read(&mm->mmap_sem);
	mmput(mm);
out_put_task:
	put_task_struct(task);
out:
	return ERR_PTR(result);
}
2112
/* Inode operations for the /proc/<pid>/map_files directory itself. */
static const struct inode_operations proc_map_files_inode_operations = {
	.lookup = proc_map_files_lookup,
	.permission = proc_fd_permission,
	.setattr = proc_setattr,
};
2118
/*
 * ->iterate() for /proc/<pid>/map_files: emit one "<start>-<end>"
 * entry per file-backed VMA.  Requires ptrace read access.
 */
static int
proc_map_files_readdir(struct file *file, struct dir_context *ctx)
{
	struct vm_area_struct *vma;
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned long nr_files, pos, i;
	struct flex_array *fa = NULL;
	struct map_files_info info;
	struct map_files_info *p;
	int ret;

	ret = -ENOENT;
	task = get_proc_task(file_inode(file));
	if (!task)
		goto out;

	ret = -EACCES;
	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
		goto out_put_task;

	ret = 0;
	if (!dir_emit_dots(file, ctx))
		goto out_put_task;

	mm = get_task_mm(task);
	if (!mm)
		goto out_put_task;
	down_read(&mm->mmap_sem);

	nr_files = 0;

	/*
	 * We need two passes here:
	 *
	 * 1) Collect vmas of mapped files with mmap_sem taken
	 * 2) Release mmap_sem and instantiate entries
	 *
	 * otherwise we get lockdep complained, since filldir()
	 * routine might require mmap_sem taken in might_fault().
	 */

	/* Pass 1a: count entries at or beyond the current position. */
	for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
		if (vma->vm_file && ++pos > ctx->pos)
			nr_files++;
	}

	if (nr_files) {
		fa = flex_array_alloc(sizeof(info), nr_files,
					GFP_KERNEL);
		if (!fa || flex_array_prealloc(fa, 0, nr_files,
						GFP_KERNEL)) {
			ret = -ENOMEM;
			if (fa)
				flex_array_free(fa);
			up_read(&mm->mmap_sem);
			mmput(mm);
			goto out_put_task;
		}
		/* Pass 1b: snapshot name and mode of each such VMA. */
		for (i = 0, vma = mm->mmap, pos = 2; vma;
				vma = vma->vm_next) {
			if (!vma->vm_file)
				continue;
			if (++pos <= ctx->pos)
				continue;

			info.mode = vma->vm_file->f_mode;
			info.len = snprintf(info.name,
					sizeof(info.name), "%lx-%lx",
					vma->vm_start, vma->vm_end);
			if (flex_array_put(fa, i++, &info, GFP_KERNEL))
				BUG();
		}
	}
	up_read(&mm->mmap_sem);

	/* Pass 2: emit the snapshotted entries without mmap_sem held. */
	for (i = 0; i < nr_files; i++) {
		p = flex_array_get(fa, i);
		if (!proc_fill_cache(file, ctx,
				      p->name, p->len,
				      proc_map_files_instantiate,
				      task,
				      (void *)(unsigned long)p->mode))
			break;
		ctx->pos++;
	}
	if (fa)
		flex_array_free(fa);
	mmput(mm);

out_put_task:
	put_task_struct(task);
out:
	return ret;
}
2214
/* File operations for the /proc/<pid>/map_files directory. */
static const struct file_operations proc_map_files_operations = {
	.read = generic_read_dir,
	.iterate_shared = proc_map_files_readdir,
	.llseek = generic_file_llseek,
};
2220
2221 #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
/* Per-open state for /proc/<pid>/timers seq_file iteration. */
struct timers_private {
	struct pid *pid;		/* pid being inspected */
	struct task_struct *task;	/* pinned in timers_start() */
	struct sighand_struct *sighand;	/* locked across the walk */
	struct pid_namespace *ns;	/* for pid translation in show_timer() */
	unsigned long flags;		/* saved irq flags for sighand lock */
};
2229
/*
 * seq_file ->start: pin the task and take its sighand lock for the
 * whole walk of signal->posix_timers.  Both are released in
 * timers_stop().
 */
static void *timers_start(struct seq_file *m, loff_t *pos)
{
	struct timers_private *tp = m->private;

	tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
	if (!tp->task)
		return ERR_PTR(-ESRCH);

	tp->sighand = lock_task_sighand(tp->task, &tp->flags);
	if (!tp->sighand)
		return ERR_PTR(-ESRCH);

	return seq_list_start(&tp->task->signal->posix_timers, *pos);
}
2244
/* seq_file ->next: advance along the posix_timers list. */
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct timers_private *tp = m->private;
	return seq_list_next(v, &tp->task->signal->posix_timers, pos);
}
2250
/*
 * seq_file ->stop: drop the sighand lock first, then the task
 * reference taken in timers_start() (order matters: the lock lives in
 * the task's sighand).
 */
static void timers_stop(struct seq_file *m, void *v)
{
	struct timers_private *tp = m->private;

	if (tp->sighand) {
		unlock_task_sighand(tp->task, &tp->flags);
		tp->sighand = NULL;
	}

	if (tp->task) {
		put_task_struct(tp->task);
		tp->task = NULL;
	}
}
2265
/* seq_file ->show: print one posix timer (id, signal, notify, clock). */
static int show_timer(struct seq_file *m, void *v)
{
	struct k_itimer *timer;
	struct timers_private *tp = m->private;
	int notify;
	static const char * const nstr[] = {
		[SIGEV_SIGNAL] = "signal",
		[SIGEV_NONE] = "none",
		[SIGEV_THREAD] = "thread",
	};

	timer = list_entry((struct list_head *)v, struct k_itimer, list);
	notify = timer->it_sigev_notify;

	seq_printf(m, "ID: %d\n", timer->it_id);
	seq_printf(m, "signal: %d/%p\n",
		   timer->sigq->info.si_signo,
		   timer->sigq->info.si_value.sival_ptr);
	/* SIGEV_THREAD_ID is a flag on top of the base notify mode. */
	seq_printf(m, "notify: %s/%s.%d\n",
		   nstr[notify & ~SIGEV_THREAD_ID],
		   (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
		   pid_nr_ns(timer->it_pid, tp->ns));
	seq_printf(m, "ClockID: %d\n", timer->it_clock);

	return 0;
}
2292
/* seq_file operations for /proc/<pid>/timers. */
static const struct seq_operations proc_timers_seq_ops = {
	.start = timers_start,
	.next = timers_next,
	.stop = timers_stop,
	.show = show_timer,
};
2299
2300 static int proc_timers_open(struct inode *inode, struct file *file)
2301 {
2302 struct timers_private *tp;
2303
2304 tp = __seq_open_private(file, &proc_timers_seq_ops,
2305 sizeof(struct timers_private));
2306 if (!tp)
2307 return -ENOMEM;
2308
2309 tp->pid = proc_pid(inode);
2310 tp->ns = inode->i_sb->s_fs_info;
2311 return 0;
2312 }
2313
/* File operations for /proc/<pid>/timers. */
static const struct file_operations proc_timers_operations = {
	.open = proc_timers_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_private,
};
2320 #endif
2321
/*
 * Write /proc/<pid>/timerslack_ns: set the task's timer slack.
 * Writing 0 restores the task's default slack.  Changing another
 * task's slack requires CAP_SYS_NICE and an LSM setscheduler check.
 */
static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
					size_t count, loff_t *offset)
{
	struct inode *inode = file_inode(file);
	struct task_struct *p;
	u64 slack_ns;
	int err;

	err = kstrtoull_from_user(buf, count, 10, &slack_ns);
	if (err < 0)
		return err;

	p = get_proc_task(inode);
	if (!p)
		return -ESRCH;

	if (p != current) {
		if (!capable(CAP_SYS_NICE)) {
			count = -EPERM;
			goto out;
		}

		err = security_task_setscheduler(p);
		if (err) {
			count = err;
			goto out;
		}
	}

	/* task_lock() serializes against concurrent slack updates. */
	task_lock(p);
	if (slack_ns == 0)
		p->timer_slack_ns = p->default_timer_slack_ns;
	else
		p->timer_slack_ns = slack_ns;
	task_unlock(p);

out:
	put_task_struct(p);

	return count;
}
2363
/*
 * Show /proc/<pid>/timerslack_ns.  Reading another task's slack
 * requires CAP_SYS_NICE and an LSM getscheduler check.
 */
static int timerslack_ns_show(struct seq_file *m, void *v)
{
	struct inode *inode = m->private;
	struct task_struct *p;
	int err = 0;

	p = get_proc_task(inode);
	if (!p)
		return -ESRCH;

	if (p != current) {

		if (!capable(CAP_SYS_NICE)) {
			err = -EPERM;
			goto out;
		}
		err = security_task_getscheduler(p);
		if (err)
			goto out;
	}

	task_lock(p);
	seq_printf(m, "%llu\n", p->timer_slack_ns);
	task_unlock(p);

out:
	put_task_struct(p);

	return err;
}
2394
/* Open /proc/<pid>/timerslack_ns via the single_open() helper. */
static int timerslack_ns_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, timerslack_ns_show, inode);
}
2399
/* File operations for /proc/<pid>/timerslack_ns. */
static const struct file_operations proc_pid_set_timerslack_ns_operations = {
	.open = timerslack_ns_open,
	.read = seq_read,
	.write = timerslack_ns_write,
	.llseek = seq_lseek,
	.release = single_release,
};
2407
/*
 * Build the inode + dentry for one static /proc/<pid> entry described
 * by a pid_entry table row (@ptr).
 */
static int proc_pident_instantiate(struct inode *dir,
	struct dentry *dentry, struct task_struct *task, const void *ptr)
{
	const struct pid_entry *p = ptr;
	struct inode *inode;
	struct proc_inode *ei;

	inode = proc_pid_make_inode(dir->i_sb, task, p->mode);
	if (!inode)
		goto out;

	ei = PROC_I(inode);
	if (S_ISDIR(inode->i_mode))
		set_nlink(inode, 2);	/* Use getattr to fix if necessary */
	if (p->iop)
		inode->i_op = p->iop;
	if (p->fop)
		inode->i_fop = p->fop;
	ei->op = p->op;
	d_set_d_op(dentry, &pid_dentry_operations);
	d_add(dentry, inode);
	/* Close the race of the process dying before we return the dentry */
	if (pid_revalidate(dentry, 0))
		return 0;
out:
	return -ENOENT;
}
2435
/*
 * Look @dentry's name up in the static pid_entry table @ents and
 * instantiate the matching entry, if any.
 */
static struct dentry *proc_pident_lookup(struct inode *dir,
					 struct dentry *dentry,
					 const struct pid_entry *ents,
					 unsigned int nents)
{
	int error;
	struct task_struct *task = get_proc_task(dir);
	const struct pid_entry *p, *last;

	error = -ENOENT;

	if (!task)
		goto out_no_task;

	/*
	 * Yes, it does not scale. And it should not. Don't add
	 * new entries into /proc/<tgid>/ without very good reasons.
	 */
	last = &ents[nents];
	for (p = ents; p < last; p++) {
		if (p->len != dentry->d_name.len)
			continue;
		if (!memcmp(dentry->d_name.name, p->name, p->len))
			break;
	}
	if (p >= last)
		goto out;

	error = proc_pident_instantiate(dir, dentry, task, p);
out:
	put_task_struct(task);
out_no_task:
	return ERR_PTR(error);
}
2470
/*
 * Generic readdir over a static pid_entry table.  Positions 0 and 1
 * are "." and "..", so the table index is ctx->pos - 2.
 */
static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
		const struct pid_entry *ents, unsigned int nents)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	const struct pid_entry *p;

	if (!task)
		return -ENOENT;

	if (!dir_emit_dots(file, ctx))
		goto out;

	if (ctx->pos >= nents + 2)
		goto out;

	/* Resume from where the previous read stopped. */
	for (p = ents + (ctx->pos - 2); p < ents + nents; p++) {
		if (!proc_fill_cache(file, ctx, p->name, p->len,
				proc_pident_instantiate, task, p))
			break;
		ctx->pos++;
	}
out:
	put_task_struct(task);
	return 0;
}
2496
2497 #ifdef CONFIG_SECURITY
/*
 * Read /proc/<pid>/attr/<name>: ask the security module for the task
 * attribute named after the dentry.  The LSM allocates the buffer (@p),
 * which we must free whether or not the copy-out succeeds.
 */
static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
				  size_t count, loff_t *ppos)
{
	struct inode * inode = file_inode(file);
	char *p = NULL;
	ssize_t length;
	struct task_struct *task = get_proc_task(inode);

	if (!task)
		return -ESRCH;

	length = security_getprocattr(task, PROC_I(inode)->op.lsm,
				      (char*)file->f_path.dentry->d_name.name,
				      &p);
	put_task_struct(task);
	if (length > 0)
		length = simple_read_from_buffer(buf, count, ppos, p, length);
	/* kfree(NULL) is a no-op on the LSM error path. */
	kfree(p);
	return length;
}
2518
/*
 * Write /proc/<pid>/attr/<name>: hand a userspace buffer to the LSM to
 * set the current task's attribute.  Only a task may write its own
 * attributes, writes must start at offset 0, and at most one page is
 * accepted.  cred_guard_mutex serializes against exec/ptrace credential
 * changes while the LSM updates the attribute.
 */
static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
				   size_t count, loff_t *ppos)
{
	struct inode * inode = file_inode(file);
	void *page;
	ssize_t length;
	struct task_struct *task = get_proc_task(inode);

	length = -ESRCH;
	if (!task)
		goto out_no_task;

	/* A task may only write its own attributes. */
	length = -EACCES;
	if (current != task)
		goto out;

	if (count > PAGE_SIZE)
		count = PAGE_SIZE;

	/* No partial writes. */
	length = -EINVAL;
	if (*ppos != 0)
		goto out;

	page = memdup_user(buf, count);
	if (IS_ERR(page)) {
		length = PTR_ERR(page);
		goto out;
	}

	/* Guard against adverse ptrace interaction */
	length = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
	if (length < 0)
		goto out_free;

	length = security_setprocattr(PROC_I(inode)->op.lsm,
				      file->f_path.dentry->d_name.name,
				      page, count);
	mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
	kfree(page);
out:
	put_task_struct(task);
out_no_task:
	return length;
}
2566
/* File operations shared by all /proc/<pid>/attr/<name> entries. */
static const struct file_operations proc_pid_attr_operations = {
	.read		= proc_pid_attr_read,
	.write		= proc_pid_attr_write,
	.llseek		= generic_file_llseek,
};
2572
/*
 * LSM_DIR_OPS(LSM) generates the directory file_operations and
 * inode_operations for a per-LSM attribute subdirectory
 * (/proc/<pid>/attr/<lsm>/), all backed by the generic proc_pident_*
 * helpers over the LSM##_attr_dir_stuff[] table.
 *
 * Note: iteration is registered as ->iterate_shared, matching every
 * other proc_pident_readdir()-backed directory in this file
 * (attr/, tgid base, tid base); proc_pident_readdir() takes no
 * directory locks of its own, so shared iteration is safe.
 */
#define LSM_DIR_OPS(LSM) \
static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
			     struct dir_context *ctx) \
{ \
	return proc_pident_readdir(filp, ctx, \
				   LSM##_attr_dir_stuff, \
				   ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct file_operations proc_##LSM##_attr_dir_ops = { \
	.read		= generic_read_dir, \
	.iterate_shared	= proc_##LSM##_attr_dir_iterate, \
	.llseek		= default_llseek, \
}; \
\
static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
				struct dentry *dentry, unsigned int flags) \
{ \
	return proc_pident_lookup(dir, dentry, \
				  LSM##_attr_dir_stuff, \
				  ARRAY_SIZE(LSM##_attr_dir_stuff)); \
} \
\
static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
	.lookup		= proc_##LSM##_attr_dir_lookup, \
	.getattr	= pid_getattr, \
	.setattr	= proc_setattr, \
}
2601
2602 #ifdef CONFIG_SECURITY_SELINUX
/* Entries under /proc/<pid>/attr/selinux/. */
static const struct pid_entry selinux_attr_dir_stuff[] = {
	ATTR("selinux", "current", 0666),
	ATTR("selinux", "prev", 0444),
	ATTR("selinux", "exec", 0666),
	ATTR("selinux", "fscreate", 0666),
	ATTR("selinux", "keycreate", 0666),
	ATTR("selinux", "sockcreate", 0666),
	ATTR("selinux", "context", 0666),
};
LSM_DIR_OPS(selinux);
2613 #endif
2614
2615 #ifdef CONFIG_SECURITY_SMACK
/* Entries under /proc/<pid>/attr/smack/. */
static const struct pid_entry smack_attr_dir_stuff[] = {
	ATTR("smack", "current", 0666),
	ATTR("smack", "context", 0666),
};
LSM_DIR_OPS(smack);
2621 #endif
2622
2623 #ifdef CONFIG_SECURITY_APPARMOR
/* Entries under /proc/<pid>/attr/apparmor/. */
static const struct pid_entry apparmor_attr_dir_stuff[] = {
	ATTR("apparmor", "current", 0666),
	ATTR("apparmor", "prev", 0444),
	ATTR("apparmor", "exec", 0666),
	ATTR("apparmor", "context", 0666),
};
LSM_DIR_OPS(apparmor);
2631 #endif
2632
/*
 * Top-level /proc/<pid>/attr/ entries.  The NULL lsm means "the
 * display LSM"; per-LSM subdirectories are added when the
 * corresponding security module is configured.
 */
static const struct pid_entry attr_dir_stuff[] = {
	ATTR(NULL, "current", 0666),
	ATTR(NULL, "prev", 0444),
	ATTR(NULL, "exec", 0666),
	ATTR(NULL, "fscreate", 0666),
	ATTR(NULL, "keycreate", 0666),
	ATTR(NULL, "sockcreate", 0666),
	ATTR(NULL, "context", 0666),
	ATTR(NULL, "display_lsm", 0666),

#ifdef CONFIG_SECURITY_SELINUX
	DIR("selinux", 0555,
	    proc_selinux_attr_dir_inode_ops, proc_selinux_attr_dir_ops),
#endif
#ifdef CONFIG_SECURITY_SMACK
	DIR("smack", 0555,
	    proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
#endif
#ifdef CONFIG_SECURITY_APPARMOR
	DIR("apparmor", 0555,
	    proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops),
#endif
};
2656
/* readdir for /proc/<pid>/attr/, driven by the attr_dir_stuff table. */
static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
{
	return proc_pident_readdir(file, ctx,
				   attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
}
2662
/* Directory file operations for /proc/<pid>/attr/. */
static const struct file_operations proc_attr_dir_operations = {
	.read		= generic_read_dir,
	.iterate_shared	= proc_attr_dir_readdir,
	.llseek		= generic_file_llseek,
};
2668
/* lookup for /proc/<pid>/attr/, driven by the attr_dir_stuff table. */
static struct dentry *proc_attr_dir_lookup(struct inode *dir,
				struct dentry *dentry, unsigned int flags)
{
	return proc_pident_lookup(dir, dentry,
				  attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
}
2675
/* Inode operations for /proc/<pid>/attr/. */
static const struct inode_operations proc_attr_dir_inode_operations = {
	.lookup		= proc_attr_dir_lookup,
	.getattr	= pid_getattr,
	.setattr	= proc_setattr,
};
2681
2682 #endif
2683
2684 #ifdef CONFIG_ELF_CORE
2685 static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
2686 size_t count, loff_t *ppos)
2687 {
2688 struct task_struct *task = get_proc_task(file_inode(file));
2689 struct mm_struct *mm;
2690 char buffer[PROC_NUMBUF];
2691 size_t len;
2692 int ret;
2693
2694 if (!task)
2695 return -ESRCH;
2696
2697 ret = 0;
2698 mm = get_task_mm(task);
2699 if (mm) {
2700 len = snprintf(buffer, sizeof(buffer), "%08lx\n",
2701 ((mm->flags & MMF_DUMP_FILTER_MASK) >>
2702 MMF_DUMP_FILTER_SHIFT));
2703 mmput(mm);
2704 ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
2705 }
2706
2707 put_task_struct(task);
2708
2709 return ret;
2710 }
2711
2712 static ssize_t proc_coredump_filter_write(struct file *file,
2713 const char __user *buf,
2714 size_t count,
2715 loff_t *ppos)
2716 {
2717 struct task_struct *task;
2718 struct mm_struct *mm;
2719 unsigned int val;
2720 int ret;
2721 int i;
2722 unsigned long mask;
2723
2724 ret = kstrtouint_from_user(buf, count, 0, &val);
2725 if (ret < 0)
2726 return ret;
2727
2728 ret = -ESRCH;
2729 task = get_proc_task(file_inode(file));
2730 if (!task)
2731 goto out_no_task;
2732
2733 mm = get_task_mm(task);
2734 if (!mm)
2735 goto out_no_mm;
2736 ret = 0;
2737
2738 for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
2739 if (val & mask)
2740 set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2741 else
2742 clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2743 }
2744
2745 mmput(mm);
2746 out_no_mm:
2747 put_task_struct(task);
2748 out_no_task:
2749 if (ret < 0)
2750 return ret;
2751 return count;
2752 }
2753
/* File operations for /proc/<pid>/coredump_filter. */
static const struct file_operations proc_coredump_filter_operations = {
	.read		= proc_coredump_filter_read,
	.write		= proc_coredump_filter_write,
	.llseek		= generic_file_llseek,
};
2759 #endif
2760
2761 #ifdef CONFIG_TASK_IO_ACCOUNTING
/*
 * Emit I/O accounting counters for /proc/<pid>/io.  @whole selects
 * whole-process accounting (the task's own counters plus the signal
 * struct's accumulated counters for dead threads plus every live
 * thread) versus per-thread accounting.  cred_guard_mutex is held
 * across the ptrace access check so the target cannot exec to a more
 * privileged image while we read it.
 */
static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
	struct task_io_accounting acct = task->ioac;
	unsigned long flags;
	int result;

	result = mutex_lock_killable(&task->signal->cred_guard_mutex);
	if (result)
		return result;

	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
		result = -EACCES;
		goto out_unlock;
	}

	/* Sum in the rest of the thread group under sighand lock. */
	if (whole && lock_task_sighand(task, &flags)) {
		struct task_struct *t = task;

		task_io_accounting_add(&acct, &task->signal->ioac);
		while_each_thread(task, t)
			task_io_accounting_add(&acct, &t->ioac);

		unlock_task_sighand(task, &flags);
	}
	seq_printf(m,
		   "rchar: %llu\n"
		   "wchar: %llu\n"
		   "syscr: %llu\n"
		   "syscw: %llu\n"
		   "read_bytes: %llu\n"
		   "write_bytes: %llu\n"
		   "cancelled_write_bytes: %llu\n",
		   (unsigned long long)acct.rchar,
		   (unsigned long long)acct.wchar,
		   (unsigned long long)acct.syscr,
		   (unsigned long long)acct.syscw,
		   (unsigned long long)acct.read_bytes,
		   (unsigned long long)acct.write_bytes,
		   (unsigned long long)acct.cancelled_write_bytes);
	result = 0;

out_unlock:
	mutex_unlock(&task->signal->cred_guard_mutex);
	return result;
}
2807
/* /proc/<pid>/task/<tid>/io: per-thread counters only. */
static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
				  struct pid *pid, struct task_struct *task)
{
	return do_io_accounting(task, m, 0);
}
2813
/* /proc/<pid>/io: whole thread-group counters. */
static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
				   struct pid *pid, struct task_struct *task)
{
	return do_io_accounting(task, m, 1);
}
2819 #endif /* CONFIG_TASK_IO_ACCOUNTING */
2820
2821 #ifdef CONFIG_USER_NS
/*
 * Common open() for uid_map/gid_map/projid_map: pin the task's user
 * namespace and hand it to the seq_file as private data.  The
 * reference is dropped either on the error path here or later in
 * proc_id_map_release().
 */
static int proc_id_map_open(struct inode *inode, struct file *file,
	const struct seq_operations *seq_ops)
{
	struct user_namespace *ns = NULL;
	struct task_struct *task;
	struct seq_file *seq;
	int ret = -EINVAL;

	task = get_proc_task(inode);
	if (task) {
		rcu_read_lock();
		ns = get_user_ns(task_cred_xxx(task, user_ns));
		rcu_read_unlock();
		put_task_struct(task);
	}
	if (!ns)
		goto err;

	ret = seq_open(file, seq_ops);
	if (ret)
		goto err_put_ns;

	seq = file->private_data;
	seq->private = ns;

	return 0;
err_put_ns:
	put_user_ns(ns);
err:
	return ret;
}
2853
2854 static int proc_id_map_release(struct inode *inode, struct file *file)
2855 {
2856 struct seq_file *seq = file->private_data;
2857 struct user_namespace *ns = seq->private;
2858 put_user_ns(ns);
2859 return seq_release(inode, file);
2860 }
2861
/* open() for /proc/<pid>/uid_map. */
static int proc_uid_map_open(struct inode *inode, struct file *file)
{
	return proc_id_map_open(inode, file, &proc_uid_seq_operations);
}

/* open() for /proc/<pid>/gid_map. */
static int proc_gid_map_open(struct inode *inode, struct file *file)
{
	return proc_id_map_open(inode, file, &proc_gid_seq_operations);
}

/* open() for /proc/<pid>/projid_map. */
static int proc_projid_map_open(struct inode *inode, struct file *file)
{
	return proc_id_map_open(inode, file, &proc_projid_seq_operations);
}
2876
/* File operations for the three /proc/<pid>/{uid,gid,projid}_map files. */
static const struct file_operations proc_uid_map_operations = {
	.open		= proc_uid_map_open,
	.write		= proc_uid_map_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_id_map_release,
};

static const struct file_operations proc_gid_map_operations = {
	.open		= proc_gid_map_open,
	.write		= proc_gid_map_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_id_map_release,
};

static const struct file_operations proc_projid_map_operations = {
	.open		= proc_projid_map_open,
	.write		= proc_projid_map_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_id_map_release,
};
2900
/*
 * open() for /proc/<pid>/setgroups.  Pins the task's user namespace;
 * opening for write additionally requires CAP_SYS_ADMIN in that
 * namespace.  The namespace reference is handed to single_open() as
 * seq_file private data and released in proc_setgroups_release().
 */
static int proc_setgroups_open(struct inode *inode, struct file *file)
{
	struct user_namespace *ns = NULL;
	struct task_struct *task;
	int ret;

	ret = -ESRCH;
	task = get_proc_task(inode);
	if (task) {
		rcu_read_lock();
		ns = get_user_ns(task_cred_xxx(task, user_ns));
		rcu_read_unlock();
		put_task_struct(task);
	}
	if (!ns)
		goto err;

	if (file->f_mode & FMODE_WRITE) {
		ret = -EACCES;
		if (!ns_capable(ns, CAP_SYS_ADMIN))
			goto err_put_ns;
	}

	ret = single_open(file, &proc_setgroups_show, ns);
	if (ret)
		goto err_put_ns;

	return 0;
err_put_ns:
	put_user_ns(ns);
err:
	return ret;
}
2934
/*
 * release() for /proc/<pid>/setgroups: grab the namespace pointer out
 * of the seq_file *before* single_release() frees it, then drop our
 * reference.
 */
static int proc_setgroups_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	int ret = single_release(inode, file);
	put_user_ns(ns);
	return ret;
}
2943
/* File operations for /proc/<pid>/setgroups. */
static const struct file_operations proc_setgroups_operations = {
	.open		= proc_setgroups_open,
	.write		= proc_setgroups_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_setgroups_release,
};
2951 #endif /* CONFIG_USER_NS */
2952
2953 static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
2954 struct pid *pid, struct task_struct *task)
2955 {
2956 int err = lock_trace(task);
2957 if (!err) {
2958 seq_printf(m, "%08x\n", task->personality);
2959 unlock_trace(task);
2960 }
2961 return err;
2962 }
2963
2964 #ifdef CONFIG_LIVEPATCH
/* Show /proc/<pid>/patch_state: the task's livepatch transition state. */
static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
				struct pid *pid, struct task_struct *task)
{
	seq_printf(m, "%d\n", task->patch_state);
	return 0;
}
2971 #endif /* CONFIG_LIVEPATCH */
2972
2973 /*
2974 * Thread groups
2975 */
2976 static const struct file_operations proc_task_operations;
2977 static const struct inode_operations proc_task_inode_operations;
2978
/*
 * Entries in /proc/<tgid>/ (thread-group leader directory).  Keep in
 * rough sync with tid_base_stuff below; see the warning in
 * proc_pident_lookup() before adding entries.
 */
static const struct pid_entry tgid_base_stuff[] = {
	DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
	DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
	DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
	DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
	DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
	DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
	REG("environ", S_IRUSR, proc_environ_operations),
	REG("auxv", S_IRUSR, proc_auxv_operations),
	ONE("status", S_IRUGO, proc_pid_status),
	ONE("personality", S_IRUSR, proc_pid_personality),
	ONE("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
	REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
	REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
	REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
	ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
	REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
	ONE("stat", S_IRUGO, proc_tgid_stat),
	ONE("statm", S_IRUGO, proc_pid_statm),
	REG("maps", S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_NUMA
	REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
#endif
	REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
	LNK("cwd", proc_cwd_link),
	LNK("root", proc_root_link),
	LNK("exe", proc_exe_link),
	REG("mounts", S_IRUGO, proc_mounts_operations),
	REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
	REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
	REG("smaps", S_IRUGO, proc_pid_smaps_operations),
	REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
	DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
	ONE("wchan", S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
	ONE("stack", S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHED_INFO
	ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
	REG("latency", S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
	ONE("cpuset", S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
	ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
	ONE("oom_score", S_IRUGO, proc_oom_score),
	REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
	REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
	REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
	REG("fail-nth", 0644, proc_fail_nth_operations),
#endif
#ifdef CONFIG_ELF_CORE
	REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
	ONE("io", S_IRUSR, proc_tgid_io_accounting),
#endif
#ifdef CONFIG_HARDWALL
	ONE("hardwall", S_IRUGO, proc_pid_hardwall),
#endif
#ifdef CONFIG_USER_NS
	REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
	REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
	REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
	REG("timers", S_IRUGO, proc_timers_operations),
#endif
	REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
#ifdef CONFIG_LIVEPATCH
	ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
};
3077
/* readdir for /proc/<tgid>/, driven by the tgid_base_stuff table. */
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
{
	return proc_pident_readdir(file, ctx,
				   tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
}
3083
/* Directory file operations for /proc/<tgid>/. */
static const struct file_operations proc_tgid_base_operations = {
	.read		= generic_read_dir,
	.iterate_shared	= proc_tgid_base_readdir,
	.llseek		= generic_file_llseek,
};
3089
/* lookup for /proc/<tgid>/, driven by the tgid_base_stuff table. */
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
	return proc_pident_lookup(dir, dentry,
				  tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
}
3095
/* Inode operations for /proc/<tgid>/ (hidepid checks via ->permission). */
static const struct inode_operations proc_tgid_base_inode_operations = {
	.lookup		= proc_tgid_base_lookup,
	.getattr	= pid_getattr,
	.setattr	= proc_setattr,
	.permission	= proc_pid_permission,
};
3102
/*
 * Invalidate the cached dentries for @pid on one proc mount:
 * /proc/<pid> and, for a non-leader thread, /proc/<tgid>/task/<pid>.
 * Purely best-effort dcache cleanup; missing dentries are fine.
 */
static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
{
	struct dentry *dentry, *leader, *dir;
	char buf[PROC_NUMBUF];
	struct qstr name;

	name.name = buf;
	name.len = snprintf(buf, sizeof(buf), "%d", pid);
	/* no ->d_hash() rejects on procfs */
	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
	if (dentry) {
		d_invalidate(dentry);
		dput(dentry);
	}

	/* A group leader has no separate task/<pid> alias. */
	if (pid == tgid)
		return;

	name.name = buf;
	name.len = snprintf(buf, sizeof(buf), "%d", tgid);
	leader = d_hash_and_lookup(mnt->mnt_root, &name);
	if (!leader)
		goto out;

	name.name = "task";
	name.len = strlen(name.name);
	dir = d_hash_and_lookup(leader, &name);
	if (!dir)
		goto out_put_leader;

	name.name = buf;
	name.len = snprintf(buf, sizeof(buf), "%d", pid);
	dentry = d_hash_and_lookup(dir, &name);
	if (dentry) {
		d_invalidate(dentry);
		dput(dentry);
	}

	dput(dir);
out_put_leader:
	dput(leader);
out:
	return;
}
3147
3148 /**
3149 * proc_flush_task - Remove dcache entries for @task from the /proc dcache.
3150 * @task: task that should be flushed.
3151 *
3152 * When flushing dentries from proc, one needs to flush them from global
3153 * proc (proc_mnt) and from all the namespaces' procs this task was seen
3154 * in. This call is supposed to do all of this job.
3155 *
3156 * Looks in the dcache for
3157 * /proc/@pid
3158 * /proc/@tgid/task/@pid
 * if either directory is present, flushes it and all of its children
3160 * from the dcache.
3161 *
3162 * It is safe and reasonable to cache /proc entries for a task until
3163 * that task exits. After that they just clog up the dcache with
3164 * useless entries, possibly causing useful dcache entries to be
 * flushed instead. This routine is provided to flush those useless
3166 * dcache entries at process exit time.
3167 *
3168 * NOTE: This routine is just an optimization so it does not guarantee
3169 * that no dcache entries will exist at process exit time it
3170 * just makes it very unlikely that any will persist.
3171 */
3172
3173 void proc_flush_task(struct task_struct *task)
3174 {
3175 int i;
3176 struct pid *pid, *tgid;
3177 struct upid *upid;
3178
3179 pid = task_pid(task);
3180 tgid = task_tgid(task);
3181
3182 for (i = 0; i <= pid->level; i++) {
3183 upid = &pid->numbers[i];
3184 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
3185 tgid->numbers[i].nr);
3186 }
3187 }
3188
/*
 * Instantiate the /proc/<tgid> directory dentry itself.  Returns 0 on
 * success, -ENOENT if the inode could not be made or the task died
 * before we could hand back the dentry.
 */
static int proc_pid_instantiate(struct inode *dir,
				   struct dentry * dentry,
				   struct task_struct *task, const void *ptr)
{
	struct inode *inode;

	inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
	if (!inode)
		goto out;

	inode->i_op = &proc_tgid_base_inode_operations;
	inode->i_fop = &proc_tgid_base_operations;
	inode->i_flags|=S_IMMUTABLE;

	/* Pre-counted link count for the static tgid entry table. */
	set_nlink(inode, nlink_tgid);

	d_set_d_op(dentry, &pid_dentry_operations);

	d_add(dentry, inode);
	/* Close the race of the process dying before we return the dentry */
	if (pid_revalidate(dentry, 0))
		return 0;
out:
	return -ENOENT;
}
3214
/*
 * Look up a numeric /proc/<tgid> name in this mount's pid namespace.
 * The task reference is taken under rcu_read_lock() so the task cannot
 * be freed between find and get.
 */
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
	int result = -ENOENT;
	struct task_struct *task;
	unsigned tgid;
	struct pid_namespace *ns;

	/* name_to_int() returns ~0U for anything that isn't a plain number. */
	tgid = name_to_int(&dentry->d_name);
	if (tgid == ~0U)
		goto out;

	ns = dentry->d_sb->s_fs_info;
	rcu_read_lock();
	task = find_task_by_pid_ns(tgid, ns);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();
	if (!task)
		goto out;

	result = proc_pid_instantiate(dir, dentry, task, NULL);
	put_task_struct(task);
out:
	return ERR_PTR(result);
}
3240
3241 /*
3242 * Find the first task with tgid >= tgid
3243 *
3244 */
/* Cursor for iterating thread-group leaders; task holds a reference. */
struct tgid_iter {
	unsigned int tgid;
	struct task_struct *task;
};
/*
 * Advance the iterator to the first thread-group leader with
 * tgid >= iter.tgid, dropping the reference on the previous task and
 * taking one on the new task (iter.task is NULL when iteration ends).
 */
static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
{
	struct pid *pid;

	if (iter.task)
		put_task_struct(iter.task);
	rcu_read_lock();
retry:
	iter.task = NULL;
	pid = find_ge_pid(iter.tgid, ns);
	if (pid) {
		iter.tgid = pid_nr_ns(pid, ns);
		iter.task = pid_task(pid, PIDTYPE_PID);
		/* What we want to know is whether the pid we have
		 * found is the pid of a thread_group_leader.  Testing
		 * for task being a thread_group_leader is the obvious
		 * thing to do but there is a window when it fails,
		 * due to the pid transfer logic in de_thread.
		 *
		 * So we perform the straight forward test of seeing
		 * if the pid we have found is the pid of a thread
		 * group leader, and don't worry if the task we have
		 * found doesn't happen to be a thread group leader.
		 * As we don't care in the case of readdir.
		 */
		if (!iter.task || !has_group_leader_pid(iter.task)) {
			iter.tgid += 1;
			goto retry;
		}
		get_task_struct(iter.task);
	}
	rcu_read_unlock();
	return iter;
}
3283
3284 #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)
3285
3286 /* for the /proc/ directory itself, after non-process stuff has been done */
/* for the /proc/ directory itself, after non-process stuff has been done */
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
	struct tgid_iter iter;
	struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info;
	loff_t pos = ctx->pos;

	/* Position encodes tgid + TGID_OFFSET; past the end means done. */
	if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
		return 0;

	/* Two fixed symlinks precede the numeric pid entries. */
	if (pos == TGID_OFFSET - 2) {
		struct inode *inode = d_inode(ns->proc_self);
		if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
			return 0;
		ctx->pos = pos = pos + 1;
	}
	if (pos == TGID_OFFSET - 1) {
		struct inode *inode = d_inode(ns->proc_thread_self);
		if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
			return 0;
		ctx->pos = pos = pos + 1;
	}
	iter.tgid = pos - TGID_OFFSET;
	iter.task = NULL;
	for (iter = next_tgid(ns, iter);
	     iter.task;
	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
		char name[PROC_NUMBUF];
		int len;

		cond_resched();
		/* Skip tasks hidden by the mount's hidepid= option. */
		if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE))
			continue;

		len = snprintf(name, sizeof(name), "%d", iter.tgid);
		ctx->pos = iter.tgid + TGID_OFFSET;
		if (!proc_fill_cache(file, ctx, name, len,
				     proc_pid_instantiate, iter.task, NULL)) {
			/* Buffer full: drop the iterator's task reference. */
			put_task_struct(iter.task);
			return 0;
		}
	}
	ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
	return 0;
}
3331
3332 /*
3333 * proc_tid_comm_permission is a special permission function exclusively
3334 * used for the node /proc/<pid>/task/<tid>/comm.
3335 * It bypasses generic permission checks in the case where a task of the same
3336 * task group attempts to access the node.
3337 * The rationale behind this is that glibc and bionic access this node for
3338 * cross thread naming (pthread_set/getname_np(!self)). However, if
3339 * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
3340 * which locks out the cross thread naming implementation.
3341 * This function makes sure that the node is always accessible for members of
3342 * same thread group.
3343 */
3344 static int proc_tid_comm_permission(struct inode *inode, int mask)
3345 {
3346 bool is_same_tgroup;
3347 struct task_struct *task;
3348
3349 task = get_proc_task(inode);
3350 if (!task)
3351 return -ESRCH;
3352 is_same_tgroup = same_thread_group(current, task);
3353 put_task_struct(task);
3354
3355 if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
3356 /* This file (/proc/<pid>/task/<tid>/comm) can always be
3357 * read or written by the members of the corresponding
3358 * thread group.
3359 */
3360 return 0;
3361 }
3362
3363 return generic_permission(inode, mask);
3364 }
3365
/* Inode operations for /proc/<pid>/task/<tid>/comm (see comment above). */
static const struct inode_operations proc_tid_comm_inode_operations = {
		.permission = proc_tid_comm_permission,
};
3369
3370 /*
3371 * Tasks
3372 */
/*
 * Entries in /proc/<tgid>/task/<tid>/.  Keep in rough sync with
 * tgid_base_stuff above; per-thread entries use the proc_tid_*
 * variants where they exist.
 */
static const struct pid_entry tid_base_stuff[] = {
	DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
	DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
	DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
	DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
	REG("environ", S_IRUSR, proc_environ_operations),
	REG("auxv", S_IRUSR, proc_auxv_operations),
	ONE("status", S_IRUGO, proc_pid_status),
	ONE("personality", S_IRUSR, proc_pid_personality),
	ONE("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
	REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
	NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
			 &proc_tid_comm_inode_operations,
			 &proc_pid_set_comm_operations, {}),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
	ONE("syscall", S_IRUSR, proc_pid_syscall),
#endif
	REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
	ONE("stat", S_IRUGO, proc_tid_stat),
	ONE("statm", S_IRUGO, proc_pid_statm),
	REG("maps", S_IRUGO, proc_tid_maps_operations),
#ifdef CONFIG_PROC_CHILDREN
	REG("children", S_IRUGO, proc_tid_children_operations),
#endif
#ifdef CONFIG_NUMA
	REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),
#endif
	REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
	LNK("cwd", proc_cwd_link),
	LNK("root", proc_root_link),
	LNK("exe", proc_exe_link),
	REG("mounts", S_IRUGO, proc_mounts_operations),
	REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
	REG("smaps", S_IRUGO, proc_tid_smaps_operations),
	REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
	DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
	ONE("wchan", S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
	ONE("stack", S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHED_INFO
	ONE("schedstat", S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
	REG("latency", S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
	ONE("cpuset", S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
	ONE("cgroup", S_IRUGO, proc_cgroup_show),
#endif
	ONE("oom_score", S_IRUGO, proc_oom_score),
	REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
	REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
	REG("sessionid", S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
	REG("fail-nth", 0644, proc_fail_nth_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
	ONE("io", S_IRUSR, proc_tid_io_accounting),
#endif
#ifdef CONFIG_HARDWALL
	ONE("hardwall", S_IRUGO, proc_pid_hardwall),
#endif
#ifdef CONFIG_USER_NS
	REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
	REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
	REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#ifdef CONFIG_LIVEPATCH
	ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
};
3463
/* readdir for /proc/<tgid>/task/<tid>/, driven by tid_base_stuff. */
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
{
	return proc_pident_readdir(file, ctx,
				   tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
}
3469
/* lookup for /proc/<tgid>/task/<tid>/, driven by tid_base_stuff. */
static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
	return proc_pident_lookup(dir, dentry,
				  tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
}
3475
/* Directory file operations for /proc/<tgid>/task/<tid>/. */
static const struct file_operations proc_tid_base_operations = {
	.read		= generic_read_dir,
	.iterate_shared	= proc_tid_base_readdir,
	.llseek		= generic_file_llseek,
};
3481
/* Inode operations for /proc/<tgid>/task/<tid>/. */
static const struct inode_operations proc_tid_base_inode_operations = {
	.lookup		= proc_tid_base_lookup,
	.getattr	= pid_getattr,
	.setattr	= proc_setattr,
};
3487
/*
 * Instantiate a /proc/<tgid>/task/<tid> directory dentry.  Returns 0
 * on success, -ENOENT if the inode could not be made or the task died
 * before we could hand back the dentry.
 */
static int proc_task_instantiate(struct inode *dir,
	struct dentry *dentry, struct task_struct *task, const void *ptr)
{
	struct inode *inode;
	inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);

	if (!inode)
		goto out;
	inode->i_op = &proc_tid_base_inode_operations;
	inode->i_fop = &proc_tid_base_operations;
	inode->i_flags|=S_IMMUTABLE;

	/* Pre-counted link count for the static tid entry table. */
	set_nlink(inode, nlink_tid);

	d_set_d_op(dentry, &pid_dentry_operations);

	d_add(dentry, inode);
	/* Close the race of the process dying before we return the dentry */
	if (pid_revalidate(dentry, 0))
		return 0;
out:
	return -ENOENT;
}
3511
/*
 * Look up a single tid entry inside /proc/[pid]/task.
 *
 * The dentry name must be a plain decimal tid in this mount's pid
 * namespace, and the named task must be in the same thread group as the
 * directory's task; anything else yields -ENOENT.
 */
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
	int result = -ENOENT;
	struct task_struct *task;
	struct task_struct *leader = get_proc_task(dir);
	unsigned tid;
	struct pid_namespace *ns;

	if (!leader)
		goto out_no_task;

	/* name_to_int() returns ~0U when the name is not a valid number */
	tid = name_to_int(&dentry->d_name);
	if (tid == ~0U)
		goto out;

	ns = dentry->d_sb->s_fs_info;
	rcu_read_lock();
	task = find_task_by_pid_ns(tid, ns);
	if (task)
		get_task_struct(task);	/* pin the task beyond the RCU section */
	rcu_read_unlock();
	if (!task)
		goto out;
	if (!same_thread_group(leader, task))
		goto out_drop_task;

	result = proc_task_instantiate(dir, dentry, task, NULL);
out_drop_task:
	put_task_struct(task);
out:
	put_task_struct(leader);
out_no_task:
	return ERR_PTR(result);
}
3546
/*
 * Find the first tid of a thread group to return to user space.
 *
 * Usually this is just the thread group leader, but if the user's
 * buffer was too small or there was a seek into the middle of the
 * directory we have more work to do.
 *
 * In the case of a short read we start with find_task_by_pid_ns.
 *
 * In the case of a seek we start with the leader and walk nr
 * threads past it.
 */
/*
 * Returns the thread at position f_pos in pid's thread group, preferring
 * to resume at the cached tid when it is still valid. The caller owns a
 * reference on the returned task; NULL means no such thread (or overflow).
 */
static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
	struct pid_namespace *ns)
{
	struct task_struct *pos, *task;
	unsigned long nr = f_pos;

	if (nr != f_pos) /* 32bit overflow? */
		return NULL;

	rcu_read_lock();
	task = pid_task(pid, PIDTYPE_PID);
	if (!task)
		goto fail;

	/* Attempt to start with the tid of a thread */
	if (tid && nr) {
		pos = find_task_by_pid_ns(tid, ns);
		/* the cached tid must still be in the same thread group */
		if (pos && same_thread_group(pos, task))
			goto found;
	}

	/* If nr exceeds the number of threads there is nothing to do */
	if (nr >= get_nr_threads(task))
		goto fail;

	/* If we haven't found our starting place yet start
	 * with the leader and walk nr threads forward.
	 */
	pos = task = task->group_leader;
	do {
		if (!nr--)
			goto found;
	} while_each_thread(task, pos);
fail:
	pos = NULL;
	goto out;
found:
	get_task_struct(pos);	/* hand a reference to the caller */
out:
	rcu_read_unlock();
	return pos;
}
3601
3602 /*
3603 * Find the next thread in the thread list.
3604 * Return NULL if there is an error or no next thread.
3605 *
3606 * The reference to the input task_struct is released.
3607 */
static struct task_struct *next_tid(struct task_struct *start)
{
	struct task_struct *pos = NULL;
	rcu_read_lock();
	/* a dead task has been unlinked; its thread list is not walkable */
	if (pid_alive(start)) {
		pos = next_thread(start);
		/* wrapping around to the group leader means we are done */
		if (thread_group_leader(pos))
			pos = NULL;
		else
			get_task_struct(pos);	/* reference for the caller */
	}
	rcu_read_unlock();
	put_task_struct(start);	/* drop the reference taken by our caller */
	return pos;
}
3623
/* for the /proc/TGID/task/ directories */
static int proc_task_readdir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	struct task_struct *task;
	struct pid_namespace *ns;
	int tid;

	if (proc_inode_is_dead(inode))
		return -ENOENT;

	if (!dir_emit_dots(file, ctx))
		return 0;

	/* f_version caches the tid value that the last readdir call couldn't
	 * return. lseek aka telldir automagically resets f_version to 0.
	 */
	ns = inode->i_sb->s_fs_info;
	tid = (int)file->f_version;
	file->f_version = 0;
	/* ctx->pos - 2 accounts for the "." and ".." entries emitted above */
	for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
	     task;
	     task = next_tid(task), ctx->pos++) {
		char name[PROC_NUMBUF];
		int len;
		tid = task_pid_nr_ns(task, ns);
		len = snprintf(name, sizeof(name), "%d", tid);
		if (!proc_fill_cache(file, ctx, name, len,
				proc_task_instantiate, task, NULL)) {
			/* returning this tid failed, save it as the first
			 * tid for the next readdir call */
			file->f_version = (u64)tid;
			put_task_struct(task);
			break;
		}
	}

	return 0;
}
3663
3664 static int proc_task_getattr(const struct path *path, struct kstat *stat,
3665 u32 request_mask, unsigned int query_flags)
3666 {
3667 struct inode *inode = d_inode(path->dentry);
3668 struct task_struct *p = get_proc_task(inode);
3669 generic_fillattr(inode, stat);
3670
3671 if (p) {
3672 stat->nlink += get_nr_threads(p);
3673 put_task_struct(p);
3674 }
3675
3676 return 0;
3677 }
3678
/* Inode operations for the /proc/[pid]/task directory. */
static const struct inode_operations proc_task_inode_operations = {
	.lookup = proc_task_lookup,
	.getattr = proc_task_getattr,
	.setattr = proc_setattr,
	.permission = proc_pid_permission,
};
3685
/* File operations for the /proc/[pid]/task directory. */
static const struct file_operations proc_task_operations = {
	.read = generic_read_dir,
	.iterate_shared = proc_task_readdir,
	.llseek = generic_file_llseek,
};
3691
3692 void __init set_proc_pid_nlink(void)
3693 {
3694 nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3695 nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
3696 }