1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * linux/fs/proc/base.c
4 *
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 *
7 * proc base directory handling functions
8 *
9 * 1999, Al Viro. Rewritten. Now it covers the whole per-process part.
10 * Instead of using magical inumbers to determine the kind of object
11 * we allocate and fill in-core inodes upon lookup. They don't even
12 * go into icache. We cache the reference to task_struct upon lookup too.
13 * Eventually it should become a filesystem in its own right. We don't
14 * use the rest of procfs anymore.
15 *
16 *
17 * Changelog:
18 * 17-Jan-2005
19 * Allan Bezerra
20 * Bruna Moreira <bruna.moreira@indt.org.br>
21 * Edjard Mota <edjard.mota@indt.org.br>
22 * Ilias Biris <ilias.biris@indt.org.br>
23 * Mauricio Lin <mauricio.lin@indt.org.br>
24 *
25 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
26 *
27 * A new process-specific entry (smaps) was included in /proc. It shows the
28 * size of rss for each memory area. The maps entry lacks information
29 * about physical memory size (rss) for each mapped file, i.e.,
30 * rss information for executables and library files.
31 * This additional information is useful for any tools that need to know
32 * about the physical memory consumption of each library a process uses.
33 *
34 * Changelog:
35 * 21-Feb-2005
36 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
37 * PUD inclusion in the page table walking.
38 *
39 * ChangeLog:
40 * 10-Mar-2005
41 * 10LE Instituto Nokia de Tecnologia - INdT:
42 * A better way to walk through the page table, as suggested by Hugh Dickins.
43 *
44 * Simo Piiroinen <simo.piiroinen@nokia.com>:
45 * Smaps information related to shared, private, clean and dirty pages.
46 *
47 * Paul Mundt <paul.mundt@nokia.com>:
48 * Overall revision about smaps.
49 */
50
51 #include <linux/uaccess.h>
52
53 #include <linux/errno.h>
54 #include <linux/time.h>
55 #include <linux/proc_fs.h>
56 #include <linux/stat.h>
57 #include <linux/task_io_accounting_ops.h>
58 #include <linux/init.h>
59 #include <linux/capability.h>
60 #include <linux/file.h>
61 #include <linux/fdtable.h>
62 #include <linux/generic-radix-tree.h>
63 #include <linux/string.h>
64 #include <linux/seq_file.h>
65 #include <linux/namei.h>
66 #include <linux/mnt_namespace.h>
67 #include <linux/mm.h>
68 #include <linux/swap.h>
69 #include <linux/rcupdate.h>
70 #include <linux/kallsyms.h>
71 #include <linux/stacktrace.h>
72 #include <linux/resource.h>
73 #include <linux/module.h>
74 #include <linux/mount.h>
75 #include <linux/security.h>
76 #include <linux/ptrace.h>
77 #include <linux/tracehook.h>
78 #include <linux/printk.h>
79 #include <linux/cache.h>
80 #include <linux/cgroup.h>
81 #include <linux/cpuset.h>
82 #include <linux/audit.h>
83 #include <linux/poll.h>
84 #include <linux/nsproxy.h>
85 #include <linux/oom.h>
86 #include <linux/elf.h>
87 #include <linux/pid_namespace.h>
88 #include <linux/user_namespace.h>
89 #include <linux/fs_struct.h>
90 #include <linux/slab.h>
91 #include <linux/sched/autogroup.h>
92 #include <linux/sched/mm.h>
93 #include <linux/sched/coredump.h>
94 #include <linux/sched/debug.h>
95 #include <linux/sched/stat.h>
96 #include <linux/posix-timers.h>
97 #include <linux/time_namespace.h>
98 #include <linux/resctrl.h>
99 #include <trace/events/oom.h>
100 #include "internal.h"
101 #include "fd.h"
102
103 #include "../../lib/kstrtox.h"
104
105 /* NOTE:
106 * Implementing inode permission operations in /proc is almost
107 * certainly an error. Permission checks need to happen during
108 * each system call, not at open time. The reason is that most of
109 * what we wish to check for permissions in /proc varies at runtime.
110 *
111 * The classic example of a problem is opening file descriptors
112 * in /proc for a task before it execs a suid executable.
113 */
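/*
 * For illustration only (not part of this file): a minimal userspace
 * sketch of the open-before-exec problem described above. If /proc
 * checked permissions only at open time, the descriptor obtained here
 * would keep working after an execve() of a setuid binary:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/proc/self/mem", O_RDWR);
 *
 *		// After the exec, the same fd refers to the memory of a
 *		// privileged image, so the check must be repeated on
 *		// every read/write rather than done once at open time.
 *		execl("/usr/bin/some-setuid-tool", "some-setuid-tool",
 *		      (char *)NULL);
 *		return 1;	// only reached if execl() fails
 *	}
 *
 * "/usr/bin/some-setuid-tool" is a hypothetical path, used only to make
 * the scenario concrete.
 */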
114
115 static u8 nlink_tid __ro_after_init;
116 static u8 nlink_tgid __ro_after_init;
117
118 struct pid_entry {
119 const char *name;
120 unsigned int len;
121 umode_t mode;
122 const struct inode_operations *iop;
123 const struct file_operations *fop;
124 union proc_op op;
125 };
126
127 #define NOD(NAME, MODE, IOP, FOP, OP) { \
128 .name = (NAME), \
129 .len = sizeof(NAME) - 1, \
130 .mode = MODE, \
131 .iop = IOP, \
132 .fop = FOP, \
133 .op = OP, \
134 }
135
136 #define DIR(NAME, MODE, iops, fops) \
137 NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
138 #define LNK(NAME, get_link) \
139 NOD(NAME, (S_IFLNK|S_IRWXUGO), \
140 &proc_pid_link_inode_operations, NULL, \
141 { .proc_get_link = get_link } )
142 #define REG(NAME, MODE, fops) \
143 NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
144 #define ONE(NAME, MODE, show) \
145 NOD(NAME, (S_IFREG|(MODE)), \
146 NULL, &proc_single_file_operations, \
147 { .proc_show = show } )
148 #define ATTR(LSM, NAME, MODE) \
149 NOD(NAME, (S_IFREG|(MODE)), \
150 NULL, &proc_pid_attr_operations, \
151 { .lsm = LSM })
152
153 /*
154 * Count the number of hardlinks for the pid_entry table, excluding the .
155 * and .. links.
156 */
157 static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
158 unsigned int n)
159 {
160 unsigned int i;
161 unsigned int count;
162
163 count = 2;
164 for (i = 0; i < n; ++i) {
165 if (S_ISDIR(entries[i].mode))
166 ++count;
167 }
168
169 return count;
170 }
171
172 static int get_task_root(struct task_struct *task, struct path *root)
173 {
174 int result = -ENOENT;
175
176 task_lock(task);
177 if (task->fs) {
178 get_fs_root(task->fs, root);
179 result = 0;
180 }
181 task_unlock(task);
182 return result;
183 }
184
185 static int proc_cwd_link(struct dentry *dentry, struct path *path)
186 {
187 struct task_struct *task = get_proc_task(d_inode(dentry));
188 int result = -ENOENT;
189
190 if (task) {
191 task_lock(task);
192 if (task->fs) {
193 get_fs_pwd(task->fs, path);
194 result = 0;
195 }
196 task_unlock(task);
197 put_task_struct(task);
198 }
199 return result;
200 }
201
202 static int proc_root_link(struct dentry *dentry, struct path *path)
203 {
204 struct task_struct *task = get_proc_task(d_inode(dentry));
205 int result = -ENOENT;
206
207 if (task) {
208 result = get_task_root(task, path);
209 put_task_struct(task);
210 }
211 return result;
212 }
213
214 /*
215 * If the user used setproctitle(), we just get the string from
216 * user space at arg_start, and limit it to a maximum of one page.
217 */
218 static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
219 size_t count, unsigned long pos,
220 unsigned long arg_start)
221 {
222 char *page;
223 int ret, got;
224
225 if (pos >= PAGE_SIZE)
226 return 0;
227
228 page = (char *)__get_free_page(GFP_KERNEL);
229 if (!page)
230 return -ENOMEM;
231
232 ret = 0;
233 got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
234 if (got > 0) {
235 int len = strnlen(page, got);
236
237 /* Include the NUL character if it was found */
238 if (len < got)
239 len++;
240
241 if (len > pos) {
242 len -= pos;
243 if (len > count)
244 len = count;
245 len -= copy_to_user(buf, page+pos, len);
246 if (!len)
247 len = -EFAULT;
248 ret = len;
249 }
250 }
251 free_page((unsigned long)page);
252 return ret;
253 }
254
255 static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
256 size_t count, loff_t *ppos)
257 {
258 unsigned long arg_start, arg_end, env_start, env_end;
259 unsigned long pos, len;
260 char *page, c;
261
262 /* Check if the process spawned far enough to have a cmdline. */
263 if (!mm->env_end)
264 return 0;
265
266 spin_lock(&mm->arg_lock);
267 arg_start = mm->arg_start;
268 arg_end = mm->arg_end;
269 env_start = mm->env_start;
270 env_end = mm->env_end;
271 spin_unlock(&mm->arg_lock);
272
273 if (arg_start >= arg_end)
274 return 0;
275
276 /*
277 * We allow setproctitle() to overwrite the argument
278 * strings, and overflow past the original end. But
279 * only when it overflows into the environment area.
280 */
281 if (env_start != arg_end || env_end < env_start)
282 env_start = env_end = arg_end;
283 len = env_end - arg_start;
284
285 /* We're not going to care if "*ppos" has high bits set */
286 pos = *ppos;
287 if (pos >= len)
288 return 0;
289 if (count > len - pos)
290 count = len - pos;
291 if (!count)
292 return 0;
293
294 /*
295 * Magical special case: if the argv[] end byte is not
296 * zero, the user has overwritten it with setproctitle(3).
297 *
298 * Possible future enhancement: do this only once when
299 * pos is 0, and set a flag in the 'struct file'.
300 */
301 if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
302 return get_mm_proctitle(mm, buf, count, pos, arg_start);
303
304 /*
305 * For the non-setproctitle() case we limit things strictly
306 * to the [arg_start, arg_end[ range.
307 */
308 pos += arg_start;
309 if (pos < arg_start || pos >= arg_end)
310 return 0;
311 if (count > arg_end - pos)
312 count = arg_end - pos;
313
314 page = (char *)__get_free_page(GFP_KERNEL);
315 if (!page)
316 return -ENOMEM;
317
318 len = 0;
319 while (count) {
320 int got;
321 size_t size = min_t(size_t, PAGE_SIZE, count);
322
323 got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
324 if (got <= 0)
325 break;
326 got -= copy_to_user(buf, page, got);
327 if (unlikely(!got)) {
328 if (!len)
329 len = -EFAULT;
330 break;
331 }
332 pos += got;
333 buf += got;
334 len += got;
335 count -= got;
336 }
337
338 free_page((unsigned long)page);
339 return len;
340 }
341
342 static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf,
343 size_t count, loff_t *pos)
344 {
345 struct mm_struct *mm;
346 ssize_t ret;
347
348 mm = get_task_mm(tsk);
349 if (!mm)
350 return 0;
351
352 ret = get_mm_cmdline(mm, buf, count, pos);
353 mmput(mm);
354 return ret;
355 }
356
357 static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
358 size_t count, loff_t *pos)
359 {
360 struct task_struct *tsk;
361 ssize_t ret;
362
363 BUG_ON(*pos < 0);
364
365 tsk = get_proc_task(file_inode(file));
366 if (!tsk)
367 return -ESRCH;
368 ret = get_task_cmdline(tsk, buf, count, pos);
369 put_task_struct(tsk);
370 if (ret > 0)
371 *pos += ret;
372 return ret;
373 }
374
375 static const struct file_operations proc_pid_cmdline_ops = {
376 .read = proc_pid_cmdline_read,
377 .llseek = generic_file_llseek,
378 };
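/*
 * For illustration (userspace, not kernel code): the bytes returned
 * above are the raw argv area, i.e. a sequence of '\0'-terminated
 * strings, so readers must split on NUL rather than on whitespace. A
 * minimal sketch:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char buf[4096];
 *		FILE *f = fopen("/proc/self/cmdline", "r");
 *		size_t i, n;
 *
 *		if (!f)
 *			return 1;
 *		n = fread(buf, 1, sizeof(buf), f);
 *		for (i = 0; i < n; i++)		// print argv[] space-separated
 *			putchar(buf[i] ? buf[i] : ' ');
 *		putchar('\n');
 *		fclose(f);
 *		return 0;
 *	}
 */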
379
380 #ifdef CONFIG_KALLSYMS
381 /*
382 * Provides a wchan file via kallsyms in a proper one-value-per-file format.
383 * Prints the resolved symbol; if resolution fails, simply prints 0.
384 */
385 static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
386 struct pid *pid, struct task_struct *task)
387 {
388 unsigned long wchan;
389 char symname[KSYM_NAME_LEN];
390
391 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
392 goto print0;
393
394 wchan = get_wchan(task);
395 if (wchan && !lookup_symbol_name(wchan, symname)) {
396 seq_puts(m, symname);
397 return 0;
398 }
399
400 print0:
401 seq_putc(m, '0');
402 return 0;
403 }
404 #endif /* CONFIG_KALLSYMS */
405
406 static int lock_trace(struct task_struct *task)
407 {
408 int err = down_read_killable(&task->signal->exec_update_lock);
409 if (err)
410 return err;
411 if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
412 up_read(&task->signal->exec_update_lock);
413 return -EPERM;
414 }
415 return 0;
416 }
417
418 static void unlock_trace(struct task_struct *task)
419 {
420 up_read(&task->signal->exec_update_lock);
421 }
422
423 #ifdef CONFIG_STACKTRACE
424
425 #define MAX_STACK_TRACE_DEPTH 64
426
427 static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
428 struct pid *pid, struct task_struct *task)
429 {
430 unsigned long *entries;
431 int err;
432
433 /*
434 * The ability to racily run the kernel stack unwinder on a running task
435 * and then observe the unwinder output is scary; while it is useful for
436 * debugging kernel issues, it can also allow an attacker to leak kernel
437 * stack contents.
438 * Doing this in a manner that is at least safe from races would require
440 * some work to ensure that the remote task cannot be scheduled; and
440 * even then, this would still expose the unwinder as local attack
441 * surface.
442 * Therefore, this interface is restricted to root.
443 */
444 if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
445 return -EACCES;
446
447 entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
448 GFP_KERNEL);
449 if (!entries)
450 return -ENOMEM;
451
452 err = lock_trace(task);
453 if (!err) {
454 unsigned int i, nr_entries;
455
456 nr_entries = stack_trace_save_tsk(task, entries,
457 MAX_STACK_TRACE_DEPTH, 0);
458
459 for (i = 0; i < nr_entries; i++) {
460 seq_printf(m, "[<0>] %pB\n", (void *)entries[i]);
461 }
462
463 unlock_trace(task);
464 }
465 kfree(entries);
466
467 return err;
468 }
469 #endif
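/*
 * Note on the output format: each line above has the fixed shape
 * "[<0>] <symbol>", where %pB expands to something like
 * "do_wait+0x18c/0x230" (offsets illustrative). The "[<0>]" prefix
 * deliberately replaces the raw stack slot address, and, per the
 * CAP_SYS_ADMIN check above, only root may read /proc/PID/stack at all.
 */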
470
471 #ifdef CONFIG_SCHED_INFO
472 /*
473 * Provides /proc/PID/schedstat
474 */
475 static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
476 struct pid *pid, struct task_struct *task)
477 {
478 if (unlikely(!sched_info_on()))
479 seq_puts(m, "0 0 0\n");
480 else
481 seq_printf(m, "%llu %llu %lu\n",
482 (unsigned long long)task->se.sum_exec_runtime,
483 (unsigned long long)task->sched_info.run_delay,
484 task->sched_info.pcount);
485
486 return 0;
487 }
488 #endif
489
490 #ifdef CONFIG_LATENCYTOP
491 static int lstats_show_proc(struct seq_file *m, void *v)
492 {
493 int i;
494 struct inode *inode = m->private;
495 struct task_struct *task = get_proc_task(inode);
496
497 if (!task)
498 return -ESRCH;
499 seq_puts(m, "Latency Top version : v0.1\n");
500 for (i = 0; i < LT_SAVECOUNT; i++) {
501 struct latency_record *lr = &task->latency_record[i];
502 if (lr->backtrace[0]) {
503 int q;
504 seq_printf(m, "%i %li %li",
505 lr->count, lr->time, lr->max);
506 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
507 unsigned long bt = lr->backtrace[q];
508
509 if (!bt)
510 break;
511 seq_printf(m, " %ps", (void *)bt);
512 }
513 seq_putc(m, '\n');
514 }
515
516 }
517 put_task_struct(task);
518 return 0;
519 }
520
521 static int lstats_open(struct inode *inode, struct file *file)
522 {
523 return single_open(file, lstats_show_proc, inode);
524 }
525
526 static ssize_t lstats_write(struct file *file, const char __user *buf,
527 size_t count, loff_t *offs)
528 {
529 struct task_struct *task = get_proc_task(file_inode(file));
530
531 if (!task)
532 return -ESRCH;
533 clear_tsk_latency_tracing(task);
534 put_task_struct(task);
535
536 return count;
537 }
538
539 static const struct file_operations proc_lstats_operations = {
540 .open = lstats_open,
541 .read = seq_read,
542 .write = lstats_write,
543 .llseek = seq_lseek,
544 .release = single_release,
545 };
546
547 #endif
548
549 static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
550 struct pid *pid, struct task_struct *task)
551 {
552 unsigned long totalpages = totalram_pages() + total_swap_pages;
553 unsigned long points = 0;
554 long badness;
555
556 badness = oom_badness(task, totalpages);
557 /*
558 * Special-case OOM_SCORE_ADJ_MIN; for all others, scale the
559 * badness value into the [0, 2000] range, which we have been
560 * exporting for a long time, so userspace might depend on it.
561 */
562 if (badness != LONG_MIN)
563 points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;
564
565 seq_printf(m, "%lu\n", points);
566
567 return 0;
568 }
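/*
 * Worked example of the scaling above (values illustrative): with
 * badness == totalpages / 2 (the task accounts for half of RAM plus
 * swap, oom_score_adj == 0), points = (1000 + 500) * 2 / 3 = 1000.
 * The extremes map accordingly: badness == -totalpages yields 0 and
 * badness == 2 * totalpages (everything, plus oom_score_adj = 1000)
 * yields 2000, preserving the historical [0, 2000] range.
 */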
569
570 struct limit_names {
571 const char *name;
572 const char *unit;
573 };
574
575 static const struct limit_names lnames[RLIM_NLIMITS] = {
576 [RLIMIT_CPU] = {"Max cpu time", "seconds"},
577 [RLIMIT_FSIZE] = {"Max file size", "bytes"},
578 [RLIMIT_DATA] = {"Max data size", "bytes"},
579 [RLIMIT_STACK] = {"Max stack size", "bytes"},
580 [RLIMIT_CORE] = {"Max core file size", "bytes"},
581 [RLIMIT_RSS] = {"Max resident set", "bytes"},
582 [RLIMIT_NPROC] = {"Max processes", "processes"},
583 [RLIMIT_NOFILE] = {"Max open files", "files"},
584 [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
585 [RLIMIT_AS] = {"Max address space", "bytes"},
586 [RLIMIT_LOCKS] = {"Max file locks", "locks"},
587 [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
588 [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
589 [RLIMIT_NICE] = {"Max nice priority", NULL},
590 [RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
591 [RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
592 };
593
594 /* Display limits for a process */
595 static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
596 struct pid *pid, struct task_struct *task)
597 {
598 unsigned int i;
599 unsigned long flags;
600
601 struct rlimit rlim[RLIM_NLIMITS];
602
603 if (!lock_task_sighand(task, &flags))
604 return 0;
605 memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
606 unlock_task_sighand(task, &flags);
607
608 /*
609 * print the file header
610 */
611 seq_puts(m, "Limit "
612 "Soft Limit "
613 "Hard Limit "
614 "Units \n");
615
616 for (i = 0; i < RLIM_NLIMITS; i++) {
617 if (rlim[i].rlim_cur == RLIM_INFINITY)
618 seq_printf(m, "%-25s %-20s ",
619 lnames[i].name, "unlimited");
620 else
621 seq_printf(m, "%-25s %-20lu ",
622 lnames[i].name, rlim[i].rlim_cur);
623
624 if (rlim[i].rlim_max == RLIM_INFINITY)
625 seq_printf(m, "%-20s ", "unlimited");
626 else
627 seq_printf(m, "%-20lu ", rlim[i].rlim_max);
628
629 if (lnames[i].unit)
630 seq_printf(m, "%-10s\n", lnames[i].unit);
631 else
632 seq_putc(m, '\n');
633 }
634
635 return 0;
636 }
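/*
 * The resulting layout, with illustrative values:
 *
 *	Limit                     Soft Limit           Hard Limit           Units
 *	Max cpu time              unlimited            unlimited            seconds
 *	Max open files            1024                 1048576              files
 */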
637
638 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
639 static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
640 struct pid *pid, struct task_struct *task)
641 {
642 struct syscall_info info;
643 u64 *args = &info.data.args[0];
644 int res;
645
646 res = lock_trace(task);
647 if (res)
648 return res;
649
650 if (task_current_syscall(task, &info))
651 seq_puts(m, "running\n");
652 else if (info.data.nr < 0)
653 seq_printf(m, "%d 0x%llx 0x%llx\n",
654 info.data.nr, info.sp, info.data.instruction_pointer);
655 else
656 seq_printf(m,
657 "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
658 info.data.nr,
659 args[0], args[1], args[2], args[3], args[4], args[5],
660 info.sp, info.data.instruction_pointer);
661 unlock_trace(task);
662
663 return 0;
664 }
665 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
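/*
 * So a read of /proc/PID/syscall yields one of three forms: the literal
 * "running" when the task is currently on a CPU, "<nr> <sp> <ip>" with
 * nr < 0 when the task is blocked in the kernel but not in a syscall,
 * or "<nr> <arg0>..<arg5> <sp> <ip>" for a task blocked in syscall nr,
 * with all pointer values in 0x-prefixed hex per the formats above.
 */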
666
667 /************************************************************************/
668 /* Here the fs part begins */
669 /************************************************************************/
670
671 /* permission checks */
672 static int proc_fd_access_allowed(struct inode *inode)
673 {
674 struct task_struct *task;
675 int allowed = 0;
676 /* Allow access to a task's file descriptors if it is us, or if we
677 * may use ptrace to attach to the process and find out that
678 * information.
679 */
680 task = get_proc_task(inode);
681 if (task) {
682 allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
683 put_task_struct(task);
684 }
685 return allowed;
686 }
687
688 int proc_setattr(struct dentry *dentry, struct iattr *attr)
689 {
690 int error;
691 struct inode *inode = d_inode(dentry);
692
693 if (attr->ia_valid & ATTR_MODE)
694 return -EPERM;
695
696 error = setattr_prepare(dentry, attr);
697 if (error)
698 return error;
699
700 setattr_copy(inode, attr);
701 mark_inode_dirty(inode);
702 return 0;
703 }
704
705 /*
706 * May current process learn task's sched/cmdline info (for hide_pid_min=1)
707 * or euid/egid (for hide_pid_min=2)?
708 */
709 static bool has_pid_permissions(struct proc_fs_info *fs_info,
710 struct task_struct *task,
711 enum proc_hidepid hide_pid_min)
712 {
713 /*
714 * If the 'hidepid' mount option is set, force a ptrace check;
715 * we indicate that we are using a filesystem syscall
716 * by passing PTRACE_MODE_READ_FSCREDS.
717 */
718 if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE)
719 return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
720
721 if (fs_info->hide_pid < hide_pid_min)
722 return true;
723 if (in_group_p(fs_info->pid_gid))
724 return true;
725 return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
726 }
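/*
 * These checks implement the hidepid= mount option; e.g. (illustrative)
 *
 *	mount -o remount,hidepid=invisible,gid=adm /proc
 *
 * leaves other users' /proc/PID directories visible only to members of
 * group "adm" and to callers who could ptrace the task anyway.
 */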
727
728
729 static int proc_pid_permission(struct inode *inode, int mask)
730 {
731 struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
732 struct task_struct *task;
733 bool has_perms;
734
735 task = get_proc_task(inode);
736 if (!task)
737 return -ESRCH;
738 has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
739 put_task_struct(task);
740
741 if (!has_perms) {
742 if (fs_info->hide_pid == HIDEPID_INVISIBLE) {
743 /*
744 * Let's make getdents(), stat(), and open()
745 * consistent with each other. If a process
746 * may not stat() a file, it shouldn't be seen
747 * in procfs at all.
748 */
749 return -ENOENT;
750 }
751
752 return -EPERM;
753 }
754 return generic_permission(inode, mask);
755 }
756
757
758
759 static const struct inode_operations proc_def_inode_operations = {
760 .setattr = proc_setattr,
761 };
762
763 static int proc_single_show(struct seq_file *m, void *v)
764 {
765 struct inode *inode = m->private;
766 struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
767 struct pid *pid = proc_pid(inode);
768 struct task_struct *task;
769 int ret;
770
771 task = get_pid_task(pid, PIDTYPE_PID);
772 if (!task)
773 return -ESRCH;
774
775 ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
776
777 put_task_struct(task);
778 return ret;
779 }
780
781 static int proc_single_open(struct inode *inode, struct file *filp)
782 {
783 return single_open(filp, proc_single_show, inode);
784 }
785
786 static const struct file_operations proc_single_file_operations = {
787 .open = proc_single_open,
788 .read = seq_read,
789 .llseek = seq_lseek,
790 .release = single_release,
791 };
792
793
794 struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
795 {
796 struct task_struct *task = get_proc_task(inode);
797 struct mm_struct *mm = ERR_PTR(-ESRCH);
798
799 if (task) {
800 mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
801 put_task_struct(task);
802
803 if (!IS_ERR_OR_NULL(mm)) {
804 /* ensure this mm_struct can't be freed */
805 mmgrab(mm);
806 /* but do not pin its memory */
807 mmput(mm);
808 }
809 }
810
811 return mm;
812 }
813
814 static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
815 {
816 struct mm_struct *mm = proc_mem_open(inode, mode);
817
818 if (IS_ERR(mm))
819 return PTR_ERR(mm);
820
821 file->private_data = mm;
822 return 0;
823 }
824
825 static int mem_open(struct inode *inode, struct file *file)
826 {
827 int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
828
829 /* OK to pass negative loff_t, we can catch out-of-range */
830 file->f_mode |= FMODE_UNSIGNED_OFFSET;
831
832 return ret;
833 }
834
835 static ssize_t mem_rw(struct file *file, char __user *buf,
836 size_t count, loff_t *ppos, int write)
837 {
838 struct mm_struct *mm = file->private_data;
839 unsigned long addr = *ppos;
840 ssize_t copied;
841 char *page;
842 unsigned int flags;
843
844 if (!mm)
845 return 0;
846
847 page = (char *)__get_free_page(GFP_KERNEL);
848 if (!page)
849 return -ENOMEM;
850
851 copied = 0;
852 if (!mmget_not_zero(mm))
853 goto free;
854
855 flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
856
857 while (count > 0) {
858 size_t this_len = min_t(size_t, count, PAGE_SIZE);
859
860 if (write && copy_from_user(page, buf, this_len)) {
861 copied = -EFAULT;
862 break;
863 }
864
865 this_len = access_remote_vm(mm, addr, page, this_len, flags);
866 if (!this_len) {
867 if (!copied)
868 copied = -EIO;
869 break;
870 }
871
872 if (!write && copy_to_user(buf, page, this_len)) {
873 copied = -EFAULT;
874 break;
875 }
876
877 buf += this_len;
878 addr += this_len;
879 copied += this_len;
880 count -= this_len;
881 }
882 *ppos = addr;
883
884 mmput(mm);
885 free:
886 free_page((unsigned long) page);
887 return copied;
888 }
889
890 static ssize_t mem_read(struct file *file, char __user *buf,
891 size_t count, loff_t *ppos)
892 {
893 return mem_rw(file, buf, count, ppos, 0);
894 }
895
896 static ssize_t mem_write(struct file *file, const char __user *buf,
897 size_t count, loff_t *ppos)
898 {
899 return mem_rw(file, (char __user*)buf, count, ppos, 1);
900 }
901
902 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
903 {
904 switch (orig) {
905 case 0:
906 file->f_pos = offset;
907 break;
908 case 1:
909 file->f_pos += offset;
910 break;
911 default:
912 return -EINVAL;
913 }
914 force_successful_syscall_return();
915 return file->f_pos;
916 }
917
918 static int mem_release(struct inode *inode, struct file *file)
919 {
920 struct mm_struct *mm = file->private_data;
921 if (mm)
922 mmdrop(mm);
923 return 0;
924 }
925
926 static const struct file_operations proc_mem_operations = {
927 .llseek = mem_lseek,
928 .read = mem_read,
929 .write = mem_write,
930 .open = mem_open,
931 .release = mem_release,
932 };
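/*
 * For illustration (userspace, not kernel code): since mem_lseek()
 * honours the full unsigned offset range, a debugger-style reader can
 * simply pread() at a virtual address, provided it could also
 * ptrace-attach (PTRACE_MODE_ATTACH above). A minimal sketch, where
 * "pid" and "addr" are assumed to come from the caller:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	ssize_t peek(pid_t pid, off_t addr, void *out, size_t len)
 *	{
 *		char path[64];
 *		ssize_t n;
 *		int fd;
 *
 *		snprintf(path, sizeof(path), "/proc/%d/mem", (int)pid);
 *		fd = open(path, O_RDONLY);
 *		if (fd < 0)
 *			return -1;
 *		n = pread(fd, out, len, addr);	// offset == virtual address
 *		close(fd);
 *		return n;
 *	}
 */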
933
934 static int environ_open(struct inode *inode, struct file *file)
935 {
936 return __mem_open(inode, file, PTRACE_MODE_READ);
937 }
938
939 static ssize_t environ_read(struct file *file, char __user *buf,
940 size_t count, loff_t *ppos)
941 {
942 char *page;
943 unsigned long src = *ppos;
944 int ret = 0;
945 struct mm_struct *mm = file->private_data;
946 unsigned long env_start, env_end;
947
948 /* Ensure the process spawned far enough to have an environment. */
949 if (!mm || !mm->env_end)
950 return 0;
951
952 page = (char *)__get_free_page(GFP_KERNEL);
953 if (!page)
954 return -ENOMEM;
955
956 ret = 0;
957 if (!mmget_not_zero(mm))
958 goto free;
959
960 spin_lock(&mm->arg_lock);
961 env_start = mm->env_start;
962 env_end = mm->env_end;
963 spin_unlock(&mm->arg_lock);
964
965 while (count > 0) {
966 size_t this_len, max_len;
967 int retval;
968
969 if (src >= (env_end - env_start))
970 break;
971
972 this_len = env_end - (env_start + src);
973
974 max_len = min_t(size_t, PAGE_SIZE, count);
975 this_len = min(max_len, this_len);
976
977 retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);
978
979 if (retval <= 0) {
980 ret = retval;
981 break;
982 }
983
984 if (copy_to_user(buf, page, retval)) {
985 ret = -EFAULT;
986 break;
987 }
988
989 ret += retval;
990 src += retval;
991 buf += retval;
992 count -= retval;
993 }
994 *ppos = src;
995 mmput(mm);
996
997 free:
998 free_page((unsigned long) page);
999 return ret;
1000 }
1001
1002 static const struct file_operations proc_environ_operations = {
1003 .open = environ_open,
1004 .read = environ_read,
1005 .llseek = generic_file_llseek,
1006 .release = mem_release,
1007 };
1008
1009 static int auxv_open(struct inode *inode, struct file *file)
1010 {
1011 return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
1012 }
1013
1014 static ssize_t auxv_read(struct file *file, char __user *buf,
1015 size_t count, loff_t *ppos)
1016 {
1017 struct mm_struct *mm = file->private_data;
1018 unsigned int nwords = 0;
1019
1020 if (!mm)
1021 return 0;
1022 do {
1023 nwords += 2;
1024 } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
1025 return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
1026 nwords * sizeof(mm->saved_auxv[0]));
1027 }
1028
1029 static const struct file_operations proc_auxv_operations = {
1030 .open = auxv_open,
1031 .read = auxv_read,
1032 .llseek = generic_file_llseek,
1033 .release = mem_release,
1034 };
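/*
 * The data returned above is the raw ELF auxiliary vector: pairs of
 * (type, value) words terminated by an AT_NULL entry, exactly as the
 * saved_auxv[] scan assumes. For its own process, userspace would
 * normally call getauxval(3) rather than parse /proc/self/auxv.
 */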
1035
1036 static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
1037 loff_t *ppos)
1038 {
1039 struct task_struct *task = get_proc_task(file_inode(file));
1040 char buffer[PROC_NUMBUF];
1041 int oom_adj = OOM_ADJUST_MIN;
1042 size_t len;
1043
1044 if (!task)
1045 return -ESRCH;
1046 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
1047 oom_adj = OOM_ADJUST_MAX;
1048 else
1049 oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
1050 OOM_SCORE_ADJ_MAX;
1051 put_task_struct(task);
1052 if (oom_adj > OOM_ADJUST_MAX)
1053 oom_adj = OOM_ADJUST_MAX;
1054 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
1055 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1056 }
1057
1058 static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
1059 {
1060 struct mm_struct *mm = NULL;
1061 struct task_struct *task;
1062 int err = 0;
1063
1064 task = get_proc_task(file_inode(file));
1065 if (!task)
1066 return -ESRCH;
1067
1068 mutex_lock(&oom_adj_mutex);
1069 if (legacy) {
1070 if (oom_adj < task->signal->oom_score_adj &&
1071 !capable(CAP_SYS_RESOURCE)) {
1072 err = -EACCES;
1073 goto err_unlock;
1074 }
1075 /*
1076 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
1077 * /proc/pid/oom_score_adj instead.
1078 */
1079 pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
1080 current->comm, task_pid_nr(current), task_pid_nr(task),
1081 task_pid_nr(task));
1082 } else {
1083 if ((short)oom_adj < task->signal->oom_score_adj_min &&
1084 !capable(CAP_SYS_RESOURCE)) {
1085 err = -EACCES;
1086 goto err_unlock;
1087 }
1088 }
1089
1090 /*
1091 * Make sure we will check other processes sharing the mm if this is
1092 * not a vfork child, which wants its own oom_score_adj.
1093 * Pin the mm so it doesn't go away and get reused after task_unlock.
1094 */
1095 if (!task->vfork_done) {
1096 struct task_struct *p = find_lock_task_mm(task);
1097
1098 if (p) {
1099 if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
1100 mm = p->mm;
1101 mmgrab(mm);
1102 }
1103 task_unlock(p);
1104 }
1105 }
1106
1107 task->signal->oom_score_adj = oom_adj;
1108 if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
1109 task->signal->oom_score_adj_min = (short)oom_adj;
1110 trace_oom_score_adj_update(task);
1111
1112 if (mm) {
1113 struct task_struct *p;
1114
1115 rcu_read_lock();
1116 for_each_process(p) {
1117 if (same_thread_group(task, p))
1118 continue;
1119
1120 /* do not touch kernel threads or the global init */
1121 if (p->flags & PF_KTHREAD || is_global_init(p))
1122 continue;
1123
1124 task_lock(p);
1125 if (!p->vfork_done && process_shares_mm(p, mm)) {
1126 p->signal->oom_score_adj = oom_adj;
1127 if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
1128 p->signal->oom_score_adj_min = (short)oom_adj;
1129 }
1130 task_unlock(p);
1131 }
1132 rcu_read_unlock();
1133 mmdrop(mm);
1134 }
1135 err_unlock:
1136 mutex_unlock(&oom_adj_mutex);
1137 put_task_struct(task);
1138 return err;
1139 }
1140
1141 /*
1142 * /proc/pid/oom_adj exists solely for backwards compatibility with previous
1143 * kernels. The effective policy is defined by oom_score_adj, which has a
1144 * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
1145 * Values written to oom_adj are simply mapped linearly to oom_score_adj.
1146 * Processes that become oom disabled via oom_adj will still be oom disabled
1147 * with this implementation.
1148 *
1149 * oom_adj cannot be removed since existing userspace binaries use it.
1150 */
1151 static ssize_t oom_adj_write(struct file *file, const char __user *buf,
1152 size_t count, loff_t *ppos)
1153 {
1154 char buffer[PROC_NUMBUF];
1155 int oom_adj;
1156 int err;
1157
1158 memset(buffer, 0, sizeof(buffer));
1159 if (count > sizeof(buffer) - 1)
1160 count = sizeof(buffer) - 1;
1161 if (copy_from_user(buffer, buf, count)) {
1162 err = -EFAULT;
1163 goto out;
1164 }
1165
1166 err = kstrtoint(strstrip(buffer), 0, &oom_adj);
1167 if (err)
1168 goto out;
1169 if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
1170 oom_adj != OOM_DISABLE) {
1171 err = -EINVAL;
1172 goto out;
1173 }
1174
1175 /*
1176 * Scale /proc/pid/oom_score_adj appropriately, ensuring that a maximum
1177 * value is always attainable.
1178 */
1179 if (oom_adj == OOM_ADJUST_MAX)
1180 oom_adj = OOM_SCORE_ADJ_MAX;
1181 else
1182 oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
1183
1184 err = __set_oom_adj(file, oom_adj, true);
1185 out:
1186 return err < 0 ? err : count;
1187 }
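/*
 * Worked examples of the legacy mapping above (OOM_DISABLE == -17,
 * OOM_SCORE_ADJ_MAX == 1000): writing 15 (OOM_ADJUST_MAX) stores 1000,
 * writing -17 (OOM_DISABLE) stores -17 * 1000 / 17 == -1000 and thus
 * still disables the OOM killer, and writing 8 stores 8000 / 17 == 470.
 */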
1188
1189 static const struct file_operations proc_oom_adj_operations = {
1190 .read = oom_adj_read,
1191 .write = oom_adj_write,
1192 .llseek = generic_file_llseek,
1193 };
1194
1195 static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
1196 size_t count, loff_t *ppos)
1197 {
1198 struct task_struct *task = get_proc_task(file_inode(file));
1199 char buffer[PROC_NUMBUF];
1200 short oom_score_adj = OOM_SCORE_ADJ_MIN;
1201 size_t len;
1202
1203 if (!task)
1204 return -ESRCH;
1205 oom_score_adj = task->signal->oom_score_adj;
1206 put_task_struct(task);
1207 len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
1208 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1209 }
1210
1211 static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1212 size_t count, loff_t *ppos)
1213 {
1214 char buffer[PROC_NUMBUF];
1215 int oom_score_adj;
1216 int err;
1217
1218 memset(buffer, 0, sizeof(buffer));
1219 if (count > sizeof(buffer) - 1)
1220 count = sizeof(buffer) - 1;
1221 if (copy_from_user(buffer, buf, count)) {
1222 err = -EFAULT;
1223 goto out;
1224 }
1225
1226 err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
1227 if (err)
1228 goto out;
1229 if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
1230 oom_score_adj > OOM_SCORE_ADJ_MAX) {
1231 err = -EINVAL;
1232 goto out;
1233 }
1234
1235 err = __set_oom_adj(file, oom_score_adj, false);
1236 out:
1237 return err < 0 ? err : count;
1238 }
1239
1240 static const struct file_operations proc_oom_score_adj_operations = {
1241 .read = oom_score_adj_read,
1242 .write = oom_score_adj_write,
1243 .llseek = default_llseek,
1244 };
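/*
 * Typical usage (illustrative), protecting a daemon from the OOM
 * killer:
 *
 *	echo -1000 > /proc/<pid>/oom_score_adj
 *
 * -1000 is OOM_SCORE_ADJ_MIN, and per __set_oom_adj() above the value
 * also propagates to any other processes sharing the same mm.
 */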
1245
1246 #ifdef CONFIG_AUDIT
1247 #define TMPBUFLEN 11
1248 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
1249 size_t count, loff_t *ppos)
1250 {
1251 struct inode * inode = file_inode(file);
1252 struct task_struct *task = get_proc_task(inode);
1253 ssize_t length;
1254 char tmpbuf[TMPBUFLEN];
1255
1256 if (!task)
1257 return -ESRCH;
1258 length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1259 from_kuid(file->f_cred->user_ns,
1260 audit_get_loginuid(task)));
1261 put_task_struct(task);
1262 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1263 }
1264
1265 static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1266 size_t count, loff_t *ppos)
1267 {
1268 struct inode * inode = file_inode(file);
1269 uid_t loginuid;
1270 kuid_t kloginuid;
1271 int rv;
1272
1273 /* Don't let kthreads write their own loginuid */
1274 if (current->flags & PF_KTHREAD)
1275 return -EPERM;
1276
1277 rcu_read_lock();
1278 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
1279 rcu_read_unlock();
1280 return -EPERM;
1281 }
1282 rcu_read_unlock();
1283
1284 if (*ppos != 0) {
1285 /* No partial writes. */
1286 return -EINVAL;
1287 }
1288
1289 rv = kstrtou32_from_user(buf, count, 10, &loginuid);
1290 if (rv < 0)
1291 return rv;
1292
1293 /* Is userspace trying to explicitly UNSET the loginuid? */
1294 if (loginuid == AUDIT_UID_UNSET) {
1295 kloginuid = INVALID_UID;
1296 } else {
1297 kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
1298 if (!uid_valid(kloginuid))
1299 return -EINVAL;
1300 }
1301
1302 rv = audit_set_loginuid(kloginuid);
1303 if (rv < 0)
1304 return rv;
1305 return count;
1306 }
1307
1308 static const struct file_operations proc_loginuid_operations = {
1309 .read = proc_loginuid_read,
1310 .write = proc_loginuid_write,
1311 .llseek = generic_file_llseek,
1312 };
1313
1314 static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
1315 size_t count, loff_t *ppos)
1316 {
1317 struct inode * inode = file_inode(file);
1318 struct task_struct *task = get_proc_task(inode);
1319 ssize_t length;
1320 char tmpbuf[TMPBUFLEN];
1321
1322 if (!task)
1323 return -ESRCH;
1324 length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1325 audit_get_sessionid(task));
1326 put_task_struct(task);
1327 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1328 }
1329
1330 static const struct file_operations proc_sessionid_operations = {
1331 .read = proc_sessionid_read,
1332 .llseek = generic_file_llseek,
1333 };
1334 #endif
1335
1336 #ifdef CONFIG_FAULT_INJECTION
1337 static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
1338 size_t count, loff_t *ppos)
1339 {
1340 struct task_struct *task = get_proc_task(file_inode(file));
1341 char buffer[PROC_NUMBUF];
1342 size_t len;
1343 int make_it_fail;
1344
1345 if (!task)
1346 return -ESRCH;
1347 make_it_fail = task->make_it_fail;
1348 put_task_struct(task);
1349
1350 len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
1351
1352 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1353 }
1354
1355 static ssize_t proc_fault_inject_write(struct file * file,
1356 const char __user * buf, size_t count, loff_t *ppos)
1357 {
1358 struct task_struct *task;
1359 char buffer[PROC_NUMBUF];
1360 int make_it_fail;
1361 int rv;
1362
1363 if (!capable(CAP_SYS_RESOURCE))
1364 return -EPERM;
1365 memset(buffer, 0, sizeof(buffer));
1366 if (count > sizeof(buffer) - 1)
1367 count = sizeof(buffer) - 1;
1368 if (copy_from_user(buffer, buf, count))
1369 return -EFAULT;
1370 rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
1371 if (rv < 0)
1372 return rv;
1373 if (make_it_fail < 0 || make_it_fail > 1)
1374 return -EINVAL;
1375
1376 task = get_proc_task(file_inode(file));
1377 if (!task)
1378 return -ESRCH;
1379 task->make_it_fail = make_it_fail;
1380 put_task_struct(task);
1381
1382 return count;
1383 }
1384
1385 static const struct file_operations proc_fault_inject_operations = {
1386 .read = proc_fault_inject_read,
1387 .write = proc_fault_inject_write,
1388 .llseek = generic_file_llseek,
1389 };
1390
1391 static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
1392 size_t count, loff_t *ppos)
1393 {
1394 struct task_struct *task;
1395 int err;
1396 unsigned int n;
1397
1398 err = kstrtouint_from_user(buf, count, 0, &n);
1399 if (err)
1400 return err;
1401
1402 task = get_proc_task(file_inode(file));
1403 if (!task)
1404 return -ESRCH;
1405 task->fail_nth = n;
1406 put_task_struct(task);
1407
1408 return count;
1409 }
1410
1411 static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
1412 size_t count, loff_t *ppos)
1413 {
1414 struct task_struct *task;
1415 char numbuf[PROC_NUMBUF];
1416 ssize_t len;
1417
1418 task = get_proc_task(file_inode(file));
1419 if (!task)
1420 return -ESRCH;
1421 len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth);
1422 put_task_struct(task);
1423 return simple_read_from_buffer(buf, count, ppos, numbuf, len);
1424 }
1425
1426 static const struct file_operations proc_fail_nth_operations = {
1427 .read = proc_fail_nth_read,
1428 .write = proc_fail_nth_write,
1429 };
1430 #endif
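/*
 * For illustration, a test armed against itself (per the kernel's
 * fault-injection documentation):
 *
 *	echo 1 > /proc/self/task/<tid>/fail_nth
 *
 * makes the next fault-injection-capable call in that task fail; the
 * test then issues the syscall under scrutiny and reads the file back,
 * where a value of 0 indicates the failure really was injected.
 */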
1431
1432
1433 #ifdef CONFIG_SCHED_DEBUG
1434 /*
1435 * Print out various scheduling related per-task fields:
1436 */
1437 static int sched_show(struct seq_file *m, void *v)
1438 {
1439 struct inode *inode = m->private;
1440 struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
1441 struct task_struct *p;
1442
1443 p = get_proc_task(inode);
1444 if (!p)
1445 return -ESRCH;
1446 proc_sched_show_task(p, ns, m);
1447
1448 put_task_struct(p);
1449
1450 return 0;
1451 }
1452
1453 static ssize_t
1454 sched_write(struct file *file, const char __user *buf,
1455 size_t count, loff_t *offset)
1456 {
1457 struct inode *inode = file_inode(file);
1458 struct task_struct *p;
1459
1460 p = get_proc_task(inode);
1461 if (!p)
1462 return -ESRCH;
1463 proc_sched_set_task(p);
1464
1465 put_task_struct(p);
1466
1467 return count;
1468 }
1469
1470 static int sched_open(struct inode *inode, struct file *filp)
1471 {
1472 return single_open(filp, sched_show, inode);
1473 }
1474
1475 static const struct file_operations proc_pid_sched_operations = {
1476 .open = sched_open,
1477 .read = seq_read,
1478 .write = sched_write,
1479 .llseek = seq_lseek,
1480 .release = single_release,
1481 };
1482
1483 #endif
1484
1485 #ifdef CONFIG_SCHED_AUTOGROUP
1486 /*
1487 * Print out autogroup related information:
1488 */
1489 static int sched_autogroup_show(struct seq_file *m, void *v)
1490 {
1491 struct inode *inode = m->private;
1492 struct task_struct *p;
1493
1494 p = get_proc_task(inode);
1495 if (!p)
1496 return -ESRCH;
1497 proc_sched_autogroup_show_task(p, m);
1498
1499 put_task_struct(p);
1500
1501 return 0;
1502 }
1503
1504 static ssize_t
1505 sched_autogroup_write(struct file *file, const char __user *buf,
1506 size_t count, loff_t *offset)
1507 {
1508 struct inode *inode = file_inode(file);
1509 struct task_struct *p;
1510 char buffer[PROC_NUMBUF];
1511 int nice;
1512 int err;
1513
1514 memset(buffer, 0, sizeof(buffer));
1515 if (count > sizeof(buffer) - 1)
1516 count = sizeof(buffer) - 1;
1517 if (copy_from_user(buffer, buf, count))
1518 return -EFAULT;
1519
1520 err = kstrtoint(strstrip(buffer), 0, &nice);
1521 if (err < 0)
1522 return err;
1523
1524 p = get_proc_task(inode);
1525 if (!p)
1526 return -ESRCH;
1527
1528 err = proc_sched_autogroup_set_nice(p, nice);
1529 if (err)
1530 count = err;
1531
1532 put_task_struct(p);
1533
1534 return count;
1535 }
1536
1537 static int sched_autogroup_open(struct inode *inode, struct file *filp)
1538 {
1539 int ret;
1540
1541 ret = single_open(filp, sched_autogroup_show, NULL);
1542 if (!ret) {
1543 struct seq_file *m = filp->private_data;
1544
1545 m->private = inode;
1546 }
1547 return ret;
1548 }
1549
1550 static const struct file_operations proc_pid_sched_autogroup_operations = {
1551 .open = sched_autogroup_open,
1552 .read = seq_read,
1553 .write = sched_autogroup_write,
1554 .llseek = seq_lseek,
1555 .release = single_release,
1556 };
1557
1558 #endif /* CONFIG_SCHED_AUTOGROUP */
1559
1560 #ifdef CONFIG_TIME_NS
1561 static int timens_offsets_show(struct seq_file *m, void *v)
1562 {
1563 struct task_struct *p;
1564
1565 p = get_proc_task(file_inode(m->file));
1566 if (!p)
1567 return -ESRCH;
1568 proc_timens_show_offsets(p, m);
1569
1570 put_task_struct(p);
1571
1572 return 0;
1573 }
1574
1575 static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
1576 size_t count, loff_t *ppos)
1577 {
1578 struct inode *inode = file_inode(file);
1579 struct proc_timens_offset offsets[2];
1580 char *kbuf = NULL, *pos, *next_line;
1581 struct task_struct *p;
1582 int ret, noffsets;
1583
1584 /* Only allow < page size writes at the beginning of the file */
1585 if ((*ppos != 0) || (count >= PAGE_SIZE))
1586 return -EINVAL;
1587
1588 /* Slurp in the user data */
1589 kbuf = memdup_user_nul(buf, count);
1590 if (IS_ERR(kbuf))
1591 return PTR_ERR(kbuf);
1592
1593 /* Parse the user data */
1594 ret = -EINVAL;
1595 noffsets = 0;
1596 for (pos = kbuf; pos; pos = next_line) {
1597 struct proc_timens_offset *off = &offsets[noffsets];
1598 char clock[10];
1599 int err;
1600
1601 /* Find the end of line and ensure we don't look past it */
1602 next_line = strchr(pos, '\n');
1603 if (next_line) {
1604 *next_line = '\0';
1605 next_line++;
1606 if (*next_line == '\0')
1607 next_line = NULL;
1608 }
1609
1610 err = sscanf(pos, "%9s %lld %lu", clock,
1611 &off->val.tv_sec, &off->val.tv_nsec);
1612 if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
1613 goto out;
1614
1615 clock[sizeof(clock) - 1] = 0;
1616 if (strcmp(clock, "monotonic") == 0 ||
1617 strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
1618 off->clockid = CLOCK_MONOTONIC;
1619 else if (strcmp(clock, "boottime") == 0 ||
1620 strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
1621 off->clockid = CLOCK_BOOTTIME;
1622 else
1623 goto out;
1624
1625 noffsets++;
1626 if (noffsets == ARRAY_SIZE(offsets)) {
1627 if (next_line)
1628 count = next_line - kbuf;
1629 break;
1630 }
1631 }
1632
1633 ret = -ESRCH;
1634 p = get_proc_task(inode);
1635 if (!p)
1636 goto out;
1637 ret = proc_timens_set_offset(file, p, offsets, noffsets);
1638 put_task_struct(p);
1639 if (ret)
1640 goto out;
1641
1642 ret = count;
1643 out:
1644 kfree(kbuf);
1645 return ret;
1646 }
1647
1648 static int timens_offsets_open(struct inode *inode, struct file *filp)
1649 {
1650 return single_open(filp, timens_offsets_show, inode);
1651 }
1652
1653 static const struct file_operations proc_timens_offsets_operations = {
1654 .open = timens_offsets_open,
1655 .read = seq_read,
1656 .write = timens_offsets_write,
1657 .llseek = seq_lseek,
1658 .release = single_release,
1659 };
1660 #endif /* CONFIG_TIME_NS */
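/*
 * Per the parser above, each written line is "<clock> <seconds>
 * <nanoseconds>", where <clock> is "monotonic", "boottime", or the
 * numeric clockid. E.g. (illustrative, for a freshly unshared and not
 * yet entered time namespace):
 *
 *	echo "monotonic 86400 0" > /proc/<pid>/timens_offsets
 *
 * shifts CLOCK_MONOTONIC in that namespace forward by one day.
 */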
1661
1662 static ssize_t comm_write(struct file *file, const char __user *buf,
1663 size_t count, loff_t *offset)
1664 {
1665 struct inode *inode = file_inode(file);
1666 struct task_struct *p;
1667 char buffer[TASK_COMM_LEN];
1668 const size_t maxlen = sizeof(buffer) - 1;
1669
1670 memset(buffer, 0, sizeof(buffer));
1671 if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
1672 return -EFAULT;
1673
1674 p = get_proc_task(inode);
1675 if (!p)
1676 return -ESRCH;
1677
1678 if (same_thread_group(current, p))
1679 set_task_comm(p, buffer);
1680 else
1681 count = -EINVAL;
1682
1683 put_task_struct(p);
1684
1685 return count;
1686 }
1687
1688 static int comm_show(struct seq_file *m, void *v)
1689 {
1690 struct inode *inode = m->private;
1691 struct task_struct *p;
1692
1693 p = get_proc_task(inode);
1694 if (!p)
1695 return -ESRCH;
1696
1697 proc_task_name(m, p, false);
1698 seq_putc(m, '\n');
1699
1700 put_task_struct(p);
1701
1702 return 0;
1703 }
1704
1705 static int comm_open(struct inode *inode, struct file *filp)
1706 {
1707 return single_open(filp, comm_show, inode);
1708 }
1709
1710 static const struct file_operations proc_pid_set_comm_operations = {
1711 .open = comm_open,
1712 .read = seq_read,
1713 .write = comm_write,
1714 .llseek = seq_lseek,
1715 .release = single_release,
1716 };
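/*
 * Only writers in the same thread group may rename the task, and names
 * are silently truncated to TASK_COMM_LEN - 1 bytes. For illustration:
 *
 *	echo worker-1 > /proc/self/comm
 *
 * has the same effect as prctl(PR_SET_NAME, "worker-1") issued from the
 * thread group leader.
 */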
1717
1718 static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
1719 {
1720 struct task_struct *task;
1721 struct file *exe_file;
1722
1723 task = get_proc_task(d_inode(dentry));
1724 if (!task)
1725 return -ENOENT;
1726 exe_file = get_task_exe_file(task);
1727 put_task_struct(task);
1728 if (exe_file) {
1729 *exe_path = exe_file->f_path;
1730 path_get(&exe_file->f_path);
1731 fput(exe_file);
1732 return 0;
1733 } else
1734 return -ENOENT;
1735 }
1736
1737 static const char *proc_pid_get_link(struct dentry *dentry,
1738 struct inode *inode,
1739 struct delayed_call *done)
1740 {
1741 struct path path;
1742 int error = -EACCES;
1743
1744 if (!dentry)
1745 return ERR_PTR(-ECHILD);
1746
1747 /* Are we allowed to snoop on the task's file descriptors? */
1748 if (!proc_fd_access_allowed(inode))
1749 goto out;
1750
1751 error = PROC_I(inode)->op.proc_get_link(dentry, &path);
1752 if (error)
1753 goto out;
1754
1755 error = nd_jump_link(&path);
1756 out:
1757 return ERR_PTR(error);
1758 }
1759
1760 static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1761 {
1762 char *tmp = (char *)__get_free_page(GFP_KERNEL);
1763 char *pathname;
1764 int len;
1765
1766 if (!tmp)
1767 return -ENOMEM;
1768
1769 pathname = d_path(path, tmp, PAGE_SIZE);
1770 len = PTR_ERR(pathname);
1771 if (IS_ERR(pathname))
1772 goto out;
1773 len = tmp + PAGE_SIZE - 1 - pathname;
1774
1775 if (len > buflen)
1776 len = buflen;
1777 if (copy_to_user(buffer, pathname, len))
1778 len = -EFAULT;
1779 out:
1780 free_page((unsigned long)tmp);
1781 return len;
1782 }
1783
1784 static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
1785 {
1786 int error = -EACCES;
1787 struct inode *inode = d_inode(dentry);
1788 struct path path;
1789
1790 /* Are we allowed to snoop on the task's file descriptors? */
1791 if (!proc_fd_access_allowed(inode))
1792 goto out;
1793
1794 error = PROC_I(inode)->op.proc_get_link(dentry, &path);
1795 if (error)
1796 goto out;
1797
1798 error = do_proc_readlink(&path, buffer, buflen);
1799 path_put(&path);
1800 out:
1801 return error;
1802 }
1803
1804 const struct inode_operations proc_pid_link_inode_operations = {
1805 .readlink = proc_pid_readlink,
1806 .get_link = proc_pid_get_link,
1807 .setattr = proc_setattr,
1808 };
1809
1810
1811 /* building an inode */
1812
1813 void task_dump_owner(struct task_struct *task, umode_t mode,
1814 kuid_t *ruid, kgid_t *rgid)
1815 {
1816 /* Depending on the state of dumpable, compute who should own a
1817 * proc file for a task.
1818 */
1819 const struct cred *cred;
1820 kuid_t uid;
1821 kgid_t gid;
1822
1823 if (unlikely(task->flags & PF_KTHREAD)) {
1824 *ruid = GLOBAL_ROOT_UID;
1825 *rgid = GLOBAL_ROOT_GID;
1826 return;
1827 }
1828
1829 /* Default to the task's effective ownership */
1830 rcu_read_lock();
1831 cred = __task_cred(task);
1832 uid = cred->euid;
1833 gid = cred->egid;
1834 rcu_read_unlock();
1835
1836 /*
1837 * Before the /proc/pid/status file was created the only way to read
1838 * the effective uid of a process was to stat /proc/pid. Reading
1839 * /proc/pid/status is slow enough that procps and other packages
1840 * kept stating /proc/pid. To keep the rules in /proc simple I have
1841 * made this apply to all per process world readable and executable
1842 * directories.
1843 */
1844 if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
1845 struct mm_struct *mm;
1846 task_lock(task);
1847 mm = task->mm;
1848 /* Make non-dumpable tasks owned by some root */
1849 if (mm) {
1850 if (get_dumpable(mm) != SUID_DUMP_USER) {
1851 struct user_namespace *user_ns = mm->user_ns;
1852
1853 uid = make_kuid(user_ns, 0);
1854 if (!uid_valid(uid))
1855 uid = GLOBAL_ROOT_UID;
1856
1857 gid = make_kgid(user_ns, 0);
1858 if (!gid_valid(gid))
1859 gid = GLOBAL_ROOT_GID;
1860 }
1861 } else {
1862 uid = GLOBAL_ROOT_UID;
1863 gid = GLOBAL_ROOT_GID;
1864 }
1865 task_unlock(task);
1866 }
1867 *ruid = uid;
1868 *rgid = gid;
1869 }
1870
1871 void proc_pid_evict_inode(struct proc_inode *ei)
1872 {
1873 struct pid *pid = ei->pid;
1874
1875 if (S_ISDIR(ei->vfs_inode.i_mode)) {
1876 spin_lock(&pid->lock);
1877 hlist_del_init_rcu(&ei->sibling_inodes);
1878 spin_unlock(&pid->lock);
1879 }
1880
1881 put_pid(pid);
1882 }
1883
1884 struct inode *proc_pid_make_inode(struct super_block * sb,
1885 struct task_struct *task, umode_t mode)
1886 {
1887 struct inode * inode;
1888 struct proc_inode *ei;
1889 struct pid *pid;
1890
1891 /* We need a new inode */
1892
1893 inode = new_inode(sb);
1894 if (!inode)
1895 goto out;
1896
1897 /* Common stuff */
1898 ei = PROC_I(inode);
1899 inode->i_mode = mode;
1900 inode->i_ino = get_next_ino();
1901 inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
1902 inode->i_op = &proc_def_inode_operations;
1903
1904 /*
1905 * Grab a reference to the task's pid.
1906 */
1907 pid = get_task_pid(task, PIDTYPE_PID);
1908 if (!pid)
1909 goto out_unlock;
1910
1911 /* Let the pid remember us for quick removal */
1912 ei->pid = pid;
1913 if (S_ISDIR(mode)) {
1914 spin_lock(&pid->lock);
1915 hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
1916 spin_unlock(&pid->lock);
1917 }
1918
1919 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1920 security_task_to_inode(task, inode);
1921
1922 out:
1923 return inode;
1924
1925 out_unlock:
1926 iput(inode);
1927 return NULL;
1928 }
1929
1930 int pid_getattr(const struct path *path, struct kstat *stat,
1931 u32 request_mask, unsigned int query_flags)
1932 {
1933 struct inode *inode = d_inode(path->dentry);
1934 struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
1935 struct task_struct *task;
1936
1937 generic_fillattr(inode, stat);
1938
1939 stat->uid = GLOBAL_ROOT_UID;
1940 stat->gid = GLOBAL_ROOT_GID;
1941 rcu_read_lock();
1942 task = pid_task(proc_pid(inode), PIDTYPE_PID);
1943 if (task) {
1944 if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) {
1945 rcu_read_unlock();
1946 /*
1947 * This doesn't prevent learning whether the PID exists;
1948 * it only makes getattr() consistent with readdir().
1949 */
1950 return -ENOENT;
1951 }
1952 task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
1953 }
1954 rcu_read_unlock();
1955 return 0;
1956 }
1957
1958 /* dentry stuff */
1959
1960 /*
1961 * Set <pid>/... inode ownership (can change due to setuid(), etc.)
1962 */
1963 void pid_update_inode(struct task_struct *task, struct inode *inode)
1964 {
1965 task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
1966
1967 inode->i_mode &= ~(S_ISUID | S_ISGID);
1968 security_task_to_inode(task, inode);
1969 }
1970
1971 /*
1972 * Rewrite the inode's ownerships here because the owning task may have
1973 * performed a setuid(), etc.
1974 *
1975 */
1976 static int pid_revalidate(struct dentry *dentry, unsigned int flags)
1977 {
1978 struct inode *inode;
1979 struct task_struct *task;
1980
1981 if (flags & LOOKUP_RCU)
1982 return -ECHILD;
1983
1984 inode = d_inode(dentry);
1985 task = get_proc_task(inode);
1986
1987 if (task) {
1988 pid_update_inode(task, inode);
1989 put_task_struct(task);
1990 return 1;
1991 }
1992 return 0;
1993 }
1994
1995 static inline bool proc_inode_is_dead(struct inode *inode)
1996 {
1997 return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
1998 }
1999
2000 int pid_delete_dentry(const struct dentry *dentry)
2001 {
2002 /* Is the task we represent dead?
2003 * If so, then don't put the dentry on the LRU list;
2004 * kill it immediately.
2005 */
2006 return proc_inode_is_dead(d_inode(dentry));
2007 }
2008
2009 const struct dentry_operations pid_dentry_operations =
2010 {
2011 .d_revalidate = pid_revalidate,
2012 .d_delete = pid_delete_dentry,
2013 };
2014
2015 /* Lookups */
2016
2017 /*
2018 * Fill a directory entry.
2019 *
2020 * If possible, create the dcache entry and derive our inode number and
2021 * file type from dcache entry.
2022 *
2023 * Since all of the proc inode numbers are dynamically generated, the inode
2024 * numbers do not exist until the inode is cached. This means creating
2025 * the dcache entry in readdir is necessary to keep the inode numbers
2026 * reported by readdir in sync with the inode numbers reported
2027 * by stat.
2028 */
2029 bool proc_fill_cache(struct file *file, struct dir_context *ctx,
2030 const char *name, unsigned int len,
2031 instantiate_t instantiate, struct task_struct *task, const void *ptr)
2032 {
2033 struct dentry *child, *dir = file->f_path.dentry;
2034 struct qstr qname = QSTR_INIT(name, len);
2035 struct inode *inode;
2036 unsigned type = DT_UNKNOWN;
2037 ino_t ino = 1;
2038
2039 child = d_hash_and_lookup(dir, &qname);
2040 if (!child) {
2041 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
2042 child = d_alloc_parallel(dir, &qname, &wq);
2043 if (IS_ERR(child))
2044 goto end_instantiate;
2045 if (d_in_lookup(child)) {
2046 struct dentry *res;
2047 res = instantiate(child, task, ptr);
2048 d_lookup_done(child);
2049 if (unlikely(res)) {
2050 dput(child);
2051 child = res;
2052 if (IS_ERR(child))
2053 goto end_instantiate;
2054 }
2055 }
2056 }
2057 inode = d_inode(child);
2058 ino = inode->i_ino;
2059 type = inode->i_mode >> 12;
2060 dput(child);
2061 end_instantiate:
2062 return dir_emit(ctx, name, len, ino, type);
2063 }
2064
2065 /*
2066 * dname_to_vma_addr - maps a dentry name into two unsigned longs
2067 * which represent vma start and end addresses.
2068 */
2069 static int dname_to_vma_addr(struct dentry *dentry,
2070 unsigned long *start, unsigned long *end)
2071 {
2072 const char *str = dentry->d_name.name;
2073 unsigned long long sval, eval;
2074 unsigned int len;
2075
2076 if (str[0] == '0' && str[1] != '-')
2077 return -EINVAL;
2078 len = _parse_integer(str, 16, &sval);
2079 if (len & KSTRTOX_OVERFLOW)
2080 return -EINVAL;
2081 if (sval != (unsigned long)sval)
2082 return -EINVAL;
2083 str += len;
2084
2085 if (*str != '-')
2086 return -EINVAL;
2087 str++;
2088
2089 if (str[0] == '0' && str[1])
2090 return -EINVAL;
2091 len = _parse_integer(str, 16, &eval);
2092 if (len & KSTRTOX_OVERFLOW)
2093 return -EINVAL;
2094 if (eval != (unsigned long)eval)
2095 return -EINVAL;
2096 str += len;
2097
2098 if (*str != '\0')
2099 return -EINVAL;
2100
2101 *start = sval;
2102 *end = eval;
2103
2104 return 0;
2105 }
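/*
 * Entry names under /proc/PID/map_files/ therefore have the exact form
 * "<vm_start>-<vm_end>" in bare lower-case hex with no leading zeroes,
 * matching the address column of /proc/PID/maps; e.g. a lookup of
 * "400000-452000" resolves the VMA spanning exactly those addresses.
 */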
2106
2107 static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
2108 {
2109 unsigned long vm_start, vm_end;
2110 bool exact_vma_exists = false;
2111 struct mm_struct *mm = NULL;
2112 struct task_struct *task;
2113 struct inode *inode;
2114 int status = 0;
2115
2116 if (flags & LOOKUP_RCU)
2117 return -ECHILD;
2118
2119 inode = d_inode(dentry);
2120 task = get_proc_task(inode);
2121 if (!task)
2122 goto out_notask;
2123
2124 mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
2125 if (IS_ERR_OR_NULL(mm))
2126 goto out;
2127
2128 if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
2129 status = mmap_read_lock_killable(mm);
2130 if (!status) {
2131 exact_vma_exists = !!find_exact_vma(mm, vm_start,
2132 vm_end);
2133 mmap_read_unlock(mm);
2134 }
2135 }
2136
2137 mmput(mm);
2138
2139 if (exact_vma_exists) {
2140 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
2141
2142 security_task_to_inode(task, inode);
2143 status = 1;
2144 }
2145
2146 out:
2147 put_task_struct(task);
2148
2149 out_notask:
2150 return status;
2151 }
2152
2153 static const struct dentry_operations tid_map_files_dentry_operations = {
2154 .d_revalidate = map_files_d_revalidate,
2155 .d_delete = pid_delete_dentry,
2156 };
2157
2158 static int map_files_get_link(struct dentry *dentry, struct path *path)
2159 {
2160 unsigned long vm_start, vm_end;
2161 struct vm_area_struct *vma;
2162 struct task_struct *task;
2163 struct mm_struct *mm;
2164 int rc;
2165
2166 rc = -ENOENT;
2167 task = get_proc_task(d_inode(dentry));
2168 if (!task)
2169 goto out;
2170
2171 mm = get_task_mm(task);
2172 put_task_struct(task);
2173 if (!mm)
2174 goto out;
2175
2176 rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
2177 if (rc)
2178 goto out_mmput;
2179
2180 rc = mmap_read_lock_killable(mm);
2181 if (rc)
2182 goto out_mmput;
2183
2184 rc = -ENOENT;
2185 vma = find_exact_vma(mm, vm_start, vm_end);
2186 if (vma && vma->vm_file) {
2187 *path = vma_pr_or_file(vma)->f_path;
2188 path_get(path);
2189 rc = 0;
2190 }
2191 mmap_read_unlock(mm);
2192
2193 out_mmput:
2194 mmput(mm);
2195 out:
2196 return rc;
2197 }
2198
2199 struct map_files_info {
2200 unsigned long start;
2201 unsigned long end;
2202 fmode_t mode;
2203 };
2204
2205 /*
2206 * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due
2207 * to concerns about how the symlinks may be used to bypass permissions on
2208 * ancestor directories in the path to the file in question.
2209 */
2210 static const char *
2211 proc_map_files_get_link(struct dentry *dentry,
2212 struct inode *inode,
2213 struct delayed_call *done)
2214 {
2215 if (!checkpoint_restore_ns_capable(&init_user_ns))
2216 return ERR_PTR(-EPERM);
2217
2218 return proc_pid_get_link(dentry, inode, done);
2219 }
2220
2221 /*
2222 * Identical to proc_pid_link_inode_operations except for get_link()
2223 */
2224 static const struct inode_operations proc_map_files_link_inode_operations = {
2225 .readlink = proc_pid_readlink,
2226 .get_link = proc_map_files_get_link,
2227 .setattr = proc_setattr,
2228 };
2229
2230 static struct dentry *
2231 proc_map_files_instantiate(struct dentry *dentry,
2232 struct task_struct *task, const void *ptr)
2233 {
2234 fmode_t mode = (fmode_t)(unsigned long)ptr;
2235 struct proc_inode *ei;
2236 struct inode *inode;
2237
2238 inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
2239 ((mode & FMODE_READ ) ? S_IRUSR : 0) |
2240 ((mode & FMODE_WRITE) ? S_IWUSR : 0));
2241 if (!inode)
2242 return ERR_PTR(-ENOENT);
2243
2244 ei = PROC_I(inode);
2245 ei->op.proc_get_link = map_files_get_link;
2246
2247 inode->i_op = &proc_map_files_link_inode_operations;
2248 inode->i_size = 64;
2249
2250 d_set_d_op(dentry, &tid_map_files_dentry_operations);
2251 return d_splice_alias(inode, dentry);
2252 }
2253
2254 static struct dentry *proc_map_files_lookup(struct inode *dir,
2255 struct dentry *dentry, unsigned int flags)
2256 {
2257 unsigned long vm_start, vm_end;
2258 struct vm_area_struct *vma;
2259 struct task_struct *task;
2260 struct dentry *result;
2261 struct mm_struct *mm;
2262
2263 result = ERR_PTR(-ENOENT);
2264 task = get_proc_task(dir);
2265 if (!task)
2266 goto out;
2267
2268 result = ERR_PTR(-EACCES);
2269 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
2270 goto out_put_task;
2271
2272 result = ERR_PTR(-ENOENT);
2273 if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
2274 goto out_put_task;
2275
2276 mm = get_task_mm(task);
2277 if (!mm)
2278 goto out_put_task;
2279
2280 result = ERR_PTR(-EINTR);
2281 if (mmap_read_lock_killable(mm))
2282 goto out_put_mm;
2283
2284 result = ERR_PTR(-ENOENT);
2285 vma = find_exact_vma(mm, vm_start, vm_end);
2286 if (!vma)
2287 goto out_no_vma;
2288
2289 if (vma->vm_file)
2290 result = proc_map_files_instantiate(dentry, task,
2291 (void *)(unsigned long)vma->vm_file->f_mode);
2292
2293 out_no_vma:
2294 mmap_read_unlock(mm);
2295 out_put_mm:
2296 mmput(mm);
2297 out_put_task:
2298 put_task_struct(task);
2299 out:
2300 return result;
2301 }
2302
2303 static const struct inode_operations proc_map_files_inode_operations = {
2304 .lookup = proc_map_files_lookup,
2305 .permission = proc_fd_permission,
2306 .setattr = proc_setattr,
2307 };
2308
2309 static int
2310 proc_map_files_readdir(struct file *file, struct dir_context *ctx)
2311 {
2312 struct vm_area_struct *vma;
2313 struct task_struct *task;
2314 struct mm_struct *mm;
2315 unsigned long nr_files, pos, i;
2316 GENRADIX(struct map_files_info) fa;
2317 struct map_files_info *p;
2318 int ret;
2319
2320 genradix_init(&fa);
2321
2322 ret = -ENOENT;
2323 task = get_proc_task(file_inode(file));
2324 if (!task)
2325 goto out;
2326
2327 ret = -EACCES;
2328 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
2329 goto out_put_task;
2330
2331 ret = 0;
2332 if (!dir_emit_dots(file, ctx))
2333 goto out_put_task;
2334
2335 mm = get_task_mm(task);
2336 if (!mm)
2337 goto out_put_task;
2338
2339 ret = mmap_read_lock_killable(mm);
2340 if (ret) {
2341 mmput(mm);
2342 goto out_put_task;
2343 }
2344
2345 nr_files = 0;
2346
2347 /*
2348 * We need two passes here:
2349 *
2350 * 1) Collect vmas of mapped files with mmap_lock taken
2351 * 2) Release mmap_lock and instantiate entries
2352 *
2353 * otherwise lockdep complains, since the filldir()
2354 * routine might require mmap_lock to be taken in might_fault().
2355 */
2356
2357 for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
2358 if (!vma->vm_file)
2359 continue;
2360 if (++pos <= ctx->pos)
2361 continue;
2362
2363 p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
2364 if (!p) {
2365 ret = -ENOMEM;
2366 mmap_read_unlock(mm);
2367 mmput(mm);
2368 goto out_put_task;
2369 }
2370
2371 p->start = vma->vm_start;
2372 p->end = vma->vm_end;
2373 p->mode = vma->vm_file->f_mode;
2374 }
2375 mmap_read_unlock(mm);
2376 mmput(mm);
2377
2378 for (i = 0; i < nr_files; i++) {
2379 char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
2380 unsigned int len;
2381
2382 p = genradix_ptr(&fa, i);
2383 len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
2384 if (!proc_fill_cache(file, ctx,
2385 buf, len,
2386 proc_map_files_instantiate,
2387 task,
2388 (void *)(unsigned long)p->mode))
2389 break;
2390 ctx->pos++;
2391 }
2392
2393 out_put_task:
2394 put_task_struct(task);
2395 out:
2396 genradix_free(&fa);
2397 return ret;
2398 }
2399
2400 static const struct file_operations proc_map_files_operations = {
2401 .read = generic_read_dir,
2402 .iterate_shared = proc_map_files_readdir,
2403 .llseek = generic_file_llseek,
2404 };
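
/*
 * Resulting layout (illustrative): each directory entry is a symlink named
 * after the vma's address range and resolving to the mapped file, e.g.
 *
 *	7f60d0b9a000-7f60d0b9c000 -> /usr/lib/x86_64-linux-gnu/ld-2.31.so
 *
 * The link's mode bits mirror the mapping's f_mode (see
 * proc_map_files_instantiate() above), and following it requires
 * CAP_SYS_ADMIN or CAP_CHECKPOINT_RESTORE per proc_map_files_get_link().
 */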
2405
2406 #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
2407 struct timers_private {
2408 struct pid *pid;
2409 struct task_struct *task;
2410 struct sighand_struct *sighand;
2411 struct pid_namespace *ns;
2412 unsigned long flags;
2413 };
2414
2415 static void *timers_start(struct seq_file *m, loff_t *pos)
2416 {
2417 struct timers_private *tp = m->private;
2418
2419 tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
2420 if (!tp->task)
2421 return ERR_PTR(-ESRCH);
2422
2423 tp->sighand = lock_task_sighand(tp->task, &tp->flags);
2424 if (!tp->sighand)
2425 return ERR_PTR(-ESRCH);
2426
2427 return seq_list_start(&tp->task->signal->posix_timers, *pos);
2428 }
2429
2430 static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
2431 {
2432 struct timers_private *tp = m->private;
2433 return seq_list_next(v, &tp->task->signal->posix_timers, pos);
2434 }
2435
2436 static void timers_stop(struct seq_file *m, void *v)
2437 {
2438 struct timers_private *tp = m->private;
2439
2440 if (tp->sighand) {
2441 unlock_task_sighand(tp->task, &tp->flags);
2442 tp->sighand = NULL;
2443 }
2444
2445 if (tp->task) {
2446 put_task_struct(tp->task);
2447 tp->task = NULL;
2448 }
2449 }
2450
2451 static int show_timer(struct seq_file *m, void *v)
2452 {
2453 struct k_itimer *timer;
2454 struct timers_private *tp = m->private;
2455 int notify;
2456 static const char * const nstr[] = {
2457 [SIGEV_SIGNAL] = "signal",
2458 [SIGEV_NONE] = "none",
2459 [SIGEV_THREAD] = "thread",
2460 };
2461
2462 timer = list_entry((struct list_head *)v, struct k_itimer, list);
2463 notify = timer->it_sigev_notify;
2464
2465 seq_printf(m, "ID: %d\n", timer->it_id);
2466 seq_printf(m, "signal: %d/%px\n",
2467 timer->sigq->info.si_signo,
2468 timer->sigq->info.si_value.sival_ptr);
2469 seq_printf(m, "notify: %s/%s.%d\n",
2470 nstr[notify & ~SIGEV_THREAD_ID],
2471 (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
2472 pid_nr_ns(timer->it_pid, tp->ns));
2473 seq_printf(m, "ClockID: %d\n", timer->it_clock);
2474
2475 return 0;
2476 }
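
/*
 * Sample record as produced by the format strings above (values are
 * illustrative; %px deliberately prints the raw sival_ptr):
 *
 *	ID: 1
 *	signal: 14/000055d3a8c02260
 *	notify: signal/pid.1337
 *	ClockID: 0
 */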
2477
2478 static const struct seq_operations proc_timers_seq_ops = {
2479 .start = timers_start,
2480 .next = timers_next,
2481 .stop = timers_stop,
2482 .show = show_timer,
2483 };
2484
2485 static int proc_timers_open(struct inode *inode, struct file *file)
2486 {
2487 struct timers_private *tp;
2488
2489 tp = __seq_open_private(file, &proc_timers_seq_ops,
2490 sizeof(struct timers_private));
2491 if (!tp)
2492 return -ENOMEM;
2493
2494 tp->pid = proc_pid(inode);
2495 tp->ns = proc_pid_ns(inode->i_sb);
2496 return 0;
2497 }
2498
2499 static const struct file_operations proc_timers_operations = {
2500 .open = proc_timers_open,
2501 .read = seq_read,
2502 .llseek = seq_lseek,
2503 .release = seq_release_private,
2504 };
2505 #endif
2506
2507 static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
2508 size_t count, loff_t *offset)
2509 {
2510 struct inode *inode = file_inode(file);
2511 struct task_struct *p;
2512 u64 slack_ns;
2513 int err;
2514
2515 err = kstrtoull_from_user(buf, count, 10, &slack_ns);
2516 if (err < 0)
2517 return err;
2518
2519 p = get_proc_task(inode);
2520 if (!p)
2521 return -ESRCH;
2522
2523 if (p != current) {
2524 rcu_read_lock();
2525 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
2526 rcu_read_unlock();
2527 count = -EPERM;
2528 goto out;
2529 }
2530 rcu_read_unlock();
2531
2532 err = security_task_setscheduler(p);
2533 if (err) {
2534 count = err;
2535 goto out;
2536 }
2537 }
2538
2539 task_lock(p);
2540 if (slack_ns == 0)
2541 p->timer_slack_ns = p->default_timer_slack_ns;
2542 else
2543 p->timer_slack_ns = slack_ns;
2544 task_unlock(p);
2545
2546 out:
2547 put_task_struct(p);
2548
2549 return count;
2550 }
2551
2552 static int timerslack_ns_show(struct seq_file *m, void *v)
2553 {
2554 struct inode *inode = m->private;
2555 struct task_struct *p;
2556 int err = 0;
2557
2558 p = get_proc_task(inode);
2559 if (!p)
2560 return -ESRCH;
2561
2562 if (p != current) {
2563 rcu_read_lock();
2564 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
2565 rcu_read_unlock();
2566 err = -EPERM;
2567 goto out;
2568 }
2569 rcu_read_unlock();
2570
2571 err = security_task_getscheduler(p);
2572 if (err)
2573 goto out;
2574 }
2575
2576 task_lock(p);
2577 seq_printf(m, "%llu\n", p->timer_slack_ns);
2578 task_unlock(p);
2579
2580 out:
2581 put_task_struct(p);
2582
2583 return err;
2584 }
2585
2586 static int timerslack_ns_open(struct inode *inode, struct file *filp)
2587 {
2588 return single_open(filp, timerslack_ns_show, inode);
2589 }
2590
2591 static const struct file_operations proc_pid_set_timerslack_ns_operations = {
2592 .open = timerslack_ns_open,
2593 .read = seq_read,
2594 .write = timerslack_ns_write,
2595 .llseek = seq_lseek,
2596 .release = single_release,
2597 };
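
/*
 * A minimal userspace sketch (an illustration under stated assumptions, not
 * ABI documentation): per timerslack_ns_write() above, writing "0" restores
 * default_timer_slack_ns and any other decimal value is taken as
 * nanoseconds:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/proc/self/timerslack_ns", "w");
 *
 *		if (!f)
 *			return 1;
 *		fprintf(f, "50000");	// request 50us of timer slack
 *		return fclose(f) ? 1 : 0;
 *	}
 */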
2598
2599 static struct dentry *proc_pident_instantiate(struct dentry *dentry,
2600 struct task_struct *task, const void *ptr)
2601 {
2602 const struct pid_entry *p = ptr;
2603 struct inode *inode;
2604 struct proc_inode *ei;
2605
2606 inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
2607 if (!inode)
2608 return ERR_PTR(-ENOENT);
2609
2610 ei = PROC_I(inode);
2611 if (S_ISDIR(inode->i_mode))
2612 set_nlink(inode, 2); /* Use getattr to fix if necessary */
2613 if (p->iop)
2614 inode->i_op = p->iop;
2615 if (p->fop)
2616 inode->i_fop = p->fop;
2617 ei->op = p->op;
2618 pid_update_inode(task, inode);
2619 d_set_d_op(dentry, &pid_dentry_operations);
2620 return d_splice_alias(inode, dentry);
2621 }
2622
2623 static struct dentry *proc_pident_lookup(struct inode *dir,
2624 struct dentry *dentry,
2625 const struct pid_entry *p,
2626 const struct pid_entry *end)
2627 {
2628 struct task_struct *task = get_proc_task(dir);
2629 struct dentry *res = ERR_PTR(-ENOENT);
2630
2631 if (!task)
2632 goto out_no_task;
2633
2634 /*
2635 * Yes, it does not scale. And it should not. Don't add
2636 * new entries into /proc/<tgid>/ without very good reasons.
2637 */
2638 for (; p < end; p++) {
2639 if (p->len != dentry->d_name.len)
2640 continue;
2641 if (!memcmp(dentry->d_name.name, p->name, p->len)) {
2642 res = proc_pident_instantiate(dentry, task, p);
2643 break;
2644 }
2645 }
2646 put_task_struct(task);
2647 out_no_task:
2648 return res;
2649 }
2650
2651 static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
2652 const struct pid_entry *ents, unsigned int nents)
2653 {
2654 struct task_struct *task = get_proc_task(file_inode(file));
2655 const struct pid_entry *p;
2656
2657 if (!task)
2658 return -ENOENT;
2659
2660 if (!dir_emit_dots(file, ctx))
2661 goto out;
2662
2663 if (ctx->pos >= nents + 2)
2664 goto out;
2665
2666 for (p = ents + (ctx->pos - 2); p < ents + nents; p++) {
2667 if (!proc_fill_cache(file, ctx, p->name, p->len,
2668 proc_pident_instantiate, task, p))
2669 break;
2670 ctx->pos++;
2671 }
2672 out:
2673 put_task_struct(task);
2674 return 0;
2675 }
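
/*
 * Position bookkeeping, spelled out: dir_emit_dots() consumes positions 0
 * and 1 for "." and "..", so table slot i is published at ctx->pos == i + 2.
 * A readdir resuming at ctx->pos == 5, for example, restarts from ents[3]
 * and walks forward until proc_fill_cache() reports a full buffer.
 */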
2676
2677 #ifdef CONFIG_SECURITY
2678 static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
2679 size_t count, loff_t *ppos)
2680 {
2681 struct inode * inode = file_inode(file);
2682 char *p = NULL;
2683 ssize_t length;
2684 struct task_struct *task = get_proc_task(inode);
2685
2686 if (!task)
2687 return -ESRCH;
2688
2689 length = security_getprocattr(task, PROC_I(inode)->op.lsm,
2690 (char*)file->f_path.dentry->d_name.name,
2691 &p);
2692 put_task_struct(task);
2693 if (length > 0)
2694 length = simple_read_from_buffer(buf, count, ppos, p, length);
2695 kfree(p);
2696 return length;
2697 }
2698
2699 static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2700 size_t count, loff_t *ppos)
2701 {
2702 struct inode * inode = file_inode(file);
2703 struct task_struct *task;
2704 void *page;
2705 int rv;
2706
2707 rcu_read_lock();
2708 task = pid_task(proc_pid(inode), PIDTYPE_PID);
2709 if (!task) {
2710 rcu_read_unlock();
2711 return -ESRCH;
2712 }
2713 /* A task may only write its own attributes. */
2714 if (current != task) {
2715 rcu_read_unlock();
2716 return -EACCES;
2717 }
2718 /* Prevent changes to overridden credentials. */
2719 if (current_cred() != current_real_cred()) {
2720 rcu_read_unlock();
2721 return -EBUSY;
2722 }
2723 rcu_read_unlock();
2724
2725 if (count > PAGE_SIZE)
2726 count = PAGE_SIZE;
2727
2728 /* No partial writes. */
2729 if (*ppos != 0)
2730 return -EINVAL;
2731
2732 page = memdup_user(buf, count);
2733 if (IS_ERR(page)) {
2734 rv = PTR_ERR(page);
2735 goto out;
2736 }
2737
2738 /* Guard against adverse ptrace interaction */
2739 rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
2740 if (rv < 0)
2741 goto out_free;
2742
2743 rv = security_setprocattr(PROC_I(inode)->op.lsm,
2744 file->f_path.dentry->d_name.name, page,
2745 count);
2746 mutex_unlock(&current->signal->cred_guard_mutex);
2747 out_free:
2748 kfree(page);
2749 out:
2750 return rv;
2751 }
2752
2753 static const struct file_operations proc_pid_attr_operations = {
2754 .read = proc_pid_attr_read,
2755 .write = proc_pid_attr_write,
2756 .llseek = generic_file_llseek,
2757 };
2758
2759 #define LSM_DIR_OPS(LSM) \
2760 static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
2761 struct dir_context *ctx) \
2762 { \
2763 return proc_pident_readdir(filp, ctx, \
2764 LSM##_attr_dir_stuff, \
2765 ARRAY_SIZE(LSM##_attr_dir_stuff)); \
2766 } \
2767 \
2768 static const struct file_operations proc_##LSM##_attr_dir_ops = { \
2769 .read = generic_read_dir, \
2770 .iterate = proc_##LSM##_attr_dir_iterate, \
2771 .llseek = default_llseek, \
2772 }; \
2773 \
2774 static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
2775 struct dentry *dentry, unsigned int flags) \
2776 { \
2777 return proc_pident_lookup(dir, dentry, \
2778 LSM##_attr_dir_stuff, \
2779 LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
2780 } \
2781 \
2782 static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
2783 .lookup = proc_##LSM##_attr_dir_lookup, \
2784 .getattr = pid_getattr, \
2785 .setattr = proc_setattr, \
2786 }
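
/*
 * For example, LSM_DIR_OPS(smack) below expands to
 * proc_smack_attr_dir_iterate(), proc_smack_attr_dir_ops,
 * proc_smack_attr_dir_lookup() and proc_smack_attr_dir_inode_ops, all
 * driven by the smack_attr_dir_stuff[] table.
 */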
2787
2788 #ifdef CONFIG_SECURITY_SMACK
2789 static const struct pid_entry smack_attr_dir_stuff[] = {
2790 ATTR("smack", "current", 0666),
2791 };
2792 LSM_DIR_OPS(smack);
2793 #endif
2794
2795 #ifdef CONFIG_SECURITY_APPARMOR
2796 static const struct pid_entry apparmor_attr_dir_stuff[] = {
2797 ATTR("apparmor", "current", 0666),
2798 ATTR("apparmor", "prev", 0444),
2799 ATTR("apparmor", "exec", 0666),
2800 };
2801 LSM_DIR_OPS(apparmor);
2802 #endif
2803
2804 static const struct pid_entry attr_dir_stuff[] = {
2805 ATTR(NULL, "current", 0666),
2806 ATTR(NULL, "prev", 0444),
2807 ATTR(NULL, "exec", 0666),
2808 ATTR(NULL, "fscreate", 0666),
2809 ATTR(NULL, "keycreate", 0666),
2810 ATTR(NULL, "sockcreate", 0666),
2811 ATTR(NULL, "display", 0666),
2812 ATTR(NULL, "context", 0444),
2813 #ifdef CONFIG_SECURITY_SMACK
2814 DIR("smack", 0555,
2815 proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
2816 #endif
2817 #ifdef CONFIG_SECURITY_APPARMOR
2818 DIR("apparmor", 0555,
2819 proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops),
2820 #endif
2821 };
2822
2823 static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
2824 {
2825 return proc_pident_readdir(file, ctx,
2826 attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
2827 }
2828
2829 static const struct file_operations proc_attr_dir_operations = {
2830 .read = generic_read_dir,
2831 .iterate_shared = proc_attr_dir_readdir,
2832 .llseek = generic_file_llseek,
2833 };
2834
2835 static struct dentry *proc_attr_dir_lookup(struct inode *dir,
2836 struct dentry *dentry, unsigned int flags)
2837 {
2838 return proc_pident_lookup(dir, dentry,
2839 attr_dir_stuff,
2840 attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff));
2841 }
2842
2843 static const struct inode_operations proc_attr_dir_inode_operations = {
2844 .lookup = proc_attr_dir_lookup,
2845 .getattr = pid_getattr,
2846 .setattr = proc_setattr,
2847 };
2848
2849 #endif
2850
2851 #ifdef CONFIG_ELF_CORE
2852 static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
2853 size_t count, loff_t *ppos)
2854 {
2855 struct task_struct *task = get_proc_task(file_inode(file));
2856 struct mm_struct *mm;
2857 char buffer[PROC_NUMBUF];
2858 size_t len;
2859 int ret;
2860
2861 if (!task)
2862 return -ESRCH;
2863
2864 ret = 0;
2865 mm = get_task_mm(task);
2866 if (mm) {
2867 len = snprintf(buffer, sizeof(buffer), "%08lx\n",
2868 ((mm->flags & MMF_DUMP_FILTER_MASK) >>
2869 MMF_DUMP_FILTER_SHIFT));
2870 mmput(mm);
2871 ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
2872 }
2873
2874 put_task_struct(task);
2875
2876 return ret;
2877 }
2878
2879 static ssize_t proc_coredump_filter_write(struct file *file,
2880 const char __user *buf,
2881 size_t count,
2882 loff_t *ppos)
2883 {
2884 struct task_struct *task;
2885 struct mm_struct *mm;
2886 unsigned int val;
2887 int ret;
2888 int i;
2889 unsigned long mask;
2890
2891 ret = kstrtouint_from_user(buf, count, 0, &val);
2892 if (ret < 0)
2893 return ret;
2894
2895 ret = -ESRCH;
2896 task = get_proc_task(file_inode(file));
2897 if (!task)
2898 goto out_no_task;
2899
2900 mm = get_task_mm(task);
2901 if (!mm)
2902 goto out_no_mm;
2903 ret = 0;
2904
2905 for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
2906 if (val & mask)
2907 set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2908 else
2909 clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2910 }
2911
2912 mmput(mm);
2913 out_no_mm:
2914 put_task_struct(task);
2915 out_no_task:
2916 if (ret < 0)
2917 return ret;
2918 return count;
2919 }
2920
2921 static const struct file_operations proc_coredump_filter_operations = {
2922 .read = proc_coredump_filter_read,
2923 .write = proc_coredump_filter_write,
2924 .llseek = generic_file_llseek,
2925 };
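
/*
 * A minimal userspace sketch (illustrative; bit assignments follow
 * Documentation/filesystems/proc.rst, e.g. bit 0 = anonymous private,
 * bit 4 = ELF headers). kstrtouint_from_user() uses base 0 above, so both
 * "0x33" and "51" are accepted on write:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/proc/self/coredump_filter", "r+");
 *		unsigned int cur;
 *
 *		if (!f || fscanf(f, "%x", &cur) != 1)
 *			return 1;
 *		printf("old filter: %#x\n", cur);
 *		rewind(f);
 *		fprintf(f, "0x33");	// anon mappings + ELF headers + private hugetlb
 *		return fclose(f) ? 1 : 0;
 *	}
 */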
2926 #endif
2927
2928 #ifdef CONFIG_TASK_IO_ACCOUNTING
2929 static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
2930 {
2931 struct task_io_accounting acct = task->ioac;
2932 unsigned long flags;
2933 int result;
2934
2935 result = down_read_killable(&task->signal->exec_update_lock);
2936 if (result)
2937 return result;
2938
2939 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
2940 result = -EACCES;
2941 goto out_unlock;
2942 }
2943
2944 if (whole && lock_task_sighand(task, &flags)) {
2945 struct task_struct *t = task;
2946
2947 task_io_accounting_add(&acct, &task->signal->ioac);
2948 while_each_thread(task, t)
2949 task_io_accounting_add(&acct, &t->ioac);
2950
2951 unlock_task_sighand(task, &flags);
2952 }
2953 seq_printf(m,
2954 "rchar: %llu\n"
2955 "wchar: %llu\n"
2956 "syscr: %llu\n"
2957 "syscw: %llu\n"
2958 "read_bytes: %llu\n"
2959 "write_bytes: %llu\n"
2960 "cancelled_write_bytes: %llu\n",
2961 (unsigned long long)acct.rchar,
2962 (unsigned long long)acct.wchar,
2963 (unsigned long long)acct.syscr,
2964 (unsigned long long)acct.syscw,
2965 (unsigned long long)acct.read_bytes,
2966 (unsigned long long)acct.write_bytes,
2967 (unsigned long long)acct.cancelled_write_bytes);
2968 result = 0;
2969
2970 out_unlock:
2971 up_read(&task->signal->exec_update_lock);
2972 return result;
2973 }
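
/*
 * Sample output shape (illustrative numbers), matching the seq_printf()
 * above:
 *
 *	rchar: 323934931
 *	wchar: 323929600
 *	syscr: 632687
 *	syscw: 632675
 *	read_bytes: 787456
 *	write_bytes: 323932160
 *	cancelled_write_bytes: 0
 */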
2974
2975 static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
2976 struct pid *pid, struct task_struct *task)
2977 {
2978 return do_io_accounting(task, m, 0);
2979 }
2980
2981 static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
2982 struct pid *pid, struct task_struct *task)
2983 {
2984 return do_io_accounting(task, m, 1);
2985 }
2986 #endif /* CONFIG_TASK_IO_ACCOUNTING */
2987
2988 #ifdef CONFIG_USER_NS
2989 static int proc_id_map_open(struct inode *inode, struct file *file,
2990 const struct seq_operations *seq_ops)
2991 {
2992 struct user_namespace *ns = NULL;
2993 struct task_struct *task;
2994 struct seq_file *seq;
2995 int ret = -EINVAL;
2996
2997 task = get_proc_task(inode);
2998 if (task) {
2999 rcu_read_lock();
3000 ns = get_user_ns(task_cred_xxx(task, user_ns));
3001 rcu_read_unlock();
3002 put_task_struct(task);
3003 }
3004 if (!ns)
3005 goto err;
3006
3007 ret = seq_open(file, seq_ops);
3008 if (ret)
3009 goto err_put_ns;
3010
3011 seq = file->private_data;
3012 seq->private = ns;
3013
3014 return 0;
3015 err_put_ns:
3016 put_user_ns(ns);
3017 err:
3018 return ret;
3019 }
3020
3021 static int proc_id_map_release(struct inode *inode, struct file *file)
3022 {
3023 struct seq_file *seq = file->private_data;
3024 struct user_namespace *ns = seq->private;
3025 put_user_ns(ns);
3026 return seq_release(inode, file);
3027 }
3028
3029 static int proc_uid_map_open(struct inode *inode, struct file *file)
3030 {
3031 return proc_id_map_open(inode, file, &proc_uid_seq_operations);
3032 }
3033
3034 static int proc_gid_map_open(struct inode *inode, struct file *file)
3035 {
3036 return proc_id_map_open(inode, file, &proc_gid_seq_operations);
3037 }
3038
3039 static int proc_projid_map_open(struct inode *inode, struct file *file)
3040 {
3041 return proc_id_map_open(inode, file, &proc_projid_seq_operations);
3042 }
3043
3044 static const struct file_operations proc_uid_map_operations = {
3045 .open = proc_uid_map_open,
3046 .write = proc_uid_map_write,
3047 .read = seq_read,
3048 .llseek = seq_lseek,
3049 .release = proc_id_map_release,
3050 };
3051
3052 static const struct file_operations proc_gid_map_operations = {
3053 .open = proc_gid_map_open,
3054 .write = proc_gid_map_write,
3055 .read = seq_read,
3056 .llseek = seq_lseek,
3057 .release = proc_id_map_release,
3058 };
3059
3060 static const struct file_operations proc_projid_map_operations = {
3061 .open = proc_projid_map_open,
3062 .write = proc_projid_map_write,
3063 .read = seq_read,
3064 .llseek = seq_lseek,
3065 .release = proc_id_map_release,
3066 };
3067
3068 static int proc_setgroups_open(struct inode *inode, struct file *file)
3069 {
3070 struct user_namespace *ns = NULL;
3071 struct task_struct *task;
3072 int ret;
3073
3074 ret = -ESRCH;
3075 task = get_proc_task(inode);
3076 if (task) {
3077 rcu_read_lock();
3078 ns = get_user_ns(task_cred_xxx(task, user_ns));
3079 rcu_read_unlock();
3080 put_task_struct(task);
3081 }
3082 if (!ns)
3083 goto err;
3084
3085 if (file->f_mode & FMODE_WRITE) {
3086 ret = -EACCES;
3087 if (!ns_capable(ns, CAP_SYS_ADMIN))
3088 goto err_put_ns;
3089 }
3090
3091 ret = single_open(file, &proc_setgroups_show, ns);
3092 if (ret)
3093 goto err_put_ns;
3094
3095 return 0;
3096 err_put_ns:
3097 put_user_ns(ns);
3098 err:
3099 return ret;
3100 }
3101
3102 static int proc_setgroups_release(struct inode *inode, struct file *file)
3103 {
3104 struct seq_file *seq = file->private_data;
3105 struct user_namespace *ns = seq->private;
3106 int ret = single_release(inode, file);
3107 put_user_ns(ns);
3108 return ret;
3109 }
3110
3111 static const struct file_operations proc_setgroups_operations = {
3112 .open = proc_setgroups_open,
3113 .write = proc_setgroups_write,
3114 .read = seq_read,
3115 .llseek = seq_lseek,
3116 .release = proc_setgroups_release,
3117 };
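
/*
 * A minimal userspace sketch of the intended write ordering (outer uid 1000
 * is an illustrative value): an unprivileged process that has done
 * unshare(CLONE_NEWUSER) must write "deny" to setgroups before gid_map will
 * accept a mapping:
 *
 *	#include <stdio.h>
 *
 *	static int write_str(const char *path, const char *s)
 *	{
 *		FILE *f = fopen(path, "w");
 *
 *		if (!f)
 *			return -1;
 *		fprintf(f, "%s", s);
 *		return fclose(f);
 *	}
 *
 *	int main(void)	// run after unshare(CLONE_NEWUSER)
 *	{
 *		write_str("/proc/self/setgroups", "deny");
 *		write_str("/proc/self/uid_map", "0 1000 1");
 *		write_str("/proc/self/gid_map", "0 1000 1");
 *		return 0;
 *	}
 */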
3118 #endif /* CONFIG_USER_NS */
3119
3120 static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
3121 struct pid *pid, struct task_struct *task)
3122 {
3123 int err = lock_trace(task);
3124 if (!err) {
3125 seq_printf(m, "%08x\n", task->personality);
3126 unlock_trace(task);
3127 }
3128 return err;
3129 }
3130
3131 #ifdef CONFIG_LIVEPATCH
3132 static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
3133 struct pid *pid, struct task_struct *task)
3134 {
3135 seq_printf(m, "%d\n", task->patch_state);
3136 return 0;
3137 }
3138 #endif /* CONFIG_LIVEPATCH */
3139
3140 #ifdef CONFIG_STACKLEAK_METRICS
3141 static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
3142 struct pid *pid, struct task_struct *task)
3143 {
3144 unsigned long prev_depth = THREAD_SIZE -
3145 (task->prev_lowest_stack & (THREAD_SIZE - 1));
3146 unsigned long depth = THREAD_SIZE -
3147 (task->lowest_stack & (THREAD_SIZE - 1));
3148
3149 seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
3150 prev_depth, depth);
3151 return 0;
3152 }
3153 #endif /* CONFIG_STACKLEAK_METRICS */
3154
3155 /*
3156 * Thread groups
3157 */
3158 static const struct file_operations proc_task_operations;
3159 static const struct inode_operations proc_task_inode_operations;
3160
3161 static const struct pid_entry tgid_base_stuff[] = {
3162 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
3163 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
3164 DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
3165 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
3166 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
3167 #ifdef CONFIG_NET
3168 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
3169 #endif
3170 REG("environ", S_IRUSR, proc_environ_operations),
3171 REG("auxv", S_IRUSR, proc_auxv_operations),
3172 ONE("status", S_IRUGO, proc_pid_status),
3173 ONE("personality", S_IRUSR, proc_pid_personality),
3174 ONE("limits", S_IRUGO, proc_pid_limits),
3175 #ifdef CONFIG_SCHED_DEBUG
3176 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
3177 #endif
3178 #ifdef CONFIG_SCHED_AUTOGROUP
3179 REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
3180 #endif
3181 #ifdef CONFIG_TIME_NS
3182 REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations),
3183 #endif
3184 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
3185 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
3186 ONE("syscall", S_IRUSR, proc_pid_syscall),
3187 #endif
3188 REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
3189 ONE("stat", S_IRUGO, proc_tgid_stat),
3190 ONE("statm", S_IRUGO, proc_pid_statm),
3191 REG("maps", S_IRUGO, proc_pid_maps_operations),
3192 #ifdef CONFIG_NUMA
3193 REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
3194 #endif
3195 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
3196 LNK("cwd", proc_cwd_link),
3197 LNK("root", proc_root_link),
3198 LNK("exe", proc_exe_link),
3199 REG("mounts", S_IRUGO, proc_mounts_operations),
3200 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
3201 REG("mountstats", S_IRUSR, proc_mountstats_operations),
3202 #ifdef CONFIG_PROC_PAGE_MONITOR
3203 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
3204 REG("smaps", S_IRUGO, proc_pid_smaps_operations),
3205 REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
3206 REG("pagemap", S_IRUSR, proc_pagemap_operations),
3207 #endif
3208 #ifdef CONFIG_SECURITY
3209 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
3210 #endif
3211 #ifdef CONFIG_KALLSYMS
3212 ONE("wchan", S_IRUGO, proc_pid_wchan),
3213 #endif
3214 #ifdef CONFIG_STACKTRACE
3215 ONE("stack", S_IRUSR, proc_pid_stack),
3216 #endif
3217 #ifdef CONFIG_SCHED_INFO
3218 ONE("schedstat", S_IRUGO, proc_pid_schedstat),
3219 #endif
3220 #ifdef CONFIG_LATENCYTOP
3221 REG("latency", S_IRUGO, proc_lstats_operations),
3222 #endif
3223 #ifdef CONFIG_PROC_PID_CPUSET
3224 ONE("cpuset", S_IRUGO, proc_cpuset_show),
3225 #endif
3226 #ifdef CONFIG_CGROUPS
3227 ONE("cgroup", S_IRUGO, proc_cgroup_show),
3228 #endif
3229 #ifdef CONFIG_PROC_CPU_RESCTRL
3230 ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
3231 #endif
3232 ONE("oom_score", S_IRUGO, proc_oom_score),
3233 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
3234 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3235 #ifdef CONFIG_AUDIT
3236 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
3237 REG("sessionid", S_IRUGO, proc_sessionid_operations),
3238 #endif
3239 #ifdef CONFIG_FAULT_INJECTION
3240 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
3241 REG("fail-nth", 0644, proc_fail_nth_operations),
3242 #endif
3243 #ifdef CONFIG_ELF_CORE
3244 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
3245 #endif
3246 #ifdef CONFIG_TASK_IO_ACCOUNTING
3247 ONE("io", S_IRUSR, proc_tgid_io_accounting),
3248 #endif
3249 #ifdef CONFIG_USER_NS
3250 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
3251 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
3252 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
3253 REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
3254 #endif
3255 #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
3256 REG("timers", S_IRUGO, proc_timers_operations),
3257 #endif
3258 REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
3259 #ifdef CONFIG_LIVEPATCH
3260 ONE("patch_state", S_IRUSR, proc_pid_patch_state),
3261 #endif
3262 #ifdef CONFIG_STACKLEAK_METRICS
3263 ONE("stack_depth", S_IRUGO, proc_stack_depth),
3264 #endif
3265 #ifdef CONFIG_PROC_PID_ARCH_STATUS
3266 ONE("arch_status", S_IRUGO, proc_pid_arch_status),
3267 #endif
3268 #ifdef CONFIG_SECCOMP_CACHE_DEBUG
3269 ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
3270 #endif
3271 };
3272
3273 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
3274 {
3275 return proc_pident_readdir(file, ctx,
3276 tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
3277 }
3278
3279 static const struct file_operations proc_tgid_base_operations = {
3280 .read = generic_read_dir,
3281 .iterate_shared = proc_tgid_base_readdir,
3282 .llseek = generic_file_llseek,
3283 };
3284
3285 struct pid *tgid_pidfd_to_pid(const struct file *file)
3286 {
3287 if (file->f_op != &proc_tgid_base_operations)
3288 return ERR_PTR(-EBADF);
3289
3290 return proc_pid(file_inode(file));
3291 }
3292
3293 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
3294 {
3295 return proc_pident_lookup(dir, dentry,
3296 tgid_base_stuff,
3297 tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff));
3298 }
3299
3300 static const struct inode_operations proc_tgid_base_inode_operations = {
3301 .lookup = proc_tgid_base_lookup,
3302 .getattr = pid_getattr,
3303 .setattr = proc_setattr,
3304 .permission = proc_pid_permission,
3305 };
3306
3307 /**
3308 * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache.
3309 * @pid: pid that should be flushed.
3310 *
3311 * This function walks a list of inodes (that belong to any proc
3312 * filesystem) that are attached to the pid and flushes them from
3313 * the dentry cache.
3314 *
3315 * It is safe and reasonable to cache /proc entries for a task until
3316 * that task exits. After that they just clog up the dcache with
3317 * useless entries, possibly causing useful dcache entries to be
3318 * flushed instead. This routine is provided to flush those useless
3319 * dcache entries when a process is reaped.
3320 *
3321 * NOTE: This routine is just an optimization, so it does not guarantee
3322 * that no dcache entries will exist after a process is reaped;
3323 * it just makes it very unlikely that any will persist.
3324 */
3325
3326 void proc_flush_pid(struct pid *pid)
3327 {
3328 proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
3329 }
3330
3331 static struct dentry *proc_pid_instantiate(struct dentry * dentry,
3332 struct task_struct *task, const void *ptr)
3333 {
3334 struct inode *inode;
3335
3336 inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
3337 if (!inode)
3338 return ERR_PTR(-ENOENT);
3339
3340 inode->i_op = &proc_tgid_base_inode_operations;
3341 inode->i_fop = &proc_tgid_base_operations;
3342 inode->i_flags|=S_IMMUTABLE;
3343
3344 set_nlink(inode, nlink_tgid);
3345 pid_update_inode(task, inode);
3346
3347 d_set_d_op(dentry, &pid_dentry_operations);
3348 return d_splice_alias(inode, dentry);
3349 }
3350
3351 struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
3352 {
3353 struct task_struct *task;
3354 unsigned tgid;
3355 struct proc_fs_info *fs_info;
3356 struct pid_namespace *ns;
3357 struct dentry *result = ERR_PTR(-ENOENT);
3358
3359 tgid = name_to_int(&dentry->d_name);
3360 if (tgid == ~0U)
3361 goto out;
3362
3363 fs_info = proc_sb_info(dentry->d_sb);
3364 ns = fs_info->pid_ns;
3365 rcu_read_lock();
3366 task = find_task_by_pid_ns(tgid, ns);
3367 if (task)
3368 get_task_struct(task);
3369 rcu_read_unlock();
3370 if (!task)
3371 goto out;
3372
3373 /* Limit procfs to only ptraceable tasks */
3374 if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) {
3375 if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS))
3376 goto out_put_task;
3377 }
3378
3379 result = proc_pid_instantiate(dentry, task, NULL);
3380 out_put_task:
3381 put_task_struct(task);
3382 out:
3383 return result;
3384 }
3385
3386 /*
3387 * Find the first task with tgid >= the requested tgid.
3388 */
3390 struct tgid_iter {
3391 unsigned int tgid;
3392 struct task_struct *task;
3393 };
3394 static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
3395 {
3396 struct pid *pid;
3397
3398 if (iter.task)
3399 put_task_struct(iter.task);
3400 rcu_read_lock();
3401 retry:
3402 iter.task = NULL;
3403 pid = find_ge_pid(iter.tgid, ns);
3404 if (pid) {
3405 iter.tgid = pid_nr_ns(pid, ns);
3406 iter.task = pid_task(pid, PIDTYPE_TGID);
3407 if (!iter.task) {
3408 iter.tgid += 1;
3409 goto retry;
3410 }
3411 get_task_struct(iter.task);
3412 }
3413 rcu_read_unlock();
3414 return iter;
3415 }
3416
3417 #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)
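
/*
 * Offset layout, spelled out: positions below FIRST_PROCESS_ENTRY are
 * reserved for the non-process entries of /proc, "self" sits at
 * TGID_OFFSET - 2, "thread-self" at TGID_OFFSET - 1, and thread group N is
 * published at position N + TGID_OFFSET, so proc_pid_readdir() can resume
 * at an exact tgid from ctx->pos alone.
 */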
3418
3419 /* for the /proc/ directory itself, after non-process stuff has been done */
3420 int proc_pid_readdir(struct file *file, struct dir_context *ctx)
3421 {
3422 struct tgid_iter iter;
3423 struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
3424 struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb);
3425 loff_t pos = ctx->pos;
3426
3427 if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
3428 return 0;
3429
3430 if (pos == TGID_OFFSET - 2) {
3431 struct inode *inode = d_inode(fs_info->proc_self);
3432 if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
3433 return 0;
3434 ctx->pos = pos = pos + 1;
3435 }
3436 if (pos == TGID_OFFSET - 1) {
3437 struct inode *inode = d_inode(fs_info->proc_thread_self);
3438 if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
3439 return 0;
3440 ctx->pos = pos = pos + 1;
3441 }
3442 iter.tgid = pos - TGID_OFFSET;
3443 iter.task = NULL;
3444 for (iter = next_tgid(ns, iter);
3445 iter.task;
3446 iter.tgid += 1, iter = next_tgid(ns, iter)) {
3447 char name[10 + 1];
3448 unsigned int len;
3449
3450 cond_resched();
3451 if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE))
3452 continue;
3453
3454 len = snprintf(name, sizeof(name), "%u", iter.tgid);
3455 ctx->pos = iter.tgid + TGID_OFFSET;
3456 if (!proc_fill_cache(file, ctx, name, len,
3457 proc_pid_instantiate, iter.task, NULL)) {
3458 put_task_struct(iter.task);
3459 return 0;
3460 }
3461 }
3462 ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
3463 return 0;
3464 }
3465
3466 /*
3467 * proc_tid_comm_permission is a special permission function exclusively
3468 * used for the node /proc/<pid>/task/<tid>/comm.
3469 * It bypasses generic permission checks in the case where a task of the same
3470 * task group attempts to access the node.
3471 * The rationale behind this is that glibc and bionic access this node for
3472 * cross thread naming (pthread_set/getname_np(!self)). However, if
3473 * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
3474 * which locks out the cross thread naming implementation.
3475 * This function makes sure that the node is always accessible for members
3476 * of the same thread group.
3477 */
3478 static int proc_tid_comm_permission(struct inode *inode, int mask)
3479 {
3480 bool is_same_tgroup;
3481 struct task_struct *task;
3482
3483 task = get_proc_task(inode);
3484 if (!task)
3485 return -ESRCH;
3486 is_same_tgroup = same_thread_group(current, task);
3487 put_task_struct(task);
3488
3489 if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
3490 /* This file (/proc/<pid>/task/<tid>/comm) can always be
3491 * read or written by the members of the corresponding
3492 * thread group.
3493 */
3494 return 0;
3495 }
3496
3497 return generic_permission(inode, mask);
3498 }
3499
3500 static const struct inode_operations proc_tid_comm_inode_operations = {
3501 .permission = proc_tid_comm_permission,
3502 };
3503
3504 /*
3505 * Tasks
3506 */
3507 static const struct pid_entry tid_base_stuff[] = {
3508 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
3509 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
3510 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
3511 #ifdef CONFIG_NET
3512 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
3513 #endif
3514 REG("environ", S_IRUSR, proc_environ_operations),
3515 REG("auxv", S_IRUSR, proc_auxv_operations),
3516 ONE("status", S_IRUGO, proc_pid_status),
3517 ONE("personality", S_IRUSR, proc_pid_personality),
3518 ONE("limits", S_IRUGO, proc_pid_limits),
3519 #ifdef CONFIG_SCHED_DEBUG
3520 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
3521 #endif
3522 NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
3523 &proc_tid_comm_inode_operations,
3524 &proc_pid_set_comm_operations, {}),
3525 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
3526 ONE("syscall", S_IRUSR, proc_pid_syscall),
3527 #endif
3528 REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
3529 ONE("stat", S_IRUGO, proc_tid_stat),
3530 ONE("statm", S_IRUGO, proc_pid_statm),
3531 REG("maps", S_IRUGO, proc_pid_maps_operations),
3532 #ifdef CONFIG_PROC_CHILDREN
3533 REG("children", S_IRUGO, proc_tid_children_operations),
3534 #endif
3535 #ifdef CONFIG_NUMA
3536 REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
3537 #endif
3538 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
3539 LNK("cwd", proc_cwd_link),
3540 LNK("root", proc_root_link),
3541 LNK("exe", proc_exe_link),
3542 REG("mounts", S_IRUGO, proc_mounts_operations),
3543 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
3544 #ifdef CONFIG_PROC_PAGE_MONITOR
3545 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
3546 REG("smaps", S_IRUGO, proc_pid_smaps_operations),
3547 REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
3548 REG("pagemap", S_IRUSR, proc_pagemap_operations),
3549 #endif
3550 #ifdef CONFIG_SECURITY
3551 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
3552 #endif
3553 #ifdef CONFIG_KALLSYMS
3554 ONE("wchan", S_IRUGO, proc_pid_wchan),
3555 #endif
3556 #ifdef CONFIG_STACKTRACE
3557 ONE("stack", S_IRUSR, proc_pid_stack),
3558 #endif
3559 #ifdef CONFIG_SCHED_INFO
3560 ONE("schedstat", S_IRUGO, proc_pid_schedstat),
3561 #endif
3562 #ifdef CONFIG_LATENCYTOP
3563 REG("latency", S_IRUGO, proc_lstats_operations),
3564 #endif
3565 #ifdef CONFIG_PROC_PID_CPUSET
3566 ONE("cpuset", S_IRUGO, proc_cpuset_show),
3567 #endif
3568 #ifdef CONFIG_CGROUPS
3569 ONE("cgroup", S_IRUGO, proc_cgroup_show),
3570 #endif
3571 #ifdef CONFIG_PROC_CPU_RESCTRL
3572 ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
3573 #endif
3574 ONE("oom_score", S_IRUGO, proc_oom_score),
3575 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
3576 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3577 #ifdef CONFIG_AUDIT
3578 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
3579 REG("sessionid", S_IRUGO, proc_sessionid_operations),
3580 #endif
3581 #ifdef CONFIG_FAULT_INJECTION
3582 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
3583 REG("fail-nth", 0644, proc_fail_nth_operations),
3584 #endif
3585 #ifdef CONFIG_TASK_IO_ACCOUNTING
3586 ONE("io", S_IRUSR, proc_tid_io_accounting),
3587 #endif
3588 #ifdef CONFIG_USER_NS
3589 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
3590 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
3591 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
3592 REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
3593 #endif
3594 #ifdef CONFIG_LIVEPATCH
3595 ONE("patch_state", S_IRUSR, proc_pid_patch_state),
3596 #endif
3597 #ifdef CONFIG_PROC_PID_ARCH_STATUS
3598 ONE("arch_status", S_IRUGO, proc_pid_arch_status),
3599 #endif
3600 #ifdef CONFIG_SECCOMP_CACHE_DEBUG
3601 ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
3602 #endif
3603 };
3604
3605 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
3606 {
3607 return proc_pident_readdir(file, ctx,
3608 tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3609 }
3610
3611 static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
3612 {
3613 return proc_pident_lookup(dir, dentry,
3614 tid_base_stuff,
3615 tid_base_stuff + ARRAY_SIZE(tid_base_stuff));
3616 }
3617
3618 static const struct file_operations proc_tid_base_operations = {
3619 .read = generic_read_dir,
3620 .iterate_shared = proc_tid_base_readdir,
3621 .llseek = generic_file_llseek,
3622 };
3623
3624 static const struct inode_operations proc_tid_base_inode_operations = {
3625 .lookup = proc_tid_base_lookup,
3626 .getattr = pid_getattr,
3627 .setattr = proc_setattr,
3628 };
3629
3630 static struct dentry *proc_task_instantiate(struct dentry *dentry,
3631 struct task_struct *task, const void *ptr)
3632 {
3633 struct inode *inode;
3634 inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
3635 if (!inode)
3636 return ERR_PTR(-ENOENT);
3637
3638 inode->i_op = &proc_tid_base_inode_operations;
3639 inode->i_fop = &proc_tid_base_operations;
3640 inode->i_flags |= S_IMMUTABLE;
3641
3642 set_nlink(inode, nlink_tid);
3643 pid_update_inode(task, inode);
3644
3645 d_set_d_op(dentry, &pid_dentry_operations);
3646 return d_splice_alias(inode, dentry);
3647 }
3648
3649 static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
3650 {
3651 struct task_struct *task;
3652 struct task_struct *leader = get_proc_task(dir);
3653 unsigned tid;
3654 struct proc_fs_info *fs_info;
3655 struct pid_namespace *ns;
3656 struct dentry *result = ERR_PTR(-ENOENT);
3657
3658 if (!leader)
3659 goto out_no_task;
3660
3661 tid = name_to_int(&dentry->d_name);
3662 if (tid == ~0U)
3663 goto out;
3664
3665 fs_info = proc_sb_info(dentry->d_sb);
3666 ns = fs_info->pid_ns;
3667 rcu_read_lock();
3668 task = find_task_by_pid_ns(tid, ns);
3669 if (task)
3670 get_task_struct(task);
3671 rcu_read_unlock();
3672 if (!task)
3673 goto out;
3674 if (!same_thread_group(leader, task))
3675 goto out_drop_task;
3676
3677 result = proc_task_instantiate(dentry, task, NULL);
3678 out_drop_task:
3679 put_task_struct(task);
3680 out:
3681 put_task_struct(leader);
3682 out_no_task:
3683 return result;
3684 }
3685
3686 /*
3687 * Find the first tid of a thread group to return to user space.
3688 *
3689 * Usually this is just the thread group leader, but if the user's
3690 * buffer was too small or there was a seek into the middle of the
3691 * directory we have more work to do.
3692 *
3693 * In the case of a short read we start with find_task_by_pid_ns().
3694 *
3695 * In the case of a seek we start with the leader and walk nr
3696 * threads past it.
3697 */
3698 static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
3699 struct pid_namespace *ns)
3700 {
3701 struct task_struct *pos, *task;
3702 unsigned long nr = f_pos;
3703
3704 if (nr != f_pos) /* 32bit overflow? */
3705 return NULL;
3706
3707 rcu_read_lock();
3708 task = pid_task(pid, PIDTYPE_PID);
3709 if (!task)
3710 goto fail;
3711
3712 /* Attempt to start with the tid of a thread */
3713 if (tid && nr) {
3714 pos = find_task_by_pid_ns(tid, ns);
3715 if (pos && same_thread_group(pos, task))
3716 goto found;
3717 }
3718
3719 /* If nr exceeds the number of threads there is nothing to do */
3720 if (nr >= get_nr_threads(task))
3721 goto fail;
3722
3723 /* If we haven't found our starting place yet, start
3724 * with the leader and walk nr threads forward.
3725 */
3726 pos = task = task->group_leader;
3727 do {
3728 if (!nr--)
3729 goto found;
3730 } while_each_thread(task, pos);
3731 fail:
3732 pos = NULL;
3733 goto out;
3734 found:
3735 get_task_struct(pos);
3736 out:
3737 rcu_read_unlock();
3738 return pos;
3739 }
3740
3741 /*
3742 * Find the next thread in the thread list.
3743 * Return NULL if there is an error or no next thread.
3744 *
3745 * The reference to the input task_struct is released.
3746 */
3747 static struct task_struct *next_tid(struct task_struct *start)
3748 {
3749 struct task_struct *pos = NULL;
3750 rcu_read_lock();
3751 if (pid_alive(start)) {
3752 pos = next_thread(start);
3753 if (thread_group_leader(pos))
3754 pos = NULL;
3755 else
3756 get_task_struct(pos);
3757 }
3758 rcu_read_unlock();
3759 put_task_struct(start);
3760 return pos;
3761 }
3762
3763 /* for the /proc/TGID/task/ directories */
3764 static int proc_task_readdir(struct file *file, struct dir_context *ctx)
3765 {
3766 struct inode *inode = file_inode(file);
3767 struct task_struct *task;
3768 struct pid_namespace *ns;
3769 int tid;
3770
3771 if (proc_inode_is_dead(inode))
3772 return -ENOENT;
3773
3774 if (!dir_emit_dots(file, ctx))
3775 return 0;
3776
3777 /* f_version caches the tid value that the last readdir call couldn't
3778 * return. lseek aka telldir automagically resets f_version to 0.
3779 */
3780 ns = proc_pid_ns(inode->i_sb);
3781 tid = (int)file->f_version;
3782 file->f_version = 0;
3783 for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
3784 task;
3785 task = next_tid(task), ctx->pos++) {
3786 char name[10 + 1];
3787 unsigned int len;
3788 tid = task_pid_nr_ns(task, ns);
3789 len = snprintf(name, sizeof(name), "%u", tid);
3790 if (!proc_fill_cache(file, ctx, name, len,
3791 proc_task_instantiate, task, NULL)) {
3792 /* returning this tid failed, save it as the first
3793 * tid for the next readdir call */
3794 file->f_version = (u64)tid;
3795 put_task_struct(task);
3796 break;
3797 }
3798 }
3799
3800 return 0;
3801 }
3802
3803 static int proc_task_getattr(const struct path *path, struct kstat *stat,
3804 u32 request_mask, unsigned int query_flags)
3805 {
3806 struct inode *inode = d_inode(path->dentry);
3807 struct task_struct *p = get_proc_task(inode);
3808 generic_fillattr(inode, stat);
3809
3810 if (p) {
3811 stat->nlink += get_nr_threads(p);
3812 put_task_struct(p);
3813 }
3814
3815 return 0;
3816 }
3817
3818 static const struct inode_operations proc_task_inode_operations = {
3819 .lookup = proc_task_lookup,
3820 .getattr = proc_task_getattr,
3821 .setattr = proc_setattr,
3822 .permission = proc_pid_permission,
3823 };
3824
3825 static const struct file_operations proc_task_operations = {
3826 .read = generic_read_dir,
3827 .iterate_shared = proc_task_readdir,
3828 .llseek = generic_file_llseek,
3829 };
3830
3831 void __init set_proc_pid_nlink(void)
3832 {
3833 nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3834 nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
3835 }