1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * linux/fs/proc/base.c
4 *
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 *
7 * proc base directory handling functions
8 *
9 * 1999, Al Viro. Rewritten. Now it covers the whole per-process part.
10 * Instead of using magical inumbers to determine the kind of object
11 * we allocate and fill in-core inodes upon lookup. They don't even
12 * go into icache. We cache the reference to task_struct upon lookup too.
13 * Eventually it should become a filesystem in its own right. We don't
14 * use the rest of procfs anymore.
15 *
16 *
17 * Changelog:
18 * 17-Jan-2005
19 * Allan Bezerra
20 * Bruna Moreira <bruna.moreira@indt.org.br>
21 * Edjard Mota <edjard.mota@indt.org.br>
22 * Ilias Biris <ilias.biris@indt.org.br>
23 * Mauricio Lin <mauricio.lin@indt.org.br>
24 *
25 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
26 *
27 * A new process-specific entry (smaps) was included in /proc. It shows the
28 * size of rss for each memory area. The maps entry lacks information
29 * about physical memory size (rss) for each mapped file, i.e.,
30 * rss information for executables and library files.
31 * This additional information is useful for any tools that need to know
32 * about the physical memory consumption of each library a process uses.
33 *
34 * Changelog:
35 * 21-Feb-2005
36 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
37 * PUD inclusion in the page table walking.
38 *
39 * ChangeLog:
40 * 10-Mar-2005
41 * 10LE Instituto Nokia de Tecnologia - INdT:
42 * A better way to walk through the page table, as suggested by Hugh Dickins.
43 *
44 * Simo Piiroinen <simo.piiroinen@nokia.com>:
45 * Smaps information related to shared, private, clean and dirty pages.
46 *
47 * Paul Mundt <paul.mundt@nokia.com>:
48 * Overall revision about smaps.
49 */
50
51 #include <linux/uaccess.h>
52
53 #include <linux/errno.h>
54 #include <linux/time.h>
55 #include <linux/proc_fs.h>
56 #include <linux/stat.h>
57 #include <linux/task_io_accounting_ops.h>
58 #include <linux/init.h>
59 #include <linux/capability.h>
60 #include <linux/file.h>
61 #include <linux/fdtable.h>
62 #include <linux/generic-radix-tree.h>
63 #include <linux/string.h>
64 #include <linux/seq_file.h>
65 #include <linux/namei.h>
66 #include <linux/mnt_namespace.h>
67 #include <linux/mm.h>
68 #include <linux/swap.h>
69 #include <linux/rcupdate.h>
70 #include <linux/kallsyms.h>
71 #include <linux/stacktrace.h>
72 #include <linux/resource.h>
73 #include <linux/module.h>
74 #include <linux/mount.h>
75 #include <linux/security.h>
76 #include <linux/ptrace.h>
77 #include <linux/tracehook.h>
78 #include <linux/printk.h>
79 #include <linux/cache.h>
80 #include <linux/cgroup.h>
81 #include <linux/cpuset.h>
82 #include <linux/audit.h>
83 #include <linux/poll.h>
84 #include <linux/nsproxy.h>
85 #include <linux/oom.h>
86 #include <linux/elf.h>
87 #include <linux/pid_namespace.h>
88 #include <linux/user_namespace.h>
89 #include <linux/fs_struct.h>
90 #include <linux/slab.h>
91 #include <linux/sched/autogroup.h>
92 #include <linux/sched/mm.h>
93 #include <linux/sched/coredump.h>
94 #include <linux/sched/debug.h>
95 #include <linux/sched/stat.h>
96 #include <linux/posix-timers.h>
97 #include <linux/time_namespace.h>
98 #include <linux/resctrl.h>
99 #include <trace/events/oom.h>
100 #include "internal.h"
101 #include "fd.h"
102
103 #include "../../lib/kstrtox.h"
104
105 /* NOTE:
106 * Implementing inode permission operations in /proc is almost
107 * certainly an error. Permission checks need to happen during
108 * each system call, not at open time. The reason is that most of
109 * what we wish to check for permissions in /proc varies at runtime.
110 *
111 * The classic example of a problem is opening file descriptors
112 * in /proc for a task before it execs a suid executable.
113 */
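/*
 * For illustration only (not part of this file): a minimal userspace
 * sketch of the open-before-exec problem described above. If /proc
 * checked permissions only at open time, the descriptor obtained here
 * would keep working after an execve() of a setuid binary:
 *
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/proc/self/mem", O_RDWR);
 *
 *		// After the exec, the same fd refers to the memory of a
 *		// privileged image, so the check must be repeated on
 *		// every read/write rather than done once at open time.
 *		execl("/usr/bin/some-setuid-tool", "some-setuid-tool",
 *		      (char *)NULL);
 *		return 1;	// only reached if execl() fails
 *	}
 *
 * "/usr/bin/some-setuid-tool" is a hypothetical path, used only to make
 * the scenario concrete.
 */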
114
115 static u8 nlink_tid __ro_after_init;
116 static u8 nlink_tgid __ro_after_init;
117
118 struct pid_entry {
119 const char *name;
120 unsigned int len;
121 umode_t mode;
122 const struct inode_operations *iop;
123 const struct file_operations *fop;
124 union proc_op op;
125 };
126
127 #define NOD(NAME, MODE, IOP, FOP, OP) { \
128 .name = (NAME), \
129 .len = sizeof(NAME) - 1, \
130 .mode = MODE, \
131 .iop = IOP, \
132 .fop = FOP, \
133 .op = OP, \
134 }
135
136 #define DIR(NAME, MODE, iops, fops) \
137 NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
138 #define LNK(NAME, get_link) \
139 NOD(NAME, (S_IFLNK|S_IRWXUGO), \
140 &proc_pid_link_inode_operations, NULL, \
141 { .proc_get_link = get_link } )
142 #define REG(NAME, MODE, fops) \
143 NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
144 #define ONE(NAME, MODE, show) \
145 NOD(NAME, (S_IFREG|(MODE)), \
146 NULL, &proc_single_file_operations, \
147 { .proc_show = show } )
148 #define ATTR(LSM, NAME, MODE) \
149 NOD(NAME, (S_IFREG|(MODE)), \
150 NULL, &proc_pid_attr_operations, \
151 { .lsm = LSM })
152
153 /*
154 * Count the number of hardlinks for the pid_entry table, excluding the .
155 * and .. links.
156 */
157 static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
158 unsigned int n)
159 {
160 unsigned int i;
161 unsigned int count;
162
163 count = 2;
164 for (i = 0; i < n; ++i) {
165 if (S_ISDIR(entries[i].mode))
166 ++count;
167 }
168
169 return count;
170 }
171
172 static int get_task_root(struct task_struct *task, struct path *root)
173 {
174 int result = -ENOENT;
175
176 task_lock(task);
177 if (task->fs) {
178 get_fs_root(task->fs, root);
179 result = 0;
180 }
181 task_unlock(task);
182 return result;
183 }
184
185 static int proc_cwd_link(struct dentry *dentry, struct path *path)
186 {
187 struct task_struct *task = get_proc_task(d_inode(dentry));
188 int result = -ENOENT;
189
190 if (task) {
191 task_lock(task);
192 if (task->fs) {
193 get_fs_pwd(task->fs, path);
194 result = 0;
195 }
196 task_unlock(task);
197 put_task_struct(task);
198 }
199 return result;
200 }
201
202 static int proc_root_link(struct dentry *dentry, struct path *path)
203 {
204 struct task_struct *task = get_proc_task(d_inode(dentry));
205 int result = -ENOENT;
206
207 if (task) {
208 result = get_task_root(task, path);
209 put_task_struct(task);
210 }
211 return result;
212 }
213
214 /*
215 * If the user used setproctitle(), we just get the string from
216 * user space at arg_start, and limit it to a maximum of one page.
217 */
218 static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
219 size_t count, unsigned long pos,
220 unsigned long arg_start)
221 {
222 char *page;
223 int ret, got;
224
225 if (pos >= PAGE_SIZE)
226 return 0;
227
228 page = (char *)__get_free_page(GFP_KERNEL);
229 if (!page)
230 return -ENOMEM;
231
232 ret = 0;
233 got = access_remote_vm(mm, arg_start, page, PAGE_SIZE, FOLL_ANON);
234 if (got > 0) {
235 int len = strnlen(page, got);
236
237 /* Include the NUL character if it was found */
238 if (len < got)
239 len++;
240
241 if (len > pos) {
242 len -= pos;
243 if (len > count)
244 len = count;
245 len -= copy_to_user(buf, page+pos, len);
246 if (!len)
247 len = -EFAULT;
248 ret = len;
249 }
250 }
251 free_page((unsigned long)page);
252 return ret;
253 }
254
255 static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
256 size_t count, loff_t *ppos)
257 {
258 unsigned long arg_start, arg_end, env_start, env_end;
259 unsigned long pos, len;
260 char *page, c;
261
262 /* Check if the process spawned far enough to have a cmdline. */
263 if (!mm->env_end)
264 return 0;
265
266 spin_lock(&mm->arg_lock);
267 arg_start = mm->arg_start;
268 arg_end = mm->arg_end;
269 env_start = mm->env_start;
270 env_end = mm->env_end;
271 spin_unlock(&mm->arg_lock);
272
273 if (arg_start >= arg_end)
274 return 0;
275
276 /*
277 * We allow setproctitle() to overwrite the argument
278 * strings, and overflow past the original end. But
279 * only when it overflows into the environment area.
280 */
281 if (env_start != arg_end || env_end < env_start)
282 env_start = env_end = arg_end;
283 len = env_end - arg_start;
284
285 /* We're not going to care if "*ppos" has high bits set */
286 pos = *ppos;
287 if (pos >= len)
288 return 0;
289 if (count > len - pos)
290 count = len - pos;
291 if (!count)
292 return 0;
293
294 /*
295 * Magical special case: if the argv[] end byte is not
296 * zero, the user has overwritten it with setproctitle(3).
297 *
298 * Possible future enhancement: do this only once when
299 * pos is 0, and set a flag in the 'struct file'.
300 */
301 if (access_remote_vm(mm, arg_end-1, &c, 1, FOLL_ANON) == 1 && c)
302 return get_mm_proctitle(mm, buf, count, pos, arg_start);
303
304 /*
305 * For the non-setproctitle() case we limit things strictly
306 * to the [arg_start, arg_end[ range.
307 */
308 pos += arg_start;
309 if (pos < arg_start || pos >= arg_end)
310 return 0;
311 if (count > arg_end - pos)
312 count = arg_end - pos;
313
314 page = (char *)__get_free_page(GFP_KERNEL);
315 if (!page)
316 return -ENOMEM;
317
318 len = 0;
319 while (count) {
320 int got;
321 size_t size = min_t(size_t, PAGE_SIZE, count);
322
323 got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
324 if (got <= 0)
325 break;
326 got -= copy_to_user(buf, page, got);
327 if (unlikely(!got)) {
328 if (!len)
329 len = -EFAULT;
330 break;
331 }
332 pos += got;
333 buf += got;
334 len += got;
335 count -= got;
336 }
337
338 free_page((unsigned long)page);
339 return len;
340 }
341
342 static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf,
343 size_t count, loff_t *pos)
344 {
345 struct mm_struct *mm;
346 ssize_t ret;
347
348 mm = get_task_mm(tsk);
349 if (!mm)
350 return 0;
351
352 ret = get_mm_cmdline(mm, buf, count, pos);
353 mmput(mm);
354 return ret;
355 }
356
357 static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
358 size_t count, loff_t *pos)
359 {
360 struct task_struct *tsk;
361 ssize_t ret;
362
363 BUG_ON(*pos < 0);
364
365 tsk = get_proc_task(file_inode(file));
366 if (!tsk)
367 return -ESRCH;
368 ret = get_task_cmdline(tsk, buf, count, pos);
369 put_task_struct(tsk);
370 if (ret > 0)
371 *pos += ret;
372 return ret;
373 }
374
375 static const struct file_operations proc_pid_cmdline_ops = {
376 .read = proc_pid_cmdline_read,
377 .llseek = generic_file_llseek,
378 };
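/*
 * For illustration (userspace, not kernel code): the bytes returned
 * above are the raw argv area, i.e. a sequence of '\0'-terminated
 * strings, so readers must split on NUL rather than on whitespace. A
 * minimal sketch:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char buf[4096];
 *		FILE *f = fopen("/proc/self/cmdline", "r");
 *		size_t i, n;
 *
 *		if (!f)
 *			return 1;
 *		n = fread(buf, 1, sizeof(buf), f);
 *		for (i = 0; i < n; i++)		// print argv[] space-separated
 *			putchar(buf[i] ? buf[i] : ' ');
 *		putchar('\n');
 *		fclose(f);
 *		return 0;
 *	}
 */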
379
380 #ifdef CONFIG_KALLSYMS
381 /*
382 * Provides a wchan file via kallsyms in a proper one-value-per-file format.
383 * Prints the resolved symbol; if resolution fails, simply prints 0.
384 */
385 static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
386 struct pid *pid, struct task_struct *task)
387 {
388 unsigned long wchan;
389 char symname[KSYM_NAME_LEN];
390
391 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
392 goto print0;
393
394 wchan = get_wchan(task);
395 if (wchan && !lookup_symbol_name(wchan, symname)) {
396 seq_puts(m, symname);
397 return 0;
398 }
399
400 print0:
401 seq_putc(m, '0');
402 return 0;
403 }
404 #endif /* CONFIG_KALLSYMS */
405
406 static int lock_trace(struct task_struct *task)
407 {
408 int err = down_read_killable(&task->signal->exec_update_lock);
409 if (err)
410 return err;
411 if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
412 up_read(&task->signal->exec_update_lock);
413 return -EPERM;
414 }
415 return 0;
416 }
417
418 static void unlock_trace(struct task_struct *task)
419 {
420 up_read(&task->signal->exec_update_lock);
421 }
422
423 #ifdef CONFIG_STACKTRACE
424
425 #define MAX_STACK_TRACE_DEPTH 64
426
427 static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
428 struct pid *pid, struct task_struct *task)
429 {
430 unsigned long *entries;
431 int err;
432
433 /*
434 * The ability to racily run the kernel stack unwinder on a running task
435 * and then observe the unwinder output is scary; while it is useful for
436 * debugging kernel issues, it can also allow an attacker to leak kernel
437 * stack contents.
438 * Doing this in a manner that is at least safe from races would require
440 * some work to ensure that the remote task cannot be scheduled; and
440 * even then, this would still expose the unwinder as local attack
441 * surface.
442 * Therefore, this interface is restricted to root.
443 */
444 if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
445 return -EACCES;
446
447 entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
448 GFP_KERNEL);
449 if (!entries)
450 return -ENOMEM;
451
452 err = lock_trace(task);
453 if (!err) {
454 unsigned int i, nr_entries;
455
456 nr_entries = stack_trace_save_tsk(task, entries,
457 MAX_STACK_TRACE_DEPTH, 0);
458
459 for (i = 0; i < nr_entries; i++) {
460 seq_printf(m, "[<0>] %pB\n", (void *)entries[i]);
461 }
462
463 unlock_trace(task);
464 }
465 kfree(entries);
466
467 return err;
468 }
469 #endif
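/*
 * Note on the output format: each line above has the fixed shape
 * "[<0>] <symbol>", where %pB expands to something like
 * "do_wait+0x18c/0x230" (offsets illustrative). The "[<0>]" prefix
 * deliberately replaces the raw stack slot address, and, per the
 * CAP_SYS_ADMIN check above, only root may read /proc/PID/stack at all.
 */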
470
471 #ifdef CONFIG_SCHED_INFO
472 /*
473 * Provides /proc/PID/schedstat
474 */
475 static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
476 struct pid *pid, struct task_struct *task)
477 {
478 if (unlikely(!sched_info_on()))
479 seq_puts(m, "0 0 0\n");
480 else
481 seq_printf(m, "%llu %llu %lu\n",
482 (unsigned long long)task->se.sum_exec_runtime,
483 (unsigned long long)task->sched_info.run_delay,
484 task->sched_info.pcount);
485
486 return 0;
487 }
488 #endif
489
490 #ifdef CONFIG_LATENCYTOP
491 static int lstats_show_proc(struct seq_file *m, void *v)
492 {
493 int i;
494 struct inode *inode = m->private;
495 struct task_struct *task = get_proc_task(inode);
496
497 if (!task)
498 return -ESRCH;
499 seq_puts(m, "Latency Top version : v0.1\n");
500 for (i = 0; i < LT_SAVECOUNT; i++) {
501 struct latency_record *lr = &task->latency_record[i];
502 if (lr->backtrace[0]) {
503 int q;
504 seq_printf(m, "%i %li %li",
505 lr->count, lr->time, lr->max);
506 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
507 unsigned long bt = lr->backtrace[q];
508
509 if (!bt)
510 break;
511 seq_printf(m, " %ps", (void *)bt);
512 }
513 seq_putc(m, '\n');
514 }
515
516 }
517 put_task_struct(task);
518 return 0;
519 }
520
521 static int lstats_open(struct inode *inode, struct file *file)
522 {
523 return single_open(file, lstats_show_proc, inode);
524 }
525
526 static ssize_t lstats_write(struct file *file, const char __user *buf,
527 size_t count, loff_t *offs)
528 {
529 struct task_struct *task = get_proc_task(file_inode(file));
530
531 if (!task)
532 return -ESRCH;
533 clear_tsk_latency_tracing(task);
534 put_task_struct(task);
535
536 return count;
537 }
538
539 static const struct file_operations proc_lstats_operations = {
540 .open = lstats_open,
541 .read = seq_read,
542 .write = lstats_write,
543 .llseek = seq_lseek,
544 .release = single_release,
545 };
546
547 #endif
548
549 static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
550 struct pid *pid, struct task_struct *task)
551 {
552 unsigned long totalpages = totalram_pages() + total_swap_pages;
553 unsigned long points = 0;
554 long badness;
555
556 badness = oom_badness(task, totalpages);
557 /*
558 * Special-case OOM_SCORE_ADJ_MIN; for all others, scale the
559 * badness value into the [0, 2000] range, which we have been
560 * exporting for a long time, so userspace might depend on it.
561 */
562 if (badness != LONG_MIN)
563 points = (1000 + badness * 1000 / (long)totalpages) * 2 / 3;
564
565 seq_printf(m, "%lu\n", points);
566
567 return 0;
568 }
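/*
 * Worked example of the scaling above (values illustrative): with
 * badness == totalpages / 2 (the task accounts for half of RAM plus
 * swap, oom_score_adj == 0), points = (1000 + 500) * 2 / 3 = 1000.
 * The extremes map accordingly: badness == -totalpages yields 0 and
 * badness == 2 * totalpages (everything, plus oom_score_adj = 1000)
 * yields 2000, preserving the historical [0, 2000] range.
 */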
569
570 struct limit_names {
571 const char *name;
572 const char *unit;
573 };
574
575 static const struct limit_names lnames[RLIM_NLIMITS] = {
576 [RLIMIT_CPU] = {"Max cpu time", "seconds"},
577 [RLIMIT_FSIZE] = {"Max file size", "bytes"},
578 [RLIMIT_DATA] = {"Max data size", "bytes"},
579 [RLIMIT_STACK] = {"Max stack size", "bytes"},
580 [RLIMIT_CORE] = {"Max core file size", "bytes"},
581 [RLIMIT_RSS] = {"Max resident set", "bytes"},
582 [RLIMIT_NPROC] = {"Max processes", "processes"},
583 [RLIMIT_NOFILE] = {"Max open files", "files"},
584 [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
585 [RLIMIT_AS] = {"Max address space", "bytes"},
586 [RLIMIT_LOCKS] = {"Max file locks", "locks"},
587 [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
588 [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
589 [RLIMIT_NICE] = {"Max nice priority", NULL},
590 [RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
591 [RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
592 };
593
594 /* Display limits for a process */
595 static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
596 struct pid *pid, struct task_struct *task)
597 {
598 unsigned int i;
599 unsigned long flags;
600
601 struct rlimit rlim[RLIM_NLIMITS];
602
603 if (!lock_task_sighand(task, &flags))
604 return 0;
605 memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
606 unlock_task_sighand(task, &flags);
607
608 /*
609 * print the file header
610 */
611 seq_puts(m, "Limit "
612 "Soft Limit "
613 "Hard Limit "
614 "Units \n");
615
616 for (i = 0; i < RLIM_NLIMITS; i++) {
617 if (rlim[i].rlim_cur == RLIM_INFINITY)
618 seq_printf(m, "%-25s %-20s ",
619 lnames[i].name, "unlimited");
620 else
621 seq_printf(m, "%-25s %-20lu ",
622 lnames[i].name, rlim[i].rlim_cur);
623
624 if (rlim[i].rlim_max == RLIM_INFINITY)
625 seq_printf(m, "%-20s ", "unlimited");
626 else
627 seq_printf(m, "%-20lu ", rlim[i].rlim_max);
628
629 if (lnames[i].unit)
630 seq_printf(m, "%-10s\n", lnames[i].unit);
631 else
632 seq_putc(m, '\n');
633 }
634
635 return 0;
636 }
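/*
 * The resulting layout, with illustrative values:
 *
 *	Limit                     Soft Limit           Hard Limit           Units
 *	Max cpu time              unlimited            unlimited            seconds
 *	Max open files            1024                 1048576              files
 */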
637
638 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
639 static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
640 struct pid *pid, struct task_struct *task)
641 {
642 struct syscall_info info;
643 u64 *args = &info.data.args[0];
644 int res;
645
646 res = lock_trace(task);
647 if (res)
648 return res;
649
650 if (task_current_syscall(task, &info))
651 seq_puts(m, "running\n");
652 else if (info.data.nr < 0)
653 seq_printf(m, "%d 0x%llx 0x%llx\n",
654 info.data.nr, info.sp, info.data.instruction_pointer);
655 else
656 seq_printf(m,
657 "%d 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx 0x%llx\n",
658 info.data.nr,
659 args[0], args[1], args[2], args[3], args[4], args[5],
660 info.sp, info.data.instruction_pointer);
661 unlock_trace(task);
662
663 return 0;
664 }
665 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
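/*
 * So a read of /proc/PID/syscall yields one of three forms: the literal
 * "running" when the task is currently on a CPU, "<nr> <sp> <ip>" with
 * nr < 0 when the task is blocked in the kernel but not in a syscall,
 * or "<nr> <arg0>..<arg5> <sp> <ip>" for a task blocked in syscall nr,
 * with all pointer values in 0x-prefixed hex per the formats above.
 */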
666
667 /************************************************************************/
668 /* Here the fs part begins */
669 /************************************************************************/
670
671 /* permission checks */
672 static int proc_fd_access_allowed(struct inode *inode)
673 {
674 struct task_struct *task;
675 int allowed = 0;
676 /* Allow access to a task's file descriptors if it is us, or if we
677 * may use ptrace to attach to the process and find out that
678 * information.
679 */
680 task = get_proc_task(inode);
681 if (task) {
682 allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
683 put_task_struct(task);
684 }
685 return allowed;
686 }
687
688 int proc_setattr(struct dentry *dentry, struct iattr *attr)
689 {
690 int error;
691 struct inode *inode = d_inode(dentry);
692
693 if (attr->ia_valid & ATTR_MODE)
694 return -EPERM;
695
696 error = setattr_prepare(dentry, attr);
697 if (error)
698 return error;
699
700 setattr_copy(inode, attr);
701 mark_inode_dirty(inode);
702 return 0;
703 }
704
705 /*
706 * May current process learn task's sched/cmdline info (for hide_pid_min=1)
707 * or euid/egid (for hide_pid_min=2)?
708 */
709 static bool has_pid_permissions(struct proc_fs_info *fs_info,
710 struct task_struct *task,
711 enum proc_hidepid hide_pid_min)
712 {
713 /*
714 * If the 'hidepid' mount option is set, force a ptrace check;
715 * we indicate that we are using a filesystem syscall
716 * by passing PTRACE_MODE_READ_FSCREDS.
717 */
718 if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE)
719 return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
720
721 if (fs_info->hide_pid < hide_pid_min)
722 return true;
723 if (in_group_p(fs_info->pid_gid))
724 return true;
725 return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS);
726 }
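/*
 * These checks implement the hidepid= mount option; e.g. (illustrative)
 *
 *	mount -o remount,hidepid=invisible,gid=adm /proc
 *
 * leaves other users' /proc/PID directories visible only to members of
 * group "adm" and to callers who could ptrace the task anyway.
 */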
727
728
729 static int proc_pid_permission(struct inode *inode, int mask)
730 {
731 struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
732 struct task_struct *task;
733 bool has_perms;
734
735 task = get_proc_task(inode);
736 if (!task)
737 return -ESRCH;
738 has_perms = has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS);
739 put_task_struct(task);
740
741 if (!has_perms) {
742 if (fs_info->hide_pid == HIDEPID_INVISIBLE) {
743 /*
744 * Let's make getdents(), stat(), and open()
745 * consistent with each other. If a process
746 * may not stat() a file, it shouldn't be seen
747 * in procfs at all.
748 */
749 return -ENOENT;
750 }
751
752 return -EPERM;
753 }
754 return generic_permission(inode, mask);
755 }
756
757
758
759 static const struct inode_operations proc_def_inode_operations = {
760 .setattr = proc_setattr,
761 };
762
763 static int proc_single_show(struct seq_file *m, void *v)
764 {
765 struct inode *inode = m->private;
766 struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
767 struct pid *pid = proc_pid(inode);
768 struct task_struct *task;
769 int ret;
770
771 task = get_pid_task(pid, PIDTYPE_PID);
772 if (!task)
773 return -ESRCH;
774
775 ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
776
777 put_task_struct(task);
778 return ret;
779 }
780
781 static int proc_single_open(struct inode *inode, struct file *filp)
782 {
783 return single_open(filp, proc_single_show, inode);
784 }
785
786 static const struct file_operations proc_single_file_operations = {
787 .open = proc_single_open,
788 .read = seq_read,
789 .llseek = seq_lseek,
790 .release = single_release,
791 };
792
793
794 struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
795 {
796 struct task_struct *task = get_proc_task(inode);
797 struct mm_struct *mm = ERR_PTR(-ESRCH);
798
799 if (task) {
800 mm = mm_access(task, mode | PTRACE_MODE_FSCREDS);
801 put_task_struct(task);
802
803 if (!IS_ERR_OR_NULL(mm)) {
804 /* ensure this mm_struct can't be freed */
805 mmgrab(mm);
806 /* but do not pin its memory */
807 mmput(mm);
808 }
809 }
810
811 return mm;
812 }
813
814 static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
815 {
816 struct mm_struct *mm = proc_mem_open(inode, mode);
817
818 if (IS_ERR(mm))
819 return PTR_ERR(mm);
820
821 file->private_data = mm;
822 return 0;
823 }
824
825 static int mem_open(struct inode *inode, struct file *file)
826 {
827 int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
828
829 /* OK to pass negative loff_t, we can catch out-of-range */
830 file->f_mode |= FMODE_UNSIGNED_OFFSET;
831
832 return ret;
833 }
834
835 static ssize_t mem_rw(struct file *file, char __user *buf,
836 size_t count, loff_t *ppos, int write)
837 {
838 struct mm_struct *mm = file->private_data;
839 unsigned long addr = *ppos;
840 ssize_t copied;
841 char *page;
842 unsigned int flags;
843
844 if (!mm)
845 return 0;
846
847 page = (char *)__get_free_page(GFP_KERNEL);
848 if (!page)
849 return -ENOMEM;
850
851 copied = 0;
852 if (!mmget_not_zero(mm))
853 goto free;
854
855 flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
856
857 while (count > 0) {
858 size_t this_len = min_t(size_t, count, PAGE_SIZE);
859
860 if (write && copy_from_user(page, buf, this_len)) {
861 copied = -EFAULT;
862 break;
863 }
864
865 this_len = access_remote_vm(mm, addr, page, this_len, flags);
866 if (!this_len) {
867 if (!copied)
868 copied = -EIO;
869 break;
870 }
871
872 if (!write && copy_to_user(buf, page, this_len)) {
873 copied = -EFAULT;
874 break;
875 }
876
877 buf += this_len;
878 addr += this_len;
879 copied += this_len;
880 count -= this_len;
881 }
882 *ppos = addr;
883
884 mmput(mm);
885 free:
886 free_page((unsigned long) page);
887 return copied;
888 }
889
890 static ssize_t mem_read(struct file *file, char __user *buf,
891 size_t count, loff_t *ppos)
892 {
893 return mem_rw(file, buf, count, ppos, 0);
894 }
895
896 static ssize_t mem_write(struct file *file, const char __user *buf,
897 size_t count, loff_t *ppos)
898 {
899 return mem_rw(file, (char __user*)buf, count, ppos, 1);
900 }
901
902 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
903 {
904 switch (orig) {
905 case 0:
906 file->f_pos = offset;
907 break;
908 case 1:
909 file->f_pos += offset;
910 break;
911 default:
912 return -EINVAL;
913 }
914 force_successful_syscall_return();
915 return file->f_pos;
916 }
917
918 static int mem_release(struct inode *inode, struct file *file)
919 {
920 struct mm_struct *mm = file->private_data;
921 if (mm)
922 mmdrop(mm);
923 return 0;
924 }
925
926 static const struct file_operations proc_mem_operations = {
927 .llseek = mem_lseek,
928 .read = mem_read,
929 .write = mem_write,
930 .open = mem_open,
931 .release = mem_release,
932 };
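/*
 * For illustration (userspace, not kernel code): since mem_lseek()
 * honours the full unsigned offset range, a debugger-style reader can
 * simply pread() at a virtual address, provided it could also
 * ptrace-attach (PTRACE_MODE_ATTACH above). A minimal sketch, where
 * "pid" and "addr" are assumed to come from the caller:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	ssize_t peek(pid_t pid, off_t addr, void *out, size_t len)
 *	{
 *		char path[64];
 *		ssize_t n;
 *		int fd;
 *
 *		snprintf(path, sizeof(path), "/proc/%d/mem", (int)pid);
 *		fd = open(path, O_RDONLY);
 *		if (fd < 0)
 *			return -1;
 *		n = pread(fd, out, len, addr);	// offset == virtual address
 *		close(fd);
 *		return n;
 *	}
 */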
933
934 static int environ_open(struct inode *inode, struct file *file)
935 {
936 return __mem_open(inode, file, PTRACE_MODE_READ);
937 }
938
939 static ssize_t environ_read(struct file *file, char __user *buf,
940 size_t count, loff_t *ppos)
941 {
942 char *page;
943 unsigned long src = *ppos;
944 int ret = 0;
945 struct mm_struct *mm = file->private_data;
946 unsigned long env_start, env_end;
947
948 /* Ensure the process spawned far enough to have an environment. */
949 if (!mm || !mm->env_end)
950 return 0;
951
952 page = (char *)__get_free_page(GFP_KERNEL);
953 if (!page)
954 return -ENOMEM;
955
956 ret = 0;
957 if (!mmget_not_zero(mm))
958 goto free;
959
960 spin_lock(&mm->arg_lock);
961 env_start = mm->env_start;
962 env_end = mm->env_end;
963 spin_unlock(&mm->arg_lock);
964
965 while (count > 0) {
966 size_t this_len, max_len;
967 int retval;
968
969 if (src >= (env_end - env_start))
970 break;
971
972 this_len = env_end - (env_start + src);
973
974 max_len = min_t(size_t, PAGE_SIZE, count);
975 this_len = min(max_len, this_len);
976
977 retval = access_remote_vm(mm, (env_start + src), page, this_len, FOLL_ANON);
978
979 if (retval <= 0) {
980 ret = retval;
981 break;
982 }
983
984 if (copy_to_user(buf, page, retval)) {
985 ret = -EFAULT;
986 break;
987 }
988
989 ret += retval;
990 src += retval;
991 buf += retval;
992 count -= retval;
993 }
994 *ppos = src;
995 mmput(mm);
996
997 free:
998 free_page((unsigned long) page);
999 return ret;
1000 }
1001
1002 static const struct file_operations proc_environ_operations = {
1003 .open = environ_open,
1004 .read = environ_read,
1005 .llseek = generic_file_llseek,
1006 .release = mem_release,
1007 };
1008
1009 static int auxv_open(struct inode *inode, struct file *file)
1010 {
1011 return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
1012 }
1013
1014 static ssize_t auxv_read(struct file *file, char __user *buf,
1015 size_t count, loff_t *ppos)
1016 {
1017 struct mm_struct *mm = file->private_data;
1018 unsigned int nwords = 0;
1019
1020 if (!mm)
1021 return 0;
1022 do {
1023 nwords += 2;
1024 } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
1025 return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
1026 nwords * sizeof(mm->saved_auxv[0]));
1027 }
1028
1029 static const struct file_operations proc_auxv_operations = {
1030 .open = auxv_open,
1031 .read = auxv_read,
1032 .llseek = generic_file_llseek,
1033 .release = mem_release,
1034 };
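/*
 * The data returned above is the raw ELF auxiliary vector: pairs of
 * (type, value) words terminated by an AT_NULL entry, exactly as the
 * saved_auxv[] scan assumes. For its own process, userspace would
 * normally call getauxval(3) rather than parse /proc/self/auxv.
 */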
1035
1036 static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
1037 loff_t *ppos)
1038 {
1039 struct task_struct *task = get_proc_task(file_inode(file));
1040 char buffer[PROC_NUMBUF];
1041 int oom_adj = OOM_ADJUST_MIN;
1042 size_t len;
1043
1044 if (!task)
1045 return -ESRCH;
1046 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
1047 oom_adj = OOM_ADJUST_MAX;
1048 else
1049 oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
1050 OOM_SCORE_ADJ_MAX;
1051 put_task_struct(task);
1052 if (oom_adj > OOM_ADJUST_MAX)
1053 oom_adj = OOM_ADJUST_MAX;
1054 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
1055 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1056 }
1057
1058 static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
1059 {
1060 struct mm_struct *mm = NULL;
1061 struct task_struct *task;
1062 int err = 0;
1063
1064 task = get_proc_task(file_inode(file));
1065 if (!task)
1066 return -ESRCH;
1067
1068 mutex_lock(&oom_adj_mutex);
1069 if (legacy) {
1070 if (oom_adj < task->signal->oom_score_adj &&
1071 !capable(CAP_SYS_RESOURCE)) {
1072 err = -EACCES;
1073 goto err_unlock;
1074 }
1075 /*
1076 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
1077 * /proc/pid/oom_score_adj instead.
1078 */
1079 pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
1080 current->comm, task_pid_nr(current), task_pid_nr(task),
1081 task_pid_nr(task));
1082 } else {
1083 if ((short)oom_adj < task->signal->oom_score_adj_min &&
1084 !capable(CAP_SYS_RESOURCE)) {
1085 err = -EACCES;
1086 goto err_unlock;
1087 }
1088 }
1089
1090 /*
1091 * Make sure we will check other processes sharing the mm if this is
1092 * not a vfork child, which wants its own oom_score_adj.
1093 * Pin the mm so it doesn't go away and get reused after task_unlock.
1094 */
1095 if (!task->vfork_done) {
1096 struct task_struct *p = find_lock_task_mm(task);
1097
1098 if (p) {
1099 if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) {
1100 mm = p->mm;
1101 mmgrab(mm);
1102 }
1103 task_unlock(p);
1104 }
1105 }
1106
1107 task->signal->oom_score_adj = oom_adj;
1108 if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
1109 task->signal->oom_score_adj_min = (short)oom_adj;
1110 trace_oom_score_adj_update(task);
1111
1112 if (mm) {
1113 struct task_struct *p;
1114
1115 rcu_read_lock();
1116 for_each_process(p) {
1117 if (same_thread_group(task, p))
1118 continue;
1119
1120 /* do not touch kernel threads or the global init */
1121 if (p->flags & PF_KTHREAD || is_global_init(p))
1122 continue;
1123
1124 task_lock(p);
1125 if (!p->vfork_done && process_shares_mm(p, mm)) {
1126 p->signal->oom_score_adj = oom_adj;
1127 if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
1128 p->signal->oom_score_adj_min = (short)oom_adj;
1129 }
1130 task_unlock(p);
1131 }
1132 rcu_read_unlock();
1133 mmdrop(mm);
1134 }
1135 err_unlock:
1136 mutex_unlock(&oom_adj_mutex);
1137 put_task_struct(task);
1138 return err;
1139 }
1140
1141 /*
1142 * /proc/pid/oom_adj exists solely for backwards compatibility with previous
1143 * kernels. The effective policy is defined by oom_score_adj, which has a
1144 * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
1145 * Values written to oom_adj are simply mapped linearly to oom_score_adj.
1146 * Processes that become oom disabled via oom_adj will still be oom disabled
1147 * with this implementation.
1148 *
1149 * oom_adj cannot be removed since existing userspace binaries use it.
1150 */
1151 static ssize_t oom_adj_write(struct file *file, const char __user *buf,
1152 size_t count, loff_t *ppos)
1153 {
1154 char buffer[PROC_NUMBUF];
1155 int oom_adj;
1156 int err;
1157
1158 memset(buffer, 0, sizeof(buffer));
1159 if (count > sizeof(buffer) - 1)
1160 count = sizeof(buffer) - 1;
1161 if (copy_from_user(buffer, buf, count)) {
1162 err = -EFAULT;
1163 goto out;
1164 }
1165
1166 err = kstrtoint(strstrip(buffer), 0, &oom_adj);
1167 if (err)
1168 goto out;
1169 if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
1170 oom_adj != OOM_DISABLE) {
1171 err = -EINVAL;
1172 goto out;
1173 }
1174
1175 /*
1176 * Scale /proc/pid/oom_score_adj appropriately, ensuring that a maximum
1177 * value is always attainable.
1178 */
1179 if (oom_adj == OOM_ADJUST_MAX)
1180 oom_adj = OOM_SCORE_ADJ_MAX;
1181 else
1182 oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
1183
1184 err = __set_oom_adj(file, oom_adj, true);
1185 out:
1186 return err < 0 ? err : count;
1187 }
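/*
 * Worked examples of the legacy mapping above (OOM_DISABLE == -17,
 * OOM_SCORE_ADJ_MAX == 1000): writing 15 (OOM_ADJUST_MAX) stores 1000,
 * writing -17 (OOM_DISABLE) stores -17 * 1000 / 17 == -1000 and thus
 * still disables the OOM killer, and writing 8 stores 8000 / 17 == 470.
 */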
1188
1189 static const struct file_operations proc_oom_adj_operations = {
1190 .read = oom_adj_read,
1191 .write = oom_adj_write,
1192 .llseek = generic_file_llseek,
1193 };
1194
1195 static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
1196 size_t count, loff_t *ppos)
1197 {
1198 struct task_struct *task = get_proc_task(file_inode(file));
1199 char buffer[PROC_NUMBUF];
1200 short oom_score_adj = OOM_SCORE_ADJ_MIN;
1201 size_t len;
1202
1203 if (!task)
1204 return -ESRCH;
1205 oom_score_adj = task->signal->oom_score_adj;
1206 put_task_struct(task);
1207 len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
1208 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1209 }
1210
1211 static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1212 size_t count, loff_t *ppos)
1213 {
1214 char buffer[PROC_NUMBUF];
1215 int oom_score_adj;
1216 int err;
1217
1218 memset(buffer, 0, sizeof(buffer));
1219 if (count > sizeof(buffer) - 1)
1220 count = sizeof(buffer) - 1;
1221 if (copy_from_user(buffer, buf, count)) {
1222 err = -EFAULT;
1223 goto out;
1224 }
1225
1226 err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
1227 if (err)
1228 goto out;
1229 if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
1230 oom_score_adj > OOM_SCORE_ADJ_MAX) {
1231 err = -EINVAL;
1232 goto out;
1233 }
1234
1235 err = __set_oom_adj(file, oom_score_adj, false);
1236 out:
1237 return err < 0 ? err : count;
1238 }
1239
1240 static const struct file_operations proc_oom_score_adj_operations = {
1241 .read = oom_score_adj_read,
1242 .write = oom_score_adj_write,
1243 .llseek = default_llseek,
1244 };
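/*
 * Typical usage (illustrative), protecting a daemon from the OOM
 * killer:
 *
 *	echo -1000 > /proc/<pid>/oom_score_adj
 *
 * -1000 is OOM_SCORE_ADJ_MIN, and per __set_oom_adj() above the value
 * also propagates to any other processes sharing the same mm.
 */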
1245
1246 #ifdef CONFIG_AUDIT
1247 #define TMPBUFLEN 11
1248 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
1249 size_t count, loff_t *ppos)
1250 {
1251 struct inode * inode = file_inode(file);
1252 struct task_struct *task = get_proc_task(inode);
1253 ssize_t length;
1254 char tmpbuf[TMPBUFLEN];
1255
1256 if (!task)
1257 return -ESRCH;
1258 length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1259 from_kuid(file->f_cred->user_ns,
1260 audit_get_loginuid(task)));
1261 put_task_struct(task);
1262 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1263 }
1264
1265 static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1266 size_t count, loff_t *ppos)
1267 {
1268 struct inode * inode = file_inode(file);
1269 uid_t loginuid;
1270 kuid_t kloginuid;
1271 int rv;
1272
1273 /* Don't let kthreads write their own loginuid */
1274 if (current->flags & PF_KTHREAD)
1275 return -EPERM;
1276
1277 rcu_read_lock();
1278 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
1279 rcu_read_unlock();
1280 return -EPERM;
1281 }
1282 rcu_read_unlock();
1283
1284 if (*ppos != 0) {
1285 /* No partial writes. */
1286 return -EINVAL;
1287 }
1288
1289 rv = kstrtou32_from_user(buf, count, 10, &loginuid);
1290 if (rv < 0)
1291 return rv;
1292
1293 /* Is userspace trying to explicitly UNSET the loginuid? */
1294 if (loginuid == AUDIT_UID_UNSET) {
1295 kloginuid = INVALID_UID;
1296 } else {
1297 kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
1298 if (!uid_valid(kloginuid))
1299 return -EINVAL;
1300 }
1301
1302 rv = audit_set_loginuid(kloginuid);
1303 if (rv < 0)
1304 return rv;
1305 return count;
1306 }
1307
1308 static const struct file_operations proc_loginuid_operations = {
1309 .read = proc_loginuid_read,
1310 .write = proc_loginuid_write,
1311 .llseek = generic_file_llseek,
1312 };
1313
1314 static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
1315 size_t count, loff_t *ppos)
1316 {
1317 struct inode * inode = file_inode(file);
1318 struct task_struct *task = get_proc_task(inode);
1319 ssize_t length;
1320 char tmpbuf[TMPBUFLEN];
1321
1322 if (!task)
1323 return -ESRCH;
1324 length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1325 audit_get_sessionid(task));
1326 put_task_struct(task);
1327 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1328 }
1329
1330 static const struct file_operations proc_sessionid_operations = {
1331 .read = proc_sessionid_read,
1332 .llseek = generic_file_llseek,
1333 };
1334 #endif
1335
1336 #ifdef CONFIG_FAULT_INJECTION
1337 static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
1338 size_t count, loff_t *ppos)
1339 {
1340 struct task_struct *task = get_proc_task(file_inode(file));
1341 char buffer[PROC_NUMBUF];
1342 size_t len;
1343 int make_it_fail;
1344
1345 if (!task)
1346 return -ESRCH;
1347 make_it_fail = task->make_it_fail;
1348 put_task_struct(task);
1349
1350 len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
1351
1352 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1353 }
1354
1355 static ssize_t proc_fault_inject_write(struct file * file,
1356 const char __user * buf, size_t count, loff_t *ppos)
1357 {
1358 struct task_struct *task;
1359 char buffer[PROC_NUMBUF];
1360 int make_it_fail;
1361 int rv;
1362
1363 if (!capable(CAP_SYS_RESOURCE))
1364 return -EPERM;
1365 memset(buffer, 0, sizeof(buffer));
1366 if (count > sizeof(buffer) - 1)
1367 count = sizeof(buffer) - 1;
1368 if (copy_from_user(buffer, buf, count))
1369 return -EFAULT;
1370 rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
1371 if (rv < 0)
1372 return rv;
1373 if (make_it_fail < 0 || make_it_fail > 1)
1374 return -EINVAL;
1375
1376 task = get_proc_task(file_inode(file));
1377 if (!task)
1378 return -ESRCH;
1379 task->make_it_fail = make_it_fail;
1380 put_task_struct(task);
1381
1382 return count;
1383 }
1384
1385 static const struct file_operations proc_fault_inject_operations = {
1386 .read = proc_fault_inject_read,
1387 .write = proc_fault_inject_write,
1388 .llseek = generic_file_llseek,
1389 };
1390
1391 static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
1392 size_t count, loff_t *ppos)
1393 {
1394 struct task_struct *task;
1395 int err;
1396 unsigned int n;
1397
1398 err = kstrtouint_from_user(buf, count, 0, &n);
1399 if (err)
1400 return err;
1401
1402 task = get_proc_task(file_inode(file));
1403 if (!task)
1404 return -ESRCH;
1405 task->fail_nth = n;
1406 put_task_struct(task);
1407
1408 return count;
1409 }
1410
1411 static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
1412 size_t count, loff_t *ppos)
1413 {
1414 struct task_struct *task;
1415 char numbuf[PROC_NUMBUF];
1416 ssize_t len;
1417
1418 task = get_proc_task(file_inode(file));
1419 if (!task)
1420 return -ESRCH;
1421 len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth);
1422 put_task_struct(task);
1423 return simple_read_from_buffer(buf, count, ppos, numbuf, len);
1424 }
1425
1426 static const struct file_operations proc_fail_nth_operations = {
1427 .read = proc_fail_nth_read,
1428 .write = proc_fail_nth_write,
1429 };
1430 #endif
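/*
 * For illustration, a test armed against itself (per the kernel's
 * fault-injection documentation):
 *
 *	echo 1 > /proc/self/task/<tid>/fail_nth
 *
 * makes the next fault-injection-capable call in that task fail; the
 * test then issues the syscall under scrutiny and reads the file back,
 * where a value of 0 indicates the failure really was injected.
 */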
1431
1432
1433 #ifdef CONFIG_SCHED_DEBUG
1434 /*
1435 * Print out various scheduling related per-task fields:
1436 */
1437 static int sched_show(struct seq_file *m, void *v)
1438 {
1439 struct inode *inode = m->private;
1440 struct pid_namespace *ns = proc_pid_ns(inode->i_sb);
1441 struct task_struct *p;
1442
1443 p = get_proc_task(inode);
1444 if (!p)
1445 return -ESRCH;
1446 proc_sched_show_task(p, ns, m);
1447
1448 put_task_struct(p);
1449
1450 return 0;
1451 }
1452
1453 static ssize_t
1454 sched_write(struct file *file, const char __user *buf,
1455 size_t count, loff_t *offset)
1456 {
1457 struct inode *inode = file_inode(file);
1458 struct task_struct *p;
1459
1460 p = get_proc_task(inode);
1461 if (!p)
1462 return -ESRCH;
1463 proc_sched_set_task(p);
1464
1465 put_task_struct(p);
1466
1467 return count;
1468 }
1469
1470 static int sched_open(struct inode *inode, struct file *filp)
1471 {
1472 return single_open(filp, sched_show, inode);
1473 }
1474
1475 static const struct file_operations proc_pid_sched_operations = {
1476 .open = sched_open,
1477 .read = seq_read,
1478 .write = sched_write,
1479 .llseek = seq_lseek,
1480 .release = single_release,
1481 };
1482
1483 #endif
1484
1485 #ifdef CONFIG_SCHED_AUTOGROUP
1486 /*
1487 * Print out autogroup related information:
1488 */
1489 static int sched_autogroup_show(struct seq_file *m, void *v)
1490 {
1491 struct inode *inode = m->private;
1492 struct task_struct *p;
1493
1494 p = get_proc_task(inode);
1495 if (!p)
1496 return -ESRCH;
1497 proc_sched_autogroup_show_task(p, m);
1498
1499 put_task_struct(p);
1500
1501 return 0;
1502 }
1503
1504 static ssize_t
1505 sched_autogroup_write(struct file *file, const char __user *buf,
1506 size_t count, loff_t *offset)
1507 {
1508 struct inode *inode = file_inode(file);
1509 struct task_struct *p;
1510 char buffer[PROC_NUMBUF];
1511 int nice;
1512 int err;
1513
1514 memset(buffer, 0, sizeof(buffer));
1515 if (count > sizeof(buffer) - 1)
1516 count = sizeof(buffer) - 1;
1517 if (copy_from_user(buffer, buf, count))
1518 return -EFAULT;
1519
1520 err = kstrtoint(strstrip(buffer), 0, &nice);
1521 if (err < 0)
1522 return err;
1523
1524 p = get_proc_task(inode);
1525 if (!p)
1526 return -ESRCH;
1527
1528 err = proc_sched_autogroup_set_nice(p, nice);
1529 if (err)
1530 count = err;
1531
1532 put_task_struct(p);
1533
1534 return count;
1535 }
1536
1537 static int sched_autogroup_open(struct inode *inode, struct file *filp)
1538 {
1539 int ret;
1540
1541 ret = single_open(filp, sched_autogroup_show, NULL);
1542 if (!ret) {
1543 struct seq_file *m = filp->private_data;
1544
1545 m->private = inode;
1546 }
1547 return ret;
1548 }
1549
1550 static const struct file_operations proc_pid_sched_autogroup_operations = {
1551 .open = sched_autogroup_open,
1552 .read = seq_read,
1553 .write = sched_autogroup_write,
1554 .llseek = seq_lseek,
1555 .release = single_release,
1556 };
1557
1558 #endif /* CONFIG_SCHED_AUTOGROUP */
1559
1560 #ifdef CONFIG_TIME_NS
1561 static int timens_offsets_show(struct seq_file *m, void *v)
1562 {
1563 struct task_struct *p;
1564
1565 p = get_proc_task(file_inode(m->file));
1566 if (!p)
1567 return -ESRCH;
1568 proc_timens_show_offsets(p, m);
1569
1570 put_task_struct(p);
1571
1572 return 0;
1573 }
1574
1575 static ssize_t timens_offsets_write(struct file *file, const char __user *buf,
1576 size_t count, loff_t *ppos)
1577 {
1578 struct inode *inode = file_inode(file);
1579 struct proc_timens_offset offsets[2];
1580 char *kbuf = NULL, *pos, *next_line;
1581 struct task_struct *p;
1582 int ret, noffsets;
1583
1584 /* Only allow < page size writes at the beginning of the file */
1585 if ((*ppos != 0) || (count >= PAGE_SIZE))
1586 return -EINVAL;
1587
1588 /* Slurp in the user data */
1589 kbuf = memdup_user_nul(buf, count);
1590 if (IS_ERR(kbuf))
1591 return PTR_ERR(kbuf);
1592
1593 /* Parse the user data */
1594 ret = -EINVAL;
1595 noffsets = 0;
1596 for (pos = kbuf; pos; pos = next_line) {
1597 struct proc_timens_offset *off = &offsets[noffsets];
1598 char clock[10];
1599 int err;
1600
1601 /* Find the end of line and ensure we don't look past it */
1602 next_line = strchr(pos, '\n');
1603 if (next_line) {
1604 *next_line = '\0';
1605 next_line++;
1606 if (*next_line == '\0')
1607 next_line = NULL;
1608 }
1609
1610 err = sscanf(pos, "%9s %lld %lu", clock,
1611 &off->val.tv_sec, &off->val.tv_nsec);
1612 if (err != 3 || off->val.tv_nsec >= NSEC_PER_SEC)
1613 goto out;
1614
1615 clock[sizeof(clock) - 1] = 0;
1616 if (strcmp(clock, "monotonic") == 0 ||
1617 strcmp(clock, __stringify(CLOCK_MONOTONIC)) == 0)
1618 off->clockid = CLOCK_MONOTONIC;
1619 else if (strcmp(clock, "boottime") == 0 ||
1620 strcmp(clock, __stringify(CLOCK_BOOTTIME)) == 0)
1621 off->clockid = CLOCK_BOOTTIME;
1622 else
1623 goto out;
1624
1625 noffsets++;
1626 if (noffsets == ARRAY_SIZE(offsets)) {
1627 if (next_line)
1628 count = next_line - kbuf;
1629 break;
1630 }
1631 }
1632
1633 ret = -ESRCH;
1634 p = get_proc_task(inode);
1635 if (!p)
1636 goto out;
1637 ret = proc_timens_set_offset(file, p, offsets, noffsets);
1638 put_task_struct(p);
1639 if (ret)
1640 goto out;
1641
1642 ret = count;
1643 out:
1644 kfree(kbuf);
1645 return ret;
1646 }
1647
1648 static int timens_offsets_open(struct inode *inode, struct file *filp)
1649 {
1650 return single_open(filp, timens_offsets_show, inode);
1651 }
1652
1653 static const struct file_operations proc_timens_offsets_operations = {
1654 .open = timens_offsets_open,
1655 .read = seq_read,
1656 .write = timens_offsets_write,
1657 .llseek = seq_lseek,
1658 .release = single_release,
1659 };
1660 #endif /* CONFIG_TIME_NS */
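/*
 * Per the parser above, each written line is "<clock> <seconds>
 * <nanoseconds>", where <clock> is "monotonic", "boottime", or the
 * numeric clockid. E.g. (illustrative, for a freshly unshared and not
 * yet entered time namespace):
 *
 *	echo "monotonic 86400 0" > /proc/<pid>/timens_offsets
 *
 * shifts CLOCK_MONOTONIC in that namespace forward by one day.
 */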
1661
1662 static ssize_t comm_write(struct file *file, const char __user *buf,
1663 size_t count, loff_t *offset)
1664 {
1665 struct inode *inode = file_inode(file);
1666 struct task_struct *p;
1667 char buffer[TASK_COMM_LEN];
1668 const size_t maxlen = sizeof(buffer) - 1;
1669
1670 memset(buffer, 0, sizeof(buffer));
1671 if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
1672 return -EFAULT;
1673
1674 p = get_proc_task(inode);
1675 if (!p)
1676 return -ESRCH;
1677
1678 if (same_thread_group(current, p))
1679 set_task_comm(p, buffer);
1680 else
1681 count = -EINVAL;
1682
1683 put_task_struct(p);
1684
1685 return count;
1686 }
1687
1688 static int comm_show(struct seq_file *m, void *v)
1689 {
1690 struct inode *inode = m->private;
1691 struct task_struct *p;
1692
1693 p = get_proc_task(inode);
1694 if (!p)
1695 return -ESRCH;
1696
1697 proc_task_name(m, p, false);
1698 seq_putc(m, '\n');
1699
1700 put_task_struct(p);
1701
1702 return 0;
1703 }
1704
1705 static int comm_open(struct inode *inode, struct file *filp)
1706 {
1707 return single_open(filp, comm_show, inode);
1708 }
1709
1710 static const struct file_operations proc_pid_set_comm_operations = {
1711 .open = comm_open,
1712 .read = seq_read,
1713 .write = comm_write,
1714 .llseek = seq_lseek,
1715 .release = single_release,
1716 };
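/*
 * Only writers in the same thread group may rename the task, and names
 * are silently truncated to TASK_COMM_LEN - 1 bytes. For illustration:
 *
 *	echo worker-1 > /proc/self/comm
 *
 * has the same effect as prctl(PR_SET_NAME, "worker-1") issued from the
 * thread group leader.
 */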
1717
1718 static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
1719 {
1720 struct task_struct *task;
1721 struct file *exe_file;
1722
1723 task = get_proc_task(d_inode(dentry));
1724 if (!task)
1725 return -ENOENT;
1726 exe_file = get_task_exe_file(task);
1727 put_task_struct(task);
1728 if (exe_file) {
1729 *exe_path = exe_file->f_path;
1730 path_get(&exe_file->f_path);
1731 fput(exe_file);
1732 return 0;
1733 } else
1734 return -ENOENT;
1735 }
1736
1737 static const char *proc_pid_get_link(struct dentry *dentry,
1738 struct inode *inode,
1739 struct delayed_call *done)
1740 {
1741 struct path path;
1742 int error = -EACCES;
1743
1744 if (!dentry)
1745 return ERR_PTR(-ECHILD);
1746
1747 /* Are we allowed to snoop on the task's file descriptors? */
1748 if (!proc_fd_access_allowed(inode))
1749 goto out;
1750
1751 error = PROC_I(inode)->op.proc_get_link(dentry, &path);
1752 if (error)
1753 goto out;
1754
1755 error = nd_jump_link(&path);
1756 out:
1757 return ERR_PTR(error);
1758 }
1759
1760 static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1761 {
1762 char *tmp = (char *)__get_free_page(GFP_KERNEL);
1763 char *pathname;
1764 int len;
1765
1766 if (!tmp)
1767 return -ENOMEM;
1768
1769 pathname = d_path(path, tmp, PAGE_SIZE);
1770 len = PTR_ERR(pathname);
1771 if (IS_ERR(pathname))
1772 goto out;
1773 len = tmp + PAGE_SIZE - 1 - pathname;
1774
1775 if (len > buflen)
1776 len = buflen;
1777 if (copy_to_user(buffer, pathname, len))
1778 len = -EFAULT;
1779 out:
1780 free_page((unsigned long)tmp);
1781 return len;
1782 }
1783
1784 static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
1785 {
1786 int error = -EACCES;
1787 struct inode *inode = d_inode(dentry);
1788 struct path path;
1789
1790 /* Are we allowed to snoop on the task's file descriptors? */
1791 if (!proc_fd_access_allowed(inode))
1792 goto out;
1793
1794 error = PROC_I(inode)->op.proc_get_link(dentry, &path);
1795 if (error)
1796 goto out;
1797
1798 error = do_proc_readlink(&path, buffer, buflen);
1799 path_put(&path);
1800 out:
1801 return error;
1802 }
1803
1804 const struct inode_operations proc_pid_link_inode_operations = {
1805 .readlink = proc_pid_readlink,
1806 .get_link = proc_pid_get_link,
1807 .setattr = proc_setattr,
1808 };
1809
1810
1811 /* building an inode */
1812
1813 void task_dump_owner(struct task_struct *task, umode_t mode,
1814 kuid_t *ruid, kgid_t *rgid)
1815 {
1816 /* Depending on the state of dumpable, compute who should own a
1817 * proc file for a task.
1818 */
1819 const struct cred *cred;
1820 kuid_t uid;
1821 kgid_t gid;
1822
1823 if (unlikely(task->flags & PF_KTHREAD)) {
1824 *ruid = GLOBAL_ROOT_UID;
1825 *rgid = GLOBAL_ROOT_GID;
1826 return;
1827 }
1828
1829 /* Default to the task's effective ownership */
1830 rcu_read_lock();
1831 cred = __task_cred(task);
1832 uid = cred->euid;
1833 gid = cred->egid;
1834 rcu_read_unlock();
1835
1836 /*
1837 * Before the /proc/pid/status file was created the only way to read
1838 * the effective uid of a process was to stat /proc/pid. Reading
1839 * /proc/pid/status is slow enough that procps and other packages
1840 * kept stating /proc/pid. To keep the rules in /proc simple I have
1841 * made this apply to all per process world readable and executable
1842 * directories.
1843 */
1844 if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
1845 struct mm_struct *mm;
1846 task_lock(task);
1847 mm = task->mm;
1848 /* Make non-dumpable tasks owned by some root */
1849 if (mm) {
1850 if (get_dumpable(mm) != SUID_DUMP_USER) {
1851 struct user_namespace *user_ns = mm->user_ns;
1852
1853 uid = make_kuid(user_ns, 0);
1854 if (!uid_valid(uid))
1855 uid = GLOBAL_ROOT_UID;
1856
1857 gid = make_kgid(user_ns, 0);
1858 if (!gid_valid(gid))
1859 gid = GLOBAL_ROOT_GID;
1860 }
1861 } else {
1862 uid = GLOBAL_ROOT_UID;
1863 gid = GLOBAL_ROOT_GID;
1864 }
1865 task_unlock(task);
1866 }
1867 *ruid = uid;
1868 *rgid = gid;
1869 }
1870
1871 void proc_pid_evict_inode(struct proc_inode *ei)
1872 {
1873 struct pid *pid = ei->pid;
1874
1875 if (S_ISDIR(ei->vfs_inode.i_mode)) {
1876 spin_lock(&pid->lock);
1877 hlist_del_init_rcu(&ei->sibling_inodes);
1878 spin_unlock(&pid->lock);
1879 }
1880
1881 put_pid(pid);
1882 }
1883
1884 struct inode *proc_pid_make_inode(struct super_block * sb,
1885 struct task_struct *task, umode_t mode)
1886 {
1887 struct inode * inode;
1888 struct proc_inode *ei;
1889 struct pid *pid;
1890
1891 /* We need a new inode */
1892
1893 inode = new_inode(sb);
1894 if (!inode)
1895 goto out;
1896
1897 /* Common stuff */
1898 ei = PROC_I(inode);
1899 inode->i_mode = mode;
1900 inode->i_ino = get_next_ino();
1901 inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
1902 inode->i_op = &proc_def_inode_operations;
1903
1904 /*
1905 * Grab a reference to the task's pid.
1906 */
1907 pid = get_task_pid(task, PIDTYPE_PID);
1908 if (!pid)
1909 goto out_unlock;
1910
1911 /* Let the pid remember us for quick removal */
1912 ei->pid = pid;
1913 if (S_ISDIR(mode)) {
1914 spin_lock(&pid->lock);
1915 hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
1916 spin_unlock(&pid->lock);
1917 }
1918
1919 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
1920 security_task_to_inode(task, inode);
1921
1922 out:
1923 return inode;
1924
1925 out_unlock:
1926 iput(inode);
1927 return NULL;
1928 }
1929
1930 int pid_getattr(const struct path *path, struct kstat *stat,
1931 u32 request_mask, unsigned int query_flags)
1932 {
1933 struct inode *inode = d_inode(path->dentry);
1934 struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
1935 struct task_struct *task;
1936
1937 generic_fillattr(inode, stat);
1938
1939 stat->uid = GLOBAL_ROOT_UID;
1940 stat->gid = GLOBAL_ROOT_GID;
1941 rcu_read_lock();
1942 task = pid_task(proc_pid(inode), PIDTYPE_PID);
1943 if (task) {
1944 if (!has_pid_permissions(fs_info, task, HIDEPID_INVISIBLE)) {
1945 rcu_read_unlock();
1946 /*
1947 * This doesn't prevent learning whether the PID exists;
1948 * it only makes getattr() consistent with readdir().
1949 */
1950 return -ENOENT;
1951 }
1952 task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
1953 }
1954 rcu_read_unlock();
1955 return 0;
1956 }
1957
1958 /* dentry stuff */
1959
1960 /*
1961 * Set <pid>/... inode ownership (can change due to setuid(), etc.)
1962 */
1963 void pid_update_inode(struct task_struct *task, struct inode *inode)
1964 {
1965 task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
1966
1967 inode->i_mode &= ~(S_ISUID | S_ISGID);
1968 security_task_to_inode(task, inode);
1969 }
1970
1971 /*
1972 * Rewrite the inode's ownerships here because the owning task may have
1973 * performed a setuid(), etc.
1974 *
1975 */
1976 static int pid_revalidate(struct dentry *dentry, unsigned int flags)
1977 {
1978 struct inode *inode;
1979 struct task_struct *task;
1980
1981 if (flags & LOOKUP_RCU)
1982 return -ECHILD;
1983
1984 inode = d_inode(dentry);
1985 task = get_proc_task(inode);
1986
1987 if (task) {
1988 pid_update_inode(task, inode);
1989 put_task_struct(task);
1990 return 1;
1991 }
1992 return 0;
1993 }
1994
1995 static inline bool proc_inode_is_dead(struct inode *inode)
1996 {
1997 return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
1998 }
1999
2000 int pid_delete_dentry(const struct dentry *dentry)
2001 {
2002 /* Is the task we represent dead?
2003 * If so, then don't put the dentry on the LRU list;
2004 * kill it immediately.
2005 */
2006 return proc_inode_is_dead(d_inode(dentry));
2007 }
2008
2009 const struct dentry_operations pid_dentry_operations =
2010 {
2011 .d_revalidate = pid_revalidate,
2012 .d_delete = pid_delete_dentry,
2013 };
2014
2015 /* Lookups */
2016
2017 /*
2018 * Fill a directory entry.
2019 *
2020 * If possible, create the dcache entry and derive our inode number and
2021 * file type from dcache entry.
2022 *
2023 * Since all of the proc inode numbers are dynamically generated, the inode
2024 * numbers do not exist until the inode is cached. This means creating
2025 * the dcache entry in readdir is necessary to keep the inode numbers
2026 * reported by readdir in sync with the inode numbers reported
2027 * by stat.
2028 */
2029 bool proc_fill_cache(struct file *file, struct dir_context *ctx,
2030 const char *name, unsigned int len,
2031 instantiate_t instantiate, struct task_struct *task, const void *ptr)
2032 {
2033 struct dentry *child, *dir = file->f_path.dentry;
2034 struct qstr qname = QSTR_INIT(name, len);
2035 struct inode *inode;
2036 unsigned type = DT_UNKNOWN;
2037 ino_t ino = 1;
2038
2039 child = d_hash_and_lookup(dir, &qname);
2040 if (!child) {
2041 DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
2042 child = d_alloc_parallel(dir, &qname, &wq);
2043 if (IS_ERR(child))
2044 goto end_instantiate;
2045 if (d_in_lookup(child)) {
2046 struct dentry *res;
2047 res = instantiate(child, task, ptr);
2048 d_lookup_done(child);
2049 if (unlikely(res)) {
2050 dput(child);
2051 child = res;
2052 if (IS_ERR(child))
2053 goto end_instantiate;
2054 }
2055 }
2056 }
2057 inode = d_inode(child);
2058 ino = inode->i_ino;
2059 type = inode->i_mode >> 12;
2060 dput(child);
2061 end_instantiate:
2062 return dir_emit(ctx, name, len, ino, type);
2063 }
2064
2065 /*
2066 * dname_to_vma_addr - maps a dentry name into two unsigned longs
2067 * which represent vma start and end addresses.
2068 */
2069 static int dname_to_vma_addr(struct dentry *dentry,
2070 unsigned long *start, unsigned long *end)
2071 {
2072 const char *str = dentry->d_name.name;
2073 unsigned long long sval, eval;
2074 unsigned int len;
2075
2076 if (str[0] == '0' && str[1] != '-')
2077 return -EINVAL;
2078 len = _parse_integer(str, 16, &sval);
2079 if (len & KSTRTOX_OVERFLOW)
2080 return -EINVAL;
2081 if (sval != (unsigned long)sval)
2082 return -EINVAL;
2083 str += len;
2084
2085 if (*str != '-')
2086 return -EINVAL;
2087 str++;
2088
2089 if (str[0] == '0' && str[1])
2090 return -EINVAL;
2091 len = _parse_integer(str, 16, &eval);
2092 if (len & KSTRTOX_OVERFLOW)
2093 return -EINVAL;
2094 if (eval != (unsigned long)eval)
2095 return -EINVAL;
2096 str += len;
2097
2098 if (*str != '\0')
2099 return -EINVAL;
2100
2101 *start = sval;
2102 *end = eval;
2103
2104 return 0;
2105 }
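/*
 * Entry names under /proc/PID/map_files/ therefore have the exact form
 * "<vm_start>-<vm_end>" in bare lower-case hex with no leading zeroes,
 * matching the address column of /proc/PID/maps; e.g. a lookup of
 * "400000-452000" resolves the VMA spanning exactly those addresses.
 */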
2106
2107 static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
2108 {
2109 unsigned long vm_start, vm_end;
2110 bool exact_vma_exists = false;
2111 struct mm_struct *mm = NULL;
2112 struct task_struct *task;
2113 struct inode *inode;
2114 int status = 0;
2115
2116 if (flags & LOOKUP_RCU)
2117 return -ECHILD;
2118
2119 inode = d_inode(dentry);
2120 task = get_proc_task(inode);
2121 if (!task)
2122 goto out_notask;
2123
2124 mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
2125 if (IS_ERR_OR_NULL(mm))
2126 goto out;
2127
2128 if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
2129 status = mmap_read_lock_killable(mm);
2130 if (!status) {
2131 exact_vma_exists = !!find_exact_vma(mm, vm_start,
2132 vm_end);
2133 mmap_read_unlock(mm);
2134 }
2135 }
2136
2137 mmput(mm);
2138
2139 if (exact_vma_exists) {
2140 task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
2141
2142 security_task_to_inode(task, inode);
2143 status = 1;
2144 }
2145
2146 out:
2147 put_task_struct(task);
2148
2149 out_notask:
2150 return status;
2151 }
2152
2153 static const struct dentry_operations tid_map_files_dentry_operations = {
2154 .d_revalidate = map_files_d_revalidate,
2155 .d_delete = pid_delete_dentry,
2156 };
2157
2158 static int map_files_get_link(struct dentry *dentry, struct path *path)
2159 {
2160 unsigned long vm_start, vm_end;
2161 struct vm_area_struct *vma;
2162 struct task_struct *task;
2163 struct mm_struct *mm;
2164 int rc;
2165
2166 rc = -ENOENT;
2167 task = get_proc_task(d_inode(dentry));
2168 if (!task)
2169 goto out;
2170
2171 mm = get_task_mm(task);
2172 put_task_struct(task);
2173 if (!mm)
2174 goto out;
2175
2176 rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
2177 if (rc)
2178 goto out_mmput;
2179
2180 rc = mmap_read_lock_killable(mm);
2181 if (rc)
2182 goto out_mmput;
2183
2184 rc = -ENOENT;
2185 vma = find_exact_vma(mm, vm_start, vm_end);
2186 if (vma && vma->vm_file) {
2187 *path = vma_pr_or_file(vma)->f_path;
2188 path_get(path);
2189 rc = 0;
2190 }
2191 mmap_read_unlock(mm);
2192
2193 out_mmput:
2194 mmput(mm);
2195 out:
2196 return rc;
2197 }
2198
2199 struct map_files_info {
2200 unsigned long start;
2201 unsigned long end;
2202 fmode_t mode;
2203 };
2204
2205 /*
2206 * Only allow CAP_SYS_ADMIN and CAP_CHECKPOINT_RESTORE to follow the links, due
2207 * to concerns about how the symlinks may be used to bypass permissions on
2208 * ancestor directories in the path to the file in question.
2209 */
2210 static const char *
2211 proc_map_files_get_link(struct dentry *dentry,
2212 struct inode *inode,
2213 struct delayed_call *done)
2214 {
2215 if (!checkpoint_restore_ns_capable(&init_user_ns))
2216 return ERR_PTR(-EPERM);
2217
2218 return proc_pid_get_link(dentry, inode, done);
2219 }
2220
2221 /*
2222 * Identical to proc_pid_link_inode_operations except for get_link()
2223 */
2224 static const struct inode_operations proc_map_files_link_inode_operations = {
2225 .readlink = proc_pid_readlink,
2226 .get_link = proc_map_files_get_link,
2227 .setattr = proc_setattr,
2228 };
2229
2230 static struct dentry *
2231 proc_map_files_instantiate(struct dentry *dentry,
2232 struct task_struct *task, const void *ptr)
2233 {
2234 fmode_t mode = (fmode_t)(unsigned long)ptr;
2235 struct proc_inode *ei;
2236 struct inode *inode;
2237
2238 inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK |
2239 ((mode & FMODE_READ ) ? S_IRUSR : 0) |
2240 ((mode & FMODE_WRITE) ? S_IWUSR : 0));
2241 if (!inode)
2242 return ERR_PTR(-ENOENT);
2243
2244 ei = PROC_I(inode);
2245 ei->op.proc_get_link = map_files_get_link;
2246
2247 inode->i_op = &proc_map_files_link_inode_operations;
2248 inode->i_size = 64;
2249
2250 d_set_d_op(dentry, &tid_map_files_dentry_operations);
2251 return d_splice_alias(inode, dentry);
2252 }
2253
2254 static struct dentry *proc_map_files_lookup(struct inode *dir,
2255 struct dentry *dentry, unsigned int flags)
2256 {
2257 unsigned long vm_start, vm_end;
2258 struct vm_area_struct *vma;
2259 struct task_struct *task;
2260 struct dentry *result;
2261 struct mm_struct *mm;
2262
2263 result = ERR_PTR(-ENOENT);
2264 task = get_proc_task(dir);
2265 if (!task)
2266 goto out;
2267
2268 result = ERR_PTR(-EACCES);
2269 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
2270 goto out_put_task;
2271
2272 result = ERR_PTR(-ENOENT);
2273 if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
2274 goto out_put_task;
2275
2276 mm = get_task_mm(task);
2277 if (!mm)
2278 goto out_put_task;
2279
2280 result = ERR_PTR(-EINTR);
2281 if (mmap_read_lock_killable(mm))
2282 goto out_put_mm;
2283
2284 result = ERR_PTR(-ENOENT);
2285 vma = find_exact_vma(mm, vm_start, vm_end);
2286 if (!vma)
2287 goto out_no_vma;
2288
2289 if (vma->vm_file)
2290 result = proc_map_files_instantiate(dentry, task,
2291 (void *)(unsigned long)vma->vm_file->f_mode);
2292
2293 out_no_vma:
2294 mmap_read_unlock(mm);
2295 out_put_mm:
2296 mmput(mm);
2297 out_put_task:
2298 put_task_struct(task);
2299 out:
2300 return result;
2301 }
2302
2303 static const struct inode_operations proc_map_files_inode_operations = {
2304 .lookup = proc_map_files_lookup,
2305 .permission = proc_fd_permission,
2306 .setattr = proc_setattr,
2307 };
2308
2309 static int
2310 proc_map_files_readdir(struct file *file, struct dir_context *ctx)
2311 {
2312 struct vm_area_struct *vma;
2313 struct task_struct *task;
2314 struct mm_struct *mm;
2315 unsigned long nr_files, pos, i;
2316 GENRADIX(struct map_files_info) fa;
2317 struct map_files_info *p;
2318 int ret;
2319
2320 genradix_init(&fa);
2321
2322 ret = -ENOENT;
2323 task = get_proc_task(file_inode(file));
2324 if (!task)
2325 goto out;
2326
2327 ret = -EACCES;
2328 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
2329 goto out_put_task;
2330
2331 ret = 0;
2332 if (!dir_emit_dots(file, ctx))
2333 goto out_put_task;
2334
2335 mm = get_task_mm(task);
2336 if (!mm)
2337 goto out_put_task;
2338
2339 ret = mmap_read_lock_killable(mm);
2340 if (ret) {
2341 mmput(mm);
2342 goto out_put_task;
2343 }
2344
2345 nr_files = 0;
2346
2347 /*
2348 * We need two passes here:
2349 *
2350 * 1) Collect vmas of mapped files with mmap_lock taken
2351 * 2) Release mmap_lock and instantiate entries
2352 *
2353 * otherwise lockdep complains, since the filldir()
2354 * routine might require mmap_lock to be taken in might_fault().
2355 */
2356
2357 for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
2358 if (!vma->vm_file)
2359 continue;
2360 if (++pos <= ctx->pos)
2361 continue;
2362
2363 p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
2364 if (!p) {
2365 ret = -ENOMEM;
2366 mmap_read_unlock(mm);
2367 mmput(mm);
2368 goto out_put_task;
2369 }
2370
2371 p->start = vma->vm_start;
2372 p->end = vma->vm_end;
2373 p->mode = vma->vm_file->f_mode;
2374 }
2375 mmap_read_unlock(mm);
2376 mmput(mm);
2377
2378 for (i = 0; i < nr_files; i++) {
2379 char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
2380 unsigned int len;
2381
2382 p = genradix_ptr(&fa, i);
2383 len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
2384 if (!proc_fill_cache(file, ctx,
2385 buf, len,
2386 proc_map_files_instantiate,
2387 task,
2388 (void *)(unsigned long)p->mode))
2389 break;
2390 ctx->pos++;
2391 }
2392
2393 out_put_task:
2394 put_task_struct(task);
2395 out:
2396 genradix_free(&fa);
2397 return ret;
2398 }
2399
2400 static const struct file_operations proc_map_files_operations = {
2401 .read = generic_read_dir,
2402 .iterate_shared = proc_map_files_readdir,
2403 .llseek = generic_file_llseek,
2404 };
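
/*
 * Resulting layout (illustrative): each directory entry is a symlink named
 * after the vma's address range and resolving to the mapped file, e.g.
 *
 *	7f60d0b9a000-7f60d0b9c000 -> /usr/lib/x86_64-linux-gnu/ld-2.31.so
 *
 * The link's mode bits mirror the mapping's f_mode (see
 * proc_map_files_instantiate() above), and following it requires
 * CAP_SYS_ADMIN or CAP_CHECKPOINT_RESTORE per proc_map_files_get_link().
 */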
2405
2406 #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
2407 struct timers_private {
2408 struct pid *pid;
2409 struct task_struct *task;
2410 struct sighand_struct *sighand;
2411 struct pid_namespace *ns;
2412 unsigned long flags;
2413 };
2414
2415 static void *timers_start(struct seq_file *m, loff_t *pos)
2416 {
2417 struct timers_private *tp = m->private;
2418
2419 tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
2420 if (!tp->task)
2421 return ERR_PTR(-ESRCH);
2422
2423 tp->sighand = lock_task_sighand(tp->task, &tp->flags);
2424 if (!tp->sighand)
2425 return ERR_PTR(-ESRCH);
2426
2427 return seq_list_start(&tp->task->signal->posix_timers, *pos);
2428 }
2429
2430 static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
2431 {
2432 struct timers_private *tp = m->private;
2433 return seq_list_next(v, &tp->task->signal->posix_timers, pos);
2434 }
2435
2436 static void timers_stop(struct seq_file *m, void *v)
2437 {
2438 struct timers_private *tp = m->private;
2439
2440 if (tp->sighand) {
2441 unlock_task_sighand(tp->task, &tp->flags);
2442 tp->sighand = NULL;
2443 }
2444
2445 if (tp->task) {
2446 put_task_struct(tp->task);
2447 tp->task = NULL;
2448 }
2449 }
2450
2451 static int show_timer(struct seq_file *m, void *v)
2452 {
2453 struct k_itimer *timer;
2454 struct timers_private *tp = m->private;
2455 int notify;
2456 static const char * const nstr[] = {
2457 [SIGEV_SIGNAL] = "signal",
2458 [SIGEV_NONE] = "none",
2459 [SIGEV_THREAD] = "thread",
2460 };
2461
2462 timer = list_entry((struct list_head *)v, struct k_itimer, list);
2463 notify = timer->it_sigev_notify;
2464
2465 seq_printf(m, "ID: %d\n", timer->it_id);
2466 seq_printf(m, "signal: %d/%px\n",
2467 timer->sigq->info.si_signo,
2468 timer->sigq->info.si_value.sival_ptr);
2469 seq_printf(m, "notify: %s/%s.%d\n",
2470 nstr[notify & ~SIGEV_THREAD_ID],
2471 (notify & SIGEV_THREAD_ID) ? "tid" : "pid",
2472 pid_nr_ns(timer->it_pid, tp->ns));
2473 seq_printf(m, "ClockID: %d\n", timer->it_clock);
2474
2475 return 0;
2476 }
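
/*
 * Sample record as produced by the format strings above (values are
 * illustrative; %px deliberately prints the raw sival_ptr):
 *
 *	ID: 1
 *	signal: 14/000055d3a8c02260
 *	notify: signal/pid.1337
 *	ClockID: 0
 */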
2477
2478 static const struct seq_operations proc_timers_seq_ops = {
2479 .start = timers_start,
2480 .next = timers_next,
2481 .stop = timers_stop,
2482 .show = show_timer,
2483 };
2484
2485 static int proc_timers_open(struct inode *inode, struct file *file)
2486 {
2487 struct timers_private *tp;
2488
2489 tp = __seq_open_private(file, &proc_timers_seq_ops,
2490 sizeof(struct timers_private));
2491 if (!tp)
2492 return -ENOMEM;
2493
2494 tp->pid = proc_pid(inode);
2495 tp->ns = proc_pid_ns(inode->i_sb);
2496 return 0;
2497 }
2498
2499 static const struct file_operations proc_timers_operations = {
2500 .open = proc_timers_open,
2501 .read = seq_read,
2502 .llseek = seq_lseek,
2503 .release = seq_release_private,
2504 };
2505 #endif
2506
2507 static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
2508 size_t count, loff_t *offset)
2509 {
2510 struct inode *inode = file_inode(file);
2511 struct task_struct *p;
2512 u64 slack_ns;
2513 int err;
2514
2515 err = kstrtoull_from_user(buf, count, 10, &slack_ns);
2516 if (err < 0)
2517 return err;
2518
2519 p = get_proc_task(inode);
2520 if (!p)
2521 return -ESRCH;
2522
2523 if (p != current) {
2524 rcu_read_lock();
2525 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
2526 rcu_read_unlock();
2527 count = -EPERM;
2528 goto out;
2529 }
2530 rcu_read_unlock();
2531
2532 err = security_task_setscheduler(p);
2533 if (err) {
2534 count = err;
2535 goto out;
2536 }
2537 }
2538
2539 task_lock(p);
2540 if (slack_ns == 0)
2541 p->timer_slack_ns = p->default_timer_slack_ns;
2542 else
2543 p->timer_slack_ns = slack_ns;
2544 task_unlock(p);
2545
2546 out:
2547 put_task_struct(p);
2548
2549 return count;
2550 }
2551
2552 static int timerslack_ns_show(struct seq_file *m, void *v)
2553 {
2554 struct inode *inode = m->private;
2555 struct task_struct *p;
2556 int err = 0;
2557
2558 p = get_proc_task(inode);
2559 if (!p)
2560 return -ESRCH;
2561
2562 if (p != current) {
2563 rcu_read_lock();
2564 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
2565 rcu_read_unlock();
2566 err = -EPERM;
2567 goto out;
2568 }
2569 rcu_read_unlock();
2570
2571 err = security_task_getscheduler(p);
2572 if (err)
2573 goto out;
2574 }
2575
2576 task_lock(p);
2577 seq_printf(m, "%llu\n", p->timer_slack_ns);
2578 task_unlock(p);
2579
2580 out:
2581 put_task_struct(p);
2582
2583 return err;
2584 }
2585
2586 static int timerslack_ns_open(struct inode *inode, struct file *filp)
2587 {
2588 return single_open(filp, timerslack_ns_show, inode);
2589 }
2590
2591 static const struct file_operations proc_pid_set_timerslack_ns_operations = {
2592 .open = timerslack_ns_open,
2593 .read = seq_read,
2594 .write = timerslack_ns_write,
2595 .llseek = seq_lseek,
2596 .release = single_release,
2597 };
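
/*
 * A minimal userspace sketch (an illustration under stated assumptions, not
 * ABI documentation): per timerslack_ns_write() above, writing "0" restores
 * default_timer_slack_ns and any other decimal value is taken as
 * nanoseconds:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/proc/self/timerslack_ns", "w");
 *
 *		if (!f)
 *			return 1;
 *		fprintf(f, "50000");	// request 50us of timer slack
 *		return fclose(f) ? 1 : 0;
 *	}
 */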
2598
2599 static struct dentry *proc_pident_instantiate(struct dentry *dentry,
2600 struct task_struct *task, const void *ptr)
2601 {
2602 const struct pid_entry *p = ptr;
2603 struct inode *inode;
2604 struct proc_inode *ei;
2605
2606 inode = proc_pid_make_inode(dentry->d_sb, task, p->mode);
2607 if (!inode)
2608 return ERR_PTR(-ENOENT);
2609
2610 ei = PROC_I(inode);
2611 if (S_ISDIR(inode->i_mode))
2612 set_nlink(inode, 2); /* Use getattr to fix if necessary */
2613 if (p->iop)
2614 inode->i_op = p->iop;
2615 if (p->fop)
2616 inode->i_fop = p->fop;
2617 ei->op = p->op;
2618 pid_update_inode(task, inode);
2619 d_set_d_op(dentry, &pid_dentry_operations);
2620 return d_splice_alias(inode, dentry);
2621 }
2622
2623 static struct dentry *proc_pident_lookup(struct inode *dir,
2624 struct dentry *dentry,
2625 const struct pid_entry *p,
2626 const struct pid_entry *end)
2627 {
2628 struct task_struct *task = get_proc_task(dir);
2629 struct dentry *res = ERR_PTR(-ENOENT);
2630
2631 if (!task)
2632 goto out_no_task;
2633
2634 /*
2635 * Yes, it does not scale. And it should not. Don't add
2636 * new entries into /proc/<tgid>/ without very good reasons.
2637 */
2638 for (; p < end; p++) {
2639 if (p->len != dentry->d_name.len)
2640 continue;
2641 if (!memcmp(dentry->d_name.name, p->name, p->len)) {
2642 res = proc_pident_instantiate(dentry, task, p);
2643 break;
2644 }
2645 }
2646 put_task_struct(task);
2647 out_no_task:
2648 return res;
2649 }
2650
2651 static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
2652 const struct pid_entry *ents, unsigned int nents)
2653 {
2654 struct task_struct *task = get_proc_task(file_inode(file));
2655 const struct pid_entry *p;
2656
2657 if (!task)
2658 return -ENOENT;
2659
2660 if (!dir_emit_dots(file, ctx))
2661 goto out;
2662
2663 if (ctx->pos >= nents + 2)
2664 goto out;
2665
2666 for (p = ents + (ctx->pos - 2); p < ents + nents; p++) {
2667 if (!proc_fill_cache(file, ctx, p->name, p->len,
2668 proc_pident_instantiate, task, p))
2669 break;
2670 ctx->pos++;
2671 }
2672 out:
2673 put_task_struct(task);
2674 return 0;
2675 }
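
/*
 * Position bookkeeping, spelled out: dir_emit_dots() consumes positions 0
 * and 1 for "." and "..", so table slot i is published at ctx->pos == i + 2.
 * A readdir resuming at ctx->pos == 5, for example, restarts from ents[3]
 * and walks forward until proc_fill_cache() reports a full buffer.
 */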
2676
2677 #ifdef CONFIG_SECURITY
2678 static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
2679 size_t count, loff_t *ppos)
2680 {
2681 struct inode * inode = file_inode(file);
2682 char *p = NULL;
2683 ssize_t length;
2684 struct task_struct *task = get_proc_task(inode);
2685
2686 if (!task)
2687 return -ESRCH;
2688
2689 length = security_getprocattr(task, PROC_I(inode)->op.lsm,
2690 (char*)file->f_path.dentry->d_name.name,
2691 &p);
2692 put_task_struct(task);
2693 if (length > 0)
2694 length = simple_read_from_buffer(buf, count, ppos, p, length);
2695 kfree(p);
2696 return length;
2697 }
2698
2699 static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2700 size_t count, loff_t *ppos)
2701 {
2702 struct inode * inode = file_inode(file);
2703 struct task_struct *task;
2704 void *page;
2705 int rv;
2706
2707 rcu_read_lock();
2708 task = pid_task(proc_pid(inode), PIDTYPE_PID);
2709 if (!task) {
2710 rcu_read_unlock();
2711 return -ESRCH;
2712 }
2713 /* A task may only write its own attributes. */
2714 if (current != task) {
2715 rcu_read_unlock();
2716 return -EACCES;
2717 }
2718 /* Prevent changes to overridden credentials. */
2719 if (current_cred() != current_real_cred()) {
2720 rcu_read_unlock();
2721 return -EBUSY;
2722 }
2723 rcu_read_unlock();
2724
2725 if (count > PAGE_SIZE)
2726 count = PAGE_SIZE;
2727
2728 /* No partial writes. */
2729 if (*ppos != 0)
2730 return -EINVAL;
2731
2732 page = memdup_user(buf, count);
2733 if (IS_ERR(page)) {
2734 rv = PTR_ERR(page);
2735 goto out;
2736 }
2737
2738 /* Guard against adverse ptrace interaction */
2739 rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
2740 if (rv < 0)
2741 goto out_free;
2742
2743 rv = security_setprocattr(PROC_I(inode)->op.lsm,
2744 file->f_path.dentry->d_name.name, page,
2745 count);
2746 mutex_unlock(&current->signal->cred_guard_mutex);
2747 out_free:
2748 kfree(page);
2749 out:
2750 return rv;
2751 }
2752
2753 static const struct file_operations proc_pid_attr_operations = {
2754 .read = proc_pid_attr_read,
2755 .write = proc_pid_attr_write,
2756 .llseek = generic_file_llseek,
2757 };
2758
2759 #define LSM_DIR_OPS(LSM) \
2760 static int proc_##LSM##_attr_dir_iterate(struct file *filp, \
2761 struct dir_context *ctx) \
2762 { \
2763 return proc_pident_readdir(filp, ctx, \
2764 LSM##_attr_dir_stuff, \
2765 ARRAY_SIZE(LSM##_attr_dir_stuff)); \
2766 } \
2767 \
2768 static const struct file_operations proc_##LSM##_attr_dir_ops = { \
2769 .read = generic_read_dir, \
2770 .iterate = proc_##LSM##_attr_dir_iterate, \
2771 .llseek = default_llseek, \
2772 }; \
2773 \
2774 static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \
2775 struct dentry *dentry, unsigned int flags) \
2776 { \
2777 return proc_pident_lookup(dir, dentry, \
2778 LSM##_attr_dir_stuff, \
2779 LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \
2780 } \
2781 \
2782 static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
2783 .lookup = proc_##LSM##_attr_dir_lookup, \
2784 .getattr = pid_getattr, \
2785 .setattr = proc_setattr, \
2786 }
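
/*
 * For example, LSM_DIR_OPS(smack) below expands to
 * proc_smack_attr_dir_iterate(), proc_smack_attr_dir_ops,
 * proc_smack_attr_dir_lookup() and proc_smack_attr_dir_inode_ops, all
 * driven by the smack_attr_dir_stuff[] table.
 */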
2787
2788 #ifdef CONFIG_SECURITY_SMACK
2789 static const struct pid_entry smack_attr_dir_stuff[] = {
2790 ATTR("smack", "current", 0666),
2791 };
2792 LSM_DIR_OPS(smack);
2793 #endif
2794
2795 #ifdef CONFIG_SECURITY_APPARMOR
2796 static const struct pid_entry apparmor_attr_dir_stuff[] = {
2797 ATTR("apparmor", "current", 0666),
2798 ATTR("apparmor", "prev", 0444),
2799 ATTR("apparmor", "exec", 0666),
2800 };
2801 LSM_DIR_OPS(apparmor);
2802 #endif
2803
2804 static const struct pid_entry attr_dir_stuff[] = {
2805 ATTR(NULL, "current", 0666),
2806 ATTR(NULL, "prev", 0444),
2807 ATTR(NULL, "exec", 0666),
2808 ATTR(NULL, "fscreate", 0666),
2809 ATTR(NULL, "keycreate", 0666),
2810 ATTR(NULL, "sockcreate", 0666),
2811 ATTR(NULL, "display", 0666),
2812 ATTR(NULL, "context", 0444),
2813 #ifdef CONFIG_SECURITY_SMACK
2814 DIR("smack", 0555,
2815 proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
2816 #endif
2817 #ifdef CONFIG_SECURITY_APPARMOR
2818 DIR("apparmor", 0555,
2819 proc_apparmor_attr_dir_inode_ops, proc_apparmor_attr_dir_ops),
2820 #endif
2821 };
2822
2823 static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
2824 {
2825 return proc_pident_readdir(file, ctx,
2826 attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
2827 }
2828
2829 static const struct file_operations proc_attr_dir_operations = {
2830 .read = generic_read_dir,
2831 .iterate_shared = proc_attr_dir_readdir,
2832 .llseek = generic_file_llseek,
2833 };
2834
2835 static struct dentry *proc_attr_dir_lookup(struct inode *dir,
2836 struct dentry *dentry, unsigned int flags)
2837 {
2838 return proc_pident_lookup(dir, dentry,
2839 attr_dir_stuff,
2840 attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff));
2841 }
2842
2843 static const struct inode_operations proc_attr_dir_inode_operations = {
2844 .lookup = proc_attr_dir_lookup,
2845 .getattr = pid_getattr,
2846 .setattr = proc_setattr,
2847 };
2848
2849 #endif
2850
2851 #ifdef CONFIG_ELF_CORE
2852 static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
2853 size_t count, loff_t *ppos)
2854 {
2855 struct task_struct *task = get_proc_task(file_inode(file));
2856 struct mm_struct *mm;
2857 char buffer[PROC_NUMBUF];
2858 size_t len;
2859 int ret;
2860
2861 if (!task)
2862 return -ESRCH;
2863
2864 ret = 0;
2865 mm = get_task_mm(task);
2866 if (mm) {
2867 len = snprintf(buffer, sizeof(buffer), "%08lx\n",
2868 ((mm->flags & MMF_DUMP_FILTER_MASK) >>
2869 MMF_DUMP_FILTER_SHIFT));
2870 mmput(mm);
2871 ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
2872 }
2873
2874 put_task_struct(task);
2875
2876 return ret;
2877 }
2878
2879 static ssize_t proc_coredump_filter_write(struct file *file,
2880 const char __user *buf,
2881 size_t count,
2882 loff_t *ppos)
2883 {
2884 struct task_struct *task;
2885 struct mm_struct *mm;
2886 unsigned int val;
2887 int ret;
2888 int i;
2889 unsigned long mask;
2890
2891 ret = kstrtouint_from_user(buf, count, 0, &val);
2892 if (ret < 0)
2893 return ret;
2894
2895 ret = -ESRCH;
2896 task = get_proc_task(file_inode(file));
2897 if (!task)
2898 goto out_no_task;
2899
2900 mm = get_task_mm(task);
2901 if (!mm)
2902 goto out_no_mm;
2903 ret = 0;
2904
2905 for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
2906 if (val & mask)
2907 set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2908 else
2909 clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2910 }
2911
2912 mmput(mm);
2913 out_no_mm:
2914 put_task_struct(task);
2915 out_no_task:
2916 if (ret < 0)
2917 return ret;
2918 return count;
2919 }
2920
2921 static const struct file_operations proc_coredump_filter_operations = {
2922 .read = proc_coredump_filter_read,
2923 .write = proc_coredump_filter_write,
2924 .llseek = generic_file_llseek,
2925 };
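
/*
 * A minimal userspace sketch (illustrative; bit assignments follow
 * Documentation/filesystems/proc.rst, e.g. bit 0 = anonymous private,
 * bit 4 = ELF headers). kstrtouint_from_user() uses base 0 above, so both
 * "0x33" and "51" are accepted on write:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/proc/self/coredump_filter", "r+");
 *		unsigned int cur;
 *
 *		if (!f || fscanf(f, "%x", &cur) != 1)
 *			return 1;
 *		printf("old filter: %#x\n", cur);
 *		rewind(f);
 *		fprintf(f, "0x33");	// anon mappings + ELF headers + private hugetlb
 *		return fclose(f) ? 1 : 0;
 *	}
 */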
2926 #endif
2927
2928 #ifdef CONFIG_TASK_IO_ACCOUNTING
2929 static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
2930 {
2931 struct task_io_accounting acct = task->ioac;
2932 unsigned long flags;
2933 int result;
2934
2935 result = down_read_killable(&task->signal->exec_update_lock);
2936 if (result)
2937 return result;
2938
2939 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
2940 result = -EACCES;
2941 goto out_unlock;
2942 }
2943
2944 if (whole && lock_task_sighand(task, &flags)) {
2945 struct task_struct *t = task;
2946
2947 task_io_accounting_add(&acct, &task->signal->ioac);
2948 while_each_thread(task, t)
2949 task_io_accounting_add(&acct, &t->ioac);
2950
2951 unlock_task_sighand(task, &flags);
2952 }
2953 seq_printf(m,
2954 "rchar: %llu\n"
2955 "wchar: %llu\n"
2956 "syscr: %llu\n"
2957 "syscw: %llu\n"
2958 "read_bytes: %llu\n"
2959 "write_bytes: %llu\n"
2960 "cancelled_write_bytes: %llu\n",
2961 (unsigned long long)acct.rchar,
2962 (unsigned long long)acct.wchar,
2963 (unsigned long long)acct.syscr,
2964 (unsigned long long)acct.syscw,
2965 (unsigned long long)acct.read_bytes,
2966 (unsigned long long)acct.write_bytes,
2967 (unsigned long long)acct.cancelled_write_bytes);
2968 result = 0;
2969
2970 out_unlock:
2971 up_read(&task->signal->exec_update_lock);
2972 return result;
2973 }
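
/*
 * Sample output shape (illustrative numbers), matching the seq_printf()
 * above:
 *
 *	rchar: 323934931
 *	wchar: 323929600
 *	syscr: 632687
 *	syscw: 632675
 *	read_bytes: 787456
 *	write_bytes: 323932160
 *	cancelled_write_bytes: 0
 */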
2974
2975 static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
2976 struct pid *pid, struct task_struct *task)
2977 {
2978 return do_io_accounting(task, m, 0);
2979 }
2980
2981 static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
2982 struct pid *pid, struct task_struct *task)
2983 {
2984 return do_io_accounting(task, m, 1);
2985 }
2986 #endif /* CONFIG_TASK_IO_ACCOUNTING */
2987
2988 #ifdef CONFIG_USER_NS
2989 static int proc_id_map_open(struct inode *inode, struct file *file,
2990 const struct seq_operations *seq_ops)
2991 {
2992 struct user_namespace *ns = NULL;
2993 struct task_struct *task;
2994 struct seq_file *seq;
2995 int ret = -EINVAL;
2996
2997 task = get_proc_task(inode);
2998 if (task) {
2999 rcu_read_lock();
3000 ns = get_user_ns(task_cred_xxx(task, user_ns));
3001 rcu_read_unlock();
3002 put_task_struct(task);
3003 }
3004 if (!ns)
3005 goto err;
3006
3007 ret = seq_open(file, seq_ops);
3008 if (ret)
3009 goto err_put_ns;
3010
3011 seq = file->private_data;
3012 seq->private = ns;
3013
3014 return 0;
3015 err_put_ns:
3016 put_user_ns(ns);
3017 err:
3018 return ret;
3019 }
3020
3021 static int proc_id_map_release(struct inode *inode, struct file *file)
3022 {
3023 struct seq_file *seq = file->private_data;
3024 struct user_namespace *ns = seq->private;
3025 put_user_ns(ns);
3026 return seq_release(inode, file);
3027 }
3028
3029 static int proc_uid_map_open(struct inode *inode, struct file *file)
3030 {
3031 return proc_id_map_open(inode, file, &proc_uid_seq_operations);
3032 }
3033
3034 static int proc_gid_map_open(struct inode *inode, struct file *file)
3035 {
3036 return proc_id_map_open(inode, file, &proc_gid_seq_operations);
3037 }
3038
3039 static int proc_projid_map_open(struct inode *inode, struct file *file)
3040 {
3041 return proc_id_map_open(inode, file, &proc_projid_seq_operations);
3042 }
3043
3044 static const struct file_operations proc_uid_map_operations = {
3045 .open = proc_uid_map_open,
3046 .write = proc_uid_map_write,
3047 .read = seq_read,
3048 .llseek = seq_lseek,
3049 .release = proc_id_map_release,
3050 };
3051
3052 static const struct file_operations proc_gid_map_operations = {
3053 .open = proc_gid_map_open,
3054 .write = proc_gid_map_write,
3055 .read = seq_read,
3056 .llseek = seq_lseek,
3057 .release = proc_id_map_release,
3058 };
3059
3060 static const struct file_operations proc_projid_map_operations = {
3061 .open = proc_projid_map_open,
3062 .write = proc_projid_map_write,
3063 .read = seq_read,
3064 .llseek = seq_lseek,
3065 .release = proc_id_map_release,
3066 };
3067
3068 static int proc_setgroups_open(struct inode *inode, struct file *file)
3069 {
3070 struct user_namespace *ns = NULL;
3071 struct task_struct *task;
3072 int ret;
3073
3074 ret = -ESRCH;
3075 task = get_proc_task(inode);
3076 if (task) {
3077 rcu_read_lock();
3078 ns = get_user_ns(task_cred_xxx(task, user_ns));
3079 rcu_read_unlock();
3080 put_task_struct(task);
3081 }
3082 if (!ns)
3083 goto err;
3084
3085 if (file->f_mode & FMODE_WRITE) {
3086 ret = -EACCES;
3087 if (!ns_capable(ns, CAP_SYS_ADMIN))
3088 goto err_put_ns;
3089 }
3090
3091 ret = single_open(file, &proc_setgroups_show, ns);
3092 if (ret)
3093 goto err_put_ns;
3094
3095 return 0;
3096 err_put_ns:
3097 put_user_ns(ns);
3098 err:
3099 return ret;
3100 }
3101
3102 static int proc_setgroups_release(struct inode *inode, struct file *file)
3103 {
3104 struct seq_file *seq = file->private_data;
3105 struct user_namespace *ns = seq->private;
3106 int ret = single_release(inode, file);
3107 put_user_ns(ns);
3108 return ret;
3109 }
3110
3111 static const struct file_operations proc_setgroups_operations = {
3112 .open = proc_setgroups_open,
3113 .write = proc_setgroups_write,
3114 .read = seq_read,
3115 .llseek = seq_lseek,
3116 .release = proc_setgroups_release,
3117 };
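
/*
 * A minimal userspace sketch of the intended write ordering (outer uid 1000
 * is an illustrative value): an unprivileged process that has done
 * unshare(CLONE_NEWUSER) must write "deny" to setgroups before gid_map will
 * accept a mapping:
 *
 *	#include <stdio.h>
 *
 *	static int write_str(const char *path, const char *s)
 *	{
 *		FILE *f = fopen(path, "w");
 *
 *		if (!f)
 *			return -1;
 *		fprintf(f, "%s", s);
 *		return fclose(f);
 *	}
 *
 *	int main(void)	// run after unshare(CLONE_NEWUSER)
 *	{
 *		write_str("/proc/self/setgroups", "deny");
 *		write_str("/proc/self/uid_map", "0 1000 1");
 *		write_str("/proc/self/gid_map", "0 1000 1");
 *		return 0;
 *	}
 */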
3118 #endif /* CONFIG_USER_NS */
3119
3120 static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
3121 struct pid *pid, struct task_struct *task)
3122 {
3123 int err = lock_trace(task);
3124 if (!err) {
3125 seq_printf(m, "%08x\n", task->personality);
3126 unlock_trace(task);
3127 }
3128 return err;
3129 }
3130
3131 #ifdef CONFIG_LIVEPATCH
3132 static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
3133 struct pid *pid, struct task_struct *task)
3134 {
3135 seq_printf(m, "%d\n", task->patch_state);
3136 return 0;
3137 }
3138 #endif /* CONFIG_LIVEPATCH */
3139
3140 #ifdef CONFIG_STACKLEAK_METRICS
3141 static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
3142 struct pid *pid, struct task_struct *task)
3143 {
3144 unsigned long prev_depth = THREAD_SIZE -
3145 (task->prev_lowest_stack & (THREAD_SIZE - 1));
3146 unsigned long depth = THREAD_SIZE -
3147 (task->lowest_stack & (THREAD_SIZE - 1));
3148
3149 seq_printf(m, "previous stack depth: %lu\nstack depth: %lu\n",
3150 prev_depth, depth);
3151 return 0;
3152 }
3153 #endif /* CONFIG_STACKLEAK_METRICS */
3154
3155 /*
3156 * Thread groups
3157 */
3158 static const struct file_operations proc_task_operations;
3159 static const struct inode_operations proc_task_inode_operations;
3160
3161 static const struct pid_entry tgid_base_stuff[] = {
3162 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
3163 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
3164 DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
3165 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
3166 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
3167 #ifdef CONFIG_NET
3168 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
3169 #endif
3170 REG("environ", S_IRUSR, proc_environ_operations),
3171 REG("auxv", S_IRUSR, proc_auxv_operations),
3172 ONE("status", S_IRUGO, proc_pid_status),
3173 ONE("personality", S_IRUSR, proc_pid_personality),
3174 ONE("limits", S_IRUGO, proc_pid_limits),
3175 #ifdef CONFIG_SCHED_DEBUG
3176 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
3177 #endif
3178 #ifdef CONFIG_SCHED_AUTOGROUP
3179 REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
3180 #endif
3181 #ifdef CONFIG_TIME_NS
3182 REG("timens_offsets", S_IRUGO|S_IWUSR, proc_timens_offsets_operations),
3183 #endif
3184 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
3185 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
3186 ONE("syscall", S_IRUSR, proc_pid_syscall),
3187 #endif
3188 REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
3189 ONE("stat", S_IRUGO, proc_tgid_stat),
3190 ONE("statm", S_IRUGO, proc_pid_statm),
3191 REG("maps", S_IRUGO, proc_pid_maps_operations),
3192 #ifdef CONFIG_NUMA
3193 REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
3194 #endif
3195 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
3196 LNK("cwd", proc_cwd_link),
3197 LNK("root", proc_root_link),
3198 LNK("exe", proc_exe_link),
3199 REG("mounts", S_IRUGO, proc_mounts_operations),
3200 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
3201 REG("mountstats", S_IRUSR, proc_mountstats_operations),
3202 #ifdef CONFIG_PROC_PAGE_MONITOR
3203 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
3204 REG("smaps", S_IRUGO, proc_pid_smaps_operations),
3205 REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
3206 REG("pagemap", S_IRUSR, proc_pagemap_operations),
3207 #endif
3208 #ifdef CONFIG_SECURITY
3209 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
3210 #endif
3211 #ifdef CONFIG_KALLSYMS
3212 ONE("wchan", S_IRUGO, proc_pid_wchan),
3213 #endif
3214 #ifdef CONFIG_STACKTRACE
3215 ONE("stack", S_IRUSR, proc_pid_stack),
3216 #endif
3217 #ifdef CONFIG_SCHED_INFO
3218 ONE("schedstat", S_IRUGO, proc_pid_schedstat),
3219 #endif
3220 #ifdef CONFIG_LATENCYTOP
3221 REG("latency", S_IRUGO, proc_lstats_operations),
3222 #endif
3223 #ifdef CONFIG_PROC_PID_CPUSET
3224 ONE("cpuset", S_IRUGO, proc_cpuset_show),
3225 #endif
3226 #ifdef CONFIG_CGROUPS
3227 ONE("cgroup", S_IRUGO, proc_cgroup_show),
3228 #endif
3229 #ifdef CONFIG_PROC_CPU_RESCTRL
3230 ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
3231 #endif
3232 ONE("oom_score", S_IRUGO, proc_oom_score),
3233 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
3234 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3235 #ifdef CONFIG_AUDIT
3236 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
3237 REG("sessionid", S_IRUGO, proc_sessionid_operations),
3238 #endif
3239 #ifdef CONFIG_FAULT_INJECTION
3240 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
3241 REG("fail-nth", 0644, proc_fail_nth_operations),
3242 #endif
3243 #ifdef CONFIG_ELF_CORE
3244 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
3245 #endif
3246 #ifdef CONFIG_TASK_IO_ACCOUNTING
3247 ONE("io", S_IRUSR, proc_tgid_io_accounting),
3248 #endif
3249 #ifdef CONFIG_USER_NS
3250 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
3251 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
3252 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
3253 REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
3254 #endif
3255 #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
3256 REG("timers", S_IRUGO, proc_timers_operations),
3257 #endif
3258 REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
3259 #ifdef CONFIG_LIVEPATCH
3260 ONE("patch_state", S_IRUSR, proc_pid_patch_state),
3261 #endif
3262 #ifdef CONFIG_STACKLEAK_METRICS
3263 ONE("stack_depth", S_IRUGO, proc_stack_depth),
3264 #endif
3265 #ifdef CONFIG_PROC_PID_ARCH_STATUS
3266 ONE("arch_status", S_IRUGO, proc_pid_arch_status),
3267 #endif
3268 #ifdef CONFIG_SECCOMP_CACHE_DEBUG
3269 ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
3270 #endif
3271 };
3272
3273 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
3274 {
3275 return proc_pident_readdir(file, ctx,
3276 tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
3277 }
3278
3279 static const struct file_operations proc_tgid_base_operations = {
3280 .read = generic_read_dir,
3281 .iterate_shared = proc_tgid_base_readdir,
3282 .llseek = generic_file_llseek,
3283 };
3284
3285 struct pid *tgid_pidfd_to_pid(const struct file *file)
3286 {
3287 if (file->f_op != &proc_tgid_base_operations)
3288 return ERR_PTR(-EBADF);
3289
3290 return proc_pid(file_inode(file));
3291 }
3292
3293 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
3294 {
3295 return proc_pident_lookup(dir, dentry,
3296 tgid_base_stuff,
3297 tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff));
3298 }
3299
3300 static const struct inode_operations proc_tgid_base_inode_operations = {
3301 .lookup = proc_tgid_base_lookup,
3302 .getattr = pid_getattr,
3303 .setattr = proc_setattr,
3304 .permission = proc_pid_permission,
3305 };
3306
3307 /**
3308 * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache.
3309 * @pid: pid that should be flushed.
3310 *
3311 * This function walks a list of inodes (that belong to any proc
3312 * filesystem) that are attached to the pid and flushes them from
3313 * the dentry cache.
3314 *
3315 * It is safe and reasonable to cache /proc entries for a task until
3316 * that task exits. After that they just clog up the dcache with
3317 * useless entries, possibly causing useful dcache entries to be
3318 * flushed instead. This routine is provided to flush those useless
3319 * dcache entries when a process is reaped.
3320 *
3321 * NOTE: This routine is just an optimization, so it does not guarantee
3322 * that no dcache entries will exist after a process is reaped;
3323 * it just makes it very unlikely that any will persist.
3324 */
3325
3326 void proc_flush_pid(struct pid *pid)
3327 {
3328 proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock);
3329 }
3330
3331 static struct dentry *proc_pid_instantiate(struct dentry * dentry,
3332 struct task_struct *task, const void *ptr)
3333 {
3334 struct inode *inode;
3335
3336 inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
3337 if (!inode)
3338 return ERR_PTR(-ENOENT);
3339
3340 inode->i_op = &proc_tgid_base_inode_operations;
3341 inode->i_fop = &proc_tgid_base_operations;
3342 inode->i_flags|=S_IMMUTABLE;
3343
3344 set_nlink(inode, nlink_tgid);
3345 pid_update_inode(task, inode);
3346
3347 d_set_d_op(dentry, &pid_dentry_operations);
3348 return d_splice_alias(inode, dentry);
3349 }
3350
3351 struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
3352 {
3353 struct task_struct *task;
3354 unsigned tgid;
3355 struct proc_fs_info *fs_info;
3356 struct pid_namespace *ns;
3357 struct dentry *result = ERR_PTR(-ENOENT);
3358
3359 tgid = name_to_int(&dentry->d_name);
3360 if (tgid == ~0U)
3361 goto out;
3362
3363 fs_info = proc_sb_info(dentry->d_sb);
3364 ns = fs_info->pid_ns;
3365 rcu_read_lock();
3366 task = find_task_by_pid_ns(tgid, ns);
3367 if (task)
3368 get_task_struct(task);
3369 rcu_read_unlock();
3370 if (!task)
3371 goto out;
3372
3373 /* Limit procfs to only ptraceable tasks */
3374 if (fs_info->hide_pid == HIDEPID_NOT_PTRACEABLE) {
3375 if (!has_pid_permissions(fs_info, task, HIDEPID_NO_ACCESS))
3376 goto out_put_task;
3377 }
3378
3379 result = proc_pid_instantiate(dentry, task, NULL);
3380 out_put_task:
3381 put_task_struct(task);
3382 out:
3383 return result;
3384 }
3385
3386 /*
3387 * Find the first task with tgid >= the requested tgid.
3388 */
3390 struct tgid_iter {
3391 unsigned int tgid;
3392 struct task_struct *task;
3393 };
3394 static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
3395 {
3396 struct pid *pid;
3397
3398 if (iter.task)
3399 put_task_struct(iter.task);
3400 rcu_read_lock();
3401 retry:
3402 iter.task = NULL;
3403 pid = find_ge_pid(iter.tgid, ns);
3404 if (pid) {
3405 iter.tgid = pid_nr_ns(pid, ns);
3406 iter.task = pid_task(pid, PIDTYPE_TGID);
3407 if (!iter.task) {
3408 iter.tgid += 1;
3409 goto retry;
3410 }
3411 get_task_struct(iter.task);
3412 }
3413 rcu_read_unlock();
3414 return iter;
3415 }
3416
3417 #define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)
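
/*
 * Offset layout, spelled out: positions below FIRST_PROCESS_ENTRY are
 * reserved for the non-process entries of /proc, "self" sits at
 * TGID_OFFSET - 2, "thread-self" at TGID_OFFSET - 1, and thread group N is
 * published at position N + TGID_OFFSET, so proc_pid_readdir() can resume
 * at an exact tgid from ctx->pos alone.
 */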
3418
3419 /* for the /proc/ directory itself, after non-process stuff has been done */
3420 int proc_pid_readdir(struct file *file, struct dir_context *ctx)
3421 {
3422 struct tgid_iter iter;
3423 struct proc_fs_info *fs_info = proc_sb_info(file_inode(file)->i_sb);
3424 struct pid_namespace *ns = proc_pid_ns(file_inode(file)->i_sb);
3425 loff_t pos = ctx->pos;
3426
3427 if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
3428 return 0;
3429
3430 if (pos == TGID_OFFSET - 2) {
3431 struct inode *inode = d_inode(fs_info->proc_self);
3432 if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
3433 return 0;
3434 ctx->pos = pos = pos + 1;
3435 }
3436 if (pos == TGID_OFFSET - 1) {
3437 struct inode *inode = d_inode(fs_info->proc_thread_self);
3438 if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
3439 return 0;
3440 ctx->pos = pos = pos + 1;
3441 }
3442 iter.tgid = pos - TGID_OFFSET;
3443 iter.task = NULL;
3444 for (iter = next_tgid(ns, iter);
3445 iter.task;
3446 iter.tgid += 1, iter = next_tgid(ns, iter)) {
3447 char name[10 + 1];
3448 unsigned int len;
3449
3450 cond_resched();
3451 if (!has_pid_permissions(fs_info, iter.task, HIDEPID_INVISIBLE))
3452 continue;
3453
3454 len = snprintf(name, sizeof(name), "%u", iter.tgid);
3455 ctx->pos = iter.tgid + TGID_OFFSET;
3456 if (!proc_fill_cache(file, ctx, name, len,
3457 proc_pid_instantiate, iter.task, NULL)) {
3458 put_task_struct(iter.task);
3459 return 0;
3460 }
3461 }
3462 ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
3463 return 0;
3464 }
3465
3466 /*
3467 * proc_tid_comm_permission is a special permission function exclusively
3468 * used for the node /proc/<pid>/task/<tid>/comm.
3469 * It bypasses generic permission checks in the case where a task of the same
3470 * task group attempts to access the node.
3471 * The rationale behind this is that glibc and bionic access this node for
3472 * cross thread naming (pthread_set/getname_np(!self)). However, if
3473 * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
3474 * which locks out the cross thread naming implementation.
3475 * This function makes sure that the node is always accessible for members
3476 * of the same thread group.
3477 */
3478 static int proc_tid_comm_permission(struct inode *inode, int mask)
3479 {
3480 bool is_same_tgroup;
3481 struct task_struct *task;
3482
3483 task = get_proc_task(inode);
3484 if (!task)
3485 return -ESRCH;
3486 is_same_tgroup = same_thread_group(current, task);
3487 put_task_struct(task);
3488
3489 if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
3490 /* This file (/proc/<pid>/task/<tid>/comm) can always be
3491 * read or written by the members of the corresponding
3492 * thread group.
3493 */
3494 return 0;
3495 }
3496
3497 return generic_permission(inode, mask);
3498 }
3499
3500 static const struct inode_operations proc_tid_comm_inode_operations = {
3501 .permission = proc_tid_comm_permission,
3502 };
3503
3504 /*
3505 * Tasks
3506 */
3507 static const struct pid_entry tid_base_stuff[] = {
3508 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
3509 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
3510 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
3511 #ifdef CONFIG_NET
3512 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
3513 #endif
3514 REG("environ", S_IRUSR, proc_environ_operations),
3515 REG("auxv", S_IRUSR, proc_auxv_operations),
3516 ONE("status", S_IRUGO, proc_pid_status),
3517 ONE("personality", S_IRUSR, proc_pid_personality),
3518 ONE("limits", S_IRUGO, proc_pid_limits),
3519 #ifdef CONFIG_SCHED_DEBUG
3520 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
3521 #endif
3522 NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
3523 &proc_tid_comm_inode_operations,
3524 &proc_pid_set_comm_operations, {}),
3525 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
3526 ONE("syscall", S_IRUSR, proc_pid_syscall),
3527 #endif
3528 REG("cmdline", S_IRUGO, proc_pid_cmdline_ops),
3529 ONE("stat", S_IRUGO, proc_tid_stat),
3530 ONE("statm", S_IRUGO, proc_pid_statm),
3531 REG("maps", S_IRUGO, proc_pid_maps_operations),
3532 #ifdef CONFIG_PROC_CHILDREN
3533 REG("children", S_IRUGO, proc_tid_children_operations),
3534 #endif
3535 #ifdef CONFIG_NUMA
3536 REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
3537 #endif
3538 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
3539 LNK("cwd", proc_cwd_link),
3540 LNK("root", proc_root_link),
3541 LNK("exe", proc_exe_link),
3542 REG("mounts", S_IRUGO, proc_mounts_operations),
3543 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
3544 #ifdef CONFIG_PROC_PAGE_MONITOR
3545 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
3546 REG("smaps", S_IRUGO, proc_pid_smaps_operations),
3547 REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
3548 REG("pagemap", S_IRUSR, proc_pagemap_operations),
3549 #endif
3550 #ifdef CONFIG_SECURITY
3551 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
3552 #endif
3553 #ifdef CONFIG_KALLSYMS
3554 ONE("wchan", S_IRUGO, proc_pid_wchan),
3555 #endif
3556 #ifdef CONFIG_STACKTRACE
3557 ONE("stack", S_IRUSR, proc_pid_stack),
3558 #endif
3559 #ifdef CONFIG_SCHED_INFO
3560 ONE("schedstat", S_IRUGO, proc_pid_schedstat),
3561 #endif
3562 #ifdef CONFIG_LATENCYTOP
3563 REG("latency", S_IRUGO, proc_lstats_operations),
3564 #endif
3565 #ifdef CONFIG_PROC_PID_CPUSET
3566 ONE("cpuset", S_IRUGO, proc_cpuset_show),
3567 #endif
3568 #ifdef CONFIG_CGROUPS
3569 ONE("cgroup", S_IRUGO, proc_cgroup_show),
3570 #endif
3571 #ifdef CONFIG_PROC_CPU_RESCTRL
3572 ONE("cpu_resctrl_groups", S_IRUGO, proc_resctrl_show),
3573 #endif
3574 ONE("oom_score", S_IRUGO, proc_oom_score),
3575 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
3576 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3577 #ifdef CONFIG_AUDIT
3578 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
3579 REG("sessionid", S_IRUGO, proc_sessionid_operations),
3580 #endif
3581 #ifdef CONFIG_FAULT_INJECTION
3582 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
3583 REG("fail-nth", 0644, proc_fail_nth_operations),
3584 #endif
3585 #ifdef CONFIG_TASK_IO_ACCOUNTING
3586 ONE("io", S_IRUSR, proc_tid_io_accounting),
3587 #endif
3588 #ifdef CONFIG_USER_NS
3589 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
3590 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
3591 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
3592 REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
3593 #endif
3594 #ifdef CONFIG_LIVEPATCH
3595 ONE("patch_state", S_IRUSR, proc_pid_patch_state),
3596 #endif
3597 #ifdef CONFIG_PROC_PID_ARCH_STATUS
3598 ONE("arch_status", S_IRUGO, proc_pid_arch_status),
3599 #endif
3600 #ifdef CONFIG_SECCOMP_CACHE_DEBUG
3601 ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
3602 #endif
3603 };
3604
3605 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
3606 {
3607 return proc_pident_readdir(file, ctx,
3608 tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3609 }
3610
3611 static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
3612 {
3613 return proc_pident_lookup(dir, dentry,
3614 tid_base_stuff,
3615 tid_base_stuff + ARRAY_SIZE(tid_base_stuff));
3616 }
3617
3618 static const struct file_operations proc_tid_base_operations = {
3619 .read = generic_read_dir,
3620 .iterate_shared = proc_tid_base_readdir,
3621 .llseek = generic_file_llseek,
3622 };
3623
3624 static const struct inode_operations proc_tid_base_inode_operations = {
3625 .lookup = proc_tid_base_lookup,
3626 .getattr = pid_getattr,
3627 .setattr = proc_setattr,
3628 };
3629
3630 static struct dentry *proc_task_instantiate(struct dentry *dentry,
3631 struct task_struct *task, const void *ptr)
3632 {
3633 struct inode *inode;
3634 inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
3635 if (!inode)
3636 return ERR_PTR(-ENOENT);
3637
3638 inode->i_op = &proc_tid_base_inode_operations;
3639 inode->i_fop = &proc_tid_base_operations;
3640 inode->i_flags |= S_IMMUTABLE;
3641
3642 set_nlink(inode, nlink_tid);
3643 pid_update_inode(task, inode);
3644
3645 d_set_d_op(dentry, &pid_dentry_operations);
3646 return d_splice_alias(inode, dentry);
3647 }
3648
3649 static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
3650 {
3651 struct task_struct *task;
3652 struct task_struct *leader = get_proc_task(dir);
3653 unsigned tid;
3654 struct proc_fs_info *fs_info;
3655 struct pid_namespace *ns;
3656 struct dentry *result = ERR_PTR(-ENOENT);
3657
3658 if (!leader)
3659 goto out_no_task;
3660
3661 tid = name_to_int(&dentry->d_name);
3662 if (tid == ~0U)
3663 goto out;
3664
3665 fs_info = proc_sb_info(dentry->d_sb);
3666 ns = fs_info->pid_ns;
3667 rcu_read_lock();
3668 task = find_task_by_pid_ns(tid, ns);
3669 if (task)
3670 get_task_struct(task);
3671 rcu_read_unlock();
3672 if (!task)
3673 goto out;
3674 if (!same_thread_group(leader, task))
3675 goto out_drop_task;
3676
3677 result = proc_task_instantiate(dentry, task, NULL);
3678 out_drop_task:
3679 put_task_struct(task);
3680 out:
3681 put_task_struct(leader);
3682 out_no_task:
3683 return result;
3684 }
3685
3686 /*
3687 * Find the first tid of a thread group to return to user space.
3688 *
3689 * Usually this is just the thread group leader, but if the user's
3690 * buffer was too small or there was a seek into the middle of the
3691 * directory we have more work to do.
3692 *
3693 * In the case of a short read we start with find_task_by_pid_ns().
3694 *
3695 * In the case of a seek we start with the leader and walk nr
3696 * threads past it.
3697 */
3698 static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
3699 struct pid_namespace *ns)
3700 {
3701 struct task_struct *pos, *task;
3702 unsigned long nr = f_pos;
3703
3704 if (nr != f_pos) /* 32bit overflow? */
3705 return NULL;
3706
3707 rcu_read_lock();
3708 task = pid_task(pid, PIDTYPE_PID);
3709 if (!task)
3710 goto fail;
3711
3712 /* Attempt to start with the tid of a thread */
3713 if (tid && nr) {
3714 pos = find_task_by_pid_ns(tid, ns);
3715 if (pos && same_thread_group(pos, task))
3716 goto found;
3717 }
3718
3719 /* If nr exceeds the number of threads there is nothing to do */
3720 if (nr >= get_nr_threads(task))
3721 goto fail;
3722
3723 /* If we haven't found our starting place yet, start
3724 * with the leader and walk nr threads forward.
3725 */
3726 pos = task = task->group_leader;
3727 do {
3728 if (!nr--)
3729 goto found;
3730 } while_each_thread(task, pos);
3731 fail:
3732 pos = NULL;
3733 goto out;
3734 found:
3735 get_task_struct(pos);
3736 out:
3737 rcu_read_unlock();
3738 return pos;
3739 }
3740
3741 /*
3742 * Find the next thread in the thread list.
3743 * Return NULL if there is an error or no next thread.
3744 *
3745 * The reference to the input task_struct is released.
3746 */
3747 static struct task_struct *next_tid(struct task_struct *start)
3748 {
3749 struct task_struct *pos = NULL;
3750 rcu_read_lock();
3751 if (pid_alive(start)) {
3752 pos = next_thread(start);
3753 if (thread_group_leader(pos))
3754 pos = NULL;
3755 else
3756 get_task_struct(pos);
3757 }
3758 rcu_read_unlock();
3759 put_task_struct(start);
3760 return pos;
3761 }
3762
3763 /* for the /proc/TGID/task/ directories */
3764 static int proc_task_readdir(struct file *file, struct dir_context *ctx)
3765 {
3766 struct inode *inode = file_inode(file);
3767 struct task_struct *task;
3768 struct pid_namespace *ns;
3769 int tid;
3770
3771 if (proc_inode_is_dead(inode))
3772 return -ENOENT;
3773
3774 if (!dir_emit_dots(file, ctx))
3775 return 0;
3776
3777 /* f_version caches the tid value that the last readdir call couldn't
3778 * return. lseek aka telldir automagically resets f_version to 0.
3779 */
3780 ns = proc_pid_ns(inode->i_sb);
3781 tid = (int)file->f_version;
3782 file->f_version = 0;
3783 for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
3784 task;
3785 task = next_tid(task), ctx->pos++) {
3786 char name[10 + 1];
3787 unsigned int len;
3788 tid = task_pid_nr_ns(task, ns);
3789 len = snprintf(name, sizeof(name), "%u", tid);
3790 if (!proc_fill_cache(file, ctx, name, len,
3791 proc_task_instantiate, task, NULL)) {
3792 /* returning this tid failed, save it as the first
3793 * tid for the next readdir call */
3794 file->f_version = (u64)tid;
3795 put_task_struct(task);
3796 break;
3797 }
3798 }
3799
3800 return 0;
3801 }
3802
3803 static int proc_task_getattr(const struct path *path, struct kstat *stat,
3804 u32 request_mask, unsigned int query_flags)
3805 {
3806 struct inode *inode = d_inode(path->dentry);
3807 struct task_struct *p = get_proc_task(inode);
3808 generic_fillattr(inode, stat);
3809
3810 if (p) {
3811 stat->nlink += get_nr_threads(p);
3812 put_task_struct(p);
3813 }
3814
3815 return 0;
3816 }
3817
3818 static const struct inode_operations proc_task_inode_operations = {
3819 .lookup = proc_task_lookup,
3820 .getattr = proc_task_getattr,
3821 .setattr = proc_setattr,
3822 .permission = proc_pid_permission,
3823 };
3824
3825 static const struct file_operations proc_task_operations = {
3826 .read = generic_read_dir,
3827 .iterate_shared = proc_task_readdir,
3828 .llseek = generic_file_llseek,
3829 };
3830
3831 void __init set_proc_pid_nlink(void)
3832 {
3833 nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3834 nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
3835 }