// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
#include <linux/coredump.h>
#include <linux/sched/coredump.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
#include <linux/tracehook.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
#include <linux/fs.h>
#include <linux/path.h>
#include <linux/timekeeping.h>
#include <linux/elf.h>

#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
#include <asm/exec.h>

#include <trace/events/task.h>
#include "internal.h"

#include <trace/events/sched.h>

int core_uses_pid;
unsigned int core_pipe_limit;
char core_pattern[CORENAME_MAX_SIZE] = "core";
static int core_name_size = CORENAME_MAX_SIZE;

struct core_name {
	char *corename;
	int used, size;
};

/* The maximal length of core_pattern is also specified in sysctl.c */

static int expand_corename(struct core_name *cn, int size)
{
	char *corename = krealloc(cn->corename, size, GFP_KERNEL);

	if (!corename)
		return -ENOMEM;

	if (size > core_name_size) /* racy but harmless */
		core_name_size = size;

	cn->size = ksize(corename);
	cn->corename = corename;
	return 0;
}
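
/*
 * The cn_*printf() helpers below append formatted text to the growable
 * cn->corename buffer, expanding it through expand_corename() whenever
 * vsnprintf() reports that the output would not fit. For example
 * (illustrative only), cn_printf(cn, ".%d", 1234) appends ".1234" and
 * advances cn->used.
 */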

static __printf(2, 0) int cn_vprintf(struct core_name *cn, const char *fmt,
				     va_list arg)
{
	int free, need;
	va_list arg_copy;

again:
	free = cn->size - cn->used;

	va_copy(arg_copy, arg);
	need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy);
	va_end(arg_copy);

	if (need < free) {
		cn->used += need;
		return 0;
	}

	if (!expand_corename(cn, cn->size + need - free + 1))
		goto again;

	return -ENOMEM;
}

static __printf(2, 3) int cn_printf(struct core_name *cn, const char *fmt, ...)
{
	va_list arg;
	int ret;

	va_start(arg, fmt);
	ret = cn_vprintf(cn, fmt, arg);
	va_end(arg);

	return ret;
}

static __printf(2, 3)
int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
{
	int cur = cn->used;
	va_list arg;
	int ret;

	va_start(arg, fmt);
	ret = cn_vprintf(cn, fmt, arg);
	va_end(arg);

	if (ret == 0) {
		/*
		 * Ensure that this coredump name component can't cause the
		 * resulting corefile path to consist of a ".." or ".".
		 */
		if ((cn->used - cur == 1 && cn->corename[cur] == '.') ||
		    (cn->used - cur == 2 && cn->corename[cur] == '.'
				&& cn->corename[cur+1] == '.'))
			cn->corename[cur] = '!';

		/*
		 * Empty names are fishy and could be used to create a "//" in a
		 * corefile name, causing the coredump to happen one directory
		 * level too high. Enforce that all components of the core
		 * pattern are at least one character long.
		 */
		if (cn->used == cur)
			ret = cn_printf(cn, "!");
	}

	for (; cur < cn->used; ++cur) {
		if (cn->corename[cur] == '/')
			cn->corename[cur] = '!';
	}
	return ret;
}
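
/*
 * Example (illustrative): for a task whose comm is "a/b",
 * cn_esc_printf(cn, "%s", current->comm) stores "a!b"; a component that
 * is exactly ".." becomes "!.", and an empty component becomes "!", so
 * no expanded pattern component can ever walk up a directory level.
 */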

static int cn_print_exe_file(struct core_name *cn, bool name_only)
{
	struct file *exe_file;
	char *pathbuf, *path, *ptr;
	int ret;

	exe_file = get_mm_exe_file(current->mm);
	if (!exe_file)
		return cn_esc_printf(cn, "%s (path unknown)", current->comm);

	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!pathbuf) {
		ret = -ENOMEM;
		goto put_exe_file;
	}

	path = file_path(exe_file, pathbuf, PATH_MAX);
	if (IS_ERR(path)) {
		ret = PTR_ERR(path);
		goto free_buf;
	}

	if (name_only) {
		ptr = strrchr(path, '/');
		if (ptr)
			path = ptr + 1;
	}
	ret = cn_esc_printf(cn, "%s", path);

free_buf:
	kfree(pathbuf);
put_exe_file:
	fput(exe_file);
	return ret;
}

/* format_corename will inspect the pattern parameter, and output a
 * name into corename, which must have space for at least
 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
 */
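/*
 * Example (illustrative): with core_pattern = "/var/crash/core.%e.%p",
 * a crash of PID 1234 in a task named "myapp" produces
 * "/var/crash/core.myapp.1234". With a leading '|', e.g.
 * "|/usr/lib/systemd/systemd-coredump %P %s", format_corename() returns
 * ispipe > 0 and records per-argument offsets in *argv for the
 * usermode helper.
 */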
static int format_corename(struct core_name *cn, struct coredump_params *cprm,
			   size_t **argv, int *argc)
{
	const struct cred *cred = current_cred();
	const char *pat_ptr = core_pattern;
	int ispipe = (*pat_ptr == '|');
	bool was_space = false;
	int pid_in_pattern = 0;
	int err = 0;

	cn->used = 0;
	cn->corename = NULL;
	if (expand_corename(cn, core_name_size))
		return -ENOMEM;
	cn->corename[0] = '\0';

	if (ispipe) {
		int argvs = sizeof(core_pattern) / 2;
		(*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL);
		if (!(*argv))
			return -ENOMEM;
		(*argv)[(*argc)++] = 0;
		++pat_ptr;
		if (!(*pat_ptr))
			return -ENOMEM;
	}

	/* Repeat as long as we have more pattern to process and more output
	   space */
	while (*pat_ptr) {
		/*
		 * Split on spaces before doing template expansion so that
		 * %e and %E don't get split if they have spaces in them
		 */
		if (ispipe) {
			if (isspace(*pat_ptr)) {
				if (cn->used != 0)
					was_space = true;
				pat_ptr++;
				continue;
			} else if (was_space) {
				was_space = false;
				err = cn_printf(cn, "%c", '\0');
				if (err)
					return err;
				(*argv)[(*argc)++] = cn->used;
			}
		}
		if (*pat_ptr != '%') {
			err = cn_printf(cn, "%c", *pat_ptr++);
		} else {
			switch (*++pat_ptr) {
			/* single % at the end, drop that */
			case 0:
				goto out;
			/* Double percent, output one percent */
			case '%':
				err = cn_printf(cn, "%c", '%');
				break;
			/* pid */
			case 'p':
				pid_in_pattern = 1;
				err = cn_printf(cn, "%d",
					      task_tgid_vnr(current));
				break;
			/* global pid */
			case 'P':
				err = cn_printf(cn, "%d",
					      task_tgid_nr(current));
				break;
			case 'i':
				err = cn_printf(cn, "%d",
					      task_pid_vnr(current));
				break;
			case 'I':
				err = cn_printf(cn, "%d",
					      task_pid_nr(current));
				break;
			/* uid */
			case 'u':
				err = cn_printf(cn, "%u",
						from_kuid(&init_user_ns,
							  cred->uid));
				break;
			/* gid */
			case 'g':
				err = cn_printf(cn, "%u",
						from_kgid(&init_user_ns,
							  cred->gid));
				break;
			case 'd':
				err = cn_printf(cn, "%d",
					__get_dumpable(cprm->mm_flags));
				break;
			/* signal that caused the coredump */
			case 's':
				err = cn_printf(cn, "%d",
						cprm->siginfo->si_signo);
				break;
			/* UNIX time of coredump */
			case 't': {
				time64_t time;

				time = ktime_get_real_seconds();
				err = cn_printf(cn, "%lld", time);
				break;
			}
			/* hostname */
			case 'h':
				down_read(&uts_sem);
				err = cn_esc_printf(cn, "%s",
					      utsname()->nodename);
				up_read(&uts_sem);
				break;
			/* executable, could be changed by prctl PR_SET_NAME etc */
			case 'e':
				err = cn_esc_printf(cn, "%s", current->comm);
				break;
			/* file name of executable */
			case 'f':
				err = cn_print_exe_file(cn, true);
				break;
			case 'E':
				err = cn_print_exe_file(cn, false);
				break;
			/* core limit size */
			case 'c':
				err = cn_printf(cn, "%lu",
					      rlimit(RLIMIT_CORE));
				break;
			default:
				break;
			}
			++pat_ptr;
		}

		if (err)
			return err;
	}

out:
	/* Backward compatibility with core_uses_pid:
	 *
	 * If core_pattern does not include a %p (as is the default)
	 * and core_uses_pid is set, then .%pid will be appended to
	 * the filename. Do not do this for piped commands. */
	if (!ispipe && !pid_in_pattern && core_uses_pid) {
		err = cn_printf(cn, ".%d", task_tgid_vnr(current));
		if (err)
			return err;
	}
	return ispipe;
}

static int zap_process(struct task_struct *start, int exit_code, int flags)
{
	struct task_struct *t;
	int nr = 0;

	/* ignore all signals except SIGKILL, see prepare_signal() */
	start->signal->flags = SIGNAL_GROUP_COREDUMP | flags;
	start->signal->group_exit_code = exit_code;
	start->signal->group_stop_count = 0;

	for_each_thread(start, t) {
		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
		if (t != current && t->mm) {
			sigaddset(&t->pending.signal, SIGKILL);
			signal_wake_up(t, 1);
			nr++;
		}
	}

	return nr;
}

static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
			struct core_state *core_state, int exit_code)
{
	struct task_struct *g, *p;
	unsigned long flags;
	int nr = -EAGAIN;

	spin_lock_irq(&tsk->sighand->siglock);
	if (!signal_group_exit(tsk->signal)) {
		mm->core_state = core_state;
		tsk->signal->group_exit_task = tsk;
		nr = zap_process(tsk, exit_code, 0);
		clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
	}
	spin_unlock_irq(&tsk->sighand->siglock);
	if (unlikely(nr < 0))
		return nr;

	tsk->flags |= PF_DUMPCORE;
	if (atomic_read(&mm->mm_users) == nr + 1)
		goto done;
	/*
	 * We should find and kill all tasks which use this mm, and we should
	 * count them correctly into ->nr_threads. We don't take tasklist
	 * lock, but this is safe wrt:
	 *
	 * fork:
	 *	None of sub-threads can fork after zap_process(leader). All
	 *	processes which were created before this point should be
	 *	visible to zap_threads() because copy_process() adds the new
	 *	process to the tail of init_task.tasks list, and lock/unlock
	 *	of ->siglock provides a memory barrier.
	 *
	 * do_exit:
	 *	The caller holds mm->mmap_lock. This means that the task which
	 *	uses this mm can't pass exit_mm(), so it can't exit or clear
	 *	its ->mm.
	 *
	 * de_thread:
	 *	It does list_replace_rcu(&leader->tasks, &current->tasks),
	 *	we must see either old or new leader, this does not matter.
	 *	However, it can change p->sighand, so lock_task_sighand(p)
	 *	must be used. Since p->mm != NULL and we hold ->mmap_lock
	 *	it can't fail.
	 *
	 *	Note also that "g" can be the old leader with ->mm == NULL
	 *	and already unhashed and thus removed from ->thread_group.
	 *	This is OK, __unhash_process()->list_del_rcu() does not
	 *	clear the ->next pointer, we will find the new leader via
	 *	next_thread().
	 */
	rcu_read_lock();
	for_each_process(g) {
		if (g == tsk->group_leader)
			continue;
		if (g->flags & PF_KTHREAD)
			continue;

		for_each_thread(g, p) {
			if (unlikely(!p->mm))
				continue;
			if (unlikely(p->mm == mm)) {
				lock_task_sighand(p, &flags);
				nr += zap_process(p, exit_code,
							SIGNAL_GROUP_EXIT);
				unlock_task_sighand(p, &flags);
			}
			break;
		}
	}
	rcu_read_unlock();
done:
	atomic_set(&core_state->nr_threads, nr);
	return nr;
}

static int coredump_wait(int exit_code, struct core_state *core_state)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	int core_waiters = -EBUSY;

	init_completion(&core_state->startup);
	core_state->dumper.task = tsk;
	core_state->dumper.next = NULL;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	if (!mm->core_state)
		core_waiters = zap_threads(tsk, mm, core_state, exit_code);
	mmap_write_unlock(mm);

	if (core_waiters > 0) {
		struct core_thread *ptr;

		freezer_do_not_count();
		wait_for_completion(&core_state->startup);
		freezer_count();
		/*
		 * Wait for all the threads to become inactive, so that
		 * all the thread context (extended register state, like
		 * fpu etc) gets copied to the memory.
		 */
		ptr = core_state->dumper.next;
		while (ptr != NULL) {
			wait_task_inactive(ptr->task, 0);
			ptr = ptr->next;
		}
	}

	return core_waiters;
}

static void coredump_finish(struct mm_struct *mm, bool core_dumped)
{
	struct core_thread *curr, *next;
	struct task_struct *task;

	spin_lock_irq(&current->sighand->siglock);
	if (core_dumped && !__fatal_signal_pending(current))
		current->signal->group_exit_code |= 0x80;
	current->signal->group_exit_task = NULL;
	current->signal->flags = SIGNAL_GROUP_EXIT;
	spin_unlock_irq(&current->sighand->siglock);

	next = mm->core_state->dumper.next;
	while ((curr = next) != NULL) {
		next = curr->next;
		task = curr->task;
		/*
		 * see exit_mm(), curr->task must not see
		 * ->task == NULL before we read ->next.
		 */
		smp_mb();
		curr->task = NULL;
		wake_up_process(task);
	}

	mm->core_state = NULL;
}
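
/*
 * Note: the 0x80 OR'ed into group_exit_code above is the WCOREFLAG bit
 * of the wait(2) status, i.e. what makes WCOREDUMP() true in the parent
 * and lets shells print "(core dumped)" only when a dump was written.
 */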

static bool dump_interrupted(void)
{
	/*
	 * SIGKILL or freezing() interrupt the coredumping. Perhaps we
	 * can do try_to_freeze() and check __fatal_signal_pending(),
	 * but then we need to teach dump_write() to restart and clear
	 * TIF_SIGPENDING.
	 */
	return fatal_signal_pending(current) || freezing(current);
}

static void wait_for_dump_helpers(struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	pipe_lock(pipe);
	pipe->readers++;
	pipe->writers--;
	wake_up_interruptible_sync(&pipe->rd_wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	pipe_unlock(pipe);

	/*
	 * We actually want wait_event_freezable() but then we need
	 * to clear TIF_SIGPENDING and improve dump_interrupted().
	 */
	wait_event_interruptible(pipe->rd_wait, pipe->readers == 1);

	pipe_lock(pipe);
	pipe->readers--;
	pipe->writers++;
	pipe_unlock(pipe);
}

/*
 * umh_pipe_setup
 * helper function to customize the process used
 * to collect the core in userspace. Specifically
 * it sets up a pipe and installs it as fd 0 (stdin)
 * for the process. Returns 0 on success, or
 * a negative error code on failure.
 * Note that it also sets the core limit to 1. This
 * is a special value that we use to trap recursive
 * core dumps
 */
static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
{
	struct file *files[2];
	struct coredump_params *cp = (struct coredump_params *)info->data;
	int err = create_pipe_files(files, 0);
	if (err)
		return err;

	cp->file = files[1];

	err = replace_fd(0, files[0], 0);
	fput(files[0]);
	/* and disallow core files too */
	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};

	return err;
}
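
/*
 * Illustrative sketch (userspace, not part of this file): since
 * umh_pipe_setup() installs the read end of the pipe as fd 0, a minimal
 * helper for a hypothetical core_pattern = "|/usr/local/sbin/dumper %p"
 * only needs to drain stdin. Hypothetical dumper.c:
 *
 *	// build: cc -o dumper dumper.c
 *	#include <stdio.h>
 *
 *	int main(int argc, char **argv)
 *	{
 *		char path[64], buf[4096];
 *		size_t n;
 *		FILE *out;
 *
 *		snprintf(path, sizeof(path), "/var/crash/core.%s",
 *			 argc > 1 ? argv[1] : "unknown");
 *		out = fopen(path, "w");
 *		if (!out)
 *			return 1;
 *		while ((n = fread(buf, 1, sizeof(buf), stdin)) > 0)
 *			fwrite(buf, 1, n, out);
 *		return fclose(out) ? 1 : 0;
 *	}
 */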

void do_coredump(const kernel_siginfo_t *siginfo)
{
	struct core_state core_state;
	struct core_name cn;
	struct mm_struct *mm = current->mm;
	struct linux_binfmt *binfmt;
	const struct cred *old_cred;
	struct cred *cred;
	int retval = 0;
	int ispipe;
	size_t *argv = NULL;
	int argc = 0;
	/* require nonrelative corefile path and be extra careful */
	bool need_suid_safe = false;
	bool core_dumped = false;
	static atomic_t core_dump_count = ATOMIC_INIT(0);
	struct coredump_params cprm = {
		.siginfo = siginfo,
		.regs = signal_pt_regs(),
		.limit = rlimit(RLIMIT_CORE),
		/*
		 * We must use the same mm->flags while dumping core to avoid
		 * inconsistency of bit flags, since this flag is not protected
		 * by any locks.
		 */
		.mm_flags = mm->flags,
	};

	audit_core_dumps(siginfo->si_signo);

	binfmt = mm->binfmt;
	if (!binfmt || !binfmt->core_dump)
		goto fail;
	if (!__get_dumpable(cprm.mm_flags))
		goto fail;

	cred = prepare_creds();
	if (!cred)
		goto fail;
	/*
	 * We cannot trust fsuid as being the "true" uid of the process
	 * nor do we know its entire history. We only know it was tainted
	 * so we dump it as root in mode 2, and only into a controlled
	 * environment (pipe handler or fully qualified path).
	 */
	if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
		/* Setuid core dump mode */
		cred->fsuid = GLOBAL_ROOT_UID;	/* Dump root private */
		need_suid_safe = true;
	}

	retval = coredump_wait(siginfo->si_signo, &core_state);
	if (retval < 0)
		goto fail_creds;

	old_cred = override_creds(cred);

	ispipe = format_corename(&cn, &cprm, &argv, &argc);

	if (ispipe) {
		int argi;
		int dump_count;
		char **helper_argv;
		struct subprocess_info *sub_info;

		if (ispipe < 0) {
			printk(KERN_WARNING "format_corename failed\n");
			printk(KERN_WARNING "Aborting core\n");
			goto fail_unlock;
		}

		if (cprm.limit == 1) {
			/* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
			 *
			 * Normally core limits are irrelevant to pipes, since
			 * we're not writing to the file system, but we use
			 * cprm.limit of 1 here as a special value; this is a
			 * consistent way to catch recursive crashes.
			 * We can still crash if the core_pattern binary sets
			 * RLIM_CORE = !1, but it runs as root, and can do
			 * lots of stupid things.
			 *
			 * Note that we use task_tgid_vnr here to grab the pid
			 * of the process group leader. That way we get the
			 * right pid if a thread in a multi-threaded
			 * core_pattern process dies.
			 */
			printk(KERN_WARNING
				"Process %d(%s) has RLIMIT_CORE set to 1\n",
				task_tgid_vnr(current), current->comm);
			printk(KERN_WARNING "Aborting core\n");
			goto fail_unlock;
		}
		cprm.limit = RLIM_INFINITY;

		dump_count = atomic_inc_return(&core_dump_count);
		if (core_pipe_limit && (core_pipe_limit < dump_count)) {
			printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
			       task_tgid_vnr(current), current->comm);
			printk(KERN_WARNING "Skipping core dump\n");
			goto fail_dropcount;
		}

		helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv),
					    GFP_KERNEL);
		if (!helper_argv) {
			printk(KERN_WARNING "%s failed to allocate memory\n",
			       __func__);
			goto fail_dropcount;
		}
		for (argi = 0; argi < argc; argi++)
			helper_argv[argi] = cn.corename + argv[argi];
		helper_argv[argi] = NULL;

		retval = -ENOMEM;
		sub_info = call_usermodehelper_setup(helper_argv[0],
						helper_argv, NULL, GFP_KERNEL,
						umh_pipe_setup, NULL, &cprm);
		if (sub_info)
			retval = call_usermodehelper_exec(sub_info,
							  UMH_WAIT_EXEC);

		kfree(helper_argv);
		if (retval) {
			printk(KERN_INFO "Core dump to |%s pipe failed\n",
			       cn.corename);
			goto close_fail;
		}
	} else {
		struct user_namespace *mnt_userns;
		struct inode *inode;
		int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
				 O_LARGEFILE | O_EXCL;

		if (cprm.limit < binfmt->min_coredump)
			goto fail_unlock;

		if (need_suid_safe && cn.corename[0] != '/') {
			printk(KERN_WARNING "Pid %d(%s) can only dump core "\
				"to fully qualified path!\n",
				task_tgid_vnr(current), current->comm);
			printk(KERN_WARNING "Skipping core dump\n");
			goto fail_unlock;
		}

		/*
		 * Unlink the file if it exists unless this is a SUID
		 * binary - in that case, we're running around with root
		 * privs and don't want to unlink another user's coredump.
		 */
		if (!need_suid_safe) {
			/*
			 * If it doesn't exist, that's fine. If there's some
			 * other problem, we'll catch it at the filp_open().
			 */
			do_unlinkat(AT_FDCWD, getname_kernel(cn.corename));
		}

		/*
		 * There is a race between unlinking and creating the
		 * file, but if that causes an EEXIST here, that's
		 * fine - another process raced with us while creating
		 * the corefile, and the other process won. To userspace,
		 * what matters is that at least one of the two processes
		 * writes its coredump successfully, not which one.
		 */
		if (need_suid_safe) {
			/*
			 * Using user namespaces, normal user tasks can change
			 * their current->fs->root to point to arbitrary
			 * directories. Since the intention of the "only dump
			 * with a fully qualified path" rule is to control where
			 * coredumps may be placed using root privileges,
			 * current->fs->root must not be used. Instead, use the
			 * root directory of init_task.
			 */
			struct path root;

			task_lock(&init_task);
			get_fs_root(init_task.fs, &root);
			task_unlock(&init_task);
			cprm.file = file_open_root(&root, cn.corename,
						   open_flags, 0600);
			path_put(&root);
		} else {
			cprm.file = filp_open(cn.corename, open_flags, 0600);
		}
		if (IS_ERR(cprm.file))
			goto fail_unlock;

		inode = file_inode(cprm.file);
		if (inode->i_nlink > 1)
			goto close_fail;
		if (d_unhashed(cprm.file->f_path.dentry))
			goto close_fail;
		/*
		 * AK: actually I see no reason to not allow this for named
		 * pipes etc, but keep the previous behaviour for now.
		 */
		if (!S_ISREG(inode->i_mode))
			goto close_fail;
		/*
		 * Don't dump core if the filesystem changed owner or mode
		 * of the file during file creation. This is an issue when
		 * a process dumps core while its cwd is e.g. on a vfat
		 * filesystem.
		 */
		mnt_userns = file_mnt_user_ns(cprm.file);
		if (!uid_eq(i_uid_into_mnt(mnt_userns, inode),
			    current_fsuid())) {
			pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n",
					    cn.corename);
			goto close_fail;
		}
		if ((inode->i_mode & 0677) != 0600) {
			pr_info_ratelimited("Core dump to %s aborted: cannot preserve file permissions\n",
					    cn.corename);
			goto close_fail;
		}
		if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
			goto close_fail;
		if (do_truncate(mnt_userns, cprm.file->f_path.dentry,
				0, 0, cprm.file))
			goto close_fail;
	}

	/* get us an unshared descriptor table; almost always a no-op */
	/* The cell spufs coredump code reads the file descriptor tables */
	retval = unshare_files();
	if (retval)
		goto close_fail;
	if (!dump_interrupted()) {
		/*
		 * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
		 * have this set to NULL.
		 */
		if (!cprm.file) {
			pr_info("Core dump to |%s disabled\n", cn.corename);
			goto close_fail;
		}
		file_start_write(cprm.file);
		core_dumped = binfmt->core_dump(&cprm);
		/*
		 * Ensures that file size is big enough to contain the current
		 * file position. This prevents gdb from complaining about
		 * a truncated file if the last "write" to the file was
		 * dump_skip.
		 */
		if (cprm.to_skip) {
			cprm.to_skip--;
			dump_emit(&cprm, "", 1);
		}
		file_end_write(cprm.file);
	}
	if (ispipe && core_pipe_limit)
		wait_for_dump_helpers(cprm.file);
close_fail:
	if (cprm.file)
		filp_close(cprm.file, NULL);
fail_dropcount:
	if (ispipe)
		atomic_dec(&core_dump_count);
fail_unlock:
	kfree(argv);
	kfree(cn.corename);
	coredump_finish(mm, core_dumped);
	revert_creds(old_cred);
fail_creds:
	put_cred(cred);
fail:
	return;
}

/*
 * Core dumping helper functions. These are the only things you should
 * do on a core-file: use only these functions to write out all the
 * necessary info.
 */
static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr)
{
	struct file *file = cprm->file;
	loff_t pos = file->f_pos;
	ssize_t n;

	if (cprm->written + nr > cprm->limit)
		return 0;

	if (dump_interrupted())
		return 0;
	n = __kernel_write(file, addr, nr, &pos);
	if (n != nr)
		return 0;
	file->f_pos = pos;
	cprm->written += n;
	cprm->pos += n;

	return 1;
}

static int __dump_skip(struct coredump_params *cprm, size_t nr)
{
	static char zeroes[PAGE_SIZE];
	struct file *file = cprm->file;

	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
		if (dump_interrupted() ||
		    file->f_op->llseek(file, nr, SEEK_CUR) < 0)
			return 0;
		cprm->pos += nr;
		return 1;
	} else {
		while (nr > PAGE_SIZE) {
			if (!__dump_emit(cprm, zeroes, PAGE_SIZE))
				return 0;
			nr -= PAGE_SIZE;
		}
		return __dump_emit(cprm, zeroes, nr);
	}
}

int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
{
	if (cprm->to_skip) {
		if (!__dump_skip(cprm, cprm->to_skip))
			return 0;
		cprm->to_skip = 0;
	}
	return __dump_emit(cprm, addr, nr);
}
EXPORT_SYMBOL(dump_emit);

void dump_skip_to(struct coredump_params *cprm, unsigned long pos)
{
	cprm->to_skip = pos - cprm->pos;
}
EXPORT_SYMBOL(dump_skip_to);

void dump_skip(struct coredump_params *cprm, size_t nr)
{
	cprm->to_skip += nr;
}
EXPORT_SYMBOL(dump_skip);
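
/*
 * Illustrative caller pattern (sketch; example_core_dump() and its
 * helpers are hypothetical, loosely modeled on a binfmt core_dump()
 * method): gaps are queued in cprm->to_skip and only materialized as a
 * single llseek() or run of zeroes by the next dump_emit():
 *
 *	static int example_core_dump(struct coredump_params *cprm)
 *	{
 *		struct elfhdr ehdr;
 *
 *		fill_example_header(&ehdr);	// hypothetical
 *		if (!dump_emit(cprm, &ehdr, sizeof(ehdr)))
 *			return 0;
 *		dump_skip_to(cprm, 4096);	// queue a gap, no I/O yet
 *		return dump_emit(cprm, payload, payload_len);
 *	}
 */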

#ifdef CONFIG_ELF_CORE
int dump_user_range(struct coredump_params *cprm, unsigned long start,
		    unsigned long len)
{
	unsigned long addr;

	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
		struct page *page;
		int stop;

		/*
		 * To avoid having to allocate page tables for virtual address
		 * ranges that have never been used yet, and also to make it
		 * easy to generate sparse core files, use a helper that returns
		 * NULL when encountering an empty page table entry that would
		 * otherwise have been filled with the zero page.
		 */
		page = get_dump_page(addr);
		if (page) {
			void *kaddr = kmap_local_page(page);

			stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
			kunmap_local(kaddr);
			put_page(page);
			if (stop)
				return 0;
		} else {
			dump_skip(cprm, PAGE_SIZE);
		}
	}
	return 1;
}
#endif

int dump_align(struct coredump_params *cprm, int align)
{
	unsigned mod = (cprm->pos + cprm->to_skip) & (align - 1);

	if (align & (align - 1))
		return 0;
	if (mod)
		cprm->to_skip += align - mod;
	return 1;
}
EXPORT_SYMBOL(dump_align);
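
/*
 * Note: dump_align() only supports power-of-two alignments. For example,
 * dump_align(cprm, 4) queues just enough skipped bytes that the next
 * dump_emit() starts on a 4-byte boundary; a non-power-of-two argument
 * makes it return 0 without adjusting cprm->to_skip.
 */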

/*
 * The purpose of always_dump_vma() is to make sure that special kernel mappings
 * that are useful for post-mortem analysis are included in every core dump.
 * In that way we ensure that the core dump is fully interpretable later
 * without matching up the same kernel and hardware config to see what PC values
 * meant. These special mappings include - vDSO, vsyscall, and other
 * architecture specific mappings
 */
static bool always_dump_vma(struct vm_area_struct *vma)
{
	/* Any vsyscall mappings? */
	if (vma == get_gate_vma(vma->vm_mm))
		return true;

	/*
	 * Assume that all vmas with a .name op should always be dumped.
	 * If this changes, a new vm_ops field can easily be added.
	 */
	if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
		return true;

	/*
	 * arch_vma_name() returns non-NULL for special architecture mappings,
	 * such as vDSO sections.
	 */
	if (arch_vma_name(vma))
		return true;

	return false;
}

#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1

/*
 * Decide how much of @vma's contents should be included in a core dump.
 */
static unsigned long vma_dump_size(struct vm_area_struct *vma,
				   unsigned long mm_flags)
{
#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))

	/* always dump the vdso and vsyscall sections */
	if (always_dump_vma(vma))
		goto whole;

	if (vma->vm_flags & VM_DONTDUMP)
		return 0;

	/* support for DAX */
	if (vma_is_dax(vma)) {
		if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
			goto whole;
		if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
			goto whole;
		return 0;
	}

	/* Hugetlb memory check */
	if (is_vm_hugetlb_page(vma)) {
		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
			goto whole;
		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
			goto whole;
		return 0;
	}

	/* Do not dump I/O mapped devices or special mappings */
	if (vma->vm_flags & VM_IO)
		return 0;

	/* By default, dump shared memory if mapped from an anonymous file. */
	if (vma->vm_flags & VM_SHARED) {
		if (file_inode(vma->vm_file)->i_nlink == 0 ?
		    FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
			goto whole;
		return 0;
	}

	/* Dump segments that have been written to. */
	if ((!IS_ENABLED(CONFIG_MMU) || vma->anon_vma) && FILTER(ANON_PRIVATE))
		goto whole;
	if (vma->vm_file == NULL)
		return 0;

	if (FILTER(MAPPED_PRIVATE))
		goto whole;

	/*
	 * If this is the beginning of an executable file mapping,
	 * dump the first page to aid in determining what was mapped here.
	 */
	if (FILTER(ELF_HEADERS) &&
	    vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
		if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
			return PAGE_SIZE;

		/*
		 * ELF libraries aren't always executable.
		 * We'll want to check whether the mapping starts with the ELF
		 * magic, but not now - we're holding the mmap lock,
		 * so copy_from_user() doesn't work here.
		 * Use a placeholder instead, and fix it up later in
		 * dump_vma_snapshot().
		 */
		return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER;
	}

#undef FILTER

	return 0;

whole:
	return vma->vm_end - vma->vm_start;
}
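
/*
 * The MMF_DUMP_* bits tested in vma_dump_size() are the ones userspace
 * toggles via /proc/<pid>/coredump_filter (see core(5)). For example,
 * the common default of 0x33 selects anonymous private and shared
 * memory, private hugetlb pages, and ELF headers.
 */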

static struct vm_area_struct *first_vma(struct task_struct *tsk,
					struct vm_area_struct *gate_vma)
{
	struct vm_area_struct *ret = tsk->mm->mmap;

	if (ret)
		return ret;
	return gate_vma;
}

/*
 * Helper function for iterating across a vma list. It ensures that the caller
 * will visit `gate_vma' prior to terminating the search.
 */
static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
				       struct vm_area_struct *gate_vma)
{
	struct vm_area_struct *ret;

	ret = this_vma->vm_next;
	if (ret)
		return ret;
	if (this_vma == gate_vma)
		return NULL;
	return gate_vma;
}

/*
 * Under the mmap_lock, take a snapshot of relevant information about the task's
 * VMAs.
 */
int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
		      struct core_vma_metadata **vma_meta,
		      size_t *vma_data_size_ptr)
{
	struct vm_area_struct *vma, *gate_vma;
	struct mm_struct *mm = current->mm;
	int i;
	size_t vma_data_size = 0;

	/*
	 * Once the stack expansion code is fixed to not change VMA bounds
	 * under mmap_lock in read mode, this can be changed to take the
	 * mmap_lock in read mode.
	 */
	if (mmap_write_lock_killable(mm))
		return -EINTR;

	gate_vma = get_gate_vma(mm);
	*vma_count = mm->map_count + (gate_vma ? 1 : 0);

	*vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL);
	if (!*vma_meta) {
		mmap_write_unlock(mm);
		return -ENOMEM;
	}

	for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
	     vma = next_vma(vma, gate_vma), i++) {
		struct core_vma_metadata *m = (*vma_meta) + i;

		m->start = vma->vm_start;
		m->end = vma->vm_end;
		m->flags = vma->vm_flags;
		m->dump_size = vma_dump_size(vma, cprm->mm_flags);
	}

	mmap_write_unlock(mm);

	if (WARN_ON(i != *vma_count)) {
		kvfree(*vma_meta);
		return -EFAULT;
	}

	for (i = 0; i < *vma_count; i++) {
		struct core_vma_metadata *m = (*vma_meta) + i;

		if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) {
			char elfmag[SELFMAG];

			if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) ||
					memcmp(elfmag, ELFMAG, SELFMAG) != 0) {
				m->dump_size = 0;
			} else {
				m->dump_size = PAGE_SIZE;
			}
		}

		vma_data_size += m->dump_size;
	}

	*vma_data_size_ptr = vma_data_size;
	return 0;
}