]> git.proxmox.com Git - mirror_ubuntu-artful-kernel.git/blame - kernel/seccomp.c
seccomp: Filter flag to log all actions except SECCOMP_RET_ALLOW
[mirror_ubuntu-artful-kernel.git] / kernel / seccomp.c
CommitLineData
1da177e4
LT
1/*
2 * linux/kernel/seccomp.c
3 *
4 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
5 *
e2cfabdf
WD
6 * Copyright (C) 2012 Google, Inc.
7 * Will Drewry <wad@chromium.org>
8 *
9 * This defines a simple but solid secure-computing facility.
10 *
11 * Mode 1 uses a fixed list of allowed system calls.
12 * Mode 2 allows user-defined system call filters in the form
13 * of Berkeley Packet Filters/Linux Socket Filters.
1da177e4
LT
14 */
15
e2cfabdf 16#include <linux/atomic.h>
85e7bac3 17#include <linux/audit.h>
5b101740 18#include <linux/compat.h>
aac883e7 19#include <linux/kmemleak.h>
e2cfabdf
WD
20#include <linux/sched.h>
21#include <linux/seccomp.h>
c8bee430 22#include <linux/slab.h>
48dc92b9 23#include <linux/syscalls.h>
aac883e7 24#include <linux/sysctl.h>
1da177e4 25
a4412fc9 26#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
e2cfabdf 27#include <asm/syscall.h>
a4412fc9 28#endif
e2cfabdf
WD
29
30#ifdef CONFIG_SECCOMP_FILTER
e2cfabdf 31#include <linux/filter.h>
c2e1f2e3 32#include <linux/pid.h>
fb0fadf9 33#include <linux/ptrace.h>
e2cfabdf 34#include <linux/security.h>
e2cfabdf
WD
35#include <linux/tracehook.h>
36#include <linux/uaccess.h>
37
38/**
39 * struct seccomp_filter - container for seccomp BPF programs
40 *
41 * @usage: reference count to manage the object lifetime.
42 * get/put helpers should be used when accessing an instance
43 * outside of a lifetime-guarded section. In general, this
44 * is only needed for handling filters shared across tasks.
85438171 45 * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
e2cfabdf
WD
46 * @prev: points to a previously installed, or inherited, filter
47 * @len: the number of instructions in the program
119ce5c8 48 * @insnsi: the BPF program instructions to evaluate
e2cfabdf
WD
49 *
50 * seccomp_filter objects are organized in a tree linked via the @prev
51 * pointer. For any task, it appears to be a singly-linked list starting
52 * with current->seccomp.filter, the most recently attached or inherited filter.
53 * However, multiple filters may share a @prev node, by way of fork(), which
54 * results in a unidirectional tree existing in memory. This is similar to
55 * how namespaces work.
56 *
57 * seccomp_filter objects should never be modified after being attached
58 * to a task_struct (other than @usage).
59 */
60struct seccomp_filter {
61 atomic_t usage;
85438171 62 bool log;
e2cfabdf 63 struct seccomp_filter *prev;
7ae457c1 64 struct bpf_prog *prog;
e2cfabdf
WD
65};
66
/* Cap every path through the filter tree at 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((256 * 1024) / sizeof(struct sock_filter))
69
bd4cf0ed 70/*
e2cfabdf
WD
71 * Endianness is explicitly ignored and left for BPF program authors to manage
72 * as per the specific architecture.
73 */
bd4cf0ed 74static void populate_seccomp_data(struct seccomp_data *sd)
e2cfabdf 75{
bd4cf0ed
AS
76 struct task_struct *task = current;
77 struct pt_regs *regs = task_pt_regs(task);
2eac7648 78 unsigned long args[6];
e2cfabdf 79
bd4cf0ed 80 sd->nr = syscall_get_nr(task, regs);
0b747172 81 sd->arch = syscall_get_arch();
2eac7648
DB
82 syscall_get_arguments(task, regs, 0, 6, args);
83 sd->args[0] = args[0];
84 sd->args[1] = args[1];
85 sd->args[2] = args[2];
86 sd->args[3] = args[3];
87 sd->args[4] = args[4];
88 sd->args[5] = args[5];
bd4cf0ed 89 sd->instruction_pointer = KSTK_EIP(task);
e2cfabdf
WD
90}
91
92/**
93 * seccomp_check_filter - verify seccomp filter code
94 * @filter: filter to verify
95 * @flen: length of filter
96 *
4df95ff4 97 * Takes a previously checked filter (by bpf_check_classic) and
e2cfabdf
WD
98 * redirects all filter code that loads struct sk_buff data
99 * and related data through seccomp_bpf_load. It also
100 * enforces length and alignment checking of those loads.
101 *
102 * Returns 0 if the rule set is legal or -EINVAL if not.
103 */
104static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
105{
106 int pc;
107 for (pc = 0; pc < flen; pc++) {
108 struct sock_filter *ftest = &filter[pc];
109 u16 code = ftest->code;
110 u32 k = ftest->k;
111
112 switch (code) {
34805931 113 case BPF_LD | BPF_W | BPF_ABS:
bd4cf0ed 114 ftest->code = BPF_LDX | BPF_W | BPF_ABS;
e2cfabdf
WD
115 /* 32-bit aligned and not out of bounds. */
116 if (k >= sizeof(struct seccomp_data) || k & 3)
117 return -EINVAL;
118 continue;
34805931 119 case BPF_LD | BPF_W | BPF_LEN:
bd4cf0ed 120 ftest->code = BPF_LD | BPF_IMM;
e2cfabdf
WD
121 ftest->k = sizeof(struct seccomp_data);
122 continue;
34805931 123 case BPF_LDX | BPF_W | BPF_LEN:
bd4cf0ed 124 ftest->code = BPF_LDX | BPF_IMM;
e2cfabdf
WD
125 ftest->k = sizeof(struct seccomp_data);
126 continue;
127 /* Explicitly include allowed calls. */
34805931
DB
128 case BPF_RET | BPF_K:
129 case BPF_RET | BPF_A:
130 case BPF_ALU | BPF_ADD | BPF_K:
131 case BPF_ALU | BPF_ADD | BPF_X:
132 case BPF_ALU | BPF_SUB | BPF_K:
133 case BPF_ALU | BPF_SUB | BPF_X:
134 case BPF_ALU | BPF_MUL | BPF_K:
135 case BPF_ALU | BPF_MUL | BPF_X:
136 case BPF_ALU | BPF_DIV | BPF_K:
137 case BPF_ALU | BPF_DIV | BPF_X:
138 case BPF_ALU | BPF_AND | BPF_K:
139 case BPF_ALU | BPF_AND | BPF_X:
140 case BPF_ALU | BPF_OR | BPF_K:
141 case BPF_ALU | BPF_OR | BPF_X:
142 case BPF_ALU | BPF_XOR | BPF_K:
143 case BPF_ALU | BPF_XOR | BPF_X:
144 case BPF_ALU | BPF_LSH | BPF_K:
145 case BPF_ALU | BPF_LSH | BPF_X:
146 case BPF_ALU | BPF_RSH | BPF_K:
147 case BPF_ALU | BPF_RSH | BPF_X:
148 case BPF_ALU | BPF_NEG:
149 case BPF_LD | BPF_IMM:
150 case BPF_LDX | BPF_IMM:
151 case BPF_MISC | BPF_TAX:
152 case BPF_MISC | BPF_TXA:
153 case BPF_LD | BPF_MEM:
154 case BPF_LDX | BPF_MEM:
155 case BPF_ST:
156 case BPF_STX:
157 case BPF_JMP | BPF_JA:
158 case BPF_JMP | BPF_JEQ | BPF_K:
159 case BPF_JMP | BPF_JEQ | BPF_X:
160 case BPF_JMP | BPF_JGE | BPF_K:
161 case BPF_JMP | BPF_JGE | BPF_X:
162 case BPF_JMP | BPF_JGT | BPF_K:
163 case BPF_JMP | BPF_JGT | BPF_X:
164 case BPF_JMP | BPF_JSET | BPF_K:
165 case BPF_JMP | BPF_JSET | BPF_X:
e2cfabdf
WD
166 continue;
167 default:
168 return -EINVAL;
169 }
170 }
171 return 0;
172}
173
174/**
175 * seccomp_run_filters - evaluates all seccomp filters against @syscall
176 * @syscall: number of the current system call
806b8085
KC
177 * @match: stores struct seccomp_filter that resulted in the return value,
178 * unless filter returned SECCOMP_RET_ALLOW, in which case it will
179 * be unchanged.
e2cfabdf
WD
180 *
181 * Returns valid seccomp BPF response codes.
182 */
806b8085
KC
183static u32 seccomp_run_filters(struct seccomp_data *sd,
184 struct seccomp_filter **match)
e2cfabdf 185{
d39bd00d 186 struct seccomp_data sd_local;
acf3b2c7 187 u32 ret = SECCOMP_RET_ALLOW;
8225d385
PK
188 /* Make sure cross-thread synced filter points somewhere sane. */
189 struct seccomp_filter *f =
190 lockless_dereference(current->seccomp.filter);
acf3b2c7
WD
191
192 /* Ensure unexpected behavior doesn't result in failing open. */
3ba2530c 193 if (unlikely(WARN_ON(f == NULL)))
acf3b2c7
WD
194 return SECCOMP_RET_KILL;
195
d39bd00d
AL
196 if (!sd) {
197 populate_seccomp_data(&sd_local);
198 sd = &sd_local;
199 }
bd4cf0ed 200
e2cfabdf
WD
201 /*
202 * All filters in the list are evaluated and the lowest BPF return
acf3b2c7 203 * value always takes priority (ignoring the DATA).
e2cfabdf 204 */
3ba2530c 205 for (; f; f = f->prev) {
d39bd00d 206 u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
8f577cad 207
806b8085 208 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) {
acf3b2c7 209 ret = cur_ret;
806b8085
KC
210 *match = f;
211 }
e2cfabdf
WD
212 }
213 return ret;
214}
1f41b450 215#endif /* CONFIG_SECCOMP_FILTER */
e2cfabdf 216
1f41b450
KC
217static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
218{
69f6a34b 219 assert_spin_locked(&current->sighand->siglock);
dbd95212 220
1f41b450
KC
221 if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
222 return false;
223
224 return true;
225}
226
3ba2530c
KC
227static inline void seccomp_assign_mode(struct task_struct *task,
228 unsigned long seccomp_mode)
1f41b450 229{
69f6a34b 230 assert_spin_locked(&task->sighand->siglock);
dbd95212 231
3ba2530c
KC
232 task->seccomp.mode = seccomp_mode;
233 /*
234 * Make sure TIF_SECCOMP cannot be set before the mode (and
235 * filter) is set.
236 */
237 smp_mb__before_atomic();
238 set_tsk_thread_flag(task, TIF_SECCOMP);
1f41b450
KC
239}
240
241#ifdef CONFIG_SECCOMP_FILTER
c2e1f2e3
KC
242/* Returns 1 if the parent is an ancestor of the child. */
243static int is_ancestor(struct seccomp_filter *parent,
244 struct seccomp_filter *child)
245{
246 /* NULL is the root ancestor. */
247 if (parent == NULL)
248 return 1;
249 for (; child; child = child->prev)
250 if (child == parent)
251 return 1;
252 return 0;
253}
254
255/**
256 * seccomp_can_sync_threads: checks if all threads can be synchronized
257 *
258 * Expects sighand and cred_guard_mutex locks to be held.
259 *
260 * Returns 0 on success, -ve on error, or the pid of a thread which was
261 * either not in the correct seccomp mode or it did not have an ancestral
262 * seccomp filter.
263 */
264static inline pid_t seccomp_can_sync_threads(void)
265{
266 struct task_struct *thread, *caller;
267
268 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
69f6a34b 269 assert_spin_locked(&current->sighand->siglock);
c2e1f2e3
KC
270
271 /* Validate all threads being eligible for synchronization. */
272 caller = current;
273 for_each_thread(caller, thread) {
274 pid_t failed;
275
276 /* Skip current, since it is initiating the sync. */
277 if (thread == caller)
278 continue;
279
280 if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
281 (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
282 is_ancestor(thread->seccomp.filter,
283 caller->seccomp.filter)))
284 continue;
285
286 /* Return the first thread that cannot be synchronized. */
287 failed = task_pid_vnr(thread);
288 /* If the pid cannot be resolved, then return -ESRCH */
289 if (unlikely(WARN_ON(failed == 0)))
290 failed = -ESRCH;
291 return failed;
292 }
293
294 return 0;
295}
296
297/**
298 * seccomp_sync_threads: sets all threads to use current's filter
299 *
300 * Expects sighand and cred_guard_mutex locks to be held, and for
301 * seccomp_can_sync_threads() to have returned success already
302 * without dropping the locks.
303 *
304 */
305static inline void seccomp_sync_threads(void)
306{
307 struct task_struct *thread, *caller;
308
309 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
69f6a34b 310 assert_spin_locked(&current->sighand->siglock);
c2e1f2e3
KC
311
312 /* Synchronize all threads. */
313 caller = current;
314 for_each_thread(caller, thread) {
315 /* Skip current, since it needs no changes. */
316 if (thread == caller)
317 continue;
318
319 /* Get a task reference for the new leaf node. */
320 get_seccomp_filter(caller);
321 /*
322 * Drop the task reference to the shared ancestor since
323 * current's path will hold a reference. (This also
324 * allows a put before the assignment.)
325 */
326 put_seccomp_filter(thread);
327 smp_store_release(&thread->seccomp.filter,
328 caller->seccomp.filter);
95a9c620
JH
329
330 /*
331 * Don't let an unprivileged task work around
332 * the no_new_privs restriction by creating
333 * a thread that sets it up, enters seccomp,
334 * then dies.
335 */
336 if (task_no_new_privs(caller))
337 task_set_no_new_privs(thread);
338
c2e1f2e3
KC
339 /*
340 * Opt the other thread into seccomp if needed.
341 * As threads are considered to be trust-realm
342 * equivalent (see ptrace_may_access), it is safe to
343 * allow one thread to transition the other.
344 */
95a9c620 345 if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
c2e1f2e3 346 seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
c2e1f2e3
KC
347 }
348}
349
e2cfabdf 350/**
c8bee430 351 * seccomp_prepare_filter: Prepares a seccomp filter for use.
e2cfabdf
WD
352 * @fprog: BPF program to install
353 *
c8bee430 354 * Returns filter on success or an ERR_PTR on failure.
e2cfabdf 355 */
c8bee430 356static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
e2cfabdf 357{
ac67eb2c
DB
358 struct seccomp_filter *sfilter;
359 int ret;
f8e529ed 360 const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE);
e2cfabdf
WD
361
362 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
c8bee430 363 return ERR_PTR(-EINVAL);
d9e12f42 364
c8bee430 365 BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
e2cfabdf
WD
366
367 /*
119ce5c8 368 * Installing a seccomp filter requires that the task has
e2cfabdf
WD
369 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
370 * This avoids scenarios where unprivileged tasks can affect the
371 * behavior of privileged children.
372 */
1d4457f9 373 if (!task_no_new_privs(current) &&
e2cfabdf
WD
374 security_capable_noaudit(current_cred(), current_user_ns(),
375 CAP_SYS_ADMIN) != 0)
c8bee430 376 return ERR_PTR(-EACCES);
e2cfabdf 377
bd4cf0ed 378 /* Allocate a new seccomp_filter */
ac67eb2c
DB
379 sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
380 if (!sfilter)
d9e12f42 381 return ERR_PTR(-ENOMEM);
ac67eb2c
DB
382
383 ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
f8e529ed 384 seccomp_check_filter, save_orig);
ac67eb2c
DB
385 if (ret < 0) {
386 kfree(sfilter);
387 return ERR_PTR(ret);
d9e12f42 388 }
bd4cf0ed 389
ac67eb2c 390 atomic_set(&sfilter->usage, 1);
e2cfabdf 391
ac67eb2c 392 return sfilter;
e2cfabdf
WD
393}
394
395/**
c8bee430 396 * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
e2cfabdf
WD
397 * @user_filter: pointer to the user data containing a sock_fprog.
398 *
399 * Returns 0 on success and non-zero otherwise.
400 */
c8bee430
KC
401static struct seccomp_filter *
402seccomp_prepare_user_filter(const char __user *user_filter)
e2cfabdf
WD
403{
404 struct sock_fprog fprog;
c8bee430 405 struct seccomp_filter *filter = ERR_PTR(-EFAULT);
e2cfabdf
WD
406
407#ifdef CONFIG_COMPAT
408 if (is_compat_task()) {
409 struct compat_sock_fprog fprog32;
410 if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
411 goto out;
412 fprog.len = fprog32.len;
413 fprog.filter = compat_ptr(fprog32.filter);
414 } else /* falls through to the if below. */
415#endif
416 if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
417 goto out;
c8bee430 418 filter = seccomp_prepare_filter(&fprog);
e2cfabdf 419out:
c8bee430
KC
420 return filter;
421}
422
423/**
424 * seccomp_attach_filter: validate and attach filter
425 * @flags: flags to change filter behavior
426 * @filter: seccomp filter to add to the current process
427 *
dbd95212
KC
428 * Caller must be holding current->sighand->siglock lock.
429 *
c8bee430
KC
430 * Returns 0 on success, -ve on error.
431 */
432static long seccomp_attach_filter(unsigned int flags,
433 struct seccomp_filter *filter)
434{
435 unsigned long total_insns;
436 struct seccomp_filter *walker;
437
69f6a34b 438 assert_spin_locked(&current->sighand->siglock);
dbd95212 439
c8bee430
KC
440 /* Validate resulting filter length. */
441 total_insns = filter->prog->len;
442 for (walker = current->seccomp.filter; walker; walker = walker->prev)
443 total_insns += walker->prog->len + 4; /* 4 instr penalty */
444 if (total_insns > MAX_INSNS_PER_PATH)
445 return -ENOMEM;
446
c2e1f2e3
KC
447 /* If thread sync has been requested, check that it is possible. */
448 if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
449 int ret;
450
451 ret = seccomp_can_sync_threads();
452 if (ret)
453 return ret;
454 }
455
85438171
TH
456 /* Set log flag, if present. */
457 if (flags & SECCOMP_FILTER_FLAG_LOG)
458 filter->log = true;
459
c8bee430
KC
460 /*
461 * If there is an existing filter, make it the prev and don't drop its
462 * task reference.
463 */
464 filter->prev = current->seccomp.filter;
465 current->seccomp.filter = filter;
466
c2e1f2e3
KC
467 /* Now that the new filter is in place, synchronize to all threads. */
468 if (flags & SECCOMP_FILTER_FLAG_TSYNC)
469 seccomp_sync_threads();
470
c8bee430 471 return 0;
e2cfabdf
WD
472}
473
236de4e6
ON
474void __get_seccomp_filter(struct seccomp_filter *filter)
475{
476 /* Reference count is bounded by the number of total processes. */
477 atomic_inc(&filter->usage);
478}
479
e2cfabdf
WD
480/* get_seccomp_filter - increments the reference count of the filter on @tsk */
481void get_seccomp_filter(struct task_struct *tsk)
482{
483 struct seccomp_filter *orig = tsk->seccomp.filter;
484 if (!orig)
485 return;
236de4e6 486 __get_seccomp_filter(orig);
e2cfabdf
WD
487}
488
c8bee430
KC
489static inline void seccomp_filter_free(struct seccomp_filter *filter)
490{
491 if (filter) {
bab18991 492 bpf_prog_destroy(filter->prog);
c8bee430
KC
493 kfree(filter);
494 }
495}
496
236de4e6 497static void __put_seccomp_filter(struct seccomp_filter *orig)
e2cfabdf 498{
e2cfabdf
WD
499 /* Clean up single-reference branches iteratively. */
500 while (orig && atomic_dec_and_test(&orig->usage)) {
501 struct seccomp_filter *freeme = orig;
502 orig = orig->prev;
c8bee430 503 seccomp_filter_free(freeme);
e2cfabdf
WD
504 }
505}
bb6ea430 506
236de4e6
ON
507/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
508void put_seccomp_filter(struct task_struct *tsk)
509{
510 __put_seccomp_filter(tsk->seccomp.filter);
511}
512
bb6ea430
WD
513/**
514 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
515 * @syscall: syscall number to send to userland
516 * @reason: filter-supplied reason code to send to userland (via si_errno)
517 *
518 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
519 */
520static void seccomp_send_sigsys(int syscall, int reason)
521{
522 struct siginfo info;
523 memset(&info, 0, sizeof(info));
524 info.si_signo = SIGSYS;
525 info.si_code = SYS_SECCOMP;
526 info.si_call_addr = (void __user *)KSTK_EIP(current);
527 info.si_errno = reason;
5e937a9a 528 info.si_arch = syscall_get_arch();
bb6ea430
WD
529 info.si_syscall = syscall;
530 force_sig_info(SIGSYS, &info, current);
531}
e2cfabdf 532#endif /* CONFIG_SECCOMP_FILTER */
1da177e4 533
c8a22a5f
TH
534/* For use with seccomp_actions_logged */
535#define SECCOMP_LOG_KILL (1 << 0)
536#define SECCOMP_LOG_TRAP (1 << 2)
537#define SECCOMP_LOG_ERRNO (1 << 3)
538#define SECCOMP_LOG_TRACE (1 << 4)
539#define SECCOMP_LOG_ALLOW (1 << 5)
540
541static u32 seccomp_actions_logged = SECCOMP_LOG_KILL | SECCOMP_LOG_TRAP |
542 SECCOMP_LOG_ERRNO | SECCOMP_LOG_TRACE;
543
85438171
TH
544static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
545 bool requested)
c8a22a5f
TH
546{
547 bool log = false;
548
549 switch (action) {
550 case SECCOMP_RET_ALLOW:
85438171 551 break;
c8a22a5f 552 case SECCOMP_RET_TRAP:
85438171
TH
553 log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
554 break;
c8a22a5f 555 case SECCOMP_RET_ERRNO:
85438171
TH
556 log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
557 break;
c8a22a5f 558 case SECCOMP_RET_TRACE:
85438171 559 log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
c8a22a5f
TH
560 break;
561 case SECCOMP_RET_KILL:
562 default:
563 log = seccomp_actions_logged & SECCOMP_LOG_KILL;
564 }
565
566 /*
85438171
TH
567 * Force an audit message to be emitted when the action is RET_KILL or
568 * the FILTER_FLAG_LOG bit was set and the action is allowed to be
569 * logged by the admin.
c8a22a5f
TH
570 */
571 if (log)
572 return __audit_seccomp(syscall, signr, action);
573
574 /*
575 * Let the audit subsystem decide if the action should be audited based
576 * on whether the current task itself is being audited.
577 */
578 return audit_seccomp(syscall, signr, action);
579}
580
1da177e4
LT
581/*
582 * Secure computing mode 1 allows only read/write/exit/sigreturn.
583 * To be fully secure this must be combined with rlimit
584 * to limit the stack allocations too.
585 */
586static int mode1_syscalls[] = {
587 __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
588 0, /* null terminated */
589};
590
5b101740 591#ifdef CONFIG_COMPAT
1da177e4
LT
592static int mode1_syscalls_32[] = {
593 __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
594 0, /* null terminated */
595};
596#endif
597
a4412fc9 598static void __secure_computing_strict(int this_syscall)
1da177e4 599{
a4412fc9
AL
600 int *syscall_whitelist = mode1_syscalls;
601#ifdef CONFIG_COMPAT
602 if (is_compat_task())
603 syscall_whitelist = mode1_syscalls_32;
604#endif
605 do {
606 if (*syscall_whitelist == this_syscall)
607 return;
608 } while (*++syscall_whitelist);
609
610#ifdef SECCOMP_DEBUG
611 dump_stack();
612#endif
85438171 613 seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL, true);
a4412fc9
AL
614 do_exit(SIGKILL);
615}
616
617#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
618void secure_computing_strict(int this_syscall)
619{
620 int mode = current->seccomp.mode;
621
13c4a901
TA
622 if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
623 unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
624 return;
625
221272f9 626 if (mode == SECCOMP_MODE_DISABLED)
a4412fc9
AL
627 return;
628 else if (mode == SECCOMP_MODE_STRICT)
629 __secure_computing_strict(this_syscall);
630 else
631 BUG();
632}
633#else
634int __secure_computing(void)
635{
d39bd00d 636 u32 phase1_result = seccomp_phase1(NULL);
13aa72f0
AL
637
638 if (likely(phase1_result == SECCOMP_PHASE1_OK))
639 return 0;
640 else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
641 return -1;
642 else
643 return seccomp_phase2(phase1_result);
644}
645
646#ifdef CONFIG_SECCOMP_FILTER
d39bd00d 647static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
13aa72f0
AL
648{
649 u32 filter_ret, action;
806b8085 650 struct seccomp_filter *match = NULL;
13aa72f0 651 int data;
1da177e4 652
3ba2530c
KC
653 /*
654 * Make sure that any changes to mode from another thread have
655 * been seen after TIF_SECCOMP was seen.
656 */
657 rmb();
658
806b8085 659 filter_ret = seccomp_run_filters(sd, &match);
13aa72f0
AL
660 data = filter_ret & SECCOMP_RET_DATA;
661 action = filter_ret & SECCOMP_RET_ACTION;
662
663 switch (action) {
664 case SECCOMP_RET_ERRNO:
580c57f1
KC
665 /* Set low-order bits as an errno, capped at MAX_ERRNO. */
666 if (data > MAX_ERRNO)
667 data = MAX_ERRNO;
d39bd00d 668 syscall_set_return_value(current, task_pt_regs(current),
13aa72f0
AL
669 -data, 0);
670 goto skip;
671
672 case SECCOMP_RET_TRAP:
673 /* Show the handler the original registers. */
d39bd00d 674 syscall_rollback(current, task_pt_regs(current));
13aa72f0
AL
675 /* Let the filter pass back 16 bits of data. */
676 seccomp_send_sigsys(this_syscall, data);
677 goto skip;
678
679 case SECCOMP_RET_TRACE:
680 return filter_ret; /* Save the rest for phase 2. */
681
682 case SECCOMP_RET_ALLOW:
806b8085
KC
683 /*
684 * Note that the "match" filter will always be NULL for
685 * this action since SECCOMP_RET_ALLOW is the starting
686 * state in seccomp_run_filters().
687 */
13aa72f0
AL
688 return SECCOMP_PHASE1_OK;
689
690 case SECCOMP_RET_KILL:
691 default:
85438171 692 seccomp_log(this_syscall, SIGSYS, action, true);
13aa72f0
AL
693 do_exit(SIGSYS);
694 }
695
696 unreachable();
697
698skip:
85438171 699 seccomp_log(this_syscall, 0, action, match ? match->log : false);
13aa72f0
AL
700 return SECCOMP_PHASE1_SKIP;
701}
1da177e4 702#endif
13aa72f0
AL
703
704/**
705 * seccomp_phase1() - run fast path seccomp checks on the current syscall
d39bd00d 706 * @arg sd: The seccomp_data or NULL
13aa72f0
AL
707 *
708 * This only reads pt_regs via the syscall_xyz helpers. The only change
709 * it will make to pt_regs is via syscall_set_return_value, and it will
710 * only do that if it returns SECCOMP_PHASE1_SKIP.
711 *
d39bd00d
AL
712 * If sd is provided, it will not read pt_regs at all.
713 *
13aa72f0
AL
714 * It may also call do_exit or force a signal; these actions must be
715 * safe.
716 *
717 * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
718 * be processed normally.
719 *
720 * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
721 * invoked. In this case, seccomp_phase1 will have set the return value
722 * using syscall_set_return_value.
723 *
724 * If it returns anything else, then the return value should be passed
725 * to seccomp_phase2 from a context in which ptrace hooks are safe.
726 */
d39bd00d 727u32 seccomp_phase1(struct seccomp_data *sd)
13aa72f0
AL
728{
729 int mode = current->seccomp.mode;
d39bd00d
AL
730 int this_syscall = sd ? sd->nr :
731 syscall_get_nr(current, task_pt_regs(current));
13aa72f0 732
13c4a901
TA
733 if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
734 unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
735 return SECCOMP_PHASE1_OK;
736
13aa72f0 737 switch (mode) {
e2cfabdf 738 case SECCOMP_MODE_STRICT:
13aa72f0
AL
739 __secure_computing_strict(this_syscall); /* may call do_exit */
740 return SECCOMP_PHASE1_OK;
e2cfabdf 741#ifdef CONFIG_SECCOMP_FILTER
13aa72f0 742 case SECCOMP_MODE_FILTER:
d39bd00d 743 return __seccomp_phase1_filter(this_syscall, sd);
e2cfabdf 744#endif
1da177e4
LT
745 default:
746 BUG();
747 }
13aa72f0 748}
1da177e4 749
13aa72f0
AL
750/**
751 * seccomp_phase2() - finish slow path seccomp work for the current syscall
752 * @phase1_result: The return value from seccomp_phase1()
753 *
754 * This must be called from a context in which ptrace hooks can be used.
755 *
756 * Returns 0 if the syscall should be processed or -1 to skip the syscall.
757 */
758int seccomp_phase2(u32 phase1_result)
759{
760 struct pt_regs *regs = task_pt_regs(current);
761 u32 action = phase1_result & SECCOMP_RET_ACTION;
762 int data = phase1_result & SECCOMP_RET_DATA;
763
764 BUG_ON(action != SECCOMP_RET_TRACE);
765
85438171
TH
766 /* We don't have access to the filter that was matched in the phase1
767 * stage in order to know if logging was requested when the filter was
768 * loaded. Logging for SECCOMP_RET_TRACE isn't particularly useful so
769 * hard-coding the _requested_ parameter of seccomp_log() to 'false'
770 * will suffice for this backport.
771 */
772 seccomp_log(syscall_get_nr(current, regs), 0, action, false);
13aa72f0
AL
773
774 /* Skip these calls if there is no tracer. */
775 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
776 syscall_set_return_value(current, regs,
777 -ENOSYS, 0);
778 return -1;
779 }
780
781 /* Allow the BPF to provide the event message */
782 ptrace_event(PTRACE_EVENT_SECCOMP, data);
783 /*
784 * The delivery of a fatal signal during event
785 * notification may silently skip tracer notification.
786 * Terminating the task now avoids executing a system
787 * call that may not be intended.
788 */
789 if (fatal_signal_pending(current))
790 do_exit(SIGSYS);
791 if (syscall_get_nr(current, regs) < 0)
792 return -1; /* Explicit request to skip. */
793
794 return 0;
1da177e4 795}
a4412fc9 796#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
1d9d02fe
AA
797
798long prctl_get_seccomp(void)
799{
800 return current->seccomp.mode;
801}
802
e2cfabdf 803/**
3b23dd12 804 * seccomp_set_mode_strict: internal function for setting strict seccomp
e2cfabdf
WD
805 *
806 * Once current->seccomp.mode is non-zero, it may not be changed.
807 *
808 * Returns 0 on success or -EINVAL on failure.
809 */
3b23dd12 810static long seccomp_set_mode_strict(void)
1d9d02fe 811{
3b23dd12 812 const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
e2cfabdf 813 long ret = -EINVAL;
1d9d02fe 814
dbd95212
KC
815 spin_lock_irq(&current->sighand->siglock);
816
1f41b450 817 if (!seccomp_may_assign_mode(seccomp_mode))
1d9d02fe
AA
818 goto out;
819
cf99abac 820#ifdef TIF_NOTSC
3b23dd12 821 disable_TSC();
cf99abac 822#endif
3ba2530c 823 seccomp_assign_mode(current, seccomp_mode);
3b23dd12
KC
824 ret = 0;
825
826out:
dbd95212 827 spin_unlock_irq(&current->sighand->siglock);
3b23dd12
KC
828
829 return ret;
830}
831
e2cfabdf 832#ifdef CONFIG_SECCOMP_FILTER
3b23dd12
KC
833/**
834 * seccomp_set_mode_filter: internal function for setting seccomp filter
48dc92b9 835 * @flags: flags to change filter behavior
3b23dd12
KC
836 * @filter: struct sock_fprog containing filter
837 *
838 * This function may be called repeatedly to install additional filters.
839 * Every filter successfully installed will be evaluated (in reverse order)
840 * for each system call the task makes.
841 *
842 * Once current->seccomp.mode is non-zero, it may not be changed.
843 *
844 * Returns 0 on success or -EINVAL on failure.
845 */
48dc92b9
KC
846static long seccomp_set_mode_filter(unsigned int flags,
847 const char __user *filter)
3b23dd12
KC
848{
849 const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
c8bee430 850 struct seccomp_filter *prepared = NULL;
3b23dd12
KC
851 long ret = -EINVAL;
852
48dc92b9 853 /* Validate flags. */
c2e1f2e3 854 if (flags & ~SECCOMP_FILTER_FLAG_MASK)
dbd95212 855 return -EINVAL;
48dc92b9 856
c8bee430
KC
857 /* Prepare the new filter before holding any locks. */
858 prepared = seccomp_prepare_user_filter(filter);
859 if (IS_ERR(prepared))
860 return PTR_ERR(prepared);
861
c2e1f2e3
KC
862 /*
863 * Make sure we cannot change seccomp or nnp state via TSYNC
864 * while another thread is in the middle of calling exec.
865 */
866 if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
867 mutex_lock_killable(&current->signal->cred_guard_mutex))
868 goto out_free;
869
dbd95212
KC
870 spin_lock_irq(&current->sighand->siglock);
871
3b23dd12
KC
872 if (!seccomp_may_assign_mode(seccomp_mode))
873 goto out;
874
c8bee430 875 ret = seccomp_attach_filter(flags, prepared);
3b23dd12 876 if (ret)
e2cfabdf 877 goto out;
c8bee430
KC
878 /* Do not free the successfully attached filter. */
879 prepared = NULL;
1d9d02fe 880
3ba2530c 881 seccomp_assign_mode(current, seccomp_mode);
e2cfabdf 882out:
dbd95212 883 spin_unlock_irq(&current->sighand->siglock);
c2e1f2e3
KC
884 if (flags & SECCOMP_FILTER_FLAG_TSYNC)
885 mutex_unlock(&current->signal->cred_guard_mutex);
886out_free:
c8bee430 887 seccomp_filter_free(prepared);
1d9d02fe
AA
888 return ret;
889}
3b23dd12 890#else
48dc92b9
KC
891static inline long seccomp_set_mode_filter(unsigned int flags,
892 const char __user *filter)
3b23dd12
KC
893{
894 return -EINVAL;
895}
896#endif
d78ab02c 897
4ae31a2f
TH
898static long seccomp_get_action_avail(const char __user *uaction)
899{
900 u32 action;
901
902 if (copy_from_user(&action, uaction, sizeof(action)))
903 return -EFAULT;
904
905 switch (action) {
906 case SECCOMP_RET_KILL:
907 case SECCOMP_RET_TRAP:
908 case SECCOMP_RET_ERRNO:
909 case SECCOMP_RET_TRACE:
910 case SECCOMP_RET_ALLOW:
911 break;
912 default:
913 return -EOPNOTSUPP;
914 }
915
916 return 0;
917}
918
48dc92b9
KC
919/* Common entry point for both prctl and syscall. */
920static long do_seccomp(unsigned int op, unsigned int flags,
921 const char __user *uargs)
922{
923 switch (op) {
924 case SECCOMP_SET_MODE_STRICT:
925 if (flags != 0 || uargs != NULL)
926 return -EINVAL;
927 return seccomp_set_mode_strict();
928 case SECCOMP_SET_MODE_FILTER:
929 return seccomp_set_mode_filter(flags, uargs);
4ae31a2f
TH
930 case SECCOMP_GET_ACTION_AVAIL:
931 if (flags != 0)
932 return -EINVAL;
933
934 return seccomp_get_action_avail(uargs);
48dc92b9
KC
935 default:
936 return -EINVAL;
937 }
938}
939
940SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
941 const char __user *, uargs)
942{
943 return do_seccomp(op, flags, uargs);
944}
945
d78ab02c
KC
946/**
947 * prctl_set_seccomp: configures current->seccomp.mode
948 * @seccomp_mode: requested mode to use
949 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
950 *
951 * Returns 0 on success or -EINVAL on failure.
952 */
953long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
954{
48dc92b9
KC
955 unsigned int op;
956 char __user *uargs;
957
3b23dd12
KC
958 switch (seccomp_mode) {
959 case SECCOMP_MODE_STRICT:
48dc92b9
KC
960 op = SECCOMP_SET_MODE_STRICT;
961 /*
962 * Setting strict mode through prctl always ignored filter,
963 * so make sure it is always NULL here to pass the internal
964 * check in do_seccomp().
965 */
966 uargs = NULL;
967 break;
3b23dd12 968 case SECCOMP_MODE_FILTER:
48dc92b9
KC
969 op = SECCOMP_SET_MODE_FILTER;
970 uargs = filter;
971 break;
3b23dd12
KC
972 default:
973 return -EINVAL;
974 }
48dc92b9
KC
975
976 /* prctl interface doesn't have flags, so they are always zero. */
977 return do_seccomp(op, 0, uargs);
d78ab02c 978}
f8e529ed
TA
979
980#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
981long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
982 void __user *data)
983{
984 struct seccomp_filter *filter;
985 struct sock_fprog_kern *fprog;
986 long ret;
987 unsigned long count = 0;
988
989 if (!capable(CAP_SYS_ADMIN) ||
990 current->seccomp.mode != SECCOMP_MODE_DISABLED) {
991 return -EACCES;
992 }
993
994 spin_lock_irq(&task->sighand->siglock);
995 if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
996 ret = -EINVAL;
997 goto out;
998 }
999
1000 filter = task->seccomp.filter;
1001 while (filter) {
1002 filter = filter->prev;
1003 count++;
1004 }
1005
1006 if (filter_off >= count) {
1007 ret = -ENOENT;
1008 goto out;
1009 }
1010 count -= filter_off;
1011
1012 filter = task->seccomp.filter;
1013 while (filter && count > 1) {
1014 filter = filter->prev;
1015 count--;
1016 }
1017
1018 if (WARN_ON(count != 1 || !filter)) {
1019 /* The filter tree shouldn't shrink while we're using it. */
1020 ret = -ENOENT;
1021 goto out;
1022 }
1023
1024 fprog = filter->prog->orig_prog;
1025 if (!fprog) {
1026 /* This must be a new non-cBPF filter, since we save every
1027 * every cBPF filter's orig_prog above when
1028 * CONFIG_CHECKPOINT_RESTORE is enabled.
1029 */
1030 ret = -EMEDIUMTYPE;
1031 goto out;
1032 }
1033
1034 ret = fprog->len;
1035 if (!data)
1036 goto out;
1037
236de4e6 1038 __get_seccomp_filter(filter);
f8e529ed
TA
1039 spin_unlock_irq(&task->sighand->siglock);
1040
1041 if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
1042 ret = -EFAULT;
1043
236de4e6 1044 __put_seccomp_filter(filter);
f8e529ed
TA
1045 return ret;
1046
1047out:
1048 spin_unlock_irq(&task->sighand->siglock);
1049 return ret;
1050}
1051#endif
aac883e7
TH
1052
1053#ifdef CONFIG_SYSCTL
1054
/* Human readable action names for friendly sysctl interaction */
#define SECCOMP_RET_KILL_NAME		"kill"
#define SECCOMP_RET_TRAP_NAME		"trap"
#define SECCOMP_RET_ERRNO_NAME		"errno"
#define SECCOMP_RET_TRACE_NAME		"trace"
#define SECCOMP_RET_ALLOW_NAME		"allow"

/*
 * All action names joined by single spaces into one string. Its size
 * (including the terminating NUL) is also used to size the buffer in
 * seccomp_actions_logged_handler().
 */
static const char seccomp_actions_avail[] = SECCOMP_RET_KILL_NAME	" "
					    SECCOMP_RET_TRAP_NAME	" "
					    SECCOMP_RET_ERRNO_NAME	" "
					    SECCOMP_RET_TRACE_NAME	" "
					    SECCOMP_RET_ALLOW_NAME;
1067
c8a22a5f
TH
1068struct seccomp_log_name {
1069 u32 log;
1070 const char *name;
1071};
1072
1073static const struct seccomp_log_name seccomp_log_names[] = {
1074 { SECCOMP_LOG_KILL, SECCOMP_RET_KILL_NAME },
1075 { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
1076 { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
1077 { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
1078 { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
1079 { }
1080};
1081
1082static bool seccomp_names_from_actions_logged(char *names, size_t size,
1083 u32 actions_logged)
1084{
1085 const struct seccomp_log_name *cur;
1086 bool append_space = false;
1087
1088 for (cur = seccomp_log_names; cur->name && size; cur++) {
1089 ssize_t ret;
1090
1091 if (!(actions_logged & cur->log))
1092 continue;
1093
1094 if (append_space) {
1095 ret = strscpy(names, " ", size);
1096 if (ret < 0)
1097 return false;
1098
1099 names += ret;
1100 size -= ret;
1101 } else
1102 append_space = true;
1103
1104 ret = strscpy(names, cur->name, size);
1105 if (ret < 0)
1106 return false;
1107
1108 names += ret;
1109 size -= ret;
1110 }
1111
1112 return true;
1113}
1114
1115static bool seccomp_action_logged_from_name(u32 *action_logged,
1116 const char *name)
1117{
1118 const struct seccomp_log_name *cur;
1119
1120 for (cur = seccomp_log_names; cur->name; cur++) {
1121 if (!strcmp(cur->name, name)) {
1122 *action_logged = cur->log;
1123 return true;
1124 }
1125 }
1126
1127 return false;
1128}
1129
1130static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
1131{
1132 char *name;
1133
1134 *actions_logged = 0;
1135 while ((name = strsep(&names, " ")) && *name) {
1136 u32 action_logged = 0;
1137
1138 if (!seccomp_action_logged_from_name(&action_logged, name))
1139 return false;
1140
1141 *actions_logged |= action_logged;
1142 }
1143
1144 return true;
1145}
1146
1147static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
1148 void __user *buffer, size_t *lenp,
1149 loff_t *ppos)
1150{
1151 char names[sizeof(seccomp_actions_avail)];
1152 struct ctl_table table;
1153 int ret;
1154
1155 if (write && !capable(CAP_SYS_ADMIN))
1156 return -EPERM;
1157
1158 memset(names, 0, sizeof(names));
1159
1160 if (!write) {
1161 if (!seccomp_names_from_actions_logged(names, sizeof(names),
1162 seccomp_actions_logged))
1163 return -EINVAL;
1164 }
1165
1166 table = *ro_table;
1167 table.data = names;
1168 table.maxlen = sizeof(names);
1169 ret = proc_dostring(&table, write, buffer, lenp, ppos);
1170 if (ret)
1171 return ret;
1172
1173 if (write) {
1174 u32 actions_logged;
1175
1176 if (!seccomp_actions_logged_from_names(&actions_logged,
1177 table.data))
1178 return -EINVAL;
1179
1180 if (actions_logged & SECCOMP_LOG_ALLOW)
1181 return -EINVAL;
1182
1183 seccomp_actions_logged = actions_logged;
1184 }
1185
1186 return 0;
1187}
1188
aac883e7
TH
/*
 * Directory the seccomp sysctl entries are registered under:
 * "kernel" then "seccomp". The empty last element ends the path.
 */
static struct ctl_path seccomp_sysctl_path[] = {
	{ .procname = "kernel", },
	{ .procname = "seccomp", },
	{ }
};
1194
/* Entries registered under seccomp_sysctl_path; the empty entry ends the table. */
static struct ctl_table seccomp_sysctl_table[] = {
	{
		/* Read-only (0444) view of the seccomp_actions_avail string. */
		.procname	= "actions_avail",
		.data		= (void *) &seccomp_actions_avail,
		.maxlen		= sizeof(seccomp_actions_avail),
		.mode		= 0444,
		.proc_handler	= proc_dostring,
	},
	{
		/*
		 * No .data/.maxlen here: seccomp_actions_logged_handler()
		 * supplies its own buffer on each call.
		 */
		.procname	= "actions_logged",
		.mode		= 0644,
		.proc_handler	= seccomp_actions_logged_handler,
	},
	{ }
};
1210
1211static int __init seccomp_sysctl_init(void)
1212{
1213 struct ctl_table_header *hdr;
1214
1215 hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
1216 if (!hdr)
1217 pr_warn("seccomp: sysctl registration failed\n");
1218 else
1219 kmemleak_not_leak(hdr);
1220
1221 return 0;
1222}
1223
1224device_initcall(seccomp_sysctl_init)
1225
1226#endif /* CONFIG_SYSCTL */