/*
 * fs/eventfd.c
 *
 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

DEFINE_PER_CPU(int, eventfd_wake_count);

struct eventfd_ctx {
        struct kref kref;
        wait_queue_head_t wqh;
        /*
         * Every time that a write(2) is performed on an eventfd, the
         * value of the __u64 being written is added to "count" and a
         * wakeup is performed on "wqh". A read(2) will return the "count"
         * value to userspace, and will reset "count" to zero. The kernel
         * side eventfd_signal() also adds to the "count" counter and
         * issues a wakeup.
         */
        __u64 count;
        unsigned int flags;
};

/**
 * eventfd_signal - Adds @n to the eventfd counter.
 * @ctx: [in] Pointer to the eventfd context.
 * @n: [in] Value to be added to the eventfd internal counter.
 *          The value cannot be negative.
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
 * value, and we signal this as an overflow condition by returning a POLLERR
 * to poll(2).
 *
 * Returns the amount by which the counter was incremented. This will be less
 * than @n if the counter has overflowed.
 */
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
        unsigned long flags;

        /*
         * Deadlock or stack overflow issues can happen if we recurse here
         * through waitqueue wakeup handlers. If the caller uses potentially
         * nested waitqueues with custom wakeup handlers, then it should
         * check eventfd_signal_count() before calling this function. If
         * it returns true, the eventfd_signal() call should be deferred to a
         * safe context.
         */
        if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
                return 0;

        spin_lock_irqsave(&ctx->wqh.lock, flags);
        this_cpu_inc(eventfd_wake_count);
        if (ULLONG_MAX - ctx->count < n)
                n = ULLONG_MAX - ctx->count;
        ctx->count += n;
        if (waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, POLLIN);
        this_cpu_dec(eventfd_wake_count);
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);

        return n;
}
EXPORT_SYMBOL_GPL(eventfd_signal);

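/*
 * Usage sketch (illustrative, not part of this file): a driver that has
 * stashed an eventfd context can notify userspace from atomic context,
 * e.g. an interrupt handler. "struct my_dev" and "irq_done_ctx" are
 * hypothetical names.
 *
 *      static irqreturn_t my_dev_irq(int irq, void *data)
 *      {
 *              struct my_dev *dev = data;
 *
 *              eventfd_signal(dev->irq_done_ctx, 1);
 *              return IRQ_HANDLED;
 *      }
 *
 * The counter is bumped by 1 and any poll(2)/read(2) waiters are woken
 * with POLLIN.
 */
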
static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
        kfree(ctx);
}

static void eventfd_free(struct kref *kref)
{
        struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);

        eventfd_free_ctx(ctx);
}

/**
 * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
 * @ctx: [in] Pointer to the eventfd context.
 *
 * Returns: In case of success, returns a pointer to the eventfd context.
 */
struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
{
        kref_get(&ctx->kref);
        return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_get);

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_get() or eventfd_ctx_fdget().
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
        kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);

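/*
 * Reference-counting sketch (illustrative): the context pairs
 * eventfd_ctx_get()/eventfd_ctx_put() like any other kref user, so one
 * context can safely be shared between subsystems. "other" is a
 * hypothetical structure.
 *
 *      other->ctx = eventfd_ctx_get(ctx);      // take an extra reference
 *      ...
 *      eventfd_ctx_put(other->ctx);            // drop it when done
 */
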
static int eventfd_release(struct inode *inode, struct file *file)
{
        struct eventfd_ctx *ctx = file->private_data;

        wake_up_poll(&ctx->wqh, POLLHUP);
        eventfd_ctx_put(ctx);
        return 0;
}

static unsigned int eventfd_poll(struct file *file, poll_table *wait)
{
        struct eventfd_ctx *ctx = file->private_data;
        unsigned int events = 0;
        u64 count;

        poll_wait(file, &ctx->wqh, wait);

        /*
         * All writes to ctx->count occur within ctx->wqh.lock. This read
         * can be done outside ctx->wqh.lock because we know that poll_wait
         * takes that lock (through add_wait_queue) if our caller will sleep.
         *
         * The read _can_ therefore seep into add_wait_queue's critical
         * section, but cannot move above it! add_wait_queue's spin_lock acts
         * as an acquire barrier and ensures that the read is ordered properly
         * against the writes. The following CAN happen and is safe:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     lock ctx->wqh.lock (in poll_wait)
         *     count = ctx->count
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        if (waitqueue_active)
         *                                          wake_up_locked_poll
         *                                        unlock ctx->wqh.lock
         *                                        eventfd_poll returns 0
         *
         * but the following, which would miss a wakeup, cannot happen:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     count = ctx->count (INVALID!)
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        **waitqueue_active is false**
         *                                        **no wake_up_locked_poll!**
         *                                        unlock ctx->wqh.lock
         *     lock ctx->wqh.lock (in poll_wait)
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *     eventfd_poll returns 0
         */
        count = READ_ONCE(ctx->count);

        if (count > 0)
                events |= POLLIN;
        if (count == ULLONG_MAX)
                events |= POLLERR;
        if (ULLONG_MAX - 1 > count)
                events |= POLLOUT;

        return events;
}

static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
        *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
        ctx->count -= *cnt;
}

/**
 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the
 *                                 wait queue entry.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error code:
 *
 * -EAGAIN : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
                                  __u64 *cnt)
{
        unsigned long flags;

        spin_lock_irqsave(&ctx->wqh.lock, flags);
        eventfd_ctx_do_read(ctx, cnt);
        __remove_wait_queue(&ctx->wqh, wait);
        if (*cnt != 0 && waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, POLLOUT);
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);

        return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);

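/*
 * Teardown sketch (illustrative): a consumer that queued its own wait
 * queue entry on the eventfd (e.g. via a poll_table callback, the way
 * KVM's irqfd consumer does) can detach atomically with respect to
 * concurrent writers. "struct consumer" and its members are hypothetical.
 *
 *      static void consumer_shutdown(struct consumer *c)
 *      {
 *              __u64 cnt;
 *
 *              eventfd_ctx_remove_wait_queue(c->ctx, &c->wait, &cnt);
 *              // The entry is now off the wait queue, so no further
 *              // wakeups can reach it; cnt holds any pending count.
 *              eventfd_ctx_put(c->ctx);
 *      }
 */
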
/**
 * eventfd_ctx_read - Reads the eventfd counter or waits if it is zero.
 * @ctx: [in] Pointer to eventfd context.
 * @no_wait: [in] Non-zero if the operation should not block.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error codes:
 *
 * - -EAGAIN      : The operation would have blocked but @no_wait was non-zero.
 * - -ERESTARTSYS : A signal interrupted the wait operation.
 *
 * If @no_wait is zero, the function might sleep until the eventfd internal
 * counter becomes greater than zero.
 */
ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
{
        ssize_t res;
        DECLARE_WAITQUEUE(wait, current);

        spin_lock_irq(&ctx->wqh.lock);
        *cnt = 0;
        res = -EAGAIN;
        if (ctx->count > 0)
                res = 0;
        else if (!no_wait) {
                __add_wait_queue(&ctx->wqh, &wait);
                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ctx->count > 0) {
                                res = 0;
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res == 0)) {
                eventfd_ctx_do_read(ctx, cnt);
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, POLLOUT);
        }
        spin_unlock_irq(&ctx->wqh.lock);

        return res;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_read);

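/*
 * Usage sketch (illustrative): kernel code holding an eventfd context can
 * consume the counter directly, sleeping until it becomes non-zero.
 * "wait_for_kick" is a hypothetical helper.
 *
 *      static int wait_for_kick(struct eventfd_ctx *ctx)
 *      {
 *              __u64 cnt;
 *
 *              // Blocking read: returns 0 with the (semaphore-aware)
 *              // count in cnt, or -ERESTARTSYS if a signal arrived.
 *              return eventfd_ctx_read(ctx, 0, &cnt);
 *      }
 */
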
static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
                            loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 cnt;

        if (count < sizeof(cnt))
                return -EINVAL;
        res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
        if (res < 0)
                return res;

        return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
}

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
                             loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt;
        DECLARE_WAITQUEUE(wait, current);

        if (count < sizeof(ucnt))
                return -EINVAL;
        if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
                return -EFAULT;
        if (ucnt == ULLONG_MAX)
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
        res = -EAGAIN;
        if (ULLONG_MAX - ctx->count > ucnt)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                __add_wait_queue(&ctx->wqh, &wait);
                for (res = 0;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ULLONG_MAX - ctx->count > ucnt) {
                                res = sizeof(ucnt);
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res > 0)) {
                ctx->count += ucnt;
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, POLLIN);
        }
        spin_unlock_irq(&ctx->wqh.lock);

        return res;
}

#ifdef CONFIG_PROC_FS
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct eventfd_ctx *ctx = f->private_data;

        spin_lock_irq(&ctx->wqh.lock);
        seq_printf(m, "eventfd-count: %16llx\n",
                   (unsigned long long)ctx->count);
        spin_unlock_irq(&ctx->wqh.lock);
}
#endif

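/*
 * Example fdinfo output (values illustrative): the eventfd-count line is
 * produced by eventfd_show_fdinfo() above, the other fields by the generic
 * fdinfo code.
 *
 *      $ cat /proc/$PID/fdinfo/$FD
 *      pos:    0
 *      flags:  02004002
 *      mnt_id: 11
 *      eventfd-count:                4
 */
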
static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo = eventfd_show_fdinfo,
#endif
        .release = eventfd_release,
        .poll = eventfd_poll,
        .read = eventfd_read,
        .write = eventfd_write,
        .llseek = noop_llseek,
};

/**
 * eventfd_fget - Acquire a reference of an eventfd file descriptor.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the eventfd file structure in case of success, or the
 * following error pointer:
 *
 * -EBADF  : Invalid @fd file descriptor.
 * -EINVAL : The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
        struct file *file;

        file = fget(fd);
        if (!file)
                return ERR_PTR(-EBADF);
        if (file->f_op != &eventfd_fops) {
                fput(file);
                return ERR_PTR(-EINVAL);
        }

        return file;
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise an error
 * pointer:
 *
 * -EBADF  : Invalid @fd file descriptor.
 * -EINVAL : The @fd file descriptor is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
        struct eventfd_ctx *ctx;
        struct fd f = fdget(fd);
        if (!f.file)
                return ERR_PTR(-EBADF);
        ctx = eventfd_ctx_fileget(f.file);
        fdput(f);
        return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);

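/*
 * Usage sketch (illustrative): an ioctl handler that receives an eventfd
 * file descriptor from userspace and caches a long-lived reference to its
 * context. "dev", "notify_ctx" and "arg_fd" are hypothetical names.
 *
 *      struct eventfd_ctx *ctx = eventfd_ctx_fdget(arg_fd);
 *
 *      if (IS_ERR(ctx))
 *              return PTR_ERR(ctx);
 *      dev->notify_ctx = ctx;  // holds a kref; drop with eventfd_ctx_put()
 */
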
/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL : The @file pointer is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
        if (file->f_op != &eventfd_fops)
                return ERR_PTR(-EINVAL);

        return eventfd_ctx_get(file->private_data);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);

/**
 * eventfd_file_create - Creates an eventfd file pointer.
 * @count: Initial eventfd counter value.
 * @flags: Flags for the eventfd file.
 *
 * This function creates an eventfd file pointer, without installing it into
 * the fd table. This is useful when the eventfd file is used during the
 * initialization of data structures that require extra setup after the eventfd
 * creation. So the eventfd creation is split into the file pointer creation
 * phase, and the file descriptor installation phase.
 * In this way races with userspace closing the newly installed file descriptor
 * can be avoided.
 * Returns an eventfd file pointer, or a proper error pointer.
 */
struct file *eventfd_file_create(unsigned int count, int flags)
{
        struct file *file;
        struct eventfd_ctx *ctx;

        /* Check the EFD_* constants for consistency. */
        BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

        if (flags & ~EFD_FLAGS_SET)
                return ERR_PTR(-EINVAL);

        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return ERR_PTR(-ENOMEM);

        kref_init(&ctx->kref);
        init_waitqueue_head(&ctx->wqh);
        ctx->count = count;
        ctx->flags = flags;

        file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
                                  O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
        if (IS_ERR(file))
                eventfd_free_ctx(ctx);

        return file;
}

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
        int fd, error;
        struct file *file;

        error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
        if (error < 0)
                return error;
        fd = error;

        file = eventfd_file_create(count, flags);
        if (IS_ERR(file)) {
                error = PTR_ERR(file);
                goto err_put_unused_fd;
        }
        fd_install(fd, file);

        return fd;

err_put_unused_fd:
        put_unused_fd(fd);

        return error;
}

SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
        return sys_eventfd2(count, 0);
}
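
/*
 * Userspace usage sketch (illustrative): the counter is a __u64, and both
 * read(2) and write(2) must transfer exactly 8 bytes.
 *
 *      #include <sys/eventfd.h>
 *
 *      int efd = eventfd(0, EFD_CLOEXEC);
 *      uint64_t v = 1;
 *
 *      write(efd, &v, sizeof(v));      // adds 1 to the counter, wakes pollers
 *      read(efd, &v, sizeof(v));       // returns the count and resets it to 0
 */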