// SPDX-License-Identifier: GPL-2.0
/* Watch queue and general notification mechanism, built on pipes
 *
 * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * See Documentation/watch_queue.rst
 */

#define pr_fmt(fmt) "watchq: " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/printk.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/sched/signal.h>
#include <linux/watch_queue.h>
#include <linux/pipe_fs_i.h>

MODULE_DESCRIPTION("Watch queue");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");

#define WATCH_QUEUE_NOTE_SIZE 128
#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE)

/*
 * This must be called under the RCU read-lock, which makes
 * sure that the wqueue still exists. It can then take the lock,
 * and check that the wqueue hasn't been destroyed, which in
 * turn makes sure that the notification pipe still exists.
 */
static inline bool lock_wqueue(struct watch_queue *wqueue)
{
	spin_lock_bh(&wqueue->lock);
	if (unlikely(wqueue->defunct)) {
		spin_unlock_bh(&wqueue->lock);
		return false;
	}
	return true;
}

static inline void unlock_wqueue(struct watch_queue *wqueue)
{
	spin_unlock_bh(&wqueue->lock);
}

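/*
 * Usage sketch (hypothetical caller; mirrors the pattern used by
 * __post_watch_notification() below): the lock only pins the queue for as
 * long as the caller remains inside an RCU read-side critical section.
 *
 *	rcu_read_lock();
 *	wqueue = rcu_dereference(watch->queue);
 *	if (lock_wqueue(wqueue)) {
 *		post_one_notification(wqueue, n);
 *		unlock_wqueue(wqueue);
 *	}
 *	rcu_read_unlock();
 */
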
static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
					 struct pipe_buffer *buf)
{
	struct watch_queue *wqueue = (struct watch_queue *)buf->private;
	struct page *page;
	unsigned int bit;

	/* We need to work out which note within the page this refers to, but
	 * the note might have been maximum size, so merely ANDing the offset
	 * off doesn't work.  OTOH, the note must've been more than zero size.
	 */
	bit = buf->offset + buf->len;
	if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0)
		bit -= WATCH_QUEUE_NOTE_SIZE;
	bit /= WATCH_QUEUE_NOTE_SIZE;

	page = buf->page;
	bit += page->index;

	set_bit(bit, wqueue->notes_bitmap);
	generic_pipe_buf_release(pipe, buf);
}

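/*
 * Worked example of the calculation above (illustrative numbers): for a
 * maximum-size, 128-byte note starting at offset 256 in its page, bit starts
 * as 256 + 128 = 384; 384 is a multiple of WATCH_QUEUE_NOTE_SIZE, so it is
 * pulled back to 256, and dividing by 128 gives note slot 2 within the page.
 * page->index then adds the page's base slot number, set when the pages were
 * allocated in watch_queue_set_size().
 */
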
// No try_steal function => no stealing
#define watch_queue_pipe_buf_try_steal NULL

/* New data written to a pipe may be appended to a buffer with this type. */
static const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
	.release	= watch_queue_pipe_buf_release,
	.try_steal	= watch_queue_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};

/*
 * Post a notification to a watch queue.
 *
 * Must be called with the RCU lock for reading, and the
 * watch_queue lock held, which guarantees that the pipe
 * hasn't been released.
 */
static bool post_one_notification(struct watch_queue *wqueue,
				  struct watch_notification *n)
{
	void *p;
	struct pipe_inode_info *pipe = wqueue->pipe;
	struct pipe_buffer *buf;
	struct page *page;
	unsigned int head, tail, mask, note, offset, len;
	bool done = false;

	if (!pipe)
		return false;

	spin_lock_irq(&pipe->rd_wait.lock);

	mask = pipe->ring_size - 1;
	head = pipe->head;
	tail = pipe->tail;
	if (pipe_full(head, tail, pipe->ring_size))
		goto lost;

	note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes);
	if (note >= wqueue->nr_notes)
		goto lost;

	page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE];
	offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE;
	get_page(page);
	len = n->info & WATCH_INFO_LENGTH;
	p = kmap_atomic(page);
	memcpy(p + offset, n, len);
	kunmap_atomic(p);

	buf = &pipe->bufs[head & mask];
	buf->page = page;
	buf->private = (unsigned long)wqueue;
	buf->ops = &watch_queue_pipe_buf_ops;
	buf->offset = offset;
	buf->len = len;
	buf->flags = PIPE_BUF_FLAG_WHOLE;
	smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */

	if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
		spin_unlock_irq(&pipe->rd_wait.lock);
		BUG();
	}
	wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	done = true;

out:
	spin_unlock_irq(&pipe->rd_wait.lock);
	if (done)
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	return done;

lost:
	buf = &pipe->bufs[(head - 1) & mask];
	buf->flags |= PIPE_BUF_FLAG_LOSS;
	goto out;
}

/*
 * Apply filter rules to a notification.
 */
static bool filter_watch_notification(const struct watch_filter *wf,
				      const struct watch_notification *n)
{
	const struct watch_type_filter *wt;
	unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8;
	unsigned int st_index = n->subtype / st_bits;
	unsigned int st_bit = 1U << (n->subtype % st_bits);
	int i;

	if (!test_bit(n->type, wf->type_filter))
		return false;

	for (i = 0; i < wf->nr_filters; i++) {
		wt = &wf->filters[i];
		if (n->type == wt->type &&
		    (wt->subtype_filter[st_index] & st_bit) &&
		    (n->info & wt->info_mask) == wt->info_filter)
			return true;
	}

	return false; /* If there is a filter, the default is to reject. */
}

/**
 * __post_watch_notification - Post an event notification
 * @wlist: The watch list to post the event to.
 * @n: The notification record to post.
 * @cred: The creds of the process that triggered the notification.
 * @id: The ID to match on the watch.
 *
 * Post a notification of an event into a set of watch queues and let the users
 * know.
 *
 * The size of the notification should be set in n->info & WATCH_INFO_LENGTH
 * and should be in units of sizeof(*n).
 */
void __post_watch_notification(struct watch_list *wlist,
			       struct watch_notification *n,
			       const struct cred *cred,
			       u64 id)
{
	const struct watch_filter *wf;
	struct watch_queue *wqueue;
	struct watch *watch;

	if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) {
		WARN_ON(1);
		return;
	}

	rcu_read_lock();

	hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) {
		if (watch->id != id)
			continue;
		n->info &= ~WATCH_INFO_ID;
		n->info |= watch->info_id;

		wqueue = rcu_dereference(watch->queue);
		wf = rcu_dereference(wqueue->filter);
		if (wf && !filter_watch_notification(wf, n))
			continue;

		if (security_post_notification(watch->cred, cred, n) < 0)
			continue;

		if (lock_wqueue(wqueue)) {
			post_one_notification(wqueue, n);
			unlock_wqueue(wqueue);
		}
	}

	rcu_read_unlock();
}
EXPORT_SYMBOL(__post_watch_notification);

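/*
 * Posting sketch (hypothetical watcher-side subsystem; wlist and the event
 * values are illustrative; post_watch_notification() is the NULL-checking
 * wrapper in linux/watch_queue.h):
 *
 *	struct watch_notification n;
 *
 *	n.type	  = WATCH_TYPE_META;
 *	n.subtype = WATCH_META_REMOVAL_NOTIFICATION;
 *	n.info	  = watch_sizeof(n);	// length lives in n.info
 *	post_watch_notification(wlist, &n, current_cred(), 0);
 */
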
/*
 * Allocate sufficient pages to preallocate for the requested number of
 * notifications.
 */
long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
{
	struct watch_queue *wqueue = pipe->watch_queue;
	struct page **pages;
	unsigned long *bitmap;
	unsigned long user_bufs;
	unsigned int bmsize;
	int ret, i, nr_pages;

	if (!wqueue)
		return -ENODEV;
	if (wqueue->notes)
		return -EBUSY;

	if (nr_notes < 1 ||
	    nr_notes > 512) /* TODO: choose a better hard limit */
		return -EINVAL;

	nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1);
	nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE;
	user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages);

	if (nr_pages > pipe->max_usage &&
	    (too_many_pipe_buffers_hard(user_bufs) ||
	     too_many_pipe_buffers_soft(user_bufs)) &&
	    pipe_is_unprivileged_user()) {
		ret = -EPERM;
		goto error;
	}

	nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
	ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes));
	if (ret < 0)
		goto error;

	ret = -ENOMEM;
	pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
	if (!pages)
		goto error;

	for (i = 0; i < nr_pages; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			goto error_p;
		pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE;
	}

	bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG;
	bmsize *= sizeof(unsigned long);
	bitmap = kmalloc(bmsize, GFP_KERNEL);
	if (!bitmap)
		goto error_p;

	memset(bitmap, 0xff, bmsize);
	wqueue->notes = pages;
	wqueue->notes_bitmap = bitmap;
	wqueue->nr_pages = nr_pages;
	wqueue->nr_notes = nr_notes;
	return 0;

error_p:
	while (--i >= 0)
		__free_page(pages[i]);
	kfree(pages);
error:
	(void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted);
	return ret;
}

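/*
 * Worked sizing example (assuming PAGE_SIZE == 4096): each page holds
 * 4096 / 128 = 32 notes, so a request for nr_notes = 100 rounds up to
 * nr_pages = 4 and the queue actually gets 4 * 32 = 128 notes, with the
 * pipe ring resized to roundup_pow_of_two(128) = 128 slots.
 */
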
/*
 * Set the filter on a watch queue.
 */
long watch_queue_set_filter(struct pipe_inode_info *pipe,
			    struct watch_notification_filter __user *_filter)
{
	struct watch_notification_type_filter *tf;
	struct watch_notification_filter filter;
	struct watch_type_filter *q;
	struct watch_filter *wfilter;
	struct watch_queue *wqueue = pipe->watch_queue;
	int ret, nr_filter = 0, i;

	if (!wqueue)
		return -ENODEV;

	if (!_filter) {
		/* Remove the old filter */
		wfilter = NULL;
		goto set;
	}

	/* Grab the user's filter specification */
	if (copy_from_user(&filter, _filter, sizeof(filter)) != 0)
		return -EFAULT;
	if (filter.nr_filters == 0 ||
	    filter.nr_filters > 16 ||
	    filter.__reserved != 0)
		return -EINVAL;

	tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf));
	if (IS_ERR(tf))
		return PTR_ERR(tf);

	ret = -EINVAL;
	for (i = 0; i < filter.nr_filters; i++) {
		if ((tf[i].info_filter & ~tf[i].info_mask) ||
		    tf[i].info_mask & WATCH_INFO_LENGTH)
			goto err_filter;
		/* Ignore any unknown types */
		if (tf[i].type >= WATCH_TYPE__NR)
			continue;
		nr_filter++;
	}

	/* Now we need to build the internal filter from only the relevant
	 * user-specified filters.
	 */
	ret = -ENOMEM;
	wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL);
	if (!wfilter)
		goto err_filter;
	wfilter->nr_filters = nr_filter;

	q = wfilter->filters;
	for (i = 0; i < filter.nr_filters; i++) {
		if (tf[i].type >= WATCH_TYPE__NR)
			continue;

		q->type			= tf[i].type;
		q->info_filter		= tf[i].info_filter;
		q->info_mask		= tf[i].info_mask;
		q->subtype_filter[0]	= tf[i].subtype_filter[0];
		__set_bit(q->type, wfilter->type_filter);
		q++;
	}

	kfree(tf);
set:
	pipe_lock(pipe);
	wfilter = rcu_replace_pointer(wqueue->filter, wfilter,
				      lockdep_is_held(&pipe->mutex));
	pipe_unlock(pipe);
	if (wfilter)
		kfree_rcu(wfilter, rcu);
	return 0;

err_filter:
	kfree(tf);
	return ret;
}

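/*
 * Userspace sketch (after the example in Documentation/watch_queue.rst; the
 * filter contents are illustrative): create a notification pipe, size it,
 * then install a filter:
 *
 *	struct watch_notification_filter *filter = ...;
 *	int fds[2];
 *
 *	pipe2(fds, O_NOTIFICATION_PIPE);
 *	ioctl(fds[0], IOC_WATCH_QUEUE_SET_SIZE, 256);
 *	ioctl(fds[0], IOC_WATCH_QUEUE_SET_FILTER, filter);
 */
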
static void __put_watch_queue(struct kref *kref)
{
	struct watch_queue *wqueue =
		container_of(kref, struct watch_queue, usage);
	struct watch_filter *wfilter;
	int i;

	for (i = 0; i < wqueue->nr_pages; i++)
		__free_page(wqueue->notes[i]);
	kfree(wqueue->notes);
	bitmap_free(wqueue->notes_bitmap);

	wfilter = rcu_access_pointer(wqueue->filter);
	if (wfilter)
		kfree_rcu(wfilter, rcu);
	kfree_rcu(wqueue, rcu);
}

/**
 * put_watch_queue - Dispose of a ref on a watchqueue.
 * @wqueue: The watch queue to unref.
 */
void put_watch_queue(struct watch_queue *wqueue)
{
	kref_put(&wqueue->usage, __put_watch_queue);
}
EXPORT_SYMBOL(put_watch_queue);

static void free_watch(struct rcu_head *rcu)
{
	struct watch *watch = container_of(rcu, struct watch, rcu);

	put_watch_queue(rcu_access_pointer(watch->queue));
	atomic_dec(&watch->cred->user->nr_watches);
	put_cred(watch->cred);
	kfree(watch);
}

static void __put_watch(struct kref *kref)
{
	struct watch *watch = container_of(kref, struct watch, usage);

	call_rcu(&watch->rcu, free_watch);
}

/*
 * Discard a watch.
 */
static void put_watch(struct watch *watch)
{
	kref_put(&watch->usage, __put_watch);
}

/**
 * init_watch - Initialise a watch
 * @watch: The watch to initialise.
 * @wqueue: The queue to assign.
 *
 * Initialise a watch and set the watch queue.
 */
void init_watch(struct watch *watch, struct watch_queue *wqueue)
{
	kref_init(&watch->usage);
	INIT_HLIST_NODE(&watch->list_node);
	INIT_HLIST_NODE(&watch->queue_node);
	rcu_assign_pointer(watch->queue, wqueue);
}

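/*
 * Setup sketch (hypothetical subsystem, loosely following keyctl_watch_key();
 * obj, id and the watchers list are illustrative): allocate a watch, point it
 * at a queue obtained from get_watch_queue(), then attach it to the object:
 *
 *	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
 *	if (!watch)
 *		return -ENOMEM;
 *	init_watch(watch, wqueue);
 *	watch->id = (unsigned long)obj;
 *	watch->info_id = (u32)id << WATCH_INFO_ID__SHIFT;
 *	ret = add_watch_to_object(watch, &obj->watchers);
 */
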
static int add_one_watch(struct watch *watch, struct watch_list *wlist, struct watch_queue *wqueue)
{
	const struct cred *cred;
	struct watch *w;

	hlist_for_each_entry(w, &wlist->watchers, list_node) {
		struct watch_queue *wq = rcu_access_pointer(w->queue);
		if (wqueue == wq && watch->id == w->id)
			return -EBUSY;
	}

	cred = current_cred();
	if (atomic_inc_return(&cred->user->nr_watches) > task_rlimit(current, RLIMIT_NOFILE)) {
		atomic_dec(&cred->user->nr_watches);
		return -EAGAIN;
	}

	watch->cred = get_cred(cred);
	rcu_assign_pointer(watch->watch_list, wlist);

	kref_get(&wqueue->usage);
	kref_get(&watch->usage);
	hlist_add_head(&watch->queue_node, &wqueue->watches);
	hlist_add_head_rcu(&watch->list_node, &wlist->watchers);
	return 0;
}

/**
 * add_watch_to_object - Add a watch on an object to a watch list
 * @watch: The watch to add
 * @wlist: The watch list to add to
 *
 * @watch->queue must have been set to point to the queue to post notifications
 * to and the watch list of the object to be watched.  @watch->cred must also
 * have been set to the appropriate credentials and a ref taken on them.
 *
 * The caller must pin the queue and the list both and must hold the list
 * locked against racing watch additions/removals.
 */
int add_watch_to_object(struct watch *watch, struct watch_list *wlist)
{
	struct watch_queue *wqueue;
	int ret = -ENOENT;

	rcu_read_lock();

	wqueue = rcu_access_pointer(watch->queue);
	if (lock_wqueue(wqueue)) {
		spin_lock(&wlist->lock);
		ret = add_one_watch(watch, wlist, wqueue);
		spin_unlock(&wlist->lock);
		unlock_wqueue(wqueue);
	}

	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(add_watch_to_object);

/**
 * remove_watch_from_object - Remove a watch or all watches from an object.
 * @wlist: The watch list to remove from
 * @wq: The watch queue of interest (ignored if @all is true)
 * @id: The ID of the watch to remove (ignored if @all is true)
 * @all: True to remove all objects
 *
 * Remove a specific watch or all watches from an object.  A notification is
 * sent to the watcher to tell them that this happened.
 */
int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq,
			     u64 id, bool all)
{
	struct watch_notification_removal n;
	struct watch_queue *wqueue;
	struct watch *watch;
	int ret = -EBADSLT;

	rcu_read_lock();

again:
	spin_lock(&wlist->lock);
	hlist_for_each_entry(watch, &wlist->watchers, list_node) {
		if (all ||
		    (watch->id == id && rcu_access_pointer(watch->queue) == wq))
			goto found;
	}
	spin_unlock(&wlist->lock);
	goto out;

found:
	ret = 0;
	hlist_del_init_rcu(&watch->list_node);
	rcu_assign_pointer(watch->watch_list, NULL);
	spin_unlock(&wlist->lock);

	/* We now own the reference on watch that used to belong to wlist. */

	n.watch.type = WATCH_TYPE_META;
	n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION;
	n.watch.info = watch->info_id | watch_sizeof(n.watch);
	n.id = id;
	if (id != 0)
		n.watch.info = watch->info_id | watch_sizeof(n);

	wqueue = rcu_dereference(watch->queue);

	if (lock_wqueue(wqueue)) {
		post_one_notification(wqueue, &n.watch);

		if (!hlist_unhashed(&watch->queue_node)) {
			hlist_del_init_rcu(&watch->queue_node);
			put_watch(watch);
		}

		unlock_wqueue(wqueue);
	}

	if (wlist->release_watch) {
		void (*release_watch)(struct watch *);

		release_watch = wlist->release_watch;
		rcu_read_unlock();
		(*release_watch)(watch);
		rcu_read_lock();
	}
	put_watch(watch);

	if (all && !hlist_empty(&wlist->watchers))
		goto again;
out:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(remove_watch_from_object);

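/*
 * Teardown sketch (hypothetical subsystem; obj and wqueue are illustrative):
 * drop one specific watch, or all watches when the object itself goes away:
 *
 *	remove_watch_from_object(&obj->watchers, wqueue,
 *				 (u64)(unsigned long)obj, false);
 *	remove_watch_from_object(&obj->watchers, NULL, 0, true);
 */
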
/*
 * Remove all the watches that are contributory to a queue.  This has the
 * potential to race with removal of the watches by the destruction of the
 * objects being watched or with the distribution of notifications.
 */
void watch_queue_clear(struct watch_queue *wqueue)
{
	struct watch_list *wlist;
	struct watch *watch;
	bool release;

	rcu_read_lock();
	spin_lock_bh(&wqueue->lock);

	/* Prevent new notifications from being stored. */
	wqueue->defunct = true;

	while (!hlist_empty(&wqueue->watches)) {
		watch = hlist_entry(wqueue->watches.first, struct watch, queue_node);
		hlist_del_init_rcu(&watch->queue_node);
		/* We now own a ref on the watch. */
		spin_unlock_bh(&wqueue->lock);

		/* We can't do the next bit under the queue lock as we need to
		 * get the list lock - which would cause a deadlock if someone
		 * was removing from the opposite direction at the same time or
		 * posting a notification.
		 */
		wlist = rcu_dereference(watch->watch_list);
		if (wlist) {
			void (*release_watch)(struct watch *);

			spin_lock(&wlist->lock);

			release = !hlist_unhashed(&watch->list_node);
			if (release) {
				hlist_del_init_rcu(&watch->list_node);
				rcu_assign_pointer(watch->watch_list, NULL);

				/* We now own a second ref on the watch. */
			}

			release_watch = wlist->release_watch;
			spin_unlock(&wlist->lock);

			if (release) {
				if (release_watch) {
					rcu_read_unlock();
					/* This might need to call dput(), so
					 * we have to drop all the locks.
					 */
					(*release_watch)(watch);
					rcu_read_lock();
				}
				put_watch(watch);
			}
		}

		put_watch(watch);
		spin_lock_bh(&wqueue->lock);
	}

	spin_unlock_bh(&wqueue->lock);
	rcu_read_unlock();
}

/**
 * get_watch_queue - Get a watch queue from its file descriptor.
 * @fd: The fd to query.
 */
struct watch_queue *get_watch_queue(int fd)
{
	struct pipe_inode_info *pipe;
	struct watch_queue *wqueue = ERR_PTR(-EINVAL);
	struct fd f;

	f = fdget(fd);
	if (f.file) {
		pipe = get_pipe_info(f.file, false);
		if (pipe && pipe->watch_queue) {
			wqueue = pipe->watch_queue;
			kref_get(&wqueue->usage);
		}
		fdput(f);
	}

	return wqueue;
}
EXPORT_SYMBOL(get_watch_queue);

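/*
 * Lookup sketch (hypothetical caller; watch_fd is illustrative): resolve a
 * pipe fd supplied by userspace to its queue, and drop the ref when done:
 *
 *	wqueue = get_watch_queue(watch_fd);
 *	if (IS_ERR(wqueue))
 *		return PTR_ERR(wqueue);
 *	...
 *	put_watch_queue(wqueue);
 */
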
/*
 * Initialise a watch queue
 */
int watch_queue_init(struct pipe_inode_info *pipe)
{
	struct watch_queue *wqueue;

	wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL);
	if (!wqueue)
		return -ENOMEM;

	wqueue->pipe = pipe;
	kref_init(&wqueue->usage);
	spin_lock_init(&wqueue->lock);
	INIT_HLIST_HEAD(&wqueue->watches);

	pipe->watch_queue = wqueue;
	return 0;
}