// SPDX-License-Identifier: GPL-2.0
/*
 * dax: direct host memory access
 * Copyright (C) 2020 Red Hat, Inc.
 */

#include "fuse_i.h"

#include <linux/dax.h>
#include <linux/uio.h>
#include <linux/pfn_t.h>
#include <linux/iomap.h>
#include <linux/interval_tree.h>

/*
 * Default memory range size. A power of 2 so it agrees with common FUSE_INIT
 * map_alignment values 4KB and 64KB.
 */
#define FUSE_DAX_SHIFT 21
#define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT)
#define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE)

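/*
 * Note: with FUSE_DAX_SHIFT of 21 each DAX range covers 1 << 21 = 2MiB,
 * i.e. 512 pages on a system with 4KiB pages.
 */
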
/** Translation information for file offsets to DAX window offsets */
struct fuse_dax_mapping {
	/* Will connect in fcd->free_ranges to keep track of free memory */
	struct list_head list;

	/* For interval tree in file/inode */
	struct interval_tree_node itn;

	/** Position in DAX window */
	u64 window_offset;

	/** Length of mapping, in bytes */
	loff_t length;

	/* Is this mapping read-only or read-write */
	bool writable;
};

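/*
 * A fuse_dax_mapping normally lives on one of two structures: the
 * connection-wide free list (fcd->free_ranges) while unused, or a given
 * inode's interval tree (fi->dax->tree) once FUSE_SETUPMAPPING has
 * assigned it to a file range.
 */
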
/* Per-inode dax map */
struct fuse_inode_dax {
	/* Semaphore to protect modifications to the dmap tree */
	struct rw_semaphore sem;

	/* Sorted rb tree of struct fuse_dax_mapping elements */
	struct rb_root_cached tree;
	unsigned long nr;
};

struct fuse_conn_dax {
	/* DAX device */
	struct dax_device *dev;

	/* Lock protecting accesses to members of this structure */
	spinlock_t lock;

	/* DAX Window Free Ranges */
	long nr_free_ranges;
	struct list_head free_ranges;
};

static inline struct fuse_dax_mapping *
node_to_dmap(struct interval_tree_node *node)
{
	if (!node)
		return NULL;

	return container_of(node, struct fuse_dax_mapping, itn);
}

static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
{
	struct fuse_dax_mapping *dmap;

	spin_lock(&fcd->lock);
	dmap = list_first_entry_or_null(&fcd->free_ranges,
					struct fuse_dax_mapping, list);
	if (dmap) {
		list_del_init(&dmap->list);
		WARN_ON(fcd->nr_free_ranges <= 0);
		fcd->nr_free_ranges--;
	}
	spin_unlock(&fcd->lock);
	return dmap;
}

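/*
 * list_del_init() (rather than list_del()) is used above so that a dmap
 * taken off the free list can later be checked with list_empty() and put
 * back with list_add_tail() without tripping over poisoned pointers.
 */
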
/* This assumes fcd->lock is held */
static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
				    struct fuse_dax_mapping *dmap)
{
	list_add_tail(&dmap->list, &fcd->free_ranges);
	fcd->nr_free_ranges++;
}

static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
				  struct fuse_dax_mapping *dmap)
{
	/* Return fuse_dax_mapping to free list */
	spin_lock(&fcd->lock);
	__dmap_add_to_free_pool(fcd, dmap);
	spin_unlock(&fcd->lock);
}

static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx,
				  struct fuse_dax_mapping *dmap, bool writable,
				  bool upgrade)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_conn_dax *fcd = fc->dax;
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_setupmapping_in inarg;
	loff_t offset = start_idx << FUSE_DAX_SHIFT;
	FUSE_ARGS(args);
	ssize_t err;

	WARN_ON(fcd->nr_free_ranges < 0);

	/* Ask fuse daemon to setup mapping */
	memset(&inarg, 0, sizeof(inarg));
	inarg.foffset = offset;
	inarg.moffset = dmap->window_offset;
	inarg.len = FUSE_DAX_SZ;
	inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
	if (writable)
		inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
	args.opcode = FUSE_SETUPMAPPING;
	args.nodeid = fi->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	err = fuse_simple_request(fc, &args);
	if (err < 0)
		return err;
	dmap->writable = writable;
	if (!upgrade) {
		dmap->itn.start = dmap->itn.last = start_idx;
		/* Protected by fi->dax->sem */
		interval_tree_insert(&dmap->itn, &fi->dax->tree);
		fi->dax->nr++;
	}
	return 0;
}

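/*
 * On FUSE_SETUPMAPPING the server (e.g. virtiofsd) is expected to mmap
 * the file range [foffset, foffset + len) into the device's DAX window
 * at moffset, after which the guest can access that file range directly
 * through the window.
 */
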
static int fuse_send_removemapping(struct inode *inode,
				   struct fuse_removemapping_in *inargp,
				   struct fuse_removemapping_one *remove_one)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = get_fuse_conn(inode);
	FUSE_ARGS(args);

	args.opcode = FUSE_REMOVEMAPPING;
	args.nodeid = fi->nodeid;
	args.in_numargs = 2;
	args.in_args[0].size = sizeof(*inargp);
	args.in_args[0].value = inargp;
	args.in_args[1].size = inargp->count * sizeof(*remove_one);
	args.in_args[1].value = remove_one;
	return fuse_simple_request(fc, &args);
}

static int dmap_removemapping_list(struct inode *inode, unsigned int num,
				   struct list_head *to_remove)
{
	struct fuse_removemapping_one *remove_one, *ptr;
	struct fuse_removemapping_in inarg;
	struct fuse_dax_mapping *dmap;
	int ret, i = 0, nr_alloc;

	nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
	remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
	if (!remove_one)
		return -ENOMEM;

	ptr = remove_one;
	list_for_each_entry(dmap, to_remove, list) {
		ptr->moffset = dmap->window_offset;
		ptr->len = dmap->length;
		ptr++;
		i++;
		num--;
		if (i >= nr_alloc || num == 0) {
			memset(&inarg, 0, sizeof(inarg));
			inarg.count = i;
			ret = fuse_send_removemapping(inode, &inarg,
						      remove_one);
			if (ret)
				break;
			ptr = remove_one;
			i = 0;
		}
	}
	kfree(remove_one);
	return ret;
}

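/*
 * Entries are flushed to the server in batches of at most
 * FUSE_REMOVEMAPPING_MAX_ENTRY, so an arbitrarily long list can be
 * removed with one bounded allocation.
 */
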
/*
 * Cleanup dmap entry and add back to free list. This should be called with
 * fcd->lock held.
 */
static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd,
					 struct fuse_dax_mapping *dmap)
{
	pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n",
		 dmap->itn.start, dmap->itn.last, dmap->window_offset,
		 dmap->length);
	dmap->itn.start = dmap->itn.last = 0;
	__dmap_add_to_free_pool(fcd, dmap);
}

/*
 * Free inode dmap entries whose range falls inside [start, end].
 * Does not take any locks. At this point of time it should only be
 * called from evict_inode() path where we know all dmap entries can be
 * reclaimed.
 */
static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd,
				     struct inode *inode,
				     loff_t start, loff_t end)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_dax_mapping *dmap, *n;
	int err, num = 0;
	LIST_HEAD(to_remove);
	unsigned long start_idx = start >> FUSE_DAX_SHIFT;
	unsigned long end_idx = end >> FUSE_DAX_SHIFT;
	struct interval_tree_node *node;

	while (1) {
		node = interval_tree_iter_first(&fi->dax->tree, start_idx,
						end_idx);
		if (!node)
			break;
		dmap = node_to_dmap(node);
		interval_tree_remove(&dmap->itn, &fi->dax->tree);
		num++;
		list_add(&dmap->list, &to_remove);
	}

	/* Nothing to remove */
	if (list_empty(&to_remove))
		return;

	WARN_ON(fi->dax->nr < num);
	fi->dax->nr -= num;
	err = dmap_removemapping_list(inode, num, &to_remove);
	if (err && err != -ENOTCONN) {
		pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n",
			start, end);
	}
	spin_lock(&fcd->lock);
	list_for_each_entry_safe(dmap, n, &to_remove, list) {
		list_del_init(&dmap->list);
		dmap_reinit_add_to_free_pool(fcd, dmap);
	}
	spin_unlock(&fcd->lock);
}

/*
 * It is called from evict_inode() and by that time inode is going away. So
 * this function does not take any locks like fi->dax->sem for traversing
 * that fuse inode interval tree. If that lock is taken then lock validator
 * complains of deadlock situation w.r.t fs_reclaim lock.
 */
void fuse_dax_inode_cleanup(struct inode *inode)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);

	/*
	 * fuse_evict_inode() has already called truncate_inode_pages_final()
	 * before we arrive here. So we should not have to worry about any
	 * pages/exception entries still associated with inode.
	 */
	inode_reclaim_dmap_range(fc->dax, inode, 0, -1);
	WARN_ON(fi->dax->nr);
}

static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
{
	iomap->addr = IOMAP_NULL_ADDR;
	iomap->length = length;
	iomap->type = IOMAP_HOLE;
}

static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
			    struct iomap *iomap, struct fuse_dax_mapping *dmap,
			    unsigned int flags)
{
	loff_t offset, len;
	loff_t i_size = i_size_read(inode);

	offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT);
	len = min(length, dmap->length - offset);

	/* If length is beyond end of file, truncate further */
	if (pos + len > i_size)
		len = i_size - pos;

	if (len > 0) {
		iomap->addr = dmap->window_offset + offset;
		iomap->length = len;
		if (flags & IOMAP_FAULT)
			iomap->length = ALIGN(len, PAGE_SIZE);
		iomap->type = IOMAP_MAPPED;
	} else {
		/* Mapping beyond end of file is hole */
		fuse_fill_iomap_hole(iomap, length);
	}
}

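/*
 * Example: for pos = 0x250000 against a dmap whose itn.start is 1 (i.e.
 * the file range starting at 1 << 21 = 0x200000), offset works out to
 * 0x50000 and iomap->addr points 0x50000 bytes into that dmap's window.
 */
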
static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
				      loff_t length, unsigned int flags,
				      struct iomap *iomap)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_conn_dax *fcd = fc->dax;
	struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
	int ret;
	bool writable = flags & IOMAP_WRITE;
	unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
	struct interval_tree_node *node;

	alloc_dmap = alloc_dax_mapping(fcd);
	if (!alloc_dmap)
		return -EBUSY;

	/*
	 * Take write lock so that only one caller can try to setup mapping
	 * and others wait.
	 */
	down_write(&fi->dax->sem);

	/*
	 * We dropped lock. Check again if somebody else setup
	 * mapping already.
	 */
	node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
	if (node) {
		dmap = node_to_dmap(node);
		fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
		dmap_add_to_free_pool(fcd, alloc_dmap);
		up_write(&fi->dax->sem);
		return 0;
	}

	/* Setup one mapping */
	ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap,
				     writable, false);
	if (ret < 0) {
		dmap_add_to_free_pool(fcd, alloc_dmap);
		up_write(&fi->dax->sem);
		return ret;
	}
	fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
	up_write(&fi->dax->sem);
	return 0;
}

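/*
 * The recheck under the write lock above is the usual optimistic
 * allocation pattern: the range is grabbed from the free pool before
 * taking fi->dax->sem, so a concurrent caller may have set up the same
 * index first, in which case the pre-allocated range simply goes back
 * to the free pool.
 */
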
static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
				    loff_t length, unsigned int flags,
				    struct iomap *iomap)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_dax_mapping *dmap;
	int ret;
	unsigned long idx = pos >> FUSE_DAX_SHIFT;
	struct interval_tree_node *node;

	/*
	 * Take exclusive lock so that only one caller can try to setup
	 * mapping and others wait.
	 */
	down_write(&fi->dax->sem);
	node = interval_tree_iter_first(&fi->dax->tree, idx, idx);

	/* We are holding either inode lock or i_mmap_sem, and that should
	 * ensure that dmap can't be reclaimed or truncated and it should still
	 * be there in tree despite the fact we dropped and re-acquired the
	 * lock.
	 */
	ret = -EIO;
	if (WARN_ON(!node))
		goto out_err;

	dmap = node_to_dmap(node);

	/* Maybe another thread already upgraded mapping while we were not
	 * holding lock.
	 */
	if (dmap->writable) {
		ret = 0;
		goto out_fill_iomap;
	}

	ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true,
				     true);
	if (ret < 0)
		goto out_err;

out_fill_iomap:
	fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
out_err:
	up_write(&fi->dax->sem);
	return ret;
}

/*
 * This is just for DAX and the mapping is ephemeral, do not use it for other
 * purposes since there is no block device with a permanent mapping.
 */
static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
			    unsigned int flags, struct iomap *iomap,
			    struct iomap *srcmap)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_dax_mapping *dmap;
	bool writable = flags & IOMAP_WRITE;
	unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
	struct interval_tree_node *node;

	/* We don't support FIEMAP */
	if (WARN_ON(flags & IOMAP_REPORT))
		return -EIO;

	iomap->offset = pos;
	iomap->flags = 0;
	iomap->bdev = NULL;
	iomap->dax_dev = fc->dax->dev;

	/*
	 * Both read/write and mmap path can race here. So we need something
	 * to make sure if we are setting up mapping, then other path waits
	 * till mapping is setup completely.
	 *
	 * For now, use a semaphore for this. It probably needs to be
	 * optimized later.
	 */
	down_read(&fi->dax->sem);
	node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
	if (node) {
		dmap = node_to_dmap(node);
		if (writable && !dmap->writable) {
			/* Upgrade read-only mapping to read-write. This will
			 * require exclusive fi->dax->sem lock as we don't want
			 * two threads to be trying to this simultaneously
			 * for same dmap. So drop shared lock and acquire
			 * exclusive lock.
			 */
			up_read(&fi->dax->sem);
			pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n",
				 __func__, pos, length);
			return fuse_upgrade_dax_mapping(inode, pos, length,
							flags, iomap);
		}
		fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
		up_read(&fi->dax->sem);
		return 0;
	}

	up_read(&fi->dax->sem);
	pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
		 __func__, pos, length);
	if (pos >= i_size_read(inode))
		goto iomap_hole;

	return fuse_setup_new_dax_mapping(inode, pos, length, flags, iomap);

	/*
	 * If read beyond end of file happens, fs code seems to return
	 * it as hole
	 */
iomap_hole:
	fuse_fill_iomap_hole(iomap, length);
	pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n",
		 __func__, pos, length, iomap->length);
	return 0;
}

static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
			  ssize_t written, unsigned int flags,
			  struct iomap *iomap)
{
	/* DAX writes beyond end-of-file aren't handled using iomap, so the
	 * file size is unchanged and there is nothing to do here.
	 */
	return 0;
}

static const struct iomap_ops fuse_iomap_ops = {
	.iomap_begin = fuse_iomap_begin,
	.iomap_end = fuse_iomap_end,
};

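/*
 * dax_iomap_rw() drives the reads and writes below through these two
 * callbacks: iomap_begin() resolves a file range to a DAX window range
 * (setting one up on demand), the data copy then goes through dax_dev,
 * and iomap_end() is a no-op.
 */
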
ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}

	ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
	inode_unlock_shared(inode);

	/* TODO file_accessed(iocb->f_filp) */
	return ret;
}

static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	return (iov_iter_rw(from) == WRITE &&
		((iocb->ki_pos) >= i_size_read(inode) ||
		 (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode))));
}

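/*
 * E.g. with i_size = 4096, a 512-byte write at ki_pos = 4096 (append) or
 * at ki_pos = 3900 (crossing EOF) both count as extending writes; a
 * write entirely below EOF does not.
 */
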
static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
	ssize_t ret;

	ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
	if (ret < 0)
		return ret;

	fuse_invalidate_attr(inode);
	fuse_write_update_size(inode, iocb->ki_pos);
	return ret;
}

ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	ret = file_remove_privs(iocb->ki_filp);
	if (ret)
		goto out;
	/* TODO file_update_time() but we don't want metadata I/O */

	/* Do not use dax for file extending writes as write and on
	 * disk i_size increase are not atomic otherwise.
	 */
	if (file_extending_write(iocb, from))
		ret = fuse_dax_direct_write(iocb, from);
	else
		ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);

out:
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}

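/*
 * Extending writes are routed through the regular FUSE write path above
 * so that the data write and the i_size update are performed by the
 * server as one operation; going through DAX would let the memory copy
 * and the on-disk size change become visible independently.
 */
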
static void fuse_free_dax_mem_ranges(struct list_head *mem_list)
{
	struct fuse_dax_mapping *range, *temp;

	/* Free All allocated elements */
	list_for_each_entry_safe(range, temp, mem_list, list) {
		list_del(&range->list);
		kfree(range);
	}
}

void fuse_dax_conn_free(struct fuse_conn *fc)
{
	if (fc->dax) {
		fuse_free_dax_mem_ranges(&fc->dax->free_ranges);
		kfree(fc->dax);
	}
}

static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
{
	long nr_pages, nr_ranges;
	void *kaddr;
	pfn_t pfn;
	struct fuse_dax_mapping *range;
	int ret, id;
	size_t dax_size = -1;
	unsigned long i;

	INIT_LIST_HEAD(&fcd->free_ranges);
	id = dax_read_lock();
	nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), &kaddr,
				     &pfn);
	dax_read_unlock(id);
	if (nr_pages < 0) {
		pr_debug("dax_direct_access() returned %ld\n", nr_pages);
		return nr_pages;
	}

	nr_ranges = nr_pages / FUSE_DAX_PAGES;
	pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n",
		 __func__, nr_pages, nr_ranges);

	for (i = 0; i < nr_ranges; i++) {
		range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL);
		ret = -ENOMEM;
		if (!range)
			goto out_err;

		/* TODO: This offset only works if virtio-fs driver is not
		 * having some memory hidden at the beginning. This needs
		 * better handling.
		 */
		range->window_offset = i * FUSE_DAX_SZ;
		range->length = FUSE_DAX_SZ;
		list_add_tail(&range->list, &fcd->free_ranges);
	}

	fcd->nr_free_ranges = nr_ranges;
	return 0;
out_err:
	/* Free All allocated elements */
	fuse_free_dax_mem_ranges(&fcd->free_ranges);
	return ret;
}

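/*
 * Example: a 4GiB DAX window yields nr_pages = 1048576 (4KiB pages) and
 * nr_ranges = 1048576 / 512 = 2048 free ranges of 2MiB each.
 */
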
int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev)
{
	struct fuse_conn_dax *fcd;
	int err;

	if (!dax_dev)
		return 0;

	fcd = kzalloc(sizeof(*fcd), GFP_KERNEL);
	if (!fcd)
		return -ENOMEM;

	spin_lock_init(&fcd->lock);
	fcd->dev = dax_dev;
	err = fuse_dax_mem_range_init(fcd);
	if (err) {
		kfree(fcd);
		return err;
	}

	fc->dax = fcd;
	return 0;
}

bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
{
	struct fuse_conn *fc = get_fuse_conn_super(sb);

	fi->dax = NULL;
	if (fc->dax) {
		fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT);
		if (!fi->dax)
			return false;

		init_rwsem(&fi->dax->sem);
		fi->dax->tree = RB_ROOT_CACHED;
	}

	return true;
}

void fuse_dax_inode_init(struct inode *inode)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (!fc->dax)
		return;

	inode->i_flags |= S_DAX;
}

bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
{
	if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) {
		pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n",
			map_alignment, FUSE_DAX_SZ);
		return false;
	}
	return true;
}

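/*
 * Example: the map_alignment values mentioned at the top of this file,
 * 4KB (2^12) and 64KB (2^16), are both <= FUSE_DAX_SHIFT (21) and hence
 * compatible; a server demanding alignment above 2MiB would be rejected.
 */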