/*
 * Copyright(c) 2016 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/pfn_t.h>
#include <linux/hash.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "dax.h"
static dev_t dax_devt;
DEFINE_STATIC_SRCU(dax_srcu);
static struct class *dax_class;
static DEFINE_IDA(dax_minor_ida);
static int nr_dax = CONFIG_NR_DEV_DAX;
module_param(nr_dax, int, S_IRUGO);
static struct vfsmount *dax_mnt;
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;
MODULE_PARM_DESC(nr_dax, "max number of device-dax instances");
/**
 * struct dax_region - mapping infrastructure for dax devices
 * @id: kernel-wide unique region for a memory range
 * @ida: instance id allocator for child devices
 * @base: linear address corresponding to @res
 * @kref: to pin while other agents have a need to do lookups
 * @dev: parent device backing this region
 * @align: allocation and mapping alignment for child dax devices
 * @res: physical address range of the region
 * @pfn_flags: identify whether the pfns are paged back or not
 */
struct dax_region {
	int id;
	struct ida ida;
	void *base;
	struct kref kref;
	struct device *dev;
	unsigned int align;
	struct resource res;
	unsigned long pfn_flags;
};
/**
 * struct dax_dev - subdivision of a dax region
 * @region: parent region
 * @inode: inode shared via ->i_mapping by all mappings of this device
 * @dev: device backing the character device
 * @cdev: core chardev data
 * @alive: !alive + srcu grace period == no new mappings can be established
 * @id: child id in the region
 * @num_resources: number of physical address extents in this device
 * @res: array of physical address ranges
 */
struct dax_dev {
	struct dax_region *region;
	struct inode *inode;
	struct device dev;
	struct cdev cdev;
	bool alive;
	int id;
	int num_resources;
	struct resource res[0];
};
/*
 * Rely on the fact that drvdata is set before the attributes are
 * registered, and that the attributes are unregistered before drvdata
 * is cleared to assume that drvdata is always valid.
 */
static ssize_t id_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region = dev_get_drvdata(dev);

	return sprintf(buf, "%d\n", dax_region->id);
}
static DEVICE_ATTR_RO(id);
static ssize_t region_size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region = dev_get_drvdata(dev);

	return sprintf(buf, "%llu\n", (unsigned long long)
			resource_size(&dax_region->res));
}
static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
		region_size_show, NULL);
static ssize_t align_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region = dev_get_drvdata(dev);

	return sprintf(buf, "%u\n", dax_region->align);
}
static DEVICE_ATTR_RO(align);
static struct attribute *dax_region_attributes[] = {
	&dev_attr_region_size.attr,
	&dev_attr_align.attr,
	&dev_attr_id.attr,
	NULL,
};

static const struct attribute_group dax_region_attribute_group = {
	.name = "dax_region",
	.attrs = dax_region_attributes,
};

static const struct attribute_group *dax_region_attribute_groups[] = {
	&dax_region_attribute_group,
	NULL,
};
static struct inode *dax_alloc_inode(struct super_block *sb)
{
	return kmem_cache_alloc(dax_cache, GFP_KERNEL);
}

static void dax_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	kmem_cache_free(dax_cache, inode);
}

static void dax_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, dax_i_callback);
}

static const struct super_operations dax_sops = {
	.statfs = simple_statfs,
	.alloc_inode = dax_alloc_inode,
	.destroy_inode = dax_destroy_inode,
	.drop_inode = generic_delete_inode,
};
static struct dentry *dax_mount(struct file_system_type *fs_type,
		int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC);
}

static struct file_system_type dax_type = {
	.name = "dax",
	.mount = dax_mount,
	.kill_sb = kill_anon_super,
};
static int dax_test(struct inode *inode, void *data)
{
	return inode->i_cdev == data;
}

static int dax_set(struct inode *inode, void *data)
{
	inode->i_cdev = data;
	return 0;
}
static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt)
{
	struct inode *inode;

	inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
			dax_test, dax_set, cdev);
	if (!inode)
		return NULL;

	if (inode->i_state & I_NEW) {
		inode->i_mode = S_IFCHR;
		inode->i_flags = S_DAX;
		inode->i_rdev = devt;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		unlock_new_inode(inode);
	}
	return inode;
}
static void init_once(void *inode)
{
	inode_init_once(inode);
}

static int dax_inode_init(void)
{
	int rc;

	dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0,
			(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
			 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
			init_once);
	if (!dax_cache)
		return -ENOMEM;

	rc = register_filesystem(&dax_type);
	if (rc)
		goto err_register_fs;

	dax_mnt = kern_mount(&dax_type);
	if (IS_ERR(dax_mnt)) {
		rc = PTR_ERR(dax_mnt);
		goto err_mount;
	}
	dax_superblock = dax_mnt->mnt_sb;

	return 0;

 err_mount:
	unregister_filesystem(&dax_type);
 err_register_fs:
	kmem_cache_destroy(dax_cache);

	return rc;
}
static void dax_inode_exit(void)
{
	kern_unmount(dax_mnt);
	unregister_filesystem(&dax_type);
	kmem_cache_destroy(dax_cache);
}
static void dax_region_free(struct kref *kref)
{
	struct dax_region *dax_region;

	dax_region = container_of(kref, struct dax_region, kref);
	kfree(dax_region);
}

void dax_region_put(struct dax_region *dax_region)
{
	kref_put(&dax_region->kref, dax_region_free);
}
EXPORT_SYMBOL_GPL(dax_region_put);
static void dax_region_unregister(void *region)
{
	struct dax_region *dax_region = region;

	sysfs_remove_groups(&dax_region->dev->kobj,
			dax_region_attribute_groups);
	dax_region_put(dax_region);
}
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
		struct resource *res, unsigned int align, void *addr,
		unsigned long pfn_flags)
{
	struct dax_region *dax_region;

	/*
	 * The DAX core assumes that it can store its private data in
	 * parent->driver_data. This WARN is a reminder / safeguard for
	 * developers of device-dax drivers.
	 */
	if (dev_get_drvdata(parent)) {
		dev_WARN(parent, "dax core failed to setup private data\n");
		return NULL;
	}

	if (!IS_ALIGNED(res->start, align)
			|| !IS_ALIGNED(resource_size(res), align))
		return NULL;

	dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL);
	if (!dax_region)
		return NULL;

	dev_set_drvdata(parent, dax_region);
	memcpy(&dax_region->res, res, sizeof(*res));
	dax_region->pfn_flags = pfn_flags;
	kref_init(&dax_region->kref);
	dax_region->id = region_id;
	ida_init(&dax_region->ida);
	dax_region->align = align;
	dax_region->dev = parent;
	dax_region->base = addr;
	if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
		kfree(dax_region);
		return NULL;
	}

	kref_get(&dax_region->kref);
	if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region))
		return NULL;
	return dax_region;
}
EXPORT_SYMBOL_GPL(alloc_dax_region);
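
/*
 * A minimal probe sketch, not taken from this file: it shows how a
 * hypothetical device-dax driver (modeled on the dax_pmem driver) might
 * pair alloc_dax_region() with devm_create_dax_dev(), which is declared
 * in "dax.h" and defined below. The region id, alignment, resource, and
 * pfn flags are illustrative assumptions.
 */
#if 0
static int example_dax_probe(struct device *dev, struct resource *res,
		void *kaddr)
{
	struct dax_region *dax_region;
	struct dax_dev *dax_dev;

	/* region 0, 2M alignment, pfns backed by struct page */
	dax_region = alloc_dax_region(dev, 0, res, PMD_SIZE, kaddr,
			PFN_DEV | PFN_MAP);
	if (!dax_region)
		return -ENOMEM;

	/* the child device holds its own region reference; drop ours */
	dax_dev = devm_create_dax_dev(dax_region, res, 1);
	dax_region_put(dax_region);

	return PTR_ERR_OR_ZERO(dax_dev);
}
#endif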
static struct dax_dev *to_dax_dev(struct device *dev)
{
	return container_of(dev, struct dax_dev, dev);
}
static ssize_t size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_dev *dax_dev = to_dax_dev(dev);
	unsigned long long size = 0;
	int i;

	for (i = 0; i < dax_dev->num_resources; i++)
		size += resource_size(&dax_dev->res[i]);

	return sprintf(buf, "%llu\n", size);
}
static DEVICE_ATTR_RO(size);
static struct attribute *dax_device_attributes[] = {
	&dev_attr_size.attr,
	NULL,
};

static const struct attribute_group dax_device_attribute_group = {
	.attrs = dax_device_attributes,
};

static const struct attribute_group *dax_attribute_groups[] = {
	&dax_device_attribute_group,
	NULL,
};
static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
		const char *func)
{
	struct dax_region *dax_region = dax_dev->region;
	struct device *dev = &dax_dev->dev;
	unsigned long mask;

	if (!dax_dev->alive)
		return -ENXIO;

	/* prevent private mappings from being established */
	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
		dev_info(dev, "%s: %s: fail, attempted private mapping\n",
				current->comm, func);
		return -EINVAL;
	}

	mask = dax_region->align - 1;
	if (vma->vm_start & mask || vma->vm_end & mask) {
		dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
				current->comm, func, vma->vm_start, vma->vm_end,
				mask);
		return -EINVAL;
	}

	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
		dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
				current->comm, func);
		return -EINVAL;
	}

	if (!vma_is_dax(vma)) {
		dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
				current->comm, func);
		return -EINVAL;
	}

	return 0;
}
static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
		unsigned long size)
{
	struct resource *res;
	phys_addr_t phys;
	int i;

	for (i = 0; i < dax_dev->num_resources; i++) {
		res = &dax_dev->res[i];
		phys = pgoff * PAGE_SIZE + res->start;
		if (phys >= res->start && phys <= res->end)
			break;
		pgoff -= PHYS_PFN(resource_size(res));
	}

	if (i < dax_dev->num_resources) {
		res = &dax_dev->res[i];
		if (phys + size - 1 <= res->end)
			return phys;
	}

	return -1;
}
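
/*
 * Worked example with illustrative numbers (not from this file),
 * assuming 4K pages: given two extents res[0] = [0x100000000,
 * 0x1000fffff] (0x100 pages) and res[1] = [0x200000000, 0x2000fffff],
 * a lookup for pgoff 0x150 first tries res[0]:
 * 0x150 * PAGE_SIZE + 0x100000000 = 0x100150000, which is past
 * res[0]->end, so pgoff is reduced by res[0]'s 0x100 pages and the
 * search continues in res[1], yielding
 * phys = 0x50 * PAGE_SIZE + 0x200000000 = 0x200050000.
 */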
static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
		struct vm_fault *vmf)
{
	struct device *dev = &dax_dev->dev;
	struct dax_region *dax_region;
	int rc = VM_FAULT_SIGBUS;
	phys_addr_t phys;
	pfn_t pfn;
	unsigned int fault_size = PAGE_SIZE;

	if (check_vma(dax_dev, vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dax_dev->region;
	if (dax_region->align > PAGE_SIZE) {
		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size != dax_region->align)
		return VM_FAULT_SIGBUS;

	phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				vmf->pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	rc = vm_insert_mixed(vma, vmf->address, pfn);

	if (rc == -ENOMEM)
		return VM_FAULT_OOM;
	if (rc < 0 && rc != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}
static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int rc, id;
	struct file *filp = vma->vm_file;
	struct dax_dev *dax_dev = filp->private_data;

	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
			? "write" : "read", vma->vm_start, vma->vm_end);
	id = srcu_read_lock(&dax_srcu);
	rc = __dax_dev_fault(dax_dev, vma, vmf);
	srcu_read_unlock(&dax_srcu, id);

	return rc;
}
static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
		struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
		unsigned int flags)
{
	unsigned long pmd_addr = addr & PMD_MASK;
	struct device *dev = &dax_dev->dev;
	struct dax_region *dax_region;
	phys_addr_t phys;
	pgoff_t pgoff;
	pfn_t pfn;
	unsigned int fault_size = PMD_SIZE;

	if (check_vma(dax_dev, vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dax_dev->region;
	if (dax_region->align > PMD_SIZE) {
		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	/* dax pmd mappings require pfn_t_devmap() */
	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size < dax_region->align)
		return VM_FAULT_SIGBUS;
	else if (fault_size > dax_region->align)
		return VM_FAULT_FALLBACK;

	/* if we are outside of the VMA */
	if (pmd_addr < vma->vm_start ||
			(pmd_addr + PMD_SIZE) > vma->vm_end)
		return VM_FAULT_SIGBUS;

	pgoff = linear_page_index(vma, pmd_addr);
	phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
			flags & FAULT_FLAG_WRITE);
}
static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, unsigned int flags)
{
	int rc, id;
	struct file *filp = vma->vm_file;
	struct dax_dev *dax_dev = filp->private_data;

	dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
			current->comm, (flags & FAULT_FLAG_WRITE)
			? "write" : "read", vma->vm_start, vma->vm_end);

	id = srcu_read_lock(&dax_srcu);
	rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
	srcu_read_unlock(&dax_srcu, id);

	return rc;
}
static const struct vm_operations_struct dax_dev_vm_ops = {
	.fault = dax_dev_fault,
	.pmd_fault = dax_dev_pmd_fault,
};
static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct dax_dev *dax_dev = filp->private_data;
	int rc;

	dev_dbg(&dax_dev->dev, "%s\n", __func__);

	rc = check_vma(dax_dev, vma, __func__);
	if (rc)
		return rc;

	vma->vm_ops = &dax_dev_vm_ops;
	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	return 0;
}
/* return an unmapped area aligned to the dax region specified alignment */
static unsigned long dax_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len, unsigned long pgoff,
		unsigned long flags)
{
	unsigned long off, off_end, off_align, len_align, addr_align, align;
	struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
	struct dax_region *dax_region;

	if (!dax_dev || addr)
		goto out;

	dax_region = dax_dev->region;
	align = dax_region->align;
	off = pgoff << PAGE_SHIFT;
	off_end = off + len;
	off_align = round_up(off, align);

	if ((off_end <= off_align) || ((off_end - off_align) < align))
		goto out;

	len_align = len + align;
	if ((off + len_align) < off)
		goto out;

	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
			pgoff, flags);
	if (!IS_ERR_VALUE(addr_align)) {
		addr_align += (off - addr_align) & (align - 1);
		return addr_align;
	}
 out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
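
/*
 * Worked example with illustrative numbers: for a 2M region alignment,
 * off = 0, and len = 4M, the search above asks for len + 2M of address
 * space. If the mm returns addr_align = 0x7f0000123000, then
 * (off - addr_align) & (align - 1) = 0xdd000, placing the mapping at
 * 0x7f0000200000: 2M-aligned and congruent to the file offset modulo
 * the region alignment, so pmd faults can map whole huge pages.
 */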
static int dax_open(struct inode *inode, struct file *filp)
{
	struct dax_dev *dax_dev;

	dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev);
	dev_dbg(&dax_dev->dev, "%s\n", __func__);
	inode->i_mapping = dax_dev->inode->i_mapping;
	inode->i_mapping->host = dax_dev->inode;
	filp->f_mapping = inode->i_mapping;
	filp->private_data = dax_dev;
	inode->i_flags = S_DAX;

	return 0;
}
static int dax_release(struct inode *inode, struct file *filp)
{
	struct dax_dev *dax_dev = filp->private_data;

	dev_dbg(&dax_dev->dev, "%s\n", __func__);
	return 0;
}
static const struct file_operations dax_fops = {
	.llseek = noop_llseek,
	.owner = THIS_MODULE,
	.open = dax_open,
	.release = dax_release,
	.get_unmapped_area = dax_get_unmapped_area,
	.mmap = dax_mmap,
};
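
/*
 * Userspace usage sketch (illustrative, assuming a child device named
 * /dev/dax0.0 with a 2M region alignment): device-dax supports shared
 * mmap of the raw character device; no read()/write() methods are
 * provided.
 *
 *	int fd = open("/dev/dax0.0", O_RDWR);
 *	void *p = mmap(NULL, 1UL << 21, PROT_READ | PROT_WRITE,
 *			MAP_SHARED, fd, 0);
 *	// loads and stores through p go directly to device memory
 */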
static void dax_dev_release(struct device *dev)
{
	struct dax_dev *dax_dev = to_dax_dev(dev);
	struct dax_region *dax_region = dax_dev->region;

	ida_simple_remove(&dax_region->ida, dax_dev->id);
	ida_simple_remove(&dax_minor_ida, MINOR(dev->devt));
	dax_region_put(dax_region);
	iput(dax_dev->inode);
	kfree(dax_dev);
}
static void kill_dax_dev(struct dax_dev *dax_dev)
{
	struct cdev *cdev = &dax_dev->cdev;

	/*
	 * Note, rcu is not protecting the liveness of dax_dev, rcu is
	 * ensuring that any fault handlers that might have seen
	 * dax_dev->alive == true, have completed. Any fault handlers
	 * that start after synchronize_srcu() has started will abort
	 * upon seeing dax_dev->alive == false.
	 */
	dax_dev->alive = false;
	synchronize_srcu(&dax_srcu);
	unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1);
	cdev_del(cdev);
}
static void unregister_dax_dev(void *dev)
{
	struct dax_dev *dax_dev = to_dax_dev(dev);

	dev_dbg(dev, "%s\n", __func__);

	kill_dax_dev(dax_dev);
	device_unregister(dev);
}
struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region,
		struct resource *res, int count)
{
	struct device *parent = dax_region->dev;
	struct dax_dev *dax_dev;
	int rc = 0, minor, i;
	struct device *dev;
	struct cdev *cdev;
	dev_t dev_t;

	dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL);
	if (!dax_dev)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < count; i++) {
		if (!IS_ALIGNED(res[i].start, dax_region->align)
				|| !IS_ALIGNED(resource_size(&res[i]),
					dax_region->align)) {
			rc = -EINVAL;
			break;
		}
		dax_dev->res[i].start = res[i].start;
		dax_dev->res[i].end = res[i].end;
	}

	if (i < count)
		goto err_id;

	dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
	if (dax_dev->id < 0) {
		rc = dax_dev->id;
		goto err_id;
	}

	minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL);
	if (minor < 0) {
		rc = minor;
		goto err_minor;
	}

	dev_t = MKDEV(MAJOR(dax_devt), minor);
	dev = &dax_dev->dev;
	dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t);
	if (!dax_dev->inode) {
		rc = -ENOMEM;
		goto err_inode;
	}

	/* device_initialize() so cdev can reference kobj parent */
	device_initialize(dev);

	cdev = &dax_dev->cdev;
	cdev_init(cdev, &dax_fops);
	cdev->owner = parent->driver->owner;
	cdev->kobj.parent = &dev->kobj;
	rc = cdev_add(&dax_dev->cdev, dev_t, 1);
	if (rc)
		goto err_cdev;

	/* from here on we're committed to teardown via dax_dev_release() */
	dax_dev->num_resources = count;
	dax_dev->alive = true;
	dax_dev->region = dax_region;
	kref_get(&dax_region->kref);

	dev->devt = dev_t;
	dev->class = dax_class;
	dev->parent = parent;
	dev->groups = dax_attribute_groups;
	dev->release = dax_dev_release;
	dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id);
	rc = device_add(dev);
	if (rc) {
		kill_dax_dev(dax_dev);
		put_device(dev);
		return ERR_PTR(rc);
	}

	rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev);
	if (rc)
		return ERR_PTR(rc);

	return dax_dev;

 err_cdev:
	iput(dax_dev->inode);
 err_inode:
	ida_simple_remove(&dax_minor_ida, minor);
 err_minor:
	ida_simple_remove(&dax_region->ida, dax_dev->id);
 err_id:
	kfree(dax_dev);

	return ERR_PTR(rc);
}
EXPORT_SYMBOL_GPL(devm_create_dax_dev);
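
/*
 * Usage note: devm_create_dax_dev() takes its own reference on the
 * region (the kref_get() above), so a caller that created the region
 * with alloc_dax_region() typically drops its reference once the child
 * device exists, e.g. (sketch, assuming a hypothetical probe path):
 *
 *	dax_dev = devm_create_dax_dev(dax_region, res, 1);
 *	dax_region_put(dax_region);
 *	return PTR_ERR_OR_ZERO(dax_dev);
 */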
static int __init dax_init(void)
{
	int rc;

	rc = dax_inode_init();
	if (rc)
		return rc;

	nr_dax = max(nr_dax, 256);
	rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax");
	if (rc)
		goto err_chrdev;

	dax_class = class_create(THIS_MODULE, "dax");
	if (IS_ERR(dax_class)) {
		rc = PTR_ERR(dax_class);
		goto err_class;
	}

	return 0;

 err_class:
	unregister_chrdev_region(dax_devt, nr_dax);
 err_chrdev:
	dax_inode_exit();
	return rc;
}
static void __exit dax_exit(void)
{
	class_destroy(dax_class);
	unregister_chrdev_region(dax_devt, nr_dax);
	ida_destroy(&dax_minor_ida);
	dax_inode_exit();
}
MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_init);
module_exit(dax_exit);