]>
Commit | Line | Data |
---|---|---|
ab68f262 DW |
1 | /* |
2 | * Copyright(c) 2016 Intel Corporation. All rights reserved. | |
3 | * | |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of version 2 of the GNU General Public License as | |
6 | * published by the Free Software Foundation. | |
7 | * | |
8 | * This program is distributed in the hope that it will be useful, but | |
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
11 | * General Public License for more details. | |
12 | */ | |
13 | #include <linux/pagemap.h> | |
14 | #include <linux/module.h> | |
15 | #include <linux/device.h> | |
50d34394 | 16 | #include <linux/magic.h> |
3bc52c45 | 17 | #include <linux/mount.h> |
ab68f262 | 18 | #include <linux/pfn_t.h> |
3bc52c45 | 19 | #include <linux/hash.h> |
ba09c01d | 20 | #include <linux/cdev.h> |
ab68f262 DW |
21 | #include <linux/slab.h> |
22 | #include <linux/dax.h> | |
23 | #include <linux/fs.h> | |
24 | #include <linux/mm.h> | |
efebc711 | 25 | #include "dax-private.h" |
ccdb07f6 | 26 | #include "dax.h" |
ab68f262 | 27 | |
ba09c01d | 28 | static dev_t dax_devt; |
956a4cd2 | 29 | DEFINE_STATIC_SRCU(dax_srcu); |
ab68f262 DW |
30 | static struct class *dax_class; |
31 | static DEFINE_IDA(dax_minor_ida); | |
ba09c01d DW |
32 | static int nr_dax = CONFIG_NR_DEV_DAX; |
33 | module_param(nr_dax, int, S_IRUGO); | |
3bc52c45 DW |
34 | static struct vfsmount *dax_mnt; |
35 | static struct kmem_cache *dax_cache __read_mostly; | |
36 | static struct super_block *dax_superblock __read_mostly; | |
ba09c01d | 37 | MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); |
ab68f262 | 38 | |
565851c9 DW |
39 | /* |
40 | * Rely on the fact that drvdata is set before the attributes are | |
41 | * registered, and that the attributes are unregistered before drvdata | |
42 | * is cleared to assume that drvdata is always valid. | |
43 | */ | |
d7fe1a67 DW |
44 | static ssize_t id_show(struct device *dev, |
45 | struct device_attribute *attr, char *buf) | |
46 | { | |
565851c9 | 47 | struct dax_region *dax_region = dev_get_drvdata(dev); |
d7fe1a67 | 48 | |
565851c9 | 49 | return sprintf(buf, "%d\n", dax_region->id); |
d7fe1a67 DW |
50 | } |
51 | static DEVICE_ATTR_RO(id); | |
52 | ||
53 | static ssize_t region_size_show(struct device *dev, | |
54 | struct device_attribute *attr, char *buf) | |
55 | { | |
565851c9 | 56 | struct dax_region *dax_region = dev_get_drvdata(dev); |
d7fe1a67 | 57 | |
565851c9 DW |
58 | return sprintf(buf, "%llu\n", (unsigned long long) |
59 | resource_size(&dax_region->res)); | |
d7fe1a67 DW |
60 | } |
61 | static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, | |
62 | region_size_show, NULL); | |
63 | ||
64 | static ssize_t align_show(struct device *dev, | |
65 | struct device_attribute *attr, char *buf) | |
66 | { | |
565851c9 | 67 | struct dax_region *dax_region = dev_get_drvdata(dev); |
d7fe1a67 | 68 | |
565851c9 | 69 | return sprintf(buf, "%u\n", dax_region->align); |
d7fe1a67 DW |
70 | } |
71 | static DEVICE_ATTR_RO(align); | |
72 | ||
73 | static struct attribute *dax_region_attributes[] = { | |
74 | &dev_attr_region_size.attr, | |
75 | &dev_attr_align.attr, | |
76 | &dev_attr_id.attr, | |
77 | NULL, | |
78 | }; | |
79 | ||
80 | static const struct attribute_group dax_region_attribute_group = { | |
81 | .name = "dax_region", | |
82 | .attrs = dax_region_attributes, | |
83 | }; | |
84 | ||
85 | static const struct attribute_group *dax_region_attribute_groups[] = { | |
86 | &dax_region_attribute_group, | |
87 | NULL, | |
88 | }; | |
89 | ||
3bc52c45 | 90 | static struct inode *dax_alloc_inode(struct super_block *sb) |
ab68f262 | 91 | { |
3bc52c45 DW |
92 | return kmem_cache_alloc(dax_cache, GFP_KERNEL); |
93 | } | |
ab68f262 | 94 | |
3bc52c45 DW |
95 | static void dax_i_callback(struct rcu_head *head) |
96 | { | |
97 | struct inode *inode = container_of(head, struct inode, i_rcu); | |
98 | ||
99 | kmem_cache_free(dax_cache, inode); | |
ab68f262 DW |
100 | } |
101 | ||
3bc52c45 | 102 | static void dax_destroy_inode(struct inode *inode) |
ab68f262 | 103 | { |
3bc52c45 | 104 | call_rcu(&inode->i_rcu, dax_i_callback); |
ab68f262 | 105 | } |
ab68f262 | 106 | |
3bc52c45 DW |
107 | static const struct super_operations dax_sops = { |
108 | .statfs = simple_statfs, | |
109 | .alloc_inode = dax_alloc_inode, | |
110 | .destroy_inode = dax_destroy_inode, | |
111 | .drop_inode = generic_delete_inode, | |
112 | }; | |
113 | ||
114 | static struct dentry *dax_mount(struct file_system_type *fs_type, | |
115 | int flags, const char *dev_name, void *data) | |
ab68f262 | 116 | { |
3bc52c45 DW |
117 | return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); |
118 | } | |
ab68f262 | 119 | |
3bc52c45 DW |
120 | static struct file_system_type dax_type = { |
121 | .name = "dax", | |
122 | .mount = dax_mount, | |
123 | .kill_sb = kill_anon_super, | |
124 | }; | |
125 | ||
126 | static int dax_test(struct inode *inode, void *data) | |
127 | { | |
128 | return inode->i_cdev == data; | |
129 | } | |
130 | ||
131 | static int dax_set(struct inode *inode, void *data) | |
132 | { | |
133 | inode->i_cdev = data; | |
134 | return 0; | |
135 | } | |
136 | ||
137 | static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt) | |
138 | { | |
139 | struct inode *inode; | |
140 | ||
141 | inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), | |
142 | dax_test, dax_set, cdev); | |
143 | ||
144 | if (!inode) | |
145 | return NULL; | |
146 | ||
147 | if (inode->i_state & I_NEW) { | |
148 | inode->i_mode = S_IFCHR; | |
149 | inode->i_flags = S_DAX; | |
150 | inode->i_rdev = devt; | |
151 | mapping_set_gfp_mask(&inode->i_data, GFP_USER); | |
152 | unlock_new_inode(inode); | |
153 | } | |
154 | return inode; | |
155 | } | |
156 | ||
/* Slab constructor for dax_cache objects: one-time inode initialisation. */
static void init_once(void *inode)
{
	struct inode *obj = inode;

	inode_init_once(obj);
}
161 | ||
162 | static int dax_inode_init(void) | |
163 | { | |
164 | int rc; | |
165 | ||
166 | dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0, | |
167 | (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| | |
168 | SLAB_MEM_SPREAD|SLAB_ACCOUNT), | |
169 | init_once); | |
170 | if (!dax_cache) | |
171 | return -ENOMEM; | |
172 | ||
173 | rc = register_filesystem(&dax_type); | |
174 | if (rc) | |
175 | goto err_register_fs; | |
176 | ||
177 | dax_mnt = kern_mount(&dax_type); | |
178 | if (IS_ERR(dax_mnt)) { | |
179 | rc = PTR_ERR(dax_mnt); | |
180 | goto err_mount; | |
181 | } | |
182 | dax_superblock = dax_mnt->mnt_sb; | |
183 | ||
184 | return 0; | |
185 | ||
186 | err_mount: | |
187 | unregister_filesystem(&dax_type); | |
188 | err_register_fs: | |
189 | kmem_cache_destroy(dax_cache); | |
190 | ||
191 | return rc; | |
ab68f262 DW |
192 | } |
193 | ||
3bc52c45 DW |
194 | static void dax_inode_exit(void) |
195 | { | |
196 | kern_unmount(dax_mnt); | |
197 | unregister_filesystem(&dax_type); | |
198 | kmem_cache_destroy(dax_cache); | |
199 | } | |
200 | ||
ab68f262 DW |
201 | static void dax_region_free(struct kref *kref) |
202 | { | |
203 | struct dax_region *dax_region; | |
204 | ||
205 | dax_region = container_of(kref, struct dax_region, kref); | |
206 | kfree(dax_region); | |
207 | } | |
208 | ||
209 | void dax_region_put(struct dax_region *dax_region) | |
ab68f262 | 210 | { |
ab68f262 | 211 | kref_put(&dax_region->kref, dax_region_free); |
ab68f262 | 212 | } |
ab68f262 | 213 | EXPORT_SYMBOL_GPL(dax_region_put); |
ab68f262 | 214 | |
d7fe1a67 DW |
215 | static void dax_region_unregister(void *region) |
216 | { | |
217 | struct dax_region *dax_region = region; | |
218 | ||
219 | sysfs_remove_groups(&dax_region->dev->kobj, | |
220 | dax_region_attribute_groups); | |
221 | dax_region_put(dax_region); | |
222 | } | |
223 | ||
ab68f262 DW |
224 | struct dax_region *alloc_dax_region(struct device *parent, int region_id, |
225 | struct resource *res, unsigned int align, void *addr, | |
226 | unsigned long pfn_flags) | |
227 | { | |
228 | struct dax_region *dax_region; | |
229 | ||
d7fe1a67 DW |
230 | /* |
231 | * The DAX core assumes that it can store its private data in | |
232 | * parent->driver_data. This WARN is a reminder / safeguard for | |
233 | * developers of device-dax drivers. | |
234 | */ | |
235 | if (dev_get_drvdata(parent)) { | |
236 | dev_WARN(parent, "dax core failed to setup private data\n"); | |
237 | return NULL; | |
238 | } | |
239 | ||
9d2d01a0 DW |
240 | if (!IS_ALIGNED(res->start, align) |
241 | || !IS_ALIGNED(resource_size(res), align)) | |
242 | return NULL; | |
ab68f262 | 243 | |
9d2d01a0 | 244 | dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); |
ab68f262 DW |
245 | if (!dax_region) |
246 | return NULL; | |
247 | ||
d7fe1a67 | 248 | dev_set_drvdata(parent, dax_region); |
ab68f262 DW |
249 | memcpy(&dax_region->res, res, sizeof(*res)); |
250 | dax_region->pfn_flags = pfn_flags; | |
251 | kref_init(&dax_region->kref); | |
252 | dax_region->id = region_id; | |
253 | ida_init(&dax_region->ida); | |
254 | dax_region->align = align; | |
255 | dax_region->dev = parent; | |
256 | dax_region->base = addr; | |
d7fe1a67 DW |
257 | if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { |
258 | kfree(dax_region); | |
259 | return NULL;; | |
260 | } | |
ab68f262 | 261 | |
d7fe1a67 DW |
262 | kref_get(&dax_region->kref); |
263 | if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region)) | |
264 | return NULL; | |
ab68f262 DW |
265 | return dax_region; |
266 | } | |
267 | EXPORT_SYMBOL_GPL(alloc_dax_region); | |
268 | ||
ebd84d72 DW |
269 | static struct dax_dev *to_dax_dev(struct device *dev) |
270 | { | |
271 | return container_of(dev, struct dax_dev, dev); | |
272 | } | |
273 | ||
ab68f262 DW |
274 | static ssize_t size_show(struct device *dev, |
275 | struct device_attribute *attr, char *buf) | |
276 | { | |
ebd84d72 | 277 | struct dax_dev *dax_dev = to_dax_dev(dev); |
ab68f262 DW |
278 | unsigned long long size = 0; |
279 | int i; | |
280 | ||
281 | for (i = 0; i < dax_dev->num_resources; i++) | |
282 | size += resource_size(&dax_dev->res[i]); | |
283 | ||
284 | return sprintf(buf, "%llu\n", size); | |
285 | } | |
286 | static DEVICE_ATTR_RO(size); | |
287 | ||
288 | static struct attribute *dax_device_attributes[] = { | |
289 | &dev_attr_size.attr, | |
290 | NULL, | |
291 | }; | |
292 | ||
293 | static const struct attribute_group dax_device_attribute_group = { | |
294 | .attrs = dax_device_attributes, | |
295 | }; | |
296 | ||
297 | static const struct attribute_group *dax_attribute_groups[] = { | |
298 | &dax_device_attribute_group, | |
299 | NULL, | |
300 | }; | |
301 | ||
dee41079 DW |
302 | static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, |
303 | const char *func) | |
304 | { | |
305 | struct dax_region *dax_region = dax_dev->region; | |
ebd84d72 | 306 | struct device *dev = &dax_dev->dev; |
dee41079 DW |
307 | unsigned long mask; |
308 | ||
309 | if (!dax_dev->alive) | |
310 | return -ENXIO; | |
311 | ||
4cb19355 | 312 | /* prevent private mappings from being established */ |
325896ff | 313 | if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { |
dee41079 DW |
314 | dev_info(dev, "%s: %s: fail, attempted private mapping\n", |
315 | current->comm, func); | |
316 | return -EINVAL; | |
317 | } | |
318 | ||
319 | mask = dax_region->align - 1; | |
320 | if (vma->vm_start & mask || vma->vm_end & mask) { | |
321 | dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", | |
322 | current->comm, func, vma->vm_start, vma->vm_end, | |
323 | mask); | |
324 | return -EINVAL; | |
325 | } | |
326 | ||
327 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV | |
328 | && (vma->vm_flags & VM_DONTCOPY) == 0) { | |
329 | dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n", | |
330 | current->comm, func); | |
331 | return -EINVAL; | |
332 | } | |
333 | ||
334 | if (!vma_is_dax(vma)) { | |
335 | dev_info(dev, "%s: %s: fail, vma is not DAX capable\n", | |
336 | current->comm, func); | |
337 | return -EINVAL; | |
338 | } | |
339 | ||
340 | return 0; | |
341 | } | |
342 | ||
efebc711 DJ |
343 | /* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */ |
344 | __weak phys_addr_t dax_pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, | |
dee41079 DW |
345 | unsigned long size) |
346 | { | |
347 | struct resource *res; | |
348 | phys_addr_t phys; | |
349 | int i; | |
350 | ||
351 | for (i = 0; i < dax_dev->num_resources; i++) { | |
352 | res = &dax_dev->res[i]; | |
353 | phys = pgoff * PAGE_SIZE + res->start; | |
354 | if (phys >= res->start && phys <= res->end) | |
355 | break; | |
356 | pgoff -= PHYS_PFN(resource_size(res)); | |
357 | } | |
358 | ||
359 | if (i < dax_dev->num_resources) { | |
360 | res = &dax_dev->res[i]; | |
361 | if (phys + size - 1 <= res->end) | |
362 | return phys; | |
363 | } | |
364 | ||
365 | return -1; | |
366 | } | |
367 | ||
a2d58167 | 368 | static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) |
dee41079 | 369 | { |
ebd84d72 | 370 | struct device *dev = &dax_dev->dev; |
dee41079 DW |
371 | struct dax_region *dax_region; |
372 | int rc = VM_FAULT_SIGBUS; | |
373 | phys_addr_t phys; | |
374 | pfn_t pfn; | |
0134ed4f | 375 | unsigned int fault_size = PAGE_SIZE; |
dee41079 | 376 | |
11bac800 | 377 | if (check_vma(dax_dev, vmf->vma, __func__)) |
dee41079 DW |
378 | return VM_FAULT_SIGBUS; |
379 | ||
380 | dax_region = dax_dev->region; | |
381 | if (dax_region->align > PAGE_SIZE) { | |
382 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
383 | return VM_FAULT_SIGBUS; | |
384 | } | |
385 | ||
0134ed4f DJ |
386 | if (fault_size != dax_region->align) |
387 | return VM_FAULT_SIGBUS; | |
388 | ||
efebc711 | 389 | phys = dax_pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); |
dee41079 | 390 | if (phys == -1) { |
52084f89 | 391 | dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, |
dee41079 DW |
392 | vmf->pgoff); |
393 | return VM_FAULT_SIGBUS; | |
394 | } | |
395 | ||
396 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | |
397 | ||
11bac800 | 398 | rc = vm_insert_mixed(vmf->vma, vmf->address, pfn); |
dee41079 DW |
399 | |
400 | if (rc == -ENOMEM) | |
401 | return VM_FAULT_OOM; | |
402 | if (rc < 0 && rc != -EBUSY) | |
403 | return VM_FAULT_SIGBUS; | |
404 | ||
405 | return VM_FAULT_NOPAGE; | |
406 | } | |
407 | ||
f4200391 | 408 | static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) |
dee41079 | 409 | { |
d8a849e1 | 410 | unsigned long pmd_addr = vmf->address & PMD_MASK; |
ebd84d72 | 411 | struct device *dev = &dax_dev->dev; |
dee41079 DW |
412 | struct dax_region *dax_region; |
413 | phys_addr_t phys; | |
414 | pgoff_t pgoff; | |
415 | pfn_t pfn; | |
0134ed4f | 416 | unsigned int fault_size = PMD_SIZE; |
dee41079 | 417 | |
f4200391 | 418 | if (check_vma(dax_dev, vmf->vma, __func__)) |
dee41079 DW |
419 | return VM_FAULT_SIGBUS; |
420 | ||
421 | dax_region = dax_dev->region; | |
422 | if (dax_region->align > PMD_SIZE) { | |
423 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
424 | return VM_FAULT_SIGBUS; | |
425 | } | |
426 | ||
427 | /* dax pmd mappings require pfn_t_devmap() */ | |
428 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { | |
429 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
430 | return VM_FAULT_SIGBUS; | |
431 | } | |
432 | ||
0134ed4f DJ |
433 | if (fault_size < dax_region->align) |
434 | return VM_FAULT_SIGBUS; | |
435 | else if (fault_size > dax_region->align) | |
436 | return VM_FAULT_FALLBACK; | |
437 | ||
438 | /* if we are outside of the VMA */ | |
439 | if (pmd_addr < vmf->vma->vm_start || | |
440 | (pmd_addr + PMD_SIZE) > vmf->vma->vm_end) | |
441 | return VM_FAULT_SIGBUS; | |
442 | ||
f4200391 | 443 | pgoff = linear_page_index(vmf->vma, pmd_addr); |
efebc711 | 444 | phys = dax_pgoff_to_phys(dax_dev, pgoff, PMD_SIZE); |
dee41079 | 445 | if (phys == -1) { |
52084f89 | 446 | dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, |
dee41079 DW |
447 | pgoff); |
448 | return VM_FAULT_SIGBUS; | |
449 | } | |
450 | ||
451 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | |
452 | ||
f4200391 | 453 | return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, |
d8a849e1 | 454 | vmf->flags & FAULT_FLAG_WRITE); |
dee41079 DW |
455 | } |
456 | ||
9557feee DJ |
457 | #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD |
458 | static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) | |
459 | { | |
460 | unsigned long pud_addr = vmf->address & PUD_MASK; | |
461 | struct device *dev = &dax_dev->dev; | |
462 | struct dax_region *dax_region; | |
463 | phys_addr_t phys; | |
464 | pgoff_t pgoff; | |
465 | pfn_t pfn; | |
70b085b0 DJ |
466 | unsigned int fault_size = PUD_SIZE; |
467 | ||
9557feee DJ |
468 | |
469 | if (check_vma(dax_dev, vmf->vma, __func__)) | |
470 | return VM_FAULT_SIGBUS; | |
471 | ||
472 | dax_region = dax_dev->region; | |
473 | if (dax_region->align > PUD_SIZE) { | |
474 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
475 | return VM_FAULT_SIGBUS; | |
476 | } | |
477 | ||
478 | /* dax pud mappings require pfn_t_devmap() */ | |
479 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { | |
480 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
481 | return VM_FAULT_SIGBUS; | |
482 | } | |
483 | ||
70b085b0 DJ |
484 | if (fault_size < dax_region->align) |
485 | return VM_FAULT_SIGBUS; | |
486 | else if (fault_size > dax_region->align) | |
487 | return VM_FAULT_FALLBACK; | |
488 | ||
489 | /* if we are outside of the VMA */ | |
490 | if (pud_addr < vmf->vma->vm_start || | |
491 | (pud_addr + PUD_SIZE) > vmf->vma->vm_end) | |
492 | return VM_FAULT_SIGBUS; | |
493 | ||
9557feee | 494 | pgoff = linear_page_index(vmf->vma, pud_addr); |
efebc711 | 495 | phys = dax_pgoff_to_phys(dax_dev, pgoff, PUD_SIZE); |
9557feee | 496 | if (phys == -1) { |
52084f89 | 497 | dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, |
9557feee DJ |
498 | pgoff); |
499 | return VM_FAULT_SIGBUS; | |
500 | } | |
501 | ||
502 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | |
503 | ||
504 | return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn, | |
505 | vmf->flags & FAULT_FLAG_WRITE); | |
506 | } | |
507 | #else | |
508 | static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) | |
509 | { | |
510 | return VM_FAULT_FALLBACK; | |
511 | } | |
512 | #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ | |
513 | ||
c791ace1 DJ |
514 | static int dax_dev_huge_fault(struct vm_fault *vmf, |
515 | enum page_entry_size pe_size) | |
dee41079 | 516 | { |
956a4cd2 | 517 | int rc, id; |
f4200391 | 518 | struct file *filp = vmf->vma->vm_file; |
dee41079 DW |
519 | struct dax_dev *dax_dev = filp->private_data; |
520 | ||
ebd84d72 | 521 | dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, |
d8a849e1 | 522 | current->comm, (vmf->flags & FAULT_FLAG_WRITE) |
f4200391 DJ |
523 | ? "write" : "read", |
524 | vmf->vma->vm_start, vmf->vma->vm_end); | |
dee41079 | 525 | |
956a4cd2 | 526 | id = srcu_read_lock(&dax_srcu); |
c791ace1 DJ |
527 | switch (pe_size) { |
528 | case PE_SIZE_PTE: | |
a2d58167 DJ |
529 | rc = __dax_dev_pte_fault(dax_dev, vmf); |
530 | break; | |
c791ace1 | 531 | case PE_SIZE_PMD: |
a2d58167 | 532 | rc = __dax_dev_pmd_fault(dax_dev, vmf); |
9557feee | 533 | break; |
c791ace1 | 534 | case PE_SIZE_PUD: |
9557feee | 535 | rc = __dax_dev_pud_fault(dax_dev, vmf); |
a2d58167 DJ |
536 | break; |
537 | default: | |
538 | return VM_FAULT_FALLBACK; | |
539 | } | |
956a4cd2 | 540 | srcu_read_unlock(&dax_srcu, id); |
dee41079 DW |
541 | |
542 | return rc; | |
543 | } | |
544 | ||
c791ace1 DJ |
545 | static int dax_dev_fault(struct vm_fault *vmf) |
546 | { | |
547 | return dax_dev_huge_fault(vmf, PE_SIZE_PTE); | |
548 | } | |
549 | ||
dee41079 DW |
550 | static const struct vm_operations_struct dax_dev_vm_ops = { |
551 | .fault = dax_dev_fault, | |
c791ace1 | 552 | .huge_fault = dax_dev_huge_fault, |
dee41079 DW |
553 | }; |
554 | ||
af69f51e | 555 | static int dax_mmap(struct file *filp, struct vm_area_struct *vma) |
dee41079 DW |
556 | { |
557 | struct dax_dev *dax_dev = filp->private_data; | |
558 | int rc; | |
559 | ||
ebd84d72 | 560 | dev_dbg(&dax_dev->dev, "%s\n", __func__); |
dee41079 DW |
561 | |
562 | rc = check_vma(dax_dev, vma, __func__); | |
563 | if (rc) | |
564 | return rc; | |
565 | ||
dee41079 DW |
566 | vma->vm_ops = &dax_dev_vm_ops; |
567 | vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; | |
568 | return 0; | |
043a9255 DW |
569 | } |
570 | ||
571 | /* return an unmapped area aligned to the dax region specified alignment */ | |
af69f51e | 572 | static unsigned long dax_get_unmapped_area(struct file *filp, |
043a9255 DW |
573 | unsigned long addr, unsigned long len, unsigned long pgoff, |
574 | unsigned long flags) | |
575 | { | |
576 | unsigned long off, off_end, off_align, len_align, addr_align, align; | |
577 | struct dax_dev *dax_dev = filp ? filp->private_data : NULL; | |
578 | struct dax_region *dax_region; | |
579 | ||
580 | if (!dax_dev || addr) | |
581 | goto out; | |
582 | ||
583 | dax_region = dax_dev->region; | |
584 | align = dax_region->align; | |
585 | off = pgoff << PAGE_SHIFT; | |
586 | off_end = off + len; | |
587 | off_align = round_up(off, align); | |
588 | ||
589 | if ((off_end <= off_align) || ((off_end - off_align) < align)) | |
590 | goto out; | |
591 | ||
592 | len_align = len + align; | |
593 | if ((off + len_align) < off) | |
594 | goto out; | |
dee41079 | 595 | |
043a9255 DW |
596 | addr_align = current->mm->get_unmapped_area(filp, addr, len_align, |
597 | pgoff, flags); | |
598 | if (!IS_ERR_VALUE(addr_align)) { | |
599 | addr_align += (off - addr_align) & (align - 1); | |
600 | return addr_align; | |
601 | } | |
602 | out: | |
603 | return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); | |
604 | } | |
605 | ||
af69f51e | 606 | static int dax_open(struct inode *inode, struct file *filp) |
043a9255 | 607 | { |
ba09c01d | 608 | struct dax_dev *dax_dev; |
043a9255 | 609 | |
ba09c01d DW |
610 | dax_dev = container_of(inode->i_cdev, struct dax_dev, cdev); |
611 | dev_dbg(&dax_dev->dev, "%s\n", __func__); | |
3bc52c45 DW |
612 | inode->i_mapping = dax_dev->inode->i_mapping; |
613 | inode->i_mapping->host = dax_dev->inode; | |
614 | filp->f_mapping = inode->i_mapping; | |
ebd84d72 DW |
615 | filp->private_data = dax_dev; |
616 | inode->i_flags = S_DAX; | |
043a9255 | 617 | |
043a9255 DW |
618 | return 0; |
619 | } | |
dee41079 | 620 | |
af69f51e | 621 | static int dax_release(struct inode *inode, struct file *filp) |
043a9255 DW |
622 | { |
623 | struct dax_dev *dax_dev = filp->private_data; | |
043a9255 | 624 | |
ba09c01d | 625 | dev_dbg(&dax_dev->dev, "%s\n", __func__); |
043a9255 | 626 | return 0; |
dee41079 DW |
627 | } |
628 | ||
ab68f262 DW |
629 | static const struct file_operations dax_fops = { |
630 | .llseek = noop_llseek, | |
631 | .owner = THIS_MODULE, | |
af69f51e DW |
632 | .open = dax_open, |
633 | .release = dax_release, | |
634 | .get_unmapped_area = dax_get_unmapped_area, | |
635 | .mmap = dax_mmap, | |
ab68f262 DW |
636 | }; |
637 | ||
ebd84d72 | 638 | static void dax_dev_release(struct device *dev) |
043a9255 | 639 | { |
ebd84d72 | 640 | struct dax_dev *dax_dev = to_dax_dev(dev); |
043a9255 DW |
641 | struct dax_region *dax_region = dax_dev->region; |
642 | ||
ebd84d72 DW |
643 | ida_simple_remove(&dax_region->ida, dax_dev->id); |
644 | ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); | |
645 | dax_region_put(dax_region); | |
3bc52c45 | 646 | iput(dax_dev->inode); |
ebd84d72 DW |
647 | kfree(dax_dev); |
648 | } | |
649 | ||
650 | static void unregister_dax_dev(void *dev) | |
651 | { | |
652 | struct dax_dev *dax_dev = to_dax_dev(dev); | |
ba09c01d | 653 | struct cdev *cdev = &dax_dev->cdev; |
ebd84d72 | 654 | |
043a9255 DW |
655 | dev_dbg(dev, "%s\n", __func__); |
656 | ||
657 | /* | |
658 | * Note, rcu is not protecting the liveness of dax_dev, rcu is | |
659 | * ensuring that any fault handlers that might have seen | |
660 | * dax_dev->alive == true, have completed. Any fault handlers | |
956a4cd2 | 661 | * that start after synchronize_srcu() has started will abort |
043a9255 DW |
662 | * upon seeing dax_dev->alive == false. |
663 | */ | |
664 | dax_dev->alive = false; | |
956a4cd2 | 665 | synchronize_srcu(&dax_srcu); |
9dc1e492 | 666 | unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); |
ba09c01d | 667 | cdev_del(cdev); |
043a9255 | 668 | device_unregister(dev); |
043a9255 DW |
669 | } |
670 | ||
d76911ee DW |
671 | struct dax_dev *devm_create_dax_dev(struct dax_region *dax_region, |
672 | struct resource *res, int count) | |
043a9255 DW |
673 | { |
674 | struct device *parent = dax_region->dev; | |
675 | struct dax_dev *dax_dev; | |
9d2d01a0 | 676 | int rc = 0, minor, i; |
043a9255 | 677 | struct device *dev; |
ba09c01d | 678 | struct cdev *cdev; |
043a9255 DW |
679 | dev_t dev_t; |
680 | ||
681 | dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); | |
682 | if (!dax_dev) | |
d76911ee | 683 | return ERR_PTR(-ENOMEM); |
043a9255 | 684 | |
9d2d01a0 DW |
685 | for (i = 0; i < count; i++) { |
686 | if (!IS_ALIGNED(res[i].start, dax_region->align) | |
687 | || !IS_ALIGNED(resource_size(&res[i]), | |
688 | dax_region->align)) { | |
689 | rc = -EINVAL; | |
690 | break; | |
691 | } | |
692 | dax_dev->res[i].start = res[i].start; | |
693 | dax_dev->res[i].end = res[i].end; | |
694 | } | |
695 | ||
696 | if (i < count) | |
697 | goto err_id; | |
698 | ||
043a9255 DW |
699 | dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); |
700 | if (dax_dev->id < 0) { | |
701 | rc = dax_dev->id; | |
702 | goto err_id; | |
703 | } | |
704 | ||
705 | minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); | |
706 | if (minor < 0) { | |
707 | rc = minor; | |
708 | goto err_minor; | |
709 | } | |
710 | ||
bc0a0fe9 AB |
711 | dev_t = MKDEV(MAJOR(dax_devt), minor); |
712 | dev = &dax_dev->dev; | |
3bc52c45 DW |
713 | dax_dev->inode = dax_inode_get(&dax_dev->cdev, dev_t); |
714 | if (!dax_dev->inode) { | |
715 | rc = -ENOMEM; | |
716 | goto err_inode; | |
717 | } | |
718 | ||
ba09c01d | 719 | /* device_initialize() so cdev can reference kobj parent */ |
ebd84d72 | 720 | device_initialize(dev); |
ba09c01d DW |
721 | |
722 | cdev = &dax_dev->cdev; | |
723 | cdev_init(cdev, &dax_fops); | |
724 | cdev->owner = parent->driver->owner; | |
725 | cdev->kobj.parent = &dev->kobj; | |
726 | rc = cdev_add(&dax_dev->cdev, dev_t, 1); | |
727 | if (rc) | |
728 | goto err_cdev; | |
729 | ||
730 | /* from here on we're committed to teardown via dax_dev_release() */ | |
ba09c01d DW |
731 | dax_dev->num_resources = count; |
732 | dax_dev->alive = true; | |
733 | dax_dev->region = dax_region; | |
734 | kref_get(&dax_region->kref); | |
735 | ||
ebd84d72 DW |
736 | dev->devt = dev_t; |
737 | dev->class = dax_class; | |
738 | dev->parent = parent; | |
739 | dev->groups = dax_attribute_groups; | |
740 | dev->release = dax_dev_release; | |
741 | dev_set_name(dev, "dax%d.%d", dax_region->id, dax_dev->id); | |
742 | rc = device_add(dev); | |
743 | if (rc) { | |
744 | put_device(dev); | |
d76911ee | 745 | return ERR_PTR(rc); |
ebd84d72 | 746 | } |
043a9255 | 747 | |
d76911ee DW |
748 | rc = devm_add_action_or_reset(dax_region->dev, unregister_dax_dev, dev); |
749 | if (rc) | |
750 | return ERR_PTR(rc); | |
751 | ||
752 | return dax_dev; | |
043a9255 | 753 | |
ba09c01d | 754 | err_cdev: |
3bc52c45 DW |
755 | iput(dax_dev->inode); |
756 | err_inode: | |
ba09c01d | 757 | ida_simple_remove(&dax_minor_ida, minor); |
043a9255 DW |
758 | err_minor: |
759 | ida_simple_remove(&dax_region->ida, dax_dev->id); | |
760 | err_id: | |
ebd84d72 | 761 | kfree(dax_dev); |
043a9255 | 762 | |
d76911ee | 763 | return ERR_PTR(rc); |
043a9255 DW |
764 | } |
765 | EXPORT_SYMBOL_GPL(devm_create_dax_dev); | |
766 | ||
ab68f262 DW |
767 | static int __init dax_init(void) |
768 | { | |
769 | int rc; | |
770 | ||
3bc52c45 DW |
771 | rc = dax_inode_init(); |
772 | if (rc) | |
ab68f262 | 773 | return rc; |
3bc52c45 | 774 | |
ba09c01d DW |
775 | nr_dax = max(nr_dax, 256); |
776 | rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); | |
777 | if (rc) | |
3bc52c45 | 778 | goto err_chrdev; |
ab68f262 DW |
779 | |
780 | dax_class = class_create(THIS_MODULE, "dax"); | |
781 | if (IS_ERR(dax_class)) { | |
3bc52c45 DW |
782 | rc = PTR_ERR(dax_class); |
783 | goto err_class; | |
ab68f262 DW |
784 | } |
785 | ||
786 | return 0; | |
3bc52c45 DW |
787 | |
788 | err_class: | |
789 | unregister_chrdev_region(dax_devt, nr_dax); | |
790 | err_chrdev: | |
791 | dax_inode_exit(); | |
792 | return rc; | |
ab68f262 DW |
793 | } |
794 | ||
795 | static void __exit dax_exit(void) | |
796 | { | |
797 | class_destroy(dax_class); | |
ba09c01d | 798 | unregister_chrdev_region(dax_devt, nr_dax); |
ab68f262 | 799 | ida_destroy(&dax_minor_ida); |
3bc52c45 | 800 | dax_inode_exit(); |
ab68f262 DW |
801 | } |
802 | ||
803 | MODULE_AUTHOR("Intel Corporation"); | |
804 | MODULE_LICENSE("GPL v2"); | |
805 | subsys_initcall(dax_init); | |
806 | module_exit(dax_exit); |