]>
Commit | Line | Data |
---|---|---|
ab68f262 DW |
1 | /* |
2 | * Copyright(c) 2016 Intel Corporation. All rights reserved. | |
3 | * | |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of version 2 of the GNU General Public License as | |
6 | * published by the Free Software Foundation. | |
7 | * | |
8 | * This program is distributed in the hope that it will be useful, but | |
9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
11 | * General Public License for more details. | |
12 | */ | |
13 | #include <linux/pagemap.h> | |
14 | #include <linux/module.h> | |
15 | #include <linux/device.h> | |
16 | #include <linux/pfn_t.h> | |
17 | #include <linux/slab.h> | |
18 | #include <linux/dax.h> | |
19 | #include <linux/fs.h> | |
20 | #include <linux/mm.h> | |
21 | ||
/* char-device major number; set from register_chrdev() in dax_init() */
static int dax_major;
/* device class that every dax device is created under */
static struct class *dax_class;
/* allocator for char-device minor numbers, shared by all regions */
static DEFINE_IDA(dax_minor_ida);
25 | ||
/**
 * struct dax_region - mapping infrastructure for dax devices
 * @id: kernel-wide unique region for a memory range
 * @ida: allocator for the ids of the child dax devices in this region
 * @base: linear address corresponding to @res
 * @kref: to pin while other agents have a need to do lookups
 * @dev: parent device backing this region
 * @align: allocation and mapping alignment for child dax devices
 * @res: physical address range of the region
 * @pfn_flags: identify whether the pfns are paged back or not
 */
struct dax_region {
	int id;
	struct ida ida;
	void *base;
	struct kref kref;
	struct device *dev;
	unsigned int align;
	struct resource res;
	unsigned long pfn_flags;
};
46 | ||
47 | /** | |
48 | * struct dax_dev - subdivision of a dax region | |
49 | * @region - parent region | |
50 | * @dev - device backing the character device | |
51 | * @kref - enable this data to be tracked in filp->private_data | |
dee41079 | 52 | * @alive - !alive + rcu grace period == no new mappings can be established |
ab68f262 DW |
53 | * @id - child id in the region |
54 | * @num_resources - number of physical address extents in this device | |
55 | * @res - array of physical address ranges | |
56 | */ | |
57 | struct dax_dev { | |
58 | struct dax_region *region; | |
59 | struct device *dev; | |
60 | struct kref kref; | |
dee41079 | 61 | bool alive; |
ab68f262 DW |
62 | int id; |
63 | int num_resources; | |
64 | struct resource res[0]; | |
65 | }; | |
66 | ||
67 | static void dax_region_free(struct kref *kref) | |
68 | { | |
69 | struct dax_region *dax_region; | |
70 | ||
71 | dax_region = container_of(kref, struct dax_region, kref); | |
72 | kfree(dax_region); | |
73 | } | |
74 | ||
75 | void dax_region_put(struct dax_region *dax_region) | |
76 | { | |
77 | kref_put(&dax_region->kref, dax_region_free); | |
78 | } | |
79 | EXPORT_SYMBOL_GPL(dax_region_put); | |
80 | ||
81 | static void dax_dev_free(struct kref *kref) | |
82 | { | |
83 | struct dax_dev *dax_dev; | |
84 | ||
85 | dax_dev = container_of(kref, struct dax_dev, kref); | |
86 | dax_region_put(dax_dev->region); | |
87 | kfree(dax_dev); | |
88 | } | |
89 | ||
90 | static void dax_dev_put(struct dax_dev *dax_dev) | |
91 | { | |
92 | kref_put(&dax_dev->kref, dax_dev_free); | |
93 | } | |
94 | ||
95 | struct dax_region *alloc_dax_region(struct device *parent, int region_id, | |
96 | struct resource *res, unsigned int align, void *addr, | |
97 | unsigned long pfn_flags) | |
98 | { | |
99 | struct dax_region *dax_region; | |
100 | ||
101 | dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); | |
102 | ||
103 | if (!dax_region) | |
104 | return NULL; | |
105 | ||
106 | memcpy(&dax_region->res, res, sizeof(*res)); | |
107 | dax_region->pfn_flags = pfn_flags; | |
108 | kref_init(&dax_region->kref); | |
109 | dax_region->id = region_id; | |
110 | ida_init(&dax_region->ida); | |
111 | dax_region->align = align; | |
112 | dax_region->dev = parent; | |
113 | dax_region->base = addr; | |
114 | ||
115 | return dax_region; | |
116 | } | |
117 | EXPORT_SYMBOL_GPL(alloc_dax_region); | |
118 | ||
119 | static ssize_t size_show(struct device *dev, | |
120 | struct device_attribute *attr, char *buf) | |
121 | { | |
122 | struct dax_dev *dax_dev = dev_get_drvdata(dev); | |
123 | unsigned long long size = 0; | |
124 | int i; | |
125 | ||
126 | for (i = 0; i < dax_dev->num_resources; i++) | |
127 | size += resource_size(&dax_dev->res[i]); | |
128 | ||
129 | return sprintf(buf, "%llu\n", size); | |
130 | } | |
131 | static DEVICE_ATTR_RO(size); | |
132 | ||
/* attributes exposed by each dax device; currently only "size" */
static struct attribute *dax_device_attributes[] = {
	&dev_attr_size.attr,
	NULL,
};

static const struct attribute_group dax_device_attribute_group = {
	.attrs = dax_device_attributes,
};

/* NULL-terminated group list handed to device_create_with_groups() */
static const struct attribute_group *dax_attribute_groups[] = {
	&dax_device_attribute_group,
	NULL,
};
146 | ||
/*
 * Tear down a dax device. Registered as a devm action by
 * devm_create_dax_dev(), which also calls it directly when registering
 * that action fails. @_dev is the struct device created for the dax_dev.
 */
static void unregister_dax_dev(void *_dev)
{
	struct device *dev = _dev;
	struct dax_dev *dax_dev = dev_get_drvdata(dev);
	struct dax_region *dax_region = dax_dev->region;

	dev_dbg(dev, "%s\n", __func__);

	/*
	 * Note, rcu is not protecting the liveness of dax_dev, rcu is
	 * ensuring that any fault handlers that might have seen
	 * dax_dev->alive == true, have completed. Any fault handlers
	 * that start after synchronize_rcu() has started will abort
	 * upon seeing dax_dev->alive == false.
	 */
	dax_dev->alive = false;
	synchronize_rcu();

	/*
	 * dev->devt is still read after device_unregister(); the extra
	 * reference presumably keeps @dev valid until the ids are released.
	 */
	get_device(dev);
	device_unregister(dev);
	ida_simple_remove(&dax_region->ida, dax_dev->id);
	ida_simple_remove(&dax_minor_ida, MINOR(dev->devt));
	put_device(dev);
	/* drop the reference set up by kref_init() at creation time */
	dax_dev_put(dax_dev);
}
172 | ||
173 | int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, | |
174 | int count) | |
175 | { | |
176 | struct device *parent = dax_region->dev; | |
177 | struct dax_dev *dax_dev; | |
178 | struct device *dev; | |
179 | int rc, minor; | |
180 | dev_t dev_t; | |
181 | ||
182 | dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL); | |
183 | if (!dax_dev) | |
184 | return -ENOMEM; | |
185 | memcpy(dax_dev->res, res, sizeof(*res) * count); | |
186 | dax_dev->num_resources = count; | |
187 | kref_init(&dax_dev->kref); | |
dee41079 | 188 | dax_dev->alive = true; |
ab68f262 DW |
189 | dax_dev->region = dax_region; |
190 | kref_get(&dax_region->kref); | |
191 | ||
192 | dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); | |
193 | if (dax_dev->id < 0) { | |
194 | rc = dax_dev->id; | |
195 | goto err_id; | |
196 | } | |
197 | ||
198 | minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); | |
199 | if (minor < 0) { | |
200 | rc = minor; | |
201 | goto err_minor; | |
202 | } | |
203 | ||
204 | dev_t = MKDEV(dax_major, minor); | |
205 | dev = device_create_with_groups(dax_class, parent, dev_t, dax_dev, | |
206 | dax_attribute_groups, "dax%d.%d", dax_region->id, | |
207 | dax_dev->id); | |
208 | if (IS_ERR(dev)) { | |
209 | rc = PTR_ERR(dev); | |
210 | goto err_create; | |
211 | } | |
212 | dax_dev->dev = dev; | |
213 | ||
214 | rc = devm_add_action(dax_region->dev, unregister_dax_dev, dev); | |
215 | if (rc) { | |
216 | unregister_dax_dev(dev); | |
217 | return rc; | |
218 | } | |
219 | ||
220 | return 0; | |
221 | ||
222 | err_create: | |
223 | ida_simple_remove(&dax_minor_ida, minor); | |
224 | err_minor: | |
225 | ida_simple_remove(&dax_region->ida, dax_dev->id); | |
226 | err_id: | |
227 | dax_dev_put(dax_dev); | |
228 | ||
229 | return rc; | |
230 | } | |
231 | EXPORT_SYMBOL_GPL(devm_create_dax_dev); | |
232 | ||
dee41079 DW |
233 | /* return an unmapped area aligned to the dax region specified alignment */ |
234 | static unsigned long dax_dev_get_unmapped_area(struct file *filp, | |
235 | unsigned long addr, unsigned long len, unsigned long pgoff, | |
236 | unsigned long flags) | |
237 | { | |
238 | unsigned long off, off_end, off_align, len_align, addr_align, align; | |
239 | struct dax_dev *dax_dev = filp ? filp->private_data : NULL; | |
240 | struct dax_region *dax_region; | |
241 | ||
242 | if (!dax_dev || addr) | |
243 | goto out; | |
244 | ||
245 | dax_region = dax_dev->region; | |
246 | align = dax_region->align; | |
247 | off = pgoff << PAGE_SHIFT; | |
248 | off_end = off + len; | |
249 | off_align = round_up(off, align); | |
250 | ||
251 | if ((off_end <= off_align) || ((off_end - off_align) < align)) | |
252 | goto out; | |
253 | ||
254 | len_align = len + align; | |
255 | if ((off + len_align) < off) | |
256 | goto out; | |
257 | ||
258 | addr_align = current->mm->get_unmapped_area(filp, addr, len_align, | |
259 | pgoff, flags); | |
260 | if (!IS_ERR_VALUE(addr_align)) { | |
261 | addr_align += (off - addr_align) & (align - 1); | |
262 | return addr_align; | |
263 | } | |
264 | out: | |
265 | return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); | |
266 | } | |
267 | ||
268 | static int __match_devt(struct device *dev, const void *data) | |
269 | { | |
270 | const dev_t *devt = data; | |
271 | ||
272 | return dev->devt == *devt; | |
273 | } | |
274 | ||
275 | static struct device *dax_dev_find(dev_t dev_t) | |
276 | { | |
277 | return class_find_device(dax_class, NULL, &dev_t, __match_devt); | |
278 | } | |
279 | ||
280 | static int dax_dev_open(struct inode *inode, struct file *filp) | |
281 | { | |
282 | struct dax_dev *dax_dev = NULL; | |
283 | struct device *dev; | |
284 | ||
285 | dev = dax_dev_find(inode->i_rdev); | |
286 | if (!dev) | |
287 | return -ENXIO; | |
288 | ||
289 | device_lock(dev); | |
290 | dax_dev = dev_get_drvdata(dev); | |
291 | if (dax_dev) { | |
292 | dev_dbg(dev, "%s\n", __func__); | |
293 | filp->private_data = dax_dev; | |
294 | kref_get(&dax_dev->kref); | |
295 | inode->i_flags = S_DAX; | |
296 | } | |
297 | device_unlock(dev); | |
298 | ||
299 | if (!dax_dev) { | |
300 | put_device(dev); | |
301 | return -ENXIO; | |
302 | } | |
303 | return 0; | |
304 | } | |
305 | ||
306 | static int dax_dev_release(struct inode *inode, struct file *filp) | |
307 | { | |
308 | struct dax_dev *dax_dev = filp->private_data; | |
309 | struct device *dev = dax_dev->dev; | |
310 | ||
311 | dev_dbg(dax_dev->dev, "%s\n", __func__); | |
312 | dax_dev_put(dax_dev); | |
313 | put_device(dev); | |
314 | ||
315 | return 0; | |
316 | } | |
317 | ||
318 | static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, | |
319 | const char *func) | |
320 | { | |
321 | struct dax_region *dax_region = dax_dev->region; | |
322 | struct device *dev = dax_dev->dev; | |
323 | unsigned long mask; | |
324 | ||
325 | if (!dax_dev->alive) | |
326 | return -ENXIO; | |
327 | ||
328 | /* prevent private / writable mappings from being established */ | |
329 | if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) { | |
330 | dev_info(dev, "%s: %s: fail, attempted private mapping\n", | |
331 | current->comm, func); | |
332 | return -EINVAL; | |
333 | } | |
334 | ||
335 | mask = dax_region->align - 1; | |
336 | if (vma->vm_start & mask || vma->vm_end & mask) { | |
337 | dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", | |
338 | current->comm, func, vma->vm_start, vma->vm_end, | |
339 | mask); | |
340 | return -EINVAL; | |
341 | } | |
342 | ||
343 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV | |
344 | && (vma->vm_flags & VM_DONTCOPY) == 0) { | |
345 | dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n", | |
346 | current->comm, func); | |
347 | return -EINVAL; | |
348 | } | |
349 | ||
350 | if (!vma_is_dax(vma)) { | |
351 | dev_info(dev, "%s: %s: fail, vma is not DAX capable\n", | |
352 | current->comm, func); | |
353 | return -EINVAL; | |
354 | } | |
355 | ||
356 | return 0; | |
357 | } | |
358 | ||
359 | static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, | |
360 | unsigned long size) | |
361 | { | |
362 | struct resource *res; | |
363 | phys_addr_t phys; | |
364 | int i; | |
365 | ||
366 | for (i = 0; i < dax_dev->num_resources; i++) { | |
367 | res = &dax_dev->res[i]; | |
368 | phys = pgoff * PAGE_SIZE + res->start; | |
369 | if (phys >= res->start && phys <= res->end) | |
370 | break; | |
371 | pgoff -= PHYS_PFN(resource_size(res)); | |
372 | } | |
373 | ||
374 | if (i < dax_dev->num_resources) { | |
375 | res = &dax_dev->res[i]; | |
376 | if (phys + size - 1 <= res->end) | |
377 | return phys; | |
378 | } | |
379 | ||
380 | return -1; | |
381 | } | |
382 | ||
383 | static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, | |
384 | struct vm_fault *vmf) | |
385 | { | |
386 | unsigned long vaddr = (unsigned long) vmf->virtual_address; | |
387 | struct device *dev = dax_dev->dev; | |
388 | struct dax_region *dax_region; | |
389 | int rc = VM_FAULT_SIGBUS; | |
390 | phys_addr_t phys; | |
391 | pfn_t pfn; | |
392 | ||
393 | if (check_vma(dax_dev, vma, __func__)) | |
394 | return VM_FAULT_SIGBUS; | |
395 | ||
396 | dax_region = dax_dev->region; | |
397 | if (dax_region->align > PAGE_SIZE) { | |
398 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
399 | return VM_FAULT_SIGBUS; | |
400 | } | |
401 | ||
402 | phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); | |
403 | if (phys == -1) { | |
404 | dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, | |
405 | vmf->pgoff); | |
406 | return VM_FAULT_SIGBUS; | |
407 | } | |
408 | ||
409 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | |
410 | ||
411 | rc = vm_insert_mixed(vma, vaddr, pfn); | |
412 | ||
413 | if (rc == -ENOMEM) | |
414 | return VM_FAULT_OOM; | |
415 | if (rc < 0 && rc != -EBUSY) | |
416 | return VM_FAULT_SIGBUS; | |
417 | ||
418 | return VM_FAULT_NOPAGE; | |
419 | } | |
420 | ||
421 | static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |
422 | { | |
423 | int rc; | |
424 | struct file *filp = vma->vm_file; | |
425 | struct dax_dev *dax_dev = filp->private_data; | |
426 | ||
427 | dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, | |
428 | current->comm, (vmf->flags & FAULT_FLAG_WRITE) | |
429 | ? "write" : "read", vma->vm_start, vma->vm_end); | |
430 | rcu_read_lock(); | |
431 | rc = __dax_dev_fault(dax_dev, vma, vmf); | |
432 | rcu_read_unlock(); | |
433 | ||
434 | return rc; | |
435 | } | |
436 | ||
437 | static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, | |
438 | struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, | |
439 | unsigned int flags) | |
440 | { | |
441 | unsigned long pmd_addr = addr & PMD_MASK; | |
442 | struct device *dev = dax_dev->dev; | |
443 | struct dax_region *dax_region; | |
444 | phys_addr_t phys; | |
445 | pgoff_t pgoff; | |
446 | pfn_t pfn; | |
447 | ||
448 | if (check_vma(dax_dev, vma, __func__)) | |
449 | return VM_FAULT_SIGBUS; | |
450 | ||
451 | dax_region = dax_dev->region; | |
452 | if (dax_region->align > PMD_SIZE) { | |
453 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
454 | return VM_FAULT_SIGBUS; | |
455 | } | |
456 | ||
457 | /* dax pmd mappings require pfn_t_devmap() */ | |
458 | if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { | |
459 | dev_dbg(dev, "%s: alignment > fault size\n", __func__); | |
460 | return VM_FAULT_SIGBUS; | |
461 | } | |
462 | ||
463 | pgoff = linear_page_index(vma, pmd_addr); | |
464 | phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE); | |
465 | if (phys == -1) { | |
466 | dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, | |
467 | pgoff); | |
468 | return VM_FAULT_SIGBUS; | |
469 | } | |
470 | ||
471 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | |
472 | ||
473 | return vmf_insert_pfn_pmd(vma, addr, pmd, pfn, | |
474 | flags & FAULT_FLAG_WRITE); | |
475 | } | |
476 | ||
477 | static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, | |
478 | pmd_t *pmd, unsigned int flags) | |
479 | { | |
480 | int rc; | |
481 | struct file *filp = vma->vm_file; | |
482 | struct dax_dev *dax_dev = filp->private_data; | |
483 | ||
484 | dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, | |
485 | current->comm, (flags & FAULT_FLAG_WRITE) | |
486 | ? "write" : "read", vma->vm_start, vma->vm_end); | |
487 | ||
488 | rcu_read_lock(); | |
489 | rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags); | |
490 | rcu_read_unlock(); | |
491 | ||
492 | return rc; | |
493 | } | |
494 | ||
495 | static void dax_dev_vm_open(struct vm_area_struct *vma) | |
496 | { | |
497 | struct file *filp = vma->vm_file; | |
498 | struct dax_dev *dax_dev = filp->private_data; | |
499 | ||
500 | dev_dbg(dax_dev->dev, "%s\n", __func__); | |
501 | kref_get(&dax_dev->kref); | |
502 | } | |
503 | ||
504 | static void dax_dev_vm_close(struct vm_area_struct *vma) | |
505 | { | |
506 | struct file *filp = vma->vm_file; | |
507 | struct dax_dev *dax_dev = filp->private_data; | |
508 | ||
509 | dev_dbg(dax_dev->dev, "%s\n", __func__); | |
510 | dax_dev_put(dax_dev); | |
511 | } | |
512 | ||
/* vma operations installed on every mapping by dax_dev_mmap() */
static const struct vm_operations_struct dax_dev_vm_ops = {
	.fault = dax_dev_fault,
	.pmd_fault = dax_dev_pmd_fault,
	.open = dax_dev_vm_open,
	.close = dax_dev_vm_close,
};
519 | ||
520 | static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma) | |
521 | { | |
522 | struct dax_dev *dax_dev = filp->private_data; | |
523 | int rc; | |
524 | ||
525 | dev_dbg(dax_dev->dev, "%s\n", __func__); | |
526 | ||
527 | rc = check_vma(dax_dev, vma, __func__); | |
528 | if (rc) | |
529 | return rc; | |
530 | ||
531 | kref_get(&dax_dev->kref); | |
532 | vma->vm_ops = &dax_dev_vm_ops; | |
533 | vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; | |
534 | return 0; | |
535 | ||
536 | } | |
537 | ||
ab68f262 DW |
/* file operations registered for the "dax" character device major */
static const struct file_operations dax_fops = {
	.llseek = noop_llseek,
	.owner = THIS_MODULE,
	.open = dax_dev_open,
	.release = dax_dev_release,
	.get_unmapped_area = dax_dev_get_unmapped_area,
	.mmap = dax_dev_mmap,
};
546 | ||
547 | static int __init dax_init(void) | |
548 | { | |
549 | int rc; | |
550 | ||
551 | rc = register_chrdev(0, "dax", &dax_fops); | |
552 | if (rc < 0) | |
553 | return rc; | |
554 | dax_major = rc; | |
555 | ||
556 | dax_class = class_create(THIS_MODULE, "dax"); | |
557 | if (IS_ERR(dax_class)) { | |
558 | unregister_chrdev(dax_major, "dax"); | |
559 | return PTR_ERR(dax_class); | |
560 | } | |
561 | ||
562 | return 0; | |
563 | } | |
564 | ||
/*
 * Module exit: undo dax_init() in reverse order (class, then char-device
 * major) and release the minor-number allocator.
 */
static void __exit dax_exit(void)
{
	class_destroy(dax_class);
	unregister_chrdev(dax_major, "dax");
	ida_destroy(&dax_minor_ida);
}
571 | ||
MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
/* dax_init() is registered via subsys_initcall() rather than module_init() */
subsys_initcall(dax_init);
module_exit(dax_exit);