]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blob - drivers/misc/ocxl/link.c
ocxl: Add AFU interrupt support
[mirror_ubuntu-bionic-kernel.git] / drivers / misc / ocxl / link.c
1 // SPDX-License-Identifier: GPL-2.0+
2 // Copyright 2017 IBM Corp.
3 #include <linux/sched/mm.h>
4 #include <linux/mutex.h>
5 #include <linux/mmu_context.h>
6 #include <asm/copro.h>
7 #include <asm/pnv-ocxl.h>
8 #include "ocxl_internal.h"
9
10
/* A PE handle is the PASID truncated to SPA_PASID_BITS bits */
#define SPA_PASID_BITS		15
#define SPA_PASID_MAX		((1 << SPA_PASID_BITS) - 1)
#define SPA_PE_MASK		SPA_PASID_MAX
#define SPA_SPA_SIZE_LOG	22	/* log2 of the SPA size: 4 MB per SPA */

/* Bits of the config_state field of a process element */
#define SPA_CFG_SF		(1ull << (63 - 0))
#define SPA_CFG_TA		(1ull << (63 - 1))
#define SPA_CFG_HV		(1ull << (63 - 3))
#define SPA_CFG_UV		(1ull << (63 - 4))
#define SPA_CFG_XLAT_hpt	(0ull << (63 - 6))	/* Hashed page table (HPT) mode */
#define SPA_CFG_XLAT_roh	(2ull << (63 - 6))	/* Radix on HPT mode */
#define SPA_CFG_XLAT_ror	(3ull << (63 - 6))	/* Radix on Radix mode */
#define SPA_CFG_PR		(1ull << (63 - 49))
#define SPA_CFG_TC		(1ull << (63 - 54))
#define SPA_CFG_DR		(1ull << (63 - 59))

/* Bits of the fault status (dsisr) value read from the XSL */
#define SPA_XSL_TF		(1ull << (63 - 3))	/* Translation fault */
#define SPA_XSL_S		(1ull << (63 - 38))	/* Store operation */

/* Flag stored in the software_state field of a process element in use */
#define SPA_PE_VALID		0x80000000
31
32
33 struct pe_data {
34 struct mm_struct *mm;
35 /* callback to trigger when a translation fault occurs */
36 void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr);
37 /* opaque pointer to be passed to the above callback */
38 void *xsl_err_data;
39 struct rcu_head rcu;
40 };
41
42 struct spa {
43 struct ocxl_process_element *spa_mem;
44 int spa_order;
45 struct mutex spa_lock;
46 struct radix_tree_root pe_tree; /* Maps PE handles to pe_data */
47 char *irq_name;
48 int virq;
49 void __iomem *reg_dsisr;
50 void __iomem *reg_dar;
51 void __iomem *reg_tfc;
52 void __iomem *reg_pe_handle;
53 /*
54 * The following field are used by the memory fault
55 * interrupt handler. We can only have one interrupt at a
56 * time. The NPU won't raise another interrupt until the
57 * previous one has been ack'd by writing to the TFC register
58 */
59 struct xsl_fault {
60 struct work_struct fault_work;
61 u64 pe;
62 u64 dsisr;
63 u64 dar;
64 struct pe_data pe_data;
65 } xsl_fault;
66 };
67
68 /*
69 * A opencapi link can be used be by several PCI functions. We have
70 * one link per device slot.
71 *
72 * A linked list of opencapi links should suffice, as there's a
73 * limited number of opencapi slots on a system and lookup is only
74 * done when the device is probed
75 */
76 struct link {
77 struct list_head list;
78 struct kref ref;
79 int domain;
80 int bus;
81 int dev;
82 atomic_t irq_available;
83 struct spa *spa;
84 void *platform_data;
85 };
86 static struct list_head links_list = LIST_HEAD_INIT(links_list);
87 static DEFINE_MUTEX(links_list_lock);
88
/* Possible answers to a translation fault, see ack_irq() */
enum xsl_response {
	CONTINUE = 0,
	ADDRESS_ERROR = 1,
	RESTART = 2,
};
94
95
96 static void read_irq(struct spa *spa, u64 *dsisr, u64 *dar, u64 *pe)
97 {
98 u64 reg;
99
100 *dsisr = in_be64(spa->reg_dsisr);
101 *dar = in_be64(spa->reg_dar);
102 reg = in_be64(spa->reg_pe_handle);
103 *pe = reg & SPA_PE_MASK;
104 }
105
106 static void ack_irq(struct spa *spa, enum xsl_response r)
107 {
108 u64 reg = 0;
109
110 /* continue is not supported */
111 if (r == RESTART)
112 reg = PPC_BIT(31);
113 else if (r == ADDRESS_ERROR)
114 reg = PPC_BIT(30);
115 else
116 WARN(1, "Invalid irq response %d\n", r);
117
118 if (reg)
119 out_be64(spa->reg_tfc, reg);
120 }
121
122 static void xsl_fault_handler_bh(struct work_struct *fault_work)
123 {
124 unsigned int flt = 0;
125 unsigned long access, flags, inv_flags = 0;
126 enum xsl_response r;
127 struct xsl_fault *fault = container_of(fault_work, struct xsl_fault,
128 fault_work);
129 struct spa *spa = container_of(fault, struct spa, xsl_fault);
130
131 int rc;
132
133 /*
134 * We need to release a reference on the mm whenever exiting this
135 * function (taken in the memory fault interrupt handler)
136 */
137 rc = copro_handle_mm_fault(fault->pe_data.mm, fault->dar, fault->dsisr,
138 &flt);
139 if (rc) {
140 pr_debug("copro_handle_mm_fault failed: %d\n", rc);
141 if (fault->pe_data.xsl_err_cb) {
142 fault->pe_data.xsl_err_cb(
143 fault->pe_data.xsl_err_data,
144 fault->dar, fault->dsisr);
145 }
146 r = ADDRESS_ERROR;
147 goto ack;
148 }
149
150 if (!radix_enabled()) {
151 /*
152 * update_mmu_cache() will not have loaded the hash
153 * since current->trap is not a 0x400 or 0x300, so
154 * just call hash_page_mm() here.
155 */
156 access = _PAGE_PRESENT | _PAGE_READ;
157 if (fault->dsisr & SPA_XSL_S)
158 access |= _PAGE_WRITE;
159
160 if (REGION_ID(fault->dar) != USER_REGION_ID)
161 access |= _PAGE_PRIVILEGED;
162
163 local_irq_save(flags);
164 hash_page_mm(fault->pe_data.mm, fault->dar, access, 0x300,
165 inv_flags);
166 local_irq_restore(flags);
167 }
168 r = RESTART;
169 ack:
170 mmdrop(fault->pe_data.mm);
171 ack_irq(spa, r);
172 }
173
174 static irqreturn_t xsl_fault_handler(int irq, void *data)
175 {
176 struct link *link = (struct link *) data;
177 struct spa *spa = link->spa;
178 u64 dsisr, dar, pe_handle;
179 struct pe_data *pe_data;
180 struct ocxl_process_element *pe;
181 int lpid, pid, tid;
182
183 read_irq(spa, &dsisr, &dar, &pe_handle);
184
185 WARN_ON(pe_handle > SPA_PE_MASK);
186 pe = spa->spa_mem + pe_handle;
187 lpid = be32_to_cpu(pe->lpid);
188 pid = be32_to_cpu(pe->pid);
189 tid = be32_to_cpu(pe->tid);
190 /* We could be reading all null values here if the PE is being
191 * removed while an interrupt kicks in. It's not supposed to
192 * happen if the driver notified the AFU to terminate the
193 * PASID, and the AFU waited for pending operations before
194 * acknowledging. But even if it happens, we won't find a
195 * memory context below and fail silently, so it should be ok.
196 */
197 if (!(dsisr & SPA_XSL_TF)) {
198 WARN(1, "Invalid xsl interrupt fault register %#llx\n", dsisr);
199 ack_irq(spa, ADDRESS_ERROR);
200 return IRQ_HANDLED;
201 }
202
203 rcu_read_lock();
204 pe_data = radix_tree_lookup(&spa->pe_tree, pe_handle);
205 if (!pe_data) {
206 /*
207 * Could only happen if the driver didn't notify the
208 * AFU about PASID termination before removing the PE,
209 * or the AFU didn't wait for all memory access to
210 * have completed.
211 *
212 * Either way, we fail early, but we shouldn't log an
213 * error message, as it is a valid (if unexpected)
214 * scenario
215 */
216 rcu_read_unlock();
217 pr_debug("Unknown mm context for xsl interrupt\n");
218 ack_irq(spa, ADDRESS_ERROR);
219 return IRQ_HANDLED;
220 }
221 WARN_ON(pe_data->mm->context.id != pid);
222
223 spa->xsl_fault.pe = pe_handle;
224 spa->xsl_fault.dar = dar;
225 spa->xsl_fault.dsisr = dsisr;
226 spa->xsl_fault.pe_data = *pe_data;
227 mmgrab(pe_data->mm); /* mm count is released by bottom half */
228
229 rcu_read_unlock();
230 schedule_work(&spa->xsl_fault.fault_work);
231 return IRQ_HANDLED;
232 }
233
234 static void unmap_irq_registers(struct spa *spa)
235 {
236 pnv_ocxl_unmap_xsl_regs(spa->reg_dsisr, spa->reg_dar, spa->reg_tfc,
237 spa->reg_pe_handle);
238 }
239
240 static int map_irq_registers(struct pci_dev *dev, struct spa *spa)
241 {
242 return pnv_ocxl_map_xsl_regs(dev, &spa->reg_dsisr, &spa->reg_dar,
243 &spa->reg_tfc, &spa->reg_pe_handle);
244 }
245
246 static int setup_xsl_irq(struct pci_dev *dev, struct link *link)
247 {
248 struct spa *spa = link->spa;
249 int rc;
250 int hwirq;
251
252 rc = pnv_ocxl_get_xsl_irq(dev, &hwirq);
253 if (rc)
254 return rc;
255
256 rc = map_irq_registers(dev, spa);
257 if (rc)
258 return rc;
259
260 spa->irq_name = kasprintf(GFP_KERNEL, "ocxl-xsl-%x-%x-%x",
261 link->domain, link->bus, link->dev);
262 if (!spa->irq_name) {
263 unmap_irq_registers(spa);
264 dev_err(&dev->dev, "Can't allocate name for xsl interrupt\n");
265 return -ENOMEM;
266 }
267 /*
268 * At some point, we'll need to look into allowing a higher
269 * number of interrupts. Could we have an IRQ domain per link?
270 */
271 spa->virq = irq_create_mapping(NULL, hwirq);
272 if (!spa->virq) {
273 kfree(spa->irq_name);
274 unmap_irq_registers(spa);
275 dev_err(&dev->dev,
276 "irq_create_mapping failed for translation interrupt\n");
277 return -EINVAL;
278 }
279
280 dev_dbg(&dev->dev, "hwirq %d mapped to virq %d\n", hwirq, spa->virq);
281
282 rc = request_irq(spa->virq, xsl_fault_handler, 0, spa->irq_name,
283 link);
284 if (rc) {
285 irq_dispose_mapping(spa->virq);
286 kfree(spa->irq_name);
287 unmap_irq_registers(spa);
288 dev_err(&dev->dev,
289 "request_irq failed for translation interrupt: %d\n",
290 rc);
291 return -EINVAL;
292 }
293 return 0;
294 }
295
296 static void release_xsl_irq(struct link *link)
297 {
298 struct spa *spa = link->spa;
299
300 if (spa->virq) {
301 free_irq(spa->virq, link);
302 irq_dispose_mapping(spa->virq);
303 }
304 kfree(spa->irq_name);
305 unmap_irq_registers(spa);
306 }
307
308 static int alloc_spa(struct pci_dev *dev, struct link *link)
309 {
310 struct spa *spa;
311
312 spa = kzalloc(sizeof(struct spa), GFP_KERNEL);
313 if (!spa)
314 return -ENOMEM;
315
316 mutex_init(&spa->spa_lock);
317 INIT_RADIX_TREE(&spa->pe_tree, GFP_KERNEL);
318 INIT_WORK(&spa->xsl_fault.fault_work, xsl_fault_handler_bh);
319
320 spa->spa_order = SPA_SPA_SIZE_LOG - PAGE_SHIFT;
321 spa->spa_mem = (struct ocxl_process_element *)
322 __get_free_pages(GFP_KERNEL | __GFP_ZERO, spa->spa_order);
323 if (!spa->spa_mem) {
324 dev_err(&dev->dev, "Can't allocate Shared Process Area\n");
325 kfree(spa);
326 return -ENOMEM;
327 }
328 pr_debug("Allocated SPA for %x:%x:%x at %p\n", link->domain, link->bus,
329 link->dev, spa->spa_mem);
330
331 link->spa = spa;
332 return 0;
333 }
334
335 static void free_spa(struct link *link)
336 {
337 struct spa *spa = link->spa;
338
339 pr_debug("Freeing SPA for %x:%x:%x\n", link->domain, link->bus,
340 link->dev);
341
342 if (spa && spa->spa_mem) {
343 free_pages((unsigned long) spa->spa_mem, spa->spa_order);
344 kfree(spa);
345 link->spa = NULL;
346 }
347 }
348
349 static int alloc_link(struct pci_dev *dev, int PE_mask, struct link **out_link)
350 {
351 struct link *link;
352 int rc;
353
354 link = kzalloc(sizeof(struct link), GFP_KERNEL);
355 if (!link)
356 return -ENOMEM;
357
358 kref_init(&link->ref);
359 link->domain = pci_domain_nr(dev->bus);
360 link->bus = dev->bus->number;
361 link->dev = PCI_SLOT(dev->devfn);
362 atomic_set(&link->irq_available, MAX_IRQ_PER_LINK);
363
364 rc = alloc_spa(dev, link);
365 if (rc)
366 goto err_free;
367
368 rc = setup_xsl_irq(dev, link);
369 if (rc)
370 goto err_spa;
371
372 /* platform specific hook */
373 rc = pnv_ocxl_spa_setup(dev, link->spa->spa_mem, PE_mask,
374 &link->platform_data);
375 if (rc)
376 goto err_xsl_irq;
377
378 *out_link = link;
379 return 0;
380
381 err_xsl_irq:
382 release_xsl_irq(link);
383 err_spa:
384 free_spa(link);
385 err_free:
386 kfree(link);
387 return rc;
388 }
389
/*
 * Release the translation fault interrupt and the SPA of a link, then
 * free the link itself.
 */
static void free_link(struct link *link)
{
	release_xsl_irq(link);
	free_spa(link);
	kfree(link);
}
396
397 int ocxl_link_setup(struct pci_dev *dev, int PE_mask, void **link_handle)
398 {
399 int rc = 0;
400 struct link *link;
401
402 mutex_lock(&links_list_lock);
403 list_for_each_entry(link, &links_list, list) {
404 /* The functions of a device all share the same link */
405 if (link->domain == pci_domain_nr(dev->bus) &&
406 link->bus == dev->bus->number &&
407 link->dev == PCI_SLOT(dev->devfn)) {
408 kref_get(&link->ref);
409 *link_handle = link;
410 goto unlock;
411 }
412 }
413 rc = alloc_link(dev, PE_mask, &link);
414 if (rc)
415 goto unlock;
416
417 list_add(&link->list, &links_list);
418 *link_handle = link;
419 unlock:
420 mutex_unlock(&links_list_lock);
421 return rc;
422 }
423
424 static void release_xsl(struct kref *ref)
425 {
426 struct link *link = container_of(ref, struct link, ref);
427
428 list_del(&link->list);
429 /* call platform code before releasing data */
430 pnv_ocxl_spa_release(link->platform_data);
431 free_link(link);
432 }
433
434 void ocxl_link_release(struct pci_dev *dev, void *link_handle)
435 {
436 struct link *link = (struct link *) link_handle;
437
438 mutex_lock(&links_list_lock);
439 kref_put(&link->ref, release_xsl);
440 mutex_unlock(&links_list_lock);
441 }
442
443 static u64 calculate_cfg_state(bool kernel)
444 {
445 u64 state;
446
447 state = SPA_CFG_DR;
448 if (mfspr(SPRN_LPCR) & LPCR_TC)
449 state |= SPA_CFG_TC;
450 if (radix_enabled())
451 state |= SPA_CFG_XLAT_ror;
452 else
453 state |= SPA_CFG_XLAT_hpt;
454 state |= SPA_CFG_HV;
455 if (kernel) {
456 if (mfmsr() & MSR_SF)
457 state |= SPA_CFG_SF;
458 } else {
459 state |= SPA_CFG_PR;
460 if (!test_tsk_thread_flag(current, TIF_32BIT))
461 state |= SPA_CFG_SF;
462 }
463 return state;
464 }
465
466 int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
467 u64 amr, struct mm_struct *mm,
468 void (*xsl_err_cb)(void *data, u64 addr, u64 dsisr),
469 void *xsl_err_data)
470 {
471 struct link *link = (struct link *) link_handle;
472 struct spa *spa = link->spa;
473 struct ocxl_process_element *pe;
474 int pe_handle, rc = 0;
475 struct pe_data *pe_data;
476
477 BUILD_BUG_ON(sizeof(struct ocxl_process_element) != 128);
478 if (pasid > SPA_PASID_MAX)
479 return -EINVAL;
480
481 mutex_lock(&spa->spa_lock);
482 pe_handle = pasid & SPA_PE_MASK;
483 pe = spa->spa_mem + pe_handle;
484
485 if (pe->software_state) {
486 rc = -EBUSY;
487 goto unlock;
488 }
489
490 pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL);
491 if (!pe_data) {
492 rc = -ENOMEM;
493 goto unlock;
494 }
495
496 pe_data->mm = mm;
497 pe_data->xsl_err_cb = xsl_err_cb;
498 pe_data->xsl_err_data = xsl_err_data;
499
500 memset(pe, 0, sizeof(struct ocxl_process_element));
501 pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
502 pe->lpid = cpu_to_be32(mfspr(SPRN_LPID));
503 pe->pid = cpu_to_be32(pidr);
504 pe->tid = cpu_to_be32(tidr);
505 pe->amr = cpu_to_be64(amr);
506 pe->software_state = cpu_to_be32(SPA_PE_VALID);
507
508 mm_context_add_copro(mm);
509 /*
510 * Barrier is to make sure PE is visible in the SPA before it
511 * is used by the device. It also helps with the global TLBI
512 * invalidation
513 */
514 mb();
515 radix_tree_insert(&spa->pe_tree, pe_handle, pe_data);
516
517 /*
518 * The mm must stay valid for as long as the device uses it. We
519 * lower the count when the context is removed from the SPA.
520 *
521 * We grab mm_count (and not mm_users), as we don't want to
522 * end up in a circular dependency if a process mmaps its
523 * mmio, therefore incrementing the file ref count when
524 * calling mmap(), and forgets to unmap before exiting. In
525 * that scenario, when the kernel handles the death of the
526 * process, the file is not cleaned because unmap was not
527 * called, and the mm wouldn't be freed because we would still
528 * have a reference on mm_users. Incrementing mm_count solves
529 * the problem.
530 */
531 mmgrab(mm);
532 unlock:
533 mutex_unlock(&spa->spa_lock);
534 return rc;
535 }
536
537 int ocxl_link_remove_pe(void *link_handle, int pasid)
538 {
539 struct link *link = (struct link *) link_handle;
540 struct spa *spa = link->spa;
541 struct ocxl_process_element *pe;
542 struct pe_data *pe_data;
543 int pe_handle, rc;
544
545 if (pasid > SPA_PASID_MAX)
546 return -EINVAL;
547
548 /*
549 * About synchronization with our memory fault handler:
550 *
551 * Before removing the PE, the driver is supposed to have
552 * notified the AFU, which should have cleaned up and make
553 * sure the PASID is no longer in use, including pending
554 * interrupts. However, there's no way to be sure...
555 *
556 * We clear the PE and remove the context from our radix
557 * tree. From that point on, any new interrupt for that
558 * context will fail silently, which is ok. As mentioned
559 * above, that's not expected, but it could happen if the
560 * driver or AFU didn't do the right thing.
561 *
562 * There could still be a bottom half running, but we don't
563 * need to wait/flush, as it is managing a reference count on
564 * the mm it reads from the radix tree.
565 */
566 pe_handle = pasid & SPA_PE_MASK;
567 pe = spa->spa_mem + pe_handle;
568
569 mutex_lock(&spa->spa_lock);
570
571 if (!(be32_to_cpu(pe->software_state) & SPA_PE_VALID)) {
572 rc = -EINVAL;
573 goto unlock;
574 }
575
576 memset(pe, 0, sizeof(struct ocxl_process_element));
577 /*
578 * The barrier makes sure the PE is removed from the SPA
579 * before we clear the NPU context cache below, so that the
580 * old PE cannot be reloaded erroneously.
581 */
582 mb();
583
584 /*
585 * hook to platform code
586 * On powerpc, the entry needs to be cleared from the context
587 * cache of the NPU.
588 */
589 rc = pnv_ocxl_spa_remove_pe(link->platform_data, pe_handle);
590 WARN_ON(rc);
591
592 pe_data = radix_tree_delete(&spa->pe_tree, pe_handle);
593 if (!pe_data) {
594 WARN(1, "Couldn't find pe data when removing PE\n");
595 } else {
596 mm_context_remove_copro(pe_data->mm);
597 mmdrop(pe_data->mm);
598 kfree_rcu(pe_data, rcu);
599 }
600 unlock:
601 mutex_unlock(&spa->spa_lock);
602 return rc;
603 }
604
605 int ocxl_link_irq_alloc(void *link_handle, int *hw_irq, u64 *trigger_addr)
606 {
607 struct link *link = (struct link *) link_handle;
608 int rc, irq;
609 u64 addr;
610
611 if (atomic_dec_if_positive(&link->irq_available) < 0)
612 return -ENOSPC;
613
614 rc = pnv_ocxl_alloc_xive_irq(&irq, &addr);
615 if (rc) {
616 atomic_inc(&link->irq_available);
617 return rc;
618 }
619
620 *hw_irq = irq;
621 *trigger_addr = addr;
622 return 0;
623 }
624
625 void ocxl_link_free_irq(void *link_handle, int hw_irq)
626 {
627 struct link *link = (struct link *) link_handle;
628
629 pnv_ocxl_free_xive_irq(hw_irq);
630 atomic_inc(&link->irq_available);
631 }