/*
 * This file implements the DMA operations for NVLink devices. The NPU
 * devices all point to the same iommu table as the parent PCI device.
 *
 * Copyright Alistair Popple, IBM Corporation 2015.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */

#include <linux/mmu_notifier.h>
#include <linux/mmu_context.h>
#include <linux/of.h>
#include <linux/pci.h>
#include <linux/memblock.h>
#include <linux/sizes.h>

#include <asm/debugfs.h>
#include <asm/powernv.h>
#include <asm/opal.h>

#include "pci.h"

/*
 * spinlock to protect initialisation of an npu_context for a particular
 * mm_struct.
 */
static DEFINE_SPINLOCK(npu_context_lock);

/*
 * Other types of TCE cache invalidation are not functional in the
 * hardware.
 */
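/* Look up the struct pci_dev described by a device-tree node via its pci_dn. */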
static struct pci_dev *get_pci_dev(struct device_node *dn)
{
	struct pci_dn *pdn = PCI_DN(dn);

	return pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
					   pdn->busno, pdn->devfn);
}

/* Given a NPU device get the associated PCI device. */
struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
{
	struct device_node *dn;
	struct pci_dev *gpdev;

	if (WARN_ON(!npdev))
		return NULL;

	if (WARN_ON(!npdev->dev.of_node))
		return NULL;

	/* Get the associated PCI device */
	dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
	if (!dn)
		return NULL;

	gpdev = get_pci_dev(dn);
	of_node_put(dn);

	return gpdev;
}
EXPORT_SYMBOL(pnv_pci_get_gpu_dev);

/* Given the real PCI device get a linked NPU device. */
struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
{
	struct device_node *dn;
	struct pci_dev *npdev;

	if (WARN_ON(!gpdev))
		return NULL;

	/* Not all PCI devices have device-tree nodes */
	if (!gpdev->dev.of_node)
		return NULL;

	/* Get the associated NPU PCI device */
	dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
	if (!dn)
		return NULL;

	npdev = get_pci_dev(dn);
	of_node_put(dn);

	return npdev;
}
EXPORT_SYMBOL(pnv_pci_get_npu_dev);

/*
 * Returns the PE associated with the PCI device of the given
 * NPU. Returns the linked pci device if pci_dev != NULL.
 */
static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
						  struct pci_dev **gpdev)
{
	struct pnv_phb *phb;
	struct pci_controller *hose;
	struct pci_dev *pdev;
	struct pnv_ioda_pe *pe;
	struct pci_dn *pdn;

	pdev = pnv_pci_get_gpu_dev(npe->pdev);
	if (!pdev)
		return NULL;

	pdn = pci_get_pdn(pdev);
	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
		return NULL;

	hose = pci_bus_to_host(pdev->bus);
	phb = hose->private_data;
	pe = &phb->ioda.pe_array[pdn->pe_number];

	if (gpdev)
		*gpdev = pdev;

	return pe;
}

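/*
 * Set up a DMA (TCE) window on the NPU PE via OPAL and link the table to
 * the PE's table group so that its TCE cache gets invalidated along with
 * the rest of the group.
 */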
long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
			struct iommu_table *tbl)
{
	struct pnv_phb *phb = npe->phb;
	int64_t rc;
	const unsigned long size = tbl->it_indirect_levels ?
		tbl->it_level_size : tbl->it_size;
	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
	const __u64 win_size = tbl->it_size << tbl->it_page_shift;

	pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
			start_addr, start_addr + win_size - 1,
			IOMMU_PAGE_SIZE(tbl));

	rc = opal_pci_map_pe_dma_window(phb->opal_id,
			npe->pe_number,
			npe->pe_number,
			tbl->it_indirect_levels + 1,
			__pa(tbl->it_base),
			size << 3,
			IOMMU_PAGE_SIZE(tbl));
	if (rc) {
		pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
		return rc;
	}
	pnv_pci_ioda2_tce_invalidate_entire(phb, false);

	/* Add the table to the list so its TCE cache will get invalidated */
	pnv_pci_link_table_and_group(phb->hose->node, num,
			tbl, &npe->table_group);

	return 0;
}

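/* Tear down the DMA window on the NPU PE and unlink the table from its group. */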
long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
{
	struct pnv_phb *phb = npe->phb;
	int64_t rc;

	pe_info(npe, "Removing DMA window\n");

	rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
			npe->pe_number,
			0/* levels */, 0/* table address */,
			0/* table size */, 0/* page size */);
	if (rc) {
		pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
		return rc;
	}
	pnv_pci_ioda2_tce_invalidate_entire(phb, false);

	pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
			&npe->table_group);

	return 0;
}

/*
 * Enables 32 bit DMA on NPU.
 */
static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
{
	struct pci_dev *gpdev;
	struct pnv_ioda_pe *gpe;
	int64_t rc;

	/*
	 * Find the associated PCI devices and get the dma window
	 * information from there.
	 */
	if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
		return;

	gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
	if (!gpe)
		return;

	rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);

	/*
	 * NVLink devices use the same TCE table configuration as
	 * their parent device so drivers shouldn't be doing DMA
	 * operations directly on these devices.
	 */
	set_dma_ops(&npe->pdev->dev, NULL);
}

/*
 * Enables bypass mode on the NPU. The NPU only supports one
 * window per link, so bypass needs to be explicitly enabled or
 * disabled. Unlike for a PHB3, bypass and non-bypass modes can't be
 * active at the same time.
 */
static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
{
	struct pnv_phb *phb = npe->phb;
	int64_t rc = 0;
	phys_addr_t top = memblock_end_of_DRAM();

	if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
		return -EINVAL;

	rc = pnv_npu_unset_window(npe, 0);
	if (rc != OPAL_SUCCESS)
		return rc;

	/* Enable the bypass window */

	top = roundup_pow_of_two(top);
	dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
			npe->pe_number);
	rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
			npe->pe_number, npe->pe_number,
			0 /* bypass base */, top);

	if (rc == OPAL_SUCCESS)
		pnv_pci_ioda2_tce_invalidate_entire(phb, false);

	return rc;
}

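/*
 * Walk all NPU devices linked to the GPU and switch each one to either
 * 64-bit bypass or 32-bit iommu DMA, matching what was enabled on the
 * linked device.
 */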
void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
{
	int i;
	struct pnv_phb *phb;
	struct pci_dn *pdn;
	struct pnv_ioda_pe *npe;
	struct pci_dev *npdev;

	for (i = 0; ; ++i) {
		npdev = pnv_pci_get_npu_dev(gpdev, i);

		if (!npdev)
			break;

		pdn = pci_get_pdn(npdev);
		if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
			return;

		phb = pci_bus_to_host(npdev->bus)->private_data;

		/* We only do bypass if it's enabled on the linked device */
		npe = &phb->ioda.pe_array[pdn->pe_number];

		if (bypass) {
			dev_info(&npdev->dev,
					"Using 64-bit DMA iommu bypass\n");
			pnv_npu_dma_set_bypass(npe);
		} else {
			dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
			pnv_npu_dma_set_32(npe);
		}
	}
}

/* Switch ownership from platform code to external user (e.g. VFIO) */
void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
{
	struct pnv_phb *phb = npe->phb;
	int64_t rc;

	/*
	 * Note: NPU has just a single TVE in the hardware which means that
	 * while used by the kernel, it can have either 32bit window or
	 * DMA bypass but never both. So we deconfigure 32bit window only
	 * if it was enabled at the moment of ownership change.
	 */
	if (npe->table_group.tables[0]) {
		pnv_npu_unset_window(npe, 0);
		return;
	}

	/* Disable bypass */
	rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
			npe->pe_number, npe->pe_number,
			0 /* bypass base */, 0);
	if (rc) {
		pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
		return;
	}
	pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
}

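/*
 * Attach every NPU device on this PHB that is linked to the given NPU PE's
 * GPU to the GPU PE's IOMMU group, and return that GPU PE.
 */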
struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
{
	struct pnv_phb *phb = npe->phb;
	struct pci_bus *pbus = phb->hose->bus;
	struct pci_dev *npdev, *gpdev = NULL, *gptmp;
	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);

	if (!gpe || !gpdev)
		return NULL;

	list_for_each_entry(npdev, &pbus->devices, bus_list) {
		gptmp = pnv_pci_get_gpu_dev(npdev);

		if (gptmp != gpdev)
			continue;

		pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
		iommu_group_add_device(gpe->table_group.group, &npdev->dev);
	}

	return gpe;
}

/*
 * NPU2 ATS
 */
/* Maximum possible number of ATSD MMIO registers per NPU */
#define NV_NMMU_ATSD_REGS 8

/* An NPU descriptor, valid for POWER9 only */
struct npu {
	int index;
	__be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
	unsigned int mmio_atsd_count;

	/* Bitmask for MMIO register usage */
	unsigned long mmio_atsd_usage;

	/* Do we need to explicitly flush the nest mmu? */
	bool nmmu_flush;
};

/* Maximum number of nvlinks per npu */
#define NV_MAX_LINKS 6

/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
static int max_npu2_index;

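/*
 * Per-mm state shared between the GPUs a process is using. Tracks which
 * NPU links are active for the mm so the mmu notifier callbacks know
 * where to send ATSD invalidates.
 */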
struct npu_context {
	struct mm_struct *mm;
	struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
	struct mmu_notifier mn;
	struct kref kref;
	bool nmmu_flush;

	/* Callback to stop translation requests on a given GPU */
	void (*release_cb)(struct npu_context *context, void *priv);

	/*
	 * Private pointer passed to the above callback for usage by
	 * device drivers.
	 */
	void *priv;
};

struct mmio_atsd_reg {
	struct npu *npu;
	int reg;
};

/*
 * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
 * if none are available.
 */
static int get_mmio_atsd_reg(struct npu *npu)
{
	int i;

	for (i = 0; i < npu->mmio_atsd_count; i++) {
		if (!test_bit(i, &npu->mmio_atsd_usage))
			if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
				return i;
	}

	return -ENOSPC;
}

static void put_mmio_atsd_reg(struct npu *npu, int reg)
{
	clear_bit_unlock(reg, &npu->mmio_atsd_usage);
}

/* MMIO ATSD register offsets */
#define XTS_ATSD_LAUNCH 0
#define XTS_ATSD_AVA 1
#define XTS_ATSD_STAT 2

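/*
 * Build the value written to the ATSD launch register: encodes the PID,
 * the invalidation scope (entire PID or a region of the given page size)
 * and process-scoped translation.
 */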
static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize)
{
	unsigned long launch = 0;

	if (psize == MMU_PAGE_COUNT) {
		/* IS set to invalidate entire matching PID */
		launch |= PPC_BIT(12);
	} else {
		/* AP set to invalidate region of psize */
		launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17);
	}

	/* PRS set to process-scoped */
	launch |= PPC_BIT(13);

	/* PID */
	launch |= pid << PPC_BITLSHIFT(38);

	/* Leave "No flush" (bit 39) 0 so every ATSD performs a flush */

	return launch;
}

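/* Write @val at @offset to every ATSD register acquired in mmio_atsd_reg[]. */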
static void mmio_atsd_regs_write(struct mmio_atsd_reg
			mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset,
			unsigned long val)
{
	struct npu *npu;
	int i, reg;

	for (i = 0; i <= max_npu2_index; i++) {
		reg = mmio_atsd_reg[i].reg;
		if (reg < 0)
			continue;

		npu = mmio_atsd_reg[i].npu;
		__raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset);
	}
}

static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
				unsigned long pid)
{
	unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT);

	/* Invalidating the entire process doesn't use a va */
	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
}

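/*
 * Launch an invalidate of the region of the given page size starting at
 * @start for @pid on every acquired ATSD register.
 */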
static void mmio_invalidate_range(struct mmio_atsd_reg
			mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid,
			unsigned long start, unsigned long psize)
{
	unsigned long launch = get_atsd_launch_val(pid, psize);

	/* Write all VAs first */
	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start);

	/* Issue one barrier for all address writes */
	eieio();

	/* Launch */
	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
}

#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)

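/* Poll each acquired ATSD status register until all launched invalidates complete. */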
static void mmio_invalidate_wait(
	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
{
	struct npu *npu;
	int i, reg;

	/* Wait for all invalidations to complete */
	for (i = 0; i <= max_npu2_index; i++) {
		if (mmio_atsd_reg[i].reg < 0)
			continue;

		/* Wait for completion */
		npu = mmio_atsd_reg[i].npu;
		reg = mmio_atsd_reg[i].reg;
		while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
			cpu_relax();
	}
}

/*
 * Acquires all the address translation shootdown (ATSD) registers required to
 * launch an ATSD on all links this npu_context is active on.
 */
static void acquire_atsd_reg(struct npu_context *npu_context,
			struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
{
	int i, j;
	struct npu *npu;
	struct pci_dev *npdev;

	for (i = 0; i <= max_npu2_index; i++) {
		mmio_atsd_reg[i].reg = -1;
		for (j = 0; j < NV_MAX_LINKS; j++) {
			/*
			 * There are no ordering requirements with respect to
			 * the setup of struct npu_context, but to ensure
			 * consistent behaviour we need to ensure npdev[][] is
			 * only read once.
			 */
			npdev = READ_ONCE(npu_context->npdev[i][j]);
			if (!npdev)
				continue;

			npu = pci_bus_to_host(npdev->bus)->npu;
			mmio_atsd_reg[i].npu = npu;
			mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
			while (mmio_atsd_reg[i].reg < 0) {
				mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
				cpu_relax();
			}
			break;
		}
	}
}

/*
 * Release previously acquired ATSD registers. To avoid deadlocks the registers
 * must be released in the same order they were acquired above in
 * acquire_atsd_reg.
 */
static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
{
	int i;

	for (i = 0; i <= max_npu2_index; i++) {
		/*
		 * We can't rely on npu_context->npdev[][] being the same here
		 * as when acquire_atsd_reg() was called, hence we use the
		 * values stored in mmio_atsd_reg during the acquire phase
		 * rather than re-reading npdev[][].
		 */
		if (mmio_atsd_reg[i].reg < 0)
			continue;

		put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
	}
}

/*
 * Invalidate a virtual address range
 */
static void mmio_invalidate(struct npu_context *npu_context,
			unsigned long start, unsigned long size)
{
	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
	unsigned long pid = npu_context->mm->context.id;
	unsigned long atsd_start = 0;
	unsigned long end = start + size - 1;
	int atsd_psize = MMU_PAGE_COUNT;

	/*
	 * Convert the input range into one of the supported sizes. If the range
	 * doesn't fit, use the next larger supported size. Invalidation latency
	 * is high, so over-invalidation is preferred to issuing multiple
	 * invalidates.
	 *
	 * A 4K page size isn't supported by NPU/GPU ATS, so that case is
	 * ignored.
	 */
	if (size == SZ_64K) {
		atsd_start = start;
		atsd_psize = MMU_PAGE_64K;
	} else if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M)) {
		atsd_start = ALIGN_DOWN(start, SZ_2M);
		atsd_psize = MMU_PAGE_2M;
	} else if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G)) {
		atsd_start = ALIGN_DOWN(start, SZ_1G);
		atsd_psize = MMU_PAGE_1G;
	}

	if (npu_context->nmmu_flush)
		/*
		 * Unfortunately the nest mmu does not support flushing specific
		 * addresses so we have to flush the whole mm once before
		 * shooting down the GPU translation.
		 */
		flush_all_mm(npu_context->mm);

	/*
	 * Loop over all the NPUs this process is active on and launch
	 * an invalidate.
	 */
	acquire_atsd_reg(npu_context, mmio_atsd_reg);

	if (atsd_psize == MMU_PAGE_COUNT)
		mmio_invalidate_pid(mmio_atsd_reg, pid);
	else
		mmio_invalidate_range(mmio_atsd_reg, pid, atsd_start,
				atsd_psize);

	mmio_invalidate_wait(mmio_atsd_reg);

	/*
	 * The GPU requires two flush ATSDs to ensure all entries have been
	 * flushed. We use PID 0 as it will never be used for a process on the
	 * GPU.
	 */
	mmio_invalidate_pid(mmio_atsd_reg, 0);
	mmio_invalidate_wait(mmio_atsd_reg);
	mmio_invalidate_pid(mmio_atsd_reg, 0);
	mmio_invalidate_wait(mmio_atsd_reg);

	release_atsd_reg(mmio_atsd_reg);
}

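/*
 * mmu_notifier release callback: the mm is going away, so tell the device
 * driver to stop issuing translation requests and then shoot down any
 * remaining GPU TLB entries for this PID.
 */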
static void pnv_npu2_mn_release(struct mmu_notifier *mn,
				struct mm_struct *mm)
{
	struct npu_context *npu_context = mn_to_npu_context(mn);

	/* Call into device driver to stop requests to the NMMU */
	if (npu_context->release_cb)
		npu_context->release_cb(npu_context, npu_context->priv);

	/*
	 * There should be no more translation requests for this PID, but we
	 * need to ensure any entries for it are removed from the TLB.
	 */
	mmio_invalidate(npu_context, 0, ~0UL);
}

static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
				struct mm_struct *mm,
				unsigned long address,
				pte_t pte)
{
	struct npu_context *npu_context = mn_to_npu_context(mn);
	mmio_invalidate(npu_context, address, PAGE_SIZE);
}

static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
				struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	struct npu_context *npu_context = mn_to_npu_context(mn);
	mmio_invalidate(npu_context, start, end - start);
}

static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
	.release = pnv_npu2_mn_release,
	.change_pte = pnv_npu2_mn_change_pte,
	.invalidate_range = pnv_npu2_mn_invalidate_range,
};

/*
 * Call into OPAL to setup the nmmu context for the current task in
 * the NPU. This must be called to setup the context tables before the
 * GPU issues ATRs. pdev should be a pointer to the PCIe GPU device.
 *
 * A release callback should be registered to allow a device driver to
 * be notified that it should not launch any new translation requests
 * as the final TLB invalidate is about to occur.
 *
 * Returns an error if no contexts are currently available, otherwise an
 * npu_context which should be passed to pnv_npu2_handle_fault().
 *
 * mmap_sem must be held in write mode and this function must not be called
 * from interrupt context.
 */
struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
			unsigned long flags,
			void (*cb)(struct npu_context *, void *),
			void *priv)
{
	int rc;
	u32 nvlink_index;
	struct device_node *nvlink_dn;
	struct mm_struct *mm = current->mm;
	struct pnv_phb *nphb;
	struct npu *npu;
	struct npu_context *npu_context;
	struct pci_controller *hose;

	/*
	 * At present we don't support GPUs connected to multiple NPUs and I'm
	 * not sure the hardware does either.
	 */
	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);

	if (!firmware_has_feature(FW_FEATURE_OPAL))
		return ERR_PTR(-ENODEV);

	if (!npdev)
		/* No nvlink associated with this GPU device */
		return ERR_PTR(-ENODEV);

	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
							&nvlink_index)))
		return ERR_PTR(-ENODEV);

	if (!mm || mm->context.id == 0) {
		/*
		 * Kernel thread contexts are not supported and context id 0 is
		 * reserved on the GPU.
		 */
		return ERR_PTR(-EINVAL);
	}

	hose = pci_bus_to_host(npdev->bus);
	nphb = hose->private_data;
	npu = hose->npu;

	/*
	 * Setup the NPU context table for a particular GPU. These need to be
	 * per-GPU as we need the tables to filter ATSDs when there are no
	 * active contexts on a particular GPU. It is safe for these to be
	 * called concurrently with destroy as the OPAL call takes appropriate
	 * locks and refcounts on init/destroy.
	 */
	rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
	if (rc < 0)
		return ERR_PTR(-ENOSPC);

	/*
	 * We store the npu pci device so we can more easily get at the
	 * associated npus.
	 */
	spin_lock(&npu_context_lock);
	npu_context = mm->context.npu_context;
	if (npu_context) {
		if (npu_context->release_cb != cb ||
			npu_context->priv != priv) {
			spin_unlock(&npu_context_lock);
			opal_npu_destroy_context(nphb->opal_id, mm->context.id,
						PCI_DEVID(gpdev->bus->number,
							gpdev->devfn));
			return ERR_PTR(-EINVAL);
		}

		WARN_ON(!kref_get_unless_zero(&npu_context->kref));
	}
	spin_unlock(&npu_context_lock);

	if (!npu_context) {
		/*
		 * We can set up these fields without holding the
		 * npu_context_lock as the npu_context hasn't been returned to
		 * the caller meaning it can't be destroyed. Parallel allocation
		 * is protected against by mmap_sem.
		 */
		rc = -ENOMEM;
		npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
		if (npu_context) {
			kref_init(&npu_context->kref);
			npu_context->mm = mm;
			npu_context->mn.ops = &nv_nmmu_notifier_ops;
			rc = __mmu_notifier_register(&npu_context->mn, mm);
		}

		if (rc) {
			kfree(npu_context);
			opal_npu_destroy_context(nphb->opal_id, mm->context.id,
						PCI_DEVID(gpdev->bus->number,
							gpdev->devfn));
			return ERR_PTR(rc);
		}

		mm->context.npu_context = npu_context;
	}

	npu_context->release_cb = cb;
	npu_context->priv = priv;

	/*
	 * npdev is a pci_dev pointer setup by the PCI code. We assign it to
	 * npdev[][] to indicate to the mmu notifiers that an invalidation
	 * should also be sent over this nvlink. The notifiers don't use any
	 * other fields in npu_context, so we just need to ensure that when they
	 * dereference npu_context->npdev[][] it is either a valid pointer or
	 * NULL.
	 */
	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);

	if (!npu->nmmu_flush) {
		/*
		 * If we're not explicitly flushing ourselves we need to mark
		 * the thread for global flushes
		 */
		npu_context->nmmu_flush = false;
		mm_context_add_copro(mm);
	} else
		npu_context->nmmu_flush = true;

	return npu_context;
}
EXPORT_SYMBOL(pnv_npu2_init_context);

static void pnv_npu2_release_context(struct kref *kref)
{
	struct npu_context *npu_context =
		container_of(kref, struct npu_context, kref);

	if (!npu_context->nmmu_flush)
		mm_context_remove_copro(npu_context->mm);

	npu_context->mm->context.npu_context = NULL;
}

/*
 * Destroy a context on the given GPU. May free the npu_context if it is no
 * longer active on any GPUs. Must not be called from interrupt context.
 */
void pnv_npu2_destroy_context(struct npu_context *npu_context,
			struct pci_dev *gpdev)
{
	int removed;
	struct pnv_phb *nphb;
	struct npu *npu;
	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
	struct device_node *nvlink_dn;
	u32 nvlink_index;
	struct pci_controller *hose;

	if (WARN_ON(!npdev))
		return;

	if (!firmware_has_feature(FW_FEATURE_OPAL))
		return;

	hose = pci_bus_to_host(npdev->bus);
	nphb = hose->private_data;
	npu = hose->npu;
	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
							&nvlink_index)))
		return;
	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
	opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
	spin_lock(&npu_context_lock);
	removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
	spin_unlock(&npu_context_lock);

	/*
	 * We need to do this outside of pnv_npu2_release_context so that it is
	 * outside the spinlock as mmu_notifier_destroy uses SRCU.
	 */
	if (removed) {
		mmu_notifier_unregister(&npu_context->mn,
					npu_context->mm);

		kfree(npu_context);
	}
}
EXPORT_SYMBOL(pnv_npu2_destroy_context);

/*
 * Assumes mmap_sem is held for the context's associated mm.
 */
int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
			unsigned long *flags, unsigned long *status, int count)
{
	u64 rc = 0, result = 0;
	int i, is_write;
	struct page *page[1];

	/* mmap_sem should be held so the mm_struct must be present */
	struct mm_struct *mm = context->mm;

	if (!firmware_has_feature(FW_FEATURE_OPAL))
		return -ENODEV;

	WARN_ON(!rwsem_is_locked(&mm->mmap_sem));

	for (i = 0; i < count; i++) {
		is_write = flags[i] & NPU2_WRITE;
		rc = get_user_pages_remote(NULL, mm, ea[i], 1,
					is_write ? FOLL_WRITE : 0,
					page, NULL, NULL);

		/*
		 * To support virtualised environments we will have to do an
		 * access to the page to ensure it gets faulted into the
		 * hypervisor. For the moment virtualisation is not supported in
		 * other areas so leave the access out.
		 */
		if (rc != 1) {
			status[i] = rc;
			result = -EFAULT;
			continue;
		}

		status[i] = 0;
		put_page(page[0]);
	}

	return result;
}
EXPORT_SYMBOL(pnv_npu2_handle_fault);

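/*
 * Probe the NPU: map each GPU behind this PHB to the LPAR, ioremap the
 * "ibm,mmio-atsd" registers and hang the resulting struct npu off the
 * hose (pci_controller).
 */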
int pnv_npu2_init(struct pnv_phb *phb)
{
	unsigned int i;
	u64 mmio_atsd;
	struct device_node *dn;
	struct pci_dev *gpdev;
	static int npu_index;
	uint64_t rc = 0;
	struct pci_controller *hose = phb->hose;
	struct npu *npu;
	int ret;

	npu = kzalloc(sizeof(*npu), GFP_KERNEL);
	if (!npu)
		return -ENOMEM;

	npu->nmmu_flush = of_property_read_bool(hose->dn, "ibm,nmmu-flush");
	for_each_child_of_node(phb->hose->dn, dn) {
		gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
		if (gpdev) {
			rc = opal_npu_map_lpar(phb->opal_id,
				PCI_DEVID(gpdev->bus->number, gpdev->devfn),
				0, 0);
			if (rc)
				dev_err(&gpdev->dev,
					"Error %lld mapping device to LPAR\n",
					rc);
		}
	}

	for (i = 0; !of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
							i, &mmio_atsd); i++)
		npu->mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);

	pr_info("NPU%d: Found %d MMIO ATSD registers", hose->global_number, i);
	npu->mmio_atsd_count = i;
	npu->mmio_atsd_usage = 0;
	npu_index++;
	if (WARN_ON(npu_index >= NV_MAX_NPUS)) {
		ret = -ENOSPC;
		goto fail_exit;
	}
	max_npu2_index = npu_index;
	npu->index = npu_index;
	hose->npu = npu;

	return 0;

fail_exit:
	for (i = 0; i < npu->mmio_atsd_count; ++i)
		iounmap(npu->mmio_atsd_regs[i]);

	kfree(npu);

	return ret;
}