]> git.proxmox.com Git - pve-kernel.git/blob - patches/kernel/0008-UBUNTU-SAUCE-Add-mdev_set_iommu_device-kABI.patch
buildsys: improve DSC target
[pve-kernel.git] / patches / kernel / 0008-UBUNTU-SAUCE-Add-mdev_set_iommu_device-kABI.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Tarun Gupta <targupta@nvidia.com>
3 Date: Thu, 18 May 2023 16:48:09 +0530
4 Subject: [PATCH] UBUNTU: SAUCE: Add mdev_set_iommu_device() kABI.
5
6 With the commit below, present in upstream kernels from 5.16 onwards,
7 the mdev_set_iommu_device() kABI has been removed from the kernel due to
8 a lack of in-tree vendor drivers using it.
9
10 fda49d97f2c4 ("vfio: remove the unused mdev iommu hook")
11
12 This patch partially reverts the above commit so that the
13 mdev_set_iommu_device() kABI is still supported with HWE kernels for
14 Ubuntu 22.04. This partial revert does not add back the code for the
15 "aux" variants (IOMMU_DEV_FEAT_AUX) present in
16 vfio_mdev_[attach|detach]_domain, as this support was never added by any
17 in-tree driver or known out-of-tree driver. Nvidia vGPU doesn't make use
18 of the IOMMU_DEV_FEAT_AUX feature.
19
20 Also, it adds back the vfio_bus_is_mdev() function, which was removed in
21 the commit below as there were no users of it. This patch adds it back to
22 detect whether a device is an mdev device.
23
24 c3c0fa9d94f7 ("vfio: clean up the check for mediated device in
25 vfio_iommu_type1")
26
27 Also, in the v6.2 kernel, the "mdev_bus_type" struct was unexported as
28 part of the commit below because it was used only in mdev.ko. But for
29 vGPU, as mentioned above, since we use the vfio_bus_is_mdev() function
30 in vfio_iommu_type1.ko, we need to export the "mdev_bus_type" struct again.
31
32 2815fe149ffa ("vfio/mdev: unexport mdev_bus_type")
33
34 It is not a clean revert in the vfio_iommu_type1_attach_group() function,
35 as that function has changed in the v6.2 upstream kernel compared to when
36 the mdev_set_iommu_device() kABI was removed in the 5.16 kernel.
37 In the 5.19 kernel, VFIO_EMULATED_IOMMU handling was introduced in
38 vfio_iommu_type1_attach_group(), which was not present when the kABI
39 was removed in the 5.16 kernel.
40
41 But the logic remains the same. The logic here is: if this is a
42 vfio-mdev device, then check by calling vfio_mdev_iommu_device() whether
43 this mdev device already has a backing IOMMU device (which will be provided
44 through the mdev_set_iommu_device() kABI by the vendor driver). If the mdev
45 device has a backing IOMMU device, then use that device's IOMMU domain.
46
47 This kABI is used by SRIOV based Nvidia vGPU to pin all guest sysmem on
48 VF during vGPU VM boot. With this patch, SRIOV based Nvidia vGPU will
49 continue to work with upstream kernels. Nvidia vGPU driver calls
50 mdev_set_iommu_device() for mdev device with VF as the backing IOMMU.
51
52 Separately, to fix this in upstream kernels, vGPU is planning to adopt the
53 vfio-pci-core framework instead of using the MDEV framework.
54 Currently, the vfio-pci-core framework works with SRIOV vGPU but lacks
55 libvirt support to assign VFs using the vfio-pci-core framework.
56 We will work with the upstream libvirt community to get libvirt support for
57 vfio-pci-core devices. After that, this custom mdev patch will no longer be
58 needed and vGPU can work out of the box on Ubuntu with the vfio-pci-core framework.
59
60 BugLink : https://bugs.launchpad.net/bugs/1988806
61
62 Signed-off-by: Tarun Gupta <targupta@nvidia.com>
63 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
64 ---
65 drivers/vfio/mdev/mdev_driver.c | 1 +
66 drivers/vfio/mdev/mdev_private.h | 1 -
67 drivers/vfio/vfio_iommu_type1.c | 126 ++++++++++++++++++++++++++++---
68 include/linux/mdev.h | 22 ++++++
69 4 files changed, 140 insertions(+), 10 deletions(-)
70
71 diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
72 index 7825d83a55f8..a4799e7d79fc 100644
73 --- a/drivers/vfio/mdev/mdev_driver.c
74 +++ b/drivers/vfio/mdev/mdev_driver.c
75 @@ -46,6 +46,7 @@ struct bus_type mdev_bus_type = {
76 .remove = mdev_remove,
77 .match = mdev_match,
78 };
79 +EXPORT_SYMBOL_GPL(mdev_bus_type);
80
81 /**
82 * mdev_register_driver - register a new MDEV driver
83 diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
84 index af457b27f607..ba1b2dbddc0b 100644
85 --- a/drivers/vfio/mdev/mdev_private.h
86 +++ b/drivers/vfio/mdev/mdev_private.h
87 @@ -13,7 +13,6 @@
88 int mdev_bus_register(void);
89 void mdev_bus_unregister(void);
90
91 -extern struct bus_type mdev_bus_type;
92 extern const struct attribute_group *mdev_device_groups[];
93
94 #define to_mdev_type_attr(_attr) \
95 diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
96 index 7fa68dc4e938..fef221a87aa7 100644
97 --- a/drivers/vfio/vfio_iommu_type1.c
98 +++ b/drivers/vfio/vfio_iommu_type1.c
99 @@ -36,6 +36,7 @@
100 #include <linux/uaccess.h>
101 #include <linux/vfio.h>
102 #include <linux/workqueue.h>
103 +#include <linux/mdev.h>
104 #include <linux/notifier.h>
105 #include <linux/irqdomain.h>
106 #include "vfio.h"
107 @@ -115,6 +116,7 @@ struct vfio_batch {
108 struct vfio_iommu_group {
109 struct iommu_group *iommu_group;
110 struct list_head next;
111 + bool mdev_group;
112 bool pinned_page_dirty_scope;
113 };
114
115 @@ -1744,6 +1746,18 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
116 return ret;
117 }
118
119 +static int vfio_bus_type(struct device *dev, void *data)
120 +{
121 + struct bus_type **bus = data;
122 +
123 + if (*bus && *bus != dev->bus)
124 + return -EINVAL;
125 +
126 + *bus = dev->bus;
127 +
128 + return 0;
129 +}
130 +
131 static int vfio_iommu_replay(struct vfio_iommu *iommu,
132 struct vfio_domain *domain)
133 {
134 @@ -1992,6 +2006,81 @@ static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
135 return ret;
136 }
137
138 +static int vfio_mdev_attach_domain(struct device *dev, void *data)
139 +{
140 + struct mdev_device *mdev = to_mdev_device(dev);
141 + struct iommu_domain *domain = data;
142 + struct device *iommu_device;
143 +
144 + iommu_device = mdev_get_iommu_device(mdev);
145 + if (iommu_device)
146 + return iommu_attach_device(domain, iommu_device);
147 +
148 + return -EINVAL;
149 +}
150 +
151 +static int vfio_mdev_detach_domain(struct device *dev, void *data)
152 +{
153 + struct mdev_device *mdev = to_mdev_device(dev);
154 + struct iommu_domain *domain = data;
155 + struct device *iommu_device;
156 +
157 + iommu_device = mdev_get_iommu_device(mdev);
158 + if (iommu_device)
159 + iommu_detach_device(domain, iommu_device);
160 +
161 + return 0;
162 +}
163 +
164 +static int vfio_iommu_attach_group(struct vfio_domain *domain,
165 + struct vfio_iommu_group *group)
166 +{
167 + if (group->mdev_group)
168 + return iommu_group_for_each_dev(group->iommu_group,
169 + domain->domain,
170 + vfio_mdev_attach_domain);
171 + else
172 + return iommu_attach_group(domain->domain, group->iommu_group);
173 +}
174 +
175 +static void vfio_iommu_detach_group(struct vfio_domain *domain,
176 + struct vfio_iommu_group *group)
177 +{
178 + if (group->mdev_group)
179 + iommu_group_for_each_dev(group->iommu_group, domain->domain,
180 + vfio_mdev_detach_domain);
181 + else
182 + iommu_detach_group(domain->domain, group->iommu_group);
183 +}
184 +
185 +static bool vfio_bus_is_mdev(struct bus_type *bus)
186 +{
187 + struct bus_type *mdev_bus;
188 + bool ret = false;
189 +
190 + mdev_bus = symbol_get(mdev_bus_type);
191 + if (mdev_bus) {
192 + ret = (bus == mdev_bus);
193 + symbol_put(mdev_bus_type);
194 + }
195 +
196 + return ret;
197 +}
198 +
199 +static int vfio_mdev_iommu_device(struct device *dev, void *data)
200 +{
201 + struct mdev_device *mdev = to_mdev_device(dev);
202 + struct device **old = data, *new;
203 +
204 + new = mdev_get_iommu_device(mdev);
205 + if (!new || (*old && *old != new))
206 + return -EINVAL;
207 +
208 + *old = new;
209 +
210 + return 0;
211 +}
212 +
213 /*
214 * This is a helper function to insert an address range to iova list.
215 * The list is initially created with a single entry corresponding to
216 @@ -2260,6 +2349,25 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
217 group->iommu_group = iommu_group;
218
219 if (type == VFIO_EMULATED_IOMMU) {
220 + struct bus_type *bus = NULL;
221 +
222 + ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
223 +
224 + if (!ret && vfio_bus_is_mdev(bus)) {
225 + struct device *iommu_device = NULL;
226 +
227 + group->mdev_group = true;
228 +
229 + /* Determine the isolation type */
230 + ret = iommu_group_for_each_dev(iommu_group,
231 + &iommu_device,
232 + vfio_mdev_iommu_device);
233 + if (!ret && iommu_device) {
234 + iommu_group = iommu_device->iommu_group;
235 + goto mdev_iommu_device;
236 + }
237 + }
238 +
239 list_add(&group->next, &iommu->emulated_iommu_groups);
240 /*
241 * An emulated IOMMU group cannot dirty memory directly, it can
242 @@ -2272,6 +2380,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
243 goto out_unlock;
244 }
245
246 +mdev_iommu_device:
247 +
248 ret = -ENOMEM;
249 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
250 if (!domain)
251 @@ -2294,7 +2404,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
252 goto out_domain;
253 }
254
255 - ret = iommu_attach_group(domain->domain, group->iommu_group);
256 + ret = vfio_iommu_attach_group(domain, group);
257 if (ret)
258 goto out_domain;
259
260 @@ -2370,17 +2480,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
261 if (d->domain->ops == domain->domain->ops &&
262 d->enforce_cache_coherency ==
263 domain->enforce_cache_coherency) {
264 - iommu_detach_group(domain->domain, group->iommu_group);
265 - if (!iommu_attach_group(d->domain,
266 - group->iommu_group)) {
267 + vfio_iommu_detach_group(domain, group);
268 + if (!vfio_iommu_attach_group(d, group)) {
269 list_add(&group->next, &d->group_list);
270 iommu_domain_free(domain->domain);
271 kfree(domain);
272 goto done;
273 }
274
275 - ret = iommu_attach_group(domain->domain,
276 - group->iommu_group);
277 + ret = vfio_iommu_attach_group(domain, group);
278 if (ret)
279 goto out_domain;
280 }
281 @@ -2417,7 +2525,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
282 return 0;
283
284 out_detach:
285 - iommu_detach_group(domain->domain, group->iommu_group);
286 + vfio_iommu_detach_group(domain, group);
287 out_domain:
288 iommu_domain_free(domain->domain);
289 vfio_iommu_iova_free(&iova_copy);
290 @@ -2578,7 +2686,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
291 if (!group)
292 continue;
293
294 - iommu_detach_group(domain->domain, group->iommu_group);
295 + vfio_iommu_detach_group(domain, group);
296 update_dirty_scope = !group->pinned_page_dirty_scope;
297 list_del(&group->next);
298 kfree(group);
299 @@ -2669,7 +2777,7 @@ static void vfio_release_domain(struct vfio_domain *domain)
300
301 list_for_each_entry_safe(group, group_tmp,
302 &domain->group_list, next) {
303 - iommu_detach_group(domain->domain, group->iommu_group);
304 + vfio_iommu_detach_group(domain, group);
305 list_del(&group->next);
306 kfree(group);
307 }
308 diff --git a/include/linux/mdev.h b/include/linux/mdev.h
309 index 139d05b26f82..b08163d67e63 100644
310 --- a/include/linux/mdev.h
311 +++ b/include/linux/mdev.h
312 @@ -20,6 +20,7 @@ struct mdev_device {
313 guid_t uuid;
314 struct list_head next;
315 struct mdev_type *type;
316 + struct device *iommu_device;
317 bool active;
318 };
319
320 @@ -53,6 +54,25 @@ static inline struct mdev_device *to_mdev_device(struct device *dev)
321 return container_of(dev, struct mdev_device, dev);
322 }
323
324 +/*
325 + * Called by the parent device driver to set the device which represents
326 + * this mdev in iommu protection scope. By default, the iommu device is
327 + * NULL, that indicates using vendor defined isolation.
328 + *
329 + * @mdev: the mediated device that iommu will isolate.
330 + * @iommu_device: a pci device which represents the iommu for @dev.
331 + */
332 +static inline void mdev_set_iommu_device(struct mdev_device *mdev,
333 + struct device *iommu_device)
334 +{
335 + mdev->iommu_device = iommu_device;
336 +}
337 +
338 +static inline struct device *mdev_get_iommu_device(struct mdev_device *mdev)
339 +{
340 + return mdev->iommu_device;
341 +}
342 +
343 /**
344 * struct mdev_driver - Mediated device driver
345 * @device_api: string to return for the device_api sysfs
346 @@ -73,6 +93,8 @@ struct mdev_driver {
347 struct device_driver driver;
348 };
349
350 +extern struct bus_type mdev_bus_type;
351 +
352 int mdev_register_parent(struct mdev_parent *parent, struct device *dev,
353 struct mdev_driver *mdev_driver, struct mdev_type **types,
354 unsigned int nr_types);