/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

/* A devmap's primary use is as a backing map for the XDP BPF helper
 * bpf_redirect_map(). Because XDP is mostly concerned with performance,
 * some effort was spent to ensure the datapath with redirect maps does not
 * take any locks. This is a quick note on the details.
 *
 * There are three possible paths into the devmap control plane: bpf
 * syscalls, bpf programs, and driver-side xmit/flush operations. A bpf
 * syscall will invoke an update, delete, or lookup operation. To ensure
 * updates and deletes appear atomic from the datapath side, xchg() is used
 * to modify the netdev_map array. Because the datapath does a read-only
 * lookup into the netdev_map array from an RCU critical section, call_rcu()
 * is used to wait for an RCU grace period before freeing the old data
 * structures. This ensures the datapath always has a valid copy. However,
 * the datapath also does a "flush" operation that pushes any pending
 * packets in the driver outside the RCU critical section. Each
 * bpf_dtab_netdev tracks these pending operations using an atomic per-cpu
 * bitmap. The bpf_dtab_netdev object will not be destroyed until all bits
 * are cleared, indicating that the outstanding flush operations have
 * completed.
 *
 * BPF syscalls may race with BPF program calls on any of the update, delete
 * or lookup operations. As noted above, the xchg() operation also keeps the
 * netdev_map consistent in this case. From the devmap's point of view, BPF
 * programs calling into these operations look the same as multiple user
 * space threads making system calls.
 *
 * Finally, any of the above may race with a netdev_unregister notifier. The
 * unregister notifier must search the map structure for entries that hold a
 * reference to the net device being removed. This is a two-step process:
 * (a) dereference the bpf_dtab_netdev object in netdev_map and (b) check
 * whether it refers to the net_device being removed. When removing the dev,
 * a cmpxchg() is used to ensure the correct dev is removed; in the case of
 * a concurrent update or delete operation it is possible that the initially
 * referenced dev is no longer in the map. While the notifier hook walks the
 * map we know that no new dev references can be added by the user, because
 * core infrastructure ensures dev_get_by_index() calls will fail at this
 * point.
 */
#include <linux/bpf.h>
#include <linux/filter.h>

#define DEV_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

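/* Illustrative sketch (not part of this file): a minimal XDP program that
 * consumes a devmap through bpf_redirect_map(), the primary use described
 * in the comment above. Written in the samples/bpf style of this era; the
 * map name, section name and max_entries value are hypothetical.
 */
#if 0
struct bpf_map_def SEC("maps") tx_ports = {
	.type		= BPF_MAP_TYPE_DEVMAP,
	.key_size	= sizeof(u32),	/* slot index */
	.value_size	= sizeof(u32),	/* ifindex */
	.max_entries	= 64,
};

SEC("xdp_redirect_map")
int xdp_redirect_map_prog(struct xdp_md *ctx)
{
	/* Redirect every frame to whatever device user space stored at
	 * slot 0 of tx_ports.
	 */
	return bpf_redirect_map(&tx_ports, 0, 0);
}
#endif
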
struct bpf_dtab_netdev {
	struct net_device *dev;
	struct bpf_dtab *dtab;
	unsigned int bit;
	struct rcu_head rcu;
};

struct bpf_dtab {
	struct bpf_map map;
	struct bpf_dtab_netdev **netdev_map;
	unsigned long __percpu *flush_needed;
	struct list_head list;
};

static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);

static u64 dev_map_bitmap_size(const union bpf_attr *attr)
{
	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
}

static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
	struct bpf_dtab *dtab;
	int err = -EINVAL;
	u64 cost;

	if (!capable(CAP_NET_ADMIN))
		return ERR_PTR(-EPERM);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	dtab = kzalloc(sizeof(*dtab), GFP_USER);
	if (!dtab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	dtab->map.map_type = attr->map_type;
	dtab->map.key_size = attr->key_size;
	dtab->map.value_size = attr->value_size;
	dtab->map.max_entries = attr->max_entries;
	dtab->map.map_flags = attr->map_flags;
	dtab->map.numa_node = bpf_map_attr_numa_node(attr);

	/* make sure page count doesn't overflow */
	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_dtab;

	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(dtab->map.pages);
	if (err)
		goto free_dtab;

	err = -ENOMEM;

	/* A per cpu bitfield with a bit per possible net device */
	dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
						__alignof__(unsigned long),
						GFP_KERNEL | __GFP_NOWARN);
	if (!dtab->flush_needed)
		goto free_dtab;

	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
					      sizeof(struct bpf_dtab_netdev *),
					      dtab->map.numa_node);
	if (!dtab->netdev_map)
		goto free_dtab;

	spin_lock(&dev_map_lock);
	list_add_tail_rcu(&dtab->list, &dev_map_list);
	spin_unlock(&dev_map_lock);

	return &dtab->map;
free_dtab:
	free_percpu(dtab->flush_needed);
	kfree(dtab);
	return ERR_PTR(err);
}

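/* Illustrative sketch (not part of this file): creating a devmap from user
 * space with the libbpf wrapper of this era. The sizes mirror the sanity
 * checks in dev_map_alloc() above: 4-byte keys (slot index) and 4-byte
 * values (ifindex). The variable names are hypothetical.
 */
#if 0
	int map_fd;

	map_fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(__u32),
				sizeof(__u32), 64, 0);
	if (map_fd < 0)
		/* e.g. -EPERM without CAP_NET_ADMIN, see dev_map_alloc() */
		perror("bpf_create_map");
#endif
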
static void dev_map_free(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	int i, cpu;

	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (there can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete. The rcu critical section only guarantees
	 * no further reads against netdev_map. It does __not__ ensure pending
	 * flush operations (if any) are complete.
	 */

	spin_lock(&dev_map_lock);
	list_del_rcu(&dtab->list);
	spin_unlock(&dev_map_lock);

	synchronize_rcu();

	/* Make sure prior __dev_map_entry_free() callbacks have completed. */
	rcu_barrier();

	/* To ensure all pending flush operations have completed, wait for the
	 * flush bitmap to indicate all flush_needed bits are zero on _all_
	 * cpus. Because the above synchronize_rcu() ensures the map is
	 * disconnected from the program we can assume no new bits will be set.
	 */
	for_each_online_cpu(cpu) {
		unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);

		while (!bitmap_empty(bitmap, dtab->map.max_entries))
			cond_resched();
	}

	for (i = 0; i < dtab->map.max_entries; i++) {
		struct bpf_dtab_netdev *dev;

		dev = dtab->netdev_map[i];
		if (!dev)
			continue;

		dev_put(dev->dev);
		kfree(dev);
	}

	free_percpu(dtab->flush_needed);
	bpf_map_area_free(dtab->netdev_map);
	kfree(dtab);
}

static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = next_key;

	if (index >= dtab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == dtab->map.max_entries - 1)
		return -ENOENT;
	*next = index + 1;
	return 0;
}

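/* Illustrative sketch (not part of this file): how user space can walk the
 * map with the get_next_key semantics implemented above. An out-of-range
 * probe key makes dev_map_get_next_key() restart at slot 0, and it returns
 * -ENOENT once the last slot has been handed out. map_fd is hypothetical.
 */
#if 0
	__u32 key = -1, next_key, ifindex;

	while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) {
		/* Empty slots exist in the key space but fail the lookup. */
		if (bpf_map_lookup_elem(map_fd, &next_key, &ifindex) == 0)
			printf("slot %u -> ifindex %u\n", next_key, ifindex);
		key = next_key;
	}
#endif
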
void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);

	__set_bit(bit, bitmap);
}

/* __dev_map_flush is called from xdp_do_flush_map(), which _must_ be called
 * by the driver before returning from its napi->poll() routine. The poll()
 * routine is called either from busy_poll context or net_rx_action signaled
 * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
 * net device can be torn down. On devmap tear down we wait for the ctx
 * bitmap to be zeroed before completing, to ensure all flush operations
 * have finished.
 */
void __dev_map_flush(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
	u32 bit;

	for_each_set_bit(bit, bitmap, map->max_entries) {
		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
		struct net_device *netdev;

		/* This is possible if the dev entry is removed by user space
		 * between xdp redirect and flush op.
		 */
		if (unlikely(!dev))
			continue;

		netdev = dev->dev;
		if (likely(netdev->netdev_ops->ndo_xdp_flush))
			netdev->netdev_ops->ndo_xdp_flush(netdev);

		__clear_bit(bit, bitmap);
	}
}

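/* Illustrative sketch (not part of this file): the driver-side contract the
 * comment above describes. A driver that saw XDP_REDIRECT during its RX loop
 * must call xdp_do_flush_map() before leaving napi->poll(), which is what
 * ends up invoking __dev_map_flush(). Everything here other than
 * xdp_do_redirect() and xdp_do_flush_map() is a hypothetical, heavily
 * trimmed skeleton.
 */
#if 0
	bool xdp_redirect_seen = false;

	/* inside the driver's per-packet RX loop */
	switch (act) {
	case XDP_REDIRECT:
		if (!xdp_do_redirect(netdev, &xdp, xdp_prog))
			xdp_redirect_seen = true;
		break;
	/* ... other verdicts ... */
	}

	/* before napi->poll() returns */
	if (xdp_redirect_seen)
		xdp_do_flush_map();
#endif
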
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete
 * and/or update happens in parallel, a dev_put() won't happen until after
 * the ifindex has been read.
 */
struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dev;

	if (key >= map->max_entries)
		return NULL;

	dev = READ_ONCE(dtab->netdev_map[key]);
	return dev ? dev->dev : NULL;
}

static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);

	return dev ? &dev->ifindex : NULL;
}

static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
	if (dev->dev->netdev_ops->ndo_xdp_flush) {
		struct net_device *fl = dev->dev;
		unsigned long *bitmap;
		int cpu;

		for_each_online_cpu(cpu) {
			bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
			__clear_bit(dev->bit, bitmap);

			fl->netdev_ops->ndo_xdp_flush(dev->dev);
		}
	}
}

static void __dev_map_entry_free(struct rcu_head *rcu)
{
	struct bpf_dtab_netdev *dev;

	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
	dev_map_flush_old(dev);
	dev_put(dev->dev);
	kfree(dev);
}

static int dev_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *old_dev;
	int k = *(u32 *)key;

	if (k >= map->max_entries)
		return -EINVAL;

	/* Use call_rcu() here to ensure any rcu critical sections have
	 * completed; this does not guarantee a flush has happened yet,
	 * because the driver-side rcu_read_lock/unlock only protects the
	 * running XDP program. For pending flush operations, however, the
	 * dev and ctx are stored in another per-cpu structure, and driver
	 * tear down ensures all soft irqs are complete before the net
	 * device is removed once its refcount drops to zero.
	 */
	old_dev = xchg(&dtab->netdev_map[k], NULL);
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
	return 0;
}

static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
			       u64 map_flags)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct net *net = current->nsproxy->net_ns;
	struct bpf_dtab_netdev *dev, *old_dev;
	u32 i = *(u32 *)key;
	u32 ifindex = *(u32 *)value;

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;
	if (unlikely(i >= dtab->map.max_entries))
		return -E2BIG;
	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;

	if (!ifindex) {
		dev = NULL;
	} else {
		dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
				   map->numa_node);
		if (!dev)
			return -ENOMEM;

		dev->dev = dev_get_by_index(net, ifindex);
		if (!dev->dev) {
			kfree(dev);
			return -EINVAL;
		}

		dev->bit = i;
		dev->dtab = dtab;
	}

	/* Use call_rcu() here to ensure rcu critical sections have completed.
	 * Remember that the driver-side flush operation will happen before
	 * the net device is removed.
	 */
	old_dev = xchg(&dtab->netdev_map[i], dev);
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);

	return 0;
}

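/* Illustrative sketch (not part of this file): populating and clearing a
 * slot from user space, matching dev_map_update_elem() and
 * dev_map_delete_elem() above. The value is the target ifindex; writing
 * ifindex 0 stores a NULL entry. Uses the libbpf wrappers of this era;
 * map_fd and the device name are hypothetical.
 */
#if 0
	__u32 key = 0;
	__u32 ifindex = if_nametoindex("eth1");

	/* Bind slot 0 to eth1; BPF_NOEXIST is rejected with -EEXIST here. */
	if (bpf_map_update_elem(map_fd, &key, &ifindex, BPF_ANY))
		perror("bpf_map_update_elem");

	/* Later: release the reference on the net_device again. */
	if (bpf_map_delete_elem(map_fd, &key))
		perror("bpf_map_delete_elem");
#endif
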
const struct bpf_map_ops dev_map_ops = {
	.map_alloc = dev_map_alloc,
	.map_free = dev_map_free,
	.map_get_next_key = dev_map_get_next_key,
	.map_lookup_elem = dev_map_lookup_elem,
	.map_update_elem = dev_map_update_elem,
	.map_delete_elem = dev_map_delete_elem,
};

static int dev_map_notification(struct notifier_block *notifier,
				ulong event, void *ptr)
{
	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
	struct bpf_dtab *dtab;
	int i;

	switch (event) {
	case NETDEV_UNREGISTER:
		/* This rcu_read_lock/unlock pair is needed because
		 * dev_map_list is an RCU list AND to ensure a delete
		 * operation does not free a netdev_map entry while we
		 * are comparing it against the netdev being unregistered.
		 */
		rcu_read_lock();
		list_for_each_entry_rcu(dtab, &dev_map_list, list) {
			for (i = 0; i < dtab->map.max_entries; i++) {
				struct bpf_dtab_netdev *dev, *odev;

				dev = READ_ONCE(dtab->netdev_map[i]);
				if (!dev || netdev != dev->dev)
					continue;
				odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
				if (dev == odev)
					call_rcu(&dev->rcu,
						 __dev_map_entry_free);
			}
		}
		rcu_read_unlock();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block dev_map_notifier = {
	.notifier_call = dev_map_notification,
};

static int __init dev_map_init(void)
{
	register_netdevice_notifier(&dev_map_notifier);
	return 0;
}

subsys_initcall(dev_map_init);