/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

/* The devmap's primary use is as a backend map for the XDP BPF helper call
 * bpf_redirect_map(). Because XDP is mostly concerned with performance we
 * have spent some effort to ensure the datapath with redirect maps does not
 * use any locking. This is a quick note on the details.
 *
 * We have three possible paths to get into the devmap control plane:
 * bpf syscalls, bpf programs, and driver side xmit/flush operations. A bpf
 * syscall will invoke an update, delete, or lookup operation. To ensure
 * updates and deletes appear atomic from the datapath side, xchg() is used
 * to modify the netdev_map array. Then, because the datapath does a lookup
 * into the netdev_map array (read-only) from an RCU critical section, we use
 * call_rcu() to wait for an RCU grace period before freeing the old data
 * structures. This ensures the datapath always has a valid copy. However,
 * the datapath does a "flush" operation that pushes any pending packets in
 * the driver outside the RCU critical section. Each bpf_dtab_netdev tracks
 * these pending operations using an atomic per-cpu bitmap. The
 * bpf_dtab_netdev object will not be destroyed until all bits are cleared,
 * indicating that all outstanding flush operations have completed.
 *
 * BPF syscalls may race with BPF program calls on any of the update, delete
 * or lookup operations. As noted above, the xchg() operation also keeps the
 * netdev_map consistent in this case. From the devmap side, BPF programs
 * calling into these operations are the same as multiple user space threads
 * making system calls.
 *
 * Finally, any of the above may race with a netdev_unregister notifier. The
 * unregister notifier must search the map structure for entries that hold a
 * reference to the net device being removed and delete them. This is a two
 * step process: (a) dereference the bpf_dtab_netdev object in netdev_map and
 * (b) check whether its ifindex is the same as the net_device being removed.
 * When removing the dev, a cmpxchg() is used to ensure the correct dev is
 * removed; in the case of a concurrent update or delete operation it is
 * possible that the initially referenced dev is no longer in the map. As the
 * notifier hook walks the map we know that new dev references cannot be
 * added by the user because core infrastructure ensures dev_get_by_index()
 * calls will fail at this point.
 */
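
/* A minimal sketch of the BPF program side, for illustration only: the map
 * name "tx_port", the section names, and the fixed index 0 are assumptions
 * in the style of samples/bpf, not part of this file. The program redirects
 * every packet to the netdev stored at slot 0 of a BPF_MAP_TYPE_DEVMAP:
 *
 *	struct bpf_map_def SEC("maps") tx_port = {
 *		.type        = BPF_MAP_TYPE_DEVMAP,
 *		.key_size    = sizeof(u32),
 *		.value_size  = sizeof(u32),
 *		.max_entries = 64,
 *	};
 *
 *	SEC("xdp")
 *	int xdp_redirect_prog(struct xdp_md *ctx)
 *	{
 *		return bpf_redirect_map(&tx_port, 0, 0);
 *	}
 */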
#include <linux/bpf.h>
#include <linux/filter.h>

struct bpf_dtab_netdev {
	struct net_device *dev;
	struct bpf_dtab *dtab;
	unsigned int bit;
	struct rcu_head rcu;
};

struct bpf_dtab {
	struct bpf_map map;
	struct bpf_dtab_netdev **netdev_map;
	unsigned long __percpu *flush_needed;
	struct list_head list;
};

static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);

static u64 dev_map_bitmap_size(const union bpf_attr *attr)
{
	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
}

static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
	struct bpf_dtab *dtab;
	int err = -EINVAL;
	u64 cost;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
		return ERR_PTR(-EINVAL);

	dtab = kzalloc(sizeof(*dtab), GFP_USER);
	if (!dtab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	dtab->map.map_type = attr->map_type;
	dtab->map.key_size = attr->key_size;
	dtab->map.value_size = attr->value_size;
	dtab->map.max_entries = attr->max_entries;
	dtab->map.map_flags = attr->map_flags;
	dtab->map.numa_node = bpf_map_attr_numa_node(attr);

	/* make sure page count doesn't overflow */
	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
	cost += dev_map_bitmap_size(attr) * num_possible_cpus();
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_dtab;

	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(dtab->map.pages);
	if (err)
		goto free_dtab;

	err = -ENOMEM;

	/* A per cpu bitfield with a bit per possible net device */
	dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
					    __alignof__(unsigned long));
	if (!dtab->flush_needed)
		goto free_dtab;

	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
					      sizeof(struct bpf_dtab_netdev *),
					      dtab->map.numa_node);
	if (!dtab->netdev_map)
		goto free_dtab;

	spin_lock(&dev_map_lock);
	list_add_tail_rcu(&dtab->list, &dev_map_list);
	spin_unlock(&dev_map_lock);

	return &dtab->map;
free_dtab:
	free_percpu(dtab->flush_needed);
	kfree(dtab);
	return ERR_PTR(err);
}

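/* A minimal user space sketch of creating this map type with the attribute
 * constraints enforced above. The raw bpf(2) syscall is shown for clarity
 * and the max_entries value is illustrative:
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_DEVMAP,
 *		.key_size    = 4,
 *		.value_size  = 4,
 *		.max_entries = 64,
 *	};
 *	int fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 */
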
static void dev_map_free(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	int i, cpu;

	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete. The rcu critical section only guarantees
	 * no further reads against netdev_map. It does __not__ ensure pending
	 * flush operations (if any) are complete.
	 */

	spin_lock(&dev_map_lock);
	list_del_rcu(&dtab->list);
	spin_unlock(&dev_map_lock);

	synchronize_rcu();

	/* To ensure all pending flush operations have completed, wait for the
	 * flush bitmap to show that all flush_needed bits are zero on _all_
	 * cpus. Because the above synchronize_rcu() ensures the map is
	 * disconnected from the program we can assume no new bits will be set.
	 */
	for_each_online_cpu(cpu) {
		unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);

		while (!bitmap_empty(bitmap, dtab->map.max_entries))
			cond_resched();
	}

	for (i = 0; i < dtab->map.max_entries; i++) {
		struct bpf_dtab_netdev *dev;

		dev = dtab->netdev_map[i];
		if (!dev)
			continue;

		dev_put(dev->dev);
		kfree(dev);
	}

	free_percpu(dtab->flush_needed);
	bpf_map_area_free(dtab->netdev_map);
	kfree(dtab);
}

static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = next_key;

	if (index >= dtab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == dtab->map.max_entries - 1)
		return -ENOENT;
	*next = index + 1;
	return 0;
}

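/* User space can walk every slot with BPF_MAP_GET_NEXT_KEY; per the logic
 * above, any out-of-range start key restarts iteration at index 0. A sketch
 * using the libbpf helper, with error handling omitted:
 *
 *	u32 key = -1, next_key;
 *	while (!bpf_map_get_next_key(fd, &key, &next_key)) {
 *		... read or update slot next_key ...
 *		key = next_key;
 *	}
 */
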
void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);

	__set_bit(bit, bitmap);
}

/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
 * from the driver before returning from its napi->poll() routine. The poll()
 * routine is called either from busy_poll context or net_rx_action signaled
 * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
 * net device can be torn down. On devmap tear down we ensure the ctx bitmap
 * is zeroed before completing to ensure all flush operations have completed.
 */
void __dev_map_flush(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
	u32 bit;

	for_each_set_bit(bit, bitmap, map->max_entries) {
		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
		struct net_device *netdev;

		/* This is possible if the dev entry is removed by user space
		 * between xdp redirect and flush op.
		 */
		if (unlikely(!dev))
			continue;

		__clear_bit(bit, bitmap);
		netdev = dev->dev;
		if (likely(netdev->netdev_ops->ndo_xdp_flush))
			netdev->netdev_ops->ndo_xdp_flush(netdev);
	}
}

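/* The expected driver-side ordering, as a hedged sketch; exact code varies
 * per driver, and the names below are the core XDP helpers rather than
 * anything defined in this file:
 *
 *	act = bpf_prog_run_xdp(xdp_prog, &xdp);
 *	if (act == XDP_REDIRECT)
 *		xdp_do_redirect(netdev, &xdp, xdp_prog);
 *	...
 *	xdp_do_flush_map();	(before returning from napi->poll())
 */
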
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete
 * and/or update happens in parallel here, a dev_put won't happen until after
 * reading the ifindex.
 */
struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dev;

	if (key >= map->max_entries)
		return NULL;

	dev = READ_ONCE(dtab->netdev_map[key]);
	return dev ? dev->dev : NULL;
}

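/* A simplified sketch of the datapath consumer, conceptually what the
 * redirect path in net/core/filter.c does (paraphrased, not copied verbatim
 * from that file):
 *
 *	fwd = __dev_map_lookup_elem(map, index);
 *	if (!fwd)
 *		return -EINVAL;
 *	err = fwd->netdev_ops->ndo_xdp_xmit(fwd, xdp);
 *	if (!err)
 *		__dev_map_insert_ctx(map, index);	(mark pending flush)
 */
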
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);

	return dev ? &dev->ifindex : NULL;
}

static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
	if (dev->dev->netdev_ops->ndo_xdp_flush) {
		struct net_device *fl = dev->dev;
		unsigned long *bitmap;
		int cpu;

		for_each_online_cpu(cpu) {
			bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
			__clear_bit(dev->bit, bitmap);

			fl->netdev_ops->ndo_xdp_flush(dev->dev);
		}
	}
}

static void __dev_map_entry_free(struct rcu_head *rcu)
{
	struct bpf_dtab_netdev *dev;

	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
	dev_map_flush_old(dev);
	dev_put(dev->dev);
	kfree(dev);
}

static int dev_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *old_dev;
	int k = *(u32 *)key;

	if (k >= map->max_entries)
		return -EINVAL;

	/* Use call_rcu() here to ensure any rcu critical sections have
	 * completed, but this does not guarantee a flush has happened yet,
	 * because the driver side rcu_read_lock/unlock only protects the
	 * running XDP program. For pending flush operations the dev and ctx
	 * are stored in another per cpu map. Additionally, the driver tear
	 * down ensures all soft irqs are complete before removing the net
	 * device in the case of dev_put equals zero.
	 */
	old_dev = xchg(&dtab->netdev_map[k], NULL);
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
	return 0;
}

static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
			       u64 map_flags)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct net *net = current->nsproxy->net_ns;
	struct bpf_dtab_netdev *dev, *old_dev;
	u32 i = *(u32 *)key;
	u32 ifindex = *(u32 *)value;

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;
	if (unlikely(i >= dtab->map.max_entries))
		return -E2BIG;
	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;

	if (!ifindex) {
		dev = NULL;
	} else {
		dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
				   map->numa_node);
		if (!dev)
			return -ENOMEM;

		dev->dev = dev_get_by_index(net, ifindex);
		if (!dev->dev) {
			kfree(dev);
			return -EINVAL;
		}

		dev->bit = i;
		dev->dtab = dtab;
	}

	/* Use call_rcu() here to ensure rcu critical sections have completed,
	 * remembering that the driver side flush operation will happen before
	 * the net device is removed.
	 */
	old_dev = xchg(&dtab->netdev_map[i], dev);
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);

	return 0;
}

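/* A user space sketch of populating one slot; the interface name and flags
 * are illustrative, and bpf_map_update_elem() is the libbpf wrapper around
 * the syscall. Writing an ifindex of 0 clears the slot, per the !ifindex
 * branch above:
 *
 *	u32 key = 0;
 *	u32 ifindex = if_nametoindex("eth0");
 *	bpf_map_update_elem(fd, &key, &ifindex, BPF_ANY);
 */
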
const struct bpf_map_ops dev_map_ops = {
	.map_alloc = dev_map_alloc,
	.map_free = dev_map_free,
	.map_get_next_key = dev_map_get_next_key,
	.map_lookup_elem = dev_map_lookup_elem,
	.map_update_elem = dev_map_update_elem,
	.map_delete_elem = dev_map_delete_elem,
};

static int dev_map_notification(struct notifier_block *notifier,
				ulong event, void *ptr)
{
	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
	struct bpf_dtab *dtab;
	int i;

	switch (event) {
	case NETDEV_UNREGISTER:
		/* This rcu_read_lock/unlock pair is needed because
		 * dev_map_list is an RCU list AND to ensure a delete
		 * operation does not free a netdev_map entry while we
		 * are comparing it against the netdev being unregistered.
		 */
		rcu_read_lock();
		list_for_each_entry_rcu(dtab, &dev_map_list, list) {
			for (i = 0; i < dtab->map.max_entries; i++) {
				struct bpf_dtab_netdev *dev, *odev;

				dev = READ_ONCE(dtab->netdev_map[i]);
				if (!dev ||
				    dev->dev->ifindex != netdev->ifindex)
					continue;
				odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
				if (dev == odev)
					call_rcu(&dev->rcu,
						 __dev_map_entry_free);
			}
		}
		rcu_read_unlock();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block dev_map_notifier = {
	.notifier_call = dev_map_notification,
};

static int __init dev_map_init(void)
{
	register_netdevice_notifier(&dev_map_notifier);
	return 0;
}

subsys_initcall(dev_map_init);