/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

/* A devmap's primary use is as the backend map for the XDP BPF helper call
 * bpf_redirect_map(). Because XDP is mostly concerned with performance, we
 * spent some effort to ensure the datapath with redirect maps does not use
 * any locking. This is a quick note on the details.
 *
 * We have three possible paths to get into the devmap control plane: bpf
 * syscalls, bpf programs, and driver-side xmit/flush operations. A bpf
 * syscall can invoke an update, delete, or lookup operation. To ensure
 * updates and deletes appear atomic from the datapath side, xchg() is used
 * to modify the netdev_map array. Then, because the datapath does a
 * read-only lookup into the netdev_map array from an RCU critical section,
 * we use call_rcu() to wait for an RCU grace period before freeing the old
 * data structures. This ensures the datapath always has a valid copy.
 * However, the datapath does a "flush" operation that pushes any pending
 * packets in the driver outside the RCU critical section. Each
 * bpf_dtab_netdev tracks these pending operations using an atomic per-cpu
 * bitmap. The bpf_dtab_netdev object will not be destroyed until all bits
 * are cleared, indicating that all outstanding flush operations have
 * completed.
 *
 * BPF syscalls may race with BPF program calls on any of the update, delete,
 * or lookup operations. As noted above, the xchg() operation also keeps the
 * netdev_map consistent in this case. From the devmap side, BPF programs
 * calling into these operations are the same as multiple user space threads
 * making system calls.
 */
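
/* Example (illustrative sketch, not part of this file): the intended user of
 * this map is an XDP program calling bpf_redirect_map(). The map name
 * "tx_port", the section names, and the fixed slot 0 below are hypothetical;
 * BPF_MAP_TYPE_DEVMAP and bpf_redirect_map() are the pieces this file and
 * its companion patches provide.
 *
 *	struct bpf_map_def SEC("maps") tx_port = {
 *		.type		= BPF_MAP_TYPE_DEVMAP,
 *		.key_size	= sizeof(u32),
 *		.value_size	= sizeof(u32),
 *		.max_entries	= 64,
 *	};
 *
 *	SEC("xdp")
 *	int xdp_redirect_prog(struct xdp_md *ctx)
 *	{
 *		return bpf_redirect_map(&tx_port, 0, 0);
 *	}
 */
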
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"

struct bpf_dtab_netdev {
	struct net_device *dev;
	int key;
	struct rcu_head rcu;
	struct bpf_dtab *dtab;
};

struct bpf_dtab {
	struct bpf_map map;
	struct bpf_dtab_netdev **netdev_map;
	unsigned long int __percpu *flush_needed;
};

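/* Sizing sketch for flush_needed (assumes BITS_PER_LONG == 64): a map with
 * max_entries == 100 needs BITS_TO_LONGS(100) == 2 longs, i.e. 16 bytes,
 * per possible cpu; bit i of the bitmap corresponds to slot i of netdev_map.
 */
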
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
	struct bpf_dtab *dtab;
	u64 cost;
	int err;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags)
		return ERR_PTR(-EINVAL);

	/* if value_size is bigger, the user space won't be able to
	 * access the elements.
	 */
	if (attr->value_size > KMALLOC_MAX_SIZE)
		return ERR_PTR(-E2BIG);

	dtab = kzalloc(sizeof(*dtab), GFP_USER);
	if (!dtab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	dtab->map.map_type = attr->map_type;
	dtab->map.key_size = attr->key_size;
	dtab->map.value_size = attr->value_size;
	dtab->map.max_entries = attr->max_entries;
	dtab->map.map_flags = attr->map_flags;

	err = -ENOMEM;

	/* make sure page count doesn't overflow */
	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
	cost += BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_dtab;

	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(dtab->map.pages);
	if (err)
		goto free_dtab;

	/* A per cpu bitfield with a bit per possible net device */
	dtab->flush_needed = __alloc_percpu(BITS_TO_LONGS(attr->max_entries) *
					    sizeof(unsigned long),
					    __alignof__(unsigned long));
	if (!dtab->flush_needed)
		goto free_dtab;

	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
					      sizeof(struct bpf_dtab_netdev *));
	if (!dtab->netdev_map)
		goto free_dtab;

	return &dtab->map;

free_dtab:
	free_percpu(dtab->flush_needed);
	kfree(dtab);
	return ERR_PTR(err);
}

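/* User space creation sketch matching the sanity checks above; the
 * bpf_create_map() wrapper is assumed to come from tools/lib/bpf. key_size
 * and value_size must both be 4 and map_flags must be 0, otherwise the
 * kernel returns -EINVAL:
 *
 *	int fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(__u32),
 *				sizeof(__u32), 64, 0);
 *	if (fd < 0)
 *		... map creation failed ...
 */
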
static void dev_map_free(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	int i, cpu;

	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (there can be more than one that used this map)
	 * were disconnected from events. Wait for outstanding critical
	 * sections in these programs to complete. The rcu critical section
	 * only guarantees no further reads against netdev_map. It does
	 * __not__ ensure pending flush operations (if any) are complete.
	 */
	synchronize_rcu();

	/* To ensure all pending flush operations have completed, wait for the
	 * flush bitmap to show all flush_needed bits as zero on _all_ cpus.
	 * Because the above synchronize_rcu() ensures the map is disconnected
	 * from the program, we can assume no new bits will be set.
	 */
	for_each_online_cpu(cpu) {
		unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);

		while (!bitmap_empty(bitmap, dtab->map.max_entries))
			cpu_relax();
	}

	for (i = 0; i < dtab->map.max_entries; i++) {
		struct bpf_dtab_netdev *dev;

		dev = dtab->netdev_map[i];
		if (!dev)
			continue;

		dev_put(dev->dev);
		kfree(dev);
	}

	/* At this point the bpf program is detached and all pending
	 * operations _must_ be complete.
	 */
	free_percpu(dtab->flush_needed);
	bpf_map_area_free(dtab->netdev_map);
	kfree(dtab);
}

static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (index >= dtab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == dtab->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

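/* Iteration sketch from user space (the bpf_map_get_next_key() wrapper from
 * tools/lib/bpf is assumed). Per the logic above, any out-of-range key
 * restarts the walk at slot 0 and the last slot terminates it with -ENOENT:
 *
 *	__u32 key = -1, next;
 *
 *	while (!bpf_map_get_next_key(fd, &key, &next)) {
 *		... inspect slot "next" ...
 *		key = next;
 *	}
 */
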
void __dev_map_insert_ctx(struct bpf_map *map, u32 key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);

	__set_bit(key, bitmap);
}

struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dev;

	if (key >= map->max_entries)
		return NULL;

	dev = READ_ONCE(dtab->netdev_map[key]);
	return dev ? dev->dev : NULL;
}

/* __dev_map_flush is called from xdp_do_flush_map(), which the driver _must_
 * invoke before returning from its napi->poll() routine. The poll() routine
 * is called either from busy_poll context or from net_rx_action signaled by
 * NET_RX_SOFTIRQ. Either way the poll routine must complete before the net
 * device can be torn down. On devmap tear down we ensure the ctx bitmap is
 * zeroed before completing so that all flush operations have finished.
 */
void __dev_map_flush(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
	u32 bit;

	for_each_set_bit(bit, bitmap, map->max_entries) {
		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
		struct net_device *netdev;

		/* This is possible if the dev entry is removed by user space
		 * between xdp redirect and flush op.
		 */
		if (unlikely(!dev))
			continue;

		netdev = dev->dev;

		__clear_bit(bit, bitmap);
		if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush))
			continue;

		netdev->netdev_ops->ndo_xdp_flush(netdev);
	}
}

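/* Driver-side sketch of the contract described above, for a hypothetical
 * driver; prog, netdev, xdp, and mydrv_next_frame() are stand-ins, while
 * bpf_prog_run_xdp(), xdp_do_redirect(), and xdp_do_flush_map() are the
 * real entry points:
 *
 *	static int mydrv_poll(struct napi_struct *napi, int budget)
 *	{
 *		int done = 0;
 *
 *		while (done < budget && mydrv_next_frame(napi, &xdp)) {
 *			u32 act = bpf_prog_run_xdp(prog, &xdp);
 *
 *			if (act == XDP_REDIRECT)
 *				xdp_do_redirect(netdev, &xdp, prog);
 *			done++;
 *		}
 *
 *		xdp_do_flush_map();	... must run before poll() returns ...
 *		return done;
 *	}
 */
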
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete
 * and/or update happens in parallel here, a dev_put won't happen until after
 * reading the ifindex.
 */
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dev;
	u32 i = *(u32 *)key;

	if (i >= map->max_entries)
		return NULL;

	dev = READ_ONCE(dtab->netdev_map[i]);
	return dev ? &dev->dev->ifindex : NULL;
}

static void dev_map_flush_old(struct bpf_dtab_netdev *old_dev)
{
	if (old_dev->dev->netdev_ops->ndo_xdp_flush) {
		struct net_device *fl = old_dev->dev;
		unsigned long *bitmap;
		int cpu;

		for_each_online_cpu(cpu) {
			bitmap = per_cpu_ptr(old_dev->dtab->flush_needed, cpu);
			__clear_bit(old_dev->key, bitmap);

			fl->netdev_ops->ndo_xdp_flush(old_dev->dev);
		}
	}
}

static void __dev_map_entry_free(struct rcu_head *rcu)
{
	struct bpf_dtab_netdev *old_dev;

	old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
	dev_map_flush_old(old_dev);
	dev_put(old_dev->dev);
	kfree(old_dev);
}

static int dev_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *old_dev;
	int k = *(u32 *)key;

	if (k >= map->max_entries)
		return -EINVAL;

	/* Use call_rcu() here to ensure any rcu critical sections have
	 * completed, but this does not guarantee a flush has happened yet,
	 * because the driver-side rcu_read_lock/unlock only protects the
	 * running XDP program. For pending flush operations, however, the
	 * dev and ctx are stored in another per-cpu map. Additionally, the
	 * driver tear down ensures all soft irqs are complete before the net
	 * device is removed once its dev_put() reference count drops to zero.
	 */
	old_dev = xchg(&dtab->netdev_map[k], NULL);
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
	return 0;
}

static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
			       u64 map_flags)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct net *net = current->nsproxy->net_ns;
	struct bpf_dtab_netdev *dev, *old_dev;
	u32 i = *(u32 *)key;
	u32 ifindex = *(u32 *)value;

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;

	if (unlikely(i >= dtab->map.max_entries))
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;

	if (!ifindex) {
		dev = NULL;
	} else {
		dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN);
		if (!dev)
			return -ENOMEM;

		dev->dev = dev_get_by_index(net, ifindex);
		if (!dev->dev) {
			kfree(dev);
			return -EINVAL;
		}

		dev->key = i;
		dev->dtab = dtab;
	}

	/* Use call_rcu() here to ensure rcu critical sections have completed.
	 * Remember that the driver-side flush operation will happen before
	 * the net device is removed.
	 */
	old_dev = xchg(&dtab->netdev_map[i], dev);
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);

	return 0;
}

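/* User space update/delete sketch (wrappers assumed from tools/lib/bpf;
 * "eth1" is a stand-in). Values are ifindexes, and per the !ifindex branch
 * above, writing an ifindex of 0 clears a slot much like a delete:
 *
 *	__u32 key = 0, ifindex = if_nametoindex("eth1");
 *
 *	bpf_map_update_elem(fd, &key, &ifindex, BPF_ANY);
 *	...
 *	bpf_map_delete_elem(fd, &key);
 */
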
const struct bpf_map_ops dev_map_ops = {
	.map_alloc = dev_map_alloc,
	.map_free = dev_map_free,
	.map_get_next_key = dev_map_get_next_key,
	.map_lookup_elem = dev_map_lookup_elem,
	.map_update_elem = dev_map_update_elem,
	.map_delete_elem = dev_map_delete_elem,
};