]>
Commit | Line | Data |
---|---|---|
0baf26b0 MKL |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* Copyright (c) 2019 Facebook */ | |
3 | ||
4 | #include <linux/types.h> | |
5 | #include <linux/bpf_verifier.h> | |
6 | #include <linux/bpf.h> | |
7 | #include <linux/btf.h> | |
e78aea8b | 8 | #include <linux/btf_ids.h> |
0baf26b0 MKL |
9 | #include <linux/filter.h> |
10 | #include <net/tcp.h> | |
ab14fd4e | 11 | #include <net/bpf_sk_storage.h> |
0baf26b0 | 12 | |
eb18b49e MKL |
/*
 * Forward declaration of the struct_ops descriptor defined at the bottom
 * of this file.  Spelling out "extern" keeps sparse from warning; the
 * only outside user is bpf_struct_ops.c.
 */
extern struct bpf_struct_ops bpf_tcp_congestion_ops;
15 | ||
0baf26b0 MKL |
16 | static u32 optional_ops[] = { |
17 | offsetof(struct tcp_congestion_ops, init), | |
18 | offsetof(struct tcp_congestion_ops, release), | |
19 | offsetof(struct tcp_congestion_ops, set_state), | |
20 | offsetof(struct tcp_congestion_ops, cwnd_event), | |
21 | offsetof(struct tcp_congestion_ops, in_ack_event), | |
22 | offsetof(struct tcp_congestion_ops, pkts_acked), | |
23 | offsetof(struct tcp_congestion_ops, min_tso_segs), | |
24 | offsetof(struct tcp_congestion_ops, sndbuf_expand), | |
25 | offsetof(struct tcp_congestion_ops, cong_control), | |
26 | }; | |
27 | ||
28 | static u32 unsupported_ops[] = { | |
29 | offsetof(struct tcp_congestion_ops, get_info), | |
30 | }; | |
31 | ||
32 | static const struct btf_type *tcp_sock_type; | |
33 | static u32 tcp_sock_id, sock_id; | |
34 | ||
35 | static int bpf_tcp_ca_init(struct btf *btf) | |
36 | { | |
37 | s32 type_id; | |
38 | ||
39 | type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT); | |
40 | if (type_id < 0) | |
41 | return -EINVAL; | |
42 | sock_id = type_id; | |
43 | ||
44 | type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT); | |
45 | if (type_id < 0) | |
46 | return -EINVAL; | |
47 | tcp_sock_id = type_id; | |
48 | tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); | |
49 | ||
50 | return 0; | |
51 | } | |
52 | ||
53 | static bool is_optional(u32 member_offset) | |
54 | { | |
55 | unsigned int i; | |
56 | ||
57 | for (i = 0; i < ARRAY_SIZE(optional_ops); i++) { | |
58 | if (member_offset == optional_ops[i]) | |
59 | return true; | |
60 | } | |
61 | ||
62 | return false; | |
63 | } | |
64 | ||
65 | static bool is_unsupported(u32 member_offset) | |
66 | { | |
67 | unsigned int i; | |
68 | ||
69 | for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) { | |
70 | if (member_offset == unsupported_ops[i]) | |
71 | return true; | |
72 | } | |
73 | ||
74 | return false; | |
75 | } | |
76 | ||
/*
 * BTF object defined outside this file; bpf_tcp_ca_init_member() uses it
 * to tell function-pointer members apart from plain data members.
 */
extern struct btf *btf_vmlinux;
78 | ||
79 | static bool bpf_tcp_ca_is_valid_access(int off, int size, | |
80 | enum bpf_access_type type, | |
81 | const struct bpf_prog *prog, | |
82 | struct bpf_insn_access_aux *info) | |
83 | { | |
84 | if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) | |
85 | return false; | |
86 | if (type != BPF_READ) | |
87 | return false; | |
88 | if (off % size != 0) | |
89 | return false; | |
90 | ||
91 | if (!btf_ctx_access(off, size, type, prog, info)) | |
92 | return false; | |
93 | ||
94 | if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id) | |
95 | /* promote it to tcp_sock */ | |
96 | info->btf_id = tcp_sock_id; | |
97 | ||
98 | return true; | |
99 | } | |
100 | ||
101 | static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, | |
22dc4a0f | 102 | const struct btf *btf, |
0baf26b0 MKL |
103 | const struct btf_type *t, int off, |
104 | int size, enum bpf_access_type atype, | |
105 | u32 *next_btf_id) | |
106 | { | |
107 | size_t end; | |
108 | ||
109 | if (atype == BPF_READ) | |
22dc4a0f | 110 | return btf_struct_access(log, btf, t, off, size, atype, next_btf_id); |
0baf26b0 MKL |
111 | |
112 | if (t != tcp_sock_type) { | |
113 | bpf_log(log, "only read is supported\n"); | |
114 | return -EACCES; | |
115 | } | |
116 | ||
117 | switch (off) { | |
118 | case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv): | |
119 | end = offsetofend(struct inet_connection_sock, icsk_ca_priv); | |
120 | break; | |
121 | case offsetof(struct inet_connection_sock, icsk_ack.pending): | |
122 | end = offsetofend(struct inet_connection_sock, | |
123 | icsk_ack.pending); | |
124 | break; | |
125 | case offsetof(struct tcp_sock, snd_cwnd): | |
126 | end = offsetofend(struct tcp_sock, snd_cwnd); | |
127 | break; | |
128 | case offsetof(struct tcp_sock, snd_cwnd_cnt): | |
129 | end = offsetofend(struct tcp_sock, snd_cwnd_cnt); | |
130 | break; | |
131 | case offsetof(struct tcp_sock, snd_ssthresh): | |
132 | end = offsetofend(struct tcp_sock, snd_ssthresh); | |
133 | break; | |
134 | case offsetof(struct tcp_sock, ecn_flags): | |
135 | end = offsetofend(struct tcp_sock, ecn_flags); | |
136 | break; | |
137 | default: | |
138 | bpf_log(log, "no write support to tcp_sock at off %d\n", off); | |
139 | return -EACCES; | |
140 | } | |
141 | ||
142 | if (off + size > end) { | |
143 | bpf_log(log, | |
144 | "write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n", | |
145 | off, size, end); | |
146 | return -EACCES; | |
147 | } | |
148 | ||
149 | return NOT_INIT; | |
150 | } | |
151 | ||
206057fe MKL |
152 | BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt) |
153 | { | |
154 | /* bpf_tcp_ca prog cannot have NULL tp */ | |
155 | __tcp_send_ack((struct sock *)tp, rcv_nxt); | |
156 | return 0; | |
157 | } | |
158 | ||
159 | static const struct bpf_func_proto bpf_tcp_send_ack_proto = { | |
160 | .func = bpf_tcp_send_ack, | |
161 | .gpl_only = false, | |
162 | /* In case we want to report error later */ | |
163 | .ret_type = RET_INTEGER, | |
164 | .arg1_type = ARG_PTR_TO_BTF_ID, | |
9436ef6e | 165 | .arg1_btf_id = &tcp_sock_id, |
206057fe | 166 | .arg2_type = ARG_ANYTHING, |
206057fe MKL |
167 | }; |
168 | ||
eb18b49e MKL |
169 | static u32 prog_ops_moff(const struct bpf_prog *prog) |
170 | { | |
171 | const struct btf_member *m; | |
172 | const struct btf_type *t; | |
173 | u32 midx; | |
174 | ||
175 | midx = prog->expected_attach_type; | |
176 | t = bpf_tcp_congestion_ops.type; | |
177 | m = &btf_type_member(t)[midx]; | |
178 | ||
179 | return btf_member_bit_offset(t, m) / 8; | |
180 | } | |
181 | ||
0baf26b0 MKL |
182 | static const struct bpf_func_proto * |
183 | bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, | |
184 | const struct bpf_prog *prog) | |
185 | { | |
206057fe MKL |
186 | switch (func_id) { |
187 | case BPF_FUNC_tcp_send_ack: | |
188 | return &bpf_tcp_send_ack_proto; | |
ab14fd4e | 189 | case BPF_FUNC_sk_storage_get: |
592a3498 | 190 | return &bpf_sk_storage_get_proto; |
ab14fd4e | 191 | case BPF_FUNC_sk_storage_delete: |
592a3498 | 192 | return &bpf_sk_storage_delete_proto; |
eb18b49e MKL |
193 | case BPF_FUNC_setsockopt: |
194 | /* Does not allow release() to call setsockopt. | |
195 | * release() is called when the current bpf-tcp-cc | |
196 | * is retiring. It is not allowed to call | |
197 | * setsockopt() to make further changes which | |
198 | * may potentially allocate new resources. | |
199 | */ | |
200 | if (prog_ops_moff(prog) != | |
201 | offsetof(struct tcp_congestion_ops, release)) | |
202 | return &bpf_sk_setsockopt_proto; | |
203 | return NULL; | |
204 | case BPF_FUNC_getsockopt: | |
205 | /* Since get/setsockopt is usually expected to | |
206 | * be available together, disable getsockopt for | |
207 | * release also to avoid usage surprise. | |
208 | * The bpf-tcp-cc already has a more powerful way | |
209 | * to read tcp_sock from the PTR_TO_BTF_ID. | |
210 | */ | |
211 | if (prog_ops_moff(prog) != | |
212 | offsetof(struct tcp_congestion_ops, release)) | |
213 | return &bpf_sk_getsockopt_proto; | |
214 | return NULL; | |
727231d2 DB |
215 | case BPF_FUNC_ktime_get_coarse_ns: |
216 | return &bpf_ktime_get_coarse_ns_proto; | |
206057fe MKL |
217 | default: |
218 | return bpf_base_func_proto(func_id); | |
219 | } | |
0baf26b0 MKL |
220 | } |
221 | ||
e78aea8b MKL |
222 | BTF_SET_START(bpf_tcp_ca_kfunc_ids) |
223 | BTF_ID(func, tcp_reno_ssthresh) | |
224 | BTF_ID(func, tcp_reno_cong_avoid) | |
225 | BTF_ID(func, tcp_reno_undo_cwnd) | |
226 | BTF_ID(func, tcp_slow_start) | |
227 | BTF_ID(func, tcp_cong_avoid_ai) | |
569c484f | 228 | #ifdef CONFIG_X86 |
7aae231a | 229 | #ifdef CONFIG_DYNAMIC_FTRACE |
e78aea8b MKL |
230 | #if IS_BUILTIN(CONFIG_TCP_CONG_CUBIC) |
231 | BTF_ID(func, cubictcp_init) | |
232 | BTF_ID(func, cubictcp_recalc_ssthresh) | |
233 | BTF_ID(func, cubictcp_cong_avoid) | |
234 | BTF_ID(func, cubictcp_state) | |
235 | BTF_ID(func, cubictcp_cwnd_event) | |
236 | BTF_ID(func, cubictcp_acked) | |
237 | #endif | |
238 | #if IS_BUILTIN(CONFIG_TCP_CONG_DCTCP) | |
239 | BTF_ID(func, dctcp_init) | |
240 | BTF_ID(func, dctcp_update_alpha) | |
241 | BTF_ID(func, dctcp_cwnd_event) | |
242 | BTF_ID(func, dctcp_ssthresh) | |
243 | BTF_ID(func, dctcp_cwnd_undo) | |
244 | BTF_ID(func, dctcp_state) | |
245 | #endif | |
246 | #if IS_BUILTIN(CONFIG_TCP_CONG_BBR) | |
247 | BTF_ID(func, bbr_init) | |
248 | BTF_ID(func, bbr_main) | |
249 | BTF_ID(func, bbr_sndbuf_expand) | |
250 | BTF_ID(func, bbr_undo_cwnd) | |
21cfd2db | 251 | BTF_ID(func, bbr_cwnd_event) |
e78aea8b MKL |
252 | BTF_ID(func, bbr_ssthresh) |
253 | BTF_ID(func, bbr_min_tso_segs) | |
254 | BTF_ID(func, bbr_set_state) | |
255 | #endif | |
7aae231a | 256 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
569c484f | 257 | #endif /* CONFIG_X86 */ |
e78aea8b MKL |
258 | BTF_SET_END(bpf_tcp_ca_kfunc_ids) |
259 | ||
260 | static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id) | |
261 | { | |
262 | return btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id); | |
263 | } | |
264 | ||
0baf26b0 MKL |
265 | static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { |
266 | .get_func_proto = bpf_tcp_ca_get_func_proto, | |
267 | .is_valid_access = bpf_tcp_ca_is_valid_access, | |
268 | .btf_struct_access = bpf_tcp_ca_btf_struct_access, | |
e78aea8b | 269 | .check_kfunc_call = bpf_tcp_ca_check_kfunc_call, |
0baf26b0 MKL |
270 | }; |
271 | ||
272 | static int bpf_tcp_ca_init_member(const struct btf_type *t, | |
273 | const struct btf_member *member, | |
274 | void *kdata, const void *udata) | |
275 | { | |
276 | const struct tcp_congestion_ops *utcp_ca; | |
277 | struct tcp_congestion_ops *tcp_ca; | |
0baf26b0 MKL |
278 | int prog_fd; |
279 | u32 moff; | |
280 | ||
281 | utcp_ca = (const struct tcp_congestion_ops *)udata; | |
282 | tcp_ca = (struct tcp_congestion_ops *)kdata; | |
283 | ||
284 | moff = btf_member_bit_offset(t, member) / 8; | |
285 | switch (moff) { | |
286 | case offsetof(struct tcp_congestion_ops, flags): | |
287 | if (utcp_ca->flags & ~TCP_CONG_MASK) | |
288 | return -EINVAL; | |
289 | tcp_ca->flags = utcp_ca->flags; | |
290 | return 1; | |
291 | case offsetof(struct tcp_congestion_ops, name): | |
8e7ae251 MKL |
292 | if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name, |
293 | sizeof(tcp_ca->name)) <= 0) | |
0baf26b0 MKL |
294 | return -EINVAL; |
295 | if (tcp_ca_find(utcp_ca->name)) | |
296 | return -EEXIST; | |
0baf26b0 MKL |
297 | return 1; |
298 | } | |
299 | ||
300 | if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL)) | |
301 | return 0; | |
302 | ||
303 | /* Ensure bpf_prog is provided for compulsory func ptr */ | |
304 | prog_fd = (int)(*(unsigned long *)(udata + moff)); | |
305 | if (!prog_fd && !is_optional(moff) && !is_unsupported(moff)) | |
306 | return -EINVAL; | |
307 | ||
308 | return 0; | |
309 | } | |
310 | ||
311 | static int bpf_tcp_ca_check_member(const struct btf_type *t, | |
312 | const struct btf_member *member) | |
313 | { | |
314 | if (is_unsupported(btf_member_bit_offset(t, member) / 8)) | |
315 | return -ENOTSUPP; | |
316 | return 0; | |
317 | } | |
318 | ||
/*
 * struct_ops ->reg callback: pass @kdata to
 * tcp_register_congestion_control() and return its result.
 */
static int bpf_tcp_ca_reg(void *kdata)
{
	int err = tcp_register_congestion_control(kdata);

	return err;
}
323 | ||
/*
 * struct_ops ->unreg callback: pass @kdata to
 * tcp_unregister_congestion_control().
 */
static void bpf_tcp_ca_unreg(void *kdata)
{
	tcp_unregister_congestion_control(kdata);
}
328 | ||
0baf26b0 MKL |
329 | struct bpf_struct_ops bpf_tcp_congestion_ops = { |
330 | .verifier_ops = &bpf_tcp_ca_verifier_ops, | |
331 | .reg = bpf_tcp_ca_reg, | |
332 | .unreg = bpf_tcp_ca_unreg, | |
333 | .check_member = bpf_tcp_ca_check_member, | |
334 | .init_member = bpf_tcp_ca_init_member, | |
335 | .init = bpf_tcp_ca_init, | |
336 | .name = "tcp_congestion_ops", | |
337 | }; |