1 /*
2 * Copyright (c) 2019 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include <ctype.h>
19 #include <errno.h>
20 #include <sys/types.h>
21 #include <netinet/in.h>
22 #include <netinet/ip6.h>
23 #include <netinet/icmp6.h>
24 #include <string.h>
25
26 #include "coverage.h"
27 #include "csum.h"
28 #include "ipf.h"
29 #include "latch.h"
30 #include "openvswitch/hmap.h"
31 #include "openvswitch/poll-loop.h"
32 #include "openvswitch/vlog.h"
33 #include "ovs-atomic.h"
34 #include "packets.h"
35 #include "util.h"
36
37 VLOG_DEFINE_THIS_MODULE(ipf);
38 COVERAGE_DEFINE(ipf_stuck_frag_list_purged);
39
40 enum {
41 IPV4_PACKET_MAX_HDR_SIZE = 60,
42 IPV4_PACKET_MAX_SIZE = 65535,
43 IPV6_PACKET_MAX_DATA = 65535,
44 };
45
46 enum ipf_list_state {
47 IPF_LIST_STATE_UNUSED,
48 IPF_LIST_STATE_REASS_FAIL,
49 IPF_LIST_STATE_OTHER_SEEN,
50 IPF_LIST_STATE_FIRST_SEEN,
51 IPF_LIST_STATE_LAST_SEEN,
52 IPF_LIST_STATE_FIRST_LAST_SEEN,
53 IPF_LIST_STATE_COMPLETED,
54 IPF_LIST_STATE_NUM,
55 };
56
57 static char *ipf_state_name[IPF_LIST_STATE_NUM] =
58 {"unused", "reassemble fail", "other frag", "first frag", "last frag",
59 "first/last frag", "complete"};
60
61 enum ipf_list_type {
62 IPF_FRAG_COMPLETED_LIST,
63 IPF_FRAG_EXPIRY_LIST,
64 };
65
66 enum {
67 IPF_INVALID_IDX = -1,
68 IPF_V4_FRAG_SIZE_LBOUND = 400,
69 IPF_V4_FRAG_SIZE_MIN_DEF = 1200,
70 IPF_V6_FRAG_SIZE_LBOUND = 400, /* Useful for testing. */
71 IPF_V6_FRAG_SIZE_MIN_DEF = 1280,
72 IPF_MAX_FRAGS_DEFAULT = 1000,
73 IPF_NFRAG_UBOUND = 5000,
74 };
75
76 enum ipf_counter_type {
77 IPF_NFRAGS_ACCEPTED,
78 IPF_NFRAGS_COMPL_SENT,
79 IPF_NFRAGS_EXPD_SENT,
80 IPF_NFRAGS_TOO_SMALL,
81 IPF_NFRAGS_OVERLAP,
82 IPF_NFRAGS_PURGED,
83 IPF_NFRAGS_NUM_CNTS,
84 };
85
86 union ipf_addr {
87 ovs_be32 ipv4;
88 struct in6_addr ipv6;
89 };
90
91 /* Represents a single fragment; part of a list of fragments. */
92 struct ipf_frag {
93 struct dp_packet *pkt;
94 uint16_t start_data_byte;
95 uint16_t end_data_byte;
96 bool dnsteal; /* 'do not steal': if true, ipf should not free packet. */
97 };
98
99 /* The key for a collection of fragments potentially making up an unfragmented
100 * packet. */
101 struct ipf_list_key {
102 /* ipf_list_key_hash() requires 'src_addr' and 'dst_addr' to be the first
103 * two members. */
104 union ipf_addr src_addr;
105 union ipf_addr dst_addr;
106 uint32_t recirc_id;
107 ovs_be32 ip_id; /* V6 is 32 bits. */
108 ovs_be16 dl_type;
109 uint16_t zone;
110 uint8_t nw_proto;
111 };
112
113 /* A collection of fragments potentially making up an unfragmented packet. */
114 struct ipf_list {
115 struct hmap_node node; /* In struct ipf's 'frag_lists'. */
116 struct ovs_list list_node; /* In struct ipf's 'frag_exp_list' or
117 * 'frag_complete_list'. */
118 struct ipf_frag *frag_list; /* List of fragments for this list. */
119 struct ipf_list_key key; /* The key for the fragment list. */
120 struct dp_packet *reass_execute_ctx; /* Reassembled packet. */
121 long long expiration; /* In milliseconds. */
122 int last_sent_idx; /* Last sent fragment idx. */
123 int last_inuse_idx; /* Last inuse fragment idx. */
124 int size; /* Fragment list size. */
125 uint8_t state; /* Frag list state; see ipf_list_state. */
126 };
127
128 /* Represents a reassembled packet which is typically passed through
129 * conntrack. */
130 struct reassembled_pkt {
131 struct ovs_list rp_list_node; /* In struct ipf's
132 * 'reassembled_pkt_list'. */
133 struct dp_packet *pkt;
134 struct ipf_list *list;
135 };
136
137 struct ipf {
138 /* The clean thread is used to clean up fragments in the 'ipf'
139 * module if packet batches are no longer sent through it by its user. */
140 pthread_t ipf_clean_thread;
141 struct latch ipf_clean_thread_exit;
142
143 int max_v4_frag_list_size;
144
145 struct ovs_mutex ipf_lock; /* Protects all of the following. */
146 /* These contain 'struct ipf_list's. */
147 struct hmap frag_lists OVS_GUARDED;
148 struct ovs_list frag_exp_list OVS_GUARDED;
149 struct ovs_list frag_complete_list OVS_GUARDED;
150 /* Contains 'struct reassembled_pkt's. */
151 struct ovs_list reassembled_pkt_list OVS_GUARDED;
152
153 /* Used to allow disabling fragmentation reassembly. */
154 atomic_bool ifp_v4_enabled;
155 atomic_bool ifp_v6_enabled;
156
157 /* Will be clamped above 400 bytes; the value chosen should handle
158 * alg control packets of interest that use string encoding of mutable
159 * IP fields; meaning, the control packets should not be fragmented. */
160 atomic_uint min_v4_frag_size;
161 atomic_uint min_v6_frag_size;
162
163 /* Configurable maximum allowable fragments in process. */
164 atomic_uint nfrag_max;
165
166 /* Number of fragments in process. */
167 atomic_count nfrag;
168
169 atomic_uint64_t n4frag_cnt[IPF_NFRAGS_NUM_CNTS];
170 atomic_uint64_t n6frag_cnt[IPF_NFRAGS_NUM_CNTS];
171 };
172
173 static void
174 ipf_print_reass_packet(const char *es, const void *pkt)
175 {
176 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
177 if (!VLOG_DROP_WARN(&rl)) {
178 struct ds ds = DS_EMPTY_INITIALIZER;
179 ds_put_hex_dump(&ds, pkt, 128, 0, false);
180 VLOG_WARN("%s\n%s", es, ds_cstr(&ds));
181 ds_destroy(&ds);
182 }
183 }
184
185 static void
186 ipf_count(struct ipf *ipf, bool v6, enum ipf_counter_type cntr)
187 {
188 atomic_count_inc64(v6 ? &ipf->n6frag_cnt[cntr] : &ipf->n4frag_cnt[cntr]);
189 }
190
191 static bool
192 ipf_get_v4_enabled(struct ipf *ipf)
193 {
194 bool ifp_v4_enabled_;
195 atomic_read_relaxed(&ipf->ifp_v4_enabled, &ifp_v4_enabled_);
196 return ifp_v4_enabled_;
197 }
198
199 static bool
200 ipf_get_v6_enabled(struct ipf *ipf)
201 {
202 bool ifp_v6_enabled_;
203 atomic_read_relaxed(&ipf->ifp_v6_enabled, &ifp_v6_enabled_);
204 return ifp_v6_enabled_;
205 }
206
207 static bool
208 ipf_get_enabled(struct ipf *ipf)
209 {
210 return ipf_get_v4_enabled(ipf) || ipf_get_v6_enabled(ipf);
211 }
212
213 static uint32_t
214 ipf_addr_hash_add(uint32_t hash, const union ipf_addr *addr)
215 {
216 BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
217 return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
218 }
219
220 /* Adds a fragment list to the list tracking the expiry of not yet
221 * completed reassembled packets, which are hence subject to expiry. */
222 static void
223 ipf_expiry_list_add(struct ovs_list *frag_exp_list, struct ipf_list *ipf_list,
224 long long now)
225 /* OVS_REQUIRES(ipf->ipf_lock) */
226 {
227 enum {
228 IPF_FRAG_LIST_TIMEOUT = 15000,
229 };
230
231 ipf_list->expiration = now + IPF_FRAG_LIST_TIMEOUT;
232 ovs_list_push_back(frag_exp_list, &ipf_list->list_node);
233 }
234
235 /* Adds a list of fragments to the list of completed packets, which will be
236 * subsequently transmitted. */
237 static void
238 ipf_completed_list_add(struct ovs_list *frag_complete_list,
239 struct ipf_list *ipf_list)
240 /* OVS_REQUIRES(ipf_lock) */
241 {
242 ovs_list_push_back(frag_complete_list, &ipf_list->list_node);
243 }
244
245 /* Adds a reassembled packet to the list of reassembled packets, awaiting some
246 * processing, such as being sent through conntrack. */
247 static void
248 ipf_reassembled_list_add(struct ovs_list *reassembled_pkt_list,
249 struct reassembled_pkt *rp)
250 /* OVS_REQUIRES(ipf_lock) */
251 {
252 ovs_list_push_back(reassembled_pkt_list, &rp->rp_list_node);
253 }
254
255 /* Removes a frag list from the tracking data structures and frees list heap
256 * memory. */
257 static void
258 ipf_list_clean(struct hmap *frag_lists,
259 struct ipf_list *ipf_list)
260 /* OVS_REQUIRES(ipf_lock) */
261 {
262 ovs_list_remove(&ipf_list->list_node);
263 hmap_remove(frag_lists, &ipf_list->node);
264 free(ipf_list->frag_list);
265 free(ipf_list);
266 }
267
268 /* Removes a frag list sitting on the expiry list from the tracking
269 * data structures and frees list heap memory. */
270 static void
271 ipf_expiry_list_clean(struct hmap *frag_lists,
272 struct ipf_list *ipf_list)
273 /* OVS_REQUIRES(ipf_lock) */
274 {
275 ipf_list_clean(frag_lists, ipf_list);
276 }
277
278 /* Removes a frag list sitting on the completed list from the tracking
279 * data structures and frees list heap memory. */
280 static void
281 ipf_completed_list_clean(struct hmap *frag_lists,
282 struct ipf_list *ipf_list)
283 /* OVS_REQUIRES(ipf_lock) */
284 {
285 ipf_list_clean(frag_lists, ipf_list);
286 }
287
288 static void
289 ipf_expiry_list_remove(struct ipf_list *ipf_list)
290 /* OVS_REQUIRES(ipf_lock) */
291 {
292 ovs_list_remove(&ipf_list->list_node);
293 }
294
295 static void
296 ipf_reassembled_list_remove(struct reassembled_pkt *rp)
297 /* OVS_REQUIRES(ipf_lock) */
298 {
299 ovs_list_remove(&rp->rp_list_node);
300 }
301
302 /* Symmetric in 'src_addr' and 'dst_addr'. */
303 static uint32_t
304 ipf_list_key_hash(const struct ipf_list_key *key, uint32_t basis)
305 {
306 uint32_t hsrc, hdst, hash;
307 hsrc = hdst = basis;
308 hsrc = ipf_addr_hash_add(hsrc, &key->src_addr);
309 hdst = ipf_addr_hash_add(hdst, &key->dst_addr);
310 hash = hsrc ^ hdst;
311
312 /* Hash the rest of the key. */
313 return hash_words((uint32_t *) (&key->dst_addr + 1),
314 (uint32_t *) (key + 1) -
315 (uint32_t *) (&key->dst_addr + 1),
316 hash);
317 }
318
319 static bool
320 ipf_is_first_v4_frag(const struct dp_packet *pkt)
321 {
322 const struct ip_header *l3 = dp_packet_l3(pkt);
323 if (!(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) &&
324 l3->ip_frag_off & htons(IP_MORE_FRAGMENTS)) {
325 return true;
326 }
327 return false;
328 }
329
330 static bool
331 ipf_is_last_v4_frag(const struct dp_packet *pkt)
332 {
333 const struct ip_header *l3 = dp_packet_l3(pkt);
334 if (l3->ip_frag_off & htons(IP_FRAG_OFF_MASK) &&
335 !(l3->ip_frag_off & htons(IP_MORE_FRAGMENTS))) {
336 return true;
337 }
338 return false;
339 }
340
341 static bool
342 ipf_is_v6_frag(ovs_be16 ip6f_offlg)
343 {
344 if (ip6f_offlg & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) {
345 return true;
346 }
347 return false;
348 }
349
350 static bool
351 ipf_is_first_v6_frag(ovs_be16 ip6f_offlg)
352 {
353 if (!(ip6f_offlg & IP6F_OFF_MASK) &&
354 ip6f_offlg & IP6F_MORE_FRAG) {
355 return true;
356 }
357 return false;
358 }
359
360 static bool
361 ipf_is_last_v6_frag(ovs_be16 ip6f_offlg)
362 {
363 if ((ip6f_offlg & IP6F_OFF_MASK) &&
364 !(ip6f_offlg & IP6F_MORE_FRAG)) {
365 return true;
366 }
367 return false;
368 }
369
370 /* Checks whether a sorted list of fragments forms a complete packet. */
371 static bool
372 ipf_list_complete(const struct ipf_list *ipf_list)
373 /* OVS_REQUIRES(ipf_lock) */
374 {
375 for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
376 if (ipf_list->frag_list[i - 1].end_data_byte + 1
377 != ipf_list->frag_list[i].start_data_byte) {
378 return false;
379 }
380 }
381 return true;
382 }
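/* A worked example for ipf_list_complete(), added for illustration (the byte
 * ranges are made up): a sorted list whose fragments carry data bytes
 * [0..1479], [1480..2959] and [2960..3003] is complete, since each
 * fragment's 'start_data_byte' equals the previous fragment's
 * 'end_data_byte' + 1.  A list covering only [0..1479] and [1600..2959]
 * leaves a gap and is therefore not complete. */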
383
384 /* Insertion sort; runs in O(n) for a sorted or almost sorted list. */
385 static void
386 ipf_sort(struct ipf_frag *frag_list, size_t last_idx)
387 /* OVS_REQUIRES(ipf_lock) */
388 {
389 for (int li = 1; li <= last_idx; li++) {
390 struct ipf_frag ipf_frag = frag_list[li];
391 int ci = li - 1;
392 while (ci >= 0 &&
393 frag_list[ci].start_data_byte > ipf_frag.start_data_byte) {
394 frag_list[ci + 1] = frag_list[ci];
395 ci--;
396 }
397 frag_list[ci + 1] = ipf_frag;
398 }
399 }
400
401 /* Called on a sorted complete list of v4 fragments to reassemble them into
402 * a single packet that can be processed, such as passing through conntrack.
403 */
404 static struct dp_packet *
405 ipf_reassemble_v4_frags(struct ipf_list *ipf_list)
406 /* OVS_REQUIRES(ipf_lock) */
407 {
408 struct ipf_frag *frag_list = ipf_list->frag_list;
409 struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
410 dp_packet_set_size(pkt, dp_packet_size(pkt) - dp_packet_l2_pad_size(pkt));
411 struct ip_header *l3 = dp_packet_l3(pkt);
412 int len = ntohs(l3->ip_tot_len);
413
414 int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
415 frag_list[1].start_data_byte + 1;
416
417 if (len + rest_len > IPV4_PACKET_MAX_SIZE) {
418 ipf_print_reass_packet(
419 "Unsupported big reassembled v4 packet; v4 hdr:", l3);
420 dp_packet_delete(pkt);
421 return NULL;
422 }
423
424 dp_packet_prealloc_tailroom(pkt, rest_len);
425
426 for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
427 size_t add_len = frag_list[i].end_data_byte -
428 frag_list[i].start_data_byte + 1;
429 const char *l4 = dp_packet_l4(frag_list[i].pkt);
430 dp_packet_put(pkt, l4, add_len);
431 }
432
433 len += rest_len;
434 l3 = dp_packet_l3(pkt);
435 ovs_be16 new_ip_frag_off = l3->ip_frag_off & ~htons(IP_MORE_FRAGMENTS);
436 if (!dp_packet_hwol_is_ipv4(pkt)) {
437 l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_frag_off,
438 new_ip_frag_off);
439 l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_tot_len, htons(len));
440 }
441 l3->ip_tot_len = htons(len);
442 l3->ip_frag_off = new_ip_frag_off;
443 dp_packet_set_l2_pad_size(pkt, 0);
444
445 return pkt;
446 }
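/* A worked example for ipf_reassemble_v4_frags(), with made-up values for
 * illustration: if the first fragment has ip_tot_len 1500 (a 20 byte header
 * plus data bytes [0..1479]) and the remaining fragments cover data bytes
 * [1480..2999], then rest_len = 2999 - 1480 + 1 = 1520 and the reassembled
 * ip_tot_len becomes 1500 + 1520 = 3020. */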
447
448 /* Called on a sorted complete list of v6 fragments to reassemble them into
449 * a single packet that can be processed, such as passing through conntrack.
450 */
451 static struct dp_packet *
452 ipf_reassemble_v6_frags(struct ipf_list *ipf_list)
453 /* OVS_REQUIRES(ipf_lock) */
454 {
455 struct ipf_frag *frag_list = ipf_list->frag_list;
456 struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
457 dp_packet_set_size(pkt, dp_packet_size(pkt) - dp_packet_l2_pad_size(pkt));
458 struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
459 int pl = ntohs(l3->ip6_plen) - sizeof(struct ovs_16aligned_ip6_frag);
460
461 int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
462 frag_list[1].start_data_byte + 1;
463
464 if (pl + rest_len > IPV6_PACKET_MAX_DATA) {
465 ipf_print_reass_packet(
466 "Unsupported big reassembled v6 packet; v6 hdr:", l3);
467 dp_packet_delete(pkt);
468 return NULL;
469 }
470
471 dp_packet_prealloc_tailroom(pkt, rest_len);
472
473 for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
474 size_t add_len = frag_list[i].end_data_byte -
475 frag_list[i].start_data_byte + 1;
476 const char *l4 = dp_packet_l4(frag_list[i].pkt);
477 dp_packet_put(pkt, l4, add_len);
478 }
479
480 pl += rest_len;
481 l3 = dp_packet_l3(pkt);
482
483 uint8_t nw_proto = l3->ip6_nxt;
484 uint8_t nw_frag = 0;
485 const void *data = l3 + 1;
486 size_t datasize = pl;
487
488 const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
489 if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr)
490 || !nw_frag || !frag_hdr) {
491
492 ipf_print_reass_packet("Unparsed reassembled v6 packet; v6 hdr:", l3);
493 dp_packet_delete(pkt);
494 return NULL;
495 }
496
497 struct ovs_16aligned_ip6_frag *fh =
498 CONST_CAST(struct ovs_16aligned_ip6_frag *, frag_hdr);
499 fh->ip6f_offlg = 0;
500 l3->ip6_plen = htons(pl);
501 l3->ip6_ctlun.ip6_un1.ip6_un1_nxt = nw_proto;
502 dp_packet_set_l2_pad_size(pkt, 0);
503 return pkt;
504 }
505
506 /* Called when a frag list state transitions to another state. This is
507 * triggered by a new fragment for the list being received. */
508 static void
509 ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list,
510 bool ff, bool lf, bool v6)
511 OVS_REQUIRES(ipf->ipf_lock)
512 {
513 enum ipf_list_state curr_state = ipf_list->state;
514 enum ipf_list_state next_state;
515 switch (curr_state) {
516 case IPF_LIST_STATE_UNUSED:
517 case IPF_LIST_STATE_OTHER_SEEN:
518 if (ff) {
519 next_state = IPF_LIST_STATE_FIRST_SEEN;
520 } else if (lf) {
521 next_state = IPF_LIST_STATE_LAST_SEEN;
522 } else {
523 next_state = IPF_LIST_STATE_OTHER_SEEN;
524 }
525 break;
526 case IPF_LIST_STATE_FIRST_SEEN:
527 if (lf) {
528 next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
529 } else {
530 next_state = IPF_LIST_STATE_FIRST_SEEN;
531 }
532 break;
533 case IPF_LIST_STATE_LAST_SEEN:
534 if (ff) {
535 next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
536 } else {
537 next_state = IPF_LIST_STATE_LAST_SEEN;
538 }
539 break;
540 case IPF_LIST_STATE_FIRST_LAST_SEEN:
541 next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
542 break;
543 case IPF_LIST_STATE_COMPLETED:
544 case IPF_LIST_STATE_REASS_FAIL:
545 case IPF_LIST_STATE_NUM:
546 default:
547 OVS_NOT_REACHED();
548 }
549
550 if (next_state == IPF_LIST_STATE_FIRST_LAST_SEEN) {
551 ipf_sort(ipf_list->frag_list, ipf_list->last_inuse_idx);
552 if (ipf_list_complete(ipf_list)) {
553 struct dp_packet *reass_pkt = v6
554 ? ipf_reassemble_v6_frags(ipf_list)
555 : ipf_reassemble_v4_frags(ipf_list);
556 if (reass_pkt) {
557 struct reassembled_pkt *rp = xzalloc(sizeof *rp);
558 rp->pkt = reass_pkt;
559 rp->list = ipf_list;
560 ipf_reassembled_list_add(&ipf->reassembled_pkt_list, rp);
561 ipf_expiry_list_remove(ipf_list);
562 next_state = IPF_LIST_STATE_COMPLETED;
563 } else {
564 next_state = IPF_LIST_STATE_REASS_FAIL;
565 }
566 }
567 }
568 ipf_list->state = next_state;
569 }
570
571 /* Some sanity checks are redundant, but prudent, in case code paths for
572 * fragments change in the future. The processing cost for fragments is not
573 * important. */
574 static bool
575 ipf_is_valid_v4_frag(struct ipf *ipf, struct dp_packet *pkt)
576 {
577 if (OVS_UNLIKELY(dp_packet_ip_checksum_bad(pkt))) {
578 goto invalid_pkt;
579 }
580
581 const struct eth_header *l2 = dp_packet_eth(pkt);
582 const struct ip_header *l3 = dp_packet_l3(pkt);
583
584 if (OVS_UNLIKELY(!l2 || !l3)) {
585 goto invalid_pkt;
586 }
587
588 size_t l3_size = dp_packet_l3_size(pkt);
589 if (OVS_UNLIKELY(l3_size < IP_HEADER_LEN)) {
590 goto invalid_pkt;
591 }
592
593 if (!IP_IS_FRAGMENT(l3->ip_frag_off)) {
594 return false;
595 }
596
597 uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
598 if (OVS_UNLIKELY(ip_tot_len != l3_size)) {
599 goto invalid_pkt;
600 }
601
602 size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
603 if (OVS_UNLIKELY(ip_hdr_len < IP_HEADER_LEN)) {
604 goto invalid_pkt;
605 }
606 if (OVS_UNLIKELY(l3_size < ip_hdr_len)) {
607 goto invalid_pkt;
608 }
609
610 if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(pkt)
611 && !dp_packet_hwol_is_ipv4(pkt)
612 && csum(l3, ip_hdr_len) != 0)) {
613 goto invalid_pkt;
614 }
615
616 uint32_t min_v4_frag_size_;
617 atomic_read_relaxed(&ipf->min_v4_frag_size, &min_v4_frag_size_);
618 bool lf = ipf_is_last_v4_frag(pkt);
619 if (OVS_UNLIKELY(!lf && dp_packet_l3_size(pkt) < min_v4_frag_size_)) {
620 ipf_count(ipf, false, IPF_NFRAGS_TOO_SMALL);
621 goto invalid_pkt;
622 }
623 return true;
624
625 invalid_pkt:
626 pkt->md.ct_state = CS_INVALID;
627 return false;
628 }
629
630 static bool
631 ipf_v4_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
632 struct ipf_list_key *key, uint16_t *start_data_byte,
633 uint16_t *end_data_byte, bool *ff, bool *lf)
634 {
635 const struct ip_header *l3 = dp_packet_l3(pkt);
636 uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
637 size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
638
639 *start_data_byte = ntohs(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) * 8;
640 *end_data_byte = *start_data_byte + ip_tot_len - ip_hdr_len - 1;
641 *ff = ipf_is_first_v4_frag(pkt);
642 *lf = ipf_is_last_v4_frag(pkt);
643 memset(key, 0, sizeof *key);
644 key->ip_id = be16_to_be32(l3->ip_id);
645 key->dl_type = dl_type;
646 key->src_addr.ipv4 = get_16aligned_be32(&l3->ip_src);
647 key->dst_addr.ipv4 = get_16aligned_be32(&l3->ip_dst);
648 key->nw_proto = l3->ip_proto;
649 key->zone = zone;
650 key->recirc_id = pkt->md.recirc_id;
651 return true;
652 }
653
654 /* Some sanity checks are redundant, but prudent, in case code paths for
655 * fragments change in the future. The processing cost for fragments is not
656 * important. */
657 static bool
658 ipf_is_valid_v6_frag(struct ipf *ipf, struct dp_packet *pkt)
659 {
660 const struct eth_header *l2 = dp_packet_eth(pkt);
661 const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
662 const char *l4 = dp_packet_l4(pkt);
663
664 if (OVS_UNLIKELY(!l2 || !l3 || !l4)) {
665 goto invalid_pkt;
666 }
667
668 size_t l3_size = dp_packet_l3_size(pkt);
669 size_t l3_hdr_size = sizeof *l3;
670
671 if (OVS_UNLIKELY(l3_size < l3_hdr_size)) {
672 goto invalid_pkt;
673 }
674
675 uint8_t nw_frag = 0;
676 uint8_t nw_proto = l3->ip6_nxt;
677 const void *data = l3 + 1;
678 size_t datasize = l3_size - l3_hdr_size;
679 const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
680 if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag,
681 &frag_hdr) || !nw_frag || !frag_hdr) {
682 return false;
683 }
684
685 int pl = ntohs(l3->ip6_plen);
686 if (OVS_UNLIKELY(pl + l3_hdr_size != l3_size)) {
687 goto invalid_pkt;
688 }
689
690 ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
691 if (OVS_UNLIKELY(!ipf_is_v6_frag(ip6f_offlg))) {
692 return false;
693 }
694
695 uint32_t min_v6_frag_size_;
696 atomic_read_relaxed(&ipf->min_v6_frag_size, &min_v6_frag_size_);
697 bool lf = ipf_is_last_v6_frag(ip6f_offlg);
698
699 if (OVS_UNLIKELY(!lf && dp_packet_l3_size(pkt) < min_v6_frag_size_)) {
700 ipf_count(ipf, true, IPF_NFRAGS_TOO_SMALL);
701 goto invalid_pkt;
702 }
703
704 return true;
705
706 invalid_pkt:
707 pkt->md.ct_state = CS_INVALID;
708 return false;
709
710 }
711
712 static void
713 ipf_v6_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
714 struct ipf_list_key *key, uint16_t *start_data_byte,
715 uint16_t *end_data_byte, bool *ff, bool *lf)
716 {
717 const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
718 uint8_t nw_frag = 0;
719 uint8_t nw_proto = l3->ip6_nxt;
720 const void *data = l3 + 1;
721 size_t datasize = dp_packet_l3_size(pkt) - sizeof *l3;
722 const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
723
724 parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr);
725 ovs_assert(nw_frag && frag_hdr);
726 ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
727 *start_data_byte = ntohs(ip6f_offlg & IP6F_OFF_MASK) +
728 sizeof (struct ovs_16aligned_ip6_frag);
729 *end_data_byte = *start_data_byte + dp_packet_l4_size(pkt) - 1;
730 *ff = ipf_is_first_v6_frag(ip6f_offlg);
731 *lf = ipf_is_last_v6_frag(ip6f_offlg);
732 memset(key, 0, sizeof *key);
733 key->ip_id = get_16aligned_be32(&frag_hdr->ip6f_ident);
734 key->dl_type = dl_type;
735 memcpy(&key->src_addr.ipv6, &l3->ip6_src, sizeof key->src_addr.ipv6);
736 /* We are not supporting parsing of the routing header to use as the
737 * dst address part of the key. */
738 memcpy(&key->dst_addr.ipv6, &l3->ip6_dst, sizeof key->dst_addr.ipv6);
739 key->nw_proto = 0; /* Not used for key for V6. */
740 key->zone = zone;
741 key->recirc_id = pkt->md.recirc_id;
742 }
743
744 static bool
745 ipf_list_key_eq(const struct ipf_list_key *key1,
746 const struct ipf_list_key *key2)
747 /* OVS_REQUIRES(ipf_lock) */
748 {
749 if (!memcmp(&key1->src_addr, &key2->src_addr, sizeof key1->src_addr) &&
750 !memcmp(&key1->dst_addr, &key2->dst_addr, sizeof key1->dst_addr) &&
751 key1->dl_type == key2->dl_type &&
752 key1->ip_id == key2->ip_id &&
753 key1->zone == key2->zone &&
754 key1->nw_proto == key2->nw_proto &&
755 key1->recirc_id == key2->recirc_id) {
756 return true;
757 }
758 return false;
759 }
760
761 static struct ipf_list *
762 ipf_list_key_lookup(struct ipf *ipf, const struct ipf_list_key *key,
763 uint32_t hash)
764 OVS_REQUIRES(ipf->ipf_lock)
765 {
766 struct ipf_list *ipf_list;
767 HMAP_FOR_EACH_WITH_HASH (ipf_list, node, hash, &ipf->frag_lists) {
768 if (ipf_list_key_eq(&ipf_list->key, key)) {
769 return ipf_list;
770 }
771 }
772 return NULL;
773 }
774
775 static bool
776 ipf_is_frag_duped(const struct ipf_frag *frag_list, int last_inuse_idx,
777 size_t start_data_byte, size_t end_data_byte)
778 /* OVS_REQUIRES(ipf_lock) */
779 {
780 for (int i = 0; i <= last_inuse_idx; i++) {
781 if ((start_data_byte >= frag_list[i].start_data_byte &&
782 start_data_byte <= frag_list[i].end_data_byte) ||
783 (end_data_byte >= frag_list[i].start_data_byte &&
784 end_data_byte <= frag_list[i].end_data_byte)) {
785 return true;
786 }
787 }
788 return false;
789 }
790
791 /* Adds a fragment to a list of fragments, if the fragment is not a
792 * duplicate. If the fragment is a duplicate, it is marked invalid here,
793 * avoiding the work that conntrack would otherwise do to mark it as
794 * invalid, which it would in all cases. */
795 static bool
796 ipf_process_frag(struct ipf *ipf, struct ipf_list *ipf_list,
797 struct dp_packet *pkt, uint16_t start_data_byte,
798 uint16_t end_data_byte, bool ff, bool lf, bool v6,
799 bool dnsteal)
800 OVS_REQUIRES(ipf->ipf_lock)
801 {
802 bool duped_frag = ipf_is_frag_duped(ipf_list->frag_list,
803 ipf_list->last_inuse_idx, start_data_byte, end_data_byte);
804 int last_inuse_idx = ipf_list->last_inuse_idx;
805
806 if (!duped_frag) {
807 if (last_inuse_idx < ipf_list->size - 1) {
808 /* In the case of DPDK, it would be unfortunate if we had
809 * to create a clone fragment outside the DPDK mempool due to
810 * the mempool size being too limited. We will otherwise need to
811 * recommend not setting the number of mempool buffers too low
812 * and also clamp the number of fragments. */
813 struct ipf_frag *frag = &ipf_list->frag_list[last_inuse_idx + 1];
814 frag->pkt = pkt;
815 frag->start_data_byte = start_data_byte;
816 frag->end_data_byte = end_data_byte;
817 frag->dnsteal = dnsteal;
818 ipf_list->last_inuse_idx++;
819 atomic_count_inc(&ipf->nfrag);
820 ipf_count(ipf, v6, IPF_NFRAGS_ACCEPTED);
821 ipf_list_state_transition(ipf, ipf_list, ff, lf, v6);
822 } else {
823 OVS_NOT_REACHED();
824 }
825 } else {
826 ipf_count(ipf, v6, IPF_NFRAGS_OVERLAP);
827 pkt->md.ct_state = CS_INVALID;
828 return false;
829 }
830 return true;
831 }
832
833 static void
834 ipf_list_init(struct ipf_list *ipf_list, struct ipf_list_key *key,
835 int max_frag_list_size)
836 {
837 ipf_list->key = *key;
838 ipf_list->last_inuse_idx = IPF_INVALID_IDX;
839 ipf_list->last_sent_idx = IPF_INVALID_IDX;
840 ipf_list->reass_execute_ctx = NULL;
841 ipf_list->state = IPF_LIST_STATE_UNUSED;
842 ipf_list->size = max_frag_list_size;
843 ipf_list->frag_list
844 = xzalloc(ipf_list->size * sizeof *ipf_list->frag_list);
845 }
846
847 /* Generates a fragment list key from a well-formed fragment and either starts
848 * a new fragment list or increases the size of the existing fragment list,
849 * while checking that the maximum number of supported fragments is not
850 * exceeded and that the list size is not impossibly big. Calls
851 * 'ipf_process_frag()' to add a fragment to a list of fragments. */
852 static bool
853 ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, ovs_be16 dl_type,
854 uint16_t zone, long long now, uint32_t hash_basis,
855 bool dnsteal)
856 OVS_REQUIRES(ipf->ipf_lock)
857 {
858 struct ipf_list_key key;
859 /* Initialize 4 variables for some versions of GCC. */
860 uint16_t start_data_byte = 0;
861 uint16_t end_data_byte = 0;
862 bool ff = false;
863 bool lf = false;
864 bool v6 = dl_type == htons(ETH_TYPE_IPV6);
865
866 if (v6 && ipf_get_v6_enabled(ipf)) {
867 ipf_v6_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
868 &end_data_byte, &ff, &lf);
869 } else if (!v6 && ipf_get_v4_enabled(ipf)) {
870 ipf_v4_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
871 &end_data_byte, &ff, &lf);
872 } else {
873 return false;
874 }
875
876 unsigned int nfrag_max;
877 atomic_read_relaxed(&ipf->nfrag_max, &nfrag_max);
878 if (atomic_count_get(&ipf->nfrag) >= nfrag_max) {
879 return false;
880 }
881
882 uint32_t hash = ipf_list_key_hash(&key, hash_basis);
883 struct ipf_list *ipf_list = ipf_list_key_lookup(ipf, &key, hash);
884 enum {
885 IPF_FRAG_LIST_MIN_INCREMENT = 4,
886 IPF_IPV6_MAX_FRAG_LIST_SIZE = 65535,
887 };
888
889 int max_frag_list_size;
890 if (v6) {
891 /* Because the calculation with extension headers is variable,
892 * we don't calculate a hard maximum fragment list size upfront. The
893 * fragment list size is practically limited by the code, however. */
894 max_frag_list_size = IPF_IPV6_MAX_FRAG_LIST_SIZE;
895 } else {
896 max_frag_list_size = ipf->max_v4_frag_list_size;
897 }
898
899 if (!ipf_list) {
900 ipf_list = xmalloc(sizeof *ipf_list);
901 ipf_list_init(ipf_list, &key,
902 MIN(max_frag_list_size, IPF_FRAG_LIST_MIN_INCREMENT));
903 hmap_insert(&ipf->frag_lists, &ipf_list->node, hash);
904 ipf_expiry_list_add(&ipf->frag_exp_list, ipf_list, now);
905 } else if (ipf_list->state == IPF_LIST_STATE_REASS_FAIL ||
906 ipf_list->state == IPF_LIST_STATE_COMPLETED) {
907 /* Bail out as early as possible. */
908 return false;
909 } else if (ipf_list->last_inuse_idx + 1 >= ipf_list->size) {
910 int increment = MIN(IPF_FRAG_LIST_MIN_INCREMENT,
911 max_frag_list_size - ipf_list->size);
912 /* Enforce limit. */
913 if (increment > 0) {
914 ipf_list->frag_list =
915 xrealloc(ipf_list->frag_list, (ipf_list->size + increment) *
916 sizeof *ipf_list->frag_list);
917 ipf_list->size += increment;
918 } else {
919 return false;
920 }
921 }
922
923 return ipf_process_frag(ipf, ipf_list, pkt, start_data_byte,
924 end_data_byte, ff, lf, v6, dnsteal);
925 }
926
927 /* Filters out fragments from a batch of fragments and adjusts the batch. */
928 static void
929 ipf_extract_frags_from_batch(struct ipf *ipf, struct dp_packet_batch *pb,
930 ovs_be16 dl_type, uint16_t zone, long long now,
931 uint32_t hash_basis)
932 {
933 const size_t pb_cnt = dp_packet_batch_size(pb);
934 int pb_idx; /* Index in a packet batch. */
935 struct dp_packet *pkt;
936
937 DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
938 if (OVS_UNLIKELY((dl_type == htons(ETH_TYPE_IP) &&
939 ipf_is_valid_v4_frag(ipf, pkt))
940 ||
941 (dl_type == htons(ETH_TYPE_IPV6) &&
942 ipf_is_valid_v6_frag(ipf, pkt)))) {
943
944 ovs_mutex_lock(&ipf->ipf_lock);
945 if (!ipf_handle_frag(ipf, pkt, dl_type, zone, now, hash_basis,
946 pb->do_not_steal)) {
947 dp_packet_batch_refill(pb, pkt, pb_idx);
948 }
949 ovs_mutex_unlock(&ipf->ipf_lock);
950 } else {
951 dp_packet_batch_refill(pb, pkt, pb_idx);
952 }
953 }
954 }
955
956 /* In the case of DPDK, a memory source check is done, as DPDK memory pool
957 * management has trouble dealing with multiple source types. The
958 * 'check_source' parameter indicates when this check is needed. */
959 static bool
960 ipf_dp_packet_batch_add(struct dp_packet_batch *pb , struct dp_packet *pkt,
961 bool check_source OVS_UNUSED)
962 {
963 #ifdef DPDK_NETDEV
964 if ((dp_packet_batch_is_full(pb)) ||
965 /* DPDK cannot handle multiple sources in a batch. */
966 (check_source && !dp_packet_batch_is_empty(pb)
967 && pb->packets[0]->source != pkt->source)) {
968 #else
969 if (dp_packet_batch_is_full(pb)) {
970 #endif
971 return false;
972 }
973
974 dp_packet_batch_add(pb, pkt);
975 return true;
976 }
977
978 /* This would be used in rare cases where a list cannot be sent. One rare
979 * reason known right now is a mempool source check, which exists due to DPDK
980 * support, where packets are no longer being received on any port with a
981 * source matching the fragment. Another reason is a race where all
982 * conntrack rules are unconfigured when some fragments are yet to be
983 * flushed.
984 *
985 * Returns true if the list was purged. */
986 static bool
987 ipf_purge_list_check(struct ipf *ipf, struct ipf_list *ipf_list,
988 long long now)
989 OVS_REQUIRES(ipf->ipf_lock)
990 {
991 enum {
992 IPF_FRAG_LIST_PURGE_TIME_ADJ = 10000
993 };
994
995 if (now < ipf_list->expiration + IPF_FRAG_LIST_PURGE_TIME_ADJ) {
996 return false;
997 }
998
999 while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
1000 struct dp_packet * pkt
1001 = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
1002 dp_packet_delete(pkt);
1003 atomic_count_dec(&ipf->nfrag);
1004 COVERAGE_INC(ipf_stuck_frag_list_purged);
1005 ipf_count(ipf, ipf_list->key.dl_type == htons(ETH_TYPE_IPV6),
1006 IPF_NFRAGS_PURGED);
1007 ipf_list->last_sent_idx++;
1008 }
1009
1010 return true;
1011 }
1012
1013 /* Does the packet batch management and common accounting work associated
1014 * with 'ipf_send_completed_frags()' and 'ipf_send_expired_frags()'. */
1015 static bool
1016 ipf_send_frags_in_list(struct ipf *ipf, struct ipf_list *ipf_list,
1017 struct dp_packet_batch *pb,
1018 enum ipf_list_type list_type, bool v6, long long now)
1019 OVS_REQUIRES(ipf->ipf_lock)
1020 {
1021 if (ipf_purge_list_check(ipf, ipf_list, now)) {
1022 return true;
1023 }
1024
1025 while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
1026 struct dp_packet *pkt
1027 = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
1028 if (ipf_dp_packet_batch_add(pb, pkt, true)) {
1029 ipf_list->last_sent_idx++;
1030 atomic_count_dec(&ipf->nfrag);
1031
1032 if (list_type == IPF_FRAG_COMPLETED_LIST) {
1033 ipf_count(ipf, v6, IPF_NFRAGS_COMPL_SENT);
1034 } else {
1035 ipf_count(ipf, v6, IPF_NFRAGS_EXPD_SENT);
1036 pkt->md.ct_state = CS_INVALID;
1037 }
1038
1039 if (ipf_list->last_sent_idx == ipf_list->last_inuse_idx) {
1040 return true;
1041 }
1042 } else {
1043 return false;
1044 }
1045 }
1046 OVS_NOT_REACHED();
1047 }
1048
1049 /* Adds fragments associated with a completed fragment list to a packet batch
1050 * to be processed by the calling application, typically conntrack. Also
1051 * cleans up the list context when it is empty. */
1052 static void
1053 ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb,
1054 long long now, bool v6)
1055 {
1056 if (ovs_list_is_empty(&ipf->frag_complete_list)) {
1057 return;
1058 }
1059
1060 ovs_mutex_lock(&ipf->ipf_lock);
1061 struct ipf_list *ipf_list, *next;
1062
1063 LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_complete_list) {
1064 if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_COMPLETED_LIST,
1065 v6, now)) {
1066 ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
1067 } else {
1068 break;
1069 }
1070 }
1071
1072 ovs_mutex_unlock(&ipf->ipf_lock);
1073 }
1074
1075 /* Conservatively adds fragments associated with an expired fragment list to
1076 * a packet batch to be processed by the calling application, typically
1077 * conntrack. Also cleans up the list context when it is empty. */
1078 static void
1079 ipf_send_expired_frags(struct ipf *ipf, struct dp_packet_batch *pb,
1080 long long now, bool v6)
1081 {
1082 enum {
1083 /* Very conservative, due to DoS probability. */
1084 IPF_FRAG_LIST_MAX_EXPIRED = 1,
1085 };
1086
1087
1088 if (ovs_list_is_empty(&ipf->frag_exp_list)) {
1089 return;
1090 }
1091
1092 ovs_mutex_lock(&ipf->ipf_lock);
1093 struct ipf_list *ipf_list, *next;
1094 size_t lists_removed = 0;
1095
1096 LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_exp_list) {
1097 if (now <= ipf_list->expiration ||
1098 lists_removed >= IPF_FRAG_LIST_MAX_EXPIRED) {
1099 break;
1100 }
1101
1102 if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_EXPIRY_LIST,
1103 v6, now)) {
1104 ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
1105 lists_removed++;
1106 } else {
1107 break;
1108 }
1109 }
1110
1111 ovs_mutex_unlock(&ipf->ipf_lock);
1112 }
1113
1114 /* Adds a reassembled packet to a packet batch to be processed by the caller.
1115 */
1116 static void
1117 ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb)
1118 {
1119 if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
1120 return;
1121 }
1122
1123 ovs_mutex_lock(&ipf->ipf_lock);
1124 struct reassembled_pkt *rp, *next;
1125
1126 LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
1127 if (!rp->list->reass_execute_ctx &&
1128 ipf_dp_packet_batch_add(pb, rp->pkt, false)) {
1129 rp->list->reass_execute_ctx = rp->pkt;
1130 }
1131 }
1132
1133 ovs_mutex_unlock(&ipf->ipf_lock);
1134 }
1135
1136 /* Checks for reassembled packets after processing by conntrack and edits
1137 * the fragments if needed, based on what conntrack decided. */
1138 static void
1139 ipf_post_execute_reass_pkts(struct ipf *ipf,
1140 struct dp_packet_batch *pb, bool v6)
1141 {
1142 if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
1143 return;
1144 }
1145
1146 ovs_mutex_lock(&ipf->ipf_lock);
1147 struct reassembled_pkt *rp, *next;
1148
1149 LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
1150 const size_t pb_cnt = dp_packet_batch_size(pb);
1151 int pb_idx;
1152 struct dp_packet *pkt;
1153 /* Inner batch loop is constant time since batch size is <=
1154 * NETDEV_MAX_BURST. */
1155 DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
1156 if (pkt == rp->list->reass_execute_ctx) {
1157 for (int i = 0; i <= rp->list->last_inuse_idx; i++) {
1158 rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label;
1159 rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark;
1160 rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state;
1161 rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone;
1162 rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 =
1163 pkt->md.ct_orig_tuple_ipv6;
1164 if (pkt->md.ct_orig_tuple_ipv6) {
1165 rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 =
1166 pkt->md.ct_orig_tuple.ipv6;
1167 } else {
1168 rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4 =
1169 pkt->md.ct_orig_tuple.ipv4;
1170 }
1171 }
1172
1173 const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
1174 void *l4_frag = dp_packet_l4(frag_0->pkt);
1175 void *l4_reass = dp_packet_l4(pkt);
1176 memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt));
1177
1178 if (v6) {
1179 struct ovs_16aligned_ip6_hdr *l3_frag
1180 = dp_packet_l3(frag_0->pkt);
1181 struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt);
1182 l3_frag->ip6_src = l3_reass->ip6_src;
1183 l3_frag->ip6_dst = l3_reass->ip6_dst;
1184 } else {
1185 struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt);
1186 struct ip_header *l3_reass = dp_packet_l3(pkt);
1187 if (!dp_packet_hwol_is_ipv4(frag_0->pkt)) {
1188 ovs_be32 reass_ip =
1189 get_16aligned_be32(&l3_reass->ip_src);
1190 ovs_be32 frag_ip =
1191 get_16aligned_be32(&l3_frag->ip_src);
1192
1193 l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
1194 frag_ip, reass_ip);
1195 reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
1196 frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
1197 l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
1198 frag_ip, reass_ip);
1199 }
1200
1201 l3_frag->ip_src = l3_reass->ip_src;
1202 l3_frag->ip_dst = l3_reass->ip_dst;
1203 }
1204
1205 ipf_completed_list_add(&ipf->frag_complete_list, rp->list);
1206 ipf_reassembled_list_remove(rp);
1207 dp_packet_delete(rp->pkt);
1208 free(rp);
1209 } else {
1210 dp_packet_batch_refill(pb, pkt, pb_idx);
1211 }
1212 }
1213 }
1214
1215 ovs_mutex_unlock(&ipf->ipf_lock);
1216 }
1217
1218 /* Extracts any fragments from the batch and reassembles them when a
1219 * complete packet is received. Completed packets are added to the batch
1220 * to be sent through conntrack, when possible. */
1221 void
1222 ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
1223 long long now, ovs_be16 dl_type, uint16_t zone,
1224 uint32_t hash_basis)
1225 {
1226 if (ipf_get_enabled(ipf)) {
1227 ipf_extract_frags_from_batch(ipf, pb, dl_type, zone, now, hash_basis);
1228 }
1229
1230 if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
1231 ipf_execute_reass_pkts(ipf, pb);
1232 }
1233 }
1234
1235 /* Updates fragments based on the processing of the reassembled packet sent
1236 * through conntrack and adds these fragments to any batches seen. Expired
1237 * fragments are marked as invalid and also added to the batches seen
1238 * with low priority. Reassembled packets are freed. */
1239 void
1240 ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
1241 long long now, ovs_be16 dl_type)
1242 {
1243 if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
1244 bool v6 = dl_type == htons(ETH_TYPE_IPV6);
1245 ipf_post_execute_reass_pkts(ipf, pb, v6);
1246 ipf_send_completed_frags(ipf, pb, now, v6);
1247 ipf_send_expired_frags(ipf, pb, now, v6);
1248 }
1249 }
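/* A minimal usage sketch of the public entry points above; this is
 * illustrative only and not taken from an actual caller in this tree.
 * Here 'pb' is the caller's batch, and 'zone' and 'hash_basis' stand for
 * whatever values the calling conntrack instance uses:
 *
 *     struct ipf *ipf = ipf_init();
 *     long long now = time_msec();
 *
 *     ipf_preprocess_conntrack(ipf, pb, now, htons(ETH_TYPE_IP), zone,
 *                              hash_basis);
 *     ... pass 'pb', now carrying reassembled packets in place of
 *     fragments, through conntrack ...
 *     ipf_postprocess_conntrack(ipf, pb, now, htons(ETH_TYPE_IP));
 *
 *     ipf_destroy(ipf);
 */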
1250
1251 static void *
1252 ipf_clean_thread_main(void *f)
1253 {
1254 struct ipf *ipf = f;
1255
1256 enum {
1257 IPF_FRAG_LIST_CLEAN_TIMEOUT = 60000,
1258 };
1259
1260 while (!latch_is_set(&ipf->ipf_clean_thread_exit)) {
1261
1262 long long now = time_msec();
1263
1264 if (!ovs_list_is_empty(&ipf->frag_exp_list) ||
1265 !ovs_list_is_empty(&ipf->frag_complete_list)) {
1266
1267 ovs_mutex_lock(&ipf->ipf_lock);
1268
1269 struct ipf_list *ipf_list, *next;
1270 LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
1271 &ipf->frag_exp_list) {
1272 if (ipf_purge_list_check(ipf, ipf_list, now)) {
1273 ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
1274 }
1275 }
1276
1277 LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
1278 &ipf->frag_complete_list) {
1279 if (ipf_purge_list_check(ipf, ipf_list, now)) {
1280 ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
1281 }
1282 }
1283
1284 ovs_mutex_unlock(&ipf->ipf_lock);
1285 }
1286
1287 poll_timer_wait_until(now + IPF_FRAG_LIST_CLEAN_TIMEOUT);
1288 latch_wait(&ipf->ipf_clean_thread_exit);
1289 poll_block();
1290 }
1291
1292 return NULL;
1293 }
1294
1295 struct ipf *
1296 ipf_init(void)
1297 {
1298 struct ipf *ipf = xzalloc(sizeof *ipf);
1299
1300 ovs_mutex_init_adaptive(&ipf->ipf_lock);
1301 ovs_mutex_lock(&ipf->ipf_lock);
1302 hmap_init(&ipf->frag_lists);
1303 ovs_list_init(&ipf->frag_exp_list);
1304 ovs_list_init(&ipf->frag_complete_list);
1305 ovs_list_init(&ipf->reassembled_pkt_list);
1306 atomic_init(&ipf->min_v4_frag_size, IPF_V4_FRAG_SIZE_MIN_DEF);
1307 atomic_init(&ipf->min_v6_frag_size, IPF_V6_FRAG_SIZE_MIN_DEF);
1308 ipf->max_v4_frag_list_size = DIV_ROUND_UP(
1309 IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
1310 ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
1311 ovs_mutex_unlock(&ipf->ipf_lock);
1312 atomic_count_init(&ipf->nfrag, 0);
1313 for (size_t i = 0; i < IPF_NFRAGS_NUM_CNTS; i++) {
1314 atomic_init(&ipf->n4frag_cnt[i], 0);
1315 atomic_init(&ipf->n6frag_cnt[i], 0);
1316 }
1317 atomic_init(&ipf->nfrag_max, IPF_MAX_FRAGS_DEFAULT);
1318 atomic_init(&ipf->ifp_v4_enabled, true);
1319 atomic_init(&ipf->ifp_v6_enabled, true);
1320 latch_init(&ipf->ipf_clean_thread_exit);
1321 ipf->ipf_clean_thread = ovs_thread_create("ipf_clean",
1322 ipf_clean_thread_main, ipf);
1323
1324 return ipf;
1325 }
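/* For illustration only: with the defaults initialized above
 * (min_v4_frag_size = IPF_V4_FRAG_SIZE_MIN_DEF = 1200 and
 * IPV4_PACKET_MAX_HDR_SIZE = 60), max_v4_frag_list_size works out to
 * DIV_ROUND_UP(65535 - 60, 1200 - 60) = DIV_ROUND_UP(65475, 1140) = 58
 * fragments per v4 list. */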
1326
1327 void
1328 ipf_destroy(struct ipf *ipf)
1329 {
1330 ovs_mutex_lock(&ipf->ipf_lock);
1331 latch_set(&ipf->ipf_clean_thread_exit);
1332 pthread_join(ipf->ipf_clean_thread, NULL);
1333 latch_destroy(&ipf->ipf_clean_thread_exit);
1334
1335 struct ipf_list *ipf_list;
1336 HMAP_FOR_EACH_POP (ipf_list, node, &ipf->frag_lists) {
1337 while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
1338 struct dp_packet *pkt
1339 = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
1340 if (!ipf_list->frag_list[ipf_list->last_sent_idx + 1].dnsteal) {
1341 dp_packet_delete(pkt);
1342 }
1343 atomic_count_dec(&ipf->nfrag);
1344 ipf_list->last_sent_idx++;
1345 }
1346 free(ipf_list->frag_list);
1347 free(ipf_list);
1348 }
1349
1350 if (atomic_count_get(&ipf->nfrag)) {
1351 VLOG_WARN("ipf destroy with non-zero fragment count. ");
1352 }
1353
1354 struct reassembled_pkt *rp;
1355 LIST_FOR_EACH_POP (rp, rp_list_node, &ipf->reassembled_pkt_list) {
1356 dp_packet_delete(rp->pkt);
1357 free(rp);
1358 }
1359
1360 hmap_destroy(&ipf->frag_lists);
1361 ovs_list_poison(&ipf->frag_exp_list);
1362 ovs_list_poison(&ipf->frag_complete_list);
1363 ovs_list_poison(&ipf->reassembled_pkt_list);
1364 ovs_mutex_unlock(&ipf->ipf_lock);
1365 ovs_mutex_destroy(&ipf->ipf_lock);
1366 free(ipf);
1367 }
1368
1369 int
1370 ipf_set_enabled(struct ipf *ipf, bool v6, bool enable)
1371 {
1372 atomic_store_relaxed(v6 ? &ipf->ifp_v6_enabled : &ipf->ifp_v4_enabled,
1373 enable);
1374 return 0;
1375 }
1376
1377 int
1378 ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value)
1379 {
1380 /* If the user specifies an unreasonably large number, fragmentation
1381 * will not work well but it will not blow up. */
1382 if (value < (v6 ? IPF_V6_FRAG_SIZE_LBOUND : IPF_V4_FRAG_SIZE_LBOUND)) {
1383 return 1;
1384 }
1385
1386 ovs_mutex_lock(&ipf->ipf_lock);
1387 if (v6) {
1388 atomic_store_relaxed(&ipf->min_v6_frag_size, value);
1389 } else {
1390 atomic_store_relaxed(&ipf->min_v4_frag_size, value);
1391 ipf->max_v4_frag_list_size = DIV_ROUND_UP(
1392 IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
1393 ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
1394 }
1395 ovs_mutex_unlock(&ipf->ipf_lock);
1396 return 0;
1397 }
1398
1399 int
1400 ipf_set_max_nfrags(struct ipf *ipf, uint32_t value)
1401 {
1402 if (value > IPF_NFRAG_UBOUND) {
1403 return 1;
1404 }
1405 atomic_store_relaxed(&ipf->nfrag_max, value);
1406 return 0;
1407 }
1408
1409 int
1410 ipf_get_status(struct ipf *ipf, struct ipf_status *ipf_status)
1411 {
1412 ipf_status->nfrag = atomic_count_get(&ipf->nfrag);
1413 atomic_read_relaxed(&ipf->nfrag_max, &ipf_status->nfrag_max);
1414
1415 atomic_read_relaxed(&ipf->ifp_v4_enabled, &ipf_status->v4.enabled);
1416 atomic_read_relaxed(&ipf->min_v4_frag_size,
1417 &ipf_status->v4.min_frag_size);
1418 atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_ACCEPTED],
1419 &ipf_status->v4.nfrag_accepted);
1420 atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_COMPL_SENT],
1421 &ipf_status->v4.nfrag_completed_sent);
1422 atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_EXPD_SENT],
1423 &ipf_status->v4.nfrag_expired_sent);
1424 atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_TOO_SMALL],
1425 &ipf_status->v4.nfrag_too_small);
1426 atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_OVERLAP],
1427 &ipf_status->v4.nfrag_overlap);
1428 atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_PURGED],
1429 &ipf_status->v4.nfrag_purged);
1430
1431 atomic_read_relaxed(&ipf->ifp_v6_enabled, &ipf_status->v6.enabled);
1432 atomic_read_relaxed(&ipf->min_v6_frag_size,
1433 &ipf_status->v6.min_frag_size);
1434 atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_ACCEPTED],
1435 &ipf_status->v6.nfrag_accepted);
1436 atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_COMPL_SENT],
1437 &ipf_status->v6.nfrag_completed_sent);
1438 atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_EXPD_SENT],
1439 &ipf_status->v6.nfrag_expired_sent);
1440 atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_TOO_SMALL],
1441 &ipf_status->v6.nfrag_too_small);
1442 atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_OVERLAP],
1443 &ipf_status->v6.nfrag_overlap);
1444 atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_PURGED],
1445 &ipf_status->v6.nfrag_purged);
1446 return 0;
1447 }
1448
1449 struct ipf_dump_ctx {
1450 struct hmap_position bucket_pos;
1451 };
1452
1453 /* Allocates an 'ipf_dump_ctx' to keep track of an hmap position. The
1454 * caller must call ipf_dump_done() when dumping is finished. */
1455 int
1456 ipf_dump_start(struct ipf_dump_ctx **ipf_dump_ctx)
1457 {
1458 *ipf_dump_ctx = xzalloc(sizeof **ipf_dump_ctx);
1459 return 0;
1460 }
1461
1462 /* Creates a string representation of the state of an 'ipf_list' and puts
1463 * it in 'ds'. */
1464 static void
1465 ipf_dump_create(const struct ipf_list *ipf_list, struct ds *ds)
1466 {
1467 ds_put_cstr(ds, "(");
1468 if (ipf_list->key.dl_type == htons(ETH_TYPE_IP)) {
1469 ds_put_format(ds, "src="IP_FMT",dst="IP_FMT",",
1470 IP_ARGS(ipf_list->key.src_addr.ipv4),
1471 IP_ARGS(ipf_list->key.dst_addr.ipv4));
1472 } else {
1473 ds_put_cstr(ds, "src=");
1474 ipv6_format_addr(&ipf_list->key.src_addr.ipv6, ds);
1475 ds_put_cstr(ds, ",dst=");
1476 ipv6_format_addr(&ipf_list->key.dst_addr.ipv6, ds);
1477 ds_put_cstr(ds, ",");
1478 }
1479
1480 ds_put_format(ds, "recirc_id=%u,ip_id=%u,dl_type=0x%x,zone=%u,nw_proto=%u",
1481 ipf_list->key.recirc_id, ntohl(ipf_list->key.ip_id),
1482 ntohs(ipf_list->key.dl_type), ipf_list->key.zone,
1483 ipf_list->key.nw_proto);
1484
1485 ds_put_format(ds, ",num_fragments=%u,state=%s",
1486 ipf_list->last_inuse_idx + 1,
1487 ipf_state_name[ipf_list->state]);
1488
1489 ds_put_cstr(ds, ")");
1490 }
1491
1492 /* Finds the next ipf list starting from 'ipf_dump_ctx->bucket_pos' and uses
1493 * ipf_dump_create() to create a string representation of the state of an
1494 * ipf list, to which 'dump' is pointed. Returns EOF when there are no
1495 * more ipf lists. */
1496 int
1497 ipf_dump_next(struct ipf *ipf, struct ipf_dump_ctx *ipf_dump_ctx, char **dump)
1498 {
1499 ovs_mutex_lock(&ipf->ipf_lock);
1500
1501 struct hmap_node *node = hmap_at_position(&ipf->frag_lists,
1502 &ipf_dump_ctx->bucket_pos);
1503 if (!node) {
1504 ovs_mutex_unlock(&ipf->ipf_lock);
1505 return EOF;
1506 } else {
1507 struct ipf_list *ipf_list_;
1508 INIT_CONTAINER(ipf_list_, node, node);
1509 struct ipf_list ipf_list = *ipf_list_;
1510 ovs_mutex_unlock(&ipf->ipf_lock);
1511 struct ds ds = DS_EMPTY_INITIALIZER;
1512 ipf_dump_create(&ipf_list, &ds);
1513 *dump = ds_steal_cstr(&ds);
1514 return 0;
1515 }
1516 }
1517
1518 /* Frees 'ipf_dump_ctx' allocated by ipf_dump_start(). */
1519 int
1520 ipf_dump_done(struct ipf_dump_ctx *ipf_dump_ctx)
1521 {
1522 free(ipf_dump_ctx);
1523 return 0;
1524 }
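/* A minimal usage sketch for the dump interface above, illustrative only
 * (error handling omitted); 'ipf' is assumed to be an existing 'struct ipf *',
 * and each string returned via 'dump' was created with ds_steal_cstr() and
 * must be freed by the caller:
 *
 *     struct ipf_dump_ctx *ctx;
 *     char *dump;
 *
 *     ipf_dump_start(&ctx);
 *     while (ipf_dump_next(ipf, ctx, &dump) != EOF) {
 *         puts(dump);
 *         free(dump);
 *     }
 *     ipf_dump_done(ctx);
 */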