/*
 * Copyright (c) 2019 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include "openvswitch/hmap.h"
#include "openvswitch/poll-loop.h"
#include "openvswitch/vlog.h"
#include "ovs-atomic.h"

VLOG_DEFINE_THIS_MODULE(ipf);
COVERAGE_DEFINE(ipf_stuck_frag_list_purged);
enum {
    IPV4_PACKET_MAX_HDR_SIZE = 60,
    IPV4_PACKET_MAX_SIZE = 65535,
    IPV6_PACKET_MAX_DATA = 65535,
};

enum ipf_list_state {
    IPF_LIST_STATE_UNUSED,
    IPF_LIST_STATE_REASS_FAIL,
    IPF_LIST_STATE_OTHER_SEEN,
    IPF_LIST_STATE_FIRST_SEEN,
    IPF_LIST_STATE_LAST_SEEN,
    IPF_LIST_STATE_FIRST_LAST_SEEN,
    IPF_LIST_STATE_COMPLETED,
    IPF_LIST_STATE_NUM,
};

static char *ipf_state_name[IPF_LIST_STATE_NUM] =
    {"unused", "reassemble fail", "other frag", "first frag", "last frag",
     "first/last frag", "complete"};
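/* Example of how a fragment list typically moves through these states
 * (illustration only; the authoritative logic is in
 * ipf_list_state_transition() further below):
 *
 *     UNUSED -> FIRST_SEEN          (first fragment arrives first)
 *     FIRST_SEEN -> FIRST_LAST_SEEN (last fragment arrives)
 *     FIRST_LAST_SEEN -> COMPLETED  (all middle fragments present and
 *                                    reassembly succeeds)
 *     FIRST_LAST_SEEN -> REASS_FAIL (reassembly fails, e.g. oversized)
 *
 * Fragments may arrive in any order, so OTHER_SEEN and LAST_SEEN cover the
 * cases where a middle or last fragment is seen before the first one. */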
enum ipf_list_type {
    IPF_FRAG_COMPLETED_LIST,
    IPF_FRAG_EXPIRY_LIST,
};

enum {
    IPF_V4_FRAG_SIZE_LBOUND = 400,
    IPF_V4_FRAG_SIZE_MIN_DEF = 1200,
    IPF_V6_FRAG_SIZE_LBOUND = 400, /* Useful for testing. */
    IPF_V6_FRAG_SIZE_MIN_DEF = 1280,
    IPF_MAX_FRAGS_DEFAULT = 1000,
    IPF_NFRAG_UBOUND = 5000,
};

enum ipf_counter_type {
    IPF_NFRAGS_ACCEPTED,
    IPF_NFRAGS_COMPL_SENT,
    IPF_NFRAGS_EXPD_SENT,
    IPF_NFRAGS_TOO_SMALL,
    IPF_NFRAGS_OVERLAP,
    IPF_NFRAGS_PURGED,
    IPF_NFRAGS_NUM_CNTS,
};
/* Represents a single fragment; part of a list of fragments. */
struct ipf_frag {
    struct dp_packet *pkt;
    uint16_t start_data_byte;
    uint16_t end_data_byte;
    bool dnsteal; /* 'do not steal': if true, ipf should not free packet. */
};
/* The key for a collection of fragments potentially making up an
 * unfragmented packet. */
struct ipf_list_key {
    /* ipf_list_key_hash() requires 'src_addr' and 'dst_addr' to be the first
     * two members. */
    union ipf_addr src_addr;
    union ipf_addr dst_addr;
    uint32_t recirc_id;
    ovs_be32 ip_id;   /* V6 is 32 bits. */
    ovs_be16 dl_type;
    uint16_t zone;
    uint8_t nw_proto;
};
/* A collection of fragments potentially making up an unfragmented packet. */
struct ipf_list {
    struct hmap_node node;         /* In struct ipf's 'frag_lists'. */
    struct ovs_list list_node;     /* In struct ipf's 'frag_exp_list' or
                                    * 'frag_complete_list'. */
    struct ipf_frag *frag_list;    /* List of fragments for this list. */
    struct ipf_list_key key;       /* The key for the fragment list. */
    struct dp_packet *reass_execute_ctx; /* Reassembled packet. */
    long long expiration;          /* In milliseconds. */
    int last_sent_idx;             /* Last sent fragment idx. */
    int last_inuse_idx;            /* Last in-use fragment idx. */
    int size;                      /* Fragment list size. */
    uint8_t state;                 /* Frag list state; see ipf_list_state. */
};
/* Represents a reassembled packet which typically is passed through
 * conntrack. */
struct reassembled_pkt {
    struct ovs_list rp_list_node;  /* In struct ipf's
                                    * 'reassembled_pkt_list'. */
    struct dp_packet *pkt;
    struct ipf_list *list;
};
struct ipf {
    /* The clean thread is used to clean up fragments in the 'ipf'
     * module if packet batches are no longer sent through its user. */
    pthread_t ipf_clean_thread;
    struct latch ipf_clean_thread_exit;

    int max_v4_frag_list_size;

    struct ovs_mutex ipf_lock; /* Protects all of the following. */
    /* These contain 'struct ipf_list's. */
    struct hmap frag_lists OVS_GUARDED;
    struct ovs_list frag_exp_list OVS_GUARDED;
    struct ovs_list frag_complete_list OVS_GUARDED;
    /* Contains 'struct reassembled_pkt's. */
    struct ovs_list reassembled_pkt_list OVS_GUARDED;

    /* Used to allow disabling fragmentation reassembly. */
    atomic_bool ifp_v4_enabled;
    atomic_bool ifp_v6_enabled;

    /* Will be clamped above 400 bytes; the value chosen should handle
     * alg control packets of interest that use string encoding of mutable
     * IP fields; meaning, the control packets should not be fragmented. */
    atomic_uint min_v4_frag_size;
    atomic_uint min_v6_frag_size;

    /* Configurable maximum allowable fragments in process. */
    atomic_uint nfrag_max;

    /* Number of fragments in process. */
    atomic_count nfrag;

    atomic_uint64_t n4frag_cnt[IPF_NFRAGS_NUM_CNTS];
    atomic_uint64_t n6frag_cnt[IPF_NFRAGS_NUM_CNTS];
};
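/* Typical usage from a conntrack-like caller, sketched for orientation only
 * (assumes the declarations in ipf.h; not a drop-in snippet):
 *
 *     struct ipf *ipf = ipf_init();
 *     ...
 *     // Before conntrack processes a batch:
 *     ipf_preprocess_conntrack(ipf, batch, now, dl_type, zone, hash_basis);
 *     // conntrack processes 'batch', including any reassembled packets.
 *     ...
 *     // After conntrack:
 *     ipf_postprocess_conntrack(ipf, batch, now, dl_type);
 *     ...
 *     ipf_destroy(ipf);
 */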
static void
ipf_print_reass_packet(const char *es, const void *pkt)
{
    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
    if (!VLOG_DROP_WARN(&rl)) {
        struct ds ds = DS_EMPTY_INITIALIZER;
        ds_put_hex_dump(&ds, pkt, 128, 0, false);
        VLOG_WARN("%s\n%s", es, ds_cstr(&ds));
        ds_destroy(&ds);
    }
}
static void
ipf_count(struct ipf *ipf, bool v6, enum ipf_counter_type cntr)
{
    atomic_count_inc64(v6 ? &ipf->n6frag_cnt[cntr] : &ipf->n4frag_cnt[cntr]);
}
static bool
ipf_get_v4_enabled(struct ipf *ipf)
{
    bool ifp_v4_enabled_;
    atomic_read_relaxed(&ipf->ifp_v4_enabled, &ifp_v4_enabled_);
    return ifp_v4_enabled_;
}

static bool
ipf_get_v6_enabled(struct ipf *ipf)
{
    bool ifp_v6_enabled_;
    atomic_read_relaxed(&ipf->ifp_v6_enabled, &ifp_v6_enabled_);
    return ifp_v6_enabled_;
}

static bool
ipf_get_enabled(struct ipf *ipf)
{
    return ipf_get_v4_enabled(ipf) || ipf_get_v6_enabled(ipf);
}
static uint32_t
ipf_addr_hash_add(uint32_t hash, const union ipf_addr *addr)
{
    BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
    return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
}
/* Adds a list of fragments to the list tracking expiry of yet to be
 * completed reassembled packets, hence subject to expiry. */
static void
ipf_expiry_list_add(struct ovs_list *frag_exp_list, struct ipf_list *ipf_list,
                    long long now)
    /* OVS_REQUIRES(ipf->ipf_lock) */
{
    enum {
        IPF_FRAG_LIST_TIMEOUT = 15000,
    };

    ipf_list->expiration = now + IPF_FRAG_LIST_TIMEOUT;
    ovs_list_push_back(frag_exp_list, &ipf_list->list_node);
}
/* Adds a list of fragments to the list of completed packets, which will be
 * subsequently transmitted. */
static void
ipf_completed_list_add(struct ovs_list *frag_complete_list,
                       struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    ovs_list_push_back(frag_complete_list, &ipf_list->list_node);
}
/* Adds a reassembled packet to the list of reassembled packets, awaiting some
 * processing, such as being sent through conntrack. */
static void
ipf_reassembled_list_add(struct ovs_list *reassembled_pkt_list,
                         struct reassembled_pkt *rp)
    /* OVS_REQUIRES(ipf_lock) */
{
    ovs_list_push_back(reassembled_pkt_list, &rp->rp_list_node);
}
/* Removes a frag list from the tracking data structures and frees list heap
 * memory. */
static void
ipf_list_clean(struct hmap *frag_lists,
               struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    ovs_list_remove(&ipf_list->list_node);
    hmap_remove(frag_lists, &ipf_list->node);
    free(ipf_list->frag_list);
    free(ipf_list);
}
/* Removes a frag list sitting on the expiry list from the tracking
 * data structures and frees list heap memory. */
static void
ipf_expiry_list_clean(struct hmap *frag_lists,
                      struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    ipf_list_clean(frag_lists, ipf_list);
}

/* Removes a frag list sitting on the completed list from the tracking
 * data structures and frees list heap memory. */
static void
ipf_completed_list_clean(struct hmap *frag_lists,
                         struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    ipf_list_clean(frag_lists, ipf_list);
}
static void
ipf_expiry_list_remove(struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    ovs_list_remove(&ipf_list->list_node);
}

static void
ipf_reassembled_list_remove(struct reassembled_pkt *rp)
    /* OVS_REQUIRES(ipf_lock) */
{
    ovs_list_remove(&rp->rp_list_node);
}
static uint32_t
ipf_list_key_hash(const struct ipf_list_key *key, uint32_t basis)
{
    uint32_t hsrc, hdst, hash;
    hsrc = hdst = basis;
    hsrc = ipf_addr_hash_add(hsrc, &key->src_addr);
    hdst = ipf_addr_hash_add(hdst, &key->dst_addr);
    hash = hsrc ^ hdst;

    /* Hash the rest of the key. */
    return hash_words((uint32_t *) (&key->dst_addr + 1),
                      (uint32_t *) (key + 1) -
                      (uint32_t *) (&key->dst_addr + 1),
                      hash);
}
static bool
ipf_is_first_v4_frag(const struct dp_packet *pkt)
{
    const struct ip_header *l3 = dp_packet_l3(pkt);
    if (!(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) &&
        l3->ip_frag_off & htons(IP_MORE_FRAGMENTS)) {
        return true;
    }
    return false;
}

static bool
ipf_is_last_v4_frag(const struct dp_packet *pkt)
{
    const struct ip_header *l3 = dp_packet_l3(pkt);
    if (l3->ip_frag_off & htons(IP_FRAG_OFF_MASK) &&
        !(l3->ip_frag_off & htons(IP_MORE_FRAGMENTS))) {
        return true;
    }
    return false;
}

static bool
ipf_is_v6_frag(ovs_be16 ip6f_offlg)
{
    if (ip6f_offlg & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) {
        return true;
    }
    return false;
}

static bool
ipf_is_first_v6_frag(ovs_be16 ip6f_offlg)
{
    if (!(ip6f_offlg & IP6F_OFF_MASK) &&
        ip6f_offlg & IP6F_MORE_FRAG) {
        return true;
    }
    return false;
}

static bool
ipf_is_last_v6_frag(ovs_be16 ip6f_offlg)
{
    if ((ip6f_offlg & IP6F_OFF_MASK) &&
        !(ip6f_offlg & IP6F_MORE_FRAG)) {
        return true;
    }
    return false;
}
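/* For reference, the predicates above encode the usual fragment-header
 * semantics: a "first" fragment has a zero fragment offset with the
 * more-fragments flag set, a "last" fragment has a non-zero offset with
 * more-fragments clear, and anything with either field non-zero belongs to
 * some fragmented packet. An unfragmented packet has both the offset field
 * and the more-fragments flag clear, so none of these predicates match. */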
/* Checks for a completed packet collection of fragments. */
static bool
ipf_list_complete(const struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
        if (ipf_list->frag_list[i - 1].end_data_byte + 1
            != ipf_list->frag_list[i].start_data_byte) {
            return false;
        }
    }
    return true;
}
/* Runs O(n) for a sorted or almost sorted list. */
static void
ipf_sort(struct ipf_frag *frag_list, size_t last_idx)
    /* OVS_REQUIRES(ipf_lock) */
{
    for (int li = 1; li <= last_idx; li++) {
        struct ipf_frag ipf_frag = frag_list[li];
        int ci = li - 1;
        while (ci >= 0 &&
               frag_list[ci].start_data_byte > ipf_frag.start_data_byte) {
            frag_list[ci + 1] = frag_list[ci];
            ci--;
        }
        frag_list[ci + 1] = ipf_frag;
    }
}
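/* ipf_sort() above is a straight insertion sort keyed on 'start_data_byte'.
 * For example, starting from start bytes {1480, 0, 2960} it shifts 1480
 * right to place 0, then leaves 2960 where it is, yielding {0, 1480, 2960};
 * on an already sorted list the inner loop never iterates, which is the
 * O(n) case noted above. */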
/* Called on a sorted complete list of v4 fragments to reassemble them into
 * a single packet that can be processed, such as passing through conntrack.
 */
static struct dp_packet *
ipf_reassemble_v4_frags(struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    struct ipf_frag *frag_list = ipf_list->frag_list;
    struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
    dp_packet_set_size(pkt, dp_packet_size(pkt) - dp_packet_l2_pad_size(pkt));
    struct ip_header *l3 = dp_packet_l3(pkt);
    int len = ntohs(l3->ip_tot_len);

    int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
                   frag_list[1].start_data_byte + 1;

    if (len + rest_len > IPV4_PACKET_MAX_SIZE) {
        ipf_print_reass_packet(
            "Unsupported big reassembled v4 packet; v4 hdr:", l3);
        dp_packet_delete(pkt);
        return NULL;
    }

    dp_packet_prealloc_tailroom(pkt, rest_len);

    for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
        size_t add_len = frag_list[i].end_data_byte -
                         frag_list[i].start_data_byte + 1;
        const char *l4 = dp_packet_l4(frag_list[i].pkt);
        dp_packet_put(pkt, l4, add_len);
    }

    len += rest_len;
    l3 = dp_packet_l3(pkt);
    ovs_be16 new_ip_frag_off = l3->ip_frag_off & ~htons(IP_MORE_FRAGMENTS);
    l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_frag_off,
                                new_ip_frag_off);
    l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_tot_len, htons(len));
    l3->ip_tot_len = htons(len);
    l3->ip_frag_off = new_ip_frag_off;
    dp_packet_set_l2_pad_size(pkt, 0);

    return pkt;
}
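/* Note on the checksum handling above: rather than recomputing the IPv4
 * header checksum from scratch, the code patches it incrementally with
 * recalc_csum16() for the two header fields that change, 'ip_frag_off'
 * (more-fragments bit cleared) and 'ip_tot_len' (grown by 'rest_len').
 * A rough sketch of the equivalent from-scratch alternative would be:
 *
 *     l3->ip_csum = 0;
 *     l3->ip_csum = csum(l3, IP_IHL(l3->ip_ihl_ver) * 4);
 *
 * The incremental form avoids re-reading the rest of the header. */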
/* Called on a sorted complete list of v6 fragments to reassemble them into
 * a single packet that can be processed, such as passing through conntrack.
 */
static struct dp_packet *
ipf_reassemble_v6_frags(struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    struct ipf_frag *frag_list = ipf_list->frag_list;
    struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
    dp_packet_set_size(pkt, dp_packet_size(pkt) - dp_packet_l2_pad_size(pkt));
    struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
    int pl = ntohs(l3->ip6_plen) - sizeof(struct ovs_16aligned_ip6_frag);

    int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
                   frag_list[1].start_data_byte + 1;

    if (pl + rest_len > IPV6_PACKET_MAX_DATA) {
        ipf_print_reass_packet(
            "Unsupported big reassembled v6 packet; v6 hdr:", l3);
        dp_packet_delete(pkt);
        return NULL;
    }

    dp_packet_prealloc_tailroom(pkt, rest_len);

    for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
        size_t add_len = frag_list[i].end_data_byte -
                         frag_list[i].start_data_byte + 1;
        const char *l4 = dp_packet_l4(frag_list[i].pkt);
        dp_packet_put(pkt, l4, add_len);
    }

    pl += rest_len;
    l3 = dp_packet_l3(pkt);

    uint8_t nw_proto = l3->ip6_nxt;
    uint8_t nw_frag = 0;
    const void *data = l3 + 1;
    size_t datasize = pl;

    const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
    if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr)
        || !nw_frag || !frag_hdr) {
        ipf_print_reass_packet("Unparsed reassembled v6 packet; v6 hdr:", l3);
        dp_packet_delete(pkt);
        return NULL;
    }

    struct ovs_16aligned_ip6_frag *fh =
        CONST_CAST(struct ovs_16aligned_ip6_frag *, frag_hdr);
    fh->ip6f_offlg = 0;
    l3->ip6_plen = htons(pl);
    l3->ip6_ctlun.ip6_un1.ip6_un1_nxt = nw_proto;
    dp_packet_set_l2_pad_size(pkt, 0);

    return pkt;
}
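/* Unlike IPv4, IPv6 has no header checksum to patch: reassembly above only
 * rewrites 'ip6_plen' (payload grown by 'rest_len', with the fragment
 * extension header already subtracted when 'pl' was computed) and restores
 * the next-header value that preceded the fragment extension header. */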
/* Called when a frag list state transitions to another state. This is
 * triggered by a new fragment for the list being received. */
static void
ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list,
                          bool ff, bool lf, bool v6)
    OVS_REQUIRES(ipf->ipf_lock)
{
    enum ipf_list_state curr_state = ipf_list->state;
    enum ipf_list_state next_state;
    switch (curr_state) {
    case IPF_LIST_STATE_UNUSED:
    case IPF_LIST_STATE_OTHER_SEEN:
        if (ff) {
            next_state = IPF_LIST_STATE_FIRST_SEEN;
        } else if (lf) {
            next_state = IPF_LIST_STATE_LAST_SEEN;
        } else {
            next_state = IPF_LIST_STATE_OTHER_SEEN;
        }
        break;
    case IPF_LIST_STATE_FIRST_SEEN:
        if (lf) {
            next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
        } else {
            next_state = IPF_LIST_STATE_FIRST_SEEN;
        }
        break;
    case IPF_LIST_STATE_LAST_SEEN:
        if (ff) {
            next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
        } else {
            next_state = IPF_LIST_STATE_LAST_SEEN;
        }
        break;
    case IPF_LIST_STATE_FIRST_LAST_SEEN:
        next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
        break;
    case IPF_LIST_STATE_COMPLETED:
    case IPF_LIST_STATE_REASS_FAIL:
    case IPF_LIST_STATE_NUM:
    default:
        OVS_NOT_REACHED();
    }

    if (next_state == IPF_LIST_STATE_FIRST_LAST_SEEN) {
        ipf_sort(ipf_list->frag_list, ipf_list->last_inuse_idx);
        if (ipf_list_complete(ipf_list)) {
            struct dp_packet *reass_pkt = v6
                ? ipf_reassemble_v6_frags(ipf_list)
                : ipf_reassemble_v4_frags(ipf_list);
            if (reass_pkt) {
                struct reassembled_pkt *rp = xzalloc(sizeof *rp);
                rp->pkt = reass_pkt;
                rp->list = ipf_list;
                ipf_reassembled_list_add(&ipf->reassembled_pkt_list, rp);
                ipf_expiry_list_remove(ipf_list);
                next_state = IPF_LIST_STATE_COMPLETED;
            } else {
                next_state = IPF_LIST_STATE_REASS_FAIL;
            }
        }
    }
    ipf_list->state = next_state;
}
/* Some sanity checks are redundant, but prudent, in case code paths for
 * fragments change in future. The processing cost for fragments is not
 * significant. */
static bool
ipf_is_valid_v4_frag(struct ipf *ipf, struct dp_packet *pkt)
{
    if (OVS_UNLIKELY(dp_packet_ip_checksum_bad(pkt))) {
        goto invalid_pkt;
    }

    const struct eth_header *l2 = dp_packet_eth(pkt);
    const struct ip_header *l3 = dp_packet_l3(pkt);

    if (OVS_UNLIKELY(!l2 || !l3)) {
        goto invalid_pkt;
    }

    size_t l3_size = dp_packet_l3_size(pkt);
    if (OVS_UNLIKELY(l3_size < IP_HEADER_LEN)) {
        goto invalid_pkt;
    }

    if (!IP_IS_FRAGMENT(l3->ip_frag_off)) {
        return false;
    }

    uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
    if (OVS_UNLIKELY(ip_tot_len != l3_size)) {
        goto invalid_pkt;
    }

    size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
    if (OVS_UNLIKELY(ip_hdr_len < IP_HEADER_LEN)) {
        goto invalid_pkt;
    }
    if (OVS_UNLIKELY(l3_size < ip_hdr_len)) {
        goto invalid_pkt;
    }

    if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(pkt)
                     && csum(l3, ip_hdr_len) != 0)) {
        goto invalid_pkt;
    }

    uint32_t min_v4_frag_size_;
    atomic_read_relaxed(&ipf->min_v4_frag_size, &min_v4_frag_size_);
    bool lf = ipf_is_last_v4_frag(pkt);
    if (OVS_UNLIKELY(!lf && dp_packet_l3_size(pkt) < min_v4_frag_size_)) {
        ipf_count(ipf, false, IPF_NFRAGS_TOO_SMALL);
        goto invalid_pkt;
    }

    return true;

invalid_pkt:
    pkt->md.ct_state = CS_INVALID;
    return false;
}
static void
ipf_v4_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
                   struct ipf_list_key *key, uint16_t *start_data_byte,
                   uint16_t *end_data_byte, bool *ff, bool *lf)
{
    const struct ip_header *l3 = dp_packet_l3(pkt);
    uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
    size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;

    *start_data_byte = ntohs(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) * 8;
    *end_data_byte = *start_data_byte + ip_tot_len - ip_hdr_len - 1;
    *ff = ipf_is_first_v4_frag(pkt);
    *lf = ipf_is_last_v4_frag(pkt);
    memset(key, 0, sizeof *key);
    key->ip_id = be16_to_be32(l3->ip_id);
    key->dl_type = dl_type;
    key->src_addr.ipv4 = get_16aligned_be32(&l3->ip_src);
    key->dst_addr.ipv4 = get_16aligned_be32(&l3->ip_dst);
    key->nw_proto = l3->ip_proto;
    key->zone = zone;
    key->recirc_id = pkt->md.recirc_id;
}
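/* Worked example for the offset math above, assuming a 1500-byte MTU and a
 * 20-byte IPv4 header: each full fragment carries 1480 payload bytes, so
 * the second fragment arrives with a fragment-offset field of 185 units
 * (185 * 8 = 1480). That gives *start_data_byte = 1480 and, with
 * ip_tot_len = 1500 and ip_hdr_len = 20,
 * *end_data_byte = 1480 + 1500 - 20 - 1 = 2959. */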
/* Some sanity checks are redundant, but prudent, in case code paths for
 * fragments change in future. The processing cost for fragments is not
 * significant. */
static bool
ipf_is_valid_v6_frag(struct ipf *ipf, struct dp_packet *pkt)
{
    const struct eth_header *l2 = dp_packet_eth(pkt);
    const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
    const char *l4 = dp_packet_l4(pkt);

    if (OVS_UNLIKELY(!l2 || !l3 || !l4)) {
        goto invalid_pkt;
    }

    size_t l3_size = dp_packet_l3_size(pkt);
    size_t l3_hdr_size = sizeof *l3;

    if (OVS_UNLIKELY(l3_size < l3_hdr_size)) {
        goto invalid_pkt;
    }

    uint8_t nw_frag = 0;
    uint8_t nw_proto = l3->ip6_nxt;
    const void *data = l3 + 1;
    size_t datasize = l3_size - l3_hdr_size;
    const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
    if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag,
                             &frag_hdr) || !nw_frag || !frag_hdr) {
        return false;
    }

    int pl = ntohs(l3->ip6_plen);
    if (OVS_UNLIKELY(pl + l3_hdr_size != l3_size)) {
        goto invalid_pkt;
    }

    ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
    if (OVS_UNLIKELY(!ipf_is_v6_frag(ip6f_offlg))) {
        return false;
    }

    uint32_t min_v6_frag_size_;
    atomic_read_relaxed(&ipf->min_v6_frag_size, &min_v6_frag_size_);
    bool lf = ipf_is_last_v6_frag(ip6f_offlg);

    if (OVS_UNLIKELY(!lf && dp_packet_l3_size(pkt) < min_v6_frag_size_)) {
        ipf_count(ipf, true, IPF_NFRAGS_TOO_SMALL);
        goto invalid_pkt;
    }

    return true;

invalid_pkt:
    pkt->md.ct_state = CS_INVALID;
    return false;
}
static void
ipf_v6_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
                   struct ipf_list_key *key, uint16_t *start_data_byte,
                   uint16_t *end_data_byte, bool *ff, bool *lf)
{
    const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
    uint8_t nw_frag = 0;
    uint8_t nw_proto = l3->ip6_nxt;
    const void *data = l3 + 1;
    size_t datasize = dp_packet_l3_size(pkt) - sizeof *l3;
    const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;

    parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr);
    ovs_assert(nw_frag && frag_hdr);
    ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
    *start_data_byte = ntohs(ip6f_offlg & IP6F_OFF_MASK) +
        sizeof (struct ovs_16aligned_ip6_frag);
    *end_data_byte = *start_data_byte + dp_packet_l4_size(pkt) - 1;
    *ff = ipf_is_first_v6_frag(ip6f_offlg);
    *lf = ipf_is_last_v6_frag(ip6f_offlg);
    memset(key, 0, sizeof *key);
    key->ip_id = get_16aligned_be32(&frag_hdr->ip6f_ident);
    key->dl_type = dl_type;
    memcpy(&key->src_addr.ipv6, &l3->ip6_src, sizeof key->src_addr.ipv6);
    /* We are not supporting parsing of the routing header to use as the
     * dst address part of the key. */
    memcpy(&key->dst_addr.ipv6, &l3->ip6_dst, sizeof key->dst_addr.ipv6);
    key->nw_proto = 0;   /* Not used for key for V6. */
    key->zone = zone;
    key->recirc_id = pkt->md.recirc_id;
}
static bool
ipf_list_key_eq(const struct ipf_list_key *key1,
                const struct ipf_list_key *key2)
    /* OVS_REQUIRES(ipf_lock) */
{
    if (!memcmp(&key1->src_addr, &key2->src_addr, sizeof key1->src_addr) &&
        !memcmp(&key1->dst_addr, &key2->dst_addr, sizeof key1->dst_addr) &&
        key1->dl_type == key2->dl_type &&
        key1->ip_id == key2->ip_id &&
        key1->zone == key2->zone &&
        key1->nw_proto == key2->nw_proto &&
        key1->recirc_id == key2->recirc_id) {
        return true;
    }
    return false;
}
static struct ipf_list *
ipf_list_key_lookup(struct ipf *ipf, const struct ipf_list_key *key,
                    uint32_t hash)
    OVS_REQUIRES(ipf->ipf_lock)
{
    struct ipf_list *ipf_list;
    HMAP_FOR_EACH_WITH_HASH (ipf_list, node, hash, &ipf->frag_lists) {
        if (ipf_list_key_eq(&ipf_list->key, key)) {
            return ipf_list;
        }
    }
    return NULL;
}
static bool
ipf_is_frag_duped(const struct ipf_frag *frag_list, int last_inuse_idx,
                  size_t start_data_byte, size_t end_data_byte)
    /* OVS_REQUIRES(ipf_lock) */
{
    for (int i = 0; i <= last_inuse_idx; i++) {
        if ((start_data_byte >= frag_list[i].start_data_byte &&
             start_data_byte <= frag_list[i].end_data_byte) ||
            (end_data_byte >= frag_list[i].start_data_byte &&
             end_data_byte <= frag_list[i].end_data_byte)) {
            return true;
        }
    }
    return false;
}
/* Adds a fragment to a list of fragments, if the fragment is not a
 * duplicate. If the fragment is a duplicate, that fragment is marked
 * invalid to avoid the work that conntrack would do to mark the fragment
 * as invalid, which it will in all cases. */
static bool
ipf_process_frag(struct ipf *ipf, struct ipf_list *ipf_list,
                 struct dp_packet *pkt, uint16_t start_data_byte,
                 uint16_t end_data_byte, bool ff, bool lf, bool v6,
                 bool dnsteal)
    OVS_REQUIRES(ipf->ipf_lock)
{
    bool duped_frag = ipf_is_frag_duped(ipf_list->frag_list,
        ipf_list->last_inuse_idx, start_data_byte, end_data_byte);
    int last_inuse_idx = ipf_list->last_inuse_idx;

    if (!duped_frag) {
        if (last_inuse_idx < ipf_list->size - 1) {
            /* In the case of dpdk, it would be unfortunate if we had
             * to create a clone fragment outside the dpdk mp due to the
             * mempool size being too limited. We will otherwise need to
             * recommend not setting the mempool number of buffers too low
             * and also clamp the number of fragments. */
            struct ipf_frag *frag = &ipf_list->frag_list[last_inuse_idx + 1];
            frag->pkt = pkt;
            frag->start_data_byte = start_data_byte;
            frag->end_data_byte = end_data_byte;
            frag->dnsteal = dnsteal;
            ipf_list->last_inuse_idx++;
            atomic_count_inc(&ipf->nfrag);
            ipf_count(ipf, v6, IPF_NFRAGS_ACCEPTED);
            ipf_list_state_transition(ipf, ipf_list, ff, lf, v6);
        } else {
            OVS_NOT_REACHED();
        }
    } else {
        ipf_count(ipf, v6, IPF_NFRAGS_OVERLAP);
        pkt->md.ct_state = CS_INVALID;
        return false;
    }
    return true;
}
static void
ipf_list_init(struct ipf_list *ipf_list, struct ipf_list_key *key,
              int max_frag_list_size)
{
    ipf_list->key = *key;
    ipf_list->last_inuse_idx = IPF_INVALID_IDX;
    ipf_list->last_sent_idx = IPF_INVALID_IDX;
    ipf_list->reass_execute_ctx = NULL;
    ipf_list->state = IPF_LIST_STATE_UNUSED;
    ipf_list->size = max_frag_list_size;
    ipf_list->frag_list
        = xzalloc(ipf_list->size * sizeof *ipf_list->frag_list);
}
/* Generates a fragment list key from a well formed fragment and either starts
 * a new fragment list or increases the size of the existing fragment list,
 * while checking that the maximum number of supported fragments is not
 * exceeded and that the list size is not impossibly big. Calls
 * 'ipf_process_frag()' to add a fragment to a list of fragments. */
static bool
ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, ovs_be16 dl_type,
                uint16_t zone, long long now, uint32_t hash_basis,
                bool dnsteal)
    OVS_REQUIRES(ipf->ipf_lock)
{
    struct ipf_list_key key;
    /* Initialize 4 variables for some versions of GCC. */
    uint16_t start_data_byte = 0;
    uint16_t end_data_byte = 0;
    bool ff = false;
    bool lf = false;

    bool v6 = dl_type == htons(ETH_TYPE_IPV6);

    if (v6 && ipf_get_v6_enabled(ipf)) {
        ipf_v6_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
                           &end_data_byte, &ff, &lf);
    } else if (!v6 && ipf_get_v4_enabled(ipf)) {
        ipf_v4_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
                           &end_data_byte, &ff, &lf);
    } else {
        OVS_NOT_REACHED();
    }

    unsigned int nfrag_max;
    atomic_read_relaxed(&ipf->nfrag_max, &nfrag_max);
    if (atomic_count_get(&ipf->nfrag) >= nfrag_max) {
        return false;
    }

    uint32_t hash = ipf_list_key_hash(&key, hash_basis);
    struct ipf_list *ipf_list = ipf_list_key_lookup(ipf, &key, hash);
    enum {
        IPF_FRAG_LIST_MIN_INCREMENT = 4,
        IPF_IPV6_MAX_FRAG_LIST_SIZE = 65535,
    };

    int max_frag_list_size;
    if (v6) {
        /* Because the calculation with extension headers is variable,
         * we don't calculate a hard maximum fragment list size upfront. The
         * fragment list size is practically limited by the code, however. */
        max_frag_list_size = IPF_IPV6_MAX_FRAG_LIST_SIZE;
    } else {
        max_frag_list_size = ipf->max_v4_frag_list_size;
    }

    if (!ipf_list) {
        ipf_list = xmalloc(sizeof *ipf_list);
        ipf_list_init(ipf_list, &key,
                      MIN(max_frag_list_size, IPF_FRAG_LIST_MIN_INCREMENT));
        hmap_insert(&ipf->frag_lists, &ipf_list->node, hash);
        ipf_expiry_list_add(&ipf->frag_exp_list, ipf_list, now);
    } else if (ipf_list->state == IPF_LIST_STATE_REASS_FAIL ||
               ipf_list->state == IPF_LIST_STATE_COMPLETED) {
        /* Bail out as early as possible. */
        return false;
    } else if (ipf_list->last_inuse_idx + 1 >= ipf_list->size) {
        int increment = MIN(IPF_FRAG_LIST_MIN_INCREMENT,
                            max_frag_list_size - ipf_list->size);
        if (increment > 0) {
            ipf_list->frag_list =
                xrealloc(ipf_list->frag_list, (ipf_list->size + increment) *
                         sizeof *ipf_list->frag_list);
            ipf_list->size += increment;
        } else {
            return false;
        }
    }

    return ipf_process_frag(ipf, ipf_list, pkt, start_data_byte,
                            end_data_byte, ff, lf, v6, dnsteal);
}
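/* Sizing note for the growth logic above: a new list starts with room for
 * MIN(max_frag_list_size, IPF_FRAG_LIST_MIN_INCREMENT) fragments, normally
 * 4, and is then grown by at most IPF_FRAG_LIST_MIN_INCREMENT entries at a
 * time, never beyond 'max_frag_list_size' (the v4 bound computed in
 * ipf_init()/ipf_set_min_frag(), or 65535 for v6). */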
/* Filters out fragments from a batch of fragments and adjusts the batch. */
static void
ipf_extract_frags_from_batch(struct ipf *ipf, struct dp_packet_batch *pb,
                             ovs_be16 dl_type, uint16_t zone, long long now,
                             uint32_t hash_basis)
{
    const size_t pb_cnt = dp_packet_batch_size(pb);
    int pb_idx; /* Index in a packet batch. */
    struct dp_packet *pkt;

    DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
        if (OVS_UNLIKELY((dl_type == htons(ETH_TYPE_IP) &&
                          ipf_is_valid_v4_frag(ipf, pkt))
                          ||
                          (dl_type == htons(ETH_TYPE_IPV6) &&
                          ipf_is_valid_v6_frag(ipf, pkt)))) {

            ovs_mutex_lock(&ipf->ipf_lock);
            if (!ipf_handle_frag(ipf, pkt, dl_type, zone, now, hash_basis,
                                 pb->do_not_steal)) {
                dp_packet_batch_refill(pb, pkt, pb_idx);
            }
            ovs_mutex_unlock(&ipf->ipf_lock);
        } else {
            dp_packet_batch_refill(pb, pkt, pb_idx);
        }
    }
}
/* In case of DPDK, a memory source check is done, as DPDK memory pool
 * management has trouble dealing with multiple source types. The
 * check_source parameter is used to indicate when this check is needed. */
static bool
ipf_dp_packet_batch_add(struct dp_packet_batch *pb, struct dp_packet *pkt,
                        bool check_source OVS_UNUSED)
{
#ifdef DPDK_NETDEV
    if ((dp_packet_batch_is_full(pb)) ||
        /* DPDK cannot handle multiple sources in a batch. */
        (check_source && !dp_packet_batch_is_empty(pb)
         && pb->packets[0]->source != pkt->source)) {
#else
    if (dp_packet_batch_is_full(pb)) {
#endif
        return false;
    }

    dp_packet_batch_add(pb, pkt);
    return true;
}
/* This would be used in rare cases where a list cannot be sent. One rare
 * reason known right now is a mempool source check, which exists due to DPDK
 * support, where packets are no longer being received on any port with a
 * source matching the fragment. Another reason is a race where all
 * conntrack rules are unconfigured when some fragments are yet to be
 * flushed.
 *
 * Returns true if the list was purged. */
static bool
ipf_purge_list_check(struct ipf *ipf, struct ipf_list *ipf_list,
                     long long now)
    OVS_REQUIRES(ipf->ipf_lock)
{
    enum {
        IPF_FRAG_LIST_PURGE_TIME_ADJ = 10000
    };

    if (now < ipf_list->expiration + IPF_FRAG_LIST_PURGE_TIME_ADJ) {
        return false;
    }

    while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
        struct dp_packet * pkt
            = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
        dp_packet_delete(pkt);
        atomic_count_dec(&ipf->nfrag);
        COVERAGE_INC(ipf_stuck_frag_list_purged);
        ipf_count(ipf, ipf_list->key.dl_type == htons(ETH_TYPE_IPV6),
                  IPF_NFRAGS_PURGED);
        ipf_list->last_sent_idx++;
    }

    return true;
}
/* Does the packet batch management and common accounting work associated
 * with 'ipf_send_completed_frags()' and 'ipf_send_expired_frags()'. */
static bool
ipf_send_frags_in_list(struct ipf *ipf, struct ipf_list *ipf_list,
                       struct dp_packet_batch *pb,
                       enum ipf_list_type list_type, bool v6, long long now)
    OVS_REQUIRES(ipf->ipf_lock)
{
    if (ipf_purge_list_check(ipf, ipf_list, now)) {
        return true;
    }

    while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
        struct dp_packet *pkt
            = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
        if (ipf_dp_packet_batch_add(pb, pkt, true)) {
            ipf_list->last_sent_idx++;
            atomic_count_dec(&ipf->nfrag);

            if (list_type == IPF_FRAG_COMPLETED_LIST) {
                ipf_count(ipf, v6, IPF_NFRAGS_COMPL_SENT);
            } else {
                ipf_count(ipf, v6, IPF_NFRAGS_EXPD_SENT);
                pkt->md.ct_state = CS_INVALID;
            }

            if (ipf_list->last_sent_idx == ipf_list->last_inuse_idx) {
                return true;
            }
        } else {
            return false;
        }
    }
    OVS_NOT_REACHED();
}
/* Adds fragments associated with a completed fragment list to a packet batch
 * to be processed by the calling application, typically conntrack. Also
 * cleans up the list context when it is empty. */
static void
ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb,
                         long long now, bool v6)
{
    if (ovs_list_is_empty(&ipf->frag_complete_list)) {
        return;
    }

    ovs_mutex_lock(&ipf->ipf_lock);
    struct ipf_list *ipf_list, *next;

    LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_complete_list) {
        if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_COMPLETED_LIST,
                                   v6, now)) {
            ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
        } else {
            break;
        }
    }

    ovs_mutex_unlock(&ipf->ipf_lock);
}
/* Conservatively adds fragments associated with an expired fragment list to
 * a packet batch to be processed by the calling application, typically
 * conntrack. Also cleans up the list context when it is empty. */
static void
ipf_send_expired_frags(struct ipf *ipf, struct dp_packet_batch *pb,
                       long long now, bool v6)
{
    enum {
        /* Very conservative, due to DOS probability. */
        IPF_FRAG_LIST_MAX_EXPIRED = 1,
    };

    if (ovs_list_is_empty(&ipf->frag_exp_list)) {
        return;
    }

    ovs_mutex_lock(&ipf->ipf_lock);
    struct ipf_list *ipf_list, *next;
    size_t lists_removed = 0;

    LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_exp_list) {
        if (now <= ipf_list->expiration ||
            lists_removed >= IPF_FRAG_LIST_MAX_EXPIRED) {
            break;
        }

        if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_EXPIRY_LIST,
                                   v6, now)) {
            ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
            lists_removed++;
        } else {
            break;
        }
    }

    ovs_mutex_unlock(&ipf->ipf_lock);
}
/* Adds a reassembled packet to a packet batch to be processed by the caller.
 */
static void
ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb)
{
    if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
        return;
    }

    ovs_mutex_lock(&ipf->ipf_lock);
    struct reassembled_pkt *rp, *next;

    LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
        if (!rp->list->reass_execute_ctx &&
            ipf_dp_packet_batch_add(pb, rp->pkt, false)) {
            rp->list->reass_execute_ctx = rp->pkt;
        }
    }

    ovs_mutex_unlock(&ipf->ipf_lock);
}
/* Checks for reassembled packets post processing by conntrack and edits the
 * fragments if needed based on what conntrack decided. */
static void
ipf_post_execute_reass_pkts(struct ipf *ipf,
                            struct dp_packet_batch *pb, bool v6)
{
    if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
        return;
    }

    ovs_mutex_lock(&ipf->ipf_lock);
    struct reassembled_pkt *rp, *next;

    LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
        const size_t pb_cnt = dp_packet_batch_size(pb);
        int pb_idx;
        struct dp_packet *pkt;
        /* Inner batch loop is constant time since batch size is <=
         * NETDEV_MAX_BURST. */
        DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
            if (pkt == rp->list->reass_execute_ctx) {
                /* Copy the conntrack verdict and metadata from the
                 * reassembled packet back onto each stored fragment. */
                for (int i = 0; i <= rp->list->last_inuse_idx; i++) {
                    rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label;
                    rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark;
                    rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state;
                    rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone;
                    rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 =
                        pkt->md.ct_orig_tuple_ipv6;
                    if (pkt->md.ct_orig_tuple_ipv6) {
                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 =
                            pkt->md.ct_orig_tuple.ipv6;
                    } else {
                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4 =
                            pkt->md.ct_orig_tuple.ipv4;
                    }
                }

                const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
                void *l4_frag = dp_packet_l4(frag_0->pkt);
                void *l4_reass = dp_packet_l4(pkt);
                memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt));

                if (v6) {
                    struct ovs_16aligned_ip6_hdr *l3_frag
                        = dp_packet_l3(frag_0->pkt);
                    struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt);
                    l3_frag->ip6_src = l3_reass->ip6_src;
                    l3_frag->ip6_dst = l3_reass->ip6_dst;
                } else {
                    struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt);
                    struct ip_header *l3_reass = dp_packet_l3(pkt);
                    ovs_be32 reass_ip = get_16aligned_be32(&l3_reass->ip_src);
                    ovs_be32 frag_ip = get_16aligned_be32(&l3_frag->ip_src);
                    l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
                                                     frag_ip, reass_ip);
                    l3_frag->ip_src = l3_reass->ip_src;

                    reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
                    frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
                    l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
                                                     frag_ip, reass_ip);
                    l3_frag->ip_dst = l3_reass->ip_dst;
                }

                ipf_completed_list_add(&ipf->frag_complete_list, rp->list);
                ipf_reassembled_list_remove(rp);
                dp_packet_delete(rp->pkt);
                free(rp);
            } else {
                dp_packet_batch_refill(pb, pkt, pb_idx);
            }
        }
    }

    ovs_mutex_unlock(&ipf->ipf_lock);
}
/* Extracts any fragments from the batch and reassembles them when a
 * complete packet is received. Completed packets are attempted to
 * be added to the batch to be sent through conntrack. */
void
ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
                         long long now, ovs_be16 dl_type, uint16_t zone,
                         uint32_t hash_basis)
{
    if (ipf_get_enabled(ipf)) {
        ipf_extract_frags_from_batch(ipf, pb, dl_type, zone, now, hash_basis);
    }

    if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
        ipf_execute_reass_pkts(ipf, pb);
    }
}
/* Updates fragments based on the processing of the reassembled packet sent
 * through conntrack and adds these fragments to any batches seen. Expired
 * fragments are marked as invalid and also added to the batches seen
 * with low priority. Reassembled packets are freed. */
void
ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
                          long long now, ovs_be16 dl_type)
{
    if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
        bool v6 = dl_type == htons(ETH_TYPE_IPV6);
        ipf_post_execute_reass_pkts(ipf, pb, v6);
        ipf_send_completed_frags(ipf, pb, now, v6);
        ipf_send_expired_frags(ipf, pb, now, v6);
    }
}
static void *
ipf_clean_thread_main(void *f)
{
    struct ipf *ipf = f;

    enum {
        IPF_FRAG_LIST_CLEAN_TIMEOUT = 60000,
    };

    while (!latch_is_set(&ipf->ipf_clean_thread_exit)) {

        long long now = time_msec();

        if (!ovs_list_is_empty(&ipf->frag_exp_list) ||
            !ovs_list_is_empty(&ipf->frag_complete_list)) {

            ovs_mutex_lock(&ipf->ipf_lock);

            struct ipf_list *ipf_list, *next;
            LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
                                &ipf->frag_exp_list) {
                if (ipf_purge_list_check(ipf, ipf_list, now)) {
                    ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
                }
            }

            LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
                                &ipf->frag_complete_list) {
                if (ipf_purge_list_check(ipf, ipf_list, now)) {
                    ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
                }
            }

            ovs_mutex_unlock(&ipf->ipf_lock);
        }

        poll_timer_wait_until(now + IPF_FRAG_LIST_CLEAN_TIMEOUT);
        latch_wait(&ipf->ipf_clean_thread_exit);
        poll_block();
    }

    return NULL;
}
struct ipf *
ipf_init(void)
{
    struct ipf *ipf = xzalloc(sizeof *ipf);

    ovs_mutex_init_adaptive(&ipf->ipf_lock);
    ovs_mutex_lock(&ipf->ipf_lock);
    hmap_init(&ipf->frag_lists);
    ovs_list_init(&ipf->frag_exp_list);
    ovs_list_init(&ipf->frag_complete_list);
    ovs_list_init(&ipf->reassembled_pkt_list);
    atomic_init(&ipf->min_v4_frag_size, IPF_V4_FRAG_SIZE_MIN_DEF);
    atomic_init(&ipf->min_v6_frag_size, IPF_V6_FRAG_SIZE_MIN_DEF);
    ipf->max_v4_frag_list_size = DIV_ROUND_UP(
        IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
        ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
    ovs_mutex_unlock(&ipf->ipf_lock);
    atomic_count_init(&ipf->nfrag, 0);
    for (size_t i = 0; i < IPF_NFRAGS_NUM_CNTS; i++) {
        atomic_init(&ipf->n4frag_cnt[i], 0);
        atomic_init(&ipf->n6frag_cnt[i], 0);
    }
    atomic_init(&ipf->nfrag_max, IPF_MAX_FRAGS_DEFAULT);
    atomic_init(&ipf->ifp_v4_enabled, true);
    atomic_init(&ipf->ifp_v6_enabled, true);
    latch_init(&ipf->ipf_clean_thread_exit);
    ipf->ipf_clean_thread = ovs_thread_create("ipf_clean",
                                              ipf_clean_thread_main, ipf);

    return ipf;
}
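/* With the defaults set above, the IPv4 fragment-list bound is
 * DIV_ROUND_UP(65535 - 60, 1200 - 60) = DIV_ROUND_UP(65475, 1140) = 58,
 * i.e. at most 58 fragments are tracked for one IPv4 packet. The bound is
 * recomputed in ipf_set_min_frag() when the minimum fragment size changes. */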
void
ipf_destroy(struct ipf *ipf)
{
    ovs_mutex_lock(&ipf->ipf_lock);
    latch_set(&ipf->ipf_clean_thread_exit);
    pthread_join(ipf->ipf_clean_thread, NULL);
    latch_destroy(&ipf->ipf_clean_thread_exit);

    struct ipf_list *ipf_list;
    HMAP_FOR_EACH_POP (ipf_list, node, &ipf->frag_lists) {
        while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
            struct dp_packet *pkt
                = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
            if (!ipf_list->frag_list[ipf_list->last_sent_idx + 1].dnsteal) {
                dp_packet_delete(pkt);
            }
            atomic_count_dec(&ipf->nfrag);
            ipf_list->last_sent_idx++;
        }
        free(ipf_list->frag_list);
        free(ipf_list);
    }

    if (atomic_count_get(&ipf->nfrag)) {
        VLOG_WARN("ipf destroy with non-zero fragment count. ");
    }

    struct reassembled_pkt *rp;
    LIST_FOR_EACH_POP (rp, rp_list_node, &ipf->reassembled_pkt_list) {
        dp_packet_delete(rp->pkt);
        free(rp);
    }

    hmap_destroy(&ipf->frag_lists);
    ovs_list_poison(&ipf->frag_exp_list);
    ovs_list_poison(&ipf->frag_complete_list);
    ovs_list_poison(&ipf->reassembled_pkt_list);
    ovs_mutex_unlock(&ipf->ipf_lock);
    ovs_mutex_destroy(&ipf->ipf_lock);
    free(ipf);
}
void
ipf_set_enabled(struct ipf *ipf, bool v6, bool enable)
{
    atomic_store_relaxed(v6 ? &ipf->ifp_v6_enabled : &ipf->ifp_v4_enabled,
                         enable);
}
bool
ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value)
{
    /* If the user specifies an unreasonably large number, fragmentation
     * will not work well but it will not blow up. */
    if (value < (v6 ? IPF_V6_FRAG_SIZE_LBOUND : IPF_V4_FRAG_SIZE_LBOUND)) {
        return false;
    }

    ovs_mutex_lock(&ipf->ipf_lock);
    if (v6) {
        atomic_store_relaxed(&ipf->min_v6_frag_size, value);
    } else {
        atomic_store_relaxed(&ipf->min_v4_frag_size, value);
        ipf->max_v4_frag_list_size = DIV_ROUND_UP(
            IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
            ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
    }
    ovs_mutex_unlock(&ipf->ipf_lock);
    return true;
}
bool
ipf_set_max_nfrags(struct ipf *ipf, uint32_t value)
{
    if (value > IPF_NFRAG_UBOUND) {
        return false;
    }
    atomic_store_relaxed(&ipf->nfrag_max, value);
    return true;
}
void
ipf_get_status(struct ipf *ipf, struct ipf_status *ipf_status)
{
    ipf_status->nfrag = atomic_count_get(&ipf->nfrag);
    atomic_read_relaxed(&ipf->nfrag_max, &ipf_status->nfrag_max);

    atomic_read_relaxed(&ipf->ifp_v4_enabled, &ipf_status->v4.enabled);
    atomic_read_relaxed(&ipf->min_v4_frag_size,
                        &ipf_status->v4.min_frag_size);
    atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_ACCEPTED],
                        &ipf_status->v4.nfrag_accepted);
    atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_COMPL_SENT],
                        &ipf_status->v4.nfrag_completed_sent);
    atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_EXPD_SENT],
                        &ipf_status->v4.nfrag_expired_sent);
    atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_TOO_SMALL],
                        &ipf_status->v4.nfrag_too_small);
    atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_OVERLAP],
                        &ipf_status->v4.nfrag_overlap);
    atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_PURGED],
                        &ipf_status->v4.nfrag_purged);

    atomic_read_relaxed(&ipf->ifp_v6_enabled, &ipf_status->v6.enabled);
    atomic_read_relaxed(&ipf->min_v6_frag_size,
                        &ipf_status->v6.min_frag_size);
    atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_ACCEPTED],
                        &ipf_status->v6.nfrag_accepted);
    atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_COMPL_SENT],
                        &ipf_status->v6.nfrag_completed_sent);
    atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_EXPD_SENT],
                        &ipf_status->v6.nfrag_expired_sent);
    atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_TOO_SMALL],
                        &ipf_status->v6.nfrag_too_small);
    atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_OVERLAP],
                        &ipf_status->v6.nfrag_overlap);
    atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_PURGED],
                        &ipf_status->v6.nfrag_purged);
}
struct ipf_dump_ctx {
    struct hmap_position bucket_pos;
};

/* Allocates an 'ipf_dump_ctx' to keep track of an hmap position. The
 * caller must call ipf_dump_done() when dumping is finished. */
int
ipf_dump_start(struct ipf_dump_ctx **ipf_dump_ctx)
{
    *ipf_dump_ctx = xzalloc(sizeof **ipf_dump_ctx);
    return 0;
}
/* Creates a string representation of the state of an 'ipf_list' and puts
 * it in 'ds'. */
static void
ipf_dump_create(const struct ipf_list *ipf_list, struct ds *ds)
{
    ds_put_cstr(ds, "(");
    if (ipf_list->key.dl_type == htons(ETH_TYPE_IP)) {
        ds_put_format(ds, "src="IP_FMT",dst="IP_FMT",",
                      IP_ARGS(ipf_list->key.src_addr.ipv4),
                      IP_ARGS(ipf_list->key.dst_addr.ipv4));
    } else {
        ds_put_cstr(ds, "src=");
        ipv6_format_addr(&ipf_list->key.src_addr.ipv6, ds);
        ds_put_cstr(ds, ",dst=");
        ipv6_format_addr(&ipf_list->key.dst_addr.ipv6, ds);
        ds_put_cstr(ds, ",");
    }

    ds_put_format(ds, "recirc_id=%u,ip_id=%u,dl_type=0x%x,zone=%u,nw_proto=%u",
                  ipf_list->key.recirc_id, ntohl(ipf_list->key.ip_id),
                  ntohs(ipf_list->key.dl_type), ipf_list->key.zone,
                  ipf_list->key.nw_proto);

    ds_put_format(ds, ",num_fragments=%u,state=%s",
                  ipf_list->last_inuse_idx + 1,
                  ipf_state_name[ipf_list->state]);

    ds_put_cstr(ds, ")");
}
/* Finds the next ipf list starting from 'ipf_dump_ctx->bucket_pos' and uses
 * ipf_dump_create() to create a string representation of the state of that
 * ipf list, which 'dump' is pointed at. Returns EOF when there are no
 * more ipf lists. */
int
ipf_dump_next(struct ipf *ipf, struct ipf_dump_ctx *ipf_dump_ctx, char **dump)
{
    ovs_mutex_lock(&ipf->ipf_lock);

    struct hmap_node *node = hmap_at_position(&ipf->frag_lists,
                                              &ipf_dump_ctx->bucket_pos);
    if (!node) {
        ovs_mutex_unlock(&ipf->ipf_lock);
        return EOF;
    } else {
        struct ipf_list *ipf_list_;
        INIT_CONTAINER(ipf_list_, node, node);
        struct ipf_list ipf_list = *ipf_list_;
        ovs_mutex_unlock(&ipf->ipf_lock);
        struct ds ds = DS_EMPTY_INITIALIZER;
        ipf_dump_create(&ipf_list, &ds);
        *dump = ds_steal_cstr(&ds);
        return 0;
    }
}
/* Frees 'ipf_dump_ctx' allocated by ipf_dump_start(). */
int
ipf_dump_done(struct ipf_dump_ctx *ipf_dump_ctx)
{
    free(ipf_dump_ctx);
    return 0;
}