/*
 * Copyright (c) 2019 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <config.h>
#include <ctype.h>
#include <errno.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <string.h>

#include "coverage.h"
#include "csum.h"
#include "ipf.h"
#include "latch.h"
#include "openvswitch/hmap.h"
#include "openvswitch/poll-loop.h"
#include "openvswitch/types.h"
#include "openvswitch/vlog.h"
#include "ovs-atomic.h"
#include "packets.h"
#include "util.h"
VLOG_DEFINE_THIS_MODULE(ipf);
COVERAGE_DEFINE(ipf_stuck_frag_list_purged);
enum {
    IPV4_PACKET_MAX_HDR_SIZE = 60,
    IPV4_PACKET_MAX_SIZE = 65535,
    IPV6_PACKET_MAX_DATA = 65535,
};
enum ipf_list_state {
    IPF_LIST_STATE_UNUSED,
    IPF_LIST_STATE_REASS_FAIL,
    IPF_LIST_STATE_OTHER_SEEN,
    IPF_LIST_STATE_FIRST_SEEN,
    IPF_LIST_STATE_LAST_SEEN,
    IPF_LIST_STATE_FIRST_LAST_SEEN,
    IPF_LIST_STATE_COMPLETED,
    IPF_LIST_STATE_NUM,
};
static char *ipf_state_name[IPF_LIST_STATE_NUM] =
    {"unused", "reassemble fail", "other frag", "first frag", "last frag",
     "first/last frag", "complete"};
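/* For orientation, a sketch of the transitions driven by
 * ipf_list_state_transition() below, where 'ff'/'lf' flag the arrival of a
 * first/last fragment:
 *
 *     UNUSED, OTHER_SEEN --ff--> FIRST_SEEN --lf--> FIRST_LAST_SEEN
 *     UNUSED, OTHER_SEEN --lf--> LAST_SEEN  --ff--> FIRST_LAST_SEEN
 *     FIRST_LAST_SEEN --sorted, contiguous, reassembled--> COMPLETED
 *     FIRST_LAST_SEEN --reassembly failed--> REASS_FAIL
 *
 * A list that has seen first and last fragments but is still missing middle
 * fragments stays in FIRST_LAST_SEEN. */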
enum ipf_list_type {
    IPF_FRAG_COMPLETED_LIST,
    IPF_FRAG_EXPIRY_LIST,
};
enum {
    IPF_INVALID_IDX = -1,
    IPF_V4_FRAG_SIZE_LBOUND = 400,
    IPF_V4_FRAG_SIZE_MIN_DEF = 1200,
    IPF_V6_FRAG_SIZE_LBOUND = 400, /* Useful for testing. */
    IPF_V6_FRAG_SIZE_MIN_DEF = 1280,
    IPF_MAX_FRAGS_DEFAULT = 1000,
    IPF_NFRAG_UBOUND = 5000,
};
enum ipf_counter_type {
    IPF_NFRAGS_ACCEPTED,
    IPF_NFRAGS_COMPL_SENT,
    IPF_NFRAGS_EXPD_SENT,
    IPF_NFRAGS_TOO_SMALL,
    IPF_NFRAGS_OVERLAP,
    IPF_NFRAGS_PURGED,
    IPF_NFRAGS_NUM_CNTS,
};
union ipf_addr {
    ovs_be32 ipv4;
    struct in6_addr ipv6;
};

/* Represents a single fragment; part of a list of fragments. */
struct ipf_frag {
    struct dp_packet *pkt;
    uint16_t start_data_byte;
    uint16_t end_data_byte;
    bool dnsteal; /* 'do not steal': if true, ipf should not free packet. */
};
/* The key for a collection of fragments potentially making up an unfragmented
 * packet. */
struct ipf_list_key {
    /* ipf_list_key_hash() requires 'src_addr' and 'dst_addr' to be the first
     * two members. */
    union ipf_addr src_addr;
    union ipf_addr dst_addr;
    uint32_t recirc_id;
    ovs_be32 ip_id;   /* V6 is 32 bits. */
    ovs_be16 dl_type;
    uint16_t zone;
    uint8_t nw_proto;
};
/* A collection of fragments potentially making up an unfragmented packet. */
struct ipf_list {
    struct hmap_node node;         /* In struct ipf's 'frag_lists'. */
    struct ovs_list list_node;     /* In struct ipf's 'frag_exp_list' or
                                    * 'frag_complete_list'. */
    struct ipf_frag *frag_list;    /* List of fragments for this list. */
    struct ipf_list_key key;       /* The key for the fragment list. */
    struct dp_packet *reass_execute_ctx; /* Reassembled packet. */
    long long expiration;          /* In milliseconds. */
    int last_sent_idx;             /* Last sent fragment idx. */
    int last_inuse_idx;            /* Last inuse fragment idx. */
    int size;                      /* Fragment list size. */
    uint8_t state;                 /* Frag list state; see ipf_list_state. */
};
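/* Index invariants, relied on by the send and purge loops below: both
 * indexes start out as IPF_INVALID_IDX (-1), 'last_sent_idx' never passes
 * 'last_inuse_idx', and 'last_inuse_idx' always stays below 'size'. */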
/* Represents a reassembled packet which typically is passed through
 * conntrack. */
struct reassembled_pkt {
    struct ovs_list rp_list_node;  /* In struct ipf's
                                    * 'reassembled_pkt_list'. */
    struct dp_packet *pkt;
    struct ipf_list *list;
};
struct ipf {
    /* The clean thread is used to clean up fragments in the 'ipf'
     * module if packet batches are no longer sent through its user. */
    pthread_t ipf_clean_thread;
    struct latch ipf_clean_thread_exit;

    int max_v4_frag_list_size;

    struct ovs_mutex ipf_lock; /* Protects all of the following. */
    /* These contain 'struct ipf_list's. */
    struct hmap frag_lists OVS_GUARDED;
    struct ovs_list frag_exp_list OVS_GUARDED;
    struct ovs_list frag_complete_list OVS_GUARDED;
    /* Contains 'struct reassembled_pkt's. */
    struct ovs_list reassembled_pkt_list OVS_GUARDED;

    /* Used to allow disabling fragmentation reassembly. */
    atomic_bool ifp_v4_enabled;
    atomic_bool ifp_v6_enabled;

    /* Will be clamped above 400 bytes; the value chosen should handle
     * alg control packets of interest that use string encoding of mutable
     * IP fields; meaning, the control packets should not be fragmented. */
    atomic_uint min_v4_frag_size;
    atomic_uint min_v6_frag_size;

    /* Configurable maximum allowable fragments in process. */
    atomic_uint nfrag_max;

    /* Number of fragments in process. */
    atomic_count nfrag;

    atomic_uint64_t n4frag_cnt[IPF_NFRAGS_NUM_CNTS];
    atomic_uint64_t n6frag_cnt[IPF_NFRAGS_NUM_CNTS];
};
static void
ipf_print_reass_packet(const char *es, const void *pkt)
{
    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
    if (!VLOG_DROP_WARN(&rl)) {
        struct ds ds = DS_EMPTY_INITIALIZER;
        ds_put_hex_dump(&ds, pkt, 128, 0, false);
        VLOG_WARN("%s\n%s", es, ds_cstr(&ds));
        ds_destroy(&ds);
    }
}
static void
ipf_count(struct ipf *ipf, bool v6, enum ipf_counter_type cntr)
{
    atomic_count_inc64(v6 ? &ipf->n6frag_cnt[cntr] : &ipf->n4frag_cnt[cntr]);
}
static bool
ipf_get_v4_enabled(struct ipf *ipf)
{
    bool ifp_v4_enabled_;
    atomic_read_relaxed(&ipf->ifp_v4_enabled, &ifp_v4_enabled_);
    return ifp_v4_enabled_;
}
static bool
ipf_get_v6_enabled(struct ipf *ipf)
{
    bool ifp_v6_enabled_;
    atomic_read_relaxed(&ipf->ifp_v6_enabled, &ifp_v6_enabled_);
    return ifp_v6_enabled_;
}
static bool
ipf_get_enabled(struct ipf *ipf)
{
    return ipf_get_v4_enabled(ipf) || ipf_get_v6_enabled(ipf);
}
static uint32_t
ipf_addr_hash_add(uint32_t hash, const union ipf_addr *addr)
{
    BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
    return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
}
/* Adds a list of fragments to the list tracking expiry of yet to be
 * completed reassembled packets, hence subject to expiry. */
static void
ipf_expiry_list_add(struct ovs_list *frag_exp_list, struct ipf_list *ipf_list,
                    long long now)
    /* OVS_REQUIRES(ipf->ipf_lock) */
{
    enum {
        IPF_FRAG_LIST_TIMEOUT = 15000,
    };

    ipf_list->expiration = now + IPF_FRAG_LIST_TIMEOUT;
    ovs_list_push_back(frag_exp_list, &ipf_list->list_node);
}
/* Adds a list of fragments to the list of completed packets, which will be
 * subsequently transmitted. */
static void
ipf_completed_list_add(struct ovs_list *frag_complete_list,
                       struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    ovs_list_push_back(frag_complete_list, &ipf_list->list_node);
}
/* Adds a reassembled packet to the list of reassembled packets, awaiting some
 * processing, such as being sent through conntrack. */
static void
ipf_reassembled_list_add(struct ovs_list *reassembled_pkt_list,
                         struct reassembled_pkt *rp)
    /* OVS_REQUIRES(ipf_lock) */
{
    ovs_list_push_back(reassembled_pkt_list, &rp->rp_list_node);
}
/* Removes a frag list from tracking data structures and frees list heap
 * memory. */
static void
ipf_list_clean(struct hmap *frag_lists,
               struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    ovs_list_remove(&ipf_list->list_node);
    hmap_remove(frag_lists, &ipf_list->node);
    free(ipf_list->frag_list);
    free(ipf_list);
}
/* Removes a frag list sitting on the expiry list from tracking
 * data structures and frees list heap memory. */
static void
ipf_expiry_list_clean(struct hmap *frag_lists,
                      struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    ipf_list_clean(frag_lists, ipf_list);
}
/* Removes a frag list sitting on the completed list from tracking
 * data structures and frees list heap memory. */
static void
ipf_completed_list_clean(struct hmap *frag_lists,
                         struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    ipf_list_clean(frag_lists, ipf_list);
}
static void
ipf_expiry_list_remove(struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    ovs_list_remove(&ipf_list->list_node);
}
static void
ipf_reassembled_list_remove(struct reassembled_pkt *rp)
    /* OVS_REQUIRES(ipf_lock) */
{
    ovs_list_remove(&rp->rp_list_node);
}
static uint32_t
ipf_list_key_hash(const struct ipf_list_key *key, uint32_t basis)
{
    uint32_t hsrc, hdst, hash;
    hsrc = hdst = basis;
    hsrc = ipf_addr_hash_add(hsrc, &key->src_addr);
    hdst = ipf_addr_hash_add(hdst, &key->dst_addr);
    hash = hash_finish(hsrc, hdst);

    /* Hash the rest of the key. */
    return hash_words((uint32_t *) (&key->dst_addr + 1),
                      (uint32_t *) (key + 1) -
                      (uint32_t *) (&key->dst_addr + 1),
                      hash);
}
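/* A note on the arithmetic above: '&key->dst_addr + 1' points just past the
 * two addresses, so the word count passed to hash_words(),
 * '(uint32_t *) (key + 1) - (uint32_t *) (&key->dst_addr + 1)', is the
 * number of 32-bit words in the rest of the key. This is why 'src_addr' and
 * 'dst_addr' must stay the first two members of 'struct ipf_list_key'. */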
static bool
ipf_is_first_v4_frag(const struct dp_packet *pkt)
{
    const struct ip_header *l3 = dp_packet_l3(pkt);
    if (!(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) &&
        l3->ip_frag_off & htons(IP_MORE_FRAGMENTS)) {
        return true;
    }
    return false;
}
static bool
ipf_is_last_v4_frag(const struct dp_packet *pkt)
{
    const struct ip_header *l3 = dp_packet_l3(pkt);
    if (l3->ip_frag_off & htons(IP_FRAG_OFF_MASK) &&
        !(l3->ip_frag_off & htons(IP_MORE_FRAGMENTS))) {
        return true;
    }
    return false;
}
static bool
ipf_is_v6_frag(ovs_be16 ip6f_offlg)
{
    if (ip6f_offlg & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) {
        return true;
    }
    return false;
}
static bool
ipf_is_first_v6_frag(ovs_be16 ip6f_offlg)
{
    if (!(ip6f_offlg & IP6F_OFF_MASK) &&
        ip6f_offlg & IP6F_MORE_FRAG) {
        return true;
    }
    return false;
}
static bool
ipf_is_last_v6_frag(ovs_be16 ip6f_offlg)
{
    if ((ip6f_offlg & IP6F_OFF_MASK) &&
        !(ip6f_offlg & IP6F_MORE_FRAG)) {
        return true;
    }
    return false;
}
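/* Worked example for the v6 flag tests above, with 'ip6f_offlg' in network
 * byte order: offset 0 with the M bit set, htons(0x0001), is a first
 * fragment; a 1480 byte offset (185 eight-byte units) with M clear,
 * htons(185 << 3), is a last fragment; a non-zero offset with M set is a
 * middle fragment; both fields zero means the packet carries a fragment
 * header but is not actually fragmented. */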
/* Checks for a completed packet collection of fragments. */
static bool
ipf_list_complete(const struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
        if (ipf_list->frag_list[i - 1].end_data_byte + 1
            != ipf_list->frag_list[i].start_data_byte) {
            return false;
        }
    }
    return true;
}
/* Runs O(n) for a sorted or almost sorted list. */
static void
ipf_sort(struct ipf_frag *frag_list, size_t last_idx)
    /* OVS_REQUIRES(ipf_lock) */
{
    for (int li = 1; li <= last_idx; li++) {
        struct ipf_frag ipf_frag = frag_list[li];
        int ci = li - 1;
        while (ci >= 0 &&
               frag_list[ci].start_data_byte > ipf_frag.start_data_byte) {
            frag_list[ci + 1] = frag_list[ci];
            ci--;
        }
        frag_list[ci + 1] = ipf_frag;
    }
}
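/* Illustration of the insertion sort above on 'start_data_byte' values:
 *
 *     initial:  [0, 2960, 1480]
 *     li == 1:  2960 already follows 0, no shift
 *     li == 2:  1480 is held aside, 2960 shifts right, 1480 drops in
 *     result:   [0, 1480, 2960]
 *
 * Fragments mostly arrive in order, so the inner loop rarely runs and the
 * sort is effectively linear. */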
/* Called on a sorted complete list of v4 fragments to reassemble them into
 * a single packet that can be processed, such as passing through conntrack.
 */
static struct dp_packet *
ipf_reassemble_v4_frags(struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    struct ipf_frag *frag_list = ipf_list->frag_list;
    struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
    dp_packet_set_size(pkt, dp_packet_size(pkt) - dp_packet_l2_pad_size(pkt));
    struct ip_header *l3 = dp_packet_l3(pkt);
    int len = ntohs(l3->ip_tot_len);

    int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
                   frag_list[1].start_data_byte + 1;

    if (len + rest_len > IPV4_PACKET_MAX_SIZE) {
        ipf_print_reass_packet(
            "Unsupported big reassembled v4 packet; v4 hdr:", l3);
        dp_packet_delete(pkt);
        return NULL;
    }

    dp_packet_prealloc_tailroom(pkt, rest_len);

    for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
        size_t add_len = frag_list[i].end_data_byte -
                         frag_list[i].start_data_byte + 1;
        const char *l4 = dp_packet_l4(frag_list[i].pkt);
        dp_packet_put(pkt, l4, add_len);
    }

    len += rest_len;
    l3 = dp_packet_l3(pkt);
    ovs_be16 new_ip_frag_off = l3->ip_frag_off & ~htons(IP_MORE_FRAGMENTS);
    if (!dp_packet_hwol_is_ipv4(pkt)) {
        l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_frag_off,
                                    new_ip_frag_off);
        l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_tot_len, htons(len));
    }
    l3->ip_tot_len = htons(len);
    l3->ip_frag_off = new_ip_frag_off;
    dp_packet_set_l2_pad_size(pkt, 0);

    return pkt;
}
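/* A note on the incremental checksum updates above: recalc_csum16() from
 * csum.h patches a one's complement checksum when a single 16-bit field
 * changes, avoiding a full header recompute. For example, clearing only the
 * MF bit takes 'ip_frag_off' from htons(0x2000) to htons(0x0000) on a first
 * fragment, and the same helper then folds in the new 'ip_tot_len'. */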
/* Called on a sorted complete list of v6 fragments to reassemble them into
 * a single packet that can be processed, such as passing through conntrack.
 */
static struct dp_packet *
ipf_reassemble_v6_frags(struct ipf_list *ipf_list)
    /* OVS_REQUIRES(ipf_lock) */
{
    struct ipf_frag *frag_list = ipf_list->frag_list;
    struct dp_packet *pkt = dp_packet_clone(frag_list[0].pkt);
    dp_packet_set_size(pkt, dp_packet_size(pkt) - dp_packet_l2_pad_size(pkt));
    struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
    int pl = ntohs(l3->ip6_plen) - sizeof(struct ovs_16aligned_ip6_frag);

    int rest_len = frag_list[ipf_list->last_inuse_idx].end_data_byte -
                   frag_list[1].start_data_byte + 1;

    if (pl + rest_len > IPV6_PACKET_MAX_DATA) {
        ipf_print_reass_packet(
            "Unsupported big reassembled v6 packet; v6 hdr:", l3);
        dp_packet_delete(pkt);
        return NULL;
    }

    dp_packet_prealloc_tailroom(pkt, rest_len);

    for (int i = 1; i <= ipf_list->last_inuse_idx; i++) {
        size_t add_len = frag_list[i].end_data_byte -
                         frag_list[i].start_data_byte + 1;
        const char *l4 = dp_packet_l4(frag_list[i].pkt);
        dp_packet_put(pkt, l4, add_len);
    }

    pl += rest_len;
    l3 = dp_packet_l3(pkt);

    uint8_t nw_proto = l3->ip6_nxt;
    uint8_t nw_frag = 0;
    const void *data = l3 + 1;
    size_t datasize = pl;

    const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
    if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr)
        || !nw_frag || !frag_hdr) {
        ipf_print_reass_packet("Unparsed reassembled v6 packet; v6 hdr:", l3);
        dp_packet_delete(pkt);
        return NULL;
    }

    struct ovs_16aligned_ip6_frag *fh =
        CONST_CAST(struct ovs_16aligned_ip6_frag *, frag_hdr);
    fh->ip6f_offlg = 0;
    l3->ip6_plen = htons(pl);
    l3->ip6_ctlun.ip6_un1.ip6_un1_nxt = nw_proto;
    dp_packet_set_l2_pad_size(pkt, 0);
    return pkt;
}
/* Called when a frag list state transitions to another state. This is
 * triggered by a new fragment for the list being received. */
static void
ipf_list_state_transition(struct ipf *ipf, struct ipf_list *ipf_list,
                          bool ff, bool lf, bool v6)
    OVS_REQUIRES(ipf->ipf_lock)
{
    enum ipf_list_state curr_state = ipf_list->state;
    enum ipf_list_state next_state;
    switch (curr_state) {
    case IPF_LIST_STATE_UNUSED:
    case IPF_LIST_STATE_OTHER_SEEN:
        if (ff) {
            next_state = IPF_LIST_STATE_FIRST_SEEN;
        } else if (lf) {
            next_state = IPF_LIST_STATE_LAST_SEEN;
        } else {
            next_state = IPF_LIST_STATE_OTHER_SEEN;
        }
        break;
    case IPF_LIST_STATE_FIRST_SEEN:
        if (lf) {
            next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
        } else {
            next_state = IPF_LIST_STATE_FIRST_SEEN;
        }
        break;
    case IPF_LIST_STATE_LAST_SEEN:
        if (ff) {
            next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
        } else {
            next_state = IPF_LIST_STATE_LAST_SEEN;
        }
        break;
    case IPF_LIST_STATE_FIRST_LAST_SEEN:
        next_state = IPF_LIST_STATE_FIRST_LAST_SEEN;
        break;
    case IPF_LIST_STATE_COMPLETED:
    case IPF_LIST_STATE_REASS_FAIL:
    case IPF_LIST_STATE_NUM:
    default:
        OVS_NOT_REACHED();
    }

    if (next_state == IPF_LIST_STATE_FIRST_LAST_SEEN) {
        ipf_sort(ipf_list->frag_list, ipf_list->last_inuse_idx);
        if (ipf_list_complete(ipf_list)) {
            struct dp_packet *reass_pkt = v6
                ? ipf_reassemble_v6_frags(ipf_list)
                : ipf_reassemble_v4_frags(ipf_list);
            if (reass_pkt) {
                struct reassembled_pkt *rp = xzalloc(sizeof *rp);
                rp->pkt = reass_pkt;
                rp->list = ipf_list;
                ipf_reassembled_list_add(&ipf->reassembled_pkt_list, rp);
                ipf_expiry_list_remove(ipf_list);
                next_state = IPF_LIST_STATE_COMPLETED;
            } else {
                next_state = IPF_LIST_STATE_REASS_FAIL;
            }
        }
    }
    ipf_list->state = next_state;
}
/* Some sanity checks are redundant, but prudent, in case code paths for
 * fragments change in future. The processing cost for fragments is not
 * significant. */
static bool
ipf_is_valid_v4_frag(struct ipf *ipf, struct dp_packet *pkt)
{
    if (OVS_UNLIKELY(dp_packet_ip_checksum_bad(pkt))) {
        goto invalid_pkt;
    }

    const struct eth_header *l2 = dp_packet_eth(pkt);
    const struct ip_header *l3 = dp_packet_l3(pkt);

    if (OVS_UNLIKELY(!l2 || !l3)) {
        goto invalid_pkt;
    }

    size_t l3_size = dp_packet_l3_size(pkt);
    if (OVS_UNLIKELY(l3_size < IP_HEADER_LEN)) {
        goto invalid_pkt;
    }

    if (!IP_IS_FRAGMENT(l3->ip_frag_off)) {
        return false;
    }

    uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
    if (OVS_UNLIKELY(ip_tot_len != l3_size)) {
        goto invalid_pkt;
    }

    size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;
    if (OVS_UNLIKELY(ip_hdr_len < IP_HEADER_LEN)) {
        goto invalid_pkt;
    }
    if (OVS_UNLIKELY(l3_size < ip_hdr_len)) {
        goto invalid_pkt;
    }

    if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(pkt)
                     && !dp_packet_hwol_is_ipv4(pkt)
                     && csum(l3, ip_hdr_len) != 0)) {
        goto invalid_pkt;
    }

    uint32_t min_v4_frag_size_;
    atomic_read_relaxed(&ipf->min_v4_frag_size, &min_v4_frag_size_);
    bool lf = ipf_is_last_v4_frag(pkt);
    if (OVS_UNLIKELY(!lf && dp_packet_l3_size(pkt) < min_v4_frag_size_)) {
        ipf_count(ipf, false, IPF_NFRAGS_TOO_SMALL);
        goto invalid_pkt;
    }

    return true;

invalid_pkt:
    pkt->md.ct_state = CS_INVALID;
    return false;
}
static void
ipf_v4_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
                   struct ipf_list_key *key, uint16_t *start_data_byte,
                   uint16_t *end_data_byte, bool *ff, bool *lf)
{
    const struct ip_header *l3 = dp_packet_l3(pkt);
    uint16_t ip_tot_len = ntohs(l3->ip_tot_len);
    size_t ip_hdr_len = IP_IHL(l3->ip_ihl_ver) * 4;

    *start_data_byte = ntohs(l3->ip_frag_off & htons(IP_FRAG_OFF_MASK)) * 8;
    *end_data_byte = *start_data_byte + ip_tot_len - ip_hdr_len - 1;
    *ff = ipf_is_first_v4_frag(pkt);
    *lf = ipf_is_last_v4_frag(pkt);
    memset(key, 0, sizeof *key);
    key->ip_id = be16_to_be32(l3->ip_id);
    key->dl_type = dl_type;
    key->src_addr.ipv4 = get_16aligned_be32(&l3->ip_src);
    key->dst_addr.ipv4 = get_16aligned_be32(&l3->ip_dst);
    key->nw_proto = l3->ip_proto;
    key->zone = zone;
    key->recirc_id = pkt->md.recirc_id;
}
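/* Worked example of the offset math above: a middle fragment carrying
 * payload bytes 1480..2959 has 185 (1480 / 8) in the offset bits, so
 * *start_data_byte is 185 * 8 == 1480. With ip_tot_len == 1500 and a 20
 * byte header, *end_data_byte is 1480 + 1500 - 20 - 1 == 2959. */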
/* Some sanity checks are redundant, but prudent, in case code paths for
 * fragments change in future. The processing cost for fragments is not
 * significant. */
static bool
ipf_is_valid_v6_frag(struct ipf *ipf, struct dp_packet *pkt)
{
    const struct eth_header *l2 = dp_packet_eth(pkt);
    const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
    const char *l4 = dp_packet_l4(pkt);

    if (OVS_UNLIKELY(!l2 || !l3 || !l4)) {
        goto invalid_pkt;
    }

    size_t l3_size = dp_packet_l3_size(pkt);
    size_t l3_hdr_size = sizeof *l3;

    if (OVS_UNLIKELY(l3_size < l3_hdr_size)) {
        goto invalid_pkt;
    }

    uint8_t nw_frag = 0;
    uint8_t nw_proto = l3->ip6_nxt;
    const void *data = l3 + 1;
    size_t datasize = l3_size - l3_hdr_size;
    const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;
    if (!parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag,
                             &frag_hdr) || !nw_frag || !frag_hdr) {
        return false;
    }

    int pl = ntohs(l3->ip6_plen);
    if (OVS_UNLIKELY(pl + l3_hdr_size != l3_size)) {
        goto invalid_pkt;
    }

    ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
    if (OVS_UNLIKELY(!ipf_is_v6_frag(ip6f_offlg))) {
        return false;
    }

    uint32_t min_v6_frag_size_;
    atomic_read_relaxed(&ipf->min_v6_frag_size, &min_v6_frag_size_);
    bool lf = ipf_is_last_v6_frag(ip6f_offlg);

    if (OVS_UNLIKELY(!lf && dp_packet_l3_size(pkt) < min_v6_frag_size_)) {
        ipf_count(ipf, true, IPF_NFRAGS_TOO_SMALL);
        goto invalid_pkt;
    }

    return true;

invalid_pkt:
    pkt->md.ct_state = CS_INVALID;
    return false;
}
static void
ipf_v6_key_extract(struct dp_packet *pkt, ovs_be16 dl_type, uint16_t zone,
                   struct ipf_list_key *key, uint16_t *start_data_byte,
                   uint16_t *end_data_byte, bool *ff, bool *lf)
{
    const struct ovs_16aligned_ip6_hdr *l3 = dp_packet_l3(pkt);
    uint8_t nw_frag = 0;
    uint8_t nw_proto = l3->ip6_nxt;
    const void *data = l3 + 1;
    size_t datasize = dp_packet_l3_size(pkt) - sizeof *l3;
    const struct ovs_16aligned_ip6_frag *frag_hdr = NULL;

    parse_ipv6_ext_hdrs(&data, &datasize, &nw_proto, &nw_frag, &frag_hdr);
    ovs_assert(nw_frag && frag_hdr);
    ovs_be16 ip6f_offlg = frag_hdr->ip6f_offlg;
    *start_data_byte = ntohs(ip6f_offlg & IP6F_OFF_MASK) +
        sizeof (struct ovs_16aligned_ip6_frag);
    *end_data_byte = *start_data_byte + dp_packet_l4_size(pkt) - 1;
    *ff = ipf_is_first_v6_frag(ip6f_offlg);
    *lf = ipf_is_last_v6_frag(ip6f_offlg);
    memset(key, 0, sizeof *key);
    key->ip_id = get_16aligned_be32(&frag_hdr->ip6f_ident);
    key->dl_type = dl_type;
    memcpy(&key->src_addr.ipv6, &l3->ip6_src, sizeof key->src_addr.ipv6);
    /* We are not supporting parsing of the routing header to use as the
     * dst address part of the key. */
    memcpy(&key->dst_addr.ipv6, &l3->ip6_dst, sizeof key->dst_addr.ipv6);
    key->nw_proto = 0;   /* Not used for key for V6. */
    key->zone = zone;
    key->recirc_id = pkt->md.recirc_id;
}
static bool
ipf_list_key_eq(const struct ipf_list_key *key1,
                const struct ipf_list_key *key2)
    /* OVS_REQUIRES(ipf_lock) */
{
    if (!memcmp(&key1->src_addr, &key2->src_addr, sizeof key1->src_addr) &&
        !memcmp(&key1->dst_addr, &key2->dst_addr, sizeof key1->dst_addr) &&
        key1->dl_type == key2->dl_type &&
        key1->ip_id == key2->ip_id &&
        key1->zone == key2->zone &&
        key1->nw_proto == key2->nw_proto &&
        key1->recirc_id == key2->recirc_id) {
        return true;
    }
    return false;
}
static struct ipf_list *
ipf_list_key_lookup(struct ipf *ipf, const struct ipf_list_key *key,
                    uint32_t hash)
    OVS_REQUIRES(ipf->ipf_lock)
{
    struct ipf_list *ipf_list;
    HMAP_FOR_EACH_WITH_HASH (ipf_list, node, hash, &ipf->frag_lists) {
        if (ipf_list_key_eq(&ipf_list->key, key)) {
            return ipf_list;
        }
    }
    return NULL;
}
static bool
ipf_is_frag_duped(const struct ipf_frag *frag_list, int last_inuse_idx,
                  size_t start_data_byte, size_t end_data_byte)
    /* OVS_REQUIRES(ipf_lock) */
{
    for (int i = 0; i <= last_inuse_idx; i++) {
        if ((start_data_byte >= frag_list[i].start_data_byte &&
             start_data_byte <= frag_list[i].end_data_byte) ||
            (end_data_byte >= frag_list[i].start_data_byte &&
             end_data_byte <= frag_list[i].end_data_byte)) {
            return true;
        }
    }
    return false;
}
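/* Example: with an accepted fragment covering bytes [1480, 2959], a new
 * fragment [2000, 2959] is reported as duplicated because its endpoints
 * fall inside the existing range, while [2960, 4439] passes. A fragment
 * that fully contains an existing one, e.g. [0, 4439], slips past these
 * endpoint checks, but the contiguity test in ipf_list_complete() keeps
 * such a list from ever reassembling. */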
/* Adds a fragment to a list of fragments, if the fragment is not a
 * duplicate. If the fragment is a duplicate, that fragment is marked
 * invalid to avoid the work that conntrack would do to mark the fragment
 * as invalid, which it will in all cases. */
static bool
ipf_process_frag(struct ipf *ipf, struct ipf_list *ipf_list,
                 struct dp_packet *pkt, uint16_t start_data_byte,
                 uint16_t end_data_byte, bool ff, bool lf, bool v6,
                 bool dnsteal)
    OVS_REQUIRES(ipf->ipf_lock)
{
    bool duped_frag = ipf_is_frag_duped(ipf_list->frag_list,
        ipf_list->last_inuse_idx, start_data_byte, end_data_byte);
    int last_inuse_idx = ipf_list->last_inuse_idx;

    if (!duped_frag) {
        if (last_inuse_idx < ipf_list->size - 1) {
            /* In the case of dpdk, it would be unfortunate if we had
             * to create a clone fragment outside the dpdk mp due to the
             * mempool size being too limited. We will otherwise need to
             * recommend not setting the mempool number of buffers too low
             * and also clamp the number of fragments. */
            struct ipf_frag *frag = &ipf_list->frag_list[last_inuse_idx + 1];
            frag->pkt = pkt;
            frag->start_data_byte = start_data_byte;
            frag->end_data_byte = end_data_byte;
            frag->dnsteal = dnsteal;
            ipf_list->last_inuse_idx++;
            atomic_count_inc(&ipf->nfrag);
            ipf_count(ipf, v6, IPF_NFRAGS_ACCEPTED);
            ipf_list_state_transition(ipf, ipf_list, ff, lf, v6);
        } else {
            OVS_NOT_REACHED();
        }
    } else {
        ipf_count(ipf, v6, IPF_NFRAGS_OVERLAP);
        pkt->md.ct_state = CS_INVALID;
        return false;
    }
    return true;
}
static void
ipf_list_init(struct ipf_list *ipf_list, struct ipf_list_key *key,
              int max_frag_list_size)
{
    ipf_list->key = *key;
    ipf_list->last_inuse_idx = IPF_INVALID_IDX;
    ipf_list->last_sent_idx = IPF_INVALID_IDX;
    ipf_list->reass_execute_ctx = NULL;
    ipf_list->state = IPF_LIST_STATE_UNUSED;
    ipf_list->size = max_frag_list_size;
    ipf_list->frag_list
        = xzalloc(ipf_list->size * sizeof *ipf_list->frag_list);
}
/* Generates a fragment list key from a well formed fragment and either starts
 * a new fragment list or increases the size of the existing fragment list,
 * while checking that the number of in-flight fragments stays within the
 * supported maximum and that the list size is not impossibly big. Calls
 * 'ipf_process_frag()' to add a fragment to a list of fragments. */
static bool
ipf_handle_frag(struct ipf *ipf, struct dp_packet *pkt, ovs_be16 dl_type,
                uint16_t zone, long long now, uint32_t hash_basis,
                bool dnsteal)
    OVS_REQUIRES(ipf->ipf_lock)
{
    struct ipf_list_key key;
    /* Initialize 4 variables for some versions of GCC. */
    uint16_t start_data_byte = 0;
    uint16_t end_data_byte = 0;
    bool ff = false;
    bool lf = false;

    bool v6 = dl_type == htons(ETH_TYPE_IPV6);

    if (v6 && ipf_get_v6_enabled(ipf)) {
        ipf_v6_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
                           &end_data_byte, &ff, &lf);
    } else if (!v6 && ipf_get_v4_enabled(ipf)) {
        ipf_v4_key_extract(pkt, dl_type, zone, &key, &start_data_byte,
                           &end_data_byte, &ff, &lf);
    } else {
        return false;
    }

    unsigned int nfrag_max;
    atomic_read_relaxed(&ipf->nfrag_max, &nfrag_max);
    if (atomic_count_get(&ipf->nfrag) >= nfrag_max) {
        return false;
    }

    uint32_t hash = ipf_list_key_hash(&key, hash_basis);
    struct ipf_list *ipf_list = ipf_list_key_lookup(ipf, &key, hash);
    enum {
        IPF_FRAG_LIST_MIN_INCREMENT = 4,
        IPF_IPV6_MAX_FRAG_LIST_SIZE = 65535,
    };

    int max_frag_list_size;
    if (v6) {
        /* Because the calculation with extension headers is variable,
         * we don't calculate a hard maximum fragment list size upfront. The
         * fragment list size is practically limited by the code, however. */
        max_frag_list_size = IPF_IPV6_MAX_FRAG_LIST_SIZE;
    } else {
        max_frag_list_size = ipf->max_v4_frag_list_size;
    }

    if (!ipf_list) {
        ipf_list = xmalloc(sizeof *ipf_list);
        ipf_list_init(ipf_list, &key,
                      MIN(max_frag_list_size, IPF_FRAG_LIST_MIN_INCREMENT));
        hmap_insert(&ipf->frag_lists, &ipf_list->node, hash);
        ipf_expiry_list_add(&ipf->frag_exp_list, ipf_list, now);
    } else if (ipf_list->state == IPF_LIST_STATE_REASS_FAIL ||
               ipf_list->state == IPF_LIST_STATE_COMPLETED) {
        /* Bail out as early as possible. */
        return false;
    } else if (ipf_list->last_inuse_idx + 1 >= ipf_list->size) {
        int increment = MIN(IPF_FRAG_LIST_MIN_INCREMENT,
                            max_frag_list_size - ipf_list->size);
        /* Enforce limit. */
        if (increment > 0) {
            ipf_list->frag_list =
                xrealloc(ipf_list->frag_list, (ipf_list->size + increment) *
                         sizeof *ipf_list->frag_list);
            ipf_list->size += increment;
        } else {
            return false;
        }
    }

    return ipf_process_frag(ipf, ipf_list, pkt, start_data_byte,
                            end_data_byte, ff, lf, v6, dnsteal);
}
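/* Example of the growth policy above: a new list starts at
 * MIN(max_frag_list_size, IPF_FRAG_LIST_MIN_INCREMENT) entries and grows by
 * at most IPF_FRAG_LIST_MIN_INCREMENT entries at a time, so a packet split
 * into, say, 11 fragments triggers two xrealloc() calls (4 -> 8 -> 12)
 * before the list can hold them all. */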
/* Filters fragments out of a batch of packets and adjusts the batch. */
static void
ipf_extract_frags_from_batch(struct ipf *ipf, struct dp_packet_batch *pb,
                             ovs_be16 dl_type, uint16_t zone, long long now,
                             uint32_t hash_basis)
{
    const size_t pb_cnt = dp_packet_batch_size(pb);
    int pb_idx; /* Index in a packet batch. */
    struct dp_packet *pkt;

    DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
        if (OVS_UNLIKELY((dl_type == htons(ETH_TYPE_IP) &&
                          ipf_is_valid_v4_frag(ipf, pkt))
                          ||
                          (dl_type == htons(ETH_TYPE_IPV6) &&
                          ipf_is_valid_v6_frag(ipf, pkt)))) {

            ovs_mutex_lock(&ipf->ipf_lock);
            if (!ipf_handle_frag(ipf, pkt, dl_type, zone, now, hash_basis,
                                 pkt->md.do_not_steal)) {
                dp_packet_batch_refill(pb, pkt, pb_idx);
            }
            ovs_mutex_unlock(&ipf->ipf_lock);
        } else {
            dp_packet_batch_refill(pb, pkt, pb_idx);
        }
    }
}
/* In case of DPDK, a memory source check is done, as DPDK memory pool
 * management has trouble dealing with multiple source types. The
 * 'check_source' parameter is used to indicate when this check is needed. */
static bool
ipf_dp_packet_batch_add(struct dp_packet_batch *pb, struct dp_packet *pkt,
                        bool check_source OVS_UNUSED)
{
#ifdef DPDK_NETDEV
    if ((dp_packet_batch_is_full(pb)) ||
        /* DPDK cannot handle multiple sources in a batch. */
        (check_source && !dp_packet_batch_is_empty(pb)
         && pb->packets[0]->source != pkt->source)) {
#else
    if (dp_packet_batch_is_full(pb)) {
#endif
        return false;
    }

    dp_packet_batch_add(pb, pkt);
    return true;
}
/* This would be used in rare cases where a list cannot be sent. One rare
 * reason known right now is a mempool source check, which exists due to DPDK
 * support, where packets are no longer being received on any port with a
 * source matching the fragment. Another reason is a race where all
 * conntrack rules are unconfigured when some fragments are yet to be
 * flushed.
 *
 * Returns true if the list was purged. */
static bool
ipf_purge_list_check(struct ipf *ipf, struct ipf_list *ipf_list,
                     long long now)
    OVS_REQUIRES(ipf->ipf_lock)
{
    enum {
        IPF_FRAG_LIST_PURGE_TIME_ADJ = 10000
    };

    if (now < ipf_list->expiration + IPF_FRAG_LIST_PURGE_TIME_ADJ) {
        return false;
    }

    while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
        struct dp_packet * pkt
            = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
        dp_packet_delete(pkt);
        atomic_count_dec(&ipf->nfrag);
        COVERAGE_INC(ipf_stuck_frag_list_purged);
        ipf_count(ipf, ipf_list->key.dl_type == htons(ETH_TYPE_IPV6),
                  IPF_NFRAGS_PURGED);
        ipf_list->last_sent_idx++;
    }

    return true;
}
/* Does the packet batch management and common accounting work associated
 * with 'ipf_send_completed_frags()' and 'ipf_send_expired_frags()'. */
static bool
ipf_send_frags_in_list(struct ipf *ipf, struct ipf_list *ipf_list,
                       struct dp_packet_batch *pb,
                       enum ipf_list_type list_type, bool v6, long long now)
    OVS_REQUIRES(ipf->ipf_lock)
{
    if (ipf_purge_list_check(ipf, ipf_list, now)) {
        return true;
    }

    while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
        struct dp_packet *pkt
            = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
        if (ipf_dp_packet_batch_add(pb, pkt, true)) {
            ipf_list->last_sent_idx++;
            atomic_count_dec(&ipf->nfrag);

            if (list_type == IPF_FRAG_COMPLETED_LIST) {
                ipf_count(ipf, v6, IPF_NFRAGS_COMPL_SENT);
            } else {
                ipf_count(ipf, v6, IPF_NFRAGS_EXPD_SENT);
                pkt->md.ct_state = CS_INVALID;
            }

            if (ipf_list->last_sent_idx == ipf_list->last_inuse_idx) {
                return true;
            }
        } else {
            return false;
        }
    }
    OVS_NOT_REACHED();
}
/* Adds fragments associated with a completed fragment list to a packet batch
 * to be processed by the calling application, typically conntrack. Also
 * cleans up the list context when it is empty. */
static void
ipf_send_completed_frags(struct ipf *ipf, struct dp_packet_batch *pb,
                         long long now, bool v6)
{
    if (ovs_list_is_empty(&ipf->frag_complete_list)) {
        return;
    }

    ovs_mutex_lock(&ipf->ipf_lock);
    struct ipf_list *ipf_list, *next;

    LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_complete_list) {
        if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_COMPLETED_LIST,
                                   v6, now)) {
            ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
        } else {
            break;
        }
    }

    ovs_mutex_unlock(&ipf->ipf_lock);
}
/* Conservatively adds fragments associated with an expired fragment list to
 * a packet batch to be processed by the calling application, typically
 * conntrack. Also cleans up the list context when it is empty. */
static void
ipf_send_expired_frags(struct ipf *ipf, struct dp_packet_batch *pb,
                       long long now, bool v6)
{
    enum {
        /* Very conservative, due to DOS probability. */
        IPF_FRAG_LIST_MAX_EXPIRED = 1,
    };

    if (ovs_list_is_empty(&ipf->frag_exp_list)) {
        return;
    }

    ovs_mutex_lock(&ipf->ipf_lock);
    struct ipf_list *ipf_list, *next;
    size_t lists_removed = 0;

    LIST_FOR_EACH_SAFE (ipf_list, next, list_node, &ipf->frag_exp_list) {
        if (now <= ipf_list->expiration ||
            lists_removed >= IPF_FRAG_LIST_MAX_EXPIRED) {
            break;
        }

        if (ipf_send_frags_in_list(ipf, ipf_list, pb, IPF_FRAG_EXPIRY_LIST,
                                   v6, now)) {
            ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
            lists_removed++;
        } else {
            break;
        }
    }

    ovs_mutex_unlock(&ipf->ipf_lock);
}
/* Adds a reassembled packet to a packet batch to be processed by the caller.
 */
static void
ipf_execute_reass_pkts(struct ipf *ipf, struct dp_packet_batch *pb)
{
    if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
        return;
    }

    ovs_mutex_lock(&ipf->ipf_lock);
    struct reassembled_pkt *rp, *next;

    LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
        if (!rp->list->reass_execute_ctx &&
            ipf_dp_packet_batch_add(pb, rp->pkt, false)) {
            rp->list->reass_execute_ctx = rp->pkt;
        }
    }

    ovs_mutex_unlock(&ipf->ipf_lock);
}
/* Checks for reassembled packets post processing by conntrack and edits the
 * fragments if needed based on what conntrack decided. */
static void
ipf_post_execute_reass_pkts(struct ipf *ipf,
                            struct dp_packet_batch *pb, bool v6)
{
    if (ovs_list_is_empty(&ipf->reassembled_pkt_list)) {
        return;
    }

    ovs_mutex_lock(&ipf->ipf_lock);
    struct reassembled_pkt *rp, *next;

    LIST_FOR_EACH_SAFE (rp, next, rp_list_node, &ipf->reassembled_pkt_list) {
        const size_t pb_cnt = dp_packet_batch_size(pb);
        int pb_idx;
        struct dp_packet *pkt;
        /* Inner batch loop is constant time since batch size is <=
         * NETDEV_MAX_BURST. */
        DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
            if (rp && pkt == rp->list->reass_execute_ctx) {
                for (int i = 0; i <= rp->list->last_inuse_idx; i++) {
                    rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label;
                    rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark;
                    rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state;
                    rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone;
                    rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 =
                        pkt->md.ct_orig_tuple_ipv6;
                    if (pkt->md.ct_orig_tuple_ipv6) {
                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 =
                            pkt->md.ct_orig_tuple.ipv6;
                    } else {
                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4 =
                            pkt->md.ct_orig_tuple.ipv4;
                    }
                }

                const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
                void *l4_frag = dp_packet_l4(frag_0->pkt);
                void *l4_reass = dp_packet_l4(pkt);
                memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt));

                if (v6) {
                    struct ovs_16aligned_ip6_hdr *l3_frag
                        = dp_packet_l3(frag_0->pkt);
                    struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt);
                    l3_frag->ip6_src = l3_reass->ip6_src;
                    l3_frag->ip6_dst = l3_reass->ip6_dst;
                } else {
                    struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt);
                    struct ip_header *l3_reass = dp_packet_l3(pkt);
                    if (!dp_packet_hwol_is_ipv4(frag_0->pkt)) {
                        ovs_be32 reass_ip =
                            get_16aligned_be32(&l3_reass->ip_src);
                        ovs_be32 frag_ip =
                            get_16aligned_be32(&l3_frag->ip_src);

                        l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
                                                         frag_ip, reass_ip);
                        reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
                        frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
                        l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
                                                         frag_ip, reass_ip);
                    }

                    l3_frag->ip_src = l3_reass->ip_src;
                    l3_frag->ip_dst = l3_reass->ip_dst;
                }

                ipf_completed_list_add(&ipf->frag_complete_list, rp->list);
                ipf_reassembled_list_remove(rp);
                dp_packet_delete(rp->pkt);
                free(rp);
                rp = NULL;
            } else {
                dp_packet_batch_refill(pb, pkt, pb_idx);
            }
        }
    }

    ovs_mutex_unlock(&ipf->ipf_lock);
}
/* Extracts any fragments from the batch and reassembles them when a
 * complete packet is received. Completed packets are attempted to
 * be added to the batch to be sent through conntrack. */
void
ipf_preprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
                         long long now, ovs_be16 dl_type, uint16_t zone,
                         uint32_t hash_basis)
{
    if (ipf_get_enabled(ipf)) {
        ipf_extract_frags_from_batch(ipf, pb, dl_type, zone, now, hash_basis);
    }

    if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
        ipf_execute_reass_pkts(ipf, pb);
    }
}
/* Updates fragments based on the processing of the reassembled packet sent
 * through conntrack and adds these fragments to any batches seen. Expired
 * fragments are marked as invalid and also added to the batches seen
 * with low priority. Reassembled packets are freed. */
void
ipf_postprocess_conntrack(struct ipf *ipf, struct dp_packet_batch *pb,
                          long long now, ovs_be16 dl_type)
{
    if (ipf_get_enabled(ipf) || atomic_count_get(&ipf->nfrag)) {
        bool v6 = dl_type == htons(ETH_TYPE_IPV6);
        ipf_post_execute_reass_pkts(ipf, pb, v6);
        ipf_send_completed_frags(ipf, pb, now, v6);
        ipf_send_expired_frags(ipf, pb, now, v6);
    }
}
static void *
ipf_clean_thread_main(void *f)
{
    struct ipf *ipf = f;

    enum {
        IPF_FRAG_LIST_CLEAN_TIMEOUT = 60000,
    };

    while (!latch_is_set(&ipf->ipf_clean_thread_exit)) {

        long long now = time_msec();

        if (!ovs_list_is_empty(&ipf->frag_exp_list) ||
            !ovs_list_is_empty(&ipf->frag_complete_list)) {

            ovs_mutex_lock(&ipf->ipf_lock);

            struct ipf_list *ipf_list, *next;
            LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
                                &ipf->frag_exp_list) {
                if (ipf_purge_list_check(ipf, ipf_list, now)) {
                    ipf_expiry_list_clean(&ipf->frag_lists, ipf_list);
                }
            }

            LIST_FOR_EACH_SAFE (ipf_list, next, list_node,
                                &ipf->frag_complete_list) {
                if (ipf_purge_list_check(ipf, ipf_list, now)) {
                    ipf_completed_list_clean(&ipf->frag_lists, ipf_list);
                }
            }

            ovs_mutex_unlock(&ipf->ipf_lock);
        }

        poll_timer_wait_until(now + IPF_FRAG_LIST_CLEAN_TIMEOUT);
        latch_wait(&ipf->ipf_clean_thread_exit);
        poll_block();
    }

    return NULL;
}
struct ipf *
ipf_init(void)
{
    struct ipf *ipf = xzalloc(sizeof *ipf);

    ovs_mutex_init_adaptive(&ipf->ipf_lock);
    ovs_mutex_lock(&ipf->ipf_lock);
    hmap_init(&ipf->frag_lists);
    ovs_list_init(&ipf->frag_exp_list);
    ovs_list_init(&ipf->frag_complete_list);
    ovs_list_init(&ipf->reassembled_pkt_list);
    atomic_init(&ipf->min_v4_frag_size, IPF_V4_FRAG_SIZE_MIN_DEF);
    atomic_init(&ipf->min_v6_frag_size, IPF_V6_FRAG_SIZE_MIN_DEF);
    ipf->max_v4_frag_list_size = DIV_ROUND_UP(
        IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
        ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
    ovs_mutex_unlock(&ipf->ipf_lock);
    atomic_count_init(&ipf->nfrag, 0);
    for (size_t i = 0; i < IPF_NFRAGS_NUM_CNTS; i++) {
        atomic_init(&ipf->n4frag_cnt[i], 0);
        atomic_init(&ipf->n6frag_cnt[i], 0);
    }
    atomic_init(&ipf->nfrag_max, IPF_MAX_FRAGS_DEFAULT);
    atomic_init(&ipf->ifp_v4_enabled, true);
    atomic_init(&ipf->ifp_v6_enabled, true);
    latch_init(&ipf->ipf_clean_thread_exit);
    ipf->ipf_clean_thread = ovs_thread_create("ipf_clean",
                                              ipf_clean_thread_main, ipf);

    return ipf;
}
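/* Minimal usage sketch (illustrative only; the variable names around the
 * calls are assumed, not taken from this file). The owner, typically
 * conntrack, brackets each packet batch with the pre/post hooks:
 *
 *     struct ipf *ipf = ipf_init();
 *     ...
 *     ipf_preprocess_conntrack(ipf, pb, now, dl_type, zone, hash_basis);
 *     ... conntrack processes 'pb', including any reassembled packet ...
 *     ipf_postprocess_conntrack(ipf, pb, now, dl_type);
 *     ...
 *     ipf_destroy(ipf);
 */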
void
ipf_destroy(struct ipf *ipf)
{
    ovs_mutex_lock(&ipf->ipf_lock);
    latch_set(&ipf->ipf_clean_thread_exit);
    pthread_join(ipf->ipf_clean_thread, NULL);
    latch_destroy(&ipf->ipf_clean_thread_exit);

    struct ipf_list *ipf_list;
    HMAP_FOR_EACH_POP (ipf_list, node, &ipf->frag_lists) {
        while (ipf_list->last_sent_idx < ipf_list->last_inuse_idx) {
            struct dp_packet *pkt
                = ipf_list->frag_list[ipf_list->last_sent_idx + 1].pkt;
            if (!ipf_list->frag_list[ipf_list->last_sent_idx + 1].dnsteal) {
                dp_packet_delete(pkt);
            }
            atomic_count_dec(&ipf->nfrag);
            ipf_list->last_sent_idx++;
        }
        free(ipf_list->frag_list);
        free(ipf_list);
    }

    if (atomic_count_get(&ipf->nfrag)) {
        VLOG_WARN("ipf destroy with non-zero fragment count. ");
    }

    struct reassembled_pkt *rp;
    LIST_FOR_EACH_POP (rp, rp_list_node, &ipf->reassembled_pkt_list) {
        dp_packet_delete(rp->pkt);
        free(rp);
    }

    hmap_destroy(&ipf->frag_lists);
    ovs_list_poison(&ipf->frag_exp_list);
    ovs_list_poison(&ipf->frag_complete_list);
    ovs_list_poison(&ipf->reassembled_pkt_list);
    ovs_mutex_unlock(&ipf->ipf_lock);
    ovs_mutex_destroy(&ipf->ipf_lock);
    free(ipf);
}
int
ipf_set_enabled(struct ipf *ipf, bool v6, bool enable)
{
    atomic_store_relaxed(v6 ? &ipf->ifp_v6_enabled : &ipf->ifp_v4_enabled,
                         enable);
    return 0;
}
int
ipf_set_min_frag(struct ipf *ipf, bool v6, uint32_t value)
{
    /* If the user specifies an unreasonably large number, fragmentation
     * will not work well but it will not blow up. */
    if (value < (v6 ? IPF_V6_FRAG_SIZE_LBOUND : IPF_V4_FRAG_SIZE_LBOUND)) {
        return 1;
    }

    ovs_mutex_lock(&ipf->ipf_lock);
    if (v6) {
        atomic_store_relaxed(&ipf->min_v6_frag_size, value);
    } else {
        atomic_store_relaxed(&ipf->min_v4_frag_size, value);
        ipf->max_v4_frag_list_size = DIV_ROUND_UP(
            IPV4_PACKET_MAX_SIZE - IPV4_PACKET_MAX_HDR_SIZE,
            ipf->min_v4_frag_size - IPV4_PACKET_MAX_HDR_SIZE);
    }
    ovs_mutex_unlock(&ipf->ipf_lock);
    return 0;
}
int
ipf_set_max_nfrags(struct ipf *ipf, uint32_t value)
{
    if (value > IPF_NFRAG_UBOUND) {
        return 1;
    }
    atomic_store_relaxed(&ipf->nfrag_max, value);
    return 0;
}
int
ipf_get_status(struct ipf *ipf, struct ipf_status *ipf_status)
{
    ipf_status->nfrag = atomic_count_get(&ipf->nfrag);
    atomic_read_relaxed(&ipf->nfrag_max, &ipf_status->nfrag_max);

    atomic_read_relaxed(&ipf->ifp_v4_enabled, &ipf_status->v4.enabled);
    atomic_read_relaxed(&ipf->min_v4_frag_size,
                        &ipf_status->v4.min_frag_size);
    atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_ACCEPTED],
                        &ipf_status->v4.nfrag_accepted);
    atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_COMPL_SENT],
                        &ipf_status->v4.nfrag_completed_sent);
    atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_EXPD_SENT],
                        &ipf_status->v4.nfrag_expired_sent);
    atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_TOO_SMALL],
                        &ipf_status->v4.nfrag_too_small);
    atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_OVERLAP],
                        &ipf_status->v4.nfrag_overlap);
    atomic_read_relaxed(&ipf->n4frag_cnt[IPF_NFRAGS_PURGED],
                        &ipf_status->v4.nfrag_purged);

    atomic_read_relaxed(&ipf->ifp_v6_enabled, &ipf_status->v6.enabled);
    atomic_read_relaxed(&ipf->min_v6_frag_size,
                        &ipf_status->v6.min_frag_size);
    atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_ACCEPTED],
                        &ipf_status->v6.nfrag_accepted);
    atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_COMPL_SENT],
                        &ipf_status->v6.nfrag_completed_sent);
    atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_EXPD_SENT],
                        &ipf_status->v6.nfrag_expired_sent);
    atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_TOO_SMALL],
                        &ipf_status->v6.nfrag_too_small);
    atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_OVERLAP],
                        &ipf_status->v6.nfrag_overlap);
    atomic_read_relaxed(&ipf->n6frag_cnt[IPF_NFRAGS_PURGED],
                        &ipf_status->v6.nfrag_purged);
    return 0;
}
struct ipf_dump_ctx {
    struct hmap_position bucket_pos;
};
/* Allocates an 'ipf_dump_ctx' to keep track of an hmap position. The
 * caller must call ipf_dump_done() when dumping is finished. */
int
ipf_dump_start(struct ipf_dump_ctx **ipf_dump_ctx)
{
    *ipf_dump_ctx = xzalloc(sizeof **ipf_dump_ctx);
    return 0;
}
/* Creates a string representation of the state of an 'ipf_list' and puts
 * it in 'ds'. */
static void
ipf_dump_create(const struct ipf_list *ipf_list, struct ds *ds)
{
    ds_put_cstr(ds, "(");
    if (ipf_list->key.dl_type == htons(ETH_TYPE_IP)) {
        ds_put_format(ds, "src="IP_FMT",dst="IP_FMT",",
                      IP_ARGS(ipf_list->key.src_addr.ipv4),
                      IP_ARGS(ipf_list->key.dst_addr.ipv4));
    } else {
        ds_put_cstr(ds, "src=");
        ipv6_format_addr(&ipf_list->key.src_addr.ipv6, ds);
        ds_put_cstr(ds, ",dst=");
        ipv6_format_addr(&ipf_list->key.dst_addr.ipv6, ds);
        ds_put_cstr(ds, ",");
    }

    ds_put_format(ds, "recirc_id=%u,ip_id=%u,dl_type=0x%x,zone=%u,nw_proto=%u",
                  ipf_list->key.recirc_id, ntohl(ipf_list->key.ip_id),
                  ntohs(ipf_list->key.dl_type), ipf_list->key.zone,
                  ipf_list->key.nw_proto);

    ds_put_format(ds, ",num_fragments=%u,state=%s",
                  ipf_list->last_inuse_idx + 1,
                  ipf_state_name[ipf_list->state]);

    ds_put_cstr(ds, ")");
}
/* Finds the next ipf list starting from 'ipf_dump_ctx->bucket_pos' and uses
 * ipf_dump_create() to create a string representation of the state of an
 * ipf list, to which 'dump' is pointed. Returns EOF when there are no
 * more ipf lists. */
int
ipf_dump_next(struct ipf *ipf, struct ipf_dump_ctx *ipf_dump_ctx, char **dump)
{
    ovs_mutex_lock(&ipf->ipf_lock);

    struct hmap_node *node = hmap_at_position(&ipf->frag_lists,
                                              &ipf_dump_ctx->bucket_pos);
    if (!node) {
        ovs_mutex_unlock(&ipf->ipf_lock);
        return EOF;
    } else {
        struct ipf_list *ipf_list_;
        INIT_CONTAINER(ipf_list_, node, node);
        struct ipf_list ipf_list = *ipf_list_;
        ovs_mutex_unlock(&ipf->ipf_lock);
        struct ds ds = DS_EMPTY_INITIALIZER;
        ipf_dump_create(&ipf_list, &ds);
        *dump = ds_steal_cstr(&ds);
        return 0;
    }
}
/* Frees 'ipf_dump_ctx' allocated by ipf_dump_start(). */
int
ipf_dump_done(struct ipf_dump_ctx *ipf_dump_ctx)
{
    free(ipf_dump_ctx);
    return 0;
}
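/* Illustrative dump loop (a sketch of an assumed caller, e.g. a unixctl
 * handler accumulating output in a struct ds named 'output'):
 *
 *     struct ipf_dump_ctx *ctx;
 *     char *dump;
 *
 *     ipf_dump_start(&ctx);
 *     while (ipf_dump_next(ipf, ctx, &dump) != EOF) {
 *         ds_put_format(&output, "%s\n", dump);
 *         free(dump);
 *     }
 *     ipf_dump_done(ctx);
 */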