2 * Copyright (c) 2015, 2016 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
18 #include "conntrack.h"
21 #include <sys/types.h>
22 #include <netinet/in.h>
23 #include <netinet/icmp6.h>
26 #include "conntrack-private.h"
30 #include "dp-packet.h"
33 #include "odp-netlink.h"
34 #include "openvswitch/hmap.h"
35 #include "openvswitch/vlog.h"
37 #include "ovs-thread.h"
38 #include "poll-loop.h"
43 VLOG_DEFINE_THIS_MODULE(conntrack
);
45 COVERAGE_DEFINE(conntrack_full
);
46 COVERAGE_DEFINE(conntrack_long_cleanup
);
48 struct conn_lookup_ctx
{
56 static bool conn_key_extract(struct conntrack
*, struct dp_packet
*,
57 ovs_be16 dl_type
, struct conn_lookup_ctx
*,
59 static uint32_t conn_key_hash(const struct conn_key
*, uint32_t basis
);
60 static void conn_key_reverse(struct conn_key
*);
61 static void conn_key_lookup(struct conntrack_bucket
*ctb
,
62 struct conn_lookup_ctx
*ctx
,
64 static bool valid_new(struct dp_packet
*pkt
, struct conn_key
*);
65 static struct conn
*new_conn(struct conntrack_bucket
*, struct dp_packet
*pkt
,
66 struct conn_key
*, long long now
);
67 static void delete_conn(struct conn
*);
68 static enum ct_update_res
conn_update(struct conn
*,
69 struct conntrack_bucket
*ctb
,
70 struct dp_packet
*, bool reply
,
72 static bool conn_expired(struct conn
*, long long now
);
73 static void set_mark(struct dp_packet
*, struct conn
*,
74 uint32_t val
, uint32_t mask
);
75 static void set_label(struct dp_packet
*, struct conn
*,
76 const struct ovs_key_ct_labels
*val
,
77 const struct ovs_key_ct_labels
*mask
);
78 static void *clean_thread_main(void *f_
);
80 static struct nat_conn_key_node
*
81 nat_conn_keys_lookup(struct hmap
*nat_conn_keys
,
82 const struct conn_key
*key
,
86 nat_conn_keys_remove(struct hmap
*nat_conn_keys
,
87 const struct conn_key
*key
,
91 nat_select_range_tuple(struct conntrack
*ct
, const struct conn
*conn
,
92 struct conn
*nat_conn
);
95 reverse_icmp_type(uint8_t type
);
97 reverse_icmp6_type(uint8_t type
);
99 extract_l3_ipv4(struct conn_key
*key
, const void *data
, size_t size
,
100 const char **new_data
, bool validate_checksum
);
102 extract_l3_ipv6(struct conn_key
*key
, const void *data
, size_t size
,
103 const char **new_data
);
105 static struct ct_l4_proto
*l4_protos
[] = {
106 [IPPROTO_TCP
] = &ct_proto_tcp
,
107 [IPPROTO_UDP
] = &ct_proto_other
,
108 [IPPROTO_ICMP
] = &ct_proto_icmp4
,
109 [IPPROTO_ICMPV6
] = &ct_proto_icmp6
,
112 long long ct_timeout_val
[] = {
113 #define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
118 /* If the total number of connections goes above this value, no new connections
119 * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
120 #define DEFAULT_N_CONN_LIMIT 3000000
122 /* Does a member by member comparison of two conn_keys; this
123 * function must be kept in sync with struct conn_key; returns 0
124 * if the keys are equal or 1 if the keys are not equal. */
126 conn_key_cmp(const struct conn_key
*key1
, const struct conn_key
*key2
)
128 if (!memcmp(&key1
->src
.addr
, &key2
->src
.addr
, sizeof key1
->src
.addr
) &&
129 !memcmp(&key1
->dst
.addr
, &key2
->dst
.addr
, sizeof key1
->dst
.addr
) &&
130 (key1
->src
.icmp_id
== key2
->src
.icmp_id
) &&
131 (key1
->src
.icmp_type
== key2
->src
.icmp_type
) &&
132 (key1
->src
.icmp_code
== key2
->src
.icmp_code
) &&
133 (key1
->dst
.icmp_id
== key2
->dst
.icmp_id
) &&
134 (key1
->dst
.icmp_type
== key2
->dst
.icmp_type
) &&
135 (key1
->dst
.icmp_code
== key2
->dst
.icmp_code
) &&
136 (key1
->dl_type
== key2
->dl_type
) &&
137 (key1
->zone
== key2
->zone
) &&
138 (key1
->nw_proto
== key2
->nw_proto
)) {
145 /* Initializes the connection tracker 'ct'. The caller is responsible for
146 * calling 'conntrack_destroy()', when the instance is not needed anymore */
148 conntrack_init(struct conntrack
*ct
)
151 long long now
= time_msec();
153 ct_rwlock_init(&ct
->resources_lock
);
154 ct_rwlock_wrlock(&ct
->resources_lock
);
155 hmap_init(&ct
->nat_conn_keys
);
156 ct_rwlock_unlock(&ct
->resources_lock
);
158 for (i
= 0; i
< CONNTRACK_BUCKETS
; i
++) {
159 struct conntrack_bucket
*ctb
= &ct
->buckets
[i
];
161 ct_lock_init(&ctb
->lock
);
162 ct_lock_lock(&ctb
->lock
);
163 hmap_init(&ctb
->connections
);
164 for (j
= 0; j
< ARRAY_SIZE(ctb
->exp_lists
); j
++) {
165 ovs_list_init(&ctb
->exp_lists
[j
]);
167 ct_lock_unlock(&ctb
->lock
);
168 ovs_mutex_init(&ctb
->cleanup_mutex
);
169 ovs_mutex_lock(&ctb
->cleanup_mutex
);
170 ctb
->next_cleanup
= now
+ CT_TM_MIN
;
171 ovs_mutex_unlock(&ctb
->cleanup_mutex
);
173 ct
->hash_basis
= random_uint32();
174 atomic_count_init(&ct
->n_conn
, 0);
175 atomic_init(&ct
->n_conn_limit
, DEFAULT_N_CONN_LIMIT
);
176 latch_init(&ct
->clean_thread_exit
);
177 ct
->clean_thread
= ovs_thread_create("ct_clean", clean_thread_main
, ct
);
180 /* Destroys the connection tracker 'ct' and frees all the allocated memory. */
182 conntrack_destroy(struct conntrack
*ct
)
186 latch_set(&ct
->clean_thread_exit
);
187 pthread_join(ct
->clean_thread
, NULL
);
188 latch_destroy(&ct
->clean_thread_exit
);
189 for (i
= 0; i
< CONNTRACK_BUCKETS
; i
++) {
190 struct conntrack_bucket
*ctb
= &ct
->buckets
[i
];
193 ovs_mutex_destroy(&ctb
->cleanup_mutex
);
194 ct_lock_lock(&ctb
->lock
);
195 HMAP_FOR_EACH_POP(conn
, node
, &ctb
->connections
) {
196 if (conn
->conn_type
== CT_CONN_TYPE_DEFAULT
) {
197 atomic_count_dec(&ct
->n_conn
);
201 hmap_destroy(&ctb
->connections
);
202 ct_lock_unlock(&ctb
->lock
);
203 ct_lock_destroy(&ctb
->lock
);
205 ct_rwlock_wrlock(&ct
->resources_lock
);
206 struct nat_conn_key_node
*nat_conn_key_node
;
207 HMAP_FOR_EACH_POP (nat_conn_key_node
, node
, &ct
->nat_conn_keys
) {
208 free(nat_conn_key_node
);
210 hmap_destroy(&ct
->nat_conn_keys
);
211 ct_rwlock_unlock(&ct
->resources_lock
);
212 ct_rwlock_destroy(&ct
->resources_lock
);
215 static unsigned hash_to_bucket(uint32_t hash
)
217 /* Extracts the most significant bits in hash. The least significant bits
218 * are already used internally by the hmap implementation. */
219 BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT
< 32 && CONNTRACK_BUCKETS_SHIFT
>= 1);
221 return (hash
>> (32 - CONNTRACK_BUCKETS_SHIFT
)) % CONNTRACK_BUCKETS
;
225 write_ct_md(struct dp_packet
*pkt
, uint16_t zone
, const struct conn
*conn
,
226 const struct conn_key
*key
)
228 pkt
->md
.ct_state
|= CS_TRACKED
;
229 pkt
->md
.ct_zone
= zone
;
230 pkt
->md
.ct_mark
= conn
? conn
->mark
: 0;
231 pkt
->md
.ct_label
= conn
? conn
->label
: OVS_U128_ZERO
;
233 /* Use the original direction tuple if we have it. */
237 pkt
->md
.ct_orig_tuple_ipv6
= false;
239 if (key
->dl_type
== htons(ETH_TYPE_IP
)) {
240 pkt
->md
.ct_orig_tuple
.ipv4
= (struct ovs_key_ct_tuple_ipv4
) {
241 key
->src
.addr
.ipv4_aligned
,
242 key
->dst
.addr
.ipv4_aligned
,
243 key
->nw_proto
!= IPPROTO_ICMP
244 ? key
->src
.port
: htons(key
->src
.icmp_type
),
245 key
->nw_proto
!= IPPROTO_ICMP
246 ? key
->dst
.port
: htons(key
->src
.icmp_code
),
250 pkt
->md
.ct_orig_tuple_ipv6
= true;
251 pkt
->md
.ct_orig_tuple
.ipv6
= (struct ovs_key_ct_tuple_ipv6
) {
252 key
->src
.addr
.ipv6_aligned
,
253 key
->dst
.addr
.ipv6_aligned
,
254 key
->nw_proto
!= IPPROTO_ICMPV6
255 ? key
->src
.port
: htons(key
->src
.icmp_type
),
256 key
->nw_proto
!= IPPROTO_ICMPV6
257 ? key
->dst
.port
: htons(key
->src
.icmp_code
),
262 memset(&pkt
->md
.ct_orig_tuple
, 0, sizeof pkt
->md
.ct_orig_tuple
);
268 pat_packet(struct dp_packet
*pkt
, const struct conn
*conn
)
270 if (conn
->nat_info
->nat_action
& NAT_ACTION_SRC
) {
271 if (conn
->key
.nw_proto
== IPPROTO_TCP
) {
272 struct tcp_header
*th
= dp_packet_l4(pkt
);
273 packet_set_tcp_port(pkt
, conn
->rev_key
.dst
.port
, th
->tcp_dst
);
274 } else if (conn
->key
.nw_proto
== IPPROTO_UDP
) {
275 struct udp_header
*uh
= dp_packet_l4(pkt
);
276 packet_set_udp_port(pkt
, conn
->rev_key
.dst
.port
, uh
->udp_dst
);
278 } else if (conn
->nat_info
->nat_action
& NAT_ACTION_DST
) {
279 if (conn
->key
.nw_proto
== IPPROTO_TCP
) {
280 struct tcp_header
*th
= dp_packet_l4(pkt
);
281 packet_set_tcp_port(pkt
, th
->tcp_src
, conn
->rev_key
.src
.port
);
282 } else if (conn
->key
.nw_proto
== IPPROTO_UDP
) {
283 struct udp_header
*uh
= dp_packet_l4(pkt
);
284 packet_set_udp_port(pkt
, uh
->udp_src
, conn
->rev_key
.src
.port
);
290 nat_packet(struct dp_packet
*pkt
, const struct conn
*conn
, bool related
)
292 if (conn
->nat_info
->nat_action
& NAT_ACTION_SRC
) {
293 pkt
->md
.ct_state
|= CS_SRC_NAT
;
294 if (conn
->key
.dl_type
== htons(ETH_TYPE_IP
)) {
295 struct ip_header
*nh
= dp_packet_l3(pkt
);
296 packet_set_ipv4_addr(pkt
, &nh
->ip_src
,
297 conn
->rev_key
.dst
.addr
.ipv4_aligned
);
299 struct ovs_16aligned_ip6_hdr
*nh6
= dp_packet_l3(pkt
);
300 packet_set_ipv6_addr(pkt
, conn
->key
.nw_proto
,
302 &conn
->rev_key
.dst
.addr
.ipv6_aligned
,
306 pat_packet(pkt
, conn
);
308 } else if (conn
->nat_info
->nat_action
& NAT_ACTION_DST
) {
309 pkt
->md
.ct_state
|= CS_DST_NAT
;
310 if (conn
->key
.dl_type
== htons(ETH_TYPE_IP
)) {
311 struct ip_header
*nh
= dp_packet_l3(pkt
);
312 packet_set_ipv4_addr(pkt
, &nh
->ip_dst
,
313 conn
->rev_key
.src
.addr
.ipv4_aligned
);
315 struct ovs_16aligned_ip6_hdr
*nh6
= dp_packet_l3(pkt
);
316 packet_set_ipv6_addr(pkt
, conn
->key
.nw_proto
,
318 &conn
->rev_key
.src
.addr
.ipv6_aligned
,
322 pat_packet(pkt
, conn
);
328 un_pat_packet(struct dp_packet
*pkt
, const struct conn
*conn
)
330 if (conn
->nat_info
->nat_action
& NAT_ACTION_SRC
) {
331 if (conn
->key
.nw_proto
== IPPROTO_TCP
) {
332 struct tcp_header
*th
= dp_packet_l4(pkt
);
333 packet_set_tcp_port(pkt
, th
->tcp_src
, conn
->key
.src
.port
);
334 } else if (conn
->key
.nw_proto
== IPPROTO_UDP
) {
335 struct udp_header
*uh
= dp_packet_l4(pkt
);
336 packet_set_udp_port(pkt
, uh
->udp_src
, conn
->key
.src
.port
);
338 } else if (conn
->nat_info
->nat_action
& NAT_ACTION_DST
) {
339 if (conn
->key
.nw_proto
== IPPROTO_TCP
) {
340 struct tcp_header
*th
= dp_packet_l4(pkt
);
341 packet_set_tcp_port(pkt
, conn
->key
.dst
.port
, th
->tcp_dst
);
342 } else if (conn
->key
.nw_proto
== IPPROTO_UDP
) {
343 struct udp_header
*uh
= dp_packet_l4(pkt
);
344 packet_set_udp_port(pkt
, conn
->key
.dst
.port
, uh
->udp_dst
);
350 reverse_pat_packet(struct dp_packet
*pkt
, const struct conn
*conn
)
352 if (conn
->nat_info
->nat_action
& NAT_ACTION_SRC
) {
353 if (conn
->key
.nw_proto
== IPPROTO_TCP
) {
354 struct tcp_header
*th_in
= dp_packet_l4(pkt
);
355 packet_set_tcp_port(pkt
, conn
->key
.src
.port
,
357 } else if (conn
->key
.nw_proto
== IPPROTO_UDP
) {
358 struct udp_header
*uh_in
= dp_packet_l4(pkt
);
359 packet_set_udp_port(pkt
, conn
->key
.src
.port
,
362 } else if (conn
->nat_info
->nat_action
& NAT_ACTION_DST
) {
363 if (conn
->key
.nw_proto
== IPPROTO_TCP
) {
364 struct tcp_header
*th_in
= dp_packet_l4(pkt
);
365 packet_set_tcp_port(pkt
, th_in
->tcp_src
,
367 } else if (conn
->key
.nw_proto
== IPPROTO_UDP
) {
368 struct udp_header
*uh_in
= dp_packet_l4(pkt
);
369 packet_set_udp_port(pkt
, uh_in
->udp_src
,
376 reverse_nat_packet(struct dp_packet
*pkt
, const struct conn
*conn
)
378 char *tail
= dp_packet_tail(pkt
);
379 char pad
= dp_packet_l2_pad_size(pkt
);
380 struct conn_key inner_key
;
381 const char *inner_l4
= NULL
;
382 uint16_t orig_l3_ofs
= pkt
->l3_ofs
;
383 uint16_t orig_l4_ofs
= pkt
->l4_ofs
;
385 if (conn
->key
.dl_type
== htons(ETH_TYPE_IP
)) {
386 struct ip_header
*nh
= dp_packet_l3(pkt
);
387 struct icmp_header
*icmp
= dp_packet_l4(pkt
);
388 struct ip_header
*inner_l3
= (struct ip_header
*) (icmp
+ 1);
389 extract_l3_ipv4(&inner_key
, inner_l3
, tail
- ((char *)inner_l3
)
390 -pad
, &inner_l4
, false);
392 pkt
->l3_ofs
+= (char *) inner_l3
- (char *) nh
;
393 pkt
->l4_ofs
+= inner_l4
- (char *) icmp
;
395 if (conn
->nat_info
->nat_action
& NAT_ACTION_SRC
) {
396 packet_set_ipv4_addr(pkt
, &inner_l3
->ip_src
,
397 conn
->key
.src
.addr
.ipv4_aligned
);
398 } else if (conn
->nat_info
->nat_action
& NAT_ACTION_DST
) {
399 packet_set_ipv4_addr(pkt
, &inner_l3
->ip_dst
,
400 conn
->key
.dst
.addr
.ipv4_aligned
);
402 reverse_pat_packet(pkt
, conn
);
404 icmp
->icmp_csum
= csum(icmp
, tail
- (char *) icmp
- pad
);
406 struct ovs_16aligned_ip6_hdr
*nh6
= dp_packet_l3(pkt
);
407 struct icmp6_error_header
*icmp6
= dp_packet_l4(pkt
);
408 struct ovs_16aligned_ip6_hdr
*inner_l3_6
=
409 (struct ovs_16aligned_ip6_hdr
*) (icmp6
+ 1);
410 extract_l3_ipv6(&inner_key
, inner_l3_6
,
411 tail
- ((char *)inner_l3_6
) - pad
,
413 pkt
->l3_ofs
+= (char *) inner_l3_6
- (char *) nh6
;
414 pkt
->l4_ofs
+= inner_l4
- (char *) icmp6
;
416 if (conn
->nat_info
->nat_action
& NAT_ACTION_SRC
) {
417 packet_set_ipv6_addr(pkt
, conn
->key
.nw_proto
,
418 inner_l3_6
->ip6_src
.be32
,
419 &conn
->key
.src
.addr
.ipv6_aligned
,
421 } else if (conn
->nat_info
->nat_action
& NAT_ACTION_DST
) {
422 packet_set_ipv6_addr(pkt
, conn
->key
.nw_proto
,
423 inner_l3_6
->ip6_dst
.be32
,
424 &conn
->key
.dst
.addr
.ipv6_aligned
,
427 reverse_pat_packet(pkt
, conn
);
428 uint32_t icmp6_csum
= packet_csum_pseudoheader6(nh6
);
429 icmp6
->icmp6_base
.icmp6_cksum
= 0;
430 icmp6
->icmp6_base
.icmp6_cksum
= csum_finish(
431 csum_continue(icmp6_csum
, icmp6
, tail
- (char *) icmp6
- pad
));
433 pkt
->l3_ofs
= orig_l3_ofs
;
434 pkt
->l4_ofs
= orig_l4_ofs
;
438 un_nat_packet(struct dp_packet
*pkt
, const struct conn
*conn
,
441 if (conn
->nat_info
->nat_action
& NAT_ACTION_SRC
) {
442 pkt
->md
.ct_state
|= CS_DST_NAT
;
443 if (conn
->key
.dl_type
== htons(ETH_TYPE_IP
)) {
444 struct ip_header
*nh
= dp_packet_l3(pkt
);
445 packet_set_ipv4_addr(pkt
, &nh
->ip_dst
,
446 conn
->key
.src
.addr
.ipv4_aligned
);
448 struct ovs_16aligned_ip6_hdr
*nh6
= dp_packet_l3(pkt
);
449 packet_set_ipv6_addr(pkt
, conn
->key
.nw_proto
,
451 &conn
->key
.src
.addr
.ipv6_aligned
, true);
454 if (OVS_UNLIKELY(related
)) {
455 reverse_nat_packet(pkt
, conn
);
457 un_pat_packet(pkt
, conn
);
459 } else if (conn
->nat_info
->nat_action
& NAT_ACTION_DST
) {
460 pkt
->md
.ct_state
|= CS_SRC_NAT
;
461 if (conn
->key
.dl_type
== htons(ETH_TYPE_IP
)) {
462 struct ip_header
*nh
= dp_packet_l3(pkt
);
463 packet_set_ipv4_addr(pkt
, &nh
->ip_src
,
464 conn
->key
.dst
.addr
.ipv4_aligned
);
466 struct ovs_16aligned_ip6_hdr
*nh6
= dp_packet_l3(pkt
);
467 packet_set_ipv6_addr(pkt
, conn
->key
.nw_proto
,
469 &conn
->key
.dst
.addr
.ipv6_aligned
, true);
472 if (OVS_UNLIKELY(related
)) {
473 reverse_nat_packet(pkt
, conn
);
475 un_pat_packet(pkt
, conn
);
480 /* Typical usage of this helper is in non per-packet code;
481 * this is because the bucket lock needs to be held for lookup
482 * and a hash would have already been needed. Hence, this function
483 * is just intended for code clarity. */
485 conn_lookup(struct conntrack
*ct
, struct conn_key
*key
, long long now
)
487 struct conn_lookup_ctx ctx
;
490 ctx
.hash
= conn_key_hash(key
, ct
->hash_basis
);
491 unsigned bucket
= hash_to_bucket(ctx
.hash
);
492 conn_key_lookup(&ct
->buckets
[bucket
], &ctx
, now
);
497 nat_clean(struct conntrack
*ct
, struct conn
*conn
,
498 struct conntrack_bucket
*ctb
)
499 OVS_REQUIRES(ctb
->lock
)
501 long long now
= time_msec();
502 ct_rwlock_wrlock(&ct
->resources_lock
);
503 nat_conn_keys_remove(&ct
->nat_conn_keys
, &conn
->rev_key
, ct
->hash_basis
);
504 ct_rwlock_unlock(&ct
->resources_lock
);
505 ct_lock_unlock(&ctb
->lock
);
507 uint32_t hash_rev_conn
= conn_key_hash(&conn
->rev_key
, ct
->hash_basis
);
508 unsigned bucket_rev_conn
= hash_to_bucket(hash_rev_conn
);
510 ct_lock_lock(&ct
->buckets
[bucket_rev_conn
].lock
);
511 ct_rwlock_wrlock(&ct
->resources_lock
);
513 struct conn
*rev_conn
= conn_lookup(ct
, &conn
->rev_key
, now
);
515 struct nat_conn_key_node
*nat_conn_key_node
=
516 nat_conn_keys_lookup(&ct
->nat_conn_keys
, &conn
->rev_key
,
519 /* In the unlikely event, rev conn was recreated, then skip
520 * rev_conn cleanup. */
521 if (rev_conn
&& (!nat_conn_key_node
||
522 conn_key_cmp(&nat_conn_key_node
->value
,
523 &rev_conn
->rev_key
))) {
524 hmap_remove(&ct
->buckets
[bucket_rev_conn
].connections
,
530 ct_rwlock_unlock(&ct
->resources_lock
);
531 ct_lock_unlock(&ct
->buckets
[bucket_rev_conn
].lock
);
532 ct_lock_lock(&ctb
->lock
);
536 conn_clean(struct conntrack
*ct
, struct conn
*conn
,
537 struct conntrack_bucket
*ctb
)
538 OVS_REQUIRES(ctb
->lock
)
540 ovs_list_remove(&conn
->exp_node
);
541 hmap_remove(&ctb
->connections
, &conn
->node
);
542 atomic_count_dec(&ct
->n_conn
);
543 if (conn
->nat_info
) {
544 nat_clean(ct
, conn
, ctb
);
550 /* This function is called with the bucket lock held. */
552 conn_not_found(struct conntrack
*ct
, struct dp_packet
*pkt
,
553 struct conn_lookup_ctx
*ctx
, bool commit
, long long now
,
554 const struct nat_action_info_t
*nat_action_info
,
555 struct conn
*conn_for_un_nat_copy
)
557 unsigned bucket
= hash_to_bucket(ctx
->hash
);
558 struct conn
*nc
= NULL
;
560 if (!valid_new(pkt
, &ctx
->key
)) {
561 pkt
->md
.ct_state
= CS_INVALID
;
564 pkt
->md
.ct_state
= CS_NEW
;
567 unsigned int n_conn_limit
;
569 atomic_read_relaxed(&ct
->n_conn_limit
, &n_conn_limit
);
571 if (atomic_count_get(&ct
->n_conn
) >= n_conn_limit
) {
572 COVERAGE_INC(conntrack_full
);
576 nc
= new_conn(&ct
->buckets
[bucket
], pkt
, &ctx
->key
, now
);
578 nc
->rev_key
= nc
->key
;
579 conn_key_reverse(&nc
->rev_key
);
581 if (nat_action_info
) {
582 nc
->nat_info
= xmemdup(nat_action_info
, sizeof *nc
->nat_info
);
583 ct_rwlock_wrlock(&ct
->resources_lock
);
585 bool nat_res
= nat_select_range_tuple(ct
, nc
,
586 conn_for_un_nat_copy
);
592 ct_rwlock_unlock(&ct
->resources_lock
);
596 if (conn_for_un_nat_copy
&&
597 nc
->conn_type
== CT_CONN_TYPE_DEFAULT
) {
598 *nc
= *conn_for_un_nat_copy
;
599 conn_for_un_nat_copy
->conn_type
= CT_CONN_TYPE_UN_NAT
;
600 conn_for_un_nat_copy
->nat_info
= NULL
;
602 ct_rwlock_unlock(&ct
->resources_lock
);
604 nat_packet(pkt
, nc
, ctx
->icmp_related
);
606 hmap_insert(&ct
->buckets
[bucket
].connections
, &nc
->node
, ctx
->hash
);
607 atomic_count_inc(&ct
->n_conn
);
613 conn_update_state(struct conntrack
*ct
, struct dp_packet
*pkt
,
614 struct conn_lookup_ctx
*ctx
, struct conn
**conn
,
615 long long now
, unsigned bucket
)
616 OVS_REQUIRES(ct
->buckets
[bucket
].lock
)
618 bool create_new_conn
= false;
620 if (ctx
->icmp_related
) {
621 pkt
->md
.ct_state
|= CS_RELATED
;
623 pkt
->md
.ct_state
|= CS_REPLY_DIR
;
626 enum ct_update_res res
= conn_update(*conn
, &ct
->buckets
[bucket
],
627 pkt
, ctx
->reply
, now
);
630 case CT_UPDATE_VALID
:
631 pkt
->md
.ct_state
|= CS_ESTABLISHED
;
632 pkt
->md
.ct_state
&= ~CS_NEW
;
634 pkt
->md
.ct_state
|= CS_REPLY_DIR
;
637 case CT_UPDATE_INVALID
:
638 pkt
->md
.ct_state
= CS_INVALID
;
641 conn_clean(ct
, *conn
, &ct
->buckets
[bucket
]);
642 create_new_conn
= true;
648 return create_new_conn
;
652 create_un_nat_conn(struct conntrack
*ct
, struct conn
*conn_for_un_nat_copy
,
655 struct conn
*nc
= xmemdup(conn_for_un_nat_copy
, sizeof *nc
);
656 nc
->key
= conn_for_un_nat_copy
->rev_key
;
657 nc
->rev_key
= conn_for_un_nat_copy
->key
;
658 uint32_t un_nat_hash
= conn_key_hash(&nc
->key
, ct
->hash_basis
);
659 unsigned un_nat_conn_bucket
= hash_to_bucket(un_nat_hash
);
660 ct_lock_lock(&ct
->buckets
[un_nat_conn_bucket
].lock
);
661 ct_rwlock_rdlock(&ct
->resources_lock
);
663 struct conn
*rev_conn
= conn_lookup(ct
, &nc
->key
, now
);
665 struct nat_conn_key_node
*nat_conn_key_node
=
666 nat_conn_keys_lookup(&ct
->nat_conn_keys
, &nc
->key
, ct
->hash_basis
);
667 if (nat_conn_key_node
668 && !conn_key_cmp(&nat_conn_key_node
->value
, &nc
->rev_key
)
670 hmap_insert(&ct
->buckets
[un_nat_conn_bucket
].connections
,
671 &nc
->node
, un_nat_hash
);
675 ct_rwlock_unlock(&ct
->resources_lock
);
676 ct_lock_unlock(&ct
->buckets
[un_nat_conn_bucket
].lock
);
680 handle_nat(struct dp_packet
*pkt
, struct conn
*conn
,
681 uint16_t zone
, bool reply
, bool related
)
683 if (conn
->nat_info
&&
684 (!(pkt
->md
.ct_state
& (CS_SRC_NAT
| CS_DST_NAT
)) ||
685 (pkt
->md
.ct_state
& (CS_SRC_NAT
| CS_DST_NAT
) &&
686 zone
!= pkt
->md
.ct_zone
))) {
687 if (pkt
->md
.ct_state
& (CS_SRC_NAT
| CS_DST_NAT
)) {
688 pkt
->md
.ct_state
&= ~(CS_SRC_NAT
| CS_DST_NAT
);
691 un_nat_packet(pkt
, conn
, related
);
693 nat_packet(pkt
, conn
, related
);
699 check_orig_tuple(struct conntrack
*ct
, struct dp_packet
*pkt
,
700 struct conn_lookup_ctx
*ctx_in
, long long now
,
701 unsigned *bucket
, struct conn
**conn
,
702 const struct nat_action_info_t
*nat_action_info
)
703 OVS_REQUIRES(ct
->buckets
[*bucket
].lock
)
705 if ((ctx_in
->key
.dl_type
== htons(ETH_TYPE_IP
) &&
706 !pkt
->md
.ct_orig_tuple
.ipv4
.ipv4_proto
) ||
707 (ctx_in
->key
.dl_type
== htons(ETH_TYPE_IPV6
) &&
708 !pkt
->md
.ct_orig_tuple
.ipv6
.ipv6_proto
) ||
709 !(pkt
->md
.ct_state
& (CS_SRC_NAT
| CS_DST_NAT
)) ||
714 ct_lock_unlock(&ct
->buckets
[*bucket
].lock
);
715 struct conn_lookup_ctx ctx
;
716 memset(&ctx
, 0 , sizeof ctx
);
719 if (ctx_in
->key
.dl_type
== htons(ETH_TYPE_IP
)) {
720 ctx
.key
.src
.addr
.ipv4_aligned
= pkt
->md
.ct_orig_tuple
.ipv4
.ipv4_src
;
721 ctx
.key
.dst
.addr
.ipv4_aligned
= pkt
->md
.ct_orig_tuple
.ipv4
.ipv4_dst
;
723 if (ctx_in
->key
.nw_proto
== IPPROTO_ICMP
) {
724 ctx
.key
.src
.icmp_id
= ctx_in
->key
.src
.icmp_id
;
725 ctx
.key
.dst
.icmp_id
= ctx_in
->key
.dst
.icmp_id
;
726 uint16_t src_port
= ntohs(pkt
->md
.ct_orig_tuple
.ipv4
.src_port
);
727 ctx
.key
.src
.icmp_type
= (uint8_t) src_port
;
728 ctx
.key
.dst
.icmp_type
= reverse_icmp_type(ctx
.key
.src
.icmp_type
);
730 ctx
.key
.src
.port
= pkt
->md
.ct_orig_tuple
.ipv4
.src_port
;
731 ctx
.key
.dst
.port
= pkt
->md
.ct_orig_tuple
.ipv4
.dst_port
;
733 ctx
.key
.nw_proto
= pkt
->md
.ct_orig_tuple
.ipv4
.ipv4_proto
;
735 ctx
.key
.src
.addr
.ipv6_aligned
= pkt
->md
.ct_orig_tuple
.ipv6
.ipv6_src
;
736 ctx
.key
.dst
.addr
.ipv6_aligned
= pkt
->md
.ct_orig_tuple
.ipv6
.ipv6_dst
;
738 if (ctx_in
->key
.nw_proto
== IPPROTO_ICMPV6
) {
739 ctx
.key
.src
.icmp_id
= ctx_in
->key
.src
.icmp_id
;
740 ctx
.key
.dst
.icmp_id
= ctx_in
->key
.dst
.icmp_id
;
741 uint16_t src_port
= ntohs(pkt
->md
.ct_orig_tuple
.ipv6
.src_port
);
742 ctx
.key
.src
.icmp_type
= (uint8_t) src_port
;
743 ctx
.key
.dst
.icmp_type
= reverse_icmp6_type(ctx
.key
.src
.icmp_type
);
745 ctx
.key
.src
.port
= pkt
->md
.ct_orig_tuple
.ipv6
.src_port
;
746 ctx
.key
.dst
.port
= pkt
->md
.ct_orig_tuple
.ipv6
.dst_port
;
748 ctx
.key
.nw_proto
= pkt
->md
.ct_orig_tuple
.ipv6
.ipv6_proto
;
751 ctx
.key
.dl_type
= ctx_in
->key
.dl_type
;
752 ctx
.key
.zone
= pkt
->md
.ct_zone
;
754 ctx
.hash
= conn_key_hash(&ctx
.key
, ct
->hash_basis
);
755 *bucket
= hash_to_bucket(ctx
.hash
);
756 ct_lock_lock(&ct
->buckets
[*bucket
].lock
);
757 conn_key_lookup(&ct
->buckets
[*bucket
], &ctx
, now
);
760 return *conn
? true : false;
764 process_one(struct conntrack
*ct
, struct dp_packet
*pkt
,
765 struct conn_lookup_ctx
*ctx
, uint16_t zone
,
766 bool force
, bool commit
, long long now
, const uint32_t *setmark
,
767 const struct ovs_key_ct_labels
*setlabel
,
768 const struct nat_action_info_t
*nat_action_info
)
771 unsigned bucket
= hash_to_bucket(ctx
->hash
);
772 ct_lock_lock(&ct
->buckets
[bucket
].lock
);
773 conn_key_lookup(&ct
->buckets
[bucket
], ctx
, now
);
776 /* Delete found entry if in wrong direction. 'force' implies commit. */
777 if (conn
&& force
&& ctx
->reply
) {
778 conn_clean(ct
, conn
, &ct
->buckets
[bucket
]);
782 if (OVS_LIKELY(conn
)) {
783 if (conn
->conn_type
== CT_CONN_TYPE_UN_NAT
) {
787 struct conn_lookup_ctx ctx2
;
789 ctx2
.key
= conn
->rev_key
;
790 ctx2
.hash
= conn_key_hash(&conn
->rev_key
, ct
->hash_basis
);
792 ct_lock_unlock(&ct
->buckets
[bucket
].lock
);
793 bucket
= hash_to_bucket(ctx2
.hash
);
795 ct_lock_lock(&ct
->buckets
[bucket
].lock
);
796 conn_key_lookup(&ct
->buckets
[bucket
], &ctx2
, now
);
801 /* It is a race condition where conn has timed out and removed
802 * between unlock of the rev_conn and lock of the forward conn;
804 pkt
->md
.ct_state
|= CS_TRACKED
| CS_INVALID
;
805 ct_lock_unlock(&ct
->buckets
[bucket
].lock
);
811 bool create_new_conn
= false;
812 struct conn conn_for_un_nat_copy
;
813 conn_for_un_nat_copy
.conn_type
= CT_CONN_TYPE_DEFAULT
;
814 if (OVS_LIKELY(conn
)) {
815 create_new_conn
= conn_update_state(ct
, pkt
, ctx
, &conn
, now
, bucket
);
816 if (nat_action_info
&& !create_new_conn
) {
817 handle_nat(pkt
, conn
, zone
, ctx
->reply
, ctx
->icmp_related
);
819 } else if (check_orig_tuple(ct
, pkt
, ctx
, now
, &bucket
, &conn
,
821 create_new_conn
= conn_update_state(ct
, pkt
, ctx
, &conn
, now
, bucket
);
823 if (ctx
->icmp_related
) {
824 pkt
->md
.ct_state
= CS_INVALID
;
826 create_new_conn
= true;
830 if (OVS_UNLIKELY(create_new_conn
)) {
831 conn
= conn_not_found(ct
, pkt
, ctx
, commit
, now
, nat_action_info
,
832 &conn_for_un_nat_copy
);
835 write_ct_md(pkt
, zone
, conn
, &ctx
->key
);
836 if (conn
&& setmark
) {
837 set_mark(pkt
, conn
, setmark
[0], setmark
[1]);
840 if (conn
&& setlabel
) {
841 set_label(pkt
, conn
, &setlabel
[0], &setlabel
[1]);
844 ct_lock_unlock(&ct
->buckets
[bucket
].lock
);
846 if (conn_for_un_nat_copy
.conn_type
== CT_CONN_TYPE_UN_NAT
) {
847 create_un_nat_conn(ct
, &conn_for_un_nat_copy
, now
);
851 /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
852 * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
853 * the l3 and and l4 offset properly set.
855 * If 'commit' is true, the packets are allowed to create new entries in the
856 * connection tables. 'setmark', if not NULL, should point to a two
857 * elements array containing a value and a mask to set the connection mark.
858 * 'setlabel' behaves similarly for the connection label.*/
860 conntrack_execute(struct conntrack
*ct
, struct dp_packet_batch
*pkt_batch
,
861 ovs_be16 dl_type
, bool force
, bool commit
, uint16_t zone
,
862 const uint32_t *setmark
,
863 const struct ovs_key_ct_labels
*setlabel
,
865 const struct nat_action_info_t
*nat_action_info
)
867 struct dp_packet
**pkts
= pkt_batch
->packets
;
868 size_t cnt
= pkt_batch
->count
;
869 long long now
= time_msec();
870 struct conn_lookup_ctx ctx
;
873 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(5, 5);
875 VLOG_WARN_RL(&rl
, "ALG helper \"%s\" not supported", helper
);
876 /* Continue without the helper */
879 for (size_t i
= 0; i
< cnt
; i
++) {
880 if (!conn_key_extract(ct
, pkts
[i
], dl_type
, &ctx
, zone
)) {
881 pkts
[i
]->md
.ct_state
= CS_INVALID
;
882 write_ct_md(pkts
[i
], zone
, NULL
, NULL
);
885 process_one(ct
, pkts
[i
], &ctx
, zone
, force
, commit
,
886 now
, setmark
, setlabel
, nat_action_info
);
893 set_mark(struct dp_packet
*pkt
, struct conn
*conn
, uint32_t val
, uint32_t mask
)
895 pkt
->md
.ct_mark
= val
| (pkt
->md
.ct_mark
& ~(mask
));
896 conn
->mark
= pkt
->md
.ct_mark
;
900 set_label(struct dp_packet
*pkt
, struct conn
*conn
,
901 const struct ovs_key_ct_labels
*val
,
902 const struct ovs_key_ct_labels
*mask
)
906 memcpy(&v
, val
, sizeof v
);
907 memcpy(&m
, mask
, sizeof m
);
909 pkt
->md
.ct_label
.u64
.lo
= v
.u64
.lo
910 | (pkt
->md
.ct_label
.u64
.lo
& ~(m
.u64
.lo
));
911 pkt
->md
.ct_label
.u64
.hi
= v
.u64
.hi
912 | (pkt
->md
.ct_label
.u64
.hi
& ~(m
.u64
.hi
));
913 conn
->label
= pkt
->md
.ct_label
;
917 /* Delete the expired connections from 'ctb', up to 'limit'. Returns the
918 * earliest expiration time among the remaining connections in 'ctb'. Returns
919 * LLONG_MAX if 'ctb' is empty. The return value might be smaller than 'now',
920 * if 'limit' is reached */
922 sweep_bucket(struct conntrack
*ct
, struct conntrack_bucket
*ctb
, long long now
,
924 OVS_REQUIRES(ctb
->lock
)
926 struct conn
*conn
, *next
;
927 long long min_expiration
= LLONG_MAX
;
931 for (i
= 0; i
< N_CT_TM
; i
++) {
932 LIST_FOR_EACH_SAFE (conn
, next
, exp_node
, &ctb
->exp_lists
[i
]) {
933 if (conn
->conn_type
== CT_CONN_TYPE_DEFAULT
) {
934 if (!conn_expired(conn
, now
) || count
>= limit
) {
935 min_expiration
= MIN(min_expiration
, conn
->expiration
);
936 if (count
>= limit
) {
937 /* Do not check other lists. */
938 COVERAGE_INC(conntrack_long_cleanup
);
939 return min_expiration
;
943 conn_clean(ct
, conn
, ctb
);
949 return min_expiration
;
952 /* Cleans up old connection entries from 'ct'. Returns the time when the
953 * next expiration might happen. The return value might be smaller than
954 * 'now', meaning that an internal limit has been reached, and some expired
955 * connections have not been deleted. */
957 conntrack_clean(struct conntrack
*ct
, long long now
)
959 long long next_wakeup
= now
+ CT_TM_MIN
;
960 unsigned int n_conn_limit
;
961 size_t clean_count
= 0;
964 atomic_read_relaxed(&ct
->n_conn_limit
, &n_conn_limit
);
966 for (i
= 0; i
< CONNTRACK_BUCKETS
; i
++) {
967 struct conntrack_bucket
*ctb
= &ct
->buckets
[i
];
971 ovs_mutex_lock(&ctb
->cleanup_mutex
);
972 if (ctb
->next_cleanup
> now
) {
976 ct_lock_lock(&ctb
->lock
);
977 prev_count
= hmap_count(&ctb
->connections
);
978 /* If the connections are well distributed among buckets, we want to
979 * limit to 10% of the global limit equally split among buckets. If
980 * the bucket is busier than the others, we limit to 10% of its
982 min_exp
= sweep_bucket(ct
, ctb
, now
,
983 MAX(prev_count
/10, n_conn_limit
/(CONNTRACK_BUCKETS
*10)));
984 clean_count
+= prev_count
- hmap_count(&ctb
->connections
);
987 /* We call hmap_shrink() only if sweep_bucket() managed to delete
988 * every expired connection. */
989 hmap_shrink(&ctb
->connections
);
992 ct_lock_unlock(&ctb
->lock
);
994 ctb
->next_cleanup
= MIN(min_exp
, now
+ CT_TM_MIN
);
997 next_wakeup
= MIN(next_wakeup
, ctb
->next_cleanup
);
998 ovs_mutex_unlock(&ctb
->cleanup_mutex
);
1001 VLOG_DBG("conntrack cleanup %"PRIuSIZE
" entries in %lld msec",
1002 clean_count
, time_msec() - now
);
1009 * We must call conntrack_clean() periodically. conntrack_clean() return
1010 * value gives an hint on when the next cleanup must be done (either because
1011 * there is an actual connection that expires, or because a new connection
1012 * might be created with the minimum timeout).
1014 * The logic below has two goals:
1016 * - We want to reduce the number of wakeups and batch connection cleanup
1017 * when the load is not very high. CT_CLEAN_INTERVAL ensures that if we
1018 * are coping with the current cleanup tasks, then we wait at least
1019 * 5 seconds to do further cleanup.
1021 * - We don't want to keep the buckets locked too long, as we might prevent
1022 * traffic from flowing. CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
1023 * behind, there is at least some 200ms blocks of time when buckets will be
1024 * left alone, so the datapath can operate unhindered.
1026 #define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
1027 #define CT_CLEAN_MIN_INTERVAL 200 /* 0.2 seconds */
1030 clean_thread_main(void *f_
)
1032 struct conntrack
*ct
= f_
;
1034 while (!latch_is_set(&ct
->clean_thread_exit
)) {
1035 long long next_wake
;
1036 long long now
= time_msec();
1038 next_wake
= conntrack_clean(ct
, now
);
1040 if (next_wake
< now
) {
1041 poll_timer_wait_until(now
+ CT_CLEAN_MIN_INTERVAL
);
1043 poll_timer_wait_until(MAX(next_wake
, now
+ CT_CLEAN_INTERVAL
));
1045 latch_wait(&ct
->clean_thread_exit
);
1052 /* Key extraction */
1054 /* The function stores a pointer to the first byte after the header in
1055 * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
1056 * not interested in the header's tail, meaning that the header has
1057 * already been parsed (e.g. by flow_extract): we take this as a hint to
1058 * save a few checks. If 'validate_checksum' is true, the function returns
1059 * false if the IPv4 checksum is invalid. */
1061 extract_l3_ipv4(struct conn_key
*key
, const void *data
, size_t size
,
1062 const char **new_data
, bool validate_checksum
)
1064 const struct ip_header
*ip
= data
;
1068 if (OVS_UNLIKELY(size
< IP_HEADER_LEN
)) {
1073 ip_len
= IP_IHL(ip
->ip_ihl_ver
) * 4;
1076 if (OVS_UNLIKELY(ip_len
< IP_HEADER_LEN
)) {
1079 if (OVS_UNLIKELY(size
< ip_len
)) {
1083 *new_data
= (char *) data
+ ip_len
;
1086 if (IP_IS_FRAGMENT(ip
->ip_frag_off
)) {
1090 if (validate_checksum
&& csum(data
, ip_len
) != 0) {
1094 key
->src
.addr
.ipv4
= ip
->ip_src
;
1095 key
->dst
.addr
.ipv4
= ip
->ip_dst
;
1096 key
->nw_proto
= ip
->ip_proto
;
1101 /* The function stores a pointer to the first byte after the header in
1102 * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
1103 * not interested in the header's tail, meaning that the header has
1104 * already been parsed (e.g. by flow_extract): we take this as a hint to
1105 * save a few checks. */
1107 extract_l3_ipv6(struct conn_key
*key
, const void *data
, size_t size
,
1108 const char **new_data
)
1110 const struct ovs_16aligned_ip6_hdr
*ip6
= data
;
1113 if (OVS_UNLIKELY(size
< sizeof *ip6
)) {
1118 uint8_t nw_proto
= ip6
->ip6_nxt
;
1119 uint8_t nw_frag
= 0;
1122 size
-= sizeof *ip6
;
1124 if (!parse_ipv6_ext_hdrs(&data
, &size
, &nw_proto
, &nw_frag
)) {
1136 key
->src
.addr
.ipv6
= ip6
->ip6_src
;
1137 key
->dst
.addr
.ipv6
= ip6
->ip6_dst
;
1138 key
->nw_proto
= nw_proto
;
1144 checksum_valid(const struct conn_key
*key
, const void *data
, size_t size
,
1149 if (key
->dl_type
== htons(ETH_TYPE_IP
)) {
1150 csum
= packet_csum_pseudoheader(l3
);
1151 } else if (key
->dl_type
== htons(ETH_TYPE_IPV6
)) {
1152 csum
= packet_csum_pseudoheader6(l3
);
1157 csum
= csum_continue(csum
, data
, size
);
1159 return csum_finish(csum
) == 0;
1163 check_l4_tcp(const struct conn_key
*key
, const void *data
, size_t size
,
1164 const void *l3
, bool validate_checksum
)
1166 const struct tcp_header
*tcp
= data
;
1167 if (size
< sizeof *tcp
) {
1171 size_t tcp_len
= TCP_OFFSET(tcp
->tcp_ctl
) * 4;
1172 if (OVS_UNLIKELY(tcp_len
< TCP_HEADER_LEN
|| tcp_len
> size
)) {
1176 return validate_checksum
? checksum_valid(key
, data
, size
, l3
) : true;
1180 check_l4_udp(const struct conn_key
*key
, const void *data
, size_t size
,
1181 const void *l3
, bool validate_checksum
)
1183 const struct udp_header
*udp
= data
;
1184 if (size
< sizeof *udp
) {
1188 size_t udp_len
= ntohs(udp
->udp_len
);
1189 if (OVS_UNLIKELY(udp_len
< UDP_HEADER_LEN
|| udp_len
> size
)) {
1193 /* Validation must be skipped if checksum is 0 on IPv4 packets */
1194 return (udp
->udp_csum
== 0 && key
->dl_type
== htons(ETH_TYPE_IP
))
1195 || (validate_checksum
? checksum_valid(key
, data
, size
, l3
) : true);
1199 check_l4_icmp(const void *data
, size_t size
, bool validate_checksum
)
1201 return validate_checksum
? csum(data
, size
) == 0 : true;
1205 check_l4_icmp6(const struct conn_key
*key
, const void *data
, size_t size
,
1206 const void *l3
, bool validate_checksum
)
1208 return validate_checksum
? checksum_valid(key
, data
, size
, l3
) : true;
1212 extract_l4_tcp(struct conn_key
*key
, const void *data
, size_t size
)
1214 const struct tcp_header
*tcp
= data
;
1216 if (OVS_UNLIKELY(size
< TCP_HEADER_LEN
)) {
1220 key
->src
.port
= tcp
->tcp_src
;
1221 key
->dst
.port
= tcp
->tcp_dst
;
1223 /* Port 0 is invalid */
1224 return key
->src
.port
&& key
->dst
.port
;
1228 extract_l4_udp(struct conn_key
*key
, const void *data
, size_t size
)
1230 const struct udp_header
*udp
= data
;
1232 if (OVS_UNLIKELY(size
< UDP_HEADER_LEN
)) {
1236 key
->src
.port
= udp
->udp_src
;
1237 key
->dst
.port
= udp
->udp_dst
;
1239 /* Port 0 is invalid */
1240 return key
->src
.port
&& key
->dst
.port
;
1243 static inline bool extract_l4(struct conn_key
*key
, const void *data
,
1244 size_t size
, bool *related
, const void *l3
,
1245 bool validate_checksum
);
1248 reverse_icmp_type(uint8_t type
)
1251 case ICMP4_ECHO_REQUEST
:
1252 return ICMP4_ECHO_REPLY
;
1253 case ICMP4_ECHO_REPLY
:
1254 return ICMP4_ECHO_REQUEST
;
1256 case ICMP4_TIMESTAMP
:
1257 return ICMP4_TIMESTAMPREPLY
;
1258 case ICMP4_TIMESTAMPREPLY
:
1259 return ICMP4_TIMESTAMP
;
1261 case ICMP4_INFOREQUEST
:
1262 return ICMP4_INFOREPLY
;
1263 case ICMP4_INFOREPLY
:
1264 return ICMP4_INFOREQUEST
;
1270 /* If 'related' is not NULL and the function is processing an ICMP
1271 * error packet, extract the l3 and l4 fields from the nested header
1272 * instead and set *related to true. If 'related' is NULL we're
1273 * already processing a nested header and no such recursion is
1276 extract_l4_icmp(struct conn_key
*key
, const void *data
, size_t size
,
1279 const struct icmp_header
*icmp
= data
;
1281 if (OVS_UNLIKELY(size
< ICMP_HEADER_LEN
)) {
1285 switch (icmp
->icmp_type
) {
1286 case ICMP4_ECHO_REQUEST
:
1287 case ICMP4_ECHO_REPLY
:
1288 case ICMP4_TIMESTAMP
:
1289 case ICMP4_TIMESTAMPREPLY
:
1290 case ICMP4_INFOREQUEST
:
1291 case ICMP4_INFOREPLY
:
1292 if (icmp
->icmp_code
!= 0) {
1295 /* Separate ICMP connection: identified using id */
1296 key
->src
.icmp_id
= key
->dst
.icmp_id
= icmp
->icmp_fields
.echo
.id
;
1297 key
->src
.icmp_type
= icmp
->icmp_type
;
1298 key
->dst
.icmp_type
= reverse_icmp_type(icmp
->icmp_type
);
1300 case ICMP4_DST_UNREACH
:
1301 case ICMP4_TIME_EXCEEDED
:
1302 case ICMP4_PARAM_PROB
:
1303 case ICMP4_SOURCEQUENCH
:
1304 case ICMP4_REDIRECT
: {
1305 /* ICMP packet part of another connection. We should
1306 * extract the key from embedded packet header */
1307 struct conn_key inner_key
;
1308 const char *l3
= (const char *) (icmp
+ 1);
1309 const char *tail
= (const char *) data
+ size
;
1317 memset(&inner_key
, 0, sizeof inner_key
);
1318 inner_key
.dl_type
= htons(ETH_TYPE_IP
);
1319 ok
= extract_l3_ipv4(&inner_key
, l3
, tail
- l3
, &l4
, false);
1324 if (inner_key
.src
.addr
.ipv4_aligned
!= key
->dst
.addr
.ipv4_aligned
1325 || inner_key
.dst
.addr
.ipv4_aligned
!= key
->src
.addr
.ipv4_aligned
) {
1329 key
->src
= inner_key
.src
;
1330 key
->dst
= inner_key
.dst
;
1331 key
->nw_proto
= inner_key
.nw_proto
;
1333 ok
= extract_l4(key
, l4
, tail
- l4
, NULL
, l3
, false);
1335 conn_key_reverse(key
);
/* Maps an ICMPv6 echo request type to the reply type and vice versa.
 * Callers must only pass the two echo types. */
static uint8_t
reverse_icmp6_type(uint8_t type)
{
    switch (type) {
    case ICMP6_ECHO_REQUEST:
        return ICMP6_ECHO_REPLY;
    case ICMP6_ECHO_REPLY:
        return ICMP6_ECHO_REQUEST;
    default:
        OVS_NOT_REACHED();
    }
}
1360 /* If 'related' is not NULL and the function is processing an ICMP
1361 * error packet, extract the l3 and l4 fields from the nested header
1362 * instead and set *related to true. If 'related' is NULL we're
1363 * already processing a nested header and no such recursion is
1366 extract_l4_icmp6(struct conn_key
*key
, const void *data
, size_t size
,
1369 const struct icmp6_header
*icmp6
= data
;
1371 /* All the messages that we support need at least 4 bytes after
1373 if (size
< sizeof *icmp6
+ 4) {
1377 switch (icmp6
->icmp6_type
) {
1378 case ICMP6_ECHO_REQUEST
:
1379 case ICMP6_ECHO_REPLY
:
1380 if (icmp6
->icmp6_code
!= 0) {
1383 /* Separate ICMP connection: identified using id */
1384 key
->src
.icmp_id
= key
->dst
.icmp_id
= *(ovs_be16
*) (icmp6
+ 1);
1385 key
->src
.icmp_type
= icmp6
->icmp6_type
;
1386 key
->dst
.icmp_type
= reverse_icmp6_type(icmp6
->icmp6_type
);
1388 case ICMP6_DST_UNREACH
:
1389 case ICMP6_PACKET_TOO_BIG
:
1390 case ICMP6_TIME_EXCEEDED
:
1391 case ICMP6_PARAM_PROB
: {
1392 /* ICMP packet part of another connection. We should
1393 * extract the key from embedded packet header */
1394 struct conn_key inner_key
;
1395 const char *l3
= (const char *) icmp6
+ 8;
1396 const char *tail
= (const char *) data
+ size
;
1397 const char *l4
= NULL
;
1404 memset(&inner_key
, 0, sizeof inner_key
);
1405 inner_key
.dl_type
= htons(ETH_TYPE_IPV6
);
1406 ok
= extract_l3_ipv6(&inner_key
, l3
, tail
- l3
, &l4
);
1411 /* pf doesn't do this, but it seems a good idea */
1412 if (!ipv6_addr_equals(&inner_key
.src
.addr
.ipv6_aligned
,
1413 &key
->dst
.addr
.ipv6_aligned
)
1414 || !ipv6_addr_equals(&inner_key
.dst
.addr
.ipv6_aligned
,
1415 &key
->src
.addr
.ipv6_aligned
)) {
1419 key
->src
= inner_key
.src
;
1420 key
->dst
= inner_key
.dst
;
1421 key
->nw_proto
= inner_key
.nw_proto
;
1423 ok
= extract_l4(key
, l4
, tail
- l4
, NULL
, l3
, false);
1425 conn_key_reverse(key
);
1437 /* Extract l4 fields into 'key', which must already contain valid l3
1440 * If 'related' is not NULL and an ICMP error packet is being
1441 * processed, the function will extract the key from the packet nested
1442 * in the ICMP payload and set '*related' to true.
1444 * If 'related' is NULL, it means that we're already parsing a header nested
1445 * in an ICMP error. In this case, we skip checksum and length validation. */
1447 extract_l4(struct conn_key
*key
, const void *data
, size_t size
, bool *related
,
1448 const void *l3
, bool validate_checksum
)
1450 if (key
->nw_proto
== IPPROTO_TCP
) {
1451 return (!related
|| check_l4_tcp(key
, data
, size
, l3
,
1452 validate_checksum
)) && extract_l4_tcp(key
, data
, size
);
1453 } else if (key
->nw_proto
== IPPROTO_UDP
) {
1454 return (!related
|| check_l4_udp(key
, data
, size
, l3
,
1455 validate_checksum
)) && extract_l4_udp(key
, data
, size
);
1456 } else if (key
->dl_type
== htons(ETH_TYPE_IP
)
1457 && key
->nw_proto
== IPPROTO_ICMP
) {
1458 return (!related
|| check_l4_icmp(data
, size
, validate_checksum
))
1459 && extract_l4_icmp(key
, data
, size
, related
);
1460 } else if (key
->dl_type
== htons(ETH_TYPE_IPV6
)
1461 && key
->nw_proto
== IPPROTO_ICMPV6
) {
1462 return (!related
|| check_l4_icmp6(key
, data
, size
, l3
,
1463 validate_checksum
)) && extract_l4_icmp6(key
, data
, size
,
1471 conn_key_extract(struct conntrack
*ct
, struct dp_packet
*pkt
, ovs_be16 dl_type
,
1472 struct conn_lookup_ctx
*ctx
, uint16_t zone
)
1474 const struct eth_header
*l2
= dp_packet_eth(pkt
);
1475 const struct ip_header
*l3
= dp_packet_l3(pkt
);
1476 const char *l4
= dp_packet_l4(pkt
);
1477 const char *tail
= dp_packet_tail(pkt
);
1480 memset(ctx
, 0, sizeof *ctx
);
1482 if (!l2
|| !l3
|| !l4
) {
1486 ctx
->key
.zone
= zone
;
1488 /* XXX In this function we parse the packet (again, it has already
1489 * gone through miniflow_extract()) for two reasons:
1491 * 1) To extract the l3 addresses and l4 ports.
1492 * We already have the l3 and l4 headers' pointers. Extracting
1493 * the l3 addresses and the l4 ports is really cheap, since they
1494 * can be found at fixed locations.
1495 * 2) To extract the l4 type.
1496 * Extracting the l4 types, for IPv6 can be quite expensive, because
1497 * it's not at a fixed location.
1499 * Here's a way to avoid (2) with the help of the datapath.
1500 * The datapath doesn't keep the packet's extracted flow[1], so
1501 * using that is not an option. We could use the packet's matching
1502 * megaflow, but we have to make sure that the l4 type (nw_proto)
1503 * is unwildcarded. This means either:
1505 * a) dpif-netdev unwildcards the l4 type when a new flow is installed
1506 * if the actions contains ct().
1508 * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
1509 * action. This is already done in different actions, but it's
1510 * unnecessary for the kernel.
1513 * [1] The reasons for this are that keeping the flow increases
1514 * (slightly) the cache footprint and increases computation
1515 * time as we move the packet around. Most importantly, the flow
1516 * should be updated by the actions and this can be slow, as
1517 * we use a sparse representation (miniflow).
1520 ctx
->key
.dl_type
= dl_type
;
1521 if (ctx
->key
.dl_type
== htons(ETH_TYPE_IP
)) {
1522 bool hwol_bad_l3_csum
= dp_packet_ip_checksum_bad(pkt
);
1523 if (hwol_bad_l3_csum
) {
1526 bool hwol_good_l3_csum
= dp_packet_ip_checksum_valid(pkt
);
1527 /* Validate the checksum only when hwol is not supported. */
1528 ok
= extract_l3_ipv4(&ctx
->key
, l3
, tail
- (char *) l3
, NULL
,
1529 !hwol_good_l3_csum
);
1531 } else if (ctx
->key
.dl_type
== htons(ETH_TYPE_IPV6
)) {
1532 ok
= extract_l3_ipv6(&ctx
->key
, l3
, tail
- (char *) l3
, NULL
);
1539 bool hwol_bad_l4_csum
= dp_packet_l4_checksum_bad(pkt
);
1540 if (!hwol_bad_l4_csum
) {
1541 bool hwol_good_l4_csum
= dp_packet_l4_checksum_valid(pkt
);
1542 /* Validate the checksum only when hwol is not supported. */
1543 if (extract_l4(&ctx
->key
, l4
, tail
- l4
, &ctx
->icmp_related
, l3
,
1544 !hwol_good_l4_csum
)) {
1545 ctx
->hash
= conn_key_hash(&ctx
->key
, ct
->hash_basis
);
1555 ct_addr_hash_add(uint32_t hash
, const struct ct_addr
*addr
)
1557 BUILD_ASSERT_DECL(sizeof *addr
% 4 == 0);
1558 return hash_add_bytes32(hash
, (const uint32_t *) addr
, sizeof *addr
);
1562 ct_endpoint_hash_add(uint32_t hash
, const struct ct_endpoint
*ep
)
1564 BUILD_ASSERT_DECL(sizeof *ep
% 4 == 0);
1565 return hash_add_bytes32(hash
, (const uint32_t *) ep
, sizeof *ep
);
1570 conn_key_hash(const struct conn_key
*key
, uint32_t basis
)
1572 uint32_t hsrc
, hdst
, hash
;
1574 hsrc
= hdst
= basis
;
1575 hsrc
= ct_endpoint_hash_add(hsrc
, &key
->src
);
1576 hdst
= ct_endpoint_hash_add(hdst
, &key
->dst
);
1578 /* Even if source and destination are swapped the hash will be the same. */
1581 /* Hash the rest of the key(L3 and L4 types and zone). */
1582 hash
= hash_words((uint32_t *) (&key
->dst
+ 1),
1583 (uint32_t *) (key
+ 1) - (uint32_t *) (&key
->dst
+ 1),
1586 return hash_finish(hash
, 0);
1590 conn_key_reverse(struct conn_key
*key
)
1592 struct ct_endpoint tmp
;
1595 key
->src
= key
->dst
;
1600 nat_ipv6_addrs_delta(struct in6_addr
*ipv6_aligned_min
,
1601 struct in6_addr
*ipv6_aligned_max
)
1603 uint8_t *ipv6_min_hi
= &ipv6_aligned_min
->s6_addr
[0];
1604 uint8_t *ipv6_min_lo
= &ipv6_aligned_min
->s6_addr
[0] + sizeof(uint64_t);
1605 uint8_t *ipv6_max_hi
= &ipv6_aligned_max
->s6_addr
[0];
1606 uint8_t *ipv6_max_lo
= &ipv6_aligned_max
->s6_addr
[0] + sizeof(uint64_t);
1608 ovs_be64 addr6_64_min_hi
;
1609 ovs_be64 addr6_64_min_lo
;
1610 memcpy(&addr6_64_min_hi
, ipv6_min_hi
, sizeof addr6_64_min_hi
);
1611 memcpy(&addr6_64_min_lo
, ipv6_min_lo
, sizeof addr6_64_min_lo
);
1613 ovs_be64 addr6_64_max_hi
;
1614 ovs_be64 addr6_64_max_lo
;
1615 memcpy(&addr6_64_max_hi
, ipv6_max_hi
, sizeof addr6_64_max_hi
);
1616 memcpy(&addr6_64_max_lo
, ipv6_max_lo
, sizeof addr6_64_max_lo
);
1619 if (addr6_64_min_hi
== addr6_64_max_hi
&&
1620 ntohll(addr6_64_min_lo
) <= ntohll(addr6_64_max_lo
)) {
1621 diff
= ntohll(addr6_64_max_lo
) - ntohll(addr6_64_min_lo
);
1622 } else if (ntohll(addr6_64_min_hi
) + 1 == ntohll(addr6_64_max_hi
) &&
1623 ntohll(addr6_64_min_lo
) > ntohll(addr6_64_max_lo
)) {
1624 diff
= UINT64_MAX
- (ntohll(addr6_64_min_lo
) -
1625 ntohll(addr6_64_max_lo
) - 1);
1627 /* Limit address delta supported to 32 bits or 4 billion approximately.
1628 * Possibly, this should be visible to the user through a datapath
1629 * support check, however the practical impact is probably nil. */
1632 if (diff
> 0xfffffffe) {
1638 /* This function must be used in tandem with nat_ipv6_addrs_delta(), which
1639 * restricts the input parameters. */
1641 nat_ipv6_addr_increment(struct in6_addr
*ipv6_aligned
, uint32_t increment
)
1643 uint8_t *ipv6_hi
= &ipv6_aligned
->s6_addr
[0];
1644 uint8_t *ipv6_lo
= &ipv6_aligned
->s6_addr
[0] + sizeof(ovs_be64
);
1645 ovs_be64 addr6_64_hi
;
1646 ovs_be64 addr6_64_lo
;
1647 memcpy(&addr6_64_hi
, ipv6_hi
, sizeof addr6_64_hi
);
1648 memcpy(&addr6_64_lo
, ipv6_lo
, sizeof addr6_64_lo
);
1650 if (UINT64_MAX
- increment
>= ntohll(addr6_64_lo
)) {
1651 addr6_64_lo
= htonll(increment
+ ntohll(addr6_64_lo
));
1652 } else if (addr6_64_hi
!= OVS_BE64_MAX
) {
1653 addr6_64_hi
= htonll(1 + ntohll(addr6_64_hi
));
1654 addr6_64_lo
= htonll(increment
- (UINT64_MAX
-
1655 ntohll(addr6_64_lo
) + 1));
1660 memcpy(ipv6_hi
, &addr6_64_hi
, sizeof addr6_64_hi
);
1661 memcpy(ipv6_lo
, &addr6_64_lo
, sizeof addr6_64_lo
);
1667 nat_range_hash(const struct conn
*conn
, uint32_t basis
)
1669 uint32_t hash
= basis
;
1671 hash
= ct_addr_hash_add(hash
, &conn
->nat_info
->min_addr
);
1672 hash
= ct_addr_hash_add(hash
, &conn
->nat_info
->max_addr
);
1673 hash
= hash_add(hash
,
1674 (conn
->nat_info
->max_port
<< 16)
1675 | conn
->nat_info
->min_port
);
1677 hash
= ct_endpoint_hash_add(hash
, &conn
->key
.src
);
1678 hash
= ct_endpoint_hash_add(hash
, &conn
->key
.dst
);
1680 hash
= hash_add(hash
, (OVS_FORCE
uint32_t) conn
->key
.dl_type
);
1681 hash
= hash_add(hash
, conn
->key
.nw_proto
);
1682 hash
= hash_add(hash
, conn
->key
.zone
);
1684 /* The purpose of the second parameter is to distinguish hashes of data of
1685 * different length; our data always has the same length so there is no
1686 * value in counting. */
1687 return hash_finish(hash
, 0);
1691 nat_select_range_tuple(struct conntrack
*ct
, const struct conn
*conn
,
1692 struct conn
*nat_conn
)
1694 #define MIN_NAT_EPHEMERAL_PORT 1024
1695 #define MAX_NAT_EPHEMERAL_PORT 65535
1699 uint16_t first_port
;
1701 uint32_t hash
= nat_range_hash(conn
, ct
->hash_basis
);
1703 if ((conn
->nat_info
->nat_action
& NAT_ACTION_SRC
) &&
1704 (!(conn
->nat_info
->nat_action
& NAT_ACTION_SRC_PORT
))) {
1705 min_port
= ntohs(conn
->key
.src
.port
);
1706 max_port
= ntohs(conn
->key
.src
.port
);
1707 first_port
= min_port
;
1708 } else if ((conn
->nat_info
->nat_action
& NAT_ACTION_DST
) &&
1709 (!(conn
->nat_info
->nat_action
& NAT_ACTION_DST_PORT
))) {
1710 min_port
= ntohs(conn
->key
.dst
.port
);
1711 max_port
= ntohs(conn
->key
.dst
.port
);
1712 first_port
= min_port
;
1714 uint16_t deltap
= conn
->nat_info
->max_port
- conn
->nat_info
->min_port
;
1715 uint32_t port_index
= hash
% (deltap
+ 1);
1716 first_port
= conn
->nat_info
->min_port
+ port_index
;
1717 min_port
= conn
->nat_info
->min_port
;
1718 max_port
= conn
->nat_info
->max_port
;
1721 uint32_t deltaa
= 0;
1722 uint32_t address_index
;
1723 struct ct_addr ct_addr
;
1724 memset(&ct_addr
, 0, sizeof ct_addr
);
1725 struct ct_addr max_ct_addr
;
1726 memset(&max_ct_addr
, 0, sizeof max_ct_addr
);
1727 max_ct_addr
= conn
->nat_info
->max_addr
;
1729 if (conn
->key
.dl_type
== htons(ETH_TYPE_IP
)) {
1730 deltaa
= ntohl(conn
->nat_info
->max_addr
.ipv4_aligned
) -
1731 ntohl(conn
->nat_info
->min_addr
.ipv4_aligned
);
1732 address_index
= hash
% (deltaa
+ 1);
1733 ct_addr
.ipv4_aligned
= htonl(
1734 ntohl(conn
->nat_info
->min_addr
.ipv4_aligned
) + address_index
);
1736 deltaa
= nat_ipv6_addrs_delta(&conn
->nat_info
->min_addr
.ipv6_aligned
,
1737 &conn
->nat_info
->max_addr
.ipv6_aligned
);
1738 /* deltaa must be within 32 bits for full hash coverage. A 64 or
1739 * 128 bit hash is unnecessary and hence not used here. Most code
1740 * is kept common with V4; nat_ipv6_addrs_delta() will do the
1741 * enforcement via max_ct_addr. */
1742 max_ct_addr
= conn
->nat_info
->min_addr
;
1743 nat_ipv6_addr_increment(&max_ct_addr
.ipv6_aligned
, deltaa
);
1745 address_index
= hash
% (deltaa
+ 1);
1746 ct_addr
.ipv6_aligned
= conn
->nat_info
->min_addr
.ipv6_aligned
;
1747 nat_ipv6_addr_increment(&ct_addr
.ipv6_aligned
, address_index
);
1750 uint16_t port
= first_port
;
1751 bool all_ports_tried
= false;
1752 bool original_ports_tried
= false;
1753 struct ct_addr first_addr
= ct_addr
;
1757 if (conn
->nat_info
->nat_action
& NAT_ACTION_SRC
) {
1758 nat_conn
->rev_key
.dst
.addr
= ct_addr
;
1760 nat_conn
->rev_key
.src
.addr
= ct_addr
;
1763 if ((conn
->key
.nw_proto
== IPPROTO_ICMP
) ||
1764 (conn
->key
.nw_proto
== IPPROTO_ICMPV6
)) {
1765 all_ports_tried
= true;
1766 } else if (conn
->nat_info
->nat_action
& NAT_ACTION_SRC
) {
1767 nat_conn
->rev_key
.dst
.port
= htons(port
);
1769 nat_conn
->rev_key
.src
.port
= htons(port
);
1772 struct nat_conn_key_node
*nat_conn_key_node
=
1773 nat_conn_keys_lookup(&ct
->nat_conn_keys
, &nat_conn
->rev_key
,
1776 if (!nat_conn_key_node
) {
1777 struct nat_conn_key_node
*nat_conn_key
=
1778 xzalloc(sizeof *nat_conn_key
);
1779 nat_conn_key
->key
= nat_conn
->rev_key
;
1780 nat_conn_key
->value
= nat_conn
->key
;
1781 uint32_t nat_conn_key_hash
= conn_key_hash(&nat_conn_key
->key
,
1783 hmap_insert(&ct
->nat_conn_keys
, &nat_conn_key
->node
,
1786 } else if (!all_ports_tried
) {
1787 if (min_port
== max_port
) {
1788 all_ports_tried
= true;
1789 } else if (port
== max_port
) {
1794 if (port
== first_port
) {
1795 all_ports_tried
= true;
1798 if (memcmp(&ct_addr
, &max_ct_addr
, sizeof ct_addr
)) {
1799 if (conn
->key
.dl_type
== htons(ETH_TYPE_IP
)) {
1800 ct_addr
.ipv4_aligned
= htonl(
1801 ntohl(ct_addr
.ipv4_aligned
) + 1);
1803 nat_ipv6_addr_increment(&ct_addr
.ipv6_aligned
, 1);
1806 ct_addr
= conn
->nat_info
->min_addr
;
1808 if (!memcmp(&ct_addr
, &first_addr
, sizeof ct_addr
)) {
1809 if (!original_ports_tried
) {
1810 original_ports_tried
= true;
1811 ct_addr
= conn
->nat_info
->min_addr
;
1812 min_port
= MIN_NAT_EPHEMERAL_PORT
;
1813 max_port
= MAX_NAT_EPHEMERAL_PORT
;
1818 first_port
= min_port
;
1820 all_ports_tried
= false;
1826 /* This function must be called with the ct->resources lock taken. */
1827 static struct nat_conn_key_node
*
1828 nat_conn_keys_lookup(struct hmap
*nat_conn_keys
,
1829 const struct conn_key
*key
,
1832 struct nat_conn_key_node
*nat_conn_key_node
;
1833 uint32_t nat_conn_key_hash
= conn_key_hash(key
, basis
);
1835 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node
, node
, nat_conn_key_hash
,
1837 if (!conn_key_cmp(&nat_conn_key_node
->key
, key
)) {
1838 return nat_conn_key_node
;
1844 /* This function must be called with the ct->resources write lock taken. */
1846 nat_conn_keys_remove(struct hmap
*nat_conn_keys
, const struct conn_key
*key
,
1849 struct nat_conn_key_node
*nat_conn_key_node
;
1850 uint32_t nat_conn_key_hash
= conn_key_hash(key
, basis
);
1852 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node
, node
, nat_conn_key_hash
,
1854 if (!conn_key_cmp(&nat_conn_key_node
->key
, key
)) {
1855 hmap_remove(nat_conn_keys
, &nat_conn_key_node
->node
);
1856 free(nat_conn_key_node
);
1863 conn_key_lookup(struct conntrack_bucket
*ctb
, struct conn_lookup_ctx
*ctx
,
1865 OVS_REQUIRES(ctb
->lock
)
1867 uint32_t hash
= ctx
->hash
;
1872 HMAP_FOR_EACH_WITH_HASH (conn
, node
, hash
, &ctb
->connections
) {
1873 if (!conn_key_cmp(&conn
->key
, &ctx
->key
)
1874 && !conn_expired(conn
, now
)) {
1879 if (!conn_key_cmp(&conn
->rev_key
, &ctx
->key
)
1880 && !conn_expired(conn
, now
)) {
1888 static enum ct_update_res
1889 conn_update(struct conn
*conn
, struct conntrack_bucket
*ctb
,
1890 struct dp_packet
*pkt
, bool reply
, long long now
)
1892 return l4_protos
[conn
->key
.nw_proto
]->conn_update(conn
, ctb
, pkt
,
1897 conn_expired(struct conn
*conn
, long long now
)
1899 if (conn
->conn_type
== CT_CONN_TYPE_DEFAULT
) {
1900 return now
>= conn
->expiration
;
1906 valid_new(struct dp_packet
*pkt
, struct conn_key
*key
)
1908 return l4_protos
[key
->nw_proto
]->valid_new(pkt
);
1911 static struct conn
*
1912 new_conn(struct conntrack_bucket
*ctb
, struct dp_packet
*pkt
,
1913 struct conn_key
*key
, long long now
)
1915 struct conn
*newconn
;
1917 newconn
= l4_protos
[key
->nw_proto
]->new_conn(ctb
, pkt
, now
);
1920 newconn
->key
= *key
;
1927 delete_conn(struct conn
*conn
)
1929 free(conn
->nat_info
);
1934 ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr
*a
,
1935 union ct_dpif_inet_addr
*b
,
1938 if (dl_type
== htons(ETH_TYPE_IP
)) {
1939 b
->ip
= a
->ipv4_aligned
;
1940 } else if (dl_type
== htons(ETH_TYPE_IPV6
)){
1941 b
->in6
= a
->ipv6_aligned
;
1946 conn_key_to_tuple(const struct conn_key
*key
, struct ct_dpif_tuple
*tuple
)
1948 if (key
->dl_type
== htons(ETH_TYPE_IP
)) {
1949 tuple
->l3_type
= AF_INET
;
1950 } else if (key
->dl_type
== htons(ETH_TYPE_IPV6
)) {
1951 tuple
->l3_type
= AF_INET6
;
1953 tuple
->ip_proto
= key
->nw_proto
;
1954 ct_endpoint_to_ct_dpif_inet_addr(&key
->src
.addr
, &tuple
->src
,
1956 ct_endpoint_to_ct_dpif_inet_addr(&key
->dst
.addr
, &tuple
->dst
,
1959 if (key
->nw_proto
== IPPROTO_ICMP
|| key
->nw_proto
== IPPROTO_ICMPV6
) {
1960 tuple
->icmp_id
= key
->src
.icmp_id
;
1961 tuple
->icmp_type
= key
->src
.icmp_type
;
1962 tuple
->icmp_code
= key
->src
.icmp_code
;
1964 tuple
->src_port
= key
->src
.port
;
1965 tuple
->dst_port
= key
->dst
.port
;
1970 conn_to_ct_dpif_entry(const struct conn
*conn
, struct ct_dpif_entry
*entry
,
1971 long long now
, int bkt
)
1973 struct ct_l4_proto
*class;
1974 long long expiration
;
1975 memset(entry
, 0, sizeof *entry
);
1976 conn_key_to_tuple(&conn
->key
, &entry
->tuple_orig
);
1977 conn_key_to_tuple(&conn
->rev_key
, &entry
->tuple_reply
);
1979 entry
->zone
= conn
->key
.zone
;
1980 entry
->mark
= conn
->mark
;
1982 memcpy(&entry
->labels
, &conn
->label
, sizeof entry
->labels
);
1983 /* Not implemented yet */
1984 entry
->timestamp
.start
= 0;
1985 entry
->timestamp
.stop
= 0;
1987 expiration
= conn
->expiration
- now
;
1988 entry
->timeout
= (expiration
> 0) ? expiration
/ 1000 : 0;
1990 class = l4_protos
[conn
->key
.nw_proto
];
1991 if (class->conn_get_protoinfo
) {
1992 class->conn_get_protoinfo(conn
, &entry
->protoinfo
);
1998 conntrack_dump_start(struct conntrack
*ct
, struct conntrack_dump
*dump
,
1999 const uint16_t *pzone
, int *ptot_bkts
)
2001 memset(dump
, 0, sizeof(*dump
));
2003 dump
->zone
= *pzone
;
2004 dump
->filter_zone
= true;
2008 *ptot_bkts
= CONNTRACK_BUCKETS
;
2014 conntrack_dump_next(struct conntrack_dump
*dump
, struct ct_dpif_entry
*entry
)
2016 struct conntrack
*ct
= dump
->ct
;
2017 long long now
= time_msec();
2019 while (dump
->bucket
< CONNTRACK_BUCKETS
) {
2020 struct hmap_node
*node
;
2022 ct_lock_lock(&ct
->buckets
[dump
->bucket
].lock
);
2026 node
= hmap_at_position(&ct
->buckets
[dump
->bucket
].connections
,
2031 INIT_CONTAINER(conn
, node
, node
);
2032 if ((!dump
->filter_zone
|| conn
->key
.zone
== dump
->zone
) &&
2033 (conn
->conn_type
!= CT_CONN_TYPE_UN_NAT
)) {
2034 conn_to_ct_dpif_entry(conn
, entry
, now
, dump
->bucket
);
2037 /* Else continue, until we find an entry in the appropriate zone
2038 * or the bucket has been scanned completely. */
2040 ct_lock_unlock(&ct
->buckets
[dump
->bucket
].lock
);
2043 memset(&dump
->bucket_pos
, 0, sizeof dump
->bucket_pos
);
2053 conntrack_dump_done(struct conntrack_dump
*dump OVS_UNUSED
)
2059 conntrack_flush(struct conntrack
*ct
, const uint16_t *zone
)
2063 for (i
= 0; i
< CONNTRACK_BUCKETS
; i
++) {
2064 struct conn
*conn
, *next
;
2066 ct_lock_lock(&ct
->buckets
[i
].lock
);
2067 HMAP_FOR_EACH_SAFE(conn
, next
, node
, &ct
->buckets
[i
].connections
) {
2068 if ((!zone
|| *zone
== conn
->key
.zone
) &&
2069 (conn
->conn_type
== CT_CONN_TYPE_DEFAULT
)) {
2070 conn_clean(ct
, conn
, &ct
->buckets
[i
]);
2073 ct_lock_unlock(&ct
->buckets
[i
].lock
);