1 /*
2 * Copyright (c) 2015, 2016 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "conntrack.h"
19
20 #include <errno.h>
21 #include <sys/types.h>
22 #include <netinet/in.h>
23 #include <netinet/icmp6.h>
24
25 #include "bitmap.h"
26 #include "conntrack-private.h"
27 #include "coverage.h"
28 #include "csum.h"
29 #include "ct-dpif.h"
30 #include "dp-packet.h"
31 #include "flow.h"
32 #include "netdev.h"
33 #include "odp-netlink.h"
34 #include "openvswitch/hmap.h"
35 #include "openvswitch/vlog.h"
36 #include "ovs-rcu.h"
37 #include "ovs-thread.h"
38 #include "poll-loop.h"
39 #include "random.h"
40 #include "timeval.h"
41
42
43 VLOG_DEFINE_THIS_MODULE(conntrack);
44
45 COVERAGE_DEFINE(conntrack_full);
46 COVERAGE_DEFINE(conntrack_long_cleanup);
47
48 struct conn_lookup_ctx {
49 struct conn_key key;
50 struct conn *conn;
51 uint32_t hash;
52 bool reply;
53 bool icmp_related;
54 };
55
56 static bool conn_key_extract(struct conntrack *, struct dp_packet *,
57 ovs_be16 dl_type, struct conn_lookup_ctx *,
58 uint16_t zone);
59 static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
60 static void conn_key_reverse(struct conn_key *);
61 static void conn_key_lookup(struct conntrack_bucket *ctb,
62 struct conn_lookup_ctx *ctx,
63 long long now);
64 static bool valid_new(struct dp_packet *pkt, struct conn_key *);
65 static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
66 struct conn_key *, long long now);
67 static void delete_conn(struct conn *);
68 static enum ct_update_res conn_update(struct conn *,
69 struct conntrack_bucket *ctb,
70 struct dp_packet *, bool reply,
71 long long now);
72 static bool conn_expired(struct conn *, long long now);
73 static void set_mark(struct dp_packet *, struct conn *,
74 uint32_t val, uint32_t mask);
75 static void set_label(struct dp_packet *, struct conn *,
76 const struct ovs_key_ct_labels *val,
77 const struct ovs_key_ct_labels *mask);
78 static void *clean_thread_main(void *f_);
79
80 static struct nat_conn_key_node *
81 nat_conn_keys_lookup(struct hmap *nat_conn_keys,
82 const struct conn_key *key,
83 uint32_t basis);
84
85 static void
86 nat_conn_keys_remove(struct hmap *nat_conn_keys,
87 const struct conn_key *key,
88 uint32_t basis);
89
90 static bool
91 nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
92 struct conn *nat_conn);
93
94 static uint8_t
95 reverse_icmp_type(uint8_t type);
96 static uint8_t
97 reverse_icmp6_type(uint8_t type);
98 static inline bool
99 extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
100 const char **new_data, bool validate_checksum);
101 static inline bool
102 extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
103 const char **new_data);
104
105 static struct ct_l4_proto *l4_protos[] = {
106 [IPPROTO_TCP] = &ct_proto_tcp,
107 [IPPROTO_UDP] = &ct_proto_other,
108 [IPPROTO_ICMP] = &ct_proto_icmp4,
109 [IPPROTO_ICMPV6] = &ct_proto_icmp6,
110 };
111
112 long long ct_timeout_val[] = {
113 #define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
114 CT_TIMEOUTS
115 #undef CT_TIMEOUT
116 };
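/* Each CT_TIMEOUT(NAME, VAL) entry from CT_TIMEOUTS (conntrack-private.h)
 * expands to an array initializer; e.g. a hypothetical entry
 * CT_TIMEOUT(TCP_FIRST_PACKET, 30 * 1000) would become
 * [CT_TM_TCP_FIRST_PACKET] = 30 * 1000, giving the timeout in msec. */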
117
118 /* If the total number of connections goes above this value, no new connections
119 * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
120 #define DEFAULT_N_CONN_LIMIT 3000000
121
122 /* Does a member-by-member comparison of two conn_keys; this
123 * function must be kept in sync with struct conn_key; returns 0
124 * if the keys are equal or 1 if the keys are not equal. */
125 static int
126 conn_key_cmp(const struct conn_key *key1, const struct conn_key *key2)
127 {
128 if (!memcmp(&key1->src.addr, &key2->src.addr, sizeof key1->src.addr) &&
129 !memcmp(&key1->dst.addr, &key2->dst.addr, sizeof key1->dst.addr) &&
130 (key1->src.icmp_id == key2->src.icmp_id) &&
131 (key1->src.icmp_type == key2->src.icmp_type) &&
132 (key1->src.icmp_code == key2->src.icmp_code) &&
133 (key1->dst.icmp_id == key2->dst.icmp_id) &&
134 (key1->dst.icmp_type == key2->dst.icmp_type) &&
135 (key1->dst.icmp_code == key2->dst.icmp_code) &&
136 (key1->dl_type == key2->dl_type) &&
137 (key1->zone == key2->zone) &&
138 (key1->nw_proto == key2->nw_proto)) {
139
140 return 0;
141 }
142 return 1;
143 }
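/* Note: conn_key_cmp() deliberately compares individual members rather than
 * doing a single memcmp() of the whole struct, presumably so that the result
 * does not depend on the contents of any padding bytes in struct conn_key. */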
144
145 /* Initializes the connection tracker 'ct'. The caller is responsible for
146  * calling 'conntrack_destroy()' when the instance is no longer needed. */
147 void
148 conntrack_init(struct conntrack *ct)
149 {
150 unsigned i, j;
151 long long now = time_msec();
152
153 ct_rwlock_init(&ct->resources_lock);
154 ct_rwlock_wrlock(&ct->resources_lock);
155 hmap_init(&ct->nat_conn_keys);
156 ct_rwlock_unlock(&ct->resources_lock);
157
158 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
159 struct conntrack_bucket *ctb = &ct->buckets[i];
160
161 ct_lock_init(&ctb->lock);
162 ct_lock_lock(&ctb->lock);
163 hmap_init(&ctb->connections);
164 for (j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
165 ovs_list_init(&ctb->exp_lists[j]);
166 }
167 ct_lock_unlock(&ctb->lock);
168 ovs_mutex_init(&ctb->cleanup_mutex);
169 ovs_mutex_lock(&ctb->cleanup_mutex);
170 ctb->next_cleanup = now + CT_TM_MIN;
171 ovs_mutex_unlock(&ctb->cleanup_mutex);
172 }
173 ct->hash_basis = random_uint32();
174 atomic_count_init(&ct->n_conn, 0);
175 atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
176 latch_init(&ct->clean_thread_exit);
177 ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
178 }
179
180 /* Destroys the connection tracker 'ct' and frees all the allocated memory. */
181 void
182 conntrack_destroy(struct conntrack *ct)
183 {
184 unsigned i;
185
186 latch_set(&ct->clean_thread_exit);
187 pthread_join(ct->clean_thread, NULL);
188 latch_destroy(&ct->clean_thread_exit);
189 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
190 struct conntrack_bucket *ctb = &ct->buckets[i];
191 struct conn *conn;
192
193 ovs_mutex_destroy(&ctb->cleanup_mutex);
194 ct_lock_lock(&ctb->lock);
195 HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
196 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
197 atomic_count_dec(&ct->n_conn);
198 }
199 delete_conn(conn);
200 }
201 hmap_destroy(&ctb->connections);
202 ct_lock_unlock(&ctb->lock);
203 ct_lock_destroy(&ctb->lock);
204 }
205 ct_rwlock_wrlock(&ct->resources_lock);
206 struct nat_conn_key_node *nat_conn_key_node;
207 HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
208 free(nat_conn_key_node);
209 }
210 hmap_destroy(&ct->nat_conn_keys);
211 ct_rwlock_unlock(&ct->resources_lock);
212 ct_rwlock_destroy(&ct->resources_lock);
213 }
214 \f
215 static unsigned hash_to_bucket(uint32_t hash)
216 {
217 /* Extracts the most significant bits in hash. The least significant bits
218 * are already used internally by the hmap implementation. */
219 BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
220
221 return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
222 }
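/* Example with hypothetical values: assuming CONNTRACK_BUCKETS_SHIFT is 8
 * (256 buckets), a hash of 0xA1B2C3D4 is shifted right by 24 bits and selects
 * bucket 0xA1, so bucket selection does not reuse the low-order bits that the
 * per-bucket hmap consumes internally. */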
223
224 static void
225 write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
226 const struct conn_key *key)
227 {
228 pkt->md.ct_state |= CS_TRACKED;
229 pkt->md.ct_zone = zone;
230 pkt->md.ct_mark = conn ? conn->mark : 0;
231 pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;
232
233 /* Use the original direction tuple if we have it. */
234 if (conn) {
235 key = &conn->key;
236 }
237 pkt->md.ct_orig_tuple_ipv6 = false;
238 if (key) {
239 if (key->dl_type == htons(ETH_TYPE_IP)) {
240 pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
241 key->src.addr.ipv4_aligned,
242 key->dst.addr.ipv4_aligned,
243 key->nw_proto != IPPROTO_ICMP
244 ? key->src.port : htons(key->src.icmp_type),
245 key->nw_proto != IPPROTO_ICMP
246 ? key->dst.port : htons(key->src.icmp_code),
247 key->nw_proto,
248 };
249 } else {
250 pkt->md.ct_orig_tuple_ipv6 = true;
251 pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
252 key->src.addr.ipv6_aligned,
253 key->dst.addr.ipv6_aligned,
254 key->nw_proto != IPPROTO_ICMPV6
255 ? key->src.port : htons(key->src.icmp_type),
256 key->nw_proto != IPPROTO_ICMPV6
257 ? key->dst.port : htons(key->src.icmp_code),
258 key->nw_proto,
259 };
260 }
261 } else {
262 memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
263 }
264
265 }
266
267 static void
268 pat_packet(struct dp_packet *pkt, const struct conn *conn)
269 {
270 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
271 if (conn->key.nw_proto == IPPROTO_TCP) {
272 struct tcp_header *th = dp_packet_l4(pkt);
273 packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
274 } else if (conn->key.nw_proto == IPPROTO_UDP) {
275 struct udp_header *uh = dp_packet_l4(pkt);
276 packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
277 }
278 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
279 if (conn->key.nw_proto == IPPROTO_TCP) {
280 struct tcp_header *th = dp_packet_l4(pkt);
281 packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
282 } else if (conn->key.nw_proto == IPPROTO_UDP) {
283 struct udp_header *uh = dp_packet_l4(pkt);
284 packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
285 }
286 }
287 }
288
289 static void
290 nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
291 {
292 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
293 pkt->md.ct_state |= CS_SRC_NAT;
294 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
295 struct ip_header *nh = dp_packet_l3(pkt);
296 packet_set_ipv4_addr(pkt, &nh->ip_src,
297 conn->rev_key.dst.addr.ipv4_aligned);
298 } else {
299 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
300 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
301 nh6->ip6_src.be32,
302 &conn->rev_key.dst.addr.ipv6_aligned,
303 true);
304 }
305 if (!related) {
306 pat_packet(pkt, conn);
307 }
308 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
309 pkt->md.ct_state |= CS_DST_NAT;
310 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
311 struct ip_header *nh = dp_packet_l3(pkt);
312 packet_set_ipv4_addr(pkt, &nh->ip_dst,
313 conn->rev_key.src.addr.ipv4_aligned);
314 } else {
315 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
316 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
317 nh6->ip6_dst.be32,
318 &conn->rev_key.src.addr.ipv6_aligned,
319 true);
320 }
321 if (!related) {
322 pat_packet(pkt, conn);
323 }
324 }
325 }
326
327 static void
328 un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
329 {
330 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
331 if (conn->key.nw_proto == IPPROTO_TCP) {
332 struct tcp_header *th = dp_packet_l4(pkt);
333 packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
334 } else if (conn->key.nw_proto == IPPROTO_UDP) {
335 struct udp_header *uh = dp_packet_l4(pkt);
336 packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
337 }
338 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
339 if (conn->key.nw_proto == IPPROTO_TCP) {
340 struct tcp_header *th = dp_packet_l4(pkt);
341 packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
342 } else if (conn->key.nw_proto == IPPROTO_UDP) {
343 struct udp_header *uh = dp_packet_l4(pkt);
344 packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
345 }
346 }
347 }
348
349 static void
350 reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
351 {
352 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
353 if (conn->key.nw_proto == IPPROTO_TCP) {
354 struct tcp_header *th_in = dp_packet_l4(pkt);
355 packet_set_tcp_port(pkt, conn->key.src.port,
356 th_in->tcp_dst);
357 } else if (conn->key.nw_proto == IPPROTO_UDP) {
358 struct udp_header *uh_in = dp_packet_l4(pkt);
359 packet_set_udp_port(pkt, conn->key.src.port,
360 uh_in->udp_dst);
361 }
362 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
363 if (conn->key.nw_proto == IPPROTO_TCP) {
364 struct tcp_header *th_in = dp_packet_l4(pkt);
365 packet_set_tcp_port(pkt, th_in->tcp_src,
366 conn->key.dst.port);
367 } else if (conn->key.nw_proto == IPPROTO_UDP) {
368 struct udp_header *uh_in = dp_packet_l4(pkt);
369 packet_set_udp_port(pkt, uh_in->udp_src,
370 conn->key.dst.port);
371 }
372 }
373 }
374
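/* Undoes the address (and, via reverse_pat_packet(), port) translation on the
 * packet embedded inside an ICMP/ICMPv6 error message.  The l3/l4 offsets of
 * 'pkt' are temporarily re-pointed at the inner headers so that the
 * packet_set_*() helpers update the embedded packet, and the ICMP checksum is
 * recomputed before the original offsets are restored. */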
375 static void
376 reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
377 {
378 char *tail = dp_packet_tail(pkt);
379 char pad = dp_packet_l2_pad_size(pkt);
380 struct conn_key inner_key;
381 const char *inner_l4 = NULL;
382 uint16_t orig_l3_ofs = pkt->l3_ofs;
383 uint16_t orig_l4_ofs = pkt->l4_ofs;
384
385 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
386 struct ip_header *nh = dp_packet_l3(pkt);
387 struct icmp_header *icmp = dp_packet_l4(pkt);
388 struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
389 extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3)
390 -pad, &inner_l4, false);
391
392 pkt->l3_ofs += (char *) inner_l3 - (char *) nh;
393 pkt->l4_ofs += inner_l4 - (char *) icmp;
394
395 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
396 packet_set_ipv4_addr(pkt, &inner_l3->ip_src,
397 conn->key.src.addr.ipv4_aligned);
398 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
399 packet_set_ipv4_addr(pkt, &inner_l3->ip_dst,
400 conn->key.dst.addr.ipv4_aligned);
401 }
402 reverse_pat_packet(pkt, conn);
403 icmp->icmp_csum = 0;
404 icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
405 } else {
406 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
407 struct icmp6_error_header *icmp6 = dp_packet_l4(pkt);
408 struct ovs_16aligned_ip6_hdr *inner_l3_6 =
409 (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1);
410 extract_l3_ipv6(&inner_key, inner_l3_6,
411 tail - ((char *)inner_l3_6) - pad,
412 &inner_l4);
413 pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6;
414 pkt->l4_ofs += inner_l4 - (char *) icmp6;
415
416 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
417 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
418 inner_l3_6->ip6_src.be32,
419 &conn->key.src.addr.ipv6_aligned,
420 true);
421 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
422 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
423 inner_l3_6->ip6_dst.be32,
424 &conn->key.dst.addr.ipv6_aligned,
425 true);
426 }
427 reverse_pat_packet(pkt, conn);
428 uint32_t icmp6_csum = packet_csum_pseudoheader6(nh6);
429 icmp6->icmp6_base.icmp6_cksum = 0;
430 icmp6->icmp6_base.icmp6_cksum = csum_finish(
431 csum_continue(icmp6_csum, icmp6, tail - (char *) icmp6 - pad));
432 }
433 pkt->l3_ofs = orig_l3_ofs;
434 pkt->l4_ofs = orig_l4_ofs;
435 }
436
437 static void
438 un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
439 bool related)
440 {
441 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
442 pkt->md.ct_state |= CS_DST_NAT;
443 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
444 struct ip_header *nh = dp_packet_l3(pkt);
445 packet_set_ipv4_addr(pkt, &nh->ip_dst,
446 conn->key.src.addr.ipv4_aligned);
447 } else {
448 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
449 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
450 nh6->ip6_dst.be32,
451 &conn->key.src.addr.ipv6_aligned, true);
452 }
453
454 if (OVS_UNLIKELY(related)) {
455 reverse_nat_packet(pkt, conn);
456 } else {
457 un_pat_packet(pkt, conn);
458 }
459 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
460 pkt->md.ct_state |= CS_SRC_NAT;
461 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
462 struct ip_header *nh = dp_packet_l3(pkt);
463 packet_set_ipv4_addr(pkt, &nh->ip_src,
464 conn->key.dst.addr.ipv4_aligned);
465 } else {
466 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
467 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
468 nh6->ip6_src.be32,
469 &conn->key.dst.addr.ipv6_aligned, true);
470 }
471
472 if (OVS_UNLIKELY(related)) {
473 reverse_nat_packet(pkt, conn);
474 } else {
475 un_pat_packet(pkt, conn);
476 }
477 }
478 }
479
480 /* Typical usage of this helper is in non per-packet code, because the
481  * bucket lock needs to be held for the lookup and, in the per-packet path,
482  * the hash would already have been computed anyway.  Hence, this function
483  * is mainly intended for code clarity. */
484 static struct conn *
485 conn_lookup(struct conntrack *ct, struct conn_key *key, long long now)
486 {
487 struct conn_lookup_ctx ctx;
488 ctx.conn = NULL;
489 ctx.key = *key;
490 ctx.hash = conn_key_hash(key, ct->hash_basis);
491 unsigned bucket = hash_to_bucket(ctx.hash);
492 conn_key_lookup(&ct->buckets[bucket], &ctx, now);
493 return ctx.conn;
494 }
495
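/* Releases the NAT resources of 'conn': removes its entry from the global
 * 'nat_conn_keys' map and, unless it has been recreated concurrently, deletes
 * the CT_CONN_TYPE_UN_NAT connection stored under the reverse (translated)
 * key.  'ctb->lock' is dropped and re-taken so that the reverse connection's
 * bucket can be locked without holding two bucket locks at once. */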
496 static void
497 nat_clean(struct conntrack *ct, struct conn *conn,
498 struct conntrack_bucket *ctb)
499 OVS_REQUIRES(ctb->lock)
500 {
501 long long now = time_msec();
502 ct_rwlock_wrlock(&ct->resources_lock);
503 nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
504 ct_rwlock_unlock(&ct->resources_lock);
505 ct_lock_unlock(&ctb->lock);
506
507 uint32_t hash_rev_conn = conn_key_hash(&conn->rev_key, ct->hash_basis);
508 unsigned bucket_rev_conn = hash_to_bucket(hash_rev_conn);
509
510 ct_lock_lock(&ct->buckets[bucket_rev_conn].lock);
511 ct_rwlock_wrlock(&ct->resources_lock);
512
513 struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now);
514
515 struct nat_conn_key_node *nat_conn_key_node =
516 nat_conn_keys_lookup(&ct->nat_conn_keys, &conn->rev_key,
517 ct->hash_basis);
518
519     /* In the unlikely event that the rev conn was recreated, skip
520      * the rev_conn cleanup. */
521 if (rev_conn && (!nat_conn_key_node ||
522 conn_key_cmp(&nat_conn_key_node->value,
523 &rev_conn->rev_key))) {
524 hmap_remove(&ct->buckets[bucket_rev_conn].connections,
525 &rev_conn->node);
526 free(rev_conn);
527 }
528 delete_conn(conn);
529
530 ct_rwlock_unlock(&ct->resources_lock);
531 ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock);
532 ct_lock_lock(&ctb->lock);
533 }
534
535 static void
536 conn_clean(struct conntrack *ct, struct conn *conn,
537 struct conntrack_bucket *ctb)
538 OVS_REQUIRES(ctb->lock)
539 {
540 ovs_list_remove(&conn->exp_node);
541 hmap_remove(&ctb->connections, &conn->node);
542 atomic_count_dec(&ct->n_conn);
543 if (conn->nat_info) {
544 nat_clean(ct, conn, ctb);
545 } else {
546 delete_conn(conn);
547 }
548 }
549
550 /* This function is called with the bucket lock held. */
551 static struct conn *
552 conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
553 struct conn_lookup_ctx *ctx, bool commit, long long now,
554 const struct nat_action_info_t *nat_action_info,
555 struct conn *conn_for_un_nat_copy)
556 {
557 unsigned bucket = hash_to_bucket(ctx->hash);
558 struct conn *nc = NULL;
559
560 if (!valid_new(pkt, &ctx->key)) {
561 pkt->md.ct_state = CS_INVALID;
562 return nc;
563 }
564 pkt->md.ct_state = CS_NEW;
565
566 if (commit) {
567 unsigned int n_conn_limit;
568
569 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
570
571 if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
572 COVERAGE_INC(conntrack_full);
573 return nc;
574 }
575
576 nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
577 ctx->conn = nc;
578 nc->rev_key = nc->key;
579 conn_key_reverse(&nc->rev_key);
580
581 if (nat_action_info) {
582 nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
583 ct_rwlock_wrlock(&ct->resources_lock);
584
585 bool nat_res = nat_select_range_tuple(ct, nc,
586 conn_for_un_nat_copy);
587
588 if (!nat_res) {
589 free(nc->nat_info);
590 nc->nat_info = NULL;
591 free (nc);
592 ct_rwlock_unlock(&ct->resources_lock);
593 return NULL;
594 }
595
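            /* Keep a copy of the new connection in '*conn_for_un_nat_copy' so
             * that process_one() can later insert the matching
             * CT_CONN_TYPE_UN_NAT entry, keyed on the translated (reverse)
             * tuple, into its own bucket. */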
596 if (conn_for_un_nat_copy &&
597 nc->conn_type == CT_CONN_TYPE_DEFAULT) {
598                 *conn_for_un_nat_copy = *nc;
599 conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
600 conn_for_un_nat_copy->nat_info = NULL;
601 }
602 ct_rwlock_unlock(&ct->resources_lock);
603
604 nat_packet(pkt, nc, ctx->icmp_related);
605 }
606 hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
607 atomic_count_inc(&ct->n_conn);
608 }
609 return nc;
610 }
611
612 static bool
613 conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
614 struct conn_lookup_ctx *ctx, struct conn **conn,
615 long long now, unsigned bucket)
616 OVS_REQUIRES(ct->buckets[bucket].lock)
617 {
618 bool create_new_conn = false;
619
620 if (ctx->icmp_related) {
621 pkt->md.ct_state |= CS_RELATED;
622 if (ctx->reply) {
623 pkt->md.ct_state |= CS_REPLY_DIR;
624 }
625 } else {
626 enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket],
627 pkt, ctx->reply, now);
628
629 switch (res) {
630 case CT_UPDATE_VALID:
631 pkt->md.ct_state |= CS_ESTABLISHED;
632 pkt->md.ct_state &= ~CS_NEW;
633 if (ctx->reply) {
634 pkt->md.ct_state |= CS_REPLY_DIR;
635 }
636 break;
637 case CT_UPDATE_INVALID:
638 pkt->md.ct_state = CS_INVALID;
639 break;
640 case CT_UPDATE_NEW:
641 conn_clean(ct, *conn, &ct->buckets[bucket]);
642 create_new_conn = true;
643 break;
644 default:
645 OVS_NOT_REACHED();
646 }
647 }
648 return create_new_conn;
649 }
650
651 static void
652 create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
653 long long now)
654 {
655 struct conn *nc = xmemdup(conn_for_un_nat_copy, sizeof *nc);
656 nc->key = conn_for_un_nat_copy->rev_key;
657 nc->rev_key = conn_for_un_nat_copy->key;
658 uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis);
659 unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash);
660 ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock);
661 ct_rwlock_rdlock(&ct->resources_lock);
662
663 struct conn *rev_conn = conn_lookup(ct, &nc->key, now);
664
665 struct nat_conn_key_node *nat_conn_key_node =
666 nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis);
667 if (nat_conn_key_node
668 && !conn_key_cmp(&nat_conn_key_node->value, &nc->rev_key)
669 && !rev_conn) {
670 hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
671 &nc->node, un_nat_hash);
672 } else {
673 free(nc);
674 }
675 ct_rwlock_unlock(&ct->resources_lock);
676 ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock);
677 }
678
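/* Applies NAT (or, for reply-direction packets, reverses it) to 'pkt' for the
 * connection 'conn'.  Translation is skipped when the packet already carries
 * a NAT flag from a previous ct() pass through the same zone, which avoids
 * translating the same packet twice on recirculation. */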
679 static void
680 handle_nat(struct dp_packet *pkt, struct conn *conn,
681 uint16_t zone, bool reply, bool related)
682 {
683 if (conn->nat_info &&
684 (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
685 (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
686 zone != pkt->md.ct_zone))) {
687 if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
688 pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
689 }
690 if (reply) {
691 un_nat_packet(pkt, conn, related);
692 } else {
693 nat_packet(pkt, conn, related);
694 }
695 }
696 }
697
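/* If 'pkt' was already translated by NAT in an earlier ct() pass, its current
 * 5-tuple no longer matches the connection's key, so a normal lookup fails.
 * This function retries the lookup using the original direction tuple saved
 * in the packet metadata (ct_orig_tuple), switching '*bucket' and its lock to
 * the bucket of the recomputed hash.  Returns true and sets '*conn' if a
 * connection is found. */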
698 static bool
699 check_orig_tuple(struct conntrack *ct, struct dp_packet *pkt,
700 struct conn_lookup_ctx *ctx_in, long long now,
701 unsigned *bucket, struct conn **conn,
702 const struct nat_action_info_t *nat_action_info)
703 OVS_REQUIRES(ct->buckets[*bucket].lock)
704 {
705 if ((ctx_in->key.dl_type == htons(ETH_TYPE_IP) &&
706 !pkt->md.ct_orig_tuple.ipv4.ipv4_proto) ||
707 (ctx_in->key.dl_type == htons(ETH_TYPE_IPV6) &&
708 !pkt->md.ct_orig_tuple.ipv6.ipv6_proto) ||
709 !(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
710 nat_action_info) {
711 return false;
712 }
713
714 ct_lock_unlock(&ct->buckets[*bucket].lock);
715 struct conn_lookup_ctx ctx;
716     memset(&ctx, 0, sizeof ctx);
717 ctx.conn = NULL;
718
719 if (ctx_in->key.dl_type == htons(ETH_TYPE_IP)) {
720 ctx.key.src.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_src;
721 ctx.key.dst.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_dst;
722
723 if (ctx_in->key.nw_proto == IPPROTO_ICMP) {
724 ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
725 ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
726 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv4.src_port);
727 ctx.key.src.icmp_type = (uint8_t) src_port;
728 ctx.key.dst.icmp_type = reverse_icmp_type(ctx.key.src.icmp_type);
729 } else {
730 ctx.key.src.port = pkt->md.ct_orig_tuple.ipv4.src_port;
731 ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv4.dst_port;
732 }
733 ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv4.ipv4_proto;
734 } else {
735 ctx.key.src.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_src;
736 ctx.key.dst.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_dst;
737
738 if (ctx_in->key.nw_proto == IPPROTO_ICMPV6) {
739 ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
740 ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
741 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv6.src_port);
742 ctx.key.src.icmp_type = (uint8_t) src_port;
743 ctx.key.dst.icmp_type = reverse_icmp6_type(ctx.key.src.icmp_type);
744 } else {
745 ctx.key.src.port = pkt->md.ct_orig_tuple.ipv6.src_port;
746 ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv6.dst_port;
747 }
748 ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv6.ipv6_proto;
749 }
750
751 ctx.key.dl_type = ctx_in->key.dl_type;
752 ctx.key.zone = pkt->md.ct_zone;
753
754 ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
755 *bucket = hash_to_bucket(ctx.hash);
756 ct_lock_lock(&ct->buckets[*bucket].lock);
757 conn_key_lookup(&ct->buckets[*bucket], &ctx, now);
758 *conn = ctx.conn;
759
760 return *conn ? true : false;
761 }
762
763 static void
764 process_one(struct conntrack *ct, struct dp_packet *pkt,
765 struct conn_lookup_ctx *ctx, uint16_t zone,
766 bool force, bool commit, long long now, const uint32_t *setmark,
767 const struct ovs_key_ct_labels *setlabel,
768 const struct nat_action_info_t *nat_action_info)
769 {
770 struct conn *conn;
771 unsigned bucket = hash_to_bucket(ctx->hash);
772 ct_lock_lock(&ct->buckets[bucket].lock);
773 conn_key_lookup(&ct->buckets[bucket], ctx, now);
774 conn = ctx->conn;
775
776 /* Delete found entry if in wrong direction. 'force' implies commit. */
777 if (conn && force && ctx->reply) {
778 conn_clean(ct, conn, &ct->buckets[bucket]);
779 conn = NULL;
780 }
781
782 if (OVS_LIKELY(conn)) {
783 if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
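            /* CT_CONN_TYPE_UN_NAT entries are placeholders stored under the
             * translated (reverse) tuple; they carry no state of their own,
             * so look up the real connection through the placeholder's
             * rev_key. */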
784
785 ctx->reply = true;
786
787 struct conn_lookup_ctx ctx2;
788 ctx2.conn = NULL;
789 ctx2.key = conn->rev_key;
790 ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis);
791
792 ct_lock_unlock(&ct->buckets[bucket].lock);
793 bucket = hash_to_bucket(ctx2.hash);
794
795 ct_lock_lock(&ct->buckets[bucket].lock);
796 conn_key_lookup(&ct->buckets[bucket], &ctx2, now);
797
798 if (ctx2.conn) {
799 conn = ctx2.conn;
800 } else {
801                 /* This is a race where the conn has timed out and been removed
802                  * between the unlock of the rev_conn bucket and the lock of the
803                  * forward conn bucket; nothing to do. */
804 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
805 ct_lock_unlock(&ct->buckets[bucket].lock);
806 return;
807 }
808 }
809 }
810
811 bool create_new_conn = false;
812 struct conn conn_for_un_nat_copy;
813 conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;
814 if (OVS_LIKELY(conn)) {
815 create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
816 if (nat_action_info && !create_new_conn) {
817 handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
818 }
819 } else if (check_orig_tuple(ct, pkt, ctx, now, &bucket, &conn,
820 nat_action_info)) {
821 create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
822 } else {
823 if (ctx->icmp_related) {
824 pkt->md.ct_state = CS_INVALID;
825 } else {
826 create_new_conn = true;
827 }
828 }
829
830 if (OVS_UNLIKELY(create_new_conn)) {
831 conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
832 &conn_for_un_nat_copy);
833 }
834
835 write_ct_md(pkt, zone, conn, &ctx->key);
836 if (conn && setmark) {
837 set_mark(pkt, conn, setmark[0], setmark[1]);
838 }
839
840 if (conn && setlabel) {
841 set_label(pkt, conn, &setlabel[0], &setlabel[1]);
842 }
843
844 ct_lock_unlock(&ct->buckets[bucket].lock);
845
846 if (conn_for_un_nat_copy.conn_type == CT_CONN_TYPE_UN_NAT) {
847 create_un_nat_conn(ct, &conn_for_un_nat_copy, now);
848 }
849 }
850
851 /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
852 * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
853  * the l3 and l4 offsets properly set.
854 *
855 * If 'commit' is true, the packets are allowed to create new entries in the
856 * connection tables. 'setmark', if not NULL, should point to a two
857 * elements array containing a value and a mask to set the connection mark.
858  * 'setlabel' behaves similarly for the connection label. */
859 int
860 conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
861 ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
862 const uint32_t *setmark,
863 const struct ovs_key_ct_labels *setlabel,
864 const char *helper,
865 const struct nat_action_info_t *nat_action_info)
866 {
867 struct dp_packet **pkts = pkt_batch->packets;
868 size_t cnt = pkt_batch->count;
869 long long now = time_msec();
870 struct conn_lookup_ctx ctx;
871
872 if (helper) {
873 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
874
875 VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
876 /* Continue without the helper */
877 }
878
879 for (size_t i = 0; i < cnt; i++) {
880 if (!conn_key_extract(ct, pkts[i], dl_type, &ctx, zone)) {
881 pkts[i]->md.ct_state = CS_INVALID;
882 write_ct_md(pkts[i], zone, NULL, NULL);
883 continue;
884 }
885 process_one(ct, pkts[i], &ctx, zone, force, commit,
886 now, setmark, setlabel, nat_action_info);
887 }
888
889 return 0;
890 }
891
892 static void
893 set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
894 {
895 pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
896 conn->mark = pkt->md.ct_mark;
897 }
898
899 static void
900 set_label(struct dp_packet *pkt, struct conn *conn,
901 const struct ovs_key_ct_labels *val,
902 const struct ovs_key_ct_labels *mask)
903 {
904 ovs_u128 v, m;
905
906 memcpy(&v, val, sizeof v);
907 memcpy(&m, mask, sizeof m);
908
909 pkt->md.ct_label.u64.lo = v.u64.lo
910 | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
911 pkt->md.ct_label.u64.hi = v.u64.hi
912 | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
913 conn->label = pkt->md.ct_label;
914 }
915
916 \f
917 /* Delete the expired connections from 'ctb', up to 'limit'. Returns the
918 * earliest expiration time among the remaining connections in 'ctb'. Returns
919 * LLONG_MAX if 'ctb' is empty. The return value might be smaller than 'now',
920 * if 'limit' is reached */
921 static long long
922 sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
923 size_t limit)
924 OVS_REQUIRES(ctb->lock)
925 {
926 struct conn *conn, *next;
927 long long min_expiration = LLONG_MAX;
928 unsigned i;
929 size_t count = 0;
930
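    /* Each expiration list is kept ordered by expiration time (entries are
     * re-appended at the tail whenever their timeout is refreshed), so each
     * list scan can stop at the first connection that has not yet expired. */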
931 for (i = 0; i < N_CT_TM; i++) {
932 LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
933 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
934 if (!conn_expired(conn, now) || count >= limit) {
935 min_expiration = MIN(min_expiration, conn->expiration);
936 if (count >= limit) {
937 /* Do not check other lists. */
938 COVERAGE_INC(conntrack_long_cleanup);
939 return min_expiration;
940 }
941 break;
942 }
943 conn_clean(ct, conn, ctb);
944 count++;
945 }
946 }
947 }
948
949 return min_expiration;
950 }
951
952 /* Cleans up old connection entries from 'ct'. Returns the time when the
953 * next expiration might happen. The return value might be smaller than
954 * 'now', meaning that an internal limit has been reached, and some expired
955 * connections have not been deleted. */
956 static long long
957 conntrack_clean(struct conntrack *ct, long long now)
958 {
959 long long next_wakeup = now + CT_TM_MIN;
960 unsigned int n_conn_limit;
961 size_t clean_count = 0;
962 unsigned i;
963
964 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
965
966 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
967 struct conntrack_bucket *ctb = &ct->buckets[i];
968 size_t prev_count;
969 long long min_exp;
970
971 ovs_mutex_lock(&ctb->cleanup_mutex);
972 if (ctb->next_cleanup > now) {
973 goto next_bucket;
974 }
975
976 ct_lock_lock(&ctb->lock);
977 prev_count = hmap_count(&ctb->connections);
978 /* If the connections are well distributed among buckets, we want to
979 * limit to 10% of the global limit equally split among buckets. If
980 * the bucket is busier than the others, we limit to 10% of its
981 * current size. */
982 min_exp = sweep_bucket(ct, ctb, now,
983 MAX(prev_count/10, n_conn_limit/(CONNTRACK_BUCKETS*10)));
984 clean_count += prev_count - hmap_count(&ctb->connections);
985
986 if (min_exp > now) {
987 /* We call hmap_shrink() only if sweep_bucket() managed to delete
988 * every expired connection. */
989 hmap_shrink(&ctb->connections);
990 }
991
992 ct_lock_unlock(&ctb->lock);
993
994 ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);
995
996 next_bucket:
997 next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
998 ovs_mutex_unlock(&ctb->cleanup_mutex);
999 }
1000
1001 VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
1002 clean_count, time_msec() - now);
1003
1004 return next_wakeup;
1005 }
1006
1007 /* Cleanup:
1008 *
1009 * We must call conntrack_clean() periodically. conntrack_clean() return
1010  * value gives a hint on when the next cleanup must be done (either because
1011 * there is an actual connection that expires, or because a new connection
1012 * might be created with the minimum timeout).
1013 *
1014 * The logic below has two goals:
1015 *
1016 * - We want to reduce the number of wakeups and batch connection cleanup
1017 * when the load is not very high. CT_CLEAN_INTERVAL ensures that if we
1018 * are coping with the current cleanup tasks, then we wait at least
1019 * 5 seconds to do further cleanup.
1020 *
1021 * - We don't want to keep the buckets locked too long, as we might prevent
1022 * traffic from flowing. CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
1023  * behind, there are at least some 200 ms blocks of time when buckets will be
1024 * left alone, so the datapath can operate unhindered.
1025 */
1026 #define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
1027 #define CT_CLEAN_MIN_INTERVAL 200 /* 0.2 seconds */
1028
1029 static void *
1030 clean_thread_main(void *f_)
1031 {
1032 struct conntrack *ct = f_;
1033
1034 while (!latch_is_set(&ct->clean_thread_exit)) {
1035 long long next_wake;
1036 long long now = time_msec();
1037
1038 next_wake = conntrack_clean(ct, now);
1039
1040 if (next_wake < now) {
1041 poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
1042 } else {
1043 poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
1044 }
1045 latch_wait(&ct->clean_thread_exit);
1046 poll_block();
1047 }
1048
1049 return NULL;
1050 }
1051 \f
1052 /* Key extraction */
1053
1054 /* The function stores a pointer to the first byte after the header in
1055 * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
1056 * not interested in the header's tail, meaning that the header has
1057 * already been parsed (e.g. by flow_extract): we take this as a hint to
1058 * save a few checks. If 'validate_checksum' is true, the function returns
1059 * false if the IPv4 checksum is invalid. */
1060 static inline bool
1061 extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
1062 const char **new_data, bool validate_checksum)
1063 {
1064 const struct ip_header *ip = data;
1065 size_t ip_len;
1066
1067 if (new_data) {
1068 if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
1069 return false;
1070 }
1071 }
1072
1073 ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
1074
1075 if (new_data) {
1076 if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
1077 return false;
1078 }
1079 if (OVS_UNLIKELY(size < ip_len)) {
1080 return false;
1081 }
1082
1083 *new_data = (char *) data + ip_len;
1084 }
1085
1086 if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
1087 return false;
1088 }
1089
1090 if (validate_checksum && csum(data, ip_len) != 0) {
1091 return false;
1092 }
1093
1094 key->src.addr.ipv4 = ip->ip_src;
1095 key->dst.addr.ipv4 = ip->ip_dst;
1096 key->nw_proto = ip->ip_proto;
1097
1098 return true;
1099 }
1100
1101 /* The function stores a pointer to the first byte after the header in
1102 * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
1103 * not interested in the header's tail, meaning that the header has
1104 * already been parsed (e.g. by flow_extract): we take this as a hint to
1105 * save a few checks. */
1106 static inline bool
1107 extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
1108 const char **new_data)
1109 {
1110 const struct ovs_16aligned_ip6_hdr *ip6 = data;
1111
1112 if (new_data) {
1113 if (OVS_UNLIKELY(size < sizeof *ip6)) {
1114 return false;
1115 }
1116 }
1117
1118 uint8_t nw_proto = ip6->ip6_nxt;
1119 uint8_t nw_frag = 0;
1120
1121 data = ip6 + 1;
1122 size -= sizeof *ip6;
1123
1124 if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
1125 return false;
1126 }
1127
1128 if (new_data) {
1129 *new_data = data;
1130 }
1131
1132 if (nw_frag) {
1133 return false;
1134 }
1135
1136 key->src.addr.ipv6 = ip6->ip6_src;
1137 key->dst.addr.ipv6 = ip6->ip6_dst;
1138 key->nw_proto = nw_proto;
1139
1140 return true;
1141 }
1142
1143 static inline bool
1144 checksum_valid(const struct conn_key *key, const void *data, size_t size,
1145 const void *l3)
1146 {
1147 uint32_t csum = 0;
1148
1149 if (key->dl_type == htons(ETH_TYPE_IP)) {
1150 csum = packet_csum_pseudoheader(l3);
1151 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
1152 csum = packet_csum_pseudoheader6(l3);
1153 } else {
1154 return false;
1155 }
1156
1157 csum = csum_continue(csum, data, size);
1158
1159 return csum_finish(csum) == 0;
1160 }
1161
1162 static inline bool
1163 check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
1164 const void *l3, bool validate_checksum)
1165 {
1166 const struct tcp_header *tcp = data;
1167 if (size < sizeof *tcp) {
1168 return false;
1169 }
1170
1171 size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
1172 if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
1173 return false;
1174 }
1175
1176 return validate_checksum ? checksum_valid(key, data, size, l3) : true;
1177 }
1178
1179 static inline bool
1180 check_l4_udp(const struct conn_key *key, const void *data, size_t size,
1181 const void *l3, bool validate_checksum)
1182 {
1183 const struct udp_header *udp = data;
1184 if (size < sizeof *udp) {
1185 return false;
1186 }
1187
1188 size_t udp_len = ntohs(udp->udp_len);
1189 if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
1190 return false;
1191 }
1192
1193 /* Validation must be skipped if checksum is 0 on IPv4 packets */
1194 return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
1195 || (validate_checksum ? checksum_valid(key, data, size, l3) : true);
1196 }
1197
1198 static inline bool
1199 check_l4_icmp(const void *data, size_t size, bool validate_checksum)
1200 {
1201 return validate_checksum ? csum(data, size) == 0 : true;
1202 }
1203
1204 static inline bool
1205 check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
1206 const void *l3, bool validate_checksum)
1207 {
1208 return validate_checksum ? checksum_valid(key, data, size, l3) : true;
1209 }
1210
1211 static inline bool
1212 extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
1213 {
1214 const struct tcp_header *tcp = data;
1215
1216 if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
1217 return false;
1218 }
1219
1220 key->src.port = tcp->tcp_src;
1221 key->dst.port = tcp->tcp_dst;
1222
1223 /* Port 0 is invalid */
1224 return key->src.port && key->dst.port;
1225 }
1226
1227 static inline bool
1228 extract_l4_udp(struct conn_key *key, const void *data, size_t size)
1229 {
1230 const struct udp_header *udp = data;
1231
1232 if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
1233 return false;
1234 }
1235
1236 key->src.port = udp->udp_src;
1237 key->dst.port = udp->udp_dst;
1238
1239 /* Port 0 is invalid */
1240 return key->src.port && key->dst.port;
1241 }
1242
1243 static inline bool extract_l4(struct conn_key *key, const void *data,
1244 size_t size, bool *related, const void *l3,
1245 bool validate_checksum);
1246
1247 static uint8_t
1248 reverse_icmp_type(uint8_t type)
1249 {
1250 switch (type) {
1251 case ICMP4_ECHO_REQUEST:
1252 return ICMP4_ECHO_REPLY;
1253 case ICMP4_ECHO_REPLY:
1254 return ICMP4_ECHO_REQUEST;
1255
1256 case ICMP4_TIMESTAMP:
1257 return ICMP4_TIMESTAMPREPLY;
1258 case ICMP4_TIMESTAMPREPLY:
1259 return ICMP4_TIMESTAMP;
1260
1261 case ICMP4_INFOREQUEST:
1262 return ICMP4_INFOREPLY;
1263 case ICMP4_INFOREPLY:
1264 return ICMP4_INFOREQUEST;
1265 default:
1266 OVS_NOT_REACHED();
1267 }
1268 }
1269
1270 /* If 'related' is not NULL and the function is processing an ICMP
1271 * error packet, extract the l3 and l4 fields from the nested header
1272  * instead and set *related to true.  If 'related' is NULL, we're
1273  * already processing a nested header and no such recursion is
1274  * possible. */
1275 static inline bool
1276 extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
1277 bool *related)
1278 {
1279 const struct icmp_header *icmp = data;
1280
1281 if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
1282 return false;
1283 }
1284
1285 switch (icmp->icmp_type) {
1286 case ICMP4_ECHO_REQUEST:
1287 case ICMP4_ECHO_REPLY:
1288 case ICMP4_TIMESTAMP:
1289 case ICMP4_TIMESTAMPREPLY:
1290 case ICMP4_INFOREQUEST:
1291 case ICMP4_INFOREPLY:
1292 if (icmp->icmp_code != 0) {
1293 return false;
1294 }
1295 /* Separate ICMP connection: identified using id */
1296 key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
1297 key->src.icmp_type = icmp->icmp_type;
1298 key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
1299 break;
1300 case ICMP4_DST_UNREACH:
1301 case ICMP4_TIME_EXCEEDED:
1302 case ICMP4_PARAM_PROB:
1303 case ICMP4_SOURCEQUENCH:
1304 case ICMP4_REDIRECT: {
1305 /* ICMP packet part of another connection. We should
1306          * extract the key from the embedded packet header. */
1307 struct conn_key inner_key;
1308 const char *l3 = (const char *) (icmp + 1);
1309 const char *tail = (const char *) data + size;
1310 const char *l4;
1311 bool ok;
1312
1313 if (!related) {
1314 return false;
1315 }
1316
1317 memset(&inner_key, 0, sizeof inner_key);
1318 inner_key.dl_type = htons(ETH_TYPE_IP);
1319 ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
1320 if (!ok) {
1321 return false;
1322 }
1323
1324 if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
1325 || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
1326 return false;
1327 }
1328
1329 key->src = inner_key.src;
1330 key->dst = inner_key.dst;
1331 key->nw_proto = inner_key.nw_proto;
1332
1333 ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
1334 if (ok) {
1335 conn_key_reverse(key);
1336 *related = true;
1337 }
1338 return ok;
1339 }
1340 default:
1341 return false;
1342 }
1343
1344 return true;
1345 }
1346
1347 static uint8_t
1348 reverse_icmp6_type(uint8_t type)
1349 {
1350 switch (type) {
1351 case ICMP6_ECHO_REQUEST:
1352 return ICMP6_ECHO_REPLY;
1353 case ICMP6_ECHO_REPLY:
1354 return ICMP6_ECHO_REQUEST;
1355 default:
1356 OVS_NOT_REACHED();
1357 }
1358 }
1359
1360 /* If 'related' is not NULL and the function is processing an ICMP
1361 * error packet, extract the l3 and l4 fields from the nested header
1362  * instead and set *related to true.  If 'related' is NULL, we're
1363  * already processing a nested header and no such recursion is
1364  * possible. */
1365 static inline bool
1366 extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
1367 bool *related)
1368 {
1369 const struct icmp6_header *icmp6 = data;
1370
1371 /* All the messages that we support need at least 4 bytes after
1372 * the header */
1373 if (size < sizeof *icmp6 + 4) {
1374 return false;
1375 }
1376
1377 switch (icmp6->icmp6_type) {
1378 case ICMP6_ECHO_REQUEST:
1379 case ICMP6_ECHO_REPLY:
1380 if (icmp6->icmp6_code != 0) {
1381 return false;
1382 }
1383 /* Separate ICMP connection: identified using id */
1384 key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
1385 key->src.icmp_type = icmp6->icmp6_type;
1386 key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
1387 break;
1388 case ICMP6_DST_UNREACH:
1389 case ICMP6_PACKET_TOO_BIG:
1390 case ICMP6_TIME_EXCEEDED:
1391 case ICMP6_PARAM_PROB: {
1392 /* ICMP packet part of another connection. We should
1393          * extract the key from the embedded packet header. */
1394 struct conn_key inner_key;
1395 const char *l3 = (const char *) icmp6 + 8;
1396 const char *tail = (const char *) data + size;
1397 const char *l4 = NULL;
1398 bool ok;
1399
1400 if (!related) {
1401 return false;
1402 }
1403
1404 memset(&inner_key, 0, sizeof inner_key);
1405 inner_key.dl_type = htons(ETH_TYPE_IPV6);
1406 ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
1407 if (!ok) {
1408 return false;
1409 }
1410
1411 /* pf doesn't do this, but it seems a good idea */
1412 if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
1413 &key->dst.addr.ipv6_aligned)
1414 || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
1415 &key->src.addr.ipv6_aligned)) {
1416 return false;
1417 }
1418
1419 key->src = inner_key.src;
1420 key->dst = inner_key.dst;
1421 key->nw_proto = inner_key.nw_proto;
1422
1423 ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
1424 if (ok) {
1425 conn_key_reverse(key);
1426 *related = true;
1427 }
1428 return ok;
1429 }
1430 default:
1431 return false;
1432 }
1433
1434 return true;
1435 }
1436
1437 /* Extract l4 fields into 'key', which must already contain valid l3
1438 * members.
1439 *
1440 * If 'related' is not NULL and an ICMP error packet is being
1441 * processed, the function will extract the key from the packet nested
1442 * in the ICMP payload and set '*related' to true.
1443 *
1444 * If 'related' is NULL, it means that we're already parsing a header nested
1445 * in an ICMP error. In this case, we skip checksum and length validation. */
1446 static inline bool
1447 extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
1448 const void *l3, bool validate_checksum)
1449 {
1450 if (key->nw_proto == IPPROTO_TCP) {
1451 return (!related || check_l4_tcp(key, data, size, l3,
1452 validate_checksum)) && extract_l4_tcp(key, data, size);
1453 } else if (key->nw_proto == IPPROTO_UDP) {
1454 return (!related || check_l4_udp(key, data, size, l3,
1455 validate_checksum)) && extract_l4_udp(key, data, size);
1456 } else if (key->dl_type == htons(ETH_TYPE_IP)
1457 && key->nw_proto == IPPROTO_ICMP) {
1458 return (!related || check_l4_icmp(data, size, validate_checksum))
1459 && extract_l4_icmp(key, data, size, related);
1460 } else if (key->dl_type == htons(ETH_TYPE_IPV6)
1461 && key->nw_proto == IPPROTO_ICMPV6) {
1462 return (!related || check_l4_icmp6(key, data, size, l3,
1463 validate_checksum)) && extract_l4_icmp6(key, data, size,
1464 related);
1465 } else {
1466 return false;
1467 }
1468 }
1469
1470 static bool
1471 conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
1472 struct conn_lookup_ctx *ctx, uint16_t zone)
1473 {
1474 const struct eth_header *l2 = dp_packet_eth(pkt);
1475 const struct ip_header *l3 = dp_packet_l3(pkt);
1476 const char *l4 = dp_packet_l4(pkt);
1477 const char *tail = dp_packet_tail(pkt);
1478 bool ok;
1479
1480 memset(ctx, 0, sizeof *ctx);
1481
1482 if (!l2 || !l3 || !l4) {
1483 return false;
1484 }
1485
1486 ctx->key.zone = zone;
1487
1488 /* XXX In this function we parse the packet (again, it has already
1489 * gone through miniflow_extract()) for two reasons:
1490 *
1491 * 1) To extract the l3 addresses and l4 ports.
1492 * We already have the l3 and l4 headers' pointers. Extracting
1493 * the l3 addresses and the l4 ports is really cheap, since they
1494 * can be found at fixed locations.
1495 * 2) To extract the l4 type.
1496  *    Extracting the l4 type, which for IPv6 can be quite expensive because
1497 * it's not at a fixed location.
1498 *
1499 * Here's a way to avoid (2) with the help of the datapath.
1500 * The datapath doesn't keep the packet's extracted flow[1], so
1501 * using that is not an option. We could use the packet's matching
1502 * megaflow, but we have to make sure that the l4 type (nw_proto)
1503 * is unwildcarded. This means either:
1504 *
1505 * a) dpif-netdev unwildcards the l4 type when a new flow is installed
1506 * if the actions contains ct().
1507 *
1508 * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
1509 * action. This is already done in different actions, but it's
1510 * unnecessary for the kernel.
1511 *
1512 * ---
1513 * [1] The reasons for this are that keeping the flow increases
1514 * (slightly) the cache footprint and increases computation
1515 * time as we move the packet around. Most importantly, the flow
1516 * should be updated by the actions and this can be slow, as
1517 * we use a sparse representation (miniflow).
1518 *
1519 */
1520 ctx->key.dl_type = dl_type;
1521 if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
1522 bool hwol_bad_l3_csum = dp_packet_ip_checksum_bad(pkt);
1523 if (hwol_bad_l3_csum) {
1524 ok = false;
1525 } else {
1526 bool hwol_good_l3_csum = dp_packet_ip_checksum_valid(pkt);
1527 /* Validate the checksum only when hwol is not supported. */
1528 ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL,
1529 !hwol_good_l3_csum);
1530 }
1531 } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
1532 ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
1533 } else {
1534 ok = false;
1535 }
1536
1537
1538 if (ok) {
1539 bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt);
1540 if (!hwol_bad_l4_csum) {
1541 bool hwol_good_l4_csum = dp_packet_l4_checksum_valid(pkt);
1542 /* Validate the checksum only when hwol is not supported. */
1543 if (extract_l4(&ctx->key, l4, tail - l4, &ctx->icmp_related, l3,
1544 !hwol_good_l4_csum)) {
1545 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
1546 return true;
1547 }
1548 }
1549 }
1550
1551 return false;
1552 }
1553
1554 static uint32_t
1555 ct_addr_hash_add(uint32_t hash, const struct ct_addr *addr)
1556 {
1557 BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
1558 return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
1559 }
1560
1561 static uint32_t
1562 ct_endpoint_hash_add(uint32_t hash, const struct ct_endpoint *ep)
1563 {
1564 BUILD_ASSERT_DECL(sizeof *ep % 4 == 0);
1565 return hash_add_bytes32(hash, (const uint32_t *) ep, sizeof *ep);
1566 }
1567 \f
1568 /* Symmetric */
1569 static uint32_t
1570 conn_key_hash(const struct conn_key *key, uint32_t basis)
1571 {
1572 uint32_t hsrc, hdst, hash;
1573
1574 hsrc = hdst = basis;
1575 hsrc = ct_endpoint_hash_add(hsrc, &key->src);
1576 hdst = ct_endpoint_hash_add(hdst, &key->dst);
1577
1578 /* Even if source and destination are swapped the hash will be the same. */
1579 hash = hsrc ^ hdst;
1580
1581     /* Hash the rest of the key (L3 and L4 types and zone). */
1582 hash = hash_words((uint32_t *) (&key->dst + 1),
1583 (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
1584 hash);
1585
1586 return hash_finish(hash, 0);
1587 }
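/* Because the hash is symmetric in src/dst, a packet and its reply hash to
 * the same value, so both directions of a connection land in the same bucket
 * and can be found with a single lookup. */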
1588
1589 static void
1590 conn_key_reverse(struct conn_key *key)
1591 {
1592 struct ct_endpoint tmp;
1593
1594 tmp = key->src;
1595 key->src = key->dst;
1596 key->dst = tmp;
1597 }
1598
1599 static uint32_t
1600 nat_ipv6_addrs_delta(struct in6_addr *ipv6_aligned_min,
1601 struct in6_addr *ipv6_aligned_max)
1602 {
1603 uint8_t *ipv6_min_hi = &ipv6_aligned_min->s6_addr[0];
1604 uint8_t *ipv6_min_lo = &ipv6_aligned_min->s6_addr[0] + sizeof(uint64_t);
1605 uint8_t *ipv6_max_hi = &ipv6_aligned_max->s6_addr[0];
1606 uint8_t *ipv6_max_lo = &ipv6_aligned_max->s6_addr[0] + sizeof(uint64_t);
1607
1608 ovs_be64 addr6_64_min_hi;
1609 ovs_be64 addr6_64_min_lo;
1610 memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
1611 memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);
1612
1613 ovs_be64 addr6_64_max_hi;
1614 ovs_be64 addr6_64_max_lo;
1615 memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
1616 memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);
1617
1618 uint64_t diff;
1619 if (addr6_64_min_hi == addr6_64_max_hi &&
1620 ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
1621 diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
1622 } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
1623 ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
1624 diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
1625 ntohll(addr6_64_max_lo) - 1);
1626 } else {
1627 /* Limit address delta supported to 32 bits or 4 billion approximately.
1628 * Possibly, this should be visible to the user through a datapath
1629 * support check, however the practical impact is probably nil. */
1630 diff = 0xfffffffe;
1631 }
1632 if (diff > 0xfffffffe) {
1633 diff = 0xfffffffe;
1634 }
1635 return diff;
1636 }
1637
1638 /* This function must be used in tandem with nat_ipv6_addrs_delta(), which
1639 * restricts the input parameters. */
1640 static void
1641 nat_ipv6_addr_increment(struct in6_addr *ipv6_aligned, uint32_t increment)
1642 {
1643 uint8_t *ipv6_hi = &ipv6_aligned->s6_addr[0];
1644 uint8_t *ipv6_lo = &ipv6_aligned->s6_addr[0] + sizeof(ovs_be64);
1645 ovs_be64 addr6_64_hi;
1646 ovs_be64 addr6_64_lo;
1647 memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
1648 memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);
1649
1650 if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
1651 addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
1652 } else if (addr6_64_hi != OVS_BE64_MAX) {
1653 addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
1654 addr6_64_lo = htonll(increment - (UINT64_MAX -
1655 ntohll(addr6_64_lo) + 1));
1656 } else {
1657 OVS_NOT_REACHED();
1658 }
1659
1660 memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
1661 memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
1662
1663 return;
1664 }
1665
1666 static uint32_t
1667 nat_range_hash(const struct conn *conn, uint32_t basis)
1668 {
1669 uint32_t hash = basis;
1670
1671 hash = ct_addr_hash_add(hash, &conn->nat_info->min_addr);
1672 hash = ct_addr_hash_add(hash, &conn->nat_info->max_addr);
1673 hash = hash_add(hash,
1674 (conn->nat_info->max_port << 16)
1675 | conn->nat_info->min_port);
1676
1677 hash = ct_endpoint_hash_add(hash, &conn->key.src);
1678 hash = ct_endpoint_hash_add(hash, &conn->key.dst);
1679
1680 hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
1681 hash = hash_add(hash, conn->key.nw_proto);
1682 hash = hash_add(hash, conn->key.zone);
1683
1684 /* The purpose of the second parameter is to distinguish hashes of data of
1685 * different length; our data always has the same length so there is no
1686 * value in counting. */
1687 return hash_finish(hash, 0);
1688 }
1689
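/* Selects the NAT translation for 'conn' and records it in 'nat_conn' and in
 * the global 'nat_conn_keys' map.  Starting from an address and port derived
 * from nat_range_hash(), the configured port range is tried for each
 * candidate address in turn; if the whole range is exhausted, one more pass
 * is made over the addresses using the ephemeral port range 1024-65535.
 * Returns false if no free tuple was found. */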
1690 static bool
1691 nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
1692 struct conn *nat_conn)
1693 {
1694 #define MIN_NAT_EPHEMERAL_PORT 1024
1695 #define MAX_NAT_EPHEMERAL_PORT 65535
1696
1697 uint16_t min_port;
1698 uint16_t max_port;
1699 uint16_t first_port;
1700
1701 uint32_t hash = nat_range_hash(conn, ct->hash_basis);
1702
1703 if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
1704 (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
1705 min_port = ntohs(conn->key.src.port);
1706 max_port = ntohs(conn->key.src.port);
1707 first_port = min_port;
1708 } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
1709 (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
1710 min_port = ntohs(conn->key.dst.port);
1711 max_port = ntohs(conn->key.dst.port);
1712 first_port = min_port;
1713 } else {
1714 uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
1715 uint32_t port_index = hash % (deltap + 1);
1716 first_port = conn->nat_info->min_port + port_index;
1717 min_port = conn->nat_info->min_port;
1718 max_port = conn->nat_info->max_port;
1719 }
1720
1721 uint32_t deltaa = 0;
1722 uint32_t address_index;
1723 struct ct_addr ct_addr;
1724 memset(&ct_addr, 0, sizeof ct_addr);
1725 struct ct_addr max_ct_addr;
1726 memset(&max_ct_addr, 0, sizeof max_ct_addr);
1727 max_ct_addr = conn->nat_info->max_addr;
1728
1729 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
1730 deltaa = ntohl(conn->nat_info->max_addr.ipv4_aligned) -
1731 ntohl(conn->nat_info->min_addr.ipv4_aligned);
1732 address_index = hash % (deltaa + 1);
1733 ct_addr.ipv4_aligned = htonl(
1734 ntohl(conn->nat_info->min_addr.ipv4_aligned) + address_index);
1735 } else {
1736 deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6_aligned,
1737 &conn->nat_info->max_addr.ipv6_aligned);
1738 /* deltaa must be within 32 bits for full hash coverage. A 64 or
1739 * 128 bit hash is unnecessary and hence not used here. Most code
1740 * is kept common with V4; nat_ipv6_addrs_delta() will do the
1741 * enforcement via max_ct_addr. */
1742 max_ct_addr = conn->nat_info->min_addr;
1743 nat_ipv6_addr_increment(&max_ct_addr.ipv6_aligned, deltaa);
1744
1745 address_index = hash % (deltaa + 1);
1746 ct_addr.ipv6_aligned = conn->nat_info->min_addr.ipv6_aligned;
1747 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, address_index);
1748 }
1749
1750 uint16_t port = first_port;
1751 bool all_ports_tried = false;
1752 bool original_ports_tried = false;
1753 struct ct_addr first_addr = ct_addr;
1754 *nat_conn = *conn;
1755
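    /* Search order: for the current candidate address, try every port in
     * [min_port, max_port]; once the ports are exhausted, advance to the
     * next address, wrapping back to min_addr after max_ct_addr.  If the
     * search comes back around to the first address without success, retry
     * once with the ephemeral port range before giving up.  For ICMP and
     * ICMPv6 only the address is varied. */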
1756 while (true) {
1757 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
1758 nat_conn->rev_key.dst.addr = ct_addr;
1759 } else {
1760 nat_conn->rev_key.src.addr = ct_addr;
1761 }
1762
1763 if ((conn->key.nw_proto == IPPROTO_ICMP) ||
1764 (conn->key.nw_proto == IPPROTO_ICMPV6)) {
1765 all_ports_tried = true;
1766 } else if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
1767 nat_conn->rev_key.dst.port = htons(port);
1768 } else {
1769 nat_conn->rev_key.src.port = htons(port);
1770 }
1771
1772 struct nat_conn_key_node *nat_conn_key_node =
1773 nat_conn_keys_lookup(&ct->nat_conn_keys, &nat_conn->rev_key,
1774 ct->hash_basis);
1775
1776 if (!nat_conn_key_node) {
1777 struct nat_conn_key_node *nat_conn_key =
1778 xzalloc(sizeof *nat_conn_key);
1779 nat_conn_key->key = nat_conn->rev_key;
1780 nat_conn_key->value = nat_conn->key;
1781 uint32_t nat_conn_key_hash = conn_key_hash(&nat_conn_key->key,
1782 ct->hash_basis);
1783 hmap_insert(&ct->nat_conn_keys, &nat_conn_key->node,
1784 nat_conn_key_hash);
1785 return true;
1786 } else if (!all_ports_tried) {
1787 if (min_port == max_port) {
1788 all_ports_tried = true;
1789 } else if (port == max_port) {
1790 port = min_port;
1791 } else {
1792 port++;
1793 }
1794 if (port == first_port) {
1795 all_ports_tried = true;
1796 }
1797 } else {
1798 if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
1799 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
1800 ct_addr.ipv4_aligned = htonl(
1801 ntohl(ct_addr.ipv4_aligned) + 1);
1802 } else {
1803 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, 1);
1804 }
1805 } else {
1806 ct_addr = conn->nat_info->min_addr;
1807 }
1808 if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
1809 if (!original_ports_tried) {
1810 original_ports_tried = true;
1811 ct_addr = conn->nat_info->min_addr;
1812 min_port = MIN_NAT_EPHEMERAL_PORT;
1813 max_port = MAX_NAT_EPHEMERAL_PORT;
1814 } else {
1815 break;
1816 }
1817 }
1818 first_port = min_port;
1819 port = first_port;
1820 all_ports_tried = false;
1821 }
1822 }
1823 return false;
1824 }
1825
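/* 'nat_conn_keys' maps each NAT reverse key currently in use to the
 * corresponding forward key (see nat_select_range_tuple() above), which is
 * how candidate translations that would collide are detected. */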
1826 /* This function must be called with the ct->resources lock taken. */
1827 static struct nat_conn_key_node *
1828 nat_conn_keys_lookup(struct hmap *nat_conn_keys,
1829 const struct conn_key *key,
1830 uint32_t basis)
1831 {
1832 struct nat_conn_key_node *nat_conn_key_node;
1833 uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
1834
1835 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash,
1836 nat_conn_keys) {
1837 if (!conn_key_cmp(&nat_conn_key_node->key, key)) {
1838 return nat_conn_key_node;
1839 }
1840 }
1841 return NULL;
1842 }
1843
1844 /* This function must be called with the ct->resources write lock taken. */
1845 static void
1846 nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,
1847 uint32_t basis)
1848 {
1849 struct nat_conn_key_node *nat_conn_key_node;
1850 uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
1851
1852 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash,
1853 nat_conn_keys) {
1854 if (!conn_key_cmp(&nat_conn_key_node->key, key)) {
1855 hmap_remove(nat_conn_keys, &nat_conn_key_node->node);
1856 free(nat_conn_key_node);
1857 return;
1858 }
1859 }
1860 }
1861
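/* Looks up 'ctx->key' among the connections of bucket 'ctb', comparing it
 * against both the original and the reverse key of each connection and
 * skipping expired entries.  On a match, 'ctx->conn' is set and
 * 'ctx->reply' records which direction matched. */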
1862 static void
1863 conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
1864 long long now)
1865 OVS_REQUIRES(ctb->lock)
1866 {
1867 uint32_t hash = ctx->hash;
1868 struct conn *conn;
1869
1870 ctx->conn = NULL;
1871
1872 HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
1873 if (!conn_key_cmp(&conn->key, &ctx->key)
1874 && !conn_expired(conn, now)) {
1875 ctx->conn = conn;
1876 ctx->reply = false;
1877 break;
1878 }
1879 if (!conn_key_cmp(&conn->rev_key, &ctx->key)
1880 && !conn_expired(conn, now)) {
1881 ctx->conn = conn;
1882 ctx->reply = true;
1883 break;
1884 }
1885 }
1886 }
1887
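/* conn_update(), valid_new() and new_conn() below simply dispatch to the
 * per-protocol handlers registered in l4_protos[]. */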
1888 static enum ct_update_res
1889 conn_update(struct conn *conn, struct conntrack_bucket *ctb,
1890 struct dp_packet *pkt, bool reply, long long now)
1891 {
1892 return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
1893 reply, now);
1894 }
1895
1896 static bool
1897 conn_expired(struct conn *conn, long long now)
1898 {
1899 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
1900 return now >= conn->expiration;
1901 }
1902 return false;
1903 }
1904
1905 static bool
1906 valid_new(struct dp_packet *pkt, struct conn_key *key)
1907 {
1908 return l4_protos[key->nw_proto]->valid_new(pkt);
1909 }
1910
1911 static struct conn *
1912 new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
1913 struct conn_key *key, long long now)
1914 {
1915 struct conn *newconn;
1916
1917 newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
1918
1919 if (newconn) {
1920 newconn->key = *key;
1921 }
1922
1923 return newconn;
1924 }
1925
1926 static void
1927 delete_conn(struct conn *conn)
1928 {
1929 free(conn->nat_info);
1930 free(conn);
1931 }
1932 \f
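/* Helpers that convert the internal connection representation into the
 * 'struct ct_dpif_entry' format consumed by the dump interface below. */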
1933 static void
1934 ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
1935 union ct_dpif_inet_addr *b,
1936 ovs_be16 dl_type)
1937 {
1938 if (dl_type == htons(ETH_TYPE_IP)) {
1939 b->ip = a->ipv4_aligned;
1940 } else if (dl_type == htons(ETH_TYPE_IPV6)) {
1941 b->in6 = a->ipv6_aligned;
1942 }
1943 }
1944
1945 static void
1946 conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
1947 {
1948 if (key->dl_type == htons(ETH_TYPE_IP)) {
1949 tuple->l3_type = AF_INET;
1950 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
1951 tuple->l3_type = AF_INET6;
1952 }
1953 tuple->ip_proto = key->nw_proto;
1954 ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
1955 key->dl_type);
1956 ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
1957 key->dl_type);
1958
1959 if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
1960 tuple->icmp_id = key->src.icmp_id;
1961 tuple->icmp_type = key->src.icmp_type;
1962 tuple->icmp_code = key->src.icmp_code;
1963 } else {
1964 tuple->src_port = key->src.port;
1965 tuple->dst_port = key->dst.port;
1966 }
1967 }
1968
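/* Fills '*entry' with a dump representation of 'conn': both tuples, zone,
 * mark, labels, the remaining timeout in seconds and, when the L4 protocol
 * provides it, protocol-specific state. */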
1969 static void
1970 conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
1971 long long now, int bkt)
1972 {
1973 struct ct_l4_proto *class;
1974 long long expiration;
1975 memset(entry, 0, sizeof *entry);
1976 conn_key_to_tuple(&conn->key, &entry->tuple_orig);
1977 conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
1978
1979 entry->zone = conn->key.zone;
1980 entry->mark = conn->mark;
1981
1982 memcpy(&entry->labels, &conn->label, sizeof entry->labels);
1983 /* Timestamps are not implemented yet. */
1984 entry->timestamp.start = 0;
1985 entry->timestamp.stop = 0;
1986
1987 expiration = conn->expiration - now;
1988 entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
1989
1990 class = l4_protos[conn->key.nw_proto];
1991 if (class->conn_get_protoinfo) {
1992 class->conn_get_protoinfo(conn, &entry->protoinfo);
1993 }
1994 entry->bkt = bkt;
1995 }
1996
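/* Conntrack dump interface: conntrack_dump_start() initializes '*dump',
 * conntrack_dump_next() fills in one entry per call (returning 0 on success
 * and EOF once the iteration is complete), and conntrack_dump_done() ends
 * the dump.
 *
 * Minimal usage sketch; error handling and how 'entry' is consumed are up
 * to the caller:
 *
 *     struct conntrack_dump dump;
 *     struct ct_dpif_entry entry;
 *     int tot_bkts;
 *
 *     conntrack_dump_start(ct, &dump, NULL, &tot_bkts);
 *     while (!conntrack_dump_next(&dump, &entry)) {
 *         ...use 'entry'...
 *     }
 *     conntrack_dump_done(&dump);
 */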
1997 int
1998 conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
1999 const uint16_t *pzone, int *ptot_bkts)
2000 {
2001 memset(dump, 0, sizeof *dump);
2002 if (pzone) {
2003 dump->zone = *pzone;
2004 dump->filter_zone = true;
2005 }
2006 dump->ct = ct;
2007
2008 *ptot_bkts = CONNTRACK_BUCKETS;
2009
2010 return 0;
2011 }
2012
2013 int
2014 conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
2015 {
2016 struct conntrack *ct = dump->ct;
2017 long long now = time_msec();
2018
2019 while (dump->bucket < CONNTRACK_BUCKETS) {
2020 struct hmap_node *node;
2021
2022 ct_lock_lock(&ct->buckets[dump->bucket].lock);
2023 for (;;) {
2024 struct conn *conn;
2025
2026 node = hmap_at_position(&ct->buckets[dump->bucket].connections,
2027 &dump->bucket_pos);
2028 if (!node) {
2029 break;
2030 }
2031 INIT_CONTAINER(conn, node, node);
2032 if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
2033 (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
2034 conn_to_ct_dpif_entry(conn, entry, now, dump->bucket);
2035 break;
2036 }
2037 /* Else continue, until we find an entry in the appropriate zone
2038 * or the bucket has been scanned completely. */
2039 }
2040 ct_lock_unlock(&ct->buckets[dump->bucket].lock);
2041
2042 if (!node) {
2043 memset(&dump->bucket_pos, 0, sizeof dump->bucket_pos);
2044 dump->bucket++;
2045 } else {
2046 return 0;
2047 }
2048 }
2049 return EOF;
2050 }
2051
2052 int
2053 conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
2054 {
2055 return 0;
2056 }
2057
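/* Flushes the connection tracker: deletes every tracked connection, or only
 * the connections in '*zone' when 'zone' is nonnull.  Only entries of type
 * CT_CONN_TYPE_DEFAULT are passed to conn_clean(), which is expected to
 * remove any related un-NAT entries along with them. */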
2058 int
2059 conntrack_flush(struct conntrack *ct, const uint16_t *zone)
2060 {
2061 unsigned i;
2062
2063 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
2064 struct conn *conn, *next;
2065
2066 ct_lock_lock(&ct->buckets[i].lock);
2067 HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
2068 if ((!zone || *zone == conn->key.zone) &&
2069 (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
2070 conn_clean(ct, conn, &ct->buckets[i]);
2071 }
2072 }
2073 ct_lock_unlock(&ct->buckets[i].lock);
2074 }
2075 return 0;
2076 }