/*
 * Copyright (c) 2015, 2016 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "conntrack.h"

#include <errno.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>

#include "bitmap.h"
#include "conntrack-private.h"
#include "coverage.h"
#include "csum.h"
#include "ct-dpif.h"
#include "dp-packet.h"
#include "flow.h"
#include "netdev.h"
#include "odp-netlink.h"
#include "openvswitch/hmap.h"
#include "openvswitch/vlog.h"
#include "ovs-rcu.h"
#include "ovs-thread.h"
#include "poll-loop.h"
#include "random.h"
#include "timeval.h"


VLOG_DEFINE_THIS_MODULE(conntrack);

COVERAGE_DEFINE(conntrack_full);
COVERAGE_DEFINE(conntrack_long_cleanup);

struct conn_lookup_ctx {
    struct conn_key key;
    struct conn *conn;
    uint32_t hash;
    bool reply;
    bool icmp_related;
};

static bool conn_key_extract(struct conntrack *, struct dp_packet *,
                             ovs_be16 dl_type, struct conn_lookup_ctx *,
                             uint16_t zone);
static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
static void conn_key_reverse(struct conn_key *);
static void conn_key_lookup(struct conntrack_bucket *ctb,
                            struct conn_lookup_ctx *ctx,
                            long long now);
static bool valid_new(struct dp_packet *pkt, struct conn_key *);
static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
                             struct conn_key *, long long now);
static void delete_conn(struct conn *);
static enum ct_update_res conn_update(struct conn *,
                                      struct conntrack_bucket *ctb,
                                      struct dp_packet *, bool reply,
                                      long long now);
static bool conn_expired(struct conn *, long long now);
static void set_mark(struct dp_packet *, struct conn *,
                     uint32_t val, uint32_t mask);
static void set_label(struct dp_packet *, struct conn *,
                      const struct ovs_key_ct_labels *val,
                      const struct ovs_key_ct_labels *mask);
static void *clean_thread_main(void *f_);

static struct nat_conn_key_node *
nat_conn_keys_lookup(struct hmap *nat_conn_keys,
                     const struct conn_key *key,
                     uint32_t basis);

static void
nat_conn_keys_remove(struct hmap *nat_conn_keys,
                     const struct conn_key *key,
                     uint32_t basis);

static bool
nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
                       struct conn *nat_conn);

static uint8_t
reverse_icmp_type(uint8_t type);
static uint8_t
reverse_icmp6_type(uint8_t type);
static inline bool
extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
                const char **new_data, bool validate_checksum);
static inline bool
extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
                const char **new_data);

static struct ct_l4_proto *l4_protos[] = {
    [IPPROTO_TCP] = &ct_proto_tcp,
    [IPPROTO_UDP] = &ct_proto_other,
    [IPPROTO_ICMP] = &ct_proto_icmp4,
    [IPPROTO_ICMPV6] = &ct_proto_icmp6,
};

long long ct_timeout_val[] = {
#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
    CT_TIMEOUTS
#undef CT_TIMEOUT
};

/* If the total number of connections goes above this value, no new connections
 * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
#define DEFAULT_N_CONN_LIMIT 3000000

/* Initializes the connection tracker 'ct'.  The caller is responsible for
 * calling 'conntrack_destroy()' when the instance is no longer needed. */
void
conntrack_init(struct conntrack *ct)
{
    unsigned i, j;
    long long now = time_msec();

    ct_rwlock_init(&ct->resources_lock);
    ct_rwlock_wrlock(&ct->resources_lock);
    hmap_init(&ct->nat_conn_keys);
    ct_rwlock_unlock(&ct->resources_lock);

    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];

        ct_lock_init(&ctb->lock);
        ct_lock_lock(&ctb->lock);
        hmap_init(&ctb->connections);
        for (j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
            ovs_list_init(&ctb->exp_lists[j]);
        }
        ct_lock_unlock(&ctb->lock);
        ovs_mutex_init(&ctb->cleanup_mutex);
        ovs_mutex_lock(&ctb->cleanup_mutex);
        ctb->next_cleanup = now + CT_TM_MIN;
        ovs_mutex_unlock(&ctb->cleanup_mutex);
    }
    ct->hash_basis = random_uint32();
    atomic_count_init(&ct->n_conn, 0);
    atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
    latch_init(&ct->clean_thread_exit);
    ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
}
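
/* Typical lifecycle (a sketch, not a comment from the original file): the
 * 'struct conntrack' is caller-owned storage, initialized once, driven per
 * packet batch, and torn down exactly once:
 *
 *     struct conntrack ct;
 *
 *     conntrack_init(&ct);            -- also spawns the "ct_clean" thread
 *     ...
 *     conntrack_execute(&ct, ...);    -- per-batch packet processing
 *     ...
 *     conntrack_destroy(&ct);         -- joins the thread, frees all state
 */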

/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
void
conntrack_destroy(struct conntrack *ct)
{
    unsigned i;

    latch_set(&ct->clean_thread_exit);
    pthread_join(ct->clean_thread, NULL);
    latch_destroy(&ct->clean_thread_exit);
    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];
        struct conn *conn;

        ovs_mutex_destroy(&ctb->cleanup_mutex);
        ct_lock_lock(&ctb->lock);
        HMAP_FOR_EACH_POP (conn, node, &ctb->connections) {
            if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
                atomic_count_dec(&ct->n_conn);
            }
            delete_conn(conn);
        }
        hmap_destroy(&ctb->connections);
        ct_lock_unlock(&ctb->lock);
        ct_lock_destroy(&ctb->lock);
    }
    ct_rwlock_wrlock(&ct->resources_lock);
    struct nat_conn_key_node *nat_conn_key_node;
    HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
        free(nat_conn_key_node);
    }
    hmap_destroy(&ct->nat_conn_keys);
    ct_rwlock_unlock(&ct->resources_lock);
    ct_rwlock_destroy(&ct->resources_lock);
}
\f
static unsigned hash_to_bucket(uint32_t hash)
{
    /* Extracts the most significant bits in hash. The least significant bits
     * are already used internally by the hmap implementation. */
    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);

    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
}
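
/* A worked example of the mapping above (a sketch, assuming the upstream
 * value CONNTRACK_BUCKETS_SHIFT == 8, i.e. CONNTRACK_BUCKETS == 256): a hash
 * of 0xAABBCCDD is shifted right by 32 - 8 == 24 bits, so its top byte
 * selects bucket 0xAA == 170; the '% CONNTRACK_BUCKETS' then never changes
 * the value and only reinforces the bound guaranteed by the BUILD_ASSERT. */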

static void
write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
            const struct conn_key *key)
{
    pkt->md.ct_state |= CS_TRACKED;
    pkt->md.ct_zone = zone;
    pkt->md.ct_mark = conn ? conn->mark : 0;
    pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;

    /* Use the original direction tuple if we have it. */
    if (conn) {
        key = &conn->key;
    }
    pkt->md.ct_orig_tuple_ipv6 = false;
    if (key) {
        if (key->dl_type == htons(ETH_TYPE_IP)) {
            pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
                key->src.addr.ipv4_aligned,
                key->dst.addr.ipv4_aligned,
                key->nw_proto != IPPROTO_ICMP
                ? key->src.port : htons(key->src.icmp_type),
                key->nw_proto != IPPROTO_ICMP
                ? key->dst.port : htons(key->src.icmp_code),
                key->nw_proto,
            };
        } else {
            pkt->md.ct_orig_tuple_ipv6 = true;
            pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
                key->src.addr.ipv6_aligned,
                key->dst.addr.ipv6_aligned,
                key->nw_proto != IPPROTO_ICMPV6
                ? key->src.port : htons(key->src.icmp_type),
                key->nw_proto != IPPROTO_ICMPV6
                ? key->dst.port : htons(key->src.icmp_code),
                key->nw_proto,
            };
        }
    } else {
        memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
    }
}

static void
pat_packet(struct dp_packet *pkt, const struct conn *conn)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
        }
    }
}

static void
nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        pkt->md.ct_state |= CS_SRC_NAT;
        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
            struct ip_header *nh = dp_packet_l3(pkt);
            packet_set_ipv4_addr(pkt, &nh->ip_src,
                                 conn->rev_key.dst.addr.ipv4_aligned);
        } else {
            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 nh6->ip6_src.be32,
                                 &conn->rev_key.dst.addr.ipv6_aligned,
                                 true);
        }
        if (!related) {
            pat_packet(pkt, conn);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        pkt->md.ct_state |= CS_DST_NAT;
        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
            struct ip_header *nh = dp_packet_l3(pkt);
            packet_set_ipv4_addr(pkt, &nh->ip_dst,
                                 conn->rev_key.src.addr.ipv4_aligned);
        } else {
            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 nh6->ip6_dst.be32,
                                 &conn->rev_key.src.addr.ipv6_aligned,
                                 true);
        }
        if (!related) {
            pat_packet(pkt, conn);
        }
    }
}

static void
un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
        }
    }
}

static void
reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th_in = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, conn->key.src.port,
                                th_in->tcp_dst);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh_in = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, conn->key.src.port,
                                uh_in->udp_dst);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th_in = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, th_in->tcp_src,
                                conn->key.dst.port);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh_in = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, uh_in->udp_src,
                                conn->key.dst.port);
        }
    }
}

static void
reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
{
    char *tail = dp_packet_tail(pkt);
    char pad = dp_packet_l2_pad_size(pkt);
    struct conn_key inner_key;
    const char *inner_l4 = NULL;
    uint16_t orig_l3_ofs = pkt->l3_ofs;
    uint16_t orig_l4_ofs = pkt->l4_ofs;

    if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
        struct ip_header *nh = dp_packet_l3(pkt);
        struct icmp_header *icmp = dp_packet_l4(pkt);
        struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
        extract_l3_ipv4(&inner_key, inner_l3,
                        tail - ((char *) inner_l3) - pad, &inner_l4, false);

        pkt->l3_ofs += (char *) inner_l3 - (char *) nh;
        pkt->l4_ofs += inner_l4 - (char *) icmp;

        if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
            packet_set_ipv4_addr(pkt, &inner_l3->ip_src,
                                 conn->key.src.addr.ipv4_aligned);
        } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
            packet_set_ipv4_addr(pkt, &inner_l3->ip_dst,
                                 conn->key.dst.addr.ipv4_aligned);
        }
        reverse_pat_packet(pkt, conn);
        icmp->icmp_csum = 0;
        icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
    } else {
        struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
        struct icmp6_error_header *icmp6 = dp_packet_l4(pkt);
        struct ovs_16aligned_ip6_hdr *inner_l3_6 =
            (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1);
        extract_l3_ipv6(&inner_key, inner_l3_6,
                        tail - ((char *) inner_l3_6) - pad,
                        &inner_l4);
        pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6;
        pkt->l4_ofs += inner_l4 - (char *) icmp6;

        if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 inner_l3_6->ip6_src.be32,
                                 &conn->key.src.addr.ipv6_aligned,
                                 true);
        } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 inner_l3_6->ip6_dst.be32,
                                 &conn->key.dst.addr.ipv6_aligned,
                                 true);
        }
        reverse_pat_packet(pkt, conn);
        uint32_t icmp6_csum = packet_csum_pseudoheader6(nh6);
        icmp6->icmp6_base.icmp6_cksum = 0;
        icmp6->icmp6_base.icmp6_cksum = csum_finish(
            csum_continue(icmp6_csum, icmp6, tail - (char *) icmp6 - pad));
    }
    pkt->l3_ofs = orig_l3_ofs;
    pkt->l4_ofs = orig_l4_ofs;
}

static void
un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
              bool related)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        pkt->md.ct_state |= CS_DST_NAT;
        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
            struct ip_header *nh = dp_packet_l3(pkt);
            packet_set_ipv4_addr(pkt, &nh->ip_dst,
                                 conn->key.src.addr.ipv4_aligned);
        } else {
            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 nh6->ip6_dst.be32,
                                 &conn->key.src.addr.ipv6_aligned, true);
        }

        if (OVS_UNLIKELY(related)) {
            reverse_nat_packet(pkt, conn);
        } else {
            un_pat_packet(pkt, conn);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        pkt->md.ct_state |= CS_SRC_NAT;
        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
            struct ip_header *nh = dp_packet_l3(pkt);
            packet_set_ipv4_addr(pkt, &nh->ip_src,
                                 conn->key.dst.addr.ipv4_aligned);
        } else {
            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 nh6->ip6_src.be32,
                                 &conn->key.dst.addr.ipv6_aligned, true);
        }

        if (OVS_UNLIKELY(related)) {
            reverse_nat_packet(pkt, conn);
        } else {
            un_pat_packet(pkt, conn);
        }
    }
}
/* Typical usage of this helper is in non-per-packet code, because the bucket
 * lock needs to be held for the lookup and a hash would have been computed
 * already anyway.  Hence, this function is just intended for code clarity. */
static struct conn *
conn_lookup(struct conntrack *ct, struct conn_key *key, long long now)
{
    struct conn_lookup_ctx ctx;
    ctx.conn = NULL;
    ctx.key = *key;
    ctx.hash = conn_key_hash(key, ct->hash_basis);
    unsigned bucket = hash_to_bucket(ctx.hash);
    conn_key_lookup(&ct->buckets[bucket], &ctx, now);
    return ctx.conn;
}

static void
nat_clean(struct conntrack *ct, struct conn *conn,
          struct conntrack_bucket *ctb)
    OVS_REQUIRES(ctb->lock)
{
    long long now = time_msec();
    ct_rwlock_wrlock(&ct->resources_lock);
    nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
    ct_rwlock_unlock(&ct->resources_lock);
    ct_lock_unlock(&ctb->lock);

    uint32_t hash_rev_conn = conn_key_hash(&conn->rev_key, ct->hash_basis);
    unsigned bucket_rev_conn = hash_to_bucket(hash_rev_conn);

    ct_lock_lock(&ct->buckets[bucket_rev_conn].lock);
    ct_rwlock_wrlock(&ct->resources_lock);

    struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now);

    struct nat_conn_key_node *nat_conn_key_node =
        nat_conn_keys_lookup(&ct->nat_conn_keys, &conn->rev_key,
                             ct->hash_basis);

    /* In the unlikely event that the reverse connection was recreated in the
     * meantime, skip the rev_conn cleanup. */
    if (rev_conn && (!nat_conn_key_node ||
                     memcmp(&nat_conn_key_node->value, &rev_conn->rev_key,
                            sizeof nat_conn_key_node->value))) {
        hmap_remove(&ct->buckets[bucket_rev_conn].connections,
                    &rev_conn->node);
        free(rev_conn);
    }
    delete_conn(conn);

    ct_rwlock_unlock(&ct->resources_lock);
    ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock);
    ct_lock_lock(&ctb->lock);
}
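
/* A note on the lock dance above (an inference from the code, not a comment
 * from the original file): nat_clean() drops 'ctb->lock' before taking the
 * reverse connection's bucket lock, and re-takes it only after that lock has
 * been released, so no two bucket locks are ever held at the same time.
 * Presumably this avoids lock-order deadlocks between threads cleaning
 * connections whose forward and reverse keys hash to different buckets. */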

static void
conn_clean(struct conntrack *ct, struct conn *conn,
           struct conntrack_bucket *ctb)
    OVS_REQUIRES(ctb->lock)
{
    ovs_list_remove(&conn->exp_node);
    hmap_remove(&ctb->connections, &conn->node);
    atomic_count_dec(&ct->n_conn);
    if (conn->nat_info) {
        nat_clean(ct, conn, ctb);
    } else {
        delete_conn(conn);
    }
}

/* This function is called with the bucket lock held. */
static struct conn *
conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
               struct conn_lookup_ctx *ctx, bool commit, long long now,
               const struct nat_action_info_t *nat_action_info,
               struct conn *conn_for_un_nat_copy)
{
    unsigned bucket = hash_to_bucket(ctx->hash);
    struct conn *nc = NULL;

    if (!valid_new(pkt, &ctx->key)) {
        pkt->md.ct_state = CS_INVALID;
        return nc;
    }
    pkt->md.ct_state = CS_NEW;

    if (commit) {
        unsigned int n_conn_limit;

        atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);

        if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
            COVERAGE_INC(conntrack_full);
            return nc;
        }

        nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
        ctx->conn = nc;
        nc->rev_key = nc->key;
        conn_key_reverse(&nc->rev_key);

        if (nat_action_info) {
            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
            ct_rwlock_wrlock(&ct->resources_lock);

            bool nat_res = nat_select_range_tuple(ct, nc,
                                                  conn_for_un_nat_copy);

            if (!nat_res) {
                free(nc->nat_info);
                nc->nat_info = NULL;
                free(nc);
                ct_rwlock_unlock(&ct->resources_lock);
                return NULL;
            }

            if (conn_for_un_nat_copy &&
                nc->conn_type == CT_CONN_TYPE_DEFAULT) {
                *nc = *conn_for_un_nat_copy;
                conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
                conn_for_un_nat_copy->nat_info = NULL;
            }
            ct_rwlock_unlock(&ct->resources_lock);

            nat_packet(pkt, nc, ctx->icmp_related);
        }
        hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
        atomic_count_inc(&ct->n_conn);
    }
    return nc;
}

static bool
conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
                  struct conn_lookup_ctx *ctx, struct conn **conn,
                  long long now, unsigned bucket)
    OVS_REQUIRES(ct->buckets[bucket].lock)
{
    bool create_new_conn = false;

    if (ctx->icmp_related) {
        pkt->md.ct_state |= CS_RELATED;
        if (ctx->reply) {
            pkt->md.ct_state |= CS_REPLY_DIR;
        }
    } else {
        enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket],
                                             pkt, ctx->reply, now);

        switch (res) {
        case CT_UPDATE_VALID:
            pkt->md.ct_state |= CS_ESTABLISHED;
            pkt->md.ct_state &= ~CS_NEW;
            if (ctx->reply) {
                pkt->md.ct_state |= CS_REPLY_DIR;
            }
            break;
        case CT_UPDATE_INVALID:
            pkt->md.ct_state = CS_INVALID;
            break;
        case CT_UPDATE_NEW:
            conn_clean(ct, *conn, &ct->buckets[bucket]);
            create_new_conn = true;
            break;
        default:
            OVS_NOT_REACHED();
        }
    }
    return create_new_conn;
}

static void
create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
                   long long now)
{
    struct conn *nc = xmemdup(conn_for_un_nat_copy, sizeof *nc);
    nc->key = conn_for_un_nat_copy->rev_key;
    nc->rev_key = conn_for_un_nat_copy->key;
    uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis);
    unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash);
    ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock);
    ct_rwlock_rdlock(&ct->resources_lock);

    struct conn *rev_conn = conn_lookup(ct, &nc->key, now);

    struct nat_conn_key_node *nat_conn_key_node =
        nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis);
    if (nat_conn_key_node
        && !memcmp(&nat_conn_key_node->value, &nc->rev_key,
                   sizeof nat_conn_key_node->value)
        && !rev_conn) {
        hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
                    &nc->node, un_nat_hash);
    } else {
        free(nc);
    }
    ct_rwlock_unlock(&ct->resources_lock);
    ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock);
}

static void
handle_nat(struct dp_packet *pkt, struct conn *conn,
           uint16_t zone, bool reply, bool related)
{
    if (conn->nat_info &&
        (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
         (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
          zone != pkt->md.ct_zone))) {
        if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
            pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
        }
        if (reply) {
            un_nat_packet(pkt, conn, related);
        } else {
            nat_packet(pkt, conn, related);
        }
    }
}

static bool
check_orig_tuple(struct conntrack *ct, struct dp_packet *pkt,
                 struct conn_lookup_ctx *ctx_in, long long now,
                 unsigned *bucket, struct conn **conn,
                 const struct nat_action_info_t *nat_action_info)
    OVS_REQUIRES(ct->buckets[*bucket].lock)
{
    if ((ctx_in->key.dl_type == htons(ETH_TYPE_IP) &&
         !pkt->md.ct_orig_tuple.ipv4.ipv4_proto) ||
        (ctx_in->key.dl_type == htons(ETH_TYPE_IPV6) &&
         !pkt->md.ct_orig_tuple.ipv6.ipv6_proto) ||
        !(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
        nat_action_info) {
        return false;
    }

    ct_lock_unlock(&ct->buckets[*bucket].lock);
    struct conn_lookup_ctx ctx;
    memset(&ctx, 0, sizeof ctx);
    ctx.conn = NULL;

    if (ctx_in->key.dl_type == htons(ETH_TYPE_IP)) {
        ctx.key.src.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_src;
        ctx.key.dst.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_dst;

        if (ctx_in->key.nw_proto == IPPROTO_ICMP) {
            ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
            ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
            uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv4.src_port);
            ctx.key.src.icmp_type = (uint8_t) src_port;
            ctx.key.dst.icmp_type = reverse_icmp_type(ctx.key.src.icmp_type);
        } else {
            ctx.key.src.port = pkt->md.ct_orig_tuple.ipv4.src_port;
            ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv4.dst_port;
        }
        ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv4.ipv4_proto;
    } else {
        ctx.key.src.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_src;
        ctx.key.dst.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_dst;

        if (ctx_in->key.nw_proto == IPPROTO_ICMPV6) {
            ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
            ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
            uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv6.src_port);
            ctx.key.src.icmp_type = (uint8_t) src_port;
            ctx.key.dst.icmp_type = reverse_icmp6_type(ctx.key.src.icmp_type);
        } else {
            ctx.key.src.port = pkt->md.ct_orig_tuple.ipv6.src_port;
            ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv6.dst_port;
        }
        ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv6.ipv6_proto;
    }

    ctx.key.dl_type = ctx_in->key.dl_type;
    ctx.key.zone = pkt->md.ct_zone;

    ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
    *bucket = hash_to_bucket(ctx.hash);
    ct_lock_lock(&ct->buckets[*bucket].lock);
    conn_key_lookup(&ct->buckets[*bucket], &ctx, now);
    *conn = ctx.conn;

    return *conn ? true : false;
}

static void
process_one(struct conntrack *ct, struct dp_packet *pkt,
            struct conn_lookup_ctx *ctx, uint16_t zone,
            bool force, bool commit, long long now, const uint32_t *setmark,
            const struct ovs_key_ct_labels *setlabel,
            const struct nat_action_info_t *nat_action_info)
{
    struct conn *conn;
    unsigned bucket = hash_to_bucket(ctx->hash);
    ct_lock_lock(&ct->buckets[bucket].lock);
    conn_key_lookup(&ct->buckets[bucket], ctx, now);
    conn = ctx->conn;

    /* Delete found entry if in wrong direction. 'force' implies commit. */
    if (conn && force && ctx->reply) {
        conn_clean(ct, conn, &ct->buckets[bucket]);
        conn = NULL;
    }

    if (OVS_LIKELY(conn)) {
        if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {

            ctx->reply = true;

            struct conn_lookup_ctx ctx2;
            ctx2.conn = NULL;
            ctx2.key = conn->rev_key;
            ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis);

            ct_lock_unlock(&ct->buckets[bucket].lock);
            bucket = hash_to_bucket(ctx2.hash);

            ct_lock_lock(&ct->buckets[bucket].lock);
            conn_key_lookup(&ct->buckets[bucket], &ctx2, now);

            if (ctx2.conn) {
                conn = ctx2.conn;
            } else {
                /* This is a race: the connection timed out and was removed
                 * between the unlock of the rev_conn bucket and the lock of
                 * the forward conn bucket; there is nothing to do. */
                pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
                ct_lock_unlock(&ct->buckets[bucket].lock);
                return;
            }
        }
    }

    bool create_new_conn = false;
    struct conn conn_for_un_nat_copy;
    conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;
    if (OVS_LIKELY(conn)) {
        create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
        if (nat_action_info && !create_new_conn) {
            handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
        }
    } else if (check_orig_tuple(ct, pkt, ctx, now, &bucket, &conn,
                                nat_action_info)) {
        create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
    } else {
        if (ctx->icmp_related) {
            pkt->md.ct_state = CS_INVALID;
        } else {
            create_new_conn = true;
        }
    }

    if (OVS_UNLIKELY(create_new_conn)) {
        conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
                              &conn_for_un_nat_copy);
    }

    write_ct_md(pkt, zone, conn, &ctx->key);
    if (conn && setmark) {
        set_mark(pkt, conn, setmark[0], setmark[1]);
    }

    if (conn && setlabel) {
        set_label(pkt, conn, &setlabel[0], &setlabel[1]);
    }

    ct_lock_unlock(&ct->buckets[bucket].lock);

    if (conn_for_un_nat_copy.conn_type == CT_CONN_TYPE_UN_NAT) {
        create_un_nat_conn(ct, &conn_for_un_nat_copy, now);
    }
}

/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
 * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
 * the l3 and l4 offsets properly set.
 *
 * If 'commit' is true, the packets are allowed to create new entries in the
 * connection tables.  'setmark', if not NULL, should point to a two-element
 * array containing a value and a mask to set the connection mark.
 * 'setlabel' behaves similarly for the connection label. */
int
conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
                  ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
                  const uint32_t *setmark,
                  const struct ovs_key_ct_labels *setlabel,
                  const char *helper,
                  const struct nat_action_info_t *nat_action_info)
{
    struct dp_packet **pkts = pkt_batch->packets;
    size_t cnt = pkt_batch->count;
    long long now = time_msec();
    struct conn_lookup_ctx ctx;

    if (helper) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);

        VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
        /* Continue without the helper. */
    }

    for (size_t i = 0; i < cnt; i++) {
        if (!conn_key_extract(ct, pkts[i], dl_type, &ctx, zone)) {
            pkts[i]->md.ct_state = CS_INVALID;
            write_ct_md(pkts[i], zone, NULL, NULL);
            continue;
        }
        process_one(ct, pkts[i], &ctx, zone, force, commit,
                    now, setmark, setlabel, nat_action_info);
    }

    return 0;
}
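
/* A minimal invocation sketch (not part of the original file), assuming 'ct'
 * was set up with conntrack_init() and 'batch' holds IPv4 packets with their
 * l3/l4 offsets already set: commit new connections in zone 0, with no
 * forced direction, no mark/label changes, no ALG helper and no NAT:
 *
 *     conntrack_execute(ct, batch, htons(ETH_TYPE_IP),
 *                       false, true, 0, NULL, NULL, NULL, NULL);
 */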

static void
set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
{
    pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
    conn->mark = pkt->md.ct_mark;
}
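
/* A worked example (a sketch): with an existing ct_mark of 0b1010,
 * val == 0b0100 and mask == 0b0110, the result is
 * 0b0100 | (0b1010 & ~0b0110) == 0b1100: only the bits selected by 'mask'
 * take their new value from 'val'; the remaining bits keep their old value. */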

static void
set_label(struct dp_packet *pkt, struct conn *conn,
          const struct ovs_key_ct_labels *val,
          const struct ovs_key_ct_labels *mask)
{
    ovs_u128 v, m;

    memcpy(&v, val, sizeof v);
    memcpy(&m, mask, sizeof m);

    pkt->md.ct_label.u64.lo = v.u64.lo
                              | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
    pkt->md.ct_label.u64.hi = v.u64.hi
                              | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
    conn->label = pkt->md.ct_label;
}

\f
/* Deletes the expired connections from 'ctb', up to 'limit'.  Returns the
 * earliest expiration time among the remaining connections in 'ctb'.
 * Returns LLONG_MAX if 'ctb' is empty.  The return value might be smaller
 * than 'now', if 'limit' is reached. */
static long long
sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
             size_t limit)
    OVS_REQUIRES(ctb->lock)
{
    struct conn *conn, *next;
    long long min_expiration = LLONG_MAX;
    unsigned i;
    size_t count = 0;

    for (i = 0; i < N_CT_TM; i++) {
        LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
            if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
                if (!conn_expired(conn, now) || count >= limit) {
                    min_expiration = MIN(min_expiration, conn->expiration);
                    if (count >= limit) {
                        /* Do not check other lists. */
                        COVERAGE_INC(conntrack_long_cleanup);
                        return min_expiration;
                    }
                    break;
                }
                conn_clean(ct, conn, ctb);
                count++;
            }
        }
    }

    return min_expiration;
}

/* Cleans up old connection entries from 'ct'.  Returns the time when the
 * next expiration might happen.  The return value might be smaller than
 * 'now', meaning that an internal limit has been reached, and some expired
 * connections have not been deleted. */
static long long
conntrack_clean(struct conntrack *ct, long long now)
{
    long long next_wakeup = now + CT_TM_MIN;
    unsigned int n_conn_limit;
    size_t clean_count = 0;
    unsigned i;

    atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);

    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];
        size_t prev_count;
        long long min_exp;

        ovs_mutex_lock(&ctb->cleanup_mutex);
        if (ctb->next_cleanup > now) {
            goto next_bucket;
        }

        ct_lock_lock(&ctb->lock);
        prev_count = hmap_count(&ctb->connections);
        /* If the connections are well distributed among buckets, we want to
         * limit to 10% of the global limit equally split among buckets.  If
         * the bucket is busier than the others, we limit to 10% of its
         * current size. */
        min_exp = sweep_bucket(ct, ctb, now,
                               MAX(prev_count / 10,
                                   n_conn_limit / (CONNTRACK_BUCKETS * 10)));
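        /* For example (a sketch, assuming the upstream values
         * CONNTRACK_BUCKETS == 256 and DEFAULT_N_CONN_LIMIT == 3,000,000):
         * the global term permits 3000000 / (256 * 10), i.e. about 1171
         * deletions per bucket per pass, while a bucket that has grown to
         * 100,000 entries may instead clean up to 10,000 in one pass. */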
        clean_count += prev_count - hmap_count(&ctb->connections);

        if (min_exp > now) {
            /* We call hmap_shrink() only if sweep_bucket() managed to delete
             * every expired connection. */
            hmap_shrink(&ctb->connections);
        }

        ct_lock_unlock(&ctb->lock);

        ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);

    next_bucket:
        next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
        ovs_mutex_unlock(&ctb->cleanup_mutex);
    }

    VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
             clean_count, time_msec() - now);

    return next_wakeup;
}

/* Cleanup:
 *
 * We must call conntrack_clean() periodically.  conntrack_clean()'s return
 * value gives a hint on when the next cleanup must be done (either because
 * there is an actual connection that expires, or because a new connection
 * might be created with the minimum timeout).
 *
 * The logic below has two goals:
 *
 * - We want to reduce the number of wakeups and batch connection cleanup
 *   when the load is not very high.  CT_CLEAN_INTERVAL ensures that if we
 *   are coping with the current cleanup tasks, then we wait at least
 *   5 seconds to do further cleanup.
 *
 * - We don't want to keep the buckets locked too long, as we might prevent
 *   traffic from flowing.  CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
 *   behind, there are still 200ms-long blocks of time when the buckets are
 *   left alone, so the datapath can operate unhindered.
 */
#define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
#define CT_CLEAN_MIN_INTERVAL 200  /* 0.2 seconds */

static void *
clean_thread_main(void *f_)
{
    struct conntrack *ct = f_;

    while (!latch_is_set(&ct->clean_thread_exit)) {
        long long next_wake;
        long long now = time_msec();

        next_wake = conntrack_clean(ct, now);

        if (next_wake < now) {
            poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
        } else {
            poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
        }
        latch_wait(&ct->clean_thread_exit);
        poll_block();
    }

    return NULL;
}
\f
/* Key extraction */

/* The function stores a pointer to the first byte after the header in
 * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 * not interested in the header's tail, meaning that the header has
 * already been parsed (e.g. by flow_extract): we take this as a hint to
 * save a few checks.  If 'validate_checksum' is true, the function returns
 * false if the IPv4 checksum is invalid. */
static inline bool
extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
                const char **new_data, bool validate_checksum)
{
    const struct ip_header *ip = data;
    size_t ip_len;

    if (new_data) {
        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
            return false;
        }
    }

    ip_len = IP_IHL(ip->ip_ihl_ver) * 4;

    if (new_data) {
        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
            return false;
        }
        if (OVS_UNLIKELY(size < ip_len)) {
            return false;
        }

        *new_data = (char *) data + ip_len;
    }

    if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
        return false;
    }

    if (validate_checksum && csum(data, ip_len) != 0) {
        return false;
    }

    key->src.addr.ipv4 = ip->ip_src;
    key->dst.addr.ipv4 = ip->ip_dst;
    key->nw_proto = ip->ip_proto;

    return true;
}

/* The function stores a pointer to the first byte after the header in
 * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 * not interested in the header's tail, meaning that the header has
 * already been parsed (e.g. by flow_extract): we take this as a hint to
 * save a few checks. */
static inline bool
extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
                const char **new_data)
{
    const struct ovs_16aligned_ip6_hdr *ip6 = data;

    if (new_data) {
        if (OVS_UNLIKELY(size < sizeof *ip6)) {
            return false;
        }
    }

    uint8_t nw_proto = ip6->ip6_nxt;
    uint8_t nw_frag = 0;

    data = ip6 + 1;
    size -= sizeof *ip6;

    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
        return false;
    }

    if (new_data) {
        *new_data = data;
    }

    if (nw_frag) {
        return false;
    }

    key->src.addr.ipv6 = ip6->ip6_src;
    key->dst.addr.ipv6 = ip6->ip6_dst;
    key->nw_proto = nw_proto;

    return true;
}

static inline bool
checksum_valid(const struct conn_key *key, const void *data, size_t size,
               const void *l3)
{
    uint32_t csum = 0;

    if (key->dl_type == htons(ETH_TYPE_IP)) {
        csum = packet_csum_pseudoheader(l3);
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
        csum = packet_csum_pseudoheader6(l3);
    } else {
        return false;
    }

    csum = csum_continue(csum, data, size);

    return csum_finish(csum) == 0;
}

static inline bool
check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
             const void *l3, bool validate_checksum)
{
    const struct tcp_header *tcp = data;
    if (size < sizeof *tcp) {
        return false;
    }

    size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
    if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
        return false;
    }

    return validate_checksum ? checksum_valid(key, data, size, l3) : true;
}

static inline bool
check_l4_udp(const struct conn_key *key, const void *data, size_t size,
             const void *l3, bool validate_checksum)
{
    const struct udp_header *udp = data;
    if (size < sizeof *udp) {
        return false;
    }

    size_t udp_len = ntohs(udp->udp_len);
    if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
        return false;
    }

    /* Validation must be skipped if checksum is 0 on IPv4 packets */
    return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
           || (validate_checksum ? checksum_valid(key, data, size, l3) : true);
}

static inline bool
check_l4_icmp(const void *data, size_t size, bool validate_checksum)
{
    return validate_checksum ? csum(data, size) == 0 : true;
}

static inline bool
check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
               const void *l3, bool validate_checksum)
{
    return validate_checksum ? checksum_valid(key, data, size, l3) : true;
}

static inline bool
extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
{
    const struct tcp_header *tcp = data;

    if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
        return false;
    }

    key->src.port = tcp->tcp_src;
    key->dst.port = tcp->tcp_dst;

    /* Port 0 is invalid */
    return key->src.port && key->dst.port;
}

static inline bool
extract_l4_udp(struct conn_key *key, const void *data, size_t size)
{
    const struct udp_header *udp = data;

    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
        return false;
    }

    key->src.port = udp->udp_src;
    key->dst.port = udp->udp_dst;

    /* Port 0 is invalid */
    return key->src.port && key->dst.port;
}

static inline bool extract_l4(struct conn_key *key, const void *data,
                              size_t size, bool *related, const void *l3,
                              bool validate_checksum);

static uint8_t
reverse_icmp_type(uint8_t type)
{
    switch (type) {
    case ICMP4_ECHO_REQUEST:
        return ICMP4_ECHO_REPLY;
    case ICMP4_ECHO_REPLY:
        return ICMP4_ECHO_REQUEST;

    case ICMP4_TIMESTAMP:
        return ICMP4_TIMESTAMPREPLY;
    case ICMP4_TIMESTAMPREPLY:
        return ICMP4_TIMESTAMP;

    case ICMP4_INFOREQUEST:
        return ICMP4_INFOREPLY;
    case ICMP4_INFOREPLY:
        return ICMP4_INFOREQUEST;
    default:
        OVS_NOT_REACHED();
    }
}

/* If 'related' is not NULL and the function is processing an ICMP
 * error packet, extract the l3 and l4 fields from the nested header
 * instead and set *related to true.  If 'related' is NULL we're
 * already processing a nested header and no such recursion is
 * possible */
static inline bool
extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
                bool *related)
{
    const struct icmp_header *icmp = data;

    if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
        return false;
    }

    switch (icmp->icmp_type) {
    case ICMP4_ECHO_REQUEST:
    case ICMP4_ECHO_REPLY:
    case ICMP4_TIMESTAMP:
    case ICMP4_TIMESTAMPREPLY:
    case ICMP4_INFOREQUEST:
    case ICMP4_INFOREPLY:
        if (icmp->icmp_code != 0) {
            return false;
        }
        /* Separate ICMP connection: identified using id */
        key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
        key->src.icmp_type = icmp->icmp_type;
        key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
        break;
    case ICMP4_DST_UNREACH:
    case ICMP4_TIME_EXCEEDED:
    case ICMP4_PARAM_PROB:
    case ICMP4_SOURCEQUENCH:
    case ICMP4_REDIRECT: {
        /* ICMP packet part of another connection. We should
         * extract the key from embedded packet header */
        struct conn_key inner_key;
        const char *l3 = (const char *) (icmp + 1);
        const char *tail = (const char *) data + size;
        const char *l4;
        bool ok;

        if (!related) {
            return false;
        }

        memset(&inner_key, 0, sizeof inner_key);
        inner_key.dl_type = htons(ETH_TYPE_IP);
        ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
        if (!ok) {
            return false;
        }

        if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
            || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
            return false;
        }

        key->src = inner_key.src;
        key->dst = inner_key.dst;
        key->nw_proto = inner_key.nw_proto;

        ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
        if (ok) {
            conn_key_reverse(key);
            *related = true;
        }
        return ok;
    }
    default:
        return false;
    }

    return true;
}

static uint8_t
reverse_icmp6_type(uint8_t type)
{
    switch (type) {
    case ICMP6_ECHO_REQUEST:
        return ICMP6_ECHO_REPLY;
    case ICMP6_ECHO_REPLY:
        return ICMP6_ECHO_REQUEST;
    default:
        OVS_NOT_REACHED();
    }
}

/* If 'related' is not NULL and the function is processing an ICMP
 * error packet, extract the l3 and l4 fields from the nested header
 * instead and set *related to true.  If 'related' is NULL we're
 * already processing a nested header and no such recursion is
 * possible */
static inline bool
extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
                 bool *related)
{
    const struct icmp6_header *icmp6 = data;

    /* All the messages that we support need at least 4 bytes after
     * the header */
    if (size < sizeof *icmp6 + 4) {
        return false;
    }

    switch (icmp6->icmp6_type) {
    case ICMP6_ECHO_REQUEST:
    case ICMP6_ECHO_REPLY:
        if (icmp6->icmp6_code != 0) {
            return false;
        }
        /* Separate ICMP connection: identified using id */
        key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
        key->src.icmp_type = icmp6->icmp6_type;
        key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
        break;
    case ICMP6_DST_UNREACH:
    case ICMP6_PACKET_TOO_BIG:
    case ICMP6_TIME_EXCEEDED:
    case ICMP6_PARAM_PROB: {
        /* ICMP packet part of another connection. We should
         * extract the key from embedded packet header */
        struct conn_key inner_key;
        const char *l3 = (const char *) icmp6 + 8;
        const char *tail = (const char *) data + size;
        const char *l4 = NULL;
        bool ok;

        if (!related) {
            return false;
        }

        memset(&inner_key, 0, sizeof inner_key);
        inner_key.dl_type = htons(ETH_TYPE_IPV6);
        ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
        if (!ok) {
            return false;
        }

        /* pf doesn't do this, but it seems a good idea */
        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
                              &key->dst.addr.ipv6_aligned)
            || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
                                 &key->src.addr.ipv6_aligned)) {
            return false;
        }

        key->src = inner_key.src;
        key->dst = inner_key.dst;
        key->nw_proto = inner_key.nw_proto;

        ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
        if (ok) {
            conn_key_reverse(key);
            *related = true;
        }
        return ok;
    }
    default:
        return false;
    }

    return true;
}

/* Extract l4 fields into 'key', which must already contain valid l3
 * members.
 *
 * If 'related' is not NULL and an ICMP error packet is being
 * processed, the function will extract the key from the packet nested
 * in the ICMP payload and set '*related' to true.
 *
 * If 'related' is NULL, it means that we're already parsing a header nested
 * in an ICMP error.  In this case, we skip checksum and length validation. */
static inline bool
extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
           const void *l3, bool validate_checksum)
{
    if (key->nw_proto == IPPROTO_TCP) {
        return (!related || check_l4_tcp(key, data, size, l3,
                validate_checksum)) && extract_l4_tcp(key, data, size);
    } else if (key->nw_proto == IPPROTO_UDP) {
        return (!related || check_l4_udp(key, data, size, l3,
                validate_checksum)) && extract_l4_udp(key, data, size);
    } else if (key->dl_type == htons(ETH_TYPE_IP)
               && key->nw_proto == IPPROTO_ICMP) {
        return (!related || check_l4_icmp(data, size, validate_checksum))
               && extract_l4_icmp(key, data, size, related);
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)
               && key->nw_proto == IPPROTO_ICMPV6) {
        return (!related || check_l4_icmp6(key, data, size, l3,
                validate_checksum)) && extract_l4_icmp6(key, data, size,
                related);
    } else {
        return false;
    }
}
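
/* A walk-through of the 'related' recursion above (a sketch): suppose an
 * ICMP4_DST_UNREACH arrives whose payload embeds the IP and TCP headers of
 * the packet that triggered it.  extract_l4() dispatches to
 * extract_l4_icmp(), which parses the embedded headers with extract_l3_ipv4()
 * and a nested extract_l4() call ('related' == NULL, so no further recursion
 * and no checksum or length validation), replaces 'key' with the embedded
 * 5-tuple, reverses it to match the original connection's direction, and
 * sets '*related' to true so the caller can flag the packet CS_RELATED. */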

static bool
conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
                 struct conn_lookup_ctx *ctx, uint16_t zone)
{
    const struct eth_header *l2 = dp_packet_eth(pkt);
    const struct ip_header *l3 = dp_packet_l3(pkt);
    const char *l4 = dp_packet_l4(pkt);
    const char *tail = dp_packet_tail(pkt);
    bool ok;

    memset(ctx, 0, sizeof *ctx);

    if (!l2 || !l3 || !l4) {
        return false;
    }

    ctx->key.zone = zone;

    /* XXX In this function we parse the packet (again, it has already
     * gone through miniflow_extract()) for two reasons:
     *
     * 1) To extract the l3 addresses and l4 ports.
     *    We already have the l3 and l4 headers' pointers.  Extracting
     *    the l3 addresses and the l4 ports is really cheap, since they
     *    can be found at fixed locations.
     * 2) To extract the l4 type.
     *    Extracting the l4 types, for IPv6 can be quite expensive, because
     *    it's not at a fixed location.
     *
     * Here's a way to avoid (2) with the help of the datapath.
     * The datapath doesn't keep the packet's extracted flow[1], so
     * using that is not an option.  We could use the packet's matching
     * megaflow, but we have to make sure that the l4 type (nw_proto)
     * is unwildcarded.  This means either:
     *
     * a) dpif-netdev unwildcards the l4 type when a new flow is installed
     *    if the actions contains ct().
     *
     * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
     *    action.  This is already done in different actions, but it's
     *    unnecessary for the kernel.
     *
     * ---
     * [1] The reasons for this are that keeping the flow increases
     *     (slightly) the cache footprint and increases computation
     *     time as we move the packet around.  Most importantly, the flow
     *     should be updated by the actions and this can be slow, as
     *     we use a sparse representation (miniflow).
     */
    ctx->key.dl_type = dl_type;
    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
        bool hwol_bad_l3_csum = dp_packet_ip_checksum_bad(pkt);
        if (hwol_bad_l3_csum) {
            ok = false;
        } else {
            bool hwol_good_l3_csum = dp_packet_ip_checksum_valid(pkt);
            /* Validate the checksum only when hwol is not supported. */
            ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL,
                                 !hwol_good_l3_csum);
        }
    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
        ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
    } else {
        ok = false;
    }

    if (ok) {
        bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt);
        if (!hwol_bad_l4_csum) {
            bool hwol_good_l4_csum = dp_packet_l4_checksum_valid(pkt);
            /* Validate the checksum only when hwol is not supported. */
            if (extract_l4(&ctx->key, l4, tail - l4, &ctx->icmp_related, l3,
                           !hwol_good_l4_csum)) {
                ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
                return true;
            }
        }
    }

    return false;
}

static uint32_t
ct_addr_hash_add(uint32_t hash, const struct ct_addr *addr)
{
    BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
    return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
}

static uint32_t
ct_endpoint_hash_add(uint32_t hash, const struct ct_endpoint *ep)
{
    BUILD_ASSERT_DECL(sizeof *ep % 4 == 0);
    return hash_add_bytes32(hash, (const uint32_t *) ep, sizeof *ep);
}
\f
/* Symmetric */
static uint32_t
conn_key_hash(const struct conn_key *key, uint32_t basis)
{
    uint32_t hsrc, hdst, hash;

    hsrc = hdst = basis;
    hsrc = ct_endpoint_hash_add(hsrc, &key->src);
    hdst = ct_endpoint_hash_add(hdst, &key->dst);

    /* Even if source and destination are swapped the hash will be the same. */
    hash = hsrc ^ hdst;

    /* Hash the rest of the key (L3 and L4 types and zone). */
    hash = hash_words((uint32_t *) (&key->dst + 1),
                      (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
                      hash);

    return hash_finish(hash, 0);
}
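
/* A consequence worth spelling out (a sketch, not an original comment):
 * conn_key_reverse() only swaps 'src' and 'dst', XOR is commutative, and the
 * remaining hashed words (L3/L4 types and zone) are direction-independent,
 * so for any key:
 *
 *     conn_key_hash(&key, basis) == conn_key_hash(&reversed_key, basis)
 *
 * For un-NATed connections, whose 'rev_key' is the exact reversal of 'key',
 * this lets conn_key_lookup() probe a single bucket and match a packet in
 * either direction; NATed reverse traffic is handled instead through the
 * extra CT_CONN_TYPE_UN_NAT entries. */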

static void
conn_key_reverse(struct conn_key *key)
{
    struct ct_endpoint tmp;

    tmp = key->src;
    key->src = key->dst;
    key->dst = tmp;
}

static uint32_t
nat_ipv6_addrs_delta(struct in6_addr *ipv6_aligned_min,
                     struct in6_addr *ipv6_aligned_max)
{
    uint8_t *ipv6_min_hi = &ipv6_aligned_min->s6_addr[0];
    uint8_t *ipv6_min_lo = &ipv6_aligned_min->s6_addr[0] + sizeof(uint64_t);
    uint8_t *ipv6_max_hi = &ipv6_aligned_max->s6_addr[0];
    uint8_t *ipv6_max_lo = &ipv6_aligned_max->s6_addr[0] + sizeof(uint64_t);

    ovs_be64 addr6_64_min_hi;
    ovs_be64 addr6_64_min_lo;
    memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
    memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);

    ovs_be64 addr6_64_max_hi;
    ovs_be64 addr6_64_max_lo;
    memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
    memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);

    uint64_t diff;
    if (addr6_64_min_hi == addr6_64_max_hi &&
        ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
        diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
    } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
               ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
        diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
                             ntohll(addr6_64_max_lo) - 1);
    } else {
        /* Limit address delta supported to 32 bits or 4 billion approximately.
         * Possibly, this should be visible to the user through a datapath
         * support check, however the practical impact is probably nil. */
        diff = 0xfffffffe;
    }
    if (diff > 0xfffffffe) {
        diff = 0xfffffffe;
    }
    return diff;
}
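
/* A worked example (a sketch): for the range 2001:db8::1 - 2001:db8::5 the
 * upper 64 bits of both addresses match, so the delta is simply
 * 0x5 - 0x1 == 4.  A huge range such as a whole /32 falls into the final
 * 'else' and is clamped to the 32-bit cap of 0xfffffffe. */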

/* This function must be used in tandem with nat_ipv6_addrs_delta(), which
 * restricts the input parameters. */
static void
nat_ipv6_addr_increment(struct in6_addr *ipv6_aligned, uint32_t increment)
{
    uint8_t *ipv6_hi = &ipv6_aligned->s6_addr[0];
    uint8_t *ipv6_lo = &ipv6_aligned->s6_addr[0] + sizeof(ovs_be64);
    ovs_be64 addr6_64_hi;
    ovs_be64 addr6_64_lo;
    memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
    memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);

    if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
        addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
    } else if (addr6_64_hi != OVS_BE64_MAX) {
        addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
        addr6_64_lo = htonll(increment - (UINT64_MAX -
                                          ntohll(addr6_64_lo) + 1));
    } else {
        OVS_NOT_REACHED();
    }

    memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
    memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
}
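
/* A carry example (a sketch): incrementing an address whose low 64 bits are
 * all ones (...:ffff:ffff:ffff:ffff) by 1 overflows the low half, so the
 * second branch above runs: the high half gains 1 and the new low word is
 * 1 - (UINT64_MAX - UINT64_MAX + 1) == 0, i.e. the low half wraps to zero. */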

static uint32_t
nat_range_hash(const struct conn *conn, uint32_t basis)
{
    uint32_t hash = basis;

    hash = ct_addr_hash_add(hash, &conn->nat_info->min_addr);
    hash = ct_addr_hash_add(hash, &conn->nat_info->max_addr);
    hash = hash_add(hash,
                    (conn->nat_info->max_port << 16)
                    | conn->nat_info->min_port);

    hash = ct_endpoint_hash_add(hash, &conn->key.src);
    hash = ct_endpoint_hash_add(hash, &conn->key.dst);

    hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
    hash = hash_add(hash, conn->key.nw_proto);
    hash = hash_add(hash, conn->key.zone);

    /* The purpose of the second parameter is to distinguish hashes of data of
     * different length; our data always has the same length so there is no
     * value in counting. */
    return hash_finish(hash, 0);
}

static bool
nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
                       struct conn *nat_conn)
{
#define MIN_NAT_EPHEMERAL_PORT 1024
#define MAX_NAT_EPHEMERAL_PORT 65535

    uint16_t min_port;
    uint16_t max_port;
    uint16_t first_port;

    uint32_t hash = nat_range_hash(conn, ct->hash_basis);

    if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
        (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
        min_port = ntohs(conn->key.src.port);
        max_port = ntohs(conn->key.src.port);
        first_port = min_port;
    } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
               (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
        min_port = ntohs(conn->key.dst.port);
        max_port = ntohs(conn->key.dst.port);
        first_port = min_port;
    } else {
        uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
        uint32_t port_index = hash % (deltap + 1);
        first_port = conn->nat_info->min_port + port_index;
        min_port = conn->nat_info->min_port;
        max_port = conn->nat_info->max_port;
    }

    uint32_t deltaa = 0;
    uint32_t address_index;
    struct ct_addr ct_addr;
    memset(&ct_addr, 0, sizeof ct_addr);
    struct ct_addr max_ct_addr;
    memset(&max_ct_addr, 0, sizeof max_ct_addr);
    max_ct_addr = conn->nat_info->max_addr;

    if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
        deltaa = ntohl(conn->nat_info->max_addr.ipv4_aligned) -
                 ntohl(conn->nat_info->min_addr.ipv4_aligned);
        address_index = hash % (deltaa + 1);
        ct_addr.ipv4_aligned = htonl(
            ntohl(conn->nat_info->min_addr.ipv4_aligned) + address_index);
    } else {
        deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6_aligned,
                                      &conn->nat_info->max_addr.ipv6_aligned);
        /* deltaa must be within 32 bits for full hash coverage.  A 64 or
         * 128 bit hash is unnecessary and hence not used here.  Most code
         * is kept common with V4; nat_ipv6_addrs_delta() will do the
         * enforcement via max_ct_addr. */
        max_ct_addr = conn->nat_info->min_addr;
        nat_ipv6_addr_increment(&max_ct_addr.ipv6_aligned, deltaa);

        address_index = hash % (deltaa + 1);
        ct_addr.ipv6_aligned = conn->nat_info->min_addr.ipv6_aligned;
        nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, address_index);
    }

    uint16_t port = first_port;
    bool all_ports_tried = false;
    bool original_ports_tried = false;
    struct ct_addr first_addr = ct_addr;
    *nat_conn = *conn;

    while (true) {
        if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
            nat_conn->rev_key.dst.addr = ct_addr;
        } else {
            nat_conn->rev_key.src.addr = ct_addr;
        }

        if ((conn->key.nw_proto == IPPROTO_ICMP) ||
            (conn->key.nw_proto == IPPROTO_ICMPV6)) {
            all_ports_tried = true;
        } else if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
            nat_conn->rev_key.dst.port = htons(port);
        } else {
            nat_conn->rev_key.src.port = htons(port);
        }

        struct nat_conn_key_node *nat_conn_key_node =
            nat_conn_keys_lookup(&ct->nat_conn_keys, &nat_conn->rev_key,
                                 ct->hash_basis);

        if (!nat_conn_key_node) {
            struct nat_conn_key_node *nat_conn_key =
                xzalloc(sizeof *nat_conn_key);
            nat_conn_key->key = nat_conn->rev_key;
            nat_conn_key->value = nat_conn->key;
            uint32_t nat_conn_key_hash = conn_key_hash(&nat_conn_key->key,
                                                       ct->hash_basis);
            hmap_insert(&ct->nat_conn_keys, &nat_conn_key->node,
                        nat_conn_key_hash);
            return true;
        } else if (!all_ports_tried) {
            if (min_port == max_port) {
                all_ports_tried = true;
            } else if (port == max_port) {
                port = min_port;
            } else {
                port++;
            }
            if (port == first_port) {
                all_ports_tried = true;
            }
        } else {
            if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
                if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
                    ct_addr.ipv4_aligned = htonl(
                        ntohl(ct_addr.ipv4_aligned) + 1);
                } else {
                    nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, 1);
                }
            } else {
                ct_addr = conn->nat_info->min_addr;
            }
            if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
                if (!original_ports_tried) {
                    original_ports_tried = true;
                    ct_addr = conn->nat_info->min_addr;
                    min_port = MIN_NAT_EPHEMERAL_PORT;
                    max_port = MAX_NAT_EPHEMERAL_PORT;
                } else {
                    break;
                }
            }
            first_port = min_port;
            port = first_port;
            all_ports_tried = false;
        }
    }
    return false;
}
1803
/* This function must be called with the ct->resources lock taken. */
static struct nat_conn_key_node *
nat_conn_keys_lookup(struct hmap *nat_conn_keys,
                     const struct conn_key *key,
                     uint32_t basis)
{
    struct nat_conn_key_node *nat_conn_key_node;
    uint32_t nat_conn_key_hash = conn_key_hash(key, basis);

    HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash,
                             nat_conn_keys) {
        if (!memcmp(&nat_conn_key_node->key, key,
                    sizeof nat_conn_key_node->key)) {
            return nat_conn_key_node;
        }
    }
    return NULL;
}

/* This function must be called with the ct->resources write lock taken. */
static void
nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,
                     uint32_t basis)
{
    struct nat_conn_key_node *nat_conn_key_node;
    uint32_t nat_conn_key_hash = conn_key_hash(key, basis);

    HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash,
                             nat_conn_keys) {
        if (!memcmp(&nat_conn_key_node->key, key,
                    sizeof nat_conn_key_node->key)) {
            hmap_remove(nat_conn_keys, &nat_conn_key_node->node);
            free(nat_conn_key_node);
            return;
        }
    }
}

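/* Looks up 'ctx->key' in 'ctb', setting 'ctx->conn' to the matching
 * connection (or NULL) and 'ctx->reply' according to whether the key matched
 * the original or the reverse direction.  Expired connections are skipped. */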
static void
conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
                long long now)
    OVS_REQUIRES(ctb->lock)
{
    uint32_t hash = ctx->hash;
    struct conn *conn;

    ctx->conn = NULL;

    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
        if (!memcmp(&conn->key, &ctx->key, sizeof conn->key)
            && !conn_expired(conn, now)) {
            ctx->conn = conn;
            ctx->reply = false;
            break;
        }
        if (!memcmp(&conn->rev_key, &ctx->key, sizeof conn->rev_key)
            && !conn_expired(conn, now)) {
            ctx->conn = conn;
            ctx->reply = true;
            break;
        }
    }
}

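/* Delegates the connection update to the matching L4 protocol module in
 * l4_protos[]. */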
static enum ct_update_res
conn_update(struct conn *conn, struct conntrack_bucket *ctb,
            struct dp_packet *pkt, bool reply, long long now)
{
    return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
                                                      reply, now);
}

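/* Only connections of type CT_CONN_TYPE_DEFAULT expire on their own; NAT
 * bookkeeping entries are presumably removed together with the connection
 * that owns them. */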
static bool
conn_expired(struct conn *conn, long long now)
{
    if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
        return now >= conn->expiration;
    }
    return false;
}

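/* Asks the L4 protocol module whether 'pkt' is acceptable as the first
 * packet of a new connection. */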
static bool
valid_new(struct dp_packet *pkt, struct conn_key *key)
{
    return l4_protos[key->nw_proto]->valid_new(pkt);
}

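/* Has the L4 protocol module allocate and initialize a connection for
 * 'pkt', then fills in the forward key. */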
static struct conn *
new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
         struct conn_key *key, long long now)
{
    struct conn *newconn;

    newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);

    if (newconn) {
        newconn->key = *key;
    }

    return newconn;
}

static void
delete_conn(struct conn *conn)
{
    free(conn->nat_info);
    free(conn);
}

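/* Helpers for translating internal connections into the ct-dpif dump format
 * consumed by the dpctl connection-tracking commands. */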
static void
ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
                                 union ct_dpif_inet_addr *b,
                                 ovs_be16 dl_type)
{
    if (dl_type == htons(ETH_TYPE_IP)) {
        b->ip = a->ipv4_aligned;
    } else if (dl_type == htons(ETH_TYPE_IPV6)) {
        b->in6 = a->ipv6_aligned;
    }
}

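/* Converts 'key' into a ct_dpif_tuple.  ICMP keys carry the id/type/code
 * triplet in place of L4 ports. */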
static void
conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
{
    if (key->dl_type == htons(ETH_TYPE_IP)) {
        tuple->l3_type = AF_INET;
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
        tuple->l3_type = AF_INET6;
    }
    tuple->ip_proto = key->nw_proto;
    ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
                                     key->dl_type);
    ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
                                     key->dl_type);

    if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
        tuple->icmp_id = key->src.icmp_id;
        tuple->icmp_type = key->src.icmp_type;
        tuple->icmp_code = key->src.icmp_code;
    } else {
        tuple->src_port = key->src.port;
        tuple->dst_port = key->dst.port;
    }
}

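/* Fills 'entry' with a dump representation of 'conn': both tuples, zone,
 * mark, labels, the remaining timeout in seconds, protocol-specific state,
 * and the bucket 'bkt' in which the connection was found. */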
static void
conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
                      long long now, int bkt)
{
    struct ct_l4_proto *class;
    long long expiration;
    memset(entry, 0, sizeof *entry);
    conn_key_to_tuple(&conn->key, &entry->tuple_orig);
    conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);

    entry->zone = conn->key.zone;
    entry->mark = conn->mark;

    memcpy(&entry->labels, &conn->label, sizeof entry->labels);
    /* Timestamps are not implemented yet. */
    entry->timestamp.start = 0;
    entry->timestamp.stop = 0;

    expiration = conn->expiration - now;
    entry->timeout = (expiration > 0) ? expiration / 1000 : 0;

    class = l4_protos[conn->key.nw_proto];
    if (class->conn_get_protoinfo) {
        class->conn_get_protoinfo(conn, &entry->protoinfo);
    }
    entry->bkt = bkt;
}

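/* Initializes 'dump' for an iteration over 'ct', optionally filtered to the
 * zone '*pzone'.  '*ptot_bkts' is set to the total number of buckets. */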
int
conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
                     const uint16_t *pzone, int *ptot_bkts)
{
    memset(dump, 0, sizeof *dump);
    if (pzone) {
        dump->zone = *pzone;
        dump->filter_zone = true;
    }
    dump->ct = ct;

    *ptot_bkts = CONNTRACK_BUCKETS;

    return 0;
}

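/* Stores the next matching connection in '*entry', skipping NAT bookkeeping
 * entries and, when a zone filter is set, connections from other zones.
 * Returns 0 on success, EOF once every bucket has been scanned.  Each
 * bucket's lock is held only while that bucket is probed, so the dump is
 * not an atomic snapshot of the table. */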
int
conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
{
    struct conntrack *ct = dump->ct;
    long long now = time_msec();

    while (dump->bucket < CONNTRACK_BUCKETS) {
        struct hmap_node *node;

        ct_lock_lock(&ct->buckets[dump->bucket].lock);
        for (;;) {
            struct conn *conn;

            node = hmap_at_position(&ct->buckets[dump->bucket].connections,
                                    &dump->bucket_pos);
            if (!node) {
                break;
            }
            INIT_CONTAINER(conn, node, node);
            if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
                (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
                conn_to_ct_dpif_entry(conn, entry, now, dump->bucket);
                break;
            }
            /* Else continue, until we find an entry in the appropriate zone
             * or the bucket has been scanned completely. */
        }
        ct_lock_unlock(&ct->buckets[dump->bucket].lock);

        if (!node) {
            memset(&dump->bucket_pos, 0, sizeof dump->bucket_pos);
            dump->bucket++;
        } else {
            return 0;
        }
    }
    return EOF;
}

int
conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
{
    return 0;
}

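/* Deletes all connections, or only those in '*zone' if 'zone' is nonnull.
 * Only default connections are passed to conn_clean(), which is expected to
 * also release any NAT resources tied to them. */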
int
conntrack_flush(struct conntrack *ct, const uint16_t *zone)
{
    unsigned i;

    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conn *conn, *next;

        ct_lock_lock(&ct->buckets[i].lock);
        HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
            if ((!zone || *zone == conn->key.zone) &&
                (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
                conn_clean(ct, conn, &ct->buckets[i]);
            }
        }
        ct_lock_unlock(&ct->buckets[i].lock);
    }
    return 0;
}