1 /*
2 * Copyright (c) 2015, 2016 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "conntrack.h"
19
20 #include <errno.h>
21 #include <sys/types.h>
22 #include <netinet/in.h>
23 #include <netinet/icmp6.h>
24
25 #include "bitmap.h"
26 #include "conntrack-private.h"
27 #include "coverage.h"
28 #include "csum.h"
29 #include "ct-dpif.h"
30 #include "dp-packet.h"
31 #include "flow.h"
32 #include "netdev.h"
33 #include "odp-netlink.h"
34 #include "openvswitch/hmap.h"
35 #include "openvswitch/vlog.h"
36 #include "ovs-rcu.h"
37 #include "ovs-thread.h"
38 #include "poll-loop.h"
39 #include "random.h"
40 #include "timeval.h"
41
42
43 VLOG_DEFINE_THIS_MODULE(conntrack);
44
45 COVERAGE_DEFINE(conntrack_full);
46 COVERAGE_DEFINE(conntrack_long_cleanup);
47
48 struct conn_lookup_ctx {
49 struct conn_key key;
50 struct conn *conn;
51 uint32_t hash;
52 bool reply;
53 bool related;
54 };
55
56 static bool conn_key_extract(struct conntrack *, struct dp_packet *,
57 ovs_be16 dl_type, struct conn_lookup_ctx *,
58 uint16_t zone);
59 static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
60 static void conn_key_reverse(struct conn_key *);
61 static void conn_key_lookup(struct conntrack_bucket *ctb,
62 struct conn_lookup_ctx *ctx,
63 long long now);
64 static bool valid_new(struct dp_packet *pkt, struct conn_key *);
65 static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
66 struct conn_key *, long long now);
67 static void delete_conn(struct conn *);
68 static enum ct_update_res conn_update(struct conn *,
69 struct conntrack_bucket *ctb,
70 struct dp_packet *, bool reply,
71 long long now);
72 static bool conn_expired(struct conn *, long long now);
73 static void set_mark(struct dp_packet *, struct conn *,
74 uint32_t val, uint32_t mask);
75 static void set_label(struct dp_packet *, struct conn *,
76 const struct ovs_key_ct_labels *val,
77 const struct ovs_key_ct_labels *mask);
78 static void *clean_thread_main(void *f_);
79
80 static struct nat_conn_key_node *
81 nat_conn_keys_lookup(struct hmap *nat_conn_keys,
82 const struct conn_key *key,
83 uint32_t basis);
84
85 static void
86 nat_conn_keys_remove(struct hmap *nat_conn_keys,
87 const struct conn_key *key,
88 uint32_t basis);
89
90 static bool
91 nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
92 struct conn *nat_conn);
93
94 static uint8_t
95 reverse_icmp_type(uint8_t type);
96 static uint8_t
97 reverse_icmp6_type(uint8_t type);
98 static inline bool
99 extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
100 const char **new_data, bool validate_checksum);
101 static inline bool
102 extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
103 const char **new_data);
104
105 static struct ct_l4_proto *l4_protos[] = {
106 [IPPROTO_TCP] = &ct_proto_tcp,
107 [IPPROTO_UDP] = &ct_proto_other,
108 [IPPROTO_ICMP] = &ct_proto_icmp4,
109 [IPPROTO_ICMPV6] = &ct_proto_icmp6,
110 };
111
112 long long ct_timeout_val[] = {
113 #define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
114 CT_TIMEOUTS
115 #undef CT_TIMEOUT
116 };
117
118 /* If the total number of connections goes above this value, no new connections
119 * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
120 #define DEFAULT_N_CONN_LIMIT 3000000
121
122 /* Initializes the connection tracker 'ct'. The caller is responsible for
123 * calling 'conntrack_destroy()' when the instance is no longer needed. */
124 void
125 conntrack_init(struct conntrack *ct)
126 {
127 unsigned i, j;
128 long long now = time_msec();
129
130 ct_rwlock_init(&ct->nat_resources_lock);
131 ct_rwlock_wrlock(&ct->nat_resources_lock);
132 hmap_init(&ct->nat_conn_keys);
133 ct_rwlock_unlock(&ct->nat_resources_lock);
134
135 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
136 struct conntrack_bucket *ctb = &ct->buckets[i];
137
138 ct_lock_init(&ctb->lock);
139 ct_lock_lock(&ctb->lock);
140 hmap_init(&ctb->connections);
141 for (j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
142 ovs_list_init(&ctb->exp_lists[j]);
143 }
144 ct_lock_unlock(&ctb->lock);
145 ovs_mutex_init(&ctb->cleanup_mutex);
146 ovs_mutex_lock(&ctb->cleanup_mutex);
147 ctb->next_cleanup = now + CT_TM_MIN;
148 ovs_mutex_unlock(&ctb->cleanup_mutex);
149 }
150 ct->hash_basis = random_uint32();
151 atomic_count_init(&ct->n_conn, 0);
152 atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
153 latch_init(&ct->clean_thread_exit);
154 ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
155 }
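/* Example lifecycle (illustrative sketch): a datapath embedding the tracker
 * might do:
 *
 *     struct conntrack ct;
 *
 *     conntrack_init(&ct);
 *     ...pass batches through conntrack_execute()...
 *     conntrack_destroy(&ct);
 */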
156
157 /* Destroys the connection tracker 'ct' and frees all the allocated memory. */
158 void
159 conntrack_destroy(struct conntrack *ct)
160 {
161 unsigned i;
162
163 latch_set(&ct->clean_thread_exit);
164 pthread_join(ct->clean_thread, NULL);
165 latch_destroy(&ct->clean_thread_exit);
166 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
167 struct conntrack_bucket *ctb = &ct->buckets[i];
168 struct conn *conn;
169
170 ovs_mutex_destroy(&ctb->cleanup_mutex);
171 ct_lock_lock(&ctb->lock);
172 HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
173 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
174 atomic_count_dec(&ct->n_conn);
175 }
176 delete_conn(conn);
177 }
178 hmap_destroy(&ctb->connections);
179 ct_lock_unlock(&ctb->lock);
180 ct_lock_destroy(&ctb->lock);
181 }
182 ct_rwlock_wrlock(&ct->nat_resources_lock);
183 struct nat_conn_key_node *nat_conn_key_node;
184 HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
185 free(nat_conn_key_node);
186 }
187 hmap_destroy(&ct->nat_conn_keys);
188 ct_rwlock_unlock(&ct->nat_resources_lock);
189 ct_rwlock_destroy(&ct->nat_resources_lock);
190 }
191 \f
192 static unsigned hash_to_bucket(uint32_t hash)
193 {
194 /* Extracts the most significant bits in hash. The least significant bits
195 * are already used internally by the hmap implementation. */
196 BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
197
198 return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
199 }
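/* For instance, with a hypothetical CONNTRACK_BUCKETS_SHIFT of 8 (256
 * buckets), a hash of 0xA1B2C3D4 maps to bucket 0xA1: only the top bits are
 * consumed here, leaving the low-order bits to the hmap. */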
200
201 static void
202 write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
203 const struct conn_key *key)
204 {
205 pkt->md.ct_state |= CS_TRACKED;
206 pkt->md.ct_zone = zone;
207 pkt->md.ct_mark = conn ? conn->mark : 0;
208 pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;
209
210 /* Use the original direction tuple if we have it. */
211 if (conn) {
212 key = &conn->key;
213 }
214 pkt->md.ct_orig_tuple_ipv6 = false;
215 if (key) {
216 if (key->dl_type == htons(ETH_TYPE_IP)) {
217 pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
218 key->src.addr.ipv4_aligned,
219 key->dst.addr.ipv4_aligned,
220 key->nw_proto != IPPROTO_ICMP
221 ? key->src.port : htons(key->src.icmp_type),
222 key->nw_proto != IPPROTO_ICMP
223 ? key->dst.port : htons(key->src.icmp_code),
224 key->nw_proto,
225 };
226 } else {
227 pkt->md.ct_orig_tuple_ipv6 = true;
228 pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
229 key->src.addr.ipv6_aligned,
230 key->dst.addr.ipv6_aligned,
231 key->nw_proto != IPPROTO_ICMPV6
232 ? key->src.port : htons(key->src.icmp_type),
233 key->nw_proto != IPPROTO_ICMPV6
234 ? key->dst.port : htons(key->src.icmp_code),
235 key->nw_proto,
236 };
237 }
238 } else {
239 memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
240 }
241
242 }
243
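/* Applies L4 port translation (PAT) to 'pkt' in the original direction,
 * taking the translated TCP/UDP port from the connection's reverse key
 * according to the NAT action. */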
244 static void
245 pat_packet(struct dp_packet *pkt, const struct conn *conn)
246 {
247 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
248 if (conn->key.nw_proto == IPPROTO_TCP) {
249 struct tcp_header *th = dp_packet_l4(pkt);
250 packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
251 } else if (conn->key.nw_proto == IPPROTO_UDP) {
252 struct udp_header *uh = dp_packet_l4(pkt);
253 packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
254 }
255 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
256 if (conn->key.nw_proto == IPPROTO_TCP) {
257 struct tcp_header *th = dp_packet_l4(pkt);
258 packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
259 } else if (conn->key.nw_proto == IPPROTO_UDP) {
260 struct udp_header *uh = dp_packet_l4(pkt);
261 packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
262 }
263 }
264 }
265
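/* Applies the connection's NAT to 'pkt' in the original direction: rewrites
 * the IPv4/IPv6 source or destination address, sets CS_SRC_NAT or CS_DST_NAT
 * in the ct_state and, for packets that are not ICMP 'related' errors, also
 * translates the L4 ports via pat_packet(). */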
266 static void
267 nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
268 {
269 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
270 pkt->md.ct_state |= CS_SRC_NAT;
271 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
272 struct ip_header *nh = dp_packet_l3(pkt);
273 packet_set_ipv4_addr(pkt, &nh->ip_src,
274 conn->rev_key.dst.addr.ipv4_aligned);
275 } else {
276 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
277 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
278 nh6->ip6_src.be32,
279 &conn->rev_key.dst.addr.ipv6_aligned,
280 true);
281 }
282 if (!related) {
283 pat_packet(pkt, conn);
284 }
285 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
286 pkt->md.ct_state |= CS_DST_NAT;
287 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
288 struct ip_header *nh = dp_packet_l3(pkt);
289 packet_set_ipv4_addr(pkt, &nh->ip_dst,
290 conn->rev_key.src.addr.ipv4_aligned);
291 } else {
292 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
293 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
294 nh6->ip6_dst.be32,
295 &conn->rev_key.src.addr.ipv6_aligned,
296 true);
297 }
298 if (!related) {
299 pat_packet(pkt, conn);
300 }
301 }
302 }
303
304 static void
305 un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
306 {
307 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
308 if (conn->key.nw_proto == IPPROTO_TCP) {
309 struct tcp_header *th = dp_packet_l4(pkt);
310 packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
311 } else if (conn->key.nw_proto == IPPROTO_UDP) {
312 struct udp_header *uh = dp_packet_l4(pkt);
313 packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
314 }
315 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
316 if (conn->key.nw_proto == IPPROTO_TCP) {
317 struct tcp_header *th = dp_packet_l4(pkt);
318 packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
319 } else if (conn->key.nw_proto == IPPROTO_UDP) {
320 struct udp_header *uh = dp_packet_l4(pkt);
321 packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
322 }
323 }
324 }
325
326 static void
327 reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
328 {
329 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
330 if (conn->key.nw_proto == IPPROTO_TCP) {
331 struct tcp_header *th_in = dp_packet_l4(pkt);
332 packet_set_tcp_port(pkt, conn->key.src.port,
333 th_in->tcp_dst);
334 } else if (conn->key.nw_proto == IPPROTO_UDP) {
335 struct udp_header *uh_in = dp_packet_l4(pkt);
336 packet_set_udp_port(pkt, conn->key.src.port,
337 uh_in->udp_dst);
338 }
339 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
340 if (conn->key.nw_proto == IPPROTO_TCP) {
341 struct tcp_header *th_in = dp_packet_l4(pkt);
342 packet_set_tcp_port(pkt, th_in->tcp_src,
343 conn->key.dst.port);
344 } else if (conn->key.nw_proto == IPPROTO_UDP) {
345 struct udp_header *uh_in = dp_packet_l4(pkt);
346 packet_set_udp_port(pkt, uh_in->udp_src,
347 conn->key.dst.port);
348 }
349 }
350 }
351
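/* Undoes NAT inside the packet embedded in an ICMP or ICMPv6 error message:
 * rewrites the inner L3 addresses and L4 ports back to the original tuple
 * and recomputes the ICMP checksum. */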
352 static void
353 reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
354 {
355 char *tail = dp_packet_tail(pkt);
356 char pad = dp_packet_l2_pad_size(pkt);
357 struct conn_key inner_key;
358 const char *inner_l4 = NULL;
359 uint16_t orig_l3_ofs = pkt->l3_ofs;
360 uint16_t orig_l4_ofs = pkt->l4_ofs;
361
362 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
363 struct ip_header *nh = dp_packet_l3(pkt);
364 struct icmp_header *icmp = dp_packet_l4(pkt);
365 struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
366 extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *) inner_l3)
367 - pad, &inner_l4, false);
368
369 pkt->l3_ofs += (char *) inner_l3 - (char *) nh;
370 pkt->l4_ofs += inner_l4 - (char *) icmp;
371
372 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
373 packet_set_ipv4_addr(pkt, &inner_l3->ip_src,
374 conn->key.src.addr.ipv4_aligned);
375 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
376 packet_set_ipv4_addr(pkt, &inner_l3->ip_dst,
377 conn->key.dst.addr.ipv4_aligned);
378 }
379 reverse_pat_packet(pkt, conn);
380 icmp->icmp_csum = 0;
381 icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
382 } else {
383 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
384 struct icmp6_error_header *icmp6 = dp_packet_l4(pkt);
385 struct ovs_16aligned_ip6_hdr *inner_l3_6 =
386 (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1);
387 extract_l3_ipv6(&inner_key, inner_l3_6,
388 tail - ((char *)inner_l3_6) - pad,
389 &inner_l4);
390 pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6;
391 pkt->l4_ofs += inner_l4 - (char *) icmp6;
392
393 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
394 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
395 inner_l3_6->ip6_src.be32,
396 &conn->key.src.addr.ipv6_aligned,
397 true);
398 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
399 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
400 inner_l3_6->ip6_dst.be32,
401 &conn->key.dst.addr.ipv6_aligned,
402 true);
403 }
404 reverse_pat_packet(pkt, conn);
405 uint32_t icmp6_csum = packet_csum_pseudoheader6(nh6);
406 icmp6->icmp6_base.icmp6_cksum = 0;
407 icmp6->icmp6_base.icmp6_cksum = csum_finish(
408 csum_continue(icmp6_csum, icmp6, tail - (char *) icmp6 - pad));
409 }
410 pkt->l3_ofs = orig_l3_ofs;
411 pkt->l4_ofs = orig_l4_ofs;
412 }
413
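/* Reverses the connection's NAT for a packet traveling in the reply
 * direction: restores the original addresses and ports from 'conn->key' and,
 * for ICMP 'related' errors, fixes up the embedded packet via
 * reverse_nat_packet(). */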
414 static void
415 un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
416 bool related)
417 {
418 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
419 pkt->md.ct_state |= CS_DST_NAT;
420 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
421 struct ip_header *nh = dp_packet_l3(pkt);
422 packet_set_ipv4_addr(pkt, &nh->ip_dst,
423 conn->key.src.addr.ipv4_aligned);
424 } else {
425 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
426 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
427 nh6->ip6_dst.be32,
428 &conn->key.src.addr.ipv6_aligned, true);
429 }
430
431 if (OVS_UNLIKELY(related)) {
432 reverse_nat_packet(pkt, conn);
433 } else {
434 un_pat_packet(pkt, conn);
435 }
436 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
437 pkt->md.ct_state |= CS_SRC_NAT;
438 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
439 struct ip_header *nh = dp_packet_l3(pkt);
440 packet_set_ipv4_addr(pkt, &nh->ip_src,
441 conn->key.dst.addr.ipv4_aligned);
442 } else {
443 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
444 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
445 nh6->ip6_src.be32,
446 &conn->key.dst.addr.ipv6_aligned, true);
447 }
448
449 if (OVS_UNLIKELY(related)) {
450 reverse_nat_packet(pkt, conn);
451 } else {
452 un_pat_packet(pkt, conn);
453 }
454 }
455 }
456
457 /* Typical usage of this helper is in non per-packet code: the caller must
458 * already hold the bucket lock for the lookup, and in the per-packet path a
459 * hash would already have been computed. Hence, this function is just
460 * intended for code clarity. */
461 static struct conn *
462 conn_lookup(struct conntrack *ct, struct conn_key *key, long long now)
463 {
464 struct conn_lookup_ctx ctx;
465 ctx.conn = NULL;
466 ctx.key = *key;
467 ctx.hash = conn_key_hash(key, ct->hash_basis);
468 unsigned bucket = hash_to_bucket(ctx.hash);
469 conn_key_lookup(&ct->buckets[bucket], &ctx, now);
470 return ctx.conn;
471 }
472
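/* Releases the NAT resources of a connection being destroyed: removes its
 * entry from 'ct->nat_conn_keys' and, if it is still present, the companion
 * un-NAT connection stored in the reverse key's bucket, then frees 'conn'.
 * 'ctb->lock' is temporarily dropped and re-taken to respect lock ordering. */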
473 static void
474 nat_clean(struct conntrack *ct, struct conn *conn,
475 struct conntrack_bucket *ctb)
476 OVS_REQUIRES(ctb->lock)
477 {
478 long long now = time_msec();
479 ct_rwlock_wrlock(&ct->nat_resources_lock);
480 nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
481 ct_rwlock_unlock(&ct->nat_resources_lock);
482 ct_lock_unlock(&ctb->lock);
483
484 uint32_t hash_rev_conn = conn_key_hash(&conn->rev_key, ct->hash_basis);
485 unsigned bucket_rev_conn = hash_to_bucket(hash_rev_conn);
486
487 ct_lock_lock(&ct->buckets[bucket_rev_conn].lock);
488 ct_rwlock_wrlock(&ct->nat_resources_lock);
489
490 struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now);
491
492 struct nat_conn_key_node *nat_conn_key_node =
493 nat_conn_keys_lookup(&ct->nat_conn_keys, &conn->rev_key,
494 ct->hash_basis);
495
496 /* In the unlikely event that the reverse connection was recreated,
497 * skip the rev_conn cleanup. */
498 if (rev_conn && (!nat_conn_key_node ||
499 memcmp(&nat_conn_key_node->value, &rev_conn->rev_key,
500 sizeof nat_conn_key_node->value))) {
501 hmap_remove(&ct->buckets[bucket_rev_conn].connections,
502 &rev_conn->node);
503 free(rev_conn);
504 }
505 delete_conn(conn);
506
507 ct_rwlock_unlock(&ct->nat_resources_lock);
508 ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock);
509 ct_lock_lock(&ctb->lock);
510 }
511
512 static void
513 conn_clean(struct conntrack *ct, struct conn *conn,
514 struct conntrack_bucket *ctb)
515 OVS_REQUIRES(ctb->lock)
516 {
517 ovs_list_remove(&conn->exp_node);
518 hmap_remove(&ctb->connections, &conn->node);
519 atomic_count_dec(&ct->n_conn);
520 if (conn->nat_info) {
521 nat_clean(ct, conn, ctb);
522 } else {
523 delete_conn(conn);
524 }
525 }
526
527 static struct conn *
528 conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
529 struct conn_lookup_ctx *ctx, bool commit, long long now,
530 const struct nat_action_info_t *nat_action_info,
531 struct conn *conn_for_un_nat_copy)
532 {
533 unsigned bucket = hash_to_bucket(ctx->hash);
534 struct conn *nc = NULL;
535
536 if (!valid_new(pkt, &ctx->key)) {
537 pkt->md.ct_state = CS_INVALID;
538 return nc;
539 }
540 pkt->md.ct_state = CS_NEW;
541
542 if (commit) {
543 unsigned int n_conn_limit;
544
545 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
546
547 if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
548 COVERAGE_INC(conntrack_full);
549 return nc;
550 }
551
552 nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
553 ctx->conn = nc;
554 nc->rev_key = nc->key;
555 conn_key_reverse(&nc->rev_key);
556
557 if (nat_action_info) {
558 nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
559 ct_rwlock_wrlock(&ct->nat_resources_lock);
560
561 bool nat_res = nat_select_range_tuple(ct, nc,
562 conn_for_un_nat_copy);
563
564 if (!nat_res) {
565 free(nc->nat_info);
566 nc->nat_info = NULL;
567 free(nc);
568 ct_rwlock_unlock(&ct->nat_resources_lock);
569 return NULL;
570 }
571
572 if (conn_for_un_nat_copy &&
573 nc->conn_type == CT_CONN_TYPE_DEFAULT) {
574 *nc = *conn_for_un_nat_copy;
575 conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
576 }
577 ct_rwlock_unlock(&ct->nat_resources_lock);
578
579 nat_packet(pkt, nc, ctx->related);
580 }
581 hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
582 atomic_count_inc(&ct->n_conn);
583 }
584 return nc;
585 }
586
587 static bool
588 conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
589 struct conn_lookup_ctx *ctx, struct conn **conn,
590 long long now, unsigned bucket)
591 OVS_REQUIRES(ct->buckets[bucket].lock)
592 {
593 bool create_new_conn = false;
594
595 if (ctx->related) {
596 pkt->md.ct_state |= CS_RELATED;
597 if (ctx->reply) {
598 pkt->md.ct_state |= CS_REPLY_DIR;
599 }
600 } else {
601 enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket],
602 pkt, ctx->reply, now);
603
604 switch (res) {
605 case CT_UPDATE_VALID:
606 pkt->md.ct_state |= CS_ESTABLISHED;
607 pkt->md.ct_state &= ~CS_NEW;
608 if (ctx->reply) {
609 pkt->md.ct_state |= CS_REPLY_DIR;
610 }
611 break;
612 case CT_UPDATE_INVALID:
613 pkt->md.ct_state = CS_INVALID;
614 break;
615 case CT_UPDATE_NEW:
616 conn_clean(ct, *conn, &ct->buckets[bucket]);
617 create_new_conn = true;
618 break;
619 default:
620 OVS_NOT_REACHED();
621 }
622 }
623 return create_new_conn;
624 }
625
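/* Inserts the CT_CONN_TYPE_UN_NAT companion entry for a newly NATted
 * connection into the bucket of its translated tuple, so that reply packets,
 * which hash to that bucket, can be steered back to the main connection. */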
626 static void
627 create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
628 long long now)
629 {
630 struct conn *nc = xmemdup(conn_for_un_nat_copy, sizeof *nc);
631 nc->key = conn_for_un_nat_copy->rev_key;
632 nc->rev_key = conn_for_un_nat_copy->key;
633 uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis);
634 unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash);
635 ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock);
636 ct_rwlock_rdlock(&ct->nat_resources_lock);
637
638 struct conn *rev_conn = conn_lookup(ct, &nc->key, now);
639
640 struct nat_conn_key_node *nat_conn_key_node =
641 nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis);
642 if (nat_conn_key_node
643 && !memcmp(&nat_conn_key_node->value, &nc->rev_key,
644 sizeof nat_conn_key_node->value)
645 && !rev_conn) {
646 hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
647 &nc->node, un_nat_hash);
648 } else {
649 free(nc);
650 }
651 ct_rwlock_unlock(&ct->nat_resources_lock);
652 ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock);
653 }
654
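/* (Re)applies NAT to 'pkt' for an existing connection, unless the packet has
 * already been NATted in this zone: reply-direction packets are un-NATted,
 * original-direction packets are NATted. */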
655 static void
656 handle_nat(struct dp_packet *pkt, struct conn *conn,
657 uint16_t zone, bool reply, bool related)
658 {
659 if (conn->nat_info &&
660 (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
661 (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
662 zone != pkt->md.ct_zone))) {
663 if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
664 pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
665 }
666 if (reply) {
667 un_nat_packet(pkt, conn, related);
668 } else {
669 nat_packet(pkt, conn, related);
670 }
671 }
672 }
673
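/* Attempts a lookup using the original-direction tuple recorded in the
 * packet metadata (ct_orig_tuple).  Only attempted when the packet has
 * already been through NAT (CS_SRC_NAT or CS_DST_NAT is set), no new NAT
 * action is requested and the metadata is populated.  On a hit, '*bucket'
 * and '*conn' are updated and true is returned. */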
674 static bool
675 check_orig_tuple(struct conntrack *ct, struct dp_packet *pkt,
676 struct conn_lookup_ctx *ctx_in, long long now,
677 unsigned *bucket, struct conn **conn,
678 const struct nat_action_info_t *nat_action_info)
679 OVS_REQUIRES(ct->buckets[*bucket].lock)
680 {
681 if ((ctx_in->key.dl_type == htons(ETH_TYPE_IP) &&
682 !pkt->md.ct_orig_tuple.ipv4.ipv4_proto) ||
683 (ctx_in->key.dl_type == htons(ETH_TYPE_IPV6) &&
684 !pkt->md.ct_orig_tuple.ipv6.ipv6_proto) ||
685 !(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
686 nat_action_info) {
687 return false;
688 }
689
690 ct_lock_unlock(&ct->buckets[*bucket].lock);
691 struct conn_lookup_ctx ctx;
692 memset(&ctx, 0 , sizeof ctx);
693 ctx.conn = NULL;
694
695 if (ctx_in->key.dl_type == htons(ETH_TYPE_IP)) {
696 ctx.key.src.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_src;
697 ctx.key.dst.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_dst;
698
699 if (ctx_in->key.nw_proto == IPPROTO_ICMP) {
700 ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
701 ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
702 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv4.src_port);
703 ctx.key.src.icmp_type = (uint8_t) src_port;
704 ctx.key.dst.icmp_type = reverse_icmp_type(ctx.key.src.icmp_type);
705 } else {
706 ctx.key.src.port = pkt->md.ct_orig_tuple.ipv4.src_port;
707 ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv4.dst_port;
708 }
709 ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv4.ipv4_proto;
710 } else {
711 ctx.key.src.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_src;
712 ctx.key.dst.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_dst;
713
714 if (ctx_in->key.nw_proto == IPPROTO_ICMPV6) {
715 ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
716 ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
717 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv6.src_port);
718 ctx.key.src.icmp_type = (uint8_t) src_port;
719 ctx.key.dst.icmp_type = reverse_icmp6_type(ctx.key.src.icmp_type);
720 } else {
721 ctx.key.src.port = pkt->md.ct_orig_tuple.ipv6.src_port;
722 ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv6.dst_port;
723 }
724 ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv6.ipv6_proto;
725 }
726
727 ctx.key.dl_type = ctx_in->key.dl_type;
728 ctx.key.zone = pkt->md.ct_zone;
729
730 ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
731 *bucket = hash_to_bucket(ctx.hash);
732 ct_lock_lock(&ct->buckets[*bucket].lock);
733 conn_key_lookup(&ct->buckets[*bucket], &ctx, now);
734 *conn = ctx.conn;
735
736 return *conn ? true : false;
737 }
738
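/* Runs a single packet through the tracker: looks up (or, if 'commit' is
 * set, creates) the connection for 'ctx->key', updates its state, applies
 * any NAT and fills in the conntrack metadata of 'pkt'. */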
739 static void
740 process_one(struct conntrack *ct, struct dp_packet *pkt,
741 struct conn_lookup_ctx *ctx, uint16_t zone,
742 bool force, bool commit, long long now, const uint32_t *setmark,
743 const struct ovs_key_ct_labels *setlabel,
744 const struct nat_action_info_t *nat_action_info)
745 {
746 struct conn *conn;
747 unsigned bucket = hash_to_bucket(ctx->hash);
748 ct_lock_lock(&ct->buckets[bucket].lock);
749 conn_key_lookup(&ct->buckets[bucket], ctx, now);
750 conn = ctx->conn;
751
752 /* Delete found entry if in wrong direction. 'force' implies commit. */
753 if (conn && force && ctx->reply) {
754 conn_clean(ct, conn, &ct->buckets[bucket]);
755 conn = NULL;
756 }
757
758 if (OVS_LIKELY(conn)) {
759 if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
760
761 ctx->reply = true;
762
763 struct conn_lookup_ctx ctx2;
764 ctx2.conn = NULL;
765 ctx2.key = conn->rev_key;
766 ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis);
767
768 ct_lock_unlock(&ct->buckets[bucket].lock);
769 bucket = hash_to_bucket(ctx2.hash);
770
771 ct_lock_lock(&ct->buckets[bucket].lock);
772 conn_key_lookup(&ct->buckets[bucket], &ctx2, now);
773
774 if (ctx2.conn) {
775 conn = ctx2.conn;
776 } else {
777 /* It is a race condition where the conn has timed out and been
778 * removed between the unlock of the rev_conn and the lock of the
779 * forward conn; nothing to do. */
780 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
781 ct_lock_unlock(&ct->buckets[bucket].lock);
782 return;
783 }
784 }
785 }
786
787 bool create_new_conn = false;
788 struct conn conn_for_un_nat_copy;
789 conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;
790 if (OVS_LIKELY(conn)) {
791 create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
792 if (nat_action_info && !create_new_conn) {
793 handle_nat(pkt, conn, zone, ctx->reply, ctx->related);
794 }
795 } else if (check_orig_tuple(ct, pkt, ctx, now, &bucket, &conn,
796 nat_action_info)) {
797 create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
798 } else {
799 if (ctx->related) {
800 pkt->md.ct_state = CS_INVALID;
801 } else {
802 create_new_conn = true;
803 }
804 }
805
806 if (OVS_UNLIKELY(create_new_conn)) {
807 conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
808 &conn_for_un_nat_copy);
809 }
810
811 write_ct_md(pkt, zone, conn, &ctx->key);
812 if (conn && setmark) {
813 set_mark(pkt, conn, setmark[0], setmark[1]);
814 }
815
816 if (conn && setlabel) {
817 set_label(pkt, conn, &setlabel[0], &setlabel[1]);
818 }
819
820 ct_lock_unlock(&ct->buckets[bucket].lock);
821
822 if (conn_for_un_nat_copy.conn_type == CT_CONN_TYPE_UN_NAT) {
823 create_un_nat_conn(ct, &conn_for_un_nat_copy, now);
824 }
825 }
826
827 /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
828 * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
829 * the l3 and l4 offsets properly set.
830 *
831 * If 'commit' is true, the packets are allowed to create new entries in the
832 * connection tables. 'setmark', if not NULL, should point to a
833 * two-element array containing a value and a mask to set the connection mark.
834 * 'setlabel' behaves similarly for the connection label. */
835 int
836 conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
837 ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
838 const uint32_t *setmark,
839 const struct ovs_key_ct_labels *setlabel,
840 const char *helper,
841 const struct nat_action_info_t *nat_action_info)
842 {
843 struct dp_packet **pkts = pkt_batch->packets;
844 size_t cnt = pkt_batch->count;
845 long long now = time_msec();
846 struct conn_lookup_ctx ctx;
847
848 if (helper) {
849 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
850
851 VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
852 /* Continue without the helper */
853 }
854
855 for (size_t i = 0; i < cnt; i++) {
856 if (!conn_key_extract(ct, pkts[i], dl_type, &ctx, zone)) {
857 pkts[i]->md.ct_state = CS_INVALID;
858 write_ct_md(pkts[i], zone, NULL, NULL);
859 continue;
860 }
861 process_one(ct, pkts[i], &ctx, zone, force, commit,
862 now, setmark, setlabel, nat_action_info);
863 }
864
865 return 0;
866 }
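/* Illustrative call (a sketch): committing IPv4 packets in zone 0, with no
 * mark or label change, no ALG helper and no NAT:
 *
 *     conntrack_execute(ct, &batch, htons(ETH_TYPE_IP),
 *                       false, true, 0, NULL, NULL, NULL, NULL);
 */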
867
868 static void
869 set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
870 {
871 pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
872 conn->mark = pkt->md.ct_mark;
873 }
874
875 static void
876 set_label(struct dp_packet *pkt, struct conn *conn,
877 const struct ovs_key_ct_labels *val,
878 const struct ovs_key_ct_labels *mask)
879 {
880 ovs_u128 v, m;
881
882 memcpy(&v, val, sizeof v);
883 memcpy(&m, mask, sizeof m);
884
885 pkt->md.ct_label.u64.lo = v.u64.lo
886 | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
887 pkt->md.ct_label.u64.hi = v.u64.hi
888 | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
889 conn->label = pkt->md.ct_label;
890 }
891
892 \f
893 /* Delete the expired connections from 'ctb', up to 'limit'. Returns the
894 * earliest expiration time among the remaining connections in 'ctb'. Returns
895 * LLONG_MAX if 'ctb' is empty. The return value might be smaller than 'now'
896 * if 'limit' is reached. */
897 static long long
898 sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
899 size_t limit)
900 OVS_REQUIRES(ctb->lock)
901 {
902 struct conn *conn, *next;
903 long long min_expiration = LLONG_MAX;
904 unsigned i;
905 size_t count = 0;
906
907 for (i = 0; i < N_CT_TM; i++) {
908 LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
909 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
910 if (!conn_expired(conn, now) || count >= limit) {
911 min_expiration = MIN(min_expiration, conn->expiration);
912 if (count >= limit) {
913 /* Do not check other lists. */
914 COVERAGE_INC(conntrack_long_cleanup);
915 return min_expiration;
916 }
917 break;
918 }
919 conn_clean(ct, conn, ctb);
920 count++;
921 }
922 }
923 }
924
925 return min_expiration;
926 }
927
928 /* Cleans up old connection entries from 'ct'. Returns the time when the
929 * next expiration might happen. The return value might be smaller than
930 * 'now', meaning that an internal limit has been reached, and some expired
931 * connections have not been deleted. */
932 static long long
933 conntrack_clean(struct conntrack *ct, long long now)
934 {
935 long long next_wakeup = now + CT_TM_MIN;
936 unsigned int n_conn_limit;
937 size_t clean_count = 0;
938 unsigned i;
939
940 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
941
942 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
943 struct conntrack_bucket *ctb = &ct->buckets[i];
944 size_t prev_count;
945 long long min_exp;
946
947 ovs_mutex_lock(&ctb->cleanup_mutex);
948 if (ctb->next_cleanup > now) {
949 goto next_bucket;
950 }
951
952 ct_lock_lock(&ctb->lock);
953 prev_count = hmap_count(&ctb->connections);
954 /* If the connections are well distributed among buckets, we want to
955 * limit to 10% of the global limit equally split among buckets. If
956 * the bucket is busier than the others, we limit to 10% of its
957 * current size. */
958 min_exp = sweep_bucket(ct, ctb, now,
959 MAX(prev_count/10, n_conn_limit/(CONNTRACK_BUCKETS*10)));
960 clean_count += prev_count - hmap_count(&ctb->connections);
961
962 if (min_exp > now) {
963 /* We call hmap_shrink() only if sweep_bucket() managed to delete
964 * every expired connection. */
965 hmap_shrink(&ctb->connections);
966 }
967
968 ct_lock_unlock(&ctb->lock);
969
970 ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);
971
972 next_bucket:
973 next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
974 ovs_mutex_unlock(&ctb->cleanup_mutex);
975 }
976
977 VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
978 clean_count, time_msec() - now);
979
980 return next_wakeup;
981 }
982
983 /* Cleanup:
984 *
985 * We must call conntrack_clean() periodically. conntrack_clean()'s return
986 * value gives a hint about when the next cleanup must be done (either because
987 * there is an actual connection that expires, or because a new connection
988 * might be created with the minimum timeout).
989 *
990 * The logic below has two goals:
991 *
992 * - We want to reduce the number of wakeups and batch connection cleanup
993 * when the load is not very high. CT_CLEAN_INTERVAL ensures that if we
994 * are coping with the current cleanup tasks, then we wait at least
995 * 5 seconds to do further cleanup.
996 *
997 * - We don't want to keep the buckets locked too long, as we might prevent
998 * traffic from flowing. CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
999 * behind, there are at least some 200ms blocks of time when buckets will be
1000 * left alone, so the datapath can operate unhindered.
1001 */
1002 #define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
1003 #define CT_CLEAN_MIN_INTERVAL 200 /* 0.2 seconds */
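/* For example, if conntrack_clean() reports a wakeup time one second in the
 * past (cleanup is behind), the thread sleeps only CT_CLEAN_MIN_INTERVAL
 * (200ms); if it reports a time one second in the future, the thread still
 * sleeps the full CT_CLEAN_INTERVAL (5 seconds). */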
1004
1005 static void *
1006 clean_thread_main(void *f_)
1007 {
1008 struct conntrack *ct = f_;
1009
1010 while (!latch_is_set(&ct->clean_thread_exit)) {
1011 long long next_wake;
1012 long long now = time_msec();
1013
1014 next_wake = conntrack_clean(ct, now);
1015
1016 if (next_wake < now) {
1017 poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
1018 } else {
1019 poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
1020 }
1021 latch_wait(&ct->clean_thread_exit);
1022 poll_block();
1023 }
1024
1025 return NULL;
1026 }
1027 \f
1028 /* Key extraction */
1029
1030 /* The function stores a pointer to the first byte after the header in
1031 * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
1032 * not interested in the header's tail, meaning that the header has
1033 * already been parsed (e.g. by flow_extract): we take this as a hint to
1034 * save a few checks. If 'validate_checksum' is true, the function returns
1035 * false if the IPv4 checksum is invalid. */
1036 static inline bool
1037 extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
1038 const char **new_data, bool validate_checksum)
1039 {
1040 const struct ip_header *ip = data;
1041 size_t ip_len;
1042
1043 if (new_data) {
1044 if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
1045 return false;
1046 }
1047 }
1048
1049 ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
1050
1051 if (new_data) {
1052 if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
1053 return false;
1054 }
1055 if (OVS_UNLIKELY(size < ip_len)) {
1056 return false;
1057 }
1058
1059 *new_data = (char *) data + ip_len;
1060 }
1061
1062 if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
1063 return false;
1064 }
1065
1066 if (validate_checksum && csum(data, ip_len) != 0) {
1067 return false;
1068 }
1069
1070 key->src.addr.ipv4 = ip->ip_src;
1071 key->dst.addr.ipv4 = ip->ip_dst;
1072 key->nw_proto = ip->ip_proto;
1073
1074 return true;
1075 }
1076
1077 /* The function stores a pointer to the first byte after the header in
1078 * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
1079 * not interested in the header's tail, meaning that the header has
1080 * already been parsed (e.g. by flow_extract): we take this as a hint to
1081 * save a few checks. */
1082 static inline bool
1083 extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
1084 const char **new_data)
1085 {
1086 const struct ovs_16aligned_ip6_hdr *ip6 = data;
1087
1088 if (new_data) {
1089 if (OVS_UNLIKELY(size < sizeof *ip6)) {
1090 return false;
1091 }
1092 }
1093
1094 uint8_t nw_proto = ip6->ip6_nxt;
1095 uint8_t nw_frag = 0;
1096
1097 data = ip6 + 1;
1098 size -= sizeof *ip6;
1099
1100 if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
1101 return false;
1102 }
1103
1104 if (new_data) {
1105 *new_data = data;
1106 }
1107
1108 if (nw_frag) {
1109 return false;
1110 }
1111
1112 key->src.addr.ipv6 = ip6->ip6_src;
1113 key->dst.addr.ipv6 = ip6->ip6_dst;
1114 key->nw_proto = nw_proto;
1115
1116 return true;
1117 }
1118
1119 static inline bool
1120 checksum_valid(const struct conn_key *key, const void *data, size_t size,
1121 const void *l3)
1122 {
1123 uint32_t csum = 0;
1124
1125 if (key->dl_type == htons(ETH_TYPE_IP)) {
1126 csum = packet_csum_pseudoheader(l3);
1127 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
1128 csum = packet_csum_pseudoheader6(l3);
1129 } else {
1130 return false;
1131 }
1132
1133 csum = csum_continue(csum, data, size);
1134
1135 return csum_finish(csum) == 0;
1136 }
1137
1138 static inline bool
1139 check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
1140 const void *l3)
1141 {
1142 const struct tcp_header *tcp = data;
1143 if (size < sizeof *tcp) {
1144 return false;
1145 }
1146
1147 size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
1148 if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
1149 return false;
1150 }
1151
1152 return checksum_valid(key, data, size, l3);
1153 }
1154
1155 static inline bool
1156 check_l4_udp(const struct conn_key *key, const void *data, size_t size,
1157 const void *l3)
1158 {
1159 const struct udp_header *udp = data;
1160 if (size < sizeof *udp) {
1161 return false;
1162 }
1163
1164 size_t udp_len = ntohs(udp->udp_len);
1165 if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
1166 return false;
1167 }
1168
1169 /* Validation must be skipped if checksum is 0 on IPv4 packets */
1170 return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
1171 || checksum_valid(key, data, size, l3);
1172 }
1173
1174 static inline bool
1175 check_l4_icmp(const void *data, size_t size)
1176 {
1177 return csum(data, size) == 0;
1178 }
1179
1180 static inline bool
1181 check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
1182 const void *l3)
1183 {
1184 return checksum_valid(key, data, size, l3);
1185 }
1186
1187 static inline bool
1188 extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
1189 {
1190 const struct tcp_header *tcp = data;
1191
1192 if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
1193 return false;
1194 }
1195
1196 key->src.port = tcp->tcp_src;
1197 key->dst.port = tcp->tcp_dst;
1198
1199 /* Port 0 is invalid */
1200 return key->src.port && key->dst.port;
1201 }
1202
1203 static inline bool
1204 extract_l4_udp(struct conn_key *key, const void *data, size_t size)
1205 {
1206 const struct udp_header *udp = data;
1207
1208 if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
1209 return false;
1210 }
1211
1212 key->src.port = udp->udp_src;
1213 key->dst.port = udp->udp_dst;
1214
1215 /* Port 0 is invalid */
1216 return key->src.port && key->dst.port;
1217 }
1218
1219 static inline bool extract_l4(struct conn_key *key, const void *data,
1220 size_t size, bool *related, const void *l3);
1221
1222 static uint8_t
1223 reverse_icmp_type(uint8_t type)
1224 {
1225 switch (type) {
1226 case ICMP4_ECHO_REQUEST:
1227 return ICMP4_ECHO_REPLY;
1228 case ICMP4_ECHO_REPLY:
1229 return ICMP4_ECHO_REQUEST;
1230
1231 case ICMP4_TIMESTAMP:
1232 return ICMP4_TIMESTAMPREPLY;
1233 case ICMP4_TIMESTAMPREPLY:
1234 return ICMP4_TIMESTAMP;
1235
1236 case ICMP4_INFOREQUEST:
1237 return ICMP4_INFOREPLY;
1238 case ICMP4_INFOREPLY:
1239 return ICMP4_INFOREQUEST;
1240 default:
1241 OVS_NOT_REACHED();
1242 }
1243 }
1244
1245 /* If 'related' is not NULL and the function is processing an ICMP
1246 * error packet, extract the l3 and l4 fields from the nested header
1247 * instead and set *related to true. If 'related' is NULL, we're
1248 * already processing a nested header and no such recursion is
1249 * possible. */
1250 static inline bool
1251 extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
1252 bool *related)
1253 {
1254 const struct icmp_header *icmp = data;
1255
1256 if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
1257 return false;
1258 }
1259
1260 switch (icmp->icmp_type) {
1261 case ICMP4_ECHO_REQUEST:
1262 case ICMP4_ECHO_REPLY:
1263 case ICMP4_TIMESTAMP:
1264 case ICMP4_TIMESTAMPREPLY:
1265 case ICMP4_INFOREQUEST:
1266 case ICMP4_INFOREPLY:
1267 if (icmp->icmp_code != 0) {
1268 return false;
1269 }
1270 /* Separate ICMP connection: identified using id */
1271 key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
1272 key->src.icmp_type = icmp->icmp_type;
1273 key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
1274 break;
1275 case ICMP4_DST_UNREACH:
1276 case ICMP4_TIME_EXCEEDED:
1277 case ICMP4_PARAM_PROB:
1278 case ICMP4_SOURCEQUENCH:
1279 case ICMP4_REDIRECT: {
1280 /* ICMP packet part of another connection. We should
1281 * extract the key from the embedded packet header. */
1282 struct conn_key inner_key;
1283 const char *l3 = (const char *) (icmp + 1);
1284 const char *tail = (const char *) data + size;
1285 const char *l4;
1286 bool ok;
1287
1288 if (!related) {
1289 return false;
1290 }
1291
1292 memset(&inner_key, 0, sizeof inner_key);
1293 inner_key.dl_type = htons(ETH_TYPE_IP);
1294 ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
1295 if (!ok) {
1296 return false;
1297 }
1298
1299 if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
1300 || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
1301 return false;
1302 }
1303
1304 key->src = inner_key.src;
1305 key->dst = inner_key.dst;
1306 key->nw_proto = inner_key.nw_proto;
1307
1308 ok = extract_l4(key, l4, tail - l4, NULL, l3);
1309 if (ok) {
1310 conn_key_reverse(key);
1311 *related = true;
1312 }
1313 return ok;
1314 }
1315 default:
1316 return false;
1317 }
1318
1319 return true;
1320 }
1321
1322 static uint8_t
1323 reverse_icmp6_type(uint8_t type)
1324 {
1325 switch (type) {
1326 case ICMP6_ECHO_REQUEST:
1327 return ICMP6_ECHO_REPLY;
1328 case ICMP6_ECHO_REPLY:
1329 return ICMP6_ECHO_REQUEST;
1330 default:
1331 OVS_NOT_REACHED();
1332 }
1333 }
1334
1335 /* If 'related' is not NULL and the function is processing an ICMP
1336 * error packet, extract the l3 and l4 fields from the nested header
1337 * instead and set *related to true. If 'related' is NULL, we're
1338 * already processing a nested header and no such recursion is
1339 * possible. */
1340 static inline bool
1341 extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
1342 bool *related)
1343 {
1344 const struct icmp6_header *icmp6 = data;
1345
1346 /* All the messages that we support need at least 4 bytes after
1347 * the header */
1348 if (size < sizeof *icmp6 + 4) {
1349 return false;
1350 }
1351
1352 switch (icmp6->icmp6_type) {
1353 case ICMP6_ECHO_REQUEST:
1354 case ICMP6_ECHO_REPLY:
1355 if (icmp6->icmp6_code != 0) {
1356 return false;
1357 }
1358 /* Separate ICMP connection: identified using id */
1359 key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
1360 key->src.icmp_type = icmp6->icmp6_type;
1361 key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
1362 break;
1363 case ICMP6_DST_UNREACH:
1364 case ICMP6_PACKET_TOO_BIG:
1365 case ICMP6_TIME_EXCEEDED:
1366 case ICMP6_PARAM_PROB: {
1367 /* ICMP packet part of another connection. We should
1368 * extract the key from the embedded packet header. */
1369 struct conn_key inner_key;
1370 const char *l3 = (const char *) icmp6 + 8;
1371 const char *tail = (const char *) data + size;
1372 const char *l4 = NULL;
1373 bool ok;
1374
1375 if (!related) {
1376 return false;
1377 }
1378
1379 memset(&inner_key, 0, sizeof inner_key);
1380 inner_key.dl_type = htons(ETH_TYPE_IPV6);
1381 ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
1382 if (!ok) {
1383 return false;
1384 }
1385
1386 /* pf doesn't do this, but it seems a good idea */
1387 if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
1388 &key->dst.addr.ipv6_aligned)
1389 || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
1390 &key->src.addr.ipv6_aligned)) {
1391 return false;
1392 }
1393
1394 key->src = inner_key.src;
1395 key->dst = inner_key.dst;
1396 key->nw_proto = inner_key.nw_proto;
1397
1398 ok = extract_l4(key, l4, tail - l4, NULL, l3);
1399 if (ok) {
1400 conn_key_reverse(key);
1401 *related = true;
1402 }
1403 return ok;
1404 }
1405 default:
1406 return false;
1407 }
1408
1409 return true;
1410 }
1411
1412 /* Extract l4 fields into 'key', which must already contain valid l3
1413 * members.
1414 *
1415 * If 'related' is not NULL and an ICMP error packet is being
1416 * processed, the function will extract the key from the packet nested
1417 * in the ICMP payload and set '*related' to true.
1418 *
1419 * If 'related' is NULL, it means that we're already parsing a header nested
1420 * in an ICMP error. In this case, we skip checksum and length validation. */
1421 static inline bool
1422 extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
1423 const void *l3)
1424 {
1425 if (key->nw_proto == IPPROTO_TCP) {
1426 return (!related || check_l4_tcp(key, data, size, l3))
1427 && extract_l4_tcp(key, data, size);
1428 } else if (key->nw_proto == IPPROTO_UDP) {
1429 return (!related || check_l4_udp(key, data, size, l3))
1430 && extract_l4_udp(key, data, size);
1431 } else if (key->dl_type == htons(ETH_TYPE_IP)
1432 && key->nw_proto == IPPROTO_ICMP) {
1433 return (!related || check_l4_icmp(data, size))
1434 && extract_l4_icmp(key, data, size, related);
1435 } else if (key->dl_type == htons(ETH_TYPE_IPV6)
1436 && key->nw_proto == IPPROTO_ICMPV6) {
1437 return (!related || check_l4_icmp6(key, data, size, l3))
1438 && extract_l4_icmp6(key, data, size, related);
1439 } else {
1440 return false;
1441 }
1442 }
1443
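/* Builds the connection lookup key for 'pkt' in 'ctx->key', computes
 * 'ctx->hash' and sets 'ctx->related' for ICMP error packets.  Returns false
 * if the packet cannot be parsed. */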
1444 static bool
1445 conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
1446 struct conn_lookup_ctx *ctx, uint16_t zone)
1447 {
1448 const struct eth_header *l2 = dp_packet_eth(pkt);
1449 const struct ip_header *l3 = dp_packet_l3(pkt);
1450 const char *l4 = dp_packet_l4(pkt);
1451 const char *tail = dp_packet_tail(pkt);
1452 bool ok;
1453
1454 memset(ctx, 0, sizeof *ctx);
1455
1456 if (!l2 || !l3 || !l4) {
1457 return false;
1458 }
1459
1460 ctx->key.zone = zone;
1461
1462 /* XXX In this function we parse the packet (again, it has already
1463 * gone through miniflow_extract()) for two reasons:
1464 *
1465 * 1) To extract the l3 addresses and l4 ports.
1466 * We already have the l3 and l4 headers' pointers. Extracting
1467 * the l3 addresses and the l4 ports is really cheap, since they
1468 * can be found at fixed locations.
1469 * 2) To extract the l4 type.
1470 * Extracting the l4 type, for IPv6, can be quite expensive, because
1471 * it's not at a fixed location.
1472 *
1473 * Here's a way to avoid (2) with the help of the datapath.
1474 * The datapath doesn't keep the packet's extracted flow[1], so
1475 * using that is not an option. We could use the packet's matching
1476 * megaflow, but we have to make sure that the l4 type (nw_proto)
1477 * is unwildcarded. This means either:
1478 *
1479 * a) dpif-netdev unwildcards the l4 type when a new flow is installed
1480 * if the actions contains ct().
1481 *
1482 * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
1483 * action. This is already done in different actions, but it's
1484 * unnecessary for the kernel.
1485 *
1486 * ---
1487 * [1] The reasons for this are that keeping the flow increases
1488 * (slightly) the cache footprint and increases computation
1489 * time as we move the packet around. Most importantly, the flow
1490 * should be updated by the actions and this can be slow, as
1491 * we use a sparse representation (miniflow).
1492 *
1493 */
1494 ctx->key.dl_type = dl_type;
1495 if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
1496 ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, true);
1497 } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
1498 ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
1499 } else {
1500 ok = false;
1501 }
1502
1503 if (ok) {
1504 if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
1505 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
1506 return true;
1507 }
1508 }
1509
1510 return false;
1511 }
1512
1513 static uint32_t
1514 ct_addr_hash_add(uint32_t hash, const struct ct_addr *addr)
1515 {
1516 BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
1517 return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
1518 }
1519
1520 static uint32_t
1521 ct_endpoint_hash_add(uint32_t hash, const struct ct_endpoint *ep)
1522 {
1523 BUILD_ASSERT_DECL(sizeof *ep % 4 == 0);
1524 return hash_add_bytes32(hash, (const uint32_t *) ep, sizeof *ep);
1525 }
1526 \f
1527 /* Symmetric: a key and its reversed key hash to the same value. */
1528 static uint32_t
1529 conn_key_hash(const struct conn_key *key, uint32_t basis)
1530 {
1531 uint32_t hsrc, hdst, hash;
1532 int i;
1533
1534 hsrc = hdst = basis;
1535
1536 for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
1537 hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
1538 hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
1539 }
1540
1541 /* Even if source and destination are swapped, the hash will be the same. */
1542 hash = hsrc ^ hdst;
1543
1544 /* Hash the rest of the key (L3 and L4 types and zone). */
1545 hash = hash_words((uint32_t *) (&key->dst + 1),
1546 (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
1547 hash);
1548
1549 return hash;
1550 }
1551
1552 static void
1553 conn_key_reverse(struct conn_key *key)
1554 {
1555 struct ct_endpoint tmp;
1556
1557 tmp = key->src;
1558 key->src = key->dst;
1559 key->dst = tmp;
1560 }
1561
1562 static uint32_t
1563 nat_ipv6_addrs_delta(struct in6_addr *ipv6_aligned_min,
1564 struct in6_addr *ipv6_aligned_max)
1565 {
1566 uint8_t *ipv6_min_hi = &ipv6_aligned_min->s6_addr[0];
1567 uint8_t *ipv6_min_lo = &ipv6_aligned_min->s6_addr[0] + sizeof(uint64_t);
1568 uint8_t *ipv6_max_hi = &ipv6_aligned_max->s6_addr[0];
1569 uint8_t *ipv6_max_lo = &ipv6_aligned_max->s6_addr[0] + sizeof(uint64_t);
1570
1571 ovs_be64 addr6_64_min_hi;
1572 ovs_be64 addr6_64_min_lo;
1573 memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
1574 memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);
1575
1576 ovs_be64 addr6_64_max_hi;
1577 ovs_be64 addr6_64_max_lo;
1578 memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
1579 memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);
1580
1581 uint64_t diff;
1582 if (addr6_64_min_hi == addr6_64_max_hi &&
1583 ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
1584 diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
1585 } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
1586 ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
1587 diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
1588 ntohll(addr6_64_max_lo) - 1);
1589 } else {
1590 /* Limit the supported address delta to 32 bits, or approximately 4 billion
1591 * addresses. Possibly, this should be visible to the user through a datapath
1592 * support check; however, the practical impact is probably nil. */
1593 diff = 0xfffffffe;
1594 }
1595 if (diff > 0xfffffffe) {
1596 diff = 0xfffffffe;
1597 }
1598 return diff;
1599 }
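/* For example, a range from fd00::1 to fd00::1:0 shares the upper 64 bits,
 * so the delta is 0x10000 - 0x1 = 0xffff; any range wider than 32 bits is
 * clamped to 0xfffffffe. */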
1600
1601 /* This function must be used in tandem with nat_ipv6_addrs_delta(), which
1602 * restricts the input parameters. */
1603 static void
1604 nat_ipv6_addr_increment(struct in6_addr *ipv6_aligned, uint32_t increment)
1605 {
1606 uint8_t *ipv6_hi = &ipv6_aligned->s6_addr[0];
1607 uint8_t *ipv6_lo = &ipv6_aligned->s6_addr[0] + sizeof(ovs_be64);
1608 ovs_be64 addr6_64_hi;
1609 ovs_be64 addr6_64_lo;
1610 memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
1611 memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);
1612
1613 if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
1614 addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
1615 } else if (addr6_64_hi != OVS_BE64_MAX) {
1616 addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
1617 addr6_64_lo = htonll(increment - (UINT64_MAX -
1618 ntohll(addr6_64_lo) + 1));
1619 } else {
1620 OVS_NOT_REACHED();
1621 }
1622
1623 memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
1624 memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
1625
1626 return;
1627 }
1628
1629 static uint32_t
1630 nat_range_hash(const struct conn *conn, uint32_t basis)
1631 {
1632 uint32_t hash = basis;
1633
1634 hash = ct_addr_hash_add(hash, &conn->nat_info->min_addr);
1635 hash = ct_addr_hash_add(hash, &conn->nat_info->max_addr);
1636 hash = hash_add(hash,
1637 (conn->nat_info->max_port << 16)
1638 | conn->nat_info->min_port);
1639
1640 hash = ct_endpoint_hash_add(hash, &conn->key.src);
1641 hash = ct_endpoint_hash_add(hash, &conn->key.dst);
1642
1643 hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
1644 hash = hash_add(hash, conn->key.nw_proto);
1645 hash = hash_add(hash, conn->key.zone);
1646
1647 /* The purpose of the second parameter is to distinguish hashes of data of
1648 * different length; our data always has the same length so there is no
1649 * value in counting. */
1650 return hash_finish(hash, 0);
1651 }
1652
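/* Selects a NAT address and L4 port for 'conn' from the configured range,
 * starting at a hash-derived offset and probing ports first, then addresses,
 * until an (address, port) pair not present in 'ct->nat_conn_keys' is found.
 * On success the chosen reverse key is recorded in 'ct->nat_conn_keys',
 * 'nat_conn' is filled in and true is returned; false means the range is
 * exhausted. */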
1653 static bool
1654 nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
1655 struct conn *nat_conn)
1656 {
1657 #define MIN_NAT_EPHEMERAL_PORT 1024
1658 #define MAX_NAT_EPHEMERAL_PORT 65535
1659
1660 uint16_t min_port;
1661 uint16_t max_port;
1662 uint16_t first_port;
1663
1664 uint32_t hash = nat_range_hash(conn, ct->hash_basis);
1665
1666 if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
1667 (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
1668 min_port = ntohs(conn->key.src.port);
1669 max_port = ntohs(conn->key.src.port);
1670 first_port = min_port;
1671 } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
1672 (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
1673 min_port = ntohs(conn->key.dst.port);
1674 max_port = ntohs(conn->key.dst.port);
1675 first_port = min_port;
1676 } else {
1677 uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
1678 uint32_t port_index = hash % (deltap + 1);
1679 first_port = conn->nat_info->min_port + port_index;
1680 min_port = conn->nat_info->min_port;
1681 max_port = conn->nat_info->max_port;
1682 }
1683
1684 uint32_t deltaa = 0;
1685 uint32_t address_index;
1686 struct ct_addr ct_addr;
1687 memset(&ct_addr, 0, sizeof ct_addr);
1688 struct ct_addr max_ct_addr;
1689 memset(&max_ct_addr, 0, sizeof max_ct_addr);
1690 max_ct_addr = conn->nat_info->max_addr;
1691
1692 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
1693 deltaa = ntohl(conn->nat_info->max_addr.ipv4_aligned) -
1694 ntohl(conn->nat_info->min_addr.ipv4_aligned);
1695 address_index = hash % (deltaa + 1);
1696 ct_addr.ipv4_aligned = htonl(
1697 ntohl(conn->nat_info->min_addr.ipv4_aligned) + address_index);
1698 } else {
1699 deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6_aligned,
1700 &conn->nat_info->max_addr.ipv6_aligned);
1701 /* deltaa must be within 32 bits for full hash coverage. A 64 or
1702 * 128 bit hash is unnecessary and hence not used here. Most code
1703 * is kept common with V4; nat_ipv6_addrs_delta() will do the
1704 * enforcement via max_ct_addr. */
1705 max_ct_addr = conn->nat_info->min_addr;
1706 nat_ipv6_addr_increment(&max_ct_addr.ipv6_aligned, deltaa);
1707
1708 address_index = hash % (deltaa + 1);
1709 ct_addr.ipv6_aligned = conn->nat_info->min_addr.ipv6_aligned;
1710 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, address_index);
1711 }
1712
1713 uint16_t port = first_port;
1714 bool all_ports_tried = false;
1715 bool original_ports_tried = false;
1716 struct ct_addr first_addr = ct_addr;
1717 *nat_conn = *conn;
1718
1719 while (true) {
1720 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
1721 nat_conn->rev_key.dst.addr = ct_addr;
1722 } else {
1723 nat_conn->rev_key.src.addr = ct_addr;
1724 }
1725
1726 if ((conn->key.nw_proto == IPPROTO_ICMP) ||
1727 (conn->key.nw_proto == IPPROTO_ICMPV6)) {
1728 all_ports_tried = true;
1729 } else if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
1730 nat_conn->rev_key.dst.port = htons(port);
1731 } else {
1732 nat_conn->rev_key.src.port = htons(port);
1733 }
1734
1735 struct nat_conn_key_node *nat_conn_key_node =
1736 nat_conn_keys_lookup(&ct->nat_conn_keys, &nat_conn->rev_key,
1737 ct->hash_basis);
1738
1739 if (!nat_conn_key_node) {
1740 struct nat_conn_key_node *nat_conn_key =
1741 xzalloc(sizeof *nat_conn_key);
1742 nat_conn_key->key = nat_conn->rev_key;
1743 nat_conn_key->value = nat_conn->key;
1744 uint32_t nat_conn_key_hash = conn_key_hash(&nat_conn_key->key,
1745 ct->hash_basis);
1746 hmap_insert(&ct->nat_conn_keys, &nat_conn_key->node,
1747 nat_conn_key_hash);
1748 return true;
1749 } else if (!all_ports_tried) {
1750 if (min_port == max_port) {
1751 all_ports_tried = true;
1752 } else if (port == max_port) {
1753 port = min_port;
1754 } else {
1755 port++;
1756 }
1757 if (port == first_port) {
1758 all_ports_tried = true;
1759 }
1760 } else {
1761 if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
1762 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
1763 ct_addr.ipv4_aligned = htonl(
1764 ntohl(ct_addr.ipv4_aligned) + 1);
1765 } else {
1766 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, 1);
1767 }
1768 } else {
1769 ct_addr = conn->nat_info->min_addr;
1770 }
1771 if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
1772 if (!original_ports_tried) {
1773 original_ports_tried = true;
1774 ct_addr = conn->nat_info->min_addr;
1775 min_port = MIN_NAT_EPHEMERAL_PORT;
1776 max_port = MAX_NAT_EPHEMERAL_PORT;
1777 } else {
1778 break;
1779 }
1780 }
1781 first_port = min_port;
1782 port = first_port;
1783 all_ports_tried = false;
1784 }
1785 }
1786 return false;
1787 }
1788
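/* 'nat_conn_keys' tracks the reverse (translated) keys already in use, so
 * that nat_select_range_tuple() can guarantee that each chosen NAT tuple is
 * unique.  The lookup and remove helpers below walk one hash bucket and
 * compare the full key with memcmp(). */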
1789 static struct nat_conn_key_node *
1790 nat_conn_keys_lookup(struct hmap *nat_conn_keys,
1791 const struct conn_key *key,
1792 uint32_t basis)
1793 {
1794 struct nat_conn_key_node *nat_conn_key_node;
1795 uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
1796
1797 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash,
1798 nat_conn_keys) {
1799 if (!memcmp(&nat_conn_key_node->key, key,
1800 sizeof nat_conn_key_node->key)) {
1801 return nat_conn_key_node;
1802 }
1803 }
1804 return NULL;
1805 }
1806
1807 static void
1808 nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,
1809 uint32_t basis)
1810 {
1811 struct nat_conn_key_node *nat_conn_key_node;
1812 uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
1813
1814 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash,
1815 nat_conn_keys) {
1816 if (!memcmp(&nat_conn_key_node->key, key,
1817 sizeof nat_conn_key_node->key)) {
1818 hmap_remove(nat_conn_keys, &nat_conn_key_node->node);
1819 free(nat_conn_key_node);
1820 return;
1821 }
1822 }
1823 }
1824
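/* Looks up ctx->key in bucket 'ctb'.  A match on a connection's forward key
 * leaves ctx->reply false, a match on its reverse key sets it to true, and
 * connections that have already expired are skipped. */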
1825 static void
1826 conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
1827 long long now)
1828 {
1829 uint32_t hash = ctx->hash;
1830 struct conn *conn;
1831
1832 ctx->conn = NULL;
1833
1834 HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
1835 if (!memcmp(&conn->key, &ctx->key, sizeof conn->key)
1836 && !conn_expired(conn, now)) {
1837 ctx->conn = conn;
1838 ctx->reply = false;
1839 break;
1840 }
1841 if (!memcmp(&conn->rev_key, &ctx->key, sizeof conn->rev_key)
1842 && !conn_expired(conn, now)) {
1843 ctx->conn = conn;
1844 ctx->reply = true;
1845 break;
1846 }
1847 }
1848 }
1849
1850 static enum ct_update_res
1851 conn_update(struct conn *conn, struct conntrack_bucket *ctb,
1852 struct dp_packet *pkt, bool reply, long long now)
1853 {
1854 return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
1855 reply, now);
1856 }
1857
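/* Only default connections carry an expiration time.  NAT reverse-mapping
 * entries (CT_CONN_TYPE_UN_NAT) are presumably removed along with their
 * parent connection, so they never report as expired here. */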
1858 static bool
1859 conn_expired(struct conn *conn, long long now)
1860 {
1861 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
1862 return now >= conn->expiration;
1863 }
1864 return false;
1865 }
1866
1867 static bool
1868 valid_new(struct dp_packet *pkt, struct conn_key *key)
1869 {
1870 return l4_protos[key->nw_proto]->valid_new(pkt);
1871 }
1872
1873 static struct conn *
1874 new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
1875 struct conn_key *key, long long now)
1876 {
1877 struct conn *newconn;
1878
1879 newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
1880
1881 if (newconn) {
1882 newconn->key = *key;
1883 }
1884
1885 return newconn;
1886 }
1887
1888 static void
1889 delete_conn(struct conn *conn)
1890 {
1891 free(conn->nat_info);
1892 free(conn);
1893 }
1894 \f
1895 static void
1896 ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
1897 union ct_dpif_inet_addr *b,
1898 ovs_be16 dl_type)
1899 {
1900 if (dl_type == htons(ETH_TYPE_IP)) {
1901 b->ip = a->ipv4_aligned;
1902     } else if (dl_type == htons(ETH_TYPE_IPV6)) {
1903 b->in6 = a->ipv6_aligned;
1904 }
1905 }
1906
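/* Converts an internal conn_key into the ct_dpif_tuple layout used by the
 * ct-dpif interface.  For ICMP and ICMPv6 the "port" slots of the key carry
 * the ICMP id, type and code rather than L4 ports, so they are copied into
 * the dedicated icmp_* tuple fields instead. */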
1907 static void
1908 conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
1909 {
1910 if (key->dl_type == htons(ETH_TYPE_IP)) {
1911 tuple->l3_type = AF_INET;
1912 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
1913 tuple->l3_type = AF_INET6;
1914 }
1915 tuple->ip_proto = key->nw_proto;
1916 ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
1917 key->dl_type);
1918 ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
1919 key->dl_type);
1920
1921 if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
1922 tuple->icmp_id = key->src.icmp_id;
1923 tuple->icmp_type = key->src.icmp_type;
1924 tuple->icmp_code = key->src.icmp_code;
1925 } else {
1926 tuple->src_port = key->src.port;
1927 tuple->dst_port = key->dst.port;
1928 }
1929 }
1930
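/* Fills a ct_dpif_entry from an internal connection.  Expirations are kept
 * in milliseconds internally, so the remaining lifetime is converted to
 * whole seconds for the entry's 'timeout' field; already expired
 * connections report a timeout of 0. */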
1931 static void
1932 conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
1933 long long now)
1934 {
1935 struct ct_l4_proto *class;
1936 long long expiration;
1937 memset(entry, 0, sizeof *entry);
1938 conn_key_to_tuple(&conn->key, &entry->tuple_orig);
1939 conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
1940
1941 entry->zone = conn->key.zone;
1942 entry->mark = conn->mark;
1943
1944 memcpy(&entry->labels, &conn->label, sizeof entry->labels);
1945     /* Not implemented yet. */
1946 entry->timestamp.start = 0;
1947 entry->timestamp.stop = 0;
1948
1949 expiration = conn->expiration - now;
1950 entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
1951
1952 class = l4_protos[conn->key.nw_proto];
1953 if (class->conn_get_protoinfo) {
1954 class->conn_get_protoinfo(conn, &entry->protoinfo);
1955 }
1956 }
1957
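/* Connection dumping interface.  A minimal usage sketch (hypothetical
 * caller code; the zone filter is optional and may be NULL):
 *
 *     struct conntrack_dump dump;
 *     struct ct_dpif_entry entry;
 *     uint16_t zone = 0;
 *
 *     conntrack_dump_start(ct, &dump, &zone);
 *     while (!conntrack_dump_next(&dump, &entry)) {
 *         // Consume 'entry'.  conntrack_dump_next() returns EOF once the
 *         // last bucket has been scanned.
 *     }
 *     conntrack_dump_done(&dump);
 */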
1958 int
1959 conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
1960 const uint16_t *pzone)
1961 {
1962     memset(dump, 0, sizeof *dump);
1963 if (pzone) {
1964 dump->zone = *pzone;
1965 dump->filter_zone = true;
1966 }
1967 dump->ct = ct;
1968
1969 return 0;
1970 }
1971
1972 int
1973 conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
1974 {
1975 struct conntrack *ct = dump->ct;
1976 long long now = time_msec();
1977
1978 while (dump->bucket < CONNTRACK_BUCKETS) {
1979 struct hmap_node *node;
1980
1981 ct_lock_lock(&ct->buckets[dump->bucket].lock);
1982 for (;;) {
1983 struct conn *conn;
1984
1985 node = hmap_at_position(&ct->buckets[dump->bucket].connections,
1986 &dump->bucket_pos);
1987 if (!node) {
1988 break;
1989 }
1990 INIT_CONTAINER(conn, node, node);
1991 if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
1992 (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
1993 conn_to_ct_dpif_entry(conn, entry, now);
1994 break;
1995 }
1996 /* Else continue, until we find an entry in the appropriate zone
1997 * or the bucket has been scanned completely. */
1998 }
1999 ct_lock_unlock(&ct->buckets[dump->bucket].lock);
2000
2001 if (!node) {
2002 memset(&dump->bucket_pos, 0, sizeof dump->bucket_pos);
2003 dump->bucket++;
2004 } else {
2005 return 0;
2006 }
2007 }
2008 return EOF;
2009 }
2010
2011 int
2012 conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
2013 {
2014 return 0;
2015 }
2016
2017 int
2018 conntrack_flush(struct conntrack *ct, const uint16_t *zone)
2019 {
2020 unsigned i;
2021
2022 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
2023 struct conn *conn, *next;
2024
2025 ct_lock_lock(&ct->buckets[i].lock);
2026         HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
2027 if ((!zone || *zone == conn->key.zone) &&
2028 (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
2029 conn_clean(ct, conn, &ct->buckets[i]);
2030 }
2031 }
2032 ct_lock_unlock(&ct->buckets[i].lock);
2033 }
2034 return 0;
2035 }