1 /*
2 * Copyright (c) 2015, 2016 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "conntrack.h"
19
20 #include <errno.h>
21 #include <sys/types.h>
22 #include <netinet/in.h>
23 #include <netinet/icmp6.h>
24
25 #include "bitmap.h"
26 #include "conntrack-private.h"
27 #include "coverage.h"
28 #include "csum.h"
29 #include "ct-dpif.h"
30 #include "dp-packet.h"
31 #include "flow.h"
32 #include "netdev.h"
33 #include "odp-netlink.h"
34 #include "openvswitch/hmap.h"
35 #include "openvswitch/vlog.h"
36 #include "ovs-rcu.h"
37 #include "ovs-thread.h"
38 #include "poll-loop.h"
39 #include "random.h"
40 #include "timeval.h"
41
42
43 VLOG_DEFINE_THIS_MODULE(conntrack);
44
45 COVERAGE_DEFINE(conntrack_full);
46 COVERAGE_DEFINE(conntrack_long_cleanup);
47
48 struct conn_lookup_ctx {
49 struct conn_key key;
50 struct conn *conn;
51 uint32_t hash;
52 bool reply;
53 bool related;
54 };
55
56 static bool conn_key_extract(struct conntrack *, struct dp_packet *,
57 ovs_be16 dl_type, struct conn_lookup_ctx *,
58 uint16_t zone);
59 static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
60 static void conn_key_reverse(struct conn_key *);
61 static void conn_key_lookup(struct conntrack_bucket *ctb,
62 struct conn_lookup_ctx *ctx,
63 long long now);
64 static bool valid_new(struct dp_packet *pkt, struct conn_key *);
65 static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
66 struct conn_key *, long long now);
67 static void delete_conn(struct conn *);
68 static enum ct_update_res conn_update(struct conn *,
69 struct conntrack_bucket *ctb,
70 struct dp_packet *, bool reply,
71 long long now);
72 static bool conn_expired(struct conn *, long long now);
73 static void set_mark(struct dp_packet *, struct conn *,
74 uint32_t val, uint32_t mask);
75 static void set_label(struct dp_packet *, struct conn *,
76 const struct ovs_key_ct_labels *val,
77 const struct ovs_key_ct_labels *mask);
78 static void *clean_thread_main(void *f_);
79
80 static struct nat_conn_key_node *
81 nat_conn_keys_lookup(struct hmap *nat_conn_keys,
82 const struct conn_key *key,
83 uint32_t basis);
84
85 static void
86 nat_conn_keys_remove(struct hmap *nat_conn_keys,
87 const struct conn_key *key,
88 uint32_t basis);
89
90 static bool
91 nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
92 struct conn *nat_conn);
93
94 static uint8_t
95 reverse_icmp_type(uint8_t type);
96 static uint8_t
97 reverse_icmp6_type(uint8_t type);
98 static inline bool
99 extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
100 const char **new_data, bool validate_checksum);
101 static inline bool
102 extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
103 const char **new_data);
104
105 static struct ct_l4_proto *l4_protos[] = {
106 [IPPROTO_TCP] = &ct_proto_tcp,
107 [IPPROTO_UDP] = &ct_proto_other,
108 [IPPROTO_ICMP] = &ct_proto_icmp4,
109 [IPPROTO_ICMPV6] = &ct_proto_icmp6,
110 };
111
112 long long ct_timeout_val[] = {
113 #define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
114 CT_TIMEOUTS
115 #undef CT_TIMEOUT
116 };
117
118 /* If the total number of connections goes above this value, no new connections
119 * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
120 #define DEFAULT_N_CONN_LIMIT 3000000
121
122 /* Initializes the connection tracker 'ct'. The caller is responsible for
123 * calling 'conntrack_destroy()' when the instance is not needed anymore. */
124 void
125 conntrack_init(struct conntrack *ct)
126 {
127 unsigned i, j;
128 long long now = time_msec();
129
130 ct_rwlock_init(&ct->nat_resources_lock);
131 ct_rwlock_wrlock(&ct->nat_resources_lock);
132 hmap_init(&ct->nat_conn_keys);
133 ct_rwlock_unlock(&ct->nat_resources_lock);
134
135 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
136 struct conntrack_bucket *ctb = &ct->buckets[i];
137
138 ct_lock_init(&ctb->lock);
139 ct_lock_lock(&ctb->lock);
140 hmap_init(&ctb->connections);
141 for (j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
142 ovs_list_init(&ctb->exp_lists[j]);
143 }
144 ct_lock_unlock(&ctb->lock);
145 ovs_mutex_init(&ctb->cleanup_mutex);
146 ovs_mutex_lock(&ctb->cleanup_mutex);
147 ctb->next_cleanup = now + CT_TM_MIN;
148 ovs_mutex_unlock(&ctb->cleanup_mutex);
149 }
150 ct->hash_basis = random_uint32();
151 atomic_count_init(&ct->n_conn, 0);
152 atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
153 latch_init(&ct->clean_thread_exit);
154 ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
155 }
156
157 /* Destroys the connection tracker 'ct' and frees all the allocated memory. */
158 void
159 conntrack_destroy(struct conntrack *ct)
160 {
161 unsigned i;
162
163 latch_set(&ct->clean_thread_exit);
164 pthread_join(ct->clean_thread, NULL);
165 latch_destroy(&ct->clean_thread_exit);
166 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
167 struct conntrack_bucket *ctb = &ct->buckets[i];
168 struct conn *conn;
169
170 ovs_mutex_destroy(&ctb->cleanup_mutex);
171 ct_lock_lock(&ctb->lock);
172 HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
173 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
174 atomic_count_dec(&ct->n_conn);
175 }
176 delete_conn(conn);
177 }
178 hmap_destroy(&ctb->connections);
179 ct_lock_unlock(&ctb->lock);
180 ct_lock_destroy(&ctb->lock);
181 }
182 ct_rwlock_wrlock(&ct->nat_resources_lock);
183 struct nat_conn_key_node *nat_conn_key_node;
184 HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
185 free(nat_conn_key_node);
186 }
187 hmap_destroy(&ct->nat_conn_keys);
188 ct_rwlock_unlock(&ct->nat_resources_lock);
189 ct_rwlock_destroy(&ct->nat_resources_lock);
190 }
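/* Illustrative lifecycle sketch (not part of the build): the tracker is
 * initialized once, fed packet batches through conntrack_execute() (declared
 * in conntrack.h and defined later in this file), and destroyed on shutdown.
 * The 'batch' variable and the IPv4 'dl_type' below are assumptions made for
 * the example only:
 *
 *     struct conntrack ct;
 *     struct dp_packet_batch batch;    // filled in by the datapath
 *
 *     conntrack_init(&ct);
 *     conntrack_execute(&ct, &batch, htons(ETH_TYPE_IP),
 *                       false,        // force
 *                       true,         // commit
 *                       0,            // zone
 *                       NULL, NULL,   // setmark, setlabel
 *                       NULL,         // helper
 *                       NULL);        // nat_action_info
 *     conntrack_destroy(&ct);
 */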
191 \f
192 static unsigned hash_to_bucket(uint32_t hash)
193 {
194 /* Extracts the most significant bits in hash. The least significant bits
195 * are already used internally by the hmap implementation. */
196 BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
197
198 return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
199 }
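/* Worked example (assuming CONNTRACK_BUCKETS_SHIFT were 8, i.e. 256 buckets):
 * a hash of 0xabcd1234 shifted right by 32 - 8 = 24 bits leaves 0xab = 171,
 * and 171 % 256 = 171, so only the top CONNTRACK_BUCKETS_SHIFT bits of the
 * hash select the bucket. */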
200
201 static void
202 write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
203 const struct conn_key *key)
204 {
205 pkt->md.ct_state |= CS_TRACKED;
206 pkt->md.ct_zone = zone;
207 pkt->md.ct_mark = conn ? conn->mark : 0;
208 pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;
209
210 /* Use the original direction tuple if we have it. */
211 if (conn) {
212 key = &conn->key;
213 }
214 pkt->md.ct_orig_tuple_ipv6 = false;
215 if (key) {
216 if (key->dl_type == htons(ETH_TYPE_IP)) {
217 pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
218 key->src.addr.ipv4_aligned,
219 key->dst.addr.ipv4_aligned,
220 key->nw_proto != IPPROTO_ICMP
221 ? key->src.port : htons(key->src.icmp_type),
222 key->nw_proto != IPPROTO_ICMP
223 ? key->dst.port : htons(key->src.icmp_code),
224 key->nw_proto,
225 };
226 } else {
227 pkt->md.ct_orig_tuple_ipv6 = true;
228 pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
229 key->src.addr.ipv6_aligned,
230 key->dst.addr.ipv6_aligned,
231 key->nw_proto != IPPROTO_ICMPV6
232 ? key->src.port : htons(key->src.icmp_type),
233 key->nw_proto != IPPROTO_ICMPV6
234 ? key->dst.port : htons(key->src.icmp_code),
235 key->nw_proto,
236 };
237 }
238 } else {
239 memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
240 }
241
242 }
243
244 static void
245 pat_packet(struct dp_packet *pkt, const struct conn *conn)
246 {
247 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
248 if (conn->key.nw_proto == IPPROTO_TCP) {
249 struct tcp_header *th = dp_packet_l4(pkt);
250 packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
251 } else if (conn->key.nw_proto == IPPROTO_UDP) {
252 struct udp_header *uh = dp_packet_l4(pkt);
253 packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
254 }
255 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
256 if (conn->key.nw_proto == IPPROTO_TCP) {
257 struct tcp_header *th = dp_packet_l4(pkt);
258 packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
259 } else if (conn->key.nw_proto == IPPROTO_UDP) {
260 struct udp_header *uh = dp_packet_l4(pkt);
261 packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
262 }
263 }
264 }
265
266 static void
267 nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
268 {
269 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
270 pkt->md.ct_state |= CS_SRC_NAT;
271 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
272 struct ip_header *nh = dp_packet_l3(pkt);
273 packet_set_ipv4_addr(pkt, &nh->ip_src,
274 conn->rev_key.dst.addr.ipv4_aligned);
275 } else {
276 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
277 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
278 nh6->ip6_src.be32,
279 &conn->rev_key.dst.addr.ipv6_aligned,
280 true);
281 }
282 if (!related) {
283 pat_packet(pkt, conn);
284 }
285 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
286 pkt->md.ct_state |= CS_DST_NAT;
287 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
288 struct ip_header *nh = dp_packet_l3(pkt);
289 packet_set_ipv4_addr(pkt, &nh->ip_dst,
290 conn->rev_key.src.addr.ipv4_aligned);
291 } else {
292 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
293 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
294 nh6->ip6_dst.be32,
295 &conn->rev_key.src.addr.ipv6_aligned,
296 true);
297 }
298 if (!related) {
299 pat_packet(pkt, conn);
300 }
301 }
302 }
303
304 static void
305 un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
306 {
307 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
308 if (conn->key.nw_proto == IPPROTO_TCP) {
309 struct tcp_header *th = dp_packet_l4(pkt);
310 packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
311 } else if (conn->key.nw_proto == IPPROTO_UDP) {
312 struct udp_header *uh = dp_packet_l4(pkt);
313 packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
314 }
315 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
316 if (conn->key.nw_proto == IPPROTO_TCP) {
317 struct tcp_header *th = dp_packet_l4(pkt);
318 packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
319 } else if (conn->key.nw_proto == IPPROTO_UDP) {
320 struct udp_header *uh = dp_packet_l4(pkt);
321 packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
322 }
323 }
324 }
325
326 static void
327 reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
328 {
329 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
330 if (conn->key.nw_proto == IPPROTO_TCP) {
331 struct tcp_header *th_in = dp_packet_l4(pkt);
332 packet_set_tcp_port(pkt, conn->key.src.port,
333 th_in->tcp_dst);
334 } else if (conn->key.nw_proto == IPPROTO_UDP) {
335 struct udp_header *uh_in = dp_packet_l4(pkt);
336 packet_set_udp_port(pkt, conn->key.src.port,
337 uh_in->udp_dst);
338 }
339 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
340 if (conn->key.nw_proto == IPPROTO_TCP) {
341 struct tcp_header *th_in = dp_packet_l4(pkt);
342 packet_set_tcp_port(pkt, th_in->tcp_src,
343 conn->key.dst.port);
344 } else if (conn->key.nw_proto == IPPROTO_UDP) {
345 struct udp_header *uh_in = dp_packet_l4(pkt);
346 packet_set_udp_port(pkt, uh_in->udp_src,
347 conn->key.dst.port);
348 }
349 }
350 }
351
352 static void
353 reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
354 {
355 char *tail = dp_packet_tail(pkt);
356 char pad = dp_packet_l2_pad_size(pkt);
357 struct conn_key inner_key;
358 const char *inner_l4 = NULL;
359 uint16_t orig_l3_ofs = pkt->l3_ofs;
360 uint16_t orig_l4_ofs = pkt->l4_ofs;
361
362 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
363 struct ip_header *nh = dp_packet_l3(pkt);
364 struct icmp_header *icmp = dp_packet_l4(pkt);
365 struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
366 extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3)
367 -pad, &inner_l4, false);
368
369 pkt->l3_ofs += (char *) inner_l3 - (char *) nh;
370 pkt->l4_ofs += inner_l4 - (char *) icmp;
371
372 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
373 packet_set_ipv4_addr(pkt, &inner_l3->ip_src,
374 conn->key.src.addr.ipv4_aligned);
375 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
376 packet_set_ipv4_addr(pkt, &inner_l3->ip_dst,
377 conn->key.dst.addr.ipv4_aligned);
378 }
379 reverse_pat_packet(pkt, conn);
380 icmp->icmp_csum = 0;
381 icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
382 } else {
383 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
384 struct icmp6_error_header *icmp6 = dp_packet_l4(pkt);
385 struct ovs_16aligned_ip6_hdr *inner_l3_6 =
386 (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1);
387 extract_l3_ipv6(&inner_key, inner_l3_6,
388 tail - ((char *)inner_l3_6) - pad,
389 &inner_l4);
390 pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6;
391 pkt->l4_ofs += inner_l4 - (char *) icmp6;
392
393 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
394 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
395 inner_l3_6->ip6_src.be32,
396 &conn->key.src.addr.ipv6_aligned,
397 true);
398 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
399 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
400 inner_l3_6->ip6_dst.be32,
401 &conn->key.dst.addr.ipv6_aligned,
402 true);
403 }
404 reverse_pat_packet(pkt, conn);
405 uint32_t icmp6_csum = packet_csum_pseudoheader6(nh6);
406 icmp6->icmp6_base.icmp6_cksum = 0;
407 icmp6->icmp6_base.icmp6_cksum = csum_finish(
408 csum_continue(icmp6_csum, icmp6, tail - (char *) icmp6 - pad));
409 }
410 pkt->l3_ofs = orig_l3_ofs;
411 pkt->l4_ofs = orig_l4_ofs;
412 }
413
414 static void
415 un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
416 bool related)
417 {
418 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
419 pkt->md.ct_state |= CS_DST_NAT;
420 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
421 struct ip_header *nh = dp_packet_l3(pkt);
422 packet_set_ipv4_addr(pkt, &nh->ip_dst,
423 conn->key.src.addr.ipv4_aligned);
424 } else {
425 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
426 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
427 nh6->ip6_dst.be32,
428 &conn->key.src.addr.ipv6_aligned, true);
429 }
430
431 if (OVS_UNLIKELY(related)) {
432 reverse_nat_packet(pkt, conn);
433 } else {
434 un_pat_packet(pkt, conn);
435 }
436 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
437 pkt->md.ct_state |= CS_SRC_NAT;
438 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
439 struct ip_header *nh = dp_packet_l3(pkt);
440 packet_set_ipv4_addr(pkt, &nh->ip_src,
441 conn->key.dst.addr.ipv4_aligned);
442 } else {
443 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
444 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
445 nh6->ip6_src.be32,
446 &conn->key.dst.addr.ipv6_aligned, true);
447 }
448
449 if (OVS_UNLIKELY(related)) {
450 reverse_nat_packet(pkt, conn);
451 } else {
452 un_pat_packet(pkt, conn);
453 }
454 }
455 }
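/* Worked example for nat_packet()/un_nat_packet() (addresses are purely
 * illustrative): for an SNAT connection whose key is
 * 10.1.1.2:3000 -> 10.254.254.2:80 and whose rev_key, after NAT selection,
 * is 10.254.254.2:80 -> 10.1.1.240:3000:
 *
 *   - nat_packet() rewrites a forward packet's source to 10.1.1.240:3000
 *     (rev_key.dst) and sets CS_SRC_NAT;
 *   - un_nat_packet() rewrites a reply packet's destination back to
 *     10.1.1.2:3000 (key.src) and sets CS_DST_NAT.
 */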
456
457 /* Typical usage of this helper is in non per-packet code: the caller must
458 * already hold the bucket lock for the lookup, and per-packet code would
459 * already have computed the hash. Hence, this function is just intended
460 * for code clarity. */
461 static struct conn *
462 conn_lookup(struct conntrack *ct, struct conn_key *key, long long now)
463 {
464 struct conn_lookup_ctx ctx;
465 ctx.conn = NULL;
466 ctx.key = *key;
467 ctx.hash = conn_key_hash(key, ct->hash_basis);
468 unsigned bucket = hash_to_bucket(ctx.hash);
469 conn_key_lookup(&ct->buckets[bucket], &ctx, now);
470 return ctx.conn;
471 }
472
473 static void
474 nat_clean(struct conntrack *ct, struct conn *conn,
475 struct conntrack_bucket *ctb)
476 OVS_REQUIRES(ctb->lock)
477 {
478 long long now = time_msec();
479 ct_rwlock_wrlock(&ct->nat_resources_lock);
480 nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
481 ct_rwlock_unlock(&ct->nat_resources_lock);
482 ct_lock_unlock(&ctb->lock);
483
484 uint32_t hash_rev_conn = conn_key_hash(&conn->rev_key, ct->hash_basis);
485 unsigned bucket_rev_conn = hash_to_bucket(hash_rev_conn);
486
487 ct_lock_lock(&ct->buckets[bucket_rev_conn].lock);
488 ct_rwlock_wrlock(&ct->nat_resources_lock);
489
490 struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now);
491
492 struct nat_conn_key_node *nat_conn_key_node =
493 nat_conn_keys_lookup(&ct->nat_conn_keys, &conn->rev_key,
494 ct->hash_basis);
495
496 /* In the unlikely event that the rev conn was recreated, skip
497 * the rev_conn cleanup. */
498 if (rev_conn && (!nat_conn_key_node ||
499 memcmp(&nat_conn_key_node->value, &rev_conn->rev_key,
500 sizeof nat_conn_key_node->value))) {
501 hmap_remove(&ct->buckets[bucket_rev_conn].connections,
502 &rev_conn->node);
503 free(rev_conn);
504 }
505 delete_conn(conn);
506
507 ct_rwlock_unlock(&ct->nat_resources_lock);
508 ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock);
509 ct_lock_lock(&ctb->lock);
510 }
511
512 static void
513 conn_clean(struct conntrack *ct, struct conn *conn,
514 struct conntrack_bucket *ctb)
515 OVS_REQUIRES(ctb->lock)
516 {
517 ovs_list_remove(&conn->exp_node);
518 hmap_remove(&ctb->connections, &conn->node);
519 atomic_count_dec(&ct->n_conn);
520 if (conn->nat_info) {
521 nat_clean(ct, conn, ctb);
522 } else {
523 delete_conn(conn);
524 }
525 }
526
527 static struct conn *
528 conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
529 struct conn_lookup_ctx *ctx, bool commit, long long now,
530 const struct nat_action_info_t *nat_action_info,
531 struct conn *conn_for_un_nat_copy)
532 {
533 unsigned bucket = hash_to_bucket(ctx->hash);
534 struct conn *nc = NULL;
535
536 if (!valid_new(pkt, &ctx->key)) {
537 pkt->md.ct_state = CS_INVALID;
538 return nc;
539 }
540 pkt->md.ct_state = CS_NEW;
541
542 if (commit) {
543 unsigned int n_conn_limit;
544
545 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
546
547 if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
548 COVERAGE_INC(conntrack_full);
549 return nc;
550 }
551
552 nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
553 ctx->conn = nc;
554 nc->rev_key = nc->key;
555 conn_key_reverse(&nc->rev_key);
556
557 if (nat_action_info) {
558 nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
559 ct_rwlock_wrlock(&ct->nat_resources_lock);
560
561 bool nat_res = nat_select_range_tuple(ct, nc,
562 conn_for_un_nat_copy);
563
564 if (!nat_res) {
565 free(nc->nat_info);
566 nc->nat_info = NULL;
567 free(nc);
568 ct_rwlock_unlock(&ct->nat_resources_lock);
569 return NULL;
570 }
571
572 if (conn_for_un_nat_copy &&
573 nc->conn_type == CT_CONN_TYPE_DEFAULT) {
574 *nc = *conn_for_un_nat_copy;
575 conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
576 }
577 ct_rwlock_unlock(&ct->nat_resources_lock);
578
579 nat_packet(pkt, nc, ctx->related);
580 }
581 hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
582 atomic_count_inc(&ct->n_conn);
583 }
584 return nc;
585 }
586
587 static bool
588 conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
589 struct conn_lookup_ctx *ctx, struct conn **conn,
590 long long now, unsigned bucket)
591 OVS_REQUIRES(ct->buckets[bucket].lock)
592 {
593 bool create_new_conn = false;
594
595 if (ctx->related) {
596 pkt->md.ct_state |= CS_RELATED;
597 if (ctx->reply) {
598 pkt->md.ct_state |= CS_REPLY_DIR;
599 }
600 } else {
601 enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket],
602 pkt, ctx->reply, now);
603
604 switch (res) {
605 case CT_UPDATE_VALID:
606 pkt->md.ct_state |= CS_ESTABLISHED;
607 pkt->md.ct_state &= ~CS_NEW;
608 if (ctx->reply) {
609 pkt->md.ct_state |= CS_REPLY_DIR;
610 }
611 break;
612 case CT_UPDATE_INVALID:
613 pkt->md.ct_state = CS_INVALID;
614 break;
615 case CT_UPDATE_NEW:
616 conn_clean(ct, *conn, &ct->buckets[bucket]);
617 create_new_conn = true;
618 break;
619 default:
620 OVS_NOT_REACHED();
621 }
622 }
623 return create_new_conn;
624 }
625
626 static void
627 create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
628 long long now)
629 {
630 struct conn *nc = xmemdup(conn_for_un_nat_copy, sizeof *nc);
631 nc->key = conn_for_un_nat_copy->rev_key;
632 nc->rev_key = conn_for_un_nat_copy->key;
633 uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis);
634 unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash);
635 ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock);
636 ct_rwlock_rdlock(&ct->nat_resources_lock);
637
638 struct conn *rev_conn = conn_lookup(ct, &nc->key, now);
639
640 struct nat_conn_key_node *nat_conn_key_node =
641 nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis);
642 if (nat_conn_key_node
643 && !memcmp(&nat_conn_key_node->value, &nc->rev_key,
644 sizeof nat_conn_key_node->value)
645 && !rev_conn) {
646 hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
647 &nc->node, un_nat_hash);
648 } else {
649 free(nc);
650 }
651 ct_rwlock_unlock(&ct->nat_resources_lock);
652 ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock);
653 }
654
655 static void
656 handle_nat(struct dp_packet *pkt, struct conn *conn,
657 uint16_t zone, bool reply, bool related)
658 {
659 if (conn->nat_info &&
660 (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
661 (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
662 zone != pkt->md.ct_zone))) {
663 if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
664 pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
665 }
666 if (reply) {
667 un_nat_packet(pkt, conn, related);
668 } else {
669 nat_packet(pkt, conn, related);
670 }
671 }
672 }
673
674 static void
675 process_one(struct conntrack *ct, struct dp_packet *pkt,
676 struct conn_lookup_ctx *ctx, uint16_t zone,
677 bool force, bool commit, long long now, const uint32_t *setmark,
678 const struct ovs_key_ct_labels *setlabel,
679 const struct nat_action_info_t *nat_action_info)
680 {
681 struct conn *conn;
682 unsigned bucket = hash_to_bucket(ctx->hash);
683 ct_lock_lock(&ct->buckets[bucket].lock);
684 conn_key_lookup(&ct->buckets[bucket], ctx, now);
685 conn = ctx->conn;
686
687 /* Delete found entry if in wrong direction. 'force' implies commit. */
688 if (conn && force && ctx->reply) {
689 conn_clean(ct, conn, &ct->buckets[bucket]);
690 conn = NULL;
691 }
692
693 if (OVS_LIKELY(conn)) {
694 if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
695
696 ctx->reply = true;
697
698 struct conn_lookup_ctx ctx2;
699 ctx2.conn = NULL;
700 ctx2.key = conn->rev_key;
701 ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis);
702
703 ct_lock_unlock(&ct->buckets[bucket].lock);
704 bucket = hash_to_bucket(ctx2.hash);
705
706 ct_lock_lock(&ct->buckets[bucket].lock);
707 conn_key_lookup(&ct->buckets[bucket], &ctx2, now);
708
709 if (ctx2.conn) {
710 conn = ctx2.conn;
711 } else {
712 /* This is a race condition where the conn has timed out and been removed
713 * between the unlock of the rev_conn and the lock of the forward conn;
714 * nothing to do. */
715 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
716 ct_lock_unlock(&ct->buckets[bucket].lock);
717 return;
718 }
719 }
720 }
721
722 bool create_new_conn = false;
723 struct conn conn_for_un_nat_copy;
724 conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;
725 if (OVS_LIKELY(conn)) {
726 create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
727 if (nat_action_info && !create_new_conn) {
728 handle_nat(pkt, conn, zone, ctx->reply, ctx->related);
729 }
730 } else {
731 if (ctx->related) {
732 pkt->md.ct_state = CS_INVALID;
733 } else {
734 create_new_conn = true;
735 }
736 }
737
738 if (OVS_UNLIKELY(create_new_conn)) {
739 conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
740 &conn_for_un_nat_copy);
741 }
742
743 write_ct_md(pkt, zone, conn, &ctx->key);
744 if (conn && setmark) {
745 set_mark(pkt, conn, setmark[0], setmark[1]);
746 }
747
748 if (conn && setlabel) {
749 set_label(pkt, conn, &setlabel[0], &setlabel[1]);
750 }
751
752 ct_lock_unlock(&ct->buckets[bucket].lock);
753
754 if (conn_for_un_nat_copy.conn_type == CT_CONN_TYPE_UN_NAT) {
755 create_un_nat_conn(ct, &conn_for_un_nat_copy, now);
756 }
757 }
758
759 /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
760 * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
761 * the l3 and l4 offsets properly set.
762 *
763 * If 'commit' is true, the packets are allowed to create new entries in the
764 * connection tables. 'setmark', if not NULL, should point to a two
765 * elements array containing a value and a mask to set the connection mark.
766 * 'setlabel' behaves similarly for the connection label. */
767 int
768 conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
769 ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
770 const uint32_t *setmark,
771 const struct ovs_key_ct_labels *setlabel,
772 const char *helper,
773 const struct nat_action_info_t *nat_action_info)
774 {
775 struct dp_packet **pkts = pkt_batch->packets;
776 size_t cnt = pkt_batch->count;
777 long long now = time_msec();
778 struct conn_lookup_ctx ctx;
779
780 if (helper) {
781 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
782
783 VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
784 /* Continue without the helper */
785 }
786
787 for (size_t i = 0; i < cnt; i++) {
788 if (!conn_key_extract(ct, pkts[i], dl_type, &ctx, zone)) {
789 pkts[i]->md.ct_state = CS_INVALID;
790 write_ct_md(pkts[i], zone, NULL, NULL);
791 continue;
792 }
793 process_one(ct, pkts[i], &ctx, zone, force, commit,
794 now, setmark, setlabel, nat_action_info);
795 }
796
797 return 0;
798 }
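/* Usage sketch (illustrative only; 'ct', 'batch' and 'zone' are assumed to
 * exist in the caller): committing connections and setting ct_mark to 0x1
 * with a full mask requires a two-element {value, mask} array for 'setmark':
 *
 *     uint32_t setmark[2] = { 0x1, UINT32_MAX };
 *
 *     conntrack_execute(ct, &batch, htons(ETH_TYPE_IP), false, true,
 *                       zone, setmark, NULL, NULL, NULL);
 */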
799
800 static void
801 set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
802 {
803 pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
804 conn->mark = pkt->md.ct_mark;
805 }
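/* E.g. with an existing ct_mark of 0x000000ff, val 0x00001200 and mask
 * 0x0000ff00 produce 0x000012ff: 'mask' selects the bits that 'val'
 * overwrites, and the remaining bits of the previous mark are preserved. */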
806
807 static void
808 set_label(struct dp_packet *pkt, struct conn *conn,
809 const struct ovs_key_ct_labels *val,
810 const struct ovs_key_ct_labels *mask)
811 {
812 ovs_u128 v, m;
813
814 memcpy(&v, val, sizeof v);
815 memcpy(&m, mask, sizeof m);
816
817 pkt->md.ct_label.u64.lo = v.u64.lo
818 | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
819 pkt->md.ct_label.u64.hi = v.u64.hi
820 | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
821 conn->label = pkt->md.ct_label;
822 }
823
824 \f
825 /* Delete the expired connections from 'ctb', up to 'limit'. Returns the
826 * earliest expiration time among the remaining connections in 'ctb'. Returns
827 * LLONG_MAX if 'ctb' is empty. The return value might be smaller than 'now'
828 * if 'limit' is reached. */
829 static long long
830 sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
831 size_t limit)
832 OVS_REQUIRES(ctb->lock)
833 {
834 struct conn *conn, *next;
835 long long min_expiration = LLONG_MAX;
836 unsigned i;
837 size_t count = 0;
838
839 for (i = 0; i < N_CT_TM; i++) {
840 LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
841 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
842 if (!conn_expired(conn, now) || count >= limit) {
843 min_expiration = MIN(min_expiration, conn->expiration);
844 if (count >= limit) {
845 /* Do not check other lists. */
846 COVERAGE_INC(conntrack_long_cleanup);
847 return min_expiration;
848 }
849 break;
850 }
851 conn_clean(ct, conn, ctb);
852 count++;
853 }
854 }
855 }
856
857 return min_expiration;
858 }
859
860 /* Cleans up old connection entries from 'ct'. Returns the time when the
861 * next expiration might happen. The return value might be smaller than
862 * 'now', meaning that an internal limit has been reached, and some expired
863 * connections have not been deleted. */
864 static long long
865 conntrack_clean(struct conntrack *ct, long long now)
866 {
867 long long next_wakeup = now + CT_TM_MIN;
868 unsigned int n_conn_limit;
869 size_t clean_count = 0;
870 unsigned i;
871
872 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
873
874 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
875 struct conntrack_bucket *ctb = &ct->buckets[i];
876 size_t prev_count;
877 long long min_exp;
878
879 ovs_mutex_lock(&ctb->cleanup_mutex);
880 if (ctb->next_cleanup > now) {
881 goto next_bucket;
882 }
883
884 ct_lock_lock(&ctb->lock);
885 prev_count = hmap_count(&ctb->connections);
886 /* If the connections are well distributed among buckets, we want to
887 * limit to 10% of the global limit equally split among buckets. If
888 * the bucket is busier than the others, we limit to 10% of its
889 * current size. */
890 min_exp = sweep_bucket(ct, ctb, now,
891 MAX(prev_count/10, n_conn_limit/(CONNTRACK_BUCKETS*10)));
892 clean_count += prev_count - hmap_count(&ctb->connections);
893
894 if (min_exp > now) {
895 /* We call hmap_shrink() only if sweep_bucket() managed to delete
896 * every expired connection. */
897 hmap_shrink(&ctb->connections);
898 }
899
900 ct_lock_unlock(&ctb->lock);
901
902 ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);
903
904 next_bucket:
905 next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
906 ovs_mutex_unlock(&ctb->cleanup_mutex);
907 }
908
909 VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
910 clean_count, time_msec() - now);
911
912 return next_wakeup;
913 }
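/* Worked example for the sweep limit above (illustrative; assumes 256
 * buckets): with the default n_conn_limit of 3,000,000, the per-bucket floor
 * is 3,000,000 / (256 * 10) ~= 1171 deletions per pass, while a bucket that
 * has grown to 50,000 connections is allowed 50,000 / 10 = 5,000 deletions
 * in a single pass. */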
914
915 /* Cleanup:
916 *
917 * We must call conntrack_clean() periodically. conntrack_clean()'s return
918 * value gives a hint on when the next cleanup must be done (either because
919 * there is an actual connection that expires, or because a new connection
920 * might be created with the minimum timeout).
921 *
922 * The logic below has two goals:
923 *
924 * - We want to reduce the number of wakeups and batch connection cleanup
925 * when the load is not very high. CT_CLEAN_INTERVAL ensures that if we
926 * are coping with the current cleanup tasks, then we wait at least
927 * 5 seconds to do further cleanup.
928 *
929 * - We don't want to keep the buckets locked too long, as we might prevent
930 * traffic from flowing. CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
931 * behind, there are at least some 200ms blocks of time when buckets will be
932 * left alone, so the datapath can operate unhindered.
933 */
934 #define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
935 #define CT_CLEAN_MIN_INTERVAL 200 /* 0.2 seconds */
936
937 static void *
938 clean_thread_main(void *f_)
939 {
940 struct conntrack *ct = f_;
941
942 while (!latch_is_set(&ct->clean_thread_exit)) {
943 long long next_wake;
944 long long now = time_msec();
945
946 next_wake = conntrack_clean(ct, now);
947
948 if (next_wake < now) {
949 poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
950 } else {
951 poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
952 }
953 latch_wait(&ct->clean_thread_exit);
954 poll_block();
955 }
956
957 return NULL;
958 }
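/* Timing example for the loop above (times are illustrative): if
 * conntrack_clean() returned a time in the past (cleanup is behind), the
 * thread sleeps only CT_CLEAN_MIN_INTERVAL (200 ms) before the next pass;
 * if it returned now + 1 second, the thread still sleeps the full
 * CT_CLEAN_INTERVAL (5 seconds), because MAX(next_wake, now + 5000) wins. */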
959 \f
960 /* Key extraction */
961
962 /* The function stores a pointer to the first byte after the header in
963 * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
964 * not interested in the header's tail, meaning that the header has
965 * already been parsed (e.g. by flow_extract): we take this as a hint to
966 * save a few checks. If 'validate_checksum' is true, the function returns
967 * false if the IPv4 checksum is invalid. */
968 static inline bool
969 extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
970 const char **new_data, bool validate_checksum)
971 {
972 const struct ip_header *ip = data;
973 size_t ip_len;
974
975 if (new_data) {
976 if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
977 return false;
978 }
979 }
980
981 ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
982
983 if (new_data) {
984 if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
985 return false;
986 }
987 if (OVS_UNLIKELY(size < ip_len)) {
988 return false;
989 }
990
991 *new_data = (char *) data + ip_len;
992 }
993
994 if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
995 return false;
996 }
997
998 if (validate_checksum && csum(data, ip_len) != 0) {
999 return false;
1000 }
1001
1002 key->src.addr.ipv4 = ip->ip_src;
1003 key->dst.addr.ipv4 = ip->ip_dst;
1004 key->nw_proto = ip->ip_proto;
1005
1006 return true;
1007 }
1008
1009 /* The function stores a pointer to the first byte after the header in
1010 * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
1011 * not interested in the header's tail, meaning that the header has
1012 * already been parsed (e.g. by flow_extract): we take this as a hint to
1013 * save a few checks. */
1014 static inline bool
1015 extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
1016 const char **new_data)
1017 {
1018 const struct ovs_16aligned_ip6_hdr *ip6 = data;
1019
1020 if (new_data) {
1021 if (OVS_UNLIKELY(size < sizeof *ip6)) {
1022 return false;
1023 }
1024 }
1025
1026 uint8_t nw_proto = ip6->ip6_nxt;
1027 uint8_t nw_frag = 0;
1028
1029 data = ip6 + 1;
1030 size -= sizeof *ip6;
1031
1032 if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
1033 return false;
1034 }
1035
1036 if (new_data) {
1037 *new_data = data;
1038 }
1039
1040 if (nw_frag) {
1041 return false;
1042 }
1043
1044 key->src.addr.ipv6 = ip6->ip6_src;
1045 key->dst.addr.ipv6 = ip6->ip6_dst;
1046 key->nw_proto = nw_proto;
1047
1048 return true;
1049 }
1050
1051 static inline bool
1052 checksum_valid(const struct conn_key *key, const void *data, size_t size,
1053 const void *l3)
1054 {
1055 uint32_t csum = 0;
1056
1057 if (key->dl_type == htons(ETH_TYPE_IP)) {
1058 csum = packet_csum_pseudoheader(l3);
1059 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
1060 csum = packet_csum_pseudoheader6(l3);
1061 } else {
1062 return false;
1063 }
1064
1065 csum = csum_continue(csum, data, size);
1066
1067 return csum_finish(csum) == 0;
1068 }
1069
1070 static inline bool
1071 check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
1072 const void *l3)
1073 {
1074 const struct tcp_header *tcp = data;
1075 if (size < sizeof *tcp) {
1076 return false;
1077 }
1078
1079 size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
1080 if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
1081 return false;
1082 }
1083
1084 return checksum_valid(key, data, size, l3);
1085 }
1086
1087 static inline bool
1088 check_l4_udp(const struct conn_key *key, const void *data, size_t size,
1089 const void *l3)
1090 {
1091 const struct udp_header *udp = data;
1092 if (size < sizeof *udp) {
1093 return false;
1094 }
1095
1096 size_t udp_len = ntohs(udp->udp_len);
1097 if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
1098 return false;
1099 }
1100
1101 /* Validation must be skipped if checksum is 0 on IPv4 packets */
1102 return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
1103 || checksum_valid(key, data, size, l3);
1104 }
1105
1106 static inline bool
1107 check_l4_icmp(const void *data, size_t size)
1108 {
1109 return csum(data, size) == 0;
1110 }
1111
1112 static inline bool
1113 check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
1114 const void *l3)
1115 {
1116 return checksum_valid(key, data, size, l3);
1117 }
1118
1119 static inline bool
1120 extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
1121 {
1122 const struct tcp_header *tcp = data;
1123
1124 if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
1125 return false;
1126 }
1127
1128 key->src.port = tcp->tcp_src;
1129 key->dst.port = tcp->tcp_dst;
1130
1131 /* Port 0 is invalid */
1132 return key->src.port && key->dst.port;
1133 }
1134
1135 static inline bool
1136 extract_l4_udp(struct conn_key *key, const void *data, size_t size)
1137 {
1138 const struct udp_header *udp = data;
1139
1140 if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
1141 return false;
1142 }
1143
1144 key->src.port = udp->udp_src;
1145 key->dst.port = udp->udp_dst;
1146
1147 /* Port 0 is invalid */
1148 return key->src.port && key->dst.port;
1149 }
1150
1151 static inline bool extract_l4(struct conn_key *key, const void *data,
1152 size_t size, bool *related, const void *l3);
1153
1154 static uint8_t
1155 reverse_icmp_type(uint8_t type)
1156 {
1157 switch (type) {
1158 case ICMP4_ECHO_REQUEST:
1159 return ICMP4_ECHO_REPLY;
1160 case ICMP4_ECHO_REPLY:
1161 return ICMP4_ECHO_REQUEST;
1162
1163 case ICMP4_TIMESTAMP:
1164 return ICMP4_TIMESTAMPREPLY;
1165 case ICMP4_TIMESTAMPREPLY:
1166 return ICMP4_TIMESTAMP;
1167
1168 case ICMP4_INFOREQUEST:
1169 return ICMP4_INFOREPLY;
1170 case ICMP4_INFOREPLY:
1171 return ICMP4_INFOREQUEST;
1172 default:
1173 OVS_NOT_REACHED();
1174 }
1175 }
1176
1177 /* If 'related' is not NULL and the function is processing an ICMP
1178 * error packet, extract the l3 and l4 fields from the nested header
1179 * instead and set *related to true. If 'related' is NULL we're
1180 * already processing a nested header and no such recursion is
1181 * possible */
1182 static inline bool
1183 extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
1184 bool *related)
1185 {
1186 const struct icmp_header *icmp = data;
1187
1188 if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
1189 return false;
1190 }
1191
1192 switch (icmp->icmp_type) {
1193 case ICMP4_ECHO_REQUEST:
1194 case ICMP4_ECHO_REPLY:
1195 case ICMP4_TIMESTAMP:
1196 case ICMP4_TIMESTAMPREPLY:
1197 case ICMP4_INFOREQUEST:
1198 case ICMP4_INFOREPLY:
1199 if (icmp->icmp_code != 0) {
1200 return false;
1201 }
1202 /* Separate ICMP connection: identified using id */
1203 key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
1204 key->src.icmp_type = icmp->icmp_type;
1205 key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
1206 break;
1207 case ICMP4_DST_UNREACH:
1208 case ICMP4_TIME_EXCEEDED:
1209 case ICMP4_PARAM_PROB:
1210 case ICMP4_SOURCEQUENCH:
1211 case ICMP4_REDIRECT: {
1212 /* ICMP packet part of another connection. We should
1213 * extract the key from the embedded packet header. */
1214 struct conn_key inner_key;
1215 const char *l3 = (const char *) (icmp + 1);
1216 const char *tail = (const char *) data + size;
1217 const char *l4;
1218 bool ok;
1219
1220 if (!related) {
1221 return false;
1222 }
1223
1224 memset(&inner_key, 0, sizeof inner_key);
1225 inner_key.dl_type = htons(ETH_TYPE_IP);
1226 ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
1227 if (!ok) {
1228 return false;
1229 }
1230
1231 if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
1232 || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
1233 return false;
1234 }
1235
1236 key->src = inner_key.src;
1237 key->dst = inner_key.dst;
1238 key->nw_proto = inner_key.nw_proto;
1239
1240 ok = extract_l4(key, l4, tail - l4, NULL, l3);
1241 if (ok) {
1242 conn_key_reverse(key);
1243 *related = true;
1244 }
1245 return ok;
1246 }
1247 default:
1248 return false;
1249 }
1250
1251 return true;
1252 }
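/* Worked example (addresses are illustrative): suppose a tracked TCP
 * connection 10.0.0.1:1000 -> 10.0.0.2:80 exists and 10.0.0.2 answers with
 * an ICMP port-unreachable that embeds the original IP and TCP headers.
 * The inner key extracted above is 10.0.0.1:1000 -> 10.0.0.2:80; it passes
 * the outer/inner address consistency check, and after conn_key_reverse()
 * the key becomes 10.0.0.2:80 -> 10.0.0.1:1000, which matches the existing
 * connection's rev_key, so the packet is reported as related in the reply
 * direction. */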
1253
1254 static uint8_t
1255 reverse_icmp6_type(uint8_t type)
1256 {
1257 switch (type) {
1258 case ICMP6_ECHO_REQUEST:
1259 return ICMP6_ECHO_REPLY;
1260 case ICMP6_ECHO_REPLY:
1261 return ICMP6_ECHO_REQUEST;
1262 default:
1263 OVS_NOT_REACHED();
1264 }
1265 }
1266
1267 /* If 'related' is not NULL and the function is processing an ICMP
1268 * error packet, extract the l3 and l4 fields from the nested header
1269 * instead and set *related to true. If 'related' is NULL we're
1270 * already processing a nested header and no such recursion is
1271 * possible */
1272 static inline bool
1273 extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
1274 bool *related)
1275 {
1276 const struct icmp6_header *icmp6 = data;
1277
1278 /* All the messages that we support need at least 4 bytes after
1279 * the header */
1280 if (size < sizeof *icmp6 + 4) {
1281 return false;
1282 }
1283
1284 switch (icmp6->icmp6_type) {
1285 case ICMP6_ECHO_REQUEST:
1286 case ICMP6_ECHO_REPLY:
1287 if (icmp6->icmp6_code != 0) {
1288 return false;
1289 }
1290 /* Separate ICMP connection: identified using id */
1291 key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
1292 key->src.icmp_type = icmp6->icmp6_type;
1293 key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
1294 break;
1295 case ICMP6_DST_UNREACH:
1296 case ICMP6_PACKET_TOO_BIG:
1297 case ICMP6_TIME_EXCEEDED:
1298 case ICMP6_PARAM_PROB: {
1299 /* ICMP packet part of another connection. We should
1300 * extract the key from the embedded packet header. */
1301 struct conn_key inner_key;
1302 const char *l3 = (const char *) icmp6 + 8;
1303 const char *tail = (const char *) data + size;
1304 const char *l4 = NULL;
1305 bool ok;
1306
1307 if (!related) {
1308 return false;
1309 }
1310
1311 memset(&inner_key, 0, sizeof inner_key);
1312 inner_key.dl_type = htons(ETH_TYPE_IPV6);
1313 ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
1314 if (!ok) {
1315 return false;
1316 }
1317
1318 /* pf doesn't do this, but it seems a good idea */
1319 if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
1320 &key->dst.addr.ipv6_aligned)
1321 || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
1322 &key->src.addr.ipv6_aligned)) {
1323 return false;
1324 }
1325
1326 key->src = inner_key.src;
1327 key->dst = inner_key.dst;
1328 key->nw_proto = inner_key.nw_proto;
1329
1330 ok = extract_l4(key, l4, tail - l4, NULL, l3);
1331 if (ok) {
1332 conn_key_reverse(key);
1333 *related = true;
1334 }
1335 return ok;
1336 }
1337 default:
1338 return false;
1339 }
1340
1341 return true;
1342 }
1343
1344 /* Extract l4 fields into 'key', which must already contain valid l3
1345 * members.
1346 *
1347 * If 'related' is not NULL and an ICMP error packet is being
1348 * processed, the function will extract the key from the packet nested
1349 * in the ICMP payload and set '*related' to true.
1350 *
1351 * If 'related' is NULL, it means that we're already parsing a header nested
1352 * in an ICMP error. In this case, we skip checksum and length validation. */
1353 static inline bool
1354 extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
1355 const void *l3)
1356 {
1357 if (key->nw_proto == IPPROTO_TCP) {
1358 return (!related || check_l4_tcp(key, data, size, l3))
1359 && extract_l4_tcp(key, data, size);
1360 } else if (key->nw_proto == IPPROTO_UDP) {
1361 return (!related || check_l4_udp(key, data, size, l3))
1362 && extract_l4_udp(key, data, size);
1363 } else if (key->dl_type == htons(ETH_TYPE_IP)
1364 && key->nw_proto == IPPROTO_ICMP) {
1365 return (!related || check_l4_icmp(data, size))
1366 && extract_l4_icmp(key, data, size, related);
1367 } else if (key->dl_type == htons(ETH_TYPE_IPV6)
1368 && key->nw_proto == IPPROTO_ICMPV6) {
1369 return (!related || check_l4_icmp6(key, data, size, l3))
1370 && extract_l4_icmp6(key, data, size, related);
1371 } else {
1372 return false;
1373 }
1374 }
1375
1376 static bool
1377 conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
1378 struct conn_lookup_ctx *ctx, uint16_t zone)
1379 {
1380 const struct eth_header *l2 = dp_packet_eth(pkt);
1381 const struct ip_header *l3 = dp_packet_l3(pkt);
1382 const char *l4 = dp_packet_l4(pkt);
1383 const char *tail = dp_packet_tail(pkt);
1384 bool ok;
1385
1386 memset(ctx, 0, sizeof *ctx);
1387
1388 if (!l2 || !l3 || !l4) {
1389 return false;
1390 }
1391
1392 ctx->key.zone = zone;
1393
1394 /* XXX In this function we parse the packet (again, it has already
1395 * gone through miniflow_extract()) for two reasons:
1396 *
1397 * 1) To extract the l3 addresses and l4 ports.
1398 * We already have the l3 and l4 headers' pointers. Extracting
1399 * the l3 addresses and the l4 ports is really cheap, since they
1400 * can be found at fixed locations.
1401 * 2) To extract the l4 type.
1402 * Extracting the l4 types, for IPv6 can be quite expensive, because
1403 * it's not at a fixed location.
1404 *
1405 * Here's a way to avoid (2) with the help of the datapath.
1406 * The datapath doesn't keep the packet's extracted flow[1], so
1407 * using that is not an option. We could use the packet's matching
1408 * megaflow, but we have to make sure that the l4 type (nw_proto)
1409 * is unwildcarded. This means either:
1410 *
1411 * a) dpif-netdev unwildcards the l4 type when a new flow is installed
1412 * if the actions contains ct().
1413 *
1414 * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
1415 * action. This is already done in different actions, but it's
1416 * unnecessary for the kernel.
1417 *
1418 * ---
1419 * [1] The reasons for this are that keeping the flow increases
1420 * (slightly) the cache footprint and increases computation
1421 * time as we move the packet around. Most importantly, the flow
1422 * should be updated by the actions and this can be slow, as
1423 * we use a sparse representation (miniflow).
1424 *
1425 */
1426 ctx->key.dl_type = dl_type;
1427 if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
1428 ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, true);
1429 } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
1430 ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
1431 } else {
1432 ok = false;
1433 }
1434
1435 if (ok) {
1436 if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
1437 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
1438 return true;
1439 }
1440 }
1441
1442 return false;
1443 }
1444 \f
1445 /* Symmetric */
1446 static uint32_t
1447 conn_key_hash(const struct conn_key *key, uint32_t basis)
1448 {
1449 uint32_t hsrc, hdst, hash;
1450 int i;
1451
1452 hsrc = hdst = basis;
1453
1454 for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
1455 hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
1456 hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
1457 }
1458
1459 /* Even if source and destination are swapped the hash will be the same. */
1460 hash = hsrc ^ hdst;
1461
1462 /* Hash the rest of the key (L3 and L4 types and zone). */
1463 hash = hash_words((uint32_t *) (&key->dst + 1),
1464 (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
1465 hash);
1466
1467 return hash;
1468 }
1469
1470 static void
1471 conn_key_reverse(struct conn_key *key)
1472 {
1473 struct ct_endpoint tmp;
1474
1475 tmp = key->src;
1476 key->src = key->dst;
1477 key->dst = tmp;
1478 }
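/* Sketch of the symmetry relied upon by conn_key_lookup() (illustrative, not
 * compiled): because conn_key_hash() combines the two endpoint hashes with
 * xor, reversing a key leaves its hash unchanged, so one lookup can find a
 * connection from either direction:
 *
 *     struct conn_key rk = k;
 *     conn_key_reverse(&rk);
 *     ovs_assert(conn_key_hash(&k, basis) == conn_key_hash(&rk, basis));
 */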
1479
1480 static uint32_t
1481 nat_ipv6_addrs_delta(struct in6_addr *ipv6_aligned_min,
1482 struct in6_addr *ipv6_aligned_max)
1483 {
1484 uint8_t *ipv6_min_hi = &ipv6_aligned_min->s6_addr[0];
1485 uint8_t *ipv6_min_lo = &ipv6_aligned_min->s6_addr[0] + sizeof(uint64_t);
1486 uint8_t *ipv6_max_hi = &ipv6_aligned_max->s6_addr[0];
1487 uint8_t *ipv6_max_lo = &ipv6_aligned_max->s6_addr[0] + sizeof(uint64_t);
1488
1489 ovs_be64 addr6_64_min_hi;
1490 ovs_be64 addr6_64_min_lo;
1491 memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
1492 memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);
1493
1494 ovs_be64 addr6_64_max_hi;
1495 ovs_be64 addr6_64_max_lo;
1496 memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
1497 memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);
1498
1499 uint64_t diff;
1500 if (addr6_64_min_hi == addr6_64_max_hi &&
1501 ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
1502 diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
1503 } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
1504 ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
1505 diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
1506 ntohll(addr6_64_max_lo) - 1);
1507 } else {
1508 /* Limit the supported address delta to 32 bits (approximately 4 billion).
1509 * Possibly this should be visible to the user through a datapath
1510 * support check; however, the practical impact is probably nil. */
1511 diff = 0xfffffffe;
1512 }
1513 if (diff > 0xfffffffe) {
1514 diff = 0xfffffffe;
1515 }
1516 return diff;
1517 }
1518
1519 /* This function must be used in tandem with nat_ipv6_addrs_delta(), which
1520 * restricts the input parameters. */
1521 static void
1522 nat_ipv6_addr_increment(struct in6_addr *ipv6_aligned, uint32_t increment)
1523 {
1524 uint8_t *ipv6_hi = &ipv6_aligned->s6_addr[0];
1525 uint8_t *ipv6_lo = &ipv6_aligned->s6_addr[0] + sizeof(ovs_be64);
1526 ovs_be64 addr6_64_hi;
1527 ovs_be64 addr6_64_lo;
1528 memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
1529 memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);
1530
1531 if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
1532 addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
1533 } else if (addr6_64_hi != OVS_BE64_MAX) {
1534 addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
1535 addr6_64_lo = htonll(increment - (UINT64_MAX -
1536 ntohll(addr6_64_lo) + 1));
1537 } else {
1538 OVS_NOT_REACHED();
1539 }
1540
1541 memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
1542 memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
1543
1544 return;
1545 }
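/* Worked example for the two helpers above (addresses are illustrative):
 * with min_addr 2001:db8::1 and max_addr 2001:db8::ff,
 * nat_ipv6_addrs_delta() returns 0xfe, and nat_ipv6_addr_increment() applied
 * to min_addr with that delta yields max_addr again; deltas wider than
 * 32 bits are clamped to 0xfffffffe. */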
1546
1547 static uint32_t
1548 nat_range_hash(const struct conn *conn, uint32_t basis)
1549 {
1550 uint32_t hash = basis;
1551 int i;
1552 uint16_t port;
1553
1554 for (i = 0;
1555 i < sizeof(conn->nat_info->min_addr) / sizeof(uint32_t);
1556 i++) {
1557 hash = hash_add(hash, ((uint32_t *) &conn->nat_info->min_addr)[i]);
1558 hash = hash_add(hash, ((uint32_t *) &conn->nat_info->max_addr)[i]);
1559 }
1560
1561 memcpy(&port, &conn->nat_info->min_port, sizeof port);
1562 hash = hash_add(hash, port);
1563
1564 for (i = 0; i < sizeof(conn->key.src.addr) / sizeof(uint32_t); i++) {
1565 hash = hash_add(hash, ((uint32_t *) &conn->key.src)[i]);
1566 hash = hash_add(hash, ((uint32_t *) &conn->key.dst)[i]);
1567 }
1568
1569 memcpy(&port, &conn->key.src.port, sizeof port);
1570 hash = hash_add(hash, port);
1571 memcpy(&port, &conn->key.dst.port, sizeof port);
1572 hash = hash_add(hash, port);
1573
1574 hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
1575 hash = hash_add(hash, conn->key.nw_proto);
1576 hash = hash_add(hash, conn->key.zone);
1577 return hash;
1578 }
1579
1580 static bool
1581 nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
1582 struct conn *nat_conn)
1583 {
1584 #define MIN_NAT_EPHEMERAL_PORT 1024
1585 #define MAX_NAT_EPHEMERAL_PORT 65535
1586
1587 uint16_t min_port;
1588 uint16_t max_port;
1589 uint16_t first_port;
1590
1591 uint32_t hash = nat_range_hash(conn, ct->hash_basis);
1592
1593 if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
1594 (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
1595 min_port = ntohs(conn->key.src.port);
1596 max_port = ntohs(conn->key.src.port);
1597 first_port = min_port;
1598 } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
1599 (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
1600 min_port = ntohs(conn->key.dst.port);
1601 max_port = ntohs(conn->key.dst.port);
1602 first_port = min_port;
1603 } else {
1604 uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
1605 uint32_t port_index = hash % (deltap + 1);
1606 first_port = conn->nat_info->min_port + port_index;
1607 min_port = conn->nat_info->min_port;
1608 max_port = conn->nat_info->max_port;
1609 }
1610
1611 uint32_t deltaa = 0;
1612 uint32_t address_index;
1613 struct ct_addr ct_addr;
1614 memset(&ct_addr, 0, sizeof ct_addr);
1615 struct ct_addr max_ct_addr;
1616 memset(&max_ct_addr, 0, sizeof max_ct_addr);
1617 max_ct_addr = conn->nat_info->max_addr;
1618
1619 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
1620 deltaa = ntohl(conn->nat_info->max_addr.ipv4_aligned) -
1621 ntohl(conn->nat_info->min_addr.ipv4_aligned);
1622 address_index = hash % (deltaa + 1);
1623 ct_addr.ipv4_aligned = htonl(
1624 ntohl(conn->nat_info->min_addr.ipv4_aligned) + address_index);
1625 } else {
1626 deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6_aligned,
1627 &conn->nat_info->max_addr.ipv6_aligned);
1628 /* deltaa must be within 32 bits for full hash coverage. A 64 or
1629 * 128 bit hash is unnecessary and hence not used here. Most code
1630 * is kept common with V4; nat_ipv6_addrs_delta() will do the
1631 * enforcement via max_ct_addr. */
1632 max_ct_addr = conn->nat_info->min_addr;
1633 nat_ipv6_addr_increment(&max_ct_addr.ipv6_aligned, deltaa);
1634
1635 address_index = hash % (deltaa + 1);
1636 ct_addr.ipv6_aligned = conn->nat_info->min_addr.ipv6_aligned;
1637 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, address_index);
1638 }
1639
1640 uint16_t port = first_port;
1641 bool all_ports_tried = false;
1642 bool original_ports_tried = false;
1643 struct ct_addr first_addr = ct_addr;
1644 *nat_conn = *conn;
1645
1646 while (true) {
1647 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
1648 nat_conn->rev_key.dst.addr = ct_addr;
1649 } else {
1650 nat_conn->rev_key.src.addr = ct_addr;
1651 }
1652
1653 if ((conn->key.nw_proto == IPPROTO_ICMP) ||
1654 (conn->key.nw_proto == IPPROTO_ICMPV6)) {
1655 all_ports_tried = true;
1656 } else if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
1657 nat_conn->rev_key.dst.port = htons(port);
1658 } else {
1659 nat_conn->rev_key.src.port = htons(port);
1660 }
1661
1662 struct nat_conn_key_node *nat_conn_key_node =
1663 nat_conn_keys_lookup(&ct->nat_conn_keys, &nat_conn->rev_key,
1664 ct->hash_basis);
1665
1666 if (!nat_conn_key_node) {
1667 struct nat_conn_key_node *nat_conn_key =
1668 xzalloc(sizeof *nat_conn_key);
1669 nat_conn_key->key = nat_conn->rev_key;
1670 nat_conn_key->value = nat_conn->key;
1671 uint32_t nat_conn_key_hash = conn_key_hash(&nat_conn_key->key,
1672 ct->hash_basis);
1673 hmap_insert(&ct->nat_conn_keys, &nat_conn_key->node,
1674 nat_conn_key_hash);
1675 return true;
1676 } else if (!all_ports_tried) {
1677 if (min_port == max_port) {
1678 all_ports_tried = true;
1679 } else if (port == max_port) {
1680 port = min_port;
1681 } else {
1682 port++;
1683 }
1684 if (port == first_port) {
1685 all_ports_tried = true;
1686 }
1687 } else {
1688 if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
1689 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
1690 ct_addr.ipv4_aligned = htonl(
1691 ntohl(ct_addr.ipv4_aligned) + 1);
1692 } else {
1693 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, 1);
1694 }
1695 } else {
1696 ct_addr = conn->nat_info->min_addr;
1697 }
1698 if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
1699 if (!original_ports_tried) {
1700 original_ports_tried = true;
1701 ct_addr = conn->nat_info->min_addr;
1702 min_port = MIN_NAT_EPHEMERAL_PORT;
1703 max_port = MAX_NAT_EPHEMERAL_PORT;
1704 } else {
1705 break;
1706 }
1707 }
1708 first_port = min_port;
1709 port = first_port;
1710 all_ports_tried = false;
1711 }
1712 }
1713 return false;
1714 }
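/* Search-order example (values are illustrative): for an SNAT range of
 * 10.1.1.1-10.1.1.2 with ports 500-501, the hash above picks the starting
 * address/port candidate; on a collision in 'nat_conn_keys' the remaining
 * ports for that address are tried, then the next address with the full
 * port range, and only after every configured pair has collided is the
 * search restarted once over the addresses with the ephemeral port range
 * 1024-65535 before the function gives up. */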
1715
1716 static struct nat_conn_key_node *
1717 nat_conn_keys_lookup(struct hmap *nat_conn_keys,
1718 const struct conn_key *key,
1719 uint32_t basis)
1720 {
1721 struct nat_conn_key_node *nat_conn_key_node;
1722 uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
1723
1724 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash,
1725 nat_conn_keys) {
1726 if (!memcmp(&nat_conn_key_node->key, key,
1727 sizeof nat_conn_key_node->key)) {
1728 return nat_conn_key_node;
1729 }
1730 }
1731 return NULL;
1732 }
1733
1734 static void
1735 nat_conn_keys_remove(struct hmap *nat_conn_keys, const struct conn_key *key,
1736 uint32_t basis)
1737 {
1738 struct nat_conn_key_node *nat_conn_key_node;
1739 uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
1740
1741 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash,
1742 nat_conn_keys) {
1743 if (!memcmp(&nat_conn_key_node->key, key,
1744 sizeof nat_conn_key_node->key)) {
1745 hmap_remove(nat_conn_keys, &nat_conn_key_node->node);
1746 free(nat_conn_key_node);
1747 return;
1748 }
1749 }
1750 }
1751
1752 static void
1753 conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
1754 long long now)
1755 {
1756 uint32_t hash = ctx->hash;
1757 struct conn *conn;
1758
1759 ctx->conn = NULL;
1760
1761 HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
1762 if (!memcmp(&conn->key, &ctx->key, sizeof conn->key)
1763 && !conn_expired(conn, now)) {
1764 ctx->conn = conn;
1765 ctx->reply = false;
1766 break;
1767 }
1768 if (!memcmp(&conn->rev_key, &ctx->key, sizeof conn->rev_key)
1769 && !conn_expired(conn, now)) {
1770 ctx->conn = conn;
1771 ctx->reply = true;
1772 break;
1773 }
1774 }
1775 }
1776
1777 static enum ct_update_res
1778 conn_update(struct conn *conn, struct conntrack_bucket *ctb,
1779 struct dp_packet *pkt, bool reply, long long now)
1780 {
1781 return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
1782 reply, now);
1783 }
1784
1785 static bool
1786 conn_expired(struct conn *conn, long long now)
1787 {
1788 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
1789 return now >= conn->expiration;
1790 }
1791 return false;
1792 }
1793
1794 static bool
1795 valid_new(struct dp_packet *pkt, struct conn_key *key)
1796 {
1797 return l4_protos[key->nw_proto]->valid_new(pkt);
1798 }
1799
1800 static struct conn *
1801 new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
1802 struct conn_key *key, long long now)
1803 {
1804 struct conn *newconn;
1805
1806 newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
1807
1808 if (newconn) {
1809 newconn->key = *key;
1810 }
1811
1812 return newconn;
1813 }
1814
1815 static void
1816 delete_conn(struct conn *conn)
1817 {
1818 free(conn->nat_info);
1819 free(conn);
1820 }
1821 \f
1822 static void
1823 ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
1824 union ct_dpif_inet_addr *b,
1825 ovs_be16 dl_type)
1826 {
1827 if (dl_type == htons(ETH_TYPE_IP)) {
1828 b->ip = a->ipv4_aligned;
1829 } else if (dl_type == htons(ETH_TYPE_IPV6)){
1830 b->in6 = a->ipv6_aligned;
1831 }
1832 }
1833
1834 static void
1835 conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
1836 {
1837 if (key->dl_type == htons(ETH_TYPE_IP)) {
1838 tuple->l3_type = AF_INET;
1839 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
1840 tuple->l3_type = AF_INET6;
1841 }
1842 tuple->ip_proto = key->nw_proto;
1843 ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
1844 key->dl_type);
1845 ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
1846 key->dl_type);
1847
1848 if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
1849 tuple->icmp_id = key->src.icmp_id;
1850 tuple->icmp_type = key->src.icmp_type;
1851 tuple->icmp_code = key->src.icmp_code;
1852 } else {
1853 tuple->src_port = key->src.port;
1854 tuple->dst_port = key->dst.port;
1855 }
1856 }
1857
1858 static void
1859 conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
1860 long long now)
1861 {
1862 struct ct_l4_proto *class;
1863 long long expiration;
1864 memset(entry, 0, sizeof *entry);
1865 conn_key_to_tuple(&conn->key, &entry->tuple_orig);
1866 conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
1867
1868 entry->zone = conn->key.zone;
1869 entry->mark = conn->mark;
1870
1871 memcpy(&entry->labels, &conn->label, sizeof entry->labels);
1872 /* Not implemented yet */
1873 entry->timestamp.start = 0;
1874 entry->timestamp.stop = 0;
1875
1876 expiration = conn->expiration - now;
1877 entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
1878
1879 class = l4_protos[conn->key.nw_proto];
1880 if (class->conn_get_protoinfo) {
1881 class->conn_get_protoinfo(conn, &entry->protoinfo);
1882 }
1883 }
1884
1885 int
1886 conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
1887 const uint16_t *pzone)
1888 {
1889 memset(dump, 0, sizeof(*dump));
1890 if (pzone) {
1891 dump->zone = *pzone;
1892 dump->filter_zone = true;
1893 }
1894 dump->ct = ct;
1895
1896 return 0;
1897 }
1898
1899 int
1900 conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
1901 {
1902 struct conntrack *ct = dump->ct;
1903 long long now = time_msec();
1904
1905 while (dump->bucket < CONNTRACK_BUCKETS) {
1906 struct hmap_node *node;
1907
1908 ct_lock_lock(&ct->buckets[dump->bucket].lock);
1909 for (;;) {
1910 struct conn *conn;
1911
1912 node = hmap_at_position(&ct->buckets[dump->bucket].connections,
1913 &dump->bucket_pos);
1914 if (!node) {
1915 break;
1916 }
1917 INIT_CONTAINER(conn, node, node);
1918 if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
1919 (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
1920 conn_to_ct_dpif_entry(conn, entry, now);
1921 break;
1922 }
1923 /* Else continue, until we find an entry in the appropriate zone
1924 * or the bucket has been scanned completely. */
1925 }
1926 ct_lock_unlock(&ct->buckets[dump->bucket].lock);
1927
1928 if (!node) {
1929 memset(&dump->bucket_pos, 0, sizeof dump->bucket_pos);
1930 dump->bucket++;
1931 } else {
1932 return 0;
1933 }
1934 }
1935 return EOF;
1936 }
1937
1938 int
1939 conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
1940 {
1941 return 0;
1942 }
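/* Usage sketch for the dump API (illustrative; error handling omitted and a
 * NULL zone filter assumed):
 *
 *     struct conntrack_dump dump;
 *     struct ct_dpif_entry entry;
 *
 *     conntrack_dump_start(ct, &dump, NULL);
 *     while (!conntrack_dump_next(&dump, &entry)) {
 *         ...consume 'entry'...
 *     }
 *     conntrack_dump_done(&dump);
 */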
1943
1944 int
1945 conntrack_flush(struct conntrack *ct, const uint16_t *zone)
1946 {
1947 unsigned i;
1948
1949 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
1950 struct conn *conn, *next;
1951
1952 ct_lock_lock(&ct->buckets[i].lock);
1953 HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) {
1954 if ((!zone || *zone == conn->key.zone) &&
1955 (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
1956 conn_clean(ct, conn, &ct->buckets[i]);
1957 }
1958 }
1959 ct_lock_unlock(&ct->buckets[i].lock);
1960 }
1961 return 0;
1962 }