/*
 * Copyright (c) 2015, 2016 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17 #include <config.h>
18 #include "conntrack.h"
19
20 #include <errno.h>
21 #include <sys/types.h>
22 #include <netinet/in.h>
23 #include <netinet/icmp6.h>
24
25 #include "bitmap.h"
26 #include "conntrack-private.h"
27 #include "coverage.h"
28 #include "csum.h"
29 #include "ct-dpif.h"
30 #include "dp-packet.h"
31 #include "flow.h"
32 #include "netdev.h"
33 #include "odp-netlink.h"
34 #include "openvswitch/hmap.h"
35 #include "openvswitch/vlog.h"
36 #include "ovs-rcu.h"
37 #include "ovs-thread.h"
38 #include "poll-loop.h"
39 #include "random.h"
40 #include "timeval.h"
41
42 VLOG_DEFINE_THIS_MODULE(conntrack);
43
44 COVERAGE_DEFINE(conntrack_full);
45 COVERAGE_DEFINE(conntrack_long_cleanup);
46
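/* Per-packet lookup context.  'key' is the connection key extracted from
 * the packet and 'hash' its precomputed hash.  After lookup, 'conn' points
 * to the matching connection (if any), 'reply' is set if the packet matched
 * in the reply direction, and 'related' is set for ICMP errors that refer
 * to an existing connection. */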
struct conn_lookup_ctx {
    struct conn_key key;
    struct conn *conn;
    uint32_t hash;
    bool reply;
    bool related;
};

static bool conn_key_extract(struct conntrack *, struct dp_packet *,
                             ovs_be16 dl_type, struct conn_lookup_ctx *,
                             uint16_t zone);
static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
static void conn_key_reverse(struct conn_key *);
static void conn_key_lookup(struct conntrack_bucket *ctb,
                            struct conn_lookup_ctx *ctx,
                            long long now);
static bool valid_new(struct dp_packet *pkt, struct conn_key *);
static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
                             struct conn_key *, long long now);
static void delete_conn(struct conn *);
static enum ct_update_res conn_update(struct conn *,
                                      struct conntrack_bucket *ctb,
                                      struct dp_packet *, bool reply,
                                      long long now);
static bool conn_expired(struct conn *, long long now);
static void set_mark(struct dp_packet *, struct conn *,
                     uint32_t val, uint32_t mask);
static void set_label(struct dp_packet *, struct conn *,
                      const struct ovs_key_ct_labels *val,
                      const struct ovs_key_ct_labels *mask);
static void *clean_thread_main(void *f_);

static struct ct_l4_proto *l4_protos[] = {
    [IPPROTO_TCP] = &ct_proto_tcp,
    [IPPROTO_UDP] = &ct_proto_other,
    [IPPROTO_ICMP] = &ct_proto_icmp4,
    [IPPROTO_ICMPV6] = &ct_proto_icmp6,
};

long long ct_timeout_val[] = {
#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
    CT_TIMEOUTS
#undef CT_TIMEOUT
};

/* If the total number of connections goes above this value, no new
 * connections are accepted. */
#define DEFAULT_N_CONN_LIMIT 3000000

/* Initializes the connection tracker 'ct'.  The caller is responsible for
 * calling 'conntrack_destroy()' when the instance is no longer needed. */
void
conntrack_init(struct conntrack *ct)
{
    unsigned i, j;
    long long now = time_msec();

    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];

        ct_lock_init(&ctb->lock);
        ct_lock_lock(&ctb->lock);
        hmap_init(&ctb->connections);
        for (j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
            ovs_list_init(&ctb->exp_lists[j]);
        }
        ct_lock_unlock(&ctb->lock);
        ovs_mutex_init(&ctb->cleanup_mutex);
        ovs_mutex_lock(&ctb->cleanup_mutex);
        ctb->next_cleanup = now + CT_TM_MIN;
        ovs_mutex_unlock(&ctb->cleanup_mutex);
    }
    ct->hash_basis = random_uint32();
    atomic_count_init(&ct->n_conn, 0);
    atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
    latch_init(&ct->clean_thread_exit);
    ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
}

/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
void
conntrack_destroy(struct conntrack *ct)
{
    unsigned i;

    latch_set(&ct->clean_thread_exit);
    pthread_join(ct->clean_thread, NULL);
    latch_destroy(&ct->clean_thread_exit);
    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];
        struct conn *conn;

        ovs_mutex_destroy(&ctb->cleanup_mutex);
        ct_lock_lock(&ctb->lock);
        HMAP_FOR_EACH_POP (conn, node, &ctb->connections) {
            atomic_count_dec(&ct->n_conn);
            delete_conn(conn);
        }
        hmap_destroy(&ctb->connections);
        ct_lock_unlock(&ctb->lock);
        ct_lock_destroy(&ctb->lock);
    }
}
\f
static unsigned hash_to_bucket(uint32_t hash)
{
    /* Extracts the most significant bits in hash.  The least significant bits
     * are already used internally by the hmap implementation. */
    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);

    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
}

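/* Stores the connection tracking state in the packet metadata.  CS_TRACKED
 * is always set, so the rest of the pipeline can tell that the packet has
 * been through the connection tracker. */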
static void
write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
            uint32_t mark, ovs_u128 label)
{
    pkt->md.ct_state = state | CS_TRACKED;
    pkt->md.ct_zone = zone;
    pkt->md.ct_mark = mark;
    pkt->md.ct_label = label;
}

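/* Handles a packet that did not match any existing connection.  If the
 * packet is a valid beginning of a new connection, sets CS_NEW in '*state'
 * and, if 'commit' is true and the global connection limit has not been
 * reached, inserts a new connection in the bucket.  Returns the new
 * connection, or NULL. */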
static struct conn *
conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
               struct conn_lookup_ctx *ctx, uint16_t *state, bool commit,
               long long now)
{
    unsigned bucket = hash_to_bucket(ctx->hash);
    struct conn *nc = NULL;

    if (!valid_new(pkt, &ctx->key)) {
        *state |= CS_INVALID;
        return nc;
    }

    *state |= CS_NEW;

    if (commit) {
        unsigned int n_conn_limit;

        atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);

        if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
            COVERAGE_INC(conntrack_full);
            return nc;
        }

        nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);

        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);

        conn_key_reverse(&nc->rev_key);
        hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
        atomic_count_inc(&ct->n_conn);
    }

    return nc;
}

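/* Processes a single packet against its looked-up connection (in 'ctx'):
 * updates the connection state, creates a new connection if appropriate,
 * and writes the resulting connection tracking metadata into the packet. */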
static struct conn *
process_one(struct conntrack *ct, struct dp_packet *pkt,
            struct conn_lookup_ctx *ctx, uint16_t zone,
            bool commit, long long now)
{
    unsigned bucket = hash_to_bucket(ctx->hash);
    struct conn *conn = ctx->conn;
    uint16_t state = 0;

    if (conn) {
        if (ctx->related) {
            state |= CS_RELATED;
            if (ctx->reply) {
                state |= CS_REPLY_DIR;
            }
        } else {
            enum ct_update_res res;

            res = conn_update(conn, &ct->buckets[bucket], pkt,
                              ctx->reply, now);

            switch (res) {
            case CT_UPDATE_VALID:
                state |= CS_ESTABLISHED;
                if (ctx->reply) {
                    state |= CS_REPLY_DIR;
                }
                break;
            case CT_UPDATE_INVALID:
                state |= CS_INVALID;
                break;
            case CT_UPDATE_NEW:
                ovs_list_remove(&conn->exp_node);
                hmap_remove(&ct->buckets[bucket].connections, &conn->node);
                atomic_count_dec(&ct->n_conn);
                delete_conn(conn);
                conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
                break;
            default:
                OVS_NOT_REACHED();
            }
        }
    } else {
        conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
    }

    write_ct_md(pkt, state, zone, conn ? conn->mark : 0,
                conn ? conn->label : OVS_U128_ZERO);

    return conn;
}

/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
 * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
 * the l3 and l4 offsets properly set.
 *
 * If 'commit' is true, the packets are allowed to create new entries in the
 * connection tables.  'setmark', if not NULL, should point to a two-element
 * array containing a value and a mask to set the connection mark.
 * 'setlabel' behaves similarly for the connection label. */
int
conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
                  ovs_be16 dl_type, bool commit, uint16_t zone,
                  const uint32_t *setmark,
                  const struct ovs_key_ct_labels *setlabel,
                  const char *helper)
{
    struct dp_packet **pkts = pkt_batch->packets;
    size_t cnt = pkt_batch->count;
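    /* Sparse (__CHECKER__) and MSVC (_WIN32) builds do not accept a C99
     * variable length array here, so they fall back to the maximum batch
     * size. */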
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t KEY_ARRAY_SIZE = cnt;
#else
    enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];
    int8_t bucket_list[CONNTRACK_BUCKETS];
    struct {
        unsigned bucket;
        unsigned long maps;
    } arr[KEY_ARRAY_SIZE];
    long long now = time_msec();
    size_t i = 0;
    uint8_t arrcnt = 0;

    BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST);

    if (helper) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);

        VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
        /* Continue without the helper */
    }

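    /* First pass: extract a connection key from each packet and group the
     * packets by bucket, recording each group as a bitmap of batch indexes
     * ('maps'), so that every bucket lock is taken at most once per batch. */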
    memset(bucket_list, INT8_C(-1), sizeof bucket_list);
    for (i = 0; i < cnt; i++) {
        unsigned bucket;

        if (!conn_key_extract(ct, pkts[i], dl_type, &ctxs[i], zone)) {
            write_ct_md(pkts[i], CS_INVALID, zone, 0, OVS_U128_ZERO);
            continue;
        }

        bucket = hash_to_bucket(ctxs[i].hash);
        if (bucket_list[bucket] == INT8_C(-1)) {
            bucket_list[bucket] = arrcnt;

            arr[arrcnt].maps = 0;
            ULLONG_SET1(arr[arrcnt].maps, i);
            arr[arrcnt++].bucket = bucket;
        } else {
            ULLONG_SET1(arr[bucket_list[bucket]].maps, i);
        }
    }

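    /* Second pass: take each touched bucket's lock once and process all of
     * the packets that hash to it. */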
    for (i = 0; i < arrcnt; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket];
        size_t j;

        ct_lock_lock(&ctb->lock);

        ULLONG_FOR_EACH_1(j, arr[i].maps) {
            struct conn *conn;

            conn_key_lookup(ctb, &ctxs[j], now);

            conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);

            if (conn && setmark) {
                set_mark(pkts[j], conn, setmark[0], setmark[1]);
            }

            if (conn && setlabel) {
                set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
            }
        }
        ct_lock_unlock(&ctb->lock);
    }

    return 0;
}

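/* Applies 'val' under 'mask' to the connection mark, updating both the
 * packet metadata and the connection, so that further packets on the same
 * connection observe the new mark. */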
static void
set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
{
    pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
    conn->mark = pkt->md.ct_mark;
}

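/* Same as set_mark(), but for the 128-bit connection label. */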
static void
set_label(struct dp_packet *pkt, struct conn *conn,
          const struct ovs_key_ct_labels *val,
          const struct ovs_key_ct_labels *mask)
{
    ovs_u128 v, m;

    memcpy(&v, val, sizeof v);
    memcpy(&m, mask, sizeof m);

    pkt->md.ct_label.u64.lo = v.u64.lo
                              | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
    pkt->md.ct_label.u64.hi = v.u64.hi
                              | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
    conn->label = pkt->md.ct_label;
}
\f
/* Deletes the expired connections from 'ctb', up to 'limit'.  Returns the
 * earliest expiration time among the remaining connections in 'ctb'.
 * Returns LLONG_MAX if 'ctb' is empty.  The return value might be smaller
 * than 'now' if 'limit' is reached. */
static long long
sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
             size_t limit)
    OVS_REQUIRES(ctb->lock)
{
    struct conn *conn, *next;
    long long min_expiration = LLONG_MAX;
    unsigned i;
    size_t count = 0;

    for (i = 0; i < N_CT_TM; i++) {
        LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
            if (!conn_expired(conn, now) || count >= limit) {
                min_expiration = MIN(min_expiration, conn->expiration);
                if (count >= limit) {
                    /* Do not check other lists. */
                    COVERAGE_INC(conntrack_long_cleanup);
                    return min_expiration;
                }
                break;
            }
            ovs_list_remove(&conn->exp_node);
            hmap_remove(&ctb->connections, &conn->node);
            atomic_count_dec(&ct->n_conn);
            delete_conn(conn);
            count++;
        }
    }

    return min_expiration;
}

/* Cleans up old connection entries from 'ct'.  Returns the time when the
 * next expiration might happen.  The return value might be smaller than
 * 'now', meaning that an internal limit has been reached, and some expired
 * connections have not been deleted. */
static long long
conntrack_clean(struct conntrack *ct, long long now)
{
    long long next_wakeup = now + CT_TM_MIN;
    unsigned int n_conn_limit;
    size_t clean_count = 0;
    unsigned i;

    atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);

    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];
        size_t prev_count;
        long long min_exp;

        ovs_mutex_lock(&ctb->cleanup_mutex);
        if (ctb->next_cleanup > now) {
            goto next_bucket;
        }

        ct_lock_lock(&ctb->lock);
        prev_count = hmap_count(&ctb->connections);
        /* If the connections are well distributed among buckets, we want to
         * limit to 10% of the global limit equally split among buckets.  If
         * the bucket is busier than the others, we limit to 10% of its
         * current size. */
        min_exp = sweep_bucket(ct, ctb, now,
                MAX(prev_count / 10, n_conn_limit / (CONNTRACK_BUCKETS * 10)));
        clean_count += prev_count - hmap_count(&ctb->connections);

        if (min_exp > now) {
            /* We call hmap_shrink() only if sweep_bucket() managed to delete
             * every expired connection. */
            hmap_shrink(&ctb->connections);
        }

        ct_lock_unlock(&ctb->lock);

        ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);

    next_bucket:
        next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
        ovs_mutex_unlock(&ctb->cleanup_mutex);
    }

    VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
             clean_count, time_msec() - now);

    return next_wakeup;
}

/* Cleanup:
 *
 * We must call conntrack_clean() periodically.  conntrack_clean()'s return
 * value gives a hint about when the next cleanup must be done (either
 * because there is an actual connection that expires, or because a new
 * connection might be created with the minimum timeout).
 *
 * The logic below has two goals:
 *
 * - We want to reduce the number of wakeups and batch connection cleanup
 *   when the load is not very high.  CT_CLEAN_INTERVAL ensures that if we
 *   are coping with the current cleanup tasks, then we wait at least
 *   5 seconds to do further cleanup.
 *
 * - We don't want to keep the buckets locked too long, as we might prevent
 *   traffic from flowing.  CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
 *   behind, there are at least some 200 ms blocks of time when buckets will
 *   be left alone, so the datapath can operate unhindered.
 */
#define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
#define CT_CLEAN_MIN_INTERVAL 200  /* 0.2 seconds */

static void *
clean_thread_main(void *f_)
{
    struct conntrack *ct = f_;

    while (!latch_is_set(&ct->clean_thread_exit)) {
        long long next_wake;
        long long now = time_msec();

        next_wake = conntrack_clean(ct, now);

        if (next_wake < now) {
            poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
        } else {
            poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
        }
        latch_wait(&ct->clean_thread_exit);
        poll_block();
    }

    return NULL;
}
\f
/* Key extraction */

/* The function stores a pointer to the first byte after the header in
 * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 * not interested in the header's tail, meaning that the header has
 * already been parsed (e.g. by flow_extract): we take this as a hint to
 * save a few checks.  If 'validate_checksum' is true, the function returns
 * false if the IPv4 checksum is invalid. */
static inline bool
extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
                const char **new_data, bool validate_checksum)
{
    const struct ip_header *ip = data;
    size_t ip_len;

    if (new_data) {
        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
            return false;
        }
    }

    ip_len = IP_IHL(ip->ip_ihl_ver) * 4;

    if (new_data) {
        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
            return false;
        }
        if (OVS_UNLIKELY(size < ip_len)) {
            return false;
        }

        *new_data = (char *) data + ip_len;
    }

    if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
        return false;
    }

    if (validate_checksum && csum(data, ip_len) != 0) {
        return false;
    }

    key->src.addr.ipv4 = ip->ip_src;
    key->dst.addr.ipv4 = ip->ip_dst;
    key->nw_proto = ip->ip_proto;

    return true;
}

/* The function stores a pointer to the first byte after the header in
 * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 * not interested in the header's tail, meaning that the header has
 * already been parsed (e.g. by flow_extract): we take this as a hint to
 * save a few checks. */
static inline bool
extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
                const char **new_data)
{
    const struct ovs_16aligned_ip6_hdr *ip6 = data;
    uint8_t nw_proto = ip6->ip6_nxt;
    uint8_t nw_frag = 0;

    if (new_data) {
        if (OVS_UNLIKELY(size < sizeof *ip6)) {
            return false;
        }
    }

    data = ip6 + 1;
    size -= sizeof *ip6;

    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
        return false;
    }

    if (new_data) {
        *new_data = data;
    }

    if (nw_frag) {
        return false;
    }

    key->src.addr.ipv6 = ip6->ip6_src;
    key->dst.addr.ipv6 = ip6->ip6_dst;
    key->nw_proto = nw_proto;

    return true;
}

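/* Validates the l4 checksum over 'data' (of 'size' bytes), folding in the
 * pseudoheader checksum computed from the l3 header. */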
static inline bool
checksum_valid(const struct conn_key *key, const void *data, size_t size,
               const void *l3)
{
    uint32_t csum = 0;

    if (key->dl_type == htons(ETH_TYPE_IP)) {
        csum = packet_csum_pseudoheader(l3);
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
        csum = packet_csum_pseudoheader6(l3);
    } else {
        return false;
    }

    csum = csum_continue(csum, data, size);

    return csum_finish(csum) == 0;
}

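/* Checks that the TCP data offset is sane and that the segment's checksum
 * is valid. */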
static inline bool
check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
             const void *l3)
{
    const struct tcp_header *tcp = data;
    size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;

    if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
        return false;
    }

    return checksum_valid(key, data, size, l3);
}

static inline bool
check_l4_udp(const struct conn_key *key, const void *data, size_t size,
             const void *l3)
{
    const struct udp_header *udp = data;
    size_t udp_len = ntohs(udp->udp_len);

    if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
        return false;
    }

    /* Validation must be skipped if the checksum is 0 on IPv4 packets. */
    return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
           || checksum_valid(key, data, size, l3);
}

static inline bool
check_l4_icmp(const void *data, size_t size)
{
    return csum(data, size) == 0;
}

static inline bool
check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
               const void *l3)
{
    return checksum_valid(key, data, size, l3);
}

static inline bool
extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
{
    const struct tcp_header *tcp = data;

    if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
        return false;
    }

    key->src.port = tcp->tcp_src;
    key->dst.port = tcp->tcp_dst;

    /* Port 0 is invalid */
    return key->src.port && key->dst.port;
}

static inline bool
extract_l4_udp(struct conn_key *key, const void *data, size_t size)
{
    const struct udp_header *udp = data;

    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
        return false;
    }

    key->src.port = udp->udp_src;
    key->dst.port = udp->udp_dst;

    /* Port 0 is invalid */
    return key->src.port && key->dst.port;
}

static inline bool extract_l4(struct conn_key *key, const void *data,
                              size_t size, bool *related, const void *l3);

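/* Returns the ICMP type of the reply that matches a query of type 'type'.
 * Aborts (OVS_NOT_REACHED) on any type not handled below. */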
static uint8_t
reverse_icmp_type(uint8_t type)
{
    switch (type) {
    case ICMP4_ECHO_REQUEST:
        return ICMP4_ECHO_REPLY;
    case ICMP4_ECHO_REPLY:
        return ICMP4_ECHO_REQUEST;

    case ICMP4_TIMESTAMP:
        return ICMP4_TIMESTAMPREPLY;
    case ICMP4_TIMESTAMPREPLY:
        return ICMP4_TIMESTAMP;

    case ICMP4_INFOREQUEST:
        return ICMP4_INFOREPLY;
    case ICMP4_INFOREPLY:
        return ICMP4_INFOREQUEST;
    default:
        OVS_NOT_REACHED();
    }
}

/* If 'related' is not NULL and the function is processing an ICMP
 * error packet, extract the l3 and l4 fields from the nested header
 * instead and set *related to true.  If 'related' is NULL we're
 * already processing a nested header and no such recursion is
 * possible. */
static inline bool
extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
                bool *related)
{
    const struct icmp_header *icmp = data;

    if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
        return false;
    }

    switch (icmp->icmp_type) {
    case ICMP4_ECHO_REQUEST:
    case ICMP4_ECHO_REPLY:
    case ICMP4_TIMESTAMP:
    case ICMP4_TIMESTAMPREPLY:
    case ICMP4_INFOREQUEST:
    case ICMP4_INFOREPLY:
        if (icmp->icmp_code != 0) {
            return false;
        }
        /* Separate ICMP connection: identified using id */
        key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
        key->src.icmp_type = icmp->icmp_type;
        key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
        break;
    case ICMP4_DST_UNREACH:
    case ICMP4_TIME_EXCEEDED:
    case ICMP4_PARAM_PROB:
    case ICMP4_SOURCEQUENCH:
    case ICMP4_REDIRECT: {
        /* ICMP packet part of another connection.  We should
         * extract the key from the embedded packet header. */
        struct conn_key inner_key;
        const char *l3 = (const char *) (icmp + 1);
        const char *tail = (const char *) data + size;
        const char *l4;
        bool ok;

        if (!related) {
            return false;
        }

        memset(&inner_key, 0, sizeof inner_key);
        inner_key.dl_type = htons(ETH_TYPE_IP);
        ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
        if (!ok) {
            return false;
        }

        /* pf doesn't do this, but it seems a good idea */
        if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
            || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
            return false;
        }

        key->src = inner_key.src;
        key->dst = inner_key.dst;
        key->nw_proto = inner_key.nw_proto;

        ok = extract_l4(key, l4, tail - l4, NULL, l3);
        if (ok) {
            conn_key_reverse(key);
            *related = true;
        }
        return ok;
    }
    default:
        return false;
    }

    return true;
}

static uint8_t
reverse_icmp6_type(uint8_t type)
{
    switch (type) {
    case ICMP6_ECHO_REQUEST:
        return ICMP6_ECHO_REPLY;
    case ICMP6_ECHO_REPLY:
        return ICMP6_ECHO_REQUEST;
    default:
        OVS_NOT_REACHED();
    }
}

/* If 'related' is not NULL and the function is processing an ICMP
 * error packet, extract the l3 and l4 fields from the nested header
 * instead and set *related to true.  If 'related' is NULL we're
 * already processing a nested header and no such recursion is
 * possible. */
static inline bool
extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
                 bool *related)
{
    const struct icmp6_header *icmp6 = data;

    /* All the messages that we support need at least 4 bytes after
     * the header. */
    if (size < sizeof *icmp6 + 4) {
        return false;
    }

    switch (icmp6->icmp6_type) {
    case ICMP6_ECHO_REQUEST:
    case ICMP6_ECHO_REPLY:
        if (icmp6->icmp6_code != 0) {
            return false;
        }
        /* Separate ICMP connection: identified using id */
        key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
        key->src.icmp_type = icmp6->icmp6_type;
        key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
        break;
    case ICMP6_DST_UNREACH:
    case ICMP6_PACKET_TOO_BIG:
    case ICMP6_TIME_EXCEEDED:
    case ICMP6_PARAM_PROB: {
        /* ICMP packet part of another connection.  We should
         * extract the key from the embedded packet header. */
        struct conn_key inner_key;
        const char *l3 = (const char *) icmp6 + 8;
        const char *tail = (const char *) data + size;
        const char *l4 = NULL;
        bool ok;

        if (!related) {
            return false;
        }

        memset(&inner_key, 0, sizeof inner_key);
        inner_key.dl_type = htons(ETH_TYPE_IPV6);
        ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
        if (!ok) {
            return false;
        }

        /* pf doesn't do this, but it seems a good idea */
        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
                              &key->dst.addr.ipv6_aligned)
            || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
                                 &key->src.addr.ipv6_aligned)) {
            return false;
        }

        key->src = inner_key.src;
        key->dst = inner_key.dst;
        key->nw_proto = inner_key.nw_proto;

        ok = extract_l4(key, l4, tail - l4, NULL, l3);
        if (ok) {
            conn_key_reverse(key);
            *related = true;
        }
        return ok;
    }
    default:
        return false;
    }

    return true;
}

/* Extract l4 fields into 'key', which must already contain valid l3
 * members.
 *
 * If 'related' is not NULL and an ICMP error packet is being
 * processed, the function will extract the key from the packet nested
 * in the ICMP payload and set '*related' to true.
 *
 * If 'related' is NULL, it means that we're already parsing a header nested
 * in an ICMP error.  In this case, we skip checksum and length validation. */
static inline bool
extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
           const void *l3)
{
    if (key->nw_proto == IPPROTO_TCP) {
        return (!related || check_l4_tcp(key, data, size, l3))
               && extract_l4_tcp(key, data, size);
    } else if (key->nw_proto == IPPROTO_UDP) {
        return (!related || check_l4_udp(key, data, size, l3))
               && extract_l4_udp(key, data, size);
    } else if (key->dl_type == htons(ETH_TYPE_IP)
               && key->nw_proto == IPPROTO_ICMP) {
        return (!related || check_l4_icmp(data, size))
               && extract_l4_icmp(key, data, size, related);
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)
               && key->nw_proto == IPPROTO_ICMPV6) {
        return (!related || check_l4_icmp6(key, data, size, l3))
               && extract_l4_icmp6(key, data, size, related);
    } else {
        return false;
    }
}

static bool
conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
                 struct conn_lookup_ctx *ctx, uint16_t zone)
{
    const struct eth_header *l2 = dp_packet_l2(pkt);
    const struct ip_header *l3 = dp_packet_l3(pkt);
    const char *l4 = dp_packet_l4(pkt);
    const char *tail = dp_packet_tail(pkt);
    bool ok;

    memset(ctx, 0, sizeof *ctx);

    if (!l2 || !l3 || !l4) {
        return false;
    }

    ctx->key.zone = zone;

    /* XXX In this function we parse the packet (again, it has already
     * gone through miniflow_extract()) for two reasons:
     *
     * 1) To extract the l3 addresses and l4 ports.
     *    We already have the l3 and l4 headers' pointers.  Extracting
     *    the l3 addresses and the l4 ports is really cheap, since they
     *    can be found at fixed locations.
     * 2) To extract the l4 type.
     *    Extracting the l4 type, for IPv6, can be quite expensive, because
     *    it's not at a fixed location.
     *
     * Here's a way to avoid (2) with the help of the datapath.
     * The datapath doesn't keep the packet's extracted flow[1], so
     * using that is not an option.  We could use the packet's matching
     * megaflow, but we have to make sure that the l4 type (nw_proto)
     * is unwildcarded.  This means either:
     *
     * a) dpif-netdev unwildcards the l4 type when a new flow is installed
     *    if the actions contain ct().
     *
     * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
     *    action.  This is already done in different actions, but it's
     *    unnecessary for the kernel.
     *
     * ---
     * [1] The reasons for this are that keeping the flow increases
     *     (slightly) the cache footprint and increases computation
     *     time as we move the packet around.  Most importantly, the flow
     *     should be updated by the actions and this can be slow, as
     *     we use a sparse representation (miniflow). */
    ctx->key.dl_type = dl_type;
    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
        ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, true);
    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
        ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
    } else {
        ok = false;
    }

    if (ok) {
        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
            return true;
        }
    }

    return false;
}
\f
/* Symmetric */
static uint32_t
conn_key_hash(const struct conn_key *key, uint32_t basis)
{
    uint32_t hsrc, hdst, hash;
    int i;

    hsrc = hdst = basis;

    /* Hash the source and destination tuple. */
    for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
        hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
        hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
    }

    /* Even if source and destination are swapped the hash will be the same. */
    hash = hsrc ^ hdst;

    /* Hash the rest of the key (L3 and L4 types and zone). */
    hash = hash_words((uint32_t *) (&key->dst + 1),
                      (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
                      hash);

    return hash;
}

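/* Swaps the source and destination endpoints of 'key', turning it into the
 * key of the opposite direction. */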
static void
conn_key_reverse(struct conn_key *key)
{
    struct ct_endpoint tmp;

    tmp = key->src;
    key->src = key->dst;
    key->dst = tmp;
}

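/* Looks up 'ctx->key' (using the precomputed 'ctx->hash') in 'ctb'.  On a
 * match, sets 'ctx->conn' to the connection and 'ctx->reply' to true if the
 * key matched the reply direction.  Expired connections are skipped. */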
static void
conn_key_lookup(struct conntrack_bucket *ctb,
                struct conn_lookup_ctx *ctx,
                long long now)
{
    uint32_t hash = ctx->hash;
    struct conn *conn;

    ctx->conn = NULL;

    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
        if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))
            && !conn_expired(conn, now)) {
            ctx->conn = conn;
            ctx->reply = false;
            break;
        }
        if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))
            && !conn_expired(conn, now)) {
            ctx->conn = conn;
            ctx->reply = true;
            break;
        }
    }
}

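/* Dispatches the state update for 'pkt' to the per-L4-protocol handler
 * registered in l4_protos[]. */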
static enum ct_update_res
conn_update(struct conn *conn, struct conntrack_bucket *ctb,
            struct dp_packet *pkt, bool reply, long long now)
{
    return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
                                                      reply, now);
}

static bool
conn_expired(struct conn *conn, long long now)
{
    return now >= conn->expiration;
}

static bool
valid_new(struct dp_packet *pkt, struct conn_key *key)
{
    return l4_protos[key->nw_proto]->valid_new(pkt);
}

static struct conn *
new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
         struct conn_key *key, long long now)
{
    struct conn *newconn;

    newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);

    if (newconn) {
        newconn->key = *key;
    }

    return newconn;
}

static void
delete_conn(struct conn *conn)
{
    free(conn);
}
\f
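/* Conversion of internal connection keys and entries to the ct-dpif format
 * used by the dump interface below. */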
static void
ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
                                 union ct_dpif_inet_addr *b,
                                 ovs_be16 dl_type)
{
    if (dl_type == htons(ETH_TYPE_IP)) {
        b->ip = a->ipv4_aligned;
    } else if (dl_type == htons(ETH_TYPE_IPV6)) {
        b->in6 = a->ipv6_aligned;
    }
}

static void
conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
{
    if (key->dl_type == htons(ETH_TYPE_IP)) {
        tuple->l3_type = AF_INET;
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
        tuple->l3_type = AF_INET6;
    }
    tuple->ip_proto = key->nw_proto;
    ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
                                     key->dl_type);
    ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
                                     key->dl_type);

    if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
        tuple->icmp_id = key->src.icmp_id;
        tuple->icmp_type = key->src.icmp_type;
        tuple->icmp_code = key->src.icmp_code;
    } else {
        tuple->src_port = key->src.port;
        tuple->dst_port = key->dst.port;
    }
}

static void
conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
                      long long now)
{
    struct ct_l4_proto *class;
    long long expiration;

    memset(entry, 0, sizeof *entry);
    conn_key_to_tuple(&conn->key, &entry->tuple_orig);
    conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);

    entry->zone = conn->key.zone;
    entry->mark = conn->mark;

    memcpy(&entry->labels, &conn->label, sizeof(entry->labels));
    /* Not implemented yet. */
    entry->timestamp.start = 0;
    entry->timestamp.stop = 0;

    expiration = conn->expiration - now;
    entry->timeout = (expiration > 0) ? expiration / 1000 : 0;

    class = l4_protos[conn->key.nw_proto];
    if (class->conn_get_protoinfo) {
        class->conn_get_protoinfo(conn, &entry->protoinfo);
    }
}

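/* Starts a dump of the connections in 'ct'.  If 'pzone' is nonnull, only
 * connections in '*pzone' are dumped.  The caller iterates with
 * conntrack_dump_next() until it returns EOF, then calls
 * conntrack_dump_done(). */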
int
conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
                     const uint16_t *pzone)
{
    memset(dump, 0, sizeof(*dump));
    if (pzone) {
        dump->zone = *pzone;
        dump->filter_zone = true;
    }
    dump->ct = ct;

    return 0;
}

int
conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
{
    struct conntrack *ct = dump->ct;
    long long now = time_msec();

    while (dump->bucket < CONNTRACK_BUCKETS) {
        struct hmap_node *node;

        ct_lock_lock(&ct->buckets[dump->bucket].lock);
        for (;;) {
            struct conn *conn;

            node = hmap_at_position(&ct->buckets[dump->bucket].connections,
                                    &dump->bucket_pos);
            if (!node) {
                break;
            }
            INIT_CONTAINER(conn, node, node);
            if (!dump->filter_zone || conn->key.zone == dump->zone) {
                conn_to_ct_dpif_entry(conn, entry, now);
                break;
            }
            /* Else continue, until we find an entry in the appropriate zone
             * or the bucket has been scanned completely. */
        }
        ct_lock_unlock(&ct->buckets[dump->bucket].lock);

        if (!node) {
            memset(&dump->bucket_pos, 0, sizeof dump->bucket_pos);
            dump->bucket++;
        } else {
            return 0;
        }
    }
    return EOF;
}

int
conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
{
    return 0;
}

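/* Deletes all the connections in 'ct', or, if 'zone' is nonnull, only the
 * connections in '*zone'. */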
int
conntrack_flush(struct conntrack *ct, const uint16_t *zone)
{
    unsigned i;

    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conn *conn, *next;

        ct_lock_lock(&ct->buckets[i].lock);
        HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
            if (!zone || *zone == conn->key.zone) {
                ovs_list_remove(&conn->exp_node);
                hmap_remove(&ct->buckets[i].connections, &conn->node);
                atomic_count_dec(&ct->n_conn);
                delete_conn(conn);
            }
        }
        ct_lock_unlock(&ct->buckets[i].lock);
    }

    return 0;
}