conntrack: Do not create new connections from ICMP errors.
lib/conntrack.c
/*
 * Copyright (c) 2015, 2016 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "conntrack.h"

#include <errno.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>

#include "bitmap.h"
#include "conntrack-private.h"
#include "coverage.h"
#include "csum.h"
#include "ct-dpif.h"
#include "dp-packet.h"
#include "flow.h"
#include "netdev.h"
#include "odp-netlink.h"
#include "openvswitch/hmap.h"
#include "openvswitch/vlog.h"
#include "ovs-rcu.h"
#include "ovs-thread.h"
#include "poll-loop.h"
#include "random.h"
#include "timeval.h"

VLOG_DEFINE_THIS_MODULE(conntrack);

COVERAGE_DEFINE(conntrack_full);
COVERAGE_DEFINE(conntrack_long_cleanup);

struct conn_lookup_ctx {
    struct conn_key key;
    struct conn *conn;
    uint32_t hash;
    bool reply;
    bool related;
};

static bool conn_key_extract(struct conntrack *, struct dp_packet *,
                             ovs_be16 dl_type, struct conn_lookup_ctx *,
                             uint16_t zone);
static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
static void conn_key_reverse(struct conn_key *);
static void conn_key_lookup(struct conntrack_bucket *ctb,
                            struct conn_lookup_ctx *ctx,
                            long long now);
static bool valid_new(struct dp_packet *pkt, struct conn_key *);
static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
                             struct conn_key *, long long now);
static void delete_conn(struct conn *);
static enum ct_update_res conn_update(struct conn *,
                                      struct conntrack_bucket *ctb,
                                      struct dp_packet *, bool reply,
                                      long long now);
static bool conn_expired(struct conn *, long long now);
static void set_mark(struct dp_packet *, struct conn *,
                     uint32_t val, uint32_t mask);
static void set_label(struct dp_packet *, struct conn *,
                      const struct ovs_key_ct_labels *val,
                      const struct ovs_key_ct_labels *mask);
static void *clean_thread_main(void *f_);

static struct ct_l4_proto *l4_protos[] = {
    [IPPROTO_TCP] = &ct_proto_tcp,
    [IPPROTO_UDP] = &ct_proto_other,
    [IPPROTO_ICMP] = &ct_proto_icmp4,
    [IPPROTO_ICMPV6] = &ct_proto_icmp6,
};

long long ct_timeout_val[] = {
#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
    CT_TIMEOUTS
#undef CT_TIMEOUT
};
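
/* Illustration only: CT_TIMEOUTS is an X-macro list defined in
 * conntrack-private.h, and each CT_TIMEOUT(NAME, VAL) entry becomes one
 * initializer of ct_timeout_val[].  For a hypothetical entry such as
 * CT_TIMEOUT(TCP_ESTABLISHED, 24 * 60 * 60 * 1000), the expansion above
 * yields:
 *
 *     [CT_TM_TCP_ESTABLISHED] = 24 * 60 * 60 * 1000,
 */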

/* If the total number of connections goes above this value, no new connections
 * are accepted. */
#define DEFAULT_N_CONN_LIMIT 3000000

/* Initializes the connection tracker 'ct'.  The caller is responsible for
 * calling 'conntrack_destroy()' when the instance is no longer needed. */
void
conntrack_init(struct conntrack *ct)
{
    unsigned i, j;
    long long now = time_msec();

    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];

        ct_lock_init(&ctb->lock);
        ct_lock_lock(&ctb->lock);
        hmap_init(&ctb->connections);
        for (j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
            ovs_list_init(&ctb->exp_lists[j]);
        }
        ct_lock_unlock(&ctb->lock);
        ovs_mutex_init(&ctb->cleanup_mutex);
        ovs_mutex_lock(&ctb->cleanup_mutex);
        ctb->next_cleanup = now + CT_TM_MIN;
        ovs_mutex_unlock(&ctb->cleanup_mutex);
    }
    ct->hash_basis = random_uint32();
    atomic_count_init(&ct->n_conn, 0);
    atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
    latch_init(&ct->clean_thread_exit);
    ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
}
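
/* Minimal lifecycle sketch for a caller that owns the tracker (illustration
 * only; the calls shown are the ones defined in this file):
 *
 *     struct conntrack ct;
 *
 *     conntrack_init(&ct);
 *     ...                      // conntrack_execute() on packet batches
 *     conntrack_destroy(&ct);
 */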

/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
void
conntrack_destroy(struct conntrack *ct)
{
    unsigned i;

    latch_set(&ct->clean_thread_exit);
    pthread_join(ct->clean_thread, NULL);
    latch_destroy(&ct->clean_thread_exit);
    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];
        struct conn *conn;

        ovs_mutex_destroy(&ctb->cleanup_mutex);
        ct_lock_lock(&ctb->lock);
        HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
            atomic_count_dec(&ct->n_conn);
            delete_conn(conn);
        }
        hmap_destroy(&ctb->connections);
        ct_lock_unlock(&ctb->lock);
        ct_lock_destroy(&ctb->lock);
    }
}
\f
static unsigned hash_to_bucket(uint32_t hash)
{
    /* Extracts the most significant bits in hash. The least significant bits
     * are already used internally by the hmap implementation. */
    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);

    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
}
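
/* Worked example: assuming the usual CONNTRACK_BUCKETS_SHIFT == 8 from
 * conntrack-private.h (so CONNTRACK_BUCKETS == 1 << 8 == 256), a hash of
 * 0xdeadbeef selects bucket 0xdeadbeef >> (32 - 8) == 0xde == 222.  The
 * final "% CONNTRACK_BUCKETS" is then a no-op, kept as a defensive guard. */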

static void
write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
            uint32_t mark, ovs_u128 label)
{
    pkt->md.ct_state = state | CS_TRACKED;
    pkt->md.ct_zone = zone;
    pkt->md.ct_mark = mark;
    pkt->md.ct_label = label;
}

static struct conn *
conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
               struct conn_lookup_ctx *ctx, uint16_t *state, bool commit,
               long long now)
{
    unsigned bucket = hash_to_bucket(ctx->hash);
    struct conn *nc = NULL;

    if (!valid_new(pkt, &ctx->key)) {
        *state |= CS_INVALID;
        return nc;
    }

    *state |= CS_NEW;

    if (commit) {
        unsigned int n_conn_limit;

        atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);

        if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
            COVERAGE_INC(conntrack_full);
            return nc;
        }

        nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);

        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);

        conn_key_reverse(&nc->rev_key);
        hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
        atomic_count_inc(&ct->n_conn);
    }

    return nc;
}

static struct conn *
process_one(struct conntrack *ct, struct dp_packet *pkt,
            struct conn_lookup_ctx *ctx, uint16_t zone,
            bool commit, long long now)
{
    unsigned bucket = hash_to_bucket(ctx->hash);
    struct conn *conn = ctx->conn;
    uint16_t state = 0;

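    /* A 'related' packet (an ICMP error) can only be associated with an
     * existing connection: it is never passed to conn_update() and, per this
     * commit, never creates a new one; without a matching connection it is
     * simply marked invalid below. */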
    if (conn) {
        if (ctx->related) {
            state |= CS_RELATED;
            if (ctx->reply) {
                state |= CS_REPLY_DIR;
            }
        } else {
            enum ct_update_res res;

            res = conn_update(conn, &ct->buckets[bucket], pkt,
                              ctx->reply, now);

            switch (res) {
            case CT_UPDATE_VALID:
                state |= CS_ESTABLISHED;
                if (ctx->reply) {
                    state |= CS_REPLY_DIR;
                }
                break;
            case CT_UPDATE_INVALID:
                state |= CS_INVALID;
                break;
            case CT_UPDATE_NEW:
                ovs_list_remove(&conn->exp_node);
                hmap_remove(&ct->buckets[bucket].connections, &conn->node);
                atomic_count_dec(&ct->n_conn);
                delete_conn(conn);
                conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
                break;
            default:
                OVS_NOT_REACHED();
            }
        }
    } else {
        if (ctx->related) {
            state |= CS_INVALID;
        } else {
            conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
        }
    }

    write_ct_md(pkt, state, zone, conn ? conn->mark : 0,
                conn ? conn->label : OVS_U128_ZERO);

    return conn;
}

/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
 * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
 * the l3 and l4 offsets properly set.
 *
 * If 'commit' is true, the packets are allowed to create new entries in the
 * connection tables.  'setmark', if not NULL, should point to a two-element
 * array containing a value and a mask to set the connection mark.
 * 'setlabel' behaves similarly for the connection label. */
int
conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
                  ovs_be16 dl_type, bool commit, uint16_t zone,
                  const uint32_t *setmark,
                  const struct ovs_key_ct_labels *setlabel,
                  const char *helper)
{
    struct dp_packet **pkts = pkt_batch->packets;
    size_t cnt = pkt_batch->count;
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t KEY_ARRAY_SIZE = cnt;
#else
    enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];
    int8_t bucket_list[CONNTRACK_BUCKETS];
    struct {
        unsigned bucket;
        unsigned long maps;
    } arr[KEY_ARRAY_SIZE];
    long long now = time_msec();
    size_t i = 0;
    uint8_t arrcnt = 0;

    BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST);

    if (helper) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);

        VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
        /* Continue without the helper. */
    }

    memset(bucket_list, INT8_C(-1), sizeof bucket_list);
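    /* Group the batch by bucket before taking any lock: 'bucket_list[b]' is
     * the index into 'arr[]' assigned to bucket 'b' (-1 when unused), and
     * each 'arr[k].maps' is a bitmap of the batch indexes hashing to that
     * bucket, so every bucket lock is taken at most once per batch. */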
    for (i = 0; i < cnt; i++) {
        unsigned bucket;

        if (!conn_key_extract(ct, pkts[i], dl_type, &ctxs[i], zone)) {
            write_ct_md(pkts[i], CS_INVALID, zone, 0, OVS_U128_ZERO);
            continue;
        }

        bucket = hash_to_bucket(ctxs[i].hash);
        if (bucket_list[bucket] == INT8_C(-1)) {
            bucket_list[bucket] = arrcnt;

            arr[arrcnt].maps = 0;
            ULLONG_SET1(arr[arrcnt].maps, i);
            arr[arrcnt++].bucket = bucket;
        } else {
            ULLONG_SET1(arr[bucket_list[bucket]].maps, i);
        }
    }

    for (i = 0; i < arrcnt; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket];
        size_t j;

        ct_lock_lock(&ctb->lock);

        ULLONG_FOR_EACH_1(j, arr[i].maps) {
            struct conn *conn;

            conn_key_lookup(ctb, &ctxs[j], now);

            conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);

            if (conn && setmark) {
                set_mark(pkts[j], conn, setmark[0], setmark[1]);
            }

            if (conn && setlabel) {
                set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
            }
        }
        ct_lock_unlock(&ctb->lock);
    }

    return 0;
}
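
/* Hypothetical caller sketch (the surrounding setup is invented for
 * illustration): commit a batch of IPv4 packets in zone 5 and set the low
 * byte of each connection mark to 1, leaving the other mark bits untouched:
 *
 *     const uint32_t setmark[2] = { 1, 0xff };    // value, mask
 *
 *     conntrack_execute(&ct, &batch, htons(ETH_TYPE_IP), true, 5,
 *                       setmark, NULL, NULL);
 */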

static void
set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
{
    pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
    conn->mark = pkt->md.ct_mark;
}

static void
set_label(struct dp_packet *pkt, struct conn *conn,
          const struct ovs_key_ct_labels *val,
          const struct ovs_key_ct_labels *mask)
{
    ovs_u128 v, m;

    memcpy(&v, val, sizeof v);
    memcpy(&m, mask, sizeof m);

    pkt->md.ct_label.u64.lo = v.u64.lo
                              | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
    pkt->md.ct_label.u64.hi = v.u64.hi
                              | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
    conn->label = pkt->md.ct_label;
}
\f
/* Delete the expired connections from 'ctb', up to 'limit'.  Returns the
 * earliest expiration time among the remaining connections in 'ctb'.  Returns
 * LLONG_MAX if 'ctb' is empty.  The return value might be smaller than 'now',
 * if 'limit' is reached. */
static long long
sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
             size_t limit)
    OVS_REQUIRES(ctb->lock)
{
    struct conn *conn, *next;
    long long min_expiration = LLONG_MAX;
    unsigned i;
    size_t count = 0;

    for (i = 0; i < N_CT_TM; i++) {
        LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
            if (!conn_expired(conn, now) || count >= limit) {
                min_expiration = MIN(min_expiration, conn->expiration);
                if (count >= limit) {
                    /* Do not check other lists. */
                    COVERAGE_INC(conntrack_long_cleanup);
                    return min_expiration;
                }
                break;
            }
            ovs_list_remove(&conn->exp_node);
            hmap_remove(&ctb->connections, &conn->node);
            atomic_count_dec(&ct->n_conn);
            delete_conn(conn);
            count++;
        }
    }

    return min_expiration;
}

/* Cleans up old connection entries from 'ct'.  Returns the time when the
 * next expiration might happen.  The return value might be smaller than
 * 'now', meaning that an internal limit has been reached, and some expired
 * connections have not been deleted. */
static long long
conntrack_clean(struct conntrack *ct, long long now)
{
    long long next_wakeup = now + CT_TM_MIN;
    unsigned int n_conn_limit;
    size_t clean_count = 0;
    unsigned i;

    atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);

    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];
        size_t prev_count;
        long long min_exp;

        ovs_mutex_lock(&ctb->cleanup_mutex);
        if (ctb->next_cleanup > now) {
            goto next_bucket;
        }

        ct_lock_lock(&ctb->lock);
        prev_count = hmap_count(&ctb->connections);
        /* If the connections are well distributed among buckets, we want to
         * limit to 10% of the global limit equally split among buckets.  If
         * the bucket is busier than the others, we limit to 10% of its
         * current size. */
        min_exp = sweep_bucket(ct, ctb, now,
                               MAX(prev_count / 10,
                                   n_conn_limit / (CONNTRACK_BUCKETS * 10)));
        clean_count += prev_count - hmap_count(&ctb->connections);

        if (min_exp > now) {
            /* We call hmap_shrink() only if sweep_bucket() managed to delete
             * every expired connection. */
            hmap_shrink(&ctb->connections);
        }

        ct_lock_unlock(&ctb->lock);

        ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);

next_bucket:
        next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
        ovs_mutex_unlock(&ctb->cleanup_mutex);
    }

    VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
             clean_count, time_msec() - now);

    return next_wakeup;
}

/* Cleanup:
 *
 * We must call conntrack_clean() periodically.  conntrack_clean()'s return
 * value gives a hint on when the next cleanup must be done (either because
 * there is an actual connection that expires, or because a new connection
 * might be created with the minimum timeout).
 *
 * The logic below has two goals:
 *
 * - We want to reduce the number of wakeups and batch connection cleanup
 *   when the load is not very high.  CT_CLEAN_INTERVAL ensures that if we
 *   are coping with the current cleanup tasks, then we wait at least
 *   5 seconds to do further cleanup.
 *
 * - We don't want to keep the buckets locked too long, as we might prevent
 *   traffic from flowing.  CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
 *   behind, there are at least some 200 ms blocks of time when buckets will
 *   be left alone, so the datapath can operate unhindered.
 */
#define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
#define CT_CLEAN_MIN_INTERVAL 200  /* 0.2 seconds */
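
/* Example of the resulting behavior: if conntrack_clean() returns a time in
 * the past (sweep_bucket() hit its limit), clean_thread_main() below sleeps
 * only CT_CLEAN_MIN_INTERVAL (200 ms) before retrying; otherwise it sleeps
 * until MAX(next_wake, now + CT_CLEAN_INTERVAL), i.e. at least 5 seconds. */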

static void *
clean_thread_main(void *f_)
{
    struct conntrack *ct = f_;

    while (!latch_is_set(&ct->clean_thread_exit)) {
        long long next_wake;
        long long now = time_msec();

        next_wake = conntrack_clean(ct, now);

        if (next_wake < now) {
            poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
        } else {
            poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
        }
        latch_wait(&ct->clean_thread_exit);
        poll_block();
    }

    return NULL;
}
\f
/* Key extraction */

/* The function stores a pointer to the first byte after the header in
 * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 * not interested in the header's tail, meaning that the header has
 * already been parsed (e.g. by flow_extract): we take this as a hint to
 * save a few checks.  If 'validate_checksum' is true, the function returns
 * false if the IPv4 checksum is invalid. */
static inline bool
extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
                const char **new_data, bool validate_checksum)
{
    const struct ip_header *ip = data;
    size_t ip_len;

    if (new_data) {
        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
            return false;
        }
    }

    ip_len = IP_IHL(ip->ip_ihl_ver) * 4;

    if (new_data) {
        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
            return false;
        }
        if (OVS_UNLIKELY(size < ip_len)) {
            return false;
        }

        *new_data = (char *) data + ip_len;
    }

    if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
        return false;
    }

    if (validate_checksum && csum(data, ip_len) != 0) {
        return false;
    }

    key->src.addr.ipv4 = ip->ip_src;
    key->dst.addr.ipv4 = ip->ip_dst;
    key->nw_proto = ip->ip_proto;

    return true;
}

/* The function stores a pointer to the first byte after the header in
 * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 * not interested in the header's tail, meaning that the header has
 * already been parsed (e.g. by flow_extract): we take this as a hint to
 * save a few checks. */
static inline bool
extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
                const char **new_data)
{
    const struct ovs_16aligned_ip6_hdr *ip6 = data;
    uint8_t nw_proto = ip6->ip6_nxt;
    uint8_t nw_frag = 0;

    if (new_data) {
        if (OVS_UNLIKELY(size < sizeof *ip6)) {
            return false;
        }
    }

    data = ip6 + 1;
    size -= sizeof *ip6;

    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
        return false;
    }

    if (new_data) {
        *new_data = data;
    }

    if (nw_frag) {
        return false;
    }

    key->src.addr.ipv6 = ip6->ip6_src;
    key->dst.addr.ipv6 = ip6->ip6_dst;
    key->nw_proto = nw_proto;

    return true;
}

static inline bool
checksum_valid(const struct conn_key *key, const void *data, size_t size,
               const void *l3)
{
    uint32_t csum = 0;

    if (key->dl_type == htons(ETH_TYPE_IP)) {
        csum = packet_csum_pseudoheader(l3);
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
        csum = packet_csum_pseudoheader6(l3);
    } else {
        return false;
    }

    csum = csum_continue(csum, data, size);

    return csum_finish(csum) == 0;
}

static inline bool
check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
             const void *l3)
{
    const struct tcp_header *tcp = data;
    size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;

    if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
        return false;
    }

    return checksum_valid(key, data, size, l3);
}

static inline bool
check_l4_udp(const struct conn_key *key, const void *data, size_t size,
             const void *l3)
{
    const struct udp_header *udp = data;
    size_t udp_len = ntohs(udp->udp_len);

    if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
        return false;
    }

    /* Validation must be skipped if the checksum is 0 on IPv4 packets. */
    return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
           || checksum_valid(key, data, size, l3);
}

static inline bool
check_l4_icmp(const void *data, size_t size)
{
    return csum(data, size) == 0;
}

static inline bool
check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
               const void *l3)
{
    return checksum_valid(key, data, size, l3);
}

static inline bool
extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
{
    const struct tcp_header *tcp = data;

    if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
        return false;
    }

    key->src.port = tcp->tcp_src;
    key->dst.port = tcp->tcp_dst;

    /* Port 0 is invalid. */
    return key->src.port && key->dst.port;
}

static inline bool
extract_l4_udp(struct conn_key *key, const void *data, size_t size)
{
    const struct udp_header *udp = data;

    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
        return false;
    }

    key->src.port = udp->udp_src;
    key->dst.port = udp->udp_dst;

    /* Port 0 is invalid. */
    return key->src.port && key->dst.port;
}

static inline bool extract_l4(struct conn_key *key, const void *data,
                              size_t size, bool *related, const void *l3);

static uint8_t
reverse_icmp_type(uint8_t type)
{
    switch (type) {
    case ICMP4_ECHO_REQUEST:
        return ICMP4_ECHO_REPLY;
    case ICMP4_ECHO_REPLY:
        return ICMP4_ECHO_REQUEST;

    case ICMP4_TIMESTAMP:
        return ICMP4_TIMESTAMPREPLY;
    case ICMP4_TIMESTAMPREPLY:
        return ICMP4_TIMESTAMP;

    case ICMP4_INFOREQUEST:
        return ICMP4_INFOREPLY;
    case ICMP4_INFOREPLY:
        return ICMP4_INFOREQUEST;
    default:
        OVS_NOT_REACHED();
    }
}

/* If 'related' is not NULL and the function is processing an ICMP
 * error packet, extract the l3 and l4 fields from the nested header
 * instead and set '*related' to true.  If 'related' is NULL, we're
 * already processing a nested header and no such recursion is
 * possible. */
static inline bool
extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
                bool *related)
{
    const struct icmp_header *icmp = data;

    if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
        return false;
    }

    switch (icmp->icmp_type) {
    case ICMP4_ECHO_REQUEST:
    case ICMP4_ECHO_REPLY:
    case ICMP4_TIMESTAMP:
    case ICMP4_TIMESTAMPREPLY:
    case ICMP4_INFOREQUEST:
    case ICMP4_INFOREPLY:
        if (icmp->icmp_code != 0) {
            return false;
        }
        /* Separate ICMP connection: identified using id. */
        key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
        key->src.icmp_type = icmp->icmp_type;
        key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
        break;
    case ICMP4_DST_UNREACH:
    case ICMP4_TIME_EXCEEDED:
    case ICMP4_PARAM_PROB:
    case ICMP4_SOURCEQUENCH:
    case ICMP4_REDIRECT: {
        /* ICMP packet part of another connection.  We should
         * extract the key from the embedded packet header. */
        struct conn_key inner_key;
        const char *l3 = (const char *) (icmp + 1);
        const char *tail = (const char *) data + size;
        const char *l4;
        bool ok;

        if (!related) {
            return false;
        }

        memset(&inner_key, 0, sizeof inner_key);
        inner_key.dl_type = htons(ETH_TYPE_IP);
        ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
        if (!ok) {
            return false;
        }

        /* pf doesn't do this, but it seems a good idea. */
        if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
            || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
            return false;
        }

        key->src = inner_key.src;
        key->dst = inner_key.dst;
        key->nw_proto = inner_key.nw_proto;

        ok = extract_l4(key, l4, tail - l4, NULL, l3);
        if (ok) {
            conn_key_reverse(key);
            *related = true;
        }
        return ok;
    }
    default:
        return false;
    }

    return true;
}

static uint8_t
reverse_icmp6_type(uint8_t type)
{
    switch (type) {
    case ICMP6_ECHO_REQUEST:
        return ICMP6_ECHO_REPLY;
    case ICMP6_ECHO_REPLY:
        return ICMP6_ECHO_REQUEST;
    default:
        OVS_NOT_REACHED();
    }
}

/* If 'related' is not NULL and the function is processing an ICMP
 * error packet, extract the l3 and l4 fields from the nested header
 * instead and set '*related' to true.  If 'related' is NULL, we're
 * already processing a nested header and no such recursion is
 * possible. */
static inline bool
extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
                 bool *related)
{
    const struct icmp6_header *icmp6 = data;

    /* All the messages that we support need at least 4 bytes after
     * the header. */
    if (size < sizeof *icmp6 + 4) {
        return false;
    }

    switch (icmp6->icmp6_type) {
    case ICMP6_ECHO_REQUEST:
    case ICMP6_ECHO_REPLY:
        if (icmp6->icmp6_code != 0) {
            return false;
        }
        /* Separate ICMP connection: identified using id. */
        key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
        key->src.icmp_type = icmp6->icmp6_type;
        key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
        break;
    case ICMP6_DST_UNREACH:
    case ICMP6_PACKET_TOO_BIG:
    case ICMP6_TIME_EXCEEDED:
    case ICMP6_PARAM_PROB: {
        /* ICMP packet part of another connection.  We should
         * extract the key from the embedded packet header. */
        struct conn_key inner_key;
        const char *l3 = (const char *) icmp6 + 8;
        const char *tail = (const char *) data + size;
        const char *l4 = NULL;
        bool ok;

        if (!related) {
            return false;
        }

        memset(&inner_key, 0, sizeof inner_key);
        inner_key.dl_type = htons(ETH_TYPE_IPV6);
        ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
        if (!ok) {
            return false;
        }

        /* pf doesn't do this, but it seems a good idea. */
        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
                              &key->dst.addr.ipv6_aligned)
            || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
                                 &key->src.addr.ipv6_aligned)) {
            return false;
        }

        key->src = inner_key.src;
        key->dst = inner_key.dst;
        key->nw_proto = inner_key.nw_proto;

        ok = extract_l4(key, l4, tail - l4, NULL, l3);
        if (ok) {
            conn_key_reverse(key);
            *related = true;
        }
        return ok;
    }
    default:
        return false;
    }

    return true;
}

/* Extract l4 fields into 'key', which must already contain valid l3
 * members.
 *
 * If 'related' is not NULL and an ICMP error packet is being
 * processed, the function will extract the key from the packet nested
 * in the ICMP payload and set '*related' to true.
 *
 * If 'related' is NULL, it means that we're already parsing a header nested
 * in an ICMP error.  In this case, we skip checksum and length validation. */
static inline bool
extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
           const void *l3)
{
    if (key->nw_proto == IPPROTO_TCP) {
        return (!related || check_l4_tcp(key, data, size, l3))
               && extract_l4_tcp(key, data, size);
    } else if (key->nw_proto == IPPROTO_UDP) {
        return (!related || check_l4_udp(key, data, size, l3))
               && extract_l4_udp(key, data, size);
    } else if (key->dl_type == htons(ETH_TYPE_IP)
               && key->nw_proto == IPPROTO_ICMP) {
        return (!related || check_l4_icmp(data, size))
               && extract_l4_icmp(key, data, size, related);
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)
               && key->nw_proto == IPPROTO_ICMPV6) {
        return (!related || check_l4_icmp6(key, data, size, l3))
               && extract_l4_icmp6(key, data, size, related);
    } else {
        return false;
    }
}

static bool
conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
                 struct conn_lookup_ctx *ctx, uint16_t zone)
{
    const struct eth_header *l2 = dp_packet_l2(pkt);
    const struct ip_header *l3 = dp_packet_l3(pkt);
    const char *l4 = dp_packet_l4(pkt);
    const char *tail = dp_packet_tail(pkt);
    bool ok;

    memset(ctx, 0, sizeof *ctx);

    if (!l2 || !l3 || !l4) {
        return false;
    }

    ctx->key.zone = zone;

    /* XXX In this function we parse the packet (again, it has already
     * gone through miniflow_extract()) for two reasons:
     *
     * 1) To extract the l3 addresses and l4 ports.
     *    We already have the l3 and l4 headers' pointers.  Extracting
     *    the l3 addresses and the l4 ports is really cheap, since they
     *    can be found at fixed locations.
     * 2) To extract the l4 type.
     *    Extracting the l4 type, for IPv6, can be quite expensive, because
     *    it's not at a fixed location.
     *
     * Here's a way to avoid (2) with the help of the datapath.
     * The datapath doesn't keep the packet's extracted flow[1], so
     * using that is not an option.  We could use the packet's matching
     * megaflow, but we have to make sure that the l4 type (nw_proto)
     * is unwildcarded.  This means either:
     *
     * a) dpif-netdev unwildcards the l4 type when a new flow is installed
     *    if the actions contain ct().
     *
     * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
     *    action.  This is already done in different actions, but it's
     *    unnecessary for the kernel.
     *
     * ---
     * [1] The reasons for this are that keeping the flow increases
     *     (slightly) the cache footprint and increases computation
     *     time as we move the packet around.  Most importantly, the flow
     *     should be updated by the actions and this can be slow, as
     *     we use a sparse representation (miniflow).
     *
     */
    ctx->key.dl_type = dl_type;
    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
        ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, true);
    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
        ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
    } else {
        ok = false;
    }

    if (ok) {
        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
            return true;
        }
    }

    return false;
}
\f
/* Symmetric */
static uint32_t
conn_key_hash(const struct conn_key *key, uint32_t basis)
{
    uint32_t hsrc, hdst, hash;
    int i;

    hsrc = hdst = basis;

    /* Hash the source and destination tuple. */
    for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
        hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
        hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
    }

    /* Even if source and destination are swapped the hash will be the same. */
    hash = hsrc ^ hdst;

    /* Hash the rest of the key (L3 and L4 types and zone). */
    hash = hash_words((uint32_t *) (&key->dst + 1),
                      (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
                      hash);

    return hash;
}
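
/* The XOR above makes the hash direction-independent, e.g. (a sketch, with
 * 'key' and 'basis' supplied by the caller):
 *
 *     struct conn_key r = key;
 *
 *     conn_key_reverse(&r);
 *     ovs_assert(conn_key_hash(&key, basis) == conn_key_hash(&r, basis));
 *
 * This is what lets conn_key_lookup() find both directions of a connection
 * with a single hash. */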

static void
conn_key_reverse(struct conn_key *key)
{
    struct ct_endpoint tmp;

    tmp = key->src;
    key->src = key->dst;
    key->dst = tmp;
}

static void
conn_key_lookup(struct conntrack_bucket *ctb,
                struct conn_lookup_ctx *ctx,
                long long now)
{
    uint32_t hash = ctx->hash;
    struct conn *conn;

    ctx->conn = NULL;

    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
        if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))
            && !conn_expired(conn, now)) {
            ctx->conn = conn;
            ctx->reply = false;
            break;
        }
        if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))
            && !conn_expired(conn, now)) {
            ctx->conn = conn;
            ctx->reply = true;
            break;
        }
    }
}

static enum ct_update_res
conn_update(struct conn *conn, struct conntrack_bucket *ctb,
            struct dp_packet *pkt, bool reply, long long now)
{
    return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
                                                      reply, now);
}

static bool
conn_expired(struct conn *conn, long long now)
{
    return now >= conn->expiration;
}

static bool
valid_new(struct dp_packet *pkt, struct conn_key *key)
{
    return l4_protos[key->nw_proto]->valid_new(pkt);
}

static struct conn *
new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
         struct conn_key *key, long long now)
{
    struct conn *newconn;

    newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);

    if (newconn) {
        newconn->key = *key;
    }

    return newconn;
}

static void
delete_conn(struct conn *conn)
{
    free(conn);
}
\f
static void
ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
                                 union ct_dpif_inet_addr *b,
                                 ovs_be16 dl_type)
{
    if (dl_type == htons(ETH_TYPE_IP)) {
        b->ip = a->ipv4_aligned;
    } else if (dl_type == htons(ETH_TYPE_IPV6)) {
        b->in6 = a->ipv6_aligned;
    }
}

static void
conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
{
    if (key->dl_type == htons(ETH_TYPE_IP)) {
        tuple->l3_type = AF_INET;
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
        tuple->l3_type = AF_INET6;
    }
    tuple->ip_proto = key->nw_proto;
    ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
                                     key->dl_type);
    ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
                                     key->dl_type);

    if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
        tuple->icmp_id = key->src.icmp_id;
        tuple->icmp_type = key->src.icmp_type;
        tuple->icmp_code = key->src.icmp_code;
    } else {
        tuple->src_port = key->src.port;
        tuple->dst_port = key->dst.port;
    }
}

static void
conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
                      long long now)
{
    struct ct_l4_proto *class;
    long long expiration;

    memset(entry, 0, sizeof *entry);
    conn_key_to_tuple(&conn->key, &entry->tuple_orig);
    conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);

    entry->zone = conn->key.zone;
    entry->mark = conn->mark;

    memcpy(&entry->labels, &conn->label, sizeof(entry->labels));
    /* Not implemented yet. */
    entry->timestamp.start = 0;
    entry->timestamp.stop = 0;

    expiration = conn->expiration - now;
    entry->timeout = (expiration > 0) ? expiration / 1000 : 0;

    class = l4_protos[conn->key.nw_proto];
    if (class->conn_get_protoinfo) {
        class->conn_get_protoinfo(conn, &entry->protoinfo);
    }
}

int
conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
                     const uint16_t *pzone)
{
    memset(dump, 0, sizeof(*dump));
    if (pzone) {
        dump->zone = *pzone;
        dump->filter_zone = true;
    }
    dump->ct = ct;

    return 0;
}

int
conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
{
    struct conntrack *ct = dump->ct;
    long long now = time_msec();

    while (dump->bucket < CONNTRACK_BUCKETS) {
        struct hmap_node *node;

        ct_lock_lock(&ct->buckets[dump->bucket].lock);
        for (;;) {
            struct conn *conn;

            node = hmap_at_position(&ct->buckets[dump->bucket].connections,
                                    &dump->bucket_pos);
            if (!node) {
                break;
            }
            INIT_CONTAINER(conn, node, node);
            if (!dump->filter_zone || conn->key.zone == dump->zone) {
                conn_to_ct_dpif_entry(conn, entry, now);
                break;
            }
            /* Else continue, until we find an entry in the appropriate zone
             * or the bucket has been scanned completely. */
        }
        ct_lock_unlock(&ct->buckets[dump->bucket].lock);

        if (!node) {
            memset(&dump->bucket_pos, 0, sizeof dump->bucket_pos);
            dump->bucket++;
        } else {
            return 0;
        }
    }
    return EOF;
}

int
conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
{
    return 0;
}
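
/* Typical dump loop (a sketch; ct_dpif_entry_uninit() from ct-dpif.h is
 * assumed to be the right helper for releasing an entry's sub-fields):
 *
 *     struct conntrack_dump dump;
 *     struct ct_dpif_entry entry;
 *     uint16_t zone = 0;
 *
 *     conntrack_dump_start(&ct, &dump, &zone);
 *     while (!conntrack_dump_next(&dump, &entry)) {
 *         ...                      // consume 'entry'
 *         ct_dpif_entry_uninit(&entry);
 *     }
 *     conntrack_dump_done(&dump);
 */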

int
conntrack_flush(struct conntrack *ct, const uint16_t *zone)
{
    unsigned i;

    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conn *conn, *next;

        ct_lock_lock(&ct->buckets[i].lock);
        HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) {
            if (!zone || *zone == conn->key.zone) {
                ovs_list_remove(&conn->exp_node);
                hmap_remove(&ct->buckets[i].connections, &conn->node);
                atomic_count_dec(&ct->n_conn);
                delete_conn(conn);
            }
        }
        ct_lock_unlock(&ct->buckets[i].lock);
    }

    return 0;
}