lib/conntrack.c
1 /*
2 * Copyright (c) 2015, 2016 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18 #include "conntrack.h"
19
20 #include <errno.h>
21 #include <sys/types.h>
22 #include <netinet/in.h>
23 #include <netinet/icmp6.h>
24
25 #include "bitmap.h"
26 #include "conntrack-private.h"
27 #include "coverage.h"
28 #include "csum.h"
29 #include "ct-dpif.h"
30 #include "dp-packet.h"
31 #include "flow.h"
32 #include "netdev.h"
33 #include "odp-netlink.h"
34 #include "openvswitch/hmap.h"
35 #include "openvswitch/vlog.h"
36 #include "ovs-rcu.h"
37 #include "ovs-thread.h"
38 #include "poll-loop.h"
39 #include "random.h"
40 #include "timeval.h"
41
42 VLOG_DEFINE_THIS_MODULE(conntrack);
43
44 COVERAGE_DEFINE(conntrack_full);
45 COVERAGE_DEFINE(conntrack_long_cleanup);
46
47 struct conn_lookup_ctx {
48 struct conn_key key;
49 struct conn *conn;
50 uint32_t hash;
51 bool reply;
52 bool related;
53 };
54
55 static bool conn_key_extract(struct conntrack *, struct dp_packet *,
56 ovs_be16 dl_type, struct conn_lookup_ctx *,
57 uint16_t zone);
58 static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
59 static void conn_key_reverse(struct conn_key *);
60 static void conn_key_lookup(struct conntrack_bucket *ctb,
61 struct conn_lookup_ctx *ctx,
62 long long now);
63 static bool valid_new(struct dp_packet *pkt, struct conn_key *);
64 static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
65 struct conn_key *, long long now);
66 static void delete_conn(struct conn *);
67 static enum ct_update_res conn_update(struct conn *,
68 struct conntrack_bucket *ctb,
69 struct dp_packet *, bool reply,
70 long long now);
71 static bool conn_expired(struct conn *, long long now);
72 static void set_mark(struct dp_packet *, struct conn *,
73 uint32_t val, uint32_t mask);
74 static void set_label(struct dp_packet *, struct conn *,
75 const struct ovs_key_ct_labels *val,
76 const struct ovs_key_ct_labels *mask);
77 static void *clean_thread_main(void *f_);
78
79 static struct ct_l4_proto *l4_protos[] = {
80 [IPPROTO_TCP] = &ct_proto_tcp,
81 [IPPROTO_UDP] = &ct_proto_other,
82 [IPPROTO_ICMP] = &ct_proto_icmp4,
83 [IPPROTO_ICMPV6] = &ct_proto_icmp6,
84 };
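/* In the table above, UDP shares the generic 'ct_proto_other' handler:
 * it is tracked with timeouts only, without a TCP-style state machine. */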
85
86 long long ct_timeout_val[] = {
87 #define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
88 CT_TIMEOUTS
89 #undef CT_TIMEOUT
90 };
91
92 /* If the total number of connections reaches this value, no new connections
93 * are accepted. */
94 #define DEFAULT_N_CONN_LIMIT 3000000
95
96 /* Initializes the connection tracker 'ct'. The caller is responsible for
97 * calling 'conntrack_destroy()' when the instance is no longer needed. */
98 void
99 conntrack_init(struct conntrack *ct)
100 {
101 unsigned i, j;
102 long long now = time_msec();
103
104 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
105 struct conntrack_bucket *ctb = &ct->buckets[i];
106
107 ct_lock_init(&ctb->lock);
108 ct_lock_lock(&ctb->lock);
109 hmap_init(&ctb->connections);
110 for (j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
111 ovs_list_init(&ctb->exp_lists[j]);
112 }
113 ct_lock_unlock(&ctb->lock);
114 ovs_mutex_init(&ctb->cleanup_mutex);
115 ovs_mutex_lock(&ctb->cleanup_mutex);
116 ctb->next_cleanup = now + CT_TM_MIN;
117 ovs_mutex_unlock(&ctb->cleanup_mutex);
118 }
119 ct->hash_basis = random_uint32();
120 atomic_count_init(&ct->n_conn, 0);
121 atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
122 latch_init(&ct->clean_thread_exit);
123 ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
124 }
125
126 /* Destroys the connection tracker 'ct' and frees all the allocated memory. */
127 void
128 conntrack_destroy(struct conntrack *ct)
129 {
130 unsigned i;
131
132 latch_set(&ct->clean_thread_exit);
133 pthread_join(ct->clean_thread, NULL);
134 latch_destroy(&ct->clean_thread_exit);
135 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
136 struct conntrack_bucket *ctb = &ct->buckets[i];
137 struct conn *conn;
138
139 ovs_mutex_destroy(&ctb->cleanup_mutex);
140 ct_lock_lock(&ctb->lock);
141 HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
142 atomic_count_dec(&ct->n_conn);
143 delete_conn(conn);
144 }
145 hmap_destroy(&ctb->connections);
146 ct_lock_unlock(&ctb->lock);
147 ct_lock_destroy(&ctb->lock);
148 }
149 }
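/* A minimal lifecycle sketch (hypothetical caller, compiled out; only the
 * three conntrack_*() calls below come from this file's API).
 * 'conntrack_init()' must eventually be paired with 'conntrack_destroy()';
 * packet batches go through 'conntrack_execute()' in between. */
#if 0
static void
conntrack_lifecycle_sketch(struct dp_packet_batch *batch)
{
    struct conntrack ct;

    conntrack_init(&ct);
    /* Track an IPv4 batch in zone 0; 'commit' allows new entries. */
    conntrack_execute(&ct, batch, htons(ETH_TYPE_IP), false, true, 0,
                      NULL, NULL, NULL);
    conntrack_destroy(&ct);
}
#endif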
150 \f
151 static unsigned hash_to_bucket(uint32_t hash)
152 {
153 /* Extracts the most significant bits in hash. The least significant bits
154 * are already used internally by the hmap implementation. */
155 BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
156
157 return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
158 }
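/* A worked example, assuming CONNTRACK_BUCKETS_SHIFT is 8 (256 buckets):
 * hash 0xAABBCCDD >> (32 - 8) == 0xAA == 170, so the connection goes to
 * bucket 170. The final modulo is then a no-op and only guards the boundary
 * values of the shift permitted by the BUILD_ASSERT above. */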
159
160 static void
161 write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
162 const struct conn *conn, const struct conn_key *key)
163 {
164 pkt->md.ct_state = state | CS_TRACKED;
165 pkt->md.ct_zone = zone;
166 pkt->md.ct_mark = conn ? conn->mark : 0;
167 pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;
168
169 /* Use the original direction tuple if we have it. */
170 if (conn) {
171 key = &conn->key;
172 }
173 pkt->md.ct_orig_tuple_ipv6 = false;
174 if (key) {
175 if (key->dl_type == htons(ETH_TYPE_IP)) {
176 pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
177 key->src.addr.ipv4_aligned,
178 key->dst.addr.ipv4_aligned,
179 key->nw_proto != IPPROTO_ICMP
180 ? key->src.port : htons(key->src.icmp_type),
181 key->nw_proto != IPPROTO_ICMP
182 ? key->dst.port : htons(key->src.icmp_code),
183 key->nw_proto,
184 };
185 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
186 pkt->md.ct_orig_tuple_ipv6 = true;
187 pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
188 key->src.addr.ipv6_aligned,
189 key->dst.addr.ipv6_aligned,
190 key->nw_proto != IPPROTO_ICMPV6
191 ? key->src.port : htons(key->src.icmp_type),
192 key->nw_proto != IPPROTO_ICMPV6
193 ? key->dst.port : htons(key->src.icmp_code),
194 key->nw_proto,
195 };
196 }
197 } else {
198 memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
199 }
200 }
201
202 static struct conn *
203 conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
204 struct conn_lookup_ctx *ctx, uint16_t *state, bool commit,
205 long long now)
206 {
207 unsigned bucket = hash_to_bucket(ctx->hash);
208 struct conn *nc = NULL;
209
210 if (!valid_new(pkt, &ctx->key)) {
211 *state |= CS_INVALID;
212 return nc;
213 }
214
215 *state |= CS_NEW;
216
217 if (commit) {
218 unsigned int n_conn_limit;
219
220 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
221
222 if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
223 COVERAGE_INC(conntrack_full);
224 return nc;
225 }
226
227 nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
228
229 memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);
230
231 conn_key_reverse(&nc->rev_key);
232 hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
233 atomic_count_inc(&ct->n_conn);
234 }
235
236 return nc;
237 }
238
239 static struct conn *
240 process_one(struct conntrack *ct, struct dp_packet *pkt,
241 struct conn_lookup_ctx *ctx, uint16_t zone,
242 bool force, bool commit, long long now)
243 {
244 unsigned bucket = hash_to_bucket(ctx->hash);
245 struct conn *conn = ctx->conn;
246 uint16_t state = 0;
247
248 /* Delete found entry if in wrong direction. 'force' implies commit. */
249 if (conn && force && ctx->reply) {
250 ovs_list_remove(&conn->exp_node);
251 hmap_remove(&ct->buckets[bucket].connections, &conn->node);
252 atomic_count_dec(&ct->n_conn);
253 delete_conn(conn);
254 conn = NULL;
255 }
256
257 if (conn) {
258 if (ctx->related) {
259 state |= CS_RELATED;
260 if (ctx->reply) {
261 state |= CS_REPLY_DIR;
262 }
263 } else {
264 enum ct_update_res res;
265
266 res = conn_update(conn, &ct->buckets[bucket], pkt,
267 ctx->reply, now);
268
269 switch (res) {
270 case CT_UPDATE_VALID:
271 state |= CS_ESTABLISHED;
272 if (ctx->reply) {
273 state |= CS_REPLY_DIR;
274 }
275 break;
276 case CT_UPDATE_INVALID:
277 state |= CS_INVALID;
278 break;
279 case CT_UPDATE_NEW:
280 ovs_list_remove(&conn->exp_node);
281 hmap_remove(&ct->buckets[bucket].connections, &conn->node);
282 atomic_count_dec(&ct->n_conn);
283 delete_conn(conn);
284 conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
285 break;
286 default:
287 OVS_NOT_REACHED();
288 }
289 }
290 } else {
291 if (ctx->related) {
292 state |= CS_INVALID;
293 } else {
294 conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
295 }
296 }
297
298 write_ct_md(pkt, state, zone, conn, &ctx->key);
299
300 return conn;
301 }
302
303 /* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
304 * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
305 * the l3 and l4 offsets properly set.
306 *
307 * If 'commit' is true, the packets are allowed to create new entries in the
308 * connection tables. 'setmark', if not NULL, should point to a two-element
309 * array containing a value and a mask to set the connection mark.
310 * 'setlabel' behaves similarly for the connection label. */
311 int
312 conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
313 ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
314 const uint32_t *setmark,
315 const struct ovs_key_ct_labels *setlabel,
316 const char *helper)
317 {
318 struct dp_packet **pkts = pkt_batch->packets;
319 size_t cnt = pkt_batch->count;
320 #if !defined(__CHECKER__) && !defined(_WIN32)
321 const size_t KEY_ARRAY_SIZE = cnt;
322 #else
323 enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };
324 #endif
325 struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];
326 int8_t bucket_list[CONNTRACK_BUCKETS];
327 struct {
328 unsigned bucket;
329 unsigned long maps;
330 } arr[KEY_ARRAY_SIZE];
331 long long now = time_msec();
332 size_t i = 0;
333 uint8_t arrcnt = 0;
334
335 BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST);
336
337 if (helper) {
338 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
339
340 VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
341 /* Continue without the helper */
342 }
343
344 memset(bucket_list, INT8_C(-1), sizeof bucket_list);
345 for (i = 0; i < cnt; i++) {
346 unsigned bucket;
347
348 if (!conn_key_extract(ct, pkts[i], dl_type, &ctxs[i], zone)) {
349 write_ct_md(pkts[i], CS_INVALID, zone, NULL, NULL);
350 continue;
351 }
352
353 bucket = hash_to_bucket(ctxs[i].hash);
354 if (bucket_list[bucket] == INT8_C(-1)) {
355 bucket_list[bucket] = arrcnt;
356
357 arr[arrcnt].maps = 0;
358 ULLONG_SET1(arr[arrcnt].maps, i);
359 arr[arrcnt++].bucket = bucket;
360 } else {
361 ULLONG_SET1(arr[bucket_list[bucket]].maps, i);
362 }
363 }
364
365 for (i = 0; i < arrcnt; i++) {
366 struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket];
367 size_t j;
368
369 ct_lock_lock(&ctb->lock);
370
371 ULLONG_FOR_EACH_1(j, arr[i].maps) {
372 struct conn *conn;
373
374 conn_key_lookup(ctb, &ctxs[j], now);
375
376 conn = process_one(ct, pkts[j], &ctxs[j], zone, force, commit,
377 now);
378
379 if (conn && setmark) {
380 set_mark(pkts[j], conn, setmark[0], setmark[1]);
381 }
382
383 if (conn && setlabel) {
384 set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
385 }
386 }
387 ct_lock_unlock(&ctb->lock);
388 }
389
390 return 0;
391 }
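/* A hedged usage sketch of the batch API above (hypothetical caller,
 * compiled out). 'setmark' is the two-element {value, mask} pair described
 * in the comment on conntrack_execute(). */
#if 0
static void
conntrack_execute_sketch(struct conntrack *ct, struct dp_packet_batch *batch)
{
    uint32_t setmark[2] = { 0x1, UINT32_MAX };  /* Set the whole ct_mark. */

    /* Commit IPv4 connections in zone 5 and mark them. */
    conntrack_execute(ct, batch, htons(ETH_TYPE_IP), false, true, 5,
                      setmark, NULL, NULL);
}
#endif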
392
393 static void
394 set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
395 {
396 pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
397 conn->mark = pkt->md.ct_mark;
398 }
399
400 static void
401 set_label(struct dp_packet *pkt, struct conn *conn,
402 const struct ovs_key_ct_labels *val,
403 const struct ovs_key_ct_labels *mask)
404 {
405 ovs_u128 v, m;
406
407 memcpy(&v, val, sizeof v);
408 memcpy(&m, mask, sizeof m);
409
410 pkt->md.ct_label.u64.lo = v.u64.lo
411 | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
412 pkt->md.ct_label.u64.hi = v.u64.hi
413 | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
414 conn->label = pkt->md.ct_label;
415 }
416 \f
417 /* Delete the expired connections from 'ctb', up to 'limit'. Returns the
418 * earliest expiration time among the remaining connections in 'ctb', or
419 * LLONG_MAX if 'ctb' is empty. The return value might be smaller than 'now'
420 * if 'limit' is reached. */
421 static long long
422 sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
423 size_t limit)
424 OVS_REQUIRES(ctb->lock)
425 {
426 struct conn *conn, *next;
427 long long min_expiration = LLONG_MAX;
428 unsigned i;
429 size_t count = 0;
430
431 for (i = 0; i < N_CT_TM; i++) {
432 LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
433 if (!conn_expired(conn, now) || count >= limit) {
434 min_expiration = MIN(min_expiration, conn->expiration);
435 if (count >= limit) {
436 /* Do not check other lists. */
437 COVERAGE_INC(conntrack_long_cleanup);
438 return min_expiration;
439 }
440 break;
441 }
442 ovs_list_remove(&conn->exp_node);
443 hmap_remove(&ctb->connections, &conn->node);
444 atomic_count_dec(&ct->n_conn);
445 delete_conn(conn);
446 count++;
447 }
448 }
449
450 return min_expiration;
451 }
452
453 /* Cleans up old connection entries from 'ct'. Returns the time when the
454 * next expiration might happen. The return value might be smaller than
455 * 'now', meaning that an internal limit has been reached, and some expired
456 * connections have not been deleted. */
457 static long long
458 conntrack_clean(struct conntrack *ct, long long now)
459 {
460 long long next_wakeup = now + CT_TM_MIN;
461 unsigned int n_conn_limit;
462 size_t clean_count = 0;
463 unsigned i;
464
465 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
466
467 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
468 struct conntrack_bucket *ctb = &ct->buckets[i];
469 size_t prev_count;
470 long long min_exp;
471
472 ovs_mutex_lock(&ctb->cleanup_mutex);
473 if (ctb->next_cleanup > now) {
474 goto next_bucket;
475 }
476
477 ct_lock_lock(&ctb->lock);
478 prev_count = hmap_count(&ctb->connections);
479 /* If the connections are well distributed among buckets, we want to
480 * limit to 10% of the global limit equally split among buckets. If
481 * the bucket is busier than the others, we limit to 10% of its
482 * current size. */
483 min_exp = sweep_bucket(ct, ctb, now,
484 MAX(prev_count/10, n_conn_limit/(CONNTRACK_BUCKETS*10)));
485 clean_count += prev_count - hmap_count(&ctb->connections);
486
487 if (min_exp > now) {
488 /* We call hmap_shrink() only if sweep_bucket() managed to delete
489 * every expired connection. */
490 hmap_shrink(&ctb->connections);
491 }
492
493 ct_lock_unlock(&ctb->lock);
494
495 ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);
496
497 next_bucket:
498 next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
499 ovs_mutex_unlock(&ctb->cleanup_mutex);
500 }
501
502 VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
503 clean_count, time_msec() - now);
504
505 return next_wakeup;
506 }
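/* A worked example of the sweep limit above: with the default n_conn_limit
 * of 3,000,000 and, say, 256 buckets, the global floor is
 * 3,000,000 / (256 * 10) ~= 1171 deletions per bucket per pass, while a
 * bucket that has grown to 50,000 connections is allowed 50,000 / 10 =
 * 5,000. Busy buckets therefore drain faster, but each pass stays bounded. */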
507
508 /* Cleanup:
509 *
510 * We must call conntrack_clean() periodically. Its return value gives a
511 * hint about when the next cleanup must be done (either because
512 * there is an actual connection that expires, or because a new connection
513 * might be created with the minimum timeout).
514 *
515 * The logic below has two goals:
516 *
517 * - We want to reduce the number of wakeups and batch connection cleanup
518 * when the load is not very high. CT_CLEAN_INTERVAL ensures that if we
519 * are coping with the current cleanup tasks, then we wait at least
520 * 5 seconds to do further cleanup.
521 *
522 * - We don't want to keep the buckets locked too long, as we might prevent
523 * traffic from flowing. CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
524 * behind, there are at least 200 ms stretches of time when buckets will be
525 * left alone, so the datapath can operate unhindered.
526 */
527 #define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
528 #define CT_CLEAN_MIN_INTERVAL 200 /* 0.2 seconds */
529
530 static void *
531 clean_thread_main(void *f_)
532 {
533 struct conntrack *ct = f_;
534
535 while (!latch_is_set(&ct->clean_thread_exit)) {
536 long long next_wake;
537 long long now = time_msec();
538
539 next_wake = conntrack_clean(ct, now);
540
541 if (next_wake < now) {
542 poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
543 } else {
544 poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
545 }
546 latch_wait(&ct->clean_thread_exit);
547 poll_block();
548 }
549
550 return NULL;
551 }
552 \f
553 /* Key extraction */
554
555 /* The function stores a pointer to the first byte after the header in
556 * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
557 * not interested in the header's tail, meaning that the header has
558 * already been parsed (e.g. by flow_extract): we take this as a hint to
559 * save a few checks. If 'validate_checksum' is true, the function returns
560 * false if the IPv4 checksum is invalid. */
561 static inline bool
562 extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
563 const char **new_data, bool validate_checksum)
564 {
565 const struct ip_header *ip = data;
566 size_t ip_len;
567
568 if (new_data) {
569 if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
570 return false;
571 }
572 }
573
574 ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
575
576 if (new_data) {
577 if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
578 return false;
579 }
580 if (OVS_UNLIKELY(size < ip_len)) {
581 return false;
582 }
583
584 *new_data = (char *) data + ip_len;
585 }
586
587 if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
588 return false;
589 }
590
591 if (validate_checksum && csum(data, ip_len) != 0) {
592 return false;
593 }
594
595 key->src.addr.ipv4 = ip->ip_src;
596 key->dst.addr.ipv4 = ip->ip_dst;
597 key->nw_proto = ip->ip_proto;
598
599 return true;
600 }
601
602 /* The function stores a pointer to the first byte after the header in
603 * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is
604 * not interested in the header's tail, meaning that the header has
605 * already been parsed (e.g. by flow_extract): we take this as a hint to
606 * save a few checks. */
607 static inline bool
608 extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
609 const char **new_data)
610 {
611 const struct ovs_16aligned_ip6_hdr *ip6 = data;
612 if (new_data) {
613 if (OVS_UNLIKELY(size < sizeof *ip6)) {
614 return false;
615 }
616 }
617
618 uint8_t nw_proto = ip6->ip6_nxt;
619 uint8_t nw_frag = 0;
620
621 data = ip6 + 1;
622 size -= sizeof *ip6;
623
624 if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
625 return false;
626 }
627
628 if (new_data) {
629 *new_data = data;
630 }
631
632 if (nw_frag) {
633 return false;
634 }
635
636 key->src.addr.ipv6 = ip6->ip6_src;
637 key->dst.addr.ipv6 = ip6->ip6_dst;
638 key->nw_proto = nw_proto;
639
640 return true;
641 }
642
643 static inline bool
644 checksum_valid(const struct conn_key *key, const void *data, size_t size,
645 const void *l3)
646 {
647 uint32_t csum = 0;
648
649 if (key->dl_type == htons(ETH_TYPE_IP)) {
650 csum = packet_csum_pseudoheader(l3);
651 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
652 csum = packet_csum_pseudoheader6(l3);
653 } else {
654 return false;
655 }
656
657 csum = csum_continue(csum, data, size);
658
659 return csum_finish(csum) == 0;
660 }
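/* For a valid segment, the ones'-complement sum of the pseudo-header and of
 * the L4 header plus payload (which already includes the transmitted
 * checksum field) folds to 0xffff, so csum_finish() yields 0. Any corrupted
 * or miscomputed checksum produces a nonzero result and the packet is
 * rejected. */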
661
662 static inline bool
663 check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
664 const void *l3)
665 {
666 const struct tcp_header *tcp = data;
667 if (size < sizeof *tcp) {
668 return false;
669 }
670
671 size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
672 if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
673 return false;
674 }
675
676 return checksum_valid(key, data, size, l3);
677 }
678
679 static inline bool
680 check_l4_udp(const struct conn_key *key, const void *data, size_t size,
681 const void *l3)
682 {
683 const struct udp_header *udp = data;
684 if (size < sizeof *udp) {
685 return false;
686 }
687
688 size_t udp_len = ntohs(udp->udp_len);
689 if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
690 return false;
691 }
692
693 /* Validation must be skipped if the checksum is 0 on IPv4 packets. */
694 return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
695 || checksum_valid(key, data, size, l3);
696 }
697
698 static inline bool
699 check_l4_icmp(const void *data, size_t size)
700 {
701 return csum(data, size) == 0;
702 }
703
704 static inline bool
705 check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
706 const void *l3)
707 {
708 return checksum_valid(key, data, size, l3);
709 }
710
711 static inline bool
712 extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
713 {
714 const struct tcp_header *tcp = data;
715
716 if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
717 return false;
718 }
719
720 key->src.port = tcp->tcp_src;
721 key->dst.port = tcp->tcp_dst;
722
723 /* Port 0 is invalid */
724 return key->src.port && key->dst.port;
725 }
726
727 static inline bool
728 extract_l4_udp(struct conn_key *key, const void *data, size_t size)
729 {
730 const struct udp_header *udp = data;
731
732 if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
733 return false;
734 }
735
736 key->src.port = udp->udp_src;
737 key->dst.port = udp->udp_dst;
738
739 /* Port 0 is invalid */
740 return key->src.port && key->dst.port;
741 }
742
743 static inline bool extract_l4(struct conn_key *key, const void *data,
744 size_t size, bool *related, const void *l3);
745
746 static uint8_t
747 reverse_icmp_type(uint8_t type)
748 {
749 switch (type) {
750 case ICMP4_ECHO_REQUEST:
751 return ICMP4_ECHO_REPLY;
752 case ICMP4_ECHO_REPLY:
753 return ICMP4_ECHO_REQUEST;
754
755 case ICMP4_TIMESTAMP:
756 return ICMP4_TIMESTAMPREPLY;
757 case ICMP4_TIMESTAMPREPLY:
758 return ICMP4_TIMESTAMP;
759
760 case ICMP4_INFOREQUEST:
761 return ICMP4_INFOREPLY;
762 case ICMP4_INFOREPLY:
763 return ICMP4_INFOREQUEST;
764 default:
765 OVS_NOT_REACHED();
766 }
767 }
768
769 /* If 'related' is not NULL and the function is processing an ICMP
770 * error packet, extract the l3 and l4 fields from the nested header
771 * instead and set '*related' to true. If 'related' is NULL, we're
772 * already processing a nested header and no such recursion is
773 * possible. */
774 static inline bool
775 extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
776 bool *related)
777 {
778 const struct icmp_header *icmp = data;
779
780 if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
781 return false;
782 }
783
784 switch (icmp->icmp_type) {
785 case ICMP4_ECHO_REQUEST:
786 case ICMP4_ECHO_REPLY:
787 case ICMP4_TIMESTAMP:
788 case ICMP4_TIMESTAMPREPLY:
789 case ICMP4_INFOREQUEST:
790 case ICMP4_INFOREPLY:
791 if (icmp->icmp_code != 0) {
792 return false;
793 }
794 /* Separate ICMP connection: identified using id */
795 key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
796 key->src.icmp_type = icmp->icmp_type;
797 key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
798 break;
799 case ICMP4_DST_UNREACH:
800 case ICMP4_TIME_EXCEEDED:
801 case ICMP4_PARAM_PROB:
802 case ICMP4_SOURCEQUENCH:
803 case ICMP4_REDIRECT: {
804 /* ICMP packet that is part of another connection. We should
805 * extract the key from the embedded packet header. */
806 struct conn_key inner_key;
807 const char *l3 = (const char *) (icmp + 1);
808 const char *tail = (const char *) data + size;
809 const char *l4;
810 bool ok;
811
812 if (!related) {
813 return false;
814 }
815
816 memset(&inner_key, 0, sizeof inner_key);
817 inner_key.dl_type = htons(ETH_TYPE_IP);
818 ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
819 if (!ok) {
820 return false;
821 }
822
823 /* pf doesn't do this, but it seems a good idea */
824 if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
825 || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
826 return false;
827 }
828
829 key->src = inner_key.src;
830 key->dst = inner_key.dst;
831 key->nw_proto = inner_key.nw_proto;
832
833 ok = extract_l4(key, l4, tail - l4, NULL, l3);
834 if (ok) {
835 conn_key_reverse(key);
836 *related = true;
837 }
838 return ok;
839 }
840 default:
841 return false;
842 }
843
844 return true;
845 }
846
847 static uint8_t
848 reverse_icmp6_type(uint8_t type)
849 {
850 switch (type) {
851 case ICMP6_ECHO_REQUEST:
852 return ICMP6_ECHO_REPLY;
853 case ICMP6_ECHO_REPLY:
854 return ICMP6_ECHO_REQUEST;
855 default:
856 OVS_NOT_REACHED();
857 }
858 }
859
860 /* If 'related' is not NULL and the function is processing an ICMP
861 * error packet, extract the l3 and l4 fields from the nested header
862 * instead and set '*related' to true. If 'related' is NULL, we're
863 * already processing a nested header and no such recursion is
864 * possible. */
865 static inline bool
866 extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
867 bool *related)
868 {
869 const struct icmp6_header *icmp6 = data;
870
871 /* All the messages that we support need at least 4 bytes after
872 * the header */
873 if (size < sizeof *icmp6 + 4) {
874 return false;
875 }
876
877 switch (icmp6->icmp6_type) {
878 case ICMP6_ECHO_REQUEST:
879 case ICMP6_ECHO_REPLY:
880 if (icmp6->icmp6_code != 0) {
881 return false;
882 }
883 /* Separate ICMP connection: identified using id */
884 key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
885 key->src.icmp_type = icmp6->icmp6_type;
886 key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
887 break;
888 case ICMP6_DST_UNREACH:
889 case ICMP6_PACKET_TOO_BIG:
890 case ICMP6_TIME_EXCEEDED:
891 case ICMP6_PARAM_PROB: {
892 /* ICMP packet that is part of another connection. We should
893 * extract the key from the embedded packet header. */
894 struct conn_key inner_key;
895 const char *l3 = (const char *) icmp6 + 8;
896 const char *tail = (const char *) data + size;
897 const char *l4 = NULL;
898 bool ok;
899
900 if (!related) {
901 return false;
902 }
903
904 memset(&inner_key, 0, sizeof inner_key);
905 inner_key.dl_type = htons(ETH_TYPE_IPV6);
906 ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
907 if (!ok) {
908 return false;
909 }
910
911 /* pf doesn't do this, but it seems a good idea */
912 if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
913 &key->dst.addr.ipv6_aligned)
914 || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
915 &key->src.addr.ipv6_aligned)) {
916 return false;
917 }
918
919 key->src = inner_key.src;
920 key->dst = inner_key.dst;
921 key->nw_proto = inner_key.nw_proto;
922
923 ok = extract_l4(key, l4, tail - l4, NULL, l3);
924 if (ok) {
925 conn_key_reverse(key);
926 *related = true;
927 }
928 return ok;
929 }
930 default:
931 return false;
932 }
933
934 return true;
935 }
936
937 /* Extract l4 fields into 'key', which must already contain valid l3
938 * members.
939 *
940 * If 'related' is not NULL and an ICMP error packet is being
941 * processed, the function will extract the key from the packet nested
942 * in the ICMP payload and set '*related' to true.
943 *
944 * If 'related' is NULL, it means that we're already parsing a header nested
945 * in an ICMP error. In this case, we skip checksum and length validation. */
946 static inline bool
947 extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
948 const void *l3)
949 {
950 if (key->nw_proto == IPPROTO_TCP) {
951 return (!related || check_l4_tcp(key, data, size, l3))
952 && extract_l4_tcp(key, data, size);
953 } else if (key->nw_proto == IPPROTO_UDP) {
954 return (!related || check_l4_udp(key, data, size, l3))
955 && extract_l4_udp(key, data, size);
956 } else if (key->dl_type == htons(ETH_TYPE_IP)
957 && key->nw_proto == IPPROTO_ICMP) {
958 return (!related || check_l4_icmp(data, size))
959 && extract_l4_icmp(key, data, size, related);
960 } else if (key->dl_type == htons(ETH_TYPE_IPV6)
961 && key->nw_proto == IPPROTO_ICMPV6) {
962 return (!related || check_l4_icmp6(key, data, size, l3))
963 && extract_l4_icmp6(key, data, size, related);
964 } else {
965 return false;
966 }
967 }
968
969 static bool
970 conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
971 struct conn_lookup_ctx *ctx, uint16_t zone)
972 {
973 const struct eth_header *l2 = dp_packet_eth(pkt);
974 const struct ip_header *l3 = dp_packet_l3(pkt);
975 const char *l4 = dp_packet_l4(pkt);
976 const char *tail = dp_packet_tail(pkt);
977 bool ok;
978
979 memset(ctx, 0, sizeof *ctx);
980
981 if (!l2 || !l3 || !l4) {
982 return false;
983 }
984
985 ctx->key.zone = zone;
986
987 /* XXX In this function we parse the packet (again, it has already
988 * gone through miniflow_extract()) for two reasons:
989 *
990 * 1) To extract the l3 addresses and l4 ports.
991 * We already have the l3 and l4 headers' pointers. Extracting
992 * the l3 addresses and the l4 ports is really cheap, since they
993 * can be found at fixed locations.
994 * 2) To extract the l4 type.
995 * Extracting the l4 type, for IPv6, can be quite expensive, because
996 * it's not at a fixed location.
997 *
998 * Here's a way to avoid (2) with the help of the datapath.
999 * The datapath doesn't keep the packet's extracted flow[1], so
1000 * using that is not an option. We could use the packet's matching
1001 * megaflow, but we have to make sure that the l4 type (nw_proto)
1002 * is unwildcarded. This means either:
1003 *
1004 * a) dpif-netdev unwildcards the l4 type when a new flow is installed
1005 * if the actions contain ct().
1006 *
1007 * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
1008 * action. This is already done in different actions, but it's
1009 * unnecessary for the kernel.
1010 *
1011 * ---
1012 * [1] The reasons for this are that keeping the flow increases
1013 * (slightly) the cache footprint and increases computation
1014 * time as we move the packet around. Most importantly, the flow
1015 * should be updated by the actions and this can be slow, as
1016 * we use a sparse representation (miniflow).
1017 *
1018 */
1019 ctx->key.dl_type = dl_type;
1020 if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
1021 ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, true);
1022 } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
1023 ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
1024 } else {
1025 ok = false;
1026 }
1027
1028 if (ok) {
1029 if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
1030 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
1031 return true;
1032 }
1033 }
1034
1035 return false;
1036 }
1037 \f
1038 /* Symmetric */
1039 static uint32_t
1040 conn_key_hash(const struct conn_key *key, uint32_t basis)
1041 {
1042 uint32_t hsrc, hdst, hash;
1043 int i;
1044
1045 hsrc = hdst = basis;
1046
1047 /* Hash the source and destination tuple */
1048 for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
1049 hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
1050 hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
1051 }
1052
1053 /* Even if source and destination are swapped, the hash will be the same. */
1054 hash = hsrc ^ hdst;
1055
1056 /* Hash the rest of the key (L3 and L4 types, and zone). */
1057 hash = hash_words((uint32_t *) (&key->dst + 1),
1058 (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
1059 hash);
1060
1061 return hash;
1062 }
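/* The symmetry means a key and its reverse hash identically, so both
 * directions of a connection land in the same bucket. A sketch of the
 * property (hypothetical check, not part of the build):
 *
 *     struct conn_key r = k;
 *     conn_key_reverse(&r);
 *     ovs_assert(conn_key_hash(&k, basis) == conn_key_hash(&r, basis));
 *
 * It holds because 'hsrc ^ hdst' is commutative in the two endpoints and
 * the trailing fields (dl_type, nw_proto, zone) are direction-independent. */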
1063
1064 static void
1065 conn_key_reverse(struct conn_key *key)
1066 {
1067 struct ct_endpoint tmp;
1068
1069 tmp = key->src;
1070 key->src = key->dst;
1071 key->dst = tmp;
1072 }
1073
1074 static void
1075 conn_key_lookup(struct conntrack_bucket *ctb,
1076 struct conn_lookup_ctx *ctx,
1077 long long now)
1078 {
1079 uint32_t hash = ctx->hash;
1080 struct conn *conn;
1081
1082 ctx->conn = NULL;
1083
1084 HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
1085 if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))
1086 && !conn_expired(conn, now)) {
1087 ctx->conn = conn;
1088 ctx->reply = false;
1089 break;
1090 }
1091 if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))
1092 && !conn_expired(conn, now)) {
1093 ctx->conn = conn;
1094 ctx->reply = true;
1095 break;
1096 }
1097 }
1098 }
1099
1100 static enum ct_update_res
1101 conn_update(struct conn *conn, struct conntrack_bucket *ctb,
1102 struct dp_packet *pkt, bool reply, long long now)
1103 {
1104 return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
1105 reply, now);
1106 }
1107
1108 static bool
1109 conn_expired(struct conn *conn, long long now)
1110 {
1111 return now >= conn->expiration;
1112 }
1113
1114 static bool
1115 valid_new(struct dp_packet *pkt, struct conn_key *key)
1116 {
1117 return l4_protos[key->nw_proto]->valid_new(pkt);
1118 }
1119
1120 static struct conn *
1121 new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
1122 struct conn_key *key, long long now)
1123 {
1124 struct conn *newconn;
1125
1126 newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
1127
1128 if (newconn) {
1129 newconn->key = *key;
1130 }
1131
1132 return newconn;
1133 }
1134
1135 static void
1136 delete_conn(struct conn *conn)
1137 {
1138 free(conn);
1139 }
1140 \f
1141 static void
1142 ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
1143 union ct_dpif_inet_addr *b,
1144 ovs_be16 dl_type)
1145 {
1146 if (dl_type == htons(ETH_TYPE_IP)) {
1147 b->ip = a->ipv4_aligned;
1148 } else if (dl_type == htons(ETH_TYPE_IPV6)) {
1149 b->in6 = a->ipv6_aligned;
1150 }
1151 }
1152
1153 static void
1154 conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
1155 {
1156 if (key->dl_type == htons(ETH_TYPE_IP)) {
1157 tuple->l3_type = AF_INET;
1158 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
1159 tuple->l3_type = AF_INET6;
1160 }
1161 tuple->ip_proto = key->nw_proto;
1162 ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
1163 key->dl_type);
1164 ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
1165 key->dl_type);
1166
1167 if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
1168 tuple->icmp_id = key->src.icmp_id;
1169 tuple->icmp_type = key->src.icmp_type;
1170 tuple->icmp_code = key->src.icmp_code;
1171 } else {
1172 tuple->src_port = key->src.port;
1173 tuple->dst_port = key->dst.port;
1174 }
1175 }
1176
1177 static void
1178 conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
1179 long long now)
1180 {
1181 struct ct_l4_proto *class;
1182 long long expiration;
1183 memset(entry, 0, sizeof *entry);
1184 conn_key_to_tuple(&conn->key, &entry->tuple_orig);
1185 conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
1186
1187 entry->zone = conn->key.zone;
1188 entry->mark = conn->mark;
1189
1190 memcpy(&entry->labels, &conn->label, sizeof(entry->labels));
1191 /* Not implemented yet */
1192 entry->timestamp.start = 0;
1193 entry->timestamp.stop = 0;
1194
1195 expiration = conn->expiration - now;
1196 entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
1197
1198 class = l4_protos[conn->key.nw_proto];
1199 if (class->conn_get_protoinfo) {
1200 class->conn_get_protoinfo(conn, &entry->protoinfo);
1201 }
1202 }
1203
1204 int
1205 conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
1206 const uint16_t *pzone)
1207 {
1208 memset(dump, 0, sizeof(*dump));
1209 if (pzone) {
1210 dump->zone = *pzone;
1211 dump->filter_zone = true;
1212 }
1213 dump->ct = ct;
1214
1215 return 0;
1216 }
1217
1218 int
1219 conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
1220 {
1221 struct conntrack *ct = dump->ct;
1222 long long now = time_msec();
1223
1224 while (dump->bucket < CONNTRACK_BUCKETS) {
1225 struct hmap_node *node;
1226
1227 ct_lock_lock(&ct->buckets[dump->bucket].lock);
1228 for (;;) {
1229 struct conn *conn;
1230
1231 node = hmap_at_position(&ct->buckets[dump->bucket].connections,
1232 &dump->bucket_pos);
1233 if (!node) {
1234 break;
1235 }
1236 INIT_CONTAINER(conn, node, node);
1237 if (!dump->filter_zone || conn->key.zone == dump->zone) {
1238 conn_to_ct_dpif_entry(conn, entry, now);
1239 break;
1240 }
1241 /* Else continue, until we find an entry in the appropriate zone
1242 * or the bucket has been scanned completely. */
1243 }
1244 ct_lock_unlock(&ct->buckets[dump->bucket].lock);
1245
1246 if (!node) {
1247 memset(&dump->bucket_pos, 0, sizeof dump->bucket_pos);
1248 dump->bucket++;
1249 } else {
1250 return 0;
1251 }
1252 }
1253 return EOF;
1254 }
1255
1256 int
1257 conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
1258 {
1259 return 0;
1260 }
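/* A hedged sketch of the dump protocol above (hypothetical caller, compiled
 * out): conntrack_dump_next() returns 0 while entries remain and EOF once
 * every bucket has been scanned. */
#if 0
static void
conntrack_dump_sketch(struct conntrack *ct)
{
    struct conntrack_dump dump;
    struct ct_dpif_entry entry;
    uint16_t zone = 0;

    conntrack_dump_start(ct, &dump, &zone);
    while (conntrack_dump_next(&dump, &entry) != EOF) {
        /* Inspect 'entry', e.g. entry.tuple_orig and entry.timeout. */
    }
    conntrack_dump_done(&dump);
}
#endif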
1261
1262 int
1263 conntrack_flush(struct conntrack *ct, const uint16_t *zone)
1264 {
1265 unsigned i;
1266
1267 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
1268 struct conn *conn, *next;
1269
1270 ct_lock_lock(&ct->buckets[i].lock);
1271 HMAP_FOR_EACH_SAFE(conn, next, node, &ct->buckets[i].connections) {
1272 if (!zone || *zone == conn->key.zone) {
1273 ovs_list_remove(&conn->exp_node);
1274 hmap_remove(&ct->buckets[i].connections, &conn->node);
1275 atomic_count_dec(&ct->n_conn);
1276 delete_conn(conn);
1277 }
1278 }
1279 ct_lock_unlock(&ct->buckets[i].lock);
1280 }
1281
1282 return 0;
1283 }