/*
 * Copyright (c) 2015, 2016 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "conntrack.h"

#include <errno.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>

#include "bitmap.h"
#include "conntrack-private.h"
#include "coverage.h"
#include "csum.h"
#include "dp-packet.h"
#include "flow.h"
#include "netdev.h"
#include "odp-netlink.h"
#include "openvswitch/hmap.h"
#include "openvswitch/vlog.h"
#include "ovs-rcu.h"
#include "ovs-thread.h"
#include "poll-loop.h"
#include "random.h"
#include "timeval.h"

VLOG_DEFINE_THIS_MODULE(conntrack);

COVERAGE_DEFINE(conntrack_full);
COVERAGE_DEFINE(conntrack_long_cleanup);

struct conn_lookup_ctx {
    struct conn_key key;
    struct conn *conn;
    uint32_t hash;
    bool reply;
    bool related;
};

static bool conn_key_extract(struct conntrack *, struct dp_packet *,
                             struct conn_lookup_ctx *, uint16_t zone);
static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
static void conn_key_reverse(struct conn_key *);
static void conn_key_lookup(struct conntrack_bucket *ctb,
                            struct conn_lookup_ctx *ctx,
                            long long now);
static bool valid_new(struct dp_packet *pkt, struct conn_key *);
static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
                             struct conn_key *, long long now);
static void delete_conn(struct conn *);
static enum ct_update_res conn_update(struct conn *,
                                      struct conntrack_bucket *ctb,
                                      struct dp_packet *, bool reply,
                                      long long now);
static bool conn_expired(struct conn *, long long now);
static void set_mark(struct dp_packet *, struct conn *,
                     uint32_t val, uint32_t mask);
static void set_label(struct dp_packet *, struct conn *,
                      const struct ovs_key_ct_labels *val,
                      const struct ovs_key_ct_labels *mask);
static void *clean_thread_main(void *f_);

static struct ct_l4_proto *l4_protos[] = {
    [IPPROTO_TCP] = &ct_proto_tcp,
    [IPPROTO_UDP] = &ct_proto_other,
    [IPPROTO_ICMP] = &ct_proto_other,
    [IPPROTO_ICMPV6] = &ct_proto_other,
};

long long ct_timeout_val[] = {
#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
    CT_TIMEOUTS
#undef CT_TIMEOUT
};

/* If the total number of connections goes above this value, no new
 * connections are accepted. */
#define DEFAULT_N_CONN_LIMIT 3000000

/* Initializes the connection tracker 'ct'.  The caller is responsible for
 * calling 'conntrack_destroy()' when the instance is no longer needed. */
void
conntrack_init(struct conntrack *ct)
{
    unsigned i, j;
    long long now = time_msec();

    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];

        ct_lock_init(&ctb->lock);
        ct_lock_lock(&ctb->lock);
        hmap_init(&ctb->connections);
        for (j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
            ovs_list_init(&ctb->exp_lists[j]);
        }
        ct_lock_unlock(&ctb->lock);
        ovs_mutex_init(&ctb->cleanup_mutex);
        ovs_mutex_lock(&ctb->cleanup_mutex);
        ctb->next_cleanup = now + CT_TM_MIN;
        ovs_mutex_unlock(&ctb->cleanup_mutex);
    }
    ct->hash_basis = random_uint32();
    atomic_count_init(&ct->n_conn, 0);
    atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
    latch_init(&ct->clean_thread_exit);
    ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
}
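
/* A minimal lifecycle sketch.  The enclosing context is hypothetical (a
 * caller is assumed to own a 'struct conntrack' and a filled packet batch);
 * it only illustrates the expected init/execute/destroy sequence:
 *
 *     struct conntrack ct;
 *
 *     conntrack_init(&ct);
 *     ...
 *     conntrack_execute(&ct, &batch, true, 0, NULL, NULL, NULL);
 *     ...
 *     conntrack_destroy(&ct);
 */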

/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
void
conntrack_destroy(struct conntrack *ct)
{
    unsigned i;

    latch_set(&ct->clean_thread_exit);
    pthread_join(ct->clean_thread, NULL);
    latch_destroy(&ct->clean_thread_exit);
    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];
        struct conn *conn;

        ovs_mutex_destroy(&ctb->cleanup_mutex);
        ct_lock_lock(&ctb->lock);
        HMAP_FOR_EACH_POP(conn, node, &ctb->connections) {
            atomic_count_dec(&ct->n_conn);
            delete_conn(conn);
        }
        hmap_destroy(&ctb->connections);
        ct_lock_unlock(&ctb->lock);
        ct_lock_destroy(&ctb->lock);
    }
}
\f
static unsigned hash_to_bucket(uint32_t hash)
{
    /* Extracts the most significant bits in hash.  The least significant
     * bits are already used internally by the hmap implementation. */
    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);

    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
}
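
/* Worked example, assuming CONNTRACK_BUCKETS_SHIFT is 8 (so that
 * CONNTRACK_BUCKETS is 1 << 8 == 256; the actual values live outside this
 * file): hash >> (32 - 8) keeps the top byte, e.g. 0xdeadbeef >> 24 == 0xde
 * == bucket 222, and the final modulo is then a no-op that only guards the
 * array bound. */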

static void
write_ct_md(struct dp_packet *pkt, uint16_t state, uint16_t zone,
            uint32_t mark, ovs_u128 label)
{
    pkt->md.ct_state = state | CS_TRACKED;
    pkt->md.ct_zone = zone;
    pkt->md.ct_mark = mark;
    pkt->md.ct_label = label;
}

static struct conn *
conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
               struct conn_lookup_ctx *ctx, uint16_t *state, bool commit,
               long long now)
{
    unsigned bucket = hash_to_bucket(ctx->hash);
    struct conn *nc = NULL;

    if (!valid_new(pkt, &ctx->key)) {
        *state |= CS_INVALID;
        return nc;
    }

    *state |= CS_NEW;

    if (commit) {
        unsigned int n_conn_limit;

        atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);

        if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
            COVERAGE_INC(conntrack_full);
            return nc;
        }

        nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);

        memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key);

        conn_key_reverse(&nc->rev_key);
        hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
        atomic_count_inc(&ct->n_conn);
    }

    return nc;
}

static struct conn *
process_one(struct conntrack *ct, struct dp_packet *pkt,
            struct conn_lookup_ctx *ctx, uint16_t zone,
            bool commit, long long now)
{
    unsigned bucket = hash_to_bucket(ctx->hash);
    struct conn *conn = ctx->conn;
    uint16_t state = 0;

    if (conn) {
        if (ctx->related) {
            state |= CS_RELATED;
            if (ctx->reply) {
                state |= CS_REPLY_DIR;
            }
        } else {
            enum ct_update_res res;

            res = conn_update(conn, &ct->buckets[bucket], pkt,
                              ctx->reply, now);

            switch (res) {
            case CT_UPDATE_VALID:
                state |= CS_ESTABLISHED;
                if (ctx->reply) {
                    state |= CS_REPLY_DIR;
                }
                break;
            case CT_UPDATE_INVALID:
                state |= CS_INVALID;
                break;
            case CT_UPDATE_NEW:
                ovs_list_remove(&conn->exp_node);
                hmap_remove(&ct->buckets[bucket].connections, &conn->node);
                atomic_count_dec(&ct->n_conn);
                delete_conn(conn);
                conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
                break;
            default:
                OVS_NOT_REACHED();
            }
        }
    } else {
        conn = conn_not_found(ct, pkt, ctx, &state, commit, now);
    }

    write_ct_md(pkt, state, zone, conn ? conn->mark : 0,
                conn ? conn->label : OVS_U128_ZERO);

    return conn;
}

/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
 * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
 * the l3 and l4 offset properly set.
 *
 * If 'commit' is true, the packets are allowed to create new entries in the
 * connection tables.  'setmark', if not NULL, should point to a two-element
 * array containing a value and a mask to set the connection mark.  'setlabel'
 * behaves similarly for the connection label. */
int
conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
                  bool commit, uint16_t zone, const uint32_t *setmark,
                  const struct ovs_key_ct_labels *setlabel,
                  const char *helper)
{
    struct dp_packet **pkts = pkt_batch->packets;
    size_t cnt = pkt_batch->count;
#if !defined(__CHECKER__) && !defined(_WIN32)
    const size_t KEY_ARRAY_SIZE = cnt;
#else
    enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST };
#endif
    struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE];
    int8_t bucket_list[CONNTRACK_BUCKETS];
    struct {
        unsigned bucket;
        unsigned long maps;
    } arr[KEY_ARRAY_SIZE];
    long long now = time_msec();
    size_t i = 0;
    uint8_t arrcnt = 0;

    BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST);

    if (helper) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);

        VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper);
        /* Continue without the helper. */
    }

    memset(bucket_list, INT8_C(-1), sizeof bucket_list);
    for (i = 0; i < cnt; i++) {
        unsigned bucket;

        if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) {
            write_ct_md(pkts[i], CS_INVALID, zone, 0, OVS_U128_ZERO);
            continue;
        }

        bucket = hash_to_bucket(ctxs[i].hash);
        if (bucket_list[bucket] == INT8_C(-1)) {
            bucket_list[bucket] = arrcnt;

            arr[arrcnt].maps = 0;
            ULLONG_SET1(arr[arrcnt].maps, i);
            arr[arrcnt++].bucket = bucket;
        } else {
            ULLONG_SET1(arr[bucket_list[bucket]].maps, i);
        }
    }

    for (i = 0; i < arrcnt; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[arr[i].bucket];
        size_t j;

        ct_lock_lock(&ctb->lock);

        ULLONG_FOR_EACH_1(j, arr[i].maps) {
            struct conn *conn;

            conn_key_lookup(ctb, &ctxs[j], now);

            conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now);

            if (conn && setmark) {
                set_mark(pkts[j], conn, setmark[0], setmark[1]);
            }

            if (conn && setlabel) {
                set_label(pkts[j], conn, &setlabel[0], &setlabel[1]);
            }
        }
        ct_lock_unlock(&ctb->lock);
    }

    return 0;
}
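
/* Hedged usage sketch for conntrack_execute(): the batch construction is
 * assumed to happen elsewhere in the datapath (the names below are not
 * defined in this file).
 *
 *     struct dp_packet_batch *batch = ...;        // L3/L4 offsets already set
 *     uint32_t setmark[2] = { 0x1, 0xffffffff };  // value, mask
 *
 *     conntrack_execute(ct, batch, true, zone, setmark, NULL, NULL);
 *
 * With 'commit' true, new connections may be created; 'setmark' here would
 * set the whole connection mark to 0x1 for tracked packets. */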

static void
set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
{
    pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
    conn->mark = pkt->md.ct_mark;
}

static void
set_label(struct dp_packet *pkt, struct conn *conn,
          const struct ovs_key_ct_labels *val,
          const struct ovs_key_ct_labels *mask)
{
    ovs_u128 v, m;

    memcpy(&v, val, sizeof v);
    memcpy(&m, mask, sizeof m);

    pkt->md.ct_label.u64.lo = v.u64.lo
                              | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
    pkt->md.ct_label.u64.hi = v.u64.hi
                              | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
    conn->label = pkt->md.ct_label;
}
\f
/* Delete the expired connections from 'ctb', up to 'limit'.  Returns the
 * earliest expiration time among the remaining connections in 'ctb'.  Returns
 * LLONG_MAX if 'ctb' is empty.  The return value might be smaller than 'now',
 * if 'limit' is reached. */
static long long
sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb, long long now,
             size_t limit)
    OVS_REQUIRES(ctb->lock)
{
    struct conn *conn, *next;
    long long min_expiration = LLONG_MAX;
    unsigned i;
    size_t count = 0;

    for (i = 0; i < N_CT_TM; i++) {
        LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
            if (!conn_expired(conn, now) || count >= limit) {
                min_expiration = MIN(min_expiration, conn->expiration);
                if (count >= limit) {
                    /* Do not check other lists. */
                    COVERAGE_INC(conntrack_long_cleanup);
                    return min_expiration;
                }
                break;
            }
            ovs_list_remove(&conn->exp_node);
            hmap_remove(&ctb->connections, &conn->node);
            atomic_count_dec(&ct->n_conn);
            delete_conn(conn);
            count++;
        }
    }

    return min_expiration;
}

/* Cleans up old connection entries from 'ct'.  Returns the time when the
 * next expiration might happen.  The return value might be smaller than
 * 'now', meaning that an internal limit has been reached, and some expired
 * connections have not been deleted. */
static long long
conntrack_clean(struct conntrack *ct, long long now)
{
    long long next_wakeup = now + CT_TM_MIN;
    unsigned int n_conn_limit;
    size_t clean_count = 0;
    unsigned i;

    atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);

    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];
        size_t prev_count;
        long long min_exp;

        ovs_mutex_lock(&ctb->cleanup_mutex);
        if (ctb->next_cleanup > now) {
            goto next_bucket;
        }

        ct_lock_lock(&ctb->lock);
        prev_count = hmap_count(&ctb->connections);
        /* If the connections are well distributed among buckets, we want to
         * limit to 10% of the global limit equally split among buckets.  If
         * the bucket is busier than the others, we limit to 10% of its
         * current size. */
        min_exp = sweep_bucket(ct, ctb, now,
                               MAX(prev_count / 10,
                                   n_conn_limit / (CONNTRACK_BUCKETS * 10)));
        clean_count += prev_count - hmap_count(&ctb->connections);

        if (min_exp > now) {
            /* We call hmap_shrink() only if sweep_bucket() managed to delete
             * every expired connection. */
            hmap_shrink(&ctb->connections);
        }

        ct_lock_unlock(&ctb->lock);

        ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);

next_bucket:
        next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
        ovs_mutex_unlock(&ctb->cleanup_mutex);
    }

    VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
             clean_count, time_msec() - now);

    return next_wakeup;
}
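
/* Rough numbers for the sweep limit above, assuming the default connection
 * limit of 3,000,000 and 256 buckets (a CONNTRACK_BUCKETS_SHIFT of 8, which
 * this file does not define): n_conn_limit/(CONNTRACK_BUCKETS*10) is
 * 3000000/2560, about 1171 deletions per bucket per pass, while a bucket
 * bloated to 50,000 entries gets MAX(50000/10, 1171) == 5000. */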

/* Cleanup:
 *
 * We must call conntrack_clean() periodically.  conntrack_clean()'s return
 * value gives a hint on when the next cleanup must be done (either because
 * there is an actual connection that expires, or because a new connection
 * might be created with the minimum timeout).
 *
 * The logic below has two goals:
 *
 * - We want to reduce the number of wakeups and batch connection cleanup
 *   when the load is not very high.  CT_CLEAN_INTERVAL ensures that if we
 *   are coping with the current cleanup tasks, then we wait at least
 *   5 seconds to do further cleanup.
 *
 * - We don't want to keep the buckets locked too long, as we might prevent
 *   traffic from flowing.  CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
 *   behind, there are at least 200 ms windows when the buckets are left
 *   alone, so the datapath can operate unhindered.
 */
#define CT_CLEAN_INTERVAL 5000     /* 5 seconds */
#define CT_CLEAN_MIN_INTERVAL 200  /* 0.2 seconds */

static void *
clean_thread_main(void *f_)
{
    struct conntrack *ct = f_;

    while (!latch_is_set(&ct->clean_thread_exit)) {
        long long next_wake;
        long long now = time_msec();

        next_wake = conntrack_clean(ct, now);

        if (next_wake < now) {
            poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
        } else {
            poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
        }
        latch_wait(&ct->clean_thread_exit);
        poll_block();
    }

    return NULL;
}
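
/* Concrete examples of the schedule above: if conntrack_clean() returns
 * now + 1000 (an entry expires in one second), the wait is stretched to
 * now + CT_CLEAN_INTERVAL to batch work, so the thread sleeps 5 seconds.
 * If it returns a value smaller than 'now' (a bucket hit its sweep limit),
 * the thread backs off for only CT_CLEAN_MIN_INTERVAL, i.e. 200 ms, leaving
 * the buckets unlocked in between. */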
\f
/* Key extraction */

/* The function stores a pointer to the first byte after the header in
 * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 * not interested in the header's tail, meaning that the header has
 * already been parsed (e.g. by flow_extract): we take this as a hint to
 * save a few checks.  If 'validate_checksum' is true, the function returns
 * false if the IPv4 checksum is invalid. */
static inline bool
extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
                const char **new_data, bool validate_checksum)
{
    const struct ip_header *ip = data;
    size_t ip_len;

    if (new_data) {
        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
            return false;
        }
    }

    ip_len = IP_IHL(ip->ip_ihl_ver) * 4;

    if (new_data) {
        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
            return false;
        }
        if (OVS_UNLIKELY(size < ip_len)) {
            return false;
        }

        *new_data = (char *) data + ip_len;
    }

    if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
        return false;
    }

    if (validate_checksum && csum(data, ip_len) != 0) {
        return false;
    }

    key->src.addr.ipv4 = ip->ip_src;
    key->dst.addr.ipv4 = ip->ip_dst;
    key->nw_proto = ip->ip_proto;

    return true;
}

/* The function stores a pointer to the first byte after the header in
 * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 * not interested in the header's tail, meaning that the header has
 * already been parsed (e.g. by flow_extract): we take this as a hint to
 * save a few checks. */
static inline bool
extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
                const char **new_data)
{
    const struct ovs_16aligned_ip6_hdr *ip6 = data;
    uint8_t nw_proto = ip6->ip6_nxt;
    uint8_t nw_frag = 0;

    if (new_data) {
        if (OVS_UNLIKELY(size < sizeof *ip6)) {
            return false;
        }
    }

    data = ip6 + 1;
    size -= sizeof *ip6;

    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
        return false;
    }

    if (new_data) {
        *new_data = data;
    }

    if (nw_frag) {
        return false;
    }

    key->src.addr.ipv6 = ip6->ip6_src;
    key->dst.addr.ipv6 = ip6->ip6_dst;
    key->nw_proto = nw_proto;

    return true;
}

static inline bool
checksum_valid(const struct conn_key *key, const void *data, size_t size,
               const void *l3)
{
    uint32_t csum = 0;

    if (key->dl_type == htons(ETH_TYPE_IP)) {
        csum = packet_csum_pseudoheader(l3);
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
        csum = packet_csum_pseudoheader6(l3);
    } else {
        return false;
    }

    csum = csum_continue(csum, data, size);

    return csum_finish(csum) == 0;
}

static inline bool
check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
             const void *l3)
{
    const struct tcp_header *tcp = data;
    size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;

    if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
        return false;
    }

    return checksum_valid(key, data, size, l3);
}

static inline bool
check_l4_udp(const struct conn_key *key, const void *data, size_t size,
             const void *l3)
{
    const struct udp_header *udp = data;
    size_t udp_len = ntohs(udp->udp_len);

    if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
        return false;
    }

    /* Validation must be skipped if the checksum is 0 on IPv4 packets. */
    return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
           || checksum_valid(key, data, size, l3);
}

static inline bool
check_l4_icmp(const void *data, size_t size)
{
    return csum(data, size) == 0;
}

static inline bool
check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
               const void *l3)
{
    return checksum_valid(key, data, size, l3);
}

static inline bool
extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
{
    const struct tcp_header *tcp = data;

    if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
        return false;
    }

    key->src.port = tcp->tcp_src;
    key->dst.port = tcp->tcp_dst;

    /* Port 0 is invalid. */
    return key->src.port && key->dst.port;
}

static inline bool
extract_l4_udp(struct conn_key *key, const void *data, size_t size)
{
    const struct udp_header *udp = data;

    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
        return false;
    }

    key->src.port = udp->udp_src;
    key->dst.port = udp->udp_dst;

    /* Port 0 is invalid. */
    return key->src.port && key->dst.port;
}

static inline bool extract_l4(struct conn_key *key, const void *data,
                              size_t size, bool *related, const void *l3);

/* If 'related' is not NULL and the function is processing an ICMP
 * error packet, extract the l3 and l4 fields from the nested header
 * instead and set *related to true.  If 'related' is NULL we're
 * already processing a nested header and no such recursion is
 * possible. */
static inline bool
extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
                bool *related)
{
    const struct icmp_header *icmp = data;

    if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
        return false;
    }

    switch (icmp->icmp_type) {
    case ICMP4_ECHO_REQUEST:
    case ICMP4_ECHO_REPLY:
    case ICMP4_TIMESTAMP:
    case ICMP4_TIMESTAMPREPLY:
    case ICMP4_INFOREQUEST:
    case ICMP4_INFOREPLY:
        /* Separate ICMP connection: identified using id. */
        key->src.port = key->dst.port = icmp->icmp_fields.echo.id;
        break;
    case ICMP4_DST_UNREACH:
    case ICMP4_TIME_EXCEEDED:
    case ICMP4_PARAM_PROB:
    case ICMP4_SOURCEQUENCH:
    case ICMP4_REDIRECT: {
        /* ICMP packet part of another connection.  We should
         * extract the key from the embedded packet header. */
        struct conn_key inner_key;
        const char *l3 = (const char *) (icmp + 1);
        const char *tail = (const char *) data + size;
        const char *l4;
        bool ok;

        if (!related) {
            return false;
        }

        memset(&inner_key, 0, sizeof inner_key);
        inner_key.dl_type = htons(ETH_TYPE_IP);
        ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
        if (!ok) {
            return false;
        }

        /* pf doesn't do this, but it seems a good idea. */
        if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
            || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
            return false;
        }

        key->src = inner_key.src;
        key->dst = inner_key.dst;
        key->nw_proto = inner_key.nw_proto;

        ok = extract_l4(key, l4, tail - l4, NULL, l3);
        if (ok) {
            conn_key_reverse(key);
            *related = true;
        }
        return ok;
    }
    default:
        return false;
    }

    return true;
}

/* If 'related' is not NULL and the function is processing an ICMP
 * error packet, extract the l3 and l4 fields from the nested header
 * instead and set *related to true.  If 'related' is NULL we're
 * already processing a nested header and no such recursion is
 * possible. */
static inline bool
extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
                 bool *related)
{
    const struct icmp6_header *icmp6 = data;

    /* All the messages that we support need at least 4 bytes after
     * the header. */
    if (size < sizeof *icmp6 + 4) {
        return false;
    }

    switch (icmp6->icmp6_type) {
    case ICMP6_ECHO_REQUEST:
    case ICMP6_ECHO_REPLY:
        /* Separate ICMP connection: identified using id. */
        key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1);
        break;
    case ICMP6_DST_UNREACH:
    case ICMP6_PACKET_TOO_BIG:
    case ICMP6_TIME_EXCEEDED:
    case ICMP6_PARAM_PROB: {
        /* ICMP packet part of another connection.  We should
         * extract the key from the embedded packet header. */
        struct conn_key inner_key;
        const char *l3 = (const char *) icmp6 + 8;
        const char *tail = (const char *) data + size;
        const char *l4 = NULL;
        bool ok;

        if (!related) {
            return false;
        }

        memset(&inner_key, 0, sizeof inner_key);
        inner_key.dl_type = htons(ETH_TYPE_IPV6);
        ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
        if (!ok) {
            return false;
        }

        /* pf doesn't do this, but it seems a good idea. */
        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
                              &key->dst.addr.ipv6_aligned)
            || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
                                 &key->src.addr.ipv6_aligned)) {
            return false;
        }

        key->src = inner_key.src;
        key->dst = inner_key.dst;
        key->nw_proto = inner_key.nw_proto;

        ok = extract_l4(key, l4, tail - l4, NULL, l3);
        if (ok) {
            conn_key_reverse(key);
            *related = true;
        }
        return ok;
    }
    default:
        return false;
    }

    return true;
}

/* Extract l4 fields into 'key', which must already contain valid l3
 * members.
 *
 * If 'related' is not NULL and an ICMP error packet is being
 * processed, the function will extract the key from the packet nested
 * in the ICMP payload and set '*related' to true.
 *
 * If 'related' is NULL, it means that we're already parsing a header nested
 * in an ICMP error.  In this case, we skip checksum and length validation. */
static inline bool
extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
           const void *l3)
{
    if (key->nw_proto == IPPROTO_TCP) {
        return (!related || check_l4_tcp(key, data, size, l3))
               && extract_l4_tcp(key, data, size);
    } else if (key->nw_proto == IPPROTO_UDP) {
        return (!related || check_l4_udp(key, data, size, l3))
               && extract_l4_udp(key, data, size);
    } else if (key->dl_type == htons(ETH_TYPE_IP)
               && key->nw_proto == IPPROTO_ICMP) {
        return (!related || check_l4_icmp(data, size))
               && extract_l4_icmp(key, data, size, related);
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)
               && key->nw_proto == IPPROTO_ICMPV6) {
        return (!related || check_l4_icmp6(key, data, size, l3))
               && extract_l4_icmp6(key, data, size, related);
    } else {
        return false;
    }
}

static bool
conn_key_extract(struct conntrack *ct, struct dp_packet *pkt,
                 struct conn_lookup_ctx *ctx, uint16_t zone)
{
    const struct eth_header *l2 = dp_packet_l2(pkt);
    const struct ip_header *l3 = dp_packet_l3(pkt);
    const char *l4 = dp_packet_l4(pkt);
    const char *tail = dp_packet_tail(pkt);
    bool ok;

    memset(ctx, 0, sizeof *ctx);

    if (!l2 || !l3 || !l4) {
        return false;
    }

    ctx->key.zone = zone;

    /* XXX In this function we parse the packet (again, it has already
     * gone through miniflow_extract()) for two reasons:
     *
     * 1) To extract the l3 addresses and l4 ports.
     *    We already have the l3 and l4 headers' pointers.  Extracting
     *    the l3 addresses and the l4 ports is really cheap, since they
     *    can be found at fixed locations.
     * 2) To extract the l3 and l4 types.
     *    Extracting the l3 and l4 types (especially the l3[1]) on the
     *    other hand is quite expensive, because they're not at a
     *    fixed location.
     *
     * Here's a way to avoid (2) with the help of the datapath.
     * The datapath doesn't keep the packet's extracted flow[2], so
     * using that is not an option.  We could use the packet's matching
     * megaflow for l3 type (it's always unwildcarded), and for l4 type
     * (we have to unwildcard it first).  This means either:
     *
     * a) dpif-netdev passes the matching megaflow to dp_execute_cb(), which
     *    is used to extract the l3 type.  Unfortunately, dp_execute_cb() is
     *    used also in dpif_netdev_execute(), which doesn't have a matching
     *    megaflow.
     *
     * b) We define an alternative OVS_ACTION_ATTR_CT, used only by the
     *    userspace datapath, which includes l3 (and l4) type.  The
     *    alternative action could be generated by ofproto-dpif specifically
     *    for the userspace datapath.  Having a different interface for
     *    userspace and kernel doesn't seem very clean, though.
     *
     * ---
     * [1] A simple benchmark (running only the connection tracker
     *     over and over on the same packets) shows that if the
     *     l3 type is already provided we are 15% faster (running the
     *     connection tracker over a couple of DPDK devices with a
     *     stream of UDP 64-byte packets shows that we are 4% faster).
     *
     * [2] The reasons for this are that keeping the flow increases
     *     (slightly) the cache footprint and increases computation
     *     time as we move the packet around.  Most importantly, the flow
     *     should be updated by the actions and this can be slow, as
     *     we use a sparse representation (miniflow).
     */
    ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2);
    if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
        ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL, true);
    } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
        ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
    } else {
        ok = false;
    }

    if (ok) {
        if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related, l3)) {
            ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
            return true;
        }
    }

    return false;
}
\f
/* Symmetric */
static uint32_t
conn_key_hash(const struct conn_key *key, uint32_t basis)
{
    uint32_t hsrc, hdst, hash;
    int i;

    hsrc = hdst = basis;

    /* Hash the source and destination tuple. */
    for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) {
        hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]);
        hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]);
    }

    /* Even if source and destination are swapped the hash will be the same. */
    hash = hsrc ^ hdst;

    /* Hash the rest of the key (L3 and L4 types and zone). */
    hash = hash_words((uint32_t *) &key->dst + 1,
                      (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
                      hash);

    return hash;
}
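
/* Consequence of the symmetry: for a connection A:p1 -> B:p2, the reply
 * direction B:p2 -> A:p1 produces the same hash, because 'hsrc' and 'hdst'
 * are combined with XOR before the direction-independent fields (types and
 * zone) are mixed in.  That is, conn_key_hash(k, basis) ==
 * conn_key_hash(reversed k, basis), so both directions of a connection land
 * in the same bucket and conn_key_lookup() can probe 'key' and 'rev_key'
 * with a single hash. */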

static void
conn_key_reverse(struct conn_key *key)
{
    struct ct_endpoint tmp;
    tmp = key->src;
    key->src = key->dst;
    key->dst = tmp;
}

static void
conn_key_lookup(struct conntrack_bucket *ctb,
                struct conn_lookup_ctx *ctx,
                long long now)
{
    uint32_t hash = ctx->hash;
    struct conn *conn;

    ctx->conn = NULL;

    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
        if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))
            && !conn_expired(conn, now)) {
            ctx->conn = conn;
            ctx->reply = false;
            break;
        }
        if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))
            && !conn_expired(conn, now)) {
            ctx->conn = conn;
            ctx->reply = true;
            break;
        }
    }
}

static enum ct_update_res
conn_update(struct conn *conn, struct conntrack_bucket *ctb,
            struct dp_packet *pkt, bool reply, long long now)
{
    return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
                                                      reply, now);
}

static bool
conn_expired(struct conn *conn, long long now)
{
    return now >= conn->expiration;
}

static bool
valid_new(struct dp_packet *pkt, struct conn_key *key)
{
    return l4_protos[key->nw_proto]->valid_new(pkt);
}

static struct conn *
new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
         struct conn_key *key, long long now)
{
    struct conn *newconn;

    newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);

    if (newconn) {
        newconn->key = *key;
    }

    return newconn;
}

static void
delete_conn(struct conn *conn)
{
    free(conn);
}