]> git.proxmox.com Git - mirror_ovs.git/blame - lib/conntrack.c
conntrack: Fix conntrack new state
[mirror_ovs.git] / lib / conntrack.c
CommitLineData
/*
 * Copyright (c) 2015-2019 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#include <config.h>
bd5e81a0 18#include <ctype.h>
a489b168 19#include <errno.h>
ff6aa424 20#include <sys/types.h>
a489b168
DDP
21#include <netinet/in.h>
22#include <netinet/icmp6.h>
bd5e81a0 23#include <string.h>
a489b168
DDP
24
25#include "bitmap.h"
bd5e81a0 26#include "conntrack.h"
a489b168
DDP
27#include "conntrack-private.h"
28#include "coverage.h"
29#include "csum.h"
4d4e68ed 30#include "ct-dpif.h"
a489b168
DDP
31#include "dp-packet.h"
32#include "flow.h"
33#include "netdev.h"
34#include "odp-netlink.h"
35#include "openvswitch/hmap.h"
36#include "openvswitch/vlog.h"
37#include "ovs-rcu.h"
e6ef6cc6 38#include "ovs-thread.h"
fd016ae3 39#include "openvswitch/poll-loop.h"
a489b168
DDP
40#include "random.h"
41#include "timeval.h"
42
43VLOG_DEFINE_THIS_MODULE(conntrack);
44
45COVERAGE_DEFINE(conntrack_full);
e6ef6cc6 46COVERAGE_DEFINE(conntrack_long_cleanup);
a489b168
DDP
47
48struct conn_lookup_ctx {
49 struct conn_key key;
50 struct conn *conn;
51 uint32_t hash;
52 bool reply;
dbb597d3 53 bool icmp_related;
a489b168
DDP
54};
55
/* Classification of a packet on an FTP control connection. */
enum ftp_ctl_pkt {
    /* Control packet that carries an address and/or port specifier. */
    CT_FTP_CTL_INTEREST,
    /* Control packet that carries no address and/or port specifier. */
    CT_FTP_CTL_OTHER,
    CT_FTP_CTL_INVALID,
};
63
/* Operating modes of the application layer gateways (FTP active, FTP
 * passive, and TFTP). */
enum ct_alg_mode {
    CT_FTP_MODE_ACTIVE,
    CT_FTP_MODE_PASSIVE,
    CT_TFTP_MODE,
};
69
/* Kind of ALG control connection a packet belongs to.  The values are used
 * as indexes into 'alg_helpers'. */
enum ct_alg_ctl_type {
    CT_ALG_CTL_NONE,
    CT_ALG_CTL_FTP,
    CT_ALG_CTL_TFTP,
    /* SIP cannot be enabled through OpenFlow; at present it only serves as
     * an example of an ALG that allows a wildcarded source IP. */
    CT_ALG_CTL_SIP,
};
78
a7f33fdb
DB
79struct zone_limit {
80 struct hmap_node node;
81 struct conntrack_zone_limit czl;
82};
83
a489b168 84static bool conn_key_extract(struct conntrack *, struct dp_packet *,
66e4ad8a
DDP
85 ovs_be16 dl_type, struct conn_lookup_ctx *,
86 uint16_t zone);
a489b168
DDP
87static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
88static void conn_key_reverse(struct conn_key *);
a489b168 89static bool valid_new(struct dp_packet *pkt, struct conn_key *);
967bb5c5 90static struct conn *new_conn(struct conntrack *ct, struct dp_packet *pkt,
e6ef6cc6 91 struct conn_key *, long long now);
967bb5c5 92static void delete_conn_cmn(struct conn *);
a489b168 93static void delete_conn(struct conn *);
967bb5c5
DB
94static void delete_conn_one(struct conn *conn);
95static enum ct_update_res conn_update(struct conntrack *ct, struct conn *conn,
96 struct dp_packet *pkt,
97 struct conn_lookup_ctx *ctx,
e6ef6cc6 98 long long now);
a489b168
DDP
99static bool conn_expired(struct conn *, long long now);
100static void set_mark(struct dp_packet *, struct conn *,
101 uint32_t val, uint32_t mask);
102static void set_label(struct dp_packet *, struct conn *,
103 const struct ovs_key_ct_labels *val,
104 const struct ovs_key_ct_labels *mask);
e6ef6cc6 105static void *clean_thread_main(void *f_);
a489b168 106
286de272
DB
107static bool
108nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
109 struct conn *nat_conn);
110
111static uint8_t
112reverse_icmp_type(uint8_t type);
113static uint8_t
114reverse_icmp6_type(uint8_t type);
115static inline bool
116extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
117 const char **new_data, bool validate_checksum);
118static inline bool
119extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
120 const char **new_data);
bd5e81a0 121static struct alg_exp_node *
be38342d
DB
122expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
123 uint32_t basis, bool src_ip_wc);
bd5e81a0
DB
124
125static int
126repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
127 char *ftp_data_v4_start,
cd7c99a6 128 size_t addr_offset_from_ftp_data_start, size_t addr_size);
bd5e81a0
DB
129
130static enum ftp_ctl_pkt
131process_ftp_ctl_v4(struct conntrack *ct,
132 struct dp_packet *pkt,
133 const struct conn *conn_for_expectation,
4417ca3d 134 ovs_be32 *v4_addr_rep,
bd5e81a0 135 char **ftp_data_v4_start,
cd7c99a6
DB
136 size_t *addr_offset_from_ftp_data_start,
137 size_t *addr_size);
bd5e81a0
DB
138
139static enum ftp_ctl_pkt
140detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
141 struct dp_packet *pkt);
142
4417ca3d 143static void
967bb5c5 144expectation_clean(struct conntrack *ct, const struct conn_key *master_key);
4417ca3d 145
94e71143
DB
146static struct ct_l4_proto *l4_protos[] = {
147 [IPPROTO_TCP] = &ct_proto_tcp,
148 [IPPROTO_UDP] = &ct_proto_other,
149 [IPPROTO_ICMP] = &ct_proto_icmp4,
150 [IPPROTO_ICMPV6] = &ct_proto_icmp6,
151};
152
bd5e81a0
DB
153static void
154handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
967bb5c5
DB
155 struct dp_packet *pkt, struct conn *ec, long long now,
156 enum ftp_ctl_pkt ftp_ctl, bool nat);
bd5e81a0 157
7be77cb0
DB
158static void
159handle_tftp_ctl(struct conntrack *ct,
94e71143 160 const struct conn_lookup_ctx *ctx OVS_UNUSED,
967bb5c5
DB
161 struct dp_packet *pkt, struct conn *conn_for_expectation,
162 long long now OVS_UNUSED, enum ftp_ctl_pkt ftp_ctl OVS_UNUSED,
163 bool nat OVS_UNUSED);
94e71143
DB
164
/* Signature shared by all ALG control packet handlers stored in
 * 'alg_helpers'. */
typedef void (*alg_helper)(struct conntrack *ct,
                           const struct conn_lookup_ctx *ctx,
                           struct dp_packet *pkt,
                           struct conn *conn_for_expectation,
                           long long now,
                           enum ftp_ctl_pkt ftp_ctl,
                           bool nat);
171
172static alg_helper alg_helpers[] = {
173 [CT_ALG_CTL_NONE] = NULL,
174 [CT_ALG_CTL_FTP] = handle_ftp_ctl,
175 [CT_ALG_CTL_TFTP] = handle_tftp_ctl,
a489b168
DDP
176};
177
178long long ct_timeout_val[] = {
179#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
180 CT_TIMEOUTS
181#undef CT_TIMEOUT
182};
183
/* Largest valid TCP or UDP port number. */
#define CT_MAX_L4_PORT 65535

/* Size of the string buffer used to parse FTP messages.  It is roughly
 * twice what is needed, which leaves some margin of error. */
#define LARGEST_FTP_MSG_OF_INTEREST 128

/* FTP command used in active mode. */
#define FTP_PORT_CMD "PORT"
/* FTP reply code used in passive mode. */
#define FTP_PASV_REPLY_CODE "227"
/* Maximum number of decimal digits of one port field in an FTP command.
 * The port is written as two numbers of up to 3 digits each, the high one
 * being a multiple of 256. */
#define MAX_FTP_PORT_DGTS 3

/* Extended FTP command (EPRT) used in active mode. */
#define FTP_EPRT_CMD "EPRT"
/* Extended FTP reply (EPSV) used in passive mode. */
#define FTP_EPSV_REPLY "EXTENDED PASSIVE"
/* Maximum number of decimal digits of the port in an extended FTP
 * command. */
#define MAX_EXT_FTP_PORT_DGTS 5
/* Address family code for IPv6 in extended FTP commands. */
#define FTP_AF_V6 '2'

/* Wildcard L4 source port for ALGs, used in expectations for port numbers
 * that cannot be predicted. */
#define ALG_WC_SRC_PORT 0

/* Once the total number of connections exceeds this value, no new
 * connections are accepted; this applies to CT_CONN_TYPE_DEFAULT
 * connections. */
#define DEFAULT_N_CONN_LIMIT 3000000
215
5ed7a0b4
DB
216/* Does a member by member comparison of two conn_keys; this
217 * function must be kept in sync with struct conn_key; returns 0
218 * if the keys are equal or 1 if the keys are not equal. */
219static int
220conn_key_cmp(const struct conn_key *key1, const struct conn_key *key2)
221{
222 if (!memcmp(&key1->src.addr, &key2->src.addr, sizeof key1->src.addr) &&
223 !memcmp(&key1->dst.addr, &key2->dst.addr, sizeof key1->dst.addr) &&
224 (key1->src.icmp_id == key2->src.icmp_id) &&
225 (key1->src.icmp_type == key2->src.icmp_type) &&
226 (key1->src.icmp_code == key2->src.icmp_code) &&
227 (key1->dst.icmp_id == key2->dst.icmp_id) &&
228 (key1->dst.icmp_type == key2->dst.icmp_type) &&
229 (key1->dst.icmp_code == key2->dst.icmp_code) &&
230 (key1->dl_type == key2->dl_type) &&
231 (key1->zone == key2->zone) &&
232 (key1->nw_proto == key2->nw_proto)) {
233
234 return 0;
235 }
236 return 1;
237}
238
d8682ee5 239static void
dec0dbbc
DB
240ct_print_conn_info(const struct conn *c, const char *log_msg,
241 enum vlog_level vll, bool force, bool rl_on)
66f400f5
DB
242{
243#define CT_VLOG(RL_ON, LEVEL, ...) \
244 do { \
245 if (RL_ON) { \
246 static struct vlog_rate_limit rl_ = VLOG_RATE_LIMIT_INIT(5, 5); \
247 vlog_rate_limit(&this_module, LEVEL, &rl_, __VA_ARGS__); \
248 } else { \
249 vlog(&this_module, LEVEL, __VA_ARGS__); \
250 } \
251 } while (0)
252
253 if (OVS_UNLIKELY(force || vlog_is_enabled(&this_module, vll))) {
254 if (c->key.dl_type == htons(ETH_TYPE_IP)) {
255 CT_VLOG(rl_on, vll, "%s: src ip "IP_FMT" dst ip "IP_FMT" rev src "
256 "ip "IP_FMT" rev dst ip "IP_FMT" src/dst ports "
257 "%"PRIu16"/%"PRIu16" rev src/dst ports "
258 "%"PRIu16"/%"PRIu16" zone/rev zone "
259 "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
260 "%"PRIu8"/%"PRIu8, log_msg,
cda1b109
DB
261 IP_ARGS(c->key.src.addr.ipv4),
262 IP_ARGS(c->key.dst.addr.ipv4),
263 IP_ARGS(c->rev_key.src.addr.ipv4),
264 IP_ARGS(c->rev_key.dst.addr.ipv4),
66f400f5
DB
265 ntohs(c->key.src.port), ntohs(c->key.dst.port),
266 ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
267 c->key.zone, c->rev_key.zone, c->key.nw_proto,
268 c->rev_key.nw_proto);
269 } else {
270 char ip6_s[INET6_ADDRSTRLEN];
271 inet_ntop(AF_INET6, &c->key.src.addr.ipv6, ip6_s, sizeof ip6_s);
272 char ip6_d[INET6_ADDRSTRLEN];
273 inet_ntop(AF_INET6, &c->key.dst.addr.ipv6, ip6_d, sizeof ip6_d);
274 char ip6_rs[INET6_ADDRSTRLEN];
275 inet_ntop(AF_INET6, &c->rev_key.src.addr.ipv6, ip6_rs,
276 sizeof ip6_rs);
277 char ip6_rd[INET6_ADDRSTRLEN];
278 inet_ntop(AF_INET6, &c->rev_key.dst.addr.ipv6, ip6_rd,
279 sizeof ip6_rd);
280
281 CT_VLOG(rl_on, vll, "%s: src ip %s dst ip %s rev src ip %s"
282 " rev dst ip %s src/dst ports %"PRIu16"/%"PRIu16
283 " rev src/dst ports %"PRIu16"/%"PRIu16" zone/rev zone "
284 "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
285 "%"PRIu8"/%"PRIu8, log_msg, ip6_s, ip6_d, ip6_rs,
286 ip6_rd, ntohs(c->key.src.port), ntohs(c->key.dst.port),
287 ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
288 c->key.zone, c->rev_key.zone, c->key.nw_proto,
289 c->rev_key.nw_proto);
290 }
291 }
292}
293
a489b168
DDP
294/* Initializes the connection tracker 'ct'. The caller is responsible for
295 * calling 'conntrack_destroy()', when the instance is not needed anymore */
57593fd2
DB
296struct conntrack *
297conntrack_init(void)
a489b168 298{
57593fd2
DB
299 struct conntrack *ct = xzalloc(sizeof *ct);
300
967bb5c5
DB
301 ovs_rwlock_init(&ct->resources_lock);
302 ovs_rwlock_wrlock(&ct->resources_lock);
bd5e81a0 303 hmap_init(&ct->alg_expectations);
4417ca3d 304 hindex_init(&ct->alg_expectation_refs);
967bb5c5 305 ovs_rwlock_unlock(&ct->resources_lock);
a489b168 306
967bb5c5
DB
307 ovs_mutex_init_adaptive(&ct->ct_lock);
308 ovs_mutex_lock(&ct->ct_lock);
309 cmap_init(&ct->conns);
310 for (unsigned i = 0; i < ARRAY_SIZE(ct->exp_lists); i++) {
311 ovs_list_init(&ct->exp_lists[i]);
a489b168 312 }
a7f33fdb
DB
313 hmap_init(&ct->zone_limits);
314 ct->zone_limit_seq = 0;
967bb5c5
DB
315 ovs_mutex_unlock(&ct->ct_lock);
316
a489b168
DDP
317 ct->hash_basis = random_uint32();
318 atomic_count_init(&ct->n_conn, 0);
319 atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
64207120 320 atomic_init(&ct->tcp_seq_chk, true);
e6ef6cc6
DDP
321 latch_init(&ct->clean_thread_exit);
322 ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
4ea96698 323 ct->ipf = ipf_init();
57593fd2
DB
324
325 return ct;
a489b168
DDP
326}
327
a7f33fdb
DB
328static uint32_t
329zone_key_hash(int32_t zone, uint32_t basis)
330{
331 size_t hash = hash_int((OVS_FORCE uint32_t) zone, basis);
332 return hash;
333}
334
335static struct zone_limit *
336zone_limit_lookup(struct conntrack *ct, int32_t zone)
337 OVS_REQUIRES(ct->ct_lock)
338{
339 uint32_t hash = zone_key_hash(zone, ct->hash_basis);
340 struct zone_limit *zl;
341 HMAP_FOR_EACH_IN_BUCKET (zl, node, hash, &ct->zone_limits) {
342 if (zl->czl.zone == zone) {
343 return zl;
344 }
345 }
346 return NULL;
347}
348
349static struct zone_limit *
350zone_limit_lookup_or_default(struct conntrack *ct, int32_t zone)
351 OVS_REQUIRES(ct->ct_lock)
352{
353 struct zone_limit *zl = zone_limit_lookup(ct, zone);
354 return zl ? zl : zone_limit_lookup(ct, DEFAULT_ZONE);
355}
356
357struct conntrack_zone_limit
358zone_limit_get(struct conntrack *ct, int32_t zone)
359{
360 ovs_mutex_lock(&ct->ct_lock);
361 struct conntrack_zone_limit czl = {DEFAULT_ZONE, 0, 0, 0};
362 struct zone_limit *zl = zone_limit_lookup_or_default(ct, zone);
363 if (zl) {
364 czl = zl->czl;
365 }
366 ovs_mutex_unlock(&ct->ct_lock);
367 return czl;
368}
369
370static int
371zone_limit_create(struct conntrack *ct, int32_t zone, uint32_t limit)
372 OVS_REQUIRES(ct->ct_lock)
373{
374 if (zone >= DEFAULT_ZONE && zone <= MAX_ZONE) {
375 struct zone_limit *zl = xzalloc(sizeof *zl);
376 zl->czl.limit = limit;
377 zl->czl.zone = zone;
378 zl->czl.zone_limit_seq = ct->zone_limit_seq++;
379 uint32_t hash = zone_key_hash(zone, ct->hash_basis);
380 hmap_insert(&ct->zone_limits, &zl->node, hash);
381 return 0;
382 } else {
383 return EINVAL;
384 }
385}
386
387int
388zone_limit_update(struct conntrack *ct, int32_t zone, uint32_t limit)
389{
390 int err = 0;
391 ovs_mutex_lock(&ct->ct_lock);
392 struct zone_limit *zl = zone_limit_lookup(ct, zone);
393 if (zl) {
394 zl->czl.limit = limit;
395 VLOG_INFO("Changed zone limit of %u for zone %d", limit, zone);
396 } else {
397 err = zone_limit_create(ct, zone, limit);
398 if (!err) {
399 VLOG_INFO("Created zone limit of %u for zone %d", limit, zone);
400 } else {
401 VLOG_WARN("Request to create zone limit for invalid zone %d",
402 zone);
403 }
404 }
405 ovs_mutex_unlock(&ct->ct_lock);
406 return err;
407}
408
409static void
410zone_limit_clean(struct conntrack *ct, struct zone_limit *zl)
411 OVS_REQUIRES(ct->ct_lock)
412{
413 hmap_remove(&ct->zone_limits, &zl->node);
414 free(zl);
415}
416
417int
418zone_limit_delete(struct conntrack *ct, uint16_t zone)
419{
420 ovs_mutex_lock(&ct->ct_lock);
421 struct zone_limit *zl = zone_limit_lookup(ct, zone);
422 if (zl) {
423 zone_limit_clean(ct, zl);
424 VLOG_INFO("Deleted zone limit for zone %d", zone);
425 } else {
426 VLOG_INFO("Attempted delete of non-existent zone limit: zone %d",
427 zone);
428 }
429 ovs_mutex_unlock(&ct->ct_lock);
430 return 0;
431}
432
967bb5c5
DB
433static void
434conn_clean_cmn(struct conntrack *ct, struct conn *conn)
435 OVS_REQUIRES(ct->ct_lock)
436{
437 if (conn->alg) {
438 expectation_clean(ct, &conn->key);
439 }
440
441 uint32_t hash = conn_key_hash(&conn->key, ct->hash_basis);
442 cmap_remove(&ct->conns, &conn->cm_node, hash);
a7f33fdb
DB
443
444 struct zone_limit *zl = zone_limit_lookup(ct, conn->admit_zone);
445 if (zl && zl->czl.zone_limit_seq == conn->zone_limit_seq) {
446 zl->czl.count--;
447 }
967bb5c5
DB
448}
449
450/* Must be called with 'conn' of 'conn_type' CT_CONN_TYPE_DEFAULT. Also
451 * removes the associated nat 'conn' from the lookup datastructures. */
452static void
453conn_clean(struct conntrack *ct, struct conn *conn)
454 OVS_REQUIRES(ct->ct_lock)
455{
456 ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
457
458 conn_clean_cmn(ct, conn);
459 if (conn->nat_conn) {
460 uint32_t hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis);
461 cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash);
462 }
463 ovs_list_remove(&conn->exp_node);
5f918a8a 464 conn->cleaned = true;
967bb5c5
DB
465 ovsrcu_postpone(delete_conn, conn);
466 atomic_count_dec(&ct->n_conn);
467}
468
469static void
470conn_clean_one(struct conntrack *ct, struct conn *conn)
471 OVS_REQUIRES(ct->ct_lock)
472{
473 conn_clean_cmn(ct, conn);
474 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
475 ovs_list_remove(&conn->exp_node);
5f918a8a 476 conn->cleaned = true;
967bb5c5
DB
477 atomic_count_dec(&ct->n_conn);
478 }
479 ovsrcu_postpone(delete_conn_one, conn);
480}
481
482/* Destroys the connection tracker 'ct' and frees all the allocated memory.
483 * The caller of this function must already have shut down packet input
484 * and PMD threads (which would have been quiesced). */
a489b168
DDP
485void
486conntrack_destroy(struct conntrack *ct)
487{
967bb5c5 488 struct conn *conn;
e6ef6cc6
DDP
489 latch_set(&ct->clean_thread_exit);
490 pthread_join(ct->clean_thread, NULL);
491 latch_destroy(&ct->clean_thread_exit);
a489b168 492
967bb5c5
DB
493 ovs_mutex_lock(&ct->ct_lock);
494 CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
495 conn_clean_one(ct, conn);
a489b168 496 }
967bb5c5 497 cmap_destroy(&ct->conns);
a7f33fdb
DB
498
499 struct zone_limit *zl;
500 HMAP_FOR_EACH_POP (zl, node, &ct->zone_limits) {
501 free(zl);
502 }
503 hmap_destroy(&ct->zone_limits);
504
967bb5c5
DB
505 ovs_mutex_unlock(&ct->ct_lock);
506 ovs_mutex_destroy(&ct->ct_lock);
bd5e81a0 507
967bb5c5 508 ovs_rwlock_wrlock(&ct->resources_lock);
bd5e81a0
DB
509 struct alg_exp_node *alg_exp_node;
510 HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {
511 free(alg_exp_node);
512 }
bd5e81a0 513 hmap_destroy(&ct->alg_expectations);
4417ca3d 514 hindex_destroy(&ct->alg_expectation_refs);
967bb5c5
DB
515 ovs_rwlock_unlock(&ct->resources_lock);
516 ovs_rwlock_destroy(&ct->resources_lock);
517
4ea96698 518 ipf_destroy(ct->ipf);
21ffe409 519 free(ct);
a489b168
DDP
520}
521\f
967bb5c5
DB
522
523static bool
524conn_key_lookup(struct conntrack *ct, const struct conn_key *key,
525 uint32_t hash, long long now, struct conn **conn_out,
526 bool *reply)
a489b168 527{
967bb5c5
DB
528 struct conn *conn;
529 bool found = false;
530
531 CMAP_FOR_EACH_WITH_HASH (conn, cm_node, hash, &ct->conns) {
532 if (!conn_key_cmp(&conn->key, key) && !conn_expired(conn, now)) {
533 found = true;
534 if (reply) {
535 *reply = false;
536 }
537 break;
538 }
539 if (!conn_key_cmp(&conn->rev_key, key) && !conn_expired(conn, now)) {
540 found = true;
541 if (reply) {
542 *reply = true;
543 }
544 break;
545 }
546 }
a489b168 547
967bb5c5
DB
548 if (found && conn_out) {
549 *conn_out = conn;
550 } else if (conn_out) {
551 *conn_out = NULL;
552 }
553 return found;
a489b168
DDP
554}
555
4048c508
DB
556static bool
557conn_lookup(struct conntrack *ct, const struct conn_key *key,
558 long long now, struct conn **conn_out, bool *reply)
559{
560 uint32_t hash = conn_key_hash(key, ct->hash_basis);
561 return conn_key_lookup(ct, key, hash, now, conn_out, reply);
562}
563
a489b168 564static void
286de272 565write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
bd5e81a0 566 const struct conn_key *key, const struct alg_exp_node *alg_exp)
a489b168 567{
286de272 568 pkt->md.ct_state |= CS_TRACKED;
a489b168 569 pkt->md.ct_zone = zone;
967bb5c5
DB
570
571 if (conn) {
572 ovs_mutex_lock(&conn->lock);
573 pkt->md.ct_mark = conn->mark;
574 pkt->md.ct_label = conn->label;
575 ovs_mutex_unlock(&conn->lock);
576 } else {
577 pkt->md.ct_mark = 0;
578 pkt->md.ct_label = OVS_U128_ZERO;
579 }
daf4d3c1
JR
580
581 /* Use the original direction tuple if we have it. */
582 if (conn) {
bd5e81a0
DB
583 if (conn->alg_related) {
584 key = &conn->master_key;
585 } else {
586 key = &conn->key;
587 }
588 } else if (alg_exp) {
589 pkt->md.ct_mark = alg_exp->master_mark;
590 pkt->md.ct_label = alg_exp->master_label;
591 key = &alg_exp->master_key;
daf4d3c1 592 }
dec0dbbc 593
daf4d3c1 594 pkt->md.ct_orig_tuple_ipv6 = false;
dec0dbbc 595
daf4d3c1
JR
596 if (key) {
597 if (key->dl_type == htons(ETH_TYPE_IP)) {
598 pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
cda1b109
DB
599 key->src.addr.ipv4,
600 key->dst.addr.ipv4,
daf4d3c1
JR
601 key->nw_proto != IPPROTO_ICMP
602 ? key->src.port : htons(key->src.icmp_type),
603 key->nw_proto != IPPROTO_ICMP
604 ? key->dst.port : htons(key->src.icmp_code),
605 key->nw_proto,
606 };
286de272 607 } else {
daf4d3c1
JR
608 pkt->md.ct_orig_tuple_ipv6 = true;
609 pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
cda1b109
DB
610 key->src.addr.ipv6,
611 key->dst.addr.ipv6,
daf4d3c1
JR
612 key->nw_proto != IPPROTO_ICMPV6
613 ? key->src.port : htons(key->src.icmp_type),
614 key->nw_proto != IPPROTO_ICMPV6
615 ? key->dst.port : htons(key->src.icmp_code),
616 key->nw_proto,
617 };
618 }
619 } else {
620 memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
621 }
bd5e81a0
DB
622}
623
624static uint8_t
625get_ip_proto(const struct dp_packet *pkt)
626{
627 uint8_t ip_proto;
628 struct eth_header *l2 = dp_packet_eth(pkt);
629 if (l2->eth_type == htons(ETH_TYPE_IPV6)) {
630 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
631 ip_proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
632 } else {
633 struct ip_header *l3_hdr = dp_packet_l3(pkt);
634 ip_proto = l3_hdr->ip_proto;
635 }
286de272 636
bd5e81a0
DB
637 return ip_proto;
638}
639
640static bool
94e71143 641is_ftp_ctl(const enum ct_alg_ctl_type ct_alg_ctl)
bd5e81a0 642{
94e71143 643 return ct_alg_ctl == CT_ALG_CTL_FTP;
bd5e81a0
DB
644}
645
94e71143 646static enum ct_alg_ctl_type
bd7d93f8
DB
647get_alg_ctl_type(const struct dp_packet *pkt, ovs_be16 tp_src, ovs_be16 tp_dst,
648 const char *helper)
7be77cb0 649{
94e71143
DB
650 /* CT_IPPORT_FTP/TFTP is used because IPPORT_FTP/TFTP in not defined
651 * in OSX, at least in in.h. Since these values will never change, remove
7be77cb0 652 * the external dependency. */
94e71143
DB
653 enum { CT_IPPORT_FTP = 21 };
654 enum { CT_IPPORT_TFTP = 69 };
bd7d93f8
DB
655 uint8_t ip_proto = get_ip_proto(pkt);
656 struct udp_header *uh = dp_packet_l4(pkt);
657 struct tcp_header *th = dp_packet_l4(pkt);
658 ovs_be16 ftp_src_port = htons(CT_IPPORT_FTP);
659 ovs_be16 ftp_dst_port = htons(CT_IPPORT_FTP);
660 ovs_be16 tftp_dst_port = htons(CT_IPPORT_TFTP);
661
662 if (OVS_UNLIKELY(tp_dst)) {
663 if (helper && !strncmp(helper, "ftp", strlen("ftp"))) {
664 ftp_dst_port = tp_dst;
665 } else if (helper && !strncmp(helper, "tftp", strlen("tftp"))) {
666 tftp_dst_port = tp_dst;
667 }
668 } else if (OVS_UNLIKELY(tp_src)) {
669 if (helper && !strncmp(helper, "ftp", strlen("ftp"))) {
670 ftp_src_port = tp_src;
671 }
672 }
7be77cb0 673
bd7d93f8 674 if (ip_proto == IPPROTO_UDP && uh->udp_dst == tftp_dst_port) {
94e71143
DB
675 return CT_ALG_CTL_TFTP;
676 } else if (ip_proto == IPPROTO_TCP &&
bd7d93f8 677 (th->tcp_src == ftp_src_port || th->tcp_dst == ftp_dst_port)) {
94e71143
DB
678 return CT_ALG_CTL_FTP;
679 }
680 return CT_ALG_CTL_NONE;
681}
682
be38342d
DB
683static bool
684alg_src_ip_wc(enum ct_alg_ctl_type alg_ctl_type)
685{
686 if (alg_ctl_type == CT_ALG_CTL_SIP) {
687 return true;
688 }
689 return false;
690}
691
94e71143
DB
692static void
693handle_alg_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
694 struct dp_packet *pkt, enum ct_alg_ctl_type ct_alg_ctl,
967bb5c5 695 struct conn *conn, long long now, bool nat)
94e71143
DB
696{
697 /* ALG control packet handling with expectation creation. */
3a2a425b 698 if (OVS_UNLIKELY(alg_helpers[ct_alg_ctl] && conn && conn->alg)) {
967bb5c5
DB
699 ovs_mutex_lock(&conn->lock);
700 alg_helpers[ct_alg_ctl](ct, ctx, pkt, conn, now, CT_FTP_CTL_INTEREST,
701 nat);
702 ovs_mutex_unlock(&conn->lock);
94e71143 703 }
7be77cb0
DB
704}
705
286de272
DB
706static void
707pat_packet(struct dp_packet *pkt, const struct conn *conn)
708{
709 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
710 if (conn->key.nw_proto == IPPROTO_TCP) {
711 struct tcp_header *th = dp_packet_l4(pkt);
712 packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
713 } else if (conn->key.nw_proto == IPPROTO_UDP) {
714 struct udp_header *uh = dp_packet_l4(pkt);
715 packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
716 }
717 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
718 if (conn->key.nw_proto == IPPROTO_TCP) {
719 struct tcp_header *th = dp_packet_l4(pkt);
720 packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
721 } else if (conn->key.nw_proto == IPPROTO_UDP) {
722 struct udp_header *uh = dp_packet_l4(pkt);
723 packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
724 }
725 }
726}
727
728static void
729nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
730{
731 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
732 pkt->md.ct_state |= CS_SRC_NAT;
733 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
734 struct ip_header *nh = dp_packet_l3(pkt);
735 packet_set_ipv4_addr(pkt, &nh->ip_src,
cda1b109 736 conn->rev_key.dst.addr.ipv4);
286de272
DB
737 } else {
738 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
739 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
740 nh6->ip6_src.be32,
cda1b109 741 &conn->rev_key.dst.addr.ipv6, true);
286de272
DB
742 }
743 if (!related) {
744 pat_packet(pkt, conn);
745 }
746 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
747 pkt->md.ct_state |= CS_DST_NAT;
748 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
749 struct ip_header *nh = dp_packet_l3(pkt);
750 packet_set_ipv4_addr(pkt, &nh->ip_dst,
cda1b109 751 conn->rev_key.src.addr.ipv4);
286de272
DB
752 } else {
753 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
754 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
755 nh6->ip6_dst.be32,
cda1b109 756 &conn->rev_key.src.addr.ipv6, true);
286de272
DB
757 }
758 if (!related) {
759 pat_packet(pkt, conn);
760 }
761 }
762}
763
764static void
765un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
766{
767 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
768 if (conn->key.nw_proto == IPPROTO_TCP) {
769 struct tcp_header *th = dp_packet_l4(pkt);
770 packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
771 } else if (conn->key.nw_proto == IPPROTO_UDP) {
772 struct udp_header *uh = dp_packet_l4(pkt);
773 packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
774 }
775 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
776 if (conn->key.nw_proto == IPPROTO_TCP) {
777 struct tcp_header *th = dp_packet_l4(pkt);
778 packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
779 } else if (conn->key.nw_proto == IPPROTO_UDP) {
780 struct udp_header *uh = dp_packet_l4(pkt);
781 packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
782 }
783 }
784}
785
edd1bef4
DB
786static void
787reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
788{
789 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
790 if (conn->key.nw_proto == IPPROTO_TCP) {
791 struct tcp_header *th_in = dp_packet_l4(pkt);
792 packet_set_tcp_port(pkt, conn->key.src.port,
793 th_in->tcp_dst);
794 } else if (conn->key.nw_proto == IPPROTO_UDP) {
795 struct udp_header *uh_in = dp_packet_l4(pkt);
796 packet_set_udp_port(pkt, conn->key.src.port,
797 uh_in->udp_dst);
798 }
799 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
800 if (conn->key.nw_proto == IPPROTO_TCP) {
801 struct tcp_header *th_in = dp_packet_l4(pkt);
802 packet_set_tcp_port(pkt, th_in->tcp_src,
803 conn->key.dst.port);
804 } else if (conn->key.nw_proto == IPPROTO_UDP) {
805 struct udp_header *uh_in = dp_packet_l4(pkt);
806 packet_set_udp_port(pkt, uh_in->udp_src,
807 conn->key.dst.port);
808 }
809 }
810}
811
812static void
813reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
814{
815 char *tail = dp_packet_tail(pkt);
ba5ca284 816 uint8_t pad = dp_packet_l2_pad_size(pkt);
edd1bef4
DB
817 struct conn_key inner_key;
818 const char *inner_l4 = NULL;
819 uint16_t orig_l3_ofs = pkt->l3_ofs;
820 uint16_t orig_l4_ofs = pkt->l4_ofs;
821
822 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
823 struct ip_header *nh = dp_packet_l3(pkt);
824 struct icmp_header *icmp = dp_packet_l4(pkt);
825 struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
ba5ca284
DB
826 /* This call is already verified to succeed during the code path from
827 * 'conn_key_extract()' which calls 'extract_l4_icmp()'. */
bd5e81a0
DB
828 extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3) - pad,
829 &inner_l4, false);
edd1bef4
DB
830 pkt->l3_ofs += (char *) inner_l3 - (char *) nh;
831 pkt->l4_ofs += inner_l4 - (char *) icmp;
832
833 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
834 packet_set_ipv4_addr(pkt, &inner_l3->ip_src,
cda1b109 835 conn->key.src.addr.ipv4);
edd1bef4
DB
836 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
837 packet_set_ipv4_addr(pkt, &inner_l3->ip_dst,
cda1b109 838 conn->key.dst.addr.ipv4);
edd1bef4 839 }
dec0dbbc 840
edd1bef4
DB
841 reverse_pat_packet(pkt, conn);
842 icmp->icmp_csum = 0;
843 icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
844 } else {
845 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
361a47d6 846 struct icmp6_data_header *icmp6 = dp_packet_l4(pkt);
edd1bef4
DB
847 struct ovs_16aligned_ip6_hdr *inner_l3_6 =
848 (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1);
ba5ca284
DB
849 /* This call is already verified to succeed during the code path from
850 * 'conn_key_extract()' which calls 'extract_l4_icmp6()'. */
edd1bef4
DB
851 extract_l3_ipv6(&inner_key, inner_l3_6,
852 tail - ((char *)inner_l3_6) - pad,
853 &inner_l4);
854 pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6;
855 pkt->l4_ofs += inner_l4 - (char *) icmp6;
856
857 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
858 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
859 inner_l3_6->ip6_src.be32,
cda1b109 860 &conn->key.src.addr.ipv6, true);
edd1bef4
DB
861 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
862 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
863 inner_l3_6->ip6_dst.be32,
cda1b109 864 &conn->key.dst.addr.ipv6, true);
edd1bef4
DB
865 }
866 reverse_pat_packet(pkt, conn);
edd1bef4 867 icmp6->icmp6_base.icmp6_cksum = 0;
76d85771
DB
868 icmp6->icmp6_base.icmp6_cksum = packet_csum_upperlayer6(nh6, icmp6,
869 IPPROTO_ICMPV6, tail - (char *) icmp6 - pad);
edd1bef4
DB
870 }
871 pkt->l3_ofs = orig_l3_ofs;
872 pkt->l4_ofs = orig_l4_ofs;
873}
874
286de272
DB
875static void
876un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
877 bool related)
878{
879 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
880 pkt->md.ct_state |= CS_DST_NAT;
881 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
882 struct ip_header *nh = dp_packet_l3(pkt);
883 packet_set_ipv4_addr(pkt, &nh->ip_dst,
cda1b109 884 conn->key.src.addr.ipv4);
286de272
DB
885 } else {
886 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
887 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
888 nh6->ip6_dst.be32,
cda1b109 889 &conn->key.src.addr.ipv6, true);
286de272 890 }
edd1bef4
DB
891
892 if (OVS_UNLIKELY(related)) {
893 reverse_nat_packet(pkt, conn);
894 } else {
286de272
DB
895 un_pat_packet(pkt, conn);
896 }
897 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
898 pkt->md.ct_state |= CS_SRC_NAT;
899 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
900 struct ip_header *nh = dp_packet_l3(pkt);
901 packet_set_ipv4_addr(pkt, &nh->ip_src,
cda1b109 902 conn->key.dst.addr.ipv4);
286de272
DB
903 } else {
904 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
905 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
906 nh6->ip6_src.be32,
cda1b109 907 &conn->key.dst.addr.ipv6, true);
286de272 908 }
edd1bef4
DB
909
910 if (OVS_UNLIKELY(related)) {
911 reverse_nat_packet(pkt, conn);
912 } else {
286de272
DB
913 un_pat_packet(pkt, conn);
914 }
915 }
916}
917
bd5e81a0 918static void
967bb5c5 919conn_seq_skew_set(struct conntrack *ct, const struct conn *conn_in,
bd5e81a0 920 long long now, int seq_skew, bool seq_skew_dir)
967bb5c5 921 OVS_NO_THREAD_SAFETY_ANALYSIS
bd5e81a0 922{
967bb5c5 923 struct conn *conn;
967bb5c5 924 ovs_mutex_unlock(&conn_in->lock);
4048c508 925 conn_lookup(ct, &conn_in->key, now, &conn, NULL);
967bb5c5
DB
926 ovs_mutex_lock(&conn_in->lock);
927
bd5e81a0
DB
928 if (conn && seq_skew) {
929 conn->seq_skew = seq_skew;
930 conn->seq_skew_dir = seq_skew_dir;
931 }
a720a7fa
DB
932}
933
3a2a425b
DB
934static bool
935ct_verify_helper(const char *helper, enum ct_alg_ctl_type ct_alg_ctl)
936{
937 if (ct_alg_ctl == CT_ALG_CTL_NONE) {
938 return true;
939 } else if (helper) {
940 if ((ct_alg_ctl == CT_ALG_CTL_FTP) &&
941 !strncmp(helper, "ftp", strlen("ftp"))) {
942 return true;
943 } else if ((ct_alg_ctl == CT_ALG_CTL_TFTP) &&
944 !strncmp(helper, "tftp", strlen("tftp"))) {
945 return true;
946 } else {
947 return false;
948 }
949 } else {
950 return false;
951 }
952}
953
a489b168
DDP
954static struct conn *
955conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
286de272
DB
956 struct conn_lookup_ctx *ctx, bool commit, long long now,
957 const struct nat_action_info_t *nat_action_info,
967bb5c5 958 const char *helper, const struct alg_exp_node *alg_exp,
3a2a425b 959 enum ct_alg_ctl_type ct_alg_ctl)
967bb5c5 960 OVS_REQUIRES(ct->ct_lock)
a489b168 961{
a489b168 962 struct conn *nc = NULL;
967bb5c5 963 struct conn *nat_conn = NULL;
a489b168
DDP
964
965 if (!valid_new(pkt, &ctx->key)) {
286de272 966 pkt->md.ct_state = CS_INVALID;
a489b168
DDP
967 return nc;
968 }
dec0dbbc 969
286de272 970 pkt->md.ct_state = CS_NEW;
dec0dbbc 971
bd5e81a0
DB
972 if (alg_exp) {
973 pkt->md.ct_state |= CS_RELATED;
974 }
a489b168
DDP
975
976 if (commit) {
a7f33fdb
DB
977 struct zone_limit *zl = zone_limit_lookup_or_default(ct,
978 ctx->key.zone);
979 if (zl && zl->czl.count >= zl->czl.limit) {
980 return nc;
981 }
982
a489b168 983 unsigned int n_conn_limit;
a489b168 984 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
a489b168
DDP
985 if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
986 COVERAGE_INC(conntrack_full);
987 return nc;
988 }
989
967bb5c5 990 nc = new_conn(ct, pkt, &ctx->key, now);
a720a7fa 991 memcpy(&nc->key, &ctx->key, sizeof nc->key);
82b9ac94 992 memcpy(&nc->rev_key, &nc->key, sizeof nc->rev_key);
286de272 993 conn_key_reverse(&nc->rev_key);
a489b168 994
3a2a425b
DB
995 if (ct_verify_helper(helper, ct_alg_ctl)) {
996 nc->alg = nullable_xstrdup(helper);
bd5e81a0
DB
997 }
998
999 if (alg_exp) {
1000 nc->alg_related = true;
1001 nc->mark = alg_exp->master_mark;
1002 nc->label = alg_exp->master_label;
1003 nc->master_key = alg_exp->master_key;
1004 }
1005
286de272
DB
1006 if (nat_action_info) {
1007 nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
967bb5c5 1008 nat_conn = xzalloc(sizeof *nat_conn);
a489b168 1009
bd5e81a0 1010 if (alg_exp) {
be38342d 1011 if (alg_exp->nat_rpl_dst) {
bd5e81a0
DB
1012 nc->rev_key.dst.addr = alg_exp->alg_nat_repl_addr;
1013 nc->nat_info->nat_action = NAT_ACTION_SRC;
1014 } else {
1015 nc->rev_key.src.addr = alg_exp->alg_nat_repl_addr;
1016 nc->nat_info->nat_action = NAT_ACTION_DST;
1017 }
bd5e81a0 1018 } else {
967bb5c5
DB
1019 memcpy(nat_conn, nc, sizeof *nat_conn);
1020 bool nat_res = nat_select_range_tuple(ct, nc, nat_conn);
286de272 1021
bd5e81a0
DB
1022 if (!nat_res) {
1023 goto nat_res_exhaustion;
1024 }
286de272 1025
967bb5c5
DB
1026 /* Update nc with nat adjustments made to nat_conn by
1027 * nat_select_range_tuple(). */
1028 memcpy(nc, nat_conn, sizeof *nc);
286de272 1029 }
967bb5c5 1030
dbb597d3 1031 nat_packet(pkt, nc, ctx->icmp_related);
967bb5c5
DB
1032 memcpy(&nat_conn->key, &nc->rev_key, sizeof nat_conn->key);
1033 memcpy(&nat_conn->rev_key, &nc->key, sizeof nat_conn->rev_key);
1034 nat_conn->conn_type = CT_CONN_TYPE_UN_NAT;
1035 nat_conn->nat_info = NULL;
1036 nat_conn->alg = NULL;
1037 nat_conn->nat_conn = NULL;
1038 uint32_t nat_hash = conn_key_hash(&nat_conn->key, ct->hash_basis);
1039 cmap_insert(&ct->conns, &nat_conn->cm_node, nat_hash);
1040 }
1041
1042 nc->nat_conn = nat_conn;
1043 ovs_mutex_init_adaptive(&nc->lock);
1044 nc->conn_type = CT_CONN_TYPE_DEFAULT;
1045 cmap_insert(&ct->conns, &nc->cm_node, ctx->hash);
a489b168 1046 atomic_count_inc(&ct->n_conn);
967bb5c5 1047 ctx->conn = nc; /* For completeness. */
a7f33fdb
DB
1048 if (zl) {
1049 nc->admit_zone = zl->czl.zone;
1050 nc->zone_limit_seq = zl->czl.zone_limit_seq;
1051 zl->czl.count++;
1052 } else {
1053 nc->admit_zone = INVALID_ZONE;
1054 }
a489b168 1055 }
bd5e81a0 1056
a489b168 1057 return nc;
bd5e81a0 1058
967bb5c5
DB
1059 /* This would be a user error or a DOS attack. A user error is prevented
1060 * by allocating enough combinations of NAT addresses when combined with
1061 * ephemeral ports. A DOS attack should be protected against with
1062 * firewall rules or a separate firewall. Also using zone partitioning
1063 * can limit DoS impact. */
bd5e81a0 1064nat_res_exhaustion:
967bb5c5
DB
1065 free(nat_conn);
1066 ovs_list_remove(&nc->exp_node);
1067 delete_conn_cmn(nc);
bd5e81a0
DB
1068 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
1069 VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
1070 "if DoS attack, use firewalling and/or zone partitioning.");
1071 return NULL;
a489b168
DDP
1072}
1073
286de272
DB
1074static bool
1075conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
967bb5c5
DB
1076 struct conn_lookup_ctx *ctx, struct conn *conn,
1077 long long now)
286de272 1078{
967bb5c5 1079 ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
286de272
DB
1080 bool create_new_conn = false;
1081
dbb597d3 1082 if (ctx->icmp_related) {
286de272
DB
1083 pkt->md.ct_state |= CS_RELATED;
1084 if (ctx->reply) {
1085 pkt->md.ct_state |= CS_REPLY_DIR;
1086 }
1087 } else {
967bb5c5 1088 if (conn->alg_related) {
bd5e81a0
DB
1089 pkt->md.ct_state |= CS_RELATED;
1090 }
dec0dbbc 1091
967bb5c5 1092 enum ct_update_res res = conn_update(ct, conn, pkt, ctx, now);
286de272
DB
1093
1094 switch (res) {
1095 case CT_UPDATE_VALID:
1096 pkt->md.ct_state |= CS_ESTABLISHED;
1097 pkt->md.ct_state &= ~CS_NEW;
1098 if (ctx->reply) {
1099 pkt->md.ct_state |= CS_REPLY_DIR;
1100 }
1101 break;
1102 case CT_UPDATE_INVALID:
1103 pkt->md.ct_state = CS_INVALID;
1104 break;
1105 case CT_UPDATE_NEW:
967bb5c5 1106 ovs_mutex_lock(&ct->ct_lock);
4048c508 1107 if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
28274f77
DB
1108 conn_clean(ct, conn);
1109 }
967bb5c5 1110 ovs_mutex_unlock(&ct->ct_lock);
286de272
DB
1111 create_new_conn = true;
1112 break;
a867c010
YHW
1113 case CT_UPDATE_VALID_NEW:
1114 pkt->md.ct_state |= CS_NEW;
1115 break;
286de272
DB
1116 default:
1117 OVS_NOT_REACHED();
1118 }
1119 }
1120 return create_new_conn;
1121}
1122
286de272
DB
1123static void
1124handle_nat(struct dp_packet *pkt, struct conn *conn,
1125 uint16_t zone, bool reply, bool related)
1126{
1127 if (conn->nat_info &&
1128 (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
1129 (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
1130 zone != pkt->md.ct_zone))) {
bd5e81a0 1131
286de272
DB
1132 if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
1133 pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
1134 }
1135 if (reply) {
1136 un_nat_packet(pkt, conn, related);
1137 } else {
1138 nat_packet(pkt, conn, related);
1139 }
1140 }
1141}
1142
f8016041
DB
1143static bool
1144check_orig_tuple(struct conntrack *ct, struct dp_packet *pkt,
1145 struct conn_lookup_ctx *ctx_in, long long now,
967bb5c5 1146 struct conn **conn,
f8016041 1147 const struct nat_action_info_t *nat_action_info)
f8016041 1148{
a0b89c51
DB
1149 if (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
1150 (ctx_in->key.dl_type == htons(ETH_TYPE_IP) &&
f8016041
DB
1151 !pkt->md.ct_orig_tuple.ipv4.ipv4_proto) ||
1152 (ctx_in->key.dl_type == htons(ETH_TYPE_IPV6) &&
1153 !pkt->md.ct_orig_tuple.ipv6.ipv6_proto) ||
f8016041
DB
1154 nat_action_info) {
1155 return false;
1156 }
1157
967bb5c5
DB
1158 struct conn_key key;
1159 memset(&key, 0 , sizeof key);
f8016041
DB
1160
1161 if (ctx_in->key.dl_type == htons(ETH_TYPE_IP)) {
967bb5c5
DB
1162 key.src.addr.ipv4 = pkt->md.ct_orig_tuple.ipv4.ipv4_src;
1163 key.dst.addr.ipv4 = pkt->md.ct_orig_tuple.ipv4.ipv4_dst;
f8016041
DB
1164
1165 if (ctx_in->key.nw_proto == IPPROTO_ICMP) {
967bb5c5
DB
1166 key.src.icmp_id = ctx_in->key.src.icmp_id;
1167 key.dst.icmp_id = ctx_in->key.dst.icmp_id;
f8016041 1168 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv4.src_port);
967bb5c5
DB
1169 key.src.icmp_type = (uint8_t) src_port;
1170 key.dst.icmp_type = reverse_icmp_type(key.src.icmp_type);
f8016041 1171 } else {
967bb5c5
DB
1172 key.src.port = pkt->md.ct_orig_tuple.ipv4.src_port;
1173 key.dst.port = pkt->md.ct_orig_tuple.ipv4.dst_port;
f8016041 1174 }
967bb5c5 1175 key.nw_proto = pkt->md.ct_orig_tuple.ipv4.ipv4_proto;
f8016041 1176 } else {
967bb5c5
DB
1177 key.src.addr.ipv6 = pkt->md.ct_orig_tuple.ipv6.ipv6_src;
1178 key.dst.addr.ipv6 = pkt->md.ct_orig_tuple.ipv6.ipv6_dst;
f8016041
DB
1179
1180 if (ctx_in->key.nw_proto == IPPROTO_ICMPV6) {
967bb5c5
DB
1181 key.src.icmp_id = ctx_in->key.src.icmp_id;
1182 key.dst.icmp_id = ctx_in->key.dst.icmp_id;
f8016041 1183 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv6.src_port);
967bb5c5
DB
1184 key.src.icmp_type = (uint8_t) src_port;
1185 key.dst.icmp_type = reverse_icmp6_type(key.src.icmp_type);
f8016041 1186 } else {
967bb5c5
DB
1187 key.src.port = pkt->md.ct_orig_tuple.ipv6.src_port;
1188 key.dst.port = pkt->md.ct_orig_tuple.ipv6.dst_port;
f8016041 1189 }
967bb5c5 1190 key.nw_proto = pkt->md.ct_orig_tuple.ipv6.ipv6_proto;
f8016041
DB
1191 }
1192
967bb5c5
DB
1193 key.dl_type = ctx_in->key.dl_type;
1194 key.zone = pkt->md.ct_zone;
4048c508 1195 conn_lookup(ct, &key, now, conn, NULL);
f8016041
DB
1196 return *conn ? true : false;
1197}
1198
94e71143
DB
1199static bool
1200conn_update_state_alg(struct conntrack *ct, struct dp_packet *pkt,
1201 struct conn_lookup_ctx *ctx, struct conn *conn,
1202 const struct nat_action_info_t *nat_action_info,
1203 enum ct_alg_ctl_type ct_alg_ctl, long long now,
967bb5c5 1204 bool *create_new_conn)
94e71143
DB
1205{
1206 if (is_ftp_ctl(ct_alg_ctl)) {
1207 /* Keep sequence tracking in sync with the source of the
1208 * sequence skew. */
967bb5c5 1209 ovs_mutex_lock(&conn->lock);
94e71143
DB
1210 if (ctx->reply != conn->seq_skew_dir) {
1211 handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1212 !!nat_action_info);
967bb5c5
DB
1213 /* conn_update_state locks for unrelated fields, so unlock. */
1214 ovs_mutex_unlock(&conn->lock);
1215 *create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
94e71143 1216 } else {
967bb5c5
DB
1217 /* conn_update_state locks for unrelated fields, so unlock. */
1218 ovs_mutex_unlock(&conn->lock);
1219 *create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
1220 ovs_mutex_lock(&conn->lock);
030958a0
DB
1221 if (*create_new_conn == false) {
1222 handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1223 !!nat_action_info);
1224 }
967bb5c5 1225 ovs_mutex_unlock(&conn->lock);
94e71143
DB
1226 }
1227 return true;
1228 }
1229 return false;
1230}
1231
594570ea
DB
1232static void
1233set_cached_conn(const struct nat_action_info_t *nat_action_info,
1234 const struct conn_lookup_ctx *ctx, struct conn *conn,
1235 struct dp_packet *pkt)
1236{
1237 if (OVS_LIKELY(!nat_action_info)) {
1238 pkt->md.conn = conn;
1239 pkt->md.reply = ctx->reply;
1240 pkt->md.icmp_related = ctx->icmp_related;
1241 } else {
1242 pkt->md.conn = NULL;
1243 }
1244}
1245
1246static void
1247process_one_fast(uint16_t zone, const uint32_t *setmark,
1248 const struct ovs_key_ct_labels *setlabel,
1249 const struct nat_action_info_t *nat_action_info,
1250 struct conn *conn, struct dp_packet *pkt)
1251{
1252 if (nat_action_info) {
1253 handle_nat(pkt, conn, zone, pkt->md.reply, pkt->md.icmp_related);
1254 pkt->md.conn = NULL;
1255 }
1256
1257 pkt->md.ct_zone = zone;
1258 ovs_mutex_lock(&conn->lock);
1259 pkt->md.ct_mark = conn->mark;
1260 pkt->md.ct_label = conn->label;
1261 ovs_mutex_unlock(&conn->lock);
1262
1263 if (setmark) {
1264 set_mark(pkt, conn, setmark[0], setmark[1]);
1265 }
1266
1267 if (setlabel) {
1268 set_label(pkt, conn, &setlabel[0], &setlabel[1]);
1269 }
1270}
1271
286de272 1272static void
a489b168
DDP
1273process_one(struct conntrack *ct, struct dp_packet *pkt,
1274 struct conn_lookup_ctx *ctx, uint16_t zone,
286de272
DB
1275 bool force, bool commit, long long now, const uint32_t *setmark,
1276 const struct ovs_key_ct_labels *setlabel,
bd5e81a0 1277 const struct nat_action_info_t *nat_action_info,
bd7d93f8 1278 ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper)
a489b168 1279{
967bb5c5
DB
1280 bool create_new_conn = false;
1281 conn_key_lookup(ct, &ctx->key, ctx->hash, now, &ctx->conn, &ctx->reply);
1282 struct conn *conn = ctx->conn;
a489b168 1283
a76a37ef 1284 /* Delete found entry if in wrong direction. 'force' implies commit. */
a720a7fa 1285 if (OVS_UNLIKELY(force && ctx->reply && conn)) {
967bb5c5 1286 ovs_mutex_lock(&ct->ct_lock);
4048c508 1287 if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
28274f77
DB
1288 conn_clean(ct, conn);
1289 }
967bb5c5 1290 ovs_mutex_unlock(&ct->ct_lock);
a76a37ef
JR
1291 conn = NULL;
1292 }
1293
286de272
DB
1294 if (OVS_LIKELY(conn)) {
1295 if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
a489b168 1296
286de272 1297 ctx->reply = true;
967bb5c5 1298 struct conn *rev_conn = conn; /* Save for debugging. */
4048c508 1299 uint32_t hash = conn_key_hash(&conn->rev_key, ct->hash_basis);
967bb5c5 1300 conn_key_lookup(ct, &ctx->key, hash, now, &conn, &ctx->reply);
a489b168 1301
967bb5c5 1302 if (!conn) {
286de272 1303 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
967bb5c5
DB
1304 char *log_msg = xasprintf("Missing master conn %p", rev_conn);
1305 ct_print_conn_info(conn, log_msg, VLL_INFO, true, true);
1306 free(log_msg);
286de272 1307 return;
a489b168
DDP
1308 }
1309 }
286de272
DB
1310 }
1311
bd7d93f8
DB
1312 enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, tp_src, tp_dst,
1313 helper);
bd5e81a0 1314
286de272 1315 if (OVS_LIKELY(conn)) {
94e71143
DB
1316 if (OVS_LIKELY(!conn_update_state_alg(ct, pkt, ctx, conn,
1317 nat_action_info,
967bb5c5 1318 ct_alg_ctl, now,
94e71143 1319 &create_new_conn))) {
967bb5c5 1320 create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
bd5e81a0 1321 }
286de272 1322 if (nat_action_info && !create_new_conn) {
dbb597d3 1323 handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
286de272 1324 }
bd5e81a0 1325
a0b89c51 1326 } else if (check_orig_tuple(ct, pkt, ctx, now, &conn, nat_action_info)) {
967bb5c5 1327 create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
a489b168 1328 } else {
dbb597d3 1329 if (ctx->icmp_related) {
bd5e81a0
DB
1330 /* An icmp related conn should always be found; no new
1331 connection is created based on an icmp related packet. */
286de272 1332 pkt->md.ct_state = CS_INVALID;
5c2e106b 1333 } else {
286de272 1334 create_new_conn = true;
5c2e106b 1335 }
a489b168
DDP
1336 }
1337
bd5e81a0 1338 const struct alg_exp_node *alg_exp = NULL;
96bbcbf7 1339 struct alg_exp_node alg_exp_entry;
dec0dbbc 1340
286de272 1341 if (OVS_UNLIKELY(create_new_conn)) {
bd5e81a0 1342
967bb5c5 1343 ovs_rwlock_rdlock(&ct->resources_lock);
bd5e81a0 1344 alg_exp = expectation_lookup(&ct->alg_expectations, &ctx->key,
be38342d
DB
1345 ct->hash_basis,
1346 alg_src_ip_wc(ct_alg_ctl));
bd5e81a0 1347 if (alg_exp) {
c3f6bae2 1348 memcpy(&alg_exp_entry, alg_exp, sizeof alg_exp_entry);
bd5e81a0
DB
1349 alg_exp = &alg_exp_entry;
1350 }
967bb5c5 1351 ovs_rwlock_unlock(&ct->resources_lock);
bd5e81a0 1352
967bb5c5 1353 ovs_mutex_lock(&ct->ct_lock);
4048c508 1354 if (!conn_lookup(ct, &ctx->key, now, NULL, NULL)) {
28274f77
DB
1355 conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
1356 helper, alg_exp, ct_alg_ctl);
1357 }
967bb5c5 1358 ovs_mutex_unlock(&ct->ct_lock);
286de272
DB
1359 }
1360
bd5e81a0
DB
1361 write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
1362
286de272
DB
1363 if (conn && setmark) {
1364 set_mark(pkt, conn, setmark[0], setmark[1]);
1365 }
a489b168 1366
286de272
DB
1367 if (conn && setlabel) {
1368 set_label(pkt, conn, &setlabel[0], &setlabel[1]);
1369 }
1370
967bb5c5 1371 handle_alg_ctl(ct, ctx, pkt, ct_alg_ctl, conn, now, !!nat_action_info);
594570ea
DB
1372
1373 set_cached_conn(nat_action_info, ctx, conn, pkt);
a489b168
DDP
1374}
1375
1376/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
51b9a533 1377 * the packets must have the same 'dl_type' (IPv4 or IPv6) and should have
4ea96698
DB
1378 * the l3 and and l4 offset properly set. Performs fragment reassembly with
1379 * the help of ipf_preprocess_conntrack().
a489b168
DDP
1380 *
1381 * If 'commit' is true, the packets are allowed to create new entries in the
1382 * connection tables. 'setmark', if not NULL, should point to a two
1383 * elements array containing a value and a mask to set the connection mark.
1384 * 'setlabel' behaves similarly for the connection label.*/
1385int
1386conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
a76a37ef 1387 ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
66e4ad8a 1388 const uint32_t *setmark,
a489b168 1389 const struct ovs_key_ct_labels *setlabel,
bd7d93f8 1390 ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper,
94053e66
FA
1391 const struct nat_action_info_t *nat_action_info,
1392 long long now)
a489b168 1393{
4ea96698
DB
1394 ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone,
1395 ct->hash_basis);
1396
43495c45 1397 struct dp_packet *packet;
61ce32b9 1398 struct conn_lookup_ctx ctx;
a489b168 1399
e883448e 1400 DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) {
594570ea
DB
1401 struct conn *conn = packet->md.conn;
1402 if (OVS_UNLIKELY(packet->md.ct_state == CS_INVALID)) {
1403 write_ct_md(packet, zone, NULL, NULL, NULL);
1404 } else if (conn && conn->key.zone == zone && !force
1405 && !get_alg_ctl_type(packet, tp_src, tp_dst, helper)) {
1406 process_one_fast(zone, setmark, setlabel, nat_action_info,
1407 conn, packet);
1408 } else if (OVS_UNLIKELY(!conn_key_extract(ct, packet, dl_type, &ctx,
1409 zone))) {
43495c45
BB
1410 packet->md.ct_state = CS_INVALID;
1411 write_ct_md(packet, zone, NULL, NULL, NULL);
594570ea
DB
1412 } else {
1413 process_one(ct, packet, &ctx, zone, force, commit, now, setmark,
1414 setlabel, nat_action_info, tp_src, tp_dst, helper);
a489b168 1415 }
a489b168
DDP
1416 }
1417
4ea96698
DB
1418 ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type);
1419
a489b168
DDP
1420 return 0;
1421}
1422
1fe178d2
EG
1423void
1424conntrack_clear(struct dp_packet *packet)
1425{
1426 /* According to pkt_metadata_init(), ct_state == 0 is enough to make all of
1427 * the conntrack fields invalid. */
1428 packet->md.ct_state = 0;
594570ea 1429 pkt_metadata_init_conn(&packet->md);
1fe178d2
EG
1430}
1431
a489b168
DDP
1432static void
1433set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
1434{
967bb5c5 1435 ovs_mutex_lock(&conn->lock);
bd5e81a0
DB
1436 if (conn->alg_related) {
1437 pkt->md.ct_mark = conn->mark;
1438 } else {
1439 pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
1440 conn->mark = pkt->md.ct_mark;
1441 }
967bb5c5 1442 ovs_mutex_unlock(&conn->lock);
a489b168
DDP
1443}
1444
1445static void
1446set_label(struct dp_packet *pkt, struct conn *conn,
1447 const struct ovs_key_ct_labels *val,
1448 const struct ovs_key_ct_labels *mask)
1449{
967bb5c5 1450 ovs_mutex_lock(&conn->lock);
bd5e81a0
DB
1451 if (conn->alg_related) {
1452 pkt->md.ct_label = conn->label;
1453 } else {
1454 ovs_u128 v, m;
a489b168 1455
bd5e81a0
DB
1456 memcpy(&v, val, sizeof v);
1457 memcpy(&m, mask, sizeof m);
a489b168 1458
bd5e81a0 1459 pkt->md.ct_label.u64.lo = v.u64.lo
a489b168 1460 | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
bd5e81a0 1461 pkt->md.ct_label.u64.hi = v.u64.hi
a489b168 1462 | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
bd5e81a0
DB
1463 conn->label = pkt->md.ct_label;
1464 }
967bb5c5 1465 ovs_mutex_unlock(&conn->lock);
a489b168 1466}
286de272 1467
a489b168 1468\f
e6ef6cc6
DDP
1469/* Delete the expired connections from 'ctb', up to 'limit'. Returns the
1470 * earliest expiration time among the remaining connections in 'ctb'. Returns
1471 * LLONG_MAX if 'ctb' is empty. The return value might be smaller than 'now',
1472 * if 'limit' is reached */
1473static long long
967bb5c5 1474ct_sweep(struct conntrack *ct, long long now, size_t limit)
e6ef6cc6
DDP
1475{
1476 struct conn *conn, *next;
1477 long long min_expiration = LLONG_MAX;
e6ef6cc6
DDP
1478 size_t count = 0;
1479
967bb5c5
DB
1480 ovs_mutex_lock(&ct->ct_lock);
1481
dec0dbbc 1482 for (unsigned i = 0; i < N_CT_TM; i++) {
967bb5c5
DB
1483 LIST_FOR_EACH_SAFE (conn, next, exp_node, &ct->exp_lists[i]) {
1484 ovs_mutex_lock(&conn->lock);
1485 if (now < conn->expiration || count >= limit) {
a720a7fa 1486 min_expiration = MIN(min_expiration, conn->expiration);
967bb5c5 1487 ovs_mutex_unlock(&conn->lock);
a720a7fa
DB
1488 if (count >= limit) {
1489 /* Do not check other lists. */
1490 COVERAGE_INC(conntrack_long_cleanup);
967bb5c5 1491 goto out;
e6ef6cc6 1492 }
a720a7fa 1493 break;
967bb5c5
DB
1494 } else {
1495 ovs_mutex_unlock(&conn->lock);
1496 conn_clean(ct, conn);
e6ef6cc6 1497 }
a720a7fa 1498 count++;
e6ef6cc6
DDP
1499 }
1500 }
967bb5c5
DB
1501
1502out:
1503 VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec", count,
1504 time_msec() - now);
1505 ovs_mutex_unlock(&ct->ct_lock);
e6ef6cc6
DDP
1506 return min_expiration;
1507}
1508
1509/* Cleans up old connection entries from 'ct'. Returns the time when the
1510 * next expiration might happen. The return value might be smaller than
1511 * 'now', meaning that an internal limit has been reached, and some expired
1512 * connections have not been deleted. */
1513static long long
1514conntrack_clean(struct conntrack *ct, long long now)
1515{
e6ef6cc6 1516 unsigned int n_conn_limit;
e6ef6cc6 1517 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
967bb5c5
DB
1518 size_t clean_max = n_conn_limit > 10 ? n_conn_limit / 10 : 1;
1519 long long min_exp = ct_sweep(ct, now, clean_max);
1520 long long next_wakeup = MIN(min_exp, now + CT_TM_MIN);
e6ef6cc6
DDP
1521
1522 return next_wakeup;
1523}
1524
1525/* Cleanup:
e6ef6cc6
DDP
1526 *
1527 * We must call conntrack_clean() periodically. conntrack_clean()'s return
1528 * value gives a hint on when the next cleanup must be done (either because
1529 * there is an actual connection that expires, or because a new connection
1530 * might be created with the minimum timeout).
1531 *
1532 * The logic below has two goals:
1533 *
6c54734e
DDP
1534 * - We want to reduce the number of wakeups and batch connection cleanup
1535 * when the load is not very high. CT_CLEAN_INTERVAL ensures that if we
1536 * are coping with the current cleanup tasks, then we wait at least
1537 * 5 seconds to do further cleanup.
e6ef6cc6 1538 *
967bb5c5 1539 * - We don't want to keep the map locked too long, as we might prevent
6c54734e 1540 * traffic from flowing. CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
967bb5c5 1541 * behind, there is at least some 200ms blocks of time when the map will be
6c54734e 1542 * left alone, so the datapath can operate unhindered.
e6ef6cc6
DDP
1543 */
1544#define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
1545#define CT_CLEAN_MIN_INTERVAL 200 /* 0.2 seconds */
1546
1547static void *
1548clean_thread_main(void *f_)
1549{
1550 struct conntrack *ct = f_;
1551
1552 while (!latch_is_set(&ct->clean_thread_exit)) {
1553 long long next_wake;
1554 long long now = time_msec();
e6ef6cc6
DDP
1555 next_wake = conntrack_clean(ct, now);
1556
1557 if (next_wake < now) {
1558 poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
1559 } else {
1560 poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
1561 }
1562 latch_wait(&ct->clean_thread_exit);
1563 poll_block();
1564 }
1565
1566 return NULL;
1567}
1568\f
e917d3ee
DB
1569/* 'Data' is a pointer to the beginning of the L3 header and 'new_data' is
1570 * used to store a pointer to the first byte after the L3 header. 'Size' is
1571 * the size of the packet beyond the data pointer. */
a489b168
DDP
1572static inline bool
1573extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
1574 const char **new_data, bool validate_checksum)
1575{
e917d3ee
DB
1576 if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
1577 return false;
a489b168
DDP
1578 }
1579
dec0dbbc
DB
1580 const struct ip_header *ip = data;
1581 size_t ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
a489b168 1582
e917d3ee
DB
1583 if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
1584 return false;
1585 }
a489b168 1586
e917d3ee
DB
1587 if (OVS_UNLIKELY(size < ip_len)) {
1588 return false;
1589 }
a489b168 1590
e917d3ee
DB
1591 if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
1592 return false;
a489b168
DDP
1593 }
1594
1595 if (validate_checksum && csum(data, ip_len) != 0) {
1596 return false;
1597 }
1598
e917d3ee
DB
1599 if (new_data) {
1600 *new_data = (char *) data + ip_len;
1601 }
1602
cda1b109
DB
1603 key->src.addr.ipv4 = get_16aligned_be32(&ip->ip_src);
1604 key->dst.addr.ipv4 = get_16aligned_be32(&ip->ip_dst);
a489b168
DDP
1605 key->nw_proto = ip->ip_proto;
1606
1607 return true;
1608}
1609
e917d3ee
DB
1610/* 'Data' is a pointer to the beginning of the L3 header and 'new_data' is
1611 * used to store a pointer to the first byte after the L3 header. 'Size' is
1612 * the size of the packet beyond the data pointer. */
a489b168
DDP
1613static inline bool
1614extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
1615 const char **new_data)
1616{
1617 const struct ovs_16aligned_ip6_hdr *ip6 = data;
286de272 1618
e917d3ee
DB
1619 if (OVS_UNLIKELY(size < sizeof *ip6)) {
1620 return false;
a489b168
DDP
1621 }
1622
1623 data = ip6 + 1;
1624 size -= sizeof *ip6;
dec0dbbc
DB
1625 uint8_t nw_proto = ip6->ip6_nxt;
1626 uint8_t nw_frag = 0;
a489b168 1627
523464ab
DB
1628 const struct ovs_16aligned_ip6_frag *frag_hdr;
1629 if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, &frag_hdr)) {
a489b168
DDP
1630 return false;
1631 }
1632
a489b168
DDP
1633 if (nw_frag) {
1634 return false;
1635 }
1636
c8b1ad49
DB
1637 if (new_data) {
1638 *new_data = data;
1639 }
1640
cda1b109
DB
1641 memcpy(&key->src.addr.ipv6, &ip6->ip6_src, sizeof key->src.addr);
1642 memcpy(&key->dst.addr.ipv6, &ip6->ip6_dst, sizeof key->dst.addr);
a489b168
DDP
1643 key->nw_proto = nw_proto;
1644
1645 return true;
1646}
1647
1648static inline bool
1649checksum_valid(const struct conn_key *key, const void *data, size_t size,
1650 const void *l3)
1651{
a489b168 1652 if (key->dl_type == htons(ETH_TYPE_IP)) {
76d85771
DB
1653 uint32_t csum = packet_csum_pseudoheader(l3);
1654 return csum_finish(csum_continue(csum, data, size)) == 0;
a489b168 1655 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
76d85771 1656 return packet_csum_upperlayer6(l3, data, key->nw_proto, size) == 0;
a489b168
DDP
1657 } else {
1658 return false;
1659 }
a489b168
DDP
1660}
1661
1662static inline bool
1663check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
324459a3 1664 const void *l3, bool validate_checksum)
a489b168
DDP
1665{
1666 const struct tcp_header *tcp = data;
40225b0c
BP
1667 if (size < sizeof *tcp) {
1668 return false;
1669 }
a489b168 1670
40225b0c 1671 size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
a489b168
DDP
1672 if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
1673 return false;
1674 }
1675
324459a3 1676 return validate_checksum ? checksum_valid(key, data, size, l3) : true;
a489b168
DDP
1677}
1678
1679static inline bool
1680check_l4_udp(const struct conn_key *key, const void *data, size_t size,
324459a3 1681 const void *l3, bool validate_checksum)
a489b168
DDP
1682{
1683 const struct udp_header *udp = data;
40225b0c
BP
1684 if (size < sizeof *udp) {
1685 return false;
1686 }
a489b168 1687
40225b0c 1688 size_t udp_len = ntohs(udp->udp_len);
a489b168
DDP
1689 if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
1690 return false;
1691 }
1692
1693 /* Validation must be skipped if checksum is 0 on IPv4 packets */
1694 return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
324459a3 1695 || (validate_checksum ? checksum_valid(key, data, size, l3) : true);
a489b168
DDP
1696}
1697
/* Returns true if the ICMP message of 'size' bytes at 'data' has a valid
 * checksum, or unconditionally when 'validate_checksum' is false. */
static inline bool
check_l4_icmp(const void *data, size_t size, bool validate_checksum)
{
    if (!validate_checksum) {
        return true;
    }
    return csum(data, size) == 0;
}
1703
/* Returns true if the ICMPv6 message of 'size' bytes at 'data' passes
 * checksum_valid() against the L3 header 'l3', or unconditionally when
 * 'validate_checksum' is false. */
static inline bool
check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
               const void *l3, bool validate_checksum)
{
    if (!validate_checksum) {
        return true;
    }
    return checksum_valid(key, data, size, l3);
}
1710
1711static inline bool
6c2a9306
DB
1712extract_l4_tcp(struct conn_key *key, const void *data, size_t size,
1713 size_t *chk_len)
a489b168 1714{
6c2a9306 1715 if (OVS_UNLIKELY(size < (chk_len ? *chk_len : TCP_HEADER_LEN))) {
a489b168
DDP
1716 return false;
1717 }
1718
dec0dbbc 1719 const struct tcp_header *tcp = data;
a489b168
DDP
1720 key->src.port = tcp->tcp_src;
1721 key->dst.port = tcp->tcp_dst;
1722
1723 /* Port 0 is invalid */
1724 return key->src.port && key->dst.port;
1725}
1726
1727static inline bool
6c2a9306
DB
1728extract_l4_udp(struct conn_key *key, const void *data, size_t size,
1729 size_t *chk_len)
a489b168 1730{
6c2a9306 1731 if (OVS_UNLIKELY(size < (chk_len ? *chk_len : UDP_HEADER_LEN))) {
a489b168
DDP
1732 return false;
1733 }
1734
dec0dbbc 1735 const struct udp_header *udp = data;
a489b168
DDP
1736 key->src.port = udp->udp_src;
1737 key->dst.port = udp->udp_dst;
1738
1739 /* Port 0 is invalid */
1740 return key->src.port && key->dst.port;
1741}
1742
1743static inline bool extract_l4(struct conn_key *key, const void *data,
324459a3 1744 size_t size, bool *related, const void *l3,
6c2a9306 1745 bool validate_checksum, size_t *chk_len);
a489b168 1746
b269a122
DDP
1747static uint8_t
1748reverse_icmp_type(uint8_t type)
1749{
1750 switch (type) {
1751 case ICMP4_ECHO_REQUEST:
1752 return ICMP4_ECHO_REPLY;
1753 case ICMP4_ECHO_REPLY:
1754 return ICMP4_ECHO_REQUEST;
1755
1756 case ICMP4_TIMESTAMP:
1757 return ICMP4_TIMESTAMPREPLY;
1758 case ICMP4_TIMESTAMPREPLY:
1759 return ICMP4_TIMESTAMP;
1760
1761 case ICMP4_INFOREQUEST:
1762 return ICMP4_INFOREPLY;
1763 case ICMP4_INFOREPLY:
1764 return ICMP4_INFOREQUEST;
1765 default:
1766 OVS_NOT_REACHED();
1767 }
1768}
1769
a489b168
DDP
1770/* If 'related' is not NULL and the function is processing an ICMP
1771 * error packet, extract the l3 and l4 fields from the nested header
1772 * instead and set *related to true. If 'related' is NULL we're
1773 * already processing a nested header and no such recursion is
1774 * possible */
1775static inline int
1776extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
6c2a9306 1777 bool *related, size_t *chk_len)
a489b168 1778{
6c2a9306 1779 if (OVS_UNLIKELY(size < (chk_len ? *chk_len : ICMP_HEADER_LEN))) {
a489b168
DDP
1780 return false;
1781 }
1782
dec0dbbc
DB
1783 const struct icmp_header *icmp = data;
1784
a489b168
DDP
1785 switch (icmp->icmp_type) {
1786 case ICMP4_ECHO_REQUEST:
1787 case ICMP4_ECHO_REPLY:
1788 case ICMP4_TIMESTAMP:
1789 case ICMP4_TIMESTAMPREPLY:
1790 case ICMP4_INFOREQUEST:
1791 case ICMP4_INFOREPLY:
b269a122
DDP
1792 if (icmp->icmp_code != 0) {
1793 return false;
1794 }
a489b168 1795 /* Separate ICMP connection: identified using id */
b269a122
DDP
1796 key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
1797 key->src.icmp_type = icmp->icmp_type;
1798 key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
a489b168
DDP
1799 break;
1800 case ICMP4_DST_UNREACH:
1801 case ICMP4_TIME_EXCEEDED:
1802 case ICMP4_PARAM_PROB:
1803 case ICMP4_SOURCEQUENCH:
1804 case ICMP4_REDIRECT: {
1805 /* ICMP packet part of another connection. We should
1806 * extract the key from embedded packet header */
1807 struct conn_key inner_key;
1808 const char *l3 = (const char *) (icmp + 1);
1809 const char *tail = (const char *) data + size;
1810 const char *l4;
a489b168
DDP
1811
1812 if (!related) {
1813 return false;
1814 }
1815
1816 memset(&inner_key, 0, sizeof inner_key);
1817 inner_key.dl_type = htons(ETH_TYPE_IP);
dec0dbbc 1818 bool ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
a489b168
DDP
1819 if (!ok) {
1820 return false;
1821 }
1822
cda1b109 1823 if (inner_key.src.addr.ipv4 != key->dst.addr.ipv4) {
a489b168
DDP
1824 return false;
1825 }
1826
1827 key->src = inner_key.src;
1828 key->dst = inner_key.dst;
1829 key->nw_proto = inner_key.nw_proto;
6c2a9306 1830 size_t check_len = ICMP_ERROR_DATA_L4_LEN;
a489b168 1831
6c2a9306 1832 ok = extract_l4(key, l4, tail - l4, NULL, l3, false, &check_len);
a489b168
DDP
1833 if (ok) {
1834 conn_key_reverse(key);
1835 *related = true;
1836 }
1837 return ok;
1838 }
1839 default:
1840 return false;
1841 }
1842
1843 return true;
1844}
1845
b269a122
DDP
/* Maps an ICMPv6 echo request/reply 'type' onto the type of the message that
 * travels in the opposite direction.  Any other type reaches
 * OVS_NOT_REACHED(), so callers must only pass one of the two echo types. */
static uint8_t
reverse_icmp6_type(uint8_t type)
{
    uint8_t opposite;

    switch (type) {
    case ICMP6_ECHO_REQUEST:
        opposite = ICMP6_ECHO_REPLY;
        break;
    case ICMP6_ECHO_REPLY:
        opposite = ICMP6_ECHO_REQUEST;
        break;
    default:
        OVS_NOT_REACHED();
    }

    return opposite;
}
1858
a489b168
DDP
1859/* If 'related' is not NULL and the function is processing an ICMP
1860 * error packet, extract the l3 and l4 fields from the nested header
1861 * instead and set *related to true. If 'related' is NULL we're
1862 * already processing a nested header and no such recursion is
1863 * possible */
1864static inline bool
1865extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
1866 bool *related)
1867{
1868 const struct icmp6_header *icmp6 = data;
1869
1870 /* All the messages that we support need at least 4 bytes after
1871 * the header */
1872 if (size < sizeof *icmp6 + 4) {
1873 return false;
1874 }
1875
1876 switch (icmp6->icmp6_type) {
1877 case ICMP6_ECHO_REQUEST:
1878 case ICMP6_ECHO_REPLY:
b269a122
DDP
1879 if (icmp6->icmp6_code != 0) {
1880 return false;
1881 }
a489b168 1882 /* Separate ICMP connection: identified using id */
b269a122
DDP
1883 key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
1884 key->src.icmp_type = icmp6->icmp6_type;
1885 key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
a489b168
DDP
1886 break;
1887 case ICMP6_DST_UNREACH:
1888 case ICMP6_PACKET_TOO_BIG:
1889 case ICMP6_TIME_EXCEEDED:
1890 case ICMP6_PARAM_PROB: {
1891 /* ICMP packet part of another connection. We should
1892 * extract the key from embedded packet header */
1893 struct conn_key inner_key;
1894 const char *l3 = (const char *) icmp6 + 8;
1895 const char *tail = (const char *) data + size;
1896 const char *l4 = NULL;
a489b168
DDP
1897
1898 if (!related) {
1899 return false;
1900 }
1901
1902 memset(&inner_key, 0, sizeof inner_key);
1903 inner_key.dl_type = htons(ETH_TYPE_IPV6);
dec0dbbc 1904 bool ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
a489b168
DDP
1905 if (!ok) {
1906 return false;
1907 }
1908
1909 /* pf doesn't do this, but it seems a good idea */
cda1b109
DB
1910 if (!ipv6_addr_equals(&inner_key.src.addr.ipv6,
1911 &key->dst.addr.ipv6)) {
a489b168
DDP
1912 return false;
1913 }
1914
1915 key->src = inner_key.src;
1916 key->dst = inner_key.dst;
1917 key->nw_proto = inner_key.nw_proto;
1918
6c2a9306 1919 ok = extract_l4(key, l4, tail - l4, NULL, l3, false, NULL);
a489b168
DDP
1920 if (ok) {
1921 conn_key_reverse(key);
1922 *related = true;
1923 }
1924 return ok;
1925 }
1926 default:
1927 return false;
1928 }
1929
1930 return true;
1931}
1932
1933/* Extract l4 fields into 'key', which must already contain valid l3
1934 * members.
1935 *
1936 * If 'related' is not NULL and an ICMP error packet is being
1937 * processed, the function will extract the key from the packet nested
1401f6de 1938 * in the ICMP payload and set '*related' to true.
a489b168 1939 *
9171c635
DB
1940 * 'size' here is the layer 4 size, which can be a nested size if parsing
1941 * an ICMP or ICMP6 header.
1942 *
a489b168 1943 * If 'related' is NULL, it means that we're already parsing a header nested
6c2a9306
DB
1944 * in an ICMP error. In this case, we skip the checksum and some length
1945 * validations. */
a489b168
DDP
1946static inline bool
1947extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
6c2a9306 1948 const void *l3, bool validate_checksum, size_t *chk_len)
a489b168
DDP
1949{
1950 if (key->nw_proto == IPPROTO_TCP) {
324459a3 1951 return (!related || check_l4_tcp(key, data, size, l3,
6c2a9306
DB
1952 validate_checksum))
1953 && extract_l4_tcp(key, data, size, chk_len);
a489b168 1954 } else if (key->nw_proto == IPPROTO_UDP) {
324459a3 1955 return (!related || check_l4_udp(key, data, size, l3,
6c2a9306
DB
1956 validate_checksum))
1957 && extract_l4_udp(key, data, size, chk_len);
a489b168
DDP
1958 } else if (key->dl_type == htons(ETH_TYPE_IP)
1959 && key->nw_proto == IPPROTO_ICMP) {
324459a3 1960 return (!related || check_l4_icmp(data, size, validate_checksum))
6c2a9306 1961 && extract_l4_icmp(key, data, size, related, chk_len);
a489b168
DDP
1962 } else if (key->dl_type == htons(ETH_TYPE_IPV6)
1963 && key->nw_proto == IPPROTO_ICMPV6) {
324459a3 1964 return (!related || check_l4_icmp6(key, data, size, l3,
6c2a9306
DB
1965 validate_checksum))
1966 && extract_l4_icmp6(key, data, size, related);
a489b168
DDP
1967 } else {
1968 return false;
1969 }
1970}
1971
1972static bool
66e4ad8a 1973conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
a489b168
DDP
1974 struct conn_lookup_ctx *ctx, uint16_t zone)
1975{
2482b0b0 1976 const struct eth_header *l2 = dp_packet_eth(pkt);
a489b168
DDP
1977 const struct ip_header *l3 = dp_packet_l3(pkt);
1978 const char *l4 = dp_packet_l4(pkt);
a489b168
DDP
1979
1980 memset(ctx, 0, sizeof *ctx);
1981
1982 if (!l2 || !l3 || !l4) {
1983 return false;
1984 }
1985
1986 ctx->key.zone = zone;
1987
1988 /* XXX In this function we parse the packet (again, it has already
1989 * gone through miniflow_extract()) for two reasons:
1990 *
1991 * 1) To extract the l3 addresses and l4 ports.
1992 * We already have the l3 and l4 headers' pointers. Extracting
1993 * the l3 addresses and the l4 ports is really cheap, since they
1994 * can be found at fixed locations.
66e4ad8a
DDP
1995 * 2) To extract the l4 type.
1996 * Extracting the l4 types, for IPv6 can be quite expensive, because
1997 * it's not at a fixed location.
a489b168
DDP
1998 *
1999 * Here's a way to avoid (2) with the help of the datapath.
66e4ad8a 2000 * The datapath doesn't keep the packet's extracted flow[1], so
a489b168 2001 * using that is not an option. We could use the packet's matching
66e4ad8a
DDP
2002 * megaflow, but we have to make sure that the l4 type (nw_proto)
2003 * is unwildcarded. This means either:
a489b168 2004 *
66e4ad8a
DDP
2005 * a) dpif-netdev unwildcards the l4 type when a new flow is installed
2006 * if the actions contains ct().
a489b168 2007 *
66e4ad8a
DDP
2008 * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
2009 * action. This is already done in different actions, but it's
2010 * unnecessary for the kernel.
a489b168
DDP
2011 *
2012 * ---
66e4ad8a 2013 * [1] The reasons for this are that keeping the flow increases
a489b168
DDP
2014 * (slightly) the cache footprint and increases computation
2015 * time as we move the packet around. Most importantly, the flow
2016 * should be updated by the actions and this can be slow, as
2017 * we use a sparse representation (miniflow).
2018 *
2019 */
dec0dbbc 2020 bool ok;
66e4ad8a 2021 ctx->key.dl_type = dl_type;
dec0dbbc 2022
a489b168 2023 if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
dec0dbbc 2024 bool hwol_bad_l3_csum = dp_packet_ip_checksum_bad(pkt);
324459a3
SC
2025 if (hwol_bad_l3_csum) {
2026 ok = false;
2027 } else {
29cf9c1b
FL
2028 bool hwol_good_l3_csum = dp_packet_ip_checksum_valid(pkt)
2029 || dp_packet_hwol_is_ipv4(pkt);
324459a3 2030 /* Validate the checksum only when hwol is not supported. */
9171c635 2031 ok = extract_l3_ipv4(&ctx->key, l3, dp_packet_l3_size(pkt), NULL,
324459a3
SC
2032 !hwol_good_l3_csum);
2033 }
a489b168 2034 } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
9171c635 2035 ok = extract_l3_ipv6(&ctx->key, l3, dp_packet_l3_size(pkt), NULL);
a489b168
DDP
2036 } else {
2037 ok = false;
2038 }
2039
2040 if (ok) {
324459a3
SC
2041 bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt);
2042 if (!hwol_bad_l4_csum) {
29cf9c1b
FL
2043 bool hwol_good_l4_csum = dp_packet_l4_checksum_valid(pkt)
2044 || dp_packet_hwol_tx_l4_checksum(pkt);
324459a3 2045 /* Validate the checksum only when hwol is not supported. */
9171c635 2046 if (extract_l4(&ctx->key, l4, dp_packet_l4_size(pkt),
6c2a9306
DB
2047 &ctx->icmp_related, l3, !hwol_good_l4_csum,
2048 NULL)) {
324459a3
SC
2049 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
2050 return true;
2051 }
a489b168
DDP
2052 }
2053 }
2054
2055 return false;
2056}
92edd073
DB
2057
2058static uint32_t
cda1b109 2059ct_addr_hash_add(uint32_t hash, const union ct_addr *addr)
92edd073
DB
2060{
2061 BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
2062 return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
2063}
2064
2065static uint32_t
2066ct_endpoint_hash_add(uint32_t hash, const struct ct_endpoint *ep)
2067{
2068 BUILD_ASSERT_DECL(sizeof *ep % 4 == 0);
2069 return hash_add_bytes32(hash, (const uint32_t *) ep, sizeof *ep);
2070}
a489b168
DDP
2071\f
2072/* Symmetric */
2073static uint32_t
2074conn_key_hash(const struct conn_key *key, uint32_t basis)
2075{
2076 uint32_t hsrc, hdst, hash;
a489b168 2077 hsrc = hdst = basis;
6b1d4625
DB
2078 hsrc = ct_endpoint_hash_add(hsrc, &key->src);
2079 hdst = ct_endpoint_hash_add(hdst, &key->dst);
a489b168
DDP
2080
2081 /* Even if source and destination are swapped the hash will be the same. */
2082 hash = hsrc ^ hdst;
2083
2084 /* Hash the rest of the key(L3 and L4 types and zone). */
763b40b0 2085 return hash_words((uint32_t *) (&key->dst + 1),
a489b168
DDP
2086 (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
2087 hash);
a489b168
DDP
2088}
2089
2090static void
2091conn_key_reverse(struct conn_key *key)
2092{
dec0dbbc 2093 struct ct_endpoint tmp = key->src;
a489b168
DDP
2094 key->src = key->dst;
2095 key->dst = tmp;
2096}
2097
286de272 2098static uint32_t
cda1b109 2099nat_ipv6_addrs_delta(struct in6_addr *ipv6_min, struct in6_addr *ipv6_max)
286de272 2100{
cda1b109
DB
2101 uint8_t *ipv6_min_hi = &ipv6_min->s6_addr[0];
2102 uint8_t *ipv6_min_lo = &ipv6_min->s6_addr[0] + sizeof(uint64_t);
2103 uint8_t *ipv6_max_hi = &ipv6_max->s6_addr[0];
2104 uint8_t *ipv6_max_lo = &ipv6_max->s6_addr[0] + sizeof(uint64_t);
286de272
DB
2105
2106 ovs_be64 addr6_64_min_hi;
2107 ovs_be64 addr6_64_min_lo;
2108 memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
2109 memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);
2110
2111 ovs_be64 addr6_64_max_hi;
2112 ovs_be64 addr6_64_max_lo;
2113 memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
2114 memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);
2115
2116 uint64_t diff;
dec0dbbc 2117
286de272
DB
2118 if (addr6_64_min_hi == addr6_64_max_hi &&
2119 ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
2120 diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
2121 } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
2122 ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
2123 diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
2124 ntohll(addr6_64_max_lo) - 1);
2125 } else {
2126 /* Limit address delta supported to 32 bits or 4 billion approximately.
2127 * Possibly, this should be visible to the user through a datapath
2128 * support check, however the practical impact is probably nil. */
2129 diff = 0xfffffffe;
2130 }
dec0dbbc 2131
286de272
DB
2132 if (diff > 0xfffffffe) {
2133 diff = 0xfffffffe;
2134 }
2135 return diff;
2136}
2137
2138/* This function must be used in tandem with nat_ipv6_addrs_delta(), which
2139 * restricts the input parameters. */
a489b168 2140static void
cda1b109 2141nat_ipv6_addr_increment(struct in6_addr *ipv6, uint32_t increment)
286de272 2142{
cda1b109
DB
2143 uint8_t *ipv6_hi = &ipv6->s6_addr[0];
2144 uint8_t *ipv6_lo = &ipv6->s6_addr[0] + sizeof(ovs_be64);
286de272
DB
2145 ovs_be64 addr6_64_hi;
2146 ovs_be64 addr6_64_lo;
2147 memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
2148 memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);
2149
2150 if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
2151 addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
2152 } else if (addr6_64_hi != OVS_BE64_MAX) {
2153 addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
2154 addr6_64_lo = htonll(increment - (UINT64_MAX -
2155 ntohll(addr6_64_lo) + 1));
2156 } else {
2157 OVS_NOT_REACHED();
2158 }
2159
2160 memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
2161 memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
286de272
DB
2162}
2163
2164static uint32_t
2165nat_range_hash(const struct conn *conn, uint32_t basis)
2166{
2167 uint32_t hash = basis;
286de272 2168
92edd073
DB
2169 hash = ct_addr_hash_add(hash, &conn->nat_info->min_addr);
2170 hash = ct_addr_hash_add(hash, &conn->nat_info->max_addr);
2171 hash = hash_add(hash,
2172 (conn->nat_info->max_port << 16)
2173 | conn->nat_info->min_port);
92edd073
DB
2174 hash = ct_endpoint_hash_add(hash, &conn->key.src);
2175 hash = ct_endpoint_hash_add(hash, &conn->key.dst);
286de272
DB
2176 hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
2177 hash = hash_add(hash, conn->key.nw_proto);
2178 hash = hash_add(hash, conn->key.zone);
92edd073
DB
2179
2180 /* The purpose of the second parameter is to distinguish hashes of data of
2181 * different length; our data always has the same length so there is no
2182 * value in counting. */
2183 return hash_finish(hash, 0);
286de272
DB
2184}
2185
2186static bool
2187nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
2188 struct conn *nat_conn)
2189{
bd5e81a0
DB
2190 enum { MIN_NAT_EPHEMERAL_PORT = 1024,
2191 MAX_NAT_EPHEMERAL_PORT = 65535 };
286de272
DB
2192
2193 uint16_t min_port;
2194 uint16_t max_port;
2195 uint16_t first_port;
286de272
DB
2196 uint32_t hash = nat_range_hash(conn, ct->hash_basis);
2197
2198 if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
2199 (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
2200 min_port = ntohs(conn->key.src.port);
2201 max_port = ntohs(conn->key.src.port);
2202 first_port = min_port;
2203 } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
2204 (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
2205 min_port = ntohs(conn->key.dst.port);
2206 max_port = ntohs(conn->key.dst.port);
2207 first_port = min_port;
2208 } else {
2209 uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
2210 uint32_t port_index = hash % (deltap + 1);
2211 first_port = conn->nat_info->min_port + port_index;
2212 min_port = conn->nat_info->min_port;
2213 max_port = conn->nat_info->max_port;
2214 }
2215
2216 uint32_t deltaa = 0;
2217 uint32_t address_index;
cda1b109 2218 union ct_addr ct_addr;
286de272 2219 memset(&ct_addr, 0, sizeof ct_addr);
cda1b109 2220 union ct_addr max_ct_addr;
286de272
DB
2221 memset(&max_ct_addr, 0, sizeof max_ct_addr);
2222 max_ct_addr = conn->nat_info->max_addr;
2223
2224 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
cda1b109
DB
2225 deltaa = ntohl(conn->nat_info->max_addr.ipv4) -
2226 ntohl(conn->nat_info->min_addr.ipv4);
286de272 2227 address_index = hash % (deltaa + 1);
cda1b109
DB
2228 ct_addr.ipv4 = htonl(
2229 ntohl(conn->nat_info->min_addr.ipv4) + address_index);
286de272 2230 } else {
cda1b109
DB
2231 deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6,
2232 &conn->nat_info->max_addr.ipv6);
286de272
DB
2233 /* deltaa must be within 32 bits for full hash coverage. A 64 or
2234 * 128 bit hash is unnecessary and hence not used here. Most code
2235 * is kept common with V4; nat_ipv6_addrs_delta() will do the
2236 * enforcement via max_ct_addr. */
2237 max_ct_addr = conn->nat_info->min_addr;
cda1b109 2238 nat_ipv6_addr_increment(&max_ct_addr.ipv6, deltaa);
286de272 2239 address_index = hash % (deltaa + 1);
cda1b109
DB
2240 ct_addr.ipv6 = conn->nat_info->min_addr.ipv6;
2241 nat_ipv6_addr_increment(&ct_addr.ipv6, address_index);
286de272
DB
2242 }
2243
2244 uint16_t port = first_port;
2245 bool all_ports_tried = false;
32b2c81f
DB
2246 /* For DNAT or for specified port ranges, we don't use ephemeral ports. */
2247 bool ephemeral_ports_tried
2248 = conn->nat_info->nat_action & NAT_ACTION_DST ||
2249 conn->nat_info->nat_action & NAT_ACTION_SRC_PORT
2250 ? true : false;
cda1b109 2251 union ct_addr first_addr = ct_addr;
4cd0481c
DB
2252 bool pat_enabled = conn->key.nw_proto != IPPROTO_ICMP &&
2253 conn->key.nw_proto != IPPROTO_ICMPV6;
286de272
DB
2254
2255 while (true) {
2256 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
2257 nat_conn->rev_key.dst.addr = ct_addr;
e32cd4c6 2258 if (pat_enabled) {
2259 nat_conn->rev_key.dst.port = htons(port);
2260 }
286de272 2261 } else {
1c8689d7 2262 nat_conn->rev_key.src.addr = ct_addr;
e32cd4c6 2263 if (pat_enabled) {
2264 nat_conn->rev_key.src.port = htons(port);
2265 }
286de272
DB
2266 }
2267
e32cd4c6 2268 bool found = conn_lookup(ct, &nat_conn->rev_key, time_msec(), NULL,
2269 NULL);
967bb5c5 2270 if (!found) {
286de272 2271 return true;
4cd0481c 2272 } else if (pat_enabled && !all_ports_tried) {
286de272
DB
2273 if (min_port == max_port) {
2274 all_ports_tried = true;
2275 } else if (port == max_port) {
2276 port = min_port;
2277 } else {
2278 port++;
2279 }
2280 if (port == first_port) {
2281 all_ports_tried = true;
2282 }
2283 } else {
2284 if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
2285 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
cda1b109 2286 ct_addr.ipv4 = htonl(ntohl(ct_addr.ipv4) + 1);
286de272 2287 } else {
cda1b109 2288 nat_ipv6_addr_increment(&ct_addr.ipv6, 1);
286de272
DB
2289 }
2290 } else {
2291 ct_addr = conn->nat_info->min_addr;
2292 }
2293 if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
4cd0481c 2294 if (pat_enabled && !ephemeral_ports_tried) {
ac04639a 2295 ephemeral_ports_tried = true;
286de272 2296 ct_addr = conn->nat_info->min_addr;
8417e688 2297 first_addr = ct_addr;
286de272
DB
2298 min_port = MIN_NAT_EPHEMERAL_PORT;
2299 max_port = MAX_NAT_EPHEMERAL_PORT;
2300 } else {
2301 break;
2302 }
2303 }
2304 first_port = min_port;
2305 port = first_port;
2306 all_ports_tried = false;
2307 }
2308 }
2309 return false;
2310}
2311
a489b168 2312static enum ct_update_res
967bb5c5
DB
2313conn_update(struct conntrack *ct, struct conn *conn, struct dp_packet *pkt,
2314 struct conn_lookup_ctx *ctx, long long now)
a489b168 2315{
967bb5c5
DB
2316 ovs_mutex_lock(&conn->lock);
2317 enum ct_update_res update_res =
2318 l4_protos[conn->key.nw_proto]->conn_update(ct, conn, pkt, ctx->reply,
2319 now);
2320 ovs_mutex_unlock(&conn->lock);
2321 return update_res;
a489b168
DDP
2322}
2323
2324static bool
2325conn_expired(struct conn *conn, long long now)
2326{
286de272 2327 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
967bb5c5
DB
2328 ovs_mutex_lock(&conn->lock);
2329 bool expired = now >= conn->expiration ? true : false;
2330 ovs_mutex_unlock(&conn->lock);
2331 return expired;
286de272
DB
2332 }
2333 return false;
a489b168
DDP
2334}
2335
2336static bool
2337valid_new(struct dp_packet *pkt, struct conn_key *key)
2338{
2339 return l4_protos[key->nw_proto]->valid_new(pkt);
2340}
2341
2342static struct conn *
967bb5c5
DB
2343new_conn(struct conntrack *ct, struct dp_packet *pkt, struct conn_key *key,
2344 long long now)
a489b168 2345{
967bb5c5 2346 return l4_protos[key->nw_proto]->new_conn(ct, pkt, now);
a489b168
DDP
2347}
2348
2349static void
967bb5c5 2350delete_conn_cmn(struct conn *conn)
a489b168 2351{
286de272 2352 free(conn->nat_info);
bd5e81a0 2353 free(conn->alg);
a489b168
DDP
2354 free(conn);
2355}
967bb5c5
DB
2356
2357static void
2358delete_conn(struct conn *conn)
2359{
2360 ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
2361 ovs_mutex_destroy(&conn->lock);
2362 free(conn->nat_conn);
2363 delete_conn_cmn(conn);
2364}
2365
2366/* Only used by conn_clean_one(). */
2367static void
2368delete_conn_one(struct conn *conn)
2369{
2370 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
2371 ovs_mutex_destroy(&conn->lock);
2372 }
2373 delete_conn_cmn(conn);
2374}
4d4e68ed 2375\f
271e48a0
YHW
2376/* Convert a conntrack address 'a' into an IP address 'b' based on 'dl_type'.
2377 *
2378 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPv6"
2379 * in network-byte order. */
4d4e68ed 2380static void
cda1b109 2381ct_endpoint_to_ct_dpif_inet_addr(const union ct_addr *a,
4d4e68ed
DDP
2382 union ct_dpif_inet_addr *b,
2383 ovs_be16 dl_type)
2384{
2385 if (dl_type == htons(ETH_TYPE_IP)) {
cda1b109 2386 b->ip = a->ipv4;
4d4e68ed 2387 } else if (dl_type == htons(ETH_TYPE_IPV6)){
cda1b109 2388 b->in6 = a->ipv6;
4d4e68ed
DDP
2389 }
2390}
2391
271e48a0
YHW
2392/* Convert an IP address 'a' into a conntrack address 'b' based on 'dl_type'.
2393 *
2394 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPv6"
2395 * in network-byte order. */
2396static void
2397ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
cda1b109 2398 union ct_addr *b, ovs_be16 dl_type)
271e48a0
YHW
2399{
2400 if (dl_type == htons(ETH_TYPE_IP)) {
cda1b109 2401 b->ipv4 = a->ip;
271e48a0 2402 } else if (dl_type == htons(ETH_TYPE_IPV6)){
cda1b109 2403 b->ipv6 = a->in6;
271e48a0
YHW
2404 }
2405}
2406
4d4e68ed
DDP
2407static void
2408conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
2409{
2410 if (key->dl_type == htons(ETH_TYPE_IP)) {
2411 tuple->l3_type = AF_INET;
2412 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
2413 tuple->l3_type = AF_INET6;
2414 }
2415 tuple->ip_proto = key->nw_proto;
2416 ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
2417 key->dl_type);
2418 ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
2419 key->dl_type);
2420
2421 if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
b269a122
DDP
2422 tuple->icmp_id = key->src.icmp_id;
2423 tuple->icmp_type = key->src.icmp_type;
2424 tuple->icmp_code = key->src.icmp_code;
4d4e68ed
DDP
2425 } else {
2426 tuple->src_port = key->src.port;
2427 tuple->dst_port = key->dst.port;
2428 }
2429}
2430
271e48a0
YHW
2431static void
2432tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone,
2433 struct conn_key *key)
2434{
2435 if (tuple->l3_type == AF_INET) {
2436 key->dl_type = htons(ETH_TYPE_IP);
2437 } else if (tuple->l3_type == AF_INET6) {
2438 key->dl_type = htons(ETH_TYPE_IPV6);
2439 }
2440 key->nw_proto = tuple->ip_proto;
2441 ct_dpif_inet_addr_to_ct_endpoint(&tuple->src, &key->src.addr,
2442 key->dl_type);
2443 ct_dpif_inet_addr_to_ct_endpoint(&tuple->dst, &key->dst.addr,
2444 key->dl_type);
2445
2446 if (tuple->ip_proto == IPPROTO_ICMP || tuple->ip_proto == IPPROTO_ICMPV6) {
2447 key->src.icmp_id = tuple->icmp_id;
2448 key->src.icmp_type = tuple->icmp_type;
2449 key->src.icmp_code = tuple->icmp_code;
2450 key->dst.icmp_id = tuple->icmp_id;
2451 key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type);
2452 key->dst.icmp_code = tuple->icmp_code;
2453 } else {
2454 key->src.port = tuple->src_port;
2455 key->dst.port = tuple->dst_port;
2456 }
2457 key->zone = zone;
2458}
2459
4d4e68ed
DDP
2460static void
2461conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
f1a0469e 2462 long long now)
4d4e68ed 2463{
4d4e68ed
DDP
2464 memset(entry, 0, sizeof *entry);
2465 conn_key_to_tuple(&conn->key, &entry->tuple_orig);
2466 conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
2467
2468 entry->zone = conn->key.zone;
4d4e68ed 2469
967bb5c5
DB
2470 ovs_mutex_lock(&conn->lock);
2471 entry->mark = conn->mark;
286de272 2472 memcpy(&entry->labels, &conn->label, sizeof entry->labels);
4d4e68ed 2473
dec0dbbc 2474 long long expiration = conn->expiration - now;
4d4e68ed 2475
dec0dbbc 2476 struct ct_l4_proto *class = l4_protos[conn->key.nw_proto];
4d4e68ed
DDP
2477 if (class->conn_get_protoinfo) {
2478 class->conn_get_protoinfo(conn, &entry->protoinfo);
2479 }
f1a0469e 2480 ovs_mutex_unlock(&conn->lock);
bd5e81a0 2481
f1a0469e 2482 entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
bd5e81a0
DB
2483
2484 if (conn->alg) {
2485 /* Caller is responsible for freeing. */
2486 entry->helper.name = xstrdup(conn->alg);
2487 }
4d4e68ed
DDP
2488}
2489
4ea96698
DB
2490struct ipf *
2491conntrack_ipf_ctx(struct conntrack *ct)
2492{
2493 return ct->ipf;
2494}
2495
4d4e68ed
DDP
2496int
2497conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
ded30c74 2498 const uint16_t *pzone, int *ptot_bkts)
4d4e68ed
DDP
2499{
2500 memset(dump, 0, sizeof(*dump));
dec0dbbc 2501
4d4e68ed
DDP
2502 if (pzone) {
2503 dump->zone = *pzone;
2504 dump->filter_zone = true;
2505 }
4d4e68ed 2506
dec0dbbc 2507 dump->ct = ct;
967bb5c5 2508 *ptot_bkts = 1; /* Need to clean up the callers. */
4d4e68ed
DDP
2509 return 0;
2510}
2511
2512int
2513conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
2514{
2515 struct conntrack *ct = dump->ct;
2516 long long now = time_msec();
2517
967bb5c5
DB
2518 for (;;) {
2519 struct cmap_node *cm_node = cmap_next_position(&ct->conns,
2520 &dump->cm_pos);
2521 if (!cm_node) {
2522 break;
4d4e68ed 2523 }
967bb5c5
DB
2524 struct conn *conn;
2525 INIT_CONTAINER(conn, cm_node, cm_node);
2526 if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
2527 (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
f1a0469e 2528 conn_to_ct_dpif_entry(conn, entry, now);
4d4e68ed
DDP
2529 return 0;
2530 }
2531 }
967bb5c5 2532
4d4e68ed
DDP
2533 return EOF;
2534}
2535
2536int
2537conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
2538{
2539 return 0;
2540}
5d9cbb4c
DDP
2541
2542int
2543conntrack_flush(struct conntrack *ct, const uint16_t *zone)
2544{
967bb5c5
DB
2545 struct conn *conn;
2546
2547 ovs_mutex_lock(&ct->ct_lock);
2548 CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
2549 if (!zone || *zone == conn->key.zone) {
2550 conn_clean_one(ct, conn);
5d9cbb4c 2551 }
5d9cbb4c 2552 }
967bb5c5 2553 ovs_mutex_unlock(&ct->ct_lock);
bd5e81a0 2554
5d9cbb4c
DDP
2555 return 0;
2556}
bd5e81a0 2557
271e48a0
YHW
2558int
2559conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
2560 uint16_t zone)
2561{
271e48a0 2562 int error = 0;
4048c508
DB
2563 struct conn_key key;
2564 struct conn *conn;
271e48a0 2565
4048c508
DB
2566 memset(&key, 0, sizeof(key));
2567 tuple_to_conn_key(tuple, zone, &key);
967bb5c5 2568 ovs_mutex_lock(&ct->ct_lock);
4048c508 2569 conn_lookup(ct, &key, time_msec(), &conn, NULL);
271e48a0 2570
4048c508
DB
2571 if (conn && conn->conn_type == CT_CONN_TYPE_DEFAULT) {
2572 conn_clean(ct, conn);
271e48a0 2573 } else {
a1d5eeff 2574 VLOG_WARN("Must flush tuple using the original pre-NATed tuple");
271e48a0
YHW
2575 error = ENOENT;
2576 }
967bb5c5
DB
2577
2578 ovs_mutex_unlock(&ct->ct_lock);
271e48a0
YHW
2579 return error;
2580}
2581
c92339ad
DB
2582int
2583conntrack_set_maxconns(struct conntrack *ct, uint32_t maxconns)
2584{
2585 atomic_store_relaxed(&ct->n_conn_limit, maxconns);
2586 return 0;
2587}
2588
2589int
2590conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns)
2591{
2592 atomic_read_relaxed(&ct->n_conn_limit, maxconns);
2593 return 0;
2594}
2595
875075b3
DB
2596int
2597conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns)
2598{
2599 *nconns = atomic_count_get(&ct->n_conn);
2600 return 0;
2601}
2602
64207120
DB
2603int
2604conntrack_set_tcp_seq_chk(struct conntrack *ct, bool enabled)
2605{
2606 atomic_store_relaxed(&ct->tcp_seq_chk, enabled);
2607 return 0;
2608}
2609
2610bool
2611conntrack_get_tcp_seq_chk(struct conntrack *ct)
2612{
2613 bool enabled;
2614 atomic_read_relaxed(&ct->tcp_seq_chk, &enabled);
2615 return enabled;
2616}
2617
bd5e81a0
DB
2618/* This function must be called with the ct->resources read lock taken. */
2619static struct alg_exp_node *
be38342d
DB
2620expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
2621 uint32_t basis, bool src_ip_wc)
bd5e81a0 2622{
c3f6bae2
DB
2623 struct conn_key check_key;
2624 memcpy(&check_key, key, sizeof check_key);
bd5e81a0 2625 check_key.src.port = ALG_WC_SRC_PORT;
dec0dbbc 2626
be38342d
DB
2627 if (src_ip_wc) {
2628 memset(&check_key.src.addr, 0, sizeof check_key.src.addr);
2629 }
dec0dbbc 2630
bd5e81a0
DB
2631 struct alg_exp_node *alg_exp_node;
2632
bd5e81a0 2633 HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
dec0dbbc 2634 conn_key_hash(&check_key, basis),
bd5e81a0
DB
2635 alg_expectations) {
2636 if (!conn_key_cmp(&alg_exp_node->key, &check_key)) {
2637 return alg_exp_node;
2638 }
2639 }
2640 return NULL;
2641}
2642
4417ca3d
DB
2643/* This function must be called with the ct->resources write lock taken. */
2644static void
2645expectation_remove(struct hmap *alg_expectations,
2646 const struct conn_key *key, uint32_t basis)
2647{
2648 struct alg_exp_node *alg_exp_node;
2649
2650 HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node, conn_key_hash(key, basis),
2651 alg_expectations) {
2652 if (!conn_key_cmp(&alg_exp_node->key, key)) {
2653 hmap_remove(alg_expectations, &alg_exp_node->node);
2654 break;
2655 }
2656 }
2657}
2658
2659/* This function must be called with the ct->resources read lock taken. */
2660static struct alg_exp_node *
2661expectation_ref_lookup_unique(const struct hindex *alg_expectation_refs,
2662 const struct conn_key *master_key,
2663 const struct conn_key *alg_exp_key,
2664 uint32_t basis)
2665{
2666 struct alg_exp_node *alg_exp_node;
2667
2668 HINDEX_FOR_EACH_WITH_HASH (alg_exp_node, node_ref,
2669 conn_key_hash(master_key, basis),
2670 alg_expectation_refs) {
2671 if (!conn_key_cmp(&alg_exp_node->master_key, master_key) &&
2672 !conn_key_cmp(&alg_exp_node->key, alg_exp_key)) {
2673 return alg_exp_node;
2674 }
2675 }
2676 return NULL;
2677}
2678
2679/* This function must be called with the ct->resources write lock taken. */
2680static void
2681expectation_ref_create(struct hindex *alg_expectation_refs,
2682 struct alg_exp_node *alg_exp_node,
2683 uint32_t basis)
2684{
2685 if (!expectation_ref_lookup_unique(alg_expectation_refs,
2686 &alg_exp_node->master_key,
2687 &alg_exp_node->key, basis)) {
2688 hindex_insert(alg_expectation_refs, &alg_exp_node->node_ref,
2689 conn_key_hash(&alg_exp_node->master_key, basis));
2690 }
2691}
2692
2693static void
967bb5c5 2694expectation_clean(struct conntrack *ct, const struct conn_key *master_key)
4417ca3d 2695{
967bb5c5 2696 ovs_rwlock_wrlock(&ct->resources_lock);
4417ca3d
DB
2697
2698 struct alg_exp_node *node, *next;
2699 HINDEX_FOR_EACH_WITH_HASH_SAFE (node, next, node_ref,
967bb5c5 2700 conn_key_hash(master_key, ct->hash_basis),
4417ca3d
DB
2701 &ct->alg_expectation_refs) {
2702 if (!conn_key_cmp(&node->master_key, master_key)) {
967bb5c5
DB
2703 expectation_remove(&ct->alg_expectations, &node->key,
2704 ct->hash_basis);
4417ca3d
DB
2705 hindex_remove(&ct->alg_expectation_refs, &node->node_ref);
2706 free(node);
2707 }
2708 }
2709
967bb5c5 2710 ovs_rwlock_unlock(&ct->resources_lock);
4417ca3d
DB
2711}
2712
bd5e81a0 2713static void
be38342d
DB
2714expectation_create(struct conntrack *ct, ovs_be16 dst_port,
2715 const struct conn *master_conn, bool reply, bool src_ip_wc,
2716 bool skip_nat)
bd5e81a0 2717{
cda1b109
DB
2718 union ct_addr src_addr;
2719 union ct_addr dst_addr;
2720 union ct_addr alg_nat_repl_addr;
be38342d 2721 struct alg_exp_node *alg_exp_node = xzalloc(sizeof *alg_exp_node);
bd5e81a0 2722
be38342d 2723 if (reply) {
bd5e81a0
DB
2724 src_addr = master_conn->key.src.addr;
2725 dst_addr = master_conn->key.dst.addr;
efa29a89 2726 alg_exp_node->nat_rpl_dst = true;
be38342d
DB
2727 if (skip_nat) {
2728 alg_nat_repl_addr = dst_addr;
efa29a89
DM
2729 } else if (master_conn->nat_info &&
2730 master_conn->nat_info->nat_action & NAT_ACTION_DST) {
2731 alg_nat_repl_addr = master_conn->rev_key.src.addr;
2732 alg_exp_node->nat_rpl_dst = false;
be38342d
DB
2733 } else {
2734 alg_nat_repl_addr = master_conn->rev_key.dst.addr;
2735 }
be38342d
DB
2736 } else {
2737 src_addr = master_conn->rev_key.src.addr;
2738 dst_addr = master_conn->rev_key.dst.addr;
efa29a89 2739 alg_exp_node->nat_rpl_dst = false;
be38342d
DB
2740 if (skip_nat) {
2741 alg_nat_repl_addr = src_addr;
efa29a89
DM
2742 } else if (master_conn->nat_info &&
2743 master_conn->nat_info->nat_action & NAT_ACTION_DST) {
2744 alg_nat_repl_addr = master_conn->key.dst.addr;
2745 alg_exp_node->nat_rpl_dst = true;
be38342d
DB
2746 } else {
2747 alg_nat_repl_addr = master_conn->key.src.addr;
2748 }
be38342d
DB
2749 }
2750 if (src_ip_wc) {
2751 memset(&src_addr, 0, sizeof src_addr);
bd5e81a0
DB
2752 }
2753
bd5e81a0
DB
2754 alg_exp_node->key.dl_type = master_conn->key.dl_type;
2755 alg_exp_node->key.nw_proto = master_conn->key.nw_proto;
2756 alg_exp_node->key.zone = master_conn->key.zone;
2757 alg_exp_node->key.src.addr = src_addr;
2758 alg_exp_node->key.dst.addr = dst_addr;
2759 alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
2760 alg_exp_node->key.dst.port = dst_port;
2761 alg_exp_node->master_mark = master_conn->mark;
2762 alg_exp_node->master_label = master_conn->label;
82b9ac94
DB
2763 memcpy(&alg_exp_node->master_key, &master_conn->key,
2764 sizeof alg_exp_node->master_key);
bd5e81a0
DB
2765 /* Take the write lock here because it is almost 100%
2766 * likely that the lookup will fail and
2767 * expectation_create() will be called below. */
967bb5c5 2768 ovs_rwlock_wrlock(&ct->resources_lock);
bd5e81a0 2769 struct alg_exp_node *alg_exp = expectation_lookup(
be38342d 2770 &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis, src_ip_wc);
bd5e81a0
DB
2771 if (alg_exp) {
2772 free(alg_exp_node);
967bb5c5 2773 ovs_rwlock_unlock(&ct->resources_lock);
bd5e81a0
DB
2774 return;
2775 }
2776
2777 alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
4417ca3d 2778 hmap_insert(&ct->alg_expectations, &alg_exp_node->node,
dec0dbbc 2779 conn_key_hash(&alg_exp_node->key, ct->hash_basis));
4417ca3d
DB
2780 expectation_ref_create(&ct->alg_expectation_refs, alg_exp_node,
2781 ct->hash_basis);
967bb5c5 2782 ovs_rwlock_unlock(&ct->resources_lock);
bd5e81a0
DB
2783}
2784
bd5e81a0
DB
/* Replaces the first 'substr_size' bytes at 'substr' with the 'rep_str_size'
 * bytes at 'rep_str', sliding the bytes that follow the replaced span so that
 * they stay directly behind the replacement.
 *
 * 'total_size' is the number of valid bytes starting at 'substr' (the span
 * being replaced plus everything after it), so 'total_size - substr_size'
 * trailing bytes are moved.  The caller must guarantee that
 * 'substr_size <= total_size' and that the buffer has room for
 * 'rep_str_size + total_size - substr_size' bytes starting at 'substr'.
 *
 * The sizes are size_t, not uint8_t: the caller passes the distance to the
 * end of the packet, which can exceed 255 bytes.  A narrower type would
 * silently truncate it and leave part of the tail unmoved. */
static void
replace_substring(char *substr, size_t substr_size,
                  size_t total_size, const char *rep_str,
                  size_t rep_str_size)
{
    /* memmove, not memcpy: source and destination of the tail overlap. */
    memmove(substr + rep_str_size, substr + substr_size,
            total_size - substr_size);
    memcpy(substr, rep_str, rep_str_size);
}
2794
cd7c99a6
DB
/* Overwrites every occurrence of 'c1' in the NUL-terminated string 'str'
 * with 'c2', in place. */
static void
repl_bytes(char *str, char c1, char c2)
{
    for (char *p = str; *p != '\0'; p++) {
        if (*p == c1) {
            *p = c2;
        }
    }
}
2805
/* Replaces the 'size' bytes at 'pkt_str', which points into the data of
 * 'pkt', with the 'repl_size' bytes at 'repl_str'.  Everything between
 * 'pkt_str' and the packet tail is shifted accordingly, then the packet size
 * is set to 'orig_used_size' adjusted by the length difference. */
static void
modify_packet(struct dp_packet *pkt, char *pkt_str, size_t size,
              char *repl_str, size_t repl_size,
              uint32_t orig_used_size)
{
    const char *pkt_end = dp_packet_tail(pkt);
    int size_delta = (int) repl_size - (int) size;

    replace_substring(pkt_str, size, pkt_end - pkt_str, repl_str, repl_size);
    dp_packet_set_size(pkt, orig_used_size + size_delta);
}
2816
bd5e81a0
DB
2817/* Replace IPV4 address in FTP message with NATed address. */
2818static int
2819repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
2820 char *ftp_data_start,
cd7c99a6
DB
2821 size_t addr_offset_from_ftp_data_start,
2822 size_t addr_size OVS_UNUSED)
bd5e81a0
DB
2823{
2824 enum { MAX_FTP_V4_NAT_DELTA = 8 };
2825
2826 /* Do conservative check for pathological MTU usage. */
2827 uint32_t orig_used_size = dp_packet_size(pkt);
cd7c99a6
DB
2828 if (orig_used_size + MAX_FTP_V4_NAT_DELTA >
2829 dp_packet_get_allocated(pkt)) {
2830
bd5e81a0 2831 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
cd7c99a6
DB
2832 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP V4",
2833 dp_packet_get_allocated(pkt));
bd5e81a0
DB
2834 return 0;
2835 }
2836
cd7c99a6
DB
2837 char v4_addr_str[INET_ADDRSTRLEN] = {0};
2838 ovs_assert(inet_ntop(AF_INET, &v4_addr_rep, v4_addr_str,
2839 sizeof v4_addr_str));
2840 repl_bytes(v4_addr_str, '.', ',');
2841 modify_packet(pkt, ftp_data_start + addr_offset_from_ftp_data_start,
2842 addr_size, v4_addr_str, strlen(v4_addr_str),
2843 orig_used_size);
2844 return (int) strlen(v4_addr_str) - (int) addr_size;
bd5e81a0
DB
2845}
2846
/* Returns a pointer to the first decimal digit in the NUL-terminated string
 * 'str', or to its terminating NUL if it contains no digit.
 *
 * The character is converted to unsigned char before it reaches isdigit():
 * the string comes from packet payload, so it may hold bytes that are
 * negative as plain char, and passing those to a <ctype.h> function is
 * undefined behavior. */
static char *
skip_non_digits(char *str)
{
    while (*str != '\0' && !isdigit((unsigned char) *str)) {
        str++;
    }
    return str;
}
2855
/* Scans the run of decimal digits at the start of 'str', overwrites the
 * first character after the scanned digits with NUL, and returns a pointer
 * to that (now NUL) position.  The return value is never NULL.
 *
 * Because the bound is tested with '<=', up to 'max_digits' + 1 digits are
 * consumed.  A number one digit longer than allowed is therefore kept whole,
 * rather than cut to a valid-looking prefix, so the caller's range check can
 * still see it.
 *
 * The character is converted to unsigned char before it reaches isdigit():
 * the string comes from packet payload and may contain bytes that are
 * negative as plain char, which is undefined behavior for <ctype.h>
 * functions. */
static char *
terminate_number_str(char *str, uint8_t max_digits)
{
    uint8_t digits_found = 0;

    while (isdigit((unsigned char) *str) && digits_found <= max_digits) {
        str++;
        digits_found++;
    }

    *str = '\0';
    return str;
}
2868
2869
2870static void
2871get_ftp_ctl_msg(struct dp_packet *pkt, char *ftp_msg)
2872{
2873 struct tcp_header *th = dp_packet_l4(pkt);
2874 char *tcp_hdr = (char *) th;
2875 uint32_t tcp_payload_len = tcp_payload_length(pkt);
2876 size_t tcp_payload_of_interest = MIN(tcp_payload_len,
2877 LARGEST_FTP_MSG_OF_INTEREST);
2878 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2879
2880 ovs_strlcpy(ftp_msg, tcp_hdr + tcp_hdr_len,
2881 tcp_payload_of_interest);
2882}
2883
2884static enum ftp_ctl_pkt
2885detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
2886 struct dp_packet *pkt)
2887{
bd5e81a0
DB
2888 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2889 get_ftp_ctl_msg(pkt, ftp_msg);
dec0dbbc 2890
bd5e81a0
DB
2891 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
2892 if (strncasecmp(ftp_msg, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD)) &&
2893 !strcasestr(ftp_msg, FTP_EPSV_REPLY)) {
2894 return CT_FTP_CTL_OTHER;
2895 }
2896 } else {
2897 if (strncasecmp(ftp_msg, FTP_PORT_CMD, strlen(FTP_PORT_CMD)) &&
2898 strncasecmp(ftp_msg, FTP_PASV_REPLY_CODE,
2899 strlen(FTP_PASV_REPLY_CODE))) {
2900 return CT_FTP_CTL_OTHER;
2901 }
2902 }
2903
2904 return CT_FTP_CTL_INTEREST;
2905}
2906
2907static enum ftp_ctl_pkt
2908process_ftp_ctl_v4(struct conntrack *ct,
2909 struct dp_packet *pkt,
2910 const struct conn *conn_for_expectation,
4417ca3d 2911 ovs_be32 *v4_addr_rep,
bd5e81a0 2912 char **ftp_data_v4_start,
cd7c99a6
DB
2913 size_t *addr_offset_from_ftp_data_start,
2914 size_t *addr_size)
bd5e81a0
DB
2915{
2916 struct tcp_header *th = dp_packet_l4(pkt);
2917 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2918 char *tcp_hdr = (char *) th;
2919 *ftp_data_v4_start = tcp_hdr + tcp_hdr_len;
2920 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2921 get_ftp_ctl_msg(pkt, ftp_msg);
bd5e81a0
DB
2922 char *ftp = ftp_msg;
2923 enum ct_alg_mode mode;
dec0dbbc 2924
23bea975 2925 if (!strncasecmp(ftp, FTP_PORT_CMD, strlen(FTP_PORT_CMD))) {
bd5e81a0
DB
2926 ftp = ftp_msg + strlen(FTP_PORT_CMD);
2927 mode = CT_FTP_MODE_ACTIVE;
2928 } else {
2929 ftp = ftp_msg + strlen(FTP_PASV_REPLY_CODE);
2930 mode = CT_FTP_MODE_PASSIVE;
2931 }
2932
2933 /* Find first space. */
2934 ftp = strchr(ftp, ' ');
2935 if (!ftp) {
2936 return CT_FTP_CTL_INVALID;
2937 }
2938
2939 /* Find the first digit, after space. */
2940 ftp = skip_non_digits(ftp);
2941 if (*ftp == 0) {
2942 return CT_FTP_CTL_INVALID;
2943 }
2944
2945 char *ip_addr_start = ftp;
2946 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
bd5e81a0 2947
dec0dbbc 2948 uint8_t comma_count = 0;
bd5e81a0
DB
2949 while (comma_count < 4 && *ftp) {
2950 if (*ftp == ',') {
2951 comma_count++;
2952 if (comma_count == 4) {
2953 *ftp = 0;
2954 } else {
2955 *ftp = '.';
2956 }
2957 }
2958 ftp++;
2959 }
2960 if (comma_count != 4) {
2961 return CT_FTP_CTL_INVALID;
2962 }
2963
2964 struct in_addr ip_addr;
2965 int rc2 = inet_pton(AF_INET, ip_addr_start, &ip_addr);
2966 if (rc2 != 1) {
2967 return CT_FTP_CTL_INVALID;
2968 }
2969
cd7c99a6 2970 *addr_size = ftp - ip_addr_start - 1;
bd5e81a0
DB
2971 char *save_ftp = ftp;
2972 ftp = terminate_number_str(ftp, MAX_FTP_PORT_DGTS);
2973 if (!ftp) {
2974 return CT_FTP_CTL_INVALID;
2975 }
2976 int value;
2977 if (!str_to_int(save_ftp, 10, &value)) {
2978 return CT_FTP_CTL_INVALID;
2979 }
2980
2981 /* This is derived from the L4 port maximum is 65535. */
2982 if (value > 255) {
2983 return CT_FTP_CTL_INVALID;
2984 }
2985
2986 uint16_t port_hs = value;
2987 port_hs <<= 8;
2988
2989 /* Skip over comma. */
2990 ftp++;
2991 save_ftp = ftp;
2992 bool digit_found = false;
2993 while (isdigit(*ftp)) {
2994 ftp++;
2995 digit_found = true;
2996 }
2997 if (!digit_found) {
2998 return CT_FTP_CTL_INVALID;
2999 }
3000 *ftp = 0;
3001 if (!str_to_int(save_ftp, 10, &value)) {
3002 return CT_FTP_CTL_INVALID;
3003 }
3004
3005 if (value > 255) {
3006 return CT_FTP_CTL_INVALID;
3007 }
3008
78a0b272 3009 port_hs |= value;
bd5e81a0
DB
3010 ovs_be16 port = htons(port_hs);
3011 ovs_be32 conn_ipv4_addr;
3012
3013 switch (mode) {
3014 case CT_FTP_MODE_ACTIVE:
cda1b109
DB
3015 *v4_addr_rep = conn_for_expectation->rev_key.dst.addr.ipv4;
3016 conn_ipv4_addr = conn_for_expectation->key.src.addr.ipv4;
bd5e81a0
DB
3017 break;
3018 case CT_FTP_MODE_PASSIVE:
cda1b109
DB
3019 *v4_addr_rep = conn_for_expectation->key.dst.addr.ipv4;
3020 conn_ipv4_addr = conn_for_expectation->rev_key.src.addr.ipv4;
bd5e81a0 3021 break;
7be77cb0 3022 case CT_TFTP_MODE:
bd5e81a0
DB
3023 default:
3024 OVS_NOT_REACHED();
3025 }
3026
3027 ovs_be32 ftp_ipv4_addr;
3028 ftp_ipv4_addr = ip_addr.s_addr;
3029 /* Although most servers will block this exploit, there may be some
3030 * less well managed. */
3031 if (ftp_ipv4_addr != conn_ipv4_addr && ftp_ipv4_addr != *v4_addr_rep) {
3032 return CT_FTP_CTL_INVALID;
3033 }
3034
be38342d
DB
3035 expectation_create(ct, port, conn_for_expectation,
3036 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
bd5e81a0
DB
3037 return CT_FTP_CTL_INTEREST;
3038}
3039
/* Returns a pointer to the first character of 'str' that cannot be part of
 * a textual IPv6 address, i.e. that is not a hexadecimal digit, ':' or '.'.
 * This may be the terminating NUL.
 *
 * The character is converted to unsigned char before it reaches isxdigit():
 * the string comes from packet payload and may contain bytes that are
 * negative as plain char, which is undefined behavior for <ctype.h>
 * functions. */
static char *
skip_ipv6_digits(char *str)
{
    while (isxdigit((unsigned char) *str) || *str == ':' || *str == '.') {
        str++;
    }
    return str;
}
3048
3049static enum ftp_ctl_pkt
3050process_ftp_ctl_v6(struct conntrack *ct,
3051 struct dp_packet *pkt,
3052 const struct conn *conn_for_expectation,
cda1b109 3053 union ct_addr *v6_addr_rep, char **ftp_data_start,
bd5e81a0
DB
3054 size_t *addr_offset_from_ftp_data_start,
3055 size_t *addr_size, enum ct_alg_mode *mode)
3056{
3057 struct tcp_header *th = dp_packet_l4(pkt);
3058 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
3059 char *tcp_hdr = (char *) th;
3060 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
bd5e81a0
DB
3061 get_ftp_ctl_msg(pkt, ftp_msg);
3062 *ftp_data_start = tcp_hdr + tcp_hdr_len;
bd5e81a0
DB
3063 char *ftp = ftp_msg;
3064 struct in6_addr ip6_addr;
dec0dbbc 3065
23bea975 3066 if (!strncasecmp(ftp, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD))) {
bd5e81a0
DB
3067 ftp = ftp_msg + strlen(FTP_EPRT_CMD);
3068 ftp = skip_non_digits(ftp);
3069 if (*ftp != FTP_AF_V6 || isdigit(ftp[1])) {
3070 return CT_FTP_CTL_INVALID;
3071 }
3072 /* Jump over delimiter. */
3073 ftp += 2;
3074
bd5e81a0 3075 memset(&ip6_addr, 0, sizeof ip6_addr);
dec0dbbc 3076 char *ip_addr_start = ftp;
bd5e81a0
DB
3077 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
3078 ftp = skip_ipv6_digits(ftp);
3079 *ftp = 0;
3080 *addr_size = ftp - ip_addr_start;
3081 int rc2 = inet_pton(AF_INET6, ip_addr_start, &ip6_addr);
3082 if (rc2 != 1) {
3083 return CT_FTP_CTL_INVALID;
3084 }
3085 ftp++;
3086 *mode = CT_FTP_MODE_ACTIVE;
3087 } else {
3088 ftp = ftp_msg + strcspn(ftp_msg, "(");
3089 ftp = skip_non_digits(ftp);
3090 if (!isdigit(*ftp)) {
3091 return CT_FTP_CTL_INVALID;
3092 }
3093
3094 /* Not used for passive mode. */
3095 *addr_offset_from_ftp_data_start = 0;
3096 *addr_size = 0;
3097
3098 *mode = CT_FTP_MODE_PASSIVE;
3099 }
3100
3101 char *save_ftp = ftp;
3102 ftp = terminate_number_str(ftp, MAX_EXT_FTP_PORT_DGTS);
3103 if (!ftp) {
3104 return CT_FTP_CTL_INVALID;
3105 }
dec0dbbc 3106
bd5e81a0
DB
3107 int value;
3108 if (!str_to_int(save_ftp, 10, &value)) {
3109 return CT_FTP_CTL_INVALID;
3110 }
3111 if (value > CT_MAX_L4_PORT) {
3112 return CT_FTP_CTL_INVALID;
3113 }
3114
3115 uint16_t port_hs = value;
3116 ovs_be16 port = htons(port_hs);
3117
3118 switch (*mode) {
3119 case CT_FTP_MODE_ACTIVE:
3120 *v6_addr_rep = conn_for_expectation->rev_key.dst.addr;
3121 /* Although most servers will block this exploit, there may be some
3122 * less well managed. */
cda1b109
DB
3123 if (memcmp(&ip6_addr, &v6_addr_rep->ipv6, sizeof ip6_addr) &&
3124 memcmp(&ip6_addr, &conn_for_expectation->key.src.addr.ipv6,
bd5e81a0
DB
3125 sizeof ip6_addr)) {
3126 return CT_FTP_CTL_INVALID;
3127 }
3128 break;
3129 case CT_FTP_MODE_PASSIVE:
3130 *v6_addr_rep = conn_for_expectation->key.dst.addr;
3131 break;
7be77cb0 3132 case CT_TFTP_MODE:
bd5e81a0
DB
3133 default:
3134 OVS_NOT_REACHED();
3135 }
3136
be38342d
DB
3137 expectation_create(ct, port, conn_for_expectation,
3138 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
bd5e81a0
DB
3139 return CT_FTP_CTL_INTEREST;
3140}
3141
3142static int
cda1b109 3143repl_ftp_v6_addr(struct dp_packet *pkt, union ct_addr v6_addr_rep,
bd5e81a0
DB
3144 char *ftp_data_start,
3145 size_t addr_offset_from_ftp_data_start,
3146 size_t addr_size, enum ct_alg_mode mode)
3147{
3148 /* This is slightly bigger than really possible. */
3149 enum { MAX_FTP_V6_NAT_DELTA = 45 };
3150
3151 if (mode == CT_FTP_MODE_PASSIVE) {
3152 return 0;
3153 }
3154
3155 /* Do conservative check for pathological MTU usage. */
3156 uint32_t orig_used_size = dp_packet_size(pkt);
cd7c99a6
DB
3157 if (orig_used_size + MAX_FTP_V6_NAT_DELTA >
3158 dp_packet_get_allocated(pkt)) {
3159
bd5e81a0 3160 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
cd7c99a6
DB
3161 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP V6",
3162 dp_packet_get_allocated(pkt));
bd5e81a0
DB
3163 return 0;
3164 }
3165
298530b8 3166 char v6_addr_str[INET6_ADDRSTRLEN] = {0};
cda1b109 3167 ovs_assert(inet_ntop(AF_INET6, &v6_addr_rep.ipv6, v6_addr_str,
298530b8 3168 sizeof v6_addr_str));
cd7c99a6
DB
3169 modify_packet(pkt, ftp_data_start + addr_offset_from_ftp_data_start,
3170 addr_size, v6_addr_str, strlen(v6_addr_str),
3171 orig_used_size);
3172 return (int) strlen(v6_addr_str) - (int) addr_size;
bd5e81a0
DB
3173}
3174
d13d7115
DB
3175/* Increment/decrement a TCP sequence number. */
3176static void
3177adj_seqnum(ovs_16aligned_be32 *val, int32_t inc)
3178{
3179 put_16aligned_be32(val, htonl(ntohl(get_16aligned_be32(val)) + inc));
3180}
3181
bd5e81a0
DB
3182static void
3183handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
967bb5c5 3184 struct dp_packet *pkt, struct conn *ec, long long now,
253e4dc0 3185 enum ftp_ctl_pkt ftp_ctl, bool nat)
bd5e81a0
DB
3186{
3187 struct ip_header *l3_hdr = dp_packet_l3(pkt);
3188 ovs_be32 v4_addr_rep = 0;
cda1b109 3189 union ct_addr v6_addr_rep;
faa0826d 3190 size_t addr_offset_from_ftp_data_start = 0;
bd5e81a0
DB
3191 size_t addr_size = 0;
3192 char *ftp_data_start;
bd5e81a0
DB
3193 enum ct_alg_mode mode = CT_FTP_MODE_ACTIVE;
3194
3195 if (detect_ftp_ctl_type(ctx, pkt) != ftp_ctl) {
3196 return;
3197 }
3198
bd5e81a0
DB
3199 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
3200 int64_t seq_skew = 0;
dec0dbbc 3201
253e4dc0 3202 if (ftp_ctl == CT_FTP_CTL_INTEREST) {
bd5e81a0
DB
3203 enum ftp_ctl_pkt rc;
3204 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
253e4dc0 3205 rc = process_ftp_ctl_v6(ct, pkt, ec,
4417ca3d 3206 &v6_addr_rep, &ftp_data_start,
bd5e81a0
DB
3207 &addr_offset_from_ftp_data_start,
3208 &addr_size, &mode);
3209 } else {
253e4dc0 3210 rc = process_ftp_ctl_v4(ct, pkt, ec,
4417ca3d 3211 &v4_addr_rep, &ftp_data_start,
cd7c99a6
DB
3212 &addr_offset_from_ftp_data_start,
3213 &addr_size);
bd5e81a0
DB
3214 }
3215 if (rc == CT_FTP_CTL_INVALID) {
3216 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3217 VLOG_WARN_RL(&rl, "Invalid FTP control packet format");
3218 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
3219 return;
3220 } else if (rc == CT_FTP_CTL_INTEREST) {
3221 uint16_t ip_len;
dec0dbbc 3222
bd5e81a0 3223 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
253e4dc0
DM
3224 if (nat) {
3225 seq_skew = repl_ftp_v6_addr(pkt, v6_addr_rep,
3226 ftp_data_start,
3227 addr_offset_from_ftp_data_start,
3228 addr_size, mode);
3229 }
3230
bd5e81a0 3231 if (seq_skew) {
253e4dc0
DM
3232 ip_len = ntohs(nh6->ip6_ctlun.ip6_un1.ip6_un1_plen) +
3233 seq_skew;
bd5e81a0 3234 nh6->ip6_ctlun.ip6_un1.ip6_un1_plen = htons(ip_len);
bd5e81a0
DB
3235 }
3236 } else {
253e4dc0
DM
3237 if (nat) {
3238 seq_skew = repl_ftp_v4_addr(pkt, v4_addr_rep,
3239 ftp_data_start,
cd7c99a6
DB
3240 addr_offset_from_ftp_data_start,
3241 addr_size);
253e4dc0 3242 }
bd5e81a0 3243 if (seq_skew) {
253e4dc0 3244 ip_len = ntohs(l3_hdr->ip_tot_len) + seq_skew;
29cf9c1b
FL
3245 if (!dp_packet_hwol_is_ipv4(pkt)) {
3246 l3_hdr->ip_csum = recalc_csum16(l3_hdr->ip_csum,
3247 l3_hdr->ip_tot_len,
3248 htons(ip_len));
3249 }
bd5e81a0 3250 l3_hdr->ip_tot_len = htons(ip_len);
bd5e81a0
DB
3251 }
3252 }
3253 } else {
3254 OVS_NOT_REACHED();
3255 }
bd5e81a0
DB
3256 }
3257
3258 struct tcp_header *th = dp_packet_l4(pkt);
dec0dbbc 3259
253e4dc0 3260 if (nat && ec->seq_skew != 0) {
d13d7115
DB
3261 ctx->reply != ec->seq_skew_dir ?
3262 adj_seqnum(&th->tcp_ack, -ec->seq_skew) :
3263 adj_seqnum(&th->tcp_seq, ec->seq_skew);
bd5e81a0
DB
3264 }
3265
bd5e81a0 3266 th->tcp_csum = 0;
29cf9c1b
FL
3267 if (!dp_packet_hwol_tx_l4_checksum(pkt)) {
3268 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3269 th->tcp_csum = packet_csum_upperlayer6(nh6, th, ctx->key.nw_proto,
3270 dp_packet_l4_size(pkt));
3271 } else {
3272 uint32_t tcp_csum = packet_csum_pseudoheader(l3_hdr);
3273 th->tcp_csum = csum_finish(
3274 csum_continue(tcp_csum, th, dp_packet_l4_size(pkt)));
3275 }
bd5e81a0 3276 }
253e4dc0
DM
3277
3278 if (seq_skew) {
967bb5c5 3279 conn_seq_skew_set(ct, ec, now, seq_skew + ec->seq_skew,
253e4dc0
DM
3280 ctx->reply);
3281 }
bd5e81a0 3282}
7be77cb0
DB
3283
3284static void
3285handle_tftp_ctl(struct conntrack *ct,
94e71143 3286 const struct conn_lookup_ctx *ctx OVS_UNUSED,
967bb5c5
DB
3287 struct dp_packet *pkt, struct conn *conn_for_expectation,
3288 long long now OVS_UNUSED, enum ftp_ctl_pkt ftp_ctl OVS_UNUSED,
3289 bool nat OVS_UNUSED)
7be77cb0 3290{
be38342d
DB
3291 expectation_create(ct, conn_for_expectation->key.src.port,
3292 conn_for_expectation,
3293 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
7be77cb0 3294}