]> git.proxmox.com Git - mirror_ovs.git/blame - lib/conntrack.c
conntrack: Don't re-add cleaned 'conn' to expiry list.
[mirror_ovs.git] / lib / conntrack.c
CommitLineData
a489b168 1/*
4ea96698 2 * Copyright (c) 2015-2019 Nicira, Inc.
a489b168
DDP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
bd5e81a0 18#include <ctype.h>
a489b168 19#include <errno.h>
ff6aa424 20#include <sys/types.h>
a489b168
DDP
21#include <netinet/in.h>
22#include <netinet/icmp6.h>
bd5e81a0 23#include <string.h>
a489b168
DDP
24
25#include "bitmap.h"
bd5e81a0 26#include "conntrack.h"
a489b168
DDP
27#include "conntrack-private.h"
28#include "coverage.h"
29#include "csum.h"
4d4e68ed 30#include "ct-dpif.h"
a489b168
DDP
31#include "dp-packet.h"
32#include "flow.h"
33#include "netdev.h"
34#include "odp-netlink.h"
35#include "openvswitch/hmap.h"
36#include "openvswitch/vlog.h"
37#include "ovs-rcu.h"
e6ef6cc6 38#include "ovs-thread.h"
fd016ae3 39#include "openvswitch/poll-loop.h"
a489b168
DDP
40#include "random.h"
41#include "timeval.h"
42
43VLOG_DEFINE_THIS_MODULE(conntrack);
44
45COVERAGE_DEFINE(conntrack_full);
e6ef6cc6 46COVERAGE_DEFINE(conntrack_long_cleanup);
a489b168
DDP
47
48struct conn_lookup_ctx {
49 struct conn_key key;
50 struct conn *conn;
51 uint32_t hash;
52 bool reply;
dbb597d3 53 bool icmp_related;
a489b168
DDP
54};
55
bd5e81a0
DB
56enum ftp_ctl_pkt {
57 /* Control packets with address and/or port specifiers. */
58 CT_FTP_CTL_INTEREST,
59 /* Control packets without address and/or port specifiers. */
60 CT_FTP_CTL_OTHER,
61 CT_FTP_CTL_INVALID,
62};
63
64enum ct_alg_mode {
65 CT_FTP_MODE_ACTIVE,
66 CT_FTP_MODE_PASSIVE,
7be77cb0 67 CT_TFTP_MODE,
bd5e81a0
DB
68};
69
94e71143
DB
70enum ct_alg_ctl_type {
71 CT_ALG_CTL_NONE,
72 CT_ALG_CTL_FTP,
73 CT_ALG_CTL_TFTP,
be38342d
DB
74 /* SIP is not enabled through Openflow and presently only used as
75 * an example of an alg that allows a wildcard src ip. */
76 CT_ALG_CTL_SIP,
94e71143
DB
77};
78
a489b168 79static bool conn_key_extract(struct conntrack *, struct dp_packet *,
66e4ad8a
DDP
80 ovs_be16 dl_type, struct conn_lookup_ctx *,
81 uint16_t zone);
a489b168
DDP
82static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
83static void conn_key_reverse(struct conn_key *);
a489b168 84static bool valid_new(struct dp_packet *pkt, struct conn_key *);
967bb5c5 85static struct conn *new_conn(struct conntrack *ct, struct dp_packet *pkt,
e6ef6cc6 86 struct conn_key *, long long now);
967bb5c5 87static void delete_conn_cmn(struct conn *);
a489b168 88static void delete_conn(struct conn *);
967bb5c5
DB
89static void delete_conn_one(struct conn *conn);
90static enum ct_update_res conn_update(struct conntrack *ct, struct conn *conn,
91 struct dp_packet *pkt,
92 struct conn_lookup_ctx *ctx,
e6ef6cc6 93 long long now);
a489b168
DDP
94static bool conn_expired(struct conn *, long long now);
95static void set_mark(struct dp_packet *, struct conn *,
96 uint32_t val, uint32_t mask);
97static void set_label(struct dp_packet *, struct conn *,
98 const struct ovs_key_ct_labels *val,
99 const struct ovs_key_ct_labels *mask);
e6ef6cc6 100static void *clean_thread_main(void *f_);
a489b168 101
286de272
DB
102static bool
103nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
104 struct conn *nat_conn);
105
106static uint8_t
107reverse_icmp_type(uint8_t type);
108static uint8_t
109reverse_icmp6_type(uint8_t type);
110static inline bool
111extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
112 const char **new_data, bool validate_checksum);
113static inline bool
114extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
115 const char **new_data);
bd5e81a0 116static struct alg_exp_node *
be38342d
DB
117expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
118 uint32_t basis, bool src_ip_wc);
bd5e81a0
DB
119
120static int
121repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
122 char *ftp_data_v4_start,
cd7c99a6 123 size_t addr_offset_from_ftp_data_start, size_t addr_size);
bd5e81a0
DB
124
125static enum ftp_ctl_pkt
126process_ftp_ctl_v4(struct conntrack *ct,
127 struct dp_packet *pkt,
128 const struct conn *conn_for_expectation,
4417ca3d 129 ovs_be32 *v4_addr_rep,
bd5e81a0 130 char **ftp_data_v4_start,
cd7c99a6
DB
131 size_t *addr_offset_from_ftp_data_start,
132 size_t *addr_size);
bd5e81a0
DB
133
134static enum ftp_ctl_pkt
135detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
136 struct dp_packet *pkt);
137
4417ca3d 138static void
967bb5c5 139expectation_clean(struct conntrack *ct, const struct conn_key *master_key);
4417ca3d 140
94e71143
DB
141static struct ct_l4_proto *l4_protos[] = {
142 [IPPROTO_TCP] = &ct_proto_tcp,
143 [IPPROTO_UDP] = &ct_proto_other,
144 [IPPROTO_ICMP] = &ct_proto_icmp4,
145 [IPPROTO_ICMPV6] = &ct_proto_icmp6,
146};
147
bd5e81a0
DB
148static void
149handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
967bb5c5
DB
150 struct dp_packet *pkt, struct conn *ec, long long now,
151 enum ftp_ctl_pkt ftp_ctl, bool nat);
bd5e81a0 152
7be77cb0
DB
153static void
154handle_tftp_ctl(struct conntrack *ct,
94e71143 155 const struct conn_lookup_ctx *ctx OVS_UNUSED,
967bb5c5
DB
156 struct dp_packet *pkt, struct conn *conn_for_expectation,
157 long long now OVS_UNUSED, enum ftp_ctl_pkt ftp_ctl OVS_UNUSED,
158 bool nat OVS_UNUSED);
94e71143
DB
159
160typedef void (*alg_helper)(struct conntrack *ct,
161 const struct conn_lookup_ctx *ctx,
162 struct dp_packet *pkt,
967bb5c5 163 struct conn *conn_for_expectation,
94e71143
DB
164 long long now, enum ftp_ctl_pkt ftp_ctl,
165 bool nat);
166
167static alg_helper alg_helpers[] = {
168 [CT_ALG_CTL_NONE] = NULL,
169 [CT_ALG_CTL_FTP] = handle_ftp_ctl,
170 [CT_ALG_CTL_TFTP] = handle_tftp_ctl,
a489b168
DDP
171};
172
173long long ct_timeout_val[] = {
174#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
175 CT_TIMEOUTS
176#undef CT_TIMEOUT
177};
178
bd5e81a0
DB
179/* The maximum TCP or UDP port number. */
180#define CT_MAX_L4_PORT 65535
bd5e81a0
DB
181/* String buffer used for parsing FTP string messages.
182 * This is sized about twice what is needed to leave some
183 * margin of error. */
184#define LARGEST_FTP_MSG_OF_INTEREST 128
185/* FTP port string used in active mode. */
186#define FTP_PORT_CMD "PORT"
187/* FTP pasv string used in passive mode. */
188#define FTP_PASV_REPLY_CODE "227"
189/* Maximum decimal digits for port in FTP command.
190 * The port is represented as two 3 digit numbers with the
191 * high part a multiple of 256. */
192#define MAX_FTP_PORT_DGTS 3
193
194/* FTP extension EPRT string used for active mode. */
195#define FTP_EPRT_CMD "EPRT"
196/* FTP extension EPSV string used for passive mode. */
197#define FTP_EPSV_REPLY "EXTENDED PASSIVE"
198/* Maximum decimal digits for port in FTP extended command. */
199#define MAX_EXT_FTP_PORT_DGTS 5
200/* FTP extended command code for IPv6. */
201#define FTP_AF_V6 '2'
202/* Used to indicate a wildcard L4 source port number for ALGs.
203 * This is used for port numbers that we cannot predict in
204 * expectations. */
205#define ALG_WC_SRC_PORT 0
206
a489b168 207/* If the total number of connections goes above this value, no new connections
286de272 208 * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
a489b168
DDP
209#define DEFAULT_N_CONN_LIMIT 3000000
210
5ed7a0b4
DB
211/* Does a member by member comparison of two conn_keys; this
212 * function must be kept in sync with struct conn_key; returns 0
213 * if the keys are equal or 1 if the keys are not equal. */
214static int
215conn_key_cmp(const struct conn_key *key1, const struct conn_key *key2)
216{
217 if (!memcmp(&key1->src.addr, &key2->src.addr, sizeof key1->src.addr) &&
218 !memcmp(&key1->dst.addr, &key2->dst.addr, sizeof key1->dst.addr) &&
219 (key1->src.icmp_id == key2->src.icmp_id) &&
220 (key1->src.icmp_type == key2->src.icmp_type) &&
221 (key1->src.icmp_code == key2->src.icmp_code) &&
222 (key1->dst.icmp_id == key2->dst.icmp_id) &&
223 (key1->dst.icmp_type == key2->dst.icmp_type) &&
224 (key1->dst.icmp_code == key2->dst.icmp_code) &&
225 (key1->dl_type == key2->dl_type) &&
226 (key1->zone == key2->zone) &&
227 (key1->nw_proto == key2->nw_proto)) {
228
229 return 0;
230 }
231 return 1;
232}
233
d8682ee5 234static void
dec0dbbc
DB
235ct_print_conn_info(const struct conn *c, const char *log_msg,
236 enum vlog_level vll, bool force, bool rl_on)
66f400f5
DB
237{
238#define CT_VLOG(RL_ON, LEVEL, ...) \
239 do { \
240 if (RL_ON) { \
241 static struct vlog_rate_limit rl_ = VLOG_RATE_LIMIT_INIT(5, 5); \
242 vlog_rate_limit(&this_module, LEVEL, &rl_, __VA_ARGS__); \
243 } else { \
244 vlog(&this_module, LEVEL, __VA_ARGS__); \
245 } \
246 } while (0)
247
248 if (OVS_UNLIKELY(force || vlog_is_enabled(&this_module, vll))) {
249 if (c->key.dl_type == htons(ETH_TYPE_IP)) {
250 CT_VLOG(rl_on, vll, "%s: src ip "IP_FMT" dst ip "IP_FMT" rev src "
251 "ip "IP_FMT" rev dst ip "IP_FMT" src/dst ports "
252 "%"PRIu16"/%"PRIu16" rev src/dst ports "
253 "%"PRIu16"/%"PRIu16" zone/rev zone "
254 "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
255 "%"PRIu8"/%"PRIu8, log_msg,
cda1b109
DB
256 IP_ARGS(c->key.src.addr.ipv4),
257 IP_ARGS(c->key.dst.addr.ipv4),
258 IP_ARGS(c->rev_key.src.addr.ipv4),
259 IP_ARGS(c->rev_key.dst.addr.ipv4),
66f400f5
DB
260 ntohs(c->key.src.port), ntohs(c->key.dst.port),
261 ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
262 c->key.zone, c->rev_key.zone, c->key.nw_proto,
263 c->rev_key.nw_proto);
264 } else {
265 char ip6_s[INET6_ADDRSTRLEN];
266 inet_ntop(AF_INET6, &c->key.src.addr.ipv6, ip6_s, sizeof ip6_s);
267 char ip6_d[INET6_ADDRSTRLEN];
268 inet_ntop(AF_INET6, &c->key.dst.addr.ipv6, ip6_d, sizeof ip6_d);
269 char ip6_rs[INET6_ADDRSTRLEN];
270 inet_ntop(AF_INET6, &c->rev_key.src.addr.ipv6, ip6_rs,
271 sizeof ip6_rs);
272 char ip6_rd[INET6_ADDRSTRLEN];
273 inet_ntop(AF_INET6, &c->rev_key.dst.addr.ipv6, ip6_rd,
274 sizeof ip6_rd);
275
276 CT_VLOG(rl_on, vll, "%s: src ip %s dst ip %s rev src ip %s"
277 " rev dst ip %s src/dst ports %"PRIu16"/%"PRIu16
278 " rev src/dst ports %"PRIu16"/%"PRIu16" zone/rev zone "
279 "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
280 "%"PRIu8"/%"PRIu8, log_msg, ip6_s, ip6_d, ip6_rs,
281 ip6_rd, ntohs(c->key.src.port), ntohs(c->key.dst.port),
282 ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
283 c->key.zone, c->rev_key.zone, c->key.nw_proto,
284 c->rev_key.nw_proto);
285 }
286 }
287}
288
a489b168
DDP
289/* Initializes the connection tracker 'ct'. The caller is responsible for
290 * calling 'conntrack_destroy()', when the instance is not needed anymore */
57593fd2
DB
291struct conntrack *
292conntrack_init(void)
a489b168 293{
57593fd2
DB
294 struct conntrack *ct = xzalloc(sizeof *ct);
295
967bb5c5
DB
296 ovs_rwlock_init(&ct->resources_lock);
297 ovs_rwlock_wrlock(&ct->resources_lock);
bd5e81a0 298 hmap_init(&ct->alg_expectations);
4417ca3d 299 hindex_init(&ct->alg_expectation_refs);
967bb5c5 300 ovs_rwlock_unlock(&ct->resources_lock);
a489b168 301
967bb5c5
DB
302 ovs_mutex_init_adaptive(&ct->ct_lock);
303 ovs_mutex_lock(&ct->ct_lock);
304 cmap_init(&ct->conns);
305 for (unsigned i = 0; i < ARRAY_SIZE(ct->exp_lists); i++) {
306 ovs_list_init(&ct->exp_lists[i]);
a489b168 307 }
967bb5c5
DB
308 ovs_mutex_unlock(&ct->ct_lock);
309
a489b168
DDP
310 ct->hash_basis = random_uint32();
311 atomic_count_init(&ct->n_conn, 0);
312 atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
e6ef6cc6
DDP
313 latch_init(&ct->clean_thread_exit);
314 ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
4ea96698 315 ct->ipf = ipf_init();
57593fd2
DB
316
317 return ct;
a489b168
DDP
318}
319
967bb5c5
DB
320static void
321conn_clean_cmn(struct conntrack *ct, struct conn *conn)
322 OVS_REQUIRES(ct->ct_lock)
323{
324 if (conn->alg) {
325 expectation_clean(ct, &conn->key);
326 }
327
328 uint32_t hash = conn_key_hash(&conn->key, ct->hash_basis);
329 cmap_remove(&ct->conns, &conn->cm_node, hash);
330}
331
332/* Must be called with 'conn' of 'conn_type' CT_CONN_TYPE_DEFAULT. Also
333 * removes the associated nat 'conn' from the lookup datastructures. */
334static void
335conn_clean(struct conntrack *ct, struct conn *conn)
336 OVS_REQUIRES(ct->ct_lock)
337{
338 ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
339
340 conn_clean_cmn(ct, conn);
341 if (conn->nat_conn) {
342 uint32_t hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis);
343 cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash);
344 }
345 ovs_list_remove(&conn->exp_node);
5f918a8a 346 conn->cleaned = true;
967bb5c5
DB
347 ovsrcu_postpone(delete_conn, conn);
348 atomic_count_dec(&ct->n_conn);
349}
350
351static void
352conn_clean_one(struct conntrack *ct, struct conn *conn)
353 OVS_REQUIRES(ct->ct_lock)
354{
355 conn_clean_cmn(ct, conn);
356 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
357 ovs_list_remove(&conn->exp_node);
5f918a8a 358 conn->cleaned = true;
967bb5c5
DB
359 atomic_count_dec(&ct->n_conn);
360 }
361 ovsrcu_postpone(delete_conn_one, conn);
362}
363
364/* Destroys the connection tracker 'ct' and frees all the allocated memory.
365 * The caller of this function must already have shut down packet input
366 * and PMD threads (which would have been quiesced). */
a489b168
DDP
367void
368conntrack_destroy(struct conntrack *ct)
369{
967bb5c5 370 struct conn *conn;
e6ef6cc6
DDP
371 latch_set(&ct->clean_thread_exit);
372 pthread_join(ct->clean_thread, NULL);
373 latch_destroy(&ct->clean_thread_exit);
a489b168 374
967bb5c5
DB
375 ovs_mutex_lock(&ct->ct_lock);
376 CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
377 conn_clean_one(ct, conn);
a489b168 378 }
967bb5c5
DB
379 cmap_destroy(&ct->conns);
380 ovs_mutex_unlock(&ct->ct_lock);
381 ovs_mutex_destroy(&ct->ct_lock);
bd5e81a0 382
967bb5c5 383 ovs_rwlock_wrlock(&ct->resources_lock);
bd5e81a0
DB
384 struct alg_exp_node *alg_exp_node;
385 HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {
386 free(alg_exp_node);
387 }
bd5e81a0 388 hmap_destroy(&ct->alg_expectations);
4417ca3d 389 hindex_destroy(&ct->alg_expectation_refs);
967bb5c5
DB
390 ovs_rwlock_unlock(&ct->resources_lock);
391 ovs_rwlock_destroy(&ct->resources_lock);
392
4ea96698 393 ipf_destroy(ct->ipf);
21ffe409 394 free(ct);
a489b168
DDP
395}
396\f
967bb5c5
DB
397
398static bool
399conn_key_lookup(struct conntrack *ct, const struct conn_key *key,
400 uint32_t hash, long long now, struct conn **conn_out,
401 bool *reply)
a489b168 402{
967bb5c5
DB
403 struct conn *conn;
404 bool found = false;
405
406 CMAP_FOR_EACH_WITH_HASH (conn, cm_node, hash, &ct->conns) {
407 if (!conn_key_cmp(&conn->key, key) && !conn_expired(conn, now)) {
408 found = true;
409 if (reply) {
410 *reply = false;
411 }
412 break;
413 }
414 if (!conn_key_cmp(&conn->rev_key, key) && !conn_expired(conn, now)) {
415 found = true;
416 if (reply) {
417 *reply = true;
418 }
419 break;
420 }
421 }
a489b168 422
967bb5c5
DB
423 if (found && conn_out) {
424 *conn_out = conn;
425 } else if (conn_out) {
426 *conn_out = NULL;
427 }
428 return found;
a489b168
DDP
429}
430
431static void
286de272 432write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
bd5e81a0 433 const struct conn_key *key, const struct alg_exp_node *alg_exp)
a489b168 434{
286de272 435 pkt->md.ct_state |= CS_TRACKED;
a489b168 436 pkt->md.ct_zone = zone;
967bb5c5
DB
437
438 if (conn) {
439 ovs_mutex_lock(&conn->lock);
440 pkt->md.ct_mark = conn->mark;
441 pkt->md.ct_label = conn->label;
442 ovs_mutex_unlock(&conn->lock);
443 } else {
444 pkt->md.ct_mark = 0;
445 pkt->md.ct_label = OVS_U128_ZERO;
446 }
daf4d3c1
JR
447
448 /* Use the original direction tuple if we have it. */
449 if (conn) {
bd5e81a0
DB
450 if (conn->alg_related) {
451 key = &conn->master_key;
452 } else {
453 key = &conn->key;
454 }
455 } else if (alg_exp) {
456 pkt->md.ct_mark = alg_exp->master_mark;
457 pkt->md.ct_label = alg_exp->master_label;
458 key = &alg_exp->master_key;
daf4d3c1 459 }
dec0dbbc 460
daf4d3c1 461 pkt->md.ct_orig_tuple_ipv6 = false;
dec0dbbc 462
daf4d3c1
JR
463 if (key) {
464 if (key->dl_type == htons(ETH_TYPE_IP)) {
465 pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
cda1b109
DB
466 key->src.addr.ipv4,
467 key->dst.addr.ipv4,
daf4d3c1
JR
468 key->nw_proto != IPPROTO_ICMP
469 ? key->src.port : htons(key->src.icmp_type),
470 key->nw_proto != IPPROTO_ICMP
471 ? key->dst.port : htons(key->src.icmp_code),
472 key->nw_proto,
473 };
286de272 474 } else {
daf4d3c1
JR
475 pkt->md.ct_orig_tuple_ipv6 = true;
476 pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
cda1b109
DB
477 key->src.addr.ipv6,
478 key->dst.addr.ipv6,
daf4d3c1
JR
479 key->nw_proto != IPPROTO_ICMPV6
480 ? key->src.port : htons(key->src.icmp_type),
481 key->nw_proto != IPPROTO_ICMPV6
482 ? key->dst.port : htons(key->src.icmp_code),
483 key->nw_proto,
484 };
485 }
486 } else {
487 memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
488 }
bd5e81a0
DB
489}
490
491static uint8_t
492get_ip_proto(const struct dp_packet *pkt)
493{
494 uint8_t ip_proto;
495 struct eth_header *l2 = dp_packet_eth(pkt);
496 if (l2->eth_type == htons(ETH_TYPE_IPV6)) {
497 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
498 ip_proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
499 } else {
500 struct ip_header *l3_hdr = dp_packet_l3(pkt);
501 ip_proto = l3_hdr->ip_proto;
502 }
286de272 503
bd5e81a0
DB
504 return ip_proto;
505}
506
507static bool
94e71143 508is_ftp_ctl(const enum ct_alg_ctl_type ct_alg_ctl)
bd5e81a0 509{
94e71143 510 return ct_alg_ctl == CT_ALG_CTL_FTP;
bd5e81a0
DB
511}
512
94e71143 513static enum ct_alg_ctl_type
bd7d93f8
DB
514get_alg_ctl_type(const struct dp_packet *pkt, ovs_be16 tp_src, ovs_be16 tp_dst,
515 const char *helper)
7be77cb0 516{
94e71143
DB
517 /* CT_IPPORT_FTP/TFTP is used because IPPORT_FTP/TFTP in not defined
518 * in OSX, at least in in.h. Since these values will never change, remove
7be77cb0 519 * the external dependency. */
94e71143
DB
520 enum { CT_IPPORT_FTP = 21 };
521 enum { CT_IPPORT_TFTP = 69 };
bd7d93f8
DB
522 uint8_t ip_proto = get_ip_proto(pkt);
523 struct udp_header *uh = dp_packet_l4(pkt);
524 struct tcp_header *th = dp_packet_l4(pkt);
525 ovs_be16 ftp_src_port = htons(CT_IPPORT_FTP);
526 ovs_be16 ftp_dst_port = htons(CT_IPPORT_FTP);
527 ovs_be16 tftp_dst_port = htons(CT_IPPORT_TFTP);
528
529 if (OVS_UNLIKELY(tp_dst)) {
530 if (helper && !strncmp(helper, "ftp", strlen("ftp"))) {
531 ftp_dst_port = tp_dst;
532 } else if (helper && !strncmp(helper, "tftp", strlen("tftp"))) {
533 tftp_dst_port = tp_dst;
534 }
535 } else if (OVS_UNLIKELY(tp_src)) {
536 if (helper && !strncmp(helper, "ftp", strlen("ftp"))) {
537 ftp_src_port = tp_src;
538 }
539 }
7be77cb0 540
bd7d93f8 541 if (ip_proto == IPPROTO_UDP && uh->udp_dst == tftp_dst_port) {
94e71143
DB
542 return CT_ALG_CTL_TFTP;
543 } else if (ip_proto == IPPROTO_TCP &&
bd7d93f8 544 (th->tcp_src == ftp_src_port || th->tcp_dst == ftp_dst_port)) {
94e71143
DB
545 return CT_ALG_CTL_FTP;
546 }
547 return CT_ALG_CTL_NONE;
548}
549
be38342d
DB
550static bool
551alg_src_ip_wc(enum ct_alg_ctl_type alg_ctl_type)
552{
553 if (alg_ctl_type == CT_ALG_CTL_SIP) {
554 return true;
555 }
556 return false;
557}
558
94e71143
DB
559static void
560handle_alg_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
561 struct dp_packet *pkt, enum ct_alg_ctl_type ct_alg_ctl,
967bb5c5 562 struct conn *conn, long long now, bool nat)
94e71143
DB
563{
564 /* ALG control packet handling with expectation creation. */
3a2a425b 565 if (OVS_UNLIKELY(alg_helpers[ct_alg_ctl] && conn && conn->alg)) {
967bb5c5
DB
566 ovs_mutex_lock(&conn->lock);
567 alg_helpers[ct_alg_ctl](ct, ctx, pkt, conn, now, CT_FTP_CTL_INTEREST,
568 nat);
569 ovs_mutex_unlock(&conn->lock);
94e71143 570 }
7be77cb0
DB
571}
572
286de272
DB
573static void
574pat_packet(struct dp_packet *pkt, const struct conn *conn)
575{
576 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
577 if (conn->key.nw_proto == IPPROTO_TCP) {
578 struct tcp_header *th = dp_packet_l4(pkt);
579 packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
580 } else if (conn->key.nw_proto == IPPROTO_UDP) {
581 struct udp_header *uh = dp_packet_l4(pkt);
582 packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
583 }
584 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
585 if (conn->key.nw_proto == IPPROTO_TCP) {
586 struct tcp_header *th = dp_packet_l4(pkt);
587 packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
588 } else if (conn->key.nw_proto == IPPROTO_UDP) {
589 struct udp_header *uh = dp_packet_l4(pkt);
590 packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
591 }
592 }
593}
594
595static void
596nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
597{
598 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
599 pkt->md.ct_state |= CS_SRC_NAT;
600 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
601 struct ip_header *nh = dp_packet_l3(pkt);
602 packet_set_ipv4_addr(pkt, &nh->ip_src,
cda1b109 603 conn->rev_key.dst.addr.ipv4);
286de272
DB
604 } else {
605 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
606 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
607 nh6->ip6_src.be32,
cda1b109 608 &conn->rev_key.dst.addr.ipv6, true);
286de272
DB
609 }
610 if (!related) {
611 pat_packet(pkt, conn);
612 }
613 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
614 pkt->md.ct_state |= CS_DST_NAT;
615 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
616 struct ip_header *nh = dp_packet_l3(pkt);
617 packet_set_ipv4_addr(pkt, &nh->ip_dst,
cda1b109 618 conn->rev_key.src.addr.ipv4);
286de272
DB
619 } else {
620 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
621 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
622 nh6->ip6_dst.be32,
cda1b109 623 &conn->rev_key.src.addr.ipv6, true);
286de272
DB
624 }
625 if (!related) {
626 pat_packet(pkt, conn);
627 }
628 }
629}
630
631static void
632un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
633{
634 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
635 if (conn->key.nw_proto == IPPROTO_TCP) {
636 struct tcp_header *th = dp_packet_l4(pkt);
637 packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
638 } else if (conn->key.nw_proto == IPPROTO_UDP) {
639 struct udp_header *uh = dp_packet_l4(pkt);
640 packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
641 }
642 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
643 if (conn->key.nw_proto == IPPROTO_TCP) {
644 struct tcp_header *th = dp_packet_l4(pkt);
645 packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
646 } else if (conn->key.nw_proto == IPPROTO_UDP) {
647 struct udp_header *uh = dp_packet_l4(pkt);
648 packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
649 }
650 }
651}
652
edd1bef4
DB
653static void
654reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
655{
656 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
657 if (conn->key.nw_proto == IPPROTO_TCP) {
658 struct tcp_header *th_in = dp_packet_l4(pkt);
659 packet_set_tcp_port(pkt, conn->key.src.port,
660 th_in->tcp_dst);
661 } else if (conn->key.nw_proto == IPPROTO_UDP) {
662 struct udp_header *uh_in = dp_packet_l4(pkt);
663 packet_set_udp_port(pkt, conn->key.src.port,
664 uh_in->udp_dst);
665 }
666 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
667 if (conn->key.nw_proto == IPPROTO_TCP) {
668 struct tcp_header *th_in = dp_packet_l4(pkt);
669 packet_set_tcp_port(pkt, th_in->tcp_src,
670 conn->key.dst.port);
671 } else if (conn->key.nw_proto == IPPROTO_UDP) {
672 struct udp_header *uh_in = dp_packet_l4(pkt);
673 packet_set_udp_port(pkt, uh_in->udp_src,
674 conn->key.dst.port);
675 }
676 }
677}
678
679static void
680reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
681{
682 char *tail = dp_packet_tail(pkt);
683 char pad = dp_packet_l2_pad_size(pkt);
684 struct conn_key inner_key;
685 const char *inner_l4 = NULL;
686 uint16_t orig_l3_ofs = pkt->l3_ofs;
687 uint16_t orig_l4_ofs = pkt->l4_ofs;
688
689 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
690 struct ip_header *nh = dp_packet_l3(pkt);
691 struct icmp_header *icmp = dp_packet_l4(pkt);
692 struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
bd5e81a0
DB
693 extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3) - pad,
694 &inner_l4, false);
edd1bef4
DB
695 pkt->l3_ofs += (char *) inner_l3 - (char *) nh;
696 pkt->l4_ofs += inner_l4 - (char *) icmp;
697
698 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
699 packet_set_ipv4_addr(pkt, &inner_l3->ip_src,
cda1b109 700 conn->key.src.addr.ipv4);
edd1bef4
DB
701 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
702 packet_set_ipv4_addr(pkt, &inner_l3->ip_dst,
cda1b109 703 conn->key.dst.addr.ipv4);
edd1bef4 704 }
dec0dbbc 705
edd1bef4
DB
706 reverse_pat_packet(pkt, conn);
707 icmp->icmp_csum = 0;
708 icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
709 } else {
710 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
711 struct icmp6_error_header *icmp6 = dp_packet_l4(pkt);
712 struct ovs_16aligned_ip6_hdr *inner_l3_6 =
713 (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1);
714 extract_l3_ipv6(&inner_key, inner_l3_6,
715 tail - ((char *)inner_l3_6) - pad,
716 &inner_l4);
717 pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6;
718 pkt->l4_ofs += inner_l4 - (char *) icmp6;
719
720 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
721 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
722 inner_l3_6->ip6_src.be32,
cda1b109 723 &conn->key.src.addr.ipv6, true);
edd1bef4
DB
724 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
725 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
726 inner_l3_6->ip6_dst.be32,
cda1b109 727 &conn->key.dst.addr.ipv6, true);
edd1bef4
DB
728 }
729 reverse_pat_packet(pkt, conn);
edd1bef4 730 icmp6->icmp6_base.icmp6_cksum = 0;
76d85771
DB
731 icmp6->icmp6_base.icmp6_cksum = packet_csum_upperlayer6(nh6, icmp6,
732 IPPROTO_ICMPV6, tail - (char *) icmp6 - pad);
edd1bef4
DB
733 }
734 pkt->l3_ofs = orig_l3_ofs;
735 pkt->l4_ofs = orig_l4_ofs;
736}
737
286de272
DB
738static void
739un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
740 bool related)
741{
742 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
743 pkt->md.ct_state |= CS_DST_NAT;
744 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
745 struct ip_header *nh = dp_packet_l3(pkt);
746 packet_set_ipv4_addr(pkt, &nh->ip_dst,
cda1b109 747 conn->key.src.addr.ipv4);
286de272
DB
748 } else {
749 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
750 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
751 nh6->ip6_dst.be32,
cda1b109 752 &conn->key.src.addr.ipv6, true);
286de272 753 }
edd1bef4
DB
754
755 if (OVS_UNLIKELY(related)) {
756 reverse_nat_packet(pkt, conn);
757 } else {
286de272
DB
758 un_pat_packet(pkt, conn);
759 }
760 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
761 pkt->md.ct_state |= CS_SRC_NAT;
762 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
763 struct ip_header *nh = dp_packet_l3(pkt);
764 packet_set_ipv4_addr(pkt, &nh->ip_src,
cda1b109 765 conn->key.dst.addr.ipv4);
286de272
DB
766 } else {
767 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
768 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
769 nh6->ip6_src.be32,
cda1b109 770 &conn->key.dst.addr.ipv6, true);
286de272 771 }
edd1bef4
DB
772
773 if (OVS_UNLIKELY(related)) {
774 reverse_nat_packet(pkt, conn);
775 } else {
286de272
DB
776 un_pat_packet(pkt, conn);
777 }
778 }
779}
780
bd5e81a0 781static void
967bb5c5 782conn_seq_skew_set(struct conntrack *ct, const struct conn *conn_in,
bd5e81a0 783 long long now, int seq_skew, bool seq_skew_dir)
967bb5c5 784 OVS_NO_THREAD_SAFETY_ANALYSIS
bd5e81a0 785{
967bb5c5
DB
786 struct conn *conn;
787 bool reply;
788 uint32_t hash = conn_key_hash(&conn_in->key, ct->hash_basis);
789 ovs_mutex_unlock(&conn_in->lock);
790 conn_key_lookup(ct, &conn_in->key, hash, now, &conn, &reply);
791 ovs_mutex_lock(&conn_in->lock);
792
bd5e81a0
DB
793 if (conn && seq_skew) {
794 conn->seq_skew = seq_skew;
795 conn->seq_skew_dir = seq_skew_dir;
796 }
a720a7fa
DB
797}
798
3a2a425b
DB
799static bool
800ct_verify_helper(const char *helper, enum ct_alg_ctl_type ct_alg_ctl)
801{
802 if (ct_alg_ctl == CT_ALG_CTL_NONE) {
803 return true;
804 } else if (helper) {
805 if ((ct_alg_ctl == CT_ALG_CTL_FTP) &&
806 !strncmp(helper, "ftp", strlen("ftp"))) {
807 return true;
808 } else if ((ct_alg_ctl == CT_ALG_CTL_TFTP) &&
809 !strncmp(helper, "tftp", strlen("tftp"))) {
810 return true;
811 } else {
812 return false;
813 }
814 } else {
815 return false;
816 }
817}
818
a489b168
DDP
819static struct conn *
820conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
286de272
DB
821 struct conn_lookup_ctx *ctx, bool commit, long long now,
822 const struct nat_action_info_t *nat_action_info,
967bb5c5 823 const char *helper, const struct alg_exp_node *alg_exp,
3a2a425b 824 enum ct_alg_ctl_type ct_alg_ctl)
967bb5c5 825 OVS_REQUIRES(ct->ct_lock)
a489b168 826{
a489b168 827 struct conn *nc = NULL;
967bb5c5 828 struct conn *nat_conn = NULL;
a489b168
DDP
829
830 if (!valid_new(pkt, &ctx->key)) {
286de272 831 pkt->md.ct_state = CS_INVALID;
a489b168
DDP
832 return nc;
833 }
dec0dbbc 834
286de272 835 pkt->md.ct_state = CS_NEW;
dec0dbbc 836
bd5e81a0
DB
837 if (alg_exp) {
838 pkt->md.ct_state |= CS_RELATED;
839 }
a489b168
DDP
840
841 if (commit) {
842 unsigned int n_conn_limit;
a489b168 843 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
a489b168
DDP
844 if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
845 COVERAGE_INC(conntrack_full);
846 return nc;
847 }
848
967bb5c5 849 nc = new_conn(ct, pkt, &ctx->key, now);
a720a7fa 850 memcpy(&nc->key, &ctx->key, sizeof nc->key);
82b9ac94 851 memcpy(&nc->rev_key, &nc->key, sizeof nc->rev_key);
286de272 852 conn_key_reverse(&nc->rev_key);
a489b168 853
3a2a425b
DB
854 if (ct_verify_helper(helper, ct_alg_ctl)) {
855 nc->alg = nullable_xstrdup(helper);
bd5e81a0
DB
856 }
857
858 if (alg_exp) {
859 nc->alg_related = true;
860 nc->mark = alg_exp->master_mark;
861 nc->label = alg_exp->master_label;
862 nc->master_key = alg_exp->master_key;
863 }
864
286de272
DB
865 if (nat_action_info) {
866 nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
967bb5c5 867 nat_conn = xzalloc(sizeof *nat_conn);
a489b168 868
bd5e81a0 869 if (alg_exp) {
be38342d 870 if (alg_exp->nat_rpl_dst) {
bd5e81a0
DB
871 nc->rev_key.dst.addr = alg_exp->alg_nat_repl_addr;
872 nc->nat_info->nat_action = NAT_ACTION_SRC;
873 } else {
874 nc->rev_key.src.addr = alg_exp->alg_nat_repl_addr;
875 nc->nat_info->nat_action = NAT_ACTION_DST;
876 }
bd5e81a0 877 } else {
967bb5c5
DB
878 memcpy(nat_conn, nc, sizeof *nat_conn);
879 bool nat_res = nat_select_range_tuple(ct, nc, nat_conn);
286de272 880
bd5e81a0
DB
881 if (!nat_res) {
882 goto nat_res_exhaustion;
883 }
286de272 884
967bb5c5
DB
885 /* Update nc with nat adjustments made to nat_conn by
886 * nat_select_range_tuple(). */
887 memcpy(nc, nat_conn, sizeof *nc);
286de272 888 }
967bb5c5 889
dbb597d3 890 nat_packet(pkt, nc, ctx->icmp_related);
967bb5c5
DB
891 memcpy(&nat_conn->key, &nc->rev_key, sizeof nat_conn->key);
892 memcpy(&nat_conn->rev_key, &nc->key, sizeof nat_conn->rev_key);
893 nat_conn->conn_type = CT_CONN_TYPE_UN_NAT;
894 nat_conn->nat_info = NULL;
895 nat_conn->alg = NULL;
896 nat_conn->nat_conn = NULL;
897 uint32_t nat_hash = conn_key_hash(&nat_conn->key, ct->hash_basis);
898 cmap_insert(&ct->conns, &nat_conn->cm_node, nat_hash);
899 }
900
901 nc->nat_conn = nat_conn;
902 ovs_mutex_init_adaptive(&nc->lock);
903 nc->conn_type = CT_CONN_TYPE_DEFAULT;
904 cmap_insert(&ct->conns, &nc->cm_node, ctx->hash);
a489b168 905 atomic_count_inc(&ct->n_conn);
967bb5c5 906 ctx->conn = nc; /* For completeness. */
a489b168 907 }
bd5e81a0 908
a489b168 909 return nc;
bd5e81a0 910
967bb5c5
DB
911 /* This would be a user error or a DOS attack. A user error is prevented
912 * by allocating enough combinations of NAT addresses when combined with
913 * ephemeral ports. A DOS attack should be protected against with
914 * firewall rules or a separate firewall. Also using zone partitioning
915 * can limit DoS impact. */
bd5e81a0 916nat_res_exhaustion:
967bb5c5
DB
917 free(nat_conn);
918 ovs_list_remove(&nc->exp_node);
919 delete_conn_cmn(nc);
bd5e81a0
DB
920 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
921 VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
922 "if DoS attack, use firewalling and/or zone partitioning.");
923 return NULL;
a489b168
DDP
924}
925
286de272
DB
926static bool
927conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
967bb5c5
DB
928 struct conn_lookup_ctx *ctx, struct conn *conn,
929 long long now)
286de272 930{
967bb5c5 931 ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
286de272
DB
932 bool create_new_conn = false;
933
dbb597d3 934 if (ctx->icmp_related) {
286de272
DB
935 pkt->md.ct_state |= CS_RELATED;
936 if (ctx->reply) {
937 pkt->md.ct_state |= CS_REPLY_DIR;
938 }
939 } else {
967bb5c5 940 if (conn->alg_related) {
bd5e81a0
DB
941 pkt->md.ct_state |= CS_RELATED;
942 }
dec0dbbc 943
967bb5c5 944 enum ct_update_res res = conn_update(ct, conn, pkt, ctx, now);
286de272
DB
945
946 switch (res) {
947 case CT_UPDATE_VALID:
948 pkt->md.ct_state |= CS_ESTABLISHED;
949 pkt->md.ct_state &= ~CS_NEW;
950 if (ctx->reply) {
951 pkt->md.ct_state |= CS_REPLY_DIR;
952 }
953 break;
954 case CT_UPDATE_INVALID:
955 pkt->md.ct_state = CS_INVALID;
956 break;
957 case CT_UPDATE_NEW:
967bb5c5
DB
958 ovs_mutex_lock(&ct->ct_lock);
959 conn_clean(ct, conn);
960 ovs_mutex_unlock(&ct->ct_lock);
286de272
DB
961 create_new_conn = true;
962 break;
963 default:
964 OVS_NOT_REACHED();
965 }
966 }
967 return create_new_conn;
968}
969
286de272
DB
970static void
971handle_nat(struct dp_packet *pkt, struct conn *conn,
972 uint16_t zone, bool reply, bool related)
973{
974 if (conn->nat_info &&
975 (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
976 (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
977 zone != pkt->md.ct_zone))) {
bd5e81a0 978
286de272
DB
979 if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
980 pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
981 }
982 if (reply) {
983 un_nat_packet(pkt, conn, related);
984 } else {
985 nat_packet(pkt, conn, related);
986 }
987 }
988}
989
f8016041
DB
990static bool
991check_orig_tuple(struct conntrack *ct, struct dp_packet *pkt,
992 struct conn_lookup_ctx *ctx_in, long long now,
967bb5c5 993 struct conn **conn,
f8016041 994 const struct nat_action_info_t *nat_action_info)
f8016041
DB
995{
996 if ((ctx_in->key.dl_type == htons(ETH_TYPE_IP) &&
997 !pkt->md.ct_orig_tuple.ipv4.ipv4_proto) ||
998 (ctx_in->key.dl_type == htons(ETH_TYPE_IPV6) &&
999 !pkt->md.ct_orig_tuple.ipv6.ipv6_proto) ||
1000 !(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
1001 nat_action_info) {
1002 return false;
1003 }
1004
967bb5c5
DB
1005 struct conn_key key;
1006 memset(&key, 0 , sizeof key);
f8016041
DB
1007
1008 if (ctx_in->key.dl_type == htons(ETH_TYPE_IP)) {
967bb5c5
DB
1009 key.src.addr.ipv4 = pkt->md.ct_orig_tuple.ipv4.ipv4_src;
1010 key.dst.addr.ipv4 = pkt->md.ct_orig_tuple.ipv4.ipv4_dst;
f8016041
DB
1011
1012 if (ctx_in->key.nw_proto == IPPROTO_ICMP) {
967bb5c5
DB
1013 key.src.icmp_id = ctx_in->key.src.icmp_id;
1014 key.dst.icmp_id = ctx_in->key.dst.icmp_id;
f8016041 1015 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv4.src_port);
967bb5c5
DB
1016 key.src.icmp_type = (uint8_t) src_port;
1017 key.dst.icmp_type = reverse_icmp_type(key.src.icmp_type);
f8016041 1018 } else {
967bb5c5
DB
1019 key.src.port = pkt->md.ct_orig_tuple.ipv4.src_port;
1020 key.dst.port = pkt->md.ct_orig_tuple.ipv4.dst_port;
f8016041 1021 }
967bb5c5 1022 key.nw_proto = pkt->md.ct_orig_tuple.ipv4.ipv4_proto;
f8016041 1023 } else {
967bb5c5
DB
1024 key.src.addr.ipv6 = pkt->md.ct_orig_tuple.ipv6.ipv6_src;
1025 key.dst.addr.ipv6 = pkt->md.ct_orig_tuple.ipv6.ipv6_dst;
f8016041
DB
1026
1027 if (ctx_in->key.nw_proto == IPPROTO_ICMPV6) {
967bb5c5
DB
1028 key.src.icmp_id = ctx_in->key.src.icmp_id;
1029 key.dst.icmp_id = ctx_in->key.dst.icmp_id;
f8016041 1030 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv6.src_port);
967bb5c5
DB
1031 key.src.icmp_type = (uint8_t) src_port;
1032 key.dst.icmp_type = reverse_icmp6_type(key.src.icmp_type);
f8016041 1033 } else {
967bb5c5
DB
1034 key.src.port = pkt->md.ct_orig_tuple.ipv6.src_port;
1035 key.dst.port = pkt->md.ct_orig_tuple.ipv6.dst_port;
f8016041 1036 }
967bb5c5 1037 key.nw_proto = pkt->md.ct_orig_tuple.ipv6.ipv6_proto;
f8016041
DB
1038 }
1039
967bb5c5
DB
1040 key.dl_type = ctx_in->key.dl_type;
1041 key.zone = pkt->md.ct_zone;
1042 uint32_t hash = conn_key_hash(&key, ct->hash_basis);
1043 bool reply;
1044 conn_key_lookup(ct, &key, hash, now, conn, &reply);
f8016041
DB
1045 return *conn ? true : false;
1046}
1047
94e71143
DB
1048static bool
1049conn_update_state_alg(struct conntrack *ct, struct dp_packet *pkt,
1050 struct conn_lookup_ctx *ctx, struct conn *conn,
1051 const struct nat_action_info_t *nat_action_info,
1052 enum ct_alg_ctl_type ct_alg_ctl, long long now,
967bb5c5 1053 bool *create_new_conn)
94e71143
DB
1054{
1055 if (is_ftp_ctl(ct_alg_ctl)) {
1056 /* Keep sequence tracking in sync with the source of the
1057 * sequence skew. */
967bb5c5 1058 ovs_mutex_lock(&conn->lock);
94e71143
DB
1059 if (ctx->reply != conn->seq_skew_dir) {
1060 handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1061 !!nat_action_info);
967bb5c5
DB
1062 /* conn_update_state locks for unrelated fields, so unlock. */
1063 ovs_mutex_unlock(&conn->lock);
1064 *create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
94e71143 1065 } else {
967bb5c5
DB
1066 /* conn_update_state locks for unrelated fields, so unlock. */
1067 ovs_mutex_unlock(&conn->lock);
1068 *create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
1069 ovs_mutex_lock(&conn->lock);
030958a0
DB
1070 if (*create_new_conn == false) {
1071 handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1072 !!nat_action_info);
1073 }
967bb5c5 1074 ovs_mutex_unlock(&conn->lock);
94e71143
DB
1075 }
1076 return true;
1077 }
1078 return false;
1079}
1080
286de272 1081static void
a489b168
DDP
1082process_one(struct conntrack *ct, struct dp_packet *pkt,
1083 struct conn_lookup_ctx *ctx, uint16_t zone,
286de272
DB
1084 bool force, bool commit, long long now, const uint32_t *setmark,
1085 const struct ovs_key_ct_labels *setlabel,
bd5e81a0 1086 const struct nat_action_info_t *nat_action_info,
bd7d93f8 1087 ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper)
a489b168 1088{
967bb5c5
DB
1089 bool create_new_conn = false;
1090 conn_key_lookup(ct, &ctx->key, ctx->hash, now, &ctx->conn, &ctx->reply);
1091 struct conn *conn = ctx->conn;
a489b168 1092
a76a37ef 1093 /* Delete found entry if in wrong direction. 'force' implies commit. */
a720a7fa 1094 if (OVS_UNLIKELY(force && ctx->reply && conn)) {
967bb5c5
DB
1095 ovs_mutex_lock(&ct->ct_lock);
1096 conn_clean(ct, conn);
1097 ovs_mutex_unlock(&ct->ct_lock);
a76a37ef
JR
1098 conn = NULL;
1099 }
1100
286de272
DB
1101 if (OVS_LIKELY(conn)) {
1102 if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
a489b168 1103
286de272 1104 ctx->reply = true;
967bb5c5
DB
1105 struct conn *rev_conn = conn; /* Save for debugging. */
1106 uint32_t hash = conn_key_hash(&conn->rev_key, ct->hash_basis);
1107 conn_key_lookup(ct, &ctx->key, hash, now, &conn, &ctx->reply);
a489b168 1108
967bb5c5 1109 if (!conn) {
286de272 1110 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
967bb5c5
DB
1111 char *log_msg = xasprintf("Missing master conn %p", rev_conn);
1112 ct_print_conn_info(conn, log_msg, VLL_INFO, true, true);
1113 free(log_msg);
286de272 1114 return;
a489b168
DDP
1115 }
1116 }
286de272
DB
1117 }
1118
bd7d93f8
DB
1119 enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, tp_src, tp_dst,
1120 helper);
bd5e81a0 1121
286de272 1122 if (OVS_LIKELY(conn)) {
94e71143
DB
1123 if (OVS_LIKELY(!conn_update_state_alg(ct, pkt, ctx, conn,
1124 nat_action_info,
967bb5c5 1125 ct_alg_ctl, now,
94e71143 1126 &create_new_conn))) {
967bb5c5 1127 create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
bd5e81a0 1128 }
286de272 1129 if (nat_action_info && !create_new_conn) {
dbb597d3 1130 handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
286de272 1131 }
bd5e81a0 1132
967bb5c5
DB
1133 } else if (check_orig_tuple(ct, pkt, ctx, now, &conn, nat_action_info)) {
1134 create_new_conn = conn_update_state(ct, pkt, ctx, conn, now);
a489b168 1135 } else {
dbb597d3 1136 if (ctx->icmp_related) {
bd5e81a0
DB
1137 /* An icmp related conn should always be found; no new
1138 connection is created based on an icmp related packet. */
286de272 1139 pkt->md.ct_state = CS_INVALID;
5c2e106b 1140 } else {
286de272 1141 create_new_conn = true;
5c2e106b 1142 }
a489b168
DDP
1143 }
1144
bd5e81a0 1145 const struct alg_exp_node *alg_exp = NULL;
96bbcbf7 1146 struct alg_exp_node alg_exp_entry;
dec0dbbc 1147
286de272 1148 if (OVS_UNLIKELY(create_new_conn)) {
bd5e81a0 1149
967bb5c5 1150 ovs_rwlock_rdlock(&ct->resources_lock);
bd5e81a0 1151 alg_exp = expectation_lookup(&ct->alg_expectations, &ctx->key,
be38342d
DB
1152 ct->hash_basis,
1153 alg_src_ip_wc(ct_alg_ctl));
bd5e81a0 1154 if (alg_exp) {
c3f6bae2 1155 memcpy(&alg_exp_entry, alg_exp, sizeof alg_exp_entry);
bd5e81a0
DB
1156 alg_exp = &alg_exp_entry;
1157 }
967bb5c5 1158 ovs_rwlock_unlock(&ct->resources_lock);
bd5e81a0 1159
967bb5c5 1160 ovs_mutex_lock(&ct->ct_lock);
286de272 1161 conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
967bb5c5
DB
1162 helper, alg_exp, ct_alg_ctl);
1163 ovs_mutex_unlock(&ct->ct_lock);
286de272
DB
1164 }
1165
bd5e81a0
DB
1166 write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
1167
286de272
DB
1168 if (conn && setmark) {
1169 set_mark(pkt, conn, setmark[0], setmark[1]);
1170 }
a489b168 1171
286de272
DB
1172 if (conn && setlabel) {
1173 set_label(pkt, conn, &setlabel[0], &setlabel[1]);
1174 }
1175
967bb5c5 1176 handle_alg_ctl(ct, ctx, pkt, ct_alg_ctl, conn, now, !!nat_action_info);
a489b168
DDP
1177}
1178
1179/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
51b9a533 1180 * the packets must have the same 'dl_type' (IPv4 or IPv6) and should have
4ea96698
DB
1181 * the l3 and and l4 offset properly set. Performs fragment reassembly with
1182 * the help of ipf_preprocess_conntrack().
a489b168
DDP
1183 *
1184 * If 'commit' is true, the packets are allowed to create new entries in the
1185 * connection tables. 'setmark', if not NULL, should point to a two
1186 * elements array containing a value and a mask to set the connection mark.
1187 * 'setlabel' behaves similarly for the connection label.*/
1188int
1189conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
a76a37ef 1190 ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
66e4ad8a 1191 const uint32_t *setmark,
a489b168 1192 const struct ovs_key_ct_labels *setlabel,
bd7d93f8 1193 ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper,
94053e66
FA
1194 const struct nat_action_info_t *nat_action_info,
1195 long long now)
a489b168 1196{
4ea96698
DB
1197 ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone,
1198 ct->hash_basis);
1199
43495c45 1200 struct dp_packet *packet;
61ce32b9 1201 struct conn_lookup_ctx ctx;
a489b168 1202
e883448e 1203 DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) {
4ea96698
DB
1204 if (packet->md.ct_state == CS_INVALID
1205 || !conn_key_extract(ct, packet, dl_type, &ctx, zone)) {
43495c45
BB
1206 packet->md.ct_state = CS_INVALID;
1207 write_ct_md(packet, zone, NULL, NULL, NULL);
a489b168
DDP
1208 continue;
1209 }
94e71143 1210 process_one(ct, packet, &ctx, zone, force, commit, now, setmark,
bd7d93f8 1211 setlabel, nat_action_info, tp_src, tp_dst, helper);
a489b168
DDP
1212 }
1213
4ea96698
DB
1214 ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type);
1215
a489b168
DDP
1216 return 0;
1217}
1218
1fe178d2
EG
1219void
1220conntrack_clear(struct dp_packet *packet)
1221{
1222 /* According to pkt_metadata_init(), ct_state == 0 is enough to make all of
1223 * the conntrack fields invalid. */
1224 packet->md.ct_state = 0;
1225}
1226
a489b168
DDP
1227static void
1228set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
1229{
967bb5c5 1230 ovs_mutex_lock(&conn->lock);
bd5e81a0
DB
1231 if (conn->alg_related) {
1232 pkt->md.ct_mark = conn->mark;
1233 } else {
1234 pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
1235 conn->mark = pkt->md.ct_mark;
1236 }
967bb5c5 1237 ovs_mutex_unlock(&conn->lock);
a489b168
DDP
1238}
1239
1240static void
1241set_label(struct dp_packet *pkt, struct conn *conn,
1242 const struct ovs_key_ct_labels *val,
1243 const struct ovs_key_ct_labels *mask)
1244{
967bb5c5 1245 ovs_mutex_lock(&conn->lock);
bd5e81a0
DB
1246 if (conn->alg_related) {
1247 pkt->md.ct_label = conn->label;
1248 } else {
1249 ovs_u128 v, m;
a489b168 1250
bd5e81a0
DB
1251 memcpy(&v, val, sizeof v);
1252 memcpy(&m, mask, sizeof m);
a489b168 1253
bd5e81a0 1254 pkt->md.ct_label.u64.lo = v.u64.lo
a489b168 1255 | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
bd5e81a0 1256 pkt->md.ct_label.u64.hi = v.u64.hi
a489b168 1257 | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
bd5e81a0
DB
1258 conn->label = pkt->md.ct_label;
1259 }
967bb5c5 1260 ovs_mutex_unlock(&conn->lock);
a489b168 1261}
286de272 1262
a489b168 1263\f
e6ef6cc6
DDP
1264/* Delete the expired connections from 'ctb', up to 'limit'. Returns the
1265 * earliest expiration time among the remaining connections in 'ctb'. Returns
1266 * LLONG_MAX if 'ctb' is empty. The return value might be smaller than 'now',
1267 * if 'limit' is reached */
1268static long long
967bb5c5 1269ct_sweep(struct conntrack *ct, long long now, size_t limit)
e6ef6cc6
DDP
1270{
1271 struct conn *conn, *next;
1272 long long min_expiration = LLONG_MAX;
e6ef6cc6
DDP
1273 size_t count = 0;
1274
967bb5c5
DB
1275 ovs_mutex_lock(&ct->ct_lock);
1276
dec0dbbc 1277 for (unsigned i = 0; i < N_CT_TM; i++) {
967bb5c5
DB
1278 LIST_FOR_EACH_SAFE (conn, next, exp_node, &ct->exp_lists[i]) {
1279 ovs_mutex_lock(&conn->lock);
1280 if (now < conn->expiration || count >= limit) {
a720a7fa 1281 min_expiration = MIN(min_expiration, conn->expiration);
967bb5c5 1282 ovs_mutex_unlock(&conn->lock);
a720a7fa
DB
1283 if (count >= limit) {
1284 /* Do not check other lists. */
1285 COVERAGE_INC(conntrack_long_cleanup);
967bb5c5 1286 goto out;
e6ef6cc6 1287 }
a720a7fa 1288 break;
967bb5c5
DB
1289 } else {
1290 ovs_mutex_unlock(&conn->lock);
1291 conn_clean(ct, conn);
e6ef6cc6 1292 }
a720a7fa 1293 count++;
e6ef6cc6
DDP
1294 }
1295 }
967bb5c5
DB
1296
1297out:
1298 VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec", count,
1299 time_msec() - now);
1300 ovs_mutex_unlock(&ct->ct_lock);
e6ef6cc6
DDP
1301 return min_expiration;
1302}
1303
1304/* Cleans up old connection entries from 'ct'. Returns the time when the
1305 * next expiration might happen. The return value might be smaller than
1306 * 'now', meaning that an internal limit has been reached, and some expired
1307 * connections have not been deleted. */
1308static long long
1309conntrack_clean(struct conntrack *ct, long long now)
1310{
e6ef6cc6 1311 unsigned int n_conn_limit;
e6ef6cc6 1312 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
967bb5c5
DB
1313 size_t clean_max = n_conn_limit > 10 ? n_conn_limit / 10 : 1;
1314 long long min_exp = ct_sweep(ct, now, clean_max);
1315 long long next_wakeup = MIN(min_exp, now + CT_TM_MIN);
e6ef6cc6
DDP
1316
1317 return next_wakeup;
1318}
1319
1320/* Cleanup:
e6ef6cc6
DDP
1321 *
1322 * We must call conntrack_clean() periodically. conntrack_clean() return
1323 * value gives an hint on when the next cleanup must be done (either because
1324 * there is an actual connection that expires, or because a new connection
1325 * might be created with the minimum timeout).
1326 *
1327 * The logic below has two goals:
1328 *
6c54734e
DDP
1329 * - We want to reduce the number of wakeups and batch connection cleanup
1330 * when the load is not very high. CT_CLEAN_INTERVAL ensures that if we
1331 * are coping with the current cleanup tasks, then we wait at least
1332 * 5 seconds to do further cleanup.
e6ef6cc6 1333 *
967bb5c5 1334 * - We don't want to keep the map locked too long, as we might prevent
6c54734e 1335 * traffic from flowing. CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
967bb5c5 1336 * behind, there is at least some 200ms blocks of time when the map will be
6c54734e 1337 * left alone, so the datapath can operate unhindered.
e6ef6cc6
DDP
1338 */
1339#define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
1340#define CT_CLEAN_MIN_INTERVAL 200 /* 0.2 seconds */
1341
1342static void *
1343clean_thread_main(void *f_)
1344{
1345 struct conntrack *ct = f_;
1346
1347 while (!latch_is_set(&ct->clean_thread_exit)) {
1348 long long next_wake;
1349 long long now = time_msec();
e6ef6cc6
DDP
1350 next_wake = conntrack_clean(ct, now);
1351
1352 if (next_wake < now) {
1353 poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
1354 } else {
1355 poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
1356 }
1357 latch_wait(&ct->clean_thread_exit);
1358 poll_block();
1359 }
1360
1361 return NULL;
1362}
1363\f
e917d3ee
DB
1364/* 'Data' is a pointer to the beginning of the L3 header and 'new_data' is
1365 * used to store a pointer to the first byte after the L3 header. 'Size' is
1366 * the size of the packet beyond the data pointer. */
a489b168
DDP
1367static inline bool
1368extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
1369 const char **new_data, bool validate_checksum)
1370{
e917d3ee
DB
1371 if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
1372 return false;
a489b168
DDP
1373 }
1374
dec0dbbc
DB
1375 const struct ip_header *ip = data;
1376 size_t ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
a489b168 1377
e917d3ee
DB
1378 if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
1379 return false;
1380 }
a489b168 1381
e917d3ee
DB
1382 if (OVS_UNLIKELY(size < ip_len)) {
1383 return false;
1384 }
a489b168 1385
e917d3ee
DB
1386 if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
1387 return false;
a489b168
DDP
1388 }
1389
1390 if (validate_checksum && csum(data, ip_len) != 0) {
1391 return false;
1392 }
1393
e917d3ee
DB
1394 if (new_data) {
1395 *new_data = (char *) data + ip_len;
1396 }
1397
cda1b109
DB
1398 key->src.addr.ipv4 = get_16aligned_be32(&ip->ip_src);
1399 key->dst.addr.ipv4 = get_16aligned_be32(&ip->ip_dst);
a489b168
DDP
1400 key->nw_proto = ip->ip_proto;
1401
1402 return true;
1403}
1404
e917d3ee
DB
1405/* 'Data' is a pointer to the beginning of the L3 header and 'new_data' is
1406 * used to store a pointer to the first byte after the L3 header. 'Size' is
1407 * the size of the packet beyond the data pointer. */
a489b168
DDP
1408static inline bool
1409extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
1410 const char **new_data)
1411{
1412 const struct ovs_16aligned_ip6_hdr *ip6 = data;
286de272 1413
e917d3ee
DB
1414 if (OVS_UNLIKELY(size < sizeof *ip6)) {
1415 return false;
a489b168
DDP
1416 }
1417
1418 data = ip6 + 1;
1419 size -= sizeof *ip6;
dec0dbbc
DB
1420 uint8_t nw_proto = ip6->ip6_nxt;
1421 uint8_t nw_frag = 0;
a489b168 1422
523464ab
DB
1423 const struct ovs_16aligned_ip6_frag *frag_hdr;
1424 if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, &frag_hdr)) {
a489b168
DDP
1425 return false;
1426 }
1427
a489b168
DDP
1428 if (nw_frag) {
1429 return false;
1430 }
1431
c8b1ad49
DB
1432 if (new_data) {
1433 *new_data = data;
1434 }
1435
cda1b109
DB
1436 memcpy(&key->src.addr.ipv6, &ip6->ip6_src, sizeof key->src.addr);
1437 memcpy(&key->dst.addr.ipv6, &ip6->ip6_dst, sizeof key->dst.addr);
a489b168
DDP
1438 key->nw_proto = nw_proto;
1439
1440 return true;
1441}
1442
1443static inline bool
1444checksum_valid(const struct conn_key *key, const void *data, size_t size,
1445 const void *l3)
1446{
a489b168 1447 if (key->dl_type == htons(ETH_TYPE_IP)) {
76d85771
DB
1448 uint32_t csum = packet_csum_pseudoheader(l3);
1449 return csum_finish(csum_continue(csum, data, size)) == 0;
a489b168 1450 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
76d85771 1451 return packet_csum_upperlayer6(l3, data, key->nw_proto, size) == 0;
a489b168
DDP
1452 } else {
1453 return false;
1454 }
a489b168
DDP
1455}
1456
1457static inline bool
1458check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
324459a3 1459 const void *l3, bool validate_checksum)
a489b168
DDP
1460{
1461 const struct tcp_header *tcp = data;
40225b0c
BP
1462 if (size < sizeof *tcp) {
1463 return false;
1464 }
a489b168 1465
40225b0c 1466 size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
a489b168
DDP
1467 if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
1468 return false;
1469 }
1470
324459a3 1471 return validate_checksum ? checksum_valid(key, data, size, l3) : true;
a489b168
DDP
1472}
1473
1474static inline bool
1475check_l4_udp(const struct conn_key *key, const void *data, size_t size,
324459a3 1476 const void *l3, bool validate_checksum)
a489b168
DDP
1477{
1478 const struct udp_header *udp = data;
40225b0c
BP
1479 if (size < sizeof *udp) {
1480 return false;
1481 }
a489b168 1482
40225b0c 1483 size_t udp_len = ntohs(udp->udp_len);
a489b168
DDP
1484 if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
1485 return false;
1486 }
1487
1488 /* Validation must be skipped if checksum is 0 on IPv4 packets */
1489 return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
324459a3 1490 || (validate_checksum ? checksum_valid(key, data, size, l3) : true);
a489b168
DDP
1491}
1492
/* Returns true if the ICMP message of 'size' bytes at 'data' has a correct
 * checksum, or if 'validate_checksum' is false. */
static inline bool
check_l4_icmp(const void *data, size_t size, bool validate_checksum)
{
    return !validate_checksum || csum(data, size) == 0;
}
1498
/* Returns true if the ICMPv6 message of 'size' bytes at 'data' has a correct
 * checksum with respect to the L3 header at 'l3', or if 'validate_checksum'
 * is false. */
static inline bool
check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
               const void *l3, bool validate_checksum)
{
    return !validate_checksum || checksum_valid(key, data, size, l3);
}
1505
1506static inline bool
1507extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
1508{
a489b168
DDP
1509 if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
1510 return false;
1511 }
1512
dec0dbbc 1513 const struct tcp_header *tcp = data;
a489b168
DDP
1514 key->src.port = tcp->tcp_src;
1515 key->dst.port = tcp->tcp_dst;
1516
1517 /* Port 0 is invalid */
1518 return key->src.port && key->dst.port;
1519}
1520
1521static inline bool
1522extract_l4_udp(struct conn_key *key, const void *data, size_t size)
1523{
a489b168
DDP
1524 if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
1525 return false;
1526 }
1527
dec0dbbc 1528 const struct udp_header *udp = data;
a489b168
DDP
1529 key->src.port = udp->udp_src;
1530 key->dst.port = udp->udp_dst;
1531
1532 /* Port 0 is invalid */
1533 return key->src.port && key->dst.port;
1534}
1535
/* Forward declaration.  extract_l4_icmp() and extract_l4_icmp6() call
 * extract_l4() to parse the header nested in an ICMP error message, and
 * extract_l4(), defined further down, dispatches to both of them. */
static inline bool
extract_l4(struct conn_key *key, const void *data, size_t size,
           bool *related, const void *l3, bool validate_checksum);
a489b168 1539
b269a122
DDP
1540static uint8_t
1541reverse_icmp_type(uint8_t type)
1542{
1543 switch (type) {
1544 case ICMP4_ECHO_REQUEST:
1545 return ICMP4_ECHO_REPLY;
1546 case ICMP4_ECHO_REPLY:
1547 return ICMP4_ECHO_REQUEST;
1548
1549 case ICMP4_TIMESTAMP:
1550 return ICMP4_TIMESTAMPREPLY;
1551 case ICMP4_TIMESTAMPREPLY:
1552 return ICMP4_TIMESTAMP;
1553
1554 case ICMP4_INFOREQUEST:
1555 return ICMP4_INFOREPLY;
1556 case ICMP4_INFOREPLY:
1557 return ICMP4_INFOREQUEST;
1558 default:
1559 OVS_NOT_REACHED();
1560 }
1561}
1562
a489b168
DDP
1563/* If 'related' is not NULL and the function is processing an ICMP
1564 * error packet, extract the l3 and l4 fields from the nested header
1565 * instead and set *related to true. If 'related' is NULL we're
1566 * already processing a nested header and no such recursion is
1567 * possible */
1568static inline int
1569extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
1570 bool *related)
1571{
a489b168
DDP
1572 if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
1573 return false;
1574 }
1575
dec0dbbc
DB
1576 const struct icmp_header *icmp = data;
1577
a489b168
DDP
1578 switch (icmp->icmp_type) {
1579 case ICMP4_ECHO_REQUEST:
1580 case ICMP4_ECHO_REPLY:
1581 case ICMP4_TIMESTAMP:
1582 case ICMP4_TIMESTAMPREPLY:
1583 case ICMP4_INFOREQUEST:
1584 case ICMP4_INFOREPLY:
b269a122
DDP
1585 if (icmp->icmp_code != 0) {
1586 return false;
1587 }
a489b168 1588 /* Separate ICMP connection: identified using id */
b269a122
DDP
1589 key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
1590 key->src.icmp_type = icmp->icmp_type;
1591 key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
a489b168
DDP
1592 break;
1593 case ICMP4_DST_UNREACH:
1594 case ICMP4_TIME_EXCEEDED:
1595 case ICMP4_PARAM_PROB:
1596 case ICMP4_SOURCEQUENCH:
1597 case ICMP4_REDIRECT: {
1598 /* ICMP packet part of another connection. We should
1599 * extract the key from embedded packet header */
1600 struct conn_key inner_key;
1601 const char *l3 = (const char *) (icmp + 1);
1602 const char *tail = (const char *) data + size;
1603 const char *l4;
a489b168
DDP
1604
1605 if (!related) {
1606 return false;
1607 }
1608
1609 memset(&inner_key, 0, sizeof inner_key);
1610 inner_key.dl_type = htons(ETH_TYPE_IP);
dec0dbbc 1611 bool ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
a489b168
DDP
1612 if (!ok) {
1613 return false;
1614 }
1615
cda1b109 1616 if (inner_key.src.addr.ipv4 != key->dst.addr.ipv4) {
a489b168
DDP
1617 return false;
1618 }
1619
1620 key->src = inner_key.src;
1621 key->dst = inner_key.dst;
1622 key->nw_proto = inner_key.nw_proto;
1623
324459a3 1624 ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
a489b168
DDP
1625 if (ok) {
1626 conn_key_reverse(key);
1627 *related = true;
1628 }
1629 return ok;
1630 }
1631 default:
1632 return false;
1633 }
1634
1635 return true;
1636}
1637
b269a122
DDP
1638static uint8_t
1639reverse_icmp6_type(uint8_t type)
1640{
1641 switch (type) {
1642 case ICMP6_ECHO_REQUEST:
1643 return ICMP6_ECHO_REPLY;
1644 case ICMP6_ECHO_REPLY:
1645 return ICMP6_ECHO_REQUEST;
1646 default:
1647 OVS_NOT_REACHED();
1648 }
1649}
1650
a489b168
DDP
1651/* If 'related' is not NULL and the function is processing an ICMP
1652 * error packet, extract the l3 and l4 fields from the nested header
1653 * instead and set *related to true. If 'related' is NULL we're
1654 * already processing a nested header and no such recursion is
1655 * possible */
1656static inline bool
1657extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
1658 bool *related)
1659{
1660 const struct icmp6_header *icmp6 = data;
1661
1662 /* All the messages that we support need at least 4 bytes after
1663 * the header */
1664 if (size < sizeof *icmp6 + 4) {
1665 return false;
1666 }
1667
1668 switch (icmp6->icmp6_type) {
1669 case ICMP6_ECHO_REQUEST:
1670 case ICMP6_ECHO_REPLY:
b269a122
DDP
1671 if (icmp6->icmp6_code != 0) {
1672 return false;
1673 }
a489b168 1674 /* Separate ICMP connection: identified using id */
b269a122
DDP
1675 key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
1676 key->src.icmp_type = icmp6->icmp6_type;
1677 key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
a489b168
DDP
1678 break;
1679 case ICMP6_DST_UNREACH:
1680 case ICMP6_PACKET_TOO_BIG:
1681 case ICMP6_TIME_EXCEEDED:
1682 case ICMP6_PARAM_PROB: {
1683 /* ICMP packet part of another connection. We should
1684 * extract the key from embedded packet header */
1685 struct conn_key inner_key;
1686 const char *l3 = (const char *) icmp6 + 8;
1687 const char *tail = (const char *) data + size;
1688 const char *l4 = NULL;
a489b168
DDP
1689
1690 if (!related) {
1691 return false;
1692 }
1693
1694 memset(&inner_key, 0, sizeof inner_key);
1695 inner_key.dl_type = htons(ETH_TYPE_IPV6);
dec0dbbc 1696 bool ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
a489b168
DDP
1697 if (!ok) {
1698 return false;
1699 }
1700
1701 /* pf doesn't do this, but it seems a good idea */
cda1b109
DB
1702 if (!ipv6_addr_equals(&inner_key.src.addr.ipv6,
1703 &key->dst.addr.ipv6)) {
a489b168
DDP
1704 return false;
1705 }
1706
1707 key->src = inner_key.src;
1708 key->dst = inner_key.dst;
1709 key->nw_proto = inner_key.nw_proto;
1710
324459a3 1711 ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
a489b168
DDP
1712 if (ok) {
1713 conn_key_reverse(key);
1714 *related = true;
1715 }
1716 return ok;
1717 }
1718 default:
1719 return false;
1720 }
1721
1722 return true;
1723}
1724
1725/* Extract l4 fields into 'key', which must already contain valid l3
1726 * members.
1727 *
1728 * If 'related' is not NULL and an ICMP error packet is being
1729 * processed, the function will extract the key from the packet nested
1401f6de 1730 * in the ICMP payload and set '*related' to true.
a489b168 1731 *
9171c635
DB
1732 * 'size' here is the layer 4 size, which can be a nested size if parsing
1733 * an ICMP or ICMP6 header.
1734 *
a489b168
DDP
1735 * If 'related' is NULL, it means that we're already parsing a header nested
1736 * in an ICMP error. In this case, we skip checksum and length validation. */
1737static inline bool
1738extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
324459a3 1739 const void *l3, bool validate_checksum)
a489b168
DDP
1740{
1741 if (key->nw_proto == IPPROTO_TCP) {
324459a3
SC
1742 return (!related || check_l4_tcp(key, data, size, l3,
1743 validate_checksum)) && extract_l4_tcp(key, data, size);
a489b168 1744 } else if (key->nw_proto == IPPROTO_UDP) {
324459a3
SC
1745 return (!related || check_l4_udp(key, data, size, l3,
1746 validate_checksum)) && extract_l4_udp(key, data, size);
a489b168
DDP
1747 } else if (key->dl_type == htons(ETH_TYPE_IP)
1748 && key->nw_proto == IPPROTO_ICMP) {
324459a3 1749 return (!related || check_l4_icmp(data, size, validate_checksum))
a489b168
DDP
1750 && extract_l4_icmp(key, data, size, related);
1751 } else if (key->dl_type == htons(ETH_TYPE_IPV6)
1752 && key->nw_proto == IPPROTO_ICMPV6) {
324459a3
SC
1753 return (!related || check_l4_icmp6(key, data, size, l3,
1754 validate_checksum)) && extract_l4_icmp6(key, data, size,
1755 related);
a489b168
DDP
1756 } else {
1757 return false;
1758 }
1759}
1760
1761static bool
66e4ad8a 1762conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
a489b168
DDP
1763 struct conn_lookup_ctx *ctx, uint16_t zone)
1764{
2482b0b0 1765 const struct eth_header *l2 = dp_packet_eth(pkt);
a489b168
DDP
1766 const struct ip_header *l3 = dp_packet_l3(pkt);
1767 const char *l4 = dp_packet_l4(pkt);
a489b168
DDP
1768
1769 memset(ctx, 0, sizeof *ctx);
1770
1771 if (!l2 || !l3 || !l4) {
1772 return false;
1773 }
1774
1775 ctx->key.zone = zone;
1776
1777 /* XXX In this function we parse the packet (again, it has already
1778 * gone through miniflow_extract()) for two reasons:
1779 *
1780 * 1) To extract the l3 addresses and l4 ports.
1781 * We already have the l3 and l4 headers' pointers. Extracting
1782 * the l3 addresses and the l4 ports is really cheap, since they
1783 * can be found at fixed locations.
66e4ad8a
DDP
1784 * 2) To extract the l4 type.
1785 * Extracting the l4 types, for IPv6 can be quite expensive, because
1786 * it's not at a fixed location.
a489b168
DDP
1787 *
1788 * Here's a way to avoid (2) with the help of the datapath.
66e4ad8a 1789 * The datapath doesn't keep the packet's extracted flow[1], so
a489b168 1790 * using that is not an option. We could use the packet's matching
66e4ad8a
DDP
1791 * megaflow, but we have to make sure that the l4 type (nw_proto)
1792 * is unwildcarded. This means either:
a489b168 1793 *
66e4ad8a
DDP
1794 * a) dpif-netdev unwildcards the l4 type when a new flow is installed
1795 * if the actions contains ct().
a489b168 1796 *
66e4ad8a
DDP
1797 * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
1798 * action. This is already done in different actions, but it's
1799 * unnecessary for the kernel.
a489b168
DDP
1800 *
1801 * ---
66e4ad8a 1802 * [1] The reasons for this are that keeping the flow increases
a489b168
DDP
1803 * (slightly) the cache footprint and increases computation
1804 * time as we move the packet around. Most importantly, the flow
1805 * should be updated by the actions and this can be slow, as
1806 * we use a sparse representation (miniflow).
1807 *
1808 */
dec0dbbc 1809 bool ok;
66e4ad8a 1810 ctx->key.dl_type = dl_type;
dec0dbbc 1811
a489b168 1812 if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
dec0dbbc 1813 bool hwol_bad_l3_csum = dp_packet_ip_checksum_bad(pkt);
324459a3
SC
1814 if (hwol_bad_l3_csum) {
1815 ok = false;
1816 } else {
dec0dbbc 1817 bool hwol_good_l3_csum = dp_packet_ip_checksum_valid(pkt);
324459a3 1818 /* Validate the checksum only when hwol is not supported. */
9171c635 1819 ok = extract_l3_ipv4(&ctx->key, l3, dp_packet_l3_size(pkt), NULL,
324459a3
SC
1820 !hwol_good_l3_csum);
1821 }
a489b168 1822 } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
9171c635 1823 ok = extract_l3_ipv6(&ctx->key, l3, dp_packet_l3_size(pkt), NULL);
a489b168
DDP
1824 } else {
1825 ok = false;
1826 }
1827
1828 if (ok) {
324459a3
SC
1829 bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt);
1830 if (!hwol_bad_l4_csum) {
1831 bool hwol_good_l4_csum = dp_packet_l4_checksum_valid(pkt);
1832 /* Validate the checksum only when hwol is not supported. */
9171c635
DB
1833 if (extract_l4(&ctx->key, l4, dp_packet_l4_size(pkt),
1834 &ctx->icmp_related, l3, !hwol_good_l4_csum)) {
324459a3
SC
1835 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
1836 return true;
1837 }
a489b168
DDP
1838 }
1839 }
1840
1841 return false;
1842}
92edd073
DB
1843
1844static uint32_t
cda1b109 1845ct_addr_hash_add(uint32_t hash, const union ct_addr *addr)
92edd073
DB
1846{
1847 BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
1848 return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
1849}
1850
1851static uint32_t
1852ct_endpoint_hash_add(uint32_t hash, const struct ct_endpoint *ep)
1853{
1854 BUILD_ASSERT_DECL(sizeof *ep % 4 == 0);
1855 return hash_add_bytes32(hash, (const uint32_t *) ep, sizeof *ep);
1856}
a489b168
DDP
1857\f
1858/* Symmetric */
1859static uint32_t
1860conn_key_hash(const struct conn_key *key, uint32_t basis)
1861{
1862 uint32_t hsrc, hdst, hash;
a489b168 1863 hsrc = hdst = basis;
6b1d4625
DB
1864 hsrc = ct_endpoint_hash_add(hsrc, &key->src);
1865 hdst = ct_endpoint_hash_add(hdst, &key->dst);
a489b168
DDP
1866
1867 /* Even if source and destination are swapped the hash will be the same. */
1868 hash = hsrc ^ hdst;
1869
1870 /* Hash the rest of the key(L3 and L4 types and zone). */
763b40b0 1871 return hash_words((uint32_t *) (&key->dst + 1),
a489b168
DDP
1872 (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
1873 hash);
a489b168
DDP
1874}
1875
1876static void
1877conn_key_reverse(struct conn_key *key)
1878{
dec0dbbc 1879 struct ct_endpoint tmp = key->src;
a489b168
DDP
1880 key->src = key->dst;
1881 key->dst = tmp;
1882}
1883
286de272 1884static uint32_t
cda1b109 1885nat_ipv6_addrs_delta(struct in6_addr *ipv6_min, struct in6_addr *ipv6_max)
286de272 1886{
cda1b109
DB
1887 uint8_t *ipv6_min_hi = &ipv6_min->s6_addr[0];
1888 uint8_t *ipv6_min_lo = &ipv6_min->s6_addr[0] + sizeof(uint64_t);
1889 uint8_t *ipv6_max_hi = &ipv6_max->s6_addr[0];
1890 uint8_t *ipv6_max_lo = &ipv6_max->s6_addr[0] + sizeof(uint64_t);
286de272
DB
1891
1892 ovs_be64 addr6_64_min_hi;
1893 ovs_be64 addr6_64_min_lo;
1894 memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
1895 memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);
1896
1897 ovs_be64 addr6_64_max_hi;
1898 ovs_be64 addr6_64_max_lo;
1899 memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
1900 memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);
1901
1902 uint64_t diff;
dec0dbbc 1903
286de272
DB
1904 if (addr6_64_min_hi == addr6_64_max_hi &&
1905 ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
1906 diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
1907 } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
1908 ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
1909 diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
1910 ntohll(addr6_64_max_lo) - 1);
1911 } else {
1912 /* Limit address delta supported to 32 bits or 4 billion approximately.
1913 * Possibly, this should be visible to the user through a datapath
1914 * support check, however the practical impact is probably nil. */
1915 diff = 0xfffffffe;
1916 }
dec0dbbc 1917
286de272
DB
1918 if (diff > 0xfffffffe) {
1919 diff = 0xfffffffe;
1920 }
1921 return diff;
1922}
1923
1924/* This function must be used in tandem with nat_ipv6_addrs_delta(), which
1925 * restricts the input parameters. */
a489b168 1926static void
cda1b109 1927nat_ipv6_addr_increment(struct in6_addr *ipv6, uint32_t increment)
286de272 1928{
cda1b109
DB
1929 uint8_t *ipv6_hi = &ipv6->s6_addr[0];
1930 uint8_t *ipv6_lo = &ipv6->s6_addr[0] + sizeof(ovs_be64);
286de272
DB
1931 ovs_be64 addr6_64_hi;
1932 ovs_be64 addr6_64_lo;
1933 memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
1934 memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);
1935
1936 if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
1937 addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
1938 } else if (addr6_64_hi != OVS_BE64_MAX) {
1939 addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
1940 addr6_64_lo = htonll(increment - (UINT64_MAX -
1941 ntohll(addr6_64_lo) + 1));
1942 } else {
1943 OVS_NOT_REACHED();
1944 }
1945
1946 memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
1947 memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
286de272
DB
1948}
1949
1950static uint32_t
1951nat_range_hash(const struct conn *conn, uint32_t basis)
1952{
1953 uint32_t hash = basis;
286de272 1954
92edd073
DB
1955 hash = ct_addr_hash_add(hash, &conn->nat_info->min_addr);
1956 hash = ct_addr_hash_add(hash, &conn->nat_info->max_addr);
1957 hash = hash_add(hash,
1958 (conn->nat_info->max_port << 16)
1959 | conn->nat_info->min_port);
92edd073
DB
1960 hash = ct_endpoint_hash_add(hash, &conn->key.src);
1961 hash = ct_endpoint_hash_add(hash, &conn->key.dst);
286de272
DB
1962 hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
1963 hash = hash_add(hash, conn->key.nw_proto);
1964 hash = hash_add(hash, conn->key.zone);
92edd073
DB
1965
1966 /* The purpose of the second parameter is to distinguish hashes of data of
1967 * different length; our data always has the same length so there is no
1968 * value in counting. */
1969 return hash_finish(hash, 0);
286de272
DB
1970}
1971
1972static bool
1973nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
1974 struct conn *nat_conn)
1975{
bd5e81a0
DB
1976 enum { MIN_NAT_EPHEMERAL_PORT = 1024,
1977 MAX_NAT_EPHEMERAL_PORT = 65535 };
286de272
DB
1978
1979 uint16_t min_port;
1980 uint16_t max_port;
1981 uint16_t first_port;
286de272
DB
1982 uint32_t hash = nat_range_hash(conn, ct->hash_basis);
1983
1984 if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
1985 (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
1986 min_port = ntohs(conn->key.src.port);
1987 max_port = ntohs(conn->key.src.port);
1988 first_port = min_port;
1989 } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
1990 (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
1991 min_port = ntohs(conn->key.dst.port);
1992 max_port = ntohs(conn->key.dst.port);
1993 first_port = min_port;
1994 } else {
1995 uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
1996 uint32_t port_index = hash % (deltap + 1);
1997 first_port = conn->nat_info->min_port + port_index;
1998 min_port = conn->nat_info->min_port;
1999 max_port = conn->nat_info->max_port;
2000 }
2001
2002 uint32_t deltaa = 0;
2003 uint32_t address_index;
cda1b109 2004 union ct_addr ct_addr;
286de272 2005 memset(&ct_addr, 0, sizeof ct_addr);
cda1b109 2006 union ct_addr max_ct_addr;
286de272
DB
2007 memset(&max_ct_addr, 0, sizeof max_ct_addr);
2008 max_ct_addr = conn->nat_info->max_addr;
2009
2010 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
cda1b109
DB
2011 deltaa = ntohl(conn->nat_info->max_addr.ipv4) -
2012 ntohl(conn->nat_info->min_addr.ipv4);
286de272 2013 address_index = hash % (deltaa + 1);
cda1b109
DB
2014 ct_addr.ipv4 = htonl(
2015 ntohl(conn->nat_info->min_addr.ipv4) + address_index);
286de272 2016 } else {
cda1b109
DB
2017 deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6,
2018 &conn->nat_info->max_addr.ipv6);
286de272
DB
2019 /* deltaa must be within 32 bits for full hash coverage. A 64 or
2020 * 128 bit hash is unnecessary and hence not used here. Most code
2021 * is kept common with V4; nat_ipv6_addrs_delta() will do the
2022 * enforcement via max_ct_addr. */
2023 max_ct_addr = conn->nat_info->min_addr;
cda1b109 2024 nat_ipv6_addr_increment(&max_ct_addr.ipv6, deltaa);
286de272 2025 address_index = hash % (deltaa + 1);
cda1b109
DB
2026 ct_addr.ipv6 = conn->nat_info->min_addr.ipv6;
2027 nat_ipv6_addr_increment(&ct_addr.ipv6, address_index);
286de272
DB
2028 }
2029
2030 uint16_t port = first_port;
2031 bool all_ports_tried = false;
32b2c81f
DB
2032 /* For DNAT or for specified port ranges, we don't use ephemeral ports. */
2033 bool ephemeral_ports_tried
2034 = conn->nat_info->nat_action & NAT_ACTION_DST ||
2035 conn->nat_info->nat_action & NAT_ACTION_SRC_PORT
2036 ? true : false;
cda1b109 2037 union ct_addr first_addr = ct_addr;
4cd0481c
DB
2038 bool pat_enabled = conn->key.nw_proto != IPPROTO_ICMP &&
2039 conn->key.nw_proto != IPPROTO_ICMPV6;
286de272
DB
2040
2041 while (true) {
2042 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
2043 nat_conn->rev_key.dst.addr = ct_addr;
286de272
DB
2044 nat_conn->rev_key.dst.port = htons(port);
2045 } else {
1c8689d7 2046 nat_conn->rev_key.src.addr = ct_addr;
286de272
DB
2047 nat_conn->rev_key.src.port = htons(port);
2048 }
2049
967bb5c5
DB
2050 uint32_t conn_hash = conn_key_hash(&nat_conn->rev_key,
2051 ct->hash_basis);
2052 bool found = conn_key_lookup(ct, &nat_conn->rev_key, conn_hash,
2053 time_msec(), NULL, NULL);
2054 if (!found) {
286de272 2055 return true;
4cd0481c 2056 } else if (pat_enabled && !all_ports_tried) {
286de272
DB
2057 if (min_port == max_port) {
2058 all_ports_tried = true;
2059 } else if (port == max_port) {
2060 port = min_port;
2061 } else {
2062 port++;
2063 }
2064 if (port == first_port) {
2065 all_ports_tried = true;
2066 }
2067 } else {
2068 if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
2069 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
cda1b109 2070 ct_addr.ipv4 = htonl(ntohl(ct_addr.ipv4) + 1);
286de272 2071 } else {
cda1b109 2072 nat_ipv6_addr_increment(&ct_addr.ipv6, 1);
286de272
DB
2073 }
2074 } else {
2075 ct_addr = conn->nat_info->min_addr;
2076 }
2077 if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
4cd0481c 2078 if (pat_enabled && !ephemeral_ports_tried) {
ac04639a 2079 ephemeral_ports_tried = true;
286de272 2080 ct_addr = conn->nat_info->min_addr;
8417e688 2081 first_addr = ct_addr;
286de272
DB
2082 min_port = MIN_NAT_EPHEMERAL_PORT;
2083 max_port = MAX_NAT_EPHEMERAL_PORT;
2084 } else {
2085 break;
2086 }
2087 }
2088 first_port = min_port;
2089 port = first_port;
2090 all_ports_tried = false;
2091 }
2092 }
2093 return false;
2094}
2095
a489b168 2096static enum ct_update_res
967bb5c5
DB
2097conn_update(struct conntrack *ct, struct conn *conn, struct dp_packet *pkt,
2098 struct conn_lookup_ctx *ctx, long long now)
a489b168 2099{
967bb5c5
DB
2100 ovs_mutex_lock(&conn->lock);
2101 enum ct_update_res update_res =
2102 l4_protos[conn->key.nw_proto]->conn_update(ct, conn, pkt, ctx->reply,
2103 now);
2104 ovs_mutex_unlock(&conn->lock);
2105 return update_res;
a489b168
DDP
2106}
2107
2108static bool
2109conn_expired(struct conn *conn, long long now)
2110{
286de272 2111 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
967bb5c5
DB
2112 ovs_mutex_lock(&conn->lock);
2113 bool expired = now >= conn->expiration ? true : false;
2114 ovs_mutex_unlock(&conn->lock);
2115 return expired;
286de272
DB
2116 }
2117 return false;
a489b168
DDP
2118}
2119
2120static bool
2121valid_new(struct dp_packet *pkt, struct conn_key *key)
2122{
2123 return l4_protos[key->nw_proto]->valid_new(pkt);
2124}
2125
2126static struct conn *
967bb5c5
DB
2127new_conn(struct conntrack *ct, struct dp_packet *pkt, struct conn_key *key,
2128 long long now)
a489b168 2129{
967bb5c5 2130 return l4_protos[key->nw_proto]->new_conn(ct, pkt, now);
a489b168
DDP
2131}
2132
2133static void
967bb5c5 2134delete_conn_cmn(struct conn *conn)
a489b168 2135{
286de272 2136 free(conn->nat_info);
bd5e81a0 2137 free(conn->alg);
a489b168
DDP
2138 free(conn);
2139}
967bb5c5
DB
2140
2141static void
2142delete_conn(struct conn *conn)
2143{
2144 ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
2145 ovs_mutex_destroy(&conn->lock);
2146 free(conn->nat_conn);
2147 delete_conn_cmn(conn);
2148}
2149
2150/* Only used by conn_clean_one(). */
2151static void
2152delete_conn_one(struct conn *conn)
2153{
2154 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
2155 ovs_mutex_destroy(&conn->lock);
2156 }
2157 delete_conn_cmn(conn);
2158}
4d4e68ed 2159\f
271e48a0
YHW
2160/* Convert a conntrack address 'a' into an IP address 'b' based on 'dl_type'.
2161 *
2162 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPv6"
2163 * in network-byte order. */
4d4e68ed 2164static void
cda1b109 2165ct_endpoint_to_ct_dpif_inet_addr(const union ct_addr *a,
4d4e68ed
DDP
2166 union ct_dpif_inet_addr *b,
2167 ovs_be16 dl_type)
2168{
2169 if (dl_type == htons(ETH_TYPE_IP)) {
cda1b109 2170 b->ip = a->ipv4;
4d4e68ed 2171 } else if (dl_type == htons(ETH_TYPE_IPV6)){
cda1b109 2172 b->in6 = a->ipv6;
4d4e68ed
DDP
2173 }
2174}
2175
271e48a0
YHW
2176/* Convert an IP address 'a' into a conntrack address 'b' based on 'dl_type'.
2177 *
2178 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPv6"
2179 * in network-byte order. */
2180static void
2181ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
cda1b109 2182 union ct_addr *b, ovs_be16 dl_type)
271e48a0
YHW
2183{
2184 if (dl_type == htons(ETH_TYPE_IP)) {
cda1b109 2185 b->ipv4 = a->ip;
271e48a0 2186 } else if (dl_type == htons(ETH_TYPE_IPV6)){
cda1b109 2187 b->ipv6 = a->in6;
271e48a0
YHW
2188 }
2189}
2190
4d4e68ed
DDP
2191static void
2192conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
2193{
2194 if (key->dl_type == htons(ETH_TYPE_IP)) {
2195 tuple->l3_type = AF_INET;
2196 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
2197 tuple->l3_type = AF_INET6;
2198 }
2199 tuple->ip_proto = key->nw_proto;
2200 ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
2201 key->dl_type);
2202 ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
2203 key->dl_type);
2204
2205 if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
b269a122
DDP
2206 tuple->icmp_id = key->src.icmp_id;
2207 tuple->icmp_type = key->src.icmp_type;
2208 tuple->icmp_code = key->src.icmp_code;
4d4e68ed
DDP
2209 } else {
2210 tuple->src_port = key->src.port;
2211 tuple->dst_port = key->dst.port;
2212 }
2213}
2214
271e48a0
YHW
2215static void
2216tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone,
2217 struct conn_key *key)
2218{
2219 if (tuple->l3_type == AF_INET) {
2220 key->dl_type = htons(ETH_TYPE_IP);
2221 } else if (tuple->l3_type == AF_INET6) {
2222 key->dl_type = htons(ETH_TYPE_IPV6);
2223 }
2224 key->nw_proto = tuple->ip_proto;
2225 ct_dpif_inet_addr_to_ct_endpoint(&tuple->src, &key->src.addr,
2226 key->dl_type);
2227 ct_dpif_inet_addr_to_ct_endpoint(&tuple->dst, &key->dst.addr,
2228 key->dl_type);
2229
2230 if (tuple->ip_proto == IPPROTO_ICMP || tuple->ip_proto == IPPROTO_ICMPV6) {
2231 key->src.icmp_id = tuple->icmp_id;
2232 key->src.icmp_type = tuple->icmp_type;
2233 key->src.icmp_code = tuple->icmp_code;
2234 key->dst.icmp_id = tuple->icmp_id;
2235 key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type);
2236 key->dst.icmp_code = tuple->icmp_code;
2237 } else {
2238 key->src.port = tuple->src_port;
2239 key->dst.port = tuple->dst_port;
2240 }
2241 key->zone = zone;
2242}
2243
4d4e68ed
DDP
2244static void
2245conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
f1a0469e 2246 long long now)
4d4e68ed 2247{
4d4e68ed
DDP
2248 memset(entry, 0, sizeof *entry);
2249 conn_key_to_tuple(&conn->key, &entry->tuple_orig);
2250 conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
2251
2252 entry->zone = conn->key.zone;
4d4e68ed 2253
967bb5c5
DB
2254 ovs_mutex_lock(&conn->lock);
2255 entry->mark = conn->mark;
286de272 2256 memcpy(&entry->labels, &conn->label, sizeof entry->labels);
4d4e68ed 2257
dec0dbbc 2258 long long expiration = conn->expiration - now;
4d4e68ed 2259
dec0dbbc 2260 struct ct_l4_proto *class = l4_protos[conn->key.nw_proto];
4d4e68ed
DDP
2261 if (class->conn_get_protoinfo) {
2262 class->conn_get_protoinfo(conn, &entry->protoinfo);
2263 }
f1a0469e 2264 ovs_mutex_unlock(&conn->lock);
bd5e81a0 2265
f1a0469e 2266 entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
bd5e81a0
DB
2267
2268 if (conn->alg) {
2269 /* Caller is responsible for freeing. */
2270 entry->helper.name = xstrdup(conn->alg);
2271 }
4d4e68ed
DDP
2272}
2273
4ea96698
DB
2274struct ipf *
2275conntrack_ipf_ctx(struct conntrack *ct)
2276{
2277 return ct->ipf;
2278}
2279
4d4e68ed
DDP
2280int
2281conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
ded30c74 2282 const uint16_t *pzone, int *ptot_bkts)
4d4e68ed
DDP
2283{
2284 memset(dump, 0, sizeof(*dump));
dec0dbbc 2285
4d4e68ed
DDP
2286 if (pzone) {
2287 dump->zone = *pzone;
2288 dump->filter_zone = true;
2289 }
4d4e68ed 2290
dec0dbbc 2291 dump->ct = ct;
967bb5c5 2292 *ptot_bkts = 1; /* Need to clean up the callers. */
4d4e68ed
DDP
2293 return 0;
2294}
2295
2296int
2297conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
2298{
2299 struct conntrack *ct = dump->ct;
2300 long long now = time_msec();
2301
967bb5c5
DB
2302 for (;;) {
2303 struct cmap_node *cm_node = cmap_next_position(&ct->conns,
2304 &dump->cm_pos);
2305 if (!cm_node) {
2306 break;
4d4e68ed 2307 }
967bb5c5
DB
2308 struct conn *conn;
2309 INIT_CONTAINER(conn, cm_node, cm_node);
2310 if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
2311 (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
f1a0469e 2312 conn_to_ct_dpif_entry(conn, entry, now);
4d4e68ed
DDP
2313 return 0;
2314 }
2315 }
967bb5c5 2316
4d4e68ed
DDP
2317 return EOF;
2318}
2319
2320int
2321conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
2322{
2323 return 0;
2324}
5d9cbb4c
DDP
2325
2326int
2327conntrack_flush(struct conntrack *ct, const uint16_t *zone)
2328{
967bb5c5
DB
2329 struct conn *conn;
2330
2331 ovs_mutex_lock(&ct->ct_lock);
2332 CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
2333 if (!zone || *zone == conn->key.zone) {
2334 conn_clean_one(ct, conn);
5d9cbb4c 2335 }
5d9cbb4c 2336 }
967bb5c5 2337 ovs_mutex_unlock(&ct->ct_lock);
bd5e81a0 2338
5d9cbb4c
DDP
2339 return 0;
2340}
bd5e81a0 2341
271e48a0
YHW
2342int
2343conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
2344 uint16_t zone)
2345{
2346 struct conn_lookup_ctx ctx;
2347 int error = 0;
2348
2349 memset(&ctx, 0, sizeof(ctx));
2350 tuple_to_conn_key(tuple, zone, &ctx.key);
2351 ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
967bb5c5
DB
2352 ovs_mutex_lock(&ct->ct_lock);
2353 conn_key_lookup(ct, &ctx.key, ctx.hash, time_msec(), &ctx.conn,
2354 &ctx.reply);
271e48a0 2355
a1d5eeff 2356 if (ctx.conn && ctx.conn->conn_type == CT_CONN_TYPE_DEFAULT) {
967bb5c5 2357 conn_clean(ct, ctx.conn);
271e48a0 2358 } else {
a1d5eeff 2359 VLOG_WARN("Must flush tuple using the original pre-NATed tuple");
271e48a0
YHW
2360 error = ENOENT;
2361 }
967bb5c5
DB
2362
2363 ovs_mutex_unlock(&ct->ct_lock);
271e48a0
YHW
2364 return error;
2365}
2366
c92339ad
DB
2367int
2368conntrack_set_maxconns(struct conntrack *ct, uint32_t maxconns)
2369{
2370 atomic_store_relaxed(&ct->n_conn_limit, maxconns);
2371 return 0;
2372}
2373
2374int
2375conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns)
2376{
2377 atomic_read_relaxed(&ct->n_conn_limit, maxconns);
2378 return 0;
2379}
2380
875075b3
DB
2381int
2382conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns)
2383{
2384 *nconns = atomic_count_get(&ct->n_conn);
2385 return 0;
2386}
2387
bd5e81a0
DB
2388/* This function must be called with the ct->resources read lock taken. */
2389static struct alg_exp_node *
be38342d
DB
2390expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
2391 uint32_t basis, bool src_ip_wc)
bd5e81a0 2392{
c3f6bae2
DB
2393 struct conn_key check_key;
2394 memcpy(&check_key, key, sizeof check_key);
bd5e81a0 2395 check_key.src.port = ALG_WC_SRC_PORT;
dec0dbbc 2396
be38342d
DB
2397 if (src_ip_wc) {
2398 memset(&check_key.src.addr, 0, sizeof check_key.src.addr);
2399 }
dec0dbbc 2400
bd5e81a0
DB
2401 struct alg_exp_node *alg_exp_node;
2402
bd5e81a0 2403 HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
dec0dbbc 2404 conn_key_hash(&check_key, basis),
bd5e81a0
DB
2405 alg_expectations) {
2406 if (!conn_key_cmp(&alg_exp_node->key, &check_key)) {
2407 return alg_exp_node;
2408 }
2409 }
2410 return NULL;
2411}
2412
4417ca3d
DB
2413/* This function must be called with the ct->resources write lock taken. */
2414static void
2415expectation_remove(struct hmap *alg_expectations,
2416 const struct conn_key *key, uint32_t basis)
2417{
2418 struct alg_exp_node *alg_exp_node;
2419
2420 HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node, conn_key_hash(key, basis),
2421 alg_expectations) {
2422 if (!conn_key_cmp(&alg_exp_node->key, key)) {
2423 hmap_remove(alg_expectations, &alg_exp_node->node);
2424 break;
2425 }
2426 }
2427}
2428
2429/* This function must be called with the ct->resources read lock taken. */
2430static struct alg_exp_node *
2431expectation_ref_lookup_unique(const struct hindex *alg_expectation_refs,
2432 const struct conn_key *master_key,
2433 const struct conn_key *alg_exp_key,
2434 uint32_t basis)
2435{
2436 struct alg_exp_node *alg_exp_node;
2437
2438 HINDEX_FOR_EACH_WITH_HASH (alg_exp_node, node_ref,
2439 conn_key_hash(master_key, basis),
2440 alg_expectation_refs) {
2441 if (!conn_key_cmp(&alg_exp_node->master_key, master_key) &&
2442 !conn_key_cmp(&alg_exp_node->key, alg_exp_key)) {
2443 return alg_exp_node;
2444 }
2445 }
2446 return NULL;
2447}
2448
2449/* This function must be called with the ct->resources write lock taken. */
2450static void
2451expectation_ref_create(struct hindex *alg_expectation_refs,
2452 struct alg_exp_node *alg_exp_node,
2453 uint32_t basis)
2454{
2455 if (!expectation_ref_lookup_unique(alg_expectation_refs,
2456 &alg_exp_node->master_key,
2457 &alg_exp_node->key, basis)) {
2458 hindex_insert(alg_expectation_refs, &alg_exp_node->node_ref,
2459 conn_key_hash(&alg_exp_node->master_key, basis));
2460 }
2461}
2462
2463static void
967bb5c5 2464expectation_clean(struct conntrack *ct, const struct conn_key *master_key)
4417ca3d 2465{
967bb5c5 2466 ovs_rwlock_wrlock(&ct->resources_lock);
4417ca3d
DB
2467
2468 struct alg_exp_node *node, *next;
2469 HINDEX_FOR_EACH_WITH_HASH_SAFE (node, next, node_ref,
967bb5c5 2470 conn_key_hash(master_key, ct->hash_basis),
4417ca3d
DB
2471 &ct->alg_expectation_refs) {
2472 if (!conn_key_cmp(&node->master_key, master_key)) {
967bb5c5
DB
2473 expectation_remove(&ct->alg_expectations, &node->key,
2474 ct->hash_basis);
4417ca3d
DB
2475 hindex_remove(&ct->alg_expectation_refs, &node->node_ref);
2476 free(node);
2477 }
2478 }
2479
967bb5c5 2480 ovs_rwlock_unlock(&ct->resources_lock);
4417ca3d
DB
2481}
2482
bd5e81a0 2483static void
be38342d
DB
2484expectation_create(struct conntrack *ct, ovs_be16 dst_port,
2485 const struct conn *master_conn, bool reply, bool src_ip_wc,
2486 bool skip_nat)
bd5e81a0 2487{
cda1b109
DB
2488 union ct_addr src_addr;
2489 union ct_addr dst_addr;
2490 union ct_addr alg_nat_repl_addr;
be38342d 2491 struct alg_exp_node *alg_exp_node = xzalloc(sizeof *alg_exp_node);
bd5e81a0 2492
be38342d 2493 if (reply) {
bd5e81a0
DB
2494 src_addr = master_conn->key.src.addr;
2495 dst_addr = master_conn->key.dst.addr;
efa29a89 2496 alg_exp_node->nat_rpl_dst = true;
be38342d
DB
2497 if (skip_nat) {
2498 alg_nat_repl_addr = dst_addr;
efa29a89
DM
2499 } else if (master_conn->nat_info &&
2500 master_conn->nat_info->nat_action & NAT_ACTION_DST) {
2501 alg_nat_repl_addr = master_conn->rev_key.src.addr;
2502 alg_exp_node->nat_rpl_dst = false;
be38342d
DB
2503 } else {
2504 alg_nat_repl_addr = master_conn->rev_key.dst.addr;
2505 }
be38342d
DB
2506 } else {
2507 src_addr = master_conn->rev_key.src.addr;
2508 dst_addr = master_conn->rev_key.dst.addr;
efa29a89 2509 alg_exp_node->nat_rpl_dst = false;
be38342d
DB
2510 if (skip_nat) {
2511 alg_nat_repl_addr = src_addr;
efa29a89
DM
2512 } else if (master_conn->nat_info &&
2513 master_conn->nat_info->nat_action & NAT_ACTION_DST) {
2514 alg_nat_repl_addr = master_conn->key.dst.addr;
2515 alg_exp_node->nat_rpl_dst = true;
be38342d
DB
2516 } else {
2517 alg_nat_repl_addr = master_conn->key.src.addr;
2518 }
be38342d
DB
2519 }
2520 if (src_ip_wc) {
2521 memset(&src_addr, 0, sizeof src_addr);
bd5e81a0
DB
2522 }
2523
bd5e81a0
DB
2524 alg_exp_node->key.dl_type = master_conn->key.dl_type;
2525 alg_exp_node->key.nw_proto = master_conn->key.nw_proto;
2526 alg_exp_node->key.zone = master_conn->key.zone;
2527 alg_exp_node->key.src.addr = src_addr;
2528 alg_exp_node->key.dst.addr = dst_addr;
2529 alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
2530 alg_exp_node->key.dst.port = dst_port;
2531 alg_exp_node->master_mark = master_conn->mark;
2532 alg_exp_node->master_label = master_conn->label;
82b9ac94
DB
2533 memcpy(&alg_exp_node->master_key, &master_conn->key,
2534 sizeof alg_exp_node->master_key);
bd5e81a0
DB
2535 /* Take the write lock here because it is almost 100%
2536 * likely that the lookup will fail and
2537 * expectation_create() will be called below. */
967bb5c5 2538 ovs_rwlock_wrlock(&ct->resources_lock);
bd5e81a0 2539 struct alg_exp_node *alg_exp = expectation_lookup(
be38342d 2540 &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis, src_ip_wc);
bd5e81a0
DB
2541 if (alg_exp) {
2542 free(alg_exp_node);
967bb5c5 2543 ovs_rwlock_unlock(&ct->resources_lock);
bd5e81a0
DB
2544 return;
2545 }
2546
2547 alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
4417ca3d 2548 hmap_insert(&ct->alg_expectations, &alg_exp_node->node,
dec0dbbc 2549 conn_key_hash(&alg_exp_node->key, ct->hash_basis));
4417ca3d
DB
2550 expectation_ref_create(&ct->alg_expectation_refs, alg_exp_node,
2551 ct->hash_basis);
967bb5c5 2552 ovs_rwlock_unlock(&ct->resources_lock);
bd5e81a0
DB
2553}
2554
bd5e81a0
DB
/* Replaces, in place, the first 'substr_size' bytes at 'substr' with the
 * 'rep_str_size' bytes at 'rep_str'.
 *
 * 'total_size' is the number of bytes from 'substr' to the end of the data,
 * including the part being replaced; the bytes after the replaced part are
 * shifted so that they directly follow the replacement.
 *
 * The sizes are 'size_t' rather than 8-bit integers: callers pass lengths
 * derived from packet sizes, and a narrower type silently truncates any
 * value above 255, moving only part of the tail.
 *
 * Requires 'substr_size' <= 'total_size', and the buffer must have room for
 * 'total_size' - 'substr_size' + 'rep_str_size' bytes starting at 'substr'.
 * No terminator is written. */
static void
replace_substring(char *substr, size_t substr_size,
                  size_t total_size, char *rep_str,
                  size_t rep_str_size)
{
    size_t tail_size = total_size - substr_size;

    /* memmove() because the old and new tail positions overlap. */
    memmove(substr + rep_str_size, substr + substr_size, tail_size);
    memcpy(substr, rep_str, rep_str_size);
}
2564
cd7c99a6
DB
/* Replaces every occurrence of byte 'c1' with 'c2' in the NUL-terminated
 * string 'str', in place. */
static void
repl_bytes(char *str, char c1, char c2)
{
    for (char *p = str; *p; p++) {
        if (*p != c1) {
            continue;
        }
        *p = c2;
    }
}
2575
/* Replaces the 'size' bytes at 'pkt_str', which points into the data of
 * 'pkt', with the 'repl_size' bytes at 'repl_str', shifting the rest of the
 * packet up to its tail.  The packet size is then set to 'orig_used_size'
 * adjusted by the length difference. */
static void
modify_packet(struct dp_packet *pkt, char *pkt_str, size_t size,
              char *repl_str, size_t repl_size,
              uint32_t orig_used_size)
{
    const char *pkt_tail = (const char *) dp_packet_tail(pkt);

    replace_substring(pkt_str, size, pkt_tail - pkt_str, repl_str, repl_size);

    dp_packet_set_size(pkt, orig_used_size + (int) repl_size - (int) size);
}
2586
bd5e81a0
DB
2587/* Replace IPV4 address in FTP message with NATed address. */
2588static int
2589repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
2590 char *ftp_data_start,
cd7c99a6
DB
2591 size_t addr_offset_from_ftp_data_start,
2592 size_t addr_size OVS_UNUSED)
bd5e81a0
DB
2593{
2594 enum { MAX_FTP_V4_NAT_DELTA = 8 };
2595
2596 /* Do conservative check for pathological MTU usage. */
2597 uint32_t orig_used_size = dp_packet_size(pkt);
cd7c99a6
DB
2598 if (orig_used_size + MAX_FTP_V4_NAT_DELTA >
2599 dp_packet_get_allocated(pkt)) {
2600
bd5e81a0 2601 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
cd7c99a6
DB
2602 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP V4",
2603 dp_packet_get_allocated(pkt));
bd5e81a0
DB
2604 return 0;
2605 }
2606
cd7c99a6
DB
2607 char v4_addr_str[INET_ADDRSTRLEN] = {0};
2608 ovs_assert(inet_ntop(AF_INET, &v4_addr_rep, v4_addr_str,
2609 sizeof v4_addr_str));
2610 repl_bytes(v4_addr_str, '.', ',');
2611 modify_packet(pkt, ftp_data_start + addr_offset_from_ftp_data_start,
2612 addr_size, v4_addr_str, strlen(v4_addr_str),
2613 orig_used_size);
2614 return (int) strlen(v4_addr_str) - (int) addr_size;
bd5e81a0
DB
2615}
2616
/* Returns a pointer to the first decimal digit in the NUL-terminated string
 * 'str', or to its terminating NUL if it contains no digit.
 *
 * Each byte is converted to 'unsigned char' before being handed to
 * isdigit(): passing a negative 'char' value to a <ctype.h> function is
 * undefined behavior. */
static char *
skip_non_digits(char *str)
{
    while (*str != 0 && !isdigit((unsigned char) *str)) {
        str++;
    }
    return str;
}
2625
/* Scans the leading run of decimal digits in 'str', writes a NUL over the
 * first byte after the scanned run, and returns a pointer to that NUL.  The
 * return value is never NULL.
 *
 * Scanning stops at the first non-digit or once more than 'max_digits'
 * digits have been consumed, so up to 'max_digits' + 1 digits are kept.
 * NOTE(review): the '<=' bound is preserved from the original; confirm
 * whether keeping one extra digit is intentional.
 *
 * Each byte is converted to 'unsigned char' before being handed to
 * isdigit(): passing a negative 'char' value to a <ctype.h> function is
 * undefined behavior. */
static char *
terminate_number_str(char *str, uint8_t max_digits)
{
    uint8_t digits_found = 0;

    while (isdigit((unsigned char) *str) && digits_found <= max_digits) {
        str++;
        digits_found++;
    }

    *str = 0;
    return str;
}
2638
2639
2640static void
2641get_ftp_ctl_msg(struct dp_packet *pkt, char *ftp_msg)
2642{
2643 struct tcp_header *th = dp_packet_l4(pkt);
2644 char *tcp_hdr = (char *) th;
2645 uint32_t tcp_payload_len = tcp_payload_length(pkt);
2646 size_t tcp_payload_of_interest = MIN(tcp_payload_len,
2647 LARGEST_FTP_MSG_OF_INTEREST);
2648 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2649
2650 ovs_strlcpy(ftp_msg, tcp_hdr + tcp_hdr_len,
2651 tcp_payload_of_interest);
2652}
2653
2654static enum ftp_ctl_pkt
2655detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
2656 struct dp_packet *pkt)
2657{
bd5e81a0
DB
2658 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2659 get_ftp_ctl_msg(pkt, ftp_msg);
dec0dbbc 2660
bd5e81a0
DB
2661 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
2662 if (strncasecmp(ftp_msg, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD)) &&
2663 !strcasestr(ftp_msg, FTP_EPSV_REPLY)) {
2664 return CT_FTP_CTL_OTHER;
2665 }
2666 } else {
2667 if (strncasecmp(ftp_msg, FTP_PORT_CMD, strlen(FTP_PORT_CMD)) &&
2668 strncasecmp(ftp_msg, FTP_PASV_REPLY_CODE,
2669 strlen(FTP_PASV_REPLY_CODE))) {
2670 return CT_FTP_CTL_OTHER;
2671 }
2672 }
2673
2674 return CT_FTP_CTL_INTEREST;
2675}
2676
2677static enum ftp_ctl_pkt
2678process_ftp_ctl_v4(struct conntrack *ct,
2679 struct dp_packet *pkt,
2680 const struct conn *conn_for_expectation,
4417ca3d 2681 ovs_be32 *v4_addr_rep,
bd5e81a0 2682 char **ftp_data_v4_start,
cd7c99a6
DB
2683 size_t *addr_offset_from_ftp_data_start,
2684 size_t *addr_size)
bd5e81a0
DB
2685{
2686 struct tcp_header *th = dp_packet_l4(pkt);
2687 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2688 char *tcp_hdr = (char *) th;
2689 *ftp_data_v4_start = tcp_hdr + tcp_hdr_len;
2690 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2691 get_ftp_ctl_msg(pkt, ftp_msg);
bd5e81a0
DB
2692 char *ftp = ftp_msg;
2693 enum ct_alg_mode mode;
dec0dbbc 2694
23bea975 2695 if (!strncasecmp(ftp, FTP_PORT_CMD, strlen(FTP_PORT_CMD))) {
bd5e81a0
DB
2696 ftp = ftp_msg + strlen(FTP_PORT_CMD);
2697 mode = CT_FTP_MODE_ACTIVE;
2698 } else {
2699 ftp = ftp_msg + strlen(FTP_PASV_REPLY_CODE);
2700 mode = CT_FTP_MODE_PASSIVE;
2701 }
2702
2703 /* Find first space. */
2704 ftp = strchr(ftp, ' ');
2705 if (!ftp) {
2706 return CT_FTP_CTL_INVALID;
2707 }
2708
2709 /* Find the first digit, after space. */
2710 ftp = skip_non_digits(ftp);
2711 if (*ftp == 0) {
2712 return CT_FTP_CTL_INVALID;
2713 }
2714
2715 char *ip_addr_start = ftp;
2716 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
bd5e81a0 2717
dec0dbbc 2718 uint8_t comma_count = 0;
bd5e81a0
DB
2719 while (comma_count < 4 && *ftp) {
2720 if (*ftp == ',') {
2721 comma_count++;
2722 if (comma_count == 4) {
2723 *ftp = 0;
2724 } else {
2725 *ftp = '.';
2726 }
2727 }
2728 ftp++;
2729 }
2730 if (comma_count != 4) {
2731 return CT_FTP_CTL_INVALID;
2732 }
2733
2734 struct in_addr ip_addr;
2735 int rc2 = inet_pton(AF_INET, ip_addr_start, &ip_addr);
2736 if (rc2 != 1) {
2737 return CT_FTP_CTL_INVALID;
2738 }
2739
cd7c99a6 2740 *addr_size = ftp - ip_addr_start - 1;
bd5e81a0
DB
2741 char *save_ftp = ftp;
2742 ftp = terminate_number_str(ftp, MAX_FTP_PORT_DGTS);
2743 if (!ftp) {
2744 return CT_FTP_CTL_INVALID;
2745 }
2746 int value;
2747 if (!str_to_int(save_ftp, 10, &value)) {
2748 return CT_FTP_CTL_INVALID;
2749 }
2750
2751 /* This is derived from the L4 port maximum is 65535. */
2752 if (value > 255) {
2753 return CT_FTP_CTL_INVALID;
2754 }
2755
2756 uint16_t port_hs = value;
2757 port_hs <<= 8;
2758
2759 /* Skip over comma. */
2760 ftp++;
2761 save_ftp = ftp;
2762 bool digit_found = false;
2763 while (isdigit(*ftp)) {
2764 ftp++;
2765 digit_found = true;
2766 }
2767 if (!digit_found) {
2768 return CT_FTP_CTL_INVALID;
2769 }
2770 *ftp = 0;
2771 if (!str_to_int(save_ftp, 10, &value)) {
2772 return CT_FTP_CTL_INVALID;
2773 }
2774
2775 if (value > 255) {
2776 return CT_FTP_CTL_INVALID;
2777 }
2778
78a0b272 2779 port_hs |= value;
bd5e81a0
DB
2780 ovs_be16 port = htons(port_hs);
2781 ovs_be32 conn_ipv4_addr;
2782
2783 switch (mode) {
2784 case CT_FTP_MODE_ACTIVE:
cda1b109
DB
2785 *v4_addr_rep = conn_for_expectation->rev_key.dst.addr.ipv4;
2786 conn_ipv4_addr = conn_for_expectation->key.src.addr.ipv4;
bd5e81a0
DB
2787 break;
2788 case CT_FTP_MODE_PASSIVE:
cda1b109
DB
2789 *v4_addr_rep = conn_for_expectation->key.dst.addr.ipv4;
2790 conn_ipv4_addr = conn_for_expectation->rev_key.src.addr.ipv4;
bd5e81a0 2791 break;
7be77cb0 2792 case CT_TFTP_MODE:
bd5e81a0
DB
2793 default:
2794 OVS_NOT_REACHED();
2795 }
2796
2797 ovs_be32 ftp_ipv4_addr;
2798 ftp_ipv4_addr = ip_addr.s_addr;
2799 /* Although most servers will block this exploit, there may be some
2800 * less well managed. */
2801 if (ftp_ipv4_addr != conn_ipv4_addr && ftp_ipv4_addr != *v4_addr_rep) {
2802 return CT_FTP_CTL_INVALID;
2803 }
2804
be38342d
DB
2805 expectation_create(ct, port, conn_for_expectation,
2806 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
bd5e81a0
DB
2807 return CT_FTP_CTL_INTEREST;
2808}
2809
/* Returns a pointer to the first character of 'str' that cannot be part of
 * a textual IPv6 address, i.e. that is neither a hexadecimal digit, ':' nor
 * '.'.  The result points at the terminating NUL if every character
 * qualifies. */
static char *
skip_ipv6_digits(char *str)
{
    /* isxdigit() is undefined for negative arguments, so plain char must be
     * converted to unsigned char first. */
    while (isxdigit((unsigned char) *str) || *str == ':' || *str == '.') {
        str++;
    }
    return str;
}
2818
2819static enum ftp_ctl_pkt
2820process_ftp_ctl_v6(struct conntrack *ct,
2821 struct dp_packet *pkt,
2822 const struct conn *conn_for_expectation,
cda1b109 2823 union ct_addr *v6_addr_rep, char **ftp_data_start,
bd5e81a0
DB
2824 size_t *addr_offset_from_ftp_data_start,
2825 size_t *addr_size, enum ct_alg_mode *mode)
2826{
2827 struct tcp_header *th = dp_packet_l4(pkt);
2828 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2829 char *tcp_hdr = (char *) th;
2830 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
bd5e81a0
DB
2831 get_ftp_ctl_msg(pkt, ftp_msg);
2832 *ftp_data_start = tcp_hdr + tcp_hdr_len;
bd5e81a0
DB
2833 char *ftp = ftp_msg;
2834 struct in6_addr ip6_addr;
dec0dbbc 2835
23bea975 2836 if (!strncasecmp(ftp, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD))) {
bd5e81a0
DB
2837 ftp = ftp_msg + strlen(FTP_EPRT_CMD);
2838 ftp = skip_non_digits(ftp);
2839 if (*ftp != FTP_AF_V6 || isdigit(ftp[1])) {
2840 return CT_FTP_CTL_INVALID;
2841 }
2842 /* Jump over delimiter. */
2843 ftp += 2;
2844
bd5e81a0 2845 memset(&ip6_addr, 0, sizeof ip6_addr);
dec0dbbc 2846 char *ip_addr_start = ftp;
bd5e81a0
DB
2847 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
2848 ftp = skip_ipv6_digits(ftp);
2849 *ftp = 0;
2850 *addr_size = ftp - ip_addr_start;
2851 int rc2 = inet_pton(AF_INET6, ip_addr_start, &ip6_addr);
2852 if (rc2 != 1) {
2853 return CT_FTP_CTL_INVALID;
2854 }
2855 ftp++;
2856 *mode = CT_FTP_MODE_ACTIVE;
2857 } else {
2858 ftp = ftp_msg + strcspn(ftp_msg, "(");
2859 ftp = skip_non_digits(ftp);
2860 if (!isdigit(*ftp)) {
2861 return CT_FTP_CTL_INVALID;
2862 }
2863
2864 /* Not used for passive mode. */
2865 *addr_offset_from_ftp_data_start = 0;
2866 *addr_size = 0;
2867
2868 *mode = CT_FTP_MODE_PASSIVE;
2869 }
2870
2871 char *save_ftp = ftp;
2872 ftp = terminate_number_str(ftp, MAX_EXT_FTP_PORT_DGTS);
2873 if (!ftp) {
2874 return CT_FTP_CTL_INVALID;
2875 }
dec0dbbc 2876
bd5e81a0
DB
2877 int value;
2878 if (!str_to_int(save_ftp, 10, &value)) {
2879 return CT_FTP_CTL_INVALID;
2880 }
2881 if (value > CT_MAX_L4_PORT) {
2882 return CT_FTP_CTL_INVALID;
2883 }
2884
2885 uint16_t port_hs = value;
2886 ovs_be16 port = htons(port_hs);
2887
2888 switch (*mode) {
2889 case CT_FTP_MODE_ACTIVE:
2890 *v6_addr_rep = conn_for_expectation->rev_key.dst.addr;
2891 /* Although most servers will block this exploit, there may be some
2892 * less well managed. */
cda1b109
DB
2893 if (memcmp(&ip6_addr, &v6_addr_rep->ipv6, sizeof ip6_addr) &&
2894 memcmp(&ip6_addr, &conn_for_expectation->key.src.addr.ipv6,
bd5e81a0
DB
2895 sizeof ip6_addr)) {
2896 return CT_FTP_CTL_INVALID;
2897 }
2898 break;
2899 case CT_FTP_MODE_PASSIVE:
2900 *v6_addr_rep = conn_for_expectation->key.dst.addr;
2901 break;
7be77cb0 2902 case CT_TFTP_MODE:
bd5e81a0
DB
2903 default:
2904 OVS_NOT_REACHED();
2905 }
2906
be38342d
DB
2907 expectation_create(ct, port, conn_for_expectation,
2908 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
bd5e81a0
DB
2909 return CT_FTP_CTL_INTEREST;
2910}
2911
2912static int
cda1b109 2913repl_ftp_v6_addr(struct dp_packet *pkt, union ct_addr v6_addr_rep,
bd5e81a0
DB
2914 char *ftp_data_start,
2915 size_t addr_offset_from_ftp_data_start,
2916 size_t addr_size, enum ct_alg_mode mode)
2917{
2918 /* This is slightly bigger than really possible. */
2919 enum { MAX_FTP_V6_NAT_DELTA = 45 };
2920
2921 if (mode == CT_FTP_MODE_PASSIVE) {
2922 return 0;
2923 }
2924
2925 /* Do conservative check for pathological MTU usage. */
2926 uint32_t orig_used_size = dp_packet_size(pkt);
cd7c99a6
DB
2927 if (orig_used_size + MAX_FTP_V6_NAT_DELTA >
2928 dp_packet_get_allocated(pkt)) {
2929
bd5e81a0 2930 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
cd7c99a6
DB
2931 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP V6",
2932 dp_packet_get_allocated(pkt));
bd5e81a0
DB
2933 return 0;
2934 }
2935
298530b8 2936 char v6_addr_str[INET6_ADDRSTRLEN] = {0};
cda1b109 2937 ovs_assert(inet_ntop(AF_INET6, &v6_addr_rep.ipv6, v6_addr_str,
298530b8 2938 sizeof v6_addr_str));
cd7c99a6
DB
2939 modify_packet(pkt, ftp_data_start + addr_offset_from_ftp_data_start,
2940 addr_size, v6_addr_str, strlen(v6_addr_str),
2941 orig_used_size);
2942 return (int) strlen(v6_addr_str) - (int) addr_size;
bd5e81a0
DB
2943}
2944
d13d7115
DB
2945/* Increment/decrement a TCP sequence number. */
2946static void
2947adj_seqnum(ovs_16aligned_be32 *val, int32_t inc)
2948{
2949 put_16aligned_be32(val, htonl(ntohl(get_16aligned_be32(val)) + inc));
2950}
2951
bd5e81a0
DB
2952static void
2953handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
967bb5c5 2954 struct dp_packet *pkt, struct conn *ec, long long now,
253e4dc0 2955 enum ftp_ctl_pkt ftp_ctl, bool nat)
bd5e81a0
DB
2956{
2957 struct ip_header *l3_hdr = dp_packet_l3(pkt);
2958 ovs_be32 v4_addr_rep = 0;
cda1b109 2959 union ct_addr v6_addr_rep;
faa0826d 2960 size_t addr_offset_from_ftp_data_start = 0;
bd5e81a0
DB
2961 size_t addr_size = 0;
2962 char *ftp_data_start;
bd5e81a0
DB
2963 enum ct_alg_mode mode = CT_FTP_MODE_ACTIVE;
2964
2965 if (detect_ftp_ctl_type(ctx, pkt) != ftp_ctl) {
2966 return;
2967 }
2968
bd5e81a0
DB
2969 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
2970 int64_t seq_skew = 0;
dec0dbbc 2971
253e4dc0 2972 if (ftp_ctl == CT_FTP_CTL_INTEREST) {
bd5e81a0
DB
2973 enum ftp_ctl_pkt rc;
2974 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
253e4dc0 2975 rc = process_ftp_ctl_v6(ct, pkt, ec,
4417ca3d 2976 &v6_addr_rep, &ftp_data_start,
bd5e81a0
DB
2977 &addr_offset_from_ftp_data_start,
2978 &addr_size, &mode);
2979 } else {
253e4dc0 2980 rc = process_ftp_ctl_v4(ct, pkt, ec,
4417ca3d 2981 &v4_addr_rep, &ftp_data_start,
cd7c99a6
DB
2982 &addr_offset_from_ftp_data_start,
2983 &addr_size);
bd5e81a0
DB
2984 }
2985 if (rc == CT_FTP_CTL_INVALID) {
2986 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
2987 VLOG_WARN_RL(&rl, "Invalid FTP control packet format");
2988 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
2989 return;
2990 } else if (rc == CT_FTP_CTL_INTEREST) {
2991 uint16_t ip_len;
dec0dbbc 2992
bd5e81a0 2993 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
253e4dc0
DM
2994 if (nat) {
2995 seq_skew = repl_ftp_v6_addr(pkt, v6_addr_rep,
2996 ftp_data_start,
2997 addr_offset_from_ftp_data_start,
2998 addr_size, mode);
2999 }
3000
bd5e81a0 3001 if (seq_skew) {
253e4dc0
DM
3002 ip_len = ntohs(nh6->ip6_ctlun.ip6_un1.ip6_un1_plen) +
3003 seq_skew;
bd5e81a0 3004 nh6->ip6_ctlun.ip6_un1.ip6_un1_plen = htons(ip_len);
bd5e81a0
DB
3005 }
3006 } else {
253e4dc0
DM
3007 if (nat) {
3008 seq_skew = repl_ftp_v4_addr(pkt, v4_addr_rep,
3009 ftp_data_start,
cd7c99a6
DB
3010 addr_offset_from_ftp_data_start,
3011 addr_size);
253e4dc0 3012 }
bd5e81a0 3013 if (seq_skew) {
253e4dc0 3014 ip_len = ntohs(l3_hdr->ip_tot_len) + seq_skew;
bd5e81a0
DB
3015 l3_hdr->ip_csum = recalc_csum16(l3_hdr->ip_csum,
3016 l3_hdr->ip_tot_len, htons(ip_len));
3017 l3_hdr->ip_tot_len = htons(ip_len);
bd5e81a0
DB
3018 }
3019 }
3020 } else {
3021 OVS_NOT_REACHED();
3022 }
bd5e81a0
DB
3023 }
3024
3025 struct tcp_header *th = dp_packet_l4(pkt);
dec0dbbc 3026
253e4dc0 3027 if (nat && ec->seq_skew != 0) {
d13d7115
DB
3028 ctx->reply != ec->seq_skew_dir ?
3029 adj_seqnum(&th->tcp_ack, -ec->seq_skew) :
3030 adj_seqnum(&th->tcp_seq, ec->seq_skew);
bd5e81a0
DB
3031 }
3032
bd5e81a0 3033 th->tcp_csum = 0;
bd5e81a0 3034 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
76d85771
DB
3035 th->tcp_csum = packet_csum_upperlayer6(nh6, th, ctx->key.nw_proto,
3036 dp_packet_l4_size(pkt));
bd5e81a0 3037 } else {
76d85771
DB
3038 uint32_t tcp_csum = packet_csum_pseudoheader(l3_hdr);
3039 th->tcp_csum = csum_finish(
3040 csum_continue(tcp_csum, th, dp_packet_l4_size(pkt)));
bd5e81a0 3041 }
253e4dc0
DM
3042
3043 if (seq_skew) {
967bb5c5 3044 conn_seq_skew_set(ct, ec, now, seq_skew + ec->seq_skew,
253e4dc0
DM
3045 ctx->reply);
3046 }
bd5e81a0 3047}
7be77cb0
DB
3048
3049static void
3050handle_tftp_ctl(struct conntrack *ct,
94e71143 3051 const struct conn_lookup_ctx *ctx OVS_UNUSED,
967bb5c5
DB
3052 struct dp_packet *pkt, struct conn *conn_for_expectation,
3053 long long now OVS_UNUSED, enum ftp_ctl_pkt ftp_ctl OVS_UNUSED,
3054 bool nat OVS_UNUSED)
7be77cb0 3055{
be38342d
DB
3056 expectation_create(ct, conn_for_expectation->key.src.port,
3057 conn_for_expectation,
3058 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
7be77cb0 3059}