]> git.proxmox.com Git - mirror_ovs.git/blame - lib/conntrack.c
conntrack: Enforce conn_type for conn_clean().
[mirror_ovs.git] / lib / conntrack.c
CommitLineData
a489b168 1/*
bd5e81a0 2 * Copyright (c) 2015, 2016, 2017 Nicira, Inc.
a489b168
DDP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
bd5e81a0 18#include <ctype.h>
a489b168 19#include <errno.h>
ff6aa424 20#include <sys/types.h>
a489b168
DDP
21#include <netinet/in.h>
22#include <netinet/icmp6.h>
bd5e81a0 23#include <string.h>
a489b168
DDP
24
25#include "bitmap.h"
bd5e81a0 26#include "conntrack.h"
a489b168
DDP
27#include "conntrack-private.h"
28#include "coverage.h"
29#include "csum.h"
4d4e68ed 30#include "ct-dpif.h"
a489b168
DDP
31#include "dp-packet.h"
32#include "flow.h"
33#include "netdev.h"
34#include "odp-netlink.h"
35#include "openvswitch/hmap.h"
36#include "openvswitch/vlog.h"
37#include "ovs-rcu.h"
e6ef6cc6 38#include "ovs-thread.h"
fd016ae3 39#include "openvswitch/poll-loop.h"
a489b168
DDP
40#include "random.h"
41#include "timeval.h"
42
43VLOG_DEFINE_THIS_MODULE(conntrack);
44
45COVERAGE_DEFINE(conntrack_full);
e6ef6cc6 46COVERAGE_DEFINE(conntrack_long_cleanup);
a489b168
DDP
47
48struct conn_lookup_ctx {
49 struct conn_key key;
50 struct conn *conn;
51 uint32_t hash;
52 bool reply;
dbb597d3 53 bool icmp_related;
a489b168
DDP
54};
55
bd5e81a0
DB
/* Classification of a packet seen on an FTP control connection. */
enum ftp_ctl_pkt {
    CT_FTP_CTL_INTEREST,    /* Carries an address and/or port specifier. */
    CT_FTP_CTL_OTHER,       /* Carries no address and/or port specifier. */
    CT_FTP_CTL_INVALID,
};
63
/* Modes of operation of the supported ALGs. */
enum ct_alg_mode {
    CT_FTP_MODE_ACTIVE,
    CT_FTP_MODE_PASSIVE,
    CT_TFTP_MODE,
};
69
94e71143
DB
/* Kind of ALG control connection a packet belongs to. */
enum ct_alg_ctl_type {
    CT_ALG_CTL_NONE,
    CT_ALG_CTL_FTP,
    CT_ALG_CTL_TFTP,
    /* SIP cannot be enabled through OpenFlow; at present it only serves as
     * an example of an ALG that allows a wildcarded source IP address. */
    CT_ALG_CTL_SIP,
};
78
a489b168 79static bool conn_key_extract(struct conntrack *, struct dp_packet *,
66e4ad8a
DDP
80 ovs_be16 dl_type, struct conn_lookup_ctx *,
81 uint16_t zone);
a489b168
DDP
82static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
83static void conn_key_reverse(struct conn_key *);
84static void conn_key_lookup(struct conntrack_bucket *ctb,
85 struct conn_lookup_ctx *ctx,
86 long long now);
87static bool valid_new(struct dp_packet *pkt, struct conn_key *);
e6ef6cc6
DDP
88static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
89 struct conn_key *, long long now);
a489b168 90static void delete_conn(struct conn *);
e6ef6cc6
DDP
91static enum ct_update_res conn_update(struct conn *,
92 struct conntrack_bucket *ctb,
93 struct dp_packet *, bool reply,
94 long long now);
a489b168
DDP
95static bool conn_expired(struct conn *, long long now);
96static void set_mark(struct dp_packet *, struct conn *,
97 uint32_t val, uint32_t mask);
98static void set_label(struct dp_packet *, struct conn *,
99 const struct ovs_key_ct_labels *val,
100 const struct ovs_key_ct_labels *mask);
e6ef6cc6 101static void *clean_thread_main(void *f_);
a489b168 102
286de272
DB
103static struct nat_conn_key_node *
104nat_conn_keys_lookup(struct hmap *nat_conn_keys,
105 const struct conn_key *key,
106 uint32_t basis);
107
80cee116
DB
108static bool
109nat_conn_keys_insert(struct hmap *nat_conn_keys,
110 const struct conn *nat_conn,
111 uint32_t hash_basis);
112
286de272
DB
113static void
114nat_conn_keys_remove(struct hmap *nat_conn_keys,
115 const struct conn_key *key,
116 uint32_t basis);
117
118static bool
119nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
120 struct conn *nat_conn);
121
122static uint8_t
123reverse_icmp_type(uint8_t type);
124static uint8_t
125reverse_icmp6_type(uint8_t type);
126static inline bool
127extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
128 const char **new_data, bool validate_checksum);
129static inline bool
130extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
131 const char **new_data);
bd5e81a0 132static struct alg_exp_node *
be38342d
DB
133expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
134 uint32_t basis, bool src_ip_wc);
bd5e81a0
DB
135
136static int
137repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
138 char *ftp_data_v4_start,
139 size_t addr_offset_from_ftp_data_start);
140
141static enum ftp_ctl_pkt
142process_ftp_ctl_v4(struct conntrack *ct,
143 struct dp_packet *pkt,
144 const struct conn *conn_for_expectation,
4417ca3d 145 ovs_be32 *v4_addr_rep,
bd5e81a0
DB
146 char **ftp_data_v4_start,
147 size_t *addr_offset_from_ftp_data_start);
148
149static enum ftp_ctl_pkt
150detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
151 struct dp_packet *pkt);
152
4417ca3d
DB
153static void
154expectation_clean(struct conntrack *ct, const struct conn_key *master_key,
155 uint32_t basis);
156
94e71143
DB
157static struct ct_l4_proto *l4_protos[] = {
158 [IPPROTO_TCP] = &ct_proto_tcp,
159 [IPPROTO_UDP] = &ct_proto_other,
160 [IPPROTO_ICMP] = &ct_proto_icmp4,
161 [IPPROTO_ICMPV6] = &ct_proto_icmp6,
162};
163
bd5e81a0
DB
164static void
165handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
166 struct dp_packet *pkt,
167 const struct conn *conn_for_expectation,
168 long long now, enum ftp_ctl_pkt ftp_ctl, bool nat);
169
7be77cb0
DB
170static void
171handle_tftp_ctl(struct conntrack *ct,
94e71143 172 const struct conn_lookup_ctx *ctx OVS_UNUSED,
be38342d 173 struct dp_packet *pkt,
7be77cb0 174 const struct conn *conn_for_expectation,
4417ca3d
DB
175 long long now OVS_UNUSED,
176 enum ftp_ctl_pkt ftp_ctl OVS_UNUSED, bool nat OVS_UNUSED);
94e71143
DB
177
/* Signature shared by the ALG control packet handlers, such as
 * handle_ftp_ctl() and handle_tftp_ctl(). */
typedef void (*alg_helper)(struct conntrack *ct,
                           const struct conn_lookup_ctx *ctx,
                           struct dp_packet *pkt,
                           const struct conn *conn_for_expectation,
                           long long now,
                           enum ftp_ctl_pkt ftp_ctl,
                           bool nat);
184
185static alg_helper alg_helpers[] = {
186 [CT_ALG_CTL_NONE] = NULL,
187 [CT_ALG_CTL_FTP] = handle_ftp_ctl,
188 [CT_ALG_CTL_TFTP] = handle_tftp_ctl,
a489b168
DDP
189};
190
191long long ct_timeout_val[] = {
192#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
193 CT_TIMEOUTS
194#undef CT_TIMEOUT
195};
196
bd5e81a0
DB
/* Largest valid TCP or UDP port number. */
#define CT_MAX_L4_PORT 65535

/* Size of the string buffer used to parse FTP messages.  It is roughly
 * twice what is needed, which leaves some margin for error. */
#define LARGEST_FTP_MSG_OF_INTEREST 128

/* Active mode FTP: the port command string. */
#define FTP_PORT_CMD "PORT"

/* Passive mode FTP: the pasv reply code. */
#define FTP_PASV_REPLY_CODE "227"

/* Maximum number of decimal digits of a port component in an FTP command.
 * The port is written as two numbers of up to 3 digits each, the high part
 * being a multiple of 256. */
#define MAX_FTP_PORT_DGTS 3

/* Active mode FTP extension: the EPRT command string. */
#define FTP_EPRT_CMD "EPRT"

/* Passive mode FTP extension: the EPSV reply string. */
#define FTP_EPSV_REPLY "EXTENDED PASSIVE"

/* Maximum number of decimal digits of the port in an extended FTP
 * command. */
#define MAX_EXT_FTP_PORT_DGTS 5

/* Address family code for IPv6 in extended FTP commands. */
#define FTP_AF_V6 '2'

/* Wildcard L4 source port for ALGs, used in expectations when the port
 * number cannot be predicted. */
#define ALG_WC_SRC_PORT 0

/* Limit on the total number of connections; once it is reached no new
 * connections are accepted.  Only CT_CONN_TYPE_DEFAULT connections count
 * towards it. */
#define DEFAULT_N_CONN_LIMIT 3000000
228
5ed7a0b4
DB
229/* Does a member by member comparison of two conn_keys; this
230 * function must be kept in sync with struct conn_key; returns 0
231 * if the keys are equal or 1 if the keys are not equal. */
232static int
233conn_key_cmp(const struct conn_key *key1, const struct conn_key *key2)
234{
235 if (!memcmp(&key1->src.addr, &key2->src.addr, sizeof key1->src.addr) &&
236 !memcmp(&key1->dst.addr, &key2->dst.addr, sizeof key1->dst.addr) &&
237 (key1->src.icmp_id == key2->src.icmp_id) &&
238 (key1->src.icmp_type == key2->src.icmp_type) &&
239 (key1->src.icmp_code == key2->src.icmp_code) &&
240 (key1->dst.icmp_id == key2->dst.icmp_id) &&
241 (key1->dst.icmp_type == key2->dst.icmp_type) &&
242 (key1->dst.icmp_code == key2->dst.icmp_code) &&
243 (key1->dl_type == key2->dl_type) &&
244 (key1->zone == key2->zone) &&
245 (key1->nw_proto == key2->nw_proto)) {
246
247 return 0;
248 }
249 return 1;
250}
251
d8682ee5 252static void
dec0dbbc
DB
253ct_print_conn_info(const struct conn *c, const char *log_msg,
254 enum vlog_level vll, bool force, bool rl_on)
66f400f5
DB
255{
256#define CT_VLOG(RL_ON, LEVEL, ...) \
257 do { \
258 if (RL_ON) { \
259 static struct vlog_rate_limit rl_ = VLOG_RATE_LIMIT_INIT(5, 5); \
260 vlog_rate_limit(&this_module, LEVEL, &rl_, __VA_ARGS__); \
261 } else { \
262 vlog(&this_module, LEVEL, __VA_ARGS__); \
263 } \
264 } while (0)
265
266 if (OVS_UNLIKELY(force || vlog_is_enabled(&this_module, vll))) {
267 if (c->key.dl_type == htons(ETH_TYPE_IP)) {
268 CT_VLOG(rl_on, vll, "%s: src ip "IP_FMT" dst ip "IP_FMT" rev src "
269 "ip "IP_FMT" rev dst ip "IP_FMT" src/dst ports "
270 "%"PRIu16"/%"PRIu16" rev src/dst ports "
271 "%"PRIu16"/%"PRIu16" zone/rev zone "
272 "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
273 "%"PRIu8"/%"PRIu8, log_msg,
274 IP_ARGS(c->key.src.addr.ipv4_aligned),
275 IP_ARGS(c->key.dst.addr.ipv4_aligned),
276 IP_ARGS(c->rev_key.src.addr.ipv4_aligned),
277 IP_ARGS(c->rev_key.dst.addr.ipv4_aligned),
278 ntohs(c->key.src.port), ntohs(c->key.dst.port),
279 ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
280 c->key.zone, c->rev_key.zone, c->key.nw_proto,
281 c->rev_key.nw_proto);
282 } else {
283 char ip6_s[INET6_ADDRSTRLEN];
284 inet_ntop(AF_INET6, &c->key.src.addr.ipv6, ip6_s, sizeof ip6_s);
285 char ip6_d[INET6_ADDRSTRLEN];
286 inet_ntop(AF_INET6, &c->key.dst.addr.ipv6, ip6_d, sizeof ip6_d);
287 char ip6_rs[INET6_ADDRSTRLEN];
288 inet_ntop(AF_INET6, &c->rev_key.src.addr.ipv6, ip6_rs,
289 sizeof ip6_rs);
290 char ip6_rd[INET6_ADDRSTRLEN];
291 inet_ntop(AF_INET6, &c->rev_key.dst.addr.ipv6, ip6_rd,
292 sizeof ip6_rd);
293
294 CT_VLOG(rl_on, vll, "%s: src ip %s dst ip %s rev src ip %s"
295 " rev dst ip %s src/dst ports %"PRIu16"/%"PRIu16
296 " rev src/dst ports %"PRIu16"/%"PRIu16" zone/rev zone "
297 "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
298 "%"PRIu8"/%"PRIu8, log_msg, ip6_s, ip6_d, ip6_rs,
299 ip6_rd, ntohs(c->key.src.port), ntohs(c->key.dst.port),
300 ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
301 c->key.zone, c->rev_key.zone, c->key.nw_proto,
302 c->rev_key.nw_proto);
303 }
304 }
305}
306
a489b168
DDP
307/* Initializes the connection tracker 'ct'. The caller is responsible for
308 * calling 'conntrack_destroy()', when the instance is not needed anymore */
309void
310conntrack_init(struct conntrack *ct)
311{
e6ef6cc6 312 long long now = time_msec();
a489b168 313
8b934ced
DB
314 ct_rwlock_init(&ct->resources_lock);
315 ct_rwlock_wrlock(&ct->resources_lock);
286de272 316 hmap_init(&ct->nat_conn_keys);
bd5e81a0 317 hmap_init(&ct->alg_expectations);
4417ca3d 318 hindex_init(&ct->alg_expectation_refs);
bd5e81a0 319 ovs_list_init(&ct->alg_exp_list);
8b934ced 320 ct_rwlock_unlock(&ct->resources_lock);
286de272 321
dec0dbbc 322 for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
a489b168
DDP
323 struct conntrack_bucket *ctb = &ct->buckets[i];
324
325 ct_lock_init(&ctb->lock);
326 ct_lock_lock(&ctb->lock);
327 hmap_init(&ctb->connections);
dec0dbbc 328 for (unsigned j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
e6ef6cc6
DDP
329 ovs_list_init(&ctb->exp_lists[j]);
330 }
a489b168 331 ct_lock_unlock(&ctb->lock);
e6ef6cc6
DDP
332 ovs_mutex_init(&ctb->cleanup_mutex);
333 ovs_mutex_lock(&ctb->cleanup_mutex);
334 ctb->next_cleanup = now + CT_TM_MIN;
335 ovs_mutex_unlock(&ctb->cleanup_mutex);
a489b168
DDP
336 }
337 ct->hash_basis = random_uint32();
338 atomic_count_init(&ct->n_conn, 0);
339 atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
e6ef6cc6
DDP
340 latch_init(&ct->clean_thread_exit);
341 ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
a489b168
DDP
342}
343
344/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
345void
346conntrack_destroy(struct conntrack *ct)
347{
e6ef6cc6
DDP
348 latch_set(&ct->clean_thread_exit);
349 pthread_join(ct->clean_thread, NULL);
350 latch_destroy(&ct->clean_thread_exit);
dec0dbbc 351 for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
a489b168
DDP
352 struct conntrack_bucket *ctb = &ct->buckets[i];
353 struct conn *conn;
354
e6ef6cc6 355 ovs_mutex_destroy(&ctb->cleanup_mutex);
a489b168 356 ct_lock_lock(&ctb->lock);
bd5e81a0 357 HMAP_FOR_EACH_POP (conn, node, &ctb->connections) {
286de272
DB
358 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
359 atomic_count_dec(&ct->n_conn);
360 }
a489b168
DDP
361 delete_conn(conn);
362 }
363 hmap_destroy(&ctb->connections);
364 ct_lock_unlock(&ctb->lock);
365 ct_lock_destroy(&ctb->lock);
366 }
8b934ced 367 ct_rwlock_wrlock(&ct->resources_lock);
286de272
DB
368 struct nat_conn_key_node *nat_conn_key_node;
369 HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
370 free(nat_conn_key_node);
371 }
372 hmap_destroy(&ct->nat_conn_keys);
bd5e81a0
DB
373
374 struct alg_exp_node *alg_exp_node;
375 HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {
376 free(alg_exp_node);
377 }
4417ca3d 378
bd5e81a0
DB
379 ovs_list_poison(&ct->alg_exp_list);
380 hmap_destroy(&ct->alg_expectations);
4417ca3d 381 hindex_destroy(&ct->alg_expectation_refs);
8b934ced
DB
382 ct_rwlock_unlock(&ct->resources_lock);
383 ct_rwlock_destroy(&ct->resources_lock);
a489b168
DDP
384}
385\f
386static unsigned hash_to_bucket(uint32_t hash)
387{
388 /* Extracts the most significant bits in hash. The least significant bits
389 * are already used internally by the hmap implementation. */
390 BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
391
392 return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
393}
394
395static void
286de272 396write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
bd5e81a0 397 const struct conn_key *key, const struct alg_exp_node *alg_exp)
a489b168 398{
286de272 399 pkt->md.ct_state |= CS_TRACKED;
a489b168 400 pkt->md.ct_zone = zone;
daf4d3c1
JR
401 pkt->md.ct_mark = conn ? conn->mark : 0;
402 pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;
403
404 /* Use the original direction tuple if we have it. */
405 if (conn) {
bd5e81a0
DB
406 if (conn->alg_related) {
407 key = &conn->master_key;
408 } else {
409 key = &conn->key;
410 }
411 } else if (alg_exp) {
412 pkt->md.ct_mark = alg_exp->master_mark;
413 pkt->md.ct_label = alg_exp->master_label;
414 key = &alg_exp->master_key;
daf4d3c1 415 }
dec0dbbc 416
daf4d3c1 417 pkt->md.ct_orig_tuple_ipv6 = false;
dec0dbbc 418
daf4d3c1
JR
419 if (key) {
420 if (key->dl_type == htons(ETH_TYPE_IP)) {
421 pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
422 key->src.addr.ipv4_aligned,
423 key->dst.addr.ipv4_aligned,
424 key->nw_proto != IPPROTO_ICMP
425 ? key->src.port : htons(key->src.icmp_type),
426 key->nw_proto != IPPROTO_ICMP
427 ? key->dst.port : htons(key->src.icmp_code),
428 key->nw_proto,
429 };
286de272 430 } else {
daf4d3c1
JR
431 pkt->md.ct_orig_tuple_ipv6 = true;
432 pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
433 key->src.addr.ipv6_aligned,
434 key->dst.addr.ipv6_aligned,
435 key->nw_proto != IPPROTO_ICMPV6
436 ? key->src.port : htons(key->src.icmp_type),
437 key->nw_proto != IPPROTO_ICMPV6
438 ? key->dst.port : htons(key->src.icmp_code),
439 key->nw_proto,
440 };
441 }
442 } else {
443 memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
444 }
bd5e81a0
DB
445}
446
447static uint8_t
448get_ip_proto(const struct dp_packet *pkt)
449{
450 uint8_t ip_proto;
451 struct eth_header *l2 = dp_packet_eth(pkt);
452 if (l2->eth_type == htons(ETH_TYPE_IPV6)) {
453 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
454 ip_proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
455 } else {
456 struct ip_header *l3_hdr = dp_packet_l3(pkt);
457 ip_proto = l3_hdr->ip_proto;
458 }
286de272 459
bd5e81a0
DB
460 return ip_proto;
461}
462
463static bool
94e71143 464is_ftp_ctl(const enum ct_alg_ctl_type ct_alg_ctl)
bd5e81a0 465{
94e71143 466 return ct_alg_ctl == CT_ALG_CTL_FTP;
bd5e81a0
DB
467}
468
94e71143 469static enum ct_alg_ctl_type
bd7d93f8
DB
470get_alg_ctl_type(const struct dp_packet *pkt, ovs_be16 tp_src, ovs_be16 tp_dst,
471 const char *helper)
7be77cb0 472{
94e71143
DB
473 /* CT_IPPORT_FTP/TFTP is used because IPPORT_FTP/TFTP in not defined
474 * in OSX, at least in in.h. Since these values will never change, remove
7be77cb0 475 * the external dependency. */
94e71143
DB
476 enum { CT_IPPORT_FTP = 21 };
477 enum { CT_IPPORT_TFTP = 69 };
bd7d93f8
DB
478 uint8_t ip_proto = get_ip_proto(pkt);
479 struct udp_header *uh = dp_packet_l4(pkt);
480 struct tcp_header *th = dp_packet_l4(pkt);
481 ovs_be16 ftp_src_port = htons(CT_IPPORT_FTP);
482 ovs_be16 ftp_dst_port = htons(CT_IPPORT_FTP);
483 ovs_be16 tftp_dst_port = htons(CT_IPPORT_TFTP);
484
485 if (OVS_UNLIKELY(tp_dst)) {
486 if (helper && !strncmp(helper, "ftp", strlen("ftp"))) {
487 ftp_dst_port = tp_dst;
488 } else if (helper && !strncmp(helper, "tftp", strlen("tftp"))) {
489 tftp_dst_port = tp_dst;
490 }
491 } else if (OVS_UNLIKELY(tp_src)) {
492 if (helper && !strncmp(helper, "ftp", strlen("ftp"))) {
493 ftp_src_port = tp_src;
494 }
495 }
7be77cb0 496
bd7d93f8 497 if (ip_proto == IPPROTO_UDP && uh->udp_dst == tftp_dst_port) {
94e71143
DB
498 return CT_ALG_CTL_TFTP;
499 } else if (ip_proto == IPPROTO_TCP &&
bd7d93f8 500 (th->tcp_src == ftp_src_port || th->tcp_dst == ftp_dst_port)) {
94e71143
DB
501 return CT_ALG_CTL_FTP;
502 }
503 return CT_ALG_CTL_NONE;
504}
505
be38342d
DB
506static bool
507alg_src_ip_wc(enum ct_alg_ctl_type alg_ctl_type)
508{
509 if (alg_ctl_type == CT_ALG_CTL_SIP) {
510 return true;
511 }
512 return false;
513}
514
94e71143
DB
515static void
516handle_alg_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
517 struct dp_packet *pkt, enum ct_alg_ctl_type ct_alg_ctl,
518 const struct conn *conn, long long now, bool nat,
519 const struct conn *conn_for_expectation)
520{
521 /* ALG control packet handling with expectation creation. */
3a2a425b 522 if (OVS_UNLIKELY(alg_helpers[ct_alg_ctl] && conn && conn->alg)) {
94e71143
DB
523 alg_helpers[ct_alg_ctl](ct, ctx, pkt, conn_for_expectation, now,
524 CT_FTP_CTL_INTEREST, nat);
525 }
7be77cb0
DB
526}
527
286de272
DB
528static void
529pat_packet(struct dp_packet *pkt, const struct conn *conn)
530{
531 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
532 if (conn->key.nw_proto == IPPROTO_TCP) {
533 struct tcp_header *th = dp_packet_l4(pkt);
534 packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
535 } else if (conn->key.nw_proto == IPPROTO_UDP) {
536 struct udp_header *uh = dp_packet_l4(pkt);
537 packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
538 }
539 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
540 if (conn->key.nw_proto == IPPROTO_TCP) {
541 struct tcp_header *th = dp_packet_l4(pkt);
542 packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
543 } else if (conn->key.nw_proto == IPPROTO_UDP) {
544 struct udp_header *uh = dp_packet_l4(pkt);
545 packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
546 }
547 }
548}
549
550static void
551nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
552{
553 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
554 pkt->md.ct_state |= CS_SRC_NAT;
555 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
556 struct ip_header *nh = dp_packet_l3(pkt);
557 packet_set_ipv4_addr(pkt, &nh->ip_src,
558 conn->rev_key.dst.addr.ipv4_aligned);
559 } else {
560 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
561 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
562 nh6->ip6_src.be32,
563 &conn->rev_key.dst.addr.ipv6_aligned,
564 true);
565 }
566 if (!related) {
567 pat_packet(pkt, conn);
568 }
569 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
570 pkt->md.ct_state |= CS_DST_NAT;
571 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
572 struct ip_header *nh = dp_packet_l3(pkt);
573 packet_set_ipv4_addr(pkt, &nh->ip_dst,
574 conn->rev_key.src.addr.ipv4_aligned);
575 } else {
576 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
577 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
578 nh6->ip6_dst.be32,
579 &conn->rev_key.src.addr.ipv6_aligned,
580 true);
581 }
582 if (!related) {
583 pat_packet(pkt, conn);
584 }
585 }
586}
587
588static void
589un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
590{
591 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
592 if (conn->key.nw_proto == IPPROTO_TCP) {
593 struct tcp_header *th = dp_packet_l4(pkt);
594 packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
595 } else if (conn->key.nw_proto == IPPROTO_UDP) {
596 struct udp_header *uh = dp_packet_l4(pkt);
597 packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
598 }
599 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
600 if (conn->key.nw_proto == IPPROTO_TCP) {
601 struct tcp_header *th = dp_packet_l4(pkt);
602 packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
603 } else if (conn->key.nw_proto == IPPROTO_UDP) {
604 struct udp_header *uh = dp_packet_l4(pkt);
605 packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
606 }
607 }
608}
609
edd1bef4
DB
610static void
611reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
612{
613 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
614 if (conn->key.nw_proto == IPPROTO_TCP) {
615 struct tcp_header *th_in = dp_packet_l4(pkt);
616 packet_set_tcp_port(pkt, conn->key.src.port,
617 th_in->tcp_dst);
618 } else if (conn->key.nw_proto == IPPROTO_UDP) {
619 struct udp_header *uh_in = dp_packet_l4(pkt);
620 packet_set_udp_port(pkt, conn->key.src.port,
621 uh_in->udp_dst);
622 }
623 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
624 if (conn->key.nw_proto == IPPROTO_TCP) {
625 struct tcp_header *th_in = dp_packet_l4(pkt);
626 packet_set_tcp_port(pkt, th_in->tcp_src,
627 conn->key.dst.port);
628 } else if (conn->key.nw_proto == IPPROTO_UDP) {
629 struct udp_header *uh_in = dp_packet_l4(pkt);
630 packet_set_udp_port(pkt, uh_in->udp_src,
631 conn->key.dst.port);
632 }
633 }
634}
635
636static void
637reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
638{
639 char *tail = dp_packet_tail(pkt);
640 char pad = dp_packet_l2_pad_size(pkt);
641 struct conn_key inner_key;
642 const char *inner_l4 = NULL;
643 uint16_t orig_l3_ofs = pkt->l3_ofs;
644 uint16_t orig_l4_ofs = pkt->l4_ofs;
645
646 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
647 struct ip_header *nh = dp_packet_l3(pkt);
648 struct icmp_header *icmp = dp_packet_l4(pkt);
649 struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
bd5e81a0
DB
650 extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3) - pad,
651 &inner_l4, false);
edd1bef4
DB
652 pkt->l3_ofs += (char *) inner_l3 - (char *) nh;
653 pkt->l4_ofs += inner_l4 - (char *) icmp;
654
655 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
656 packet_set_ipv4_addr(pkt, &inner_l3->ip_src,
657 conn->key.src.addr.ipv4_aligned);
658 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
659 packet_set_ipv4_addr(pkt, &inner_l3->ip_dst,
660 conn->key.dst.addr.ipv4_aligned);
661 }
dec0dbbc 662
edd1bef4
DB
663 reverse_pat_packet(pkt, conn);
664 icmp->icmp_csum = 0;
665 icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
666 } else {
667 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
668 struct icmp6_error_header *icmp6 = dp_packet_l4(pkt);
669 struct ovs_16aligned_ip6_hdr *inner_l3_6 =
670 (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1);
671 extract_l3_ipv6(&inner_key, inner_l3_6,
672 tail - ((char *)inner_l3_6) - pad,
673 &inner_l4);
674 pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6;
675 pkt->l4_ofs += inner_l4 - (char *) icmp6;
676
677 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
678 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
679 inner_l3_6->ip6_src.be32,
680 &conn->key.src.addr.ipv6_aligned,
681 true);
682 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
683 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
684 inner_l3_6->ip6_dst.be32,
685 &conn->key.dst.addr.ipv6_aligned,
686 true);
687 }
688 reverse_pat_packet(pkt, conn);
689 uint32_t icmp6_csum = packet_csum_pseudoheader6(nh6);
690 icmp6->icmp6_base.icmp6_cksum = 0;
691 icmp6->icmp6_base.icmp6_cksum = csum_finish(
692 csum_continue(icmp6_csum, icmp6, tail - (char *) icmp6 - pad));
693 }
694 pkt->l3_ofs = orig_l3_ofs;
695 pkt->l4_ofs = orig_l4_ofs;
696}
697
286de272
DB
698static void
699un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
700 bool related)
701{
702 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
703 pkt->md.ct_state |= CS_DST_NAT;
704 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
705 struct ip_header *nh = dp_packet_l3(pkt);
706 packet_set_ipv4_addr(pkt, &nh->ip_dst,
707 conn->key.src.addr.ipv4_aligned);
708 } else {
709 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
710 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
711 nh6->ip6_dst.be32,
712 &conn->key.src.addr.ipv6_aligned, true);
713 }
edd1bef4
DB
714
715 if (OVS_UNLIKELY(related)) {
716 reverse_nat_packet(pkt, conn);
717 } else {
286de272
DB
718 un_pat_packet(pkt, conn);
719 }
720 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
721 pkt->md.ct_state |= CS_SRC_NAT;
722 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
723 struct ip_header *nh = dp_packet_l3(pkt);
724 packet_set_ipv4_addr(pkt, &nh->ip_src,
725 conn->key.dst.addr.ipv4_aligned);
726 } else {
727 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
728 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
729 nh6->ip6_src.be32,
730 &conn->key.dst.addr.ipv6_aligned, true);
731 }
edd1bef4
DB
732
733 if (OVS_UNLIKELY(related)) {
734 reverse_nat_packet(pkt, conn);
735 } else {
286de272
DB
736 un_pat_packet(pkt, conn);
737 }
738 }
739}
740
741/* Typical usage of this helper is in non per-packet code;
742 * this is because the bucket lock needs to be held for lookup
743 * and a hash would have already been needed. Hence, this function
744 * is just intended for code clarity. */
745static struct conn *
bd5e81a0 746conn_lookup(struct conntrack *ct, const struct conn_key *key, long long now)
286de272
DB
747{
748 struct conn_lookup_ctx ctx;
749 ctx.conn = NULL;
750 ctx.key = *key;
751 ctx.hash = conn_key_hash(key, ct->hash_basis);
752 unsigned bucket = hash_to_bucket(ctx.hash);
753 conn_key_lookup(&ct->buckets[bucket], &ctx, now);
754 return ctx.conn;
755}
756
bd5e81a0
DB
757static void
758conn_seq_skew_set(struct conntrack *ct, const struct conn_key *key,
759 long long now, int seq_skew, bool seq_skew_dir)
760{
dec0dbbc 761 unsigned bucket = hash_to_bucket(conn_key_hash(key, ct->hash_basis));
bd5e81a0
DB
762 ct_lock_lock(&ct->buckets[bucket].lock);
763 struct conn *conn = conn_lookup(ct, key, now);
764 if (conn && seq_skew) {
765 conn->seq_skew = seq_skew;
766 conn->seq_skew_dir = seq_skew_dir;
767 }
768 ct_lock_unlock(&ct->buckets[bucket].lock);
769}
770
286de272
DB
771static void
772nat_clean(struct conntrack *ct, struct conn *conn,
773 struct conntrack_bucket *ctb)
774 OVS_REQUIRES(ctb->lock)
775{
8b934ced 776 ct_rwlock_wrlock(&ct->resources_lock);
286de272 777 nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
8b934ced 778 ct_rwlock_unlock(&ct->resources_lock);
286de272 779 ct_lock_unlock(&ctb->lock);
dec0dbbc
DB
780 unsigned bucket_rev_conn =
781 hash_to_bucket(conn_key_hash(&conn->rev_key, ct->hash_basis));
286de272 782 ct_lock_lock(&ct->buckets[bucket_rev_conn].lock);
8b934ced 783 ct_rwlock_wrlock(&ct->resources_lock);
dec0dbbc 784 long long now = time_msec();
286de272 785 struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now);
286de272
DB
786 struct nat_conn_key_node *nat_conn_key_node =
787 nat_conn_keys_lookup(&ct->nat_conn_keys, &conn->rev_key,
788 ct->hash_basis);
789
790 /* In the unlikely event, rev conn was recreated, then skip
791 * rev_conn cleanup. */
792 if (rev_conn && (!nat_conn_key_node ||
5ed7a0b4
DB
793 conn_key_cmp(&nat_conn_key_node->value,
794 &rev_conn->rev_key))) {
286de272
DB
795 hmap_remove(&ct->buckets[bucket_rev_conn].connections,
796 &rev_conn->node);
797 free(rev_conn);
798 }
286de272 799
dec0dbbc 800 delete_conn(conn);
8b934ced 801 ct_rwlock_unlock(&ct->resources_lock);
286de272
DB
802 ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock);
803 ct_lock_lock(&ctb->lock);
804}
805
9e8f3960 806/* Must be called with 'CT_CONN_TYPE_DEFAULT' 'conn_type'. */
286de272
DB
807static void
808conn_clean(struct conntrack *ct, struct conn *conn,
809 struct conntrack_bucket *ctb)
810 OVS_REQUIRES(ctb->lock)
811{
9e8f3960
DB
812 ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
813
4417ca3d
DB
814 if (conn->alg) {
815 expectation_clean(ct, &conn->key, ct->hash_basis);
816 }
286de272
DB
817 ovs_list_remove(&conn->exp_node);
818 hmap_remove(&ctb->connections, &conn->node);
819 atomic_count_dec(&ct->n_conn);
820 if (conn->nat_info) {
821 nat_clean(ct, conn, ctb);
822 } else {
823 delete_conn(conn);
824 }
a489b168
DDP
825}
826
3a2a425b
DB
827static bool
828ct_verify_helper(const char *helper, enum ct_alg_ctl_type ct_alg_ctl)
829{
830 if (ct_alg_ctl == CT_ALG_CTL_NONE) {
831 return true;
832 } else if (helper) {
833 if ((ct_alg_ctl == CT_ALG_CTL_FTP) &&
834 !strncmp(helper, "ftp", strlen("ftp"))) {
835 return true;
836 } else if ((ct_alg_ctl == CT_ALG_CTL_TFTP) &&
837 !strncmp(helper, "tftp", strlen("tftp"))) {
838 return true;
839 } else {
840 return false;
841 }
842 } else {
843 return false;
844 }
845}
846
ac6abe5f 847/* This function is called with the bucket lock held. */
a489b168
DDP
848static struct conn *
849conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
286de272
DB
850 struct conn_lookup_ctx *ctx, bool commit, long long now,
851 const struct nat_action_info_t *nat_action_info,
bd5e81a0
DB
852 struct conn *conn_for_un_nat_copy,
853 const char *helper,
3a2a425b
DB
854 const struct alg_exp_node *alg_exp,
855 enum ct_alg_ctl_type ct_alg_ctl)
a489b168 856{
a489b168
DDP
857 struct conn *nc = NULL;
858
859 if (!valid_new(pkt, &ctx->key)) {
286de272 860 pkt->md.ct_state = CS_INVALID;
a489b168
DDP
861 return nc;
862 }
dec0dbbc 863
286de272 864 pkt->md.ct_state = CS_NEW;
dec0dbbc 865
bd5e81a0
DB
866 if (alg_exp) {
867 pkt->md.ct_state |= CS_RELATED;
868 }
a489b168
DDP
869
870 if (commit) {
871 unsigned int n_conn_limit;
a489b168
DDP
872 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
873
874 if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
875 COVERAGE_INC(conntrack_full);
876 return nc;
877 }
878
dec0dbbc 879 unsigned bucket = hash_to_bucket(ctx->hash);
e6ef6cc6 880 nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
286de272
DB
881 ctx->conn = nc;
882 nc->rev_key = nc->key;
883 conn_key_reverse(&nc->rev_key);
a489b168 884
3a2a425b
DB
885 if (ct_verify_helper(helper, ct_alg_ctl)) {
886 nc->alg = nullable_xstrdup(helper);
bd5e81a0
DB
887 }
888
889 if (alg_exp) {
890 nc->alg_related = true;
891 nc->mark = alg_exp->master_mark;
892 nc->label = alg_exp->master_label;
893 nc->master_key = alg_exp->master_key;
894 }
895
286de272
DB
896 if (nat_action_info) {
897 nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
a489b168 898
bd5e81a0 899 if (alg_exp) {
be38342d 900 if (alg_exp->nat_rpl_dst) {
bd5e81a0
DB
901 nc->rev_key.dst.addr = alg_exp->alg_nat_repl_addr;
902 nc->nat_info->nat_action = NAT_ACTION_SRC;
903 } else {
904 nc->rev_key.src.addr = alg_exp->alg_nat_repl_addr;
905 nc->nat_info->nat_action = NAT_ACTION_DST;
906 }
907 *conn_for_un_nat_copy = *nc;
d8682ee5
DB
908 ct_rwlock_wrlock(&ct->resources_lock);
909 bool new_insert = nat_conn_keys_insert(&ct->nat_conn_keys,
910 conn_for_un_nat_copy,
911 ct->hash_basis);
912 ct_rwlock_unlock(&ct->resources_lock);
913 if (!new_insert) {
914 char *log_msg = xasprintf("Pre-existing alg "
915 "nat_conn_key");
916 ct_print_conn_info(conn_for_un_nat_copy, log_msg, VLL_INFO,
917 true, false);
918 free(log_msg);
919 }
bd5e81a0
DB
920 } else {
921 *conn_for_un_nat_copy = *nc;
922 ct_rwlock_wrlock(&ct->resources_lock);
dec0dbbc
DB
923 bool nat_res = nat_select_range_tuple(ct, nc,
924 conn_for_un_nat_copy);
286de272 925
bd5e81a0
DB
926 if (!nat_res) {
927 goto nat_res_exhaustion;
928 }
286de272 929
bd5e81a0
DB
930 /* Update nc with nat adjustments made to
931 * conn_for_un_nat_copy by nat_select_range_tuple(). */
286de272 932 *nc = *conn_for_un_nat_copy;
bd5e81a0 933 ct_rwlock_unlock(&ct->resources_lock);
286de272 934 }
bd5e81a0
DB
935 conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
936 conn_for_un_nat_copy->nat_info = NULL;
937 conn_for_un_nat_copy->alg = NULL;
dbb597d3 938 nat_packet(pkt, nc, ctx->icmp_related);
286de272 939 }
a489b168
DDP
940 hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
941 atomic_count_inc(&ct->n_conn);
942 }
bd5e81a0 943
a489b168 944 return nc;
bd5e81a0
DB
945
946 /* This would be a user error or a DOS attack.
947 * A user error is prevented by allocating enough
948 * combinations of NAT addresses when combined with
949 * ephemeral ports. A DOS attack should be protected
950 * against with firewall rules or a separate firewall.
951 * Also using zone partitioning can limit DoS impact. */
952nat_res_exhaustion:
d8c5a93b 953 ovs_list_remove(&nc->exp_node);
bd5e81a0
DB
954 delete_conn(nc);
955 /* conn_for_un_nat_copy is a local variable in process_one; this
956 * memset() serves to document that conn_for_un_nat_copy is from
957 * this point on unused. */
958 memset(conn_for_un_nat_copy, 0, sizeof *conn_for_un_nat_copy);
959 ct_rwlock_unlock(&ct->resources_lock);
960 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
961 VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
962 "if DoS attack, use firewalling and/or zone partitioning.");
963 return NULL;
a489b168
DDP
964}
965
286de272
DB
966static bool
967conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
968 struct conn_lookup_ctx *ctx, struct conn **conn,
969 long long now, unsigned bucket)
970 OVS_REQUIRES(ct->buckets[bucket].lock)
971{
972 bool create_new_conn = false;
973
dbb597d3 974 if (ctx->icmp_related) {
286de272
DB
975 pkt->md.ct_state |= CS_RELATED;
976 if (ctx->reply) {
977 pkt->md.ct_state |= CS_REPLY_DIR;
978 }
979 } else {
bd5e81a0
DB
980 if ((*conn)->alg_related) {
981 pkt->md.ct_state |= CS_RELATED;
982 }
dec0dbbc 983
286de272
DB
984 enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket],
985 pkt, ctx->reply, now);
986
987 switch (res) {
988 case CT_UPDATE_VALID:
989 pkt->md.ct_state |= CS_ESTABLISHED;
990 pkt->md.ct_state &= ~CS_NEW;
991 if (ctx->reply) {
992 pkt->md.ct_state |= CS_REPLY_DIR;
993 }
994 break;
995 case CT_UPDATE_INVALID:
996 pkt->md.ct_state = CS_INVALID;
997 break;
998 case CT_UPDATE_NEW:
999 conn_clean(ct, *conn, &ct->buckets[bucket]);
1000 create_new_conn = true;
1001 break;
1002 default:
1003 OVS_NOT_REACHED();
1004 }
1005 }
1006 return create_new_conn;
1007}
1008
1009static void
1010create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
bd5e81a0 1011 long long now, bool alg_un_nat)
286de272
DB
1012{
1013 struct conn *nc = xmemdup(conn_for_un_nat_copy, sizeof *nc);
1014 nc->key = conn_for_un_nat_copy->rev_key;
1015 nc->rev_key = conn_for_un_nat_copy->key;
1016 uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis);
1017 unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash);
1018 ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock);
286de272
DB
1019 struct conn *rev_conn = conn_lookup(ct, &nc->key, now);
1020
bd5e81a0 1021 if (alg_un_nat) {
d8682ee5
DB
1022 if (!rev_conn) {
1023 hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
1024 &nc->node, un_nat_hash);
1025 } else {
1026 char *log_msg = xasprintf("Unusual condition for un_nat conn "
1027 "create for alg: rev_conn %p", rev_conn);
1028 ct_print_conn_info(nc, log_msg, VLL_INFO, true, false);
1029 free(log_msg);
1030 free(nc);
1031 }
286de272 1032 } else {
bd5e81a0
DB
1033 ct_rwlock_rdlock(&ct->resources_lock);
1034
1035 struct nat_conn_key_node *nat_conn_key_node =
1036 nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis);
1037 if (nat_conn_key_node && !conn_key_cmp(&nat_conn_key_node->value,
1038 &nc->rev_key) && !rev_conn) {
bd5e81a0
DB
1039 hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
1040 &nc->node, un_nat_hash);
1041 } else {
d8682ee5
DB
1042 char *log_msg = xasprintf("Unusual condition for un_nat conn "
1043 "create: nat_conn_key_node/rev_conn "
1044 "%p/%p", nat_conn_key_node, rev_conn);
1045 ct_print_conn_info(nc, log_msg, VLL_INFO, true, false);
1046 free(log_msg);
bd5e81a0
DB
1047 free(nc);
1048 }
1049 ct_rwlock_unlock(&ct->resources_lock);
286de272 1050 }
286de272
DB
1051 ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock);
1052}
1053
1054static void
1055handle_nat(struct dp_packet *pkt, struct conn *conn,
1056 uint16_t zone, bool reply, bool related)
1057{
1058 if (conn->nat_info &&
1059 (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
1060 (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
1061 zone != pkt->md.ct_zone))) {
bd5e81a0 1062
286de272
DB
1063 if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
1064 pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
1065 }
1066 if (reply) {
1067 un_nat_packet(pkt, conn, related);
1068 } else {
1069 nat_packet(pkt, conn, related);
1070 }
1071 }
1072}
1073
f8016041
DB
1074static bool
1075check_orig_tuple(struct conntrack *ct, struct dp_packet *pkt,
1076 struct conn_lookup_ctx *ctx_in, long long now,
1077 unsigned *bucket, struct conn **conn,
1078 const struct nat_action_info_t *nat_action_info)
1079 OVS_REQUIRES(ct->buckets[*bucket].lock)
1080{
1081 if ((ctx_in->key.dl_type == htons(ETH_TYPE_IP) &&
1082 !pkt->md.ct_orig_tuple.ipv4.ipv4_proto) ||
1083 (ctx_in->key.dl_type == htons(ETH_TYPE_IPV6) &&
1084 !pkt->md.ct_orig_tuple.ipv6.ipv6_proto) ||
1085 !(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
1086 nat_action_info) {
1087 return false;
1088 }
1089
1090 ct_lock_unlock(&ct->buckets[*bucket].lock);
1091 struct conn_lookup_ctx ctx;
1092 memset(&ctx, 0 , sizeof ctx);
1093 ctx.conn = NULL;
1094
1095 if (ctx_in->key.dl_type == htons(ETH_TYPE_IP)) {
1096 ctx.key.src.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_src;
1097 ctx.key.dst.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_dst;
1098
1099 if (ctx_in->key.nw_proto == IPPROTO_ICMP) {
1100 ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
1101 ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
1102 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv4.src_port);
1103 ctx.key.src.icmp_type = (uint8_t) src_port;
1104 ctx.key.dst.icmp_type = reverse_icmp_type(ctx.key.src.icmp_type);
1105 } else {
1106 ctx.key.src.port = pkt->md.ct_orig_tuple.ipv4.src_port;
1107 ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv4.dst_port;
1108 }
1109 ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv4.ipv4_proto;
1110 } else {
1111 ctx.key.src.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_src;
1112 ctx.key.dst.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_dst;
1113
1114 if (ctx_in->key.nw_proto == IPPROTO_ICMPV6) {
1115 ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
1116 ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
1117 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv6.src_port);
1118 ctx.key.src.icmp_type = (uint8_t) src_port;
1119 ctx.key.dst.icmp_type = reverse_icmp6_type(ctx.key.src.icmp_type);
1120 } else {
1121 ctx.key.src.port = pkt->md.ct_orig_tuple.ipv6.src_port;
1122 ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv6.dst_port;
1123 }
1124 ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv6.ipv6_proto;
1125 }
1126
1127 ctx.key.dl_type = ctx_in->key.dl_type;
1128 ctx.key.zone = pkt->md.ct_zone;
f8016041
DB
1129 ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
1130 *bucket = hash_to_bucket(ctx.hash);
1131 ct_lock_lock(&ct->buckets[*bucket].lock);
1132 conn_key_lookup(&ct->buckets[*bucket], &ctx, now);
1133 *conn = ctx.conn;
f8016041
DB
1134 return *conn ? true : false;
1135}
1136
bd5e81a0
DB
1137static bool
1138is_un_nat_conn_valid(const struct conn *un_nat_conn)
1139{
1140 return un_nat_conn->conn_type == CT_CONN_TYPE_UN_NAT;
1141}
1142
94e71143
DB
1143static bool
1144conn_update_state_alg(struct conntrack *ct, struct dp_packet *pkt,
1145 struct conn_lookup_ctx *ctx, struct conn *conn,
1146 const struct nat_action_info_t *nat_action_info,
1147 enum ct_alg_ctl_type ct_alg_ctl, long long now,
1148 unsigned bucket, bool *create_new_conn)
1149 OVS_REQUIRES(ct->buckets[bucket].lock)
1150{
1151 if (is_ftp_ctl(ct_alg_ctl)) {
1152 /* Keep sequence tracking in sync with the source of the
1153 * sequence skew. */
1154 if (ctx->reply != conn->seq_skew_dir) {
1155 handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1156 !!nat_action_info);
1157 *create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
1158 bucket);
1159 } else {
1160 *create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
1161 bucket);
030958a0
DB
1162
1163 if (*create_new_conn == false) {
1164 handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1165 !!nat_action_info);
1166 }
94e71143
DB
1167 }
1168 return true;
1169 }
1170 return false;
1171}
1172
/* Runs a single packet 'pkt', whose lookup key and hash are already stored
 * in 'ctx', through the connection tracker 'ct'.
 *
 * The function looks up the connection in the bucket selected by
 * 'ctx->hash', updates the state of a matching connection or creates a new
 * one through conn_not_found(), writes the conntrack metadata of 'pkt' with
 * write_ct_md(), applies 'setmark' and 'setlabel' when they are given and a
 * connection exists, creates the un-NAT entry when conn_not_found() prepared
 * one, and finally calls handle_alg_ctl().
 *
 * Bucket locks are taken and released internally; no bucket lock is held on
 * return. */
286de272 1173static void
a489b168
DDP
1174process_one(struct conntrack *ct, struct dp_packet *pkt,
1175 struct conn_lookup_ctx *ctx, uint16_t zone,
286de272
DB
1176 bool force, bool commit, long long now, const uint32_t *setmark,
1177 const struct ovs_key_ct_labels *setlabel,
bd5e81a0 1178 const struct nat_action_info_t *nat_action_info,
bd7d93f8 1179 ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper)
a489b168 1180{
286de272 1181 struct conn *conn;
a489b168 1182 unsigned bucket = hash_to_bucket(ctx->hash);
286de272
DB
1183 ct_lock_lock(&ct->buckets[bucket].lock);
1184 conn_key_lookup(&ct->buckets[bucket], ctx, now);
1185 conn = ctx->conn;
a489b168 1186
a76a37ef
JR
1187 /* Delete found entry if in wrong direction. 'force' implies commit. */
1188 if (conn && force && ctx->reply) {
286de272 1189 conn_clean(ct, conn, &ct->buckets[bucket]);
a76a37ef
JR
1190 conn = NULL;
1191 }
1192
286de272
DB
1193 if (OVS_LIKELY(conn)) {
/* A match on an un-NAT entry is treated as a reply: continue with the
 * connection stored under the entry's reverse key.  The bucket is
 * recomputed from that key, so the current bucket lock is dropped and the
 * lock of the new bucket is taken before the second lookup. */
1194 if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
a489b168 1195
286de272 1196 ctx->reply = true;
a489b168 1197
286de272
DB
1198 struct conn_lookup_ctx ctx2;
1199 ctx2.conn = NULL;
1200 ctx2.key = conn->rev_key;
1201 ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis);
1202
1203 ct_lock_unlock(&ct->buckets[bucket].lock);
1204 bucket = hash_to_bucket(ctx2.hash);
1205
1206 ct_lock_lock(&ct->buckets[bucket].lock);
1207 conn_key_lookup(&ct->buckets[bucket], &ctx2, now);
1208
1209 if (ctx2.conn) {
1210 conn = ctx2.conn;
1211 } else {
1212 /* This is a race condition where conn has timed out and been
1213 * removed between unlock of the rev_conn and lock of the forward
1214 * conn; nothing to do. */
1215 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
1216 ct_lock_unlock(&ct->buckets[bucket].lock);
1217 return;
a489b168
DDP
1218 }
1219 }
286de272
DB
1220 }
1221
1222 bool create_new_conn = false;
/* Only 'conn_type' is initialized here.  conn_not_found() fills in the rest
 * and switches the type to CT_CONN_TYPE_UN_NAT when an un-NAT entry is
 * needed, which is what is_un_nat_conn_valid() checks further down. */
1223 struct conn conn_for_un_nat_copy;
1224 conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;
94e71143 1225
bd7d93f8
DB
1226 enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, tp_src, tp_dst,
1227 helper);
bd5e81a0 1228
286de272 1229 if (OVS_LIKELY(conn)) {
/* conn_update_state_alg() returns false when it did not handle the packet;
 * fall back to the plain state update in that case. */
94e71143
DB
1230 if (OVS_LIKELY(!conn_update_state_alg(ct, pkt, ctx, conn,
1231 nat_action_info,
1232 ct_alg_ctl, now, bucket,
1233 &create_new_conn))) {
bd5e81a0
DB
1234 create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
1235 bucket);
1236 }
286de272 1237 if (nat_action_info && !create_new_conn) {
dbb597d3 1238 handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
286de272 1239 }
bd5e81a0 1240
/* No direct match: try the original tuple from the packet metadata.  Note
 * that check_orig_tuple() may change 'bucket' and the lock that is held. */
dec0dbbc 1241 } else if (check_orig_tuple(ct, pkt, ctx, now, &bucket, &conn,
bd5e81a0 1242 nat_action_info)) {
dec0dbbc 1243 create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
a489b168 1244 } else {
dbb597d3 1245 if (ctx->icmp_related) {
bd5e81a0
DB
1246 /* An icmp related conn should always be found; no new
1247 connection is created based on an icmp related packet. */
286de272 1248 pkt->md.ct_state = CS_INVALID;
5c2e106b 1249 } else {
286de272 1250 create_new_conn = true;
5c2e106b 1251 }
a489b168
DDP
1252 }
1253
bd5e81a0 1254 const struct alg_exp_node *alg_exp = NULL;
96bbcbf7 1255 struct alg_exp_node alg_exp_entry;
dec0dbbc 1256
286de272 1257 if (OVS_UNLIKELY(create_new_conn)) {
bd5e81a0
DB
1258
/* A matching expectation is copied into the local 'alg_exp_entry' while the
 * resources lock is held, and 'alg_exp' is redirected to that copy, so that
 * it can still be used after the lock has been released. */
1259 ct_rwlock_rdlock(&ct->resources_lock);
1260 alg_exp = expectation_lookup(&ct->alg_expectations, &ctx->key,
be38342d
DB
1261 ct->hash_basis,
1262 alg_src_ip_wc(ct_alg_ctl));
bd5e81a0
DB
1263 if (alg_exp) {
1264 alg_exp_entry = *alg_exp;
1265 alg_exp = &alg_exp_entry;
1266 }
1267 ct_rwlock_unlock(&ct->resources_lock);
1268
286de272 1269 conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
3a2a425b
DB
1270 &conn_for_un_nat_copy, helper, alg_exp,
1271 ct_alg_ctl);
286de272
DB
1272 }
1273
bd5e81a0
DB
1274 write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
1275
286de272
DB
1276 if (conn && setmark) {
1277 set_mark(pkt, conn, setmark[0], setmark[1]);
1278 }
a489b168 1279
286de272
DB
1280 if (conn && setlabel) {
1281 set_label(pkt, conn, &setlabel[0], &setlabel[1]);
1282 }
1283
/* Snapshot the connection while the bucket lock is still held; the copy is
 * what handle_alg_ctl() receives after the lock has been released.  The
 * snapshot is left uninitialized when there is no ALG control type or no
 * connection. */
bd5e81a0 1284 struct conn conn_for_expectation;
94e71143 1285 if (OVS_UNLIKELY((ct_alg_ctl != CT_ALG_CTL_NONE) && conn)) {
bd5e81a0
DB
1286 conn_for_expectation = *conn;
1287 }
1288
286de272
DB
1289 ct_lock_unlock(&ct->buckets[bucket].lock);
1290
/* create_un_nat_conn() takes the lock of the un-NAT entry's own bucket, so
 * it is called only after the lock above has been released. */
bd5e81a0
DB
1291 if (is_un_nat_conn_valid(&conn_for_un_nat_copy)) {
1292 create_un_nat_conn(ct, &conn_for_un_nat_copy, now, !!alg_exp);
1293 }
1294
94e71143
DB
1295 handle_alg_ctl(ct, ctx, pkt, ct_alg_ctl, conn, now, !!nat_action_info,
1296 &conn_for_expectation);
a489b168
DDP
1297}
1298
1299/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
1300 * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
1301 * the l3 and and l4 offset properly set.
1302 *
1303 * If 'commit' is true, the packets are allowed to create new entries in the
1304 * connection tables. 'setmark', if not NULL, should point to a two
1305 * elements array containing a value and a mask to set the connection mark.
1306 * 'setlabel' behaves similarly for the connection label.*/
1307int
1308conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
a76a37ef 1309 ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
66e4ad8a 1310 const uint32_t *setmark,
a489b168 1311 const struct ovs_key_ct_labels *setlabel,
bd7d93f8 1312 ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper,
94053e66
FA
1313 const struct nat_action_info_t *nat_action_info,
1314 long long now)
a489b168 1315{
bd5e81a0 1316
43495c45 1317 struct dp_packet *packet;
61ce32b9 1318 struct conn_lookup_ctx ctx;
a489b168 1319
e883448e 1320 DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) {
43495c45
BB
1321 if (!conn_key_extract(ct, packet, dl_type, &ctx, zone)) {
1322 packet->md.ct_state = CS_INVALID;
1323 write_ct_md(packet, zone, NULL, NULL, NULL);
a489b168
DDP
1324 continue;
1325 }
94e71143 1326 process_one(ct, packet, &ctx, zone, force, commit, now, setmark,
bd7d93f8 1327 setlabel, nat_action_info, tp_src, tp_dst, helper);
a489b168
DDP
1328 }
1329
1330 return 0;
1331}
1332
1fe178d2
EG
1333void
1334conntrack_clear(struct dp_packet *packet)
1335{
1336 /* According to pkt_metadata_init(), ct_state == 0 is enough to make all of
1337 * the conntrack fields invalid. */
1338 packet->md.ct_state = 0;
1339}
1340
a489b168
DDP
1341static void
1342set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
1343{
bd5e81a0
DB
1344 if (conn->alg_related) {
1345 pkt->md.ct_mark = conn->mark;
1346 } else {
1347 pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
1348 conn->mark = pkt->md.ct_mark;
1349 }
a489b168
DDP
1350}
1351
1352static void
1353set_label(struct dp_packet *pkt, struct conn *conn,
1354 const struct ovs_key_ct_labels *val,
1355 const struct ovs_key_ct_labels *mask)
1356{
bd5e81a0
DB
1357 if (conn->alg_related) {
1358 pkt->md.ct_label = conn->label;
1359 } else {
1360 ovs_u128 v, m;
a489b168 1361
bd5e81a0
DB
1362 memcpy(&v, val, sizeof v);
1363 memcpy(&m, mask, sizeof m);
a489b168 1364
bd5e81a0 1365 pkt->md.ct_label.u64.lo = v.u64.lo
a489b168 1366 | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
bd5e81a0 1367 pkt->md.ct_label.u64.hi = v.u64.hi
a489b168 1368 | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
bd5e81a0
DB
1369 conn->label = pkt->md.ct_label;
1370 }
a489b168 1371}
286de272 1372
a489b168 1373\f
e6ef6cc6
DDP
1374/* Delete the expired connections from 'ctb', up to 'limit'. Returns the
1375 * earliest expiration time among the remaining connections in 'ctb'. Returns
1376 * LLONG_MAX if 'ctb' is empty. The return value might be smaller than 'now',
1377 * if 'limit' is reached */
1378static long long
bd5e81a0
DB
1379sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb,
1380 long long now, size_t limit)
e6ef6cc6
DDP
1381 OVS_REQUIRES(ctb->lock)
1382{
1383 struct conn *conn, *next;
1384 long long min_expiration = LLONG_MAX;
e6ef6cc6
DDP
1385 size_t count = 0;
1386
dec0dbbc 1387 for (unsigned i = 0; i < N_CT_TM; i++) {
e6ef6cc6 1388 LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
286de272
DB
1389 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
1390 if (!conn_expired(conn, now) || count >= limit) {
1391 min_expiration = MIN(min_expiration, conn->expiration);
1392 if (count >= limit) {
1393 /* Do not check other lists. */
1394 COVERAGE_INC(conntrack_long_cleanup);
1395 return min_expiration;
1396 }
1397 break;
e6ef6cc6 1398 }
286de272
DB
1399 conn_clean(ct, conn, ctb);
1400 count++;
e6ef6cc6 1401 }
e6ef6cc6
DDP
1402 }
1403 }
e6ef6cc6
DDP
1404 return min_expiration;
1405}
1406
1407/* Cleans up old connection entries from 'ct'. Returns the time when the
1408 * next expiration might happen. The return value might be smaller than
1409 * 'now', meaning that an internal limit has been reached, and some expired
1410 * connections have not been deleted. */
1411static long long
1412conntrack_clean(struct conntrack *ct, long long now)
1413{
1414 long long next_wakeup = now + CT_TM_MIN;
1415 unsigned int n_conn_limit;
1416 size_t clean_count = 0;
e6ef6cc6
DDP
1417
1418 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
1419
dec0dbbc 1420 for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
e6ef6cc6
DDP
1421 struct conntrack_bucket *ctb = &ct->buckets[i];
1422 size_t prev_count;
1423 long long min_exp;
1424
1425 ovs_mutex_lock(&ctb->cleanup_mutex);
1426 if (ctb->next_cleanup > now) {
1427 goto next_bucket;
1428 }
1429
1430 ct_lock_lock(&ctb->lock);
1431 prev_count = hmap_count(&ctb->connections);
1432 /* If the connections are well distributed among buckets, we want to
1433 * limit to 10% of the global limit equally split among buckets. If
1434 * the bucket is busier than the others, we limit to 10% of its
1435 * current size. */
1436 min_exp = sweep_bucket(ct, ctb, now,
1437 MAX(prev_count/10, n_conn_limit/(CONNTRACK_BUCKETS*10)));
1438 clean_count += prev_count - hmap_count(&ctb->connections);
1439
1440 if (min_exp > now) {
1441 /* We call hmap_shrink() only if sweep_bucket() managed to delete
1442 * every expired connection. */
1443 hmap_shrink(&ctb->connections);
1444 }
1445
1446 ct_lock_unlock(&ctb->lock);
1447
1448 ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);
1449
1450next_bucket:
1451 next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
1452 ovs_mutex_unlock(&ctb->cleanup_mutex);
1453 }
1454
1455 VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
1456 clean_count, time_msec() - now);
1457
1458 return next_wakeup;
1459}
1460
1461/* Cleanup:
e6ef6cc6
DDP
1462 *
1463 * We must call conntrack_clean() periodically. conntrack_clean() return
1464 * value gives a hint on when the next cleanup must be done (either because
1465 * there is an actual connection that expires, or because a new connection
1466 * might be created with the minimum timeout).
1467 *
1468 * The logic below has two goals:
1469 *
6c54734e
DDP
1470 * - We want to reduce the number of wakeups and batch connection cleanup
1471 * when the load is not very high. CT_CLEAN_INTERVAL ensures that if we
1472 * are coping with the current cleanup tasks, then we wait at least
1473 * 5 seconds to do further cleanup.
e6ef6cc6 1474 *
6c54734e
DDP
1475 * - We don't want to keep the buckets locked too long, as we might prevent
1476 * traffic from flowing. CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
1477 * behind, there is at least some 200ms blocks of time when buckets will be
1478 * left alone, so the datapath can operate unhindered.
e6ef6cc6
DDP
1479 */
1480#define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
1481#define CT_CLEAN_MIN_INTERVAL 200 /* 0.2 seconds */
1482
1483static void *
1484clean_thread_main(void *f_)
1485{
1486 struct conntrack *ct = f_;
1487
1488 while (!latch_is_set(&ct->clean_thread_exit)) {
1489 long long next_wake;
1490 long long now = time_msec();
e6ef6cc6
DDP
1491 next_wake = conntrack_clean(ct, now);
1492
1493 if (next_wake < now) {
1494 poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
1495 } else {
1496 poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
1497 }
1498 latch_wait(&ct->clean_thread_exit);
1499 poll_block();
1500 }
1501
1502 return NULL;
1503}
1504\f
e917d3ee
DB
1505/* 'Data' is a pointer to the beginning of the L3 header and 'new_data' is
1506 * used to store a pointer to the first byte after the L3 header. 'Size' is
1507 * the size of the packet beyond the data pointer. */
a489b168
DDP
1508static inline bool
1509extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
1510 const char **new_data, bool validate_checksum)
1511{
e917d3ee
DB
1512 if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
1513 return false;
a489b168
DDP
1514 }
1515
dec0dbbc
DB
1516 const struct ip_header *ip = data;
1517 size_t ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
a489b168 1518
e917d3ee
DB
1519 if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
1520 return false;
1521 }
a489b168 1522
e917d3ee
DB
1523 if (OVS_UNLIKELY(size < ip_len)) {
1524 return false;
1525 }
a489b168 1526
e917d3ee
DB
1527 if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
1528 return false;
a489b168
DDP
1529 }
1530
1531 if (validate_checksum && csum(data, ip_len) != 0) {
1532 return false;
1533 }
1534
e917d3ee
DB
1535 if (new_data) {
1536 *new_data = (char *) data + ip_len;
1537 }
1538
a489b168
DDP
1539 key->src.addr.ipv4 = ip->ip_src;
1540 key->dst.addr.ipv4 = ip->ip_dst;
1541 key->nw_proto = ip->ip_proto;
1542
1543 return true;
1544}
1545
e917d3ee
DB
1546/* 'Data' is a pointer to the beginning of the L3 header and 'new_data' is
1547 * used to store a pointer to the first byte after the L3 header. 'Size' is
1548 * the size of the packet beyond the data pointer. */
a489b168
DDP
1549static inline bool
1550extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
1551 const char **new_data)
1552{
1553 const struct ovs_16aligned_ip6_hdr *ip6 = data;
286de272 1554
e917d3ee
DB
1555 if (OVS_UNLIKELY(size < sizeof *ip6)) {
1556 return false;
a489b168
DDP
1557 }
1558
1559 data = ip6 + 1;
1560 size -= sizeof *ip6;
dec0dbbc
DB
1561 uint8_t nw_proto = ip6->ip6_nxt;
1562 uint8_t nw_frag = 0;
a489b168
DDP
1563
1564 if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
1565 return false;
1566 }
1567
a489b168
DDP
1568 if (nw_frag) {
1569 return false;
1570 }
1571
c8b1ad49
DB
1572 if (new_data) {
1573 *new_data = data;
1574 }
1575
a489b168
DDP
1576 key->src.addr.ipv6 = ip6->ip6_src;
1577 key->dst.addr.ipv6 = ip6->ip6_dst;
1578 key->nw_proto = nw_proto;
1579
1580 return true;
1581}
1582
1583static inline bool
1584checksum_valid(const struct conn_key *key, const void *data, size_t size,
1585 const void *l3)
1586{
1587 uint32_t csum = 0;
1588
1589 if (key->dl_type == htons(ETH_TYPE_IP)) {
1590 csum = packet_csum_pseudoheader(l3);
1591 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
1592 csum = packet_csum_pseudoheader6(l3);
1593 } else {
1594 return false;
1595 }
1596
1597 csum = csum_continue(csum, data, size);
1598
1599 return csum_finish(csum) == 0;
1600}
1601
1602static inline bool
1603check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
324459a3 1604 const void *l3, bool validate_checksum)
a489b168
DDP
1605{
1606 const struct tcp_header *tcp = data;
40225b0c
BP
1607 if (size < sizeof *tcp) {
1608 return false;
1609 }
a489b168 1610
40225b0c 1611 size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
a489b168
DDP
1612 if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
1613 return false;
1614 }
1615
324459a3 1616 return validate_checksum ? checksum_valid(key, data, size, l3) : true;
a489b168
DDP
1617}
1618
1619static inline bool
1620check_l4_udp(const struct conn_key *key, const void *data, size_t size,
324459a3 1621 const void *l3, bool validate_checksum)
a489b168
DDP
1622{
1623 const struct udp_header *udp = data;
40225b0c
BP
1624 if (size < sizeof *udp) {
1625 return false;
1626 }
a489b168 1627
40225b0c 1628 size_t udp_len = ntohs(udp->udp_len);
a489b168
DDP
1629 if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
1630 return false;
1631 }
1632
1633 /* Validation must be skipped if checksum is 0 on IPv4 packets */
1634 return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
324459a3 1635 || (validate_checksum ? checksum_valid(key, data, size, l3) : true);
a489b168
DDP
1636}
1637
/* Returns true if 'validate_checksum' is false or the checksum computed
 * over the 'size' bytes of ICMP data at 'data' is zero. */
static inline bool
check_l4_icmp(const void *data, size_t size, bool validate_checksum)
{
    return !validate_checksum || csum(data, size) == 0;
}
1643
/* Returns true if 'validate_checksum' is false or checksum_valid() accepts
 * the ICMPv6 data at 'data'. */
static inline bool
check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
               const void *l3, bool validate_checksum)
{
    return !validate_checksum || checksum_valid(key, data, size, l3);
}
1650
1651static inline bool
1652extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
1653{
a489b168
DDP
1654 if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
1655 return false;
1656 }
1657
dec0dbbc 1658 const struct tcp_header *tcp = data;
a489b168
DDP
1659 key->src.port = tcp->tcp_src;
1660 key->dst.port = tcp->tcp_dst;
1661
1662 /* Port 0 is invalid */
1663 return key->src.port && key->dst.port;
1664}
1665
1666static inline bool
1667extract_l4_udp(struct conn_key *key, const void *data, size_t size)
1668{
a489b168
DDP
1669 if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
1670 return false;
1671 }
1672
dec0dbbc 1673 const struct udp_header *udp = data;
a489b168
DDP
1674 key->src.port = udp->udp_src;
1675 key->dst.port = udp->udp_dst;
1676
1677 /* Port 0 is invalid */
1678 return key->src.port && key->dst.port;
1679}
1680
1681static inline bool extract_l4(struct conn_key *key, const void *data,
324459a3
SC
1682 size_t size, bool *related, const void *l3,
1683 bool validate_checksum);
a489b168 1684
b269a122
DDP
1685static uint8_t
1686reverse_icmp_type(uint8_t type)
1687{
1688 switch (type) {
1689 case ICMP4_ECHO_REQUEST:
1690 return ICMP4_ECHO_REPLY;
1691 case ICMP4_ECHO_REPLY:
1692 return ICMP4_ECHO_REQUEST;
1693
1694 case ICMP4_TIMESTAMP:
1695 return ICMP4_TIMESTAMPREPLY;
1696 case ICMP4_TIMESTAMPREPLY:
1697 return ICMP4_TIMESTAMP;
1698
1699 case ICMP4_INFOREQUEST:
1700 return ICMP4_INFOREPLY;
1701 case ICMP4_INFOREPLY:
1702 return ICMP4_INFOREQUEST;
1703 default:
1704 OVS_NOT_REACHED();
1705 }
1706}
1707
a489b168
DDP
1708/* If 'related' is not NULL and the function is processing an ICMP
1709 * error packet, extract the l3 and l4 fields from the nested header
1710 * instead and set *related to true. If 'related' is NULL we're
1711 * already processing a nested header and no such recursion is
1712 * possible */
1713static inline int
1714extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
1715 bool *related)
1716{
a489b168
DDP
1717 if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
1718 return false;
1719 }
1720
dec0dbbc
DB
1721 const struct icmp_header *icmp = data;
1722
a489b168
DDP
1723 switch (icmp->icmp_type) {
1724 case ICMP4_ECHO_REQUEST:
1725 case ICMP4_ECHO_REPLY:
1726 case ICMP4_TIMESTAMP:
1727 case ICMP4_TIMESTAMPREPLY:
1728 case ICMP4_INFOREQUEST:
1729 case ICMP4_INFOREPLY:
b269a122
DDP
1730 if (icmp->icmp_code != 0) {
1731 return false;
1732 }
a489b168 1733 /* Separate ICMP connection: identified using id */
b269a122
DDP
1734 key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
1735 key->src.icmp_type = icmp->icmp_type;
1736 key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
a489b168
DDP
1737 break;
1738 case ICMP4_DST_UNREACH:
1739 case ICMP4_TIME_EXCEEDED:
1740 case ICMP4_PARAM_PROB:
1741 case ICMP4_SOURCEQUENCH:
1742 case ICMP4_REDIRECT: {
1743 /* ICMP packet part of another connection. We should
1744 * extract the key from embedded packet header */
1745 struct conn_key inner_key;
1746 const char *l3 = (const char *) (icmp + 1);
1747 const char *tail = (const char *) data + size;
1748 const char *l4;
a489b168
DDP
1749
1750 if (!related) {
1751 return false;
1752 }
1753
1754 memset(&inner_key, 0, sizeof inner_key);
1755 inner_key.dl_type = htons(ETH_TYPE_IP);
dec0dbbc 1756 bool ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
a489b168
DDP
1757 if (!ok) {
1758 return false;
1759 }
1760
a81da080 1761 if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned) {
a489b168
DDP
1762 return false;
1763 }
1764
1765 key->src = inner_key.src;
1766 key->dst = inner_key.dst;
1767 key->nw_proto = inner_key.nw_proto;
1768
324459a3 1769 ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
a489b168
DDP
1770 if (ok) {
1771 conn_key_reverse(key);
1772 *related = true;
1773 }
1774 return ok;
1775 }
1776 default:
1777 return false;
1778 }
1779
1780 return true;
1781}
1782
b269a122
DDP
/* Returns the ICMPv6 echo type opposite to 'type': request for reply and
 * reply for request.  Any other 'type' must not be passed. */
static uint8_t
reverse_icmp6_type(uint8_t type)
{
    if (type == ICMP6_ECHO_REQUEST) {
        return ICMP6_ECHO_REPLY;
    }
    if (type == ICMP6_ECHO_REPLY) {
        return ICMP6_ECHO_REQUEST;
    }

    OVS_NOT_REACHED();
}
1795
a489b168
DDP
1796/* If 'related' is not NULL and the function is processing an ICMP
1797 * error packet, extract the l3 and l4 fields from the nested header
1798 * instead and set *related to true. If 'related' is NULL we're
1799 * already processing a nested header and no such recursion is
1800 * possible */
1801static inline bool
1802extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
1803 bool *related)
1804{
1805 const struct icmp6_header *icmp6 = data;
1806
1807 /* All the messages that we support need at least 4 bytes after
1808 * the header */
1809 if (size < sizeof *icmp6 + 4) {
1810 return false;
1811 }
1812
1813 switch (icmp6->icmp6_type) {
1814 case ICMP6_ECHO_REQUEST:
1815 case ICMP6_ECHO_REPLY:
b269a122
DDP
1816 if (icmp6->icmp6_code != 0) {
1817 return false;
1818 }
a489b168 1819 /* Separate ICMP connection: identified using id */
b269a122
DDP
1820 key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
1821 key->src.icmp_type = icmp6->icmp6_type;
1822 key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
a489b168
DDP
1823 break;
1824 case ICMP6_DST_UNREACH:
1825 case ICMP6_PACKET_TOO_BIG:
1826 case ICMP6_TIME_EXCEEDED:
1827 case ICMP6_PARAM_PROB: {
1828 /* ICMP packet part of another connection. We should
1829 * extract the key from embedded packet header */
1830 struct conn_key inner_key;
1831 const char *l3 = (const char *) icmp6 + 8;
1832 const char *tail = (const char *) data + size;
1833 const char *l4 = NULL;
a489b168
DDP
1834
1835 if (!related) {
1836 return false;
1837 }
1838
1839 memset(&inner_key, 0, sizeof inner_key);
1840 inner_key.dl_type = htons(ETH_TYPE_IPV6);
dec0dbbc 1841 bool ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
a489b168
DDP
1842 if (!ok) {
1843 return false;
1844 }
1845
1846 /* pf doesn't do this, but it seems a good idea */
1847 if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
a81da080 1848 &key->dst.addr.ipv6_aligned)) {
a489b168
DDP
1849 return false;
1850 }
1851
1852 key->src = inner_key.src;
1853 key->dst = inner_key.dst;
1854 key->nw_proto = inner_key.nw_proto;
1855
324459a3 1856 ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
a489b168
DDP
1857 if (ok) {
1858 conn_key_reverse(key);
1859 *related = true;
1860 }
1861 return ok;
1862 }
1863 default:
1864 return false;
1865 }
1866
1867 return true;
1868}
1869
1870/* Extract l4 fields into 'key', which must already contain valid l3
1871 * members.
1872 *
1873 * If 'related' is not NULL and an ICMP error packet is being
1874 * processed, the function will extract the key from the packet nested
1401f6de 1875 * in the ICMP payload and set '*related' to true.
a489b168
DDP
1876 *
1877 * If 'related' is NULL, it means that we're already parsing a header nested
1878 * in an ICMP error. In this case, we skip checksum and length validation. */
1879static inline bool
1880extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
324459a3 1881 const void *l3, bool validate_checksum)
a489b168
DDP
1882{
1883 if (key->nw_proto == IPPROTO_TCP) {
324459a3
SC
1884 return (!related || check_l4_tcp(key, data, size, l3,
1885 validate_checksum)) && extract_l4_tcp(key, data, size);
a489b168 1886 } else if (key->nw_proto == IPPROTO_UDP) {
324459a3
SC
1887 return (!related || check_l4_udp(key, data, size, l3,
1888 validate_checksum)) && extract_l4_udp(key, data, size);
a489b168
DDP
1889 } else if (key->dl_type == htons(ETH_TYPE_IP)
1890 && key->nw_proto == IPPROTO_ICMP) {
324459a3 1891 return (!related || check_l4_icmp(data, size, validate_checksum))
a489b168
DDP
1892 && extract_l4_icmp(key, data, size, related);
1893 } else if (key->dl_type == htons(ETH_TYPE_IPV6)
1894 && key->nw_proto == IPPROTO_ICMPV6) {
324459a3
SC
1895 return (!related || check_l4_icmp6(key, data, size, l3,
1896 validate_checksum)) && extract_l4_icmp6(key, data, size,
1897 related);
a489b168
DDP
1898 } else {
1899 return false;
1900 }
1901}
1902
1903static bool
66e4ad8a 1904conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
a489b168
DDP
1905 struct conn_lookup_ctx *ctx, uint16_t zone)
1906{
2482b0b0 1907 const struct eth_header *l2 = dp_packet_eth(pkt);
a489b168
DDP
1908 const struct ip_header *l3 = dp_packet_l3(pkt);
1909 const char *l4 = dp_packet_l4(pkt);
a489b168
DDP
1910
1911 memset(ctx, 0, sizeof *ctx);
1912
1913 if (!l2 || !l3 || !l4) {
1914 return false;
1915 }
1916
1917 ctx->key.zone = zone;
1918
1919 /* XXX In this function we parse the packet (again, it has already
1920 * gone through miniflow_extract()) for two reasons:
1921 *
1922 * 1) To extract the l3 addresses and l4 ports.
1923 * We already have the l3 and l4 headers' pointers. Extracting
1924 * the l3 addresses and the l4 ports is really cheap, since they
1925 * can be found at fixed locations.
66e4ad8a
DDP
1926 * 2) To extract the l4 type.
1927 * Extracting the l4 types, for IPv6 can be quite expensive, because
1928 * it's not at a fixed location.
a489b168
DDP
1929 *
1930 * Here's a way to avoid (2) with the help of the datapath.
66e4ad8a 1931 * The datapath doesn't keep the packet's extracted flow[1], so
a489b168 1932 * using that is not an option. We could use the packet's matching
66e4ad8a
DDP
1933 * megaflow, but we have to make sure that the l4 type (nw_proto)
1934 * is unwildcarded. This means either:
a489b168 1935 *
66e4ad8a
DDP
1936 * a) dpif-netdev unwildcards the l4 type when a new flow is installed
1937 * if the actions contains ct().
a489b168 1938 *
66e4ad8a
DDP
1939 * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
1940 * action. This is already done in different actions, but it's
1941 * unnecessary for the kernel.
a489b168
DDP
1942 *
1943 * ---
66e4ad8a 1944 * [1] The reasons for this are that keeping the flow increases
a489b168
DDP
1945 * (slightly) the cache footprint and increases computation
1946 * time as we move the packet around. Most importantly, the flow
1947 * should be updated by the actions and this can be slow, as
1948 * we use a sparse representation (miniflow).
1949 *
1950 */
dec0dbbc
DB
1951 const char *tail = dp_packet_tail(pkt);
1952 bool ok;
66e4ad8a 1953 ctx->key.dl_type = dl_type;
dec0dbbc 1954
a489b168 1955 if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
dec0dbbc 1956 bool hwol_bad_l3_csum = dp_packet_ip_checksum_bad(pkt);
324459a3
SC
1957 if (hwol_bad_l3_csum) {
1958 ok = false;
1959 } else {
dec0dbbc 1960 bool hwol_good_l3_csum = dp_packet_ip_checksum_valid(pkt);
324459a3
SC
1961 /* Validate the checksum only when hwol is not supported. */
1962 ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL,
1963 !hwol_good_l3_csum);
1964 }
a489b168
DDP
1965 } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
1966 ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
1967 } else {
1968 ok = false;
1969 }
1970
1971 if (ok) {
324459a3
SC
1972 bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt);
1973 if (!hwol_bad_l4_csum) {
1974 bool hwol_good_l4_csum = dp_packet_l4_checksum_valid(pkt);
1975 /* Validate the checksum only when hwol is not supported. */
1976 if (extract_l4(&ctx->key, l4, tail - l4, &ctx->icmp_related, l3,
1977 !hwol_good_l4_csum)) {
1978 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
1979 return true;
1980 }
a489b168
DDP
1981 }
1982 }
1983
1984 return false;
1985}
92edd073
DB
1986
1987static uint32_t
1988ct_addr_hash_add(uint32_t hash, const struct ct_addr *addr)
1989{
1990 BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
1991 return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
1992}
1993
1994static uint32_t
1995ct_endpoint_hash_add(uint32_t hash, const struct ct_endpoint *ep)
1996{
1997 BUILD_ASSERT_DECL(sizeof *ep % 4 == 0);
1998 return hash_add_bytes32(hash, (const uint32_t *) ep, sizeof *ep);
1999}
a489b168
DDP
2000\f
2001/* Symmetric */
2002static uint32_t
2003conn_key_hash(const struct conn_key *key, uint32_t basis)
2004{
2005 uint32_t hsrc, hdst, hash;
a489b168 2006 hsrc = hdst = basis;
6b1d4625
DB
2007 hsrc = ct_endpoint_hash_add(hsrc, &key->src);
2008 hdst = ct_endpoint_hash_add(hdst, &key->dst);
a489b168
DDP
2009
2010 /* Even if source and destination are swapped the hash will be the same. */
2011 hash = hsrc ^ hdst;
2012
2013 /* Hash the rest of the key(L3 and L4 types and zone). */
853cca3f 2014 hash = hash_words((uint32_t *) (&key->dst + 1),
a489b168
DDP
2015 (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
2016 hash);
2017
6b1d4625 2018 return hash_finish(hash, 0);
a489b168
DDP
2019}
2020
2021static void
2022conn_key_reverse(struct conn_key *key)
2023{
dec0dbbc 2024 struct ct_endpoint tmp = key->src;
a489b168
DDP
2025 key->src = key->dst;
2026 key->dst = tmp;
2027}
2028
286de272
DB
2029static uint32_t
2030nat_ipv6_addrs_delta(struct in6_addr *ipv6_aligned_min,
2031 struct in6_addr *ipv6_aligned_max)
2032{
2033 uint8_t *ipv6_min_hi = &ipv6_aligned_min->s6_addr[0];
2034 uint8_t *ipv6_min_lo = &ipv6_aligned_min->s6_addr[0] + sizeof(uint64_t);
2035 uint8_t *ipv6_max_hi = &ipv6_aligned_max->s6_addr[0];
2036 uint8_t *ipv6_max_lo = &ipv6_aligned_max->s6_addr[0] + sizeof(uint64_t);
2037
2038 ovs_be64 addr6_64_min_hi;
2039 ovs_be64 addr6_64_min_lo;
2040 memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
2041 memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);
2042
2043 ovs_be64 addr6_64_max_hi;
2044 ovs_be64 addr6_64_max_lo;
2045 memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
2046 memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);
2047
2048 uint64_t diff;
dec0dbbc 2049
286de272
DB
2050 if (addr6_64_min_hi == addr6_64_max_hi &&
2051 ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
2052 diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
2053 } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
2054 ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
2055 diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
2056 ntohll(addr6_64_max_lo) - 1);
2057 } else {
2058 /* Limit address delta supported to 32 bits or 4 billion approximately.
2059 * Possibly, this should be visible to the user through a datapath
2060 * support check, however the practical impact is probably nil. */
2061 diff = 0xfffffffe;
2062 }
dec0dbbc 2063
286de272
DB
2064 if (diff > 0xfffffffe) {
2065 diff = 0xfffffffe;
2066 }
2067 return diff;
2068}
2069
2070/* This function must be used in tandem with nat_ipv6_addrs_delta(), which
2071 * restricts the input parameters. */
a489b168 2072static void
286de272
DB
2073nat_ipv6_addr_increment(struct in6_addr *ipv6_aligned, uint32_t increment)
2074{
2075 uint8_t *ipv6_hi = &ipv6_aligned->s6_addr[0];
2076 uint8_t *ipv6_lo = &ipv6_aligned->s6_addr[0] + sizeof(ovs_be64);
2077 ovs_be64 addr6_64_hi;
2078 ovs_be64 addr6_64_lo;
2079 memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
2080 memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);
2081
2082 if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
2083 addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
2084 } else if (addr6_64_hi != OVS_BE64_MAX) {
2085 addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
2086 addr6_64_lo = htonll(increment - (UINT64_MAX -
2087 ntohll(addr6_64_lo) + 1));
2088 } else {
2089 OVS_NOT_REACHED();
2090 }
2091
2092 memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
2093 memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
286de272
DB
2094}
2095
2096static uint32_t
2097nat_range_hash(const struct conn *conn, uint32_t basis)
2098{
2099 uint32_t hash = basis;
286de272 2100
92edd073
DB
2101 hash = ct_addr_hash_add(hash, &conn->nat_info->min_addr);
2102 hash = ct_addr_hash_add(hash, &conn->nat_info->max_addr);
2103 hash = hash_add(hash,
2104 (conn->nat_info->max_port << 16)
2105 | conn->nat_info->min_port);
92edd073
DB
2106 hash = ct_endpoint_hash_add(hash, &conn->key.src);
2107 hash = ct_endpoint_hash_add(hash, &conn->key.dst);
286de272
DB
2108 hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
2109 hash = hash_add(hash, conn->key.nw_proto);
2110 hash = hash_add(hash, conn->key.zone);
92edd073
DB
2111
2112 /* The purpose of the second parameter is to distinguish hashes of data of
2113 * different length; our data always has the same length so there is no
2114 * value in counting. */
2115 return hash_finish(hash, 0);
286de272
DB
2116}
2117
2118static bool
2119nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
2120 struct conn *nat_conn)
2121{
bd5e81a0
DB
2122 enum { MIN_NAT_EPHEMERAL_PORT = 1024,
2123 MAX_NAT_EPHEMERAL_PORT = 65535 };
286de272
DB
2124
2125 uint16_t min_port;
2126 uint16_t max_port;
2127 uint16_t first_port;
286de272
DB
2128 uint32_t hash = nat_range_hash(conn, ct->hash_basis);
2129
2130 if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
2131 (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
2132 min_port = ntohs(conn->key.src.port);
2133 max_port = ntohs(conn->key.src.port);
2134 first_port = min_port;
2135 } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
2136 (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
2137 min_port = ntohs(conn->key.dst.port);
2138 max_port = ntohs(conn->key.dst.port);
2139 first_port = min_port;
2140 } else {
2141 uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
2142 uint32_t port_index = hash % (deltap + 1);
2143 first_port = conn->nat_info->min_port + port_index;
2144 min_port = conn->nat_info->min_port;
2145 max_port = conn->nat_info->max_port;
2146 }
2147
2148 uint32_t deltaa = 0;
2149 uint32_t address_index;
2150 struct ct_addr ct_addr;
2151 memset(&ct_addr, 0, sizeof ct_addr);
2152 struct ct_addr max_ct_addr;
2153 memset(&max_ct_addr, 0, sizeof max_ct_addr);
2154 max_ct_addr = conn->nat_info->max_addr;
2155
2156 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
2157 deltaa = ntohl(conn->nat_info->max_addr.ipv4_aligned) -
2158 ntohl(conn->nat_info->min_addr.ipv4_aligned);
2159 address_index = hash % (deltaa + 1);
2160 ct_addr.ipv4_aligned = htonl(
2161 ntohl(conn->nat_info->min_addr.ipv4_aligned) + address_index);
2162 } else {
2163 deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6_aligned,
2164 &conn->nat_info->max_addr.ipv6_aligned);
2165 /* deltaa must be within 32 bits for full hash coverage. A 64 or
2166 * 128 bit hash is unnecessary and hence not used here. Most code
2167 * is kept common with V4; nat_ipv6_addrs_delta() will do the
2168 * enforcement via max_ct_addr. */
2169 max_ct_addr = conn->nat_info->min_addr;
2170 nat_ipv6_addr_increment(&max_ct_addr.ipv6_aligned, deltaa);
286de272
DB
2171 address_index = hash % (deltaa + 1);
2172 ct_addr.ipv6_aligned = conn->nat_info->min_addr.ipv6_aligned;
2173 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, address_index);
2174 }
2175
2176 uint16_t port = first_port;
2177 bool all_ports_tried = false;
ac04639a
DB
2178 /* For DNAT, we don't use ephemeral ports. */
2179 bool ephemeral_ports_tried = conn->nat_info->nat_action & NAT_ACTION_DST
2180 ? true : false;
286de272 2181 struct ct_addr first_addr = ct_addr;
286de272
DB
2182
2183 while (true) {
2184 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
2185 nat_conn->rev_key.dst.addr = ct_addr;
2186 } else {
2187 nat_conn->rev_key.src.addr = ct_addr;
2188 }
2189
2190 if ((conn->key.nw_proto == IPPROTO_ICMP) ||
2191 (conn->key.nw_proto == IPPROTO_ICMPV6)) {
2192 all_ports_tried = true;
2193 } else if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
2194 nat_conn->rev_key.dst.port = htons(port);
2195 } else {
2196 nat_conn->rev_key.src.port = htons(port);
2197 }
2198
80cee116
DB
2199 bool new_insert = nat_conn_keys_insert(&ct->nat_conn_keys, nat_conn,
2200 ct->hash_basis);
2201 if (new_insert) {
286de272
DB
2202 return true;
2203 } else if (!all_ports_tried) {
2204 if (min_port == max_port) {
2205 all_ports_tried = true;
2206 } else if (port == max_port) {
2207 port = min_port;
2208 } else {
2209 port++;
2210 }
2211 if (port == first_port) {
2212 all_ports_tried = true;
2213 }
2214 } else {
2215 if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
2216 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
2217 ct_addr.ipv4_aligned = htonl(
2218 ntohl(ct_addr.ipv4_aligned) + 1);
2219 } else {
2220 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, 1);
2221 }
2222 } else {
2223 ct_addr = conn->nat_info->min_addr;
2224 }
2225 if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
ac04639a
DB
2226 if (!ephemeral_ports_tried) {
2227 ephemeral_ports_tried = true;
286de272 2228 ct_addr = conn->nat_info->min_addr;
8417e688 2229 first_addr = ct_addr;
286de272
DB
2230 min_port = MIN_NAT_EPHEMERAL_PORT;
2231 max_port = MAX_NAT_EPHEMERAL_PORT;
2232 } else {
2233 break;
2234 }
2235 }
2236 first_port = min_port;
2237 port = first_port;
2238 all_ports_tried = false;
2239 }
2240 }
2241 return false;
2242}
2243
ac6abe5f 2244/* This function must be called with the ct->resources lock taken. */
286de272
DB
2245static struct nat_conn_key_node *
2246nat_conn_keys_lookup(struct hmap *nat_conn_keys,
2247 const struct conn_key *key,
2248 uint32_t basis)
2249{
2250 struct nat_conn_key_node *nat_conn_key_node;
286de272 2251
dec0dbbc
DB
2252 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node,
2253 conn_key_hash(key, basis), nat_conn_keys) {
5ed7a0b4 2254 if (!conn_key_cmp(&nat_conn_key_node->key, key)) {
286de272
DB
2255 return nat_conn_key_node;
2256 }
2257 }
2258 return NULL;
2259}
2260
80cee116
DB
2261/* This function must be called with the ct->resources lock taken. */
2262static bool
2263nat_conn_keys_insert(struct hmap *nat_conn_keys, const struct conn *nat_conn,
2264 uint32_t basis)
2265{
2266 struct nat_conn_key_node *nat_conn_key_node =
2267 nat_conn_keys_lookup(nat_conn_keys, &nat_conn->rev_key, basis);
2268
2269 if (!nat_conn_key_node) {
2270 struct nat_conn_key_node *nat_conn_key = xzalloc(sizeof *nat_conn_key);
2271 nat_conn_key->key = nat_conn->rev_key;
2272 nat_conn_key->value = nat_conn->key;
dec0dbbc
DB
2273 hmap_insert(nat_conn_keys, &nat_conn_key->node,
2274 conn_key_hash(&nat_conn_key->key, basis));
80cee116
DB
2275 return true;
2276 }
2277 return false;
2278}
2279
ac6abe5f 2280/* This function must be called with the ct->resources write lock taken. */
286de272 2281static void
bd5e81a0
DB
2282nat_conn_keys_remove(struct hmap *nat_conn_keys,
2283 const struct conn_key *key,
286de272
DB
2284 uint32_t basis)
2285{
2286 struct nat_conn_key_node *nat_conn_key_node;
286de272 2287
dec0dbbc
DB
2288 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node,
2289 conn_key_hash(key, basis), nat_conn_keys) {
5ed7a0b4 2290 if (!conn_key_cmp(&nat_conn_key_node->key, key)) {
286de272
DB
2291 hmap_remove(nat_conn_keys, &nat_conn_key_node->node);
2292 free(nat_conn_key_node);
2293 return;
2294 }
2295 }
2296}
2297
2298static void
2299conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
a489b168 2300 long long now)
ac6abe5f 2301 OVS_REQUIRES(ctb->lock)
a489b168
DDP
2302{
2303 uint32_t hash = ctx->hash;
2304 struct conn *conn;
2305
2306 ctx->conn = NULL;
2307
2308 HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
5ed7a0b4 2309 if (!conn_key_cmp(&conn->key, &ctx->key)
a489b168
DDP
2310 && !conn_expired(conn, now)) {
2311 ctx->conn = conn;
2312 ctx->reply = false;
2313 break;
2314 }
5ed7a0b4 2315 if (!conn_key_cmp(&conn->rev_key, &ctx->key)
a489b168
DDP
2316 && !conn_expired(conn, now)) {
2317 ctx->conn = conn;
2318 ctx->reply = true;
2319 break;
2320 }
2321 }
2322}
2323
2324static enum ct_update_res
e6ef6cc6
DDP
2325conn_update(struct conn *conn, struct conntrack_bucket *ctb,
2326 struct dp_packet *pkt, bool reply, long long now)
a489b168 2327{
e6ef6cc6
DDP
2328 return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
2329 reply, now);
a489b168
DDP
2330}
2331
2332static bool
2333conn_expired(struct conn *conn, long long now)
2334{
286de272
DB
2335 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
2336 return now >= conn->expiration;
2337 }
2338 return false;
a489b168
DDP
2339}
2340
2341static bool
2342valid_new(struct dp_packet *pkt, struct conn_key *key)
2343{
2344 return l4_protos[key->nw_proto]->valid_new(pkt);
2345}
2346
2347static struct conn *
e6ef6cc6
DDP
2348new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
2349 struct conn_key *key, long long now)
a489b168 2350{
dec0dbbc 2351 struct conn *newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
a489b168
DDP
2352 if (newconn) {
2353 newconn->key = *key;
2354 }
2355
2356 return newconn;
2357}
2358
2359static void
2360delete_conn(struct conn *conn)
2361{
286de272 2362 free(conn->nat_info);
bd5e81a0 2363 free(conn->alg);
a489b168
DDP
2364 free(conn);
2365}
4d4e68ed 2366\f
271e48a0
YHW
2367/* Convert a conntrack address 'a' into an IP address 'b' based on 'dl_type'.
2368 *
2369 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPv6"
2370 * in network-byte order. */
4d4e68ed
DDP
2371static void
2372ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
2373 union ct_dpif_inet_addr *b,
2374 ovs_be16 dl_type)
2375{
2376 if (dl_type == htons(ETH_TYPE_IP)) {
2377 b->ip = a->ipv4_aligned;
2378 } else if (dl_type == htons(ETH_TYPE_IPV6)){
2379 b->in6 = a->ipv6_aligned;
2380 }
2381}
2382
271e48a0
YHW
2383/* Convert an IP address 'a' into a conntrack address 'b' based on 'dl_type'.
2384 *
2385 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPv6"
2386 * in network-byte order. */
2387static void
2388ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
2389 struct ct_addr *b,
2390 ovs_be16 dl_type)
2391{
2392 if (dl_type == htons(ETH_TYPE_IP)) {
2393 b->ipv4_aligned = a->ip;
2394 } else if (dl_type == htons(ETH_TYPE_IPV6)){
2395 b->ipv6_aligned = a->in6;
2396 }
2397}
2398
4d4e68ed
DDP
2399static void
2400conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
2401{
2402 if (key->dl_type == htons(ETH_TYPE_IP)) {
2403 tuple->l3_type = AF_INET;
2404 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
2405 tuple->l3_type = AF_INET6;
2406 }
2407 tuple->ip_proto = key->nw_proto;
2408 ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
2409 key->dl_type);
2410 ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
2411 key->dl_type);
2412
2413 if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
b269a122
DDP
2414 tuple->icmp_id = key->src.icmp_id;
2415 tuple->icmp_type = key->src.icmp_type;
2416 tuple->icmp_code = key->src.icmp_code;
4d4e68ed
DDP
2417 } else {
2418 tuple->src_port = key->src.port;
2419 tuple->dst_port = key->dst.port;
2420 }
2421}
2422
271e48a0
YHW
2423static void
2424tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone,
2425 struct conn_key *key)
2426{
2427 if (tuple->l3_type == AF_INET) {
2428 key->dl_type = htons(ETH_TYPE_IP);
2429 } else if (tuple->l3_type == AF_INET6) {
2430 key->dl_type = htons(ETH_TYPE_IPV6);
2431 }
2432 key->nw_proto = tuple->ip_proto;
2433 ct_dpif_inet_addr_to_ct_endpoint(&tuple->src, &key->src.addr,
2434 key->dl_type);
2435 ct_dpif_inet_addr_to_ct_endpoint(&tuple->dst, &key->dst.addr,
2436 key->dl_type);
2437
2438 if (tuple->ip_proto == IPPROTO_ICMP || tuple->ip_proto == IPPROTO_ICMPV6) {
2439 key->src.icmp_id = tuple->icmp_id;
2440 key->src.icmp_type = tuple->icmp_type;
2441 key->src.icmp_code = tuple->icmp_code;
2442 key->dst.icmp_id = tuple->icmp_id;
2443 key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type);
2444 key->dst.icmp_code = tuple->icmp_code;
2445 } else {
2446 key->src.port = tuple->src_port;
2447 key->dst.port = tuple->dst_port;
2448 }
2449 key->zone = zone;
2450}
2451
4d4e68ed
DDP
2452static void
2453conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
ded30c74 2454 long long now, int bkt)
4d4e68ed 2455{
4d4e68ed
DDP
2456 memset(entry, 0, sizeof *entry);
2457 conn_key_to_tuple(&conn->key, &entry->tuple_orig);
2458 conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
2459
2460 entry->zone = conn->key.zone;
2461 entry->mark = conn->mark;
2462
286de272 2463 memcpy(&entry->labels, &conn->label, sizeof entry->labels);
4d4e68ed
DDP
2464 /* Not implemented yet */
2465 entry->timestamp.start = 0;
2466 entry->timestamp.stop = 0;
2467
dec0dbbc 2468 long long expiration = conn->expiration - now;
4d4e68ed
DDP
2469 entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
2470
dec0dbbc 2471 struct ct_l4_proto *class = l4_protos[conn->key.nw_proto];
4d4e68ed
DDP
2472 if (class->conn_get_protoinfo) {
2473 class->conn_get_protoinfo(conn, &entry->protoinfo);
2474 }
bd5e81a0 2475
ded30c74 2476 entry->bkt = bkt;
bd5e81a0
DB
2477
2478 if (conn->alg) {
2479 /* Caller is responsible for freeing. */
2480 entry->helper.name = xstrdup(conn->alg);
2481 }
4d4e68ed
DDP
2482}
2483
2484int
2485conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
ded30c74 2486 const uint16_t *pzone, int *ptot_bkts)
4d4e68ed
DDP
2487{
2488 memset(dump, 0, sizeof(*dump));
dec0dbbc 2489
4d4e68ed
DDP
2490 if (pzone) {
2491 dump->zone = *pzone;
2492 dump->filter_zone = true;
2493 }
4d4e68ed 2494
dec0dbbc 2495 dump->ct = ct;
ded30c74 2496 *ptot_bkts = CONNTRACK_BUCKETS;
4d4e68ed
DDP
2497 return 0;
2498}
2499
2500int
2501conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
2502{
2503 struct conntrack *ct = dump->ct;
2504 long long now = time_msec();
2505
2506 while (dump->bucket < CONNTRACK_BUCKETS) {
2507 struct hmap_node *node;
2508
2509 ct_lock_lock(&ct->buckets[dump->bucket].lock);
2510 for (;;) {
2511 struct conn *conn;
2512
2513 node = hmap_at_position(&ct->buckets[dump->bucket].connections,
2514 &dump->bucket_pos);
2515 if (!node) {
2516 break;
2517 }
2518 INIT_CONTAINER(conn, node, node);
286de272
DB
2519 if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
2520 (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
ded30c74 2521 conn_to_ct_dpif_entry(conn, entry, now, dump->bucket);
4d4e68ed
DDP
2522 break;
2523 }
2524 /* Else continue, until we find an entry in the appropriate zone
2525 * or the bucket has been scanned completely. */
2526 }
2527 ct_lock_unlock(&ct->buckets[dump->bucket].lock);
2528
2529 if (!node) {
2530 memset(&dump->bucket_pos, 0, sizeof dump->bucket_pos);
2531 dump->bucket++;
2532 } else {
2533 return 0;
2534 }
2535 }
2536 return EOF;
2537}
2538
2539int
2540conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
2541{
2542 return 0;
2543}
5d9cbb4c
DDP
2544
2545int
2546conntrack_flush(struct conntrack *ct, const uint16_t *zone)
2547{
dec0dbbc 2548 for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
5d9cbb4c
DDP
2549 struct conn *conn, *next;
2550
2551 ct_lock_lock(&ct->buckets[i].lock);
bd5e81a0 2552 HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
286de272
DB
2553 if ((!zone || *zone == conn->key.zone) &&
2554 (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
2555 conn_clean(ct, conn, &ct->buckets[i]);
5d9cbb4c
DDP
2556 }
2557 }
2558 ct_lock_unlock(&ct->buckets[i].lock);
2559 }
bd5e81a0 2560
5d9cbb4c
DDP
2561 return 0;
2562}
bd5e81a0 2563
271e48a0
YHW
2564int
2565conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
2566 uint16_t zone)
2567{
2568 struct conn_lookup_ctx ctx;
2569 int error = 0;
2570
2571 memset(&ctx, 0, sizeof(ctx));
2572 tuple_to_conn_key(tuple, zone, &ctx.key);
2573 ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
2574 unsigned bucket = hash_to_bucket(ctx.hash);
2575
2576 ct_lock_lock(&ct->buckets[bucket].lock);
2577 conn_key_lookup(&ct->buckets[bucket], &ctx, time_msec());
a1d5eeff 2578 if (ctx.conn && ctx.conn->conn_type == CT_CONN_TYPE_DEFAULT) {
271e48a0
YHW
2579 conn_clean(ct, ctx.conn, &ct->buckets[bucket]);
2580 } else {
a1d5eeff 2581 VLOG_WARN("Must flush tuple using the original pre-NATed tuple");
271e48a0
YHW
2582 error = ENOENT;
2583 }
2584 ct_lock_unlock(&ct->buckets[bucket].lock);
2585 return error;
2586}
2587
c92339ad
DB
2588int
2589conntrack_set_maxconns(struct conntrack *ct, uint32_t maxconns)
2590{
2591 atomic_store_relaxed(&ct->n_conn_limit, maxconns);
2592 return 0;
2593}
2594
2595int
2596conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns)
2597{
2598 atomic_read_relaxed(&ct->n_conn_limit, maxconns);
2599 return 0;
2600}
2601
875075b3
DB
2602int
2603conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns)
2604{
2605 *nconns = atomic_count_get(&ct->n_conn);
2606 return 0;
2607}
2608
bd5e81a0
DB
2609/* This function must be called with the ct->resources read lock taken. */
2610static struct alg_exp_node *
be38342d
DB
2611expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
2612 uint32_t basis, bool src_ip_wc)
bd5e81a0
DB
2613{
2614 struct conn_key check_key = *key;
2615 check_key.src.port = ALG_WC_SRC_PORT;
dec0dbbc 2616
be38342d
DB
2617 if (src_ip_wc) {
2618 memset(&check_key.src.addr, 0, sizeof check_key.src.addr);
2619 }
dec0dbbc 2620
bd5e81a0
DB
2621 struct alg_exp_node *alg_exp_node;
2622
bd5e81a0 2623 HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
dec0dbbc 2624 conn_key_hash(&check_key, basis),
bd5e81a0
DB
2625 alg_expectations) {
2626 if (!conn_key_cmp(&alg_exp_node->key, &check_key)) {
2627 return alg_exp_node;
2628 }
2629 }
2630 return NULL;
2631}
2632
4417ca3d
DB
2633/* This function must be called with the ct->resources write lock taken. */
2634static void
2635expectation_remove(struct hmap *alg_expectations,
2636 const struct conn_key *key, uint32_t basis)
2637{
2638 struct alg_exp_node *alg_exp_node;
2639
2640 HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node, conn_key_hash(key, basis),
2641 alg_expectations) {
2642 if (!conn_key_cmp(&alg_exp_node->key, key)) {
2643 hmap_remove(alg_expectations, &alg_exp_node->node);
2644 break;
2645 }
2646 }
2647}
2648
2649/* This function must be called with the ct->resources read lock taken. */
2650static struct alg_exp_node *
2651expectation_ref_lookup_unique(const struct hindex *alg_expectation_refs,
2652 const struct conn_key *master_key,
2653 const struct conn_key *alg_exp_key,
2654 uint32_t basis)
2655{
2656 struct alg_exp_node *alg_exp_node;
2657
2658 HINDEX_FOR_EACH_WITH_HASH (alg_exp_node, node_ref,
2659 conn_key_hash(master_key, basis),
2660 alg_expectation_refs) {
2661 if (!conn_key_cmp(&alg_exp_node->master_key, master_key) &&
2662 !conn_key_cmp(&alg_exp_node->key, alg_exp_key)) {
2663 return alg_exp_node;
2664 }
2665 }
2666 return NULL;
2667}
2668
2669/* This function must be called with the ct->resources write lock taken. */
2670static void
2671expectation_ref_create(struct hindex *alg_expectation_refs,
2672 struct alg_exp_node *alg_exp_node,
2673 uint32_t basis)
2674{
2675 if (!expectation_ref_lookup_unique(alg_expectation_refs,
2676 &alg_exp_node->master_key,
2677 &alg_exp_node->key, basis)) {
2678 hindex_insert(alg_expectation_refs, &alg_exp_node->node_ref,
2679 conn_key_hash(&alg_exp_node->master_key, basis));
2680 }
2681}
2682
2683static void
2684expectation_clean(struct conntrack *ct, const struct conn_key *master_key,
2685 uint32_t basis)
2686{
2687 ct_rwlock_wrlock(&ct->resources_lock);
2688
2689 struct alg_exp_node *node, *next;
2690 HINDEX_FOR_EACH_WITH_HASH_SAFE (node, next, node_ref,
2691 conn_key_hash(master_key, basis),
2692 &ct->alg_expectation_refs) {
2693 if (!conn_key_cmp(&node->master_key, master_key)) {
2694 expectation_remove(&ct->alg_expectations, &node->key, basis);
2695 hindex_remove(&ct->alg_expectation_refs, &node->node_ref);
2696 free(node);
2697 }
2698 }
2699
2700 ct_rwlock_unlock(&ct->resources_lock);
2701}
2702
bd5e81a0 2703static void
be38342d
DB
2704expectation_create(struct conntrack *ct, ovs_be16 dst_port,
2705 const struct conn *master_conn, bool reply, bool src_ip_wc,
2706 bool skip_nat)
bd5e81a0
DB
2707{
2708 struct ct_addr src_addr;
2709 struct ct_addr dst_addr;
2710 struct ct_addr alg_nat_repl_addr;
be38342d 2711 struct alg_exp_node *alg_exp_node = xzalloc(sizeof *alg_exp_node);
bd5e81a0 2712
be38342d 2713 if (reply) {
bd5e81a0
DB
2714 src_addr = master_conn->key.src.addr;
2715 dst_addr = master_conn->key.dst.addr;
be38342d
DB
2716 if (skip_nat) {
2717 alg_nat_repl_addr = dst_addr;
2718 } else {
2719 alg_nat_repl_addr = master_conn->rev_key.dst.addr;
2720 }
2721 alg_exp_node->nat_rpl_dst = true;
2722 } else {
2723 src_addr = master_conn->rev_key.src.addr;
2724 dst_addr = master_conn->rev_key.dst.addr;
2725 if (skip_nat) {
2726 alg_nat_repl_addr = src_addr;
2727 } else {
2728 alg_nat_repl_addr = master_conn->key.src.addr;
2729 }
2730 alg_exp_node->nat_rpl_dst = false;
2731 }
2732 if (src_ip_wc) {
2733 memset(&src_addr, 0, sizeof src_addr);
bd5e81a0
DB
2734 }
2735
bd5e81a0
DB
2736 alg_exp_node->key.dl_type = master_conn->key.dl_type;
2737 alg_exp_node->key.nw_proto = master_conn->key.nw_proto;
2738 alg_exp_node->key.zone = master_conn->key.zone;
2739 alg_exp_node->key.src.addr = src_addr;
2740 alg_exp_node->key.dst.addr = dst_addr;
2741 alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
2742 alg_exp_node->key.dst.port = dst_port;
2743 alg_exp_node->master_mark = master_conn->mark;
2744 alg_exp_node->master_label = master_conn->label;
2745 alg_exp_node->master_key = master_conn->key;
bd5e81a0
DB
2746 /* Take the write lock here because it is almost 100%
2747 * likely that the lookup will fail and
2748 * expectation_create() will be called below. */
2749 ct_rwlock_wrlock(&ct->resources_lock);
2750 struct alg_exp_node *alg_exp = expectation_lookup(
be38342d 2751 &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis, src_ip_wc);
bd5e81a0
DB
2752 if (alg_exp) {
2753 free(alg_exp_node);
2754 ct_rwlock_unlock(&ct->resources_lock);
2755 return;
2756 }
2757
2758 alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
4417ca3d 2759 hmap_insert(&ct->alg_expectations, &alg_exp_node->node,
dec0dbbc 2760 conn_key_hash(&alg_exp_node->key, ct->hash_basis));
4417ca3d
DB
2761 expectation_ref_create(&ct->alg_expectation_refs, alg_exp_node,
2762 ct->hash_basis);
bd5e81a0
DB
2763 ct_rwlock_unlock(&ct->resources_lock);
2764}
2765
2766static uint8_t
2767get_v4_byte_be(ovs_be32 v4_addr, uint8_t index)
2768{
2769 uint8_t *byte_ptr = (OVS_FORCE uint8_t *) &v4_addr;
2770 return byte_ptr[index];
2771}
2772
/* Replaces, in place, the 'substr_size' bytes at 'substr' with the
 * 'rep_str_size' bytes at 'rep_str', shifting the data that follows the
 * substring so that it stays adjacent to the replacement.
 *
 * 'total_size' is the number of valid bytes starting at 'substr', i.e. the
 * substring itself plus the tail behind it.  If 'total_size' is smaller
 * than 'substr_size' the tail is treated as empty rather than letting the
 * subtraction wrap into an enormous move length.
 *
 * Sizes are 'size_t' so that lengths above 255 bytes are not truncated.
 * The caller must guarantee that the buffer has room for any growth
 * ('rep_str_size' - 'substr_size' bytes). */
static void
replace_substring(char *substr, size_t substr_size,
                  size_t total_size, const char *rep_str,
                  size_t rep_str_size)
{
    size_t tail_size = total_size > substr_size
                       ? total_size - substr_size
                       : 0;

    /* The source and destination of the tail may overlap. */
    memmove(substr + rep_str_size, substr + substr_size, tail_size);
    memcpy(substr, rep_str, rep_str_size);
}
2782
2783/* Replace IPV4 address in FTP message with NATed address. */
2784static int
2785repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
2786 char *ftp_data_start,
2787 size_t addr_offset_from_ftp_data_start)
2788{
2789 enum { MAX_FTP_V4_NAT_DELTA = 8 };
2790
2791 /* Do conservative check for pathological MTU usage. */
2792 uint32_t orig_used_size = dp_packet_size(pkt);
2793 uint16_t allocated_size = dp_packet_get_allocated(pkt);
2794 if (orig_used_size + MAX_FTP_V4_NAT_DELTA > allocated_size) {
2795 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
2796 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP",
2797 allocated_size);
2798 return 0;
2799 }
2800
2801 size_t remain_size = tcp_payload_length(pkt) -
2802 addr_offset_from_ftp_data_start;
bd5e81a0
DB
2803 int overall_delta = 0;
2804 char *byte_str = ftp_data_start + addr_offset_from_ftp_data_start;
2805
2806 /* Replace the existing IPv4 address by the new one. */
2807 for (uint8_t i = 0; i < 4; i++) {
2808 /* Find the end of the string for this octet. */
2809 char *next_delim = memchr(byte_str, ',', 4);
2810 ovs_assert(next_delim);
2811 int substr_size = next_delim - byte_str;
2812 remain_size -= substr_size;
2813
2814 /* Compose the new string for this octet, and replace it. */
2815 char rep_str[4];
2816 uint8_t rep_byte = get_v4_byte_be(v4_addr_rep, i);
2817 int replace_size = sprintf(rep_str, "%d", rep_byte);
2818 replace_substring(byte_str, substr_size, remain_size,
2819 rep_str, replace_size);
2820 overall_delta += replace_size - substr_size;
2821
2822 /* Advance past the octet and the following comma. */
2823 byte_str += replace_size + 1;
2824 }
2825
2826 dp_packet_set_size(pkt, orig_used_size + overall_delta);
2827 return overall_delta;
2828}
2829
/* Returns a pointer to the first decimal digit in the NUL-terminated string
 * 'str', or to its terminating NUL if it contains no digit.
 *
 * The character is converted to 'unsigned char' before being handed to
 * isdigit(): passing a negative 'char' value (any byte >= 0x80 where 'char'
 * is signed) is undefined behavior. */
static char *
skip_non_digits(char *str)
{
    while (*str != 0 && !isdigit((unsigned char) *str)) {
        str++;
    }
    return str;
}
2838
/* Scans the run of decimal digits at the start of 'str', writes a NUL over
 * the first character that follows the scanned digits and returns a pointer
 * to that NUL.  Never returns NULL.
 *
 * At most 'max_digits' + 1 digits are scanned.  Reading one digit more than
 * 'max_digits' means that an over-long number keeps its extra digit in the
 * terminated string, so a caller's range check on the parsed value can
 * reject it instead of seeing a silently shortened number.
 *
 * The character is converted to 'unsigned char' before being handed to
 * isdigit(); passing a negative 'char' value is undefined behavior. */
static char *
terminate_number_str(char *str, uint8_t max_digits)
{
    uint8_t digits_found = 0;

    while (isdigit((unsigned char) *str) && digits_found <= max_digits) {
        str++;
        digits_found++;
    }

    *str = 0;
    return str;
}
2851
2852
2853static void
2854get_ftp_ctl_msg(struct dp_packet *pkt, char *ftp_msg)
2855{
2856 struct tcp_header *th = dp_packet_l4(pkt);
2857 char *tcp_hdr = (char *) th;
2858 uint32_t tcp_payload_len = tcp_payload_length(pkt);
2859 size_t tcp_payload_of_interest = MIN(tcp_payload_len,
2860 LARGEST_FTP_MSG_OF_INTEREST);
2861 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2862
2863 ovs_strlcpy(ftp_msg, tcp_hdr + tcp_hdr_len,
2864 tcp_payload_of_interest);
2865}
2866
2867static enum ftp_ctl_pkt
2868detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
2869 struct dp_packet *pkt)
2870{
bd5e81a0
DB
2871 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2872 get_ftp_ctl_msg(pkt, ftp_msg);
dec0dbbc 2873
bd5e81a0
DB
2874 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
2875 if (strncasecmp(ftp_msg, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD)) &&
2876 !strcasestr(ftp_msg, FTP_EPSV_REPLY)) {
2877 return CT_FTP_CTL_OTHER;
2878 }
2879 } else {
2880 if (strncasecmp(ftp_msg, FTP_PORT_CMD, strlen(FTP_PORT_CMD)) &&
2881 strncasecmp(ftp_msg, FTP_PASV_REPLY_CODE,
2882 strlen(FTP_PASV_REPLY_CODE))) {
2883 return CT_FTP_CTL_OTHER;
2884 }
2885 }
2886
2887 return CT_FTP_CTL_INTEREST;
2888}
2889
2890static enum ftp_ctl_pkt
2891process_ftp_ctl_v4(struct conntrack *ct,
2892 struct dp_packet *pkt,
2893 const struct conn *conn_for_expectation,
4417ca3d 2894 ovs_be32 *v4_addr_rep,
bd5e81a0
DB
2895 char **ftp_data_v4_start,
2896 size_t *addr_offset_from_ftp_data_start)
2897{
2898 struct tcp_header *th = dp_packet_l4(pkt);
2899 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2900 char *tcp_hdr = (char *) th;
2901 *ftp_data_v4_start = tcp_hdr + tcp_hdr_len;
2902 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2903 get_ftp_ctl_msg(pkt, ftp_msg);
bd5e81a0
DB
2904 char *ftp = ftp_msg;
2905 enum ct_alg_mode mode;
dec0dbbc 2906
23bea975 2907 if (!strncasecmp(ftp, FTP_PORT_CMD, strlen(FTP_PORT_CMD))) {
bd5e81a0
DB
2908 ftp = ftp_msg + strlen(FTP_PORT_CMD);
2909 mode = CT_FTP_MODE_ACTIVE;
2910 } else {
2911 ftp = ftp_msg + strlen(FTP_PASV_REPLY_CODE);
2912 mode = CT_FTP_MODE_PASSIVE;
2913 }
2914
2915 /* Find first space. */
2916 ftp = strchr(ftp, ' ');
2917 if (!ftp) {
2918 return CT_FTP_CTL_INVALID;
2919 }
2920
2921 /* Find the first digit, after space. */
2922 ftp = skip_non_digits(ftp);
2923 if (*ftp == 0) {
2924 return CT_FTP_CTL_INVALID;
2925 }
2926
2927 char *ip_addr_start = ftp;
2928 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
bd5e81a0 2929
dec0dbbc 2930 uint8_t comma_count = 0;
bd5e81a0
DB
2931 while (comma_count < 4 && *ftp) {
2932 if (*ftp == ',') {
2933 comma_count++;
2934 if (comma_count == 4) {
2935 *ftp = 0;
2936 } else {
2937 *ftp = '.';
2938 }
2939 }
2940 ftp++;
2941 }
2942 if (comma_count != 4) {
2943 return CT_FTP_CTL_INVALID;
2944 }
2945
2946 struct in_addr ip_addr;
2947 int rc2 = inet_pton(AF_INET, ip_addr_start, &ip_addr);
2948 if (rc2 != 1) {
2949 return CT_FTP_CTL_INVALID;
2950 }
2951
2952 char *save_ftp = ftp;
2953 ftp = terminate_number_str(ftp, MAX_FTP_PORT_DGTS);
2954 if (!ftp) {
2955 return CT_FTP_CTL_INVALID;
2956 }
2957 int value;
2958 if (!str_to_int(save_ftp, 10, &value)) {
2959 return CT_FTP_CTL_INVALID;
2960 }
2961
2962 /* This is derived from the L4 port maximum is 65535. */
2963 if (value > 255) {
2964 return CT_FTP_CTL_INVALID;
2965 }
2966
2967 uint16_t port_hs = value;
2968 port_hs <<= 8;
2969
2970 /* Skip over comma. */
2971 ftp++;
2972 save_ftp = ftp;
2973 bool digit_found = false;
2974 while (isdigit(*ftp)) {
2975 ftp++;
2976 digit_found = true;
2977 }
2978 if (!digit_found) {
2979 return CT_FTP_CTL_INVALID;
2980 }
2981 *ftp = 0;
2982 if (!str_to_int(save_ftp, 10, &value)) {
2983 return CT_FTP_CTL_INVALID;
2984 }
2985
2986 if (value > 255) {
2987 return CT_FTP_CTL_INVALID;
2988 }
2989
2990 uint16_t port_lo_hs = value;
2991 if (65535 - port_hs < port_lo_hs) {
2992 return CT_FTP_CTL_INVALID;
2993 }
dec0dbbc 2994
bd5e81a0
DB
2995 port_hs |= port_lo_hs;
2996 ovs_be16 port = htons(port_hs);
2997 ovs_be32 conn_ipv4_addr;
2998
2999 switch (mode) {
3000 case CT_FTP_MODE_ACTIVE:
3001 *v4_addr_rep = conn_for_expectation->rev_key.dst.addr.ipv4_aligned;
3002 conn_ipv4_addr = conn_for_expectation->key.src.addr.ipv4_aligned;
3003 break;
3004 case CT_FTP_MODE_PASSIVE:
3005 *v4_addr_rep = conn_for_expectation->key.dst.addr.ipv4_aligned;
3006 conn_ipv4_addr = conn_for_expectation->rev_key.src.addr.ipv4_aligned;
3007 break;
7be77cb0 3008 case CT_TFTP_MODE:
bd5e81a0
DB
3009 default:
3010 OVS_NOT_REACHED();
3011 }
3012
3013 ovs_be32 ftp_ipv4_addr;
3014 ftp_ipv4_addr = ip_addr.s_addr;
3015 /* Although most servers will block this exploit, there may be some
3016 * less well managed. */
3017 if (ftp_ipv4_addr != conn_ipv4_addr && ftp_ipv4_addr != *v4_addr_rep) {
3018 return CT_FTP_CTL_INVALID;
3019 }
3020
be38342d
DB
3021 expectation_create(ct, port, conn_for_expectation,
3022 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
bd5e81a0
DB
3023 return CT_FTP_CTL_INTEREST;
3024}
3025
/* Returns a pointer to the first character of 'str' that cannot be part of
 * a textual IPv6 address, i.e. that is neither a hexadecimal digit, ':' nor
 * '.'.  This is the terminating NUL if the whole string qualifies.
 *
 * The character is converted to 'unsigned char' before being handed to
 * isxdigit(); passing a negative 'char' value is undefined behavior. */
static char *
skip_ipv6_digits(char *str)
{
    while (isxdigit((unsigned char) *str) || *str == ':' || *str == '.') {
        str++;
    }
    return str;
}
3034
3035static enum ftp_ctl_pkt
3036process_ftp_ctl_v6(struct conntrack *ct,
3037 struct dp_packet *pkt,
3038 const struct conn *conn_for_expectation,
bd5e81a0
DB
3039 struct ct_addr *v6_addr_rep,
3040 char **ftp_data_start,
3041 size_t *addr_offset_from_ftp_data_start,
3042 size_t *addr_size, enum ct_alg_mode *mode)
3043{
3044 struct tcp_header *th = dp_packet_l4(pkt);
3045 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
3046 char *tcp_hdr = (char *) th;
3047 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
bd5e81a0
DB
3048 get_ftp_ctl_msg(pkt, ftp_msg);
3049 *ftp_data_start = tcp_hdr + tcp_hdr_len;
bd5e81a0
DB
3050 char *ftp = ftp_msg;
3051 struct in6_addr ip6_addr;
dec0dbbc 3052
23bea975 3053 if (!strncasecmp(ftp, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD))) {
bd5e81a0
DB
3054 ftp = ftp_msg + strlen(FTP_EPRT_CMD);
3055 ftp = skip_non_digits(ftp);
3056 if (*ftp != FTP_AF_V6 || isdigit(ftp[1])) {
3057 return CT_FTP_CTL_INVALID;
3058 }
3059 /* Jump over delimiter. */
3060 ftp += 2;
3061
bd5e81a0 3062 memset(&ip6_addr, 0, sizeof ip6_addr);
dec0dbbc 3063 char *ip_addr_start = ftp;
bd5e81a0
DB
3064 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
3065 ftp = skip_ipv6_digits(ftp);
3066 *ftp = 0;
3067 *addr_size = ftp - ip_addr_start;
3068 int rc2 = inet_pton(AF_INET6, ip_addr_start, &ip6_addr);
3069 if (rc2 != 1) {
3070 return CT_FTP_CTL_INVALID;
3071 }
3072 ftp++;
3073 *mode = CT_FTP_MODE_ACTIVE;
3074 } else {
3075 ftp = ftp_msg + strcspn(ftp_msg, "(");
3076 ftp = skip_non_digits(ftp);
3077 if (!isdigit(*ftp)) {
3078 return CT_FTP_CTL_INVALID;
3079 }
3080
3081 /* Not used for passive mode. */
3082 *addr_offset_from_ftp_data_start = 0;
3083 *addr_size = 0;
3084
3085 *mode = CT_FTP_MODE_PASSIVE;
3086 }
3087
3088 char *save_ftp = ftp;
3089 ftp = terminate_number_str(ftp, MAX_EXT_FTP_PORT_DGTS);
3090 if (!ftp) {
3091 return CT_FTP_CTL_INVALID;
3092 }
dec0dbbc 3093
bd5e81a0
DB
3094 int value;
3095 if (!str_to_int(save_ftp, 10, &value)) {
3096 return CT_FTP_CTL_INVALID;
3097 }
3098 if (value > CT_MAX_L4_PORT) {
3099 return CT_FTP_CTL_INVALID;
3100 }
3101
3102 uint16_t port_hs = value;
3103 ovs_be16 port = htons(port_hs);
3104
3105 switch (*mode) {
3106 case CT_FTP_MODE_ACTIVE:
3107 *v6_addr_rep = conn_for_expectation->rev_key.dst.addr;
3108 /* Although most servers will block this exploit, there may be some
3109 * less well managed. */
3110 if (memcmp(&ip6_addr, &v6_addr_rep->ipv6_aligned, sizeof ip6_addr) &&
3111 memcmp(&ip6_addr, &conn_for_expectation->key.src.addr.ipv6_aligned,
3112 sizeof ip6_addr)) {
3113 return CT_FTP_CTL_INVALID;
3114 }
3115 break;
3116 case CT_FTP_MODE_PASSIVE:
3117 *v6_addr_rep = conn_for_expectation->key.dst.addr;
3118 break;
7be77cb0 3119 case CT_TFTP_MODE:
bd5e81a0
DB
3120 default:
3121 OVS_NOT_REACHED();
3122 }
3123
be38342d
DB
3124 expectation_create(ct, port, conn_for_expectation,
3125 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
bd5e81a0
DB
3126 return CT_FTP_CTL_INTEREST;
3127}
3128
3129static int
3130repl_ftp_v6_addr(struct dp_packet *pkt, struct ct_addr v6_addr_rep,
3131 char *ftp_data_start,
3132 size_t addr_offset_from_ftp_data_start,
3133 size_t addr_size, enum ct_alg_mode mode)
3134{
3135 /* This is slightly bigger than really possible. */
3136 enum { MAX_FTP_V6_NAT_DELTA = 45 };
3137
3138 if (mode == CT_FTP_MODE_PASSIVE) {
3139 return 0;
3140 }
3141
3142 /* Do conservative check for pathological MTU usage. */
3143 uint32_t orig_used_size = dp_packet_size(pkt);
3144 uint16_t allocated_size = dp_packet_get_allocated(pkt);
3145 if (orig_used_size + MAX_FTP_V6_NAT_DELTA > allocated_size) {
3146 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3147 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP",
3148 allocated_size);
3149 return 0;
3150 }
3151
bd5e81a0 3152 char v6_addr_str[IPV6_SCAN_LEN] = {0};
500db308
BP
3153 ovs_assert(inet_ntop(AF_INET6, &v6_addr_rep.ipv6_aligned, v6_addr_str,
3154 IPV6_SCAN_LEN - 1));
bd5e81a0
DB
3155
3156 size_t replace_addr_size = strlen(v6_addr_str);
3157
3158 size_t remain_size = tcp_payload_length(pkt) -
3159 addr_offset_from_ftp_data_start;
3160
3161 char *pkt_addr_str = ftp_data_start + addr_offset_from_ftp_data_start;
3162 replace_substring(pkt_addr_str, addr_size, remain_size,
3163 v6_addr_str, replace_addr_size);
3164
3165 int overall_delta = (int) replace_addr_size - (int) addr_size;
3166
3167 dp_packet_set_size(pkt, orig_used_size + overall_delta);
3168 return overall_delta;
3169}
3170
3171static void
3172handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
3173 struct dp_packet *pkt,
3174 const struct conn *conn_for_expectation,
3175 long long now, enum ftp_ctl_pkt ftp_ctl, bool nat)
3176{
3177 struct ip_header *l3_hdr = dp_packet_l3(pkt);
3178 ovs_be32 v4_addr_rep = 0;
3179 struct ct_addr v6_addr_rep;
3180 size_t addr_offset_from_ftp_data_start;
3181 size_t addr_size = 0;
3182 char *ftp_data_start;
3183 bool do_seq_skew_adj = true;
3184 enum ct_alg_mode mode = CT_FTP_MODE_ACTIVE;
3185
3186 if (detect_ftp_ctl_type(ctx, pkt) != ftp_ctl) {
3187 return;
3188 }
3189
3190 if (!nat || !conn_for_expectation->seq_skew) {
3191 do_seq_skew_adj = false;
3192 }
3193
3194 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
3195 int64_t seq_skew = 0;
dec0dbbc 3196
bd5e81a0
DB
3197 if (ftp_ctl == CT_FTP_CTL_OTHER) {
3198 seq_skew = conn_for_expectation->seq_skew;
bd5e81a0
DB
3199 } else if (ftp_ctl == CT_FTP_CTL_INTEREST) {
3200 enum ftp_ctl_pkt rc;
3201 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3202 rc = process_ftp_ctl_v6(ct, pkt, conn_for_expectation,
4417ca3d 3203 &v6_addr_rep, &ftp_data_start,
bd5e81a0
DB
3204 &addr_offset_from_ftp_data_start,
3205 &addr_size, &mode);
3206 } else {
3207 rc = process_ftp_ctl_v4(ct, pkt, conn_for_expectation,
4417ca3d 3208 &v4_addr_rep, &ftp_data_start,
bd5e81a0
DB
3209 &addr_offset_from_ftp_data_start);
3210 }
3211 if (rc == CT_FTP_CTL_INVALID) {
3212 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3213 VLOG_WARN_RL(&rl, "Invalid FTP control packet format");
3214 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
3215 return;
3216 } else if (rc == CT_FTP_CTL_INTEREST) {
3217 uint16_t ip_len;
dec0dbbc 3218
bd5e81a0
DB
3219 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3220 seq_skew = repl_ftp_v6_addr(pkt, v6_addr_rep, ftp_data_start,
3221 addr_offset_from_ftp_data_start,
3222 addr_size, mode);
bd5e81a0
DB
3223 if (seq_skew) {
3224 ip_len = ntohs(nh6->ip6_ctlun.ip6_un1.ip6_un1_plen);
3225 ip_len += seq_skew;
3226 nh6->ip6_ctlun.ip6_un1.ip6_un1_plen = htons(ip_len);
3227 conn_seq_skew_set(ct, &conn_for_expectation->key, now,
23bea975 3228 seq_skew, ctx->reply);
bd5e81a0
DB
3229 }
3230 } else {
3231 seq_skew = repl_ftp_v4_addr(pkt, v4_addr_rep, ftp_data_start,
3232 addr_offset_from_ftp_data_start);
bd5e81a0
DB
3233 ip_len = ntohs(l3_hdr->ip_tot_len);
3234 if (seq_skew) {
3235 ip_len += seq_skew;
3236 l3_hdr->ip_csum = recalc_csum16(l3_hdr->ip_csum,
3237 l3_hdr->ip_tot_len, htons(ip_len));
3238 l3_hdr->ip_tot_len = htons(ip_len);
3239 conn_seq_skew_set(ct, &conn_for_expectation->key, now,
23bea975 3240 seq_skew, ctx->reply);
bd5e81a0
DB
3241 }
3242 }
3243 } else {
3244 OVS_NOT_REACHED();
3245 }
3246 } else {
3247 OVS_NOT_REACHED();
3248 }
3249
3250 struct tcp_header *th = dp_packet_l4(pkt);
dec0dbbc 3251
bd5e81a0
DB
3252 if (do_seq_skew_adj && seq_skew != 0) {
3253 if (ctx->reply != conn_for_expectation->seq_skew_dir) {
3254
3255 uint32_t tcp_ack = ntohl(get_16aligned_be32(&th->tcp_ack));
3256
3257 if ((seq_skew > 0) && (tcp_ack < seq_skew)) {
3258 /* Should not be possible; will be marked invalid. */
3259 tcp_ack = 0;
3260 } else if ((seq_skew < 0) && (UINT32_MAX - tcp_ack < -seq_skew)) {
3261 tcp_ack = (-seq_skew) - (UINT32_MAX - tcp_ack);
3262 } else {
3263 tcp_ack -= seq_skew;
3264 }
3265 ovs_be32 new_tcp_ack = htonl(tcp_ack);
3266 put_16aligned_be32(&th->tcp_ack, new_tcp_ack);
3267 } else {
3268 uint32_t tcp_seq = ntohl(get_16aligned_be32(&th->tcp_seq));
3269 if ((seq_skew > 0) && (UINT32_MAX - tcp_seq < seq_skew)) {
3270 tcp_seq = seq_skew - (UINT32_MAX - tcp_seq);
3271 } else if ((seq_skew < 0) && (tcp_seq < -seq_skew)) {
3272 /* Should not be possible; will be marked invalid. */
3273 tcp_seq = 0;
3274 } else {
3275 tcp_seq += seq_skew;
3276 }
3277 ovs_be32 new_tcp_seq = htonl(tcp_seq);
3278 put_16aligned_be32(&th->tcp_seq, new_tcp_seq);
3279 }
3280 }
3281
bd5e81a0
DB
3282 th->tcp_csum = 0;
3283 uint32_t tcp_csum;
3284 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3285 tcp_csum = packet_csum_pseudoheader6(nh6);
3286 } else {
3287 tcp_csum = packet_csum_pseudoheader(l3_hdr);
3288 }
dec0dbbc
DB
3289 const char *tail = dp_packet_tail(pkt);
3290 uint8_t pad = dp_packet_l2_pad_size(pkt);
bd5e81a0
DB
3291 th->tcp_csum = csum_finish(
3292 csum_continue(tcp_csum, th, tail - (char *) th - pad));
bd5e81a0 3293}
7be77cb0
DB
3294
3295static void
3296handle_tftp_ctl(struct conntrack *ct,
94e71143 3297 const struct conn_lookup_ctx *ctx OVS_UNUSED,
be38342d 3298 struct dp_packet *pkt,
7be77cb0 3299 const struct conn *conn_for_expectation,
4417ca3d
DB
3300 long long now OVS_UNUSED,
3301 enum ftp_ctl_pkt ftp_ctl OVS_UNUSED, bool nat OVS_UNUSED)
7be77cb0 3302{
be38342d
DB
3303 expectation_create(ct, conn_for_expectation->key.src.port,
3304 conn_for_expectation,
3305 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
7be77cb0 3306}