]> git.proxmox.com Git - mirror_ovs.git/blame - lib/conntrack.c
dpctl: Stop showing the dpctl/help command.
[mirror_ovs.git] / lib / conntrack.c
CommitLineData
a489b168 1/*
4ea96698 2 * Copyright (c) 2015-2019 Nicira, Inc.
a489b168
DDP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
bd5e81a0 18#include <ctype.h>
a489b168 19#include <errno.h>
ff6aa424 20#include <sys/types.h>
a489b168
DDP
21#include <netinet/in.h>
22#include <netinet/icmp6.h>
bd5e81a0 23#include <string.h>
a489b168
DDP
24
25#include "bitmap.h"
bd5e81a0 26#include "conntrack.h"
a489b168
DDP
27#include "conntrack-private.h"
28#include "coverage.h"
29#include "csum.h"
4d4e68ed 30#include "ct-dpif.h"
a489b168
DDP
31#include "dp-packet.h"
32#include "flow.h"
4ea96698 33#include "ipf.h"
a489b168
DDP
34#include "netdev.h"
35#include "odp-netlink.h"
36#include "openvswitch/hmap.h"
37#include "openvswitch/vlog.h"
38#include "ovs-rcu.h"
e6ef6cc6 39#include "ovs-thread.h"
fd016ae3 40#include "openvswitch/poll-loop.h"
a489b168
DDP
41#include "random.h"
42#include "timeval.h"
43
44VLOG_DEFINE_THIS_MODULE(conntrack);
45
46COVERAGE_DEFINE(conntrack_full);
e6ef6cc6 47COVERAGE_DEFINE(conntrack_long_cleanup);
a489b168
DDP
48
49struct conn_lookup_ctx {
50 struct conn_key key;
51 struct conn *conn;
52 uint32_t hash;
53 bool reply;
dbb597d3 54 bool icmp_related;
a489b168
DDP
55};
56
bd5e81a0
DB
/* Classification of an FTP control-channel packet. */
enum ftp_ctl_pkt {
    /* Control packets with address and/or port specifiers. */
    CT_FTP_CTL_INTEREST,

    /* Control packets without address and/or port specifiers. */
    CT_FTP_CTL_OTHER,

    /* Control packets that could not be classified as either of the
     * above. */
    CT_FTP_CTL_INVALID,
};
64
/* Operating mode of an ALG-tracked connection. */
enum ct_alg_mode {
    CT_FTP_MODE_ACTIVE,     /* FTP, active mode. */
    CT_FTP_MODE_PASSIVE,    /* FTP, passive mode. */
    CT_TFTP_MODE,           /* TFTP. */
};
70
94e71143
DB
/* Kind of ALG control connection.  The values are also used as indexes
 * into 'alg_helpers'. */
enum ct_alg_ctl_type {
    CT_ALG_CTL_NONE,
    CT_ALG_CTL_FTP,
    CT_ALG_CTL_TFTP,

    /* SIP is not enabled through Openflow and presently only used as
     * an example of an alg that allows a wildcard src ip. */
    CT_ALG_CTL_SIP,
};
79
a489b168 80static bool conn_key_extract(struct conntrack *, struct dp_packet *,
66e4ad8a
DDP
81 ovs_be16 dl_type, struct conn_lookup_ctx *,
82 uint16_t zone);
a489b168
DDP
83static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
84static void conn_key_reverse(struct conn_key *);
85static void conn_key_lookup(struct conntrack_bucket *ctb,
86 struct conn_lookup_ctx *ctx,
87 long long now);
88static bool valid_new(struct dp_packet *pkt, struct conn_key *);
e6ef6cc6
DDP
89static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
90 struct conn_key *, long long now);
a489b168 91static void delete_conn(struct conn *);
e6ef6cc6
DDP
92static enum ct_update_res conn_update(struct conn *,
93 struct conntrack_bucket *ctb,
94 struct dp_packet *, bool reply,
95 long long now);
a489b168
DDP
96static bool conn_expired(struct conn *, long long now);
97static void set_mark(struct dp_packet *, struct conn *,
98 uint32_t val, uint32_t mask);
99static void set_label(struct dp_packet *, struct conn *,
100 const struct ovs_key_ct_labels *val,
101 const struct ovs_key_ct_labels *mask);
e6ef6cc6 102static void *clean_thread_main(void *f_);
a489b168 103
286de272
DB
104static struct nat_conn_key_node *
105nat_conn_keys_lookup(struct hmap *nat_conn_keys,
106 const struct conn_key *key,
107 uint32_t basis);
108
80cee116
DB
109static bool
110nat_conn_keys_insert(struct hmap *nat_conn_keys,
111 const struct conn *nat_conn,
112 uint32_t hash_basis);
113
286de272
DB
114static void
115nat_conn_keys_remove(struct hmap *nat_conn_keys,
116 const struct conn_key *key,
117 uint32_t basis);
118
119static bool
120nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
121 struct conn *nat_conn);
122
123static uint8_t
124reverse_icmp_type(uint8_t type);
125static uint8_t
126reverse_icmp6_type(uint8_t type);
127static inline bool
128extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
129 const char **new_data, bool validate_checksum);
130static inline bool
131extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
132 const char **new_data);
bd5e81a0 133static struct alg_exp_node *
be38342d
DB
134expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
135 uint32_t basis, bool src_ip_wc);
bd5e81a0
DB
136
137static int
138repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
139 char *ftp_data_v4_start,
cd7c99a6 140 size_t addr_offset_from_ftp_data_start, size_t addr_size);
bd5e81a0
DB
141
142static enum ftp_ctl_pkt
143process_ftp_ctl_v4(struct conntrack *ct,
144 struct dp_packet *pkt,
145 const struct conn *conn_for_expectation,
4417ca3d 146 ovs_be32 *v4_addr_rep,
bd5e81a0 147 char **ftp_data_v4_start,
cd7c99a6
DB
148 size_t *addr_offset_from_ftp_data_start,
149 size_t *addr_size);
bd5e81a0
DB
150
151static enum ftp_ctl_pkt
152detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
153 struct dp_packet *pkt);
154
4417ca3d
DB
155static void
156expectation_clean(struct conntrack *ct, const struct conn_key *master_key,
157 uint32_t basis);
158
94e71143
DB
159static struct ct_l4_proto *l4_protos[] = {
160 [IPPROTO_TCP] = &ct_proto_tcp,
161 [IPPROTO_UDP] = &ct_proto_other,
162 [IPPROTO_ICMP] = &ct_proto_icmp4,
163 [IPPROTO_ICMPV6] = &ct_proto_icmp6,
164};
165
bd5e81a0
DB
166static void
167handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
168 struct dp_packet *pkt,
169 const struct conn *conn_for_expectation,
170 long long now, enum ftp_ctl_pkt ftp_ctl, bool nat);
171
7be77cb0
DB
172static void
173handle_tftp_ctl(struct conntrack *ct,
94e71143 174 const struct conn_lookup_ctx *ctx OVS_UNUSED,
be38342d 175 struct dp_packet *pkt,
7be77cb0 176 const struct conn *conn_for_expectation,
4417ca3d
DB
177 long long now OVS_UNUSED,
178 enum ftp_ctl_pkt ftp_ctl OVS_UNUSED, bool nat OVS_UNUSED);
94e71143
DB
179
/* Signature shared by all ALG control-packet handlers. */
typedef void (*alg_helper)(struct conntrack *ct,
                           const struct conn_lookup_ctx *ctx,
                           struct dp_packet *pkt,
                           const struct conn *conn_for_expectation,
                           long long now,
                           enum ftp_ctl_pkt ftp_ctl,
                           bool nat);
186
187static alg_helper alg_helpers[] = {
188 [CT_ALG_CTL_NONE] = NULL,
189 [CT_ALG_CTL_FTP] = handle_ftp_ctl,
190 [CT_ALG_CTL_TFTP] = handle_tftp_ctl,
a489b168
DDP
191};
192
193long long ct_timeout_val[] = {
194#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
195 CT_TIMEOUTS
196#undef CT_TIMEOUT
197};
198
bd5e81a0
DB
/* Largest valid TCP or UDP port number. */
#define CT_MAX_L4_PORT 65535

/* Size of the string buffer used when parsing FTP messages.  It is roughly
 * double what is strictly required, to leave some margin of error. */
#define LARGEST_FTP_MSG_OF_INTEREST 128

/* Command string for FTP active mode. */
#define FTP_PORT_CMD "PORT"

/* Reply code string for FTP passive mode. */
#define FTP_PASV_REPLY_CODE "227"

/* Maximum number of decimal digits in each port component of an FTP
 * command.  The port is written as two numbers of up to 3 digits, the high
 * part being a multiple of 256. */
#define MAX_FTP_PORT_DGTS 3

/* FTP extension: EPRT command string, used for active mode. */
#define FTP_EPRT_CMD "EPRT"

/* FTP extension: EPSV reply string, used for passive mode. */
#define FTP_EPSV_REPLY "EXTENDED PASSIVE"

/* Maximum number of decimal digits in the port of an FTP extended
 * command. */
#define MAX_EXT_FTP_PORT_DGTS 5

/* Address family code for IPv6 in FTP extended commands. */
#define FTP_AF_V6 '2'

/* Wildcard L4 source port for ALGs, used in expectations when the port
 * number cannot be predicted. */
#define ALG_WC_SRC_PORT 0

/* Once the total number of connections exceeds this value, no new
 * connections are accepted; this applies to CT_CONN_TYPE_DEFAULT
 * connections. */
#define DEFAULT_N_CONN_LIMIT 3000000
230
5ed7a0b4
DB
231/* Does a member by member comparison of two conn_keys; this
232 * function must be kept in sync with struct conn_key; returns 0
233 * if the keys are equal or 1 if the keys are not equal. */
234static int
235conn_key_cmp(const struct conn_key *key1, const struct conn_key *key2)
236{
237 if (!memcmp(&key1->src.addr, &key2->src.addr, sizeof key1->src.addr) &&
238 !memcmp(&key1->dst.addr, &key2->dst.addr, sizeof key1->dst.addr) &&
239 (key1->src.icmp_id == key2->src.icmp_id) &&
240 (key1->src.icmp_type == key2->src.icmp_type) &&
241 (key1->src.icmp_code == key2->src.icmp_code) &&
242 (key1->dst.icmp_id == key2->dst.icmp_id) &&
243 (key1->dst.icmp_type == key2->dst.icmp_type) &&
244 (key1->dst.icmp_code == key2->dst.icmp_code) &&
245 (key1->dl_type == key2->dl_type) &&
246 (key1->zone == key2->zone) &&
247 (key1->nw_proto == key2->nw_proto)) {
248
249 return 0;
250 }
251 return 1;
252}
253
d8682ee5 254static void
dec0dbbc
DB
255ct_print_conn_info(const struct conn *c, const char *log_msg,
256 enum vlog_level vll, bool force, bool rl_on)
66f400f5
DB
257{
258#define CT_VLOG(RL_ON, LEVEL, ...) \
259 do { \
260 if (RL_ON) { \
261 static struct vlog_rate_limit rl_ = VLOG_RATE_LIMIT_INIT(5, 5); \
262 vlog_rate_limit(&this_module, LEVEL, &rl_, __VA_ARGS__); \
263 } else { \
264 vlog(&this_module, LEVEL, __VA_ARGS__); \
265 } \
266 } while (0)
267
268 if (OVS_UNLIKELY(force || vlog_is_enabled(&this_module, vll))) {
269 if (c->key.dl_type == htons(ETH_TYPE_IP)) {
270 CT_VLOG(rl_on, vll, "%s: src ip "IP_FMT" dst ip "IP_FMT" rev src "
271 "ip "IP_FMT" rev dst ip "IP_FMT" src/dst ports "
272 "%"PRIu16"/%"PRIu16" rev src/dst ports "
273 "%"PRIu16"/%"PRIu16" zone/rev zone "
274 "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
275 "%"PRIu8"/%"PRIu8, log_msg,
cda1b109
DB
276 IP_ARGS(c->key.src.addr.ipv4),
277 IP_ARGS(c->key.dst.addr.ipv4),
278 IP_ARGS(c->rev_key.src.addr.ipv4),
279 IP_ARGS(c->rev_key.dst.addr.ipv4),
66f400f5
DB
280 ntohs(c->key.src.port), ntohs(c->key.dst.port),
281 ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
282 c->key.zone, c->rev_key.zone, c->key.nw_proto,
283 c->rev_key.nw_proto);
284 } else {
285 char ip6_s[INET6_ADDRSTRLEN];
286 inet_ntop(AF_INET6, &c->key.src.addr.ipv6, ip6_s, sizeof ip6_s);
287 char ip6_d[INET6_ADDRSTRLEN];
288 inet_ntop(AF_INET6, &c->key.dst.addr.ipv6, ip6_d, sizeof ip6_d);
289 char ip6_rs[INET6_ADDRSTRLEN];
290 inet_ntop(AF_INET6, &c->rev_key.src.addr.ipv6, ip6_rs,
291 sizeof ip6_rs);
292 char ip6_rd[INET6_ADDRSTRLEN];
293 inet_ntop(AF_INET6, &c->rev_key.dst.addr.ipv6, ip6_rd,
294 sizeof ip6_rd);
295
296 CT_VLOG(rl_on, vll, "%s: src ip %s dst ip %s rev src ip %s"
297 " rev dst ip %s src/dst ports %"PRIu16"/%"PRIu16
298 " rev src/dst ports %"PRIu16"/%"PRIu16" zone/rev zone "
299 "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
300 "%"PRIu8"/%"PRIu8, log_msg, ip6_s, ip6_d, ip6_rs,
301 ip6_rd, ntohs(c->key.src.port), ntohs(c->key.dst.port),
302 ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
303 c->key.zone, c->rev_key.zone, c->key.nw_proto,
304 c->rev_key.nw_proto);
305 }
306 }
307}
308
a489b168
DDP
309/* Initializes the connection tracker 'ct'. The caller is responsible for
310 * calling 'conntrack_destroy()', when the instance is not needed anymore */
311void
312conntrack_init(struct conntrack *ct)
313{
e6ef6cc6 314 long long now = time_msec();
a489b168 315
8b934ced
DB
316 ct_rwlock_init(&ct->resources_lock);
317 ct_rwlock_wrlock(&ct->resources_lock);
286de272 318 hmap_init(&ct->nat_conn_keys);
bd5e81a0 319 hmap_init(&ct->alg_expectations);
4417ca3d 320 hindex_init(&ct->alg_expectation_refs);
bd5e81a0 321 ovs_list_init(&ct->alg_exp_list);
8b934ced 322 ct_rwlock_unlock(&ct->resources_lock);
286de272 323
dec0dbbc 324 for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
a489b168
DDP
325 struct conntrack_bucket *ctb = &ct->buckets[i];
326
327 ct_lock_init(&ctb->lock);
328 ct_lock_lock(&ctb->lock);
329 hmap_init(&ctb->connections);
dec0dbbc 330 for (unsigned j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
e6ef6cc6
DDP
331 ovs_list_init(&ctb->exp_lists[j]);
332 }
a489b168 333 ct_lock_unlock(&ctb->lock);
e6ef6cc6
DDP
334 ovs_mutex_init(&ctb->cleanup_mutex);
335 ovs_mutex_lock(&ctb->cleanup_mutex);
336 ctb->next_cleanup = now + CT_TM_MIN;
337 ovs_mutex_unlock(&ctb->cleanup_mutex);
a489b168
DDP
338 }
339 ct->hash_basis = random_uint32();
340 atomic_count_init(&ct->n_conn, 0);
341 atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
e6ef6cc6
DDP
342 latch_init(&ct->clean_thread_exit);
343 ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
4ea96698 344 ct->ipf = ipf_init();
a489b168
DDP
345}
346
347/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
348void
349conntrack_destroy(struct conntrack *ct)
350{
e6ef6cc6
DDP
351 latch_set(&ct->clean_thread_exit);
352 pthread_join(ct->clean_thread, NULL);
353 latch_destroy(&ct->clean_thread_exit);
dec0dbbc 354 for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
a489b168
DDP
355 struct conntrack_bucket *ctb = &ct->buckets[i];
356 struct conn *conn;
357
e6ef6cc6 358 ovs_mutex_destroy(&ctb->cleanup_mutex);
a489b168 359 ct_lock_lock(&ctb->lock);
bd5e81a0 360 HMAP_FOR_EACH_POP (conn, node, &ctb->connections) {
286de272
DB
361 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
362 atomic_count_dec(&ct->n_conn);
363 }
a489b168
DDP
364 delete_conn(conn);
365 }
366 hmap_destroy(&ctb->connections);
367 ct_lock_unlock(&ctb->lock);
368 ct_lock_destroy(&ctb->lock);
369 }
8b934ced 370 ct_rwlock_wrlock(&ct->resources_lock);
286de272
DB
371 struct nat_conn_key_node *nat_conn_key_node;
372 HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
373 free(nat_conn_key_node);
374 }
375 hmap_destroy(&ct->nat_conn_keys);
bd5e81a0
DB
376
377 struct alg_exp_node *alg_exp_node;
378 HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {
379 free(alg_exp_node);
380 }
4417ca3d 381
bd5e81a0
DB
382 ovs_list_poison(&ct->alg_exp_list);
383 hmap_destroy(&ct->alg_expectations);
4417ca3d 384 hindex_destroy(&ct->alg_expectation_refs);
8b934ced
DB
385 ct_rwlock_unlock(&ct->resources_lock);
386 ct_rwlock_destroy(&ct->resources_lock);
4ea96698 387 ipf_destroy(ct->ipf);
a489b168
DDP
388}
389\f
390static unsigned hash_to_bucket(uint32_t hash)
391{
392 /* Extracts the most significant bits in hash. The least significant bits
393 * are already used internally by the hmap implementation. */
394 BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
395
396 return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
397}
398
399static void
286de272 400write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
bd5e81a0 401 const struct conn_key *key, const struct alg_exp_node *alg_exp)
a489b168 402{
286de272 403 pkt->md.ct_state |= CS_TRACKED;
a489b168 404 pkt->md.ct_zone = zone;
daf4d3c1
JR
405 pkt->md.ct_mark = conn ? conn->mark : 0;
406 pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;
407
408 /* Use the original direction tuple if we have it. */
409 if (conn) {
bd5e81a0
DB
410 if (conn->alg_related) {
411 key = &conn->master_key;
412 } else {
413 key = &conn->key;
414 }
415 } else if (alg_exp) {
416 pkt->md.ct_mark = alg_exp->master_mark;
417 pkt->md.ct_label = alg_exp->master_label;
418 key = &alg_exp->master_key;
daf4d3c1 419 }
dec0dbbc 420
daf4d3c1 421 pkt->md.ct_orig_tuple_ipv6 = false;
dec0dbbc 422
daf4d3c1
JR
423 if (key) {
424 if (key->dl_type == htons(ETH_TYPE_IP)) {
425 pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
cda1b109
DB
426 key->src.addr.ipv4,
427 key->dst.addr.ipv4,
daf4d3c1
JR
428 key->nw_proto != IPPROTO_ICMP
429 ? key->src.port : htons(key->src.icmp_type),
430 key->nw_proto != IPPROTO_ICMP
431 ? key->dst.port : htons(key->src.icmp_code),
432 key->nw_proto,
433 };
286de272 434 } else {
daf4d3c1
JR
435 pkt->md.ct_orig_tuple_ipv6 = true;
436 pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
cda1b109
DB
437 key->src.addr.ipv6,
438 key->dst.addr.ipv6,
daf4d3c1
JR
439 key->nw_proto != IPPROTO_ICMPV6
440 ? key->src.port : htons(key->src.icmp_type),
441 key->nw_proto != IPPROTO_ICMPV6
442 ? key->dst.port : htons(key->src.icmp_code),
443 key->nw_proto,
444 };
445 }
446 } else {
447 memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
448 }
bd5e81a0
DB
449}
450
451static uint8_t
452get_ip_proto(const struct dp_packet *pkt)
453{
454 uint8_t ip_proto;
455 struct eth_header *l2 = dp_packet_eth(pkt);
456 if (l2->eth_type == htons(ETH_TYPE_IPV6)) {
457 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
458 ip_proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
459 } else {
460 struct ip_header *l3_hdr = dp_packet_l3(pkt);
461 ip_proto = l3_hdr->ip_proto;
462 }
286de272 463
bd5e81a0
DB
464 return ip_proto;
465}
466
467static bool
94e71143 468is_ftp_ctl(const enum ct_alg_ctl_type ct_alg_ctl)
bd5e81a0 469{
94e71143 470 return ct_alg_ctl == CT_ALG_CTL_FTP;
bd5e81a0
DB
471}
472
94e71143 473static enum ct_alg_ctl_type
bd7d93f8
DB
474get_alg_ctl_type(const struct dp_packet *pkt, ovs_be16 tp_src, ovs_be16 tp_dst,
475 const char *helper)
7be77cb0 476{
94e71143
DB
477 /* CT_IPPORT_FTP/TFTP is used because IPPORT_FTP/TFTP in not defined
478 * in OSX, at least in in.h. Since these values will never change, remove
7be77cb0 479 * the external dependency. */
94e71143
DB
480 enum { CT_IPPORT_FTP = 21 };
481 enum { CT_IPPORT_TFTP = 69 };
bd7d93f8
DB
482 uint8_t ip_proto = get_ip_proto(pkt);
483 struct udp_header *uh = dp_packet_l4(pkt);
484 struct tcp_header *th = dp_packet_l4(pkt);
485 ovs_be16 ftp_src_port = htons(CT_IPPORT_FTP);
486 ovs_be16 ftp_dst_port = htons(CT_IPPORT_FTP);
487 ovs_be16 tftp_dst_port = htons(CT_IPPORT_TFTP);
488
489 if (OVS_UNLIKELY(tp_dst)) {
490 if (helper && !strncmp(helper, "ftp", strlen("ftp"))) {
491 ftp_dst_port = tp_dst;
492 } else if (helper && !strncmp(helper, "tftp", strlen("tftp"))) {
493 tftp_dst_port = tp_dst;
494 }
495 } else if (OVS_UNLIKELY(tp_src)) {
496 if (helper && !strncmp(helper, "ftp", strlen("ftp"))) {
497 ftp_src_port = tp_src;
498 }
499 }
7be77cb0 500
bd7d93f8 501 if (ip_proto == IPPROTO_UDP && uh->udp_dst == tftp_dst_port) {
94e71143
DB
502 return CT_ALG_CTL_TFTP;
503 } else if (ip_proto == IPPROTO_TCP &&
bd7d93f8 504 (th->tcp_src == ftp_src_port || th->tcp_dst == ftp_dst_port)) {
94e71143
DB
505 return CT_ALG_CTL_FTP;
506 }
507 return CT_ALG_CTL_NONE;
508}
509
be38342d
DB
510static bool
511alg_src_ip_wc(enum ct_alg_ctl_type alg_ctl_type)
512{
513 if (alg_ctl_type == CT_ALG_CTL_SIP) {
514 return true;
515 }
516 return false;
517}
518
94e71143
DB
519static void
520handle_alg_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
521 struct dp_packet *pkt, enum ct_alg_ctl_type ct_alg_ctl,
522 const struct conn *conn, long long now, bool nat,
523 const struct conn *conn_for_expectation)
524{
525 /* ALG control packet handling with expectation creation. */
3a2a425b 526 if (OVS_UNLIKELY(alg_helpers[ct_alg_ctl] && conn && conn->alg)) {
94e71143
DB
527 alg_helpers[ct_alg_ctl](ct, ctx, pkt, conn_for_expectation, now,
528 CT_FTP_CTL_INTEREST, nat);
529 }
7be77cb0
DB
530}
531
286de272
DB
532static void
533pat_packet(struct dp_packet *pkt, const struct conn *conn)
534{
535 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
536 if (conn->key.nw_proto == IPPROTO_TCP) {
537 struct tcp_header *th = dp_packet_l4(pkt);
538 packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
539 } else if (conn->key.nw_proto == IPPROTO_UDP) {
540 struct udp_header *uh = dp_packet_l4(pkt);
541 packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
542 }
543 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
544 if (conn->key.nw_proto == IPPROTO_TCP) {
545 struct tcp_header *th = dp_packet_l4(pkt);
546 packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
547 } else if (conn->key.nw_proto == IPPROTO_UDP) {
548 struct udp_header *uh = dp_packet_l4(pkt);
549 packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
550 }
551 }
552}
553
554static void
555nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
556{
557 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
558 pkt->md.ct_state |= CS_SRC_NAT;
559 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
560 struct ip_header *nh = dp_packet_l3(pkt);
561 packet_set_ipv4_addr(pkt, &nh->ip_src,
cda1b109 562 conn->rev_key.dst.addr.ipv4);
286de272
DB
563 } else {
564 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
565 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
566 nh6->ip6_src.be32,
cda1b109 567 &conn->rev_key.dst.addr.ipv6, true);
286de272
DB
568 }
569 if (!related) {
570 pat_packet(pkt, conn);
571 }
572 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
573 pkt->md.ct_state |= CS_DST_NAT;
574 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
575 struct ip_header *nh = dp_packet_l3(pkt);
576 packet_set_ipv4_addr(pkt, &nh->ip_dst,
cda1b109 577 conn->rev_key.src.addr.ipv4);
286de272
DB
578 } else {
579 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
580 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
581 nh6->ip6_dst.be32,
cda1b109 582 &conn->rev_key.src.addr.ipv6, true);
286de272
DB
583 }
584 if (!related) {
585 pat_packet(pkt, conn);
586 }
587 }
588}
589
590static void
591un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
592{
593 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
594 if (conn->key.nw_proto == IPPROTO_TCP) {
595 struct tcp_header *th = dp_packet_l4(pkt);
596 packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
597 } else if (conn->key.nw_proto == IPPROTO_UDP) {
598 struct udp_header *uh = dp_packet_l4(pkt);
599 packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
600 }
601 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
602 if (conn->key.nw_proto == IPPROTO_TCP) {
603 struct tcp_header *th = dp_packet_l4(pkt);
604 packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
605 } else if (conn->key.nw_proto == IPPROTO_UDP) {
606 struct udp_header *uh = dp_packet_l4(pkt);
607 packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
608 }
609 }
610}
611
edd1bef4
DB
612static void
613reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
614{
615 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
616 if (conn->key.nw_proto == IPPROTO_TCP) {
617 struct tcp_header *th_in = dp_packet_l4(pkt);
618 packet_set_tcp_port(pkt, conn->key.src.port,
619 th_in->tcp_dst);
620 } else if (conn->key.nw_proto == IPPROTO_UDP) {
621 struct udp_header *uh_in = dp_packet_l4(pkt);
622 packet_set_udp_port(pkt, conn->key.src.port,
623 uh_in->udp_dst);
624 }
625 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
626 if (conn->key.nw_proto == IPPROTO_TCP) {
627 struct tcp_header *th_in = dp_packet_l4(pkt);
628 packet_set_tcp_port(pkt, th_in->tcp_src,
629 conn->key.dst.port);
630 } else if (conn->key.nw_proto == IPPROTO_UDP) {
631 struct udp_header *uh_in = dp_packet_l4(pkt);
632 packet_set_udp_port(pkt, uh_in->udp_src,
633 conn->key.dst.port);
634 }
635 }
636}
637
638static void
639reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
640{
641 char *tail = dp_packet_tail(pkt);
642 char pad = dp_packet_l2_pad_size(pkt);
643 struct conn_key inner_key;
644 const char *inner_l4 = NULL;
645 uint16_t orig_l3_ofs = pkt->l3_ofs;
646 uint16_t orig_l4_ofs = pkt->l4_ofs;
647
648 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
649 struct ip_header *nh = dp_packet_l3(pkt);
650 struct icmp_header *icmp = dp_packet_l4(pkt);
651 struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
bd5e81a0
DB
652 extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3) - pad,
653 &inner_l4, false);
edd1bef4
DB
654 pkt->l3_ofs += (char *) inner_l3 - (char *) nh;
655 pkt->l4_ofs += inner_l4 - (char *) icmp;
656
657 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
658 packet_set_ipv4_addr(pkt, &inner_l3->ip_src,
cda1b109 659 conn->key.src.addr.ipv4);
edd1bef4
DB
660 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
661 packet_set_ipv4_addr(pkt, &inner_l3->ip_dst,
cda1b109 662 conn->key.dst.addr.ipv4);
edd1bef4 663 }
dec0dbbc 664
edd1bef4
DB
665 reverse_pat_packet(pkt, conn);
666 icmp->icmp_csum = 0;
667 icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
668 } else {
669 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
670 struct icmp6_error_header *icmp6 = dp_packet_l4(pkt);
671 struct ovs_16aligned_ip6_hdr *inner_l3_6 =
672 (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1);
673 extract_l3_ipv6(&inner_key, inner_l3_6,
674 tail - ((char *)inner_l3_6) - pad,
675 &inner_l4);
676 pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6;
677 pkt->l4_ofs += inner_l4 - (char *) icmp6;
678
679 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
680 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
681 inner_l3_6->ip6_src.be32,
cda1b109 682 &conn->key.src.addr.ipv6, true);
edd1bef4
DB
683 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
684 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
685 inner_l3_6->ip6_dst.be32,
cda1b109 686 &conn->key.dst.addr.ipv6, true);
edd1bef4
DB
687 }
688 reverse_pat_packet(pkt, conn);
edd1bef4 689 icmp6->icmp6_base.icmp6_cksum = 0;
76d85771
DB
690 icmp6->icmp6_base.icmp6_cksum = packet_csum_upperlayer6(nh6, icmp6,
691 IPPROTO_ICMPV6, tail - (char *) icmp6 - pad);
edd1bef4
DB
692 }
693 pkt->l3_ofs = orig_l3_ofs;
694 pkt->l4_ofs = orig_l4_ofs;
695}
696
286de272
DB
697static void
698un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
699 bool related)
700{
701 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
702 pkt->md.ct_state |= CS_DST_NAT;
703 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
704 struct ip_header *nh = dp_packet_l3(pkt);
705 packet_set_ipv4_addr(pkt, &nh->ip_dst,
cda1b109 706 conn->key.src.addr.ipv4);
286de272
DB
707 } else {
708 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
709 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
710 nh6->ip6_dst.be32,
cda1b109 711 &conn->key.src.addr.ipv6, true);
286de272 712 }
edd1bef4
DB
713
714 if (OVS_UNLIKELY(related)) {
715 reverse_nat_packet(pkt, conn);
716 } else {
286de272
DB
717 un_pat_packet(pkt, conn);
718 }
719 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
720 pkt->md.ct_state |= CS_SRC_NAT;
721 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
722 struct ip_header *nh = dp_packet_l3(pkt);
723 packet_set_ipv4_addr(pkt, &nh->ip_src,
cda1b109 724 conn->key.dst.addr.ipv4);
286de272
DB
725 } else {
726 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
727 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
728 nh6->ip6_src.be32,
cda1b109 729 &conn->key.dst.addr.ipv6, true);
286de272 730 }
edd1bef4
DB
731
732 if (OVS_UNLIKELY(related)) {
733 reverse_nat_packet(pkt, conn);
734 } else {
286de272
DB
735 un_pat_packet(pkt, conn);
736 }
737 }
738}
739
740/* Typical usage of this helper is in non per-packet code;
741 * this is because the bucket lock needs to be held for lookup
742 * and a hash would have already been needed. Hence, this function
743 * is just intended for code clarity. */
744static struct conn *
bd5e81a0 745conn_lookup(struct conntrack *ct, const struct conn_key *key, long long now)
286de272
DB
746{
747 struct conn_lookup_ctx ctx;
748 ctx.conn = NULL;
c3f6bae2 749 memcpy(&ctx.key, key, sizeof ctx.key);
286de272
DB
750 ctx.hash = conn_key_hash(key, ct->hash_basis);
751 unsigned bucket = hash_to_bucket(ctx.hash);
752 conn_key_lookup(&ct->buckets[bucket], &ctx, now);
753 return ctx.conn;
754}
755
bd5e81a0
DB
756static void
757conn_seq_skew_set(struct conntrack *ct, const struct conn_key *key,
758 long long now, int seq_skew, bool seq_skew_dir)
759{
dec0dbbc 760 unsigned bucket = hash_to_bucket(conn_key_hash(key, ct->hash_basis));
bd5e81a0
DB
761 ct_lock_lock(&ct->buckets[bucket].lock);
762 struct conn *conn = conn_lookup(ct, key, now);
763 if (conn && seq_skew) {
764 conn->seq_skew = seq_skew;
765 conn->seq_skew_dir = seq_skew_dir;
766 }
767 ct_lock_unlock(&ct->buckets[bucket].lock);
768}
769
286de272
DB
770static void
771nat_clean(struct conntrack *ct, struct conn *conn,
772 struct conntrack_bucket *ctb)
773 OVS_REQUIRES(ctb->lock)
774{
8b934ced 775 ct_rwlock_wrlock(&ct->resources_lock);
286de272 776 nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
8b934ced 777 ct_rwlock_unlock(&ct->resources_lock);
286de272 778 ct_lock_unlock(&ctb->lock);
dec0dbbc
DB
779 unsigned bucket_rev_conn =
780 hash_to_bucket(conn_key_hash(&conn->rev_key, ct->hash_basis));
286de272 781 ct_lock_lock(&ct->buckets[bucket_rev_conn].lock);
8b934ced 782 ct_rwlock_wrlock(&ct->resources_lock);
dec0dbbc 783 long long now = time_msec();
286de272 784 struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now);
286de272
DB
785 struct nat_conn_key_node *nat_conn_key_node =
786 nat_conn_keys_lookup(&ct->nat_conn_keys, &conn->rev_key,
787 ct->hash_basis);
788
789 /* In the unlikely event, rev conn was recreated, then skip
790 * rev_conn cleanup. */
791 if (rev_conn && (!nat_conn_key_node ||
5ed7a0b4
DB
792 conn_key_cmp(&nat_conn_key_node->value,
793 &rev_conn->rev_key))) {
286de272
DB
794 hmap_remove(&ct->buckets[bucket_rev_conn].connections,
795 &rev_conn->node);
796 free(rev_conn);
797 }
286de272 798
dec0dbbc 799 delete_conn(conn);
8b934ced 800 ct_rwlock_unlock(&ct->resources_lock);
286de272
DB
801 ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock);
802 ct_lock_lock(&ctb->lock);
803}
804
9e8f3960 805/* Must be called with 'CT_CONN_TYPE_DEFAULT' 'conn_type'. */
286de272
DB
806static void
807conn_clean(struct conntrack *ct, struct conn *conn,
808 struct conntrack_bucket *ctb)
809 OVS_REQUIRES(ctb->lock)
810{
9e8f3960
DB
811 ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
812
4417ca3d
DB
813 if (conn->alg) {
814 expectation_clean(ct, &conn->key, ct->hash_basis);
815 }
286de272
DB
816 ovs_list_remove(&conn->exp_node);
817 hmap_remove(&ctb->connections, &conn->node);
818 atomic_count_dec(&ct->n_conn);
819 if (conn->nat_info) {
820 nat_clean(ct, conn, ctb);
821 } else {
822 delete_conn(conn);
823 }
a489b168
DDP
824}
825
3a2a425b
DB
826static bool
827ct_verify_helper(const char *helper, enum ct_alg_ctl_type ct_alg_ctl)
828{
829 if (ct_alg_ctl == CT_ALG_CTL_NONE) {
830 return true;
831 } else if (helper) {
832 if ((ct_alg_ctl == CT_ALG_CTL_FTP) &&
833 !strncmp(helper, "ftp", strlen("ftp"))) {
834 return true;
835 } else if ((ct_alg_ctl == CT_ALG_CTL_TFTP) &&
836 !strncmp(helper, "tftp", strlen("tftp"))) {
837 return true;
838 } else {
839 return false;
840 }
841 } else {
842 return false;
843 }
844}
845
ac6abe5f 846/* This function is called with the bucket lock held. */
a489b168
DDP
847static struct conn *
848conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
286de272
DB
849 struct conn_lookup_ctx *ctx, bool commit, long long now,
850 const struct nat_action_info_t *nat_action_info,
bd5e81a0
DB
851 struct conn *conn_for_un_nat_copy,
852 const char *helper,
3a2a425b
DB
853 const struct alg_exp_node *alg_exp,
854 enum ct_alg_ctl_type ct_alg_ctl)
a489b168 855{
a489b168
DDP
856 struct conn *nc = NULL;
857
858 if (!valid_new(pkt, &ctx->key)) {
286de272 859 pkt->md.ct_state = CS_INVALID;
a489b168
DDP
860 return nc;
861 }
dec0dbbc 862
286de272 863 pkt->md.ct_state = CS_NEW;
dec0dbbc 864
bd5e81a0
DB
865 if (alg_exp) {
866 pkt->md.ct_state |= CS_RELATED;
867 }
a489b168
DDP
868
869 if (commit) {
870 unsigned int n_conn_limit;
a489b168
DDP
871 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
872
873 if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
874 COVERAGE_INC(conntrack_full);
875 return nc;
876 }
877
dec0dbbc 878 unsigned bucket = hash_to_bucket(ctx->hash);
e6ef6cc6 879 nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
286de272
DB
880 ctx->conn = nc;
881 nc->rev_key = nc->key;
882 conn_key_reverse(&nc->rev_key);
a489b168 883
3a2a425b
DB
884 if (ct_verify_helper(helper, ct_alg_ctl)) {
885 nc->alg = nullable_xstrdup(helper);
bd5e81a0
DB
886 }
887
888 if (alg_exp) {
889 nc->alg_related = true;
890 nc->mark = alg_exp->master_mark;
891 nc->label = alg_exp->master_label;
892 nc->master_key = alg_exp->master_key;
893 }
894
286de272
DB
895 if (nat_action_info) {
896 nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
a489b168 897
bd5e81a0 898 if (alg_exp) {
be38342d 899 if (alg_exp->nat_rpl_dst) {
bd5e81a0
DB
900 nc->rev_key.dst.addr = alg_exp->alg_nat_repl_addr;
901 nc->nat_info->nat_action = NAT_ACTION_SRC;
902 } else {
903 nc->rev_key.src.addr = alg_exp->alg_nat_repl_addr;
904 nc->nat_info->nat_action = NAT_ACTION_DST;
905 }
c3f6bae2 906 memcpy(conn_for_un_nat_copy, nc, sizeof *conn_for_un_nat_copy);
d8682ee5
DB
907 ct_rwlock_wrlock(&ct->resources_lock);
908 bool new_insert = nat_conn_keys_insert(&ct->nat_conn_keys,
909 conn_for_un_nat_copy,
910 ct->hash_basis);
911 ct_rwlock_unlock(&ct->resources_lock);
912 if (!new_insert) {
913 char *log_msg = xasprintf("Pre-existing alg "
914 "nat_conn_key");
915 ct_print_conn_info(conn_for_un_nat_copy, log_msg, VLL_INFO,
916 true, false);
917 free(log_msg);
918 }
bd5e81a0 919 } else {
c3f6bae2 920 memcpy(conn_for_un_nat_copy, nc, sizeof *conn_for_un_nat_copy);
bd5e81a0 921 ct_rwlock_wrlock(&ct->resources_lock);
dec0dbbc
DB
922 bool nat_res = nat_select_range_tuple(ct, nc,
923 conn_for_un_nat_copy);
286de272 924
bd5e81a0
DB
925 if (!nat_res) {
926 goto nat_res_exhaustion;
927 }
286de272 928
bd5e81a0
DB
929 /* Update nc with nat adjustments made to
930 * conn_for_un_nat_copy by nat_select_range_tuple(). */
286de272 931 *nc = *conn_for_un_nat_copy;
bd5e81a0 932 ct_rwlock_unlock(&ct->resources_lock);
286de272 933 }
bd5e81a0
DB
934 conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
935 conn_for_un_nat_copy->nat_info = NULL;
936 conn_for_un_nat_copy->alg = NULL;
dbb597d3 937 nat_packet(pkt, nc, ctx->icmp_related);
286de272 938 }
a489b168
DDP
939 hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
940 atomic_count_inc(&ct->n_conn);
941 }
bd5e81a0 942
a489b168 943 return nc;
bd5e81a0
DB
944
945 /* This would be a user error or a DOS attack.
946 * A user error is prevented by allocating enough
947 * combinations of NAT addresses when combined with
948 * ephemeral ports. A DOS attack should be protected
949 * against with firewall rules or a separate firewall.
950 * Also using zone partitioning can limit DoS impact. */
951nat_res_exhaustion:
d8c5a93b 952 ovs_list_remove(&nc->exp_node);
bd5e81a0
DB
953 delete_conn(nc);
954 /* conn_for_un_nat_copy is a local variable in process_one; this
955 * memset() serves to document that conn_for_un_nat_copy is from
956 * this point on unused. */
957 memset(conn_for_un_nat_copy, 0, sizeof *conn_for_un_nat_copy);
958 ct_rwlock_unlock(&ct->resources_lock);
959 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
960 VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
961 "if DoS attack, use firewalling and/or zone partitioning.");
962 return NULL;
a489b168
DDP
963}
964
286de272
DB
965static bool
966conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
967 struct conn_lookup_ctx *ctx, struct conn **conn,
968 long long now, unsigned bucket)
969 OVS_REQUIRES(ct->buckets[bucket].lock)
970{
971 bool create_new_conn = false;
972
dbb597d3 973 if (ctx->icmp_related) {
286de272
DB
974 pkt->md.ct_state |= CS_RELATED;
975 if (ctx->reply) {
976 pkt->md.ct_state |= CS_REPLY_DIR;
977 }
978 } else {
bd5e81a0
DB
979 if ((*conn)->alg_related) {
980 pkt->md.ct_state |= CS_RELATED;
981 }
dec0dbbc 982
286de272
DB
983 enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket],
984 pkt, ctx->reply, now);
985
986 switch (res) {
987 case CT_UPDATE_VALID:
988 pkt->md.ct_state |= CS_ESTABLISHED;
989 pkt->md.ct_state &= ~CS_NEW;
990 if (ctx->reply) {
991 pkt->md.ct_state |= CS_REPLY_DIR;
992 }
993 break;
994 case CT_UPDATE_INVALID:
995 pkt->md.ct_state = CS_INVALID;
996 break;
997 case CT_UPDATE_NEW:
998 conn_clean(ct, *conn, &ct->buckets[bucket]);
999 create_new_conn = true;
1000 break;
1001 default:
1002 OVS_NOT_REACHED();
1003 }
1004 }
1005 return create_new_conn;
1006}
1007
1008static void
1009create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
bd5e81a0 1010 long long now, bool alg_un_nat)
286de272
DB
1011{
1012 struct conn *nc = xmemdup(conn_for_un_nat_copy, sizeof *nc);
1013 nc->key = conn_for_un_nat_copy->rev_key;
1014 nc->rev_key = conn_for_un_nat_copy->key;
1015 uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis);
1016 unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash);
1017 ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock);
286de272
DB
1018 struct conn *rev_conn = conn_lookup(ct, &nc->key, now);
1019
bd5e81a0 1020 if (alg_un_nat) {
d8682ee5
DB
1021 if (!rev_conn) {
1022 hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
1023 &nc->node, un_nat_hash);
1024 } else {
1025 char *log_msg = xasprintf("Unusual condition for un_nat conn "
1026 "create for alg: rev_conn %p", rev_conn);
1027 ct_print_conn_info(nc, log_msg, VLL_INFO, true, false);
1028 free(log_msg);
1029 free(nc);
1030 }
286de272 1031 } else {
bd5e81a0
DB
1032 ct_rwlock_rdlock(&ct->resources_lock);
1033
1034 struct nat_conn_key_node *nat_conn_key_node =
1035 nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis);
1036 if (nat_conn_key_node && !conn_key_cmp(&nat_conn_key_node->value,
1037 &nc->rev_key) && !rev_conn) {
bd5e81a0
DB
1038 hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
1039 &nc->node, un_nat_hash);
1040 } else {
d8682ee5
DB
1041 char *log_msg = xasprintf("Unusual condition for un_nat conn "
1042 "create: nat_conn_key_node/rev_conn "
1043 "%p/%p", nat_conn_key_node, rev_conn);
1044 ct_print_conn_info(nc, log_msg, VLL_INFO, true, false);
1045 free(log_msg);
bd5e81a0
DB
1046 free(nc);
1047 }
1048 ct_rwlock_unlock(&ct->resources_lock);
286de272 1049 }
286de272
DB
1050 ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock);
1051}
1052
1053static void
1054handle_nat(struct dp_packet *pkt, struct conn *conn,
1055 uint16_t zone, bool reply, bool related)
1056{
1057 if (conn->nat_info &&
1058 (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
1059 (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
1060 zone != pkt->md.ct_zone))) {
bd5e81a0 1061
286de272
DB
1062 if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
1063 pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
1064 }
1065 if (reply) {
1066 un_nat_packet(pkt, conn, related);
1067 } else {
1068 nat_packet(pkt, conn, related);
1069 }
1070 }
1071}
1072
f8016041
DB
1073static bool
1074check_orig_tuple(struct conntrack *ct, struct dp_packet *pkt,
1075 struct conn_lookup_ctx *ctx_in, long long now,
1076 unsigned *bucket, struct conn **conn,
1077 const struct nat_action_info_t *nat_action_info)
1078 OVS_REQUIRES(ct->buckets[*bucket].lock)
1079{
1080 if ((ctx_in->key.dl_type == htons(ETH_TYPE_IP) &&
1081 !pkt->md.ct_orig_tuple.ipv4.ipv4_proto) ||
1082 (ctx_in->key.dl_type == htons(ETH_TYPE_IPV6) &&
1083 !pkt->md.ct_orig_tuple.ipv6.ipv6_proto) ||
1084 !(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
1085 nat_action_info) {
1086 return false;
1087 }
1088
1089 ct_lock_unlock(&ct->buckets[*bucket].lock);
1090 struct conn_lookup_ctx ctx;
1091 memset(&ctx, 0 , sizeof ctx);
1092 ctx.conn = NULL;
1093
1094 if (ctx_in->key.dl_type == htons(ETH_TYPE_IP)) {
cda1b109
DB
1095 ctx.key.src.addr.ipv4 = pkt->md.ct_orig_tuple.ipv4.ipv4_src;
1096 ctx.key.dst.addr.ipv4 = pkt->md.ct_orig_tuple.ipv4.ipv4_dst;
f8016041
DB
1097
1098 if (ctx_in->key.nw_proto == IPPROTO_ICMP) {
1099 ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
1100 ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
1101 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv4.src_port);
1102 ctx.key.src.icmp_type = (uint8_t) src_port;
1103 ctx.key.dst.icmp_type = reverse_icmp_type(ctx.key.src.icmp_type);
1104 } else {
1105 ctx.key.src.port = pkt->md.ct_orig_tuple.ipv4.src_port;
1106 ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv4.dst_port;
1107 }
1108 ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv4.ipv4_proto;
1109 } else {
cda1b109
DB
1110 ctx.key.src.addr.ipv6 = pkt->md.ct_orig_tuple.ipv6.ipv6_src;
1111 ctx.key.dst.addr.ipv6 = pkt->md.ct_orig_tuple.ipv6.ipv6_dst;
f8016041
DB
1112
1113 if (ctx_in->key.nw_proto == IPPROTO_ICMPV6) {
1114 ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
1115 ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
1116 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv6.src_port);
1117 ctx.key.src.icmp_type = (uint8_t) src_port;
1118 ctx.key.dst.icmp_type = reverse_icmp6_type(ctx.key.src.icmp_type);
1119 } else {
1120 ctx.key.src.port = pkt->md.ct_orig_tuple.ipv6.src_port;
1121 ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv6.dst_port;
1122 }
1123 ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv6.ipv6_proto;
1124 }
1125
1126 ctx.key.dl_type = ctx_in->key.dl_type;
1127 ctx.key.zone = pkt->md.ct_zone;
f8016041
DB
1128 ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
1129 *bucket = hash_to_bucket(ctx.hash);
1130 ct_lock_lock(&ct->buckets[*bucket].lock);
1131 conn_key_lookup(&ct->buckets[*bucket], &ctx, now);
1132 *conn = ctx.conn;
f8016041
DB
1133 return *conn ? true : false;
1134}
1135
bd5e81a0
DB
1136static bool
1137is_un_nat_conn_valid(const struct conn *un_nat_conn)
1138{
1139 return un_nat_conn->conn_type == CT_CONN_TYPE_UN_NAT;
1140}
1141
94e71143
DB
1142static bool
1143conn_update_state_alg(struct conntrack *ct, struct dp_packet *pkt,
1144 struct conn_lookup_ctx *ctx, struct conn *conn,
1145 const struct nat_action_info_t *nat_action_info,
1146 enum ct_alg_ctl_type ct_alg_ctl, long long now,
1147 unsigned bucket, bool *create_new_conn)
1148 OVS_REQUIRES(ct->buckets[bucket].lock)
1149{
1150 if (is_ftp_ctl(ct_alg_ctl)) {
1151 /* Keep sequence tracking in sync with the source of the
1152 * sequence skew. */
1153 if (ctx->reply != conn->seq_skew_dir) {
1154 handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1155 !!nat_action_info);
1156 *create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
1157 bucket);
1158 } else {
1159 *create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
1160 bucket);
030958a0
DB
1161
1162 if (*create_new_conn == false) {
1163 handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1164 !!nat_action_info);
1165 }
94e71143
DB
1166 }
1167 return true;
1168 }
1169 return false;
1170}
1171
286de272 1172static void
a489b168
DDP
1173process_one(struct conntrack *ct, struct dp_packet *pkt,
1174 struct conn_lookup_ctx *ctx, uint16_t zone,
286de272
DB
1175 bool force, bool commit, long long now, const uint32_t *setmark,
1176 const struct ovs_key_ct_labels *setlabel,
bd5e81a0 1177 const struct nat_action_info_t *nat_action_info,
bd7d93f8 1178 ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper)
a489b168 1179{
286de272 1180 struct conn *conn;
a489b168 1181 unsigned bucket = hash_to_bucket(ctx->hash);
286de272
DB
1182 ct_lock_lock(&ct->buckets[bucket].lock);
1183 conn_key_lookup(&ct->buckets[bucket], ctx, now);
1184 conn = ctx->conn;
a489b168 1185
a76a37ef
JR
1186 /* Delete found entry if in wrong direction. 'force' implies commit. */
1187 if (conn && force && ctx->reply) {
286de272 1188 conn_clean(ct, conn, &ct->buckets[bucket]);
a76a37ef
JR
1189 conn = NULL;
1190 }
1191
286de272
DB
1192 if (OVS_LIKELY(conn)) {
1193 if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
a489b168 1194
286de272 1195 ctx->reply = true;
a489b168 1196
286de272
DB
1197 struct conn_lookup_ctx ctx2;
1198 ctx2.conn = NULL;
1199 ctx2.key = conn->rev_key;
1200 ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis);
1201
1202 ct_lock_unlock(&ct->buckets[bucket].lock);
1203 bucket = hash_to_bucket(ctx2.hash);
1204
1205 ct_lock_lock(&ct->buckets[bucket].lock);
1206 conn_key_lookup(&ct->buckets[bucket], &ctx2, now);
1207
1208 if (ctx2.conn) {
1209 conn = ctx2.conn;
1210 } else {
1211 /* It is a race condition where conn has timed out and removed
1212 * between unlock of the rev_conn and lock of the forward conn;
1213 * nothing to do. */
1214 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
1215 ct_lock_unlock(&ct->buckets[bucket].lock);
1216 return;
a489b168
DDP
1217 }
1218 }
286de272
DB
1219 }
1220
1221 bool create_new_conn = false;
1222 struct conn conn_for_un_nat_copy;
1223 conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;
94e71143 1224
bd7d93f8
DB
1225 enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, tp_src, tp_dst,
1226 helper);
bd5e81a0 1227
286de272 1228 if (OVS_LIKELY(conn)) {
94e71143
DB
1229 if (OVS_LIKELY(!conn_update_state_alg(ct, pkt, ctx, conn,
1230 nat_action_info,
1231 ct_alg_ctl, now, bucket,
1232 &create_new_conn))) {
bd5e81a0
DB
1233 create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
1234 bucket);
1235 }
286de272 1236 if (nat_action_info && !create_new_conn) {
dbb597d3 1237 handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
286de272 1238 }
bd5e81a0 1239
dec0dbbc 1240 } else if (check_orig_tuple(ct, pkt, ctx, now, &bucket, &conn,
bd5e81a0 1241 nat_action_info)) {
dec0dbbc 1242 create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
a489b168 1243 } else {
dbb597d3 1244 if (ctx->icmp_related) {
bd5e81a0
DB
1245 /* An icmp related conn should always be found; no new
1246 connection is created based on an icmp related packet. */
286de272 1247 pkt->md.ct_state = CS_INVALID;
5c2e106b 1248 } else {
286de272 1249 create_new_conn = true;
5c2e106b 1250 }
a489b168
DDP
1251 }
1252
bd5e81a0 1253 const struct alg_exp_node *alg_exp = NULL;
96bbcbf7 1254 struct alg_exp_node alg_exp_entry;
dec0dbbc 1255
286de272 1256 if (OVS_UNLIKELY(create_new_conn)) {
bd5e81a0
DB
1257
1258 ct_rwlock_rdlock(&ct->resources_lock);
1259 alg_exp = expectation_lookup(&ct->alg_expectations, &ctx->key,
be38342d
DB
1260 ct->hash_basis,
1261 alg_src_ip_wc(ct_alg_ctl));
bd5e81a0 1262 if (alg_exp) {
c3f6bae2 1263 memcpy(&alg_exp_entry, alg_exp, sizeof alg_exp_entry);
bd5e81a0
DB
1264 alg_exp = &alg_exp_entry;
1265 }
1266 ct_rwlock_unlock(&ct->resources_lock);
1267
286de272 1268 conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
3a2a425b
DB
1269 &conn_for_un_nat_copy, helper, alg_exp,
1270 ct_alg_ctl);
286de272
DB
1271 }
1272
bd5e81a0
DB
1273 write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
1274
286de272
DB
1275 if (conn && setmark) {
1276 set_mark(pkt, conn, setmark[0], setmark[1]);
1277 }
a489b168 1278
286de272
DB
1279 if (conn && setlabel) {
1280 set_label(pkt, conn, &setlabel[0], &setlabel[1]);
1281 }
1282
bd5e81a0 1283 struct conn conn_for_expectation;
94e71143 1284 if (OVS_UNLIKELY((ct_alg_ctl != CT_ALG_CTL_NONE) && conn)) {
bd5e81a0
DB
1285 conn_for_expectation = *conn;
1286 }
1287
286de272
DB
1288 ct_lock_unlock(&ct->buckets[bucket].lock);
1289
bd5e81a0
DB
1290 if (is_un_nat_conn_valid(&conn_for_un_nat_copy)) {
1291 create_un_nat_conn(ct, &conn_for_un_nat_copy, now, !!alg_exp);
1292 }
1293
94e71143
DB
1294 handle_alg_ctl(ct, ctx, pkt, ct_alg_ctl, conn, now, !!nat_action_info,
1295 &conn_for_expectation);
a489b168
DDP
1296}
1297
1298/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
51b9a533 1299 * the packets must have the same 'dl_type' (IPv4 or IPv6) and should have
4ea96698
DB
1300 * the l3 and and l4 offset properly set. Performs fragment reassembly with
1301 * the help of ipf_preprocess_conntrack().
a489b168
DDP
1302 *
1303 * If 'commit' is true, the packets are allowed to create new entries in the
1304 * connection tables. 'setmark', if not NULL, should point to a two
1305 * elements array containing a value and a mask to set the connection mark.
1306 * 'setlabel' behaves similarly for the connection label.*/
1307int
1308conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
a76a37ef 1309 ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
66e4ad8a 1310 const uint32_t *setmark,
a489b168 1311 const struct ovs_key_ct_labels *setlabel,
bd7d93f8 1312 ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper,
94053e66
FA
1313 const struct nat_action_info_t *nat_action_info,
1314 long long now)
a489b168 1315{
4ea96698
DB
1316 ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone,
1317 ct->hash_basis);
1318
43495c45 1319 struct dp_packet *packet;
61ce32b9 1320 struct conn_lookup_ctx ctx;
a489b168 1321
e883448e 1322 DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) {
4ea96698
DB
1323 if (packet->md.ct_state == CS_INVALID
1324 || !conn_key_extract(ct, packet, dl_type, &ctx, zone)) {
43495c45
BB
1325 packet->md.ct_state = CS_INVALID;
1326 write_ct_md(packet, zone, NULL, NULL, NULL);
a489b168
DDP
1327 continue;
1328 }
94e71143 1329 process_one(ct, packet, &ctx, zone, force, commit, now, setmark,
bd7d93f8 1330 setlabel, nat_action_info, tp_src, tp_dst, helper);
a489b168
DDP
1331 }
1332
4ea96698
DB
1333 ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type);
1334
a489b168
DDP
1335 return 0;
1336}
1337
1fe178d2
EG
1338void
1339conntrack_clear(struct dp_packet *packet)
1340{
1341 /* According to pkt_metadata_init(), ct_state == 0 is enough to make all of
1342 * the conntrack fields invalid. */
1343 packet->md.ct_state = 0;
1344}
1345
a489b168
DDP
1346static void
1347set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
1348{
bd5e81a0
DB
1349 if (conn->alg_related) {
1350 pkt->md.ct_mark = conn->mark;
1351 } else {
1352 pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
1353 conn->mark = pkt->md.ct_mark;
1354 }
a489b168
DDP
1355}
1356
1357static void
1358set_label(struct dp_packet *pkt, struct conn *conn,
1359 const struct ovs_key_ct_labels *val,
1360 const struct ovs_key_ct_labels *mask)
1361{
bd5e81a0
DB
1362 if (conn->alg_related) {
1363 pkt->md.ct_label = conn->label;
1364 } else {
1365 ovs_u128 v, m;
a489b168 1366
bd5e81a0
DB
1367 memcpy(&v, val, sizeof v);
1368 memcpy(&m, mask, sizeof m);
a489b168 1369
bd5e81a0 1370 pkt->md.ct_label.u64.lo = v.u64.lo
a489b168 1371 | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
bd5e81a0 1372 pkt->md.ct_label.u64.hi = v.u64.hi
a489b168 1373 | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
bd5e81a0
DB
1374 conn->label = pkt->md.ct_label;
1375 }
a489b168 1376}
286de272 1377
a489b168 1378\f
e6ef6cc6
DDP
1379/* Delete the expired connections from 'ctb', up to 'limit'. Returns the
1380 * earliest expiration time among the remaining connections in 'ctb'. Returns
1381 * LLONG_MAX if 'ctb' is empty. The return value might be smaller than 'now',
1382 * if 'limit' is reached */
1383static long long
bd5e81a0
DB
1384sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb,
1385 long long now, size_t limit)
e6ef6cc6
DDP
1386 OVS_REQUIRES(ctb->lock)
1387{
1388 struct conn *conn, *next;
1389 long long min_expiration = LLONG_MAX;
e6ef6cc6
DDP
1390 size_t count = 0;
1391
dec0dbbc 1392 for (unsigned i = 0; i < N_CT_TM; i++) {
e6ef6cc6 1393 LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
286de272
DB
1394 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
1395 if (!conn_expired(conn, now) || count >= limit) {
1396 min_expiration = MIN(min_expiration, conn->expiration);
1397 if (count >= limit) {
1398 /* Do not check other lists. */
1399 COVERAGE_INC(conntrack_long_cleanup);
1400 return min_expiration;
1401 }
1402 break;
e6ef6cc6 1403 }
286de272
DB
1404 conn_clean(ct, conn, ctb);
1405 count++;
e6ef6cc6 1406 }
e6ef6cc6
DDP
1407 }
1408 }
e6ef6cc6
DDP
1409 return min_expiration;
1410}
1411
1412/* Cleans up old connection entries from 'ct'. Returns the time when the
1413 * next expiration might happen. The return value might be smaller than
1414 * 'now', meaning that an internal limit has been reached, and some expired
1415 * connections have not been deleted. */
1416static long long
1417conntrack_clean(struct conntrack *ct, long long now)
1418{
1419 long long next_wakeup = now + CT_TM_MIN;
1420 unsigned int n_conn_limit;
1421 size_t clean_count = 0;
e6ef6cc6
DDP
1422
1423 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
1424
dec0dbbc 1425 for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
e6ef6cc6
DDP
1426 struct conntrack_bucket *ctb = &ct->buckets[i];
1427 size_t prev_count;
1428 long long min_exp;
1429
1430 ovs_mutex_lock(&ctb->cleanup_mutex);
1431 if (ctb->next_cleanup > now) {
1432 goto next_bucket;
1433 }
1434
1435 ct_lock_lock(&ctb->lock);
1436 prev_count = hmap_count(&ctb->connections);
1437 /* If the connections are well distributed among buckets, we want to
1438 * limit to 10% of the global limit equally split among buckets. If
1439 * the bucket is busier than the others, we limit to 10% of its
1440 * current size. */
1441 min_exp = sweep_bucket(ct, ctb, now,
1442 MAX(prev_count/10, n_conn_limit/(CONNTRACK_BUCKETS*10)));
1443 clean_count += prev_count - hmap_count(&ctb->connections);
1444
1445 if (min_exp > now) {
1446 /* We call hmap_shrink() only if sweep_bucket() managed to delete
1447 * every expired connection. */
1448 hmap_shrink(&ctb->connections);
1449 }
1450
1451 ct_lock_unlock(&ctb->lock);
1452
1453 ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);
1454
1455next_bucket:
1456 next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
1457 ovs_mutex_unlock(&ctb->cleanup_mutex);
1458 }
1459
1460 VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
1461 clean_count, time_msec() - now);
1462
1463 return next_wakeup;
1464}
1465
1466/* Cleanup:
e6ef6cc6
DDP
1467 *
1468 * We must call conntrack_clean() periodically. conntrack_clean() return
1469 * value gives an hint on when the next cleanup must be done (either because
1470 * there is an actual connection that expires, or because a new connection
1471 * might be created with the minimum timeout).
1472 *
1473 * The logic below has two goals:
1474 *
6c54734e
DDP
1475 * - We want to reduce the number of wakeups and batch connection cleanup
1476 * when the load is not very high. CT_CLEAN_INTERVAL ensures that if we
1477 * are coping with the current cleanup tasks, then we wait at least
1478 * 5 seconds to do further cleanup.
e6ef6cc6 1479 *
6c54734e
DDP
1480 * - We don't want to keep the buckets locked too long, as we might prevent
1481 * traffic from flowing. CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
1482 * behind, there is at least some 200ms blocks of time when buckets will be
1483 * left alone, so the datapath can operate unhindered.
e6ef6cc6
DDP
1484 */
1485#define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
1486#define CT_CLEAN_MIN_INTERVAL 200 /* 0.2 seconds */
1487
1488static void *
1489clean_thread_main(void *f_)
1490{
1491 struct conntrack *ct = f_;
1492
1493 while (!latch_is_set(&ct->clean_thread_exit)) {
1494 long long next_wake;
1495 long long now = time_msec();
e6ef6cc6
DDP
1496 next_wake = conntrack_clean(ct, now);
1497
1498 if (next_wake < now) {
1499 poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
1500 } else {
1501 poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
1502 }
1503 latch_wait(&ct->clean_thread_exit);
1504 poll_block();
1505 }
1506
1507 return NULL;
1508}
1509\f
e917d3ee
DB
1510/* 'Data' is a pointer to the beginning of the L3 header and 'new_data' is
1511 * used to store a pointer to the first byte after the L3 header. 'Size' is
1512 * the size of the packet beyond the data pointer. */
a489b168
DDP
1513static inline bool
1514extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
1515 const char **new_data, bool validate_checksum)
1516{
e917d3ee
DB
1517 if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
1518 return false;
a489b168
DDP
1519 }
1520
dec0dbbc
DB
1521 const struct ip_header *ip = data;
1522 size_t ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
a489b168 1523
e917d3ee
DB
1524 if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
1525 return false;
1526 }
a489b168 1527
e917d3ee
DB
1528 if (OVS_UNLIKELY(size < ip_len)) {
1529 return false;
1530 }
a489b168 1531
e917d3ee
DB
1532 if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
1533 return false;
a489b168
DDP
1534 }
1535
1536 if (validate_checksum && csum(data, ip_len) != 0) {
1537 return false;
1538 }
1539
e917d3ee
DB
1540 if (new_data) {
1541 *new_data = (char *) data + ip_len;
1542 }
1543
cda1b109
DB
1544 key->src.addr.ipv4 = get_16aligned_be32(&ip->ip_src);
1545 key->dst.addr.ipv4 = get_16aligned_be32(&ip->ip_dst);
a489b168
DDP
1546 key->nw_proto = ip->ip_proto;
1547
1548 return true;
1549}
1550
e917d3ee
DB
1551/* 'Data' is a pointer to the beginning of the L3 header and 'new_data' is
1552 * used to store a pointer to the first byte after the L3 header. 'Size' is
1553 * the size of the packet beyond the data pointer. */
a489b168
DDP
1554static inline bool
1555extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
1556 const char **new_data)
1557{
1558 const struct ovs_16aligned_ip6_hdr *ip6 = data;
286de272 1559
e917d3ee
DB
1560 if (OVS_UNLIKELY(size < sizeof *ip6)) {
1561 return false;
a489b168
DDP
1562 }
1563
1564 data = ip6 + 1;
1565 size -= sizeof *ip6;
dec0dbbc
DB
1566 uint8_t nw_proto = ip6->ip6_nxt;
1567 uint8_t nw_frag = 0;
a489b168 1568
523464ab
DB
1569 const struct ovs_16aligned_ip6_frag *frag_hdr;
1570 if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, &frag_hdr)) {
a489b168
DDP
1571 return false;
1572 }
1573
a489b168
DDP
1574 if (nw_frag) {
1575 return false;
1576 }
1577
c8b1ad49
DB
1578 if (new_data) {
1579 *new_data = data;
1580 }
1581
cda1b109
DB
1582 memcpy(&key->src.addr.ipv6, &ip6->ip6_src, sizeof key->src.addr);
1583 memcpy(&key->dst.addr.ipv6, &ip6->ip6_dst, sizeof key->dst.addr);
a489b168
DDP
1584 key->nw_proto = nw_proto;
1585
1586 return true;
1587}
1588
1589static inline bool
1590checksum_valid(const struct conn_key *key, const void *data, size_t size,
1591 const void *l3)
1592{
a489b168 1593 if (key->dl_type == htons(ETH_TYPE_IP)) {
76d85771
DB
1594 uint32_t csum = packet_csum_pseudoheader(l3);
1595 return csum_finish(csum_continue(csum, data, size)) == 0;
a489b168 1596 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
76d85771 1597 return packet_csum_upperlayer6(l3, data, key->nw_proto, size) == 0;
a489b168
DDP
1598 } else {
1599 return false;
1600 }
a489b168
DDP
1601}
1602
1603static inline bool
1604check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
324459a3 1605 const void *l3, bool validate_checksum)
a489b168
DDP
1606{
1607 const struct tcp_header *tcp = data;
40225b0c
BP
1608 if (size < sizeof *tcp) {
1609 return false;
1610 }
a489b168 1611
40225b0c 1612 size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
a489b168
DDP
1613 if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
1614 return false;
1615 }
1616
324459a3 1617 return validate_checksum ? checksum_valid(key, data, size, l3) : true;
a489b168
DDP
1618}
1619
1620static inline bool
1621check_l4_udp(const struct conn_key *key, const void *data, size_t size,
324459a3 1622 const void *l3, bool validate_checksum)
a489b168
DDP
1623{
1624 const struct udp_header *udp = data;
40225b0c
BP
1625 if (size < sizeof *udp) {
1626 return false;
1627 }
a489b168 1628
40225b0c 1629 size_t udp_len = ntohs(udp->udp_len);
a489b168
DDP
1630 if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
1631 return false;
1632 }
1633
1634 /* Validation must be skipped if checksum is 0 on IPv4 packets */
1635 return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
324459a3 1636 || (validate_checksum ? checksum_valid(key, data, size, l3) : true);
a489b168
DDP
1637}
1638
/* Checks the ICMP message of 'size' bytes at 'data'.  The only check is the
 * checksum over the whole message, and it is performed only if
 * 'validate_checksum' is true; otherwise the message is always accepted. */
static inline bool
check_l4_icmp(const void *data, size_t size, bool validate_checksum)
{
    return !validate_checksum || csum(data, size) == 0;
}
1644
/* Checks the ICMPv6 message of 'size' bytes at 'data'.  The only check is the
 * checksum, computed with the L3 header at 'l3', and it is performed only if
 * 'validate_checksum' is true; otherwise the message is always accepted. */
static inline bool
check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
               const void *l3, bool validate_checksum)
{
    return !validate_checksum || checksum_valid(key, data, size, l3);
}
1651
1652static inline bool
1653extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
1654{
a489b168
DDP
1655 if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
1656 return false;
1657 }
1658
dec0dbbc 1659 const struct tcp_header *tcp = data;
a489b168
DDP
1660 key->src.port = tcp->tcp_src;
1661 key->dst.port = tcp->tcp_dst;
1662
1663 /* Port 0 is invalid */
1664 return key->src.port && key->dst.port;
1665}
1666
1667static inline bool
1668extract_l4_udp(struct conn_key *key, const void *data, size_t size)
1669{
a489b168
DDP
1670 if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
1671 return false;
1672 }
1673
dec0dbbc 1674 const struct udp_header *udp = data;
a489b168
DDP
1675 key->src.port = udp->udp_src;
1676 key->dst.port = udp->udp_dst;
1677
1678 /* Port 0 is invalid */
1679 return key->src.port && key->dst.port;
1680}
1681
/* Declared here because extract_l4_icmp() below calls extract_l4() on the
 * packet header embedded in an ICMP error message. */
static inline bool extract_l4(struct conn_key *key, const void *data,
                              size_t size, bool *related, const void *l3,
                              bool validate_checksum);
a489b168 1685
b269a122
DDP
1686static uint8_t
1687reverse_icmp_type(uint8_t type)
1688{
1689 switch (type) {
1690 case ICMP4_ECHO_REQUEST:
1691 return ICMP4_ECHO_REPLY;
1692 case ICMP4_ECHO_REPLY:
1693 return ICMP4_ECHO_REQUEST;
1694
1695 case ICMP4_TIMESTAMP:
1696 return ICMP4_TIMESTAMPREPLY;
1697 case ICMP4_TIMESTAMPREPLY:
1698 return ICMP4_TIMESTAMP;
1699
1700 case ICMP4_INFOREQUEST:
1701 return ICMP4_INFOREPLY;
1702 case ICMP4_INFOREPLY:
1703 return ICMP4_INFOREQUEST;
1704 default:
1705 OVS_NOT_REACHED();
1706 }
1707}
1708
a489b168
DDP
1709/* If 'related' is not NULL and the function is processing an ICMP
1710 * error packet, extract the l3 and l4 fields from the nested header
1711 * instead and set *related to true. If 'related' is NULL we're
1712 * already processing a nested header and no such recursion is
1713 * possible */
1714static inline int
1715extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
1716 bool *related)
1717{
a489b168
DDP
1718 if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
1719 return false;
1720 }
1721
dec0dbbc
DB
1722 const struct icmp_header *icmp = data;
1723
a489b168
DDP
1724 switch (icmp->icmp_type) {
1725 case ICMP4_ECHO_REQUEST:
1726 case ICMP4_ECHO_REPLY:
1727 case ICMP4_TIMESTAMP:
1728 case ICMP4_TIMESTAMPREPLY:
1729 case ICMP4_INFOREQUEST:
1730 case ICMP4_INFOREPLY:
b269a122
DDP
1731 if (icmp->icmp_code != 0) {
1732 return false;
1733 }
a489b168 1734 /* Separate ICMP connection: identified using id */
b269a122
DDP
1735 key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
1736 key->src.icmp_type = icmp->icmp_type;
1737 key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
a489b168
DDP
1738 break;
1739 case ICMP4_DST_UNREACH:
1740 case ICMP4_TIME_EXCEEDED:
1741 case ICMP4_PARAM_PROB:
1742 case ICMP4_SOURCEQUENCH:
1743 case ICMP4_REDIRECT: {
1744 /* ICMP packet part of another connection. We should
1745 * extract the key from embedded packet header */
1746 struct conn_key inner_key;
1747 const char *l3 = (const char *) (icmp + 1);
1748 const char *tail = (const char *) data + size;
1749 const char *l4;
a489b168
DDP
1750
1751 if (!related) {
1752 return false;
1753 }
1754
1755 memset(&inner_key, 0, sizeof inner_key);
1756 inner_key.dl_type = htons(ETH_TYPE_IP);
dec0dbbc 1757 bool ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
a489b168
DDP
1758 if (!ok) {
1759 return false;
1760 }
1761
cda1b109 1762 if (inner_key.src.addr.ipv4 != key->dst.addr.ipv4) {
a489b168
DDP
1763 return false;
1764 }
1765
1766 key->src = inner_key.src;
1767 key->dst = inner_key.dst;
1768 key->nw_proto = inner_key.nw_proto;
1769
324459a3 1770 ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
a489b168
DDP
1771 if (ok) {
1772 conn_key_reverse(key);
1773 *related = true;
1774 }
1775 return ok;
1776 }
1777 default:
1778 return false;
1779 }
1780
1781 return true;
1782}
1783
b269a122
DDP
/* Maps an ICMPv6 echo message type to the type of the message travelling in
 * the opposite direction (request <-> reply).
 *
 * Only ICMP6_ECHO_REQUEST and ICMP6_ECHO_REPLY are accepted; any other
 * 'type' reaches OVS_NOT_REACHED(). */
static uint8_t
reverse_icmp6_type(uint8_t type)
{
    if (type == ICMP6_ECHO_REQUEST) {
        return ICMP6_ECHO_REPLY;
    }
    if (type == ICMP6_ECHO_REPLY) {
        return ICMP6_ECHO_REQUEST;
    }
    OVS_NOT_REACHED();
}
1796
a489b168
DDP
1797/* If 'related' is not NULL and the function is processing an ICMP
1798 * error packet, extract the l3 and l4 fields from the nested header
1799 * instead and set *related to true. If 'related' is NULL we're
1800 * already processing a nested header and no such recursion is
1801 * possible */
1802static inline bool
1803extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
1804 bool *related)
1805{
1806 const struct icmp6_header *icmp6 = data;
1807
1808 /* All the messages that we support need at least 4 bytes after
1809 * the header */
1810 if (size < sizeof *icmp6 + 4) {
1811 return false;
1812 }
1813
1814 switch (icmp6->icmp6_type) {
1815 case ICMP6_ECHO_REQUEST:
1816 case ICMP6_ECHO_REPLY:
b269a122
DDP
1817 if (icmp6->icmp6_code != 0) {
1818 return false;
1819 }
a489b168 1820 /* Separate ICMP connection: identified using id */
b269a122
DDP
1821 key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
1822 key->src.icmp_type = icmp6->icmp6_type;
1823 key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
a489b168
DDP
1824 break;
1825 case ICMP6_DST_UNREACH:
1826 case ICMP6_PACKET_TOO_BIG:
1827 case ICMP6_TIME_EXCEEDED:
1828 case ICMP6_PARAM_PROB: {
1829 /* ICMP packet part of another connection. We should
1830 * extract the key from embedded packet header */
1831 struct conn_key inner_key;
1832 const char *l3 = (const char *) icmp6 + 8;
1833 const char *tail = (const char *) data + size;
1834 const char *l4 = NULL;
a489b168
DDP
1835
1836 if (!related) {
1837 return false;
1838 }
1839
1840 memset(&inner_key, 0, sizeof inner_key);
1841 inner_key.dl_type = htons(ETH_TYPE_IPV6);
dec0dbbc 1842 bool ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
a489b168
DDP
1843 if (!ok) {
1844 return false;
1845 }
1846
1847 /* pf doesn't do this, but it seems a good idea */
cda1b109
DB
1848 if (!ipv6_addr_equals(&inner_key.src.addr.ipv6,
1849 &key->dst.addr.ipv6)) {
a489b168
DDP
1850 return false;
1851 }
1852
1853 key->src = inner_key.src;
1854 key->dst = inner_key.dst;
1855 key->nw_proto = inner_key.nw_proto;
1856
324459a3 1857 ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
a489b168
DDP
1858 if (ok) {
1859 conn_key_reverse(key);
1860 *related = true;
1861 }
1862 return ok;
1863 }
1864 default:
1865 return false;
1866 }
1867
1868 return true;
1869}
1870
1871/* Extract l4 fields into 'key', which must already contain valid l3
1872 * members.
1873 *
1874 * If 'related' is not NULL and an ICMP error packet is being
1875 * processed, the function will extract the key from the packet nested
1401f6de 1876 * in the ICMP payload and set '*related' to true.
a489b168 1877 *
9171c635
DB
1878 * 'size' here is the layer 4 size, which can be a nested size if parsing
1879 * an ICMP or ICMP6 header.
1880 *
a489b168
DDP
1881 * If 'related' is NULL, it means that we're already parsing a header nested
1882 * in an ICMP error. In this case, we skip checksum and length validation. */
1883static inline bool
1884extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
324459a3 1885 const void *l3, bool validate_checksum)
a489b168
DDP
1886{
1887 if (key->nw_proto == IPPROTO_TCP) {
324459a3
SC
1888 return (!related || check_l4_tcp(key, data, size, l3,
1889 validate_checksum)) && extract_l4_tcp(key, data, size);
a489b168 1890 } else if (key->nw_proto == IPPROTO_UDP) {
324459a3
SC
1891 return (!related || check_l4_udp(key, data, size, l3,
1892 validate_checksum)) && extract_l4_udp(key, data, size);
a489b168
DDP
1893 } else if (key->dl_type == htons(ETH_TYPE_IP)
1894 && key->nw_proto == IPPROTO_ICMP) {
324459a3 1895 return (!related || check_l4_icmp(data, size, validate_checksum))
a489b168
DDP
1896 && extract_l4_icmp(key, data, size, related);
1897 } else if (key->dl_type == htons(ETH_TYPE_IPV6)
1898 && key->nw_proto == IPPROTO_ICMPV6) {
324459a3
SC
1899 return (!related || check_l4_icmp6(key, data, size, l3,
1900 validate_checksum)) && extract_l4_icmp6(key, data, size,
1901 related);
a489b168
DDP
1902 } else {
1903 return false;
1904 }
1905}
1906
1907static bool
66e4ad8a 1908conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
a489b168
DDP
1909 struct conn_lookup_ctx *ctx, uint16_t zone)
1910{
2482b0b0 1911 const struct eth_header *l2 = dp_packet_eth(pkt);
a489b168
DDP
1912 const struct ip_header *l3 = dp_packet_l3(pkt);
1913 const char *l4 = dp_packet_l4(pkt);
a489b168
DDP
1914
1915 memset(ctx, 0, sizeof *ctx);
1916
1917 if (!l2 || !l3 || !l4) {
1918 return false;
1919 }
1920
1921 ctx->key.zone = zone;
1922
1923 /* XXX In this function we parse the packet (again, it has already
1924 * gone through miniflow_extract()) for two reasons:
1925 *
1926 * 1) To extract the l3 addresses and l4 ports.
1927 * We already have the l3 and l4 headers' pointers. Extracting
1928 * the l3 addresses and the l4 ports is really cheap, since they
1929 * can be found at fixed locations.
66e4ad8a
DDP
1930 * 2) To extract the l4 type.
1931 * Extracting the l4 types, for IPv6 can be quite expensive, because
1932 * it's not at a fixed location.
a489b168
DDP
1933 *
1934 * Here's a way to avoid (2) with the help of the datapath.
66e4ad8a 1935 * The datapath doesn't keep the packet's extracted flow[1], so
a489b168 1936 * using that is not an option. We could use the packet's matching
66e4ad8a
DDP
1937 * megaflow, but we have to make sure that the l4 type (nw_proto)
1938 * is unwildcarded. This means either:
a489b168 1939 *
66e4ad8a
DDP
1940 * a) dpif-netdev unwildcards the l4 type when a new flow is installed
1941 * if the actions contains ct().
a489b168 1942 *
66e4ad8a
DDP
1943 * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
1944 * action. This is already done in different actions, but it's
1945 * unnecessary for the kernel.
a489b168
DDP
1946 *
1947 * ---
66e4ad8a 1948 * [1] The reasons for this are that keeping the flow increases
a489b168
DDP
1949 * (slightly) the cache footprint and increases computation
1950 * time as we move the packet around. Most importantly, the flow
1951 * should be updated by the actions and this can be slow, as
1952 * we use a sparse representation (miniflow).
1953 *
1954 */
dec0dbbc 1955 bool ok;
66e4ad8a 1956 ctx->key.dl_type = dl_type;
dec0dbbc 1957
a489b168 1958 if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
dec0dbbc 1959 bool hwol_bad_l3_csum = dp_packet_ip_checksum_bad(pkt);
324459a3
SC
1960 if (hwol_bad_l3_csum) {
1961 ok = false;
1962 } else {
dec0dbbc 1963 bool hwol_good_l3_csum = dp_packet_ip_checksum_valid(pkt);
324459a3 1964 /* Validate the checksum only when hwol is not supported. */
9171c635 1965 ok = extract_l3_ipv4(&ctx->key, l3, dp_packet_l3_size(pkt), NULL,
324459a3
SC
1966 !hwol_good_l3_csum);
1967 }
a489b168 1968 } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
9171c635 1969 ok = extract_l3_ipv6(&ctx->key, l3, dp_packet_l3_size(pkt), NULL);
a489b168
DDP
1970 } else {
1971 ok = false;
1972 }
1973
1974 if (ok) {
324459a3
SC
1975 bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt);
1976 if (!hwol_bad_l4_csum) {
1977 bool hwol_good_l4_csum = dp_packet_l4_checksum_valid(pkt);
1978 /* Validate the checksum only when hwol is not supported. */
9171c635
DB
1979 if (extract_l4(&ctx->key, l4, dp_packet_l4_size(pkt),
1980 &ctx->icmp_related, l3, !hwol_good_l4_csum)) {
324459a3
SC
1981 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
1982 return true;
1983 }
a489b168
DDP
1984 }
1985 }
1986
1987 return false;
1988}
92edd073
DB
1989
1990static uint32_t
cda1b109 1991ct_addr_hash_add(uint32_t hash, const union ct_addr *addr)
92edd073
DB
1992{
1993 BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
1994 return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
1995}
1996
1997static uint32_t
1998ct_endpoint_hash_add(uint32_t hash, const struct ct_endpoint *ep)
1999{
2000 BUILD_ASSERT_DECL(sizeof *ep % 4 == 0);
2001 return hash_add_bytes32(hash, (const uint32_t *) ep, sizeof *ep);
2002}
a489b168
DDP
2003\f
2004/* Symmetric */
2005static uint32_t
2006conn_key_hash(const struct conn_key *key, uint32_t basis)
2007{
2008 uint32_t hsrc, hdst, hash;
a489b168 2009 hsrc = hdst = basis;
6b1d4625
DB
2010 hsrc = ct_endpoint_hash_add(hsrc, &key->src);
2011 hdst = ct_endpoint_hash_add(hdst, &key->dst);
a489b168
DDP
2012
2013 /* Even if source and destination are swapped the hash will be the same. */
2014 hash = hsrc ^ hdst;
2015
2016 /* Hash the rest of the key(L3 and L4 types and zone). */
763b40b0 2017 return hash_words((uint32_t *) (&key->dst + 1),
a489b168
DDP
2018 (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
2019 hash);
a489b168
DDP
2020}
2021
2022static void
2023conn_key_reverse(struct conn_key *key)
2024{
dec0dbbc 2025 struct ct_endpoint tmp = key->src;
a489b168
DDP
2026 key->src = key->dst;
2027 key->dst = tmp;
2028}
2029
286de272 2030static uint32_t
cda1b109 2031nat_ipv6_addrs_delta(struct in6_addr *ipv6_min, struct in6_addr *ipv6_max)
286de272 2032{
cda1b109
DB
2033 uint8_t *ipv6_min_hi = &ipv6_min->s6_addr[0];
2034 uint8_t *ipv6_min_lo = &ipv6_min->s6_addr[0] + sizeof(uint64_t);
2035 uint8_t *ipv6_max_hi = &ipv6_max->s6_addr[0];
2036 uint8_t *ipv6_max_lo = &ipv6_max->s6_addr[0] + sizeof(uint64_t);
286de272
DB
2037
2038 ovs_be64 addr6_64_min_hi;
2039 ovs_be64 addr6_64_min_lo;
2040 memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
2041 memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);
2042
2043 ovs_be64 addr6_64_max_hi;
2044 ovs_be64 addr6_64_max_lo;
2045 memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
2046 memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);
2047
2048 uint64_t diff;
dec0dbbc 2049
286de272
DB
2050 if (addr6_64_min_hi == addr6_64_max_hi &&
2051 ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
2052 diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
2053 } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
2054 ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
2055 diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
2056 ntohll(addr6_64_max_lo) - 1);
2057 } else {
2058 /* Limit address delta supported to 32 bits or 4 billion approximately.
2059 * Possibly, this should be visible to the user through a datapath
2060 * support check, however the practical impact is probably nil. */
2061 diff = 0xfffffffe;
2062 }
dec0dbbc 2063
286de272
DB
2064 if (diff > 0xfffffffe) {
2065 diff = 0xfffffffe;
2066 }
2067 return diff;
2068}
2069
2070/* This function must be used in tandem with nat_ipv6_addrs_delta(), which
2071 * restricts the input parameters. */
a489b168 2072static void
cda1b109 2073nat_ipv6_addr_increment(struct in6_addr *ipv6, uint32_t increment)
286de272 2074{
cda1b109
DB
2075 uint8_t *ipv6_hi = &ipv6->s6_addr[0];
2076 uint8_t *ipv6_lo = &ipv6->s6_addr[0] + sizeof(ovs_be64);
286de272
DB
2077 ovs_be64 addr6_64_hi;
2078 ovs_be64 addr6_64_lo;
2079 memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
2080 memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);
2081
2082 if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
2083 addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
2084 } else if (addr6_64_hi != OVS_BE64_MAX) {
2085 addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
2086 addr6_64_lo = htonll(increment - (UINT64_MAX -
2087 ntohll(addr6_64_lo) + 1));
2088 } else {
2089 OVS_NOT_REACHED();
2090 }
2091
2092 memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
2093 memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
286de272
DB
2094}
2095
2096static uint32_t
2097nat_range_hash(const struct conn *conn, uint32_t basis)
2098{
2099 uint32_t hash = basis;
286de272 2100
92edd073
DB
2101 hash = ct_addr_hash_add(hash, &conn->nat_info->min_addr);
2102 hash = ct_addr_hash_add(hash, &conn->nat_info->max_addr);
2103 hash = hash_add(hash,
2104 (conn->nat_info->max_port << 16)
2105 | conn->nat_info->min_port);
92edd073
DB
2106 hash = ct_endpoint_hash_add(hash, &conn->key.src);
2107 hash = ct_endpoint_hash_add(hash, &conn->key.dst);
286de272
DB
2108 hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
2109 hash = hash_add(hash, conn->key.nw_proto);
2110 hash = hash_add(hash, conn->key.zone);
92edd073
DB
2111
2112 /* The purpose of the second parameter is to distinguish hashes of data of
2113 * different length; our data always has the same length so there is no
2114 * value in counting. */
2115 return hash_finish(hash, 0);
286de272
DB
2116}
2117
2118static bool
2119nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
2120 struct conn *nat_conn)
2121{
bd5e81a0
DB
2122 enum { MIN_NAT_EPHEMERAL_PORT = 1024,
2123 MAX_NAT_EPHEMERAL_PORT = 65535 };
286de272
DB
2124
2125 uint16_t min_port;
2126 uint16_t max_port;
2127 uint16_t first_port;
286de272
DB
2128 uint32_t hash = nat_range_hash(conn, ct->hash_basis);
2129
2130 if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
2131 (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
2132 min_port = ntohs(conn->key.src.port);
2133 max_port = ntohs(conn->key.src.port);
2134 first_port = min_port;
2135 } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
2136 (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
2137 min_port = ntohs(conn->key.dst.port);
2138 max_port = ntohs(conn->key.dst.port);
2139 first_port = min_port;
2140 } else {
2141 uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
2142 uint32_t port_index = hash % (deltap + 1);
2143 first_port = conn->nat_info->min_port + port_index;
2144 min_port = conn->nat_info->min_port;
2145 max_port = conn->nat_info->max_port;
2146 }
2147
2148 uint32_t deltaa = 0;
2149 uint32_t address_index;
cda1b109 2150 union ct_addr ct_addr;
286de272 2151 memset(&ct_addr, 0, sizeof ct_addr);
cda1b109 2152 union ct_addr max_ct_addr;
286de272
DB
2153 memset(&max_ct_addr, 0, sizeof max_ct_addr);
2154 max_ct_addr = conn->nat_info->max_addr;
2155
2156 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
cda1b109
DB
2157 deltaa = ntohl(conn->nat_info->max_addr.ipv4) -
2158 ntohl(conn->nat_info->min_addr.ipv4);
286de272 2159 address_index = hash % (deltaa + 1);
cda1b109
DB
2160 ct_addr.ipv4 = htonl(
2161 ntohl(conn->nat_info->min_addr.ipv4) + address_index);
286de272 2162 } else {
cda1b109
DB
2163 deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6,
2164 &conn->nat_info->max_addr.ipv6);
286de272
DB
2165 /* deltaa must be within 32 bits for full hash coverage. A 64 or
2166 * 128 bit hash is unnecessary and hence not used here. Most code
2167 * is kept common with V4; nat_ipv6_addrs_delta() will do the
2168 * enforcement via max_ct_addr. */
2169 max_ct_addr = conn->nat_info->min_addr;
cda1b109 2170 nat_ipv6_addr_increment(&max_ct_addr.ipv6, deltaa);
286de272 2171 address_index = hash % (deltaa + 1);
cda1b109
DB
2172 ct_addr.ipv6 = conn->nat_info->min_addr.ipv6;
2173 nat_ipv6_addr_increment(&ct_addr.ipv6, address_index);
286de272
DB
2174 }
2175
2176 uint16_t port = first_port;
2177 bool all_ports_tried = false;
32b2c81f
DB
2178 /* For DNAT or for specified port ranges, we don't use ephemeral ports. */
2179 bool ephemeral_ports_tried
2180 = conn->nat_info->nat_action & NAT_ACTION_DST ||
2181 conn->nat_info->nat_action & NAT_ACTION_SRC_PORT
2182 ? true : false;
cda1b109 2183 union ct_addr first_addr = ct_addr;
4cd0481c
DB
2184 bool pat_enabled = conn->key.nw_proto != IPPROTO_ICMP &&
2185 conn->key.nw_proto != IPPROTO_ICMPV6;
286de272
DB
2186
2187 while (true) {
2188 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
2189 nat_conn->rev_key.dst.addr = ct_addr;
286de272
DB
2190 nat_conn->rev_key.dst.port = htons(port);
2191 } else {
1c8689d7 2192 nat_conn->rev_key.src.addr = ct_addr;
286de272
DB
2193 nat_conn->rev_key.src.port = htons(port);
2194 }
2195
80cee116
DB
2196 bool new_insert = nat_conn_keys_insert(&ct->nat_conn_keys, nat_conn,
2197 ct->hash_basis);
2198 if (new_insert) {
286de272 2199 return true;
4cd0481c 2200 } else if (pat_enabled && !all_ports_tried) {
286de272
DB
2201 if (min_port == max_port) {
2202 all_ports_tried = true;
2203 } else if (port == max_port) {
2204 port = min_port;
2205 } else {
2206 port++;
2207 }
2208 if (port == first_port) {
2209 all_ports_tried = true;
2210 }
2211 } else {
2212 if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
2213 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
cda1b109 2214 ct_addr.ipv4 = htonl(ntohl(ct_addr.ipv4) + 1);
286de272 2215 } else {
cda1b109 2216 nat_ipv6_addr_increment(&ct_addr.ipv6, 1);
286de272
DB
2217 }
2218 } else {
2219 ct_addr = conn->nat_info->min_addr;
2220 }
2221 if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
4cd0481c 2222 if (pat_enabled && !ephemeral_ports_tried) {
ac04639a 2223 ephemeral_ports_tried = true;
286de272 2224 ct_addr = conn->nat_info->min_addr;
8417e688 2225 first_addr = ct_addr;
286de272
DB
2226 min_port = MIN_NAT_EPHEMERAL_PORT;
2227 max_port = MAX_NAT_EPHEMERAL_PORT;
2228 } else {
2229 break;
2230 }
2231 }
2232 first_port = min_port;
2233 port = first_port;
2234 all_ports_tried = false;
2235 }
2236 }
2237 return false;
2238}
2239
ac6abe5f 2240/* This function must be called with the ct->resources lock taken. */
286de272
DB
2241static struct nat_conn_key_node *
2242nat_conn_keys_lookup(struct hmap *nat_conn_keys,
2243 const struct conn_key *key,
2244 uint32_t basis)
2245{
2246 struct nat_conn_key_node *nat_conn_key_node;
286de272 2247
dec0dbbc
DB
2248 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node,
2249 conn_key_hash(key, basis), nat_conn_keys) {
5ed7a0b4 2250 if (!conn_key_cmp(&nat_conn_key_node->key, key)) {
286de272
DB
2251 return nat_conn_key_node;
2252 }
2253 }
2254 return NULL;
2255}
2256
80cee116
DB
2257/* This function must be called with the ct->resources lock taken. */
2258static bool
2259nat_conn_keys_insert(struct hmap *nat_conn_keys, const struct conn *nat_conn,
2260 uint32_t basis)
2261{
2262 struct nat_conn_key_node *nat_conn_key_node =
2263 nat_conn_keys_lookup(nat_conn_keys, &nat_conn->rev_key, basis);
2264
2265 if (!nat_conn_key_node) {
2266 struct nat_conn_key_node *nat_conn_key = xzalloc(sizeof *nat_conn_key);
2267 nat_conn_key->key = nat_conn->rev_key;
2268 nat_conn_key->value = nat_conn->key;
dec0dbbc
DB
2269 hmap_insert(nat_conn_keys, &nat_conn_key->node,
2270 conn_key_hash(&nat_conn_key->key, basis));
80cee116
DB
2271 return true;
2272 }
2273 return false;
2274}
2275
ac6abe5f 2276/* This function must be called with the ct->resources write lock taken. */
286de272 2277static void
bd5e81a0
DB
2278nat_conn_keys_remove(struct hmap *nat_conn_keys,
2279 const struct conn_key *key,
286de272
DB
2280 uint32_t basis)
2281{
2282 struct nat_conn_key_node *nat_conn_key_node;
286de272 2283
dec0dbbc
DB
2284 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node,
2285 conn_key_hash(key, basis), nat_conn_keys) {
5ed7a0b4 2286 if (!conn_key_cmp(&nat_conn_key_node->key, key)) {
286de272
DB
2287 hmap_remove(nat_conn_keys, &nat_conn_key_node->node);
2288 free(nat_conn_key_node);
2289 return;
2290 }
2291 }
2292}
2293
2294static void
2295conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
a489b168 2296 long long now)
ac6abe5f 2297 OVS_REQUIRES(ctb->lock)
a489b168
DDP
2298{
2299 uint32_t hash = ctx->hash;
2300 struct conn *conn;
2301
2302 ctx->conn = NULL;
2303
2304 HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
5ed7a0b4 2305 if (!conn_key_cmp(&conn->key, &ctx->key)
a489b168
DDP
2306 && !conn_expired(conn, now)) {
2307 ctx->conn = conn;
2308 ctx->reply = false;
2309 break;
2310 }
5ed7a0b4 2311 if (!conn_key_cmp(&conn->rev_key, &ctx->key)
a489b168
DDP
2312 && !conn_expired(conn, now)) {
2313 ctx->conn = conn;
2314 ctx->reply = true;
2315 break;
2316 }
2317 }
2318}
2319
2320static enum ct_update_res
e6ef6cc6
DDP
2321conn_update(struct conn *conn, struct conntrack_bucket *ctb,
2322 struct dp_packet *pkt, bool reply, long long now)
a489b168 2323{
e6ef6cc6
DDP
2324 return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
2325 reply, now);
a489b168
DDP
2326}
2327
2328static bool
2329conn_expired(struct conn *conn, long long now)
2330{
286de272
DB
2331 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
2332 return now >= conn->expiration;
2333 }
2334 return false;
a489b168
DDP
2335}
2336
2337static bool
2338valid_new(struct dp_packet *pkt, struct conn_key *key)
2339{
2340 return l4_protos[key->nw_proto]->valid_new(pkt);
2341}
2342
2343static struct conn *
e6ef6cc6
DDP
2344new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
2345 struct conn_key *key, long long now)
a489b168 2346{
dec0dbbc 2347 struct conn *newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
a489b168
DDP
2348 if (newconn) {
2349 newconn->key = *key;
2350 }
2351
2352 return newconn;
2353}
2354
2355static void
2356delete_conn(struct conn *conn)
2357{
286de272 2358 free(conn->nat_info);
bd5e81a0 2359 free(conn->alg);
a489b168
DDP
2360 free(conn);
2361}
4d4e68ed 2362\f
271e48a0
YHW
2363/* Convert a conntrack address 'a' into an IP address 'b' based on 'dl_type'.
2364 *
2365 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPv6"
2366 * in network-byte order. */
4d4e68ed 2367static void
cda1b109 2368ct_endpoint_to_ct_dpif_inet_addr(const union ct_addr *a,
4d4e68ed
DDP
2369 union ct_dpif_inet_addr *b,
2370 ovs_be16 dl_type)
2371{
2372 if (dl_type == htons(ETH_TYPE_IP)) {
cda1b109 2373 b->ip = a->ipv4;
4d4e68ed 2374 } else if (dl_type == htons(ETH_TYPE_IPV6)){
cda1b109 2375 b->in6 = a->ipv6;
4d4e68ed
DDP
2376 }
2377}
2378
271e48a0
YHW
2379/* Convert an IP address 'a' into a conntrack address 'b' based on 'dl_type'.
2380 *
2381 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPv6"
2382 * in network-byte order. */
2383static void
2384ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
cda1b109 2385 union ct_addr *b, ovs_be16 dl_type)
271e48a0
YHW
2386{
2387 if (dl_type == htons(ETH_TYPE_IP)) {
cda1b109 2388 b->ipv4 = a->ip;
271e48a0 2389 } else if (dl_type == htons(ETH_TYPE_IPV6)){
cda1b109 2390 b->ipv6 = a->in6;
271e48a0
YHW
2391 }
2392}
2393
4d4e68ed
DDP
2394static void
2395conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
2396{
2397 if (key->dl_type == htons(ETH_TYPE_IP)) {
2398 tuple->l3_type = AF_INET;
2399 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
2400 tuple->l3_type = AF_INET6;
2401 }
2402 tuple->ip_proto = key->nw_proto;
2403 ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
2404 key->dl_type);
2405 ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
2406 key->dl_type);
2407
2408 if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
b269a122
DDP
2409 tuple->icmp_id = key->src.icmp_id;
2410 tuple->icmp_type = key->src.icmp_type;
2411 tuple->icmp_code = key->src.icmp_code;
4d4e68ed
DDP
2412 } else {
2413 tuple->src_port = key->src.port;
2414 tuple->dst_port = key->dst.port;
2415 }
2416}
2417
271e48a0
YHW
2418static void
2419tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone,
2420 struct conn_key *key)
2421{
2422 if (tuple->l3_type == AF_INET) {
2423 key->dl_type = htons(ETH_TYPE_IP);
2424 } else if (tuple->l3_type == AF_INET6) {
2425 key->dl_type = htons(ETH_TYPE_IPV6);
2426 }
2427 key->nw_proto = tuple->ip_proto;
2428 ct_dpif_inet_addr_to_ct_endpoint(&tuple->src, &key->src.addr,
2429 key->dl_type);
2430 ct_dpif_inet_addr_to_ct_endpoint(&tuple->dst, &key->dst.addr,
2431 key->dl_type);
2432
2433 if (tuple->ip_proto == IPPROTO_ICMP || tuple->ip_proto == IPPROTO_ICMPV6) {
2434 key->src.icmp_id = tuple->icmp_id;
2435 key->src.icmp_type = tuple->icmp_type;
2436 key->src.icmp_code = tuple->icmp_code;
2437 key->dst.icmp_id = tuple->icmp_id;
2438 key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type);
2439 key->dst.icmp_code = tuple->icmp_code;
2440 } else {
2441 key->src.port = tuple->src_port;
2442 key->dst.port = tuple->dst_port;
2443 }
2444 key->zone = zone;
2445}
2446
4d4e68ed
DDP
2447static void
2448conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
ded30c74 2449 long long now, int bkt)
4d4e68ed 2450{
4d4e68ed
DDP
2451 memset(entry, 0, sizeof *entry);
2452 conn_key_to_tuple(&conn->key, &entry->tuple_orig);
2453 conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
2454
2455 entry->zone = conn->key.zone;
2456 entry->mark = conn->mark;
2457
286de272 2458 memcpy(&entry->labels, &conn->label, sizeof entry->labels);
4d4e68ed
DDP
2459 /* Not implemented yet */
2460 entry->timestamp.start = 0;
2461 entry->timestamp.stop = 0;
2462
dec0dbbc 2463 long long expiration = conn->expiration - now;
4d4e68ed
DDP
2464 entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
2465
dec0dbbc 2466 struct ct_l4_proto *class = l4_protos[conn->key.nw_proto];
4d4e68ed
DDP
2467 if (class->conn_get_protoinfo) {
2468 class->conn_get_protoinfo(conn, &entry->protoinfo);
2469 }
bd5e81a0 2470
ded30c74 2471 entry->bkt = bkt;
bd5e81a0
DB
2472
2473 if (conn->alg) {
2474 /* Caller is responsible for freeing. */
2475 entry->helper.name = xstrdup(conn->alg);
2476 }
4d4e68ed
DDP
2477}
2478
4ea96698
DB
2479struct ipf *
2480conntrack_ipf_ctx(struct conntrack *ct)
2481{
2482 return ct->ipf;
2483}
2484
4d4e68ed
DDP
2485int
2486conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
ded30c74 2487 const uint16_t *pzone, int *ptot_bkts)
4d4e68ed
DDP
2488{
2489 memset(dump, 0, sizeof(*dump));
dec0dbbc 2490
4d4e68ed
DDP
2491 if (pzone) {
2492 dump->zone = *pzone;
2493 dump->filter_zone = true;
2494 }
4d4e68ed 2495
dec0dbbc 2496 dump->ct = ct;
ded30c74 2497 *ptot_bkts = CONNTRACK_BUCKETS;
4d4e68ed
DDP
2498 return 0;
2499}
2500
2501int
2502conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
2503{
2504 struct conntrack *ct = dump->ct;
2505 long long now = time_msec();
2506
2507 while (dump->bucket < CONNTRACK_BUCKETS) {
2508 struct hmap_node *node;
2509
2510 ct_lock_lock(&ct->buckets[dump->bucket].lock);
2511 for (;;) {
2512 struct conn *conn;
2513
2514 node = hmap_at_position(&ct->buckets[dump->bucket].connections,
2515 &dump->bucket_pos);
2516 if (!node) {
2517 break;
2518 }
2519 INIT_CONTAINER(conn, node, node);
286de272
DB
2520 if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
2521 (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
ded30c74 2522 conn_to_ct_dpif_entry(conn, entry, now, dump->bucket);
4d4e68ed
DDP
2523 break;
2524 }
2525 /* Else continue, until we find an entry in the appropriate zone
2526 * or the bucket has been scanned completely. */
2527 }
2528 ct_lock_unlock(&ct->buckets[dump->bucket].lock);
2529
2530 if (!node) {
2531 memset(&dump->bucket_pos, 0, sizeof dump->bucket_pos);
2532 dump->bucket++;
2533 } else {
2534 return 0;
2535 }
2536 }
2537 return EOF;
2538}
2539
2540int
2541conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
2542{
2543 return 0;
2544}
5d9cbb4c
DDP
2545
2546int
2547conntrack_flush(struct conntrack *ct, const uint16_t *zone)
2548{
dec0dbbc 2549 for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
5d9cbb4c
DDP
2550 struct conn *conn, *next;
2551
2552 ct_lock_lock(&ct->buckets[i].lock);
bd5e81a0 2553 HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
286de272
DB
2554 if ((!zone || *zone == conn->key.zone) &&
2555 (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
2556 conn_clean(ct, conn, &ct->buckets[i]);
5d9cbb4c
DDP
2557 }
2558 }
2559 ct_lock_unlock(&ct->buckets[i].lock);
2560 }
bd5e81a0 2561
5d9cbb4c
DDP
2562 return 0;
2563}
bd5e81a0 2564
271e48a0
YHW
2565int
2566conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
2567 uint16_t zone)
2568{
2569 struct conn_lookup_ctx ctx;
2570 int error = 0;
2571
2572 memset(&ctx, 0, sizeof(ctx));
2573 tuple_to_conn_key(tuple, zone, &ctx.key);
2574 ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
2575 unsigned bucket = hash_to_bucket(ctx.hash);
2576
2577 ct_lock_lock(&ct->buckets[bucket].lock);
2578 conn_key_lookup(&ct->buckets[bucket], &ctx, time_msec());
a1d5eeff 2579 if (ctx.conn && ctx.conn->conn_type == CT_CONN_TYPE_DEFAULT) {
271e48a0
YHW
2580 conn_clean(ct, ctx.conn, &ct->buckets[bucket]);
2581 } else {
a1d5eeff 2582 VLOG_WARN("Must flush tuple using the original pre-NATed tuple");
271e48a0
YHW
2583 error = ENOENT;
2584 }
2585 ct_lock_unlock(&ct->buckets[bucket].lock);
2586 return error;
2587}
2588
c92339ad
DB
2589int
2590conntrack_set_maxconns(struct conntrack *ct, uint32_t maxconns)
2591{
2592 atomic_store_relaxed(&ct->n_conn_limit, maxconns);
2593 return 0;
2594}
2595
2596int
2597conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns)
2598{
2599 atomic_read_relaxed(&ct->n_conn_limit, maxconns);
2600 return 0;
2601}
2602
875075b3
DB
2603int
2604conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns)
2605{
2606 *nconns = atomic_count_get(&ct->n_conn);
2607 return 0;
2608}
2609
bd5e81a0
DB
2610/* This function must be called with the ct->resources read lock taken. */
2611static struct alg_exp_node *
be38342d
DB
2612expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
2613 uint32_t basis, bool src_ip_wc)
bd5e81a0 2614{
c3f6bae2
DB
2615 struct conn_key check_key;
2616 memcpy(&check_key, key, sizeof check_key);
bd5e81a0 2617 check_key.src.port = ALG_WC_SRC_PORT;
dec0dbbc 2618
be38342d
DB
2619 if (src_ip_wc) {
2620 memset(&check_key.src.addr, 0, sizeof check_key.src.addr);
2621 }
dec0dbbc 2622
bd5e81a0
DB
2623 struct alg_exp_node *alg_exp_node;
2624
bd5e81a0 2625 HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
dec0dbbc 2626 conn_key_hash(&check_key, basis),
bd5e81a0
DB
2627 alg_expectations) {
2628 if (!conn_key_cmp(&alg_exp_node->key, &check_key)) {
2629 return alg_exp_node;
2630 }
2631 }
2632 return NULL;
2633}
2634
4417ca3d
DB
2635/* This function must be called with the ct->resources write lock taken. */
2636static void
2637expectation_remove(struct hmap *alg_expectations,
2638 const struct conn_key *key, uint32_t basis)
2639{
2640 struct alg_exp_node *alg_exp_node;
2641
2642 HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node, conn_key_hash(key, basis),
2643 alg_expectations) {
2644 if (!conn_key_cmp(&alg_exp_node->key, key)) {
2645 hmap_remove(alg_expectations, &alg_exp_node->node);
2646 break;
2647 }
2648 }
2649}
2650
2651/* This function must be called with the ct->resources read lock taken. */
2652static struct alg_exp_node *
2653expectation_ref_lookup_unique(const struct hindex *alg_expectation_refs,
2654 const struct conn_key *master_key,
2655 const struct conn_key *alg_exp_key,
2656 uint32_t basis)
2657{
2658 struct alg_exp_node *alg_exp_node;
2659
2660 HINDEX_FOR_EACH_WITH_HASH (alg_exp_node, node_ref,
2661 conn_key_hash(master_key, basis),
2662 alg_expectation_refs) {
2663 if (!conn_key_cmp(&alg_exp_node->master_key, master_key) &&
2664 !conn_key_cmp(&alg_exp_node->key, alg_exp_key)) {
2665 return alg_exp_node;
2666 }
2667 }
2668 return NULL;
2669}
2670
2671/* This function must be called with the ct->resources write lock taken. */
2672static void
2673expectation_ref_create(struct hindex *alg_expectation_refs,
2674 struct alg_exp_node *alg_exp_node,
2675 uint32_t basis)
2676{
2677 if (!expectation_ref_lookup_unique(alg_expectation_refs,
2678 &alg_exp_node->master_key,
2679 &alg_exp_node->key, basis)) {
2680 hindex_insert(alg_expectation_refs, &alg_exp_node->node_ref,
2681 conn_key_hash(&alg_exp_node->master_key, basis));
2682 }
2683}
2684
2685static void
2686expectation_clean(struct conntrack *ct, const struct conn_key *master_key,
2687 uint32_t basis)
2688{
2689 ct_rwlock_wrlock(&ct->resources_lock);
2690
2691 struct alg_exp_node *node, *next;
2692 HINDEX_FOR_EACH_WITH_HASH_SAFE (node, next, node_ref,
2693 conn_key_hash(master_key, basis),
2694 &ct->alg_expectation_refs) {
2695 if (!conn_key_cmp(&node->master_key, master_key)) {
2696 expectation_remove(&ct->alg_expectations, &node->key, basis);
2697 hindex_remove(&ct->alg_expectation_refs, &node->node_ref);
2698 free(node);
2699 }
2700 }
2701
2702 ct_rwlock_unlock(&ct->resources_lock);
2703}
2704
bd5e81a0 2705static void
be38342d
DB
2706expectation_create(struct conntrack *ct, ovs_be16 dst_port,
2707 const struct conn *master_conn, bool reply, bool src_ip_wc,
2708 bool skip_nat)
bd5e81a0 2709{
cda1b109
DB
2710 union ct_addr src_addr;
2711 union ct_addr dst_addr;
2712 union ct_addr alg_nat_repl_addr;
be38342d 2713 struct alg_exp_node *alg_exp_node = xzalloc(sizeof *alg_exp_node);
bd5e81a0 2714
be38342d 2715 if (reply) {
bd5e81a0
DB
2716 src_addr = master_conn->key.src.addr;
2717 dst_addr = master_conn->key.dst.addr;
efa29a89 2718 alg_exp_node->nat_rpl_dst = true;
be38342d
DB
2719 if (skip_nat) {
2720 alg_nat_repl_addr = dst_addr;
efa29a89
DM
2721 } else if (master_conn->nat_info &&
2722 master_conn->nat_info->nat_action & NAT_ACTION_DST) {
2723 alg_nat_repl_addr = master_conn->rev_key.src.addr;
2724 alg_exp_node->nat_rpl_dst = false;
be38342d
DB
2725 } else {
2726 alg_nat_repl_addr = master_conn->rev_key.dst.addr;
2727 }
be38342d
DB
2728 } else {
2729 src_addr = master_conn->rev_key.src.addr;
2730 dst_addr = master_conn->rev_key.dst.addr;
efa29a89 2731 alg_exp_node->nat_rpl_dst = false;
be38342d
DB
2732 if (skip_nat) {
2733 alg_nat_repl_addr = src_addr;
efa29a89
DM
2734 } else if (master_conn->nat_info &&
2735 master_conn->nat_info->nat_action & NAT_ACTION_DST) {
2736 alg_nat_repl_addr = master_conn->key.dst.addr;
2737 alg_exp_node->nat_rpl_dst = true;
be38342d
DB
2738 } else {
2739 alg_nat_repl_addr = master_conn->key.src.addr;
2740 }
be38342d
DB
2741 }
2742 if (src_ip_wc) {
2743 memset(&src_addr, 0, sizeof src_addr);
bd5e81a0
DB
2744 }
2745
bd5e81a0
DB
2746 alg_exp_node->key.dl_type = master_conn->key.dl_type;
2747 alg_exp_node->key.nw_proto = master_conn->key.nw_proto;
2748 alg_exp_node->key.zone = master_conn->key.zone;
2749 alg_exp_node->key.src.addr = src_addr;
2750 alg_exp_node->key.dst.addr = dst_addr;
2751 alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
2752 alg_exp_node->key.dst.port = dst_port;
2753 alg_exp_node->master_mark = master_conn->mark;
2754 alg_exp_node->master_label = master_conn->label;
2755 alg_exp_node->master_key = master_conn->key;
bd5e81a0
DB
2756 /* Take the write lock here because it is almost 100%
2757 * likely that the lookup will fail and
2758 * expectation_create() will be called below. */
2759 ct_rwlock_wrlock(&ct->resources_lock);
2760 struct alg_exp_node *alg_exp = expectation_lookup(
be38342d 2761 &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis, src_ip_wc);
bd5e81a0
DB
2762 if (alg_exp) {
2763 free(alg_exp_node);
2764 ct_rwlock_unlock(&ct->resources_lock);
2765 return;
2766 }
2767
2768 alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
4417ca3d 2769 hmap_insert(&ct->alg_expectations, &alg_exp_node->node,
dec0dbbc 2770 conn_key_hash(&alg_exp_node->key, ct->hash_basis));
4417ca3d
DB
2771 expectation_ref_create(&ct->alg_expectation_refs, alg_exp_node,
2772 ct->hash_basis);
bd5e81a0
DB
2773 ct_rwlock_unlock(&ct->resources_lock);
2774}
2775
bd5e81a0
DB
/* Replaces the first 'substr_size' bytes at 'substr' with the
 * 'rep_str_size' bytes at 'rep_str', shifting the bytes that follow so that
 * they stay directly behind the replacement.
 *
 * 'total_size' is the number of bytes from 'substr' to the end of the data,
 * i.e. it includes the 'substr_size' bytes being replaced; it must not be
 * smaller than 'substr_size'.  The caller must make sure the buffer has room
 * for 'total_size - substr_size + rep_str_size' bytes starting at 'substr'.
 *
 * All sizes are size_t: callers pass packet lengths, which do not fit in
 * eight bits. */
static void
replace_substring(char *substr, size_t substr_size,
                  size_t total_size, const char *rep_str,
                  size_t rep_str_size)
{
    /* The regions overlap, hence memmove() for the tail. */
    memmove(substr + rep_str_size, substr + substr_size,
            total_size - substr_size);
    memcpy(substr, rep_str, rep_str_size);
}
2785
cd7c99a6
DB
/* Overwrites every occurrence of 'c1' in the NUL-terminated string 'str'
 * with 'c2', in place. */
static void
repl_bytes(char *str, char c1, char c2)
{
    for (char *p = str; *p != '\0'; p++) {
        if (*p == c1) {
            *p = c2;
        }
    }
}
2796
/* Substitutes the 'size' bytes at 'pkt_str', which points into the data of
 * 'pkt', with the 'repl_size' bytes at 'repl_str', moving everything between
 * the replaced bytes and the packet tail accordingly.  The packet size is
 * then set to 'orig_used_size' adjusted by the difference between
 * 'repl_size' and 'size'. */
static void
modify_packet(struct dp_packet *pkt, char *pkt_str, size_t size,
              char *repl_str, size_t repl_size,
              uint32_t orig_used_size)
{
    const char *pkt_end = dp_packet_tail(pkt);
    size_t bytes_to_tail = pkt_end - pkt_str;

    replace_substring(pkt_str, size, bytes_to_tail, repl_str, repl_size);
    dp_packet_set_size(pkt, orig_used_size + (int) repl_size - (int) size);
}
2807
bd5e81a0
DB
2808/* Replace IPV4 address in FTP message with NATed address. */
2809static int
2810repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
2811 char *ftp_data_start,
cd7c99a6
DB
2812 size_t addr_offset_from_ftp_data_start,
2813 size_t addr_size OVS_UNUSED)
bd5e81a0
DB
2814{
2815 enum { MAX_FTP_V4_NAT_DELTA = 8 };
2816
2817 /* Do conservative check for pathological MTU usage. */
2818 uint32_t orig_used_size = dp_packet_size(pkt);
cd7c99a6
DB
2819 if (orig_used_size + MAX_FTP_V4_NAT_DELTA >
2820 dp_packet_get_allocated(pkt)) {
2821
bd5e81a0 2822 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
cd7c99a6
DB
2823 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP V4",
2824 dp_packet_get_allocated(pkt));
bd5e81a0
DB
2825 return 0;
2826 }
2827
cd7c99a6
DB
2828 char v4_addr_str[INET_ADDRSTRLEN] = {0};
2829 ovs_assert(inet_ntop(AF_INET, &v4_addr_rep, v4_addr_str,
2830 sizeof v4_addr_str));
2831 repl_bytes(v4_addr_str, '.', ',');
2832 modify_packet(pkt, ftp_data_start + addr_offset_from_ftp_data_start,
2833 addr_size, v4_addr_str, strlen(v4_addr_str),
2834 orig_used_size);
2835 return (int) strlen(v4_addr_str) - (int) addr_size;
bd5e81a0
DB
2836}
2837
/* Returns a pointer to the first decimal digit in the NUL-terminated string
 * 'str', or to its terminating NUL if it contains no digit. */
static char *
skip_non_digits(char *str)
{
    /* The cast keeps isdigit() defined for bytes >= 0x80, which are negative
     * when plain char is signed. */
    while (*str != 0 && !isdigit((unsigned char) *str)) {
        str++;
    }
    return str;
}
2846
/* Advances over the decimal digits at the start of 'str', writes a NUL over
 * the character at which it stops, and returns a pointer to that NUL.  The
 * return value is never NULL.
 *
 * At most 'max_digits' + 1 digits are consumed.  NOTE(review): the extra
 * digit looks deliberate -- a number that is one digit too long stays intact
 * and can be rejected by the range checks the callers in this file apply,
 * instead of being silently cut down to a valid value. */
static char *
terminate_number_str(char *str, uint8_t max_digits)
{
    uint8_t digits_found = 0;

    /* The cast keeps isdigit() defined for bytes >= 0x80, which are negative
     * when plain char is signed. */
    while (isdigit((unsigned char) *str) && digits_found <= max_digits) {
        str++;
        digits_found++;
    }

    *str = 0;
    return str;
}
2859
2860
2861static void
2862get_ftp_ctl_msg(struct dp_packet *pkt, char *ftp_msg)
2863{
2864 struct tcp_header *th = dp_packet_l4(pkt);
2865 char *tcp_hdr = (char *) th;
2866 uint32_t tcp_payload_len = tcp_payload_length(pkt);
2867 size_t tcp_payload_of_interest = MIN(tcp_payload_len,
2868 LARGEST_FTP_MSG_OF_INTEREST);
2869 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2870
2871 ovs_strlcpy(ftp_msg, tcp_hdr + tcp_hdr_len,
2872 tcp_payload_of_interest);
2873}
2874
2875static enum ftp_ctl_pkt
2876detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
2877 struct dp_packet *pkt)
2878{
bd5e81a0
DB
2879 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2880 get_ftp_ctl_msg(pkt, ftp_msg);
dec0dbbc 2881
bd5e81a0
DB
2882 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
2883 if (strncasecmp(ftp_msg, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD)) &&
2884 !strcasestr(ftp_msg, FTP_EPSV_REPLY)) {
2885 return CT_FTP_CTL_OTHER;
2886 }
2887 } else {
2888 if (strncasecmp(ftp_msg, FTP_PORT_CMD, strlen(FTP_PORT_CMD)) &&
2889 strncasecmp(ftp_msg, FTP_PASV_REPLY_CODE,
2890 strlen(FTP_PASV_REPLY_CODE))) {
2891 return CT_FTP_CTL_OTHER;
2892 }
2893 }
2894
2895 return CT_FTP_CTL_INTEREST;
2896}
2897
2898static enum ftp_ctl_pkt
2899process_ftp_ctl_v4(struct conntrack *ct,
2900 struct dp_packet *pkt,
2901 const struct conn *conn_for_expectation,
4417ca3d 2902 ovs_be32 *v4_addr_rep,
bd5e81a0 2903 char **ftp_data_v4_start,
cd7c99a6
DB
2904 size_t *addr_offset_from_ftp_data_start,
2905 size_t *addr_size)
bd5e81a0
DB
2906{
2907 struct tcp_header *th = dp_packet_l4(pkt);
2908 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2909 char *tcp_hdr = (char *) th;
2910 *ftp_data_v4_start = tcp_hdr + tcp_hdr_len;
2911 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2912 get_ftp_ctl_msg(pkt, ftp_msg);
bd5e81a0
DB
2913 char *ftp = ftp_msg;
2914 enum ct_alg_mode mode;
dec0dbbc 2915
23bea975 2916 if (!strncasecmp(ftp, FTP_PORT_CMD, strlen(FTP_PORT_CMD))) {
bd5e81a0
DB
2917 ftp = ftp_msg + strlen(FTP_PORT_CMD);
2918 mode = CT_FTP_MODE_ACTIVE;
2919 } else {
2920 ftp = ftp_msg + strlen(FTP_PASV_REPLY_CODE);
2921 mode = CT_FTP_MODE_PASSIVE;
2922 }
2923
2924 /* Find first space. */
2925 ftp = strchr(ftp, ' ');
2926 if (!ftp) {
2927 return CT_FTP_CTL_INVALID;
2928 }
2929
2930 /* Find the first digit, after space. */
2931 ftp = skip_non_digits(ftp);
2932 if (*ftp == 0) {
2933 return CT_FTP_CTL_INVALID;
2934 }
2935
2936 char *ip_addr_start = ftp;
2937 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
bd5e81a0 2938
dec0dbbc 2939 uint8_t comma_count = 0;
bd5e81a0
DB
2940 while (comma_count < 4 && *ftp) {
2941 if (*ftp == ',') {
2942 comma_count++;
2943 if (comma_count == 4) {
2944 *ftp = 0;
2945 } else {
2946 *ftp = '.';
2947 }
2948 }
2949 ftp++;
2950 }
2951 if (comma_count != 4) {
2952 return CT_FTP_CTL_INVALID;
2953 }
2954
2955 struct in_addr ip_addr;
2956 int rc2 = inet_pton(AF_INET, ip_addr_start, &ip_addr);
2957 if (rc2 != 1) {
2958 return CT_FTP_CTL_INVALID;
2959 }
2960
cd7c99a6 2961 *addr_size = ftp - ip_addr_start - 1;
bd5e81a0
DB
2962 char *save_ftp = ftp;
2963 ftp = terminate_number_str(ftp, MAX_FTP_PORT_DGTS);
2964 if (!ftp) {
2965 return CT_FTP_CTL_INVALID;
2966 }
2967 int value;
2968 if (!str_to_int(save_ftp, 10, &value)) {
2969 return CT_FTP_CTL_INVALID;
2970 }
2971
2972 /* This is derived from the L4 port maximum is 65535. */
2973 if (value > 255) {
2974 return CT_FTP_CTL_INVALID;
2975 }
2976
2977 uint16_t port_hs = value;
2978 port_hs <<= 8;
2979
2980 /* Skip over comma. */
2981 ftp++;
2982 save_ftp = ftp;
2983 bool digit_found = false;
2984 while (isdigit(*ftp)) {
2985 ftp++;
2986 digit_found = true;
2987 }
2988 if (!digit_found) {
2989 return CT_FTP_CTL_INVALID;
2990 }
2991 *ftp = 0;
2992 if (!str_to_int(save_ftp, 10, &value)) {
2993 return CT_FTP_CTL_INVALID;
2994 }
2995
2996 if (value > 255) {
2997 return CT_FTP_CTL_INVALID;
2998 }
2999
78a0b272 3000 port_hs |= value;
bd5e81a0
DB
3001 ovs_be16 port = htons(port_hs);
3002 ovs_be32 conn_ipv4_addr;
3003
3004 switch (mode) {
3005 case CT_FTP_MODE_ACTIVE:
cda1b109
DB
3006 *v4_addr_rep = conn_for_expectation->rev_key.dst.addr.ipv4;
3007 conn_ipv4_addr = conn_for_expectation->key.src.addr.ipv4;
bd5e81a0
DB
3008 break;
3009 case CT_FTP_MODE_PASSIVE:
cda1b109
DB
3010 *v4_addr_rep = conn_for_expectation->key.dst.addr.ipv4;
3011 conn_ipv4_addr = conn_for_expectation->rev_key.src.addr.ipv4;
bd5e81a0 3012 break;
7be77cb0 3013 case CT_TFTP_MODE:
bd5e81a0
DB
3014 default:
3015 OVS_NOT_REACHED();
3016 }
3017
3018 ovs_be32 ftp_ipv4_addr;
3019 ftp_ipv4_addr = ip_addr.s_addr;
3020 /* Although most servers will block this exploit, there may be some
3021 * less well managed. */
3022 if (ftp_ipv4_addr != conn_ipv4_addr && ftp_ipv4_addr != *v4_addr_rep) {
3023 return CT_FTP_CTL_INVALID;
3024 }
3025
be38342d
DB
3026 expectation_create(ct, port, conn_for_expectation,
3027 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
bd5e81a0
DB
3028 return CT_FTP_CTL_INTEREST;
3029}
3030
/* Returns a pointer to the first character of 'str' that cannot be part of
 * a textual IPv6 address, i.e. that is neither a hexadecimal digit nor ':'
 * nor '.'.  This may be the terminating NUL. */
static char *
skip_ipv6_digits(char *str)
{
    /* The cast keeps isxdigit() defined for bytes >= 0x80, which are
     * negative when plain char is signed. */
    while (isxdigit((unsigned char) *str) || *str == ':' || *str == '.') {
        str++;
    }
    return str;
}
3039
3040static enum ftp_ctl_pkt
3041process_ftp_ctl_v6(struct conntrack *ct,
3042 struct dp_packet *pkt,
3043 const struct conn *conn_for_expectation,
cda1b109 3044 union ct_addr *v6_addr_rep, char **ftp_data_start,
bd5e81a0
DB
3045 size_t *addr_offset_from_ftp_data_start,
3046 size_t *addr_size, enum ct_alg_mode *mode)
3047{
3048 struct tcp_header *th = dp_packet_l4(pkt);
3049 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
3050 char *tcp_hdr = (char *) th;
3051 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
bd5e81a0
DB
3052 get_ftp_ctl_msg(pkt, ftp_msg);
3053 *ftp_data_start = tcp_hdr + tcp_hdr_len;
bd5e81a0
DB
3054 char *ftp = ftp_msg;
3055 struct in6_addr ip6_addr;
dec0dbbc 3056
23bea975 3057 if (!strncasecmp(ftp, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD))) {
bd5e81a0
DB
3058 ftp = ftp_msg + strlen(FTP_EPRT_CMD);
3059 ftp = skip_non_digits(ftp);
3060 if (*ftp != FTP_AF_V6 || isdigit(ftp[1])) {
3061 return CT_FTP_CTL_INVALID;
3062 }
3063 /* Jump over delimiter. */
3064 ftp += 2;
3065
bd5e81a0 3066 memset(&ip6_addr, 0, sizeof ip6_addr);
dec0dbbc 3067 char *ip_addr_start = ftp;
bd5e81a0
DB
3068 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
3069 ftp = skip_ipv6_digits(ftp);
3070 *ftp = 0;
3071 *addr_size = ftp - ip_addr_start;
3072 int rc2 = inet_pton(AF_INET6, ip_addr_start, &ip6_addr);
3073 if (rc2 != 1) {
3074 return CT_FTP_CTL_INVALID;
3075 }
3076 ftp++;
3077 *mode = CT_FTP_MODE_ACTIVE;
3078 } else {
3079 ftp = ftp_msg + strcspn(ftp_msg, "(");
3080 ftp = skip_non_digits(ftp);
3081 if (!isdigit(*ftp)) {
3082 return CT_FTP_CTL_INVALID;
3083 }
3084
3085 /* Not used for passive mode. */
3086 *addr_offset_from_ftp_data_start = 0;
3087 *addr_size = 0;
3088
3089 *mode = CT_FTP_MODE_PASSIVE;
3090 }
3091
3092 char *save_ftp = ftp;
3093 ftp = terminate_number_str(ftp, MAX_EXT_FTP_PORT_DGTS);
3094 if (!ftp) {
3095 return CT_FTP_CTL_INVALID;
3096 }
dec0dbbc 3097
bd5e81a0
DB
3098 int value;
3099 if (!str_to_int(save_ftp, 10, &value)) {
3100 return CT_FTP_CTL_INVALID;
3101 }
3102 if (value > CT_MAX_L4_PORT) {
3103 return CT_FTP_CTL_INVALID;
3104 }
3105
3106 uint16_t port_hs = value;
3107 ovs_be16 port = htons(port_hs);
3108
3109 switch (*mode) {
3110 case CT_FTP_MODE_ACTIVE:
3111 *v6_addr_rep = conn_for_expectation->rev_key.dst.addr;
3112 /* Although most servers will block this exploit, there may be some
3113 * less well managed. */
cda1b109
DB
3114 if (memcmp(&ip6_addr, &v6_addr_rep->ipv6, sizeof ip6_addr) &&
3115 memcmp(&ip6_addr, &conn_for_expectation->key.src.addr.ipv6,
bd5e81a0
DB
3116 sizeof ip6_addr)) {
3117 return CT_FTP_CTL_INVALID;
3118 }
3119 break;
3120 case CT_FTP_MODE_PASSIVE:
3121 *v6_addr_rep = conn_for_expectation->key.dst.addr;
3122 break;
7be77cb0 3123 case CT_TFTP_MODE:
bd5e81a0
DB
3124 default:
3125 OVS_NOT_REACHED();
3126 }
3127
be38342d
DB
3128 expectation_create(ct, port, conn_for_expectation,
3129 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
bd5e81a0
DB
3130 return CT_FTP_CTL_INTEREST;
3131}
3132
3133static int
cda1b109 3134repl_ftp_v6_addr(struct dp_packet *pkt, union ct_addr v6_addr_rep,
bd5e81a0
DB
3135 char *ftp_data_start,
3136 size_t addr_offset_from_ftp_data_start,
3137 size_t addr_size, enum ct_alg_mode mode)
3138{
3139 /* This is slightly bigger than really possible. */
3140 enum { MAX_FTP_V6_NAT_DELTA = 45 };
3141
3142 if (mode == CT_FTP_MODE_PASSIVE) {
3143 return 0;
3144 }
3145
3146 /* Do conservative check for pathological MTU usage. */
3147 uint32_t orig_used_size = dp_packet_size(pkt);
cd7c99a6
DB
3148 if (orig_used_size + MAX_FTP_V6_NAT_DELTA >
3149 dp_packet_get_allocated(pkt)) {
3150
bd5e81a0 3151 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
cd7c99a6
DB
3152 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP V6",
3153 dp_packet_get_allocated(pkt));
bd5e81a0
DB
3154 return 0;
3155 }
3156
298530b8 3157 char v6_addr_str[INET6_ADDRSTRLEN] = {0};
cda1b109 3158 ovs_assert(inet_ntop(AF_INET6, &v6_addr_rep.ipv6, v6_addr_str,
298530b8 3159 sizeof v6_addr_str));
cd7c99a6
DB
3160 modify_packet(pkt, ftp_data_start + addr_offset_from_ftp_data_start,
3161 addr_size, v6_addr_str, strlen(v6_addr_str),
3162 orig_used_size);
3163 return (int) strlen(v6_addr_str) - (int) addr_size;
bd5e81a0
DB
3164}
3165
d13d7115
DB
3166/* Increment/decrement a TCP sequence number. */
3167static void
3168adj_seqnum(ovs_16aligned_be32 *val, int32_t inc)
3169{
3170 put_16aligned_be32(val, htonl(ntohl(get_16aligned_be32(val)) + inc));
3171}
3172
bd5e81a0
DB
3173static void
3174handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
253e4dc0
DM
3175 struct dp_packet *pkt, const struct conn *ec, long long now,
3176 enum ftp_ctl_pkt ftp_ctl, bool nat)
bd5e81a0
DB
3177{
3178 struct ip_header *l3_hdr = dp_packet_l3(pkt);
3179 ovs_be32 v4_addr_rep = 0;
cda1b109 3180 union ct_addr v6_addr_rep;
faa0826d 3181 size_t addr_offset_from_ftp_data_start = 0;
bd5e81a0
DB
3182 size_t addr_size = 0;
3183 char *ftp_data_start;
bd5e81a0
DB
3184 enum ct_alg_mode mode = CT_FTP_MODE_ACTIVE;
3185
3186 if (detect_ftp_ctl_type(ctx, pkt) != ftp_ctl) {
3187 return;
3188 }
3189
bd5e81a0
DB
3190 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
3191 int64_t seq_skew = 0;
dec0dbbc 3192
253e4dc0 3193 if (ftp_ctl == CT_FTP_CTL_INTEREST) {
bd5e81a0
DB
3194 enum ftp_ctl_pkt rc;
3195 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
253e4dc0 3196 rc = process_ftp_ctl_v6(ct, pkt, ec,
4417ca3d 3197 &v6_addr_rep, &ftp_data_start,
bd5e81a0
DB
3198 &addr_offset_from_ftp_data_start,
3199 &addr_size, &mode);
3200 } else {
253e4dc0 3201 rc = process_ftp_ctl_v4(ct, pkt, ec,
4417ca3d 3202 &v4_addr_rep, &ftp_data_start,
cd7c99a6
DB
3203 &addr_offset_from_ftp_data_start,
3204 &addr_size);
bd5e81a0
DB
3205 }
3206 if (rc == CT_FTP_CTL_INVALID) {
3207 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3208 VLOG_WARN_RL(&rl, "Invalid FTP control packet format");
3209 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
3210 return;
3211 } else if (rc == CT_FTP_CTL_INTEREST) {
3212 uint16_t ip_len;
dec0dbbc 3213
bd5e81a0 3214 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
253e4dc0
DM
3215 if (nat) {
3216 seq_skew = repl_ftp_v6_addr(pkt, v6_addr_rep,
3217 ftp_data_start,
3218 addr_offset_from_ftp_data_start,
3219 addr_size, mode);
3220 }
3221
bd5e81a0 3222 if (seq_skew) {
253e4dc0
DM
3223 ip_len = ntohs(nh6->ip6_ctlun.ip6_un1.ip6_un1_plen) +
3224 seq_skew;
bd5e81a0 3225 nh6->ip6_ctlun.ip6_un1.ip6_un1_plen = htons(ip_len);
bd5e81a0
DB
3226 }
3227 } else {
253e4dc0
DM
3228 if (nat) {
3229 seq_skew = repl_ftp_v4_addr(pkt, v4_addr_rep,
3230 ftp_data_start,
cd7c99a6
DB
3231 addr_offset_from_ftp_data_start,
3232 addr_size);
253e4dc0 3233 }
bd5e81a0 3234 if (seq_skew) {
253e4dc0 3235 ip_len = ntohs(l3_hdr->ip_tot_len) + seq_skew;
bd5e81a0
DB
3236 l3_hdr->ip_csum = recalc_csum16(l3_hdr->ip_csum,
3237 l3_hdr->ip_tot_len, htons(ip_len));
3238 l3_hdr->ip_tot_len = htons(ip_len);
bd5e81a0
DB
3239 }
3240 }
3241 } else {
3242 OVS_NOT_REACHED();
3243 }
bd5e81a0
DB
3244 }
3245
3246 struct tcp_header *th = dp_packet_l4(pkt);
dec0dbbc 3247
253e4dc0 3248 if (nat && ec->seq_skew != 0) {
d13d7115
DB
3249 ctx->reply != ec->seq_skew_dir ?
3250 adj_seqnum(&th->tcp_ack, -ec->seq_skew) :
3251 adj_seqnum(&th->tcp_seq, ec->seq_skew);
bd5e81a0
DB
3252 }
3253
bd5e81a0 3254 th->tcp_csum = 0;
bd5e81a0 3255 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
76d85771
DB
3256 th->tcp_csum = packet_csum_upperlayer6(nh6, th, ctx->key.nw_proto,
3257 dp_packet_l4_size(pkt));
bd5e81a0 3258 } else {
76d85771
DB
3259 uint32_t tcp_csum = packet_csum_pseudoheader(l3_hdr);
3260 th->tcp_csum = csum_finish(
3261 csum_continue(tcp_csum, th, dp_packet_l4_size(pkt)));
bd5e81a0 3262 }
253e4dc0
DM
3263
3264 if (seq_skew) {
3265 conn_seq_skew_set(ct, &ec->key, now, seq_skew + ec->seq_skew,
3266 ctx->reply);
3267 }
bd5e81a0 3268}
7be77cb0
DB
3269
3270static void
3271handle_tftp_ctl(struct conntrack *ct,
94e71143 3272 const struct conn_lookup_ctx *ctx OVS_UNUSED,
be38342d 3273 struct dp_packet *pkt,
7be77cb0 3274 const struct conn *conn_for_expectation,
4417ca3d
DB
3275 long long now OVS_UNUSED,
3276 enum ftp_ctl_pkt ftp_ctl OVS_UNUSED, bool nat OVS_UNUSED)
7be77cb0 3277{
be38342d
DB
3278 expectation_create(ct, conn_for_expectation->key.src.port,
3279 conn_for_expectation,
3280 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
7be77cb0 3281}