]> git.proxmox.com Git - mirror_ovs.git/blame - lib/conntrack.c
conntrack: Remove redundant call to 'hash_finish()'.
[mirror_ovs.git] / lib / conntrack.c
CommitLineData
a489b168 1/*
4ea96698 2 * Copyright (c) 2015-2019 Nicira, Inc.
a489b168
DDP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
bd5e81a0 18#include <ctype.h>
a489b168 19#include <errno.h>
ff6aa424 20#include <sys/types.h>
a489b168
DDP
21#include <netinet/in.h>
22#include <netinet/icmp6.h>
bd5e81a0 23#include <string.h>
a489b168
DDP
24
25#include "bitmap.h"
bd5e81a0 26#include "conntrack.h"
a489b168
DDP
27#include "conntrack-private.h"
28#include "coverage.h"
29#include "csum.h"
4d4e68ed 30#include "ct-dpif.h"
a489b168
DDP
31#include "dp-packet.h"
32#include "flow.h"
4ea96698 33#include "ipf.h"
a489b168
DDP
34#include "netdev.h"
35#include "odp-netlink.h"
36#include "openvswitch/hmap.h"
37#include "openvswitch/vlog.h"
38#include "ovs-rcu.h"
e6ef6cc6 39#include "ovs-thread.h"
fd016ae3 40#include "openvswitch/poll-loop.h"
a489b168
DDP
41#include "random.h"
42#include "timeval.h"
43
44VLOG_DEFINE_THIS_MODULE(conntrack);
45
46COVERAGE_DEFINE(conntrack_full);
e6ef6cc6 47COVERAGE_DEFINE(conntrack_long_cleanup);
a489b168
DDP
48
49struct conn_lookup_ctx {
50 struct conn_key key;
51 struct conn *conn;
52 uint32_t hash;
53 bool reply;
dbb597d3 54 bool icmp_related;
a489b168
DDP
55};
56
bd5e81a0
DB
57enum ftp_ctl_pkt {
58 /* Control packets with address and/or port specifiers. */
59 CT_FTP_CTL_INTEREST,
60 /* Control packets without address and/or port specifiers. */
61 CT_FTP_CTL_OTHER,
62 CT_FTP_CTL_INVALID,
63};
64
65enum ct_alg_mode {
66 CT_FTP_MODE_ACTIVE,
67 CT_FTP_MODE_PASSIVE,
7be77cb0 68 CT_TFTP_MODE,
bd5e81a0
DB
69};
70
94e71143
DB
71enum ct_alg_ctl_type {
72 CT_ALG_CTL_NONE,
73 CT_ALG_CTL_FTP,
74 CT_ALG_CTL_TFTP,
be38342d
DB
75 /* SIP is not enabled through Openflow and presently only used as
76 * an example of an alg that allows a wildcard src ip. */
77 CT_ALG_CTL_SIP,
94e71143
DB
78};
79
a489b168 80static bool conn_key_extract(struct conntrack *, struct dp_packet *,
66e4ad8a
DDP
81 ovs_be16 dl_type, struct conn_lookup_ctx *,
82 uint16_t zone);
a489b168
DDP
83static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
84static void conn_key_reverse(struct conn_key *);
85static void conn_key_lookup(struct conntrack_bucket *ctb,
86 struct conn_lookup_ctx *ctx,
87 long long now);
88static bool valid_new(struct dp_packet *pkt, struct conn_key *);
e6ef6cc6
DDP
89static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
90 struct conn_key *, long long now);
a489b168 91static void delete_conn(struct conn *);
e6ef6cc6
DDP
92static enum ct_update_res conn_update(struct conn *,
93 struct conntrack_bucket *ctb,
94 struct dp_packet *, bool reply,
95 long long now);
a489b168
DDP
96static bool conn_expired(struct conn *, long long now);
97static void set_mark(struct dp_packet *, struct conn *,
98 uint32_t val, uint32_t mask);
99static void set_label(struct dp_packet *, struct conn *,
100 const struct ovs_key_ct_labels *val,
101 const struct ovs_key_ct_labels *mask);
e6ef6cc6 102static void *clean_thread_main(void *f_);
a489b168 103
286de272
DB
104static struct nat_conn_key_node *
105nat_conn_keys_lookup(struct hmap *nat_conn_keys,
106 const struct conn_key *key,
107 uint32_t basis);
108
80cee116
DB
109static bool
110nat_conn_keys_insert(struct hmap *nat_conn_keys,
111 const struct conn *nat_conn,
112 uint32_t hash_basis);
113
286de272
DB
114static void
115nat_conn_keys_remove(struct hmap *nat_conn_keys,
116 const struct conn_key *key,
117 uint32_t basis);
118
119static bool
120nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
121 struct conn *nat_conn);
122
123static uint8_t
124reverse_icmp_type(uint8_t type);
125static uint8_t
126reverse_icmp6_type(uint8_t type);
127static inline bool
128extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
129 const char **new_data, bool validate_checksum);
130static inline bool
131extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
132 const char **new_data);
bd5e81a0 133static struct alg_exp_node *
be38342d
DB
134expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
135 uint32_t basis, bool src_ip_wc);
bd5e81a0
DB
136
137static int
138repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
139 char *ftp_data_v4_start,
cd7c99a6 140 size_t addr_offset_from_ftp_data_start, size_t addr_size);
bd5e81a0
DB
141
142static enum ftp_ctl_pkt
143process_ftp_ctl_v4(struct conntrack *ct,
144 struct dp_packet *pkt,
145 const struct conn *conn_for_expectation,
4417ca3d 146 ovs_be32 *v4_addr_rep,
bd5e81a0 147 char **ftp_data_v4_start,
cd7c99a6
DB
148 size_t *addr_offset_from_ftp_data_start,
149 size_t *addr_size);
bd5e81a0
DB
150
151static enum ftp_ctl_pkt
152detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
153 struct dp_packet *pkt);
154
4417ca3d
DB
155static void
156expectation_clean(struct conntrack *ct, const struct conn_key *master_key,
157 uint32_t basis);
158
94e71143
DB
159static struct ct_l4_proto *l4_protos[] = {
160 [IPPROTO_TCP] = &ct_proto_tcp,
161 [IPPROTO_UDP] = &ct_proto_other,
162 [IPPROTO_ICMP] = &ct_proto_icmp4,
163 [IPPROTO_ICMPV6] = &ct_proto_icmp6,
164};
165
bd5e81a0
DB
166static void
167handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
168 struct dp_packet *pkt,
169 const struct conn *conn_for_expectation,
170 long long now, enum ftp_ctl_pkt ftp_ctl, bool nat);
171
7be77cb0
DB
172static void
173handle_tftp_ctl(struct conntrack *ct,
94e71143 174 const struct conn_lookup_ctx *ctx OVS_UNUSED,
be38342d 175 struct dp_packet *pkt,
7be77cb0 176 const struct conn *conn_for_expectation,
4417ca3d
DB
177 long long now OVS_UNUSED,
178 enum ftp_ctl_pkt ftp_ctl OVS_UNUSED, bool nat OVS_UNUSED);
94e71143
DB
179
180typedef void (*alg_helper)(struct conntrack *ct,
181 const struct conn_lookup_ctx *ctx,
182 struct dp_packet *pkt,
183 const struct conn *conn_for_expectation,
184 long long now, enum ftp_ctl_pkt ftp_ctl,
185 bool nat);
186
187static alg_helper alg_helpers[] = {
188 [CT_ALG_CTL_NONE] = NULL,
189 [CT_ALG_CTL_FTP] = handle_ftp_ctl,
190 [CT_ALG_CTL_TFTP] = handle_tftp_ctl,
a489b168
DDP
191};
192
193long long ct_timeout_val[] = {
194#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
195 CT_TIMEOUTS
196#undef CT_TIMEOUT
197};
198
bd5e81a0
DB
199/* The maximum TCP or UDP port number. */
200#define CT_MAX_L4_PORT 65535
bd5e81a0
DB
201/* String buffer used for parsing FTP string messages.
202 * This is sized about twice what is needed to leave some
203 * margin of error. */
204#define LARGEST_FTP_MSG_OF_INTEREST 128
205/* FTP port string used in active mode. */
206#define FTP_PORT_CMD "PORT"
207/* FTP pasv string used in passive mode. */
208#define FTP_PASV_REPLY_CODE "227"
209/* Maximum decimal digits for port in FTP command.
210 * The port is represented as two 3 digit numbers with the
211 * high part a multiple of 256. */
212#define MAX_FTP_PORT_DGTS 3
213
214/* FTP extension EPRT string used for active mode. */
215#define FTP_EPRT_CMD "EPRT"
216/* FTP extension EPSV string used for passive mode. */
217#define FTP_EPSV_REPLY "EXTENDED PASSIVE"
218/* Maximum decimal digits for port in FTP extended command. */
219#define MAX_EXT_FTP_PORT_DGTS 5
220/* FTP extended command code for IPv6. */
221#define FTP_AF_V6 '2'
222/* Used to indicate a wildcard L4 source port number for ALGs.
223 * This is used for port numbers that we cannot predict in
224 * expectations. */
225#define ALG_WC_SRC_PORT 0
226
a489b168 227/* If the total number of connections goes above this value, no new connections
286de272 228 * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
a489b168
DDP
229#define DEFAULT_N_CONN_LIMIT 3000000
230
5ed7a0b4
DB
231/* Does a member by member comparison of two conn_keys; this
232 * function must be kept in sync with struct conn_key; returns 0
233 * if the keys are equal or 1 if the keys are not equal. */
234static int
235conn_key_cmp(const struct conn_key *key1, const struct conn_key *key2)
236{
237 if (!memcmp(&key1->src.addr, &key2->src.addr, sizeof key1->src.addr) &&
238 !memcmp(&key1->dst.addr, &key2->dst.addr, sizeof key1->dst.addr) &&
239 (key1->src.icmp_id == key2->src.icmp_id) &&
240 (key1->src.icmp_type == key2->src.icmp_type) &&
241 (key1->src.icmp_code == key2->src.icmp_code) &&
242 (key1->dst.icmp_id == key2->dst.icmp_id) &&
243 (key1->dst.icmp_type == key2->dst.icmp_type) &&
244 (key1->dst.icmp_code == key2->dst.icmp_code) &&
245 (key1->dl_type == key2->dl_type) &&
246 (key1->zone == key2->zone) &&
247 (key1->nw_proto == key2->nw_proto)) {
248
249 return 0;
250 }
251 return 1;
252}
253
d8682ee5 254static void
dec0dbbc
DB
255ct_print_conn_info(const struct conn *c, const char *log_msg,
256 enum vlog_level vll, bool force, bool rl_on)
66f400f5
DB
257{
258#define CT_VLOG(RL_ON, LEVEL, ...) \
259 do { \
260 if (RL_ON) { \
261 static struct vlog_rate_limit rl_ = VLOG_RATE_LIMIT_INIT(5, 5); \
262 vlog_rate_limit(&this_module, LEVEL, &rl_, __VA_ARGS__); \
263 } else { \
264 vlog(&this_module, LEVEL, __VA_ARGS__); \
265 } \
266 } while (0)
267
268 if (OVS_UNLIKELY(force || vlog_is_enabled(&this_module, vll))) {
269 if (c->key.dl_type == htons(ETH_TYPE_IP)) {
270 CT_VLOG(rl_on, vll, "%s: src ip "IP_FMT" dst ip "IP_FMT" rev src "
271 "ip "IP_FMT" rev dst ip "IP_FMT" src/dst ports "
272 "%"PRIu16"/%"PRIu16" rev src/dst ports "
273 "%"PRIu16"/%"PRIu16" zone/rev zone "
274 "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
275 "%"PRIu8"/%"PRIu8, log_msg,
276 IP_ARGS(c->key.src.addr.ipv4_aligned),
277 IP_ARGS(c->key.dst.addr.ipv4_aligned),
278 IP_ARGS(c->rev_key.src.addr.ipv4_aligned),
279 IP_ARGS(c->rev_key.dst.addr.ipv4_aligned),
280 ntohs(c->key.src.port), ntohs(c->key.dst.port),
281 ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
282 c->key.zone, c->rev_key.zone, c->key.nw_proto,
283 c->rev_key.nw_proto);
284 } else {
285 char ip6_s[INET6_ADDRSTRLEN];
286 inet_ntop(AF_INET6, &c->key.src.addr.ipv6, ip6_s, sizeof ip6_s);
287 char ip6_d[INET6_ADDRSTRLEN];
288 inet_ntop(AF_INET6, &c->key.dst.addr.ipv6, ip6_d, sizeof ip6_d);
289 char ip6_rs[INET6_ADDRSTRLEN];
290 inet_ntop(AF_INET6, &c->rev_key.src.addr.ipv6, ip6_rs,
291 sizeof ip6_rs);
292 char ip6_rd[INET6_ADDRSTRLEN];
293 inet_ntop(AF_INET6, &c->rev_key.dst.addr.ipv6, ip6_rd,
294 sizeof ip6_rd);
295
296 CT_VLOG(rl_on, vll, "%s: src ip %s dst ip %s rev src ip %s"
297 " rev dst ip %s src/dst ports %"PRIu16"/%"PRIu16
298 " rev src/dst ports %"PRIu16"/%"PRIu16" zone/rev zone "
299 "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
300 "%"PRIu8"/%"PRIu8, log_msg, ip6_s, ip6_d, ip6_rs,
301 ip6_rd, ntohs(c->key.src.port), ntohs(c->key.dst.port),
302 ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
303 c->key.zone, c->rev_key.zone, c->key.nw_proto,
304 c->rev_key.nw_proto);
305 }
306 }
307}
308
a489b168
DDP
309/* Initializes the connection tracker 'ct'. The caller is responsible for
310 * calling 'conntrack_destroy()', when the instance is not needed anymore */
311void
312conntrack_init(struct conntrack *ct)
313{
e6ef6cc6 314 long long now = time_msec();
a489b168 315
8b934ced
DB
316 ct_rwlock_init(&ct->resources_lock);
317 ct_rwlock_wrlock(&ct->resources_lock);
286de272 318 hmap_init(&ct->nat_conn_keys);
bd5e81a0 319 hmap_init(&ct->alg_expectations);
4417ca3d 320 hindex_init(&ct->alg_expectation_refs);
bd5e81a0 321 ovs_list_init(&ct->alg_exp_list);
8b934ced 322 ct_rwlock_unlock(&ct->resources_lock);
286de272 323
dec0dbbc 324 for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
a489b168
DDP
325 struct conntrack_bucket *ctb = &ct->buckets[i];
326
327 ct_lock_init(&ctb->lock);
328 ct_lock_lock(&ctb->lock);
329 hmap_init(&ctb->connections);
dec0dbbc 330 for (unsigned j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
e6ef6cc6
DDP
331 ovs_list_init(&ctb->exp_lists[j]);
332 }
a489b168 333 ct_lock_unlock(&ctb->lock);
e6ef6cc6
DDP
334 ovs_mutex_init(&ctb->cleanup_mutex);
335 ovs_mutex_lock(&ctb->cleanup_mutex);
336 ctb->next_cleanup = now + CT_TM_MIN;
337 ovs_mutex_unlock(&ctb->cleanup_mutex);
a489b168
DDP
338 }
339 ct->hash_basis = random_uint32();
340 atomic_count_init(&ct->n_conn, 0);
341 atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
e6ef6cc6
DDP
342 latch_init(&ct->clean_thread_exit);
343 ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
4ea96698 344 ct->ipf = ipf_init();
a489b168
DDP
345}
346
347/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
348void
349conntrack_destroy(struct conntrack *ct)
350{
e6ef6cc6
DDP
351 latch_set(&ct->clean_thread_exit);
352 pthread_join(ct->clean_thread, NULL);
353 latch_destroy(&ct->clean_thread_exit);
dec0dbbc 354 for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
a489b168
DDP
355 struct conntrack_bucket *ctb = &ct->buckets[i];
356 struct conn *conn;
357
e6ef6cc6 358 ovs_mutex_destroy(&ctb->cleanup_mutex);
a489b168 359 ct_lock_lock(&ctb->lock);
bd5e81a0 360 HMAP_FOR_EACH_POP (conn, node, &ctb->connections) {
286de272
DB
361 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
362 atomic_count_dec(&ct->n_conn);
363 }
a489b168
DDP
364 delete_conn(conn);
365 }
366 hmap_destroy(&ctb->connections);
367 ct_lock_unlock(&ctb->lock);
368 ct_lock_destroy(&ctb->lock);
369 }
8b934ced 370 ct_rwlock_wrlock(&ct->resources_lock);
286de272
DB
371 struct nat_conn_key_node *nat_conn_key_node;
372 HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
373 free(nat_conn_key_node);
374 }
375 hmap_destroy(&ct->nat_conn_keys);
bd5e81a0
DB
376
377 struct alg_exp_node *alg_exp_node;
378 HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {
379 free(alg_exp_node);
380 }
4417ca3d 381
bd5e81a0
DB
382 ovs_list_poison(&ct->alg_exp_list);
383 hmap_destroy(&ct->alg_expectations);
4417ca3d 384 hindex_destroy(&ct->alg_expectation_refs);
8b934ced
DB
385 ct_rwlock_unlock(&ct->resources_lock);
386 ct_rwlock_destroy(&ct->resources_lock);
4ea96698 387 ipf_destroy(ct->ipf);
a489b168
DDP
388}
389\f
390static unsigned hash_to_bucket(uint32_t hash)
391{
392 /* Extracts the most significant bits in hash. The least significant bits
393 * are already used internally by the hmap implementation. */
394 BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);
395
396 return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
397}
398
399static void
286de272 400write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
bd5e81a0 401 const struct conn_key *key, const struct alg_exp_node *alg_exp)
a489b168 402{
286de272 403 pkt->md.ct_state |= CS_TRACKED;
a489b168 404 pkt->md.ct_zone = zone;
daf4d3c1
JR
405 pkt->md.ct_mark = conn ? conn->mark : 0;
406 pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;
407
408 /* Use the original direction tuple if we have it. */
409 if (conn) {
bd5e81a0
DB
410 if (conn->alg_related) {
411 key = &conn->master_key;
412 } else {
413 key = &conn->key;
414 }
415 } else if (alg_exp) {
416 pkt->md.ct_mark = alg_exp->master_mark;
417 pkt->md.ct_label = alg_exp->master_label;
418 key = &alg_exp->master_key;
daf4d3c1 419 }
dec0dbbc 420
daf4d3c1 421 pkt->md.ct_orig_tuple_ipv6 = false;
dec0dbbc 422
daf4d3c1
JR
423 if (key) {
424 if (key->dl_type == htons(ETH_TYPE_IP)) {
425 pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
426 key->src.addr.ipv4_aligned,
427 key->dst.addr.ipv4_aligned,
428 key->nw_proto != IPPROTO_ICMP
429 ? key->src.port : htons(key->src.icmp_type),
430 key->nw_proto != IPPROTO_ICMP
431 ? key->dst.port : htons(key->src.icmp_code),
432 key->nw_proto,
433 };
286de272 434 } else {
daf4d3c1
JR
435 pkt->md.ct_orig_tuple_ipv6 = true;
436 pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
437 key->src.addr.ipv6_aligned,
438 key->dst.addr.ipv6_aligned,
439 key->nw_proto != IPPROTO_ICMPV6
440 ? key->src.port : htons(key->src.icmp_type),
441 key->nw_proto != IPPROTO_ICMPV6
442 ? key->dst.port : htons(key->src.icmp_code),
443 key->nw_proto,
444 };
445 }
446 } else {
447 memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
448 }
bd5e81a0
DB
449}
450
451static uint8_t
452get_ip_proto(const struct dp_packet *pkt)
453{
454 uint8_t ip_proto;
455 struct eth_header *l2 = dp_packet_eth(pkt);
456 if (l2->eth_type == htons(ETH_TYPE_IPV6)) {
457 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
458 ip_proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
459 } else {
460 struct ip_header *l3_hdr = dp_packet_l3(pkt);
461 ip_proto = l3_hdr->ip_proto;
462 }
286de272 463
bd5e81a0
DB
464 return ip_proto;
465}
466
467static bool
94e71143 468is_ftp_ctl(const enum ct_alg_ctl_type ct_alg_ctl)
bd5e81a0 469{
94e71143 470 return ct_alg_ctl == CT_ALG_CTL_FTP;
bd5e81a0
DB
471}
472
94e71143 473static enum ct_alg_ctl_type
bd7d93f8
DB
474get_alg_ctl_type(const struct dp_packet *pkt, ovs_be16 tp_src, ovs_be16 tp_dst,
475 const char *helper)
7be77cb0 476{
94e71143
DB
477 /* CT_IPPORT_FTP/TFTP is used because IPPORT_FTP/TFTP in not defined
478 * in OSX, at least in in.h. Since these values will never change, remove
7be77cb0 479 * the external dependency. */
94e71143
DB
480 enum { CT_IPPORT_FTP = 21 };
481 enum { CT_IPPORT_TFTP = 69 };
bd7d93f8
DB
482 uint8_t ip_proto = get_ip_proto(pkt);
483 struct udp_header *uh = dp_packet_l4(pkt);
484 struct tcp_header *th = dp_packet_l4(pkt);
485 ovs_be16 ftp_src_port = htons(CT_IPPORT_FTP);
486 ovs_be16 ftp_dst_port = htons(CT_IPPORT_FTP);
487 ovs_be16 tftp_dst_port = htons(CT_IPPORT_TFTP);
488
489 if (OVS_UNLIKELY(tp_dst)) {
490 if (helper && !strncmp(helper, "ftp", strlen("ftp"))) {
491 ftp_dst_port = tp_dst;
492 } else if (helper && !strncmp(helper, "tftp", strlen("tftp"))) {
493 tftp_dst_port = tp_dst;
494 }
495 } else if (OVS_UNLIKELY(tp_src)) {
496 if (helper && !strncmp(helper, "ftp", strlen("ftp"))) {
497 ftp_src_port = tp_src;
498 }
499 }
7be77cb0 500
bd7d93f8 501 if (ip_proto == IPPROTO_UDP && uh->udp_dst == tftp_dst_port) {
94e71143
DB
502 return CT_ALG_CTL_TFTP;
503 } else if (ip_proto == IPPROTO_TCP &&
bd7d93f8 504 (th->tcp_src == ftp_src_port || th->tcp_dst == ftp_dst_port)) {
94e71143
DB
505 return CT_ALG_CTL_FTP;
506 }
507 return CT_ALG_CTL_NONE;
508}
509
be38342d
DB
510static bool
511alg_src_ip_wc(enum ct_alg_ctl_type alg_ctl_type)
512{
513 if (alg_ctl_type == CT_ALG_CTL_SIP) {
514 return true;
515 }
516 return false;
517}
518
94e71143
DB
519static void
520handle_alg_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
521 struct dp_packet *pkt, enum ct_alg_ctl_type ct_alg_ctl,
522 const struct conn *conn, long long now, bool nat,
523 const struct conn *conn_for_expectation)
524{
525 /* ALG control packet handling with expectation creation. */
3a2a425b 526 if (OVS_UNLIKELY(alg_helpers[ct_alg_ctl] && conn && conn->alg)) {
94e71143
DB
527 alg_helpers[ct_alg_ctl](ct, ctx, pkt, conn_for_expectation, now,
528 CT_FTP_CTL_INTEREST, nat);
529 }
7be77cb0
DB
530}
531
286de272
DB
532static void
533pat_packet(struct dp_packet *pkt, const struct conn *conn)
534{
535 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
536 if (conn->key.nw_proto == IPPROTO_TCP) {
537 struct tcp_header *th = dp_packet_l4(pkt);
538 packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
539 } else if (conn->key.nw_proto == IPPROTO_UDP) {
540 struct udp_header *uh = dp_packet_l4(pkt);
541 packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
542 }
543 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
544 if (conn->key.nw_proto == IPPROTO_TCP) {
545 struct tcp_header *th = dp_packet_l4(pkt);
546 packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
547 } else if (conn->key.nw_proto == IPPROTO_UDP) {
548 struct udp_header *uh = dp_packet_l4(pkt);
549 packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
550 }
551 }
552}
553
554static void
555nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
556{
557 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
558 pkt->md.ct_state |= CS_SRC_NAT;
559 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
560 struct ip_header *nh = dp_packet_l3(pkt);
561 packet_set_ipv4_addr(pkt, &nh->ip_src,
562 conn->rev_key.dst.addr.ipv4_aligned);
563 } else {
564 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
565 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
566 nh6->ip6_src.be32,
567 &conn->rev_key.dst.addr.ipv6_aligned,
568 true);
569 }
570 if (!related) {
571 pat_packet(pkt, conn);
572 }
573 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
574 pkt->md.ct_state |= CS_DST_NAT;
575 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
576 struct ip_header *nh = dp_packet_l3(pkt);
577 packet_set_ipv4_addr(pkt, &nh->ip_dst,
578 conn->rev_key.src.addr.ipv4_aligned);
579 } else {
580 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
581 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
582 nh6->ip6_dst.be32,
583 &conn->rev_key.src.addr.ipv6_aligned,
584 true);
585 }
586 if (!related) {
587 pat_packet(pkt, conn);
588 }
589 }
590}
591
592static void
593un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
594{
595 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
596 if (conn->key.nw_proto == IPPROTO_TCP) {
597 struct tcp_header *th = dp_packet_l4(pkt);
598 packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
599 } else if (conn->key.nw_proto == IPPROTO_UDP) {
600 struct udp_header *uh = dp_packet_l4(pkt);
601 packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
602 }
603 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
604 if (conn->key.nw_proto == IPPROTO_TCP) {
605 struct tcp_header *th = dp_packet_l4(pkt);
606 packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
607 } else if (conn->key.nw_proto == IPPROTO_UDP) {
608 struct udp_header *uh = dp_packet_l4(pkt);
609 packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
610 }
611 }
612}
613
edd1bef4
DB
614static void
615reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
616{
617 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
618 if (conn->key.nw_proto == IPPROTO_TCP) {
619 struct tcp_header *th_in = dp_packet_l4(pkt);
620 packet_set_tcp_port(pkt, conn->key.src.port,
621 th_in->tcp_dst);
622 } else if (conn->key.nw_proto == IPPROTO_UDP) {
623 struct udp_header *uh_in = dp_packet_l4(pkt);
624 packet_set_udp_port(pkt, conn->key.src.port,
625 uh_in->udp_dst);
626 }
627 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
628 if (conn->key.nw_proto == IPPROTO_TCP) {
629 struct tcp_header *th_in = dp_packet_l4(pkt);
630 packet_set_tcp_port(pkt, th_in->tcp_src,
631 conn->key.dst.port);
632 } else if (conn->key.nw_proto == IPPROTO_UDP) {
633 struct udp_header *uh_in = dp_packet_l4(pkt);
634 packet_set_udp_port(pkt, uh_in->udp_src,
635 conn->key.dst.port);
636 }
637 }
638}
639
640static void
641reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
642{
643 char *tail = dp_packet_tail(pkt);
644 char pad = dp_packet_l2_pad_size(pkt);
645 struct conn_key inner_key;
646 const char *inner_l4 = NULL;
647 uint16_t orig_l3_ofs = pkt->l3_ofs;
648 uint16_t orig_l4_ofs = pkt->l4_ofs;
649
650 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
651 struct ip_header *nh = dp_packet_l3(pkt);
652 struct icmp_header *icmp = dp_packet_l4(pkt);
653 struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
bd5e81a0
DB
654 extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3) - pad,
655 &inner_l4, false);
edd1bef4
DB
656 pkt->l3_ofs += (char *) inner_l3 - (char *) nh;
657 pkt->l4_ofs += inner_l4 - (char *) icmp;
658
659 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
660 packet_set_ipv4_addr(pkt, &inner_l3->ip_src,
661 conn->key.src.addr.ipv4_aligned);
662 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
663 packet_set_ipv4_addr(pkt, &inner_l3->ip_dst,
664 conn->key.dst.addr.ipv4_aligned);
665 }
dec0dbbc 666
edd1bef4
DB
667 reverse_pat_packet(pkt, conn);
668 icmp->icmp_csum = 0;
669 icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
670 } else {
671 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
672 struct icmp6_error_header *icmp6 = dp_packet_l4(pkt);
673 struct ovs_16aligned_ip6_hdr *inner_l3_6 =
674 (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1);
675 extract_l3_ipv6(&inner_key, inner_l3_6,
676 tail - ((char *)inner_l3_6) - pad,
677 &inner_l4);
678 pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6;
679 pkt->l4_ofs += inner_l4 - (char *) icmp6;
680
681 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
682 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
683 inner_l3_6->ip6_src.be32,
684 &conn->key.src.addr.ipv6_aligned,
685 true);
686 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
687 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
688 inner_l3_6->ip6_dst.be32,
689 &conn->key.dst.addr.ipv6_aligned,
690 true);
691 }
692 reverse_pat_packet(pkt, conn);
693 uint32_t icmp6_csum = packet_csum_pseudoheader6(nh6);
694 icmp6->icmp6_base.icmp6_cksum = 0;
695 icmp6->icmp6_base.icmp6_cksum = csum_finish(
696 csum_continue(icmp6_csum, icmp6, tail - (char *) icmp6 - pad));
697 }
698 pkt->l3_ofs = orig_l3_ofs;
699 pkt->l4_ofs = orig_l4_ofs;
700}
701
286de272
DB
702static void
703un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
704 bool related)
705{
706 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
707 pkt->md.ct_state |= CS_DST_NAT;
708 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
709 struct ip_header *nh = dp_packet_l3(pkt);
710 packet_set_ipv4_addr(pkt, &nh->ip_dst,
711 conn->key.src.addr.ipv4_aligned);
712 } else {
713 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
714 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
715 nh6->ip6_dst.be32,
716 &conn->key.src.addr.ipv6_aligned, true);
717 }
edd1bef4
DB
718
719 if (OVS_UNLIKELY(related)) {
720 reverse_nat_packet(pkt, conn);
721 } else {
286de272
DB
722 un_pat_packet(pkt, conn);
723 }
724 } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
725 pkt->md.ct_state |= CS_SRC_NAT;
726 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
727 struct ip_header *nh = dp_packet_l3(pkt);
728 packet_set_ipv4_addr(pkt, &nh->ip_src,
729 conn->key.dst.addr.ipv4_aligned);
730 } else {
731 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
732 packet_set_ipv6_addr(pkt, conn->key.nw_proto,
733 nh6->ip6_src.be32,
734 &conn->key.dst.addr.ipv6_aligned, true);
735 }
edd1bef4
DB
736
737 if (OVS_UNLIKELY(related)) {
738 reverse_nat_packet(pkt, conn);
739 } else {
286de272
DB
740 un_pat_packet(pkt, conn);
741 }
742 }
743}
744
745/* Typical usage of this helper is in non per-packet code;
746 * this is because the bucket lock needs to be held for lookup
747 * and a hash would have already been needed. Hence, this function
748 * is just intended for code clarity. */
749static struct conn *
bd5e81a0 750conn_lookup(struct conntrack *ct, const struct conn_key *key, long long now)
286de272
DB
751{
752 struct conn_lookup_ctx ctx;
753 ctx.conn = NULL;
c3f6bae2 754 memcpy(&ctx.key, key, sizeof ctx.key);
286de272
DB
755 ctx.hash = conn_key_hash(key, ct->hash_basis);
756 unsigned bucket = hash_to_bucket(ctx.hash);
757 conn_key_lookup(&ct->buckets[bucket], &ctx, now);
758 return ctx.conn;
759}
760
bd5e81a0
DB
761static void
762conn_seq_skew_set(struct conntrack *ct, const struct conn_key *key,
763 long long now, int seq_skew, bool seq_skew_dir)
764{
dec0dbbc 765 unsigned bucket = hash_to_bucket(conn_key_hash(key, ct->hash_basis));
bd5e81a0
DB
766 ct_lock_lock(&ct->buckets[bucket].lock);
767 struct conn *conn = conn_lookup(ct, key, now);
768 if (conn && seq_skew) {
769 conn->seq_skew = seq_skew;
770 conn->seq_skew_dir = seq_skew_dir;
771 }
772 ct_lock_unlock(&ct->buckets[bucket].lock);
773}
774
286de272
DB
775static void
776nat_clean(struct conntrack *ct, struct conn *conn,
777 struct conntrack_bucket *ctb)
778 OVS_REQUIRES(ctb->lock)
779{
8b934ced 780 ct_rwlock_wrlock(&ct->resources_lock);
286de272 781 nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
8b934ced 782 ct_rwlock_unlock(&ct->resources_lock);
286de272 783 ct_lock_unlock(&ctb->lock);
dec0dbbc
DB
784 unsigned bucket_rev_conn =
785 hash_to_bucket(conn_key_hash(&conn->rev_key, ct->hash_basis));
286de272 786 ct_lock_lock(&ct->buckets[bucket_rev_conn].lock);
8b934ced 787 ct_rwlock_wrlock(&ct->resources_lock);
dec0dbbc 788 long long now = time_msec();
286de272 789 struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now);
286de272
DB
790 struct nat_conn_key_node *nat_conn_key_node =
791 nat_conn_keys_lookup(&ct->nat_conn_keys, &conn->rev_key,
792 ct->hash_basis);
793
794 /* In the unlikely event, rev conn was recreated, then skip
795 * rev_conn cleanup. */
796 if (rev_conn && (!nat_conn_key_node ||
5ed7a0b4
DB
797 conn_key_cmp(&nat_conn_key_node->value,
798 &rev_conn->rev_key))) {
286de272
DB
799 hmap_remove(&ct->buckets[bucket_rev_conn].connections,
800 &rev_conn->node);
801 free(rev_conn);
802 }
286de272 803
dec0dbbc 804 delete_conn(conn);
8b934ced 805 ct_rwlock_unlock(&ct->resources_lock);
286de272
DB
806 ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock);
807 ct_lock_lock(&ctb->lock);
808}
809
9e8f3960 810/* Must be called with 'CT_CONN_TYPE_DEFAULT' 'conn_type'. */
286de272
DB
811static void
812conn_clean(struct conntrack *ct, struct conn *conn,
813 struct conntrack_bucket *ctb)
814 OVS_REQUIRES(ctb->lock)
815{
9e8f3960
DB
816 ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
817
4417ca3d
DB
818 if (conn->alg) {
819 expectation_clean(ct, &conn->key, ct->hash_basis);
820 }
286de272
DB
821 ovs_list_remove(&conn->exp_node);
822 hmap_remove(&ctb->connections, &conn->node);
823 atomic_count_dec(&ct->n_conn);
824 if (conn->nat_info) {
825 nat_clean(ct, conn, ctb);
826 } else {
827 delete_conn(conn);
828 }
a489b168
DDP
829}
830
3a2a425b
DB
831static bool
832ct_verify_helper(const char *helper, enum ct_alg_ctl_type ct_alg_ctl)
833{
834 if (ct_alg_ctl == CT_ALG_CTL_NONE) {
835 return true;
836 } else if (helper) {
837 if ((ct_alg_ctl == CT_ALG_CTL_FTP) &&
838 !strncmp(helper, "ftp", strlen("ftp"))) {
839 return true;
840 } else if ((ct_alg_ctl == CT_ALG_CTL_TFTP) &&
841 !strncmp(helper, "tftp", strlen("tftp"))) {
842 return true;
843 } else {
844 return false;
845 }
846 } else {
847 return false;
848 }
849}
850
ac6abe5f 851/* This function is called with the bucket lock held. */
/* Handles a packet for which no usable connection entry exists.
 *
 * Sets 'pkt->md.ct_state' to CS_INVALID when valid_new() rejects the packet,
 * otherwise to CS_NEW (plus CS_RELATED when an ALG expectation 'alg_exp' was
 * supplied).  When 'commit' is true and the connection limit has not been
 * reached, a new connection is created, optionally set up for NAT, inserted
 * into the bucket selected by 'ctx->hash' and counted in 'ct->n_conn'.
 *
 * 'conn_for_un_nat_copy' is caller-provided storage.  When NAT is configured
 * it receives a copy of the new connection, tagged CT_CONN_TYPE_UN_NAT, from
 * which the caller can build the reverse entry.
 *
 * Returns the new connection, or NULL when nothing was created (invalid
 * packet, 'commit' false, connection limit reached, or NAT tuple
 * exhaustion). */
a489b168
DDP
852static struct conn *
853conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
286de272
DB
854 struct conn_lookup_ctx *ctx, bool commit, long long now,
855 const struct nat_action_info_t *nat_action_info,
bd5e81a0
DB
856 struct conn *conn_for_un_nat_copy,
857 const char *helper,
3a2a425b
DB
858 const struct alg_exp_node *alg_exp,
859 enum ct_alg_ctl_type ct_alg_ctl)
a489b168 860{
a489b168
DDP
861 struct conn *nc = NULL;
862
863 if (!valid_new(pkt, &ctx->key)) {
286de272 864 pkt->md.ct_state = CS_INVALID;
a489b168
DDP
865 return nc;
866 }
dec0dbbc 867
286de272 868 pkt->md.ct_state = CS_NEW;
dec0dbbc 869
/* A matching ALG expectation marks the new connection as related. */
bd5e81a0
DB
870 if (alg_exp) {
871 pkt->md.ct_state |= CS_RELATED;
872 }
a489b168
DDP
873
874 if (commit) {
875 unsigned int n_conn_limit;
a489b168
DDP
876 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
877
/* Refuse to create the entry once the global limit is reached; the
 * packet keeps the CS_NEW state set above. */
878 if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
879 COVERAGE_INC(conntrack_full);
880 return nc;
881 }
882
dec0dbbc 883 unsigned bucket = hash_to_bucket(ctx->hash);
e6ef6cc6 884 nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
286de272
DB
885 ctx->conn = nc;
/* The reverse key starts as the mirrored forward key; NAT handling
 * below may adjust it. */
886 nc->rev_key = nc->key;
887 conn_key_reverse(&nc->rev_key);
a489b168 888
/* Record the helper name only if it agrees with the detected ALG
 * control type. */
3a2a425b
DB
889 if (ct_verify_helper(helper, ct_alg_ctl)) {
890 nc->alg = nullable_xstrdup(helper);
bd5e81a0
DB
891 }
892
/* A connection created from an expectation takes mark, label and
 * master key from the expectation. */
893 if (alg_exp) {
894 nc->alg_related = true;
895 nc->mark = alg_exp->master_mark;
896 nc->label = alg_exp->master_label;
897 nc->master_key = alg_exp->master_key;
898 }
899
286de272
DB
900 if (nat_action_info) {
901 nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);
a489b168 902
/* With an expectation the translated address comes from the
 * expectation; 'nat_rpl_dst' selects which side of the reverse key
 * is rewritten and which NAT action is recorded. */
bd5e81a0 903 if (alg_exp) {
be38342d 904 if (alg_exp->nat_rpl_dst) {
bd5e81a0
DB
905 nc->rev_key.dst.addr = alg_exp->alg_nat_repl_addr;
906 nc->nat_info->nat_action = NAT_ACTION_SRC;
907 } else {
908 nc->rev_key.src.addr = alg_exp->alg_nat_repl_addr;
909 nc->nat_info->nat_action = NAT_ACTION_DST;
910 }
c3f6bae2 911 memcpy(conn_for_un_nat_copy, nc, sizeof *conn_for_un_nat_copy);
d8682ee5
DB
912 ct_rwlock_wrlock(&ct->resources_lock);
913 bool new_insert = nat_conn_keys_insert(&ct->nat_conn_keys,
914 conn_for_un_nat_copy,
915 ct->hash_basis);
916 ct_rwlock_unlock(&ct->resources_lock);
/* A pre-existing key is only logged; processing continues. */
917 if (!new_insert) {
918 char *log_msg = xasprintf("Pre-existing alg "
919 "nat_conn_key");
920 ct_print_conn_info(conn_for_un_nat_copy, log_msg, VLL_INFO,
921 true, false);
922 free(log_msg);
923 }
bd5e81a0 924 } else {
/* Without an expectation, nat_select_range_tuple() picks the
 * translation while the resources write lock is held.  On failure
 * the lock is released at the 'nat_res_exhaustion' label. */
c3f6bae2 925 memcpy(conn_for_un_nat_copy, nc, sizeof *conn_for_un_nat_copy);
bd5e81a0 926 ct_rwlock_wrlock(&ct->resources_lock);
dec0dbbc
DB
927 bool nat_res = nat_select_range_tuple(ct, nc,
928 conn_for_un_nat_copy);
286de272 929
bd5e81a0
DB
930 if (!nat_res) {
931 goto nat_res_exhaustion;
932 }
286de272 933
bd5e81a0
DB
934 /* Update nc with nat adjustments made to
935 * conn_for_un_nat_copy by nat_select_range_tuple(). */
286de272 936 *nc = *conn_for_un_nat_copy;
bd5e81a0 937 ct_rwlock_unlock(&ct->resources_lock);
286de272 938 }
/* The copy must not share 'nat_info' or 'alg' with 'nc', so those
 * pointers are cleared in the copy and it is tagged as an un-NAT
 * entry. */
bd5e81a0
DB
939 conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
940 conn_for_un_nat_copy->nat_info = NULL;
941 conn_for_un_nat_copy->alg = NULL;
dbb597d3 942 nat_packet(pkt, nc, ctx->icmp_related);
286de272 943 }
a489b168
DDP
944 hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
945 atomic_count_inc(&ct->n_conn);
946 }
bd5e81a0 947
a489b168 948 return nc;
bd5e81a0
DB
949
950 /* This would be a user error or a DOS attack.
951 * A user error is prevented by allocating enough
952 * combinations of NAT addresses when combined with
953 * ephemeral ports. A DOS attack should be protected
954 * against with firewall rules or a separate firewall.
955 * Also using zone partitioning can limit DoS impact. */
956nat_res_exhaustion:
/* Reached with the resources write lock still held and 'nc' not yet
 * inserted into the connections map.
 * NOTE(review): 'ctx->conn' was set to 'nc' above and is not reset here,
 * so it still refers to the connection handed to delete_conn() --
 * confirm no caller reads it after a NULL return. */
d8c5a93b 957 ovs_list_remove(&nc->exp_node);
bd5e81a0
DB
958 delete_conn(nc);
959 /* conn_for_un_nat_copy is a local variable in process_one; this
960 * memset() serves to document that conn_for_un_nat_copy is from
961 * this point on unused. */
962 memset(conn_for_un_nat_copy, 0, sizeof *conn_for_un_nat_copy);
963 ct_rwlock_unlock(&ct->resources_lock);
964 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
965 VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
966 "if DoS attack, use firewalling and/or zone partitioning.");
967 return NULL;
a489b168
DDP
968}
969
286de272
DB
970static bool
971conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
972 struct conn_lookup_ctx *ctx, struct conn **conn,
973 long long now, unsigned bucket)
974 OVS_REQUIRES(ct->buckets[bucket].lock)
975{
976 bool create_new_conn = false;
977
dbb597d3 978 if (ctx->icmp_related) {
286de272
DB
979 pkt->md.ct_state |= CS_RELATED;
980 if (ctx->reply) {
981 pkt->md.ct_state |= CS_REPLY_DIR;
982 }
983 } else {
bd5e81a0
DB
984 if ((*conn)->alg_related) {
985 pkt->md.ct_state |= CS_RELATED;
986 }
dec0dbbc 987
286de272
DB
988 enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket],
989 pkt, ctx->reply, now);
990
991 switch (res) {
992 case CT_UPDATE_VALID:
993 pkt->md.ct_state |= CS_ESTABLISHED;
994 pkt->md.ct_state &= ~CS_NEW;
995 if (ctx->reply) {
996 pkt->md.ct_state |= CS_REPLY_DIR;
997 }
998 break;
999 case CT_UPDATE_INVALID:
1000 pkt->md.ct_state = CS_INVALID;
1001 break;
1002 case CT_UPDATE_NEW:
1003 conn_clean(ct, *conn, &ct->buckets[bucket]);
1004 create_new_conn = true;
1005 break;
1006 default:
1007 OVS_NOT_REACHED();
1008 }
1009 }
1010 return create_new_conn;
1011}
1012
1013static void
1014create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
bd5e81a0 1015 long long now, bool alg_un_nat)
286de272
DB
1016{
1017 struct conn *nc = xmemdup(conn_for_un_nat_copy, sizeof *nc);
1018 nc->key = conn_for_un_nat_copy->rev_key;
1019 nc->rev_key = conn_for_un_nat_copy->key;
1020 uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis);
1021 unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash);
1022 ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock);
286de272
DB
1023 struct conn *rev_conn = conn_lookup(ct, &nc->key, now);
1024
bd5e81a0 1025 if (alg_un_nat) {
d8682ee5
DB
1026 if (!rev_conn) {
1027 hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
1028 &nc->node, un_nat_hash);
1029 } else {
1030 char *log_msg = xasprintf("Unusual condition for un_nat conn "
1031 "create for alg: rev_conn %p", rev_conn);
1032 ct_print_conn_info(nc, log_msg, VLL_INFO, true, false);
1033 free(log_msg);
1034 free(nc);
1035 }
286de272 1036 } else {
bd5e81a0
DB
1037 ct_rwlock_rdlock(&ct->resources_lock);
1038
1039 struct nat_conn_key_node *nat_conn_key_node =
1040 nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis);
1041 if (nat_conn_key_node && !conn_key_cmp(&nat_conn_key_node->value,
1042 &nc->rev_key) && !rev_conn) {
bd5e81a0
DB
1043 hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
1044 &nc->node, un_nat_hash);
1045 } else {
d8682ee5
DB
1046 char *log_msg = xasprintf("Unusual condition for un_nat conn "
1047 "create: nat_conn_key_node/rev_conn "
1048 "%p/%p", nat_conn_key_node, rev_conn);
1049 ct_print_conn_info(nc, log_msg, VLL_INFO, true, false);
1050 free(log_msg);
bd5e81a0
DB
1051 free(nc);
1052 }
1053 ct_rwlock_unlock(&ct->resources_lock);
286de272 1054 }
286de272
DB
1055 ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock);
1056}
1057
1058static void
1059handle_nat(struct dp_packet *pkt, struct conn *conn,
1060 uint16_t zone, bool reply, bool related)
1061{
1062 if (conn->nat_info &&
1063 (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
1064 (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
1065 zone != pkt->md.ct_zone))) {
bd5e81a0 1066
286de272
DB
1067 if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
1068 pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
1069 }
1070 if (reply) {
1071 un_nat_packet(pkt, conn, related);
1072 } else {
1073 nat_packet(pkt, conn, related);
1074 }
1075 }
1076}
1077
f8016041
DB
1078static bool
1079check_orig_tuple(struct conntrack *ct, struct dp_packet *pkt,
1080 struct conn_lookup_ctx *ctx_in, long long now,
1081 unsigned *bucket, struct conn **conn,
1082 const struct nat_action_info_t *nat_action_info)
1083 OVS_REQUIRES(ct->buckets[*bucket].lock)
1084{
1085 if ((ctx_in->key.dl_type == htons(ETH_TYPE_IP) &&
1086 !pkt->md.ct_orig_tuple.ipv4.ipv4_proto) ||
1087 (ctx_in->key.dl_type == htons(ETH_TYPE_IPV6) &&
1088 !pkt->md.ct_orig_tuple.ipv6.ipv6_proto) ||
1089 !(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
1090 nat_action_info) {
1091 return false;
1092 }
1093
1094 ct_lock_unlock(&ct->buckets[*bucket].lock);
1095 struct conn_lookup_ctx ctx;
1096 memset(&ctx, 0 , sizeof ctx);
1097 ctx.conn = NULL;
1098
1099 if (ctx_in->key.dl_type == htons(ETH_TYPE_IP)) {
1100 ctx.key.src.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_src;
1101 ctx.key.dst.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_dst;
1102
1103 if (ctx_in->key.nw_proto == IPPROTO_ICMP) {
1104 ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
1105 ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
1106 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv4.src_port);
1107 ctx.key.src.icmp_type = (uint8_t) src_port;
1108 ctx.key.dst.icmp_type = reverse_icmp_type(ctx.key.src.icmp_type);
1109 } else {
1110 ctx.key.src.port = pkt->md.ct_orig_tuple.ipv4.src_port;
1111 ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv4.dst_port;
1112 }
1113 ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv4.ipv4_proto;
1114 } else {
1115 ctx.key.src.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_src;
1116 ctx.key.dst.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_dst;
1117
1118 if (ctx_in->key.nw_proto == IPPROTO_ICMPV6) {
1119 ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
1120 ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
1121 uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv6.src_port);
1122 ctx.key.src.icmp_type = (uint8_t) src_port;
1123 ctx.key.dst.icmp_type = reverse_icmp6_type(ctx.key.src.icmp_type);
1124 } else {
1125 ctx.key.src.port = pkt->md.ct_orig_tuple.ipv6.src_port;
1126 ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv6.dst_port;
1127 }
1128 ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv6.ipv6_proto;
1129 }
1130
1131 ctx.key.dl_type = ctx_in->key.dl_type;
1132 ctx.key.zone = pkt->md.ct_zone;
f8016041
DB
1133 ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
1134 *bucket = hash_to_bucket(ctx.hash);
1135 ct_lock_lock(&ct->buckets[*bucket].lock);
1136 conn_key_lookup(&ct->buckets[*bucket], &ctx, now);
1137 *conn = ctx.conn;
f8016041
DB
1138 return *conn ? true : false;
1139}
1140
bd5e81a0
DB
1141static bool
1142is_un_nat_conn_valid(const struct conn *un_nat_conn)
1143{
1144 return un_nat_conn->conn_type == CT_CONN_TYPE_UN_NAT;
1145}
1146
94e71143
DB
1147static bool
1148conn_update_state_alg(struct conntrack *ct, struct dp_packet *pkt,
1149 struct conn_lookup_ctx *ctx, struct conn *conn,
1150 const struct nat_action_info_t *nat_action_info,
1151 enum ct_alg_ctl_type ct_alg_ctl, long long now,
1152 unsigned bucket, bool *create_new_conn)
1153 OVS_REQUIRES(ct->buckets[bucket].lock)
1154{
1155 if (is_ftp_ctl(ct_alg_ctl)) {
1156 /* Keep sequence tracking in sync with the source of the
1157 * sequence skew. */
1158 if (ctx->reply != conn->seq_skew_dir) {
1159 handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1160 !!nat_action_info);
1161 *create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
1162 bucket);
1163 } else {
1164 *create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
1165 bucket);
030958a0
DB
1166
1167 if (*create_new_conn == false) {
1168 handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
1169 !!nat_action_info);
1170 }
94e71143
DB
1171 }
1172 return true;
1173 }
1174 return false;
1175}
1176
/* Runs a single packet 'pkt', whose key and hash are already in 'ctx',
 * through the connection tracker 'ct' for 'zone'.
 *
 * The packet is matched against existing connections.  A match updates the
 * connection state and, when 'nat_action_info' is given, applies NAT.  If
 * there is no match, or the match has to be replaced, conn_not_found() is
 * called and creates a new connection when 'commit' is true.  The result
 * is written to the packet metadata with write_ct_md(); 'setmark' and
 * 'setlabel', when non-NULL, are applied to the resulting connection.
 *
 * 'force' removes a matching entry that was found in the reply direction.
 * 'tp_src', 'tp_dst' and 'helper' are used to classify ALG control
 * traffic.
 *
 * The function takes and releases bucket locks itself; the bucket that is
 * locked may change during processing. */
286de272 1177static void
a489b168
DDP
1178process_one(struct conntrack *ct, struct dp_packet *pkt,
1179 struct conn_lookup_ctx *ctx, uint16_t zone,
286de272
DB
1180 bool force, bool commit, long long now, const uint32_t *setmark,
1181 const struct ovs_key_ct_labels *setlabel,
bd5e81a0 1182 const struct nat_action_info_t *nat_action_info,
bd7d93f8 1183 ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper)
a489b168 1184{
286de272 1185 struct conn *conn;
a489b168 1186 unsigned bucket = hash_to_bucket(ctx->hash);
286de272
DB
1187 ct_lock_lock(&ct->buckets[bucket].lock);
1188 conn_key_lookup(&ct->buckets[bucket], ctx, now);
1189 conn = ctx->conn;
a489b168 1190
a76a37ef
JR
1191 /* Delete found entry if in wrong direction. 'force' implies commit. */
1192 if (conn && force && ctx->reply) {
286de272 1193 conn_clean(ct, conn, &ct->buckets[bucket]);
a76a37ef
JR
1194 conn = NULL;
1195 }
1196
286de272
DB
1197 if (OVS_LIKELY(conn)) {
/* The match is an un-NAT entry: treat the packet as a reply and switch
 * to the connection stored under the entry's reverse key, which may
 * live in another bucket. */
1198 if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {
a489b168 1199
286de272 1200 ctx->reply = true;
a489b168 1201
286de272
DB
1202 struct conn_lookup_ctx ctx2;
1203 ctx2.conn = NULL;
1204 ctx2.key = conn->rev_key;
1205 ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis);
1206
/* Only one bucket lock is held at a time: release the current
 * one before taking the lock of the other bucket. */
1207 ct_lock_unlock(&ct->buckets[bucket].lock);
1208 bucket = hash_to_bucket(ctx2.hash);
1209
1210 ct_lock_lock(&ct->buckets[bucket].lock);
1211 conn_key_lookup(&ct->buckets[bucket], &ctx2, now);
1212
1213 if (ctx2.conn) {
1214 conn = ctx2.conn;
1215 } else {
1216 /* It is a race condition where conn has timed out and removed
1217 * between unlock of the rev_conn and lock of the forward conn;
1218 * nothing to do. */
1219 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
1220 ct_lock_unlock(&ct->buckets[bucket].lock);
1221 return;
a489b168
DDP
1222 }
1223 }
286de272
DB
1224 }
1225
1226 bool create_new_conn = false;
/* Only 'conn_type' is initialized here; conn_not_found() fills in the
 * rest when NAT is set up, and is_un_nat_conn_valid() below reads only
 * 'conn_type'. */
1227 struct conn conn_for_un_nat_copy;
1228 conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;
94e71143 1229
bd7d93f8
DB
1230 enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, tp_src, tp_dst,
1231 helper);
bd5e81a0 1232
286de272 1233 if (OVS_LIKELY(conn)) {
/* conn_update_state_alg() returns false when it did not handle the
 * packet, in which case the generic state update runs. */
94e71143
DB
1234 if (OVS_LIKELY(!conn_update_state_alg(ct, pkt, ctx, conn,
1235 nat_action_info,
1236 ct_alg_ctl, now, bucket,
1237 &create_new_conn))) {
bd5e81a0
DB
1238 create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
1239 bucket);
1240 }
286de272 1241 if (nat_action_info && !create_new_conn) {
dbb597d3 1242 handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
286de272 1243 }
bd5e81a0 1244
/* No direct match: try the original tuple from the packet metadata.
 * check_orig_tuple() may change 'bucket' and the lock that is held. */
dec0dbbc 1245 } else if (check_orig_tuple(ct, pkt, ctx, now, &bucket, &conn,
bd5e81a0 1246 nat_action_info)) {
dec0dbbc 1247 create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
a489b168 1248 } else {
dbb597d3 1249 if (ctx->icmp_related) {
bd5e81a0
DB
1250 /* An icmp related conn should always be found; no new
1251 connection is created based on an icmp related packet. */
286de272 1252 pkt->md.ct_state = CS_INVALID;
5c2e106b 1253 } else {
286de272 1254 create_new_conn = true;
5c2e106b 1255 }
a489b168
DDP
1256 }
1257
bd5e81a0 1258 const struct alg_exp_node *alg_exp = NULL;
96bbcbf7 1259 struct alg_exp_node alg_exp_entry;
dec0dbbc 1260
286de272 1261 if (OVS_UNLIKELY(create_new_conn)) {
bd5e81a0
DB
1262
/* The expectation is copied into 'alg_exp_entry' while the resources
 * read lock is held, so that 'alg_exp' points at the local copy once
 * the lock has been released. */
1263 ct_rwlock_rdlock(&ct->resources_lock);
1264 alg_exp = expectation_lookup(&ct->alg_expectations, &ctx->key,
be38342d
DB
1265 ct->hash_basis,
1266 alg_src_ip_wc(ct_alg_ctl));
bd5e81a0 1267 if (alg_exp) {
c3f6bae2 1268 memcpy(&alg_exp_entry, alg_exp, sizeof alg_exp_entry);
bd5e81a0
DB
1269 alg_exp = &alg_exp_entry;
1270 }
1271 ct_rwlock_unlock(&ct->resources_lock);
1272
286de272 1273 conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
3a2a425b
DB
1274 &conn_for_un_nat_copy, helper, alg_exp,
1275 ct_alg_ctl);
286de272
DB
1276 }
1277
bd5e81a0
DB
1278 write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
1279
/* 'setmark' and 'setlabel' each point to a value followed by a mask. */
286de272
DB
1280 if (conn && setmark) {
1281 set_mark(pkt, conn, setmark[0], setmark[1]);
1282 }
a489b168 1283
286de272
DB
1284 if (conn && setlabel) {
1285 set_label(pkt, conn, &setlabel[0], &setlabel[1]);
1286 }
1287
/* Snapshot the connection while the bucket lock is still held; the copy
 * is handed to handle_alg_ctl() after the lock is released.  It is left
 * uninitialized when there is no ALG control type or no connection. */
bd5e81a0 1288 struct conn conn_for_expectation;
94e71143 1289 if (OVS_UNLIKELY((ct_alg_ctl != CT_ALG_CTL_NONE) && conn)) {
bd5e81a0
DB
1290 conn_for_expectation = *conn;
1291 }
1292
286de272
DB
1293 ct_lock_unlock(&ct->buckets[bucket].lock);
1294
/* create_un_nat_conn() takes a bucket lock itself, so it is called only
 * after the lock above has been released. */
bd5e81a0
DB
1295 if (is_un_nat_conn_valid(&conn_for_un_nat_copy)) {
1296 create_un_nat_conn(ct, &conn_for_un_nat_copy, now, !!alg_exp);
1297 }
1298
94e71143
DB
1299 handle_alg_ctl(ct, ctx, pkt, ct_alg_ctl, conn, now, !!nat_action_info,
1300 &conn_for_expectation);
a489b168
DDP
1301}
1302
1303/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'. All
51b9a533 1304 * the packets must have the same 'dl_type' (IPv4 or IPv6) and should have
4ea96698
DB
1305 * the l3 and and l4 offset properly set. Performs fragment reassembly with
1306 * the help of ipf_preprocess_conntrack().
a489b168
DDP
1307 *
1308 * If 'commit' is true, the packets are allowed to create new entries in the
1309 * connection tables. 'setmark', if not NULL, should point to a two
1310 * elements array containing a value and a mask to set the connection mark.
1311 * 'setlabel' behaves similarly for the connection label.*/
1312int
1313conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
a76a37ef 1314 ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
66e4ad8a 1315 const uint32_t *setmark,
a489b168 1316 const struct ovs_key_ct_labels *setlabel,
bd7d93f8 1317 ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper,
94053e66
FA
1318 const struct nat_action_info_t *nat_action_info,
1319 long long now)
a489b168 1320{
4ea96698
DB
1321 ipf_preprocess_conntrack(ct->ipf, pkt_batch, now, dl_type, zone,
1322 ct->hash_basis);
1323
43495c45 1324 struct dp_packet *packet;
61ce32b9 1325 struct conn_lookup_ctx ctx;
a489b168 1326
e883448e 1327 DP_PACKET_BATCH_FOR_EACH (i, packet, pkt_batch) {
4ea96698
DB
1328 if (packet->md.ct_state == CS_INVALID
1329 || !conn_key_extract(ct, packet, dl_type, &ctx, zone)) {
43495c45
BB
1330 packet->md.ct_state = CS_INVALID;
1331 write_ct_md(packet, zone, NULL, NULL, NULL);
a489b168
DDP
1332 continue;
1333 }
94e71143 1334 process_one(ct, packet, &ctx, zone, force, commit, now, setmark,
bd7d93f8 1335 setlabel, nat_action_info, tp_src, tp_dst, helper);
a489b168
DDP
1336 }
1337
4ea96698
DB
1338 ipf_postprocess_conntrack(ct->ipf, pkt_batch, now, dl_type);
1339
a489b168
DDP
1340 return 0;
1341}
1342
1fe178d2
EG
1343void
1344conntrack_clear(struct dp_packet *packet)
1345{
1346 /* According to pkt_metadata_init(), ct_state == 0 is enough to make all of
1347 * the conntrack fields invalid. */
1348 packet->md.ct_state = 0;
1349}
1350
a489b168
DDP
1351static void
1352set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
1353{
bd5e81a0
DB
1354 if (conn->alg_related) {
1355 pkt->md.ct_mark = conn->mark;
1356 } else {
1357 pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
1358 conn->mark = pkt->md.ct_mark;
1359 }
a489b168
DDP
1360}
1361
1362static void
1363set_label(struct dp_packet *pkt, struct conn *conn,
1364 const struct ovs_key_ct_labels *val,
1365 const struct ovs_key_ct_labels *mask)
1366{
bd5e81a0
DB
1367 if (conn->alg_related) {
1368 pkt->md.ct_label = conn->label;
1369 } else {
1370 ovs_u128 v, m;
a489b168 1371
bd5e81a0
DB
1372 memcpy(&v, val, sizeof v);
1373 memcpy(&m, mask, sizeof m);
a489b168 1374
bd5e81a0 1375 pkt->md.ct_label.u64.lo = v.u64.lo
a489b168 1376 | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
bd5e81a0 1377 pkt->md.ct_label.u64.hi = v.u64.hi
a489b168 1378 | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
bd5e81a0
DB
1379 conn->label = pkt->md.ct_label;
1380 }
a489b168 1381}
286de272 1382
a489b168 1383\f
e6ef6cc6
DDP
1384/* Delete the expired connections from 'ctb', up to 'limit'. Returns the
1385 * earliest expiration time among the remaining connections in 'ctb'. Returns
1386 * LLONG_MAX if 'ctb' is empty. The return value might be smaller than 'now',
1387 * if 'limit' is reached */
1388static long long
bd5e81a0
DB
1389sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb,
1390 long long now, size_t limit)
e6ef6cc6
DDP
1391 OVS_REQUIRES(ctb->lock)
1392{
1393 struct conn *conn, *next;
1394 long long min_expiration = LLONG_MAX;
e6ef6cc6
DDP
1395 size_t count = 0;
1396
dec0dbbc 1397 for (unsigned i = 0; i < N_CT_TM; i++) {
e6ef6cc6 1398 LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
286de272
DB
1399 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
1400 if (!conn_expired(conn, now) || count >= limit) {
1401 min_expiration = MIN(min_expiration, conn->expiration);
1402 if (count >= limit) {
1403 /* Do not check other lists. */
1404 COVERAGE_INC(conntrack_long_cleanup);
1405 return min_expiration;
1406 }
1407 break;
e6ef6cc6 1408 }
286de272
DB
1409 conn_clean(ct, conn, ctb);
1410 count++;
e6ef6cc6 1411 }
e6ef6cc6
DDP
1412 }
1413 }
e6ef6cc6
DDP
1414 return min_expiration;
1415}
1416
1417/* Cleans up old connection entries from 'ct'. Returns the time when the
1418 * next expiration might happen. The return value might be smaller than
1419 * 'now', meaning that an internal limit has been reached, and some expired
1420 * connections have not been deleted. */
1421static long long
1422conntrack_clean(struct conntrack *ct, long long now)
1423{
1424 long long next_wakeup = now + CT_TM_MIN;
1425 unsigned int n_conn_limit;
1426 size_t clean_count = 0;
e6ef6cc6
DDP
1427
1428 atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);
1429
dec0dbbc 1430 for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
e6ef6cc6
DDP
1431 struct conntrack_bucket *ctb = &ct->buckets[i];
1432 size_t prev_count;
1433 long long min_exp;
1434
1435 ovs_mutex_lock(&ctb->cleanup_mutex);
1436 if (ctb->next_cleanup > now) {
1437 goto next_bucket;
1438 }
1439
1440 ct_lock_lock(&ctb->lock);
1441 prev_count = hmap_count(&ctb->connections);
1442 /* If the connections are well distributed among buckets, we want to
1443 * limit to 10% of the global limit equally split among buckets. If
1444 * the bucket is busier than the others, we limit to 10% of its
1445 * current size. */
1446 min_exp = sweep_bucket(ct, ctb, now,
1447 MAX(prev_count/10, n_conn_limit/(CONNTRACK_BUCKETS*10)));
1448 clean_count += prev_count - hmap_count(&ctb->connections);
1449
1450 if (min_exp > now) {
1451 /* We call hmap_shrink() only if sweep_bucket() managed to delete
1452 * every expired connection. */
1453 hmap_shrink(&ctb->connections);
1454 }
1455
1456 ct_lock_unlock(&ctb->lock);
1457
1458 ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);
1459
1460next_bucket:
1461 next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
1462 ovs_mutex_unlock(&ctb->cleanup_mutex);
1463 }
1464
1465 VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
1466 clean_count, time_msec() - now);
1467
1468 return next_wakeup;
1469}
1470
1471/* Cleanup:
e6ef6cc6
DDP
1472 *
1473 * We must call conntrack_clean() periodically. conntrack_clean()'s return
1474 * value gives a hint on when the next cleanup must be done (either because
1475 * there is an actual connection that expires, or because a new connection
1476 * might be created with the minimum timeout).
1477 *
1478 * The logic below has two goals:
1479 *
6c54734e
DDP
1480 * - We want to reduce the number of wakeups and batch connection cleanup
1481 * when the load is not very high. CT_CLEAN_INTERVAL ensures that if we
1482 * are coping with the current cleanup tasks, then we wait at least
1483 * 5 seconds to do further cleanup.
e6ef6cc6 1484 *
6c54734e
DDP
1485 * - We don't want to keep the buckets locked too long, as we might prevent
1486 * traffic from flowing. CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
1487 * behind, there are at least some 200ms blocks of time when buckets will be
1488 * left alone, so the datapath can operate unhindered.
e6ef6cc6
DDP
1489 */
1490#define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
1491#define CT_CLEAN_MIN_INTERVAL 200 /* 0.2 seconds */
1492
1493static void *
1494clean_thread_main(void *f_)
1495{
1496 struct conntrack *ct = f_;
1497
1498 while (!latch_is_set(&ct->clean_thread_exit)) {
1499 long long next_wake;
1500 long long now = time_msec();
e6ef6cc6
DDP
1501 next_wake = conntrack_clean(ct, now);
1502
1503 if (next_wake < now) {
1504 poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
1505 } else {
1506 poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
1507 }
1508 latch_wait(&ct->clean_thread_exit);
1509 poll_block();
1510 }
1511
1512 return NULL;
1513}
1514\f
e917d3ee
DB
1515/* 'Data' is a pointer to the beginning of the L3 header and 'new_data' is
1516 * used to store a pointer to the first byte after the L3 header. 'Size' is
1517 * the size of the packet beyond the data pointer. */
a489b168
DDP
1518static inline bool
1519extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
1520 const char **new_data, bool validate_checksum)
1521{
e917d3ee
DB
1522 if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
1523 return false;
a489b168
DDP
1524 }
1525
dec0dbbc
DB
1526 const struct ip_header *ip = data;
1527 size_t ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
a489b168 1528
e917d3ee
DB
1529 if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
1530 return false;
1531 }
a489b168 1532
e917d3ee
DB
1533 if (OVS_UNLIKELY(size < ip_len)) {
1534 return false;
1535 }
a489b168 1536
e917d3ee
DB
1537 if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
1538 return false;
a489b168
DDP
1539 }
1540
1541 if (validate_checksum && csum(data, ip_len) != 0) {
1542 return false;
1543 }
1544
e917d3ee
DB
1545 if (new_data) {
1546 *new_data = (char *) data + ip_len;
1547 }
1548
a489b168
DDP
1549 key->src.addr.ipv4 = ip->ip_src;
1550 key->dst.addr.ipv4 = ip->ip_dst;
1551 key->nw_proto = ip->ip_proto;
1552
1553 return true;
1554}
1555
e917d3ee
DB
1556/* 'Data' is a pointer to the beginning of the L3 header and 'new_data' is
1557 * used to store a pointer to the first byte after the L3 header. 'Size' is
1558 * the size of the packet beyond the data pointer. */
a489b168
DDP
1559static inline bool
1560extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
1561 const char **new_data)
1562{
1563 const struct ovs_16aligned_ip6_hdr *ip6 = data;
286de272 1564
e917d3ee
DB
1565 if (OVS_UNLIKELY(size < sizeof *ip6)) {
1566 return false;
a489b168
DDP
1567 }
1568
1569 data = ip6 + 1;
1570 size -= sizeof *ip6;
dec0dbbc
DB
1571 uint8_t nw_proto = ip6->ip6_nxt;
1572 uint8_t nw_frag = 0;
a489b168 1573
523464ab
DB
1574 const struct ovs_16aligned_ip6_frag *frag_hdr;
1575 if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag, &frag_hdr)) {
a489b168
DDP
1576 return false;
1577 }
1578
a489b168
DDP
1579 if (nw_frag) {
1580 return false;
1581 }
1582
c8b1ad49
DB
1583 if (new_data) {
1584 *new_data = data;
1585 }
1586
a489b168
DDP
1587 key->src.addr.ipv6 = ip6->ip6_src;
1588 key->dst.addr.ipv6 = ip6->ip6_dst;
1589 key->nw_proto = nw_proto;
1590
1591 return true;
1592}
1593
1594static inline bool
1595checksum_valid(const struct conn_key *key, const void *data, size_t size,
1596 const void *l3)
1597{
1598 uint32_t csum = 0;
1599
1600 if (key->dl_type == htons(ETH_TYPE_IP)) {
1601 csum = packet_csum_pseudoheader(l3);
1602 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
1603 csum = packet_csum_pseudoheader6(l3);
1604 } else {
1605 return false;
1606 }
1607
1608 csum = csum_continue(csum, data, size);
1609
1610 return csum_finish(csum) == 0;
1611}
1612
1613static inline bool
1614check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
324459a3 1615 const void *l3, bool validate_checksum)
a489b168
DDP
1616{
1617 const struct tcp_header *tcp = data;
40225b0c
BP
1618 if (size < sizeof *tcp) {
1619 return false;
1620 }
a489b168 1621
40225b0c 1622 size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
a489b168
DDP
1623 if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
1624 return false;
1625 }
1626
324459a3 1627 return validate_checksum ? checksum_valid(key, data, size, l3) : true;
a489b168
DDP
1628}
1629
1630static inline bool
1631check_l4_udp(const struct conn_key *key, const void *data, size_t size,
324459a3 1632 const void *l3, bool validate_checksum)
a489b168
DDP
1633{
1634 const struct udp_header *udp = data;
40225b0c
BP
1635 if (size < sizeof *udp) {
1636 return false;
1637 }
a489b168 1638
40225b0c 1639 size_t udp_len = ntohs(udp->udp_len);
a489b168
DDP
1640 if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
1641 return false;
1642 }
1643
1644 /* Validation must be skipped if checksum is 0 on IPv4 packets */
1645 return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
324459a3 1646 || (validate_checksum ? checksum_valid(key, data, size, l3) : true);
a489b168
DDP
1647}
1648
/* Checks the ICMP message at 'data' ('size' bytes).  Returns true if
 * 'validate_checksum' is false or the checksum computed by csum() over the
 * whole message is zero. */
static inline bool
check_l4_icmp(const void *data, size_t size, bool validate_checksum)
{
    return !validate_checksum || csum(data, size) == 0;
}
1654
/* Checks the ICMPv6 message at 'data' ('size' bytes).  Returns true if
 * 'validate_checksum' is false or checksum_valid() accepts the message,
 * using the L3 header at 'l3' for the pseudo-header. */
static inline bool
check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
               const void *l3, bool validate_checksum)
{
    return !validate_checksum || checksum_valid(key, data, size, l3);
}
1661
1662static inline bool
1663extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
1664{
a489b168
DDP
1665 if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
1666 return false;
1667 }
1668
dec0dbbc 1669 const struct tcp_header *tcp = data;
a489b168
DDP
1670 key->src.port = tcp->tcp_src;
1671 key->dst.port = tcp->tcp_dst;
1672
1673 /* Port 0 is invalid */
1674 return key->src.port && key->dst.port;
1675}
1676
1677static inline bool
1678extract_l4_udp(struct conn_key *key, const void *data, size_t size)
1679{
a489b168
DDP
1680 if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
1681 return false;
1682 }
1683
dec0dbbc 1684 const struct udp_header *udp = data;
a489b168
DDP
1685 key->src.port = udp->udp_src;
1686 key->dst.port = udp->udp_dst;
1687
1688 /* Port 0 is invalid */
1689 return key->src.port && key->dst.port;
1690}
1691
1692static inline bool extract_l4(struct conn_key *key, const void *data,
324459a3
SC
1693 size_t size, bool *related, const void *l3,
1694 bool validate_checksum);
a489b168 1695
b269a122
DDP
1696static uint8_t
1697reverse_icmp_type(uint8_t type)
1698{
1699 switch (type) {
1700 case ICMP4_ECHO_REQUEST:
1701 return ICMP4_ECHO_REPLY;
1702 case ICMP4_ECHO_REPLY:
1703 return ICMP4_ECHO_REQUEST;
1704
1705 case ICMP4_TIMESTAMP:
1706 return ICMP4_TIMESTAMPREPLY;
1707 case ICMP4_TIMESTAMPREPLY:
1708 return ICMP4_TIMESTAMP;
1709
1710 case ICMP4_INFOREQUEST:
1711 return ICMP4_INFOREPLY;
1712 case ICMP4_INFOREPLY:
1713 return ICMP4_INFOREQUEST;
1714 default:
1715 OVS_NOT_REACHED();
1716 }
1717}
1718
a489b168
DDP
1719/* If 'related' is not NULL and the function is processing an ICMP
1720 * error packet, extract the l3 and l4 fields from the nested header
1721 * instead and set *related to true. If 'related' is NULL we're
1722 * already processing a nested header and no such recursion is
1723 * possible */
1724static inline int
1725extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
1726 bool *related)
1727{
a489b168
DDP
1728 if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
1729 return false;
1730 }
1731
dec0dbbc
DB
1732 const struct icmp_header *icmp = data;
1733
a489b168
DDP
1734 switch (icmp->icmp_type) {
1735 case ICMP4_ECHO_REQUEST:
1736 case ICMP4_ECHO_REPLY:
1737 case ICMP4_TIMESTAMP:
1738 case ICMP4_TIMESTAMPREPLY:
1739 case ICMP4_INFOREQUEST:
1740 case ICMP4_INFOREPLY:
b269a122
DDP
1741 if (icmp->icmp_code != 0) {
1742 return false;
1743 }
a489b168 1744 /* Separate ICMP connection: identified using id */
b269a122
DDP
1745 key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
1746 key->src.icmp_type = icmp->icmp_type;
1747 key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
a489b168
DDP
1748 break;
1749 case ICMP4_DST_UNREACH:
1750 case ICMP4_TIME_EXCEEDED:
1751 case ICMP4_PARAM_PROB:
1752 case ICMP4_SOURCEQUENCH:
1753 case ICMP4_REDIRECT: {
1754 /* ICMP packet part of another connection. We should
1755 * extract the key from embedded packet header */
1756 struct conn_key inner_key;
1757 const char *l3 = (const char *) (icmp + 1);
1758 const char *tail = (const char *) data + size;
1759 const char *l4;
a489b168
DDP
1760
1761 if (!related) {
1762 return false;
1763 }
1764
1765 memset(&inner_key, 0, sizeof inner_key);
1766 inner_key.dl_type = htons(ETH_TYPE_IP);
dec0dbbc 1767 bool ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
a489b168
DDP
1768 if (!ok) {
1769 return false;
1770 }
1771
a81da080 1772 if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned) {
a489b168
DDP
1773 return false;
1774 }
1775
1776 key->src = inner_key.src;
1777 key->dst = inner_key.dst;
1778 key->nw_proto = inner_key.nw_proto;
1779
324459a3 1780 ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
a489b168
DDP
1781 if (ok) {
1782 conn_key_reverse(key);
1783 *related = true;
1784 }
1785 return ok;
1786 }
1787 default:
1788 return false;
1789 }
1790
1791 return true;
1792}
1793
b269a122
DDP
1794static uint8_t
1795reverse_icmp6_type(uint8_t type)
1796{
1797 switch (type) {
1798 case ICMP6_ECHO_REQUEST:
1799 return ICMP6_ECHO_REPLY;
1800 case ICMP6_ECHO_REPLY:
1801 return ICMP6_ECHO_REQUEST;
1802 default:
1803 OVS_NOT_REACHED();
1804 }
1805}
1806
a489b168
DDP
1807/* If 'related' is not NULL and the function is processing an ICMP
1808 * error packet, extract the l3 and l4 fields from the nested header
1809 * instead and set *related to true. If 'related' is NULL we're
1810 * already processing a nested header and no such recursion is
1811 * possible */
1812static inline bool
1813extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
1814 bool *related)
1815{
1816 const struct icmp6_header *icmp6 = data;
1817
1818 /* All the messages that we support need at least 4 bytes after
1819 * the header */
1820 if (size < sizeof *icmp6 + 4) {
1821 return false;
1822 }
1823
1824 switch (icmp6->icmp6_type) {
1825 case ICMP6_ECHO_REQUEST:
1826 case ICMP6_ECHO_REPLY:
b269a122
DDP
1827 if (icmp6->icmp6_code != 0) {
1828 return false;
1829 }
a489b168 1830 /* Separate ICMP connection: identified using id */
b269a122
DDP
1831 key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
1832 key->src.icmp_type = icmp6->icmp6_type;
1833 key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
a489b168
DDP
1834 break;
1835 case ICMP6_DST_UNREACH:
1836 case ICMP6_PACKET_TOO_BIG:
1837 case ICMP6_TIME_EXCEEDED:
1838 case ICMP6_PARAM_PROB: {
1839 /* ICMP packet part of another connection. We should
1840 * extract the key from embedded packet header */
1841 struct conn_key inner_key;
1842 const char *l3 = (const char *) icmp6 + 8;
1843 const char *tail = (const char *) data + size;
1844 const char *l4 = NULL;
a489b168
DDP
1845
1846 if (!related) {
1847 return false;
1848 }
1849
1850 memset(&inner_key, 0, sizeof inner_key);
1851 inner_key.dl_type = htons(ETH_TYPE_IPV6);
dec0dbbc 1852 bool ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
a489b168
DDP
1853 if (!ok) {
1854 return false;
1855 }
1856
1857 /* pf doesn't do this, but it seems a good idea */
1858 if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
a81da080 1859 &key->dst.addr.ipv6_aligned)) {
a489b168
DDP
1860 return false;
1861 }
1862
1863 key->src = inner_key.src;
1864 key->dst = inner_key.dst;
1865 key->nw_proto = inner_key.nw_proto;
1866
324459a3 1867 ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
a489b168
DDP
1868 if (ok) {
1869 conn_key_reverse(key);
1870 *related = true;
1871 }
1872 return ok;
1873 }
1874 default:
1875 return false;
1876 }
1877
1878 return true;
1879}
1880
1881/* Extract l4 fields into 'key', which must already contain valid l3
1882 * members.
1883 *
1884 * If 'related' is not NULL and an ICMP error packet is being
1885 * processed, the function will extract the key from the packet nested
1401f6de 1886 * in the ICMP payload and set '*related' to true.
a489b168 1887 *
9171c635
DB
1888 * 'size' here is the layer 4 size, which can be a nested size if parsing
1889 * an ICMP or ICMP6 header.
1890 *
a489b168
DDP
1891 * If 'related' is NULL, it means that we're already parsing a header nested
1892 * in an ICMP error. In this case, we skip checksum and length validation. */
1893static inline bool
1894extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
324459a3 1895 const void *l3, bool validate_checksum)
a489b168
DDP
1896{
1897 if (key->nw_proto == IPPROTO_TCP) {
324459a3
SC
1898 return (!related || check_l4_tcp(key, data, size, l3,
1899 validate_checksum)) && extract_l4_tcp(key, data, size);
a489b168 1900 } else if (key->nw_proto == IPPROTO_UDP) {
324459a3
SC
1901 return (!related || check_l4_udp(key, data, size, l3,
1902 validate_checksum)) && extract_l4_udp(key, data, size);
a489b168
DDP
1903 } else if (key->dl_type == htons(ETH_TYPE_IP)
1904 && key->nw_proto == IPPROTO_ICMP) {
324459a3 1905 return (!related || check_l4_icmp(data, size, validate_checksum))
a489b168
DDP
1906 && extract_l4_icmp(key, data, size, related);
1907 } else if (key->dl_type == htons(ETH_TYPE_IPV6)
1908 && key->nw_proto == IPPROTO_ICMPV6) {
324459a3
SC
1909 return (!related || check_l4_icmp6(key, data, size, l3,
1910 validate_checksum)) && extract_l4_icmp6(key, data, size,
1911 related);
a489b168
DDP
1912 } else {
1913 return false;
1914 }
1915}
1916
1917static bool
66e4ad8a 1918conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
a489b168
DDP
1919 struct conn_lookup_ctx *ctx, uint16_t zone)
1920{
2482b0b0 1921 const struct eth_header *l2 = dp_packet_eth(pkt);
a489b168
DDP
1922 const struct ip_header *l3 = dp_packet_l3(pkt);
1923 const char *l4 = dp_packet_l4(pkt);
a489b168
DDP
1924
1925 memset(ctx, 0, sizeof *ctx);
1926
1927 if (!l2 || !l3 || !l4) {
1928 return false;
1929 }
1930
1931 ctx->key.zone = zone;
1932
1933 /* XXX In this function we parse the packet (again, it has already
1934 * gone through miniflow_extract()) for two reasons:
1935 *
1936 * 1) To extract the l3 addresses and l4 ports.
1937 * We already have the l3 and l4 headers' pointers. Extracting
1938 * the l3 addresses and the l4 ports is really cheap, since they
1939 * can be found at fixed locations.
66e4ad8a
DDP
1940 * 2) To extract the l4 type.
1941 * Extracting the l4 types, for IPv6 can be quite expensive, because
1942 * it's not at a fixed location.
a489b168
DDP
1943 *
1944 * Here's a way to avoid (2) with the help of the datapath.
66e4ad8a 1945 * The datapath doesn't keep the packet's extracted flow[1], so
a489b168 1946 * using that is not an option. We could use the packet's matching
66e4ad8a
DDP
1947 * megaflow, but we have to make sure that the l4 type (nw_proto)
1948 * is unwildcarded. This means either:
a489b168 1949 *
66e4ad8a
DDP
1950 * a) dpif-netdev unwildcards the l4 type when a new flow is installed
1951 * if the actions contains ct().
a489b168 1952 *
66e4ad8a
DDP
1953 * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
1954 * action. This is already done in different actions, but it's
1955 * unnecessary for the kernel.
a489b168
DDP
1956 *
1957 * ---
66e4ad8a 1958 * [1] The reasons for this are that keeping the flow increases
a489b168
DDP
1959 * (slightly) the cache footprint and increases computation
1960 * time as we move the packet around. Most importantly, the flow
1961 * should be updated by the actions and this can be slow, as
1962 * we use a sparse representation (miniflow).
1963 *
1964 */
dec0dbbc 1965 bool ok;
66e4ad8a 1966 ctx->key.dl_type = dl_type;
dec0dbbc 1967
a489b168 1968 if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
dec0dbbc 1969 bool hwol_bad_l3_csum = dp_packet_ip_checksum_bad(pkt);
324459a3
SC
1970 if (hwol_bad_l3_csum) {
1971 ok = false;
1972 } else {
dec0dbbc 1973 bool hwol_good_l3_csum = dp_packet_ip_checksum_valid(pkt);
324459a3 1974 /* Validate the checksum only when hwol is not supported. */
9171c635 1975 ok = extract_l3_ipv4(&ctx->key, l3, dp_packet_l3_size(pkt), NULL,
324459a3
SC
1976 !hwol_good_l3_csum);
1977 }
a489b168 1978 } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
9171c635 1979 ok = extract_l3_ipv6(&ctx->key, l3, dp_packet_l3_size(pkt), NULL);
a489b168
DDP
1980 } else {
1981 ok = false;
1982 }
1983
1984 if (ok) {
324459a3
SC
1985 bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt);
1986 if (!hwol_bad_l4_csum) {
1987 bool hwol_good_l4_csum = dp_packet_l4_checksum_valid(pkt);
1988 /* Validate the checksum only when hwol is not supported. */
9171c635
DB
1989 if (extract_l4(&ctx->key, l4, dp_packet_l4_size(pkt),
1990 &ctx->icmp_related, l3, !hwol_good_l4_csum)) {
324459a3
SC
1991 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
1992 return true;
1993 }
a489b168
DDP
1994 }
1995 }
1996
1997 return false;
1998}
92edd073
DB
1999
2000static uint32_t
2001ct_addr_hash_add(uint32_t hash, const struct ct_addr *addr)
2002{
2003 BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
2004 return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
2005}
2006
2007static uint32_t
2008ct_endpoint_hash_add(uint32_t hash, const struct ct_endpoint *ep)
2009{
2010 BUILD_ASSERT_DECL(sizeof *ep % 4 == 0);
2011 return hash_add_bytes32(hash, (const uint32_t *) ep, sizeof *ep);
2012}
a489b168
DDP
2013\f
2014/* Symmetric */
2015static uint32_t
2016conn_key_hash(const struct conn_key *key, uint32_t basis)
2017{
2018 uint32_t hsrc, hdst, hash;
a489b168 2019 hsrc = hdst = basis;
6b1d4625
DB
2020 hsrc = ct_endpoint_hash_add(hsrc, &key->src);
2021 hdst = ct_endpoint_hash_add(hdst, &key->dst);
a489b168
DDP
2022
2023 /* Even if source and destination are swapped the hash will be the same. */
2024 hash = hsrc ^ hdst;
2025
2026 /* Hash the rest of the key(L3 and L4 types and zone). */
763b40b0 2027 return hash_words((uint32_t *) (&key->dst + 1),
a489b168
DDP
2028 (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
2029 hash);
a489b168
DDP
2030}
2031
2032static void
2033conn_key_reverse(struct conn_key *key)
2034{
dec0dbbc 2035 struct ct_endpoint tmp = key->src;
a489b168
DDP
2036 key->src = key->dst;
2037 key->dst = tmp;
2038}
2039
286de272
DB
2040static uint32_t
2041nat_ipv6_addrs_delta(struct in6_addr *ipv6_aligned_min,
2042 struct in6_addr *ipv6_aligned_max)
2043{
2044 uint8_t *ipv6_min_hi = &ipv6_aligned_min->s6_addr[0];
2045 uint8_t *ipv6_min_lo = &ipv6_aligned_min->s6_addr[0] + sizeof(uint64_t);
2046 uint8_t *ipv6_max_hi = &ipv6_aligned_max->s6_addr[0];
2047 uint8_t *ipv6_max_lo = &ipv6_aligned_max->s6_addr[0] + sizeof(uint64_t);
2048
2049 ovs_be64 addr6_64_min_hi;
2050 ovs_be64 addr6_64_min_lo;
2051 memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
2052 memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);
2053
2054 ovs_be64 addr6_64_max_hi;
2055 ovs_be64 addr6_64_max_lo;
2056 memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
2057 memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);
2058
2059 uint64_t diff;
dec0dbbc 2060
286de272
DB
2061 if (addr6_64_min_hi == addr6_64_max_hi &&
2062 ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
2063 diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
2064 } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
2065 ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
2066 diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
2067 ntohll(addr6_64_max_lo) - 1);
2068 } else {
2069 /* Limit address delta supported to 32 bits or 4 billion approximately.
2070 * Possibly, this should be visible to the user through a datapath
2071 * support check, however the practical impact is probably nil. */
2072 diff = 0xfffffffe;
2073 }
dec0dbbc 2074
286de272
DB
2075 if (diff > 0xfffffffe) {
2076 diff = 0xfffffffe;
2077 }
2078 return diff;
2079}
2080
2081/* This function must be used in tandem with nat_ipv6_addrs_delta(), which
2082 * restricts the input parameters. */
a489b168 2083static void
286de272
DB
2084nat_ipv6_addr_increment(struct in6_addr *ipv6_aligned, uint32_t increment)
2085{
2086 uint8_t *ipv6_hi = &ipv6_aligned->s6_addr[0];
2087 uint8_t *ipv6_lo = &ipv6_aligned->s6_addr[0] + sizeof(ovs_be64);
2088 ovs_be64 addr6_64_hi;
2089 ovs_be64 addr6_64_lo;
2090 memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
2091 memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);
2092
2093 if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
2094 addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
2095 } else if (addr6_64_hi != OVS_BE64_MAX) {
2096 addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
2097 addr6_64_lo = htonll(increment - (UINT64_MAX -
2098 ntohll(addr6_64_lo) + 1));
2099 } else {
2100 OVS_NOT_REACHED();
2101 }
2102
2103 memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
2104 memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
286de272
DB
2105}
2106
2107static uint32_t
2108nat_range_hash(const struct conn *conn, uint32_t basis)
2109{
2110 uint32_t hash = basis;
286de272 2111
92edd073
DB
2112 hash = ct_addr_hash_add(hash, &conn->nat_info->min_addr);
2113 hash = ct_addr_hash_add(hash, &conn->nat_info->max_addr);
2114 hash = hash_add(hash,
2115 (conn->nat_info->max_port << 16)
2116 | conn->nat_info->min_port);
92edd073
DB
2117 hash = ct_endpoint_hash_add(hash, &conn->key.src);
2118 hash = ct_endpoint_hash_add(hash, &conn->key.dst);
286de272
DB
2119 hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
2120 hash = hash_add(hash, conn->key.nw_proto);
2121 hash = hash_add(hash, conn->key.zone);
92edd073
DB
2122
2123 /* The purpose of the second parameter is to distinguish hashes of data of
2124 * different length; our data always has the same length so there is no
2125 * value in counting. */
2126 return hash_finish(hash, 0);
286de272
DB
2127}
2128
2129static bool
2130nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
2131 struct conn *nat_conn)
2132{
bd5e81a0
DB
2133 enum { MIN_NAT_EPHEMERAL_PORT = 1024,
2134 MAX_NAT_EPHEMERAL_PORT = 65535 };
286de272
DB
2135
2136 uint16_t min_port;
2137 uint16_t max_port;
2138 uint16_t first_port;
286de272
DB
2139 uint32_t hash = nat_range_hash(conn, ct->hash_basis);
2140
2141 if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
2142 (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
2143 min_port = ntohs(conn->key.src.port);
2144 max_port = ntohs(conn->key.src.port);
2145 first_port = min_port;
2146 } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
2147 (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
2148 min_port = ntohs(conn->key.dst.port);
2149 max_port = ntohs(conn->key.dst.port);
2150 first_port = min_port;
2151 } else {
2152 uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
2153 uint32_t port_index = hash % (deltap + 1);
2154 first_port = conn->nat_info->min_port + port_index;
2155 min_port = conn->nat_info->min_port;
2156 max_port = conn->nat_info->max_port;
2157 }
2158
2159 uint32_t deltaa = 0;
2160 uint32_t address_index;
2161 struct ct_addr ct_addr;
2162 memset(&ct_addr, 0, sizeof ct_addr);
2163 struct ct_addr max_ct_addr;
2164 memset(&max_ct_addr, 0, sizeof max_ct_addr);
2165 max_ct_addr = conn->nat_info->max_addr;
2166
2167 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
2168 deltaa = ntohl(conn->nat_info->max_addr.ipv4_aligned) -
2169 ntohl(conn->nat_info->min_addr.ipv4_aligned);
2170 address_index = hash % (deltaa + 1);
2171 ct_addr.ipv4_aligned = htonl(
2172 ntohl(conn->nat_info->min_addr.ipv4_aligned) + address_index);
2173 } else {
2174 deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6_aligned,
2175 &conn->nat_info->max_addr.ipv6_aligned);
2176 /* deltaa must be within 32 bits for full hash coverage. A 64 or
2177 * 128 bit hash is unnecessary and hence not used here. Most code
2178 * is kept common with V4; nat_ipv6_addrs_delta() will do the
2179 * enforcement via max_ct_addr. */
2180 max_ct_addr = conn->nat_info->min_addr;
2181 nat_ipv6_addr_increment(&max_ct_addr.ipv6_aligned, deltaa);
286de272
DB
2182 address_index = hash % (deltaa + 1);
2183 ct_addr.ipv6_aligned = conn->nat_info->min_addr.ipv6_aligned;
2184 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, address_index);
2185 }
2186
2187 uint16_t port = first_port;
2188 bool all_ports_tried = false;
ac04639a
DB
2189 /* For DNAT, we don't use ephemeral ports. */
2190 bool ephemeral_ports_tried = conn->nat_info->nat_action & NAT_ACTION_DST
2191 ? true : false;
286de272 2192 struct ct_addr first_addr = ct_addr;
286de272
DB
2193
2194 while (true) {
2195 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
2196 nat_conn->rev_key.dst.addr = ct_addr;
2197 } else {
2198 nat_conn->rev_key.src.addr = ct_addr;
2199 }
2200
2201 if ((conn->key.nw_proto == IPPROTO_ICMP) ||
2202 (conn->key.nw_proto == IPPROTO_ICMPV6)) {
2203 all_ports_tried = true;
2204 } else if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
2205 nat_conn->rev_key.dst.port = htons(port);
2206 } else {
2207 nat_conn->rev_key.src.port = htons(port);
2208 }
2209
80cee116
DB
2210 bool new_insert = nat_conn_keys_insert(&ct->nat_conn_keys, nat_conn,
2211 ct->hash_basis);
2212 if (new_insert) {
286de272
DB
2213 return true;
2214 } else if (!all_ports_tried) {
2215 if (min_port == max_port) {
2216 all_ports_tried = true;
2217 } else if (port == max_port) {
2218 port = min_port;
2219 } else {
2220 port++;
2221 }
2222 if (port == first_port) {
2223 all_ports_tried = true;
2224 }
2225 } else {
2226 if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
2227 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
2228 ct_addr.ipv4_aligned = htonl(
2229 ntohl(ct_addr.ipv4_aligned) + 1);
2230 } else {
2231 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, 1);
2232 }
2233 } else {
2234 ct_addr = conn->nat_info->min_addr;
2235 }
2236 if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
ac04639a
DB
2237 if (!ephemeral_ports_tried) {
2238 ephemeral_ports_tried = true;
286de272 2239 ct_addr = conn->nat_info->min_addr;
8417e688 2240 first_addr = ct_addr;
286de272
DB
2241 min_port = MIN_NAT_EPHEMERAL_PORT;
2242 max_port = MAX_NAT_EPHEMERAL_PORT;
2243 } else {
2244 break;
2245 }
2246 }
2247 first_port = min_port;
2248 port = first_port;
2249 all_ports_tried = false;
2250 }
2251 }
2252 return false;
2253}
2254
ac6abe5f 2255/* This function must be called with the ct->resources lock taken. */
286de272
DB
2256static struct nat_conn_key_node *
2257nat_conn_keys_lookup(struct hmap *nat_conn_keys,
2258 const struct conn_key *key,
2259 uint32_t basis)
2260{
2261 struct nat_conn_key_node *nat_conn_key_node;
286de272 2262
dec0dbbc
DB
2263 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node,
2264 conn_key_hash(key, basis), nat_conn_keys) {
5ed7a0b4 2265 if (!conn_key_cmp(&nat_conn_key_node->key, key)) {
286de272
DB
2266 return nat_conn_key_node;
2267 }
2268 }
2269 return NULL;
2270}
2271
80cee116
DB
2272/* This function must be called with the ct->resources lock taken. */
2273static bool
2274nat_conn_keys_insert(struct hmap *nat_conn_keys, const struct conn *nat_conn,
2275 uint32_t basis)
2276{
2277 struct nat_conn_key_node *nat_conn_key_node =
2278 nat_conn_keys_lookup(nat_conn_keys, &nat_conn->rev_key, basis);
2279
2280 if (!nat_conn_key_node) {
2281 struct nat_conn_key_node *nat_conn_key = xzalloc(sizeof *nat_conn_key);
2282 nat_conn_key->key = nat_conn->rev_key;
2283 nat_conn_key->value = nat_conn->key;
dec0dbbc
DB
2284 hmap_insert(nat_conn_keys, &nat_conn_key->node,
2285 conn_key_hash(&nat_conn_key->key, basis));
80cee116
DB
2286 return true;
2287 }
2288 return false;
2289}
2290
ac6abe5f 2291/* This function must be called with the ct->resources write lock taken. */
286de272 2292static void
bd5e81a0
DB
2293nat_conn_keys_remove(struct hmap *nat_conn_keys,
2294 const struct conn_key *key,
286de272
DB
2295 uint32_t basis)
2296{
2297 struct nat_conn_key_node *nat_conn_key_node;
286de272 2298
dec0dbbc
DB
2299 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node,
2300 conn_key_hash(key, basis), nat_conn_keys) {
5ed7a0b4 2301 if (!conn_key_cmp(&nat_conn_key_node->key, key)) {
286de272
DB
2302 hmap_remove(nat_conn_keys, &nat_conn_key_node->node);
2303 free(nat_conn_key_node);
2304 return;
2305 }
2306 }
2307}
2308
2309static void
2310conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
a489b168 2311 long long now)
ac6abe5f 2312 OVS_REQUIRES(ctb->lock)
a489b168
DDP
2313{
2314 uint32_t hash = ctx->hash;
2315 struct conn *conn;
2316
2317 ctx->conn = NULL;
2318
2319 HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
5ed7a0b4 2320 if (!conn_key_cmp(&conn->key, &ctx->key)
a489b168
DDP
2321 && !conn_expired(conn, now)) {
2322 ctx->conn = conn;
2323 ctx->reply = false;
2324 break;
2325 }
5ed7a0b4 2326 if (!conn_key_cmp(&conn->rev_key, &ctx->key)
a489b168
DDP
2327 && !conn_expired(conn, now)) {
2328 ctx->conn = conn;
2329 ctx->reply = true;
2330 break;
2331 }
2332 }
2333}
2334
2335static enum ct_update_res
e6ef6cc6
DDP
2336conn_update(struct conn *conn, struct conntrack_bucket *ctb,
2337 struct dp_packet *pkt, bool reply, long long now)
a489b168 2338{
e6ef6cc6
DDP
2339 return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
2340 reply, now);
a489b168
DDP
2341}
2342
2343static bool
2344conn_expired(struct conn *conn, long long now)
2345{
286de272
DB
2346 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
2347 return now >= conn->expiration;
2348 }
2349 return false;
a489b168
DDP
2350}
2351
2352static bool
2353valid_new(struct dp_packet *pkt, struct conn_key *key)
2354{
2355 return l4_protos[key->nw_proto]->valid_new(pkt);
2356}
2357
2358static struct conn *
e6ef6cc6
DDP
2359new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
2360 struct conn_key *key, long long now)
a489b168 2361{
dec0dbbc 2362 struct conn *newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
a489b168
DDP
2363 if (newconn) {
2364 newconn->key = *key;
2365 }
2366
2367 return newconn;
2368}
2369
2370static void
2371delete_conn(struct conn *conn)
2372{
286de272 2373 free(conn->nat_info);
bd5e81a0 2374 free(conn->alg);
a489b168
DDP
2375 free(conn);
2376}
4d4e68ed 2377\f
271e48a0
YHW
2378/* Convert a conntrack address 'a' into an IP address 'b' based on 'dl_type'.
2379 *
2380 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPv6"
2381 * in network-byte order. */
4d4e68ed
DDP
2382static void
2383ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
2384 union ct_dpif_inet_addr *b,
2385 ovs_be16 dl_type)
2386{
2387 if (dl_type == htons(ETH_TYPE_IP)) {
2388 b->ip = a->ipv4_aligned;
2389 } else if (dl_type == htons(ETH_TYPE_IPV6)){
2390 b->in6 = a->ipv6_aligned;
2391 }
2392}
2393
271e48a0
YHW
2394/* Convert an IP address 'a' into a conntrack address 'b' based on 'dl_type'.
2395 *
2396 * Note that 'dl_type' should be either "ETH_TYPE_IP" or "ETH_TYPE_IPv6"
2397 * in network-byte order. */
2398static void
2399ct_dpif_inet_addr_to_ct_endpoint(const union ct_dpif_inet_addr *a,
2400 struct ct_addr *b,
2401 ovs_be16 dl_type)
2402{
2403 if (dl_type == htons(ETH_TYPE_IP)) {
2404 b->ipv4_aligned = a->ip;
2405 } else if (dl_type == htons(ETH_TYPE_IPV6)){
2406 b->ipv6_aligned = a->in6;
2407 }
2408}
2409
4d4e68ed
DDP
2410static void
2411conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
2412{
2413 if (key->dl_type == htons(ETH_TYPE_IP)) {
2414 tuple->l3_type = AF_INET;
2415 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
2416 tuple->l3_type = AF_INET6;
2417 }
2418 tuple->ip_proto = key->nw_proto;
2419 ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
2420 key->dl_type);
2421 ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
2422 key->dl_type);
2423
2424 if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
b269a122
DDP
2425 tuple->icmp_id = key->src.icmp_id;
2426 tuple->icmp_type = key->src.icmp_type;
2427 tuple->icmp_code = key->src.icmp_code;
4d4e68ed
DDP
2428 } else {
2429 tuple->src_port = key->src.port;
2430 tuple->dst_port = key->dst.port;
2431 }
2432}
2433
271e48a0
YHW
2434static void
2435tuple_to_conn_key(const struct ct_dpif_tuple *tuple, uint16_t zone,
2436 struct conn_key *key)
2437{
2438 if (tuple->l3_type == AF_INET) {
2439 key->dl_type = htons(ETH_TYPE_IP);
2440 } else if (tuple->l3_type == AF_INET6) {
2441 key->dl_type = htons(ETH_TYPE_IPV6);
2442 }
2443 key->nw_proto = tuple->ip_proto;
2444 ct_dpif_inet_addr_to_ct_endpoint(&tuple->src, &key->src.addr,
2445 key->dl_type);
2446 ct_dpif_inet_addr_to_ct_endpoint(&tuple->dst, &key->dst.addr,
2447 key->dl_type);
2448
2449 if (tuple->ip_proto == IPPROTO_ICMP || tuple->ip_proto == IPPROTO_ICMPV6) {
2450 key->src.icmp_id = tuple->icmp_id;
2451 key->src.icmp_type = tuple->icmp_type;
2452 key->src.icmp_code = tuple->icmp_code;
2453 key->dst.icmp_id = tuple->icmp_id;
2454 key->dst.icmp_type = reverse_icmp_type(tuple->icmp_type);
2455 key->dst.icmp_code = tuple->icmp_code;
2456 } else {
2457 key->src.port = tuple->src_port;
2458 key->dst.port = tuple->dst_port;
2459 }
2460 key->zone = zone;
2461}
2462
4d4e68ed
DDP
2463static void
2464conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
ded30c74 2465 long long now, int bkt)
4d4e68ed 2466{
4d4e68ed
DDP
2467 memset(entry, 0, sizeof *entry);
2468 conn_key_to_tuple(&conn->key, &entry->tuple_orig);
2469 conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
2470
2471 entry->zone = conn->key.zone;
2472 entry->mark = conn->mark;
2473
286de272 2474 memcpy(&entry->labels, &conn->label, sizeof entry->labels);
4d4e68ed
DDP
2475 /* Not implemented yet */
2476 entry->timestamp.start = 0;
2477 entry->timestamp.stop = 0;
2478
dec0dbbc 2479 long long expiration = conn->expiration - now;
4d4e68ed
DDP
2480 entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
2481
dec0dbbc 2482 struct ct_l4_proto *class = l4_protos[conn->key.nw_proto];
4d4e68ed
DDP
2483 if (class->conn_get_protoinfo) {
2484 class->conn_get_protoinfo(conn, &entry->protoinfo);
2485 }
bd5e81a0 2486
ded30c74 2487 entry->bkt = bkt;
bd5e81a0
DB
2488
2489 if (conn->alg) {
2490 /* Caller is responsible for freeing. */
2491 entry->helper.name = xstrdup(conn->alg);
2492 }
4d4e68ed
DDP
2493}
2494
4ea96698
DB
2495struct ipf *
2496conntrack_ipf_ctx(struct conntrack *ct)
2497{
2498 return ct->ipf;
2499}
2500
4d4e68ed
DDP
2501int
2502conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
ded30c74 2503 const uint16_t *pzone, int *ptot_bkts)
4d4e68ed
DDP
2504{
2505 memset(dump, 0, sizeof(*dump));
dec0dbbc 2506
4d4e68ed
DDP
2507 if (pzone) {
2508 dump->zone = *pzone;
2509 dump->filter_zone = true;
2510 }
4d4e68ed 2511
dec0dbbc 2512 dump->ct = ct;
ded30c74 2513 *ptot_bkts = CONNTRACK_BUCKETS;
4d4e68ed
DDP
2514 return 0;
2515}
2516
2517int
2518conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
2519{
2520 struct conntrack *ct = dump->ct;
2521 long long now = time_msec();
2522
2523 while (dump->bucket < CONNTRACK_BUCKETS) {
2524 struct hmap_node *node;
2525
2526 ct_lock_lock(&ct->buckets[dump->bucket].lock);
2527 for (;;) {
2528 struct conn *conn;
2529
2530 node = hmap_at_position(&ct->buckets[dump->bucket].connections,
2531 &dump->bucket_pos);
2532 if (!node) {
2533 break;
2534 }
2535 INIT_CONTAINER(conn, node, node);
286de272
DB
2536 if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
2537 (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
ded30c74 2538 conn_to_ct_dpif_entry(conn, entry, now, dump->bucket);
4d4e68ed
DDP
2539 break;
2540 }
2541 /* Else continue, until we find an entry in the appropriate zone
2542 * or the bucket has been scanned completely. */
2543 }
2544 ct_lock_unlock(&ct->buckets[dump->bucket].lock);
2545
2546 if (!node) {
2547 memset(&dump->bucket_pos, 0, sizeof dump->bucket_pos);
2548 dump->bucket++;
2549 } else {
2550 return 0;
2551 }
2552 }
2553 return EOF;
2554}
2555
2556int
2557conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
2558{
2559 return 0;
2560}
5d9cbb4c
DDP
2561
2562int
2563conntrack_flush(struct conntrack *ct, const uint16_t *zone)
2564{
dec0dbbc 2565 for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
5d9cbb4c
DDP
2566 struct conn *conn, *next;
2567
2568 ct_lock_lock(&ct->buckets[i].lock);
bd5e81a0 2569 HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
286de272
DB
2570 if ((!zone || *zone == conn->key.zone) &&
2571 (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
2572 conn_clean(ct, conn, &ct->buckets[i]);
5d9cbb4c
DDP
2573 }
2574 }
2575 ct_lock_unlock(&ct->buckets[i].lock);
2576 }
bd5e81a0 2577
5d9cbb4c
DDP
2578 return 0;
2579}
bd5e81a0 2580
271e48a0
YHW
2581int
2582conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
2583 uint16_t zone)
2584{
2585 struct conn_lookup_ctx ctx;
2586 int error = 0;
2587
2588 memset(&ctx, 0, sizeof(ctx));
2589 tuple_to_conn_key(tuple, zone, &ctx.key);
2590 ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
2591 unsigned bucket = hash_to_bucket(ctx.hash);
2592
2593 ct_lock_lock(&ct->buckets[bucket].lock);
2594 conn_key_lookup(&ct->buckets[bucket], &ctx, time_msec());
a1d5eeff 2595 if (ctx.conn && ctx.conn->conn_type == CT_CONN_TYPE_DEFAULT) {
271e48a0
YHW
2596 conn_clean(ct, ctx.conn, &ct->buckets[bucket]);
2597 } else {
a1d5eeff 2598 VLOG_WARN("Must flush tuple using the original pre-NATed tuple");
271e48a0
YHW
2599 error = ENOENT;
2600 }
2601 ct_lock_unlock(&ct->buckets[bucket].lock);
2602 return error;
2603}
2604
c92339ad
DB
2605int
2606conntrack_set_maxconns(struct conntrack *ct, uint32_t maxconns)
2607{
2608 atomic_store_relaxed(&ct->n_conn_limit, maxconns);
2609 return 0;
2610}
2611
2612int
2613conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns)
2614{
2615 atomic_read_relaxed(&ct->n_conn_limit, maxconns);
2616 return 0;
2617}
2618
875075b3
DB
2619int
2620conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns)
2621{
2622 *nconns = atomic_count_get(&ct->n_conn);
2623 return 0;
2624}
2625
bd5e81a0
DB
2626/* This function must be called with the ct->resources read lock taken. */
2627static struct alg_exp_node *
be38342d
DB
2628expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
2629 uint32_t basis, bool src_ip_wc)
bd5e81a0 2630{
c3f6bae2
DB
2631 struct conn_key check_key;
2632 memcpy(&check_key, key, sizeof check_key);
bd5e81a0 2633 check_key.src.port = ALG_WC_SRC_PORT;
dec0dbbc 2634
be38342d
DB
2635 if (src_ip_wc) {
2636 memset(&check_key.src.addr, 0, sizeof check_key.src.addr);
2637 }
dec0dbbc 2638
bd5e81a0
DB
2639 struct alg_exp_node *alg_exp_node;
2640
bd5e81a0 2641 HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
dec0dbbc 2642 conn_key_hash(&check_key, basis),
bd5e81a0
DB
2643 alg_expectations) {
2644 if (!conn_key_cmp(&alg_exp_node->key, &check_key)) {
2645 return alg_exp_node;
2646 }
2647 }
2648 return NULL;
2649}
2650
4417ca3d
DB
2651/* This function must be called with the ct->resources write lock taken. */
2652static void
2653expectation_remove(struct hmap *alg_expectations,
2654 const struct conn_key *key, uint32_t basis)
2655{
2656 struct alg_exp_node *alg_exp_node;
2657
2658 HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node, conn_key_hash(key, basis),
2659 alg_expectations) {
2660 if (!conn_key_cmp(&alg_exp_node->key, key)) {
2661 hmap_remove(alg_expectations, &alg_exp_node->node);
2662 break;
2663 }
2664 }
2665}
2666
2667/* This function must be called with the ct->resources read lock taken. */
2668static struct alg_exp_node *
2669expectation_ref_lookup_unique(const struct hindex *alg_expectation_refs,
2670 const struct conn_key *master_key,
2671 const struct conn_key *alg_exp_key,
2672 uint32_t basis)
2673{
2674 struct alg_exp_node *alg_exp_node;
2675
2676 HINDEX_FOR_EACH_WITH_HASH (alg_exp_node, node_ref,
2677 conn_key_hash(master_key, basis),
2678 alg_expectation_refs) {
2679 if (!conn_key_cmp(&alg_exp_node->master_key, master_key) &&
2680 !conn_key_cmp(&alg_exp_node->key, alg_exp_key)) {
2681 return alg_exp_node;
2682 }
2683 }
2684 return NULL;
2685}
2686
2687/* This function must be called with the ct->resources write lock taken. */
2688static void
2689expectation_ref_create(struct hindex *alg_expectation_refs,
2690 struct alg_exp_node *alg_exp_node,
2691 uint32_t basis)
2692{
2693 if (!expectation_ref_lookup_unique(alg_expectation_refs,
2694 &alg_exp_node->master_key,
2695 &alg_exp_node->key, basis)) {
2696 hindex_insert(alg_expectation_refs, &alg_exp_node->node_ref,
2697 conn_key_hash(&alg_exp_node->master_key, basis));
2698 }
2699}
2700
2701static void
2702expectation_clean(struct conntrack *ct, const struct conn_key *master_key,
2703 uint32_t basis)
2704{
2705 ct_rwlock_wrlock(&ct->resources_lock);
2706
2707 struct alg_exp_node *node, *next;
2708 HINDEX_FOR_EACH_WITH_HASH_SAFE (node, next, node_ref,
2709 conn_key_hash(master_key, basis),
2710 &ct->alg_expectation_refs) {
2711 if (!conn_key_cmp(&node->master_key, master_key)) {
2712 expectation_remove(&ct->alg_expectations, &node->key, basis);
2713 hindex_remove(&ct->alg_expectation_refs, &node->node_ref);
2714 free(node);
2715 }
2716 }
2717
2718 ct_rwlock_unlock(&ct->resources_lock);
2719}
2720
bd5e81a0 2721static void
be38342d
DB
2722expectation_create(struct conntrack *ct, ovs_be16 dst_port,
2723 const struct conn *master_conn, bool reply, bool src_ip_wc,
2724 bool skip_nat)
bd5e81a0
DB
2725{
2726 struct ct_addr src_addr;
2727 struct ct_addr dst_addr;
2728 struct ct_addr alg_nat_repl_addr;
be38342d 2729 struct alg_exp_node *alg_exp_node = xzalloc(sizeof *alg_exp_node);
bd5e81a0 2730
be38342d 2731 if (reply) {
bd5e81a0
DB
2732 src_addr = master_conn->key.src.addr;
2733 dst_addr = master_conn->key.dst.addr;
efa29a89 2734 alg_exp_node->nat_rpl_dst = true;
be38342d
DB
2735 if (skip_nat) {
2736 alg_nat_repl_addr = dst_addr;
efa29a89
DM
2737 } else if (master_conn->nat_info &&
2738 master_conn->nat_info->nat_action & NAT_ACTION_DST) {
2739 alg_nat_repl_addr = master_conn->rev_key.src.addr;
2740 alg_exp_node->nat_rpl_dst = false;
be38342d
DB
2741 } else {
2742 alg_nat_repl_addr = master_conn->rev_key.dst.addr;
2743 }
be38342d
DB
2744 } else {
2745 src_addr = master_conn->rev_key.src.addr;
2746 dst_addr = master_conn->rev_key.dst.addr;
efa29a89 2747 alg_exp_node->nat_rpl_dst = false;
be38342d
DB
2748 if (skip_nat) {
2749 alg_nat_repl_addr = src_addr;
efa29a89
DM
2750 } else if (master_conn->nat_info &&
2751 master_conn->nat_info->nat_action & NAT_ACTION_DST) {
2752 alg_nat_repl_addr = master_conn->key.dst.addr;
2753 alg_exp_node->nat_rpl_dst = true;
be38342d
DB
2754 } else {
2755 alg_nat_repl_addr = master_conn->key.src.addr;
2756 }
be38342d
DB
2757 }
2758 if (src_ip_wc) {
2759 memset(&src_addr, 0, sizeof src_addr);
bd5e81a0
DB
2760 }
2761
bd5e81a0
DB
2762 alg_exp_node->key.dl_type = master_conn->key.dl_type;
2763 alg_exp_node->key.nw_proto = master_conn->key.nw_proto;
2764 alg_exp_node->key.zone = master_conn->key.zone;
2765 alg_exp_node->key.src.addr = src_addr;
2766 alg_exp_node->key.dst.addr = dst_addr;
2767 alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
2768 alg_exp_node->key.dst.port = dst_port;
2769 alg_exp_node->master_mark = master_conn->mark;
2770 alg_exp_node->master_label = master_conn->label;
2771 alg_exp_node->master_key = master_conn->key;
bd5e81a0
DB
2772 /* Take the write lock here because it is almost 100%
2773 * likely that the lookup will fail and
2774 * expectation_create() will be called below. */
2775 ct_rwlock_wrlock(&ct->resources_lock);
2776 struct alg_exp_node *alg_exp = expectation_lookup(
be38342d 2777 &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis, src_ip_wc);
bd5e81a0
DB
2778 if (alg_exp) {
2779 free(alg_exp_node);
2780 ct_rwlock_unlock(&ct->resources_lock);
2781 return;
2782 }
2783
2784 alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
4417ca3d 2785 hmap_insert(&ct->alg_expectations, &alg_exp_node->node,
dec0dbbc 2786 conn_key_hash(&alg_exp_node->key, ct->hash_basis));
4417ca3d
DB
2787 expectation_ref_create(&ct->alg_expectation_refs, alg_exp_node,
2788 ct->hash_basis);
bd5e81a0
DB
2789 ct_rwlock_unlock(&ct->resources_lock);
2790}
2791
bd5e81a0
DB
/* Replaces the first 'substr_size' bytes at 'substr' with the 'rep_str_size'
 * bytes at 'rep_str', moving the bytes that follow the replaced span so that
 * they directly follow the replacement.
 *
 * 'total_size' is the number of valid bytes starting at 'substr', including
 * the span being replaced.  The caller must guarantee that the buffer has
 * room for 'total_size' - 'substr_size' + 'rep_str_size' bytes at 'substr'.
 *
 * All sizes are size_t: callers pass packet-derived lengths that can exceed
 * 255, and a narrower type would silently truncate them, moving too little
 * data or underflowing the move length.  A 'total_size' smaller than
 * 'substr_size' is inconsistent and is ignored rather than allowed to wrap
 * the length passed to memmove(). */
static void
replace_substring(char *substr, size_t substr_size,
                  size_t total_size, char *rep_str,
                  size_t rep_str_size)
{
    if (total_size < substr_size) {
        return;
    }

    /* The regions may overlap, hence memmove() for the tail. */
    memmove(substr + rep_str_size, substr + substr_size,
            total_size - substr_size);
    memcpy(substr, rep_str, rep_str_size);
}
2801
cd7c99a6
DB
/* Rewrites, in place, every occurrence of the byte 'c1' in the
 * NUL-terminated string 'str' to 'c2'. */
static void
repl_bytes(char *str, char c1, char c2)
{
    for (char *p = str; *p != '\0'; p++) {
        if (*p == c1) {
            *p = c2;
        }
    }
}
2812
/* Replaces the 'size' bytes at 'pkt_str', which points inside the data of
 * 'pkt', with the 'repl_size' bytes at 'repl_str', and then sets the packet
 * size to 'orig_used_size' adjusted by the difference between the two
 * lengths.  Everything from 'pkt_str' up to the current packet tail is
 * handed to replace_substring() as the region to shift. */
static void
modify_packet(struct dp_packet *pkt, char *pkt_str, size_t size,
              char *repl_str, size_t repl_size,
              uint32_t orig_used_size)
{
    const char *pkt_tail = dp_packet_tail(pkt);

    replace_substring(pkt_str, size, pkt_tail - pkt_str, repl_str,
                      repl_size);
    dp_packet_set_size(pkt, orig_used_size + (int) repl_size - (int) size);
}
2823
bd5e81a0
DB
2824/* Replace IPV4 address in FTP message with NATed address. */
2825static int
2826repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
2827 char *ftp_data_start,
cd7c99a6
DB
2828 size_t addr_offset_from_ftp_data_start,
2829 size_t addr_size OVS_UNUSED)
bd5e81a0
DB
2830{
2831 enum { MAX_FTP_V4_NAT_DELTA = 8 };
2832
2833 /* Do conservative check for pathological MTU usage. */
2834 uint32_t orig_used_size = dp_packet_size(pkt);
cd7c99a6
DB
2835 if (orig_used_size + MAX_FTP_V4_NAT_DELTA >
2836 dp_packet_get_allocated(pkt)) {
2837
bd5e81a0 2838 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
cd7c99a6
DB
2839 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP V4",
2840 dp_packet_get_allocated(pkt));
bd5e81a0
DB
2841 return 0;
2842 }
2843
cd7c99a6
DB
2844 char v4_addr_str[INET_ADDRSTRLEN] = {0};
2845 ovs_assert(inet_ntop(AF_INET, &v4_addr_rep, v4_addr_str,
2846 sizeof v4_addr_str));
2847 repl_bytes(v4_addr_str, '.', ',');
2848 modify_packet(pkt, ftp_data_start + addr_offset_from_ftp_data_start,
2849 addr_size, v4_addr_str, strlen(v4_addr_str),
2850 orig_used_size);
2851 return (int) strlen(v4_addr_str) - (int) addr_size;
bd5e81a0
DB
2852}
2853
/* Returns a pointer to the first decimal digit in the NUL-terminated string
 * 'str', or to its terminating NUL if it contains no digit.
 *
 * The string comes from packet data, so bytes may have the high bit set.
 * They are converted to unsigned char before being handed to isdigit(),
 * whose behavior is undefined for negative arguments other than EOF. */
static char *
skip_non_digits(char *str)
{
    while (*str != 0 && !isdigit((unsigned char) *str)) {
        str++;
    }
    return str;
}
2862
/* Scans the run of decimal digits at the start of 'str', writes a NUL over
 * the first byte following the scanned digits and returns a pointer to that
 * NUL.  Never returns NULL.
 *
 * Up to 'max_digits' + 1 digits are consumed.  The extra digit is deliberate
 * as far as the visible callers are concerned: a number that is too long
 * keeps one digit too many and therefore fails their range check instead of
 * being silently truncated to an acceptable value.
 *
 * Bytes are converted to unsigned char before being handed to isdigit(),
 * whose behavior is undefined for negative arguments other than EOF. */
static char *
terminate_number_str(char *str, uint8_t max_digits)
{
    uint8_t digits_found = 0;
    while (isdigit((unsigned char) *str) && digits_found <= max_digits) {
        str++;
        digits_found++;
    }

    *str = 0;
    return str;
}
2875
2876
2877static void
2878get_ftp_ctl_msg(struct dp_packet *pkt, char *ftp_msg)
2879{
2880 struct tcp_header *th = dp_packet_l4(pkt);
2881 char *tcp_hdr = (char *) th;
2882 uint32_t tcp_payload_len = tcp_payload_length(pkt);
2883 size_t tcp_payload_of_interest = MIN(tcp_payload_len,
2884 LARGEST_FTP_MSG_OF_INTEREST);
2885 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2886
2887 ovs_strlcpy(ftp_msg, tcp_hdr + tcp_hdr_len,
2888 tcp_payload_of_interest);
2889}
2890
2891static enum ftp_ctl_pkt
2892detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
2893 struct dp_packet *pkt)
2894{
bd5e81a0
DB
2895 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2896 get_ftp_ctl_msg(pkt, ftp_msg);
dec0dbbc 2897
bd5e81a0
DB
2898 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
2899 if (strncasecmp(ftp_msg, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD)) &&
2900 !strcasestr(ftp_msg, FTP_EPSV_REPLY)) {
2901 return CT_FTP_CTL_OTHER;
2902 }
2903 } else {
2904 if (strncasecmp(ftp_msg, FTP_PORT_CMD, strlen(FTP_PORT_CMD)) &&
2905 strncasecmp(ftp_msg, FTP_PASV_REPLY_CODE,
2906 strlen(FTP_PASV_REPLY_CODE))) {
2907 return CT_FTP_CTL_OTHER;
2908 }
2909 }
2910
2911 return CT_FTP_CTL_INTEREST;
2912}
2913
2914static enum ftp_ctl_pkt
2915process_ftp_ctl_v4(struct conntrack *ct,
2916 struct dp_packet *pkt,
2917 const struct conn *conn_for_expectation,
4417ca3d 2918 ovs_be32 *v4_addr_rep,
bd5e81a0 2919 char **ftp_data_v4_start,
cd7c99a6
DB
2920 size_t *addr_offset_from_ftp_data_start,
2921 size_t *addr_size)
bd5e81a0
DB
2922{
2923 struct tcp_header *th = dp_packet_l4(pkt);
2924 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2925 char *tcp_hdr = (char *) th;
2926 *ftp_data_v4_start = tcp_hdr + tcp_hdr_len;
2927 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2928 get_ftp_ctl_msg(pkt, ftp_msg);
bd5e81a0
DB
2929 char *ftp = ftp_msg;
2930 enum ct_alg_mode mode;
dec0dbbc 2931
23bea975 2932 if (!strncasecmp(ftp, FTP_PORT_CMD, strlen(FTP_PORT_CMD))) {
bd5e81a0
DB
2933 ftp = ftp_msg + strlen(FTP_PORT_CMD);
2934 mode = CT_FTP_MODE_ACTIVE;
2935 } else {
2936 ftp = ftp_msg + strlen(FTP_PASV_REPLY_CODE);
2937 mode = CT_FTP_MODE_PASSIVE;
2938 }
2939
2940 /* Find first space. */
2941 ftp = strchr(ftp, ' ');
2942 if (!ftp) {
2943 return CT_FTP_CTL_INVALID;
2944 }
2945
2946 /* Find the first digit, after space. */
2947 ftp = skip_non_digits(ftp);
2948 if (*ftp == 0) {
2949 return CT_FTP_CTL_INVALID;
2950 }
2951
2952 char *ip_addr_start = ftp;
2953 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
bd5e81a0 2954
dec0dbbc 2955 uint8_t comma_count = 0;
bd5e81a0
DB
2956 while (comma_count < 4 && *ftp) {
2957 if (*ftp == ',') {
2958 comma_count++;
2959 if (comma_count == 4) {
2960 *ftp = 0;
2961 } else {
2962 *ftp = '.';
2963 }
2964 }
2965 ftp++;
2966 }
2967 if (comma_count != 4) {
2968 return CT_FTP_CTL_INVALID;
2969 }
2970
2971 struct in_addr ip_addr;
2972 int rc2 = inet_pton(AF_INET, ip_addr_start, &ip_addr);
2973 if (rc2 != 1) {
2974 return CT_FTP_CTL_INVALID;
2975 }
2976
cd7c99a6 2977 *addr_size = ftp - ip_addr_start - 1;
bd5e81a0
DB
2978 char *save_ftp = ftp;
2979 ftp = terminate_number_str(ftp, MAX_FTP_PORT_DGTS);
2980 if (!ftp) {
2981 return CT_FTP_CTL_INVALID;
2982 }
2983 int value;
2984 if (!str_to_int(save_ftp, 10, &value)) {
2985 return CT_FTP_CTL_INVALID;
2986 }
2987
2988 /* This is derived from the L4 port maximum is 65535. */
2989 if (value > 255) {
2990 return CT_FTP_CTL_INVALID;
2991 }
2992
2993 uint16_t port_hs = value;
2994 port_hs <<= 8;
2995
2996 /* Skip over comma. */
2997 ftp++;
2998 save_ftp = ftp;
2999 bool digit_found = false;
3000 while (isdigit(*ftp)) {
3001 ftp++;
3002 digit_found = true;
3003 }
3004 if (!digit_found) {
3005 return CT_FTP_CTL_INVALID;
3006 }
3007 *ftp = 0;
3008 if (!str_to_int(save_ftp, 10, &value)) {
3009 return CT_FTP_CTL_INVALID;
3010 }
3011
3012 if (value > 255) {
3013 return CT_FTP_CTL_INVALID;
3014 }
3015
78a0b272 3016 port_hs |= value;
bd5e81a0
DB
3017 ovs_be16 port = htons(port_hs);
3018 ovs_be32 conn_ipv4_addr;
3019
3020 switch (mode) {
3021 case CT_FTP_MODE_ACTIVE:
3022 *v4_addr_rep = conn_for_expectation->rev_key.dst.addr.ipv4_aligned;
3023 conn_ipv4_addr = conn_for_expectation->key.src.addr.ipv4_aligned;
3024 break;
3025 case CT_FTP_MODE_PASSIVE:
3026 *v4_addr_rep = conn_for_expectation->key.dst.addr.ipv4_aligned;
3027 conn_ipv4_addr = conn_for_expectation->rev_key.src.addr.ipv4_aligned;
3028 break;
7be77cb0 3029 case CT_TFTP_MODE:
bd5e81a0
DB
3030 default:
3031 OVS_NOT_REACHED();
3032 }
3033
3034 ovs_be32 ftp_ipv4_addr;
3035 ftp_ipv4_addr = ip_addr.s_addr;
3036 /* Although most servers will block this exploit, there may be some
3037 * less well managed. */
3038 if (ftp_ipv4_addr != conn_ipv4_addr && ftp_ipv4_addr != *v4_addr_rep) {
3039 return CT_FTP_CTL_INVALID;
3040 }
3041
be38342d
DB
3042 expectation_create(ct, port, conn_for_expectation,
3043 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
bd5e81a0
DB
3044 return CT_FTP_CTL_INTEREST;
3045}
3046
/* Returns a pointer to the first byte of 'str' that cannot be part of a
 * textual IPv6 address, i.e. that is not a hexadecimal digit, ':' or '.'.
 * For a string made only of such bytes this is the terminating NUL.
 *
 * Bytes are converted to unsigned char before being handed to isxdigit(),
 * whose behavior is undefined for negative arguments other than EOF. */
static char *
skip_ipv6_digits(char *str)
{
    while (isxdigit((unsigned char) *str) || *str == ':' || *str == '.') {
        str++;
    }
    return str;
}
3055
3056static enum ftp_ctl_pkt
3057process_ftp_ctl_v6(struct conntrack *ct,
3058 struct dp_packet *pkt,
3059 const struct conn *conn_for_expectation,
bd5e81a0
DB
3060 struct ct_addr *v6_addr_rep,
3061 char **ftp_data_start,
3062 size_t *addr_offset_from_ftp_data_start,
3063 size_t *addr_size, enum ct_alg_mode *mode)
3064{
3065 struct tcp_header *th = dp_packet_l4(pkt);
3066 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
3067 char *tcp_hdr = (char *) th;
3068 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
bd5e81a0
DB
3069 get_ftp_ctl_msg(pkt, ftp_msg);
3070 *ftp_data_start = tcp_hdr + tcp_hdr_len;
bd5e81a0
DB
3071 char *ftp = ftp_msg;
3072 struct in6_addr ip6_addr;
dec0dbbc 3073
23bea975 3074 if (!strncasecmp(ftp, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD))) {
bd5e81a0
DB
3075 ftp = ftp_msg + strlen(FTP_EPRT_CMD);
3076 ftp = skip_non_digits(ftp);
3077 if (*ftp != FTP_AF_V6 || isdigit(ftp[1])) {
3078 return CT_FTP_CTL_INVALID;
3079 }
3080 /* Jump over delimiter. */
3081 ftp += 2;
3082
bd5e81a0 3083 memset(&ip6_addr, 0, sizeof ip6_addr);
dec0dbbc 3084 char *ip_addr_start = ftp;
bd5e81a0
DB
3085 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
3086 ftp = skip_ipv6_digits(ftp);
3087 *ftp = 0;
3088 *addr_size = ftp - ip_addr_start;
3089 int rc2 = inet_pton(AF_INET6, ip_addr_start, &ip6_addr);
3090 if (rc2 != 1) {
3091 return CT_FTP_CTL_INVALID;
3092 }
3093 ftp++;
3094 *mode = CT_FTP_MODE_ACTIVE;
3095 } else {
3096 ftp = ftp_msg + strcspn(ftp_msg, "(");
3097 ftp = skip_non_digits(ftp);
3098 if (!isdigit(*ftp)) {
3099 return CT_FTP_CTL_INVALID;
3100 }
3101
3102 /* Not used for passive mode. */
3103 *addr_offset_from_ftp_data_start = 0;
3104 *addr_size = 0;
3105
3106 *mode = CT_FTP_MODE_PASSIVE;
3107 }
3108
3109 char *save_ftp = ftp;
3110 ftp = terminate_number_str(ftp, MAX_EXT_FTP_PORT_DGTS);
3111 if (!ftp) {
3112 return CT_FTP_CTL_INVALID;
3113 }
dec0dbbc 3114
bd5e81a0
DB
3115 int value;
3116 if (!str_to_int(save_ftp, 10, &value)) {
3117 return CT_FTP_CTL_INVALID;
3118 }
3119 if (value > CT_MAX_L4_PORT) {
3120 return CT_FTP_CTL_INVALID;
3121 }
3122
3123 uint16_t port_hs = value;
3124 ovs_be16 port = htons(port_hs);
3125
3126 switch (*mode) {
3127 case CT_FTP_MODE_ACTIVE:
3128 *v6_addr_rep = conn_for_expectation->rev_key.dst.addr;
3129 /* Although most servers will block this exploit, there may be some
3130 * less well managed. */
3131 if (memcmp(&ip6_addr, &v6_addr_rep->ipv6_aligned, sizeof ip6_addr) &&
3132 memcmp(&ip6_addr, &conn_for_expectation->key.src.addr.ipv6_aligned,
3133 sizeof ip6_addr)) {
3134 return CT_FTP_CTL_INVALID;
3135 }
3136 break;
3137 case CT_FTP_MODE_PASSIVE:
3138 *v6_addr_rep = conn_for_expectation->key.dst.addr;
3139 break;
7be77cb0 3140 case CT_TFTP_MODE:
bd5e81a0
DB
3141 default:
3142 OVS_NOT_REACHED();
3143 }
3144
be38342d
DB
3145 expectation_create(ct, port, conn_for_expectation,
3146 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
bd5e81a0
DB
3147 return CT_FTP_CTL_INTEREST;
3148}
3149
3150static int
3151repl_ftp_v6_addr(struct dp_packet *pkt, struct ct_addr v6_addr_rep,
3152 char *ftp_data_start,
3153 size_t addr_offset_from_ftp_data_start,
3154 size_t addr_size, enum ct_alg_mode mode)
3155{
3156 /* This is slightly bigger than really possible. */
3157 enum { MAX_FTP_V6_NAT_DELTA = 45 };
3158
3159 if (mode == CT_FTP_MODE_PASSIVE) {
3160 return 0;
3161 }
3162
3163 /* Do conservative check for pathological MTU usage. */
3164 uint32_t orig_used_size = dp_packet_size(pkt);
cd7c99a6
DB
3165 if (orig_used_size + MAX_FTP_V6_NAT_DELTA >
3166 dp_packet_get_allocated(pkt)) {
3167
bd5e81a0 3168 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
cd7c99a6
DB
3169 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP V6",
3170 dp_packet_get_allocated(pkt));
bd5e81a0
DB
3171 return 0;
3172 }
3173
298530b8 3174 char v6_addr_str[INET6_ADDRSTRLEN] = {0};
500db308 3175 ovs_assert(inet_ntop(AF_INET6, &v6_addr_rep.ipv6_aligned, v6_addr_str,
298530b8 3176 sizeof v6_addr_str));
cd7c99a6
DB
3177 modify_packet(pkt, ftp_data_start + addr_offset_from_ftp_data_start,
3178 addr_size, v6_addr_str, strlen(v6_addr_str),
3179 orig_used_size);
3180 return (int) strlen(v6_addr_str) - (int) addr_size;
bd5e81a0
DB
3181}
3182
d13d7115
DB
3183/* Increment/decrement a TCP sequence number. */
3184static void
3185adj_seqnum(ovs_16aligned_be32 *val, int32_t inc)
3186{
3187 put_16aligned_be32(val, htonl(ntohl(get_16aligned_be32(val)) + inc));
3188}
3189
bd5e81a0
DB
3190static void
3191handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
253e4dc0
DM
3192 struct dp_packet *pkt, const struct conn *ec, long long now,
3193 enum ftp_ctl_pkt ftp_ctl, bool nat)
bd5e81a0
DB
3194{
3195 struct ip_header *l3_hdr = dp_packet_l3(pkt);
3196 ovs_be32 v4_addr_rep = 0;
3197 struct ct_addr v6_addr_rep;
faa0826d 3198 size_t addr_offset_from_ftp_data_start = 0;
bd5e81a0
DB
3199 size_t addr_size = 0;
3200 char *ftp_data_start;
bd5e81a0
DB
3201 enum ct_alg_mode mode = CT_FTP_MODE_ACTIVE;
3202
3203 if (detect_ftp_ctl_type(ctx, pkt) != ftp_ctl) {
3204 return;
3205 }
3206
bd5e81a0
DB
3207 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
3208 int64_t seq_skew = 0;
dec0dbbc 3209
253e4dc0 3210 if (ftp_ctl == CT_FTP_CTL_INTEREST) {
bd5e81a0
DB
3211 enum ftp_ctl_pkt rc;
3212 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
253e4dc0 3213 rc = process_ftp_ctl_v6(ct, pkt, ec,
4417ca3d 3214 &v6_addr_rep, &ftp_data_start,
bd5e81a0
DB
3215 &addr_offset_from_ftp_data_start,
3216 &addr_size, &mode);
3217 } else {
253e4dc0 3218 rc = process_ftp_ctl_v4(ct, pkt, ec,
4417ca3d 3219 &v4_addr_rep, &ftp_data_start,
cd7c99a6
DB
3220 &addr_offset_from_ftp_data_start,
3221 &addr_size);
bd5e81a0
DB
3222 }
3223 if (rc == CT_FTP_CTL_INVALID) {
3224 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3225 VLOG_WARN_RL(&rl, "Invalid FTP control packet format");
3226 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
3227 return;
3228 } else if (rc == CT_FTP_CTL_INTEREST) {
3229 uint16_t ip_len;
dec0dbbc 3230
bd5e81a0 3231 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
253e4dc0
DM
3232 if (nat) {
3233 seq_skew = repl_ftp_v6_addr(pkt, v6_addr_rep,
3234 ftp_data_start,
3235 addr_offset_from_ftp_data_start,
3236 addr_size, mode);
3237 }
3238
bd5e81a0 3239 if (seq_skew) {
253e4dc0
DM
3240 ip_len = ntohs(nh6->ip6_ctlun.ip6_un1.ip6_un1_plen) +
3241 seq_skew;
bd5e81a0 3242 nh6->ip6_ctlun.ip6_un1.ip6_un1_plen = htons(ip_len);
bd5e81a0
DB
3243 }
3244 } else {
253e4dc0
DM
3245 if (nat) {
3246 seq_skew = repl_ftp_v4_addr(pkt, v4_addr_rep,
3247 ftp_data_start,
cd7c99a6
DB
3248 addr_offset_from_ftp_data_start,
3249 addr_size);
253e4dc0 3250 }
bd5e81a0 3251 if (seq_skew) {
253e4dc0 3252 ip_len = ntohs(l3_hdr->ip_tot_len) + seq_skew;
bd5e81a0
DB
3253 l3_hdr->ip_csum = recalc_csum16(l3_hdr->ip_csum,
3254 l3_hdr->ip_tot_len, htons(ip_len));
3255 l3_hdr->ip_tot_len = htons(ip_len);
bd5e81a0
DB
3256 }
3257 }
3258 } else {
3259 OVS_NOT_REACHED();
3260 }
bd5e81a0
DB
3261 }
3262
3263 struct tcp_header *th = dp_packet_l4(pkt);
dec0dbbc 3264
253e4dc0 3265 if (nat && ec->seq_skew != 0) {
d13d7115
DB
3266 ctx->reply != ec->seq_skew_dir ?
3267 adj_seqnum(&th->tcp_ack, -ec->seq_skew) :
3268 adj_seqnum(&th->tcp_seq, ec->seq_skew);
bd5e81a0
DB
3269 }
3270
bd5e81a0
DB
3271 th->tcp_csum = 0;
3272 uint32_t tcp_csum;
3273 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3274 tcp_csum = packet_csum_pseudoheader6(nh6);
3275 } else {
3276 tcp_csum = packet_csum_pseudoheader(l3_hdr);
3277 }
dec0dbbc
DB
3278 const char *tail = dp_packet_tail(pkt);
3279 uint8_t pad = dp_packet_l2_pad_size(pkt);
bd5e81a0
DB
3280 th->tcp_csum = csum_finish(
3281 csum_continue(tcp_csum, th, tail - (char *) th - pad));
253e4dc0
DM
3282
3283 if (seq_skew) {
3284 conn_seq_skew_set(ct, &ec->key, now, seq_skew + ec->seq_skew,
3285 ctx->reply);
3286 }
bd5e81a0 3287}
7be77cb0
DB
3288
3289static void
3290handle_tftp_ctl(struct conntrack *ct,
94e71143 3291 const struct conn_lookup_ctx *ctx OVS_UNUSED,
be38342d 3292 struct dp_packet *pkt,
7be77cb0 3293 const struct conn *conn_for_expectation,
4417ca3d
DB
3294 long long now OVS_UNUSED,
3295 enum ftp_ctl_pkt ftp_ctl OVS_UNUSED, bool nat OVS_UNUSED)
7be77cb0 3296{
be38342d
DB
3297 expectation_create(ct, conn_for_expectation->key.src.port,
3298 conn_for_expectation,
3299 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
7be77cb0 3300}