/*
 * Copyright (c) 2015, 2016, 2017 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include <ctype.h>
#include <errno.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>
#include <string.h>

#include "bitmap.h"
#include "conntrack.h"
#include "conntrack-private.h"
#include "coverage.h"
#include "csum.h"
#include "ct-dpif.h"
#include "dp-packet.h"
#include "flow.h"
#include "netdev.h"
#include "odp-netlink.h"
#include "openvswitch/hmap.h"
#include "openvswitch/vlog.h"
#include "ovs-rcu.h"
#include "ovs-thread.h"
#include "openvswitch/poll-loop.h"
#include "random.h"
#include "timeval.h"

VLOG_DEFINE_THIS_MODULE(conntrack);

COVERAGE_DEFINE(conntrack_full);
COVERAGE_DEFINE(conntrack_long_cleanup);

struct conn_lookup_ctx {
    struct conn_key key;
    struct conn *conn;
    uint32_t hash;
    bool reply;
    bool icmp_related;
};

enum ftp_ctl_pkt {
    /* Control packets with address and/or port specifiers. */
    CT_FTP_CTL_INTEREST,
    /* Control packets without address and/or port specifiers. */
    CT_FTP_CTL_OTHER,
    CT_FTP_CTL_INVALID,
};

enum ct_alg_mode {
    CT_FTP_MODE_ACTIVE,
    CT_FTP_MODE_PASSIVE,
    CT_TFTP_MODE,
};

enum ct_alg_ctl_type {
    CT_ALG_CTL_NONE,
    CT_ALG_CTL_FTP,
    CT_ALG_CTL_TFTP,
    /* SIP is not enabled through OpenFlow and is presently only used as
     * an example of an alg that allows a wildcard src IP. */
    CT_ALG_CTL_SIP,
};

static bool conn_key_extract(struct conntrack *, struct dp_packet *,
                             ovs_be16 dl_type, struct conn_lookup_ctx *,
                             uint16_t zone);
static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
static void conn_key_reverse(struct conn_key *);
static void conn_key_lookup(struct conntrack_bucket *ctb,
                            struct conn_lookup_ctx *ctx,
                            long long now);
static bool valid_new(struct dp_packet *pkt, struct conn_key *);
static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
                             struct conn_key *, long long now);
static void delete_conn(struct conn *);
static enum ct_update_res conn_update(struct conn *,
                                      struct conntrack_bucket *ctb,
                                      struct dp_packet *, bool reply,
                                      long long now);
static bool conn_expired(struct conn *, long long now);
static void set_mark(struct dp_packet *, struct conn *,
                     uint32_t val, uint32_t mask);
static void set_label(struct dp_packet *, struct conn *,
                      const struct ovs_key_ct_labels *val,
                      const struct ovs_key_ct_labels *mask);
static void *clean_thread_main(void *f_);

static struct nat_conn_key_node *
nat_conn_keys_lookup(struct hmap *nat_conn_keys,
                     const struct conn_key *key,
                     uint32_t basis);

static bool
nat_conn_keys_insert(struct hmap *nat_conn_keys,
                     const struct conn *nat_conn,
                     uint32_t hash_basis);

static void
nat_conn_keys_remove(struct hmap *nat_conn_keys,
                     const struct conn_key *key,
                     uint32_t basis);

static bool
nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
                       struct conn *nat_conn);

static uint8_t
reverse_icmp_type(uint8_t type);
static uint8_t
reverse_icmp6_type(uint8_t type);
static inline bool
extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
                const char **new_data, bool validate_checksum);
static inline bool
extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
                const char **new_data);

static struct alg_exp_node *
expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
                   uint32_t basis, bool src_ip_wc);

static int
repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
                 char *ftp_data_v4_start,
                 size_t addr_offset_from_ftp_data_start);

static enum ftp_ctl_pkt
process_ftp_ctl_v4(struct conntrack *ct,
                   struct dp_packet *pkt,
                   const struct conn *conn_for_expectation,
                   ovs_be32 *v4_addr_rep,
                   char **ftp_data_v4_start,
                   size_t *addr_offset_from_ftp_data_start);

static enum ftp_ctl_pkt
detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
                    struct dp_packet *pkt);

static void
expectation_clean(struct conntrack *ct, const struct conn_key *master_key,
                  uint32_t basis);

static struct ct_l4_proto *l4_protos[] = {
    [IPPROTO_TCP] = &ct_proto_tcp,
    [IPPROTO_UDP] = &ct_proto_other,
    [IPPROTO_ICMP] = &ct_proto_icmp4,
    [IPPROTO_ICMPV6] = &ct_proto_icmp6,
};

static void
handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
               struct dp_packet *pkt,
               const struct conn *conn_for_expectation,
               long long now, enum ftp_ctl_pkt ftp_ctl, bool nat);

static void
handle_tftp_ctl(struct conntrack *ct,
                const struct conn_lookup_ctx *ctx OVS_UNUSED,
                struct dp_packet *pkt,
                const struct conn *conn_for_expectation,
                long long now OVS_UNUSED,
                enum ftp_ctl_pkt ftp_ctl OVS_UNUSED, bool nat OVS_UNUSED);

typedef void (*alg_helper)(struct conntrack *ct,
                           const struct conn_lookup_ctx *ctx,
                           struct dp_packet *pkt,
                           const struct conn *conn_for_expectation,
                           long long now, enum ftp_ctl_pkt ftp_ctl,
                           bool nat);

static alg_helper alg_helpers[] = {
    [CT_ALG_CTL_NONE] = NULL,
    [CT_ALG_CTL_FTP] = handle_ftp_ctl,
    [CT_ALG_CTL_TFTP] = handle_tftp_ctl,
};

long long ct_timeout_val[] = {
#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
    CT_TIMEOUTS
#undef CT_TIMEOUT
};

/* The maximum TCP or UDP port number. */
#define CT_MAX_L4_PORT 65535
/* String buffer used for parsing FTP string messages.
 * This is sized about twice what is needed to leave some
 * margin of error. */
#define LARGEST_FTP_MSG_OF_INTEREST 128
/* FTP port string used in active mode. */
#define FTP_PORT_CMD "PORT"
/* FTP pasv string used in passive mode. */
#define FTP_PASV_REPLY_CODE "227"
/* Maximum decimal digits for port in FTP command.
 * The port is represented as two 3 digit numbers with the
 * high part a multiple of 256. */
#define MAX_FTP_PORT_DGTS 3

/* FTP extension EPRT string used for active mode. */
#define FTP_EPRT_CMD "EPRT"
/* FTP extension EPSV string used for passive mode. */
#define FTP_EPSV_REPLY "EXTENDED PASSIVE"
/* Maximum decimal digits for port in FTP extended command. */
#define MAX_EXT_FTP_PORT_DGTS 5
/* FTP extended command code for IPv6. */
#define FTP_AF_V6 '2'
/* Used to indicate a wildcard L4 source port number for ALGs.
 * This is used for port numbers that we cannot predict in
 * expectations. */
#define ALG_WC_SRC_PORT 0

/* If the total number of connections goes above this value, no new connections
 * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
#define DEFAULT_N_CONN_LIMIT 3000000

/* Does a member by member comparison of two conn_keys; this
 * function must be kept in sync with struct conn_key; returns 0
 * if the keys are equal or 1 if the keys are not equal. */
static int
conn_key_cmp(const struct conn_key *key1, const struct conn_key *key2)
{
    if (!memcmp(&key1->src.addr, &key2->src.addr, sizeof key1->src.addr) &&
        !memcmp(&key1->dst.addr, &key2->dst.addr, sizeof key1->dst.addr) &&
        (key1->src.icmp_id == key2->src.icmp_id) &&
        (key1->src.icmp_type == key2->src.icmp_type) &&
        (key1->src.icmp_code == key2->src.icmp_code) &&
        (key1->dst.icmp_id == key2->dst.icmp_id) &&
        (key1->dst.icmp_type == key2->dst.icmp_type) &&
        (key1->dst.icmp_code == key2->dst.icmp_code) &&
        (key1->dl_type == key2->dl_type) &&
        (key1->zone == key2->zone) &&
        (key1->nw_proto == key2->nw_proto)) {

        return 0;
    }
    return 1;
}

static void
ct_print_conn_info(const struct conn *c, const char *log_msg,
                   enum vlog_level vll, bool force, bool rl_on)
{
#define CT_VLOG(RL_ON, LEVEL, ...)                                          \
    do {                                                                    \
        if (RL_ON) {                                                        \
            static struct vlog_rate_limit rl_ = VLOG_RATE_LIMIT_INIT(5, 5); \
            vlog_rate_limit(&this_module, LEVEL, &rl_, __VA_ARGS__);        \
        } else {                                                            \
            vlog(&this_module, LEVEL, __VA_ARGS__);                         \
        }                                                                   \
    } while (0)

    if (OVS_UNLIKELY(force || vlog_is_enabled(&this_module, vll))) {
        if (c->key.dl_type == htons(ETH_TYPE_IP)) {
            CT_VLOG(rl_on, vll, "%s: src ip "IP_FMT" dst ip "IP_FMT" rev src "
                    "ip "IP_FMT" rev dst ip "IP_FMT" src/dst ports "
                    "%"PRIu16"/%"PRIu16" rev src/dst ports "
                    "%"PRIu16"/%"PRIu16" zone/rev zone "
                    "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
                    "%"PRIu8"/%"PRIu8, log_msg,
                    IP_ARGS(c->key.src.addr.ipv4_aligned),
                    IP_ARGS(c->key.dst.addr.ipv4_aligned),
                    IP_ARGS(c->rev_key.src.addr.ipv4_aligned),
                    IP_ARGS(c->rev_key.dst.addr.ipv4_aligned),
                    ntohs(c->key.src.port), ntohs(c->key.dst.port),
                    ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
                    c->key.zone, c->rev_key.zone, c->key.nw_proto,
                    c->rev_key.nw_proto);
        } else {
            char ip6_s[INET6_ADDRSTRLEN];
            inet_ntop(AF_INET6, &c->key.src.addr.ipv6, ip6_s, sizeof ip6_s);
            char ip6_d[INET6_ADDRSTRLEN];
            inet_ntop(AF_INET6, &c->key.dst.addr.ipv6, ip6_d, sizeof ip6_d);
            char ip6_rs[INET6_ADDRSTRLEN];
            inet_ntop(AF_INET6, &c->rev_key.src.addr.ipv6, ip6_rs,
                      sizeof ip6_rs);
            char ip6_rd[INET6_ADDRSTRLEN];
            inet_ntop(AF_INET6, &c->rev_key.dst.addr.ipv6, ip6_rd,
                      sizeof ip6_rd);

            CT_VLOG(rl_on, vll, "%s: src ip %s dst ip %s rev src ip %s"
                    " rev dst ip %s src/dst ports %"PRIu16"/%"PRIu16
                    " rev src/dst ports %"PRIu16"/%"PRIu16" zone/rev zone "
                    "%"PRIu16"/%"PRIu16" nw_proto/rev nw_proto "
                    "%"PRIu8"/%"PRIu8, log_msg, ip6_s, ip6_d, ip6_rs,
                    ip6_rd, ntohs(c->key.src.port), ntohs(c->key.dst.port),
                    ntohs(c->rev_key.src.port), ntohs(c->rev_key.dst.port),
                    c->key.zone, c->rev_key.zone, c->key.nw_proto,
                    c->rev_key.nw_proto);
        }
    }
}

/* Initializes the connection tracker 'ct'.  The caller is responsible for
 * calling 'conntrack_destroy()' when the instance is no longer needed. */
void
conntrack_init(struct conntrack *ct)
{
    long long now = time_msec();

    ct_rwlock_init(&ct->resources_lock);
    ct_rwlock_wrlock(&ct->resources_lock);
    hmap_init(&ct->nat_conn_keys);
    hmap_init(&ct->alg_expectations);
    hindex_init(&ct->alg_expectation_refs);
    ovs_list_init(&ct->alg_exp_list);
    ct_rwlock_unlock(&ct->resources_lock);

    for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];

        ct_lock_init(&ctb->lock);
        ct_lock_lock(&ctb->lock);
        hmap_init(&ctb->connections);
        for (unsigned j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
            ovs_list_init(&ctb->exp_lists[j]);
        }
        ct_lock_unlock(&ctb->lock);
        ovs_mutex_init(&ctb->cleanup_mutex);
        ovs_mutex_lock(&ctb->cleanup_mutex);
        ctb->next_cleanup = now + CT_TM_MIN;
        ovs_mutex_unlock(&ctb->cleanup_mutex);
    }
    ct->hash_basis = random_uint32();
    atomic_count_init(&ct->n_conn, 0);
    atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
    latch_init(&ct->clean_thread_exit);
    ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
}

/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
void
conntrack_destroy(struct conntrack *ct)
{
    latch_set(&ct->clean_thread_exit);
    pthread_join(ct->clean_thread, NULL);
    latch_destroy(&ct->clean_thread_exit);
    for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];
        struct conn *conn;

        ovs_mutex_destroy(&ctb->cleanup_mutex);
        ct_lock_lock(&ctb->lock);
        HMAP_FOR_EACH_POP (conn, node, &ctb->connections) {
            if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
                atomic_count_dec(&ct->n_conn);
            }
            delete_conn(conn);
        }
        hmap_destroy(&ctb->connections);
        ct_lock_unlock(&ctb->lock);
        ct_lock_destroy(&ctb->lock);
    }
    ct_rwlock_wrlock(&ct->resources_lock);
    struct nat_conn_key_node *nat_conn_key_node;
    HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
        free(nat_conn_key_node);
    }
    hmap_destroy(&ct->nat_conn_keys);

    struct alg_exp_node *alg_exp_node;
    HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {
        free(alg_exp_node);
    }

    ovs_list_poison(&ct->alg_exp_list);
    hmap_destroy(&ct->alg_expectations);
    hindex_destroy(&ct->alg_expectation_refs);
    ct_rwlock_unlock(&ct->resources_lock);
    ct_rwlock_destroy(&ct->resources_lock);
}
\f
static unsigned hash_to_bucket(uint32_t hash)
{
    /* Extracts the most significant bits in hash. The least significant bits
     * are already used internally by the hmap implementation. */
    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);

    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
}

static void
write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
            const struct conn_key *key, const struct alg_exp_node *alg_exp)
{
    pkt->md.ct_state |= CS_TRACKED;
    pkt->md.ct_zone = zone;
    pkt->md.ct_mark = conn ? conn->mark : 0;
    pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;

    /* Use the original direction tuple if we have it. */
    if (conn) {
        if (conn->alg_related) {
            key = &conn->master_key;
        } else {
            key = &conn->key;
        }
    } else if (alg_exp) {
        pkt->md.ct_mark = alg_exp->master_mark;
        pkt->md.ct_label = alg_exp->master_label;
        key = &alg_exp->master_key;
    }

    pkt->md.ct_orig_tuple_ipv6 = false;

    if (key) {
        if (key->dl_type == htons(ETH_TYPE_IP)) {
            pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
                key->src.addr.ipv4_aligned,
                key->dst.addr.ipv4_aligned,
                key->nw_proto != IPPROTO_ICMP
                ? key->src.port : htons(key->src.icmp_type),
                key->nw_proto != IPPROTO_ICMP
                ? key->dst.port : htons(key->src.icmp_code),
                key->nw_proto,
            };
        } else {
            pkt->md.ct_orig_tuple_ipv6 = true;
            pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
                key->src.addr.ipv6_aligned,
                key->dst.addr.ipv6_aligned,
                key->nw_proto != IPPROTO_ICMPV6
                ? key->src.port : htons(key->src.icmp_type),
                key->nw_proto != IPPROTO_ICMPV6
                ? key->dst.port : htons(key->src.icmp_code),
                key->nw_proto,
            };
        }
    } else {
        memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
    }
}

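/* Returns the IP protocol number of 'pkt', reading the IPv6 next-header
 * field when the Ethernet type is IPv6 and the IPv4 protocol field
 * otherwise. */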
static uint8_t
get_ip_proto(const struct dp_packet *pkt)
{
    uint8_t ip_proto;
    struct eth_header *l2 = dp_packet_eth(pkt);
    if (l2->eth_type == htons(ETH_TYPE_IPV6)) {
        struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
        ip_proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
    } else {
        struct ip_header *l3_hdr = dp_packet_l3(pkt);
        ip_proto = l3_hdr->ip_proto;
    }

    return ip_proto;
}

static bool
is_ftp_ctl(const enum ct_alg_ctl_type ct_alg_ctl)
{
    return ct_alg_ctl == CT_ALG_CTL_FTP;
}

static enum ct_alg_ctl_type
get_alg_ctl_type(const struct dp_packet *pkt, ovs_be16 tp_src, ovs_be16 tp_dst,
                 const char *helper)
{
    /* CT_IPPORT_FTP/TFTP is used because IPPORT_FTP/TFTP is not defined
     * on OS X, at least in in.h.  Since these values will never change,
     * remove the external dependency. */
    enum { CT_IPPORT_FTP = 21 };
    enum { CT_IPPORT_TFTP = 69 };
    uint8_t ip_proto = get_ip_proto(pkt);
    struct udp_header *uh = dp_packet_l4(pkt);
    struct tcp_header *th = dp_packet_l4(pkt);
    ovs_be16 ftp_src_port = htons(CT_IPPORT_FTP);
    ovs_be16 ftp_dst_port = htons(CT_IPPORT_FTP);
    ovs_be16 tftp_dst_port = htons(CT_IPPORT_TFTP);

    if (OVS_UNLIKELY(tp_dst)) {
        if (helper && !strncmp(helper, "ftp", strlen("ftp"))) {
            ftp_dst_port = tp_dst;
        } else if (helper && !strncmp(helper, "tftp", strlen("tftp"))) {
            tftp_dst_port = tp_dst;
        }
    } else if (OVS_UNLIKELY(tp_src)) {
        if (helper && !strncmp(helper, "ftp", strlen("ftp"))) {
            ftp_src_port = tp_src;
        }
    }

    if (ip_proto == IPPROTO_UDP && uh->udp_dst == tftp_dst_port) {
        return CT_ALG_CTL_TFTP;
    } else if (ip_proto == IPPROTO_TCP &&
               (th->tcp_src == ftp_src_port || th->tcp_dst == ftp_dst_port)) {
        return CT_ALG_CTL_FTP;
    }
    return CT_ALG_CTL_NONE;
}

static bool
alg_src_ip_wc(enum ct_alg_ctl_type alg_ctl_type)
{
    if (alg_ctl_type == CT_ALG_CTL_SIP) {
        return true;
    }
    return false;
}

static void
handle_alg_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
               struct dp_packet *pkt, enum ct_alg_ctl_type ct_alg_ctl,
               const struct conn *conn, long long now, bool nat,
               const struct conn *conn_for_expectation)
{
    /* ALG control packet handling with expectation creation. */
    if (OVS_UNLIKELY(alg_helpers[ct_alg_ctl] && conn && conn->alg)) {
        alg_helpers[ct_alg_ctl](ct, ctx, pkt, conn_for_expectation, now,
                                CT_FTP_CTL_INTEREST, nat);
    }
}

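/* Performs L4 port translation on 'pkt' in the original direction: with
 * NAT_ACTION_SRC the source port is rewritten to the reverse key's
 * destination port, and with NAT_ACTION_DST the destination port is
 * rewritten to the reverse key's source port. */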
static void
pat_packet(struct dp_packet *pkt, const struct conn *conn)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
        }
    }
}

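/* Translates the L3 addresses of 'pkt' in the original direction, using the
 * reverse key of 'conn' as the translated tuple, and sets CS_SRC_NAT or
 * CS_DST_NAT in the connection state.  L4 ports are translated as well,
 * except for 'related' ICMP error packets. */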
static void
nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        pkt->md.ct_state |= CS_SRC_NAT;
        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
            struct ip_header *nh = dp_packet_l3(pkt);
            packet_set_ipv4_addr(pkt, &nh->ip_src,
                                 conn->rev_key.dst.addr.ipv4_aligned);
        } else {
            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 nh6->ip6_src.be32,
                                 &conn->rev_key.dst.addr.ipv6_aligned,
                                 true);
        }
        if (!related) {
            pat_packet(pkt, conn);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        pkt->md.ct_state |= CS_DST_NAT;
        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
            struct ip_header *nh = dp_packet_l3(pkt);
            packet_set_ipv4_addr(pkt, &nh->ip_dst,
                                 conn->rev_key.src.addr.ipv4_aligned);
        } else {
            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 nh6->ip6_dst.be32,
                                 &conn->rev_key.src.addr.ipv6_aligned,
                                 true);
        }
        if (!related) {
            pat_packet(pkt, conn);
        }
    }
}

static void
un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
        }
    }
}

static void
reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th_in = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, conn->key.src.port,
                                th_in->tcp_dst);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh_in = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, conn->key.src.port,
                                uh_in->udp_dst);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th_in = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, th_in->tcp_src,
                                conn->key.dst.port);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh_in = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, uh_in->udp_src,
                                conn->key.dst.port);
        }
    }
}

static void
reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
{
    char *tail = dp_packet_tail(pkt);
    char pad = dp_packet_l2_pad_size(pkt);
    struct conn_key inner_key;
    const char *inner_l4 = NULL;
    uint16_t orig_l3_ofs = pkt->l3_ofs;
    uint16_t orig_l4_ofs = pkt->l4_ofs;

    if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
        struct ip_header *nh = dp_packet_l3(pkt);
        struct icmp_header *icmp = dp_packet_l4(pkt);
        struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
        extract_l3_ipv4(&inner_key, inner_l3, tail - ((char *)inner_l3) - pad,
                        &inner_l4, false);
        pkt->l3_ofs += (char *) inner_l3 - (char *) nh;
        pkt->l4_ofs += inner_l4 - (char *) icmp;

        if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
            packet_set_ipv4_addr(pkt, &inner_l3->ip_src,
                                 conn->key.src.addr.ipv4_aligned);
        } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
            packet_set_ipv4_addr(pkt, &inner_l3->ip_dst,
                                 conn->key.dst.addr.ipv4_aligned);
        }

        reverse_pat_packet(pkt, conn);
        icmp->icmp_csum = 0;
        icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
    } else {
        struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
        struct icmp6_error_header *icmp6 = dp_packet_l4(pkt);
        struct ovs_16aligned_ip6_hdr *inner_l3_6 =
            (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1);
        extract_l3_ipv6(&inner_key, inner_l3_6,
                        tail - ((char *)inner_l3_6) - pad,
                        &inner_l4);
        pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6;
        pkt->l4_ofs += inner_l4 - (char *) icmp6;

        if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 inner_l3_6->ip6_src.be32,
                                 &conn->key.src.addr.ipv6_aligned,
                                 true);
        } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 inner_l3_6->ip6_dst.be32,
                                 &conn->key.dst.addr.ipv6_aligned,
                                 true);
        }
        reverse_pat_packet(pkt, conn);
        uint32_t icmp6_csum = packet_csum_pseudoheader6(nh6);
        icmp6->icmp6_base.icmp6_cksum = 0;
        icmp6->icmp6_base.icmp6_cksum = csum_finish(
            csum_continue(icmp6_csum, icmp6, tail - (char *) icmp6 - pad));
    }
    pkt->l3_ofs = orig_l3_ofs;
    pkt->l4_ofs = orig_l4_ofs;
}

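/* Undoes the translation of 'conn' for a packet in the reply direction.
 * For 'related' ICMP error packets the translation is instead applied to
 * the packet embedded in the ICMP payload and the ICMP checksum is
 * recomputed (see reverse_nat_packet()); otherwise the L4 ports are
 * restored via un_pat_packet(). */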
static void
un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
              bool related)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        pkt->md.ct_state |= CS_DST_NAT;
        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
            struct ip_header *nh = dp_packet_l3(pkt);
            packet_set_ipv4_addr(pkt, &nh->ip_dst,
                                 conn->key.src.addr.ipv4_aligned);
        } else {
            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 nh6->ip6_dst.be32,
                                 &conn->key.src.addr.ipv6_aligned, true);
        }

        if (OVS_UNLIKELY(related)) {
            reverse_nat_packet(pkt, conn);
        } else {
            un_pat_packet(pkt, conn);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        pkt->md.ct_state |= CS_SRC_NAT;
        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
            struct ip_header *nh = dp_packet_l3(pkt);
            packet_set_ipv4_addr(pkt, &nh->ip_src,
                                 conn->key.dst.addr.ipv4_aligned);
        } else {
            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 nh6->ip6_src.be32,
                                 &conn->key.dst.addr.ipv6_aligned, true);
        }

        if (OVS_UNLIKELY(related)) {
            reverse_nat_packet(pkt, conn);
        } else {
            un_pat_packet(pkt, conn);
        }
    }
}

/* This helper is typically used in non per-packet code, because the bucket
 * lock needs to be held for the lookup and a hash would already have been
 * computed on the per-packet path.  Hence, this function is mainly intended
 * for code clarity. */
static struct conn *
conn_lookup(struct conntrack *ct, const struct conn_key *key, long long now)
{
    struct conn_lookup_ctx ctx;
    ctx.conn = NULL;
    ctx.key = *key;
    ctx.hash = conn_key_hash(key, ct->hash_basis);
    unsigned bucket = hash_to_bucket(ctx.hash);
    conn_key_lookup(&ct->buckets[bucket], &ctx, now);
    return ctx.conn;
}

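/* Records the TCP sequence number skew ('seq_skew') and its direction on
 * the connection matching 'key', if that connection still exists.  A skew
 * arises when an ALG (currently the FTP helper) rewrites an address inside
 * the payload and changes the segment length. */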
static void
conn_seq_skew_set(struct conntrack *ct, const struct conn_key *key,
                  long long now, int seq_skew, bool seq_skew_dir)
{
    unsigned bucket = hash_to_bucket(conn_key_hash(key, ct->hash_basis));
    ct_lock_lock(&ct->buckets[bucket].lock);
    struct conn *conn = conn_lookup(ct, key, now);
    if (conn && seq_skew) {
        conn->seq_skew = seq_skew;
        conn->seq_skew_dir = seq_skew_dir;
    }
    ct_lock_unlock(&ct->buckets[bucket].lock);
}

static void
nat_clean(struct conntrack *ct, struct conn *conn,
          struct conntrack_bucket *ctb)
    OVS_REQUIRES(ctb->lock)
{
    ct_rwlock_wrlock(&ct->resources_lock);
    nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
    ct_rwlock_unlock(&ct->resources_lock);
    ct_lock_unlock(&ctb->lock);
    unsigned bucket_rev_conn =
        hash_to_bucket(conn_key_hash(&conn->rev_key, ct->hash_basis));
    ct_lock_lock(&ct->buckets[bucket_rev_conn].lock);
    ct_rwlock_wrlock(&ct->resources_lock);
    long long now = time_msec();
    struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now);
    struct nat_conn_key_node *nat_conn_key_node =
        nat_conn_keys_lookup(&ct->nat_conn_keys, &conn->rev_key,
                             ct->hash_basis);

    /* In the unlikely event that the rev conn was recreated, skip the
     * rev_conn cleanup. */
    if (rev_conn && (!nat_conn_key_node ||
                     conn_key_cmp(&nat_conn_key_node->value,
                                  &rev_conn->rev_key))) {
        hmap_remove(&ct->buckets[bucket_rev_conn].connections,
                    &rev_conn->node);
        free(rev_conn);
    }

    delete_conn(conn);
    ct_rwlock_unlock(&ct->resources_lock);
    ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock);
    ct_lock_lock(&ctb->lock);
}

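/* Unlinks 'conn' from its bucket's hash map and expiration list, cleans up
 * any ALG expectations keyed on it, and frees it.  For NATed connections,
 * nat_clean() also removes the paired un-NAT connection and the entry in
 * 'nat_conn_keys'. */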
static void
conn_clean(struct conntrack *ct, struct conn *conn,
           struct conntrack_bucket *ctb)
    OVS_REQUIRES(ctb->lock)
{
    if (conn->alg) {
        expectation_clean(ct, &conn->key, ct->hash_basis);
    }
    ovs_list_remove(&conn->exp_node);
    hmap_remove(&ctb->connections, &conn->node);
    atomic_count_dec(&ct->n_conn);
    if (conn->nat_info) {
        nat_clean(ct, conn, ctb);
    } else {
        delete_conn(conn);
    }
}

static bool
ct_verify_helper(const char *helper, enum ct_alg_ctl_type ct_alg_ctl)
{
    if (ct_alg_ctl == CT_ALG_CTL_NONE) {
        return true;
    } else if (helper) {
        if ((ct_alg_ctl == CT_ALG_CTL_FTP) &&
            !strncmp(helper, "ftp", strlen("ftp"))) {
            return true;
        } else if ((ct_alg_ctl == CT_ALG_CTL_TFTP) &&
                   !strncmp(helper, "tftp", strlen("tftp"))) {
            return true;
        } else {
            return false;
        }
    } else {
        return false;
    }
}

/* This function is called with the bucket lock held. */
static struct conn *
conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
               struct conn_lookup_ctx *ctx, bool commit, long long now,
               const struct nat_action_info_t *nat_action_info,
               struct conn *conn_for_un_nat_copy,
               const char *helper,
               const struct alg_exp_node *alg_exp,
               enum ct_alg_ctl_type ct_alg_ctl)
{
    struct conn *nc = NULL;

    if (!valid_new(pkt, &ctx->key)) {
        pkt->md.ct_state = CS_INVALID;
        return nc;
    }

    pkt->md.ct_state = CS_NEW;

    if (alg_exp) {
        pkt->md.ct_state |= CS_RELATED;
    }

    if (commit) {
        unsigned int n_conn_limit;
        atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);

        if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
            COVERAGE_INC(conntrack_full);
            return nc;
        }

        unsigned bucket = hash_to_bucket(ctx->hash);
        nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
        ctx->conn = nc;
        nc->rev_key = nc->key;
        conn_key_reverse(&nc->rev_key);

        if (ct_verify_helper(helper, ct_alg_ctl)) {
            nc->alg = nullable_xstrdup(helper);
        }

        if (alg_exp) {
            nc->alg_related = true;
            nc->mark = alg_exp->master_mark;
            nc->label = alg_exp->master_label;
            nc->master_key = alg_exp->master_key;
        }

        if (nat_action_info) {
            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);

            if (alg_exp) {
                if (alg_exp->nat_rpl_dst) {
                    nc->rev_key.dst.addr = alg_exp->alg_nat_repl_addr;
                    nc->nat_info->nat_action = NAT_ACTION_SRC;
                } else {
                    nc->rev_key.src.addr = alg_exp->alg_nat_repl_addr;
                    nc->nat_info->nat_action = NAT_ACTION_DST;
                }
                *conn_for_un_nat_copy = *nc;
                ct_rwlock_wrlock(&ct->resources_lock);
                bool new_insert = nat_conn_keys_insert(&ct->nat_conn_keys,
                                                       conn_for_un_nat_copy,
                                                       ct->hash_basis);
                ct_rwlock_unlock(&ct->resources_lock);
                if (!new_insert) {
                    char *log_msg = xasprintf("Pre-existing alg "
                                              "nat_conn_key");
                    ct_print_conn_info(conn_for_un_nat_copy, log_msg, VLL_INFO,
                                       true, false);
                    free(log_msg);
                }
            } else {
                *conn_for_un_nat_copy = *nc;
                ct_rwlock_wrlock(&ct->resources_lock);
                bool nat_res = nat_select_range_tuple(ct, nc,
                                                      conn_for_un_nat_copy);

                if (!nat_res) {
                    goto nat_res_exhaustion;
                }

                /* Update nc with nat adjustments made to
                 * conn_for_un_nat_copy by nat_select_range_tuple(). */
                *nc = *conn_for_un_nat_copy;
                ct_rwlock_unlock(&ct->resources_lock);
            }
            conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
            conn_for_un_nat_copy->nat_info = NULL;
            conn_for_un_nat_copy->alg = NULL;
            nat_packet(pkt, nc, ctx->icmp_related);
        }
        hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
        atomic_count_inc(&ct->n_conn);
    }

    return nc;

    /* This would be a user error or a DoS attack.  A user error is
     * prevented by allocating enough combinations of NAT addresses when
     * combined with ephemeral ports.  A DoS attack should be protected
     * against with firewall rules or a separate firewall.  Also, using
     * zone partitioning can limit DoS impact. */
nat_res_exhaustion:
    ovs_list_remove(&nc->exp_node);
    delete_conn(nc);
    /* conn_for_un_nat_copy is a local variable in process_one; this
     * memset() serves to document that conn_for_un_nat_copy is from
     * this point on unused. */
    memset(conn_for_un_nat_copy, 0, sizeof *conn_for_un_nat_copy);
    ct_rwlock_unlock(&ct->resources_lock);
    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
    VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
                 "if DoS attack, use firewalling and/or zone partitioning.");
    return NULL;
}

static bool
conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
                  struct conn_lookup_ctx *ctx, struct conn **conn,
                  long long now, unsigned bucket)
    OVS_REQUIRES(ct->buckets[bucket].lock)
{
    bool create_new_conn = false;

    if (ctx->icmp_related) {
        pkt->md.ct_state |= CS_RELATED;
        if (ctx->reply) {
            pkt->md.ct_state |= CS_REPLY_DIR;
        }
    } else {
        if ((*conn)->alg_related) {
            pkt->md.ct_state |= CS_RELATED;
        }

        enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket],
                                             pkt, ctx->reply, now);

        switch (res) {
        case CT_UPDATE_VALID:
            pkt->md.ct_state |= CS_ESTABLISHED;
            pkt->md.ct_state &= ~CS_NEW;
            if (ctx->reply) {
                pkt->md.ct_state |= CS_REPLY_DIR;
            }
            break;
        case CT_UPDATE_INVALID:
            pkt->md.ct_state = CS_INVALID;
            break;
        case CT_UPDATE_NEW:
            conn_clean(ct, *conn, &ct->buckets[bucket]);
            create_new_conn = true;
            break;
        default:
            OVS_NOT_REACHED();
        }
    }
    return create_new_conn;
}

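/* Inserts the reverse-direction "un-NAT" connection that lets reply packets
 * be matched back to the forward connection: its key is the forward
 * connection's reverse key.  Before inserting, the entry is checked for
 * consistency against 'nat_conn_keys' (or, for ALG expectations, against a
 * pre-existing reverse connection), and is freed if the check fails. */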
static void
create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
                   long long now, bool alg_un_nat)
{
    struct conn *nc = xmemdup(conn_for_un_nat_copy, sizeof *nc);
    nc->key = conn_for_un_nat_copy->rev_key;
    nc->rev_key = conn_for_un_nat_copy->key;
    uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis);
    unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash);
    ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock);
    struct conn *rev_conn = conn_lookup(ct, &nc->key, now);

    if (alg_un_nat) {
        if (!rev_conn) {
            hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
                        &nc->node, un_nat_hash);
        } else {
            char *log_msg = xasprintf("Unusual condition for un_nat conn "
                                      "create for alg: rev_conn %p", rev_conn);
            ct_print_conn_info(nc, log_msg, VLL_INFO, true, false);
            free(log_msg);
            free(nc);
        }
    } else {
        ct_rwlock_rdlock(&ct->resources_lock);

        struct nat_conn_key_node *nat_conn_key_node =
            nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key, ct->hash_basis);
        if (nat_conn_key_node && !conn_key_cmp(&nat_conn_key_node->value,
            &nc->rev_key) && !rev_conn) {
            hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
                        &nc->node, un_nat_hash);
        } else {
            char *log_msg = xasprintf("Unusual condition for un_nat conn "
                                      "create: nat_conn_key_node/rev_conn "
                                      "%p/%p", nat_conn_key_node, rev_conn);
            ct_print_conn_info(nc, log_msg, VLL_INFO, true, false);
            free(log_msg);
            free(nc);
        }
        ct_rwlock_unlock(&ct->resources_lock);
    }
    ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock);
}

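/* Applies the NAT of 'conn' to 'pkt' (or undoes it, for packets in the
 * reply direction), unless the packet was already NATed in this zone: the
 * NAT flags in 'ct_state' are only trusted when the packet's 'ct_zone'
 * matches 'zone'. */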
static void
handle_nat(struct dp_packet *pkt, struct conn *conn,
           uint16_t zone, bool reply, bool related)
{
    if (conn->nat_info &&
        (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
         (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
          zone != pkt->md.ct_zone))) {

        if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
            pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
        }
        if (reply) {
            un_nat_packet(pkt, conn, related);
        } else {
            nat_packet(pkt, conn, related);
        }
    }
}

static bool
check_orig_tuple(struct conntrack *ct, struct dp_packet *pkt,
                 struct conn_lookup_ctx *ctx_in, long long now,
                 unsigned *bucket, struct conn **conn,
                 const struct nat_action_info_t *nat_action_info)
    OVS_REQUIRES(ct->buckets[*bucket].lock)
{
    if ((ctx_in->key.dl_type == htons(ETH_TYPE_IP) &&
         !pkt->md.ct_orig_tuple.ipv4.ipv4_proto) ||
        (ctx_in->key.dl_type == htons(ETH_TYPE_IPV6) &&
         !pkt->md.ct_orig_tuple.ipv6.ipv6_proto) ||
        !(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
        nat_action_info) {
        return false;
    }

    ct_lock_unlock(&ct->buckets[*bucket].lock);
    struct conn_lookup_ctx ctx;
    memset(&ctx, 0, sizeof ctx);
    ctx.conn = NULL;

    if (ctx_in->key.dl_type == htons(ETH_TYPE_IP)) {
        ctx.key.src.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_src;
        ctx.key.dst.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_dst;

        if (ctx_in->key.nw_proto == IPPROTO_ICMP) {
            ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
            ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
            uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv4.src_port);
            ctx.key.src.icmp_type = (uint8_t) src_port;
            ctx.key.dst.icmp_type = reverse_icmp_type(ctx.key.src.icmp_type);
        } else {
            ctx.key.src.port = pkt->md.ct_orig_tuple.ipv4.src_port;
            ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv4.dst_port;
        }
        ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv4.ipv4_proto;
    } else {
        ctx.key.src.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_src;
        ctx.key.dst.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_dst;

        if (ctx_in->key.nw_proto == IPPROTO_ICMPV6) {
            ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
            ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
            uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv6.src_port);
            ctx.key.src.icmp_type = (uint8_t) src_port;
            ctx.key.dst.icmp_type = reverse_icmp6_type(ctx.key.src.icmp_type);
        } else {
            ctx.key.src.port = pkt->md.ct_orig_tuple.ipv6.src_port;
            ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv6.dst_port;
        }
        ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv6.ipv6_proto;
    }

    ctx.key.dl_type = ctx_in->key.dl_type;
    ctx.key.zone = pkt->md.ct_zone;
    ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
    *bucket = hash_to_bucket(ctx.hash);
    ct_lock_lock(&ct->buckets[*bucket].lock);
    conn_key_lookup(&ct->buckets[*bucket], &ctx, now);
    *conn = ctx.conn;
    return *conn ? true : false;
}

static bool
is_un_nat_conn_valid(const struct conn *un_nat_conn)
{
    return un_nat_conn->conn_type == CT_CONN_TYPE_UN_NAT;
}

static bool
conn_update_state_alg(struct conntrack *ct, struct dp_packet *pkt,
                      struct conn_lookup_ctx *ctx, struct conn *conn,
                      const struct nat_action_info_t *nat_action_info,
                      enum ct_alg_ctl_type ct_alg_ctl, long long now,
                      unsigned bucket, bool *create_new_conn)
    OVS_REQUIRES(ct->buckets[bucket].lock)
{
    if (is_ftp_ctl(ct_alg_ctl)) {
        /* Keep sequence tracking in sync with the source of the
         * sequence skew. */
        if (ctx->reply != conn->seq_skew_dir) {
            handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
                           !!nat_action_info);
            *create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                 bucket);
        } else {
            *create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                 bucket);
            handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
                           !!nat_action_info);
        }
        return true;
    }
    return false;
}

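/* Runs one packet through the connection tracker: looks up the connection
 * for 'ctx' (following an un-NAT entry back to the forward connection if
 * needed), updates its state or creates a new connection, applies NAT,
 * writes the conntrack metadata into 'pkt', and finally hands ALG control
 * packets to the registered helper. */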
static void
process_one(struct conntrack *ct, struct dp_packet *pkt,
            struct conn_lookup_ctx *ctx, uint16_t zone,
            bool force, bool commit, long long now, const uint32_t *setmark,
            const struct ovs_key_ct_labels *setlabel,
            const struct nat_action_info_t *nat_action_info,
            ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper)
{
    struct conn *conn;
    unsigned bucket = hash_to_bucket(ctx->hash);
    ct_lock_lock(&ct->buckets[bucket].lock);
    conn_key_lookup(&ct->buckets[bucket], ctx, now);
    conn = ctx->conn;

    /* Delete found entry if in wrong direction. 'force' implies commit. */
    if (conn && force && ctx->reply) {
        conn_clean(ct, conn, &ct->buckets[bucket]);
        conn = NULL;
    }

    if (OVS_LIKELY(conn)) {
        if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {

            ctx->reply = true;

            struct conn_lookup_ctx ctx2;
            ctx2.conn = NULL;
            ctx2.key = conn->rev_key;
            ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis);

            ct_lock_unlock(&ct->buckets[bucket].lock);
            bucket = hash_to_bucket(ctx2.hash);

            ct_lock_lock(&ct->buckets[bucket].lock);
            conn_key_lookup(&ct->buckets[bucket], &ctx2, now);

            if (ctx2.conn) {
                conn = ctx2.conn;
            } else {
                /* This is a race where the conn has timed out and been
                 * removed between the unlock of the rev_conn and the lock
                 * of the forward conn; nothing to do. */
                pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
                ct_lock_unlock(&ct->buckets[bucket].lock);
                return;
            }
        }
    }

    bool create_new_conn = false;
    struct conn conn_for_un_nat_copy;
    conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;

    enum ct_alg_ctl_type ct_alg_ctl = get_alg_ctl_type(pkt, tp_src, tp_dst,
                                                       helper);

    if (OVS_LIKELY(conn)) {
        if (OVS_LIKELY(!conn_update_state_alg(ct, pkt, ctx, conn,
                                              nat_action_info,
                                              ct_alg_ctl, now, bucket,
                                              &create_new_conn))) {
            create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                bucket);
        }
        if (nat_action_info && !create_new_conn) {
            handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
        }

    } else if (check_orig_tuple(ct, pkt, ctx, now, &bucket, &conn,
                                nat_action_info)) {
        create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now, bucket);
    } else {
        if (ctx->icmp_related) {
            /* An icmp related conn should always be found; no new
               connection is created based on an icmp related packet. */
            pkt->md.ct_state = CS_INVALID;
        } else {
            create_new_conn = true;
        }
    }

    const struct alg_exp_node *alg_exp = NULL;

    if (OVS_UNLIKELY(create_new_conn)) {
        struct alg_exp_node alg_exp_entry;

        ct_rwlock_rdlock(&ct->resources_lock);
        alg_exp = expectation_lookup(&ct->alg_expectations, &ctx->key,
                                     ct->hash_basis,
                                     alg_src_ip_wc(ct_alg_ctl));
        if (alg_exp) {
            alg_exp_entry = *alg_exp;
            alg_exp = &alg_exp_entry;
        }
        ct_rwlock_unlock(&ct->resources_lock);

        conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
                              &conn_for_un_nat_copy, helper, alg_exp,
                              ct_alg_ctl);
    }

    write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);

    if (conn && setmark) {
        set_mark(pkt, conn, setmark[0], setmark[1]);
    }

    if (conn && setlabel) {
        set_label(pkt, conn, &setlabel[0], &setlabel[1]);
    }

    struct conn conn_for_expectation;
    if (OVS_UNLIKELY((ct_alg_ctl != CT_ALG_CTL_NONE) && conn)) {
        conn_for_expectation = *conn;
    }

    ct_lock_unlock(&ct->buckets[bucket].lock);

    if (is_un_nat_conn_valid(&conn_for_un_nat_copy)) {
        create_un_nat_conn(ct, &conn_for_un_nat_copy, now, !!alg_exp);
    }

    handle_alg_ctl(ct, ctx, pkt, ct_alg_ctl, conn, now, !!nat_action_info,
                   &conn_for_expectation);
}

/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
 * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
 * the l3 and l4 offsets properly set.
 *
 * If 'commit' is true, the packets are allowed to create new entries in the
 * connection tables.  'setmark', if not NULL, should point to a two-element
 * array containing a value and a mask to set the connection mark.
 * 'setlabel' behaves similarly for the connection label. */
int
conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
                  ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
                  const uint32_t *setmark,
                  const struct ovs_key_ct_labels *setlabel,
                  ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper,
                  const struct nat_action_info_t *nat_action_info,
                  long long now)
{
    struct dp_packet *packet;
    struct conn_lookup_ctx ctx;

    DP_PACKET_BATCH_FOR_EACH (packet, pkt_batch) {
        if (!conn_key_extract(ct, packet, dl_type, &ctx, zone)) {
            packet->md.ct_state = CS_INVALID;
            write_ct_md(packet, zone, NULL, NULL, NULL);
            continue;
        }
        process_one(ct, packet, &ctx, zone, force, commit, now, setmark,
                    setlabel, nat_action_info, tp_src, tp_dst, helper);
    }

    return 0;
}

void
conntrack_clear(struct dp_packet *packet)
{
    /* According to pkt_metadata_init(), ct_state == 0 is enough to make all of
     * the conntrack fields invalid. */
    packet->md.ct_state = 0;
}

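/* Sets the connection mark in both the packet metadata and 'conn', applying
 * 'val' under 'mask' to the existing mark.  For ALG related connections the
 * mark inherited from the master connection is preserved instead. */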
static void
set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
{
    if (conn->alg_related) {
        pkt->md.ct_mark = conn->mark;
    } else {
        pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
        conn->mark = pkt->md.ct_mark;
    }
}

static void
set_label(struct dp_packet *pkt, struct conn *conn,
          const struct ovs_key_ct_labels *val,
          const struct ovs_key_ct_labels *mask)
{
    if (conn->alg_related) {
        pkt->md.ct_label = conn->label;
    } else {
        ovs_u128 v, m;

        memcpy(&v, val, sizeof v);
        memcpy(&m, mask, sizeof m);

        pkt->md.ct_label.u64.lo = v.u64.lo
                                  | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
        pkt->md.ct_label.u64.hi = v.u64.hi
                                  | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
        conn->label = pkt->md.ct_label;
    }
}

\f
/* Deletes the expired connections from 'ctb', up to 'limit'.  Returns the
 * earliest expiration time among the remaining connections in 'ctb'.
 * Returns LLONG_MAX if 'ctb' is empty.  The return value might be smaller
 * than 'now' if 'limit' is reached. */
static long long
sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb,
             long long now, size_t limit)
    OVS_REQUIRES(ctb->lock)
{
    struct conn *conn, *next;
    long long min_expiration = LLONG_MAX;
    size_t count = 0;

    for (unsigned i = 0; i < N_CT_TM; i++) {
        LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
            if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
                if (!conn_expired(conn, now) || count >= limit) {
                    min_expiration = MIN(min_expiration, conn->expiration);
                    if (count >= limit) {
                        /* Do not check other lists. */
                        COVERAGE_INC(conntrack_long_cleanup);
                        return min_expiration;
                    }
                    break;
                }
                conn_clean(ct, conn, ctb);
                count++;
            }
        }
    }
    return min_expiration;
}

/* Cleans up old connection entries from 'ct'.  Returns the time when the
 * next expiration might happen.  The return value might be smaller than
 * 'now', meaning that an internal limit has been reached, and some expired
 * connections have not been deleted. */
static long long
conntrack_clean(struct conntrack *ct, long long now)
{
    long long next_wakeup = now + CT_TM_MIN;
    unsigned int n_conn_limit;
    size_t clean_count = 0;

    atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);

    for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];
        size_t prev_count;
        long long min_exp;

        ovs_mutex_lock(&ctb->cleanup_mutex);
        if (ctb->next_cleanup > now) {
            goto next_bucket;
        }

        ct_lock_lock(&ctb->lock);
        prev_count = hmap_count(&ctb->connections);
        /* If the connections are well distributed among buckets, we want to
         * limit to 10% of the global limit equally split among buckets. If
         * the bucket is busier than the others, we limit to 10% of its
         * current size. */
        min_exp = sweep_bucket(ct, ctb, now,
                MAX(prev_count / 10, n_conn_limit / (CONNTRACK_BUCKETS * 10)));
        clean_count += prev_count - hmap_count(&ctb->connections);

        if (min_exp > now) {
            /* We call hmap_shrink() only if sweep_bucket() managed to delete
             * every expired connection. */
            hmap_shrink(&ctb->connections);
        }

        ct_lock_unlock(&ctb->lock);

        ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);

next_bucket:
        next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
        ovs_mutex_unlock(&ctb->cleanup_mutex);
    }

    VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
             clean_count, time_msec() - now);

    return next_wakeup;
}

/* Cleanup:
 *
 * We must call conntrack_clean() periodically.  conntrack_clean()'s return
 * value gives a hint on when the next cleanup must be done (either because
 * there is an actual connection that expires, or because a new connection
 * might be created with the minimum timeout).
 *
 * The logic below has two goals:
 *
 * - We want to reduce the number of wakeups and batch connection cleanup
 *   when the load is not very high.  CT_CLEAN_INTERVAL ensures that if we
 *   are coping with the current cleanup tasks, then we wait at least
 *   5 seconds to do further cleanup.
 *
 * - We don't want to keep the buckets locked too long, as we might prevent
 *   traffic from flowing.  CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
 *   behind, there are at least some 200ms blocks of time when the buckets
 *   will be left alone, so the datapath can operate unhindered.
 */
#define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
#define CT_CLEAN_MIN_INTERVAL 200  /* 0.2 seconds */

static void *
clean_thread_main(void *f_)
{
    struct conntrack *ct = f_;

    while (!latch_is_set(&ct->clean_thread_exit)) {
        long long next_wake;
        long long now = time_msec();
        next_wake = conntrack_clean(ct, now);

        if (next_wake < now) {
            poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
        } else {
            poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
        }
        latch_wait(&ct->clean_thread_exit);
        poll_block();
    }

    return NULL;
}
\f
/* Key extraction */

/* The function stores a pointer to the first byte after the header in
 * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 * not interested in the header's tail, meaning that the header has
 * already been parsed (e.g. by flow_extract): we take this as a hint to
 * save a few checks.  If 'validate_checksum' is true, the function returns
 * false if the IPv4 checksum is invalid. */
static inline bool
extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
                const char **new_data, bool validate_checksum)
{
    if (new_data) {
        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
            return false;
        }
    }

    const struct ip_header *ip = data;
    size_t ip_len = IP_IHL(ip->ip_ihl_ver) * 4;

    if (new_data) {
        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
            return false;
        }
        if (OVS_UNLIKELY(size < ip_len)) {
            return false;
        }

        if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
            return false;
        }

        *new_data = (char *) data + ip_len;
    }

    if (validate_checksum && csum(data, ip_len) != 0) {
        return false;
    }

    key->src.addr.ipv4 = ip->ip_src;
    key->dst.addr.ipv4 = ip->ip_dst;
    key->nw_proto = ip->ip_proto;

    return true;
}

/* The function stores a pointer to the first byte after the header in
 * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 * not interested in the header's tail, meaning that the header has
 * already been parsed (e.g. by flow_extract): we take this as a hint to
 * save a few checks. */
static inline bool
extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
                const char **new_data)
{
    const struct ovs_16aligned_ip6_hdr *ip6 = data;

    if (new_data) {
        if (OVS_UNLIKELY(size < sizeof *ip6)) {
            return false;
        }
    }

    data = ip6 + 1;
    size -= sizeof *ip6;
    uint8_t nw_proto = ip6->ip6_nxt;
    uint8_t nw_frag = 0;

    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
        return false;
    }

    if (nw_frag) {
        return false;
    }

    if (new_data) {
        *new_data = data;
    }

    key->src.addr.ipv6 = ip6->ip6_src;
    key->dst.addr.ipv6 = ip6->ip6_dst;
    key->nw_proto = nw_proto;

    return true;
}

static inline bool
checksum_valid(const struct conn_key *key, const void *data, size_t size,
               const void *l3)
{
    uint32_t csum = 0;

    if (key->dl_type == htons(ETH_TYPE_IP)) {
        csum = packet_csum_pseudoheader(l3);
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
        csum = packet_csum_pseudoheader6(l3);
    } else {
        return false;
    }

    csum = csum_continue(csum, data, size);

    return csum_finish(csum) == 0;
}

static inline bool
check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
             const void *l3, bool validate_checksum)
{
    const struct tcp_header *tcp = data;
    if (size < sizeof *tcp) {
        return false;
    }

    size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
    if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
        return false;
    }

    return validate_checksum ? checksum_valid(key, data, size, l3) : true;
}

static inline bool
check_l4_udp(const struct conn_key *key, const void *data, size_t size,
             const void *l3, bool validate_checksum)
{
    const struct udp_header *udp = data;
    if (size < sizeof *udp) {
        return false;
    }

    size_t udp_len = ntohs(udp->udp_len);
    if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
        return false;
    }

    /* Validation must be skipped if checksum is 0 on IPv4 packets */
    return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
           || (validate_checksum ? checksum_valid(key, data, size, l3) : true);
}

static inline bool
check_l4_icmp(const void *data, size_t size, bool validate_checksum)
{
    return validate_checksum ? csum(data, size) == 0 : true;
}

static inline bool
check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
               const void *l3, bool validate_checksum)
{
    return validate_checksum ? checksum_valid(key, data, size, l3) : true;
}

static inline bool
extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
{
    if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
        return false;
    }

    const struct tcp_header *tcp = data;
    key->src.port = tcp->tcp_src;
    key->dst.port = tcp->tcp_dst;

    /* Port 0 is invalid */
    return key->src.port && key->dst.port;
}

static inline bool
extract_l4_udp(struct conn_key *key, const void *data, size_t size)
{
    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
        return false;
    }

    const struct udp_header *udp = data;
    key->src.port = udp->udp_src;
    key->dst.port = udp->udp_dst;

    /* Port 0 is invalid */
    return key->src.port && key->dst.port;
}

static inline bool extract_l4(struct conn_key *key, const void *data,
                              size_t size, bool *related, const void *l3,
                              bool validate_checksum);

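/* Maps an ICMPv4 informational request type to the corresponding reply type
 * and vice versa, so that both directions of an ICMP "connection" produce
 * matching keys. */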
1690 static uint8_t
1691 reverse_icmp_type(uint8_t type)
1692 {
1693 switch (type) {
1694 case ICMP4_ECHO_REQUEST:
1695 return ICMP4_ECHO_REPLY;
1696 case ICMP4_ECHO_REPLY:
1697 return ICMP4_ECHO_REQUEST;
1698
1699 case ICMP4_TIMESTAMP:
1700 return ICMP4_TIMESTAMPREPLY;
1701 case ICMP4_TIMESTAMPREPLY:
1702 return ICMP4_TIMESTAMP;
1703
1704 case ICMP4_INFOREQUEST:
1705 return ICMP4_INFOREPLY;
1706 case ICMP4_INFOREPLY:
1707 return ICMP4_INFOREQUEST;
1708 default:
1709 OVS_NOT_REACHED();
1710 }
1711 }
1712
1713 /* If 'related' is not NULL and the function is processing an ICMP
1714 * error packet, extract the l3 and l4 fields from the nested header
1715 * instead and set *related to true. If 'related' is NULL we're
1716 * already processing a nested header and no such recursion is
1717 * possible */
1718 static inline int
1719 extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
1720 bool *related)
1721 {
1722 if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
1723 return false;
1724 }
1725
1726 const struct icmp_header *icmp = data;
1727
1728 switch (icmp->icmp_type) {
1729 case ICMP4_ECHO_REQUEST:
1730 case ICMP4_ECHO_REPLY:
1731 case ICMP4_TIMESTAMP:
1732 case ICMP4_TIMESTAMPREPLY:
1733 case ICMP4_INFOREQUEST:
1734 case ICMP4_INFOREPLY:
1735 if (icmp->icmp_code != 0) {
1736 return false;
1737 }
1738 /* Separate ICMP connection: identified using id */
1739 key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
1740 key->src.icmp_type = icmp->icmp_type;
1741 key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
1742 break;
1743 case ICMP4_DST_UNREACH:
1744 case ICMP4_TIME_EXCEEDED:
1745 case ICMP4_PARAM_PROB:
1746 case ICMP4_SOURCEQUENCH:
1747 case ICMP4_REDIRECT: {
1748 /* ICMP packet part of another connection. We should
1749 * extract the key from the embedded packet header. */
1750 struct conn_key inner_key;
1751 const char *l3 = (const char *) (icmp + 1);
1752 const char *tail = (const char *) data + size;
1753 const char *l4;
1754
1755 if (!related) {
1756 return false;
1757 }
1758
1759 memset(&inner_key, 0, sizeof inner_key);
1760 inner_key.dl_type = htons(ETH_TYPE_IP);
1761 bool ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
1762 if (!ok) {
1763 return false;
1764 }
1765
1766 if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned) {
1767 return false;
1768 }
1769
1770 key->src = inner_key.src;
1771 key->dst = inner_key.dst;
1772 key->nw_proto = inner_key.nw_proto;
1773
1774 ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
1775 if (ok) {
1776 conn_key_reverse(key);
1777 *related = true;
1778 }
1779 return ok;
1780 }
1781 default:
1782 return false;
1783 }
1784
1785 return true;
1786 }
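
/* Illustrative walk-through (not part of the original code): suppose
 * 10.0.0.1 opens a TCP connection to 10.0.0.2:80 and a router sends an
 * ICMP destination-unreachable back to 10.0.0.1 with the original
 * IPv4/TCP header embedded. The check above verifies that the embedded
 * source (10.0.0.1) matches the outer destination, the key is rebuilt
 * from the embedded header and then reversed, so the lookup finds the
 * original connection in the reply direction with *related set. */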
1787
1788 static uint8_t
1789 reverse_icmp6_type(uint8_t type)
1790 {
1791 switch (type) {
1792 case ICMP6_ECHO_REQUEST:
1793 return ICMP6_ECHO_REPLY;
1794 case ICMP6_ECHO_REPLY:
1795 return ICMP6_ECHO_REQUEST;
1796 default:
1797 OVS_NOT_REACHED();
1798 }
1799 }
1800
1801 /* If 'related' is not NULL and the function is processing an ICMP
1802 * error packet, extract the l3 and l4 fields from the nested header
1803 * instead and set *related to true. If 'related' is NULL, we're
1804 * already processing a nested header and no such recursion is
1805 * possible. */
1806 static inline bool
1807 extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
1808 bool *related)
1809 {
1810 const struct icmp6_header *icmp6 = data;
1811
1812 /* All the messages that we support need at least 4 bytes after
1813 * the header */
1814 if (size < sizeof *icmp6 + 4) {
1815 return false;
1816 }
1817
1818 switch (icmp6->icmp6_type) {
1819 case ICMP6_ECHO_REQUEST:
1820 case ICMP6_ECHO_REPLY:
1821 if (icmp6->icmp6_code != 0) {
1822 return false;
1823 }
1824 /* Separate ICMP connection: identified using id */
1825 key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
1826 key->src.icmp_type = icmp6->icmp6_type;
1827 key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
1828 break;
1829 case ICMP6_DST_UNREACH:
1830 case ICMP6_PACKET_TOO_BIG:
1831 case ICMP6_TIME_EXCEEDED:
1832 case ICMP6_PARAM_PROB: {
1833 /* ICMP packet part of another connection. We should
1834 * extract the key from the embedded packet header. */
1835 struct conn_key inner_key;
1836 const char *l3 = (const char *) icmp6 + 8;
1837 const char *tail = (const char *) data + size;
1838 const char *l4 = NULL;
1839
1840 if (!related) {
1841 return false;
1842 }
1843
1844 memset(&inner_key, 0, sizeof inner_key);
1845 inner_key.dl_type = htons(ETH_TYPE_IPV6);
1846 bool ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
1847 if (!ok) {
1848 return false;
1849 }
1850
1851 /* pf (the BSD packet filter) doesn't do this, but it seems a good idea. */
1852 if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
1853 &key->dst.addr.ipv6_aligned)) {
1854 return false;
1855 }
1856
1857 key->src = inner_key.src;
1858 key->dst = inner_key.dst;
1859 key->nw_proto = inner_key.nw_proto;
1860
1861 ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
1862 if (ok) {
1863 conn_key_reverse(key);
1864 *related = true;
1865 }
1866 return ok;
1867 }
1868 default:
1869 return false;
1870 }
1871
1872 return true;
1873 }
1874
1875 /* Extract l4 fields into 'key', which must already contain valid l3
1876 * members.
1877 *
1878 * If 'related' is not NULL and an ICMP error packet is being
1879 * processed, the function will extract the key from the packet nested
1880 * in the ICMP payload and set '*related' to true.
1881 *
1882 * If 'related' is NULL, it means that we're already parsing a header nested
1883 * in an ICMP error. In this case, we skip checksum and length validation. */
1884 static inline bool
1885 extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
1886 const void *l3, bool validate_checksum)
1887 {
1888 if (key->nw_proto == IPPROTO_TCP) {
1889 return (!related || check_l4_tcp(key, data, size, l3,
1890 validate_checksum)) && extract_l4_tcp(key, data, size);
1891 } else if (key->nw_proto == IPPROTO_UDP) {
1892 return (!related || check_l4_udp(key, data, size, l3,
1893 validate_checksum)) && extract_l4_udp(key, data, size);
1894 } else if (key->dl_type == htons(ETH_TYPE_IP)
1895 && key->nw_proto == IPPROTO_ICMP) {
1896 return (!related || check_l4_icmp(data, size, validate_checksum))
1897 && extract_l4_icmp(key, data, size, related);
1898 } else if (key->dl_type == htons(ETH_TYPE_IPV6)
1899 && key->nw_proto == IPPROTO_ICMPV6) {
1900 return (!related || check_l4_icmp6(key, data, size, l3,
1901 validate_checksum)) && extract_l4_icmp6(key, data, size,
1902 related);
1903 } else {
1904 return false;
1905 }
1906 }
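
/* Sketch of a typical call (illustrative only, assuming 'key' already
 * holds valid l3 fields and 'l4'/'l4_size' delimit the l4 header):
 *
 *     bool related = false;
 *     if (extract_l4(&key, l4, l4_size, &related, l3, true)) {
 *         // For plain TCP/UDP, key.src.port and key.dst.port are now
 *         // set and 'related' remains false; for an ICMP error the
 *         // key describes the embedded connection and 'related' is
 *         // true.
 *     }
 */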
1907
1908 static bool
1909 conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
1910 struct conn_lookup_ctx *ctx, uint16_t zone)
1911 {
1912 const struct eth_header *l2 = dp_packet_eth(pkt);
1913 const struct ip_header *l3 = dp_packet_l3(pkt);
1914 const char *l4 = dp_packet_l4(pkt);
1915
1916 memset(ctx, 0, sizeof *ctx);
1917
1918 if (!l2 || !l3 || !l4) {
1919 return false;
1920 }
1921
1922 ctx->key.zone = zone;
1923
1924 /* XXX In this function we parse the packet (again, it has already
1925 * gone through miniflow_extract()) for two reasons:
1926 *
1927 * 1) To extract the l3 addresses and l4 ports.
1928 * We already have the l3 and l4 headers' pointers. Extracting
1929 * the l3 addresses and the l4 ports is really cheap, since they
1930 * can be found at fixed locations.
1931 * 2) To extract the l4 type.
1932 * Extracting the l4 type can be quite expensive for IPv6, because
1933 * it's not at a fixed location.
1934 *
1935 * Here's a way to avoid (2) with the help of the datapath.
1936 * The datapath doesn't keep the packet's extracted flow[1], so
1937 * using that is not an option. We could use the packet's matching
1938 * megaflow, but we have to make sure that the l4 type (nw_proto)
1939 * is unwildcarded. This means either:
1940 *
1941 * a) dpif-netdev unwildcards the l4 type when a new flow is installed
1942 * if the actions contain ct().
1943 *
1944 * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
1945 * action. This is already done in different actions, but it's
1946 * unnecessary for the kernel.
1947 *
1948 * ---
1949 * [1] The reasons for this are that keeping the flow increases
1950 * (slightly) the cache footprint and increases computation
1951 * time as we move the packet around. Most importantly, the flow
1952 * should be updated by the actions and this can be slow, as
1953 * we use a sparse representation (miniflow).
1954 *
1955 */
1956 const char *tail = dp_packet_tail(pkt);
1957 bool ok;
1958 ctx->key.dl_type = dl_type;
1959
1960 if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
1961 bool hwol_bad_l3_csum = dp_packet_ip_checksum_bad(pkt);
1962 if (hwol_bad_l3_csum) {
1963 ok = false;
1964 } else {
1965 bool hwol_good_l3_csum = dp_packet_ip_checksum_valid(pkt);
1966 /* Validate the checksum in software only when hardware offload has not validated it. */
1967 ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL,
1968 !hwol_good_l3_csum);
1969 }
1970 } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
1971 ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
1972 } else {
1973 ok = false;
1974 }
1975
1976 if (ok) {
1977 bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt);
1978 if (!hwol_bad_l4_csum) {
1979 bool hwol_good_l4_csum = dp_packet_l4_checksum_valid(pkt);
1980 /* Validate the checksum in software only when hardware offload has not validated it. */
1981 if (extract_l4(&ctx->key, l4, tail - l4, &ctx->icmp_related, l3,
1982 !hwol_good_l4_csum)) {
1983 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
1984 return true;
1985 }
1986 }
1987 }
1988
1989 return false;
1990 }
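
/* Summary of the checksum-offload decisions above (derived from the
 * code, for reference):
 *
 *     hw reports bad csum   -> key extraction fails, packet not tracked
 *     hw reports good csum  -> software validation skipped
 *     hw reports nothing    -> checksum validated in software
 */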
1991
1992 static uint32_t
1993 ct_addr_hash_add(uint32_t hash, const struct ct_addr *addr)
1994 {
1995 BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
1996 return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
1997 }
1998
1999 static uint32_t
2000 ct_endpoint_hash_add(uint32_t hash, const struct ct_endpoint *ep)
2001 {
2002 BUILD_ASSERT_DECL(sizeof *ep % 4 == 0);
2003 return hash_add_bytes32(hash, (const uint32_t *) ep, sizeof *ep);
2004 }
2005 \f
2006 /* Symmetric */
2007 static uint32_t
2008 conn_key_hash(const struct conn_key *key, uint32_t basis)
2009 {
2010 uint32_t hsrc, hdst, hash;
2011 hsrc = hdst = basis;
2012 hsrc = ct_endpoint_hash_add(hsrc, &key->src);
2013 hdst = ct_endpoint_hash_add(hdst, &key->dst);
2014
2015 /* Even if source and destination are swapped, the hash will be the same. */
2016 hash = hsrc ^ hdst;
2017
2018 /* Hash the rest of the key (L3 and L4 types and zone). */
2019 hash = hash_words((uint32_t *) (&key->dst + 1),
2020 (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
2021 hash);
2022
2023 return hash_finish(hash, 0);
2024 }
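
/* Illustrative property check (not part of the original code): because
 * hsrc and hdst are combined with XOR, a key and its reverse hash to
 * the same value:
 *
 *     uint32_t h1 = conn_key_hash(&key, basis);
 *     conn_key_reverse(&key);
 *     uint32_t h2 = conn_key_hash(&key, basis);
 *     ovs_assert(h1 == h2);
 */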
2025
2026 static void
2027 conn_key_reverse(struct conn_key *key)
2028 {
2029 struct ct_endpoint tmp = key->src;
2030 key->src = key->dst;
2031 key->dst = tmp;
2032 }
2033
2034 static uint32_t
2035 nat_ipv6_addrs_delta(struct in6_addr *ipv6_aligned_min,
2036 struct in6_addr *ipv6_aligned_max)
2037 {
2038 uint8_t *ipv6_min_hi = &ipv6_aligned_min->s6_addr[0];
2039 uint8_t *ipv6_min_lo = &ipv6_aligned_min->s6_addr[0] + sizeof(uint64_t);
2040 uint8_t *ipv6_max_hi = &ipv6_aligned_max->s6_addr[0];
2041 uint8_t *ipv6_max_lo = &ipv6_aligned_max->s6_addr[0] + sizeof(uint64_t);
2042
2043 ovs_be64 addr6_64_min_hi;
2044 ovs_be64 addr6_64_min_lo;
2045 memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
2046 memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);
2047
2048 ovs_be64 addr6_64_max_hi;
2049 ovs_be64 addr6_64_max_lo;
2050 memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
2051 memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);
2052
2053 uint64_t diff;
2054
2055 if (addr6_64_min_hi == addr6_64_max_hi &&
2056 ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
2057 diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
2058 } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
2059 ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
2060 diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
2061 ntohll(addr6_64_max_lo) - 1);
2062 } else {
2063 /* Limit the supported address delta to 32 bits (about 4 billion).
2064 * Possibly this should be visible to the user through a datapath
2065 * support check; however, the practical impact is probably nil. */
2066 diff = 0xfffffffe;
2067 }
2068
2069 if (diff > 0xfffffffe) {
2070 diff = 0xfffffffe;
2071 }
2072 return diff;
2073 }
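
/* Worked example (illustrative): with min = 2001:db8::1 and
 * max = 2001:db8::100 the high 64 bits are equal, so the delta is
 * 0x100 - 0x1 = 0xff. Deltas that would not fit in 32 bits are clamped
 * to 0xfffffffe above. */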
2074
2075 /* This function must be used in tandem with nat_ipv6_addrs_delta(), which
2076 * restricts the input parameters. */
2077 static void
2078 nat_ipv6_addr_increment(struct in6_addr *ipv6_aligned, uint32_t increment)
2079 {
2080 uint8_t *ipv6_hi = &ipv6_aligned->s6_addr[0];
2081 uint8_t *ipv6_lo = &ipv6_aligned->s6_addr[0] + sizeof(ovs_be64);
2082 ovs_be64 addr6_64_hi;
2083 ovs_be64 addr6_64_lo;
2084 memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
2085 memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);
2086
2087 if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
2088 addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
2089 } else if (addr6_64_hi != OVS_BE64_MAX) {
2090 addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
2091 addr6_64_lo = htonll(increment - (UINT64_MAX -
2092 ntohll(addr6_64_lo) + 1));
2093 } else {
2094 OVS_NOT_REACHED();
2095 }
2096
2097 memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
2098 memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
2099
2100 return;
2101 }
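
/* Worked example (illustrative): incrementing
 * 2001:db8::ffff:ffff:ffff:ffff by 1 overflows the low 64 bits, so the
 * high half is bumped and the result is 2001:db8:0:1::. */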
2102
2103 static uint32_t
2104 nat_range_hash(const struct conn *conn, uint32_t basis)
2105 {
2106 uint32_t hash = basis;
2107
2108 hash = ct_addr_hash_add(hash, &conn->nat_info->min_addr);
2109 hash = ct_addr_hash_add(hash, &conn->nat_info->max_addr);
2110 hash = hash_add(hash,
2111 (conn->nat_info->max_port << 16)
2112 | conn->nat_info->min_port);
2113 hash = ct_endpoint_hash_add(hash, &conn->key.src);
2114 hash = ct_endpoint_hash_add(hash, &conn->key.dst);
2115 hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
2116 hash = hash_add(hash, conn->key.nw_proto);
2117 hash = hash_add(hash, conn->key.zone);
2118
2119 /* The purpose of the second parameter is to distinguish hashes of data of
2120 * different length; our data always has the same length so there is no
2121 * value in counting. */
2122 return hash_finish(hash, 0);
2123 }
2124
2125 static bool
2126 nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
2127 struct conn *nat_conn)
2128 {
2129 enum { MIN_NAT_EPHEMERAL_PORT = 1024,
2130 MAX_NAT_EPHEMERAL_PORT = 65535 };
2131
2132 uint16_t min_port;
2133 uint16_t max_port;
2134 uint16_t first_port;
2135 uint32_t hash = nat_range_hash(conn, ct->hash_basis);
2136
2137 if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
2138 (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
2139 min_port = ntohs(conn->key.src.port);
2140 max_port = ntohs(conn->key.src.port);
2141 first_port = min_port;
2142 } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
2143 (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
2144 min_port = ntohs(conn->key.dst.port);
2145 max_port = ntohs(conn->key.dst.port);
2146 first_port = min_port;
2147 } else {
2148 uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
2149 uint32_t port_index = hash % (deltap + 1);
2150 first_port = conn->nat_info->min_port + port_index;
2151 min_port = conn->nat_info->min_port;
2152 max_port = conn->nat_info->max_port;
2153 }
2154
2155 uint32_t deltaa = 0;
2156 uint32_t address_index;
2157 struct ct_addr ct_addr;
2158 memset(&ct_addr, 0, sizeof ct_addr);
2159 struct ct_addr max_ct_addr;
2160 memset(&max_ct_addr, 0, sizeof max_ct_addr);
2161 max_ct_addr = conn->nat_info->max_addr;
2162
2163 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
2164 deltaa = ntohl(conn->nat_info->max_addr.ipv4_aligned) -
2165 ntohl(conn->nat_info->min_addr.ipv4_aligned);
2166 address_index = hash % (deltaa + 1);
2167 ct_addr.ipv4_aligned = htonl(
2168 ntohl(conn->nat_info->min_addr.ipv4_aligned) + address_index);
2169 } else {
2170 deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6_aligned,
2171 &conn->nat_info->max_addr.ipv6_aligned);
2172 /* deltaa must be within 32 bits for full hash coverage. A 64 or
2173 * 128 bit hash is unnecessary and hence not used here. Most code
2174 * is kept common with V4; nat_ipv6_addrs_delta() will do the
2175 * enforcement via max_ct_addr. */
2176 max_ct_addr = conn->nat_info->min_addr;
2177 nat_ipv6_addr_increment(&max_ct_addr.ipv6_aligned, deltaa);
2178 address_index = hash % (deltaa + 1);
2179 ct_addr.ipv6_aligned = conn->nat_info->min_addr.ipv6_aligned;
2180 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, address_index);
2181 }
2182
2183 uint16_t port = first_port;
2184 bool all_ports_tried = false;
2185 bool original_ports_tried = false;
2186 struct ct_addr first_addr = ct_addr;
2187
2188 while (true) {
2189 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
2190 nat_conn->rev_key.dst.addr = ct_addr;
2191 } else {
2192 nat_conn->rev_key.src.addr = ct_addr;
2193 }
2194
2195 if ((conn->key.nw_proto == IPPROTO_ICMP) ||
2196 (conn->key.nw_proto == IPPROTO_ICMPV6)) {
2197 all_ports_tried = true;
2198 } else if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
2199 nat_conn->rev_key.dst.port = htons(port);
2200 } else {
2201 nat_conn->rev_key.src.port = htons(port);
2202 }
2203
2204 bool new_insert = nat_conn_keys_insert(&ct->nat_conn_keys, nat_conn,
2205 ct->hash_basis);
2206 if (new_insert) {
2207 return true;
2208 } else if (!all_ports_tried) {
2209 if (min_port == max_port) {
2210 all_ports_tried = true;
2211 } else if (port == max_port) {
2212 port = min_port;
2213 } else {
2214 port++;
2215 }
2216 if (port == first_port) {
2217 all_ports_tried = true;
2218 }
2219 } else {
2220 if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
2221 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
2222 ct_addr.ipv4_aligned = htonl(
2223 ntohl(ct_addr.ipv4_aligned) + 1);
2224 } else {
2225 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, 1);
2226 }
2227 } else {
2228 ct_addr = conn->nat_info->min_addr;
2229 }
2230 if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
2231 if (!original_ports_tried) {
2232 original_ports_tried = true;
2233 ct_addr = conn->nat_info->min_addr;
2234 min_port = MIN_NAT_EPHEMERAL_PORT;
2235 max_port = MAX_NAT_EPHEMERAL_PORT;
2236 } else {
2237 break;
2238 }
2239 }
2240 first_port = min_port;
2241 port = first_port;
2242 all_ports_tried = false;
2243 }
2244 }
2245 return false;
2246 }
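
/* Summary of the search order above (derived from the code): starting
 * from a hashed (address, port) pair, every port in [min_port,
 * max_port] is tried at the current address, then the address is
 * advanced, wrapping from max_addr back to min_addr. After one full
 * sweep of the addresses, a second sweep is made with the ephemeral
 * port range 1024-65535; only when that also finds no free tuple does
 * the function return false. For ICMP, ports are irrelevant, so only
 * the addresses are swept. */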
2247
2248 /* This function must be called with the ct->resources lock taken. */
2249 static struct nat_conn_key_node *
2250 nat_conn_keys_lookup(struct hmap *nat_conn_keys,
2251 const struct conn_key *key,
2252 uint32_t basis)
2253 {
2254 struct nat_conn_key_node *nat_conn_key_node;
2255
2256 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node,
2257 conn_key_hash(key, basis), nat_conn_keys) {
2258 if (!conn_key_cmp(&nat_conn_key_node->key, key)) {
2259 return nat_conn_key_node;
2260 }
2261 }
2262 return NULL;
2263 }
2264
2265 /* This function must be called with the ct->resources lock taken. */
2266 static bool
2267 nat_conn_keys_insert(struct hmap *nat_conn_keys, const struct conn *nat_conn,
2268 uint32_t basis)
2269 {
2270 struct nat_conn_key_node *nat_conn_key_node =
2271 nat_conn_keys_lookup(nat_conn_keys, &nat_conn->rev_key, basis);
2272
2273 if (!nat_conn_key_node) {
2274 struct nat_conn_key_node *nat_conn_key = xzalloc(sizeof *nat_conn_key);
2275 nat_conn_key->key = nat_conn->rev_key;
2276 nat_conn_key->value = nat_conn->key;
2277 hmap_insert(nat_conn_keys, &nat_conn_key->node,
2278 conn_key_hash(&nat_conn_key->key, basis));
2279 return true;
2280 }
2281 return false;
2282 }
2283
2284 /* This function must be called with the ct->resources write lock taken. */
2285 static void
2286 nat_conn_keys_remove(struct hmap *nat_conn_keys,
2287 const struct conn_key *key,
2288 uint32_t basis)
2289 {
2290 struct nat_conn_key_node *nat_conn_key_node;
2291
2292 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node,
2293 conn_key_hash(key, basis), nat_conn_keys) {
2294 if (!conn_key_cmp(&nat_conn_key_node->key, key)) {
2295 hmap_remove(nat_conn_keys, &nat_conn_key_node->node);
2296 free(nat_conn_key_node);
2297 return;
2298 }
2299 }
2300 }
2301
2302 static void
2303 conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
2304 long long now)
2305 OVS_REQUIRES(ctb->lock)
2306 {
2307 uint32_t hash = ctx->hash;
2308 struct conn *conn;
2309
2310 ctx->conn = NULL;
2311
2312 HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
2313 if (!conn_key_cmp(&conn->key, &ctx->key)
2314 && !conn_expired(conn, now)) {
2315 ctx->conn = conn;
2316 ctx->reply = false;
2317 break;
2318 }
2319 if (!conn_key_cmp(&conn->rev_key, &ctx->key)
2320 && !conn_expired(conn, now)) {
2321 ctx->conn = conn;
2322 ctx->reply = true;
2323 break;
2324 }
2325 }
2326 }
2327
2328 static enum ct_update_res
2329 conn_update(struct conn *conn, struct conntrack_bucket *ctb,
2330 struct dp_packet *pkt, bool reply, long long now)
2331 {
2332 return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
2333 reply, now);
2334 }
2335
2336 static bool
2337 conn_expired(struct conn *conn, long long now)
2338 {
2339 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
2340 return now >= conn->expiration;
2341 }
2342 return false;
2343 }
2344
2345 static bool
2346 valid_new(struct dp_packet *pkt, struct conn_key *key)
2347 {
2348 return l4_protos[key->nw_proto]->valid_new(pkt);
2349 }
2350
2351 static struct conn *
2352 new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
2353 struct conn_key *key, long long now)
2354 {
2355 struct conn *newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
2356 if (newconn) {
2357 newconn->key = *key;
2358 }
2359
2360 return newconn;
2361 }
2362
2363 static void
2364 delete_conn(struct conn *conn)
2365 {
2366 free(conn->nat_info);
2367 free(conn->alg);
2368 free(conn);
2369 }
2370 \f
2371 static void
2372 ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
2373 union ct_dpif_inet_addr *b,
2374 ovs_be16 dl_type)
2375 {
2376 if (dl_type == htons(ETH_TYPE_IP)) {
2377 b->ip = a->ipv4_aligned;
2378 } else if (dl_type == htons(ETH_TYPE_IPV6)){
2379 b->in6 = a->ipv6_aligned;
2380 }
2381 }
2382
2383 static void
2384 conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
2385 {
2386 if (key->dl_type == htons(ETH_TYPE_IP)) {
2387 tuple->l3_type = AF_INET;
2388 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
2389 tuple->l3_type = AF_INET6;
2390 }
2391 tuple->ip_proto = key->nw_proto;
2392 ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
2393 key->dl_type);
2394 ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
2395 key->dl_type);
2396
2397 if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
2398 tuple->icmp_id = key->src.icmp_id;
2399 tuple->icmp_type = key->src.icmp_type;
2400 tuple->icmp_code = key->src.icmp_code;
2401 } else {
2402 tuple->src_port = key->src.port;
2403 tuple->dst_port = key->dst.port;
2404 }
2405 }
2406
2407 static void
2408 conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
2409 long long now, int bkt)
2410 {
2411 memset(entry, 0, sizeof *entry);
2412 conn_key_to_tuple(&conn->key, &entry->tuple_orig);
2413 conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
2414
2415 entry->zone = conn->key.zone;
2416 entry->mark = conn->mark;
2417
2418 memcpy(&entry->labels, &conn->label, sizeof entry->labels);
2419 /* Not implemented yet */
2420 entry->timestamp.start = 0;
2421 entry->timestamp.stop = 0;
2422
2423 long long expiration = conn->expiration - now;
2424 entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
2425
2426 struct ct_l4_proto *class = l4_protos[conn->key.nw_proto];
2427 if (class->conn_get_protoinfo) {
2428 class->conn_get_protoinfo(conn, &entry->protoinfo);
2429 }
2430
2431 entry->bkt = bkt;
2432
2433 if (conn->alg) {
2434 /* Caller is responsible for freeing. */
2435 entry->helper.name = xstrdup(conn->alg);
2436 }
2437 }
2438
2439 int
2440 conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
2441 const uint16_t *pzone, int *ptot_bkts)
2442 {
2443 memset(dump, 0, sizeof(*dump));
2444
2445 if (pzone) {
2446 dump->zone = *pzone;
2447 dump->filter_zone = true;
2448 }
2449
2450 dump->ct = ct;
2451 *ptot_bkts = CONNTRACK_BUCKETS;
2452 return 0;
2453 }
2454
2455 int
2456 conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
2457 {
2458 struct conntrack *ct = dump->ct;
2459 long long now = time_msec();
2460
2461 while (dump->bucket < CONNTRACK_BUCKETS) {
2462 struct hmap_node *node;
2463
2464 ct_lock_lock(&ct->buckets[dump->bucket].lock);
2465 for (;;) {
2466 struct conn *conn;
2467
2468 node = hmap_at_position(&ct->buckets[dump->bucket].connections,
2469 &dump->bucket_pos);
2470 if (!node) {
2471 break;
2472 }
2473 INIT_CONTAINER(conn, node, node);
2474 if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
2475 (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
2476 conn_to_ct_dpif_entry(conn, entry, now, dump->bucket);
2477 break;
2478 }
2479 /* Else continue, until we find an entry in the appropriate zone
2480 * or the bucket has been scanned completely. */
2481 }
2482 ct_lock_unlock(&ct->buckets[dump->bucket].lock);
2483
2484 if (!node) {
2485 memset(&dump->bucket_pos, 0, sizeof dump->bucket_pos);
2486 dump->bucket++;
2487 } else {
2488 return 0;
2489 }
2490 }
2491 return EOF;
2492 }
2493
2494 int
2495 conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
2496 {
2497 return 0;
2498 }
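
/* Typical dump loop (illustrative sketch; error handling omitted):
 *
 *     struct conntrack_dump dump;
 *     struct ct_dpif_entry entry;
 *     int tot_bkts;
 *
 *     conntrack_dump_start(ct, &dump, NULL, &tot_bkts);
 *     while (!conntrack_dump_next(&dump, &entry)) {
 *         ...consume 'entry'...
 *         free(entry.helper.name);    // xstrdup()'d above, may be NULL
 *     }
 *     conntrack_dump_done(&dump);
 */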
2499
2500 int
2501 conntrack_flush(struct conntrack *ct, const uint16_t *zone)
2502 {
2503 for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
2504 struct conn *conn, *next;
2505
2506 ct_lock_lock(&ct->buckets[i].lock);
2507 HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
2508 if ((!zone || *zone == conn->key.zone) &&
2509 (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
2510 conn_clean(ct, conn, &ct->buckets[i]);
2511 }
2512 }
2513 ct_lock_unlock(&ct->buckets[i].lock);
2514 }
2515
2516 return 0;
2517 }
2518
2519 int
2520 conntrack_set_maxconns(struct conntrack *ct, uint32_t maxconns)
2521 {
2522 atomic_store_relaxed(&ct->n_conn_limit, maxconns);
2523 return 0;
2524 }
2525
2526 int
2527 conntrack_get_maxconns(struct conntrack *ct, uint32_t *maxconns)
2528 {
2529 atomic_read_relaxed(&ct->n_conn_limit, maxconns);
2530 return 0;
2531 }
2532
2533 int
2534 conntrack_get_nconns(struct conntrack *ct, uint32_t *nconns)
2535 {
2536 *nconns = atomic_count_get(&ct->n_conn);
2537 return 0;
2538 }
2539
2540 /* This function must be called with the ct->resources read lock taken. */
2541 static struct alg_exp_node *
2542 expectation_lookup(struct hmap *alg_expectations, const struct conn_key *key,
2543 uint32_t basis, bool src_ip_wc)
2544 {
2545 struct conn_key check_key = *key;
2546 check_key.src.port = ALG_WC_SRC_PORT;
2547
2548 if (src_ip_wc) {
2549 memset(&check_key.src.addr, 0, sizeof check_key.src.addr);
2550 }
2551
2552 struct alg_exp_node *alg_exp_node;
2553
2554 HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
2555 conn_key_hash(&check_key, basis),
2556 alg_expectations) {
2557 if (!conn_key_cmp(&alg_exp_node->key, &check_key)) {
2558 return alg_exp_node;
2559 }
2560 }
2561 return NULL;
2562 }
2563
2564 /* This function must be called with the ct->resources write lock taken. */
2565 static void
2566 expectation_remove(struct hmap *alg_expectations,
2567 const struct conn_key *key, uint32_t basis)
2568 {
2569 struct alg_exp_node *alg_exp_node;
2570
2571 HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node, conn_key_hash(key, basis),
2572 alg_expectations) {
2573 if (!conn_key_cmp(&alg_exp_node->key, key)) {
2574 hmap_remove(alg_expectations, &alg_exp_node->node);
2575 break;
2576 }
2577 }
2578 }
2579
2580 /* This function must be called with the ct->resources read lock taken. */
2581 static struct alg_exp_node *
2582 expectation_ref_lookup_unique(const struct hindex *alg_expectation_refs,
2583 const struct conn_key *master_key,
2584 const struct conn_key *alg_exp_key,
2585 uint32_t basis)
2586 {
2587 struct alg_exp_node *alg_exp_node;
2588
2589 HINDEX_FOR_EACH_WITH_HASH (alg_exp_node, node_ref,
2590 conn_key_hash(master_key, basis),
2591 alg_expectation_refs) {
2592 if (!conn_key_cmp(&alg_exp_node->master_key, master_key) &&
2593 !conn_key_cmp(&alg_exp_node->key, alg_exp_key)) {
2594 return alg_exp_node;
2595 }
2596 }
2597 return NULL;
2598 }
2599
2600 /* This function must be called with the ct->resources write lock taken. */
2601 static void
2602 expectation_ref_create(struct hindex *alg_expectation_refs,
2603 struct alg_exp_node *alg_exp_node,
2604 uint32_t basis)
2605 {
2606 if (!expectation_ref_lookup_unique(alg_expectation_refs,
2607 &alg_exp_node->master_key,
2608 &alg_exp_node->key, basis)) {
2609 hindex_insert(alg_expectation_refs, &alg_exp_node->node_ref,
2610 conn_key_hash(&alg_exp_node->master_key, basis));
2611 }
2612 }
2613
2614 static void
2615 expectation_clean(struct conntrack *ct, const struct conn_key *master_key,
2616 uint32_t basis)
2617 {
2618 ct_rwlock_wrlock(&ct->resources_lock);
2619
2620 struct alg_exp_node *node, *next;
2621 HINDEX_FOR_EACH_WITH_HASH_SAFE (node, next, node_ref,
2622 conn_key_hash(master_key, basis),
2623 &ct->alg_expectation_refs) {
2624 if (!conn_key_cmp(&node->master_key, master_key)) {
2625 expectation_remove(&ct->alg_expectations, &node->key, basis);
2626 hindex_remove(&ct->alg_expectation_refs, &node->node_ref);
2627 free(node);
2628 }
2629 }
2630
2631 ct_rwlock_unlock(&ct->resources_lock);
2632 }
2633
2634 static void
2635 expectation_create(struct conntrack *ct, ovs_be16 dst_port,
2636 const struct conn *master_conn, bool reply, bool src_ip_wc,
2637 bool skip_nat)
2638 {
2639 struct ct_addr src_addr;
2640 struct ct_addr dst_addr;
2641 struct ct_addr alg_nat_repl_addr;
2642 struct alg_exp_node *alg_exp_node = xzalloc(sizeof *alg_exp_node);
2643
2644 if (reply) {
2645 src_addr = master_conn->key.src.addr;
2646 dst_addr = master_conn->key.dst.addr;
2647 if (skip_nat) {
2648 alg_nat_repl_addr = dst_addr;
2649 } else {
2650 alg_nat_repl_addr = master_conn->rev_key.dst.addr;
2651 }
2652 alg_exp_node->nat_rpl_dst = true;
2653 } else {
2654 src_addr = master_conn->rev_key.src.addr;
2655 dst_addr = master_conn->rev_key.dst.addr;
2656 if (skip_nat) {
2657 alg_nat_repl_addr = src_addr;
2658 } else {
2659 alg_nat_repl_addr = master_conn->key.src.addr;
2660 }
2661 alg_exp_node->nat_rpl_dst = false;
2662 }
2663 if (src_ip_wc) {
2664 memset(&src_addr, 0, sizeof src_addr);
2665 }
2666
2667 alg_exp_node->key.dl_type = master_conn->key.dl_type;
2668 alg_exp_node->key.nw_proto = master_conn->key.nw_proto;
2669 alg_exp_node->key.zone = master_conn->key.zone;
2670 alg_exp_node->key.src.addr = src_addr;
2671 alg_exp_node->key.dst.addr = dst_addr;
2672 alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
2673 alg_exp_node->key.dst.port = dst_port;
2674 alg_exp_node->master_mark = master_conn->mark;
2675 alg_exp_node->master_label = master_conn->label;
2676 alg_exp_node->master_key = master_conn->key;
2677 /* Take the write lock here because it is almost 100%
2678 * likely that the lookup will fail and a new expectation
2679 * will be inserted below. */
2680 ct_rwlock_wrlock(&ct->resources_lock);
2681 struct alg_exp_node *alg_exp = expectation_lookup(
2682 &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis, src_ip_wc);
2683 if (alg_exp) {
2684 free(alg_exp_node);
2685 ct_rwlock_unlock(&ct->resources_lock);
2686 return;
2687 }
2688
2689 alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
2690 hmap_insert(&ct->alg_expectations, &alg_exp_node->node,
2691 conn_key_hash(&alg_exp_node->key, ct->hash_basis));
2692 expectation_ref_create(&ct->alg_expectation_refs, alg_exp_node,
2693 ct->hash_basis);
2694 ct_rwlock_unlock(&ct->resources_lock);
2695 }
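
/* Example (illustrative): for an FTP control connection
 * client:40001 -> server:21, a PORT command advertising client port
 * 40002 creates an expectation whose key is server -> client:40002 with
 * a wildcarded source port, so the later data connection is matched and
 * marked as related to the control connection. */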
2696
2697 static uint8_t
2698 get_v4_byte_be(ovs_be32 v4_addr, uint8_t index)
2699 {
2700 uint8_t *byte_ptr = (OVS_FORCE uint8_t *) &v4_addr;
2701 return byte_ptr[index];
2702 }
2703
2704 static void
2705 replace_substring(char *substr, uint8_t substr_size,
2706 uint8_t total_size, char *rep_str,
2707 uint8_t rep_str_size)
2708 {
2709 memmove(substr + rep_str_size, substr + substr_size,
2710 total_size - substr_size);
2711 memcpy(substr, rep_str, rep_str_size);
2712 }
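
/* Worked example (illustrative): replacing the leading "10" of "10,2,3"
 * with "192" first shifts the tail ",2,3" right by one byte and then
 * copies the replacement in, yielding "192,2,3". The caller must ensure
 * the buffer can absorb the growth. */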
2713
2714 /* Replace IPV4 address in FTP message with NATed address. */
2715 static int
2716 repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
2717 char *ftp_data_start,
2718 size_t addr_offset_from_ftp_data_start)
2719 {
2720 enum { MAX_FTP_V4_NAT_DELTA = 8 };
2721
2722 /* Do a conservative check for pathological MTU usage. */
2723 uint32_t orig_used_size = dp_packet_size(pkt);
2724 uint16_t allocated_size = dp_packet_get_allocated(pkt);
2725 if (orig_used_size + MAX_FTP_V4_NAT_DELTA > allocated_size) {
2726 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
2727 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP",
2728 allocated_size);
2729 return 0;
2730 }
2731
2732 size_t remain_size = tcp_payload_length(pkt) -
2733 addr_offset_from_ftp_data_start;
2734 int overall_delta = 0;
2735 char *byte_str = ftp_data_start + addr_offset_from_ftp_data_start;
2736
2737 /* Replace the existing IPv4 address by the new one. */
2738 for (uint8_t i = 0; i < 4; i++) {
2739 /* Find the end of the string for this octet. */
2740 char *next_delim = memchr(byte_str, ',', 4);
2741 ovs_assert(next_delim);
2742 int substr_size = next_delim - byte_str;
2743 remain_size -= substr_size;
2744
2745 /* Compose the new string for this octet, and replace it. */
2746 char rep_str[4];
2747 uint8_t rep_byte = get_v4_byte_be(v4_addr_rep, i);
2748 int replace_size = sprintf(rep_str, "%d", rep_byte);
2749 replace_substring(byte_str, substr_size, remain_size,
2750 rep_str, replace_size);
2751 overall_delta += replace_size - substr_size;
2752
2753 /* Advance past the octet and the following comma. */
2754 byte_str += replace_size + 1;
2755 }
2756
2757 dp_packet_set_size(pkt, orig_used_size + overall_delta);
2758 return overall_delta;
2759 }
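
/* Example rewrite (illustrative): with a replacement address of
 * 203.0.113.5, the payload fragment "10,0,0,5,156,64" becomes
 * "203,0,113,5,156,64"; only the four address octets are rewritten, and
 * the returned delta (+3 here) feeds the caller's length and sequence
 * adjustments. */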
2760
2761 static char *
2762 skip_non_digits(char *str)
2763 {
2764 while (!isdigit(*str) && *str != 0) {
2765 str++;
2766 }
2767 return str;
2768 }
2769
2770 static char *
2771 terminate_number_str(char *str, uint8_t max_digits)
2772 {
2773 uint8_t digits_found = 0;
2774 while (isdigit(*str) && digits_found <= max_digits) {
2775 str++;
2776 digits_found++;
2777 }
2778
2779 *str = 0;
2780 return str;
2781 }
2782
2784 static void
2785 get_ftp_ctl_msg(struct dp_packet *pkt, char *ftp_msg)
2786 {
2787 struct tcp_header *th = dp_packet_l4(pkt);
2788 char *tcp_hdr = (char *) th;
2789 uint32_t tcp_payload_len = tcp_payload_length(pkt);
2790 size_t tcp_payload_of_interest = MIN(tcp_payload_len,
2791 LARGEST_FTP_MSG_OF_INTEREST);
2792 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2793
2794 ovs_strlcpy(ftp_msg, tcp_hdr + tcp_hdr_len,
2795 tcp_payload_of_interest);
2796 }
2797
2798 static enum ftp_ctl_pkt
2799 detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
2800 struct dp_packet *pkt)
2801 {
2802 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2803 get_ftp_ctl_msg(pkt, ftp_msg);
2804
2805 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
2806 if (strncasecmp(ftp_msg, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD)) &&
2807 !strcasestr(ftp_msg, FTP_EPSV_REPLY)) {
2808 return CT_FTP_CTL_OTHER;
2809 }
2810 } else {
2811 if (strncasecmp(ftp_msg, FTP_PORT_CMD, strlen(FTP_PORT_CMD)) &&
2812 strncasecmp(ftp_msg, FTP_PASV_REPLY_CODE,
2813 strlen(FTP_PASV_REPLY_CODE))) {
2814 return CT_FTP_CTL_OTHER;
2815 }
2816 }
2817
2818 return CT_FTP_CTL_INTEREST;
2819 }
2820
2821 static enum ftp_ctl_pkt
2822 process_ftp_ctl_v4(struct conntrack *ct,
2823 struct dp_packet *pkt,
2824 const struct conn *conn_for_expectation,
2825 ovs_be32 *v4_addr_rep,
2826 char **ftp_data_v4_start,
2827 size_t *addr_offset_from_ftp_data_start)
2828 {
2829 struct tcp_header *th = dp_packet_l4(pkt);
2830 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2831 char *tcp_hdr = (char *) th;
2832 *ftp_data_v4_start = tcp_hdr + tcp_hdr_len;
2833 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2834 get_ftp_ctl_msg(pkt, ftp_msg);
2835 char *ftp = ftp_msg;
2836 enum ct_alg_mode mode;
2837
2838 if (!strncasecmp(ftp, FTP_PORT_CMD, strlen(FTP_PORT_CMD))) {
2839 ftp = ftp_msg + strlen(FTP_PORT_CMD);
2840 mode = CT_FTP_MODE_ACTIVE;
2841 } else {
2842 ftp = ftp_msg + strlen(FTP_PASV_REPLY_CODE);
2843 mode = CT_FTP_MODE_PASSIVE;
2844 }
2845
2846 /* Find first space. */
2847 ftp = strchr(ftp, ' ');
2848 if (!ftp) {
2849 return CT_FTP_CTL_INVALID;
2850 }
2851
2852 /* Find the first digit, after space. */
2853 ftp = skip_non_digits(ftp);
2854 if (*ftp == 0) {
2855 return CT_FTP_CTL_INVALID;
2856 }
2857
2858 char *ip_addr_start = ftp;
2859 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
2860
2861 uint8_t comma_count = 0;
2862 while (comma_count < 4 && *ftp) {
2863 if (*ftp == ',') {
2864 comma_count++;
2865 if (comma_count == 4) {
2866 *ftp = 0;
2867 } else {
2868 *ftp = '.';
2869 }
2870 }
2871 ftp++;
2872 }
2873 if (comma_count != 4) {
2874 return CT_FTP_CTL_INVALID;
2875 }
2876
2877 struct in_addr ip_addr;
2878 int rc2 = inet_pton(AF_INET, ip_addr_start, &ip_addr);
2879 if (rc2 != 1) {
2880 return CT_FTP_CTL_INVALID;
2881 }
2882
2883 char *save_ftp = ftp;
2884 ftp = terminate_number_str(ftp, MAX_FTP_PORT_DGTS);
2885 if (!ftp) {
2886 return CT_FTP_CTL_INVALID;
2887 }
2888 int value;
2889 if (!str_to_int(save_ftp, 10, &value)) {
2890 return CT_FTP_CTL_INVALID;
2891 }
2892
2893 /* This follows from the L4 port maximum of 65535: each encoded octet is at most 255. */
2894 if (value > 255) {
2895 return CT_FTP_CTL_INVALID;
2896 }
2897
2898 uint16_t port_hs = value;
2899 port_hs <<= 8;
2900
2901 /* Skip over comma. */
2902 ftp++;
2903 save_ftp = ftp;
2904 bool digit_found = false;
2905 while (isdigit(*ftp)) {
2906 ftp++;
2907 digit_found = true;
2908 }
2909 if (!digit_found) {
2910 return CT_FTP_CTL_INVALID;
2911 }
2912 *ftp = 0;
2913 if (!str_to_int(save_ftp, 10, &value)) {
2914 return CT_FTP_CTL_INVALID;
2915 }
2916
2917 if (value > 255) {
2918 return CT_FTP_CTL_INVALID;
2919 }
2920
2921 uint16_t port_lo_hs = value;
2922 if (65535 - port_hs < port_lo_hs) {
2923 return CT_FTP_CTL_INVALID;
2924 }
2925
2926 port_hs |= port_lo_hs;
2927 ovs_be16 port = htons(port_hs);
2928 ovs_be32 conn_ipv4_addr;
2929
2930 switch (mode) {
2931 case CT_FTP_MODE_ACTIVE:
2932 *v4_addr_rep = conn_for_expectation->rev_key.dst.addr.ipv4_aligned;
2933 conn_ipv4_addr = conn_for_expectation->key.src.addr.ipv4_aligned;
2934 break;
2935 case CT_FTP_MODE_PASSIVE:
2936 *v4_addr_rep = conn_for_expectation->key.dst.addr.ipv4_aligned;
2937 conn_ipv4_addr = conn_for_expectation->rev_key.src.addr.ipv4_aligned;
2938 break;
2939 case CT_TFTP_MODE:
2940 default:
2941 OVS_NOT_REACHED();
2942 }
2943
2944 ovs_be32 ftp_ipv4_addr;
2945 ftp_ipv4_addr = ip_addr.s_addr;
2946 /* Although most servers will block this exploit, there may be some
2947 * that are less well managed. */
2948 if (ftp_ipv4_addr != conn_ipv4_addr && ftp_ipv4_addr != *v4_addr_rep) {
2949 return CT_FTP_CTL_INVALID;
2950 }
2951
2952 expectation_create(ct, port, conn_for_expectation,
2953 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
2954 return CT_FTP_CTL_INTEREST;
2955 }
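
/* Worked example (illustrative): a reply "227 Entering Passive Mode
 * (10,0,0,5,78,32)" parses to the address 10.0.0.5 and the port
 * 78 * 256 + 32 = 20000, for which an expectation is created above. */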
2956
2957 static char *
2958 skip_ipv6_digits(char *str)
2959 {
2960 while (isxdigit(*str) || *str == ':' || *str == '.') {
2961 str++;
2962 }
2963 return str;
2964 }
2965
2966 static enum ftp_ctl_pkt
2967 process_ftp_ctl_v6(struct conntrack *ct,
2968 struct dp_packet *pkt,
2969 const struct conn *conn_for_expectation,
2970 struct ct_addr *v6_addr_rep,
2971 char **ftp_data_start,
2972 size_t *addr_offset_from_ftp_data_start,
2973 size_t *addr_size, enum ct_alg_mode *mode)
2974 {
2975 struct tcp_header *th = dp_packet_l4(pkt);
2976 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2977 char *tcp_hdr = (char *) th;
2978 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2979 get_ftp_ctl_msg(pkt, ftp_msg);
2980 *ftp_data_start = tcp_hdr + tcp_hdr_len;
2981 char *ftp = ftp_msg;
2982 struct in6_addr ip6_addr;
2983
2984 if (!strncasecmp(ftp, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD))) {
2985 ftp = ftp_msg + strlen(FTP_EPRT_CMD);
2986 ftp = skip_non_digits(ftp);
2987 if (*ftp != FTP_AF_V6 || isdigit(ftp[1])) {
2988 return CT_FTP_CTL_INVALID;
2989 }
2990 /* Jump over delimiter. */
2991 ftp += 2;
2992
2993 memset(&ip6_addr, 0, sizeof ip6_addr);
2994 char *ip_addr_start = ftp;
2995 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
2996 ftp = skip_ipv6_digits(ftp);
2997 *ftp = 0;
2998 *addr_size = ftp - ip_addr_start;
2999 int rc2 = inet_pton(AF_INET6, ip_addr_start, &ip6_addr);
3000 if (rc2 != 1) {
3001 return CT_FTP_CTL_INVALID;
3002 }
3003 ftp++;
3004 *mode = CT_FTP_MODE_ACTIVE;
3005 } else {
3006 ftp = ftp_msg + strcspn(ftp_msg, "(");
3007 ftp = skip_non_digits(ftp);
3008 if (!isdigit(*ftp)) {
3009 return CT_FTP_CTL_INVALID;
3010 }
3011
3012 /* Not used for passive mode. */
3013 *addr_offset_from_ftp_data_start = 0;
3014 *addr_size = 0;
3015
3016 *mode = CT_FTP_MODE_PASSIVE;
3017 }
3018
3019 char *save_ftp = ftp;
3020 ftp = terminate_number_str(ftp, MAX_EXT_FTP_PORT_DGTS);
3021 if (!ftp) {
3022 return CT_FTP_CTL_INVALID;
3023 }
3024
3025 int value;
3026 if (!str_to_int(save_ftp, 10, &value)) {
3027 return CT_FTP_CTL_INVALID;
3028 }
3029 if (value > CT_MAX_L4_PORT) {
3030 return CT_FTP_CTL_INVALID;
3031 }
3032
3033 uint16_t port_hs = value;
3034 ovs_be16 port = htons(port_hs);
3035
3036 switch (*mode) {
3037 case CT_FTP_MODE_ACTIVE:
3038 *v6_addr_rep = conn_for_expectation->rev_key.dst.addr;
3039 /* Although most servers will block this exploit, there may be some
3040 * that are less well managed. */
3041 if (memcmp(&ip6_addr, &v6_addr_rep->ipv6_aligned, sizeof ip6_addr) &&
3042 memcmp(&ip6_addr, &conn_for_expectation->key.src.addr.ipv6_aligned,
3043 sizeof ip6_addr)) {
3044 return CT_FTP_CTL_INVALID;
3045 }
3046 break;
3047 case CT_FTP_MODE_PASSIVE:
3048 *v6_addr_rep = conn_for_expectation->key.dst.addr;
3049 break;
3050 case CT_TFTP_MODE:
3051 default:
3052 OVS_NOT_REACHED();
3053 }
3054
3055 expectation_create(ct, port, conn_for_expectation,
3056 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
3057 return CT_FTP_CTL_INTEREST;
3058 }
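
/* Worked examples (illustrative): the active-mode command
 * "EPRT |2|2001:db8::5|20001|" carries address family 2 (IPv6), the
 * address 2001:db8::5, and port 20001; the passive-mode reply
 * "229 Entering Extended Passive Mode (|||20001|)" carries only the
 * port, so no address rewrite is needed. */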
3059
3060 static int
3061 repl_ftp_v6_addr(struct dp_packet *pkt, struct ct_addr v6_addr_rep,
3062 char *ftp_data_start,
3063 size_t addr_offset_from_ftp_data_start,
3064 size_t addr_size, enum ct_alg_mode mode)
3065 {
3066 /* This is slightly larger than the worst case actually possible. */
3067 enum { MAX_FTP_V6_NAT_DELTA = 45 };
3068
3069 if (mode == CT_FTP_MODE_PASSIVE) {
3070 return 0;
3071 }
3072
3073 /* Do a conservative check for pathological MTU usage. */
3074 uint32_t orig_used_size = dp_packet_size(pkt);
3075 uint16_t allocated_size = dp_packet_get_allocated(pkt);
3076 if (orig_used_size + MAX_FTP_V6_NAT_DELTA > allocated_size) {
3077 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3078 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP",
3079 allocated_size);
3080 return 0;
3081 }
3082
3083 const char *rc;
3084 char v6_addr_str[IPV6_SCAN_LEN] = {0};
3085 rc = inet_ntop(AF_INET6, &v6_addr_rep.ipv6_aligned, v6_addr_str,
3086 IPV6_SCAN_LEN - 1);
3087 ovs_assert(rc != NULL);
3088
3089 size_t replace_addr_size = strlen(v6_addr_str);
3090
3091 size_t remain_size = tcp_payload_length(pkt) -
3092 addr_offset_from_ftp_data_start;
3093
3094 char *pkt_addr_str = ftp_data_start + addr_offset_from_ftp_data_start;
3095 replace_substring(pkt_addr_str, addr_size, remain_size,
3096 v6_addr_str, replace_addr_size);
3097
3098 int overall_delta = (int) replace_addr_size - (int) addr_size;
3099
3100 dp_packet_set_size(pkt, orig_used_size + overall_delta);
3101 return overall_delta;
3102 }
3103
3104 static void
3105 handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
3106 struct dp_packet *pkt,
3107 const struct conn *conn_for_expectation,
3108 long long now, enum ftp_ctl_pkt ftp_ctl, bool nat)
3109 {
3110 struct ip_header *l3_hdr = dp_packet_l3(pkt);
3111 ovs_be32 v4_addr_rep = 0;
3112 struct ct_addr v6_addr_rep;
3113 size_t addr_offset_from_ftp_data_start;
3114 size_t addr_size = 0;
3115 char *ftp_data_start;
3116 bool do_seq_skew_adj = true;
3117 enum ct_alg_mode mode = CT_FTP_MODE_ACTIVE;
3118
3119 if (detect_ftp_ctl_type(ctx, pkt) != ftp_ctl) {
3120 return;
3121 }
3122
3123 if (!nat || !conn_for_expectation->seq_skew) {
3124 do_seq_skew_adj = false;
3125 }
3126
3127 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
3128 int64_t seq_skew = 0;
3129
3130 if (ftp_ctl == CT_FTP_CTL_OTHER) {
3131 seq_skew = conn_for_expectation->seq_skew;
3132 } else if (ftp_ctl == CT_FTP_CTL_INTEREST) {
3133 enum ftp_ctl_pkt rc;
3134 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3135 rc = process_ftp_ctl_v6(ct, pkt, conn_for_expectation,
3136 &v6_addr_rep, &ftp_data_start,
3137 &addr_offset_from_ftp_data_start,
3138 &addr_size, &mode);
3139 } else {
3140 rc = process_ftp_ctl_v4(ct, pkt, conn_for_expectation,
3141 &v4_addr_rep, &ftp_data_start,
3142 &addr_offset_from_ftp_data_start);
3143 }
3144 if (rc == CT_FTP_CTL_INVALID) {
3145 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3146 VLOG_WARN_RL(&rl, "Invalid FTP control packet format");
3147 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
3148 return;
3149 } else if (rc == CT_FTP_CTL_INTEREST) {
3150 uint16_t ip_len;
3151
3152 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3153 seq_skew = repl_ftp_v6_addr(pkt, v6_addr_rep, ftp_data_start,
3154 addr_offset_from_ftp_data_start,
3155 addr_size, mode);
3156 if (seq_skew) {
3157 ip_len = ntohs(nh6->ip6_ctlun.ip6_un1.ip6_un1_plen);
3158 ip_len += seq_skew;
3159 nh6->ip6_ctlun.ip6_un1.ip6_un1_plen = htons(ip_len);
3160 conn_seq_skew_set(ct, &conn_for_expectation->key, now,
3161 seq_skew, ctx->reply);
3162 }
3163 } else {
3164 seq_skew = repl_ftp_v4_addr(pkt, v4_addr_rep, ftp_data_start,
3165 addr_offset_from_ftp_data_start);
3166 ip_len = ntohs(l3_hdr->ip_tot_len);
3167 if (seq_skew) {
3168 ip_len += seq_skew;
3169 l3_hdr->ip_csum = recalc_csum16(l3_hdr->ip_csum,
3170 l3_hdr->ip_tot_len, htons(ip_len));
3171 l3_hdr->ip_tot_len = htons(ip_len);
3172 conn_seq_skew_set(ct, &conn_for_expectation->key, now,
3173 seq_skew, ctx->reply);
3174 }
3175 }
3176 } else {
3177 OVS_NOT_REACHED();
3178 }
3179 } else {
3180 OVS_NOT_REACHED();
3181 }
3182
3183 struct tcp_header *th = dp_packet_l4(pkt);
3184
3185 if (do_seq_skew_adj && seq_skew != 0) {
3186 if (ctx->reply != conn_for_expectation->seq_skew_dir) {
3188 uint32_t tcp_ack = ntohl(get_16aligned_be32(&th->tcp_ack));
3189
3190 if ((seq_skew > 0) && (tcp_ack < seq_skew)) {
3191 /* Should not be possible; will be marked invalid. */
3192 tcp_ack = 0;
3193 } else if ((seq_skew < 0) && (UINT32_MAX - tcp_ack < -seq_skew)) {
3194 tcp_ack = (-seq_skew) - (UINT32_MAX - tcp_ack);
3195 } else {
3196 tcp_ack -= seq_skew;
3197 }
3198 ovs_be32 new_tcp_ack = htonl(tcp_ack);
3199 put_16aligned_be32(&th->tcp_ack, new_tcp_ack);
3200 } else {
3201 uint32_t tcp_seq = ntohl(get_16aligned_be32(&th->tcp_seq));
3202 if ((seq_skew > 0) && (UINT32_MAX - tcp_seq < seq_skew)) {
3203 tcp_seq = seq_skew - (UINT32_MAX - tcp_seq);
3204 } else if ((seq_skew < 0) && (tcp_seq < -seq_skew)) {
3205 /* Should not be possible; will be marked invalid. */
3206 tcp_seq = 0;
3207 } else {
3208 tcp_seq += seq_skew;
3209 }
3210 ovs_be32 new_tcp_seq = htonl(tcp_seq);
3211 put_16aligned_be32(&th->tcp_seq, new_tcp_seq);
3212 }
3213 }
3214
3215 th->tcp_csum = 0;
3216 uint32_t tcp_csum;
3217 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3218 tcp_csum = packet_csum_pseudoheader6(nh6);
3219 } else {
3220 tcp_csum = packet_csum_pseudoheader(l3_hdr);
3221 }
3222 const char *tail = dp_packet_tail(pkt);
3223 uint8_t pad = dp_packet_l2_pad_size(pkt);
3224 th->tcp_csum = csum_finish(
3225 csum_continue(tcp_csum, th, tail - (char *) th - pad));
3226 return;
3227 }
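
/* Worked example (illustrative): if NAT rewriting grew an FTP payload
 * by 3 bytes (seq_skew == 3), subsequent packets in the same direction
 * get 3 added to their TCP sequence numbers, while packets in the
 * opposite direction get 3 subtracted from their acknowledgment
 * numbers, keeping both peers' view of the byte stream consistent. */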
3228
3229 static void
3230 handle_tftp_ctl(struct conntrack *ct,
3231 const struct conn_lookup_ctx *ctx OVS_UNUSED,
3232 struct dp_packet *pkt,
3233 const struct conn *conn_for_expectation,
3234 long long now OVS_UNUSED,
3235 enum ftp_ctl_pkt ftp_ctl OVS_UNUSED, bool nat OVS_UNUSED)
3236 {
3237 expectation_create(ct, conn_for_expectation->key.src.port,
3238 conn_for_expectation,
3239 !!(pkt->md.ct_state & CS_REPLY_DIR), false, false);
3240 return;
3241 }