/*
 * Copyright (c) 2015, 2016, 2017 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17 #include <config.h>
18 #include <ctype.h>
19 #include <errno.h>
20 #include <netinet/in.h>
21 #include <netinet/icmp6.h>
22 #include <string.h>
23 #include <sys/types.h>
24
25 #include "bitmap.h"
26 #include "conntrack.h"
27 #include "conntrack-private.h"
28 #include "coverage.h"
29 #include "csum.h"
30 #include "ct-dpif.h"
31 #include "dp-packet.h"
32 #include "flow.h"
33 #include "netdev.h"
34 #include "odp-netlink.h"
35 #include "openvswitch/hmap.h"
36 #include "openvswitch/vlog.h"
37 #include "ovs-rcu.h"
38 #include "ovs-thread.h"
39 #include "poll-loop.h"
40 #include "random.h"
41 #include "timeval.h"
42
VLOG_DEFINE_THIS_MODULE(conntrack);

COVERAGE_DEFINE(conntrack_full);
COVERAGE_DEFINE(conntrack_long_cleanup);

struct conn_lookup_ctx {
    struct conn_key key;
    struct conn *conn;
    uint32_t hash;
    bool reply;
    bool icmp_related;
};

enum ftp_ctl_pkt {
    /* Control packets with address and/or port specifiers. */
    CT_FTP_CTL_INTEREST,
    /* Control packets without address and/or port specifiers. */
    CT_FTP_CTL_OTHER,
    CT_FTP_CTL_INVALID,
};

enum ct_alg_mode {
    CT_FTP_MODE_ACTIVE,
    CT_FTP_MODE_PASSIVE,
    CT_TFTP_MODE,
};

static bool conn_key_extract(struct conntrack *, struct dp_packet *,
                             ovs_be16 dl_type, struct conn_lookup_ctx *,
                             uint16_t zone);
static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis);
static void conn_key_reverse(struct conn_key *);
static void conn_key_lookup(struct conntrack_bucket *ctb,
                            struct conn_lookup_ctx *ctx,
                            long long now);
static bool valid_new(struct dp_packet *pkt, struct conn_key *);
static struct conn *new_conn(struct conntrack_bucket *, struct dp_packet *pkt,
                             struct conn_key *, long long now);
static void delete_conn(struct conn *);
static enum ct_update_res conn_update(struct conn *,
                                      struct conntrack_bucket *ctb,
                                      struct dp_packet *, bool reply,
                                      long long now);
static bool conn_expired(struct conn *, long long now);
static void set_mark(struct dp_packet *, struct conn *,
                     uint32_t val, uint32_t mask);
static void set_label(struct dp_packet *, struct conn *,
                      const struct ovs_key_ct_labels *val,
                      const struct ovs_key_ct_labels *mask);
static void *clean_thread_main(void *f_);

static struct nat_conn_key_node *
nat_conn_keys_lookup(struct hmap *nat_conn_keys,
                     const struct conn_key *key,
                     uint32_t basis);

static void
nat_conn_keys_remove(struct hmap *nat_conn_keys,
                     const struct conn_key *key,
                     uint32_t basis);

static bool
nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
                       struct conn *nat_conn);

static uint8_t
reverse_icmp_type(uint8_t type);
static uint8_t
reverse_icmp6_type(uint8_t type);
static inline bool
extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
                const char **new_data, bool validate_checksum);
static inline bool
extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
                const char **new_data);

static struct alg_exp_node *
expectation_lookup(struct hmap *alg_expectations,
                   const struct conn_key *key, uint32_t basis);

static int
repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
                 char *ftp_data_v4_start,
                 size_t addr_offset_from_ftp_data_start);

static enum ftp_ctl_pkt
process_ftp_ctl_v4(struct conntrack *ct,
                   struct dp_packet *pkt,
                   const struct conn *conn_for_expectation,
                   long long now, ovs_be32 *v4_addr_rep,
                   char **ftp_data_v4_start,
                   size_t *addr_offset_from_ftp_data_start);

static enum ftp_ctl_pkt
detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
                    struct dp_packet *pkt);

static void
handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
               struct dp_packet *pkt,
               const struct conn *conn_for_expectation,
               long long now, enum ftp_ctl_pkt ftp_ctl, bool nat);

static void
handle_tftp_ctl(struct conntrack *ct,
                const struct conn *conn_for_expectation,
                long long now);

static struct ct_l4_proto *l4_protos[] = {
    [IPPROTO_TCP] = &ct_proto_tcp,
    [IPPROTO_UDP] = &ct_proto_other,
    [IPPROTO_ICMP] = &ct_proto_icmp4,
    [IPPROTO_ICMPV6] = &ct_proto_icmp6,
};

long long ct_timeout_val[] = {
#define CT_TIMEOUT(NAME, VAL) [CT_TM_##NAME] = VAL,
    CT_TIMEOUTS
#undef CT_TIMEOUT
};

/* The maximum TCP or UDP port number. */
#define CT_MAX_L4_PORT 65535
/* Alg expectation timeout. */
#define CT_ALG_EXP_TIMEOUT (30 * 1000)
/* String buffer used for parsing FTP string messages.
 * This is sized about twice what is needed to leave some
 * margin of error. */
#define LARGEST_FTP_MSG_OF_INTEREST 128
/* FTP port string used in active mode. */
#define FTP_PORT_CMD "PORT"
/* FTP pasv string used in passive mode. */
#define FTP_PASV_REPLY_CODE "227"
/* Maximum decimal digits for port in FTP command.
 * The port is represented as two 3 digit numbers with the
 * high part a multiple of 256. */
#define MAX_FTP_PORT_DGTS 3

/* FTP extension EPRT string used for active mode. */
#define FTP_EPRT_CMD "EPRT"
/* FTP extension EPSV string used for passive mode. */
#define FTP_EPSV_REPLY "EXTENDED PASSIVE"
/* Maximum decimal digits for port in FTP extended command. */
#define MAX_EXT_FTP_PORT_DGTS 5
/* FTP extended command code for IPv6. */
#define FTP_AF_V6 '2'
/* Used to indicate a wildcard L4 source port number for ALGs.
 * This is used for port numbers that we cannot predict in
 * expectations. */
#define ALG_WC_SRC_PORT 0

/* If the total number of connections goes above this value, no new connections
 * are accepted; this is for CT_CONN_TYPE_DEFAULT connections. */
#define DEFAULT_N_CONN_LIMIT 3000000

/* Does a member by member comparison of two conn_keys; this
 * function must be kept in sync with struct conn_key; returns 0
 * if the keys are equal or 1 if the keys are not equal. */
static int
conn_key_cmp(const struct conn_key *key1, const struct conn_key *key2)
{
    if (!memcmp(&key1->src.addr, &key2->src.addr, sizeof key1->src.addr) &&
        !memcmp(&key1->dst.addr, &key2->dst.addr, sizeof key1->dst.addr) &&
        (key1->src.icmp_id == key2->src.icmp_id) &&
        (key1->src.icmp_type == key2->src.icmp_type) &&
        (key1->src.icmp_code == key2->src.icmp_code) &&
        (key1->dst.icmp_id == key2->dst.icmp_id) &&
        (key1->dst.icmp_type == key2->dst.icmp_type) &&
        (key1->dst.icmp_code == key2->dst.icmp_code) &&
        (key1->dl_type == key2->dl_type) &&
        (key1->zone == key2->zone) &&
        (key1->nw_proto == key2->nw_proto)) {

        return 0;
    }
    return 1;
}

/* Initializes the connection tracker 'ct'.  The caller is responsible for
 * calling 'conntrack_destroy()' when the instance is no longer needed. */
void
conntrack_init(struct conntrack *ct)
{
    unsigned i, j;
    long long now = time_msec();

    ct_rwlock_init(&ct->resources_lock);
    ct_rwlock_wrlock(&ct->resources_lock);
    hmap_init(&ct->nat_conn_keys);
    hmap_init(&ct->alg_expectations);
    ovs_list_init(&ct->alg_exp_list);
    ct_rwlock_unlock(&ct->resources_lock);

    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];

        ct_lock_init(&ctb->lock);
        ct_lock_lock(&ctb->lock);
        hmap_init(&ctb->connections);
        for (j = 0; j < ARRAY_SIZE(ctb->exp_lists); j++) {
            ovs_list_init(&ctb->exp_lists[j]);
        }
        ct_lock_unlock(&ctb->lock);
        ovs_mutex_init(&ctb->cleanup_mutex);
        ovs_mutex_lock(&ctb->cleanup_mutex);
        ctb->next_cleanup = now + CT_TM_MIN;
        ovs_mutex_unlock(&ctb->cleanup_mutex);
    }
    ct->hash_basis = random_uint32();
    atomic_count_init(&ct->n_conn, 0);
    atomic_init(&ct->n_conn_limit, DEFAULT_N_CONN_LIMIT);
    latch_init(&ct->clean_thread_exit);
    ct->clean_thread = ovs_thread_create("ct_clean", clean_thread_main, ct);
}

/* Destroys the connection tracker 'ct' and frees all the allocated memory. */
void
conntrack_destroy(struct conntrack *ct)
{
    unsigned i;

    latch_set(&ct->clean_thread_exit);
    pthread_join(ct->clean_thread, NULL);
    latch_destroy(&ct->clean_thread_exit);
    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];
        struct conn *conn;

        ovs_mutex_destroy(&ctb->cleanup_mutex);
        ct_lock_lock(&ctb->lock);
        HMAP_FOR_EACH_POP (conn, node, &ctb->connections) {
            if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
                atomic_count_dec(&ct->n_conn);
            }
            delete_conn(conn);
        }
        hmap_destroy(&ctb->connections);
        ct_lock_unlock(&ctb->lock);
        ct_lock_destroy(&ctb->lock);
    }
    ct_rwlock_wrlock(&ct->resources_lock);
    struct nat_conn_key_node *nat_conn_key_node;
    HMAP_FOR_EACH_POP (nat_conn_key_node, node, &ct->nat_conn_keys) {
        free(nat_conn_key_node);
    }
    hmap_destroy(&ct->nat_conn_keys);

    struct alg_exp_node *alg_exp_node;
    HMAP_FOR_EACH_POP (alg_exp_node, node, &ct->alg_expectations) {
        free(alg_exp_node);
    }
    ovs_list_poison(&ct->alg_exp_list);
    hmap_destroy(&ct->alg_expectations);
    ct_rwlock_unlock(&ct->resources_lock);
    ct_rwlock_destroy(&ct->resources_lock);
}
\f
static unsigned hash_to_bucket(uint32_t hash)
{
    /* Extracts the most significant bits in hash. The least significant bits
     * are already used internally by the hmap implementation. */
    BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= 1);

    return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS;
}

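/* Stores the connection tracking metadata in 'pkt': state, zone, mark, label
 * and, when available, the original direction tuple taken from 'conn' or
 * 'alg_exp'. */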
static void
write_ct_md(struct dp_packet *pkt, uint16_t zone, const struct conn *conn,
            const struct conn_key *key, const struct alg_exp_node *alg_exp)
{
    pkt->md.ct_state |= CS_TRACKED;
    pkt->md.ct_zone = zone;
    pkt->md.ct_mark = conn ? conn->mark : 0;
    pkt->md.ct_label = conn ? conn->label : OVS_U128_ZERO;

    /* Use the original direction tuple if we have it. */
    if (conn) {
        if (conn->alg_related) {
            key = &conn->master_key;
        } else {
            key = &conn->key;
        }
    } else if (alg_exp) {
        pkt->md.ct_mark = alg_exp->master_mark;
        pkt->md.ct_label = alg_exp->master_label;
        key = &alg_exp->master_key;
    }
    pkt->md.ct_orig_tuple_ipv6 = false;
    if (key) {
        if (key->dl_type == htons(ETH_TYPE_IP)) {
            pkt->md.ct_orig_tuple.ipv4 = (struct ovs_key_ct_tuple_ipv4) {
                key->src.addr.ipv4_aligned,
                key->dst.addr.ipv4_aligned,
                key->nw_proto != IPPROTO_ICMP
                ? key->src.port : htons(key->src.icmp_type),
                key->nw_proto != IPPROTO_ICMP
                ? key->dst.port : htons(key->src.icmp_code),
                key->nw_proto,
            };
        } else {
            pkt->md.ct_orig_tuple_ipv6 = true;
            pkt->md.ct_orig_tuple.ipv6 = (struct ovs_key_ct_tuple_ipv6) {
                key->src.addr.ipv6_aligned,
                key->dst.addr.ipv6_aligned,
                key->nw_proto != IPPROTO_ICMPV6
                ? key->src.port : htons(key->src.icmp_type),
                key->nw_proto != IPPROTO_ICMPV6
                ? key->dst.port : htons(key->src.icmp_code),
                key->nw_proto,
            };
        }
    } else {
        memset(&pkt->md.ct_orig_tuple, 0, sizeof pkt->md.ct_orig_tuple);
    }
}

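/* Returns the L4 protocol number from the packet's IPv4 or IPv6 header. */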
static uint8_t
get_ip_proto(const struct dp_packet *pkt)
{
    uint8_t ip_proto;
    struct eth_header *l2 = dp_packet_eth(pkt);
    if (l2->eth_type == htons(ETH_TYPE_IPV6)) {
        struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
        ip_proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
    } else {
        struct ip_header *l3_hdr = dp_packet_l3(pkt);
        ip_proto = l3_hdr->ip_proto;
    }

    return ip_proto;
}

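/* Returns true if 'pkt' is a TCP packet to or from the FTP control port. */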
static bool
is_ftp_ctl(const struct dp_packet *pkt)
{
    uint8_t ip_proto = get_ip_proto(pkt);
    struct tcp_header *th = dp_packet_l4(pkt);

    /* CT_IPPORT_FTP is used because IPPORT_FTP is not defined in OSX,
     * at least in in.h. Since this value will never change, just remove
     * the external dependency. */
#define CT_IPPORT_FTP 21

    return (ip_proto == IPPROTO_TCP &&
            (th->tcp_src == htons(CT_IPPORT_FTP) ||
             th->tcp_dst == htons(CT_IPPORT_FTP)));
}

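/* Returns true if 'pkt' is a UDP packet destined to the TFTP port. */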
static bool
is_tftp_ctl(const struct dp_packet *pkt)
{
    uint8_t ip_proto = get_ip_proto(pkt);
    struct udp_header *uh = dp_packet_l4(pkt);

    /* CT_IPPORT_TFTP is used because IPPORT_TFTP is not defined in OSX,
     * at least in in.h. Since this value will never change, remove
     * the external dependency. */
#define CT_IPPORT_TFTP 69
    return (ip_proto == IPPROTO_UDP &&
            uh->udp_dst == htons(CT_IPPORT_TFTP));
}

static void
alg_exp_init_expiration(struct conntrack *ct,
                        struct alg_exp_node *alg_exp_node,
                        long long now)
    OVS_REQ_WRLOCK(ct->resources_lock)
{
    alg_exp_node->expiration = now + CT_ALG_EXP_TIMEOUT;
    ovs_list_push_back(&ct->alg_exp_list, &alg_exp_node->exp_node);
}

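/* Translates the L4 ports of 'pkt' in the original direction, according to
 * the NAT action and the reverse key stored in 'conn'. */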
static void
pat_packet(struct dp_packet *pkt, const struct conn *conn)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, conn->rev_key.dst.port, th->tcp_dst);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, conn->rev_key.dst.port, uh->udp_dst);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, th->tcp_src, conn->rev_key.src.port);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, uh->udp_src, conn->rev_key.src.port);
        }
    }
}

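/* Applies NAT to 'pkt' in the original direction: rewrites the source or
 * destination address from 'conn->rev_key', sets CS_SRC_NAT or CS_DST_NAT
 * and, unless the packet is ICMP related, also translates the ports. */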
static void
nat_packet(struct dp_packet *pkt, const struct conn *conn, bool related)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        pkt->md.ct_state |= CS_SRC_NAT;
        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
            struct ip_header *nh = dp_packet_l3(pkt);
            packet_set_ipv4_addr(pkt, &nh->ip_src,
                                 conn->rev_key.dst.addr.ipv4_aligned);
        } else {
            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 nh6->ip6_src.be32,
                                 &conn->rev_key.dst.addr.ipv6_aligned,
                                 true);
        }
        if (!related) {
            pat_packet(pkt, conn);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        pkt->md.ct_state |= CS_DST_NAT;
        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
            struct ip_header *nh = dp_packet_l3(pkt);
            packet_set_ipv4_addr(pkt, &nh->ip_dst,
                                 conn->rev_key.src.addr.ipv4_aligned);
        } else {
            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 nh6->ip6_dst.be32,
                                 &conn->rev_key.src.addr.ipv6_aligned,
                                 true);
        }
        if (!related) {
            pat_packet(pkt, conn);
        }
    }
}

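/* Undoes port translation on a reply direction packet, restoring the ports
 * from 'conn->key'. */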
static void
un_pat_packet(struct dp_packet *pkt, const struct conn *conn)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, th->tcp_src, conn->key.src.port);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, uh->udp_src, conn->key.src.port);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, conn->key.dst.port, th->tcp_dst);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, conn->key.dst.port, uh->udp_dst);
        }
    }
}

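/* Undoes port translation on the packet embedded in an ICMP error, using
 * the ports from 'conn->key'. */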
static void
reverse_pat_packet(struct dp_packet *pkt, const struct conn *conn)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th_in = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, conn->key.src.port,
                                th_in->tcp_dst);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh_in = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, conn->key.src.port,
                                uh_in->udp_dst);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        if (conn->key.nw_proto == IPPROTO_TCP) {
            struct tcp_header *th_in = dp_packet_l4(pkt);
            packet_set_tcp_port(pkt, th_in->tcp_src,
                                conn->key.dst.port);
        } else if (conn->key.nw_proto == IPPROTO_UDP) {
            struct udp_header *uh_in = dp_packet_l4(pkt);
            packet_set_udp_port(pkt, uh_in->udp_src,
                                conn->key.dst.port);
        }
    }
}

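/* Undoes NAT on the packet embedded in an ICMP error: rewrites the inner L3
 * addresses and L4 ports and recomputes the ICMP checksum. */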
static void
reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn)
{
    char *tail = dp_packet_tail(pkt);
    char pad = dp_packet_l2_pad_size(pkt);
    struct conn_key inner_key;
    const char *inner_l4 = NULL;
    uint16_t orig_l3_ofs = pkt->l3_ofs;
    uint16_t orig_l4_ofs = pkt->l4_ofs;

    if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
        struct ip_header *nh = dp_packet_l3(pkt);
        struct icmp_header *icmp = dp_packet_l4(pkt);
        struct ip_header *inner_l3 = (struct ip_header *) (icmp + 1);
        extract_l3_ipv4(&inner_key, inner_l3,
                        tail - ((char *) inner_l3) - pad,
                        &inner_l4, false);

        pkt->l3_ofs += (char *) inner_l3 - (char *) nh;
        pkt->l4_ofs += inner_l4 - (char *) icmp;

        if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
            packet_set_ipv4_addr(pkt, &inner_l3->ip_src,
                                 conn->key.src.addr.ipv4_aligned);
        } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
            packet_set_ipv4_addr(pkt, &inner_l3->ip_dst,
                                 conn->key.dst.addr.ipv4_aligned);
        }
        reverse_pat_packet(pkt, conn);
        icmp->icmp_csum = 0;
        icmp->icmp_csum = csum(icmp, tail - (char *) icmp - pad);
    } else {
        struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
        struct icmp6_error_header *icmp6 = dp_packet_l4(pkt);
        struct ovs_16aligned_ip6_hdr *inner_l3_6 =
            (struct ovs_16aligned_ip6_hdr *) (icmp6 + 1);
        extract_l3_ipv6(&inner_key, inner_l3_6,
                        tail - ((char *) inner_l3_6) - pad,
                        &inner_l4);
        pkt->l3_ofs += (char *) inner_l3_6 - (char *) nh6;
        pkt->l4_ofs += inner_l4 - (char *) icmp6;

        if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 inner_l3_6->ip6_src.be32,
                                 &conn->key.src.addr.ipv6_aligned,
                                 true);
        } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 inner_l3_6->ip6_dst.be32,
                                 &conn->key.dst.addr.ipv6_aligned,
                                 true);
        }
        reverse_pat_packet(pkt, conn);
        uint32_t icmp6_csum = packet_csum_pseudoheader6(nh6);
        icmp6->icmp6_base.icmp6_cksum = 0;
        icmp6->icmp6_base.icmp6_cksum = csum_finish(
            csum_continue(icmp6_csum, icmp6, tail - (char *) icmp6 - pad));
    }
    pkt->l3_ofs = orig_l3_ofs;
    pkt->l4_ofs = orig_l4_ofs;
}

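/* Applies reverse NAT to a reply direction packet, restoring the original
 * addresses from 'conn->key'; for ICMP related packets the embedded packet
 * is fixed up as well. */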
static void
un_nat_packet(struct dp_packet *pkt, const struct conn *conn,
              bool related)
{
    if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
        pkt->md.ct_state |= CS_DST_NAT;
        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
            struct ip_header *nh = dp_packet_l3(pkt);
            packet_set_ipv4_addr(pkt, &nh->ip_dst,
                                 conn->key.src.addr.ipv4_aligned);
        } else {
            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 nh6->ip6_dst.be32,
                                 &conn->key.src.addr.ipv6_aligned, true);
        }

        if (OVS_UNLIKELY(related)) {
            reverse_nat_packet(pkt, conn);
        } else {
            un_pat_packet(pkt, conn);
        }
    } else if (conn->nat_info->nat_action & NAT_ACTION_DST) {
        pkt->md.ct_state |= CS_SRC_NAT;
        if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
            struct ip_header *nh = dp_packet_l3(pkt);
            packet_set_ipv4_addr(pkt, &nh->ip_src,
                                 conn->key.dst.addr.ipv4_aligned);
        } else {
            struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
            packet_set_ipv6_addr(pkt, conn->key.nw_proto,
                                 nh6->ip6_src.be32,
                                 &conn->key.dst.addr.ipv6_aligned, true);
        }

        if (OVS_UNLIKELY(related)) {
            reverse_nat_packet(pkt, conn);
        } else {
            un_pat_packet(pkt, conn);
        }
    }
}

/* This helper is typically used in non per-packet code, because the
 * bucket lock must already be held for the lookup and a hash would
 * already have been computed.  Hence, this function is mainly intended
 * for code clarity. */
static struct conn *
conn_lookup(struct conntrack *ct, const struct conn_key *key, long long now)
{
    struct conn_lookup_ctx ctx;
    ctx.conn = NULL;
    ctx.key = *key;
    ctx.hash = conn_key_hash(key, ct->hash_basis);
    unsigned bucket = hash_to_bucket(ctx.hash);
    conn_key_lookup(&ct->buckets[bucket], &ctx, now);
    return ctx.conn;
}

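/* Records the TCP sequence skew caused by FTP payload rewriting on the
 * connection matching 'key', along with the direction it applies to. */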
static void
conn_seq_skew_set(struct conntrack *ct, const struct conn_key *key,
                  long long now, int seq_skew, bool seq_skew_dir)
{
    uint32_t hash = conn_key_hash(key, ct->hash_basis);
    unsigned bucket = hash_to_bucket(hash);
    ct_lock_lock(&ct->buckets[bucket].lock);
    struct conn *conn = conn_lookup(ct, key, now);
    if (conn && seq_skew) {
        conn->seq_skew = seq_skew;
        conn->seq_skew_dir = seq_skew_dir;
    }
    ct_lock_unlock(&ct->buckets[bucket].lock);
}

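/* Removes the NAT binding for 'conn' and deletes the paired UN_NAT
 * connection, temporarily dropping 'ctb->lock' so that the reverse
 * connection's bucket lock can be taken safely. */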
static void
nat_clean(struct conntrack *ct, struct conn *conn,
          struct conntrack_bucket *ctb)
    OVS_REQUIRES(ctb->lock)
{
    long long now = time_msec();
    ct_rwlock_wrlock(&ct->resources_lock);
    nat_conn_keys_remove(&ct->nat_conn_keys, &conn->rev_key, ct->hash_basis);
    ct_rwlock_unlock(&ct->resources_lock);
    ct_lock_unlock(&ctb->lock);

    uint32_t hash_rev_conn = conn_key_hash(&conn->rev_key, ct->hash_basis);
    unsigned bucket_rev_conn = hash_to_bucket(hash_rev_conn);

    ct_lock_lock(&ct->buckets[bucket_rev_conn].lock);
    ct_rwlock_wrlock(&ct->resources_lock);

    struct conn *rev_conn = conn_lookup(ct, &conn->rev_key, now);

    struct nat_conn_key_node *nat_conn_key_node =
        nat_conn_keys_lookup(&ct->nat_conn_keys, &conn->rev_key,
                             ct->hash_basis);

    /* In the unlikely event that the reverse connection was recreated,
     * skip the rev_conn cleanup. */
    if (rev_conn && (!nat_conn_key_node ||
                     conn_key_cmp(&nat_conn_key_node->value,
                                  &rev_conn->rev_key))) {
        hmap_remove(&ct->buckets[bucket_rev_conn].connections,
                    &rev_conn->node);
        free(rev_conn);
    }
    delete_conn(conn);

    ct_rwlock_unlock(&ct->resources_lock);
    ct_lock_unlock(&ct->buckets[bucket_rev_conn].lock);
    ct_lock_lock(&ctb->lock);
}

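/* Unlinks 'conn' from its expiration list and bucket and frees it, cleaning
 * up the associated NAT state if present. */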
static void
conn_clean(struct conntrack *ct, struct conn *conn,
           struct conntrack_bucket *ctb)
    OVS_REQUIRES(ctb->lock)
{
    ovs_list_remove(&conn->exp_node);
    hmap_remove(&ctb->connections, &conn->node);
    atomic_count_dec(&ct->n_conn);
    if (conn->nat_info) {
        nat_clean(ct, conn, ctb);
    } else {
        delete_conn(conn);
    }
}

/* This function is called with the bucket lock held. */
static struct conn *
conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
               struct conn_lookup_ctx *ctx, bool commit, long long now,
               const struct nat_action_info_t *nat_action_info,
               struct conn *conn_for_un_nat_copy,
               const char *helper,
               const struct alg_exp_node *alg_exp)
{
    unsigned bucket = hash_to_bucket(ctx->hash);
    struct conn *nc = NULL;

    if (!valid_new(pkt, &ctx->key)) {
        pkt->md.ct_state = CS_INVALID;
        return nc;
    }
    pkt->md.ct_state = CS_NEW;
    if (alg_exp) {
        pkt->md.ct_state |= CS_RELATED;
    }

    if (commit) {
        unsigned int n_conn_limit;

        atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);

        if (atomic_count_get(&ct->n_conn) >= n_conn_limit) {
            COVERAGE_INC(conntrack_full);
            return nc;
        }

        nc = new_conn(&ct->buckets[bucket], pkt, &ctx->key, now);
        ctx->conn = nc;
        nc->rev_key = nc->key;
        conn_key_reverse(&nc->rev_key);

        if (helper) {
            nc->alg = xstrdup(helper);
        }

        if (alg_exp) {
            nc->alg_related = true;
            nc->mark = alg_exp->master_mark;
            nc->label = alg_exp->master_label;
            nc->master_key = alg_exp->master_key;
        }

        if (nat_action_info) {
            nc->nat_info = xmemdup(nat_action_info, sizeof *nc->nat_info);

            if (alg_exp) {
                if (alg_exp->passive_mode) {
                    nc->rev_key.dst.addr = alg_exp->alg_nat_repl_addr;
                    nc->nat_info->nat_action = NAT_ACTION_SRC;
                } else {
                    nc->rev_key.src.addr = alg_exp->alg_nat_repl_addr;
                    nc->nat_info->nat_action = NAT_ACTION_DST;
                }
                *conn_for_un_nat_copy = *nc;
            } else {
                *conn_for_un_nat_copy = *nc;
                ct_rwlock_wrlock(&ct->resources_lock);
                bool nat_res = nat_select_range_tuple(
                    ct, nc, conn_for_un_nat_copy);

                if (!nat_res) {
                    goto nat_res_exhaustion;
                }

                /* Update nc with nat adjustments made to
                 * conn_for_un_nat_copy by nat_select_range_tuple(). */
                *nc = *conn_for_un_nat_copy;
                ct_rwlock_unlock(&ct->resources_lock);
            }
            conn_for_un_nat_copy->conn_type = CT_CONN_TYPE_UN_NAT;
            conn_for_un_nat_copy->nat_info = NULL;
            conn_for_un_nat_copy->alg = NULL;
            nat_packet(pkt, nc, ctx->icmp_related);
        }
        hmap_insert(&ct->buckets[bucket].connections, &nc->node, ctx->hash);
        atomic_count_inc(&ct->n_conn);
    }

    return nc;

    /* This would be a user error or a DoS attack.  A user error is
     * prevented by allocating enough combinations of NAT addresses when
     * combined with ephemeral ports.  A DoS attack should be protected
     * against with firewall rules or a separate firewall.  Also using
     * zone partitioning can limit DoS impact. */
nat_res_exhaustion:
    ovs_list_remove(&nc->exp_node);
    delete_conn(nc);
    /* conn_for_un_nat_copy is a local variable in process_one; this
     * memset() serves to document that conn_for_un_nat_copy is from
     * this point on unused. */
    memset(conn_for_un_nat_copy, 0, sizeof *conn_for_un_nat_copy);
    ct_rwlock_unlock(&ct->resources_lock);
    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
    VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
                 "if DoS attack, use firewalling and/or zone partitioning.");
    return NULL;
}

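/* Updates an existing connection with 'pkt' and sets the packet's ct_state
 * accordingly.  Returns true if the connection should be recreated, e.g.
 * due to a new TCP handshake on an already known tuple. */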
static bool
conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
                  struct conn_lookup_ctx *ctx, struct conn **conn,
                  long long now, unsigned bucket)
    OVS_REQUIRES(ct->buckets[bucket].lock)
{
    bool create_new_conn = false;

    if (ctx->icmp_related) {
        pkt->md.ct_state |= CS_RELATED;
        if (ctx->reply) {
            pkt->md.ct_state |= CS_REPLY_DIR;
        }
    } else {
        if ((*conn)->alg_related) {
            pkt->md.ct_state |= CS_RELATED;
        }
        enum ct_update_res res = conn_update(*conn, &ct->buckets[bucket],
                                             pkt, ctx->reply, now);

        switch (res) {
        case CT_UPDATE_VALID:
            pkt->md.ct_state |= CS_ESTABLISHED;
            pkt->md.ct_state &= ~CS_NEW;
            if (ctx->reply) {
                pkt->md.ct_state |= CS_REPLY_DIR;
            }
            break;
        case CT_UPDATE_INVALID:
            pkt->md.ct_state = CS_INVALID;
            break;
        case CT_UPDATE_NEW:
            conn_clean(ct, *conn, &ct->buckets[bucket]);
            create_new_conn = true;
            break;
        default:
            OVS_NOT_REACHED();
        }
    }
    return create_new_conn;
}

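/* Inserts the UN_NAT counterpart of a NATed connection, i.e. an entry keyed
 * on the translated tuple that points back at the original connection. */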
static void
create_un_nat_conn(struct conntrack *ct, struct conn *conn_for_un_nat_copy,
                   long long now, bool alg_un_nat)
{
    struct conn *nc = xmemdup(conn_for_un_nat_copy, sizeof *nc);
    nc->key = conn_for_un_nat_copy->rev_key;
    nc->rev_key = conn_for_un_nat_copy->key;
    uint32_t un_nat_hash = conn_key_hash(&nc->key, ct->hash_basis);
    unsigned un_nat_conn_bucket = hash_to_bucket(un_nat_hash);
    ct_lock_lock(&ct->buckets[un_nat_conn_bucket].lock);
    struct conn *rev_conn = conn_lookup(ct, &nc->key, now);

    if (alg_un_nat) {
        hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
                    &nc->node, un_nat_hash);
    } else {
        ct_rwlock_rdlock(&ct->resources_lock);

        struct nat_conn_key_node *nat_conn_key_node =
            nat_conn_keys_lookup(&ct->nat_conn_keys, &nc->key,
                                 ct->hash_basis);
        if (nat_conn_key_node && !conn_key_cmp(&nat_conn_key_node->value,
            &nc->rev_key) && !rev_conn) {
            hmap_insert(&ct->buckets[un_nat_conn_bucket].connections,
                        &nc->node, un_nat_hash);
        } else {
            free(nc);
        }
        ct_rwlock_unlock(&ct->resources_lock);
    }
    ct_lock_unlock(&ct->buckets[un_nat_conn_bucket].lock);
}

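/* Applies NAT or reverse NAT to 'pkt' if 'conn' carries NAT state and the
 * packet has not already been translated in this zone. */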
static void
handle_nat(struct dp_packet *pkt, struct conn *conn,
           uint16_t zone, bool reply, bool related)
{
    if (conn->nat_info &&
        (!(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
         (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT) &&
          zone != pkt->md.ct_zone))) {

        if (pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) {
            pkt->md.ct_state &= ~(CS_SRC_NAT | CS_DST_NAT);
        }
        if (reply) {
            un_nat_packet(pkt, conn, related);
        } else {
            nat_packet(pkt, conn, related);
        }
    }
}

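/* Falls back to a lookup based on the pre-NAT original direction tuple
 * stored in the packet metadata.  Returns true, with '*bucket' and '*conn'
 * updated, if a connection is found. */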
static bool
check_orig_tuple(struct conntrack *ct, struct dp_packet *pkt,
                 struct conn_lookup_ctx *ctx_in, long long now,
                 unsigned *bucket, struct conn **conn,
                 const struct nat_action_info_t *nat_action_info)
    OVS_REQUIRES(ct->buckets[*bucket].lock)
{
    if ((ctx_in->key.dl_type == htons(ETH_TYPE_IP) &&
         !pkt->md.ct_orig_tuple.ipv4.ipv4_proto) ||
        (ctx_in->key.dl_type == htons(ETH_TYPE_IPV6) &&
         !pkt->md.ct_orig_tuple.ipv6.ipv6_proto) ||
        !(pkt->md.ct_state & (CS_SRC_NAT | CS_DST_NAT)) ||
        nat_action_info) {
        return false;
    }

    ct_lock_unlock(&ct->buckets[*bucket].lock);
    struct conn_lookup_ctx ctx;
    memset(&ctx, 0, sizeof ctx);
    ctx.conn = NULL;

    if (ctx_in->key.dl_type == htons(ETH_TYPE_IP)) {
        ctx.key.src.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_src;
        ctx.key.dst.addr.ipv4_aligned = pkt->md.ct_orig_tuple.ipv4.ipv4_dst;

        if (ctx_in->key.nw_proto == IPPROTO_ICMP) {
            ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
            ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
            uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv4.src_port);
            ctx.key.src.icmp_type = (uint8_t) src_port;
            ctx.key.dst.icmp_type = reverse_icmp_type(ctx.key.src.icmp_type);
        } else {
            ctx.key.src.port = pkt->md.ct_orig_tuple.ipv4.src_port;
            ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv4.dst_port;
        }
        ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv4.ipv4_proto;
    } else {
        ctx.key.src.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_src;
        ctx.key.dst.addr.ipv6_aligned = pkt->md.ct_orig_tuple.ipv6.ipv6_dst;

        if (ctx_in->key.nw_proto == IPPROTO_ICMPV6) {
            ctx.key.src.icmp_id = ctx_in->key.src.icmp_id;
            ctx.key.dst.icmp_id = ctx_in->key.dst.icmp_id;
            uint16_t src_port = ntohs(pkt->md.ct_orig_tuple.ipv6.src_port);
            ctx.key.src.icmp_type = (uint8_t) src_port;
            ctx.key.dst.icmp_type = reverse_icmp6_type(ctx.key.src.icmp_type);
        } else {
            ctx.key.src.port = pkt->md.ct_orig_tuple.ipv6.src_port;
            ctx.key.dst.port = pkt->md.ct_orig_tuple.ipv6.dst_port;
        }
        ctx.key.nw_proto = pkt->md.ct_orig_tuple.ipv6.ipv6_proto;
    }

    ctx.key.dl_type = ctx_in->key.dl_type;
    ctx.key.zone = pkt->md.ct_zone;

    ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
    *bucket = hash_to_bucket(ctx.hash);
    ct_lock_lock(&ct->buckets[*bucket].lock);
    conn_key_lookup(&ct->buckets[*bucket], &ctx, now);
    *conn = ctx.conn;

    return *conn ? true : false;
}

static bool
is_un_nat_conn_valid(const struct conn *un_nat_conn)
{
    return un_nat_conn->conn_type == CT_CONN_TYPE_UN_NAT;
}

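/* Processes one packet: looks up its connection, updates or creates it as
 * needed, applies NAT, writes back the conntrack metadata and performs ALG
 * handling for FTP/TFTP control packets. */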
static void
process_one(struct conntrack *ct, struct dp_packet *pkt,
            struct conn_lookup_ctx *ctx, uint16_t zone,
            bool force, bool commit, long long now, const uint32_t *setmark,
            const struct ovs_key_ct_labels *setlabel,
            const struct nat_action_info_t *nat_action_info,
            const char *helper)
{
    struct conn *conn;
    unsigned bucket = hash_to_bucket(ctx->hash);
    ct_lock_lock(&ct->buckets[bucket].lock);
    conn_key_lookup(&ct->buckets[bucket], ctx, now);
    conn = ctx->conn;

    /* Delete found entry if in wrong direction. 'force' implies commit. */
    if (conn && force && ctx->reply) {
        conn_clean(ct, conn, &ct->buckets[bucket]);
        conn = NULL;
    }

    if (OVS_LIKELY(conn)) {
        if (conn->conn_type == CT_CONN_TYPE_UN_NAT) {

            ctx->reply = true;

            struct conn_lookup_ctx ctx2;
            ctx2.conn = NULL;
            ctx2.key = conn->rev_key;
            ctx2.hash = conn_key_hash(&conn->rev_key, ct->hash_basis);

            ct_lock_unlock(&ct->buckets[bucket].lock);
            bucket = hash_to_bucket(ctx2.hash);

            ct_lock_lock(&ct->buckets[bucket].lock);
            conn_key_lookup(&ct->buckets[bucket], &ctx2, now);

            if (ctx2.conn) {
                conn = ctx2.conn;
            } else {
                /* It is a race condition where conn has timed out and removed
                 * between unlock of the rev_conn and lock of the forward conn;
                 * nothing to do. */
                pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
                ct_lock_unlock(&ct->buckets[bucket].lock);
                return;
            }
        }
    }

    bool create_new_conn = false;
    struct conn conn_for_un_nat_copy;
    conn_for_un_nat_copy.conn_type = CT_CONN_TYPE_DEFAULT;
    bool ftp_ctl = is_ftp_ctl(pkt);

    if (OVS_LIKELY(conn)) {
        if (ftp_ctl) {
            /* Keep sequence tracking in sync with the source of the
             * sequence skew. */
            if (ctx->reply != conn->seq_skew_dir) {
                handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
                               !!nat_action_info);
                create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                    bucket);
            } else {
                create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                    bucket);
                handle_ftp_ctl(ct, ctx, pkt, conn, now, CT_FTP_CTL_OTHER,
                               !!nat_action_info);
            }
        } else {
            create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                                bucket);
        }
        if (nat_action_info && !create_new_conn) {
            handle_nat(pkt, conn, zone, ctx->reply, ctx->icmp_related);
        }

    } else if (check_orig_tuple(ct, pkt, ctx, now, &bucket, &conn,
                                nat_action_info)) {
        create_new_conn = conn_update_state(ct, pkt, ctx, &conn, now,
                                            bucket);
    } else {
        if (ctx->icmp_related) {
            /* An icmp related conn should always be found; no new
             * connection is created based on an icmp related packet. */
            pkt->md.ct_state = CS_INVALID;
        } else {
            create_new_conn = true;
        }
    }

    const struct alg_exp_node *alg_exp = NULL;
    if (OVS_UNLIKELY(create_new_conn)) {
        struct alg_exp_node alg_exp_entry;

        ct_rwlock_rdlock(&ct->resources_lock);
        alg_exp = expectation_lookup(&ct->alg_expectations, &ctx->key,
                                     ct->hash_basis);
        if (alg_exp) {
            alg_exp_entry = *alg_exp;
            alg_exp = &alg_exp_entry;
        }
        ct_rwlock_unlock(&ct->resources_lock);

        conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
                              &conn_for_un_nat_copy, helper, alg_exp);
    }

    write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);

    if (conn && setmark) {
        set_mark(pkt, conn, setmark[0], setmark[1]);
    }

    if (conn && setlabel) {
        set_label(pkt, conn, &setlabel[0], &setlabel[1]);
    }

    bool tftp_ctl = is_tftp_ctl(pkt);
    struct conn conn_for_expectation;
    if (conn && (ftp_ctl || tftp_ctl)) {
        conn_for_expectation = *conn;
    }

    ct_lock_unlock(&ct->buckets[bucket].lock);

    if (is_un_nat_conn_valid(&conn_for_un_nat_copy)) {
        create_un_nat_conn(ct, &conn_for_un_nat_copy, now, !!alg_exp);
    }

    /* FTP control packet handling with expectation creation. */
    if (OVS_UNLIKELY(conn && ftp_ctl)) {
        handle_ftp_ctl(ct, ctx, pkt, &conn_for_expectation,
                       now, CT_FTP_CTL_INTEREST, !!nat_action_info);
    } else if (OVS_UNLIKELY(conn && tftp_ctl)) {
        handle_tftp_ctl(ct, &conn_for_expectation, now);
    }
}

/* Sends the packets in '*pkt_batch' through the connection tracker 'ct'.  All
 * the packets should have the same 'dl_type' (IPv4 or IPv6) and should have
 * the l3 and l4 offsets properly set.
 *
 * If 'commit' is true, the packets are allowed to create new entries in the
 * connection tables.  'setmark', if not NULL, should point to a two-element
 * array containing a value and a mask to set the connection mark.
 * 'setlabel' behaves similarly for the connection label. */
int
conntrack_execute(struct conntrack *ct, struct dp_packet_batch *pkt_batch,
                  ovs_be16 dl_type, bool force, bool commit, uint16_t zone,
                  const uint32_t *setmark,
                  const struct ovs_key_ct_labels *setlabel,
                  const char *helper,
                  const struct nat_action_info_t *nat_action_info,
                  long long now)
{
    struct dp_packet **pkts = pkt_batch->packets;
    size_t cnt = pkt_batch->count;
    struct conn_lookup_ctx ctx;

    for (size_t i = 0; i < cnt; i++) {
        if (!conn_key_extract(ct, pkts[i], dl_type, &ctx, zone)) {
            pkts[i]->md.ct_state = CS_INVALID;
            write_ct_md(pkts[i], zone, NULL, NULL, NULL);
            continue;
        }
        process_one(ct, pkts[i], &ctx, zone, force, commit,
                    now, setmark, setlabel, nat_action_info, helper);
    }

    return 0;
}
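
/* Illustrative sketch of a minimal caller (not part of this file); it
 * assumes an already populated IPv4 'batch' and uses no mark, label,
 * helper or NAT parameters:
 *
 *     struct conntrack ct;
 *     conntrack_init(&ct);
 *     ...
 *     conntrack_execute(&ct, &batch, htons(ETH_TYPE_IP),
 *                       false, true, 0,
 *                       NULL, NULL, NULL, NULL, time_msec());
 *     ...
 *     conntrack_destroy(&ct);
 */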
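/* Sets the connection mark from 'val' under 'mask' and mirrors it into the
 * packet metadata; for ALG related connections the existing mark is kept. */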
static void
set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t mask)
{
    if (conn->alg_related) {
        pkt->md.ct_mark = conn->mark;
    } else {
        pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask));
        conn->mark = pkt->md.ct_mark;
    }
}

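/* Sets the connection label from 'val' under 'mask' and mirrors it into the
 * packet metadata; for ALG related connections the existing label is kept. */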
static void
set_label(struct dp_packet *pkt, struct conn *conn,
          const struct ovs_key_ct_labels *val,
          const struct ovs_key_ct_labels *mask)
{
    if (conn->alg_related) {
        pkt->md.ct_label = conn->label;
    } else {
        ovs_u128 v, m;

        memcpy(&v, val, sizeof v);
        memcpy(&m, mask, sizeof m);

        pkt->md.ct_label.u64.lo = v.u64.lo
                                  | (pkt->md.ct_label.u64.lo & ~(m.u64.lo));
        pkt->md.ct_label.u64.hi = v.u64.hi
                                  | (pkt->md.ct_label.u64.hi & ~(m.u64.hi));
        conn->label = pkt->md.ct_label;
    }
}

\f
/* Delete the expired connections from 'ctb', up to 'limit'.  Returns the
 * earliest expiration time among the remaining connections in 'ctb'.
 * Returns LLONG_MAX if 'ctb' is empty.  The return value might be smaller
 * than 'now' if 'limit' is reached. */
static long long
sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb,
             long long now, size_t limit)
    OVS_REQUIRES(ctb->lock)
{
    struct conn *conn, *next;
    long long min_expiration = LLONG_MAX;
    unsigned i;
    size_t count = 0;

    for (i = 0; i < N_CT_TM; i++) {
        LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
            if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
                if (!conn_expired(conn, now) || count >= limit) {
                    min_expiration = MIN(min_expiration, conn->expiration);
                    if (count >= limit) {
                        /* Do not check other lists. */
                        COVERAGE_INC(conntrack_long_cleanup);
                        return min_expiration;
                    }
                    break;
                }
                conn_clean(ct, conn, ctb);
                count++;
            }
        }
    }

    enum { MAX_ALG_EXP_TO_EXPIRE = 1000 };
    size_t alg_exp_count = hmap_count(&ct->alg_expectations);
    /* XXX: revisit this. */
    size_t max_to_expire = MAX(alg_exp_count/10, MAX_ALG_EXP_TO_EXPIRE);
    count = 0;
    ct_rwlock_wrlock(&ct->resources_lock);
    struct alg_exp_node *alg_exp_node, *alg_exp_node_next;
    LIST_FOR_EACH_SAFE (alg_exp_node, alg_exp_node_next,
                        exp_node, &ct->alg_exp_list) {
        if (now < alg_exp_node->expiration || count >= max_to_expire) {
            min_expiration = MIN(min_expiration, alg_exp_node->expiration);
            break;
        }
        ovs_list_remove(&alg_exp_node->exp_node);
        hmap_remove(&ct->alg_expectations, &alg_exp_node->node);
        free(alg_exp_node);
        count++;
    }
    ct_rwlock_unlock(&ct->resources_lock);

    return min_expiration;
}

/* Cleans up old connection entries from 'ct'.  Returns the time when the
 * next expiration might happen.  The return value might be smaller than
 * 'now', meaning that an internal limit has been reached, and some expired
 * connections have not been deleted. */
static long long
conntrack_clean(struct conntrack *ct, long long now)
{
    long long next_wakeup = now + CT_TM_MIN;
    unsigned int n_conn_limit;
    size_t clean_count = 0;
    unsigned i;

    atomic_read_relaxed(&ct->n_conn_limit, &n_conn_limit);

    for (i = 0; i < CONNTRACK_BUCKETS; i++) {
        struct conntrack_bucket *ctb = &ct->buckets[i];
        size_t prev_count;
        long long min_exp;

        ovs_mutex_lock(&ctb->cleanup_mutex);
        if (ctb->next_cleanup > now) {
            goto next_bucket;
        }

        ct_lock_lock(&ctb->lock);
        prev_count = hmap_count(&ctb->connections);
        /* If the connections are well distributed among buckets, we want to
         * limit to 10% of the global limit equally split among buckets. If
         * the bucket is busier than the others, we limit to 10% of its
         * current size. */
        min_exp = sweep_bucket(ct, ctb, now,
            MAX(prev_count/10, n_conn_limit/(CONNTRACK_BUCKETS*10)));
        clean_count += prev_count - hmap_count(&ctb->connections);

        if (min_exp > now) {
            /* We call hmap_shrink() only if sweep_bucket() managed to delete
             * every expired connection. */
            hmap_shrink(&ctb->connections);
        }

        ct_lock_unlock(&ctb->lock);

        ctb->next_cleanup = MIN(min_exp, now + CT_TM_MIN);

next_bucket:
        next_wakeup = MIN(next_wakeup, ctb->next_cleanup);
        ovs_mutex_unlock(&ctb->cleanup_mutex);
    }

    VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec",
             clean_count, time_msec() - now);

    return next_wakeup;
}

/* Cleanup:
 *
 * We must call conntrack_clean() periodically.  conntrack_clean()'s return
 * value gives a hint on when the next cleanup must be done (either because
 * there is an actual connection that expires, or because a new connection
 * might be created with the minimum timeout).
 *
 * The logic below has two goals:
 *
 * - We want to reduce the number of wakeups and batch connection cleanup
 *   when the load is not very high.  CT_CLEAN_INTERVAL ensures that if we
 *   are coping with the current cleanup tasks, then we wait at least
 *   5 seconds to do further cleanup.
 *
 * - We don't want to keep the buckets locked too long, as we might prevent
 *   traffic from flowing.  CT_CLEAN_MIN_INTERVAL ensures that if cleanup is
 *   behind, there are at least some 200ms blocks of time when buckets will
 *   be left alone, so the datapath can operate unhindered.
 */
#define CT_CLEAN_INTERVAL 5000 /* 5 seconds */
#define CT_CLEAN_MIN_INTERVAL 200  /* 0.2 seconds */

static void *
clean_thread_main(void *f_)
{
    struct conntrack *ct = f_;

    while (!latch_is_set(&ct->clean_thread_exit)) {
        long long next_wake;
        long long now = time_msec();

        next_wake = conntrack_clean(ct, now);

        if (next_wake < now) {
            poll_timer_wait_until(now + CT_CLEAN_MIN_INTERVAL);
        } else {
            poll_timer_wait_until(MAX(next_wake, now + CT_CLEAN_INTERVAL));
        }
        latch_wait(&ct->clean_thread_exit);
        poll_block();
    }

    return NULL;
}
\f
/* Key extraction */

/* The function stores a pointer to the first byte after the header in
 * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 * not interested in the header's tail, meaning that the header has
 * already been parsed (e.g. by flow_extract): we take this as a hint to
 * save a few checks.  If 'validate_checksum' is true, the function returns
 * false if the IPv4 checksum is invalid. */
static inline bool
extract_l3_ipv4(struct conn_key *key, const void *data, size_t size,
                const char **new_data, bool validate_checksum)
{
    const struct ip_header *ip = data;
    size_t ip_len;

    if (new_data) {
        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
            return false;
        }
    }

    ip_len = IP_IHL(ip->ip_ihl_ver) * 4;

    if (new_data) {
        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
            return false;
        }
        if (OVS_UNLIKELY(size < ip_len)) {
            return false;
        }

        *new_data = (char *) data + ip_len;
    }

    if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
        return false;
    }

    if (validate_checksum && csum(data, ip_len) != 0) {
        return false;
    }

    key->src.addr.ipv4 = ip->ip_src;
    key->dst.addr.ipv4 = ip->ip_dst;
    key->nw_proto = ip->ip_proto;

    return true;
}

/* The function stores a pointer to the first byte after the header in
 * '*new_data', if 'new_data' is not NULL.  If it is NULL, the caller is
 * not interested in the header's tail, meaning that the header has
 * already been parsed (e.g. by flow_extract): we take this as a hint to
 * save a few checks. */
static inline bool
extract_l3_ipv6(struct conn_key *key, const void *data, size_t size,
                const char **new_data)
{
    const struct ovs_16aligned_ip6_hdr *ip6 = data;

    if (new_data) {
        if (OVS_UNLIKELY(size < sizeof *ip6)) {
            return false;
        }
    }

    uint8_t nw_proto = ip6->ip6_nxt;
    uint8_t nw_frag = 0;

    data = ip6 + 1;
    size -= sizeof *ip6;

    if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) {
        return false;
    }

    if (new_data) {
        *new_data = data;
    }

    if (nw_frag) {
        return false;
    }

    key->src.addr.ipv6 = ip6->ip6_src;
    key->dst.addr.ipv6 = ip6->ip6_dst;
    key->nw_proto = nw_proto;

    return true;
}

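/* Verifies the L4 checksum over the pseudoheader derived from 'l3' plus the
 * 'size' bytes at 'data'.  Returns true if the checksum is valid. */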
static inline bool
checksum_valid(const struct conn_key *key, const void *data, size_t size,
               const void *l3)
{
    uint32_t csum = 0;

    if (key->dl_type == htons(ETH_TYPE_IP)) {
        csum = packet_csum_pseudoheader(l3);
    } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
        csum = packet_csum_pseudoheader6(l3);
    } else {
        return false;
    }

    csum = csum_continue(csum, data, size);

    return csum_finish(csum) == 0;
}

static inline bool
check_l4_tcp(const struct conn_key *key, const void *data, size_t size,
             const void *l3, bool validate_checksum)
{
    const struct tcp_header *tcp = data;
    if (size < sizeof *tcp) {
        return false;
    }

    size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
    if (OVS_UNLIKELY(tcp_len < TCP_HEADER_LEN || tcp_len > size)) {
        return false;
    }

    return validate_checksum ? checksum_valid(key, data, size, l3) : true;
}

static inline bool
check_l4_udp(const struct conn_key *key, const void *data, size_t size,
             const void *l3, bool validate_checksum)
{
    const struct udp_header *udp = data;
    if (size < sizeof *udp) {
        return false;
    }

    size_t udp_len = ntohs(udp->udp_len);
    if (OVS_UNLIKELY(udp_len < UDP_HEADER_LEN || udp_len > size)) {
        return false;
    }

    /* Validation must be skipped if checksum is 0 on IPv4 packets. */
    return (udp->udp_csum == 0 && key->dl_type == htons(ETH_TYPE_IP))
           || (validate_checksum ? checksum_valid(key, data, size, l3) : true);
}

static inline bool
check_l4_icmp(const void *data, size_t size, bool validate_checksum)
{
    return validate_checksum ? csum(data, size) == 0 : true;
}

static inline bool
check_l4_icmp6(const struct conn_key *key, const void *data, size_t size,
               const void *l3, bool validate_checksum)
{
    return validate_checksum ? checksum_valid(key, data, size, l3) : true;
}

static inline bool
extract_l4_tcp(struct conn_key *key, const void *data, size_t size)
{
    const struct tcp_header *tcp = data;

    if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) {
        return false;
    }

    key->src.port = tcp->tcp_src;
    key->dst.port = tcp->tcp_dst;

    /* Port 0 is invalid */
    return key->src.port && key->dst.port;
}

static inline bool
extract_l4_udp(struct conn_key *key, const void *data, size_t size)
{
    const struct udp_header *udp = data;

    if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) {
        return false;
    }

    key->src.port = udp->udp_src;
    key->dst.port = udp->udp_dst;

    /* Port 0 is invalid */
    return key->src.port && key->dst.port;
}

static inline bool extract_l4(struct conn_key *key, const void *data,
                              size_t size, bool *related, const void *l3,
                              bool validate_checksum);

static uint8_t
reverse_icmp_type(uint8_t type)
{
    switch (type) {
    case ICMP4_ECHO_REQUEST:
        return ICMP4_ECHO_REPLY;
    case ICMP4_ECHO_REPLY:
        return ICMP4_ECHO_REQUEST;

    case ICMP4_TIMESTAMP:
        return ICMP4_TIMESTAMPREPLY;
    case ICMP4_TIMESTAMPREPLY:
        return ICMP4_TIMESTAMP;

    case ICMP4_INFOREQUEST:
        return ICMP4_INFOREPLY;
    case ICMP4_INFOREPLY:
        return ICMP4_INFOREQUEST;
    default:
        OVS_NOT_REACHED();
    }
}

/* If 'related' is not NULL and the function is processing an ICMP
 * error packet, extract the l3 and l4 fields from the nested header
 * instead and set *related to true.  If 'related' is NULL we're
 * already processing a nested header and no such recursion is
 * possible. */
static inline bool
extract_l4_icmp(struct conn_key *key, const void *data, size_t size,
                bool *related)
{
    const struct icmp_header *icmp = data;

    if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) {
        return false;
    }

    switch (icmp->icmp_type) {
    case ICMP4_ECHO_REQUEST:
    case ICMP4_ECHO_REPLY:
    case ICMP4_TIMESTAMP:
    case ICMP4_TIMESTAMPREPLY:
    case ICMP4_INFOREQUEST:
    case ICMP4_INFOREPLY:
        if (icmp->icmp_code != 0) {
            return false;
        }
        /* Separate ICMP connection: identified using id */
        key->src.icmp_id = key->dst.icmp_id = icmp->icmp_fields.echo.id;
        key->src.icmp_type = icmp->icmp_type;
        key->dst.icmp_type = reverse_icmp_type(icmp->icmp_type);
        break;
    case ICMP4_DST_UNREACH:
    case ICMP4_TIME_EXCEEDED:
    case ICMP4_PARAM_PROB:
    case ICMP4_SOURCEQUENCH:
    case ICMP4_REDIRECT: {
        /* ICMP packet part of another connection. We should
         * extract the key from embedded packet header */
        struct conn_key inner_key;
        const char *l3 = (const char *) (icmp + 1);
        const char *tail = (const char *) data + size;
        const char *l4;
        bool ok;

        if (!related) {
            return false;
        }

        memset(&inner_key, 0, sizeof inner_key);
        inner_key.dl_type = htons(ETH_TYPE_IP);
        ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4, false);
        if (!ok) {
            return false;
        }

        if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned
            || inner_key.dst.addr.ipv4_aligned != key->src.addr.ipv4_aligned) {
            return false;
        }

        key->src = inner_key.src;
        key->dst = inner_key.dst;
        key->nw_proto = inner_key.nw_proto;

        ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
        if (ok) {
            conn_key_reverse(key);
            *related = true;
        }
        return ok;
    }
    default:
        return false;
    }

    return true;
}

static uint8_t
reverse_icmp6_type(uint8_t type)
{
    switch (type) {
    case ICMP6_ECHO_REQUEST:
        return ICMP6_ECHO_REPLY;
    case ICMP6_ECHO_REPLY:
        return ICMP6_ECHO_REQUEST;
    default:
        OVS_NOT_REACHED();
    }
}

/* If 'related' is not NULL and the function is processing an ICMP
 * error packet, extract the l3 and l4 fields from the nested header
 * instead and set *related to true.  If 'related' is NULL we're
 * already processing a nested header and no such recursion is
 * possible. */
static inline bool
extract_l4_icmp6(struct conn_key *key, const void *data, size_t size,
                 bool *related)
{
    const struct icmp6_header *icmp6 = data;

    /* All the messages that we support need at least 4 bytes after
     * the header */
    if (size < sizeof *icmp6 + 4) {
        return false;
    }

    switch (icmp6->icmp6_type) {
    case ICMP6_ECHO_REQUEST:
    case ICMP6_ECHO_REPLY:
        if (icmp6->icmp6_code != 0) {
            return false;
        }
        /* Separate ICMP connection: identified using id */
        key->src.icmp_id = key->dst.icmp_id = *(ovs_be16 *) (icmp6 + 1);
        key->src.icmp_type = icmp6->icmp6_type;
        key->dst.icmp_type = reverse_icmp6_type(icmp6->icmp6_type);
        break;
    case ICMP6_DST_UNREACH:
    case ICMP6_PACKET_TOO_BIG:
    case ICMP6_TIME_EXCEEDED:
    case ICMP6_PARAM_PROB: {
        /* ICMP packet part of another connection. We should
         * extract the key from embedded packet header */
        struct conn_key inner_key;
        const char *l3 = (const char *) icmp6 + 8;
        const char *tail = (const char *) data + size;
        const char *l4 = NULL;
        bool ok;

        if (!related) {
            return false;
        }

        memset(&inner_key, 0, sizeof inner_key);
        inner_key.dl_type = htons(ETH_TYPE_IPV6);
        ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4);
        if (!ok) {
            return false;
        }

        /* pf doesn't do this, but it seems a good idea */
        if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned,
                              &key->dst.addr.ipv6_aligned)
            || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned,
                                 &key->src.addr.ipv6_aligned)) {
            return false;
        }

        key->src = inner_key.src;
        key->dst = inner_key.dst;
        key->nw_proto = inner_key.nw_proto;

        ok = extract_l4(key, l4, tail - l4, NULL, l3, false);
        if (ok) {
            conn_key_reverse(key);
            *related = true;
        }
        return ok;
    }
    default:
        return false;
    }

    return true;
}

1734 /* Extract l4 fields into 'key', which must already contain valid l3
1735 * members.
1736 *
1737 * If 'related' is not NULL and an ICMP error packet is being
1738 * processed, the function will extract the key from the packet nested
1739 * in the ICMP payload and set '*related' to true.
1740 *
1741 * If 'related' is NULL, it means that we're already parsing a header nested
1742 * in an ICMP error. In this case, we skip checksum and length validation. */
1743 static inline bool
1744 extract_l4(struct conn_key *key, const void *data, size_t size, bool *related,
1745 const void *l3, bool validate_checksum)
1746 {
1747 if (key->nw_proto == IPPROTO_TCP) {
1748 return (!related || check_l4_tcp(key, data, size, l3,
1749 validate_checksum)) && extract_l4_tcp(key, data, size);
1750 } else if (key->nw_proto == IPPROTO_UDP) {
1751 return (!related || check_l4_udp(key, data, size, l3,
1752 validate_checksum)) && extract_l4_udp(key, data, size);
1753 } else if (key->dl_type == htons(ETH_TYPE_IP)
1754 && key->nw_proto == IPPROTO_ICMP) {
1755 return (!related || check_l4_icmp(data, size, validate_checksum))
1756 && extract_l4_icmp(key, data, size, related);
1757 } else if (key->dl_type == htons(ETH_TYPE_IPV6)
1758 && key->nw_proto == IPPROTO_ICMPV6) {
1759 return (!related || check_l4_icmp6(key, data, size, l3,
1760 validate_checksum)) && extract_l4_icmp6(key, data, size,
1761 related);
1762 } else {
1763 return false;
1764 }
1765 }
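
/* Editor's note: a minimal sketch of the two ways extract_l4() is invoked
* in this file (see extract_l4_icmp6() above and conn_key_extract() below):
*
*     extract_l4(&key, l4, size, &related, l3, validate);
*         Top-level parse: nested ICMP errors are followed and, when
*         'validate' is true, checksums are verified.
*
*     extract_l4(&key, l4, size, NULL, l3, false);
*         Nested parse (already inside an ICMP error): no further
*         recursion and no checksum or length validation.
*/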
1766
1767 static bool
1768 conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, ovs_be16 dl_type,
1769 struct conn_lookup_ctx *ctx, uint16_t zone)
1770 {
1771 const struct eth_header *l2 = dp_packet_eth(pkt);
1772 const struct ip_header *l3 = dp_packet_l3(pkt);
1773 const char *l4 = dp_packet_l4(pkt);
1774 const char *tail = dp_packet_tail(pkt);
1775 bool ok;
1776
1777 memset(ctx, 0, sizeof *ctx);
1778
1779 if (!l2 || !l3 || !l4) {
1780 return false;
1781 }
1782
1783 ctx->key.zone = zone;
1784
1785 /* XXX In this function we parse the packet (again, it has already
1786 * gone through miniflow_extract()) for two reasons:
1787 *
1788 * 1) To extract the l3 addresses and l4 ports.
1789 * We already have the l3 and l4 headers' pointers. Extracting
1790 * the l3 addresses and the l4 ports is really cheap, since they
1791 * can be found at fixed locations.
1792 * 2) To extract the l4 type.
1793 * Extracting the l4 type, for IPv6, can be quite expensive,
1794 * because it's not at a fixed location.
1795 *
1796 * Here's a way to avoid (2) with the help of the datapath.
1797 * The datapath doesn't keep the packet's extracted flow[1], so
1798 * using that is not an option. We could use the packet's matching
1799 * megaflow, but we have to make sure that the l4 type (nw_proto)
1800 * is unwildcarded. This means either:
1801 *
1802 * a) dpif-netdev unwildcards the l4 type when a new flow is installed
1803 * if the actions contain ct().
1804 *
1805 * b) ofproto-dpif-xlate unwildcards the l4 type when translating a ct()
1806 * action. This is already done for other actions, but it's
1807 * unnecessary for the kernel datapath.
1808 *
1809 * ---
1810 * [1] The reasons for this are that keeping the flow increases
1811 * (slightly) the cache footprint and increases computation
1812 * time as we move the packet around. Most importantly, the flow
1813 * should be updated by the actions and this can be slow, as
1814 * we use a sparse representation (miniflow).
1815 *
1816 */
1817 ctx->key.dl_type = dl_type;
1818 if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
1819 bool hwol_bad_l3_csum = dp_packet_ip_checksum_bad(pkt);
1820 if (hwol_bad_l3_csum) {
1821 ok = false;
1822 } else {
1823 bool hwol_good_l3_csum = dp_packet_ip_checksum_valid(pkt);
1824 /* Validate the checksum only when hwol is not supported. */
1825 ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL,
1826 !hwol_good_l3_csum);
1827 }
1828 } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
1829 ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL);
1830 } else {
1831 ok = false;
1832 }
1833
1834
1835 if (ok) {
1836 bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt);
1837 if (!hwol_bad_l4_csum) {
1838 bool hwol_good_l4_csum = dp_packet_l4_checksum_valid(pkt);
1839 /* Validate the checksum only when hwol is not supported. */
1840 if (extract_l4(&ctx->key, l4, tail - l4, &ctx->icmp_related, l3,
1841 !hwol_good_l4_csum)) {
1842 ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis);
1843 return true;
1844 }
1845 }
1846 }
1847
1848 return false;
1849 }
1850
1851 static uint32_t
1852 ct_addr_hash_add(uint32_t hash, const struct ct_addr *addr)
1853 {
1854 BUILD_ASSERT_DECL(sizeof *addr % 4 == 0);
1855 return hash_add_bytes32(hash, (const uint32_t *) addr, sizeof *addr);
1856 }
1857
1858 static uint32_t
1859 ct_endpoint_hash_add(uint32_t hash, const struct ct_endpoint *ep)
1860 {
1861 BUILD_ASSERT_DECL(sizeof *ep % 4 == 0);
1862 return hash_add_bytes32(hash, (const uint32_t *) ep, sizeof *ep);
1863 }
1864 \f
1865 /* Symmetric */
1866 static uint32_t
1867 conn_key_hash(const struct conn_key *key, uint32_t basis)
1868 {
1869 uint32_t hsrc, hdst, hash;
1870
1871 hsrc = hdst = basis;
1872 hsrc = ct_endpoint_hash_add(hsrc, &key->src);
1873 hdst = ct_endpoint_hash_add(hdst, &key->dst);
1874
1875 /* Even if source and destination are swapped, the hash will be the same. */
1876 hash = hsrc ^ hdst;
1877
1878 /* Hash the rest of the key (L3 and L4 types and zone). */
1879 hash = hash_words((uint32_t *) (&key->dst + 1),
1880 (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1),
1881 hash);
1882
1883 return hash_finish(hash, 0);
1884 }
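
/* Editor's note: a small sketch of the symmetry property, assuming 'key'
* and 'basis' are already populated.  Since 'hsrc' and 'hdst' are combined
* with XOR and the trailing fields are unchanged by reversal, a key and
* its reverse hash identically:
*
*     uint32_t h1 = conn_key_hash(&key, basis);
*     conn_key_reverse(&key);
*     ovs_assert(conn_key_hash(&key, basis) == h1);
*
* This is what lets conn_key_lookup() find a connection in the same bucket
* whether a packet matches 'key' or 'rev_key'. */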
1885
1886 static void
1887 conn_key_reverse(struct conn_key *key)
1888 {
1889 struct ct_endpoint tmp;
1890
1891 tmp = key->src;
1892 key->src = key->dst;
1893 key->dst = tmp;
1894 }
1895
1896 static uint32_t
1897 nat_ipv6_addrs_delta(struct in6_addr *ipv6_aligned_min,
1898 struct in6_addr *ipv6_aligned_max)
1899 {
1900 uint8_t *ipv6_min_hi = &ipv6_aligned_min->s6_addr[0];
1901 uint8_t *ipv6_min_lo = &ipv6_aligned_min->s6_addr[0] + sizeof(uint64_t);
1902 uint8_t *ipv6_max_hi = &ipv6_aligned_max->s6_addr[0];
1903 uint8_t *ipv6_max_lo = &ipv6_aligned_max->s6_addr[0] + sizeof(uint64_t);
1904
1905 ovs_be64 addr6_64_min_hi;
1906 ovs_be64 addr6_64_min_lo;
1907 memcpy(&addr6_64_min_hi, ipv6_min_hi, sizeof addr6_64_min_hi);
1908 memcpy(&addr6_64_min_lo, ipv6_min_lo, sizeof addr6_64_min_lo);
1909
1910 ovs_be64 addr6_64_max_hi;
1911 ovs_be64 addr6_64_max_lo;
1912 memcpy(&addr6_64_max_hi, ipv6_max_hi, sizeof addr6_64_max_hi);
1913 memcpy(&addr6_64_max_lo, ipv6_max_lo, sizeof addr6_64_max_lo);
1914
1915 uint64_t diff;
1916 if (addr6_64_min_hi == addr6_64_max_hi &&
1917 ntohll(addr6_64_min_lo) <= ntohll(addr6_64_max_lo)) {
1918 diff = ntohll(addr6_64_max_lo) - ntohll(addr6_64_min_lo);
1919 } else if (ntohll(addr6_64_min_hi) + 1 == ntohll(addr6_64_max_hi) &&
1920 ntohll(addr6_64_min_lo) > ntohll(addr6_64_max_lo)) {
1921 diff = UINT64_MAX - (ntohll(addr6_64_min_lo) -
1922 ntohll(addr6_64_max_lo) - 1);
1923 } else {
1924 /* Limit the supported address delta to 32 bits (~4 billion).
1925 * Possibly this should be visible to the user through a datapath
1926 * support check; however, the practical impact is probably nil. */
1927 diff = 0xfffffffe;
1928 }
1929 if (diff > 0xfffffffe) {
1930 diff = 0xfffffffe;
1931 }
1932 return diff;
1933 }
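
/* Editor's note: a worked example for nat_ipv6_addrs_delta().  For the
* range [2001:db8::1, 2001:db8::ff] the high 64 bits are equal, so the
* first branch applies:
*
*     diff = 0xff - 0x01 = 0xfe    (254 addresses above the minimum)
*
* A range whose low 64 bits wrap (min_hi + 1 == max_hi, min_lo > max_lo)
* takes the second branch; anything else is clamped to 0xfffffffe. */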
1934
1935 /* This function must be used in tandem with nat_ipv6_addrs_delta(), which
1936 * restricts the input parameters. */
1937 static void
1938 nat_ipv6_addr_increment(struct in6_addr *ipv6_aligned, uint32_t increment)
1939 {
1940 uint8_t *ipv6_hi = &ipv6_aligned->s6_addr[0];
1941 uint8_t *ipv6_lo = &ipv6_aligned->s6_addr[0] + sizeof(ovs_be64);
1942 ovs_be64 addr6_64_hi;
1943 ovs_be64 addr6_64_lo;
1944 memcpy(&addr6_64_hi, ipv6_hi, sizeof addr6_64_hi);
1945 memcpy(&addr6_64_lo, ipv6_lo, sizeof addr6_64_lo);
1946
1947 if (UINT64_MAX - increment >= ntohll(addr6_64_lo)) {
1948 addr6_64_lo = htonll(increment + ntohll(addr6_64_lo));
1949 } else if (addr6_64_hi != OVS_BE64_MAX) {
1950 addr6_64_hi = htonll(1 + ntohll(addr6_64_hi));
1951 addr6_64_lo = htonll(increment - (UINT64_MAX -
1952 ntohll(addr6_64_lo) + 1));
1953 } else {
1954 OVS_NOT_REACHED();
1955 }
1956
1957 memcpy(ipv6_hi, &addr6_64_hi, sizeof addr6_64_hi);
1958 memcpy(ipv6_lo, &addr6_64_lo, sizeof addr6_64_lo);
1959
1960 return;
1961 }
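
/* Editor's note: a worked example of the carry case in
* nat_ipv6_addr_increment().  Incrementing 2001:db8::ffff:ffff:ffff:ffff
* by 1 overflows the low 64 bits, so the high 64 bits are bumped and the
* remainder of the increment lands in the low word:
*
*     2001:db8::ffff:ffff:ffff:ffff + 1  ->  2001:db8:0:1::
*
* The OVS_NOT_REACHED() case cannot trigger when the increment came from
* nat_ipv6_addrs_delta(), which is why the two must be used in tandem. */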
1962
1963 static uint32_t
1964 nat_range_hash(const struct conn *conn, uint32_t basis)
1965 {
1966 uint32_t hash = basis;
1967
1968 hash = ct_addr_hash_add(hash, &conn->nat_info->min_addr);
1969 hash = ct_addr_hash_add(hash, &conn->nat_info->max_addr);
1970 hash = hash_add(hash,
1971 (conn->nat_info->max_port << 16)
1972 | conn->nat_info->min_port);
1973
1974 hash = ct_endpoint_hash_add(hash, &conn->key.src);
1975 hash = ct_endpoint_hash_add(hash, &conn->key.dst);
1976
1977 hash = hash_add(hash, (OVS_FORCE uint32_t) conn->key.dl_type);
1978 hash = hash_add(hash, conn->key.nw_proto);
1979 hash = hash_add(hash, conn->key.zone);
1980
1981 /* The purpose of the second parameter is to distinguish hashes of data of
1982 * different length; our data always has the same length, so there is no
1983 * value in including it. */
1984 return hash_finish(hash, 0);
1985 }
1986
1987 static bool
1988 nat_select_range_tuple(struct conntrack *ct, const struct conn *conn,
1989 struct conn *nat_conn)
1990 {
1991 enum { MIN_NAT_EPHEMERAL_PORT = 1024,
1992 MAX_NAT_EPHEMERAL_PORT = 65535 };
1993
1994 uint16_t min_port;
1995 uint16_t max_port;
1996 uint16_t first_port;
1997
1998 uint32_t hash = nat_range_hash(conn, ct->hash_basis);
1999
2000 if ((conn->nat_info->nat_action & NAT_ACTION_SRC) &&
2001 (!(conn->nat_info->nat_action & NAT_ACTION_SRC_PORT))) {
2002 min_port = ntohs(conn->key.src.port);
2003 max_port = ntohs(conn->key.src.port);
2004 first_port = min_port;
2005 } else if ((conn->nat_info->nat_action & NAT_ACTION_DST) &&
2006 (!(conn->nat_info->nat_action & NAT_ACTION_DST_PORT))) {
2007 min_port = ntohs(conn->key.dst.port);
2008 max_port = ntohs(conn->key.dst.port);
2009 first_port = min_port;
2010 } else {
2011 uint16_t deltap = conn->nat_info->max_port - conn->nat_info->min_port;
2012 uint32_t port_index = hash % (deltap + 1);
2013 first_port = conn->nat_info->min_port + port_index;
2014 min_port = conn->nat_info->min_port;
2015 max_port = conn->nat_info->max_port;
2016 }
2017
2018 uint32_t deltaa = 0;
2019 uint32_t address_index;
2020 struct ct_addr ct_addr;
2021 memset(&ct_addr, 0, sizeof ct_addr);
2022 struct ct_addr max_ct_addr;
2023 memset(&max_ct_addr, 0, sizeof max_ct_addr);
2024 max_ct_addr = conn->nat_info->max_addr;
2025
2026 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
2027 deltaa = ntohl(conn->nat_info->max_addr.ipv4_aligned) -
2028 ntohl(conn->nat_info->min_addr.ipv4_aligned);
2029 address_index = hash % (deltaa + 1);
2030 ct_addr.ipv4_aligned = htonl(
2031 ntohl(conn->nat_info->min_addr.ipv4_aligned) + address_index);
2032 } else {
2033 deltaa = nat_ipv6_addrs_delta(&conn->nat_info->min_addr.ipv6_aligned,
2034 &conn->nat_info->max_addr.ipv6_aligned);
2035 /* deltaa must be within 32 bits for full hash coverage. A 64- or
2036 * 128-bit hash is unnecessary and hence not used here. Most code
2037 * is kept common with the IPv4 case; nat_ipv6_addrs_delta() caps
2038 * the delta, and 'max_ct_addr' is recomputed from it here. */
2039 max_ct_addr = conn->nat_info->min_addr;
2040 nat_ipv6_addr_increment(&max_ct_addr.ipv6_aligned, deltaa);
2041
2042 address_index = hash % (deltaa + 1);
2043 ct_addr.ipv6_aligned = conn->nat_info->min_addr.ipv6_aligned;
2044 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, address_index);
2045 }
2046
2047 uint16_t port = first_port;
2048 bool all_ports_tried = false;
2049 bool original_ports_tried = false;
2050 struct ct_addr first_addr = ct_addr;
2051
2052 while (true) {
2053 if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
2054 nat_conn->rev_key.dst.addr = ct_addr;
2055 } else {
2056 nat_conn->rev_key.src.addr = ct_addr;
2057 }
2058
2059 if ((conn->key.nw_proto == IPPROTO_ICMP) ||
2060 (conn->key.nw_proto == IPPROTO_ICMPV6)) {
2061 all_ports_tried = true;
2062 } else if (conn->nat_info->nat_action & NAT_ACTION_SRC) {
2063 nat_conn->rev_key.dst.port = htons(port);
2064 } else {
2065 nat_conn->rev_key.src.port = htons(port);
2066 }
2067
2068 struct nat_conn_key_node *nat_conn_key_node =
2069 nat_conn_keys_lookup(&ct->nat_conn_keys, &nat_conn->rev_key,
2070 ct->hash_basis);
2071
2072 if (!nat_conn_key_node) {
2073 struct nat_conn_key_node *nat_conn_key =
2074 xzalloc(sizeof *nat_conn_key);
2075 nat_conn_key->key = nat_conn->rev_key;
2076 nat_conn_key->value = nat_conn->key;
2077 uint32_t nat_conn_key_hash = conn_key_hash(&nat_conn_key->key,
2078 ct->hash_basis);
2079 hmap_insert(&ct->nat_conn_keys, &nat_conn_key->node,
2080 nat_conn_key_hash);
2081 return true;
2082 } else if (!all_ports_tried) {
2083 if (min_port == max_port) {
2084 all_ports_tried = true;
2085 } else if (port == max_port) {
2086 port = min_port;
2087 } else {
2088 port++;
2089 }
2090 if (port == first_port) {
2091 all_ports_tried = true;
2092 }
2093 } else {
2094 if (memcmp(&ct_addr, &max_ct_addr, sizeof ct_addr)) {
2095 if (conn->key.dl_type == htons(ETH_TYPE_IP)) {
2096 ct_addr.ipv4_aligned = htonl(
2097 ntohl(ct_addr.ipv4_aligned) + 1);
2098 } else {
2099 nat_ipv6_addr_increment(&ct_addr.ipv6_aligned, 1);
2100 }
2101 } else {
2102 ct_addr = conn->nat_info->min_addr;
2103 }
2104 if (!memcmp(&ct_addr, &first_addr, sizeof ct_addr)) {
2105 if (!original_ports_tried) {
2106 original_ports_tried = true;
2107 ct_addr = conn->nat_info->min_addr;
2108 min_port = MIN_NAT_EPHEMERAL_PORT;
2109 max_port = MAX_NAT_EPHEMERAL_PORT;
2110 } else {
2111 break;
2112 }
2113 }
2114 first_port = min_port;
2115 port = first_port;
2116 all_ports_tried = false;
2117 }
2118 }
2119 return false;
2120 }
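
/* Editor's note: a compact trace of the search order above, for a
* hypothetical SNAT range of 10.0.0.1-10.0.0.2 with ports 5000-5001:
*
*     1. Start at the hash-selected pair, e.g. (10.0.0.2, 5001).
*     2. Cycle through the remaining configured ports at that address.
*     3. Advance to the next address, wrapping at 'max_ct_addr', and retry
*        the ports (for ICMP, which has no ports, only this step applies).
*     4. Once every pair has collided, restart from 'min_addr' with the
*        ephemeral range 1024-65535, and give up only after that pass
*        also fails, returning false.
*/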
2121
2122 /* This function must be called with the ct->resources lock taken. */
2123 static struct nat_conn_key_node *
2124 nat_conn_keys_lookup(struct hmap *nat_conn_keys,
2125 const struct conn_key *key,
2126 uint32_t basis)
2127 {
2128 struct nat_conn_key_node *nat_conn_key_node;
2129 uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
2130
2131 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash,
2132 nat_conn_keys) {
2133 if (!conn_key_cmp(&nat_conn_key_node->key, key)) {
2134 return nat_conn_key_node;
2135 }
2136 }
2137 return NULL;
2138 }
2139
2140 /* This function must be called with the ct->resources write lock taken. */
2141 static void
2142 nat_conn_keys_remove(struct hmap *nat_conn_keys,
2143 const struct conn_key *key,
2144 uint32_t basis)
2145 {
2146 struct nat_conn_key_node *nat_conn_key_node;
2147 uint32_t nat_conn_key_hash = conn_key_hash(key, basis);
2148
2149 HMAP_FOR_EACH_WITH_HASH (nat_conn_key_node, node, nat_conn_key_hash,
2150 nat_conn_keys) {
2151 if (!conn_key_cmp(&nat_conn_key_node->key, key)) {
2152 hmap_remove(nat_conn_keys, &nat_conn_key_node->node);
2153 free(nat_conn_key_node);
2154 return;
2155 }
2156 }
2157 }
2158
2159 static void
2160 conn_key_lookup(struct conntrack_bucket *ctb, struct conn_lookup_ctx *ctx,
2161 long long now)
2162 OVS_REQUIRES(ctb->lock)
2163 {
2164 uint32_t hash = ctx->hash;
2165 struct conn *conn;
2166
2167 ctx->conn = NULL;
2168
2169 HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
2170 if (!conn_key_cmp(&conn->key, &ctx->key)
2171 && !conn_expired(conn, now)) {
2172 ctx->conn = conn;
2173 ctx->reply = false;
2174 break;
2175 }
2176 if (!conn_key_cmp(&conn->rev_key, &ctx->key)
2177 && !conn_expired(conn, now)) {
2178 ctx->conn = conn;
2179 ctx->reply = true;
2180 break;
2181 }
2182 }
2183 }
2184
2185 static enum ct_update_res
2186 conn_update(struct conn *conn, struct conntrack_bucket *ctb,
2187 struct dp_packet *pkt, bool reply, long long now)
2188 {
2189 return l4_protos[conn->key.nw_proto]->conn_update(conn, ctb, pkt,
2190 reply, now);
2191 }
2192
2193 static bool
2194 conn_expired(struct conn *conn, long long now)
2195 {
2196 if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
2197 return now >= conn->expiration;
2198 }
2199 return false;
2200 }
2201
2202 static bool
2203 valid_new(struct dp_packet *pkt, struct conn_key *key)
2204 {
2205 return l4_protos[key->nw_proto]->valid_new(pkt);
2206 }
2207
2208 static struct conn *
2209 new_conn(struct conntrack_bucket *ctb, struct dp_packet *pkt,
2210 struct conn_key *key, long long now)
2211 {
2212 struct conn *newconn;
2213
2214 newconn = l4_protos[key->nw_proto]->new_conn(ctb, pkt, now);
2215
2216 if (newconn) {
2217 newconn->key = *key;
2218 }
2219
2220 return newconn;
2221 }
2222
2223 static void
2224 delete_conn(struct conn *conn)
2225 {
2226 free(conn->nat_info);
2227 free(conn->alg);
2228 free(conn);
2229 }
2230 \f
2231 static void
2232 ct_endpoint_to_ct_dpif_inet_addr(const struct ct_addr *a,
2233 union ct_dpif_inet_addr *b,
2234 ovs_be16 dl_type)
2235 {
2236 if (dl_type == htons(ETH_TYPE_IP)) {
2237 b->ip = a->ipv4_aligned;
2238 } else if (dl_type == htons(ETH_TYPE_IPV6)){
2239 b->in6 = a->ipv6_aligned;
2240 }
2241 }
2242
2243 static void
2244 conn_key_to_tuple(const struct conn_key *key, struct ct_dpif_tuple *tuple)
2245 {
2246 if (key->dl_type == htons(ETH_TYPE_IP)) {
2247 tuple->l3_type = AF_INET;
2248 } else if (key->dl_type == htons(ETH_TYPE_IPV6)) {
2249 tuple->l3_type = AF_INET6;
2250 }
2251 tuple->ip_proto = key->nw_proto;
2252 ct_endpoint_to_ct_dpif_inet_addr(&key->src.addr, &tuple->src,
2253 key->dl_type);
2254 ct_endpoint_to_ct_dpif_inet_addr(&key->dst.addr, &tuple->dst,
2255 key->dl_type);
2256
2257 if (key->nw_proto == IPPROTO_ICMP || key->nw_proto == IPPROTO_ICMPV6) {
2258 tuple->icmp_id = key->src.icmp_id;
2259 tuple->icmp_type = key->src.icmp_type;
2260 tuple->icmp_code = key->src.icmp_code;
2261 } else {
2262 tuple->src_port = key->src.port;
2263 tuple->dst_port = key->dst.port;
2264 }
2265 }
2266
2267 static void
2268 conn_to_ct_dpif_entry(const struct conn *conn, struct ct_dpif_entry *entry,
2269 long long now, int bkt)
2270 {
2271 struct ct_l4_proto *class;
2272 long long expiration;
2273 memset(entry, 0, sizeof *entry);
2274 conn_key_to_tuple(&conn->key, &entry->tuple_orig);
2275 conn_key_to_tuple(&conn->rev_key, &entry->tuple_reply);
2276
2277 entry->zone = conn->key.zone;
2278 entry->mark = conn->mark;
2279
2280 memcpy(&entry->labels, &conn->label, sizeof entry->labels);
2281 /* Not implemented yet. */
2282 entry->timestamp.start = 0;
2283 entry->timestamp.stop = 0;
2284
2285 expiration = conn->expiration - now;
2286 entry->timeout = (expiration > 0) ? expiration / 1000 : 0;
2287
2288 class = l4_protos[conn->key.nw_proto];
2289 if (class->conn_get_protoinfo) {
2290 class->conn_get_protoinfo(conn, &entry->protoinfo);
2291 }
2292
2293 entry->bkt = bkt;
2294
2295 if (conn->alg) {
2296 /* Caller is responsible for freeing. */
2297 entry->helper.name = xstrdup(conn->alg);
2298 }
2299 }
2300
2301 int
2302 conntrack_dump_start(struct conntrack *ct, struct conntrack_dump *dump,
2303 const uint16_t *pzone, int *ptot_bkts)
2304 {
2305 memset(dump, 0, sizeof(*dump));
2306 if (pzone) {
2307 dump->zone = *pzone;
2308 dump->filter_zone = true;
2309 }
2310 dump->ct = ct;
2311
2312 *ptot_bkts = CONNTRACK_BUCKETS;
2313
2314 return 0;
2315 }
2316
2317 int
2318 conntrack_dump_next(struct conntrack_dump *dump, struct ct_dpif_entry *entry)
2319 {
2320 struct conntrack *ct = dump->ct;
2321 long long now = time_msec();
2322
2323 while (dump->bucket < CONNTRACK_BUCKETS) {
2324 struct hmap_node *node;
2325
2326 ct_lock_lock(&ct->buckets[dump->bucket].lock);
2327 for (;;) {
2328 struct conn *conn;
2329
2330 node = hmap_at_position(&ct->buckets[dump->bucket].connections,
2331 &dump->bucket_pos);
2332 if (!node) {
2333 break;
2334 }
2335 INIT_CONTAINER(conn, node, node);
2336 if ((!dump->filter_zone || conn->key.zone == dump->zone) &&
2337 (conn->conn_type != CT_CONN_TYPE_UN_NAT)) {
2338 conn_to_ct_dpif_entry(conn, entry, now, dump->bucket);
2339 break;
2340 }
2341 /* Else continue, until we find an entry in the appropriate zone
2342 * or the bucket has been scanned completely. */
2343 }
2344 ct_lock_unlock(&ct->buckets[dump->bucket].lock);
2345
2346 if (!node) {
2347 memset(&dump->bucket_pos, 0, sizeof dump->bucket_pos);
2348 dump->bucket++;
2349 } else {
2350 return 0;
2351 }
2352 }
2353 return EOF;
2354 }
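
/* Editor's sketch of driving the dump interface above, assuming
* ct_dpif_entry_uninit() from ct-dpif is used to free the xstrdup()'d
* helper name set by conn_to_ct_dpif_entry():
*
*     struct conntrack_dump dump;
*     struct ct_dpif_entry entry;
*     int tot_bkts;
*
*     conntrack_dump_start(ct, &dump, NULL, &tot_bkts);
*     while (!conntrack_dump_next(&dump, &entry)) {
*         ...consume 'entry'...
*         ct_dpif_entry_uninit(&entry);
*     }
*     conntrack_dump_done(&dump);
*/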
2355
2356 int
2357 conntrack_dump_done(struct conntrack_dump *dump OVS_UNUSED)
2358 {
2359 return 0;
2360 }
2361
2362 int
2363 conntrack_flush(struct conntrack *ct, const uint16_t *zone)
2364 {
2365 unsigned i;
2366
2367 for (i = 0; i < CONNTRACK_BUCKETS; i++) {
2368 struct conn *conn, *next;
2369
2370 ct_lock_lock(&ct->buckets[i].lock);
2371 HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
2372 if ((!zone || *zone == conn->key.zone) &&
2373 (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
2374 conn_clean(ct, conn, &ct->buckets[i]);
2375 }
2376 }
2377 ct_lock_unlock(&ct->buckets[i].lock);
2378 }
2379
2380 ct_rwlock_wrlock(&ct->resources_lock);
2381 struct alg_exp_node *alg_exp_node, *alg_exp_node_next;
2382 HMAP_FOR_EACH_SAFE (alg_exp_node, alg_exp_node_next,
2383 node, &ct->alg_expectations) {
2384 if (!zone || *zone == alg_exp_node->key.zone) {
2385 ovs_list_remove(&alg_exp_node->exp_node);
2386 hmap_remove(&ct->alg_expectations, &alg_exp_node->node);
2387 free(alg_exp_node);
2388 }
2389 }
2390 ct_rwlock_unlock(&ct->resources_lock);
2391 return 0;
2392 }
2393
2394 /* This function must be called with the ct->resources read lock taken. */
2395 static struct alg_exp_node *
2396 expectation_lookup(struct hmap *alg_expectations,
2397 const struct conn_key *key, uint32_t basis)
2398 {
2399 struct conn_key check_key = *key;
2400 check_key.src.port = ALG_WC_SRC_PORT;
2401 struct alg_exp_node *alg_exp_node;
2402
2403 uint32_t alg_exp_conn_key_hash = conn_key_hash(&check_key, basis);
2404 HMAP_FOR_EACH_WITH_HASH (alg_exp_node, node,
2405 alg_exp_conn_key_hash,
2406 alg_expectations) {
2407 if (!conn_key_cmp(&alg_exp_node->key, &check_key)) {
2408 return alg_exp_node;
2409 }
2410 }
2411 return NULL;
2412 }
2413
2414 static void
2415 expectation_create(struct conntrack *ct,
2416 ovs_be16 dst_port,
2417 const long long now,
2418 enum ct_alg_mode mode,
2419 const struct conn *master_conn)
2420 {
2421 struct ct_addr src_addr;
2422 struct ct_addr dst_addr;
2423 struct ct_addr alg_nat_repl_addr;
2424
2425 switch (mode) {
2426 case CT_FTP_MODE_ACTIVE:
2427 case CT_TFTP_MODE:
2428 src_addr = master_conn->rev_key.src.addr;
2429 dst_addr = master_conn->rev_key.dst.addr;
2430 alg_nat_repl_addr = master_conn->key.src.addr;
2431 break;
2432 case CT_FTP_MODE_PASSIVE:
2433 src_addr = master_conn->key.src.addr;
2434 dst_addr = master_conn->key.dst.addr;
2435 alg_nat_repl_addr = master_conn->rev_key.dst.addr;
2436 break;
2437 default:
2438 OVS_NOT_REACHED();
2439 }
2440
2441 struct alg_exp_node *alg_exp_node =
2442 xzalloc(sizeof *alg_exp_node);
2443 alg_exp_node->key.dl_type = master_conn->key.dl_type;
2444 alg_exp_node->key.nw_proto = master_conn->key.nw_proto;
2445 alg_exp_node->key.zone = master_conn->key.zone;
2446 alg_exp_node->key.src.addr = src_addr;
2447 alg_exp_node->key.dst.addr = dst_addr;
2448 alg_exp_node->key.src.port = ALG_WC_SRC_PORT;
2449 alg_exp_node->key.dst.port = dst_port;
2450 alg_exp_node->master_mark = master_conn->mark;
2451 alg_exp_node->master_label = master_conn->label;
2452 alg_exp_node->master_key = master_conn->key;
2453 alg_exp_node->passive_mode = mode == CT_FTP_MODE_PASSIVE;
2454 /* Take the write lock directly, rather than a read lock
2455 * first, because the lookup is almost certain to fail and
2456 * a new expectation will be inserted below. */
2457 ct_rwlock_wrlock(&ct->resources_lock);
2458 struct alg_exp_node *alg_exp = expectation_lookup(
2459 &ct->alg_expectations, &alg_exp_node->key, ct->hash_basis);
2460 if (alg_exp) {
2461 free(alg_exp_node);
2462 ct_rwlock_unlock(&ct->resources_lock);
2463 return;
2464 }
2465
2466 alg_exp_node->alg_nat_repl_addr = alg_nat_repl_addr;
2467 uint32_t alg_exp_conn_key_hash =
2468 conn_key_hash(&alg_exp_node->key,
2469 ct->hash_basis);
2470 hmap_insert(&ct->alg_expectations,
2471 &alg_exp_node->node,
2472 alg_exp_conn_key_hash);
2473
2474 alg_exp_init_expiration(ct, alg_exp_node, now);
2475 ct_rwlock_unlock(&ct->resources_lock);
2476 }
2477
2478 static uint8_t
2479 get_v4_byte_be(ovs_be32 v4_addr, uint8_t index)
2480 {
2481 uint8_t *byte_ptr = (OVS_FORCE uint8_t *) &v4_addr;
2482 return byte_ptr[index];
2483 }
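
/* Editor's note: 'v4_addr' is in network byte order, so index 0 is the
* most significant byte.  For 10.1.2.3:
*
*     get_v4_byte_be(htonl(0x0a010203), 0) == 10
*     get_v4_byte_be(htonl(0x0a010203), 3) == 3
*/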
2484
2485 static void
2486 replace_substring(char *substr, uint8_t substr_size,
2487 uint8_t total_size, char *rep_str,
2488 uint8_t rep_str_size)
2489 {
2490 memmove(substr + rep_str_size, substr + substr_size,
2491 total_size - substr_size);
2492 memcpy(substr, rep_str, rep_str_size);
2493 }
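
/* Editor's note: in replace_substring(), 'total_size' counts the bytes from
* 'substr' through the end of the data, so 'total_size - substr_size' bytes
* of tail are preserved.  A sketch, replacing the first octet "1" with "10"
* (the buffer must be able to absorb the growth):
*
*     char buf[8] = "1,2,3";
*     replace_substring(buf, 1, 6, "10", 2);    becomes "10,2,3"
*/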
2494
2495 /* Replace IPV4 address in FTP message with NATed address. */
2496 static int
2497 repl_ftp_v4_addr(struct dp_packet *pkt, ovs_be32 v4_addr_rep,
2498 char *ftp_data_start,
2499 size_t addr_offset_from_ftp_data_start)
2500 {
2501 enum { MAX_FTP_V4_NAT_DELTA = 8 };
2502
2503 /* Do a conservative check for pathological MTU usage. */
2504 uint32_t orig_used_size = dp_packet_size(pkt);
2505 uint16_t allocated_size = dp_packet_get_allocated(pkt);
2506 if (orig_used_size + MAX_FTP_V4_NAT_DELTA > allocated_size) {
2507 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
2508 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP",
2509 allocated_size);
2510 return 0;
2511 }
2512
2513 size_t remain_size = tcp_payload_length(pkt) -
2514 addr_offset_from_ftp_data_start;
2515
2516 int overall_delta = 0;
2517 char *byte_str = ftp_data_start + addr_offset_from_ftp_data_start;
2518
2519 /* Replace the existing IPv4 address with the new one. */
2520 for (uint8_t i = 0; i < 4; i++) {
2521 /* Find the end of the string for this octet. */
2522 char *next_delim = memchr(byte_str, ',', 4);
2523 ovs_assert(next_delim);
2524 int substr_size = next_delim - byte_str;
2525 remain_size -= substr_size;
2526
2527 /* Compose the new string for this octet, and replace it. */
2528 char rep_str[4];
2529 uint8_t rep_byte = get_v4_byte_be(v4_addr_rep, i);
2530 int replace_size = sprintf(rep_str, "%d", rep_byte);
2531 replace_substring(byte_str, substr_size, remain_size,
2532 rep_str, replace_size);
2533 overall_delta += replace_size - substr_size;
2534
2535 /* Advance past the octet and the following comma. */
2536 byte_str += replace_size + 1;
2537 }
2538
2539 dp_packet_set_size(pkt, orig_used_size + overall_delta);
2540 return overall_delta;
2541 }
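
/* Editor's note: a worked example for repl_ftp_v4_addr().  With a
* (hypothetical) NATed address of 192.168.1.5, the payload
*
*     "PORT 10,0,0,1,78,21"    becomes    "PORT 192,168,1,5,78,21"
*
* rewritten octet by octet, and the return value is +3: the growth in
* payload length, which the caller records as a TCP sequence skew. */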
2542
2543 static char *
2544 skip_non_digits(char *str)
2545 {
2546 while (!isdigit(*str) && *str != 0) {
2547 str++;
2548 }
2549 return str;
2550 }
2551
2552 static char *
2553 terminate_number_str(char *str, uint8_t max_digits)
2554 {
2555 uint8_t digits_found = 0;
2556 while (isdigit(*str) && digits_found <= max_digits) {
2557 str++;
2558 digits_found++;
2559 }
2560
2561 *str = 0;
2562 return str;
2563 }
2564
2565
2566 static void
2567 get_ftp_ctl_msg(struct dp_packet *pkt, char *ftp_msg)
2568 {
2569 struct tcp_header *th = dp_packet_l4(pkt);
2570 char *tcp_hdr = (char *) th;
2571 uint32_t tcp_payload_len = tcp_payload_length(pkt);
2572 size_t tcp_payload_of_interest = MIN(tcp_payload_len,
2573 LARGEST_FTP_MSG_OF_INTEREST);
2574 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2575
2576 ovs_strlcpy(ftp_msg, tcp_hdr + tcp_hdr_len,
2577 tcp_payload_of_interest);
2578 }
2579
2580 static enum ftp_ctl_pkt
2581 detect_ftp_ctl_type(const struct conn_lookup_ctx *ctx,
2582 struct dp_packet *pkt)
2583 {
2584
2585 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2586 get_ftp_ctl_msg(pkt, ftp_msg);
2587 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
2588 if (strncasecmp(ftp_msg, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD)) &&
2589 !strcasestr(ftp_msg, FTP_EPSV_REPLY)) {
2590 return CT_FTP_CTL_OTHER;
2591 }
2592 } else {
2593 if (strncasecmp(ftp_msg, FTP_PORT_CMD, strlen(FTP_PORT_CMD)) &&
2594 strncasecmp(ftp_msg, FTP_PASV_REPLY_CODE,
2595 strlen(FTP_PASV_REPLY_CODE))) {
2596 return CT_FTP_CTL_OTHER;
2597 }
2598 }
2599
2600 return CT_FTP_CTL_INTEREST;
2601 }
2602
2603 static enum ftp_ctl_pkt
2604 process_ftp_ctl_v4(struct conntrack *ct,
2605 struct dp_packet *pkt,
2606 const struct conn *conn_for_expectation,
2607 long long now, ovs_be32 *v4_addr_rep,
2608 char **ftp_data_v4_start,
2609 size_t *addr_offset_from_ftp_data_start)
2610 {
2611 struct tcp_header *th = dp_packet_l4(pkt);
2612 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2613 char *tcp_hdr = (char *) th;
2614 *ftp_data_v4_start = tcp_hdr + tcp_hdr_len;
2615 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2616 get_ftp_ctl_msg(pkt, ftp_msg);
2617
2618 char *ftp = ftp_msg;
2619 enum ct_alg_mode mode;
2620 if (!strncasecmp(ftp_msg, FTP_PORT_CMD, strlen(FTP_PORT_CMD))) {
2621 ftp = ftp_msg + strlen(FTP_PORT_CMD);
2622 mode = CT_FTP_MODE_ACTIVE;
2623 } else {
2624 ftp = ftp_msg + strlen(FTP_PASV_REPLY_CODE);
2625 mode = CT_FTP_MODE_PASSIVE;
2626 }
2627
2628 /* Find first space. */
2629 ftp = strchr(ftp, ' ');
2630 if (!ftp) {
2631 return CT_FTP_CTL_INVALID;
2632 }
2633
2634 /* Find the first digit, after space. */
2635 ftp = skip_non_digits(ftp);
2636 if (*ftp == 0) {
2637 return CT_FTP_CTL_INVALID;
2638 }
2639
2640 char *ip_addr_start = ftp;
2641 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
2642 uint8_t comma_count = 0;
2643
2644 while (comma_count < 4 && *ftp) {
2645 if (*ftp == ',') {
2646 comma_count++;
2647 if (comma_count == 4) {
2648 *ftp = 0;
2649 } else {
2650 *ftp = '.';
2651 }
2652 }
2653 ftp++;
2654 }
2655 if (comma_count != 4) {
2656 return CT_FTP_CTL_INVALID;
2657 }
2658
2659 struct in_addr ip_addr;
2660 int rc2 = inet_pton(AF_INET, ip_addr_start, &ip_addr);
2661 if (rc2 != 1) {
2662 return CT_FTP_CTL_INVALID;
2663 }
2664
2665 char *save_ftp = ftp;
2666 ftp = terminate_number_str(ftp, MAX_FTP_PORT_DGTS);
2667 if (!ftp) {
2668 return CT_FTP_CTL_INVALID;
2669 }
2670 int value;
2671 if (!str_to_int(save_ftp, 10, &value)) {
2672 return CT_FTP_CTL_INVALID;
2673 }
2674
2675 /* This follows from the L4 port maximum of 65535: each encoded byte is at most 255. */
2676 if (value > 255) {
2677 return CT_FTP_CTL_INVALID;
2678 }
2679
2680 uint16_t port_hs = value;
2681 port_hs <<= 8;
2682
2683 /* Skip over comma. */
2684 ftp++;
2685 save_ftp = ftp;
2686 bool digit_found = false;
2687 while (isdigit(*ftp)) {
2688 ftp++;
2689 digit_found = true;
2690 }
2691 if (!digit_found) {
2692 return CT_FTP_CTL_INVALID;
2693 }
2694 *ftp = 0;
2695 if (!str_to_int(save_ftp, 10, &value)) {
2696 return CT_FTP_CTL_INVALID;
2697 }
2698
2699 if (value > 255) {
2700 return CT_FTP_CTL_INVALID;
2701 }
2702
2703 uint16_t port_lo_hs = value;
2704 if (65535 - port_hs < port_lo_hs) {
2705 return CT_FTP_CTL_INVALID;
2706 }
2707 port_hs |= port_lo_hs;
2708 ovs_be16 port = htons(port_hs);
2709 ovs_be32 conn_ipv4_addr;
2710
2711 switch (mode) {
2712 case CT_FTP_MODE_ACTIVE:
2713 *v4_addr_rep = conn_for_expectation->rev_key.dst.addr.ipv4_aligned;
2714 conn_ipv4_addr = conn_for_expectation->key.src.addr.ipv4_aligned;
2715 break;
2716 case CT_FTP_MODE_PASSIVE:
2717 *v4_addr_rep = conn_for_expectation->key.dst.addr.ipv4_aligned;
2718 conn_ipv4_addr = conn_for_expectation->rev_key.src.addr.ipv4_aligned;
2719 break;
2720 case CT_TFTP_MODE:
2721 default:
2722 OVS_NOT_REACHED();
2723 }
2724
2725 ovs_be32 ftp_ipv4_addr;
2726 ftp_ipv4_addr = ip_addr.s_addr;
2727 /* Although most servers will block this exploit, there may be some
2728 * that are less well managed. */
2729 if (ftp_ipv4_addr != conn_ipv4_addr && ftp_ipv4_addr != *v4_addr_rep) {
2730 return CT_FTP_CTL_INVALID;
2731 }
2732
2733 expectation_create(ct, port, now, mode, conn_for_expectation);
2734 return CT_FTP_CTL_INTEREST;
2735 }
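
/* Editor's note: a worked example of the parsing above.  For the command
* "PORT 192,168,0,10,78,21", the first four numbers form the address
* 192.168.0.10 and the last two form the data port:
*
*     port = (78 << 8) | 21 = 19989
*
* Each of the two numbers must be <= 255, and the advertised address must
* match one side of the control connection, or the packet is rejected as
* CT_FTP_CTL_INVALID. */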
2736
2737 static char *
2738 skip_ipv6_digits(char *str)
2739 {
2740 while (isxdigit(*str) || *str == ':' || *str == '.') {
2741 str++;
2742 }
2743 return str;
2744 }
2745
2746 static enum ftp_ctl_pkt
2747 process_ftp_ctl_v6(struct conntrack *ct,
2748 struct dp_packet *pkt,
2749 const struct conn *conn_for_expectation,
2750 long long now,
2751 struct ct_addr *v6_addr_rep,
2752 char **ftp_data_start,
2753 size_t *addr_offset_from_ftp_data_start,
2754 size_t *addr_size, enum ct_alg_mode *mode)
2755 {
2756 struct tcp_header *th = dp_packet_l4(pkt);
2757 size_t tcp_hdr_len = TCP_OFFSET(th->tcp_ctl) * 4;
2758 char *tcp_hdr = (char *) th;
2759 char ftp_msg[LARGEST_FTP_MSG_OF_INTEREST + 1] = {0};
2760
2761 get_ftp_ctl_msg(pkt, ftp_msg);
2762 *ftp_data_start = tcp_hdr + tcp_hdr_len;
2763
2764 char *ftp = ftp_msg;
2765 struct in6_addr ip6_addr;
2766 if (!strncasecmp(ftp_msg, FTP_EPRT_CMD, strlen(FTP_EPRT_CMD))) {
2767 ftp = ftp_msg + strlen(FTP_EPRT_CMD);
2768 ftp = skip_non_digits(ftp);
2769 if (*ftp != FTP_AF_V6 || isdigit(ftp[1])) {
2770 return CT_FTP_CTL_INVALID;
2771 }
2772 /* Jump over delimiter. */
2773 ftp += 2;
2774
2775 char *ip_addr_start = ftp;
2776 memset(&ip6_addr, 0, sizeof ip6_addr);
2777 *addr_offset_from_ftp_data_start = ip_addr_start - ftp_msg;
2778 ftp = skip_ipv6_digits(ftp);
2779 *ftp = 0;
2780 *addr_size = ftp - ip_addr_start;
2781 int rc2 = inet_pton(AF_INET6, ip_addr_start, &ip6_addr);
2782 if (rc2 != 1) {
2783 return CT_FTP_CTL_INVALID;
2784 }
2785 ftp++;
2786 *mode = CT_FTP_MODE_ACTIVE;
2787 } else {
2788 ftp = ftp_msg + strcspn(ftp_msg, "(");
2789 ftp = skip_non_digits(ftp);
2790 if (!isdigit(*ftp)) {
2791 return CT_FTP_CTL_INVALID;
2792 }
2793
2794 /* Not used for passive mode. */
2795 *addr_offset_from_ftp_data_start = 0;
2796 *addr_size = 0;
2797
2798 *mode = CT_FTP_MODE_PASSIVE;
2799 }
2800
2801 char *save_ftp = ftp;
2802 ftp = terminate_number_str(ftp, MAX_EXT_FTP_PORT_DGTS);
2803 if (!ftp) {
2804 return CT_FTP_CTL_INVALID;
2805 }
2806 int value;
2807 if (!str_to_int(save_ftp, 10, &value)) {
2808 return CT_FTP_CTL_INVALID;
2809 }
2810 if (value > CT_MAX_L4_PORT) {
2811 return CT_FTP_CTL_INVALID;
2812 }
2813
2814 uint16_t port_hs = value;
2815 ovs_be16 port = htons(port_hs);
2816
2817 switch (*mode) {
2818 case CT_FTP_MODE_ACTIVE:
2819 *v6_addr_rep = conn_for_expectation->rev_key.dst.addr;
2820 /* Although most servers will block this exploit, there may be some
2821 * that are less well managed. */
2822 if (memcmp(&ip6_addr, &v6_addr_rep->ipv6_aligned, sizeof ip6_addr) &&
2823 memcmp(&ip6_addr, &conn_for_expectation->key.src.addr.ipv6_aligned,
2824 sizeof ip6_addr)) {
2825 return CT_FTP_CTL_INVALID;
2826 }
2827 break;
2828 case CT_FTP_MODE_PASSIVE:
2829 *v6_addr_rep = conn_for_expectation->key.dst.addr;
2830 break;
2831 case CT_TFTP_MODE:
2832 default:
2833 OVS_NOT_REACHED();
2834 }
2835
2836 expectation_create(ct, port, now, *mode, conn_for_expectation);
2837 return CT_FTP_CTL_INTEREST;
2838 }
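
/* Editor's note: examples of the two message forms parsed above:
*
*     "EPRT |2|2001:db8::2|20001|"                       (active)
*     "229 Entering Extended Passive Mode (|||20001|)"   (passive)
*
* Both create an expectation for a data connection on the extracted port;
* in passive mode the address comes from the control connection's own key
* rather than from the message text. */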
2839
2840 static int
2841 repl_ftp_v6_addr(struct dp_packet *pkt, struct ct_addr v6_addr_rep,
2842 char *ftp_data_start,
2843 size_t addr_offset_from_ftp_data_start,
2844 size_t addr_size, enum ct_alg_mode mode)
2845 {
2846 /* This is slightly larger than the largest delta actually possible. */
2847 enum { MAX_FTP_V6_NAT_DELTA = 45 };
2848
2849 if (mode == CT_FTP_MODE_PASSIVE) {
2850 return 0;
2851 }
2852
2853 /* Do a conservative check for pathological MTU usage. */
2854 uint32_t orig_used_size = dp_packet_size(pkt);
2855 uint16_t allocated_size = dp_packet_get_allocated(pkt);
2856 if (orig_used_size + MAX_FTP_V6_NAT_DELTA > allocated_size) {
2857 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
2858 VLOG_WARN_RL(&rl, "Unsupported effective MTU %u used with FTP",
2859 allocated_size);
2860 return 0;
2861 }
2862
2863 const char *rc;
2864 char v6_addr_str[IPV6_SCAN_LEN] = {0};
2865 rc = inet_ntop(AF_INET6, &v6_addr_rep.ipv6_aligned, v6_addr_str,
2866 IPV6_SCAN_LEN - 1);
2867 ovs_assert(rc != NULL);
2868
2869 size_t replace_addr_size = strlen(v6_addr_str);
2870
2871 size_t remain_size = tcp_payload_length(pkt) -
2872 addr_offset_from_ftp_data_start;
2873
2874 char *pkt_addr_str = ftp_data_start + addr_offset_from_ftp_data_start;
2875 replace_substring(pkt_addr_str, addr_size, remain_size,
2876 v6_addr_str, replace_addr_size);
2877
2878 int overall_delta = (int) replace_addr_size - (int) addr_size;
2879
2880 dp_packet_set_size(pkt, orig_used_size + overall_delta);
2881 return overall_delta;
2882 }
2883
2884 static void
2885 handle_ftp_ctl(struct conntrack *ct, const struct conn_lookup_ctx *ctx,
2886 struct dp_packet *pkt,
2887 const struct conn *conn_for_expectation,
2888 long long now, enum ftp_ctl_pkt ftp_ctl, bool nat)
2889 {
2890 struct ip_header *l3_hdr = dp_packet_l3(pkt);
2891 ovs_be32 v4_addr_rep = 0;
2892 struct ct_addr v6_addr_rep;
2893 size_t addr_offset_from_ftp_data_start;
2894 size_t addr_size = 0;
2895 char *ftp_data_start;
2896 bool do_seq_skew_adj = true;
2897 enum ct_alg_mode mode = CT_FTP_MODE_ACTIVE;
2898
2899 if (detect_ftp_ctl_type(ctx, pkt) != ftp_ctl) {
2900 return;
2901 }
2902
2903 if (!nat || !conn_for_expectation->seq_skew) {
2904 do_seq_skew_adj = false;
2905 }
2906
2907 struct ovs_16aligned_ip6_hdr *nh6 = dp_packet_l3(pkt);
2908 int64_t seq_skew = 0;
2909 bool seq_skew_dir;
2910 if (ftp_ctl == CT_FTP_CTL_OTHER) {
2911 seq_skew = conn_for_expectation->seq_skew;
2912 seq_skew_dir = conn_for_expectation->seq_skew_dir;
2913 } else if (ftp_ctl == CT_FTP_CTL_INTEREST) {
2914 enum ftp_ctl_pkt rc;
2915 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
2916 rc = process_ftp_ctl_v6(ct, pkt, conn_for_expectation,
2917 now, &v6_addr_rep, &ftp_data_start,
2918 &addr_offset_from_ftp_data_start,
2919 &addr_size, &mode);
2920 } else {
2921 rc = process_ftp_ctl_v4(ct, pkt, conn_for_expectation,
2922 now, &v4_addr_rep, &ftp_data_start,
2923 &addr_offset_from_ftp_data_start);
2924 }
2925 if (rc == CT_FTP_CTL_INVALID) {
2926 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
2927 VLOG_WARN_RL(&rl, "Invalid FTP control packet format");
2928 pkt->md.ct_state |= CS_TRACKED | CS_INVALID;
2929 return;
2930 } else if (rc == CT_FTP_CTL_INTEREST) {
2931 uint16_t ip_len;
2932 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
2933 seq_skew = repl_ftp_v6_addr(pkt, v6_addr_rep, ftp_data_start,
2934 addr_offset_from_ftp_data_start,
2935 addr_size, mode);
2936 seq_skew_dir = ctx->reply;
2937 if (seq_skew) {
2938 ip_len = ntohs(nh6->ip6_ctlun.ip6_un1.ip6_un1_plen);
2939 ip_len += seq_skew;
2940 nh6->ip6_ctlun.ip6_un1.ip6_un1_plen = htons(ip_len);
2941 conn_seq_skew_set(ct, &conn_for_expectation->key, now,
2942 seq_skew, seq_skew_dir);
2943 }
2944 } else {
2945 seq_skew = repl_ftp_v4_addr(pkt, v4_addr_rep, ftp_data_start,
2946 addr_offset_from_ftp_data_start);
2947 seq_skew_dir = ctx->reply;
2948 ip_len = ntohs(l3_hdr->ip_tot_len);
2949 if (seq_skew) {
2950 ip_len += seq_skew;
2951 l3_hdr->ip_csum = recalc_csum16(l3_hdr->ip_csum,
2952 l3_hdr->ip_tot_len, htons(ip_len));
2953 l3_hdr->ip_tot_len = htons(ip_len);
2954 conn_seq_skew_set(ct, &conn_for_expectation->key, now,
2955 seq_skew, seq_skew_dir);
2956 }
2957 }
2958 } else {
2959 OVS_NOT_REACHED();
2960 }
2961 } else {
2962 OVS_NOT_REACHED();
2963 }
2964
2965 struct tcp_header *th = dp_packet_l4(pkt);
2966 if (do_seq_skew_adj && seq_skew != 0) {
2967 if (ctx->reply != conn_for_expectation->seq_skew_dir) {
2968
2969 uint32_t tcp_ack = ntohl(get_16aligned_be32(&th->tcp_ack));
2970
2971 if ((seq_skew > 0) && (tcp_ack < seq_skew)) {
2972 /* Should not be possible; will be marked invalid. */
2973 tcp_ack = 0;
2974 } else if ((seq_skew < 0) && (UINT32_MAX - tcp_ack < -seq_skew)) {
2975 tcp_ack = (-seq_skew) - (UINT32_MAX - tcp_ack);
2976 } else {
2977 tcp_ack -= seq_skew;
2978 }
2979 ovs_be32 new_tcp_ack = htonl(tcp_ack);
2980 put_16aligned_be32(&th->tcp_ack, new_tcp_ack);
2981 } else {
2982 uint32_t tcp_seq = ntohl(get_16aligned_be32(&th->tcp_seq));
2983 if ((seq_skew > 0) && (UINT32_MAX - tcp_seq < seq_skew)) {
2984 tcp_seq = seq_skew - (UINT32_MAX - tcp_seq);
2985 } else if ((seq_skew < 0) && (tcp_seq < -seq_skew)) {
2986 /* Should not be possible; will be marked invalid. */
2987 tcp_seq = 0;
2988 } else {
2989 tcp_seq += seq_skew;
2990 }
2991 ovs_be32 new_tcp_seq = htonl(tcp_seq);
2992 put_16aligned_be32(&th->tcp_seq, new_tcp_seq);
2993 }
2994 }
2995
2996 const char *tail = dp_packet_tail(pkt);
2997 uint8_t pad = dp_packet_l2_pad_size(pkt);
2998 th->tcp_csum = 0;
2999 uint32_t tcp_csum;
3000 if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
3001 tcp_csum = packet_csum_pseudoheader6(nh6);
3002 } else {
3003 tcp_csum = packet_csum_pseudoheader(l3_hdr);
3004 }
3005 th->tcp_csum = csum_finish(
3006 csum_continue(tcp_csum, th, tail - (char *) th - pad));
3007 return;
3008 }
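
/* Editor's note: a small worked example of the sequence-skew logic above.
* If an address rewrite grew a control message by 3 bytes in a given
* direction, 'seq_skew' is +3 with 'seq_skew_dir' recording that
* direction.  Later packets are then fixed up as:
*
*     same direction:    tcp_seq += 3
*     reply direction:   tcp_ack -= 3
*
* so both peers keep seeing sequence numbers consistent with the resized
* payloads. */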
3009
3010 static void
3011 handle_tftp_ctl(struct conntrack *ct,
3012 const struct conn *conn_for_expectation,
3013 long long now)
3014 {
3015 expectation_create(ct, conn_for_expectation->key.src.port, now,
3016 CT_TFTP_MODE, conn_for_expectation);
3017 return;
3018 }