]> git.proxmox.com Git - mirror_ovs.git/blame - datapath/conntrack.c
datapath: Refactor labels initialization.
[mirror_ovs.git] / datapath / conntrack.c
CommitLineData
a94ebc39
JS
1/*
2 * Copyright (c) 2015 Nicira, Inc.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13
14#include <linux/kconfig.h>
15#include <linux/version.h>
16
8063e095 17#if IS_ENABLED(CONFIG_NF_CONNTRACK)
a94ebc39
JS
18
19#include <linux/module.h>
20#include <linux/openvswitch.h>
f8f97cdc
JR
21#include <linux/tcp.h>
22#include <linux/udp.h>
23#include <linux/sctp.h>
a94ebc39
JS
24#include <net/ip.h>
25#include <net/netfilter/nf_conntrack_core.h>
11251c17 26#include <net/netfilter/nf_conntrack_helper.h>
038e34ab 27#include <net/netfilter/nf_conntrack_labels.h>
f8f97cdc 28#include <net/netfilter/nf_conntrack_seqadj.h>
a94ebc39
JS
29#include <net/netfilter/nf_conntrack_zones.h>
30#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
31
f8f97cdc
JR
32#ifdef CONFIG_NF_NAT_NEEDED
33#include <linux/netfilter/nf_nat.h>
34#include <net/netfilter/nf_nat_core.h>
35#include <net/netfilter/nf_nat_l3proto.h>
36#endif
37
a94ebc39
JS
38#include "datapath.h"
39#include "conntrack.h"
40#include "flow.h"
41#include "flow_netlink.h"
86c2eb45 42#include "gso.h"
a94ebc39
JS
43
/* A pair of length bounds.  No user of this structure appears in this part
 * of the file, so the exact interpretation of the bounds is left to the
 * code that consumes it.
 */
struct ovs_ct_len_tbl {
	int maxlen;	/* Upper length bound. */
	int minlen;	/* Lower length bound. */
};
48
372ce973
JS
49/* Metadata mark for masked write to conntrack mark */
50struct md_mark {
51 u32 value;
52 u32 mask;
53};
54
038e34ab 55/* Metadata label for masked write to conntrack label. */
c05e2094
JS
56struct md_labels {
57 struct ovs_key_ct_labels value;
58 struct ovs_key_ct_labels mask;
038e34ab
JS
59};
60
f8f97cdc
JR
/* NAT request flags; stored in the 3-bit 'nat' field of
 * struct ovs_conntrack_info.
 */
enum ovs_ct_nat {
	/* NAT for committed connections only. */
	OVS_CT_NAT = 0x1,
	/* Source NAT for NEW connections. */
	OVS_CT_SRC_NAT = 0x2,
	/* Destination NAT for NEW connections. */
	OVS_CT_DST_NAT = 0x4,
};
66
a94ebc39
JS
67/* Conntrack action context for execution. */
68struct ovs_conntrack_info {
11251c17 69 struct nf_conntrack_helper *helper;
a94ebc39
JS
70 struct nf_conntrack_zone zone;
71 struct nf_conn *ct;
c05e2094 72 u8 commit : 1;
f8f97cdc 73 u8 nat : 3; /* enum ovs_ct_nat */
9f1de150 74 u8 random_fully_compat : 1; /* bool */
a94ebc39 75 u16 family;
372ce973 76 struct md_mark mark;
c05e2094 77 struct md_labels labels;
f8f97cdc
JR
78#ifdef CONFIG_NF_NAT_NEEDED
79 struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */
80#endif
a94ebc39
JS
81};
82
11251c17
JS
83static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
84
a94ebc39
JS
85static u16 key_to_nfproto(const struct sw_flow_key *key)
86{
87 switch (ntohs(key->eth.type)) {
88 case ETH_P_IP:
89 return NFPROTO_IPV4;
90 case ETH_P_IPV6:
91 return NFPROTO_IPV6;
92 default:
93 return NFPROTO_UNSPEC;
94 }
95}
96
97/* Map SKB connection state into the values used by flow definition. */
98static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
99{
100 u8 ct_state = OVS_CS_F_TRACKED;
101
102 switch (ctinfo) {
103 case IP_CT_ESTABLISHED_REPLY:
104 case IP_CT_RELATED_REPLY:
a94ebc39
JS
105 ct_state |= OVS_CS_F_REPLY_DIR;
106 break;
107 default:
108 break;
109 }
110
111 switch (ctinfo) {
112 case IP_CT_ESTABLISHED:
113 case IP_CT_ESTABLISHED_REPLY:
114 ct_state |= OVS_CS_F_ESTABLISHED;
115 break;
116 case IP_CT_RELATED:
117 case IP_CT_RELATED_REPLY:
118 ct_state |= OVS_CS_F_RELATED;
119 break;
120 case IP_CT_NEW:
a94ebc39
JS
121 ct_state |= OVS_CS_F_NEW;
122 break;
123 default:
124 break;
125 }
126
127 return ct_state;
128}
129
c05e2094
JS
130static u32 ovs_ct_get_mark(const struct nf_conn *ct)
131{
132#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
133 return ct ? ct->mark : 0;
134#else
135 return 0;
136#endif
137}
138
237f9413
JR
139/* Guard against conntrack labels max size shrinking below 128 bits. */
140#if NF_CT_LABELS_MAX_SIZE < 16
141#error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes
142#endif
143
c05e2094
JS
144static void ovs_ct_get_labels(const struct nf_conn *ct,
145 struct ovs_key_ct_labels *labels)
038e34ab
JS
146{
147 struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;
148
237f9413
JR
149 if (cl)
150 memcpy(labels, cl->bits, OVS_CT_LABELS_LEN);
151 else
c05e2094 152 memset(labels, 0, OVS_CT_LABELS_LEN);
038e34ab
JS
153}
154
a94ebc39 155static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
372ce973
JS
156 const struct nf_conntrack_zone *zone,
157 const struct nf_conn *ct)
a94ebc39
JS
158{
159 key->ct.state = state;
160 key->ct.zone = zone->id;
c05e2094
JS
161 key->ct.mark = ovs_ct_get_mark(ct);
162 ovs_ct_get_labels(ct, &key->ct.labels);
a94ebc39
JS
163}
164
e3c42eb8 165/* Update 'key' based on skb->_nfct. If 'post_ct' is true, then OVS has
f8f97cdc
JR
166 * previously sent the packet to conntrack via the ct action. If
167 * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
168 * initialized from the connection status.
a94ebc39
JS
169 */
170static void ovs_ct_update_key(const struct sk_buff *skb,
f23593a1 171 const struct ovs_conntrack_info *info,
f8f97cdc
JR
172 struct sw_flow_key *key, bool post_ct,
173 bool keep_nat_flags)
a94ebc39
JS
174{
175 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
176 enum ip_conntrack_info ctinfo;
177 struct nf_conn *ct;
178 u8 state = 0;
179
180 ct = nf_ct_get(skb, &ctinfo);
181 if (ct) {
182 state = ovs_ct_get_state(ctinfo);
b0f251cd 183 /* All unconfirmed entries are NEW connections. */
c05e2094
JS
184 if (!nf_ct_is_confirmed(ct))
185 state |= OVS_CS_F_NEW;
b0f251cd
JR
186 /* OVS persists the related flag for the duration of the
187 * connection.
188 */
a94ebc39
JS
189 if (ct->master)
190 state |= OVS_CS_F_RELATED;
f8f97cdc
JR
191 if (keep_nat_flags) {
192 state |= key->ct.state & OVS_CS_F_NAT_MASK;
193 } else {
194 if (ct->status & IPS_SRC_NAT)
195 state |= OVS_CS_F_SRC_NAT;
196 if (ct->status & IPS_DST_NAT)
197 state |= OVS_CS_F_DST_NAT;
198 }
a94ebc39
JS
199 zone = nf_ct_zone(ct);
200 } else if (post_ct) {
201 state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
f23593a1
JS
202 if (info)
203 zone = &info->zone;
a94ebc39 204 }
372ce973 205 __ovs_ct_update_key(key, state, zone, ct);
a94ebc39
JS
206}
207
b0f251cd
JR
208/* This is called to initialize CT key fields possibly coming in from the local
209 * stack.
210 */
a94ebc39
JS
211void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
212{
f8f97cdc 213 ovs_ct_update_key(skb, NULL, key, false, false);
a94ebc39
JS
214}
215
216int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
217{
c05e2094 218 if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state))
a94ebc39
JS
219 return -EMSGSIZE;
220
221 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
222 nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone))
223 return -EMSGSIZE;
224
372ce973
JS
225 if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
226 nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark))
227 return -EMSGSIZE;
228
c05e2094
JS
229 if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
230 nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels),
231 &key->ct.labels))
038e34ab
JS
232 return -EMSGSIZE;
233
372ce973
JS
234 return 0;
235}
236
efd40994 237static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key,
372ce973
JS
238 u32 ct_mark, u32 mask)
239{
c05e2094 240#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
372ce973
JS
241 u32 new_mark;
242
372ce973
JS
243 new_mark = ct_mark | (ct->mark & ~(mask));
244 if (ct->mark != new_mark) {
245 ct->mark = new_mark;
1b22e621
JR
246 if (nf_ct_is_confirmed(ct))
247 nf_conntrack_event_cache(IPCT_MARK, ct);
372ce973
JS
248 key->ct.mark = new_mark;
249 }
250
a94ebc39 251 return 0;
c05e2094
JS
252#else
253 return -ENOTSUPP;
254#endif
a94ebc39
JS
255}
256
/* Return the labels extension of 'ct', attempting to add one first if the
 * connection does not have it yet.  Returns NULL if the extension is still
 * missing after that attempt.
 */
static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct)
{
	struct nf_conn_labels *cl = nf_ct_labels_find(ct);

	if (cl)
		return cl;

	nf_ct_labels_ext_add(ct);
	return nf_ct_labels_find(ct);
}
269
270/* Initialize labels for a new, yet to be committed conntrack entry. Note that
271 * since the new connection is not yet confirmed, and thus no-one else has
272 * access to it's labels, we simply write them over.
273 */
274static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key,
275 const struct ovs_key_ct_labels *labels,
276 const struct ovs_key_ct_labels *mask)
277{
278 struct nf_conn_labels *cl;
279 u32 *dst;
280 int i;
281
282 cl = ovs_ct_get_conn_labels(ct);
283 if (!cl)
038e34ab
JS
284 return -ENOSPC;
285
efd40994
JR
286 dst = (u32 *)cl->bits;
287 for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
288 dst[i] = (dst[i] & ~mask->ct_labels_32[i]) |
289 (labels->ct_labels_32[i] & mask->ct_labels_32[i]);
1b22e621 290
efd40994
JR
291 /* Labels are included in the IPCTNL_MSG_CT_NEW event only if the
292 * IPCT_LABEL bit it set in the event cache.
293 */
294 nf_conntrack_event_cache(IPCT_LABEL, ct);
1b22e621 295
efd40994
JR
296 memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
297
298 return 0;
299}
300
301static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
302 const struct ovs_key_ct_labels *labels,
303 const struct ovs_key_ct_labels *mask)
304{
305 struct nf_conn_labels *cl;
306 int err;
307
308 cl = ovs_ct_get_conn_labels(ct);
309 if (!cl)
310 return -ENOSPC;
311
312 err = nf_connlabels_replace(ct, labels->ct_labels_32,
313 mask->ct_labels_32,
314 OVS_CT_LABELS_LEN_32);
315 if (err)
316 return err;
317
318 memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);
038e34ab 319
038e34ab
JS
320 return 0;
321}
322
11251c17
JS
323/* 'skb' should already be pulled to nh_ofs. */
324static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
325{
326 const struct nf_conntrack_helper *helper;
327 const struct nf_conn_help *help;
328 enum ip_conntrack_info ctinfo;
329 unsigned int protoff;
330 struct nf_conn *ct;
4cc85f28 331 u8 nexthdr;
f8f97cdc 332 int err;
11251c17 333
3d47fa46
JH
334#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0)
335 bool dst_set = false;
336 struct rtable rt = { .rt_flags = 0 };
337#endif
338
11251c17
JS
339 ct = nf_ct_get(skb, &ctinfo);
340 if (!ct || ctinfo == IP_CT_RELATED_REPLY)
341 return NF_ACCEPT;
342
343 help = nfct_help(ct);
344 if (!help)
345 return NF_ACCEPT;
346
347 helper = rcu_dereference(help->helper);
348 if (!helper)
349 return NF_ACCEPT;
350
351 switch (proto) {
352 case NFPROTO_IPV4:
353 protoff = ip_hdrlen(skb);
354 break;
355 case NFPROTO_IPV6: {
11251c17 356 __be16 frag_off;
c05e2094 357 int ofs;
11251c17 358
4cc85f28 359 nexthdr = ipv6_hdr(skb)->nexthdr;
c05e2094
JS
360 ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
361 &frag_off);
362 if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
11251c17
JS
363 pr_debug("proto header not found\n");
364 return NF_ACCEPT;
365 }
c05e2094 366 protoff = ofs;
11251c17
JS
367 break;
368 }
369 default:
370 WARN_ONCE(1, "helper invoked on non-IP family!");
371 return NF_DROP;
372 }
373
4cc85f28
JR
374#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0)
375 /* Linux 4.5 and older depend on skb_dst being set when recalculating
376 * checksums after NAT helper has mangled TCP or UDP packet payload.
3d47fa46
JH
377 * skb_dst is cast to a rtable struct and the flags examined.
378 * Forcing these flags to have RTCF_LOCAL not set ensures checksum mod
379 * is carried out in the same way as kernel versions > 4.5
4cc85f28 380 */
3d47fa46
JH
381 if (ct->status & IPS_NAT_MASK && skb->ip_summed != CHECKSUM_PARTIAL
382 && !skb_dst(skb)) {
383 dst_set = true;
384 skb_dst_set(skb, &rt.dst);
4cc85f28
JR
385 }
386#endif
f8f97cdc
JR
387 err = helper->help(skb, protoff, ct, ctinfo);
388 if (err != NF_ACCEPT)
389 return err;
390
3d47fa46
JH
391#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0)
392 if (dst_set)
393 skb_dst_set(skb, NULL);
394#endif
395
f8f97cdc
JR
396 /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
397 * FTP with NAT) adusting the TCP payload size when mangling IP
398 * addresses and/or port numbers in the text-based control connection.
399 */
400 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
401 !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
402 return NF_DROP;
403 return NF_ACCEPT;
11251c17
JS
404}
405
c05e2094
JS
406/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
407 * value if 'skb' is freed.
408 */
a94ebc39
JS
409static int handle_fragments(struct net *net, struct sw_flow_key *key,
410 u16 zone, struct sk_buff *skb)
411{
86c2eb45 412 struct ovs_gso_cb ovs_cb = *OVS_GSO_CB(skb);
2e602ea3 413 int err;
a94ebc39 414
a94ebc39
JS
415 if (key->eth.type == htons(ETH_P_IP)) {
416 enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
a94ebc39
JS
417
418 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
39c0ff22 419 err = ip_defrag(net, skb, user);
a94ebc39
JS
420 if (err)
421 return err;
422
86c2eb45 423 ovs_cb.dp_cb.mru = IPCB(skb)->frag_max_size;
a94ebc39 424#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
c05e2094 425 } else if (key->eth.type == htons(ETH_P_IPV6)) {
a94ebc39 426 enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
a94ebc39 427
66ec6da8 428 skb_orphan(skb);
a94ebc39 429 memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
2e602ea3 430 err = nf_ct_frag6_gather(net, skb, user);
6b4fe5a9
DDP
431 if (err) {
432 if (err != -EINPROGRESS)
433 kfree_skb(skb);
2e602ea3 434 return err;
6b4fe5a9 435 }
a94ebc39 436
2e602ea3 437 key->ip.proto = ipv6_hdr(skb)->nexthdr;
86c2eb45 438 ovs_cb.dp_cb.mru = IP6CB(skb)->frag_max_size;
a94ebc39
JS
439#endif /* IP frag support */
440 } else {
c05e2094 441 kfree_skb(skb);
a94ebc39
JS
442 return -EPFNOSUPPORT;
443 }
444
445 key->ip.frag = OVS_FRAG_TYPE_NONE;
446 skb_clear_hash(skb);
447 skb->ignore_df = 1;
86c2eb45 448 *OVS_GSO_CB(skb) = ovs_cb;
a94ebc39
JS
449
450 return 0;
451}
452
453static struct nf_conntrack_expect *
454ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
455 u16 proto, const struct sk_buff *skb)
456{
457 struct nf_conntrack_tuple tuple;
458
fa67f8e0 459 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
a94ebc39
JS
460 return NULL;
461 return __nf_ct_expect_find(net, zone, &tuple);
462}
463
3dd9e118
JR
464/* This replicates logic from nf_conntrack_core.c that is not exported. */
465static enum ip_conntrack_info
466ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
467{
468 const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
469
470 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
471 return IP_CT_ESTABLISHED_REPLY;
472 /* Once we've had two way comms, always ESTABLISHED. */
473 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
474 return IP_CT_ESTABLISHED;
475 if (test_bit(IPS_EXPECTED_BIT, &ct->status))
476 return IP_CT_RELATED;
477 return IP_CT_NEW;
478}
479
480/* Find an existing connection which this packet belongs to without
481 * re-attributing statistics or modifying the connection state. This allows an
e3c42eb8 482 * skb->_nfct lost due to an upcall to be recovered during actions execution.
3dd9e118
JR
483 *
484 * Must be called with rcu_read_lock.
485 *
e3c42eb8
JR
486 * On success, populates skb->_nfct and returns the connection. Returns NULL
487 * if there is no existing entry.
3dd9e118
JR
488 */
489static struct nf_conn *
490ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
c0324e37 491 u8 l3num, struct sk_buff *skb, bool natted)
3dd9e118
JR
492{
493 struct nf_conntrack_l3proto *l3proto;
494 struct nf_conntrack_l4proto *l4proto;
495 struct nf_conntrack_tuple tuple;
496 struct nf_conntrack_tuple_hash *h;
3dd9e118
JR
497 struct nf_conn *ct;
498 unsigned int dataoff;
499 u8 protonum;
500
501 l3proto = __nf_ct_l3proto_find(l3num);
3dd9e118
JR
502 if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
503 &protonum) <= 0) {
504 pr_debug("ovs_ct_find_existing: Can't get protonum\n");
505 return NULL;
506 }
507 l4proto = __nf_ct_l4proto_find(l3num, protonum);
3dd9e118
JR
508 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
509 protonum, net, &tuple, l3proto, l4proto)) {
510 pr_debug("ovs_ct_find_existing: Can't get tuple\n");
511 return NULL;
512 }
513
c0324e37
JR
514 /* Must invert the tuple if skb has been transformed by NAT. */
515 if (natted) {
516 struct nf_conntrack_tuple inverse;
517
518 if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) {
519 pr_debug("ovs_ct_find_existing: Inversion failed!\n");
520 return NULL;
521 }
522 tuple = inverse;
523 }
524
3dd9e118
JR
525 /* look for tuple match */
526 h = nf_conntrack_find_get(net, zone, &tuple);
527 if (!h)
528 return NULL; /* Not found. */
529
530 ct = nf_ct_tuplehash_to_ctrack(h);
531
c0324e37
JR
532 /* Inverted packet tuple matches the reverse direction conntrack tuple,
533 * select the other tuplehash to get the right 'ctinfo' bits for this
534 * packet.
535 */
536 if (natted)
537 h = &ct->tuplehash[!h->tuple.dst.dir];
538
dfa791b2 539 nf_ct_set(skb, ct, ovs_ct_get_info(h));
3dd9e118
JR
540 return ct;
541}
542
e3c42eb8 543/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
3dd9e118
JR
544static bool skb_nfct_cached(struct net *net,
545 const struct sw_flow_key *key,
546 const struct ovs_conntrack_info *info,
547 struct sk_buff *skb)
a94ebc39
JS
548{
549 enum ip_conntrack_info ctinfo;
550 struct nf_conn *ct;
551
552 ct = nf_ct_get(skb, &ctinfo);
3dd9e118 553 /* If no ct, check if we have evidence that an existing conntrack entry
e3c42eb8 554 * might be found for this skb. This happens when we lose a skb->_nfct
3dd9e118
JR
555 * due to an upcall. If the connection was not confirmed, it is not
556 * cached and needs to be run through conntrack again.
557 */
558 if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
559 !(key->ct.state & OVS_CS_F_INVALID) &&
560 key->ct.zone == info->zone.id)
c0324e37
JR
561 ct = ovs_ct_find_existing(net, &info->zone, info->family, skb,
562 !!(key->ct.state
563 & OVS_CS_F_NAT_MASK));
a94ebc39
JS
564 if (!ct)
565 return false;
566 if (!net_eq(net, read_pnet(&ct->ct_net)))
567 return false;
568 if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct)))
569 return false;
11251c17
JS
570 if (info->helper) {
571 struct nf_conn_help *help;
572
573 help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
574 if (help && rcu_access_pointer(help->helper) != info->helper)
575 return false;
576 }
a94ebc39
JS
577
578 return true;
579}
580
f8f97cdc
JR
581#ifdef CONFIG_NF_NAT_NEEDED
582/* Modelled after nf_nat_ipv[46]_fn().
583 * range is only used for new, uninitialized NAT state.
584 * Returns either NF_ACCEPT or NF_DROP.
585 */
586static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
587 enum ip_conntrack_info ctinfo,
588 const struct nf_nat_range *range,
589 enum nf_nat_manip_type maniptype)
590{
591 int hooknum, nh_off, err = NF_ACCEPT;
592
593 nh_off = skb_network_offset(skb);
073c7b86 594 skb_pull_rcsum(skb, nh_off);
f8f97cdc
JR
595
596 /* See HOOK2MANIP(). */
597 if (maniptype == NF_NAT_MANIP_SRC)
598 hooknum = NF_INET_LOCAL_IN; /* Source NAT */
599 else
600 hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
601
602 switch (ctinfo) {
603 case IP_CT_RELATED:
604 case IP_CT_RELATED_REPLY:
90b01477
AB
605 if (IS_ENABLED(CONFIG_NF_NAT_IPV4) &&
606 skb->protocol == htons(ETH_P_IP) &&
f8f97cdc
JR
607 ip_hdr(skb)->protocol == IPPROTO_ICMP) {
608 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
609 hooknum))
610 err = NF_DROP;
611 goto push;
90b01477
AB
612 } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) &&
613 skb->protocol == htons(ETH_P_IPV6)) {
f8f97cdc
JR
614 __be16 frag_off;
615 u8 nexthdr = ipv6_hdr(skb)->nexthdr;
616 int hdrlen = ipv6_skip_exthdr(skb,
617 sizeof(struct ipv6hdr),
618 &nexthdr, &frag_off);
619
620 if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
621 if (!nf_nat_icmpv6_reply_translation(skb, ct,
622 ctinfo,
623 hooknum,
624 hdrlen))
625 err = NF_DROP;
626 goto push;
627 }
f8f97cdc
JR
628 }
629 /* Non-ICMP, fall thru to initialize if needed. */
630 case IP_CT_NEW:
631 /* Seen it before? This can happen for loopback, retrans,
632 * or local packets.
633 */
634 if (!nf_nat_initialized(ct, maniptype)) {
635 /* Initialize according to the NAT action. */
636 err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
637 /* Action is set up to establish a new
638 * mapping.
639 */
640 ? nf_nat_setup_info(ct, range, maniptype)
641 : nf_nat_alloc_null_binding(ct, hooknum);
642 if (err != NF_ACCEPT)
643 goto push;
644 }
645 break;
646
647 case IP_CT_ESTABLISHED:
648 case IP_CT_ESTABLISHED_REPLY:
649 break;
650
651 default:
652 err = NF_DROP;
653 goto push;
654 }
655
656 err = nf_nat_packet(ct, ctinfo, hooknum, skb);
657push:
658 skb_push(skb, nh_off);
073c7b86 659 skb_postpush_rcsum(skb, skb->data, nh_off);
f8f97cdc
JR
660
661 return err;
662}
663
664static void ovs_nat_update_key(struct sw_flow_key *key,
665 const struct sk_buff *skb,
666 enum nf_nat_manip_type maniptype)
667{
668 if (maniptype == NF_NAT_MANIP_SRC) {
669 __be16 src;
670
671 key->ct.state |= OVS_CS_F_SRC_NAT;
672 if (key->eth.type == htons(ETH_P_IP))
673 key->ipv4.addr.src = ip_hdr(skb)->saddr;
674 else if (key->eth.type == htons(ETH_P_IPV6))
675 memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
676 sizeof(key->ipv6.addr.src));
677 else
678 return;
679
680 if (key->ip.proto == IPPROTO_UDP)
681 src = udp_hdr(skb)->source;
682 else if (key->ip.proto == IPPROTO_TCP)
683 src = tcp_hdr(skb)->source;
684 else if (key->ip.proto == IPPROTO_SCTP)
685 src = sctp_hdr(skb)->source;
686 else
687 return;
688
689 key->tp.src = src;
690 } else {
691 __be16 dst;
692
693 key->ct.state |= OVS_CS_F_DST_NAT;
694 if (key->eth.type == htons(ETH_P_IP))
695 key->ipv4.addr.dst = ip_hdr(skb)->daddr;
696 else if (key->eth.type == htons(ETH_P_IPV6))
697 memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
698 sizeof(key->ipv6.addr.dst));
699 else
700 return;
701
702 if (key->ip.proto == IPPROTO_UDP)
703 dst = udp_hdr(skb)->dest;
704 else if (key->ip.proto == IPPROTO_TCP)
705 dst = tcp_hdr(skb)->dest;
706 else if (key->ip.proto == IPPROTO_SCTP)
707 dst = sctp_hdr(skb)->dest;
708 else
709 return;
710
711 key->tp.dst = dst;
712 }
713}
714
715/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
716static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
717 const struct ovs_conntrack_info *info,
718 struct sk_buff *skb, struct nf_conn *ct,
719 enum ip_conntrack_info ctinfo)
720{
721 enum nf_nat_manip_type maniptype;
722 int err;
723
724 if (nf_ct_is_untracked(ct)) {
725 /* A NAT action may only be performed on tracked packets. */
726 return NF_ACCEPT;
727 }
728
729 /* Add NAT extension if not confirmed yet. */
730 if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
731 return NF_ACCEPT; /* Can't NAT. */
732
733 /* Determine NAT type.
734 * Check if the NAT type can be deduced from the tracked connection.
d2e8b514
JR
735 * Make sure new expected connections (IP_CT_RELATED) are NATted only
736 * when committing.
f8f97cdc
JR
737 */
738 if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
739 ct->status & IPS_NAT_MASK &&
d2e8b514 740 (ctinfo != IP_CT_RELATED || info->commit)) {
f8f97cdc
JR
741 /* NAT an established or related connection like before. */
742 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
743 /* This is the REPLY direction for a connection
744 * for which NAT was applied in the forward
745 * direction. Do the reverse NAT.
746 */
747 maniptype = ct->status & IPS_SRC_NAT
748 ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
749 else
750 maniptype = ct->status & IPS_SRC_NAT
751 ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
752 } else if (info->nat & OVS_CT_SRC_NAT) {
753 maniptype = NF_NAT_MANIP_SRC;
754 } else if (info->nat & OVS_CT_DST_NAT) {
755 maniptype = NF_NAT_MANIP_DST;
756 } else {
757 return NF_ACCEPT; /* Connection is not NATed. */
758 }
759 err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
760
761 /* Mark NAT done if successful and update the flow key. */
762 if (err == NF_ACCEPT)
763 ovs_nat_update_key(key, skb, maniptype);
764
765 return err;
766}
767#else /* !CONFIG_NF_NAT_NEEDED */
768static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
769 const struct ovs_conntrack_info *info,
770 struct sk_buff *skb, struct nf_conn *ct,
771 enum ip_conntrack_info ctinfo)
772{
773 return NF_ACCEPT;
774}
775#endif
776
b0f251cd 777/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
a04a5794
JR
778 * not done already. Update key with new CT state after passing the packet
779 * through conntrack.
e3c42eb8 780 * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be
b0f251cd
JR
781 * set to NULL and 0 will be returned.
782 */
c05e2094 783static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
a94ebc39
JS
784 const struct ovs_conntrack_info *info,
785 struct sk_buff *skb)
786{
787 /* If we are recirculating packets to match on conntrack fields and
788 * committing with a separate conntrack action, then we don't need to
789 * actually run the packet through conntrack twice unless it's for a
790 * different zone.
791 */
b21d237e
JR
792 bool cached = skb_nfct_cached(net, key, info, skb);
793 enum ip_conntrack_info ctinfo;
794 struct nf_conn *ct;
795
796 if (!cached) {
a94ebc39 797 struct nf_conn *tmpl = info->ct;
9bf67b92 798 int err;
a94ebc39
JS
799
800 /* Associate skb with specified zone. */
801 if (tmpl) {
d3c313c1
FW
802 if (skb_nfct(skb))
803 nf_conntrack_put(skb_nfct(skb));
a94ebc39 804 nf_conntrack_get(&tmpl->ct_general);
dfa791b2 805 nf_ct_set(skb, tmpl, IP_CT_NEW);
a94ebc39
JS
806 }
807
a6d28f7c
PNA
808 err = nf_conntrack_in(net, info->family,
809 NF_INET_PRE_ROUTING, skb);
9bf67b92 810 if (err != NF_ACCEPT)
a94ebc39 811 return -ENOENT;
11251c17 812
f8f97cdc
JR
813 /* Clear CT state NAT flags to mark that we have not yet done
814 * NAT after the nf_conntrack_in() call. We can actually clear
815 * the whole state, as it will be re-initialized below.
816 */
817 key->ct.state = 0;
818
819 /* Update the key, but keep the NAT flags. */
820 ovs_ct_update_key(skb, info, key, true, true);
b21d237e 821 }
a04a5794 822
b21d237e 823 ct = nf_ct_get(skb, &ctinfo);
f8f97cdc
JR
824 if (ct) {
825 /* Packets starting a new connection must be NATted before the
826 * helper, so that the helper knows about the NAT. We enforce
827 * this by delaying both NAT and helper calls for unconfirmed
828 * connections until the committing CT action. For later
829 * packets NAT and Helper may be called in either order.
830 *
831 * NAT will be done only if the CT action has NAT, and only
832 * once per packet (per zone), as guarded by the NAT bits in
833 * the key->ct.state.
834 */
835 if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
836 (nf_ct_is_confirmed(ct) || info->commit) &&
837 ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
838 return -EINVAL;
839 }
840
b87a5aac
JS
841 /* Userspace may decide to perform a ct lookup without a helper
842 * specified followed by a (recirculate and) commit with one.
843 * Therefore, for unconfirmed connections which we will commit,
844 * we need to attach the helper here.
845 */
846 if (!nf_ct_is_confirmed(ct) && info->commit &&
847 info->helper && !nfct_help(ct)) {
848 int err = __nf_ct_try_assign_helper(ct, info->ct,
849 GFP_ATOMIC);
850 if (err)
851 return err;
852 }
853
f8f97cdc
JR
854 /* Call the helper only if:
855 * - nf_conntrack_in() was executed above ("!cached") for a
856 * confirmed connection, or
857 * - When committing an unconfirmed connection.
858 */
859 if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
860 ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
861 return -EINVAL;
862 }
a94ebc39
JS
863 }
864
865 return 0;
866}
867
868/* Lookup connection and read fields into key. */
869static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
870 const struct ovs_conntrack_info *info,
871 struct sk_buff *skb)
872{
873 struct nf_conntrack_expect *exp;
874
b0f251cd
JR
875 /* If we pass an expected packet through nf_conntrack_in() the
876 * expectation is typically removed, but the packet could still be
877 * lost in upcall processing. To prevent this from happening we
878 * perform an explicit expectation lookup. Expected connections are
879 * always new, and will be passed through conntrack only when they are
880 * committed, as it is OK to remove the expectation at that time.
881 */
a94ebc39
JS
882 exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
883 if (exp) {
884 u8 state;
885
f8f97cdc
JR
886 /* NOTE: New connections are NATted and Helped only when
887 * committed, so we are not calling into NAT here.
888 */
a94ebc39 889 state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
372ce973 890 __ovs_ct_update_key(key, state, &info->zone, exp->master);
f325530e
PS
891 } else {
892 struct nf_conn *ct;
893 int err;
894
895 err = __ovs_ct_lookup(net, key, info, skb);
896 if (err)
897 return err;
898
d3c313c1 899 ct = (struct nf_conn *)skb_nfct(skb);
f325530e
PS
900 if (ct)
901 nf_ct_deliver_cached_events(ct);
902 }
a94ebc39
JS
903
904 return 0;
905}
906
c05e2094 907static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
038e34ab
JS
908{
909 size_t i;
910
83495bd9
JR
911 for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
912 if (labels->ct_labels_32[i])
038e34ab
JS
913 return true;
914
915 return false;
916}
917
39a6542b
JR
918/* Lookup connection and confirm if unconfirmed. */
919static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
920 const struct ovs_conntrack_info *info,
921 struct sk_buff *skb)
922{
efd40994
JR
923 enum ip_conntrack_info ctinfo;
924 struct nf_conn *ct;
39a6542b
JR
925 int err;
926
927 err = __ovs_ct_lookup(net, key, info, skb);
928 if (err)
929 return err;
930
efd40994
JR
931 /* The connection could be invalid, in which case this is a no-op.*/
932 ct = nf_ct_get(skb, &ctinfo);
933 if (!ct)
934 return 0;
935
39a6542b
JR
936 /* Apply changes before confirming the connection so that the initial
937 * conntrack NEW netlink event carries the values given in the CT
938 * action.
939 */
940 if (info->mark.mask) {
efd40994 941 err = ovs_ct_set_mark(ct, key, info->mark.value,
39a6542b
JR
942 info->mark.mask);
943 if (err)
944 return err;
945 }
946 if (labels_nonzero(&info->labels.mask)) {
efd40994
JR
947 if (!nf_ct_is_confirmed(ct))
948 err = ovs_ct_init_labels(ct, key, &info->labels.value,
949 &info->labels.mask);
950 else
951 err = ovs_ct_set_labels(ct, key, &info->labels.value,
952 &info->labels.mask);
39a6542b
JR
953 if (err)
954 return err;
955 }
956 /* This will take care of sending queued events even if the connection
957 * is already confirmed.
958 */
959 if (nf_conntrack_confirm(skb) != NF_ACCEPT)
960 return -EINVAL;
961
962 return 0;
963}
964
c05e2094
JS
965/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
966 * value if 'skb' is freed.
967 */
a94ebc39
JS
968int ovs_ct_execute(struct net *net, struct sk_buff *skb,
969 struct sw_flow_key *key,
970 const struct ovs_conntrack_info *info)
971{
972 int nh_ofs;
973 int err;
974
975 /* The conntrack module expects to be working at L3. */
976 nh_ofs = skb_network_offset(skb);
073c7b86 977 skb_pull_rcsum(skb, nh_ofs);
a94ebc39
JS
978
979 if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
980 err = handle_fragments(net, key, info->zone.id, skb);
981 if (err)
982 return err;
983 }
984
c05e2094 985 if (info->commit)
39a6542b 986 err = ovs_ct_commit(net, key, info, skb);
a94ebc39
JS
987 else
988 err = ovs_ct_lookup(net, key, info, skb);
989
990 skb_push(skb, nh_ofs);
073c7b86 991 skb_postpush_rcsum(skb, skb->data, nh_ofs);
c05e2094
JS
992 if (err)
993 kfree_skb(skb);
a94ebc39
JS
994 return err;
995}
996
11251c17
JS
997static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
998 const struct sw_flow_key *key, bool log)
999{
1000 struct nf_conntrack_helper *helper;
1001 struct nf_conn_help *help;
1002
1003 helper = nf_conntrack_helper_try_module_get(name, info->family,
1004 key->ip.proto);
1005 if (!helper) {
1006 OVS_NLERR(log, "Unknown helper \"%s\"", name);
1007 return -EINVAL;
1008 }
1009
1010 help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
1011 if (!help) {
1012 module_put(helper->me);
1013 return -ENOMEM;
1014 }
1015
1016 rcu_assign_pointer(help->helper, helper);
1017 info->helper = helper;
1018 return 0;
1019}
1020
f8f97cdc
JR
#ifdef CONFIG_NF_NAT_NEEDED
/* Parses the nested OVS_NAT_ATTR_* attributes in 'attr' into 'info->nat' and
 * 'info->range'.
 *
 * Returns 0 on success.  Returns -EINVAL for an unknown or mis-sized
 * attribute, trailing bytes, range flags given without SRC/DST, or SRC/DST
 * given without the commit flag; returns -ERANGE if more than one of SRC/DST
 * is given.
 */
static int parse_nat(const struct nlattr *attr,
		     struct ovs_conntrack_info *info, bool log)
{
	/* Required payload length of each attribute: [type][0] applies to
	 * non-IPv6 families, [type][1] to IPv6.
	 */
	static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
		[OVS_NAT_ATTR_SRC] = {0, 0},
		[OVS_NAT_ATTR_DST] = {0, 0},
		[OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
					 sizeof(struct in6_addr)},
		[OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
					 sizeof(struct in6_addr)},
		[OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)},
		[OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)},
		[OVS_NAT_ATTR_PERSISTENT] = {0, 0},
		[OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
		[OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
	};
	const bool is_ipv6 = (info->family == NFPROTO_IPV6);
	bool saw_ip_max = false;
	bool saw_proto_max = false;
	struct nlattr *a;
	int rem;

	nla_for_each_nested(a, attr, rem) {
		int type = nla_type(a);
		int expected_len;

		if (type > OVS_NAT_ATTR_MAX) {
			OVS_NLERR(log,
				  "Unknown NAT attribute (type=%d, max=%d).\n",
				  type, OVS_NAT_ATTR_MAX);
			return -EINVAL;
		}

		expected_len = ovs_nat_attr_lens[type][is_ipv6];
		if (nla_len(a) != expected_len) {
			OVS_NLERR(log,
				  "NAT attribute type %d has unexpected length (%d != %d).\n",
				  type, nla_len(a), expected_len);
			return -EINVAL;
		}

		switch (type) {
		case OVS_NAT_ATTR_SRC:
		case OVS_NAT_ATTR_DST:
			if (info->nat) {
				OVS_NLERR(log,
					  "Only one type of NAT may be specified.\n"
					  );
				return -ERANGE;
			}
			info->nat |= OVS_CT_NAT;
			if (type == OVS_NAT_ATTR_SRC)
				info->nat |= OVS_CT_SRC_NAT;
			else
				info->nat |= OVS_CT_DST_NAT;
			break;

		case OVS_NAT_ATTR_IP_MIN:
			nla_memcpy(&info->range.min_addr, a,
				   sizeof(info->range.min_addr));
			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
			break;

		case OVS_NAT_ATTR_IP_MAX:
			saw_ip_max = true;
			nla_memcpy(&info->range.max_addr, a,
				   sizeof(info->range.max_addr));
			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
			break;

		case OVS_NAT_ATTR_PROTO_MIN:
			info->range.min_proto.all = htons(nla_get_u16(a));
			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
			break;

		case OVS_NAT_ATTR_PROTO_MAX:
			saw_proto_max = true;
			info->range.max_proto.all = htons(nla_get_u16(a));
			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
			break;

		case OVS_NAT_ATTR_PERSISTENT:
			info->range.flags |= NF_NAT_RANGE_PERSISTENT;
			break;

		case OVS_NAT_ATTR_PROTO_HASH:
			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
			break;

		case OVS_NAT_ATTR_PROTO_RANDOM:
#ifdef NF_NAT_RANGE_PROTO_RANDOM_FULLY
			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY;
#else
			/* No RANDOM_FULLY flag available: use PROTO_RANDOM and
			 * record that the request was for the RANDOM attribute.
			 */
			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
			info->random_fully_compat = true;
#endif
			break;

		default:
			OVS_NLERR(log, "Unknown nat attribute (%d).\n", type);
			return -EINVAL;
		}
	}

	if (rem > 0) {
		OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem);
		return -EINVAL;
	}

	if (info->nat) {
		/* An explicit SRC/DST NAT is only accepted with commit. */
		if (!info->commit) {
			OVS_NLERR(log,
				  "NAT attributes may be specified only when CT COMMIT flag is also specified.\n"
				  );
			return -EINVAL;
		}
	} else {
		/* Do not allow flags if no type is given. */
		if (info->range.flags) {
			OVS_NLERR(log,
				  "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n"
				  );
			return -EINVAL;
		}
		info->nat = OVS_CT_NAT;   /* NAT existing connections. */
	}

	/* A missing IP_MAX defaults to IP_MIN. */
	if ((info->range.flags & NF_NAT_RANGE_MAP_IPS) && !saw_ip_max)
		memcpy(&info->range.max_addr, &info->range.min_addr,
		       sizeof(info->range.max_addr));

	/* A missing PROTO_MAX defaults to PROTO_MIN. */
	if ((info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED) &&
	    !saw_proto_max)
		info->range.max_proto.all = info->range.min_proto.all;

	return 0;
}
#endif
1155
a94ebc39 1156static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
c05e2094 1157 [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 },
a94ebc39
JS
1158 [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16),
1159 .maxlen = sizeof(u16) },
372ce973
JS
1160 [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark),
1161 .maxlen = sizeof(struct md_mark) },
c05e2094
JS
1162 [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels),
1163 .maxlen = sizeof(struct md_labels) },
11251c17 1164 [OVS_CT_ATTR_HELPER] = { .minlen = 1,
f8f97cdc
JR
1165 .maxlen = NF_CT_HELPER_NAME_LEN },
1166#ifdef CONFIG_NF_NAT_NEEDED
1167 /* NAT length is checked when parsing the nested attributes. */
1168 [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX },
1169#endif
a94ebc39
JS
1170};
1171
1172static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
11251c17 1173 const char **helper, bool log)
a94ebc39
JS
1174{
1175 struct nlattr *a;
1176 int rem;
1177
1178 nla_for_each_nested(a, attr, rem) {
1179 int type = nla_type(a);
1180 int maxlen = ovs_ct_attr_lens[type].maxlen;
1181 int minlen = ovs_ct_attr_lens[type].minlen;
1182
1183 if (type > OVS_CT_ATTR_MAX) {
1184 OVS_NLERR(log,
1185 "Unknown conntrack attr (type=%d, max=%d)",
1186 type, OVS_CT_ATTR_MAX);
1187 return -EINVAL;
1188 }
1189 if (nla_len(a) < minlen || nla_len(a) > maxlen) {
1190 OVS_NLERR(log,
1191 "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)",
1192 type, nla_len(a), maxlen);
1193 return -EINVAL;
1194 }
1195
1196 switch (type) {
c05e2094
JS
1197 case OVS_CT_ATTR_COMMIT:
1198 info->commit = true;
a94ebc39
JS
1199 break;
1200#ifdef CONFIG_NF_CONNTRACK_ZONES
1201 case OVS_CT_ATTR_ZONE:
1202 info->zone.id = nla_get_u16(a);
1203 break;
372ce973
JS
1204#endif
1205#ifdef CONFIG_NF_CONNTRACK_MARK
1206 case OVS_CT_ATTR_MARK: {
1207 struct md_mark *mark = nla_data(a);
1208
c05e2094
JS
1209 if (!mark->mask) {
1210 OVS_NLERR(log, "ct_mark mask cannot be 0");
1211 return -EINVAL;
1212 }
372ce973
JS
1213 info->mark = *mark;
1214 break;
1215 }
038e34ab
JS
1216#endif
1217#ifdef CONFIG_NF_CONNTRACK_LABELS
c05e2094
JS
1218 case OVS_CT_ATTR_LABELS: {
1219 struct md_labels *labels = nla_data(a);
038e34ab 1220
c05e2094
JS
1221 if (!labels_nonzero(&labels->mask)) {
1222 OVS_NLERR(log, "ct_labels mask cannot be 0");
1223 return -EINVAL;
1224 }
1225 info->labels = *labels;
038e34ab
JS
1226 break;
1227 }
a94ebc39 1228#endif
11251c17
JS
1229 case OVS_CT_ATTR_HELPER:
1230 *helper = nla_data(a);
1231 if (!memchr(*helper, '\0', nla_len(a))) {
1232 OVS_NLERR(log, "Invalid conntrack helper");
1233 return -EINVAL;
1234 }
1235 break;
f8f97cdc
JR
1236#ifdef CONFIG_NF_NAT_NEEDED
1237 case OVS_CT_ATTR_NAT: {
1238 int err = parse_nat(a, info, log);
1239
1240 if (err)
1241 return err;
1242 break;
1243 }
1244#endif
a94ebc39
JS
1245 default:
1246 OVS_NLERR(log, "Unknown conntrack attr (%d)",
1247 type);
1248 return -EINVAL;
1249 }
1250 }
1251
39a6542b
JR
1252#ifdef CONFIG_NF_CONNTRACK_MARK
1253 if (!info->commit && info->mark.mask) {
1254 OVS_NLERR(log,
1255 "Setting conntrack mark requires 'commit' flag.");
1256 return -EINVAL;
1257 }
1258#endif
1259#ifdef CONFIG_NF_CONNTRACK_LABELS
1260 if (!info->commit && labels_nonzero(&info->labels.mask)) {
1261 OVS_NLERR(log,
1262 "Setting conntrack labels requires 'commit' flag.");
1263 return -EINVAL;
1264 }
1265#endif
a94ebc39
JS
1266 if (rem > 0) {
1267 OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem);
1268 return -EINVAL;
1269 }
1270
1271 return 0;
1272}
1273
038e34ab 1274bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr)
a94ebc39
JS
1275{
1276 if (attr == OVS_KEY_ATTR_CT_STATE)
1277 return true;
1278 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
1279 attr == OVS_KEY_ATTR_CT_ZONE)
1280 return true;
372ce973
JS
1281 if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
1282 attr == OVS_KEY_ATTR_CT_MARK)
1283 return true;
038e34ab 1284 if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
c05e2094 1285 attr == OVS_KEY_ATTR_CT_LABELS) {
038e34ab
JS
1286 struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
1287
1288 return ovs_net->xt_label;
1289 }
a94ebc39
JS
1290
1291 return false;
1292}
1293
1294int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
1295 const struct sw_flow_key *key,
1296 struct sw_flow_actions **sfa, bool log)
1297{
1298 struct ovs_conntrack_info ct_info;
11251c17 1299 const char *helper = NULL;
a94ebc39
JS
1300 u16 family;
1301 int err;
1302
1303 family = key_to_nfproto(key);
1304 if (family == NFPROTO_UNSPEC) {
1305 OVS_NLERR(log, "ct family unspecified");
1306 return -EINVAL;
1307 }
1308
1309 memset(&ct_info, 0, sizeof(ct_info));
1310 ct_info.family = family;
1311
1312 nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID,
1313 NF_CT_DEFAULT_ZONE_DIR, 0);
1314
11251c17 1315 err = parse_ct(attr, &ct_info, &helper, log);
a94ebc39
JS
1316 if (err)
1317 return err;
1318
1319 /* Set up template for tracking connections in specific zones. */
1320 ct_info.ct = nf_ct_tmpl_alloc(net, &ct_info.zone, GFP_KERNEL);
1321 if (!ct_info.ct) {
1322 OVS_NLERR(log, "Failed to allocate conntrack template");
1323 return -ENOMEM;
1324 }
a3a68d63
JS
1325
1326 __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
1327 nf_conntrack_get(&ct_info.ct->ct_general);
1328
11251c17
JS
1329 if (helper) {
1330 err = ovs_ct_add_helper(&ct_info, helper, key, log);
1331 if (err)
1332 goto err_free_ct;
1333 }
a94ebc39
JS
1334
1335 err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info,
1336 sizeof(ct_info), log);
1337 if (err)
1338 goto err_free_ct;
1339
a94ebc39
JS
1340 return 0;
1341err_free_ct:
11251c17 1342 __ovs_ct_free_action(&ct_info);
a94ebc39
JS
1343 return err;
1344}
1345
f8f97cdc
JR
#ifdef CONFIG_NF_NAT_NEEDED
/* Serializes the NAT configuration in 'info' into 'skb' as a nested
 * OVS_CT_ATTR_NAT attribute.  MAX attributes are only emitted when they
 * differ from the corresponding MIN value.
 *
 * Returns true on success, false if an attribute could not be added or the
 * address family is not one that can be serialized.
 */
static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
			       struct sk_buff *skb)
{
	struct nlattr *nest;
	int nat_type;

	nest = nla_nest_start(skb, OVS_CT_ATTR_NAT);
	if (!nest)
		return false;

	if (info->nat & OVS_CT_SRC_NAT)
		nat_type = OVS_NAT_ATTR_SRC;
	else if (info->nat & OVS_CT_DST_NAT)
		nat_type = OVS_NAT_ATTR_DST;
	else
		goto out;	/* Neither SRC nor DST: emit an empty nest. */

	if (nla_put_flag(skb, nat_type))
		return false;

	if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
		if (IS_ENABLED(CONFIG_NF_NAT_IPV4) &&
		    info->family == NFPROTO_IPV4) {
			if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
					    info->range.min_addr.ip))
				return false;
			if (info->range.max_addr.ip != info->range.min_addr.ip &&
			    nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
					    info->range.max_addr.ip))
				return false;
		} else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) &&
			   info->family == NFPROTO_IPV6) {
			if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
					     &info->range.min_addr.in6))
				return false;
			if (memcmp(&info->range.max_addr.in6,
				   &info->range.min_addr.in6,
				   sizeof(info->range.max_addr.in6)) &&
			    nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX,
					     &info->range.max_addr.in6))
				return false;
		} else {
			return false;
		}
	}

	if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
		if (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN,
				ntohs(info->range.min_proto.all)))
			return false;
		if (info->range.max_proto.all != info->range.min_proto.all &&
		    nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX,
				ntohs(info->range.max_proto.all)))
			return false;
	}

	if (info->range.flags & NF_NAT_RANGE_PERSISTENT) {
		if (nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT))
			return false;
	}

	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM) {
		/* In compat mode PROTO_RANDOM stands in for a RANDOM request. */
		int random_attr = info->random_fully_compat
				  ? OVS_NAT_ATTR_PROTO_RANDOM
				  : OVS_NAT_ATTR_PROTO_HASH;

		if (nla_put_flag(skb, random_attr))
			return false;
	}

#ifdef NF_NAT_RANGE_PROTO_RANDOM_FULLY
	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) {
		if (nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM))
			return false;
	}
#endif

out:
	nla_nest_end(skb, nest);

	return true;
}
#endif
1417
a94ebc39
JS
1418int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
1419 struct sk_buff *skb)
1420{
1421 struct nlattr *start;
1422
1423 start = nla_nest_start(skb, OVS_ACTION_ATTR_CT);
1424 if (!start)
1425 return -EMSGSIZE;
1426
c05e2094 1427 if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT))
a94ebc39
JS
1428 return -EMSGSIZE;
1429 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
1430 nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id))
1431 return -EMSGSIZE;
c05e2094 1432 if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && ct_info->mark.mask &&
372ce973
JS
1433 nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark),
1434 &ct_info->mark))
1435 return -EMSGSIZE;
038e34ab 1436 if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
c05e2094
JS
1437 labels_nonzero(&ct_info->labels.mask) &&
1438 nla_put(skb, OVS_CT_ATTR_LABELS, sizeof(ct_info->labels),
1439 &ct_info->labels))
038e34ab 1440 return -EMSGSIZE;
11251c17
JS
1441 if (ct_info->helper) {
1442 if (nla_put_string(skb, OVS_CT_ATTR_HELPER,
1443 ct_info->helper->name))
1444 return -EMSGSIZE;
1445 }
f8f97cdc
JR
1446#ifdef CONFIG_NF_NAT_NEEDED
1447 if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
1448 return -EMSGSIZE;
1449#endif
a94ebc39
JS
1450 nla_nest_end(skb, start);
1451
1452 return 0;
1453}
1454
/* Releases the resources held by the conntrack action stored in the payload
 * of netlink attribute 'a'.
 */
void ovs_ct_free_action(const struct nlattr *a)
{
	/* The attribute payload is the ovs_conntrack_info itself. */
	__ovs_ct_free_action(nla_data(a));
}
1461
1462static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
1463{
1464 if (ct_info->helper)
1465 module_put(ct_info->helper->me);
a94ebc39
JS
1466 if (ct_info->ct)
1467 nf_ct_tmpl_free(ct_info->ct);
1468}
1469
038e34ab
JS
1470void ovs_ct_init(struct net *net)
1471{
c05e2094 1472 unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
038e34ab
JS
1473 struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
1474
7f2ab8cd 1475 if (nf_connlabels_get(net, n_bits - 1)) {
038e34ab
JS
1476 ovs_net->xt_label = false;
1477 OVS_NLERR(true, "Failed to set connlabel length");
1478 } else {
1479 ovs_net->xt_label = true;
1480 }
1481}
1482
1483void ovs_ct_exit(struct net *net)
1484{
1485 struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
1486
1487 if (ovs_net->xt_label)
1488 nf_connlabels_put(net);
1489}
1490
8063e095 1491#endif /* CONFIG_NF_CONNTRACK */