]> git.proxmox.com Git - mirror_ovs.git/blob - lib/netlink-conntrack.c
datapath-windows: Correct endianness for deleting zone.
[mirror_ovs.git] / lib / netlink-conntrack.c
1 /*
2 * Copyright (c) 2015 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "netlink-conntrack.h"
20
21 #include <errno.h>
22 #include <linux/netfilter/nfnetlink.h>
23 #include <linux/netfilter/nfnetlink_conntrack.h>
24 #include <linux/netfilter/nf_conntrack_common.h>
25 #include <linux/netfilter/nf_conntrack_tcp.h>
26 #include <linux/netfilter/nf_conntrack_ftp.h>
27 #include <linux/netfilter/nf_conntrack_sctp.h>
28
29 #include "byte-order.h"
30 #include "compiler.h"
31 #include "openvswitch/dynamic-string.h"
32 #include "netlink.h"
33 #include "netlink-socket.h"
34 #include "openvswitch/ofpbuf.h"
35 #include "openvswitch/vlog.h"
36 #include "openvswitch/poll-loop.h"
37 #include "timeval.h"
38 #include "unixctl.h"
39 #include "util.h"
40
41 VLOG_DEFINE_THIS_MODULE(netlink_conntrack);
42 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
43
44 /* This module works only if conntrack modules and features are enabled in the
45 * Linux kernel. This can be done from a root shell like this:
46 *
47 * $ modprobe ip_conntrack
48 * $ sysctl -w net.netfilter.nf_conntrack_acct=1
49 * $ sysctl -w net.netfilter.nf_conntrack_timestamp=1
50 *
51 * Also, if testing conntrack label feature without conntrack-aware OVS kernel
52 * module, there must be a connlabel rule in iptables for space to be reserved
53 * for the labels (see kernel source connlabel_mt_check()). Such a rule can be
54 * inserted from a root shell like this:
55 *
56 * $ iptables -A INPUT -m conntrack -m connlabel \
57 * --ctstate NEW,ESTABLISHED,RELATED --label 127 -j ACCEPT
58 */
59
/* Some attributes were introduced in later kernels: with these definitions
 * we should be able to compile userspace against Linux 2.6.32+. */

/* Top-level conntrack attribute types, numbered relative to CTA_SECMARK so
 * that they do not depend on the installed kernel headers providing them. */
#define CTA_ZONE (CTA_SECMARK + 1)
#define CTA_SECCTX (CTA_SECMARK + 2)
#define CTA_TIMESTAMP (CTA_SECMARK + 3)
#define CTA_MARK_MASK (CTA_SECMARK + 4)
#define CTA_LABELS (CTA_SECMARK + 5)
#define CTA_LABELS_MASK (CTA_SECMARK + 6)

/* Attribute types nested inside CTA_TIMESTAMP (see
 * nl_ct_parse_timestamp()). */
#define CTA_TIMESTAMP_START 1
#define CTA_TIMESTAMP_STOP 2

/* Connection status bits that may be missing from older headers; each flag
 * is the single bit selected by the matching *_BIT definition. */
#define IPS_TEMPLATE_BIT 11
#define IPS_TEMPLATE (1 << IPS_TEMPLATE_BIT)

#define IPS_UNTRACKED_BIT 12
#define IPS_UNTRACKED (1 << IPS_UNTRACKED_BIT)
78
/* Netlink policy for the top-level attributes of a conntrack message.  It is
 * handed to nl_policy_parse() by nl_ct_parse_header_policy(), and its size
 * (ARRAY_SIZE) also dimensions every 'attrs' array used in this file.
 *
 * Entries with '.optional = false' are presumably required for a message to
 * be accepted -- the exact semantics are those of nl_policy_parse(), which is
 * defined elsewhere. */
static const struct nl_policy nfnlgrp_conntrack_policy[] = {
    [CTA_TUPLE_ORIG] = { .type = NL_A_NESTED, .optional = false },
    [CTA_TUPLE_REPLY] = { .type = NL_A_NESTED, .optional = false },
    [CTA_ZONE] = { .type = NL_A_BE16, .optional = true },
    [CTA_STATUS] = { .type = NL_A_BE32, .optional = false },
    [CTA_TIMESTAMP] = { .type = NL_A_NESTED, .optional = true },
    [CTA_TIMEOUT] = { .type = NL_A_BE32, .optional = true },
    [CTA_COUNTERS_ORIG] = { .type = NL_A_NESTED, .optional = true },
    [CTA_COUNTERS_REPLY] = { .type = NL_A_NESTED, .optional = true },
    [CTA_PROTOINFO] = { .type = NL_A_NESTED, .optional = true },
    [CTA_HELP] = { .type = NL_A_NESTED, .optional = true },
    [CTA_MARK] = { .type = NL_A_BE32, .optional = true },
    [CTA_SECCTX] = { .type = NL_A_NESTED, .optional = true },
    [CTA_ID] = { .type = NL_A_BE32, .optional = false },
    [CTA_USE] = { .type = NL_A_BE32, .optional = true },
    [CTA_TUPLE_MASTER] = { .type = NL_A_NESTED, .optional = true },
    [CTA_NAT_SEQ_ADJ_ORIG] = { .type = NL_A_NESTED, .optional = true },
    [CTA_NAT_SEQ_ADJ_REPLY] = { .type = NL_A_NESTED, .optional = true },
    [CTA_LABELS] = { .type = NL_A_UNSPEC, .optional = true },
    /* CTA_NAT_SRC, CTA_NAT_DST, CTA_TIMESTAMP, CTA_MARK_MASK, and
     * CTA_LABELS_MASK are not received from kernel.
     *
     * NOTE(review): CTA_TIMESTAMP is named in the list above even though this
     * policy accepts it and nl_ct_attrs_to_ct_dpif_entry() parses it; the
     * list looks stale on that point -- confirm. */
};
101
/* Declarations for conntrack netlink dumping. */

/* Starts a netfilter netlink message in 'msg' (nlmsghdr plus nfgenmsg). */
static void nl_msg_put_nfgenmsg(struct ofpbuf *msg, size_t expected_payload,
                                int family, uint8_t subsystem, uint8_t cmd,
                                uint32_t flags);

/* Validates the headers of the message in 'buf' and splits its top-level
 * attributes into 'attrs' according to nfnlgrp_conntrack_policy. */
static bool nl_ct_parse_header_policy(struct ofpbuf *buf,
        enum nl_ct_event_type *event_type,
        uint8_t *nfgen_family,
        struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)]);

/* Converts already-split attributes 'attrs' into 'entry'. */
static bool nl_ct_attrs_to_ct_dpif_entry(struct ct_dpif_entry *entry,
        struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)],
        uint8_t nfgen_family);
/* Appends 'tuple' to 'buf' as a nested attribute of the given 'type'. */
static bool nl_ct_put_ct_tuple(struct ofpbuf *buf,
        const struct ct_dpif_tuple *tuple, enum ctattr_type type);
117
/* State carried between nl_ct_dump_start(), nl_ct_dump_next() and
 * nl_ct_dump_done() for a single conntrack dump. */
struct nl_ct_dump_state {
    struct nl_dump dump;    /* Underlying netlink dump. */
    struct ofpbuf buf;      /* Holds the dump request, then is cleared and
                             * handed to nl_dump_next() as its buffer. */
    bool filter_zone;       /* If true, nl_ct_dump_next() only returns entries
                             * whose zone equals 'zone'. */
    uint16_t zone;          /* Zone to report, in host byte order.  Only
                             * meaningful when 'filter_zone' is true. */
};
124
125 /* Conntrack netlink dumping. */
126
127 /* Initialize a conntrack netlink dump. */
128 int
129 nl_ct_dump_start(struct nl_ct_dump_state **statep, const uint16_t *zone,
130 int *ptot_bkts)
131 {
132 struct nl_ct_dump_state *state;
133
134 *statep = state = xzalloc(sizeof *state);
135 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
136
137 if (zone) {
138 state->filter_zone = true;
139 state->zone = *zone;
140 }
141
142 nl_msg_put_nfgenmsg(&state->buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
143 IPCTNL_MSG_CT_GET, NLM_F_REQUEST);
144 nl_dump_start(&state->dump, NETLINK_NETFILTER, &state->buf);
145 ofpbuf_clear(&state->buf);
146
147 /* Buckets to store connections are not used. */
148 *ptot_bkts = -1;
149
150 return 0;
151 }
152
153 /* Receive the next 'entry' from the conntrack netlink dump with 'state'.
154 * Returns 'EOF' when no more entries are available, 0 otherwise. 'entry' may
155 * be uninitilized memory on entry, and must be uninitialized with
156 * ct_dpif_entry_uninit() afterwards by the caller. In case the same 'entry' is
157 * passed to this function again, the entry must also be uninitialized before
158 * the next call. */
159 int
160 nl_ct_dump_next(struct nl_ct_dump_state *state, struct ct_dpif_entry *entry)
161 {
162 struct ofpbuf buf;
163
164 memset(entry, 0, sizeof *entry);
165 for (;;) {
166 struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)];
167 enum nl_ct_event_type type;
168 uint8_t nfgen_family;
169
170 if (!nl_dump_next(&state->dump, &buf, &state->buf)) {
171 return EOF;
172 }
173
174 if (!nl_ct_parse_header_policy(&buf, &type, &nfgen_family, attrs)) {
175 continue;
176 };
177
178 if (state->filter_zone) {
179 uint16_t entry_zone = attrs[CTA_ZONE]
180 ? ntohs(nl_attr_get_be16(attrs[CTA_ZONE]))
181 : 0;
182 if (entry_zone != state->zone) {
183 continue;
184 }
185 }
186
187 if (nl_ct_attrs_to_ct_dpif_entry(entry, attrs, nfgen_family)) {
188 break;
189 }
190
191 ct_dpif_entry_uninit(entry);
192 memset(entry, 0, sizeof *entry);
193 /* Ignore the failed entry and get the next one. */
194 }
195
196 ofpbuf_uninit(&buf);
197 return 0;
198 }
199
200 /* End a conntrack netlink dump. */
201 int
202 nl_ct_dump_done(struct nl_ct_dump_state *state)
203 {
204 int error = nl_dump_done(&state->dump);
205
206 ofpbuf_uninit(&state->buf);
207 free(state);
208 return error;
209 }
210
211 /* Format conntrack event 'entry' of 'type' to 'ds'. */
212 void
213 nl_ct_format_event_entry(const struct ct_dpif_entry *entry,
214 enum nl_ct_event_type type, struct ds *ds,
215 bool verbose, bool print_stats)
216 {
217 ds_put_format(ds, "%s ",
218 type == NL_CT_EVENT_NEW ? "NEW"
219 : type == NL_CT_EVENT_UPDATE ? "UPDATE"
220 : type == NL_CT_EVENT_DELETE ? "DELETE"
221 : "UNKNOWN");
222 ct_dpif_format_entry(entry, ds, verbose, print_stats);
223 }
224
225 int
226 nl_ct_flush(void)
227 {
228 struct ofpbuf buf;
229 int err;
230
231 ofpbuf_init(&buf, NL_DUMP_BUFSIZE);
232
233 nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
234 IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
235
236 err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
237 ofpbuf_uninit(&buf);
238
239 /* Expectations are flushed automatically, because they do not
240 * have a master connection anymore */
241
242 return err;
243 }
244
245 int
246 nl_ct_flush_tuple(const struct ct_dpif_tuple *tuple, uint16_t zone)
247 {
248 int err;
249 struct ofpbuf buf;
250
251 ofpbuf_init(&buf, NL_DUMP_BUFSIZE);
252 nl_msg_put_nfgenmsg(&buf, 0, tuple->l3_type, NFNL_SUBSYS_CTNETLINK,
253 IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
254
255 nl_msg_put_be16(&buf, CTA_ZONE, htons(zone));
256 if (!nl_ct_put_ct_tuple(&buf, tuple, CTA_TUPLE_ORIG)) {
257 err = EOPNOTSUPP;
258 goto out;
259 }
260 err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
261 out:
262 ofpbuf_uninit(&buf);
263 return err;
264 }
265
266 #ifdef _WIN32
267 int
268 nl_ct_flush_zone(uint16_t flush_zone)
269 {
270 /* Windows can flush a specific zone */
271 struct ofpbuf buf;
272 int err;
273
274 ofpbuf_init(&buf, NL_DUMP_BUFSIZE);
275
276 nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
277 IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
278 nl_msg_put_be16(&buf, CTA_ZONE, htons(flush_zone));
279
280 err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
281 ofpbuf_uninit(&buf);
282
283 return err;
284 }
285 #else
286 int
287 nl_ct_flush_zone(uint16_t flush_zone)
288 {
289 /* Apparently, there's no netlink interface to flush a specific zone.
290 * This code dumps every connection, checks the zone and eventually
291 * delete the entry.
292 *
293 * This is race-prone, but it is better than using shell scripts. */
294
295 struct nl_dump dump;
296 struct ofpbuf buf, reply, delete;
297
298 ofpbuf_init(&buf, NL_DUMP_BUFSIZE);
299 ofpbuf_init(&delete, NL_DUMP_BUFSIZE);
300
301 nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
302 IPCTNL_MSG_CT_GET, NLM_F_REQUEST);
303 nl_dump_start(&dump, NETLINK_NETFILTER, &buf);
304 ofpbuf_clear(&buf);
305
306 for (;;) {
307 struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)];
308 enum nl_ct_event_type event_type;
309 uint8_t nfgen_family;
310 uint16_t zone = 0;
311
312 if (!nl_dump_next(&dump, &reply, &buf)) {
313 break;
314 }
315
316 if (!nl_ct_parse_header_policy(&reply, &event_type, &nfgen_family,
317 attrs)) {
318 continue;
319 };
320
321 if (attrs[CTA_ZONE]) {
322 zone = ntohs(nl_attr_get_be16(attrs[CTA_ZONE]));
323 }
324
325 if (zone != flush_zone) {
326 /* The entry is not in the zone we're flushing. */
327 continue;
328 }
329 nl_msg_put_nfgenmsg(&delete, 0, nfgen_family, NFNL_SUBSYS_CTNETLINK,
330 IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
331
332 nl_msg_put_be16(&delete, CTA_ZONE, htons(zone));
333 nl_msg_put_unspec(&delete, CTA_TUPLE_ORIG, attrs[CTA_TUPLE_ORIG] + 1,
334 attrs[CTA_TUPLE_ORIG]->nla_len - NLA_HDRLEN);
335 nl_msg_put_unspec(&delete, CTA_ID, attrs[CTA_ID] + 1,
336 attrs[CTA_ID]->nla_len - NLA_HDRLEN);
337 nl_transact(NETLINK_NETFILTER, &delete, NULL);
338 ofpbuf_clear(&delete);
339 }
340
341 nl_dump_done(&dump);
342
343 ofpbuf_uninit(&delete);
344 ofpbuf_uninit(&buf);
345
346 /* Expectations are flushed automatically, because they do not
347 * have a master connection anymore */
348 return 0;
349 }
350 #endif
351
352 /* Conntrack netlink parsing. */
353
354 static bool
355 nl_ct_parse_counters(struct nlattr *nla, struct ct_dpif_counters *counters)
356 {
357 static const struct nl_policy policy[] = {
358 [CTA_COUNTERS_PACKETS] = { .type = NL_A_BE64, .optional = false },
359 [CTA_COUNTERS_BYTES] = { .type = NL_A_BE64, .optional = false },
360 };
361 struct nlattr *attrs[ARRAY_SIZE(policy)];
362 bool parsed;
363
364 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
365
366 if (parsed) {
367 counters->packets
368 = ntohll(nl_attr_get_be64(attrs[CTA_COUNTERS_PACKETS]));
369 counters->bytes = ntohll(nl_attr_get_be64(attrs[CTA_COUNTERS_BYTES]));
370 } else {
371 VLOG_ERR_RL(&rl, "Could not parse nested counters. "
372 "Possibly incompatible Linux kernel version.");
373 }
374
375 return parsed;
376 }
377
378 static bool
379 nl_ct_parse_timestamp(struct nlattr *nla, struct ct_dpif_timestamp *timestamp)
380 {
381 static const struct nl_policy policy[] = {
382 [CTA_TIMESTAMP_START] = { .type = NL_A_BE64, .optional = false },
383 [CTA_TIMESTAMP_STOP] = { .type = NL_A_BE64, .optional = true },
384 };
385 struct nlattr *attrs[ARRAY_SIZE(policy)];
386 bool parsed;
387
388 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
389
390 if (parsed) {
391 timestamp->start
392 = ntohll(nl_attr_get_be64(attrs[CTA_TIMESTAMP_START]));
393 if (attrs[CTA_TIMESTAMP_STOP]) {
394 timestamp->stop
395 = ntohll(nl_attr_get_be64(attrs[CTA_TIMESTAMP_STOP]));
396 }
397 } else {
398 VLOG_ERR_RL(&rl, "Could not parse nested timestamp. "
399 "Possibly incompatible Linux kernel version.");
400 }
401
402 return parsed;
403 }
404
405 static bool
406 nl_ct_parse_tuple_ip(struct nlattr *nla, struct ct_dpif_tuple *tuple)
407 {
408 static const struct nl_policy policy[] = {
409 [CTA_IP_V4_SRC] = { .type = NL_A_BE32, .optional = true },
410 [CTA_IP_V4_DST] = { .type = NL_A_BE32, .optional = true },
411 [CTA_IP_V6_SRC] = { NL_POLICY_FOR(struct in6_addr), .optional = true },
412 [CTA_IP_V6_DST] = { NL_POLICY_FOR(struct in6_addr), .optional = true },
413 };
414 struct nlattr *attrs[ARRAY_SIZE(policy)];
415 bool parsed;
416
417 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
418
419 if (parsed) {
420 if (tuple->l3_type == AF_INET) {
421 if (attrs[CTA_IP_V4_SRC]) {
422 tuple->src.ip = nl_attr_get_be32(attrs[CTA_IP_V4_SRC]);
423 }
424 if (attrs[CTA_IP_V4_DST]) {
425 tuple->dst.ip = nl_attr_get_be32(attrs[CTA_IP_V4_DST]);
426 }
427 } else if (tuple->l3_type == AF_INET6) {
428 if (attrs[CTA_IP_V6_SRC]) {
429 memcpy(&tuple->src.in6, nl_attr_get(attrs[CTA_IP_V6_SRC]),
430 sizeof tuple->src.in6);
431 }
432 if (attrs[CTA_IP_V6_DST]) {
433 memcpy(&tuple->dst.in6, nl_attr_get(attrs[CTA_IP_V6_DST]),
434 sizeof tuple->dst.in6);
435 }
436 } else {
437 VLOG_WARN_RL(&rl, "Unsupported IP protocol: %u.", tuple->l3_type);
438 return false;
439 }
440 } else {
441 VLOG_ERR_RL(&rl, "Could not parse nested tuple IP options. "
442 "Possibly incompatible Linux kernel version.");
443 }
444
445 return parsed;
446 }
447
448 static bool
449 nl_ct_parse_tuple_proto(struct nlattr *nla, struct ct_dpif_tuple *tuple)
450 {
451 static const struct nl_policy policy[] = {
452 [CTA_PROTO_NUM] = { .type = NL_A_U8, .optional = false },
453 [CTA_PROTO_SRC_PORT] = { .type = NL_A_BE16, .optional = true },
454 [CTA_PROTO_DST_PORT] = { .type = NL_A_BE16, .optional = true },
455 [CTA_PROTO_ICMP_ID] = { .type = NL_A_BE16, .optional = true },
456 [CTA_PROTO_ICMP_TYPE] = { .type = NL_A_U8, .optional = true },
457 [CTA_PROTO_ICMP_CODE] = { .type = NL_A_U8, .optional = true },
458 [CTA_PROTO_ICMPV6_ID] = { .type = NL_A_BE16, .optional = true },
459 [CTA_PROTO_ICMPV6_TYPE] = { .type = NL_A_U8, .optional = true },
460 [CTA_PROTO_ICMPV6_CODE] = { .type = NL_A_U8, .optional = true },
461 };
462 struct nlattr *attrs[ARRAY_SIZE(policy)];
463 bool parsed;
464
465 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
466
467 if (parsed) {
468 tuple->ip_proto = nl_attr_get_u8(attrs[CTA_PROTO_NUM]);
469
470 if (tuple->l3_type == AF_INET && tuple->ip_proto == IPPROTO_ICMP) {
471 if (!attrs[CTA_PROTO_ICMP_ID] || !attrs[CTA_PROTO_ICMP_TYPE]
472 || !attrs[CTA_PROTO_ICMP_CODE]) {
473 VLOG_ERR_RL(&rl, "Tuple ICMP data missing.");
474 return false;
475 }
476 tuple->icmp_id = nl_attr_get_be16(attrs[CTA_PROTO_ICMP_ID]);
477 tuple->icmp_type = nl_attr_get_u8(attrs[CTA_PROTO_ICMP_TYPE]);
478 tuple->icmp_code = nl_attr_get_u8(attrs[CTA_PROTO_ICMP_CODE]);
479 } else if (tuple->l3_type == AF_INET6 &&
480 tuple->ip_proto == IPPROTO_ICMPV6) {
481 if (!attrs[CTA_PROTO_ICMPV6_ID] || !attrs[CTA_PROTO_ICMPV6_TYPE]
482 || !attrs[CTA_PROTO_ICMPV6_CODE]) {
483 VLOG_ERR_RL(&rl, "Tuple ICMPv6 data missing.");
484 return false;
485 }
486 tuple->icmp_id = nl_attr_get_be16(attrs[CTA_PROTO_ICMPV6_ID]);
487 tuple->icmp_type = nl_attr_get_u8(attrs[CTA_PROTO_ICMPV6_TYPE]);
488 tuple->icmp_code = nl_attr_get_u8(attrs[CTA_PROTO_ICMPV6_CODE]);
489 } else if (attrs[CTA_PROTO_SRC_PORT] && attrs[CTA_PROTO_DST_PORT]) {
490 tuple->src_port = nl_attr_get_be16(attrs[CTA_PROTO_SRC_PORT]);
491 tuple->dst_port = nl_attr_get_be16(attrs[CTA_PROTO_DST_PORT]);
492 } else {
493 /* Unsupported IPPROTO and no ports, leave them zeroed.
494 * We have parsed the ip_proto, so this is not a failure. */
495 VLOG_DBG_RL(&rl, "Unsupported L4 protocol: %u.", tuple->ip_proto);
496 }
497 } else {
498 VLOG_ERR_RL(&rl, "Could not parse nested tuple protocol options. "
499 "Possibly incompatible Linux kernel version.");
500 }
501
502 return parsed;
503 }
504
505 static bool
506 nl_ct_parse_tuple(struct nlattr *nla, struct ct_dpif_tuple *tuple,
507 uint16_t l3_type)
508 {
509 static const struct nl_policy policy[] = {
510 [CTA_TUPLE_IP] = { .type = NL_A_NESTED, .optional = false },
511 [CTA_TUPLE_PROTO] = { .type = NL_A_NESTED, .optional = false },
512 };
513 struct nlattr *attrs[ARRAY_SIZE(policy)];
514 bool parsed;
515
516 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
517
518 memset(tuple, 0, sizeof *tuple);
519
520 if (parsed) {
521 tuple->l3_type = l3_type;
522
523 if (!nl_ct_parse_tuple_ip(attrs[CTA_TUPLE_IP], tuple)
524 || !nl_ct_parse_tuple_proto(attrs[CTA_TUPLE_PROTO], tuple)) {
525 struct ds ds;
526
527 ds_init(&ds);
528 ct_dpif_format_tuple(&ds, tuple);
529
530 VLOG_ERR_RL(&rl, "Failed to parse tuple: %s", ds_cstr(&ds));
531 ds_destroy(&ds);
532
533 memset(tuple, 0, sizeof *tuple);
534 return false;
535 }
536 } else {
537 VLOG_ERR_RL(&rl, "Could not parse nested tuple options. "
538 "Possibly incompatible Linux kernel version.");
539 }
540
541 return parsed;
542 }
543
544 static bool
545 nl_ct_put_tuple_ip(struct ofpbuf *buf, const struct ct_dpif_tuple *tuple)
546 {
547 size_t offset = nl_msg_start_nested(buf, CTA_TUPLE_IP);
548
549 if (tuple->l3_type == AF_INET) {
550 nl_msg_put_be32(buf, CTA_IP_V4_SRC, tuple->src.ip);
551 nl_msg_put_be32(buf, CTA_IP_V4_DST, tuple->dst.ip);
552 } else if (tuple->l3_type == AF_INET6) {
553 nl_msg_put_in6_addr(buf, CTA_IP_V6_SRC, &tuple->src.in6);
554 nl_msg_put_in6_addr(buf, CTA_IP_V6_DST, &tuple->dst.in6);
555 } else {
556 VLOG_WARN_RL(&rl, "Unsupported IP protocol: %"PRIu16".",
557 tuple->l3_type);
558 return false;
559 }
560
561 nl_msg_end_nested(buf, offset);
562 return true;
563 }
564
565 static bool
566 nl_ct_put_tuple_proto(struct ofpbuf *buf, const struct ct_dpif_tuple *tuple)
567 {
568 size_t offset = nl_msg_start_nested(buf, CTA_TUPLE_PROTO);
569
570 nl_msg_put_u8(buf, CTA_PROTO_NUM, tuple->ip_proto);
571
572 if (tuple->l3_type == AF_INET && tuple->ip_proto == IPPROTO_ICMP) {
573 nl_msg_put_be16(buf, CTA_PROTO_ICMP_ID, tuple->icmp_id);
574 nl_msg_put_u8(buf, CTA_PROTO_ICMP_TYPE, tuple->icmp_type);
575 nl_msg_put_u8(buf, CTA_PROTO_ICMP_CODE, tuple->icmp_code);
576 } else if (tuple->l3_type == AF_INET6 &&
577 tuple->ip_proto == IPPROTO_ICMPV6) {
578 nl_msg_put_be16(buf, CTA_PROTO_ICMPV6_ID, tuple->icmp_id);
579 nl_msg_put_u8(buf, CTA_PROTO_ICMPV6_TYPE, tuple->icmp_type);
580 nl_msg_put_u8(buf, CTA_PROTO_ICMPV6_CODE, tuple->icmp_code);
581 } else if (tuple->ip_proto == IPPROTO_TCP ||
582 tuple->ip_proto == IPPROTO_UDP) {
583 nl_msg_put_be16(buf, CTA_PROTO_SRC_PORT, tuple->src_port);
584 nl_msg_put_be16(buf, CTA_PROTO_DST_PORT, tuple->dst_port);
585 } else {
586 VLOG_WARN_RL(&rl, "Unsupported L4 protocol: %"PRIu8".",
587 tuple->ip_proto);
588 return false;
589 }
590
591 nl_msg_end_nested(buf, offset);
592 return true;
593 }
594
595 static bool
596 nl_ct_put_ct_tuple(struct ofpbuf *buf, const struct ct_dpif_tuple *tuple,
597 enum ctattr_type type)
598 {
599 if (type != CTA_TUPLE_ORIG && type != CTA_TUPLE_REPLY &&
600 type != CTA_TUPLE_MASTER) {
601 return false;
602 }
603
604 size_t offset = nl_msg_start_nested(buf, type);
605
606 if (!nl_ct_put_tuple_ip(buf, tuple)) {
607 return false;
608 }
609 if (!nl_ct_put_tuple_proto(buf, tuple)) {
610 return false;
611 }
612
613 nl_msg_end_nested(buf, offset);
614 return true;
615 }
616
617 /* Translate netlink TCP state to CT_DPIF_TCP state. */
618 static uint8_t
619 nl_ct_tcp_state_to_dpif(uint8_t state)
620 {
621 #ifdef _WIN32
622 /* Windows currently sends up CT_DPIF_TCP state */
623 return state;
624 #else
625 switch (state) {
626 case TCP_CONNTRACK_NONE:
627 return CT_DPIF_TCPS_CLOSED;
628 case TCP_CONNTRACK_SYN_SENT:
629 return CT_DPIF_TCPS_SYN_SENT;
630 case TCP_CONNTRACK_SYN_SENT2:
631 return CT_DPIF_TCPS_SYN_SENT;
632 case TCP_CONNTRACK_SYN_RECV:
633 return CT_DPIF_TCPS_SYN_RECV;
634 case TCP_CONNTRACK_ESTABLISHED:
635 return CT_DPIF_TCPS_ESTABLISHED;
636 case TCP_CONNTRACK_FIN_WAIT:
637 return CT_DPIF_TCPS_FIN_WAIT_1;
638 case TCP_CONNTRACK_CLOSE_WAIT:
639 return CT_DPIF_TCPS_CLOSE_WAIT;
640 case TCP_CONNTRACK_LAST_ACK:
641 return CT_DPIF_TCPS_LAST_ACK;
642 case TCP_CONNTRACK_TIME_WAIT:
643 return CT_DPIF_TCPS_TIME_WAIT;
644 case TCP_CONNTRACK_CLOSE:
645 return CT_DPIF_TCPS_CLOSING;
646 default:
647 return CT_DPIF_TCPS_CLOSED;
648 }
649 #endif
650 }
651
652 static uint8_t
653 ip_ct_tcp_flags_to_dpif(uint8_t flags)
654 {
655 #ifdef _WIN32
656 /* Windows currently sends up CT_DPIF_TCP flags */
657 return flags;
658 #else
659 uint8_t ret = 0;
660 #define CT_DPIF_TCP_FLAG(FLAG) \
661 ret |= (flags & IP_CT_TCP_FLAG_##FLAG) ? CT_DPIF_TCPF_##FLAG : 0;
662 CT_DPIF_TCP_FLAGS
663 #undef CT_DPIF_STATUS_FLAG
664 return ret;
665 #endif
666 }
667
668 static bool
669 nl_ct_parse_protoinfo_tcp(struct nlattr *nla,
670 struct ct_dpif_protoinfo *protoinfo)
671 {
672 static const struct nl_policy policy[] = {
673 [CTA_PROTOINFO_TCP_STATE] = { .type = NL_A_U8, .optional = false },
674 [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NL_A_U8,
675 .optional = false },
676 [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NL_A_U8,
677 .optional = false },
678 [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .type = NL_A_U16,
679 .optional = false },
680 [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .type = NL_A_U16,
681 .optional = false },
682 };
683 struct nlattr *attrs[ARRAY_SIZE(policy)];
684 bool parsed;
685
686 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
687
688 if (parsed) {
689 const struct nf_ct_tcp_flags *flags_orig, *flags_reply;
690 uint8_t state;
691 protoinfo->proto = IPPROTO_TCP;
692 state = nl_ct_tcp_state_to_dpif(
693 nl_attr_get_u8(attrs[CTA_PROTOINFO_TCP_STATE]));
694 /* The connection tracker keeps only one tcp state for the
695 * connection, but our structures store a separate state for
696 * each endpoint. Here we duplicate the state. */
697 protoinfo->tcp.state_orig = protoinfo->tcp.state_reply = state;
698 protoinfo->tcp.wscale_orig = nl_attr_get_u8(
699 attrs[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
700 protoinfo->tcp.wscale_reply = nl_attr_get_u8(
701 attrs[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
702 flags_orig =
703 nl_attr_get_unspec(attrs[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL],
704 sizeof *flags_orig);
705 protoinfo->tcp.flags_orig =
706 ip_ct_tcp_flags_to_dpif(flags_orig->flags);
707 flags_reply =
708 nl_attr_get_unspec(attrs[CTA_PROTOINFO_TCP_FLAGS_REPLY],
709 sizeof *flags_reply);
710 protoinfo->tcp.flags_reply =
711 ip_ct_tcp_flags_to_dpif(flags_reply->flags);
712 } else {
713 VLOG_ERR_RL(&rl, "Could not parse nested TCP protoinfo options. "
714 "Possibly incompatible Linux kernel version.");
715 }
716
717 return parsed;
718 }
719
720 static bool
721 nl_ct_parse_protoinfo(struct nlattr *nla, struct ct_dpif_protoinfo *protoinfo)
722 {
723 /* These are mutually exclusive. */
724 static const struct nl_policy policy[] = {
725 [CTA_PROTOINFO_TCP] = { .type = NL_A_NESTED, .optional = true },
726 [CTA_PROTOINFO_SCTP] = { .type = NL_A_NESTED, .optional = true },
727 };
728 struct nlattr *attrs[ARRAY_SIZE(policy)];
729 bool parsed;
730
731 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
732
733 memset(protoinfo, 0, sizeof *protoinfo);
734
735 if (parsed) {
736 if (attrs[CTA_PROTOINFO_TCP]) {
737 parsed = nl_ct_parse_protoinfo_tcp(attrs[CTA_PROTOINFO_TCP],
738 protoinfo);
739 } else if (attrs[CTA_PROTOINFO_SCTP]) {
740 VLOG_WARN_RL(&rl, "SCTP protoinfo not yet supported!");
741 } else {
742 VLOG_WARN_RL(&rl, "Empty protoinfo!");
743 }
744 } else {
745 VLOG_ERR_RL(&rl, "Could not parse nested protoinfo options. "
746 "Possibly incompatible Linux kernel version.");
747 }
748
749 return parsed;
750 }
751
752 static bool
753 nl_ct_parse_helper(struct nlattr *nla, struct ct_dpif_helper *helper)
754 {
755 static const struct nl_policy policy[] = {
756 [CTA_HELP_NAME] = { .type = NL_A_STRING, .optional = false },
757 };
758 struct nlattr *attrs[ARRAY_SIZE(policy)];
759 bool parsed;
760
761 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
762
763 memset(helper, 0, sizeof *helper);
764
765 if (parsed) {
766 helper->name = xstrdup(nl_attr_get_string(attrs[CTA_HELP_NAME]));
767 } else {
768 VLOG_ERR_RL(&rl, "Could not parse nested helper options. "
769 "Possibly incompatible Linux kernel version.");
770 }
771
772 return parsed;
773 }
774
775 /* Translate netlink entry status flags to CT_DPIF_TCP status flags. */
776 static uint32_t
777 ips_status_to_dpif_flags(uint32_t status)
778 {
779 uint32_t ret = 0;
780 #define CT_DPIF_STATUS_FLAG(FLAG) \
781 ret |= (status & IPS_##FLAG) ? CT_DPIF_STATUS_##FLAG : 0;
782 CT_DPIF_STATUS_FLAGS
783 #undef CT_DPIF_STATUS_FLAG
784 return ret;
785 }
786
787 static bool
788 nl_ct_parse_header_policy(struct ofpbuf *buf,
789 enum nl_ct_event_type *event_type,
790 uint8_t *nfgen_family,
791 struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)])
792 {
793 struct nlmsghdr *nlh;
794 struct nfgenmsg *nfm;
795 uint8_t type;
796
797 nlh = ofpbuf_at(buf, 0, NLMSG_HDRLEN);
798 nfm = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *nfm);
799 if (!nfm) {
800 VLOG_ERR_RL(&rl, "Received bad nfnl message (no nfgenmsg).");
801 return false;
802 }
803 if (NFNL_SUBSYS_ID(nlh->nlmsg_type) != NFNL_SUBSYS_CTNETLINK) {
804 VLOG_ERR_RL(&rl, "Received non-conntrack message (subsystem: %u).",
805 NFNL_SUBSYS_ID(nlh->nlmsg_type));
806 return false;
807 }
808 if (nfm->version != NFNETLINK_V0) {
809 VLOG_ERR_RL(&rl, "Received unsupported nfnetlink version (%u).",
810 NFNL_MSG_TYPE(nfm->version));
811 return false;
812 }
813
814 if (!nl_policy_parse(buf, NLMSG_HDRLEN + sizeof *nfm,
815 nfnlgrp_conntrack_policy, attrs,
816 ARRAY_SIZE(nfnlgrp_conntrack_policy))) {
817 VLOG_ERR_RL(&rl, "Received bad nfnl message (policy).");
818 return false;
819 }
820
821 type = NFNL_MSG_TYPE(nlh->nlmsg_type);
822 *nfgen_family = nfm->nfgen_family;
823
824 switch (type) {
825 case IPCTNL_MSG_CT_NEW:
826 *event_type = nlh->nlmsg_flags & NLM_F_CREATE
827 ? NL_CT_EVENT_NEW : NL_CT_EVENT_UPDATE;
828 break;
829 case IPCTNL_MSG_CT_DELETE:
830 *event_type = NL_CT_EVENT_DELETE;
831 break;
832 default:
833 VLOG_ERR_RL(&rl, "Can't parse conntrack event type.");
834 return false;
835 }
836
837 return true;
838 }
839
840 static bool
841 nl_ct_attrs_to_ct_dpif_entry(struct ct_dpif_entry *entry,
842 struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)],
843 uint8_t nfgen_family)
844 {
845 if (!nl_ct_parse_tuple(attrs[CTA_TUPLE_ORIG], &entry->tuple_orig,
846 nfgen_family)) {
847 return false;
848 }
849 if (!nl_ct_parse_tuple(attrs[CTA_TUPLE_REPLY], &entry->tuple_reply,
850 nfgen_family)) {
851 return false;
852 }
853 if (attrs[CTA_COUNTERS_ORIG] &&
854 !nl_ct_parse_counters(attrs[CTA_COUNTERS_ORIG],
855 &entry->counters_orig)) {
856 return false;
857 }
858 if (attrs[CTA_COUNTERS_REPLY] &&
859 !nl_ct_parse_counters(attrs[CTA_COUNTERS_REPLY],
860 &entry->counters_reply)) {
861 return false;
862 }
863 if (attrs[CTA_TIMESTAMP] &&
864 !nl_ct_parse_timestamp(attrs[CTA_TIMESTAMP], &entry->timestamp)) {
865 return false;
866 }
867 if (attrs[CTA_ID]) {
868 entry->id = ntohl(nl_attr_get_be32(attrs[CTA_ID]));
869 }
870 if (attrs[CTA_ZONE]) {
871 entry->zone = ntohs(nl_attr_get_be16(attrs[CTA_ZONE]));
872 }
873 if (attrs[CTA_STATUS]) {
874 entry->status = ips_status_to_dpif_flags(
875 ntohl(nl_attr_get_be32(attrs[CTA_STATUS])));
876 }
877 if (attrs[CTA_TIMEOUT]) {
878 entry->timeout = ntohl(nl_attr_get_be32(attrs[CTA_TIMEOUT]));
879 }
880 if (attrs[CTA_MARK]) {
881 entry->mark = ntohl(nl_attr_get_be32(attrs[CTA_MARK]));
882 }
883 if (attrs[CTA_LABELS]) {
884 entry->have_labels = true;
885 memcpy(&entry->labels, nl_attr_get(attrs[CTA_LABELS]),
886 MIN(sizeof entry->labels, nl_attr_get_size(attrs[CTA_LABELS])));
887 }
888 if (attrs[CTA_PROTOINFO] &&
889 !nl_ct_parse_protoinfo(attrs[CTA_PROTOINFO], &entry->protoinfo)) {
890 return false;
891 }
892 if (attrs[CTA_HELP] &&
893 !nl_ct_parse_helper(attrs[CTA_HELP], &entry->helper)) {
894 return false;
895 }
896 if (attrs[CTA_TUPLE_MASTER] &&
897 !nl_ct_parse_tuple(attrs[CTA_TUPLE_MASTER], &entry->tuple_master,
898 nfgen_family)) {
899 return false;
900 }
901 return true;
902 }
903
904 bool
905 nl_ct_parse_entry(struct ofpbuf *buf, struct ct_dpif_entry *entry,
906 enum nl_ct_event_type *event_type)
907 {
908 struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)];
909 uint8_t nfgen_family;
910
911 memset(entry, 0, sizeof *entry);
912 if (!nl_ct_parse_header_policy(buf, event_type, &nfgen_family, attrs)) {
913 return false;
914 };
915
916 if (!nl_ct_attrs_to_ct_dpif_entry(entry, attrs, nfgen_family)) {
917 ct_dpif_entry_uninit(entry);
918 memset(entry, 0, sizeof *entry);
919 return false;
920 }
921
922 return true;
923 }
924
925 /* NetFilter utility functions. */
926
927 /* Puts a nlmsghdr and nfgenmsg at the beginning of 'msg', which must be
928 * initially empty. 'expected_payload' should be an estimate of the number of
929 * payload bytes to be supplied; if the size of the payload is unknown a value
930 * of 0 is acceptable.
931 *
932 * Non-zero 'family' is the address family of items to get (e.g. AF_INET).
933 *
934 * 'flags' is a bit-mask that indicates what kind of request is being made. It
935 * is often NLM_F_REQUEST indicating that a request is being made, commonly
936 * or'd with NLM_F_ACK to request an acknowledgement. NLM_F_DUMP flag reguests
937 * a dump of the table.
938 *
939 * 'subsystem' is a netfilter subsystem id, e.g., NFNL_SUBSYS_CTNETLINK.
940 *
941 * 'cmd' is an enumerated value specific to the 'subsystem'.
942 *
943 * Sets the new nlmsghdr's nlmsg_pid field to 0 for now. nl_sock_send() will
944 * fill it in just before sending the message.
945 *
946 * nl_msg_put_nlmsghdr() should be used to compose Netlink messages that are
947 * not NetFilter Netlink messages. */
948 static void
949 nl_msg_put_nfgenmsg(struct ofpbuf *msg, size_t expected_payload,
950 int family, uint8_t subsystem, uint8_t cmd,
951 uint32_t flags)
952 {
953 struct nfgenmsg *nfm;
954
955 nl_msg_put_nlmsghdr(msg, sizeof *nfm + expected_payload,
956 subsystem << 8 | cmd, flags);
957 ovs_assert(msg->size == NLMSG_HDRLEN);
958 nfm = nl_msg_put_uninit(msg, sizeof *nfm);
959 nfm->nfgen_family = family;
960 nfm->version = NFNETLINK_V0;
961 nfm->res_id = 0;
962 #ifdef _WIN32
963 /* nfgenmsg contains ovsHdr padding in windows */
964 nfm->ovsHdr.dp_ifindex = 0;
965 #endif
966 }