]> git.proxmox.com Git - mirror_ovs.git/blame - lib/netlink-conntrack.c
Windows: Add conntrack dump and flush support in userspace
[mirror_ovs.git] / lib / netlink-conntrack.c
CommitLineData
6830a0c0
DDP
1/*
2 * Copyright (c) 2015 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
18
19#include "netlink-conntrack.h"
20
21#include <linux/netfilter/nfnetlink.h>
22#include <linux/netfilter/nfnetlink_conntrack.h>
23#include <linux/netfilter/nf_conntrack_common.h>
24#include <linux/netfilter/nf_conntrack_tcp.h>
25#include <linux/netfilter/nf_conntrack_ftp.h>
26#include <linux/netfilter/nf_conntrack_sctp.h>
27
28#include "byte-order.h"
29#include "compiler.h"
3e8a2ad1 30#include "openvswitch/dynamic-string.h"
6830a0c0
DDP
31#include "netlink.h"
32#include "netlink-socket.h"
64c96779 33#include "openvswitch/ofpbuf.h"
6830a0c0
DDP
34#include "openvswitch/vlog.h"
35#include "poll-loop.h"
36#include "timeval.h"
37#include "unixctl.h"
38#include "util.h"
39
40VLOG_DEFINE_THIS_MODULE(netlink_conntrack);
41static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
42
43/* This module works only if conntrack modules and features are enabled in the
44 * Linux kernel. This can be done from a root shell like this:
45 *
46 * $ modprobe ip_conntrack
47 * $ sysctl -w net.netfilter.nf_conntrack_acct=1
48 * $ sysctl -w net.netfilter.nf_conntrack_timestamp=1
49 *
50 * Also, if testing conntrack label feature without conntrack-aware OVS kernel
51 * module, there must be a connlabel rule in iptables for space to be reserved
52 * for the labels (see kernel source connlabel_mt_check()). Such a rule can be
53 * inserted from a root shell like this:
54 *
55 * $ iptables -A INPUT -m conntrack -m connlabel \
56 * --ctstate NEW,ESTABLISHED,RELATED --label 127 -j ACCEPT
57 */
58
/* Compatibility definitions for attributes and status bits that were added
 * in later kernels.  With these in place userspace should build against
 * Linux 2.6.32+. */

/* Top-level conntrack attributes, numbered relative to CTA_SECMARK. */
#define CTA_ZONE        (CTA_SECMARK + 1)
#define CTA_SECCTX      (CTA_SECMARK + 2)
#define CTA_TIMESTAMP   (CTA_SECMARK + 3)
#define CTA_MARK_MASK   (CTA_SECMARK + 4)
#define CTA_LABELS      (CTA_SECMARK + 5)
#define CTA_LABELS_MASK (CTA_SECMARK + 6)

/* Attributes nested inside CTA_TIMESTAMP. */
#define CTA_TIMESTAMP_START 1
#define CTA_TIMESTAMP_STOP 2

/* Connection status bits. */
#define IPS_TEMPLATE_BIT 11
#define IPS_TEMPLATE (1 << IPS_TEMPLATE_BIT)

#define IPS_UNTRACKED_BIT 12
#define IPS_UNTRACKED (1 << IPS_UNTRACKED_BIT)
77
e0467f6d
SV
#ifdef _WIN32
/* On Windows the same socket is reused for nfgenmsg and genlmsghdr, so
 * NETLINK_NETFILTER is remapped to NETLINK_GENERIC.  #undef is a no-op when
 * the macro is not defined, so no guard is needed around it. */
#undef NETLINK_NETFILTER
#define NETLINK_NETFILTER NETLINK_GENERIC
#endif
85
6830a0c0
DDP
86static const struct nl_policy nfnlgrp_conntrack_policy[] = {
87 [CTA_TUPLE_ORIG] = { .type = NL_A_NESTED, .optional = false },
88 [CTA_TUPLE_REPLY] = { .type = NL_A_NESTED, .optional = false },
89 [CTA_ZONE] = { .type = NL_A_BE16, .optional = true },
90 [CTA_STATUS] = { .type = NL_A_BE32, .optional = false },
91 [CTA_TIMESTAMP] = { .type = NL_A_NESTED, .optional = true },
92 [CTA_TIMEOUT] = { .type = NL_A_BE32, .optional = true },
93 [CTA_COUNTERS_ORIG] = { .type = NL_A_NESTED, .optional = true },
94 [CTA_COUNTERS_REPLY] = { .type = NL_A_NESTED, .optional = true },
95 [CTA_PROTOINFO] = { .type = NL_A_NESTED, .optional = true },
96 [CTA_HELP] = { .type = NL_A_NESTED, .optional = true },
97 [CTA_MARK] = { .type = NL_A_BE32, .optional = true },
98 [CTA_SECCTX] = { .type = NL_A_NESTED, .optional = true },
99 [CTA_ID] = { .type = NL_A_BE32, .optional = false },
100 [CTA_USE] = { .type = NL_A_BE32, .optional = true },
101 [CTA_TUPLE_MASTER] = { .type = NL_A_NESTED, .optional = true },
102 [CTA_NAT_SEQ_ADJ_ORIG] = { .type = NL_A_NESTED, .optional = true },
103 [CTA_NAT_SEQ_ADJ_REPLY] = { .type = NL_A_NESTED, .optional = true },
104 [CTA_LABELS] = { .type = NL_A_UNSPEC, .optional = true },
105 /* CTA_NAT_SRC, CTA_NAT_DST, CTA_TIMESTAMP, CTA_MARK_MASK, and
106 * CTA_LABELS_MASK are not received from kernel. */
107};
108
109/* Declarations for conntrack netlink dumping. */
110static void nl_msg_put_nfgenmsg(struct ofpbuf *msg, size_t expected_payload,
111 int family, uint8_t subsystem, uint8_t cmd,
112 uint32_t flags);
113
114static bool nl_ct_parse_header_policy(struct ofpbuf *buf,
115 enum nl_ct_event_type *event_type,
116 uint8_t *nfgen_family,
117 struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)]);
118
119static bool nl_ct_attrs_to_ct_dpif_entry(struct ct_dpif_entry *entry,
120 struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)],
121 uint8_t nfgen_family);
122
123struct nl_ct_dump_state {
124 struct nl_dump dump;
125 struct ofpbuf buf;
126 bool filter_zone;
127 uint16_t zone;
128};
e0467f6d 129
6830a0c0
DDP
130/* Conntrack netlink dumping. */
131
132/* Initialize a conntrack netlink dump. */
133int
134nl_ct_dump_start(struct nl_ct_dump_state **statep, const uint16_t *zone)
135{
136 struct nl_ct_dump_state *state;
137
138 *statep = state = xzalloc(sizeof *state);
139 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
140
141 if (zone) {
142 state->filter_zone = true;
143 state->zone = *zone;
144 }
145
146 nl_msg_put_nfgenmsg(&state->buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
147 IPCTNL_MSG_CT_GET, NLM_F_REQUEST);
148 nl_dump_start(&state->dump, NETLINK_NETFILTER, &state->buf);
149 ofpbuf_clear(&state->buf);
150
151 return 0;
152}
153
154/* Receive the next 'entry' from the conntrack netlink dump with 'state'.
155 * Returns 'EOF' when no more entries are available, 0 otherwise. 'entry' may
156 * be uninitilized memory on entry, and must be uninitialized with
157 * ct_dpif_entry_uninit() afterwards by the caller. In case the same 'entry' is
158 * passed to this function again, the entry must also be uninitialized before
159 * the next call. */
160int
161nl_ct_dump_next(struct nl_ct_dump_state *state, struct ct_dpif_entry *entry)
162{
163 struct ofpbuf buf;
164
165 memset(entry, 0, sizeof *entry);
166 for (;;) {
167 struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)];
168 enum nl_ct_event_type type;
169 uint8_t nfgen_family;
170
171 if (!nl_dump_next(&state->dump, &buf, &state->buf)) {
172 return EOF;
173 }
174
175 if (!nl_ct_parse_header_policy(&buf, &type, &nfgen_family, attrs)) {
176 continue;
177 };
178
179 if (state->filter_zone) {
180 uint16_t entry_zone = attrs[CTA_ZONE]
181 ? ntohs(nl_attr_get_be16(attrs[CTA_ZONE]))
182 : 0;
183 if (entry_zone != state->zone) {
184 continue;
185 }
186 }
187
188 if (nl_ct_attrs_to_ct_dpif_entry(entry, attrs, nfgen_family)) {
189 break;
190 }
191
192 ct_dpif_entry_uninit(entry);
193 memset(entry, 0, sizeof *entry);
194 /* Ignore the failed entry and get the next one. */
195 }
196
197 ofpbuf_uninit(&buf);
198 return 0;
199}
200
201/* End a conntrack netlink dump. */
202int
203nl_ct_dump_done(struct nl_ct_dump_state *state)
204{
205 int error = nl_dump_done(&state->dump);
206
207 ofpbuf_uninit(&state->buf);
208 free(state);
209 return error;
210}
e0467f6d 211
6830a0c0
DDP
212/* Format conntrack event 'entry' of 'type' to 'ds'. */
213void
214nl_ct_format_event_entry(const struct ct_dpif_entry *entry,
215 enum nl_ct_event_type type, struct ds *ds,
216 bool verbose, bool print_stats)
217{
218 ds_put_format(ds, "%s ",
219 type == NL_CT_EVENT_NEW ? "NEW"
220 : type == NL_CT_EVENT_UPDATE ? "UPDATE"
221 : type == NL_CT_EVENT_DELETE ? "DELETE"
222 : "UNKNOWN");
223 ct_dpif_format_entry(entry, ds, verbose, print_stats);
224}
225
226int
227nl_ct_flush(void)
228{
229 struct ofpbuf buf;
230 int err;
231
232 ofpbuf_init(&buf, NL_DUMP_BUFSIZE);
233
234 nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
235 IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
236
237 err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
238 ofpbuf_uninit(&buf);
239
240 /* Expectations are flushed automatically, because they do not
241 * have a master connection anymore */
242
243 return err;
244}
245
e0467f6d
SV
246#ifdef _WIN32
247int
248nl_ct_flush_zone(uint16_t flush_zone)
249{
250 /* Windows can flush a specific zone */
251 struct ofpbuf buf;
252 int err;
253
254 ofpbuf_init(&buf, NL_DUMP_BUFSIZE);
255
256 nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
257 IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
258 nl_msg_put_be16(&buf, CTA_ZONE, flush_zone);
259
260 err = nl_transact(NETLINK_NETFILTER, &buf, NULL);
261 ofpbuf_uninit(&buf);
262
263 return err;
264}
265#else
6830a0c0
DDP
266int
267nl_ct_flush_zone(uint16_t flush_zone)
268{
269 /* Apparently, there's no netlink interface to flush a specific zone.
270 * This code dumps every connection, checks the zone and eventually
271 * delete the entry.
272 *
273 * This is race-prone, but it is better than using shell scripts. */
274
275 struct nl_dump dump;
276 struct ofpbuf buf, reply, delete;
277
278 ofpbuf_init(&buf, NL_DUMP_BUFSIZE);
279 ofpbuf_init(&delete, NL_DUMP_BUFSIZE);
280
281 nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK,
282 IPCTNL_MSG_CT_GET, NLM_F_REQUEST);
283 nl_dump_start(&dump, NETLINK_NETFILTER, &buf);
284 ofpbuf_clear(&buf);
285
286 for (;;) {
287 struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)];
288 enum nl_ct_event_type event_type;
289 uint8_t nfgen_family;
290 uint16_t zone = 0;
291
292 if (!nl_dump_next(&dump, &reply, &buf)) {
293 break;
294 }
295
296 if (!nl_ct_parse_header_policy(&reply, &event_type, &nfgen_family,
297 attrs)) {
298 continue;
299 };
300
301 if (attrs[CTA_ZONE]) {
302 zone = ntohs(nl_attr_get_be16(attrs[CTA_ZONE]));
303 }
304
305 if (zone != flush_zone) {
306 /* The entry is not in the zone we're flushing. */
307 continue;
308 }
309 nl_msg_put_nfgenmsg(&delete, 0, nfgen_family, NFNL_SUBSYS_CTNETLINK,
310 IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST);
311
312 nl_msg_put_be16(&delete, CTA_ZONE, htons(zone));
313 nl_msg_put_unspec(&delete, CTA_TUPLE_ORIG, attrs[CTA_TUPLE_ORIG] + 1,
314 attrs[CTA_TUPLE_ORIG]->nla_len - NLA_HDRLEN);
315 nl_msg_put_unspec(&delete, CTA_ID, attrs[CTA_ID] + 1,
316 attrs[CTA_ID]->nla_len - NLA_HDRLEN);
317 nl_transact(NETLINK_NETFILTER, &delete, NULL);
318 ofpbuf_clear(&delete);
319 }
320
321 nl_dump_done(&dump);
322
323 ofpbuf_uninit(&delete);
324 ofpbuf_uninit(&buf);
325
326 /* Expectations are flushed automatically, because they do not
327 * have a master connection anymore */
328 return 0;
329}
e0467f6d
SV
330#endif
331
6830a0c0
DDP
332/* Conntrack netlink parsing. */
333
334static bool
335nl_ct_parse_counters(struct nlattr *nla, struct ct_dpif_counters *counters)
336{
337 static const struct nl_policy policy[] = {
338 [CTA_COUNTERS_PACKETS] = { .type = NL_A_BE64, .optional = false },
339 [CTA_COUNTERS_BYTES] = { .type = NL_A_BE64, .optional = false },
340 };
341 struct nlattr *attrs[ARRAY_SIZE(policy)];
342 bool parsed;
343
344 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
345
346 if (parsed) {
347 counters->packets
348 = ntohll(nl_attr_get_be64(attrs[CTA_COUNTERS_PACKETS]));
349 counters->bytes = ntohll(nl_attr_get_be64(attrs[CTA_COUNTERS_BYTES]));
350 } else {
351 VLOG_ERR_RL(&rl, "Could not parse nested counters. "
352 "Possibly incompatible Linux kernel version.");
353 }
354
355 return parsed;
356}
357
358static bool
359nl_ct_parse_timestamp(struct nlattr *nla, struct ct_dpif_timestamp *timestamp)
360{
361 static const struct nl_policy policy[] = {
362 [CTA_TIMESTAMP_START] = { .type = NL_A_BE64, .optional = false },
363 [CTA_TIMESTAMP_STOP] = { .type = NL_A_BE64, .optional = true },
364 };
365 struct nlattr *attrs[ARRAY_SIZE(policy)];
366 bool parsed;
367
368 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
369
370 if (parsed) {
371 timestamp->start
372 = ntohll(nl_attr_get_be64(attrs[CTA_TIMESTAMP_START]));
373 if (attrs[CTA_TIMESTAMP_STOP]) {
374 timestamp->stop
375 = ntohll(nl_attr_get_be64(attrs[CTA_TIMESTAMP_STOP]));
376 }
377 } else {
378 VLOG_ERR_RL(&rl, "Could not parse nested timestamp. "
379 "Possibly incompatible Linux kernel version.");
380 }
381
382 return parsed;
383}
384
385static bool
386nl_ct_parse_tuple_ip(struct nlattr *nla, struct ct_dpif_tuple *tuple)
387{
388 static const struct nl_policy policy[] = {
389 [CTA_IP_V4_SRC] = { .type = NL_A_BE32, .optional = true },
390 [CTA_IP_V4_DST] = { .type = NL_A_BE32, .optional = true },
391 [CTA_IP_V6_SRC] = { NL_POLICY_FOR(struct in6_addr), .optional = true },
392 [CTA_IP_V6_DST] = { NL_POLICY_FOR(struct in6_addr), .optional = true },
393 };
394 struct nlattr *attrs[ARRAY_SIZE(policy)];
395 bool parsed;
396
397 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
398
399 if (parsed) {
400 if (tuple->l3_type == AF_INET) {
401 if (attrs[CTA_IP_V4_SRC]) {
402 tuple->src.ip = nl_attr_get_be32(attrs[CTA_IP_V4_SRC]);
403 }
404 if (attrs[CTA_IP_V4_DST]) {
405 tuple->dst.ip = nl_attr_get_be32(attrs[CTA_IP_V4_DST]);
406 }
407 } else if (tuple->l3_type == AF_INET6) {
408 if (attrs[CTA_IP_V6_SRC]) {
409 memcpy(&tuple->src.in6, nl_attr_get(attrs[CTA_IP_V6_SRC]),
410 sizeof tuple->src.in6);
411 }
412 if (attrs[CTA_IP_V6_DST]) {
413 memcpy(&tuple->dst.in6, nl_attr_get(attrs[CTA_IP_V6_DST]),
414 sizeof tuple->dst.in6);
415 }
416 } else {
417 VLOG_WARN_RL(&rl, "Unsupported IP protocol: %u.", tuple->l3_type);
418 return false;
419 }
420 } else {
421 VLOG_ERR_RL(&rl, "Could not parse nested tuple IP options. "
422 "Possibly incompatible Linux kernel version.");
423 }
424
425 return parsed;
426}
427
428static bool
429nl_ct_parse_tuple_proto(struct nlattr *nla, struct ct_dpif_tuple *tuple)
430{
431 static const struct nl_policy policy[] = {
432 [CTA_PROTO_NUM] = { .type = NL_A_U8, .optional = false },
433 [CTA_PROTO_SRC_PORT] = { .type = NL_A_BE16, .optional = true },
434 [CTA_PROTO_DST_PORT] = { .type = NL_A_BE16, .optional = true },
435 [CTA_PROTO_ICMP_ID] = { .type = NL_A_BE16, .optional = true },
436 [CTA_PROTO_ICMP_TYPE] = { .type = NL_A_U8, .optional = true },
437 [CTA_PROTO_ICMP_CODE] = { .type = NL_A_U8, .optional = true },
438 [CTA_PROTO_ICMPV6_ID] = { .type = NL_A_BE16, .optional = true },
439 [CTA_PROTO_ICMPV6_TYPE] = { .type = NL_A_U8, .optional = true },
440 [CTA_PROTO_ICMPV6_CODE] = { .type = NL_A_U8, .optional = true },
441 };
442 struct nlattr *attrs[ARRAY_SIZE(policy)];
443 bool parsed;
444
445 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
446
447 if (parsed) {
448 tuple->ip_proto = nl_attr_get_u8(attrs[CTA_PROTO_NUM]);
449
450 if (tuple->l3_type == AF_INET && tuple->ip_proto == IPPROTO_ICMP) {
451 if (!attrs[CTA_PROTO_ICMP_ID] || !attrs[CTA_PROTO_ICMP_TYPE]
452 || !attrs[CTA_PROTO_ICMP_CODE]) {
453 VLOG_ERR_RL(&rl, "Tuple ICMP data missing.");
454 return false;
455 }
456 tuple->icmp_id = nl_attr_get_be16(attrs[CTA_PROTO_ICMP_ID]);
457 tuple->icmp_type = nl_attr_get_u8(attrs[CTA_PROTO_ICMP_TYPE]);
458 tuple->icmp_code = nl_attr_get_u8(attrs[CTA_PROTO_ICMP_CODE]);
459 } else if (tuple->l3_type == AF_INET6 &&
460 tuple->ip_proto == IPPROTO_ICMPV6) {
461 if (!attrs[CTA_PROTO_ICMPV6_ID] || !attrs[CTA_PROTO_ICMPV6_TYPE]
462 || !attrs[CTA_PROTO_ICMPV6_CODE]) {
463 VLOG_ERR_RL(&rl, "Tuple ICMPv6 data missing.");
464 return false;
465 }
466 tuple->icmp_id = nl_attr_get_be16(attrs[CTA_PROTO_ICMPV6_ID]);
467 tuple->icmp_type = nl_attr_get_u8(attrs[CTA_PROTO_ICMPV6_TYPE]);
468 tuple->icmp_code = nl_attr_get_u8(attrs[CTA_PROTO_ICMPV6_CODE]);
469 } else if (attrs[CTA_PROTO_SRC_PORT] && attrs[CTA_PROTO_DST_PORT]) {
470 tuple->src_port = nl_attr_get_be16(attrs[CTA_PROTO_SRC_PORT]);
471 tuple->dst_port = nl_attr_get_be16(attrs[CTA_PROTO_DST_PORT]);
472 } else {
473 /* Unsupported IPPROTO and no ports, leave them zeroed.
474 * We have parsed the ip_proto, so this is not a total failure. */
475 VLOG_INFO_RL(&rl, "Unsupported L4 protocol: %u.", tuple->ip_proto);
476 }
477 } else {
478 VLOG_ERR_RL(&rl, "Could not parse nested tuple protocol options. "
479 "Possibly incompatible Linux kernel version.");
480 }
481
482 return parsed;
483}
484
485static bool
486nl_ct_parse_tuple(struct nlattr *nla, struct ct_dpif_tuple *tuple,
487 uint16_t l3_type)
488{
489 static const struct nl_policy policy[] = {
490 [CTA_TUPLE_IP] = { .type = NL_A_NESTED, .optional = false },
491 [CTA_TUPLE_PROTO] = { .type = NL_A_NESTED, .optional = false },
492 };
493 struct nlattr *attrs[ARRAY_SIZE(policy)];
494 bool parsed;
495
496 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
497
498 memset(tuple, 0, sizeof *tuple);
499
500 if (parsed) {
501 tuple->l3_type = l3_type;
502
503 if (!nl_ct_parse_tuple_ip(attrs[CTA_TUPLE_IP], tuple)
504 || !nl_ct_parse_tuple_proto(attrs[CTA_TUPLE_PROTO], tuple)) {
505 struct ds ds;
506
507 ds_init(&ds);
508 ct_dpif_format_tuple(&ds, tuple, true);
509
510 VLOG_ERR_RL(&rl, "Failed to parse tuple: %s", ds_cstr(&ds));
511 ds_destroy(&ds);
512
513 memset(tuple, 0, sizeof *tuple);
514 return false;
515 }
516 } else {
517 VLOG_ERR_RL(&rl, "Could not parse nested tuple options. "
518 "Possibly incompatible Linux kernel version.");
519 }
520
521 return parsed;
522}
523
524/* Translate netlink TCP state to CT_DPIF_TCP state. */
525static uint8_t
526nl_ct_tcp_state_to_dpif(uint8_t state)
527{
528 switch (state) {
529 case TCP_CONNTRACK_NONE:
530 return CT_DPIF_TCPS_CLOSED;
531 case TCP_CONNTRACK_SYN_SENT:
532 return CT_DPIF_TCPS_SYN_SENT;
533 case TCP_CONNTRACK_SYN_SENT2:
534 return CT_DPIF_TCPS_SYN_SENT;
535 case TCP_CONNTRACK_SYN_RECV:
536 return CT_DPIF_TCPS_SYN_RECV;
537 case TCP_CONNTRACK_ESTABLISHED:
538 return CT_DPIF_TCPS_ESTABLISHED;
539 case TCP_CONNTRACK_FIN_WAIT:
540 return CT_DPIF_TCPS_FIN_WAIT_1;
541 case TCP_CONNTRACK_CLOSE_WAIT:
542 return CT_DPIF_TCPS_CLOSE_WAIT;
543 case TCP_CONNTRACK_LAST_ACK:
544 return CT_DPIF_TCPS_LAST_ACK;
545 case TCP_CONNTRACK_TIME_WAIT:
546 return CT_DPIF_TCPS_TIME_WAIT;
547 case TCP_CONNTRACK_CLOSE:
548 return CT_DPIF_TCPS_CLOSING;
549 default:
550 return CT_DPIF_TCPS_CLOSED;
551 }
552}
553
554static uint8_t
555ip_ct_tcp_flags_to_dpif(uint8_t flags)
556{
557 uint8_t ret = 0;
558#define CT_DPIF_TCP_FLAG(FLAG) \
559 ret |= (flags & IP_CT_TCP_FLAG_##FLAG) ? CT_DPIF_TCPF_##FLAG : 0;
560 CT_DPIF_TCP_FLAGS
561#undef CT_DPIF_STATUS_FLAG
562 return ret;
563}
564
565static bool
566nl_ct_parse_protoinfo_tcp(struct nlattr *nla,
567 struct ct_dpif_protoinfo *protoinfo)
568{
569 static const struct nl_policy policy[] = {
570 [CTA_PROTOINFO_TCP_STATE] = { .type = NL_A_U8, .optional = false },
571 [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NL_A_U8,
572 .optional = false },
573 [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NL_A_U8,
574 .optional = false },
575 [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .type = NL_A_U16,
576 .optional = false },
577 [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .type = NL_A_U16,
578 .optional = false },
579 };
580 struct nlattr *attrs[ARRAY_SIZE(policy)];
581 bool parsed;
582
583 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
584
585 if (parsed) {
586 const struct nf_ct_tcp_flags *flags_orig, *flags_reply;
587 uint8_t state;
588 protoinfo->proto = IPPROTO_TCP;
589 state = nl_ct_tcp_state_to_dpif(
590 nl_attr_get_u8(attrs[CTA_PROTOINFO_TCP_STATE]));
591 /* The connection tracker keeps only one tcp state for the
592 * connection, but our structures store a separate state for
593 * each endpoint. Here we duplicate the state. */
594 protoinfo->tcp.state_orig = protoinfo->tcp.state_reply = state;
595 protoinfo->tcp.wscale_orig = nl_attr_get_u8(
596 attrs[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
597 protoinfo->tcp.wscale_reply = nl_attr_get_u8(
598 attrs[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
599 flags_orig =
600 nl_attr_get_unspec(attrs[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL],
601 sizeof *flags_orig);
602 protoinfo->tcp.flags_orig =
603 ip_ct_tcp_flags_to_dpif(flags_orig->flags);
604 flags_reply =
605 nl_attr_get_unspec(attrs[CTA_PROTOINFO_TCP_FLAGS_REPLY],
606 sizeof *flags_reply);
607 protoinfo->tcp.flags_reply =
608 ip_ct_tcp_flags_to_dpif(flags_reply->flags);
609 } else {
610 VLOG_ERR_RL(&rl, "Could not parse nested TCP protoinfo options. "
611 "Possibly incompatible Linux kernel version.");
612 }
613
614 return parsed;
615}
616
617static bool
618nl_ct_parse_protoinfo(struct nlattr *nla, struct ct_dpif_protoinfo *protoinfo)
619{
620 /* These are mutually exclusive. */
621 static const struct nl_policy policy[] = {
622 [CTA_PROTOINFO_TCP] = { .type = NL_A_NESTED, .optional = true },
623 [CTA_PROTOINFO_SCTP] = { .type = NL_A_NESTED, .optional = true },
624 };
625 struct nlattr *attrs[ARRAY_SIZE(policy)];
626 bool parsed;
627
628 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
629
630 memset(protoinfo, 0, sizeof *protoinfo);
631
632 if (parsed) {
633 if (attrs[CTA_PROTOINFO_TCP]) {
634 parsed = nl_ct_parse_protoinfo_tcp(attrs[CTA_PROTOINFO_TCP],
635 protoinfo);
636 } else if (attrs[CTA_PROTOINFO_SCTP]) {
637 VLOG_WARN_RL(&rl, "SCTP protoinfo not yet supported!");
638 } else {
639 VLOG_WARN_RL(&rl, "Empty protoinfo!");
640 }
641 } else {
642 VLOG_ERR_RL(&rl, "Could not parse nested protoinfo options. "
643 "Possibly incompatible Linux kernel version.");
644 }
645
646 return parsed;
647}
648
649static bool
650nl_ct_parse_helper(struct nlattr *nla, struct ct_dpif_helper *helper)
651{
652 static const struct nl_policy policy[] = {
653 [CTA_HELP_NAME] = { .type = NL_A_STRING, .optional = false },
654 };
655 struct nlattr *attrs[ARRAY_SIZE(policy)];
656 bool parsed;
657
658 parsed = nl_parse_nested(nla, policy, attrs, ARRAY_SIZE(policy));
659
660 memset(helper, 0, sizeof *helper);
661
662 if (parsed) {
663 helper->name = xstrdup(nl_attr_get_string(attrs[CTA_HELP_NAME]));
664 } else {
665 VLOG_ERR_RL(&rl, "Could not parse nested helper options. "
666 "Possibly incompatible Linux kernel version.");
667 }
668
669 return parsed;
670}
671
672/* Translate netlink entry status flags to CT_DPIF_TCP status flags. */
673static uint32_t
674ips_status_to_dpif_flags(uint32_t status)
675{
676 uint32_t ret = 0;
677#define CT_DPIF_STATUS_FLAG(FLAG) \
678 ret |= (status & IPS_##FLAG) ? CT_DPIF_STATUS_##FLAG : 0;
679 CT_DPIF_STATUS_FLAGS
680#undef CT_DPIF_STATUS_FLAG
681 return ret;
682}
683
684static bool
685nl_ct_parse_header_policy(struct ofpbuf *buf,
686 enum nl_ct_event_type *event_type,
687 uint8_t *nfgen_family,
688 struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)])
689{
690 struct nlmsghdr *nlh;
691 struct nfgenmsg *nfm;
692 uint8_t type;
693
694 nlh = ofpbuf_at(buf, 0, NLMSG_HDRLEN);
695 nfm = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *nfm);
696 if (!nfm) {
697 VLOG_ERR_RL(&rl, "Received bad nfnl message (no nfgenmsg).");
698 return false;
699 }
700 if (NFNL_SUBSYS_ID(nlh->nlmsg_type) != NFNL_SUBSYS_CTNETLINK) {
701 VLOG_ERR_RL(&rl, "Received non-conntrack message (subsystem: %u).",
702 NFNL_SUBSYS_ID(nlh->nlmsg_type));
703 return false;
704 }
705 if (nfm->version != NFNETLINK_V0) {
706 VLOG_ERR_RL(&rl, "Received unsupported nfnetlink version (%u).",
707 NFNL_MSG_TYPE(nfm->version));
708 return false;
709 }
710
711 if (!nl_policy_parse(buf, NLMSG_HDRLEN + sizeof *nfm,
712 nfnlgrp_conntrack_policy, attrs,
713 ARRAY_SIZE(nfnlgrp_conntrack_policy))) {
714 VLOG_ERR_RL(&rl, "Received bad nfnl message (policy).");
715 return false;
716 }
717
718 type = NFNL_MSG_TYPE(nlh->nlmsg_type);
719 *nfgen_family = nfm->nfgen_family;
720
721 switch (type) {
722 case IPCTNL_MSG_CT_NEW:
723 *event_type = nlh->nlmsg_flags & NLM_F_CREATE
724 ? NL_CT_EVENT_NEW : NL_CT_EVENT_UPDATE;
725 break;
726 case IPCTNL_MSG_CT_DELETE:
727 *event_type = NL_CT_EVENT_DELETE;
728 break;
729 default:
730 VLOG_ERR_RL(&rl, "Can't parse conntrack event type.");
731 return false;
732 }
733
734 return true;
735}
736
737static bool
738nl_ct_attrs_to_ct_dpif_entry(struct ct_dpif_entry *entry,
739 struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)],
740 uint8_t nfgen_family)
741{
742 if (!nl_ct_parse_tuple(attrs[CTA_TUPLE_ORIG], &entry->tuple_orig,
743 nfgen_family)) {
744 return false;
745 }
746 if (!nl_ct_parse_tuple(attrs[CTA_TUPLE_REPLY], &entry->tuple_reply,
747 nfgen_family)) {
748 return false;
749 }
750 if (attrs[CTA_COUNTERS_ORIG] &&
751 !nl_ct_parse_counters(attrs[CTA_COUNTERS_ORIG],
752 &entry->counters_orig)) {
753 return false;
754 }
755 if (attrs[CTA_COUNTERS_REPLY] &&
756 !nl_ct_parse_counters(attrs[CTA_COUNTERS_REPLY],
757 &entry->counters_reply)) {
758 return false;
759 }
760 if (attrs[CTA_TIMESTAMP] &&
761 !nl_ct_parse_timestamp(attrs[CTA_TIMESTAMP], &entry->timestamp)) {
762 return false;
763 }
764 if (attrs[CTA_ID]) {
765 entry->id = ntohl(nl_attr_get_be32(attrs[CTA_ID]));
766 }
767 if (attrs[CTA_ZONE]) {
768 entry->zone = ntohs(nl_attr_get_be16(attrs[CTA_ZONE]));
769 }
770 if (attrs[CTA_STATUS]) {
771 entry->status = ips_status_to_dpif_flags(
772 ntohl(nl_attr_get_be32(attrs[CTA_STATUS])));
773 }
774 if (attrs[CTA_TIMEOUT]) {
775 entry->timeout = ntohl(nl_attr_get_be32(attrs[CTA_TIMEOUT]));
776 }
777 if (attrs[CTA_MARK]) {
778 entry->mark = ntohl(nl_attr_get_be32(attrs[CTA_MARK]));
779 }
780 if (attrs[CTA_LABELS]) {
781 memcpy(&entry->labels, nl_attr_get(attrs[CTA_LABELS]),
782 MIN(sizeof entry->labels, nl_attr_get_size(attrs[CTA_LABELS])));
783 }
784 if (attrs[CTA_PROTOINFO] &&
785 !nl_ct_parse_protoinfo(attrs[CTA_PROTOINFO], &entry->protoinfo)) {
786 return false;
787 }
788 if (attrs[CTA_HELP] &&
789 !nl_ct_parse_helper(attrs[CTA_HELP], &entry->helper)) {
790 return false;
791 }
792 if (attrs[CTA_TUPLE_MASTER] &&
793 !nl_ct_parse_tuple(attrs[CTA_TUPLE_MASTER], &entry->tuple_master,
794 nfgen_family)) {
795 return false;
796 }
797 return true;
798}
799
800bool
801nl_ct_parse_entry(struct ofpbuf *buf, struct ct_dpif_entry *entry,
802 enum nl_ct_event_type *event_type)
803{
804 struct nlattr *attrs[ARRAY_SIZE(nfnlgrp_conntrack_policy)];
805 uint8_t nfgen_family;
806
807 memset(entry, 0, sizeof *entry);
808 if (!nl_ct_parse_header_policy(buf, event_type, &nfgen_family, attrs)) {
809 return false;
810 };
811
812 if (!nl_ct_attrs_to_ct_dpif_entry(entry, attrs, nfgen_family)) {
813 ct_dpif_entry_uninit(entry);
814 memset(entry, 0, sizeof *entry);
815 return false;
816 }
817
818 return true;
819}
e0467f6d 820
6830a0c0
DDP
821/* NetFilter utility functions. */
822
823/* Puts a nlmsghdr and nfgenmsg at the beginning of 'msg', which must be
824 * initially empty. 'expected_payload' should be an estimate of the number of
825 * payload bytes to be supplied; if the size of the payload is unknown a value
826 * of 0 is acceptable.
827 *
828 * Non-zero 'family' is the address family of items to get (e.g. AF_INET).
829 *
830 * 'flags' is a bit-mask that indicates what kind of request is being made. It
831 * is often NLM_F_REQUEST indicating that a request is being made, commonly
832 * or'd with NLM_F_ACK to request an acknowledgement. NLM_F_DUMP flag reguests
833 * a dump of the table.
834 *
835 * 'subsystem' is a netfilter subsystem id, e.g., NFNL_SUBSYS_CTNETLINK.
836 *
837 * 'cmd' is an enumerated value specific to the 'subsystem'.
838 *
839 * Sets the new nlmsghdr's nlmsg_pid field to 0 for now. nl_sock_send() will
840 * fill it in just before sending the message.
841 *
842 * nl_msg_put_nlmsghdr() should be used to compose Netlink messages that are
843 * not NetFilter Netlink messages. */
844static void
845nl_msg_put_nfgenmsg(struct ofpbuf *msg, size_t expected_payload,
846 int family, uint8_t subsystem, uint8_t cmd,
847 uint32_t flags)
848{
849 struct nfgenmsg *nfm;
850
851 nl_msg_put_nlmsghdr(msg, sizeof *nfm + expected_payload,
852 subsystem << 8 | cmd, flags);
853 ovs_assert(msg->size == NLMSG_HDRLEN);
854 nfm = nl_msg_put_uninit(msg, sizeof *nfm);
855 nfm->nfgen_family = family;
856 nfm->version = NFNETLINK_V0;
857 nfm->res_id = 0;
e0467f6d
SV
858#ifdef _WIN32
859 /* nfgenmsg contains ovsHdr padding in windows */
860 nfm->ovsHdr.dp_ifindex = 0;
861#endif
6830a0c0 862}