]> git.proxmox.com Git - mirror_ovs.git/blame - lib/dpif-netlink.c
Userspace datapath: Add fragmentation handling.
[mirror_ovs.git] / lib / dpif-netlink.c
CommitLineData
96fba48f 1/*
4ea96698 2 * Copyright (c) 2008-2018 Nicira, Inc.
96fba48f
BP
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <config.h>
9fe3b9a2 18
93451a0a 19#include "dpif-netlink.h"
96fba48f 20
96fba48f
BP
21#include <ctype.h>
22#include <errno.h>
23#include <fcntl.h>
24#include <inttypes.h>
25#include <net/if.h>
b90fa799 26#include <linux/types.h>
aae51f53 27#include <linux/pkt_sched.h>
8522ba09 28#include <poll.h>
96fba48f 29#include <stdlib.h>
8522ba09 30#include <strings.h>
50f80534 31#include <sys/epoll.h>
10dcf8de 32#include <sys/stat.h>
96fba48f
BP
33#include <unistd.h>
34
773cd538 35#include "bitmap.h"
c4e08753 36#include "dpif-netlink-rtnl.h"
0d71302e 37#include "dpif-provider.h"
1579cf67 38#include "fat-rwlock.h"
0d71302e 39#include "flow.h"
032aa6a3 40#include "netdev-linux.h"
0d71302e 41#include "netdev-provider.h"
c3827f61 42#include "netdev-vport.h"
0d71302e 43#include "netdev.h"
c11c9f4a 44#include "netlink-conntrack.h"
45c8d3a1 45#include "netlink-notifier.h"
982b8810 46#include "netlink-socket.h"
856081f6 47#include "netlink.h"
bfda5239 48#include "netnsid.h"
feebdea2 49#include "odp-util.h"
0d71302e
BP
50#include "openvswitch/dynamic-string.h"
51#include "openvswitch/flow.h"
52#include "openvswitch/match.h"
64c96779 53#include "openvswitch/ofpbuf.h"
fd016ae3 54#include "openvswitch/poll-loop.h"
ee89ea7b 55#include "openvswitch/shash.h"
92d0d515 56#include "openvswitch/thread.h"
0d71302e
BP
57#include "openvswitch/vlog.h"
58#include "packets.h"
59#include "random.h"
b3c01ed3 60#include "sset.h"
14b4d2f9 61#include "timeval.h"
d6569377 62#include "unaligned.h"
96fba48f 63#include "util.h"
5136ce49 64
VLOG_DEFINE_THIS_MODULE(dpif_netlink);

/* 'WINDOWS' mirrors the _WIN32 build flag as an ordinary constant so that it
 * can be tested in C expressions as well as in preprocessor conditionals. */
#ifdef _WIN32
#include "wmi.h"
enum { WINDOWS = 1 };
#else
enum { WINDOWS = 0 };
#endif

/* Largest size that the per-port channel array is allowed to grow to. */
enum { MAX_PORTS = USHRT_MAX };

/* This ethtool flag was introduced in Linux 2.6.24, so it might be
 * missing if we have old headers. */
#define ETH_FLAG_LRO      (1 << 15)    /* LRO is enabled */

#define FLOW_DUMP_MAX_BATCH 50
#define OPERATE_MAX_OPS 50

/* Fallback definition for build environments whose <sys/epoll.h> lacks it. */
#ifndef EPOLLEXCLUSIVE
#define EPOLLEXCLUSIVE (1u << 28)
#endif
84
/* In-memory form of a datapath message: the Generic Netlink command, the
 * 'struct ovs_header' field and the attributes named in the comments on the
 * individual members. */
struct dpif_netlink_dp {
    /* Generic Netlink header. */
    uint8_t cmd;

    /* struct ovs_header. */
    int dp_ifindex;

    /* Attributes. */
    const char *name;                  /* OVS_DP_ATTR_NAME. */
    const uint32_t *upcall_pid;        /* OVS_DP_ATTR_UPCALL_PID. */
    uint32_t user_features;            /* OVS_DP_ATTR_USER_FEATURES */
    const struct ovs_dp_stats *stats;  /* OVS_DP_ATTR_STATS. */
    const struct ovs_dp_megaflow_stats *megaflow_stats;
                                       /* OVS_DP_ATTR_MEGAFLOW_STATS.*/
};

/* Helpers operating on 'struct dpif_netlink_dp'; defined later in the file. */
static void dpif_netlink_dp_init(struct dpif_netlink_dp *);
static int dpif_netlink_dp_from_ofpbuf(struct dpif_netlink_dp *,
                                       const struct ofpbuf *);
static void dpif_netlink_dp_dump_start(struct nl_dump *);
static int dpif_netlink_dp_transact(const struct dpif_netlink_dp *request,
                                    struct dpif_netlink_dp *reply,
                                    struct ofpbuf **bufp);
static int dpif_netlink_dp_get(const struct dpif *,
                               struct dpif_netlink_dp *reply,
                               struct ofpbuf **bufp);
111
112struct dpif_netlink_flow {
37a1300c
BP
113 /* Generic Netlink header. */
114 uint8_t cmd;
d6569377 115
df2c07f4 116 /* struct ovs_header. */
d6569377 117 unsigned int nlmsg_flags;
254f2dc8 118 int dp_ifindex;
d6569377
BP
119
120 /* Attributes.
121 *
0e70cdcb
BP
122 * The 'stats' member points to 64-bit data that might only be aligned on
123 * 32-bit boundaries, so get_unaligned_u64() should be used to access its
124 * values.
d2a23af2 125 *
df2c07f4 126 * If 'actions' is nonnull then OVS_FLOW_ATTR_ACTIONS will be included in
d2a23af2 127 * the Netlink version of the command, even if actions_len is zero. */
df2c07f4 128 const struct nlattr *key; /* OVS_FLOW_ATTR_KEY. */
d6569377 129 size_t key_len;
e6cc0bab
AZ
130 const struct nlattr *mask; /* OVS_FLOW_ATTR_MASK. */
131 size_t mask_len;
df2c07f4 132 const struct nlattr *actions; /* OVS_FLOW_ATTR_ACTIONS. */
d6569377 133 size_t actions_len;
70e5ed6f
JS
134 ovs_u128 ufid; /* OVS_FLOW_ATTR_FLOW_ID. */
135 bool ufid_present; /* Is there a UFID? */
136 bool ufid_terse; /* Skip serializing key/mask/acts? */
df2c07f4
JP
137 const struct ovs_flow_stats *stats; /* OVS_FLOW_ATTR_STATS. */
138 const uint8_t *tcp_flags; /* OVS_FLOW_ATTR_TCP_FLAGS. */
0e70cdcb 139 const ovs_32aligned_u64 *used; /* OVS_FLOW_ATTR_USED. */
df2c07f4 140 bool clear; /* OVS_FLOW_ATTR_CLEAR. */
43f9ac0a 141 bool probe; /* OVS_FLOW_ATTR_PROBE. */
d6569377
BP
142};
143
93451a0a
AS
144static void dpif_netlink_flow_init(struct dpif_netlink_flow *);
145static int dpif_netlink_flow_from_ofpbuf(struct dpif_netlink_flow *,
146 const struct ofpbuf *);
147static void dpif_netlink_flow_to_ofpbuf(const struct dpif_netlink_flow *,
148 struct ofpbuf *);
149static int dpif_netlink_flow_transact(struct dpif_netlink_flow *request,
150 struct dpif_netlink_flow *reply,
151 struct ofpbuf **bufp);
152static void dpif_netlink_flow_get_stats(const struct dpif_netlink_flow *,
153 struct dpif_flow_stats *);
7af12bd7 154static void dpif_netlink_flow_to_dpif_flow(struct dpif *, struct dpif_flow *,
93451a0a 155 const struct dpif_netlink_flow *);
d6569377 156
/* One of the dpif channels between the kernel and userspace.  The owning
 * 'struct dpif_netlink' keeps an array of these indexed by port number. */
struct dpif_channel {
    struct nl_sock *sock;       /* Netlink socket. */
    long long int last_poll;    /* Last time this channel was polled. */
};
162
09cac43f
NR
#ifdef _WIN32
#define VPORT_SOCK_POOL_SIZE 1
/* On Windows, there is no native support for epoll.  There are equivalent
 * interfaces though, that are not used currently.  For simplicity, a pool of
 * netlink sockets is used.  Each socket is represented by 'struct
 * dpif_windows_vport_sock'.  Since it is a pool, multiple OVS ports may be
 * sharing the same socket.  In the future, we can add a reference count and
 * such fields. */
struct dpif_windows_vport_sock {
    struct nl_sock *nl_sock;    /* netlink socket. */
};
#endif
175
/* Per-handler upcall state: the epoll instance watching the channel sockets
 * and the buffer of events it last returned. */
struct dpif_handler {
    struct epoll_event *epoll_events;
    int epoll_fd;                 /* epoll fd that includes channel socks. */
    int n_events;                 /* Num events returned by epoll_wait(). */
    int event_offset;             /* Offset into 'epoll_events'. */

#ifdef _WIN32
    /* Pool of sockets. */
    struct dpif_windows_vport_sock *vport_sock_pool;
    size_t last_used_pool_idx;    /* Index to aid in allocating a
                                     socket in the pool to a port. */
#endif
};
14b4d2f9 189
96fba48f 190/* Datapath interface for the openvswitch Linux kernel module. */
93451a0a 191struct dpif_netlink {
96fba48f 192 struct dpif dpif;
254f2dc8 193 int dp_ifindex;
e9e28be3 194
b063d9f0 195 /* Upcall messages. */
1579cf67
AW
196 struct fat_rwlock upcall_lock;
197 struct dpif_handler *handlers;
198 uint32_t n_handlers; /* Num of upcall handlers. */
69c51582 199 struct dpif_channel *channels; /* Array of channels for each port. */
1579cf67
AW
200 int uc_array_size; /* Size of 'handler->channels' and */
201 /* 'handler->epoll_events'. */
982b8810 202
e9e28be3 203 /* Change notification. */
e4516b20 204 struct nl_sock *port_notifier; /* vport multicast group subscriber. */
61eae437 205 bool refresh_channels;
96fba48f
BP
206};
207
93451a0a 208static void report_loss(struct dpif_netlink *, struct dpif_channel *,
9b00386b 209 uint32_t ch_idx, uint32_t handler_id);
1579cf67 210
96fba48f
BP
211static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(9999, 5);
212
e4516b20
BP
213/* Generic Netlink family numbers for OVS.
214 *
93451a0a 215 * Initialized by dpif_netlink_init(). */
df2c07f4
JP
216static int ovs_datapath_family;
217static int ovs_vport_family;
218static int ovs_flow_family;
219static int ovs_packet_family;
80738e5f 220static int ovs_meter_family;
906ff9d2 221static int ovs_ct_limit_family;
982b8810 222
e4516b20
BP
223/* Generic Netlink multicast groups for OVS.
224 *
93451a0a 225 * Initialized by dpif_netlink_init(). */
e4516b20 226static unsigned int ovs_vport_mcgroup;
982b8810 227
921c370a
EG
228/* If true, tunnel devices are created using OVS compat/genetlink.
229 * If false, tunnel devices are created with rtnetlink and using light weight
230 * tunnels. If we fail to create the tunnel the rtnetlink+LWT, then we fallback
231 * to using the compat interface. */
232static bool ovs_tunnels_out_of_tree = true;
233
93451a0a
AS
234static int dpif_netlink_init(void);
235static int open_dpif(const struct dpif_netlink_dp *, struct dpif **);
236static uint32_t dpif_netlink_port_get_pid(const struct dpif *,
769b5034 237 odp_port_t port_no);
09cac43f 238static void dpif_netlink_handler_uninit(struct dpif_handler *handler);
93451a0a
AS
239static int dpif_netlink_refresh_channels(struct dpif_netlink *,
240 uint32_t n_handlers);
241static void dpif_netlink_vport_to_ofpbuf(const struct dpif_netlink_vport *,
242 struct ofpbuf *);
243static int dpif_netlink_vport_from_ofpbuf(struct dpif_netlink_vport *,
244 const struct ofpbuf *);
921c370a
EG
245static int dpif_netlink_port_query__(const struct dpif_netlink *dpif,
246 odp_port_t port_no, const char *port_name,
247 struct dpif_port *dpif_port);
f0fef760 248
d240e46a
AGS
249static int
250create_nl_sock(struct dpif_netlink *dpif OVS_UNUSED, struct nl_sock **socksp)
251 OVS_REQ_WRLOCK(dpif->upcall_lock)
252{
253#ifndef _WIN32
254 return nl_sock_create(NETLINK_GENERIC, socksp);
255#else
256 /* Pick netlink sockets to use in a round-robin fashion from each
257 * handler's pool of sockets. */
258 struct dpif_handler *handler = &dpif->handlers[0];
259 struct dpif_windows_vport_sock *sock_pool = handler->vport_sock_pool;
260 size_t index = handler->last_used_pool_idx;
261
262 /* A pool of sockets is allocated when the handler is initialized. */
263 if (sock_pool == NULL) {
264 *socksp = NULL;
265 return EINVAL;
266 }
267
268 ovs_assert(index < VPORT_SOCK_POOL_SIZE);
269 *socksp = sock_pool[index].nl_sock;
270 ovs_assert(*socksp);
271 index = (index == VPORT_SOCK_POOL_SIZE - 1) ? 0 : index + 1;
272 handler->last_used_pool_idx = index;
273 return 0;
274#endif
275}
276
/* Releases a socket obtained from create_nl_sock().
 *
 * On Windows this is a no-op: there create_nl_sock() hands out sockets that
 * belong to a handler's pool, and vport_delete_sock_pool() destroys them. */
static void
close_nl_sock(struct nl_sock *socksp)
{
#ifdef _WIN32
    (void) socksp;
#else
    nl_sock_destroy(socksp);
#endif
}
284
93451a0a
AS
285static struct dpif_netlink *
286dpif_netlink_cast(const struct dpif *dpif)
96fba48f 287{
93451a0a
AS
288 dpif_assert_class(dpif, &dpif_netlink_class);
289 return CONTAINER_OF(dpif, struct dpif_netlink, dpif);
96fba48f
BP
290}
291
d3d22744 292static int
93451a0a
AS
293dpif_netlink_enumerate(struct sset *all_dps,
294 const struct dpif_class *dpif_class OVS_UNUSED)
d3d22744 295{
aaff4b55 296 struct nl_dump dump;
d57695d7
JS
297 uint64_t reply_stub[NL_DUMP_BUFSIZE / 8];
298 struct ofpbuf msg, buf;
aaff4b55 299 int error;
982b8810 300
93451a0a 301 error = dpif_netlink_init();
aaff4b55
BP
302 if (error) {
303 return error;
982b8810 304 }
d3d22744 305
d57695d7 306 ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub);
93451a0a 307 dpif_netlink_dp_dump_start(&dump);
d57695d7 308 while (nl_dump_next(&dump, &msg, &buf)) {
93451a0a 309 struct dpif_netlink_dp dp;
d6569377 310
93451a0a 311 if (!dpif_netlink_dp_from_ofpbuf(&dp, &msg)) {
d0c23a1a 312 sset_add(all_dps, dp.name);
d3d22744
BP
313 }
314 }
d57695d7 315 ofpbuf_uninit(&buf);
aaff4b55 316 return nl_dump_done(&dump);
d3d22744
BP
317}
318
96fba48f 319static int
93451a0a
AS
320dpif_netlink_open(const struct dpif_class *class OVS_UNUSED, const char *name,
321 bool create, struct dpif **dpifp)
96fba48f 322{
93451a0a 323 struct dpif_netlink_dp dp_request, dp;
c19e6535 324 struct ofpbuf *buf;
ea36840f 325 uint32_t upcall_pid;
c19e6535 326 int error;
96fba48f 327
93451a0a 328 error = dpif_netlink_init();
982b8810
BP
329 if (error) {
330 return error;
331 }
332
982b8810 333 /* Create or look up datapath. */
93451a0a 334 dpif_netlink_dp_init(&dp_request);
ea36840f
BP
335 if (create) {
336 dp_request.cmd = OVS_DP_CMD_NEW;
337 upcall_pid = 0;
338 dp_request.upcall_pid = &upcall_pid;
339 } else {
b7fd5e38
TG
340 /* Use OVS_DP_CMD_SET to report user features */
341 dp_request.cmd = OVS_DP_CMD_SET;
ea36840f 342 }
254f2dc8 343 dp_request.name = name;
b7fd5e38 344 dp_request.user_features |= OVS_DP_F_UNALIGNED;
1579cf67 345 dp_request.user_features |= OVS_DP_F_VPORT_PIDS;
93451a0a 346 error = dpif_netlink_dp_transact(&dp_request, &dp, &buf);
982b8810
BP
347 if (error) {
348 return error;
c19e6535 349 }
254f2dc8 350
e4516b20 351 error = open_dpif(&dp, dpifp);
8f4a4df5 352 ofpbuf_delete(buf);
e4516b20 353 return error;
c19e6535
BP
354}
355
e4516b20 356static int
93451a0a 357open_dpif(const struct dpif_netlink_dp *dp, struct dpif **dpifp)
c19e6535 358{
93451a0a 359 struct dpif_netlink *dpif;
c19e6535 360
17411ecf 361 dpif = xzalloc(sizeof *dpif);
e4516b20 362 dpif->port_notifier = NULL;
1579cf67 363 fat_rwlock_init(&dpif->upcall_lock);
c19e6535 364
93451a0a 365 dpif_init(&dpif->dpif, &dpif_netlink_class, dp->name,
254f2dc8 366 dp->dp_ifindex, dp->dp_ifindex);
c19e6535 367
254f2dc8 368 dpif->dp_ifindex = dp->dp_ifindex;
c19e6535 369 *dpifp = &dpif->dpif;
e4516b20
BP
370
371 return 0;
96fba48f
BP
372}
373
09cac43f
NR
374#ifdef _WIN32
375static void
376vport_delete_sock_pool(struct dpif_handler *handler)
377 OVS_REQ_WRLOCK(dpif->upcall_lock)
378{
379 if (handler->vport_sock_pool) {
380 uint32_t i;
381 struct dpif_windows_vport_sock *sock_pool =
382 handler->vport_sock_pool;
383
384 for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
385 if (sock_pool[i].nl_sock) {
386 nl_sock_unsubscribe_packets(sock_pool[i].nl_sock);
387 nl_sock_destroy(sock_pool[i].nl_sock);
388 sock_pool[i].nl_sock = NULL;
389 }
390 }
391
392 free(handler->vport_sock_pool);
393 handler->vport_sock_pool = NULL;
394 }
395}
396
397static int
398vport_create_sock_pool(struct dpif_handler *handler)
399 OVS_REQ_WRLOCK(dpif->upcall_lock)
400{
401 struct dpif_windows_vport_sock *sock_pool;
402 size_t i;
403 int error = 0;
404
405 sock_pool = xzalloc(VPORT_SOCK_POOL_SIZE * sizeof *sock_pool);
406 for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
407 error = nl_sock_create(NETLINK_GENERIC, &sock_pool[i].nl_sock);
408 if (error) {
409 goto error;
410 }
411
412 /* Enable the netlink socket to receive packets. This is equivalent to
413 * calling nl_sock_join_mcgroup() to receive events. */
414 error = nl_sock_subscribe_packets(sock_pool[i].nl_sock);
415 if (error) {
416 goto error;
417 }
418 }
419
420 handler->vport_sock_pool = sock_pool;
421 handler->last_used_pool_idx = 0;
422 return 0;
423
424error:
425 vport_delete_sock_pool(handler);
426 return error;
427}
09cac43f
NR
428#endif /* _WIN32 */
429
69c51582
MC
430/* Given the port number 'port_idx', extracts the pid of netlink socket
431 * associated to the port and assigns it to 'upcall_pid'. */
1579cf67 432static bool
69c51582
MC
433vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx,
434 uint32_t *upcall_pid)
1579cf67 435{
1579cf67 436 /* Since the nl_sock can only be assigned in either all
69c51582 437 * or none "dpif" channels, the following check
1579cf67 438 * would suffice. */
69c51582 439 if (!dpif->channels[port_idx].sock) {
1579cf67
AW
440 return false;
441 }
09cac43f 442 ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
1579cf67 443
69c51582 444 *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock);
989fd548 445
1579cf67 446 return true;
989fd548
JP
447}
448
449static int
69c51582
MC
450vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no,
451 struct nl_sock *socksp)
989fd548
JP
452{
453 struct epoll_event event;
4e022ec0 454 uint32_t port_idx = odp_to_u32(port_no);
69c51582 455 size_t i;
1579cf67 456 int error;
989fd548 457
1579cf67 458 if (dpif->handlers == NULL) {
989fd548
JP
459 return 0;
460 }
461
1579cf67
AW
462 /* We assume that the datapath densely chooses port numbers, which can
463 * therefore be used as an index into 'channels' and 'epoll_events' of
69c51582 464 * 'dpif'. */
4e022ec0
AW
465 if (port_idx >= dpif->uc_array_size) {
466 uint32_t new_size = port_idx + 1;
989fd548 467
12d76859 468 if (new_size > MAX_PORTS) {
989fd548
JP
469 VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big",
470 dpif_name(&dpif->dpif), port_no);
471 return EFBIG;
472 }
473
69c51582
MC
474 dpif->channels = xrealloc(dpif->channels,
475 new_size * sizeof *dpif->channels);
1579cf67 476
69c51582
MC
477 for (i = dpif->uc_array_size; i < new_size; i++) {
478 dpif->channels[i].sock = NULL;
479 }
1579cf67 480
69c51582
MC
481 for (i = 0; i < dpif->n_handlers; i++) {
482 struct dpif_handler *handler = &dpif->handlers[i];
1579cf67
AW
483
484 handler->epoll_events = xrealloc(handler->epoll_events,
485 new_size * sizeof *handler->epoll_events);
989fd548 486
1579cf67 487 }
989fd548
JP
488 dpif->uc_array_size = new_size;
489 }
490
491 memset(&event, 0, sizeof event);
69c51582 492 event.events = EPOLLIN | EPOLLEXCLUSIVE;
4e022ec0 493 event.data.u32 = port_idx;
989fd548 494
1579cf67
AW
495 for (i = 0; i < dpif->n_handlers; i++) {
496 struct dpif_handler *handler = &dpif->handlers[i];
497
09cac43f 498#ifndef _WIN32
69c51582 499 if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(socksp),
1579cf67
AW
500 &event) < 0) {
501 error = errno;
502 goto error;
503 }
93451a0a 504#endif
1579cf67 505 }
69c51582
MC
506 dpif->channels[port_idx].sock = socksp;
507 dpif->channels[port_idx].last_poll = LLONG_MIN;
989fd548
JP
508
509 return 0;
1579cf67
AW
510
511error:
09cac43f 512#ifndef _WIN32
69c51582
MC
513 while (i--) {
514 epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL,
515 nl_sock_fd(socksp), NULL);
1579cf67 516 }
69c51582
MC
517#endif
518 dpif->channels[port_idx].sock = NULL;
1579cf67
AW
519
520 return error;
989fd548
JP
521}
522
523static void
93451a0a 524vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no)
989fd548 525{
4e022ec0 526 uint32_t port_idx = odp_to_u32(port_no);
1579cf67 527 size_t i;
989fd548 528
69c51582
MC
529 if (!dpif->handlers || port_idx >= dpif->uc_array_size
530 || !dpif->channels[port_idx].sock) {
989fd548
JP
531 return;
532 }
533
1579cf67
AW
534 for (i = 0; i < dpif->n_handlers; i++) {
535 struct dpif_handler *handler = &dpif->handlers[i];
09cac43f 536#ifndef _WIN32
1579cf67 537 epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
69c51582 538 nl_sock_fd(dpif->channels[port_idx].sock), NULL);
09cac43f 539#endif
1579cf67
AW
540 handler->event_offset = handler->n_events = 0;
541 }
69c51582
MC
542#ifndef _WIN32
543 nl_sock_destroy(dpif->channels[port_idx].sock);
544#endif
545 dpif->channels[port_idx].sock = NULL;
1579cf67
AW
546}
547
548static void
93451a0a
AS
549destroy_all_channels(struct dpif_netlink *dpif)
550 OVS_REQ_WRLOCK(dpif->upcall_lock)
1579cf67
AW
551{
552 unsigned int i;
553
554 if (!dpif->handlers) {
555 return;
556 }
557
558 for (i = 0; i < dpif->uc_array_size; i++ ) {
93451a0a 559 struct dpif_netlink_vport vport_request;
1579cf67
AW
560 uint32_t upcall_pids = 0;
561
69c51582 562 if (!dpif->channels[i].sock) {
1579cf67
AW
563 continue;
564 }
565
566 /* Turn off upcalls. */
93451a0a 567 dpif_netlink_vport_init(&vport_request);
1579cf67
AW
568 vport_request.cmd = OVS_VPORT_CMD_SET;
569 vport_request.dp_ifindex = dpif->dp_ifindex;
570 vport_request.port_no = u32_to_odp(i);
a78f446a 571 vport_request.n_upcall_pids = 1;
1579cf67 572 vport_request.upcall_pids = &upcall_pids;
93451a0a 573 dpif_netlink_vport_transact(&vport_request, NULL, NULL);
1579cf67
AW
574
575 vport_del_channels(dpif, u32_to_odp(i));
576 }
577
578 for (i = 0; i < dpif->n_handlers; i++) {
579 struct dpif_handler *handler = &dpif->handlers[i];
580
09cac43f 581 dpif_netlink_handler_uninit(handler);
1579cf67 582 free(handler->epoll_events);
1579cf67 583 }
69c51582 584 free(dpif->channels);
1579cf67
AW
585 free(dpif->handlers);
586 dpif->handlers = NULL;
69c51582 587 dpif->channels = NULL;
1579cf67
AW
588 dpif->n_handlers = 0;
589 dpif->uc_array_size = 0;
17411ecf
JG
590}
591
96fba48f 592static void
93451a0a 593dpif_netlink_close(struct dpif *dpif_)
96fba48f 594{
93451a0a 595 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
c7178a0b 596
e4516b20 597 nl_sock_destroy(dpif->port_notifier);
1579cf67
AW
598
599 fat_rwlock_wrlock(&dpif->upcall_lock);
600 destroy_all_channels(dpif);
601 fat_rwlock_unlock(&dpif->upcall_lock);
602
603 fat_rwlock_destroy(&dpif->upcall_lock);
96fba48f
BP
604 free(dpif);
605}
606
607static int
93451a0a 608dpif_netlink_destroy(struct dpif *dpif_)
96fba48f 609{
93451a0a
AS
610 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
611 struct dpif_netlink_dp dp;
d6569377 612
93451a0a 613 dpif_netlink_dp_init(&dp);
df2c07f4 614 dp.cmd = OVS_DP_CMD_DEL;
254f2dc8 615 dp.dp_ifindex = dpif->dp_ifindex;
93451a0a 616 return dpif_netlink_dp_transact(&dp, NULL, NULL);
96fba48f
BP
617}
618
a36de779 619static bool
93451a0a 620dpif_netlink_run(struct dpif *dpif_)
61eae437 621{
93451a0a 622 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
1579cf67 623
61eae437
BP
624 if (dpif->refresh_channels) {
625 dpif->refresh_channels = false;
1579cf67 626 fat_rwlock_wrlock(&dpif->upcall_lock);
93451a0a 627 dpif_netlink_refresh_channels(dpif, dpif->n_handlers);
1579cf67 628 fat_rwlock_unlock(&dpif->upcall_lock);
61eae437 629 }
a36de779 630 return false;
61eae437
BP
631}
632
96fba48f 633static int
93451a0a 634dpif_netlink_get_stats(const struct dpif *dpif_, struct dpif_dp_stats *stats)
96fba48f 635{
93451a0a 636 struct dpif_netlink_dp dp;
d6569377
BP
637 struct ofpbuf *buf;
638 int error;
639
93451a0a 640 error = dpif_netlink_dp_get(dpif_, &dp, &buf);
d6569377 641 if (!error) {
6a54dedc
BP
642 memset(stats, 0, sizeof *stats);
643
644 if (dp.stats) {
645 stats->n_hit = get_32aligned_u64(&dp.stats->n_hit);
646 stats->n_missed = get_32aligned_u64(&dp.stats->n_missed);
647 stats->n_lost = get_32aligned_u64(&dp.stats->n_lost);
648 stats->n_flows = get_32aligned_u64(&dp.stats->n_flows);
649 }
650
651 if (dp.megaflow_stats) {
652 stats->n_masks = dp.megaflow_stats->n_masks;
653 stats->n_mask_hit = get_32aligned_u64(
654 &dp.megaflow_stats->n_mask_hit);
655 } else {
656 stats->n_masks = UINT32_MAX;
657 stats->n_mask_hit = UINT64_MAX;
658 }
d6569377
BP
659 ofpbuf_delete(buf);
660 }
661 return error;
96fba48f
BP
662}
663
b9ad7294 664static const char *
93451a0a 665get_vport_type(const struct dpif_netlink_vport *vport)
b9ad7294
EJ
666{
667 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
668
669 switch (vport->type) {
5ed51209
JS
670 case OVS_VPORT_TYPE_NETDEV: {
671 const char *type = netdev_get_type_from_name(vport->name);
672
673 return type ? type : "system";
674 }
b9ad7294
EJ
675
676 case OVS_VPORT_TYPE_INTERNAL:
677 return "internal";
678
c1fc1411
JG
679 case OVS_VPORT_TYPE_GENEVE:
680 return "geneve";
681
b9ad7294
EJ
682 case OVS_VPORT_TYPE_GRE:
683 return "gre";
684
b9ad7294
EJ
685 case OVS_VPORT_TYPE_VXLAN:
686 return "vxlan";
687
a6ae068b
LJ
688 case OVS_VPORT_TYPE_LISP:
689 return "lisp";
690
4237026e
PS
691 case OVS_VPORT_TYPE_STT:
692 return "stt";
693
c387d817 694 case OVS_VPORT_TYPE_ERSPAN:
98514eea
WT
695 return "erspan";
696
c387d817 697 case OVS_VPORT_TYPE_IP6ERSPAN:
3b10ceee
GR
698 return "ip6erspan";
699
c387d817 700 case OVS_VPORT_TYPE_IP6GRE:
3b10ceee 701 return "ip6gre";
c387d817 702
b9ad7294
EJ
703 case OVS_VPORT_TYPE_UNSPEC:
704 case __OVS_VPORT_TYPE_MAX:
705 break;
706 }
707
708 VLOG_WARN_RL(&rl, "dp%d: port `%s' has unsupported type %u",
709 vport->dp_ifindex, vport->name, (unsigned int) vport->type);
710 return "unknown";
711}
712
c4e08753 713enum ovs_vport_type
20c57607 714netdev_to_ovs_vport_type(const char *type)
c060c4cf 715{
c060c4cf
EJ
716 if (!strcmp(type, "tap") || !strcmp(type, "system")) {
717 return OVS_VPORT_TYPE_NETDEV;
718 } else if (!strcmp(type, "internal")) {
719 return OVS_VPORT_TYPE_INTERNAL;
4237026e
PS
720 } else if (strstr(type, "stt")) {
721 return OVS_VPORT_TYPE_STT;
c1fc1411
JG
722 } else if (!strcmp(type, "geneve")) {
723 return OVS_VPORT_TYPE_GENEVE;
c060c4cf
EJ
724 } else if (!strcmp(type, "vxlan")) {
725 return OVS_VPORT_TYPE_VXLAN;
a6ae068b
LJ
726 } else if (!strcmp(type, "lisp")) {
727 return OVS_VPORT_TYPE_LISP;
7dc18ae9
WT
728 } else if (!strcmp(type, "erspan")) {
729 return OVS_VPORT_TYPE_ERSPAN;
730 } else if (!strcmp(type, "ip6erspan")) {
731 return OVS_VPORT_TYPE_IP6ERSPAN;
3b10ceee
GR
732 } else if (!strcmp(type, "ip6gre")) {
733 return OVS_VPORT_TYPE_IP6GRE;
1c385f49
GR
734 } else if (!strcmp(type, "gre")) {
735 return OVS_VPORT_TYPE_GRE;
c060c4cf
EJ
736 } else {
737 return OVS_VPORT_TYPE_UNSPEC;
738 }
739}
740
96fba48f 741static int
20c57607
EG
742dpif_netlink_port_add__(struct dpif_netlink *dpif, const char *name,
743 enum ovs_vport_type type,
744 struct ofpbuf *options,
93451a0a 745 odp_port_t *port_nop)
b90de034 746 OVS_REQ_WRLOCK(dpif->upcall_lock)
96fba48f 747{
93451a0a 748 struct dpif_netlink_vport request, reply;
c19e6535 749 struct ofpbuf *buf;
69c51582 750 struct nl_sock *socksp = NULL;
790a4372 751 uint32_t upcall_pids = 0;
1579cf67 752 int error = 0;
96fba48f 753
1579cf67 754 if (dpif->handlers) {
d240e46a 755 error = create_nl_sock(dpif, &socksp);
713a45db 756 if (error) {
989fd548
JP
757 return error;
758 }
759 }
760
93451a0a 761 dpif_netlink_vport_init(&request);
df2c07f4 762 request.cmd = OVS_VPORT_CMD_NEW;
254f2dc8 763 request.dp_ifindex = dpif->dp_ifindex;
20c57607
EG
764 request.type = type;
765 request.name = name;
766
767 request.port_no = *port_nop;
790a4372
MC
768 if (socksp) {
769 upcall_pids = nl_sock_pid(socksp);
770 }
69c51582
MC
771 request.n_upcall_pids = 1;
772 request.upcall_pids = &upcall_pids;
20c57607
EG
773
774 if (options) {
775 request.options = options->data;
776 request.options_len = options->size;
777 }
778
779 error = dpif_netlink_vport_transact(&request, &reply, &buf);
780 if (!error) {
781 *port_nop = reply.port_no;
782 } else {
783 if (error == EBUSY && *port_nop != ODPP_NONE) {
784 VLOG_INFO("%s: requested port %"PRIu32" is in use",
785 dpif_name(&dpif->dpif), *port_nop);
786 }
787
d240e46a 788 close_nl_sock(socksp);
20c57607
EG
789 goto exit;
790 }
791
69c51582
MC
792 error = vport_add_channel(dpif, *port_nop, socksp);
793 if (error) {
794 VLOG_INFO("%s: could not add channel for port %s",
795 dpif_name(&dpif->dpif), name);
796
797 /* Delete the port. */
798 dpif_netlink_vport_init(&request);
799 request.cmd = OVS_VPORT_CMD_DEL;
800 request.dp_ifindex = dpif->dp_ifindex;
801 request.port_no = *port_nop;
802 dpif_netlink_vport_transact(&request, NULL, NULL);
d240e46a 803 close_nl_sock(socksp);
69c51582 804 goto exit;
20c57607 805 }
20c57607
EG
806
807exit:
808 ofpbuf_delete(buf);
20c57607
EG
809
810 return error;
811}
812
813static int
814dpif_netlink_port_add_compat(struct dpif_netlink *dpif, struct netdev *netdev,
815 odp_port_t *port_nop)
816 OVS_REQ_WRLOCK(dpif->upcall_lock)
817{
818 const struct netdev_tunnel_config *tnl_cfg;
819 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
820 const char *type = netdev_get_type(netdev);
821 uint64_t options_stub[64 / 8];
822 enum ovs_vport_type ovs_type;
823 struct ofpbuf options;
824 const char *name;
825
826 name = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
827
828 ovs_type = netdev_to_ovs_vport_type(netdev_get_type(netdev));
829 if (ovs_type == OVS_VPORT_TYPE_UNSPEC) {
c283069c
BP
830 VLOG_WARN_RL(&error_rl, "%s: cannot create port `%s' because it has "
831 "unsupported type `%s'",
9b00386b 832 dpif_name(&dpif->dpif), name, type);
c283069c
BP
833 return EINVAL;
834 }
c3827f61 835
20c57607 836 if (ovs_type == OVS_VPORT_TYPE_NETDEV) {
93451a0a 837#ifdef _WIN32
09cac43f 838 /* XXX : Map appropiate Windows handle */
93451a0a 839#else
24b019f8 840 netdev_linux_ethtool_set_flag(netdev, ETH_FLAG_LRO, "LRO", false);
93451a0a 841#endif
24b019f8
JP
842 }
843
da467899 844#ifdef _WIN32
20c57607 845 if (ovs_type == OVS_VPORT_TYPE_INTERNAL) {
da467899
AS
846 if (!create_wmi_port(name)){
847 VLOG_ERR("Could not create wmi internal port with name:%s", name);
da467899
AS
848 return EINVAL;
849 };
850 }
851#endif
852
26508d9a 853 tnl_cfg = netdev_get_tunnel_config(netdev);
526df7d8 854 if (tnl_cfg && (tnl_cfg->dst_port != 0 || tnl_cfg->exts)) {
26508d9a 855 ofpbuf_use_stack(&options, options_stub, sizeof options_stub);
526df7d8
TG
856 if (tnl_cfg->dst_port) {
857 nl_msg_put_u16(&options, OVS_TUNNEL_ATTR_DST_PORT,
858 ntohs(tnl_cfg->dst_port));
859 }
860 if (tnl_cfg->exts) {
861 size_t ext_ofs;
862 int i;
863
864 ext_ofs = nl_msg_start_nested(&options, OVS_TUNNEL_ATTR_EXTENSION);
865 for (i = 0; i < 32; i++) {
866 if (tnl_cfg->exts & (1 << i)) {
867 nl_msg_put_flag(&options, i);
868 }
869 }
870 nl_msg_end_nested(&options, ext_ofs);
871 }
20c57607
EG
872 return dpif_netlink_port_add__(dpif, name, ovs_type, &options,
873 port_nop);
2510ba7c 874 } else {
20c57607 875 return dpif_netlink_port_add__(dpif, name, ovs_type, NULL, port_nop);
78a2d59c 876 }
c3827f61 877
20c57607 878}
989fd548 879
921c370a 880static int
c4e08753
EG
881dpif_netlink_rtnl_port_create_and_add(struct dpif_netlink *dpif,
882 struct netdev *netdev,
883 odp_port_t *port_nop)
884 OVS_REQ_WRLOCK(dpif->upcall_lock)
885{
886 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
887 char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
888 const char *name;
889 int error;
989fd548 890
c4e08753
EG
891 error = dpif_netlink_rtnl_port_create(netdev);
892 if (error) {
893 if (error != EOPNOTSUPP) {
d52ef4eb 894 VLOG_WARN_RL(&rl, "Failed to create %s with rtnetlink: %s",
c4e08753
EG
895 netdev_get_name(netdev), ovs_strerror(error));
896 }
897 return error;
898 }
1579cf67 899
c4e08753
EG
900 name = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
901 error = dpif_netlink_port_add__(dpif, name, OVS_VPORT_TYPE_NETDEV, NULL,
902 port_nop);
c37cb3ee 903 if (error) {
c4e08753
EG
904 dpif_netlink_rtnl_port_destroy(name, netdev_get_type(netdev));
905 }
906 return error;
907}
96fba48f
BP
908
909static int
93451a0a
AS
910dpif_netlink_port_add(struct dpif *dpif_, struct netdev *netdev,
911 odp_port_t *port_nop)
9fafa796 912{
93451a0a 913 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
921c370a 914 int error = EOPNOTSUPP;
9fafa796 915
1579cf67 916 fat_rwlock_wrlock(&dpif->upcall_lock);
921c370a
EG
917 if (!ovs_tunnels_out_of_tree) {
918 error = dpif_netlink_rtnl_port_create_and_add(dpif, netdev, port_nop);
919 }
c37cb3ee 920 if (error) {
921c370a
EG
921 error = dpif_netlink_port_add_compat(dpif, netdev, port_nop);
922 }
1579cf67 923 fat_rwlock_unlock(&dpif->upcall_lock);
9fafa796
BP
924
925 return error;
926}
927
928static int
93451a0a 929dpif_netlink_port_del__(struct dpif_netlink *dpif, odp_port_t port_no)
b90de034 930 OVS_REQ_WRLOCK(dpif->upcall_lock)
96fba48f 931{
93451a0a 932 struct dpif_netlink_vport vport;
921c370a 933 struct dpif_port dpif_port;
773cd538 934 int error;
c19e6535 935
921c370a
EG
936 error = dpif_netlink_port_query__(dpif, port_no, NULL, &dpif_port);
937 if (error) {
938 return error;
939 }
940
93451a0a 941 dpif_netlink_vport_init(&vport);
df2c07f4 942 vport.cmd = OVS_VPORT_CMD_DEL;
254f2dc8 943 vport.dp_ifindex = dpif->dp_ifindex;
c19e6535 944 vport.port_no = port_no;
da467899 945#ifdef _WIN32
921c370a
EG
946 if (!strcmp(dpif_port.type, "internal")) {
947 if (!delete_wmi_port(dpif_port.name)) {
da467899 948 VLOG_ERR("Could not delete wmi port with name: %s",
921c370a 949 dpif_port.name);
da467899
AS
950 };
951 }
952#endif
93451a0a 953 error = dpif_netlink_vport_transact(&vport, NULL, NULL);
773cd538 954
1579cf67 955 vport_del_channels(dpif, port_no);
989fd548 956
921c370a
EG
957 if (!error && !ovs_tunnels_out_of_tree) {
958 error = dpif_netlink_rtnl_port_destroy(dpif_port.name, dpif_port.type);
959 if (error == EOPNOTSUPP) {
960 error = 0;
961 }
962 }
963
964 dpif_port_destroy(&dpif_port);
965
773cd538 966 return error;
c3827f61 967}
3abc4a1a 968
9fafa796 969static int
93451a0a 970dpif_netlink_port_del(struct dpif *dpif_, odp_port_t port_no)
9fafa796 971{
93451a0a 972 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
9fafa796
BP
973 int error;
974
1579cf67 975 fat_rwlock_wrlock(&dpif->upcall_lock);
93451a0a 976 error = dpif_netlink_port_del__(dpif, port_no);
1579cf67 977 fat_rwlock_unlock(&dpif->upcall_lock);
9fafa796
BP
978
979 return error;
980}
981
c3827f61 982static int
93451a0a
AS
983dpif_netlink_port_query__(const struct dpif_netlink *dpif, odp_port_t port_no,
984 const char *port_name, struct dpif_port *dpif_port)
c3827f61 985{
93451a0a
AS
986 struct dpif_netlink_vport request;
987 struct dpif_netlink_vport reply;
c19e6535 988 struct ofpbuf *buf;
4c738a8d
BP
989 int error;
990
93451a0a 991 dpif_netlink_vport_init(&request);
df2c07f4 992 request.cmd = OVS_VPORT_CMD_GET;
9b00386b 993 request.dp_ifindex = dpif->dp_ifindex;
c19e6535
BP
994 request.port_no = port_no;
995 request.name = port_name;
4c738a8d 996
93451a0a 997 error = dpif_netlink_vport_transact(&request, &reply, &buf);
c19e6535 998 if (!error) {
33db1592
BP
999 if (reply.dp_ifindex != request.dp_ifindex) {
1000 /* A query by name reported that 'port_name' is in some datapath
1001 * other than 'dpif', but the caller wants to know about 'dpif'. */
1002 error = ENODEV;
4afba28d 1003 } else if (dpif_port) {
33db1592 1004 dpif_port->name = xstrdup(reply.name);
b9ad7294 1005 dpif_port->type = xstrdup(get_vport_type(&reply));
33db1592
BP
1006 dpif_port->port_no = reply.port_no;
1007 }
c19e6535 1008 ofpbuf_delete(buf);
3abc4a1a 1009 }
c19e6535 1010 return error;
96fba48f
BP
1011}
1012
1013static int
93451a0a
AS
1014dpif_netlink_port_query_by_number(const struct dpif *dpif_, odp_port_t port_no,
1015 struct dpif_port *dpif_port)
96fba48f 1016{
93451a0a 1017 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
9b00386b 1018
93451a0a 1019 return dpif_netlink_port_query__(dpif, port_no, NULL, dpif_port);
96fba48f
BP
1020}
1021
/* Looks up the port named 'devname' in 'dpif_' by delegating to
 * dpif_netlink_port_query__() with a port number of 0. */
static int
dpif_netlink_port_query_by_name(const struct dpif *dpif_, const char *devname,
                                struct dpif_port *dpif_port)
{
    struct dpif_netlink *nl_dpif = dpif_netlink_cast(dpif_);

    return dpif_netlink_port_query__(nl_dpif, 0, devname, dpif_port);
}
1030
98403001 1031static uint32_t
93451a0a 1032dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif,
769b5034 1033 odp_port_t port_no)
b90de034 1034 OVS_REQ_RDLOCK(dpif->upcall_lock)
98403001 1035{
4e022ec0 1036 uint32_t port_idx = odp_to_u32(port_no);
9fafa796 1037 uint32_t pid = 0;
98403001 1038
f8fc5489 1039 if (dpif->handlers && dpif->uc_array_size > 0) {
4e022ec0 1040 /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s
989fd548 1041 * channel, since it is not heavily loaded. */
4e022ec0 1042 uint32_t idx = port_idx >= dpif->uc_array_size ? 0 : port_idx;
1579cf67 1043
17f2748d
AW
1044 /* Needs to check in case the socket pointer is changed in between
1045 * the holding of upcall_lock. A known case happens when the main
1046 * thread deletes the vport while the handler thread is handling
1047 * the upcall from that port. */
69c51582
MC
1048 if (dpif->channels[idx].sock) {
1049 pid = nl_sock_pid(dpif->channels[idx].sock);
17f2748d 1050 }
98403001 1051 }
9fafa796
BP
1052
1053 return pid;
98403001
BP
1054}
1055
b90de034 1056static uint32_t
769b5034 1057dpif_netlink_port_get_pid(const struct dpif *dpif_, odp_port_t port_no)
b90de034 1058{
93451a0a 1059 const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
b90de034
AW
1060 uint32_t ret;
1061
1062 fat_rwlock_rdlock(&dpif->upcall_lock);
769b5034 1063 ret = dpif_netlink_port_get_pid__(dpif, port_no);
b90de034
AW
1064 fat_rwlock_unlock(&dpif->upcall_lock);
1065
1066 return ret;
1067}
1068
96fba48f 1069static int
93451a0a 1070dpif_netlink_flow_flush(struct dpif *dpif_)
96fba48f 1071{
93451a0a
AS
1072 const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
1073 struct dpif_netlink_flow flow;
37a1300c 1074
93451a0a 1075 dpif_netlink_flow_init(&flow);
df2c07f4 1076 flow.cmd = OVS_FLOW_CMD_DEL;
254f2dc8 1077 flow.dp_ifindex = dpif->dp_ifindex;
f7dde6df
PB
1078
1079 if (netdev_is_flow_api_enabled()) {
dfaf79dd 1080 netdev_ports_flow_flush(dpif_->dpif_class);
f7dde6df
PB
1081 }
1082
93451a0a 1083 return dpif_netlink_flow_transact(&flow, NULL, NULL);
96fba48f
BP
1084}
1085
93451a0a 1086struct dpif_netlink_port_state {
f0fef760 1087 struct nl_dump dump;
d57695d7 1088 struct ofpbuf buf;
c19e6535
BP
1089};
1090
222837c4 1091static void
93451a0a
AS
1092dpif_netlink_port_dump_start__(const struct dpif_netlink *dpif,
1093 struct nl_dump *dump)
96fba48f 1094{
93451a0a 1095 struct dpif_netlink_vport request;
f0fef760
BP
1096 struct ofpbuf *buf;
1097
93451a0a 1098 dpif_netlink_vport_init(&request);
067f1e23 1099 request.cmd = OVS_VPORT_CMD_GET;
254f2dc8 1100 request.dp_ifindex = dpif->dp_ifindex;
f0fef760
BP
1101
1102 buf = ofpbuf_new(1024);
93451a0a 1103 dpif_netlink_vport_to_ofpbuf(&request, buf);
222837c4 1104 nl_dump_start(dump, NETLINK_GENERIC, buf);
f0fef760 1105 ofpbuf_delete(buf);
222837c4
BP
1106}
1107
1108static int
93451a0a 1109dpif_netlink_port_dump_start(const struct dpif *dpif_, void **statep)
222837c4 1110{
93451a0a
AS
1111 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
1112 struct dpif_netlink_port_state *state;
222837c4
BP
1113
1114 *statep = state = xmalloc(sizeof *state);
93451a0a 1115 dpif_netlink_port_dump_start__(dpif, &state->dump);
f0fef760 1116
d57695d7 1117 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
b0ec0f27
BP
1118 return 0;
1119}
1120
7c1ef244 1121static int
93451a0a
AS
1122dpif_netlink_port_dump_next__(const struct dpif_netlink *dpif,
1123 struct nl_dump *dump,
1124 struct dpif_netlink_vport *vport,
1125 struct ofpbuf *buffer)
222837c4 1126{
222837c4
BP
1127 struct ofpbuf buf;
1128 int error;
1129
d57695d7 1130 if (!nl_dump_next(dump, &buf, buffer)) {
222837c4
BP
1131 return EOF;
1132 }
1133
93451a0a 1134 error = dpif_netlink_vport_from_ofpbuf(vport, &buf);
222837c4
BP
1135 if (error) {
1136 VLOG_WARN_RL(&error_rl, "%s: failed to parse vport record (%s)",
1137 dpif_name(&dpif->dpif), ovs_strerror(error));
1138 }
1139 return error;
1140}
1141
b0ec0f27 1142static int
93451a0a
AS
1143dpif_netlink_port_dump_next(const struct dpif *dpif_, void *state_,
1144 struct dpif_port *dpif_port)
b0ec0f27 1145{
93451a0a
AS
1146 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
1147 struct dpif_netlink_port_state *state = state_;
1148 struct dpif_netlink_vport vport;
96fba48f
BP
1149 int error;
1150
93451a0a
AS
1151 error = dpif_netlink_port_dump_next__(dpif, &state->dump, &vport,
1152 &state->buf);
c3827f61 1153 if (error) {
f0fef760 1154 return error;
c3827f61 1155 }
ebc56baa 1156 dpif_port->name = CONST_CAST(char *, vport.name);
b9ad7294 1157 dpif_port->type = CONST_CAST(char *, get_vport_type(&vport));
f0fef760
BP
1158 dpif_port->port_no = vport.port_no;
1159 return 0;
b0ec0f27
BP
1160}
1161
1162static int
93451a0a 1163dpif_netlink_port_dump_done(const struct dpif *dpif_ OVS_UNUSED, void *state_)
b0ec0f27 1164{
93451a0a 1165 struct dpif_netlink_port_state *state = state_;
f0fef760 1166 int error = nl_dump_done(&state->dump);
8522b383 1167
d57695d7 1168 ofpbuf_uninit(&state->buf);
b0ec0f27 1169 free(state);
f0fef760 1170 return error;
96fba48f
BP
1171}
1172
e9e28be3 1173static int
93451a0a 1174dpif_netlink_port_poll(const struct dpif *dpif_, char **devnamep)
e9e28be3 1175{
93451a0a 1176 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
e9e28be3 1177
e4516b20
BP
1178 /* Lazily create the Netlink socket to listen for notifications. */
1179 if (!dpif->port_notifier) {
1180 struct nl_sock *sock;
1181 int error;
1182
1183 error = nl_sock_create(NETLINK_GENERIC, &sock);
1184 if (error) {
1185 return error;
1186 }
1187
1188 error = nl_sock_join_mcgroup(sock, ovs_vport_mcgroup);
1189 if (error) {
1190 nl_sock_destroy(sock);
1191 return error;
1192 }
1193 dpif->port_notifier = sock;
1194
1195 /* We have no idea of the current state so report that everything
1196 * changed. */
1197 return ENOBUFS;
1198 }
1199
1200 for (;;) {
1201 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
1202 uint64_t buf_stub[4096 / 8];
1203 struct ofpbuf buf;
1204 int error;
1205
1206 ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
a86bd14e 1207 error = nl_sock_recv(dpif->port_notifier, &buf, NULL, false);
e4516b20 1208 if (!error) {
93451a0a 1209 struct dpif_netlink_vport vport;
e4516b20 1210
93451a0a 1211 error = dpif_netlink_vport_from_ofpbuf(&vport, &buf);
e4516b20
BP
1212 if (!error) {
1213 if (vport.dp_ifindex == dpif->dp_ifindex
1214 && (vport.cmd == OVS_VPORT_CMD_NEW
1215 || vport.cmd == OVS_VPORT_CMD_DEL
1216 || vport.cmd == OVS_VPORT_CMD_SET)) {
1217 VLOG_DBG("port_changed: dpif:%s vport:%s cmd:%"PRIu8,
1218 dpif->dpif.full_name, vport.name, vport.cmd);
1579cf67 1219 if (vport.cmd == OVS_VPORT_CMD_DEL && dpif->handlers) {
61eae437
BP
1220 dpif->refresh_channels = true;
1221 }
e4516b20 1222 *devnamep = xstrdup(vport.name);
59e0c910 1223 ofpbuf_uninit(&buf);
e4516b20 1224 return 0;
e4516b20
BP
1225 }
1226 }
59e0c910
BP
1227 } else if (error != EAGAIN) {
1228 VLOG_WARN_RL(&rl, "error reading or parsing netlink (%s)",
1229 ovs_strerror(error));
1230 nl_sock_drain(dpif->port_notifier);
1231 error = ENOBUFS;
e4516b20
BP
1232 }
1233
59e0c910
BP
1234 ofpbuf_uninit(&buf);
1235 if (error) {
1236 return error;
1237 }
e9e28be3 1238 }
e9e28be3
BP
1239}
1240
1241static void
93451a0a 1242dpif_netlink_port_poll_wait(const struct dpif *dpif_)
e9e28be3 1243{
93451a0a 1244 const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
e4516b20
BP
1245
1246 if (dpif->port_notifier) {
1247 nl_sock_wait(dpif->port_notifier, POLLIN);
1248 } else {
e9e28be3 1249 poll_immediate_wake();
e9e28be3
BP
1250 }
1251}
1252
6fe09f8c 1253static void
70e5ed6f
JS
1254dpif_netlink_flow_init_ufid(struct dpif_netlink_flow *request,
1255 const ovs_u128 *ufid, bool terse)
1256{
1257 if (ufid) {
1258 request->ufid = *ufid;
1259 request->ufid_present = true;
1260 } else {
1261 request->ufid_present = false;
1262 }
1263 request->ufid_terse = terse;
1264}
1265
1266static void
1267dpif_netlink_init_flow_get__(const struct dpif_netlink *dpif,
1268 const struct nlattr *key, size_t key_len,
1269 const ovs_u128 *ufid, bool terse,
1270 struct dpif_netlink_flow *request)
96fba48f 1271{
93451a0a 1272 dpif_netlink_flow_init(request);
6fe09f8c
JS
1273 request->cmd = OVS_FLOW_CMD_GET;
1274 request->dp_ifindex = dpif->dp_ifindex;
1275 request->key = key;
1276 request->key_len = key_len;
70e5ed6f
JS
1277 dpif_netlink_flow_init_ufid(request, ufid, terse);
1278}
1279
1280static void
1281dpif_netlink_init_flow_get(const struct dpif_netlink *dpif,
1282 const struct dpif_flow_get *get,
1283 struct dpif_netlink_flow *request)
1284{
1285 dpif_netlink_init_flow_get__(dpif, get->key, get->key_len, get->ufid,
1286 false, request);
30053024
BP
1287}
1288
1289static int
70e5ed6f
JS
1290dpif_netlink_flow_get__(const struct dpif_netlink *dpif,
1291 const struct nlattr *key, size_t key_len,
1292 const ovs_u128 *ufid, bool terse,
1293 struct dpif_netlink_flow *reply, struct ofpbuf **bufp)
30053024 1294{
93451a0a 1295 struct dpif_netlink_flow request;
30053024 1296
70e5ed6f 1297 dpif_netlink_init_flow_get__(dpif, key, key_len, ufid, terse, &request);
93451a0a 1298 return dpif_netlink_flow_transact(&request, reply, bufp);
96fba48f
BP
1299}
1300
70e5ed6f
JS
1301static int
1302dpif_netlink_flow_get(const struct dpif_netlink *dpif,
1303 const struct dpif_netlink_flow *flow,
1304 struct dpif_netlink_flow *reply, struct ofpbuf **bufp)
1305{
1306 return dpif_netlink_flow_get__(dpif, flow->key, flow->key_len,
1307 flow->ufid_present ? &flow->ufid : NULL,
1308 false, reply, bufp);
1309}
1310
6bc60024 1311static void
93451a0a
AS
1312dpif_netlink_init_flow_put(struct dpif_netlink *dpif,
1313 const struct dpif_flow_put *put,
1314 struct dpif_netlink_flow *request)
6bc60024 1315{
d64e176c 1316 static const struct nlattr dummy_action;
6bc60024 1317
93451a0a 1318 dpif_netlink_flow_init(request);
89625d1e 1319 request->cmd = (put->flags & DPIF_FP_CREATE
6bc60024
BP
1320 ? OVS_FLOW_CMD_NEW : OVS_FLOW_CMD_SET);
1321 request->dp_ifindex = dpif->dp_ifindex;
89625d1e
BP
1322 request->key = put->key;
1323 request->key_len = put->key_len;
e6cc0bab
AZ
1324 request->mask = put->mask;
1325 request->mask_len = put->mask_len;
70e5ed6f
JS
1326 dpif_netlink_flow_init_ufid(request, put->ufid, false);
1327
6bc60024 1328 /* Ensure that OVS_FLOW_ATTR_ACTIONS will always be included. */
d64e176c
BP
1329 request->actions = (put->actions
1330 ? put->actions
1331 : CONST_CAST(struct nlattr *, &dummy_action));
89625d1e
BP
1332 request->actions_len = put->actions_len;
1333 if (put->flags & DPIF_FP_ZERO_STATS) {
6bc60024
BP
1334 request->clear = true;
1335 }
43f9ac0a
JR
1336 if (put->flags & DPIF_FP_PROBE) {
1337 request->probe = true;
1338 }
89625d1e 1339 request->nlmsg_flags = put->flags & DPIF_FP_MODIFY ? 0 : NLM_F_CREATE;
6bc60024
BP
1340}
1341
b99d3cee 1342static void
70e5ed6f
JS
1343dpif_netlink_init_flow_del__(struct dpif_netlink *dpif,
1344 const struct nlattr *key, size_t key_len,
1345 const ovs_u128 *ufid, bool terse,
1346 struct dpif_netlink_flow *request)
96fba48f 1347{
93451a0a 1348 dpif_netlink_flow_init(request);
b99d3cee
BP
1349 request->cmd = OVS_FLOW_CMD_DEL;
1350 request->dp_ifindex = dpif->dp_ifindex;
70e5ed6f
JS
1351 request->key = key;
1352 request->key_len = key_len;
1353 dpif_netlink_flow_init_ufid(request, ufid, terse);
1354}
1355
1356static void
1357dpif_netlink_init_flow_del(struct dpif_netlink *dpif,
1358 const struct dpif_flow_del *del,
1359 struct dpif_netlink_flow *request)
1360{
37382aa6
AS
1361 dpif_netlink_init_flow_del__(dpif, del->key, del->key_len,
1362 del->ufid, del->terse, request);
70e5ed6f
JS
1363}
1364
93451a0a 1365struct dpif_netlink_flow_dump {
ac64794a
BP
1366 struct dpif_flow_dump up;
1367 struct nl_dump nl_dump;
d2ad7ef1 1368 atomic_int status;
f2280b41
PB
1369 struct netdev_flow_dump **netdev_dumps;
1370 int netdev_dumps_num; /* Number of netdev_flow_dumps */
1371 struct ovs_mutex netdev_lock; /* Guards the following. */
1372 int netdev_current_dump OVS_GUARDED; /* Shared current dump */
a692410a 1373 struct dpif_flow_dump_types types; /* Type of dump */
e723fd32
JS
1374};
1375
93451a0a
AS
1376static struct dpif_netlink_flow_dump *
1377dpif_netlink_flow_dump_cast(struct dpif_flow_dump *dump)
e723fd32 1378{
93451a0a 1379 return CONTAINER_OF(dump, struct dpif_netlink_flow_dump, up);
e723fd32
JS
1380}
1381
f2280b41
PB
1382static void
1383start_netdev_dump(const struct dpif *dpif_,
1384 struct dpif_netlink_flow_dump *dump)
1385{
1386 ovs_mutex_init(&dump->netdev_lock);
1387
a692410a 1388 if (!(dump->types.netdev_flows)) {
f2280b41
PB
1389 dump->netdev_dumps_num = 0;
1390 dump->netdev_dumps = NULL;
1391 return;
1392 }
1393
1394 ovs_mutex_lock(&dump->netdev_lock);
1395 dump->netdev_current_dump = 0;
1396 dump->netdev_dumps
dfaf79dd 1397 = netdev_ports_flow_dump_create(dpif_->dpif_class,
f2280b41
PB
1398 &dump->netdev_dumps_num);
1399 ovs_mutex_unlock(&dump->netdev_lock);
1400}
1401
a692410a
GT
1402static void
1403dpif_netlink_populate_flow_dump_types(struct dpif_netlink_flow_dump *dump,
1404 struct dpif_flow_dump_types *types)
1405{
1406 if (!types) {
1407 dump->types.ovs_flows = true;
1408 dump->types.netdev_flows = true;
1409 } else {
1410 memcpy(&dump->types, types, sizeof *types);
494a7455 1411 }
7e8b7199
PB
1412}
1413
ac64794a 1414static struct dpif_flow_dump *
7e8b7199 1415dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse,
a692410a 1416 struct dpif_flow_dump_types *types)
96fba48f 1417{
93451a0a
AS
1418 const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
1419 struct dpif_netlink_flow_dump *dump;
1420 struct dpif_netlink_flow request;
37a1300c
BP
1421 struct ofpbuf *buf;
1422
ac64794a
BP
1423 dump = xmalloc(sizeof *dump);
1424 dpif_flow_dump_init(&dump->up, dpif_);
37a1300c 1425
a692410a 1426 dpif_netlink_populate_flow_dump_types(dump, types);
37a1300c 1427
a692410a 1428 if (dump->types.ovs_flows) {
7e8b7199
PB
1429 dpif_netlink_flow_init(&request);
1430 request.cmd = OVS_FLOW_CMD_GET;
1431 request.dp_ifindex = dpif->dp_ifindex;
1432 request.ufid_present = false;
1433 request.ufid_terse = terse;
1434
1435 buf = ofpbuf_new(1024);
1436 dpif_netlink_flow_to_ofpbuf(&request, buf);
1437 nl_dump_start(&dump->nl_dump, NETLINK_GENERIC, buf);
1438 ofpbuf_delete(buf);
1439 }
ac64794a 1440 atomic_init(&dump->status, 0);
64bb477f 1441 dump->up.terse = terse;
30053024 1442
f2280b41
PB
1443 start_netdev_dump(dpif_, dump);
1444
ac64794a 1445 return &dump->up;
704a1e09
BP
1446}
1447
1448static int
93451a0a 1449dpif_netlink_flow_dump_destroy(struct dpif_flow_dump *dump_)
704a1e09 1450{
93451a0a 1451 struct dpif_netlink_flow_dump *dump = dpif_netlink_flow_dump_cast(dump_);
7e8b7199 1452 unsigned int nl_status = 0;
ac64794a 1453 int dump_status;
96fba48f 1454
a692410a 1455 if (dump->types.ovs_flows) {
7e8b7199
PB
1456 nl_status = nl_dump_done(&dump->nl_dump);
1457 }
1458
f2280b41
PB
1459 for (int i = 0; i < dump->netdev_dumps_num; i++) {
1460 int err = netdev_flow_dump_destroy(dump->netdev_dumps[i]);
1461
1462 if (err != 0 && err != EOPNOTSUPP) {
1463 VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err));
1464 }
1465 }
1466
1467 free(dump->netdev_dumps);
1468 ovs_mutex_destroy(&dump->netdev_lock);
1469
7424fc44
JR
1470 /* No other thread has access to 'dump' at this point. */
1471 atomic_read_relaxed(&dump->status, &dump_status);
ac64794a
BP
1472 free(dump);
1473 return dump_status ? dump_status : nl_status;
1474}
feebdea2 1475
93451a0a 1476struct dpif_netlink_flow_dump_thread {
ac64794a 1477 struct dpif_flow_dump_thread up;
93451a0a
AS
1478 struct dpif_netlink_flow_dump *dump;
1479 struct dpif_netlink_flow flow;
ac64794a
BP
1480 struct dpif_flow_stats stats;
1481 struct ofpbuf nl_flows; /* Always used to store flows. */
1482 struct ofpbuf *nl_actions; /* Used if kernel does not supply actions. */
f2280b41
PB
1483 int netdev_dump_idx; /* This thread current netdev dump index */
1484 bool netdev_done; /* If we are finished dumping netdevs */
1485
1486 /* (Key/Mask/Actions) Buffers for netdev dumping */
1487 struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
1488 struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
1489 struct odputil_keybuf actbuf[FLOW_DUMP_MAX_BATCH];
ac64794a
BP
1490};
1491
93451a0a
AS
1492static struct dpif_netlink_flow_dump_thread *
1493dpif_netlink_flow_dump_thread_cast(struct dpif_flow_dump_thread *thread)
ac64794a 1494{
93451a0a 1495 return CONTAINER_OF(thread, struct dpif_netlink_flow_dump_thread, up);
ac64794a
BP
1496}
1497
1498static struct dpif_flow_dump_thread *
93451a0a 1499dpif_netlink_flow_dump_thread_create(struct dpif_flow_dump *dump_)
ac64794a 1500{
93451a0a
AS
1501 struct dpif_netlink_flow_dump *dump = dpif_netlink_flow_dump_cast(dump_);
1502 struct dpif_netlink_flow_dump_thread *thread;
ac64794a
BP
1503
1504 thread = xmalloc(sizeof *thread);
1505 dpif_flow_dump_thread_init(&thread->up, &dump->up);
1506 thread->dump = dump;
1507 ofpbuf_init(&thread->nl_flows, NL_DUMP_BUFSIZE);
1508 thread->nl_actions = NULL;
f2280b41
PB
1509 thread->netdev_dump_idx = 0;
1510 thread->netdev_done = !(thread->netdev_dump_idx < dump->netdev_dumps_num);
ac64794a
BP
1511
1512 return &thread->up;
1513}
1514
1515static void
93451a0a 1516dpif_netlink_flow_dump_thread_destroy(struct dpif_flow_dump_thread *thread_)
ac64794a 1517{
93451a0a
AS
1518 struct dpif_netlink_flow_dump_thread *thread
1519 = dpif_netlink_flow_dump_thread_cast(thread_);
ac64794a
BP
1520
1521 ofpbuf_uninit(&thread->nl_flows);
1522 ofpbuf_delete(thread->nl_actions);
1523 free(thread);
1524}
1525
1526static void
7af12bd7 1527dpif_netlink_flow_to_dpif_flow(struct dpif *dpif, struct dpif_flow *dpif_flow,
7fe98598 1528 const struct dpif_netlink_flow *datapath_flow)
ac64794a 1529{
7fe98598
NR
1530 dpif_flow->key = datapath_flow->key;
1531 dpif_flow->key_len = datapath_flow->key_len;
1532 dpif_flow->mask = datapath_flow->mask;
1533 dpif_flow->mask_len = datapath_flow->mask_len;
1534 dpif_flow->actions = datapath_flow->actions;
1535 dpif_flow->actions_len = datapath_flow->actions_len;
70e5ed6f 1536 dpif_flow->ufid_present = datapath_flow->ufid_present;
ec97c2df 1537 dpif_flow->pmd_id = PMD_ID_NULL;
70e5ed6f
JS
1538 if (datapath_flow->ufid_present) {
1539 dpif_flow->ufid = datapath_flow->ufid;
1540 } else {
1541 ovs_assert(datapath_flow->key && datapath_flow->key_len);
1542 dpif_flow_hash(dpif, datapath_flow->key, datapath_flow->key_len,
1543 &dpif_flow->ufid);
1544 }
7fe98598 1545 dpif_netlink_flow_get_stats(datapath_flow, &dpif_flow->stats);
d63ca532
GT
1546 dpif_flow->attrs.offloaded = false;
1547 dpif_flow->attrs.dp_layer = "ovs";
ac64794a
BP
1548}
1549
f2280b41
PB
1550/* The design is such that all threads are working together on the first dump
1551 * to the last, in order (at first they all on dump 0).
1552 * When the first thread finds that the given dump is finished,
1553 * they all move to the next. If two or more threads find the same dump
1554 * is finished at the same time, the first one will advance the shared
1555 * netdev_current_dump and the others will catch up. */
1556static void
1557dpif_netlink_advance_netdev_dump(struct dpif_netlink_flow_dump_thread *thread)
1558{
1559 struct dpif_netlink_flow_dump *dump = thread->dump;
1560
1561 ovs_mutex_lock(&dump->netdev_lock);
1562 /* if we haven't finished (dumped everything) */
1563 if (dump->netdev_current_dump < dump->netdev_dumps_num) {
1564 /* if we are the first to find that current dump is finished
1565 * advance it. */
1566 if (thread->netdev_dump_idx == dump->netdev_current_dump) {
1567 thread->netdev_dump_idx = ++dump->netdev_current_dump;
1568 /* did we just finish the last dump? done. */
1569 if (dump->netdev_current_dump == dump->netdev_dumps_num) {
1570 thread->netdev_done = true;
1571 }
1572 } else {
1573 /* otherwise, we are behind, catch up */
1574 thread->netdev_dump_idx = dump->netdev_current_dump;
1575 }
1576 } else {
1577 /* some other thread finished */
1578 thread->netdev_done = true;
1579 }
1580 ovs_mutex_unlock(&dump->netdev_lock);
1581}
1582
1583static int
1584dpif_netlink_netdev_match_to_dpif_flow(struct match *match,
1585 struct ofpbuf *key_buf,
1586 struct ofpbuf *mask_buf,
1587 struct nlattr *actions,
1588 struct dpif_flow_stats *stats,
d63ca532 1589 struct dpif_flow_attrs *attrs,
f2280b41
PB
1590 ovs_u128 *ufid,
1591 struct dpif_flow *flow,
1592 bool terse OVS_UNUSED)
1593{
1594
1595 struct odp_flow_key_parms odp_parms = {
1596 .flow = &match->flow,
1597 .mask = &match->wc.masks,
1598 .support = {
f9885dc5 1599 .max_vlan_headers = 2,
f2280b41
PB
1600 },
1601 };
1602 size_t offset;
1603
1604 memset(flow, 0, sizeof *flow);
1605
1606 /* Key */
1607 offset = key_buf->size;
1608 flow->key = ofpbuf_tail(key_buf);
1609 odp_flow_key_from_flow(&odp_parms, key_buf);
1610 flow->key_len = key_buf->size - offset;
1611
1612 /* Mask */
1613 offset = mask_buf->size;
1614 flow->mask = ofpbuf_tail(mask_buf);
1615 odp_parms.key_buf = key_buf;
1616 odp_flow_key_from_mask(&odp_parms, mask_buf);
1617 flow->mask_len = mask_buf->size - offset;
1618
1619 /* Actions */
1620 flow->actions = nl_attr_get(actions);
1621 flow->actions_len = nl_attr_get_size(actions);
1622
1623 /* Stats */
1624 memcpy(&flow->stats, stats, sizeof *stats);
1625
1626 /* UFID */
1627 flow->ufid_present = true;
1628 flow->ufid = *ufid;
1629
1630 flow->pmd_id = PMD_ID_NULL;
4742003c 1631
d63ca532 1632 memcpy(&flow->attrs, attrs, sizeof *attrs);
4742003c 1633
f2280b41
PB
1634 return 0;
1635}
1636
ac64794a 1637static int
93451a0a
AS
1638dpif_netlink_flow_dump_next(struct dpif_flow_dump_thread *thread_,
1639 struct dpif_flow *flows, int max_flows)
ac64794a 1640{
93451a0a
AS
1641 struct dpif_netlink_flow_dump_thread *thread
1642 = dpif_netlink_flow_dump_thread_cast(thread_);
1643 struct dpif_netlink_flow_dump *dump = thread->dump;
1644 struct dpif_netlink *dpif = dpif_netlink_cast(thread->up.dpif);
ac64794a
BP
1645 int n_flows;
1646
1647 ofpbuf_delete(thread->nl_actions);
1648 thread->nl_actions = NULL;
1649
1650 n_flows = 0;
f2280b41
PB
1651 max_flows = MIN(max_flows, FLOW_DUMP_MAX_BATCH);
1652
1653 while (!thread->netdev_done && n_flows < max_flows) {
1654 struct odputil_keybuf *maskbuf = &thread->maskbuf[n_flows];
1655 struct odputil_keybuf *keybuf = &thread->keybuf[n_flows];
1656 struct odputil_keybuf *actbuf = &thread->actbuf[n_flows];
1657 struct ofpbuf key, mask, act;
1658 struct dpif_flow *f = &flows[n_flows];
1659 int cur = thread->netdev_dump_idx;
1660 struct netdev_flow_dump *netdev_dump = dump->netdev_dumps[cur];
1661 struct match match;
1662 struct nlattr *actions;
1663 struct dpif_flow_stats stats;
d63ca532 1664 struct dpif_flow_attrs attrs;
f2280b41
PB
1665 ovs_u128 ufid;
1666 bool has_next;
1667
1668 ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
1669 ofpbuf_use_stack(&act, actbuf, sizeof *actbuf);
1670 ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
1671 has_next = netdev_flow_dump_next(netdev_dump, &match,
d63ca532 1672 &actions, &stats, &attrs,
f2280b41
PB
1673 &ufid,
1674 &thread->nl_flows,
1675 &act);
1676 if (has_next) {
1677 dpif_netlink_netdev_match_to_dpif_flow(&match,
1678 &key, &mask,
1679 actions,
1680 &stats,
d63ca532 1681 &attrs,
f2280b41
PB
1682 &ufid,
1683 f,
1684 dump->up.terse);
1685 n_flows++;
1686 } else {
1687 dpif_netlink_advance_netdev_dump(thread);
1688 }
1689 }
1690
a692410a 1691 if (!(dump->types.ovs_flows)) {
7e8b7199
PB
1692 return n_flows;
1693 }
1694
ac64794a 1695 while (!n_flows
6fd6ed71 1696 || (n_flows < max_flows && thread->nl_flows.size)) {
7fe98598 1697 struct dpif_netlink_flow datapath_flow;
ac64794a
BP
1698 struct ofpbuf nl_flow;
1699 int error;
1700
1701 /* Try to grab another flow. */
1702 if (!nl_dump_next(&dump->nl_dump, &nl_flow, &thread->nl_flows)) {
1703 break;
feebdea2 1704 }
30053024 1705
ac64794a 1706 /* Convert the flow to our output format. */
7fe98598 1707 error = dpif_netlink_flow_from_ofpbuf(&datapath_flow, &nl_flow);
30053024 1708 if (error) {
7424fc44 1709 atomic_store_relaxed(&dump->status, error);
ac64794a 1710 break;
feebdea2 1711 }
30053024 1712
64bb477f
JS
1713 if (dump->up.terse || datapath_flow.actions) {
1714 /* Common case: we don't want actions, or the flow includes
1715 * actions. */
7af12bd7
JS
1716 dpif_netlink_flow_to_dpif_flow(&dpif->dpif, &flows[n_flows++],
1717 &datapath_flow);
ac64794a
BP
1718 } else {
1719 /* Rare case: the flow does not include actions. Retrieve this
1720 * individual flow again to get the actions. */
70e5ed6f 1721 error = dpif_netlink_flow_get(dpif, &datapath_flow,
7fe98598 1722 &datapath_flow, &thread->nl_actions);
30053024
BP
1723 if (error == ENOENT) {
1724 VLOG_DBG("dumped flow disappeared on get");
ac64794a 1725 continue;
30053024 1726 } else if (error) {
10a89ef0
BP
1727 VLOG_WARN("error fetching dumped flow: %s",
1728 ovs_strerror(error));
7424fc44 1729 atomic_store_relaxed(&dump->status, error);
ac64794a 1730 break;
30053024 1731 }
30053024 1732
ac64794a
BP
1733 /* Save this flow. Then exit, because we only have one buffer to
1734 * handle this case. */
7af12bd7
JS
1735 dpif_netlink_flow_to_dpif_flow(&dpif->dpif, &flows[n_flows++],
1736 &datapath_flow);
ac64794a
BP
1737 break;
1738 }
feebdea2 1739 }
ac64794a 1740 return n_flows;
96fba48f
BP
1741}
1742
eabe7c68 1743static void
93451a0a
AS
1744dpif_netlink_encode_execute(int dp_ifindex, const struct dpif_execute *d_exec,
1745 struct ofpbuf *buf)
96fba48f 1746{
89625d1e 1747 struct ovs_header *k_exec;
758c456d 1748 size_t key_ofs;
f7cd0081 1749
eabe7c68 1750 ofpbuf_prealloc_tailroom(buf, (64
cf62fa4c 1751 + dp_packet_size(d_exec->packet)
758c456d 1752 + ODP_KEY_METADATA_SIZE
eabe7c68 1753 + d_exec->actions_len));
f7cd0081 1754
df2c07f4 1755 nl_msg_put_genlmsghdr(buf, 0, ovs_packet_family, NLM_F_REQUEST,
69685a88 1756 OVS_PACKET_CMD_EXECUTE, OVS_PACKET_VERSION);
f7cd0081 1757
89625d1e
BP
1758 k_exec = ofpbuf_put_uninit(buf, sizeof *k_exec);
1759 k_exec->dp_ifindex = dp_ifindex;
f7cd0081 1760
89625d1e 1761 nl_msg_put_unspec(buf, OVS_PACKET_ATTR_PACKET,
cf62fa4c
PS
1762 dp_packet_data(d_exec->packet),
1763 dp_packet_size(d_exec->packet));
758c456d
JR
1764
1765 key_ofs = nl_msg_start_nested(buf, OVS_PACKET_ATTR_KEY);
beb75a40 1766 odp_key_from_dp_packet(buf, d_exec->packet);
758c456d
JR
1767 nl_msg_end_nested(buf, key_ofs);
1768
89625d1e
BP
1769 nl_msg_put_unspec(buf, OVS_PACKET_ATTR_ACTIONS,
1770 d_exec->actions, d_exec->actions_len);
43f9ac0a 1771 if (d_exec->probe) {
2e460098 1772 nl_msg_put_flag(buf, OVS_PACKET_ATTR_PROBE);
43f9ac0a 1773 }
27130224
AZ
1774 if (d_exec->mtu) {
1775 nl_msg_put_u16(buf, OVS_PACKET_ATTR_MRU, d_exec->mtu);
1776 }
6bc60024
BP
1777}
1778
0f3358ea
BP
1779/* Executes, against 'dpif', up to the first 'n_ops' operations in 'ops'.
1780 * Returns the number actually executed (at least 1, if 'n_ops' is
1781 * positive). */
1782static size_t
93451a0a
AS
1783dpif_netlink_operate__(struct dpif_netlink *dpif,
1784 struct dpif_op **ops, size_t n_ops)
6bc60024 1785{
eabe7c68
BP
1786 struct op_auxdata {
1787 struct nl_transaction txn;
72d32ac0 1788
eabe7c68
BP
1789 struct ofpbuf request;
1790 uint64_t request_stub[1024 / 8];
72d32ac0
BP
1791
1792 struct ofpbuf reply;
1793 uint64_t reply_stub[1024 / 8];
8b668ee3 1794 } auxes[OPERATE_MAX_OPS];
eabe7c68 1795
8b668ee3 1796 struct nl_transaction *txnsp[OPERATE_MAX_OPS];
6bc60024
BP
1797 size_t i;
1798
8b668ee3 1799 n_ops = MIN(n_ops, OPERATE_MAX_OPS);
6bc60024 1800 for (i = 0; i < n_ops; i++) {
eabe7c68 1801 struct op_auxdata *aux = &auxes[i];
c2b565b5 1802 struct dpif_op *op = ops[i];
b99d3cee
BP
1803 struct dpif_flow_put *put;
1804 struct dpif_flow_del *del;
6fe09f8c 1805 struct dpif_flow_get *get;
93451a0a 1806 struct dpif_netlink_flow flow;
eabe7c68
BP
1807
1808 ofpbuf_use_stub(&aux->request,
1809 aux->request_stub, sizeof aux->request_stub);
1810 aux->txn.request = &aux->request;
b99d3cee 1811
72d32ac0
BP
1812 ofpbuf_use_stub(&aux->reply, aux->reply_stub, sizeof aux->reply_stub);
1813 aux->txn.reply = NULL;
1814
b99d3cee
BP
1815 switch (op->type) {
1816 case DPIF_OP_FLOW_PUT:
fa37affa 1817 put = &op->flow_put;
93451a0a 1818 dpif_netlink_init_flow_put(dpif, put, &flow);
6bc60024 1819 if (put->stats) {
eabe7c68 1820 flow.nlmsg_flags |= NLM_F_ECHO;
72d32ac0 1821 aux->txn.reply = &aux->reply;
6bc60024 1822 }
93451a0a 1823 dpif_netlink_flow_to_ofpbuf(&flow, &aux->request);
b99d3cee
BP
1824 break;
1825
1826 case DPIF_OP_FLOW_DEL:
fa37affa 1827 del = &op->flow_del;
93451a0a 1828 dpif_netlink_init_flow_del(dpif, del, &flow);
b99d3cee 1829 if (del->stats) {
eabe7c68 1830 flow.nlmsg_flags |= NLM_F_ECHO;
72d32ac0 1831 aux->txn.reply = &aux->reply;
b99d3cee 1832 }
93451a0a 1833 dpif_netlink_flow_to_ofpbuf(&flow, &aux->request);
b99d3cee 1834 break;
6bc60024 1835
b99d3cee 1836 case DPIF_OP_EXECUTE:
0f3358ea
BP
1837 /* Can't execute a packet that won't fit in a Netlink attribute. */
1838 if (OVS_UNLIKELY(nl_attr_oversized(
fa37affa 1839 dp_packet_size(op->execute.packet)))) {
0f3358ea
BP
1840 /* Report an error immediately if this is the first operation.
1841 * Otherwise the easiest thing to do is to postpone to the next
1842 * call (when this will be the first operation). */
1843 if (i == 0) {
1844 VLOG_ERR_RL(&error_rl,
1845 "dropping oversized %"PRIu32"-byte packet",
fa37affa 1846 dp_packet_size(op->execute.packet));
0f3358ea
BP
1847 op->error = ENOBUFS;
1848 return 1;
1849 }
1850 n_ops = i;
1851 } else {
fa37affa 1852 dpif_netlink_encode_execute(dpif->dp_ifindex, &op->execute,
0f3358ea
BP
1853 &aux->request);
1854 }
b99d3cee
BP
1855 break;
1856
6fe09f8c 1857 case DPIF_OP_FLOW_GET:
fa37affa 1858 get = &op->flow_get;
70e5ed6f 1859 dpif_netlink_init_flow_get(dpif, get, &flow);
6fe09f8c 1860 aux->txn.reply = get->buffer;
93451a0a 1861 dpif_netlink_flow_to_ofpbuf(&flow, &aux->request);
6fe09f8c
JS
1862 break;
1863
b99d3cee 1864 default:
428b2edd 1865 OVS_NOT_REACHED();
6bc60024
BP
1866 }
1867 }
1868
6bc60024 1869 for (i = 0; i < n_ops; i++) {
eabe7c68 1870 txnsp[i] = &auxes[i].txn;
6bc60024 1871 }
a88b4e04 1872 nl_transact_multiple(NETLINK_GENERIC, txnsp, n_ops);
6bc60024 1873
6bc60024 1874 for (i = 0; i < n_ops; i++) {
72d32ac0 1875 struct op_auxdata *aux = &auxes[i];
eabe7c68 1876 struct nl_transaction *txn = &auxes[i].txn;
c2b565b5 1877 struct dpif_op *op = ops[i];
b99d3cee
BP
1878 struct dpif_flow_put *put;
1879 struct dpif_flow_del *del;
6fe09f8c 1880 struct dpif_flow_get *get;
6bc60024 1881
b99d3cee 1882 op->error = txn->error;
6bc60024 1883
b99d3cee
BP
1884 switch (op->type) {
1885 case DPIF_OP_FLOW_PUT:
fa37affa 1886 put = &op->flow_put;
cfceb2b5 1887 if (put->stats) {
b99d3cee 1888 if (!op->error) {
93451a0a 1889 struct dpif_netlink_flow reply;
cfceb2b5 1890
93451a0a
AS
1891 op->error = dpif_netlink_flow_from_ofpbuf(&reply,
1892 txn->reply);
cfceb2b5 1893 if (!op->error) {
93451a0a 1894 dpif_netlink_flow_get_stats(&reply, put->stats);
cfceb2b5
BP
1895 }
1896 }
6bc60024 1897 }
b99d3cee
BP
1898 break;
1899
1900 case DPIF_OP_FLOW_DEL:
fa37affa 1901 del = &op->flow_del;
cfceb2b5 1902 if (del->stats) {
b99d3cee 1903 if (!op->error) {
93451a0a 1904 struct dpif_netlink_flow reply;
cfceb2b5 1905
93451a0a
AS
1906 op->error = dpif_netlink_flow_from_ofpbuf(&reply,
1907 txn->reply);
cfceb2b5 1908 if (!op->error) {
93451a0a 1909 dpif_netlink_flow_get_stats(&reply, del->stats);
cfceb2b5
BP
1910 }
1911 }
b99d3cee
BP
1912 }
1913 break;
1914
1915 case DPIF_OP_EXECUTE:
1916 break;
1917
6fe09f8c 1918 case DPIF_OP_FLOW_GET:
fa37affa 1919 get = &op->flow_get;
6fe09f8c 1920 if (!op->error) {
93451a0a 1921 struct dpif_netlink_flow reply;
6fe09f8c 1922
93451a0a 1923 op->error = dpif_netlink_flow_from_ofpbuf(&reply, txn->reply);
6fe09f8c 1924 if (!op->error) {
7af12bd7
JS
1925 dpif_netlink_flow_to_dpif_flow(&dpif->dpif, get->flow,
1926 &reply);
6fe09f8c
JS
1927 }
1928 }
1929 break;
1930
b99d3cee 1931 default:
428b2edd 1932 OVS_NOT_REACHED();
6bc60024
BP
1933 }
1934
72d32ac0
BP
1935 ofpbuf_uninit(&aux->request);
1936 ofpbuf_uninit(&aux->reply);
6bc60024 1937 }
0f3358ea
BP
1938
1939 return n_ops;
eabe7c68
BP
1940}
1941
6c343984
PB
1942static int
1943parse_flow_get(struct dpif_netlink *dpif, struct dpif_flow_get *get)
1944{
1945 struct dpif_flow *dpif_flow = get->flow;
1946 struct match match;
1947 struct nlattr *actions;
1948 struct dpif_flow_stats stats;
d63ca532 1949 struct dpif_flow_attrs attrs;
6c343984
PB
1950 struct ofpbuf buf;
1951 uint64_t act_buf[1024 / 8];
1952 struct odputil_keybuf maskbuf;
1953 struct odputil_keybuf keybuf;
1954 struct odputil_keybuf actbuf;
1955 struct ofpbuf key, mask, act;
1956 int err;
1957
1958 ofpbuf_use_stack(&buf, &act_buf, sizeof act_buf);
dfaf79dd 1959 err = netdev_ports_flow_get(dpif->dpif.dpif_class, &match,
d63ca532 1960 &actions, get->ufid, &stats, &attrs, &buf);
6c343984
PB
1961 if (err) {
1962 return err;
1963 }
1964
1965 VLOG_DBG("found flow from netdev, translating to dpif flow");
1966
1967 ofpbuf_use_stack(&key, &keybuf, sizeof keybuf);
1968 ofpbuf_use_stack(&act, &actbuf, sizeof actbuf);
1969 ofpbuf_use_stack(&mask, &maskbuf, sizeof maskbuf);
1970 dpif_netlink_netdev_match_to_dpif_flow(&match, &key, &mask, actions,
d63ca532 1971 &stats, &attrs,
6c343984
PB
1972 (ovs_u128 *) get->ufid,
1973 dpif_flow,
1974 false);
1975 ofpbuf_put(get->buffer, nl_attr_get(actions), nl_attr_get_size(actions));
1976 dpif_flow->actions = ofpbuf_at(get->buffer, 0, 0);
1977 dpif_flow->actions_len = nl_attr_get_size(actions);
1978
1979 return 0;
1980}
1981
8b668ee3
PB
1982static int
1983parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put)
1984{
1985 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
dfaf79dd 1986 const struct dpif_class *dpif_class = dpif->dpif.dpif_class;
8b668ee3
PB
1987 struct match match;
1988 odp_port_t in_port;
1989 const struct nlattr *nla;
1990 size_t left;
8b668ee3
PB
1991 struct netdev *dev;
1992 struct offload_info info;
1993 ovs_be16 dst_port = 0;
d9677a1f 1994 uint8_t csum_on = false;
8b668ee3
PB
1995 int err;
1996
1997 if (put->flags & DPIF_FP_PROBE) {
1998 return EOPNOTSUPP;
1999 }
2000
2001 err = parse_key_and_mask_to_match(put->key, put->key_len, put->mask,
2002 put->mask_len, &match);
2003 if (err) {
2004 return err;
2005 }
2006
2007 /* When we try to install a dummy flow from a probed feature. */
2008 if (match.flow.dl_type == htons(0x1234)) {
2009 return EOPNOTSUPP;
2010 }
2011
2012 in_port = match.flow.in_port.odp_port;
dfaf79dd 2013 dev = netdev_ports_get(in_port, dpif_class);
8b668ee3
PB
2014 if (!dev) {
2015 return EOPNOTSUPP;
2016 }
2017
00a0a011 2018 /* Get tunnel dst port */
8b668ee3
PB
2019 NL_ATTR_FOR_EACH(nla, left, put->actions, put->actions_len) {
2020 if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
2021 const struct netdev_tunnel_config *tnl_cfg;
2022 struct netdev *outdev;
2023 odp_port_t out_port;
2024
8b668ee3 2025 out_port = nl_attr_get_odp_port(nla);
dfaf79dd 2026 outdev = netdev_ports_get(out_port, dpif_class);
8b668ee3
PB
2027 if (!outdev) {
2028 err = EOPNOTSUPP;
2029 goto out;
2030 }
2031 tnl_cfg = netdev_get_tunnel_config(outdev);
2032 if (tnl_cfg && tnl_cfg->dst_port != 0) {
2033 dst_port = tnl_cfg->dst_port;
2034 }
d9677a1f
EB
2035 if (tnl_cfg) {
2036 csum_on = tnl_cfg->csum;
2037 }
8b668ee3
PB
2038 netdev_close(outdev);
2039 }
2040 }
2041
dfaf79dd 2042 info.dpif_class = dpif_class;
8b668ee3 2043 info.tp_dst_port = dst_port;
d9677a1f 2044 info.tunnel_csum_on = csum_on;
8b668ee3
PB
2045 err = netdev_flow_put(dev, &match,
2046 CONST_CAST(struct nlattr *, put->actions),
2047 put->actions_len,
2048 CONST_CAST(ovs_u128 *, put->ufid),
2049 &info, put->stats);
2050
2051 if (!err) {
2052 if (put->flags & DPIF_FP_MODIFY) {
2053 struct dpif_op *opp;
2054 struct dpif_op op;
2055
2056 op.type = DPIF_OP_FLOW_DEL;
fa37affa
BP
2057 op.flow_del.key = put->key;
2058 op.flow_del.key_len = put->key_len;
2059 op.flow_del.ufid = put->ufid;
2060 op.flow_del.pmd_id = put->pmd_id;
2061 op.flow_del.stats = NULL;
2062 op.flow_del.terse = false;
8b668ee3
PB
2063
2064 opp = &op;
2065 dpif_netlink_operate__(dpif, &opp, 1);
2066 }
2067
2068 VLOG_DBG("added flow");
2069 } else if (err != EEXIST) {
738c785f
SB
2070 struct netdev *oor_netdev = NULL;
2071 if (err == ENOSPC && netdev_is_offload_rebalance_policy_enabled()) {
2072 /*
2073 * We need to set OOR on the input netdev (i.e, 'dev') for the
2074 * flow. But if the flow has a tunnel attribute (i.e, decap action,
2075 * with a virtual device like a VxLAN interface as its in-port),
2076 * then lookup and set OOR on the underlying tunnel (real) netdev.
2077 */
2078 oor_netdev = flow_get_tunnel_netdev(&match.flow.tunnel);
2079 if (!oor_netdev) {
2080 /* Not a 'tunnel' flow */
2081 oor_netdev = dev;
2082 }
2083 netdev_set_hw_info(oor_netdev, HW_INFO_TYPE_OOR, true);
2084 }
2085 VLOG_ERR_RL(&rl, "failed to offload flow: %s: %s", ovs_strerror(err),
2086 (oor_netdev ? oor_netdev->name : dev->name));
8b668ee3
PB
2087 }
2088
2089out:
2090 if (err && err != EEXIST && (put->flags & DPIF_FP_MODIFY)) {
2091 /* Modified rule can't be offloaded, try and delete from HW */
2092 int del_err = netdev_flow_del(dev, put->ufid, put->stats);
2093
2094 if (!del_err) {
2095 /* Delete from hw success, so old flow was offloaded.
2096 * Change flags to create the flow in kernel */
2097 put->flags &= ~DPIF_FP_MODIFY;
2098 put->flags |= DPIF_FP_CREATE;
2099 } else if (del_err != ENOENT) {
2100 VLOG_ERR_RL(&rl, "failed to delete offloaded flow: %s",
2101 ovs_strerror(del_err));
2102 /* stop proccesing the flow in kernel */
2103 err = 0;
2104 }
2105 }
2106
2107 netdev_close(dev);
2108
2109 return err;
2110}
2111
8b668ee3
PB
2112static int
2113try_send_to_netdev(struct dpif_netlink *dpif, struct dpif_op *op)
eabe7c68 2114{
8b668ee3 2115 int err = EOPNOTSUPP;
9b00386b 2116
8b668ee3
PB
2117 switch (op->type) {
2118 case DPIF_OP_FLOW_PUT: {
fa37affa 2119 struct dpif_flow_put *put = &op->flow_put;
8b668ee3
PB
2120
2121 if (!put->ufid) {
2122 break;
2123 }
3cd99886
RD
2124
2125 log_flow_put_message(&dpif->dpif, &this_module, put, 0);
8b668ee3
PB
2126 err = parse_flow_put(dpif, put);
2127 break;
2128 }
0335a89c 2129 case DPIF_OP_FLOW_DEL: {
fa37affa 2130 struct dpif_flow_del *del = &op->flow_del;
0335a89c
PB
2131
2132 if (!del->ufid) {
2133 break;
2134 }
3cd99886
RD
2135
2136 log_flow_del_message(&dpif->dpif, &this_module, del, 0);
dfaf79dd 2137 err = netdev_ports_flow_del(dpif->dpif.dpif_class, del->ufid,
0335a89c
PB
2138 del->stats);
2139 break;
2140 }
6c343984 2141 case DPIF_OP_FLOW_GET: {
fa37affa 2142 struct dpif_flow_get *get = &op->flow_get;
6c343984 2143
fa37affa 2144 if (!op->flow_get.ufid) {
6c343984
PB
2145 break;
2146 }
3cd99886
RD
2147
2148 log_flow_get_message(&dpif->dpif, &this_module, get, 0);
6c343984
PB
2149 err = parse_flow_get(dpif, get);
2150 break;
2151 }
8b668ee3
PB
2152 case DPIF_OP_EXECUTE:
2153 default:
2154 break;
2155 }
2156
2157 return err;
2158}
2159
/* Feeds the 'n_ops' operations in 'ops' to dpif_netlink_operate__() until
 * all of them have been consumed.  Each call reports how many operations it
 * handled; the remainder is retried from that point. */
static void
dpif_netlink_operate_chunks(struct dpif_netlink *dpif, struct dpif_op **ops,
                            size_t n_ops)
{
    size_t handled;

    for (; n_ops > 0; ops += handled, n_ops -= handled) {
        handled = dpif_netlink_operate__(dpif, ops, n_ops);
    }
}
2171
8b668ee3 2172static void
57924fc9
SB
2173dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops,
2174 enum dpif_offload_type offload_type)
8b668ee3
PB
2175{
2176 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
2177 struct dpif_op *new_ops[OPERATE_MAX_OPS];
2178 int count = 0;
2179 int i = 0;
2180 int err = 0;
2181
57924fc9
SB
2182 if (offload_type == DPIF_OFFLOAD_ALWAYS && !netdev_is_flow_api_enabled()) {
2183 VLOG_DBG("Invalid offload_type: %d", offload_type);
2184 return;
2185 }
2186
2187 if (offload_type != DPIF_OFFLOAD_NEVER && netdev_is_flow_api_enabled()) {
8b668ee3
PB
2188 while (n_ops > 0) {
2189 count = 0;
2190
2191 while (n_ops > 0 && count < OPERATE_MAX_OPS) {
2192 struct dpif_op *op = ops[i++];
2193
2194 err = try_send_to_netdev(dpif, op);
2195 if (err && err != EEXIST) {
57924fc9
SB
2196 if (offload_type == DPIF_OFFLOAD_ALWAYS) {
2197 /* We got an error while offloading an op. Since
2198 * OFFLOAD_ALWAYS is specified, we stop further
2199 * processing and return to the caller without
2200 * invoking kernel datapath as fallback. But the
2201 * interface requires us to process all n_ops; so
2202 * return the same error in the remaining ops too.
2203 */
2204 op->error = err;
2205 n_ops--;
2206 while (n_ops > 0) {
2207 op = ops[i++];
2208 op->error = err;
2209 n_ops--;
2210 }
2211 return;
2212 }
8b668ee3
PB
2213 new_ops[count++] = op;
2214 } else {
2215 op->error = err;
2216 }
2217
2218 n_ops--;
2219 }
2220
2221 dpif_netlink_operate_chunks(dpif, new_ops, count);
2222 }
57924fc9 2223 } else if (offload_type != DPIF_OFFLOAD_ALWAYS) {
8b668ee3
PB
2224 dpif_netlink_operate_chunks(dpif, ops, n_ops);
2225 }
2226}
2227
09cac43f
NR
2228#if _WIN32
2229static void
2230dpif_netlink_handler_uninit(struct dpif_handler *handler)
2231{
2232 vport_delete_sock_pool(handler);
2233}
2234
2235static int
2236dpif_netlink_handler_init(struct dpif_handler *handler)
2237{
2238 return vport_create_sock_pool(handler);
2239}
2240#else
2241
2242static int
2243dpif_netlink_handler_init(struct dpif_handler *handler)
2244{
2245 handler->epoll_fd = epoll_create(10);
2246 return handler->epoll_fd < 0 ? errno : 0;
2247}
2248
2249static void
2250dpif_netlink_handler_uninit(struct dpif_handler *handler)
2251{
2252 close(handler->epoll_fd);
2253}
2254#endif
2255
1579cf67
AW
2256/* Synchronizes 'channels' in 'dpif->handlers' with the set of vports
2257 * currently in 'dpif' in the kernel, by adding a new set of channels for
2258 * any kernel vport that lacks one and deleting any channels that have no
2259 * backing kernel vports. */
96fba48f 2260static int
93451a0a 2261dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
b90de034 2262 OVS_REQ_WRLOCK(dpif->upcall_lock)
96fba48f 2263{
8381a3d3 2264 unsigned long int *keep_channels;
93451a0a 2265 struct dpif_netlink_vport vport;
8381a3d3
BP
2266 size_t keep_channels_nbits;
2267 struct nl_dump dump;
d57695d7
JS
2268 uint64_t reply_stub[NL_DUMP_BUFSIZE / 8];
2269 struct ofpbuf buf;
8381a3d3
BP
2270 int retval = 0;
2271 size_t i;
982b8810 2272
09cac43f
NR
2273 ovs_assert(!WINDOWS || n_handlers <= 1);
2274 ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
2275
1579cf67
AW
2276 if (dpif->n_handlers != n_handlers) {
2277 destroy_all_channels(dpif);
2278 dpif->handlers = xzalloc(n_handlers * sizeof *dpif->handlers);
2279 for (i = 0; i < n_handlers; i++) {
09cac43f 2280 int error;
1579cf67
AW
2281 struct dpif_handler *handler = &dpif->handlers[i];
2282
09cac43f
NR
2283 error = dpif_netlink_handler_init(handler);
2284 if (error) {
1579cf67
AW
2285 size_t j;
2286
2287 for (j = 0; j < i; j++) {
aa5c0216 2288 struct dpif_handler *tmp = &dpif->handlers[j];
09cac43f 2289 dpif_netlink_handler_uninit(tmp);
1579cf67
AW
2290 }
2291 free(dpif->handlers);
2292 dpif->handlers = NULL;
2293
09cac43f 2294 return error;
1579cf67 2295 }
8381a3d3 2296 }
1579cf67
AW
2297 dpif->n_handlers = n_handlers;
2298 }
2299
2300 for (i = 0; i < n_handlers; i++) {
2301 struct dpif_handler *handler = &dpif->handlers[i];
2302
2303 handler->event_offset = handler->n_events = 0;
17411ecf 2304 }
b063d9f0 2305
8381a3d3
BP
2306 keep_channels_nbits = dpif->uc_array_size;
2307 keep_channels = bitmap_allocate(keep_channels_nbits);
982b8810 2308
d57695d7 2309 ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub);
93451a0a
AS
2310 dpif_netlink_port_dump_start__(dpif, &dump);
2311 while (!dpif_netlink_port_dump_next__(dpif, &dump, &vport, &buf)) {
8381a3d3 2312 uint32_t port_no = odp_to_u32(vport.port_no);
69c51582 2313 uint32_t upcall_pid;
8381a3d3 2314 int error;
50f80534 2315
1579cf67 2316 if (port_no >= dpif->uc_array_size
69c51582
MC
2317 || !vport_get_pid(dpif, port_no, &upcall_pid)) {
2318 struct nl_sock *socksp;
d240e46a 2319 error = create_nl_sock(dpif, &socksp);
1579cf67 2320
d240e46a 2321 if (error) {
1579cf67
AW
2322 goto error;
2323 }
2324
69c51582 2325 error = vport_add_channel(dpif, vport.port_no, socksp);
b063d9f0 2326 if (error) {
1579cf67 2327 VLOG_INFO("%s: could not add channels for port %s",
9b00386b 2328 dpif_name(&dpif->dpif), vport.name);
69c51582 2329 nl_sock_destroy(socksp);
8381a3d3
BP
2330 retval = error;
2331 goto error;
982b8810 2332 }
69c51582 2333 upcall_pid = nl_sock_pid(socksp);
8381a3d3 2334 }
50f80534 2335
8381a3d3 2336 /* Configure the vport to deliver misses to 'sock'. */
1579cf67 2337 if (vport.upcall_pids[0] == 0
69c51582
MC
2338 || vport.n_upcall_pids != 1
2339 || upcall_pid != vport.upcall_pids[0]) {
93451a0a 2340 struct dpif_netlink_vport vport_request;
989fd548 2341
93451a0a 2342 dpif_netlink_vport_init(&vport_request);
989fd548
JP
2343 vport_request.cmd = OVS_VPORT_CMD_SET;
2344 vport_request.dp_ifindex = dpif->dp_ifindex;
8381a3d3 2345 vport_request.port_no = vport.port_no;
69c51582
MC
2346 vport_request.n_upcall_pids = 1;
2347 vport_request.upcall_pids = &upcall_pid;
93451a0a 2348 error = dpif_netlink_vport_transact(&vport_request, NULL, NULL);
1579cf67 2349 if (error) {
989fd548
JP
2350 VLOG_WARN_RL(&error_rl,
2351 "%s: failed to set upcall pid on port: %s",
10a89ef0 2352 dpif_name(&dpif->dpif), ovs_strerror(error));
989fd548 2353
8381a3d3
BP
2354 if (error != ENODEV && error != ENOENT) {
2355 retval = error;
989fd548 2356 } else {
8381a3d3
BP
2357 /* The vport isn't really there, even though the dump says
2358 * it is. Probably we just hit a race after a port
2359 * disappeared. */
989fd548 2360 }
8381a3d3 2361 goto error;
50f80534 2362 }
8381a3d3 2363 }
14b4d2f9 2364
8381a3d3
BP
2365 if (port_no < keep_channels_nbits) {
2366 bitmap_set1(keep_channels, port_no);
2367 }
2368 continue;
2369
2370 error:
1579cf67 2371 vport_del_channels(dpif, vport.port_no);
982b8810 2372 }
8381a3d3 2373 nl_dump_done(&dump);
d57695d7 2374 ofpbuf_uninit(&buf);
b063d9f0 2375
8381a3d3
BP
2376 /* Discard any saved channels that we didn't reuse. */
2377 for (i = 0; i < keep_channels_nbits; i++) {
2378 if (!bitmap_is_set(keep_channels, i)) {
1579cf67 2379 vport_del_channels(dpif, u32_to_odp(i));
8381a3d3
BP
2380 }
2381 }
2382 free(keep_channels);
2383
2384 return retval;
2385}
2386
2387static int
93451a0a 2388dpif_netlink_recv_set__(struct dpif_netlink *dpif, bool enable)
b90de034 2389 OVS_REQ_WRLOCK(dpif->upcall_lock)
8381a3d3 2390{
1579cf67 2391 if ((dpif->handlers != NULL) == enable) {
8381a3d3
BP
2392 return 0;
2393 } else if (!enable) {
1579cf67 2394 destroy_all_channels(dpif);
8381a3d3
BP
2395 return 0;
2396 } else {
93451a0a 2397 return dpif_netlink_refresh_channels(dpif, 1);
8381a3d3 2398 }
96fba48f
BP
2399}
2400
9fafa796 2401static int
93451a0a 2402dpif_netlink_recv_set(struct dpif *dpif_, bool enable)
9fafa796 2403{
93451a0a 2404 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
9fafa796
BP
2405 int error;
2406
1579cf67 2407 fat_rwlock_wrlock(&dpif->upcall_lock);
93451a0a 2408 error = dpif_netlink_recv_set__(dpif, enable);
1579cf67 2409 fat_rwlock_unlock(&dpif->upcall_lock);
9fafa796
BP
2410
2411 return error;
2412}
2413
1954e6bb 2414static int
93451a0a 2415dpif_netlink_handlers_set(struct dpif *dpif_, uint32_t n_handlers)
1954e6bb 2416{
93451a0a 2417 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
1579cf67
AW
2418 int error = 0;
2419
09cac43f
NR
2420#ifdef _WIN32
2421 /* Multiple upcall handlers will be supported once kernel datapath supports
2422 * it. */
2423 if (n_handlers > 1) {
2424 return error;
2425 }
2426#endif
2427
1579cf67
AW
2428 fat_rwlock_wrlock(&dpif->upcall_lock);
2429 if (dpif->handlers) {
93451a0a 2430 error = dpif_netlink_refresh_channels(dpif, n_handlers);
1579cf67
AW
2431 }
2432 fat_rwlock_unlock(&dpif->upcall_lock);
2433
2434 return error;
1954e6bb
AW
2435}
2436
aae51f53 2437static int
93451a0a 2438dpif_netlink_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
aae51f53
BP
2439 uint32_t queue_id, uint32_t *priority)
2440{
2441 if (queue_id < 0xf000) {
17ee3c1f 2442 *priority = TC_H_MAKE(1 << 16, queue_id + 1);
aae51f53
BP
2443 return 0;
2444 } else {
2445 return EINVAL;
2446 }
2447}
2448
96fba48f 2449static int
7af12bd7
JS
2450parse_odp_packet(const struct dpif_netlink *dpif, struct ofpbuf *buf,
2451 struct dpif_upcall *upcall, int *dp_ifindex)
856081f6 2452{
df2c07f4 2453 static const struct nl_policy ovs_packet_policy[] = {
856081f6 2454 /* Always present. */
df2c07f4 2455 [OVS_PACKET_ATTR_PACKET] = { .type = NL_A_UNSPEC,
856081f6 2456 .min_len = ETH_HEADER_LEN },
df2c07f4 2457 [OVS_PACKET_ATTR_KEY] = { .type = NL_A_NESTED },
856081f6 2458
df2c07f4 2459 /* OVS_PACKET_CMD_ACTION only. */
e995e3df 2460 [OVS_PACKET_ATTR_USERDATA] = { .type = NL_A_UNSPEC, .optional = true },
8b7ea2d4 2461 [OVS_PACKET_ATTR_EGRESS_TUN_KEY] = { .type = NL_A_NESTED, .optional = true },
7321bda3 2462 [OVS_PACKET_ATTR_ACTIONS] = { .type = NL_A_NESTED, .optional = true },
27130224 2463 [OVS_PACKET_ATTR_MRU] = { .type = NL_A_U16, .optional = true }
856081f6
BP
2464 };
2465
0a2869d5
BP
2466 struct ofpbuf b = ofpbuf_const_initializer(buf->data, buf->size);
2467 struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
2468 struct genlmsghdr *genl = ofpbuf_try_pull(&b, sizeof *genl);
2469 struct ovs_header *ovs_header = ofpbuf_try_pull(&b, sizeof *ovs_header);
982b8810 2470
0a2869d5 2471 struct nlattr *a[ARRAY_SIZE(ovs_packet_policy)];
df2c07f4
JP
2472 if (!nlmsg || !genl || !ovs_header
2473 || nlmsg->nlmsg_type != ovs_packet_family
2474 || !nl_policy_parse(&b, 0, ovs_packet_policy, a,
2475 ARRAY_SIZE(ovs_packet_policy))) {
856081f6
BP
2476 return EINVAL;
2477 }
2478
0a2869d5
BP
2479 int type = (genl->cmd == OVS_PACKET_CMD_MISS ? DPIF_UC_MISS
2480 : genl->cmd == OVS_PACKET_CMD_ACTION ? DPIF_UC_ACTION
2481 : -1);
aaff4b55
BP
2482 if (type < 0) {
2483 return EINVAL;
2484 }
82272ede 2485
877c9270 2486 /* (Re)set ALL fields of '*upcall' on successful return. */
aaff4b55 2487 upcall->type = type;
ebc56baa
BP
2488 upcall->key = CONST_CAST(struct nlattr *,
2489 nl_attr_get(a[OVS_PACKET_ATTR_KEY]));
df2c07f4 2490 upcall->key_len = nl_attr_get_size(a[OVS_PACKET_ATTR_KEY]);
7af12bd7 2491 dpif_flow_hash(&dpif->dpif, upcall->key, upcall->key_len, &upcall->ufid);
e995e3df 2492 upcall->userdata = a[OVS_PACKET_ATTR_USERDATA];
8b7ea2d4 2493 upcall->out_tun_key = a[OVS_PACKET_ATTR_EGRESS_TUN_KEY];
7321bda3 2494 upcall->actions = a[OVS_PACKET_ATTR_ACTIONS];
27130224 2495 upcall->mru = a[OVS_PACKET_ATTR_MRU];
da546e07
JR
2496
2497 /* Allow overwriting the netlink attribute header without reallocating. */
cf62fa4c 2498 dp_packet_use_stub(&upcall->packet,
da546e07
JR
2499 CONST_CAST(struct nlattr *,
2500 nl_attr_get(a[OVS_PACKET_ATTR_PACKET])) - 1,
2501 nl_attr_get_size(a[OVS_PACKET_ATTR_PACKET]) +
2502 sizeof(struct nlattr));
cf62fa4c
PS
2503 dp_packet_set_data(&upcall->packet,
2504 (char *)dp_packet_data(&upcall->packet) + sizeof(struct nlattr));
2505 dp_packet_set_size(&upcall->packet, nl_attr_get_size(a[OVS_PACKET_ATTR_PACKET]));
da546e07 2506
2482b0b0
JS
2507 if (nl_attr_find__(upcall->key, upcall->key_len, OVS_KEY_ATTR_ETHERNET)) {
2508 /* Ethernet frame */
2509 upcall->packet.packet_type = htonl(PT_ETH);
2510 } else {
2511 /* Non-Ethernet packet. Get the Ethertype from the NL attributes */
2512 ovs_be16 ethertype = 0;
2513 const struct nlattr *et_nla = nl_attr_find__(upcall->key,
2514 upcall->key_len,
2515 OVS_KEY_ATTR_ETHERTYPE);
2516 if (et_nla) {
2517 ethertype = nl_attr_get_be16(et_nla);
2518 }
2519 upcall->packet.packet_type = PACKET_TYPE_BE(OFPHTN_ETHERTYPE,
2520 ntohs(ethertype));
2521 dp_packet_set_l3(&upcall->packet, dp_packet_data(&upcall->packet));
2522 }
2523
df2c07f4 2524 *dp_ifindex = ovs_header->dp_ifindex;
982b8810 2525
856081f6
BP
2526 return 0;
2527}
2528
09cac43f
NR
2529#ifdef _WIN32
2530#define PACKET_RECV_BATCH_SIZE 50
2531static int
2532dpif_netlink_recv_windows(struct dpif_netlink *dpif, uint32_t handler_id,
2533 struct dpif_upcall *upcall, struct ofpbuf *buf)
2534 OVS_REQ_RDLOCK(dpif->upcall_lock)
2535{
2536 struct dpif_handler *handler;
2537 int read_tries = 0;
2538 struct dpif_windows_vport_sock *sock_pool;
2539 uint32_t i;
2540
2541 if (!dpif->handlers) {
2542 return EAGAIN;
2543 }
2544
2545 /* Only one handler is supported currently. */
2546 if (handler_id >= 1) {
2547 return EAGAIN;
2548 }
2549
2550 if (handler_id >= dpif->n_handlers) {
2551 return EAGAIN;
2552 }
2553
2554 handler = &dpif->handlers[handler_id];
2555 sock_pool = handler->vport_sock_pool;
2556
2557 for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
2558 for (;;) {
2559 int dp_ifindex;
2560 int error;
2561
2562 if (++read_tries > PACKET_RECV_BATCH_SIZE) {
2563 return EAGAIN;
2564 }
2565
a86bd14e 2566 error = nl_sock_recv(sock_pool[i].nl_sock, buf, NULL, false);
09cac43f
NR
2567 if (error == ENOBUFS) {
2568 /* ENOBUFS typically means that we've received so many
2569 * packets that the buffer overflowed. Try again
2570 * immediately because there's almost certainly a packet
2571 * waiting for us. */
2572 /* XXX: report_loss(dpif, ch, idx, handler_id); */
2573 continue;
2574 }
2575
2576 /* XXX: ch->last_poll = time_msec(); */
2577 if (error) {
2578 if (error == EAGAIN) {
2579 break;
2580 }
2581 return error;
2582 }
2583
27edb4aa 2584 error = parse_odp_packet(dpif, buf, upcall, &dp_ifindex);
09cac43f
NR
2585 if (!error && dp_ifindex == dpif->dp_ifindex) {
2586 return 0;
2587 } else if (error) {
2588 return error;
2589 }
2590 }
2591 }
2592
2593 return EAGAIN;
2594}
2595#else
856081f6 2596static int
93451a0a
AS
2597dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
2598 struct dpif_upcall *upcall, struct ofpbuf *buf)
b90de034 2599 OVS_REQ_RDLOCK(dpif->upcall_lock)
96fba48f 2600{
1579cf67 2601 struct dpif_handler *handler;
17411ecf 2602 int read_tries = 0;
96fba48f 2603
1579cf67
AW
2604 if (!dpif->handlers || handler_id >= dpif->n_handlers) {
2605 return EAGAIN;
982b8810
BP
2606 }
2607
1579cf67
AW
2608 handler = &dpif->handlers[handler_id];
2609 if (handler->event_offset >= handler->n_events) {
8522ba09 2610 int retval;
989fd548 2611
1579cf67 2612 handler->event_offset = handler->n_events = 0;
f6d1465c 2613
8522ba09 2614 do {
1579cf67 2615 retval = epoll_wait(handler->epoll_fd, handler->epoll_events,
989fd548 2616 dpif->uc_array_size, 0);
8522ba09 2617 } while (retval < 0 && errno == EINTR);
09cac43f 2618
8522ba09
BP
2619 if (retval < 0) {
2620 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
10a89ef0 2621 VLOG_WARN_RL(&rl, "epoll_wait failed (%s)", ovs_strerror(errno));
989fd548 2622 } else if (retval > 0) {
1579cf67 2623 handler->n_events = retval;
8522ba09 2624 }
8522ba09
BP
2625 }
2626
1579cf67
AW
2627 while (handler->event_offset < handler->n_events) {
2628 int idx = handler->epoll_events[handler->event_offset].data.u32;
69c51582 2629 struct dpif_channel *ch = &dpif->channels[idx];
8522ba09 2630
1579cf67 2631 handler->event_offset++;
17411ecf 2632
f6d1465c 2633 for (;;) {
8522ba09 2634 int dp_ifindex;
f6d1465c 2635 int error;
17411ecf 2636
f6d1465c
BP
2637 if (++read_tries > 50) {
2638 return EAGAIN;
2639 }
17411ecf 2640
a86bd14e 2641 error = nl_sock_recv(ch->sock, buf, NULL, false);
14b4d2f9
BP
2642 if (error == ENOBUFS) {
2643 /* ENOBUFS typically means that we've received so many
2644 * packets that the buffer overflowed. Try again
2645 * immediately because there's almost certainly a packet
2646 * waiting for us. */
9b00386b 2647 report_loss(dpif, ch, idx, handler_id);
14b4d2f9
BP
2648 continue;
2649 }
2650
2651 ch->last_poll = time_msec();
72d32ac0 2652 if (error) {
72d32ac0
BP
2653 if (error == EAGAIN) {
2654 break;
2655 }
f6d1465c
BP
2656 return error;
2657 }
17411ecf 2658
7af12bd7 2659 error = parse_odp_packet(dpif, buf, upcall, &dp_ifindex);
a12b3ead 2660 if (!error && dp_ifindex == dpif->dp_ifindex) {
f6d1465c 2661 return 0;
989fd548 2662 } else if (error) {
f6d1465c 2663 return error;
17411ecf 2664 }
982b8810 2665 }
50f80534 2666 }
982b8810
BP
2667
2668 return EAGAIN;
96fba48f 2669}
09cac43f 2670#endif
96fba48f 2671
9fafa796 2672static int
93451a0a
AS
2673dpif_netlink_recv(struct dpif *dpif_, uint32_t handler_id,
2674 struct dpif_upcall *upcall, struct ofpbuf *buf)
9fafa796 2675{
93451a0a 2676 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
9fafa796
BP
2677 int error;
2678
1579cf67 2679 fat_rwlock_rdlock(&dpif->upcall_lock);
09cac43f
NR
2680#ifdef _WIN32
2681 error = dpif_netlink_recv_windows(dpif, handler_id, upcall, buf);
2682#else
93451a0a 2683 error = dpif_netlink_recv__(dpif, handler_id, upcall, buf);
09cac43f 2684#endif
1579cf67 2685 fat_rwlock_unlock(&dpif->upcall_lock);
9fafa796
BP
2686
2687 return error;
2688}
2689
96fba48f 2690static void
93451a0a 2691dpif_netlink_recv_wait__(struct dpif_netlink *dpif, uint32_t handler_id)
b90de034 2692 OVS_REQ_RDLOCK(dpif->upcall_lock)
96fba48f 2693{
93451a0a 2694#ifdef _WIN32
09cac43f
NR
2695 uint32_t i;
2696 struct dpif_windows_vport_sock *sock_pool =
2697 dpif->handlers[handler_id].vport_sock_pool;
2698
2699 /* Only one handler is supported currently. */
2700 if (handler_id >= 1) {
2701 return;
2702 }
2703
2704 for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
2705 nl_sock_wait(sock_pool[i].nl_sock, POLLIN);
2706 }
93451a0a 2707#else
1579cf67
AW
2708 if (dpif->handlers && handler_id < dpif->n_handlers) {
2709 struct dpif_handler *handler = &dpif->handlers[handler_id];
2710
2711 poll_fd_wait(handler->epoll_fd, POLLIN);
17411ecf 2712 }
93451a0a 2713#endif
96fba48f
BP
2714}
2715
1ba530f4 2716static void
93451a0a 2717dpif_netlink_recv_wait(struct dpif *dpif_, uint32_t handler_id)
1ba530f4 2718{
93451a0a 2719 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
17411ecf 2720
b90de034 2721 fat_rwlock_rdlock(&dpif->upcall_lock);
93451a0a 2722 dpif_netlink_recv_wait__(dpif, handler_id);
b90de034
AW
2723 fat_rwlock_unlock(&dpif->upcall_lock);
2724}
2725
2726static void
93451a0a 2727dpif_netlink_recv_purge__(struct dpif_netlink *dpif)
b90de034
AW
2728 OVS_REQ_WRLOCK(dpif->upcall_lock)
2729{
1579cf67 2730 if (dpif->handlers) {
69c51582 2731 size_t i;
1579cf67 2732
69c51582
MC
2733 if (!dpif->channels[0].sock) {
2734 return;
2735 }
1579cf67 2736 for (i = 0; i < dpif->uc_array_size; i++ ) {
1ba530f4 2737
69c51582 2738 nl_sock_drain(dpif->channels[i].sock);
989fd548 2739 }
1ba530f4 2740 }
b90de034
AW
2741}
2742
2743static void
93451a0a 2744dpif_netlink_recv_purge(struct dpif *dpif_)
b90de034 2745{
93451a0a 2746 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
b90de034
AW
2747
2748 fat_rwlock_wrlock(&dpif->upcall_lock);
93451a0a 2749 dpif_netlink_recv_purge__(dpif);
1579cf67 2750 fat_rwlock_unlock(&dpif->upcall_lock);
1ba530f4
BP
2751}
2752
/* Returns the datapath version string, or NULL if it is not available.
 *
 * On Linux the first line of /sys/module/openvswitch/version is read, cut
 * at the first newline, and returned as a copy made with xstrdup().  On
 * other platforms, or if the file cannot be opened or read, NULL is
 * returned. */
static char *
dpif_netlink_get_datapath_version(void)
{
    char *version_str = NULL;

#ifdef __linux__

#define MAX_VERSION_STR_SIZE 80
#define LINUX_DATAPATH_VERSION_FILE "/sys/module/openvswitch/version"
    FILE *stream = fopen(LINUX_DATAPATH_VERSION_FILE, "r");

    if (stream) {
        char line[MAX_VERSION_STR_SIZE];

        if (fgets(line, sizeof line, stream)) {
            /* Cut at the first newline; if there is none this overwrites
             * the existing terminator and changes nothing. */
            line[strcspn(line, "\n")] = '\0';
            version_str = xstrdup(line);
        }
        fclose(stream);
    }
#endif

    return version_str;
}
2782
c11c9f4a
DDP
2783struct dpif_netlink_ct_dump_state {
2784 struct ct_dpif_dump_state up;
2785 struct nl_ct_dump_state *nl_ct_dump;
2786};
2787
2788static int
2789dpif_netlink_ct_dump_start(struct dpif *dpif OVS_UNUSED,
2790 struct ct_dpif_dump_state **dump_,
ded30c74 2791 const uint16_t *zone, int *ptot_bkts)
c11c9f4a
DDP
2792{
2793 struct dpif_netlink_ct_dump_state *dump;
2794 int err;
2795
2796 dump = xzalloc(sizeof *dump);
ded30c74 2797 err = nl_ct_dump_start(&dump->nl_ct_dump, zone, ptot_bkts);
c11c9f4a
DDP
2798 if (err) {
2799 free(dump);
2800 return err;
2801 }
2802
2803 *dump_ = &dump->up;
2804
2805 return 0;
2806}
2807
2808static int
2809dpif_netlink_ct_dump_next(struct dpif *dpif OVS_UNUSED,
2810 struct ct_dpif_dump_state *dump_,
2811 struct ct_dpif_entry *entry)
2812{
2813 struct dpif_netlink_ct_dump_state *dump;
2814
2815 INIT_CONTAINER(dump, dump_, up);
2816
2817 return nl_ct_dump_next(dump->nl_ct_dump, entry);
2818}
2819
2820static int
2821dpif_netlink_ct_dump_done(struct dpif *dpif OVS_UNUSED,
2822 struct ct_dpif_dump_state *dump_)
2823{
2824 struct dpif_netlink_ct_dump_state *dump;
2825 int err;
2826
2827 INIT_CONTAINER(dump, dump_, up);
2828
2829 err = nl_ct_dump_done(dump->nl_ct_dump);
2830 free(dump);
2831 return err;
2832}
15eabc97
DDP
2833
2834static int
817a7657
YHW
2835dpif_netlink_ct_flush(struct dpif *dpif OVS_UNUSED, const uint16_t *zone,
2836 const struct ct_dpif_tuple *tuple)
15eabc97 2837{
817a7657
YHW
2838 if (tuple) {
2839 return nl_ct_flush_tuple(tuple, zone ? *zone : 0);
2840 } else if (zone) {
15eabc97
DDP
2841 return nl_ct_flush_zone(*zone);
2842 } else {
2843 return nl_ct_flush();
2844 }
2845}
c11c9f4a 2846
906ff9d2
YHW
2847static int
2848dpif_netlink_ct_set_limits(struct dpif *dpif OVS_UNUSED,
2849 const uint32_t *default_limits,
2850 const struct ovs_list *zone_limits)
2851{
2852 struct ovs_zone_limit req_zone_limit;
2853
2854 if (ovs_ct_limit_family < 0) {
2855 return EOPNOTSUPP;
2856 }
2857
2858 struct ofpbuf *request = ofpbuf_new(NL_DUMP_BUFSIZE);
2859 nl_msg_put_genlmsghdr(request, 0, ovs_ct_limit_family,
2860 NLM_F_REQUEST | NLM_F_ECHO, OVS_CT_LIMIT_CMD_SET,
2861 OVS_CT_LIMIT_VERSION);
2862
2863 struct ovs_header *ovs_header;
2864 ovs_header = ofpbuf_put_uninit(request, sizeof *ovs_header);
2865 ovs_header->dp_ifindex = 0;
2866
2867 size_t opt_offset;
2868 opt_offset = nl_msg_start_nested(request, OVS_CT_LIMIT_ATTR_ZONE_LIMIT);
2869 if (default_limits) {
2870 req_zone_limit.zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE;
2871 req_zone_limit.limit = *default_limits;
2872 nl_msg_put(request, &req_zone_limit, sizeof req_zone_limit);
2873 }
2874
2875 if (!ovs_list_is_empty(zone_limits)) {
2876 struct ct_dpif_zone_limit *zone_limit;
2877
2878 LIST_FOR_EACH (zone_limit, node, zone_limits) {
2879 req_zone_limit.zone_id = zone_limit->zone;
2880 req_zone_limit.limit = zone_limit->limit;
2881 nl_msg_put(request, &req_zone_limit, sizeof req_zone_limit);
2882 }
2883 }
2884 nl_msg_end_nested(request, opt_offset);
2885
2886 int err = nl_transact(NETLINK_GENERIC, request, NULL);
2887 ofpbuf_uninit(request);
2888 return err;
2889}
2890
2891static int
2892dpif_netlink_zone_limits_from_ofpbuf(const struct ofpbuf *buf,
2893 uint32_t *default_limit,
2894 struct ovs_list *zone_limits)
2895{
2896 static const struct nl_policy ovs_ct_limit_policy[] = {
2897 [OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NL_A_NESTED,
2898 .optional = true },
2899 };
2900
2901 struct ofpbuf b = ofpbuf_const_initializer(buf->data, buf->size);
2902 struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
2903 struct genlmsghdr *genl = ofpbuf_try_pull(&b, sizeof *genl);
2904 struct ovs_header *ovs_header = ofpbuf_try_pull(&b, sizeof *ovs_header);
2905
2906 struct nlattr *attr[ARRAY_SIZE(ovs_ct_limit_policy)];
2907
2908 if (!nlmsg || !genl || !ovs_header
2909 || nlmsg->nlmsg_type != ovs_ct_limit_family
2910 || !nl_policy_parse(&b, 0, ovs_ct_limit_policy, attr,
2911 ARRAY_SIZE(ovs_ct_limit_policy))) {
2912 return EINVAL;
2913 }
2914
2915
2916 if (!attr[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
2917 return EINVAL;
2918 }
2919
2920 int rem = NLA_ALIGN(
2921 nl_attr_get_size(attr[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]));
2922 const struct ovs_zone_limit *zone_limit =
2923 nl_attr_get(attr[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]);
2924
2925 while (rem >= sizeof *zone_limit) {
2926 if (zone_limit->zone_id == OVS_ZONE_LIMIT_DEFAULT_ZONE) {
2927 *default_limit = zone_limit->limit;
2928 } else if (zone_limit->zone_id < OVS_ZONE_LIMIT_DEFAULT_ZONE ||
2929 zone_limit->zone_id > UINT16_MAX) {
2930 } else {
2931 ct_dpif_push_zone_limit(zone_limits, zone_limit->zone_id,
2932 zone_limit->limit, zone_limit->count);
2933 }
2934 rem -= NLA_ALIGN(sizeof *zone_limit);
2935 zone_limit = ALIGNED_CAST(struct ovs_zone_limit *,
2936 (unsigned char *) zone_limit + NLA_ALIGN(sizeof *zone_limit));
2937 }
2938 return 0;
2939}
2940
2941static int
2942dpif_netlink_ct_get_limits(struct dpif *dpif OVS_UNUSED,
2943 uint32_t *default_limit,
2944 const struct ovs_list *zone_limits_request,
2945 struct ovs_list *zone_limits_reply)
2946{
2947 if (ovs_ct_limit_family < 0) {
2948 return EOPNOTSUPP;
2949 }
2950
2951 struct ofpbuf *request = ofpbuf_new(NL_DUMP_BUFSIZE);
2952 nl_msg_put_genlmsghdr(request, 0, ovs_ct_limit_family,
2953 NLM_F_REQUEST | NLM_F_ECHO, OVS_CT_LIMIT_CMD_GET,
2954 OVS_CT_LIMIT_VERSION);
2955
2956 struct ovs_header *ovs_header;
2957 ovs_header = ofpbuf_put_uninit(request, sizeof *ovs_header);
2958 ovs_header->dp_ifindex = 0;
2959
2960 if (!ovs_list_is_empty(zone_limits_request)) {
2961 size_t opt_offset = nl_msg_start_nested(request,
2962 OVS_CT_LIMIT_ATTR_ZONE_LIMIT);
2963
2964 struct ovs_zone_limit req_zone_limit;
2965 req_zone_limit.zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE;
2966 nl_msg_put(request, &req_zone_limit, sizeof req_zone_limit);
2967
2968 struct ct_dpif_zone_limit *zone_limit;
2969 LIST_FOR_EACH (zone_limit, node, zone_limits_request) {
2970 req_zone_limit.zone_id = zone_limit->zone;
2971 nl_msg_put(request, &req_zone_limit, sizeof req_zone_limit);
2972 }
2973
2974 nl_msg_end_nested(request, opt_offset);
2975 }
2976
2977 struct ofpbuf *reply;
2978 int err = nl_transact(NETLINK_GENERIC, request, &reply);
2979 if (err) {
2980 goto out;
2981 }
2982
2983 err = dpif_netlink_zone_limits_from_ofpbuf(reply, default_limit,
2984 zone_limits_reply);
2985
2986out:
2987 ofpbuf_uninit(request);
2988 ofpbuf_uninit(reply);
2989 return err;
2990}
2991
2992static int
2993dpif_netlink_ct_del_limits(struct dpif *dpif OVS_UNUSED,
2994 const struct ovs_list *zone_limits)
2995{
2996 if (ovs_ct_limit_family < 0) {
2997 return EOPNOTSUPP;
2998 }
2999
3000 struct ofpbuf *request = ofpbuf_new(NL_DUMP_BUFSIZE);
3001 nl_msg_put_genlmsghdr(request, 0, ovs_ct_limit_family,
3002 NLM_F_REQUEST | NLM_F_ECHO, OVS_CT_LIMIT_CMD_DEL,
3003 OVS_CT_LIMIT_VERSION);
3004
3005 struct ovs_header *ovs_header;
3006 ovs_header = ofpbuf_put_uninit(request, sizeof *ovs_header);
3007 ovs_header->dp_ifindex = 0;
3008
3009 if (!ovs_list_is_empty(zone_limits)) {
3010 size_t opt_offset =
3011 nl_msg_start_nested(request, OVS_CT_LIMIT_ATTR_ZONE_LIMIT);
3012
3013 struct ct_dpif_zone_limit *zone_limit;
3014 LIST_FOR_EACH (zone_limit, node, zone_limits) {
3015 struct ovs_zone_limit req_zone_limit;
3016 req_zone_limit.zone_id = zone_limit->zone;
3017 nl_msg_put(request, &req_zone_limit, sizeof req_zone_limit);
3018 }
3019 nl_msg_end_nested(request, opt_offset);
3020 }
3021
3022 int err = nl_transact(NETLINK_GENERIC, request, NULL);
3023
3024 ofpbuf_uninit(request);
3025 return err;
3026}
5dddf960
JR
3027\f
3028/* Meters */
80738e5f
AZ
3029
3030/* Set of supported meter flags */
3031#define DP_SUPPORTED_METER_FLAGS_MASK \
3032 (OFPMF13_STATS | OFPMF13_PKTPS | OFPMF13_KBPS | OFPMF13_BURST)
3033
92d0d515
JP
3034/* Meter support was introduced in Linux 4.15. In some versions of
3035 * Linux 4.15, 4.16, and 4.17, there was a bug that never set the id
3036 * when the meter was created, so all meters essentially had an id of
3037 * zero. Check for that condition and disable meters on those kernels. */
3038static bool probe_broken_meters(struct dpif *);
3039
5dddf960 3040static void
80738e5f
AZ
3041dpif_netlink_meter_init(struct dpif_netlink *dpif, struct ofpbuf *buf,
3042 void *stub, size_t size, uint32_t command)
3043{
3044 ofpbuf_use_stub(buf, stub, size);
3045
3046 nl_msg_put_genlmsghdr(buf, 0, ovs_meter_family, NLM_F_REQUEST | NLM_F_ECHO,
3047 command, OVS_METER_VERSION);
3048
3049 struct ovs_header *ovs_header;
3050 ovs_header = ofpbuf_put_uninit(buf, sizeof *ovs_header);
3051 ovs_header->dp_ifindex = dpif->dp_ifindex;
3052}
3053
3054/* Execute meter 'request' in the kernel datapath. If the command
3055 * fails, returns a positive errno value. Otherwise, stores the reply
3056 * in '*replyp', parses the policy according to 'reply_policy' into the
3057 * array of Netlink attribute in 'a', and returns 0. On success, the
3058 * caller is responsible for calling ofpbuf_delete() on '*replyp'
3059 * ('replyp' will contain pointers into 'a'). */
3060static int
3061dpif_netlink_meter_transact(struct ofpbuf *request, struct ofpbuf **replyp,
3062 const struct nl_policy *reply_policy,
3063 struct nlattr **a, size_t size_a)
3064{
3065 int error = nl_transact(NETLINK_GENERIC, request, replyp);
3066 ofpbuf_uninit(request);
3067
3068 if (error) {
3069 return error;
3070 }
3071
3072 struct nlmsghdr *nlmsg = ofpbuf_try_pull(*replyp, sizeof *nlmsg);
3073 struct genlmsghdr *genl = ofpbuf_try_pull(*replyp, sizeof *genl);
3074 struct ovs_header *ovs_header = ofpbuf_try_pull(*replyp,
3075 sizeof *ovs_header);
3076 if (!nlmsg || !genl || !ovs_header
3077 || nlmsg->nlmsg_type != ovs_meter_family
3078 || !nl_policy_parse(*replyp, 0, reply_policy, a, size_a)) {
3079 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3080 VLOG_DBG_RL(&rl,
3081 "Kernel module response to meter tranaction is invalid");
3082 return EINVAL;
3083 }
3084 return 0;
3085}
3086
3087static void
3088dpif_netlink_meter_get_features(const struct dpif *dpif_,
5dddf960
JR
3089 struct ofputil_meter_features *features)
3090{
92d0d515
JP
3091 if (probe_broken_meters(CONST_CAST(struct dpif *, dpif_))) {
3092 features = NULL;
3093 return;
3094 }
3095
80738e5f
AZ
3096 struct ofpbuf buf, *msg;
3097 uint64_t stub[1024 / 8];
3098
3099 static const struct nl_policy ovs_meter_features_policy[] = {
3100 [OVS_METER_ATTR_MAX_METERS] = { .type = NL_A_U32 },
3101 [OVS_METER_ATTR_MAX_BANDS] = { .type = NL_A_U32 },
3102 [OVS_METER_ATTR_BANDS] = { .type = NL_A_NESTED, .optional = true },
3103 };
3104 struct nlattr *a[ARRAY_SIZE(ovs_meter_features_policy)];
3105
3106 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
3107 dpif_netlink_meter_init(dpif, &buf, stub, sizeof stub,
3108 OVS_METER_CMD_FEATURES);
3109 if (dpif_netlink_meter_transact(&buf, &msg, ovs_meter_features_policy, a,
3110 ARRAY_SIZE(ovs_meter_features_policy))) {
3111 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3112 VLOG_INFO_RL(&rl,
3113 "dpif_netlink_meter_transact OVS_METER_CMD_FEATURES failed");
3114 return;
3115 }
3116
3117 features->max_meters = nl_attr_get_u32(a[OVS_METER_ATTR_MAX_METERS]);
3118 features->max_bands = nl_attr_get_u32(a[OVS_METER_ATTR_MAX_BANDS]);
3119
3120 /* Bands is a nested attribute of zero or more nested
3121 * band attributes. */
3122 if (a[OVS_METER_ATTR_BANDS]) {
3123 const struct nlattr *nla;
3124 size_t left;
3125
3126 NL_NESTED_FOR_EACH (nla, left, a[OVS_METER_ATTR_BANDS]) {
3127 const struct nlattr *band_nla;
3128 size_t band_left;
3129
3130 NL_NESTED_FOR_EACH (band_nla, band_left, nla) {
3131 if (nl_attr_type(band_nla) == OVS_BAND_ATTR_TYPE) {
3132 if (nl_attr_get_size(band_nla) == sizeof(uint32_t)) {
3133 switch (nl_attr_get_u32(band_nla)) {
3134 case OVS_METER_BAND_TYPE_DROP:
3135 features->band_types |= 1 << OFPMBT13_DROP;
3136 break;
3137 }
3138 }
3139 }
3140 }
3141 }
3142 }
3143 features->capabilities = DP_SUPPORTED_METER_FLAGS_MASK;
3144
3145 ofpbuf_delete(msg);
5dddf960
JR
3146}
3147
3148static int
60ebc04d
JP
3149dpif_netlink_meter_set__(struct dpif *dpif_, ofproto_meter_id meter_id,
3150 struct ofputil_meter_config *config)
5dddf960 3151{
80738e5f
AZ
3152 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
3153 struct ofpbuf buf, *msg;
3154 uint64_t stub[1024 / 8];
3155
3156 static const struct nl_policy ovs_meter_set_response_policy[] = {
3157 [OVS_METER_ATTR_ID] = { .type = NL_A_U32 },
3158 };
3159 struct nlattr *a[ARRAY_SIZE(ovs_meter_set_response_policy)];
3160
3161 if (config->flags & ~DP_SUPPORTED_METER_FLAGS_MASK) {
3162 return EBADF; /* Unsupported flags set */
3163 }
3164
3165 for (size_t i = 0; i < config->n_bands; i++) {
3166 switch (config->bands[i].type) {
3167 case OFPMBT13_DROP:
3168 break;
3169 default:
3170 return ENODEV; /* Unsupported band type */
3171 }
3172 }
3173
3174 dpif_netlink_meter_init(dpif, &buf, stub, sizeof stub, OVS_METER_CMD_SET);
3175
8101f03f
JP
3176 nl_msg_put_u32(&buf, OVS_METER_ATTR_ID, meter_id.uint32);
3177
80738e5f
AZ
3178 if (config->flags & OFPMF13_KBPS) {
3179 nl_msg_put_flag(&buf, OVS_METER_ATTR_KBPS);
3180 }
3181
3182 size_t bands_offset = nl_msg_start_nested(&buf, OVS_METER_ATTR_BANDS);
3183 /* Bands */
3184 for (size_t i = 0; i < config->n_bands; ++i) {
3185 struct ofputil_meter_band * band = &config->bands[i];
3186 uint32_t band_type;
3187
3188 size_t band_offset = nl_msg_start_nested(&buf, OVS_BAND_ATTR_UNSPEC);
3189
3190 switch (band->type) {
3191 case OFPMBT13_DROP:
3192 band_type = OVS_METER_BAND_TYPE_DROP;
3193 break;
3194 default:
3195 band_type = OVS_METER_BAND_TYPE_UNSPEC;
3196 }
3197 nl_msg_put_u32(&buf, OVS_BAND_ATTR_TYPE, band_type);
3198 nl_msg_put_u32(&buf, OVS_BAND_ATTR_RATE, band->rate);
3199 nl_msg_put_u32(&buf, OVS_BAND_ATTR_BURST,
3200 config->flags & OFPMF13_BURST ?
3201 band->burst_size : band->rate);
3202 nl_msg_end_nested(&buf, band_offset);
3203 }
3204 nl_msg_end_nested(&buf, bands_offset);
3205
3206 int error = dpif_netlink_meter_transact(&buf, &msg,
3207 ovs_meter_set_response_policy, a,
3208 ARRAY_SIZE(ovs_meter_set_response_policy));
3209 if (error) {
3210 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3211 VLOG_INFO_RL(&rl,
3212 "dpif_netlink_meter_transact OVS_METER_CMD_SET failed");
3213 return error;
3214 }
3215
8101f03f
JP
3216 if (nl_attr_get_u32(a[OVS_METER_ATTR_ID]) != meter_id.uint32) {
3217 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3218 VLOG_INFO_RL(&rl,
3219 "Kernel returned a different meter id than requested");
3220 }
80738e5f
AZ
3221 ofpbuf_delete(msg);
3222 return 0;
5dddf960
JR
3223}
3224
60ebc04d
JP
3225static int
3226dpif_netlink_meter_set(struct dpif *dpif_, ofproto_meter_id meter_id,
3227 struct ofputil_meter_config *config)
3228{
3229 if (probe_broken_meters(dpif_)) {
3230 return ENOMEM;
3231 }
3232
3233 return dpif_netlink_meter_set__(dpif_, meter_id, config);
3234}
3235
80738e5f
AZ
3236/* Retrieve statistics and/or delete meter 'meter_id'. Statistics are
3237 * stored in 'stats', if it is not null. If 'command' is
3238 * OVS_METER_CMD_DEL, the meter is deleted and statistics are optionally
3239 * retrieved. If 'command' is OVS_METER_CMD_GET, then statistics are
3240 * simply retrieved. */
5dddf960 3241static int
80738e5f
AZ
3242dpif_netlink_meter_get_stats(const struct dpif *dpif_,
3243 ofproto_meter_id meter_id,
3244 struct ofputil_meter_stats *stats,
3245 uint16_t max_bands,
3246 enum ovs_meter_cmd command)
5dddf960 3247{
80738e5f
AZ
3248 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
3249 struct ofpbuf buf, *msg;
3250 uint64_t stub[1024 / 8];
3251
3252 static const struct nl_policy ovs_meter_stats_policy[] = {
3253 [OVS_METER_ATTR_ID] = { .type = NL_A_U32, .optional = true},
3254 [OVS_METER_ATTR_STATS] = { NL_POLICY_FOR(struct ovs_flow_stats),
3255 .optional = true},
3256 [OVS_METER_ATTR_BANDS] = { .type = NL_A_NESTED, .optional = true },
3257 };
3258 struct nlattr *a[ARRAY_SIZE(ovs_meter_stats_policy)];
3259
3260 dpif_netlink_meter_init(dpif, &buf, stub, sizeof stub, command);
3261
3262 nl_msg_put_u32(&buf, OVS_METER_ATTR_ID, meter_id.uint32);
3263
3264 int error = dpif_netlink_meter_transact(&buf, &msg,
3265 ovs_meter_stats_policy, a,
3266 ARRAY_SIZE(ovs_meter_stats_policy));
3267 if (error) {
3268 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
3269 VLOG_INFO_RL(&rl, "dpif_netlink_meter_transact %s failed",
3270 command == OVS_METER_CMD_GET ? "get" : "del");
3271 return error;
3272 }
3273
3274 if (stats
3275 && a[OVS_METER_ATTR_ID]
3276 && a[OVS_METER_ATTR_STATS]
3277 && nl_attr_get_u32(a[OVS_METER_ATTR_ID]) == meter_id.uint32) {
3278 /* return stats */
3279 const struct ovs_flow_stats *stat;
3280 const struct nlattr *nla;
3281 size_t left;
3282
3283 stat = nl_attr_get(a[OVS_METER_ATTR_STATS]);
3284 stats->packet_in_count = get_32aligned_u64(&stat->n_packets);
3285 stats->byte_in_count = get_32aligned_u64(&stat->n_bytes);
3286
3287 if (a[OVS_METER_ATTR_BANDS]) {
3288 size_t n_bands = 0;
3289 NL_NESTED_FOR_EACH (nla, left, a[OVS_METER_ATTR_BANDS]) {
3290 const struct nlattr *band_nla;
3291 band_nla = nl_attr_find_nested(nla, OVS_BAND_ATTR_STATS);
3292 if (band_nla && nl_attr_get_size(band_nla) \
3293 == sizeof(struct ovs_flow_stats)) {
3294 stat = nl_attr_get(band_nla);
3295
3296 if (n_bands < max_bands) {
3297 stats->bands[n_bands].packet_count
3298 = get_32aligned_u64(&stat->n_packets);
3299 stats->bands[n_bands].byte_count
3300 = get_32aligned_u64(&stat->n_bytes);
3301 ++n_bands;
3302 }
3303 } else {
3304 stats->bands[n_bands].packet_count = 0;
3305 stats->bands[n_bands].byte_count = 0;
3306 ++n_bands;
3307 }
3308 }
3309 stats->n_bands = n_bands;
3310 } else {
3311 /* For a non-existent meter, return 0 stats. */
3312 stats->n_bands = 0;
3313 }
3314 }
3315
3316 ofpbuf_delete(msg);
3317 return error;
5dddf960
JR
3318}
3319
3320static int
80738e5f
AZ
3321dpif_netlink_meter_get(const struct dpif *dpif, ofproto_meter_id meter_id,
3322 struct ofputil_meter_stats *stats, uint16_t max_bands)
5dddf960 3323{
80738e5f
AZ
3324 return dpif_netlink_meter_get_stats(dpif, meter_id, stats, max_bands,
3325 OVS_METER_CMD_GET);
3326}
3327
3328static int
3329dpif_netlink_meter_del(struct dpif *dpif, ofproto_meter_id meter_id,
3330 struct ofputil_meter_stats *stats, uint16_t max_bands)
3331{
3332 return dpif_netlink_meter_get_stats(dpif, meter_id, stats, max_bands,
3333 OVS_METER_CMD_DEL);
5dddf960
JR
3334}
3335
92d0d515
JP
3336static bool
3337probe_broken_meters__(struct dpif *dpif)
3338{
3339 /* This test is destructive if a probe occurs while ovs-vswitchd is
3340 * running (e.g., an ovs-dpctl meter command is called), so choose a
3341 * random high meter id to make this less likely to occur. */
3342 ofproto_meter_id id1 = { 54545401 };
3343 ofproto_meter_id id2 = { 54545402 };
3344 struct ofputil_meter_band band = {OFPMBT13_DROP, 0, 1, 0};
3345 struct ofputil_meter_config config1 = { 1, OFPMF13_KBPS, 1, &band};
3346 struct ofputil_meter_config config2 = { 2, OFPMF13_KBPS, 1, &band};
3347
3348 /* Try adding two meters and make sure that they both come back with
60ebc04d
JP
3349 * the proper meter id. Use the "__" version so that we don't cause
3350 * a recurve deadlock. */
3351 dpif_netlink_meter_set__(dpif, id1, &config1);
3352 dpif_netlink_meter_set__(dpif, id2, &config2);
92d0d515
JP
3353
3354 if (dpif_netlink_meter_get(dpif, id1, NULL, 0)
3355 || dpif_netlink_meter_get(dpif, id2, NULL, 0)) {
3356 VLOG_INFO("The kernel module has a broken meter implementation.");
3357 return true;
3358 }
3359
3360 dpif_netlink_meter_del(dpif, id1, NULL, 0);
3361 dpif_netlink_meter_del(dpif, id2, NULL, 0);
3362
3363 return false;
3364}
3365
3366static bool
3367probe_broken_meters(struct dpif *dpif)
3368{
3369 /* This is a once-only test because currently OVS only has at most a single
3370 * Netlink capable datapath on any given platform. */
3371 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3372
3373 static bool broken_meters = false;
3374 if (ovsthread_once_start(&once)) {
3375 broken_meters = probe_broken_meters__(dpif);
3376 ovsthread_once_done(&once);
3377 }
3378 return broken_meters;
3379}
5dddf960 3380\f
93451a0a 3381const struct dpif_class dpif_netlink_class = {
1a6f1e2a 3382 "system",
c8973eb6 3383 NULL, /* init */
93451a0a 3384 dpif_netlink_enumerate,
0aeaabc8 3385 NULL,
93451a0a
AS
3386 dpif_netlink_open,
3387 dpif_netlink_close,
3388 dpif_netlink_destroy,
3389 dpif_netlink_run,
e4516b20 3390 NULL, /* wait */
93451a0a
AS
3391 dpif_netlink_get_stats,
3392 dpif_netlink_port_add,
3393 dpif_netlink_port_del,
91364d18 3394 NULL, /* port_set_config */
93451a0a
AS
3395 dpif_netlink_port_query_by_number,
3396 dpif_netlink_port_query_by_name,
3397 dpif_netlink_port_get_pid,
3398 dpif_netlink_port_dump_start,
3399 dpif_netlink_port_dump_next,
3400 dpif_netlink_port_dump_done,
3401 dpif_netlink_port_poll,
3402 dpif_netlink_port_poll_wait,
3403 dpif_netlink_flow_flush,
3404 dpif_netlink_flow_dump_create,
3405 dpif_netlink_flow_dump_destroy,
3406 dpif_netlink_flow_dump_thread_create,
3407 dpif_netlink_flow_dump_thread_destroy,
3408 dpif_netlink_flow_dump_next,
3409 dpif_netlink_operate,
3410 dpif_netlink_recv_set,
3411 dpif_netlink_handlers_set,
d4f6865c 3412 NULL, /* set_config */
93451a0a
AS
3413 dpif_netlink_queue_to_priority,
3414 dpif_netlink_recv,
3415 dpif_netlink_recv_wait,
3416 dpif_netlink_recv_purge,
e4e74c3a 3417 NULL, /* register_dp_purge_cb */
6b31e073
RW
3418 NULL, /* register_upcall_cb */
3419 NULL, /* enable_upcall */
3420 NULL, /* disable_upcall */
b5cbbcf6 3421 dpif_netlink_get_datapath_version, /* get_datapath_version */
c11c9f4a
DDP
3422 dpif_netlink_ct_dump_start,
3423 dpif_netlink_ct_dump_next,
3424 dpif_netlink_ct_dump_done,
5dddf960 3425 dpif_netlink_ct_flush,
c92339ad
DB
3426 NULL, /* ct_set_maxconns */
3427 NULL, /* ct_get_maxconns */
875075b3 3428 NULL, /* ct_get_nconns */
906ff9d2
YHW
3429 dpif_netlink_ct_set_limits,
3430 dpif_netlink_ct_get_limits,
3431 dpif_netlink_ct_del_limits,
4ea96698
DB
3432 NULL, /* ipf_set_enabled */
3433 NULL, /* ipf_set_min_frag */
3434 NULL, /* ipf_set_max_nfrags */
3435 NULL, /* ipf_get_status */
3436 NULL, /* ipf_dump_start */
3437 NULL, /* ipf_dump_next */
3438 NULL, /* ipf_dump_done */
5dddf960
JR
3439 dpif_netlink_meter_get_features,
3440 dpif_netlink_meter_set,
3441 dpif_netlink_meter_get,
3442 dpif_netlink_meter_del,
96fba48f 3443};
93451a0a 3444
96fba48f 3445static int
93451a0a 3446dpif_netlink_init(void)
96fba48f 3447{
eb8ed438
BP
3448 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
3449 static int error;
982b8810 3450
eb8ed438 3451 if (ovsthread_once_start(&once)) {
df2c07f4
JP
3452 error = nl_lookup_genl_family(OVS_DATAPATH_FAMILY,
3453 &ovs_datapath_family);
37a1300c 3454 if (error) {
e0e2410d 3455 VLOG_INFO("Generic Netlink family '%s' does not exist. "
cae7529c
CL
3456 "The Open vSwitch kernel module is probably not loaded.",
3457 OVS_DATAPATH_FAMILY);
37a1300c 3458 }
f0fef760 3459 if (!error) {
df2c07f4 3460 error = nl_lookup_genl_family(OVS_VPORT_FAMILY, &ovs_vport_family);
f0fef760 3461 }
37a1300c 3462 if (!error) {
df2c07f4 3463 error = nl_lookup_genl_family(OVS_FLOW_FAMILY, &ovs_flow_family);
37a1300c 3464 }
aaff4b55 3465 if (!error) {
df2c07f4
JP
3466 error = nl_lookup_genl_family(OVS_PACKET_FAMILY,
3467 &ovs_packet_family);
aaff4b55 3468 }
c7178a0b
EJ
3469 if (!error) {
3470 error = nl_lookup_genl_mcgroup(OVS_VPORT_FAMILY, OVS_VPORT_MCGROUP,
b3dcb73c 3471 &ovs_vport_mcgroup);
c7178a0b 3472 }
80738e5f
AZ
3473 if (!error) {
3474 if (nl_lookup_genl_family(OVS_METER_FAMILY, &ovs_meter_family)) {
3475 VLOG_INFO("The kernel module does not support meters.");
3476 }
3477 }
906ff9d2
YHW
3478 if (nl_lookup_genl_family(OVS_CT_LIMIT_FAMILY,
3479 &ovs_ct_limit_family) < 0) {
3480 VLOG_INFO("Generic Netlink family '%s' does not exist. "
3481 "Please update the Open vSwitch kernel module to enable "
3482 "the conntrack limit feature.", OVS_CT_LIMIT_FAMILY);
3483 }
eb8ed438 3484
921c370a
EG
3485 ovs_tunnels_out_of_tree = dpif_netlink_rtnl_probe_oot_tunnels();
3486
eb8ed438 3487 ovsthread_once_done(&once);
982b8810
BP
3488 }
3489
3490 return error;
96fba48f
BP
3491}
3492
c19e6535 3493bool
93451a0a 3494dpif_netlink_is_internal_device(const char *name)
9fe3b9a2 3495{
93451a0a 3496 struct dpif_netlink_vport reply;
c19e6535 3497 struct ofpbuf *buf;
9fe3b9a2 3498 int error;
96fba48f 3499
93451a0a 3500 error = dpif_netlink_vport_get(name, &reply, &buf);
c19e6535
BP
3501 if (!error) {
3502 ofpbuf_delete(buf);
141d9ce4 3503 } else if (error != ENODEV && error != ENOENT) {
c19e6535 3504 VLOG_WARN_RL(&error_rl, "%s: vport query failed (%s)",
10a89ef0 3505 name, ovs_strerror(error));
96fba48f
BP
3506 }
3507
df2c07f4 3508 return reply.type == OVS_VPORT_TYPE_INTERNAL;
96fba48f 3509}
e0467f6d 3510
df2c07f4 3511/* Parses the contents of 'buf', which contains a "struct ovs_header" followed
c19e6535
BP
3512 * by Netlink attributes, into 'vport'. Returns 0 if successful, otherwise a
3513 * positive errno value.
3514 *
3515 * 'vport' will contain pointers into 'buf', so the caller should not free
3516 * 'buf' while 'vport' is still in use. */
3517static int
93451a0a 3518dpif_netlink_vport_from_ofpbuf(struct dpif_netlink_vport *vport,
c19e6535
BP
3519 const struct ofpbuf *buf)
3520{
df2c07f4
JP
3521 static const struct nl_policy ovs_vport_policy[] = {
3522 [OVS_VPORT_ATTR_PORT_NO] = { .type = NL_A_U32 },
3523 [OVS_VPORT_ATTR_TYPE] = { .type = NL_A_U32 },
3524 [OVS_VPORT_ATTR_NAME] = { .type = NL_A_STRING, .max_len = IFNAMSIZ },
1579cf67 3525 [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NL_A_UNSPEC },
f7df9823 3526 [OVS_VPORT_ATTR_STATS] = { NL_POLICY_FOR(struct ovs_vport_stats),
c19e6535 3527 .optional = true },
df2c07f4 3528 [OVS_VPORT_ATTR_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
bfda5239 3529 [OVS_VPORT_ATTR_NETNSID] = { .type = NL_A_U32, .optional = true },
c19e6535
BP
3530 };
3531
93451a0a 3532 dpif_netlink_vport_init(vport);
c19e6535 3533
0a2869d5
BP
3534 struct ofpbuf b = ofpbuf_const_initializer(buf->data, buf->size);
3535 struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
3536 struct genlmsghdr *genl = ofpbuf_try_pull(&b, sizeof *genl);
3537 struct ovs_header *ovs_header = ofpbuf_try_pull(&b, sizeof *ovs_header);
3538
3539 struct nlattr *a[ARRAY_SIZE(ovs_vport_policy)];
df2c07f4
JP
3540 if (!nlmsg || !genl || !ovs_header
3541 || nlmsg->nlmsg_type != ovs_vport_family
3542 || !nl_policy_parse(&b, 0, ovs_vport_policy, a,
3543 ARRAY_SIZE(ovs_vport_policy))) {
c19e6535
BP
3544 return EINVAL;
3545 }
c19e6535 3546
f0fef760 3547 vport->cmd = genl->cmd;
df2c07f4 3548 vport->dp_ifindex = ovs_header->dp_ifindex;
4e022ec0 3549 vport->port_no = nl_attr_get_odp_port(a[OVS_VPORT_ATTR_PORT_NO]);
df2c07f4
JP
3550 vport->type = nl_attr_get_u32(a[OVS_VPORT_ATTR_TYPE]);
3551 vport->name = nl_attr_get_string(a[OVS_VPORT_ATTR_NAME]);
b063d9f0 3552 if (a[OVS_VPORT_ATTR_UPCALL_PID]) {
1579cf67
AW
3553 vport->n_upcall_pids = nl_attr_get_size(a[OVS_VPORT_ATTR_UPCALL_PID])
3554 / (sizeof *vport->upcall_pids);
3555 vport->upcall_pids = nl_attr_get(a[OVS_VPORT_ATTR_UPCALL_PID]);
3556
b063d9f0 3557 }
df2c07f4
JP
3558 if (a[OVS_VPORT_ATTR_STATS]) {
3559 vport->stats = nl_attr_get(a[OVS_VPORT_ATTR_STATS]);
3560 }
df2c07f4
JP
3561 if (a[OVS_VPORT_ATTR_OPTIONS]) {
3562 vport->options = nl_attr_get(a[OVS_VPORT_ATTR_OPTIONS]);
3563 vport->options_len = nl_attr_get_size(a[OVS_VPORT_ATTR_OPTIONS]);
c19e6535 3564 }
bfda5239
FL
3565 if (a[OVS_VPORT_ATTR_NETNSID]) {
3566 netnsid_set(&vport->netnsid,
3567 nl_attr_get_u32(a[OVS_VPORT_ATTR_NETNSID]));
3568 } else {
3569 netnsid_set_local(&vport->netnsid);
3570 }
c19e6535
BP
3571 return 0;
3572}
3573
df2c07f4 3574/* Appends to 'buf' (which must initially be empty) a "struct ovs_header"
c19e6535
BP
3575 * followed by Netlink attributes corresponding to 'vport'. */
3576static void
93451a0a
AS
3577dpif_netlink_vport_to_ofpbuf(const struct dpif_netlink_vport *vport,
3578 struct ofpbuf *buf)
c19e6535 3579{
df2c07f4 3580 struct ovs_header *ovs_header;
f0fef760 3581
df2c07f4 3582 nl_msg_put_genlmsghdr(buf, 0, ovs_vport_family, NLM_F_REQUEST | NLM_F_ECHO,
69685a88 3583 vport->cmd, OVS_VPORT_VERSION);
c19e6535 3584
df2c07f4
JP
3585 ovs_header = ofpbuf_put_uninit(buf, sizeof *ovs_header);
3586 ovs_header->dp_ifindex = vport->dp_ifindex;
c19e6535 3587
4e022ec0
AW
3588 if (vport->port_no != ODPP_NONE) {
3589 nl_msg_put_odp_port(buf, OVS_VPORT_ATTR_PORT_NO, vport->port_no);
c19e6535
BP
3590 }
3591
df2c07f4
JP
3592 if (vport->type != OVS_VPORT_TYPE_UNSPEC) {
3593 nl_msg_put_u32(buf, OVS_VPORT_ATTR_TYPE, vport->type);
c19e6535
BP
3594 }
3595
3596 if (vport->name) {
df2c07f4 3597 nl_msg_put_string(buf, OVS_VPORT_ATTR_NAME, vport->name);
c19e6535
BP
3598 }
3599
1579cf67
AW
3600 if (vport->upcall_pids) {
3601 nl_msg_put_unspec(buf, OVS_VPORT_ATTR_UPCALL_PID,
3602 vport->upcall_pids,
3603 vport->n_upcall_pids * sizeof *vport->upcall_pids);
a24a6574 3604 }
b063d9f0 3605
c19e6535 3606 if (vport->stats) {
df2c07f4 3607 nl_msg_put_unspec(buf, OVS_VPORT_ATTR_STATS,
c19e6535
BP
3608 vport->stats, sizeof *vport->stats);
3609 }
3610
c19e6535 3611 if (vport->options) {
df2c07f4 3612 nl_msg_put_nested(buf, OVS_VPORT_ATTR_OPTIONS,
c19e6535
BP
3613 vport->options, vport->options_len);
3614 }
c19e6535
BP
3615}
3616
3617/* Clears 'vport' to "empty" values. */
3618void
93451a0a 3619dpif_netlink_vport_init(struct dpif_netlink_vport *vport)
c19e6535
BP
3620{
3621 memset(vport, 0, sizeof *vport);
4e022ec0 3622 vport->port_no = ODPP_NONE;
c19e6535
BP
3623}
3624
3625/* Executes 'request' in the kernel datapath. If the command fails, returns a
3626 * positive errno value. Otherwise, if 'reply' and 'bufp' are null, returns 0
3627 * without doing anything else. If 'reply' and 'bufp' are nonnull, then the
df2c07f4 3628 * result of the command is expected to be an ovs_vport also, which is decoded
c19e6535
BP
3629 * and stored in '*reply' and '*bufp'. The caller must free '*bufp' when the
3630 * reply is no longer needed ('reply' will contain pointers into '*bufp'). */
3631int
93451a0a
AS
3632dpif_netlink_vport_transact(const struct dpif_netlink_vport *request,
3633 struct dpif_netlink_vport *reply,
3634 struct ofpbuf **bufp)
c19e6535 3635{
f0fef760 3636 struct ofpbuf *request_buf;
c19e6535
BP
3637 int error;
3638
cb22974d 3639 ovs_assert((reply != NULL) == (bufp != NULL));
c19e6535 3640
93451a0a 3641 error = dpif_netlink_init();
42bb6c72
BP
3642 if (error) {
3643 if (reply) {
3644 *bufp = NULL;
93451a0a 3645 dpif_netlink_vport_init(reply);
42bb6c72
BP
3646 }
3647 return error;
3648 }
3649
f0fef760 3650 request_buf = ofpbuf_new(1024);
93451a0a 3651 dpif_netlink_vport_to_ofpbuf(request, request_buf);
a88b4e04 3652 error = nl_transact(NETLINK_GENERIC, request_buf, bufp);
f0fef760 3653 ofpbuf_delete(request_buf);
c19e6535 3654
f0fef760
BP
3655 if (reply) {
3656 if (!error) {
93451a0a 3657 error = dpif_netlink_vport_from_ofpbuf(reply, *bufp);
f0fef760 3658 }
c19e6535 3659 if (error) {
93451a0a 3660 dpif_netlink_vport_init(reply);
f0fef760
BP
3661 ofpbuf_delete(*bufp);
3662 *bufp = NULL;
c19e6535 3663 }
c19e6535
BP
3664 }
3665 return error;
3666}
3667
3668/* Obtains information about the kernel vport named 'name' and stores it into
3669 * '*reply' and '*bufp'. The caller must free '*bufp' when the reply is no
3670 * longer needed ('reply' will contain pointers into '*bufp'). */
3671int
93451a0a
AS
3672dpif_netlink_vport_get(const char *name, struct dpif_netlink_vport *reply,
3673 struct ofpbuf **bufp)
c19e6535 3674{
93451a0a 3675 struct dpif_netlink_vport request;
c19e6535 3676
93451a0a 3677 dpif_netlink_vport_init(&request);
df2c07f4 3678 request.cmd = OVS_VPORT_CMD_GET;
c19e6535
BP
3679 request.name = name;
3680
93451a0a 3681 return dpif_netlink_vport_transact(&request, reply, bufp);
c19e6535 3682}
93451a0a 3683
df2c07f4 3684/* Parses the contents of 'buf', which contains a "struct ovs_header" followed
aaff4b55
BP
3685 * by Netlink attributes, into 'dp'. Returns 0 if successful, otherwise a
3686 * positive errno value.
d6569377
BP
3687 *
3688 * 'dp' will contain pointers into 'buf', so the caller should not free 'buf'
3689 * while 'dp' is still in use. */
3690static int
93451a0a 3691dpif_netlink_dp_from_ofpbuf(struct dpif_netlink_dp *dp, const struct ofpbuf *buf)
d6569377 3692{
df2c07f4
JP
3693 static const struct nl_policy ovs_datapath_policy[] = {
3694 [OVS_DP_ATTR_NAME] = { .type = NL_A_STRING, .max_len = IFNAMSIZ },
f7df9823 3695 [OVS_DP_ATTR_STATS] = { NL_POLICY_FOR(struct ovs_dp_stats),
d6569377 3696 .optional = true },
847108dc
AZ
3697 [OVS_DP_ATTR_MEGAFLOW_STATS] = {
3698 NL_POLICY_FOR(struct ovs_dp_megaflow_stats),
3699 .optional = true },
d6569377
BP
3700 };
3701
93451a0a 3702 dpif_netlink_dp_init(dp);
d6569377 3703
0a2869d5
BP
3704 struct ofpbuf b = ofpbuf_const_initializer(buf->data, buf->size);
3705 struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
3706 struct genlmsghdr *genl = ofpbuf_try_pull(&b, sizeof *genl);
3707 struct ovs_header *ovs_header = ofpbuf_try_pull(&b, sizeof *ovs_header);
3708
3709 struct nlattr *a[ARRAY_SIZE(ovs_datapath_policy)];
df2c07f4
JP
3710 if (!nlmsg || !genl || !ovs_header
3711 || nlmsg->nlmsg_type != ovs_datapath_family
3712 || !nl_policy_parse(&b, 0, ovs_datapath_policy, a,
3713 ARRAY_SIZE(ovs_datapath_policy))) {
d6569377
BP
3714 return EINVAL;
3715 }
d6569377 3716
aaff4b55 3717 dp->cmd = genl->cmd;
df2c07f4
JP
3718 dp->dp_ifindex = ovs_header->dp_ifindex;
3719 dp->name = nl_attr_get_string(a[OVS_DP_ATTR_NAME]);
3720 if (a[OVS_DP_ATTR_STATS]) {
6a54dedc 3721 dp->stats = nl_attr_get(a[OVS_DP_ATTR_STATS]);
d6569377 3722 }
982b8810 3723
847108dc 3724 if (a[OVS_DP_ATTR_MEGAFLOW_STATS]) {
6a54dedc 3725 dp->megaflow_stats = nl_attr_get(a[OVS_DP_ATTR_MEGAFLOW_STATS]);
847108dc
AZ
3726 }
3727
d6569377
BP
3728 return 0;
3729}
3730
aaff4b55 3731/* Appends to 'buf' the Generic Netlink message described by 'dp'. */
d6569377 3732static void
93451a0a 3733dpif_netlink_dp_to_ofpbuf(const struct dpif_netlink_dp *dp, struct ofpbuf *buf)
d6569377 3734{
df2c07f4 3735 struct ovs_header *ovs_header;
d6569377 3736
df2c07f4 3737 nl_msg_put_genlmsghdr(buf, 0, ovs_datapath_family,
69685a88
JG
3738 NLM_F_REQUEST | NLM_F_ECHO, dp->cmd,
3739 OVS_DATAPATH_VERSION);
aaff4b55 3740
df2c07f4
JP
3741 ovs_header = ofpbuf_put_uninit(buf, sizeof *ovs_header);
3742 ovs_header->dp_ifindex = dp->dp_ifindex;
d6569377
BP
3743
3744 if (dp->name) {
df2c07f4 3745 nl_msg_put_string(buf, OVS_DP_ATTR_NAME, dp->name);
d6569377
BP
3746 }
3747
a24a6574
BP
3748 if (dp->upcall_pid) {
3749 nl_msg_put_u32(buf, OVS_DP_ATTR_UPCALL_PID, *dp->upcall_pid);
3750 }
b063d9f0 3751
b7fd5e38
TG
3752 if (dp->user_features) {
3753 nl_msg_put_u32(buf, OVS_DP_ATTR_USER_FEATURES, dp->user_features);
3754 }
3755
df2c07f4 3756 /* Skip OVS_DP_ATTR_STATS since we never have a reason to serialize it. */
d6569377
BP
3757}
3758
3759/* Clears 'dp' to "empty" values. */
d3d8f1f7 3760static void
93451a0a 3761dpif_netlink_dp_init(struct dpif_netlink_dp *dp)
d6569377
BP
3762{
3763 memset(dp, 0, sizeof *dp);
d6569377
BP
3764}
3765
aaff4b55 3766static void
93451a0a 3767dpif_netlink_dp_dump_start(struct nl_dump *dump)
aaff4b55 3768{
93451a0a 3769 struct dpif_netlink_dp request;
aaff4b55
BP
3770 struct ofpbuf *buf;
3771
93451a0a 3772 dpif_netlink_dp_init(&request);
df2c07f4 3773 request.cmd = OVS_DP_CMD_GET;
aaff4b55
BP
3774
3775 buf = ofpbuf_new(1024);
93451a0a 3776 dpif_netlink_dp_to_ofpbuf(&request, buf);
a88b4e04 3777 nl_dump_start(dump, NETLINK_GENERIC, buf);
aaff4b55
BP
3778 ofpbuf_delete(buf);
3779}
3780
d6569377
BP
3781/* Executes 'request' in the kernel datapath. If the command fails, returns a
3782 * positive errno value. Otherwise, if 'reply' and 'bufp' are null, returns 0
3783 * without doing anything else. If 'reply' and 'bufp' are nonnull, then the
aaff4b55
BP
3784 * result of the command is expected to be of the same form, which is decoded
3785 * and stored in '*reply' and '*bufp'. The caller must free '*bufp' when the
3786 * reply is no longer needed ('reply' will contain pointers into '*bufp'). */
d3d8f1f7 3787static int
93451a0a
AS
3788dpif_netlink_dp_transact(const struct dpif_netlink_dp *request,
3789 struct dpif_netlink_dp *reply, struct ofpbuf **bufp)
d6569377 3790{
aaff4b55 3791 struct ofpbuf *request_buf;
d6569377 3792 int error;
d6569377 3793
cb22974d 3794 ovs_assert((reply != NULL) == (bufp != NULL));
d6569377 3795
aaff4b55 3796 request_buf = ofpbuf_new(1024);
93451a0a 3797 dpif_netlink_dp_to_ofpbuf(request, request_buf);
a88b4e04 3798 error = nl_transact(NETLINK_GENERIC, request_buf, bufp);
aaff4b55 3799 ofpbuf_delete(request_buf);
d6569377 3800
aaff4b55 3801 if (reply) {
93451a0a 3802 dpif_netlink_dp_init(reply);
aaff4b55 3803 if (!error) {
93451a0a 3804 error = dpif_netlink_dp_from_ofpbuf(reply, *bufp);
aaff4b55 3805 }
d6569377 3806 if (error) {
aaff4b55
BP
3807 ofpbuf_delete(*bufp);
3808 *bufp = NULL;
d6569377 3809 }
d6569377
BP
3810 }
3811 return error;
3812}
3813
3814/* Obtains information about 'dpif_' and stores it into '*reply' and '*bufp'.
3815 * The caller must free '*bufp' when the reply is no longer needed ('reply'
3816 * will contain pointers into '*bufp'). */
d3d8f1f7 3817static int
93451a0a
AS
3818dpif_netlink_dp_get(const struct dpif *dpif_, struct dpif_netlink_dp *reply,
3819 struct ofpbuf **bufp)
d6569377 3820{
93451a0a
AS
3821 struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
3822 struct dpif_netlink_dp request;
d6569377 3823
93451a0a 3824 dpif_netlink_dp_init(&request);
df2c07f4 3825 request.cmd = OVS_DP_CMD_GET;
254f2dc8 3826 request.dp_ifindex = dpif->dp_ifindex;
d6569377 3827
93451a0a 3828 return dpif_netlink_dp_transact(&request, reply, bufp);
d6569377 3829}
93451a0a 3830
df2c07f4 3831/* Parses the contents of 'buf', which contains a "struct ovs_header" followed
37a1300c 3832 * by Netlink attributes, into 'flow'. Returns 0 if successful, otherwise a
d6569377
BP
3833 * positive errno value.
3834 *
3835 * 'flow' will contain pointers into 'buf', so the caller should not free 'buf'
3836 * while 'flow' is still in use. */
3837static int
93451a0a
AS
3838dpif_netlink_flow_from_ofpbuf(struct dpif_netlink_flow *flow,
3839 const struct ofpbuf *buf)
d6569377 3840{
70e5ed6f
JS
3841 static const struct nl_policy ovs_flow_policy[__OVS_FLOW_ATTR_MAX] = {
3842 [OVS_FLOW_ATTR_KEY] = { .type = NL_A_NESTED, .optional = true },
e6cc0bab 3843 [OVS_FLOW_ATTR_MASK] = { .type = NL_A_NESTED, .optional = true },
df2c07f4 3844 [OVS_FLOW_ATTR_ACTIONS] = { .type = NL_A_NESTED, .optional = true },
f7df9823 3845 [OVS_FLOW_ATTR_STATS] = { NL_POLICY_FOR(struct ovs_flow_stats),
d6569377 3846 .optional = true },
df2c07f4
JP
3847 [OVS_FLOW_ATTR_TCP_FLAGS] = { .type = NL_A_U8, .optional = true },
3848 [OVS_FLOW_ATTR_USED] = { .type = NL_A_U64, .optional = true },
ab79d262 3849 [OVS_FLOW_ATTR_UFID] = { .type = NL_A_U128, .optional = true },
df2c07f4 3850 /* The kernel never uses OVS_FLOW_ATTR_CLEAR. */
43f9ac0a 3851 /* The kernel never uses OVS_FLOW_ATTR_PROBE. */
70e5ed6f 3852 /* The kernel never uses OVS_FLOW_ATTR_UFID_FLAGS. */
d6569377
BP
3853 };
3854
93451a0a 3855 dpif_netlink_flow_init(flow);
d6569377 3856
0a2869d5
BP
3857 struct ofpbuf b = ofpbuf_const_initializer(buf->data, buf->size);
3858 struct nlmsghdr *nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
3859 struct genlmsghdr *genl = ofpbuf_try_pull(&b, sizeof *genl);
3860 struct ovs_header *ovs_header = ofpbuf_try_pull(&b, sizeof *ovs_header);
3861
3862 struct nlattr *a[ARRAY_SIZE(ovs_flow_policy)];
df2c07f4
JP
3863 if (!nlmsg || !genl || !ovs_header
3864 || nlmsg->nlmsg_type != ovs_flow_family
3865 || !nl_policy_parse(&b, 0, ovs_flow_policy, a,
3866 ARRAY_SIZE(ovs_flow_policy))) {
d6569377
BP
3867 return EINVAL;
3868 }
70e5ed6f
JS
3869 if (!a[OVS_FLOW_ATTR_KEY] && !a[OVS_FLOW_ATTR_UFID]) {
3870 return EINVAL;
3871 }
d6569377 3872
37a1300c 3873 flow->nlmsg_flags = nlmsg->nlmsg_flags;
df2c07f4 3874 flow->dp_ifindex = ovs_header->dp_ifindex;
70e5ed6f
JS
3875 if (a[OVS_FLOW_ATTR_KEY]) {
3876 flow->key = nl_attr_get(a[OVS_FLOW_ATTR_KEY]);
3877 flow->key_len = nl_attr_get_size(a[OVS_FLOW_ATTR_KEY]);
3878 }
e6cc0bab 3879
70e5ed6f 3880 if (a[OVS_FLOW_ATTR_UFID]) {
ab79d262 3881 flow->ufid = nl_attr_get_u128(a[OVS_FLOW_ATTR_UFID]);
70e5ed6f
JS
3882 flow->ufid_present = true;
3883 }
e6cc0bab
AZ
3884 if (a[OVS_FLOW_ATTR_MASK]) {
3885 flow->mask = nl_attr_get(a[OVS_FLOW_ATTR_MASK]);
3886 flow->mask_len = nl_attr_get_size(a[OVS_FLOW_ATTR_MASK]);
3887 }
df2c07f4
JP
3888 if (a[OVS_FLOW_ATTR_ACTIONS]) {
3889 flow->actions = nl_attr_get(a[OVS_FLOW_ATTR_ACTIONS]);
3890 flow->actions_len = nl_attr_get_size(a[OVS_FLOW_ATTR_ACTIONS]);
d6569377 3891 }
df2c07f4
JP
3892 if (a[OVS_FLOW_ATTR_STATS]) {
3893 flow->stats = nl_attr_get(a[OVS_FLOW_ATTR_STATS]);
d6569377 3894 }
df2c07f4
JP
3895 if (a[OVS_FLOW_ATTR_TCP_FLAGS]) {
3896 flow->tcp_flags = nl_attr_get(a[OVS_FLOW_ATTR_TCP_FLAGS]);
d6569377 3897 }
df2c07f4
JP
3898 if (a[OVS_FLOW_ATTR_USED]) {
3899 flow->used = nl_attr_get(a[OVS_FLOW_ATTR_USED]);
9e980142 3900 }
d6569377
BP
3901 return 0;
3902}
3903
beb75a40
JS
3904
3905/*
a8a3eee4
JS
3906 * If PACKET_TYPE attribute is present in 'data', it filters PACKET_TYPE out.
3907 * If the flow is not Ethernet, the OVS_KEY_ATTR_PACKET_TYPE is converted to
3908 * OVS_KEY_ATTR_ETHERTYPE. Puts 'data' to 'buf'.
beb75a40
JS
3909 */
3910static void
3911put_exclude_packet_type(struct ofpbuf *buf, uint16_t type,
3912 const struct nlattr *data, uint16_t data_len)
3913{
3914 const struct nlattr *packet_type;
3915
3916 packet_type = nl_attr_find__(data, data_len, OVS_KEY_ATTR_PACKET_TYPE);
3917
3918 if (packet_type) {
3919 /* exclude PACKET_TYPE Netlink attribute. */
3920 ovs_assert(NLA_ALIGN(packet_type->nla_len) == NL_A_U32_SIZE);
3921 size_t packet_type_len = NL_A_U32_SIZE;
3922 size_t first_chunk_size = (uint8_t *)packet_type - (uint8_t *)data;
3923 size_t second_chunk_size = data_len - first_chunk_size
3924 - packet_type_len;
beb75a40 3925 struct nlattr *next_attr = nl_attr_next(packet_type);
1ca5b61b 3926 size_t ofs;
beb75a40 3927
1ca5b61b
JS
3928 ofs = nl_msg_start_nested(buf, type);
3929 nl_msg_put(buf, data, first_chunk_size);
3930 nl_msg_put(buf, next_attr, second_chunk_size);
a8a3eee4
JS
3931 if (!nl_attr_find__(data, data_len, OVS_KEY_ATTR_ETHERNET)) {
3932 ovs_be16 pt = pt_ns_type_be(nl_attr_get_be32(packet_type));
3933 const struct nlattr *nla;
3934
7c5793e6 3935 nla = nl_attr_find(buf, ofs + NLA_HDRLEN, OVS_KEY_ATTR_ETHERTYPE);
a8a3eee4
JS
3936 if (nla) {
3937 ovs_be16 *ethertype;
3938
3939 ethertype = CONST_CAST(ovs_be16 *, nl_attr_get(nla));
3940 *ethertype = pt;
3941 } else {
3942 nl_msg_put_be16(buf, OVS_KEY_ATTR_ETHERTYPE, pt);
3943 }
3944 }
1ca5b61b 3945 nl_msg_end_nested(buf, ofs);
beb75a40
JS
3946 } else {
3947 nl_msg_put_unspec(buf, type, data, data_len);
3948 }
3949}
3950
df2c07f4 3951/* Appends to 'buf' (which must initially be empty) a "struct ovs_header"
d6569377
BP
3952 * followed by Netlink attributes corresponding to 'flow'. */
3953static void
93451a0a
AS
3954dpif_netlink_flow_to_ofpbuf(const struct dpif_netlink_flow *flow,
3955 struct ofpbuf *buf)
d6569377 3956{
df2c07f4 3957 struct ovs_header *ovs_header;
d6569377 3958
df2c07f4 3959 nl_msg_put_genlmsghdr(buf, 0, ovs_flow_family,
30b44744 3960 NLM_F_REQUEST | flow->nlmsg_flags,
69685a88 3961 flow->cmd, OVS_FLOW_VERSION);
37a1300c 3962
df2c07f4
JP
3963 ovs_header = ofpbuf_put_uninit(buf, sizeof *ovs_header);
3964 ovs_header->dp_ifindex = flow->dp_ifindex;
d6569377 3965
70e5ed6f 3966 if (flow->ufid_present) {
ab79d262 3967 nl_msg_put_u128(buf, OVS_FLOW_ATTR_UFID, flow->ufid);
70e5ed6f
JS
3968 }
3969 if (flow->ufid_terse) {
3970 nl_msg_put_u32(buf, OVS_FLOW_ATTR_UFID_FLAGS,
3971 OVS_UFID_F_OMIT_KEY | OVS_UFID_F_OMIT_MASK
3972 | OVS_UFID_F_OMIT_ACTIONS);
3973 }
64bb477f
JS
3974 if (!flow->ufid_terse || !flow->ufid_present) {
3975 if (flow->key_len) {
beb75a40
JS
3976 put_exclude_packet_type(buf, OVS_FLOW_ATTR_KEY, flow->key,
3977 flow->key_len);
64bb477f 3978 }
64bb477f 3979 if (flow->mask_len) {
beb75a40
JS
3980 put_exclude_packet_type(buf, OVS_FLOW_ATTR_MASK, flow->mask,
3981 flow->mask_len);
64bb477f
JS
3982 }
3983 if (flow->actions || flow->actions_len) {
3984 nl_msg_put_unspec(buf, OVS_FLOW_ATTR_ACTIONS,
3985 flow->actions, flow->actions_len);
3986 }
d6569377
BP
3987 }
3988
3989 /* We never need to send these to the kernel. */
cb22974d
BP
3990 ovs_assert(!flow->stats);
3991 ovs_assert(!flow->tcp_flags);
3992 ovs_assert(!flow->used);
d6569377
BP
3993
3994 if (flow->clear) {
df2c07f4 3995 nl_msg_put_flag(buf, OVS_FLOW_ATTR_CLEAR);
d6569377 3996 }
43f9ac0a
JR
3997 if (flow->probe) {
3998 nl_msg_put_flag(buf, OVS_FLOW_ATTR_PROBE);
3999 }
d6569377
BP
4000}
4001
4002/* Clears 'flow' to "empty" values. */
d3d8f1f7 4003static void
93451a0a 4004dpif_netlink_flow_init(struct dpif_netlink_flow *flow)
d6569377
BP
4005{
4006 memset(flow, 0, sizeof *flow);
4007}
4008
4009/* Executes 'request' in the kernel datapath. If the command fails, returns a
4010 * positive errno value. Otherwise, if 'reply' and 'bufp' are null, returns 0
4011 * without doing anything else. If 'reply' and 'bufp' are nonnull, then the
37a1300c
BP
4012 * result of the command is expected to be a flow also, which is decoded and
4013 * stored in '*reply' and '*bufp'. The caller must free '*bufp' when the reply
4014 * is no longer needed ('reply' will contain pointers into '*bufp'). */
d3d8f1f7 4015static int
93451a0a
AS
4016dpif_netlink_flow_transact(struct dpif_netlink_flow *request,
4017 struct dpif_netlink_flow *reply,
4018 struct ofpbuf **bufp)
d6569377 4019{
37a1300c 4020 struct ofpbuf *request_buf;
d6569377 4021 int error;
d6569377 4022
cb22974d 4023 ovs_assert((reply != NULL) == (bufp != NULL));
d6569377 4024
30b44744
BP
4025 if (reply) {
4026 request->nlmsg_flags |= NLM_F_ECHO;
4027 }
4028
37a1300c 4029 request_buf = ofpbuf_new(1024);
93451a0a 4030 dpif_netlink_flow_to_ofpbuf(request, request_buf);
a88b4e04 4031 error = nl_transact(NETLINK_GENERIC, request_buf, bufp);
37a1300c 4032 ofpbuf_delete(request_buf);
d6569377 4033
37a1300c
BP
4034 if (reply) {
4035 if (!error) {
93451a0a 4036 error = dpif_netlink_flow_from_ofpbuf(reply, *bufp);
37a1300c 4037 }
d6569377 4038 if (error) {
93451a0a 4039 dpif_netlink_flow_init(reply);
37a1300c
BP
4040 ofpbuf_delete(*bufp);
4041 *bufp = NULL;
d6569377 4042 }
d6569377
BP
4043 }
4044 return error;
4045}
4046
4047static void
93451a0a
AS
4048dpif_netlink_flow_get_stats(const struct dpif_netlink_flow *flow,
4049 struct dpif_flow_stats *stats)
d6569377
BP
4050{
4051 if (flow->stats) {
6a54dedc
BP
4052 stats->n_packets = get_32aligned_u64(&flow->stats->n_packets);
4053 stats->n_bytes = get_32aligned_u64(&flow->stats->n_bytes);
d6569377
BP
4054 } else {
4055 stats->n_packets = 0;
4056 stats->n_bytes = 0;
4057 }
0e70cdcb 4058 stats->used = flow->used ? get_32aligned_u64(flow->used) : 0;
d6569377
BP
4059 stats->tcp_flags = flow->tcp_flags ? *flow->tcp_flags : 0;
4060}
e0467f6d 4061
14b4d2f9
BP
4062/* Logs information about a packet that was recently lost in 'ch' (in
4063 * 'dpif_'). */
4064static void
93451a0a 4065report_loss(struct dpif_netlink *dpif, struct dpif_channel *ch, uint32_t ch_idx,
1579cf67 4066 uint32_t handler_id)
14b4d2f9 4067{
14b4d2f9 4068 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
14b4d2f9
BP
4069 struct ds s;
4070
8d675c5a 4071 if (VLOG_DROP_WARN(&rl)) {
14b4d2f9
BP
4072 return;
4073 }
4074
4075 ds_init(&s);
4076 if (ch->last_poll != LLONG_MIN) {
4077 ds_put_format(&s, " (last polled %lld ms ago)",
4078 time_msec() - ch->last_poll);
4079 }
14b4d2f9 4080
1579cf67 4081 VLOG_WARN("%s: lost packet on port channel %u of handler %u",
9b00386b 4082 dpif_name(&dpif->dpif), ch_idx, handler_id);
14b4d2f9
BP
4083 ds_destroy(&s);
4084}