/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>

#include "netdev-linux.h"
#include "netdev-linux-private.h"

#include <errno.h>
#include <fcntl.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <inttypes.h>
#include <math.h>
#include <linux/filter.h>
#include <linux/gen_stats.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/if_tun.h>
#include <linux/types.h>
#include <linux/ethtool.h>
#include <linux/mii.h>
#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <linux/virtio_net.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/utsname.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/route.h>
#include <poll.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "coverage.h"
#include "dp-packet.h"
#include "dpif-netlink.h"
#include "dpif-netdev.h"
#include "openvswitch/dynamic-string.h"
#include "fatal-signal.h"
#include "hash.h"
#include "openvswitch/hmap.h"
#include "netdev-afxdp.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "netlink-notifier.h"
#include "netlink-socket.h"
#include "netlink.h"
#include "netnsid.h"
#include "openvswitch/ofpbuf.h"
#include "openflow/openflow.h"
#include "ovs-atomic.h"
#include "ovs-numa.h"
#include "packets.h"
#include "openvswitch/poll-loop.h"
#include "rtnetlink.h"
#include "openvswitch/shash.h"
#include "socket-util.h"
#include "sset.h"
#include "tc.h"
#include "timer.h"
#include "unaligned.h"
#include "openvswitch/vlog.h"
#include "userspace-tso.h"
#include "util.h"

VLOG_DEFINE_THIS_MODULE(netdev_linux);

COVERAGE_DEFINE(netdev_set_policing);
COVERAGE_DEFINE(netdev_arp_lookup);
COVERAGE_DEFINE(netdev_get_ifindex);
COVERAGE_DEFINE(netdev_get_hwaddr);
COVERAGE_DEFINE(netdev_set_hwaddr);
COVERAGE_DEFINE(netdev_get_ethtool);
COVERAGE_DEFINE(netdev_set_ethtool);

\f
#ifndef IFLA_IF_NETNSID
#define IFLA_IF_NETNSID 0x45
#endif
/* These were introduced in Linux 2.6.14, so they might be missing if we have
 * old headers. */
#ifndef ADVERTISED_Pause
#define ADVERTISED_Pause (1 << 13)
#endif
#ifndef ADVERTISED_Asym_Pause
#define ADVERTISED_Asym_Pause (1 << 14)
#endif

/* These were introduced in Linux 2.6.24, so they might be missing if we
 * have old headers. */
#ifndef ETHTOOL_GFLAGS
#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
#endif
#ifndef ETHTOOL_SFLAGS
#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
#endif

/* This was introduced in Linux 2.6.25, so it might be missing if we have old
 * headers. */
#ifndef TC_RTAB_SIZE
#define TC_RTAB_SIZE 1024
#endif

/* Linux 2.6.21 introduced struct tpacket_auxdata.
 * Linux 2.6.27 added the tp_vlan_tci member.
 * Linux 3.0 defined TP_STATUS_VLAN_VALID.
 * Linux 3.13 repurposed a padding member for tp_vlan_tpid and defined
 * TP_STATUS_VLAN_TPID_VALID.
 *
 * With all this churn it's easiest to unconditionally define a replacement
 * structure that has everything we want.
 */
#ifndef PACKET_AUXDATA
#define PACKET_AUXDATA 8
#endif
#ifndef TP_STATUS_VLAN_VALID
#define TP_STATUS_VLAN_VALID (1 << 4)
#endif
#ifndef TP_STATUS_VLAN_TPID_VALID
#define TP_STATUS_VLAN_TPID_VALID (1 << 6)
#endif
#undef tpacket_auxdata
#define tpacket_auxdata rpl_tpacket_auxdata
struct tpacket_auxdata {
    uint32_t tp_status;
    uint32_t tp_len;
    uint32_t tp_snaplen;
    uint16_t tp_mac;
    uint16_t tp_net;
    uint16_t tp_vlan_tci;
    uint16_t tp_vlan_tpid;
};

/* Linux 2.6.27 introduced ethtool_cmd_speed
 *
 * To avoid revisiting problems reported with using configure to detect
 * compatibility (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html)
 * unconditionally replace ethtool_cmd_speed. */
#define ethtool_cmd_speed rpl_ethtool_cmd_speed
static inline uint32_t rpl_ethtool_cmd_speed(const struct ethtool_cmd *ep)
{
    return ep->speed | (ep->speed_hi << 16);
}

/* Linux 2.6.30 introduced supported and advertised flags for
 * 1G base KX, and 10G base KX4, KR and R. */
#ifndef SUPPORTED_1000baseKX_Full
#define SUPPORTED_1000baseKX_Full (1 << 17)
#define SUPPORTED_10000baseKX4_Full (1 << 18)
#define SUPPORTED_10000baseKR_Full (1 << 19)
#define SUPPORTED_10000baseR_FEC (1 << 20)
#define ADVERTISED_1000baseKX_Full (1 << 17)
#define ADVERTISED_10000baseKX4_Full (1 << 18)
#define ADVERTISED_10000baseKR_Full (1 << 19)
#define ADVERTISED_10000baseR_FEC (1 << 20)
#endif

/* Linux 3.5 introduced supported and advertised flags for
 * 40G base KR4, CR4, SR4 and LR4. */
#ifndef SUPPORTED_40000baseKR4_Full
#define SUPPORTED_40000baseKR4_Full (1 << 23)
#define SUPPORTED_40000baseCR4_Full (1 << 24)
#define SUPPORTED_40000baseSR4_Full (1 << 25)
#define SUPPORTED_40000baseLR4_Full (1 << 26)
#define ADVERTISED_40000baseKR4_Full (1 << 23)
#define ADVERTISED_40000baseCR4_Full (1 << 24)
#define ADVERTISED_40000baseSR4_Full (1 << 25)
#define ADVERTISED_40000baseLR4_Full (1 << 26)
#endif

/* Linux 2.6.35 introduced IFLA_STATS64 and rtnl_link_stats64.
 *
 * Tests for rtnl_link_stats64 don't seem to consistently work, e.g. on
 * 2.6.32-431.29.2.el6.x86_64 (see report at
 * https://mail.openvswitch.org/pipermail/ovs-dev/2014-October/291521.html).
 * Maybe if_link.h is not self-contained on those kernels.  It is easiest to
 * unconditionally define a replacement. */
#ifndef IFLA_STATS64
#define IFLA_STATS64 23
#endif
#define rtnl_link_stats64 rpl_rtnl_link_stats64
struct rtnl_link_stats64 {
    uint64_t rx_packets;
    uint64_t tx_packets;
    uint64_t rx_bytes;
    uint64_t tx_bytes;
    uint64_t rx_errors;
    uint64_t tx_errors;
    uint64_t rx_dropped;
    uint64_t tx_dropped;
    uint64_t multicast;
    uint64_t collisions;

    uint64_t rx_length_errors;
    uint64_t rx_over_errors;
    uint64_t rx_crc_errors;
    uint64_t rx_frame_errors;
    uint64_t rx_fifo_errors;
    uint64_t rx_missed_errors;

    uint64_t tx_aborted_errors;
    uint64_t tx_carrier_errors;
    uint64_t tx_fifo_errors;
    uint64_t tx_heartbeat_errors;
    uint64_t tx_window_errors;

    uint64_t rx_compressed;
    uint64_t tx_compressed;
};

/* Linux 3.19 introduced virtio_types.h.  It might be missing
 * if we are using an old kernel. */
#ifndef HAVE_VIRTIO_TYPES
typedef __u16 __bitwise__ __virtio16;
typedef __u32 __bitwise__ __virtio32;
typedef __u64 __bitwise__ __virtio64;
#endif

enum {
    VALID_IFINDEX           = 1 << 0,
    VALID_ETHERADDR         = 1 << 1,
    VALID_IN                = 1 << 2,
    VALID_MTU               = 1 << 3,
    VALID_POLICING          = 1 << 4,
    VALID_VPORT_STAT_ERROR  = 1 << 5,
    VALID_DRVINFO           = 1 << 6,
    VALID_FEATURES          = 1 << 7,
    VALID_NUMA_ID           = 1 << 8,
};

/* Use one for the packet buffer and another for the aux buffer to receive
 * TSO packets. */
#define IOV_STD_SIZE 1
#define IOV_TSO_SIZE 2

enum {
    IOV_PACKET = 0,
    IOV_AUXBUF = 1,
};
\f
struct linux_lag_member {
    uint32_t block_id;
    struct shash_node *node;
};

/* Protects 'lag_shash' and the mutable members of struct linux_lag_member. */
static struct ovs_mutex lag_mutex = OVS_MUTEX_INITIALIZER;

/* All members whose LAG primary interfaces are OVS network devices. */
static struct shash lag_shash OVS_GUARDED_BY(lag_mutex)
    = SHASH_INITIALIZER(&lag_shash);

/* Traffic control. */

/* An instance of a traffic control class.  Always associated with a particular
 * network device.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc {
    const struct tc_ops *ops;
    struct hmap queues;         /* Contains "struct tc_queue"s.
                                 * Read by generic TC layer.
                                 * Written only by TC implementation. */
};

#define TC_INITIALIZER(TC, OPS) { OPS, HMAP_INITIALIZER(&(TC)->queues) }

/* One traffic control queue.
 *
 * Each TC implementation subclasses this with whatever additional data it
 * needs. */
struct tc_queue {
    struct hmap_node hmap_node; /* In struct tc's "queues" hmap. */
    unsigned int queue_id;      /* OpenFlow queue ID. */
    long long int created;      /* Time queue was created, in msecs. */
};

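/* For illustration only (a hedged sketch, not used by the build): a TC
 * implementation that keeps no extra state could statically initialize its
 * base object with TC_INITIALIZER, e.g.
 *
 *     static struct tc some_tc = TC_INITIALIZER(&some_tc, &tc_ops_noop);
 *
 * where 'some_tc' is a hypothetical name.  Implementations that carry extra
 * per-qdisc state instead allocate their subclass struct and call tc_init()
 * on its embedded 'struct tc'. */
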
/* A particular kind of traffic control.  Each implementation generally maps to
 * one particular Linux qdisc class.
 *
 * The functions below return 0 if successful or a positive errno value on
 * failure, except where otherwise noted.  All of them must be provided, except
 * where otherwise noted. */
struct tc_ops {
    /* Name used by kernel in the TCA_KIND attribute of tcmsg, e.g. "htb".
     * This is null for tc_ops_default and tc_ops_other, for which there are no
     * appropriate values. */
    const char *linux_name;

    /* Name used in OVS database, e.g. "linux-htb".  Must be nonnull. */
    const char *ovs_name;

    /* Number of supported OpenFlow queues, 0 for qdiscs that have no
     * queues.  The queues are numbered 0 through n_queues - 1. */
    unsigned int n_queues;

    /* Called to install this TC class on 'netdev'.  The implementation should
     * make the Netlink calls required to set up 'netdev' with the right qdisc
     * and configure it according to 'details'.  The implementation may assume
     * that the current qdisc is the default; that is, there is no need for it
     * to delete the current qdisc before installing itself.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'.
     *
     * (This function is null for tc_ops_other, which cannot be installed.  For
     * other TC classes it should always be nonnull.) */
    int (*tc_install)(struct netdev *netdev, const struct smap *details);

    /* Called when the netdev code determines (through a Netlink query) that
     * this TC class's qdisc is installed on 'netdev', but we didn't install
     * it ourselves and so don't know any of the details.
     *
     * 'nlmsg' is the kernel reply to a RTM_GETQDISC Netlink message for
     * 'netdev'.  The TCA_KIND attribute of 'nlmsg' is 'linux_name'.  The
     * implementation should parse the other attributes of 'nlmsg' as
     * necessary to determine its configuration.  If necessary it should also
     * use Netlink queries to determine the configuration of queues on
     * 'netdev'.
     *
     * This function must return 0 if and only if it sets 'netdev->tc' to an
     * initialized 'struct tc'. */
    int (*tc_load)(struct netdev *netdev, struct ofpbuf *nlmsg);

    /* Destroys the data structures allocated by the implementation as part of
     * 'tc'.  (This includes destroying 'tc->queues' by calling
     * tc_destroy(tc).)
     *
     * The implementation should not need to perform any Netlink calls.  If
     * desirable, the caller is responsible for deconfiguring the kernel qdisc.
     * (But it may not be desirable.)
     *
     * This function may be null if 'tc' is trivial. */
    void (*tc_destroy)(struct tc *tc);

    /* Retrieves details of 'netdev->tc' configuration into 'details'.
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the configuration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_get)(const struct netdev *netdev, struct smap *details);

    /* Reconfigures 'netdev->tc' according to 'details', performing any
     * required Netlink calls to complete the reconfiguration.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "QoS" table in vswitchd/vswitch.xml
     * (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' is not configurable.
     */
    int (*qdisc_set)(struct netdev *, const struct smap *details);

    /* Retrieves details of 'queue' on 'netdev->tc' into 'details'.  'queue' is
     * one of the 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * The implementation should not need to perform any Netlink calls, because
     * the 'tc_install' or 'tc_load' that instantiated 'netdev->tc' should have
     * cached the queue configuration.
     *
     * This function may be null if 'tc' does not have queues ('n_queues' is
     * 0). */
    int (*class_get)(const struct netdev *netdev, const struct tc_queue *queue,
                     struct smap *details);

    /* Configures or reconfigures 'queue_id' on 'netdev->tc' according to
     * 'details', performing any required Netlink calls to complete the
     * reconfiguration.  The caller ensures that 'queue_id' is less than
     * 'n_queues'.
     *
     * The contents of 'details' should be documented as valid for 'ovs_name'
     * in the "other_config" column in the "Queue" table in
     * vswitchd/vswitch.xml (which is built as ovs-vswitchd.conf.db(8)).
     *
     * This function may be null if 'tc' does not have queues or its queues are
     * not configurable. */
    int (*class_set)(struct netdev *, unsigned int queue_id,
                     const struct smap *details);

    /* Deletes 'queue' from 'netdev->tc'.  'queue' is one of the 'struct
     * tc_queue's within 'netdev->tc->queues'.
     *
     * This function may be null if 'tc' does not have queues or its queues
     * cannot be deleted. */
    int (*class_delete)(struct netdev *, struct tc_queue *queue);

    /* Obtains stats for 'queue' from 'netdev->tc'.  'queue' is one of the
     * 'struct tc_queue's within 'netdev->tc->queues'.
     *
     * On success, initializes '*stats'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_get_stats)(const struct netdev *netdev,
                           const struct tc_queue *queue,
                           struct netdev_queue_stats *stats);

    /* Extracts queue stats from 'nlmsg', which is a response to a
     * RTM_GETTCLASS message, and passes them to 'cb' along with 'aux'.
     *
     * This function may be null if 'tc' does not have queues or if it cannot
     * report queue statistics. */
    int (*class_dump_stats)(const struct netdev *netdev,
                            const struct ofpbuf *nlmsg,
                            netdev_dump_queue_stats_cb *cb, void *aux);
};
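
/* Hedged sketch, for illustration only: a minimal tc_ops for a qdisc without
 * configurable queues might be wired up roughly as below.  The callback names
 * foo_tc_install() and foo_tc_load() are hypothetical; the real
 * implementations (HTB, HFSC, codel, ...) appear later in this file.
 *
 *     static const struct tc_ops tc_ops_foo = {
 *         .linux_name = "foo",
 *         .ovs_name = "linux-foo",
 *         .n_queues = 0,
 *         .tc_install = foo_tc_install,
 *         .tc_load = foo_tc_load,
 *     };
 */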

static void
tc_init(struct tc *tc, const struct tc_ops *ops)
{
    tc->ops = ops;
    hmap_init(&tc->queues);
}

static void
tc_destroy(struct tc *tc)
{
    hmap_destroy(&tc->queues);
}

static const struct tc_ops tc_ops_htb;
static const struct tc_ops tc_ops_hfsc;
static const struct tc_ops tc_ops_codel;
static const struct tc_ops tc_ops_fqcodel;
static const struct tc_ops tc_ops_sfq;
static const struct tc_ops tc_ops_netem;
static const struct tc_ops tc_ops_default;
static const struct tc_ops tc_ops_noop;
static const struct tc_ops tc_ops_other;

static const struct tc_ops *const tcs[] = {
    &tc_ops_htb,                /* Hierarchy token bucket (see tc-htb(8)). */
    &tc_ops_hfsc,               /* Hierarchical fair service curve. */
    &tc_ops_codel,              /* Controlled delay */
    &tc_ops_fqcodel,            /* Fair queue controlled delay */
    &tc_ops_sfq,                /* Stochastic fair queueing */
    &tc_ops_netem,              /* Network Emulator */
    &tc_ops_noop,               /* Non operating qos type. */
    &tc_ops_default,            /* Default qdisc (see tc-pfifo_fast(8)). */
    &tc_ops_other,              /* Some other qdisc. */
    NULL
};

static unsigned int tc_ticks_to_bytes(unsigned int rate, unsigned int ticks);
static unsigned int tc_bytes_to_ticks(unsigned int rate, unsigned int size);
static unsigned int tc_buffer_per_jiffy(unsigned int rate);
static uint32_t tc_time_to_ticks(uint32_t time);

static struct tcmsg *netdev_linux_tc_make_request(const struct netdev *,
                                                  int type,
                                                  unsigned int flags,
                                                  struct ofpbuf *);
static int tc_add_policer(struct netdev *,
                          uint32_t kbits_rate, uint32_t kbits_burst);

static int tc_parse_qdisc(const struct ofpbuf *, const char **kind,
                          struct nlattr **options);
static int tc_parse_class(const struct ofpbuf *, unsigned int *queue_id,
                          struct nlattr **options,
                          struct netdev_queue_stats *);
static int tc_query_class(const struct netdev *,
                          unsigned int handle, unsigned int parent,
                          struct ofpbuf **replyp);
static int tc_delete_class(const struct netdev *, unsigned int handle);

static int tc_del_qdisc(struct netdev *netdev);
static int tc_query_qdisc(const struct netdev *netdev);

void
tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate);
static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
\f

/* This is set pretty low because we probably won't learn anything from the
 * additional log messages. */
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);

/* Polling miimon status for all ports causes performance degradation when
 * handling a large number of ports. If there are no devices using miimon, then
 * we skip netdev_linux_miimon_run() and netdev_linux_miimon_wait().
 *
 * Readers do not depend on this variable synchronizing with the related
 * changes in the device miimon status, so we can use atomic_count. */
static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);

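/* Expected usage pattern (hedged; the enable path is not part of this
 * excerpt): the code that turns miimon on for a device increments the counter
 * with atomic_count_inc(&miimon_cnt), and netdev_linux_destruct() below
 * decrements it when a device with a nonzero miimon interval goes away, so
 * netdev_linux_miimon_enabled() stays a cheap check. */
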
static int netdev_linux_parse_vnet_hdr(struct dp_packet *b);
static void netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu);
static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
                                   int cmd, const char *cmd_name);
static int get_flags(const struct netdev *, unsigned int *flags);
static int set_flags(const char *, unsigned int flags);
static int update_flags(struct netdev_linux *netdev, enum netdev_flags off,
                        enum netdev_flags on, enum netdev_flags *old_flagsp)
    OVS_REQUIRES(netdev->mutex);
static int get_ifindex(const struct netdev *, int *ifindexp);
static int do_set_addr(struct netdev *netdev,
                       int ioctl_nr, const char *ioctl_name,
                       struct in_addr addr);
static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
static int set_etheraddr(const char *netdev_name, const struct eth_addr);
static int af_packet_sock(void);
static bool netdev_linux_miimon_enabled(void);
static void netdev_linux_miimon_run(void);
static void netdev_linux_miimon_wait(void);
static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);

static bool
is_tap_netdev(const struct netdev *netdev)
{
    return netdev_get_class(netdev) == &netdev_tap_class;
}
\f
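/* Network namespace id handling.  The helpers below lazily determine which
 * network namespace a device lives in and cache the result in
 * 'netdev->netnsid': tap devices are always treated as local, while other
 * devices are resolved through the openvswitch kernel API when available. */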
static int
netdev_linux_netnsid_update__(struct netdev_linux *netdev)
{
    struct dpif_netlink_vport reply;
    struct ofpbuf *buf;
    int error;

    error = dpif_netlink_vport_get(netdev_get_name(&netdev->up), &reply, &buf);
    if (error) {
        if (error == ENOENT) {
            /* Assume it is local if there is no API (e.g. if the openvswitch
             * kernel module is not loaded). */
            netnsid_set_local(&netdev->netnsid);
        } else {
            netnsid_unset(&netdev->netnsid);
        }
        return error;
    }

    netnsid_set(&netdev->netnsid, reply.netnsid);
    ofpbuf_delete(buf);
    return 0;
}

static int
netdev_linux_netnsid_update(struct netdev_linux *netdev)
{
    if (netnsid_is_unset(netdev->netnsid)) {
        if (netdev_get_class(&netdev->up) == &netdev_tap_class) {
            netnsid_set_local(&netdev->netnsid);
        } else {
            return netdev_linux_netnsid_update__(netdev);
        }
    }

    return 0;
}

static bool
netdev_linux_netnsid_is_eq(struct netdev_linux *netdev, int nsid)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_eq(netdev->netnsid, nsid);
}

static bool
netdev_linux_netnsid_is_remote(struct netdev_linux *netdev)
{
    netdev_linux_netnsid_update(netdev);
    return netnsid_is_remote(netdev->netnsid);
}

static int netdev_linux_update_via_netlink(struct netdev_linux *);
static void netdev_linux_update(struct netdev_linux *netdev, int,
                                const struct rtnetlink_change *)
    OVS_REQUIRES(netdev->mutex);
static void netdev_linux_changed(struct netdev_linux *netdev,
                                 unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(netdev->mutex);

/* Returns a NETLINK_ROUTE socket listening for RTNLGRP_LINK,
 * RTNLGRP_IPV4_IFADDR and RTNLGRP_IPV6_IFADDR changes, or NULL
 * if no such socket could be created. */
static struct nl_sock *
netdev_linux_notify_sock(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    static struct nl_sock *sock;
    unsigned int mcgroups[] = {RTNLGRP_LINK, RTNLGRP_IPV4_IFADDR,
                               RTNLGRP_IPV6_IFADDR, RTNLGRP_IPV6_IFINFO};

    if (ovsthread_once_start(&once)) {
        int error;

        error = nl_sock_create(NETLINK_ROUTE, &sock);
        if (!error) {
            size_t i;

            for (i = 0; i < ARRAY_SIZE(mcgroups); i++) {
                error = nl_sock_join_mcgroup(sock, mcgroups[i]);
                if (error) {
                    nl_sock_destroy(sock);
                    sock = NULL;
                    break;
                }
            }
        }
        nl_sock_listen_all_nsid(sock, true);
        ovsthread_once_done(&once);
    }

    return sock;
}

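/* The notification socket above is created once and shared for the lifetime
 * of the process: netdev_linux_run() below drains change messages from it and
 * netdev_linux_wait() registers it with the poll loop. */
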
static bool
netdev_linux_miimon_enabled(void)
{
    return atomic_count_get(&miimon_cnt) > 0;
}

static bool
netdev_linux_kind_is_lag(const char *kind)
{
    if (!strcmp(kind, "bond") || !strcmp(kind, "team")) {
        return true;
    }

    return false;
}

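/* Keeps the LAG state in 'lag_shash' in sync with an rtnetlink 'change':
 * when a bond/team member whose master is an OVS Linux netdev with a TC
 * ingress block appears, the member's ingress qdisc is bound to that block;
 * when the member leaves the LAG, the binding is removed again. */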
static void
netdev_linux_update_lag(struct rtnetlink_change *change)
    OVS_REQUIRES(lag_mutex)
{
    struct linux_lag_member *lag;

    if (change->sub && netdev_linux_kind_is_lag(change->sub)) {
        lag = shash_find_data(&lag_shash, change->ifname);

        if (!lag) {
            struct netdev *master_netdev;
            char master_name[IFNAMSIZ];
            uint32_t block_id;
            int error = 0;

            if_indextoname(change->master_ifindex, master_name);
            master_netdev = netdev_from_name(master_name);
            if (!master_netdev) {
                return;
            }

            if (is_netdev_linux_class(master_netdev->netdev_class)) {
                block_id = netdev_get_block_id(master_netdev);
                if (!block_id) {
                    netdev_close(master_netdev);
                    return;
                }

                lag = xmalloc(sizeof *lag);
                lag->block_id = block_id;
                lag->node = shash_add(&lag_shash, change->ifname, lag);

                /* Delete the ingress block in case it already exists. */
                tc_add_del_qdisc(change->if_index, false, 0, TC_INGRESS);
                /* The LAG master is a Linux netdev, so add the member to the
                 * same block. */
                error = tc_add_del_qdisc(change->if_index, true, block_id,
                                         TC_INGRESS);
                if (error) {
                    VLOG_WARN("failed to bind LAG member %s to "
                              "primary's block", change->ifname);
                    shash_delete(&lag_shash, lag->node);
                    free(lag);
                }
            }

            netdev_close(master_netdev);
        }
    } else if (change->master_ifindex == 0) {
        /* Check if this was a LAG member that has been removed. */
        lag = shash_find_data(&lag_shash, change->ifname);

        if (lag) {
            tc_add_del_qdisc(change->if_index, false, lag->block_id,
                             TC_INGRESS);
            shash_delete(&lag_shash, lag->node);
            free(lag);
        }
    }
}

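/* Performs periodic work for the Linux netdev class: runs miimon if enabled,
 * then drains the shared rtnetlink socket, applying each change message to
 * the corresponding cached netdev and to the LAG state.  On ENOBUFS (i.e.
 * lost notifications) it falls back to refreshing the flags of every Linux
 * netdev. */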
void
netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
{
    struct nl_sock *sock;
    int error;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_run();
    }

    sock = netdev_linux_notify_sock();
    if (!sock) {
        return;
    }

    do {
        uint64_t buf_stub[4096 / 8];
        int nsid;
        struct ofpbuf buf;

        ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
        error = nl_sock_recv(sock, &buf, &nsid, false);
        if (!error) {
            struct rtnetlink_change change;

            if (rtnetlink_parse(&buf, &change)) {
                struct netdev *netdev_ = NULL;
                char dev_name[IFNAMSIZ];

                if (!change.ifname) {
                    change.ifname = if_indextoname(change.if_index, dev_name);
                }

                if (change.ifname) {
                    netdev_ = netdev_from_name(change.ifname);
                }
                if (netdev_ && is_netdev_linux_class(netdev_->netdev_class)) {
                    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

                    ovs_mutex_lock(&netdev->mutex);
                    netdev_linux_update(netdev, nsid, &change);
                    ovs_mutex_unlock(&netdev->mutex);
                }

                if (change.ifname &&
                    rtnetlink_type_is_rtnlgrp_link(change.nlmsg_type)) {

                    /* Need to try updating the LAG information. */
                    ovs_mutex_lock(&lag_mutex);
                    netdev_linux_update_lag(&change);
                    ovs_mutex_unlock(&lag_mutex);
                }
                netdev_close(netdev_);
            }
        } else if (error == ENOBUFS) {
            struct shash device_shash;
            struct shash_node *node;

            nl_sock_drain(sock);

            shash_init(&device_shash);
            netdev_get_devices(&netdev_linux_class, &device_shash);
            SHASH_FOR_EACH (node, &device_shash) {
                struct netdev *netdev_ = node->data;
                struct netdev_linux *netdev = netdev_linux_cast(netdev_);
                unsigned int flags;

                ovs_mutex_lock(&netdev->mutex);
                get_flags(netdev_, &flags);
                netdev_linux_changed(netdev, flags, 0);
                ovs_mutex_unlock(&netdev->mutex);

                netdev_close(netdev_);
            }
            shash_destroy(&device_shash);
        } else if (error != EAGAIN) {
            static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
            VLOG_WARN_RL(&rll, "error reading or parsing netlink (%s)",
                         ovs_strerror(error));
        }
        ofpbuf_uninit(&buf);
    } while (!error);
}

static void
netdev_linux_wait(const struct netdev_class *netdev_class OVS_UNUSED)
{
    struct nl_sock *sock;

    if (netdev_linux_miimon_enabled()) {
        netdev_linux_miimon_wait();
    }
    sock = netdev_linux_notify_sock();
    if (sock) {
        nl_sock_wait(sock, POLLIN);
    }
}

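/* Records that 'dev' changed: bumps the netdev change_seq, counts a carrier
 * reset if IFF_RUNNING toggled, stores the new flags, and keeps only the
 * cached state selected by 'mask' (flushing the address cache when VALID_IN
 * is invalidated). */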
static void
netdev_linux_changed(struct netdev_linux *dev,
                     unsigned int ifi_flags, unsigned int mask)
    OVS_REQUIRES(dev->mutex)
{
    netdev_change_seq_changed(&dev->up);

    if ((dev->ifi_flags ^ ifi_flags) & IFF_RUNNING) {
        dev->carrier_resets++;
    }
    dev->ifi_flags = ifi_flags;

    dev->cache_valid &= mask;
    if (!(mask & VALID_IN)) {
        netdev_get_addrs_list_flush();
    }
}

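/* Applies a parsed rtnetlink 'change' to 'dev': RTM_NEWLINK messages refresh
 * the cached MTU, MAC address, LAG role and ifindex, link removal marks the
 * device as absent, and address-group messages just invalidate the cached
 * IPv4/IPv6 addresses. */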
static void
netdev_linux_update__(struct netdev_linux *dev,
                      const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
        if (change->nlmsg_type == RTM_NEWLINK) {
            /* Keep drv-info, ip addresses, and NUMA id. */
            netdev_linux_changed(dev, change->ifi_flags,
                                 VALID_DRVINFO | VALID_IN | VALID_NUMA_ID);

            /* Update netdev from rtnl-change msg. */
            if (change->mtu) {
                dev->mtu = change->mtu;
                dev->cache_valid |= VALID_MTU;
                dev->netdev_mtu_error = 0;
            }

            if (!eth_addr_is_zero(change->mac)) {
                dev->etheraddr = change->mac;
                dev->cache_valid |= VALID_ETHERADDR;
                dev->ether_addr_error = 0;

                /* The mac addr has been changed, report it now. */
                rtnetlink_report_link();
            }

            if (change->primary && netdev_linux_kind_is_lag(change->primary)) {
                dev->is_lag_master = true;
            }

            dev->ifindex = change->if_index;
            dev->cache_valid |= VALID_IFINDEX;
            dev->get_ifindex_error = 0;
            dev->present = true;
        } else {
            /* FIXME */
            netdev_linux_changed(dev, change->ifi_flags, 0);
            dev->present = false;
            netnsid_unset(&dev->netnsid);
        }
    } else if (rtnetlink_type_is_rtnlgrp_addr(change->nlmsg_type)) {
        /* Invalidates in4, in6. */
        netdev_linux_changed(dev, dev->ifi_flags, ~VALID_IN);
    } else {
        OVS_NOT_REACHED();
    }
}

static void
netdev_linux_update(struct netdev_linux *dev, int nsid,
                    const struct rtnetlink_change *change)
    OVS_REQUIRES(dev->mutex)
{
    if (netdev_linux_netnsid_is_eq(dev, nsid)) {
        netdev_linux_update__(dev, change);
    }
}

static struct netdev *
netdev_linux_alloc(void)
{
    struct netdev_linux *netdev = xzalloc(sizeof *netdev);
    return &netdev->up;
}

static int
netdev_linux_common_construct(struct netdev *netdev_)
{
    /* Prevent any attempt to create (or open) a network device named "default"
     * or "all".  These device names are effectively reserved on Linux because
     * /proc/sys/net/ipv4/conf/ always contains directories by these names.  By
     * itself this wouldn't call for any special treatment, but in practice if
     * a program tries to create devices with these names, it causes the kernel
     * to fire a "new device" notification event even though creation failed,
     * and in turn that causes OVS to wake up and try to create them again,
     * which ends up as a 100% CPU loop. */
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    const char *name = netdev_->name;
    if (!strcmp(name, "default") || !strcmp(name, "all")) {
        static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 1);
        VLOG_WARN_RL(&rll, "%s: Linux forbids network device with this name",
                     name);
        return EINVAL;
    }

    /* The device could be in the same network namespace or in another one. */
    netnsid_unset(&netdev->netnsid);
    ovs_mutex_init(&netdev->mutex);

    if (userspace_tso_enabled()) {
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
    }

    return 0;
}

/* Creates system and internal devices. */
int
netdev_linux_construct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error = netdev_linux_common_construct(netdev_);
    if (error) {
        return error;
    }

    error = get_flags(&netdev->up, &netdev->ifi_flags);
    if (error == ENODEV) {
        if (netdev->up.netdev_class != &netdev_internal_class) {
            /* The device does not exist, so don't allow it to be opened. */
            return ENODEV;
        } else {
            /* "Internal" netdevs have to be created as netdev objects before
             * they exist in the kernel, because creating them in the kernel
             * happens by passing a netdev object to dpif_port_add().
             * Therefore, ignore the error. */
        }
    }

    return 0;
}

/* For most types of netdevs we open the device for each call of
 * netdev_open().  However, this is not the case with tap devices,
 * since it is only possible to open the device once.  In this
 * situation we share a single file descriptor, and consequently
 * buffers, across all readers.  Therefore once data is read it will
 * be unavailable to other reads for tap devices. */
static int
netdev_linux_construct_tap(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    static const char tap_dev[] = "/dev/net/tun";
    const char *name = netdev_->name;
    struct ifreq ifr;

    int error = netdev_linux_common_construct(netdev_);
    if (error) {
        return error;
    }

    /* Open tap device. */
    netdev->tap_fd = open(tap_dev, O_RDWR);
    if (netdev->tap_fd < 0) {
        error = errno;
        VLOG_WARN("opening \"%s\" failed: %s", tap_dev, ovs_strerror(error));
        return error;
    }

    /* Create tap device. */
    get_flags(&netdev->up, &netdev->ifi_flags);
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
    if (userspace_tso_enabled()) {
        ifr.ifr_flags |= IFF_VNET_HDR;
    }

    ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
    if (ioctl(netdev->tap_fd, TUNSETIFF, &ifr) == -1) {
        VLOG_WARN("%s: creating tap device failed: %s", name,
                  ovs_strerror(errno));
        error = errno;
        goto error_close;
    }

    /* Make non-blocking. */
    error = set_nonblocking(netdev->tap_fd);
    if (error) {
        goto error_close;
    }

    if (ioctl(netdev->tap_fd, TUNSETPERSIST, 1)) {
        VLOG_WARN("%s: creating tap device failed (persist): %s", name,
                  ovs_strerror(errno));
        error = errno;
        goto error_close;
    }

    if (userspace_tso_enabled()) {
        /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is
         * available, it will return EINVAL when a flag is unknown.
         * Therefore, try enabling offload with no flags to check
         * if TUNSETOFFLOAD support is available or not. */
        if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, 0) == 0 || errno != EINVAL) {
            unsigned long oflags = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6;

            if (ioctl(netdev->tap_fd, TUNSETOFFLOAD, oflags) == -1) {
                VLOG_WARN("%s: enabling tap offloading failed: %s", name,
                          ovs_strerror(errno));
                error = errno;
                goto error_close;
            }
        }
    }

    netdev->present = true;
    return 0;

error_close:
    close(netdev->tap_fd);
    return error;
}

static void
netdev_linux_destruct(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);

    if (netdev->tc && netdev->tc->ops->tc_destroy) {
        netdev->tc->ops->tc_destroy(netdev->tc);
    }

    if (netdev_get_class(netdev_) == &netdev_tap_class
        && netdev->tap_fd >= 0)
    {
        ioctl(netdev->tap_fd, TUNSETPERSIST, 0);
        close(netdev->tap_fd);
    }

    if (netdev->miimon_interval > 0) {
        atomic_count_dec(&miimon_cnt);
    }

    ovs_mutex_destroy(&netdev->mutex);
}

static void
netdev_linux_dealloc(struct netdev *netdev_)
{
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    free(netdev);
}

static struct netdev_rxq *
netdev_linux_rxq_alloc(void)
{
    struct netdev_rxq_linux *rx = xzalloc(sizeof *rx);
    return &rx->up;
}

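/* Sets up the receive path for 'rxq_'.  Tap devices reuse the shared tap file
 * descriptor; for other devices a raw AF_PACKET socket is created, marked for
 * auxdata (and vnet headers when userspace TSO is enabled), made non-blocking,
 * bound to the device's ifindex and fitted with a classic BPF filter that
 * accepts only inbound packets. */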
static int
netdev_linux_rxq_construct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    struct netdev *netdev_ = rx->up.netdev;
    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
    int error;

    ovs_mutex_lock(&netdev->mutex);
    rx->is_tap = is_tap_netdev(netdev_);
    if (rx->is_tap) {
        rx->fd = netdev->tap_fd;
    } else {
        struct sockaddr_ll sll;
        int ifindex, val;
        /* Result of tcpdump -dd inbound */
        static const struct sock_filter filt[] = {
            { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */
            { 0x15, 0, 1, 0x00000004 }, /* jeq #4     jt 2  jf 3 */
            { 0x6, 0, 0, 0x00000000 },  /* ret #0 */
            { 0x6, 0, 0, 0x0000ffff }   /* ret #65535 */
        };
        static const struct sock_fprog fprog = {
            ARRAY_SIZE(filt), (struct sock_filter *) filt
        };

        /* Create file descriptor. */
        rx->fd = socket(PF_PACKET, SOCK_RAW, 0);
        if (rx->fd < 0) {
            error = errno;
            VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error));
            goto error;
        }

        val = 1;
        if (setsockopt(rx->fd, SOL_PACKET, PACKET_AUXDATA, &val, sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to mark socket for auxdata (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        if (userspace_tso_enabled()
            && setsockopt(rx->fd, SOL_PACKET, PACKET_VNET_HDR, &val,
                          sizeof val)) {
            error = errno;
            VLOG_ERR("%s: failed to enable vnet hdr in txq raw socket: %s",
                     netdev_get_name(netdev_), ovs_strerror(errno));
            goto error;
        }

        /* Set non-blocking mode. */
        error = set_nonblocking(rx->fd);
        if (error) {
            goto error;
        }

        /* Get ethernet device index. */
        error = get_ifindex(&netdev->up, &ifindex);
        if (error) {
            goto error;
        }

        /* Bind to specific ethernet device. */
        memset(&sll, 0, sizeof sll);
        sll.sll_family = AF_PACKET;
        sll.sll_ifindex = ifindex;
        sll.sll_protocol = htons(ETH_P_ALL);
        if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
            error = errno;
            VLOG_ERR("%s: failed to bind raw socket (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }

        /* Filter for only inbound packets. */
        error = setsockopt(rx->fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog,
                           sizeof fprog);
        if (error) {
            error = errno;
            VLOG_ERR("%s: failed to attach filter (%s)",
                     netdev_get_name(netdev_), ovs_strerror(error));
            goto error;
        }
    }
    ovs_mutex_unlock(&netdev->mutex);

    return 0;

error:
    if (rx->fd >= 0) {
        close(rx->fd);
    }
    ovs_mutex_unlock(&netdev->mutex);
    return error;
}

static void
netdev_linux_rxq_destruct(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
    int i;

    if (!rx->is_tap) {
        close(rx->fd);
    }

    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        dp_packet_delete(rx->aux_bufs[i]);
    }
}

static void
netdev_linux_rxq_dealloc(struct netdev_rxq *rxq_)
{
    struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);

    free(rx);
}

static ovs_be16
auxdata_to_vlan_tpid(const struct tpacket_auxdata *aux, bool double_tagged)
{
    if (aux->tp_status & TP_STATUS_VLAN_TPID_VALID) {
        return htons(aux->tp_vlan_tpid);
    } else if (double_tagged) {
        return htons(ETH_TYPE_VLAN_8021AD);
    } else {
        return htons(ETH_TYPE_VLAN_8021Q);
    }
}

static bool
auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
{
    return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
}

/*
 * Receives packets from a raw socket in batches for better performance.  It
 * can receive up to NETDEV_MAX_BURST packets at a time; the received packets
 * are added to *batch.  The return value is 0 or errno.
 *
 * It uses recvmmsg() to reduce the overhead of multiple syscalls.
 */
static int
netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu,
                                 struct dp_packet_batch *batch)
{
    int iovlen;
    size_t std_len;
    ssize_t retval;
    int virtio_net_hdr_size;
    struct iovec iovs[NETDEV_MAX_BURST][IOV_TSO_SIZE];
    struct cmsghdr *cmsg;
    union {
        struct cmsghdr cmsg;
        char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
    } cmsg_buffers[NETDEV_MAX_BURST];
    struct mmsghdr mmsgs[NETDEV_MAX_BURST];
    struct dp_packet *buffers[NETDEV_MAX_BURST];
    int i;

    if (userspace_tso_enabled()) {
        /* Use the buffer from the allocated packet below to receive MTU
         * sized packets and an aux_buf for extra TSO data. */
        iovlen = IOV_TSO_SIZE;
        virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
    } else {
        /* Use only the buffer from the allocated packet. */
        iovlen = IOV_STD_SIZE;
        virtio_net_hdr_size = 0;
    }

    /* The length here needs to be accounted in the same way when the
     * aux_buf is allocated so that it can be prepended to TSO buffer. */
    std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
    for (i = 0; i < NETDEV_MAX_BURST; i++) {
        buffers[i] = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
        iovs[i][IOV_PACKET].iov_base = dp_packet_data(buffers[i]);
        iovs[i][IOV_PACKET].iov_len = std_len;
        if (iovlen == IOV_TSO_SIZE) {
            iovs[i][IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
            iovs[i][IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
        }

        mmsgs[i].msg_hdr.msg_name = NULL;
        mmsgs[i].msg_hdr.msg_namelen = 0;
        mmsgs[i].msg_hdr.msg_iov = iovs[i];
        mmsgs[i].msg_hdr.msg_iovlen = iovlen;
        mmsgs[i].msg_hdr.msg_control = &cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_controllen = sizeof cmsg_buffers[i];
        mmsgs[i].msg_hdr.msg_flags = 0;
    }

    do {
        retval = recvmmsg(rx->fd, mmsgs, NETDEV_MAX_BURST, MSG_TRUNC, NULL);
    } while (retval < 0 && errno == EINTR);

    if (retval < 0) {
        retval = errno;
        for (i = 0; i < NETDEV_MAX_BURST; i++) {
            dp_packet_delete(buffers[i]);
        }

        return retval;
    }

    for (i = 0; i < retval; i++) {
        struct dp_packet *pkt;

        if (mmsgs[i].msg_len < ETH_HEADER_LEN) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            dp_packet_delete(buffers[i]);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: less than ether hdr size",
                         netdev_get_name(netdev_));
            continue;
        }

        if (mmsgs[i].msg_len > std_len) {
            /* Build a single linear TSO packet by prepending the data from
             * std_len buffer to the aux_buf. */
            pkt = rx->aux_bufs[i];
            dp_packet_set_size(pkt, mmsgs[i].msg_len - std_len);
            dp_packet_push(pkt, dp_packet_data(buffers[i]), std_len);
            /* The headroom should be the same in buffers[i], pkt and
             * DP_NETDEV_HEADROOM. */
            dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
            dp_packet_delete(buffers[i]);
            rx->aux_bufs[i] = NULL;
        } else {
            dp_packet_set_size(buffers[i], mmsgs[i].msg_len);
            pkt = buffers[i];
        }

        if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
            struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
            struct netdev_linux *netdev = netdev_linux_cast(netdev_);

            /* Unexpected error situation: the virtio header is not present
             * or corrupted. Drop the packet but continue in case next ones
             * are correct. */
            dp_packet_delete(pkt);
            netdev->rx_dropped += 1;
            VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
                         netdev_get_name(netdev_));
            continue;
        }

        for (cmsg = CMSG_FIRSTHDR(&mmsgs[i].msg_hdr); cmsg;
             cmsg = CMSG_NXTHDR(&mmsgs[i].msg_hdr, cmsg)) {
            const struct tpacket_auxdata *aux;

            if (cmsg->cmsg_level != SOL_PACKET
                || cmsg->cmsg_type != PACKET_AUXDATA
                || cmsg->cmsg_len <
                   CMSG_LEN(sizeof(struct tpacket_auxdata))) {
                continue;
            }

            aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
            if (auxdata_has_vlan_tci(aux)) {
                struct eth_header *eth;
                bool double_tagged;

                eth = dp_packet_data(pkt);
                double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);

                eth_push_vlan(pkt,
                              auxdata_to_vlan_tpid(aux, double_tagged),
                              htons(aux->tp_vlan_tci));
                break;
            }
        }
        dp_packet_batch_add(batch, pkt);
    }

    /* Delete unused buffers. */
    for (; i < NETDEV_MAX_BURST; i++) {
        dp_packet_delete(buffers[i]);
    }

    return 0;
}

1366}
1367
2109841b
YY
1368/*
1369 * Receive packets from tap by batch process for better performance,
1370 * it can receive NETDEV_MAX_BURST packets at most once, the received
1371 * packets are added into *batch. The return value is 0 or errno.
1372 */
b73c8518 1373static int
29cf9c1b
FL
1374netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
1375 struct dp_packet_batch *batch)
b73c8518 1376{
29cf9c1b 1377 int virtio_net_hdr_size;
b73c8518 1378 ssize_t retval;
29cf9c1b
FL
1379 size_t std_len;
1380 int iovlen;
2109841b
YY
1381 int i;
1382
29cf9c1b
FL
1383 if (userspace_tso_enabled()) {
1384 /* Use the buffer from the allocated packet below to receive MTU
1385 * sized packets and an aux_buf for extra TSO data. */
1386 iovlen = IOV_TSO_SIZE;
1387 virtio_net_hdr_size = sizeof(struct virtio_net_hdr);
1388 } else {
1389 /* Use only the buffer from the allocated packet. */
1390 iovlen = IOV_STD_SIZE;
1391 virtio_net_hdr_size = 0;
1392 }
1393
73858f9d
FL
1394 /* The length here needs to be accounted in the same way when the
1395 * aux_buf is allocated so that it can be prepended to TSO buffer. */
1396 std_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu;
2109841b 1397 for (i = 0; i < NETDEV_MAX_BURST; i++) {
73858f9d
FL
1398 struct dp_packet *buffer;
1399 struct dp_packet *pkt;
29cf9c1b
FL
1400 struct iovec iov[IOV_TSO_SIZE];
1401
2109841b 1402 /* Assume Ethernet port. No need to set packet_type. */
29cf9c1b
FL
1403 buffer = dp_packet_new_with_headroom(std_len, DP_NETDEV_HEADROOM);
1404 iov[IOV_PACKET].iov_base = dp_packet_data(buffer);
1405 iov[IOV_PACKET].iov_len = std_len;
73858f9d
FL
1406 if (iovlen == IOV_TSO_SIZE) {
1407 iov[IOV_AUXBUF].iov_base = dp_packet_data(rx->aux_bufs[i]);
1408 iov[IOV_AUXBUF].iov_len = dp_packet_tailroom(rx->aux_bufs[i]);
1409 }
29cf9c1b 1410
2109841b 1411 do {
29cf9c1b 1412 retval = readv(rx->fd, iov, iovlen);
2109841b
YY
1413 } while (retval < 0 && errno == EINTR);
1414
1415 if (retval < 0) {
1416 dp_packet_delete(buffer);
1417 break;
1418 }
b73c8518 1419
29cf9c1b 1420 if (retval > std_len) {
73858f9d
FL
1421 /* Build a single linear TSO packet by prepending the data from
1422 * std_len buffer to the aux_buf. */
1423 pkt = rx->aux_bufs[i];
1424 dp_packet_set_size(pkt, retval - std_len);
1425 dp_packet_push(pkt, dp_packet_data(buffer), std_len);
1426 /* The headroom should be the same in buffers[i], pkt and
1427 * DP_NETDEV_HEADROOM. */
1428 dp_packet_resize(pkt, DP_NETDEV_HEADROOM, 0);
1429 dp_packet_delete(buffer);
1430 rx->aux_bufs[i] = NULL;
29cf9c1b
FL
1431 } else {
1432 dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
73858f9d 1433 pkt = buffer;
29cf9c1b
FL
1434 }
1435
73858f9d 1436 if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(pkt)) {
29cf9c1b
FL
1437 struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up);
1438 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1439
1440 /* Unexpected error situation: the virtio header is not present
1441 * or corrupted. Drop the packet but continue in case next ones
1442 * are correct. */
73858f9d 1443 dp_packet_delete(pkt);
29cf9c1b
FL
1444 netdev->rx_dropped += 1;
1445 VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header",
1446 netdev_get_name(netdev_));
1447 continue;
1448 }
1449
73858f9d 1450 dp_packet_batch_add(batch, pkt);
2109841b 1451 }
b73c8518 1452
2109841b 1453 if ((i == 0) && (retval < 0)) {
bfd3367b 1454 return errno;
8b61709d 1455 }
b73c8518 1456
b73c8518
SH
1457 return 0;
1458}
1459
1460static int
8492adc2
JS
1461netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
1462 int *qfill)
b73c8518 1463{
f7791740 1464 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
df1e5a3b 1465 struct netdev *netdev = rx->up.netdev;
df1e5a3b
PS
1466 ssize_t retval;
1467 int mtu;
1468
1469 if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
1470 mtu = ETH_PAYLOAD_MAX;
1471 }
1472
73858f9d
FL
1473 if (userspace_tso_enabled()) {
1474 /* Allocate TSO packets. The packet has enough headroom to store
1475 * a full non-TSO packet. When a TSO packet is received, the data
1476 * from non-TSO buffer (std_len) is prepended to the TSO packet
1477 * (aux_buf). */
1478 size_t std_len = sizeof(struct virtio_net_hdr) + VLAN_ETH_HEADER_LEN
1479 + DP_NETDEV_HEADROOM + mtu;
1480 size_t data_len = LINUX_RXQ_TSO_MAX_LEN - std_len;
1481 for (int i = 0; i < NETDEV_MAX_BURST; i++) {
1482 if (rx->aux_bufs[i]) {
1483 continue;
1484 }
1485
1486 rx->aux_bufs[i] = dp_packet_new_with_headroom(data_len, std_len);
1487 }
1488 }
1489
2109841b 1490 dp_packet_batch_init(batch);
b73c8518 1491 retval = (rx->is_tap
29cf9c1b
FL
1492 ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch)
1493 : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch));
df1e5a3b
PS
1494
1495 if (retval) {
1496 if (retval != EAGAIN && retval != EMSGSIZE) {
1497 VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
7c6401ca 1498 netdev_rxq_get_name(rxq_), ovs_strerror(errno));
df1e5a3b 1499 }
b73c8518
SH
1500 }
1501
8492adc2
JS
1502 if (qfill) {
1503 *qfill = -ENOTSUP;
1504 }
1505
b73c8518 1506 return retval;
8b61709d
BP
1507}
1508
8b61709d 1509static void
f7791740 1510netdev_linux_rxq_wait(struct netdev_rxq *rxq_)
8b61709d 1511{
f7791740 1512 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1513 poll_fd_wait(rx->fd, POLLIN);
8b61709d
BP
1514}
1515
8b61709d 1516static int
f7791740 1517netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
8b61709d 1518{
f7791740 1519 struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
796223f5 1520 if (rx->is_tap) {
8b61709d 1521 struct ifreq ifr;
f7791740 1522 int error = af_inet_ifreq_ioctl(netdev_rxq_get_name(rxq_), &ifr,
259e0b1a 1523 SIOCGIFTXQLEN, "SIOCGIFTXQLEN");
8b61709d
BP
1524 if (error) {
1525 return error;
1526 }
796223f5 1527 drain_fd(rx->fd, ifr.ifr_qlen);
8b61709d
BP
1528 return 0;
1529 } else {
796223f5 1530 return drain_rcvbuf(rx->fd);
8b61709d
BP
1531 }
1532}
1533
d19cf8bb 1534static int
29cf9c1b 1535netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu,
d19cf8bb
ZG
1536 struct dp_packet_batch *batch)
1537{
e0a00cee 1538 const size_t size = dp_packet_batch_size(batch);
d19cf8bb
ZG
1539 /* We don't bother setting most fields in sockaddr_ll because the
1540 * kernel ignores them for SOCK_RAW. */
1541 struct sockaddr_ll sll = { .sll_family = AF_PACKET,
1542 .sll_ifindex = ifindex };
1543
e0a00cee
BB
1544 struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * size);
1545 struct iovec *iov = xmalloc(sizeof(*iov) * size);
d19cf8bb 1546
e0a00cee 1547 struct dp_packet *packet;
e883448e 1548 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
29cf9c1b
FL
1549 if (tso) {
1550 netdev_linux_prepend_vnet_hdr(packet, mtu);
1551 }
1552
d19cf8bb 1553 iov[i].iov_base = dp_packet_data(packet);
ad8b0b4f 1554 iov[i].iov_len = dp_packet_size(packet);
d19cf8bb
ZG
1555 mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
1556 .msg_namelen = sizeof sll,
1557 .msg_iov = &iov[i],
1558 .msg_iovlen = 1 };
1559 }
1560
1561 int error = 0;
e0a00cee 1562 for (uint32_t ofs = 0; ofs < size; ) {
d19cf8bb
ZG
1563 ssize_t retval;
1564 do {
e0a00cee 1565 retval = sendmmsg(sock, mmsg + ofs, size - ofs, 0);
d19cf8bb
ZG
1566 error = retval < 0 ? errno : 0;
1567 } while (error == EINTR);
1568 if (error) {
1569 break;
1570 }
1571 ofs += retval;
1572 }
1573
1574 free(mmsg);
1575 free(iov);
1576 return error;
1577}
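
/* A minimal sketch of the sendmmsg() resume loop above, in isolation.
 * sendmmsg() may transmit only a prefix of the array, so the caller restarts
 * at the first unsent message and treats EINTR as "retry the same slice".
 * Socket and per-message setup are assumed to have been done already, as in
 * netdev_linux_sock_batch_send(). */
#define _GNU_SOURCE
#include <errno.h>
#include <sys/socket.h>

static int
sketch_send_all(int sock, struct mmsghdr *mmsg, unsigned int n)
{
    unsigned int ofs = 0;

    while (ofs < n) {
        int retval = sendmmsg(sock, mmsg + ofs, n - ofs, 0);
        if (retval < 0) {
            if (errno == EINTR) {
                continue;       /* Interrupted before anything was sent. */
            }
            return errno;       /* e.g. ENOBUFS; the caller decides how to map it. */
        }
        ofs += retval;          /* Resume after the messages just sent. */
    }
    return 0;
}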
1578
1579/* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap fd is
1580 * essential, because packets sent to a tap device with an AF_PACKET socket
1581 * will loop back to be *received* again on the tap device. This doesn't occur
1582 * on other interface types because we attach a socket filter to the rx
1583 * socket. */
1584static int
29cf9c1b 1585netdev_linux_tap_batch_send(struct netdev *netdev_, bool tso, int mtu,
d19cf8bb
ZG
1586 struct dp_packet_batch *batch)
1587{
1588 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
13708b21 1589 struct dp_packet *packet;
22dcb534
FL
1590
1591 /* The Linux tap driver returns EIO if the device is not up,
 1592 * so if the device is not up, don't waste time sending to it.
1593 * However, if the device is in another network namespace
1594 * then OVS can't retrieve the state. In that case, send the
1595 * packets anyway. */
1596 if (netdev->present && !(netdev->ifi_flags & IFF_UP)) {
1597 netdev->tx_dropped += dp_packet_batch_size(batch);
1598 return 0;
1599 }
1600
e883448e 1601 DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
29cf9c1b 1602 size_t size;
d19cf8bb
ZG
1603 ssize_t retval;
1604 int error;
1605
29cf9c1b
FL
1606 if (tso) {
1607 netdev_linux_prepend_vnet_hdr(packet, mtu);
1608 }
1609
1610 size = dp_packet_size(packet);
d19cf8bb
ZG
1611 do {
1612 retval = write(netdev->tap_fd, dp_packet_data(packet), size);
1613 error = retval < 0 ? errno : 0;
1614 } while (error == EINTR);
1615
1616 if (error) {
1617 /* The Linux tap driver returns EIO if the device is not up. From
1618 * the OVS side this is not an error, so we ignore it; otherwise,
 1619 * return the error. */
1620 if (error != EIO) {
1621 return error;
1622 }
1623 } else if (retval != size) {
1624 VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
1625 "bytes of %"PRIuSIZE") on %s",
1626 retval, size, netdev_get_name(netdev_));
1627 return EMSGSIZE;
1628 }
1629 }
1630 return 0;
1631}
1632
105cf8df
WT
1633static int
1634netdev_linux_get_numa_id__(struct netdev_linux *netdev)
1635 OVS_REQUIRES(netdev->mutex)
1636{
1637 char *numa_node_path;
1638 const char *name;
1639 int node_id;
1640 FILE *stream;
1641
1642 if (netdev->cache_valid & VALID_NUMA_ID) {
1643 return netdev->numa_id;
1644 }
1645
1646 netdev->numa_id = 0;
1647 netdev->cache_valid |= VALID_NUMA_ID;
1648
1649 if (ovs_numa_get_n_numas() < 2) {
 1650 /* No need to check on a system with a single NUMA node. */
1651 return 0;
1652 }
1653
1654 name = netdev_get_name(&netdev->up);
1655 if (strpbrk(name, "/\\")) {
1656 VLOG_ERR_RL(&rl, "\"%s\" is not a valid name for a port. "
 1657 "A valid name must not include '/' or '\\'. "
1658 "Using numa_id 0", name);
1659 return 0;
1660 }
1661
1662 numa_node_path = xasprintf("/sys/class/net/%s/device/numa_node", name);
1663
1664 stream = fopen(numa_node_path, "r");
1665 if (!stream) {
 1666 /* Virtual devices do not have this info. */
1667 VLOG_INFO_RL(&rl, "%s: Can't open '%s': %s, using numa_id 0",
1668 name, numa_node_path, ovs_strerror(errno));
1669 free(numa_node_path);
1670 return 0;
1671 }
1672
1673 if (fscanf(stream, "%d", &node_id) != 1
1674 || !ovs_numa_numa_id_is_valid(node_id)) {
1675 VLOG_WARN_RL(&rl, "%s: Can't detect NUMA node, using numa_id 0", name);
1676 node_id = 0;
1677 }
1678
1679 netdev->numa_id = node_id;
1680 fclose(stream);
1681 free(numa_node_path);
1682 return node_id;
1683}
1684
1685static int OVS_UNUSED
1686netdev_linux_get_numa_id(const struct netdev *netdev_)
1687{
1688 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1689 int numa_id;
1690
1691 ovs_mutex_lock(&netdev->mutex);
1692 numa_id = netdev_linux_get_numa_id__(netdev);
1693 ovs_mutex_unlock(&netdev->mutex);
1694
1695 return numa_id;
1696}
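
/* A standalone sketch of the sysfs lookup above: read
 * /sys/class/net/<dev>/device/numa_node and fall back to node 0 when the
 * attribute is missing (e.g. virtual devices) or unusable.  A hedged
 * illustration only; it omits the caching and name validation done above. */
#include <stdio.h>

static int
sketch_get_numa_node(const char *dev)
{
    char path[256];
    int node = 0;
    FILE *f;

    snprintf(path, sizeof path, "/sys/class/net/%s/device/numa_node", dev);
    f = fopen(path, "r");
    if (f) {
        if (fscanf(f, "%d", &node) != 1 || node < 0) {
            node = 0;   /* The kernel reports -1 when there is no affinity. */
        }
        fclose(f);
    }
    return node;
}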
1697
d19cf8bb 1698/* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive
8b61709d
BP
1699 * errno value. Returns EAGAIN without blocking if the packet cannot be queued
1700 * immediately. Returns EMSGSIZE if a partial packet was transmitted or if
1701 * the packet is too big or too small to transmit on the device.
1702 *
8b61709d
BP
1703 * The kernel maintains a packet transmission queue, so the caller is not
1704 * expected to do additional queuing of packets. */
1705static int
f00fa8cb 1706netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
b30896c9 1707 struct dp_packet_batch *batch,
324c8374 1708 bool concurrent_txq OVS_UNUSED)
8b61709d 1709{
29cf9c1b
FL
1710 bool tso = userspace_tso_enabled();
1711 int mtu = ETH_PAYLOAD_MAX;
f4fd623c 1712 int error = 0;
0a62ae2c
ZG
1713 int sock = 0;
1714
29cf9c1b
FL
1715 if (tso) {
1716 netdev_linux_get_mtu__(netdev_linux_cast(netdev_), &mtu);
1717 }
1718
0a62ae2c 1719 if (!is_tap_netdev(netdev_)) {
e0e2410d
FL
1720 if (netdev_linux_netnsid_is_remote(netdev_linux_cast(netdev_))) {
1721 error = EOPNOTSUPP;
1722 goto free_batch;
1723 }
1724
0a62ae2c
ZG
1725 sock = af_packet_sock();
1726 if (sock < 0) {
1727 error = -sock;
1728 goto free_batch;
1729 }
1730
1731 int ifindex = netdev_get_ifindex(netdev_);
1732 if (ifindex < 0) {
1733 error = -ifindex;
1734 goto free_batch;
1735 }
1736
29cf9c1b 1737 error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch);
d19cf8bb 1738 } else {
29cf9c1b 1739 error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch);
0a62ae2c 1740 }
d19cf8bb
ZG
1741 if (error) {
1742 if (error == ENOBUFS) {
1743 /* The Linux AF_PACKET implementation never blocks waiting
1744 * for room for packets, instead returning ENOBUFS.
1745 * Translate this into EAGAIN for the caller. */
1746 error = EAGAIN;
f23347ea 1747 } else {
f4fd623c
DDP
1748 VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
1749 netdev_get_name(netdev_), ovs_strerror(error));
d19cf8bb 1750 }
f4fd623c
DDP
1751 }
1752
0a62ae2c 1753free_batch:
b30896c9 1754 dp_packet_delete_batch(batch, true);
f4fd623c 1755 return error;
8b61709d
BP
1756}
1757
1758/* Registers with the poll loop to wake up from the next call to poll_block()
1759 * when the packet transmission queue has sufficient room to transmit a packet
1760 * with netdev_send().
1761 *
1762 * The kernel maintains a packet transmission queue, so the client is not
1763 * expected to do additional queuing of packets. Thus, this function is
1764 * unlikely to ever be used. It is included for completeness. */
1765static void
f00fa8cb 1766netdev_linux_send_wait(struct netdev *netdev, int qid OVS_UNUSED)
8b61709d 1767{
796223f5 1768 if (is_tap_netdev(netdev)) {
8b61709d
BP
 1769 /* TAP device always accepts packets. */
1770 poll_immediate_wake();
1771 }
1772}
1773
1774/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful,
1775 * otherwise a positive errno value. */
1776static int
74ff3298 1777netdev_linux_set_etheraddr(struct netdev *netdev_, const struct eth_addr mac)
8b61709d 1778{
b5d57fc8 1779 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4f9f3f21 1780 enum netdev_flags old_flags = 0;
eb395f2e
BP
1781 int error;
1782
86383816 1783 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1784 if (netdev_linux_netnsid_is_remote(netdev)) {
1785 error = EOPNOTSUPP;
1786 goto exit;
1787 }
86383816 1788
b5d57fc8 1789 if (netdev->cache_valid & VALID_ETHERADDR) {
86383816
BP
1790 error = netdev->ether_addr_error;
1791 if (error || eth_addr_equals(netdev->etheraddr, mac)) {
1792 goto exit;
44445cac 1793 }
b5d57fc8 1794 netdev->cache_valid &= ~VALID_ETHERADDR;
44445cac
PS
1795 }
1796
7eb1bd81 1797 /* Tap devices must be brought down before setting the address. */
796223f5 1798 if (is_tap_netdev(netdev_)) {
4f9f3f21 1799 update_flags(netdev, NETDEV_UP, 0, &old_flags);
7eb1bd81 1800 }
44445cac
PS
1801 error = set_etheraddr(netdev_get_name(netdev_), mac);
1802 if (!error || error == ENODEV) {
b5d57fc8
BP
1803 netdev->ether_addr_error = error;
1804 netdev->cache_valid |= VALID_ETHERADDR;
eb395f2e 1805 if (!error) {
74ff3298 1806 netdev->etheraddr = mac;
eb395f2e 1807 }
8b61709d 1808 }
44445cac 1809
4f9f3f21
BP
1810 if (is_tap_netdev(netdev_) && old_flags & NETDEV_UP) {
1811 update_flags(netdev, 0, NETDEV_UP, &old_flags);
1812 }
7eb1bd81 1813
86383816
BP
1814exit:
1815 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
1816 return error;
1817}
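
/* A minimal sketch of how a MAC address is set at the ioctl level, assuming
 * that set_etheraddr() (defined elsewhere in this file) wraps SIOCSIFHWADDR.
 * 'sock' is assumed to be an AF_INET control socket; error handling is
 * reduced to returning the errno value. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <sys/ioctl.h>

static int
sketch_set_mac(int sock, const char *dev, const uint8_t mac[6])
{
    struct ifreq ifr;

    memset(&ifr, 0, sizeof ifr);
    snprintf(ifr.ifr_name, sizeof ifr.ifr_name, "%s", dev);
    ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
    memcpy(ifr.ifr_hwaddr.sa_data, mac, 6);

    return ioctl(sock, SIOCSIFHWADDR, &ifr) < 0 ? errno : 0;
}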
1818
44445cac 1819/* Copies 'netdev''s MAC address into 'mac'. */
8b61709d 1820static int
74ff3298 1821netdev_linux_get_etheraddr(const struct netdev *netdev_, struct eth_addr *mac)
8b61709d 1822{
b5d57fc8 1823 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 1824 int error;
44445cac 1825
86383816 1826 ovs_mutex_lock(&netdev->mutex);
b5d57fc8 1827 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
756819dd
FL
1828 netdev_linux_update_via_netlink(netdev);
1829 }
1830
1831 if (!(netdev->cache_valid & VALID_ETHERADDR)) {
1832 /* Fall back to ioctl if netlink fails */
86383816 1833 netdev->ether_addr_error = get_etheraddr(netdev_get_name(netdev_),
74ff3298 1834 &netdev->etheraddr);
b5d57fc8 1835 netdev->cache_valid |= VALID_ETHERADDR;
8b61709d 1836 }
44445cac 1837
86383816
BP
1838 error = netdev->ether_addr_error;
1839 if (!error) {
74ff3298 1840 *mac = netdev->etheraddr;
44445cac 1841 }
86383816 1842 ovs_mutex_unlock(&netdev->mutex);
44445cac 1843
86383816 1844 return error;
8b61709d
BP
1845}
1846
8b61709d 1847static int
73371c09 1848netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup)
8b61709d 1849{
86383816
BP
1850 int error;
1851
b5d57fc8 1852 if (!(netdev->cache_valid & VALID_MTU)) {
756819dd
FL
1853 netdev_linux_update_via_netlink(netdev);
1854 }
1855
1856 if (!(netdev->cache_valid & VALID_MTU)) {
1857 /* Fall back to ioctl if netlink fails */
8b61709d 1858 struct ifreq ifr;
90a6637d 1859
86383816 1860 netdev->netdev_mtu_error = af_inet_ifreq_ioctl(
73371c09 1861 netdev_get_name(&netdev->up), &ifr, SIOCGIFMTU, "SIOCGIFMTU");
b5d57fc8
BP
1862 netdev->mtu = ifr.ifr_mtu;
1863 netdev->cache_valid |= VALID_MTU;
8b61709d 1864 }
90a6637d 1865
86383816
BP
1866 error = netdev->netdev_mtu_error;
1867 if (!error) {
b5d57fc8 1868 *mtup = netdev->mtu;
90a6637d 1869 }
73371c09
BP
1870
1871 return error;
1872}
1873
1874/* Returns the maximum size of transmitted (and received) packets on 'netdev',
1875 * in bytes, not including the hardware header; thus, this is typically 1500
1876 * bytes for Ethernet devices. */
1877static int
1878netdev_linux_get_mtu(const struct netdev *netdev_, int *mtup)
1879{
1880 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1881 int error;
1882
1883 ovs_mutex_lock(&netdev->mutex);
1884 error = netdev_linux_get_mtu__(netdev, mtup);
86383816
BP
1885 ovs_mutex_unlock(&netdev->mutex);
1886
1887 return error;
8b61709d
BP
1888}
1889
9b020780
PS
 1890/* Sets the maximum transmission unit (MTU) of the given device using the
 1891 * Linux networking ioctl interface.
1892 */
1893static int
4124cb12 1894netdev_linux_set_mtu(struct netdev *netdev_, int mtu)
9b020780 1895{
b5d57fc8 1896 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9b020780
PS
1897 struct ifreq ifr;
1898 int error;
1899
86383816 1900 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1901 if (netdev_linux_netnsid_is_remote(netdev)) {
1902 error = EOPNOTSUPP;
1903 goto exit;
1904 }
1905
52b5a5c0
EC
1906#ifdef HAVE_AF_XDP
1907 if (netdev_get_class(netdev_) == &netdev_afxdp_class) {
1908 error = netdev_afxdp_verify_mtu_size(netdev_, mtu);
1909 if (error) {
1910 goto exit;
1911 }
1912 }
1913#endif
1914
b5d57fc8 1915 if (netdev->cache_valid & VALID_MTU) {
86383816
BP
1916 error = netdev->netdev_mtu_error;
1917 if (error || netdev->mtu == mtu) {
1918 goto exit;
90a6637d 1919 }
b5d57fc8 1920 netdev->cache_valid &= ~VALID_MTU;
153e5481 1921 }
9b020780 1922 ifr.ifr_mtu = mtu;
259e0b1a
BP
1923 error = af_inet_ifreq_ioctl(netdev_get_name(netdev_), &ifr,
1924 SIOCSIFMTU, "SIOCSIFMTU");
90a6637d 1925 if (!error || error == ENODEV) {
b5d57fc8
BP
1926 netdev->netdev_mtu_error = error;
1927 netdev->mtu = ifr.ifr_mtu;
1928 netdev->cache_valid |= VALID_MTU;
9b020780 1929 }
86383816
BP
1930exit:
1931 ovs_mutex_unlock(&netdev->mutex);
90a6637d 1932 return error;
9b020780
PS
1933}
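
/* A minimal sketch of the raw SIOCGIFMTU/SIOCSIFMTU calls that the cached
 * get/set paths above ultimately rely on, without the caching, AF_XDP, or
 * network-namespace checks.  'sock' is assumed to be an AF_INET control
 * socket. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <net/if.h>
#include <sys/ioctl.h>

static int
sketch_get_mtu(int sock, const char *dev, int *mtu)
{
    struct ifreq ifr;

    memset(&ifr, 0, sizeof ifr);
    snprintf(ifr.ifr_name, sizeof ifr.ifr_name, "%s", dev);
    if (ioctl(sock, SIOCGIFMTU, &ifr) < 0) {
        return errno;
    }
    *mtu = ifr.ifr_mtu;
    return 0;
}

static int
sketch_set_mtu(int sock, const char *dev, int mtu)
{
    struct ifreq ifr;

    memset(&ifr, 0, sizeof ifr);
    snprintf(ifr.ifr_name, sizeof ifr.ifr_name, "%s", dev);
    ifr.ifr_mtu = mtu;
    return ioctl(sock, SIOCSIFMTU, &ifr) < 0 ? errno : 0;
}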
1934
9ab3d9a3
BP
1935/* Returns the ifindex of 'netdev', if successful, as a positive number.
1936 * On failure, returns a negative errno value. */
1937static int
86383816 1938netdev_linux_get_ifindex(const struct netdev *netdev_)
9ab3d9a3 1939{
86383816 1940 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
9ab3d9a3
BP
1941 int ifindex, error;
1942
86383816 1943 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
1944 if (netdev_linux_netnsid_is_remote(netdev)) {
1945 error = EOPNOTSUPP;
1946 goto exit;
1947 }
86383816 1948 error = get_ifindex(netdev_, &ifindex);
86383816 1949
e0e2410d
FL
1950exit:
1951 ovs_mutex_unlock(&netdev->mutex);
9ab3d9a3
BP
1952 return error ? -error : ifindex;
1953}
1954
8b61709d
BP
1955static int
1956netdev_linux_get_carrier(const struct netdev *netdev_, bool *carrier)
1957{
b5d57fc8 1958 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 1959
86383816 1960 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
1961 if (netdev->miimon_interval > 0) {
1962 *carrier = netdev->miimon;
3a183124 1963 } else {
b5d57fc8 1964 *carrier = (netdev->ifi_flags & IFF_RUNNING) != 0;
8b61709d 1965 }
86383816 1966 ovs_mutex_unlock(&netdev->mutex);
8b61709d 1967
3a183124 1968 return 0;
8b61709d
BP
1969}
1970
65c3058c 1971static long long int
86383816 1972netdev_linux_get_carrier_resets(const struct netdev *netdev_)
65c3058c 1973{
86383816
BP
1974 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1975 long long int carrier_resets;
1976
1977 ovs_mutex_lock(&netdev->mutex);
1978 carrier_resets = netdev->carrier_resets;
1979 ovs_mutex_unlock(&netdev->mutex);
1980
1981 return carrier_resets;
65c3058c
EJ
1982}
1983
63331829 1984static int
1670c579
EJ
1985netdev_linux_do_miimon(const char *name, int cmd, const char *cmd_name,
1986 struct mii_ioctl_data *data)
63331829 1987{
63331829 1988 struct ifreq ifr;
782e6111 1989 int error;
63331829 1990
63331829 1991 memset(&ifr, 0, sizeof ifr);
782e6111 1992 memcpy(&ifr.ifr_data, data, sizeof *data);
259e0b1a 1993 error = af_inet_ifreq_ioctl(name, &ifr, cmd, cmd_name);
782e6111 1994 memcpy(data, &ifr.ifr_data, sizeof *data);
63331829 1995
782e6111
EJ
1996 return error;
1997}
1998
1999static int
1670c579 2000netdev_linux_get_miimon(const char *name, bool *miimon)
782e6111 2001{
782e6111
EJ
2002 struct mii_ioctl_data data;
2003 int error;
63331829 2004
782e6111
EJ
2005 *miimon = false;
2006
2007 memset(&data, 0, sizeof data);
1670c579 2008 error = netdev_linux_do_miimon(name, SIOCGMIIPHY, "SIOCGMIIPHY", &data);
782e6111
EJ
2009 if (!error) {
 2010 /* data.phy_id is filled out by the previous SIOCGMIIPHY miimon call. */
2011 data.reg_num = MII_BMSR;
1670c579 2012 error = netdev_linux_do_miimon(name, SIOCGMIIREG, "SIOCGMIIREG",
782e6111 2013 &data);
63331829
EJ
2014
2015 if (!error) {
782e6111 2016 *miimon = !!(data.val_out & BMSR_LSTATUS);
63331829 2017 }
9120cfc0
DH
2018 }
2019 if (error) {
63331829 2020 struct ethtool_cmd ecmd;
63331829
EJ
2021
2022 VLOG_DBG_RL(&rl, "%s: failed to query MII, falling back to ethtool",
2023 name);
2024
ab985a77 2025 COVERAGE_INC(netdev_get_ethtool);
63331829
EJ
2026 memset(&ecmd, 0, sizeof ecmd);
2027 error = netdev_linux_do_ethtool(name, &ecmd, ETHTOOL_GLINK,
2028 "ETHTOOL_GLINK");
2029 if (!error) {
782e6111
EJ
2030 struct ethtool_value eval;
2031
2032 memcpy(&eval, &ecmd, sizeof eval);
2033 *miimon = !!eval.data;
63331829
EJ
2034 } else {
2035 VLOG_WARN_RL(&rl, "%s: ethtool link status failed", name);
2036 }
2037 }
2038
2039 return error;
2040}
2041
1670c579
EJ
2042static int
2043netdev_linux_set_miimon_interval(struct netdev *netdev_,
2044 long long int interval)
2045{
b5d57fc8 2046 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
1670c579 2047
86383816 2048 ovs_mutex_lock(&netdev->mutex);
1670c579 2049 interval = interval > 0 ? MAX(interval, 100) : 0;
b5d57fc8 2050 if (netdev->miimon_interval != interval) {
19c8e9c1 2051 if (interval && !netdev->miimon_interval) {
812c272c 2052 atomic_count_inc(&miimon_cnt);
19c8e9c1 2053 } else if (!interval && netdev->miimon_interval) {
812c272c 2054 atomic_count_dec(&miimon_cnt);
19c8e9c1
JS
2055 }
2056
b5d57fc8
BP
2057 netdev->miimon_interval = interval;
2058 timer_set_expired(&netdev->miimon_timer);
1670c579 2059 }
86383816 2060 ovs_mutex_unlock(&netdev->mutex);
1670c579
EJ
2061
2062 return 0;
2063}
2064
2065static void
2066netdev_linux_miimon_run(void)
2067{
2068 struct shash device_shash;
2069 struct shash_node *node;
2070
2071 shash_init(&device_shash);
b5d57fc8 2072 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 2073 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
2074 struct netdev *netdev = node->data;
2075 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579
EJ
2076 bool miimon;
2077
86383816
BP
2078 ovs_mutex_lock(&dev->mutex);
2079 if (dev->miimon_interval > 0 && timer_expired(&dev->miimon_timer)) {
2080 netdev_linux_get_miimon(dev->up.name, &miimon);
2081 if (miimon != dev->miimon) {
2082 dev->miimon = miimon;
2083 netdev_linux_changed(dev, dev->ifi_flags, 0);
2084 }
1670c579 2085
86383816 2086 timer_set_duration(&dev->miimon_timer, dev->miimon_interval);
1670c579 2087 }
86383816 2088 ovs_mutex_unlock(&dev->mutex);
2f980d74 2089 netdev_close(netdev);
1670c579
EJ
2090 }
2091
2092 shash_destroy(&device_shash);
2093}
2094
2095static void
2096netdev_linux_miimon_wait(void)
2097{
2098 struct shash device_shash;
2099 struct shash_node *node;
2100
2101 shash_init(&device_shash);
b5d57fc8 2102 netdev_get_devices(&netdev_linux_class, &device_shash);
1670c579 2103 SHASH_FOR_EACH (node, &device_shash) {
96172faa
BP
2104 struct netdev *netdev = node->data;
2105 struct netdev_linux *dev = netdev_linux_cast(netdev);
1670c579 2106
86383816 2107 ovs_mutex_lock(&dev->mutex);
1670c579
EJ
2108 if (dev->miimon_interval > 0) {
2109 timer_wait(&dev->miimon_timer);
2110 }
86383816 2111 ovs_mutex_unlock(&dev->mutex);
2f980d74 2112 netdev_close(netdev);
1670c579
EJ
2113 }
2114 shash_destroy(&device_shash);
2115}
2116
92df599c
JG
2117static void
2118swap_uint64(uint64_t *a, uint64_t *b)
2119{
1de0e8ae
BP
2120 uint64_t tmp = *a;
2121 *a = *b;
2122 *b = tmp;
92df599c
JG
2123}
2124
c060c4cf
EJ
2125/* Copies 'src' into 'dst', performing format conversion in the process.
2126 *
2127 * 'src' is allowed to be misaligned. */
2128static void
2129netdev_stats_from_ovs_vport_stats(struct netdev_stats *dst,
2130 const struct ovs_vport_stats *src)
2131{
6a54dedc
BP
2132 dst->rx_packets = get_32aligned_u64(&src->rx_packets);
2133 dst->tx_packets = get_32aligned_u64(&src->tx_packets);
2134 dst->rx_bytes = get_32aligned_u64(&src->rx_bytes);
2135 dst->tx_bytes = get_32aligned_u64(&src->tx_bytes);
2136 dst->rx_errors = get_32aligned_u64(&src->rx_errors);
2137 dst->tx_errors = get_32aligned_u64(&src->tx_errors);
2138 dst->rx_dropped = get_32aligned_u64(&src->rx_dropped);
2139 dst->tx_dropped = get_32aligned_u64(&src->tx_dropped);
c060c4cf
EJ
2140 dst->multicast = 0;
2141 dst->collisions = 0;
2142 dst->rx_length_errors = 0;
2143 dst->rx_over_errors = 0;
2144 dst->rx_crc_errors = 0;
2145 dst->rx_frame_errors = 0;
2146 dst->rx_fifo_errors = 0;
2147 dst->rx_missed_errors = 0;
2148 dst->tx_aborted_errors = 0;
2149 dst->tx_carrier_errors = 0;
2150 dst->tx_fifo_errors = 0;
2151 dst->tx_heartbeat_errors = 0;
2152 dst->tx_window_errors = 0;
2153}
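
/* A sketch of why get_32aligned_u64()-style accessors are used above: the
 * vport stats arrive in a Netlink payload that only guarantees 4-byte
 * alignment, so 64-bit counters are read as two 32-bit halves (or memcpy'd)
 * rather than dereferenced directly.  The struct layout here is an assumption
 * for illustration, not OVS's actual definition. */
#include <stdint.h>
#include <string.h>

struct sketch_32aligned_u64 {
    uint32_t hi;
    uint32_t lo;
};

static uint64_t
sketch_get_32aligned_u64(const struct sketch_32aligned_u64 *x)
{
    return ((uint64_t) x->hi << 32) | x->lo;
}

/* Layout-agnostic alternative for possibly-unaligned memory. */
static uint64_t
sketch_get_unaligned_u64(const void *p)
{
    uint64_t x;

    memcpy(&x, p, sizeof x);
    return x;
}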
2154
2155static int
2156get_stats_via_vport__(const struct netdev *netdev, struct netdev_stats *stats)
2157{
93451a0a 2158 struct dpif_netlink_vport reply;
c060c4cf
EJ
2159 struct ofpbuf *buf;
2160 int error;
2161
93451a0a 2162 error = dpif_netlink_vport_get(netdev_get_name(netdev), &reply, &buf);
c060c4cf
EJ
2163 if (error) {
2164 return error;
2165 } else if (!reply.stats) {
2166 ofpbuf_delete(buf);
2167 return EOPNOTSUPP;
2168 }
2169
2170 netdev_stats_from_ovs_vport_stats(stats, reply.stats);
2171
2172 ofpbuf_delete(buf);
2173
2174 return 0;
2175}
2176
f613a0d7
PS
2177static void
2178get_stats_via_vport(const struct netdev *netdev_,
2179 struct netdev_stats *stats)
8b61709d 2180{
b5d57fc8 2181 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d 2182
b5d57fc8
BP
2183 if (!netdev->vport_stats_error ||
2184 !(netdev->cache_valid & VALID_VPORT_STAT_ERROR)) {
f613a0d7 2185 int error;
7fbef77a 2186
c060c4cf 2187 error = get_stats_via_vport__(netdev_, stats);
bb13fe5e 2188 if (error && error != ENOENT && error != ENODEV) {
a57a8488 2189 VLOG_WARN_RL(&rl, "%s: obtaining netdev stats via vport failed "
10a89ef0
BP
2190 "(%s)",
2191 netdev_get_name(netdev_), ovs_strerror(error));
f613a0d7 2192 }
b5d57fc8
BP
2193 netdev->vport_stats_error = error;
2194 netdev->cache_valid |= VALID_VPORT_STAT_ERROR;
8b61709d 2195 }
f613a0d7 2196}
8b61709d 2197
f613a0d7
PS
2198/* Retrieves current device stats for 'netdev-linux'. */
2199static int
2200netdev_linux_get_stats(const struct netdev *netdev_,
2201 struct netdev_stats *stats)
2202{
b5d57fc8 2203 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
2204 struct netdev_stats dev_stats;
2205 int error;
2206
86383816 2207 ovs_mutex_lock(&netdev->mutex);
f613a0d7 2208 get_stats_via_vport(netdev_, stats);
35eef899 2209 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 2210 if (error) {
86383816
BP
2211 if (!netdev->vport_stats_error) {
2212 error = 0;
f613a0d7 2213 }
86383816 2214 } else if (netdev->vport_stats_error) {
04c881eb 2215 /* Stats are not available from OVS, so use netdev stats. */
f613a0d7
PS
2216 *stats = dev_stats;
2217 } else {
f613a0d7
PS
2218 stats->multicast += dev_stats.multicast;
2219 stats->collisions += dev_stats.collisions;
2220 stats->rx_length_errors += dev_stats.rx_length_errors;
2221 stats->rx_over_errors += dev_stats.rx_over_errors;
2222 stats->rx_crc_errors += dev_stats.rx_crc_errors;
2223 stats->rx_frame_errors += dev_stats.rx_frame_errors;
2224 stats->rx_fifo_errors += dev_stats.rx_fifo_errors;
2225 stats->rx_missed_errors += dev_stats.rx_missed_errors;
2226 stats->tx_aborted_errors += dev_stats.tx_aborted_errors;
2227 stats->tx_carrier_errors += dev_stats.tx_carrier_errors;
2228 stats->tx_fifo_errors += dev_stats.tx_fifo_errors;
2229 stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
2230 stats->tx_window_errors += dev_stats.tx_window_errors;
2231 }
86383816
BP
2232 ovs_mutex_unlock(&netdev->mutex);
2233
2234 return error;
f613a0d7
PS
2235}
2236
2237/* Retrieves current device stats for 'netdev-tap' netdev or
2238 * netdev-internal. */
2239static int
15aee116 2240netdev_tap_get_stats(const struct netdev *netdev_, struct netdev_stats *stats)
f613a0d7 2241{
b5d57fc8 2242 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
f613a0d7
PS
2243 struct netdev_stats dev_stats;
2244 int error;
2245
86383816 2246 ovs_mutex_lock(&netdev->mutex);
f613a0d7 2247 get_stats_via_vport(netdev_, stats);
35eef899 2248 error = get_stats_via_netlink(netdev_, &dev_stats);
f613a0d7 2249 if (error) {
86383816
BP
2250 if (!netdev->vport_stats_error) {
2251 error = 0;
8b61709d 2252 }
86383816
BP
2253 } else if (netdev->vport_stats_error) {
2254 /* Transmit and receive stats will appear to be swapped relative to the
2255 * other ports since we are the one sending the data, not a remote
2256 * computer. For consistency, we swap them back here. This does not
2257 * apply if we are getting stats from the vport layer because it always
2258 * tracks stats from the perspective of the switch. */
fe6b0e03 2259
f613a0d7 2260 *stats = dev_stats;
92df599c
JG
2261 swap_uint64(&stats->rx_packets, &stats->tx_packets);
2262 swap_uint64(&stats->rx_bytes, &stats->tx_bytes);
2263 swap_uint64(&stats->rx_errors, &stats->tx_errors);
2264 swap_uint64(&stats->rx_dropped, &stats->tx_dropped);
fe6b0e03
JG
2265 stats->rx_length_errors = 0;
2266 stats->rx_over_errors = 0;
2267 stats->rx_crc_errors = 0;
2268 stats->rx_frame_errors = 0;
2269 stats->rx_fifo_errors = 0;
2270 stats->rx_missed_errors = 0;
2271 stats->tx_aborted_errors = 0;
2272 stats->tx_carrier_errors = 0;
2273 stats->tx_fifo_errors = 0;
2274 stats->tx_heartbeat_errors = 0;
2275 stats->tx_window_errors = 0;
f613a0d7 2276 } else {
04c881eb
AZ
2277 /* Use kernel netdev's packet and byte counts since vport counters
2278 * do not reflect packet counts on the wire when GSO, TSO or GRO
2279 * are enabled. */
2280 stats->rx_packets = dev_stats.tx_packets;
2281 stats->rx_bytes = dev_stats.tx_bytes;
2282 stats->tx_packets = dev_stats.rx_packets;
2283 stats->tx_bytes = dev_stats.rx_bytes;
2284
f613a0d7
PS
2285 stats->rx_dropped += dev_stats.tx_dropped;
2286 stats->tx_dropped += dev_stats.rx_dropped;
fe6b0e03 2287
f613a0d7
PS
2288 stats->rx_errors += dev_stats.tx_errors;
2289 stats->tx_errors += dev_stats.rx_errors;
2290
2291 stats->multicast += dev_stats.multicast;
2292 stats->collisions += dev_stats.collisions;
2293 }
22dcb534 2294 stats->tx_dropped += netdev->tx_dropped;
29cf9c1b 2295 stats->rx_dropped += netdev->rx_dropped;
86383816
BP
2296 ovs_mutex_unlock(&netdev->mutex);
2297
2298 return error;
8b61709d
BP
2299}
2300
bba1e6f3
PS
2301static int
2302netdev_internal_get_stats(const struct netdev *netdev_,
2303 struct netdev_stats *stats)
2304{
b5d57fc8 2305 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2306 int error;
bba1e6f3 2307
86383816 2308 ovs_mutex_lock(&netdev->mutex);
bba1e6f3 2309 get_stats_via_vport(netdev_, stats);
86383816
BP
2310 error = netdev->vport_stats_error;
2311 ovs_mutex_unlock(&netdev->mutex);
2312
2313 return error;
bba1e6f3
PS
2314}
2315
51f87458 2316static void
b5d57fc8 2317netdev_linux_read_features(struct netdev_linux *netdev)
8b61709d
BP
2318{
2319 struct ethtool_cmd ecmd;
6c038611 2320 uint32_t speed;
8b61709d
BP
2321 int error;
2322
b5d57fc8 2323 if (netdev->cache_valid & VALID_FEATURES) {
51f87458
PS
2324 return;
2325 }
2326
ab985a77 2327 COVERAGE_INC(netdev_get_ethtool);
8b61709d 2328 memset(&ecmd, 0, sizeof ecmd);
b5d57fc8 2329 error = netdev_linux_do_ethtool(netdev->up.name, &ecmd,
8b61709d
BP
2330 ETHTOOL_GSET, "ETHTOOL_GSET");
2331 if (error) {
51f87458 2332 goto out;
8b61709d
BP
2333 }
2334
2335 /* Supported features. */
b5d57fc8 2336 netdev->supported = 0;
8b61709d 2337 if (ecmd.supported & SUPPORTED_10baseT_Half) {
b5d57fc8 2338 netdev->supported |= NETDEV_F_10MB_HD;
8b61709d
BP
2339 }
2340 if (ecmd.supported & SUPPORTED_10baseT_Full) {
b5d57fc8 2341 netdev->supported |= NETDEV_F_10MB_FD;
8b61709d
BP
2342 }
2343 if (ecmd.supported & SUPPORTED_100baseT_Half) {
b5d57fc8 2344 netdev->supported |= NETDEV_F_100MB_HD;
8b61709d
BP
2345 }
2346 if (ecmd.supported & SUPPORTED_100baseT_Full) {
b5d57fc8 2347 netdev->supported |= NETDEV_F_100MB_FD;
8b61709d
BP
2348 }
2349 if (ecmd.supported & SUPPORTED_1000baseT_Half) {
b5d57fc8 2350 netdev->supported |= NETDEV_F_1GB_HD;
8b61709d 2351 }
67bed84c
SH
2352 if ((ecmd.supported & SUPPORTED_1000baseT_Full) ||
2353 (ecmd.supported & SUPPORTED_1000baseKX_Full)) {
b5d57fc8 2354 netdev->supported |= NETDEV_F_1GB_FD;
8b61709d 2355 }
67bed84c
SH
2356 if ((ecmd.supported & SUPPORTED_10000baseT_Full) ||
2357 (ecmd.supported & SUPPORTED_10000baseKX4_Full) ||
2358 (ecmd.supported & SUPPORTED_10000baseKR_Full) ||
2359 (ecmd.supported & SUPPORTED_10000baseR_FEC)) {
b5d57fc8 2360 netdev->supported |= NETDEV_F_10GB_FD;
8b61709d 2361 }
67bed84c
SH
2362 if ((ecmd.supported & SUPPORTED_40000baseKR4_Full) ||
2363 (ecmd.supported & SUPPORTED_40000baseCR4_Full) ||
2364 (ecmd.supported & SUPPORTED_40000baseSR4_Full) ||
2365 (ecmd.supported & SUPPORTED_40000baseLR4_Full)) {
2366 netdev->supported |= NETDEV_F_40GB_FD;
2367 }
8b61709d 2368 if (ecmd.supported & SUPPORTED_TP) {
b5d57fc8 2369 netdev->supported |= NETDEV_F_COPPER;
8b61709d
BP
2370 }
2371 if (ecmd.supported & SUPPORTED_FIBRE) {
b5d57fc8 2372 netdev->supported |= NETDEV_F_FIBER;
8b61709d
BP
2373 }
2374 if (ecmd.supported & SUPPORTED_Autoneg) {
b5d57fc8 2375 netdev->supported |= NETDEV_F_AUTONEG;
8b61709d
BP
2376 }
2377 if (ecmd.supported & SUPPORTED_Pause) {
b5d57fc8 2378 netdev->supported |= NETDEV_F_PAUSE;
8b61709d
BP
2379 }
2380 if (ecmd.supported & SUPPORTED_Asym_Pause) {
b5d57fc8 2381 netdev->supported |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
2382 }
2383
2384 /* Advertised features. */
b5d57fc8 2385 netdev->advertised = 0;
8b61709d 2386 if (ecmd.advertising & ADVERTISED_10baseT_Half) {
b5d57fc8 2387 netdev->advertised |= NETDEV_F_10MB_HD;
8b61709d
BP
2388 }
2389 if (ecmd.advertising & ADVERTISED_10baseT_Full) {
b5d57fc8 2390 netdev->advertised |= NETDEV_F_10MB_FD;
8b61709d
BP
2391 }
2392 if (ecmd.advertising & ADVERTISED_100baseT_Half) {
b5d57fc8 2393 netdev->advertised |= NETDEV_F_100MB_HD;
8b61709d
BP
2394 }
2395 if (ecmd.advertising & ADVERTISED_100baseT_Full) {
b5d57fc8 2396 netdev->advertised |= NETDEV_F_100MB_FD;
8b61709d
BP
2397 }
2398 if (ecmd.advertising & ADVERTISED_1000baseT_Half) {
b5d57fc8 2399 netdev->advertised |= NETDEV_F_1GB_HD;
8b61709d 2400 }
67bed84c
SH
2401 if ((ecmd.advertising & ADVERTISED_1000baseT_Full) ||
2402 (ecmd.advertising & ADVERTISED_1000baseKX_Full)) {
b5d57fc8 2403 netdev->advertised |= NETDEV_F_1GB_FD;
8b61709d 2404 }
67bed84c
SH
2405 if ((ecmd.advertising & ADVERTISED_10000baseT_Full) ||
2406 (ecmd.advertising & ADVERTISED_10000baseKX4_Full) ||
2407 (ecmd.advertising & ADVERTISED_10000baseKR_Full) ||
2408 (ecmd.advertising & ADVERTISED_10000baseR_FEC)) {
b5d57fc8 2409 netdev->advertised |= NETDEV_F_10GB_FD;
8b61709d 2410 }
67bed84c
SH
2411 if ((ecmd.advertising & ADVERTISED_40000baseKR4_Full) ||
2412 (ecmd.advertising & ADVERTISED_40000baseCR4_Full) ||
2413 (ecmd.advertising & ADVERTISED_40000baseSR4_Full) ||
2414 (ecmd.advertising & ADVERTISED_40000baseLR4_Full)) {
2415 netdev->advertised |= NETDEV_F_40GB_FD;
2416 }
8b61709d 2417 if (ecmd.advertising & ADVERTISED_TP) {
b5d57fc8 2418 netdev->advertised |= NETDEV_F_COPPER;
8b61709d
BP
2419 }
2420 if (ecmd.advertising & ADVERTISED_FIBRE) {
b5d57fc8 2421 netdev->advertised |= NETDEV_F_FIBER;
8b61709d
BP
2422 }
2423 if (ecmd.advertising & ADVERTISED_Autoneg) {
b5d57fc8 2424 netdev->advertised |= NETDEV_F_AUTONEG;
8b61709d
BP
2425 }
2426 if (ecmd.advertising & ADVERTISED_Pause) {
b5d57fc8 2427 netdev->advertised |= NETDEV_F_PAUSE;
8b61709d
BP
2428 }
2429 if (ecmd.advertising & ADVERTISED_Asym_Pause) {
b5d57fc8 2430 netdev->advertised |= NETDEV_F_PAUSE_ASYM;
8b61709d
BP
2431 }
2432
2433 /* Current settings. */
0c615356 2434 speed = ethtool_cmd_speed(&ecmd);
6c038611 2435 if (speed == SPEED_10) {
b5d57fc8 2436 netdev->current = ecmd.duplex ? NETDEV_F_10MB_FD : NETDEV_F_10MB_HD;
6c038611 2437 } else if (speed == SPEED_100) {
b5d57fc8 2438 netdev->current = ecmd.duplex ? NETDEV_F_100MB_FD : NETDEV_F_100MB_HD;
6c038611 2439 } else if (speed == SPEED_1000) {
b5d57fc8 2440 netdev->current = ecmd.duplex ? NETDEV_F_1GB_FD : NETDEV_F_1GB_HD;
6c038611 2441 } else if (speed == SPEED_10000) {
b5d57fc8 2442 netdev->current = NETDEV_F_10GB_FD;
6c038611 2443 } else if (speed == 40000) {
b5d57fc8 2444 netdev->current = NETDEV_F_40GB_FD;
6c038611 2445 } else if (speed == 100000) {
b5d57fc8 2446 netdev->current = NETDEV_F_100GB_FD;
6c038611 2447 } else if (speed == 1000000) {
b5d57fc8 2448 netdev->current = NETDEV_F_1TB_FD;
8b61709d 2449 } else {
b5d57fc8 2450 netdev->current = 0;
8b61709d
BP
2451 }
2452
2453 if (ecmd.port == PORT_TP) {
b5d57fc8 2454 netdev->current |= NETDEV_F_COPPER;
8b61709d 2455 } else if (ecmd.port == PORT_FIBRE) {
b5d57fc8 2456 netdev->current |= NETDEV_F_FIBER;
8b61709d
BP
2457 }
2458
2459 if (ecmd.autoneg) {
b5d57fc8 2460 netdev->current |= NETDEV_F_AUTONEG;
8b61709d
BP
2461 }
2462
51f87458 2463out:
b5d57fc8
BP
2464 netdev->cache_valid |= VALID_FEATURES;
2465 netdev->get_features_error = error;
51f87458
PS
2466}
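
/* A minimal sketch of the ETHTOOL_GSET query behind the feature probing
 * above, assuming netdev_linux_do_ethtool() issues the standard SIOCETHTOOL
 * ioctl.  'sock' is assumed to be an AF_INET control socket; newer kernels
 * prefer ETHTOOL_GLINKSETTINGS, which is out of scope here. */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

static int
sketch_get_link_speed(int sock, const char *dev, uint32_t *speed, int *duplex)
{
    struct ethtool_cmd ecmd;
    struct ifreq ifr;

    memset(&ecmd, 0, sizeof ecmd);
    memset(&ifr, 0, sizeof ifr);
    ecmd.cmd = ETHTOOL_GSET;
    snprintf(ifr.ifr_name, sizeof ifr.ifr_name, "%s", dev);
    ifr.ifr_data = (char *) &ecmd;

    if (ioctl(sock, SIOCETHTOOL, &ifr) < 0) {
        return errno;
    }
    *speed = ethtool_cmd_speed(&ecmd);  /* e.g. SPEED_1000 */
    *duplex = ecmd.duplex;              /* DUPLEX_HALF or DUPLEX_FULL */
    return 0;
}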
2467
887ed8b2
BP
2468/* Stores the features supported by 'netdev' into of '*current', '*advertised',
2469 * '*supported', and '*peer'. Each value is a bitmap of NETDEV_* bits.
2470 * Returns 0 if successful, otherwise a positive errno value. */
51f87458
PS
2471static int
2472netdev_linux_get_features(const struct netdev *netdev_,
a00ca915
EJ
2473 enum netdev_features *current,
2474 enum netdev_features *advertised,
2475 enum netdev_features *supported,
2476 enum netdev_features *peer)
51f87458 2477{
b5d57fc8 2478 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
86383816 2479 int error;
51f87458 2480
86383816 2481 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2482 if (netdev_linux_netnsid_is_remote(netdev)) {
2483 error = EOPNOTSUPP;
2484 goto exit;
2485 }
2486
b5d57fc8 2487 netdev_linux_read_features(netdev);
b5d57fc8
BP
2488 if (!netdev->get_features_error) {
2489 *current = netdev->current;
2490 *advertised = netdev->advertised;
2491 *supported = netdev->supported;
887ed8b2 2492 *peer = 0; /* XXX */
51f87458 2493 }
86383816 2494 error = netdev->get_features_error;
86383816 2495
e0e2410d
FL
2496exit:
2497 ovs_mutex_unlock(&netdev->mutex);
86383816 2498 return error;
8b61709d
BP
2499}
2500
2501/* Set the features advertised by 'netdev' to 'advertise'. */
2502static int
86383816 2503netdev_linux_set_advertisements(struct netdev *netdev_,
6c038611 2504 enum netdev_features advertise)
8b61709d 2505{
86383816 2506 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
2507 struct ethtool_cmd ecmd;
2508 int error;
2509
86383816
BP
2510 ovs_mutex_lock(&netdev->mutex);
2511
ab985a77 2512 COVERAGE_INC(netdev_get_ethtool);
e0e2410d
FL
2513
2514 if (netdev_linux_netnsid_is_remote(netdev)) {
2515 error = EOPNOTSUPP;
2516 goto exit;
2517 }
2518
8b61709d 2519 memset(&ecmd, 0, sizeof ecmd);
86383816 2520 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
8b61709d
BP
2521 ETHTOOL_GSET, "ETHTOOL_GSET");
2522 if (error) {
86383816 2523 goto exit;
8b61709d
BP
2524 }
2525
2526 ecmd.advertising = 0;
6c038611 2527 if (advertise & NETDEV_F_10MB_HD) {
8b61709d
BP
2528 ecmd.advertising |= ADVERTISED_10baseT_Half;
2529 }
6c038611 2530 if (advertise & NETDEV_F_10MB_FD) {
8b61709d
BP
2531 ecmd.advertising |= ADVERTISED_10baseT_Full;
2532 }
6c038611 2533 if (advertise & NETDEV_F_100MB_HD) {
8b61709d
BP
2534 ecmd.advertising |= ADVERTISED_100baseT_Half;
2535 }
6c038611 2536 if (advertise & NETDEV_F_100MB_FD) {
8b61709d
BP
2537 ecmd.advertising |= ADVERTISED_100baseT_Full;
2538 }
6c038611 2539 if (advertise & NETDEV_F_1GB_HD) {
8b61709d
BP
2540 ecmd.advertising |= ADVERTISED_1000baseT_Half;
2541 }
6c038611 2542 if (advertise & NETDEV_F_1GB_FD) {
8b61709d
BP
2543 ecmd.advertising |= ADVERTISED_1000baseT_Full;
2544 }
6c038611 2545 if (advertise & NETDEV_F_10GB_FD) {
8b61709d
BP
2546 ecmd.advertising |= ADVERTISED_10000baseT_Full;
2547 }
6c038611 2548 if (advertise & NETDEV_F_COPPER) {
8b61709d
BP
2549 ecmd.advertising |= ADVERTISED_TP;
2550 }
6c038611 2551 if (advertise & NETDEV_F_FIBER) {
8b61709d
BP
2552 ecmd.advertising |= ADVERTISED_FIBRE;
2553 }
6c038611 2554 if (advertise & NETDEV_F_AUTONEG) {
8b61709d
BP
2555 ecmd.advertising |= ADVERTISED_Autoneg;
2556 }
6c038611 2557 if (advertise & NETDEV_F_PAUSE) {
8b61709d
BP
2558 ecmd.advertising |= ADVERTISED_Pause;
2559 }
6c038611 2560 if (advertise & NETDEV_F_PAUSE_ASYM) {
8b61709d
BP
2561 ecmd.advertising |= ADVERTISED_Asym_Pause;
2562 }
ab985a77 2563 COVERAGE_INC(netdev_set_ethtool);
86383816
BP
2564 error = netdev_linux_do_ethtool(netdev_get_name(netdev_), &ecmd,
2565 ETHTOOL_SSET, "ETHTOOL_SSET");
2566
2567exit:
2568 ovs_mutex_unlock(&netdev->mutex);
2569 return error;
8b61709d
BP
2570}
2571
e7f6ba22
PJV
2572static struct tc_police
2573tc_matchall_fill_police(uint32_t kbits_rate, uint32_t kbits_burst)
2574{
2575 unsigned int bsize = MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 64;
2576 unsigned int bps = ((uint64_t) kbits_rate * 1000) / 8;
2577 struct tc_police police;
2578 struct tc_ratespec rate;
2579 int mtu = 65535;
2580
2581 memset(&rate, 0, sizeof rate);
2582 rate.rate = bps;
2583 rate.cell_log = tc_calc_cell_log(mtu);
2584 rate.mpu = ETH_TOTAL_MIN;
2585
2586 memset(&police, 0, sizeof police);
2587 police.burst = tc_bytes_to_ticks(bps, bsize);
2588 police.action = TC_POLICE_SHOT;
2589 police.rate = rate;
2590 police.mtu = mtu;
2591
2592 return police;
2593}
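
/* A small worked example of the rate conversion above, in isolation.
 * kbits_rate is a rate in kilobits per second, while tc_ratespec.rate wants
 * bytes per second, so 1000 kbit/s becomes 125000 B/s.  The MIN() on the
 * burst term merely keeps the intermediate product from overflowing 32 bits;
 * the tick/burst conversion itself is left to tc_bytes_to_ticks(). */
#include <stdint.h>

static uint32_t
sketch_kbits_to_bytes_per_sec(uint32_t kbits_rate)
{
    return (uint64_t) kbits_rate * 1000 / 8;    /* 1000 kbit/s -> 125000 B/s */
}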
2594
2595static void
2596nl_msg_put_act_police(struct ofpbuf *request, struct tc_police police)
2597{
2598 size_t offset;
2599
2600 nl_msg_put_string(request, TCA_ACT_KIND, "police");
2601 offset = nl_msg_start_nested(request, TCA_ACT_OPTIONS);
2602 nl_msg_put_unspec(request, TCA_POLICE_TBF, &police, sizeof police);
2603 tc_put_rtab(request, TCA_POLICE_RATE, &police.rate);
2604 nl_msg_put_u32(request, TCA_POLICE_RESULT, TC_ACT_UNSPEC);
2605 nl_msg_end_nested(request, offset);
2606}
2607
2608static int
2609tc_add_matchall_policer(struct netdev *netdev, uint32_t kbits_rate,
2610 uint32_t kbits_burst)
2611{
2612 uint16_t eth_type = (OVS_FORCE uint16_t) htons(ETH_P_ALL);
2613 size_t basic_offset, action_offset, inner_offset;
2614 uint16_t prio = TC_RESERVED_PRIORITY_POLICE;
fed4282c 2615 int ifindex, err = 0;
e7f6ba22 2616 struct tc_police pol_act;
e7f6ba22
PJV
2617 struct ofpbuf request;
2618 struct ofpbuf *reply;
2619 struct tcmsg *tcmsg;
2620 uint32_t handle = 1;
2621
2622 err = get_ifindex(netdev, &ifindex);
2623 if (err) {
2624 return err;
2625 }
2626
fed4282c 2627 tcmsg = tc_make_request(ifindex, RTM_NEWTFILTER, NLM_F_CREATE | NLM_F_ECHO,
e7f6ba22 2628 &request);
fed4282c 2629 tcmsg->tcm_parent = TC_INGRESS_PARENT;
e7f6ba22
PJV
2630 tcmsg->tcm_info = tc_make_handle(prio, eth_type);
2631 tcmsg->tcm_handle = handle;
2632
2633 pol_act = tc_matchall_fill_police(kbits_rate, kbits_burst);
2634 nl_msg_put_string(&request, TCA_KIND, "matchall");
2635 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
2636 action_offset = nl_msg_start_nested(&request, TCA_MATCHALL_ACT);
2637 inner_offset = nl_msg_start_nested(&request, 1);
2638 nl_msg_put_act_police(&request, pol_act);
2639 nl_msg_end_nested(&request, inner_offset);
2640 nl_msg_end_nested(&request, action_offset);
2641 nl_msg_end_nested(&request, basic_offset);
2642
2643 err = tc_transact(&request, &reply);
2644 if (!err) {
2645 struct tcmsg *tc =
2646 ofpbuf_at_assert(reply, NLMSG_HDRLEN, sizeof *tc);
2647 ofpbuf_delete(reply);
2648 }
2649
2650 return err;
2651}
2652
2653static int
2654tc_del_matchall_policer(struct netdev *netdev)
2655{
acdd544c 2656 int prio = TC_RESERVED_PRIORITY_POLICE;
e7f6ba22 2657 uint32_t block_id = 0;
acdd544c 2658 struct tcf_id id;
e7f6ba22
PJV
2659 int ifindex;
2660 int err;
2661
2662 err = get_ifindex(netdev, &ifindex);
2663 if (err) {
2664 return err;
2665 }
2666
acdd544c
PB
2667 id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
2668 err = tc_del_filter(&id);
e7f6ba22
PJV
2669 if (err) {
2670 return err;
2671 }
2672
2673 return 0;
2674}
2675
f8500004
JP
2676/* Attempts to set input rate limiting (policing) policy. Returns 0 if
2677 * successful, otherwise a positive errno value. */
8b61709d 2678static int
b5d57fc8 2679netdev_linux_set_policing(struct netdev *netdev_,
8b61709d
BP
2680 uint32_t kbits_rate, uint32_t kbits_burst)
2681{
b5d57fc8
BP
2682 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
2683 const char *netdev_name = netdev_get_name(netdev_);
7874bdff 2684 int ifindex;
f8500004 2685 int error;
8b61709d 2686
80a86fbe 2687 kbits_burst = (!kbits_rate ? 0 /* Force to 0 if no rate specified. */
79abacc8 2688 : !kbits_burst ? 8000 /* Default to 8000 kbits if 0. */
80a86fbe
BP
2689 : kbits_burst); /* Stick with user-specified value. */
2690
86383816 2691 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2692 if (netdev_linux_netnsid_is_remote(netdev)) {
2693 error = EOPNOTSUPP;
2694 goto out;
2695 }
2696
b5d57fc8 2697 if (netdev->cache_valid & VALID_POLICING) {
86383816
BP
2698 error = netdev->netdev_policing_error;
2699 if (error || (netdev->kbits_rate == kbits_rate &&
2700 netdev->kbits_burst == kbits_burst)) {
c9f71668 2701 /* Assume that settings haven't changed since we last set them. */
86383816 2702 goto out;
c9f71668 2703 }
b5d57fc8 2704 netdev->cache_valid &= ~VALID_POLICING;
80a86fbe
BP
2705 }
2706
718be50d 2707 COVERAGE_INC(netdev_set_policing);
7874bdff 2708
e7f6ba22
PJV
 2709 /* Use matchall for policing when offloading OVS with tc-flower. */
2710 if (netdev_is_flow_api_enabled()) {
2711 error = tc_del_matchall_policer(netdev_);
2712 if (kbits_rate) {
2713 error = tc_add_matchall_policer(netdev_, kbits_rate, kbits_burst);
2714 }
2715 ovs_mutex_unlock(&netdev->mutex);
2716 return error;
2717 }
2718
718be50d
TZ
2719 error = get_ifindex(netdev_, &ifindex);
2720 if (error) {
2721 goto out;
2722 }
2723
f8500004 2724 /* Remove any existing ingress qdisc. */
95255018 2725 error = tc_add_del_qdisc(ifindex, false, 0, TC_INGRESS);
f8500004
JP
2726 if (error) {
2727 VLOG_WARN_RL(&rl, "%s: removing policing failed: %s",
10a89ef0 2728 netdev_name, ovs_strerror(error));
c9f71668 2729 goto out;
f8500004
JP
2730 }
2731
8b61709d 2732 if (kbits_rate) {
95255018 2733 error = tc_add_del_qdisc(ifindex, true, 0, TC_INGRESS);
f8500004
JP
2734 if (error) {
2735 VLOG_WARN_RL(&rl, "%s: adding policing qdisc failed: %s",
10a89ef0 2736 netdev_name, ovs_strerror(error));
c9f71668 2737 goto out;
8b61709d
BP
2738 }
2739
b5d57fc8 2740 error = tc_add_policer(netdev_, kbits_rate, kbits_burst);
f8500004
JP
 2741 if (error) {
2742 VLOG_WARN_RL(&rl, "%s: adding policing action failed: %s",
10a89ef0 2743 netdev_name, ovs_strerror(error));
c9f71668 2744 goto out;
8b61709d 2745 }
8b61709d
BP
2746 }
2747
b5d57fc8
BP
2748 netdev->kbits_rate = kbits_rate;
2749 netdev->kbits_burst = kbits_burst;
f8500004 2750
c9f71668
PS
2751out:
2752 if (!error || error == ENODEV) {
b5d57fc8
BP
2753 netdev->netdev_policing_error = error;
2754 netdev->cache_valid |= VALID_POLICING;
c9f71668 2755 }
86383816 2756 ovs_mutex_unlock(&netdev->mutex);
c9f71668 2757 return error;
8b61709d
BP
2758}
2759
c1c9c9c4
BP
2760static int
2761netdev_linux_get_qos_types(const struct netdev *netdev OVS_UNUSED,
19993ef3 2762 struct sset *types)
c1c9c9c4 2763{
559eb230 2764 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2765 for (opsp = tcs; *opsp != NULL; opsp++) {
2766 const struct tc_ops *ops = *opsp;
2767 if (ops->tc_install && ops->ovs_name[0] != '\0') {
19993ef3 2768 sset_add(types, ops->ovs_name);
c1c9c9c4
BP
2769 }
2770 }
2771 return 0;
2772}
2773
2774static const struct tc_ops *
2775tc_lookup_ovs_name(const char *name)
2776{
559eb230 2777 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2778
2779 for (opsp = tcs; *opsp != NULL; opsp++) {
2780 const struct tc_ops *ops = *opsp;
2781 if (!strcmp(name, ops->ovs_name)) {
2782 return ops;
2783 }
2784 }
2785 return NULL;
2786}
2787
2788static const struct tc_ops *
2789tc_lookup_linux_name(const char *name)
2790{
559eb230 2791 const struct tc_ops *const *opsp;
c1c9c9c4
BP
2792
2793 for (opsp = tcs; *opsp != NULL; opsp++) {
2794 const struct tc_ops *ops = *opsp;
2795 if (ops->linux_name && !strcmp(name, ops->linux_name)) {
2796 return ops;
2797 }
2798 }
2799 return NULL;
2800}
2801
93b13be8 2802static struct tc_queue *
b5d57fc8 2803tc_find_queue__(const struct netdev *netdev_, unsigned int queue_id,
93b13be8
BP
2804 size_t hash)
2805{
b5d57fc8 2806 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
93b13be8
BP
2807 struct tc_queue *queue;
2808
b5d57fc8 2809 HMAP_FOR_EACH_IN_BUCKET (queue, hmap_node, hash, &netdev->tc->queues) {
93b13be8
BP
2810 if (queue->queue_id == queue_id) {
2811 return queue;
2812 }
2813 }
2814 return NULL;
2815}
2816
2817static struct tc_queue *
2818tc_find_queue(const struct netdev *netdev, unsigned int queue_id)
2819{
2820 return tc_find_queue__(netdev, queue_id, hash_int(queue_id, 0));
2821}
2822
c1c9c9c4
BP
2823static int
2824netdev_linux_get_qos_capabilities(const struct netdev *netdev OVS_UNUSED,
2825 const char *type,
2826 struct netdev_qos_capabilities *caps)
2827{
2828 const struct tc_ops *ops = tc_lookup_ovs_name(type);
2829 if (!ops) {
2830 return EOPNOTSUPP;
2831 }
2832 caps->n_queues = ops->n_queues;
2833 return 0;
2834}
2835
2836static int
b5d57fc8 2837netdev_linux_get_qos(const struct netdev *netdev_,
79f1cbe9 2838 const char **typep, struct smap *details)
c1c9c9c4 2839{
b5d57fc8 2840 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2841 int error;
2842
86383816 2843 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2844 if (netdev_linux_netnsid_is_remote(netdev)) {
2845 error = EOPNOTSUPP;
2846 goto exit;
2847 }
2848
b5d57fc8 2849 error = tc_query_qdisc(netdev_);
86383816
BP
2850 if (!error) {
2851 *typep = netdev->tc->ops->ovs_name;
2852 error = (netdev->tc->ops->qdisc_get
2853 ? netdev->tc->ops->qdisc_get(netdev_, details)
2854 : 0);
c1c9c9c4
BP
2855 }
2856
e0e2410d
FL
2857exit:
2858 ovs_mutex_unlock(&netdev->mutex);
86383816 2859 return error;
c1c9c9c4
BP
2860}
2861
2862static int
b5d57fc8 2863netdev_linux_set_qos(struct netdev *netdev_,
79f1cbe9 2864 const char *type, const struct smap *details)
c1c9c9c4 2865{
b5d57fc8 2866 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2867 const struct tc_ops *new_ops;
2868 int error;
2869
2870 new_ops = tc_lookup_ovs_name(type);
2871 if (!new_ops || !new_ops->tc_install) {
2872 return EOPNOTSUPP;
2873 }
2874
6cf888b8
BS
2875 if (new_ops == &tc_ops_noop) {
2876 return new_ops->tc_install(netdev_, details);
2877 }
2878
86383816 2879 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2880 if (netdev_linux_netnsid_is_remote(netdev)) {
2881 error = EOPNOTSUPP;
2882 goto exit;
2883 }
2884
b5d57fc8 2885 error = tc_query_qdisc(netdev_);
c1c9c9c4 2886 if (error) {
86383816 2887 goto exit;
c1c9c9c4
BP
2888 }
2889
b5d57fc8 2890 if (new_ops == netdev->tc->ops) {
86383816 2891 error = new_ops->qdisc_set ? new_ops->qdisc_set(netdev_, details) : 0;
c1c9c9c4
BP
2892 } else {
2893 /* Delete existing qdisc. */
b5d57fc8 2894 error = tc_del_qdisc(netdev_);
c1c9c9c4 2895 if (error) {
86383816 2896 goto exit;
c1c9c9c4 2897 }
b5d57fc8 2898 ovs_assert(netdev->tc == NULL);
c1c9c9c4
BP
2899
2900 /* Install new qdisc. */
b5d57fc8
BP
2901 error = new_ops->tc_install(netdev_, details);
2902 ovs_assert((error == 0) == (netdev->tc != NULL));
c1c9c9c4 2903 }
86383816
BP
2904
2905exit:
2906 ovs_mutex_unlock(&netdev->mutex);
2907 return error;
c1c9c9c4
BP
2908}
2909
2910static int
b5d57fc8 2911netdev_linux_get_queue(const struct netdev *netdev_,
79f1cbe9 2912 unsigned int queue_id, struct smap *details)
c1c9c9c4 2913{
b5d57fc8 2914 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2915 int error;
2916
86383816 2917 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2918 if (netdev_linux_netnsid_is_remote(netdev)) {
2919 error = EOPNOTSUPP;
2920 goto exit;
2921 }
2922
b5d57fc8 2923 error = tc_query_qdisc(netdev_);
86383816 2924 if (!error) {
b5d57fc8 2925 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
86383816 2926 error = (queue
b5d57fc8 2927 ? netdev->tc->ops->class_get(netdev_, queue, details)
93b13be8 2928 : ENOENT);
c1c9c9c4 2929 }
86383816 2930
e0e2410d
FL
2931exit:
2932 ovs_mutex_unlock(&netdev->mutex);
86383816 2933 return error;
c1c9c9c4
BP
2934}
2935
2936static int
b5d57fc8 2937netdev_linux_set_queue(struct netdev *netdev_,
79f1cbe9 2938 unsigned int queue_id, const struct smap *details)
c1c9c9c4 2939{
b5d57fc8 2940 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2941 int error;
2942
86383816 2943 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2944 if (netdev_linux_netnsid_is_remote(netdev)) {
2945 error = EOPNOTSUPP;
2946 goto exit;
2947 }
2948
b5d57fc8 2949 error = tc_query_qdisc(netdev_);
86383816
BP
2950 if (!error) {
2951 error = (queue_id < netdev->tc->ops->n_queues
2952 && netdev->tc->ops->class_set
2953 ? netdev->tc->ops->class_set(netdev_, queue_id, details)
2954 : EINVAL);
c1c9c9c4
BP
2955 }
2956
e0e2410d
FL
2957exit:
2958 ovs_mutex_unlock(&netdev->mutex);
86383816 2959 return error;
c1c9c9c4
BP
2960}
2961
2962static int
b5d57fc8 2963netdev_linux_delete_queue(struct netdev *netdev_, unsigned int queue_id)
c1c9c9c4 2964{
b5d57fc8 2965 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2966 int error;
2967
86383816 2968 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
2969 if (netdev_linux_netnsid_is_remote(netdev)) {
2970 error = EOPNOTSUPP;
2971 goto exit;
2972 }
2973
b5d57fc8 2974 error = tc_query_qdisc(netdev_);
86383816
BP
2975 if (!error) {
2976 if (netdev->tc->ops->class_delete) {
2977 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
2978 error = (queue
2979 ? netdev->tc->ops->class_delete(netdev_, queue)
2980 : ENOENT);
2981 } else {
2982 error = EINVAL;
2983 }
c1c9c9c4 2984 }
86383816 2985
e0e2410d
FL
2986exit:
2987 ovs_mutex_unlock(&netdev->mutex);
86383816 2988 return error;
c1c9c9c4
BP
2989}
2990
2991static int
b5d57fc8 2992netdev_linux_get_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
2993 unsigned int queue_id,
2994 struct netdev_queue_stats *stats)
2995{
b5d57fc8 2996 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
2997 int error;
2998
86383816 2999 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
3000 if (netdev_linux_netnsid_is_remote(netdev)) {
3001 error = EOPNOTSUPP;
3002 goto exit;
3003 }
3004
b5d57fc8 3005 error = tc_query_qdisc(netdev_);
86383816
BP
3006 if (!error) {
3007 if (netdev->tc->ops->class_get_stats) {
3008 const struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3009 if (queue) {
3010 stats->created = queue->created;
3011 error = netdev->tc->ops->class_get_stats(netdev_, queue,
3012 stats);
3013 } else {
3014 error = ENOENT;
3015 }
3016 } else {
3017 error = EOPNOTSUPP;
6dc34a0d 3018 }
c1c9c9c4 3019 }
86383816 3020
e0e2410d
FL
3021exit:
3022 ovs_mutex_unlock(&netdev->mutex);
86383816 3023 return error;
c1c9c9c4
BP
3024}
3025
d57695d7
JS
3026struct queue_dump_state {
3027 struct nl_dump dump;
3028 struct ofpbuf buf;
3029};
3030
23a98ffe 3031static bool
d57695d7 3032start_queue_dump(const struct netdev *netdev, struct queue_dump_state *state)
c1c9c9c4
BP
3033{
3034 struct ofpbuf request;
3035 struct tcmsg *tcmsg;
3036
7874bdff 3037 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, 0, &request);
23a98ffe
BP
3038 if (!tcmsg) {
3039 return false;
3040 }
3c4de644 3041 tcmsg->tcm_parent = 0;
d57695d7 3042 nl_dump_start(&state->dump, NETLINK_ROUTE, &request);
c1c9c9c4 3043 ofpbuf_uninit(&request);
d57695d7
JS
3044
3045 ofpbuf_init(&state->buf, NL_DUMP_BUFSIZE);
23a98ffe 3046 return true;
c1c9c9c4
BP
3047}
3048
d57695d7
JS
3049static int
3050finish_queue_dump(struct queue_dump_state *state)
3051{
3052 ofpbuf_uninit(&state->buf);
3053 return nl_dump_done(&state->dump);
3054}
3055
89454bf4
BP
3056struct netdev_linux_queue_state {
3057 unsigned int *queues;
3058 size_t cur_queue;
3059 size_t n_queues;
3060};
3061
c1c9c9c4 3062static int
89454bf4 3063netdev_linux_queue_dump_start(const struct netdev *netdev_, void **statep)
c1c9c9c4 3064{
e0e2410d 3065 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3066 int error;
3067
86383816 3068 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
3069 if (netdev_linux_netnsid_is_remote(netdev)) {
3070 error = EOPNOTSUPP;
3071 goto exit;
3072 }
3073
b5d57fc8 3074 error = tc_query_qdisc(netdev_);
86383816
BP
3075 if (!error) {
3076 if (netdev->tc->ops->class_get) {
89454bf4
BP
3077 struct netdev_linux_queue_state *state;
3078 struct tc_queue *queue;
3079 size_t i;
3080
3081 *statep = state = xmalloc(sizeof *state);
3082 state->n_queues = hmap_count(&netdev->tc->queues);
3083 state->cur_queue = 0;
3084 state->queues = xmalloc(state->n_queues * sizeof *state->queues);
3085
3086 i = 0;
3087 HMAP_FOR_EACH (queue, hmap_node, &netdev->tc->queues) {
3088 state->queues[i++] = queue->queue_id;
86383816 3089 }
c1c9c9c4 3090 } else {
86383816 3091 error = EOPNOTSUPP;
c1c9c9c4
BP
3092 }
3093 }
c1c9c9c4 3094
e0e2410d
FL
3095exit:
3096 ovs_mutex_unlock(&netdev->mutex);
86383816 3097 return error;
c1c9c9c4
BP
3098}
3099
89454bf4
BP
3100static int
3101netdev_linux_queue_dump_next(const struct netdev *netdev_, void *state_,
3102 unsigned int *queue_idp, struct smap *details)
3103{
e0e2410d 3104 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
89454bf4
BP
3105 struct netdev_linux_queue_state *state = state_;
3106 int error = EOF;
3107
3108 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
3109 if (netdev_linux_netnsid_is_remote(netdev)) {
3110 error = EOPNOTSUPP;
3111 goto exit;
3112 }
3113
89454bf4
BP
3114 while (state->cur_queue < state->n_queues) {
3115 unsigned int queue_id = state->queues[state->cur_queue++];
3116 struct tc_queue *queue = tc_find_queue(netdev_, queue_id);
3117
3118 if (queue) {
3119 *queue_idp = queue_id;
3120 error = netdev->tc->ops->class_get(netdev_, queue, details);
3121 break;
3122 }
3123 }
89454bf4 3124
e0e2410d
FL
3125exit:
3126 ovs_mutex_unlock(&netdev->mutex);
89454bf4
BP
3127 return error;
3128}
3129
3130static int
3131netdev_linux_queue_dump_done(const struct netdev *netdev OVS_UNUSED,
3132 void *state_)
3133{
3134 struct netdev_linux_queue_state *state = state_;
3135
3136 free(state->queues);
3137 free(state);
3138 return 0;
3139}
3140
c1c9c9c4 3141static int
b5d57fc8 3142netdev_linux_dump_queue_stats(const struct netdev *netdev_,
c1c9c9c4
BP
3143 netdev_dump_queue_stats_cb *cb, void *aux)
3144{
b5d57fc8 3145 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
3146 int error;
3147
86383816 3148 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
3149 if (netdev_linux_netnsid_is_remote(netdev)) {
3150 error = EOPNOTSUPP;
3151 goto exit;
3152 }
3153
b5d57fc8 3154 error = tc_query_qdisc(netdev_);
86383816 3155 if (!error) {
d57695d7 3156 struct queue_dump_state state;
c1c9c9c4 3157
86383816
BP
3158 if (!netdev->tc->ops->class_dump_stats) {
3159 error = EOPNOTSUPP;
d57695d7 3160 } else if (!start_queue_dump(netdev_, &state)) {
86383816
BP
3161 error = ENODEV;
3162 } else {
3163 struct ofpbuf msg;
3164 int retval;
3165
d57695d7 3166 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
86383816
BP
3167 retval = netdev->tc->ops->class_dump_stats(netdev_, &msg,
3168 cb, aux);
3169 if (retval) {
3170 error = retval;
3171 }
3172 }
3173
d57695d7 3174 retval = finish_queue_dump(&state);
86383816
BP
3175 if (retval) {
3176 error = retval;
3177 }
c1c9c9c4
BP
3178 }
3179 }
3180
e0e2410d
FL
3181exit:
3182 ovs_mutex_unlock(&netdev->mutex);
86383816 3183 return error;
c1c9c9c4
BP
3184}
3185
8b61709d 3186static int
f1acd62b
BP
3187netdev_linux_set_in4(struct netdev *netdev_, struct in_addr address,
3188 struct in_addr netmask)
8b61709d 3189{
b5d57fc8 3190 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
8b61709d
BP
3191 int error;
3192
86383816 3193 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
3194 if (netdev_linux_netnsid_is_remote(netdev)) {
3195 error = EOPNOTSUPP;
3196 goto exit;
3197 }
3198
f1acd62b 3199 error = do_set_addr(netdev_, SIOCSIFADDR, "SIOCSIFADDR", address);
8b61709d 3200 if (!error) {
f1acd62b 3201 if (address.s_addr != INADDR_ANY) {
8b61709d 3202 error = do_set_addr(netdev_, SIOCSIFNETMASK,
f1acd62b 3203 "SIOCSIFNETMASK", netmask);
8b61709d
BP
3204 }
3205 }
49af9a3d 3206
e0e2410d 3207exit:
86383816 3208 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
3209 return error;
3210}
3211
7df6932e
AW
 3212/* Retrieves the IP addresses and netmasks assigned to 'netdev' into '*addr'
 3213 * and '*mask', storing the number of entries in '*n_cnt'. Returns 0 if
 3214 * successful, otherwise a positive errno value. */
8b61709d 3215static int
a8704b50
PS
3216netdev_linux_get_addr_list(const struct netdev *netdev_,
3217 struct in6_addr **addr, struct in6_addr **mask, int *n_cnt)
8b61709d 3218{
b5d57fc8 3219 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
7df6932e 3220 int error;
86383816
BP
3221
3222 ovs_mutex_lock(&netdev->mutex);
e0e2410d
FL
3223 if (netdev_linux_netnsid_is_remote(netdev)) {
3224 error = EOPNOTSUPP;
3225 goto exit;
3226 }
3227
a8704b50 3228 error = netdev_get_addrs(netdev_get_name(netdev_), addr, mask, n_cnt);
86383816 3229
e0e2410d
FL
3230exit:
3231 ovs_mutex_unlock(&netdev->mutex);
7df6932e 3232 return error;
8b61709d
BP
3233}
3234
3235static void
3236make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
3237{
3238 struct sockaddr_in sin;
3239 memset(&sin, 0, sizeof sin);
3240 sin.sin_family = AF_INET;
3241 sin.sin_addr = addr;
3242 sin.sin_port = 0;
3243
3244 memset(sa, 0, sizeof *sa);
3245 memcpy(sa, &sin, sizeof sin);
3246}
3247
3248static int
3249do_set_addr(struct netdev *netdev,
3250 int ioctl_nr, const char *ioctl_name, struct in_addr addr)
3251{
3252 struct ifreq ifr;
149f577a 3253
259e0b1a
BP
3254 make_in4_sockaddr(&ifr.ifr_addr, addr);
3255 return af_inet_ifreq_ioctl(netdev_get_name(netdev), &ifr, ioctl_nr,
3256 ioctl_name);
8b61709d
BP
3257}
3258
3259/* Adds 'router' as a default IP gateway. */
3260static int
67a4917b 3261netdev_linux_add_router(struct netdev *netdev OVS_UNUSED, struct in_addr router)
8b61709d
BP
3262{
3263 struct in_addr any = { INADDR_ANY };
3264 struct rtentry rt;
3265 int error;
3266
3267 memset(&rt, 0, sizeof rt);
3268 make_in4_sockaddr(&rt.rt_dst, any);
3269 make_in4_sockaddr(&rt.rt_gateway, router);
3270 make_in4_sockaddr(&rt.rt_genmask, any);
3271 rt.rt_flags = RTF_UP | RTF_GATEWAY;
259e0b1a 3272 error = af_inet_ioctl(SIOCADDRT, &rt);
8b61709d 3273 if (error) {
10a89ef0 3274 VLOG_WARN("ioctl(SIOCADDRT): %s", ovs_strerror(error));
8b61709d
BP
3275 }
3276 return error;
3277}
3278
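/* Looks up the IPv4 route to 'host' by scanning /proc/net/route. On success,
 * returns 0, stores the next hop in '*next_hop' (0 if 'host' is directly
 * reachable), and sets '*netdev_name' to a malloc'd copy of the output device
 * name that the caller must free. Returns a positive errno value on failure;
 * ENXIO means that no matching route was found. */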
f1acd62b
BP
3279static int
3280netdev_linux_get_next_hop(const struct in_addr *host, struct in_addr *next_hop,
3281 char **netdev_name)
3282{
3283 static const char fn[] = "/proc/net/route";
3284 FILE *stream;
3285 char line[256];
3286 int ln;
3287
3288 *netdev_name = NULL;
3289 stream = fopen(fn, "r");
3290 if (stream == NULL) {
10a89ef0 3291 VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, ovs_strerror(errno));
f1acd62b
BP
3292 return errno;
3293 }
3294
3295 ln = 0;
3296 while (fgets(line, sizeof line, stream)) {
3297 if (++ln >= 2) {
3298 char iface[17];
dbba996b 3299 ovs_be32 dest, gateway, mask;
f1acd62b
BP
3300 int refcnt, metric, mtu;
3301 unsigned int flags, use, window, irtt;
3302
c2c28dfd
BP
3303 if (!ovs_scan(line,
3304 "%16s %"SCNx32" %"SCNx32" %04X %d %u %d %"SCNx32
3305 " %d %u %u\n",
3306 iface, &dest, &gateway, &flags, &refcnt,
3307 &use, &metric, &mask, &mtu, &window, &irtt)) {
d295e8e9 3308 VLOG_WARN_RL(&rl, "%s: could not parse line %d: %s",
f1acd62b
BP
3309 fn, ln, line);
3310 continue;
3311 }
3312 if (!(flags & RTF_UP)) {
3313 /* Skip routes that aren't up. */
3314 continue;
3315 }
3316
 3317 /* The values of 'dest', 'mask', and 'gateway' are given in
d295e8e9 3318 * network byte order, so we don't need any endianness
f1acd62b
BP
 3319 * conversions here. */
3320 if ((dest & mask) == (host->s_addr & mask)) {
3321 if (!gateway) {
3322 /* The host is directly reachable. */
3323 next_hop->s_addr = 0;
3324 } else {
3325 /* To reach the host, we must go through a gateway. */
3326 next_hop->s_addr = gateway;
3327 }
3328 *netdev_name = xstrdup(iface);
3329 fclose(stream);
3330 return 0;
3331 }
3332 }
3333 }
3334
3335 fclose(stream);
3336 return ENXIO;
3337}
3338
e210037e 3339static int
b5d57fc8 3340netdev_linux_get_status(const struct netdev *netdev_, struct smap *smap)
e210037e 3341{
b5d57fc8 3342 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
275707c3
EJ
3343 int error = 0;
3344
86383816 3345 ovs_mutex_lock(&netdev->mutex);
b5d57fc8
BP
3346 if (!(netdev->cache_valid & VALID_DRVINFO)) {
3347 struct ethtool_cmd *cmd = (struct ethtool_cmd *) &netdev->drvinfo;
275707c3
EJ
3348
3349 COVERAGE_INC(netdev_get_ethtool);
b5d57fc8
BP
3350 memset(&netdev->drvinfo, 0, sizeof netdev->drvinfo);
3351 error = netdev_linux_do_ethtool(netdev->up.name,
275707c3
EJ
3352 cmd,
3353 ETHTOOL_GDRVINFO,
3354 "ETHTOOL_GDRVINFO");
3355 if (!error) {
b5d57fc8 3356 netdev->cache_valid |= VALID_DRVINFO;
275707c3
EJ
3357 }
3358 }
e210037e 3359
e210037e 3360 if (!error) {
b5d57fc8
BP
3361 smap_add(smap, "driver_name", netdev->drvinfo.driver);
3362 smap_add(smap, "driver_version", netdev->drvinfo.version);
3363 smap_add(smap, "firmware_version", netdev->drvinfo.fw_version);
e210037e 3364 }
86383816
BP
3365 ovs_mutex_unlock(&netdev->mutex);
3366
e210037e
AE
3367 return error;
3368}
3369
4f925bd3 3370static int
275707c3
EJ
3371netdev_internal_get_status(const struct netdev *netdev OVS_UNUSED,
3372 struct smap *smap)
4f925bd3 3373{
79f1cbe9 3374 smap_add(smap, "driver_name", "openvswitch");
4f925bd3
PS
3375 return 0;
3376}
3377
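/* Returns the tc block id for 'netdev_': the device's ifindex if it is a LAG
 * master, otherwise 0, meaning that no shared block is assigned. */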
25db83be
JH
3378static uint32_t
3379netdev_linux_get_block_id(struct netdev *netdev_)
3380{
3381 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3382 uint32_t block_id = 0;
3383
3384 ovs_mutex_lock(&netdev->mutex);
3385 /* Ensure the linux netdev has had its fields populated. */
3386 if (!(netdev->cache_valid & VALID_IFINDEX)) {
3387 netdev_linux_update_via_netlink(netdev);
3388 }
3389
3390 /* Only assigning block ids to linux netdevs that are LAG masters. */
3391 if (netdev->is_lag_master) {
3392 block_id = netdev->ifindex;
3393 }
3394 ovs_mutex_unlock(&netdev->mutex);
3395
3396 return block_id;
3397}
3398
8b61709d
BP
3399/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be
3400 * successfully retrieved, it stores the corresponding MAC address in 'mac' and
3401 * returns 0. Otherwise, it returns a positive errno value; in particular,
 3402 * ENXIO indicates that there is no ARP table entry for 'ip' on 'netdev'. */
3403static int
3404netdev_linux_arp_lookup(const struct netdev *netdev,
74ff3298 3405 ovs_be32 ip, struct eth_addr *mac)
8b61709d
BP
3406{
3407 struct arpreq r;
c100e025 3408 struct sockaddr_in sin;
8b61709d
BP
3409 int retval;
3410
3411 memset(&r, 0, sizeof r);
f2cc621b 3412 memset(&sin, 0, sizeof sin);
c100e025
BP
3413 sin.sin_family = AF_INET;
3414 sin.sin_addr.s_addr = ip;
3415 sin.sin_port = 0;
3416 memcpy(&r.arp_pa, &sin, sizeof sin);
8b61709d
BP
3417 r.arp_ha.sa_family = ARPHRD_ETHER;
3418 r.arp_flags = 0;
71d7c22f 3419 ovs_strzcpy(r.arp_dev, netdev_get_name(netdev), sizeof r.arp_dev);
8b61709d 3420 COVERAGE_INC(netdev_arp_lookup);
259e0b1a 3421 retval = af_inet_ioctl(SIOCGARP, &r);
8b61709d
BP
3422 if (!retval) {
3423 memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN);
3424 } else if (retval != ENXIO) {
3425 VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s",
10a89ef0
BP
3426 netdev_get_name(netdev), IP_ARGS(ip),
3427 ovs_strerror(retval));
8b61709d
BP
3428 }
3429 return retval;
3430}
3431
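/* Helpers that translate between NETDEV_* flags and the kernel's IFF_*
 * interface flags; only the UP, PROMISC, and LOOPBACK bits are mapped. */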
b24751ff 3432static unsigned int
8b61709d
BP
3433nd_to_iff_flags(enum netdev_flags nd)
3434{
b24751ff 3435 unsigned int iff = 0;
8b61709d
BP
3436 if (nd & NETDEV_UP) {
3437 iff |= IFF_UP;
3438 }
3439 if (nd & NETDEV_PROMISC) {
3440 iff |= IFF_PROMISC;
3441 }
7ba19d41
AC
3442 if (nd & NETDEV_LOOPBACK) {
3443 iff |= IFF_LOOPBACK;
3444 }
8b61709d
BP
3445 return iff;
3446}
3447
3448static int
b24751ff 3449iff_to_nd_flags(unsigned int iff)
8b61709d
BP
3450{
3451 enum netdev_flags nd = 0;
3452 if (iff & IFF_UP) {
3453 nd |= NETDEV_UP;
3454 }
3455 if (iff & IFF_PROMISC) {
3456 nd |= NETDEV_PROMISC;
3457 }
7ba19d41
AC
3458 if (iff & IFF_LOOPBACK) {
3459 nd |= NETDEV_LOOPBACK;
3460 }
8b61709d
BP
3461 return nd;
3462}
3463
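/* Clears the interface flags in 'off' and sets those in 'on' for 'netdev',
 * storing the flags that were previously in effect (in NETDEV_* form) in
 * '*old_flagsp'. The caller must hold 'netdev->mutex'. */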
3464static int
4f9f3f21
BP
3465update_flags(struct netdev_linux *netdev, enum netdev_flags off,
3466 enum netdev_flags on, enum netdev_flags *old_flagsp)
3467 OVS_REQUIRES(netdev->mutex)
8b61709d 3468{
b24751ff 3469 unsigned int old_flags, new_flags;
c37d4da4
EJ
3470 int error = 0;
3471
b5d57fc8 3472 old_flags = netdev->ifi_flags;
c37d4da4
EJ
3473 *old_flagsp = iff_to_nd_flags(old_flags);
3474 new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on);
3475 if (new_flags != old_flags) {
4f9f3f21
BP
3476 error = set_flags(netdev_get_name(&netdev->up), new_flags);
3477 get_flags(&netdev->up, &netdev->ifi_flags);
8b61709d 3478 }
4f9f3f21
BP
3479
3480 return error;
3481}
3482
3483static int
3484netdev_linux_update_flags(struct netdev *netdev_, enum netdev_flags off,
3485 enum netdev_flags on, enum netdev_flags *old_flagsp)
3486{
3487 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
756819dd 3488 int error = 0;
4f9f3f21
BP
3489
3490 ovs_mutex_lock(&netdev->mutex);
756819dd
FL
3491 if (on || off) {
 3492 /* Changing flags over netlink isn't supported yet. */
e0e2410d
FL
3493 if (netdev_linux_netnsid_is_remote(netdev)) {
3494 error = EOPNOTSUPP;
3495 goto exit;
3496 }
756819dd
FL
3497 error = update_flags(netdev, off, on, old_flagsp);
3498 } else {
3499 /* Try reading flags over netlink, or fall back to ioctl. */
3500 if (!netdev_linux_update_via_netlink(netdev)) {
3501 *old_flagsp = iff_to_nd_flags(netdev->ifi_flags);
3502 } else {
3503 error = update_flags(netdev, off, on, old_flagsp);
3504 }
3505 }
e0e2410d
FL
3506
3507exit:
86383816 3508 ovs_mutex_unlock(&netdev->mutex);
8b61709d
BP
3509 return error;
3510}
3511
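/* netdev_class members shared by the "system", "tap", and "internal" classes
 * below (and, when AF_XDP support is built in, the "afxdp" classes). */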
89c09c1c
BP
3512#define NETDEV_LINUX_CLASS_COMMON \
3513 .run = netdev_linux_run, \
3514 .wait = netdev_linux_wait, \
3515 .alloc = netdev_linux_alloc, \
89c09c1c 3516 .dealloc = netdev_linux_dealloc, \
89c09c1c
BP
3517 .send_wait = netdev_linux_send_wait, \
3518 .set_etheraddr = netdev_linux_set_etheraddr, \
3519 .get_etheraddr = netdev_linux_get_etheraddr, \
3520 .get_mtu = netdev_linux_get_mtu, \
3521 .set_mtu = netdev_linux_set_mtu, \
3522 .get_ifindex = netdev_linux_get_ifindex, \
3523 .get_carrier = netdev_linux_get_carrier, \
3524 .get_carrier_resets = netdev_linux_get_carrier_resets, \
3525 .set_miimon_interval = netdev_linux_set_miimon_interval, \
3526 .set_advertisements = netdev_linux_set_advertisements, \
3527 .set_policing = netdev_linux_set_policing, \
3528 .get_qos_types = netdev_linux_get_qos_types, \
3529 .get_qos_capabilities = netdev_linux_get_qos_capabilities, \
3530 .get_qos = netdev_linux_get_qos, \
3531 .set_qos = netdev_linux_set_qos, \
3532 .get_queue = netdev_linux_get_queue, \
3533 .set_queue = netdev_linux_set_queue, \
3534 .delete_queue = netdev_linux_delete_queue, \
3535 .get_queue_stats = netdev_linux_get_queue_stats, \
3536 .queue_dump_start = netdev_linux_queue_dump_start, \
3537 .queue_dump_next = netdev_linux_queue_dump_next, \
3538 .queue_dump_done = netdev_linux_queue_dump_done, \
3539 .dump_queue_stats = netdev_linux_dump_queue_stats, \
3540 .set_in4 = netdev_linux_set_in4, \
3541 .get_addr_list = netdev_linux_get_addr_list, \
3542 .add_router = netdev_linux_add_router, \
3543 .get_next_hop = netdev_linux_get_next_hop, \
3544 .arp_lookup = netdev_linux_arp_lookup, \
3545 .update_flags = netdev_linux_update_flags, \
3546 .rxq_alloc = netdev_linux_rxq_alloc, \
89c09c1c 3547 .rxq_dealloc = netdev_linux_rxq_dealloc, \
89c09c1c
BP
3548 .rxq_wait = netdev_linux_rxq_wait, \
3549 .rxq_drain = netdev_linux_rxq_drain
3550
3551const struct netdev_class netdev_linux_class = {
3552 NETDEV_LINUX_CLASS_COMMON,
89c09c1c 3553 .type = "system",
0de1b425 3554 .is_pmd = false,
89c09c1c 3555 .construct = netdev_linux_construct,
0de1b425 3556 .destruct = netdev_linux_destruct,
89c09c1c
BP
3557 .get_stats = netdev_linux_get_stats,
3558 .get_features = netdev_linux_get_features,
3559 .get_status = netdev_linux_get_status,
0de1b425
WT
3560 .get_block_id = netdev_linux_get_block_id,
3561 .send = netdev_linux_send,
3562 .rxq_construct = netdev_linux_rxq_construct,
3563 .rxq_destruct = netdev_linux_rxq_destruct,
3564 .rxq_recv = netdev_linux_rxq_recv,
89c09c1c
BP
3565};
3566
3567const struct netdev_class netdev_tap_class = {
3568 NETDEV_LINUX_CLASS_COMMON,
3569 .type = "tap",
0de1b425 3570 .is_pmd = false,
89c09c1c 3571 .construct = netdev_linux_construct_tap,
0de1b425 3572 .destruct = netdev_linux_destruct,
89c09c1c
BP
3573 .get_stats = netdev_tap_get_stats,
3574 .get_features = netdev_linux_get_features,
3575 .get_status = netdev_linux_get_status,
0de1b425
WT
3576 .send = netdev_linux_send,
3577 .rxq_construct = netdev_linux_rxq_construct,
3578 .rxq_destruct = netdev_linux_rxq_destruct,
3579 .rxq_recv = netdev_linux_rxq_recv,
89c09c1c
BP
3580};
3581
3582const struct netdev_class netdev_internal_class = {
3583 NETDEV_LINUX_CLASS_COMMON,
3584 .type = "internal",
0de1b425 3585 .is_pmd = false,
89c09c1c 3586 .construct = netdev_linux_construct,
0de1b425 3587 .destruct = netdev_linux_destruct,
89c09c1c
BP
3588 .get_stats = netdev_internal_get_stats,
3589 .get_status = netdev_internal_get_status,
0de1b425
WT
3590 .send = netdev_linux_send,
3591 .rxq_construct = netdev_linux_rxq_construct,
3592 .rxq_destruct = netdev_linux_rxq_destruct,
3593 .rxq_recv = netdev_linux_rxq_recv,
89c09c1c 3594};
0de1b425
WT
3595
3596#ifdef HAVE_AF_XDP
5bfc519f 3597#define NETDEV_AFXDP_CLASS_COMMON \
5119cfe3 3598 .init = netdev_afxdp_init, \
5bfc519f
WT
3599 .construct = netdev_afxdp_construct, \
3600 .destruct = netdev_afxdp_destruct, \
3601 .get_stats = netdev_afxdp_get_stats, \
3602 .get_custom_stats = netdev_afxdp_get_custom_stats, \
3603 .get_status = netdev_linux_get_status, \
3604 .set_config = netdev_afxdp_set_config, \
3605 .get_config = netdev_afxdp_get_config, \
3606 .reconfigure = netdev_afxdp_reconfigure, \
3607 .get_numa_id = netdev_linux_get_numa_id, \
3608 .send = netdev_afxdp_batch_send, \
3609 .rxq_construct = netdev_afxdp_rxq_construct, \
3610 .rxq_destruct = netdev_afxdp_rxq_destruct, \
3611 .rxq_recv = netdev_afxdp_rxq_recv
3612
0de1b425
WT
3613const struct netdev_class netdev_afxdp_class = {
3614 NETDEV_LINUX_CLASS_COMMON,
5bfc519f 3615 NETDEV_AFXDP_CLASS_COMMON,
0de1b425
WT
3616 .type = "afxdp",
3617 .is_pmd = true,
5bfc519f
WT
3618};
3619
3620const struct netdev_class netdev_afxdp_nonpmd_class = {
3621 NETDEV_LINUX_CLASS_COMMON,
3622 NETDEV_AFXDP_CLASS_COMMON,
3623 .type = "afxdp-nonpmd",
3624 .is_pmd = false,
0de1b425
WT
3625};
3626#endif
8b61709d 3627\f
677d9158
JV
3628
3629#define CODEL_N_QUEUES 0x0000
3630
2f4298ce
BP
3631/* In sufficiently new kernel headers these are defined as enums in
3632 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3633 * kernels. (This overrides any enum definition in the header file but that's
3634 * harmless.) */
3635#define TCA_CODEL_TARGET 1
3636#define TCA_CODEL_LIMIT 2
3637#define TCA_CODEL_INTERVAL 3
3638
677d9158
JV
3639struct codel {
3640 struct tc tc;
3641 uint32_t target;
3642 uint32_t limit;
3643 uint32_t interval;
3644};
3645
3646static struct codel *
3647codel_get__(const struct netdev *netdev_)
3648{
3649 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3650 return CONTAINER_OF(netdev->tc, struct codel, tc);
3651}
3652
3653static void
3654codel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3655 uint32_t interval)
3656{
3657 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3658 struct codel *codel;
3659
3660 codel = xmalloc(sizeof *codel);
3661 tc_init(&codel->tc, &tc_ops_codel);
3662 codel->target = target;
3663 codel->limit = limit;
3664 codel->interval = interval;
3665
3666 netdev->tc = &codel->tc;
3667}
3668
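/* Creates a CoDel root qdisc on 'netdev', replacing any existing root qdisc.
 * Zero-valued parameters fall back to the defaults chosen below (target 5000,
 * limit 10240, interval 100000). Roughly equivalent to
 * "tc qdisc add dev <dev> root handle 1: codel target <target> limit <limit>
 * interval <interval>" (a sketch for orientation, not a command taken from
 * this file). */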
3669static int
3670codel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3671 uint32_t interval)
3672{
3673 size_t opt_offset;
3674 struct ofpbuf request;
3675 struct tcmsg *tcmsg;
3676 uint32_t otarget, olimit, ointerval;
3677 int error;
3678
3679 tc_del_qdisc(netdev);
3680
7874bdff
RD
3681 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3682 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3683 if (!tcmsg) {
3684 return ENODEV;
3685 }
3686 tcmsg->tcm_handle = tc_make_handle(1, 0);
3687 tcmsg->tcm_parent = TC_H_ROOT;
3688
3689 otarget = target ? target : 5000;
3690 olimit = limit ? limit : 10240;
3691 ointerval = interval ? interval : 100000;
3692
3693 nl_msg_put_string(&request, TCA_KIND, "codel");
3694 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3695 nl_msg_put_u32(&request, TCA_CODEL_TARGET, otarget);
3696 nl_msg_put_u32(&request, TCA_CODEL_LIMIT, olimit);
3697 nl_msg_put_u32(&request, TCA_CODEL_INTERVAL, ointerval);
3698 nl_msg_end_nested(&request, opt_offset);
3699
3700 error = tc_transact(&request, NULL);
3701 if (error) {
3702 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3703 "target %u, limit %u, interval %u error %d(%s)",
3704 netdev_get_name(netdev),
3705 otarget, olimit, ointerval,
3706 error, ovs_strerror(error));
3707 }
3708 return error;
3709}
3710
3711static void
3712codel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3713 const struct smap *details, struct codel *codel)
3714{
13c1637f
BP
3715 codel->target = smap_get_ullong(details, "target", 0);
3716 codel->limit = smap_get_ullong(details, "limit", 0);
3717 codel->interval = smap_get_ullong(details, "interval", 0);
677d9158
JV
3718
3719 if (!codel->target) {
3720 codel->target = 5000;
3721 }
3722 if (!codel->limit) {
3723 codel->limit = 10240;
3724 }
3725 if (!codel->interval) {
3726 codel->interval = 100000;
3727 }
3728}
3729
3730static int
3731codel_tc_install(struct netdev *netdev, const struct smap *details)
3732{
3733 int error;
3734 struct codel codel;
3735
3736 codel_parse_qdisc_details__(netdev, details, &codel);
3737 error = codel_setup_qdisc__(netdev, codel.target, codel.limit,
3738 codel.interval);
3739 if (!error) {
3740 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3741 }
3742 return error;
3743}
3744
3745static int
3746codel_parse_tca_options__(struct nlattr *nl_options, struct codel *codel)
3747{
3748 static const struct nl_policy tca_codel_policy[] = {
3749 [TCA_CODEL_TARGET] = { .type = NL_A_U32 },
3750 [TCA_CODEL_LIMIT] = { .type = NL_A_U32 },
3751 [TCA_CODEL_INTERVAL] = { .type = NL_A_U32 }
3752 };
3753
3754 struct nlattr *attrs[ARRAY_SIZE(tca_codel_policy)];
3755
3756 if (!nl_parse_nested(nl_options, tca_codel_policy,
3757 attrs, ARRAY_SIZE(tca_codel_policy))) {
3758 VLOG_WARN_RL(&rl, "failed to parse CoDel class options");
3759 return EPROTO;
3760 }
3761
3762 codel->target = nl_attr_get_u32(attrs[TCA_CODEL_TARGET]);
3763 codel->limit = nl_attr_get_u32(attrs[TCA_CODEL_LIMIT]);
3764 codel->interval = nl_attr_get_u32(attrs[TCA_CODEL_INTERVAL]);
3765 return 0;
3766}
3767
3768static int
3769codel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
3770{
3771 struct nlattr *nlattr;
 3772 const char *kind;
3773 int error;
3774 struct codel codel;
3775
3776 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
3777 if (error != 0) {
3778 return error;
3779 }
3780
3781 error = codel_parse_tca_options__(nlattr, &codel);
3782 if (error != 0) {
3783 return error;
3784 }
3785
3786 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3787 return 0;
3788}
3789
3790
3791static void
3792codel_tc_destroy(struct tc *tc)
3793{
3794 struct codel *codel = CONTAINER_OF(tc, struct codel, tc);
3795 tc_destroy(tc);
3796 free(codel);
3797}
3798
3799static int
3800codel_qdisc_get(const struct netdev *netdev, struct smap *details)
3801{
3802 const struct codel *codel = codel_get__(netdev);
3803 smap_add_format(details, "target", "%u", codel->target);
3804 smap_add_format(details, "limit", "%u", codel->limit);
3805 smap_add_format(details, "interval", "%u", codel->interval);
3806 return 0;
3807}
3808
3809static int
3810codel_qdisc_set(struct netdev *netdev, const struct smap *details)
3811{
3812 struct codel codel;
3813
3814 codel_parse_qdisc_details__(netdev, details, &codel);
3815 codel_install__(netdev, codel.target, codel.limit, codel.interval);
3816 codel_get__(netdev)->target = codel.target;
3817 codel_get__(netdev)->limit = codel.limit;
3818 codel_get__(netdev)->interval = codel.interval;
3819 return 0;
3820}
3821
3822static const struct tc_ops tc_ops_codel = {
89c09c1c
BP
3823 .linux_name = "codel",
3824 .ovs_name = "linux-codel",
3825 .n_queues = CODEL_N_QUEUES,
3826 .tc_install = codel_tc_install,
3827 .tc_load = codel_tc_load,
3828 .tc_destroy = codel_tc_destroy,
3829 .qdisc_get = codel_qdisc_get,
3830 .qdisc_set = codel_qdisc_set,
677d9158
JV
3831};
3832\f
3833/* FQ-CoDel traffic control class. */
3834
3835#define FQCODEL_N_QUEUES 0x0000
3836
2f4298ce
BP
3837/* In sufficiently new kernel headers these are defined as enums in
3838 * <linux/pkt_sched.h>. Define them here as macros to help out with older
3839 * kernels. (This overrides any enum definition in the header file but that's
3840 * harmless.) */
3841#define TCA_FQ_CODEL_TARGET 1
3842#define TCA_FQ_CODEL_LIMIT 2
3843#define TCA_FQ_CODEL_INTERVAL 3
3844#define TCA_FQ_CODEL_ECN 4
3845#define TCA_FQ_CODEL_FLOWS 5
3846#define TCA_FQ_CODEL_QUANTUM 6
3847
677d9158
JV
3848struct fqcodel {
3849 struct tc tc;
3850 uint32_t target;
3851 uint32_t limit;
3852 uint32_t interval;
3853 uint32_t flows;
3854 uint32_t quantum;
3855};
3856
3857static struct fqcodel *
3858fqcodel_get__(const struct netdev *netdev_)
3859{
3860 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3861 return CONTAINER_OF(netdev->tc, struct fqcodel, tc);
3862}
3863
3864static void
3865fqcodel_install__(struct netdev *netdev_, uint32_t target, uint32_t limit,
3866 uint32_t interval, uint32_t flows, uint32_t quantum)
3867{
3868 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
3869 struct fqcodel *fqcodel;
3870
3871 fqcodel = xmalloc(sizeof *fqcodel);
3872 tc_init(&fqcodel->tc, &tc_ops_fqcodel);
3873 fqcodel->target = target;
3874 fqcodel->limit = limit;
3875 fqcodel->interval = interval;
3876 fqcodel->flows = flows;
3877 fqcodel->quantum = quantum;
3878
3879 netdev->tc = &fqcodel->tc;
3880}
3881
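/* Creates an fq_codel root qdisc on 'netdev', replacing any existing root
 * qdisc. Roughly equivalent to "tc qdisc add dev <dev> root handle 1:
 * fq_codel target <target> limit <limit> interval <interval> flows <flows>
 * quantum <quantum>" (a sketch for orientation only). */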
3882static int
3883fqcodel_setup_qdisc__(struct netdev *netdev, uint32_t target, uint32_t limit,
3884 uint32_t interval, uint32_t flows, uint32_t quantum)
3885{
3886 size_t opt_offset;
3887 struct ofpbuf request;
3888 struct tcmsg *tcmsg;
3889 uint32_t otarget, olimit, ointerval, oflows, oquantum;
3890 int error;
3891
3892 tc_del_qdisc(netdev);
3893
7874bdff
RD
3894 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
3895 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
3896 if (!tcmsg) {
3897 return ENODEV;
3898 }
3899 tcmsg->tcm_handle = tc_make_handle(1, 0);
3900 tcmsg->tcm_parent = TC_H_ROOT;
3901
3902 otarget = target ? target : 5000;
3903 olimit = limit ? limit : 10240;
3904 ointerval = interval ? interval : 100000;
3905 oflows = flows ? flows : 1024;
 3906 oquantum = quantum ? quantum : 1514; /* The fq_codel default quantum is
 3907 1514 bytes, not the MTU. */
3908
3909 nl_msg_put_string(&request, TCA_KIND, "fq_codel");
3910 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
3911 nl_msg_put_u32(&request, TCA_FQ_CODEL_TARGET, otarget);
3912 nl_msg_put_u32(&request, TCA_FQ_CODEL_LIMIT, olimit);
3913 nl_msg_put_u32(&request, TCA_FQ_CODEL_INTERVAL, ointerval);
3914 nl_msg_put_u32(&request, TCA_FQ_CODEL_FLOWS, oflows);
3915 nl_msg_put_u32(&request, TCA_FQ_CODEL_QUANTUM, oquantum);
3916 nl_msg_end_nested(&request, opt_offset);
3917
3918 error = tc_transact(&request, NULL);
3919 if (error) {
3920 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
3921 "target %u, limit %u, interval %u, flows %u, quantum %u error %d(%s)",
3922 netdev_get_name(netdev),
3923 otarget, olimit, ointerval, oflows, oquantum,
3924 error, ovs_strerror(error));
3925 }
3926 return error;
3927}
3928
3929static void
3930fqcodel_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
3931 const struct smap *details, struct fqcodel *fqcodel)
3932{
13c1637f
BP
3933 fqcodel->target = smap_get_ullong(details, "target", 0);
3934 fqcodel->limit = smap_get_ullong(details, "limit", 0);
3935 fqcodel->interval = smap_get_ullong(details, "interval", 0);
3936 fqcodel->flows = smap_get_ullong(details, "flows", 0);
3937 fqcodel->quantum = smap_get_ullong(details, "quantum", 0);
3938
677d9158
JV
3939 if (!fqcodel->target) {
3940 fqcodel->target = 5000;
3941 }
3942 if (!fqcodel->limit) {
3943 fqcodel->limit = 10240;
3944 }
3945 if (!fqcodel->interval) {
3946 fqcodel->interval = 1000000;
3947 }
3948 if (!fqcodel->flows) {
3949 fqcodel->flows = 1024;
3950 }
3951 if (!fqcodel->quantum) {
3952 fqcodel->quantum = 1514;
3953 }
3954}
3955
3956static int
3957fqcodel_tc_install(struct netdev *netdev, const struct smap *details)
3958{
3959 int error;
3960 struct fqcodel fqcodel;
3961
3962 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
3963 error = fqcodel_setup_qdisc__(netdev, fqcodel.target, fqcodel.limit,
3964 fqcodel.interval, fqcodel.flows,
3965 fqcodel.quantum);
3966 if (!error) {
3967 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit,
3968 fqcodel.interval, fqcodel.flows, fqcodel.quantum);
3969 }
3970 return error;
3971}
3972
3973static int
3974fqcodel_parse_tca_options__(struct nlattr *nl_options, struct fqcodel *fqcodel)
3975{
3976 static const struct nl_policy tca_fqcodel_policy[] = {
3977 [TCA_FQ_CODEL_TARGET] = { .type = NL_A_U32 },
3978 [TCA_FQ_CODEL_LIMIT] = { .type = NL_A_U32 },
3979 [TCA_FQ_CODEL_INTERVAL] = { .type = NL_A_U32 },
3980 [TCA_FQ_CODEL_FLOWS] = { .type = NL_A_U32 },
3981 [TCA_FQ_CODEL_QUANTUM] = { .type = NL_A_U32 }
3982 };
3983
3984 struct nlattr *attrs[ARRAY_SIZE(tca_fqcodel_policy)];
3985
3986 if (!nl_parse_nested(nl_options, tca_fqcodel_policy,
3987 attrs, ARRAY_SIZE(tca_fqcodel_policy))) {
3988 VLOG_WARN_RL(&rl, "failed to parse FQ_CoDel class options");
3989 return EPROTO;
3990 }
3991
3992 fqcodel->target = nl_attr_get_u32(attrs[TCA_FQ_CODEL_TARGET]);
3993 fqcodel->limit = nl_attr_get_u32(attrs[TCA_FQ_CODEL_LIMIT]);
 3994 fqcodel->interval = nl_attr_get_u32(attrs[TCA_FQ_CODEL_INTERVAL]);
3995 fqcodel->flows = nl_attr_get_u32(attrs[TCA_FQ_CODEL_FLOWS]);
3996 fqcodel->quantum = nl_attr_get_u32(attrs[TCA_FQ_CODEL_QUANTUM]);
3997 return 0;
3998}
3999
4000static int
4001fqcodel_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4002{
4003 struct nlattr *nlattr;
 4004 const char *kind;
4005 int error;
4006 struct fqcodel fqcodel;
4007
4008 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4009 if (error != 0) {
4010 return error;
4011 }
4012
4013 error = fqcodel_parse_tca_options__(nlattr, &fqcodel);
4014 if (error != 0) {
4015 return error;
4016 }
4017
4018 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4019 fqcodel.flows, fqcodel.quantum);
4020 return 0;
4021}
4022
4023static void
4024fqcodel_tc_destroy(struct tc *tc)
4025{
4026 struct fqcodel *fqcodel = CONTAINER_OF(tc, struct fqcodel, tc);
4027 tc_destroy(tc);
4028 free(fqcodel);
4029}
4030
4031static int
4032fqcodel_qdisc_get(const struct netdev *netdev, struct smap *details)
4033{
4034 const struct fqcodel *fqcodel = fqcodel_get__(netdev);
4035 smap_add_format(details, "target", "%u", fqcodel->target);
4036 smap_add_format(details, "limit", "%u", fqcodel->limit);
4037 smap_add_format(details, "interval", "%u", fqcodel->interval);
4038 smap_add_format(details, "flows", "%u", fqcodel->flows);
4039 smap_add_format(details, "quantum", "%u", fqcodel->quantum);
4040 return 0;
4041}
4042
4043static int
4044fqcodel_qdisc_set(struct netdev *netdev, const struct smap *details)
4045{
4046 struct fqcodel fqcodel;
4047
4048 fqcodel_parse_qdisc_details__(netdev, details, &fqcodel);
4049 fqcodel_install__(netdev, fqcodel.target, fqcodel.limit, fqcodel.interval,
4050 fqcodel.flows, fqcodel.quantum);
4051 fqcodel_get__(netdev)->target = fqcodel.target;
4052 fqcodel_get__(netdev)->limit = fqcodel.limit;
4053 fqcodel_get__(netdev)->interval = fqcodel.interval;
4054 fqcodel_get__(netdev)->flows = fqcodel.flows;
4055 fqcodel_get__(netdev)->quantum = fqcodel.quantum;
4056 return 0;
4057}
4058
4059static const struct tc_ops tc_ops_fqcodel = {
89c09c1c
BP
4060 .linux_name = "fq_codel",
4061 .ovs_name = "linux-fq_codel",
4062 .n_queues = FQCODEL_N_QUEUES,
4063 .tc_install = fqcodel_tc_install,
4064 .tc_load = fqcodel_tc_load,
4065 .tc_destroy = fqcodel_tc_destroy,
4066 .qdisc_get = fqcodel_qdisc_get,
4067 .qdisc_set = fqcodel_qdisc_set,
677d9158
JV
4068};
4069\f
4070/* SFQ traffic control class. */
4071
4072#define SFQ_N_QUEUES 0x0000
4073
4074struct sfq {
4075 struct tc tc;
4076 uint32_t quantum;
4077 uint32_t perturb;
4078};
4079
4080static struct sfq *
4081sfq_get__(const struct netdev *netdev_)
4082{
4083 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4084 return CONTAINER_OF(netdev->tc, struct sfq, tc);
4085}
4086
4087static void
4088sfq_install__(struct netdev *netdev_, uint32_t quantum, uint32_t perturb)
4089{
4090 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4091 struct sfq *sfq;
4092
4093 sfq = xmalloc(sizeof *sfq);
4094 tc_init(&sfq->tc, &tc_ops_sfq);
4095 sfq->perturb = perturb;
4096 sfq->quantum = quantum;
4097
4098 netdev->tc = &sfq->tc;
4099}
4100
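/* Creates an SFQ root qdisc on 'netdev', replacing any existing root qdisc.
 * A zero 'quantum' falls back to the device MTU (when it can be read) and a
 * zero 'perturb' falls back to 10. Roughly equivalent to "tc qdisc add dev
 * <dev> root handle 1: sfq quantum <quantum> perturb <perturb>" (a sketch for
 * orientation only). */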
4101static int
4102sfq_setup_qdisc__(struct netdev *netdev, uint32_t quantum, uint32_t perturb)
4103{
4104 struct tc_sfq_qopt opt;
4105 struct ofpbuf request;
4106 struct tcmsg *tcmsg;
4107 int mtu;
4108 int mtu_error, error;
4109 mtu_error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
4110
4111 tc_del_qdisc(netdev);
4112
7874bdff
RD
4113 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4114 NLM_F_EXCL | NLM_F_CREATE, &request);
677d9158
JV
4115 if (!tcmsg) {
4116 return ENODEV;
4117 }
4118 tcmsg->tcm_handle = tc_make_handle(1, 0);
4119 tcmsg->tcm_parent = TC_H_ROOT;
4120
4121 memset(&opt, 0, sizeof opt);
4122 if (!quantum) {
4123 if (!mtu_error) {
4124 opt.quantum = mtu; /* if we cannot find mtu, use default */
4125 }
4126 } else {
4127 opt.quantum = quantum;
4128 }
4129
4130 if (!perturb) {
4131 opt.perturb_period = 10;
4132 } else {
4133 opt.perturb_period = perturb;
4134 }
4135
4136 nl_msg_put_string(&request, TCA_KIND, "sfq");
4137 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4138
4139 error = tc_transact(&request, NULL);
4140 if (error) {
4141 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
4142 "quantum %u, perturb %u error %d(%s)",
4143 netdev_get_name(netdev),
4144 opt.quantum, opt.perturb_period,
4145 error, ovs_strerror(error));
4146 }
4147 return error;
4148}
4149
4150static void
4151sfq_parse_qdisc_details__(struct netdev *netdev,
4152 const struct smap *details, struct sfq *sfq)
4153{
13c1637f
BP
4154 sfq->perturb = smap_get_ullong(details, "perturb", 0);
4155 sfq->quantum = smap_get_ullong(details, "quantum", 0);
677d9158 4156
677d9158
JV
4157 if (!sfq->perturb) {
4158 sfq->perturb = 10;
4159 }
4160
4161 if (!sfq->quantum) {
13c1637f
BP
4162 int mtu;
4163 if (!netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
677d9158
JV
4164 sfq->quantum = mtu;
4165 } else {
 4166 VLOG_WARN_RL(&rl, "when using SFQ on a device that lacks an MTU, "
 4167 "you must specify 'quantum'");
677d9158
JV
4168 }
4169 }
4170}
4171
4172static int
4173sfq_tc_install(struct netdev *netdev, const struct smap *details)
4174{
4175 int error;
4176 struct sfq sfq;
4177
4178 sfq_parse_qdisc_details__(netdev, details, &sfq);
4179 error = sfq_setup_qdisc__(netdev, sfq.quantum, sfq.perturb);
4180 if (!error) {
4181 sfq_install__(netdev, sfq.quantum, sfq.perturb);
4182 }
4183 return error;
4184}
4185
4186static int
4187sfq_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4188{
4189 const struct tc_sfq_qopt *sfq;
4190 struct nlattr *nlattr;
 4191 const char *kind;
4192 int error;
4193
4194 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4195 if (error == 0) {
4196 sfq = nl_attr_get(nlattr);
61265c03 4197 sfq_install__(netdev, sfq->quantum, sfq->perturb_period);
677d9158
JV
4198 return 0;
4199 }
4200
4201 return error;
4202}
4203
4204static void
4205sfq_tc_destroy(struct tc *tc)
4206{
4207 struct sfq *sfq = CONTAINER_OF(tc, struct sfq, tc);
4208 tc_destroy(tc);
4209 free(sfq);
4210}
4211
4212static int
4213sfq_qdisc_get(const struct netdev *netdev, struct smap *details)
4214{
4215 const struct sfq *sfq = sfq_get__(netdev);
4216 smap_add_format(details, "quantum", "%u", sfq->quantum);
4217 smap_add_format(details, "perturb", "%u", sfq->perturb);
4218 return 0;
4219}
4220
4221static int
4222sfq_qdisc_set(struct netdev *netdev, const struct smap *details)
4223{
4224 struct sfq sfq;
4225
4226 sfq_parse_qdisc_details__(netdev, details, &sfq);
4227 sfq_install__(netdev, sfq.quantum, sfq.perturb);
4228 sfq_get__(netdev)->quantum = sfq.quantum;
4229 sfq_get__(netdev)->perturb = sfq.perturb;
4230 return 0;
4231}
4232
4233static const struct tc_ops tc_ops_sfq = {
89c09c1c
BP
4234 .linux_name = "sfq",
4235 .ovs_name = "linux-sfq",
4236 .n_queues = SFQ_N_QUEUES,
4237 .tc_install = sfq_tc_install,
4238 .tc_load = sfq_tc_load,
4239 .tc_destroy = sfq_tc_destroy,
4240 .qdisc_get = sfq_qdisc_get,
4241 .qdisc_set = sfq_qdisc_set,
677d9158
JV
4242};
4243\f
2f564bb1
S
4244/* netem traffic control class. */
4245
4246struct netem {
4247 struct tc tc;
4248 uint32_t latency;
4249 uint32_t limit;
4250 uint32_t loss;
4251};
4252
4253static struct netem *
4254netem_get__(const struct netdev *netdev_)
4255{
4256 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4257 return CONTAINER_OF(netdev->tc, struct netem, tc);
4258}
4259
4260static void
4261netem_install__(struct netdev *netdev_, uint32_t latency,
4262 uint32_t limit, uint32_t loss)
4263{
4264 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4265 struct netem *netem;
4266
4267 netem = xmalloc(sizeof *netem);
4268 tc_init(&netem->tc, &tc_ops_netem);
4269 netem->latency = latency;
4270 netem->limit = limit;
4271 netem->loss = loss;
4272
4273 netdev->tc = &netem->tc;
4274}
4275
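/* Creates a netem root qdisc on 'netdev', replacing any existing root qdisc.
 * A zero 'limit' falls back to 1000; 'loss' is a percentage in [0, 100] that
 * is scaled into the kernel's fixed-point representation, and 'latency' is
 * converted to ticks with tc_time_to_ticks(). Roughly equivalent to
 * "tc qdisc add dev <dev> root handle 1: netem latency <latency> limit
 * <limit> loss <loss>%" (a sketch for orientation only). */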
4276static int
4277netem_setup_qdisc__(struct netdev *netdev, uint32_t latency,
4278 uint32_t limit, uint32_t loss)
4279{
4280 struct tc_netem_qopt opt;
4281 struct ofpbuf request;
4282 struct tcmsg *tcmsg;
4283 int error;
4284
4285 tc_del_qdisc(netdev);
4286
4287 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4288 NLM_F_EXCL | NLM_F_CREATE, &request);
4289 if (!tcmsg) {
4290 return ENODEV;
4291 }
4292 tcmsg->tcm_handle = tc_make_handle(1, 0);
4293 tcmsg->tcm_parent = TC_H_ROOT;
4294
4295 memset(&opt, 0, sizeof opt);
4296
4297 if (!limit) {
4298 opt.limit = 1000;
4299 } else {
4300 opt.limit = limit;
4301 }
4302
4303 if (loss) {
4304 if (loss > 100) {
4305 VLOG_WARN_RL(&rl,
4306 "loss should be a percentage value between 0 to 100, "
4307 "loss was %u", loss);
4308 return EINVAL;
4309 }
4310 opt.loss = floor(UINT32_MAX * (loss / 100.0));
4311 }
4312
4313 opt.latency = tc_time_to_ticks(latency);
4314
4315 nl_msg_put_string(&request, TCA_KIND, "netem");
4316 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
4317
4318 error = tc_transact(&request, NULL);
4319 if (error) {
4320 VLOG_WARN_RL(&rl, "failed to replace %s qdisc, "
4321 "latency %u, limit %u, loss %u error %d(%s)",
4322 netdev_get_name(netdev),
4323 opt.latency, opt.limit, opt.loss,
4324 error, ovs_strerror(error));
4325 }
4326 return error;
4327}
4328
4329static void
4330netem_parse_qdisc_details__(struct netdev *netdev OVS_UNUSED,
4331 const struct smap *details, struct netem *netem)
4332{
4333 netem->latency = smap_get_ullong(details, "latency", 0);
4334 netem->limit = smap_get_ullong(details, "limit", 0);
4335 netem->loss = smap_get_ullong(details, "loss", 0);
4336
4337 if (!netem->limit) {
4338 netem->limit = 1000;
4339 }
4340}
4341
4342static int
4343netem_tc_install(struct netdev *netdev, const struct smap *details)
4344{
4345 int error;
4346 struct netem netem;
4347
4348 netem_parse_qdisc_details__(netdev, details, &netem);
4349 error = netem_setup_qdisc__(netdev, netem.latency,
4350 netem.limit, netem.loss);
4351 if (!error) {
4352 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4353 }
4354 return error;
4355}
4356
4357static int
4358netem_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg)
4359{
4360 const struct tc_netem_qopt *netem;
4361 struct nlattr *nlattr;
4362 const char *kind;
4363 int error;
4364
4365 error = tc_parse_qdisc(nlmsg, &kind, &nlattr);
4366 if (error == 0) {
4367 netem = nl_attr_get(nlattr);
4368 netem_install__(netdev, netem->latency, netem->limit, netem->loss);
4369 return 0;
4370 }
4371
4372 return error;
4373}
4374
4375static void
4376netem_tc_destroy(struct tc *tc)
4377{
4378 struct netem *netem = CONTAINER_OF(tc, struct netem, tc);
4379 tc_destroy(tc);
4380 free(netem);
4381}
4382
4383static int
4384netem_qdisc_get(const struct netdev *netdev, struct smap *details)
4385{
4386 const struct netem *netem = netem_get__(netdev);
4387 smap_add_format(details, "latency", "%u", netem->latency);
4388 smap_add_format(details, "limit", "%u", netem->limit);
4389 smap_add_format(details, "loss", "%u", netem->loss);
4390 return 0;
4391}
4392
4393static int
4394netem_qdisc_set(struct netdev *netdev, const struct smap *details)
4395{
4396 struct netem netem;
4397
4398 netem_parse_qdisc_details__(netdev, details, &netem);
4399 netem_install__(netdev, netem.latency, netem.limit, netem.loss);
4400 netem_get__(netdev)->latency = netem.latency;
4401 netem_get__(netdev)->limit = netem.limit;
4402 netem_get__(netdev)->loss = netem.loss;
4403 return 0;
4404}
4405
4406static const struct tc_ops tc_ops_netem = {
4407 .linux_name = "netem",
4408 .ovs_name = "linux-netem",
4409 .n_queues = 0,
4410 .tc_install = netem_tc_install,
4411 .tc_load = netem_tc_load,
4412 .tc_destroy = netem_tc_destroy,
4413 .qdisc_get = netem_qdisc_get,
4414 .qdisc_set = netem_qdisc_set,
4415};
4416\f
c1c9c9c4 4417/* HTB traffic control class. */
559843ed 4418
c1c9c9c4 4419#define HTB_N_QUEUES 0xf000
4f631ccd 4420#define HTB_RATE2QUANTUM 10
8b61709d 4421
c1c9c9c4
BP
4422struct htb {
4423 struct tc tc;
4424 unsigned int max_rate; /* In bytes/s. */
4425};
8b61709d 4426
c1c9c9c4 4427struct htb_class {
93b13be8 4428 struct tc_queue tc_queue;
c1c9c9c4
BP
4429 unsigned int min_rate; /* In bytes/s. */
4430 unsigned int max_rate; /* In bytes/s. */
4431 unsigned int burst; /* In bytes. */
4432 unsigned int priority; /* Lower values are higher priorities. */
4433};
8b61709d 4434
c1c9c9c4 4435static struct htb *
b5d57fc8 4436htb_get__(const struct netdev *netdev_)
c1c9c9c4 4437{
b5d57fc8
BP
4438 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4439 return CONTAINER_OF(netdev->tc, struct htb, tc);
c1c9c9c4
BP
4440}
4441
24045e35 4442static void
b5d57fc8 4443htb_install__(struct netdev *netdev_, uint64_t max_rate)
c1c9c9c4 4444{
b5d57fc8 4445 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
4446 struct htb *htb;
4447
4448 htb = xmalloc(sizeof *htb);
4449 tc_init(&htb->tc, &tc_ops_htb);
4450 htb->max_rate = max_rate;
4451
b5d57fc8 4452 netdev->tc = &htb->tc;
c1c9c9c4
BP
4453}
4454
4455/* Create an HTB qdisc.
4456 *
a339aa81 4457 * Equivalent to "tc qdisc add dev <dev> root handle 1: htb default 1". */
c1c9c9c4
BP
4458static int
4459htb_setup_qdisc__(struct netdev *netdev)
4460{
4461 size_t opt_offset;
4462 struct tc_htb_glob opt;
4463 struct ofpbuf request;
4464 struct tcmsg *tcmsg;
4465
4466 tc_del_qdisc(netdev);
4467
7874bdff
RD
4468 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
4469 NLM_F_EXCL | NLM_F_CREATE, &request);
23a98ffe
BP
4470 if (!tcmsg) {
4471 return ENODEV;
4472 }
c1c9c9c4
BP
4473 tcmsg->tcm_handle = tc_make_handle(1, 0);
4474 tcmsg->tcm_parent = TC_H_ROOT;
4475
4476 nl_msg_put_string(&request, TCA_KIND, "htb");
4477
4478 memset(&opt, 0, sizeof opt);
4f631ccd 4479 opt.rate2quantum = HTB_RATE2QUANTUM;
c1c9c9c4 4480 opt.version = 3;
4ecf12d5 4481 opt.defcls = 1;
c1c9c9c4
BP
4482
4483 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4484 nl_msg_put_unspec(&request, TCA_HTB_INIT, &opt, sizeof opt);
4485 nl_msg_end_nested(&request, opt_offset);
4486
4487 return tc_transact(&request, NULL);
4488}
4489
4490/* Equivalent to "tc class replace <dev> classid <handle> parent <parent> htb
4491 * rate <min_rate>bps ceil <max_rate>bps burst <burst>b prio <priority>". */
4492static int
4493htb_setup_class__(struct netdev *netdev, unsigned int handle,
4494 unsigned int parent, struct htb_class *class)
4495{
4496 size_t opt_offset;
4497 struct tc_htb_opt opt;
4498 struct ofpbuf request;
4499 struct tcmsg *tcmsg;
4500 int error;
4501 int mtu;
4502
73371c09 4503 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 4504 if (error) {
f915f1a8
BP
4505 VLOG_WARN_RL(&rl, "cannot set up HTB on device %s that lacks MTU",
4506 netdev_get_name(netdev));
9b020780 4507 return error;
f915f1a8 4508 }
c1c9c9c4
BP
4509
4510 memset(&opt, 0, sizeof opt);
4511 tc_fill_rate(&opt.rate, class->min_rate, mtu);
4512 tc_fill_rate(&opt.ceil, class->max_rate, mtu);
4f631ccd
AW
4513 /* Makes sure the quantum is at least MTU. Setting quantum will
4514 * make htb ignore the r2q for this class. */
4515 if ((class->min_rate / HTB_RATE2QUANTUM) < mtu) {
4516 opt.quantum = mtu;
4517 }
c1c9c9c4
BP
4518 opt.buffer = tc_calc_buffer(opt.rate.rate, mtu, class->burst);
4519 opt.cbuffer = tc_calc_buffer(opt.ceil.rate, mtu, class->burst);
4520 opt.prio = class->priority;
4521
7874bdff
RD
4522 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
4523 &request);
23a98ffe
BP
4524 if (!tcmsg) {
4525 return ENODEV;
4526 }
c1c9c9c4
BP
4527 tcmsg->tcm_handle = handle;
4528 tcmsg->tcm_parent = parent;
4529
4530 nl_msg_put_string(&request, TCA_KIND, "htb");
4531 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
4532 nl_msg_put_unspec(&request, TCA_HTB_PARMS, &opt, sizeof opt);
4533 tc_put_rtab(&request, TCA_HTB_RTAB, &opt.rate);
4534 tc_put_rtab(&request, TCA_HTB_CTAB, &opt.ceil);
4535 nl_msg_end_nested(&request, opt_offset);
4536
4537 error = tc_transact(&request, NULL);
4538 if (error) {
4539 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
4540 "min_rate=%u max_rate=%u burst=%u prio=%u (%s)",
4541 netdev_get_name(netdev),
4542 tc_get_major(handle), tc_get_minor(handle),
4543 tc_get_major(parent), tc_get_minor(parent),
4544 class->min_rate, class->max_rate,
10a89ef0 4545 class->burst, class->priority, ovs_strerror(error));
c1c9c9c4
BP
4546 }
4547 return error;
4548}
4549
4550/* Parses Netlink attributes in 'options' for HTB parameters and stores a
4551 * description of them into 'details'. The description complies with the
4552 * specification given in the vswitch database documentation for linux-htb
4553 * queue details. */
4554static int
4555htb_parse_tca_options__(struct nlattr *nl_options, struct htb_class *class)
4556{
4557 static const struct nl_policy tca_htb_policy[] = {
4558 [TCA_HTB_PARMS] = { .type = NL_A_UNSPEC, .optional = false,
4559 .min_len = sizeof(struct tc_htb_opt) },
4560 };
4561
4562 struct nlattr *attrs[ARRAY_SIZE(tca_htb_policy)];
4563 const struct tc_htb_opt *htb;
4564
4565 if (!nl_parse_nested(nl_options, tca_htb_policy,
4566 attrs, ARRAY_SIZE(tca_htb_policy))) {
4567 VLOG_WARN_RL(&rl, "failed to parse HTB class options");
4568 return EPROTO;
4569 }
4570
4571 htb = nl_attr_get(attrs[TCA_HTB_PARMS]);
4572 class->min_rate = htb->rate.rate;
4573 class->max_rate = htb->ceil.rate;
4574 class->burst = tc_ticks_to_bytes(htb->rate.rate, htb->buffer);
4575 class->priority = htb->prio;
4576 return 0;
4577}
4578
4579static int
4580htb_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
4581 struct htb_class *options,
4582 struct netdev_queue_stats *stats)
4583{
4584 struct nlattr *nl_options;
4585 unsigned int handle;
4586 int error;
4587
4588 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
4589 if (!error && queue_id) {
17ee3c1f
BP
4590 unsigned int major = tc_get_major(handle);
4591 unsigned int minor = tc_get_minor(handle);
4592 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
4593 *queue_id = minor - 1;
c1c9c9c4
BP
4594 } else {
4595 error = EPROTO;
4596 }
4597 }
4598 if (!error && options) {
4599 error = htb_parse_tca_options__(nl_options, options);
4600 }
4601 return error;
4602}
4603
4604static void
73371c09 4605htb_parse_qdisc_details__(struct netdev *netdev_,
79f1cbe9 4606 const struct smap *details, struct htb_class *hc)
c1c9c9c4 4607{
73371c09 4608 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4 4609
13c1637f 4610 hc->max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
c1c9c9c4 4611 if (!hc->max_rate) {
a00ca915 4612 enum netdev_features current;
c1c9c9c4 4613
73371c09
BP
4614 netdev_linux_read_features(netdev);
4615 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 4616 hc->max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
c1c9c9c4
BP
4617 }
4618 hc->min_rate = hc->max_rate;
4619 hc->burst = 0;
4620 hc->priority = 0;
4621}
4622
4623static int
4624htb_parse_class_details__(struct netdev *netdev,
79f1cbe9 4625 const struct smap *details, struct htb_class *hc)
c1c9c9c4
BP
4626{
4627 const struct htb *htb = htb_get__(netdev);
9b020780 4628 int mtu, error;
214117fd 4629 unsigned long long int max_rate_bit;
c1c9c9c4 4630
73371c09 4631 error = netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu);
9b020780 4632 if (error) {
f915f1a8
BP
4633 VLOG_WARN_RL(&rl, "cannot parse HTB class on device %s that lacks MTU",
4634 netdev_get_name(netdev));
9b020780 4635 return error;
f915f1a8
BP
4636 }
4637
4f104611
EJ
 4638 /* HTB requires at least an MTU-sized min-rate to send any traffic, even
 4639 * on uncongested links. */
13c1637f 4640 hc->min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
4f104611 4641 hc->min_rate = MAX(hc->min_rate, mtu);
c1c9c9c4
BP
4642 hc->min_rate = MIN(hc->min_rate, htb->max_rate);
4643
4644 /* max-rate */
214117fd
KF
4645 max_rate_bit = smap_get_ullong(details, "max-rate", 0);
4646 hc->max_rate = max_rate_bit ? max_rate_bit / 8 : htb->max_rate;
c1c9c9c4
BP
4647 hc->max_rate = MAX(hc->max_rate, hc->min_rate);
4648 hc->max_rate = MIN(hc->max_rate, htb->max_rate);
4649
4650 /* burst
4651 *
4652 * According to hints in the documentation that I've read, it is important
4653 * that 'burst' be at least as big as the largest frame that might be
4654 * transmitted. Also, making 'burst' a bit bigger than necessary is OK,
4655 * but having it a bit too small is a problem. Since netdev_get_mtu()
4656 * doesn't include the Ethernet header, we need to add at least 14 (18?) to
4657 * the MTU. We actually add 64, instead of 14, as a guard against
 4658 * additional headers getting tacked on somewhere that we're not aware of. */
13c1637f 4659 hc->burst = smap_get_ullong(details, "burst", 0) / 8;
c1c9c9c4
BP
4660 hc->burst = MAX(hc->burst, mtu + 64);
4661
4662 /* priority */
13c1637f 4663 hc->priority = smap_get_ullong(details, "priority", 0);
c1c9c9c4
BP
4664
4665 return 0;
4666}
4667
4668static int
4669htb_query_class__(const struct netdev *netdev, unsigned int handle,
4670 unsigned int parent, struct htb_class *options,
4671 struct netdev_queue_stats *stats)
4672{
4673 struct ofpbuf *reply;
4674 int error;
4675
4676 error = tc_query_class(netdev, handle, parent, &reply);
4677 if (!error) {
4678 error = htb_parse_tcmsg__(reply, NULL, options, stats);
4679 ofpbuf_delete(reply);
4680 }
4681 return error;
4682}
4683
4684static int
79f1cbe9 4685htb_tc_install(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
4686{
4687 int error;
4688
4689 error = htb_setup_qdisc__(netdev);
4690 if (!error) {
4691 struct htb_class hc;
4692
4693 htb_parse_qdisc_details__(netdev, details, &hc);
4694 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4695 tc_make_handle(1, 0), &hc);
4696 if (!error) {
4697 htb_install__(netdev, hc.max_rate);
4698 }
4699 }
4700 return error;
4701}
4702
93b13be8
BP
4703static struct htb_class *
4704htb_class_cast__(const struct tc_queue *queue)
4705{
4706 return CONTAINER_OF(queue, struct htb_class, tc_queue);
4707}
4708
c1c9c9c4
BP
4709static void
4710htb_update_queue__(struct netdev *netdev, unsigned int queue_id,
4711 const struct htb_class *hc)
4712{
4713 struct htb *htb = htb_get__(netdev);
93b13be8
BP
4714 size_t hash = hash_int(queue_id, 0);
4715 struct tc_queue *queue;
c1c9c9c4
BP
4716 struct htb_class *hcp;
4717
93b13be8
BP
4718 queue = tc_find_queue__(netdev, queue_id, hash);
4719 if (queue) {
4720 hcp = htb_class_cast__(queue);
4721 } else {
c1c9c9c4 4722 hcp = xmalloc(sizeof *hcp);
93b13be8
BP
4723 queue = &hcp->tc_queue;
4724 queue->queue_id = queue_id;
6dc34a0d 4725 queue->created = time_msec();
93b13be8 4726 hmap_insert(&htb->tc.queues, &queue->hmap_node, hash);
c1c9c9c4 4727 }
93b13be8
BP
4728
4729 hcp->min_rate = hc->min_rate;
4730 hcp->max_rate = hc->max_rate;
4731 hcp->burst = hc->burst;
4732 hcp->priority = hc->priority;
c1c9c9c4
BP
4733}
4734
4735static int
4736htb_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
4737{
c1c9c9c4 4738 struct ofpbuf msg;
d57695d7 4739 struct queue_dump_state state;
c1c9c9c4 4740 struct htb_class hc;
c1c9c9c4
BP
4741
4742 /* Get qdisc options. */
4743 hc.max_rate = 0;
4744 htb_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 4745 htb_install__(netdev, hc.max_rate);
c1c9c9c4
BP
4746
4747 /* Get queues. */
d57695d7 4748 if (!start_queue_dump(netdev, &state)) {
23a98ffe
BP
4749 return ENODEV;
4750 }
d57695d7 4751 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
c1c9c9c4
BP
4752 unsigned int queue_id;
4753
4754 if (!htb_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
4755 htb_update_queue__(netdev, queue_id, &hc);
4756 }
4757 }
d57695d7 4758 finish_queue_dump(&state);
c1c9c9c4
BP
4759
4760 return 0;
4761}
4762
4763static void
4764htb_tc_destroy(struct tc *tc)
4765{
4766 struct htb *htb = CONTAINER_OF(tc, struct htb, tc);
4ec3d7c7 4767 struct htb_class *hc;
c1c9c9c4 4768
4ec3d7c7 4769 HMAP_FOR_EACH_POP (hc, tc_queue.hmap_node, &htb->tc.queues) {
c1c9c9c4
BP
4770 free(hc);
4771 }
4772 tc_destroy(tc);
4773 free(htb);
4774}
4775
4776static int
79f1cbe9 4777htb_qdisc_get(const struct netdev *netdev, struct smap *details)
c1c9c9c4
BP
4778{
4779 const struct htb *htb = htb_get__(netdev);
79f1cbe9 4780 smap_add_format(details, "max-rate", "%llu", 8ULL * htb->max_rate);
c1c9c9c4
BP
4781 return 0;
4782}
4783
4784static int
79f1cbe9 4785htb_qdisc_set(struct netdev *netdev, const struct smap *details)
c1c9c9c4
BP
4786{
4787 struct htb_class hc;
4788 int error;
4789
4790 htb_parse_qdisc_details__(netdev, details, &hc);
4791 error = htb_setup_class__(netdev, tc_make_handle(1, 0xfffe),
4792 tc_make_handle(1, 0), &hc);
4793 if (!error) {
4794 htb_get__(netdev)->max_rate = hc.max_rate;
4795 }
4796 return error;
4797}
4798
4799static int
93b13be8 4800htb_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 4801 const struct tc_queue *queue, struct smap *details)
c1c9c9c4 4802{
93b13be8 4803 const struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4804
79f1cbe9 4805 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
c1c9c9c4 4806 if (hc->min_rate != hc->max_rate) {
79f1cbe9 4807 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
c1c9c9c4 4808 }
79f1cbe9 4809 smap_add_format(details, "burst", "%llu", 8ULL * hc->burst);
c1c9c9c4 4810 if (hc->priority) {
79f1cbe9 4811 smap_add_format(details, "priority", "%u", hc->priority);
c1c9c9c4
BP
4812 }
4813 return 0;
4814}
4815
4816static int
4817htb_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 4818 const struct smap *details)
c1c9c9c4
BP
4819{
4820 struct htb_class hc;
4821 int error;
4822
4823 error = htb_parse_class_details__(netdev, details, &hc);
4824 if (error) {
4825 return error;
4826 }
4827
17ee3c1f 4828 error = htb_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
c1c9c9c4
BP
4829 tc_make_handle(1, 0xfffe), &hc);
4830 if (error) {
4831 return error;
4832 }
4833
4834 htb_update_queue__(netdev, queue_id, &hc);
4835 return 0;
4836}
4837
4838static int
93b13be8 4839htb_class_delete(struct netdev *netdev, struct tc_queue *queue)
c1c9c9c4 4840{
93b13be8 4841 struct htb_class *hc = htb_class_cast__(queue);
c1c9c9c4 4842 struct htb *htb = htb_get__(netdev);
c1c9c9c4
BP
4843 int error;
4844
93b13be8 4845 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
c1c9c9c4 4846 if (!error) {
93b13be8 4847 hmap_remove(&htb->tc.queues, &hc->tc_queue.hmap_node);
c1c9c9c4 4848 free(hc);
c1c9c9c4
BP
4849 }
4850 return error;
4851}
4852
4853static int
93b13be8 4854htb_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
c1c9c9c4
BP
4855 struct netdev_queue_stats *stats)
4856{
93b13be8 4857 return htb_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
c1c9c9c4
BP
4858 tc_make_handle(1, 0xfffe), NULL, stats);
4859}
4860
4861static int
4862htb_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
4863 const struct ofpbuf *nlmsg,
4864 netdev_dump_queue_stats_cb *cb, void *aux)
4865{
4866 struct netdev_queue_stats stats;
17ee3c1f 4867 unsigned int handle, major, minor;
c1c9c9c4
BP
4868 int error;
4869
4870 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
4871 if (error) {
4872 return error;
4873 }
4874
17ee3c1f
BP
4875 major = tc_get_major(handle);
4876 minor = tc_get_minor(handle);
4877 if (major == 1 && minor > 0 && minor <= HTB_N_QUEUES) {
d5590e7e 4878 (*cb)(minor - 1, &stats, aux);
c1c9c9c4
BP
4879 }
4880 return 0;
4881}
4882
4883static const struct tc_ops tc_ops_htb = {
89c09c1c
BP
4884 .linux_name = "htb",
4885 .ovs_name = "linux-htb",
4886 .n_queues = HTB_N_QUEUES,
4887 .tc_install = htb_tc_install,
4888 .tc_load = htb_tc_load,
4889 .tc_destroy = htb_tc_destroy,
4890 .qdisc_get = htb_qdisc_get,
4891 .qdisc_set = htb_qdisc_set,
4892 .class_get = htb_class_get,
4893 .class_set = htb_class_set,
4894 .class_delete = htb_class_delete,
4895 .class_get_stats = htb_class_get_stats,
4896 .class_dump_stats = htb_class_dump_stats
c1c9c9c4
BP
4897};
4898\f
a339aa81
EJ
4899/* "linux-hfsc" traffic control class. */
4900
4901#define HFSC_N_QUEUES 0xf000
4902
4903struct hfsc {
4904 struct tc tc;
4905 uint32_t max_rate;
4906};
4907
4908struct hfsc_class {
4909 struct tc_queue tc_queue;
4910 uint32_t min_rate;
4911 uint32_t max_rate;
4912};
4913
4914static struct hfsc *
b5d57fc8 4915hfsc_get__(const struct netdev *netdev_)
a339aa81 4916{
b5d57fc8
BP
4917 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
4918 return CONTAINER_OF(netdev->tc, struct hfsc, tc);
a339aa81
EJ
4919}
4920
4921static struct hfsc_class *
4922hfsc_class_cast__(const struct tc_queue *queue)
4923{
4924 return CONTAINER_OF(queue, struct hfsc_class, tc_queue);
4925}
4926
24045e35 4927static void
b5d57fc8 4928hfsc_install__(struct netdev *netdev_, uint32_t max_rate)
a339aa81 4929{
b5d57fc8 4930 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81
EJ
4931 struct hfsc *hfsc;
4932
a339aa81
EJ
4933 hfsc = xmalloc(sizeof *hfsc);
4934 tc_init(&hfsc->tc, &tc_ops_hfsc);
4935 hfsc->max_rate = max_rate;
b5d57fc8 4936 netdev->tc = &hfsc->tc;
a339aa81
EJ
4937}
4938
4939static void
4940hfsc_update_queue__(struct netdev *netdev, unsigned int queue_id,
4941 const struct hfsc_class *hc)
4942{
4943 size_t hash;
4944 struct hfsc *hfsc;
4945 struct hfsc_class *hcp;
4946 struct tc_queue *queue;
4947
4948 hfsc = hfsc_get__(netdev);
4949 hash = hash_int(queue_id, 0);
4950
4951 queue = tc_find_queue__(netdev, queue_id, hash);
4952 if (queue) {
4953 hcp = hfsc_class_cast__(queue);
4954 } else {
4955 hcp = xmalloc(sizeof *hcp);
4956 queue = &hcp->tc_queue;
4957 queue->queue_id = queue_id;
6dc34a0d 4958 queue->created = time_msec();
a339aa81
EJ
4959 hmap_insert(&hfsc->tc.queues, &queue->hmap_node, hash);
4960 }
4961
4962 hcp->min_rate = hc->min_rate;
4963 hcp->max_rate = hc->max_rate;
4964}
4965
4966static int
4967hfsc_parse_tca_options__(struct nlattr *nl_options, struct hfsc_class *class)
4968{
4969 const struct tc_service_curve *rsc, *fsc, *usc;
4970 static const struct nl_policy tca_hfsc_policy[] = {
4971 [TCA_HFSC_RSC] = {
4972 .type = NL_A_UNSPEC,
4973 .optional = false,
4974 .min_len = sizeof(struct tc_service_curve),
4975 },
4976 [TCA_HFSC_FSC] = {
4977 .type = NL_A_UNSPEC,
4978 .optional = false,
4979 .min_len = sizeof(struct tc_service_curve),
4980 },
4981 [TCA_HFSC_USC] = {
4982 .type = NL_A_UNSPEC,
4983 .optional = false,
4984 .min_len = sizeof(struct tc_service_curve),
4985 },
4986 };
4987 struct nlattr *attrs[ARRAY_SIZE(tca_hfsc_policy)];
4988
4989 if (!nl_parse_nested(nl_options, tca_hfsc_policy,
4990 attrs, ARRAY_SIZE(tca_hfsc_policy))) {
4991 VLOG_WARN_RL(&rl, "failed to parse HFSC class options");
4992 return EPROTO;
4993 }
4994
4995 rsc = nl_attr_get(attrs[TCA_HFSC_RSC]);
4996 fsc = nl_attr_get(attrs[TCA_HFSC_FSC]);
4997 usc = nl_attr_get(attrs[TCA_HFSC_USC]);
4998
4999 if (rsc->m1 != 0 || rsc->d != 0 ||
5000 fsc->m1 != 0 || fsc->d != 0 ||
5001 usc->m1 != 0 || usc->d != 0) {
5002 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
5003 "Non-linear service curves are not supported.");
5004 return EPROTO;
5005 }
5006
5007 if (rsc->m2 != fsc->m2) {
5008 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
5009 "Real-time service curves are not supported ");
5010 return EPROTO;
5011 }
5012
5013 if (rsc->m2 > usc->m2) {
5014 VLOG_WARN_RL(&rl, "failed to parse HFSC class options. "
5015 "Min-rate service curve is greater than "
5016 "the max-rate service curve.");
5017 return EPROTO;
5018 }
5019
5020 class->min_rate = fsc->m2;
5021 class->max_rate = usc->m2;
5022 return 0;
5023}
5024
5025static int
5026hfsc_parse_tcmsg__(struct ofpbuf *tcmsg, unsigned int *queue_id,
5027 struct hfsc_class *options,
5028 struct netdev_queue_stats *stats)
5029{
5030 int error;
5031 unsigned int handle;
5032 struct nlattr *nl_options;
5033
5034 error = tc_parse_class(tcmsg, &handle, &nl_options, stats);
5035 if (error) {
5036 return error;
5037 }
5038
5039 if (queue_id) {
5040 unsigned int major, minor;
5041
5042 major = tc_get_major(handle);
5043 minor = tc_get_minor(handle);
5044 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5045 *queue_id = minor - 1;
5046 } else {
5047 return EPROTO;
5048 }
5049 }
5050
5051 if (options) {
5052 error = hfsc_parse_tca_options__(nl_options, options);
5053 }
5054
5055 return error;
5056}
5057
5058static int
5059hfsc_query_class__(const struct netdev *netdev, unsigned int handle,
5060 unsigned int parent, struct hfsc_class *options,
5061 struct netdev_queue_stats *stats)
5062{
5063 int error;
5064 struct ofpbuf *reply;
5065
5066 error = tc_query_class(netdev, handle, parent, &reply);
5067 if (error) {
5068 return error;
5069 }
5070
5071 error = hfsc_parse_tcmsg__(reply, NULL, options, stats);
5072 ofpbuf_delete(reply);
5073 return error;
5074}
5075
5076static void
73371c09 5077hfsc_parse_qdisc_details__(struct netdev *netdev_, const struct smap *details,
a339aa81
EJ
5078 struct hfsc_class *class)
5079{
73371c09 5080 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
a339aa81 5081
13c1637f 5082 uint32_t max_rate = smap_get_ullong(details, "max-rate", 0) / 8;
a339aa81 5083 if (!max_rate) {
a00ca915 5084 enum netdev_features current;
a339aa81 5085
73371c09
BP
5086 netdev_linux_read_features(netdev);
5087 current = !netdev->get_features_error ? netdev->current : 0;
d02a5f8e 5088 max_rate = netdev_features_to_bps(current, 100 * 1000 * 1000) / 8;
a339aa81
EJ
5089 }
5090
5091 class->min_rate = max_rate;
5092 class->max_rate = max_rate;
5093}
5094
5095static int
5096hfsc_parse_class_details__(struct netdev *netdev,
79f1cbe9 5097 const struct smap *details,
a339aa81
EJ
5098 struct hfsc_class * class)
5099{
5100 const struct hfsc *hfsc;
5101 uint32_t min_rate, max_rate;
a339aa81
EJ
5102
5103 hfsc = hfsc_get__(netdev);
a339aa81 5104
13c1637f 5105 min_rate = smap_get_ullong(details, "min-rate", 0) / 8;
79398bad 5106 min_rate = MAX(min_rate, 1);
a339aa81
EJ
5107 min_rate = MIN(min_rate, hfsc->max_rate);
5108
13c1637f 5109 max_rate = smap_get_ullong(details, "max-rate", hfsc->max_rate * 8) / 8;
a339aa81
EJ
5110 max_rate = MAX(max_rate, min_rate);
5111 max_rate = MIN(max_rate, hfsc->max_rate);
5112
5113 class->min_rate = min_rate;
5114 class->max_rate = max_rate;
5115
5116 return 0;
5117}
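
/* Worked example (added for illustration, with made-up numbers): with OVSDB
 * details min-rate=2000000 (bits/s), no max-rate, and a qdisc max_rate of
 * 1,250,000 bytes/s (10 Mbit/s), the arithmetic above yields
 *
 *     min_rate = 2,000,000 / 8 = 250,000 bytes/s   (clamped to [1, 1,250,000])
 *     max_rate = 10,000,000 / 8 = 1,250,000 bytes/s (the qdisc default)
 *
 * so the class gets a 250,000 byte/s guarantee and a 1,250,000 byte/s cap. */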
5118
5119/* Create an HFSC qdisc.
5120 *
5121 * Equivalent to "tc qdisc add dev <dev> root handle 1: hfsc default 1". */
5122static int
5123hfsc_setup_qdisc__(struct netdev * netdev)
5124{
5125 struct tcmsg *tcmsg;
5126 struct ofpbuf request;
5127 struct tc_hfsc_qopt opt;
5128
5129 tc_del_qdisc(netdev);
5130
7874bdff
RD
5131 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWQDISC,
5132 NLM_F_EXCL | NLM_F_CREATE, &request);
a339aa81
EJ
5133
5134 if (!tcmsg) {
5135 return ENODEV;
5136 }
5137
5138 tcmsg->tcm_handle = tc_make_handle(1, 0);
5139 tcmsg->tcm_parent = TC_H_ROOT;
5140
5141 memset(&opt, 0, sizeof opt);
5142 opt.defcls = 1;
5143
5144 nl_msg_put_string(&request, TCA_KIND, "hfsc");
5145 nl_msg_put_unspec(&request, TCA_OPTIONS, &opt, sizeof opt);
5146
5147 return tc_transact(&request, NULL);
5148}
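
/* Note on the handles used above (explanatory aside, not original code): a tc
 * handle packs a 16-bit major and a 16-bit minor number, so
 * tc_make_handle(1, 0) is handle 1:0, which "tc" displays as "1:", and
 * TC_H_ROOT attaches the qdisc at the root of the device.  "defcls = 1" makes
 * class 1:1 the default class for otherwise unclassified traffic, matching
 * the "default 1" in the command shown above. */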
5149
5150/* Create an HFSC class.
5151 *
 5152 * Equivalent to "tc class add dev <dev> parent <parent> classid <handle> hfsc
 5153 * sc rate <min_rate> ul rate <max_rate>". */
5154static int
5155hfsc_setup_class__(struct netdev *netdev, unsigned int handle,
5156 unsigned int parent, struct hfsc_class *class)
5157{
5158 int error;
5159 size_t opt_offset;
5160 struct tcmsg *tcmsg;
5161 struct ofpbuf request;
5162 struct tc_service_curve min, max;
5163
7874bdff
RD
5164 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTCLASS, NLM_F_CREATE,
5165 &request);
a339aa81
EJ
5166
5167 if (!tcmsg) {
5168 return ENODEV;
5169 }
5170
5171 tcmsg->tcm_handle = handle;
5172 tcmsg->tcm_parent = parent;
5173
5174 min.m1 = 0;
5175 min.d = 0;
5176 min.m2 = class->min_rate;
5177
5178 max.m1 = 0;
5179 max.d = 0;
5180 max.m2 = class->max_rate;
5181
5182 nl_msg_put_string(&request, TCA_KIND, "hfsc");
5183 opt_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5184 nl_msg_put_unspec(&request, TCA_HFSC_RSC, &min, sizeof min);
5185 nl_msg_put_unspec(&request, TCA_HFSC_FSC, &min, sizeof min);
5186 nl_msg_put_unspec(&request, TCA_HFSC_USC, &max, sizeof max);
5187 nl_msg_end_nested(&request, opt_offset);
5188
5189 error = tc_transact(&request, NULL);
5190 if (error) {
5191 VLOG_WARN_RL(&rl, "failed to replace %s class %u:%u, parent %u:%u, "
5192 "min-rate %ubps, max-rate %ubps (%s)",
5193 netdev_get_name(netdev),
5194 tc_get_major(handle), tc_get_minor(handle),
5195 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5196 class->min_rate, class->max_rate, ovs_strerror(error));
a339aa81
EJ
5197 }
5198
5199 return error;
5200}
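
/* Worked example (added for illustration): for a class whose OVSDB "min-rate"
 * is 10,000,000 bits/s, hfsc_parse_class_details__() stores
 * min_rate = 1,250,000 bytes/s, which the code above encodes as the linear
 * service curve { m1 = 0, d = 0, m2 = 1,250,000 } sent in both TCA_HFSC_RSC
 * and TCA_HFSC_FSC, with the max-rate curve sent as TCA_HFSC_USC.  This is
 * why hfsc_parse_tca_options__() insists on m1 == 0, d == 0, and
 * rsc->m2 == fsc->m2 when reading a class back from the kernel. */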
5201
5202static int
79f1cbe9 5203hfsc_tc_install(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
5204{
5205 int error;
5206 struct hfsc_class class;
5207
5208 error = hfsc_setup_qdisc__(netdev);
5209
5210 if (error) {
5211 return error;
5212 }
5213
5214 hfsc_parse_qdisc_details__(netdev, details, &class);
5215 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5216 tc_make_handle(1, 0), &class);
5217
5218 if (error) {
5219 return error;
5220 }
5221
5222 hfsc_install__(netdev, class.max_rate);
5223 return 0;
5224}
5225
5226static int
5227hfsc_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5228{
5229 struct ofpbuf msg;
d57695d7 5230 struct queue_dump_state state;
a339aa81
EJ
5231 struct hfsc_class hc;
5232
5233 hc.max_rate = 0;
5234 hfsc_query_class__(netdev, tc_make_handle(1, 0xfffe), 0, &hc, NULL);
24045e35 5235 hfsc_install__(netdev, hc.max_rate);
a339aa81 5236
d57695d7 5237 if (!start_queue_dump(netdev, &state)) {
a339aa81
EJ
5238 return ENODEV;
5239 }
5240
d57695d7 5241 while (nl_dump_next(&state.dump, &msg, &state.buf)) {
a339aa81
EJ
5242 unsigned int queue_id;
5243
5244 if (!hfsc_parse_tcmsg__(&msg, &queue_id, &hc, NULL)) {
5245 hfsc_update_queue__(netdev, queue_id, &hc);
5246 }
5247 }
5248
d57695d7 5249 finish_queue_dump(&state);
a339aa81
EJ
5250 return 0;
5251}
5252
5253static void
5254hfsc_tc_destroy(struct tc *tc)
5255{
5256 struct hfsc *hfsc;
5257 struct hfsc_class *hc, *next;
5258
5259 hfsc = CONTAINER_OF(tc, struct hfsc, tc);
5260
5261 HMAP_FOR_EACH_SAFE (hc, next, tc_queue.hmap_node, &hfsc->tc.queues) {
5262 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5263 free(hc);
5264 }
5265
5266 tc_destroy(tc);
5267 free(hfsc);
5268}
5269
5270static int
79f1cbe9 5271hfsc_qdisc_get(const struct netdev *netdev, struct smap *details)
a339aa81
EJ
5272{
5273 const struct hfsc *hfsc;
5274 hfsc = hfsc_get__(netdev);
79f1cbe9 5275 smap_add_format(details, "max-rate", "%llu", 8ULL * hfsc->max_rate);
a339aa81
EJ
5276 return 0;
5277}
5278
5279static int
79f1cbe9 5280hfsc_qdisc_set(struct netdev *netdev, const struct smap *details)
a339aa81
EJ
5281{
5282 int error;
5283 struct hfsc_class class;
5284
5285 hfsc_parse_qdisc_details__(netdev, details, &class);
5286 error = hfsc_setup_class__(netdev, tc_make_handle(1, 0xfffe),
5287 tc_make_handle(1, 0), &class);
5288
5289 if (!error) {
5290 hfsc_get__(netdev)->max_rate = class.max_rate;
5291 }
5292
5293 return error;
5294}
5295
5296static int
5297hfsc_class_get(const struct netdev *netdev OVS_UNUSED,
79f1cbe9 5298 const struct tc_queue *queue, struct smap *details)
a339aa81
EJ
5299{
5300 const struct hfsc_class *hc;
5301
5302 hc = hfsc_class_cast__(queue);
79f1cbe9 5303 smap_add_format(details, "min-rate", "%llu", 8ULL * hc->min_rate);
a339aa81 5304 if (hc->min_rate != hc->max_rate) {
79f1cbe9 5305 smap_add_format(details, "max-rate", "%llu", 8ULL * hc->max_rate);
a339aa81
EJ
5306 }
5307 return 0;
5308}
5309
5310static int
5311hfsc_class_set(struct netdev *netdev, unsigned int queue_id,
79f1cbe9 5312 const struct smap *details)
a339aa81
EJ
5313{
5314 int error;
5315 struct hfsc_class class;
5316
5317 error = hfsc_parse_class_details__(netdev, details, &class);
5318 if (error) {
5319 return error;
5320 }
5321
5322 error = hfsc_setup_class__(netdev, tc_make_handle(1, queue_id + 1),
5323 tc_make_handle(1, 0xfffe), &class);
5324 if (error) {
5325 return error;
5326 }
5327
5328 hfsc_update_queue__(netdev, queue_id, &class);
5329 return 0;
5330}
5331
5332static int
5333hfsc_class_delete(struct netdev *netdev, struct tc_queue *queue)
5334{
5335 int error;
5336 struct hfsc *hfsc;
5337 struct hfsc_class *hc;
5338
5339 hc = hfsc_class_cast__(queue);
5340 hfsc = hfsc_get__(netdev);
5341
5342 error = tc_delete_class(netdev, tc_make_handle(1, queue->queue_id + 1));
5343 if (!error) {
5344 hmap_remove(&hfsc->tc.queues, &hc->tc_queue.hmap_node);
5345 free(hc);
5346 }
5347 return error;
5348}
5349
5350static int
5351hfsc_class_get_stats(const struct netdev *netdev, const struct tc_queue *queue,
5352 struct netdev_queue_stats *stats)
5353{
5354 return hfsc_query_class__(netdev, tc_make_handle(1, queue->queue_id + 1),
5355 tc_make_handle(1, 0xfffe), NULL, stats);
5356}
5357
5358static int
5359hfsc_class_dump_stats(const struct netdev *netdev OVS_UNUSED,
5360 const struct ofpbuf *nlmsg,
5361 netdev_dump_queue_stats_cb *cb, void *aux)
5362{
5363 struct netdev_queue_stats stats;
5364 unsigned int handle, major, minor;
5365 int error;
5366
5367 error = tc_parse_class(nlmsg, &handle, NULL, &stats);
5368 if (error) {
5369 return error;
5370 }
5371
5372 major = tc_get_major(handle);
5373 minor = tc_get_minor(handle);
5374 if (major == 1 && minor > 0 && minor <= HFSC_N_QUEUES) {
5375 (*cb)(minor - 1, &stats, aux);
5376 }
5377 return 0;
5378}
5379
5380static const struct tc_ops tc_ops_hfsc = {
89c09c1c
BP
5381 .linux_name = "hfsc",
5382 .ovs_name = "linux-hfsc",
 5383 .n_queues = HFSC_N_QUEUES,
5384 .tc_install = hfsc_tc_install,
5385 .tc_load = hfsc_tc_load,
5386 .tc_destroy = hfsc_tc_destroy,
5387 .qdisc_get = hfsc_qdisc_get,
5388 .qdisc_set = hfsc_qdisc_set,
5389 .class_get = hfsc_class_get,
5390 .class_set = hfsc_class_set,
5391 .class_delete = hfsc_class_delete,
5392 .class_get_stats = hfsc_class_get_stats,
5393 .class_dump_stats = hfsc_class_dump_stats,
a339aa81
EJ
5394};
5395\f
6cf888b8
BS
5396/* "linux-noop" traffic control class. */
5397
5398static void
5399noop_install__(struct netdev *netdev_)
5400{
5401 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
5402 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
5403
5404 netdev->tc = CONST_CAST(struct tc *, &tc);
5405}
5406
5407static int
5408noop_tc_install(struct netdev *netdev,
5409 const struct smap *details OVS_UNUSED)
5410{
5411 noop_install__(netdev);
5412 return 0;
5413}
5414
5415static int
5416noop_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5417{
5418 noop_install__(netdev);
5419 return 0;
5420}
5421
5422static const struct tc_ops tc_ops_noop = {
89c09c1c
BP
5423 .ovs_name = "linux-noop", /* ovs_name */
5424 .tc_install = noop_tc_install,
5425 .tc_load = noop_tc_load,
6cf888b8
BS
5426};
5427\f
c1c9c9c4
BP
5428/* "linux-default" traffic control class.
5429 *
5430 * This class represents the default, unnamed Linux qdisc. It corresponds to
5431 * the "" (empty string) QoS type in the OVS database. */
5432
5433static void
b5d57fc8 5434default_install__(struct netdev *netdev_)
c1c9c9c4 5435{
b5d57fc8 5436 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 5437 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_default);
c1c9c9c4 5438
559eb230
BP
5439 /* Nothing but a tc class implementation is allowed to write to a tc. This
5440 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 5441 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
5442}
5443
5444static int
5445default_tc_install(struct netdev *netdev,
79f1cbe9 5446 const struct smap *details OVS_UNUSED)
c1c9c9c4
BP
5447{
5448 default_install__(netdev);
5449 return 0;
5450}
5451
5452static int
5453default_tc_load(struct netdev *netdev, struct ofpbuf *nlmsg OVS_UNUSED)
5454{
5455 default_install__(netdev);
5456 return 0;
5457}
5458
5459static const struct tc_ops tc_ops_default = {
89c09c1c
BP
5460 .ovs_name = "", /* ovs_name */
5461 .tc_install = default_tc_install,
5462 .tc_load = default_tc_load,
c1c9c9c4
BP
5463};
5464\f
5465/* "linux-other" traffic control class.
5466 *
5467 * */
5468
5469static int
b5d57fc8 5470other_tc_load(struct netdev *netdev_, struct ofpbuf *nlmsg OVS_UNUSED)
c1c9c9c4 5471{
b5d57fc8 5472 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
559eb230 5473 static const struct tc tc = TC_INITIALIZER(&tc, &tc_ops_other);
c1c9c9c4 5474
559eb230
BP
5475 /* Nothing but a tc class implementation is allowed to write to a tc. This
5476 * class never does that, so we can legitimately use a const tc object. */
b5d57fc8 5477 netdev->tc = CONST_CAST(struct tc *, &tc);
c1c9c9c4
BP
5478 return 0;
5479}
5480
5481static const struct tc_ops tc_ops_other = {
89c09c1c
BP
5482 .ovs_name = "linux-other",
5483 .tc_load = other_tc_load,
c1c9c9c4
BP
5484};
5485\f
5486/* Traffic control. */
5487
5488/* Number of kernel "tc" ticks per second. */
5489static double ticks_per_s;
5490
 5491/* Number of kernel "jiffies" per second. This is used for computing buffer
 5492 * sizes: kernel qdiscs generally need to be able to buffer one jiffy's worth
 5493 * of data.
5494 *
5495 * There are two possibilities here:
5496 *
5497 * - 'buffer_hz' is the kernel's real timer tick rate, a small number in the
5498 * approximate range of 100 to 1024. That means that we really need to
5499 * make sure that the qdisc can buffer that much data.
5500 *
5501 * - 'buffer_hz' is an absurdly large number. That means that the kernel
5502 * has finely granular timers and there's no need to fudge additional room
5503 * for buffers. (There's no extra effort needed to implement that: the
5504 * large 'buffer_hz' is used as a divisor, so practically any number will
5505 * come out as 0 in the division. Small integer results in the case of
5506 * really high dividends won't have any real effect anyhow.)
5507 */
5508static unsigned int buffer_hz;
5509
7874bdff
RD
5510static struct tcmsg *
5511netdev_linux_tc_make_request(const struct netdev *netdev, int type,
5512 unsigned int flags, struct ofpbuf *request)
5513{
5514 int ifindex;
5515 int error;
5516
5517 error = get_ifindex(netdev, &ifindex);
5518 if (error) {
5519 return NULL;
5520 }
5521
5522 return tc_make_request(ifindex, type, flags, request);
5523}
5524
f8500004
JP
5525/* Adds a policer to 'netdev' with a rate of 'kbits_rate' and a burst size
5526 * of 'kbits_burst'.
5527 *
5528 * This function is equivalent to running:
5529 * /sbin/tc filter add dev <devname> parent ffff: protocol all prio 49
5530 * basic police rate <kbits_rate>kbit burst <kbits_burst>k
5531 * mtu 65535 drop
5532 *
5533 * The configuration and stats may be seen with the following command:
c7952afb 5534 * /sbin/tc -s filter show dev <devname> parent ffff:
f8500004
JP
5535 *
5536 * Returns 0 if successful, otherwise a positive errno value.
5537 */
5538static int
c7952afb
BP
5539tc_add_policer(struct netdev *netdev,
5540 uint32_t kbits_rate, uint32_t kbits_burst)
f8500004
JP
5541{
5542 struct tc_police tc_police;
5543 struct ofpbuf request;
5544 struct tcmsg *tcmsg;
5545 size_t basic_offset;
5546 size_t police_offset;
5547 int error;
5548 int mtu = 65535;
5549
5550 memset(&tc_police, 0, sizeof tc_police);
5551 tc_police.action = TC_POLICE_SHOT;
5552 tc_police.mtu = mtu;
1aca400c 5553 tc_fill_rate(&tc_police.rate, ((uint64_t) kbits_rate * 1000)/8, mtu);
c7952afb 5554
79abacc8
MAA
5555 /* The following appears wrong in one way: In networking a kilobit is
5556 * usually 1000 bits but this uses 1024 bits.
c7952afb
BP
5557 *
 5558 * However, if you "fix" that, then "tc filter show ..." shows
5559 * "125000b", meaning 125,000 bits, when OVS configures it for 1000 kbit ==
5560 * 1,000,000 bits, whereas this actually ends up doing the right thing from
5561 * tc's point of view. Whatever. */
5562 tc_police.burst = tc_bytes_to_ticks(
79abacc8 5563 tc_police.rate.rate, MIN(UINT32_MAX / 1024, kbits_burst) * 1024 / 8);
f8500004 5564
7874bdff
RD
5565 tcmsg = netdev_linux_tc_make_request(netdev, RTM_NEWTFILTER,
5566 NLM_F_EXCL | NLM_F_CREATE, &request);
f8500004
JP
5567 if (!tcmsg) {
5568 return ENODEV;
5569 }
5570 tcmsg->tcm_parent = tc_make_handle(0xffff, 0);
5571 tcmsg->tcm_info = tc_make_handle(49,
5572 (OVS_FORCE uint16_t) htons(ETH_P_ALL));
5573
5574 nl_msg_put_string(&request, TCA_KIND, "basic");
5575 basic_offset = nl_msg_start_nested(&request, TCA_OPTIONS);
5576 police_offset = nl_msg_start_nested(&request, TCA_BASIC_POLICE);
5577 nl_msg_put_unspec(&request, TCA_POLICE_TBF, &tc_police, sizeof tc_police);
5578 tc_put_rtab(&request, TCA_POLICE_RATE, &tc_police.rate);
5579 nl_msg_end_nested(&request, police_offset);
5580 nl_msg_end_nested(&request, basic_offset);
5581
5582 error = tc_transact(&request, NULL);
5583 if (error) {
5584 return error;
5585 }
5586
5587 return 0;
5588}
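
/* Usage sketch (added for illustration, with made-up numbers): a call such as
 *
 *     tc_add_policer(netdev, 1000, 100);
 *
 * requests roughly a 1,000 kbit/s ingress policer with a 100 kbit burst: the
 * rate becomes 1000 * 1000 / 8 = 125,000 bytes/s, and the burst becomes
 * 100 * 1024 / 8 = 12,800 bytes, converted to ticks by tc_bytes_to_ticks().
 * This corresponds to the "tc filter ... basic police" command quoted in the
 * comment above. */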
5589
c1c9c9c4
BP
5590static void
5591read_psched(void)
5592{
5593 /* The values in psched are not individually very meaningful, but they are
5594 * important. The tables below show some values seen in the wild.
5595 *
5596 * Some notes:
5597 *
5598 * - "c" has always been a constant 1000000 since at least Linux 2.4.14.
5599 * (Before that, there are hints that it was 1000000000.)
5600 *
5601 * - "d" can be unrealistically large, see the comment on 'buffer_hz'
5602 * above.
5603 *
5604 * /proc/net/psched
5605 * -----------------------------------
5606 * [1] 000c8000 000f4240 000f4240 00000064
5607 * [2] 000003e8 00000400 000f4240 3b9aca00
5608 * [3] 000003e8 00000400 000f4240 3b9aca00
5609 * [4] 000003e8 00000400 000f4240 00000064
5610 * [5] 000003e8 00000040 000f4240 3b9aca00
5611 * [6] 000003e8 00000040 000f4240 000000f9
5612 *
5613 * a b c d ticks_per_s buffer_hz
5614 * ------- --------- ---------- ------------- ----------- -------------
5615 * [1] 819,200 1,000,000 1,000,000 100 819,200 100
5616 * [2] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5617 * [3] 1,000 1,024 1,000,000 1,000,000,000 976,562 1,000,000,000
5618 * [4] 1,000 1,024 1,000,000 100 976,562 100
5619 * [5] 1,000 64 1,000,000 1,000,000,000 15,625,000 1,000,000,000
5620 * [6] 1,000 64 1,000,000 249 15,625,000 249
5621 *
5622 * [1] 2.6.18-128.1.6.el5.xs5.5.0.505.1024xen from XenServer 5.5.0-24648p
5623 * [2] 2.6.26-1-686-bigmem from Debian lenny
5624 * [3] 2.6.26-2-sparc64 from Debian lenny
5625 * [4] 2.6.27.42-0.1.1.xs5.6.810.44.111163xen from XenServer 5.6.810-31078p
5626 * [5] 2.6.32.21.22 (approx.) from Ubuntu 10.04 on VMware Fusion
5627 * [6] 2.6.34 from kernel.org on KVM
5628 */
23882115 5629 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
c1c9c9c4
BP
5630 static const char fn[] = "/proc/net/psched";
5631 unsigned int a, b, c, d;
5632 FILE *stream;
5633
23882115
BP
5634 if (!ovsthread_once_start(&once)) {
5635 return;
5636 }
5637
c1c9c9c4
BP
5638 ticks_per_s = 1.0;
5639 buffer_hz = 100;
5640
5641 stream = fopen(fn, "r");
5642 if (!stream) {
10a89ef0 5643 VLOG_WARN("%s: open failed: %s", fn, ovs_strerror(errno));
23882115 5644 goto exit;
c1c9c9c4
BP
5645 }
5646
5647 if (fscanf(stream, "%x %x %x %x", &a, &b, &c, &d) != 4) {
5648 VLOG_WARN("%s: read failed", fn);
5649 fclose(stream);
23882115 5650 goto exit;
c1c9c9c4
BP
5651 }
5652 VLOG_DBG("%s: psched parameters are: %u %u %u %u", fn, a, b, c, d);
5653 fclose(stream);
5654
1bab4901 5655 if (!a || !b || !c) {
c1c9c9c4 5656 VLOG_WARN("%s: invalid scheduler parameters", fn);
23882115 5657 goto exit;
c1c9c9c4
BP
5658 }
5659
5660 ticks_per_s = (double) a * c / b;
5661 if (c == 1000000) {
5662 buffer_hz = d;
5663 } else {
5664 VLOG_WARN("%s: unexpected psched parameters: %u %u %u %u",
5665 fn, a, b, c, d);
5666 }
5667 VLOG_DBG("%s: ticks_per_s=%f buffer_hz=%u", fn, ticks_per_s, buffer_hz);
23882115
BP
5668
5669exit:
5670 ovsthread_once_done(&once);
c1c9c9c4
BP
5671}
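
/* Worked example of the formula above, using row [2] of the table:
 * a = 0x3e8 = 1,000, b = 0x400 = 1,024, c = 0xf4240 = 1,000,000, and
 * d = 0x3b9aca00 = 1,000,000,000, so
 *
 *     ticks_per_s = a * c / b = 1,000 * 1,000,000 / 1,024 ~= 976,562
 *
 * and, because c == 1000000, buffer_hz = d = 1,000,000,000. */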
5672
5673/* Returns the number of bytes that can be transmitted in 'ticks' ticks at a
5674 * rate of 'rate' bytes per second. */
5675static unsigned int
5676tc_ticks_to_bytes(unsigned int rate, unsigned int ticks)
5677{
23882115 5678 read_psched();
c1c9c9c4
BP
5679 return (rate * ticks) / ticks_per_s;
5680}
5681
5682/* Returns the number of ticks that it would take to transmit 'size' bytes at a
5683 * rate of 'rate' bytes per second. */
5684static unsigned int
5685tc_bytes_to_ticks(unsigned int rate, unsigned int size)
5686{
23882115 5687 read_psched();
015c93a4 5688 return rate ? ((unsigned long long int) ticks_per_s * size) / rate : 0;
c1c9c9c4
BP
5689}
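
/* Example (added for illustration): with ticks_per_s ~= 976,562 and a rate of
 * 1,250,000 bytes/s (10 Mbit/s), a 1,514-byte frame converts to
 * 976,562 * 1,514 / 1,250,000 ~= 1,182 ticks; tc_ticks_to_bytes() performs
 * the inverse conversion. */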
5690
5691/* Returns the number of bytes that need to be reserved for qdisc buffering at
5692 * a transmission rate of 'rate' bytes per second. */
5693static unsigned int
5694tc_buffer_per_jiffy(unsigned int rate)
5695{
23882115 5696 read_psched();
c1c9c9c4
BP
5697 return rate / buffer_hz;
5698}
5699
2f564bb1
S
5700static uint32_t
 5701tc_time_to_ticks(uint32_t time)
{
5702 read_psched();
5703 return time * (ticks_per_s / 1000000);
5704}
5705
c1c9c9c4
BP
5706/* Given Netlink 'msg' that describes a qdisc, extracts the name of the qdisc,
5707 * e.g. "htb", into '*kind' (if it is nonnull). If 'options' is nonnull,
5708 * extracts 'msg''s TCA_OPTIONS attributes into '*options' if it is present or
5709 * stores NULL into it if it is absent.
5710 *
5711 * '*kind' and '*options' point into 'msg', so they are owned by whoever owns
5712 * 'msg'.
5713 *
5714 * Returns 0 if successful, otherwise a positive errno value. */
5715static int
5716tc_parse_qdisc(const struct ofpbuf *msg, const char **kind,
5717 struct nlattr **options)
5718{
5719 static const struct nl_policy tca_policy[] = {
5720 [TCA_KIND] = { .type = NL_A_STRING, .optional = false },
5721 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
5722 };
5723 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5724
5725 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5726 tca_policy, ta, ARRAY_SIZE(ta))) {
5727 VLOG_WARN_RL(&rl, "failed to parse qdisc message");
5728 goto error;
5729 }
5730
5731 if (kind) {
5732 *kind = nl_attr_get_string(ta[TCA_KIND]);
5733 }
5734
5735 if (options) {
5736 *options = ta[TCA_OPTIONS];
5737 }
5738
5739 return 0;
5740
5741error:
5742 if (kind) {
5743 *kind = NULL;
5744 }
5745 if (options) {
5746 *options = NULL;
5747 }
5748 return EPROTO;
5749}
5750
 5751/* Given Netlink 'msg' that describes a class, extracts its full class handle
 5752 * (major and minor ID) into '*handlep', its TCA_OPTIONS attribute
 5753 * into '*options', and its queue statistics into '*stats'. Any of the output
5754 * arguments may be null.
5755 *
5756 * Returns 0 if successful, otherwise a positive errno value. */
5757static int
5758tc_parse_class(const struct ofpbuf *msg, unsigned int *handlep,
5759 struct nlattr **options, struct netdev_queue_stats *stats)
5760{
5761 static const struct nl_policy tca_policy[] = {
5762 [TCA_OPTIONS] = { .type = NL_A_NESTED, .optional = false },
5763 [TCA_STATS2] = { .type = NL_A_NESTED, .optional = false },
5764 };
5765 struct nlattr *ta[ARRAY_SIZE(tca_policy)];
5766
5767 if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct tcmsg),
5768 tca_policy, ta, ARRAY_SIZE(ta))) {
5769 VLOG_WARN_RL(&rl, "failed to parse class message");
5770 goto error;
5771 }
5772
5773 if (handlep) {
5774 struct tcmsg *tc = ofpbuf_at_assert(msg, NLMSG_HDRLEN, sizeof *tc);
5775 *handlep = tc->tcm_handle;
5776 }
5777
5778 if (options) {
5779 *options = ta[TCA_OPTIONS];
5780 }
5781
5782 if (stats) {
5783 const struct gnet_stats_queue *gsq;
5784 struct gnet_stats_basic gsb;
5785
5786 static const struct nl_policy stats_policy[] = {
5787 [TCA_STATS_BASIC] = { .type = NL_A_UNSPEC, .optional = false,
5788 .min_len = sizeof gsb },
5789 [TCA_STATS_QUEUE] = { .type = NL_A_UNSPEC, .optional = false,
5790 .min_len = sizeof *gsq },
5791 };
5792 struct nlattr *sa[ARRAY_SIZE(stats_policy)];
5793
5794 if (!nl_parse_nested(ta[TCA_STATS2], stats_policy,
5795 sa, ARRAY_SIZE(sa))) {
5796 VLOG_WARN_RL(&rl, "failed to parse class stats");
5797 goto error;
5798 }
5799
5800 /* Alignment issues screw up the length of struct gnet_stats_basic on
5801 * some arch/bitsize combinations. Newer versions of Linux have a
5802 * struct gnet_stats_basic_packed, but we can't depend on that. The
5803 * easiest thing to do is just to make a copy. */
5804 memset(&gsb, 0, sizeof gsb);
5805 memcpy(&gsb, nl_attr_get(sa[TCA_STATS_BASIC]),
5806 MIN(nl_attr_get_size(sa[TCA_STATS_BASIC]), sizeof gsb));
5807 stats->tx_bytes = gsb.bytes;
5808 stats->tx_packets = gsb.packets;
5809
5810 gsq = nl_attr_get(sa[TCA_STATS_QUEUE]);
5811 stats->tx_errors = gsq->drops;
5812 }
5813
5814 return 0;
5815
5816error:
5817 if (options) {
5818 *options = NULL;
5819 }
5820 if (stats) {
5821 memset(stats, 0, sizeof *stats);
5822 }
5823 return EPROTO;
5824}
5825
5826/* Queries the kernel for class with identifier 'handle' and parent 'parent'
5827 * on 'netdev'. */
5828static int
5829tc_query_class(const struct netdev *netdev,
5830 unsigned int handle, unsigned int parent,
5831 struct ofpbuf **replyp)
5832{
5833 struct ofpbuf request;
5834 struct tcmsg *tcmsg;
5835 int error;
5836
7874bdff
RD
5837 tcmsg = netdev_linux_tc_make_request(netdev, RTM_GETTCLASS, NLM_F_ECHO,
5838 &request);
23a98ffe
BP
5839 if (!tcmsg) {
5840 return ENODEV;
5841 }
c1c9c9c4
BP
5842 tcmsg->tcm_handle = handle;
5843 tcmsg->tcm_parent = parent;
5844
5845 error = tc_transact(&request, replyp);
5846 if (error) {
5847 VLOG_WARN_RL(&rl, "query %s class %u:%u (parent %u:%u) failed (%s)",
5848 netdev_get_name(netdev),
5849 tc_get_major(handle), tc_get_minor(handle),
5850 tc_get_major(parent), tc_get_minor(parent),
10a89ef0 5851 ovs_strerror(error));
c1c9c9c4
BP
5852 }
5853 return error;
5854}
5855
5856/* Equivalent to "tc class del dev <name> handle <handle>". */
5857static int
5858tc_delete_class(const struct netdev *netdev, unsigned int handle)
5859{
5860 struct ofpbuf request;
5861 struct tcmsg *tcmsg;
5862 int error;
5863
7874bdff 5864 tcmsg = netdev_linux_tc_make_request(netdev, RTM_DELTCLASS, 0, &request);
23a98ffe
BP
5865 if (!tcmsg) {
5866 return ENODEV;
5867 }
c1c9c9c4
BP
5868 tcmsg->tcm_handle = handle;
5869 tcmsg->tcm_parent = 0;
5870
5871 error = tc_transact(&request, NULL);
5872 if (error) {
5873 VLOG_WARN_RL(&rl, "delete %s class %u:%u failed (%s)",
5874 netdev_get_name(netdev),
5875 tc_get_major(handle), tc_get_minor(handle),
10a89ef0 5876 ovs_strerror(error));
c1c9c9c4
BP
5877 }
5878 return error;
5879}
5880
5881/* Equivalent to "tc qdisc del dev <name> root". */
5882static int
b5d57fc8 5883tc_del_qdisc(struct netdev *netdev_)
c1c9c9c4 5884{
b5d57fc8 5885 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5886 struct ofpbuf request;
5887 struct tcmsg *tcmsg;
5888 int error;
5889
7874bdff 5890 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_DELQDISC, 0, &request);
23a98ffe
BP
5891 if (!tcmsg) {
5892 return ENODEV;
5893 }
c1c9c9c4
BP
5894 tcmsg->tcm_handle = tc_make_handle(1, 0);
5895 tcmsg->tcm_parent = TC_H_ROOT;
5896
5897 error = tc_transact(&request, NULL);
5898 if (error == EINVAL) {
5899 /* EINVAL probably means that the default qdisc was in use, in which
5900 * case we've accomplished our purpose. */
5901 error = 0;
5902 }
b5d57fc8
BP
5903 if (!error && netdev->tc) {
5904 if (netdev->tc->ops->tc_destroy) {
5905 netdev->tc->ops->tc_destroy(netdev->tc);
c1c9c9c4 5906 }
b5d57fc8 5907 netdev->tc = NULL;
c1c9c9c4
BP
5908 }
5909 return error;
5910}
5911
ac3e3aaa
BP
5912static bool
5913getqdisc_is_safe(void)
5914{
5915 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
5916 static bool safe = false;
5917
5918 if (ovsthread_once_start(&once)) {
5919 struct utsname utsname;
5920 int major, minor;
5921
5922 if (uname(&utsname) == -1) {
5923 VLOG_WARN("uname failed (%s)", ovs_strerror(errno));
5924 } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) {
5925 VLOG_WARN("uname reported bad OS release (%s)", utsname.release);
5926 } else if (major < 2 || (major == 2 && minor < 35)) {
5927 VLOG_INFO("disabling unsafe RTM_GETQDISC in Linux kernel %s",
5928 utsname.release);
5929 } else {
5930 safe = true;
5931 }
5932 ovsthread_once_done(&once);
5933 }
5934 return safe;
5935}
5936
c1c9c9c4
BP
5937/* If 'netdev''s qdisc type and parameters are not yet known, queries the
5938 * kernel to determine what they are. Returns 0 if successful, otherwise a
5939 * positive errno value. */
5940static int
b5d57fc8 5941tc_query_qdisc(const struct netdev *netdev_)
c1c9c9c4 5942{
b5d57fc8 5943 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c1c9c9c4
BP
5944 struct ofpbuf request, *qdisc;
5945 const struct tc_ops *ops;
5946 struct tcmsg *tcmsg;
5947 int load_error;
5948 int error;
5949
b5d57fc8 5950 if (netdev->tc) {
c1c9c9c4
BP
5951 return 0;
5952 }
5953
5954 /* This RTM_GETQDISC is crafted to avoid OOPSing kernels that do not have
5955 * commit 53b0f08 "net_sched: Fix qdisc_notify()", which is anything before
5956 * 2.6.35 without that fix backported to it.
5957 *
5958 * To avoid the OOPS, we must not make a request that would attempt to dump
5959 * a "built-in" qdisc, that is, the default pfifo_fast qdisc or one of a
5960 * few others. There are a few ways that I can see to do this, but most of
5961 * them seem to be racy (and if you lose the race the kernel OOPSes). The
5962 * technique chosen here is to assume that any non-default qdisc that we
5963 * create will have a class with handle 1:0. The built-in qdiscs only have
5964 * a class with handle 0:0.
5965 *
ac3e3aaa
BP
5966 * On Linux 2.6.35+ we use the straightforward method because it allows us
5967 * to handle non-builtin qdiscs without handle 1:0 (e.g. codel). However,
5968 * in such a case we get no response at all from the kernel (!) if a
5969 * builtin qdisc is in use (which is later caught by "!error &&
5970 * !qdisc->size"). */
7874bdff
RD
5971 tcmsg = netdev_linux_tc_make_request(netdev_, RTM_GETQDISC, NLM_F_ECHO,
5972 &request);
23a98ffe
BP
5973 if (!tcmsg) {
5974 return ENODEV;
5975 }
ac3e3aaa
BP
5976 tcmsg->tcm_handle = tc_make_handle(getqdisc_is_safe() ? 0 : 1, 0);
5977 tcmsg->tcm_parent = getqdisc_is_safe() ? TC_H_ROOT : 0;
c1c9c9c4
BP
5978
5979 /* Figure out what tc class to instantiate. */
5980 error = tc_transact(&request, &qdisc);
ac3e3aaa 5981 if (!error && qdisc->size) {
c1c9c9c4
BP
5982 const char *kind;
5983
5984 error = tc_parse_qdisc(qdisc, &kind, NULL);
5985 if (error) {
5986 ops = &tc_ops_other;
5987 } else {
5988 ops = tc_lookup_linux_name(kind);
5989 if (!ops) {
5990 static struct vlog_rate_limit rl2 = VLOG_RATE_LIMIT_INIT(1, 1);
ac3e3aaa 5991 VLOG_DBG_RL(&rl2, "unknown qdisc \"%s\"", kind);
c1c9c9c4
BP
5992
5993 ops = &tc_ops_other;
5994 }
5995 }
ac3e3aaa
BP
5996 } else if ((!error && !qdisc->size) || error == ENOENT) {
5997 /* Either it's a built-in qdisc, or (on Linux pre-2.6.35) it's a qdisc
5998 * set up by some other entity that doesn't have a handle 1:0. We will
5999 * assume that it's the system default qdisc. */
c1c9c9c4
BP
6000 ops = &tc_ops_default;
6001 error = 0;
6002 } else {
6003 /* Who knows? Maybe the device got deleted. */
6004 VLOG_WARN_RL(&rl, "query %s qdisc failed (%s)",
10a89ef0 6005 netdev_get_name(netdev_), ovs_strerror(error));
c1c9c9c4
BP
6006 ops = &tc_ops_other;
6007 }
6008
6009 /* Instantiate it. */
b5d57fc8
BP
6010 load_error = ops->tc_load(CONST_CAST(struct netdev *, netdev_), qdisc);
6011 ovs_assert((load_error == 0) == (netdev->tc != NULL));
c1c9c9c4
BP
6012 ofpbuf_delete(qdisc);
6013
6014 return error ? error : load_error;
6015}
6016
6017/* Linux traffic control uses tables with 256 entries ("rtab" tables) to
6018 approximate the time to transmit packets of various lengths. For an MTU of
6019 256 or less, each entry is exact; for an MTU of 257 through 512, each entry
 6020 represents two possible packet lengths; for an MTU of 513 through 1024, four
6021 possible lengths; and so on.
6022
6023 Returns, for the specified 'mtu', the number of bits that packet lengths
6024 need to be shifted right to fit within such a 256-entry table. */
6025static int
6026tc_calc_cell_log(unsigned int mtu)
6027{
6028 int cell_log;
6029
6030 if (!mtu) {
6031 mtu = ETH_PAYLOAD_MAX;
6032 }
6033 mtu += ETH_HEADER_LEN + VLAN_HEADER_LEN;
6034
6035 for (cell_log = 0; mtu >= 256; cell_log++) {
6036 mtu >>= 1;
6037 }
6038
6039 return cell_log;
6040}
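
/* Example (added for illustration): for mtu = 1500, the function adds the
 * Ethernet and VLAN header lengths (14 + 4) to get 1518, which must be
 * shifted right three times to drop below 256, so cell_log = 3: each rtab
 * entry then covers an 8-byte range of packet lengths and the 256-entry table
 * spans 256 * 8 = 2,048 bytes. */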
6041
6042/* Initializes 'rate' properly for a rate of 'Bps' bytes per second with an MTU
6043 * of 'mtu'. */
6044static void
6045tc_fill_rate(struct tc_ratespec *rate, uint64_t Bps, int mtu)
6046{
6047 memset(rate, 0, sizeof *rate);
6048 rate->cell_log = tc_calc_cell_log(mtu);
6049 /* rate->overhead = 0; */ /* New in 2.6.24, not yet in some */
6050 /* rate->cell_align = 0; */ /* distro headers. */
6051 rate->mpu = ETH_TOTAL_MIN;
6052 rate->rate = Bps;
6053}
6054
6055/* Appends to 'msg' an "rtab" table for the specified 'rate' as a Netlink
6056 * attribute of the specified "type".
6057 *
6058 * See tc_calc_cell_log() above for a description of "rtab"s. */
e7f6ba22 6059void
c1c9c9c4
BP
6060tc_put_rtab(struct ofpbuf *msg, uint16_t type, const struct tc_ratespec *rate)
6061{
6062 uint32_t *rtab;
6063 unsigned int i;
6064
6065 rtab = nl_msg_put_unspec_uninit(msg, type, TC_RTAB_SIZE);
6066 for (i = 0; i < TC_RTAB_SIZE / sizeof *rtab; i++) {
6067 unsigned packet_size = (i + 1) << rate->cell_log;
6068 if (packet_size < rate->mpu) {
6069 packet_size = rate->mpu;
6070 }
6071 rtab[i] = tc_bytes_to_ticks(rate->rate, packet_size);
6072 }
6073}
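
/* In other words (explanatory note): with cell_log = 3, rtab[i] holds the
 * transmit time, in ticks, of a packet of (i + 1) * 8 bytes (never less than
 * rate->mpu bytes), so the cost of any packet length can be looked up with a
 * single shift and array index. */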
6074
6075/* Calculates the proper value of 'buffer' or 'cbuffer' in HTB options given a
6076 * rate of 'Bps' bytes per second, the specified 'mtu', and a user-requested
6077 * burst size of 'burst_bytes'. (If no value was requested, a 'burst_bytes' of
015c93a4 6078 * 0 is fine.) */
c1c9c9c4
BP
6079static int
6080tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes)
6081{
6082 unsigned int min_burst = tc_buffer_per_jiffy(Bps) + mtu;
6083 return tc_bytes_to_ticks(Bps, MAX(burst_bytes, min_burst));
6084}
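
/* Worked example (added for illustration): at Bps = 125,000,000 (1 Gbit/s)
 * with buffer_hz = 100 and mtu = 1500, tc_buffer_per_jiffy() yields 1,250,000
 * bytes, so min_burst = 1,251,500 bytes; if the user requested a smaller (or
 * no) burst, that minimum is what gets converted to ticks. */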
d3980822 6085\f
aaf2fb1a
BP
6086/* Linux-only functions declared in netdev-linux.h */
6087
6088/* Modifies the 'flag' bit in ethtool's flags field for 'netdev'. If
6089 * 'enable' is true, the bit is set. Otherwise, it is cleared. */
6090int
6091netdev_linux_ethtool_set_flag(struct netdev *netdev, uint32_t flag,
6092 const char *flag_name, bool enable)
6093{
6094 const char *netdev_name = netdev_get_name(netdev);
6095 struct ethtool_value evalue;
6096 uint32_t new_flags;
6097 int error;
6098
ab985a77 6099 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
6100 memset(&evalue, 0, sizeof evalue);
6101 error = netdev_linux_do_ethtool(netdev_name,
6102 (struct ethtool_cmd *)&evalue,
6103 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
6104 if (error) {
6105 return error;
6106 }
6107
ab985a77 6108 COVERAGE_INC(netdev_set_ethtool);
ad2dade5
AS
6109 new_flags = (evalue.data & ~flag) | (enable ? flag : 0);
6110 if (new_flags == evalue.data) {
6111 return 0;
6112 }
6113 evalue.data = new_flags;
aaf2fb1a
BP
6114 error = netdev_linux_do_ethtool(netdev_name,
6115 (struct ethtool_cmd *)&evalue,
6116 ETHTOOL_SFLAGS, "ETHTOOL_SFLAGS");
6117 if (error) {
6118 return error;
6119 }
6120
ab985a77 6121 COVERAGE_INC(netdev_get_ethtool);
aaf2fb1a
BP
6122 memset(&evalue, 0, sizeof evalue);
6123 error = netdev_linux_do_ethtool(netdev_name,
6124 (struct ethtool_cmd *)&evalue,
6125 ETHTOOL_GFLAGS, "ETHTOOL_GFLAGS");
6126 if (error) {
6127 return error;
6128 }
6129
6130 if (new_flags != evalue.data) {
6131 VLOG_WARN_RL(&rl, "attempt to %s ethtool %s flag on network "
6132 "device %s failed", enable ? "enable" : "disable",
6133 flag_name, netdev_name);
6134 return EOPNOTSUPP;
6135 }
6136
6137 return 0;
6138}
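
/* Usage sketch (added for illustration; the flag choice is just an example):
 * a caller might run
 *
 *     netdev_linux_ethtool_set_flag(netdev, ETH_FLAG_LRO, "LRO", false);
 *
 * to turn off large receive offload.  The function reads the current flags
 * with ETHTOOL_GFLAGS, writes the modified set with ETHTOOL_SFLAGS, and then
 * re-reads the flags to verify that the device actually accepted the change,
 * returning EOPNOTSUPP if it did not. */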
6139\f
6140/* Utility functions. */
6141
d3980822 6142/* Copies 'src' into 'dst', performing format conversion in the process. */
f613a0d7 6143static void
d3980822
BP
6144netdev_stats_from_rtnl_link_stats(struct netdev_stats *dst,
6145 const struct rtnl_link_stats *src)
6146{
f613a0d7
PS
6147 dst->rx_packets = src->rx_packets;
6148 dst->tx_packets = src->tx_packets;
6149 dst->rx_bytes = src->rx_bytes;
6150 dst->tx_bytes = src->tx_bytes;
6151 dst->rx_errors = src->rx_errors;
6152 dst->tx_errors = src->tx_errors;
6153 dst->rx_dropped = src->rx_dropped;
6154 dst->tx_dropped = src->tx_dropped;
6155 dst->multicast = src->multicast;
6156 dst->collisions = src->collisions;
6157 dst->rx_length_errors = src->rx_length_errors;
6158 dst->rx_over_errors = src->rx_over_errors;
6159 dst->rx_crc_errors = src->rx_crc_errors;
6160 dst->rx_frame_errors = src->rx_frame_errors;
6161 dst->rx_fifo_errors = src->rx_fifo_errors;
6162 dst->rx_missed_errors = src->rx_missed_errors;
6163 dst->tx_aborted_errors = src->tx_aborted_errors;
6164 dst->tx_carrier_errors = src->tx_carrier_errors;
6165 dst->tx_fifo_errors = src->tx_fifo_errors;
6166 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
6167 dst->tx_window_errors = src->tx_window_errors;
d3980822
BP
6168}
6169
337c9b99
BP
6170/* Copies 'src' into 'dst', performing format conversion in the process. */
6171static void
6172netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
6173 const struct rtnl_link_stats64 *src)
6174{
6175 dst->rx_packets = src->rx_packets;
6176 dst->tx_packets = src->tx_packets;
6177 dst->rx_bytes = src->rx_bytes;
6178 dst->tx_bytes = src->tx_bytes;
6179 dst->rx_errors = src->rx_errors;
6180 dst->tx_errors = src->tx_errors;
6181 dst->rx_dropped = src->rx_dropped;
6182 dst->tx_dropped = src->tx_dropped;
6183 dst->multicast = src->multicast;
6184 dst->collisions = src->collisions;
6185 dst->rx_length_errors = src->rx_length_errors;
6186 dst->rx_over_errors = src->rx_over_errors;
6187 dst->rx_crc_errors = src->rx_crc_errors;
6188 dst->rx_frame_errors = src->rx_frame_errors;
6189 dst->rx_fifo_errors = src->rx_fifo_errors;
6190 dst->rx_missed_errors = src->rx_missed_errors;
6191 dst->tx_aborted_errors = src->tx_aborted_errors;
6192 dst->tx_carrier_errors = src->tx_carrier_errors;
6193 dst->tx_fifo_errors = src->tx_fifo_errors;
6194 dst->tx_heartbeat_errors = src->tx_heartbeat_errors;
6195 dst->tx_window_errors = src->tx_window_errors;
6196}
6197
0de1b425 6198int
35eef899 6199get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
c1c9c9c4 6200{
c1c9c9c4
BP
6201 struct ofpbuf request;
6202 struct ofpbuf *reply;
c1c9c9c4
BP
6203 int error;
6204
d6e3feb5 6205 /* Filtering all counters by default */
6206 memset(stats, 0xFF, sizeof(struct netdev_stats));
6207
c1c9c9c4 6208 ofpbuf_init(&request, 0);
13a24df8
BP
6209 nl_msg_put_nlmsghdr(&request,
6210 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ),
6211 RTM_GETLINK, NLM_F_REQUEST);
6212 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
6213 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(netdev_));
a88b4e04 6214 error = nl_transact(NETLINK_ROUTE, &request, &reply);
c1c9c9c4
BP
6215 ofpbuf_uninit(&request);
6216 if (error) {
6217 return error;
6218 }
6219
13a24df8 6220 if (ofpbuf_try_pull(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg))) {
337c9b99
BP
6221 const struct nlattr *a = nl_attr_find(reply, 0, IFLA_STATS64);
6222 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats64)) {
6223 netdev_stats_from_rtnl_link_stats64(stats, nl_attr_get(a));
13a24df8
BP
6224 error = 0;
6225 } else {
71f21279 6226 a = nl_attr_find(reply, 0, IFLA_STATS);
337c9b99
BP
6227 if (a && nl_attr_get_size(a) >= sizeof(struct rtnl_link_stats)) {
6228 netdev_stats_from_rtnl_link_stats(stats, nl_attr_get(a));
6229 error = 0;
6230 } else {
6231 VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
6232 error = EPROTO;
6233 }
13a24df8
BP
6234 }
6235 } else {
6236 VLOG_WARN_RL(&rl, "short RTM_GETLINK reply");
6237 error = EPROTO;
c1c9c9c4 6238 }
8b61709d 6239
576e26d7 6241 ofpbuf_delete(reply);
35eef899 6242 return error;
8b61709d 6243}
c1c9c9c4 6244
3a183124 6245static int
b5d57fc8 6246get_flags(const struct netdev *dev, unsigned int *flags)
8b61709d
BP
6247{
6248 struct ifreq ifr;
6249 int error;
6250
755be9ea 6251 *flags = 0;
259e0b1a 6252 error = af_inet_ifreq_ioctl(dev->name, &ifr, SIOCGIFFLAGS, "SIOCGIFFLAGS");
755be9ea
EJ
6253 if (!error) {
6254 *flags = ifr.ifr_flags;
6255 }
8b61709d
BP
6256 return error;
6257}
6258
6259static int
4b609110 6260set_flags(const char *name, unsigned int flags)
8b61709d
BP
6261{
6262 struct ifreq ifr;
6263
6264 ifr.ifr_flags = flags;
259e0b1a 6265 return af_inet_ifreq_ioctl(name, &ifr, SIOCSIFFLAGS, "SIOCSIFFLAGS");
8b61709d
BP
6266}
6267
01b25786
PB
6268int
6269linux_get_ifindex(const char *netdev_name)
8b61709d
BP
6270{
6271 struct ifreq ifr;
259e0b1a 6272 int error;
8b61709d 6273
71d7c22f 6274 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 6275 COVERAGE_INC(netdev_get_ifindex);
259e0b1a
BP
6276
6277 error = af_inet_ioctl(SIOCGIFINDEX, &ifr);
6278 if (error) {
580e1152
RD
6279 /* ENODEV probably means that a vif disappeared asynchronously and
6280 * hasn't been removed from the database yet, so reduce the log level
6281 * to INFO for that case. */
6282 VLOG_RL(&rl, error == ENODEV ? VLL_INFO : VLL_ERR,
6283 "ioctl(SIOCGIFINDEX) on %s device failed: %s",
6284 netdev_name, ovs_strerror(error));
259e0b1a 6285 return -error;
8b61709d
BP
6286 }
6287 return ifr.ifr_ifindex;
6288}
6289
6290static int
6291get_ifindex(const struct netdev *netdev_, int *ifindexp)
6292{
b5d57fc8 6293 struct netdev_linux *netdev = netdev_linux_cast(netdev_);
c7b1b0a5 6294
b5d57fc8 6295 if (!(netdev->cache_valid & VALID_IFINDEX)) {
756819dd
FL
6296 netdev_linux_update_via_netlink(netdev);
6297 }
6298
6299 if (!(netdev->cache_valid & VALID_IFINDEX)) {
6300 /* Fall back to ioctl if netlink fails */
01b25786 6301 int ifindex = linux_get_ifindex(netdev_get_name(netdev_));
c7b1b0a5 6302
8b61709d 6303 if (ifindex < 0) {
b5d57fc8
BP
6304 netdev->get_ifindex_error = -ifindex;
6305 netdev->ifindex = 0;
c7b1b0a5 6306 } else {
b5d57fc8
BP
6307 netdev->get_ifindex_error = 0;
6308 netdev->ifindex = ifindex;
8b61709d 6309 }
b5d57fc8 6310 netdev->cache_valid |= VALID_IFINDEX;
8b61709d 6311 }
c7b1b0a5 6312
b5d57fc8
BP
6313 *ifindexp = netdev->ifindex;
6314 return netdev->get_ifindex_error;
8b61709d
BP
6315}
6316
6317static int
756819dd
FL
6318netdev_linux_update_via_netlink(struct netdev_linux *netdev)
6319{
6320 struct ofpbuf request;
6321 struct ofpbuf *reply;
6322 struct rtnetlink_change chg;
6323 struct rtnetlink_change *change = &chg;
6324 int error;
6325
6326 ofpbuf_init(&request, 0);
6327 nl_msg_put_nlmsghdr(&request,
b43762a5
FL
6328 sizeof(struct ifinfomsg) + NL_ATTR_SIZE(IFNAMSIZ) +
6329 NL_A_U32_SIZE, RTM_GETLINK, NLM_F_REQUEST);
756819dd
FL
6330 ofpbuf_put_zeros(&request, sizeof(struct ifinfomsg));
6331
6332 /* The correct identifiers for a Linux device are netnsid and ifindex,
6333 * but ifindex changes as the port is moved to another network namespace
6334 * and the interface name statically stored in ovsdb. */
6335 nl_msg_put_string(&request, IFLA_IFNAME, netdev_get_name(&netdev->up));
6336 if (netdev_linux_netnsid_is_remote(netdev)) {
23fa50f6 6337 nl_msg_put_u32(&request, IFLA_IF_NETNSID, netdev->netnsid);
756819dd
FL
6338 }
6339 error = nl_transact(NETLINK_ROUTE, &request, &reply);
6340 ofpbuf_uninit(&request);
6341 if (error) {
6342 ofpbuf_delete(reply);
6343 return error;
6344 }
6345
6346 if (rtnetlink_parse(reply, change)
6347 && change->nlmsg_type == RTM_NEWLINK) {
6348 bool changed = false;
6349 error = 0;
6350
6351 /* Update netdev from rtnl msg and increment its seq if needed. */
6352 if ((change->ifi_flags ^ netdev->ifi_flags) & IFF_RUNNING) {
6353 netdev->carrier_resets++;
6354 changed = true;
6355 }
6356 if (change->ifi_flags != netdev->ifi_flags) {
6357 netdev->ifi_flags = change->ifi_flags;
6358 changed = true;
6359 }
6360 if (change->mtu && change->mtu != netdev->mtu) {
6361 netdev->mtu = change->mtu;
6362 netdev->cache_valid |= VALID_MTU;
6363 netdev->netdev_mtu_error = 0;
6364 changed = true;
6365 }
6366 if (!eth_addr_is_zero(change->mac)
6367 && !eth_addr_equals(change->mac, netdev->etheraddr)) {
6368 netdev->etheraddr = change->mac;
6369 netdev->cache_valid |= VALID_ETHERADDR;
6370 netdev->ether_addr_error = 0;
6371 changed = true;
6372 }
6373 if (change->if_index != netdev->ifindex) {
6374 netdev->ifindex = change->if_index;
6375 netdev->cache_valid |= VALID_IFINDEX;
6376 netdev->get_ifindex_error = 0;
6377 changed = true;
6378 }
91fc374a 6379 if (change->primary && netdev_linux_kind_is_lag(change->primary)) {
3d9c99ab
JH
6380 netdev->is_lag_master = true;
6381 }
756819dd
FL
6382 if (changed) {
6383 netdev_change_seq_changed(&netdev->up);
6384 }
6385 } else {
6386 error = EINVAL;
6387 }
6388
6389 ofpbuf_delete(reply);
6390 return error;
6391}
6392
6393static int
74ff3298 6394get_etheraddr(const char *netdev_name, struct eth_addr *ea)
8b61709d
BP
6395{
6396 struct ifreq ifr;
6397 int hwaddr_family;
259e0b1a 6398 int error;
8b61709d
BP
6399
6400 memset(&ifr, 0, sizeof ifr);
71d7c22f 6401 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
8b61709d 6402 COVERAGE_INC(netdev_get_hwaddr);
259e0b1a
BP
6403 error = af_inet_ioctl(SIOCGIFHWADDR, &ifr);
6404 if (error) {
78857dfb
BP
6405 /* ENODEV probably means that a vif disappeared asynchronously and
6406 * hasn't been removed from the database yet, so reduce the log level
6407 * to INFO for that case. */
259e0b1a 6408 VLOG(error == ENODEV ? VLL_INFO : VLL_ERR,
78857dfb 6409 "ioctl(SIOCGIFHWADDR) on %s device failed: %s",
259e0b1a
BP
6410 netdev_name, ovs_strerror(error));
6411 return error;
8b61709d
BP
6412 }
6413 hwaddr_family = ifr.ifr_hwaddr.sa_family;
2482b0b0
JS
6414 if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER &&
6415 hwaddr_family != ARPHRD_NONE) {
c9697f35 6416 VLOG_INFO("%s device has unknown hardware address family %d",
8b61709d 6417 netdev_name, hwaddr_family);
c9697f35 6418 return EINVAL;
8b61709d
BP
6419 }
6420 memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
6421 return 0;
6422}
6423
6424static int
74ff3298 6425set_etheraddr(const char *netdev_name, const struct eth_addr mac)
8b61709d
BP
6426{
6427 struct ifreq ifr;
259e0b1a 6428 int error;
8b61709d
BP
6429
6430 memset(&ifr, 0, sizeof ifr);
71d7c22f 6431 ovs_strzcpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
44445cac 6432 ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER;
74ff3298 6433 memcpy(ifr.ifr_hwaddr.sa_data, &mac, ETH_ADDR_LEN);
8b61709d 6434 COVERAGE_INC(netdev_set_hwaddr);
259e0b1a
BP
6435 error = af_inet_ioctl(SIOCSIFHWADDR, &ifr);
6436 if (error) {
8b61709d 6437 VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
259e0b1a 6438 netdev_name, ovs_strerror(error));
8b61709d 6439 }
259e0b1a 6440 return error;
8b61709d
BP
6441}
6442
6443static int
0b0544d7 6444netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *ecmd,
8b61709d
BP
6445 int cmd, const char *cmd_name)
6446{
6447 struct ifreq ifr;
259e0b1a 6448 int error;
8b61709d
BP
6449
6450 memset(&ifr, 0, sizeof ifr);
71d7c22f 6451 ovs_strzcpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
8b61709d
BP
6452 ifr.ifr_data = (caddr_t) ecmd;
6453
6454 ecmd->cmd = cmd;
259e0b1a
BP
6455 error = af_inet_ioctl(SIOCETHTOOL, &ifr);
6456 if (error) {
6457 if (error != EOPNOTSUPP) {
8b61709d 6458 VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
259e0b1a 6459 "failed: %s", cmd_name, name, ovs_strerror(error));
8b61709d
BP
6460 } else {
6461 /* The device doesn't support this operation. That's pretty
6462 * common, so there's no point in logging anything. */
6463 }
8b61709d 6464 }
259e0b1a 6465 return error;
8b61709d 6466}
f1acd62b 6467
488d734d
BP
6468/* Returns an AF_PACKET raw socket or a negative errno value. */
6469static int
6470af_packet_sock(void)
6471{
23882115
BP
6472 static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
6473 static int sock;
488d734d 6474
23882115 6475 if (ovsthread_once_start(&once)) {
488d734d
BP
6476 sock = socket(AF_PACKET, SOCK_RAW, 0);
6477 if (sock >= 0) {
8450059e
BP
6478 int error = set_nonblocking(sock);
6479 if (error) {
6480 close(sock);
6481 sock = -error;
29cf9c1b
FL
6482 } else if (userspace_tso_enabled()) {
6483 int val = 1;
6484 error = setsockopt(sock, SOL_PACKET, PACKET_VNET_HDR, &val,
6485 sizeof val);
6486 if (error) {
6487 error = errno;
6488 VLOG_ERR("failed to enable vnet hdr in raw socket: %s",
6489 ovs_strerror(errno));
6490 close(sock);
6491 sock = -error;
6492 }
8450059e 6493 }
488d734d
BP
6494 } else {
6495 sock = -errno;
10a89ef0
BP
6496 VLOG_ERR("failed to create packet socket: %s",
6497 ovs_strerror(errno));
488d734d 6498 }
23882115 6499 ovsthread_once_done(&once);
488d734d
BP
6500 }
6501
6502 return sock;
6503}
29cf9c1b
FL
6504
6505static int
6506netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
6507{
6508 struct eth_header *eth_hdr;
6509 ovs_be16 eth_type;
6510 int l2_len;
6511
6512 eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
6513 if (!eth_hdr) {
6514 return -EINVAL;
6515 }
6516
6517 l2_len = ETH_HEADER_LEN;
6518 eth_type = eth_hdr->eth_type;
6519 if (eth_type_vlan(eth_type)) {
6520 struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN);
6521
6522 if (!vlan) {
6523 return -EINVAL;
6524 }
6525
6526 eth_type = vlan->vlan_next_type;
6527 l2_len += VLAN_HEADER_LEN;
6528 }
6529
6530 if (eth_type == htons(ETH_TYPE_IP)) {
6531 struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
6532
6533 if (!ip_hdr) {
6534 return -EINVAL;
6535 }
6536
6537 *l4proto = ip_hdr->ip_proto;
6538 dp_packet_hwol_set_tx_ipv4(b);
6539 } else if (eth_type == htons(ETH_TYPE_IPV6)) {
6540 struct ovs_16aligned_ip6_hdr *nh6;
6541
6542 nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN);
6543 if (!nh6) {
6544 return -EINVAL;
6545 }
6546
6547 *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
6548 dp_packet_hwol_set_tx_ipv6(b);
6549 }
6550
6551 return 0;
6552}
6553
6554static int
6555netdev_linux_parse_vnet_hdr(struct dp_packet *b)
6556{
6557 struct virtio_net_hdr *vnet = dp_packet_pull(b, sizeof *vnet);
6558 uint16_t l4proto = 0;
6559
6560 if (OVS_UNLIKELY(!vnet)) {
6561 return -EINVAL;
6562 }
6563
6564 if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
6565 return 0;
6566 }
6567
6568 if (netdev_linux_parse_l2(b, &l4proto)) {
6569 return -EINVAL;
6570 }
6571
6572 if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
6573 if (l4proto == IPPROTO_TCP) {
6574 dp_packet_hwol_set_csum_tcp(b);
6575 } else if (l4proto == IPPROTO_UDP) {
6576 dp_packet_hwol_set_csum_udp(b);
6577 } else if (l4proto == IPPROTO_SCTP) {
6578 dp_packet_hwol_set_csum_sctp(b);
6579 }
6580 }
6581
6582 if (l4proto && vnet->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
6583 uint8_t allowed_mask = VIRTIO_NET_HDR_GSO_TCPV4
6584 | VIRTIO_NET_HDR_GSO_TCPV6
6585 | VIRTIO_NET_HDR_GSO_UDP;
6586 uint8_t type = vnet->gso_type & allowed_mask;
6587
6588 if (type == VIRTIO_NET_HDR_GSO_TCPV4
6589 || type == VIRTIO_NET_HDR_GSO_TCPV6) {
6590 dp_packet_hwol_set_tcp_seg(b);
6591 }
6592 }
6593
6594 return 0;
6595}
6596
6597static void
6598netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu)
6599{
6600 struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet);
6601
6602 if (dp_packet_hwol_is_tso(b)) {
6603 uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b))
6604 + TCP_HEADER_LEN;
6605
6606 vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len;
6607 vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len);
6608 if (dp_packet_hwol_is_ipv4(b)) {
6609 vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
6610 } else {
6611 vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
6612 }
6613
6614 } else {
6615 vnet->flags = VIRTIO_NET_HDR_GSO_NONE;
6616 }
6617
6618 if (dp_packet_hwol_l4_mask(b)) {
6619 vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
6620 vnet->csum_start = (OVS_FORCE __virtio16)((char *)dp_packet_l4(b)
6621 - (char *)dp_packet_eth(b));
6622
6623 if (dp_packet_hwol_l4_is_tcp(b)) {
6624 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6625 struct tcp_header, tcp_csum);
6626 } else if (dp_packet_hwol_l4_is_udp(b)) {
6627 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6628 struct udp_header, udp_csum);
6629 } else if (dp_packet_hwol_l4_is_sctp(b)) {
6630 vnet->csum_offset = (OVS_FORCE __virtio16) __builtin_offsetof(
6631 struct sctp_header, sctp_csum);
6632 } else {
6633 VLOG_WARN_RL(&rl, "Unsupported L4 protocol");
6634 }
6635 }
6636}
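
/* Worked example (added for illustration): for an untagged IPv4 TCP packet,
 * (char *) dp_packet_l4(b) - (char *) dp_packet_eth(b) is 14 + 20 = 34, so
 * csum_start = 34 and csum_offset = offsetof(struct tcp_header, tcp_csum)
 * = 16; for a TSO packet the code above additionally sets
 * hdr_len = 34 + 20 = 54 and gso_size = mtu - 54, telling the kernel where
 * the TCP payload starts and how large each resulting segment may be. */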